├── test
├── single
│ ├── data
│ │ ├── gene_test.bed
│ │ ├── rice_leaf_DHpeaks_test.bed
│ │ ├── rice_callus_DHpeaks_test.bed
│ │ ├── rice_H3K27ac_test.bw
│ │ ├── rice_leaf_DHS_test.bw
│ │ ├── rice_callus_DHS_test.bw
│ │ ├── rice.chrom.sizes
│ │ ├── genopheno_test.bed
│ │ ├── editing_results_test.bed
│ │ └── genes_motifs_JASPAR_test.bed
│ └── config.ini
└── batch
│ ├── data
│ └── README.md
│ └── config.ini
├── requirements.txt
├── .gitignore
├── config.ini
├── lib
├── cores.py
├── genopheno.py
├── misc.py
└── features.py
├── README.md
├── single.py
└── batch.py
/test/single/data/gene_test.bed:
--------------------------------------------------------------------------------
1 | Chr1 4003659 4004888 LOC_Os01g08220 . -
2 |
--------------------------------------------------------------------------------
/test/single/data/rice_leaf_DHpeaks_test.bed:
--------------------------------------------------------------------------------
1 | Chr1 4000759 4001568
2 | Chr1 4004878 4005453
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | scipy
4 | tqdm
5 | biopython
6 | pyBigWig
7 | pybedtools
--------------------------------------------------------------------------------
/test/single/data/rice_callus_DHpeaks_test.bed:
--------------------------------------------------------------------------------
1 | Chr1 4000853 4001184
2 | Chr1 4002860 4003235
3 | Chr1 4004230 4004515
4 |
--------------------------------------------------------------------------------
/test/single/data/rice_H3K27ac_test.bw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/CAPE/main/test/single/data/rice_H3K27ac_test.bw
--------------------------------------------------------------------------------
/test/single/data/rice_leaf_DHS_test.bw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/CAPE/main/test/single/data/rice_leaf_DHS_test.bw
--------------------------------------------------------------------------------
/test/single/data/rice_callus_DHS_test.bw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/CAPE/main/test/single/data/rice_callus_DHS_test.bw
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | node_modules
3 | .vscode
4 | .idea
5 | .env
6 | .envrc
7 | .venv
8 | *.local
9 | *.log*
10 | logs
11 | .DS_Store
--------------------------------------------------------------------------------
/test/single/data/rice.chrom.sizes:
--------------------------------------------------------------------------------
1 | Chr1 43270923
2 | Chr2 35937250
3 | Chr3 36413819
4 | Chr4 35502694
5 | Chr5 29958434
6 | Chr6 31248787
7 | Chr7 29697621
8 | Chr8 28443022
9 | Chr9 23012720
10 | Chr10 23207287
11 | Chr11 29021106
12 | Chr12 27531856
13 | ChrSy 592136
14 | ChrUn 633585
15 | ChrC 134525
16 | ChrM 490520
17 |
--------------------------------------------------------------------------------
/test/batch/data/README.md:
--------------------------------------------------------------------------------
1 | For genome-wide analysis, omics data can be downloaded from several databases:
2 |
3 | (1) [PlantDHS](http://plantdhs.org/): DNase-seq data
4 | (2) [PlantRegMap](http://plantregmap.gao-lab.org/): TF motifs, sequence conservation (CNSs)
5 | (3) [MBKbase](http://www.mbkbase.org/rice): Genomic variation and phenotypes
6 |
7 | Then, data files are put in the data folder.
--------------------------------------------------------------------------------
/test/batch/config.ini:
--------------------------------------------------------------------------------
1 | [General]
2 | workdir = results
3 | binsize = 10
4 | step = 10
5 | upstream = 2000
6 | slop = 200
7 | withutr = 0
8 | threads = 64
9 |
10 | [Features]
11 | ocfiles = data/rice_leaf_DHS_test.bw,data/rice_callus_DHS_test.bw
12 | ocpeaks = data/rice_leaf_DHpeaks_test.bed,data/rice_callus_DHpeaks_test.bed
13 | ptmfiles = data/rice_H3K27ac_test.bw
14 | motifs = data/genome_wide_motifs_JASPAR_test.bed
15 | cnss = data/genome_wide_PhastCons_test.bedGraph
16 | genopheno =
17 | phenodata =
18 |
19 | [Genes]
20 | gene_file =
21 | gff_file = data/annotation.gff3
22 | chrom_sizes = data/rice.chrom.sizes
23 |
--------------------------------------------------------------------------------
/test/single/config.ini:
--------------------------------------------------------------------------------
1 | [General]
2 | workdir = results
3 | binsize = 10
4 | step = 10
5 | upstream = 2000
6 | slop = 200
7 | withutr = 0
8 | threads = 8
9 |
10 | [Features]
11 | ocfiles = data/rice_leaf_DHS_test.bw,data/rice_callus_DHS_test.bw
12 | ocpeaks = data/rice_leaf_DHpeaks_test.bed,data/rice_callus_DHpeaks_test.bed
13 | ptmfiles = data/rice_H3K27ac_test.bw
14 | motifs = data/genes_motifs_JASPAR_test.bed
15 | cnss = data/genes_PhastCons_test.bedGraph
16 | genopheno = data/genopheno_test.bed
17 | phenodata = data/editing_results_test.bed
18 |
19 | [Genes]
20 | gene_file = data/gene_test.bed
21 | gff_file =
22 | chrom_sizes = data/rice.chrom.sizes
23 |
--------------------------------------------------------------------------------
/config.ini:
--------------------------------------------------------------------------------
1 | [General]
2 | # Work directory ( also known as output directory )
3 | workdir = results
4 | # Bin size and sliding step ( changing these is not recommended )
5 | binsize = 10
6 | step = 10
7 | # Promoter length defined as sequence upstream of the TSS
8 | upstream = 2000
9 | # Extended length for generating raw scores of each feature ( Useful for genome browser visualization )
10 | slop = 200
11 | # Whether to include the 5'-UTR in the analysis ( 0: Not include; 1: promoter + 5'-UTR )
12 | withutr = 0
13 | # Threads for batch mode (simultaneously process n genes)
14 | threads = 8
15 |
16 | [Features]
17 | # Features with 1-bp resolution are recommended.
18 | # If feature files are unavailable, just leave a blank.
19 | # Multiple files are separated by comma.
20 |
21 | # Open chromatin BigWig files ( from ATAC-seq/DNase-seq/MNase-seq/etc. )
22 | ocfiles = ATAC_profile.bw
23 | # Open chromatin peaks ( from MACS2/Genrich/Popera/etc. )
24 | ocpeaks = ATAC_peaks.bed
25 | # Histone modification BigWig files ( H3K27ac from ChIP-seq )
26 | ptmfiles = H3K27ac.bw
27 | # TF binding motifs ( from PlantTFBS/JASPAR motifs called by FIMO )
28 | motifs = genome_wide_motifs_JASPAR.bed
29 | # Conserved non-coding sequences ( from PhastCons/mVISTA scores )
30 | cnss = PhastCons.bedGraph
31 | # Genotype and phenotype files directory ( from MBKbase/etc. )
32 | genopheno =
33 | # Phenotypes for evaluation ( Phenodata measured after gene-editing )
34 | phenodata =
35 |
36 | [Genes]
37 | # Gene for single mode (BED format: chr start end genename . strand)
38 | gene_file = gene.bed
39 | # GFF/GFF3 file for batch mode ( Use batch mode if gff_file is defined )
40 | gff_file = annotation.gff3
41 | # Chromosome length ( in case out of range )
42 | chrom_sizes = genome.chrom.sizes
43 |
--------------------------------------------------------------------------------
/lib/cores.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | from pybedtools import BedTool
4 |
5 |
def get_scores(geneinfo, scorefile, regionfile):
    """Group the scores of `scorefile` records by the region they overlap.

    `scorefile` is intersected with `regionfile` using pybedtools
    (`intersect(..., wo=True)`); every overlapping pair contributes one score
    to the list kept for its region.

    Args:
        geneinfo: Not referenced by this function; kept in the signature.
        scorefile: Path handed to `BedTool`; the 4th column of each record is
            read as a float score.
        regionfile: Path handed to `BedTool`, holding the regions of interest.

    Returns:
        dict mapping ``"<chrom>_<region_start>_<region_end>"`` to the list of
        scores, in the order the intersection yields them.
    """
    score_intervals = BedTool(scorefile)
    target_regions = BedTool(regionfile)
    grouped = {}
    for hit in score_intervals.intersect(target_regions, wo=True):
        fields = str(hit).rstrip().split("\t")
        value = float(fields[3])
        # NOTE(review): reading the region start/end at indices 5 and 6
        # presumes the score file has exactly four columns -- confirm.
        region_key = "_".join([fields[0], fields[5], fields[6]])
        grouped.setdefault(region_key, []).append(value)
    return grouped
21 |
22 |
def get_cores(geneinfo, scores):
    """Select the high-scoring bins of every region and merge them.

    For each region the cutoff is the mean of its scores when it has at least
    five values, otherwise 0. The i-th score of a region is treated as the bin
    starting at ``region_start + binsize * i``; bins whose score reaches the
    cutoff are collected and merged with `BedTool.merge()`.

    Args:
        geneinfo: Object providing the `binsize` attribute.
        scores: dict mapping ``"<chrom>_<start>_<end>"`` to a list of scores,
            as built by `get_scores`.

    Returns:
        The merged `BedTool` of selected bins.
    """
    binsize = geneinfo.binsize
    bed_lines = []
    for region, values in scores.items():
        cutoff = np.average(values) if len(values) >= 5 else 0
        # Split from the right so chromosome names that themselves contain
        # "_" (e.g. "chr1_random") still unpack into exactly three parts.
        chrom, rstart, _rend = region.rsplit("_", 2)
        region_start = int(rstart)
        for i, score in enumerate(values):
            if score >= cutoff:
                start = region_start + int(binsize * i)
                end = start + binsize
                bed_lines.append("\t".join([chrom, str(start), str(end)]) + "\n")
    # Join once instead of growing a string inside the loop.
    core_regions = BedTool("".join(bed_lines), from_string=True).merge()
    return core_regions
40 |
41 |
def output_cores(geneinfo, scorefile, regionfile, minlen = 2, outfile = ""):
    """Write the core regions that span at least `minlen` bins to a BED file.

    Args:
        geneinfo: Object providing the `binsize` attribute.
        scorefile: Score file passed on to `get_scores`.
        regionfile: Region file passed on to `get_scores`.
        minlen: Minimum core length, expressed in number of bins.
        outfile: Output path. When empty it is derived from `regionfile` by
            replacing "key_regions_merged" with "core_regions".
            NOTE(review): if `regionfile` does not contain that substring the
            derived path equals `regionfile` itself -- confirm callers always
            pass a "key_regions_merged" file or an explicit `outfile`.

    Returns:
        list of ``[chrom, start, end]`` for every core region written.
    """
    scores = get_scores(geneinfo, scorefile, regionfile)
    cores = get_cores(geneinfo, scores)
    min_span = geneinfo.binsize * minlen
    if not outfile:
        outfile = regionfile.replace("key_regions_merged", "core_regions")
    core_regions = []
    # The context manager closes the file even if a record fails to parse.
    with open(outfile, "w") as outf:
        for interval in cores:
            info = str(interval).rstrip().split("\t")
            chrom = info[0]
            start = int(info[1])
            end = int(info[2])
            if end - start >= min_span:
                core_regions.append([chrom, start, end])
                print(chrom, start, end, sep="\t", file=outf)
    return core_regions
60 |
61 |
--------------------------------------------------------------------------------
/test/single/data/genopheno_test.bed:
--------------------------------------------------------------------------------
1 | Chr1 4004909 4004910 25.08108508360879 3.578586951318265e-06 5.446288425978628
2 | Chr1 4004918 4004919 11.643805566694892 0.0006441668288457089 3.1910016427888794
3 | Chr1 4004944 4004945 13.03230509818196 0.0014793498931556398 2.8299290953100207
4 | Chr1 4005034 4005035 39.84383952471281 2.2285386897378036e-09 8.65197982171073
5 | Chr1 4005064 4005065 39.78991300947906 2.2894447758468627e-09 8.640269827713585
6 | Chr1 4005209 4005210 29.288540872193153 6.23633981593134e-08 7.205070228287081
7 | Chr1 4005297 4005298 119.18579893750855 1.3156264619263447e-26 25.88086739989521
8 | Chr1 4005403 4005404 11.643805566694892 0.0006441668288457089 3.1910016427888794
9 | Chr1 4005530 4005531 110.42334562249201 7.914747797493608e-26 25.101562919323662
10 | Chr1 4005658 4005659 1.0339549084633033 0.30923158012311003 0.5097161603783631
11 | Chr1 4005677 4005678 39.26142674474035 2.981875498831951e-09 8.525510493444743
12 | Chr1 4005679 4005680 10.544113986204366 0.001165586841256689 2.9334553641223033
13 | Chr1 4005691 4005692 15.548917696064617 8.039766076595917e-05 4.094756587211936
14 | Chr1 4005703 4005704 26.935587397725662 1.4158313132317057e-06 5.848988486827513
15 | Chr1 4005717 4005718 2.0448089072330937 0.15272616080064952 0.8160865653126603
16 | Chr1 4005796 4005797 55.075947097667324 1.0975139395761731e-12 11.959589955056167
17 | Chr1 4005839 4005840 14.03972184543109 0.0008939498123160222 3.0486868624636307
18 | Chr1 4005869 4005870 0.06792994717983242 0.7943749668296061 0.0999744502365751
19 | Chr1 4005872 4005873 29.350441454008816 4.232847455651902e-07 6.373367382450242
20 | Chr1 4005888 4005889 2.708257829770254 0.09982996124023241 1.000739097505115
21 | Chr1 4005917 4005918 177.4827977726221 2.8846966307842555e-39 38.539899852700266
22 | Chr1 4005956 4005957 11.643805566694892 0.0006441668288457089 3.1910016427888794
23 | Chr1 4005978 4005979 12.852202910120727 0.0016187493160071459 2.7908204020831735
24 | Chr1 4006010 4006011 11.643805566694892 0.0006441668288457089 3.1910016427888794
25 | Chr1 4006040 4006041 110.69254779348225 9.19218280830204e-25 24.036581347260658
26 | Chr1 4006055 4006056 60.973148779748776 5.752398579227342e-14 13.240151029655443
27 | Chr1 4006137 4006138 15.137103054769431 0.0005164399562436108 3.2869801643436105
28 | Chr1 4006467 4006468 16.791333717316366 0.00022584382019740588 3.6461917886132573
29 | Chr1 4006638 4006639 177.2643953648572 3.2175523937172934e-39 38.492474372436924
30 | Chr1 4006648 4006649 1.1183907309091496 0.2902652352336645 0.5372049760535923
31 | Chr1 4006756 4006757 164.24334090511462 1.3384517804049304e-37 36.8733972702606
32 | Chr1 4006758 4006759 110.42334562249201 7.914747797493608e-26 25.101562919323662
33 | Chr1 4006787 4006788 29.288540872193153 6.23633981593134e-08 7.205070228287081
34 | Chr1 4006817 4006818 134.09712513211326 5.2027049613596005e-31 30.28377080168162
35 | Chr1 4006835 4006836 11.643805566694892 0.0006441668288457089 3.1910016427888794
36 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CAPE
2 |
3 | The computational pipeline of CAPE (CRISPR-Cas12a promoter editing)
4 |
5 |
6 | ## Prerequisites
7 |
8 | 1. Python >= 3.5
9 | 2. Open chromatin data (profiles in BigWig format, peaks in BED format)
10 | 3. TF binding motifs (identified by FIMO, matrix files are from PlantTFDB or JASPAR)
11 | 4. Sequence conservation (Scores are from PhastCons/mVISTA, or calculated manually with the PHAST package)
12 | 5. Genome annotation file (in GFF3 format) and chromosome sizes file
13 | 6. (Optional) H3K27ac histone modification profile (BigWig format), genomic variations and phenotypes from rice3K/RFGB/MBKBase/etc.
14 |
15 | ## Install
16 |
17 | ```bash
18 | # Install CAPE dependencies
19 | git clone https://github.com/zhangtaolab/CAPE.git
20 | cd CAPE
21 | pip install -r requirements.txt
22 |
23 | # Run test for single gene
24 | cd test/single
25 | python ../../single.py config.ini
26 | ```
27 |
28 | ### Run the pipeline for single gene
29 |
30 | ```bash
31 | # Modify the config.ini file
32 | [General]
33 | workdir = results
34 | binsize = 10
35 | step = 10
36 | upstream = 2000
37 | slop = 200
38 | withutr = 0
39 | threads = 16
40 |
41 | [Features]
42 | ocfiles = Rice_leaf_DNase.bw,Rice_callus_DNase.bw
43 | ocpeaks = TIGR7_DHSs.bed
44 | ptmfiles = rice_H3K27ac.bw
45 | motifs = genome_wide_motifs_JASPAR.bed
46 | cnss = Osj_PhastCons.bedGraph
47 | genopheno =
48 | phenodata =
49 |
50 | [Genes]
51 | gene_file = gene.bed
52 | gff_file =
53 | chrom_sizes = osativa_7.chrom.sizes
54 | ```
55 |
56 | ```bash
57 | # Run the pipeline
58 | python single.py config.ini
59 | ```
60 |
61 | ### Run the pipeline for whole genome genes
62 |
63 | ```bash
64 | # Modify the config.ini file
65 | [General]
66 | workdir = results
67 | binsize = 10
68 | step = 10
69 | upstream = 2000
70 | slop = 200
71 | withutr = 0
72 | threads = 16
73 |
74 | [Features]
75 | ocfiles = Rice_leaf_DNase.bw,Rice_callus_DNase.bw
76 | ocpeaks = TIGR7_DHSs.bed
77 | ptmfiles = rice_H3K27ac.bw
78 | motifs = genome_wide_motifs_JASPAR.bed
79 | cnss = Osj_PhastCons.bedGraph
80 | genopheno =
81 | phenodata =
82 |
83 | [Genes]
84 | gene_file =
85 | gff_file = TIGR7_all.gff3
86 | chrom_sizes = osativa_7.chrom.sizes
87 | ```
88 |
89 | ```bash
90 | # Run the pipeline
91 | python batch.py config.ini
92 | ```
93 |
94 | ## Input (Feature data processing)
95 |
96 | The instruction of how to generate feature data for calculation:
97 | 1. Open chromatin data:
98 | (1) Align the raw sequencing data (from DNase-seq/ATAC-seq/MNase-seq) to the reference genome with BWA/Bowtie2;
99 | (2) Call peaks from the alignment using Macs2/Genrich/F-seq2/Popera;
100 | (3) Generate profiles from the alignment (BigWig format, using DeepTools/F-seq2/Popera).
101 | 2. TF binding motifs:
102 | (1) Download the TF PFM data from database (PlantTFDB/JASPAR/CisBP);
103 | (2) Find the occurrences of TF motifs in the genome by FIMO;
104 | (3) Merge results of all TF motifs (BED format, TFs from the same family can be merged into one).
105 | 3. Sequence conservation:
106 | (1) Pre-calculated sequence conservation of plant genomes can be retrieved from PlantRegMap database;
107 | (2) If no pre-computed result exists for the target genome, calculate conservation scores from multiple closely related genomes with PHAST/mVISTA.
108 | 4. H3K27ac histone modification:
109 | (1) Align the raw sequencing data (from ChIP-seq) to the reference genome with BWA/Bowtie2;
110 | (2) Generate profiles from the alignment (BigWig format, using DeepTools).
111 | 5. Relationships between genomic variations and phenotypes (GenoPheno):
112 | (1) Get the genotype data from public database, in FASTA format (for rice, using rice3K/RFGB/MBKBase/etc);
113 | (2) Get the corresponding phenotype data from public database.
114 | (two column tab format, first column is Genotype_ID, second is Phenotype_Values separated by comma)
115 | 6. Genome annotation file (BED/GFF3 format) is required for getting the promoter of target gene.
116 | 7. Chromosome sizes file is required for converting input file format.
117 | (two column tab format, first column is chromosome name, second is chromosome length)
118 |
119 | \* Note that H3K27ac and GenoPheno data are optional for analysis.
120 |
121 | ## Output
122 |
123 | All output files are stored in the workdir defined in the config.ini file.
124 | A folder will be created for each gene analyzed.
125 | In the output gene folder, several files are generated:
126 | 1. analysis_region.bed (File records the analyzed regions in the genome for this gene)
127 | 2. OCpeaks_*_raw.bed (Open chromatin regions overlap with the analysis region)
128 | 3. OCscores*.bedGraph (Open chromatin scores for the analysis region, suffix 'raw' means raw scores from BigWig file, others are normalized in range 0 to 1)
129 | 4. motifs*.bedGraph (Raw file contains motifs identified in the analysis region, another file is the normalized motifs scores)
130 | 5. CNS*.bedGraph (Raw file contains raw conserved score in the analysis region, another file is the normalized CNS scores)
131 | 6. PTM*.bedGraph (H3K27ac profile for the analysis region, scores from BigWig file, others are normalized in range 0 to 1)
132 | 7. aggregate.bedGraph (The aggregate scores (AS) calculated from all above features)
133 | 8. key_regions_*.bed (Merged file means merged key regions when two key regions are adjacent)
134 | 9. core_regions.bed (Core regions which have high AS within the key regions)
135 | ( **Optional:** if CRISPR edited phenotype data are provided, also export the statistical analysis results. )
136 | 10. phenoscores_*.bedGraph (phenotype scores, measured by kmeans-like method)
137 | 11. scores_by_sample.txt (Features scores and aggregate scores for each CRISPR edited sample)
138 | 12. plot_scores.txt (Comparison between phenotype difference and estimated scores)
139 | 13. statistics.txt (Cutoff for defining key regions and significance analysis)
140 |
141 |
--------------------------------------------------------------------------------
/lib/genopheno.py:
--------------------------------------------------------------------------------
1 | from Bio import pairwise2
2 | import re
3 | from tqdm import tqdm
4 | from itertools import chain
5 | import numpy as np
6 | import pandas as pd
7 | from scipy import stats
8 |
9 |
def load_fasta(seqfile):
    """Load sequences and sample counts from a FASTA-like file.

    Header lines start with ">" and are split on "|". The first field (without
    the ">") is the record name. For every record other than "REF" the second
    field must look like ``<key>:<int>``; the integer is stored as the record's
    sample number ("REF" gets 0). Each following non-blank line is stored as
    the sequence of the current record (a later line replaces an earlier one).

    Args:
        seqfile: Path of the file to read. The "REF" record must come before
            the other records, because their sequences are compared to it.

    Returns:
        Tuple ``(seqinfo, refid)``. `seqinfo` maps record name to
        ``{"seq": str, "num": int}``. `refid` is the name of the last
        non-REF record whose sequence equals the REF sequence, or ``None``
        when no record matches.
    """
    seqinfo = {}
    # Defined up front so a file without a REF-identical sample returns None
    # instead of failing with UnboundLocalError at the return statement.
    refid = None
    with open(seqfile, "r") as infile:
        for line in infile:
            line = line.rstrip()
            if not line:
                # A blank line must not overwrite the previous sequence.
                continue
            if line.startswith(">"):
                # The header is stripped first, so a bare ">REF" (no "|")
                # is still recognised as the reference.
                info = line.split("|")
                name = info[0][1:]
                if name == "REF":
                    sample_num = 0
                else:
                    sample_num = int(info[1].split(":")[1])
            else:
                seq = line
                if name != "REF" and seq == seqinfo["REF"]["seq"]:
                    refid = name
                seqinfo[name] = {"seq": seq, "num": sample_num}
    return seqinfo, refid
31 |
32 |
def parse_alignment(alignment):
    """Extract the differences between two aligned sequences.

    Args:
        alignment: Indexable whose items 0 and 1 are the gapped reference and
            alternative sequences (gaps written as "-").

    Returns:
        dict mapping a position to ``{"ref": str, "alt": str}``. Entries are
        created for every gap run in the reference, every gap run in the
        alternative, and every column where both sequences hold different
        non-gap characters.
    """
    refseq, altseq = alignment[0], alignment[1]
    gap_run = re.compile(r'-+')
    variants = {}

    # Per-position offset table, one entry per non-gap reference character.
    inspos = dict.fromkeys(range(len(refseq.replace("-", ""))), 0)

    # Gap runs in the reference: every later offset grows by the run length.
    for match in gap_run.finditer(refseq):
        start, end = match.span()
        for j in range(start + 1, len(inspos)):
            inspos[j] += end - start
        key = start - inspos[start]
        variants[key] = {"ref": refseq[key], "alt": altseq[key:end]}

    # Gap runs in the alternative sequence.
    for match in gap_run.finditer(altseq):
        start, end = match.span()
        key = start - inspos[start]
        variants[key] = {"ref": refseq[key:end], "alt": altseq[key]}

    # Plain substitutions: columns where both characters are non-gap and differ.
    for i, refbase in enumerate(refseq):
        altbase = altseq[i]
        if refbase == altbase:
            continue
        if refbase == "-" or altbase == "-":
            continue
        key = i - inspos[i]
        variants[key] = {"ref": refbase, "alt": altbase}
    return variants
64 |
65 |
def pairwise_alignment(seqfile):
    """Align every sample sequence against the reference and collect variants.

    Sequences come from `load_fasta`. Each record other than "REF" is globally
    aligned to the REF sequence with `pairwise2.align.globalms` (arguments
    2, -1, -1.5, -.5) and the first alignment returned is parsed with
    `parse_alignment`.

    Args:
        seqfile: Path passed to `load_fasta`.

    Returns:
        Tuple ``(mutinfo, refid)``. `mutinfo` maps sample name to
        ``{"ratio": float, "alignment": dict}``, where ratio is the sample's
        "num" divided by the sum of all "num" values, rounded to 4 decimals.
        `refid` is passed through from `load_fasta`.
    """
    seqinfo, refid = load_fasta(seqfile)
    refseq = seqinfo["REF"]["seq"]
    total_num = sum(entry["num"] for entry in seqinfo.values())
    mutinfo = {}
    for sample in tqdm(seqinfo, desc="Finding mutations"):
        if sample == "REF":
            continue
        record = seqinfo[sample]
        altseq = record["seq"]
        ratio = round(record["num"] / total_num, 4)
        alignments = pairwise2.align.globalms(refseq, altseq, 2, -1, -1.5, -.5)
        mutinfo[sample] = {
            "ratio": ratio,
            "alignment": parse_alignment(alignments[0]),
        }
    return mutinfo, refid
87 |
88 |
def mut2pos(seqfile):
    """Re-index per-sample variants by position.

    Args:
        seqfile: Path passed to `pairwise_alignment`.

    Returns:
        Tuple ``(vcfinfo, refid)``. `vcfinfo` maps position to
        ``{"ref": str, "alt": {alt: {sample: [ratio, homozygous]}}}``. The
        "ref" value is the one seen for the first sample reporting that
        position. `refid` is passed through from `pairwise_alignment`.
    """
    mutinfo, refid = pairwise_alignment(seqfile)
    vcfinfo = {}
    for sample, record in mutinfo.items():
        for pos, change in record["alignment"].items():
            refbase = change["ref"]
            altbase = change["alt"]
            # A single lower-case base is upper-cased and flagged with 1;
            # anything else is flagged with 0.
            # NOTE(review): the flag is named "homozygous" in the original
            # code; what lower case encodes upstream is not visible here.
            if altbase in ("a", "c", "g", "t", "n"):
                altbase = altbase.upper()
                homozygous = 1
            else:
                homozygous = 0
            site = vcfinfo.setdefault(pos, {"ref": refbase, "alt": {}})
            carriers = site["alt"].setdefault(altbase, {})
            carriers[sample] = [record["ratio"], homozygous]
    return vcfinfo, refid
110 |
111 |
def load_phenodata(phenodata):
    """Read phenotype values per genotype from a two-column tab-separated file.

    The first column is the genotype ID, the second a comma-separated list of
    numeric values. Lines starting with "Genotype_ID" (the header) and lines
    without a second column are skipped. Genotypes with fewer than two values
    are not stored.

    Args:
        phenodata: Path of the phenotype file.

    Returns:
        dict mapping genotype ID to its list of float values.

    Raises:
        ValueError: If a value cannot be converted to float.
    """
    gid_info = {}
    with open(phenodata, "r") as infile:
        for line in infile:
            if line.startswith("Genotype_ID"):
                continue
            info = line.rstrip().split("\t")
            if len(info) < 2:
                continue
            # Split on the bare comma: float() ignores surrounding blanks, so
            # both "1, 2" and "1,2" are accepted. Empty tokens (e.g. from a
            # trailing comma) are dropped.
            values = [float(x) for x in info[1].split(",") if x.strip()]
            if len(values) > 1:
                gid_info[info[0]] = values
    return gid_info
126 |
127 |
def link_genopheno(genoinfo, seqfile, phenodata):
    """Write a long-format table linking variant alleles to phenotype values.

    For every variant position (from `mut2pos`) and every alternative allele,
    the phenotype values (from `load_phenodata`) of the samples carrying that
    allele are written, one row per value, together with their mean and
    standard deviation. When at least one alternative allele of a position
    has values, the values of the reference genotype `refid` are written too,
    using the reference base as the "alt" column.

    Args:
        genoinfo: Object providing the `start` attribute, which is added to
            every position.
        seqfile: Sequence file passed to `mut2pos`.
        phenodata: Phenotype file passed to `load_phenodata`.

    Returns:
        Path of the written table: `seqfile` with ".fasta" replaced by
        "_geno_pheno.txt" (or with that suffix appended when `seqfile` does
        not contain ".fasta").

    Raises:
        KeyError: If reference rows are needed but `refid` has no entry in
            the phenotype data.
    """
    posinfo, refid = mut2pos(seqfile)
    phenoinfo = load_phenodata(phenodata)
    startpos = genoinfo.start
    outfile = seqfile.replace(".fasta", "_geno_pheno.txt")
    if outfile == seqfile:
        # No ".fasta" in the name: append the suffix instead of overwriting
        # the input sequence file.
        outfile = seqfile + "_geno_pheno.txt"
    # The context manager closes the file even when a lookup below raises.
    with open(outfile, "w") as outf:
        print("name", "pos", "ref", "alt", "value", "avg", "sd", sep="\t", file=outf)
        for pos in sorted(posinfo):
            pos_abs = pos + startpos
            ref = posinfo[pos]["ref"]
            has_alt_values = False
            for alt, carriers in posinfo[pos]["alt"].items():
                values = list(chain.from_iterable(
                    phenoinfo[sample] for sample in carriers if sample in phenoinfo
                ))
                if not values:
                    continue
                has_alt_values = True
                name = str(pos_abs) + "_" + ref + "/" + alt
                avg_value = round(np.average(values), 4)
                sd = round(np.std(values), 4)
                for value in values:
                    print(name, pos_abs, ref, alt, value, avg_value, sd, sep="\t", file=outf)
            if has_alt_values:
                # Reference rows give each position a baseline group.
                ref_values = phenoinfo[refid]
                ref_avg = round(np.average(ref_values), 4)
                ref_sd = round(np.std(ref_values), 4)
                ref_name = str(pos_abs) + "_" + ref + "/" + ref
                for value in ref_values:
                    print(ref_name, pos_abs, ref, ref, value, ref_avg, ref_sd, sep="\t", file=outf)
    return outfile
158 |
159 |
def output_genopheno(genoinfo, seqfile, phenodata, outfile = "", startpos = 0):
    """Run a Kruskal-Wallis test per variant position and write a BED-like file.

    The table produced by `link_genopheno` is read back with pandas. For every
    position, the phenotype values are grouped by the "alt" column; when there
    is more than one group, `scipy.stats.kruskal` is applied and one line is
    written: chrom, start, end, statistic, p-value, -log10(p-value).

    Args:
        genoinfo: Object providing `chrom` (and whatever `link_genopheno`
            reads from it).
        seqfile: Sequence file passed to `link_genopheno`.
        phenodata: Phenotype file passed to `link_genopheno`.
        outfile: Output path; defaults to the table path with ".txt" replaced
            by ".bed".
        startpos: Offset added to every position before writing.

    Returns:
        Path of the written file.
    """
    infile = link_genopheno(genoinfo, seqfile, phenodata)
    geno_pheno = pd.read_table(infile)
    chrom = genoinfo.chrom
    if not outfile:
        outfile = infile.replace(".txt", ".bed")
    # The context manager closes the file even if the statistics step raises.
    with open(outfile, "w") as outf:
        for pos in pd.unique(geno_pheno.pos):
            # Filter the frame once per position instead of once per lookup.
            at_pos = geno_pheno[geno_pheno.pos == pos]
            ref = pd.unique(at_pos.ref)
            value_lst = [
                at_pos[at_pos.alt == alt].value.tolist()
                for alt in pd.unique(at_pos.alt)
            ]
            if len(value_lst) > 1:
                kruskal = stats.kruskal(*value_lst)
                statistic = kruskal[0]
                pvalue1 = kruskal[1]
                pvalue2 = -np.log10(pvalue1)
                real_pos = pos + startpos
                # NOTE(review): `ref` is the array of distinct ref values at
                # this position, so len(ref) counts distinct values rather
                # than the allele length -- confirm this is the intended end.
                print(chrom, real_pos, real_pos+len(ref), statistic, pvalue1, pvalue2, sep="\t", file=outf)
    return outfile
182 |
183 |
--------------------------------------------------------------------------------
/test/single/data/editing_results_test.bed:
--------------------------------------------------------------------------------
1 | Chr1 4004888 4007388 WT 98.0 1.5811388300841898
2 | Chr1 4006709 4006711 pZJP078-01-1-1-3 87.8 1.6911534525287764
3 | Chr1 4006505 4006509 pZJP078-01-1-1-3 87.8 1.6911534525287764
4 | Chr1 4006092 4006097 pZJP078-01-1-1-3 87.8 1.6911534525287764
5 | Chr1 4005476 4005478 pZJP078-01-1-1-3 87.8 1.6911534525287764
6 | Chr1 4006503 4006505 pZJP078-02-1-1-3 89.6 1.5937377450509227
7 | Chr1 4006091 4006100 pZJP078-02-1-1-3 89.6 1.5937377450509227
8 | Chr1 4005475 4005484 pZJP078-02-1-1-3 89.6 1.5937377450509227
9 | Chr1 4006490 4006521 pZJP078-02-1-2-2 83.3 1.3266499161421599
10 | Chr1 4006469 4006486 pZJP078-02-1-2-2 83.3 1.3266499161421599
11 | Chr1 4006092 4006099 pZJP078-02-1-2-2 83.3 1.3266499161421599
12 | Chr1 4005474 4005482 pZJP078-02-1-2-2 83.3 1.3266499161421599
13 | Chr1 4005244 4005279 pZJP078-02-1-2-2 83.3 1.3266499161421599
14 | Chr1 4006701 4006722 pZJP078-04-2-1-1 80.1 1.5620499351813308
15 | Chr1 4006505 4006508 pZJP078-04-2-1-1 80.1 1.5620499351813308
16 | Chr1 4006091 4006099 pZJP078-04-2-1-1 80.1 1.5620499351813308
17 | Chr1 4005411 4005482 pZJP078-04-2-1-1 80.1 1.5620499351813308
18 | Chr1 4005242 4005254 pZJP078-04-2-1-1 80.1 1.5620499351813308
19 | Chr1 4006698 4006715 pZJP078-05-1-1-1 95.5 1.0
20 | Chr1 4006508 4006514 pZJP078-05-1-1-1 95.5 1.0
21 | Chr1 4006092 4006098 pZJP078-05-1-1-1 95.5 1.0
22 | Chr1 4005475 4005490 pZJP078-05-1-1-1 95.5 1.0
23 | Chr1 4006487 4006514 pZJP078-05-2-1-2 85.2 1.5033296378372907
24 | Chr1 4006091 4006099 pZJP078-05-2-1-2 85.2 1.5033296378372907
25 | Chr1 4005474 4005481 pZJP078-05-2-1-2 85.2 1.5033296378372907
26 | Chr1 4005246 4005256 pZJP078-05-2-1-2 85.2 1.5033296378372907
27 | Chr1 4006704 4006725 pZJP078-07-2-1-1 90.0 1.224744871391589
28 | Chr1 4006507 4006511 pZJP078-07-2-1-1 90.0 1.224744871391589
29 | Chr1 4006088 4006097 pZJP078-07-2-1-1 90.0 1.224744871391589
30 | Chr1 4005465 4005482 pZJP078-07-2-1-1 90.0 1.224744871391589
31 | Chr1 4005236 4005256 pZJP078-07-2-1-1 90.0 1.224744871391589
32 | Chr1 4006704 4006725 pZJP078-07-1-2-3 87.1 0.66332495807108
33 | Chr1 4006507 4006511 pZJP078-07-1-2-3 87.1 0.66332495807108
34 | Chr1 4006088 4006097 pZJP078-07-1-2-3 87.1 0.66332495807108
35 | Chr1 4005465 4005482 pZJP078-07-1-2-3 87.1 0.66332495807108
36 | Chr1 4005236 4005256 pZJP078-07-1-2-3 87.1 0.66332495807108
37 | Chr1 4006506 4006512 pZJP078-08-1-1-1 81.7 1.0770329614269007
38 | Chr1 4005744 4006469 pZJP078-08-1-1-1 81.7 1.0770329614269007
39 | Chr1 4005474 4005482 pZJP078-08-1-1-1 81.7 1.0770329614269007
40 | Chr1 4005245 4005253 pZJP078-08-1-1-1 81.7 1.0770329614269007
41 | Chr1 4006506 4006512 pZJP078-08-2-1-1 83.5 1.0
42 | Chr1 4005744 4006469 pZJP078-08-2-1-1 83.5 1.0
43 | Chr1 4005474 4005482 pZJP078-08-2-1-1 83.5 1.0
44 | Chr1 4005245 4005253 pZJP078-08-2-1-1 83.5 1.0
45 | Chr1 4006506 4006512 pZJP078-08-2-2-2 82.7 2.6758176320519302
46 | Chr1 4005744 4006469 pZJP078-08-2-2-2 82.7 2.6758176320519302
47 | Chr1 4005474 4005482 pZJP078-08-2-2-2 82.7 2.6758176320519302
48 | Chr1 4005245 4005253 pZJP078-08-2-2-2 82.7 2.6758176320519302
49 | Chr1 4006706 4006714 pZJP078-08-1-2-1 73.1 1.019803902718557
50 | Chr1 4006503 4006505 pZJP078-08-1-2-1 73.1 1.019803902718557
51 | Chr1 4006092 4006100 pZJP078-08-1-2-1 73.1 1.019803902718557
52 | Chr1 4005553 4005615 pZJP078-08-1-2-1 73.1 1.019803902718557
53 | Chr1 4005263 4005552 pZJP078-08-1-2-1 73.1 1.019803902718557
54 | Chr1 4005246 4005262 pZJP078-08-1-2-1 73.1 1.019803902718557
55 | Chr1 4006706 4006714 pZJP078-08-3-1-3 74.9 1.2
56 | Chr1 4006503 4006505 pZJP078-08-3-1-3 74.9 1.2
57 | Chr1 4006092 4006100 pZJP078-08-3-1-3 74.9 1.2
58 | Chr1 4005553 4005615 pZJP078-08-3-1-3 74.9 1.2
59 | Chr1 4005263 4005552 pZJP078-08-3-1-3 74.9 1.2
60 | Chr1 4005246 4005262 pZJP078-08-3-1-3 74.9 1.2
61 | Chr1 4006503 4006505 pZJP078-09-1-2-1 87.2 0.9273618495495702
62 | Chr1 4006090 4006100 pZJP078-09-1-2-1 87.2 0.9273618495495702
63 | Chr1 4005474 4005480 pZJP078-09-1-2-1 87.2 0.9273618495495702
64 | Chr1 4005240 4005350 pZJP078-09-1-2-1 87.2 0.9273618495495702
65 | Chr1 4006709 4006715 pZJP078-10-1-1-2 95.5 1.0
66 | Chr1 4006505 4006510 pZJP078-10-1-1-2 95.5 1.0
67 | Chr1 4006091 4006100 pZJP078-10-1-1-2 95.5 1.0
68 | Chr1 4005474 4005480 pZJP078-10-1-1-2 95.5 1.0
69 | Chr1 4005251 4005253 pZJP078-10-1-1-2 95.5 1.0
70 | Chr1 4006506 4006713 pZJP078-10-1-2-1 87.0 1.3038404810405297
71 | Chr1 4006095 4006100 pZJP078-10-1-2-1 87.0 1.3038404810405297
72 | Chr1 4005475 4005482 pZJP078-10-1-2-1 87.0 1.3038404810405297
73 | Chr1 4005239 4005257 pZJP078-10-1-2-1 87.0 1.3038404810405297
74 | Chr1 4006709 4006711 pZJP078-10-5-1-1 87.6 1.019803902718557
75 | Chr1 4006504 4006514 pZJP078-10-5-1-1 87.6 1.019803902718557
76 | Chr1 4006095 4006100 pZJP078-10-5-1-1 87.6 1.019803902718557
77 | Chr1 4005473 4005481 pZJP078-10-5-1-1 87.6 1.019803902718557
78 | Chr1 4005250 4005255 pZJP078-10-5-1-1 87.6 1.019803902718557
79 | Chr1 4006504 4006717 pZJP078-12-2-1-1 84.9 0.9165151389911681
80 | Chr1 4006092 4006097 pZJP078-12-2-1-1 84.9 0.9165151389911681
81 | Chr1 4005474 4005484 pZJP078-12-2-1-1 84.9 0.9165151389911681
82 | Chr1 4005246 4005257 pZJP078-12-2-1-1 84.9 0.9165151389911681
83 | Chr1 4006504 4006717 pZJP078-12-2-1-2 83.5 1.0
84 | Chr1 4006092 4006097 pZJP078-12-2-1-2 83.5 1.0
85 | Chr1 4005474 4005484 pZJP078-12-2-1-2 83.5 1.0
86 | Chr1 4005246 4005257 pZJP078-12-2-1-2 83.5 1.0
87 | Chr1 4006504 4006715 pZJP078-12-3-1-1 80.1 1.42828568570857
88 | Chr1 4005473 4006102 pZJP078-12-3-1-1 80.1 1.42828568570857
89 | Chr1 4005250 4005253 pZJP078-12-3-1-1 80.1 1.42828568570857
90 | Chr1 4006158 4006165 pZJP079-1-1-01-2 91.9 1.5620499351813308
91 | Chr1 4005990 4005994 pZJP079-1-1-01-2 91.9 1.5620499351813308
92 | Chr1 4005168 4005177 pZJP079-1-1-01-2 91.9 1.5620499351813308
93 | Chr1 4006835 4006852 pZJP079-1-1-02-1 80.9 1.2806248474865698
94 | Chr1 4006159 4006167 pZJP079-1-1-02-1 80.9 1.2806248474865698
95 | Chr1 4005945 4006001 pZJP079-1-1-02-1 80.9 1.2806248474865698
96 | Chr1 4005171 4005351 pZJP079-1-1-02-1 80.9 1.2806248474865698
97 | Chr1 4006835 4006852 pZJP079-1-1-01-1 80.4 0.8602325267042626
98 | Chr1 4006159 4006167 pZJP079-1-1-01-1 80.4 0.8602325267042626
99 | Chr1 4005945 4006001 pZJP079-1-1-01-1 80.4 0.8602325267042626
100 | Chr1 4005171 4005351 pZJP079-1-1-01-1 80.4 0.8602325267042626
101 | Chr1 4006611 4006864 pZJP079-5-1-02-2 80.1 1.8
102 | Chr1 4006159 4006206 pZJP079-5-1-02-2 80.1 1.8
103 | Chr1 4005989 4006024 pZJP079-5-1-02-2 80.1 1.8
104 | Chr1 4005330 4005366 pZJP079-5-1-02-2 80.1 1.8
105 | Chr1 4005171 4005174 pZJP079-5-1-02-2 80.1 1.8
106 | Chr1 4005348 4006852 pZJP079-6-3-01-3 66.2 1.4696938456699067
107 | Chr1 4005171 4005174 pZJP079-6-3-01-3 66.2 1.4696938456699067
108 | Chr1 4006163 4006166 pZJP079-7-1-01-2 68.5 1.1832159566199232
109 | Chr1 4005168 4005997 pZJP079-7-1-01-2 68.5 1.1832159566199232
110 | Chr1 4006163 4006166 pZJP079-7-2-03-2 68.8 1.5033296378372907
111 | Chr1 4005168 4005997 pZJP079-7-2-03-2 68.8 1.5033296378372907
112 | Chr1 4005987 4006558 pZJP079-8-1-01-2 74.3 1.0770329614269007
113 | Chr1 4005287 4005378 pZJP079-8-1-01-2 74.3 1.0770329614269007
114 | Chr1 4005171 4005174 pZJP079-8-1-01-2 74.3 1.0770329614269007
115 | Chr1 4006833 4006852 pZJP079-7-2-23-1 69.6 2.0591260281974
116 | Chr1 4006156 4006166 pZJP079-7-2-23-1 69.6 2.0591260281974
117 | Chr1 4005171 4005993 pZJP079-7-2-23-1 69.6 2.0591260281974
118 | Chr1 4006163 4006166 pZJP079-3-3-04-1 96.3 1.2489995996796797
119 | Chr1 4006138 4006166 pZJP079-8-1-14-1 89.6 1.3564659966250536
120 | Chr1 4005980 4005996 pZJP079-8-1-14-1 89.6 1.3564659966250536
121 | Chr1 4005346 4005354 pZJP079-8-1-14-1 89.6 1.3564659966250536
122 | Chr1 4005171 4005174 pZJP079-8-1-14-1 89.6 1.3564659966250536
123 |
--------------------------------------------------------------------------------
/lib/misc.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import pandas as pd
4 | from scipy import stats
5 | import pyBigWig
6 | from pybedtools import BedTool
7 |
8 |
def check_outdir(path):
    """
    Make sure the directory that will contain `path` exists.

    Only the parent directory of `path` (os.path.dirname) is created, never
    `path` itself, so callers pass the output file they are about to write.

    Parameters:
    1. path - Path of an output file whose parent directory is required
    """
    dirpath = os.path.abspath(os.path.dirname(path))
    if not os.path.exists(dirpath):
        print("Create directory:", dirpath)
        # exist_ok avoids a FileExistsError when another process creates the
        # same directory between the existence check and this call.
        os.makedirs(dirpath, exist_ok=True)
14 |
15 |
def split_region(geneinfo):
    """
    Map consecutive bin indices to bin start positions.

    Bins begin at geneinfo.start and advance by geneinfo.step while the
    position stays below geneinfo.end.

    Returns a dict {bin index: start position}.
    """
    bin_starts = range(geneinfo.start, geneinfo.end, geneinfo.step)
    return dict(enumerate(bin_starts))
26 |
27 |
def get_chrom_sizes(file):
    """
    Load chromosome lengths from a tab-separated file.

    Lines with exactly two columns are read as `chrom<TAB>length`; lines with
    any other column count take the length from the third column.
    Lines starting with "#" and blank lines are skipped.

    Parameters:
    1. file - Path of the chromosome sizes file

    Returns a dict {chromosome name: length as int}.
    """
    chrlens = {}
    with open(file) as infile:
        for line in infile:
            # A blank (e.g. trailing) line has no columns to index.
            if line.startswith("#") or not line.strip():
                continue
            info = line.rstrip().split("\t")
            length = info[1] if len(info) == 2 else info[2]
            chrlens[info[0]] = int(length)
    return chrlens
43 |
44 |
def bigwig2bedGraph(bwfile, geneinfo, chrlens, outfile, ext = 50):
    """
    Convert a bigwig file to a bedGraph file in single-base-pair resolution.

    One line `chrom, i, i+1, value` is written for every base i from
    max(1, geneinfo.start - ext) up to (not including)
    min(geneinfo.end + ext, chromosome length). NaN values are written as 0.
    Bases whose value cannot be fetched are skipped.

    Parameters:
    1. bwfile - Path of the bigwig file
    2. geneinfo - Object providing chrom, start and end of the region
    3. chrlens - Dict of chromosome lengths
    4. outfile - Path of the bedGraph file to write
    5. ext - Number of bases added on both sides of the region
    """
    bwin = pyBigWig.open(bwfile)
    try:
        chrom = geneinfo.chrom
        start = geneinfo.start
        end = geneinfo.end
        chrom_len = chrlens[chrom]
        check_outdir(outfile)
        with open(outfile, "w") as outf:
            for i in range(max(1, start-ext), min(end+ext, chrom_len)):
                try:
                    value = bwin.values(chrom, i, i+1)[0]
                    if np.isnan(value):
                        value = 0
                    print(chrom, i, i+1, value, sep="\t", file=outf)
                except Exception:
                    # Best effort per base. Exception (not a bare except) so
                    # that Ctrl-C / SystemExit still stop the loop.
                    continue
    finally:
        # Release the bigwig handle even when something above fails.
        bwin.close()
64 |
65 |
def fimo_filter(gfffile, matrixinfo, geneinfo, outfile, pcut = 1e-5, qcut = 1):
    """
    Filter FIMO GFF results by p-value and q-value and write them sorted.

    Parameters:
    1. gfffile - FIMO result file in GFF format
    2. matrixinfo - Motif annotation file. When this path string starts with
       "JASPAR", lines starting with "MOTIF" give `MOTIF <id> <name>`;
       otherwise the first two whitespace-separated columns of every line
       not starting with "#" are used as `<id> <label>`
    3. geneinfo - Object providing chrom and start; start is added to the
       GFF coordinates
    4. outfile - Output file (tab-separated: chrom, start, end, label, ".",
       strand, p-value, q-value)
    5. pcut / qcut - Keep hits with p-value <= pcut and q-value <= qcut
    """
    # Build the motif id -> label lookup table
    from_jaspar = matrixinfo.startswith("JASPAR")
    motif_family = {}
    with open(matrixinfo, "r") as handle:
        for record in handle:
            if from_jaspar:
                if record.startswith("MOTIF"):
                    fields = record.rstrip().split()
                    motif_family[fields[1]] = fields[2]
            elif not record.startswith("#"):
                fields = record.rstrip().split()
                motif_family[fields[0]] = fields[1]

    chrom = geneinfo.chrom
    offset = geneinfo.start

    # Collect the hits that pass both cutoffs
    hits = []
    with open(gfffile, "r") as handle:
        for record in handle:
            if record.startswith("#"):
                continue
            cols = record.rstrip().split("\t")
            hit_start = int(cols[3]) + offset
            hit_end = int(cols[4]) + offset
            strand = cols[6]
            attrs = cols[8].split(";")
            label = motif_family[attrs[0].split("=")[1]]
            pvalue = float(attrs[2].split("=")[1])
            # The q-value attribute is split on "= " (equals sign plus space)
            qvalue = float(attrs[3].split("= ")[1])
            if pvalue <= pcut and qvalue <= qcut:
                hits.append([chrom, hit_start, hit_end, label, ".", strand,
                             pvalue, qvalue])

    with open(outfile, "w") as outf:
        for hit in sorted(hits):
            outf.write("\t".join(map(str, hit)) + "\n")
112 |
113 |
114 | def smooth_scores_fill2(info, posinfo, minscore=0.01, minratio=0.5):
115 | """
116 | Fill zero scores using neighbouring scores and a floor value, then smooth them.
117 |
118 | Mandatory parameters:
119 | 1. info - A list of scores, one per bin; it is copied, not modified
120 | 2. posinfo - Dict mapping each bin index to the position of that bin
121 | Floor value: minval = max(smallest non-zero score, minscore) * minratio
122 | """
123 |
124 | # Work on a copy so the caller's score list is not modified
125 | new_info = info.copy()
126 | minval = max(min([x for x in new_info if x]), minscore)*minratio
127 | zerocnt = 0
128 | flag = 0
129 | for i in posinfo:
130 | pos = posinfo[i]
131 | score = new_info[i]
132 | if i == 0:
133 | if score == 0:
134 | flag = 1
135 | zerocnt += 1
136 | zerostart = i
137 | continue
138 | elif i == len(new_info)-1:
139 | if score == 0:
140 | for j in range(zerostart+1, len(new_info), 1):
141 | score1 = new_info[j-1]
142 | score2 = new_info[j]
143 | if score1 == new_info[zerostart]:
144 | new_info[j] = np.average([score1*minratio, minval])
145 | else:
146 | new_info[j] = np.average([score1, minval])
147 | else:
148 | if score == 0:
149 | zerocnt += 1
150 | zerostart = i
151 | continue
152 | else:
153 | if flag:
154 | for j in range(i, zerostart, -1):
155 | new_info[j-1] = np.average([score*minratio, minval])
156 | flag = 0
157 | else:
158 | if zerocnt:
159 | right = int(score*zerocnt/(score+new_info[zerostart]))
160 | left = zerocnt - right
161 | for j in range(zerostart+1, zerostart+left+1, 1):
162 | score1 = new_info[j-1]
163 | score2 = new_info[j]
164 | if score1 == new_info[zerostart]:
165 | new_info[j] = np.average([score1*minratio, minval])
166 | else:
167 | new_info[j] = np.average([score1, minval])
168 | for k in range(i-1, zerostart+left, -1):
169 | score1 = new_info[k]
170 | score2 = new_info[k+1]
171 | if score2 == score:
172 | new_info[k] = np.average([minval, score2*minratio])
173 | else:
174 | new_info[k] = np.average([minval, score2])
175 | zerostart = i
176 | zerocnt = 0
177 | # Smooth the filled scores with smooth_scores2 (keep_tails=False)
178 | smooth_info = smooth_scores2(new_info, posinfo, keep_tails=False)
179 |
180 | return smooth_info
181 |
182 |
183 | def smooth_scores_fill(info, posinfo):
184 | """
185 | Fill zero scores using neighbouring scores and a floor value (bottom), then smooth them.
186 | Returns an empty dict when the non-zero scores sum to zero.
187 | Mandatory parameters:
188 | 1. info - A list of scores, one per bin; it is copied, not modified
189 | 2. posinfo - Dict mapping each bin index to the position of that bin
190 | Otherwise returns smooth_scores2(filled scores, posinfo, keep_tails=False).
191 | """
192 |
193 | new_info = info.copy()
194 | smooth_info = {}
195 | nonzero = [x for x in new_info if x]
196 | if sum(nonzero):
197 | minscore = min(nonzero)
198 | maxscore = max(new_info)
199 | else:
200 | return smooth_info
201 | # Fill floor: 0.1 when smallest non-zero score / maximum score exceeds 0.1, else the smallest non-zero score
202 | if minscore / maxscore > 0.1:
203 | bottom = 0.1
204 | else:
205 | bottom = minscore
206 | for i in range(len(new_info)):
207 | if i:
208 | score0 = new_info[i-1]
209 | score1 = new_info[i]
210 | if not score1:
211 | for j in range(i+1, len(new_info)):
212 | score2 = new_info[j]
213 | if score2:
214 | break
215 | if j == len(new_info)-1 and score2 == 0:
216 | score2 = bottom
217 | ranges = j - i
218 | diff1 = abs(score0 - bottom)
219 | diff2 = abs(score2 - bottom)
220 | total = diff1 + diff2
221 | if total:
222 | mid = int(ranges * diff1 / total)
223 | else:
224 | mid = 0
225 | # NOTE(review): when i is the last bin, range(i+1, len(new_info)) is empty and the loop above assigns neither j nor score2 - confirm this case
226 | if ranges > 1:
227 | for k in range(mid):
228 | new_info[i+k] = score0 - diff1 * (k+1)/(mid+1)
229 | for k in range(mid+1, ranges):
230 | new_info[i+k] = bottom + diff2 * (k-mid)/(ranges-mid)
231 | new_info[i+mid] = bottom
232 | else:
233 | score = new_info[i]
234 | if score:
235 | pass
236 | else:
237 | new_info[i] = bottom
238 |
239 | smooth_info = smooth_scores2(new_info, posinfo, keep_tails=False)
240 |
241 | return smooth_info
242 |
243 |
244 | def smooth_scores1(info, posinfo, keep_tails=True):
245 | """
246 | Linearly interpolate the bins between two non-zero scores, scale all
247 | scores by the maximum score, then smooth them with smooth_scores2.
248 | Returns an empty dict when the maximum score is zero.
249 | Mandatory parameters:
250 | 1. info - A list of scores, one per bin; it is copied, not modified
251 | 2. posinfo - Dict mapping each bin index to the position of that bin
252 | keep_tails is passed through to smooth_scores2 unchanged.
253 | """
254 |
255 | # Work on a copy so the caller's score list is not modified
256 | new_info = info.copy()
257 | score_num = len(new_info)
258 | # Fill the gap between two non-zero scores by linear interpolation (n starts at 0)
259 | for i in range(score_num):
260 | score = new_info[i]
261 | if i == 0:
262 | tmp_score = score
263 | tmp_idx = i
264 | else:
265 | if score and tmp_score:
266 | interval = i - tmp_idx
267 | if interval > 1:
268 | for n, j in enumerate(range(tmp_idx+1, i)):
269 | new_info[j] = tmp_score + (score - tmp_score) * n / (i - tmp_idx)
270 | tmp_score = score
271 | tmp_idx = i
272 | smooth_info = {}
273 | if max(new_info):
274 | new_info = [x/max(new_info) for x in new_info]
275 | else:
276 | return smooth_info
277 | # Smooth the scaled scores
278 | smooth_info = smooth_scores2(new_info, posinfo, keep_tails=keep_tails)
279 |
280 | return smooth_info
281 |
282 |
283 | def smooth_scores2(info, posinfo, keep_tails=False):
284 | """
285 | Smooth the scores by averaging each one with its adjacent scores.
286 |
287 | Mandatory parameters:
288 | 1. info - A list of scores, one per bin; it is copied, not modified
289 | 2. posinfo - Dict mapping each bin index to the position of that bin
290 |
291 | Alternative parameters:
292 | 1. keep_tails - Whether or not to keep the missing values in the two tails
293 | Returns a dict {position: smoothed score / maximum smoothed score}; empty when max(info) is zero.
294 | """
295 |
296 | # Work on a copy so the caller's score list is not modified
297 | new_info = info.copy()
298 | smooth_info = {}
299 | if not max(new_info):
300 | return smooth_info
301 | score_num = len(new_info)
302 | begin = 0
303 | end = score_num
304 | # Find the two tails: begin/end are set where the average of the scores before/from index i is 0
305 | for i in range(end):
306 | if i:
307 | begin_avg = np.average(new_info[:i])
308 | else:
309 | begin_avg = new_info[i]
310 | if i == end-1:
311 | end_avg = new_info[i]
312 | else:
313 | end_avg = np.average(new_info[i:])
314 | if begin_avg == 0:
315 | begin = i
316 | if end_avg == 0:
317 | end = i
318 | break
319 | # Average each score with its adjacent scores; new_info is updated in place, so the loop order matters
320 | if not keep_tails:
321 | for i in range(begin, 0, -1):
322 | if i:
323 | if begin == score_num-1:
324 | score = new_info[i]
325 | else:
326 | score = (new_info[i-1] + new_info[i] + new_info[i+1]) / 3
327 | else:
328 | score = (new_info[i] + new_info[i+1]) / 2
329 | new_info[i] = score
330 | for i in range(end, score_num):
331 | if i < score_num - 1:
332 | score = (new_info[i-1] + new_info[i] + new_info[i+1]) / 3
333 | else:
334 | score = (new_info[i-1] + new_info[i]) / 2
335 | new_info[i] = score
336 | for i in range(begin, end):
337 | if i == begin:
338 | if begin == score_num-1:
339 | score = new_info[i]
340 | else:
341 | score = (new_info[i] + new_info[i+1]) / 2
342 | elif i == end - 1:
343 | score = (new_info[i-1] + new_info[i]) / 2
344 | else:
345 | score = (new_info[i-1] + new_info[i] + new_info[i+1]) / 3
346 | new_info[i] = score
347 | for i in posinfo:
348 | # Key each smoothed score by its real position, scaled by the maximum smoothed score
349 | pos = posinfo[i]
350 | smooth_info[pos] = new_info[i] / max(new_info)
351 |
352 | return smooth_info
353 |
354 |
def merge_regions(regions, geneinfo, minlen = 2, mindist = 1):
    """
    Filter and merge key regions.

    Parameters:
    1. regions - Iterable of (position, score) pairs, one per bin
    2. geneinfo - Object providing chrom and binsize
    3. minlen - Minimum region length, in bins, for a region to be kept
    4. mindist - A bin joins the current region when its start is at most
       mindist bins beyond the end of the previous bin

    Returns a list of [chromosome, start, end, average score].
    """
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize
    # region start -> [region end, scores of the bins in the region]
    merged = {}
    current = None
    prev_end = None
    for pos, score in regions:
        bin_end = pos + binsize
        if merged and pos - prev_end <= binsize * mindist:
            # Close enough to the previous bin: extend the current region
            merged[current][0] = bin_end
            merged[current][1].append(score)
        else:
            # First bin, or a gap that is too wide: open a new region
            merged[pos] = [bin_end, [score]]
            current = pos
        prev_end = bin_end

    return [[chromosome, region_start, span[0], np.average(span[1])]
            for region_start, span in merged.items()
            if span[0] - region_start >= binsize * minlen]
385 |
386 |
387 | def calc_importance(phenotypes, scorelist, namelist, geneinfo, outdir="./", side="none"):
388 | # For each (scores, name) pair: print the Pearson correlation between per-sample feature scores and phenotype changes relative to the "WT" row, then write all per-sample scores to scores_by_sample.txt
389 | ziplist = zip(scorelist, namelist)
390 | gene = geneinfo.gene
391 | genename = geneinfo.alias
392 | if genename == "NA":
393 | gene_alias = gene
394 | else:
395 | gene_alias = genename
396 | sample_scores = {}
397 | for item in ziplist:
398 | scores = item[0]
399 | name = item[1]
400 | score_bed = BedTool("\n".join(["\t".join(map(str, [geneinfo.chrom, x, x+geneinfo.binsize, scores[x]]))
401 | for x in scores]),
402 | from_string=True)
403 | pheno_bed = BedTool(phenotypes)
404 | intersect = pheno_bed.intersect(score_bed, wo=True)
405 | fscores = {}
406 | for interval in intersect:
407 | info = str(interval).rstrip().split("\t")
408 | sample = info[3]
409 | if sample == "WT":
410 | wt_value = float(info[4])
411 | continue
412 | ratio = int(info[-1]) / geneinfo.binsize
413 | if sample in fscores:
414 | fscores[sample]["feature"] += float(info[-2]) * ratio
415 | else:
416 | fscores[sample] = {}
417 | if side == "none":
418 | fscores[sample]["pheno"] = abs(float(info[4]) - wt_value)
419 | else:
420 | fscores[sample]["pheno"] = float(info[4]) - wt_value
421 | fscores[sample]["feature"] = float(info[-2]) * ratio
422 | min_score = min([fscores[x]["feature"] for x in fscores])
423 | max_score = max([fscores[x]["feature"] for x in fscores])
424 | avg_pheno = np.average([fscores[x]["pheno"] for x in fscores])
425 | if avg_pheno < 0:
426 | for s in fscores:
427 | fscores[s]["pheno"] *= -1
428 | min_pheno = min([fscores[x]["pheno"] for x in fscores])
429 | max_pheno = max([fscores[x]["pheno"] for x in fscores])
430 | feature_scores = []
431 | pheno_scores = []
432 | for s in fscores:
433 | score1 = (fscores[s]["feature"]-min_score)/(max_score-min_score)
434 | feature_scores.append(score1)
435 | if side == "none":
436 | score2 = fscores[s]["pheno"]
437 | else:
438 | score2 = (fscores[s]["pheno"]-min_pheno)/(max_pheno-min_pheno)
439 | pheno_scores.append(score2)
440 | if s not in sample_scores:
441 | sample_scores[s] = {}
442 | sample_scores[s]["pheno"] = score2
443 | sample_scores[s][name] = score1
444 | pearson = stats.pearsonr(feature_scores, pheno_scores)
445 | print(name, "Pearson correlation:", pearson[0])
446 |
447 | outfile = outdir + "/" + gene_alias + "/scores_by_sample.txt"
448 | df = pd.DataFrame(sample_scores).T
449 | df.index.name = "sample"
450 | df.to_csv(outfile, sep="\t")
451 |
452 |
--------------------------------------------------------------------------------
/single.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | ##################################################
4 | # CRISPR-Cas12a promoter editing (CAPE) #
5 | # Script: Single Mode #
6 | ##################################################
7 |
8 | import os
9 | import sys
10 | import shutil
11 | from glob import glob
12 | import configparser
13 | from time import time
14 | from multiprocessing import Pool
15 | from pybedtools import BedTool, cleanup
16 |
17 | from lib import misc
18 | from lib.features import *
19 | from lib.cores import output_cores
20 |
21 |
class Features_info:
    """Settings for one gene and one feature file, passed to the feature and analysis steps."""

    def __init__(self):
        # Placeholder defaults; main() overwrites them for every gene/file.
        self.geneinfo = Geneinfo()
        self.workdir = "results"
        self.outname = "name"
        self.feature = "feature"
        self.slop = 200
        self.chrlens = {}
        self.config = {}
32 |
33 |
def get_gene_info(gene_file):
    """
    Read the first gene record of a tab-separated gene file.

    Lines starting with "#" and empty lines are skipped. Columns 1-4 give
    chromosome, start, end and gene name; column 6 gives the strand.
    Reading stops after the first record.

    Returns a dict {gene name: [chrom, start, end, strand]}.
    """
    genes_info = {}
    with open(gene_file) as infile:
        for line in infile:
            if line.startswith(("#", "\n")):
                continue
            fields = line.rstrip().split("\t")
            chrom, start, end, gene_name = (fields[0], int(fields[1]),
                                            int(fields[2]), fields[3])
            strand = fields[5]
            genes_info[gene_name] = [chrom, start, end, strand]
            # Only the first record is used
            break

    print("Genes infomation loaded.\n")

    return genes_info
53 |
54 |
def generate_regions(geneinfo, workdir, gene, chrlens):
    """
    Write the analysed region of a gene to <workdir>/<gene>/analysis_region.bed.

    The start is clipped to 0 and the end to the chromosome length.
    An existing file is left untouched.

    Returns the path of the region file.
    """
    chrom, start, end, strand = (geneinfo.chrom, geneinfo.start,
                                 geneinfo.end, geneinfo.strand)
    chrom_len = chrlens[chrom]
    outfile = workdir + "/" + gene + "/analysis_region.bed"
    misc.check_outdir(outfile)
    if not os.path.exists(outfile):
        with open(outfile, "w") as outf:
            fields = [chrom, max(0, start), min(end, chrom_len), gene, '.', strand]
            outf.write("\t".join(map(str, fields)) + "\n")

    return outfile
71 |
72 |
def generate_features(Features_info):
    """
    Extract the part of one feature file that covers one gene region.

    The output goes to <workdir>/<gene>/<outname>_raw.bed when "peak" is in
    outname, otherwise to <workdir>/<gene>/<outname>_raw.bedGraph. An existing
    output file is reused. Files ending in .bw/.bigwig are converted with
    misc.bigwig2bedGraph; other files are intersected with the region
    extended by slop on both sides.

    Returns the path of the output file.
    """
    settings = Features_info
    geneinfo = settings.geneinfo
    feature_file = settings.feature
    slop = settings.slop

    suffix = "_raw.bed" if "peak" in settings.outname else "_raw.bedGraph"
    outfile = settings.workdir + "/" + geneinfo.gene + "/" + settings.outname + suffix
    if os.path.exists(outfile):
        return outfile

    if feature_file.endswith((".bw", ".bigwig")):
        misc.bigwig2bedGraph(feature_file, geneinfo, settings.chrlens, outfile, ext = slop)
    else:
        window = "\t".join([geneinfo.chrom,
                            str(max(0, geneinfo.start-slop)),
                            str(geneinfo.end+slop)])
        target_bed = BedTool(window, from_string = True)
        BedTool(feature_file).intersect(target_bed, wa=True).moveto(outfile)

    cleanup()

    return outfile
103 |
104 |
105 | def generate_features_from_large(inputfile, genes_info, upstream, slop, workdir, feature):
106 |
107 | basemap = {}
108 | existed = set()
109 | num = 0
110 | genelens = len(genes_info)
111 | for gene in genes_info:
112 | outfile = os.path.join(workdir, gene, feature+"_raw.bedGraph")
113 | if os.path.exists(outfile):
114 | filesize = os.path.getsize(outfile)
115 | if filesize > 10:
116 | existed.add(gene)
117 | chrom, start, end, strand = genes_info[gene][:4]
118 | if chrom not in basemap:
119 | basemap[chrom] = {}
120 | if strand == "+":
121 | for i in range(max(0, start-upstream-slop), start+slop+1):
122 | if i in basemap[chrom]:
123 | basemap[chrom][i].append(gene)
124 | else:
125 | basemap[chrom][i] = [gene]
126 | else:
127 | for i in range(end-slop, end+upstream+slop+1):
128 | if i in basemap[chrom]:
129 | basemap[chrom][i].append(gene)
130 | else:
131 | basemap[chrom][i] = [gene]
132 | print("%s / %s genes processed, %s existed genes." % (num, genelens, len(existed)),
133 | end="\r")
134 | num += 1
135 | print("Load genes completed.", " "*30)
136 | genenums = []
137 | total_num = max(1, genelens-len(existed))
138 | outf = {}
139 | split = 500
140 | kept = split * 0.9
141 | tmp_cnt = 0
142 | tmp_mod = 0
143 | num = 0
144 | with open(inputfile) as infile:
145 | for line in infile:
146 | chrom, start, end = line.rstrip().split("\t")[:3]
147 | if chrom not in basemap:
148 | continue
149 | if feature == "CNS":
150 | s = int(start)
151 | else:
152 | s = int((int(start) + int(end)) / 2)
153 | if s in basemap[chrom]:
154 | genes = basemap[chrom][s]
155 | for gene in genes:
156 | if gene in existed:
157 | continue
158 | else:
159 | outfile = os.path.join(workdir, gene, feature+"_raw.bedGraph")
160 | if gene not in outf:
161 | outf[gene] = open(outfile, "w")
162 | # Output handles are opened lazily, the first time a gene receives a line,
163 | # and kept in outf so later lines for the same gene reuse the handle.
164 | # Handles are closed in slices of genenums further down and again for
165 | # every gene in outf after the input has been read.
166 | # NOTE(review): a closed handle stays in outf, so a later line for that
167 | # gene would be printed to a closed file - confirm this cannot happen.
168 | print(line.rstrip(), file=outf[gene])
169 | if gene not in genenums:
170 | genenums.append(gene)
171 | cnt = len(genenums)
172 | remain = cnt % split
173 | mod = cnt // split
174 | if mod - tmp_mod > 0:
175 | st = max(0, int(split * (mod - 1) - kept - 1))
176 | ed = int(split * mod - kept)
177 | # Close a slice of the earliest opened handles (presumably to cap the number of open files - confirm)
178 | for j in genenums[st:ed]:
179 | outf[j].close()
180 | if tmp_cnt != cnt:
181 | pct = round(cnt * 100 / total_num, 2)
182 | print(pct, "%", " output.", end="\r")
183 | tmp_cnt = cnt
184 | tmp_mod = mod
185 | print("All files output.")
186 |
187 | for gene in outf:
188 | outf[gene].close()
189 |
190 | return cnt
191 |
192 |
def run_analysis(feature_info):
    """
    Score one gene from its per-gene raw feature files and derive key regions.

    Reads the raw files under <workdir>/<gene>/, computes open chromatin,
    histone modification, TF motif, conserved sequence and (when the file
    exists) genotype/phenotype scores, aggregates them, and then calls
    define_key_regions and output_cores.

    Parameters:
    1. feature_info - Object providing workdir and geneinfo
       (geneinfo.gene names the per-gene directory)

    Returns (gene, 0) when <workdir>/<gene>/aggregate.bedGraph already exists
    and is larger than 10 bytes (nothing is recomputed), otherwise (gene, 1).
    """

    workdir = feature_info.workdir
    geneinfo = feature_info.geneinfo
    gene = feature_info.geneinfo.gene

    # Check if calculated
    check = os.path.join(workdir, gene, "aggregate.bedGraph")
    if os.path.exists(check):
        filesize = os.path.getsize(check)
        if filesize > 10:
            return (gene, 0)

    # Open chromatin
    # glob returns matches in arbitrary order; sort both lists so the score
    # and peak files of the same replicate are paired by index.
    ocscores = sorted(glob(os.path.join(workdir, gene, "OCscores*_raw.bedGraph")))
    ocpeaks = sorted(glob(os.path.join(workdir, gene, "OCpeaks*_raw.bed")))
    # Calculate scores
    ocscorelist = []
    for idx, ocscorefile in enumerate(ocscores):
        if idx + 1 > len(ocpeaks):
            ocpeakfile = ""
        else:
            ocpeakfile = ocpeaks[idx]
        if len(ocscores) > 1:
            ocname = os.path.basename(ocscorefile).split("_raw")[0]
        else:
            ocname = "OCscores"
        scores_oc1 = openchromatin_scores(geneinfo, ocscorefile, ocpeakfile,
                                          samplename = ocname, outdir = workdir)
        ocscorelist.append(scores_oc1)
    if len(ocscores) > 1:
        scores_oc = merge_reps(geneinfo, ocscorelist, samplename = "OCscores", outdir = workdir)
    else:
        # NOTE(review): scores_oc1 is unbound when no OCscores file was found
        scores_oc = scores_oc1

    # Histone modification
    ptmfiles = sorted(glob(os.path.join(workdir, gene, "PTM*_raw.bedGraph")))
    # Calculate scores
    ptmscorelist = []
    for ptmscorefile in ptmfiles:
        if len(ptmfiles) > 1:
            ptmname = os.path.basename(ptmscorefile).split("_raw")[0]
        else:
            ptmname = "PTMscores"
        scores_ptm1 = ptm_scores(geneinfo, ptmscorefile, ocname="OCscores",
                                 samplename = ptmname, outdir = workdir)
        ptmscorelist.append(scores_ptm1)
    if len(ptmfiles) > 1:
        scores_ptm = merge_reps(geneinfo, ptmscorelist, samplename = "PTMscores", outdir = workdir)
    else:
        # NOTE(review): scores_ptm1 is unbound when no PTM file was found
        scores_ptm = scores_ptm1

    # TF motifs
    motiffile = os.path.join(workdir, gene, "motifs_raw.bedGraph")
    # Calculate scores
    scores_motif = motif_scores(geneinfo, motiffile, outdir = workdir)

    # Conserved sequences
    cnsfile = os.path.join(workdir, gene, "CNS_raw.bedGraph")
    # Calculate scores
    scores_cns = cns_scores(geneinfo, cnsfile, outdir = workdir)

    # Genotype versus Phenotype (MBKbase)
    genopheno = os.path.join(workdir, gene, "genopheno_raw.bedGraph")
    # Calculate scores
    if os.path.exists(genopheno):
        scores_genopheno = genopheno_scores(geneinfo, genopheno, outdir = workdir)
    else:
        scores_genopheno = {}

    # Aggregate scores
    # scorelist, namelist and weightlist are built side by side so every
    # score set keeps its own label and weight.
    scorelist = [scores_oc, scores_motif, scores_cns, scores_ptm]
    namelist = ["DHS", "TF motif", "CNS", "H3K27ac"]
    weightlist = [0.25, 0.2, 0.3, 0.1]
    if scores_genopheno:
        scorelist.append(scores_genopheno)
        namelist.append("GenoPheno")
        weightlist.append(0.05)
    scores_aggregate = aggregate_scores(geneinfo, scorelist, weightlist, outdir = workdir)

    # Load phenodata from CRISPR-edited results
    phenodata = os.path.join(workdir, gene, "phenoscores_raw.bedGraph")
    # Calculate scores
    if os.path.exists(phenodata):
        scores_phenodata = phenodata_scores(geneinfo, phenodata, method = "kmeans2",
                                            outdir = workdir)
    else:
        scores_phenodata = {}

    # Find the feature importance
    if scores_phenodata:
        misc.calc_importance(phenodata, scorelist+[scores_aggregate],
                             namelist+["Aggregate"], geneinfo, side="both",
                             outdir = workdir)

    # Define key regions
    define_key_regions(geneinfo, scores_aggregate, phenodata, outdir = workdir)

    # Get the core of key regions
    scorefile = os.path.join(workdir, gene, "aggregate.bedGraph")
    regionfile = os.path.join(workdir, gene, "key_regions_merged.bed")
    output_cores(geneinfo, scorefile, regionfile)

    cleanup()

    return (gene, 1)
302 |
303 |
def check_options(config):
    """
    Validate the loaded configuration, print it, and clamp numeric options.

    - General/workdir is made absolute, or set to "results" when empty.
    - Every non-empty file listed in the Features section must exist,
      otherwise an error is printed and the program exits with status 1.
    - threads is capped at the CPU count, slop at 50000, upstream at 10000,
      binsize at half of upstream, and step at binsize.

    Parameters:
    1. config - configparser.ConfigParser holding the General, Features and
       Genes sections

    Returns the same config object.
    """

    print("# Using the following options:")
    general = config["General"]
    if general["workdir"]:
        general["workdir"] = os.path.abspath(general["workdir"])
    else:
        general["workdir"] = "results"
    misc.check_outdir(general["workdir"])
    for section in config.sections():
        for param in config.options(section):
            values = config[section][param]
            if section == "Features":
                # A feature option may list several files separated by commas
                if "," in values:
                    values = values.split(",")
                    files = values
                else:
                    files = [values]
                for file in files:
                    if file and not os.path.exists(file):
                        print("# Error, cannot find the %s: %s" % (param, file))
                        sys.exit(1)
            print("%s: %s" % (param, values))
    # ConfigParser only accepts string values (anything else raises
    # TypeError), and the caller parses them back with int(), so every
    # clamped value is stored as an integer string.
    cpus = os.cpu_count() or 1  # cpu_count() may return None
    if int(general["threads"]) > cpus:
        general["threads"] = str(cpus)
    if int(general["slop"]) > 50000:
        general["slop"] = "50000"
    if int(general["upstream"]) > 10000:
        general["upstream"] = "10000"
    if int(general["binsize"]) > int(general["upstream"]) / 2:
        general["binsize"] = str(int(general["upstream"]) // 2)
    if int(general["step"]) > int(general["binsize"]):
        general["step"] = general["binsize"]
    if config["Genes"]["gene_file"]:
        print("\n# Using Single mode.\n")

    return config
342 |
343 |
344 | def main():
345 |
346 | # Load configs (config.ini is used when no argument is given)
347 | config = configparser.ConfigParser()
348 | if len(sys.argv) == 1:
349 | config_file = "config.ini"
350 | elif len(sys.argv) == 2:
351 | config_file = sys.argv[1]
352 | else:
353 | print("Usage:\n python single.py [configfile]\n")
354 | sys.exit(1)
355 | config.read(config_file)
356 |
357 | config = check_options(config)
358 | workdir = config["General"]["workdir"]
359 | threads = int(config["General"]["threads"])
360 | slop = int(config["General"]["slop"])
361 | upstream = int(config["General"]["upstream"])
362 | binsize = int(config["General"]["binsize"])
363 | step = int(config["General"]["step"])
364 | gene_file = config["Genes"]["gene_file"]
365 | chrom_sizes = config["Genes"]["chrom_sizes"]
366 |
367 | # Load genes (stop when no gene file is configured)
368 | if gene_file:
369 | genes_info = get_gene_info(gene_file)
370 | else:
371 | print("No gene annotation file found, stop!")
372 | sys.exit(1)
373 |
374 | # Define the input numbers of multiprocessing list (inputnum is not used again in this function)
375 | inputnum = 512
376 | if inputnum < threads:
377 | inputnum = threads
378 | else:
379 | roundnum = (inputnum // threads) * threads
380 | inputnum = int(max(roundnum, threads*4))
381 |
382 | # Load chromosome sizes
383 | chrlens = misc.get_chrom_sizes(chrom_sizes)
384 |
385 | # Map each option of the Features section to the name prefix of its per-gene raw output file
386 | feature_map = {"ocfiles":"OCscores", "ocpeaks":"OCpeaks", "ptmfiles":"PTM",
387 | "motifs":"motifs", "cnss":"CNS", "genopheno":"genopheno",
388 | "phenodata":"phenoscores"}
389 | for item in config["Features"]:
390 | feature_files = config["Features"][item]
391 | if not feature_files:
392 | continue
393 | filelist = feature_files.split(",")
394 | count = 1
395 | for file in filelist:
396 | feature_infos = []
397 | num = 1
398 | for gene in genes_info:
399 | chrom = genes_info[gene][0]
400 | start = genes_info[gene][1]
401 | end = genes_info[gene][2]
402 | strand = genes_info[gene][3]
403 | feature_info = Features_info()
404 | feature_info.workdir = workdir
405 | feature_info.slop = slop
406 | feature_info.config = config
407 | feature_info.idx = num
408 | feature_info.geneinfo = Geneinfo()
409 | feature_info.geneinfo.gene = gene
410 | feature_info.geneinfo.chrom = chrom
411 | feature_info.geneinfo.strand = strand
412 | if strand == "+":
413 | feature_info.geneinfo.start = start - upstream
414 | feature_info.geneinfo.end = start - 1
415 | else:
416 | feature_info.geneinfo.start = end
417 | feature_info.geneinfo.end = end + upstream - 1
418 | feature_info.geneinfo.binsize = binsize
419 | feature_info.geneinfo.step = step
420 | num += 1
421 | # Output the analyzed gene region (analysis_region.bed)
422 | generate_regions(feature_info.geneinfo, workdir, gene, chrlens)
423 | feature_info.feature = file
424 | if len(filelist) > 1:
425 | outname = feature_map[item] + "_" + str(count)
426 | else:
427 | outname = feature_map[item]
428 | feature_info.outname = outname
429 | feature_info.chrlens = chrlens
430 | feature_infos.append(feature_info)
431 | count += 1
432 | # Generate features file (text files larger than 1e8 bytes take the streaming path)
433 | time_st = time()
434 | file_suffix = file.split(".")[-1].lower()
435 | filesize = os.path.getsize(file)
436 | if file_suffix in ["bed", "bedgraph", "txt"] and filesize > 1e8:
437 | results = generate_features_from_large(file, genes_info, upstream, slop,
438 | workdir, outname)
439 | else:
440 | # Extract the features for the first Features_info only (no worker pool is used here)
441 | results = generate_features(feature_infos[0])
442 | time_ed = time()
443 | time_elapse = round(time_ed - time_st)
444 | print("Generate %s features files finished.\nUsing %ss" % (outname, time_elapse))
445 |
446 | # Perform analysis for the first Features_info
447 | time_st = time()
448 | result = run_analysis(feature_infos[0])
449 | if result[1]:
450 | time_total = round(time() - time_st, 2)
451 | print("\nGene analysis finished using %ss. %s\n" % (time_total, " "*30))
452 |
453 | print("All the processes completed.", " "*10)
454 |
455 |
456 |
# Script entry point: run the pipeline; Ctrl-C ends it with exit status 0.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        raise SystemExit(0)
465 |
466 |
--------------------------------------------------------------------------------
/batch.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | ##################################################
4 | # CRISPR-Cas12a promoter editing (CAPE) #
5 | # Script: Batch Mode #
6 | ##################################################
7 |
8 | import os
9 | import sys
10 | import shutil
11 | from glob import glob
12 | import configparser
13 | from time import time
14 | from multiprocessing import Pool
15 | from pybedtools import BedTool, cleanup
16 |
17 | from lib import misc
18 | from lib.features import *
19 | from lib.cores import output_cores
20 |
21 |
class Features_info:
    """Settings for one gene and one feature file, passed to the feature extraction step."""

    def __init__(self):
        # Placeholder defaults, meant to be overwritten by the caller.
        self.geneinfo = Geneinfo()
        self.workdir = "results"
        self.outname = "name"
        self.feature = "feature"
        self.slop = 200
        self.chrlens = {}
        self.config = {}
32 |
33 |
def get_gene_info(gff_file):
    """
    Load gene coordinates from a GTF/GFF/GFF3 annotation file.

    The file name must end in gtf, gff or gff3 (case-insensitive); otherwise
    a message is printed and the program exits with status 1.

    "gene" records take the name after the last "=" of the first attribute,
    "transcript" records take the first double-quoted value of the first
    attribute. "cds" records extend the CDS span of the most recently named
    gene.

    Returns a dict {gene name: [chrom, start, end, strand, dist1, dist2]},
    where dist1/dist2 are the distances from the gene start/end to the CDS
    span, or 0, 0 when the gene has no CDS or the CDS exceeds the gene.
    """

    suffix = gff_file.split(".")[-1].lower()

    if suffix not in ["gtf", "gff", "gff3"]:
        print("Input gene annotataion file is not in GFF/GFF3 format.\nPlease check the file.")
        sys.exit(1)

    genes_info = {}
    cds_info = {}
    with open(gff_file) as infile:
        for line in infile:
            if line.startswith(("#", "\n")):
                continue
            fields = line.rstrip().split("\t")
            chrom = fields[0]
            category = fields[2].lower()
            start, end = int(fields[3]), int(fields[4])
            strand = fields[6]
            attrs = fields[8].split(";")
            if category == "gene":
                gene_name = attrs[0].split("=")[-1]
                genes_info[gene_name] = [chrom, start, end, strand]
            elif category == "transcript":
                gene_name = attrs[0].split("\"")[1]
                genes_info[gene_name] = [chrom, start, end, strand]
            elif category == "cds":
                # Widen the CDS span of the gene named by the last gene/transcript line
                if gene_name in cds_info:
                    known = cds_info[gene_name]
                    cds_info[gene_name] = [min(known[0], start), max(known[1], end)]
                else:
                    cds_info[gene_name] = [start, end]

    for gene, record in genes_info.items():
        flanks = [0, 0]
        if gene in cds_info:
            cds_start, cds_end = cds_info[gene]
            dist1 = cds_start - record[1]
            dist2 = record[2] - cds_end
            if dist1 >= 0 and dist2 >= 0:
                flanks = [dist1, dist2]
        record.extend(flanks)

    print("%s genes found in the annotation file.\n" % len(genes_info))

    return genes_info
83 |
84 |
def generate_regions(geneinfo, workdir, gene, chrlens):
    """Write the analysed region of one gene as a single-line BED file.

    Args:
        geneinfo: Object exposing ``chrom``, ``start``, ``end`` and ``strand``.
        workdir: Root output directory.
        gene: Gene name, used as sub-directory name and as BED name column.
        chrlens: Mapping of chromosome name to chromosome length.

    Returns:
        Path of ``<workdir>/<gene>/analysis_region.bed``. An existing file is
        returned untouched.

    Raises:
        KeyError: If the gene's chromosome is missing from ``chrlens``.
    """
    chrom = geneinfo.chrom
    start = geneinfo.start
    end = geneinfo.end
    strand = geneinfo.strand
    chrom_len = chrlens[chrom]
    outfile = workdir + "/" + gene + "/analysis_region.bed"
    # Presumably creates the parent directory of ``outfile`` when needed;
    # verify against ``lib.misc.check_outdir``.
    misc.check_outdir(outfile)
    if os.path.exists(outfile):
        return outfile
    # The context manager closes the handle even when the write fails.
    with open(outfile, "w") as outf:
        # Clip the region to the chromosome bounds.
        print(chrom, max(0, start), min(end, chrom_len), gene, '.', strand,
              sep="\t", file=outf)

    return outfile
101 |
102 |
def generate_features(Features_info):
    """Extract the raw signal of one feature file for one gene.

    ``Features_info`` is a job object exposing ``geneinfo`` (with ``chrom``,
    ``start``, ``end``, ``gene``), ``feature``, ``workdir``, ``outname``,
    ``slop`` and ``chrlens``.

    Returns the path of the per-gene output file. When that file already
    exists it is returned as is and nothing is recomputed.
    """
    job = Features_info
    region = job.geneinfo
    source = job.feature
    root = job.workdir
    label = job.outname
    flank = job.slop
    lengths = job.chrlens

    chrom, left, right, gene = region.chrom, region.start, region.end, region.gene

    # Peak sets are stored as BED, everything else as bedGraph.
    extension = "_raw.bed" if "peak" in label else "_raw.bedGraph"
    outfile = root + "/" + gene + "/" + label + extension
    if os.path.exists(outfile):
        return outfile

    if source.endswith(".bw") or source.endswith(".bigwig"):
        misc.bigwig2bedGraph(source, region, lengths, outfile, ext = flank)
    else:
        # Intersect the slop-extended region with the interval file.
        window = "\t".join([chrom, str(max(0, left-flank)), str(right+flank)])
        target_bed = BedTool(window, from_string = True)
        feature_bed = BedTool(source)
        target_bed.intersect(feature_bed).moveto(outfile)

    # Remove the temporary files pybedtools created in this process.
    cleanup()

    return outfile
133 |
134 |
def generate_features_from_large(inputfile, genes_info, upstream, slop, workdir, feature,
                                 max_open=500):
    """Split one large interval file into per-gene raw files in a single pass.

    Every position of each gene's upstream window (extended by ``slop``) is
    indexed; each input row is then copied to the file of every gene whose
    window contains the row's anchor position (the start for ``"CNS"``, the
    midpoint otherwise).

    Args:
        inputfile: Tab-separated file whose first three columns are
            chromosome, start and end.
        genes_info: Mapping of gene name to a sequence starting with
            ``chrom, start, end, strand``.
        upstream: Length of the upstream window.
        slop: Extra bases added on both sides of the window.
        workdir: Root output directory; ``<workdir>/<gene>/`` must exist.
        feature: Output base name; files are ``<feature>_raw.bedGraph``.
        max_open: Maximum number of output files kept open at once.

    Returns:
        Number of genes that received at least one row (``0`` when none did).
    """
    from collections import OrderedDict

    basemap = {}
    existed = set()
    genelens = len(genes_info)
    for num, gene in enumerate(genes_info):
        outfile = os.path.join(workdir, gene, feature+"_raw.bedGraph")
        if os.path.exists(outfile) and os.path.getsize(outfile) > 10:
            # Already generated by an earlier run; such genes are never
            # written again, so they need no entry in the position index.
            existed.add(gene)
        else:
            chrom, start, end, strand = genes_info[gene][:4]
            positions = basemap.setdefault(chrom, {})
            if strand == "+":
                window = range(max(0, start-upstream-slop), start+slop+1)
            else:
                window = range(end-slop, end+upstream+slop+1)
            for pos in window:
                positions.setdefault(pos, []).append(gene)
        print("%s / %s genes processed, %s existed genes." % (num, genelens, len(existed)),
              end="\r")
    print("Load genes completed.", " "*30)

    total_num = max(1, genelens-len(existed))
    max_open = max(1, int(max_open))
    handles = OrderedDict()   # gene -> open handle, oldest first
    started = set()           # genes whose file was created during this call
    try:
        with open(inputfile) as infile:
            for line in infile:
                record = line.rstrip()
                fields = record.split("\t")
                if len(fields) < 3:
                    # Blank or malformed row.
                    continue
                chrom, start, end = fields[:3]
                positions = basemap.get(chrom)
                if positions is None:
                    continue
                if feature == "CNS":
                    anchor = int(start)
                else:
                    anchor = int((int(start) + int(end)) / 2)
                for gene in positions.get(anchor, ()):
                    handle = handles.get(gene)
                    if handle is None:
                        if len(handles) >= max_open:
                            # Stay below the open-file limit: close the handle
                            # that has been open the longest.
                            _, oldest = handles.popitem(last=False)
                            oldest.close()
                        outfile = os.path.join(workdir, gene, feature+"_raw.bedGraph")
                        first_time = gene not in started
                        # A file closed earlier is reopened in append mode so
                        # the rows already written are kept.
                        handle = open(outfile, "w" if first_time else "a")
                        handles[gene] = handle
                        if first_time:
                            started.add(gene)
                            pct = round(len(started) * 100 / total_num, 2)
                            print(pct, "%", " output.", end="\r")
                    print(record, file=handle)
    finally:
        for handle in handles.values():
            handle.close()
    print("All files output.")

    return len(started)
221 |
222 |
def run_analysis(feature_info):
    """Score one gene from its raw feature files and derive its key regions.

    Args:
        feature_info: Job object exposing ``workdir`` and ``geneinfo`` (which
            must expose ``gene``). The raw files are read from
            ``<workdir>/<gene>/``.

    Returns:
        ``(gene, 0)`` when a non-trivial ``aggregate.bedGraph`` already exists
        (the gene is skipped), otherwise ``(gene, 1)`` after the analysis ran.

    Raises:
        FileNotFoundError: If no ``OCscores*_raw.bedGraph`` or no
            ``PTM*_raw.bedGraph`` file exists for the gene.
    """
    workdir = feature_info.workdir
    geneinfo = feature_info.geneinfo
    gene = geneinfo.gene
    genedir = os.path.join(workdir, gene)

    # Skip genes whose aggregate score file was already produced.
    check = os.path.join(genedir, "aggregate.bedGraph")
    if os.path.exists(check) and os.path.getsize(check) > 10:
        return (gene, 0)

    # glob() returns names in arbitrary order; sorting makes the index-based
    # pairing of score and peak replicates below deterministic.
    ocscores = sorted(glob(os.path.join(genedir, "OCscores*_raw.bedGraph")))
    ocpeaks = sorted(glob(os.path.join(genedir, "OCpeaks*_raw.bed")))
    ptmfiles = sorted(glob(os.path.join(genedir, "PTM*_raw.bedGraph")))
    # Both score sets are required further down; fail with an explicit
    # message instead of an unbound-variable error.
    if not ocscores:
        raise FileNotFoundError("No OCscores*_raw.bedGraph file found in %s" % genedir)
    if not ptmfiles:
        raise FileNotFoundError("No PTM*_raw.bedGraph file found in %s" % genedir)

    # Open chromatin
    ocscorelist = []
    for idx, ocscorefile in enumerate(ocscores):
        # A replicate without a matching peak file is scored without peaks.
        ocpeakfile = ocpeaks[idx] if idx < len(ocpeaks) else ""
        if len(ocscores) > 1:
            ocname = os.path.basename(ocscorefile).split("_raw")[0]
        else:
            ocname = "OCscores"
        ocscorelist.append(openchromatin_scores(geneinfo, ocscorefile, ocpeakfile,
                                                samplename = ocname, outdir = workdir))
    if len(ocscores) > 1:
        scores_oc = merge_reps(geneinfo, ocscorelist, samplename = "OCscores", outdir = workdir)
    else:
        scores_oc = ocscorelist[0]

    # Histone modification
    ptmscorelist = []
    for ptmscorefile in ptmfiles:
        if len(ptmfiles) > 1:
            ptmname = os.path.basename(ptmscorefile).split("_raw")[0]
        else:
            ptmname = "PTMscores"
        ptmscorelist.append(ptm_scores(geneinfo, ptmscorefile, ocname="OCscores",
                                       samplename = ptmname, outdir = workdir))
    if len(ptmfiles) > 1:
        scores_ptm = merge_reps(geneinfo, ptmscorelist, samplename = "PTMscores", outdir = workdir)
    else:
        scores_ptm = ptmscorelist[0]

    # TF motifs
    motiffile = os.path.join(genedir, "motifs_raw.bedGraph")
    scores_motif = motif_scores(geneinfo, motiffile, outdir = workdir)

    # Conserved sequences
    cnsfile = os.path.join(genedir, "CNS_raw.bedGraph")
    scores_cns = cns_scores(geneinfo, cnsfile, outdir = workdir)

    # Genotype versus Phenotype (MBKbase)
    genopheno = os.path.join(genedir, "genopheno_raw.bedGraph")
    scores_genopheno = genopheno_scores(geneinfo, genopheno, outdir = workdir)

    # Aggregate scores. Labels, scores and weights are kept as parallel lists
    # so each label names the score at the same position.
    # NOTE(review): the weights keep their original positional assignment
    # (OC 0.25, motif 0.2, CNS 0.3, PTM 0.1, genopheno 0.05) - confirm.
    namelist = ["DHS", "TF motif", "CNS", "H3K27ac"]
    scorelist = [scores_oc, scores_motif, scores_cns, scores_ptm]
    weightlist = [0.25, 0.2, 0.3, 0.1]
    if scores_genopheno:
        namelist.append("GenoPheno")
        scorelist.append(scores_genopheno)
        weightlist.append(0.05)
    scores_aggregate = aggregate_scores(geneinfo, scorelist, weightlist, outdir = workdir)

    # Load phenodata from CRISPR-edited results
    phenodata = os.path.join(genedir, "phenoscores_raw.bedGraph")
    if os.path.exists(phenodata):
        scores_phenodata = phenodata_scores(geneinfo, phenodata, method = "kmeans2",
                                            outdir = workdir)
    else:
        scores_phenodata = {}

    # Find the feature importance
    if scores_phenodata:
        # Assumes calc_importance pairs names with score sets by position -
        # TODO confirm against lib.misc.
        misc.calc_importance(phenodata, scorelist+[scores_aggregate],
                             namelist+["Aggregate"], geneinfo, side="both",
                             outdir = workdir)

    # Define key regions
    define_key_regions(geneinfo, scores_aggregate, phenodata, outdir = workdir)

    # Get the core of key regions
    scorefile = os.path.join(genedir, "aggregate.bedGraph")
    regionfile = os.path.join(genedir, "key_regions_merged.bed")
    output_cores(geneinfo, scorefile, regionfile)

    cleanup()

    return (gene, 1)
329 |
330 |
def _cap_option(section, key, limit):
    """Lower ``section[key]`` to ``limit`` when its integer value exceeds it.

    The value is written back as a string because ``ConfigParser`` rejects
    non-string option values.
    """
    if int(section[key]) > limit:
        section[key] = str(int(limit))


def check_options(config):
    """Validate the configuration, print it and clamp the numeric options.

    Args:
        config: ``configparser.ConfigParser`` with ``General``, ``Features``
            and ``Genes`` sections.

    Returns:
        The same ``config`` object with ``workdir`` made absolute (or set to
        ``"results"`` when empty) and ``threads``, ``slop``, ``upstream``,
        ``binsize`` and ``step`` reduced to their upper bounds when needed.

    Exits the process with status 1 when a listed feature file is missing.
    """
    print("# Using the following options:")
    general = config["General"]
    if general["workdir"]:
        general["workdir"] = os.path.abspath(general["workdir"])
    else:
        general["workdir"] = "results"
    misc.check_outdir(general["workdir"])
    for section in config.sections():
        for param in config.options(section):
            values = config[section][param]
            if section == "Features":
                # A feature option holds one path or a comma-separated list.
                if "," in values:
                    values = values.split(",")
                    candidates = values
                else:
                    candidates = [values]
                for file in candidates:
                    if file and not os.path.exists(file):
                        print("# Error, cannot find the %s: %s" % (param, file))
                        sys.exit(1)
            print("%s: %s" % (param, values))
    # os.cpu_count() may return None; leave ``threads`` untouched then.
    cpu_total = os.cpu_count()
    if cpu_total:
        _cap_option(general, "threads", cpu_total)
    _cap_option(general, "slop", 50000)
    _cap_option(general, "upstream", 10000)
    # Integer halving keeps the stored value parseable by int() later on.
    _cap_option(general, "binsize", int(general["upstream"]) // 2)
    _cap_option(general, "step", int(general["binsize"]))
    if config["Genes"]["gff_file"]:
        print("\n# Using Batch mode.\n")

    return config
369 |
370 |
def _build_feature_infos(genes_info, config, chrlens, workdir, slop, upstream,
                         binsize, step, withutr, feature=None, outname=None):
    """Create one ``Features_info`` job per gene and write its region file.

    ``feature`` and ``outname`` are only assigned when given; otherwise the
    ``Features_info`` defaults are kept.
    """
    feature_infos = []
    for num, gene in enumerate(genes_info, start=1):
        chrom, start, end, strand, utrst, utred = genes_info[gene][:6]
        geneinfo = Geneinfo()
        geneinfo.gene = gene
        geneinfo.chrom = chrom
        geneinfo.strand = strand
        # The analysed region is the upstream window of the gene start (the
        # gene end for the other strand), optionally extended by the margin
        # between the gene bound and its CDS.
        if strand == "+":
            geneinfo.start = start - upstream
            geneinfo.end = start + utrst if withutr else start
        else:
            geneinfo.start = end - utred if withutr else end
            geneinfo.end = end + upstream
        geneinfo.binsize = binsize
        geneinfo.step = step

        feature_info = Features_info()
        feature_info.workdir = workdir
        feature_info.slop = slop
        feature_info.config = config
        feature_info.idx = num
        feature_info.geneinfo = geneinfo
        # Output analyzed gene regions
        generate_regions(geneinfo, workdir, gene, chrlens)
        if feature is not None:
            feature_info.feature = feature
        if outname is not None:
            feature_info.outname = outname
        feature_info.chrlens = chrlens
        feature_infos.append(feature_info)
    return feature_infos


def main():
    """Run the batch pipeline: extract raw features, then analyse every gene.

    The configuration file is ``config.ini`` unless one path is given on the
    command line. The process exits with status 1 on wrong usage or when no
    annotation file is configured.
    """
    # Load configs
    config = configparser.ConfigParser()
    if len(sys.argv) == 1:
        config_file = "config.ini"
    elif len(sys.argv) == 2:
        config_file = sys.argv[1]
    else:
        print("Usage:\n python batch.py [configfile]\n")
        sys.exit(1)
    config.read(config_file)

    config = check_options(config)
    workdir = config["General"]["workdir"]
    threads = int(config["General"]["threads"])
    slop = int(config["General"]["slop"])
    upstream = int(config["General"]["upstream"])
    binsize = int(config["General"]["binsize"])
    step = int(config["General"]["step"])
    withutr = int(config["General"]["withutr"])
    gff_file = config["Genes"]["gff_file"]
    chrom_sizes = config["Genes"]["chrom_sizes"]

    # Load genes
    if not gff_file:
        print("No genome annotation file found, stop!")
        sys.exit(1)
    genes_info = get_gene_info(gff_file)
    total_genes = len(genes_info)

    # Number of jobs handed to one pool round: a multiple of the thread
    # count close to 512, and at least four jobs per thread.
    inputnum = 512
    if inputnum < threads:
        inputnum = threads
    else:
        roundnum = (inputnum // threads) * threads
        inputnum = int(max(roundnum, threads*4))

    # Load chromosome sizes
    chrlens = misc.get_chrom_sizes(chrom_sizes)

    # Config option name -> base name of the per-gene raw output file
    feature_map = {"ocfiles":"OCscores", "ocpeaks":"OCpeaks", "ptmfiles":"PTM",
                   "motifs":"motifs", "cnss":"CNS", "genopheno":"genopheno",
                   "phenodata":"phenoscores"}
    feature_infos = None
    for item in config["Features"]:
        feature_files = config["Features"][item]
        if not feature_files:
            continue
        filelist = feature_files.split(",")
        for count, file in enumerate(filelist, start=1):
            # Replicates of one feature get a numeric suffix.
            if len(filelist) > 1:
                outname = feature_map[item] + "_" + str(count)
            else:
                outname = feature_map[item]
            feature_infos = _build_feature_infos(genes_info, config, chrlens, workdir,
                                                 slop, upstream, binsize, step, withutr,
                                                 feature=file, outname=outname)
            # Generate features file
            time_st = time()
            file_suffix = file.split(".")[-1].lower()
            filesize = os.path.getsize(file)
            if file_suffix in ["bed", "bedgraph", "txt"] and filesize > 1e8:
                # Large text files are split in a single pass over the file.
                generate_features_from_large(file, genes_info, upstream, slop,
                                             workdir, outname)
            else:
                # Multiprocessing
                for i in range(0, total_genes, inputnum):
                    pool = Pool(threads)
                    pool.map(generate_features, feature_infos[i:i+inputnum])
                    pool.close()
                    pool.join()
                    print("Round %s finished." % round(i/inputnum))
            time_ed = time()
            time_elapse = round(time_ed - time_st)
            print("Generate %s features files finished.\nUsing %ss" % (outname, time_elapse))

    # The analysis only needs the gene regions. Build the job list here when
    # no feature file was configured, so the name is always bound and raw
    # files left by an earlier run can still be analysed.
    if feature_infos is None:
        feature_infos = _build_feature_infos(genes_info, config, chrlens, workdir,
                                             slop, upstream, binsize, step, withutr)

    # Perform analysis
    time_st = time()
    cnt = 1
    new = 0
    for i in range(0, total_genes, inputnum):
        pool = Pool(threads)
        for result in pool.imap_unordered(run_analysis, feature_infos[i:i+inputnum]):
            if result[1]:
                new += 1
            time_ed = time()
            if new:
                speed = round((time_ed - time_st) / new, 2)
            else:
                # Nothing computed yet (only skipped genes): restart the clock
                # so skipped genes do not count towards the speed.
                time_st = time()
                speed = 0.0
            print("%s / %s Gene (%s) analyzed (speed %s s)." % (cnt, total_genes, result[0], speed))
            cnt += 1
        pool.close()
        pool.join()
        print("Round %s finished." % round(i/inputnum))
    time_total = round(time() - time_st, 2)
    print("\nAll the genes analysis finished using %ss. %s\n" % (time_total, " "*30))

    print("All the processes completed.", " "*10)
523 |
524 |
525 |
if __name__ == "__main__":
    # Script entry point: run the batch pipeline and treat Ctrl-C as a
    # deliberate, clean stop (exit status 0) instead of printing a traceback.
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        raise SystemExit(0)
534 |
535 |
--------------------------------------------------------------------------------
/test/single/data/genes_motifs_JASPAR_test.bed:
--------------------------------------------------------------------------------
1 | Chr1 4001197 4001217 AT2G28810 . - 2.22e-07 0.00793
2 | Chr1 4001197 4001217 AT5G02460 . + 1.71e-07 0.0044
3 | Chr1 4001198 4001218 OBP1 . - 3.96e-07 0.00957
4 | Chr1 4001236 4001248 O2 . - 9.03e-06 0.301
5 | Chr1 4001296 4001305 ERF6 . + 9.61e-06 0.159
6 | Chr1 4001395 4001406 bZIP42 . - 7.49e-06 0.417
7 | Chr1 4002087 4002097 bHLH80 . + 2.49e-07 0.183
8 | Chr1 4002208 4002218 LEC2 . + 4.49e-06 0.403
9 | Chr1 4002257 4002275 AT3G45610 . - 5.07e-07 0.0322
10 | Chr1 4002257 4002277 COG1 . + 4.99e-07 0.0211
11 | Chr1 4002260 4002280 Adof1 . + 8.69e-08 0.00468
12 | Chr1 4002264 4002282 AT3G45610 . - 1.84e-06 0.0522
13 | Chr1 4002265 4002275 AT3G52440 . + 2.77e-06 0.412
14 | Chr1 4002265 4002275 DAG2 . + 4.51e-07 0.244
15 | Chr1 4002265 4002278 OBP4 . + 2.42e-06 0.132
16 | Chr1 4002301 4002315 AT3G46070 . + 2.75e-06 0.305
17 | Chr1 4002446 4002474 AT5G66940 . - 6.68e-10 0.000144
18 | Chr1 4002447 4002467 Adof1 . + 4.23e-08 0.0033
19 | Chr1 4002448 4002466 dof4.2 . + 2.98e-07 0.0263
20 | Chr1 4002448 4002468 OBP3 . - 2.29e-07 0.0043
21 | Chr1 4002448 4002474 AT1G69570 . - 7.81e-11 8.63e-05
22 | Chr1 4002449 4002469 OBP3 . - 3.85e-07 0.00558
23 | Chr1 4002450 4002464 IDD2 . + 4.71e-06 0.238
24 | Chr1 4002450 4002469 AT1G14580 . - 7.57e-06 0.243
25 | Chr1 4002450 4002470 OBP3 . - 8.67e-08 0.0026
26 | Chr1 4002451 4002471 OBP3 . - 1.48e-07 0.00347
27 | Chr1 4002452 4002472 AT2G28810 . - 1.09e-09 0.000736
28 | Chr1 4002452 4002472 OBP3 . - 3.65e-10 0.000258
29 | Chr1 4002452 4002472 AT5G02460 . + 2.32e-10 0.000203
30 | Chr1 4002452 4002480 AT5G66940 . - 1.64e-06 0.0126
31 | Chr1 4002453 4002473 OBP1 . - 2.45e-09 0.0007
32 | Chr1 4002453 4002481 AT5G66940 . - 8.62e-07 0.00878
33 | Chr1 4002454 4002472 AT3G45610 . - 2.38e-06 0.0569
34 | Chr1 4002454 4002474 FLC . + 9.09e-06 0.238
35 | Chr1 4002454 4002474 COG1 . + 2.91e-08 0.00648
36 | Chr1 4002454 4002482 AT5G66940 . - 1.31e-06 0.0111
37 | Chr1 4002455 4002483 AT5G66940 . - 3.34e-07 0.00507
38 | Chr1 4002456 4002482 AT1G69570 . - 7.97e-11 8.63e-05
39 | Chr1 4002456 4002484 AT5G66940 . - 2.41e-13 4.58e-06
40 | Chr1 4002457 4002477 Adof1 . + 1.76e-08 0.00217
41 | Chr1 4002458 4002476 dof4.2 . + 7.77e-07 0.0374
42 | Chr1 4002458 4002478 OBP3 . - 2.98e-07 0.0049
43 | Chr1 4002459 4002472 PI . + 3.44e-06 0.121
44 | Chr1 4002459 4002479 OBP3 . - 4.05e-08 0.00179
45 | Chr1 4002460 4002474 IDD2 . + 4.71e-06 0.238
46 | Chr1 4002460 4002479 AT1G14580 . - 8.53e-06 0.243
47 | Chr1 4002460 4002480 OBP3 . - 3.01e-11 0.000105
48 | Chr1 4002462 4002480 AT3G45610 . - 1.37e-07 0.0213
49 | Chr1 4002462 4002482 COG1 . + 1.7e-08 0.00523
50 | Chr1 4002462 4002482 AT2G28810 . - 5.06e-10 0.000736
51 | Chr1 4002462 4002482 AT5G02460 . + 4.9e-11 0.000151
52 | Chr1 4002463 4002483 OBP1 . - 6.9e-10 0.000466
53 | Chr1 4002464 4002492 AT5G66940 . - 3.37e-10 0.000104
54 | Chr1 4002465 4002485 OBP3 . - 1.5e-07 0.00349
55 | Chr1 4002465 4002485 Adof1 . + 2.53e-09 0.000917
56 | Chr1 4002466 4002484 dof4.2 . + 3.04e-07 0.0263
57 | Chr1 4002466 4002486 OBP3 . - 2.03e-07 0.00408
58 | Chr1 4002466 4002492 AT1G69570 . - 8.94e-08 0.00258
59 | Chr1 4002467 4002480 PI . + 3.73e-06 0.121
60 | Chr1 4002467 4002487 OBP3 . - 3.69e-07 0.00546
61 | Chr1 4002468 4002488 OBP3 . - 4.62e-07 0.00611
62 | Chr1 4002469 4002489 OBP3 . - 6.01e-07 0.00699
63 | Chr1 4002470 4002490 AT2G28810 . - 6.95e-08 0.00437
64 | Chr1 4002470 4002490 OBP3 . - 3.72e-08 0.0017
65 | Chr1 4002470 4002490 AT5G02460 . + 2.51e-08 0.00146
66 | Chr1 4002471 4002491 OBP1 . - 5.85e-08 0.00332
67 | Chr1 4002472 4002490 AT3G45610 . - 9.86e-06 0.0968
68 | Chr1 4002574 4002588 ATHB34 . - 4.56e-06 0.167
69 | Chr1 4002662 4002675 OBP4 . - 1.92e-06 0.121
70 | Chr1 4002663 4002683 COG1 . - 3.89e-06 0.0492
71 | Chr1 4002665 4002675 AT3G52440 . - 3.89e-06 0.413
72 | Chr1 4002665 4002675 DAG2 . - 1.35e-06 0.244
73 | Chr1 4002665 4002683 AT3G45610 . + 6.95e-06 0.0849
74 | Chr1 4003265 4003285 COG1 . - 6.22e-06 0.0592
75 | Chr1 4003265 4003291 AT1G69570 . + 6.17e-07 0.00778
76 | Chr1 4003267 4003285 AT3G45610 . + 9.86e-06 0.0968
77 | Chr1 4003273 4003293 Adof1 . - 5.11e-07 0.0113
78 | Chr1 4003360 4003380 ATHB40 . + 3.79e-06 0.371
79 | Chr1 4003363 4003373 ATHB53 . + 3.89e-06 0.497
80 | Chr1 4003363 4003373 ATHB20 . + 3.27e-06 0.401
81 | Chr1 4003363 4003373 ATHB13 . + 1.82e-06 0.435
82 | Chr1 4003366 4003375 ATHB23 . + 7.3e-06 0.138
83 | Chr1 4003367 4003379 ZHD1 . + 8.36e-06 0.148
84 | Chr1 4003368 4003377 ATHB23 . - 2.43e-06 0.0816
85 | Chr1 4003439 4003451 ZHD1 . + 5.09e-06 0.145
86 | Chr1 4003440 4003452 ZHD1 . - 2.47e-06 0.124
87 | Chr1 4003442 4003451 ATHB23 . + 2.43e-06 0.0816
88 | Chr1 4003442 4003456 ATHB34 . + 1.68e-07 0.0503
89 | Chr1 4003443 4003455 ZHD1 . + 4.49e-07 0.0757
90 | Chr1 4003443 4003457 ATHB34 . - 1.68e-07 0.0503
91 | Chr1 4003444 4003453 ATHB23 . - 2.43e-06 0.0816
92 | Chr1 4003444 4003456 ZHD1 . - 4.49e-07 0.0757
93 | Chr1 4003446 4003455 ATHB23 . + 2.43e-06 0.0816
94 | Chr1 4003446 4003460 ATHB34 . + 1.68e-07 0.0503
95 | Chr1 4003447 4003459 ZHD1 . + 4.49e-07 0.0757
96 | Chr1 4003447 4003461 ATHB34 . - 5.3e-07 0.0728
97 | Chr1 4003447 4003461 ZHD6 . - 5.02e-06 0.19
98 | Chr1 4003448 4003457 ATHB23 . - 2.43e-06 0.0816
99 | Chr1 4003448 4003460 ZHD1 . - 4.49e-07 0.0757
100 | Chr1 4003450 4003459 ATHB23 . + 2.43e-06 0.0816
101 | Chr1 4003452 4003461 ATHB23 . - 7.3e-06 0.138
102 | Chr1 4003579 4003599 OBP3 . + 1.77e-07 0.0038
103 | Chr1 4003579 4003599 Adof1 . - 2.54e-08 0.00256
104 | Chr1 4003580 4003598 dof4.2 . - 7.74e-06 0.0818
105 | Chr1 4003582 4003608 AT1G69570 . + 5.08e-07 0.00696
106 | Chr1 4003584 4003604 OBP3 . + 1.72e-07 0.00374
107 | Chr1 4003600 4003628 AT5G66940 . + 1.12e-06 0.0102
108 | Chr1 4003602 4003622 AT5G02460 . - 1.49e-06 0.0153
109 | Chr1 4003607 4003627 Adof1 . - 5.57e-07 0.0118
110 | Chr1 4003610 4003630 COG1 . - 7.99e-06 0.0649
111 | Chr1 4003664 4003684 RAP212 . + 3.07e-08 0.000483
112 | Chr1 4003664 4003684 ERF9 . + 1.45e-08 0.000157
113 | Chr1 4003665 4003685 LEP . - 4.68e-09 7.99e-05
114 | Chr1 4003667 4003687 RAP212 . + 5.88e-08 0.00073
115 | Chr1 4003667 4003687 ERF9 . + 5.02e-09 7.55e-05
116 | Chr1 4003668 4003679 CBF1 . - 2.11e-06 0.413
117 | Chr1 4003668 4003686 ABR1 . + 7.15e-08 0.00108
118 | Chr1 4003668 4003688 LEP . - 1.78e-09 4.03e-05
119 | Chr1 4003669 4003681 RAP21 . + 2.92e-06 0.0754
120 | Chr1 4003669 4003683 AT4G16750 . - 1.84e-07 0.0115
121 | Chr1 4003670 4003684 AT5G67000 . + 9.22e-07 0.0127
122 | Chr1 4003670 4003684 CEJ1 . - 8.85e-07 0.015
123 | Chr1 4003670 4003684 AT1G44830 . - 4.22e-07 0.00624
124 | Chr1 4003670 4003684 AT1G75490 . - 3.01e-07 0.003
125 | Chr1 4003670 4003688 ESE3 . + 4.53e-08 0.000528
126 | Chr1 4003670 4003690 RAP212 . + 1.98e-07 0.00165
127 | Chr1 4003671 4003684 AT1G36060 . + 8.82e-06 0.121
128 | Chr1 4003671 4003685 AT5G18450 . + 4.52e-07 0.00453
129 | Chr1 4003671 4003689 ERF104 . - 4.57e-09 6.88e-05
130 | Chr1 4003671 4003690 DREB26 . - 7.25e-07 0.0108
131 | Chr1 4003671 4003690 AT4G28140 . + 6.38e-09 0.00014
132 | Chr1 4003672 4003686 AT4G16750 . - 1.06e-06 0.0159
133 | Chr1 4003672 4003686 ERF15 . - 1.88e-07 0.00152
134 | Chr1 4003672 4003686 ERF105 . + 1.04e-08 0.000197
135 | Chr1 4003673 4003683 ERF118 . - 7.65e-08 0.000844
136 | Chr1 4003673 4003684 CRF4 . - 3.82e-08 0.000617
137 | Chr1 4003673 4003687 AT1G44830 . - 5.84e-07 0.00709
138 | Chr1 4003673 4003687 RAP211 . - 2.18e-07 0.00205
139 | Chr1 4003673 4003687 AT1G75490 . - 1.54e-07 0.00218
140 | Chr1 4003673 4003687 RAP26 . + 3.11e-08 0.000521
141 | Chr1 4003673 4003687 PUCHI . - 1.64e-08 0.000302
142 | Chr1 4003673 4003687 ERF087 . + 1.23e-08 0.000228
143 | Chr1 4003673 4003687 ERF5 . - 4.47e-09 0.000101
144 | Chr1 4003673 4003687 ESE1 . + 2.71e-09 6.43e-05
145 | Chr1 4003673 4003693 ERF2 . - 1.84e-09 3.48e-05
146 | Chr1 4003674 4003684 AT3G57600 . - 2.83e-07 0.00301
147 | Chr1 4003674 4003692 ERF104 . - 6.75e-10 1.67e-05
148 | Chr1 4003674 4003694 ERF10 . - 5.11e-09 7.85e-05
149 | Chr1 4003675 4003685 ERF3 . - 7.65e-08 0.000947
150 | Chr1 4003675 4003685 AT2G33710 . - 7.65e-08 0.000947
151 | Chr1 4003675 4003689 AT4G16750 . - 1.31e-06 0.0173
152 | Chr1 4003675 4003689 ERF105 . + 4.06e-08 0.000499
153 | Chr1 4003675 4003689 ERF15 . - 5.85e-09 0.000126
154 | Chr1 4003675 4003691 AT4G18450 . + 5.98e-09 0.000101
155 | Chr1 4003676 4003686 ERF118 . - 7.65e-08 0.000844
156 | Chr1 4003676 4003690 RAP211 . - 3.47e-07 0.00286
157 | Chr1 4003697 4003711 ERF021 . - 5.97e-06 0.132
158 | Chr1 4003697 4003717 RAP212 . + 1.98e-07 0.00165
159 | Chr1 4003698 4003711 AT1G36060 . + 4.95e-06 0.113
160 | Chr1 4003698 4003718 LEP . - 9.25e-09 0.000126
161 | Chr1 4003699 4003711 RAP21 . + 3.24e-06 0.0756
162 | Chr1 4003699 4003713 AT4G16750 . - 8.99e-08 0.0115
163 | Chr1 4003700 4003714 AT5G67000 . + 9.22e-07 0.0127
164 | Chr1 4003700 4003714 CEJ1 . - 8.85e-07 0.015
165 | Chr1 4003700 4003714 AT1G44830 . - 4.22e-07 0.00624
166 | Chr1 4003700 4003714 AT1G75490 . - 3.01e-07 0.003
167 | Chr1 4003700 4003718 ESE3 . + 8.12e-09 0.000172
168 | Chr1 4003700 4003720 RAP212 . + 7.38e-09 0.00019
169 | Chr1 4003700 4003720 ERF2 . - 2.75e-09 4.69e-05
170 | Chr1 4003700 4003720 ERF9 . + 3.89e-10 1.26e-05
171 | Chr1 4003701 4003714 AT1G36060 . + 8.82e-06 0.121
172 | Chr1 4003701 4003715 AT5G18450 . + 4.52e-07 0.00453
173 | Chr1 4003701 4003719 ABR1 . + 5.63e-09 0.000235
174 | Chr1 4003701 4003719 ERF104 . - 2.99e-10 9.2e-06
175 | Chr1 4003701 4003720 DREB26 . - 1.51e-07 0.00495
176 | Chr1 4003701 4003720 AT4G28140 . + 4.85e-10 2.32e-05
177 | Chr1 4003701 4003721 ERF10 . - 6.61e-09 9.44e-05
178 | Chr1 4003701 4003721 LEP . - 1.04e-09 2.74e-05
179 | Chr1 4003702 4003716 AT4G16750 . - 1.06e-06 0.0159
180 | Chr1 4003702 4003716 ERF15 . - 1.88e-07 0.00152
181 | Chr1 4003702 4003716 ERF105 . + 1.04e-08 0.000197
182 | Chr1 4003702 4003718 AT4G18450 . + 3.24e-09 6.39e-05
183 | Chr1 4003703 4003713 ERF118 . - 7.65e-08 0.000844
184 | Chr1 4003703 4003714 CRF4 . - 3.82e-08 0.000617
185 | Chr1 4003703 4003717 CEJ1 . - 1.04e-06 0.015
186 | Chr1 4003703 4003717 AT5G67000 . + 2.14e-07 0.00625
187 | Chr1 4003703 4003717 AT1G44830 . - 1.31e-07 0.00309
188 | Chr1 4003703 4003717 RAP211 . - 1.06e-08 0.000349
189 | Chr1 4003703 4003717 RAP26 . + 1.67e-09 6.56e-05
190 | Chr1 4003703 4003717 ERF087 . + 1.02e-09 4.07e-05
191 | Chr1 4003703 4003717 PUCHI . - 7.31e-10 2.93e-05
192 | Chr1 4003703 4003717 AT1G75490 . - 1.24e-08 0.000337
193 | Chr1 4003703 4003717 ESE1 . + 1.98e-10 9.3e-06
194 | Chr1 4003703 4003717 ERF5 . - 1.98e-10 9.29e-06
195 | Chr1 4003703 4003721 ESE3 . + 1.43e-08 0.000248
196 | Chr1 4003703 4003723 RAP212 . + 2.09e-08 0.00037
197 | Chr1 4003703 4003723 ERF2 . - 8.05e-12 6.1e-07
198 | Chr1 4003703 4003723 ERF9 . + 1.56e-09 3.3e-05
199 | Chr1 4003704 4003714 AT3G57600 . - 2.83e-07 0.00301
200 | Chr1 4003704 4003718 AT5G18450 . + 2.56e-08 0.000707
201 | Chr1 4003704 4003718 AT1G22810 . + 2.22e-08 0.000651
202 | Chr1 4003704 4003722 ERF104 . - 7.24e-11 3.51e-06
203 | Chr1 4003704 4003723 AT4G28140 . + 3.3e-10 1.73e-05
204 | Chr1 4003704 4003723 DREB26 . - 3.14e-07 0.00693
205 | Chr1 4003704 4003724 LEP . - 2.37e-08 0.00024
206 | Chr1 4003704 4003724 ERF10 . - 2.74e-11 1.87e-06
207 | Chr1 4003705 4003715 ERF3 . - 7.65e-08 0.000947
208 | Chr1 4003705 4003715 AT2G33710 . - 7.65e-08 0.000947
209 | Chr1 4003705 4003719 AT4G16750 . - 5.16e-07 0.0115
210 | Chr1 4003705 4003719 ERF105 . + 4.38e-10 1.66e-05
211 | Chr1 4003705 4003719 ERF15 . - 1.98e-10 8.39e-06
212 | Chr1 4003705 4003721 AT4G18450 . + 7.11e-10 2.28e-05
213 | Chr1 4003706 4003716 ERF118 . - 7.65e-08 0.000844
214 | Chr1 4003706 4003717 CRF4 . - 3.82e-08 0.000617
215 | Chr1 4003706 4003720 AT1G75490 . - 3.68e-07 0.00332
216 | Chr1 4003706 4003720 RAP26 . + 4.91e-08 0.000723
217 | Chr1 4003706 4003720 PUCHI . - 2.43e-08 0.00038
218 | Chr1 4003706 4003720 ESE1 . + 9.34e-09 0.000168
219 | Chr1 4003706 4003720 RAP211 . - 2.01e-07 0.00205
220 | Chr1 4003706 4003720 ERF087 . + 1.57e-08 0.000268
221 | Chr1 4003706 4003720 ERF5 . - 1.59e-08 0.000252
222 | Chr1 4003706 4003726 ERF2 . - 5.52e-11 2.61e-06
223 | Chr1 4003707 4003717 AT3G57600 . - 2.83e-07 0.00301
224 | Chr1 4003707 4003725 ERF104 . - 2.99e-10 9.2e-06
225 | Chr1 4003707 4003726 DREB26 . - 2.6e-07 0.00644
226 | Chr1 4003707 4003726 AT4G28140 . + 6.29e-10 2.82e-05
227 | Chr1 4003707 4003727 AT1G77640 . + 2.68e-10 0.000394
228 | Chr1 4003707 4003727 ERF10 . - 4.95e-11 2.8e-06
229 | Chr1 4003708 4003718 ERF3 . - 7.65e-08 0.000947
230 | Chr1 4003708 4003718 AT2G33710 . - 7.65e-08 0.000947
231 | Chr1 4003708 4003722 ERF15 . - 2.86e-08 0.00041
232 | Chr1 4003708 4003724 AT4G18450 . + 2.07e-09 4.75e-05
233 | Chr1 4003709 4003719 ERF118 . - 7.65e-08 0.000844
234 | Chr1 4003709 4003723 AT1G75490 . - 4.69e-07 0.00399
235 | Chr1 4003709 4003723 RAP211 . - 4.12e-07 0.00318
236 | Chr1 4003709 4003723 ERF5 . - 3.5e-08 0.000401
237 | Chr1 4003709 4003729 ERF2 . - 5.71e-09 7.86e-05
238 | Chr1 4003710 4003729 DREB26 . - 8.17e-07 0.0115
239 | Chr1 4003710 4003729 AT4G28140 . + 9.14e-09 0.000179
240 | Chr1 4003712 4003732 DREB2 . + 8.63e-06 0.113
241 | Chr1 4003713 4003727 AT1G01250 . + 3.39e-06 0.434
242 | Chr1 4003713 4003727 AT1G22810 . + 1.24e-08 0.000651
243 | Chr1 4003714 4003728 RAP2-1 . - 8.85e-06 0.405
244 | Chr1 4003714 4003728 TINY . - 1.41e-06 0.435
245 | Chr1 4003714 4003728 ERF019 . - 4e-07 0.171
246 | Chr1 4003715 4003729 AT2G44940 . + 1.9e-06 0.468
247 | Chr1 4003716 4003727 ERF015 . - 8.81e-06 0.435
248 | Chr1 4003716 4003729 AT1G36060 . + 7.16e-06 0.118
249 | Chr1 4003717 4003729 RAP21 . + 1.69e-06 0.0581
250 | Chr1 4003789 4003798 PIF3 . + 1.36e-06 0.427
251 | Chr1 4003789 4003800 ABF4 . + 9.55e-06 0.37
252 | Chr1 4003789 4003800 HYH . - 6.67e-06 0.454
253 | Chr1 4003790 4003797 PIF4 . - 9.9e-06 0.439
254 | Chr1 4003988 4004007 DREB26 . + 1.08e-06 0.0135
255 | Chr1 4003994 4004008 AT1G44830 . + 3.45e-08 0.00309
256 | Chr1 4003994 4004008 AT1G75490 . + 1.3e-08 0.000348
257 | Chr1 4003994 4004008 CEJ1 . + 1.25e-07 0.0118
258 | Chr1 4003997 4004007 AT3G57600 . + 9.31e-08 0.00301
259 | Chr1 4003997 4004011 RAP211 . + 2.4e-08 0.000683
260 | Chr1 4003997 4004016 AT4G28140 . - 2.68e-10 1.47e-05
261 | Chr1 4003997 4004016 DREB26 . + 1.57e-08 0.00198
262 | Chr1 4003997 4004017 ERF9 . - 1.79e-08 0.000181
263 | Chr1 4003999 4004013 AT5G18450 . - 2.85e-07 0.0039
264 | Chr1 4003999 4004019 AT1G77640 . - 1.81e-07 0.0145
265 | Chr1 4004003 4004012 ERF6 . - 4.62e-06 0.159
266 | Chr1 4004003 4004017 CEJ1 . + 5.77e-07 0.015
267 | Chr1 4004003 4004017 AT1G44830 . + 1.21e-07 0.00309
268 | Chr1 4004003 4004017 AT1G75490 . + 5.05e-07 0.00422
269 | Chr1 4004003 4004023 DREB2 . - 1.59e-06 0.0611
270 | Chr1 4004006 4004018 RAP21 . - 1.88e-06 0.0606
271 | Chr1 4004006 4004019 AT1G36060 . - 7.55e-06 0.12
272 | Chr1 4004007 4004021 ERF019 . + 7.8e-06 0.216
273 | Chr1 4004158 4004178 AT1G77640 . - 4.17e-06 0.0683
274 | Chr1 4004164 4004175 ERF4 . + 3.71e-06 0.0782
275 | Chr1 4004164 4004175 ERF11 . + 2.41e-06 0.0748
276 | Chr1 4004257 4004270 ARF7 . - 4.3e-06 0.479
277 | Chr1 4004374 4004384 LEC2 . - 4.99e-07 0.28
278 | Chr1 4004534 4004541 ERF008 . - 6.67e-06 0.101
279 | Chr1 4004593 4004602 ERF6 . - 1.33e-06 0.159
280 | Chr1 4004644 4004651 RAP2-3 . + 6.67e-06 0.07
281 | Chr1 4004644 4004651 RAP2-6 . + 6.67e-06 0.07
282 | Chr1 4004644 4004651 ERF109 . + 6.67e-06 0.0701
283 | Chr1 4004644 4004652 ERF069 . + 5.17e-06 0.0506
284 | Chr1 4004647 4004656 ERF6 . - 4.62e-06 0.159
285 | Chr1 4004693 4004700 ERF008 . + 6.67e-06 0.101
286 | Chr1 4004710 4004717 RAP2-3 . + 6.67e-06 0.07
287 | Chr1 4004710 4004717 RAP2-6 . + 6.67e-06 0.07
288 | Chr1 4004710 4004717 ERF109 . + 6.67e-06 0.0701
289 | Chr1 4004710 4004718 ERF069 . + 5.17e-06 0.0506
290 | Chr1 4004716 4004724 CMTA3 . + 1.83e-06 0.202
291 | Chr1 4004751 4004771 ERF2 . + 8.69e-09 0.000108
292 | Chr1 4004752 4004770 ERF104 . + 4.9e-09 7.24e-05
293 | Chr1 4004753 4004773 ERF10 . + 1.49e-09 3.27e-05
294 | Chr1 4004754 4004773 DREB26 . + 1.06e-06 0.0134
295 | Chr1 4004754 4004773 AT4G28140 . - 3.82e-09 0.0001
296 | Chr1 4004754 4004774 RAP212 . - 1.35e-07 0.00128
297 | Chr1 4004754 4004774 ERF2 . + 3.33e-10 9.84e-06
298 | Chr1 4004755 4004769 ERF15 . + 4.67e-08 0.000571
299 | Chr1 4004755 4004773 ERF104 . + 2.43e-10 8.01e-06
300 | Chr1 4004756 4004772 AT4G18450 . - 2.29e-09 4.99e-05
301 | Chr1 4004756 4004774 ESE3 . - 1.67e-09 6.11e-05
302 | Chr1 4004756 4004776 ERF10 . + 1.72e-11 1.31e-06
303 | Chr1 4004756 4004776 LEP . + 1.64e-11 1.45e-06
304 | Chr1 4004757 4004771 AT1G44830 . + 5.84e-07 0.00709
305 | Chr1 4004757 4004771 RAP211 . + 2.18e-07 0.00205
306 | Chr1 4004757 4004771 AT1G75490 . + 1.54e-07 0.00218
307 | Chr1 4004757 4004771 RAP26 . - 3.11e-08 0.000521
308 | Chr1 4004757 4004771 PUCHI . + 1.64e-08 0.000302
309 | Chr1 4004757 4004771 ERF087 . - 1.23e-08 0.000228
310 | Chr1 4004757 4004771 ERF5 . + 4.47e-09 0.000101
311 | Chr1 4004757 4004771 ESE1 . - 2.71e-09 6.43e-05
312 | Chr1 4004757 4004776 AT4G28140 . - 8.21e-11 6.99e-06
313 | Chr1 4004757 4004776 DREB26 . + 1.59e-07 0.00495
314 | Chr1 4004757 4004777 RAP212 . - 1.53e-10 1.82e-05
315 | Chr1 4004757 4004777 ERF2 . + 1.03e-12 1.47e-07
316 | Chr1 4004757 4004777 ERF9 . - 4.22e-13 9.89e-08
317 | Chr1 4004758 4004768 ERF118 . + 7.65e-08 0.000844
318 | Chr1 4004758 4004772 AT4G16750 . + 5.16e-07 0.0115
319 | Chr1 4004758 4004772 ERF105 . - 4.38e-10 1.66e-05
320 | Chr1 4004758 4004772 ERF15 . + 1.98e-10 8.39e-06
321 | Chr1 4004758 4004776 ABR1 . - 3.36e-10 3.92e-05
322 | Chr1 4004758 4004776 ERF104 . + 5.1e-13 8.58e-08
323 | Chr1 4004759 4004769 ERF3 . + 7.65e-08 0.000947
324 | Chr1 4004759 4004769 AT2G33710 . + 7.65e-08 0.000947
325 | Chr1 4004759 4004773 AT5G18450 . - 2.56e-08 0.000707
326 | Chr1 4004759 4004773 AT1G22810 . - 2.22e-08 0.000651
327 | Chr1 4004759 4004775 AT4G18450 . - 1e-11 9.64e-07
328 | Chr1 4004759 4004777 ESE3 . - 5.72e-11 6.48e-06
329 | Chr1 4004759 4004779 AT1G77640 . - 4.82e-06 0.0722
330 | Chr1 4004759 4004779 ERF10 . + 4.96e-12 5.43e-07
331 | Chr1 4004759 4004779 LEP . + 3.52e-12 5.17e-07
332 | Chr1 4004760 4004770 AT3G57600 . + 2.83e-07 0.00301
333 | Chr1 4004760 4004771 CRF4 . + 3.82e-08 0.000617
334 | Chr1 4004760 4004774 CEJ1 . + 1.04e-06 0.015
335 | Chr1 4004760 4004774 AT5G67000 . - 2.14e-07 0.00625
336 | Chr1 4004760 4004774 AT1G44830 . + 1.31e-07 0.00309
337 | Chr1 4004760 4004774 RAP211 . + 1.06e-08 0.000349
338 | Chr1 4004760 4004774 RAP26 . - 1.67e-09 6.56e-05
339 | Chr1 4004760 4004774 ERF087 . - 1.02e-09 4.07e-05
340 | Chr1 4004760 4004774 PUCHI . + 7.31e-10 2.93e-05
341 | Chr1 4004760 4004774 AT1G75490 . + 1.24e-08 0.000337
342 | Chr1 4004760 4004774 ERF5 . + 1.98e-10 9.29e-06
343 | Chr1 4004760 4004774 ESE1 . - 1.98e-10 9.3e-06
344 | Chr1 4004760 4004779 DREB26 . + 2.04e-07 0.00564
345 | Chr1 4004760 4004779 AT4G28140 . - 3.77e-11 4.5e-06
346 | Chr1 4004760 4004780 RAP212 . - 1.32e-09 6.24e-05
347 | Chr1 4004760 4004780 ERF2 . + 1.65e-10 6.05e-06
348 | Chr1 4004760 4004780 ERF9 . - 4.09e-11 2.53e-06
349 | Chr1 4004761 4004771 ERF118 . + 7.65e-08 0.000844
350 | Chr1 4004761 4004775 AT4G16750 . + 5.16e-07 0.0115
351 | Chr1 4004761 4004775 ERF105 . - 4.38e-10 1.66e-05
352 | Chr1 4004761 4004775 ERF15 . + 1.98e-10 8.39e-06
353 | Chr1 4004761 4004779 ERF104 . + 3.62e-12 4.16e-07
354 | Chr1 4004761 4004779 ABR1 . - 2.51e-09 0.000135
355 | Chr1 4004762 4004772 ERF3 . + 7.65e-08 0.000947
356 | Chr1 4004762 4004772 AT2G33710 . + 7.65e-08 0.000947
357 | Chr1 4004762 4004776 AT5G18450 . - 2.56e-08 0.000707
358 | Chr1 4004762 4004776 AT1G22810 . - 2.22e-08 0.000651
359 | Chr1 4004762 4004778 AT4G18450 . - 4.67e-11 3.4e-06
360 | Chr1 4004762 4004780 ESE3 . - 7.68e-09 0.000166
361 | Chr1 4004762 4004782 LEP . + 9.02e-10 2.51e-05
362 | Chr1 4004763 4004773 AT3G57600 . + 2.83e-07 0.00301
363 | Chr1 4004763 4004774 CRF4 . + 3.82e-08 0.000617
364 | Chr1 4004763 4004777 CEJ1 . + 1.04e-06 0.015
365 | Chr1 4004763 4004777 AT5G67000 . - 2.14e-07 0.00625
366 | Chr1 4004763 4004777 AT1G44830 . + 1.31e-07 0.00309
367 | Chr1 4004763 4004777 RAP211 . + 1.06e-08 0.000349
368 | Chr1 4004763 4004777 RAP26 . - 1.67e-09 6.56e-05
369 | Chr1 4004763 4004777 ERF087 . - 1.02e-09 4.07e-05
370 | Chr1 4004763 4004777 PUCHI . + 7.31e-10 2.93e-05
371 | Chr1 4004763 4004777 AT1G75490 . + 1.24e-08 0.000337
372 | Chr1 4004763 4004777 ERF5 . + 1.98e-10 9.29e-06
373 | Chr1 4004763 4004777 ESE1 . - 1.98e-10 9.3e-06
374 | Chr1 4004763 4004783 RAP212 . - 3.23e-08 0.000499
375 | Chr1 4004763 4004783 ERF9 . - 1.12e-08 0.00013
376 | Chr1 4004764 4004774 ERF118 . + 7.65e-08 0.000844
377 | Chr1 4004764 4004778 AT4G16750 . + 1.75e-06 0.02
378 | Chr1 4004764 4004778 ERF105 . - 6.12e-09 0.000131
379 | Chr1 4004764 4004778 ERF15 . + 2.81e-09 6.94e-05
380 | Chr1 4004764 4004782 ABR1 . - 8.01e-08 0.00116
381 | Chr1 4004765 4004775 ERF3 . + 7.65e-08 0.000947
382 | Chr1 4004765 4004775 AT2G33710 . + 7.65e-08 0.000947
383 | Chr1 4004765 4004779 AT5G18450 . - 3.55e-07 0.00402
384 | Chr1 4004765 4004779 AT1G22810 . - 1.88e-07 0.00284
385 | Chr1 4004766 4004776 AT3G57600 . + 2.83e-07 0.00301
386 | Chr1 4004766 4004777 CRF4 . + 3.82e-08 0.000617
387 | Chr1 4004766 4004780 RAP211 . + 2.68e-08 0.000729
388 | Chr1 4004767 4004777 ERF118 . + 7.65e-08 0.000844
389 | Chr1 4004768 4004778 ERF3 . + 4.49e-07 0.0035
390 | Chr1 4004768 4004778 AT2G33710 . + 3.56e-07 0.00302
391 | Chr1 4004769 4004782 AT1G36060 . - 1.49e-06 0.1
392 | Chr1 4004769 4004782 ERF017 . - 3.65e-08 0.064
393 | Chr1 4004834 4004841 ERF008 . + 6.67e-06 0.101
394 | Chr1 4005108 4005128 COG1 . + 2.8e-06 0.0424
395 | Chr1 4005169 4005181 ZHD1 . + 4.72e-06 0.14
396 | Chr1 4005172 4005181 ATHB23 . + 4.87e-06 0.124
397 | Chr1 4005222 4005236 AT3G46070 . - 7.32e-06 0.429
398 | Chr1 4005552 4005566 ZHD6 . - 1.56e-07 0.084
399 | Chr1 4005553 4005562 ATHB23 . - 7.3e-06 0.138
400 | Chr1 4005776 4005785 ATHB23 . - 9.73e-06 0.153
401 | Chr1 4005779 4005793 ATHB34 . - 5.15e-06 0.167
402 | Chr1 4005859 4005868 WRKY40 . + 5.98e-06 0.352
403 | Chr1 4005941 4005969 AT5G66940 . + 1.72e-06 0.013
404 | Chr1 4006116 4006126 REF6 . + 6.08e-07 0.182
405 | Chr1 4006183 4006212 BPC5 . + 3.23e-10 6.32e-06
406 | Chr1 4006187 4006216 BPC5 . + 1.09e-09 1.79e-05
407 | Chr1 4006188 4006211 BPC1 . + 6.88e-09 7.67e-05
408 | Chr1 4006189 4006218 BPC5 . + 9.8e-13 3.88e-08
409 | Chr1 4006191 4006220 BPC5 . + 8.75e-15 5.32e-10
410 | Chr1 4006192 4006212 BPC6 . - 4.25e-08 0.000425
411 | Chr1 4006192 4006215 BPC1 . + 4.19e-12 1.37e-07
412 | Chr1 4006193 4006222 BPC5 . + 4.26e-16 3.18e-11
413 | Chr1 4006194 4006214 BPC6 . - 7.6e-12 2.74e-07
414 | Chr1 4006194 4006217 BPC1 . + 1.88e-13 8.38e-09
415 | Chr1 4006195 4006224 BPC5 . + 4.25e-16 3.18e-11
416 | Chr1 4006196 4006216 BPC6 . - 1.28e-09 2.35e-05
417 | Chr1 4006196 4006219 BPC1 . + 1.36e-13 6.14e-09
418 | Chr1 4006197 4006226 BPC5 . + 4.26e-16 3.18e-11
419 | Chr1 4006198 4006211 RAMOSA1 . + 3.63e-08 0.00049
420 | Chr1 4006198 4006218 BPC6 . - 3.01e-11 8.79e-07
421 | Chr1 4006198 4006221 BPC1 . + 1.05e-13 4.88e-09
422 | Chr1 4006199 4006228 BPC5 . + 7.5e-19 8.4e-14
423 | Chr1 4006200 4006213 RAMOSA1 . + 3.48e-09 7.3e-05
424 | Chr1 4006200 4006220 BPC6 . - 1.86e-13 8.8e-09
425 | Chr1 4006200 4006223 BPC1 . + 3.16e-15 2.04e-10
426 | Chr1 4006201 4006230 BPC5 . + 7.5e-19 8.4e-14
427 | Chr1 4006202 4006215 RAMOSA1 . + 3.48e-09 7.3e-05
428 | Chr1 4006202 4006222 BPC6 . - 1.86e-13 8.8e-09
429 | Chr1 4006202 4006225 BPC1 . + 3.16e-15 2.04e-10
430 | Chr1 4006203 4006232 BPC5 . + 7.5e-19 8.4e-14
431 | Chr1 4006204 4006217 RAMOSA1 . + 3.48e-09 7.3e-05
432 | Chr1 4006204 4006224 BPC6 . - 1.86e-13 8.8e-09
433 | Chr1 4006204 4006227 BPC1 . + 3.16e-15 2.04e-10
434 | Chr1 4006205 4006234 BPC5 . + 2.05e-15 1.41e-10
435 | Chr1 4006206 4006219 RAMOSA1 . + 3.48e-09 7.3e-05
436 | Chr1 4006206 4006226 BPC6 . - 1.86e-13 8.8e-09
437 | Chr1 4006206 4006229 BPC1 . + 3.16e-15 2.04e-10
438 | Chr1 4006207 4006236 BPC5 . + 5.12e-12 1.72e-07
439 | Chr1 4006208 4006221 RAMOSA1 . + 3.48e-09 7.3e-05
440 | Chr1 4006208 4006228 BPC6 . - 1.86e-13 8.8e-09
441 | Chr1 4006208 4006231 BPC1 . + 3.16e-15 2.04e-10
442 | Chr1 4006209 4006238 BPC5 . + 1.51e-13 7.3e-09
443 | Chr1 4006210 4006223 RAMOSA1 . + 3.48e-09 7.3e-05
444 | Chr1 4006210 4006230 BPC6 . - 1.86e-13 8.8e-09
445 | Chr1 4006210 4006233 BPC1 . + 2.64e-14 1.5e-09
446 | Chr1 4006212 4006225 RAMOSA1 . + 3.48e-09 7.3e-05
447 | Chr1 4006212 4006232 BPC6 . - 1.86e-13 8.8e-09
448 | Chr1 4006212 4006235 BPC1 . + 8.32e-12 2.49e-07
449 | Chr1 4006214 4006227 RAMOSA1 . + 3.48e-09 7.3e-05
450 | Chr1 4006214 4006234 BPC6 . - 8.29e-12 2.9e-07
451 | Chr1 4006214 4006237 BPC1 . + 1.12e-10 2.43e-06
452 | Chr1 4006216 4006229 RAMOSA1 . + 3.48e-09 7.3e-05
453 | Chr1 4006216 4006236 BPC6 . - 1.32e-09 2.42e-05
454 | Chr1 4006218 4006231 RAMOSA1 . + 3.48e-09 7.3e-05
455 | Chr1 4006220 4006233 RAMOSA1 . + 6.34e-09 0.000117
456 | Chr1 4006312 4006321 ERF7 . - 2.34e-06 0.113
457 | Chr1 4006312 4006321 ERF8 . - 2.34e-06 0.115
458 | Chr1 4006312 4006323 ERF11 . - 6.47e-06 0.0878
459 | Chr1 4006312 4006323 ERF4 . - 5.49e-06 0.0839
460 | Chr1 4006314 4006321 ERF1B . - 8.13e-06 0.12
461 | Chr1 4006314 4006321 ERF13 . - 8.13e-06 0.12
462 | Chr1 4006327 4006334 ERF008 . - 6.67e-06 0.101
463 | Chr1 4006339 4006348 ERF7 . - 9.64e-06 0.118
464 | Chr1 4006432 4006442 AT3G57600 . + 2.83e-07 0.00301
465 | Chr1 4006432 4006445 AT1G36060 . - 9.53e-06 0.124
466 | Chr1 4006436 4006443 ERF008 . + 6.67e-06 0.101
467 | Chr1 4006500 4006509 abi4 . + 1.84e-06 0.0682
468 | Chr1 4006502 4006509 RAP2-3 . + 6.67e-06 0.07
469 | Chr1 4006502 4006509 RAP2-6 . + 6.67e-06 0.07
470 | Chr1 4006502 4006509 ERF109 . + 6.67e-06 0.0701
471 | Chr1 4006503 4006510 ERF008 . + 6.67e-06 0.101
472 | Chr1 4006620 4006634 At5g05790 . + 5.6e-06 0.347
473 | Chr1 4006679 4006699 TRP2 . - 6.24e-06 0.298
474 | Chr1 4006688 4006706 dof4.2 . - 8.21e-06 0.0838
475 | Chr1 4006862 4006888 AT1G69570 . - 1.16e-07 0.003
476 | Chr1 4006868 4006888 COG1 . + 4.79e-08 0.00795
477 | Chr1 4006870 4006898 AT5G66940 . - 8.61e-08 0.00226
478 | Chr1 4006871 4006891 Adof1 . + 3.38e-07 0.00921
479 | Chr1 4006872 4006890 dof4.2 . + 1.95e-07 0.0235
480 | Chr1 4006872 4006892 Adof1 . + 1.01e-06 0.0163
481 | Chr1 4006873 4006893 OBP3 . - 1.05e-07 0.00287
482 | Chr1 4006874 4006894 OBP3 . - 2.09e-07 0.00412
483 | Chr1 4006875 4006888 PI . + 8e-06 0.155
484 | Chr1 4006876 4006889 OBP4 . + 9.69e-07 0.0923
485 | Chr1 4006876 4006896 AT5G02460 . + 4.76e-07 0.00809
486 | Chr1 4006876 4006896 AT2G28810 . - 1.2e-07 0.00572
487 | Chr1 4006877 4006897 OBP1 . - 1.25e-07 0.00512
488 | Chr1 4007043 4007055 GBF2 . - 8.58e-06 0.447
489 | Chr1 4007045 4007056 HYH . - 4.13e-06 0.359
490 | Chr1 4007046 4007053 PIF4 . - 9.9e-06 0.439
491 | Chr1 4007283 4007302 DREB26 . + 1.14e-06 0.0139
492 | Chr1 4007289 4007303 CEJ1 . + 4.58e-07 0.015
493 | Chr1 4007289 4007303 AT1G44830 . + 8.94e-08 0.00309
494 | Chr1 4007289 4007303 AT1G75490 . + 5.64e-08 0.00117
495 | Chr1 4007347 4007361 AT5G18450 . - 5.01e-07 0.00483
496 | Chr1 4007347 4007367 AT1G77640 . - 4.46e-06 0.0705
497 | Chr1 4007348 4007361 AT1G36060 . - 1.45e-06 0.1
498 | Chr1 4007352 4007359 ERF008 . + 6.67e-06 0.101
499 |
--------------------------------------------------------------------------------
/lib/features.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | import pandas as pd
4 | import numpy as np
5 | from scipy import stats
6 | from pybedtools import BedTool
7 |
8 | from lib import misc
9 |
10 |
class Geneinfo():
    """
    Provide gene information and the binning settings used by the feature
    functions in this module.

    Every attribute can be given as a keyword argument; the defaults are the
    placeholder values the class has always used, so ``Geneinfo()`` followed
    by attribute assignment keeps working.

    Attributes:
    1. gene - Gene identifier (names the output folder when alias is "NA")
    2. alias - Gene alias, or "NA" when the gene has none
    3. chrom - Chromosome name of the target region
    4. start - First bin start of the target region
    5. end - Upper bound (exclusive) for bin starts
    6. strand - Strand of the gene ("+" or "-")
    7. binsize - Width of each bin
    8. step - Distance between consecutive bin starts
    """

    def __init__(self, gene="Gene", alias="NA", chrom="chrom", start=0,
                 end=1000, strand="+", binsize=10, step=10):
        self.gene = gene
        self.alias = alias
        self.chrom = chrom
        self.start = start
        self.end = end
        self.strand = strand
        self.binsize = binsize
        self.step = step

    def __repr__(self):
        return (
            "Geneinfo(gene=%r, alias=%r, chrom=%r, start=%r, end=%r, "
            "strand=%r, binsize=%r, step=%r)"
            % (self.gene, self.alias, self.chrom, self.start, self.end,
               self.strand, self.binsize, self.step)
        )
22 |
23 |
def kmeans_like_diff(wt, control):
    """
    Calculate the k-means-like phenotype difference between mutants and WT.
    (Does not consider the length of mutations)

    Mandatory parameters:
    1. wt - Phenotype values of the WT samples
    2. control - Phenotype values of the mutant samples

    Returns:
    The mean absolute distance of the values in control from the average of
    wt. Returns 0.0 when control is empty (nothing to compare) instead of
    dividing by zero.
    """
    if len(control) == 0:
        return 0.0
    y_avg = np.average(wt)
    return np.mean(np.abs(np.asarray(control, dtype=float) - y_avg))
33 |
34 |
def kmeans_like_diff2(wt, control, binsize):
    """
    Calculate the k-means-like phenotype difference between mutants and WT.
    (Considers the influence of the length of mutations)

    Mandatory parameters:
    1. wt - Phenotype values of the WT samples
    2. control - Mutant records; each record is indexable with the start at
                 index 1, the end at index 2 and the phenotype value at index 4
    3. binsize - Bin width used to turn a mutation length into a weight

    Returns:
    The average over the records of |value - mean(wt)| * (end - start) / binsize.
    Returns 0.0 when control is empty instead of dividing by zero.
    """
    if len(control) == 0:
        return 0.0
    y_avg = np.average(wt)
    weighted = [
        abs(float(rec[4]) - y_avg) * ((int(rec[2]) - int(rec[1])) / binsize)
        for rec in control
    ]
    return sum(weighted) / len(control)
44 |
45 |
def openchromatin_scores(geneinfo, bedfile, peakfile = "", outdir = "./", samplename = "openchromatin"):
    """
    Generate the open chromatin feature in specific bins.
    (Alternative data: ATAC-seq, DNase-seq, MNase-seq)

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. bedfile - Open chromatin values in bedGraph format

    Alternative parameters:
    1. peakfile - Enrichment regions called from open chromatin data in BED format.
                  A bin overlapping a peak keeps its full score; every other bin
                  (or every bin when no peak file is given) is weighted by 0.5
    2. outdir - Output directory for saving the scores file (bedGraph format)
    3. samplename - Basename of the output file
                    (<outdir>/<gene>/<samplename>.bedGraph)

    Returns:
    A dict mapping each bin start to its weighted score divided by the highest
    raw bin score. An empty dict is returned, and no file is written, when
    bedfile does not exist, when the region contains no bins, or when the
    highest bin score is 0.
    """

    # Get gene info (the alias, when set, names the output folder)
    gene_alias = geneinfo.gene if geneinfo.alias == "NA" else geneinfo.alias
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)
    smooth_openchromatin = {}
    if not os.path.exists(bedfile):
        return smooth_openchromatin

    # Load the signal track and, when given, the peak regions
    oc_peak = BedTool(peakfile) if peakfile else None
    oc_score = BedTool(bedfile)

    # Calculate scores
    positions = list(range(geneinfo.start, geneinfo.end, geneinfo.step))
    oc_info = []
    overlap_list = []
    for pos in positions:
        binbed = BedTool("\t".join([chromosome, str(pos), str(pos+binsize)])+"\n",
                         from_string=True)
        sclst = [float(str(x).split()[3]) for x in oc_score.intersect(binbed)]
        # A zero sum (which includes "no data in this bin") counts as no signal
        oc_info.append(np.average(sclst) if sum(sclst) else 0)
        in_peak = oc_peak is not None and any(True for _ in oc_peak.intersect(binbed))
        overlap_list.append(1 if in_peak else 0.5)

    # Scale the scores; nothing to scale without bins or without signal
    max_score = max(oc_info, default=0)
    if not max_score:
        return smooth_openchromatin
    outpath = outdir + "/" + gene_alias + "/" + samplename + ".bedGraph"
    with open(outpath, "w") as outf:
        for pos, raw, weight in zip(positions, oc_info, overlap_list):
            score = raw * weight / max_score
            smooth_openchromatin[pos] = score
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)

    return smooth_openchromatin
127 |
128 |
def ptm_scores(geneinfo, bedfile, ocname, outdir = "./", samplename = "PTM", minratio = 0.2):
    """
    Generate the histone modification feature in specific bins.
    (Alternative data: ChIP-seq)

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. bedfile - histone modification values in bedGraph format
    3. ocname - Basename of the open chromatin scores file that must already
                exist as <outdir>/<gene>/<ocname>.bedGraph

    Alternative parameters:
    1. outdir - Output directory for saving the scores file (bedGraph format)
    2. samplename - Basename of the output file
                    (<outdir>/<gene>/<samplename>.bedGraph)
    3. minratio - Lower bound applied to the open chromatin ratios

    Returns:
    A dict mapping each bin start to (1 - smoothed PTM score) * open chromatin
    ratio, divided by the highest such value. An empty dict is returned, and
    no file is written, when the region contains no bins.
    """

    # Get gene info (the alias, when set, names the output folder)
    gene_alias = geneinfo.gene if geneinfo.alias == "NA" else geneinfo.alias
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)

    # Load bedGraph file as bed file
    ptm_track = BedTool(bedfile)

    # Calculate scores
    ptm_info = []
    posinfo = {}
    for i, pos in enumerate(range(geneinfo.start, geneinfo.end, geneinfo.step)):
        posinfo[i] = pos
        binbed = BedTool("\t".join([chromosome, str(pos), str(pos+binsize)])+"\n",
                         from_string=True)
        values = [float(str(x).split()[3]) for x in ptm_track.intersect(binbed)]
        score = np.average(values) if values else 0
        if pd.isna(score):
            score = 0
        ptm_info.append(score)
    if not ptm_info:
        return {}

    # Scale to the highest bin; an all-zero track is left untouched
    max_score = max(ptm_info)
    if max_score:
        ptm_info = [x/max_score for x in ptm_info]

    # Get ratios from open chromatin results
    ocfile = outdir + "/" + gene_alias + "/" + ocname + ".bedGraph"
    oc_ratios = {}
    for interval in BedTool(ocfile):
        chrom, start, end, score = str(interval).rstrip().split("\t")
        if float(score) > minratio:
            oc_ratios[int(start)] = float(score)
        else:
            oc_ratios[int(start)] = minratio

    # Smooth the scores
    # NOTE(review): smooth_scores_fill2 is assumed to return a dict keyed by
    # bin start - confirm against lib/misc.py
    smooth_ptm = misc.smooth_scores_fill2(ptm_info, posinfo)
    weighted = {pos: (1 - smooth_ptm[pos]) * oc_ratios[pos] for pos in smooth_ptm}
    max_score2 = max(weighted.values(), default=0)
    outpath = outdir + "/" + gene_alias + "/" + samplename + ".bedGraph"
    with open(outpath, "w") as outf:
        for pos, raw in weighted.items():
            # Values are left unscaled when the maximum is 0
            score = raw / max_score2 if max_score2 else raw
            smooth_ptm[pos] = score
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)

    return smooth_ptm
201 |
202 |
def merge_reps(geneinfo, feature_list, outdir = "./", samplename = "merged"):
    """
    Merge the NGS feature in specific bins.
    (Alternative data: DNase-seq, ATAC-seq, ChIP-seq)

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. feature_list - A list contains features need to be merged
                      (dicts mapping bin start to score; empty ones are skipped)

    Alternative parameters:
    1. outdir - Output directory for saving the scores file (bedGraph format)
    2. samplename - Basename of the output file
                    (<outdir>/<gene>/<samplename>.bedGraph)

    Returns:
    A dict mapping each bin start of the first non-empty feature to the summed
    score divided by the highest summed score (0 for every bin when that
    maximum is 0). An empty dict is returned, and no file is written, when
    every feature is empty.
    """

    # Get gene info (the alias, when set, names the output folder)
    gene_alias = geneinfo.gene if geneinfo.alias == "NA" else geneinfo.alias
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)

    # Merge the feature scores
    replicates = [feature for feature in feature_list if feature]
    scores_merge = {}
    if not replicates:
        return scores_merge
    # Bin positions come from the first feature that actually has data, so a
    # leading empty feature no longer produces an empty result
    positions = list(replicates[0])
    scorelist = [replicates[0][pos] for pos in positions]
    for feature in replicates[1:]:
        # Features are combined by bin order, not by matching positions
        for i, pos in enumerate(feature):
            scorelist[i] += feature[pos]

    peak = max(scorelist)
    outpath = outdir + "/" + gene_alias + "/" + samplename + ".bedGraph"
    with open(outpath, "w") as outf:
        for pos, total in zip(positions, scorelist):
            score = total / peak if peak else 0
            scores_merge[pos] = score
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)

    return scores_merge
253 |
254 |
def motif_scores(geneinfo, bedfile, outdir = "./", flanking = 3):
    """
    Generate the TF motifs feature in specific bins.
    (Alternative data: Motif sites calculated by FIMO with PlantTFDB/JASPAR PWM files)

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. bedfile - Motif positions in BED format

    Alternative parameters:
    1. outdir - Output directory for saving the scores file (bedGraph format)
    2. flanking - Number of bins on each side included when averaging the
                  motif density around a bin

    Returns:
    A dict mapping each bin start to its smoothed motif density divided by the
    highest smoothed density (values are left unscaled when that maximum is 0).
    The same scores are written to <outdir>/<gene>/motifs.bedGraph.
    """

    # Get gene info (the alias, when set, names the output folder)
    gene_alias = geneinfo.gene if geneinfo.alias == "NA" else geneinfo.alias
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)

    # Load bed file
    tf_motif = BedTool(bedfile)

    # Per-bin density: span from the leftmost start to the rightmost end of
    # the motif parts that fall inside the bin, relative to the bin size
    motif_density = []
    posinfo = {}
    for i, pos in enumerate(range(geneinfo.start, geneinfo.end, geneinfo.step)):
        posinfo[i] = pos
        binbed = BedTool("\t".join([chromosome, str(pos), str(pos+binsize)])+"\n",
                         from_string=True)
        coords = []
        for hit in tf_motif.intersect(binbed):
            fields = str(hit).split()
            coords.append(int(fields[1]))
            coords.append(int(fields[2]))
        if coords:
            motif_density.append((max(coords) - min(coords)) / binsize)
        else:
            motif_density.append(0)

    # Average each bin with up to `flanking` neighbours on each side; the
    # divisor is the number of bins really in the window, so the average stays
    # correct at both edges and for regions shorter than the window
    bincount = len(motif_density)
    motif_info = []
    for i in range(bincount):
        lo = max(0, i - flanking)
        hi = min(bincount, i + flanking + 1)
        motif_info.append(sum(motif_density[lo:hi]) / (hi - lo))

    # Smooth the scores
    # NOTE(review): smooth_scores_fill2 is assumed to return a dict keyed by
    # bin start - confirm against lib/misc.py
    smooth_motif = misc.smooth_scores_fill2(motif_info, posinfo, minratio=1)
    max_score = max(smooth_motif.values(), default=0)
    with open(outdir + "/" + gene_alias + "/motifs.bedGraph", "w") as outf:
        for pos in smooth_motif:
            score = smooth_motif[pos] / max_score if max_score else smooth_motif[pos]
            smooth_motif[pos] = score
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)

    return smooth_motif
325 |
326 |
def cns_scores(geneinfo, bedfile, outdir = "./"):
    """
    Generate the conservation feature in specific bins.
    (Alternative data: Phastcons scores)

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. bedfile - Conservation scores in BED format

    Alternative parameters:
    1. outdir - Output directory for saving the scores file (bedGraph format)

    Returns:
    The smoothed per-bin scores produced by misc.smooth_scores2, which are
    also written to <outdir>/<gene>/CNS.bedGraph. An empty dict is returned,
    and no file is written, when bedfile does not exist.
    """

    # Get gene info (the alias, when set, names the output folder)
    gene_alias = geneinfo.gene if geneinfo.alias == "NA" else geneinfo.alias
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)
    if not os.path.exists(bedfile):
        return {}

    # Load bed file
    cns = BedTool(bedfile)

    # Calculate scores
    cns_info = []
    posinfo = {}
    for i, pos in enumerate(range(geneinfo.start, geneinfo.end, geneinfo.step)):
        posinfo[i] = pos
        binbed = BedTool("\t".join([chromosome, str(pos), str(pos+binsize)])+"\n",
                         from_string=True)
        sclst = [float(str(x).split()[3]) for x in cns.intersect(binbed)]
        # A zero sum (which includes "no data in this bin") counts as no signal
        cns_info.append(np.average(sclst) if sum(sclst) else 0)

    # Smooth the scores
    # NOTE(review): smooth_scores2 is assumed to return a dict keyed by bin
    # start - confirm against lib/misc.py
    smooth_cns = misc.smooth_scores2(cns_info, posinfo)
    with open(outdir + "/" + gene_alias + "/CNS.bedGraph", "w") as outf:
        for pos in smooth_cns:
            score = smooth_cns[pos]
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)

    return smooth_cns
386 |
387 |
def genopheno_scores(geneinfo, bedfile, outdir = "./"):
    """
    Generate the genotype and phenotype relationship feature in specific bins.
    (Alternative data: SNPs&Indels and Phenotype data)

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. bedfile - genotype and phenotype relationship scores in BED format
                 (the last column of each record is used as its score)

    Alternative parameters:
    1. outdir - Output directory for saving the scores file (bedGraph format)

    Returns:
    A dict mapping each bin start to its smoothed score divided by the highest
    smoothed score (values are left unscaled when that maximum is 0). The same
    scores are written to <outdir>/<gene>/genopheno.bedGraph.
    """

    # Get gene info (the alias, when set, names the output folder)
    gene_alias = geneinfo.gene if geneinfo.alias == "NA" else geneinfo.alias
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)

    # Load bed file
    genopheno = BedTool(bedfile)

    # Calculate scores: each bin gets the sum of the scores overlapping it
    genopheno_info = []
    posinfo = {}
    for i, pos in enumerate(range(geneinfo.start, geneinfo.end, geneinfo.step)):
        posinfo[i] = pos
        binbed = BedTool("\t".join([chromosome, str(pos), str(pos+binsize)])+"\n",
                         from_string=True)
        values = [float(str(x).split()[-1]) for x in genopheno.intersect(binbed)]
        genopheno_info.append(sum(values))

    # Smooth the scores
    # NOTE(review): smooth_scores_fill2 is assumed to return a dict keyed by
    # bin start - confirm against lib/misc.py
    smooth_genopheno = misc.smooth_scores_fill2(genopheno_info, posinfo)
    max_score = max(smooth_genopheno.values(), default=0)
    with open(outdir + "/" + gene_alias + "/genopheno.bedGraph", "w") as outf:
        for pos in smooth_genopheno:
            value = smooth_genopheno[pos]
            score = value / max_score if max_score else value
            smooth_genopheno[pos] = score
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)

    return smooth_genopheno
447 |
448 |
def aggregate_scores(geneinfo, scorelist, weightlist, outdir = "./"):
    """
    Generate the aggregate score in specific bins.

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. scorelist - A list host multiple feature scores from different data
                   (dicts mapping bin start to score)
    3. weightlist - A list contains different weights assigned to different features
                    (Should have the same order and numbers as scorelist)

    Alternative parameters:
    1. outdir - Output directory for saving the scores file (bedGraph format)

    Returns:
    A dict mapping each bin start to the weighted sum of the feature scores,
    each weight being divided by the sum of all weights.
    NOTE(review): the values written to <outdir>/<gene>/aggregate.bedGraph are
    additionally divided by the highest aggregate, but the returned dict is
    not - confirm that callers expect the unscaled values.

    Raises:
    ZeroDivisionError when the weights sum to 0 and any feature has scores.
    """

    # Get gene info (the alias, when set, names the output folder)
    gene_alias = geneinfo.gene if geneinfo.alias == "NA" else geneinfo.alias
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)

    # The output file is created (possibly empty) on every path, as before,
    # but is now always closed, including on the early returns
    with open(outdir + "/" + gene_alias + "/aggregate.bedGraph", "w") as outf:
        # Calculate scores
        aggregate_info = {}
        total = sum(weightlist)
        for feature, weight in zip(scorelist, weightlist):
            for pos in feature:
                aggregate = feature[pos] * weight / total
                if pos in aggregate_info:
                    aggregate_info[pos] += aggregate
                else:
                    aggregate_info[pos] = aggregate
        if not aggregate_info:
            # No feature had any score: report the gene and give up
            print(gene_alias)
            return aggregate_info
        max_score = max(aggregate_info.values())
        if not max_score:
            return aggregate_info
        for pos in aggregate_info:
            score = aggregate_info[pos] / max_score
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)

    return aggregate_info
503 |
504 |
def phenodata_scores(geneinfo, bedfile, method = "kmeans1", outdir = "./", randbg = 0.02):
    """
    Calculate the average phenodata value from multiple samples in specific bins.

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. bedfile - Phenotype data of mutants in BED format
                 chrom start end samplename avg_value
                 (records whose samplename is "WT" are the wild-type reference)

    Alternative parameters:
    1. method - Methods used for calculating phenotype difference between WT and mutants
                ["ratio", "stdev", "utest", "kmeans1", "kmeans2"]
    2. outdir - Output directory for saving the scores file (bedGraph format)
    3. randbg - Half-width of the uniform random offset added to every
                non-zero scaled score (the random generator is seeded with 81)

    Outputs:
    1. <outdir>/<gene>/phenoscores_<method>_raw.bedGraph - scaled scores per bin
    2. <outdir>/<gene>/phenoscores_<method>.bedGraph - smoothed scores rescaled
       between the lowest non-zero and the highest smoothed score

    Returns:
    A dict mapping each bin start to its smoothed, rescaled score.

    Raises:
    ValueError when method is not one of the available methods.
    """

    # Reject an unknown method up front; it used to be reported per bin while
    # the score of that bin was left undefined or stale
    methods = ["ratio", "stdev", "utest", "kmeans1", "kmeans2"]
    if method not in methods:
        raise ValueError("Cannot find this method. Available methods are: %s" % methods)

    # Get gene info (the alias, when set, names the output folder)
    gene_alias = geneinfo.gene if geneinfo.alias == "NA" else geneinfo.alias
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)

    # Load bed file
    phenodata = BedTool(bedfile)

    # Calculate scores
    phenoinfo = []
    posinfo = {}
    for i, pos in enumerate(range(geneinfo.start, geneinfo.end, geneinfo.step)):
        posinfo[i] = pos
        binbed = BedTool("\t".join([chromosome, str(pos), str(pos+binsize)])+"\n",
                         from_string=True)
        rows = [str(x).split() for x in phenodata.intersect(binbed)]
        mutant_rows = [row for row in rows if row[3] != "WT"]
        mutant_phenos = [float(row[4]) for row in mutant_rows]
        wt_phenos = [float(row[4]) for row in rows if row[3] == "WT"]
        if not mutant_phenos:
            score = 0
        elif method == "ratio":
            score = np.average(mutant_phenos) / np.average(wt_phenos)
        elif method == "stdev":
            score = np.std(wt_phenos + mutant_phenos)
        elif method == "utest":
            mannwhitneyu = stats.mannwhitneyu(wt_phenos, mutant_phenos)
            score = -np.log10(mannwhitneyu[1])
        elif method == "kmeans1":
            score = kmeans_like_diff(wt_phenos, mutant_phenos)
        else:
            # "kmeans2": the records keep their coordinates so the mutation
            # length can weight the difference
            score = kmeans_like_diff2(wt_phenos, [row[:5] for row in mutant_rows], binsize)
        phenoinfo.append(score)

    # Scale to the highest bin and add a small seeded random background to
    # the bins that have a score
    max_score = max(phenoinfo, default=0)
    random.seed(81)
    phenoinfo = [max(x/max_score+random.uniform(-randbg, randbg), 0) if x else x for x in phenoinfo]
    # Output raw scores of phenotypes
    rawpath = outdir + "/" + gene_alias + "/phenoscores_" + method + "_raw.bedGraph"
    with open(rawpath, "w") as outraw:
        for i in posinfo:
            print(chromosome, posinfo[i], posinfo[i]+binsize, phenoinfo[i], sep="\t", file=outraw)
    # Output smooth and gap-filled scores of phenotypes
    # NOTE(review): smooth_scores1 is assumed to return a dict keyed by bin
    # start - confirm against lib/misc.py
    smooth_phenos = misc.smooth_scores1(phenoinfo, posinfo)
    nonzero = [x for x in smooth_phenos.values() if x]
    max_score = max(smooth_phenos.values(), default=0)
    min_score = min(nonzero, default=0)
    span = max_score - min_score
    outpath = outdir + "/" + gene_alias + "/phenoscores_" + method + ".bedGraph"
    with open(outpath, "w") as outf:
        for pos in smooth_phenos:
            value = smooth_phenos[pos]
            # With no spread between the non-zero scores every bin is at the
            # minimum, which the rescaling maps to 0
            if value and span:
                score = (value - min_score) / span
            else:
                score = 0
            smooth_phenos[pos] = score
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)

    return smooth_phenos
595 |
596 |
def define_key_regions(geneinfo, aggregate, phenodata, threshold = 0, outdir = "./"):
    """
    Define the key regions of the target site.

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
                  (attributes read here: gene, alias, chrom, binsize)
    2. aggregate - Aggregate scores, a mapping of bin start position -> score
    3. phenodata - Path of the phenotype data file (handed to new_stats).
                   If it is empty/None or the file does not exist, only the
                   key regions are written and the statistics are skipped.

    Alternative parameters:
    1. threshold - Bin with score greater than or equal to the threshold is
                   defined as a key region
                   (Default: 0, meaning the average of aggregate scores is used)
    2. outdir - Output directory; all files are written to <outdir>/<gene alias>/

    Outputs:
    1. key_regions - List of [position, score] for every bin passing the cutoff

    Files written in <outdir>/<gene alias>/:
    - key_regions_raw.bed     one line per key bin
    - key_regions_merged.bed  regions returned by misc.merge_regions
    - plot_scores.txt         per-sample table (only with phenotype data)
    - statistics.txt          cutoff and test results (only with phenotype data)
    """

    # Get gene info
    gene = geneinfo.gene
    genename = geneinfo.alias
    gene_alias = gene if genename == "NA" else genename
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize
    gene_dir = outdir + "/" + gene_alias

    # Check output directory
    misc.check_outdir(outdir)

    # Define the cutoff (a falsy threshold falls back to the mean aggregate score)
    if threshold:
        cutoff = threshold
        cutoff_dev = 0
    else:
        aggregate_scores = list(aggregate.values())
        cutoff_dev = np.std(aggregate_scores)
        cutoff = np.average(aggregate_scores)

    # Classify key regions: keep every bin whose score reaches the cutoff
    key_regions = [[pos, score] for pos, score in aggregate.items() if score >= cutoff]

    # Output key regions info
    merged_regions = misc.merge_regions(key_regions, geneinfo)
    raw_file = gene_dir + "/key_regions_raw.bed"
    merged_file = gene_dir + "/key_regions_merged.bed"
    with open(raw_file, "w") as outregion1, open(merged_file, "w") as outregion2:
        for pos, score in key_regions:
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outregion1)
        for lst in merged_regions:
            print("\t".join(map(str, lst)), file=outregion2)

    # Guard against None/empty paths before asking the file system
    if not (phenodata and os.path.exists(phenodata)):
        print("No Phenotype data detected, output key regions.")
        return key_regions

    # Calculate statistical values; the context manager closes both files
    # even when new_stats or one of the tests below raises.
    plot_file = gene_dir + "/plot_scores.txt"
    stat_file = gene_dir + "/statistics.txt"
    with open(plot_file, "w") as outf, open(stat_file, "w") as outstat:
        print("sample", "group", "ratio", "difference", sep="\t", file=outf)
        # Cutoff of key regions definition
        print("Cutoff for defining key regions: %s" % cutoff, file=outstat)
        print("Cutoff deviation: %s" % cutoff_dev, file=outstat)

        # Calculate difference; each entry is (difference, ratio, sample)
        pheno_all = new_stats(geneinfo, phenodata, outdir = outdir)
        ratios = [x[1] for x in pheno_all]
        mean_ratio = np.average(ratios)
        min_ratio = min(ratios)
        max_ratio = max(ratios)

        # Split samples by whether their ratio is above the mean ratio.
        # NOTE(review): the rescaled values use the min/max of the *ratio*
        # column to rescale the *difference* - confirm this is intended.
        # It also raises ZeroDivisionError when all ratios are equal.
        high_edited = []
        high_edited2 = []
        low_edited = []
        low_edited2 = []
        for diff, ratio, sample in pheno_all:
            rescaled = (diff-min_ratio)/(max_ratio-min_ratio)
            if ratio > mean_ratio:
                high_edited.append(diff)
                high_edited2.append(rescaled)
                print(sample, "high", ratio, diff, sep="\t", file=outf)
            else:
                low_edited.append(diff)
                low_edited2.append(rescaled)
                print(sample, "low", ratio, diff, sep="\t", file=outf)

        phe_ratio = np.average(high_edited2) / np.average(low_edited2)
        phe_pvalue = stats.mannwhitneyu(low_edited, high_edited)
        phe_pvalue2 = stats.ks_2samp(low_edited, high_edited, alternative="greater")
        phe_pvalue3 = stats.f_oneway(low_edited, high_edited)

        # Report on the console first, then in statistics.txt
        report = [
            ("Phenotype differential ratio:", phe_ratio),
            ("Phenotype significance (U test):", phe_pvalue[1]),
            ("Phenotype significance (KS test):", phe_pvalue2[1]),
            ("Phenotype significance (ANOVA):", phe_pvalue3[1]),
        ]
        for label, value in report:
            print(label, value)
        for label, value in report:
            print(label, value, file=outstat)

    return key_regions
715 |
716 |
def new_stats(geneinfo, phenodata, outdir = "./", side="both"):
    """
    Score every sample by its phenotype change and its overlap with key regions.

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
                  (attributes read here: gene, alias)
    2. phenodata - Phenotype data file loaded with BedTool. Column 4 is read
                   as the sample name and column 5 as the phenotype value;
                   rows named "WT" supply the baseline value.

    Alternative parameters:
    1. outdir - Output directory that contains
                <gene alias>/key_regions_merged.bed
    2. side - "none" uses the absolute difference to the WT value, any other
              value keeps the signed difference (Default: "both")

    Outputs:
    1. scores_list - List of (difference, ratio, sample) tuples sorted by
                     ratio in descending order. "ratio" is the summed overlap
                     with the key regions divided by the total key region
                     length, then divided by the largest such value among the
                     samples. If the mean difference is negative, all
                     differences are multiplied by -1.

    Raises:
    ValueError - when the phenotype data contains no "WT" row
    """

    pheno_bed = BedTool(phenodata)
    gene = geneinfo.gene
    genename = geneinfo.alias
    gene_alias = gene if genename == "NA" else genename
    key_regions = outdir + "/" + gene_alias + "/key_regions_merged.bed"
    region_bed = BedTool(key_regions)

    # Total length of all merged key regions (end - start of every record)
    regionlens = 0
    for region in region_bed:
        fields = str(region).split()
        regionlens += int(fields[2]) - int(fields[1])

    # With wao=True the last column of each record is taken as the overlap length
    intersect = pheno_bed.intersect(region_bed, wao=True)
    rows = [str(interval).rstrip().split("\t") for interval in intersect]

    # Seed the baseline with the first WT value so that sample rows listed
    # before the WT row no longer fail on an unassigned variable. A WT row
    # met later still replaces the baseline for the rows that follow it.
    wt_value = None
    for info in rows:
        if info[3] == "WT":
            wt_value = float(info[4])
            break
    if wt_value is None:
        raise ValueError('No "WT" sample found in phenotype data: %s' % phenodata)

    sample_values = {}
    for info in rows:
        sample = info[3]
        pheno = float(info[4])
        if sample == "WT":
            wt_value = pheno
            continue
        length = int(info[-1])
        if sample not in sample_values:
            # The difference is fixed by the first record of each sample
            if side == "none":
                phenoscore = abs(pheno - wt_value)
            else:
                phenoscore = pheno - wt_value
            sample_values[sample] = [phenoscore, 0]
        # Accumulate the covered fraction over every overlapping key region
        sample_values[sample][1] += length / regionlens

    max_ratio = max([x[1] for x in sample_values.values()])
    mean_pheno = np.average([x[0] for x in sample_values.values()])
    # Flip the sign of all differences when they are negative on average
    if mean_pheno < 0:
        for s in sample_values:
            sample_values[s][0] *= -1
    scores_list = sorted([(sample_values[s][0], sample_values[s][1]/max_ratio, s) for s in sample_values],
                         key=lambda x:x[1], reverse=True)

    return scores_list
755 |
756 |
--------------------------------------------------------------------------------