├── test
    ├── single
    │   ├── data
    │   │   ├── gene_test.bed
    │   │   ├── rice_leaf_DHpeaks_test.bed
    │   │   ├── rice_callus_DHpeaks_test.bed
    │   │   ├── rice_H3K27ac_test.bw
    │   │   ├── rice_leaf_DHS_test.bw
    │   │   ├── rice_callus_DHS_test.bw
    │   │   ├── rice.chrom.sizes
    │   │   ├── genopheno_test.bed
    │   │   ├── editing_results_test.bed
    │   │   └── genes_motifs_JASPAR_test.bed
    │   └── config.ini
    └── batch
    │   ├── data
    │       └── README.md
    │   └── config.ini
├── requirements.txt
├── .gitignore
├── config.ini
├── lib
    ├── cores.py
    ├── genopheno.py
    ├── misc.py
    └── features.py
├── README.md
├── single.py
└── batch.py


/test/single/data/gene_test.bed:
--------------------------------------------------------------------------------
1 | Chr1	4003659	4004888	LOC_Os01g08220	.	-
2 | 


--------------------------------------------------------------------------------
/test/single/data/rice_leaf_DHpeaks_test.bed:
--------------------------------------------------------------------------------
1 | Chr1	4000759	4001568
2 | Chr1	4004878	4005453
3 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | scipy
4 | tqdm
5 | biopython
6 | pyBigWig
7 | pybedtools


--------------------------------------------------------------------------------
/test/single/data/rice_callus_DHpeaks_test.bed:
--------------------------------------------------------------------------------
1 | Chr1	4000853	4001184
2 | Chr1	4002860	4003235
3 | Chr1	4004230	4004515
4 | 


--------------------------------------------------------------------------------
/test/single/data/rice_H3K27ac_test.bw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/CAPE/main/test/single/data/rice_H3K27ac_test.bw


--------------------------------------------------------------------------------
/test/single/data/rice_leaf_DHS_test.bw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/CAPE/main/test/single/data/rice_leaf_DHS_test.bw


--------------------------------------------------------------------------------
/test/single/data/rice_callus_DHS_test.bw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/CAPE/main/test/single/data/rice_callus_DHS_test.bw


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__
 2 | node_modules
 3 | .vscode
 4 | .idea
 5 | .env
 6 | .envrc
 7 | .venv
 8 | *.local
 9 | *.log*
10 | logs
11 | .DS_Store


--------------------------------------------------------------------------------
/test/single/data/rice.chrom.sizes:
--------------------------------------------------------------------------------
 1 | Chr1	43270923
 2 | Chr2	35937250
 3 | Chr3	36413819
 4 | Chr4	35502694
 5 | Chr5	29958434
 6 | Chr6	31248787
 7 | Chr7	29697621
 8 | Chr8	28443022
 9 | Chr9	23012720
10 | Chr10	23207287
11 | Chr11	29021106
12 | Chr12	27531856
13 | ChrSy	592136
14 | ChrUn	633585
15 | ChrC	134525
16 | ChrM	490520
17 | 


--------------------------------------------------------------------------------
/test/batch/data/README.md:
--------------------------------------------------------------------------------
1 | For genome-wide analysis, omics data can be downloaded from several databases:  
2 | 
3 | (1) [PlantDHS](http://plantdhs.org/): DNase-seq data  
4 | (2) [PlantRegMap](http://plantregmap.gao-lab.org/): TF motifs, sequence conservation (CNSs)  
5 | (3) [MBKbase](http://www.mbkbase.org/rice): Genomic variation and phenotypes  
6 | 
7 | Then, data files are put in the data folder.


--------------------------------------------------------------------------------
/test/batch/config.ini:
--------------------------------------------------------------------------------
 1 | [General]
 2 | workdir = results
 3 | binsize = 10
 4 | step = 10
 5 | upstream = 2000
 6 | slop = 200
 7 | withutr = 0
 8 | threads = 64
 9 | 
10 | [Features]
11 | ocfiles = data/rice_leaf_DHS_test.bw,data/rice_callus_DHS_test.bw
12 | ocpeaks = data/rice_leaf_DHpeaks_test.bed,data/rice_callus_DHpeaks_test.bed
13 | ptmfiles = data/rice_H3K27ac_test.bw
14 | motifs = data/genome_wide_motifs_JASPAR_test.bed
15 | cnss = data/genome_wide_PhastCons_test.bedGraph
16 | genopheno = 
17 | phenodata = 
18 | 
19 | [Genes]
20 | gene_file = 
21 | gff_file = data/annotation.gff3
22 | chrom_sizes = data/rice.chrom.sizes
23 | 


--------------------------------------------------------------------------------
/test/single/config.ini:
--------------------------------------------------------------------------------
 1 | [General]
 2 | workdir = results
 3 | binsize = 10
 4 | step = 10
 5 | upstream = 2000
 6 | slop = 200
 7 | withutr = 0
 8 | threads = 8
 9 | 
10 | [Features]
11 | ocfiles = data/rice_leaf_DHS_test.bw,data/rice_callus_DHS_test.bw
12 | ocpeaks = data/rice_leaf_DHpeaks_test.bed,data/rice_callus_DHpeaks_test.bed
13 | ptmfiles = data/rice_H3K27ac_test.bw
14 | motifs = data/genes_motifs_JASPAR_test.bed
15 | cnss = data/genes_PhastCons_test.bedGraph
16 | genopheno = data/genopheno_test.bed
17 | phenodata = data/editing_results_test.bed
18 | 
19 | [Genes]
20 | gene_file = data/gene_test.bed
21 | gff_file = 
22 | chrom_sizes = data/rice.chrom.sizes
23 | 


--------------------------------------------------------------------------------
/config.ini:
--------------------------------------------------------------------------------
 1 | [General]
 2 | # Work directory ( also known as output directory )
 3 | workdir = results
 4 | # Bin size and sliding step ( changing these is not recommended )
 5 | binsize = 10
 6 | step = 10
 7 | # Promoter length defined as sequence upstream of the TSS
 8 | upstream = 2000
 9 | # Extended length for generating raw scores of each features ( Useful for genome browser visualization )
10 | slop = 200
11 | # Whether to include the 5'-UTR in the analysis ( 0: promoter only; 1: promoter + 5'-UTR )
12 | withutr = 0
13 | # Threads for batch mode (simultaneously process n genes)
14 | threads = 8
15 | 
16 | [Features]
17 | # Features with 1-bp resolution are recommended.
18 | # If feature files are unavailable, just leave a blank.
19 | # Multiple files are separated by comma.
20 | 
21 | # Open chromatin BigWig files ( from ATAC-seq/DNase-seq/MNase-seq/etc. )
22 | ocfiles = ATAC_profile.bw
23 | # Open chromatin peaks ( from MACS2/Genrich/Popera/etc. )
24 | ocpeaks = ATAC_peaks.bed
25 | # Histone modification BigWig files ( H3K27ac from ChIP-seq )
26 | ptmfiles = H3K27ac.bw
27 | # TF binding motifs ( from PlantTFBS/JASPAR motifs called by FIMO )
28 | motifs = genome_wide_motifs_JASPAR.bed
29 | # Conserved non-coding sequences ( from PhastCons/mVISTA scores ) 
30 | cnss = PhastCons.bedGraph
31 | # Genotype and phenotype files directory ( from MBKbase/etc. )
32 | genopheno = 
33 | # Phenotypes for evaluation ( Phenodata measured after gene-editing )
34 | phenodata = 
35 | 
36 | [Genes]
37 | # Gene for single mode (BED format: chr start end genename . strand)
38 | gene_file = gene.bed
39 | # GFF/GFF3 file for batch mode ( Use batch mode if gff_file is defined )
40 | gff_file = annotation.gff3
41 | # Chromosome length ( in case out of range )
42 | chrom_sizes = genome.chrom.sizes
43 | 


--------------------------------------------------------------------------------
/lib/cores.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | from pybedtools import BedTool
 4 | 
 5 | 
 6 | def get_scores(geneinfo, scorefile, regionfile):
 7 |     score_bed = BedTool(scorefile)
 8 |     region_bed = BedTool(regionfile)
 9 |     scores = {}
10 |     for interval in score_bed.intersect(region_bed, wo=True):
11 |         info = str(interval).rstrip().split("\t")
12 |         chrom = info[0]
13 |         score = float(info[3])
14 |         region_start = info[5]
15 |         region_end = info[6]
16 |         name = "_".join([chrom, region_start, region_end])
17 |         if name not in scores:
18 |             scores[name] = []
19 |         scores[name].append(score)
20 |     return scores
21 | 
22 | 
def get_cores(geneinfo, scores):
    """Select the high-scoring bins of every region and merge them.

    For each region the cutoff is the mean of its scores when at least five
    scores are available, otherwise 0.  The i-th score of a region is treated
    as the bin starting at ``region_start + binsize * i``; every bin whose
    score is >= cutoff is kept, and the kept bins are merged with
    ``BedTool.merge``.

    Args:
        geneinfo: Object exposing a numeric ``binsize`` attribute.
        scores: dict mapping ``"<chrom>_<start>_<end>"`` to a list of scores
            (the format produced by ``get_scores``).

    Returns:
        The merged ``BedTool`` of selected bins.
    """
    binsize = geneinfo.binsize
    lines = []
    for region, values in scores.items():
        cutoff = np.average(values) if len(values) >= 5 else 0
        # Split from the right: chromosome names may themselves contain "_"
        # (e.g. "scaffold_12"), only the last two fields are coordinates.
        chrom, rstart, _rend = region.rsplit("_", 2)
        for i, score in enumerate(values):
            if score >= cutoff:
                start = int(rstart) + int(binsize * i)
                end = start + binsize
                lines.append("\t".join([chrom, str(start), str(end)]) + "\n")
    return BedTool("".join(lines), from_string=True).merge()
40 | 
41 | 
def output_cores(geneinfo, scorefile, regionfile, minlen = 2, outfile = ""):
    """Compute core regions and write them to a three-column BED file.

    Args:
        geneinfo: Object exposing a numeric ``binsize`` attribute.
        scorefile: Score file passed to ``get_scores``.
        regionfile: Region file passed to ``get_scores``.
        minlen: Minimum core length expressed in bins; merged intervals
            shorter than ``binsize * minlen`` are dropped.
        outfile: Output path.  When empty it is derived from ``regionfile``
            by replacing ``"key_regions_merged"`` with ``"core_regions"``.
            NOTE(review): if ``regionfile`` does not contain that substring
            the derived path equals ``regionfile`` and the input is
            overwritten -- pass ``outfile`` explicitly in that case.

    Returns:
        list of ``[chrom, start, end]`` for every core region written.
    """
    scores = get_scores(geneinfo, scorefile, regionfile)
    cores = get_cores(geneinfo, scores)
    min_span = geneinfo.binsize * minlen
    if not outfile:
        outfile = regionfile.replace("key_regions_merged", "core_regions")
    core_regions = []
    # The context manager guarantees the handle is closed even if parsing an
    # interval raises part-way through.
    with open(outfile, "w") as outf:
        for interval in cores:
            info = str(interval).rstrip().split("\t")
            chrom = info[0]
            start = int(info[1])
            end = int(info[2])
            if end - start >= min_span:
                core_regions.append([chrom, start, end])
                print(chrom, start, end, sep="\t", file=outf)
    return core_regions
60 | 
61 | 


--------------------------------------------------------------------------------
/test/single/data/genopheno_test.bed:
--------------------------------------------------------------------------------
 1 | Chr1	4004909	4004910	25.08108508360879	3.578586951318265e-06	5.446288425978628
 2 | Chr1	4004918	4004919	11.643805566694892	0.0006441668288457089	3.1910016427888794
 3 | Chr1	4004944	4004945	13.03230509818196	0.0014793498931556398	2.8299290953100207
 4 | Chr1	4005034	4005035	39.84383952471281	2.2285386897378036e-09	8.65197982171073
 5 | Chr1	4005064	4005065	39.78991300947906	2.2894447758468627e-09	8.640269827713585
 6 | Chr1	4005209	4005210	29.288540872193153	6.23633981593134e-08	7.205070228287081
 7 | Chr1	4005297	4005298	119.18579893750855	1.3156264619263447e-26	25.88086739989521
 8 | Chr1	4005403	4005404	11.643805566694892	0.0006441668288457089	3.1910016427888794
 9 | Chr1	4005530	4005531	110.42334562249201	7.914747797493608e-26	25.101562919323662
10 | Chr1	4005658	4005659	1.0339549084633033	0.30923158012311003	0.5097161603783631
11 | Chr1	4005677	4005678	39.26142674474035	2.981875498831951e-09	8.525510493444743
12 | Chr1	4005679	4005680	10.544113986204366	0.001165586841256689	2.9334553641223033
13 | Chr1	4005691	4005692	15.548917696064617	8.039766076595917e-05	4.094756587211936
14 | Chr1	4005703	4005704	26.935587397725662	1.4158313132317057e-06	5.848988486827513
15 | Chr1	4005717	4005718	2.0448089072330937	0.15272616080064952	0.8160865653126603
16 | Chr1	4005796	4005797	55.075947097667324	1.0975139395761731e-12	11.959589955056167
17 | Chr1	4005839	4005840	14.03972184543109	0.0008939498123160222	3.0486868624636307
18 | Chr1	4005869	4005870	0.06792994717983242	0.7943749668296061	0.0999744502365751
19 | Chr1	4005872	4005873	29.350441454008816	4.232847455651902e-07	6.373367382450242
20 | Chr1	4005888	4005889	2.708257829770254	0.09982996124023241	1.000739097505115
21 | Chr1	4005917	4005918	177.4827977726221	2.8846966307842555e-39	38.539899852700266
22 | Chr1	4005956	4005957	11.643805566694892	0.0006441668288457089	3.1910016427888794
23 | Chr1	4005978	4005979	12.852202910120727	0.0016187493160071459	2.7908204020831735
24 | Chr1	4006010	4006011	11.643805566694892	0.0006441668288457089	3.1910016427888794
25 | Chr1	4006040	4006041	110.69254779348225	9.19218280830204e-25	24.036581347260658
26 | Chr1	4006055	4006056	60.973148779748776	5.752398579227342e-14	13.240151029655443
27 | Chr1	4006137	4006138	15.137103054769431	0.0005164399562436108	3.2869801643436105
28 | Chr1	4006467	4006468	16.791333717316366	0.00022584382019740588	3.6461917886132573
29 | Chr1	4006638	4006639	177.2643953648572	3.2175523937172934e-39	38.492474372436924
30 | Chr1	4006648	4006649	1.1183907309091496	0.2902652352336645	0.5372049760535923
31 | Chr1	4006756	4006757	164.24334090511462	1.3384517804049304e-37	36.8733972702606
32 | Chr1	4006758	4006759	110.42334562249201	7.914747797493608e-26	25.101562919323662
33 | Chr1	4006787	4006788	29.288540872193153	6.23633981593134e-08	7.205070228287081
34 | Chr1	4006817	4006818	134.09712513211326	5.2027049613596005e-31	30.28377080168162
35 | Chr1	4006835	4006836	11.643805566694892	0.0006441668288457089	3.1910016427888794
36 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # CAPE
  2 | 
  3 | The computational pipeline of CAPE (<ins>C</ins>RISPR-Cas12<ins>a</ins> <ins>p</ins>romoter <ins>e</ins>diting)
  4 | 
  5 | 
  6 | ## Prerequisites
  7 | 
  8 | 1. Python >= 3.5
  9 | 2. Open chromatin data (profiles in BigWig format, peaks in BED format)
 10 | 3. TF binding motifs (identified by FIMO, matrix files are from PlantTFDB or JASPAR)
 11 | 4. Sequence conservation (Scores are from PhastCons/mVISTA, or manually calculate with PHAST package)
 12 | 5. Genome annotation file (in GFF3 format) and chromosome sizes file
 13 | 6. (Optional) H3K27ac histone modification profile (BigWig format), genomic variations and phenotypes from rice3K/RFGB/MBKBase/etc.
 14 | 
 15 | ## Install
 16 | 
 17 | ```bash
 18 | # Install CAPE dependencies
 19 | git clone https://github.com/zhangtaolab/CAPE.git
 20 | cd CAPE
 21 | pip install -r requirements.txt
 22 | 
 23 | # Run test for single gene
 24 | cd test/single
 25 | python ../../single.py config.ini
 26 | ```
 27 | 
 28 | ### Run the pipeline for single gene
 29 | 
 30 | ```bash
 31 | # Modify the config.ini file
 32 | [General]
 33 | workdir = results
 34 | binsize = 10
 35 | step = 10
 36 | upstream = 2000
 37 | slop = 200
 38 | withutr = 0
 39 | threads = 16
 40 | 
 41 | [Features]
 42 | ocfiles = Rice_leaf_DNase.bw,Rice_callus_DNase.bw
 43 | ocpeaks = TIGR7_DHSs.bed
 44 | ptmfiles = rice_H3K27ac.bw
 45 | motifs = genome_wide_motifs_JASPAR.bed
 46 | cnss = Osj_PhastCons.bedGraph
 47 | genopheno = 
 48 | phenodata = 
 49 | 
 50 | [Genes]
 51 | gene_file = gene.bed
 52 | gff_file = 
 53 | chrom_sizes = osativa_7.chrom.sizes
 54 | ```
 55 | 
 56 | ```bash
 57 | # Run the pipeline
 58 | python single.py config.ini
 59 | ```
 60 | 
 61 | ### Run the pipeline for whole genome genes
 62 | 
 63 | ```bash
 64 | # Modify the config.ini file
 65 | [General]
 66 | workdir = results
 67 | binsize = 10
 68 | step = 10
 69 | upstream = 2000
 70 | slop = 200
 71 | withutr = 0
 72 | threads = 16
 73 | 
 74 | [Features]
 75 | ocfiles = Rice_leaf_DNase.bw,Rice_callus_DNase.bw
 76 | ocpeaks = TIGR7_DHSs.bed
 77 | ptmfiles = rice_H3K27ac.bw
 78 | motifs = genome_wide_motifs_JASPAR.bed
 79 | cnss = Osj_PhastCons.bedGraph
 80 | genopheno = 
 81 | phenodata = 
 82 | 
 83 | [Genes]
 84 | gene_file = 
 85 | gff_file = TIGR7_all.gff3
 86 | chrom_sizes = osativa_7.chrom.sizes
 87 | ```
 88 | 
 89 | ```bash
 90 | # Run the pipeline
 91 | python batch.py config.ini
 92 | ```
 93 | 
 94 | ## Input (Feature data processing)
 95 | 
 96 | The instruction of how to generate feature data for calculation:  
 97 | 1. Open chromatin data:  
 98 | (1) Raw sequencing data (from DNase-seq/ATAC-seq/MNase-seq) first align to reference genome by BWA/Bowtie2;  
 99 | (2) Call peaks from the alignment using Macs2/Genrich/F-seq2/Popera;  
100 | (3) Generate profiles from the alignment (BigWig format, using DeepTools/F-seq2/Popera).  
101 | 2. TF binding motifs:  
102 | (1) Download the TF PFM data from database (PlantTFDB/JASPAR/CisBP);  
103 | (2) Find the occurrences of TF motifs in the genome by FIMO;  
104 | (3) Merge results of all TF motifs (BED format, TFs from the same family can be merged into one).  
105 | 3. Sequence conservation:  
106 | (1) Pre-calculated sequence conservation of plant genomes can be retrieved from PlantRegMap database;  
107 | (2) If no pre-computed result exists for the target genome, calculate conservation scores from multiple closely related genomes with PHAST/mVISTA.  
108 | 4. H3K27ac histone modification:  
109 | (1) Raw sequencing data (from ChIP-seq) first align to reference genome by BWA/Bowtie2;  
110 | (2) Generate profiles from the alignment (BigWig format, using DeepTools).  
111 | 5. Relationships between genomic variations and phenotypes (GenoPheno):  
112 | (1) Get the genotype data from public database, in FASTA format (for rice, using rice3K/RFGB/MBKBase/etc);  
113 | (2) Get the corresponding phenotype data from public database.   
114 |     (two column tab format, first column is Genotype_ID, second is Phenotype_Values separated by comma)  
115 | 6. Genome annotation file (BED/GFF3 format) is required for getting the promoter of target gene.  
116 | 7. Chromosome sizes file is required for converting input file format.  
117 |    (two column tab format, first column is chromosome name, second is chromosome length)  
118 | 
119 | \* Note that H3K27ac and GenoPheno data are optional for analysis.
120 | 
121 | ## Output
122 | 
123 | All output files are stored in the workdir defined in the config.ini file.  
124 | A folder will be created for each gene analyzed.  
125 | In the output gene folder, several files are generated:  
126 | 1. analysis_region.bed (File records the analyzed regions in the genome for this gene)
127 | 2. OCpeaks_*_raw.bed (Open chromatin regions overlap with the analysis region)
128 | 3. OCscores*.bedGraph (Open chromatin scores for the analysis region, suffix 'raw' means raw scores from BigWig file, others are normalized in range 0 to 1)
129 | 4. motifs*.bedGraph (Raw file contains motifs identified in the analysis region, another file is the normalized motifs scores)
130 | 5. CNS*.bedGraph (Raw file contains raw conserved score in the analysis region, another file is the normalized CNS scores)
131 | 6. PTM*.bedGraph (H3K27ac profile for the analysis region, scores from BigWig file, others are normalized in range 0 to 1)
132 | 7. aggregate.bedGraph (The aggregate scores (AS) calculated from all above features)
133 | 8. key_regions_*.bed (Merged file means merged key regions when two key regions are adjacent)
134 | 9. core_regions.bed (Core regions which have high AS within the key regions)  
135 | ( **Optional:** if CRISPR edited phenotype data are provided, also export the statistical analysis results. )
136 | 10. phenoscores_*.bedGraph (phenotype scores, measured by kmeans-like method)
137 | 11. scores_by_sample.txt (Features scores and aggregate scores for each CRISPR edited sample)
138 | 12. plot_scores.txt (Comparison between phenotype difference and estimated scores)
139 | 13. statistics.txt (Cutoff for defining key regions and significance analysis)
140 | 
141 | 


--------------------------------------------------------------------------------
/lib/genopheno.py:
--------------------------------------------------------------------------------
  1 | from Bio import pairwise2
  2 | import re
  3 | from tqdm import tqdm
  4 | from itertools import chain
  5 | import numpy as np
  6 | import pandas as pd
  7 | from scipy import stats
  8 | 
  9 | 
 10 | def load_fasta(seqfile):
 11 |     seqinfo = {}
 12 |     with open(seqfile, "r") as infile:
 13 |         for line in infile:
 14 |             if line.startswith(">"):
 15 |                 info = line.split("|")
 16 |                 name = info[0][1:]
 17 |                 if name == "REF":
 18 |                     sample_num = 0
 19 |                 else:
 20 |                     sample_num = int(info[1].split(":")[1])
 21 |             else:
 22 |                 seq = line.rstrip()
 23 |                 if name != "REF":
 24 |                     if seq == seqinfo["REF"]["seq"]:
 25 |                         refid = name
 26 |                 seqinfo[name] = {}
 27 |                 seqinfo[name]["seq"] = seq
 28 |                 seqinfo[name]["num"] = sample_num
 29 |     infile.close()
 30 |     return seqinfo, refid
 31 | 
 32 | 
def parse_alignment(alignment):
    """Extract the differences between two aligned sequences.

    ``alignment[0]`` is read as the gapped reference string and
    ``alignment[1]`` as the gapped alternative string.  Three passes fill the
    result: gap runs in the reference, gap runs in the alternative, and
    finally positions where both strings hold different non-gap characters.

    Returns:
        dict mapping an integer position (alignment index minus the offset
        stored in ``inspos``) to ``{"ref": str, "alt": str}``.  A later pass
        overwrites an earlier entry that lands on the same key.

    NOTE(review): ``inspos`` only has keys ``0 .. len(ungapped ref) - 1`` but
    is indexed below with alignment coordinates (``start`` / ``i``), which can
    be larger when the reference contains gaps -- this looks like it can raise
    KeyError; confirm intended coordinate system before changing.
    """
    aligninfo = {}
    refseq = alignment[0]
    altseq = alignment[1]
    # One or more consecutive gap characters.
    indels = re.compile(r'-+')
    # inspos[k]: total length of reference gap runs that start before index k.
    inspos = {}
    for i in range(len(refseq.replace("-", ""))):
        inspos[i] = 0
    # Pass 1: gap runs in the reference.
    for m in indels.finditer(refseq):
        start = m.span()[0]
        end = m.span()[1]
        # Shift every later index by the length of this gap run.
        for j in range(start+1, len(inspos)):
            inspos[j] += end - start
        aligninfo[start-inspos[start]] = {}
        # NOTE(review): the aligned strings are sliced with the shifted key
        # rather than with ``start`` -- verify this is intentional.
        aligninfo[start-inspos[start]]["ref"] = refseq[start-inspos[start]]
        aligninfo[start-inspos[start]]["alt"] = altseq[start-inspos[start]:end]
    # Pass 2: gap runs in the alternative sequence.
    for m in indels.finditer(altseq):
        start = m.span()[0]
        end = m.span()[1]
        aligninfo[start-inspos[start]] = {}
        aligninfo[start-inspos[start]]["ref"] = refseq[start-inspos[start]:end]
        aligninfo[start-inspos[start]]["alt"] = altseq[start-inspos[start]]
    # Pass 3: single-character mismatches where neither side is a gap.
    for i in range(len(refseq)):
        refbase = refseq[i]
        altbase = altseq[i]
        if refbase != altbase:
            if refbase != "-" and altbase != "-":
                aligninfo[i-inspos[i]] = {}
                aligninfo[i-inspos[i]]["ref"] = refbase
                aligninfo[i-inspos[i]]["alt"] = altbase
    return aligninfo
 64 | 
 65 | 
 66 | def pairwise_alignment(seqfile):
 67 |     seqinfo, refid = load_fasta(seqfile)
 68 |     mutinfo = {}
 69 |     refseq = seqinfo["REF"]["seq"]
 70 |     total_num = sum([seqinfo[x]["num"] for x in seqinfo])
 71 |     count = 0
 72 |     for sample in tqdm(seqinfo, desc="Finding mutations"):
 73 |         if sample == "REF":
 74 |             continue
 75 | #         if count >= 5:
 76 | #             break
 77 |         altseq = seqinfo[sample]["seq"]
 78 |         num = seqinfo[sample]["num"]
 79 |         ratio = round(num / total_num, 4)
 80 |         alignments = pairwise2.align.globalms(refseq, altseq, 2, -1, -1.5, -.5)
 81 |         # print(sample, ratio, alignments[0], sep="\n")
 82 |         mutinfo[sample] = {}
 83 |         mutinfo[sample]["ratio"] = ratio
 84 |         mutinfo[sample]["alignment"] = parse_alignment(alignments[0])
 85 |         count += 1
 86 |     return mutinfo, refid
 87 | 
 88 | 
 89 | def mut2pos(seqfile):
 90 |     mutinfo, refid = pairwise_alignment(seqfile)
 91 |     vcfinfo = {}
 92 |     for sample in mutinfo:
 93 |         for pos in mutinfo[sample]["alignment"]:
 94 |             refbase = mutinfo[sample]["alignment"][pos]["ref"]
 95 |             altbase = mutinfo[sample]["alignment"][pos]["alt"]
 96 |             if altbase in ["a", "c", "g", "t", "n"]:
 97 |                 altbase = altbase.upper()
 98 |                 homozygous = 1
 99 |             else:
100 |                 homozygous = 0
101 |             if pos not in vcfinfo:
102 |                 vcfinfo[pos] = {}
103 |                 vcfinfo[pos]["ref"] = refbase
104 |                 vcfinfo[pos]["alt"] = {}
105 |             if altbase not in vcfinfo[pos]["alt"]:
106 |                 vcfinfo[pos]["alt"][altbase] = {}
107 |             ratio = mutinfo[sample]["ratio"]
108 |             vcfinfo[pos]["alt"][altbase][sample] = [ratio, homozygous]
109 |     return vcfinfo, refid
110 | 
111 | 
def load_phenodata(phenodata):
    """Load per-genotype phenotype measurements from a tab-separated file.

    Each line is ``Genotype_ID<TAB>v1,v2,...``.  Values may be separated by
    ``","`` with or without surrounding spaces; empty tokens (e.g. from a
    trailing comma) are ignored.  Lines starting with ``Genotype_ID`` are
    treated as the header, and lines without a second column are skipped.

    Args:
        phenodata: Path to the phenotype table.

    Returns:
        dict mapping genotype ID to its list of float values.  Genotypes
        with fewer than two values are omitted.

    Raises:
        ValueError: If a value token cannot be parsed as a float.
    """
    gid_info = {}
    with open(phenodata, "r") as infile:
        for line in infile:
            if line.startswith("Genotype_ID"):
                continue
            info = line.rstrip().split("\t")
            if len(info) < 2:
                continue
            # Split on the bare comma: float() already ignores the optional
            # space that follows it, so both "1,2" and "1, 2" are accepted.
            values = [float(tok) for tok in info[1].split(",") if tok.strip()]
            if len(values) > 1:
                gid_info[info[0]] = values
    return gid_info
126 | 
127 | 
def link_genopheno(genoinfo, seqfile, phenodata):
    """Write a long-format table linking each variant to phenotype values.

    Variants come from ``mut2pos(seqfile)`` and phenotype values from
    ``load_phenodata(phenodata)``.  For every position (ascending) and every
    alternative allele, one row per phenotype value of the carrying samples
    is written with columns ``name, pos, ref, alt, value, avg, sd``.  When a
    position produced at least one such row, the phenotype values of the
    reference-identical sample (``refid``) are appended as ``ref/ref`` rows,
    provided that sample has phenotype data.

    Args:
        genoinfo: Object exposing ``start``; it is added to every relative
            position.
        seqfile: FASTA path.  The output path replaces ``".fasta"`` with
            ``"_geno_pheno.txt"``; if the path has no ``".fasta"`` the suffix
            is appended instead so the input file is never overwritten.
        phenodata: Phenotype table path.

    Returns:
        Path of the written table.
    """
    posinfo, refid = mut2pos(seqfile)
    phenoinfo = load_phenodata(phenodata)
    startpos = genoinfo.start
    if ".fasta" in seqfile:
        outfile = seqfile.replace(".fasta", "_geno_pheno.txt")
    else:
        # str.replace would return seqfile unchanged and the input would be
        # truncated by open(..., "w") below.
        outfile = seqfile + "_geno_pheno.txt"

    # The reference sample may be absent from the phenotype table (or have
    # been dropped for having fewer than two values); skip its rows then.
    ref_values = phenoinfo.get(refid)
    if ref_values:
        ref_avg = round(np.average(ref_values), 4)
        ref_sd = round(np.std(ref_values), 4)

    with open(outfile, "w") as outf:
        print("name", "pos", "ref", "alt", "value", "avg", "sd", sep="\t", file=outf)
        for pos in sorted(posinfo):
            pos_abs = pos + startpos
            ref = posinfo[pos]["ref"]
            wrote_alt = False
            for alt, carriers in posinfo[pos]["alt"].items():
                values = list(chain.from_iterable(
                    phenoinfo[sample] for sample in carriers if sample in phenoinfo
                ))
                if not values:
                    continue
                wrote_alt = True
                name = str(pos_abs) + "_" + ref + "/" + alt
                avg_value = round(np.average(values), 4)
                sd = round(np.std(values), 4)
                for value in values:
                    print(name, pos_abs, ref, alt, value, avg_value, sd, sep="\t", file=outf)
            if wrote_alt and ref_values:
                ref_name = str(pos_abs) + "_" + ref + "/" + ref
                for value in ref_values:
                    print(ref_name, pos_abs, ref, ref, value, ref_avg, ref_sd, sep="\t", file=outf)
    return outfile
158 | 
159 | 
def output_genopheno(genoinfo, seqfile, phenodata, outfile = "", startpos = 0):
    """Run a Kruskal-Wallis test per variant position and write a BED file.

    The long-format table produced by ``link_genopheno`` is read back with
    pandas.  For every position (in order of first appearance) the phenotype
    values are grouped by ``alt`` allele; when there are at least two groups
    ``scipy.stats.kruskal`` is applied and one line
    ``chrom, start, end, statistic, pvalue, -log10(pvalue)`` is written.

    Args:
        genoinfo: Object exposing ``chrom`` (and whatever ``link_genopheno``
            needs).
        seqfile: FASTA path forwarded to ``link_genopheno``.
        phenodata: Phenotype table path forwarded to ``link_genopheno``.
        outfile: Output path; defaults to the table path with ``".txt"``
            replaced by ``".bed"``.
        startpos: Offset added to every position read from the table
            (``link_genopheno`` has already added ``genoinfo.start``).

    Returns:
        Path of the written BED file.
    """
    infile = link_genopheno(genoinfo, seqfile, phenodata)
    geno_pheno = pd.read_table(infile)
    chrom = genoinfo.chrom
    if not outfile:
        outfile = infile.replace(".txt", ".bed")
    # Context manager: the handle is released even if kruskal() raises.
    with open(outfile, "w") as outf:
        for pos in pd.unique(geno_pheno["pos"]):
            # Filter the rows of this position once instead of per allele.
            at_pos = geno_pheno[geno_pheno["pos"] == pos]
            ref = pd.unique(at_pos["ref"])
            value_lst = [
                at_pos[at_pos["alt"] == alt]["value"].tolist()
                for alt in pd.unique(at_pos["alt"])
            ]
            if len(value_lst) > 1:
                kruskal = stats.kruskal(*value_lst)
                statistic = kruskal[0]
                pvalue1 = kruskal[1]
                pvalue2 = -np.log10(pvalue1)
                real_pos = pos + startpos
                # NOTE(review): len(ref) is the number of distinct ref
                # alleles at this position (normally 1), not the allele's
                # string length -- confirm that a 1-bp interval is intended.
                print(chrom, real_pos, real_pos+len(ref), statistic, pvalue1, pvalue2, sep="\t", file=outf)
    return outfile
182 | 
183 | 


--------------------------------------------------------------------------------
/test/single/data/editing_results_test.bed:
--------------------------------------------------------------------------------
  1 | Chr1	4004888	4007388	WT	98.0	1.5811388300841898
  2 | Chr1	4006709	4006711	pZJP078-01-1-1-3	87.8	1.6911534525287764
  3 | Chr1	4006505	4006509	pZJP078-01-1-1-3	87.8	1.6911534525287764
  4 | Chr1	4006092	4006097	pZJP078-01-1-1-3	87.8	1.6911534525287764
  5 | Chr1	4005476	4005478	pZJP078-01-1-1-3	87.8	1.6911534525287764
  6 | Chr1	4006503	4006505	pZJP078-02-1-1-3	89.6	1.5937377450509227
  7 | Chr1	4006091	4006100	pZJP078-02-1-1-3	89.6	1.5937377450509227
  8 | Chr1	4005475	4005484	pZJP078-02-1-1-3	89.6	1.5937377450509227
  9 | Chr1	4006490	4006521	pZJP078-02-1-2-2	83.3	1.3266499161421599
 10 | Chr1	4006469	4006486	pZJP078-02-1-2-2	83.3	1.3266499161421599
 11 | Chr1	4006092	4006099	pZJP078-02-1-2-2	83.3	1.3266499161421599
 12 | Chr1	4005474	4005482	pZJP078-02-1-2-2	83.3	1.3266499161421599
 13 | Chr1	4005244	4005279	pZJP078-02-1-2-2	83.3	1.3266499161421599
 14 | Chr1	4006701	4006722	pZJP078-04-2-1-1	80.1	1.5620499351813308
 15 | Chr1	4006505	4006508	pZJP078-04-2-1-1	80.1	1.5620499351813308
 16 | Chr1	4006091	4006099	pZJP078-04-2-1-1	80.1	1.5620499351813308
 17 | Chr1	4005411	4005482	pZJP078-04-2-1-1	80.1	1.5620499351813308
 18 | Chr1	4005242	4005254	pZJP078-04-2-1-1	80.1	1.5620499351813308
 19 | Chr1	4006698	4006715	pZJP078-05-1-1-1	95.5	1.0
 20 | Chr1	4006508	4006514	pZJP078-05-1-1-1	95.5	1.0
 21 | Chr1	4006092	4006098	pZJP078-05-1-1-1	95.5	1.0
 22 | Chr1	4005475	4005490	pZJP078-05-1-1-1	95.5	1.0
 23 | Chr1	4006487	4006514	pZJP078-05-2-1-2	85.2	1.5033296378372907
 24 | Chr1	4006091	4006099	pZJP078-05-2-1-2	85.2	1.5033296378372907
 25 | Chr1	4005474	4005481	pZJP078-05-2-1-2	85.2	1.5033296378372907
 26 | Chr1	4005246	4005256	pZJP078-05-2-1-2	85.2	1.5033296378372907
 27 | Chr1	4006704	4006725	pZJP078-07-2-1-1	90.0	1.224744871391589
 28 | Chr1	4006507	4006511	pZJP078-07-2-1-1	90.0	1.224744871391589
 29 | Chr1	4006088	4006097	pZJP078-07-2-1-1	90.0	1.224744871391589
 30 | Chr1	4005465	4005482	pZJP078-07-2-1-1	90.0	1.224744871391589
 31 | Chr1	4005236	4005256	pZJP078-07-2-1-1	90.0	1.224744871391589
 32 | Chr1	4006704	4006725	pZJP078-07-1-2-3	87.1	0.66332495807108
 33 | Chr1	4006507	4006511	pZJP078-07-1-2-3	87.1	0.66332495807108
 34 | Chr1	4006088	4006097	pZJP078-07-1-2-3	87.1	0.66332495807108
 35 | Chr1	4005465	4005482	pZJP078-07-1-2-3	87.1	0.66332495807108
 36 | Chr1	4005236	4005256	pZJP078-07-1-2-3	87.1	0.66332495807108
 37 | Chr1	4006506	4006512	pZJP078-08-1-1-1	81.7	1.0770329614269007
 38 | Chr1	4005744	4006469	pZJP078-08-1-1-1	81.7	1.0770329614269007
 39 | Chr1	4005474	4005482	pZJP078-08-1-1-1	81.7	1.0770329614269007
 40 | Chr1	4005245	4005253	pZJP078-08-1-1-1	81.7	1.0770329614269007
 41 | Chr1	4006506	4006512	pZJP078-08-2-1-1	83.5	1.0
 42 | Chr1	4005744	4006469	pZJP078-08-2-1-1	83.5	1.0
 43 | Chr1	4005474	4005482	pZJP078-08-2-1-1	83.5	1.0
 44 | Chr1	4005245	4005253	pZJP078-08-2-1-1	83.5	1.0
 45 | Chr1	4006506	4006512	pZJP078-08-2-2-2	82.7	2.6758176320519302
 46 | Chr1	4005744	4006469	pZJP078-08-2-2-2	82.7	2.6758176320519302
 47 | Chr1	4005474	4005482	pZJP078-08-2-2-2	82.7	2.6758176320519302
 48 | Chr1	4005245	4005253	pZJP078-08-2-2-2	82.7	2.6758176320519302
 49 | Chr1	4006706	4006714	pZJP078-08-1-2-1	73.1	1.019803902718557
 50 | Chr1	4006503	4006505	pZJP078-08-1-2-1	73.1	1.019803902718557
 51 | Chr1	4006092	4006100	pZJP078-08-1-2-1	73.1	1.019803902718557
 52 | Chr1	4005553	4005615	pZJP078-08-1-2-1	73.1	1.019803902718557
 53 | Chr1	4005263	4005552	pZJP078-08-1-2-1	73.1	1.019803902718557
 54 | Chr1	4005246	4005262	pZJP078-08-1-2-1	73.1	1.019803902718557
 55 | Chr1	4006706	4006714	pZJP078-08-3-1-3	74.9	1.2
 56 | Chr1	4006503	4006505	pZJP078-08-3-1-3	74.9	1.2
 57 | Chr1	4006092	4006100	pZJP078-08-3-1-3	74.9	1.2
 58 | Chr1	4005553	4005615	pZJP078-08-3-1-3	74.9	1.2
 59 | Chr1	4005263	4005552	pZJP078-08-3-1-3	74.9	1.2
 60 | Chr1	4005246	4005262	pZJP078-08-3-1-3	74.9	1.2
 61 | Chr1	4006503	4006505	pZJP078-09-1-2-1	87.2	0.9273618495495702
 62 | Chr1	4006090	4006100	pZJP078-09-1-2-1	87.2	0.9273618495495702
 63 | Chr1	4005474	4005480	pZJP078-09-1-2-1	87.2	0.9273618495495702
 64 | Chr1	4005240	4005350	pZJP078-09-1-2-1	87.2	0.9273618495495702
 65 | Chr1	4006709	4006715	pZJP078-10-1-1-2	95.5	1.0
 66 | Chr1	4006505	4006510	pZJP078-10-1-1-2	95.5	1.0
 67 | Chr1	4006091	4006100	pZJP078-10-1-1-2	95.5	1.0
 68 | Chr1	4005474	4005480	pZJP078-10-1-1-2	95.5	1.0
 69 | Chr1	4005251	4005253	pZJP078-10-1-1-2	95.5	1.0
 70 | Chr1	4006506	4006713	pZJP078-10-1-2-1	87.0	1.3038404810405297
 71 | Chr1	4006095	4006100	pZJP078-10-1-2-1	87.0	1.3038404810405297
 72 | Chr1	4005475	4005482	pZJP078-10-1-2-1	87.0	1.3038404810405297
 73 | Chr1	4005239	4005257	pZJP078-10-1-2-1	87.0	1.3038404810405297
 74 | Chr1	4006709	4006711	pZJP078-10-5-1-1	87.6	1.019803902718557
 75 | Chr1	4006504	4006514	pZJP078-10-5-1-1	87.6	1.019803902718557
 76 | Chr1	4006095	4006100	pZJP078-10-5-1-1	87.6	1.019803902718557
 77 | Chr1	4005473	4005481	pZJP078-10-5-1-1	87.6	1.019803902718557
 78 | Chr1	4005250	4005255	pZJP078-10-5-1-1	87.6	1.019803902718557
 79 | Chr1	4006504	4006717	pZJP078-12-2-1-1	84.9	0.9165151389911681
 80 | Chr1	4006092	4006097	pZJP078-12-2-1-1	84.9	0.9165151389911681
 81 | Chr1	4005474	4005484	pZJP078-12-2-1-1	84.9	0.9165151389911681
 82 | Chr1	4005246	4005257	pZJP078-12-2-1-1	84.9	0.9165151389911681
 83 | Chr1	4006504	4006717	pZJP078-12-2-1-2	83.5	1.0
 84 | Chr1	4006092	4006097	pZJP078-12-2-1-2	83.5	1.0
 85 | Chr1	4005474	4005484	pZJP078-12-2-1-2	83.5	1.0
 86 | Chr1	4005246	4005257	pZJP078-12-2-1-2	83.5	1.0
 87 | Chr1	4006504	4006715	pZJP078-12-3-1-1	80.1	1.42828568570857
 88 | Chr1	4005473	4006102	pZJP078-12-3-1-1	80.1	1.42828568570857
 89 | Chr1	4005250	4005253	pZJP078-12-3-1-1	80.1	1.42828568570857
 90 | Chr1	4006158	4006165	pZJP079-1-1-01-2	91.9	1.5620499351813308
 91 | Chr1	4005990	4005994	pZJP079-1-1-01-2	91.9	1.5620499351813308
 92 | Chr1	4005168	4005177	pZJP079-1-1-01-2	91.9	1.5620499351813308
 93 | Chr1	4006835	4006852	pZJP079-1-1-02-1	80.9	1.2806248474865698
 94 | Chr1	4006159	4006167	pZJP079-1-1-02-1	80.9	1.2806248474865698
 95 | Chr1	4005945	4006001	pZJP079-1-1-02-1	80.9	1.2806248474865698
 96 | Chr1	4005171	4005351	pZJP079-1-1-02-1	80.9	1.2806248474865698
 97 | Chr1	4006835	4006852	pZJP079-1-1-01-1	80.4	0.8602325267042626
 98 | Chr1	4006159	4006167	pZJP079-1-1-01-1	80.4	0.8602325267042626
 99 | Chr1	4005945	4006001	pZJP079-1-1-01-1	80.4	0.8602325267042626
100 | Chr1	4005171	4005351	pZJP079-1-1-01-1	80.4	0.8602325267042626
101 | Chr1	4006611	4006864	pZJP079-5-1-02-2	80.1	1.8
102 | Chr1	4006159	4006206	pZJP079-5-1-02-2	80.1	1.8
103 | Chr1	4005989	4006024	pZJP079-5-1-02-2	80.1	1.8
104 | Chr1	4005330	4005366	pZJP079-5-1-02-2	80.1	1.8
105 | Chr1	4005171	4005174	pZJP079-5-1-02-2	80.1	1.8
106 | Chr1	4005348	4006852	pZJP079-6-3-01-3	66.2	1.4696938456699067
107 | Chr1	4005171	4005174	pZJP079-6-3-01-3	66.2	1.4696938456699067
108 | Chr1	4006163	4006166	pZJP079-7-1-01-2	68.5	1.1832159566199232
109 | Chr1	4005168	4005997	pZJP079-7-1-01-2	68.5	1.1832159566199232
110 | Chr1	4006163	4006166	pZJP079-7-2-03-2	68.8	1.5033296378372907
111 | Chr1	4005168	4005997	pZJP079-7-2-03-2	68.8	1.5033296378372907
112 | Chr1	4005987	4006558	pZJP079-8-1-01-2	74.3	1.0770329614269007
113 | Chr1	4005287	4005378	pZJP079-8-1-01-2	74.3	1.0770329614269007
114 | Chr1	4005171	4005174	pZJP079-8-1-01-2	74.3	1.0770329614269007
115 | Chr1	4006833	4006852	pZJP079-7-2-23-1	69.6	2.0591260281974
116 | Chr1	4006156	4006166	pZJP079-7-2-23-1	69.6	2.0591260281974
117 | Chr1	4005171	4005993	pZJP079-7-2-23-1	69.6	2.0591260281974
118 | Chr1	4006163	4006166	pZJP079-3-3-04-1	96.3	1.2489995996796797
119 | Chr1	4006138	4006166	pZJP079-8-1-14-1	89.6	1.3564659966250536
120 | Chr1	4005980	4005996	pZJP079-8-1-14-1	89.6	1.3564659966250536
121 | Chr1	4005346	4005354	pZJP079-8-1-14-1	89.6	1.3564659966250536
122 | Chr1	4005171	4005174	pZJP079-8-1-14-1	89.6	1.3564659966250536
123 | 


--------------------------------------------------------------------------------
/lib/misc.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import numpy as np
  3 | import pandas as pd
  4 | from scipy import stats
  5 | import pyBigWig
  6 | from pybedtools import BedTool
  7 | 
  8 | 
  9 | def check_outdir(path):
 10 |     dirpath = os.path.abspath(os.path.dirname(path))
 11 |     if not os.path.exists(dirpath):
 12 |         print("Create directory:", dirpath)
 13 |         os.makedirs(dirpath)
 14 | 
 15 | 
 16 | def split_region(geneinfo):
 17 |     # Position info split by binsize and step
 18 |     step = geneinfo.step
 19 |     binstart = geneinfo.start
 20 |     binstop = geneinfo.end
 21 |     posinfo = {}
 22 |     for i, pos in enumerate(range(binstart, binstop, step)):
 23 |         posinfo[i] = pos
 24 | 
 25 |     return posinfo
 26 | 
 27 | 
 28 | def get_chrom_sizes(file):
 29 |     chrlens = {}
 30 |     with open(file) as infile:
 31 |         for line in infile:
 32 |             if line.startswith("#"):
 33 |                 continue
 34 |             info = line.rstrip().split("\t")
 35 |             if len(info) == 2:
 36 |                 chrom = info[0]
 37 |                 length = info[1]
 38 |             else:
 39 |                 chrom = info[0]
 40 |                 length = info[2]
 41 |             chrlens[chrom] = int(length)
 42 |     return chrlens
 43 | 
 44 | 
 45 | def bigwig2bedGraph(bwfile, geneinfo, chrlens, outfile, ext = 50):
 46 |     # Convert the bigwig file from Deeptools/Popera to bedGraph file in single-base-pair resolution
 47 |     # Suitable for DNase-seq/ATAC-seq/MNase-seq/ChIP-seq
 48 |     bwin = pyBigWig.open(bwfile)
 49 |     chrom = geneinfo.chrom
 50 |     start = geneinfo.start
 51 |     end = geneinfo.end
 52 |     chrom_len = chrlens[chrom]
 53 |     check_outdir(outfile)
 54 |     outf = open(outfile, "w")
 55 |     for i in range(max(1, start-ext), min(end+ext, chrom_len)):
 56 |         try:
 57 |             value = bwin.values(chrom, i, i+1)[0]
 58 |             if np.isnan(value):
 59 |                 value = 0
 60 |             print(chrom, i, i+1, value, sep="\t", file=outf)
 61 |         except:
 62 |             continue
 63 |     outf.close()
 64 | 
 65 | 
 66 | def fimo_filter(gfffile, matrixinfo, geneinfo, outfile, pcut = 1e-5, qcut = 1):
 67 |     # Filter FIMO results with p-value or q-value cutoff
 68 |     motif_family = {}
 69 |     # Matrix from JASPAR
 70 |     if matrixinfo.startswith("JASPAR"):
 71 |         with open(matrixinfo, "r") as infile:
 72 |             for line in infile:
 73 |                 if line.startswith("MOTIF"):
 74 |                     info = line.rstrip().split()
 75 |                     motif_id = info[1]
 76 |                     motif_name = info[2]
 77 |                     motif_family[motif_id] = motif_name
 78 |     # Matrix from PlantTFDB
 79 |     else:
 80 |         with open(matrixinfo, "r") as infile:
 81 |             for line in infile:
 82 |                 if line.startswith("#"):
 83 |                     continue
 84 |                 info = line.rstrip().split()
 85 |                 genename = info[0]
 86 |                 family = info[1]
 87 |                 motif_family[genename] = family
 88 |     # Get gene info
 89 |     chrom = geneinfo.chrom
 90 |     begin = geneinfo.start
 91 |     # Output filtered motifs
 92 |     motif_list = []
 93 |     with open(gfffile, "r") as infile:
 94 |         for line in infile:
 95 |             if line.startswith("#"):
 96 |                 continue
 97 |             info = line.rstrip().split("\t")
 98 |             start = int(info[3]) + begin
 99 |             end = int(info[4]) + begin
100 |             strand = info[6]
101 |             desc = info[8].split(";")
102 |             motif_id = desc[0].split("=")[1]
103 |             motif_name = motif_family[motif_id]
104 |             pvalue = float(desc[2].split("=")[1])
105 |             qvalue = float(desc[3].split("= ")[1])
106 |             if pvalue <= pcut and qvalue <= qcut:
107 |                 motif_list.append([chrom, start, end, motif_name, ".", strand, pvalue, qvalue])
108 |     outf = open(outfile, "w")
109 |     for lst in sorted(motif_list):
110 |         print("\t".join(list(map(str, lst))), file=outf)
111 |     outf.close()
112 | 
113 | 
def smooth_scores_fill2(info, posinfo, minscore=0.01, minratio=0.5):
    """
    Make the discrete score values smoothly (fill zero scores).

    Zero-valued bins are overwritten with values derived from neighbouring
    scores and a floor value, then the result is passed to smooth_scores2()
    with keep_tails=False.

    Mandatory parameters:
    1. info - A list contains scores in different bins
    2. posinfo - Position information of each bin; its keys are used as
                 indices into `info`

    Alternative parameters:
    1. minscore - Lower bound applied to the smallest non-zero score before
                  it is scaled by `minratio` to give the floor value
    2. minratio - Factor applied to the floor value and to scores that sit
                  next to a zero run

    Returns whatever smooth_scores2() returns for the filled scores.

    NOTE(review): min() below raises ValueError when `info` contains no
    non-zero score - confirm callers never pass an all-zero list.
    """

    # In case original score info be modified
    new_info = info.copy()
    # Floor value: the smallest non-zero score (but at least minscore), scaled by minratio
    minval = max(min([x for x in new_info if x]), minscore)*minratio
    # zerocnt counts the zero bins seen since the last non-zero middle bin;
    # flag is set only when bin 0 itself is zero
    zerocnt = 0
    flag = 0
    for i in posinfo:
        pos = posinfo[i]  # not used below
        score = new_info[i]
        if i == 0:
            if score == 0:
                flag = 1
                zerocnt += 1
                zerostart = i
                continue
        elif i == len(new_info)-1:
            # Last bin: when it is zero, fill from zerostart+1 up to the end
            # NOTE(review): zerostart is unassigned here if no earlier bin
            # set it (e.g. two bins, first non-zero) - verify
            if score == 0:
                for j in range(zerostart+1, len(new_info), 1):
                    score1 = new_info[j-1]
                    score2 = new_info[j]  # not used below
                    if score1 == new_info[zerostart]:
                        new_info[j] = np.average([score1*minratio, minval])
                    else:
                        new_info[j] = np.average([score1, minval])
        else:
            if score == 0:
                # NOTE(review): zerostart is moved on every zero bin too, so
                # at the next non-zero bin it indexes the most recent zero
                # bin rather than the bin before the run - confirm intended
                zerocnt += 1
                zerostart = i
                continue
            else:
                if flag:
                    # First non-zero bin after a run that began at bin 0:
                    # overwrite bins zerostart .. i-1
                    for j in range(i, zerostart, -1):
                        new_info[j-1] = np.average([score*minratio, minval])
                    flag = 0
                else:
                    if zerocnt:
                        # Split the zero run into a left part (filled
                        # forwards from zerostart) and a right part (filled
                        # backwards from the current bin)
                        right = int(score*zerocnt/(score+new_info[zerostart]))
                        left = zerocnt - right
                        for j in range(zerostart+1, zerostart+left+1, 1):
                            score1 = new_info[j-1]
                            score2 = new_info[j]  # not used below
                            if score1 == new_info[zerostart]:
                                new_info[j] = np.average([score1*minratio, minval])
                            else:
                                new_info[j] = np.average([score1, minval])
                        for k in range(i-1, zerostart+left, -1):
                            score1 = new_info[k]  # not used below
                            score2 = new_info[k+1]
                            if score2 == score:
                                new_info[k] = np.average([minval, score2*minratio])
                            else:
                                new_info[k] = np.average([minval, score2])
                # Restart the bookkeeping at this non-zero bin
                zerostart = i
                zerocnt = 0
    # smooth scores
    smooth_info = smooth_scores2(new_info, posinfo, keep_tails=False)

    return smooth_info
181 | 
182 | 
def smooth_scores_fill(info, posinfo):
    """
    Make the discrete score values smoothly (fill zero scores).

    Runs of zero bins are replaced by a ramp that falls from the score on
    the left to a floor value (`bottom`) and rises again to the score on the
    right; the result is passed to smooth_scores2() with keep_tails=False.

    Mandatory parameters:
    1. info - A list contains scores in different bins
    2. posinfo - Position information of each bin

    Returns an empty dict when the non-zero scores sum to zero (e.g. every
    bin is zero), otherwise whatever smooth_scores2() returns.
    """

    new_info = info.copy()
    smooth_info = {}
    nonzero = [x for x in new_info if x]
    if sum(nonzero):
        minscore = min(nonzero)
        maxscore = max(new_info)
    else:
        return smooth_info
    # Set the minimum fill score
    # (0.1 when the smallest non-zero score exceeds 10% of the maximum,
    # otherwise the smallest non-zero score itself)
    if minscore / maxscore > 0.1:
        bottom = 0.1
    else:
        bottom = minscore
    for i in range(len(new_info)):
        if i:
            score0 = new_info[i-1]
            score1 = new_info[i]
            if not score1:
                # Look ahead for the next non-zero bin; when the run reaches
                # the last bin, use `bottom` as the right-hand score
                # NOTE(review): when i is the last index this loop does not
                # run, so j and score2 keep values from an earlier gap or
                # are undefined - verify
                for j in range(i+1, len(new_info)):
                    score2 = new_info[j]
                    if score2:
                        break
                    if j == len(new_info)-1 and score2 == 0:
                        score2 = bottom
                ranges = j - i
                diff1 = abs(score0 - bottom)
                diff2 = abs(score2 - bottom)
                total = diff1 + diff2
                # mid: offset inside the run that receives `bottom`, placed
                # in proportion to the drop on the left side
                if total:
                    mid = int(ranges * diff1 / total)
                else:
                    mid = 0
                # print(i, j, ranges, mid, score0, score2, diff1, diff2, sep="\t")
                if ranges > 1:
                    # Falling ramp before mid, rising ramp after it
                    for k in range(mid):
                        new_info[i+k] = score0 - diff1 * (k+1)/(mid+1)
                    for k in range(mid+1, ranges):
                        new_info[i+k] = bottom + diff2 * (k-mid)/(ranges-mid)
                new_info[i+mid] = bottom
        else:
            # First bin: a zero score is replaced by the floor value
            score = new_info[i]
            if score:
                pass
            else:
                new_info[i] = bottom

    smooth_info = smooth_scores2(new_info, posinfo, keep_tails=False)

    return smooth_info
242 | 
243 | 
def smooth_scores1(info, posinfo, keep_tails=True):
    """
    Make the discrete score values smoothly.
    (Remove missing values between two scores)

    Zero bins lying between two non-zero scores are filled by linear
    interpolation, the scores are divided by their maximum and the result is
    passed to smooth_scores2().

    Mandatory parameters:
    1. info - A list contains scores in different bins
    2. posinfo - Position information of each bin

    Alternative parameters:
    1. keep_tails - Forwarded to smooth_scores2()

    Returns an empty dict when the maximum score is zero, otherwise whatever
    smooth_scores2() returns.
    """

    # In case original score info be modified
    new_info = info.copy()
    # Index of the most recent non-zero score. It stays None until one is
    # seen, so a zero in the first bin no longer disables gap filling.
    prev_idx = None
    for i, score in enumerate(new_info):
        if not score:
            continue
        if prev_idx is not None and i - prev_idx > 1:
            prev_score = new_info[prev_idx]
            span = i - prev_idx
            # offset starts at 1 so the first filled bin already moves
            # towards `score` instead of repeating prev_score
            for offset in range(1, span):
                new_info[prev_idx + offset] = prev_score + (score - prev_score) * offset / span
        prev_idx = i
    peak = max(new_info)
    if not peak:
        return {}
    new_info = [x/peak for x in new_info]
    # Smooth the scores
    return smooth_scores2(new_info, posinfo, keep_tails=keep_tails)
281 | 
282 | 
def smooth_scores2(info, posinfo, keep_tails=False):
    """
    Make the discrete score values smoothly.

    Each bin between the leading and trailing zero tails is replaced, in
    place and from left to right, by the mean of itself and its direct
    neighbours; the smoothed values are then divided by their maximum.

    Mandatory parameters:
    1. info - A list contains scores in different bins
    2. posinfo - Position information of each bin

    Alternative parameters:
    1. keep_tails - Whether or not to keep the missing values in the two tails

    Returns a dict {position: smoothed score}; empty when the maximum input
    score is zero.
    """

    # Work on a copy so the caller's scores stay untouched
    values = info.copy()
    if not max(values):
        return {}
    total = len(values)
    last = total - 1

    # Locate the tails: `begin` ends up on the last index whose preceding
    # scores average to zero, `end` on the first index from which the
    # remaining scores average to zero
    begin, end = 0, total
    for idx in range(total):
        head_avg = np.average(values[:idx]) if idx else values[idx]
        tail_avg = values[idx] if idx == last else np.average(values[idx:])
        if head_avg == 0:
            begin = idx
        if tail_avg == 0:
            end = idx
            break

    if not keep_tails:
        # Let the signal leak into the leading tail, walking leftwards
        if begin != last:
            for idx in range(begin, 0, -1):
                values[idx] = (values[idx-1] + values[idx] + values[idx+1]) / 3
        # ... and into the trailing tail, walking rightwards
        for idx in range(end, total):
            if idx < last:
                values[idx] = (values[idx-1] + values[idx] + values[idx+1]) / 3
            else:
                values[idx] = (values[idx-1] + values[idx]) / 2

    # Core region: two-point mean at both borders, three-point mean inside
    for idx in range(begin, end):
        if idx == begin:
            if begin != last:
                values[idx] = (values[idx] + values[idx+1]) / 2
        elif idx == end - 1:
            values[idx] = (values[idx-1] + values[idx]) / 2
        else:
            values[idx] = (values[idx-1] + values[idx] + values[idx+1]) / 3

    # provide real positions for smoothed scores
    peak = max(values)
    return {posinfo[key]: values[key] / peak for key in posinfo}
353 | 
354 | 
def merge_regions(regions, geneinfo, minlen = 2, mindist = 1):
    """
    Filter and merge key regions.

    Mandatory parameters:
    1. regions - Iterable of (position, score) pairs, one per bin
    2. geneinfo - Object providing `chrom` and `binsize`

    Alternative parameters:
    1. minlen - Minimum length of a merged region, in bins
    2. mindist - Largest gap, in bins, that is still bridged when merging

    Returns a list of [chromosome, start, end, mean score] for every merged
    region spanning at least `minlen` bins.
    """
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize
    max_gap = binsize * mindist

    # groups: start of merged region -> [current end, scores of its bins]
    groups = {}
    anchor = None
    prev_end = None
    for pos, score in regions:
        bin_end = pos + binsize
        if groups and pos - prev_end <= max_gap:
            # Close enough to the previous bin: extend the open region
            groups[anchor][0] = bin_end
            groups[anchor][1].append(score)
        else:
            # First bin, or gap too large: open a new region here
            anchor = pos
            groups[anchor] = [bin_end, [score]]
        prev_end = bin_end

    min_span = binsize * minlen
    return [
        [chromosome, first, last, np.average(scores)]
        for first, (last, scores) in groups.items()
        if last - first >= min_span
    ]
385 | 
386 | 
def _rescale(value, low, high):
    # Min-max scale `value` into [0, 1]; a zero-width range maps to 0.0
    # instead of dividing by zero
    if high == low:
        return 0.0
    return (value - low) / (high - low)


def calc_importance(phenotypes, scorelist, namelist, geneinfo, outdir="./", side="none"):
    """
    Calculate the correlation between phenodata and scores from different
    features, and write the per-sample scores to
    <outdir>/<gene alias>/scores_by_sample.txt.

    Mandatory parameters:
    1. phenotypes - BED file of edited regions; column 4 is the sample name
                    ("WT" marks the wild type), column 5 the phenotype value
    2. scorelist - List of dicts {bin start position: score}, one per feature
    3. namelist - Feature names, paired with `scorelist` by position
    4. geneinfo - Object providing `gene`, `alias`, `chrom` and `binsize`

    Alternative parameters:
    1. outdir - Output directory
    2. side - "none" uses the absolute phenotype difference to WT as is;
              any other value uses the signed difference, min-max scaled

    Raises ValueError when a feature overlaps no edited sample or when the
    phenotype file provides no WT row.
    """
    gene_alias = geneinfo.gene if geneinfo.alias == "NA" else geneinfo.alias
    chrom = geneinfo.chrom
    binsize = geneinfo.binsize
    pheno_bed = BedTool(phenotypes)
    sample_scores = {}
    for scores, name in zip(scorelist, namelist):
        score_bed = BedTool("\n".join("\t".join(map(str, [chrom, x, x+binsize, scores[x]]))
                                      for x in scores),
                            from_string=True)
        rows = [str(interval).rstrip().split("\t")
                for interval in pheno_bed.intersect(score_bed, wo=True)]
        # Find the WT phenotype first, so the result does not depend on the
        # WT row preceding the sample rows in the file
        wt_value = None
        for info in rows:
            if info[3] == "WT":
                wt_value = float(info[4])
                break
        fscores = {}
        for info in rows:
            sample = info[3]
            if sample == "WT":
                continue
            if wt_value is None:
                raise ValueError("No WT row overlapping the scored bins in %s" % phenotypes)
            # Weight the bin score by the fraction of the bin that is edited
            ratio = int(info[-1]) / binsize
            if sample in fscores:
                fscores[sample]["feature"] += float(info[-2]) * ratio
            else:
                diff = float(info[4]) - wt_value
                fscores[sample] = {
                    "pheno": abs(diff) if side == "none" else diff,
                    "feature": float(info[-2]) * ratio,
                }
        if not fscores:
            raise ValueError("No edited sample overlaps the %s scores" % name)
        features = [fscores[s]["feature"] for s in fscores]
        min_score = min(features)
        max_score = max(features)
        # Orient the phenotype so that its average change is positive
        if np.average([fscores[s]["pheno"] for s in fscores]) < 0:
            for s in fscores:
                fscores[s]["pheno"] *= -1
        phenos = [fscores[s]["pheno"] for s in fscores]
        min_pheno = min(phenos)
        max_pheno = max(phenos)
        feature_scores = []
        pheno_scores = []
        for s in fscores:
            score1 = _rescale(fscores[s]["feature"], min_score, max_score)
            feature_scores.append(score1)
            if side == "none":
                score2 = fscores[s]["pheno"]
            else:
                score2 = _rescale(fscores[s]["pheno"], min_pheno, max_pheno)
            pheno_scores.append(score2)
            if s not in sample_scores:
                sample_scores[s] = {"pheno": score2}
            sample_scores[s][name] = score1
        pearson = stats.pearsonr(feature_scores, pheno_scores)
        print(name, "Pearson correlation:", pearson[0])

    outfile = outdir + "/" + gene_alias + "/scores_by_sample.txt"
    df = pd.DataFrame(sample_scores).T
    df.index.name = "sample"
    df.to_csv(outfile, sep="\t")
451 | 
452 | 


--------------------------------------------------------------------------------
/single.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | 
  3 | ##################################################
  4 | # CRISPR-Cas12a promoter editing (CAPE)          #
  5 | # Script: Single Mode                            #
  6 | ##################################################
  7 | 
  8 | import os
  9 | import sys
 10 | import shutil
 11 | from glob import glob
 12 | import configparser
 13 | from time import time
 14 | from multiprocessing import Pool
 15 | from pybedtools import BedTool, cleanup
 16 | 
 17 | from lib import misc
 18 | from lib.features import *
 19 | from lib.cores import output_cores
 20 | 
 21 | 
 22 | class Features_info():
 23 |     # Provide Gene infomation
 24 |     def __init__(self):
 25 |         self.geneinfo = Geneinfo()
 26 |         self.feature = "feature"
 27 |         self.workdir = "results"
 28 |         self.outname = "name"
 29 |         self.slop = 200
 30 |         self.config = {}
 31 |         self.chrlens = {}
 32 | 
 33 | 
 34 | def get_gene_info(gene_file):
 35 | 
 36 |     genes_info = {}
 37 |     with open(gene_file) as infile:
 38 |         for line in infile:
 39 |             if line.startswith("#") or line.startswith("\n"):
 40 |                 continue
 41 |             info = line.rstrip().split("\t")
 42 |             chrom = info[0]
 43 |             start = int(info[1])
 44 |             end = int(info[2])
 45 |             gene_name = info[3]
 46 |             strand = info[5]
 47 |             genes_info[gene_name] = [chrom, start, end, strand]
 48 |             break
 49 | 
 50 |     print("Genes infomation loaded.\n")
 51 | 
 52 |     return genes_info
 53 | 
 54 | 
 55 | def generate_regions(geneinfo, workdir, gene, chrlens):
 56 |     chrom = geneinfo.chrom
 57 |     start = geneinfo.start
 58 |     end = geneinfo.end
 59 |     strand = geneinfo.strand
 60 |     chrom_len = chrlens[chrom]
 61 |     outfile = workdir + "/" + gene + "/analysis_region.bed"
 62 |     misc.check_outdir(outfile)
 63 |     if os.path.exists(outfile):
 64 |         return outfile
 65 |     outf = open(outfile, "w")
 66 |     print(chrom, max(0, start), min(end, chrom_len), gene, '.', strand, 
 67 |           sep="\t", file=outf)
 68 |     outf.close()
 69 | 
 70 |     return outfile
 71 | 
 72 | 
 73 | def generate_features(Features_info):
 74 | 
 75 |     geneinfo = Features_info.geneinfo
 76 |     feature_file = Features_info.feature
 77 |     workdir = Features_info.workdir
 78 |     outname = Features_info.outname
 79 |     slop = Features_info.slop
 80 |     chrlens = Features_info.chrlens
 81 | 
 82 |     chrom = geneinfo.chrom
 83 |     start = geneinfo.start
 84 |     end = geneinfo.end
 85 |     gene = geneinfo.gene
 86 | 
 87 |     if "peak" in outname:
 88 |         outfile = workdir + "/" + gene + "/" + outname + "_raw.bed"
 89 |     else:
 90 |         outfile = workdir + "/" + gene + "/" + outname + "_raw.bedGraph"
 91 |     if os.path.exists(outfile):
 92 |         return outfile
 93 |     if feature_file.endswith(".bw") or feature_file.endswith(".bigwig"):
 94 |         misc.bigwig2bedGraph(feature_file, geneinfo, chrlens, outfile, ext = slop)
 95 |     else:
 96 |         target_bed = BedTool("\t".join([chrom, str(max(0, start-slop)), str(end+slop)]), from_string = True)
 97 |         feature_bed = BedTool(feature_file)
 98 |         feature_bed.intersect(target_bed, wa=True).moveto(outfile)
 99 |     
100 |     cleanup()
101 | 
102 |     return outfile
103 | 
104 | 
def generate_features_from_large(inputfile, genes_info, upstream, slop, workdir, feature):
    """
    Split one large feature file into per-gene files in a single pass.

    Every gene gets a window (upstream region plus `slop` on both sides of
    its start for "+" genes, of its end otherwise). Each input line is
    assigned to the genes whose window contains its reference position - the
    start for the "CNS" feature, the midpoint otherwise - and appended to
    <workdir>/<gene>/<feature>_raw.bedGraph. Genes whose output file already
    exists with more than 10 bytes are skipped. The gene directories must
    already exist.

    Mandatory parameters:
    1. inputfile - Tab-separated feature file (chrom, start, end, ...)
    2. genes_info - Dict {gene: [chrom, start, end, strand, ...]}
    3. upstream - Length of the upstream region
    4. slop - Number of bases added on both sides of the window
    5. workdir - Directory holding one sub-directory per gene
    6. feature - Feature name used in the output file name

    Returns the number of genes that received at least one line.
    """
    # basemap[chrom][position] -> genes whose window covers that position
    basemap = {}
    existed = set()
    num = 0
    genelens = len(genes_info)
    for gene in genes_info:
        outfile = os.path.join(workdir, gene, feature+"_raw.bedGraph")
        if os.path.exists(outfile) and os.path.getsize(outfile) > 10:
            existed.add(gene)
        chrom, start, end, strand = genes_info[gene][:4]
        if strand == "+":
            window = range(max(0, start-upstream-slop), start+slop+1)
        else:
            window = range(end-slop, end+upstream+slop+1)
        positions = basemap.setdefault(chrom, {})
        for i in window:
            positions.setdefault(i, []).append(gene)
        print("%s / %s genes processed, %s existed genes." % (num, genelens, len(existed)),
              end="\r")
        num += 1
    print("Load genes completed.", " "*30)

    # Genes in the order in which they first received a line
    genenums = []
    total_num = max(1, genelens-len(existed))
    outf = {}
    # Handles are closed in batches to limit the number of open files:
    # whenever another `split` genes have been opened, genes that are at
    # least `kept` entries behind the newest ones are closed
    split = 500
    kept = split * 0.9
    tmp_cnt = 0
    tmp_mod = 0
    # Start from zero so that lines seen before the first hit (or an input
    # without any hit) do not leave these names unbound
    cnt = 0
    mod = 0
    with open(inputfile) as infile:
        for line in infile:
            chrom, start, end = line.rstrip().split("\t")[:3]
            if chrom not in basemap:
                continue
            if feature == "CNS":
                s = int(start)
            else:
                s = int((int(start) + int(end)) / 2)
            if s in basemap[chrom]:
                for gene in basemap[chrom][s]:
                    if gene in existed:
                        continue
                    handle = outf.get(gene)
                    if handle is None or handle.closed:
                        outfile = os.path.join(workdir, gene, feature+"_raw.bedGraph")
                        # A handle closed by an earlier batch is reopened in
                        # append mode so that lines already written are kept
                        handle = open(outfile, "w" if handle is None else "a")
                        outf[gene] = handle
                    print(line.rstrip(), file=handle)
                    if gene not in genenums:
                        genenums.append(gene)
                cnt = len(genenums)
                mod = cnt // split
            if mod - tmp_mod > 0:
                st = max(0, int(split * (mod - 1) - kept - 1))
                ed = int(split * mod - kept)
                for j in genenums[st:ed]:
                    outf[j].close()
            if tmp_cnt != cnt:
                pct = round(cnt * 100 / total_num, 2)
                print(pct, "%", " output.", end="\r")
            tmp_cnt = cnt
            tmp_mod = mod
    print("All files output.")

    for gene in outf:
        outf[gene].close()

    return cnt
191 | 
192 | 
193 | def run_analysis(feature_info):
194 |     
195 |     workdir = feature_info.workdir
196 |     geneinfo = feature_info.geneinfo
197 |     gene = feature_info.geneinfo.gene
198 | 
199 |     # Check if calculated
200 |     # check = os.path.join(workdir, gene, "key_regions_merged.bed")
201 |     # if os.path.exists(check):
202 |     #     return (gene, 0)
203 |     check = os.path.join(workdir, gene, "aggregate.bedGraph")
204 |     if os.path.exists(check):
205 |         filesize = os.path.getsize(check)
206 |         if filesize > 10:
207 |             return (gene, 0)
208 | 
209 |     # Open chromatin
210 |     ocscores = glob(os.path.join(workdir, gene, "OCscores*_raw.bedGraph"))
211 |     ocpeaks = glob(os.path.join(workdir, gene, "OCpeaks*_raw.bed"))
212 |     # Calculate scores
213 |     ocscorelist = []
214 |     for idx, ocscorefile in enumerate(ocscores):
215 |         if idx + 1 > len(ocpeaks):
216 |             ocpeakfile = ""
217 |         else:
218 |             ocpeakfile = ocpeaks[idx]
219 |         if len(ocscores) > 1:
220 |             ocname = os.path.basename(ocscorefile).split("_raw")[0]
221 |         else:
222 |             ocname = "OCscores"
223 |         scores_oc1 = openchromatin_scores(geneinfo, ocscorefile, ocpeakfile, 
224 |                                           samplename = ocname, outdir = workdir)
225 |         ocscorelist.append(scores_oc1)
226 |     if len(ocscores) > 1:
227 |         scores_oc = merge_reps(geneinfo, ocscorelist, samplename = "OCscores", outdir = workdir)
228 |     else:
229 |         scores_oc = scores_oc1
230 | 
231 |     # Histone modification
232 |     ptmfiles = glob(os.path.join(workdir, gene, "PTM*_raw.bedGraph"))
233 |     # Calculate scores
234 |     ptmscorelist = []
235 |     for ptmscorefile in ptmfiles:
236 |         if len(ptmfiles) > 1:
237 |             ptmname = os.path.basename(ptmscorefile).split("_raw")[0]
238 |         else:
239 |             ptmname = "PTMscores"
240 |         scores_ptm1 = ptm_scores(geneinfo, ptmscorefile, ocname="OCscores",
241 |                                  samplename = ptmname, outdir = workdir)
242 |         ptmscorelist.append(scores_ptm1)
243 |     if len(ptmfiles) > 1:
244 |         scores_ptm = merge_reps(geneinfo, ptmscorelist, samplename = "PTMscores", outdir = workdir)
245 |     else:
246 |         scores_ptm = scores_ptm1
247 | 
248 |     # TF motifs
249 |     motiffile = os.path.join(workdir, gene, "motifs_raw.bedGraph")
250 |     # Calculate scores
251 |     scores_motif = motif_scores(geneinfo, motiffile, outdir = workdir)
252 | 
253 |     # Conserved sequences
254 |     cnsfile = os.path.join(workdir, gene, "CNS_raw.bedGraph")
255 |     # Calculate scores
256 |     scores_cns = cns_scores(geneinfo, cnsfile, outdir = workdir)
257 | 
258 |     # Genotype versus Phenotype (MBKbase)
259 |     genopheno = os.path.join(workdir, gene, "genopheno_raw.bedGraph")
260 |     # Calculate scores
261 |     if os.path.exists(genopheno):
262 |         scores_genopheno = genopheno_scores(geneinfo, genopheno, outdir = workdir) 
263 |     else: 
264 |         scores_genopheno = {}
265 |     
266 |     # Aggregate scores
267 |     if scores_genopheno:
268 |         scorelist = [scores_oc, scores_motif, scores_cns, scores_ptm, scores_genopheno]
269 |         weightlist = [0.25, 0.2, 0.3, 0.1, 0.05]
270 |     else:
271 |         scorelist = [scores_oc, scores_motif, scores_cns, scores_ptm]
272 |         weightlist = [0.25, 0.2, 0.3, 0.1]
273 |     scores_aggregate = aggregate_scores(geneinfo, scorelist, weightlist, outdir = workdir)
274 | 
275 |     # Load phenodata from CRISPR-edited results
276 |     phenodata = os.path.join(workdir, gene, "phenoscores_raw.bedGraph")
277 |     # Calculate scores
278 |     if os.path.exists(phenodata):
279 |         scores_phenodata = phenodata_scores(geneinfo, phenodata, method = "kmeans2", 
280 |                                             outdir = workdir)
281 |     else:
282 |         scores_phenodata = {}
283 | 
284 |     # Find the feature importance
285 |     if scores_phenodata:
286 |         namelist = ["DHS", "H3K27ac", "TF motif", "CNS", "GenoPheno", "Aggregate"]
287 |         misc.calc_importance(phenodata, scorelist+[scores_aggregate], 
288 |                              namelist, geneinfo, side="both", outdir = workdir)
289 | 
290 |     # Define key regions
291 |     key_regions = define_key_regions(geneinfo, scores_aggregate, phenodata, 
292 |                                      outdir = workdir)
293 | 
294 |     # Get the core of key regions
295 |     scorefile = os.path.join(workdir, gene, "aggregate.bedGraph")
296 |     regionfile = os.path.join(workdir, gene, "key_regions_merged.bed")
297 |     core_regions = output_cores(geneinfo, scorefile, regionfile)
298 |     
299 |     cleanup()
300 | 
301 |     return (gene, 1)
302 | 
303 | 
def check_options(config):
    """Validate, echo and clamp the options of a parsed Single-mode config.

    The working directory is made absolute (or defaults to ``results``) and
    handed to ``misc.check_outdir``.  Every option of every section is
    printed; each non-empty path listed in the ``Features`` section must
    exist, otherwise the program exits with status 1.  Numeric ``General``
    options are then capped.

    Args:
        config: a ``configparser.ConfigParser`` that has already read the
            config file.

    Returns:
        The same ``config`` object, modified in place.
    """
    print("# Using the following options:")
    general = config["General"]
    if general["workdir"]:
        general["workdir"] = os.path.abspath(general["workdir"])
    else:
        general["workdir"] = "results"
    misc.check_outdir(general["workdir"])

    for section in config.sections():
        for param in config.options(section):
            values = config[section][param]
            if section == "Features":
                # A comma-separated entry is echoed as a list, a single
                # entry as the plain string.
                if "," in values:
                    values = values.split(",")
                    candidates = values
                else:
                    candidates = [values]
                for path in candidates:
                    if path and not os.path.exists(path):
                        print("# Error, cannot find the %s: %s" % (param, path))
                        sys.exit(1)
            print("%s: %s" % (param, values))

    # ConfigParser only stores strings: assigning an int/float raises
    # TypeError, and a value such as "50000.0" could not be read back with
    # int().  Every clamped value is therefore written as an integer string.
    max_threads = os.cpu_count() or 1  # cpu_count() may return None
    if int(general["threads"]) > max_threads:
        general["threads"] = str(max_threads)
    if int(general["slop"]) > 50000:
        general["slop"] = "50000"
    if int(general["upstream"]) > 10000:
        general["upstream"] = "10000"
    # For integers, binsize > upstream / 2 is the same as binsize > upstream // 2.
    max_binsize = int(general["upstream"]) // 2
    if int(general["binsize"]) > max_binsize:
        general["binsize"] = str(max_binsize)
    if int(general["step"]) > int(general["binsize"]):
        general["step"] = str(int(general["binsize"]))
    if config["Genes"]["gene_file"]:
        print("\n# Using Single mode.\n")

    return config
342 | 
343 | 
def main():
    """Run the Single-mode pipeline from a config file.

    The optional command-line argument is the config file path (default
    ``config.ini``).  For every configured feature file the raw per-gene
    feature file is generated, then ``run_analysis`` is executed.  Apart from
    the large-text-file path, only the first gene of the gene file is
    processed; analysis-region files are still written for every gene.
    """
    # Load configs
    config = configparser.ConfigParser()
    if len(sys.argv) == 1:
        config_file = "config.ini"
    elif len(sys.argv) == 2:
        config_file = sys.argv[1]
    else:
        print("Usage:\n    python single.py [configfile]\n")
        sys.exit(1)
    # ConfigParser.read() silently skips unreadable files and returns the
    # list of files it parsed; an empty list means nothing was loaded.
    if not config.read(config_file):
        print("# Error, cannot read the config file: %s" % config_file)
        sys.exit(1)

    config = check_options(config)
    general = config["General"]
    workdir = general["workdir"]
    slop = int(general["slop"])
    upstream = int(general["upstream"])
    binsize = int(general["binsize"])
    step = int(general["step"])
    gene_file = config["Genes"]["gene_file"]
    chrom_sizes = config["Genes"]["chrom_sizes"]

    # Load genes
    if gene_file:
        genes_info = get_gene_info(gene_file)
    else:
        print("No gene annotation file found, stop!")
        sys.exit(1)
    if not genes_info:
        print("No genes found in the gene file, stop!")
        sys.exit(1)

    # Load chromosome sizes
    chrlens = misc.get_chrom_sizes(chrom_sizes)

    # Define features information
    feature_map = {"ocfiles":"OCscores", "ocpeaks":"OCpeaks", "ptmfiles":"PTM",
                   "motifs":"motifs", "cnss":"CNS", "genopheno":"genopheno",
                   "phenodata":"phenoscores"}
    feature_infos = []
    for item in config["Features"]:
        feature_files = config["Features"][item]
        if not feature_files:
            continue
        filelist = feature_files.split(",")
        for count, file in enumerate(filelist, start=1):
            # Replicates of one feature get a numeric suffix.
            if len(filelist) > 1:
                outname = feature_map[item] + "_" + str(count)
            else:
                outname = feature_map[item]
            feature_infos = []
            for num, gene in enumerate(genes_info, start=1):
                record = genes_info[gene]
                chrom, start, end, strand = record[0], record[1], record[2], record[3]
                feature_info = Features_info()
                feature_info.workdir = workdir
                feature_info.slop = slop
                feature_info.config = config
                feature_info.idx = num
                feature_info.geneinfo = Geneinfo()
                feature_info.geneinfo.gene = gene
                feature_info.geneinfo.chrom = chrom
                feature_info.geneinfo.strand = strand
                # The analysed window is the `upstream` bases before the gene
                # start, taken on the gene's own strand.
                if strand == "+":
                    feature_info.geneinfo.start = start - upstream
                    feature_info.geneinfo.end = start - 1
                else:
                    feature_info.geneinfo.start = end
                    feature_info.geneinfo.end = end + upstream - 1
                feature_info.geneinfo.binsize = binsize
                feature_info.geneinfo.step = step
                # Output analyzed gene regions
                generate_regions(feature_info.geneinfo, workdir, gene, chrlens)
                feature_info.feature = file
                feature_info.outname = outname
                feature_info.chrlens = chrlens
                feature_infos.append(feature_info)
            # Generate features file
            time_st = time()
            file_suffix = file.split(".")[-1].lower()
            filesize = os.path.getsize(file)
            if file_suffix in ["bed", "bedgraph", "txt"] and filesize > 1e8:
                # Text files above 1e8 bytes are streamed once for all genes.
                results = generate_features_from_large(file, genes_info, upstream, slop,
                                                       workdir, outname)
            else:
                results = generate_features(feature_infos[0])
            time_ed = time()
            time_elapse = round(time_ed - time_st)
            print("Generate %s features files finished.\nUsing %ss" % (outname, time_elapse))

    # Without any configured feature there is nothing to analyse (the
    # original code failed here with a NameError).
    if not feature_infos:
        print("No feature files found in the config, stop!")
        sys.exit(1)

    # Perform analysis
    time_st = time()
    result = run_analysis(feature_infos[0])
    if result[1]:
        time_total = round(time() - time_st, 2)
        print("\nGene analysis finished using %ss. %s\n" % (time_total, " "*30))

    print("All the processes completed.", " "*10)
454 | 
455 | 
456 | 
if __name__ == '__main__':

    # Script entry point: a Ctrl-C is reported on stderr and ends the
    # process with exit status 0 instead of printing a traceback.
    try:
        main()
    except KeyboardInterrupt:
        print("User interrupt", file=sys.stderr)
        raise SystemExit(0)
465 | 
466 | 


--------------------------------------------------------------------------------
/batch.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | 
  3 | ##################################################
  4 | # CRISPR-Cas12a promoter editing (CAPE)          #
  5 | # Script: Batch Mode                             #
  6 | ##################################################
  7 | 
  8 | import os
  9 | import sys
 10 | import shutil
 11 | from glob import glob
 12 | import configparser
 13 | from time import time
 14 | from multiprocessing import Pool
 15 | from pybedtools import BedTool, cleanup
 16 | 
 17 | from lib import misc
 18 | from lib.features import *
 19 | from lib.cores import output_cores
 20 | 
 21 | 
 22 | class Features_info():
 23 |     # Provide Gene infomation
 24 |     def __init__(self):
 25 |         self.geneinfo = Geneinfo()
 26 |         self.feature = "feature"
 27 |         self.workdir = "results"
 28 |         self.outname = "name"
 29 |         self.slop = 200
 30 |         self.config = {}
 31 |         self.chrlens = {}
 32 | 
 33 | 
 34 | def get_gene_info(gff_file):
 35 |     
 36 |     suffix = gff_file.split(".")[-1].lower()
 37 | 
 38 |     if suffix not in ["gtf", "gff", "gff3"]:
 39 |         print("Input gene annotataion file is not in GFF/GFF3 format.\nPlease check the file.")
 40 |         sys.exit(1)
 41 | 
 42 |     genes_info = {}
 43 |     cds_info = {}
 44 |     with open(gff_file) as infile:
 45 |         for line in infile:
 46 |             if line.startswith("#") or line.startswith("\n"):
 47 |                 continue
 48 |             info = line.rstrip().split("\t")
 49 |             chrom = info[0]
 50 |             category = info[2].lower()
 51 |             start = int(info[3])
 52 |             end = int(info[4])
 53 |             strand = info[6]
 54 |             desc = info[8].split(";")
 55 |             if category == "gene":
 56 |                 gene_name = desc[0].split("=")[-1]
 57 |                 genes_info[gene_name] = [chrom, start, end, strand]
 58 |             elif category == "transcript":
 59 |                 gene_name = desc[0].split("\"")[1]
 60 |                 genes_info[gene_name] = [chrom, start, end, strand]
 61 |             elif category == "cds":
 62 |                 if gene_name in cds_info:
 63 |                     cds_info[gene_name] = [min(cds_info[gene_name][0], start), 
 64 |                                            max(cds_info[gene_name][1], end)]
 65 |                 else:
 66 |                     cds_info[gene_name] = [start, end]
 67 |     
 68 |     for gene in genes_info:
 69 |         if gene in cds_info:
 70 |             s, e = cds_info[gene]
 71 |             dist1 = s - genes_info[gene][1]
 72 |             dist2 = genes_info[gene][2] - e
 73 |             if dist1 >= 0 and dist2 >= 0:
 74 |                 genes_info[gene].extend([dist1, dist2])
 75 |             else:
 76 |                 genes_info[gene].extend([0, 0])
 77 |         else:
 78 |             genes_info[gene].extend([0, 0])
 79 | 
 80 |     print("%s genes found in the annotation file.\n" % len(genes_info))
 81 | 
 82 |     return genes_info
 83 | 
 84 | 
 85 | def generate_regions(geneinfo, workdir, gene, chrlens):
 86 |     chrom = geneinfo.chrom
 87 |     start = geneinfo.start
 88 |     end = geneinfo.end
 89 |     strand = geneinfo.strand
 90 |     chrom_len = chrlens[chrom]
 91 |     outfile = workdir + "/" + gene + "/analysis_region.bed"
 92 |     misc.check_outdir(outfile)
 93 |     if os.path.exists(outfile):
 94 |         return outfile
 95 |     outf = open(outfile, "w")
 96 |     print(chrom, max(0, start), min(end, chrom_len), gene, '.', strand, 
 97 |           sep="\t", file=outf)
 98 |     outf.close()
 99 | 
100 |     return outfile
101 | 
102 | 
def generate_features(Features_info):
    """Extract the raw feature data that falls into one gene's window.

    The output is ``<workdir>/<gene>/<outname>_raw.bed`` when the output name
    contains "peak", otherwise ``<workdir>/<gene>/<outname>_raw.bedGraph``.
    An existing output is reused.  bigWig inputs (``.bw`` / ``.bigwig``) are
    converted by ``misc.bigwig2bedGraph``; any other input is intersected
    with the window, extended by ``slop`` on both sides, using pybedtools.

    Args:
        Features_info: a populated ``Features_info`` instance.

    Returns:
        Path of the output file.
    """
    task = Features_info
    region = task.geneinfo
    source = task.feature
    out_root = task.workdir
    stem = task.outname
    flank = task.slop
    chrom_sizes = task.chrlens

    chrom, left, right, gene = region.chrom, region.start, region.end, region.gene

    extension = "_raw.bed" if "peak" in stem else "_raw.bedGraph"
    outfile = out_root + "/" + gene + "/" + stem + extension
    if os.path.exists(outfile):
        return outfile

    if source.endswith((".bw", ".bigwig")):
        misc.bigwig2bedGraph(source, region, chrom_sizes, outfile, ext = flank)
    else:
        # One-interval BED describing the window, clamped at position 0.
        window = "\t".join([chrom, str(max(0, left-flank)), str(right+flank)])
        window_bed = BedTool(window, from_string = True)
        source_bed = BedTool(source)
        window_bed.intersect(source_bed).moveto(outfile)

    # Remove the temporary files pybedtools created in this process.
    cleanup()

    return outfile
133 | 
134 | 
def generate_features_from_large(inputfile, genes_info, upstream, slop, workdir, feature):
    """Split a large BED-like file into per-gene raw feature files in one pass.

    Every base of each gene's window (upstream region plus ``slop`` on both
    sides, taken on the gene's strand) is indexed first.  Each input row is
    then assigned by a single position - its start for the "CNS" feature,
    the midpoint of start and end otherwise - and appended to
    ``<workdir>/<gene>/<feature>_raw.bedGraph`` of every gene whose window
    contains that position.  Genes whose output file already exists with
    more than 10 bytes are not rewritten.  The per-gene directories must
    already exist.

    Args:
        inputfile: path of the tab-separated input (chrom, start, end, ...).
        genes_info: mapping gene -> sequence starting with
            chrom, start, end, strand.
        upstream: length of the upstream region in bases.
        slop: extra bases added on both sides of the window.
        workdir: output directory.
        feature: output file stem.

    Returns:
        Number of genes that received at least one row (0 when none did).
    """
    outname = feature + "_raw.bedGraph"

    # Pass 1: index every base of every gene window.
    basemap = {}
    existed = set()
    genelens = len(genes_info)
    for num, gene in enumerate(genes_info):
        outfile = os.path.join(workdir, gene, outname)
        if os.path.exists(outfile) and os.path.getsize(outfile) > 10:
            existed.add(gene)
        chrom, start, end, strand = genes_info[gene][:4]
        if strand == "+":
            window = range(max(0, start-upstream-slop), start+slop+1)
        else:
            window = range(end-slop, end+upstream+slop+1)
        positions = basemap.setdefault(chrom, {})
        for i in window:
            positions.setdefault(i, []).append(gene)
        print("%s / %s genes processed, %s existed genes." % (num, genelens, len(existed)),
              end="\r")
    print("Load genes completed.", " "*30)

    # Pass 2: stream the input and dispatch rows to the per-gene files.
    genenums = []    # genes in order of their first written row
    written = set()  # same genes, for O(1) membership tests
    total_num = max(1, genelens-len(existed))
    outf = {}
    split = 500
    kept = split * 0.9
    # cnt/mod start at 0 so rows that hit no window (or an input without any
    # match) no longer reference unbound names.
    cnt = 0
    mod = 0
    tmp_cnt = 0
    tmp_mod = 0
    try:
        with open(inputfile) as infile:
            for line in infile:
                fields = line.rstrip().split("\t")
                if len(fields) < 3:
                    continue
                chrom, start, end = fields[:3]
                if chrom not in basemap:
                    continue
                if feature == "CNS":
                    s = int(start)
                else:
                    s = int((int(start) + int(end)) / 2)
                for gene in basemap[chrom].get(s, ()):
                    if gene in existed:
                        continue
                    handle = outf.get(gene)
                    if handle is None:
                        handle = open(os.path.join(workdir, gene, outname), "w")
                        outf[gene] = handle
                    elif handle.closed:
                        # The handle was released below to bound the number of
                        # open files; reopen in append mode rather than fail.
                        handle = open(os.path.join(workdir, gene, outname), "a")
                        outf[gene] = handle
                    print(line.rstrip(), file=handle)
                    if gene not in written:
                        written.add(gene)
                        genenums.append(gene)
                cnt = len(genenums)
                mod = cnt // split
                # Each time another `split` genes have been started, close the
                # oldest handles and keep roughly the newest `kept` open.
                if mod - tmp_mod > 0:
                    st = max(0, int(split * (mod - 1) - kept - 1))
                    ed = int(split * mod - kept)
                    for j in genenums[st:ed]:
                        outf[j].close()
                if tmp_cnt != cnt:
                    pct = round(cnt * 100 / total_num, 2)
                    print(pct, "%", " output.", end="\r")
                tmp_cnt = cnt
                tmp_mod = mod
        print("All files output.")
    finally:
        # close() is idempotent, so handles released earlier are harmless.
        for handle in outf.values():
            handle.close()

    return cnt
221 | 
222 | 
def run_analysis(feature_info):
    """Score one gene's window on every feature and derive its key regions.

    Reads the raw per-gene files from ``<workdir>/<gene>/``, computes the
    open-chromatin, histone-modification, TF-motif, conserved-sequence and
    (when available) genotype/phenotype scores, aggregates them with fixed
    weights, then defines key regions and their cores.

    Args:
        feature_info: a populated ``Features_info`` instance; only
            ``workdir`` and ``geneinfo`` are read.

    Returns:
        ``(gene, 0)`` when ``aggregate.bedGraph`` already exists with more
        than 10 bytes (gene skipped), otherwise ``(gene, 1)``.

    Raises:
        FileNotFoundError: when no open-chromatin or no histone-modification
            raw file exists for the gene.
    """
    workdir = feature_info.workdir
    geneinfo = feature_info.geneinfo
    gene = feature_info.geneinfo.gene

    # Check if calculated
    check = os.path.join(workdir, gene, "aggregate.bedGraph")
    if os.path.exists(check) and os.path.getsize(check) > 10:
        return (gene, 0)

    # Open chromatin
    # glob() returns names in arbitrary order; sort both lists so replicate
    # N of the scores is always paired with replicate N of the peaks.
    ocscores = sorted(glob(os.path.join(workdir, gene, "OCscores*_raw.bedGraph")))
    ocpeaks = sorted(glob(os.path.join(workdir, gene, "OCpeaks*_raw.bed")))
    if not ocscores:
        # Previously surfaced as an UnboundLocalError on scores_oc1.
        raise FileNotFoundError("No OCscores*_raw.bedGraph file found for gene %s" % gene)
    # Calculate scores
    ocscorelist = []
    for idx, ocscorefile in enumerate(ocscores):
        ocpeakfile = ocpeaks[idx] if idx < len(ocpeaks) else ""
        if len(ocscores) > 1:
            ocname = os.path.basename(ocscorefile).split("_raw")[0]
        else:
            ocname = "OCscores"
        ocscorelist.append(openchromatin_scores(geneinfo, ocscorefile, ocpeakfile,
                                                samplename = ocname, outdir = workdir))
    if len(ocscores) > 1:
        scores_oc = merge_reps(geneinfo, ocscorelist, samplename = "OCscores", outdir = workdir)
    else:
        scores_oc = ocscorelist[0]

    # Histone modification
    ptmfiles = sorted(glob(os.path.join(workdir, gene, "PTM*_raw.bedGraph")))
    if not ptmfiles:
        # Previously surfaced as an UnboundLocalError on scores_ptm1.
        raise FileNotFoundError("No PTM*_raw.bedGraph file found for gene %s" % gene)
    # Calculate scores
    ptmscorelist = []
    for ptmscorefile in ptmfiles:
        if len(ptmfiles) > 1:
            ptmname = os.path.basename(ptmscorefile).split("_raw")[0]
        else:
            ptmname = "PTMscores"
        ptmscorelist.append(ptm_scores(geneinfo, ptmscorefile, ocname="OCscores",
                                       samplename = ptmname, outdir = workdir))
    if len(ptmfiles) > 1:
        scores_ptm = merge_reps(geneinfo, ptmscorelist, samplename = "PTMscores", outdir = workdir)
    else:
        scores_ptm = ptmscorelist[0]

    # TF motifs
    motiffile = os.path.join(workdir, gene, "motifs_raw.bedGraph")
    scores_motif = motif_scores(geneinfo, motiffile, outdir = workdir)

    # Conserved sequences
    cnsfile = os.path.join(workdir, gene, "CNS_raw.bedGraph")
    scores_cns = cns_scores(geneinfo, cnsfile, outdir = workdir)

    # Genotype versus Phenotype (MBKbase)
    genopheno = os.path.join(workdir, gene, "genopheno_raw.bedGraph")
    # Optional input: same existence guard as the Single-mode script.
    if os.path.exists(genopheno):
        scores_genopheno = genopheno_scores(geneinfo, genopheno, outdir = workdir)
    else:
        scores_genopheno = {}

    # Aggregate scores.  The display names are kept in the same order as the
    # scores so every score is labelled with its own feature.
    scorelist = [scores_oc, scores_motif, scores_cns, scores_ptm]
    weightlist = [0.25, 0.2, 0.3, 0.1]
    namelist = ["DHS", "TF motif", "CNS", "H3K27ac"]
    if scores_genopheno:
        scorelist.append(scores_genopheno)
        weightlist.append(0.05)
        namelist.append("GenoPheno")
    scores_aggregate = aggregate_scores(geneinfo, scorelist, weightlist, outdir = workdir)

    # Load phenodata from CRISPR-edited results
    phenodata = os.path.join(workdir, gene, "phenoscores_raw.bedGraph")
    if os.path.exists(phenodata):
        scores_phenodata = phenodata_scores(geneinfo, phenodata, method = "kmeans2",
                                            outdir = workdir)
    else:
        scores_phenodata = {}

    # Find the feature importance
    if scores_phenodata:
        misc.calc_importance(phenodata, scorelist+[scores_aggregate],
                             namelist+["Aggregate"], geneinfo, side="both",
                             outdir = workdir)

    # Define key regions
    define_key_regions(geneinfo, scores_aggregate, phenodata, outdir = workdir)

    # Get the core of key regions
    scorefile = os.path.join(workdir, gene, "aggregate.bedGraph")
    regionfile = os.path.join(workdir, gene, "key_regions_merged.bed")
    output_cores(geneinfo, scorefile, regionfile)

    # Remove the temporary files pybedtools created in this process.
    cleanup()

    return (gene, 1)
329 | 
330 | 
def check_options(config):
    """Validate, echo and clamp the options of a parsed Batch-mode config.

    The working directory is made absolute (or defaults to ``results``) and
    handed to ``misc.check_outdir``.  Every option of every section is
    printed; each non-empty path listed in the ``Features`` section must
    exist, otherwise the program exits with status 1.  Numeric ``General``
    options are then capped.

    Args:
        config: a ``configparser.ConfigParser`` that has already read the
            config file.

    Returns:
        The same ``config`` object, modified in place.
    """
    print("# Using the following options:")
    general = config["General"]
    if general["workdir"]:
        general["workdir"] = os.path.abspath(general["workdir"])
    else:
        general["workdir"] = "results"
    misc.check_outdir(general["workdir"])

    for section in config.sections():
        for param in config.options(section):
            values = config[section][param]
            if section == "Features":
                # A comma-separated entry is echoed as a list, a single
                # entry as the plain string.
                if "," in values:
                    values = values.split(",")
                    candidates = values
                else:
                    candidates = [values]
                for path in candidates:
                    if path and not os.path.exists(path):
                        print("# Error, cannot find the %s: %s" % (param, path))
                        sys.exit(1)
            print("%s: %s" % (param, values))

    # ConfigParser only stores strings: assigning an int/float raises
    # TypeError, and a value such as "50000.0" could not be read back with
    # int().  Every clamped value is therefore written as an integer string.
    max_threads = os.cpu_count() or 1  # cpu_count() may return None
    if int(general["threads"]) > max_threads:
        general["threads"] = str(max_threads)
    if int(general["slop"]) > 50000:
        general["slop"] = "50000"
    if int(general["upstream"]) > 10000:
        general["upstream"] = "10000"
    # For integers, binsize > upstream / 2 is the same as binsize > upstream // 2.
    max_binsize = int(general["upstream"]) // 2
    if int(general["binsize"]) > max_binsize:
        general["binsize"] = str(max_binsize)
    if int(general["step"]) > int(general["binsize"]):
        general["step"] = str(int(general["binsize"]))
    if config["Genes"]["gff_file"]:
        print("\n# Using Batch mode.\n")

    return config
369 | 
370 | 
def main():
    """Run the Batch-mode pipeline for every gene of a GFF/GTF annotation.

    The optional command-line argument is the config file path (default
    ``config.ini``).  For each configured feature file the raw per-gene
    files are generated (in worker processes, or in one streaming pass for
    text files above 1e8 bytes); afterwards ``run_analysis`` is executed for
    every gene in worker processes, in rounds of ``inputnum`` genes.
    """
    # Load configs
    config = configparser.ConfigParser()
    if len(sys.argv) == 1:
        config_file = "config.ini"
    elif len(sys.argv) == 2:
        config_file = sys.argv[1]
    else:
        print("Usage:\n    python batch.py [configfile]\n")
        sys.exit(1)
    # ConfigParser.read() silently skips unreadable files and returns the
    # list of files it parsed; an empty list means nothing was loaded.
    if not config.read(config_file):
        print("# Error, cannot read the config file: %s" % config_file)
        sys.exit(1)

    config = check_options(config)
    workdir = config["General"]["workdir"]
    # A pool needs at least one worker, and threads is used as a divisor below.
    threads = max(1, int(config["General"]["threads"]))
    slop = int(config["General"]["slop"])
    upstream = int(config["General"]["upstream"])
    binsize = int(config["General"]["binsize"])
    step = int(config["General"]["step"])
    withutr = int(config["General"]["withutr"])
    gff_file = config["Genes"]["gff_file"]
    chrom_sizes = config["Genes"]["chrom_sizes"]

    # Load genes
    if gff_file:
        genes_info = get_gene_info(gff_file)
        total_genes = len(genes_info)
    else:
        print("No genome annotation file found, stop!")
        sys.exit(1)

    # Define the input numbers of multiprocessing list: about 512 genes per
    # round, rounded down to a multiple of threads, at least 4 per worker.
    inputnum = 512
    if inputnum < threads:
        inputnum = threads
    else:
        roundnum = (inputnum // threads) * threads
        inputnum = int(max(roundnum, threads*4))

    # Load chromosome sizes
    chrlens = misc.get_chrom_sizes(chrom_sizes)

    # Define features information
    feature_map = {"ocfiles":"OCscores", "ocpeaks":"OCpeaks", "ptmfiles":"PTM",
                   "motifs":"motifs", "cnss":"CNS", "genopheno":"genopheno",
                   "phenodata":"phenoscores"}
    feature_infos = []
    for item in config["Features"]:
        feature_files = config["Features"][item]
        if not feature_files:
            continue
        filelist = feature_files.split(",")
        for count, file in enumerate(filelist, start=1):
            # Replicates of one feature get a numeric suffix.
            if len(filelist) > 1:
                outname = feature_map[item] + "_" + str(count)
            else:
                outname = feature_map[item]
            feature_infos = []
            for num, gene in enumerate(genes_info, start=1):
                chrom, start, end, strand, utrst, utred = genes_info[gene][:6]
                feature_info = Features_info()
                feature_info.workdir = workdir
                feature_info.slop = slop
                feature_info.config = config
                feature_info.idx = num
                feature_info.geneinfo = Geneinfo()
                feature_info.geneinfo.gene = gene
                feature_info.geneinfo.chrom = chrom
                feature_info.geneinfo.strand = strand
                # The window is the `upstream` bases before the gene start on
                # the gene's strand; with `withutr` it additionally reaches
                # into the gene up to the CDS boundary.
                if strand == "+":
                    feature_info.geneinfo.start = start - upstream
                    if withutr:
                        feature_info.geneinfo.end = start + utrst
                    else:
                        feature_info.geneinfo.end = start
                else:
                    if withutr:
                        feature_info.geneinfo.start = end - utred
                    else:
                        feature_info.geneinfo.start = end
                    feature_info.geneinfo.end = end + upstream
                feature_info.geneinfo.binsize = binsize
                feature_info.geneinfo.step = step
                # Output analyzed gene regions
                generate_regions(feature_info.geneinfo, workdir, gene, chrlens)
                feature_info.feature = file
                feature_info.outname = outname
                feature_info.chrlens = chrlens
                feature_infos.append(feature_info)
            # Generate features file
            time_st = time()
            file_suffix = file.split(".")[-1].lower()
            filesize = os.path.getsize(file)
            if file_suffix in ["bed", "bedgraph", "txt"] and filesize > 1e8:
                results = generate_features_from_large(file, genes_info, upstream, slop,
                                                       workdir, outname)
            else:
                # Multiprocessing
                for i in range(0, total_genes, inputnum):
                    # Set Pool size
                    pool = Pool(threads)
                    inputlist = feature_infos[i:i+inputnum]
                    results = pool.map(generate_features, inputlist)
                    pool.close()
                    pool.join()
                    print("Round %s finished." % round(i/inputnum))
            time_ed = time()
            time_elapse = round(time_ed - time_st)
            print("Generate %s features files finished.\nUsing %ss" % (outname, time_elapse))

    # Without any configured feature there is nothing to analyse (the
    # original code failed below with a NameError).
    if total_genes and not feature_infos:
        print("No feature files found in the config, stop!")
        sys.exit(1)

    # Perform analysis
    time_st = time()
    cnt = 1
    new = 0
    for i in range(0, total_genes, inputnum):
        # Set Pool size
        pool = Pool(threads)
        inputlist = feature_infos[i:i+inputnum]
        for result in pool.imap_unordered(run_analysis, inputlist):
            if result[1]:
                new += 1
            time_ed = time()
            if new:
                speed = round((time_ed - time_st) / new, 2)
            else:
                # Only skipped genes so far: restart the clock so the speed
                # reflects newly analysed genes only.
                time_st = time()
                speed = 0.0
            print("%s / %s Gene (%s) analyzed (speed %s s)." % (cnt, total_genes, result[0], speed))
            cnt += 1
        pool.close()
        pool.join()
        print("Round %s finished." % round(i/inputnum))
    time_total = round(time() - time_st, 2)
    print("\nAll the genes analysis finished using %ss. %s\n" % (time_total, " "*30))

    print("All the processes completed.", " "*10)
523 | 
524 | 
525 | 
if __name__ == '__main__':
    # Script entry point: run the whole pipeline, and if the user presses
    # Ctrl-C report it on stderr and exit instead of dumping a traceback.
    try:
        main()
    except KeyboardInterrupt:
        print("User interrupt", file=sys.stderr)
        sys.exit(0)
534 | 
535 | 


--------------------------------------------------------------------------------
/test/single/data/genes_motifs_JASPAR_test.bed:
--------------------------------------------------------------------------------
  1 | Chr1	4001197	4001217	AT2G28810	.	-	2.22e-07	0.00793
  2 | Chr1	4001197	4001217	AT5G02460	.	+	1.71e-07	0.0044
  3 | Chr1	4001198	4001218	OBP1	.	-	3.96e-07	0.00957
  4 | Chr1	4001236	4001248	O2	.	-	9.03e-06	0.301
  5 | Chr1	4001296	4001305	ERF6	.	+	9.61e-06	0.159
  6 | Chr1	4001395	4001406	bZIP42	.	-	7.49e-06	0.417
  7 | Chr1	4002087	4002097	bHLH80	.	+	2.49e-07	0.183
  8 | Chr1	4002208	4002218	LEC2	.	+	4.49e-06	0.403
  9 | Chr1	4002257	4002275	AT3G45610	.	-	5.07e-07	0.0322
 10 | Chr1	4002257	4002277	COG1	.	+	4.99e-07	0.0211
 11 | Chr1	4002260	4002280	Adof1	.	+	8.69e-08	0.00468
 12 | Chr1	4002264	4002282	AT3G45610	.	-	1.84e-06	0.0522
 13 | Chr1	4002265	4002275	AT3G52440	.	+	2.77e-06	0.412
 14 | Chr1	4002265	4002275	DAG2	.	+	4.51e-07	0.244
 15 | Chr1	4002265	4002278	OBP4	.	+	2.42e-06	0.132
 16 | Chr1	4002301	4002315	AT3G46070	.	+	2.75e-06	0.305
 17 | Chr1	4002446	4002474	AT5G66940	.	-	6.68e-10	0.000144
 18 | Chr1	4002447	4002467	Adof1	.	+	4.23e-08	0.0033
 19 | Chr1	4002448	4002466	dof4.2	.	+	2.98e-07	0.0263
 20 | Chr1	4002448	4002468	OBP3	.	-	2.29e-07	0.0043
 21 | Chr1	4002448	4002474	AT1G69570	.	-	7.81e-11	8.63e-05
 22 | Chr1	4002449	4002469	OBP3	.	-	3.85e-07	0.00558
 23 | Chr1	4002450	4002464	IDD2	.	+	4.71e-06	0.238
 24 | Chr1	4002450	4002469	AT1G14580	.	-	7.57e-06	0.243
 25 | Chr1	4002450	4002470	OBP3	.	-	8.67e-08	0.0026
 26 | Chr1	4002451	4002471	OBP3	.	-	1.48e-07	0.00347
 27 | Chr1	4002452	4002472	AT2G28810	.	-	1.09e-09	0.000736
 28 | Chr1	4002452	4002472	OBP3	.	-	3.65e-10	0.000258
 29 | Chr1	4002452	4002472	AT5G02460	.	+	2.32e-10	0.000203
 30 | Chr1	4002452	4002480	AT5G66940	.	-	1.64e-06	0.0126
 31 | Chr1	4002453	4002473	OBP1	.	-	2.45e-09	0.0007
 32 | Chr1	4002453	4002481	AT5G66940	.	-	8.62e-07	0.00878
 33 | Chr1	4002454	4002472	AT3G45610	.	-	2.38e-06	0.0569
 34 | Chr1	4002454	4002474	FLC	.	+	9.09e-06	0.238
 35 | Chr1	4002454	4002474	COG1	.	+	2.91e-08	0.00648
 36 | Chr1	4002454	4002482	AT5G66940	.	-	1.31e-06	0.0111
 37 | Chr1	4002455	4002483	AT5G66940	.	-	3.34e-07	0.00507
 38 | Chr1	4002456	4002482	AT1G69570	.	-	7.97e-11	8.63e-05
 39 | Chr1	4002456	4002484	AT5G66940	.	-	2.41e-13	4.58e-06
 40 | Chr1	4002457	4002477	Adof1	.	+	1.76e-08	0.00217
 41 | Chr1	4002458	4002476	dof4.2	.	+	7.77e-07	0.0374
 42 | Chr1	4002458	4002478	OBP3	.	-	2.98e-07	0.0049
 43 | Chr1	4002459	4002472	PI	.	+	3.44e-06	0.121
 44 | Chr1	4002459	4002479	OBP3	.	-	4.05e-08	0.00179
 45 | Chr1	4002460	4002474	IDD2	.	+	4.71e-06	0.238
 46 | Chr1	4002460	4002479	AT1G14580	.	-	8.53e-06	0.243
 47 | Chr1	4002460	4002480	OBP3	.	-	3.01e-11	0.000105
 48 | Chr1	4002462	4002480	AT3G45610	.	-	1.37e-07	0.0213
 49 | Chr1	4002462	4002482	COG1	.	+	1.7e-08	0.00523
 50 | Chr1	4002462	4002482	AT2G28810	.	-	5.06e-10	0.000736
 51 | Chr1	4002462	4002482	AT5G02460	.	+	4.9e-11	0.000151
 52 | Chr1	4002463	4002483	OBP1	.	-	6.9e-10	0.000466
 53 | Chr1	4002464	4002492	AT5G66940	.	-	3.37e-10	0.000104
 54 | Chr1	4002465	4002485	OBP3	.	-	1.5e-07	0.00349
 55 | Chr1	4002465	4002485	Adof1	.	+	2.53e-09	0.000917
 56 | Chr1	4002466	4002484	dof4.2	.	+	3.04e-07	0.0263
 57 | Chr1	4002466	4002486	OBP3	.	-	2.03e-07	0.00408
 58 | Chr1	4002466	4002492	AT1G69570	.	-	8.94e-08	0.00258
 59 | Chr1	4002467	4002480	PI	.	+	3.73e-06	0.121
 60 | Chr1	4002467	4002487	OBP3	.	-	3.69e-07	0.00546
 61 | Chr1	4002468	4002488	OBP3	.	-	4.62e-07	0.00611
 62 | Chr1	4002469	4002489	OBP3	.	-	6.01e-07	0.00699
 63 | Chr1	4002470	4002490	AT2G28810	.	-	6.95e-08	0.00437
 64 | Chr1	4002470	4002490	OBP3	.	-	3.72e-08	0.0017
 65 | Chr1	4002470	4002490	AT5G02460	.	+	2.51e-08	0.00146
 66 | Chr1	4002471	4002491	OBP1	.	-	5.85e-08	0.00332
 67 | Chr1	4002472	4002490	AT3G45610	.	-	9.86e-06	0.0968
 68 | Chr1	4002574	4002588	ATHB34	.	-	4.56e-06	0.167
 69 | Chr1	4002662	4002675	OBP4	.	-	1.92e-06	0.121
 70 | Chr1	4002663	4002683	COG1	.	-	3.89e-06	0.0492
 71 | Chr1	4002665	4002675	AT3G52440	.	-	3.89e-06	0.413
 72 | Chr1	4002665	4002675	DAG2	.	-	1.35e-06	0.244
 73 | Chr1	4002665	4002683	AT3G45610	.	+	6.95e-06	0.0849
 74 | Chr1	4003265	4003285	COG1	.	-	6.22e-06	0.0592
 75 | Chr1	4003265	4003291	AT1G69570	.	+	6.17e-07	0.00778
 76 | Chr1	4003267	4003285	AT3G45610	.	+	9.86e-06	0.0968
 77 | Chr1	4003273	4003293	Adof1	.	-	5.11e-07	0.0113
 78 | Chr1	4003360	4003380	ATHB40	.	+	3.79e-06	0.371
 79 | Chr1	4003363	4003373	ATHB53	.	+	3.89e-06	0.497
 80 | Chr1	4003363	4003373	ATHB20	.	+	3.27e-06	0.401
 81 | Chr1	4003363	4003373	ATHB13	.	+	1.82e-06	0.435
 82 | Chr1	4003366	4003375	ATHB23	.	+	7.3e-06	0.138
 83 | Chr1	4003367	4003379	ZHD1	.	+	8.36e-06	0.148
 84 | Chr1	4003368	4003377	ATHB23	.	-	2.43e-06	0.0816
 85 | Chr1	4003439	4003451	ZHD1	.	+	5.09e-06	0.145
 86 | Chr1	4003440	4003452	ZHD1	.	-	2.47e-06	0.124
 87 | Chr1	4003442	4003451	ATHB23	.	+	2.43e-06	0.0816
 88 | Chr1	4003442	4003456	ATHB34	.	+	1.68e-07	0.0503
 89 | Chr1	4003443	4003455	ZHD1	.	+	4.49e-07	0.0757
 90 | Chr1	4003443	4003457	ATHB34	.	-	1.68e-07	0.0503
 91 | Chr1	4003444	4003453	ATHB23	.	-	2.43e-06	0.0816
 92 | Chr1	4003444	4003456	ZHD1	.	-	4.49e-07	0.0757
 93 | Chr1	4003446	4003455	ATHB23	.	+	2.43e-06	0.0816
 94 | Chr1	4003446	4003460	ATHB34	.	+	1.68e-07	0.0503
 95 | Chr1	4003447	4003459	ZHD1	.	+	4.49e-07	0.0757
 96 | Chr1	4003447	4003461	ATHB34	.	-	5.3e-07	0.0728
 97 | Chr1	4003447	4003461	ZHD6	.	-	5.02e-06	0.19
 98 | Chr1	4003448	4003457	ATHB23	.	-	2.43e-06	0.0816
 99 | Chr1	4003448	4003460	ZHD1	.	-	4.49e-07	0.0757
100 | Chr1	4003450	4003459	ATHB23	.	+	2.43e-06	0.0816
101 | Chr1	4003452	4003461	ATHB23	.	-	7.3e-06	0.138
102 | Chr1	4003579	4003599	OBP3	.	+	1.77e-07	0.0038
103 | Chr1	4003579	4003599	Adof1	.	-	2.54e-08	0.00256
104 | Chr1	4003580	4003598	dof4.2	.	-	7.74e-06	0.0818
105 | Chr1	4003582	4003608	AT1G69570	.	+	5.08e-07	0.00696
106 | Chr1	4003584	4003604	OBP3	.	+	1.72e-07	0.00374
107 | Chr1	4003600	4003628	AT5G66940	.	+	1.12e-06	0.0102
108 | Chr1	4003602	4003622	AT5G02460	.	-	1.49e-06	0.0153
109 | Chr1	4003607	4003627	Adof1	.	-	5.57e-07	0.0118
110 | Chr1	4003610	4003630	COG1	.	-	7.99e-06	0.0649
111 | Chr1	4003664	4003684	RAP212	.	+	3.07e-08	0.000483
112 | Chr1	4003664	4003684	ERF9	.	+	1.45e-08	0.000157
113 | Chr1	4003665	4003685	LEP	.	-	4.68e-09	7.99e-05
114 | Chr1	4003667	4003687	RAP212	.	+	5.88e-08	0.00073
115 | Chr1	4003667	4003687	ERF9	.	+	5.02e-09	7.55e-05
116 | Chr1	4003668	4003679	CBF1	.	-	2.11e-06	0.413
117 | Chr1	4003668	4003686	ABR1	.	+	7.15e-08	0.00108
118 | Chr1	4003668	4003688	LEP	.	-	1.78e-09	4.03e-05
119 | Chr1	4003669	4003681	RAP21	.	+	2.92e-06	0.0754
120 | Chr1	4003669	4003683	AT4G16750	.	-	1.84e-07	0.0115
121 | Chr1	4003670	4003684	AT5G67000	.	+	9.22e-07	0.0127
122 | Chr1	4003670	4003684	CEJ1	.	-	8.85e-07	0.015
123 | Chr1	4003670	4003684	AT1G44830	.	-	4.22e-07	0.00624
124 | Chr1	4003670	4003684	AT1G75490	.	-	3.01e-07	0.003
125 | Chr1	4003670	4003688	ESE3	.	+	4.53e-08	0.000528
126 | Chr1	4003670	4003690	RAP212	.	+	1.98e-07	0.00165
127 | Chr1	4003671	4003684	AT1G36060	.	+	8.82e-06	0.121
128 | Chr1	4003671	4003685	AT5G18450	.	+	4.52e-07	0.00453
129 | Chr1	4003671	4003689	ERF104	.	-	4.57e-09	6.88e-05
130 | Chr1	4003671	4003690	DREB26	.	-	7.25e-07	0.0108
131 | Chr1	4003671	4003690	AT4G28140	.	+	6.38e-09	0.00014
132 | Chr1	4003672	4003686	AT4G16750	.	-	1.06e-06	0.0159
133 | Chr1	4003672	4003686	ERF15	.	-	1.88e-07	0.00152
134 | Chr1	4003672	4003686	ERF105	.	+	1.04e-08	0.000197
135 | Chr1	4003673	4003683	ERF118	.	-	7.65e-08	0.000844
136 | Chr1	4003673	4003684	CRF4	.	-	3.82e-08	0.000617
137 | Chr1	4003673	4003687	AT1G44830	.	-	5.84e-07	0.00709
138 | Chr1	4003673	4003687	RAP211	.	-	2.18e-07	0.00205
139 | Chr1	4003673	4003687	AT1G75490	.	-	1.54e-07	0.00218
140 | Chr1	4003673	4003687	RAP26	.	+	3.11e-08	0.000521
141 | Chr1	4003673	4003687	PUCHI	.	-	1.64e-08	0.000302
142 | Chr1	4003673	4003687	ERF087	.	+	1.23e-08	0.000228
143 | Chr1	4003673	4003687	ERF5	.	-	4.47e-09	0.000101
144 | Chr1	4003673	4003687	ESE1	.	+	2.71e-09	6.43e-05
145 | Chr1	4003673	4003693	ERF2	.	-	1.84e-09	3.48e-05
146 | Chr1	4003674	4003684	AT3G57600	.	-	2.83e-07	0.00301
147 | Chr1	4003674	4003692	ERF104	.	-	6.75e-10	1.67e-05
148 | Chr1	4003674	4003694	ERF10	.	-	5.11e-09	7.85e-05
149 | Chr1	4003675	4003685	ERF3	.	-	7.65e-08	0.000947
150 | Chr1	4003675	4003685	AT2G33710	.	-	7.65e-08	0.000947
151 | Chr1	4003675	4003689	AT4G16750	.	-	1.31e-06	0.0173
152 | Chr1	4003675	4003689	ERF105	.	+	4.06e-08	0.000499
153 | Chr1	4003675	4003689	ERF15	.	-	5.85e-09	0.000126
154 | Chr1	4003675	4003691	AT4G18450	.	+	5.98e-09	0.000101
155 | Chr1	4003676	4003686	ERF118	.	-	7.65e-08	0.000844
156 | Chr1	4003676	4003690	RAP211	.	-	3.47e-07	0.00286
157 | Chr1	4003697	4003711	ERF021	.	-	5.97e-06	0.132
158 | Chr1	4003697	4003717	RAP212	.	+	1.98e-07	0.00165
159 | Chr1	4003698	4003711	AT1G36060	.	+	4.95e-06	0.113
160 | Chr1	4003698	4003718	LEP	.	-	9.25e-09	0.000126
161 | Chr1	4003699	4003711	RAP21	.	+	3.24e-06	0.0756
162 | Chr1	4003699	4003713	AT4G16750	.	-	8.99e-08	0.0115
163 | Chr1	4003700	4003714	AT5G67000	.	+	9.22e-07	0.0127
164 | Chr1	4003700	4003714	CEJ1	.	-	8.85e-07	0.015
165 | Chr1	4003700	4003714	AT1G44830	.	-	4.22e-07	0.00624
166 | Chr1	4003700	4003714	AT1G75490	.	-	3.01e-07	0.003
167 | Chr1	4003700	4003718	ESE3	.	+	8.12e-09	0.000172
168 | Chr1	4003700	4003720	RAP212	.	+	7.38e-09	0.00019
169 | Chr1	4003700	4003720	ERF2	.	-	2.75e-09	4.69e-05
170 | Chr1	4003700	4003720	ERF9	.	+	3.89e-10	1.26e-05
171 | Chr1	4003701	4003714	AT1G36060	.	+	8.82e-06	0.121
172 | Chr1	4003701	4003715	AT5G18450	.	+	4.52e-07	0.00453
173 | Chr1	4003701	4003719	ABR1	.	+	5.63e-09	0.000235
174 | Chr1	4003701	4003719	ERF104	.	-	2.99e-10	9.2e-06
175 | Chr1	4003701	4003720	DREB26	.	-	1.51e-07	0.00495
176 | Chr1	4003701	4003720	AT4G28140	.	+	4.85e-10	2.32e-05
177 | Chr1	4003701	4003721	ERF10	.	-	6.61e-09	9.44e-05
178 | Chr1	4003701	4003721	LEP	.	-	1.04e-09	2.74e-05
179 | Chr1	4003702	4003716	AT4G16750	.	-	1.06e-06	0.0159
180 | Chr1	4003702	4003716	ERF15	.	-	1.88e-07	0.00152
181 | Chr1	4003702	4003716	ERF105	.	+	1.04e-08	0.000197
182 | Chr1	4003702	4003718	AT4G18450	.	+	3.24e-09	6.39e-05
183 | Chr1	4003703	4003713	ERF118	.	-	7.65e-08	0.000844
184 | Chr1	4003703	4003714	CRF4	.	-	3.82e-08	0.000617
185 | Chr1	4003703	4003717	CEJ1	.	-	1.04e-06	0.015
186 | Chr1	4003703	4003717	AT5G67000	.	+	2.14e-07	0.00625
187 | Chr1	4003703	4003717	AT1G44830	.	-	1.31e-07	0.00309
188 | Chr1	4003703	4003717	RAP211	.	-	1.06e-08	0.000349
189 | Chr1	4003703	4003717	RAP26	.	+	1.67e-09	6.56e-05
190 | Chr1	4003703	4003717	ERF087	.	+	1.02e-09	4.07e-05
191 | Chr1	4003703	4003717	PUCHI	.	-	7.31e-10	2.93e-05
192 | Chr1	4003703	4003717	AT1G75490	.	-	1.24e-08	0.000337
193 | Chr1	4003703	4003717	ESE1	.	+	1.98e-10	9.3e-06
194 | Chr1	4003703	4003717	ERF5	.	-	1.98e-10	9.29e-06
195 | Chr1	4003703	4003721	ESE3	.	+	1.43e-08	0.000248
196 | Chr1	4003703	4003723	RAP212	.	+	2.09e-08	0.00037
197 | Chr1	4003703	4003723	ERF2	.	-	8.05e-12	6.1e-07
198 | Chr1	4003703	4003723	ERF9	.	+	1.56e-09	3.3e-05
199 | Chr1	4003704	4003714	AT3G57600	.	-	2.83e-07	0.00301
200 | Chr1	4003704	4003718	AT5G18450	.	+	2.56e-08	0.000707
201 | Chr1	4003704	4003718	AT1G22810	.	+	2.22e-08	0.000651
202 | Chr1	4003704	4003722	ERF104	.	-	7.24e-11	3.51e-06
203 | Chr1	4003704	4003723	AT4G28140	.	+	3.3e-10	1.73e-05
204 | Chr1	4003704	4003723	DREB26	.	-	3.14e-07	0.00693
205 | Chr1	4003704	4003724	LEP	.	-	2.37e-08	0.00024
206 | Chr1	4003704	4003724	ERF10	.	-	2.74e-11	1.87e-06
207 | Chr1	4003705	4003715	ERF3	.	-	7.65e-08	0.000947
208 | Chr1	4003705	4003715	AT2G33710	.	-	7.65e-08	0.000947
209 | Chr1	4003705	4003719	AT4G16750	.	-	5.16e-07	0.0115
210 | Chr1	4003705	4003719	ERF105	.	+	4.38e-10	1.66e-05
211 | Chr1	4003705	4003719	ERF15	.	-	1.98e-10	8.39e-06
212 | Chr1	4003705	4003721	AT4G18450	.	+	7.11e-10	2.28e-05
213 | Chr1	4003706	4003716	ERF118	.	-	7.65e-08	0.000844
214 | Chr1	4003706	4003717	CRF4	.	-	3.82e-08	0.000617
215 | Chr1	4003706	4003720	AT1G75490	.	-	3.68e-07	0.00332
216 | Chr1	4003706	4003720	RAP26	.	+	4.91e-08	0.000723
217 | Chr1	4003706	4003720	PUCHI	.	-	2.43e-08	0.00038
218 | Chr1	4003706	4003720	ESE1	.	+	9.34e-09	0.000168
219 | Chr1	4003706	4003720	RAP211	.	-	2.01e-07	0.00205
220 | Chr1	4003706	4003720	ERF087	.	+	1.57e-08	0.000268
221 | Chr1	4003706	4003720	ERF5	.	-	1.59e-08	0.000252
222 | Chr1	4003706	4003726	ERF2	.	-	5.52e-11	2.61e-06
223 | Chr1	4003707	4003717	AT3G57600	.	-	2.83e-07	0.00301
224 | Chr1	4003707	4003725	ERF104	.	-	2.99e-10	9.2e-06
225 | Chr1	4003707	4003726	DREB26	.	-	2.6e-07	0.00644
226 | Chr1	4003707	4003726	AT4G28140	.	+	6.29e-10	2.82e-05
227 | Chr1	4003707	4003727	AT1G77640	.	+	2.68e-10	0.000394
228 | Chr1	4003707	4003727	ERF10	.	-	4.95e-11	2.8e-06
229 | Chr1	4003708	4003718	ERF3	.	-	7.65e-08	0.000947
230 | Chr1	4003708	4003718	AT2G33710	.	-	7.65e-08	0.000947
231 | Chr1	4003708	4003722	ERF15	.	-	2.86e-08	0.00041
232 | Chr1	4003708	4003724	AT4G18450	.	+	2.07e-09	4.75e-05
233 | Chr1	4003709	4003719	ERF118	.	-	7.65e-08	0.000844
234 | Chr1	4003709	4003723	AT1G75490	.	-	4.69e-07	0.00399
235 | Chr1	4003709	4003723	RAP211	.	-	4.12e-07	0.00318
236 | Chr1	4003709	4003723	ERF5	.	-	3.5e-08	0.000401
237 | Chr1	4003709	4003729	ERF2	.	-	5.71e-09	7.86e-05
238 | Chr1	4003710	4003729	DREB26	.	-	8.17e-07	0.0115
239 | Chr1	4003710	4003729	AT4G28140	.	+	9.14e-09	0.000179
240 | Chr1	4003712	4003732	DREB2	.	+	8.63e-06	0.113
241 | Chr1	4003713	4003727	AT1G01250	.	+	3.39e-06	0.434
242 | Chr1	4003713	4003727	AT1G22810	.	+	1.24e-08	0.000651
243 | Chr1	4003714	4003728	RAP2-1	.	-	8.85e-06	0.405
244 | Chr1	4003714	4003728	TINY	.	-	1.41e-06	0.435
245 | Chr1	4003714	4003728	ERF019	.	-	4e-07	0.171
246 | Chr1	4003715	4003729	AT2G44940	.	+	1.9e-06	0.468
247 | Chr1	4003716	4003727	ERF015	.	-	8.81e-06	0.435
248 | Chr1	4003716	4003729	AT1G36060	.	+	7.16e-06	0.118
249 | Chr1	4003717	4003729	RAP21	.	+	1.69e-06	0.0581
250 | Chr1	4003789	4003798	PIF3	.	+	1.36e-06	0.427
251 | Chr1	4003789	4003800	ABF4	.	+	9.55e-06	0.37
252 | Chr1	4003789	4003800	HYH	.	-	6.67e-06	0.454
253 | Chr1	4003790	4003797	PIF4	.	-	9.9e-06	0.439
254 | Chr1	4003988	4004007	DREB26	.	+	1.08e-06	0.0135
255 | Chr1	4003994	4004008	AT1G44830	.	+	3.45e-08	0.00309
256 | Chr1	4003994	4004008	AT1G75490	.	+	1.3e-08	0.000348
257 | Chr1	4003994	4004008	CEJ1	.	+	1.25e-07	0.0118
258 | Chr1	4003997	4004007	AT3G57600	.	+	9.31e-08	0.00301
259 | Chr1	4003997	4004011	RAP211	.	+	2.4e-08	0.000683
260 | Chr1	4003997	4004016	AT4G28140	.	-	2.68e-10	1.47e-05
261 | Chr1	4003997	4004016	DREB26	.	+	1.57e-08	0.00198
262 | Chr1	4003997	4004017	ERF9	.	-	1.79e-08	0.000181
263 | Chr1	4003999	4004013	AT5G18450	.	-	2.85e-07	0.0039
264 | Chr1	4003999	4004019	AT1G77640	.	-	1.81e-07	0.0145
265 | Chr1	4004003	4004012	ERF6	.	-	4.62e-06	0.159
266 | Chr1	4004003	4004017	CEJ1	.	+	5.77e-07	0.015
267 | Chr1	4004003	4004017	AT1G44830	.	+	1.21e-07	0.00309
268 | Chr1	4004003	4004017	AT1G75490	.	+	5.05e-07	0.00422
269 | Chr1	4004003	4004023	DREB2	.	-	1.59e-06	0.0611
270 | Chr1	4004006	4004018	RAP21	.	-	1.88e-06	0.0606
271 | Chr1	4004006	4004019	AT1G36060	.	-	7.55e-06	0.12
272 | Chr1	4004007	4004021	ERF019	.	+	7.8e-06	0.216
273 | Chr1	4004158	4004178	AT1G77640	.	-	4.17e-06	0.0683
274 | Chr1	4004164	4004175	ERF4	.	+	3.71e-06	0.0782
275 | Chr1	4004164	4004175	ERF11	.	+	2.41e-06	0.0748
276 | Chr1	4004257	4004270	ARF7	.	-	4.3e-06	0.479
277 | Chr1	4004374	4004384	LEC2	.	-	4.99e-07	0.28
278 | Chr1	4004534	4004541	ERF008	.	-	6.67e-06	0.101
279 | Chr1	4004593	4004602	ERF6	.	-	1.33e-06	0.159
280 | Chr1	4004644	4004651	RAP2-3	.	+	6.67e-06	0.07
281 | Chr1	4004644	4004651	RAP2-6	.	+	6.67e-06	0.07
282 | Chr1	4004644	4004651	ERF109	.	+	6.67e-06	0.0701
283 | Chr1	4004644	4004652	ERF069	.	+	5.17e-06	0.0506
284 | Chr1	4004647	4004656	ERF6	.	-	4.62e-06	0.159
285 | Chr1	4004693	4004700	ERF008	.	+	6.67e-06	0.101
286 | Chr1	4004710	4004717	RAP2-3	.	+	6.67e-06	0.07
287 | Chr1	4004710	4004717	RAP2-6	.	+	6.67e-06	0.07
288 | Chr1	4004710	4004717	ERF109	.	+	6.67e-06	0.0701
289 | Chr1	4004710	4004718	ERF069	.	+	5.17e-06	0.0506
290 | Chr1	4004716	4004724	CMTA3	.	+	1.83e-06	0.202
291 | Chr1	4004751	4004771	ERF2	.	+	8.69e-09	0.000108
292 | Chr1	4004752	4004770	ERF104	.	+	4.9e-09	7.24e-05
293 | Chr1	4004753	4004773	ERF10	.	+	1.49e-09	3.27e-05
294 | Chr1	4004754	4004773	DREB26	.	+	1.06e-06	0.0134
295 | Chr1	4004754	4004773	AT4G28140	.	-	3.82e-09	0.0001
296 | Chr1	4004754	4004774	RAP212	.	-	1.35e-07	0.00128
297 | Chr1	4004754	4004774	ERF2	.	+	3.33e-10	9.84e-06
298 | Chr1	4004755	4004769	ERF15	.	+	4.67e-08	0.000571
299 | Chr1	4004755	4004773	ERF104	.	+	2.43e-10	8.01e-06
300 | Chr1	4004756	4004772	AT4G18450	.	-	2.29e-09	4.99e-05
301 | Chr1	4004756	4004774	ESE3	.	-	1.67e-09	6.11e-05
302 | Chr1	4004756	4004776	ERF10	.	+	1.72e-11	1.31e-06
303 | Chr1	4004756	4004776	LEP	.	+	1.64e-11	1.45e-06
304 | Chr1	4004757	4004771	AT1G44830	.	+	5.84e-07	0.00709
305 | Chr1	4004757	4004771	RAP211	.	+	2.18e-07	0.00205
306 | Chr1	4004757	4004771	AT1G75490	.	+	1.54e-07	0.00218
307 | Chr1	4004757	4004771	RAP26	.	-	3.11e-08	0.000521
308 | Chr1	4004757	4004771	PUCHI	.	+	1.64e-08	0.000302
309 | Chr1	4004757	4004771	ERF087	.	-	1.23e-08	0.000228
310 | Chr1	4004757	4004771	ERF5	.	+	4.47e-09	0.000101
311 | Chr1	4004757	4004771	ESE1	.	-	2.71e-09	6.43e-05
312 | Chr1	4004757	4004776	AT4G28140	.	-	8.21e-11	6.99e-06
313 | Chr1	4004757	4004776	DREB26	.	+	1.59e-07	0.00495
314 | Chr1	4004757	4004777	RAP212	.	-	1.53e-10	1.82e-05
315 | Chr1	4004757	4004777	ERF2	.	+	1.03e-12	1.47e-07
316 | Chr1	4004757	4004777	ERF9	.	-	4.22e-13	9.89e-08
317 | Chr1	4004758	4004768	ERF118	.	+	7.65e-08	0.000844
318 | Chr1	4004758	4004772	AT4G16750	.	+	5.16e-07	0.0115
319 | Chr1	4004758	4004772	ERF105	.	-	4.38e-10	1.66e-05
320 | Chr1	4004758	4004772	ERF15	.	+	1.98e-10	8.39e-06
321 | Chr1	4004758	4004776	ABR1	.	-	3.36e-10	3.92e-05
322 | Chr1	4004758	4004776	ERF104	.	+	5.1e-13	8.58e-08
323 | Chr1	4004759	4004769	ERF3	.	+	7.65e-08	0.000947
324 | Chr1	4004759	4004769	AT2G33710	.	+	7.65e-08	0.000947
325 | Chr1	4004759	4004773	AT5G18450	.	-	2.56e-08	0.000707
326 | Chr1	4004759	4004773	AT1G22810	.	-	2.22e-08	0.000651
327 | Chr1	4004759	4004775	AT4G18450	.	-	1e-11	9.64e-07
328 | Chr1	4004759	4004777	ESE3	.	-	5.72e-11	6.48e-06
329 | Chr1	4004759	4004779	AT1G77640	.	-	4.82e-06	0.0722
330 | Chr1	4004759	4004779	ERF10	.	+	4.96e-12	5.43e-07
331 | Chr1	4004759	4004779	LEP	.	+	3.52e-12	5.17e-07
332 | Chr1	4004760	4004770	AT3G57600	.	+	2.83e-07	0.00301
333 | Chr1	4004760	4004771	CRF4	.	+	3.82e-08	0.000617
334 | Chr1	4004760	4004774	CEJ1	.	+	1.04e-06	0.015
335 | Chr1	4004760	4004774	AT5G67000	.	-	2.14e-07	0.00625
336 | Chr1	4004760	4004774	AT1G44830	.	+	1.31e-07	0.00309
337 | Chr1	4004760	4004774	RAP211	.	+	1.06e-08	0.000349
338 | Chr1	4004760	4004774	RAP26	.	-	1.67e-09	6.56e-05
339 | Chr1	4004760	4004774	ERF087	.	-	1.02e-09	4.07e-05
340 | Chr1	4004760	4004774	PUCHI	.	+	7.31e-10	2.93e-05
341 | Chr1	4004760	4004774	AT1G75490	.	+	1.24e-08	0.000337
342 | Chr1	4004760	4004774	ERF5	.	+	1.98e-10	9.29e-06
343 | Chr1	4004760	4004774	ESE1	.	-	1.98e-10	9.3e-06
344 | Chr1	4004760	4004779	DREB26	.	+	2.04e-07	0.00564
345 | Chr1	4004760	4004779	AT4G28140	.	-	3.77e-11	4.5e-06
346 | Chr1	4004760	4004780	RAP212	.	-	1.32e-09	6.24e-05
347 | Chr1	4004760	4004780	ERF2	.	+	1.65e-10	6.05e-06
348 | Chr1	4004760	4004780	ERF9	.	-	4.09e-11	2.53e-06
349 | Chr1	4004761	4004771	ERF118	.	+	7.65e-08	0.000844
350 | Chr1	4004761	4004775	AT4G16750	.	+	5.16e-07	0.0115
351 | Chr1	4004761	4004775	ERF105	.	-	4.38e-10	1.66e-05
352 | Chr1	4004761	4004775	ERF15	.	+	1.98e-10	8.39e-06
353 | Chr1	4004761	4004779	ERF104	.	+	3.62e-12	4.16e-07
354 | Chr1	4004761	4004779	ABR1	.	-	2.51e-09	0.000135
355 | Chr1	4004762	4004772	ERF3	.	+	7.65e-08	0.000947
356 | Chr1	4004762	4004772	AT2G33710	.	+	7.65e-08	0.000947
357 | Chr1	4004762	4004776	AT5G18450	.	-	2.56e-08	0.000707
358 | Chr1	4004762	4004776	AT1G22810	.	-	2.22e-08	0.000651
359 | Chr1	4004762	4004778	AT4G18450	.	-	4.67e-11	3.4e-06
360 | Chr1	4004762	4004780	ESE3	.	-	7.68e-09	0.000166
361 | Chr1	4004762	4004782	LEP	.	+	9.02e-10	2.51e-05
362 | Chr1	4004763	4004773	AT3G57600	.	+	2.83e-07	0.00301
363 | Chr1	4004763	4004774	CRF4	.	+	3.82e-08	0.000617
364 | Chr1	4004763	4004777	CEJ1	.	+	1.04e-06	0.015
365 | Chr1	4004763	4004777	AT5G67000	.	-	2.14e-07	0.00625
366 | Chr1	4004763	4004777	AT1G44830	.	+	1.31e-07	0.00309
367 | Chr1	4004763	4004777	RAP211	.	+	1.06e-08	0.000349
368 | Chr1	4004763	4004777	RAP26	.	-	1.67e-09	6.56e-05
369 | Chr1	4004763	4004777	ERF087	.	-	1.02e-09	4.07e-05
370 | Chr1	4004763	4004777	PUCHI	.	+	7.31e-10	2.93e-05
371 | Chr1	4004763	4004777	AT1G75490	.	+	1.24e-08	0.000337
372 | Chr1	4004763	4004777	ERF5	.	+	1.98e-10	9.29e-06
373 | Chr1	4004763	4004777	ESE1	.	-	1.98e-10	9.3e-06
374 | Chr1	4004763	4004783	RAP212	.	-	3.23e-08	0.000499
375 | Chr1	4004763	4004783	ERF9	.	-	1.12e-08	0.00013
376 | Chr1	4004764	4004774	ERF118	.	+	7.65e-08	0.000844
377 | Chr1	4004764	4004778	AT4G16750	.	+	1.75e-06	0.02
378 | Chr1	4004764	4004778	ERF105	.	-	6.12e-09	0.000131
379 | Chr1	4004764	4004778	ERF15	.	+	2.81e-09	6.94e-05
380 | Chr1	4004764	4004782	ABR1	.	-	8.01e-08	0.00116
381 | Chr1	4004765	4004775	ERF3	.	+	7.65e-08	0.000947
382 | Chr1	4004765	4004775	AT2G33710	.	+	7.65e-08	0.000947
383 | Chr1	4004765	4004779	AT5G18450	.	-	3.55e-07	0.00402
384 | Chr1	4004765	4004779	AT1G22810	.	-	1.88e-07	0.00284
385 | Chr1	4004766	4004776	AT3G57600	.	+	2.83e-07	0.00301
386 | Chr1	4004766	4004777	CRF4	.	+	3.82e-08	0.000617
387 | Chr1	4004766	4004780	RAP211	.	+	2.68e-08	0.000729
388 | Chr1	4004767	4004777	ERF118	.	+	7.65e-08	0.000844
389 | Chr1	4004768	4004778	ERF3	.	+	4.49e-07	0.0035
390 | Chr1	4004768	4004778	AT2G33710	.	+	3.56e-07	0.00302
391 | Chr1	4004769	4004782	AT1G36060	.	-	1.49e-06	0.1
392 | Chr1	4004769	4004782	ERF017	.	-	3.65e-08	0.064
393 | Chr1	4004834	4004841	ERF008	.	+	6.67e-06	0.101
394 | Chr1	4005108	4005128	COG1	.	+	2.8e-06	0.0424
395 | Chr1	4005169	4005181	ZHD1	.	+	4.72e-06	0.14
396 | Chr1	4005172	4005181	ATHB23	.	+	4.87e-06	0.124
397 | Chr1	4005222	4005236	AT3G46070	.	-	7.32e-06	0.429
398 | Chr1	4005552	4005566	ZHD6	.	-	1.56e-07	0.084
399 | Chr1	4005553	4005562	ATHB23	.	-	7.3e-06	0.138
400 | Chr1	4005776	4005785	ATHB23	.	-	9.73e-06	0.153
401 | Chr1	4005779	4005793	ATHB34	.	-	5.15e-06	0.167
402 | Chr1	4005859	4005868	WRKY40	.	+	5.98e-06	0.352
403 | Chr1	4005941	4005969	AT5G66940	.	+	1.72e-06	0.013
404 | Chr1	4006116	4006126	REF6	.	+	6.08e-07	0.182
405 | Chr1	4006183	4006212	BPC5	.	+	3.23e-10	6.32e-06
406 | Chr1	4006187	4006216	BPC5	.	+	1.09e-09	1.79e-05
407 | Chr1	4006188	4006211	BPC1	.	+	6.88e-09	7.67e-05
408 | Chr1	4006189	4006218	BPC5	.	+	9.8e-13	3.88e-08
409 | Chr1	4006191	4006220	BPC5	.	+	8.75e-15	5.32e-10
410 | Chr1	4006192	4006212	BPC6	.	-	4.25e-08	0.000425
411 | Chr1	4006192	4006215	BPC1	.	+	4.19e-12	1.37e-07
412 | Chr1	4006193	4006222	BPC5	.	+	4.26e-16	3.18e-11
413 | Chr1	4006194	4006214	BPC6	.	-	7.6e-12	2.74e-07
414 | Chr1	4006194	4006217	BPC1	.	+	1.88e-13	8.38e-09
415 | Chr1	4006195	4006224	BPC5	.	+	4.25e-16	3.18e-11
416 | Chr1	4006196	4006216	BPC6	.	-	1.28e-09	2.35e-05
417 | Chr1	4006196	4006219	BPC1	.	+	1.36e-13	6.14e-09
418 | Chr1	4006197	4006226	BPC5	.	+	4.26e-16	3.18e-11
419 | Chr1	4006198	4006211	RAMOSA1	.	+	3.63e-08	0.00049
420 | Chr1	4006198	4006218	BPC6	.	-	3.01e-11	8.79e-07
421 | Chr1	4006198	4006221	BPC1	.	+	1.05e-13	4.88e-09
422 | Chr1	4006199	4006228	BPC5	.	+	7.5e-19	8.4e-14
423 | Chr1	4006200	4006213	RAMOSA1	.	+	3.48e-09	7.3e-05
424 | Chr1	4006200	4006220	BPC6	.	-	1.86e-13	8.8e-09
425 | Chr1	4006200	4006223	BPC1	.	+	3.16e-15	2.04e-10
426 | Chr1	4006201	4006230	BPC5	.	+	7.5e-19	8.4e-14
427 | Chr1	4006202	4006215	RAMOSA1	.	+	3.48e-09	7.3e-05
428 | Chr1	4006202	4006222	BPC6	.	-	1.86e-13	8.8e-09
429 | Chr1	4006202	4006225	BPC1	.	+	3.16e-15	2.04e-10
430 | Chr1	4006203	4006232	BPC5	.	+	7.5e-19	8.4e-14
431 | Chr1	4006204	4006217	RAMOSA1	.	+	3.48e-09	7.3e-05
432 | Chr1	4006204	4006224	BPC6	.	-	1.86e-13	8.8e-09
433 | Chr1	4006204	4006227	BPC1	.	+	3.16e-15	2.04e-10
434 | Chr1	4006205	4006234	BPC5	.	+	2.05e-15	1.41e-10
435 | Chr1	4006206	4006219	RAMOSA1	.	+	3.48e-09	7.3e-05
436 | Chr1	4006206	4006226	BPC6	.	-	1.86e-13	8.8e-09
437 | Chr1	4006206	4006229	BPC1	.	+	3.16e-15	2.04e-10
438 | Chr1	4006207	4006236	BPC5	.	+	5.12e-12	1.72e-07
439 | Chr1	4006208	4006221	RAMOSA1	.	+	3.48e-09	7.3e-05
440 | Chr1	4006208	4006228	BPC6	.	-	1.86e-13	8.8e-09
441 | Chr1	4006208	4006231	BPC1	.	+	3.16e-15	2.04e-10
442 | Chr1	4006209	4006238	BPC5	.	+	1.51e-13	7.3e-09
443 | Chr1	4006210	4006223	RAMOSA1	.	+	3.48e-09	7.3e-05
444 | Chr1	4006210	4006230	BPC6	.	-	1.86e-13	8.8e-09
445 | Chr1	4006210	4006233	BPC1	.	+	2.64e-14	1.5e-09
446 | Chr1	4006212	4006225	RAMOSA1	.	+	3.48e-09	7.3e-05
447 | Chr1	4006212	4006232	BPC6	.	-	1.86e-13	8.8e-09
448 | Chr1	4006212	4006235	BPC1	.	+	8.32e-12	2.49e-07
449 | Chr1	4006214	4006227	RAMOSA1	.	+	3.48e-09	7.3e-05
450 | Chr1	4006214	4006234	BPC6	.	-	8.29e-12	2.9e-07
451 | Chr1	4006214	4006237	BPC1	.	+	1.12e-10	2.43e-06
452 | Chr1	4006216	4006229	RAMOSA1	.	+	3.48e-09	7.3e-05
453 | Chr1	4006216	4006236	BPC6	.	-	1.32e-09	2.42e-05
454 | Chr1	4006218	4006231	RAMOSA1	.	+	3.48e-09	7.3e-05
455 | Chr1	4006220	4006233	RAMOSA1	.	+	6.34e-09	0.000117
456 | Chr1	4006312	4006321	ERF7	.	-	2.34e-06	0.113
457 | Chr1	4006312	4006321	ERF8	.	-	2.34e-06	0.115
458 | Chr1	4006312	4006323	ERF11	.	-	6.47e-06	0.0878
459 | Chr1	4006312	4006323	ERF4	.	-	5.49e-06	0.0839
460 | Chr1	4006314	4006321	ERF1B	.	-	8.13e-06	0.12
461 | Chr1	4006314	4006321	ERF13	.	-	8.13e-06	0.12
462 | Chr1	4006327	4006334	ERF008	.	-	6.67e-06	0.101
463 | Chr1	4006339	4006348	ERF7	.	-	9.64e-06	0.118
464 | Chr1	4006432	4006442	AT3G57600	.	+	2.83e-07	0.00301
465 | Chr1	4006432	4006445	AT1G36060	.	-	9.53e-06	0.124
466 | Chr1	4006436	4006443	ERF008	.	+	6.67e-06	0.101
467 | Chr1	4006500	4006509	abi4	.	+	1.84e-06	0.0682
468 | Chr1	4006502	4006509	RAP2-3	.	+	6.67e-06	0.07
469 | Chr1	4006502	4006509	RAP2-6	.	+	6.67e-06	0.07
470 | Chr1	4006502	4006509	ERF109	.	+	6.67e-06	0.0701
471 | Chr1	4006503	4006510	ERF008	.	+	6.67e-06	0.101
472 | Chr1	4006620	4006634	At5g05790	.	+	5.6e-06	0.347
473 | Chr1	4006679	4006699	TRP2	.	-	6.24e-06	0.298
474 | Chr1	4006688	4006706	dof4.2	.	-	8.21e-06	0.0838
475 | Chr1	4006862	4006888	AT1G69570	.	-	1.16e-07	0.003
476 | Chr1	4006868	4006888	COG1	.	+	4.79e-08	0.00795
477 | Chr1	4006870	4006898	AT5G66940	.	-	8.61e-08	0.00226
478 | Chr1	4006871	4006891	Adof1	.	+	3.38e-07	0.00921
479 | Chr1	4006872	4006890	dof4.2	.	+	1.95e-07	0.0235
480 | Chr1	4006872	4006892	Adof1	.	+	1.01e-06	0.0163
481 | Chr1	4006873	4006893	OBP3	.	-	1.05e-07	0.00287
482 | Chr1	4006874	4006894	OBP3	.	-	2.09e-07	0.00412
483 | Chr1	4006875	4006888	PI	.	+	8e-06	0.155
484 | Chr1	4006876	4006889	OBP4	.	+	9.69e-07	0.0923
485 | Chr1	4006876	4006896	AT5G02460	.	+	4.76e-07	0.00809
486 | Chr1	4006876	4006896	AT2G28810	.	-	1.2e-07	0.00572
487 | Chr1	4006877	4006897	OBP1	.	-	1.25e-07	0.00512
488 | Chr1	4007043	4007055	GBF2	.	-	8.58e-06	0.447
489 | Chr1	4007045	4007056	HYH	.	-	4.13e-06	0.359
490 | Chr1	4007046	4007053	PIF4	.	-	9.9e-06	0.439
491 | Chr1	4007283	4007302	DREB26	.	+	1.14e-06	0.0139
492 | Chr1	4007289	4007303	CEJ1	.	+	4.58e-07	0.015
493 | Chr1	4007289	4007303	AT1G44830	.	+	8.94e-08	0.00309
494 | Chr1	4007289	4007303	AT1G75490	.	+	5.64e-08	0.00117
495 | Chr1	4007347	4007361	AT5G18450	.	-	5.01e-07	0.00483
496 | Chr1	4007347	4007367	AT1G77640	.	-	4.46e-06	0.0705
497 | Chr1	4007348	4007361	AT1G36060	.	-	1.45e-06	0.1
498 | Chr1	4007352	4007359	ERF008	.	+	6.67e-06	0.101
499 | 


--------------------------------------------------------------------------------
/lib/features.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import random
  3 | import pandas as pd
  4 | import numpy as np
  5 | from scipy import stats
  6 | from pybedtools import BedTool
  7 | 
  8 | from lib import misc
  9 | 
 10 | 
 11 | class Geneinfo():
 12 |     # Provide Gene infomation
 13 |     def __init__(self):
 14 |         self.gene = "Gene"
 15 |         self.alias = "NA"
 16 |         self.chrom = "chrom"
 17 |         self.start = 0
 18 |         self.end = 1000
 19 |         self.strand = "+"
 20 |         self.binsize = 10
 21 |         self.step = 10
 22 | 
 23 | 
 24 | def kmeans_like_diff(wt, control):
 25 |     # Calculate the k-means-like phenotype difference between mutants and WT
 26 |     ## Do not consider the length of mutations
 27 |     y_avg = np.average(wt)
 28 |     diff = 0
 29 |     for x in control:
 30 |         diff += abs(x - y_avg)
 31 |     diff_score = diff / len(control)
 32 |     return diff_score
 33 | 
 34 | 
 35 | def kmeans_like_diff2(wt, control, binsize):
 36 |     # Calculate the k-means-like phenotype difference between mutants and WT
 37 |     ## Consider the influence of length of mutations
 38 |     y_avg = np.average(wt)
 39 |     diff = 0
 40 |     for x in control:
 41 |         diff += abs(float(x[4]) - y_avg) * ((int(x[2]) - int(x[1]))/binsize)
 42 |     diff_score = diff / len(control)
 43 |     return diff_score
 44 | 
 45 | 
 46 | def openchromatin_scores(geneinfo, bedfile, peakfile = "", outdir = "./", samplename = "openchromatin"):
 47 |     """
 48 |     Generate the open chromatin feature in specific bins.
 49 |     (Alternative data: ATAC-seq, DNase-seq, MNase-seq)
 50 | 
 51 |     Mandatory parameters:
 52 |     1. geneinfo - A class that defines the information of target gene
 53 |     2. bedfile - Open chromatin values in bedGraph format
 54 |     3. peakfile - Enrichment regions called from open chromatin data in BED format
 55 | 
 56 |     Alternative parameters:
 57 |     1. outdir - Output directory for saving the scores file (bedGraph format)
 58 |     """
 59 | 
 60 |     # Get gene info
 61 |     gene = geneinfo.gene
 62 |     genename = geneinfo.alias
 63 |     if genename == "NA":
 64 |         gene_alias = gene
 65 |     else:
 66 |         gene_alias = genename
 67 |     chromosome = geneinfo.chrom
 68 |     binstart = geneinfo.start
 69 |     binstop = geneinfo.end
 70 |     binsize = geneinfo.binsize
 71 |     step = geneinfo.step
 72 |     
 73 |     # Check output directory
 74 |     misc.check_outdir(outdir)
 75 |     if not os.path.exists(bedfile):
 76 |         smooth_openchromatin = {}
 77 |         return smooth_openchromatin
 78 | 
 79 |     # Convert BigWig file to bedGraph file
 80 |     # Load bedGraph file as bed file
 81 |     if peakfile:
 82 |         oc_peak = BedTool(peakfile)
 83 |     oc_score = BedTool(bedfile)
 84 | 
 85 |     # Calculate scores
 86 |     oc_info = []
 87 |     overlap_list = []
 88 |     posinfo = {}
 89 |     for i, pos in enumerate(range(binstart, binstop, step)):
 90 |         posinfo[i] = pos
 91 |         binbed = BedTool("\t".join([chromosome, str(pos), str(pos+binsize)])+"\n", 
 92 |                          from_string=True)
 93 |         score_in_bin = oc_score.intersect(binbed)
 94 |         if peakfile:
 95 |             peak_in_bin = oc_peak.intersect(binbed)
 96 |             overlap = [(int(str(x).split()[2])-int(str(x).split()[1]))/binsize for x in peak_in_bin]
 97 |             if overlap:
 98 |                 overlap = 1
 99 |             else:
100 |                 overlap = 0.5
101 |         else:
102 |             overlap = 0.5
103 |         sclst = [float(str(x).split()[3]) for x in score_in_bin]
104 |         if sum(sclst):
105 |             score = np.average(sclst)
106 |         else:
107 |             score = 0
108 |         oc_info.append(score)
109 |         overlap_list.append(overlap)
110 | 
111 |     # Smooth the scores
112 |     max_score = max(oc_info)
113 |     smooth_openchromatin = {}
114 |     if max_score:
115 |         oc_info = [x*overlap_list[i]/max_score for i,x in enumerate(oc_info)]
116 |     else:
117 |         return smooth_openchromatin
118 |     outf = open(outdir + "/" + gene_alias + "/" + samplename + ".bedGraph", "w")
119 |     for i in posinfo:
120 |         pos = posinfo[i]
121 |         score = oc_info[i]
122 |         smooth_openchromatin[pos] = score
123 |         print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)
124 |     outf.close()
125 | 
126 |     return smooth_openchromatin
127 | 
128 | 
def ptm_scores(geneinfo, bedfile, ocname, outdir = "./", samplename = "PTM", minratio = 0.2):
    """
    Generate the histone modification feature in specific bins.
    (Alternative data: ChIP-seq)

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. bedfile - histone modification values in bedGraph format
    3. ocname - Sample name of an open chromatin result; the file
       outdir/gene/ocname.bedGraph is read and used as per-bin weight

    Alternative parameters:
    1. outdir - Output directory for saving the scores file (bedGraph format)
    2. samplename - Basename of the bedGraph file written below outdir/gene/
    3. minratio - Lower bound applied to the open chromatin weights

    Returns:
    A dict of bin start -> (1 - smoothed PTM score) * open chromatin weight,
    divided by the highest such value.
    """

    # Get gene info
    gene = geneinfo.gene
    genename = geneinfo.alias
    gene_alias = gene if genename == "NA" else genename
    chromosome = geneinfo.chrom
    binstart = geneinfo.start
    binstop = geneinfo.end
    binsize = geneinfo.binsize
    step = geneinfo.step

    # Check output directory
    misc.check_outdir(outdir)

    # Load bedGraph file as bed file
    ptm_score = BedTool(bedfile)

    # Calculate scores
    ptm_info = []
    posinfo = {}
    for i, pos in enumerate(range(binstart, binstop, step)):
        posinfo[i] = pos
        binbed = BedTool("\t".join([chromosome, str(pos), str(pos+binsize)])+"\n",
                         from_string=True)
        ptm_in_bin = ptm_score.intersect(binbed)
        values = [float(str(x).split()[3]) for x in ptm_in_bin]
        # A bin without any record scores 0 (avoids averaging an empty list)
        score = np.average(values) if values else 0
        if pd.isna(score):
            score = 0
        ptm_info.append(score)

    # Scale to the highest bin; without any signal there is nothing to scale
    # and dividing would fail, so the all-zero scores are kept as they are
    max_score = max(ptm_info)
    if max_score:
        ptm_info = [x/max_score for x in ptm_info]

    # Get ratios from open chromatin results
    ocfile = outdir + "/" + gene_alias + "/" + ocname + ".bedGraph"
    oc_ratios = {}
    for interval in BedTool(ocfile):
        _chrom, start, _end, score = str(interval).rstrip().split("\t")
        ratio = float(score)
        oc_ratios[int(start)] = ratio if ratio > minratio else minratio

    # Smooth the scores
    smooth_ptm = misc.smooth_scores_fill2(ptm_info, posinfo)
    # Compute every weighted value before opening the output file so that a
    # bin missing from the open chromatin file does not leave an empty file
    weighted = {pos: (1 - smooth_ptm[pos]) * oc_ratios[pos] for pos in smooth_ptm}
    max_weighted = max(weighted.values())
    with open(outdir + "/" + gene_alias + "/" + samplename + ".bedGraph", "w") as outf:
        for pos, value in weighted.items():
            score = value / max_weighted if max_weighted else value
            smooth_ptm[pos] = score
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)

    return smooth_ptm
201 | 
202 | 
def merge_reps(geneinfo, feature_list, outdir = "./", samplename = "merged"):
    """
    Merge the NGS feature in specific bins.
    (Alternative data: DNase-seq, ATAC-seq, ChIP-seq)

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. feature_list - A list contains features need to be merged
       (dicts of bin start -> score; empty features are ignored)

    Alternative parameters:
    1. outdir - Output directory for saving the scores file (bedGraph format)
    2. samplename - Basename of the bedGraph file written below outdir/gene/

    Returns:
    A dict of bin start -> summed score divided by the highest sum, keyed by
    the bins of the first non-empty feature. Empty when every feature is empty.
    """

    # Get gene info
    gene = geneinfo.gene
    genename = geneinfo.alias
    gene_alias = gene if genename == "NA" else genename
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)

    scores_merge = {}
    features = [feature for feature in feature_list if feature]
    if not features:
        return scores_merge

    # Bin positions come from the first feature that actually has data, not
    # from feature_list[0], which may be one of the skipped empty features
    positions = list(features[0])
    scorelist = [features[0][pos] for pos in positions]
    # Remaining features are added bin by bin, in their own iteration order
    for feature in features[1:]:
        for i, pos in enumerate(feature):
            scorelist[i] += feature[pos]

    # Hoisted out of the loop; a zero maximum leaves the sums unscaled
    max_score = max(scorelist)
    with open(outdir + "/" + gene_alias + "/" + samplename + ".bedGraph", "w") as outf:
        for pos, total in zip(positions, scorelist):
            score = total / max_score if max_score else total
            scores_merge[pos] = score
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)

    return scores_merge
253 | 
254 | 
def motif_scores(geneinfo, bedfile, outdir = "./", flanking = 3):
    """
    Generate the TF motifs feature in specific bins.
    (Alternative data: Motif sites calculated by FIMO with PlantTFDB/JASPAR PWM files)

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. bedfile - Motif positions in BED format

    Alternative parameters:
    1. outdir - Output directory for saving the scores file (bedGraph format)
    2. flanking - Number of bins on each side included when averaging the
       motif density around a bin

    Returns:
    A dict of bin start -> smoothed motif density divided by the highest
    value; also written to outdir/gene/motifs.bedGraph.
    """

    # Get gene info
    gene = geneinfo.gene
    genename = geneinfo.alias
    gene_alias = gene if genename == "NA" else genename
    chromosome = geneinfo.chrom
    binstart = geneinfo.start
    binstop = geneinfo.end
    binsize = geneinfo.binsize
    step = geneinfo.step

    # Check output directory
    misc.check_outdir(outdir)

    # Load bed file
    tf_motif = BedTool(bedfile)

    # Per-bin density: span from the leftmost start to the rightmost end of
    # the intersected motif pieces, relative to the bin size
    motif_density = []
    posinfo = {}
    for i, pos in enumerate(range(binstart, binstop, step)):
        posinfo[i] = pos
        binbed = BedTool("\t".join([chromosome, str(pos), str(pos+binsize)])+"\n",
                         from_string=True)
        bounds = []
        for x in tf_motif.intersect(binbed):
            fields = str(x).split()
            bounds.append(int(fields[1]))
            bounds.append(int(fields[2]))
        if bounds:
            motif_density.append((max(bounds) - min(bounds)) / binsize)
        else:
            motif_density.append(0)

    # Average each bin with its neighbours; the window is clipped at both
    # ends and the divisor is the number of bins really inside it
    motif_info = []
    for i in range(len(motif_density)):
        window = motif_density[max(0, i-flanking):i+flanking+1]
        motif_info.append(sum(window) / len(window))

    # Smooth the scores
    smooth_motif = misc.smooth_scores_fill2(motif_info, posinfo, minratio=1)
    max_score = max(smooth_motif.values())
    with open(outdir + "/" + gene_alias + "/motifs.bedGraph", "w") as outf:
        for pos in smooth_motif:
            # Without any motif the maximum is 0; keep the zeros unscaled
            score = smooth_motif[pos] / max_score if max_score else smooth_motif[pos]
            smooth_motif[pos] = score
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)

    return smooth_motif
325 | 
326 | 
def cns_scores(geneinfo, bedfile, outdir = "./"):
    """
    Build the sequence conservation feature over the bins of the target gene.
    (Alternative data: Phastcons scores)

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. bedfile - Conservation scores in BED format (value in the 4th column)

    Alternative parameters:
    1. outdir - Output directory for saving the scores file (bedGraph format)

    Returns:
    The result of misc.smooth_scores2 for the per-bin averages, also written
    to outdir/gene/CNS.bedGraph. An empty dict when bedfile does not exist.
    """

    # Collect everything needed from the gene description
    gene = geneinfo.gene
    alias = geneinfo.alias
    gene_alias = gene if alias == "NA" else alias
    chromosome = geneinfo.chrom
    first = geneinfo.start
    last = geneinfo.end
    binsize = geneinfo.binsize
    stride = geneinfo.step

    # Make sure the output directory is usable, then bail out without data
    misc.check_outdir(outdir)
    if not os.path.exists(bedfile):
        return {}

    cns = BedTool(bedfile)

    # Bin index -> bin start
    posinfo = dict(enumerate(range(first, last, stride)))

    # Average conservation value of the records falling into each bin
    cns_info = []
    for pos in posinfo.values():
        window = BedTool("\t".join((chromosome, str(pos), str(pos+binsize)))+"\n",
                         from_string=True)
        values = [float(str(hit).split()[3]) for hit in cns.intersect(window)]
        cns_info.append(np.average(values) if sum(values) else 0)

    # Smooth and write out
    smooth_cns = misc.smooth_scores2(cns_info, posinfo)
    outpath = outdir + "/" + gene_alias + "/CNS.bedGraph"
    with open(outpath, "w") as outf:
        for pos in smooth_cns:
            print(chromosome, pos, pos+binsize, smooth_cns[pos], sep="\t", file=outf)

    return smooth_cns
386 | 
387 | 
def genopheno_scores(geneinfo, bedfile, outdir = "./"):
    """
    Generate the genotype and phenotype relationship feature in specific bins.
    (Alternative data: SNPs&Indels and Phenotype data)

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. bedfile - genotype and phenotype relationship scores in BED format
       (the last column of each record is used as its value)

    Alternative parameters:
    1. outdir - Output directory for saving the scores file (bedGraph format)

    Returns:
    A dict of bin start -> smoothed summed value divided by the highest
    value; also written to outdir/gene/genopheno.bedGraph.
    """

    # Get gene info
    gene = geneinfo.gene
    genename = geneinfo.alias
    gene_alias = gene if genename == "NA" else genename
    chromosome = geneinfo.chrom
    binstart = geneinfo.start
    binstop = geneinfo.end
    binsize = geneinfo.binsize
    step = geneinfo.step

    # Check output directory
    misc.check_outdir(outdir)

    # Load bed file
    genopheno = BedTool(bedfile)

    # Calculate scores: sum of the values of all records touching the bin
    genopheno_info = []
    posinfo = {}
    for i, pos in enumerate(range(binstart, binstop, step)):
        posinfo[i] = pos
        binbed = BedTool("\t".join([chromosome, str(pos), str(pos+binsize)])+"\n",
                         from_string=True)
        genopheno_in_bin = genopheno.intersect(binbed)
        values = [float(str(x).split()[-1]) for x in genopheno_in_bin]
        genopheno_info.append(sum(values))

    # Smooth the scores
    smooth_genopheno = misc.smooth_scores_fill2(genopheno_info, posinfo)
    max_score = max(smooth_genopheno.values())
    with open(outdir + "/" + gene_alias + "/genopheno.bedGraph", "w") as outf:
        for pos in smooth_genopheno:
            # No variant in the whole region gives a maximum of 0; keep the
            # zeros unscaled instead of dividing by zero
            score = smooth_genopheno[pos] / max_score if max_score else smooth_genopheno[pos]
            smooth_genopheno[pos] = score
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)

    return smooth_genopheno
447 | 
448 | 
def aggregate_scores(geneinfo, scorelist, weightlist, outdir = "./"):
    """
    Generate the aggregate score in specific bins.

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. scorelist - A list host multiple feature scores from different data
       (dicts of bin start -> score; empty dicts contribute nothing)
    3. weightlist - A list contains different weights assigned to different features
       (Should have the same order and numbers as scorelist)

    Alternative parameters:
    1. outdir - Output directory for saving the scores file (bedGraph format)

    Returns:
    A dict of bin start -> weighted average of the feature scores. The
    returned values are NOT rescaled; only the values written to
    outdir/gene/aggregate.bedGraph are divided by the highest aggregate.
    The file is created but left empty when there is nothing to aggregate
    or the highest aggregate is 0.
    """

    # Get gene info
    gene = geneinfo.gene
    genename = geneinfo.alias
    gene_alias = gene if genename == "NA" else genename
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)

    # The context manager closes the file on every exit path, including the
    # two early returns below
    with open(outdir + "/" + gene_alias + "/aggregate.bedGraph", "w") as outf:
        # Calculate scores
        aggregate_info = {}
        total = sum(weightlist)
        for scores, weight in zip(scorelist, weightlist):
            for pos in scores:
                aggregate = scores[pos] * weight / total
                if pos in aggregate_info:
                    aggregate_info[pos] += aggregate
                else:
                    aggregate_info[pos] = aggregate

        if not aggregate_info:
            # Report the gene for which no feature produced any score
            print(gene_alias)
            return aggregate_info
        max_score = max(aggregate_info.values())
        if not max_score:
            return aggregate_info

        for pos in aggregate_info:
            score = aggregate_info[pos] / max_score
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)

    return aggregate_info
503 | 
504 | 
def phenodata_scores(geneinfo, bedfile, method = "kmeans1", outdir = "./", randbg = 0.02):
    """
    Calculate the average phenodata value from multiple samples in specific bins.

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. bedfile - Phenotype data of mutants in BED format
    chrom start end samplename avg_value
    (records whose samplename is "WT" are the wild type reference)

    Alternative parameters:
    1. method - Methods used for calculating phenotype difference between WT and mutants
    ["ratio", "stdev", "utest", "kmeans1", "kmeans2"]
    2. outdir - Output directory for saving the scores file (bedGraph format)
    3. randbg - Half-width of the uniform random offset added to every
       non-zero scaled score (the random generator is seeded with 81)

    Returns:
    A dict of bin start -> smoothed score rescaled between the smallest
    non-zero and the largest smoothed value; bins without score stay 0.

    Raises:
    ValueError if method is not one of the available methods.
    """

    # Reject an unknown method up front; previously this was only printed and
    # the bin score was left undefined
    methods = ["ratio", "stdev", "utest", "kmeans1", "kmeans2"]
    if method not in methods:
        raise ValueError("Cannot find this method. Available methods are: %s" % methods)

    # Get gene info
    gene = geneinfo.gene
    genename = geneinfo.alias
    gene_alias = gene if genename == "NA" else genename
    chromosome = geneinfo.chrom
    binstart = geneinfo.start
    binstop = geneinfo.end
    binsize = geneinfo.binsize
    step = geneinfo.step

    # Check output directory
    misc.check_outdir(outdir)

    # Load bed file
    phenodata = BedTool(bedfile)

    # Calculate scores
    phenoinfo = []
    posinfo = {}
    for i, pos in enumerate(range(binstart, binstop, step)):
        posinfo[i] = pos
        binbed = BedTool("\t".join([chromosome, str(pos), str(pos+binsize)])+"\n",
                         from_string=True)
        # Parse the overlapping records once and split them into WT / mutants
        records = [str(x).split() for x in phenodata.intersect(binbed)]
        mutants = [rec for rec in records if rec[3] != "WT"]
        mutant_phenos = [float(rec[4]) for rec in mutants]
        wt_phenos = [float(rec[4]) for rec in records if rec[3] == "WT"]
        if not mutant_phenos:
            score = 0
        elif method == "ratio":
            score = np.average(mutant_phenos) / np.average(wt_phenos)
        elif method == "stdev":
            score = np.std(wt_phenos + mutant_phenos)
        elif method == "utest":
            mannwhitneyu = stats.mannwhitneyu(wt_phenos, mutant_phenos)
            score = -np.log10(mannwhitneyu[1])
        elif method == "kmeans1":
            score = kmeans_like_diff(wt_phenos, mutant_phenos)
        else:
            # "kmeans2" works on the first five BED fields of each mutant record
            score = kmeans_like_diff2(wt_phenos, [rec[:5] for rec in mutants], binsize)
        phenoinfo.append(score)

    # Scale to the highest bin and add a small reproducible random offset to
    # every bin that has a score; bins without score are left untouched
    max_score = max(phenoinfo)
    random.seed(81)
    phenoinfo = [max(x/max_score+random.uniform(-randbg, randbg), 0) if x else x for x in phenoinfo]

    # Output raw scores of phenotypes
    rawpath = outdir + "/" + gene_alias + "/phenoscores_" + method + "_raw.bedGraph"
    with open(rawpath, "w") as outraw:
        for i in posinfo:
            pos = posinfo[i]
            print(chromosome, pos, pos+binsize, phenoinfo[i], sep="\t", file=outraw)

    # Output smooth and gap-filled scores of phenotypes
    smooth_phenos = misc.smooth_scores1(phenoinfo, posinfo)
    max_score = max(smooth_phenos.values())
    # default=0 keeps an all-zero result from failing; min_score is only used
    # for non-zero bins below
    min_score = min((x for x in smooth_phenos.values() if x), default=0)
    outpath = outdir + "/" + gene_alias + "/phenoscores_" + method + ".bedGraph"
    with open(outpath, "w") as outf:
        for pos in smooth_phenos:
            if smooth_phenos[pos]:
                # NOTE(review): when all non-zero bins share one value the span
                # (max_score - min_score) is 0 - confirm the desired output
                score = (smooth_phenos[pos] - min_score) / (max_score - min_score)
            else:
                score = 0
            smooth_phenos[pos] = score
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)

    return smooth_phenos
595 | 
596 | 
def define_key_regions(geneinfo, aggregate, phenodata, threshold = 0, outdir = "./"):
    """
    Define the key regions of the target site.

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. aggregate - Aggregate scores (mapping of bin start -> score)
    3. phenodata - Path of the phenotype data file; the statistics are only
       calculated when this file exists

    Alternative parameters:
    1. threshold - Bin with score at or above the threshold is defined as a key region
       (Default 0: use the average of aggregate scores)
    2. outdir - Output directory for saving the result files

    Outputs (written below outdir/gene/):
    1. key_regions_raw.bed - Every bin passing the cutoff
    2. key_regions_merged.bed - Result of misc.merge_regions for these bins
    3. plot_scores.txt - Per-sample group, ratio and difference (phenodata only)
    4. statistics.txt - Cutoff, differential ratio and test p-values (phenodata only)

    Returns:
    A list of [bin start, score] for every bin passing the cutoff.
    """

    # Get gene info
    gene = geneinfo.gene
    genename = geneinfo.alias
    gene_alias = gene if genename == "NA" else genename
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)

    # Define the cutoff
    if threshold:
        cutoff = threshold
        cutoff_dev = 0
    else:
        aggregate_values = list(aggregate.values())
        cutoff_dev = np.std(aggregate_values)
        cutoff = np.average(aggregate_values)

    # Bins at or above the cutoff are the key regions
    key_regions = [[pos, aggregate[pos]] for pos in aggregate if aggregate[pos] >= cutoff]

    # Output key regions info; both files are closed before new_stats reads
    # key_regions_merged.bed further down
    merged_regions = misc.merge_regions(key_regions, geneinfo)
    raw_file = outdir + "/" + gene_alias + "/key_regions_raw.bed"
    merged_file = outdir + "/" + gene_alias + "/key_regions_merged.bed"
    with open(raw_file, "w") as outregion1, open(merged_file, "w") as outregion2:
        for pos, score in key_regions:
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outregion1)
        for lst in merged_regions:
            print("\t".join(map(str, lst)), file=outregion2)

    if not os.path.exists(phenodata):
        print("No Phenotype data detected, output key regions.")
        return key_regions

    # Calculate statistical values
    plot_file = outdir + "/" + gene_alias + "/plot_scores.txt"
    stat_file = outdir + "/" + gene_alias + "/statistics.txt"
    with open(plot_file, "w") as outf, open(stat_file, "w") as outstat:
        print("sample", "group", "ratio", "difference", sep="\t", file=outf)
        # Cutoff of key regions definition
        print("Cutoff for defining key regions: %s" % cutoff, file=outstat)
        print("Cutoff deviation: %s" % cutoff_dev, file=outstat)

        # Calculate difference; each entry is (difference, ratio, sample)
        pheno_all = new_stats(geneinfo, phenodata, outdir = outdir)
        ratios = [x[1] for x in pheno_all]
        mean_ratio = np.average(ratios)
        min_ratio = min(ratios)
        max_ratio = max(ratios)

        # Split the samples at the mean ratio into a high and a low group
        high_edited = []
        high_edited2 = []
        low_edited = []
        low_edited2 = []
        for diff, ratio, sample in pheno_all:
            # NOTE(review): the difference is rescaled with the min/max of the
            # ratios, not of the differences - kept as is, confirm intent
            rescaled = (diff-min_ratio)/(max_ratio-min_ratio)
            if ratio > mean_ratio:
                high_edited.append(diff)
                high_edited2.append(rescaled)
                print(sample, "high", ratio, diff, sep="\t", file=outf)
            else:
                low_edited.append(diff)
                low_edited2.append(rescaled)
                print(sample, "low", ratio, diff, sep="\t", file=outf)

        phe_high = np.average(high_edited2)
        phe_low = np.average(low_edited2)
        phe_ratio = phe_high / phe_low
        phe_pvalue = stats.mannwhitneyu(low_edited, high_edited)
        phe_pvalue2 = stats.ks_2samp(low_edited, high_edited, alternative="greater")
        phe_pvalue3 = stats.f_oneway(low_edited, high_edited)

        # Same report on the console first, then in the statistics file
        report = [
            ("Phenotype differential ratio:", phe_ratio),
            ("Phenotype significance (U test):", phe_pvalue[1]),
            ("Phenotype significance (KS test):", phe_pvalue2[1]),
            ("Phenotype significance (ANOVA):", phe_pvalue3[1]),
        ]
        for label, value in report:
            print(label, value)
        for label, value in report:
            print(label, value, file=outstat)

    return key_regions
715 | 
716 | 
def new_stats(geneinfo, phenodata, outdir = "./", side="both"):
    """
    Relate the phenotype change of every mutant to its overlap with the key regions.

    Mandatory parameters:
    1. geneinfo - A class that defines the information of target gene
    2. phenodata - Phenotype data in BED format
       (4th column: sample name, 5th column: phenotype value; the record
       named "WT" provides the reference value and may appear anywhere)

    Alternative parameters:
    1. outdir - Directory holding gene/key_regions_merged.bed
    2. side - "none" uses the absolute difference to WT, any other value
       keeps the signed difference

    Returns:
    A list of (difference, ratio, sample) sorted by ratio, highest first.
    ratio is the fraction of the key regions overlapped by the sample,
    divided by the highest such fraction. All differences are negated when
    their average is negative.

    Raises:
    ValueError if phenodata holds no "WT" record.
    """

    pheno_bed = BedTool(phenodata)
    gene = geneinfo.gene
    genename = geneinfo.alias
    gene_alias = gene if genename == "NA" else genename
    key_regions = outdir + "/" + gene_alias + "/key_regions_merged.bed"
    region_bed = BedTool(key_regions)

    # Total length of all key regions
    regionlens = 0
    for region in region_bed:
        fields = str(region).split()
        regionlens += int(fields[2]) - int(fields[1])

    # wao=True appends the number of overlapping bases as the last column
    intersect = pheno_bed.intersect(region_bed, wao=True)
    wt_value = None
    raw_pheno = {}
    coverage = {}
    for interval in intersect:
        info = str(interval).rstrip().split("\t")
        sample = info[3]
        pheno = float(info[4])
        if sample == "WT":
            wt_value = pheno
            continue
        # The first value seen for a sample is its phenotype
        raw_pheno.setdefault(sample, pheno)
        coverage[sample] = coverage.get(sample, 0) + int(info[-1]) / regionlens

    # The difference to WT is taken after the whole file has been read, so
    # the result no longer depends on WT preceding the mutants
    if wt_value is None:
        raise ValueError('No "WT" record found in %s' % phenodata)
    sample_values = {}
    for sample, pheno in raw_pheno.items():
        phenoscore = pheno - wt_value
        if side == "none":
            phenoscore = abs(phenoscore)
        sample_values[sample] = [phenoscore, coverage[sample]]

    max_ratio = max([x[1] for x in sample_values.values()])
    mean_pheno = np.average([x[0] for x in sample_values.values()])
    # Orient the differences so that their average is not negative
    if mean_pheno < 0:
        for s in sample_values:
            sample_values[s][0] *= -1
    scores_list = sorted([(sample_values[s][0], sample_values[s][1]/max_ratio, s) for s in sample_values],
                         key=lambda x:x[1], reverse=True)

    return scores_list
755 | 
756 | 


--------------------------------------------------------------------------------