├── Jazzlib
├── __init__.py
├── Peak.pyc
├── sta.pyc
├── FRegion.pyc
├── Hotspot.pyc
├── bgcount.pyc
├── cEM_zip.so
├── jazzio.pyc
├── kernel.pyc
├── region.pyc
├── __init__.pyc
├── countreads.pyc
├── localmax.pyc
├── peakcount.pyc
├── peaksscan.pyc
├── randombg.pyc
├── hotspotsscan.pyc
├── kernelsmooth.pyc
├── readscounter.pyc
├── hotspotscount.pyc
├── normalize_ratio.pyc
├── cEM_zip.cpython-36m-darwin.so
├── cEM_zip.cpython-37m-darwin.so
├── __pycache__
│ ├── Peak.cpython-36.pyc
│ ├── sta.cpython-36.pyc
│ ├── FRegion.cpython-36.pyc
│ ├── FRegion.cpython-37.pyc
│ ├── Hotspot.cpython-36.pyc
│ ├── jazzio.cpython-36.pyc
│ ├── kernel.cpython-36.pyc
│ ├── region.cpython-36.pyc
│ ├── __init__.cpython-36.pyc
│ ├── __init__.cpython-37.pyc
│ ├── localmax.cpython-36.pyc
│ ├── peaksscan.cpython-36.pyc
│ ├── randombg.cpython-36.pyc
│ ├── countreads.cpython-36.pyc
│ ├── countreads.cpython-37.pyc
│ ├── hotspotsscan.cpython-36.pyc
│ ├── kernelsmooth.cpython-36.pyc
│ └── normalize_ratio.cpython-36.pyc
├── cEM_zip.cpython-36m-x86_64-linux-gnu.so
├── Peak.py
├── Hotspot.py
├── kernel.py
├── cEM_zip.pyx
├── normalize_ratio.py
├── normalize_ratio.py.bak
├── kernelsmooth.py
├── kernelsmooth.py.bak
├── region.py
├── region.py.bak
├── jazzio.py.bak
├── jazzio.py
├── randombg.py
├── randombg.py.bak
├── localmax.py
├── sta.py
├── localmax.py.bak
├── sta.py.bak
├── FRegion.py.bak
├── FRegion.py
├── Jazz.py
├── Jazz.py.bak
├── bgcount.py
├── bgcount.py.bak
├── hotspotsscan.py
├── hotspotsscan.py.bak
├── peaksscan.py
└── peaksscan.py.bak
├── requirements.txt
├── .idea
├── .gitignore
├── vcs.xml
├── inspectionProfiles
│ └── profiles_settings.xml
├── modules.xml
├── misc.xml
└── Jazz.iml
├── .DS_Store
├── setup.py
├── Readme.txt
└── Jazz.py
/Jazzlib/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Cython
2 | numpy
3 | pysam
4 | scipy
5 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Default ignored files
3 | /workspace.xml
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/.DS_Store
--------------------------------------------------------------------------------
/Jazzlib/Peak.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/Peak.pyc
--------------------------------------------------------------------------------
/Jazzlib/sta.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/sta.pyc
--------------------------------------------------------------------------------
/Jazzlib/FRegion.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/FRegion.pyc
--------------------------------------------------------------------------------
/Jazzlib/Hotspot.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/Hotspot.pyc
--------------------------------------------------------------------------------
/Jazzlib/bgcount.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/bgcount.pyc
--------------------------------------------------------------------------------
/Jazzlib/cEM_zip.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/cEM_zip.so
--------------------------------------------------------------------------------
/Jazzlib/jazzio.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/jazzio.pyc
--------------------------------------------------------------------------------
/Jazzlib/kernel.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/kernel.pyc
--------------------------------------------------------------------------------
/Jazzlib/region.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/region.pyc
--------------------------------------------------------------------------------
/Jazzlib/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__init__.pyc
--------------------------------------------------------------------------------
/Jazzlib/countreads.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/countreads.pyc
--------------------------------------------------------------------------------
/Jazzlib/localmax.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/localmax.pyc
--------------------------------------------------------------------------------
/Jazzlib/peakcount.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/peakcount.pyc
--------------------------------------------------------------------------------
/Jazzlib/peaksscan.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/peaksscan.pyc
--------------------------------------------------------------------------------
/Jazzlib/randombg.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/randombg.pyc
--------------------------------------------------------------------------------
/Jazzlib/hotspotsscan.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/hotspotsscan.pyc
--------------------------------------------------------------------------------
/Jazzlib/kernelsmooth.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/kernelsmooth.pyc
--------------------------------------------------------------------------------
/Jazzlib/readscounter.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/readscounter.pyc
--------------------------------------------------------------------------------
/Jazzlib/hotspotscount.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/hotspotscount.pyc
--------------------------------------------------------------------------------
/Jazzlib/normalize_ratio.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/normalize_ratio.pyc
--------------------------------------------------------------------------------
/Jazzlib/cEM_zip.cpython-36m-darwin.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/cEM_zip.cpython-36m-darwin.so
--------------------------------------------------------------------------------
/Jazzlib/cEM_zip.cpython-37m-darwin.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/cEM_zip.cpython-37m-darwin.so
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/Peak.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/Peak.cpython-36.pyc
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/sta.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/sta.cpython-36.pyc
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/FRegion.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/FRegion.cpython-36.pyc
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/FRegion.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/FRegion.cpython-37.pyc
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/Hotspot.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/Hotspot.cpython-36.pyc
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/jazzio.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/jazzio.cpython-36.pyc
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/kernel.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/kernel.cpython-36.pyc
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/region.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/region.cpython-36.pyc
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/localmax.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/localmax.cpython-36.pyc
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/peaksscan.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/peaksscan.cpython-36.pyc
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/randombg.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/randombg.cpython-36.pyc
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/countreads.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/countreads.cpython-36.pyc
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/countreads.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/countreads.cpython-37.pyc
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/hotspotsscan.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/hotspotsscan.cpython-36.pyc
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/kernelsmooth.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/kernelsmooth.cpython-36.pyc
--------------------------------------------------------------------------------
/Jazzlib/cEM_zip.cpython-36m-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/cEM_zip.cpython-36m-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/Jazzlib/__pycache__/normalize_ratio.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/normalize_ratio.cpython-36.pyc
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | from Cython.Build import cythonize
3 |
# Build configuration for the Cython extension(s): every .pyx file matched by
# the "Jazzlib/*.pyx" glob is handed to cythonize() and compiled as an
# extension module.
setup(
    name = "cEM_zip",
    ext_modules = cythonize("Jazzlib/*.pyx"),
)

# Usage (compiles the extension next to its source): python setup.py build_ext --inplace
10 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/Jazz.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/Jazzlib/Peak.py:
--------------------------------------------------------------------------------
class Peak:
    """
    Plain record describing one peak.

    All constructor arguments are stored unchanged as attributes of the
    same name; ``parent`` and ``fdr`` both default to 1.
    """

    def __init__(self, start, end, chromosome, peakpoint, peakid, score, parent=1, fdr=1):
        # Location of the peak.
        self.start, self.end, self.chromosome = start, end, chromosome

        # Identity and strength.
        self.peakpoint, self.peakid, self.score = peakpoint, peakid, score

        # Statistics / owning feature (presumably the parent hotspot id).
        self.fdr, self.parent = fdr, parent
24 |
25 |
--------------------------------------------------------------------------------
/Jazzlib/Hotspot.py:
--------------------------------------------------------------------------------
class Hotspot:
    """
    Hotspot record holding a location, score/FDR and the peaks it contains.

    Parameters are stored unchanged as attributes of the same name.
    ``peaks`` is the list that ``addpeak`` appends to; when it is omitted
    each instance gets its own fresh empty list.
    """

    def __init__(self, start, end, chromosome, hotspotid, peaks=None, score=0, fdr=1):

        self.start = start

        self.end = end

        self.chromosome = chromosome

        self.hotspotid = hotspotid

        self.score = score

        self.fdr = fdr

        # A default of ``list()`` in the signature is evaluated once and would
        # be shared by every Hotspot created without an explicit ``peaks``.
        self.peaks = [] if peaks is None else peaks

    def addpeak(self, peak):
        """Append ``peak`` to this hotspot's peak list."""

        self.peaks.append(peak)
26 |
--------------------------------------------------------------------------------
/Jazzlib/kernel.py:
--------------------------------------------------------------------------------
1 | from numpy import *
2 | #from scipy.ndimage.filters import *
3 |
4 |
def kde(z, w, xv):
    """
    Gaussian kernel value(s) of width ``w`` for ``z`` relative to ``xv``.

    Evaluates exp(-0.5 * ((z - xv) / w)**2) / sqrt(2 * pi * w**2) and
    returns its sum (numpy ``sum``, so array input collapses to one number).
    """
    standardized = (z - xv) / w

    density = exp(-0.5 * standardized ** 2) / sqrt(2 * pi * w ** 2)

    return sum(density)
8 |
9 |
def smooth_kernel(length):
    """
    Build a Gaussian smoothing kernel as ``{integer offset: weight}``.

    An even ``length`` is bumped to the next odd number so the kernel has a
    centre. Offsets run from -(length-1)/2 to (length-1)/2 and the bandwidth
    passed to ``kde`` is (length-1)/6.
    """
    if length % 2 == 0:

        length += 1

    bandwidth = (length - 1)/6.0

    offsets = linspace(-(length-1)/2, (length-1)/2, length)

    return {int(offset): kde(offset, bandwidth, 0) for offset in offsets}
25 |
26 |
def smooth_kernel_adj(length, minscore):
    """
    Same kernel as ``smooth_kernel`` with every weight divided by ``minscore``.

    Returns ``{integer offset: kde(offset, bandwidth, 0) / minscore}`` where
    an even ``length`` is first increased by one and the bandwidth is
    (length-1)/6.
    """
    if length % 2 == 0:

        length += 1

    bandwidth = (length - 1)/6.0

    scaled_kernel = {}

    for position in linspace(-(length-1)/2, (length-1)/2, length):

        weight = kde(position, bandwidth, 0)

        scaled_kernel[int(position)] = weight/minscore

    return scaled_kernel
42 |
--------------------------------------------------------------------------------
/Jazzlib/cEM_zip.pyx:
--------------------------------------------------------------------------------
1 | def cEM_zip(testdata):
2 |
3 | cdef float sumzip = sum(testdata)
4 |
5 | cdef int lengthoflist = len(testdata)
6 |
7 | cdef float phat = 0.5
8 |
9 | cdef float phatpre = -1.0
10 |
11 | cdef float lhatpre = -1.0
12 |
13 | cdef float base
14 |
15 | cdef int i = 0
16 |
17 | cdef int j = 0
18 |
19 | cdef int n
20 |
21 | cdef float c
22 |
23 | zhat = []
24 |
25 | zerolist = []
26 |
27 | for i from 0<=i20, default==600
24 | -t THRESHOLD, --threshold=THRESHOLD
25 | Hot spots threshold, default=4.0
26 | -l MINLENGTH, --minlength=MINLENGTH
27 | minimum length of hot spots, default=50
28 | -p PVALUE, --pavlue=PVALUE
29 | p-value cutoff for peak identification, default=0.01
30 | -i INITIAL, --initial=INITIAL
31 | Peak's initial length, >5 and chr_length:
25 |
26 | renewend = chr_length
27 |
28 | insertsize_middle_site_count = midsiteinsersizecounter(bamfile=bamfile, regionchromosome=regionchromosome,
29 | regionstart=renewstart, regionend=renewend,
30 | jobtype=jobtype, maxinsert=maxinsert)
31 |
32 |
33 | renewlength = renewend - renewstart + 1
34 |
35 | smoothed_score = np.repeat(0, renewlength)
36 |
37 | for insertlen in insertsize_middle_site_count:
38 |
39 | # print ("count size", insertlen)
40 |
41 | readcount_nowinsertsize = list()
42 |
43 | kernelnow = smooth_kernel(insertlen)
44 |
45 | kernel_score = list()
46 |
47 | for w in sorted(kernelnow):
48 |
49 | kernel_score.append(kernelnow[w])
50 |
51 | for n in range(renewstart, renewend+1):
52 |
53 | nowscore = 0
54 |
55 | if n in insertsize_middle_site_count[insertlen]:
56 |
57 | nowscore = insertsize_middle_site_count[insertlen][n]
58 |
59 | readcount_nowinsertsize.append(nowscore)
60 |
61 | nowsmoothed = np.correlate(np.array(readcount_nowinsertsize), kernel_score, "same")
62 |
63 | smoothed_score = nowsmoothed + smoothed_score
64 |
65 | outputscore = dict()
66 |
67 | outputscore['chromosome'] = regionchromosome
68 |
69 | outputscore['score'] = dict()
70 |
71 | # print (smoothed_score[0])
72 |
73 | for j in range(0, renewlength):
74 |
75 | nowsite = j + renewstart
76 |
77 | nowscore = smoothed_score[j]
78 |
79 | if regionstart <= nowsite <= regionend:
80 |
81 | outputscore['score'][nowsite] = nowscore
82 |
83 | return outputscore
84 |
85 | except KeyboardInterrupt:
86 |
87 | raise KeyboardInterruptError()
88 | sys.exit(0)
--------------------------------------------------------------------------------
/Jazzlib/kernelsmooth.py.bak:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from countreads import *
3 | from kernel import *
4 | import numpy as np
5 |
6 |
class KeyboardInterruptError(Exception):
    """Ordinary ``Exception`` raised in place of a caught ``KeyboardInterrupt``."""
10 |
11 |
def regionsmooth(bamfile, jobtype, maxinsert, regionchromosome, regionstart, regionend, chr_length):
    """
    Kernel-smooth the middle-site counts of one region.

    The region is padded by ``2 * maxinsert`` on both sides (clipped to
    ``1 .. chr_length``) before counting, so sites near the region edges
    still receive contributions from their neighbours. For every insert
    length returned by ``midsiteinsersizecounter`` the per-site counts are
    correlated with ``smooth_kernel(insertlen)`` and the results are summed.

    Returns a dict with:
        'chromosome' -- ``regionchromosome``
        'score'      -- {site: smoothed score} for every site of the padded
                        window that lies in ``regionstart .. regionend``

    Raises ``KeyboardInterruptError`` when a ``KeyboardInterrupt`` arrives
    (presumably so a worker pool can report it as a normal exception).
    """

    try:

        # Pad the requested region and clip it to the chromosome.
        renewstart = max(regionstart - maxinsert*2, 1)

        renewend = min(regionend + maxinsert*2, chr_length)

        # Indexed below as [insert length][site] -> count.
        insertsize_middle_site_count = midsiteinsersizecounter(bamfile=bamfile, regionchromosome=regionchromosome,
                                                               regionstart=renewstart, regionend=renewend,
                                                               jobtype=jobtype, maxinsert=maxinsert)

        renewlength = renewend - renewstart + 1

        smoothed_score = np.repeat(0, renewlength)

        for insertlen in insertsize_middle_site_count:

            site_counts = insertsize_middle_site_count[insertlen]

            kernelnow = smooth_kernel(insertlen)

            # Kernel weights ordered by offset, left to right.
            kernel_score = [kernelnow[w] for w in sorted(kernelnow)]

            # Dense count vector over the padded window; missing sites are 0.
            readcount_nowinsertsize = [site_counts[n] if n in site_counts else 0
                                       for n in range(renewstart, renewend+1)]

            nowsmoothed = np.correlate(np.array(readcount_nowinsertsize), kernel_score, "same")

            smoothed_score = nowsmoothed + smoothed_score

        outputscore = dict()

        outputscore['chromosome'] = regionchromosome

        outputscore['score'] = dict()

        # Keep only the sites of the originally requested region.
        for j in range(0, renewlength):

            nowsite = j + renewstart

            if regionstart <= nowsite <= regionend:

                outputscore['score'][nowsite] = smoothed_score[j]

        return outputscore

    except KeyboardInterrupt:

        # The statement that used to follow this raise (sys.exit) could never
        # run and referenced a module this file does not import.
        raise KeyboardInterruptError()
--------------------------------------------------------------------------------
/Jazzlib/region.py:
--------------------------------------------------------------------------------
def effectregion(chrlength, windowsize, bw):

    """
    Split a chromosome into windows with overlapping counting margins.

        ===================--
        --=================--

    Returns ``{window index: {'ctstart', 'ctend', 'efstart', 'efend'}}`` for
    indexes ``0 .. int(chrlength/windowsize)``. ``ef*`` is the window itself
    and ``ct*`` is the window widened by ``1.5 * bw`` on each inner side.
    The first window starts at 1; the last one ends at ``chrlength``.

    NOTE(review): when ``chrlength`` is shorter than one window the single
    window is not clipped to ``chrlength``; when it is an exact multiple the
    last window has ``efstart > efend``.
    """
    last = int(chrlength/windowsize)

    windows = {}

    for idx in range(last + 1):

        if idx == 0:

            window = {
                'ctstart': 1,
                'ctend': int(windowsize + 1.5 * bw),
                'efstart': 1,
                'efend': int(windowsize),
            }

        elif idx == last:

            window = {
                'ctstart': int(idx * windowsize - 1.5 * bw),
                'ctend': int(chrlength),
                'efstart': int(idx * windowsize + 1),
                'efend': int(chrlength),
            }

        else:

            window = {
                'ctstart': int(idx * windowsize - 1.5 * bw),
                'ctend': int((idx + 1) * windowsize + 1.5 * bw),
                'efstart': int(idx * windowsize + 1),
                'efend': int((idx + 1) * windowsize),
            }

        windows[idx] = window

    return windows
36 |
37 |
def continueregion(points, minlength=2):
    """
    Group integer sites into runs of consecutive values.

    ``points`` is sorted in place, then every maximal run in which each
    value is exactly one more than the previous is reported as
    ``{'start_site': first, 'end_site': last}`` provided the run spans at
    least ``minlength`` sites. Runs are returned in ascending order.

    The final run is always considered, including a run made of only the
    last point (relevant when ``minlength <= 1``).

    On any exception the error is printed and ``None`` is returned.
    """

    try:

        points.sort()

        continue_region = list()

        if not points:

            return continue_region

        run_start = points[0]

        previous = points[0]

        for site in points[1:]:

            if site != previous + 1:

                # The run ended at ``previous``; keep it if long enough.
                if previous - run_start + 1 >= minlength:

                    continue_region.append({'start_site': run_start, 'end_site': previous})

                run_start = site

            previous = site

        # Close the run that reaches the last point.
        if previous - run_start + 1 >= minlength:

            continue_region.append({'start_site': run_start, 'end_site': previous})

        return continue_region

    except Exception as e:

        print(('got exception in Jazzlib.region.continueregion: %r, terminating the pool' % (e,)))
88 |
89 |
def windowregion(chr_length, site, windowsize, chromsome):
    """
    Return the ``"chromosome:start-end"`` string of a window centred on ``site``.

    The window extends ``int(windowsize/2)`` to each side and is clipped to
    ``1 .. chr_length``.
    """
    half = int(windowsize/2)

    left = max(site - half, 1)

    right = min(site + half, chr_length)

    return chromsome + ":{}-{}".format(left, right)
107 |
108 |
if __name__ == "__main__":

    # Manual self-check of continueregion. It is not wrapped in a bare
    # ``except: pass`` so that a failure is visible instead of being
    # silently discarded.
    # Expected runs for this input with minlength=1: 1-4, 7-7 and 9-10.
    relist = [1,2,3,4,7,9,10]

    creg = continueregion(relist, 1)

    print (creg)
122 |
123 |
124 |
--------------------------------------------------------------------------------
/Jazzlib/region.py.bak:
--------------------------------------------------------------------------------
def effectregion(chrlength, windowsize, bw):

    """
    Split a chromosome into windows with overlapping counting margins.

        ===================--
        --=================--

    Returns ``{window index: {'ctstart', 'ctend', 'efstart', 'efend'}}`` for
    indexes ``0 .. int(chrlength/windowsize)``. ``ef*`` is the window itself
    and ``ct*`` is the window widened by ``1.5 * bw`` on each inner side.
    The first window starts at 1; the last one ends at ``chrlength``.

    NOTE(review): when ``chrlength`` is shorter than one window the single
    window is not clipped to ``chrlength``; when it is an exact multiple the
    last window has ``efstart > efend``.
    """
    last = int(chrlength/windowsize)

    windows = {}

    for idx in range(last + 1):

        if idx == 0:

            window = {
                'ctstart': 1,
                'ctend': int(windowsize + 1.5 * bw),
                'efstart': 1,
                'efend': int(windowsize),
            }

        elif idx == last:

            window = {
                'ctstart': int(idx * windowsize - 1.5 * bw),
                'ctend': int(chrlength),
                'efstart': int(idx * windowsize + 1),
                'efend': int(chrlength),
            }

        else:

            window = {
                'ctstart': int(idx * windowsize - 1.5 * bw),
                'ctend': int((idx + 1) * windowsize + 1.5 * bw),
                'efstart': int(idx * windowsize + 1),
                'efend': int((idx + 1) * windowsize),
            }

        windows[idx] = window

    return windows
36 |
37 |
def continueregion(points, minlength=2):
    """
    Group integer sites into runs of consecutive values.

    ``points`` is sorted in place, then every maximal run in which each
    value is exactly one more than the previous is reported as
    ``{'start_site': first, 'end_site': last}`` provided the run spans at
    least ``minlength`` sites. Runs are returned in ascending order.

    The final run is always considered, including a run made of only the
    last point (relevant when ``minlength <= 1``).

    On any exception the error is printed and ``None`` is returned.
    """

    try:

        points.sort()

        continue_region = list()

        if not points:

            return continue_region

        run_start = points[0]

        previous = points[0]

        for site in points[1:]:

            if site != previous + 1:

                # The run ended at ``previous``; keep it if long enough.
                if previous - run_start + 1 >= minlength:

                    continue_region.append({'start_site': run_start, 'end_site': previous})

                run_start = site

            previous = site

        # Close the run that reaches the last point.
        if previous - run_start + 1 >= minlength:

            continue_region.append({'start_site': run_start, 'end_site': previous})

        return continue_region

    except Exception as e:

        print ('got exception in Jazzlib.region.continueregion: %r, terminating the pool' % (e,))
88 |
89 |
def windowregion(chr_length, site, windowsize, chromsome):
    """
    Return the ``"chromosome:start-end"`` string of a window centred on ``site``.

    The window extends ``int(windowsize/2)`` to each side and is clipped to
    ``1 .. chr_length``.
    """
    half = int(windowsize/2)

    left = max(site - half, 1)

    right = min(site + half, chr_length)

    return chromsome + ":{}-{}".format(left, right)
107 |
108 |
if __name__ == "__main__":

    # Manual self-check of continueregion. It is not wrapped in a bare
    # ``except: pass`` so that a failure is visible instead of being
    # silently discarded.
    # Expected runs for this input with minlength=1: 1-4, 7-7 and 9-10.
    relist = [1,2,3,4,7,9,10]

    creg = continueregion(relist, 1)

    print (creg)
122 |
123 |
124 |
--------------------------------------------------------------------------------
/Jazzlib/jazzio.py.bak:
--------------------------------------------------------------------------------
1 | import io
2 | from Peak import *
3 | from Hotspot import *
4 | from Peak import *
5 | from FRegion import *
6 |
def peakbedswriter(samplename, peaks):
    """
    Write peaks to ``<samplename>_peak.bed``.

    One tab-separated line per peak: chromosome, start, end, peakid, score.
    An existing file of that name is overwritten.
    """

    bedfilename =samplename+ '_' + 'peak' + ".bed"

    linker = "\t"

    # ``with`` closes the file even when a peak fails to format or write.
    with io.FileIO(bedfilename, 'w') as open_bed:

        for peak in peaks:

            bedlist = [str(peak.chromosome), str(peak.start), str(peak.end),str(peak.peakid),str(peak.score)]

            outstring = linker.join(bedlist) + "\n"

            open_bed.write(outstring)
25 |
26 |
def peakbedgraphswriter(samplename, peaks):
    """
    Write peaks to ``<samplename>_peak.bedgraph``.

    One tab-separated line per peak: chromosome, start, end, score.
    An existing file of that name is overwritten.
    """

    bedfilename =samplename+ '_' + 'peak' + ".bedgraph"

    linker = "\t"

    # ``with`` closes the file even when a peak fails to format or write.
    with io.FileIO(bedfilename, 'w') as open_bed:

        for peak in peaks:

            bedlist = [str(peak.chromosome), str(peak.start), str(peak.end),str(peak.score)]

            outstring = linker.join(bedlist) + "\n"

            open_bed.write(outstring)
45 |
46 |
def hotspotsbedswriter(samplename, hotspots):
    """
    Write hotspots to ``<samplename>_hotspots.bed``.

    One tab-separated line per hotspot: chromosome, start, end.
    An existing file of that name is overwritten.
    """

    bedfilename =samplename+ '_' + 'hotspots' + ".bed"

    linker = "\t"

    # ``with`` closes the file even when a hotspot fails to format or write.
    with io.FileIO(bedfilename, 'w') as open_bed:

        for hotspot in hotspots:

            bedlist = [str(hotspot.chromosome), str(hotspot.start), str(hotspot.end)]

            outstring = linker.join(bedlist) + "\n"

            open_bed.write(outstring)
65 |
66 |
def hotpeakbedswriter2(samplename, hotspots):
    """
    Write the peaks of every hotspot to ``<samplename>_peaks.bed``.

    Hotspots are visited in order and, within each, its ``peaks`` in order;
    each peak becomes one tab-separated line: chromosome, start, end,
    peakid, score. An existing file of that name is overwritten.
    """

    bedfilename =samplename+ '_' + 'peaks' + ".bed"

    linker = "\t"

    # ``with`` closes the file even when a peak fails to format or write.
    with io.FileIO(bedfilename, 'w') as open_bed:

        for hotspot in hotspots:

            for peak in hotspot.peaks:

                bedlist = [str(peak.chromosome), str(peak.start), str(peak.end),str(peak.peakid),str(peak.score)]

                outstring = linker.join(bedlist) + "\n"

                open_bed.write(outstring)
86 |
87 |
def jazzgffout(samplename, hotspots, peaks, fregion):
    """
    Write hotspots and peaks to ``<samplename>_peaks_hotspots.gff3``.

    Every hotspot becomes a ``gene`` line and every peak a ``CDS`` line
    (source column ``JAZZ``). ``fregion.filted_region`` is iterated as
    ``"chromosome:start-end"`` strings; a hotspot that shares at least one
    site with such a region on its own chromosome gets ``;anno=FREGION``
    appended to its attributes, and so does every peak whose ``parent`` is
    the id of such a hotspot.

    An existing file of that name is overwritten.
    """

    bedfilename =samplename+ '_' + 'peaks_hotspots' + ".gff3"

    linker = "\t"

    # chromosome -> list of (start, end) filtered intervals, both inclusive.
    frintervals = dict()

    for fr in fregion.filted_region:

        # Split on the last ':' so a chromosome name containing ':' still parses.
        (frchrnow, frstartend) = fr.rsplit(":", 1)

        (frstart, frend) = frstartend.split("-")

        frintervals.setdefault(frchrnow, []).append((int(frstart), int(frend)))

    # Ids of hotspots that overlap a filtered region.
    hotspotsinfr = dict()

    # ``with`` closes the file even when a record fails to format or write.
    with io.FileIO(bedfilename, 'w') as open_bed:

        for hotspot in hotspots:

            hotspotanno = "ID="+str(hotspot.hotspotid)

            # Look up the intervals of the hotspot's OWN chromosome and flag
            # the hotspot when the two inclusive ranges share any site.
            for (frstart, frend) in frintervals.get(hotspot.chromosome, ()):

                if max(hotspot.start, frstart) <= min(hotspot.end, frend):

                    hotspotanno = hotspotanno + ";anno=FREGION"

                    hotspotsinfr[hotspot.hotspotid] = 1

                    break

            hotspotsstr = [str(hotspot.chromosome), "JAZZ", "gene", str(hotspot.start), str(hotspot.end),
                           '.', '.', '.',hotspotanno
                           ]

            hotspotstring = linker.join(hotspotsstr) + "\n"

            open_bed.write(hotspotstring)

        for peak in peaks:

            peakanno = "Parent="+str(peak.parent)+";"+"ID="+str(peak.peakid)

            if peak.parent in hotspotsinfr:

                peakanno = peakanno + ";anno=FREGION"

            peakstr = [str(peak.chromosome), "JAZZ", "CDS", str(peak.start), str(peak.end),
                       '.', '.', '.',peakanno]

            peakstring = linker.join(peakstr)+"\n"

            open_bed.write(peakstring)
166 |
--------------------------------------------------------------------------------
/Jazzlib/jazzio.py:
--------------------------------------------------------------------------------
1 | import io
2 | from .Peak import *
3 | from .Hotspot import *
4 | from .Peak import *
5 | from .FRegion import *
6 |
def peakbedswriter(samplename, peaks):
    """Write peaks to ``<samplename>_peak.bed``.

    Each peak becomes one tab-separated line: chromosome, start, end,
    peak id and score.

    :param samplename: prefix of the output file name.
    :param peaks: iterable of objects exposing ``chromosome``, ``start``,
        ``end``, ``peakid`` and ``score``.
    """
    bedfilename = samplename + '_' + 'peak' + ".bed"

    # A buffered text handle replaces one raw write call per record, and the
    # with-block closes it even when a record fails to format.
    # newline='\n' leaves the "\n" terminators untranslated on any platform,
    # so the bytes on disk stay the same as before.
    with io.open(bedfilename, 'w', encoding='utf-8', newline='\n') as open_bed:
        for peak in peaks:
            bedlist = [str(peak.chromosome), str(peak.start), str(peak.end),
                       str(peak.peakid), str(peak.score)]
            open_bed.write("\t".join(bedlist) + "\n")
25 |
26 |
def peakbedgraphswriter(samplename, peaks):
    """Write peaks to ``<samplename>_peak.bedgraph``.

    Each peak becomes one tab-separated line: chromosome, start, end and
    score.

    :param samplename: prefix of the output file name.
    :param peaks: iterable of objects exposing ``chromosome``, ``start``,
        ``end`` and ``score``.
    """
    bedfilename = samplename + '_' + 'peak' + ".bedgraph"

    # A buffered text handle replaces one raw write call per record, and the
    # with-block closes it even when a record fails to format.
    # newline='\n' leaves the "\n" terminators untranslated on any platform.
    with io.open(bedfilename, 'w', encoding='utf-8', newline='\n') as open_bed:
        for peak in peaks:
            bedlist = [str(peak.chromosome), str(peak.start), str(peak.end),
                       str(peak.score)]
            open_bed.write("\t".join(bedlist) + "\n")
45 |
46 |
def hotspotsbedswriter(samplename, hotspots):
    """Write hotspot intervals to ``<samplename>_hotspots.bed``.

    Every hotspot becomes one tab-separated line holding its chromosome,
    start and end.

    :param samplename: prefix of the output file name.
    :param hotspots: iterable of objects exposing ``chromosome``, ``start``
        and ``end``.
    """
    bedfilename = samplename + '_' + 'hotspots' + ".bed"

    # A buffered text handle replaces one raw write call per record, and the
    # with-block closes it even when a record fails to format.
    # newline='\n' leaves the "\n" terminators untranslated on any platform.
    with io.open(bedfilename, 'w', encoding='utf-8', newline='\n') as open_bed:
        for hotspot in hotspots:
            bedlist = [str(hotspot.chromosome), str(hotspot.start), str(hotspot.end)]
            open_bed.write("\t".join(bedlist) + "\n")
65 |
66 |
def hotpeakbedswriter2(samplename, hotspots):
    """Write the peaks of every hotspot to ``<samplename>_peaks.bed``.

    Each peak becomes one tab-separated line: chromosome, start, end,
    peak id and score.

    :param samplename: prefix of the output file name.
    :param hotspots: iterable of objects whose ``peaks`` attribute iterates
        over objects exposing ``chromosome``, ``start``, ``end``, ``peakid``
        and ``score``.
    """
    bedfilename = samplename + '_' + 'peaks' + ".bed"

    # A buffered text handle replaces one raw write call per record, and the
    # with-block closes it even when a record fails to format.
    # newline='\n' leaves the "\n" terminators untranslated on any platform.
    with io.open(bedfilename, 'w', encoding='utf-8', newline='\n') as open_bed:
        for hotspot in hotspots:
            for peak in hotspot.peaks:
                bedlist = [str(peak.chromosome), str(peak.start), str(peak.end),
                           str(peak.peakid), str(peak.score)]
                open_bed.write("\t".join(bedlist) + "\n")
86 |
87 |
def jazzgffout(samplename, hotspots, peaks, fregion):
    """Write hotspots and peaks to ``<samplename>_peaks_hotspots.gff3``.

    Hotspots are written as ``gene`` features and peaks as ``CDS`` features
    carrying ``Parent=<peak.parent>``. A hotspot that shares at least one
    position with a filtered region of its own chromosome gets
    ``;anno=FREGION`` appended, and so does every peak whose parent is such
    a hotspot.

    :param samplename: prefix of the output file name.
    :param hotspots: iterable of objects exposing ``chromosome``, ``start``,
        ``end`` and ``hotspotid``.
    :param peaks: iterable of objects exposing ``chromosome``, ``start``,
        ``end``, ``peakid`` and ``parent``.
    :param fregion: object whose ``filted_region`` iterates over
        ``"chrom:start-end"`` strings; both ends are treated as inclusive.
    """
    bedfilename = samplename + '_' + 'peaks_hotspots' + ".gff3"
    linker = "\t"

    # Keep filtered regions as (start, end) pairs per chromosome instead of
    # expanding every base into a dictionary entry.
    frintervals = dict()
    for fr in fregion.filted_region:
        (frchrnow, frstartend) = fr.split(":")
        (frstart, frend) = frstartend.split("-")
        frintervals.setdefault(frchrnow, []).append((int(frstart), int(frend)))

    hotspotsinfr = set()

    # Buffered text output; the with-block closes the handle on any exit and
    # newline='\n' leaves the "\n" terminators untranslated.
    with io.open(bedfilename, 'w', encoding='utf-8', newline='\n') as open_bed:
        for hotspot in hotspots:
            hotspotanno = "ID=" + str(hotspot.hotspotid)

            # Use the regions of the hotspot's own chromosome (the previous
            # code indexed with the last chromosome parsed above) and flag
            # the hotspot as soon as any position overlaps.
            for (frstart, frend) in frintervals.get(hotspot.chromosome, ()):
                if hotspot.start <= frend and frstart <= hotspot.end:
                    hotspotanno = hotspotanno + ";anno=FREGION"
                    hotspotsinfr.add(hotspot.hotspotid)
                    break

            hotspotsstr = [str(hotspot.chromosome), "JAZZ", "gene",
                           str(hotspot.start), str(hotspot.end),
                           '.', '.', '.', hotspotanno]
            open_bed.write(linker.join(hotspotsstr) + "\n")

        for peak in peaks:
            peakanno = "Parent=" + str(peak.parent) + ";" + "ID=" + str(peak.peakid)
            if peak.parent in hotspotsinfr:
                peakanno = peakanno + ";anno=FREGION"

            peakstr = [str(peak.chromosome), "JAZZ", "CDS",
                       str(peak.start), str(peak.end),
                       '.', '.', '.', peakanno]
            open_bed.write(linker.join(peakstr) + "\n")
167 |
--------------------------------------------------------------------------------
/Jazzlib/randombg.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from multiprocessing import Pool
4 | from .FRegion import *
5 | import random as rnd
6 | from .kernel import *
7 | from numpy import *
8 |
class KeyboardInterruptError(Exception):
    """Plain ``Exception`` subclass that adds no behaviour of its own."""
12 |
13 |
def randombg2(fregion, nthreads, maxinsert, randomthreshold=2, runtime=1000, randomwindow=10000):
    """Return ``fregion.adjreads`` divided by ``fregion.countgenomelength``.

    Only ``fregion`` is read; the remaining parameters are accepted but not
    used by this function.
    """
    genome_span = fregion.countgenomelength
    read_total = fregion.adjreads
    return read_total / genome_span
23 |
24 |
def randombg(fregion, nthreads, maxinsert, randomthreshold=2, runtime=1000, randomwindow=10000):
    """Average the thresholds of ``runtime`` simulated background windows.

    ``runtime`` identical parameter sets are mapped over ``sim_bg_worker``
    in a pool of ``nthreads`` processes and the returned values are averaged.

    :param fregion: object exposing ``countgenomeuniqlength``, ``adjreads``
        and ``countgenomelength``.
    :param nthreads: number of pool processes.
    :param maxinsert: passed to ``smooth_kernel`` as ``length``.
    :param randomthreshold: forwarded to the worker.
    :param runtime: number of simulations to run and average.
    :param randomwindow: window length forwarded to the worker.
    :return: the averaged threshold, or ``None`` when an exception other
        than ``KeyboardInterrupt`` was caught (it is printed, not re-raised).
    """
    # No explicit ``import sys`` exists at module level; do not rely on a
    # star import to provide the name for the Ctrl-C path below.
    import sys

    pool = Pool(nthreads)

    try:
        countgenomeuniqlength = fregion.countgenomeuniqlength
        adjreads = fregion.adjreads
        countgenomelength = fregion.countgenomelength

        uniqrate = countgenomeuniqlength / countgenomelength
        if uniqrate < 0.5:
            uniqrate = uniqrate * 2

        countreads = int(adjreads / countgenomeuniqlength * randomwindow) + 1

        onekernel = smooth_kernel(length=maxinsert)
        kernel_score = [onekernel[i] for i in sorted(onekernel)]

        # Every simulation uses the same parameters, so one dict is shared
        # by all jobs. The key 'randomwindo' (sic) is what the worker reads.
        par = {
            'countreads': countreads,
            'kernel_score': kernel_score,
            'uniqrate': uniqrate,
            'randomwindo': randomwindow,
            'randomthreshold': randomthreshold,
        }
        pars = [par] * runtime

        randths = pool.map(sim_bg_worker, pars)

        thsum = 0
        for randth in randths:
            thsum = thsum + randth

        random_th = thsum / runtime

        pool.close()

        return random_th

    except KeyboardInterrupt:
        pool.terminate()
        print("You cancelled the program!")
        sys.exit(1)

    except Exception as e:
        print('got exception in Jazzlib.randombg.randombg: %r, terminating the pool' % (e,))
        pool.terminate()
        print('pool is terminated')

    finally:
        pool.join()
108 |
109 |
def sim_bg_worker(par):
    """Simulate one random window and return its score threshold.

    ``countreads`` reads are dropped uniformly at random onto
    ``int(uniqrate * randomwindow)`` randomly sampled positions of a window
    of ``randomwindow`` positions; the per-position counts are correlated
    with ``kernel_score`` and ``mean + randomthreshold * std`` of the result
    is returned.

    :param par: dict with the keys ``countreads``, ``kernel_score``,
        ``uniqrate``, ``randomwindo`` (sic) and ``randomthreshold``.
    :return: the threshold, or ``None`` when an exception other than
        ``KeyboardInterrupt`` was caught (it is printed, not re-raised).
    """
    # No explicit ``import sys`` exists at module level; do not rely on a
    # star import to provide the name for the Ctrl-C path below.
    import sys

    try:
        countreads = par['countreads']
        kernel_score = par['kernel_score']
        uniqrate = par['uniqrate']
        randomwindow = par['randomwindo']
        randomthreshold = par['randomthreshold']

        totaluniqsite = int(uniqrate * randomwindow)

        rand_reads_count = [0] * randomwindow

        sim_uniqsite = rnd.sample(range(0, randomwindow), totaluniqsite)

        for _ in range(0, countreads):
            # randrange() never yields its upper bound, whereas uniform() is
            # documented as possibly returning the end point, which would
            # index one past the end of sim_uniqsite.
            rand_reads = sim_uniqsite[rnd.randrange(totaluniqsite)]
            rand_reads_count[rand_reads] = rand_reads_count[rand_reads] + 1

        smoothed_result = correlate(array(rand_reads_count), kernel_score, "same")

        rand_mean = smoothed_result.mean()
        rand_std = smoothed_result.std()

        rand_threshhold = rand_mean + randomthreshold * rand_std

        return rand_threshhold

    except KeyboardInterrupt:
        print("You cancelled the program!")
        sys.exit(1)

    except Exception as e:
        print('got exception in Jazzlib.randombg.sim_bg_worker: %r, terminating the pool' % (e,))
172 |
173 |
if __name__ == "__main__":
    # Manual smoke run: one simulated window with fixed parameters.
    try:
        onekernel = smooth_kernel(length=100)
        pars = []
        kernel_score = [onekernel[i] for i in sorted(onekernel)]

        par = {
            'countreads': 100000,
            'kernel_score': kernel_score,
            'uniqrate': 0.3,
            'randomwindo': int(1e5),
            'randomthreshold': 3,
        }

        th = sim_bg_worker(par)
        print(th)

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)
--------------------------------------------------------------------------------
/Jazzlib/randombg.py.bak:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from __future__ import print_function
3 | from multiprocessing import Pool
4 | from FRegion import *
5 | import random as rnd
6 | from kernel import *
7 | from numpy import *
8 |
class KeyboardInterruptError(Exception):
    """Plain ``Exception`` subclass that adds no behaviour of its own."""
12 |
13 |
def randombg2(fregion, nthreads, maxinsert, randomthreshold=2, runtime=1000, randomwindow=10000):
    """Return ``fregion.adjreads`` divided by ``fregion.countgenomelength``.

    Only ``fregion`` is read; the remaining parameters are accepted but not
    used by this function.
    """
    genome_span = fregion.countgenomelength
    read_total = fregion.adjreads
    return read_total / genome_span
23 |
24 |
def randombg(fregion, nthreads, maxinsert, randomthreshold=2, runtime=1000, randomwindow=10000):
    """Average the thresholds of ``runtime`` simulated background windows.

    ``runtime`` identical parameter sets are mapped over ``sim_bg_worker``
    in a pool of ``nthreads`` processes and the returned values are averaged.

    :param fregion: object exposing ``countgenomeuniqlength``, ``adjreads``
        and ``countgenomelength``.
    :param nthreads: number of pool processes.
    :param maxinsert: passed to ``smooth_kernel`` as ``length``.
    :param randomthreshold: forwarded to the worker.
    :param runtime: number of simulations to run and average.
    :param randomwindow: window length forwarded to the worker.
    :return: the averaged threshold, or ``None`` when an exception other
        than ``KeyboardInterrupt`` was caught (it is printed, not re-raised).
    """
    # No explicit ``import sys`` exists at module level; do not rely on a
    # star import to provide the name for the Ctrl-C path below.
    import sys

    pool = Pool(nthreads)

    try:
        countgenomeuniqlength = fregion.countgenomeuniqlength
        adjreads = fregion.adjreads
        countgenomelength = fregion.countgenomelength

        uniqrate = countgenomeuniqlength / countgenomelength
        if uniqrate < 0.5:
            uniqrate = uniqrate * 2

        countreads = int(adjreads / countgenomeuniqlength * randomwindow) + 1

        onekernel = smooth_kernel(length=maxinsert)
        kernel_score = [onekernel[i] for i in sorted(onekernel)]

        # Every simulation uses the same parameters, so one dict is shared
        # by all jobs. The key 'randomwindo' (sic) is what the worker reads.
        par = {
            'countreads': countreads,
            'kernel_score': kernel_score,
            'uniqrate': uniqrate,
            'randomwindo': randomwindow,
            'randomthreshold': randomthreshold,
        }
        pars = [par] * runtime

        randths = pool.map(sim_bg_worker, pars)

        thsum = 0
        for randth in randths:
            thsum = thsum + randth

        random_th = thsum / runtime

        pool.close()

        return random_th

    except KeyboardInterrupt:
        pool.terminate()
        print("You cancelled the program!")
        sys.exit(1)

    # "except X as e" is accepted by Python 2.6+ and Python 3 alike.
    except Exception as e:
        print('got exception in Jazzlib.randombg.randombg: %r, terminating the pool' % (e,))
        pool.terminate()
        print('pool is terminated')

    finally:
        pool.join()
108 |
109 |
def sim_bg_worker(par):
    """Simulate one random window and return its score threshold.

    ``countreads`` reads are dropped uniformly at random onto
    ``int(uniqrate * randomwindow)`` randomly sampled positions of a window
    of ``randomwindow`` positions; the per-position counts are correlated
    with ``kernel_score`` and ``mean + randomthreshold * std`` of the result
    is returned.

    :param par: dict with the keys ``countreads``, ``kernel_score``,
        ``uniqrate``, ``randomwindo`` (sic) and ``randomthreshold``.
    :return: the threshold, or ``None`` when an exception other than
        ``KeyboardInterrupt`` was caught (it is printed, not re-raised).
    """
    # No explicit ``import sys`` exists at module level; do not rely on a
    # star import to provide the name for the Ctrl-C path below.
    import sys

    try:
        countreads = par['countreads']
        kernel_score = par['kernel_score']
        uniqrate = par['uniqrate']
        randomwindow = par['randomwindo']
        randomthreshold = par['randomthreshold']

        totaluniqsite = int(uniqrate * randomwindow)

        rand_reads_count = [0] * randomwindow

        sim_uniqsite = rnd.sample(range(0, randomwindow), totaluniqsite)

        for _ in range(0, countreads):
            # randrange() never yields its upper bound, whereas uniform() is
            # documented as possibly returning the end point, which would
            # index one past the end of sim_uniqsite.
            rand_reads = sim_uniqsite[rnd.randrange(totaluniqsite)]
            rand_reads_count[rand_reads] = rand_reads_count[rand_reads] + 1

        smoothed_result = correlate(array(rand_reads_count), kernel_score, "same")

        rand_mean = smoothed_result.mean()
        rand_std = smoothed_result.std()

        rand_threshhold = rand_mean + randomthreshold * rand_std

        return rand_threshhold

    except KeyboardInterrupt:
        print("You cancelled the program!")
        sys.exit(1)

    # "except X as e" is accepted by Python 2.6+ and Python 3 alike.
    except Exception as e:
        print('got exception in Jazzlib.randombg.sim_bg_worker: %r, terminating the pool' % (e,))
172 |
173 |
if __name__ == "__main__":
    # Manual smoke run: one simulated window with fixed parameters.
    try:
        onekernel = smooth_kernel(length=100)
        pars = []
        kernel_score = [onekernel[i] for i in sorted(onekernel)]

        par = {
            'countreads': 100000,
            'kernel_score': kernel_score,
            'uniqrate': 0.3,
            'randomwindo': int(1e5),
            'randomthreshold': 3,
        }

        th = sim_bg_worker(par)
        print(th)

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)
--------------------------------------------------------------------------------
/Jazzlib/localmax.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from .kernelsmooth import *
4 | from multiprocessing import Pool
5 | from .kernel import *
6 | from .FRegion import *
7 |
8 |
class KeyboardInterruptError(Exception):
    """Raised by the worker functions in place of ``KeyboardInterrupt``."""
12 |
13 |
def get_all_localmax(bamfile, jobtype, maxinsert, nthreads, fregion, countchr, rndth):
    """Collect the local maxima of the smoothed signal on ``countchr``.

    Each chromosome is cut into windows of 100000 positions, padded by 200
    on both sides and clipped to ``[1, chr_length]``. The windows are mapped
    over ``localmax_worker`` in a pool of ``nthreads`` processes. A returned
    site is kept when its score is strictly greater than
    ``int(adjreads / totallength + 1) * max(smooth_kernel(30).values())``
    and the site does not lie in a filtered region.

    :param bamfile: forwarded to the worker.
    :param jobtype: forwarded to the worker.
    :param maxinsert: forwarded to the worker.
    :param nthreads: number of pool processes.
    :param fregion: object exposing ``adjreads``, ``chrs_length`` (mapping
        chromosome -> length) and ``filted_region`` (iterable of
        ``"chrom:start-end"`` strings).
    :param countchr: iterable of chromosome names to process.
    :param rndth: forwarded to the worker.
    :return: dict ``chromosome -> {site: score}``; ``None`` when an exception
        other than ``KeyboardInterrupt`` was caught (it is printed, not
        re-raised).
    """
    # No explicit ``import sys`` exists at module level; do not rely on a
    # star import to provide the name for the Ctrl-C path below.
    import sys

    pool = Pool(nthreads)

    try:
        windowsize = 100000

        adjreads = fregion.adjreads

        onesmoothkernel = smooth_kernel(30)
        kermax = max(onesmoothkernel.values())

        pars = list()
        totallength = 0

        for chromosmoe in countchr:
            chr_length = fregion.chrs_length[chromosmoe]
            totallength = totallength + chr_length

            for scare in range(0, int(chr_length / windowsize) + 1):
                nowstart = scare * windowsize + 1 - 200
                nowend = (scare + 1) * windowsize + 200

                if nowend > chr_length:
                    nowend = chr_length
                if nowstart < 1:
                    nowstart = 1

                pars.append({
                    'region': chromosmoe + ":" + str(nowstart) + "-" + str(nowend),
                    'maxinsert': maxinsert,
                    'bamfile': bamfile,
                    'jobtype': jobtype,
                    'chrlength': chr_length,
                    'regionchromosome': chromosmoe,
                    'regionstart': nowstart,
                    'regionend': nowend,
                    'rndth': rndth,
                })

        avgcount = adjreads / totallength
        threshhold = int(avgcount + 1) * kermax

        # Positions covered by filtered regions, per chromosome. The end
        # coordinate itself is not included (range(start, end)).
        filted_site = dict()
        for fr in fregion.filted_region:
            chromosome, sesite = fr.split(':')
            startsite, endsite = sesite.split('-')
            filted_site.setdefault(chromosome, set()).update(
                range(int(startsite), int(endsite)))

        localmax = dict()

        localmax_worker_returnres = pool.map(localmax_worker, pars)

        for each_worker_res in localmax_worker_returnres:
            for chromosome in each_worker_res:
                blocked = filted_site.get(chromosome, ())
                for site in each_worker_res[chromosome]:
                    score = each_worker_res[chromosome][site]
                    # A chromosome only appears in the result once one of
                    # its sites has passed both tests.
                    if score > threshhold and site not in blocked:
                        localmax.setdefault(chromosome, dict())[site] = score

        pool.close()

        return localmax

    except KeyboardInterrupt:
        pool.terminate()
        print("You cancelled the program!")
        sys.exit(1)

    except Exception as e:
        print('got exception in Jazzlib.localmax.get_all_localmax: %r, terminating the pool' % (e,))
        pool.terminate()
        print('pool is terminated')

    finally:
        pool.join()
176 |
177 |
def localmax_worker(par):
    """Smooth one window and return its local maxima.

    :param par: dict with the keys ``region``, ``maxinsert``, ``bamfile``,
        ``jobtype``, ``chrlength``, ``regionchromosome``, ``regionstart``,
        ``regionend`` and ``rndth``.
    :return: the result of ``smoothedlocalmax`` for the window, or ``None``
        after an exception other than ``KeyboardInterrupt`` has been printed.
    :raises KeyboardInterruptError: in place of ``KeyboardInterrupt``.
    """
    try:
        # Every key is looked up, in this order, before any work starts;
        # 'region' is read but not used any further.
        wanted = ('region', 'maxinsert', 'bamfile', 'jobtype', 'chrlength',
                  'regionchromosome', 'regionstart', 'regionend', 'rndth')
        values = dict()
        for key in wanted:
            values[key] = par[key]

        track = regionsmooth(bamfile=values['bamfile'],
                             maxinsert=values['maxinsert'],
                             jobtype=values['jobtype'],
                             regionchromosome=values['regionchromosome'],
                             regionstart=values['regionstart'],
                             regionend=values['regionend'],
                             chr_length=values['chrlength'])

        return smoothedlocalmax(track, values['rndth'])

    except KeyboardInterrupt:
        raise KeyboardInterruptError()

    except Exception as e:
        print('got exception in Jazzlib.localmax.localmax_worker: %r,' % (e,))
219 |
220 |
def smoothedlocalmax(smoothedscore, rndth):
    """Return the local maxima of one smoothed score track.

    :param smoothedscore: dict with the keys ``'chromosome'`` and ``'score'``;
        ``'score'`` maps site -> score and is indexed at every integer
        between its smallest and largest key.
    :param rndth: minimum score a site needs to be considered.
    :return: ``{chromosome: {site: score}}`` for the sites that qualify, or
        ``None`` after an exception other than ``KeyboardInterrupt`` has
        been printed.
    :raises KeyboardInterruptError: in place of ``KeyboardInterrupt``.
    """
    try:
        score = smoothedscore['score']

        startsite = min(score.keys())
        endsite = max(score.keys())

        chromosome = smoothedscore['chromosome']

        maxsites = {chromosome: dict()}

        for nowsite in range(startsite + 2, endsite - 2):
            if score[nowsite] >= rndth:
                # NOTE(review): the source line was garbled (the text between
                # "[nowsite-2]" and "=...[nowsite+1]" was lost). The chain is
                # restored as a five-point peak test; the operator between
                # nowsite-1 and nowsite ("<=" here) is an assumption to be
                # confirmed against the upstream file.
                if (score[nowsite - 2] < score[nowsite - 1] <= score[nowsite]
                        >= score[nowsite + 1] > score[nowsite + 2]):
                    maxsites[chromosome][nowsite] = score[nowsite]

        return maxsites

    except KeyboardInterrupt:
        raise KeyboardInterruptError()

    except Exception as e:
        print('got exception in Jazzlib.localmax.smoothedlocalmax: %r,' % (e,))
254 |
255 |
--------------------------------------------------------------------------------
/Jazzlib/sta.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from scipy.special import gammaincc
4 | from scipy import math
5 | import scipy.stats as stats
6 | from decimal import Decimal, localcontext
7 | from .Peak import *
8 | import sys
9 |
def bionompvalue(x, n, p):
    """Return the upper-tail binomial probability ``P(X > x)``.

    :param x: observed count.
    :param n: number of trials.
    :param p: success probability of one trial.
    :return: ``stats.binom.sf(x, n, p)``.
    """
    # sf() evaluates the tail directly; "1 - cdf()" cancels to exactly 0.0
    # as soon as the tail is smaller than the float spacing near 1.
    return stats.binom.sf(x, n, p)
15 |
16 |
def poissonpvalue(x, mu):
    """Return the upper-tail Poisson probability ``P(X > x)`` as a Decimal.

    :param x: observed count.
    :param mu: Poisson mean.
    :return: ``decimal.Decimal`` holding ``stats.poisson.sf(x, mu)``.
    """
    # sf() evaluates the tail directly. Subtracting the float cdf from 1
    # gives exactly 0 for small tails, and doing that subtraction in Decimal
    # cannot bring back digits the float has already lost.
    return Decimal(float(stats.poisson.sf(x, mu)))
22 |
23 |
24 |
def fdr(pnow, plist, prank):
    """Return ``len(plist) * pnow / rank(pnow)``, capped at 1.

    :param pnow: the p-value to adjust; it is located in ``plist`` by
        equality and the first match is used.
    :param plist: indexable sequence of p-values.
    :param prank: ranks, aligned index by index with ``plist``.
    :return: the adjusted value, never larger than 1.
    :raises ValueError: if ``pnow`` does not occur in ``plist`` (this used
        to surface as an UnboundLocalError).
    """
    lengthofplist = len(plist)

    for i in range(0, lengthofplist):
        if plist[i] == pnow:
            return min(1, lengthofplist * pnow / prank[i])

    raise ValueError("p-value %r not found in plist" % (pnow,))
41 |
42 |
def bayesfactor(locallambda, peakscore):
    """Return ``2 * (ln Q(k, l) + lgamma(k) - k * ln(l) + l)``.

    Here ``k = peakscore - 1``, ``l = locallambda`` and ``Q`` is
    ``scipy.special.gammaincc``.

    :return: the value above, or ``None`` after an exception other than
        ``KeyboardInterrupt`` has been printed together with both arguments.
    """
    try:
        log_tail = math.log(gammaincc(peakscore - 1, locallambda), math.e)
        log_gamma = math.lgamma(peakscore - 1)
        log_rate = (peakscore - 1) * math.log(locallambda, math.e)

        # The terms are combined left to right, as in the one-line formula.
        return 2 * (log_tail + log_gamma - log_rate + locallambda)

    except Exception as e:
        print('got exception in Jazzlib.sta.bayesfactor: %r,' % (e,))
        print(locallambda, peakscore)

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)
68 |
def fdr_control(chippeaks, inputpeaks, fdr):
    """Set ``fdr`` on ChIP peaks from the input peaks at the same summit.

    A ChIP peak and an input peak match when they share ``chromosome`` and
    ``peakpoint``. Each score ``t`` of a matched ChIP peak is a threshold;
    its rate is (matched peaks whose input score >= t) divided by (ChIP
    peaks whose score >= t). Every ChIP peak receives the rate of the
    largest threshold strictly below its own score; peaks with no such
    threshold are left untouched.

    :param chippeaks: list of objects exposing ``chromosome``, ``peakpoint``,
        ``peakid`` and ``score``; ``fdr`` is assigned in place.
    :param inputpeaks: iterable of objects exposing ``chromosome``,
        ``peakpoint`` and ``score``.
    :param fdr: not used; kept so existing calls keep working.
    :return: ``chippeaks``.
    """
    from bisect import bisect_left

    # Input score per (chromosome, summit); a later input peak at the same
    # position replaces an earlier one, as in the former nested scan.
    inputscore_at = dict()
    for inputpeak in inputpeaks:
        inputscore_at[(inputpeak.chromosome, inputpeak.peakpoint)] = inputpeak.score

    # NOTE(review): assumes peak ids are unique among chippeaks; with
    # duplicated ids the entry kept per id could differ from the old scan.
    overlapped_inputscore = dict()
    thresholds = set()
    for chippeak in chippeaks:
        site = (chippeak.chromosome, chippeak.peakpoint)
        if site in inputscore_at:
            overlapped_inputscore[chippeak.peakid] = inputscore_at[site]
            thresholds.add(chippeak.score)

    thresholds = sorted(thresholds)
    inputscores = sorted(overlapped_inputscore.values())
    chipscores = sorted([peak.score for peak in chippeaks])

    fdr_at = list()
    for threshold in thresholds:
        # bisect_left counts the values strictly below the threshold.
        inputpeakcount = len(inputscores) - bisect_left(inputscores, threshold)
        chippeakcount = len(chipscores) - bisect_left(chipscores, threshold)
        # Every threshold is itself a ChIP score, so chippeakcount >= 1 and
        # the former zero check (placed after the division) is not needed.
        fdr_at.append(float(inputpeakcount) / chippeakcount)

    for peaknow in chippeaks:
        below = bisect_left(thresholds, peaknow.score)
        if below > 0:
            peaknow.fdr = fdr_at[below - 1]

    return chippeaks
149 |
150 |
151 |
def fdr_control2(chippeaks, inputpeaks, fdr):
    """Set ``fdr`` on ChIP peaks from the input peaks that cover them.

    NOTE(review): the middle of this function was lost in the source (the
    text between ``if inputpeak.start`` and ``i:`` is missing). It has been
    rebuilt by analogy with ``fdr_control``; the matching rule below is an
    assumption and must be confirmed against the upstream file.

    Each score ``t`` of a matched ChIP peak is a threshold; its rate is
    (matched peaks whose input score >= t) divided by (ChIP peaks whose
    score >= t). Every ChIP peak whose score is strictly above a threshold
    receives that threshold's rate, thresholds being visited in ascending
    order.

    :param chippeaks: list of objects exposing ``chromosome``, ``peakpoint``,
        ``peakid`` and ``score``; ``fdr`` is assigned in place.
    :param inputpeaks: iterable of objects exposing ``chromosome``,
        ``start``, ``end`` and ``score``.
    :param fdr: not used; kept so existing calls keep working.
    :return: ``chippeaks``.
    """
    chipscore = list()
    overlaptedpeak = dict()

    for inputpeak in inputpeaks:
        for chippeak in chippeaks:
            if chippeak.chromosome == inputpeak.chromosome:
                # NOTE(review): reconstructed condition; presumably the ChIP
                # summit has to fall inside the input peak. TODO confirm.
                if inputpeak.start < chippeak.peakpoint < inputpeak.end:
                    overlaptedpeak[chippeak.peakid] = {
                        'inputscore': inputpeak.score,
                        'chipscore': chippeak.score,
                    }
                    chipscore.append(chippeak.score)

    for i in sorted(chipscore):
        chippeakcount = 0.0
        inputpeakcount = 0.0

        for peakid in overlaptedpeak:
            if i <= overlaptedpeak[peakid]['inputscore']:
                inputpeakcount = inputpeakcount + 1

        for chippeak in chippeaks:
            if i <= chippeak.score:
                chippeakcount = chippeakcount + 1

        # Checked before dividing so an empty count cannot raise.
        if chippeakcount == 0:
            break

        nowfdr = inputpeakcount / chippeakcount

        for peaknow in chippeaks:
            if peaknow.score > i:
                peaknow.fdr = nowfdr

    return chippeaks
223 |
224 |
def fdr_bh(peaks):
    """Set ``fdr`` on every peak from its score.

    Each score is turned into ``b01 = exp(-score / 2)``. The values are
    ranked from largest (rank 1) to smallest, equal values sharing the rank
    of their first occurrence, and ``peak.fdr = b01 * len(peaks) / rank``.

    NOTE(review): ranking from the largest value down is kept from the
    previous code. A Benjamini-Hochberg style adjustment would usually give
    rank 1 to the smallest value, so confirm this is intended.

    :param peaks: list of objects exposing ``score``; ``fdr`` is assigned in
        place.
    :return: ``peaks``.
    """
    # exp(-x) underflows to 0.0 for large scores, whereas 1 / (e ** x)
    # raised OverflowError once score / 2 exceeded about 709.
    b01s = [math.exp(-(peak.score / 2)) for peak in peaks]

    listlength = len(b01s)

    # Rank of the first occurrence of each value in descending order; one
    # dictionary pass replaces a linear scan per peak.
    rank_of = dict()
    for position, b01 in enumerate(sorted(b01s, reverse=True)):
        rank_of.setdefault(b01, position + 1)

    for peak, b01 in zip(peaks, b01s):
        peak.fdr = b01 * listlength / rank_of[b01]

    return peaks
262 |
263 |
264 |
265 |
266 |
if __name__ == "__main__":
    # Manual run: print the Bayes factor over a grid of lambdas and scores.
    try:
        grid = ((lam, sc) for lam in range(100, 2000, 100) for sc in range(2, 80))
        for i, j in grid:
            bs = bayesfactor(locallambda=i, peakscore=j)
            print("locallambda:", i, "peakscore", j, "bs", bs)

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)
--------------------------------------------------------------------------------
/Jazzlib/localmax.py.bak:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from __future__ import print_function
3 | from kernelsmooth import *
4 | from multiprocessing import Pool
5 | from kernel import *
6 | from FRegion import *
7 |
8 |
class KeyboardInterruptError(Exception):
    """Raised by the worker functions in place of ``KeyboardInterrupt``."""
12 |
13 |
def get_all_localmax(bamfile, jobtype, maxinsert, nthreads, fregion, countchr, rndth):
    """Collect the local maxima of the smoothed signal on ``countchr``.

    Each chromosome is cut into windows of 100000 positions, padded by 200
    on both sides and clipped to ``[1, chr_length]``. The windows are mapped
    over ``localmax_worker`` in a pool of ``nthreads`` processes. A returned
    site is kept when its score is strictly greater than
    ``int(adjreads / totallength + 1) * max(smooth_kernel(30).values())``
    and the site does not lie in a filtered region.

    :param bamfile: forwarded to the worker.
    :param jobtype: forwarded to the worker.
    :param maxinsert: forwarded to the worker.
    :param nthreads: number of pool processes.
    :param fregion: object exposing ``adjreads``, ``chrs_length`` (mapping
        chromosome -> length) and ``filted_region`` (iterable of
        ``"chrom:start-end"`` strings).
    :param countchr: iterable of chromosome names to process.
    :param rndth: forwarded to the worker.
    :return: dict ``chromosome -> {site: score}``; ``None`` when an exception
        other than ``KeyboardInterrupt`` was caught (it is printed, not
        re-raised).
    """
    # No explicit ``import sys`` exists at module level; do not rely on a
    # star import to provide the name for the Ctrl-C path below.
    import sys

    pool = Pool(nthreads)

    try:
        windowsize = 100000

        adjreads = fregion.adjreads

        onesmoothkernel = smooth_kernel(30)
        kermax = max(onesmoothkernel.values())

        pars = list()
        totallength = 0

        for chromosmoe in countchr:
            chr_length = fregion.chrs_length[chromosmoe]
            totallength = totallength + chr_length

            for scare in range(0, int(chr_length / windowsize) + 1):
                nowstart = scare * windowsize + 1 - 200
                nowend = (scare + 1) * windowsize + 200

                if nowend > chr_length:
                    nowend = chr_length
                if nowstart < 1:
                    nowstart = 1

                pars.append({
                    'region': chromosmoe + ":" + str(nowstart) + "-" + str(nowend),
                    'maxinsert': maxinsert,
                    'bamfile': bamfile,
                    'jobtype': jobtype,
                    'chrlength': chr_length,
                    'regionchromosome': chromosmoe,
                    'regionstart': nowstart,
                    'regionend': nowend,
                    'rndth': rndth,
                })

        avgcount = adjreads / totallength
        threshhold = int(avgcount + 1) * kermax

        # Positions covered by filtered regions, per chromosome. The end
        # coordinate itself is not included (range(start, end)).
        filted_site = dict()
        for fr in fregion.filted_region:
            chromosome, sesite = fr.split(':')
            startsite, endsite = sesite.split('-')
            filted_site.setdefault(chromosome, set()).update(
                range(int(startsite), int(endsite)))

        localmax = dict()

        localmax_worker_returnres = pool.map(localmax_worker, pars)

        for each_worker_res in localmax_worker_returnres:
            for chromosome in each_worker_res:
                blocked = filted_site.get(chromosome, ())
                for site in each_worker_res[chromosome]:
                    score = each_worker_res[chromosome][site]
                    # A chromosome only appears in the result once one of
                    # its sites has passed both tests.
                    if score > threshhold and site not in blocked:
                        localmax.setdefault(chromosome, dict())[site] = score

        pool.close()

        return localmax

    except KeyboardInterrupt:
        pool.terminate()
        print("You cancelled the program!")
        sys.exit(1)

    # "except X as e" is accepted by Python 2.6+ and Python 3 alike.
    except Exception as e:
        print('got exception in Jazzlib.localmax.get_all_localmax: %r, terminating the pool' % (e,))
        pool.terminate()
        print('pool is terminated')

    finally:
        pool.join()
176 |
177 |
def localmax_worker(par):
    """Smooth one window and return its local maxima.

    :param par: dict with the keys ``region``, ``maxinsert``, ``bamfile``,
        ``jobtype``, ``chrlength``, ``regionchromosome``, ``regionstart``,
        ``regionend`` and ``rndth``.
    :return: the result of ``smoothedlocalmax`` for the window, or ``None``
        after an exception other than ``KeyboardInterrupt`` has been printed.
    :raises KeyboardInterruptError: in place of ``KeyboardInterrupt``.
    """
    try:
        # Every key is looked up, in this order, before any work starts;
        # 'region' is read but not used any further.
        wanted = ('region', 'maxinsert', 'bamfile', 'jobtype', 'chrlength',
                  'regionchromosome', 'regionstart', 'regionend', 'rndth')
        values = dict()
        for key in wanted:
            values[key] = par[key]

        track = regionsmooth(bamfile=values['bamfile'],
                             maxinsert=values['maxinsert'],
                             jobtype=values['jobtype'],
                             regionchromosome=values['regionchromosome'],
                             regionstart=values['regionstart'],
                             regionend=values['regionend'],
                             chr_length=values['chrlength'])

        return smoothedlocalmax(track, values['rndth'])

    except KeyboardInterrupt:
        raise KeyboardInterruptError()

    # "except X as e" is accepted by Python 2.6+ and Python 3 alike.
    except Exception as e:
        print('got exception in Jazzlib.localmax.localmax_worker: %r,' % (e,))
219 |
220 |
def smoothedlocalmax(smoothedscore, rndth):
    """Return the local maxima of one smoothed score track.

    :param smoothedscore: dict with the keys ``'chromosome'`` and ``'score'``;
        ``'score'`` maps site -> score and is indexed at every integer
        between its smallest and largest key.
    :param rndth: minimum score a site needs to be considered.
    :return: ``{chromosome: {site: score}}`` for the sites that qualify, or
        ``None`` after an exception other than ``KeyboardInterrupt`` has
        been printed.
    :raises KeyboardInterruptError: in place of ``KeyboardInterrupt``.
    """
    try:
        score = smoothedscore['score']

        startsite = min(score.keys())
        endsite = max(score.keys())

        chromosome = smoothedscore['chromosome']

        maxsites = {chromosome: dict()}

        for nowsite in range(startsite + 2, endsite - 2):
            if score[nowsite] >= rndth:
                # NOTE(review): the source line was garbled (the text between
                # "[nowsite-2]" and "=...[nowsite+1]" was lost). The chain is
                # restored as a five-point peak test; the operator between
                # nowsite-1 and nowsite ("<=" here) is an assumption to be
                # confirmed against the upstream file.
                if (score[nowsite - 2] < score[nowsite - 1] <= score[nowsite]
                        >= score[nowsite + 1] > score[nowsite + 2]):
                    maxsites[chromosome][nowsite] = score[nowsite]

        return maxsites

    except KeyboardInterrupt:
        raise KeyboardInterruptError()

    # "except X as e" is accepted by Python 2.6+ and Python 3 alike.
    except Exception as e:
        print('got exception in Jazzlib.localmax.smoothedlocalmax: %r,' % (e,))
254 |
255 |
--------------------------------------------------------------------------------
/Jazzlib/sta.py.bak:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from __future__ import print_function
3 | from scipy.special import gammaincc
4 | from scipy import math
5 | import scipy.stats as stats
6 | from decimal import Decimal, localcontext
7 | from Peak import *
8 | import sys
9 |
def bionompvalue(x, n, p):
    """Return the upper-tail binomial probability ``1 - CDF(x; n, p)``.

    ``n`` is the number of trials and ``p`` the per-trial success
    probability, as passed straight to ``scipy.stats.binom.cdf``.
    """
    return 1 - stats.binom.cdf(x, n, p)
15 |
16 |
def poissonpvalue(x, mu):
    """Return ``1 - CDF(x; mu)`` of a Poisson distribution as a ``Decimal``.

    The CDF itself is evaluated in floating point by SciPy; only the final
    subtraction is carried out with ``Decimal`` arithmetic.
    """
    cumulative = stats.poisson.cdf(x, mu)
    return Decimal(1) - Decimal(cumulative)
22 |
23 |
24 |
def fdr(pnow, plist, prank):
    """Return the FDR estimate ``len(plist) * pnow / rank(pnow)``, capped at 1.

    Args:
        pnow: the p-value to adjust; it must occur in ``plist``.
        plist: all p-values of the experiment.
        prank: ranks aligned with ``plist`` (``prank[i]`` is the rank of
            ``plist[i]``).

    Returns:
        The adjusted value for the first entry of ``plist`` equal to ``pnow``.

    Raises:
        ValueError: if ``pnow`` does not occur in ``plist``.  (This used to
            surface as an ``UnboundLocalError`` from the final ``return``.)
    """
    total = len(plist)

    for index, pvalue in enumerate(plist):
        if pvalue == pnow:
            return min(1, total * pnow / prank[index])

    raise ValueError("p-value %r is not present in plist" % (pnow,))
41 |
42 |
def bayesfactor(locallambda, peakscore):
    """Score a peak against a local background rate.

    With ``k = peakscore - 1`` the value computed is::

        2 * (ln(gammaincc(k, lambda)) + lgamma(k) - k * ln(lambda) + lambda)

    ``gammaincc`` is SciPy's regularized upper incomplete gamma function, so
    the first two terms together are the log of the unregularized upper
    incomplete gamma function.  Working in log space avoids overflowing
    ``gamma(k)`` for large scores.

    Args:
        locallambda: local background rate (must be positive for the log).
        peakscore: observed score of the peak.

    Returns:
        The factor as a float, or ``None`` when the computation failed (for
        example a log of zero); the error and both inputs are printed.
    """
    try:
        shape = peakscore - 1
        log_upper_gamma = math.log(gammaincc(shape, locallambda)) + math.lgamma(shape)
        return 2 * (log_upper_gamma - shape * math.log(locallambda) + locallambda)

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)

    except Exception as e:
        print('got exception in Jazzlib.sta.bayesfactor: %r,' % (e,))
        print(locallambda, peakscore)
68 |
def fdr_control(chippeaks, inputpeaks, fdr):
    """Annotate ChIP peaks with an empirical FDR derived from control peaks.

    A ChIP peak "overlaps" a control peak when both share chromosome and
    ``peakpoint``.  Each overlapping ChIP score is used as a cutoff: the FDR
    at that cutoff is (overlapping control scores >= cutoff) divided by
    (ChIP peaks with score >= cutoff).  Every ChIP peak scoring strictly
    above the cutoff gets that value; cutoffs are visited in ascending order,
    so a peak ends up with the value of the largest cutoff below its score.

    Args:
        chippeaks: sequence of peak objects with ``chromosome``, ``peakpoint``,
            ``peakid`` and ``score``; iterated several times.
        inputpeaks: iterable of control peak objects with ``chromosome``,
            ``peakpoint`` and ``score``.
        fdr: accepted for interface compatibility; not used here.

    Returns:
        ``chippeaks`` itself, with ``fdr`` set on the peaks described above.
        Peaks that never exceed a cutoff are left untouched.
    """
    overlap_inputscore = dict()  # ChIP peakid -> score of the matching control peak
    cutoffs = list()

    for inputpeak in inputpeaks:
        for chippeak in chippeaks:
            if (chippeak.chromosome == inputpeak.chromosome
                    and chippeak.peakpoint == inputpeak.peakpoint):
                overlap_inputscore[chippeak.peakid] = inputpeak.score
                cutoffs.append(chippeak.score)

    for cutoff in sorted(cutoffs):
        chippeakcount = sum(1 for chippeak in chippeaks if cutoff <= chippeak.score)

        # Guard before dividing (e.g. a NaN cutoff matches no peak at all).
        if chippeakcount == 0:
            break

        inputpeakcount = sum(1 for score in overlap_inputscore.values() if cutoff <= score)
        nowfdr = float(inputpeakcount) / chippeakcount

        for peaknow in chippeaks:
            if peaknow.score > cutoff:
                peaknow.fdr = nowfdr

    return chippeaks
149 |
150 |
151 |
152 | def fdr_control2(chippeaks, inputpeaks, fdr):
153 |
154 | fdrpeakdict = dict()
155 |
156 | chipscore = list()
157 |
158 | inputscore = list()
159 |
160 | overlaptedpeak = dict()
161 |
162 | fdrth = -1
163 |
164 | # print ("check fdr")
165 |
166 | for inputpeak in inputpeaks:
167 |
168 | start = inputpeak.start
169 |
170 | end = inputpeak.end
171 |
172 | inputscore.append(inputpeak.score)
173 |
174 | for chippeak in chippeaks:
175 |
176 | if chippeak.chromosome == inputpeak.chromosome:
177 |
178 | if inputpeak.start i:
219 |
220 | peaknow.fdr = nowfdr
221 |
222 | return chippeaks
223 |
224 |
def fdr_bh(peaks):
    """Attach a rank-adjusted FDR value to every peak.

    Each peak score ``s`` is converted to ``b01 = 1 / e**(s / 2)``.  The
    values are ranked in descending order (largest ``b01`` has rank 1, ties
    share the first rank at which the value occurs) and each peak receives
    ``fdr = b01 * n / rank`` with ``n`` the number of peaks.

    NOTE(review): the textbook Benjamini-Hochberg procedure ranks ascending
    (smallest value gets rank 1); the descending order here is kept exactly
    as found -- confirm it is intended.

    Args:
        peaks: sequence of objects with a numeric ``score`` attribute.

    Returns:
        ``peaks`` itself, with ``fdr`` set on every element.
    """
    b01s = [1 / (math.e ** (peak.score / 2)) for peak in peaks]
    listlength = len(b01s)

    # First (1-based) position of each distinct value in descending order;
    # replaces a linear scan per peak.
    first_rank = dict()
    for position, value in enumerate(sorted(b01s, reverse=True)):
        first_rank.setdefault(value, position + 1)

    for peak, b01 in zip(peaks, b01s):
        peak.fdr = b01 * listlength / first_rank.get(b01, 1)

    return peaks
262 |
263 |
264 |
265 |
266 |
if __name__ == "__main__":

    # Manual smoke run: print the Bayes factor over a grid of background
    # rates (100..1900, step 100) and peak scores (2..79).
    try:
        for locallambda in range(100, 2000, 100):
            for peakscore in range(2, 80):
                bs = bayesfactor(locallambda=locallambda, peakscore=peakscore)
                print("locallambda:", locallambda, "peakscore", peakscore, "bs", bs)

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)
--------------------------------------------------------------------------------
/Jazzlib/FRegion.py.bak:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from __future__ import print_function
3 | from numpy import *
4 | from countreads import *
5 | from multiprocessing import Pool
6 | from countreads import *
7 | import timeit
8 | import sys
9 |
10 |
class KeyboardInterruptError(Exception):
    """Ordinary ``Exception`` stand-in for a keyboard interrupt.

    Not raised in the part of this module visible here; presumably meant for
    pool workers so that Ctrl-C travels back as a regular exception -- confirm
    against the callers.
    """
14 |
15 |
class FRegion:
    """Read-count summary of one BAM file, filled in by ``filter_region``.

    Constructing an instance runs the scan immediately.  Afterwards these
    attributes hold the entries of the result dict:

    * ``filted_region`` / ``thresh_hold`` / ``region_std`` / ``region_mean``
    * ``chr_total_reads`` / ``chrs_length`` / ``chruniqlength``
    * ``chrsfcount`` (from the ``'chrfrcount'`` entry)
    * ``totalreads`` / ``filterreadscount`` / ``readlengthmean``

    plus three derived values: ``adjreads`` (total minus filtered reads),
    ``countgenomelength`` and ``countgenomeuniqlength`` (sums of the
    per-chromosome lengths over ``count_chr``).
    """

    def __init__(self, bamfile, nthreads, maxinsert, jobtype, countchr=None):
        """Store the settings and run the scan.

        Args:
            bamfile: path of the BAM file to analyse.
            nthreads: worker count handed to ``filter_region``.
            maxinsert: forwarded unchanged to ``filter_region``.
            jobtype: forwarded unchanged to ``filter_region``.
            countchr: chromosome names to include; defaults to a new empty
                list for each instance.
        """
        self.bamfile = bamfile

        # A literal ``[]`` default would be one list shared by every instance
        # created without the argument.
        self.count_chr = [] if countchr is None else countchr

        self.nthreads = nthreads

        self.maxinsert = maxinsert

        self.jobtype = jobtype

        self.__filte_region()

    def filte_region(self):
        """Run ``filter_region`` and copy its results onto the instance."""
        count_chr = self.count_chr

        res = filter_region(bamfile=self.bamfile, count_chr=count_chr, nthreads=self.nthreads,
                            maxinsert=self.maxinsert, jobtype=self.jobtype)

        chrs_length = res['chrs_length']
        chruniqlength = res['chruniqlength']
        totalreads = res['totalreads']
        filterreadscount = res['filterreadscount']

        self.countgenomelength = sum(int(chrs_length[chromosome]) for chromosome in count_chr)
        self.countgenomeuniqlength = sum(int(chruniqlength[chromosome]) for chromosome in count_chr)

        self.filted_region = res['filted_region']
        self.thresh_hold = res['thresh_hold']
        self.region_std = res['region_std']
        self.region_mean = res['region_mean']
        self.chr_total_reads = res['chr_total_reads']
        self.chrs_length = chrs_length
        self.chrsfcount = res['chrfrcount']
        self.totalreads = totalreads
        self.filterreadscount = filterreadscount
        self.adjreads = totalreads - filterreadscount
        self.chruniqlength = chruniqlength
        self.readlengthmean = res['readlengthmean']

    # Name-mangled alias: __init__ keeps calling this implementation even if
    # a subclass overrides filte_region.
    __filte_region = filte_region
110 |
111 |
def filter_region(bamfile, count_chr, nthreads, maxinsert, jobtype):
    """Count reads per chromosome and flag windows with extreme read counts.

    For every name in ``count_chr`` that is a reference of ``bamfile`` the
    whole chromosome is counted with ``windowcounter``; ``chrwindow_counter``
    is then mapped over those chromosomes in a process pool.  Windows are
    1000 bp wide; a window is flagged when its count is at least
    ``mean + 10 * std`` of all window counts.

    Args:
        bamfile: path of the BAM file.
        count_chr: chromosome names to include.
        nthreads: size of the process pool.
        maxinsert: forwarded to the counting helpers.
        jobtype: forwarded to the counting helpers.

    Returns:
        dict with the keys ``filted_region`` (list of ``"chr:start-end"``),
        ``thresh_hold``, ``region_std``, ``region_mean``, ``chr_total_reads``,
        ``chrs_length``, ``chrfrcount`` (always 0), ``filterreadscount``,
        ``totalreads``, ``chruniqlength`` and ``readlengthmean``.
        ``None`` is returned when an unexpected exception was caught; the
        error is printed and the pool terminated.
    """
    pool = Pool(nthreads)

    try:
        windowsize = 1000

        # Only header information is needed, so the handle is released right
        # away instead of being left open for the rest of the scan.
        samfile = pysam.Samfile(bamfile)
        try:
            ref_length_by_name = dict(zip(samfile.references, samfile.lengths))
        finally:
            samfile.close()

        chrs_length = dict()
        chr_total_reads = dict()
        totalreads = 0

        for chromosome in count_chr:
            if chromosome not in ref_length_by_name:
                continue

            chr_length = ref_length_by_name[chromosome]
            chrs_length[chromosome] = chr_length

            chrcount = windowcounter(bamfile=bamfile, regionchromosome=chromosome,
                                     regionstart=1, regionend=int(chr_length),
                                     maxinsert=maxinsert,
                                     jobtype=jobtype)

            chr_total_reads[chromosome] = chrcount
            totalreads = chrcount + totalreads

        # 'chrmosome' (sic) is the key chrwindow_counter reads.
        pars = [dict(chrmosome=chromosome, windowsize=windowsize, chr_length=chr_length,
                     bamfile=bamfile, maxinsert=maxinsert, jobtype=jobtype)
                for chromosome, chr_length in chrs_length.items()]

        chrswindow = pool.map(chrwindow_counter, pars)

        windowcountlist = list()
        windowregionlist = list()
        chruniqlength = dict()
        chrreadlengthmean = dict()

        for nowchrcount in chrswindow:
            nowchromosome = str(nowchrcount['chromosome'])
            nowwindowcount = nowchrcount['windowcount']

            chrreadlengthmean[nowchromosome] = nowchrcount['readlengthmean']
            chruniqlength[nowchromosome] = nowchrcount['uniqcount']

            for nowscare in nowwindowcount:
                nowstart = nowscare * windowsize + 1
                # The last window is clipped to the chromosome end.
                nowend = min((nowscare + 1) * windowsize, chrs_length[nowchromosome])

                windowregionlist.append(nowchromosome + ":" + str(nowstart) + "-" + str(nowend))
                windowcountlist.append(nowwindowcount[nowscare])

        scare_mean = mean(windowcountlist)
        scare_std = std(windowcountlist)

        print("mean:", scare_mean, "std", scare_std)

        thresh_hold = scare_mean + 10 * scare_std

        flagged = [(region, count)
                   for region, count in zip(windowregionlist, windowcountlist)
                   if count >= thresh_hold]

        filted_region = [region for region, _ in flagged]
        filterreadscount = sum(count for _, count in flagged)

        # Unweighted average of the per-chromosome mean read lengths.
        lengthmeans = [chrreadlengthmean[chromosome] for chromosome in count_chr
                       if chromosome in chrreadlengthmean]
        readlengthmean = sum(lengthmeans) / len(lengthmeans)

        res = {
            'filted_region': filted_region,
            'thresh_hold': thresh_hold,
            'region_std': scare_std,
            'region_mean': scare_mean,
            'chr_total_reads': chr_total_reads,
            'chrs_length': chrs_length,
            'chrfrcount': 0,
            'filterreadscount': filterreadscount,
            'totalreads': totalreads,
            'chruniqlength': chruniqlength,
            'readlengthmean': readlengthmean,
        }

        pool.close()

        return res

    except KeyboardInterrupt:
        pool.terminate()
        print("You cancelled the program!")
        sys.exit(1)

    except Exception as e:
        # The error is reported and the function falls through, returning None.
        print('got exception in Jazzlib.FRegion.filter_region: %r, terminating the pool' % (e,))
        pool.terminate()
        print('pool is terminated')

    finally:
        pool.join()
302 | # print ('join complete')
303 |
304 |
def chrwindow_counter(par):
    """Pool worker: gather the per-chromosome statistics for one job dict.

    ``par`` carries ``'chrmosome'`` (sic), ``'windowsize'``, ``'chr_length'``,
    ``'bamfile'``, ``'maxinsert'`` and ``'jobtype'``.  The whole chromosome
    (1 .. chr_length) is passed to ``windowscarecounter``, ``uniqsitecount``
    and ``readslengthmean``.

    Returns a dict with ``'windowcount'``, ``'chromosome'``, ``'uniqcount'``
    and ``'readlengthmean'``.  Exits the process with status 1 on Ctrl-C.
    """
    try:
        name = par['chrmosome']
        step = par['windowsize']
        length = par['chr_length']
        bam = par['bamfile']
        insert_limit = par['maxinsert']
        job = par['jobtype']

        # Arguments shared by all three helpers.
        whole_chromosome = dict(bamfile=bam, regionchromosome=name,
                                regionstart=1, regionend=length,
                                maxinsert=insert_limit, jobtype=job)

        per_window = windowscarecounter(windowsize=step, **whole_chromosome)
        unique_sites = uniqsitecount(**whole_chromosome)
        mean_read_length = readslengthmean(**whole_chromosome)

        return {
            'windowcount': per_window,
            'chromosome': name,
            'uniqcount': unique_sites,
            'readlengthmean': mean_read_length,
        }

    except KeyboardInterrupt:
        print("You cancelled the program!")
        sys.exit(1)
350 |
--------------------------------------------------------------------------------
/Jazzlib/FRegion.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from numpy import *
4 | from .countreads import *
5 | from multiprocessing import Pool
6 | from .countreads import *
7 | import timeit
8 | import sys
9 |
10 |
class KeyboardInterruptError(Exception):
    """Ordinary ``Exception`` stand-in for a keyboard interrupt.

    Not raised in the part of this module visible here; presumably meant for
    pool workers so that Ctrl-C travels back as a regular exception -- confirm
    against the callers.
    """
14 |
15 |
class FRegion:
    """Read-count summary of one BAM file, filled in by ``filter_region``.

    Constructing an instance runs the scan immediately.  Afterwards these
    attributes hold the entries of the result dict:

    * ``filted_region`` / ``thresh_hold`` / ``region_std`` / ``region_mean``
    * ``chr_total_reads`` / ``chrs_length`` / ``chruniqlength``
    * ``chrsfcount`` (from the ``'chrfrcount'`` entry)
    * ``totalreads`` / ``filterreadscount`` / ``readlengthmean``

    plus three derived values: ``adjreads`` (total minus filtered reads),
    ``countgenomelength`` and ``countgenomeuniqlength`` (sums of the
    per-chromosome lengths over ``count_chr``).
    """

    def __init__(self, bamfile, nthreads, maxinsert, jobtype, countchr=None):
        """Store the settings and run the scan.

        Args:
            bamfile: path of the BAM file to analyse.
            nthreads: worker count handed to ``filter_region``.
            maxinsert: forwarded unchanged to ``filter_region``.
            jobtype: forwarded unchanged to ``filter_region``.
            countchr: chromosome names to include; defaults to a new empty
                list for each instance.
        """
        self.bamfile = bamfile

        # A literal ``[]`` default would be one list shared by every instance
        # created without the argument.
        self.count_chr = [] if countchr is None else countchr

        self.nthreads = nthreads

        self.maxinsert = maxinsert

        self.jobtype = jobtype

        self.__filte_region()

    def filte_region(self):
        """Run ``filter_region`` and copy its results onto the instance."""
        count_chr = self.count_chr

        res = filter_region(bamfile=self.bamfile, count_chr=count_chr, nthreads=self.nthreads,
                            maxinsert=self.maxinsert, jobtype=self.jobtype)

        chrs_length = res['chrs_length']
        chruniqlength = res['chruniqlength']
        totalreads = res['totalreads']
        filterreadscount = res['filterreadscount']

        self.countgenomelength = sum(int(chrs_length[chromosome]) for chromosome in count_chr)
        self.countgenomeuniqlength = sum(int(chruniqlength[chromosome]) for chromosome in count_chr)

        self.filted_region = res['filted_region']
        self.thresh_hold = res['thresh_hold']
        self.region_std = res['region_std']
        self.region_mean = res['region_mean']
        self.chr_total_reads = res['chr_total_reads']
        self.chrs_length = chrs_length
        self.chrsfcount = res['chrfrcount']
        self.totalreads = totalreads
        self.filterreadscount = filterreadscount
        self.adjreads = totalreads - filterreadscount
        self.chruniqlength = chruniqlength
        self.readlengthmean = res['readlengthmean']

    # Name-mangled alias: __init__ keeps calling this implementation even if
    # a subclass overrides filte_region.
    __filte_region = filte_region
110 |
111 |
def filter_region(bamfile, count_chr, nthreads, maxinsert, jobtype):
    """Count reads per chromosome and flag windows with extreme read counts.

    For every name in ``count_chr`` that is a reference of ``bamfile`` the
    whole chromosome is counted with ``windowcounter``; ``chrwindow_counter``
    is then mapped over those chromosomes in a process pool.  Windows are
    1000 bp wide; a window is flagged when its count is at least
    ``mean + 10 * std`` of all window counts.

    Args:
        bamfile: path of the BAM file.
        count_chr: chromosome names to include.
        nthreads: size of the process pool.
        maxinsert: forwarded to the counting helpers.
        jobtype: forwarded to the counting helpers.

    Returns:
        dict with the keys ``filted_region`` (list of ``"chr:start-end"``),
        ``thresh_hold``, ``region_std``, ``region_mean``, ``chr_total_reads``,
        ``chrs_length``, ``chrfrcount`` (always 0), ``filterreadscount``,
        ``totalreads``, ``chruniqlength`` and ``readlengthmean``.
        ``None`` is returned when an unexpected exception was caught; the
        error is printed and the pool terminated.
    """
    pool = Pool(nthreads)

    try:
        windowsize = 1000

        # Only header information is needed, so the handle is released right
        # away instead of being left open for the rest of the scan.
        samfile = pysam.Samfile(bamfile)
        try:
            ref_length_by_name = dict(zip(samfile.references, samfile.lengths))
        finally:
            samfile.close()

        chrs_length = dict()
        chr_total_reads = dict()
        totalreads = 0

        for chromosome in count_chr:
            if chromosome not in ref_length_by_name:
                continue

            chr_length = ref_length_by_name[chromosome]
            chrs_length[chromosome] = chr_length

            chrcount = windowcounter(bamfile=bamfile, regionchromosome=chromosome,
                                     regionstart=1, regionend=int(chr_length),
                                     maxinsert=maxinsert,
                                     jobtype=jobtype)

            chr_total_reads[chromosome] = chrcount
            totalreads = chrcount + totalreads

        # 'chrmosome' (sic) is the key chrwindow_counter reads.
        pars = [dict(chrmosome=chromosome, windowsize=windowsize, chr_length=chr_length,
                     bamfile=bamfile, maxinsert=maxinsert, jobtype=jobtype)
                for chromosome, chr_length in chrs_length.items()]

        chrswindow = pool.map(chrwindow_counter, pars)

        windowcountlist = list()
        windowregionlist = list()
        chruniqlength = dict()
        chrreadlengthmean = dict()

        for nowchrcount in chrswindow:
            nowchromosome = str(nowchrcount['chromosome'])
            nowwindowcount = nowchrcount['windowcount']
            nowreadslengthmean = nowchrcount['readlengthmean']

            print(nowchromosome, nowreadslengthmean)

            chrreadlengthmean[nowchromosome] = nowreadslengthmean
            chruniqlength[nowchromosome] = nowchrcount['uniqcount']

            for nowscare in nowwindowcount:
                nowstart = nowscare * windowsize + 1
                # The last window is clipped to the chromosome end.
                nowend = min((nowscare + 1) * windowsize, chrs_length[nowchromosome])

                windowregionlist.append(nowchromosome + ":" + str(nowstart) + "-" + str(nowend))
                windowcountlist.append(nowwindowcount[nowscare])

        scare_mean = mean(windowcountlist)
        scare_std = std(windowcountlist)

        print("mean:", scare_mean, "std", scare_std)

        thresh_hold = scare_mean + 10 * scare_std

        flagged = [(region, count)
                   for region, count in zip(windowregionlist, windowcountlist)
                   if count >= thresh_hold]

        filted_region = [region for region, _ in flagged]
        filterreadscount = sum(count for _, count in flagged)

        # Unweighted average of the per-chromosome mean read lengths.
        lengthmeans = [chrreadlengthmean[chromosome] for chromosome in count_chr
                       if chromosome in chrreadlengthmean]
        readlengthmean = sum(lengthmeans) / len(lengthmeans)

        res = {
            'filted_region': filted_region,
            'thresh_hold': thresh_hold,
            'region_std': scare_std,
            'region_mean': scare_mean,
            'chr_total_reads': chr_total_reads,
            'chrs_length': chrs_length,
            'chrfrcount': 0,
            'filterreadscount': filterreadscount,
            'totalreads': totalreads,
            'chruniqlength': chruniqlength,
            'readlengthmean': readlengthmean,
        }

        pool.close()

        return res

    except KeyboardInterrupt:
        pool.terminate()
        print("You cancelled the program!")
        sys.exit(1)

    except Exception as e:
        # The error is reported and the function falls through, returning None.
        print('got exception in Jazzlib.FRegion.filter_region: %r, terminating the pool' % (e,))
        pool.terminate()
        print('pool is terminated')

    finally:
        pool.join()
304 | # print ('join complete')
305 |
306 |
def chrwindow_counter(par):
    """Pool worker: gather the per-chromosome statistics for one job dict.

    ``par`` carries ``'chrmosome'`` (sic), ``'windowsize'``, ``'chr_length'``,
    ``'bamfile'``, ``'maxinsert'`` and ``'jobtype'``.  The whole chromosome
    (1 .. chr_length) is passed to ``windowscarecounter``, ``uniqsitecount``
    and ``readslengthmean``.

    Returns a dict with ``'windowcount'``, ``'chromosome'``, ``'uniqcount'``
    and ``'readlengthmean'``.  Exits the process with status 1 on Ctrl-C.
    """
    try:
        name = par['chrmosome']
        step = par['windowsize']
        length = par['chr_length']
        bam = par['bamfile']
        insert_limit = par['maxinsert']
        job = par['jobtype']

        # Arguments shared by all three helpers.
        whole_chromosome = dict(bamfile=bam, regionchromosome=name,
                                regionstart=1, regionend=length,
                                maxinsert=insert_limit, jobtype=job)

        per_window = windowscarecounter(windowsize=step, **whole_chromosome)
        unique_sites = uniqsitecount(**whole_chromosome)
        mean_read_length = readslengthmean(**whole_chromosome)

        summary = {
            'windowcount': per_window,
            'chromosome': name,
            'uniqcount': unique_sites,
            'readlengthmean': mean_read_length,
        }

        # Debug trace emitted from the worker process.
        print("in chrwindow_counter", mean_read_length)

        return summary

    except KeyboardInterrupt:
        print("You cancelled the program!")
        sys.exit(1)
355 |
--------------------------------------------------------------------------------
/Jazzlib/Jazz.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | import os
4 | import sys
5 | from optparse import OptionParser
6 | import logging
7 | from Jazzlib.FRegion import *
8 | from Jazzlib.localmax import *
9 | from Jazzlib.normalize_ratio import *
10 | from Jazzlib.countreads import *
11 | from Jazzlib.Peak import *
12 | from Jazzlib.sta import *
13 | from Jazzlib.jazzio import *
14 | from Jazzlib.randombg import *
15 | from Jazzlib.hotspotsscan import *
16 | from Jazzlib.Hotspot import *
17 |
18 |
def main():
    """Parse and validate the command line, then run the matching workflow.

    ``nocontrol`` is used when no control file was given (``controlfile`` is
    the sentinel string ``"no"``), ``withcontrol`` otherwise.
    """
    opt = opt_check(get_optparser())

    workflow = nocontrol if opt.controlfile == "no" else withcontrol

    workflow(opt)
30 |
31 |
def withcontrol(opt):
    """Run the ChIP-versus-control workflow for the parsed options ``opt``.

    Builds an ``FRegion`` for the data and for the control BAM, derives the
    random-background threshold, collects local maxima and computes the
    normalisation ratio and the control read density.  The peak-scanning step
    (``peakscan_control``) and the bedgraph output (``peakbedgraphswriter``)
    are currently disabled, so nothing is written or returned.
    """
    try:
        chip_bam, control_bam = opt.datafile, opt.controlfile

        # Settings shared by the FRegion constructor and get_all_localmax.
        common = dict(jobtype=opt.jobtype, countchr=opt.countchr,
                      maxinsert=opt.maxinsert, nthreads=opt.nthreads)

        # Read for the disabled peak-scanning step; unused for now.
        bayesfactorthreshold, samplename, fdr = opt.threshold, opt.samplename, opt.fdr

        chipfregion = FRegion(bamfile=chip_bam, **common)

        inputfregion = FRegion(bamfile=control_bam, **common)

        rndth = randombg(fregion=chipfregion, nthreads=common['nthreads'],
                         maxinsert=common['maxinsert'])

        localmax = get_all_localmax(bamfile=chip_bam, fregion=chipfregion, rndth=rndth, **common)

        ratio = normalize_ratio_input2(fregegion_input=inputfregion, fregion_chip=chipfregion)

        # Control reads outside the filtered windows, per counted base.
        inputadjreads = inputfregion.totalreads - inputfregion.filterreadscount

        gloablumbda = inputadjreads / inputfregion.countgenomelength

        windowscare = 1000000

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)
97 |
98 |
def nocontrol(opt):
    """Run the workflow without a control sample for the parsed options ``opt``.

    Builds an ``FRegion`` for the data BAM, scans it for hotspots with
    ``hotspotsscan_withoutcontrol`` (window size 100000) and hands the result
    to ``hotspotsbedswriter`` together with the sample name.  The older
    local-maximum / ``peakscan_withoutcontrol`` path is disabled.
    """
    try:
        chip_bam = opt.datafile
        job = opt.jobtype
        chromosomes = opt.countchr
        insert_limit = opt.maxinsert

        print("maxinsert", insert_limit)

        workers = opt.nthreads
        bf_cutoff = opt.threshold
        fdr = opt.fdr  # read but not used by this workflow
        label = opt.samplename

        chipfregion = FRegion(bamfile=chip_bam, jobtype=job, countchr=chromosomes,
                              nthreads=workers, maxinsert=insert_limit)

        hotspots = hotspotsscan_withoutcontrol(file=chip_bam, maxinsert=insert_limit,
                                               windowscare=100000, countchr=chromosomes,
                                               bayesfactorthreshold=bf_cutoff, nthreads=workers,
                                               fregion=chipfregion, jobtype=job)

        hotspotsbedswriter(hotspots=hotspots, samplename=label)

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)
169 |
170 |
def get_optparser():
    """Build the command-line parser for Jazz.

    Returns:
        optparse.OptionParser: parser with the data/control files, sample
        name, thresholds, thread count, job type and output switches.
        Validation of the parsed values is done separately in ``opt_check``.
    """
    # The example uses the flags that actually exist (-d, and the required -j).
    usage = """usage: %prog <-d datafile> <-j jobtype> [-n name] [options]
Example %prog -d nh_sample1.bam -j nhsingle -n sample1
"""

    description = "%prog Non-Histone protein binding site identification"

    jazzopt = OptionParser(version="%prog 0.1 20140521", description=description, usage=usage, add_help_option=False)

    jazzopt.add_option("-h", "--help", action="help", help="show this help message and exit.")

    jazzopt.add_option("-d", "--data", dest="datafile", type="string", help='data file, should be sorted bam format')

    jazzopt.add_option("-c", "--control", dest="controlfile", type="string", help='control(input) file, should be sorted bam format', default="no")

    # %default keeps the help text in step with the real default value.
    jazzopt.add_option("-n", "--name", dest="samplename", help="NH sample name, default=%default", type="string", default="DH_sample")

    jazzopt.add_option("-t", "--threshold", dest="threshold", type="float", help="peak threshold, default=%default", default=6.0)

    jazzopt.add_option("--threads", dest="nthreads", type="int", help="threads number or cpu number, default=%default", default=4)

    jazzopt.add_option("-w", "--wig", action="store_true", help="whether out put wiggle file, default=False", default=False)

    jazzopt.add_option("-f", "--fdr", dest="fdr", type="float", help="using FDR as threshold", default=0.1)

    jazzopt.add_option("-x", "--excludechr", dest="excludechr", help="Don't count those chromosome, strongly suggest skip mitochondrion and chloroplast, example='-x ChrM,ChrC'")

    jazzopt.add_option("-g", "--gff", action="store_true", help="whether out put gff file, default=False", default=False)

    jazzopt.add_option("-j", "--jobtype", dest="jobtype", type="string", help="job type, such as nhpaired or nhsingle")

    jazzopt.add_option("-m", "--maxinsert", dest="maxinsert", type="int", help="when you use paired library, please set the maxinsert size", default=80)

    jazzopt.add_option("--pe", dest="pe", action="store_true", help="paired-end reads or single-end reads, default=False (single end)", default=False)

    return jazzopt
208 |
209 |
def _require_bam_with_index(path):
    """Exit with status 1 unless ``path`` and ``path + '.bai'`` both exist."""
    if not os.path.isfile(path):
        logging.error("No such file: %s" % path)
        sys.exit(1)

    indexfile = path + '.bai'

    if not os.path.isfile(indexfile):
        logging.error("Missing bam index file: %s" % indexfile)
        sys.exit(1)


def opt_check(jazzopt):
    """Parse the command line with ``jazzopt`` and validate the result.

    Checks that the data BAM (and the control BAM, when given) exists with
    its ``.bai`` index, that the thread count is positive, that the job type
    is ``nhsingle`` or ``nhpaired`` and that ``maxinsert`` is not negative.
    Any failure is logged and the process exits with status 1.

    Args:
        jazzopt: the ``OptionParser`` built by ``get_optparser``.

    Returns:
        The parsed options, extended with ``countchr``: the reference names
        of the data BAM minus the names listed in ``--excludechr``.
    """
    (opt, args) = jazzopt.parse_args()

    if not opt.datafile:
        logging.error("you need input a bam file, '-d nh_sample1.bam -j nhsingle'")
        jazzopt.print_help()
        sys.exit(1)

    _require_bam_with_index(opt.datafile)

    # "no" is the sentinel default meaning "no control sample".
    if not opt.controlfile == "no":
        _require_bam_with_index(opt.controlfile)

    if not (opt.nthreads > 0):
        logging.error("threads number should >=1")
        jazzopt.print_help()
        sys.exit(1)

    if opt.jobtype not in ('nhsingle', 'nhpaired'):
        logging.error("missing or wrong jobtype")
        jazzopt.print_help()
        sys.exit(1)

    if opt.maxinsert < 0:
        logging.error("maxinsert size error")
        jazzopt.print_help()
        sys.exit(1)

    # Only the header is needed; close the handle once the names are read.
    samfile = pysam.Samfile(opt.datafile)
    try:
        sam_ref = samfile.references
    finally:
        samfile.close()

    opt.countchr = list(sam_ref)

    if opt.excludechr:
        for chri in opt.excludechr.split(','):
            if chri not in sam_ref:
                print(chri, 'not in the %s file' % opt.datafile)
                print("try to selcet exclude Chr from", end=" : ")
                # sep= has no effect on a single argument, so join explicitly.
                print(",".join(sam_ref))
                jazzopt.print_help()
                sys.exit(1)

            # Build a new list instead of deleting while iterating.
            opt.countchr = [name for name in opt.countchr if name != chri]

    return opt
343 |
if __name__ == "__main__":

    # Script entry point: a Ctrl-C that reaches this level ends the run
    # quietly with exit status 0.
    try:
        main()

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        raise SystemExit(0)
355 |
356 |
--------------------------------------------------------------------------------
/Jazzlib/Jazz.py.bak:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from __future__ import print_function
3 | import os
4 | import sys
5 | from optparse import OptionParser
6 | import logging
7 | from Jazzlib.FRegion import *
8 | from Jazzlib.localmax import *
9 | from Jazzlib.normalize_ratio import *
10 | from Jazzlib.countreads import *
11 | from Jazzlib.Peak import *
12 | from Jazzlib.sta import *
13 | from Jazzlib.jazzio import *
14 | from Jazzlib.randombg import *
15 | from Jazzlib.hotspotsscan import *
16 | from Jazzlib.Hotspot import *
17 |
18 |
def main():
    """Parse the command line and dispatch to the matching pipeline.

    ``nocontrol`` is used when the control file option still holds the
    sentinel string "no"; otherwise ``withcontrol`` is used.
    """
    options = opt_check(get_optparser())
    pipeline = nocontrol if options.controlfile == "no" else withcontrol
    pipeline(options)
30 |
31 |
def withcontrol(opt):
    """Run the data-versus-control branch of the pipeline.

    Builds an FRegion for the data and for the control BAM file, then calls
    randombg, get_all_localmax and normalize_ratio_input2 and derives a
    genome-wide read density from the control FRegion.

    NOTE(review): the peak scanning / output step that consumed these values
    is disabled in this backup file, so every result computed here is
    discarded and nothing is written or returned.

    A Ctrl-C is reported on stderr and ends the process with exit status 0.
    """
    try:
        chip_bam = opt.datafile
        control_bam = opt.controlfile
        insert_limit = opt.maxinsert
        workers = opt.nthreads

        # Read from the options but unused while the peak scan is disabled.
        bayesfactorthreshold = opt.threshold
        samplename = opt.samplename
        fdr = opt.fdr

        # Keyword arguments shared by both FRegion objects and the local-max scan.
        common = dict(jobtype=opt.jobtype, countchr=opt.countchr,
                      nthreads=workers, maxinsert=insert_limit)

        chipfregion = FRegion(bamfile=chip_bam, **common)
        inputfregion = FRegion(bamfile=control_bam, **common)

        rndth = randombg(fregion=chipfregion, nthreads=workers, maxinsert=insert_limit)

        localmax = get_all_localmax(bamfile=chip_bam, fregion=chipfregion, rndth=rndth, **common)

        ratio = normalize_ratio_input2(fregegion_input=inputfregion, fregion_chip=chipfregion)

        # Control reads left after filtering, divided by the counted genome length.
        inputadjreads = inputfregion.totalreads - inputfregion.filterreadscount
        genomelength = inputfregion.countgenomelength
        gloablumbda = inputadjreads/genomelength

        windowscare = 1000000

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)
97 |
98 |
def nocontrol(opt):
    """Run the hotspot scan for a sample that has no control (input) file.

    Builds an FRegion for the data BAM file, scans it with
    hotspotsscan_withoutcontrol using a fixed ``windowscare`` of 100000 and
    passes the result to hotspotsbedswriter.  A Ctrl-C is reported on stderr
    and ends the process with exit status 0.
    """
    try:
        bam_path = opt.datafile
        job = opt.jobtype
        chromosomes = opt.countchr
        insert_limit = opt.maxinsert

        print ("maxinsert",insert_limit)

        workers = opt.nthreads
        bayes_cutoff = opt.threshold
        fdr = opt.fdr  # read from the options but not used below
        sample = opt.samplename

        chip_region = FRegion(bamfile=bam_path, jobtype=job, countchr=chromosomes,
                              nthreads=workers, maxinsert=insert_limit)

        scan_settings = {
            'file': bam_path,
            'maxinsert': insert_limit,
            'windowscare': 100000,
            'countchr': chromosomes,
            'bayesfactorthreshold': bayes_cutoff,
            'nthreads': workers,
            'fregion': chip_region,
            'jobtype': job,
        }
        found = hotspotsscan_withoutcontrol(**scan_settings)

        hotspotsbedswriter(hotspots=found, samplename=sample)

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)
169 |
170 |
def get_optparser():
    """Build the command-line option parser for Jazz.

    Only registers the options; parsing and validation are done by the
    caller.

    Returns:
        optparse.OptionParser: parser with the data/control file options,
        thresholds, thread count, output switches and library settings.
    """
    # The example uses -d (the real data-file flag) and -j, which the
    # option validation treats as mandatory.
    usage = ("usage: %prog <-d datafile> [-n name] [options]\n"
             "Example %prog -d nh_sample1.bam -j nhsingle -n sample1\n")

    description = "%prog Non-Histone protein banding site identification"

    # add_help_option=False so that -h/--help can be registered with a custom help text.
    jazzopt = OptionParser(version="%prog 0.1 20140521", description=description, usage=usage, add_help_option=False)

    jazzopt.add_option("-h", "--help", action="help", help="show this help message and exit.")

    jazzopt.add_option("-d", "--data", dest="datafile", type="string", help='data file, should be sorted bam format')

    # "no" is the sentinel meaning that no control file was supplied.
    jazzopt.add_option("-c", "--control", dest="controlfile", type="string", help='control(input) file, should be sorted bam format', default="no")

    # The help text now states the default that is actually applied.
    jazzopt.add_option("-n", "--name", dest="samplename", help="NH sample name, default=DH_sample", type="string", default="DH_sample")

    jazzopt.add_option("-t", "--threshold", dest="threshold", type="float", help="peak threshold, default=6.0", default=6.0)

    jazzopt.add_option("--threads", dest="nthreads", type="int", help="threads number or cpu number, default=4", default=4)

    jazzopt.add_option("-w", "--wig", action="store_true", help="whether out put wiggle file, default=False", default=False)

    jazzopt.add_option("-f", "--fdr", dest="fdr", type="float", help="using FDR as threshold", default=0.1)

    jazzopt.add_option("-x", "--excludechr", dest="excludechr", help="Don't count those chromosome, strongly suggest skip mitochondrion and chloroplast, example='-x ChrM,ChrC'")

    jazzopt.add_option("-g", "--gff", action="store_true", help="whether out put gff file, default=False", default=False)

    jazzopt.add_option("-j", "--jobtype", dest="jobtype", type="string", help="job type, such as nhpaired or nhsingle")

    jazzopt.add_option("-m", "--maxinsert", dest="maxinsert", type="int", help="when you use paired library, please set the maxinsert size", default=80)

    jazzopt.add_option("--pe", dest="pe", action="store_true", help="paired-end reads or single-end reads, default=False (single end)", default=False)

    return jazzopt
208 |
209 |
def opt_check(jazzopt):
    """Parse the command line and validate the resulting options.

    Args:
        jazzopt: the option parser whose ``parse_args`` and ``print_help``
            are used (as built by ``get_optparser``).

    Returns:
        The parsed options object with an added ``countchr`` attribute: the
        reference names from the data BAM header, minus every name listed in
        ``--excludechr``.

    Exits with status 1 when the data file, its ``.bai`` index, the control
    file or its index is missing, when the thread count is not positive,
    when the job type is not ``nhsingle``/``nhpaired``, when ``maxinsert`` is
    negative, or when an excluded chromosome is not in the BAM header.
    """
    (opt, args) = jazzopt.parse_args()

    if not opt.datafile:
        logging.error("you need input a bam file, '-d nh_sample1.bam -j nhsingle'")
        jazzopt.print_help()
        sys.exit(1)

    if not os.path.isfile(opt.datafile):
        logging.error("No such file: %s" % opt.datafile)
        sys.exit(1)

    dataindexfile = opt.datafile + '.bai'
    if not os.path.isfile(dataindexfile):
        logging.error("Missing bam index file: %s" % dataindexfile)
        sys.exit(1)

    # "no" is the sentinel default meaning that no control file was given.
    if opt.controlfile != "no":
        if not os.path.isfile(opt.controlfile):
            logging.error("No such file: %s" % opt.controlfile)
            sys.exit(1)

        controlindexfile = opt.controlfile + '.bai'
        if not os.path.isfile(controlindexfile):
            logging.error("Missing bam index file: %s" % controlindexfile)
            sys.exit(1)

    if not (opt.nthreads > 0):
        logging.error("threads number should >=1")
        jazzopt.print_help()
        sys.exit(1)

    # Both accepted job types share the same maxinsert check.
    if opt.jobtype not in ('nhsingle', 'nhpaired'):
        logging.error("missing or wrong jobtype")
        jazzopt.print_help()
        sys.exit(1)

    if opt.maxinsert < 0:
        logging.error("maxinsert size error")
        jazzopt.print_help()
        sys.exit(1)

    # Only the header is needed; close the handle as soon as it has been read.
    samfile = pysam.Samfile(opt.datafile)
    try:
        sam_ref = samfile.references
    finally:
        samfile.close()

    opt.countchr = list(sam_ref)

    if opt.excludechr:
        excludchr = opt.excludechr.split(',')

        for chri in excludchr:
            if chri not in sam_ref:
                print (chri,'not in the %s file' % opt.datafile)
                print ("try to selcet exclude Chr from", end =" : ")
                print (sam_ref, sep=",")
                jazzopt.print_help()
                sys.exit(1)

        # Build the filtered list in one pass instead of deleting from the
        # list while iterating over it.
        opt.countchr = [name for name in sam_ref if name not in excludchr]

    return opt
343 |
if __name__ == "__main__":
    # Script entry point: run the pipeline; a Ctrl-C is reported on stderr
    # and ends the process with exit status 0.
    try:
        main()
    except KeyboardInterrupt:
        print("User interrupt", file=sys.stderr)
        raise SystemExit(0)
355 |
356 |
--------------------------------------------------------------------------------
/Jazzlib/bgcount.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | import pysam
4 | from numpy import *
5 | from multiprocessing import Pool
6 | import random as rnd
7 | from .kernel import *
8 | import sys
9 | from . import readscounter
10 |
11 |
class KeyboardInterruptError(Exception):
    """Ordinary exception that the pool workers in this module raise in
    place of a KeyboardInterrupt they have caught."""
15 |
16 |
def nhnoncontrol(uniqueratio, threshold, kernellength, nthreads=4):
    """Return a simulated background cutoff (no control sample).

    Runs 200 background simulations through sim_replicate_nthreads and
    returns ``mean + std * threshold`` of the averaged result.
    """
    simulated = sim_replicate_nthreads(
        run_times=200,
        uniqueratio=uniqueratio,
        kernellength=kernellength,
        threshold=threshold,
        nthreads=nthreads,
    )
    return simulated['mean'] + simulated['std'] * threshold
25 |
26 |
def dhnoncontrol(uniqueratio, threshold, kernellength, nthreads=4):
    """Return a simulated background cutoff (no control sample).

    Runs 200 background simulations through sim_replicate_nthreads and
    returns ``mean + std * threshold`` of the averaged result.
    """
    simulated = sim_replicate_nthreads(
        run_times=200,
        uniqueratio=uniqueratio,
        kernellength=kernellength,
        threshold=threshold,
        nthreads=nthreads,
    )
    return simulated['mean'] + simulated['std'] * threshold
35 |
36 |
def nhcontrol(bamfile, chromosome, paired, chrlength, ultratio, filted_region,maxinsert, kernellength = 600, threshold = 4):
    """
    Compute a background cutoff from the smoothed read counts of one chromosome.

    Reads are counted over ``chromosome:1-chrlength``, every count is capped
    at ``filted_region.threshold/100``, the counts are correlated with the
    kernel from ``smooth_kernel(kernellength)`` and then with the
    single-element sequence ``[ultratio]``.  The returned cutoff is
    ``mean + cap * std`` of that final signal.

    NOTE(review): the ``threshold`` argument is never used -- the original
    code overwrote it with ``filted_region.threshold/100`` before its first
    use, so the cap doubles as the std multiplier.  Behaviour is kept as-is;
    confirm whether the multiplier was meant to be the caller's ``threshold``.
    """
    whole_chromosome = chromosome + ':' + str(1) + '-' + str(chrlength)

    readscount = readscounter.nhreadscounter(bamfile, whole_chromosome, paired, maxinsert=maxinsert)

    kernel = smooth_kernel(kernellength)

    # Kernel weights ordered by their key.
    kernel_score = [kernel[offset] for offset in sorted(kernel)]

    cap = filted_region.threshold/100

    for site in readscount:
        if readscount[site] > cap:
            readscount[site] = cap

    smoothed = correlate(array(readscount), kernel_score)

    scaled = correlate(smoothed, [ultratio])

    background_mean = scaled.mean()
    background_std = scaled.std()

    return background_mean + cap * background_std
89 |
90 |
def nhuniquerate(bamfile, chromosome, paired, fregion, regionstart=1, regionend = -1, maxinsert = 100000):
    """Return ``len(read counts) / (regionend - regionstart)`` for a region.

    Args:
        bamfile: path of the BAM file to read.
        chromosome: reference name of the region.
        paired: forwarded to ``readscounter.nhreadscounter``.
        fregion: accepted for interface compatibility; not used here.
        regionstart: first position of the region.
        regionend: last position; ``-1`` means "use the length of
            ``chromosome`` from the BAM header".
        maxinsert: forwarded to ``readscounter.nhreadscounter``.

    Returns:
        float: number of entries returned by the reads counter (presumably
        one per position that has reads -- confirm against readscounter)
        divided by the region length.
    """
    if regionend == -1:
        # The header is only needed to look up the chromosome length; close
        # the handle afterwards instead of leaking it.
        samfile = pysam.Samfile(bamfile)
        try:
            for name, length in zip(samfile.references, samfile.lengths):
                if name == chromosome:
                    regionend = length
        finally:
            samfile.close()

    region = chromosome + ':' + str(regionstart) + '-' + str(regionend)

    region_length = regionend - regionstart

    nhreadscount = readscounter.nhreadscounter(bamfile, region, paired, maxinsert=maxinsert)

    return float(len(nhreadscount)) / region_length
120 |
121 |
def dhuniquerate(bamfile, chromosome, regionstart=1, regionend=-1):
    """
    Count unique Rate in a region

    Args:
        bamfile: path of the BAM file to read.
        chromosome: reference name of the region.
        regionstart: first position of the region.
        regionend: last position; ``-1`` means "use the length of
            ``chromosome`` from the BAM header".

    Returns:
        float: number of entries returned by ``readscounter.dhreadscounter``
        for the region divided by ``regionend - regionstart``.
    """
    if regionend == -1:
        # The header is only needed to look up the chromosome length; close
        # the handle afterwards instead of leaking it.
        samfile = pysam.Samfile(bamfile)
        try:
            for name, length in zip(samfile.references, samfile.lengths):
                if name == chromosome:
                    regionend = length
        finally:
            samfile.close()

    region = chromosome + ':' + str(regionstart) + '-' + str(regionend)

    region_length = regionend - regionstart

    dhreadscount = readscounter.dhreadscounter(bamfile, region)

    return float(len(dhreadscount)) / region_length
156 |
157 |
def ultratio(chrlength, uniqueratio, chrtotalreads, frcount):
    """
    Return ``chrlength * uniqueratio / (chrtotalreads - frcount)``.
    """
    remaining_reads = chrtotalreads - frcount
    return chrlength * uniqueratio / remaining_reads
165 |
166 |
def sim_replicate_nthreads(run_times=1000, uniqueratio=1, kernellength = 600, threshold = 4, nthreads = 2):
    """Average the statistics of ``run_times`` background simulations.

    Each simulation is run by ``sim_bg_thread_worker`` in a process pool of
    ``nthreads`` workers.

    Returns:
        dict: ``{'mean': ..., 'std': ...}`` -- the per-run ``rand_mean`` and
        ``rand_std`` values, each averaged over ``run_times``.

    Raises:
        Any exception raised while mapping or averaging is re-raised after
        the pool has been terminated.  Previously it was only printed and
        the function returned ``None``, which made callers fail later with
        an unrelated error.  A KeyboardInterrupt ends the process with
        exit status 1.
    """
    pars = [
        {'uniqueratio': uniqueratio, 'kernellength': kernellength, 'threshold': threshold}
        for _ in range(run_times)
    ]

    pool = Pool(nthreads)

    try:
        randomthresh = pool.map(sim_bg_thread_worker, pars)

        summean = sum((randscore['rand_mean'] for randscore in randomthresh), 0.0)
        sumstd = sum((randscore['rand_std'] for randscore in randomthresh), 0.0)

        outscore = {'mean': summean/run_times, 'std': sumstd/run_times}

        pool.close()

        return outscore

    except KeyboardInterrupt:
        pool.terminate()
        print ("You cancelled the program!")
        sys.exit(1)

    except Exception as e:
        print ('got exception: %r, terminating the pool' % (e,))
        pool.terminate()
        print ('pool is terminated')
        raise

    finally:
        # Runs after close() on success and after terminate() on failure.
        pool.join()
241 |
242 |
def sim_bg_thread_worker(par):
    """Simulate one random background and return its smoothed mean and std.

    A simulated genome of 100000 positions is used.  ``int(100000 *
    uniqueratio)`` distinct positions are drawn, the same number of reads is
    then placed on those positions at random (with replacement), and the
    resulting counts are correlated with the weights of
    ``smooth_kernel(length=kernellength)``.

    Args:
        par: dict with the keys ``'uniqueratio'`` and ``'kernellength'``.
            A ``'threshold'`` key may be present but does not influence the
            result.

    Returns:
        dict: ``{'rand_mean': ..., 'rand_std': ...}`` of the smoothed signal.

    Raises:
        KeyboardInterruptError: in place of a KeyboardInterrupt.
    """
    try:
        uniqueratio = par['uniqueratio']
        kernellength = par['kernellength']

        kernel = smooth_kernel(length=kernellength)

        # Kernel weights ordered by their key.
        kernel_score = [kernel[offset] for offset in sorted(kernel)]

        sim_genome_size = int(1e5)
        total_reads = int(sim_genome_size * uniqueratio)

        sim_uniqsite = rnd.sample(range(sim_genome_size), total_reads)

        rand_reads_count = [0] * sim_genome_size

        for _ in range(total_reads):
            # choice() cannot index past the end, unlike int(uniform(0, n)),
            # whose upper endpoint may be returned.
            site = rnd.choice(sim_uniqsite)
            rand_reads_count[site] = rand_reads_count[site] + 1.0

        smoothed_result = correlate(array(rand_reads_count), kernel_score)

        return {
            'rand_mean': smoothed_result.mean(),
            'rand_std': smoothed_result.std(),
        }

    except KeyboardInterrupt:
        raise KeyboardInterruptError()
319 |
320 |
def get_bpc(bamfile, hotspots, jobtype, filted_region, nthreads, maxinsert = 100000):
    """Return the average read count per base pair over ``hotspots``.

    Every hotspot is counted by ``bpc_runner`` in a process pool of
    ``nthreads`` workers; the summed counts are divided by the summed
    hotspot lengths (``end - start + 1`` each).

    Raises:
        Any exception raised while counting or averaging is re-raised after
        the pool has been terminated.  Previously it was only printed and
        the function returned ``None``.  A KeyboardInterrupt ends the
        process with exit status 1.
    """
    pars = []
    total_length = 0

    for hotspot_now in hotspots:
        pars.append({
            'bamfile': bamfile,
            'hotspot': hotspot_now,
            'jobtype': jobtype,
            'filted_region': filted_region,
            'maxinsert': maxinsert,
        })
        total_length = hotspot_now.end - hotspot_now.start + 1 + total_length

    pool = Pool(nthreads)

    try:
        reads_count = pool.map(bpc_runner, pars)

        bpc = float(sum(reads_count)) / total_length

        pool.close()

        return bpc

    except KeyboardInterrupt:
        pool.terminate()
        print ("You cancelled the program!")
        sys.exit(1)

    except Exception as e:
        print ('got exception: %r, terminating the pool' % (e,))
        pool.terminate()
        print ('pool is terminated')
        raise

    finally:
        # Runs after close() on success and after terminate() on failure.
        pool.join()
392 |
393 |
def bpc_runner(par):
    """Count the reads inside one hotspot (worker for ``get_bpc``).

    Args:
        par: dict with the keys ``'bamfile'``, ``'hotspot'`` (object with
            ``start``, ``end`` and ``chromosome``), ``'jobtype'``
            (``'nhsingle'``, ``'nhpaired'`` or ``'dh'``), ``'filted_region'``
            (supports ``chromosome in ...`` and ``...[chromosome]``) and
            ``'maxinsert'``.

    Returns:
        The sum of the per-entry read counts of the hotspot region, or 0
        when any 100-position bin touched by the hotspot is listed in
        ``filted_region[chromosome]``.

    Exits with status 1 on an unknown job type.

    Raises:
        KeyboardInterruptError: in place of a KeyboardInterrupt.
    """
    try:
        bamfile = par['bamfile']
        hotspot = par['hotspot']
        jobtype = par['jobtype']
        filted_region = par['filted_region']
        maxinsert = par['maxinsert']

        start_site = hotspot.start
        end_site = hotspot.end
        chromosome = hotspot.chromosome

        hotspotregio = chromosome + ':' + str(start_site) + '-' + str(end_site)

        # Only the distinct bins need checking, not every base; int(i/100)
        # never decreases and never skips a value as i walks from
        # start_site to end_site.
        if start_site <= end_site and chromosome in filted_region:
            filtered_bins = filted_region[chromosome]
            for parentscare in range(int(start_site/100), int(end_site/100) + 1):
                if parentscare in filtered_bins:
                    return 0

        if jobtype == 'nhsingle':
            readscount = readscounter.nhreadscounter(bamfile = bamfile, region=hotspotregio, paired=False, maxinsert=maxinsert)
        elif jobtype == 'nhpaired':
            readscount = readscounter.nhreadscounter(bamfile = bamfile, region=hotspotregio, paired=True, maxinsert=maxinsert)
        elif jobtype == 'dh':
            readscount = readscounter.dhreadscounter(bamfile = bamfile, region = hotspotregio)
        else:
            print ("%s count type error!!!!" % jobtype)
            sys.exit(1)

        return sum(readscount[site] for site in readscount)

    except KeyboardInterrupt:
        raise KeyboardInterruptError()
--------------------------------------------------------------------------------
/Jazzlib/bgcount.py.bak:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from __future__ import print_function
3 | import pysam
4 | from numpy import *
5 | from multiprocessing import Pool
6 | import random as rnd
7 | from kernel import *
8 | import sys
9 | import readscounter
10 |
11 |
class KeyboardInterruptError(Exception):
    """Ordinary exception that the pool workers in this module raise in
    place of a KeyboardInterrupt they have caught."""
15 |
16 |
def nhnoncontrol(uniqueratio, threshold, kernellength, nthreads=4):
    """Return a simulated background cutoff (no control sample).

    Runs 200 background simulations through sim_replicate_nthreads and
    returns ``mean + std * threshold`` of the averaged result.
    """
    simulated = sim_replicate_nthreads(
        run_times=200,
        uniqueratio=uniqueratio,
        kernellength=kernellength,
        threshold=threshold,
        nthreads=nthreads,
    )
    return simulated['mean'] + simulated['std'] * threshold
25 |
26 |
def dhnoncontrol(uniqueratio, threshold, kernellength, nthreads=4):
    """Return a simulated background cutoff (no control sample).

    Runs 200 background simulations through sim_replicate_nthreads and
    returns ``mean + std * threshold`` of the averaged result.
    """
    simulated = sim_replicate_nthreads(
        run_times=200,
        uniqueratio=uniqueratio,
        kernellength=kernellength,
        threshold=threshold,
        nthreads=nthreads,
    )
    return simulated['mean'] + simulated['std'] * threshold
35 |
36 |
def nhcontrol(bamfile, chromosome, paired, chrlength, ultratio, filted_region,maxinsert, kernellength = 600, threshold = 4):
    """
    Compute a background cutoff from the smoothed read counts of one chromosome.

    Reads are counted over ``chromosome:1-chrlength``, every count is capped
    at ``filted_region.threshold/100``, the counts are correlated with the
    kernel from ``smooth_kernel(kernellength)`` and then with the
    single-element sequence ``[ultratio]``.  The returned cutoff is
    ``mean + cap * std`` of that final signal.

    NOTE(review): the ``threshold`` argument is never used -- the original
    code overwrote it with ``filted_region.threshold/100`` before its first
    use, so the cap doubles as the std multiplier.  Behaviour is kept as-is;
    confirm whether the multiplier was meant to be the caller's ``threshold``.
    """
    whole_chromosome = chromosome + ':' + str(1) + '-' + str(chrlength)

    readscount = readscounter.nhreadscounter(bamfile, whole_chromosome, paired, maxinsert=maxinsert)

    kernel = smooth_kernel(kernellength)

    # Kernel weights ordered by their key.
    kernel_score = [kernel[offset] for offset in sorted(kernel)]

    cap = filted_region.threshold/100

    for site in readscount:
        if readscount[site] > cap:
            readscount[site] = cap

    smoothed = correlate(array(readscount), kernel_score)

    scaled = correlate(smoothed, [ultratio])

    background_mean = scaled.mean()
    background_std = scaled.std()

    return background_mean + cap * background_std
89 |
90 |
def nhuniquerate(bamfile, chromosome, paired, fregion, regionstart=1, regionend = -1, maxinsert = 100000):
    """Return ``len(read counts) / (regionend - regionstart)`` for a region.

    Args:
        bamfile: path of the BAM file to read.
        chromosome: reference name of the region.
        paired: forwarded to ``readscounter.nhreadscounter``.
        fregion: accepted for interface compatibility; not used here.
        regionstart: first position of the region.
        regionend: last position; ``-1`` means "use the length of
            ``chromosome`` from the BAM header".
        maxinsert: forwarded to ``readscounter.nhreadscounter``.

    Returns:
        float: number of entries returned by the reads counter (presumably
        one per position that has reads -- confirm against readscounter)
        divided by the region length.
    """
    if regionend == -1:
        # The header is only needed to look up the chromosome length; close
        # the handle afterwards instead of leaking it.
        samfile = pysam.Samfile(bamfile)
        try:
            for name, length in zip(samfile.references, samfile.lengths):
                if name == chromosome:
                    regionend = length
        finally:
            samfile.close()

    region = chromosome + ':' + str(regionstart) + '-' + str(regionend)

    region_length = regionend - regionstart

    nhreadscount = readscounter.nhreadscounter(bamfile, region, paired, maxinsert=maxinsert)

    return float(len(nhreadscount)) / region_length
120 |
121 |
def dhuniquerate(bamfile, chromosome, regionstart=1, regionend=-1):
    """
    Count unique Rate in a region

    Args:
        bamfile: path of the BAM file to read.
        chromosome: reference name of the region.
        regionstart: first position of the region.
        regionend: last position; ``-1`` means "use the length of
            ``chromosome`` from the BAM header".

    Returns:
        float: number of entries returned by ``readscounter.dhreadscounter``
        for the region divided by ``regionend - regionstart``.
    """
    if regionend == -1:
        # The header is only needed to look up the chromosome length; close
        # the handle afterwards instead of leaking it.
        samfile = pysam.Samfile(bamfile)
        try:
            for name, length in zip(samfile.references, samfile.lengths):
                if name == chromosome:
                    regionend = length
        finally:
            samfile.close()

    region = chromosome + ':' + str(regionstart) + '-' + str(regionend)

    region_length = regionend - regionstart

    dhreadscount = readscounter.dhreadscounter(bamfile, region)

    return float(len(dhreadscount)) / region_length
156 |
157 |
def ultratio(chrlength, uniqueratio, chrtotalreads, frcount):
    """
    Return ``chrlength * uniqueratio / (chrtotalreads - frcount)``.
    """
    remaining_reads = chrtotalreads - frcount
    return chrlength * uniqueratio / remaining_reads
165 |
166 |
def sim_replicate_nthreads(run_times=1000, uniqueratio=1, kernellength = 600, threshold = 4, nthreads = 2):
    """Average the statistics of ``run_times`` background simulations.

    Each simulation is run by ``sim_bg_thread_worker`` in a process pool of
    ``nthreads`` workers.

    Returns:
        dict: ``{'mean': ..., 'std': ...}`` -- the per-run ``rand_mean`` and
        ``rand_std`` values, each averaged over ``run_times``.

    Raises:
        Any exception raised while mapping or averaging is re-raised after
        the pool has been terminated.  Previously it was only printed and
        the function returned ``None``, which made callers fail later with
        an unrelated error.  A KeyboardInterrupt ends the process with
        exit status 1.
    """
    pars = [
        {'uniqueratio': uniqueratio, 'kernellength': kernellength, 'threshold': threshold}
        for _ in range(run_times)
    ]

    pool = Pool(nthreads)

    try:
        randomthresh = pool.map(sim_bg_thread_worker, pars)

        summean = sum((randscore['rand_mean'] for randscore in randomthresh), 0.0)
        sumstd = sum((randscore['rand_std'] for randscore in randomthresh), 0.0)

        outscore = {'mean': summean/run_times, 'std': sumstd/run_times}

        pool.close()

        return outscore

    except KeyboardInterrupt:
        pool.terminate()
        print ("You cancelled the program!")
        sys.exit(1)

    # "except ... as e" is accepted by Python 2.6+ and Python 3 alike,
    # unlike the older "except Exception, e" spelling.
    except Exception as e:
        print ('got exception: %r, terminating the pool' % (e,))
        pool.terminate()
        print ('pool is terminated')
        raise

    finally:
        # Runs after close() on success and after terminate() on failure.
        pool.join()
241 |
242 |
def sim_bg_thread_worker(par):
    """Simulate one random background and return its smoothed mean and std.

    A simulated genome of 100000 positions is used.  ``int(100000 *
    uniqueratio)`` distinct positions are drawn, the same number of reads is
    then placed on those positions at random (with replacement), and the
    resulting counts are correlated with the weights of
    ``smooth_kernel(length=kernellength)``.

    Args:
        par: dict with the keys ``'uniqueratio'`` and ``'kernellength'``.
            A ``'threshold'`` key may be present but does not influence the
            result.

    Returns:
        dict: ``{'rand_mean': ..., 'rand_std': ...}`` of the smoothed signal.

    Raises:
        KeyboardInterruptError: in place of a KeyboardInterrupt.
    """
    try:
        uniqueratio = par['uniqueratio']
        kernellength = par['kernellength']

        kernel = smooth_kernel(length=kernellength)

        # Kernel weights ordered by their key.
        kernel_score = [kernel[offset] for offset in sorted(kernel)]

        sim_genome_size = int(1e5)
        total_reads = int(sim_genome_size * uniqueratio)

        sim_uniqsite = rnd.sample(range(sim_genome_size), total_reads)

        rand_reads_count = [0] * sim_genome_size

        for _ in range(total_reads):
            # choice() cannot index past the end, unlike int(uniform(0, n)),
            # whose upper endpoint may be returned.
            site = rnd.choice(sim_uniqsite)
            rand_reads_count[site] = rand_reads_count[site] + 1.0

        smoothed_result = correlate(array(rand_reads_count), kernel_score)

        return {
            'rand_mean': smoothed_result.mean(),
            'rand_std': smoothed_result.std(),
        }

    except KeyboardInterrupt:
        raise KeyboardInterruptError()
319 |
320 |
def get_bpc(bamfile, hotspots, jobtype, filted_region, nthreads, maxinsert = 100000):
    """Return the average read count per base pair over ``hotspots``.

    Every hotspot is counted by ``bpc_runner`` in a process pool of
    ``nthreads`` workers; the summed counts are divided by the summed
    hotspot lengths (``end - start + 1`` each).

    Raises:
        Any exception raised while counting or averaging is re-raised after
        the pool has been terminated.  Previously it was only printed and
        the function returned ``None``.  A KeyboardInterrupt ends the
        process with exit status 1.
    """
    pars = []
    total_length = 0

    for hotspot_now in hotspots:
        pars.append({
            'bamfile': bamfile,
            'hotspot': hotspot_now,
            'jobtype': jobtype,
            'filted_region': filted_region,
            'maxinsert': maxinsert,
        })
        total_length = hotspot_now.end - hotspot_now.start + 1 + total_length

    pool = Pool(nthreads)

    try:
        reads_count = pool.map(bpc_runner, pars)

        bpc = float(sum(reads_count)) / total_length

        pool.close()

        return bpc

    except KeyboardInterrupt:
        pool.terminate()
        print ("You cancelled the program!")
        sys.exit(1)

    # "except ... as e" is accepted by Python 2.6+ and Python 3 alike,
    # unlike the older "except Exception, e" spelling.
    except Exception as e:
        print ('got exception: %r, terminating the pool' % (e,))
        pool.terminate()
        print ('pool is terminated')
        raise

    finally:
        # Runs after close() on success and after terminate() on failure.
        pool.join()
392 |
393 |
def bpc_runner(par):
    """Count the reads inside one hotspot (worker for ``get_bpc``).

    Args:
        par: dict with the keys ``'bamfile'``, ``'hotspot'`` (object with
            ``start``, ``end`` and ``chromosome``), ``'jobtype'``
            (``'nhsingle'``, ``'nhpaired'`` or ``'dh'``), ``'filted_region'``
            (supports ``chromosome in ...`` and ``...[chromosome]``) and
            ``'maxinsert'``.

    Returns:
        The sum of the per-entry read counts of the hotspot region, or 0
        when any 100-position bin touched by the hotspot is listed in
        ``filted_region[chromosome]``.

    Exits with status 1 on an unknown job type.

    Raises:
        KeyboardInterruptError: in place of a KeyboardInterrupt.
    """
    try:
        bamfile = par['bamfile']
        hotspot = par['hotspot']
        jobtype = par['jobtype']
        filted_region = par['filted_region']
        maxinsert = par['maxinsert']

        start_site = hotspot.start
        end_site = hotspot.end
        chromosome = hotspot.chromosome

        hotspotregio = chromosome + ':' + str(start_site) + '-' + str(end_site)

        # Only the distinct bins need checking, not every base; int(i/100)
        # never decreases and never skips a value as i walks from
        # start_site to end_site.
        if start_site <= end_site and chromosome in filted_region:
            filtered_bins = filted_region[chromosome]
            for parentscare in range(int(start_site/100), int(end_site/100) + 1):
                if parentscare in filtered_bins:
                    return 0

        if jobtype == 'nhsingle':
            readscount = readscounter.nhreadscounter(bamfile = bamfile, region=hotspotregio, paired=False, maxinsert=maxinsert)
        elif jobtype == 'nhpaired':
            readscount = readscounter.nhreadscounter(bamfile = bamfile, region=hotspotregio, paired=True, maxinsert=maxinsert)
        elif jobtype == 'dh':
            readscount = readscounter.dhreadscounter(bamfile = bamfile, region = hotspotregio)
        else:
            print ("%s count type error!!!!" % jobtype)
            sys.exit(1)

        return sum(readscount[site] for site in readscount)

    except KeyboardInterrupt:
        raise KeyboardInterruptError()
--------------------------------------------------------------------------------
/Jazz.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | import os
4 | import sys
5 | from optparse import OptionParser
6 | import logging
7 | from Jazzlib.FRegion import *
8 | from Jazzlib.localmax import *
9 | from Jazzlib.normalize_ratio import *
10 | from Jazzlib.countreads import *
11 | from Jazzlib.Peak import *
12 | from Jazzlib.sta import *
13 | from Jazzlib.jazzio import *
14 | from Jazzlib.randombg import *
15 | from Jazzlib.hotspotsscan import *
16 | from Jazzlib.Hotspot import *
17 | from Jazzlib.peaksscan import *
18 |
def main():
    """Parse the command line and dispatch to the matching pipeline.

    ``nocontrol`` is used when the control file option still holds the
    sentinel string "no"; otherwise ``withcontrol`` is used.
    """
    options = opt_check(get_optparser())
    pipeline = nocontrol if options.controlfile == "no" else withcontrol
    pipeline(options)
30 |
31 |
def withcontrol(opt):
    """Run hotspot and peak calling for a sample that has a control BAM.

    Builds an ``FRegion`` for the data file and for the control file,
    derives the normalisation ratio and the global lambda, scans hotspots
    and peaks, then writes the result files named after ``opt.samplename``.

    :param opt: option values as returned by ``opt_check``.

    A KeyboardInterrupt is reported on stderr and ends the process with
    exit status 0.
    """
    try:
        chip_bam = opt.datafile
        control_bam = opt.controlfile
        sample = opt.samplename
        cutoff = opt.threshold
        fdr = opt.fdr  # read from the options but not used below

        # Both FRegion objects are built with identical settings.
        fregion_settings = dict(jobtype=opt.jobtype, countchr=opt.countchr,
                                nthreads=opt.nthreads, maxinsert=opt.maxinsert)
        chipfregion = FRegion(bamfile=chip_bam, **fregion_settings)
        inputfregion = FRegion(bamfile=control_bam, **fregion_settings)

        ratio = normalize_ratio_input2(fregegion_input=inputfregion, fregion_chip=chipfregion)

        # With an explicit genome size the ChIP sample supplies the read
        # statistics; otherwise the control sample and its counted genome
        # length are used.
        if opt.genomesize:
            label = "###chipfregion.adjreads * chipfregion.readlengthmean/chipfregion.countgenomelength,"
            source = chipfregion
            denominator = opt.genomesize
        else:
            label = "###inputfregion.adjreads,inputfregion.readlengthmean,inputfregion.countgenomelength"
            source = inputfregion
            denominator = inputfregion.countgenomelength

        print(label, source.adjreads, source.readlengthmean, denominator)
        gloablumbda = source.adjreads * source.readlengthmean / denominator

        hotspots = hotspotsscan_withcontrol(
            chipfile=chip_bam, maxinsert=opt.maxinsert, windowscare=100000,
            countchr=opt.countchr, inputgloablumbda=gloablumbda,
            bayesfactorthreshold=cutoff, nthreads=opt.nthreads,
            chipfregion=chipfregion, jobtype=opt.jobtype, ratio=ratio,
            inputfile=control_bam, inputfregion=inputfregion)

        peaks = peakscan_control(
            datafile=chip_bam, maxinsert=opt.maxinsert, bayesfactorthreshold=cutoff,
            nthreads=opt.nthreads, chipfregion=chipfregion, jobtype=opt.jobtype,
            hotspots=hotspots, gloablumbda=gloablumbda, inputfile=control_bam,
            ratio=ratio, inputfregion=inputfregion)

        if not opt.hotonly:
            # Keep only hotspots that own at least one peak, then write
            # hotspot BED, peak BED and GFF output.
            hotspotsenrich = hotspotsfilter(hotspots=hotspots, peaks=peaks)
            hotspotsbedswriter(hotspots=hotspotsenrich, samplename=sample)
            peakbedswriter(samplename=sample, peaks=peaks)
            jazzgffout(samplename=sample, hotspots=hotspotsenrich, peaks=peaks, fregion=chipfregion)
        else:
            hotspotsbedswriter(hotspots=hotspots, samplename=sample)

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)
108 |
109 |
def nocontrol(opt):
    """Run hotspot and peak calling for a sample without a control BAM.

    Builds one ``FRegion`` for the data file, derives the global lambda
    from it, scans hotspots and peaks, then writes the result files named
    after ``opt.samplename``.

    :param opt: option values as returned by ``opt_check``.

    A KeyboardInterrupt is reported on stderr and ends the process with
    exit status 0.
    """
    try:
        bam = opt.datafile
        print ("maxinsert", opt.maxinsert)
        sample = opt.samplename
        cutoff = opt.threshold

        chipfregion = FRegion(bamfile=bam, jobtype=opt.jobtype, countchr=opt.countchr,
                              nthreads=opt.nthreads, maxinsert=opt.maxinsert)

        # An explicit --genomesize replaces the genome length counted by FRegion.
        denominator = opt.genomesize if opt.genomesize else chipfregion.countgenomelength
        print("###chipfregion.adjreads * chipfregion.readlengthmean/chipfregion.countgenomelength,",
              chipfregion.adjreads, chipfregion.readlengthmean, denominator)
        gloablumbda = chipfregion.adjreads * chipfregion.readlengthmean / denominator

        for kept_region in chipfregion.filted_region:
            print (kept_region)

        hotspots = hotspotsscan_withoutcontrol(
            file=bam, maxinsert=opt.maxinsert, windowscare=100000, countchr=opt.countchr,
            bayesfactorthreshold=cutoff, nthreads=opt.nthreads,
            fregion=chipfregion, jobtype=opt.jobtype, gloablumbda=gloablumbda)

        peaks = peakscan_without_control(
            datafile=bam, maxinsert=opt.maxinsert,
            bayesfactorthreshold=cutoff, nthreads=opt.nthreads,
            fregion=chipfregion, jobtype=opt.jobtype,
            hotspots=hotspots, gloablumbda=gloablumbda)

        if not opt.hotonly:
            # Keep only hotspots that own at least one peak, then write
            # hotspot BED, peak BED and GFF output.
            hotspotsenrich = hotspotsfilter(hotspots=hotspots, peaks=peaks)
            hotspotsbedswriter(hotspots=hotspotsenrich, samplename=sample)
            peakbedswriter(samplename=sample, peaks=peaks)
            jazzgffout(samplename=sample, hotspots=hotspotsenrich, peaks=peaks, fregion=chipfregion)
        else:
            hotspotsbedswriter(hotspots=hotspots, samplename=sample)

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)
179 |
180 |
def get_optparser():
    """Build the command-line parser for Jazz.

    :return: an ``optparse.OptionParser`` with all Jazz options registered.
             Nothing is parsed here; ``opt_check`` calls ``parse_args``.

    The usage example uses ``-d`` (the real data-file option) together with
    the mandatory ``-j`` job type, and the ``-n`` help text reports the
    actual default through optparse's ``%default`` expansion.
    """
    usage = ("usage: %prog <-d datafile> <-j jobtype> [-n name] [options]\n"
             "Example %prog -d nh_sample1.bam -j nhsingle -n sample1\n")

    description = "%prog Non-Histone protein banding site identification"

    # The built-in help option is disabled so that -h/--help can be
    # registered below with a custom help text.
    jazzopt = OptionParser(version="%prog 0.1 20140521", description=description, usage=usage, add_help_option=False)

    jazzopt.add_option("-h", "--help", action="help", help="show this help message and exit.")

    jazzopt.add_option("-d", "--data", dest="datafile", type="string", help='data file, should be sorted bam format')

    # The string "no" is the sentinel for "no control file supplied".
    jazzopt.add_option("-c", "--control", dest="controlfile", type="string", help='control(input) file, should be sorted bam format', default="no")

    jazzopt.add_option("-n", "--name", dest="samplename", help="NH sample name, default=%default", type="string" , default="DH_sample")

    jazzopt.add_option("-t", "--threshold", dest="threshold", type="float", help="peak threshold, default=6.0", default=6.0)

    jazzopt.add_option("--threads", dest="nthreads", type="int", help="threads number or cpu number, default=4", default=4)

    jazzopt.add_option("-w", "--wig", action="store_true", help="whether out put wiggle file, default=False", default=False)

    jazzopt.add_option("-f","--fdr", dest="fdr", type="float",help="using FDR as threshold", default=0.1)

    jazzopt.add_option("-x", "--excludechr", dest="excludechr", help="Don't count those chromosome, strongly suggest skip mitochondrion and chloroplast, example='-x ChrM,ChrC'")

    jazzopt.add_option("-g", "--gff", action="store_true", help="whether out put gff file, default=False", default=False)

    jazzopt.add_option("-j","--jobtype",dest="jobtype",type="string",help="job type, such as nhpaired or nhsingle")

    jazzopt.add_option("-m","--maxinsert",dest="maxinsert",type="int",help="when you use paired library, please set the maxinsert size",default=130)

    jazzopt.add_option("--pe", dest="pe", action="store_true", help="paired-end reads or single-end reads, default=False (single end)", default=False)

    # The default False means "not given"; callers test the value for truthiness.
    jazzopt.add_option("--genomesize", dest="genomesize", type="int",
                       help="Set genome size", default=False)

    jazzopt.add_option("--hotonly", dest="hotonly", action="store_true", default=False, help="calculate hotsports only.")

    return jazzopt
223 |
224 |
def _require_indexed_bam(path):
    """Exit with status 1 unless *path* is a file with a .bai or .csi index next to it."""
    if not os.path.isfile(path):
        logging.error("No such file: %s" % path)
        sys.exit(1)

    bai_index = path + '.bai'
    csi_index = path + '.csi'

    if not (os.path.isfile(bai_index) or os.path.isfile(csi_index)):
        logging.error("Missing bam index file: %s or %s" % (bai_index, csi_index))
        sys.exit(1)


def opt_check(jazzopt):
    """Parse ``sys.argv`` with *jazzopt* and validate the resulting options.

    Checks performed, in order: a data file was given, exists and is
    indexed; the control file (when not the sentinel "no") exists and is
    indexed; the thread count is positive; the job type is ``nhsingle`` or
    ``nhpaired`` with a non-negative ``maxinsert``; every chromosome named
    in ``-x`` is present in the data BAM header.

    :param jazzopt: parser built by ``get_optparser``.
    :return: the parsed options with an extra attribute ``countchr``: the
             list of BAM reference names minus the excluded ones, in header
             order.

    Any failed check logs or prints a message and calls ``sys.exit(1)``.
    """
    (opt, _args) = jazzopt.parse_args()

    if not opt.datafile:
        logging.error("you need input a bam file, '-d nh_sample1.bam -j nhsingle'")
        jazzopt.print_help()
        sys.exit(1)

    _require_indexed_bam(opt.datafile)

    if opt.controlfile != "no":
        _require_indexed_bam(opt.controlfile)

    if not (opt.nthreads > 0):
        logging.error("threads number should >=1")
        jazzopt.print_help()
        sys.exit(1)

    if opt.jobtype not in ('nhsingle', 'nhpaired'):
        logging.error("missing or wrong jobtype")
        jazzopt.print_help()
        sys.exit(1)

    if opt.maxinsert < 0:
        logging.error("maxinsert size error")
        jazzopt.print_help()
        sys.exit(1)

    # Only the header is needed; close the handle as soon as the reference
    # names have been read so the file descriptor is not leaked.
    samfile = pysam.Samfile(opt.datafile)
    try:
        sam_ref = samfile.references
    finally:
        samfile.close()

    excluded = opt.excludechr.split(',') if opt.excludechr else []

    for chri in excluded:
        if chri not in sam_ref:
            print (chri,'not in the %s file' % opt.datafile)
            print ("try to selcet exclude Chr from", end =" : ")
            print (sam_ref, sep=",")
            jazzopt.print_help()
            sys.exit(1)

    # Build the filtered list in one pass instead of deleting from a list
    # while iterating over it.
    opt.countchr = [name for name in sam_ref if name not in excluded]

    return opt
362 |
if __name__ == "__main__":
    # Script entry point: a Ctrl-C anywhere in main() is reported on stderr
    # and the process ends with exit status 0.
    try:
        main()
    except KeyboardInterrupt:
        print("User interrupt", file=sys.stderr)
        raise SystemExit(0)
374 |
375 |
--------------------------------------------------------------------------------
/Jazzlib/hotspotsscan.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from .countreads import *
4 | from .cEM_zip import *
5 | from .FRegion import *
6 | from multiprocessing import Pool
7 | from .Hotspot import *
8 | from .sta import *
9 | from .region import *
10 | from .Peak import *
11 |
12 |
class KeyboardInterruptError(Exception):
    """Exception-derived marker for a user interrupt.

    Unlike ``KeyboardInterrupt`` (a ``BaseException``), this class derives
    from ``Exception`` and is therefore caught by ``except Exception``.
    """
16 |
17 |
def hotspotsscan_withoutcontrol(file, maxinsert, windowscare,countchr,gloablumbda,
                                bayesfactorthreshold, nthreads, fregion, jobtype):
    """Scan every counted chromosome for hotspots, without a control sample.

    The chromosomes are cut into 100000-bp windows (each padded by 200 bp
    on both sides and clipped to the chromosome). A worker pool collects
    the enriched sites of each window; the sites are then merged into
    hotspots per chromosome by ``hotspots_chromsome_merge``.

    :param file: BAM path handed to the workers as ``bamfile``.
    :param maxinsert: forwarded to the workers.
    :param windowscare: accepted for interface compatibility; not used, the
        window size is fixed at 100000.
    :param countchr: iterable of chromosome names present in
        ``fregion.chrs_length``.
    :param gloablumbda: background rate passed to ``bayesfactor``.
    :param bayesfactorthreshold: Bayes-factor cut-off used to derive the
        per-site count threshold.
    :param nthreads: size of the multiprocessing pool.
    :param fregion: object providing ``chrs_length`` and ``readlengthmean``.
    :param jobtype: forwarded to the workers.
    :return: flat list of the hotspot objects produced for all chromosomes.
    :raises Exception: any failure is logged, the pool is terminated and
        the exception is re-raised instead of silently returning ``None``.
    """
    pool = Pool(nthreads)

    try:
        print ("gloablumbda",gloablumbda , "readlengthmean", fregion.readlengthmean)

        # Largest count i >= 2 whose Bayes factor does not exceed the
        # threshold; stays at 2 when even i == 2 exceeds it.
        bayesfactorthresholdcount = 2
        i = 2
        while True:
            nowbayesfactor = bayesfactor(gloablumbda, i)
            if nowbayesfactor > bayesfactorthreshold:
                break
            bayesfactorthresholdcount = i
            i = i + 1

        print ("bayesfactorthresholdcount", bayesfactorthresholdcount)

        windowsize = 100000
        flank = 200

        pars = list()
        for chromosmoe in countchr:
            chr_length = fregion.chrs_length[chromosmoe]
            for scare in range(0, int(chr_length/windowsize)+1):
                # 1-based window, padded by the flank and clipped to the chromosome.
                nowstart = max(scare*windowsize + 1 - flank, 1)
                nowend = min((scare+1)*windowsize + flank, chr_length)
                pars.append({
                    'region': chromosmoe + ":" + str(nowstart) + "-" + str(nowend),
                    'maxinsert': maxinsert,
                    'bamfile': file,
                    'jobtype': jobtype,
                    'chrlength': chr_length,
                    'regionchromosome': chromosmoe,
                    'regionstart': nowstart,
                    'regionend': nowend,
                    'bayesfactorcount': bayesfactorthresholdcount,
                    'readlengthmean': fregion.readlengthmean,
                })

        enrichedinthreads = pool.map(hotspot_withoutcontrol_worker, pars)

        # Group the per-window site lists by chromosome.
        chrenrichedpotin = dict()
        for enrichedinthread in enrichedinthreads:
            chrenrichedpotin.setdefault(enrichedinthread['chromosome'], list()).append(enrichedinthread['list'])

        chrhotpars = list()
        for nowchr in chrenrichedpotin:
            chrhotpars.append({
                'chromosome': nowchr,
                'preregion': chrenrichedpotin[nowchr],
                # Length of this chromosome, not of the last one scanned above.
                'chr_length': fregion.chrs_length[nowchr],
                'fregion': fregion,
            })

        hotsptosinthreads = pool.map(hotspots_chromsome_merge,chrhotpars)

        hotspots = [hotspotnow for hotinth in hotsptosinthreads for hotspotnow in hotinth]

        pool.close()

        return hotspots

    except KeyboardInterrupt:
        pool.terminate()
        print ("You cancelled the program!")
        sys.exit(1)

    except Exception as e:
        print ('got exception in Jazzlib.hotspotsscan.hotspotsscan_withoutcontrol: %r, terminating the pool' % (e,))
        pool.terminate()
        print ('pool is terminated')
        # Propagate: returning None here only postpones the failure to the caller.
        raise

    finally:
        pool.join()
162 |
163 |
def hotspot_withoutcontrol_worker(par):
    """Pool worker: list the enriched sites of one scan window.

    :param par: dict with the keys ``maxinsert``, ``bamfile``, ``jobtype``,
        ``regionchromosome``, ``regionstart``, ``regionend``,
        ``bayesfactorcount`` and ``readlengthmean``.
    :return: ``{'chromosome': <name>, 'list': [site, ...]}`` holding every
        site of ``extenddepthcount``'s result whose count is at least
        ``bayesfactorcount``.
    :raises KeyboardInterruptError: in place of ``KeyboardInterrupt``, so
        the interrupt travels back to the parent as an ordinary exception
        instead of killing the worker process.
    :raises Exception: any other failure is logged and re-raised so that
        ``pool.map`` in the parent reports the real cause.
    """
    try:
        chromosome = par['regionchromosome']
        bayesfactorcount = par['bayesfactorcount']

        datacount = extenddepthcount(bamfile=par['bamfile'], regionchromosome=chromosome,
                                     regionstart=par['regionstart'], regionend=par['regionend'],
                                     maxinsert=par['maxinsert'], jobtype=par['jobtype'],
                                     readlengthmean=par['readlengthmean'])

        return {
            'chromosome': chromosome,
            'list': [site for site in datacount if datacount[site] >= bayesfactorcount],
        }

    except KeyboardInterrupt:
        raise KeyboardInterruptError()

    except Exception as e:
        print ('got exception in Jazzlib.hotspotsscan.hotspot_withoutcontrol_worker: %r' % (e,))
        raise
213 |
214 |
def hotspotsscan_withcontrol(chipfile, maxinsert, windowscare,countchr,inputgloablumbda,
                             bayesfactorthreshold, nthreads, chipfregion, jobtype, ratio,
                             inputfile, inputfregion):
    """Scan every counted chromosome for hotspots, with a control sample.

    The chromosomes are cut into 100000-bp windows (each padded by 200 bp
    on both sides and clipped to the chromosome). A worker pool collects
    the sites of each window whose ratio-scaled count reaches the
    threshold; the sites are then merged into hotspots per chromosome by
    ``hotspots_chromsome_merge``.

    :param chipfile: BAM path handed to the workers as ``bamfile``.
    :param maxinsert: forwarded to the workers.
    :param windowscare: accepted for interface compatibility; not used, the
        window size is fixed at 100000.
    :param countchr: iterable of chromosome names present in
        ``chipfregion.chrs_length``.
    :param inputgloablumbda: background rate passed to ``bayesfactor``.
    :param bayesfactorthreshold: Bayes-factor cut-off used to derive the
        per-site count threshold.
    :param nthreads: size of the multiprocessing pool.
    :param chipfregion: object providing ``chrs_length`` and
        ``readlengthmean`` for the ChIP sample.
    :param jobtype: forwarded to the workers.
    :param ratio: factor the workers multiply each site count by.
    :param inputfile: accepted for interface compatibility; not used here.
    :param inputfregion: only its ``readlengthmean`` is read, for the
        diagnostic print.
    :return: flat list of the hotspot objects produced for all chromosomes.
    :raises Exception: any failure is logged, the pool is terminated and
        the exception is re-raised instead of silently returning ``None``.
    """
    pool = Pool(nthreads)

    try:
        print ("gloablumbda",inputgloablumbda , "readlengthmean", inputfregion.readlengthmean)

        # Largest count i >= 2 whose Bayes factor does not exceed the
        # threshold; stays at 2 when even i == 2 exceeds it.
        bayesfactorthresholdcount = 2
        i = 2
        while True:
            nowbayesfactor = bayesfactor(inputgloablumbda, i)
            if nowbayesfactor > bayesfactorthreshold:
                break
            bayesfactorthresholdcount = i
            i = i + 1

        print ("bayesfactorthresholdcount", bayesfactorthresholdcount)

        windowsize = 100000
        flank = 200

        pars = list()
        for chromosmoe in countchr:
            chr_length = chipfregion.chrs_length[chromosmoe]
            for scare in range(0, int(chr_length/windowsize)+1):
                # 1-based window, padded by the flank and clipped to the chromosome.
                nowstart = max(scare*windowsize + 1 - flank, 1)
                nowend = min((scare+1)*windowsize + flank, chr_length)
                pars.append({
                    'region': chromosmoe + ":" + str(nowstart) + "-" + str(nowend),
                    'maxinsert': maxinsert,
                    'bamfile': chipfile,
                    'jobtype': jobtype,
                    'chrlength': chr_length,
                    'regionchromosome': chromosmoe,
                    'regionstart': nowstart,
                    'regionend': nowend,
                    'ratio': ratio,
                    'bayesfactorcount': bayesfactorthresholdcount,
                    'readlengthmean': chipfregion.readlengthmean,
                })

        enrichedinthreads = pool.map(hotspot_control_worker, pars)

        # Group the per-window site lists by chromosome.
        chrenrichedpotin = dict()
        for enrichedinthread in enrichedinthreads:
            chrenrichedpotin.setdefault(enrichedinthread['chromosome'], list()).append(enrichedinthread['list'])

        chrhotpars = list()
        for nowchr in chrenrichedpotin:
            chrhotpars.append({
                'chromosome': nowchr,
                'preregion': chrenrichedpotin[nowchr],
                # Length of this chromosome, not of the last one scanned above.
                'chr_length': chipfregion.chrs_length[nowchr],
                'fregion': chipfregion,
            })

        hotsptosinthreads = pool.map(hotspots_chromsome_merge, chrhotpars)

        hotspots = [hotspotnow for hotinth in hotsptosinthreads for hotspotnow in hotinth]

        pool.close()

        return hotspots

    except KeyboardInterrupt:
        pool.terminate()
        print ("You cancelled the program!")
        sys.exit(1)

    except Exception as e:
        print ('got exception in Jazzlib.hotspotsscan.hotspotsscan_withcontrol: %r, terminating the pool' % (e,))
        pool.terminate()
        print ('pool is terminated')
        # Propagate: returning None here only postpones the failure to the caller.
        raise

    finally:
        pool.join()
364 |
365 |
def hotspot_control_worker(par):
    """Pool worker: list the enriched sites of one scan window (control mode).

    :param par: dict with the keys ``maxinsert``, ``bamfile``, ``jobtype``,
        ``regionchromosome``, ``regionstart``, ``regionend``,
        ``bayesfactorcount``, ``readlengthmean`` and ``ratio``.
    :return: ``{'chromosome': <name>, 'list': [site, ...]}`` holding every
        site of ``extenddepthcount``'s result whose count multiplied by
        ``ratio`` is at least ``bayesfactorcount``.
    :raises KeyboardInterruptError: in place of ``KeyboardInterrupt``, so
        the interrupt travels back to the parent as an ordinary exception
        instead of killing the worker process.
    :raises Exception: any other failure is logged and re-raised so that
        ``pool.map`` in the parent reports the real cause.
    """
    try:
        chromosome = par['regionchromosome']
        bayesfactorcount = par['bayesfactorcount']
        ratio = par['ratio']

        datacount = extenddepthcount(bamfile=par['bamfile'], regionchromosome=chromosome,
                                     regionstart=par['regionstart'], regionend=par['regionend'],
                                     maxinsert=par['maxinsert'], jobtype=par['jobtype'],
                                     readlengthmean=par['readlengthmean'])

        return {
            'chromosome': chromosome,
            'list': [site for site in datacount if datacount[site]*ratio >= bayesfactorcount],
        }

    except KeyboardInterrupt:
        raise KeyboardInterruptError()

    except Exception as e:
        # The message names this function (the original named the no-control worker).
        print ('got exception in Jazzlib.hotspotsscan.hotspot_control_worker: %r' % (e,))
        raise
417 |
418 |
419 |
def hotspotsfilter(hotspots, peaks):
    """Keep the hotspots that are the parent of at least one peak.

    :param hotspots: iterable of objects with a ``hotspotid`` attribute.
    :param peaks: iterable of objects with a ``parent`` attribute.
    :return: list of the hotspots, in input order, whose ``hotspotid``
        equals the ``parent`` of some peak.
    """
    parent_ids = {peak.parent for peak in peaks}

    return [hotspot for hotspot in hotspots if hotspot.hotspotid in parent_ids]
439 |
440 |
def hotspots_chromsome_merge(par):
    """Pool worker: merge the enriched sites of one chromosome into hotspots.

    :param par: dict with the keys ``chromosome``, ``preregion`` (a list of
        per-window site lists) and ``fregion`` (object providing
        ``readlengthmean``). A ``chr_length`` key may be present; it is
        not used.
    :return: list of ``Hotspot`` objects, one per span returned by
        ``continueregion`` that is at least ``readlengthmean / 2`` long.
    :raises KeyboardInterruptError: in place of ``KeyboardInterrupt``, so
        the interrupt travels back to the parent as an ordinary exception
        instead of killing the worker process.
    :raises Exception: any other failure is logged and re-raised so that
        ``pool.map`` in the parent reports the real cause.
    """
    try:
        chromosome = par['chromosome']
        preregion = par['preregion']
        fregion = par['fregion']

        # Overlapping window flanks report the same site twice; keep each
        # site once, in first-seen order.
        chrenrichlist = list(dict.fromkeys(
            nowsite for regionpoint in preregion for nowsite in regionpoint))

        hotspotslist = list()

        for hotspotstarend in continueregion(chrenrichlist, 2):
            hotspotstart = hotspotstarend['start_site']
            hotspotend = hotspotstarend['end_site']

            # Drop spans shorter than half the mean read length.
            if hotspotend-hotspotstart < fregion.readlengthmean/2:
                continue

            hotspotid = str(chromosome) + ":" + str(hotspotstart) +"-"+ str(hotspotend)

            hotspotslist.append(Hotspot(start=hotspotstart, end=hotspotend,
                                        chromosome=chromosome, hotspotid=hotspotid))

        return hotspotslist

    except KeyboardInterrupt:
        raise KeyboardInterruptError()

    except Exception as e:
        print ('got exception in Jazzlib.hotspotsscan.hotspots_chromsome_merge: %r' % (e,))
        print (par)
        raise
--------------------------------------------------------------------------------
/Jazzlib/hotspotsscan.py.bak:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from __future__ import print_function
3 | from countreads import *
4 | from cEM_zip import *
5 | from FRegion import *
6 | from multiprocessing import Pool
7 | from Hotspot import *
8 | from sta import *
9 | from region import *
10 | from Peak import *
11 |
12 |
class KeyboardInterruptError(Exception):
    # Exception-derived marker type; defined here but not raised anywhere
    # in this backup file.
    pass
16 |
17 |
def hotspotsscan_withoutcontrol(file, maxinsert, windowscare,countchr,gloablumbda,
                                bayesfactorthreshold, nthreads, fregion, jobtype):
    """Scan every counted chromosome for hotspots, without a control sample.

    Python 2 backup of the function of the same name in hotspotsscan.py.
    Windows of 100000 bp (padded by 200 bp, clipped to the chromosome) are
    handed to a worker pool; the enriched sites are then merged per
    chromosome by ``hotspots_chromsome_merge``.

    ``windowscare`` is accepted but never read. Returns the flat list of
    hotspots, or ``None`` when the generic exception handler runs.
    """
    pool = Pool(nthreads)

    try:
        pars = list()

        hotspots = list()

        print ("gloablumbda",gloablumbda , "readlengthmean", fregion.readlengthmean)

        # Find the largest count i >= 2 whose Bayes factor does not exceed
        # the threshold (stays 2 when i == 2 already exceeds it).
        bayesfactorthresholdcount = 2

        i = 2

        while True:
            nowbayesfactor = bayesfactor(gloablumbda, i)

            if nowbayesfactor > bayesfactorthreshold:
                break

            bayesfactorthresholdcount = i

            i = i + 1

        print ("bayesfactorthresholdcount", bayesfactorthresholdcount)

        windowsize = 100000

        for chromosmoe in countchr:
            chr_length = fregion.chrs_length[chromosmoe]

            for scare in range(0, int(chr_length/windowsize)+1):
                # 1-based window with a 200-bp flank on both sides.
                nowstart = scare*windowsize + 1 -200

                nowend = (scare+1)*windowsize + 200

                if nowend > chr_length:
                    nowend = chr_length

                if nowstart < 1:
                    nowstart = 1

                nowregion = chromosmoe + ":" + str(nowstart) + "-" + str(nowend)

                par = dict()

                par['region'] = nowregion

                par['maxinsert'] = maxinsert

                par['bamfile'] = file

                par['jobtype'] = jobtype

                par['chrlength'] = chr_length

                par['regionchromosome'] = chromosmoe

                par['regionstart'] = nowstart

                par['regionend'] = nowend

                # par['bayesfactordic'] = bayesfactordic

                par['bayesfactorcount'] = bayesfactorthresholdcount

                par['readlengthmean'] = fregion.readlengthmean

                pars.append(par)

        enrichedinthreads = pool.map(hotspot_withoutcontrol_worker, pars)

        # Group the per-window site lists by chromosome.
        chrenrichedpotin = dict()

        for enrichedinthread in enrichedinthreads:
            nowchr = enrichedinthread['chromosome']

            if nowchr in chrenrichedpotin:
                chrenrichedpotin[nowchr].append(enrichedinthread['list'])

            else:
                chrenrichedpotin[nowchr] = list()

                chrenrichedpotin[nowchr].append(enrichedinthread['list'])

        chrhotpars = list()

        for nowchr in chrenrichedpotin:
            hotpar = dict()

            hotpar['chromosome'] = nowchr

            hotpar['preregion'] = chrenrichedpotin[nowchr]

            # NOTE(review): indexes with `chromosmoe`, the variable left over
            # from the window loop above (the last chromosome), not `nowchr`.
            hotpar['chr_length'] = fregion.chrs_length[chromosmoe]

            hotpar['fregion'] = fregion

            chrhotpars.append(hotpar)

        hotsptosinthreads = pool.map(hotspots_chromsome_merge,chrhotpars)

        for hotinth in hotsptosinthreads:
            for hotspotnow in hotinth:
                hotspots.append(hotspotnow)

        pool.close()

        return hotspots

    except KeyboardInterrupt:
        pool.terminate()

        print ("You cancelled the program!")

        sys.exit(1)

    except Exception, e:
        # The error is printed and swallowed; the function then returns None.
        print ('got exception in Jazzlib.hotspotsscan.hotspotsscan_withoutcontrol: %r, terminating the pool' % (e,))

        pool.terminate()

        print ('pool is terminated')

    finally:
        pool.join()
162 |
163 |
def hotspot_withoutcontrol_worker(par):
    """Pool worker: list the enriched sites of one scan window.

    Python 2 backup. Reads its inputs from the dict ``par`` and returns
    ``{'chromosome': ..., 'list': [...]}`` with every site whose count from
    ``extenddepthcount`` is at least ``par['bayesfactorcount']``. Returns
    ``None`` when the generic exception handler runs.
    """
    try:
        maxinsert = par['maxinsert']

        bamfile = par['bamfile']

        jobtype = par['jobtype']

        chromosome = par['regionchromosome']

        nowstart = par['regionstart']

        nowend = par['regionend']

        bayesfactorcount = par['bayesfactorcount']

        readlengthmean = par['readlengthmean']

        datacount = extenddepthcount(bamfile=bamfile, regionchromosome=chromosome, regionstart=nowstart,
                                     regionend=nowend, maxinsert=maxinsert, jobtype=jobtype,
                                     readlengthmean=readlengthmean)

        enrichedlist = dict()

        enrichedlist['chromosome'] = chromosome

        enrichedlist['list'] = list()

        for site in datacount:
            if datacount[site] >= bayesfactorcount:
                enrichedlist['list'].append(site)

        return enrichedlist

    except Exception, e:
        # Only prints; no pool is terminated here despite the message text.
        print ('got exception in Jazzlib.hotspotsscan.hotspot_withoutcontrol_worker: %r, terminating the pool' % (e,))

        print ('pool is terminated')

    except KeyboardInterrupt:
        print ("You cancelled the program!")

        sys.exit(1)
213 |
214 |
def hotspotsscan_withcontrol(chipfile, maxinsert, windowscare,countchr,inputgloablumbda,
                             bayesfactorthreshold, nthreads, chipfregion, jobtype, ratio,
                             inputfile, inputfregion):
    """Scan every counted chromosome for hotspots, with a control sample.

    Python 2 backup of the function of the same name in hotspotsscan.py.
    Windows of 100000 bp (padded by 200 bp, clipped to the chromosome) are
    handed to a worker pool together with ``ratio``; the enriched sites are
    then merged per chromosome by ``hotspots_chromsome_merge``.

    ``windowscare`` and ``inputfile`` are accepted but never read. Returns
    the flat list of hotspots, or ``None`` when the generic exception
    handler runs.
    """
    pool = Pool(nthreads)

    try:
        pars = list()

        hotspots = list()

        print ("gloablumbda",inputgloablumbda , "readlengthmean", inputfregion.readlengthmean)

        # Find the largest count i >= 2 whose Bayes factor does not exceed
        # the threshold (stays 2 when i == 2 already exceeds it).
        bayesfactorthresholdcount = 2

        i = 2

        while True:
            nowbayesfactor = bayesfactor(inputgloablumbda, i)

            if nowbayesfactor > bayesfactorthreshold:
                break

            bayesfactorthresholdcount = i

            i = i + 1

        print ("bayesfactorthresholdcount", bayesfactorthresholdcount)

        windowsize = 100000

        for chromosmoe in countchr:
            chr_length = chipfregion.chrs_length[chromosmoe]

            for scare in range(0, int(chr_length/windowsize)+1):
                # 1-based window with a 200-bp flank on both sides.
                nowstart = scare*windowsize + 1 -200

                nowend = (scare+1)*windowsize + 200

                if nowend > chr_length:
                    nowend = chr_length

                if nowstart < 1:
                    nowstart = 1

                nowregion = chromosmoe + ":" + str(nowstart) + "-" + str(nowend)

                par = dict()

                par['region'] = nowregion

                par['maxinsert'] = maxinsert

                par['bamfile'] = chipfile

                par['jobtype'] = jobtype

                par['chrlength'] = chr_length

                par['regionchromosome'] = chromosmoe

                par['regionstart'] = nowstart

                par['regionend'] = nowend

                par['ratio'] = ratio

                # par['bayesfactordic'] = bayesfactordic

                par['bayesfactorcount'] = bayesfactorthresholdcount

                par['readlengthmean'] = chipfregion.readlengthmean

                pars.append(par)

        enrichedinthreads = pool.map(hotspot_control_worker, pars)

        # Group the per-window site lists by chromosome.
        chrenrichedpotin = dict()

        for enrichedinthread in enrichedinthreads:
            nowchr = enrichedinthread['chromosome']

            if nowchr in chrenrichedpotin:
                chrenrichedpotin[nowchr].append(enrichedinthread['list'])

            else:
                chrenrichedpotin[nowchr] = list()

                chrenrichedpotin[nowchr].append(enrichedinthread['list'])

        chrhotpars = list()

        for nowchr in chrenrichedpotin:
            hotpar = dict()

            hotpar['chromosome'] = nowchr

            hotpar['preregion'] = chrenrichedpotin[nowchr]

            # NOTE(review): indexes with `chromosmoe`, the variable left over
            # from the window loop above (the last chromosome), not `nowchr`.
            hotpar['chr_length'] = chipfregion.chrs_length[chromosmoe]

            hotpar['fregion'] = chipfregion

            chrhotpars.append(hotpar)

        hotsptosinthreads = pool.map(hotspots_chromsome_merge, chrhotpars)

        for hotinth in hotsptosinthreads:
            for hotspotnow in hotinth:
                hotspots.append(hotspotnow)

        # close() is called twice in a row here.
        pool.close()

        pool.close()

        return hotspots

    except KeyboardInterrupt:
        pool.terminate()

        print ("You cancelled the program!")

        sys.exit(1)

    except Exception, e:
        # The error is printed and swallowed; the function then returns None.
        print ('got exception in Jazzlib.hotspotsscan.hotspotsscan_withcontrol: %r, terminating the pool' % (e,))

        pool.terminate()

        print ('pool is terminated')

    finally:
        pool.join()
364 |
365 |
def hotspot_control_worker(par):
    """Pool worker: list the enriched sites of one scan window (control mode).

    Python 2 backup. Reads its inputs from the dict ``par`` and returns
    ``{'chromosome': ..., 'list': [...]}`` with every site whose count from
    ``extenddepthcount`` multiplied by ``par['ratio']`` is at least
    ``par['bayesfactorcount']``. Returns ``None`` when the generic exception
    handler runs.
    """
    try:
        maxinsert = par['maxinsert']

        bamfile = par['bamfile']

        jobtype = par['jobtype']

        chromosome = par['regionchromosome']

        nowstart = par['regionstart']

        nowend = par['regionend']

        bayesfactorcount = par['bayesfactorcount']

        readlengthmean = par['readlengthmean']

        ratio = par['ratio']

        datacount = extenddepthcount(bamfile=bamfile, regionchromosome=chromosome, regionstart=nowstart,
                                     regionend=nowend, maxinsert=maxinsert, jobtype=jobtype,
                                     readlengthmean=readlengthmean)

        enrichedlist = dict()

        enrichedlist['chromosome'] = chromosome

        enrichedlist['list'] = list()

        for site in datacount:
            if datacount[site]*ratio >= bayesfactorcount:
                enrichedlist['list'].append(site)

        return enrichedlist

    except Exception, e:
        # NOTE(review): the message names hotspot_withoutcontrol_worker,
        # although this is hotspot_control_worker; nothing is terminated here.
        print ('got exception in Jazzlib.hotspotsscan.hotspot_withoutcontrol_worker: %r, terminating the pool' % (e,))

        print ('pool is terminated')

    except KeyboardInterrupt:
        print ("You cancelled the program!")

        sys.exit(1)
417 |
418 |
419 |
def hotspotsfilter(hotspots, peaks):
    """Return the hotspots whose ``hotspotid`` is the ``parent`` of some peak.

    Python 2 backup. Input order of ``hotspots`` is preserved.
    """
    # Dict used as a set of parent ids.
    peaksparent = dict()

    for peak in peaks:
        if peak.parent not in peaksparent:
            peaksparent[peak.parent] = 1

    hotspotreturen = list()

    for hotspot in hotspots:
        if hotspot.hotspotid in peaksparent:
            hotspotreturen.append(hotspot)

    return hotspotreturen
439 |
440 |
def hotspots_chromsome_merge(par):
    """Pool worker: merge the enriched sites of one chromosome into hotspots.

    Python 2 backup. ``par`` carries ``chromosome``, ``preregion`` (a list
    of per-window site lists), ``chr_length`` and ``fregion``. Returns a
    list of ``Hotspot`` objects, or ``None`` when the generic exception
    handler runs.
    """
    try:
        chromosome = par['chromosome']

        preregion = par['preregion']

        # Read but not used below.
        chr_length = par['chr_length']

        fregion = par['fregion']

        hotspotslist = list()

        # Dict used as a set: each site is kept once.
        enrichedpotin = dict()

        for regionpoint in preregion:
            for nowsite in regionpoint:
                if not nowsite in enrichedpotin:
                    enrichedpotin[nowsite] = 1

        chrenrichlist = enrichedpotin.keys()

        temphotspots = continueregion(chrenrichlist, 2)

        for hotspotstarend in temphotspots:
            hotspotstart = hotspotstarend['start_site']

            hotspotend = hotspotstarend['end_site']

            # Skip spans shorter than half the mean read length.
            if hotspotend-hotspotstart < fregion.readlengthmean/2:
                continue

            hotspotid = str(chromosome) + ":" + str(hotspotstart) +"-"+ str(hotspotend)

            hotspot = Hotspot(start=hotspotstart, end=hotspotend, chromosome=chromosome, hotspotid=hotspotid)

            hotspotslist.append(hotspot)

        return hotspotslist

    except Exception, e:
        # Only prints; no pool is terminated here despite the message text.
        print ('got exception in Jazzlib.hotspotsscan.hotspots_chromsome_merge: %r, terminating the pool' % (e,))

        print (par)

        print ('pool is terminated')

    except KeyboardInterrupt:
        print ("You cancelled the program!")

        sys.exit(1)
--------------------------------------------------------------------------------
/Jazzlib/peaksscan.py:
--------------------------------------------------------------------------------
1 |
2 | from .countreads import *
3 | from .cEM_zip import *
4 | from .FRegion import *
5 | from multiprocessing import Pool
6 | from .Peak import *
7 | from .sta import *
8 | from .region import *
9 | from .Hotspot import *
10 |
11 |
class KeyboardInterruptError(Exception):
    """Exception subclass declared by this module; nothing in it raises it."""
15 |
16 |
def peakscan_without_control(datafile, maxinsert, bayesfactorthreshold, nthreads, fregion,
                             jobtype, hotspots, gloablumbda):
    """Scan every hotspot for peaks in parallel, without a control sample.

    One parameter dict per hotspot is handed to
    ``peakscan_withoutcontrol_worker`` through a ``multiprocessing.Pool`` of
    ``nthreads`` processes, with ``'ratio'`` fixed to 1.

    :param datafile: forwarded to the worker as ``'datafile'``.
    :param maxinsert: forwarded to the worker as ``'maxinsert'``.
    :param bayesfactorthreshold: forwarded as ``'bayesfactorthreashold'``.
    :param nthreads: number of pool processes.
    :param fregion: forwarded to the worker as ``'fregion'``.
    :param jobtype: forwarded to the worker as ``'jobtype'``.
    :param hotspots: iterable of hotspots, one worker task each.
    :param gloablumbda: forwarded to the worker as ``'gloablumbda'``.
    :return: flat list of the peaks from all workers, in hotspot order.
        Returns None implicitly when an exception is caught. That includes
        a worker having returned None, because iterating that result raises
        here.
    """
    # Local import: this module only star-imports its siblings and never
    # imports ``sys`` itself, yet the Ctrl-C path below needs it.
    import sys

    pool = Pool(nthreads)

    try:

        pars = [
            {
                'hotspot': hotspot,
                'datafile': datafile,
                'maxinsert': maxinsert,
                # Key spelling must match what the worker reads.
                'bayesfactorthreashold': bayesfactorthreshold,
                'jobtype': jobtype,
                'gloablumbda': gloablumbda,
                'ratio': 1,
                'fregion': fregion,
            }
            for hotspot in hotspots
        ]

        peaksinthreads = pool.map(peakscan_withoutcontrol_worker, pars)

        peaks = list()

        for hotspotpeaks in peaksinthreads:

            for peaknow in hotspotpeaks:

                print(peaknow.peakid)

                peaks.append(peaknow)

        pool.close()

        return peaks

    except KeyboardInterrupt:

        pool.terminate()

        print("You cancelled the program!")

        sys.exit(1)

    except Exception as e:

        print('got exception in Jazzlib.peaksscan.peakscan_without_control: %r, terminating the pool' % (e,))

        pool.terminate()

        print('pool is terminated')

    finally:
        # Runs after close() on success and after terminate() on failure.
        pool.join()
84 |
85 |
def peakscan_withoutcontrol_worker(par):
    """Call peaks inside one hotspot using only the sample's own depth.

    ``par`` is a dict; the keys read here are ``'hotspot'`` (object with
    ``start``, ``end``, ``chromosome`` and ``hotspotid``), ``'datafile'``,
    ``'maxinsert'``, ``'bayesfactorthreashold'``, ``'jobtype'``,
    ``'gloablumbda'`` and ``'fregion'`` (object whose ``chrs_length`` maps
    chromosome to length).

    Steps:

    1. ``depthcount`` is queried for the hotspot padded by 5100 on each side,
       clamped to ``[1, chromosome length]``. Its result is used as a mapping
       from site to count.
    2. Counts in windows of +/-2500 and +/-5000 around the hotspot are passed
       to ``cEM_zip``, with sites absent from the mapping counted as 0. The
       first element of each result is taken as a rate estimate.
       NOTE(review): presumably a zero-inflated Poisson fit - confirm in
       cEM_zip.pyx.
    3. The largest of both estimates and ``gloablumbda`` is the local rate.
       If it exceeds 400, ``gloablumbda * 5`` is used instead.
    4. Every covered site from ``start - 1`` to ``end`` gets a
       ``bayesfactor`` score, with counts below 2 scored as 2. Sites scoring
       above the threshold are grouped by ``continueregion`` into peaks.

    Returns a list of Peak objects whose ``score`` is the mean site score and
    whose ``peakpoint`` is the first site with the highest score. When an
    exception is caught it is printed and None is returned implicitly.
    """
    # Local import: this module never imports ``sys`` itself.
    import sys

    try:
        peaks = list()

        hotspot = par['hotspot']

        datafile = par['datafile']

        maxinsert = par['maxinsert']

        bayesfactorthreshold = par['bayesfactorthreashold']

        jobtype = par['jobtype']

        gloablumbda = par['gloablumbda']

        fregion = par['fregion']

        start = hotspot.start

        end = hotspot.end

        chromosome = hotspot.chromosome

        chrlength = fregion.chrs_length[chromosome]

        # Padded query region, clamped to the chromosome.
        regionstart = max(start - 5100, 1)

        regionend = min(end + 5100, chrlength)

        datacount = depthcount(bamfile=datafile, regionchromosome=chromosome, regionstart=regionstart,
                               regionend=regionend, maxinsert=maxinsert, jobtype=jobtype)

        def windowcounts(flank):
            # One count per in-chromosome site; uncovered sites count as 0.
            return [datacount[site] if site in datacount else 0
                    for site in range(start - flank, end + flank)
                    if 0 <= site <= chrlength]

        (window5klhat, window5kphat) = cEM_zip(windowcounts(2500))

        (window10klhat, window10kphat) = cEM_zip(windowcounts(5000))

        maxlhat = max(window5klhat, window10klhat, gloablumbda)

        if maxlhat > 400:

            maxlhat = gloablumbda * 5

        bayesfactorscore = dict()

        enrichedsites = list()

        for wsite in range(start - 1, end + 1):

            if wsite in datacount:

                # Counts below 2 are scored as if they were 2.
                nowbayesfactor = bayesfactor(locallambda=maxlhat,
                                             peakscore=max(datacount[wsite], 2))

                bayesfactorscore[wsite] = nowbayesfactor

                if nowbayesfactor > bayesfactorthreshold:

                    enrichedsites.append(wsite)

        tmppeaks = continueregion(points=enrichedsites, minlength=1)

        for iniid, tmppeak in enumerate(tmppeaks, 1):

            tmppeakstart = tmppeak['start_site']

            tmppeakend = tmppeak['end_site']

            totalbayesscore = 0

            maxscore = 0

            maxsite = 0

            for site in range(tmppeakstart, tmppeakend + 1):

                score = bayesfactorscore[site]

                totalbayesscore = totalbayesscore + score

                if score > maxscore:

                    # Fixed: was ``score = maxscore``, which never raised the
                    # running maximum, so the summit was the last positive site.
                    maxscore = score

                    maxsite = site

            avgbayescore = totalbayesscore/(tmppeakend - tmppeakstart + 1)

            peakid = hotspot.hotspotid + '.' + str(iniid)

            peaks.append(Peak(start=tmppeakstart, end=tmppeakend, chromosome=chromosome,
                              peakpoint=maxsite, peakid=peakid, score=avgbayescore,
                              parent=hotspot.hotspotid))

        return peaks

    except Exception as e:

        print('got exception in Jazzlib.peaksscan.peakscan_without_control: %r, terminating the pool' % (e,))

        print((par['hotspot'].chromosome, par['hotspot'].start, par['hotspot'].end))

    except KeyboardInterrupt:

        print("You cancelled the program!")

        sys.exit(1)
263 |
264 |
265 |
266 |
def peakscan_control(datafile, maxinsert, bayesfactorthreshold, nthreads, chipfregion,
                     jobtype, hotspots, gloablumbda, inputfile, ratio, inputfregion):
    """Scan every hotspot for peaks in parallel, using a control sample.

    One parameter dict per hotspot is handed to ``peakscan_control_worker``
    through a ``multiprocessing.Pool`` of ``nthreads`` processes.

    :param datafile: forwarded to the worker as ``'datafile'``.
    :param maxinsert: forwarded to the worker as ``'maxinsert'``.
    :param bayesfactorthreshold: forwarded as ``'bayesfactorthreashold'``.
    :param nthreads: number of pool processes.
    :param chipfregion: accepted for interface compatibility; not used here.
    :param jobtype: forwarded to the worker as ``'jobtype'``.
    :param hotspots: iterable of hotspots, one worker task each.
    :param gloablumbda: forwarded to the worker as ``'gloablumbda'``.
    :param inputfile: control file, forwarded as ``'inputfile'``.
    :param ratio: forwarded to the worker as ``'ratio'``.
    :param inputfregion: forwarded to the worker as ``'fregion'``.
    :return: flat list of the peaks from all workers, in hotspot order.
        Returns None implicitly when an exception is caught. That includes
        a worker having returned None, because iterating that result raises
        here.
    """
    # Local import: this module never imports ``sys`` itself.
    import sys

    pool = Pool(nthreads)

    try:

        pars = [
            {
                'hotspot': hotspot,
                'datafile': datafile,
                'maxinsert': maxinsert,
                # Key spelling must match what the worker reads.
                'bayesfactorthreashold': bayesfactorthreshold,
                'jobtype': jobtype,
                'gloablumbda': gloablumbda,
                'ratio': ratio,
                'inputfile': inputfile,
                'fregion': inputfregion,
            }
            for hotspot in hotspots
        ]

        # Fixed: this used to dispatch to peakscan_withoutcontrol_worker,
        # which ignores 'inputfile' and 'ratio', so the control was never used.
        peaksinthreads = pool.map(peakscan_control_worker, pars)

        peaks = list()

        for hotspotpeaks in peaksinthreads:

            for peaknow in hotspotpeaks:

                print(peaknow.peakid)

                peaks.append(peaknow)

        pool.close()

        return peaks

    except KeyboardInterrupt:

        pool.terminate()

        print("You cancelled the program!")

        sys.exit(1)

    except Exception as e:

        # Label corrected: it previously named peakscan_without_control.
        print('got exception in Jazzlib.peaksscan.peakscan_control: %r, terminating the pool' % (e,))

        pool.terminate()

        print('pool is terminated')

    finally:
        # Runs after close() on success and after terminate() on failure.
        pool.join()
336 |
337 |
def peakscan_control_worker(par):
    """Call peaks inside one hotspot, with the local rate taken from a control.

    ``par`` is a dict; the keys read here are ``'hotspot'`` (object with
    ``start``, ``end``, ``chromosome`` and ``hotspotid``), ``'datafile'``,
    ``'inputfile'``, ``'maxinsert'``, ``'bayesfactorthreashold'``,
    ``'jobtype'``, ``'gloablumbda'``, ``'ratio'`` and ``'fregion'`` (object
    whose ``chrs_length`` maps chromosome to length).

    Steps:

    1. ``depthcount`` is queried for both files over the hotspot padded by
       5100 on each side, clamped to ``[1, chromosome length]``. Each result
       is used as a mapping from site to count.
    2. Control counts in windows of +/-2500, +/-5000 and +/-500 around the
       hotspot are passed to ``cEM_zip``, with sites absent from the mapping
       counted as 0. The first element of each result is taken as a rate
       estimate.
       NOTE(review): presumably a zero-inflated Poisson fit - confirm in
       cEM_zip.pyx.
    3. The largest of the three estimates and ``gloablumbda`` is the local
       rate. If it exceeds 400, ``gloablumbda * 5`` is used instead.
    4. Every covered sample site from ``start - 1`` to ``end`` has its count
       multiplied by ``ratio`` and truncated with ``int``, then scored with
       ``bayesfactor``. Counts below 2 are scored as 2. Sites scoring above
       the threshold are grouped by ``continueregion`` into peaks.

    Returns a list of Peak objects whose ``score`` is the mean site score and
    whose ``peakpoint`` is the first site with the highest score. When an
    exception is caught it is printed and None is returned implicitly.
    """
    # Local import: this module never imports ``sys`` itself.
    import sys

    try:
        peaks = list()

        hotspot = par['hotspot']

        datafile = par['datafile']

        inputfile = par['inputfile']

        maxinsert = par['maxinsert']

        bayesfactorthreshold = par['bayesfactorthreashold']

        jobtype = par['jobtype']

        gloablumbda = par['gloablumbda']

        ratio = par['ratio']

        fregion = par['fregion']

        start = hotspot.start

        end = hotspot.end

        chromosome = hotspot.chromosome

        chrlength = fregion.chrs_length[chromosome]

        # Padded query region, clamped to the chromosome.
        regionstart = max(start - 5100, 1)

        regionend = min(end + 5100, chrlength)

        datacount = depthcount(bamfile=datafile, regionchromosome=chromosome, regionstart=regionstart,
                               regionend=regionend, maxinsert=maxinsert, jobtype=jobtype)

        inputcount = depthcount(bamfile=inputfile, regionchromosome=chromosome, regionstart=regionstart,
                                regionend=regionend, maxinsert=maxinsert, jobtype=jobtype)

        def windowcounts(flank):
            # One control count per in-chromosome site; uncovered sites are 0.
            return [inputcount[site] if site in inputcount else 0
                    for site in range(start - flank, end + flank)
                    if 0 <= site <= chrlength]

        (window5klhat, window5kphat) = cEM_zip(windowcounts(2500))

        (window10klhat, window10kphat) = cEM_zip(windowcounts(5000))

        (window1klhat, window1kphat) = cEM_zip(windowcounts(500))

        maxlhat = max(window5klhat, window10klhat, window1klhat, gloablumbda)

        if maxlhat > 400:

            maxlhat = gloablumbda * 5

        bayesfactorscore = dict()

        enrichedsites = list()

        for wsite in range(start - 1, end + 1):

            if wsite in datacount:

                # Scale the sample count by ratio; below 2 is scored as 2.
                nowcount = int(datacount[wsite]*ratio)

                nowbayesfactor = bayesfactor(locallambda=maxlhat,
                                             peakscore=max(nowcount, 2))

                bayesfactorscore[wsite] = nowbayesfactor

                if nowbayesfactor > bayesfactorthreshold:

                    enrichedsites.append(wsite)

        tmppeaks = continueregion(points=enrichedsites, minlength=1)

        for iniid, tmppeak in enumerate(tmppeaks, 1):

            tmppeakstart = tmppeak['start_site']

            tmppeakend = tmppeak['end_site']

            totalbayesscore = 0

            maxscore = 0

            maxsite = 0

            for site in range(tmppeakstart, tmppeakend + 1):

                score = bayesfactorscore[site]

                totalbayesscore = totalbayesscore + score

                if score > maxscore:

                    # Fixed: was ``score = maxscore``, which never raised the
                    # running maximum, so the summit was the last positive site.
                    maxscore = score

                    maxsite = site

            avgbayescore = totalbayesscore/(tmppeakend - tmppeakstart + 1)

            peakid = hotspot.hotspotid + '.' + str(iniid)

            peaks.append(Peak(start=tmppeakstart, end=tmppeakend, chromosome=chromosome,
                              peakpoint=maxsite, peakid=peakid, score=avgbayescore,
                              parent=hotspot.hotspotid))

        return peaks

    except Exception as e:

        # Label corrected: it previously named peakscan_without_control.
        print('got exception in Jazzlib.peaksscan.peakscan_control_worker: %r, terminating the pool' % (e,))

        print((par['hotspot'].chromosome, par['hotspot'].start, par['hotspot'].end))

    except KeyboardInterrupt:

        print("You cancelled the program!")

        sys.exit(1)
--------------------------------------------------------------------------------
/Jazzlib/peaksscan.py.bak:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from countreads import *
3 | from cEM_zip import *
4 | from FRegion import *
5 | from multiprocessing import Pool
6 | from Peak import *
7 | from sta import *
8 | from region import *
9 | from Hotspot import *
10 |
11 |
class KeyboardInterruptError(Exception):
    """Exception subclass declared by this module; nothing in it raises it."""
15 |
16 |
def peakscan_without_control(datafile, maxinsert, bayesfactorthreshold, nthreads, fregion,
                             jobtype, hotspots, gloablumbda):
    """Scan every hotspot for peaks in parallel, without a control sample.

    One parameter dict per hotspot is handed to
    ``peakscan_withoutcontrol_worker`` through a ``multiprocessing.Pool`` of
    ``nthreads`` processes, with ``'ratio'`` fixed to 1.

    :param datafile: forwarded to the worker as ``'datafile'``.
    :param maxinsert: forwarded to the worker as ``'maxinsert'``.
    :param bayesfactorthreshold: forwarded as ``'bayesfactorthreashold'``.
    :param nthreads: number of pool processes.
    :param fregion: forwarded to the worker as ``'fregion'``.
    :param jobtype: forwarded to the worker as ``'jobtype'``.
    :param hotspots: iterable of hotspots, one worker task each.
    :param gloablumbda: forwarded to the worker as ``'gloablumbda'``.
    :return: flat list of the peaks from all workers, in hotspot order.
        Returns None implicitly when an exception is caught. That includes
        a worker having returned None, because iterating that result raises
        here.
    """
    # Local import: this module only star-imports its siblings and never
    # imports ``sys`` itself, yet the Ctrl-C path below needs it.
    import sys

    pool = Pool(nthreads)

    try:

        pars = [
            {
                'hotspot': hotspot,
                'datafile': datafile,
                'maxinsert': maxinsert,
                # Key spelling must match what the worker reads.
                'bayesfactorthreashold': bayesfactorthreshold,
                'jobtype': jobtype,
                'gloablumbda': gloablumbda,
                'ratio': 1,
                'fregion': fregion,
            }
            for hotspot in hotspots
        ]

        peaksinthreads = pool.map(peakscan_withoutcontrol_worker, pars)

        peaks = list()

        for hotspotpeaks in peaksinthreads:

            for peaknow in hotspotpeaks:

                print (peaknow.peakid)

                peaks.append(peaknow)

        pool.close()

        return peaks

    except KeyboardInterrupt:

        pool.terminate()

        print ("You cancelled the program!")

        sys.exit(1)

    # ``as`` form parses on both Python 2.6+ and Python 3.
    except Exception as e:

        print ('got exception in Jazzlib.peaksscan.peakscan_without_control: %r, terminating the pool' % (e,))

        pool.terminate()

        print ('pool is terminated')

    finally:
        # Runs after close() on success and after terminate() on failure.
        pool.join()
84 |
85 |
def peakscan_withoutcontrol_worker(par):
    """Call peaks inside one hotspot using only the sample's own depth.

    ``par`` is a dict; the keys read here are ``'hotspot'`` (object with
    ``start``, ``end``, ``chromosome`` and ``hotspotid``), ``'datafile'``,
    ``'maxinsert'``, ``'bayesfactorthreashold'``, ``'jobtype'``,
    ``'gloablumbda'`` and ``'fregion'`` (object whose ``chrs_length`` maps
    chromosome to length).

    Steps:

    1. ``depthcount`` is queried for the hotspot padded by 5100 on each side,
       clamped to ``[1, chromosome length]``. Its result is used as a mapping
       from site to count.
    2. Counts in windows of +/-2500 and +/-5000 around the hotspot are passed
       to ``cEM_zip``, with sites absent from the mapping counted as 0. The
       first element of each result is taken as a rate estimate.
       NOTE(review): presumably a zero-inflated Poisson fit - confirm in
       cEM_zip.pyx.
    3. The largest of both estimates and ``gloablumbda`` is the local rate.
       If it exceeds 400, ``gloablumbda * 5`` is used instead.
    4. Every covered site from ``start - 1`` to ``end`` gets a
       ``bayesfactor`` score, with counts below 2 scored as 2. Sites scoring
       above the threshold are grouped by ``continueregion`` into peaks.

    Returns a list of Peak objects whose ``score`` is the mean site score and
    whose ``peakpoint`` is the first site with the highest score. When an
    exception is caught it is printed and None is returned implicitly.
    """
    # Local import: this module never imports ``sys`` itself.
    import sys

    try:
        peaks = list()

        hotspot = par['hotspot']

        datafile = par['datafile']

        maxinsert = par['maxinsert']

        bayesfactorthreshold = par['bayesfactorthreashold']

        jobtype = par['jobtype']

        gloablumbda = par['gloablumbda']

        fregion = par['fregion']

        start = hotspot.start

        end = hotspot.end

        chromosome = hotspot.chromosome

        chrlength = fregion.chrs_length[chromosome]

        # Padded query region, clamped to the chromosome.
        regionstart = max(start - 5100, 1)

        regionend = min(end + 5100, chrlength)

        datacount = depthcount(bamfile=datafile, regionchromosome=chromosome, regionstart=regionstart,
                               regionend=regionend, maxinsert=maxinsert, jobtype=jobtype)

        def windowcounts(flank):
            # One count per in-chromosome site; uncovered sites count as 0.
            return [datacount[site] if site in datacount else 0
                    for site in range(start - flank, end + flank)
                    if 0 <= site <= chrlength]

        (window5klhat, window5kphat) = cEM_zip(windowcounts(2500))

        (window10klhat, window10kphat) = cEM_zip(windowcounts(5000))

        maxlhat = max(window5klhat, window10klhat, gloablumbda)

        if maxlhat > 400:

            maxlhat = gloablumbda * 5

        bayesfactorscore = dict()

        enrichedsites = list()

        for wsite in range(start - 1, end + 1):

            if wsite in datacount:

                # Counts below 2 are scored as if they were 2.
                nowbayesfactor = bayesfactor(locallambda=maxlhat,
                                             peakscore=max(datacount[wsite], 2))

                bayesfactorscore[wsite] = nowbayesfactor

                if nowbayesfactor > bayesfactorthreshold:

                    enrichedsites.append(wsite)

        tmppeaks = continueregion(points=enrichedsites, minlength=1)

        for iniid, tmppeak in enumerate(tmppeaks, 1):

            tmppeakstart = tmppeak['start_site']

            tmppeakend = tmppeak['end_site']

            totalbayesscore = 0

            maxscore = 0

            maxsite = 0

            for site in range(tmppeakstart, tmppeakend + 1):

                score = bayesfactorscore[site]

                totalbayesscore = totalbayesscore + score

                if score > maxscore:

                    # Fixed: was ``score = maxscore``, which never raised the
                    # running maximum, so the summit was the last positive site.
                    maxscore = score

                    maxsite = site

            avgbayescore = totalbayesscore/(tmppeakend - tmppeakstart + 1)

            peakid = hotspot.hotspotid + '.' + str(iniid)

            peaks.append(Peak(start=tmppeakstart, end=tmppeakend, chromosome=chromosome,
                              peakpoint=maxsite, peakid=peakid, score=avgbayescore,
                              parent=hotspot.hotspotid))

        return peaks

    # ``as`` form parses on both Python 2.6+ and Python 3.
    except Exception as e:

        print ('got exception in Jazzlib.peaksscan.peakscan_without_control: %r, terminating the pool' % (e,))

        # Double parentheses: prints one tuple under both Python 2 and 3.
        print ((par['hotspot'].chromosome, par['hotspot'].start, par['hotspot'].end))

    except KeyboardInterrupt:

        print ("You cancelled the program!")

        sys.exit(1)
263 |
264 |
265 |
266 |
def peakscan_control(datafile, maxinsert, bayesfactorthreshold, nthreads, chipfregion,
                     jobtype, hotspots, gloablumbda, inputfile, ratio, inputfregion):
    """Scan every hotspot for peaks in parallel, using a control sample.

    One parameter dict per hotspot is handed to ``peakscan_control_worker``
    through a ``multiprocessing.Pool`` of ``nthreads`` processes.

    :param datafile: forwarded to the worker as ``'datafile'``.
    :param maxinsert: forwarded to the worker as ``'maxinsert'``.
    :param bayesfactorthreshold: forwarded as ``'bayesfactorthreashold'``.
    :param nthreads: number of pool processes.
    :param chipfregion: accepted for interface compatibility; not used here.
    :param jobtype: forwarded to the worker as ``'jobtype'``.
    :param hotspots: iterable of hotspots, one worker task each.
    :param gloablumbda: forwarded to the worker as ``'gloablumbda'``.
    :param inputfile: control file, forwarded as ``'inputfile'``.
    :param ratio: forwarded to the worker as ``'ratio'``.
    :param inputfregion: forwarded to the worker as ``'fregion'``.
    :return: flat list of the peaks from all workers, in hotspot order.
        Returns None implicitly when an exception is caught. That includes
        a worker having returned None, because iterating that result raises
        here.
    """
    # Local import: this module never imports ``sys`` itself.
    import sys

    pool = Pool(nthreads)

    try:

        pars = [
            {
                'hotspot': hotspot,
                'datafile': datafile,
                'maxinsert': maxinsert,
                # Key spelling must match what the worker reads.
                'bayesfactorthreashold': bayesfactorthreshold,
                'jobtype': jobtype,
                'gloablumbda': gloablumbda,
                'ratio': ratio,
                'inputfile': inputfile,
                'fregion': inputfregion,
            }
            for hotspot in hotspots
        ]

        # Fixed: this used to dispatch to peakscan_withoutcontrol_worker,
        # which ignores 'inputfile' and 'ratio', so the control was never used.
        peaksinthreads = pool.map(peakscan_control_worker, pars)

        peaks = list()

        for hotspotpeaks in peaksinthreads:

            for peaknow in hotspotpeaks:

                print (peaknow.peakid)

                peaks.append(peaknow)

        pool.close()

        return peaks

    except KeyboardInterrupt:

        pool.terminate()

        print ("You cancelled the program!")

        sys.exit(1)

    # ``as`` form parses on both Python 2.6+ and Python 3.
    except Exception as e:

        # Label corrected: it previously named peakscan_without_control.
        print ('got exception in Jazzlib.peaksscan.peakscan_control: %r, terminating the pool' % (e,))

        pool.terminate()

        print ('pool is terminated')

    finally:
        # Runs after close() on success and after terminate() on failure.
        pool.join()
336 |
337 |
def peakscan_control_worker(par):
    """Call peaks inside one hotspot, with the local rate taken from a control.

    ``par`` is a dict; the keys read here are ``'hotspot'`` (object with
    ``start``, ``end``, ``chromosome`` and ``hotspotid``), ``'datafile'``,
    ``'inputfile'``, ``'maxinsert'``, ``'bayesfactorthreashold'``,
    ``'jobtype'``, ``'gloablumbda'``, ``'ratio'`` and ``'fregion'`` (object
    whose ``chrs_length`` maps chromosome to length).

    Steps:

    1. ``depthcount`` is queried for both files over the hotspot padded by
       5100 on each side, clamped to ``[1, chromosome length]``. Each result
       is used as a mapping from site to count.
    2. Control counts in windows of +/-2500, +/-5000 and +/-500 around the
       hotspot are passed to ``cEM_zip``, with sites absent from the mapping
       counted as 0. The first element of each result is taken as a rate
       estimate.
       NOTE(review): presumably a zero-inflated Poisson fit - confirm in
       cEM_zip.pyx.
    3. The largest of the three estimates and ``gloablumbda`` is the local
       rate. If it exceeds 400, ``gloablumbda * 5`` is used instead.
    4. Every covered sample site from ``start - 1`` to ``end`` has its count
       multiplied by ``ratio`` and truncated with ``int``, then scored with
       ``bayesfactor``. Counts below 2 are scored as 2. Sites scoring above
       the threshold are grouped by ``continueregion`` into peaks.

    Returns a list of Peak objects whose ``score`` is the mean site score and
    whose ``peakpoint`` is the first site with the highest score. When an
    exception is caught it is printed and None is returned implicitly.
    """
    # Local import: this module never imports ``sys`` itself.
    import sys

    try:
        peaks = list()

        hotspot = par['hotspot']

        datafile = par['datafile']

        inputfile = par['inputfile']

        maxinsert = par['maxinsert']

        bayesfactorthreshold = par['bayesfactorthreashold']

        jobtype = par['jobtype']

        gloablumbda = par['gloablumbda']

        ratio = par['ratio']

        fregion = par['fregion']

        start = hotspot.start

        end = hotspot.end

        chromosome = hotspot.chromosome

        chrlength = fregion.chrs_length[chromosome]

        # Padded query region, clamped to the chromosome.
        regionstart = max(start - 5100, 1)

        regionend = min(end + 5100, chrlength)

        datacount = depthcount(bamfile=datafile, regionchromosome=chromosome, regionstart=regionstart,
                               regionend=regionend, maxinsert=maxinsert, jobtype=jobtype)

        inputcount = depthcount(bamfile=inputfile, regionchromosome=chromosome, regionstart=regionstart,
                                regionend=regionend, maxinsert=maxinsert, jobtype=jobtype)

        def windowcounts(flank):
            # One control count per in-chromosome site; uncovered sites are 0.
            return [inputcount[site] if site in inputcount else 0
                    for site in range(start - flank, end + flank)
                    if 0 <= site <= chrlength]

        (window5klhat, window5kphat) = cEM_zip(windowcounts(2500))

        (window10klhat, window10kphat) = cEM_zip(windowcounts(5000))

        (window1klhat, window1kphat) = cEM_zip(windowcounts(500))

        maxlhat = max(window5klhat, window10klhat, window1klhat, gloablumbda)

        if maxlhat > 400:

            maxlhat = gloablumbda * 5

        bayesfactorscore = dict()

        enrichedsites = list()

        for wsite in range(start - 1, end + 1):

            if wsite in datacount:

                # Scale the sample count by ratio; below 2 is scored as 2.
                nowcount = int(datacount[wsite]*ratio)

                nowbayesfactor = bayesfactor(locallambda=maxlhat,
                                             peakscore=max(nowcount, 2))

                bayesfactorscore[wsite] = nowbayesfactor

                if nowbayesfactor > bayesfactorthreshold:

                    enrichedsites.append(wsite)

        tmppeaks = continueregion(points=enrichedsites, minlength=1)

        for iniid, tmppeak in enumerate(tmppeaks, 1):

            tmppeakstart = tmppeak['start_site']

            tmppeakend = tmppeak['end_site']

            totalbayesscore = 0

            maxscore = 0

            maxsite = 0

            for site in range(tmppeakstart, tmppeakend + 1):

                score = bayesfactorscore[site]

                totalbayesscore = totalbayesscore + score

                if score > maxscore:

                    # Fixed: was ``score = maxscore``, which never raised the
                    # running maximum, so the summit was the last positive site.
                    maxscore = score

                    maxsite = site

            avgbayescore = totalbayesscore/(tmppeakend - tmppeakstart + 1)

            peakid = hotspot.hotspotid + '.' + str(iniid)

            peaks.append(Peak(start=tmppeakstart, end=tmppeakend, chromosome=chromosome,
                              peakpoint=maxsite, peakid=peakid, score=avgbayescore,
                              parent=hotspot.hotspotid))

        return peaks

    # ``as`` form parses on both Python 2.6+ and Python 3.
    except Exception as e:

        # Label corrected: it previously named peakscan_without_control.
        print ('got exception in Jazzlib.peaksscan.peakscan_control_worker: %r, terminating the pool' % (e,))

        # Double parentheses: prints one tuple under both Python 2 and 3.
        print ((par['hotspot'].chromosome, par['hotspot'].start, par['hotspot'].end))

    except KeyboardInterrupt:

        print ("You cancelled the program!")

        sys.exit(1)
--------------------------------------------------------------------------------