├── Jazzlib ├── __init__.py ├── Peak.pyc ├── sta.pyc ├── FRegion.pyc ├── Hotspot.pyc ├── bgcount.pyc ├── cEM_zip.so ├── jazzio.pyc ├── kernel.pyc ├── region.pyc ├── __init__.pyc ├── countreads.pyc ├── localmax.pyc ├── peakcount.pyc ├── peaksscan.pyc ├── randombg.pyc ├── hotspotsscan.pyc ├── kernelsmooth.pyc ├── readscounter.pyc ├── hotspotscount.pyc ├── normalize_ratio.pyc ├── cEM_zip.cpython-36m-darwin.so ├── cEM_zip.cpython-37m-darwin.so ├── __pycache__ │ ├── Peak.cpython-36.pyc │ ├── sta.cpython-36.pyc │ ├── FRegion.cpython-36.pyc │ ├── FRegion.cpython-37.pyc │ ├── Hotspot.cpython-36.pyc │ ├── jazzio.cpython-36.pyc │ ├── kernel.cpython-36.pyc │ ├── region.cpython-36.pyc │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-37.pyc │ ├── localmax.cpython-36.pyc │ ├── peaksscan.cpython-36.pyc │ ├── randombg.cpython-36.pyc │ ├── countreads.cpython-36.pyc │ ├── countreads.cpython-37.pyc │ ├── hotspotsscan.cpython-36.pyc │ ├── kernelsmooth.cpython-36.pyc │ └── normalize_ratio.cpython-36.pyc ├── cEM_zip.cpython-36m-x86_64-linux-gnu.so ├── Peak.py ├── Hotspot.py ├── kernel.py ├── cEM_zip.pyx ├── normalize_ratio.py ├── normalize_ratio.py.bak ├── kernelsmooth.py ├── kernelsmooth.py.bak ├── region.py ├── region.py.bak ├── jazzio.py.bak ├── jazzio.py ├── randombg.py ├── randombg.py.bak ├── localmax.py ├── sta.py ├── localmax.py.bak ├── sta.py.bak ├── FRegion.py.bak ├── FRegion.py ├── Jazz.py ├── Jazz.py.bak ├── bgcount.py ├── bgcount.py.bak ├── hotspotsscan.py ├── hotspotsscan.py.bak ├── peaksscan.py └── peaksscan.py.bak ├── requirements.txt ├── .idea ├── .gitignore ├── vcs.xml ├── inspectionProfiles │ └── profiles_settings.xml ├── modules.xml ├── misc.xml └── Jazz.iml ├── .DS_Store ├── setup.py ├── Readme.txt └── Jazz.py /Jazzlib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | Cython 2 | numpy 3 | pysam 4 | scipy 5 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Default ignored files 3 | /workspace.xml -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/.DS_Store -------------------------------------------------------------------------------- /Jazzlib/Peak.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/Peak.pyc -------------------------------------------------------------------------------- /Jazzlib/sta.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/sta.pyc -------------------------------------------------------------------------------- /Jazzlib/FRegion.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/FRegion.pyc -------------------------------------------------------------------------------- /Jazzlib/Hotspot.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/Hotspot.pyc -------------------------------------------------------------------------------- /Jazzlib/bgcount.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/bgcount.pyc -------------------------------------------------------------------------------- /Jazzlib/cEM_zip.so: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/cEM_zip.so -------------------------------------------------------------------------------- /Jazzlib/jazzio.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/jazzio.pyc -------------------------------------------------------------------------------- /Jazzlib/kernel.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/kernel.pyc -------------------------------------------------------------------------------- /Jazzlib/region.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/region.pyc -------------------------------------------------------------------------------- /Jazzlib/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__init__.pyc -------------------------------------------------------------------------------- /Jazzlib/countreads.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/countreads.pyc -------------------------------------------------------------------------------- /Jazzlib/localmax.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/localmax.pyc -------------------------------------------------------------------------------- /Jazzlib/peakcount.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/peakcount.pyc 
-------------------------------------------------------------------------------- /Jazzlib/peaksscan.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/peaksscan.pyc -------------------------------------------------------------------------------- /Jazzlib/randombg.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/randombg.pyc -------------------------------------------------------------------------------- /Jazzlib/hotspotsscan.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/hotspotsscan.pyc -------------------------------------------------------------------------------- /Jazzlib/kernelsmooth.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/kernelsmooth.pyc -------------------------------------------------------------------------------- /Jazzlib/readscounter.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/readscounter.pyc -------------------------------------------------------------------------------- /Jazzlib/hotspotscount.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/hotspotscount.pyc -------------------------------------------------------------------------------- /Jazzlib/normalize_ratio.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/normalize_ratio.pyc -------------------------------------------------------------------------------- 
/Jazzlib/cEM_zip.cpython-36m-darwin.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/cEM_zip.cpython-36m-darwin.so -------------------------------------------------------------------------------- /Jazzlib/cEM_zip.cpython-37m-darwin.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/cEM_zip.cpython-37m-darwin.so -------------------------------------------------------------------------------- /Jazzlib/__pycache__/Peak.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/Peak.cpython-36.pyc -------------------------------------------------------------------------------- /Jazzlib/__pycache__/sta.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/sta.cpython-36.pyc -------------------------------------------------------------------------------- /Jazzlib/__pycache__/FRegion.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/FRegion.cpython-36.pyc -------------------------------------------------------------------------------- /Jazzlib/__pycache__/FRegion.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/FRegion.cpython-37.pyc -------------------------------------------------------------------------------- /Jazzlib/__pycache__/Hotspot.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/Hotspot.cpython-36.pyc -------------------------------------------------------------------------------- /Jazzlib/__pycache__/jazzio.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/jazzio.cpython-36.pyc -------------------------------------------------------------------------------- /Jazzlib/__pycache__/kernel.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/kernel.cpython-36.pyc -------------------------------------------------------------------------------- /Jazzlib/__pycache__/region.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/region.cpython-36.pyc -------------------------------------------------------------------------------- /Jazzlib/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Jazzlib/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /Jazzlib/__pycache__/localmax.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/localmax.cpython-36.pyc 
-------------------------------------------------------------------------------- /Jazzlib/__pycache__/peaksscan.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/peaksscan.cpython-36.pyc -------------------------------------------------------------------------------- /Jazzlib/__pycache__/randombg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/randombg.cpython-36.pyc -------------------------------------------------------------------------------- /Jazzlib/__pycache__/countreads.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/countreads.cpython-36.pyc -------------------------------------------------------------------------------- /Jazzlib/__pycache__/countreads.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/countreads.cpython-37.pyc -------------------------------------------------------------------------------- /Jazzlib/__pycache__/hotspotsscan.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/hotspotsscan.cpython-36.pyc -------------------------------------------------------------------------------- /Jazzlib/__pycache__/kernelsmooth.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/kernelsmooth.cpython-36.pyc -------------------------------------------------------------------------------- 
/Jazzlib/cEM_zip.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/cEM_zip.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /Jazzlib/__pycache__/normalize_ratio.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/Jazz/master/Jazzlib/__pycache__/normalize_ratio.cpython-36.pyc -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | 4 | setup( 5 | name = "cEM_zip", 6 | ext_modules = cythonize("Jazzlib/*.pyx"), 7 | ) 8 | 9 | #python setup.py build_ext --inplace 10 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/Jazz.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 
| 8 | 9 | 11 | -------------------------------------------------------------------------------- /Jazzlib/Peak.py: -------------------------------------------------------------------------------- 1 | class Peak: 2 | """ 3 | Peaks 4 | 5 | """ 6 | 7 | def __init__(self, start, end, chromosome, peakpoint, peakid, score, parent=1, fdr=1): 8 | 9 | self.start = start 10 | 11 | self.end = end 12 | 13 | self.chromosome = chromosome 14 | 15 | self.peakpoint = peakpoint 16 | 17 | self.peakid = peakid 18 | 19 | self.score = score 20 | 21 | self.fdr = fdr 22 | 23 | self.parent = parent 24 | 25 | -------------------------------------------------------------------------------- /Jazzlib/Hotspot.py: -------------------------------------------------------------------------------- 1 | class Hotspot: 2 | """ 3 | Hotspot 4 | 5 | """ 6 | 7 | def __init__(self, start, end, chromosome, hotspotid, peaks=list(), score=0, fdr=1): 8 | 9 | self.start = start 10 | 11 | self.end = end 12 | 13 | self.chromosome = chromosome 14 | 15 | self.hotspotid = hotspotid 16 | 17 | self.score = score 18 | 19 | self.fdr = fdr 20 | 21 | self.peaks = peaks 22 | 23 | def addpeak(self, peak): 24 | 25 | self.peaks.append(peak) 26 | -------------------------------------------------------------------------------- /Jazzlib/kernel.py: -------------------------------------------------------------------------------- 1 | from numpy import * 2 | #from scipy.ndimage.filters import * 3 | 4 | 5 | def kde(z, w, xv): 6 | 7 | return sum(exp(-0.5*((z-xv)/w)**2)/sqrt(2*pi*w**2)) 8 | 9 | 10 | def smooth_kernel(length): 11 | 12 | if length % 2 == 0: 13 | 14 | length = length + 1 15 | 16 | bandwidth = (length - 1)/6.0 17 | 18 | one_kernel = dict() 19 | 20 | for pos in linspace(-(length-1)/2, (length-1)/2, length): 21 | 22 | one_kernel[int(pos)] = kde(pos, bandwidth, 0) 23 | 24 | return one_kernel 25 | 26 | 27 | def smooth_kernel_adj(length, minscore): 28 | 29 | if length % 2 == 0: 30 | 31 | length = length + 1 32 | 33 | bandwidth = 
(length - 1)/6.0 34 | 35 | one_kernel = dict() 36 | 37 | for pos in linspace(-(length-1)/2, (length-1)/2, length): 38 | 39 | one_kernel[int(pos)] = kde(pos, bandwidth, 0)/minscore 40 | 41 | return one_kernel 42 | -------------------------------------------------------------------------------- /Jazzlib/cEM_zip.pyx: -------------------------------------------------------------------------------- 1 | def cEM_zip(testdata): 2 | 3 | cdef float sumzip = sum(testdata) 4 | 5 | cdef int lengthoflist = len(testdata) 6 | 7 | cdef float phat = 0.5 8 | 9 | cdef float phatpre = -1.0 10 | 11 | cdef float lhatpre = -1.0 12 | 13 | cdef float base 14 | 15 | cdef int i = 0 16 | 17 | cdef int j = 0 18 | 19 | cdef int n 20 | 21 | cdef float c 22 | 23 | zhat = [] 24 | 25 | zerolist = [] 26 | 27 | for i from 0<=i20, default==600 24 | -t THRESHOLD, --threshold=THRESHOLD 25 | Hot spots threshold, default=4.0 26 | -l MINLENGTH, --minlength=MINLENGTH 27 | minimum length of hot spots, default=50 28 | -p PVALUE, --pavlue=PVALUE 29 | p-value cutoff for peak identification, default=0.01 30 | -i INITIAL, --initial=INITIAL 31 | Peak's initial length, >5 and chr_length: 25 | 26 | renewend = chr_length 27 | 28 | insertsize_middle_site_count = midsiteinsersizecounter(bamfile=bamfile, regionchromosome=regionchromosome, 29 | regionstart=renewstart, regionend=renewend, 30 | jobtype=jobtype, maxinsert=maxinsert) 31 | 32 | 33 | renewlength = renewend - renewstart + 1 34 | 35 | smoothed_score = np.repeat(0, renewlength) 36 | 37 | for insertlen in insertsize_middle_site_count: 38 | 39 | # print ("count size", insertlen) 40 | 41 | readcount_nowinsertsize = list() 42 | 43 | kernelnow = smooth_kernel(insertlen) 44 | 45 | kernel_score = list() 46 | 47 | for w in sorted(kernelnow): 48 | 49 | kernel_score.append(kernelnow[w]) 50 | 51 | for n in range(renewstart, renewend+1): 52 | 53 | nowscore = 0 54 | 55 | if n in insertsize_middle_site_count[insertlen]: 56 | 57 | nowscore = 
insertsize_middle_site_count[insertlen][n] 58 | 59 | readcount_nowinsertsize.append(nowscore) 60 | 61 | nowsmoothed = np.correlate(np.array(readcount_nowinsertsize), kernel_score, "same") 62 | 63 | smoothed_score = nowsmoothed + smoothed_score 64 | 65 | outputscore = dict() 66 | 67 | outputscore['chromosome'] = regionchromosome 68 | 69 | outputscore['score'] = dict() 70 | 71 | # print (smoothed_score[0]) 72 | 73 | for j in range(0, renewlength): 74 | 75 | nowsite = j + renewstart 76 | 77 | nowscore = smoothed_score[j] 78 | 79 | if regionstart <= nowsite <= regionend: 80 | 81 | outputscore['score'][nowsite] = nowscore 82 | 83 | return outputscore 84 | 85 | except KeyboardInterrupt: 86 | 87 | raise KeyboardInterruptError() 88 | sys.exit(0) -------------------------------------------------------------------------------- /Jazzlib/kernelsmooth.py.bak: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from countreads import * 3 | from kernel import * 4 | import numpy as np 5 | 6 | 7 | class KeyboardInterruptError(Exception): 8 | 9 | pass 10 | 11 | 12 | def regionsmooth(bamfile, jobtype, maxinsert, regionchromosome, regionstart, regionend, chr_length): 13 | 14 | try: 15 | 16 | renewstart = regionstart - maxinsert*2 17 | 18 | renewend = regionend + maxinsert*2 19 | 20 | if renewstart < 1: 21 | 22 | renewstart = 1 23 | 24 | if renewend > chr_length: 25 | 26 | renewend = chr_length 27 | 28 | insertsize_middle_site_count = midsiteinsersizecounter(bamfile=bamfile, regionchromosome=regionchromosome, 29 | regionstart=renewstart, regionend=renewend, 30 | jobtype=jobtype, maxinsert=maxinsert) 31 | 32 | 33 | renewlength = renewend - renewstart + 1 34 | 35 | smoothed_score = np.repeat(0, renewlength) 36 | 37 | for insertlen in insertsize_middle_site_count: 38 | 39 | # print ("count size", insertlen) 40 | 41 | readcount_nowinsertsize = list() 42 | 43 | kernelnow = smooth_kernel(insertlen) 44 | 45 | kernel_score = 
list() 46 | 47 | for w in sorted(kernelnow): 48 | 49 | kernel_score.append(kernelnow[w]) 50 | 51 | for n in range(renewstart, renewend+1): 52 | 53 | nowscore = 0 54 | 55 | if n in insertsize_middle_site_count[insertlen]: 56 | 57 | nowscore = insertsize_middle_site_count[insertlen][n] 58 | 59 | readcount_nowinsertsize.append(nowscore) 60 | 61 | nowsmoothed = np.correlate(np.array(readcount_nowinsertsize), kernel_score, "same") 62 | 63 | smoothed_score = nowsmoothed + smoothed_score 64 | 65 | outputscore = dict() 66 | 67 | outputscore['chromosome'] = regionchromosome 68 | 69 | outputscore['score'] = dict() 70 | 71 | # print (smoothed_score[0]) 72 | 73 | for j in range(0, renewlength): 74 | 75 | nowsite = j + renewstart 76 | 77 | nowscore = smoothed_score[j] 78 | 79 | if regionstart <= nowsite <= regionend: 80 | 81 | outputscore['score'][nowsite] = nowscore 82 | 83 | return outputscore 84 | 85 | except KeyboardInterrupt: 86 | 87 | raise KeyboardInterruptError() 88 | sys.exit(0) -------------------------------------------------------------------------------- /Jazzlib/region.py: -------------------------------------------------------------------------------- 1 | def effectregion(chrlength, windowsize, bw): 2 | 3 | """ 4 | count effect region 5 | ===================-- 6 | --=================-- 7 | 8 | """ 9 | scare = int(chrlength/windowsize) 10 | 11 | efregions = dict() 12 | 13 | for i in range(0, scare+1): 14 | efregions[i] = dict() 15 | if i == 0: 16 | 17 | efregions[i]['ctstart'] = 1 18 | efregions[i]['ctend'] = int(windowsize + 1.5 * bw) 19 | efregions[i]['efstart'] = 1 20 | efregions[i]['efend'] = int(windowsize) 21 | elif i == scare: 22 | 23 | efregions[i]['ctstart'] = int(i * windowsize - 1.5 * bw) 24 | efregions[i]['ctend'] = int(chrlength) 25 | efregions[i]['efstart'] = int(i * windowsize + 1) 26 | efregions[i]['efend'] = int(chrlength) 27 | 28 | else: 29 | 30 | efregions[i]['ctstart'] = int(i * windowsize - 1.5 * bw) 31 | efregions[i]['ctend'] = int((i + 1) * 
windowsize + 1.5 * bw) 32 | efregions[i]['efstart'] = int(i * windowsize + 1) 33 | efregions[i]['efend'] = int((i + 1) * windowsize) 34 | 35 | return efregions 36 | 37 | 38 | def continueregion(points, minlength=2): 39 | 40 | try: 41 | 42 | points.sort() 43 | 44 | start_index = 0 45 | 46 | end_index = 0 47 | 48 | continue_region = list() 49 | 50 | for index_now in range(1, len(points)): 51 | 52 | pre_index = index_now - 1 53 | 54 | if points[pre_index] + 1 == points[index_now]: 55 | 56 | if index_now == len(points) -1: 57 | 58 | if points[index_now] - points[start_index] + 1>= minlength : 59 | #print (points[start_index], points[index_now]) 60 | region_now = dict() 61 | region_now['start_site'] = points[start_index] 62 | region_now['end_site'] = points[index_now] 63 | continue_region.append(region_now) 64 | 65 | else: 66 | 67 | end_index = index_now 68 | 69 | else: 70 | 71 | if points[end_index] - points[start_index] + 1 >= minlength : 72 | 73 | #print (points[start_index], points[end_index]) 74 | region_now = dict() 75 | region_now['start_site'] = points[start_index] 76 | region_now['end_site'] = points[end_index] 77 | continue_region.append(region_now) 78 | 79 | start_index = index_now 80 | 81 | end_index = index_now 82 | 83 | return continue_region 84 | 85 | except Exception as e: 86 | 87 | print(('got exception in Jazzlib.region.continueregion: %r, terminating the pool' % (e,))) 88 | 89 | 90 | def windowregion(chr_length, site, windowsize, chromsome): 91 | 92 | windowstart = site - int(windowsize/2) 93 | 94 | windowend = site + int(windowsize/2) 95 | 96 | if windowstart < 1: 97 | 98 | windowstart = 1 99 | 100 | if windowend > chr_length: 101 | 102 | windowend = chr_length 103 | 104 | windowregion = chromsome+":"+str(windowstart)+'-'+str(windowend) 105 | 106 | return windowregion 107 | 108 | 109 | if __name__ == "__main__": 110 | 111 | try: 112 | 113 | relist = [1,2,3,4,7,9,10] 114 | 115 | creg = continueregion(relist, 1) 116 | 117 | print (creg) 118 | 119 | 
except: 120 | 121 | pass 122 | 123 | 124 | -------------------------------------------------------------------------------- /Jazzlib/region.py.bak: -------------------------------------------------------------------------------- 1 | def effectregion(chrlength, windowsize, bw): 2 | 3 | """ 4 | count effect region 5 | ===================-- 6 | --=================-- 7 | 8 | """ 9 | scare = int(chrlength/windowsize) 10 | 11 | efregions = dict() 12 | 13 | for i in range(0, scare+1): 14 | efregions[i] = dict() 15 | if i == 0: 16 | 17 | efregions[i]['ctstart'] = 1 18 | efregions[i]['ctend'] = int(windowsize + 1.5 * bw) 19 | efregions[i]['efstart'] = 1 20 | efregions[i]['efend'] = int(windowsize) 21 | elif i == scare: 22 | 23 | efregions[i]['ctstart'] = int(i * windowsize - 1.5 * bw) 24 | efregions[i]['ctend'] = int(chrlength) 25 | efregions[i]['efstart'] = int(i * windowsize + 1) 26 | efregions[i]['efend'] = int(chrlength) 27 | 28 | else: 29 | 30 | efregions[i]['ctstart'] = int(i * windowsize - 1.5 * bw) 31 | efregions[i]['ctend'] = int((i + 1) * windowsize + 1.5 * bw) 32 | efregions[i]['efstart'] = int(i * windowsize + 1) 33 | efregions[i]['efend'] = int((i + 1) * windowsize) 34 | 35 | return efregions 36 | 37 | 38 | def continueregion(points, minlength=2): 39 | 40 | try: 41 | 42 | points.sort() 43 | 44 | start_index = 0 45 | 46 | end_index = 0 47 | 48 | continue_region = list() 49 | 50 | for index_now in range(1, len(points)): 51 | 52 | pre_index = index_now - 1 53 | 54 | if points[pre_index] + 1 == points[index_now]: 55 | 56 | if index_now == len(points) -1: 57 | 58 | if points[index_now] - points[start_index] + 1>= minlength : 59 | #print (points[start_index], points[index_now]) 60 | region_now = dict() 61 | region_now['start_site'] = points[start_index] 62 | region_now['end_site'] = points[index_now] 63 | continue_region.append(region_now) 64 | 65 | else: 66 | 67 | end_index = index_now 68 | 69 | else: 70 | 71 | if points[end_index] - points[start_index] + 1 >= 
minlength : 72 | 73 | #print (points[start_index], points[end_index]) 74 | region_now = dict() 75 | region_now['start_site'] = points[start_index] 76 | region_now['end_site'] = points[end_index] 77 | continue_region.append(region_now) 78 | 79 | start_index = index_now 80 | 81 | end_index = index_now 82 | 83 | return continue_region 84 | 85 | except Exception, e: 86 | 87 | print ('got exception in Jazzlib.region.continueregion: %r, terminating the pool' % (e,)) 88 | 89 | 90 | def windowregion(chr_length, site, windowsize, chromsome): 91 | 92 | windowstart = site - int(windowsize/2) 93 | 94 | windowend = site + int(windowsize/2) 95 | 96 | if windowstart < 1: 97 | 98 | windowstart = 1 99 | 100 | if windowend > chr_length: 101 | 102 | windowend = chr_length 103 | 104 | windowregion = chromsome+":"+str(windowstart)+'-'+str(windowend) 105 | 106 | return windowregion 107 | 108 | 109 | if __name__ == "__main__": 110 | 111 | try: 112 | 113 | relist = [1,2,3,4,7,9,10] 114 | 115 | creg = continueregion(relist, 1) 116 | 117 | print (creg) 118 | 119 | except: 120 | 121 | pass 122 | 123 | 124 | -------------------------------------------------------------------------------- /Jazzlib/jazzio.py.bak: -------------------------------------------------------------------------------- 1 | import io 2 | from Peak import * 3 | from Hotspot import * 4 | from Peak import * 5 | from FRegion import * 6 | 7 | def peakbedswriter(samplename, peaks): 8 | 9 | bedfilename =samplename+ '_' + 'peak' + ".bed" 10 | 11 | open_bed = io.FileIO(bedfilename, 'w') 12 | 13 | for peak in peaks: 14 | 15 | #bedlist = [str(hotspot.chromosome), str(hotspot.start), str(hotspot.end), hotspot.hotspotid] 16 | bedlist = [str(peak.chromosome), str(peak.start), str(peak.end),str(peak.peakid),str(peak.score)] 17 | 18 | linker = "\t" 19 | 20 | outstring = linker.join(bedlist) + "\n" 21 | 22 | open_bed.write(outstring) 23 | 24 | open_bed.close() 25 | 26 | 27 | def peakbedgraphswriter(samplename, peaks): 28 | 29 | 
bedfilename =samplename+ '_' + 'peak' + ".bedgraph" 30 | 31 | open_bed = io.FileIO(bedfilename, 'w') 32 | 33 | for peak in peaks: 34 | 35 | #bedlist = [str(hotspot.chromosome), str(hotspot.start), str(hotspot.end), hotspot.hotspotid] 36 | bedlist = [str(peak.chromosome), str(peak.start), str(peak.end),str(peak.score)] 37 | 38 | linker = "\t" 39 | 40 | outstring = linker.join(bedlist) + "\n" 41 | 42 | open_bed.write(outstring) 43 | 44 | open_bed.close() 45 | 46 | 47 | def hotspotsbedswriter(samplename, hotspots): 48 | 49 | bedfilename =samplename+ '_' + 'hotspots' + ".bed" 50 | 51 | open_bed = io.FileIO(bedfilename, 'w') 52 | 53 | for hotspot in hotspots: 54 | 55 | #bedlist = [str(hotspot.chromosome), str(hotspot.start), str(hotspot.end), hotspot.hotspotid] 56 | bedlist = [str(hotspot.chromosome), str(hotspot.start), str(hotspot.end)] 57 | 58 | linker = "\t" 59 | 60 | outstring = linker.join(bedlist) + "\n" 61 | 62 | open_bed.write(outstring) 63 | 64 | open_bed.close() 65 | 66 | 67 | def hotpeakbedswriter2(samplename, hotspots): 68 | 69 | bedfilename =samplename+ '_' + 'peaks' + ".bed" 70 | 71 | open_bed = io.FileIO(bedfilename, 'w') 72 | 73 | for hotspot in hotspots: 74 | 75 | for peak in hotspot.peaks: 76 | 77 | bedlist = [str(peak.chromosome), str(peak.start), str(peak.end),str(peak.peakid),str(peak.score)] 78 | 79 | linker = "\t" 80 | 81 | outstring = linker.join(bedlist) + "\n" 82 | 83 | open_bed.write(outstring) 84 | 85 | open_bed.close() 86 | 87 | 88 | def jazzgffout(samplename, hotspots, peaks, fregion): 89 | 90 | bedfilename =samplename+ '_' + 'peaks_hotspots' + ".gff3" 91 | 92 | open_bed = io.FileIO(bedfilename, 'w') 93 | linker = "\t" 94 | 95 | frsite = dict() 96 | 97 | for fr in fregion.filted_region: 98 | 99 | (frchrnow,frstartend) = fr.split(":") 100 | 101 | (frstart,frend) = frstartend.split("-") 102 | 103 | for sitenow in range(int(frstart), int(frend)+1): 104 | 105 | if frchrnow in frsite: 106 | 107 | frsite[frchrnow][sitenow] = 1 108 | 109 | else: 
# ---- /Jazzlib/jazzio.py ----
"""Writers that export JAZZ peaks and hotspots as BED, bedGraph and GFF3 files."""
import io
from .Peak import *
from .Hotspot import *
from .Peak import *
from .FRegion import *


def _write_rows(filename, rows):
    """Write every row as one tab-separated, newline-terminated UTF-8 line.

    The file is opened in binary mode so the output bytes are the same on
    every platform, and the ``with`` block closes it even if a row fails.
    """
    with io.open(filename, 'wb') as handle:
        for row in rows:
            line = "\t".join(str(field) for field in row) + "\n"
            handle.write(line.encode('utf-8'))


def peakbedswriter(samplename, peaks):
    """Write ``<samplename>_peak.bed`` (chromosome, start, end, peak id, score)."""
    _write_rows(
        samplename + '_' + 'peak' + ".bed",
        ((peak.chromosome, peak.start, peak.end, peak.peakid, peak.score)
         for peak in peaks),
    )


def peakbedgraphswriter(samplename, peaks):
    """Write ``<samplename>_peak.bedgraph`` (chromosome, start, end, score)."""
    _write_rows(
        samplename + '_' + 'peak' + ".bedgraph",
        ((peak.chromosome, peak.start, peak.end, peak.score) for peak in peaks),
    )


def hotspotsbedswriter(samplename, hotspots):
    """Write ``<samplename>_hotspots.bed`` (chromosome, start, end)."""
    _write_rows(
        samplename + '_' + 'hotspots' + ".bed",
        ((hotspot.chromosome, hotspot.start, hotspot.end) for hotspot in hotspots),
    )


def hotpeakbedswriter2(samplename, hotspots):
    """Write ``<samplename>_peaks.bed`` with the peaks stored on each hotspot.

    Every peak found in ``hotspot.peaks`` becomes one line with chromosome,
    start, end, peak id and score.
    """
    _write_rows(
        samplename + '_' + 'peaks' + ".bed",
        ((peak.chromosome, peak.start, peak.end, peak.peakid, peak.score)
         for hotspot in hotspots
         for peak in hotspot.peaks),
    )


def _parse_region(region):
    """Split a ``"chromosome:start-end"`` string into ``(chromosome, start, end)``."""
    # rpartition keeps chromosome names that themselves contain ':' intact.
    chromosome, _, span = region.rpartition(":")
    start, _, end = span.partition("-")
    return chromosome, int(start), int(end)


def _touches_filtered(intervals, start, end):
    """Return True when the inclusive range start..end overlaps any interval."""
    for region_start, region_end in intervals:
        if region_start <= end and start <= region_end:
            return True
    return False


def jazzgffout(samplename, hotspots, peaks, fregion):
    """Write hotspots and peaks to ``<samplename>_peaks_hotspots.gff3``.

    Hotspots are written as ``gene`` features and peaks as ``CDS`` features
    with ``Parent=<peak.parent>``.  A hotspot that shares at least one base
    with a region listed in ``fregion.filted_region`` (strings of the form
    ``"chromosome:start-end"``, both ends inclusive) gets ``;anno=FREGION``
    appended, and so does every peak whose ``parent`` is such a hotspot's id.

    Args:
        samplename: prefix of the output file name.
        hotspots: iterable of objects with ``chromosome``, ``start``, ``end``
            and ``hotspotid`` attributes.
        peaks: iterable of objects with ``chromosome``, ``start``, ``end``,
            ``peakid`` and ``parent`` attributes.
        fregion: object exposing ``filted_region``.
    """
    filename = samplename + '_' + 'peaks_hotspots' + ".gff3"

    # Keep the filtered regions as intervals per chromosome instead of one
    # dictionary entry per base.
    filtered = dict()
    for region in fregion.filted_region:
        chromosome, region_start, region_end = _parse_region(region)
        filtered.setdefault(chromosome, []).append((region_start, region_end))

    hotspots_in_fr = set()

    with io.open(filename, 'wb') as handle:

        def emit(fields):
            handle.write(("\t".join(fields) + "\n").encode('utf-8'))

        for hotspot in hotspots:
            hotspotanno = "ID=" + str(hotspot.hotspotid)
            # Look the intervals up by the hotspot's own chromosome; the
            # previous code reused the chromosome of the last parsed region.
            intervals = filtered.get(hotspot.chromosome, ())
            if _touches_filtered(intervals, hotspot.start, hotspot.end):
                hotspotanno += ";anno=FREGION"
                hotspots_in_fr.add(hotspot.hotspotid)
            emit([str(hotspot.chromosome), "JAZZ", "gene",
                  str(hotspot.start), str(hotspot.end),
                  '.', '.', '.', hotspotanno])

        for peak in peaks:
            peakanno = "Parent=" + str(peak.parent) + ";" + "ID=" + str(peak.peakid)
            if peak.parent in hotspots_in_fr:
                peakanno += ";anno=FREGION"
            emit([str(peak.chromosome), "JAZZ", "CDS",
                  str(peak.start), str(peak.end),
                  '.', '.', '.', peakanno])
# ---- /Jazzlib/randombg.py ----
"""Estimate a background score threshold from simulated, randomly placed reads."""
from multiprocessing import Pool
from .FRegion import *
import random as rnd
from .kernel import *
from numpy import *
import sys  # sys.exit is called below; do not rely on a star-import leaking it


class KeyboardInterruptError(Exception):
    """Exception type exported by this module; nothing in this module raises it."""

    pass


def randombg2(fregion, nthreads, maxinsert, randomthreshold=2, runtime=1000, randomwindow=10000):
    """Return ``fregion.adjreads / fregion.countgenomelength``.

    The remaining parameters are accepted only so the signature matches
    :func:`randombg`; they are not used.
    """
    return fregion.adjreads / fregion.countgenomelength


def randombg(fregion, nthreads, maxinsert, randomthreshold=2, runtime=1000, randomwindow=10000):
    """Average the simulated background threshold over ``runtime`` simulations.

    Each simulation is run by :func:`sim_bg_worker` in a pool of ``nthreads``
    processes.

    Args:
        fregion: object exposing ``countgenomeuniqlength``,
            ``countgenomelength`` and ``adjreads``.
        nthreads: number of worker processes.
        maxinsert: passed to ``smooth_kernel(length=...)`` to build the kernel.
        randomthreshold: number of standard deviations added to the mean.
        runtime: number of simulations to average.
        randomwindow: length of the simulated window.

    Returns:
        The mean of the per-simulation thresholds.  If any exception other
        than KeyboardInterrupt occurs, it is printed, the pool is terminated
        and ``None`` is returned (unchanged legacy behaviour).
    """
    pool = Pool(nthreads)

    try:
        countgenomeuniqlength = fregion.countgenomeuniqlength

        uniqrate = countgenomeuniqlength / fregion.countgenomelength
        if uniqrate < 0.5:
            uniqrate = uniqrate * 2

        countreads = int(fregion.adjreads / countgenomeuniqlength * randomwindow) + 1

        onekernel = smooth_kernel(length=maxinsert)
        kernel_score = [onekernel[i] for i in sorted(onekernel)]

        # NOTE: the key 'randomwindo' (sic) is what sim_bg_worker reads.
        template = {
            'countreads': countreads,
            'kernel_score': kernel_score,
            'uniqrate': uniqrate,
            'randomwindo': randomwindow,
            'randomthreshold': randomthreshold,
        }
        pars = [dict(template) for _ in range(runtime)]

        thsum = 0
        for randth in pool.map(sim_bg_worker, pars):
            thsum = thsum + randth

        random_th = thsum / runtime

        pool.close()

        return random_th

    except KeyboardInterrupt:
        pool.terminate()
        print("You cancelled the program!")
        sys.exit(1)

    except Exception as e:
        print('got exception in Jazzlib.randombg.randombg: %r, terminating the pool' % (e,))
        pool.terminate()
        print('pool is terminated')

    finally:
        pool.join()


def sim_bg_worker(par):
    """Run one background simulation and return ``mean + k * std``.

    ``par`` is a dict with the keys ``countreads``, ``kernel_score``,
    ``uniqrate``, ``randomwindo`` (sic) and ``randomthreshold``.
    ``int(uniqrate * randomwindo)`` positions are sampled as placeable sites,
    ``countreads`` reads are dropped on them uniformly at random, the counts
    are correlated with ``kernel_score`` and the mean plus
    ``randomthreshold`` standard deviations of the result is returned.

    Any exception other than KeyboardInterrupt is printed and ``None`` is
    returned (unchanged legacy behaviour).
    """
    try:
        countreads = par['countreads']
        kernel_score = par['kernel_score']
        randomwindow = par['randomwindo']
        randomthreshold = par['randomthreshold']

        totaluniqsite = int(par['uniqrate'] * randomwindow)

        rand_reads_count = [0] * randomwindow

        sim_uniqsite = rnd.sample(range(0, randomwindow), totaluniqsite)

        for _ in range(0, countreads):
            # randrange never yields totaluniqsite, whereas
            # int(uniform(0, totaluniqsite)) could hit the end point and
            # index past the end of sim_uniqsite.
            rand_reads = sim_uniqsite[rnd.randrange(totaluniqsite)]
            rand_reads_count[rand_reads] = rand_reads_count[rand_reads] + 1

        smoothed_result = correlate(array(rand_reads_count), kernel_score, "same")

        return smoothed_result.mean() + randomthreshold * smoothed_result.std()

    except KeyboardInterrupt:
        print("You cancelled the program!")
        sys.exit(1)

    except Exception as e:
        print('got exception in Jazzlib.randombg.sim_bg_worker: %r, terminating the pool' % (e,))


if __name__ == "__main__":
    try:
        onekernel = smooth_kernel(length=100)

        kernel_score = [onekernel[i] for i in sorted(onekernel)]

        par = {
            'countreads': 100000,
            'kernel_score': kernel_score,
            'uniqrate': 0.3,
            'randomwindo': int(1e5),
            'randomthreshold': 3,
        }

        th = sim_bg_worker(par)

        print(th)

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)
range(runtime): 58 | 59 | par = dict() 60 | 61 | par['countreads'] = countreads 62 | 63 | par['kernel_score'] = kernel_score 64 | 65 | par['uniqrate'] = uniqrate 66 | 67 | par['randomwindo'] = randomwindow 68 | 69 | par['randomthreshold'] =randomthreshold 70 | 71 | # print (par) 72 | 73 | pars.append(par) 74 | 75 | randths = pool.map(sim_bg_worker, pars) 76 | 77 | thsum = 0 78 | 79 | for randth in randths: 80 | 81 | thsum = thsum + randth 82 | 83 | random_th = thsum/runtime 84 | 85 | pool.close() 86 | 87 | return random_th 88 | 89 | except KeyboardInterrupt: 90 | 91 | pool.terminate() 92 | 93 | print ("You cancelled the program!") 94 | 95 | sys.exit(1) 96 | 97 | except Exception, e: 98 | 99 | print ('got exception in Jazzlib.randombg.randombg: %r, terminating the pool' % (e,)) 100 | 101 | pool.terminate() 102 | 103 | print ('pool is terminated') 104 | 105 | finally: 106 | # print ('joining pool processes') 107 | pool.join() 108 | 109 | 110 | def sim_bg_worker(par): 111 | 112 | try: 113 | 114 | countreads = par['countreads'] 115 | 116 | kernel_score = par['kernel_score'] 117 | 118 | uniqrate = par['uniqrate'] 119 | 120 | randomwindow = par['randomwindo'] 121 | 122 | randomthreshold = par['randomthreshold'] 123 | 124 | totaluniqsite = int(uniqrate * randomwindow) 125 | 126 | rand_reads_count = list() 127 | 128 | region_site = range(0, randomwindow) 129 | 130 | for i in range(0, randomwindow): 131 | 132 | rand_reads_count.append(0) 133 | 134 | sim_uniqsite = rnd.sample(region_site, totaluniqsite) 135 | 136 | 137 | for k in range(0, countreads): 138 | 139 | rand_number = int(rnd.uniform(0, totaluniqsite)) 140 | 141 | rand_reads = sim_uniqsite[rand_number] 142 | 143 | # print (rand_number, rand_reads) 144 | 145 | rand_reads_count[rand_reads] = rand_reads_count[rand_reads] + 1 146 | 147 | smoothed_result = correlate(array(rand_reads_count), kernel_score, "same") 148 | 149 | # scores = list() 150 | 151 | rand_mean = smoothed_result.mean() 152 | 153 | rand_std = 
# ---- /Jazzlib/localmax.py ----
"""Find local maxima of the kernel-smoothed signal, window by window."""
from .kernelsmooth import *
from multiprocessing import Pool
from .kernel import *
from .FRegion import *
import sys  # sys.exit is called below; do not rely on a star-import leaking it


class KeyboardInterruptError(Exception):
    """Raised by the worker functions in place of KeyboardInterrupt."""

    pass


def get_all_localmax(bamfile, jobtype, maxinsert, nthreads, fregion, countchr, rndth):
    """Collect local maxima above the background threshold for all chromosomes.

    Every chromosome in ``countchr`` is cut into 100000-base windows extended
    by 200 bases on each side (clamped to 1..chromosome length).  Each window
    is processed by :func:`localmax_worker` in a pool of ``nthreads``
    processes.  A site is kept when its score exceeds
    ``int(adjreads / total_length + 1) * max(smooth_kernel(30).values())`` and
    it does not lie in a region listed in ``fregion.filted_region``.

    Returns:
        ``{chromosome: {site: score}}``.  If any exception other than
        KeyboardInterrupt occurs, it is printed, the pool is terminated and
        ``None`` is returned (unchanged legacy behaviour).
    """
    pool = Pool(nthreads)

    try:
        windowsize = 100000
        flank = 200

        kermax = max(smooth_kernel(30).values())

        pars = list()
        totallength = 0

        for chromosmoe in countchr:
            chr_length = fregion.chrs_length[chromosmoe]
            totallength = totallength + chr_length

            for scare in range(0, int(chr_length / windowsize) + 1):
                nowstart = scare * windowsize + 1 - flank
                nowend = (scare + 1) * windowsize + flank

                if nowend > chr_length:
                    nowend = chr_length
                if nowstart < 1:
                    nowstart = 1

                pars.append({
                    'region': chromosmoe + ":" + str(nowstart) + "-" + str(nowend),
                    'maxinsert': maxinsert,
                    'bamfile': bamfile,
                    'jobtype': jobtype,
                    'chrlength': chr_length,
                    'regionchromosome': chromosmoe,
                    'regionstart': nowstart,
                    'regionend': nowend,
                    'rndth': rndth,
                })

        avgcount = fregion.adjreads / totallength
        threshhold = int(avgcount + 1) * kermax

        # Sites inside filtered regions; as before, the end coordinate itself
        # is not included.
        filted_site = dict()
        for fr in fregion.filted_region:
            chromosome, sesite = fr.split(':')
            startsite, endsite = sesite.split('-')
            filted_site.setdefault(chromosome, set()).update(
                range(int(startsite), int(endsite)))

        localmax = dict()

        for each_worker_res in pool.map(localmax_worker, pars):
            for chromosome in each_worker_res:
                blocked = filted_site.get(chromosome, ())
                for site, score in each_worker_res[chromosome].items():
                    if score > threshhold and site not in blocked:
                        localmax.setdefault(chromosome, dict())[site] = score

        pool.close()

        return localmax

    except KeyboardInterrupt:
        pool.terminate()
        print("You cancelled the program!")
        sys.exit(1)

    except Exception as e:
        print('got exception in Jazzlib.localmax.get_all_localmax: %r, terminating the pool' % (e,))
        pool.terminate()
        print('pool is terminated')

    finally:
        pool.join()


def localmax_worker(par):
    """Smooth one window with ``regionsmooth`` and return its local maxima.

    ``par`` is a dict built by :func:`get_all_localmax`.  Exceptions other
    than KeyboardInterrupt are printed and ``None`` is returned.
    """
    try:
        smoothedscore = regionsmooth(bamfile=par['bamfile'],
                                     maxinsert=par['maxinsert'],
                                     jobtype=par['jobtype'],
                                     regionchromosome=par['regionchromosome'],
                                     regionstart=par['regionstart'],
                                     regionend=par['regionend'],
                                     chr_length=par['chrlength'])

        return smoothedlocalmax(smoothedscore, par['rndth'])

    except KeyboardInterrupt:
        raise KeyboardInterruptError()

    except Exception as e:
        print('got exception in Jazzlib.localmax.localmax_worker: %r,' % (e,))


def smoothedlocalmax(smoothedscore, rndth):
    """Return ``{chromosome: {site: score}}`` for local maxima scoring >= rndth.

    ``smoothedscore`` is a dict with a ``'chromosome'`` entry and a
    ``'score'`` mapping of site to smoothed score.  Exceptions other than
    KeyboardInterrupt are printed and ``None`` is returned.
    """
    try:
        scores = smoothedscore['score']
        startsite = min(scores.keys())
        endsite = max(scores.keys())

        chromosome = smoothedscore['chromosome']

        maxsites = dict()
        maxsites[chromosome] = dict()

        for nowsite in range(startsite + 2, endsite - 2):
            if scores[nowsite] >= rndth:
                # NOTE(review): part of this comparison was lost in the copy
                # under review (text between '<' and '>' was stripped).  The
                # surviving ends are "[nowsite-2]" and
                # ">= [nowsite+1] > [nowsite+2]"; the middle is reconstructed
                # symmetrically.  Confirm against the repository.
                if (scores[nowsite - 2] < scores[nowsite - 1] <= scores[nowsite]
                        >= scores[nowsite + 1] > scores[nowsite + 2]):
                    maxsites[chromosome][nowsite] = scores[nowsite]

        return maxsites

    except KeyboardInterrupt:
        raise KeyboardInterruptError()

    except Exception as e:
        print('got exception in Jazzlib.localmax.smoothedlocalmax: %r,' % (e,))


# ---- /Jazzlib/sta.py ----
# Statistical helpers: tail probabilities, Bayes factor and FDR annotation.
from scipy.special import gammaincc
# Only log, lgamma and e are used; the standard module provides all of them,
# so the undocumented "from scipy import math" re-export is not needed.
import math
import scipy.stats as stats
from decimal import Decimal, localcontext
from bisect import bisect_left
from .Peak import *
import sys


def bionompvalue(x, n, p):
    """Return P(X > x) for X ~ Binomial(n, p).

    Uses the survival function, which equals ``1 - cdf`` without the loss of
    precision that subtraction causes for small tail probabilities.
    """
    return stats.binom.sf(x, n, p)


def poissonpvalue(x, mu):
    """Return P(X > x) for X ~ Poisson(mu) as a ``Decimal``."""
    return Decimal(float(stats.poisson.sf(x, mu)))


def fdr(pnow, plist, prank):
    """Return ``min(1, len(plist) * pnow / rank)`` for the p-value ``pnow``.

    ``prank[i]`` is the rank of ``plist[i]``; the rank of the first entry of
    ``plist`` equal to ``pnow`` is used.

    Raises:
        ValueError: if ``pnow`` does not occur in ``plist`` (the previous
            code failed with an unbound local variable in that case).
    """
    lengthofplist = len(plist)

    for pvalue, now_rank in zip(plist, prank):
        if pvalue == pnow:
            return min(1, lengthofplist * pnow / now_rank)

    raise ValueError("p-value %r not found in plist" % (pnow,))


def bayesfactor(locallambda, peakscore):
    """Return ``2 * (ln Q(k, lam) + lgamma(k) - k * ln(lam) + lam)``, k = peakscore - 1.

    ``Q`` is ``scipy.special.gammaincc``.  If the computation raises (for
    example when ``Q`` evaluates to 0 and its logarithm is undefined), the
    exception and the arguments are printed and ``None`` is returned.
    """
    try:
        shape = peakscore - 1

        bayesfactor2 = 2 * (math.log(gammaincc(shape, locallambda))
                            + math.lgamma(shape)
                            - shape * math.log(locallambda)
                            + locallambda)

        return bayesfactor2

    except Exception as e:
        print('got exception in Jazzlib.sta.bayesfactor: %r,' % (e,))
        print(locallambda, peakscore)

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)


def _assign_fdr(chippeaks, matched_input_scores, thresholds):
    """Set ``peak.fdr`` on ChIP peaks from score-threshold counts.

    For every threshold ``t`` the ratio
    ``(# matched input scores >= t) / (# ChIP peaks with score >= t)`` is
    computed.  Each ChIP peak receives the ratio of the largest threshold
    that is strictly below its score; peaks not above any threshold are left
    untouched.  This equals scanning the thresholds in ascending order and
    overwriting, but uses sorted lists and bisection instead of nested loops.
    """
    chip_sorted = sorted(peak.score for peak in chippeaks)
    input_sorted = sorted(matched_input_scores)
    cutoffs = sorted(set(thresholds))

    fdr_at = []
    for cutoff in cutoffs:
        chippeakcount = len(chip_sorted) - bisect_left(chip_sorted, cutoff)
        inputpeakcount = len(input_sorted) - bisect_left(input_sorted, cutoff)
        fdr_at.append(float(inputpeakcount) / chippeakcount)

    for peaknow in chippeaks:
        index = bisect_left(cutoffs, peaknow.score) - 1
        if index >= 0:
            peaknow.fdr = fdr_at[index]

    return chippeaks


def fdr_control(chippeaks, inputpeaks, fdr):
    """Annotate ChIP peaks with an empirical FDR using summit-matched input peaks.

    An input peak matches a ChIP peak when both have the same ``chromosome``
    and ``peakpoint``.  The scores of matched ChIP peaks serve as thresholds
    (see :func:`_assign_fdr`).  The ``fdr`` argument is accepted but unused.

    Returns:
        ``chippeaks``, with ``fdr`` set on the affected peaks.
    """
    chips_by_summit = dict()
    for chippeak in chippeaks:
        key = (chippeak.chromosome, chippeak.peakpoint)
        chips_by_summit.setdefault(key, []).append(chippeak)

    matched_input = dict()  # ChIP peak id -> score of the last matching input peak
    chipscore = list()

    for inputpeak in inputpeaks:
        key = (inputpeak.chromosome, inputpeak.peakpoint)
        for chippeak in chips_by_summit.get(key, ()):
            matched_input[chippeak.peakid] = inputpeak.score
            chipscore.append(chippeak.score)

    return _assign_fdr(chippeaks, matched_input.values(), chipscore)


def fdr_control2(chippeaks, inputpeaks, fdr):
    """Annotate ChIP peaks with an empirical FDR using interval-matched input peaks.

    NOTE(review): most of this function's body was lost in the copy under
    review (text between '<' and '>' was stripped).  Only
    ``if inputpeak.start`` and the final ``peaknow.fdr = nowfdr`` assignment
    survived.  The matching rule below (ChIP summit inside the input peak)
    and the reuse of the :func:`fdr_control` counting scheme are assumptions
    -- confirm against the repository before relying on this function.
    """
    matched_input = dict()
    chipscore = list()

    for inputpeak in inputpeaks:
        for chippeak in chippeaks:
            if chippeak.chromosome == inputpeak.chromosome:
                if inputpeak.start <= chippeak.peakpoint <= inputpeak.end:
                    matched_input[chippeak.peakid] = inputpeak.score
                    chipscore.append(chippeak.score)

    return _assign_fdr(chippeaks, matched_input.values(), chipscore)


def fdr_bh(peaks):
    """Set ``peak.fdr = b01 * n / rank`` with ``b01 = exp(-score / 2)``.

    ``rank`` is the 1-based position of the first equal value when all
    ``b01`` values are sorted in descending order.

    NOTE(review): the descending ranking is kept from the previous code;
    confirm that it is the intended ordering.
    """
    # exp(-x) underflows to 0.0 for huge scores instead of overflowing the
    # way 1 / (e ** x) does.
    b01s = [math.exp(-peak.score / 2) for peak in peaks]

    listlength = len(b01s)

    # Rank of the first occurrence of each distinct value.
    first_rank = dict()
    for position, value in enumerate(sorted(b01s, reverse=True)):
        first_rank.setdefault(value, position + 1)

    for peak, b01 in zip(peaks, b01s):
        peak.fdr = b01 * listlength / first_rank[b01]

    return peaks


if __name__ == "__main__":

    try:
        for i in range(100, 2000, 100):
            for j in range(2, 80):
                bs = bayesfactor(locallambda=i, peakscore=j)
                print("locallambda:", i, "peakscore", j, "bs", bs)

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)
threhhold 82 | #threshhold = avgcount 83 | 84 | # print ("threshhold:", threshhold) 85 | 86 | filted_region = fregion.filted_region 87 | 88 | filted_site = dict() 89 | 90 | for fr in filted_region: 91 | 92 | chromosome, sesite = fr.split(':') 93 | 94 | startsite, endsite = sesite.split('-') 95 | 96 | startsite = int(startsite) 97 | 98 | endsite = int(endsite) 99 | 100 | if chromosome in filted_site: 101 | 102 | for i in range(startsite,endsite): 103 | 104 | filted_site[chromosome][i] = 1 105 | 106 | else: 107 | 108 | filted_site[chromosome] = dict() 109 | 110 | for i in range(startsite,endsite): 111 | 112 | filted_site[chromosome][i] = 1 113 | 114 | localmax = dict() 115 | 116 | localmax_worker_returnres = pool.map(localmax_worker, pars) 117 | 118 | for each_worker_res in localmax_worker_returnres: 119 | 120 | for chromosome in each_worker_res: 121 | 122 | for site in each_worker_res[chromosome]: 123 | 124 | if chromosome in localmax: 125 | 126 | if each_worker_res[chromosome][site] > threshhold: 127 | 128 | if chromosome in filted_site: 129 | 130 | if site in filted_site[chromosome]: 131 | 132 | continue 133 | 134 | localmax[chromosome][site] = each_worker_res[chromosome][site] 135 | 136 | else: 137 | 138 | if each_worker_res[chromosome][site]>threshhold: 139 | 140 | if chromosome in filted_site: 141 | 142 | if site in filted_site[chromosome]: 143 | 144 | continue 145 | 146 | localmax[chromosome] = dict() 147 | 148 | localmax[chromosome][site] = each_worker_res[chromosome][site] 149 | 150 | pool.close() 151 | 152 | # print (localmax) 153 | 154 | return localmax 155 | 156 | except KeyboardInterrupt: 157 | 158 | pool.terminate() 159 | 160 | print ("You cancelled the program!") 161 | 162 | sys.exit(1) 163 | 164 | except Exception, e: 165 | 166 | print ('got exception in Jazzlib.localmax.get_all_localmax: %r, terminating the pool' % (e,)) 167 | 168 | pool.terminate() 169 | 170 | print ('pool is terminated') 171 | 172 | finally: 173 | # print ('joining pool processes') 
174 | pool.join() 175 | # print ('join complete') 176 | 177 | 178 | def localmax_worker(par): 179 | 180 | try: 181 | 182 | nowregion = par['region'] 183 | 184 | maxinsert = par['maxinsert'] 185 | 186 | bamfile = par['bamfile'] 187 | 188 | jobtype = par['jobtype'] 189 | 190 | chr_length = par['chrlength'] 191 | 192 | regionchromosome = par['regionchromosome'] 193 | 194 | regionstart = par['regionstart'] 195 | 196 | regionend = par['regionend'] 197 | 198 | rndth = par['rndth'] 199 | 200 | # smoothedscore = regionsmooth(bamfile=bamfile, maxinsert=maxinsert, jobtype=jobtype, region=nowregion, 201 | # chr_length=chr_length) 202 | 203 | smoothedscore = regionsmooth(bamfile=bamfile, maxinsert=maxinsert, jobtype=jobtype, 204 | regionchromosome=regionchromosome, 205 | regionstart=regionstart, regionend=regionend, 206 | chr_length=chr_length) 207 | 208 | localmax = smoothedlocalmax(smoothedscore, rndth) 209 | 210 | return localmax 211 | 212 | except KeyboardInterrupt: 213 | 214 | raise KeyboardInterruptError() 215 | 216 | except Exception, e: 217 | 218 | print ('got exception in Jazzlib.localmax.localmax_worker: %r,' % (e,)) 219 | 220 | 221 | def smoothedlocalmax(smoothedscore, rndth): 222 | 223 | try: 224 | 225 | maxsites = dict() 226 | 227 | startsite = min(smoothedscore['score'].keys()) 228 | 229 | endsite = max(smoothedscore['score'].keys()) 230 | 231 | chromosome = smoothedscore['chromosome'] 232 | 233 | maxsites[chromosome] = dict() 234 | 235 | for nowsite in range(startsite+2, endsite-2): 236 | 237 | if smoothedscore['score'][nowsite] >=rndth: 238 | 239 | if (smoothedscore['score'][nowsite-2]=smoothedscore['score'][nowsite+1]>smoothedscore['score'][nowsite+2]): 240 | 241 | maxsites[chromosome][nowsite] = smoothedscore['score'][nowsite] 242 | 243 | # print (nowsite) 244 | 245 | return maxsites 246 | 247 | except KeyboardInterrupt: 248 | 249 | raise KeyboardInterruptError() 250 | 251 | except Exception, e: 252 | 253 | print ('got exception in 
Jazzlib.localmax.smoothedlocalmax: %r,' % (e,)) 254 | 255 | -------------------------------------------------------------------------------- /Jazzlib/sta.py.bak: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from scipy.special import gammaincc 4 | from scipy import math 5 | import scipy.stats as stats 6 | from decimal import Decimal, localcontext 7 | from Peak import * 8 | import sys 9 | 10 | def bionompvalue(x, n, p): 11 | 12 | bionompvalue = 1 - stats.binom.cdf(x, n, p) 13 | 14 | return bionompvalue 15 | 16 | 17 | def poissonpvalue(x,mu): 18 | 19 | poissonpvalue = Decimal(1) - Decimal(stats.poisson.cdf(x, mu)) 20 | 21 | return poissonpvalue 22 | 23 | 24 | 25 | def fdr(pnow, plist, prank): 26 | #FDR=length(pvalue)*pvalue/rank(pvalue) 27 | 28 | rankofplist = prank 29 | 30 | lengthofplist = len(plist) 31 | 32 | for i in range(0,lengthofplist): 33 | 34 | if plist[i] == pnow: 35 | now_rank = rankofplist[i] 36 | fdr = lengthofplist*pnow/now_rank 37 | fdr = min(1,fdr) 38 | break 39 | 40 | return fdr 41 | 42 | 43 | def bayesfactor(locallambda, peakscore): 44 | 45 | try: 46 | 47 | # bayesfactor = 2 * (math.log((gammaincc(peakscore-1, locallambda)*gamma(peakscore-1)), math.e) - (peakscore-1)*math.log(locallambda, math.e) + locallambda) 48 | # 49 | # a = (math.log(gammaincc(peakscore-1, locallambda), math.e) ) 50 | # b = math.lgamma(peakscore-1) 51 | # c=(peakscore-1)*math.log(locallambda, math.e) 52 | # print (locallambda,peakscore,a,b,c) 53 | bayesfactor2 = 2 * (math.log(gammaincc(peakscore-1, locallambda), math.e)+math.lgamma(peakscore-1) - (peakscore-1)*math.log(locallambda, math.e) + locallambda) 54 | 55 | return bayesfactor2 56 | 57 | except Exception, e: 58 | 59 | print ('got exception in Jazzlib.sta.bayesfactor: %r,' % (e,)) 60 | 61 | print (locallambda, peakscore) 62 | 63 | except KeyboardInterrupt: 64 | 65 | sys.stderr.write("User interrupt\n") 66 | 
67 | sys.exit(0) 68 | 69 | def fdr_control(chippeaks, inputpeaks, fdr): 70 | 71 | fdrpeakdict = dict() 72 | 73 | chipscore = list() 74 | 75 | inputscore = list() 76 | 77 | overlaptedpeak = dict() 78 | 79 | fdrth = -1 80 | 81 | # print ("check fdr") 82 | 83 | for inputpeak in inputpeaks: 84 | 85 | start = inputpeak.start 86 | 87 | end = inputpeak.end 88 | 89 | inputscore.append(inputpeak.score) 90 | 91 | for chippeak in chippeaks: 92 | 93 | if chippeak.chromosome == inputpeak.chromosome: 94 | 95 | if chippeak.peakpoint == inputpeak.peakpoint: 96 | 97 | overlaptedpeak[chippeak.peakid] = dict() 98 | 99 | overlaptedpeak[chippeak.peakid]['inputscore'] = inputpeak.score 100 | 101 | overlaptedpeak[chippeak.peakid]['chipscore'] = chippeak.score 102 | 103 | chipscore.append(chippeak.score) 104 | 105 | # print(chippeak.chromosome, chippeak.peakpoint, chippeak.score, inputpeak.score, chippeak.peakid) 106 | 107 | for i in sorted(chipscore): 108 | 109 | # print("score", i) 110 | 111 | chippeakcount = 0.0 112 | 113 | inputpeakcount = 0.0 114 | 115 | for peakid in overlaptedpeak: 116 | 117 | if i <= overlaptedpeak[peakid]['inputscore']: 118 | 119 | inputpeakcount = inputpeakcount + 1 120 | 121 | for chippeak in chippeaks: 122 | 123 | if i <= chippeak.score: 124 | 125 | chippeakcount = chippeakcount + 1 126 | 127 | nowfdr = inputpeakcount/chippeakcount 128 | 129 | # print (i, chippeakcount, inputpeakcount, nowfdr) 130 | 131 | if chippeakcount == 0: 132 | 133 | break 134 | 135 | for peaknow in chippeaks: 136 | 137 | if peaknow.score > i: 138 | 139 | peaknow.fdr = nowfdr 140 | 141 | 142 | # if (inputpeakcount/chippeakcount) < fdr: 143 | # 144 | # fdrth = i 145 | # 146 | # break 147 | 148 | return chippeaks 149 | 150 | 151 | 152 | def fdr_control2(chippeaks, inputpeaks, fdr): 153 | 154 | fdrpeakdict = dict() 155 | 156 | chipscore = list() 157 | 158 | inputscore = list() 159 | 160 | overlaptedpeak = dict() 161 | 162 | fdrth = -1 163 | 164 | # print ("check fdr") 165 | 166 | for 
def fdr_bh(peaks):
    """Attach a rank-scaled FDR value to every peak, in place.

    Each peak score is converted to ``b01 = exp(-score / 2)``. The values are
    ranked by their position in the descending sort (ties share the rank of
    their first occurrence) and each peak receives
    ``fdr = b01 * len(peaks) / rank``.

    NOTE(review): ranks come from the *descending* order, so the largest
    ``b01`` gets rank 1. Textbook Benjamini-Hochberg ranks ascending; the
    original ordering is preserved here -- confirm it is intended.

    Args:
        peaks: sequence of peak objects exposing ``score``; ``fdr`` is set on
            each of them.

    Returns:
        The same ``peaks`` object that was passed in.
    """
    # math.exp(-x) underflows to 0.0 for very strong peaks, whereas the former
    # 1 / (math.e ** x) raised OverflowError once score / 2 exceeded ~709.
    b01s = [math.exp(-peak.score / 2.0) for peak in peaks]
    listlength = len(b01s)

    # Map each value to the 1-based position of its first occurrence in the
    # descending sort, replacing a linear scan per peak.
    first_rank = dict()
    for position, b01 in enumerate(sorted(b01s, reverse=True)):
        first_rank.setdefault(b01, position + 1)

    for peak, b01 in zip(peaks, b01s):
        # Rank falls back to 1 when no equal value is found, as before.
        peak.fdr = b01 * listlength / first_rank.get(b01, 1)

    return peaks
countchr=[]): 19 | 20 | self.bamfile = bamfile 21 | 22 | self.count_chr = countchr 23 | 24 | self.nthreads = nthreads 25 | 26 | self.maxinsert = maxinsert 27 | 28 | self.jobtype = jobtype 29 | 30 | self.__filte_region() 31 | 32 | def filte_region(self): 33 | 34 | bam_file = self.bamfile 35 | 36 | count_chr = self.count_chr 37 | 38 | nthreads = self.nthreads 39 | 40 | jobtype = self.jobtype 41 | 42 | maxinsert = self.maxinsert 43 | 44 | res = filter_region(bamfile=bam_file, count_chr=count_chr, nthreads=nthreads, maxinsert=maxinsert, 45 | jobtype=jobtype) 46 | 47 | filted_region = res['filted_region'] 48 | 49 | thresh_hold = res['thresh_hold'] 50 | 51 | scare_std = res['region_std'] 52 | 53 | scare_mean = res['region_mean'] 54 | 55 | chr_total_reads = res['chr_total_reads'] 56 | 57 | chrs_length = res['chrs_length'] 58 | 59 | chrsfrcount = res['chrfrcount'] 60 | 61 | filterreadscount = res['filterreadscount'] 62 | 63 | totalreads = res['totalreads'] 64 | 65 | chruniqlength = res['chruniqlength'] 66 | 67 | readlengthmean = res['readlengthmean'] 68 | 69 | adjreads = totalreads - filterreadscount 70 | 71 | countgenomelength = 0 72 | 73 | countgenomeuniqlength = 0 74 | 75 | for chromosome in count_chr: 76 | 77 | countgenomelength = countgenomelength + int(chrs_length[chromosome]) 78 | 79 | countgenomeuniqlength = countgenomeuniqlength + int(chruniqlength[chromosome]) 80 | 81 | self.countgenomelength = countgenomelength 82 | 83 | self.filted_region = filted_region 84 | 85 | self.thresh_hold = thresh_hold 86 | 87 | self.region_std = scare_std 88 | 89 | self.region_mean = scare_mean 90 | 91 | self.chr_total_reads = chr_total_reads 92 | 93 | self.chrs_length = chrs_length 94 | 95 | self.chrsfcount = chrsfrcount 96 | 97 | self.totalreads = totalreads 98 | 99 | self.filterreadscount = filterreadscount 100 | 101 | self.adjreads = adjreads 102 | 103 | self.chruniqlength = chruniqlength 104 | 105 | self.countgenomeuniqlength = countgenomeuniqlength 106 | 107 | 
def filter_region(bamfile, count_chr, nthreads, maxinsert, jobtype):
    """Count reads in 1 kb windows and flag windows with extreme coverage.

    For every chromosome of ``count_chr`` that is present in the BAM header,
    the total read count is obtained with ``windowcounter`` and per-window
    statistics are gathered in a worker pool through ``chrwindow_counter``.
    Windows whose count is at least ``mean + 10 * std`` of all window counts
    are reported as filtered regions.

    Args:
        bamfile: path of the BAM file handed to ``pysam.Samfile``.
        count_chr: iterable of chromosome names to consider.
        nthreads: number of worker processes in the pool.
        maxinsert: forwarded unchanged to the counting helpers.
        jobtype: forwarded unchanged to the counting helpers.

    Returns:
        dict with the keys ``filted_region`` (list of ``"chr:start-end"``
        strings), ``thresh_hold``, ``region_std``, ``region_mean``,
        ``chr_total_reads``, ``chrs_length``, ``chrfrcount`` (always 0 here),
        ``filterreadscount``, ``totalreads``, ``chruniqlength`` and
        ``readlengthmean`` (average of the per-chromosome means).

    Raises:
        SystemExit: with status 1 on KeyboardInterrupt.
        Exception: any other error is reported, the pool is terminated and
            the error is re-raised. It used to be swallowed, which made this
            function return None and the caller fail on ``res[...]``.
    """
    windowsize = 1000

    pool = Pool(nthreads)

    try:
        # Only header data is needed, so release the file handle right away.
        samfile = pysam.Samfile(bamfile)
        try:
            ref_lengths = dict(zip(samfile.references, samfile.lengths))
        finally:
            samfile.close()

        totalreads = 0
        chrs_length = dict()
        chr_total_reads = dict()

        for chromosome in count_chr:
            if chromosome not in ref_lengths:
                # Names missing from the BAM header are skipped silently.
                continue

            chr_length = ref_lengths[chromosome]
            chrs_length[chromosome] = chr_length

            chrcount = windowcounter(bamfile=bamfile, regionchromosome=chromosome,
                                     regionstart=1, regionend=int(chr_length),
                                     maxinsert=maxinsert,
                                     jobtype=jobtype)

            chr_total_reads[chromosome] = chrcount
            totalreads = chrcount + totalreads

        # One job per chromosome; the 'chrmosome' spelling is the key that
        # chrwindow_counter reads, so it must stay as is.
        pars = [{'chrmosome': chromosome,
                 'windowsize': windowsize,
                 'chr_length': chrs_length[chromosome],
                 'bamfile': bamfile,
                 'maxinsert': maxinsert,
                 'jobtype': jobtype}
                for chromosome in chrs_length]

        windowcountlist = list()
        windowregionlist = list()
        chruniqlength = dict()
        chrreadlengthmean = dict()

        for nowchrcount in pool.map(chrwindow_counter, pars):
            nowchromosome = str(nowchrcount['chromosome'])
            nowwindowcount = nowchrcount['windowcount']

            chrreadlengthmean[nowchromosome] = nowchrcount['readlengthmean']
            chruniqlength[nowchromosome] = nowchrcount['uniqcount']

            for nowscare in nowwindowcount:
                nowstart = nowscare * windowsize + 1
                # The last window is clipped to the chromosome end.
                nowend = min((nowscare + 1) * windowsize, chrs_length[nowchromosome])

                windowcountlist.append(nowwindowcount[nowscare])
                windowregionlist.append(nowchromosome + ":" + str(nowstart) + "-" + str(nowend))

        scare_mean = mean(windowcountlist)
        scare_std = std(windowcountlist)

        print ("mean:", scare_mean, "std", scare_std)

        thresh_hold = scare_mean + 10 * scare_std

        filterreadscount = 0
        filted_region = list()

        for nowcount, nowregion in zip(windowcountlist, windowregionlist):
            if nowcount >= thresh_hold:
                filted_region.append(nowregion)
                filterreadscount = filterreadscount + nowcount

        # Average the per-chromosome mean read lengths over the requested
        # chromosomes that actually produced a value.
        totallengmean = 0
        totalchrnumber = 0

        for chromsome in count_chr:
            if chromsome in chrreadlengthmean:
                totallengmean = totallengmean + chrreadlengthmean[chromsome]
                totalchrnumber = totalchrnumber + 1

        res = dict()
        res['filted_region'] = filted_region
        res['thresh_hold'] = thresh_hold
        res['region_std'] = scare_std
        res['region_mean'] = scare_mean
        res['chr_total_reads'] = chr_total_reads
        res['chrs_length'] = chrs_length
        res['chrfrcount'] = 0
        res['filterreadscount'] = filterreadscount
        res['totalreads'] = totalreads
        res['chruniqlength'] = chruniqlength
        res['readlengthmean'] = totallengmean / totalchrnumber

        pool.close()

        return res

    except KeyboardInterrupt:

        pool.terminate()

        print ("You cancelled the program!")

        sys.exit(1)

    except Exception as e:

        print ('got exception in Jazzlib.FRegion.filter_region: %r, terminating the pool' % (e,))

        pool.terminate()

        print ('pool is terminated')

        raise

    finally:
        pool.join()
class FRegion:
    """Coverage summary of one BAM file, computed at construction time.

    Creating an instance immediately runs ``filter_region`` for the given
    BAM file and stores its results as attributes (``filted_region``,
    ``thresh_hold``, ``region_mean``, ``region_std``, ``chr_total_reads``,
    ``chrs_length``, ``chrsfcount``, ``totalreads``, ``filterreadscount``,
    ``adjreads``, ``chruniqlength``, ``readlengthmean``) together with the
    genome-wide sums ``countgenomelength`` and ``countgenomeuniqlength``.
    """

    def __init__(self, bamfile, nthreads, maxinsert, jobtype, countchr=None):
        """Store the parameters and run the region filter.

        Args:
            bamfile: path of the BAM file to analyse.
            nthreads: worker count forwarded to ``filter_region``.
            maxinsert: forwarded to ``filter_region``.
            jobtype: forwarded to ``filter_region``.
            countchr: list of chromosome names to count. Defaults to a fresh
                empty list per instance (the former ``[]`` default was one
                list shared by every instance).
        """
        self.bamfile = bamfile

        self.count_chr = [] if countchr is None else countchr

        self.nthreads = nthreads

        self.maxinsert = maxinsert

        self.jobtype = jobtype

        self.__filte_region()

    def filte_region(self):
        """Run ``filter_region`` and copy its results onto the instance."""
        res = filter_region(bamfile=self.bamfile, count_chr=self.count_chr,
                            nthreads=self.nthreads, maxinsert=self.maxinsert,
                            jobtype=self.jobtype)

        # Read every result first so a missing key fails before any
        # attribute has been updated.
        filted_region = res['filted_region']
        thresh_hold = res['thresh_hold']
        region_std = res['region_std']
        region_mean = res['region_mean']
        chr_total_reads = res['chr_total_reads']
        chrs_length = res['chrs_length']
        chrsfrcount = res['chrfrcount']
        filterreadscount = res['filterreadscount']
        totalreads = res['totalreads']
        chruniqlength = res['chruniqlength']
        readlengthmean = res['readlengthmean']

        # Genome-wide totals over the requested chromosomes.
        self.countgenomelength = sum(
            int(chrs_length[chromosome]) for chromosome in self.count_chr)
        self.countgenomeuniqlength = sum(
            int(chruniqlength[chromosome]) for chromosome in self.count_chr)

        self.filted_region = filted_region
        self.thresh_hold = thresh_hold
        self.region_std = region_std
        self.region_mean = region_mean
        self.chr_total_reads = chr_total_reads
        self.chrs_length = chrs_length
        # Attribute name kept as 'chrsfcount' for existing readers.
        self.chrsfcount = chrsfrcount
        self.totalreads = totalreads
        self.filterreadscount = filterreadscount
        # Reads remaining after removing those in filtered windows.
        self.adjreads = totalreads - filterreadscount
        self.chruniqlength = chruniqlength
        self.readlengthmean = readlengthmean

    # Private alias used by __init__ (name-mangled to _FRegion__filte_region).
    __filte_region = filte_region
nowchromosome = str(nowchromosome) 188 | 189 | nowwindowcount = nowchrcount['windowcount'] 190 | 191 | nowuniqcount = nowchrcount['uniqcount'] 192 | 193 | nowreadslengthmean = nowchrcount['readlengthmean'] 194 | 195 | print(nowchromosome, nowreadslengthmean) 196 | 197 | chrreadlengthmean[nowchromosome] = nowreadslengthmean 198 | 199 | chruniqlength[nowchromosome] = nowuniqcount 200 | 201 | for nowscare in nowwindowcount: 202 | 203 | nowstart = nowscare * windowsize + 1 204 | 205 | nowend = (nowscare+1) * windowsize 206 | 207 | if nowend > chrs_length[nowchromosome]: 208 | 209 | nowend = chrs_length[nowchromosome] 210 | 211 | nowregion = nowchromosome+":"+str(nowstart)+"-"+str(nowend) 212 | 213 | windowcountlist.append(nowwindowcount[nowscare]) 214 | 215 | windowregionlist.append(nowregion) 216 | 217 | scare_mean = mean(windowcountlist) 218 | 219 | scare_std = std(windowcountlist) 220 | 221 | print ("mean:", scare_mean, "std",scare_std) 222 | 223 | thresh_hold = scare_mean + 10 * scare_std 224 | 225 | chrsfrcount = 0 226 | 227 | filterreadscount = 0 228 | 229 | filted_region = list() 230 | 231 | for i in range(0, len(windowcountlist)): 232 | 233 | if windowcountlist[i] >= thresh_hold: 234 | 235 | # print (windowregionlist[i]," reads count ", windowcountlist[i]) 236 | 237 | filted_region.append(windowregionlist[i]) 238 | 239 | filterreadscount = filterreadscount + windowcountlist[i] 240 | 241 | res = dict() 242 | 243 | res['filted_region'] = filted_region 244 | 245 | res['thresh_hold'] = thresh_hold 246 | 247 | res['region_std'] = scare_std 248 | 249 | res['region_mean'] = scare_mean 250 | 251 | res['chr_total_reads'] = chr_total_reads 252 | 253 | res['chrs_length'] = chrs_length 254 | 255 | res['chrfrcount'] = chrsfrcount 256 | 257 | res['filterreadscount'] = filterreadscount 258 | 259 | res['totalreads'] = totalreads 260 | 261 | res['chruniqlength'] = chruniqlength 262 | 263 | # res['chrreadlengthmean'] = chrreadlengthmean 264 | 265 | totallengmean = 0 266 | 267 | 
def chrwindow_counter(par):
    """Collect the per-chromosome statistics for one pool job.

    ``par`` is the job dict built by ``filter_region``; it must provide the
    keys ``chrmosome``, ``windowsize``, ``chr_length``, ``bamfile``,
    ``maxinsert`` and ``jobtype``. The whole chromosome (1..chr_length) is
    passed to ``windowscarecounter``, ``uniqsitecount`` and
    ``readslengthmean``.

    Returns a dict with ``windowcount``, ``chromosome``, ``uniqcount`` and
    ``readlengthmean``. On KeyboardInterrupt a message is printed and the
    process exits with status 1.
    """
    try:
        chromosome, windowsize, chr_length, bamfile, maxinsert, jobtype = [
            par[key] for key in ('chrmosome', 'windowsize', 'chr_length',
                                 'bamfile', 'maxinsert', 'jobtype')]

        # Arguments shared by all three counting helpers.
        whole_chromosome = dict(bamfile=bamfile, regionchromosome=chromosome,
                                regionstart=1, regionend=chr_length,
                                maxinsert=maxinsert, jobtype=jobtype)

        per_window = windowscarecounter(windowsize=windowsize, **whole_chromosome)

        unique_sites = uniqsitecount(**whole_chromosome)

        mean_read_length = readslengthmean(**whole_chromosome)

        summary = {
            'windowcount': per_window,
            'chromosome': chromosome,
            'uniqcount': unique_sites,
            'readlengthmean': mean_read_length,
        }

        # for debug
        print("in chrwindow_counter", mean_read_length)

        return summary

    except KeyboardInterrupt:

        print ("You cancelled the program!")

        sys.exit(1)
def nocontrol(opt):
    """Run the hotspot pipeline for a sample that has no control file.

    Builds an ``FRegion`` for the data file, scans it with
    ``hotspotsscan_withoutcontrol`` (window size 100000) and writes the
    hotspots with ``hotspotsbedswriter``. The peak-level pipeline that once
    followed (random background, local maxima, ``peakscan_withoutcontrol``,
    ``peakbedgraphswriter``) is disabled in the original source and therefore
    not reproduced here.

    ``opt`` is the options object returned by ``opt_check``. A
    KeyboardInterrupt is reported on stderr and ends the process with
    status 0.
    """
    try:
        bam_path = opt.datafile
        job = opt.jobtype
        chromosomes = opt.countchr
        insert_limit = opt.maxinsert

        print ("maxinsert", insert_limit)

        workers = opt.nthreads
        bf_cutoff = opt.threshold
        # Read as in the original; only the disabled peak stage consumed it.
        fdr = opt.fdr
        label = opt.samplename

        sample_regions = FRegion(bamfile=bam_path, jobtype=job, countchr=chromosomes,
                                 nthreads=workers, maxinsert=insert_limit)

        found = hotspotsscan_withoutcontrol(file=bam_path, maxinsert=insert_limit,
                                            windowscare=100000, countchr=chromosomes,
                                            bayesfactorthreshold=bf_cutoff,
                                            nthreads=workers, fregion=sample_regions,
                                            jobtype=job)

        hotspotsbedswriter(hotspots=found, samplename=label)

    except KeyboardInterrupt:

        sys.stderr.write("User interrupt\n")

        sys.exit(0)
def opt_check(jazzopt):
    """Parse and validate the command line, then derive ``opt.countchr``.

    Checks, in order: a data file was given and exists together with its
    ``.bai`` index; the control file (unless it is the sentinel ``"no"``)
    exists together with its index; the thread count is positive; the job
    type is ``nhsingle`` or ``nhpaired``; ``maxinsert`` is not negative.
    Any failure is logged and ends the process with status 1.

    Afterwards ``opt.countchr`` is set to the reference names of the data
    BAM file minus the comma-separated names given with ``-x``.

    Args:
        jazzopt: the option parser; only ``parse_args()`` and
            ``print_help()`` are used.

    Returns:
        The parsed options object with ``countchr`` added.
    """
    (opt, args) = jazzopt.parse_args()

    if not opt.datafile:
        logging.error("you need input a bam file, '-d nh_sample1.bam -j nhsingle'")
        jazzopt.print_help()
        sys.exit(1)

    if not os.path.isfile(opt.datafile):
        logging.error("No such file: %s" % opt.datafile)
        sys.exit(1)

    dataindexfile = opt.datafile + '.bai'

    if not os.path.isfile(dataindexfile):
        logging.error("Missing bam index file: %s" % dataindexfile)
        sys.exit(1)

    # "no" is the sentinel default meaning "run without a control sample".
    if opt.controlfile != "no":

        if not os.path.isfile(opt.controlfile):
            logging.error("No such file: %s" % opt.controlfile)
            sys.exit(1)

        controlindexfile = opt.controlfile + '.bai'

        if not os.path.isfile(controlindexfile):
            logging.error("Missing bam index file: %s" % controlindexfile)
            sys.exit(1)

    if not (opt.nthreads > 0):
        logging.error("threads number should >=1")
        jazzopt.print_help()
        sys.exit(1)

    # Covers both a missing and an unknown job type.
    if opt.jobtype not in ('nhsingle', 'nhpaired'):
        logging.error("missing or wrong jobtype")
        jazzopt.print_help()
        sys.exit(1)

    if opt.maxinsert < 0:
        logging.error("maxinsert size error")
        jazzopt.print_help()
        sys.exit(1)

    # Only the header is needed; close the handle instead of leaking it.
    samfile = pysam.Samfile(opt.datafile)
    try:
        sam_ref = samfile.references
    finally:
        samfile.close()

    opt.countchr = list(sam_ref)

    if opt.excludechr:

        for chri in opt.excludechr.split(','):

            if chri not in sam_ref:
                print (chri, 'not in the %s file' % opt.datafile)
                print ("try to selcet exclude Chr from", end=" : ")
                # Print the names comma-separated; print(sam_ref, sep=",")
                # emitted the tuple repr because sep only separates arguments.
                print (",".join(sam_ref))
                jazzopt.print_help()
                sys.exit(1)

            # Rebuild the list rather than deleting from it while iterating.
            opt.countchr = [name for name in opt.countchr if name != chri]

    return opt
def withcontrol(opt):
    """Prepare the ChIP-versus-control analysis for one sample.

    Builds an ``FRegion`` for the data file and one for the control file,
    derives the random-background threshold (``randombg``), the local maxima
    (``get_all_localmax``) and the normalisation ratio
    (``normalize_ratio_input2``), then computes the global lambda of the
    control as adjusted reads divided by counted genome length.

    The peak scan and the bedgraph writer that would consume these values
    are disabled in the original source, so nothing is written or returned.

    ``opt`` is the options object returned by ``opt_check``. A
    KeyboardInterrupt is reported on stderr and ends the process with
    status 0.
    """
    try:
        chip_bam = opt.datafile
        control_bam = opt.controlfile

        # Settings shared by both FRegion objects and the local-maxima scan.
        shared = dict(jobtype=opt.jobtype, countchr=opt.countchr,
                      maxinsert=opt.maxinsert)
        shared['nthreads'] = opt.nthreads

        # Read as in the original; only the disabled peak stage uses them.
        bayesfactorthreshold = opt.threshold
        samplename = opt.samplename
        fdr = opt.fdr

        chipfregion = FRegion(bamfile=chip_bam, **shared)

        inputfregion = FRegion(bamfile=control_bam, **shared)

        rndth = randombg(fregion=chipfregion, nthreads=shared['nthreads'],
                         maxinsert=shared['maxinsert'])

        localmax = get_all_localmax(bamfile=chip_bam, fregion=chipfregion,
                                    rndth=rndth, **shared)

        ratio = normalize_ratio_input2(fregegion_input=inputfregion,
                                       fregion_chip=chipfregion)

        # Control reads left after removing those in filtered windows.
        inputadjreads = inputfregion.totalreads - inputfregion.filterreadscount

        gloablumbda = inputadjreads / inputfregion.countgenomelength

        windowscare = 1000000

    except KeyboardInterrupt:

        sys.stderr.write("User interrupt\n")

        sys.exit(0)
inputadjreads/genomelength*maxinsert 145 | # 146 | # print ("gloablumbda",gloablumbda) 147 | # 148 | # windowscare = 1000000 149 | # 150 | # peaks = peakscan_withoutcontrol(localmax=localmax, 151 | # file=datafile, 152 | # maxinsert=maxinsert, 153 | # windowscare=windowscare, 154 | # gloablumbda=gloablumbda, 155 | # ratio=ratio, 156 | # bayesfactorthreshold=bayesfactorthreshold, 157 | # nthreads=nthreads, 158 | # fregion=chipfregion, 159 | # jobtype=jobtype) 160 | # 161 | # 162 | # peakbedgraphswriter(samplename, peaks) 163 | 164 | except KeyboardInterrupt: 165 | 166 | sys.stderr.write("User interrupt\n") 167 | 168 | sys.exit(0) 169 | 170 | 171 | def get_optparser(): 172 | 173 | usage = """usage: %prog <-d datafile> [-n name] [options] 174 | Example %prog -i nh_sample1.bam -n sample1 175 | """ 176 | 177 | description = "%prog Non-Histone protein banding site identification" 178 | 179 | jazzopt = OptionParser(version="%prog 0.1 20140521", description=description, usage=usage, add_help_option=False) 180 | 181 | jazzopt.add_option("-h", "--help", action="help", help="show this help message and exit.") 182 | 183 | jazzopt.add_option("-d", "--data", dest="datafile", type="string", help='data file, should be sorted bam format') 184 | 185 | jazzopt.add_option("-c", "--control", dest="controlfile", type="string", help='control(input) file, should be sorted bam format', default="no") 186 | 187 | jazzopt.add_option("-n", "--name", dest="samplename", help="NH sample name default=NH_sample", type="string" , default="DH_sample") 188 | 189 | jazzopt.add_option("-t", "--threshold", dest="threshold", type="float", help="peak threshold, default=6.0", default=6.0) 190 | 191 | jazzopt.add_option("--threads", dest="nthreads", type="int", help="threads number or cpu number, default=4", default=4) 192 | 193 | jazzopt.add_option("-w", "--wig", action="store_true", help="whether out put wiggle file, default=False", default=False) 194 | 195 | jazzopt.add_option("-f","--fdr", dest="fdr", 
def _abort_options(jazzopt, message, show_help=True):
    """Log *message*, optionally print the parser help, and exit with status 1."""
    logging.error(message)
    if show_help:
        jazzopt.print_help()
    sys.exit(1)


def opt_check(jazzopt):
    """Parse the command line with *jazzopt* and validate the result.

    Checks, in order: the data BAM and its ``.bai`` index exist; the control
    BAM (when one is given) and its index exist; ``nthreads`` is positive;
    ``jobtype`` is ``'nhsingle'`` or ``'nhpaired'`` with a non-negative
    ``maxinsert``; every name in ``--excludechr`` is a reference of the data
    BAM.  Any failure is reported and ends the process with exit status 1.

    Returns the options object with an extra ``countchr`` attribute: the list
    of reference names of the data BAM minus the excluded ones.
    """
    (opt, args) = jazzopt.parse_args()

    if not opt.datafile:
        _abort_options(jazzopt, "you need input a bam file, '-d nh_sample1.bam -j nhsingle'")

    if not os.path.isfile(opt.datafile):
        _abort_options(jazzopt, "No such file: %s" % opt.datafile, show_help=False)

    dataindexfile = opt.datafile + '.bai'
    if not os.path.isfile(dataindexfile):
        _abort_options(jazzopt, "Missing bam index file: %s" % dataindexfile, show_help=False)

    # "no" is the parser's sentinel for "run without a control library".
    if opt.controlfile != "no":
        if not os.path.isfile(opt.controlfile):
            _abort_options(jazzopt, "No such file: %s" % opt.controlfile, show_help=False)

        controlindexfile = opt.controlfile + '.bai'
        if not os.path.isfile(controlindexfile):
            _abort_options(jazzopt, "Missing bam index file: %s" % controlindexfile, show_help=False)

    if not (opt.nthreads > 0):
        _abort_options(jazzopt, "threads number should >=1")

    if opt.jobtype not in ('nhsingle', 'nhpaired'):
        _abort_options(jazzopt, "missing or wrong jobtype")

    if opt.maxinsert < 0:
        _abort_options(jazzopt, "maxinsert size error")

    # Only the reference names are needed, so release the BAM handle at once.
    samfile = pysam.Samfile(opt.datafile)
    try:
        sam_ref = tuple(samfile.references)
    finally:
        samfile.close()

    excluded = opt.excludechr.split(',') if opt.excludechr else []

    for chri in excluded:
        if chri not in sam_ref:
            print(chri, 'not in the %s file' % opt.datafile)
            print("try to selcet exclude Chr from", end=" : ")
            # Join the names; print(sam_ref, sep=",") only printed the tuple repr.
            print(",".join(sam_ref))
            jazzopt.print_help()
            sys.exit(1)

    # Build the filtered list in one pass instead of deleting while iterating.
    opt.countchr = [name for name in sam_ref if name not in excluded]

    return opt
def nhnoncontrol(uniqueratio, threshold, kernellength, nthreads=4):
    """Return the background cutoff for a sample without a control library.

    Runs 200 simulated backgrounds through ``sim_replicate_nthreads`` and
    returns ``mean + std * threshold`` of the simulation summary.
    """
    simulation = sim_replicate_nthreads(
        run_times=200,
        uniqueratio=uniqueratio,
        nthreads=nthreads,
        kernellength=kernellength,
        threshold=threshold,
    )

    background_mean = simulation['mean']
    background_std = simulation['std']

    return background_mean + background_std * threshold
def ultratio(chrlength, uniqueratio, chrtotalreads, frcount):
    """Return ``chrlength * uniqueratio / (chrtotalreads - frcount)``.

    ``frcount`` is subtracted from ``chrtotalreads`` before dividing, so the
    call raises ``ZeroDivisionError`` when the two are equal.
    """
    scaled_length = chrlength * uniqueratio
    remaining_reads = chrtotalreads - frcount
    return scaled_length / remaining_reads
def sim_bg_thread_worker(par):
    """Simulate one random background and return its smoothed mean and std.

    ``par`` is a dict providing ``'uniqueratio'`` and ``'kernellength'``; any
    ``'threshold'`` entry is ignored because the result does not depend on it.

    A pseudo-genome of 100000 sites is used.  ``int(100000 * uniqueratio)``
    distinct sites are sampled, the same number of reads is dropped on those
    sites at random (with replacement), and the per-site counts are
    correlated with the smoothing kernel.

    Returns ``{'rand_mean': ..., 'rand_std': ...}`` of the smoothed signal.
    Raises ``KeyboardInterruptError`` in place of ``KeyboardInterrupt``
    (presumably so the parent pool receives an ordinary ``Exception``).
    """
    try:
        uniqueratio = par['uniqueratio']
        kernellength = par['kernellength']

        kernel = smooth_kernel(length=kernellength)

        sim_genome_size = int(1e5)
        total_reads = int(sim_genome_size * uniqueratio)

        # Sites that are allowed to receive reads.
        sim_uniqsite = rnd.sample(list(range(0, sim_genome_size)), total_reads)

        rand_reads_count = [0] * sim_genome_size

        # Kernel weights in ascending key order.
        kernel_score = [kernel[i] for i in sorted(kernel)]

        for _ in range(0, total_reads):
            # uniform() may return its upper bound after rounding; clamp so the
            # index stays valid while the random stream is left unchanged.
            rand_number = min(int(rnd.uniform(0, total_reads)), total_reads - 1)
            rand_reads = sim_uniqsite[rand_number]
            rand_reads_count[rand_reads] = rand_reads_count[rand_reads] + 1.0

        smoothed_result = correlate(array(rand_reads_count), kernel_score)

        return {
            'rand_mean': smoothed_result.mean(),
            'rand_std': smoothed_result.std(),
        }

    except KeyboardInterrupt:
        raise KeyboardInterruptError()
def bpc_runner(par):
    """Count the reads falling inside one hotspot (pool worker for get_bpc).

    ``par`` is a dict with ``'bamfile'``, ``'hotspot'`` (an object exposing
    ``start``, ``end`` and ``chromosome``), ``'jobtype'``, ``'filted_region'``
    (a mapping from chromosome to a container of 100-bp bin indexes) and
    ``'maxinsert'``.

    Returns 0 when any 100-bp bin touched by the hotspot is listed in
    ``filted_region`` for its chromosome; otherwise returns the sum of the
    per-site counts reported by ``readscounter`` for the hotspot region.

    Raises ``ValueError`` for an unknown ``jobtype`` and
    ``KeyboardInterruptError`` in place of ``KeyboardInterrupt``.
    """
    try:
        bamfile = par['bamfile']
        hotspot = par['hotspot']
        jobtype = par['jobtype']
        filted_region = par['filted_region']
        maxinsert = par['maxinsert']

        start_site = hotspot.start
        end_site = hotspot.end
        chromosome = hotspot.chromosome

        hotspotregio = chromosome + ':' + str(start_site) + '-' + str(end_site)

        # Walk the 100-bp bins rather than every base and stop at the first
        # hit.  Assumes non-negative coordinates, where // matches int(i/100).
        if start_site <= end_site and chromosome in filted_region:
            filtered_bins = filted_region[chromosome]
            for bin_index in range(start_site // 100, end_site // 100 + 1):
                if bin_index in filtered_bins:
                    return 0

        if jobtype == 'nhsingle':
            readscount = readscounter.nhreadscounter(bamfile=bamfile, region=hotspotregio,
                                                     paired=False, maxinsert=maxinsert)
        elif jobtype == 'nhpaired':
            readscount = readscounter.nhreadscounter(bamfile=bamfile, region=hotspotregio,
                                                     paired=True, maxinsert=maxinsert)
        elif jobtype == 'dh':
            readscount = readscounter.dhreadscounter(bamfile=bamfile, region=hotspotregio)
        else:
            # sys.exit() here would kill the worker process and the parent's
            # Pool.map would never get this task's result; raise instead.
            raise ValueError("%s count type error!!!!" % jobtype)

        hotspotreads = 0
        for site in readscount:
            hotspotreads = hotspotreads + readscount[site]

        return hotspotreads

    except KeyboardInterrupt:
        raise KeyboardInterruptError()
def nhuniquerate(bamfile, chromosome, paired, fregion, regionstart=1, regionend=-1, maxinsert=100000):
    """Return the fraction of sites in a region that carry at least one read.

    The region is ``chromosome:regionstart-regionend``.  When ``regionend`` is
    left at ``-1`` the chromosome length recorded in the BAM header is used.
    The rate is ``len(counts) / (regionend - regionstart)``, where ``counts``
    is whatever ``readscounter.nhreadscounter`` returns for the region.

    ``fregion`` is accepted for interface compatibility and is not used.

    Raises ``ValueError`` when ``regionend`` has to be looked up and
    ``chromosome`` is not a reference of ``bamfile``.
    """
    if regionend == -1:
        # Open the BAM only when the header is needed, and always close it.
        samfile = pysam.Samfile(bamfile)
        try:
            chromosome_lengths = dict(zip(samfile.references, samfile.lengths))
        finally:
            samfile.close()

        if chromosome not in chromosome_lengths:
            raise ValueError("chromosome %s not found in %s" % (chromosome, bamfile))

        regionend = chromosome_lengths[chromosome]

    region = chromosome + ':' + str(regionstart) + '-' + str(regionend)

    # NOTE(review): kept as end - start (not + 1), as in the original.
    region_length = regionend - regionstart

    nhreadscount = readscounter.nhreadscounter(bamfile, region, paired, maxinsert=maxinsert)

    totaluniq = len(nhreadscount) + 0.0

    return totaluniq / region_length
def sim_replicate_nthreads(run_times=1000, uniqueratio=1, kernellength=600, threshold=4, nthreads=2):
    """Average the simulated background over ``run_times`` replicates.

    Each replicate is produced by ``sim_bg_thread_worker`` in a process pool of
    ``nthreads`` workers.  Returns ``{'mean': ..., 'std': ...}``: the mean of
    the replicate means and the mean of the replicate standard deviations.

    On ``KeyboardInterrupt`` the pool is terminated and the process exits with
    status 1.  Any other exception terminates the pool and is re-raised, so
    callers never receive ``None`` in place of the result dict.
    """
    pars = [
        {'uniqueratio': uniqueratio, 'kernellength': kernellength, 'threshold': threshold}
        for _ in range(0, run_times)
    ]

    pool = Pool(nthreads)

    try:
        randomthresh = pool.map(sim_bg_thread_worker, pars)

        summean = 0.0
        sumstd = 0.0

        for randscore in randomthresh:
            summean = summean + randscore['rand_mean']
            sumstd = sumstd + randscore['rand_std']

        outscore = {
            'mean': summean / run_times,
            'std': sumstd / run_times,
        }

        pool.close()

        return outscore

    except KeyboardInterrupt:
        pool.terminate()
        print("You cancelled the program!")
        sys.exit(1)

    except Exception as e:
        print('got exception: %r, terminating the pool' % (e,))
        pool.terminate()
        print('pool is terminated')
        # Propagate instead of silently returning None to the caller.
        raise

    finally:
        pool.join()
def get_bpc(bamfile, hotspots, jobtype, filted_region, nthreads, maxinsert=100000):
    """Return the average read count per base pair over all hotspots.

    Every hotspot (an object with ``start`` and ``end``) is counted by
    ``bpc_runner`` in a process pool of ``nthreads`` workers; the summed
    counts are divided by the summed hotspot lengths (``end - start + 1``).

    Returns ``0.0`` without starting a pool when the hotspots cover no bases.
    On ``KeyboardInterrupt`` the pool is terminated and the process exits with
    status 1.  Any other exception terminates the pool and is re-raised.
    """
    pars = []
    total_length = 0

    for hotspot_now in hotspots:
        pars.append({
            'bamfile': bamfile,
            'hotspot': hotspot_now,
            'jobtype': jobtype,
            'filted_region': filted_region,
            'maxinsert': maxinsert,
        })
        total_length = hotspot_now.end - hotspot_now.start + 1 + total_length

    if total_length == 0:
        # Nothing to average over; avoids dividing by zero below.
        return 0.0

    pool = Pool(nthreads)

    try:
        reads_count = pool.map(bpc_runner, pars)

        total_reads = 0.0
        for count_now in reads_count:
            total_reads = total_reads + count_now

        bpc = total_reads / total_length

        pool.close()

        return bpc

    except KeyboardInterrupt:
        pool.terminate()
        print("You cancelled the program!")
        sys.exit(1)

    except Exception as e:
        print('got exception: %r, terminating the pool' % (e,))
        pool.terminate()
        print('pool is terminated')
        # Propagate instead of silently returning None to the caller.
        raise

    finally:
        pool.join()
| 410 | end_site = hotspot.end 411 | 412 | whether_in_fr = 0 413 | 414 | chromosome = hotspot.chromosome 415 | 416 | hotspotregio = chromosome + ':' + str(start_site) + '-' + str(end_site) 417 | 418 | hotspotreads = 0 419 | 420 | for i in range(start_site, end_site + 1): 421 | 422 | parentscare = int(i/100) 423 | 424 | if chromosome in filted_region: 425 | 426 | if parentscare in filted_region[chromosome]: 427 | 428 | whether_in_fr = 1 429 | 430 | if whether_in_fr == 0: 431 | 432 | readscount = dict() 433 | 434 | if jobtype == 'nhsingle': 435 | 436 | readscount = readscounter.nhreadscounter(bamfile = bamfile, region=hotspotregio, paired=False, maxinsert=maxinsert) 437 | 438 | elif jobtype == 'nhpaired': 439 | 440 | readscount = readscounter.nhreadscounter(bamfile = bamfile, region=hotspotregio, paired=True, maxinsert=maxinsert) 441 | 442 | elif jobtype == 'dh': 443 | 444 | readscount = readscounter.dhreadscounter(bamfile = bamfile, region = hotspotregio) 445 | 446 | else: 447 | 448 | print ("%s count type error!!!!" 
def withcontrol(opt):
    """Run the hotspot and peak scan for a sample that has a control library.

    Builds an ``FRegion`` for the data BAM and for the control BAM, derives
    the normalisation ratio and the global lambda, then scans hotspots and
    peaks.  With ``opt.hotonly`` only the raw hotspots are written; otherwise
    the hotspots are filtered against the peaks and the hotspot BED, peak BED
    and GFF outputs are written.

    A ``KeyboardInterrupt`` is reported on stderr and ends the process with
    exit status 0.
    """
    try:
        datafile = opt.datafile
        inputfile = opt.controlfile
        jobtype = opt.jobtype
        count_chr = opt.countchr
        maxinsert = opt.maxinsert
        nthreads = opt.nthreads
        bayesfactorthreshold = opt.threshold
        samplename = opt.samplename

        chipfregion = FRegion(bamfile=datafile, jobtype=jobtype, countchr=count_chr,
                              nthreads=nthreads, maxinsert=maxinsert)

        inputfregion = FRegion(bamfile=inputfile, jobtype=jobtype, countchr=count_chr,
                               nthreads=nthreads, maxinsert=maxinsert)

        ratio = normalize_ratio_input2(fregegion_input=inputfregion, fregion_chip=chipfregion)

        if opt.genomesize:
            # NOTE(review): this branch uses the ChIP statistics while the
            # other branch uses the input statistics -- confirm it is intended.
            print("###chipfregion.adjreads * chipfregion.readlengthmean/chipfregion.countgenomelength,",
                  chipfregion.adjreads, chipfregion.readlengthmean, opt.genomesize)

            gloablumbda = chipfregion.adjreads * chipfregion.readlengthmean / opt.genomesize

        else:
            print("###inputfregion.adjreads,inputfregion.readlengthmean,inputfregion.countgenomelength",
                  inputfregion.adjreads, inputfregion.readlengthmean, inputfregion.countgenomelength)

            gloablumbda = inputfregion.adjreads * inputfregion.readlengthmean / inputfregion.countgenomelength

        windowscare = 100000

        hotspots = hotspotsscan_withcontrol(chipfile=datafile, maxinsert=maxinsert, windowscare=windowscare,
                                            countchr=count_chr, inputgloablumbda=gloablumbda,
                                            bayesfactorthreshold=bayesfactorthreshold, nthreads=nthreads,
                                            chipfregion=chipfregion, jobtype=jobtype, ratio=ratio,
                                            inputfile=inputfile, inputfregion=inputfregion)

        peaks = peakscan_control(datafile=datafile, maxinsert=maxinsert,
                                 bayesfactorthreshold=bayesfactorthreshold, nthreads=nthreads,
                                 chipfregion=chipfregion, jobtype=jobtype, hotspots=hotspots,
                                 gloablumbda=gloablumbda, inputfile=inputfile, ratio=ratio,
                                 inputfregion=inputfregion)

        if opt.hotonly:
            hotspotsbedswriter(hotspots=hotspots, samplename=samplename)

        else:
            # hotspotsenrich only exists on this path, so everything that
            # consumes it stays inside the branch.
            hotspotsenrich = hotspotsfilter(hotspots=hotspots, peaks=peaks)

            hotspotsbedswriter(hotspots=hotspotsenrich, samplename=samplename)

            peakbedswriter(samplename=samplename, peaks=peaks)

            jazzgffout(samplename=samplename, hotspots=hotspotsenrich, peaks=peaks, fregion=chipfregion)

    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)
def get_optparser():
    """Build and return the ``OptionParser`` for the Jazz command line.

    The parser defines its own ``-h/--help`` (the automatic one is disabled)
    and these destinations: ``datafile``, ``controlfile`` (default ``"no"``),
    ``samplename``, ``threshold``, ``nthreads``, ``wig``, ``fdr``,
    ``excludechr``, ``gff``, ``jobtype``, ``maxinsert``, ``pe``,
    ``genomesize`` and ``hotonly``.
    """
    # The example uses -d (the data option that actually exists) and -j,
    # which opt_check requires.
    usage = """usage: %prog <-d datafile> [-n name] [options]
    Example %prog -d nh_sample1.bam -j nhsingle -n sample1
    """

    description = "%prog Non-Histone protein banding site identification"

    jazzopt = OptionParser(version="%prog 0.1 20140521", description=description,
                           usage=usage, add_help_option=False)

    jazzopt.add_option("-h", "--help", action="help",
                       help="show this help message and exit.")

    jazzopt.add_option("-d", "--data", dest="datafile", type="string",
                       help='data file, should be sorted bam format')

    jazzopt.add_option("-c", "--control", dest="controlfile", type="string",
                       help='control(input) file, should be sorted bam format', default="no")

    # The help text now states the default that is really applied.
    jazzopt.add_option("-n", "--name", dest="samplename",
                       help="NH sample name default=DH_sample", type="string", default="DH_sample")

    jazzopt.add_option("-t", "--threshold", dest="threshold", type="float",
                       help="peak threshold, default=6.0", default=6.0)

    jazzopt.add_option("--threads", dest="nthreads", type="int",
                       help="threads number or cpu number, default=4", default=4)

    jazzopt.add_option("-w", "--wig", action="store_true",
                       help="whether out put wiggle file, default=False", default=False)

    jazzopt.add_option("-f", "--fdr", dest="fdr", type="float",
                       help="using FDR as threshold", default=0.1)

    jazzopt.add_option("-x", "--excludechr", dest="excludechr",
                       help="Don't count those chromosome, strongly suggest skip mitochondrion "
                            "and chloroplast, example='-x ChrM,ChrC'")

    jazzopt.add_option("-g", "--gff", action="store_true",
                       help="whether out put gff file, default=False", default=False)

    jazzopt.add_option("-j", "--jobtype", dest="jobtype", type="string",
                       help="job type, such as nhpaired or nhsingle")

    jazzopt.add_option("-m", "--maxinsert", dest="maxinsert", type="int",
                       help="when you use paired library, please set the maxinsert size", default=130)

    jazzopt.add_option("--pe", dest="pe", action="store_true",
                       help="paired-end reads or single-end reads, default=False (single end)",
                       default=False)

    jazzopt.add_option("--genomesize", dest="genomesize", type="int",
                       help="Set genome size", default=False)

    jazzopt.add_option("--hotonly", dest="hotonly", action="store_true", default=False,
                       help="calculate hotsports only.")

    return jazzopt
def hotspotsscan_withoutcontrol(file, maxinsert, windowscare, countchr, gloablumbda,
                                bayesfactorthreshold, nthreads, fregion, jobtype):
    """Scan the genome for hotspots when no control sample is available.

    The chromosomes in ``countchr`` are cut into 100 kb windows (padded by
    200 bp on each side and clipped to the chromosome), each window is
    handed to ``hotspot_withoutcontrol_worker`` in a process pool, and the
    enriched sites returned per chromosome are merged into hotspots by
    ``hotspots_chromsome_merge``.

    Args:
        file: path of the BAM file, forwarded to the workers as ``bamfile``.
        maxinsert: maximum insert size, forwarded to the workers.
        windowscare: accepted for interface compatibility; the window size
            used here is the fixed value 100000.
        countchr: iterable of chromosome names to scan.
        gloablumbda: global lambda passed to ``bayesfactor``.
        bayesfactorthreshold: Bayes-factor cut-off used to derive the
            minimum depth a site needs to count as enriched.
        nthreads: number of pool processes.
        fregion: object providing ``chrs_length`` and ``readlengthmean``.
        jobtype: job type string, forwarded to the workers.

    Returns:
        list: the hotspots of all chromosomes, concatenated.

    Raises:
        SystemExit: on ``KeyboardInterrupt`` (after terminating the pool).
        Exception: any other failure is reported, the pool is terminated
            and the exception is re-raised.
    """
    pool = Pool(nthreads)

    try:

        print("gloablumbda", gloablumbda, "readlengthmean", fregion.readlengthmean)

        # Largest depth (at least 2) whose Bayes factor does not exceed the
        # threshold; workers treat sites at or above this depth as enriched.
        bayesfactorthresholdcount = 2

        i = 2

        while True:

            if bayesfactor(gloablumbda, i) > bayesfactorthreshold:

                break

            bayesfactorthresholdcount = i

            i = i + 1

        print("bayesfactorthresholdcount", bayesfactorthresholdcount)

        windowsize = 100000

        pars = list()

        for chromosmoe in countchr:

            chr_length = fregion.chrs_length[chromosmoe]

            for scare in range(0, int(chr_length / windowsize) + 1):

                # 200 bp of padding on both sides, clipped to the chromosome.
                nowstart = max(scare * windowsize + 1 - 200, 1)

                nowend = min((scare + 1) * windowsize + 200, chr_length)

                pars.append({
                    'region': chromosmoe + ":" + str(nowstart) + "-" + str(nowend),
                    'maxinsert': maxinsert,
                    'bamfile': file,
                    'jobtype': jobtype,
                    'chrlength': chr_length,
                    'regionchromosome': chromosmoe,
                    'regionstart': nowstart,
                    'regionend': nowend,
                    'bayesfactorcount': bayesfactorthresholdcount,
                    'readlengthmean': fregion.readlengthmean,
                })

        # Group the per-window site lists by chromosome.
        chrenrichedpotin = dict()

        for enrichedinthread in pool.map(hotspot_withoutcontrol_worker, pars):

            chrenrichedpotin.setdefault(enrichedinthread['chromosome'], list()).append(
                enrichedinthread['list'])

        chrhotpars = list()

        for nowchr in chrenrichedpotin:

            chrhotpars.append({
                'chromosome': nowchr,
                'preregion': chrenrichedpotin[nowchr],
                # Look the length up by the chromosome being merged, not by
                # the variable left over from the window loop above.
                'chr_length': fregion.chrs_length[nowchr],
                'fregion': fregion,
            })

        hotspots = list()

        for hotinth in pool.map(hotspots_chromsome_merge, chrhotpars):

            hotspots.extend(hotinth)

        pool.close()

        return hotspots

    except KeyboardInterrupt:

        pool.terminate()

        print("You cancelled the program!")

        sys.exit(1)

    except Exception as e:

        print('got exception in Jazzlib.hotspotsscan.hotspotsscan_withoutcontrol: %r, terminating the pool' % (e,))

        pool.terminate()

        print('pool is terminated')

        # Falling through would hand None to the caller, which then fails
        # later with an unrelated TypeError; surface the real error instead.
        raise

    finally:

        pool.join()
def hotspotsscan_withcontrol(chipfile, maxinsert, windowscare, countchr, inputgloablumbda,
                             bayesfactorthreshold, nthreads, chipfregion, jobtype, ratio,
                             inputfile, inputfregion):
    """Scan the ChIP sample for hotspots using thresholds from the control.

    The minimum enriched depth is derived from the control's global lambda.
    The ChIP BAM is then scanned in 100 kb windows (padded by 200 bp and
    clipped to the chromosome) by ``hotspot_control_worker`` in a process
    pool, and the enriched sites are merged per chromosome by
    ``hotspots_chromsome_merge``.

    Args:
        chipfile: path of the ChIP BAM file, forwarded as ``bamfile``.
        maxinsert: maximum insert size, forwarded to the workers.
        windowscare: accepted for interface compatibility; the window size
            used here is the fixed value 100000.
        countchr: iterable of chromosome names to scan.
        inputgloablumbda: global lambda of the control, passed to
            ``bayesfactor``.
        bayesfactorthreshold: Bayes-factor cut-off.
        nthreads: number of pool processes.
        chipfregion: object providing ``chrs_length`` and
            ``readlengthmean`` for the ChIP sample.
        jobtype: job type string, forwarded to the workers.
        ratio: scaling factor forwarded to the workers.
        inputfile: accepted for interface compatibility; not read here.
        inputfregion: only its ``readlengthmean`` is printed.

    Returns:
        list: the hotspots of all chromosomes, concatenated.

    Raises:
        SystemExit: on ``KeyboardInterrupt`` (after terminating the pool).
        Exception: any other failure is reported, the pool is terminated
            and the exception is re-raised.
    """
    pool = Pool(nthreads)

    try:

        print("gloablumbda", inputgloablumbda, "readlengthmean", inputfregion.readlengthmean)

        # Largest depth (at least 2) whose Bayes factor does not exceed the
        # threshold; workers compare the ratio-scaled depth against it.
        bayesfactorthresholdcount = 2

        i = 2

        while True:

            if bayesfactor(inputgloablumbda, i) > bayesfactorthreshold:

                break

            bayesfactorthresholdcount = i

            i = i + 1

        print("bayesfactorthresholdcount", bayesfactorthresholdcount)

        windowsize = 100000

        pars = list()

        for chromosmoe in countchr:

            chr_length = chipfregion.chrs_length[chromosmoe]

            for scare in range(0, int(chr_length / windowsize) + 1):

                # 200 bp of padding on both sides, clipped to the chromosome.
                nowstart = max(scare * windowsize + 1 - 200, 1)

                nowend = min((scare + 1) * windowsize + 200, chr_length)

                pars.append({
                    'region': chromosmoe + ":" + str(nowstart) + "-" + str(nowend),
                    'maxinsert': maxinsert,
                    'bamfile': chipfile,
                    'jobtype': jobtype,
                    'chrlength': chr_length,
                    'regionchromosome': chromosmoe,
                    'regionstart': nowstart,
                    'regionend': nowend,
                    'ratio': ratio,
                    'bayesfactorcount': bayesfactorthresholdcount,
                    'readlengthmean': chipfregion.readlengthmean,
                })

        # Group the per-window site lists by chromosome.
        chrenrichedpotin = dict()

        for enrichedinthread in pool.map(hotspot_control_worker, pars):

            chrenrichedpotin.setdefault(enrichedinthread['chromosome'], list()).append(
                enrichedinthread['list'])

        chrhotpars = list()

        for nowchr in chrenrichedpotin:

            chrhotpars.append({
                'chromosome': nowchr,
                'preregion': chrenrichedpotin[nowchr],
                # Look the length up by the chromosome being merged, not by
                # the variable left over from the window loop above.
                'chr_length': chipfregion.chrs_length[nowchr],
                'fregion': chipfregion,
            })

        hotspots = list()

        for hotinth in pool.map(hotspots_chromsome_merge, chrhotpars):

            hotspots.extend(hotinth)

        # Closing once is enough; the second close() call was redundant.
        pool.close()

        return hotspots

    except KeyboardInterrupt:

        pool.terminate()

        print("You cancelled the program!")

        sys.exit(1)

    except Exception as e:

        print('got exception in Jazzlib.hotspotsscan.hotspotsscan_withcontrol: %r, terminating the pool' % (e,))

        pool.terminate()

        print('pool is terminated')

        # Falling through would hand None to the caller, which then fails
        # later with an unrelated TypeError; surface the real error instead.
        raise

    finally:

        pool.join()
def hotspotsfilter(hotspots, peaks):
    """Keep only the hotspots that are the parent of at least one peak.

    Args:
        hotspots: iterable of objects with a ``hotspotid`` attribute.
        peaks: iterable of objects with a ``parent`` attribute.

    Returns:
        list: the hotspots whose ``hotspotid`` equals the ``parent`` of
        some peak, in their original order.
    """
    # Collect every parent id once so each hotspot needs a single lookup.
    parent_ids = {peak.parent for peak in peaks}

    return [candidate for candidate in hotspots if candidate.hotspotid in parent_ids]
continueregion(chrenrichlist, 2) 468 | 469 | for hotspotstarend in temphotspots: 470 | 471 | hotspotstart = hotspotstarend['start_site'] 472 | 473 | hotspotend = hotspotstarend['end_site'] 474 | 475 | if hotspotend-hotspotstart < fregion.readlengthmean/2: 476 | 477 | continue 478 | 479 | hotspotid = str(chromosome) + ":" + str(hotspotstart) +"-"+ str(hotspotend) 480 | 481 | hotspot = Hotspot(start=hotspotstart, end=hotspotend, chromosome=chromosome, hotspotid=hotspotid) 482 | 483 | hotspotslist.append(hotspot) 484 | 485 | return hotspotslist 486 | 487 | except Exception as e: 488 | 489 | print ('got exception in Jazzlib.hotspotsscan.hotspots_chromsome_merge: %r, terminating the pool' % (e,)) 490 | 491 | print (par) 492 | 493 | print ('pool is terminated') 494 | 495 | except KeyboardInterrupt: 496 | 497 | print ("You cancelled the program!") 498 | 499 | sys.exit(1) -------------------------------------------------------------------------------- /Jazzlib/hotspotsscan.py.bak: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from countreads import * 4 | from cEM_zip import * 5 | from FRegion import * 6 | from multiprocessing import Pool 7 | from Hotspot import * 8 | from sta import * 9 | from region import * 10 | from Peak import * 11 | 12 | 13 | class KeyboardInterruptError(Exception): 14 | 15 | pass 16 | 17 | 18 | def hotspotsscan_withoutcontrol(file, maxinsert, windowscare,countchr,gloablumbda, 19 | bayesfactorthreshold, nthreads, fregion, jobtype): 20 | 21 | pool = Pool(nthreads) 22 | 23 | try: 24 | 25 | pars = list() 26 | 27 | hotspots = list() 28 | 29 | print ("gloablumbda",gloablumbda , "readlengthmean", fregion.readlengthmean) 30 | 31 | bayesfactorthresholdcount = 2 32 | 33 | i = 2 34 | 35 | while True: 36 | 37 | nowbayesfactor = bayesfactor(gloablumbda, i) 38 | 39 | if nowbayesfactor > bayesfactorthreshold: 40 | 41 | break 42 | 43 | 
bayesfactorthresholdcount = i 44 | 45 | i = i + 1 46 | 47 | print ("bayesfactorthresholdcount", bayesfactorthresholdcount) 48 | 49 | windowsize = 100000 50 | 51 | for chromosmoe in countchr: 52 | 53 | chr_length = fregion.chrs_length[chromosmoe] 54 | 55 | for scare in range(0, int(chr_length/windowsize)+1): 56 | 57 | nowstart = scare*windowsize + 1 -200 58 | 59 | nowend = (scare+1)*windowsize + 200 60 | 61 | if nowend > chr_length: 62 | 63 | nowend = chr_length 64 | 65 | if nowstart < 1: 66 | 67 | nowstart = 1 68 | 69 | nowregion = chromosmoe + ":" + str(nowstart) + "-" + str(nowend) 70 | 71 | par = dict() 72 | 73 | par['region'] = nowregion 74 | 75 | par['maxinsert'] = maxinsert 76 | 77 | par['bamfile'] = file 78 | 79 | par['jobtype'] = jobtype 80 | 81 | par['chrlength'] = chr_length 82 | 83 | par['regionchromosome'] = chromosmoe 84 | 85 | par['regionstart'] = nowstart 86 | 87 | par['regionend'] = nowend 88 | 89 | # par['bayesfactordic'] = bayesfactordic 90 | 91 | par['bayesfactorcount'] = bayesfactorthresholdcount 92 | 93 | par['readlengthmean'] = fregion.readlengthmean 94 | 95 | pars.append(par) 96 | 97 | enrichedinthreads = pool.map(hotspot_withoutcontrol_worker, pars) 98 | 99 | chrenrichedpotin = dict() 100 | 101 | for enrichedinthread in enrichedinthreads: 102 | 103 | nowchr = enrichedinthread['chromosome'] 104 | 105 | if nowchr in chrenrichedpotin: 106 | 107 | chrenrichedpotin[nowchr].append(enrichedinthread['list']) 108 | 109 | else: 110 | 111 | chrenrichedpotin[nowchr] = list() 112 | 113 | chrenrichedpotin[nowchr].append(enrichedinthread['list']) 114 | 115 | chrhotpars = list() 116 | 117 | for nowchr in chrenrichedpotin: 118 | 119 | hotpar = dict() 120 | 121 | hotpar['chromosome'] = nowchr 122 | 123 | hotpar['preregion'] = chrenrichedpotin[nowchr] 124 | 125 | hotpar['chr_length'] = fregion.chrs_length[chromosmoe] 126 | 127 | hotpar['fregion'] = fregion 128 | 129 | chrhotpars.append(hotpar) 130 | 131 | hotsptosinthreads = 
def hotspot_withoutcontrol_worker(par):
    """Collect the enriched sites of one genome window (pool worker).

    Args:
        par: dict with the keys ``maxinsert``, ``bamfile``, ``jobtype``,
            ``regionchromosome``, ``regionstart``, ``regionend``,
            ``bayesfactorcount`` and ``readlengthmean``.

    Returns:
        dict: ``{'chromosome': <name>, 'list': [sites]}`` where the sites
        are those whose extended depth is at least ``bayesfactorcount``.

    Raises:
        Exception: any failure is reported and re-raised so the parent's
            ``pool.map`` sees the real error instead of a ``None`` result.
        SystemExit: on ``KeyboardInterrupt``.
    """
    try:

        chromosome = par['regionchromosome']

        bayesfactorcount = par['bayesfactorcount']

        datacount = extenddepthcount(bamfile=par['bamfile'], regionchromosome=chromosome,
                                     regionstart=par['regionstart'], regionend=par['regionend'],
                                     maxinsert=par['maxinsert'], jobtype=par['jobtype'],
                                     readlengthmean=par['readlengthmean'])

        enrichedlist = dict()

        enrichedlist['chromosome'] = chromosome

        enrichedlist['list'] = [site for site in datacount
                                if datacount[site] >= bayesfactorcount]

        return enrichedlist

    # "as" form works on Python 2.6+ and Python 3 alike.
    except Exception as e:

        # A worker cannot terminate the pool, so the message no longer
        # claims it did; re-raising lets the parent handle the failure.
        print('got exception in Jazzlib.hotspotsscan.hotspot_withoutcontrol_worker: %r' % (e,))

        raise

    except KeyboardInterrupt:

        print("You cancelled the program!")

        sys.exit(1)
bayesfactorthreshold, nthreads, chipfregion, jobtype, ratio, 217 | inputfile, inputfregion): 218 | 219 | pool = Pool(nthreads) 220 | 221 | try: 222 | 223 | pars = list() 224 | 225 | hotspots = list() 226 | 227 | print ("gloablumbda",inputgloablumbda , "readlengthmean", inputfregion.readlengthmean) 228 | 229 | bayesfactorthresholdcount = 2 230 | 231 | i = 2 232 | 233 | while True: 234 | 235 | nowbayesfactor = bayesfactor(inputgloablumbda, i) 236 | 237 | if nowbayesfactor > bayesfactorthreshold: 238 | 239 | break 240 | 241 | bayesfactorthresholdcount = i 242 | 243 | i = i + 1 244 | 245 | print ("bayesfactorthresholdcount", bayesfactorthresholdcount) 246 | 247 | windowsize = 100000 248 | 249 | for chromosmoe in countchr: 250 | 251 | chr_length = chipfregion.chrs_length[chromosmoe] 252 | 253 | for scare in range(0, int(chr_length/windowsize)+1): 254 | 255 | nowstart = scare*windowsize + 1 -200 256 | 257 | nowend = (scare+1)*windowsize + 200 258 | 259 | if nowend > chr_length: 260 | 261 | nowend = chr_length 262 | 263 | if nowstart < 1: 264 | 265 | nowstart = 1 266 | 267 | nowregion = chromosmoe + ":" + str(nowstart) + "-" + str(nowend) 268 | 269 | par = dict() 270 | 271 | par['region'] = nowregion 272 | 273 | par['maxinsert'] = maxinsert 274 | 275 | par['bamfile'] = chipfile 276 | 277 | par['jobtype'] = jobtype 278 | 279 | par['chrlength'] = chr_length 280 | 281 | par['regionchromosome'] = chromosmoe 282 | 283 | par['regionstart'] = nowstart 284 | 285 | par['regionend'] = nowend 286 | 287 | par['ratio'] = ratio 288 | 289 | # par['bayesfactordic'] = bayesfactordic 290 | 291 | par['bayesfactorcount'] = bayesfactorthresholdcount 292 | 293 | par['readlengthmean'] = chipfregion.readlengthmean 294 | 295 | pars.append(par) 296 | 297 | enrichedinthreads = pool.map(hotspot_control_worker, pars) 298 | 299 | chrenrichedpotin = dict() 300 | 301 | for enrichedinthread in enrichedinthreads: 302 | 303 | nowchr = enrichedinthread['chromosome'] 304 | 305 | if nowchr in 
chrenrichedpotin: 306 | 307 | chrenrichedpotin[nowchr].append(enrichedinthread['list']) 308 | 309 | else: 310 | 311 | chrenrichedpotin[nowchr] = list() 312 | 313 | chrenrichedpotin[nowchr].append(enrichedinthread['list']) 314 | 315 | chrhotpars = list() 316 | 317 | for nowchr in chrenrichedpotin: 318 | 319 | hotpar = dict() 320 | 321 | hotpar['chromosome'] = nowchr 322 | 323 | hotpar['preregion'] = chrenrichedpotin[nowchr] 324 | 325 | hotpar['chr_length'] = chipfregion.chrs_length[chromosmoe] 326 | 327 | hotpar['fregion'] = chipfregion 328 | 329 | chrhotpars.append(hotpar) 330 | 331 | hotsptosinthreads = pool.map(hotspots_chromsome_merge, chrhotpars) 332 | 333 | for hotinth in hotsptosinthreads: 334 | 335 | for hotspotnow in hotinth: 336 | 337 | hotspots.append(hotspotnow) 338 | 339 | pool.close() 340 | 341 | pool.close() 342 | 343 | return hotspots 344 | 345 | except KeyboardInterrupt: 346 | 347 | pool.terminate() 348 | 349 | print ("You cancelled the program!") 350 | 351 | sys.exit(1) 352 | 353 | except Exception, e: 354 | 355 | print ('got exception in Jazzlib.hotspotsscan.hotspotsscan_withcontrol: %r, terminating the pool' % (e,)) 356 | 357 | pool.terminate() 358 | 359 | print ('pool is terminated') 360 | 361 | finally: 362 | 363 | pool.join() 364 | 365 | 366 | def hotspot_control_worker(par): 367 | 368 | try: 369 | 370 | maxinsert = par['maxinsert'] 371 | 372 | bamfile = par['bamfile'] 373 | 374 | jobtype = par['jobtype'] 375 | 376 | chromosome = par['regionchromosome'] 377 | 378 | nowstart = par['regionstart'] 379 | 380 | nowend = par['regionend'] 381 | 382 | bayesfactorcount = par['bayesfactorcount'] 383 | 384 | readlengthmean = par['readlengthmean'] 385 | 386 | ratio = par['ratio'] 387 | 388 | datacount = extenddepthcount(bamfile=bamfile, regionchromosome=chromosome, regionstart=nowstart, 389 | regionend=nowend, maxinsert=maxinsert, jobtype=jobtype, 390 | readlengthmean=readlengthmean) 391 | 392 | enrichedlist = dict() 393 | 394 | enrichedlist['chromosome'] 
def hotspotsfilter(hotspots, peaks):
    """Return the hotspots that have at least one peak pointing at them.

    Args:
        hotspots: iterable of objects with a ``hotspotid`` attribute.
        peaks: iterable of objects with a ``parent`` attribute.

    Returns:
        list: hotspots whose ``hotspotid`` is the ``parent`` of some peak,
        in input order.
    """
    # Ids of every hotspot that owns a peak.
    seen_parents = set()

    for peak in peaks:

        seen_parents.add(peak.parent)

    supported = list()

    for candidate in hotspots:

        if candidate.hotspotid not in seen_parents:

            continue

        supported.append(candidate)

    return supported
hotspotslist.append(hotspot) 484 | 485 | return hotspotslist 486 | 487 | except Exception, e: 488 | 489 | print ('got exception in Jazzlib.hotspotsscan.hotspots_chromsome_merge: %r, terminating the pool' % (e,)) 490 | 491 | print (par) 492 | 493 | print ('pool is terminated') 494 | 495 | except KeyboardInterrupt: 496 | 497 | print ("You cancelled the program!") 498 | 499 | sys.exit(1) -------------------------------------------------------------------------------- /Jazzlib/peaksscan.py: -------------------------------------------------------------------------------- 1 | 2 | from .countreads import * 3 | from .cEM_zip import * 4 | from .FRegion import * 5 | from multiprocessing import Pool 6 | from .Peak import * 7 | from .sta import * 8 | from .region import * 9 | from .Hotspot import * 10 | 11 | 12 | class KeyboardInterruptError(Exception): 13 | 14 | pass 15 | 16 | 17 | def peakscan_without_control(datafile, maxinsert, bayesfactorthreshold, nthreads, fregion, 18 | jobtype, hotspots, gloablumbda): 19 | 20 | pool = Pool(nthreads) 21 | 22 | try: 23 | 24 | pars = list() 25 | 26 | for hotspot in hotspots: 27 | 28 | par = dict() 29 | 30 | par['hotspot'] = hotspot 31 | 32 | par['datafile'] = datafile 33 | 34 | par['maxinsert'] = maxinsert 35 | 36 | par['bayesfactorthreashold'] = bayesfactorthreshold 37 | 38 | par['jobtype'] = jobtype 39 | 40 | par['gloablumbda'] = gloablumbda 41 | 42 | par['ratio'] = 1 43 | 44 | par['fregion'] = fregion 45 | 46 | pars.append(par) 47 | 48 | peaksinthreads = pool.map(peakscan_withoutcontrol_worker, pars) 49 | 50 | peaks = list() 51 | 52 | for hotspotnow in peaksinthreads: 53 | 54 | for peaknow in hotspotnow: 55 | 56 | print((peaknow.peakid)) 57 | 58 | peaks.append(peaknow) 59 | 60 | pool.close() 61 | 62 | return peaks 63 | 64 | except KeyboardInterrupt: 65 | 66 | pool.terminate() 67 | 68 | print ("You cancelled the program!") 69 | 70 | sys.exit(1) 71 | 72 | except Exception as e: 73 | 74 | print(('got exception in 
def _window_counts(datacount, first, last, chrlength):
    """Return depths for sites in [first, last) that lie within 0..chrlength; sites absent from datacount count as 0."""
    counts = list()

    for sitenow in range(first, last):

        if sitenow < 0 or sitenow > chrlength:

            continue

        counts.append(datacount[sitenow] if sitenow in datacount else 0)

    return counts


def peakscan_withoutcontrol_worker(par):
    """Call peaks inside one hotspot without a control sample (pool worker).

    The local background is estimated with ``cEM_zip`` on the depths in a
    +/-2500 bp and a +/-5000 bp window around the hotspot. The largest of
    the two estimates and the global lambda is used as lambda for
    ``bayesfactor``. Sites whose Bayes factor exceeds the threshold are
    joined by ``continueregion`` and each resulting region becomes a
    ``Peak``.

    Args:
        par: dict with the keys ``hotspot``, ``datafile``, ``maxinsert``,
            ``bayesfactorthreashold``, ``jobtype``, ``gloablumbda`` and
            ``fregion``.

    Returns:
        list: ``Peak`` objects found in the hotspot (possibly empty).

    Raises:
        Exception: any failure is reported together with the hotspot
            coordinates and re-raised, so the parent's ``pool.map`` sees
            the real error instead of a ``None`` result.
        SystemExit: on ``KeyboardInterrupt``.
    """
    try:

        hotspot = par['hotspot']

        datafile = par['datafile']

        maxinsert = par['maxinsert']

        bayesfactorthreshold = par['bayesfactorthreashold']

        jobtype = par['jobtype']

        gloablumbda = par['gloablumbda']

        fregion = par['fregion']

        start = hotspot.start

        end = hotspot.end

        chromosome = hotspot.chromosome

        chrlength = fregion.chrs_length[chromosome]

        # Fetch depth for the hotspot plus 5100 bp of flank, clipped to
        # the chromosome.
        regionstart = max(start - 5100, 1)

        regionend = min(end + 5100, chrlength)

        datacount = depthcount(bamfile=datafile, regionchromosome=chromosome, regionstart=regionstart,
                               regionend=regionend, maxinsert=maxinsert, jobtype=jobtype)

        inputwindow10k = _window_counts(datacount, start - 5000, end + 5000, chrlength)

        inputwindow5k = _window_counts(datacount, start - 2500, end + 2500, chrlength)

        (window5klhat, _window5kphat) = cEM_zip(inputwindow5k)

        (window10klhat, _window10kphat) = cEM_zip(inputwindow10k)

        maxlhat = max(window5klhat, window10klhat, gloablumbda)

        # Cap extreme local estimates at five times the global lambda.
        if maxlhat > 400:

            maxlhat = gloablumbda * 5

        enrichedsite = dict()

        bayesfactorscore = dict()

        for wsite in range(start - 1, end + 1):

            if wsite in datacount:

                # Depths below 2 are scored as if they were 2.
                nowcount = max(datacount[wsite], 2)

                nowbayesfactor = bayesfactor(locallambda=maxlhat, peakscore=nowcount)

                bayesfactorscore[wsite] = nowbayesfactor

                if nowbayesfactor > bayesfactorthreshold:

                    enrichedsite[wsite] = 1

        tmppeaks = continueregion(points=list(enrichedsite.keys()), minlength=1)

        peaks = list()

        for iniid, tmppeak in enumerate(tmppeaks, 1):

            tmppeakstart = tmppeak['start_site']

            tmppeakend = tmppeak['end_site']

            totalbayesscore = 0

            maxscore = 0

            maxsite = 0

            for site in range(tmppeakstart, tmppeakend + 1):

                score = bayesfactorscore[site]

                totalbayesscore = totalbayesscore + score

                if score > maxscore:

                    # Remember the running maximum. Assigning the other way
                    # round left maxscore at 0, so the summit ended up on
                    # the last positive-scoring site instead of the best one.
                    maxscore = score

                    maxsite = site

            avgbayescore = totalbayesscore / (tmppeakend - tmppeakstart + 1)

            peakid = hotspot.hotspotid + '.' + str(iniid)

            peaks.append(Peak(start=tmppeakstart, end=tmppeakend, chromosome=chromosome,
                              peakpoint=maxsite, peakid=peakid, score=avgbayescore,
                              parent=hotspot.hotspotid))

        return peaks

    except Exception as e:

        print(('got exception in Jazzlib.peaksscan.peakscan_without_control: %r, terminating the pool' % (e,)))

        print((par['hotspot'].chromosome, par['hotspot'].start, par['hotspot'].end))

        # Returning None here would only make the parent fail with an
        # unrelated TypeError while iterating the results.
        raise

    except KeyboardInterrupt:

        print("You cancelled the program!")

        sys.exit(1)
par['maxinsert'] 350 | 351 | bayesfactorthreshold = par['bayesfactorthreashold'] 352 | 353 | jobtype = par['jobtype'] 354 | 355 | gloablumbda = par['gloablumbda'] 356 | 357 | ratio = par['ratio'] 358 | 359 | fregion = par['fregion'] 360 | 361 | start = hotspot.start 362 | 363 | end = hotspot.end 364 | 365 | chromosome = hotspot.chromosome 366 | 367 | chrlength = fregion.chrs_length[chromosome] 368 | 369 | regionstart = start - 5100 370 | 371 | regionend = end + 5100 372 | 373 | if regionstart < 1: 374 | 375 | regionstart = 1 376 | 377 | if regionend > chrlength: 378 | 379 | regionend = chrlength 380 | 381 | datacount = depthcount(bamfile=datafile, regionchromosome=chromosome, regionstart=regionstart, 382 | regionend=regionend, maxinsert=maxinsert, jobtype=jobtype) 383 | 384 | inputcount = depthcount(bamfile=inputfile, regionchromosome=chromosome, regionstart=regionstart, 385 | regionend=regionend, maxinsert=maxinsert, jobtype=jobtype) 386 | 387 | # datacount = midsitecount(bamfile=datafile, regionchromosome=chromosome, regionstart=regionstart, 388 | # regionend=regionend, maxinsert=maxinsert, jobtype=jobtype) 389 | 390 | enrichedsite = dict() 391 | 392 | bayesfactorscore = dict() 393 | 394 | inputwindow5k = list() 395 | 396 | inputwindow10k = list() 397 | 398 | inputwindow1k = list() 399 | 400 | for sitenow in range(start-5000,end+5000): 401 | 402 | nowcount = 0 403 | 404 | if sitenow < 0: 405 | 406 | continue 407 | 408 | if sitenow > chrlength: 409 | 410 | continue 411 | 412 | if sitenow in inputcount: 413 | 414 | nowcount = inputcount[sitenow] 415 | 416 | inputwindow10k.append(nowcount) 417 | 418 | for sitenow in range(start-2500,end+2500): 419 | 420 | nowcount = 0 421 | 422 | if sitenow < 0: 423 | 424 | continue 425 | 426 | if sitenow > chrlength: 427 | 428 | continue 429 | 430 | if sitenow in inputcount: 431 | 432 | nowcount = inputcount[sitenow] 433 | 434 | inputwindow5k.append(nowcount) 435 | 436 | for sitenow in range(start-500,end+500): 437 | 438 | nowcount 
= 0 439 | 440 | if sitenow < 0: 441 | 442 | continue 443 | 444 | if sitenow > chrlength: 445 | 446 | continue 447 | 448 | if sitenow in inputcount: 449 | 450 | nowcount = inputcount[sitenow] 451 | 452 | inputwindow1k.append(nowcount) 453 | 454 | 455 | (window5klhat, window5kphat) = cEM_zip(inputwindow5k) 456 | 457 | (window10klhat, window10kphat) = cEM_zip(inputwindow10k) 458 | 459 | (window1klhat, window1kphat) = cEM_zip(inputwindow1k) 460 | 461 | maxlhat = max(window5klhat, window10klhat, window1klhat, gloablumbda) 462 | 463 | if maxlhat > 400: 464 | 465 | maxlhat = gloablumbda * 5 466 | 467 | for wsite in range(start-1, end+1): 468 | 469 | if wsite in datacount: 470 | 471 | nowcount = int(datacount[wsite]*ratio) 472 | 473 | if nowcount < 2: 474 | 475 | nowbayesfactor = bayesfactor(locallambda=maxlhat, peakscore=2) 476 | 477 | else: 478 | 479 | nowbayesfactor = bayesfactor(locallambda=maxlhat, peakscore=nowcount) 480 | 481 | bayesfactorscore[wsite] = nowbayesfactor 482 | 483 | if nowbayesfactor > bayesfactorthreshold: 484 | 485 | enrichedsite[wsite] = 1 486 | 487 | regionlist = list(enrichedsite.keys()) 488 | 489 | tmppeaks = continueregion(points=regionlist, minlength=1) 490 | 491 | iniid = 1 492 | 493 | for tmppeak in tmppeaks: 494 | 495 | tmppeakstart = tmppeak['start_site'] 496 | 497 | tmppeakend = tmppeak['end_site'] 498 | 499 | totalbayesscore = 0 500 | 501 | maxscore = 0 502 | 503 | maxsite = 0 504 | 505 | for site in range(tmppeakstart, tmppeakend+1): 506 | 507 | score = bayesfactorscore[site] 508 | 509 | totalbayesscore = totalbayesscore + score 510 | 511 | if score > maxscore: 512 | 513 | score = maxscore 514 | 515 | maxsite = site 516 | 517 | avgbayescore = totalbayesscore/(tmppeakend - tmppeakstart + 1) 518 | 519 | peakid = hotspot.hotspotid+'.'+str(iniid) 520 | 521 | peak = Peak(start=tmppeakstart, end=tmppeakend, chromosome=chromosome, peakpoint=maxsite, peakid=peakid, 522 | score=avgbayescore, parent=hotspot.hotspotid) 523 | 524 | iniid= iniid +1 
525 | 526 | peaks.append(peak) 527 | 528 | return peaks 529 | 530 | except Exception as e: 531 | 532 | print(('got exception in Jazzlib.peaksscan.peakscan_without_control: %r, terminating the pool' % (e,))) 533 | 534 | print((par['hotspot'].chromosome, par['hotspot'].start,par['hotspot'].end)) 535 | 536 | 537 | except KeyboardInterrupt: 538 | 539 | print ("You cancelled the program!") 540 | 541 | sys.exit(1) -------------------------------------------------------------------------------- /Jazzlib/peaksscan.py.bak: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from countreads import * 3 | from cEM_zip import * 4 | from FRegion import * 5 | from multiprocessing import Pool 6 | from Peak import * 7 | from sta import * 8 | from region import * 9 | from Hotspot import * 10 | 11 | 12 | class KeyboardInterruptError(Exception): 13 | 14 | pass 15 | 16 | 17 | def peakscan_without_control(datafile, maxinsert, bayesfactorthreshold, nthreads, fregion, 18 | jobtype, hotspots, gloablumbda): 19 | 20 | pool = Pool(nthreads) 21 | 22 | try: 23 | 24 | pars = list() 25 | 26 | for hotspot in hotspots: 27 | 28 | par = dict() 29 | 30 | par['hotspot'] = hotspot 31 | 32 | par['datafile'] = datafile 33 | 34 | par['maxinsert'] = maxinsert 35 | 36 | par['bayesfactorthreashold'] = bayesfactorthreshold 37 | 38 | par['jobtype'] = jobtype 39 | 40 | par['gloablumbda'] = gloablumbda 41 | 42 | par['ratio'] = 1 43 | 44 | par['fregion'] = fregion 45 | 46 | pars.append(par) 47 | 48 | peaksinthreads = pool.map(peakscan_withoutcontrol_worker, pars) 49 | 50 | peaks = list() 51 | 52 | for hotspotnow in peaksinthreads: 53 | 54 | for peaknow in hotspotnow: 55 | 56 | print (peaknow.peakid) 57 | 58 | peaks.append(peaknow) 59 | 60 | pool.close() 61 | 62 | return peaks 63 | 64 | except KeyboardInterrupt: 65 | 66 | pool.terminate() 67 | 68 | print ("You cancelled the program!") 69 | 70 | sys.exit(1) 71 | 72 | except Exception, e: 73 | 74 | 
print ('got exception in Jazzlib.peaksscan.peakscan_without_control: %r, terminating the pool' % (e,)) 75 | 76 | pool.terminate() 77 | 78 | print ('pool is terminated') 79 | 80 | finally: 81 | # print ('joining pool processes') 82 | pool.join() 83 | # print ('join complete') 84 | 85 | 86 | def peakscan_withoutcontrol_worker(par): 87 | 88 | try: 89 | peaks = list() 90 | 91 | hotspot = par['hotspot'] 92 | 93 | datafile = par['datafile'] 94 | 95 | maxinsert = par['maxinsert'] 96 | 97 | bayesfactorthreshold = par['bayesfactorthreashold'] 98 | 99 | jobtype = par['jobtype'] 100 | 101 | gloablumbda = par['gloablumbda'] 102 | 103 | ratio = par['ratio'] 104 | 105 | fregion = par['fregion'] 106 | 107 | start = hotspot.start 108 | 109 | end = hotspot.end 110 | 111 | chromosome = hotspot.chromosome 112 | 113 | chrlength = fregion.chrs_length[chromosome] 114 | 115 | regionstart = start - 5100 116 | 117 | regionend = end + 5100 118 | 119 | if regionstart < 1: 120 | 121 | regionstart = 1 122 | 123 | if regionend > chrlength: 124 | 125 | regionend = chrlength 126 | 127 | datacount = depthcount(bamfile=datafile, regionchromosome=chromosome, regionstart=regionstart, 128 | regionend=regionend, maxinsert=maxinsert, jobtype=jobtype) 129 | 130 | # datacount = midsitecount(bamfile=datafile, regionchromosome=chromosome, regionstart=regionstart, 131 | # regionend=regionend, maxinsert=maxinsert, jobtype=jobtype) 132 | 133 | enrichedsite = dict() 134 | 135 | bayesfactorscore = dict() 136 | 137 | inputwindow5k = list() 138 | 139 | inputwindow10k = list() 140 | 141 | for sitenow in range(start-5000,end+5000): 142 | 143 | nowcount = 0 144 | 145 | if sitenow < 0: 146 | 147 | continue 148 | 149 | if sitenow > chrlength: 150 | 151 | continue 152 | 153 | if sitenow in datacount: 154 | 155 | nowcount = datacount[sitenow] 156 | 157 | inputwindow10k.append(nowcount) 158 | 159 | for sitenow in range(start-2500,end+2500): 160 | 161 | nowcount = 0 162 | 163 | if sitenow < 0: 164 | 165 | continue 166 | 
167 | if sitenow > chrlength: 168 | 169 | continue 170 | 171 | if sitenow in datacount: 172 | 173 | nowcount = datacount[sitenow] 174 | 175 | inputwindow5k.append(nowcount) 176 | 177 | 178 | (window5klhat, window5kphat) = cEM_zip(inputwindow5k) 179 | 180 | (window10klhat, window10kphat) = cEM_zip(inputwindow10k) 181 | 182 | maxlhat = max(window5klhat, window10klhat, gloablumbda) 183 | 184 | if maxlhat > 400: 185 | 186 | maxlhat = gloablumbda * 5 187 | 188 | for wsite in range(start-1, end+1): 189 | 190 | if wsite in datacount: 191 | 192 | nowcount = datacount[wsite] 193 | 194 | if nowcount < 2: 195 | 196 | nowbayesfactor = bayesfactor(locallambda=maxlhat, peakscore=2) 197 | 198 | else: 199 | 200 | nowbayesfactor = bayesfactor(locallambda=maxlhat, peakscore=nowcount) 201 | 202 | bayesfactorscore[wsite] = nowbayesfactor 203 | 204 | if nowbayesfactor > bayesfactorthreshold: 205 | 206 | enrichedsite[wsite] = 1 207 | 208 | regionlist = enrichedsite.keys() 209 | 210 | tmppeaks = continueregion(points=regionlist, minlength=1) 211 | 212 | iniid = 1 213 | 214 | for tmppeak in tmppeaks: 215 | 216 | tmppeakstart = tmppeak['start_site'] 217 | 218 | tmppeakend = tmppeak['end_site'] 219 | 220 | totalbayesscore = 0 221 | 222 | maxscore = 0 223 | 224 | maxsite = 0 225 | 226 | for site in range(tmppeakstart, tmppeakend+1): 227 | 228 | score = bayesfactorscore[site] 229 | 230 | totalbayesscore = totalbayesscore + score 231 | 232 | if score > maxscore: 233 | 234 | score = maxscore 235 | 236 | maxsite = site 237 | 238 | avgbayescore = totalbayesscore/(tmppeakend - tmppeakstart + 1) 239 | 240 | peakid = hotspot.hotspotid+'.'+str(iniid) 241 | 242 | peak = Peak(start=tmppeakstart, end=tmppeakend, chromosome=chromosome, peakpoint=maxsite, peakid=peakid, 243 | score=avgbayescore, parent=hotspot.hotspotid) 244 | 245 | iniid= iniid +1 246 | 247 | peaks.append(peak) 248 | 249 | return peaks 250 | 251 | except Exception, e: 252 | 253 | print ('got exception in 
Jazzlib.peaksscan.peakscan_without_control: %r, terminating the pool' % (e,)) 254 | 255 | print (par['hotspot'].chromosome, par['hotspot'].start,par['hotspot'].end) 256 | 257 | 258 | except KeyboardInterrupt: 259 | 260 | print ("You cancelled the program!") 261 | 262 | sys.exit(1) 263 | 264 | 265 | 266 | 267 | def peakscan_control(datafile, maxinsert, bayesfactorthreshold, nthreads, chipfregion, 268 | jobtype, hotspots, gloablumbda, inputfile, ratio, inputfregion): 269 | 270 | pool = Pool(nthreads) 271 | 272 | try: 273 | 274 | pars = list() 275 | 276 | for hotspot in hotspots: 277 | 278 | par = dict() 279 | 280 | par['hotspot'] = hotspot 281 | 282 | par['datafile'] = datafile 283 | 284 | par['maxinsert'] = maxinsert 285 | 286 | par['bayesfactorthreashold'] = bayesfactorthreshold 287 | 288 | par['jobtype'] = jobtype 289 | 290 | par['gloablumbda'] = gloablumbda 291 | 292 | par['ratio'] = ratio 293 | 294 | par['inputfile'] = inputfile 295 | 296 | par['fregion'] = inputfregion 297 | 298 | pars.append(par) 299 | 300 | peaksinthreads = pool.map(peakscan_withoutcontrol_worker, pars) 301 | 302 | peaks = list() 303 | 304 | for hotspotnow in peaksinthreads: 305 | 306 | for peaknow in hotspotnow: 307 | 308 | print (peaknow.peakid) 309 | 310 | peaks.append(peaknow) 311 | 312 | pool.close() 313 | 314 | return peaks 315 | 316 | except KeyboardInterrupt: 317 | 318 | pool.terminate() 319 | 320 | print ("You cancelled the program!") 321 | 322 | sys.exit(1) 323 | 324 | except Exception, e: 325 | 326 | print ('got exception in Jazzlib.peaksscan.peakscan_without_control: %r, terminating the pool' % (e,)) 327 | 328 | pool.terminate() 329 | 330 | print ('pool is terminated') 331 | 332 | finally: 333 | # print ('joining pool processes') 334 | pool.join() 335 | # print ('join complete') 336 | 337 | 338 | def peakscan_control_worker(par): 339 | 340 | try: 341 | peaks = list() 342 | 343 | hotspot = par['hotspot'] 344 | 345 | datafile = par['datafile'] 346 | 347 | inputfile = 
par['inputfile'] 348 | 349 | maxinsert = par['maxinsert'] 350 | 351 | bayesfactorthreshold = par['bayesfactorthreashold'] 352 | 353 | jobtype = par['jobtype'] 354 | 355 | gloablumbda = par['gloablumbda'] 356 | 357 | ratio = par['ratio'] 358 | 359 | fregion = par['fregion'] 360 | 361 | start = hotspot.start 362 | 363 | end = hotspot.end 364 | 365 | chromosome = hotspot.chromosome 366 | 367 | chrlength = fregion.chrs_length[chromosome] 368 | 369 | regionstart = start - 5100 370 | 371 | regionend = end + 5100 372 | 373 | if regionstart < 1: 374 | 375 | regionstart = 1 376 | 377 | if regionend > chrlength: 378 | 379 | regionend = chrlength 380 | 381 | datacount = depthcount(bamfile=datafile, regionchromosome=chromosome, regionstart=regionstart, 382 | regionend=regionend, maxinsert=maxinsert, jobtype=jobtype) 383 | 384 | inputcount = depthcount(bamfile=inputfile, regionchromosome=chromosome, regionstart=regionstart, 385 | regionend=regionend, maxinsert=maxinsert, jobtype=jobtype) 386 | 387 | # datacount = midsitecount(bamfile=datafile, regionchromosome=chromosome, regionstart=regionstart, 388 | # regionend=regionend, maxinsert=maxinsert, jobtype=jobtype) 389 | 390 | enrichedsite = dict() 391 | 392 | bayesfactorscore = dict() 393 | 394 | inputwindow5k = list() 395 | 396 | inputwindow10k = list() 397 | 398 | inputwindow1k = list() 399 | 400 | for sitenow in range(start-5000,end+5000): 401 | 402 | nowcount = 0 403 | 404 | if sitenow < 0: 405 | 406 | continue 407 | 408 | if sitenow > chrlength: 409 | 410 | continue 411 | 412 | if sitenow in inputcount: 413 | 414 | nowcount = inputcount[sitenow] 415 | 416 | inputwindow10k.append(nowcount) 417 | 418 | for sitenow in range(start-2500,end+2500): 419 | 420 | nowcount = 0 421 | 422 | if sitenow < 0: 423 | 424 | continue 425 | 426 | if sitenow > chrlength: 427 | 428 | continue 429 | 430 | if sitenow in inputcount: 431 | 432 | nowcount = inputcount[sitenow] 433 | 434 | inputwindow5k.append(nowcount) 435 | 436 | for sitenow in 
range(start-500,end+500): 437 | 438 | nowcount = 0 439 | 440 | if sitenow < 0: 441 | 442 | continue 443 | 444 | if sitenow > chrlength: 445 | 446 | continue 447 | 448 | if sitenow in inputcount: 449 | 450 | nowcount = inputcount[sitenow] 451 | 452 | inputwindow1k.append(nowcount) 453 | 454 | 455 | (window5klhat, window5kphat) = cEM_zip(inputwindow5k) 456 | 457 | (window10klhat, window10kphat) = cEM_zip(inputwindow10k) 458 | 459 | (window1klhat, window1kphat) = cEM_zip(inputwindow1k) 460 | 461 | maxlhat = max(window5klhat, window10klhat, window1klhat, gloablumbda) 462 | 463 | if maxlhat > 400: 464 | 465 | maxlhat = gloablumbda * 5 466 | 467 | for wsite in range(start-1, end+1): 468 | 469 | if wsite in datacount: 470 | 471 | nowcount = int(datacount[wsite]*ratio) 472 | 473 | if nowcount < 2: 474 | 475 | nowbayesfactor = bayesfactor(locallambda=maxlhat, peakscore=2) 476 | 477 | else: 478 | 479 | nowbayesfactor = bayesfactor(locallambda=maxlhat, peakscore=nowcount) 480 | 481 | bayesfactorscore[wsite] = nowbayesfactor 482 | 483 | if nowbayesfactor > bayesfactorthreshold: 484 | 485 | enrichedsite[wsite] = 1 486 | 487 | regionlist = enrichedsite.keys() 488 | 489 | tmppeaks = continueregion(points=regionlist, minlength=1) 490 | 491 | iniid = 1 492 | 493 | for tmppeak in tmppeaks: 494 | 495 | tmppeakstart = tmppeak['start_site'] 496 | 497 | tmppeakend = tmppeak['end_site'] 498 | 499 | totalbayesscore = 0 500 | 501 | maxscore = 0 502 | 503 | maxsite = 0 504 | 505 | for site in range(tmppeakstart, tmppeakend+1): 506 | 507 | score = bayesfactorscore[site] 508 | 509 | totalbayesscore = totalbayesscore + score 510 | 511 | if score > maxscore: 512 | 513 | score = maxscore 514 | 515 | maxsite = site 516 | 517 | avgbayescore = totalbayesscore/(tmppeakend - tmppeakstart + 1) 518 | 519 | peakid = hotspot.hotspotid+'.'+str(iniid) 520 | 521 | peak = Peak(start=tmppeakstart, end=tmppeakend, chromosome=chromosome, peakpoint=maxsite, peakid=peakid, 522 | score=avgbayescore, 
parent=hotspot.hotspotid) 523 | 524 | iniid= iniid +1 525 | 526 | peaks.append(peak) 527 | 528 | return peaks 529 | 530 | except Exception, e: 531 | 532 | print ('got exception in Jazzlib.peaksscan.peakscan_without_control: %r, terminating the pool' % (e,)) 533 | 534 | print (par['hotspot'].chromosome, par['hotspot'].start,par['hotspot'].end) 535 | 536 | 537 | except KeyboardInterrupt: 538 | 539 | print ("You cancelled the program!") 540 | 541 | sys.exit(1) --------------------------------------------------------------------------------