├── .gitignore ├── CLAM ├── __init__.py ├── bak │ ├── CLAM.fdr_peak.MP.py │ ├── CLAM.lite_aligner.py │ ├── deep_getsizeof.py │ ├── peakcaller.bak.py │ ├── peakcaller.bak2.py │ ├── realigner.bak.py │ ├── sim_callpeak.r │ └── utils.py ├── config.py ├── download_data.py ├── peak_annotator.py ├── peakcaller.py ├── permutation_peakcaller.py ├── preprocessor.py ├── realigner.py ├── stats │ ├── __init__.py │ ├── bin_test_alternatives.py │ └── ztnb_em.py └── utils │ └── parseBAM.py ├── LICENSE ├── README.md ├── _config.yml ├── bin └── CLAM ├── check └── compare_realign.py ├── deprecated ├── CLAM.fdr_peak.MP.py ├── CLAM.lite_aligner.py ├── GTF │ └── hg19_ensembl.sorted_gene.bed ├── README.md ├── requirements.txt └── runCLAM_git.sh ├── docs ├── .nojekyll ├── CLAM.rst ├── CLAM.stats.rst ├── Makefile ├── conf.py ├── image.png ├── index.rst └── modules.rst ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .vscode/ 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /CLAM/__init__.py: -------------------------------------------------------------------------------- 1 | # metadata 2 | __author__ = 'Zijun Zhang' 3 | __version__ = '1.1.3' 4 | __email__ = 'zj.z@ucla.edu' 5 | -------------------------------------------------------------------------------- /CLAM/bak/CLAM.fdr_peak.MP.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | This peak-caller script is part of the CLAM pipeline. 5 | 6 | It takes input from re-aligner output, and use permutation to call peaks. 
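# A minimal, self-contained sketch of the permutation idea used in this
# script (see do_permutation / permutate_heights below): read start
# positions are shuffled uniformly along the gene, pile-up heights are
# recomputed, and the empirical p-value of an observed height is the
# fraction of pooled (observed + permuted) non-zero heights at least
# that large. All names and numbers here are hypothetical.
import random

def toy_pileup(positions, read_len, gene_len):
	heights = [0] * gene_len
	for p in positions:
		for i in range(p, min(p + read_len, gene_len)):
			heights[i] += 1
	return heights

def toy_permutation_pvalue(positions, read_len, gene_len, obs_height, n_iter=100):
	pooled = [h for h in toy_pileup(positions, read_len, gene_len) if h > 0]
	for _ in range(n_iter):
		rand_pos = [random.randint(0, gene_len - read_len) for _ in positions]
		pooled += [h for h in toy_pileup(rand_pos, read_len, gene_len) if h > 0]
	return sum(1 for h in pooled if h >= obs_height) / float(len(pooled))

# e.g. toy_permutation_pvalue([10, 12, 15, 300], read_len=30, gene_len=1000, obs_height=3)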
7 | 8 | Tested under python 2.7.3 9 | """ 10 | 11 | __author__ = 'Zijun Zhang' 12 | __version__ = '1.0.0' 13 | __email__ = 'zj.z@ucla.edu' 14 | 15 | 16 | from optparse import OptionParser 17 | import os, subprocess, sys 18 | from collections import defaultdict 19 | from statsmodels.sandbox.stats.multicomp import multipletests 20 | from time import strftime 21 | import cPickle as pickle 22 | import bisect, random 23 | import pysam 24 | import pybedtools 25 | from multiprocessing import Pool 26 | 27 | def main(): 28 | """ 29 | The main wrapper for CLAM peak-caller. 30 | """ 31 | # options parsing 32 | usage='usage: %prog ' 33 | parser=OptionParser(usage) 34 | 35 | parser.add_option('--resume', dest='resume', action='store_true', default=False, help='Resume mode - skipping pre-processing [Default: %default]') 36 | parser.add_option('--verbose', dest='verbose', action='store_true', default=False, help='Verbose mode - print out all intermediate steps [Default: %default]') 37 | parser.add_option('-o', dest='output_dir', default='./out_CLAM', help='Output file folder [Default %default]') 38 | parser.add_option('-t', dest='tmp_dir', default='./tmp_CLAM', help='Temporary file folder [Default %default]') 39 | parser.add_option('-p', dest='peak_file', default=None, help='Output peak calling filename; if None then do not call peaks [Default %default]') 40 | parser.add_option('--is-stranded', dest='is_stranded', default=False, action='store_true', help='Indicates if the reads are mapped with strand information. [Default: %default]') 41 | parser.add_option('--extend', dest='extend', type='int', default=50, help='Extend to given nucleotides symmetrically at peak calling [Default: %default]') 42 | parser.add_option('--pval-cutoff', dest='pval_cutoff', type='float', default=0.001, help='Corrected p-value threshold at peak calling [Default: %default]') 43 | parser.add_option('--merge-size', dest='merge_size', type='int', default=50, help='merging window size at peak calling [Default: %default]') 44 | parser.add_option('--max-iter', dest='max_iter', type='int', default=1000, help='maximum iterations for permutation tests [Default: %default]') 45 | parser.add_option('-g', dest='gtf', default='./GTF/hg19_ensembl.sorted_gene.bed', help='GTF file [Default: %default]') 46 | parser.add_option('--ThreadN', dest='nb_proc', type='int', default=4, help='Number of threads when doing permutations. [Default: %default]') 47 | parser.add_option('--seed', dest='seed', type='int', default=100, help='Random seed for permutations. [Default: %default]') 48 | parser.add_option('--merge-method', dest='merge_method', type='int', default=1, help='Peak merging method. 1: Narrow peak 2: Broad peak [Default: %default]') 49 | parser.add_option('--pval-method', dest='correction_method', type='int', default=1, help='Multiple testing correction method. 1: Bonferroni 2: BH FDR [Default: %default]') 50 | parser.add_option('--call-transcriptome', dest='call_all', action='store_true', default=False, help='Call peaks on transcriptome instead of genes with multi-mappers. 
[Default: %default]') 51 | 52 | (options,args)=parser.parse_args() 53 | 54 | output_dir=os.path.abspath(options.output_dir) 55 | tmp_dir=os.path.abspath(options.tmp_dir) 56 | verbose=options.verbose 57 | 58 | #random.seed(options.seed) 59 | 60 | write_parameter_log(options, output_dir) 61 | 62 | # find transcripts associated with multi-mapped reads 63 | if verbose: 64 | print_time_stamp('Finding transcripts with multimapped reads.') 65 | if not os.path.isfile(output_dir + '/CLAM_mapper.sorted.out'): 66 | subprocess.call(''' sort -k1,1 -k2,2n %s/CLAM_mapper.out | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6}' > %s/CLAM_mapper.sorted.out ''' % (output_dir, output_dir), shell=True) 67 | # Note: tid_list: tid -> [chr:strand, start, end] 68 | tid_list=read_aligner_output(output_dir + '/CLAM_mapper.sorted.out', options.gtf, options.is_stranded, tmp_dir, options.resume, options.call_all) 69 | 70 | # make bam file for re-aligner output, if non-exist 71 | if not (options.resume and os.path.isfile(output_dir + '/assigned_multimapped_reads.bam')): 72 | if verbose: 73 | print_time_stamp('Making bamfile for aligner output.') 74 | header_cmd='samtools view -H ' + tmp_dir + '/filter100.sorted.bam > ' + output_dir + '/sam_header.sam' 75 | subprocess.call(header_cmd, shell=True) 76 | body_cmd = ''' awk '{if($6=="+"){print $4"\t256\t"$1"\t"$2+1"\t0\t"$3-$2+1"M\t*\t0\t0\t*\t*\tAS:f:"$5}else{print $4"\t272\t"$1"\t"$2+1"\t0\t"$3-$2+1"M\t*\t0\t0\t*\t*\tAS:f:"$5 }}' ''' + output_dir + '/CLAM_mapper.sorted.out > ' + output_dir + '/CLAM_mapper.sorted.sam' 77 | subprocess.call(body_cmd, shell=True) 78 | makeBam_cmd = 'cat %s/sam_header.sam %s/CLAM_mapper.sorted.sam | samtools view -bS - > %s/assigned_multimapped_reads.bam' % (output_dir, output_dir,output_dir) 79 | subprocess.call(makeBam_cmd, shell=True) 80 | index_cmd = 'samtools index %s/assigned_multimapped_reads.bam' % output_dir 81 | subprocess.call(index_cmd, shell=True) 82 | 83 | # multi-processing peak-caller 84 | if not (options.resume and os.path.isfile(tmp_dir+'/unique_to_qval.pdata') and os.path.isfile(tmp_dir+'/combined_to_qval.pdata')): 85 | child_transcr_ind = list(chunkify(range(len(tid_list)), options.nb_proc)) 86 | 87 | pool = Pool(processes=options.nb_proc) 88 | 89 | unibam_file=tmp_dir+'/filter100.sorted.bam' 90 | multibam_file=output_dir+'/assigned_multimapped_reads.bam' 91 | tid_to_qval_compact = pool.map(get_permutation_fdr, [ (unibam_file, multibam_file, tid_list, child_transcr_ind[i], options.pval_cutoff, options.max_iter, options.is_stranded, verbose, options.correction_method, options.seed) for i in range(options.nb_proc) ]) 92 | 93 | pool.terminate() 94 | pool.join() 95 | 96 | unique_tid_to_qval, combined_tid_to_qval = unpack_tid_to_qval(tid_to_qval_compact) 97 | pickle.dump(unique_tid_to_qval, open(tmp_dir+'/unique_to_qval.pdata','wb'), -1) 98 | pickle.dump(combined_tid_to_qval, open(tmp_dir+'/combined_to_qval.pdata','wb'), -1) 99 | else: 100 | print_time_stamp('Resume mode, found qval data files.') 101 | unique_tid_to_qval=pickle.load(open(tmp_dir+'/unique_to_qval.pdata','rb')) 102 | combined_tid_to_qval=pickle.load(open(tmp_dir+'/combined_to_qval.pdata','rb')) 103 | 104 | # merge peaks 105 | if options.merge_method==1: 106 | merge_peaks=merge_peaks_singleNucl 107 | mm='singleNucl' 108 | elif options.merge_method==2: 109 | merge_peaks=merge_peaks_broadPeak 110 | mm='broadPeak' 111 | else: 112 | merge_peaks=merge_peaks_singleNucl 113 | mm='unknown selection, using default singleNucl' 114 | 115 | if verbose: 116 | 
print_time_stamp('Merging peaks within ' + str(options.merge_size) + 'bp, using ' + mm + '..') 117 | 118 | unique_peaks=merge_peaks(unique_tid_to_qval, options.merge_size, options.pval_cutoff) 119 | combined_peaks=merge_peaks(combined_tid_to_qval, options.merge_size, options.pval_cutoff) 120 | 121 | print_time_stamp('Comparing results and writing to file..') 122 | 123 | # write peak-calling results to file. 124 | with open(output_dir + '/all_peaks.txt', 'w') as f: 125 | for peak in unique_peaks: # peak = ['chr\tstart\tend\tstrand', 'height\tqval\t', tid] 126 | if options.extend is None: 127 | wt_loc=peak[0] 128 | else: 129 | wt_loc=extend_peak_region(peak[0], options.extend) 130 | f.write(wt_loc + '\t' + '\t'.join([str(x) for x in peak[1]]) + '\t' + peak[2] + '\tunique\n') 131 | for peak in combined_peaks: 132 | if options.extend is None: 133 | wt_loc=peak[0] 134 | else: 135 | wt_loc=extend_peak_region(peak[0], options.extend) 136 | f.write(wt_loc + '\t' + '\t'.join([str(x) for x in peak[1]]) + '\t' + peak[2] + '\tcombined\n') 137 | subprocess.call(''' sort -k1,1 -k2,2n %s/all_peaks.txt | awk '{print $1"\t"$2"\t"$3"\t"$5";"$6";"$7"\t"$8"\t"$4}' | bedtools merge -s -d -1 -i stdin -c 4,5,6, -o collapse,collapse,distinct > %s''' % (output_dir, options.peak_file), shell=True) 138 | 139 | print_time_stamp('Peak-calling done.') 140 | 141 | def write_parameter_log(options, output_dir): 142 | """ 143 | Write paramter values to a log file, named by current time. 144 | """ 145 | merge_method_dict={1:'narrowPeak', 2:'broadPeak'} 146 | correction_method_dict={1:'Bonferroni', 2:'BH_FDR'} 147 | with open(output_dir+'/CLAM_Peaker.Parameters.'+ strftime("%Y%m%d_%H%M") + '.txt', 'w') as log: 148 | log.write('CLAM Peaker ' + __version__ + '\n') 149 | log.write('resume: ' + str(options.resume) + '\n') 150 | log.write('verbose: ' + str(options.verbose) + '\n') 151 | log.write('output_dir:' + str(options.output_dir) + '\n') 152 | log.write('tmp_dir: ' + str(options.tmp_dir) + '\n') 153 | log.write('peak_file: ' + str(options.peak_file) + '\n') 154 | log.write('is_stranded: ' + str(options.is_stranded) + '\n') 155 | log.write('extend: ' + str(options.extend) + '\n') 156 | log.write('pval_cutoff: ' + str(options.pval_cutoff) + '\n') 157 | log.write('merge_size: ' + str(options.merge_size) + '\n') 158 | log.write('max_iter: ' + str(options.max_iter) + '\n') 159 | log.write('gtf: ' + str(options.gtf) + '\n') 160 | log.write('seed: ' + str(options.seed) + '\n') 161 | log.write('merge_method: ' + merge_method_dict[options.merge_method] + '\n') 162 | log.write('correction_method: ' + correction_method_dict[options.correction_method] + '\n') 163 | log.write('thread: ' + str(options.nb_proc) + '\n') 164 | 165 | def chunkify(a, n): 166 | """ 167 | Separate a list (a) into consecutive n chunks. 168 | Returns the chunkified index 169 | """ 170 | k, m = len(a) / n, len(a) % n 171 | return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in xrange(n)) 172 | 173 | def unpack_tid_to_qval(compact): 174 | """ 175 | Unpacks the returned values from multi-processing. 
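# chunkify() above splits the gene index list into nb_proc consecutive
# slices, one per worker in the multiprocessing Pool; the per-worker
# results are then re-packed by unpack_tid_to_qval(). A small usage
# sketch (Python 2, since chunkify relies on integer division of
# len(a)/n and on xrange):
#
#   >>> list(chunkify(range(10), 3))
#   [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]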
176 | """ 177 | unique_tid_to_qval=defaultdict(list) 178 | combined_tid_to_qval=defaultdict(list) 179 | for item in compact: 180 | unique, combined = item[0], item[1] 181 | for tid in combined: 182 | if len(unique[tid])>0: 183 | unique_tid_to_qval[tid]=unique[tid] 184 | if len(combined[tid])>1: 185 | combined_tid_to_qval[tid]=combined[tid] 186 | return unique_tid_to_qval,combined_tid_to_qval 187 | 188 | def get_permutation_fdr((unibam_file, multibam_file, tid_list, tid_ind, pval_cutoff, max_iter, is_stranded, verbose, correction_method, seed)): 189 | """ 190 | General permutation wrapper for a list of genes. Gets called by multi-processing generated by Pool() 191 | Returns packed FDRs from each child process. 192 | """ 193 | random.seed(seed) 194 | 195 | unique_tid_to_qval=defaultdict(list) 196 | combined_tid_to_qval=defaultdict(list) 197 | 198 | unibam=pysam.Samfile(unibam_file, 'rb') 199 | multibam=pysam.Samfile(multibam_file, 'rb') 200 | 201 | processed=0 202 | pid=os.getpid() 203 | 204 | for ind in tid_ind: 205 | processed+=1 206 | if verbose and not processed % 100: 207 | print_time_stamp(str(processed) + '/' + str(len(tid_ind)) + ' finished in pid ' + str(pid)) 208 | tid, chr, strand, start, end = tid_list[ind] 209 | unique_reads = read_tid_frag_from_bam(tid_list[ind], unibam, is_stranded, True) 210 | multi_reads = read_tid_frag_from_bam(tid_list[ind], multibam, is_stranded, False) 211 | 212 | this_unique_to_qval = do_permutation(tid_list[ind], unique_reads, max_iter, pval_cutoff, correction_method) 213 | this_combined_to_qval = do_permutation(tid_list[ind], unique_reads+multi_reads, max_iter, pval_cutoff, correction_method) 214 | 215 | unique_tid_to_qval[tid].extend(this_unique_to_qval) 216 | combined_tid_to_qval[tid].extend(this_combined_to_qval) 217 | unibam.close() 218 | multibam.close() 219 | return unique_tid_to_qval, combined_tid_to_qval 220 | 221 | def do_permutation(transcr, read_transcript, max_iter, pval_cutoff, correction_method): 222 | """ 223 | Permutes the reads along a given gene length, sub-routine that get called by get_permutation_fdr(..). 224 | Returns the locally corrected p-values for each observed height on the given gene. 225 | """ 226 | tid, chr, strand, tstart, tend = transcr 227 | tid_length=tend-tstart+1 228 | obs_heights_count=count_pileup_heights(tid_length, read_transcript) 229 | 230 | tid_to_qval=[] 231 | 232 | rand_heights_dist=defaultdict(int) 233 | rand_sum=0 234 | # need to account for the 'observed' data, since permutation tests should never report p-value as 0. 
3/22/16 235 | for i in obs_heights_count: 236 | if i==0: 237 | continue 238 | else: 239 | rand_heights_dist[int(i)]+=1 240 | rand_sum+=1 241 | for B in range(max_iter): 242 | new_heights_count=permutate_heights(tid_length, read_transcript) 243 | for i in new_heights_count: 244 | if i==0: 245 | continue 246 | else: 247 | rand_heights_dist[i]+=1 248 | rand_sum+=1 249 | height_to_pval={} 250 | for h in set(obs_heights_count): 251 | if h < 1: 252 | continue 253 | else: 254 | lefter=0 255 | for j in range(int(h), max(rand_heights_dist)+1): 256 | lefter+=rand_heights_dist[j] 257 | height_to_pval[h]=lefter/float(rand_sum) 258 | pval_list=[] 259 | for i in obs_heights_count: 260 | if i<1: 261 | continue 262 | pval_list.append(height_to_pval[i]) 263 | if len(pval_list)<=1: 264 | return [] 265 | 266 | if correction_method==2: 267 | qval_list=multipletests(pval_list, method='fdr_bh')[1] 268 | else: 269 | qval_list=[min(x*(len(set([int(y) for y in height_to_pval if y!=0]))), 1.0) for x in pval_list] 270 | 271 | ind=0 272 | last_height=0 273 | for j in range(len(obs_heights_count)): 274 | this_height=obs_heights_count[j] 275 | if this_height<1: 276 | last_height=0 277 | continue 278 | if qval_list[ind] <= pval_cutoff: 279 | if this_height==last_height: 280 | chr, last_start, last_end, last_strand, last_height, last_qval=tid_to_qval[-1] 281 | tid_to_qval[-1]=[chr, last_start, tstart+j+1, strand, last_height, last_qval] 282 | else: 283 | tid_to_qval.append([chr, tstart+j, tstart+j+1, strand, obs_heights_count[j], qval_list[ind]]) # chr, start, end, strand, height, this_qval 284 | last_height=this_height 285 | ind+=1 286 | return tid_to_qval 287 | 288 | def heights_to_dist(rand_heights): 289 | """ 290 | sub-routine 291 | """ 292 | rand_heights_dist=defaultdict(int) 293 | rand_sum=0 294 | for new_heights_count in rand_heights: 295 | for i in new_heights_count: 296 | if i==0: 297 | continue 298 | else: 299 | rand_heights_dist[i]+=1 300 | rand_sum+=1 301 | return rand_heights_dist, rand_sum 302 | 303 | def permutate_heights(tlen, reads): 304 | """ 305 | Sub-routine for do_permutation(...) 306 | Randomly allocate the read locations. 307 | """ 308 | loc_heights=[0] * tlen 309 | for id, pos, read_len, score in reads: 310 | if score<1 and random.random() > score: 311 | continue 312 | rand_pos=random.randint(1, max(1, tlen-read_len)) 313 | for i in range(rand_pos, min(rand_pos + read_len, tlen)): 314 | loc_heights[i]+=1 315 | return loc_heights 316 | 317 | def count_pileup_heights(tlen, reads): 318 | """ 319 | Sub-routine for do_permutation(...) 320 | Counts the distribution of pile-up heights for a given gene/permutation 321 | """ 322 | loc_heights=[0] * tlen 323 | for id, pos, read_len, score in reads: 324 | for i in range(pos, min(pos+read_len-1, tlen)): 325 | loc_heights[i]+=score 326 | return loc_heights 327 | 328 | def merge_peaks_broadPeak(transcript_to_qval, merge_size, pval_cutoff): 329 | """ 330 | Merge called peaks on a gene using option 2, 331 | i.e. if two peaks close to each other, region 332 | between two peaks are also called as peaks 333 | Retuns a list of merged peaks. 
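# Sketch of the broad-peak behaviour described above: significant
# positions whose gap is at most merge_size are joined, and the region
# between them is reported as part of one peak. Minimal illustration
# over hypothetical (start, end) pairs:
def toy_merge_broad(intervals, merge_size):
	merged = []
	for start, end in sorted(intervals):
		if merged and start - merged[-1][1] <= merge_size:
			merged[-1][1] = max(merged[-1][1], end)
		else:
			merged.append([start, end])
	return merged

# toy_merge_broad([(100, 101), (130, 131), (400, 401)], merge_size=50)
# -> [[100, 131], [400, 401]]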
334 | """ 335 | peaks=[] 336 | last_qval=[0,1] 337 | for tid in transcript_to_qval: 338 | init=True 339 | for chr, start, end, strand, height, this_qval in transcript_to_qval[tid]: 340 | loc=[chr, str(start), str(end), strand] 341 | this_qval=[height, this_qval] # this_qval=[height, qval] so that when qval=0, we can compare height 342 | if this_qval[1] > pval_cutoff: 343 | continue 344 | if init: 345 | last_qval=this_qval 346 | last_pos=[start, end] 347 | last_loc=loc 348 | last_chr=chr 349 | write_out=False 350 | init=False 351 | continue 352 | if int(start) - int(last_pos[1]) > merge_size: 353 | write_out=True 354 | else: 355 | last_pos=[last_pos[0], end] 356 | last_qval=this_qval if last_qval[0] pval_cutoff: 386 | continue 387 | if init: 388 | last_qval=this_qval 389 | last_pos=[start, end] 390 | last_loc=loc 391 | last_chr=chr 392 | write_out=False 393 | init=False 394 | continue 395 | if last_chr == chr: 396 | if abs( int(start) - int(last_pos[0]) ) > merge_size: 397 | write_out=True 398 | elif last_qval[0] < this_qval[0]: 399 | last_pos=[start, end] 400 | last_qval=this_qval 401 | last_loc=loc 402 | write_out=False 403 | else: 404 | write_out=True 405 | 406 | if write_out and last_qval[1] < pval_cutoff: 407 | #peaks[last_loc]=last_qval 408 | peaks.append([last_loc, last_qval, tid]) 409 | last_qval=this_qval 410 | last_pos=[start, end] 411 | last_loc=loc 412 | last_chr=chr 413 | write_out=False 414 | if last_qval[1] < pval_cutoff: 415 | peaks.append([last_loc, last_qval, tid]) 416 | return peaks 417 | 418 | def extend_peak_region(loc, target_len): 419 | """ 420 | Extends peak symmetrically if peak is smaller than target_len. 421 | """ 422 | chr, start, end, strand = loc.split('\t') 423 | start = int(start) 424 | end = int(end) 425 | old_len = end - start 426 | if old_len > target_len: 427 | return loc 428 | else: 429 | center = int((start + end)/2) 430 | start = center - int(target_len /2) 431 | end = center + int(target_len/2) 432 | return '\t'.join([chr, str(start), str(end), strand]) 433 | 434 | def read_aligner_output(rm_out, gtffile, is_stranded, tmp_dir, resume, call_all): 435 | """ 436 | Use bedtools to get transcripts/genes with multi-mapped reads. 437 | Returns a list of transcripts/genes. 438 | """ 439 | if not (resume and os.path.isfile(tmp_dir + '/gtf2multireads.bed')): 440 | rm_bed=pybedtools.BedTool(rm_out) 441 | gtf=pybedtools.BedTool(gtffile) 442 | gtf_bed_rm = gtf.intersect(rm_bed, s=True, u=True) if is_stranded else gtf.intersect(rm_bed, u=True) 443 | gtf_bed_rm.saveas(tmp_dir + '/gtf2multireads.bed') 444 | pybedtools.cleanup() 445 | 446 | tid_list=[] 447 | if call_all: 448 | gtf_to_read=gtffile 449 | else: 450 | gtf_to_read=tmp_dir+'/gtf2multireads.bed' 451 | with open(gtf_to_read,'r') as f: 452 | for line in f: 453 | ele=line.rstrip().split('\t') 454 | gene_id=ele[3] 455 | gene_chr, gene_start, gene_end=ele[0], int(ele[1]), int(ele[2]) 456 | gene_strand=ele[5] 457 | tid_list.append([gene_id, gene_chr, gene_strand, gene_start, gene_end]) 458 | print_time_stamp('Read transcripts with multi-reads: ' + str(len(tid_list))) 459 | return tid_list 460 | 461 | def read_tid_frag_from_bam(tid, bamfile, is_stranded, is_unique): 462 | """ 463 | Use pysam to fetch reads info for a given gene and its loci. 464 | Returns reads, read weights and its mapped loci. 
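# For orientation, each element returned by read_tid_frag_from_bam() has
# the form [read_name, offset_within_gene, read_length, score], where
# score is 1 for unique reads and the fractional AS weight assigned by
# the re-aligner for multi-mapped reads. Hypothetical example:
#
#   [['r1', 120, 36, 1],       # unique read
#    ['r2', 118, 36, 0.73]]    # multi-read carrying 73% of its weight here
#
# count_pileup_heights() above then adds `score` to every covered position.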
465 | """ 466 | tid_reads=[] 467 | gene, chr, strand, start, end=tid 468 | if strand=='-': 469 | is_reverse=True 470 | else: 471 | is_reverse=False 472 | reads=[x for x in bamfile.fetch(chr, int(start), int(end)) if x.is_reverse==is_reverse or not is_stranded] 473 | reads=[x for x in reads if x.pos>=int(start) and x.pos<=int(end)] 474 | for read in reads: 475 | if is_unique: 476 | try: 477 | opt_NH=read.opt('NH') 478 | if opt_NH > 1: 479 | continue 480 | except: 481 | pass 482 | score=1 483 | else: 484 | try: 485 | opt_AS=read.opt('AS') 486 | if isinstance(opt_AS, float): 487 | score=opt_AS 488 | else: 489 | continue 490 | except: 491 | continue 492 | read_length = read.qlen if read.qlen > 0 else read.positions[-1] - read.positions[0] + 1 493 | if read.pos-start>=0 and read_length<500: # to avoid junction reads 494 | tid_reads.append([read.qname, read.pos-start, read_length, score]) 495 | return tid_reads 496 | 497 | def print_time_stamp(msg): 498 | """ 499 | Reporter function for logging. 500 | """ 501 | current_time='[' + strftime("%Y-%m-%d %H:%M:%S") + '] ' 502 | print >> sys.stderr, current_time + msg 503 | 504 | if __name__=='__main__': 505 | main() -------------------------------------------------------------------------------- /CLAM/bak/deep_getsizeof.py: -------------------------------------------------------------------------------- 1 | from collections import Mapping, Container 2 | from sys import getsizeof 3 | 4 | def deep_getsizeof(o, ids): 5 | """Find the memory footprint of a Python object 6 | 7 | This is a recursive function that drills down a Python object graph 8 | like a dictionary holding nested dictionaries with lists of lists 9 | and tuples and sets. 10 | 11 | The sys.getsizeof function does a shallow size of only. It counts each 12 | object inside a container as pointer only regardless of how big it 13 | really is. 14 | 15 | :param o: the object 16 | :param ids: 17 | :return: 18 | """ 19 | d = deep_getsizeof 20 | if id(o) in ids: 21 | return 0 22 | 23 | r = getsizeof(o) 24 | ids.add(id(o)) 25 | 26 | if isinstance(o, str) or isinstance(0, unicode): 27 | return r 28 | 29 | if isinstance(o, Mapping): 30 | return r + sum(d(k, ids) + d(v, ids) for k, v in o.iteritems()) 31 | 32 | if isinstance(o, Container): 33 | return r + sum(d(x, ids) for x in o) 34 | 35 | return r -------------------------------------------------------------------------------- /CLAM/bak/peakcaller.bak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This peak-caller script is part of the CLAM pipeline. 5 | 6 | It takes input from re-aligner output, and use permutation to call peaks. 
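# The docstring above still mentions permutation, but this module scores
# each bin with a likelihood-ratio test: a constrained model (enrichment
# term delta fixed at 0) is compared with an unconstrained model, and
# 2 * (LL_unconstrained - LL_constrained) is referred to a chi-square
# distribution with one degree of freedom. A minimal numeric sketch of
# that last step, with hypothetical log-likelihood values:
from scipy.stats import chi2

neg_ll_constrained = 105.2     # hypothetical minimized negative log-likelihoods
neg_ll_unconstrained = 98.7
lrt_stat = 2 * (neg_ll_constrained - neg_ll_unconstrained)
pval = 1 - chi2.cdf(lrt_stat, 1)    # same form as in the test_bin_* functions below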
7 | 8 | Tested under python 2.7.3 9 | """ 10 | 11 | __author__ = 'Zijun Zhang' 12 | __version__ = '1.1.0' 13 | __email__ = 'zj.z@ucla.edu' 14 | 15 | 16 | import os 17 | import sys 18 | from collections import defaultdict 19 | from statsmodels.sandbox.stats.multicomp import multipletests 20 | import pysam 21 | import logging 22 | import numpy as np 23 | from collections import defaultdict 24 | import re 25 | from scipy.stats import fisher_exact, poisson, chi2 26 | import scipy.optimize as optimize 27 | from tqdm import tqdm 28 | import datetime 29 | from stats import ztnb_em 30 | 31 | 32 | ### get logger 33 | ### 34 | logger = logging.getLogger('CLAM.Peakcaller') 35 | ### 36 | 37 | def read_gtf(fn): 38 | """read in the gene annotation from GTF file 39 | """ 40 | logger.info('read GTF from "%s" '% fn) 41 | gene_annot = {} 42 | with open(fn, 'r') as f: 43 | for line in f: 44 | if line.startswith('#'): 45 | continue 46 | ele = line.strip().split('\t') 47 | if ele[2] != 'gene': 48 | continue 49 | chr, start, end, strand = ele[0], int(ele[3]), int(ele[4]), ele[6] 50 | try: 51 | gene_id = re.search(r'gene_id "(.+?)"', ele[-1]).group(1) 52 | except AttributeError: 53 | continue 54 | gene_annot[gene_id] = [chr, start, end, strand] 55 | return gene_annot 56 | 57 | 58 | def count_gene_read_tags(bam_list, (chr, start, end, strand), is_unique=True, unstranded=False): 59 | """ count the tagger positions for all reads in a given genomic interval 60 | Args: 61 | Returns: 62 | """ 63 | # placeholder for interval: 'num of replicate' x 'interval length' 64 | interval = np.zeros( (len(bam_list), end-start+1) ) 65 | is_reverse = True if strand=='-' else False 66 | # construct the (tag, score) pairs 67 | for i in range(len(bam_list)): 68 | bam = bam_list[i] 69 | if is_unique: 70 | read_tags = [ (x.opt('RT'), 1.0) for x in bam.fetch(chr, start, end) \ 71 | if unstranded or x.is_reverse==is_reverse] 72 | else: 73 | read_tags = [ (x.opt('RT'), x.opt('AS')) for x in bam.fetch(chr, start, end) \ 74 | if unstranded or x.is_reverse==is_reverse] 75 | 76 | for tag in read_tags: 77 | if tag[0]=end: 78 | continue 79 | interval[i, tag[0]-start] += tag[1] 80 | return interval 81 | 82 | 83 | def bin_interval_counts(interval, winsize=50): 84 | bins = np.zeros( ( interval.shape[0], int(np.ceil(interval.shape[1]/float(winsize))) ) ) 85 | for i in range(bins.shape[1]): 86 | for j in range(interval.shape[0]): 87 | start, end = i*winsize, (i+1)*winsize-1 88 | bins[j, i] = np.sum(interval[j, start:end]) 89 | return bins 90 | 91 | 92 | def test_bin_negbinom(intv_bin_ip, intv_bin_con, correction_method='fdr_bh'): 93 | """DOCSTRING 94 | Args 95 | Returns 96 | """ 97 | def _par_to_vec(par, data, is_constrained): 98 | if is_constrained: 99 | beta = par[0] 100 | mu_vec = par[1::] 101 | delta = 0 102 | else: 103 | beta, delta = par[0], par[1] 104 | mu_vec = par[2::] 105 | ip_counter = data['this_ip'].shape[0] 106 | con_counter = data['this_con'].shape[0] 107 | mu0 = np.asarray(mu_vec[0:con_counter]) 108 | mu1 = np.asarray(mu_vec[con_counter::]) 109 | lamb1_this = np.exp(mu1 + beta + delta) 110 | lamb1_others = np.exp(mu1) 111 | lamb0_this = np.exp(mu0 + beta) 112 | lamb0_others = np.exp(mu0) 113 | return (lamb1_this, lamb1_others, lamb0_this, lamb0_others) 114 | 115 | def _negative_binom_logpmf(y, mu, alpha): 116 | y = np.asarray(y) 117 | ll = np.empty(len(y)) 118 | for i in range(len(y)): 119 | alpha_inv = 1.0/alpha[i] 120 | alpha_mu = float(alpha[i] * mu[i]) 121 | ll[i] = y[i]* np.log(alpha_mu/(1+alpha_mu))- \ 122 | 
alpha_inv*np.log(1+alpha_mu) 123 | return ll 124 | 125 | def _neg_loglik_unconstrain(par, data): 126 | (l1, l2, l3, l4) = _par_to_vec(par, data, False) 127 | ll = np.sum( _negative_binom_logpmf(data['this_ip'], mu=l1, alpha=alpha_ip_vec)) 128 | ll += np.sum( _negative_binom_logpmf(data['others_ip'], mu=l2, alpha=alpha_ip_vec)) 129 | ll += np.sum( _negative_binom_logpmf(data['this_con'], mu=l3, alpha=alpha_con_vec)) 130 | ll += np.sum( _negative_binom_logpmf(data['others_con'], mu=l4, alpha=alpha_con_vec)) 131 | return -ll 132 | 133 | def _neg_loglik_constrain(par, data): 134 | (l1, l2, l3, l4) = _par_to_vec(par, data, True) 135 | ll = np.sum(_negative_binom_logpmf(data['this_ip'], mu=l1, alpha=alpha_ip_vec)) + \ 136 | np.sum(_negative_binom_logpmf(data['others_ip'], mu=l2, alpha=alpha_ip_vec)) + \ 137 | np.sum(_negative_binom_logpmf(data['this_con'], mu=l3, alpha=alpha_con_vec)) + \ 138 | np.sum(_negative_binom_logpmf(data['others_con'], mu=l4, alpha=alpha_con_vec)) 139 | return -ll 140 | 141 | # initialize placeholders 142 | intv_counter = intv_bin_ip.shape[1] 143 | assert intv_counter == intv_bin_con.shape[1] 144 | binscore = np.empty(intv_counter) 145 | binsignal = np.empty(intv_counter) 146 | alpha_ip_vec = np.empty(intv_bin_ip.shape[0]) 147 | alpha_con_vec = np.empty(intv_bin_con.shape[0]) 148 | ip_sum = np.apply_along_axis(np.sum, 1, intv_bin_ip) 149 | con_sum = np.apply_along_axis(np.sum, 1, intv_bin_con) 150 | 151 | 152 | # compute the dispersion parameters 153 | for i in range(intv_bin_con.shape[0]): 154 | height = ztnb_em.collapse_data(intv_bin_con[i,]) 155 | height[0] = 0 156 | ll, mu, alpha = ztnb_em.EM_estim_params(height, max_iter=100, verbose=False) 157 | alpha_con_vec[i] = alpha 158 | 159 | max_alpha = np.max(alpha_con_vec) 160 | for i in range(intv_bin_ip.shape[0]): 161 | height = ztnb_em.collapse_data(intv_bin_ip[i,]) 162 | height[0] = 0 163 | ll, mu, alpha = ztnb_em.EM_estim_params(height, max_iter=100, verbose=False) 164 | alpha = max_alpha if alpha>max_alpha else alpha 165 | alpha_ip_vec[i] = alpha 166 | 167 | 168 | # perform test on each bin 169 | for i in range(intv_counter): 170 | this_ip = intv_bin_ip[:, i] 171 | others_ip = ip_sum - this_ip 172 | this_con = intv_bin_con[:, i] 173 | others_con = con_sum - this_con 174 | if np.sum(this_ip) == 0: 175 | binsignal[i], binscore[i] = np.nan, np.nan 176 | continue 177 | data = { 178 | 'this_ip':np.round(this_ip), 179 | 'others_ip':np.round(others_ip), 180 | 'this_con':np.round(this_con), 181 | 'others_con':np.round(others_con) 182 | } 183 | ## constrained likelihood 184 | res_constrain = optimize.minimize( 185 | x0=np.ones(1+this_ip.shape[0]+others_ip.shape[0]), 186 | fun=_neg_loglik_constrain, 187 | args=(data), 188 | method='bfgs', 189 | options={'disp':False} 190 | ) 191 | ## unconstrained likelihood 192 | res_unconstrain = optimize.minimize( 193 | x0=np.ones(2+this_ip.shape[0]+others_ip.shape[0]), 194 | fun=_neg_loglik_unconstrain, 195 | args=(data), 196 | method='bfgs', 197 | options={'disp':False} 198 | ) 199 | 200 | delta_mle = res_unconstrain.x[1] 201 | pval = 1 - chi2.cdf(2*(res_constrain.fun - res_unconstrain.fun), 1) 202 | binscore[i] = pval 203 | binsignal[i] = delta_mle 204 | 205 | # correcting for multiple-testing 206 | adj = multipletests(binscore[~ np.isnan(binscore)], alpha=0.05, method=correction_method) 207 | binscore_adj = np.asarray(binscore) 208 | binscore_adj[ ~ np.isnan(binscore) ] = adj[1] 209 | return binsignal, binscore_adj 210 | 211 | 212 | def test_bin_poisson(intv_bin_ip, intv_bin_con, 
correction_method='fdr_bh'): 213 | """DOCSTRING 214 | Args 215 | Returns 216 | """ 217 | def _par_to_vec(par, data, is_constrained): 218 | if is_constrained: 219 | beta = par[0] 220 | mu_vec = par[1::] 221 | delta = 0 222 | else: 223 | beta, delta = par[0], par[1] 224 | mu_vec = par[2::] 225 | ip_counter = data['this_ip'].shape[0] 226 | con_counter = data['this_con'].shape[0] 227 | mu0 = np.asarray(mu_vec[0:con_counter]) 228 | mu1 = np.asarray(mu_vec[con_counter::]) 229 | lamb1_this = np.exp(mu1 + beta + delta) 230 | lamb1_others = np.exp(mu1) 231 | lamb0_this = np.exp(mu0 + beta) 232 | lamb0_others = np.exp(mu0) 233 | return (lamb1_this, lamb1_others, lamb0_this, lamb0_others) 234 | 235 | def _neg_loglik_unconstrain(par, data): 236 | (l1, l2, l3, l4) = _par_to_vec(par, data, False) 237 | ll = np.sum(poisson.logpmf(data['this_ip'], mu=l1)) + \ 238 | np.sum(poisson.logpmf(data['others_ip'], mu=l2)) + \ 239 | np.sum(poisson.logpmf(data['this_con'], mu=l3)) + \ 240 | np.sum(poisson.logpmf(data['others_con'], mu=l4)) 241 | return -ll 242 | 243 | def _neg_loglik_constrain(par, data): 244 | (l1, l2, l3, l4) = _par_to_vec(par, data, True) 245 | ll = np.sum(poisson.logpmf(data['this_ip'], mu=l1)) + \ 246 | np.sum(poisson.logpmf(data['others_ip'], mu=l2)) + \ 247 | np.sum(poisson.logpmf(data['this_con'], mu=l3)) + \ 248 | np.sum(poisson.logpmf(data['others_con'], mu=l4)) 249 | return -ll 250 | 251 | intv_counter = intv_bin_ip.shape[1] 252 | assert intv_counter == intv_bin_con.shape[1] 253 | binscore = np.empty(intv_counter) 254 | binsignal = np.empty(intv_counter) 255 | ip_sum = np.apply_along_axis(np.sum, 1, intv_bin_ip) 256 | con_sum = np.apply_along_axis(np.sum, 1, intv_bin_con) 257 | for i in range(intv_counter): 258 | this_ip = intv_bin_ip[:, i] 259 | others_ip = ip_sum - this_ip 260 | this_con = intv_bin_con[:, i] 261 | others_con = con_sum - this_con 262 | if this_ip == 0: 263 | binsignal[i], binscore[i] = np.nan, 1.0 264 | continue 265 | ## because Poisson (and other count-based methods) only 266 | ## takes integers, here we take the floor of the fractional 267 | ## multi-reads as a conservative approach 268 | data = { 269 | 'this_ip':np.floor(this_ip), 270 | 'others_ip':np.floor(others_ip), 271 | 'this_con':np.floor(this_con), 272 | 'others_con':np.floor(others_con) 273 | } 274 | 275 | res_constrain = optimize.minimize( 276 | x0=np.ones(1+this_ip.shape[0]+others_ip.shape[0]), 277 | fun=_neg_loglik_constrain, 278 | args=(data), 279 | method='Nelder-Mead', 280 | options={'disp':False} 281 | ) 282 | 283 | res_unconstrain = optimize.minimize( 284 | x0=np.ones(2+this_ip.shape[0]+others_ip.shape[0]), 285 | fun=_neg_loglik_unconstrain, 286 | args=(data), 287 | method='Nelder-Mead', 288 | options={'disp':False} 289 | ) 290 | 291 | delta_mle = res_unconstrain.x[1] 292 | pval = 1 - chi2.cdf(2*(res_constrain.fun - res_unconstrain.fun), 1) 293 | binscore[i] = pval 294 | binsignal[i] = delta_mle 295 | adj = multipletests(binscore, alpha=0.05, method=correction_method) 296 | binscore_adj = adj[1] 297 | return binsignal, binscore_adj 298 | 299 | 300 | def test_bin_fisher(intv_bin_ip, intv_bin_con, with_control=True, correction_method='fdr_bh'): 301 | """DOCSTRING 302 | Args 303 | Returns 304 | """ 305 | if intv_bin_ip.shape[0] != 1: 306 | raise Exception('Fisher exact test does not deal with replicates.') 307 | intv_counter = intv_bin_ip.shape[1] 308 | assert intv_counter == intv_bin_con.shape[1] 309 | binscore = np.empty(intv_counter) 310 | binsignal = np.empty(intv_counter) 311 | ip_sum = 
np.sum(intv_bin_ip[0,]) 312 | con_sum = np.sum(intv_bin_con[0,]) 313 | for i in range(intv_counter): 314 | this_ip = intv_bin_ip[0, i] 315 | others_ip = ip_sum - this_ip 316 | this_con = intv_bin_con[0, i] 317 | others_con = con_sum - this_con 318 | if this_ip == 0: 319 | binsignal[i], binscore[i] = np.nan, 1.0 320 | continue 321 | _, binscore[i] = fisher_exact([[this_ip, others_ip], [this_con, others_con]], alternative='greater') 322 | if with_control: 323 | binsignal[i] = this_ip/others_ip / this_con*others_con 324 | else: 325 | binsignal[i] = this_ip 326 | 327 | adj = multipletests(binscore, alpha=0.05, method=correction_method) 328 | binscore_adj = adj[1] 329 | return binsignal, binscore_adj 330 | 331 | 332 | def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, winsize=50, unstranded=False): 333 | """DOCSTRING 334 | Args 335 | Returns 336 | """ 337 | # fetch the IP tag counts to gene regions 338 | if unique_only: 339 | interval_ip = \ 340 | count_gene_read_tags(bam_dict['ubam.ip'], gene, is_unique=True, unstranded=unstranded) 341 | else: 342 | interval_ip = \ 343 | count_gene_read_tags(bam_dict['ubam.ip'], gene, is_unique=True, unstranded=unstranded) + \ 344 | count_gene_read_tags(bam_dict['mbam.ip'], gene, is_unique=False, unstranded=unstranded) 345 | 346 | # skip if there are no reads 347 | if np.sum(interval_ip) == 0: 348 | #print "no reads" 349 | return '' 350 | 351 | # fetch/construct the input tag counts 352 | if with_control: 353 | ## count control tags if available 354 | if unique_only: 355 | interval_con = \ 356 | count_gene_read_tags(bam_dict['ubam.con'], gene, is_unique=True, unstranded=unstranded) 357 | else: 358 | interval_con = \ 359 | count_gene_read_tags(bam_dict['ubam.con'], gene, is_unique=True, unstranded=unstranded) + \ 360 | count_gene_read_tags(bam_dict['mbam.con'], gene, is_unique=False, unstranded=unstranded) 361 | else: 362 | ## otherwise, construct a uniform *fake* control 363 | interval_con = \ 364 | np.ones((1, interval_ip.shape[1]))*np.sum(interval_ip)/interval_ip.shape[1] 365 | 366 | # bin tag counts into bins 367 | intv_bin_ip = bin_interval_counts(interval_ip, winsize=winsize) 368 | intv_bin_con = bin_interval_counts(interval_con, winsize=winsize) 369 | 370 | # perform statistical test 371 | signal_val, binscore_adj = test_bin_negbinom(intv_bin_ip, intv_bin_con) 372 | #signal_val, binscore_adj = test_bin_poisson(intv_bin_ip, intv_bin_con) 373 | #signal_val, binscore_adj = test_bin_fisher(bin_interval_ip, bin_interval_con, with_control=with_control) 374 | 375 | # build human-readable outputs 376 | ## "narrowPeak" format from 377 | ## https://genome.ucsc.edu/FAQ/FAQformat.html#format12 378 | ## chr start end name 1000 strand signalValue pVal qVal peak 379 | narrowPeak_formatter = "%s\t%i\t%i\t.\t1000\t%s\t%s\t.\t%.3f\t.\n" 380 | BED = '' 381 | for i in range(len(binscore_adj)): 382 | qval = binscore_adj[i] 383 | signal = signal_val[i] 384 | if qval<0.05: 385 | chr = gene[0] 386 | binstart = gene[1] + i*winsize 387 | binend = gene[1] + (i+1)*winsize-1 388 | strand = gene[3] 389 | BED += narrowPeak_formatter % (chr, binstart, binend, strand, signal, qval) 390 | return BED 391 | 392 | 393 | 394 | def peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=False, with_replicates=False, with_control=False, unstranded=False): 395 | """DOCSTRING 396 | Args: 397 | Returns: 398 | """ 399 | # file handlers 400 | mbam = pysam.Samfile(os.path.join(out_dir, 'realigned.sorted.bam'),'rb') 401 | ubam = pysam.Samfile(os.path.join(tmp_dir, 
'unique.sorted.bam'),'rb') 402 | bam_dict = {'ubam.ip':[ubam], 'mbam.ip':[mbam]} 403 | if unique_only: 404 | ofile = open(os.path.join(out_dir, 'narrow_peaks.unique.bed'), 'w') 405 | else: 406 | ofile = open(os.path.join(out_dir, 'narrow_peaks.combined.bed'), 'w') 407 | 408 | # read in GTF 409 | gene_annot = read_gtf(gtf_fp) 410 | 411 | # iteratively call peaks in each gene 412 | peak_counter = 0 413 | for gene_name in tqdm(gene_annot): 414 | gene = gene_annot[gene_name] 415 | BED = call_gene_peak(bam_dict, gene, 416 | unique_only=unique_only, with_control=with_control, 417 | unstranded=unstranded) 418 | ofile.write(BED) 419 | #print BED 420 | peak_counter += len(BED.split('\n')) 421 | ofile.close() 422 | logger.info('called %i peaks'%peak_counter) 423 | return 424 | 425 | 426 | def chunkify(a, n): 427 | """Separate a list (a) into consecutive n chunks. 428 | Args: 429 | Returns: 430 | the chunkified index 431 | """ 432 | k, m = len(a) / n, len(a) % n 433 | return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in xrange(n)) 434 | 435 | 436 | if __name__ == '__main__': 437 | ### set up logger 438 | logger = logging.getLogger('CLAM') 439 | logger.setLevel(logging.DEBUG) 440 | # create file handler which logs even debug messages 441 | fh = logging.FileHandler( 442 | 'CLAM.Peakcaller.'+'-'.join(str(datetime.datetime.now()).replace(':','-').split()) + '.log') 443 | fh.setLevel(logging.DEBUG) 444 | # create console handler with a higher log level 445 | ch = logging.StreamHandler() 446 | ch.setLevel(logging.DEBUG) 447 | # create formatter and add it to the handlers 448 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -\n %(message)s') 449 | fh.setFormatter(formatter) 450 | ch.setFormatter(formatter) 451 | # add the handlers to the logger 452 | logger.addHandler(fh) 453 | logger.addHandler(ch) 454 | ### 455 | logger.info('start') 456 | logger.info('run info: %s'%(' '.join(sys.argv))) 457 | 458 | tmp_dir, out_dir, unique_only = sys.argv[1], sys.argv[2], sys.argv[3] 459 | unique_only = False if unique_only=='0' else True 460 | gtf_fp = '/u/nobackup/yxing/NOBACKUP/frankwoe/hg19/gencode.v19.annotation.gtf' 461 | peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=unique_only) 462 | logger.info('end') 463 | -------------------------------------------------------------------------------- /CLAM/bak/peakcaller.bak2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This peak-caller script is part of the CLAM pipeline. 5 | 6 | It takes input from re-aligner output, and use permutation to call peaks. 
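# As in the previous version of this script, per-position tag counts are
# summed into fixed-size windows (bin_interval_counts below) before any
# test is run. A minimal numpy sketch of the same binning on one
# hypothetical track; note that the original slices interval[j, start:end]
# with end = (i+1)*winsize-1, so half-open slicing drops the last
# position of every window.
import numpy as np

track = np.arange(10, dtype=float)    # hypothetical per-position counts
winsize = 4
n_bins = int(np.ceil(track.shape[0] / float(winsize)))
bins = np.array([track[i * winsize:(i + 1) * winsize].sum() for i in range(n_bins)])
# bins -> array([ 6., 22., 17.])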
7 | 8 | Tested under python 2.7.3 9 | """ 10 | 11 | __author__ = 'Zijun Zhang' 12 | __version__ = '1.1.0' 13 | __email__ = 'zj.z@ucla.edu' 14 | 15 | 16 | import os 17 | import sys 18 | from collections import defaultdict 19 | from statsmodels.sandbox.stats.multicomp import multipletests 20 | import pysam 21 | import logging 22 | import numpy as np 23 | from collections import defaultdict 24 | import re 25 | from scipy.stats import fisher_exact, poisson, chi2 26 | import scipy.optimize as optimize 27 | from tqdm import tqdm 28 | import datetime 29 | from stats import ztnb_em 30 | 31 | 32 | ### get logger 33 | ### 34 | logger = logging.getLogger('CLAM.Peakcaller') 35 | ### 36 | 37 | def read_gtf(fn): 38 | """read in the gene annotation from GTF file 39 | """ 40 | gene_annot = {} 41 | with open(fn, 'r') as f: 42 | for line in f: 43 | if line.startswith('#'): 44 | continue 45 | ele = line.strip().split('\t') 46 | if ele[2] != 'gene': 47 | continue 48 | chr, start, end, strand = ele[0], int(ele[3]), int(ele[4]), ele[6] 49 | try: 50 | gene_id = re.search(r'gene_id "(.+?)"', ele[-1]).group(1) 51 | except AttributeError: 52 | continue 53 | gene_annot[gene_id] = [chr, start, end, strand] 54 | return gene_annot 55 | 56 | 57 | def count_gene_read_tags(bam_list, (chr, start, end, strand), is_unique=True, unstranded=False): 58 | """ count the tagger positions for all reads in a given genomic interval 59 | Args: 60 | Returns: 61 | """ 62 | # placeholder for interval: 'num of replicate' x 'interval length' 63 | interval = np.zeros( (len(bam_list), end-start+1) ) 64 | is_reverse = True if strand=='-' else False 65 | # construct the (tag, score) pairs 66 | for i in range(len(bam_list)): 67 | bam = bam_list[i] 68 | if is_unique: 69 | read_tags = [ (x.opt('RT'), 1.0) for x in bam.fetch(chr, start, end) \ 70 | if unstranded or x.is_reverse==is_reverse] 71 | else: 72 | read_tags = [ (x.opt('RT'), x.opt('AS')) for x in bam.fetch(chr, start, end) \ 73 | if unstranded or x.is_reverse==is_reverse] 74 | 75 | for tag in read_tags: 76 | if tag[0]=end: 77 | continue 78 | interval[i, tag[0]-start] += tag[1] 79 | return interval 80 | 81 | 82 | def bin_interval_counts(interval, winsize=50): 83 | bins = np.zeros( ( interval.shape[0], int(np.ceil(interval.shape[1]/float(winsize))) ) ) 84 | for i in range(bins.shape[1]): 85 | for j in range(interval.shape[0]): 86 | start, end = i*winsize, (i+1)*winsize-1 87 | bins[j, i] = np.sum(interval[j, start:end]) 88 | return bins 89 | 90 | 91 | def test_bin_poisson(intv_bin_ip, intv_bin_con, correction_method='fdr_bh'): 92 | """DOCSTRING 93 | Args 94 | Returns 95 | """ 96 | def _par_to_vec(par, data, is_constrained): 97 | if is_constrained: 98 | beta = par[0] 99 | mu_vec = par[1::] 100 | delta = 0 101 | else: 102 | beta, delta = par[0], par[1] 103 | mu_vec = par[2::] 104 | ip_counter = data['this_ip'].shape[0] 105 | con_counter = data['this_con'].shape[0] 106 | mu0 = np.asarray(mu_vec[0:con_counter]) 107 | mu1 = np.asarray(mu_vec[con_counter::]) 108 | lamb1_this = np.exp(mu1 + beta + delta) 109 | lamb1_others = np.exp(mu1) 110 | lamb0_this = np.exp(mu0 + beta) 111 | lamb0_others = np.exp(mu0) 112 | return (lamb1_this, lamb1_others, lamb0_this, lamb0_others) 113 | 114 | def _neg_loglik_unconstrain(par, data): 115 | (l1, l2, l3, l4) = _par_to_vec(par, data, False) 116 | ll = np.sum(poisson.logpmf(data['this_ip'], mu=l1)) + \ 117 | np.sum(poisson.logpmf(data['others_ip'], mu=l2)) + \ 118 | np.sum(poisson.logpmf(data['this_con'], mu=l3)) + \ 119 | np.sum(poisson.logpmf(data['others_con'], 
mu=l4)) 120 | return -ll 121 | 122 | def _neg_loglik_constrain(par, data): 123 | (l1, l2, l3, l4) = _par_to_vec(par, data, True) 124 | ll = np.sum(poisson.logpmf(data['this_ip'], mu=l1)) + \ 125 | np.sum(poisson.logpmf(data['others_ip'], mu=l2)) + \ 126 | np.sum(poisson.logpmf(data['this_con'], mu=l3)) + \ 127 | np.sum(poisson.logpmf(data['others_con'], mu=l4)) 128 | return -ll 129 | 130 | intv_counter = intv_bin_ip.shape[1] 131 | assert intv_counter == intv_bin_con.shape[1] 132 | binscore = np.empty(intv_counter) 133 | binsignal = np.empty(intv_counter) 134 | ip_sum = np.apply_along_axis(np.sum, 1, intv_bin_ip) 135 | con_sum = np.apply_along_axis(np.sum, 1, intv_bin_con) 136 | for i in range(intv_counter): 137 | this_ip = intv_bin_ip[:, i] 138 | others_ip = ip_sum - this_ip 139 | this_con = intv_bin_con[:, i] 140 | others_con = con_sum - this_con 141 | if np.sum(this_ip) == 0: 142 | binsignal[i], binscore[i] = np.nan, 1.0 143 | continue 144 | data = { 145 | 'this_ip':np.round(this_ip), 146 | 'others_ip':np.round(others_ip), 147 | 'this_con':np.round(this_con), 148 | 'others_con':np.round(others_con) 149 | } 150 | 151 | res_constrain = optimize.minimize( 152 | x0=np.ones(1+this_ip.shape[0]+others_ip.shape[0]), 153 | fun=_neg_loglik_constrain, 154 | args=(data), 155 | method='bfgs', 156 | options={'disp':False} 157 | ) 158 | 159 | res_unconstrain = optimize.minimize( 160 | x0=np.ones(2+this_ip.shape[0]+others_ip.shape[0]), 161 | fun=_neg_loglik_unconstrain, 162 | args=(data), 163 | method='bfgs', 164 | options={'disp':False} 165 | ) 166 | 167 | delta_mle = res_unconstrain.x[1] 168 | pval = 1 - chi2.cdf(2*(res_constrain.fun - res_unconstrain.fun), 1) 169 | binscore[i] = pval 170 | binsignal[i] = delta_mle 171 | adj = multipletests(binscore, alpha=0.05, method=correction_method) 172 | binscore_adj = adj[1] 173 | return binsignal, binscore_adj 174 | 175 | 176 | 177 | def test_bin_negbinom(intv_bin_ip, intv_bin_con, alpha_ip_vec, alpha_con_vec, correction_method='fdr_bh'): 178 | """DOCSTRING 179 | Args 180 | Returns 181 | """ 182 | def _par_to_vec(par, data, is_constrained): 183 | if is_constrained: 184 | beta = par[0] 185 | mu_vec = par[1::] 186 | delta = 0 187 | else: 188 | beta, delta = par[0], par[1] 189 | mu_vec = par[2::] 190 | ip_counter = data['this_ip'].shape[0] 191 | con_counter = data['this_con'].shape[0] 192 | mu0 = np.asarray(mu_vec[0:con_counter]) 193 | mu1 = np.asarray(mu_vec[con_counter::]) 194 | lamb1_this = np.exp(mu1 + beta + delta) 195 | lamb1_others = np.exp(mu1) 196 | lamb0_this = np.exp(mu0 + beta) 197 | lamb0_others = np.exp(mu0) 198 | return (lamb1_this, lamb1_others, lamb0_this, lamb0_others) 199 | 200 | def _negative_binom_logpmf(y, mu, alpha): 201 | y = np.asarray(y) 202 | ll = np.empty(len(y)) 203 | for i in range(len(y)): 204 | alpha_inv = 1.0/alpha[i] 205 | alpha_mu = float(alpha[i] * mu[i]) 206 | ll[i] = y[i]* np.log(alpha_mu/(1+alpha_mu))- \ 207 | alpha_inv*np.log(1+alpha_mu) 208 | return ll 209 | 210 | def _neg_loglik_unconstrain(par, data): 211 | (l1, l2, l3, l4) = _par_to_vec(par, data, False) 212 | ll = np.sum( _negative_binom_logpmf(data['this_ip'], mu=l1, alpha=alpha_ip_vec)) 213 | ll += np.sum( _negative_binom_logpmf(data['others_ip'], mu=l2, alpha=alpha_ip_vec)) 214 | ll += np.sum( _negative_binom_logpmf(data['this_con'], mu=l3, alpha=alpha_con_vec)) 215 | ll += np.sum( _negative_binom_logpmf(data['others_con'], mu=l4, alpha=alpha_con_vec)) 216 | return -ll 217 | 218 | def _neg_loglik_constrain(par, data): 219 | (l1, l2, l3, l4) = _par_to_vec(par, data, 
True) 220 | ll = np.sum(_negative_binom_logpmf(data['this_ip'], mu=l1, alpha=alpha_ip_vec)) + \ 221 | np.sum(_negative_binom_logpmf(data['others_ip'], mu=l2, alpha=alpha_ip_vec)) + \ 222 | np.sum(_negative_binom_logpmf(data['this_con'], mu=l3, alpha=alpha_con_vec)) + \ 223 | np.sum(_negative_binom_logpmf(data['others_con'], mu=l4, alpha=alpha_con_vec)) 224 | return -ll 225 | 226 | intv_counter = intv_bin_ip.shape[1] 227 | assert intv_counter == intv_bin_con.shape[1] 228 | binscore = np.empty(intv_counter) 229 | binsignal = np.empty(intv_counter) 230 | ip_sum = np.apply_along_axis(np.sum, 1, intv_bin_ip) 231 | con_sum = np.apply_along_axis(np.sum, 1, intv_bin_con) 232 | for i in range(intv_counter): 233 | this_ip = intv_bin_ip[:, i] 234 | others_ip = ip_sum - this_ip 235 | this_con = intv_bin_con[:, i] 236 | others_con = con_sum - this_con 237 | if np.sum(this_ip) == 0: 238 | binsignal[i], binscore[i] = np.nan, 1.0 239 | continue 240 | data = { 241 | 'this_ip':np.round(this_ip), 242 | 'others_ip':np.round(others_ip), 243 | 'this_con':np.round(this_con), 244 | 'others_con':np.round(others_con) 245 | } 246 | 247 | res_constrain = optimize.minimize( 248 | x0=np.ones(1+this_ip.shape[0]+others_ip.shape[0]), 249 | fun=_neg_loglik_constrain, 250 | args=(data), 251 | method='Nelder-Mead', 252 | options={'disp':False} 253 | ) 254 | 255 | res_unconstrain = optimize.minimize( 256 | x0=np.ones(2+this_ip.shape[0]+others_ip.shape[0]), 257 | fun=_neg_loglik_unconstrain, 258 | args=(data), 259 | method='bfgs', 260 | options={'disp':False} 261 | ) 262 | 263 | delta_mle = res_unconstrain.x[1] 264 | pval = 1 - chi2.cdf(2*(res_constrain.fun - res_unconstrain.fun), 1) 265 | binscore[i] = pval 266 | binsignal[i] = delta_mle 267 | adj = multipletests(binscore, alpha=0.05, method=correction_method) 268 | binscore_adj = adj[1] 269 | return binsignal, binscore_adj 270 | 271 | 272 | 273 | def test_bin_fisher(intv_bin_ip, intv_bin_con, with_control=True, correction_method='fdr_bh'): 274 | """DOCSTRING 275 | Args 276 | Returns 277 | """ 278 | if intv_bin_ip.shape[0] != 1: 279 | raise Exception('Fisher exact test does not deal with replicates.') 280 | intv_counter = intv_bin_ip.shape[1] 281 | assert intv_counter == intv_bin_con.shape[1] 282 | binscore = np.empty(intv_counter) 283 | binsignal = np.empty(intv_counter) 284 | ip_sum = np.sum(intv_bin_ip[0,]) 285 | con_sum = np.sum(intv_bin_con[0,]) 286 | for i in range(intv_counter): 287 | this_ip = intv_bin_ip[0, i] 288 | others_ip = ip_sum - this_ip 289 | this_con = intv_bin_con[0, i] 290 | others_con = con_sum - this_con 291 | if this_ip == 0: 292 | binsignal[i], binscore[i] = np.nan, 1.0 293 | continue 294 | _, binscore[i] = fisher_exact([[this_ip, others_ip], [this_con, others_con]], alternative='greater') 295 | if with_control: 296 | binsignal[i] = this_ip/others_ip / this_con*others_con 297 | else: 298 | binsignal[i] = this_ip 299 | 300 | adj = multipletests(binscore, alpha=0.05, method=correction_method) 301 | binscore_adj = adj[1] 302 | return binsignal, binscore_adj 303 | 304 | 305 | def gene_to_count(bam_dict, gene, unique_only=False, with_control=False, winsize=50, unstranded=False): 306 | """DOCSTRING 307 | Args 308 | Returns 309 | """ 310 | # fetch the IP tag counts to gene regions 311 | if unique_only: 312 | interval_ip = \ 313 | count_gene_read_tags(bam_dict['ubam.ip'], gene, is_unique=True, unstranded=unstranded) 314 | else: 315 | interval_ip = \ 316 | count_gene_read_tags(bam_dict['ubam.ip'], gene, is_unique=True, unstranded=unstranded) + \ 317 | 
count_gene_read_tags(bam_dict['mbam.ip'], gene, is_unique=False, unstranded=unstranded) 318 | 319 | # skip if there are no reads 320 | if np.sum(interval_ip) == 0: 321 | #print "no reads" 322 | return None, None 323 | 324 | # fetch/construct the input tag counts 325 | if with_control: 326 | ## count control tags if available 327 | if unique_only: 328 | interval_con = \ 329 | count_gene_read_tags(bam_dict['ubam.con'], gene, is_unique=True, unstranded=unstranded) 330 | else: 331 | interval_con = \ 332 | count_gene_read_tags(bam_dict['ubam.con'], gene, is_unique=True, unstranded=unstranded) + \ 333 | count_gene_read_tags(bam_dict['mbam.con'], gene, is_unique=False, unstranded=unstranded) 334 | else: 335 | ## otherwise, construct a uniform *fake* control 336 | interval_con = \ 337 | np.ones((1, interval_ip.shape[1]))*np.sum(interval_ip)/interval_ip.shape[1] 338 | 339 | # bin tag counts into bins 340 | bin_interval_ip = bin_interval_counts(interval_ip, winsize=winsize) 341 | bin_interval_con = bin_interval_counts(interval_con, winsize=winsize) 342 | 343 | return bin_interval_ip, bin_interval_con 344 | 345 | 346 | def test_gene_bin(gene, bin_interval_ip, bin_interval_con, alpha_ip_vec, alpha_con_vec): 347 | # perform statistical test 348 | signal_val, binscore_adj = test_bin_poisson(bin_interval_ip, bin_interval_con) 349 | #signal_val, binscore_adj = test_bin_fisher(bin_interval_ip, bin_interval_con, with_control=with_control) 350 | 351 | # build human-readable outputs 352 | ## "narrowPeak" format from 353 | ## https://genome.ucsc.edu/FAQ/FAQformat.html#format12 354 | ## chr start end name 1000 strand signalValue pVal qVal peak 355 | narrowPeak_formatter = "%s\t%i\t%i\t.\t1000\t%s\t%s\t.\t%.3f\t.\n" 356 | BED = '' 357 | for i in range(len(binscore_adj)): 358 | qval = binscore_adj[i] 359 | signal = signal_val[i] 360 | if qval<0.05: 361 | chr = gene[0] 362 | binstart = gene[1] + i*winsize 363 | binend = gene[1] + (i+1)*winsize-1 364 | strand = gene[3] 365 | BED += narrowPeak_formatter % (chr, binstart, binend, strand, signal, qval) 366 | return BED 367 | 368 | 369 | def estim_dispersion_param(alpha_len, gene_count, type): 370 | """DOCSTRING 371 | Args 372 | Returns 373 | """ 374 | alpha_vec = np.zeros(alpha_len) 375 | for i in range(len(alpha_ip_vec)): 376 | dist = defaultdict(int) 377 | for gene_name in gene_count: 378 | this_height = ztnb_em.collapse_data(gene_count[gene_name][type][i,]) 379 | for h in this_height: 380 | #if h==0 or h>50: 381 | # continue 382 | dist[int(h)] += this_height[h] 383 | dist[0] = 0 ## truncate all zeros 384 | ll, mu, alpha = EM_estim_params(dist, max_iter=1000, verbose=True) 385 | alpha_vec[i] = alpha 386 | return alpha_vec 387 | 388 | 389 | def peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=False, with_replicates=False, with_control=False, unstranded=False): 390 | """DOCSTRING 391 | Args: 392 | Returns: 393 | """ 394 | # file handlers 395 | mbam = pysam.Samfile(os.path.join(out_dir, 'realigned.sorted.bam'),'rb') 396 | ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'),'rb') 397 | bam_dict = {'ubam.ip':[ubam], 'mbam.ip':[mbam]} 398 | if unique_only: 399 | ofile = open(os.path.join(out_dir, 'narrow_peaks.unique.bed'), 'w') 400 | else: 401 | ofile = open(os.path.join(out_dir, 'narrow_peaks.combined.bed'), 'w') 402 | 403 | # read in GTF 404 | logger.info('read gtf from "%s" '% fn) 405 | gene_annot = read_gtf(gtf_fp) 406 | 407 | # fetch tag counts in each gene 408 | ## gene_name: { 'ip': bin_interval_ip, 'con': bin_interval_con } 409 | logger.info('reading 
gene counts') 410 | gene_count = defaultdict(dict) 411 | for gene_name in tqdm(gene_annot): 412 | gene = gene_annot[gene_name] 413 | bin_interval_ip, bin_interval_con = \ 414 | gene_to_count(bam_dict, gene, 415 | unique_only=unique_only, with_control=with_control, 416 | unstranded=unstranded) 417 | if bin_interval_ip is None: ## if no IP reads in this gene 418 | continue 419 | gene_count[gene_name]['ip'] = bin_interval_ip 420 | gene_count[gene_name]['con'] = bin_interval_con 421 | 422 | # estimate global overdispersion param. for each dataset 423 | logger.info('estimating dispersion parameter') 424 | alpha_ip_vec = estim_dispersion_param(len(bam_dict['ubam.ip']), gene_count, 'ip' ) 425 | if with_control: 426 | alpha_con_vec = estim_dispersion_param(len(bam_dict['ubam.con']), gene_count, 'con' ) 427 | else: 428 | alpha_con_vec = np.asarray(alpha_ip_vec) 429 | 430 | # perform statistical test 431 | logger.info('calling peaks') 432 | for gene_name in gene_to_count: 433 | gene = gene_annot[gene_name] 434 | BED = test_gene_bin(gene, gene_count[gene_name]['ip'], gene_count[gene_name]['con'], 435 | alpha_ip_vec, alpha_con_vec) 436 | ofile.write(BED) 437 | peak_counter += len(BED.split('\n')) 438 | ofile.close() 439 | logger.info('called %i peaks'%peak_counter) 440 | 441 | return 442 | 443 | 444 | def chunkify(a, n): 445 | """Separate a list (a) into consecutive n chunks. 446 | Args: 447 | Returns: 448 | the chunkified index 449 | """ 450 | k, m = len(a) / n, len(a) % n 451 | return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in xrange(n)) 452 | 453 | 454 | if __name__ == '__main__': 455 | ### set up logger 456 | logger = logging.getLogger('CLAM') 457 | logger.setLevel(logging.DEBUG) 458 | # create file handler which logs even debug messages 459 | fh = logging.FileHandler( 460 | 'CLAM.Peakcaller.'+'-'.join(str(datetime.datetime.now()).replace(':','-').split()) + '.log') 461 | fh.setLevel(logging.DEBUG) 462 | # create console handler with a higher log level 463 | ch = logging.StreamHandler() 464 | ch.setLevel(logging.DEBUG) 465 | # create formatter and add it to the handlers 466 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -\n %(message)s') 467 | fh.setFormatter(formatter) 468 | ch.setFormatter(formatter) 469 | # add the handlers to the logger 470 | logger.addHandler(fh) 471 | logger.addHandler(ch) 472 | ### 473 | logger.info('start') 474 | logger.info('run info: %s'%(' '.join(sys.argv))) 475 | 476 | tmp_dir, out_dir, unique_only = sys.argv[1], sys.argv[2], sys.argv[3] 477 | unique_only = False if unique_only=='0' else True 478 | gtf_fp = '/u/nobackup/yxing/NOBACKUP/frankwoe/hg19/gencode.v19.annotation.gtf' 479 | peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=unique_only) 480 | logger.info('end') 481 | -------------------------------------------------------------------------------- /CLAM/bak/realigner.bak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This re-aligner script is part of the CLAM pipeline. 5 | 6 | It takes bam file as input, and outputs a weighed bam file for multi-mapped reads. 
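# The re-assignment idea behind this script, stripped to a toy example:
# a multi-mapped read starts with equal weight at each candidate locus,
# and every EM iteration re-weights it in proportion to the local read
# coverage around each locus, so better-supported loci absorb more of
# the read's weight. All values below are hypothetical.
def toy_em_weights(local_coverage, n_iter=10):
	# local_coverage[k]: coverage near candidate locus k from other reads
	weights = [1.0 / len(local_coverage)] * len(local_coverage)
	for _ in range(n_iter):
		support = [c + w for c, w in zip(local_coverage, weights)]
		total = float(sum(support))
		weights = [s / total for s in support]
	return weights

# toy_em_weights([9.0, 1.0]) -> approximately [0.9, 0.1]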
7 | 8 | Tested under python 2.7.3 9 | """ 10 | 11 | __author__ = 'Zijun Zhang' 12 | __version__ = '1.1.0' 13 | __email__ = 'zj.z@ucla.edu' 14 | 15 | 16 | import os 17 | import sys 18 | import pysam 19 | import numpy as np 20 | from collections import defaultdict 21 | from tqdm import tqdm 22 | import logging 23 | import datetime 24 | 25 | ### get logger 26 | ### 27 | logger = logging.getLogger('CLAM.Realigner') 28 | ### 29 | 30 | 31 | class Bit: 32 | """ Binary Indexed Tree to store values in genomic intervals. 33 | Implementation modified from http://www.geeksforgeeks.org/binary-indexed-tree-or-fenwick-tree-2/ 34 | Args: 35 | Returns: 36 | """ 37 | 38 | def __init__(self, n): 39 | sz = 1 40 | while n >= sz: 41 | sz *= 2 42 | self.size = sz 43 | self.array_size = n 44 | self.data = [0]*sz 45 | 46 | def sum(self, i): 47 | assert i >= 0 48 | if i==0: 49 | return 0 50 | if i > self.array_size: 51 | i = self.array_size 52 | s = 0 53 | while i > 0: 54 | s += self.data[i] 55 | i -= i & -i 56 | return s 57 | 58 | def add(self, i, x): 59 | assert i > 0 60 | while i < self.size: 61 | self.data[i] += x 62 | i += i & -i 63 | 64 | 65 | def construct_BIT_track(subgraph, read_to_locations, ubam, unstranded=False): 66 | """Construct BIT for each genomic region / node. 67 | Args: 68 | Returns: 69 | Returns a node-track dictionary and a dictionary for multi-mapped reads. 70 | """ 71 | node_track = {} 72 | total_len = 0 73 | 74 | # initialized BIT tracks, add mreads to the tracks, 75 | # and keep a dict of read scores 76 | multi_reads_weights = defaultdict(dict) 77 | obs_reads = read_to_locations.keys() 78 | for read_x_qname in obs_reads: 79 | read_x_nodes = read_to_locations[read_x_qname] 80 | read_x_score = 1.0 / len(read_x_nodes) 81 | for node in read_x_nodes: 82 | chr, strand, start, end = node.split(':') 83 | start, end = int(start), int(end) 84 | if not node in node_track: 85 | this_len = end - start + 1 86 | node_track[node] = Bit(this_len) 87 | read_x_tag = read_x_nodes[node].opt('RT') 88 | node_locus = read_x_tag - start + 1 89 | node_track[node].add(node_locus, read_x_score) 90 | multi_reads_weights[read_x_qname][node]=[read_x_score, node_locus] 91 | #del read_to_locations[read_x_qname][node] 92 | #del read_to_locations[read_x_qname] 93 | 94 | # now add ureads by fetching from ubam 95 | for node in node_track: 96 | chr, strand, start, end = node.split(':') 97 | start, end = int(start), int(end) 98 | is_reverse = True if strand=='-' else False 99 | uread_tagger = [x.opt('RT') for x in ubam.fetch(chr, start, end) \ 100 | if unstranded or x.is_reverse==is_reverse] 101 | for uread_x_tagger in uread_tagger: 102 | if uread_x_tagger>=start and uread_x_tagger<=end: 103 | node_locus = uread_x_tagger - start + 1 104 | node_track[node].add(node_locus, 1) 105 | 106 | return node_track, multi_reads_weights 107 | 108 | 109 | 110 | def run_EM(node_track, multi_reads_weights, w=50, epsilon=1e-6, max_iter=100, verbose=True): 111 | """ EM implementation for re-assigning multi-mapped reads, given the 112 | compatibility matrix of a subgraph. 
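	As a rough worked example of one iteration: if a read has two candidate loci whose windowed coverages (the quantities computed in the M-step below) are 3 and 1, the E-step re-normalizes its weights to 0.75 and 0.25 and adds the change back onto each locus track before the next iteration.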
113 | Args: 114 | Returns: 115 | """ 116 | iter=1 117 | residue=1 118 | while iter < max_iter and residue > epsilon: 119 | residue = 0 120 | reweight=defaultdict(dict) 121 | ## calculate re-distribute probability; M-step 122 | for read in multi_reads_weights: 123 | for nd in multi_reads_weights[read]: 124 | track_len=node_track[nd].array_size 125 | old_score, read_tag = multi_reads_weights[read][nd] 126 | reweight[read][nd] = max( 0, node_track[nd].sum(min(track_len, read_tag + w)) - node_track[nd].sum(max(0,read_tag - w)) ) 127 | ## update track by expectation; E-step 128 | for read in reweight: 129 | dn=sum([reweight[read][x] for x in reweight[read]]) 130 | if dn==0: 131 | logger.debug('Error: no read weight found @ %s.'%read ) 132 | dn=1 133 | for nd in reweight[read]: 134 | old_score, read_tag = multi_reads_weights[read][nd] 135 | new_score = reweight[read][nd] / float(dn) 136 | node_track[nd].add(read_tag, new_score - old_score) 137 | residue += (old_score - new_score)**2 138 | multi_reads_weights[read][nd][0] = new_score 139 | if verbose and (not iter % 10 or iter == max_iter): 140 | logger.debug('Iter %d, residue = %f' % (iter, residue)) 141 | iter += 1 142 | return multi_reads_weights 143 | 144 | 145 | def build_read_cluster(alignment, chr_list, mbam, unstranded=False, winsize=50): 146 | """DOCSTRING 147 | Args: 148 | Returns: 149 | """ 150 | chrom = chr_list[alignment.reference_id] 151 | site = alignment.opt('RT') 152 | is_reverse = alignment.is_reverse 153 | this_mread_dict = {} 154 | this_mread_dict_set = defaultdict(set) 155 | discarded_mread_alignments = [] 156 | ## note to me: need to be more careful with 157 | ## one read mapped to *multiple-locations* within one cluster 158 | ## currently tossing away those alignments.. (in `discarded_mread_alignments`) 159 | 160 | # find the right boundary 161 | current = site 162 | while True: 163 | mread_list = [x for x in mbam.fetch(chrom, current, current+winsize) \ 164 | if unstranded or x.is_reverse==is_reverse] 165 | for x in mread_list: 166 | this_mread_dict_set[x.qname].add(x) 167 | mread_tagger = [x.opt('RT') for x in mread_list] 168 | if len(mread_tagger)==0: 169 | break 170 | #return (None,None,discarded_mread_alignments) 171 | end = max(mread_tagger) + winsize 172 | if max(mread_tagger)!=current: 173 | ## has to restrict step-size to smaller than winsize; 174 | ## in order to avoid missing of very short reads 175 | current = max(mread_tagger) if max(mread_tagger)current-winsize else current-winsize 193 | else: 194 | break 195 | 196 | strand = '+' if unstranded or is_reverse==False else '-' 197 | for read_x_qname in this_mread_dict_set: 198 | if len(this_mread_dict_set[read_x_qname])>1: 199 | discarded_mread_alignments.extend( [ x for x in list(this_mread_dict_set[read_x_qname]) ]) 200 | else: 201 | this_mread_dict[read_x_qname] = list(this_mread_dict_set[read_x_qname])[0] 202 | 203 | genomic_cluster = (chrom, strand, start, end) 204 | 205 | return genomic_cluster, this_mread_dict, discarded_mread_alignments 206 | 207 | 208 | def construct_subgraph(mbam, read_qname, mread_dict, processed_mreads, chr_list, winsize=50, unstranded=False): 209 | """DOCSTRING 210 | Args: 211 | Returns: 212 | """ 213 | # record of processed alignments only need kept on within-subgraph level 214 | processed_mread_alignments = set() 215 | counter = 0 216 | # a list of `pysam.AlignedSegment` objects 217 | # note that all taggers are already stored in `pysam.AlignedSegment.opt('RT')` 218 | read_aln_list = [x for x in mread_dict[read_qname]] 219 | 
processed_mreads.add(read_qname) 220 | read_to_locations = defaultdict(dict) # read_qname -> {node_name1:segment1, node_name2:segment2} 221 | 222 | # enumerate all connected components 223 | while True: 224 | counter+=1; print "%i: %i"%(counter, len(read_aln_list)) 225 | next_read_aln_list = [] 226 | 227 | gen = read_aln_list if len(read_aln_list)<200 else tqdm(read_aln_list) 228 | for alignment in gen: 229 | ## build a node for this mread alignment 230 | ## (if not already processed, i.e. built before) 231 | if alignment in processed_mread_alignments: 232 | continue 233 | 234 | genomic_cluster, this_mread_dict, discarded_mread_list = \ 235 | build_read_cluster(alignment, chr_list, mbam, unstranded=unstranded, winsize=winsize) 236 | _ = map(processed_mread_alignments.add, discarded_mread_list) 237 | if genomic_cluster is None: # this cluster is invald (only double-mappers) 238 | continue 239 | 240 | ## update loc2read, read2loc 241 | node_name = ':'.join([str(x) for x in genomic_cluster]) 242 | #if node_name in subgraph: 243 | # logger.debug("I revisited '%s' at read '%s'."%(node_name, read_qname)) 244 | # break 245 | #subgraph.add(node_name) 246 | for x_qname in this_mread_dict: 247 | read_to_locations[x_qname].update({node_name : this_mread_dict[x_qname]}) 248 | 249 | ## then add new alignments(edges) to generate connected nodes 250 | ## in the next iteration 251 | _ = map(processed_mread_alignments.add, this_mread_dict.values()) 252 | for read_x_qname in this_mread_dict: 253 | if read_x_qname in processed_mreads: 254 | continue 255 | x_aln_list = [aln for aln in mread_dict[read_x_qname] if not aln in processed_mread_alignments] 256 | next_read_aln_list.extend(x_aln_list) 257 | 258 | ## .. and record to processed reads since we have generated 259 | ## the nodes for them 260 | _ = map(processed_mreads.add, this_mread_dict.keys()) 261 | 262 | # if no more connected nodes can be found, break loop 263 | if len(next_read_aln_list)==0: 264 | break 265 | read_aln_list = next_read_aln_list 266 | return read_to_locations, processed_mreads 267 | 268 | 269 | def realigner(out_dir, tmp_dir, winsize=50, unstranded=False): 270 | """DOCSTRING 271 | Args: 272 | Returns: 273 | """ 274 | # file handlers 275 | mbam = pysam.Samfile(os.path.join(tmp_dir, 'multi.sorted.bam'),'rb') 276 | ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'),'rb') 277 | obam = pysam.Samfile(os.path.join(out_dir, 'realigned.bam'), 'wb', template = mbam) 278 | chr_list=[x['SN'] for x in ubam.header['SQ']] 279 | 280 | # construct the mread_dict; this will be needed throughout 281 | mread_dict = defaultdict(list) 282 | for alignment in mbam: 283 | mread_dict[alignment.qname].append(alignment) 284 | 285 | # keep a record of processed reads 286 | processed_mreads = set() 287 | 288 | # iterate through all mreads 289 | for read_qname in mread_dict: 290 | if read_qname in processed_mreads: 291 | continue 292 | 293 | ## construct the fully-connected subgraph for each read 294 | read_to_locations, processed_mreads = \ 295 | construct_subgraph(mbam, read_qname, mread_dict, processed_mreads, chr_list, winsize=winsize, unstranded=unstranded) 296 | subgraph = set() 297 | for read in read_to_locations: 298 | _ = map(subgraph.add, read_to_locations[read].keys()) 299 | subgraph = list(subgraph) 300 | 301 | ## build the BIT tracks 302 | node_track, multi_reads_weights = \ 303 | construct_BIT_track(subgraph, read_to_locations, ubam, unstranded) 304 | 305 | ## run EM 306 | multi_reads_weights = \ 307 | run_EM(node_track, 
multi_reads_weights, w=winsize) 308 | 309 | ## write to obam 310 | for read in multi_reads_weights: 311 | for node in multi_reads_weights[read]: 312 | alignment = read_to_locations[read][node] 313 | score = round(multi_reads_weights[read][node][0], 3) 314 | alignment.set_tag('AS', score) 315 | alignment.set_tag('PG', 'CLAM') 316 | obam.write(alignment) 317 | # sort the final output 318 | logger.info('sorting output') 319 | obam.close() 320 | ubam.close() 321 | mbam.close() 322 | obam_sorted_fn = os.path.join(out_dir, 'realigned.sorted.bam') 323 | pysam.sort('-o', obam_sorted_fn, os.path.join(out_dir, 'realigned.bam')) 324 | pysam.index(obam_sorted_fn) 325 | os.remove(os.path.join(out_dir, 'realigned.bam')) 326 | return 327 | 328 | def read_tagger(alignment, method='median'): 329 | """ tag a read alignment to a genomic locus 330 | Args: 331 | Returns: 332 | """ 333 | tagger_func = { 334 | 'median': lambda x: int(np.median(x.positions))+1, 335 | 'start': lambda x: x.positions[-1] if x.is_reverse else x.positions[0]+1 336 | } 337 | try: 338 | tag=tagger_func[method](alignment) 339 | except: 340 | tag=-1 341 | return tag 342 | 343 | 344 | def filter_bam_multihits(filename, max_hits, tmp_dir, read_tagger, omit_detail=True): 345 | """Pre-processing function for cleaning up the input bam file. 346 | Args: 347 | Returns: 348 | """ 349 | logger.info('Filtering input bam..') 350 | 351 | in_bam = pysam.Samfile(filename,'rb') 352 | # unique read bam 353 | ubam_fn = os.path.join(tmp_dir, 'unique.bam') 354 | sorted_ubam_fn = os.path.join(tmp_dir, 'unique.sorted.bam') 355 | ubam=pysam.Samfile(ubam_fn, 'wb', template=in_bam) 356 | unique_counter = 0 357 | 358 | # multi-read bam 359 | mbam_fn = os.path.join(tmp_dir, 'multi.bam') 360 | sorted_mbam_fn = os.path.join(tmp_dir, 'multi.sorted.bam') 361 | mbam=pysam.Samfile(mbam_fn, 'wb', template=in_bam) 362 | mread_set = set() 363 | 364 | # splitting unique and multi- reads 365 | # and add the read taggers we need 366 | for read in tqdm(in_bam): 367 | read_tag = read_tagger(read) 368 | ## skip reads with unassigned tagger 369 | if read_tag==-1: 370 | continue 371 | read.tags += [('RT', read_tag)] ## add the tag 372 | ## omit the details in read sequence and quality 373 | ## recommended for larger bam because this 374 | ## can save some memory/storage for large bams 375 | if omit_detail: 376 | read.query_sequence = '*' 377 | read.query_qualities = [0] 378 | if read.is_secondary or (read.has_tag('NH') and read.opt("NH")>1): 379 | try: 380 | if read.opt("NH") < max_hits: 381 | mbam.write(read) 382 | mread_set.add(read.qname) 383 | except KeyError: 384 | #print read 385 | raise Exception('%s: missing NH tag when is_secondary=%s'%(read.qname,read.is_secondary)) 386 | else: 387 | ubam.write(read) 388 | unique_counter += 1 389 | 390 | in_bam.close() 391 | ubam.close() 392 | mbam.close() 393 | 394 | # sorting 395 | pysam.sort('-o', sorted_ubam_fn, ubam_fn) 396 | os.remove(ubam_fn) 397 | pysam.sort('-o', sorted_mbam_fn, mbam_fn) 398 | os.remove(mbam_fn) 399 | pysam.index(sorted_ubam_fn) 400 | pysam.index(sorted_mbam_fn) 401 | 402 | # log the statistics 403 | multi_counter = len(mread_set) 404 | logger.info( 405 | 'Unique reads = %s; ' % unique_counter + \ 406 | 'Multi reads = %s (%.2f %%)' % \ 407 | ( multi_counter, float(multi_counter)/(multi_counter+unique_counter)*100 ) 408 | ) 409 | return 410 | 411 | 412 | if __name__=='__main__': 413 | ### set up logger 414 | logger = logging.getLogger('CLAM') 415 | logger.setLevel(logging.DEBUG) 416 | # create file handler which 
logs even debug messages 417 | fh = logging.FileHandler( 418 | 'CLAM.Realigner.'+'-'.join(str(datetime.datetime.now()).replace(':','-').split()) + '.log') 419 | fh.setLevel(logging.INFO) 420 | # create console handler with a higher log level 421 | ch = logging.StreamHandler() 422 | ch.setLevel(logging.DEBUG) 423 | # create formatter and add it to the handlers 424 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -\n %(message)s') 425 | fh.setFormatter(formatter) 426 | ch.setFormatter(formatter) 427 | # add the handlers to the logger 428 | logger.addHandler(fh) 429 | logger.addHandler(ch) 430 | logger.info('start') 431 | 432 | logger.info('run info: %s'%(' '.join(sys.argv))) 433 | bam, tmp_dir, out_dir = sys.argv[1:4] 434 | retag = False 435 | if len(sys.argv)>4: 436 | tagger_method = sys.argv[4] 437 | retag = True 438 | logger.info('Retag with "%s"'%tagger_method) 439 | else: 440 | tagger_method = 'median' 441 | 442 | if retag or not ( 443 | os.path.isfile(os.path.join(tmp_dir,'unique.sorted.bam')) and \ 444 | os.path.isfile(os.path.join(tmp_dir,'multi.sorted.bam')) \ 445 | ) : 446 | filter_bam_multihits(bam, max_hits=100, tmp_dir=tmp_dir, read_tagger=lambda x: read_tagger(x, tagger_method)) 447 | realigner(out_dir, tmp_dir, winsize=50, unstranded=False) 448 | logger.info('end') -------------------------------------------------------------------------------- /CLAM/bak/sim_callpeak.r: -------------------------------------------------------------------------------- 1 | ## simulate read counts in bins and 2 | ## perform LRT test as peak calling 3 | ## *-- prototyping --* 4 | ## Zijun Zhang 5 | ## 9.1.2017 6 | 7 | 8 | sim_bin_counts = function(mu_vec, beta, delta) 9 | { 10 | others = c( 11 | rpois(n=1, lambda=exp(mu_vec[1])), 12 | rpois(n=1, lambda=exp(mu_vec[2])) 13 | ) 14 | this = c( 15 | rpois(n=1, lambda=exp(mu_vec[1]+beta+delta)), 16 | rpois(n=1, lambda=exp(mu_vec[2]+beta)) 17 | ) 18 | res = matrix(c(this,others), nrow=2, byrow=T) 19 | rownames(res) = c('this','others') 20 | colnames(res) = c('IP','Input') 21 | return(as.data.frame(res)) 22 | } 23 | 24 | 25 | loglik_constrain = function(par, data) 26 | { 27 | ll = 0 28 | mu1=par[1]; mu0=par[2]; beta = par[3] 29 | lamb1.this = exp(mu1 + beta) 30 | lamb1.others = exp(mu1) 31 | lamb0.this = exp(mu0+beta) 32 | lamb0.others = exp(mu0) 33 | ll = ll + dpois(data['this','IP'], lamb1.this, log=T) 34 | ll = ll + dpois(data['this','Input'], lamb0.this, log=T) 35 | ll = ll + dpois(data['others','IP'], lamb1.others, log=T) 36 | ll = ll + dpois(data['others','Input'], lamb0.others, log=T) 37 | return(ll) 38 | } 39 | 40 | loglik_unconstrain = function(par, data) 41 | { 42 | ll = 0 43 | mu1=par[1]; mu0=par[2]; beta = par[3]; delta=par[4] 44 | lamb1.this = exp(mu1 + beta + delta) 45 | lamb1.others = exp(mu1) 46 | lamb0.this = exp(mu0+beta) 47 | lamb0.others = exp(mu0) 48 | ll = ll + dpois(data['this','IP'], lamb1.this, log=T) 49 | ll = ll + dpois(data['this','Input'], lamb0.this, log=T) 50 | ll = ll + dpois(data['others','IP'], lamb1.others, log=T) 51 | ll = ll + dpois(data['others','Input'], lamb0.others, log=T) 52 | return(ll) 53 | } 54 | 55 | 56 | callpeak_LRT = function(data) 57 | { 58 | ll0 = optim(rep(1,3), loglik_constrain, control=list(fnscale=-1), data=data) 59 | ll1 = optim(rep(1,4), loglik_unconstrain, control=list(fnscale=-1), data=data) 60 | pval = 1-pchisq(2*(ll1$value-ll0$value),1) 61 | pval 62 | } 63 | 64 | 65 | 66 | B=200 67 | res = matrix(NA, nrow=B, ncol=2) 68 | colnames(res) = c('fisher', 'lrt') 69 | for(b in 1:B) 
{ 70 | data = sim_bin_counts(c(2.5,2), -0.5, 1) 71 | p1=fisher.test(data)$p.value 72 | p2=callpeak_LRT(data) 73 | res[b,] = c(p1, p2) 74 | } 75 | 76 | plot(res[,'fisher'], res[,'lrt']) 77 | abline(0,1) 78 | mean(res[,'fisher']<0.05) 79 | mean(res[,'lrt']<0.05) 80 | -------------------------------------------------------------------------------- /CLAM/bak/utils.py: -------------------------------------------------------------------------------- 1 | 2 | class CLAM_mread(object): 3 | """ 4 | object to store a read alignment 5 | """ 6 | def __init__(self, alignment, read_tagger_func, tag_of_interest = ['NH']): 7 | self.reference_id = alignment.reference_id 8 | self.is_reverse = alignment.is_reverse 9 | self.cigarstring = alignment.cigarstring 10 | self.pos = alignment.pos 11 | self.qname = alignment.qname 12 | self.tag = read_tagger_func(alignment) 13 | self.flag = alignment.flag 14 | self.alignment_tags = [x for x in alignment.tags if x[0] in tag_of_interest] 15 | 16 | def __eq__(self, other): 17 | this = (self.reference_id, self.is_reverse, self.pos, self.qname) 18 | that = (other.reference_id, other.is_reverse, other.pos, other.qname) 19 | return this == that 20 | 21 | def __hash__(self): 22 | return hash((self.reference_id, self.is_reverse, self.pos, self.qname)) 23 | 24 | def __str__(self): 25 | s = "\t".join([ 26 | self.qname, 27 | str(self.flag), 28 | str(self.reference_id), 29 | str(self.pos), 30 | self.cigarstring]) 31 | return s -------------------------------------------------------------------------------- /CLAM/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | """ General Version and other info 4 | """ 5 | 6 | __version__ = '1.2.3' 7 | __author__ = 'Zijun Zhang' 8 | __email__ = 'zj.z@ucla.edu' -------------------------------------------------------------------------------- /CLAM/download_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | 5 | 6 | def parser(args): 7 | """DOCSTRING 8 | Args 9 | Returns 10 | """ 11 | try: 12 | genome = args.genome 13 | download_genome(genome) 14 | except KeyboardInterrupt(): 15 | sys.exit(0) 16 | 17 | 18 | def download_genome(genome): 19 | curr_dir = os.path.abspath('.') 20 | 21 | admin = (os.getuid() == 0) 22 | cmd = [] 23 | home = os.environ['HOME'] 24 | if admin: 25 | profile = '/etc/profile' 26 | else: 27 | profile = '{home}/.bashrc'.format(home=home) 28 | 29 | if not os.path.isdir('{home}/.clam_data'.format(home=home)): 30 | os.mkdir('{home}/.clam_data'.format(home=home)) 31 | os.chdir('{home}/.clam_data'.format(home=home)) 32 | 33 | if 'CLAM_DAT' not in os.environ or not os.environ['CLAM_DAT'] == '{home}/.clam_data'.format(home=home): 34 | cmd.append('echo "export CLAM_DAT=\'{clam_data}\'" >> {profile}'.format( 35 | clam_data=os.path.abspath('.'), profile=profile)) 36 | cmd.append('source {profile}'.format(profile=profile)) 37 | os.environ['CLAM_DAT'] = os.path.abspath('.') 38 | 39 | if not check_genome_data(genome): 40 | cmd.append('chmod -R 755 {home}/.clam_data'.format(home=home)) 41 | cmd.append( 42 | 'wget https://raw.githubusercontent.com/wkdeng/clam_data/master/{genome}.zip'.format(genome=genome)) 43 | cmd.append('unzip -o {genome}.zip'.format(genome=genome)) 44 | cmd.append('rm {genome}.zip'.format(genome=genome)) 45 | for item in cmd: 46 | subprocess.call(item, shell=True, executable='/bin/bash') 47 | print('Download finished') 48 | os.chdir(curr_dir) 49 | 50 | def 
check_genome_data(genome): 51 | if not os.path.isdir(os.environ['CLAM_DAT'] + '/' + genome): 52 | return False 53 | if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/3UTRs.bed'): 54 | return False 55 | if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/5UTRs.bed'): 56 | return False 57 | if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/cds.bed'): 58 | return False 59 | if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/exons.bed'): 60 | return False 61 | if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/introns.bed'): 62 | return False 63 | if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/proximal200_intron.bed'): 64 | return False 65 | if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/proximal500_intron.bed'): 66 | return False 67 | return True 68 | 69 | if __name__ == '__main__': 70 | download_genome('hg38') 71 | -------------------------------------------------------------------------------- /CLAM/peak_annotator.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import pybedtools 4 | import argparse as ap 5 | import logging 6 | from . import download_data, config 7 | 8 | ''' 9 | Assign peaks to genomic regions 10 | Zijun Zhang 11 | 8.1.2018 12 | 10.25.2018: wrapped to a function with document 13 | 14 | DWK 15 | modified to output annotation file 16 | 6.12.2019 17 | ''' 18 | 19 | # pylint: disable-msg=too-many-function-args 20 | # pylint: disable-msg=unexpected-keyword-arg 21 | 22 | 23 | def parser(args): 24 | """DOCSTRING 25 | Args 26 | Returns 27 | """ 28 | try: 29 | peak_in = args.peak_in 30 | genome = args.genome 31 | out_file = args.out_file 32 | if 'CLAM_DAT' not in os.environ or not download_data.check_genome_data(genome): 33 | print("Unable to locate CLAM data folder for genomic regions, will try to download.") 34 | print("Downloading...") 35 | download_data.download_genome(genome) 36 | genome_data = os.environ['CLAM_DAT'] 37 | intersect_gtf_regions( 38 | peak_in, out_file, os.path.join(genome_data, genome)) 39 | except KeyboardInterrupt(): 40 | sys.exit(0) 41 | 42 | 43 | def intersect_gtf_regions(peak_fp, outfn, gtf_dir): 44 | '''function: intersect_gtf_regions(peak_fp, outfn, gtf_dir) 45 | Intersect a peak BED file with a list of genomic region annotations (e.g. start/stop codon, UTR, intron), 46 | output the peak-region annotations. 
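	A minimal usage sketch (file names and genome are illustrative, mirroring the `__main__` block at the bottom of this module):

	```
	intersect_gtf_regions('narrow_peak.unique.bed', 'annotated_peaks.txt',
	                      os.path.join(os.environ['CLAM_DAT'], 'hg38'))
	```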
47 | :param peak_fp: filepath to a BED-format peakquit 48 | :param outfn: filepath to output count file, has to end with ".txt"; annotation will be "NNN.annot.txt" 49 | 50 | ''' 51 | # input arguments 52 | 53 | # make pybedtools objects 54 | print("Loading peaks...") 55 | peaks = pybedtools.BedTool(peak_fp) 56 | print("Peak file loaded.") 57 | print("Loading genome annotation...") 58 | ref_dict = { 59 | 'exon': pybedtools.BedTool(os.path.join(gtf_dir, 'exons.bed')), 60 | '3UTR': pybedtools.BedTool(os.path.join(gtf_dir, '3UTRs.bed')), 61 | '5UTR': pybedtools.BedTool(os.path.join(gtf_dir, '5UTRs.bed')), 62 | 'cds': pybedtools.BedTool(os.path.join(gtf_dir, 'cds.bed')), 63 | 'intron': pybedtools.BedTool(os.path.join(gtf_dir, 'introns.bed')), 64 | 'proximal200': pybedtools.BedTool(os.path.join(gtf_dir, 'proximal200_intron.bed')), 65 | 'proximal500': pybedtools.BedTool(os.path.join(gtf_dir, 'proximal500_intron.bed')) 66 | } 67 | print("Genome annotation loaded.") 68 | 69 | # # process reference for use 70 | target = { 71 | "3UTR": ref_dict['3UTR'], 72 | "5UTR": ref_dict['5UTR'], 73 | "CDS": ref_dict['cds'], 74 | "other_exon": ref_dict['exon']-ref_dict['3UTR']-ref_dict['5UTR']-ref_dict['cds'], 75 | "px200_intron": ref_dict['proximal200'], 76 | "px500_intron": ref_dict['proximal500'].subtract(ref_dict['proximal200']), 77 | "distal_intron": ref_dict['intron'].subtract(ref_dict['exon']).subtract(ref_dict['proximal500']) 78 | } 79 | category_list = ['3UTR', '5UTR', 'CDS', 80 | 'other_exon', "px200_intron", "px500_intron", "distal_intron"] 81 | init = True 82 | 83 | print("Intersecting peaks with genome annotation...") 84 | for cat in category_list: 85 | bed_arr = [] 86 | for interval in target[cat]: 87 | bed_arr.append('\t'.join([str(x) for x in interval.fields])) 88 | bed_arr[-1] = bed_arr[-1] + '\t' + cat 89 | bed_arr = list(dict.fromkeys(bed_arr)) 90 | for i in range(len(bed_arr)): 91 | bed_arr[i] = bed_arr[i].split('\t') 92 | target[cat] = pybedtools.BedTool(bed_arr) 93 | 94 | if init: 95 | init = False 96 | result_bed = peaks.intersect(target[cat], wa=True, wb=True) 97 | else: 98 | result_bed = result_bed.cat(peaks.intersect( 99 | target[cat], wa=True, wb=True), postmerge=False) 100 | result_bed = result_bed.sort() 101 | 102 | print("Preparing output...") 103 | result_bed.saveas(outfn + '_') 104 | prepend = ['## Annotation peaks to genomic regions, all intersected genomic regions are presented.', 105 | '## CLAM version: %s'%config.__version__, 106 | '## Column 1: Peak chromosome', 107 | '## Column 2: Peak start', 108 | '## Column 3: Peak end', 109 | '## Column 4: Peak name', 110 | '## Column 5: Peak score', 111 | '## Column 6: Peak strand', 112 | '## Column 7: Peak signal value', 113 | '## Column 8: Peak pValue', 114 | '## Column 9: Peak qValue', 115 | '## Column 10: Point-source called for this peak', 116 | '## Column 11: Genomic region chromosome', 117 | '## Column 12: Genomic region start', 118 | '## Column 13: Genomic region end', 119 | '## Column 14: Gene ID', 120 | '## Column 15: Quality score', 121 | '## Column 16: Genomic region strand', 122 | '## Column 17: Genomic region type'] 123 | if os.path.exists(outfn): 124 | os.remove(outfn) 125 | for line in prepend: 126 | cmd = 'echo "{prepend}" >> {outfn}'.format( 127 | prepend=line, outfn=outfn) 128 | os.system(cmd) 129 | os.system('cat {outtmp} >> {outfn}'.format( 130 | outtmp=outfn + '_', outfn=outfn)) 131 | os.remove(outfn+'_') 132 | print("DONE") 133 | 134 | 135 | if __name__ == '__main__': 136 | # peak_fp, genome, outfn = sys.argv[1], 
sys.argv[2], sys.argv[3] 137 | os.chdir('/mnt/h/yi_lab/m6a/src/scripts/peakComposition') 138 | peak_in, genome, out_file = 'narrow_peak.unique.bed', 'mm10', 'annotate_peak.bed' 139 | if 'CLAM_DAT' not in os.environ or not download_data.check_genome_data(genome): 140 | print("Unable to find CLAM data folder for genomic regions, please try to download it using download_genome command.") 141 | print("Downloading...") 142 | download_data.download_genome(genome) 143 | genome_data = os.environ['CLAM_DAT'] 144 | intersect_gtf_regions( 145 | peak_in, out_file, os.path.join(genome_data, genome)) 146 | -------------------------------------------------------------------------------- /CLAM/permutation_peakcaller.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """This peak-caller script is part of the CLAM pipeline. 4 | 5 | This subcommand will call peaks using permutation by randomly placing reads along the gene. 6 | More details about the permutation procedure is described in our NAR paper. 7 | 8 | Example run: 9 | ``` 10 | CLAM permutation_callpeak -i path/to/outdir/unique.sorted.bam path/to/outdir/realigned.sorted.bam \ 11 | -o path/to/peaks/outdir -p 8 \ 12 | --gtf path/to/gencode.v19.annotation.gtf 13 | ``` 14 | Author: 15 | Zijun Zhang 16 | Wankun Deng 17 | Tested under python 3.7.6 18 | """ 19 | 20 | from . import config 21 | __version__ = config.__version__ 22 | 23 | 24 | import os 25 | import sys 26 | from collections import defaultdict 27 | from statsmodels.sandbox.stats.multicomp import multipletests 28 | import logging 29 | import bisect 30 | import random 31 | import pysam 32 | import re 33 | from multiprocessing import Pool 34 | 35 | 36 | ###setup logger 37 | logger = logging.getLogger('CLAM.permutation_peakcaller') 38 | ### 39 | 40 | def parser(args): 41 | """The main wrapper for CLAM peak-caller. 
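	Writes two files under `args.out_dir`: `all_permutation_peaks.bed` (per-gene peaks called from unique reads and, when a multi-read bam is given, from unique+multi reads, labeled 'unique'/'combined') and `narrow_peak.permutation.bed` (the same peaks collapsed with bedtools merge).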
42 | """ 43 | # logging info 44 | logger.info('start') 45 | logger.info('run info: %s'%(' '.join(sys.argv))) 46 | # some back-reference (to v1.0.0) parameters here 47 | random_state = args.random_state 48 | merge_size = args.merge_size 49 | output_dir = os.path.abspath(args.out_dir) 50 | nthread = args.nthread 51 | max_iter = 200 52 | 53 | # read in gtf gene annotations 54 | gene_annot = read_gtf(args.gtf_fp) 55 | 56 | # read in GTF 57 | gene_list = gene_annot.keys() 58 | child_gene_list = [x for x in chunkify(list(gene_list), nthread)] 59 | 60 | # call peaks 61 | unibam_file=args.in_bam[0] 62 | multibam_file=args.in_bam[1] if len(args.in_bam)>=2 else None 63 | 64 | if nthread>1: 65 | pool = Pool(processes=args.nthread) 66 | # assert len(args.in_bam)==2 67 | tid_to_qval_compact = pool.map( 68 | _child_get_permutation_fdr, 69 | [ (unibam_file, multibam_file, child_gene_list[i], gene_annot, args.qval_cutoff, max_iter, ~(args.lib_type=='unstranded'), 'fdr', random_state) 70 | for i in range(args.nthread) 71 | ]) 72 | 73 | pool.terminate() 74 | pool.join() 75 | 76 | unique_tid_to_qval, combined_tid_to_qval = unpack_tid_to_qval(tid_to_qval_compact) 77 | else: 78 | unique_tid_to_qval, combined_tid_to_qval = _child_get_permutation_fdr( 79 | (unibam_file, multibam_file, gene_list, gene_annot, args.qval_cutoff, max_iter, ~( 80 | args.lib_type == 'unstranded'), 'fdr', random_state)) 81 | 82 | 83 | #pickle.dump(unique_tid_to_qval, open(tmp_dir+'/unique_to_qval.pdata','wb'), -1) 84 | #pickle.dump(combined_tid_to_qval, open(tmp_dir+'/combined_to_qval.pdata','wb'), -1) 85 | merge_peaks = merge_peaks_singleNucl 86 | #if args.merge_method==1: 87 | # merge_peaks=merge_peaks_singleNucl 88 | # mm='singleNucl' 89 | #elif args.merge_method==2: 90 | # merge_peaks=merge_peaks_broadPeak 91 | # mm='broadPeak' 92 | #else: 93 | # merge_peaks=merge_peaks_singleNucl 94 | # mm='unknown selection, using default singleNucl' 95 | 96 | 97 | unique_peaks=merge_peaks(unique_tid_to_qval, merge_size, args.qval_cutoff) 98 | combined_peaks=merge_peaks(combined_tid_to_qval, merge_size, args.qval_cutoff) if multibam_file is not None else None 99 | 100 | # write peak-calling results to file. 
101 | narrowPeak_formatter = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t.\t%.3e\t.\n" 102 | ## chr start end name unique/combined strand signalValue pVal qVal peak 103 | with open(output_dir + '/all_permutation_peaks.bed', 'w') as f: 104 | for peak in unique_peaks: # peak = ['chr\tstart\tend\tstrand', 'height\tqval\t', tid] 105 | if args.extend is None: 106 | wt_loc=peak[0] 107 | else: 108 | wt_loc=extend_peak_region(peak[0], args.extend) 109 | #f.write(wt_loc + '\t' + '\t'.join([str(x) for x in peak[1]]) + '\t' + peak[2] + '\tunique\n') 110 | chr, start, end, strand = wt_loc.split('\t') 111 | _, signal_qval, gene_name = peak 112 | signal, qval = signal_qval 113 | f.write( narrowPeak_formatter % (chr, start, end, gene_name, 'unique', strand, signal, qval) ) 114 | if combined_peaks is not None: 115 | for peak in combined_peaks: 116 | if args.extend is None: 117 | wt_loc=peak[0] 118 | else: 119 | wt_loc=extend_peak_region(peak[0], args.extend) 120 | #f.write(wt_loc + '\t' + '\t'.join([str(x) for x in peak[1]]) + '\t' + peak[2] + '\tcombined\n') 121 | chr, start, end, strand = wt_loc.split('\t') 122 | _, signal_qval, gene_name = peak 123 | signal, qval = signal_qval 124 | f.write( narrowPeak_formatter % (chr, start, end, gene_name, 'combined', strand, signal, qval) ) 125 | if args.lib_type=='unstranded': 126 | cmd = ''' sort -k1,1 -k2,2n %s/all_permutation_peaks.bed |awk '{OFS="\t"; print $1,$2,$3,$4":"$7":"$9,$5,$6}'| \ 127 | bedtools merge -d -1 -i stdin -c 4,5,6 -o collapse,collapse,distinct > %s''' % (output_dir, os.path.join(output_dir,'narrow_peak.permutation.bed') ) 128 | else: 129 | cmd = ''' sort -k1,1 -k2,2n %s/all_permutation_peaks.bed |awk '{OFS="\t"; print $1,$2,$3,$4":"$7":"$9,$5,$6}'| \ 130 | bedtools merge -s -d -1 -i stdin -c 4,5,6 -o collapse,collapse,distinct > %s''' % (output_dir, os.path.join(output_dir,'narrow_peak.permutation.bed') ) 131 | os.system( cmd ) 132 | logger.info('end') 133 | return 134 | 135 | 136 | def chunkify(a, n): 137 | """ 138 | Separate a list (a) into consecutive n chunks. 139 | Returns the chunkified index 140 | """ 141 | k, m = len(a) / n, len(a) % n 142 | return (a[int(i * k + min(i, m)):int((i + 1) * k + min(i + 1, m))] for i in range(n)) 143 | 144 | 145 | def unpack_tid_to_qval(compact): 146 | """ 147 | Unpacks the returned values from multi-processing. 148 | """ 149 | unique_tid_to_qval=defaultdict(list) 150 | combined_tid_to_qval=defaultdict(list) 151 | for item in compact: 152 | unique, combined = item[0], item[1] 153 | if combined is None: 154 | combined_tid_to_qval=None 155 | for tid in unique: 156 | if len(unique[tid]) > 0: 157 | unique_tid_to_qval[tid] = unique[tid] 158 | else: 159 | for tid in combined: 160 | if len(unique[tid])>0: 161 | unique_tid_to_qval[tid]=unique[tid] 162 | if len(combined[tid])>1: 163 | combined_tid_to_qval[tid]=combined[tid] 164 | return unique_tid_to_qval,combined_tid_to_qval 165 | 166 | 167 | def _child_get_permutation_fdr(args): 168 | """ 169 | General permutation wrapper for a list of genes. Gets called by multi-processing generated by Pool() 170 | Returns packed FDRs from each child process. 
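	The single `args` tuple is unpacked as (unibam_file, multibam_file, child_gene_list, gene_annot, pval_cutoff, max_iter, is_stranded, correction_method, seed), so the same function serves both the Pool.map path and the single-thread path.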
171 | """ 172 | (unibam_file, multibam_file, child_gene_list, gene_annot, pval_cutoff, max_iter, is_stranded, correction_method,seed)=args 173 | random.seed(seed) 174 | 175 | unique_tid_to_qval=defaultdict(list) 176 | combined_tid_to_qval = defaultdict( 177 | list) if multibam_file is not None else None 178 | 179 | unibam=pysam.Samfile(unibam_file, 'rb') 180 | multibam=pysam.Samfile(multibam_file, 'rb') if multibam_file is not None else None 181 | 182 | pid = os.getpid() 183 | tot = len(child_gene_list) 184 | 185 | for i in range(len(child_gene_list)): 186 | if not i % 200: 187 | logger.debug('pid %s : %i / %i (%.2f%%)'% (pid, i, tot, float(i)/float(tot)*100)) 188 | gene_name = child_gene_list[i] 189 | gene = gene_annot[gene_name] 190 | chr, start, end, strand, tid = gene[0:5] 191 | unique_reads = read_tid_frag_from_bam(gene, unibam, is_stranded, True) 192 | multi_reads = read_tid_frag_from_bam(gene, multibam, is_stranded, False) if multibam_file is not None else None 193 | 194 | this_unique_to_qval = do_permutation(gene, unique_reads, max_iter, pval_cutoff, correction_method) 195 | this_combined_to_qval = do_permutation(gene, unique_reads+multi_reads, max_iter, pval_cutoff, correction_method) if multibam_file is not None else None 196 | 197 | unique_tid_to_qval[tid].extend(this_unique_to_qval) 198 | if multibam_file is not None: 199 | combined_tid_to_qval[tid].extend(this_combined_to_qval) 200 | unibam.close() 201 | if multibam_file is not None: 202 | multibam.close() 203 | return unique_tid_to_qval, combined_tid_to_qval 204 | 205 | 206 | def do_permutation(transcr, read_transcript, max_iter, pval_cutoff, correction_method): 207 | """ 208 | Permutes the reads along a given gene length, sub-routine that get called by get_permutation_fdr(..). 209 | Returns the locally corrected p-values for each observed height on the given gene. 210 | """ 211 | chr, tstart, tend, strand, tid = transcr[0:5] 212 | tid_length=tend-tstart+1 213 | obs_heights_count=count_pileup_heights(tid_length, read_transcript) 214 | 215 | tid_to_qval=[] 216 | 217 | rand_heights_dist=defaultdict(int) 218 | rand_sum=0 219 | # need to account for the 'observed' data, since permutation tests should never report p-value as 0. 
3/22/16 220 | for i in obs_heights_count: 221 | if i==0: 222 | continue 223 | else: 224 | rand_heights_dist[int(i)]+=1 225 | rand_sum+=1 226 | for B in range(max_iter): 227 | new_heights_count=permutate_heights(tid_length, read_transcript) 228 | for i in new_heights_count: 229 | if i==0: 230 | continue 231 | else: 232 | rand_heights_dist[i]+=1 233 | rand_sum+=1 234 | height_to_pval={} 235 | for h in set(obs_heights_count): 236 | if h < 1: 237 | continue 238 | else: 239 | lefter=0 240 | for j in range(int(h), max(rand_heights_dist)+1): 241 | lefter+=rand_heights_dist[j] 242 | height_to_pval[h]=lefter/float(rand_sum) 243 | pval_list=[] 244 | for i in obs_heights_count: 245 | if i<1: 246 | continue 247 | pval_list.append(height_to_pval[i]) 248 | if len(pval_list)<=1: 249 | return [] 250 | 251 | qval_list=multipletests(pval_list, method='fdr_bh')[1] 252 | #if correction_method==2 or correction_method.lower()=='fdr': 253 | # qval_list=multipletests(pval_list, method='fdr_bh')[1] 254 | #else: 255 | # qval_list=[min(x*(len(set([int(y) for y in height_to_pval if y!=0]))), 1.0) for x in pval_list] 256 | 257 | ind=0 258 | last_height=0 259 | for j in range(len(obs_heights_count)): 260 | this_height=obs_heights_count[j] 261 | if this_height<1: 262 | last_height=0 263 | continue 264 | if qval_list[ind] <= pval_cutoff: 265 | if this_height==last_height: 266 | chr, last_start, last_end, last_strand, last_height, last_qval=tid_to_qval[-1] 267 | tid_to_qval[-1]=[chr, last_start, tstart+j+1, strand, last_height, last_qval] 268 | else: 269 | tid_to_qval.append([chr, tstart+j, tstart+j+1, strand, obs_heights_count[j], qval_list[ind]]) # chr, start, end, strand, height, this_qval 270 | last_height=this_height 271 | ind+=1 272 | return tid_to_qval 273 | 274 | 275 | def heights_to_dist(rand_heights): 276 | """ 277 | sub-routine 278 | """ 279 | rand_heights_dist=defaultdict(int) 280 | rand_sum=0 281 | for new_heights_count in rand_heights: 282 | for i in new_heights_count: 283 | if i==0: 284 | continue 285 | else: 286 | rand_heights_dist[i]+=1 287 | rand_sum+=1 288 | return rand_heights_dist, rand_sum 289 | 290 | 291 | def permutate_heights(tlen, reads): 292 | """ 293 | Sub-routine for do_permutation(...) 294 | Randomly allocate the read locations. 295 | """ 296 | loc_heights=[0] * tlen 297 | for id, pos, read_len, score in reads: 298 | if score<1 and random.random() > score: 299 | continue 300 | rand_pos=random.randint(1, max(1, tlen-read_len)) 301 | for i in range(rand_pos, min(rand_pos + read_len, tlen)): 302 | loc_heights[i]+=1 303 | return loc_heights 304 | 305 | 306 | def count_pileup_heights(tlen, reads): 307 | """ 308 | Sub-routine for do_permutation(...) 309 | Counts the distribution of pile-up heights for a given gene/permutation 310 | """ 311 | loc_heights=[0] * tlen 312 | for id, pos, read_len, score in reads: 313 | for i in range(pos, min(pos+read_len-1, tlen)): 314 | loc_heights[i]+=score 315 | return loc_heights 316 | 317 | 318 | def merge_peaks_broadPeak(transcript_to_qval, merge_size, pval_cutoff): 319 | """ 320 | Merge called peaks on a gene using option 2, 321 | i.e. if two peaks close to each other, region 322 | between two peaks are also called as peaks 323 | Retuns a list of merged peaks. 
324 | """ 325 | peaks=[] 326 | last_qval=[0,1] 327 | for tid in transcript_to_qval: 328 | init=True 329 | for chr, start, end, strand, height, this_qval in transcript_to_qval[tid]: 330 | loc=[chr, str(start), str(end), strand] 331 | this_qval=[height, this_qval] # this_qval=[height, qval] so that when qval=0, we can compare height 332 | if this_qval[1] > pval_cutoff: 333 | continue 334 | if init: 335 | last_qval=this_qval 336 | last_pos=[start, end] 337 | last_loc=loc 338 | last_chr=chr 339 | write_out=False 340 | init=False 341 | continue 342 | if int(start) - int(last_pos[1]) > merge_size: 343 | write_out=True 344 | else: 345 | last_pos=[last_pos[0], end] 346 | last_qval=this_qval if last_qval[0] pval_cutoff: 398 | continue 399 | if init: 400 | last_qval=this_qval 401 | last_pos=[start, end] 402 | last_loc=loc 403 | last_chr=chr 404 | write_out=False 405 | init=False 406 | continue 407 | if last_chr == chr: 408 | if abs( int(start) - int(last_pos[0]) ) > merge_size: 409 | write_out=True 410 | elif last_qval[0] < this_qval[0]: 411 | last_pos=[start, end] 412 | last_qval=this_qval 413 | last_loc=loc 414 | write_out=False 415 | else: 416 | write_out=True 417 | 418 | if write_out and last_qval[1] < pval_cutoff: 419 | #peaks[last_loc]=last_qval 420 | peaks.append([last_loc, last_qval, tid]) 421 | last_qval=this_qval 422 | last_pos=[start, end] 423 | last_loc=loc 424 | last_chr=chr 425 | write_out=False 426 | if last_qval[1] < pval_cutoff: 427 | peaks.append([last_loc, last_qval, tid]) 428 | return peaks 429 | 430 | 431 | def extend_peak_region(loc, target_len): 432 | """ 433 | Extends peak symmetrically if peak is smaller than target_len. 434 | """ 435 | chr, start, end, strand = loc.split('\t') 436 | start = int(start) 437 | end = int(end) 438 | old_len = end - start 439 | if old_len > target_len: 440 | return loc 441 | else: 442 | center = int((start + end)/2) 443 | start = center - int(target_len /2) 444 | end = center + int(target_len/2) 445 | return '\t'.join([chr, str(start), str(end), strand]) 446 | 447 | 448 | def read_tid_frag_from_bam(tid, bamfile, is_stranded, is_unique): 449 | """ 450 | Use pysam to fetch reads info for a given gene and its loci. 451 | Returns reads, read weights and its mapped loci. 
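	Each element of the returned list is [qname, offset_from_gene_start, read_length, score], where score is 1 for unique reads and the re-aligner's 'AS' weight for multi-mapped reads.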
452 | """ 453 | tid_reads=[] 454 | #gene, chr, strand, start, end=tid 455 | chr, start, end, strand, gene = tid[0:5] 456 | if strand=='-': 457 | is_reverse=True 458 | else: 459 | is_reverse=False 460 | reads=[x for x in bamfile.fetch(chr, int(start), int(end)) if x.is_reverse==is_reverse or not is_stranded] 461 | reads=[x for x in reads if x.pos>=int(start) and x.pos<=int(end)] 462 | for read in reads: 463 | if is_unique: 464 | try: 465 | opt_NH=read.opt('NH') 466 | if opt_NH > 1: 467 | continue 468 | except: 469 | pass 470 | score=1 471 | else: 472 | try: 473 | opt_AS=read.opt('AS') 474 | if isinstance(opt_AS, float): 475 | score=opt_AS 476 | else: 477 | continue 478 | except: 479 | continue 480 | try: 481 | read_length = read.opt('RL') 482 | except: 483 | read_length = read.positions[-1] - read.positions[0] + 1 484 | 485 | if (not 'N' in read.cigarstring) and \ 486 | (read.pos-start>=0 and read_length<500): # to avoid junction reads 487 | tid_reads.append([read.qname, read.pos-start, read_length, score]) 488 | return tid_reads 489 | 490 | 491 | -------------------------------------------------------------------------------- /CLAM/preprocessor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """This preprocessing script is part of the CLAM pipeline. 4 | 5 | This subcommand (new v1.1) will prepare the input files for CLAM pipeline. As of the current version (v1.1), it looks for 6 | reads passing QC, splits the input bam file by sorting them into `unique.sorted.bam` and `multi.sorted.bam`, 7 | and adding an additional tag "RT" (short for Read Tag) to each alignment based which read tagger function the user supplied. 8 | 9 | Note that you can also run `CLAM realigner` directly, which will call `preprocessor` and automatically determine 10 | if `preprocessor` has been called in the output folder. 11 | 12 | If you don't want to run `realigner`, you can also run `peakcaller` directly after `preprocessor`. 13 | 14 | Example run: 15 | ``` 16 | CLAM preprocessor -i path/to/input/Aligned.out.bam -o path/to/clam/outdir/ --read-tagger-method median 17 | ``` 18 | Author: 19 | Zijun Zhang 20 | 21 | Tested under python 2.7 22 | """ 23 | from . 
import config 24 | __version__ = config.__version__ 25 | 26 | import os 27 | import sys 28 | import pysam 29 | import numpy as np 30 | from collections import defaultdict 31 | #from tqdm import tqdm 32 | import logging 33 | import datetime 34 | import bisect 35 | import argparse as ap 36 | import inspect 37 | import hashlib 38 | 39 | 40 | logger = logging.getLogger('CLAM.Preprocessor') 41 | 42 | 43 | def alignment_mutation(x, mut_ref, mut_obs): 44 | """DOCSTRING 45 | Need to read reference genome 46 | NotImplemented 47 | """ 48 | raise NotImplementedError() 49 | 50 | 51 | def read_tagger_collection(alignment, method='median', **kwargs): 52 | """ tag a read alignment to a genomic locus 53 | Args: 54 | Returns: 55 | """ 56 | tagger_func = { 57 | # center of the read; must dicard junction reads 58 | 'median': lambda x: -1 if 'N' in x.cigarstring else int(np.median(x.positions))+1, 59 | # start site of the read; truncation in iCLIP/eCLIP 60 | 'start': lambda x: -1 if 'N' in x.cigarstring else x.positions[-1]+1 if x.is_reverse else x.positions[0]+1, 61 | # extend from 5' site to certain length; need kwargs 62 | 'extend': lambda x: -1 if 'N' in x.cigarstring else x.positions[-1]-kwargs['ext_len'] if x.is_reverse else x.positions[0]+kwargs['ext_len'], 63 | # mutation tag a specific mutation type 64 | 'mutation': lambda x: alignment_mutation(x, kwargs['mut_ref'], kwargs['mut_obs']) 65 | } 66 | try: 67 | tag=tagger_func[method](alignment) 68 | except: 69 | tag=-1 70 | return tag 71 | 72 | 73 | def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_method, lib_type): 74 | """Pre-processing function for cleaning up the input bam file. 75 | Args: 76 | Returns: 77 | """ 78 | # logging the parameter values 79 | frame = inspect.currentframe() 80 | args, _, _, values = inspect.getargvalues(frame) 81 | msg = 'Params:\n' 82 | for i in args: 83 | msg += "%s = %s \n"%(i, values[i]) 84 | logger.info(msg) 85 | read_tagger=lambda x: read_tagger_collection(x, method=read_tagger_method) 86 | logger.info('filtering input bam') 87 | 88 | in_bam = pysam.Samfile(filename,'rb') 89 | # unique read bam 90 | ubam_fn = os.path.join(out_dir, 'unique.bam') 91 | sorted_ubam_fn = os.path.join(out_dir, 'unique.sorted.bam') 92 | ubam=pysam.Samfile(ubam_fn, 'wb', template=in_bam) 93 | unique_counter = 0 94 | 95 | # multi-read bam 96 | mbam_fn = os.path.join(out_dir, 'multi.bam') 97 | sorted_mbam_fn = os.path.join(out_dir, 'multi.sorted.bam') 98 | mbam=pysam.Samfile(mbam_fn, 'wb', template=in_bam) 99 | mread_set = set() 100 | 101 | # splitting unique and multi- reads 102 | # and add the read taggers we need 103 | if not \ 104 | (os.path.isfile( os.path.join(out_dir,'unique.sorted.bam') ) and \ 105 | os.path.isfile( os.path.join(out_dir,'multi.sorted.bam')) ): 106 | 107 | #for read in tqdm(in_bam): 108 | counter = 0 109 | for read in in_bam: 110 | # poor man's progress bar 111 | counter += 1 112 | if not counter % 10**6: 113 | logger.debug('tagged %i alignments'%counter) 114 | read_tag = read_tagger(read) 115 | ## skip reads with unassigned tagger 116 | if read_tag==-1: 117 | continue 118 | read.tags += [('RT', read_tag)] ## add the tag 119 | 120 | tagged_read = pysam.AlignedSegment() 121 | tagged_read.query_name = read.query_name 122 | tagged_read.query_sequence = 'N' 123 | tagged_read.flag = read.flag 124 | tagged_read.reference_id = read.reference_id 125 | tagged_read.reference_start = read_tag - 1 # 0-based leftmost coordinate 126 | tagged_read.mapping_quality = read.mapping_quality 127 | tagged_read.cigar 
= ((0, 1),) 128 | tagged_read.template_length = read.template_length 129 | tagged_read.query_qualities = pysam.qualitystring_to_array("<") 130 | tagged_read.tags = read.tags 131 | read_len = sum([i[1] for i in read.cigar if i[0] == 0]) 132 | tagged_read.tags += [('RL', read_len)] 133 | if len(read.query_sequence) >= 32: 134 | tagged_read.tags += [('SQ', 135 | hashlib.md5(read.query_sequence.encode('utf-8')).hexdigest())] 136 | else: 137 | tagged_read.tags += [('SQ', read.query_sequence)] 138 | 139 | 140 | # add lib_type check 141 | if lib_type != "unstranded": 142 | tagged_read.is_reverse = (read.is_reverse) ^ (lib_type!="sense") 143 | 144 | if read.is_secondary or (read.has_tag('NH') and read.opt("NH")>1): 145 | #try: 146 | if read.opt("NH") < max_hits: 147 | mbam.write(tagged_read) 148 | mread_set.add(read.qname) 149 | #except KeyError: 150 | # #print read 151 | # raise Exception('%s: missing NH tag when is_secondary=%s'%(read.qname,read.is_secondary)) 152 | else: 153 | ubam.write(tagged_read) 154 | unique_counter += 1 155 | 156 | ubam.close() 157 | mbam.close() 158 | 159 | # sorting 160 | pysam.sort('-m', '4G', '-@', '3', '-T', os.path.dirname(sorted_ubam_fn), '-o', sorted_ubam_fn, ubam_fn) 161 | os.remove(ubam_fn) 162 | pysam.sort('-m', '4G', '-@', '3', '-T', os.path.dirname(sorted_mbam_fn), '-o', sorted_mbam_fn, mbam_fn) 163 | os.remove(mbam_fn) 164 | pysam.index(sorted_ubam_fn) 165 | pysam.index(sorted_mbam_fn) 166 | 167 | # log the statistics 168 | multi_counter = len(mread_set) 169 | logger.info( 170 | 'Unique reads = %s; ' % unique_counter + \ 171 | 'Multi reads = %s (%.2f %%)' % \ 172 | ( multi_counter, float(multi_counter)/(multi_counter+unique_counter)*100 ) 173 | ) 174 | else: 175 | logger.info('found previously sorted tag-bam. checking if need collapsing.') 176 | 177 | # filter redundant tags if turned on 178 | if max_tags>0: 179 | logger.info('collapsing unique') 180 | filter_bam_maxtags(os.path.join(out_dir, 'unique.sorted.collapsed.bam'), os.path.join(out_dir, 'unique.sorted.bam'), max_tags) 181 | logger.info('collapsing multi') 182 | filter_bam_maxtags(os.path.join(out_dir, 'multi.sorted.collapsed.bam'), os.path.join(out_dir, 'multi.sorted.bam'), max_tags) 183 | 184 | in_bam.close() 185 | return 186 | 187 | 188 | def collapse_stack(stack, collapse_dict, max_tags): 189 | """DOCSTRING 190 | Args 191 | Returns 192 | """ 193 | new_alignment_list = [] 194 | new_alignment_dict = defaultdict(list) 195 | for aln in stack: 196 | new_alignment_dict[aln.tags[-1][1]].append(aln) 197 | 198 | # TODO 2017.10.21: 199 | # further collapse `new_alignment_dict` 200 | # based on degeneracy and/or read tags 201 | 202 | for seq in new_alignment_dict: 203 | this_alignment_qname_list = [x.qname for x in new_alignment_dict[seq] ] 204 | is_collapsed = [True if x in collapse_dict else False for x in this_alignment_qname_list] 205 | ## if any of the alignment is collapsed before, 206 | ## we require all of them to be collapsed 207 | if any(is_collapsed): 208 | assert all(is_collapsed) 209 | target_alignment_qname = collapse_dict[this_alignment_qname_list[0]][0:max_tags] 210 | assert len(collapse_dict[this_alignment_qname_list[0]]) <= max_tags 211 | target_alignment = [new_alignment_dict[seq][this_alignment_qname_list.index(x)] for x in target_alignment_qname] 212 | else: 213 | target_alignment = new_alignment_dict[seq][0:max_tags] 214 | for aln_qname in this_alignment_qname_list: 215 | collapse_dict[aln_qname] = [x.qname for x in target_alignment] 216 | for read in target_alignment: 217 | 
read.tags=read.tags[:-1] 218 | new_alignment_list.append( read ) 219 | return new_alignment_list, collapse_dict 220 | 221 | 222 | def filter_bam_maxtags(obam_fn, ibam_fn, max_tags=1): 223 | """DOCSTRING 224 | Args 225 | Returns 226 | """ 227 | assert max_tags>0 228 | # prepare files 229 | ibam = pysam.Samfile(ibam_fn, 'rb') 230 | obam = pysam.Samfile(obam_fn, 'wb', template=ibam) 231 | # init 232 | collapse_dict = defaultdict(list) 233 | chr_list=[x['SN'] for x in ibam.header['SQ']] 234 | input_counter = 0 235 | output_counter = 0 236 | 237 | for chr in chr_list: 238 | # empty stack for each new chromosome 239 | stack = [] 240 | last_pos = -1 241 | for read in ibam.fetch(chr): 242 | input_counter += 1 243 | if not (input_counter % (5*(10**6)) ): 244 | logger.debug('collapsed %i alignments'%input_counter) 245 | if read.positions[0] > last_pos: 246 | new_alignment_list, collapse_dict = collapse_stack(stack, collapse_dict, max_tags) 247 | output_counter += len(new_alignment_list) 248 | last_pos = read.positions[0] 249 | stack = [read] 250 | for new_alignment in new_alignment_list: 251 | #new_alignment.query_sequence = '*' 252 | #new_alignment.query_qualities = '0' 253 | _ = obam.write(new_alignment) 254 | else: 255 | stack.append(read) 256 | new_alignment_list, collapse_dict = collapse_stack(stack, collapse_dict, max_tags) 257 | output_counter += len(new_alignment_list) 258 | last_pos = read.positions[0] 259 | for new_alignment in new_alignment_list: 260 | #new_alignment.query_sequence = '*' 261 | #new_alignment.query_qualities = '0' 262 | _ = obam.write(new_alignment) 263 | ibam.close() 264 | obam.close() 265 | #os.rename(obam_fn, ibam_fn) 266 | #pysam.sort(obam_fn) 267 | pysam.index(obam_fn) 268 | logger.info('Input = %s; Output = %s; Redundancy = %.2f'%(input_counter,output_counter, 1-float(output_counter)/input_counter)) 269 | return 270 | 271 | 272 | 273 | 274 | def parser(args): 275 | """DOCSTRING 276 | Args 277 | Returns 278 | """ 279 | try: 280 | in_bam = args.in_bam 281 | out_dir = args.out_dir 282 | if not os.path.isdir(out_dir): 283 | os.mkdir(out_dir) 284 | tag_method = args.tag_method 285 | max_hits = args.max_hits 286 | ## Note: if specified max_tags, need pre-sorted bam 287 | max_tags = args.max_tags 288 | lib_type = args.lib_type 289 | 290 | #logger = logging.getLogger('CLAM.Preprocessor') 291 | logger.info('start') 292 | logger.info('run info: %s'%(' '.join(sys.argv))) 293 | 294 | filter_bam_multihits(in_bam, max_hits=max_hits, max_tags=max_tags, out_dir=out_dir, 295 | read_tagger_method=tag_method, lib_type=lib_type) 296 | 297 | logger.info('end') 298 | except KeyboardInterrupt: 299 | sys.exit(0) 300 | return 301 | -------------------------------------------------------------------------------- /CLAM/realigner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """This re-aligner script is part of the CLAM pipeline. 4 | 5 | This subcommand will run expectation-maxmization to assign the multi-mapped reads in a probablistic framework. 6 | More details about the EM model is described in our NAR paper. 7 | 8 | Note when `--retag` is specified, `realigner` will re-run `preprocessor` regardless; otherwise, it will use 9 | the prepared files in `outdir` if available. 10 | 11 | Example run: 12 | ``` 13 | CLAM realigner -i path/to/input/Aligned.out.bam -o path/to/clam/outdir/ --read-tagger-method start --retag 14 | ``` 15 | Author: 16 | Zijun Zhang 17 | 18 | Tested under python 2.7 19 | """ 20 | 21 | from . 
import config 22 | __version__ = config.__version__ 23 | 24 | import os 25 | import sys 26 | import pysam 27 | import numpy as np 28 | from collections import defaultdict, deque 29 | #from tqdm import tqdm 30 | import logging 31 | import datetime 32 | import bisect 33 | import argparse as ap 34 | import inspect 35 | from .preprocessor import * 36 | 37 | logger = logging.getLogger('CLAM.Realigner') 38 | 39 | class Bit: 40 | """ Binary Indexed Tree to store values in genomic intervals. 41 | Implementation modified from http://www.geeksforgeeks.org/binary-indexed-tree-or-fenwick-tree-2/ 42 | Args: 43 | n (int): length of the interval to construct 44 | Returns: 45 | a BIT object with `add` and `sum` method over arbitrary sub-intervals 46 | with O(log(n)) time 47 | """ 48 | 49 | def __init__(self, n): 50 | sz = 1 51 | while n >= sz: 52 | sz *= 2 53 | self.size = sz 54 | self.array_size = n 55 | self.data = [0]*sz 56 | 57 | def sum(self, i): 58 | assert i >= 0 59 | if i==0: 60 | return 0 61 | if i > self.array_size: 62 | i = self.array_size 63 | s = 0 64 | while i > 0: 65 | s += self.data[i] 66 | i -= i & -i 67 | return s 68 | 69 | def add(self, i, x): 70 | assert i > 0 71 | while i < self.size: 72 | self.data[i] += x 73 | i += i & -i 74 | 75 | 76 | def construct_BIT_track(subgraph, read_to_locations, ubam, unstranded=False): 77 | """Construct BIT for each genomic region / node. 78 | Args: 79 | subgraph (list): a list of node names 80 | read_to_locations (dict): 81 | Returns: 82 | node_track (dict/BIT): a node-track dictionary; node_name => BIT 83 | multi_reads_weights (dict): a dictionary for multi-mapped reads; read_qname => node => [score, locus] 84 | """ 85 | node_track = {} 86 | total_len = 0 87 | 88 | # initialized BIT tracks, add mreads to the tracks, 89 | # and keep a dict of read scores 90 | multi_reads_weights = defaultdict(dict) 91 | obs_reads = read_to_locations.keys() 92 | for read_x_qname in obs_reads: 93 | read_x_nodes = read_to_locations[read_x_qname] 94 | read_x_score = 1.0 / len(read_x_nodes) 95 | for node in read_x_nodes: 96 | chr, strand, start, end = node.split(':') 97 | start, end = int(start), int(end) 98 | if not node in node_track: 99 | this_len = end - start + 1 100 | node_track[node] = Bit(this_len) 101 | read_x_tag = read_x_nodes[node].opt('RT') 102 | node_locus = read_x_tag - start + 1 103 | node_track[node].add(node_locus, read_x_score) 104 | multi_reads_weights[read_x_qname][node]=[read_x_score, node_locus] 105 | #del read_to_locations[read_x_qname][node] 106 | #del read_to_locations[read_x_qname] 107 | 108 | # now add ureads by fetching from ubam; 109 | # we don't need to keep track of them, just add the weights 110 | for node in node_track: 111 | chr, strand, start, end = node.split(':') 112 | start, end = int(start), int(end) 113 | is_reverse = True if strand=='-' else False 114 | uread_tagger = [x.opt('RT') for x in ubam.fetch(chr, start, end) \ 115 | if unstranded or x.is_reverse==is_reverse] 116 | for uread_x_tagger in uread_tagger: 117 | if uread_x_tagger>=start and uread_x_tagger<=end: 118 | node_locus = uread_x_tagger - start + 1 119 | node_track[node].add(node_locus, 1) 120 | 121 | return node_track, multi_reads_weights 122 | 123 | 124 | 125 | def run_EM(node_track, multi_reads_weights, w=50, epsilon=1e-6, max_iter=100, verbose=True): 126 | """ EM implementation for re-assigning multi-mapped reads, given the 127 | compatibility matrix of a subgraph. 128 | Args: 129 | node_track (dict): dict. 
of BIT returned from `construct_BIT_track` 130 | multi_reads_weights (dict): dict of mread qname and locus returned from `construct_BIT_track` 131 | w (int): window size for search vicinity reads 132 | epsilon (float): a small number for testing convergence between iterations 133 | max_iter (int): maximum iterations of EM 134 | verbose (bool, options): prints status in verbose mode 135 | Returns: 136 | multi_reads_weights (dict): the mread weight after EM 137 | """ 138 | iter = 1 139 | residue = 1 140 | #n_est = sum([1. for r in multi_reads_weights for n in multi_reads_weights[r] ]) 141 | n_est = sum([1. for r in multi_reads_weights ]) 142 | while iter < max_iter and residue > epsilon: 143 | residue = 0 144 | reweight=defaultdict(dict) 145 | ## calculate re-distribute probability; M-step 146 | for read in multi_reads_weights: 147 | for nd in multi_reads_weights[read]: 148 | track_len=node_track[nd].array_size 149 | old_score, read_tag = multi_reads_weights[read][nd] 150 | reweight[read][nd] = max( 0, node_track[nd].sum(min(track_len, read_tag + w)) - node_track[nd].sum(max(0,read_tag - w)) ) 151 | ## update track by expectation; E-step 152 | for read in reweight: 153 | dn=sum([reweight[read][x] for x in reweight[read]]) 154 | if dn==0: 155 | logger.debug('Error: no read weight found @ %s.'%read ) 156 | dn=1 157 | for nd in reweight[read]: 158 | old_score, read_tag = multi_reads_weights[read][nd] 159 | new_score = reweight[read][nd] / float(dn) 160 | node_track[nd].add(read_tag, new_score - old_score) 161 | residue += (old_score - new_score)**2 162 | multi_reads_weights[read][nd][0] = new_score 163 | residue /= n_est 164 | if verbose and (not iter % 10 or iter == max_iter): 165 | logger.debug('Iter %d, residue = %f' % (iter, residue)) 166 | iter += 1 167 | return multi_reads_weights 168 | 169 | 170 | def build_read_cluster(alignment, chr_dict, location_to_reads, genomic_cluster_dict, unstranded=False, winsize=50): 171 | """Given an alignment, find its genomic cluster, and all other mreads 172 | in that cluster 173 | Args: 174 | alignment (pysam.AlignedSegment): pysam alignment object 175 | chr_dict (dict): a dict of chrom name and sizes 176 | location_to_reads (dict): stores all mreads indexed by aligned locus; cluster name => mread alignments 177 | genomic_cluster_dict (dict): stores genomic clusters; chrom => [intv1, intv2, ..] 178 | unstranded (bool): if true, don't use the strand info in alignment 179 | winsize (int): window size for search ureads 180 | Returns: 181 | genomic_cluster (tuple): the target chrom and coordinates after expanding the window size 182 | this_mread_dict (dict): dict. of mread alignments in the target cluster indexed by read_qname 183 | discarded_mread_alignments (list): discarded mread alignments because of multiple occurences within one cluster 184 | """ 185 | chr_list = chr_dict['name'] 186 | chr_size = chr_dict['size'] 187 | chrom = chr_list[alignment.reference_id] 188 | chr_len = chr_size[alignment.reference_id] 189 | site = alignment.opt('RT') 190 | is_reverse = alignment.is_reverse 191 | strand = '+' if unstranded or is_reverse==False else '-' 192 | this_mread_dict = {} 193 | this_mread_dict_set = defaultdict(set) 194 | discarded_mread_alignments = [] 195 | 196 | ## note to me: need to be more careful with 197 | ## one read mapped to *multiple-locations* within one cluster 198 | ## currently tossing away those alignments.. 
(in `discarded_mread_alignments`) 199 | 200 | ## check junction reads; should be filtered out in tagging step 201 | #if 'N' in alignment.cigarstring: 202 | # return None, None, [alignment] 203 | 204 | ## find the corresponding genomic cluster from `genomic_cluster_dict` 205 | chr_strand = chrom+':'+strand 206 | idx = bisect.bisect_right(genomic_cluster_dict[chr_strand], site) 207 | if not idx%2: 208 | print(alignment) 209 | raise Exception('%s falls out of region %s'%(alignment.qname, chr_strand+':'+str(site)) ) 210 | start = genomic_cluster_dict[chr_strand][idx-1] - winsize 211 | start = 1 if start<1 else start 212 | end = genomic_cluster_dict[chr_strand][idx] + winsize 213 | end = chr_len-1 if end>=chr_len else end 214 | genomic_cluster = (chrom, strand, start, end) 215 | 216 | ## fetch the reads 217 | cluster_name = ':'.join([chrom, strand, str(genomic_cluster_dict[chr_strand][idx-1]), str(genomic_cluster_dict[chr_strand][idx])]) 218 | if not cluster_name in location_to_reads: 219 | raise Exception("cannot find cluster '%s' associated with read '%s' in `location_to_reads` of len %i"%(cluster_name, alignment.qname, len(location_to_reads))) 220 | mread_list = location_to_reads[cluster_name] 221 | #print(alignment, cluster_name) 222 | for x in mread_list: 223 | this_mread_dict_set[x.qname].add(x) 224 | del location_to_reads[cluster_name] 225 | 226 | ## find other mreads in this cluster 227 | for read_x_qname in this_mread_dict_set: 228 | if len(this_mread_dict_set[read_x_qname])>1: 229 | discarded_mread_alignments.extend( [ x for x in list(this_mread_dict_set[read_x_qname]) ]) 230 | else: 231 | this_mread_dict[read_x_qname] = list(this_mread_dict_set[read_x_qname])[0] 232 | 233 | return genomic_cluster, this_mread_dict, discarded_mread_alignments 234 | 235 | 236 | def construct_subgraph(location_to_reads, read_qname, mread_dict, processed_mreads, chr_dict, genomic_cluster_dict, winsize=50, unstranded=False): 237 | """Given a mread_qname, find exhaustively all other connected mreads. 238 | Args: 239 | location_to_reads (dict): genomic cluster => Alignment 240 | read_qname (str): target read ID 241 | mread_dict (dict): stores all read ID => Alignment 242 | processed_mreads (set): 243 | chr_dict (dict): map ref_id to chrom_name, chrom_size 244 | genomic_cluster (dict): chrom:strand => [interval1, interval2, ..] 245 | Returns: 246 | read_to_locations (dict): collect a subset of mread alignments in the same 247 | subgraph starting with read_qname 248 | processed_mreads (set): record all processed mread_qname to avoid re-processing 249 | """ 250 | # record of processed alignments only need kept on within-subgraph level 251 | processed_mread_alignments = set() 252 | counter = 0 253 | # a list of `pysam.AlignedSegment` objects 254 | # note that all taggers are already stored in `pysam.AlignedSegment.opt('RT')` 255 | read_aln_list = [x for x in mread_dict[read_qname]] 256 | processed_mreads.add(read_qname) 257 | read_to_locations = defaultdict(dict) # read_qname -> {node_name1:segment1, node_name2:segment2} 258 | 259 | # enumerate all connected components 260 | while True: 261 | counter+=1; #print "%i: %i"%(counter, len(read_aln_list)) 262 | next_read_aln_list = [] 263 | 264 | #gen = read_aln_list if len(read_aln_list)<200 else tqdm(read_aln_list) 265 | gen = read_aln_list 266 | for alignment in gen: 267 | ## build a node for this mread alignment 268 | ## (if not already processed, i.e. 
built before) 269 | if alignment in processed_mread_alignments: 270 | continue 271 | 272 | genomic_cluster, this_mread_dict, discarded_mread_list = \ 273 | build_read_cluster(alignment, chr_dict, 274 | location_to_reads, genomic_cluster_dict, 275 | unstranded=unstranded, winsize=winsize) 276 | ## record those discarded alignments/reads 277 | ## note: we mark discarded_mread as processed as well, 278 | ## so as not to create a bias to less clustered regions. 279 | # THIS IS PYTHON3 INCOMPATIBLE 280 | #_ = map(processed_mread_alignments.add, discarded_mread_list) 281 | #_ = map(processed_mreads.add, [x.qname for x in discarded_mread_list]) 282 | for x in discarded_mread_list: 283 | processed_mread_alignments.add(x) 284 | for x in discarded_mread_list: 285 | processed_mreads.add(x.qname) 286 | if genomic_cluster is None: # this cluster is invald (only double-mappers) 287 | continue 288 | 289 | ## update read_to_locations 290 | node_name = ':'.join([str(x) for x in genomic_cluster]) 291 | #if node_name in subgraph: 292 | #logger.debug("I revisited '%s' at read '%s'."%(node_name, read_qname)) 293 | #print("I revisited '%s' at read '%s'."%(node_name, read_qname)) 294 | #break 295 | for x_qname in this_mread_dict: 296 | read_to_locations[x_qname].update({node_name : this_mread_dict[x_qname]}) 297 | 298 | ## then add new alignments(edges) to generate connected nodes 299 | ## in the next iteration 300 | # THIS IS PYTHON3 INCOMPATIBLE 301 | #_ = map(processed_mread_alignments.add, this_mread_dict.values()) 302 | for x in list(this_mread_dict.values()): 303 | processed_mread_alignments.add(x) 304 | for read_x_qname in this_mread_dict: 305 | if read_x_qname in processed_mreads: 306 | continue 307 | x_aln_list = [aln for aln in mread_dict[read_x_qname] if not aln in processed_mread_alignments] 308 | next_read_aln_list.extend(x_aln_list) 309 | 310 | ## .. and record to processed reads since we have generated 311 | ## the nodes for them 312 | #_ = map(processed_mreads.add, this_mread_dict.keys()) # this is python3 incompatible 313 | for x in list(this_mread_dict.keys()): 314 | processed_mreads.add(x) 315 | 316 | # if no more connected nodes can be found, break loop 317 | if len(next_read_aln_list)==0: 318 | break 319 | read_aln_list = next_read_aln_list 320 | return read_to_locations, processed_mreads 321 | 322 | 323 | def get_genomic_clusters(mbam, winsize=50, unstranded=False): 324 | """Parsing the mbam to cluster the mread, and construct interval=>alignment. 325 | Using the same object in difference references, and just keep one copy of 326 | the mread-alignments to minimize memory usage. 327 | Args: 328 | mbam (pysam.Samfile): multi-read bam file handler 329 | winsize (int): window size for search mreads 330 | unstranded (bool): if turned on, all reads will be pushed to forward strand 331 | Returns: 332 | genomic_cluster_dict (dict): chrom:+/- => [intv1, intv2, ..] 333 | mread_dict (dict): read_qname => [aln1, aln2, ..] 334 | location_to_reads (dict): chrom:strand:start:end => [read1_aln, real2_aln, ..] 335 | """ 336 | # chrom:+/- => [intv1_1, intv1_2, intv2_1, intv2_2] 337 | genomic_cluster_dict = defaultdict(list) 338 | # read_qname => [aln1, aln2, ..] 
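	# An illustrative note on the flattened layout used here (values are made up):
	# `genomic_cluster_dict[chrom:strand]` stores cluster boundaries as one flat,
	# sorted list [start1, end1, start2, end2, ...], so that
	# `bisect.bisect_right(boundaries, site)` returns an odd index exactly when
	# `site` falls inside a cluster; that is the `idx % 2` check in
	# `build_read_cluster`. For example:
	#   boundaries = [100, 151, 300, 381]      # clusters spanning 100..151 and 300..381
	#   bisect.bisect_right(boundaries, 120)   # -> 1, odd: inside the first cluster
	#   bisect.bisect_right(boundaries, 200)   # -> 2, even: between clusters
	# (the per-read alignment lists themselves are collected in `mread_dict` just below)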
339 | mread_dict = defaultdict(list) 340 | # chrom:+/-:start:end => [read1_aln, read2_aln,] 341 | location_to_reads = defaultdict(list) 342 | chr_list=[x['SN'] for x in mbam.header['SQ']] 343 | chr_size=[x['LN'] for x in mbam.header['SQ']] 344 | chr_dict = {'name':chr_list, 'size':chr_size} 345 | logger.info('finding genomic clusters') 346 | for chrom in chr_list: 347 | ## initialze the placeholder for current positive/negative strand clusters 348 | ## pos/neg: [start, end, tag_counter] 349 | cur_cluster = {'+':[0,0,0], '-':[0,0,0]} 350 | cur_cluster_aln = {'+':[], '-':[]} 351 | for read_alignment in mbam.fetch(chrom): 352 | ## should filter out junction reads in tagging step 353 | #if 'N' in read_alignment.cigarstring: 354 | # continue 355 | ## add current alignment to mread_dict 356 | mread_dict[read_alignment.qname].append(read_alignment) 357 | site = read_alignment.opt('RT') 358 | strand = '-' if read_alignment.is_reverse and not unstranded else '+' 359 | ### if this read is within the window size 360 | if site <= cur_cluster[strand][1]+winsize: 361 | if site < cur_cluster[strand][0]: 362 | cur_cluster[strand][0] = site 363 | if site > cur_cluster[strand][1]: 364 | cur_cluster[strand][1] = site 365 | cur_cluster[strand][2] += 1 366 | cur_cluster_aln[strand].append(read_alignment) 367 | ### otherwise, push the current cluster to `genomic_cluster_dict` 368 | else: 369 | if cur_cluster[strand][2] > 0: 370 | genomic_cluster_dict[chrom+':'+strand].extend( 371 | [ 372 | cur_cluster[strand][0], 373 | cur_cluster[strand][1]+1 374 | ] 375 | ) 376 | genomic_cluster_str = ':'.join([chrom, strand, str(cur_cluster[strand][0]), str(cur_cluster[strand][1]+1) ]) 377 | location_to_reads[genomic_cluster_str].extend(cur_cluster_aln[strand]) 378 | cur_cluster[strand] = [site, site+1, 1] 379 | cur_cluster_aln[strand] = [read_alignment] 380 | ## remember to push the last genomic cluster to dict 381 | if cur_cluster['+'][2] > 0: 382 | genomic_cluster_dict[chrom+':+'].extend([cur_cluster['+'][0],cur_cluster['+'][1]+1]) 383 | genomic_cluster_str = ':'.join([chrom, '+', str(cur_cluster['+'][0]), str(cur_cluster['+'][1]+1) ]) 384 | location_to_reads[genomic_cluster_str].extend(cur_cluster_aln['+']) 385 | if cur_cluster['-'][2] > 0: 386 | genomic_cluster_dict[chrom+':-'].extend([cur_cluster['-'][0],cur_cluster['-'][1]+1]) 387 | genomic_cluster_str = ':'.join([chrom, '-', str(cur_cluster['-'][0]), str(cur_cluster['-'][1]+1) ]) 388 | location_to_reads[genomic_cluster_str].extend(cur_cluster_aln['-']) 389 | 390 | return genomic_cluster_dict, mread_dict, location_to_reads 391 | 392 | 393 | 394 | def realigner(in_bam, out_dir, max_hits=100, max_tags=-1, read_tagger_method='median', 395 | winsize=50, unstranded=False, retag=False, lib_type="sense"): 396 | """The main entry for CLAM-realigner. 397 | 398 | Args: 399 | in_bam (str): filepath for input bam 400 | out_dir (str): filepath for CLAM output folder 401 | max_hits (int): maximum number of aligned loci allowed for mreads 402 | max_tags (int): maximum number of identical alignments allowed for each 403 | genomic locus, more amount will be collapsed; -1 is no collapsing 404 | read_tagger_method (str): the tagger function type 405 | winsize (int): window size 406 | unstranded (bool): ignore alignment strand info if turned on 407 | retag (bool): force to call `preprocessor` to process `in_bam` if turned on 408 | lib_type (str): specifies if the expected read alignment strand is `sense` with 409 | transcript strand, or `antisense`, or `unstranded`. 
410 | 411 | Returns: 412 | None 413 | """ 414 | # logging the parameter values 415 | frame = inspect.currentframe() 416 | args, _, _, values = inspect.getargvalues(frame) 417 | msg = 'Params:\n' 418 | for i in args: 419 | msg += "%s = %s \n"%(i, values[i]) 420 | logger.info(msg) 421 | # preprocessing 422 | if retag or not ( 423 | os.path.isfile(os.path.join(out_dir,'unique.sorted.bam')) and \ 424 | os.path.isfile(os.path.join(out_dir,'multi.sorted.bam')) \ 425 | ) : 426 | filter_bam_multihits(in_bam, max_tags=max_tags, max_hits=max_hits, out_dir=out_dir, read_tagger_method=read_tagger_method, 427 | lib_type=lib_type) 428 | else: 429 | logger.info("found existing bams; skipped tagging.") 430 | 431 | # file handlers 432 | if max_tags>0: 433 | mbam = pysam.Samfile(os.path.join(out_dir, 'multi.sorted.collapsed.bam'),'rb') 434 | ubam = pysam.Samfile(os.path.join(out_dir, 'unique.sorted.collapsed.bam'),'rb') 435 | else: 436 | mbam = pysam.Samfile(os.path.join(out_dir, 'multi.sorted.bam'),'rb') 437 | ubam = pysam.Samfile(os.path.join(out_dir, 'unique.sorted.bam'),'rb') 438 | obam = pysam.Samfile(os.path.join(out_dir, 'realigned.bam'), 'wb', template = mbam) 439 | chr_list=[x['SN'] for x in ubam.header['SQ']] 440 | chr_size=[x['LN'] for x in mbam.header['SQ']] 441 | chr_dict = {'name':chr_list, 'size':chr_size} 442 | 443 | # construct the `mread_dict`, this will be needed throughout the program; 444 | # also construct the genomic cluster dict and cluster to alignment, 445 | # by going through all mreads at once 446 | genomic_cluster_dict, mread_dict, location_to_reads = get_genomic_clusters(mbam, winsize=winsize, unstranded=unstranded) 447 | logger.debug('found %i mreads @ %i locations' % ( len(mread_dict), len(location_to_reads) ) ) 448 | 449 | # keep a record of processed reads 450 | processed_mreads = set() 451 | 452 | # iterate through all mreads 453 | logger.info('running em') 454 | subg_counter = 0 455 | for read_qname in mread_dict: 456 | if read_qname in processed_mreads: 457 | continue 458 | 459 | ## construct the fully-connected subgraph for each read 460 | read_to_locations, processed_mreads = \ 461 | construct_subgraph(location_to_reads, read_qname, mread_dict, processed_mreads, chr_dict, \ 462 | genomic_cluster_dict, winsize=winsize, unstranded=unstranded) 463 | subgraph = set() 464 | for read in read_to_locations: 465 | _ = deque(map(subgraph.add, read_to_locations[read].keys())) 466 | subgraph = list(subgraph) 467 | #if len(subgraph)==1 and len(read_to_locations)>10: 468 | # raise Exception('Incorrect mread assigned to one location') 469 | if len(subgraph)==0: 470 | continue 471 | subg_counter += 1 472 | logger.debug("subgraph %i: |e|=%i, |v|=%i"%(subg_counter, len(read_to_locations), len(subgraph)) ) 473 | 474 | ## build the BIT tracks 475 | node_track, multi_reads_weights = \ 476 | construct_BIT_track(subgraph, read_to_locations, ubam, unstranded) 477 | 478 | ## run EM 479 | multi_reads_weights = \ 480 | run_EM(node_track, multi_reads_weights, w=winsize) 481 | 482 | ## write to obam 483 | for read in multi_reads_weights: 484 | for node in multi_reads_weights[read]: 485 | alignment = read_to_locations[read][node] 486 | score = round(multi_reads_weights[read][node][0], 3) 487 | alignment.set_tag('AS', score) 488 | #alignment.set_tag('PG', 'CLAM') 489 | obam.write(alignment) 490 | # sort the final output 491 | logger.info('sorting output') 492 | obam.close() 493 | ubam.close() 494 | mbam.close() 495 | obam_sorted_fn = os.path.join(out_dir, 'realigned.sorted.bam') 496 | 
pysam.sort('-o', obam_sorted_fn, os.path.join(out_dir, 'realigned.bam')) 497 | pysam.index(obam_sorted_fn) 498 | os.remove(os.path.join(out_dir, 'realigned.bam')) 499 | return 500 | 501 | 502 | 503 | def parser(args): 504 | """The command-line parser for CLAM-realigner 505 | Args: 506 | args (argparse.ArgumentParser): receives commandline arguments 507 | Returns: 508 | None 509 | """ 510 | try: 511 | in_bam = args.in_bam 512 | out_dir = args.out_dir 513 | if not os.path.isdir(out_dir): 514 | os.mkdir(out_dir) 515 | tag_method = args.tag_method 516 | max_hits = args.max_hits 517 | max_tags = args.max_tags 518 | retag = args.retag 519 | winsize = args.winsize 520 | lib_type = args.lib_type 521 | unstranded = lib_type == "unstranded" 522 | 523 | logger.info('start') 524 | logger.info('run info: %s'%(' '.join(sys.argv))) 525 | 526 | realigner(in_bam, out_dir, max_hits=max_hits, max_tags=max_tags, read_tagger_method=tag_method, 527 | winsize=winsize, unstranded=unstranded, retag=retag, lib_type=lib_type) 528 | 529 | logger.info('end') 530 | except KeyboardInterrupt: 531 | sys.exit(0) 532 | return 533 | -------------------------------------------------------------------------------- /CLAM/stats/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xinglab/CLAM/aed16a4d4e56535e17302448c48f32d35cba14cd/CLAM/stats/__init__.py -------------------------------------------------------------------------------- /CLAM/stats/bin_test_alternatives.py: -------------------------------------------------------------------------------- 1 | """ A script for storing alternative peak calling 2 | statistical models / bin tests other than negative binomial 3 | Zijun Zhang 4 | Last revisited: 9.6.2017 5 | """ 6 | 7 | 8 | def test_bin_poisson(intv_bin_ip, intv_bin_con, correction_method='fdr_bh'): 9 | """DOCSTRING 10 | Args 11 | Returns 12 | """ 13 | def _par_to_vec(par, data, is_constrained): 14 | if is_constrained: 15 | beta = par[0] 16 | mu_vec = par[1::] 17 | delta = 0 18 | else: 19 | beta, delta = par[0], par[1] 20 | mu_vec = par[2::] 21 | ip_counter = data['this_ip'].shape[0] 22 | con_counter = data['this_con'].shape[0] 23 | mu0 = np.asarray(mu_vec[0:con_counter]) 24 | mu1 = np.asarray(mu_vec[con_counter::]) 25 | lamb1_this = np.exp(mu1 + beta + delta) 26 | lamb1_others = np.exp(mu1) 27 | lamb0_this = np.exp(mu0 + beta) 28 | lamb0_others = np.exp(mu0) 29 | return (lamb1_this, lamb1_others, lamb0_this, lamb0_others) 30 | 31 | def _neg_loglik_unconstrain(par, data): 32 | (l1, l2, l3, l4) = _par_to_vec(par, data, False) 33 | ll = np.sum(poisson.logpmf(data['this_ip'], mu=l1)) + \ 34 | np.sum(poisson.logpmf(data['others_ip'], mu=l2)) + \ 35 | np.sum(poisson.logpmf(data['this_con'], mu=l3)) + \ 36 | np.sum(poisson.logpmf(data['others_con'], mu=l4)) 37 | return -ll 38 | 39 | def _neg_loglik_constrain(par, data): 40 | (l1, l2, l3, l4) = _par_to_vec(par, data, True) 41 | ll = np.sum(poisson.logpmf(data['this_ip'], mu=l1)) + \ 42 | np.sum(poisson.logpmf(data['others_ip'], mu=l2)) + \ 43 | np.sum(poisson.logpmf(data['this_con'], mu=l3)) + \ 44 | np.sum(poisson.logpmf(data['others_con'], mu=l4)) 45 | return -ll 46 | 47 | intv_counter = intv_bin_ip.shape[1] 48 | assert intv_counter == intv_bin_con.shape[1] 49 | binscore = np.empty(intv_counter) 50 | binsignal = np.empty(intv_counter) 51 | ip_sum = np.apply_along_axis(np.sum, 1, intv_bin_ip) 52 | con_sum = np.apply_along_axis(np.sum, 1, intv_bin_con) 53 | for i in range(intv_counter): 54 | this_ip = 
intv_bin_ip[:, i] 55 | others_ip = ip_sum - this_ip 56 | this_con = intv_bin_con[:, i] 57 | others_con = con_sum - this_con 58 | if this_ip == 0: 59 | binsignal[i], binscore[i] = np.nan, 1.0 60 | continue 61 | ## because Poisson (and other count-based methods) only 62 | ## takes integers, here we take the floor of the fractional 63 | ## multi-reads as a conservative approach 64 | data = { 65 | 'this_ip':np.floor(this_ip), 66 | 'others_ip':np.floor(others_ip), 67 | 'this_con':np.floor(this_con), 68 | 'others_con':np.floor(others_con) 69 | } 70 | 71 | res_constrain = optimize.minimize( 72 | x0=np.ones(1+this_ip.shape[0]+others_ip.shape[0]), 73 | fun=_neg_loglik_constrain, 74 | args=(data), 75 | method='Nelder-Mead', 76 | options={'disp':False} 77 | ) 78 | 79 | res_unconstrain = optimize.minimize( 80 | x0=np.ones(2+this_ip.shape[0]+others_ip.shape[0]), 81 | fun=_neg_loglik_unconstrain, 82 | args=(data), 83 | method='Nelder-Mead', 84 | options={'disp':False} 85 | ) 86 | 87 | delta_mle = res_unconstrain.x[1] 88 | pval = 1 - chi2.cdf(2*(res_constrain.fun - res_unconstrain.fun), 1) 89 | binscore[i] = pval 90 | binsignal[i] = delta_mle 91 | adj = multipletests(binscore, alpha=0.05, method=correction_method) 92 | binscore_adj = adj[1] 93 | return binsignal, binscore_adj 94 | 95 | 96 | def test_bin_fisher(intv_bin_ip, intv_bin_con, with_control=True, correction_method='fdr_bh'): 97 | """DOCSTRING 98 | Args 99 | Returns 100 | """ 101 | if intv_bin_ip.shape[0] != 1: 102 | raise Exception('Fisher exact test does not deal with replicates.') 103 | intv_counter = intv_bin_ip.shape[1] 104 | assert intv_counter == intv_bin_con.shape[1] 105 | binscore = np.empty(intv_counter) 106 | binsignal = np.empty(intv_counter) 107 | ip_sum = np.sum(intv_bin_ip[0,]) 108 | con_sum = np.sum(intv_bin_con[0,]) 109 | for i in range(intv_counter): 110 | this_ip = intv_bin_ip[0, i] 111 | others_ip = ip_sum - this_ip 112 | this_con = intv_bin_con[0, i] 113 | others_con = con_sum - this_con 114 | if this_ip == 0: 115 | binsignal[i], binscore[i] = np.nan, 1.0 116 | continue 117 | _, binscore[i] = fisher_exact([[this_ip, others_ip], [this_con, others_con]], alternative='greater') 118 | if with_control: 119 | binsignal[i] = this_ip/others_ip / this_con*others_con 120 | else: 121 | binsignal[i] = this_ip 122 | 123 | adj = multipletests(binscore, alpha=0.05, method=correction_method) 124 | binscore_adj = adj[1] 125 | return binsignal, binscore_adj 126 | -------------------------------------------------------------------------------- /CLAM/stats/ztnb_em.py: -------------------------------------------------------------------------------- 1 | import scipy.special as special 2 | import numpy as np 3 | from numpy.random import negative_binomial 4 | import mpmath 5 | from collections import defaultdict 6 | 7 | 8 | ############################## 9 | ## distribution characterizing functions 10 | ############################## 11 | 12 | def trunc_logLik(data, mu, alpha): 13 | log_1_plus_a_mu = np.log(1 + alpha*mu) 14 | log_1_minus_prob_zero = np.log(1.0 - np.exp(-np.log(1.0+alpha*mu)/alpha)) 15 | alpha_inv = 1.0/alpha 16 | lim = int(np.max(data.keys())) 17 | holding_val=0.0 18 | log_L=0.0 19 | for i in range(1, lim+1): 20 | holding_val += np.log(1+alpha*(i-1)) 21 | log_L += data[i]* (holding_val - special.gammaln(i) + i*np.log(mu)-(i+alpha_inv)*log_1_plus_a_mu - log_1_minus_prob_zero) 22 | return log_L 23 | 24 | def ztnb_pmf(y, mu, alpha): 25 | r = 1.0 / alpha 26 | if y <= 0: 27 | raise Exception('y must be larger than 0.') 28 | p = 
mu/(mu+r+0.0) 29 | ztnbin_mpmath = lambda y, p, r: mpmath.gamma(y + r)/(mpmath.gamma(y+1)*mpmath.gamma(r))*np.power(1-p, r)*np.power(p, y)/(1-np.power(1-p, r)) 30 | ztnbin = np.frompyfunc(ztnbin_mpmath, 3, 1) 31 | return float(ztnbin(y, p, r)) 32 | 33 | def ztnb_cdf(y, mu, alpha): 34 | r = 1.0/alpha 35 | if y <= 0: 36 | raise Exception('y must be larger than 0.') 37 | p = mu/(mu+r+0.0) 38 | F_ztnb = ( 1 - special.btdtr(y+1, r, p) - np.power(1-p, r) ) / (1-np.power(1-p,r)) 39 | return F_ztnb 40 | 41 | def ztnb_pval(y, mu, alpha): 42 | pval = 1 - ztnb_cdf(y, mu, alpha) + ztnb_pmf(y, mu, alpha) 43 | if pval <= 10**-5: 44 | return 0 45 | else: 46 | return pval 47 | 48 | def rztnb(mu=3, alpha=0.5, size=100): 49 | r = 1.0/alpha 50 | p = mu/(mu+r+0.0) 51 | ztnb=[] 52 | while(len(ztnb)0: 55 | ztnb.append(x) 56 | return ztnb 57 | 58 | def collapse_data(data): 59 | col_data = defaultdict(int) 60 | for i in data: 61 | col_data[i] += 1 62 | return col_data 63 | 64 | ############################## 65 | ## parameter estimation functions 66 | ############################## 67 | 68 | def EM_estim_params(height, tol = 10**-4, max_iter = 1000, verbose = False, mu = None, alpha = None): 69 | tot_size = np.sum([height[x] for x in height if x>0]) 70 | error = 10000 71 | prev_score = 10000 72 | score = 0.0 73 | if mu is None or alpha is None: 74 | mu = np.sum([x*height[x] for x in height])/(tot_size+0.0) 75 | var = np.sum([ height[x]*(x-mu)**2 for x in height]) / tot_size 76 | alpha = (var - mu) / (mu * mu) 77 | #mu = 0.01 78 | #alpha = 100 79 | for h in range(1, int(max(height.keys()))+1): 80 | if not h in height: 81 | height[h]=0 82 | 83 | for i in range(1, (max_iter+1)): 84 | height[0] = expected_zeros(tot_size, mu, alpha) 85 | mu, alpha = estim_params(height, tol) 86 | height[0] = 0 87 | score = trunc_logLik(height, mu, alpha) 88 | if score == 0: 89 | raise ZeroDivisionError('invalid loglik function value') 90 | error = abs((score - prev_score)/score) 91 | if verbose: 92 | print('Iter ' + str(i) + ': eps = ' + str(error) + '; mu = ' + str(mu) + '; alpha = ' + str(alpha)) 93 | if(error < tol): 94 | break 95 | prev_score = score 96 | return (trunc_logLik(height, mu, alpha), mu, alpha) 97 | 98 | def expected_zeros(pseudo_size, mu, alpha): 99 | min_allowed_alpha=10**-4 100 | max_allowed_prob_zero=0.99 101 | if alpha < min_allowed_alpha: 102 | prob_zero = max_allowed_prob_zero 103 | else: 104 | prob_zero = np.min([np.power(1.0+alpha*mu, -1.0/alpha), 0.99]) 105 | expected_zeros = int(pseudo_size*(prob_zero/(1-prob_zero))) 106 | return expected_zeros 107 | 108 | 109 | def estim_params(pseudo_hist, tolerance = 10**-4): 110 | min_allowed_alpha = 10**-3 111 | max_allowed_alpha = 1000 112 | 113 | mu = compute_mean(pseudo_hist) 114 | pseudo_size = np.sum(pseudo_hist.values()) 115 | 116 | a_low = min_allowed_alpha 117 | a_high = max_allowed_alpha 118 | 119 | diff = 10000 120 | prev_val = 10000 121 | 122 | while diff > tolerance and movement(a_high, a_low) > tolerance: 123 | a_mid = (a_low + a_high)/2 124 | mid_val = alpha_score_function(pseudo_hist, mu, a_mid, pseudo_size) 125 | #print str(a_mid) + '; ' + str(mid_val) + '; ' + str(trunc_logLik(pseudo_hist, mu, a_mid)) 126 | if (mid_val < 0): 127 | a_high = a_mid 128 | else: 129 | a_low = a_mid 130 | diff = np.abs((prev_val - mid_val)/prev_val) 131 | prev_val = mid_val 132 | 133 | alpha = a_mid 134 | return mu, alpha 135 | 136 | def alpha_score_function(vals_hist, mean, a_mid, vals_count): 137 | one_plus_alpha_mu = 1.0 + a_mid*mean 138 | return 
(score_fun_first_term(vals_hist, a_mid)/(vals_count+0.0) + (np.log(one_plus_alpha_mu)/(a_mid+0.0) - mean)/(a_mid+0.0)) 139 | 140 | def score_fun_first_term(vals_hist,a_mid): 141 | sum = 0.0 142 | lim = int(np.max(vals_hist.keys())) 143 | for i in range(0, lim+1): 144 | if (vals_hist[i] > 0): 145 | inner_sum = 0.0 146 | for j in range(0, i): 147 | inner_sum += j/(1.0 + a_mid*j) 148 | sum += vals_hist[i]*inner_sum 149 | 150 | return sum 151 | 152 | 153 | ############################## 154 | ## in-line functions 155 | ############################## 156 | 157 | def compute_mean(height): 158 | tot_size = np.sum(height.values()) 159 | mean = np.sum([x*height[x] for x in height])/(tot_size + 0.0) 160 | return(mean) 161 | 162 | def movement(a, b): 163 | return abs(a - b)/max(a, b) 164 | 165 | ############################## 166 | ## testing function 167 | ############################## 168 | 169 | def test(size=10**3, mu=0.01, alpha=50, max_iter=100): 170 | data=rztnb(mu, alpha, size) 171 | height=collapse_data(data) 172 | return EM_estim_params(height, max_iter=max_iter, verbose=True) -------------------------------------------------------------------------------- /CLAM/utils/parseBAM.py: -------------------------------------------------------------------------------- 1 | """ parseBAM from Yan Gao 2 | https://github.com/yangao07/pyParseBAM/blob/master/parse_bam.py 3 | Author: Yan Gao 4 | Date: 1.9.2018 5 | """ 6 | 7 | import sys, re 8 | import pysam as ps 9 | import utils as ut 10 | 11 | #### cigar operation: 12 | BAM_CMATCH = 0 # M 13 | BAM_CINS = 1 # I 14 | BAM_CDEL = 2 # D 15 | BAM_CREF_SKIP = 3 # N 16 | BAM_CSOFT_CLIP = 4 # S 17 | BAM_CHARD_CLIP = 5 # H 18 | BAM_CPAD = 6 # P 19 | BAM_CEQUAL = 7 # = 20 | BAM_CDIFF = 8 # X 21 | BAM_CBACK = 9 # B 22 | 23 | 24 | # cigar stats: 25 | # M BAM_CMATCH 0 26 | # I BAM_CINS 1 27 | # D BAM_CDEL 2 28 | # N BAM_CREF_SKIP 3 29 | # S BAM_CSOFT_CLIP 4 30 | # H BAM_CHARD_CLIP 5 31 | # P BAM_CPAD 6 32 | # = BAM_CEQUAL 7 33 | # X BAM_CDIFF 8 34 | # B BAM_CBACK 9 35 | # NM NM tag 10 36 | def get_ref_op_length(cigar_stats=[]): 37 | # get op length for MDNP=X 38 | op_len = cigar_stats[0] + cigar_stats[2] + cigar_stats[3] + cigar_stats[6] + cigar_stats[7] + cigar_stats[8] 39 | return op_len 40 | 41 | 42 | def get_read_op_length(cigar_stats): 43 | # get op length for MISH=X 44 | op_len = cigar_stats[0] + cigar_stats[1] + cigar_stats[4] + cigar_stats[5] + cigar_stats[7] + cigar_stats[8] 45 | return op_len 46 | 47 | 48 | def get_aligned_read_length(cigar_stats): 49 | # get op length for MI=X 50 | op_len = cigar_stats[0] + cigar_stats[1] + cigar_stats[7] + cigar_stats[8] 51 | return op_len 52 | 53 | 54 | def minipulate_cigar(r=ps.AlignedSegment, old='', new=''): 55 | r.cigarstring = re.sub(r'%s' % old, new, r.cigarstring) 56 | 57 | 58 | def get_spec_MD(mdstr='', start=0, end=0): # '23AC20T', 10, 30 => ' 13AC5 59 | mSub = re.sub(r'([ACGTNacgtn])', ' \\1 ', mdstr) 60 | mSplit = re.split('[ ]+', mSub) 61 | start_remain_len = start 62 | end_remain_len = end - start 63 | ret_md = [] 64 | mi = 0 65 | # print mSplit, start_remain_len, end_remain_len 66 | while start_remain_len > 0: 67 | if mSplit[mi].isdigit(): 68 | if int(mSplit[mi]) > start_remain_len: 69 | mSplit[mi] = str(int(mSplit[mi]) - start_remain_len) 70 | start_remain_len = 0 71 | break 72 | else: 73 | start_remain_len -= int(mSplit[mi]) 74 | else: # isalpha() 75 | start_remain_len -= 1 76 | mi += 1 77 | while end_remain_len > 0: 78 | if mSplit[mi].isdigit(): 79 | if int(mSplit[mi]) >= end_remain_len: 80 | 
ret_md.append(str(end_remain_len)) 81 | end_remain_len = 0 82 | break 83 | else: 84 | end_remain_len -= int(mSplit[mi]) 85 | ret_md.append(mSplit[mi]) 86 | else: # isalpha() 87 | end_remain_len -= 1 88 | ret_md.append(mSplit[mi]) 89 | mi += 1 90 | # print ret_md 91 | return ret_md 92 | 93 | 94 | # MISMATCH: read_pos(first), ref_pos(first), len, read_base, ref_base 95 | # INSERTION: ins_read_pos(first), ins_ref_pos(left), len, ins_base 96 | # DELETION: del_read_pos(left), del_ref_pos(first), len, del_base 97 | def get_error_from_MD(cigartuples=[], mdstr='', full_query_seq='', ref_start=0): 98 | mis, ins, dele = [], [], [] 99 | last_error = '' 100 | md_i, m_pos = 0, 0 101 | mdSub = re.sub(r'([\\^][ACGTNacgtn]+)[0]*', ' \\1 ', mdstr) 102 | mdSplit = mdSub.rsplit() 103 | ref_pos, query_pos = ref_start, 0 104 | 105 | for tuples in cigartuples: 106 | if tuples[0] == BAM_CMATCH: 107 | m = mdSplit[md_i] 108 | 109 | if m.startswith('^'): 110 | ut.format_time(sys.stderr, 'get_error_from_MD', 'Unexpected MD string: {}\n'.format(mdstr)) 111 | sys.exit(1) 112 | mSub = re.sub(r'([ACGTNacgtn])', ' \\1 ', m) 113 | m_len = sum(map(int, (re.sub(r'([ACGTNacgtn])', '1', mSub)).rsplit())) 114 | 115 | # from m_pos to m_pos + tuples[1] 116 | sub_ms = get_spec_MD(m, m_pos, m_pos + tuples[1]) 117 | for ms in sub_ms: 118 | if ms.isalpha(): # MISMATCH 119 | if last_error != 'MIS' or mis[-1][0] != query_pos - 1: 120 | mis_error = [query_pos, ref_pos, 1, full_query_seq[query_pos], ms] 121 | mis.append(mis_error) 122 | else: # last_error == 'MIS' and mis[-1][1] == ap[0] - 1: 123 | mis[-1][-3] += 1 124 | mis[-1][-2] += full_query_seq[query_pos] 125 | mis[-1][-1] += ms 126 | query_pos += 1 127 | ref_pos += 1 128 | last_error = 'MIS' 129 | elif ms.isdigit(): # MATCH 130 | query_pos += int(ms) 131 | ref_pos += int(ms) 132 | 133 | if m_pos + tuples[1] == m_len: 134 | md_i += 1 135 | m_pos = 0 136 | elif m_pos + tuples[1] < m_len: 137 | m_pos += tuples[1] 138 | else: # 139 | ut.format_time(sys.stderr, 'get_error_from_MD', 'Unexpected MD string: {}\n'.format(mdstr)) 140 | sys.exit(1) 141 | elif tuples[0] == BAM_CDEL: 142 | m = mdSplit[md_i] 143 | if not m.startswith('^'): 144 | ut.format_time(sys.stderr, 'get_error_from_MD', 'Unexpected MD string: {}\n'.format(mdstr)) 145 | sys.exit(1) 146 | del_error = [query_pos - 1, ref_pos, tuples[1], m[1:]] 147 | dele.append(del_error) 148 | ref_pos += tuples[1] 149 | last_error = 'DEL' 150 | md_i += 1 151 | elif tuples[0] == BAM_CINS: 152 | ins_error = [query_pos, ref_pos - 1, tuples[1], full_query_seq[query_pos:query_pos + tuples[1]]] 153 | ins.append(ins_error) 154 | query_pos += tuples[1] 155 | last_error = 'INS' 156 | elif tuples[0] == BAM_CSOFT_CLIP or tuples[0] == BAM_CHARD_CLIP: 157 | query_pos += tuples[1] 158 | elif tuples[0] == BAM_CREF_SKIP: 159 | ref_pos += tuples[1] 160 | else: 161 | ut.format_time(sys.stderr, 'get_error_from_MD', 'Unexpected cigar: {}\n'.format(cigartuples)) 162 | sys.exit(1) 163 | 164 | return mis, ins, dele 165 | 166 | 167 | def get_error_from_Cigar(cigartuples=[], full_query_seq='', align_ref_seq='', ref_start=0): 168 | mis, ins, dele = [], [], [] 169 | last_error = '' 170 | ref_pos, query_pos = ref_start, 0 171 | for tuples in cigartuples: 172 | if tuples[0] == BAM_CMATCH: 173 | for q, r in zip(full_query_seq[query_pos:query_pos + tuples[1]], 174 | align_ref_seq[ref_pos - ref_start:ref_pos - ref_start + tuples[1]]): 175 | if q != r: # MISMATCH 176 | if last_error != 'MIS' or mis[-1][0] != query_pos - 1: 177 | mis_error = [query_pos, ref_pos, 1, q, r] 
178 | mis.append(mis_error) 179 | else: # last_error == 'MIS' and mis[-1][1] == ap[0] - 1: 180 | mis[-1][-3] += 1 181 | mis[-1][-2] += q 182 | mis[-1][-1] += r 183 | last_error = 'MIS' 184 | ref_pos += 1 185 | query_pos += 1 186 | elif tuples[0] == BAM_CDEL: 187 | del_error = [query_pos - 1, ref_pos, tuples[1], 188 | align_ref_seq[ref_pos - ref_start:ref_pos - ref_start + tuples[1]]] 189 | dele.append(del_error) 190 | last_error = 'DEL' 191 | ref_pos += tuples[1] 192 | elif tuples[0] == BAM_CINS: 193 | ins_error = [query_pos, ref_pos - 1, tuples[1], full_query_seq[query_pos:query_pos + tuples[1]]] 194 | ins.append(ins_error) 195 | last_error = 'INS' 196 | query_pos += tuples[1] 197 | elif tuples[0] == BAM_CHARD_CLIP or tuples[0] == BAM_CSOFT_CLIP: 198 | query_pos += tuples[1] 199 | elif tuples[0] == BAM_CREF_SKIP: 200 | ref_pos += tuples[1] 201 | else: 202 | ut.format_time(sys.stderr, 'get_error_from_Cigar', 'Unexpected cigar: {}\n'.format(cigartuples)) 203 | sys.exit(1) 204 | 205 | return mis, ins, dele 206 | 207 | 208 | def get_align_block(cigartuples=[], ref_start=0): 209 | align_block = [] 210 | start = ref_start 211 | end = ref_start - 1 212 | for tuples in cigartuples: 213 | if tuples[0] == BAM_CMATCH or tuples[0] == BAM_CDEL: 214 | end += tuples[1] 215 | elif tuples[0] == BAM_CREF_SKIP: 216 | align_block.append((start, end)) 217 | start = end + tuples[1] + 1 218 | end = start -1 219 | align_block.append((start, end)) 220 | 221 | return align_block 222 | 223 | 224 | # MISMATCH: read_pos(first), ref_pos(first), len, read_base, ref_base 225 | # INSERTION: ins_read_pos(first), ins_ref_pos(left), len, ins_base 226 | # DELETION: del_read_pos(left), del_ref_pos(first), len, del_base 227 | def get_align_detial(r, ref_fa): 228 | if r.cigartuples[0][0] == BAM_CSOFT_CLIP or r.cigartuples[0][0] == BAM_CHARD_CLIP: 229 | left_clip = r.cigartuples[0][1] 230 | else: 231 | left_clip = 0 232 | if r.cigartuples[-1][0] == BAM_CSOFT_CLIP or r.cigartuples[-1][0] == BAM_CHARD_CLIP: 233 | right_clip = r.cigartuples[-1][1] 234 | else: 235 | right_clip = 0 236 | 237 | if r.has_tag('MD'): 238 | mdstr = r.get_tag('MD') 239 | mis_err, ins_err, dele_err = get_error_from_MD(r.cigartuples, mdstr, r.query_sequence, r.reference_start) 240 | else: 241 | ref = ps.FastaFile(ref_fa) 242 | align_ref_seq = ref.fetch(r.reference_name, r.reference_start, r.reference_start + r.reference_length) 243 | mis_err, ins_err, dele_err = get_error_from_Cigar(r.cigartuples, r.query_sequence, align_ref_seq, 244 | r.reference_start) 245 | return [r.is_reverse, r.infer_read_length(), r.reference_start, r.reference_length, left_clip, right_clip, mis_err, 246 | ins_err, dele_err] -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate -------------------------------------------------------------------------------- /bin/CLAM: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | """CLIP-seq Analysis of Multi-mapped reads 5 | 6 | This is the main entry for CLAM. CLAM is a comprehensive peak caller for CLIP/RIP-seq 7 | data that considers both uniquely-mapped and multi-mapped reads. 
8 | 9 | Example: 10 | $ CLAM [realigner|peakcaller] 11 | 12 | Authors: 13 | Zijun Zhang 14 | Yi Xing 15 | 16 | Citation: 17 | @article{zhang2017clip, 18 | title={CLIP-seq analysis of multi-mapped reads discovers novel functional RNA regulatory sites in the human transcriptome}, 19 | author={Zhang, Zijun and Xing, Yi}, 20 | journal={Nucleic Acids Research}, 21 | year={2017} 22 | } 23 | 24 | Todo: 25 | add `visualize` and `evaluate` subcommands 26 | 27 | This program is free software: you can redistribute it and/or modify it under 28 | the terms of the GNU General Public License as published by the Free Software 29 | Foundation, either version 3 of the License, or (at your option) any later 30 | version 31 | """ 32 | 33 | from CLAM import config 34 | import os 35 | import sys 36 | import logging 37 | import argparse as ap 38 | import datetime 39 | 40 | 41 | def main(): 42 | """main entry for CLAM 43 | This function setup the logging and handle the input options 44 | Args 45 | None 46 | Returns 47 | None 48 | """ 49 | logger = setup_logger() 50 | argparser = get_arg_parser() 51 | args = argparser.parse_args() 52 | 53 | subcommand = args.subcommand 54 | 55 | if subcommand == 'preprocessor': 56 | from CLAM import preprocessor 57 | preprocessor.parser( args ) 58 | elif subcommand == 'realigner': 59 | from CLAM import realigner 60 | #print args 61 | realigner.parser( args ) 62 | 63 | elif subcommand == 'peakcaller': 64 | from CLAM import peakcaller 65 | #print args 66 | peakcaller.parser( args ) 67 | 68 | elif subcommand == 'permutation_callpeak': 69 | from CLAM import permutation_peakcaller 70 | permutation_peakcaller.parser( args ) 71 | 72 | elif subcommand == 'peak_annotator': 73 | from CLAM import peak_annotator 74 | peak_annotator.parser(args) 75 | 76 | elif subcommand == 'data_downloader': 77 | from CLAM import download_data 78 | download_data.parser(args) 79 | 80 | 81 | def setup_logger(): 82 | """Set up the logger for the whole pipeline 83 | Args 84 | None 85 | Returns 86 | logger: logging object 87 | """ 88 | # setup logger 89 | logger = logging.getLogger('CLAM') 90 | logger.setLevel(logging.DEBUG) 91 | # create file handler which logs even debug messages 92 | #fh = logging.FileHandler( 93 | # 'log.CLAM.'+'-'.join(str(datetime.datetime.now()).replace(':','-').split()) + '.txt') 94 | fh = logging.FileHandler('log.CLAM.txt') 95 | fh.setLevel(logging.INFO) 96 | # create console handler with a higher log level 97 | ch = logging.StreamHandler() 98 | ch.setLevel(logging.DEBUG) 99 | # create formatter and add it to the handlers 100 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -\n %(message)s') 101 | fh.setFormatter(formatter) 102 | ch.setFormatter(formatter) 103 | # add the handlers to the logger 104 | logger.addHandler(fh) 105 | logger.addHandler(ch) 106 | return logger 107 | 108 | 109 | def get_arg_parser(): 110 | """DOCSTRING 111 | Args 112 | Returns 113 | """ 114 | description = "%(prog)s -- CLip-seq Analysis of Multi-mapped reads" 115 | epilog = "For command line options of each sub-command, type: %(prog)s COMMAND -h" 116 | 117 | argparser = ap.ArgumentParser(description=description, epilog=epilog) 118 | argparser.add_argument("--version", action="version", version="%(prog)s "+config.__version__) 119 | 120 | subparsers = argparser.add_subparsers( dest="subcommand" ) 121 | 122 | # preprocessing 123 | add_preprocessor_parser(subparsers) 124 | 125 | # realigner 126 | add_realigner_parser(subparsers) 127 | 128 | # peakcaller 129 | add_peakcaller_parser(subparsers) 
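	# A sketch of how the subcommands registered here are typically chained on the
	# command line (file paths are placeholders; the peak filename written under
	# the peakcaller output folder depends on its options):
	#   CLAM realigner -i IP.bam -o clam_ip/ --read-tagger-method median
	#   CLAM realigner -i control.bam -o clam_ctrl/ --read-tagger-method median
	#   CLAM peakcaller -i clam_ip/unique.sorted.bam clam_ip/realigned.sorted.bam \
	#       -c clam_ctrl/unique.sorted.bam clam_ctrl/realigned.sorted.bam \
	#       -o clam_peaks/ --gtf annotation.gtf
	#   CLAM peak_annotator -i clam_peaks/<peak_file> -g hg19 -o annotated_peaks.txt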
130 | 131 | # permutation_callpeak 132 | add_permutation_callpeak_parser(subparsers) 133 | 134 | # peak_annotator 135 | add_peak_annotator_parser(subparsers) 136 | 137 | # data_downloader 138 | add_data_downloader_parser(subparsers) 139 | 140 | return argparser 141 | 142 | 143 | def add_preprocessor_parser( subparsers ): 144 | ag_prep = subparsers.add_parser("preprocessor", help="CLAM Preprocessor: tag read alignments to specific locus") 145 | 146 | # input/output 147 | ag_prep.add_argument("-i", "--input", dest="in_bam", type=str, required=True, 148 | help="Input bam file") 149 | 150 | ag_prep.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, 151 | help="Output folder") 152 | 153 | # processing 154 | ag_prep.add_argument("--read-tagger-method", dest="tag_method", type=str, 155 | choices= ('median', 'start'), default='median', 156 | help="Read tagger method, 'median' for read center, 'start' for read start site; default: median") 157 | 158 | ag_prep.add_argument("--max-multihits", dest="max_hits", type=int, default=100, 159 | help="The maximum hits allowed for multi-mapped reads; default: 100") 160 | 161 | ag_prep.add_argument("--max-tags", dest="max_tags", type=int, default=-1, 162 | help="The maximum identical tags at given location; default: -1, no filter") 163 | 164 | # lib_type 165 | ag_prep.add_argument("--lib-type", dest="lib_type", type=str, 166 | default="sense", choices=['sense', 'antisense', 'unstranded'], 167 | help="The expected read strandness with transcription direction: sense, antisense, or unstranded; default: sense") 168 | 169 | return 170 | 171 | 172 | def add_realigner_parser( subparsers ): 173 | ag_realigner = subparsers.add_parser("realigner", help="CLAM Realigner: realign multi-mapped reads using expectation-maximization") 174 | 175 | # input/output 176 | ag_realigner.add_argument("-i", "--input", dest="in_bam", type=str, required=True, 177 | help="Input bam file") 178 | 179 | ag_realigner.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, 180 | help="Output folder") 181 | 182 | # processing 183 | ag_realigner.add_argument("--read-tagger-method", dest="tag_method", type=str, 184 | choices= ('median', 'start'), default='median', 185 | help="Read tagger method, 'median' for read center, 'start' for read start site; default: median") 186 | 187 | ag_realigner.add_argument("--max-multihits", dest="max_hits", type=int, default=100, 188 | help="The maximum hits allowed for multi-mapped reads; default: 100") 189 | 190 | ag_realigner.add_argument("--max-tags", dest="max_tags", type=int, default=-1, 191 | help="The maximum identical tags at given location; default: -1, no filter") 192 | 193 | ag_realigner.add_argument("--retag", dest="retag", default=False, action='store_true', 194 | help="Retag the bam regardless when turned on; invalid when no previous files found") 195 | 196 | # realign 197 | ag_realigner.add_argument("--winsize", dest="winsize", type=int, default=50, 198 | help="Local window size for em computations; default: 50") 199 | 200 | #ag_realigner.add_argument("--unstranded", dest="unstranded", default=False, action="store_true", 201 | # help="Unstranded alignments if turned on") 202 | 203 | # lib_type 204 | ag_realigner.add_argument("--lib-type", dest="lib_type", type=str, 205 | default="sense", choices=['sense', 'antisense', 'unstranded'], 206 | help="The expected read strandness with transcription direction: sense, antisense, or unstranded; default: sense") 207 | 208 | return 209 | 210 | 211 | def 
add_peakcaller_parser( subparsers ): 212 | ag_peakcaller = subparsers.add_parser("peakcaller", help="CLAM Peakcaller: negative binomial model-based peak calling combining unique- and multi-reads") 213 | 214 | # input/output 215 | ag_peakcaller.add_argument("-i", "--input", dest="in_bam", nargs='+', type=str, required=True, 216 | help="Filepaths for IP bam files, e.g ubam1,ubam2 mbam1,mbam2") 217 | 218 | ag_peakcaller.add_argument("-c", "--control-dir", dest="con_bam", nargs='+', type=str, required=True, 219 | help="Filepaths for control bam files") 220 | 221 | ag_peakcaller.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, 222 | help="Output folder") 223 | 224 | ag_peakcaller.add_argument("--gtf", dest="gtf_fp", type=str, required=True, 225 | help="GTF filepath") 226 | 227 | # processing 228 | ag_peakcaller.add_argument("-p", "--nthread", dest="nthread", type=int, default=8, 229 | help="Number of threads; default: 8") 230 | 231 | ag_peakcaller.add_argument("-u", "--unique-only", dest="unique_only", default=False, action='store_true', 232 | help="Call peaks using only unique-mapped reads when turned on") 233 | 234 | ag_peakcaller.add_argument("--pool", dest="pooling", default=False, action="store_true", 235 | help="Pool the read counts if provided with multiple replicates; default: False") 236 | 237 | ag_peakcaller.add_argument("--min-clip-cov", dest="min_clip_cov", type=int, default=4, 238 | help="Minimum CLIP reads per gene to perform analysis; default: 4") 239 | 240 | # callpeak 241 | ag_peakcaller.add_argument("--qval-cutoff", dest="qval_cutoff", type=float, default=0.05, 242 | help="Cutoff for adjusted p-values; default: 0.05") 243 | 244 | ag_peakcaller.add_argument("--fold-change", dest="fold_change", nargs='+', type=float, default=[2.], 245 | help="Threasholds for signal range (fold change w/ control; tag count w/o control); default: 2-inf") 246 | 247 | ag_peakcaller.add_argument("--normalize-lib", dest="norm_lib", action="store_true", default=False, 248 | help="use total library size to normalize signal and control, instead of gene-by-gene basis; default: False") 249 | 250 | ag_peakcaller.add_argument("-b", "--binsize", dest="binsize", type=int, default=50, 251 | help="Bin size for calling peaks; default: 50") 252 | 253 | ag_peakcaller.add_argument("--lib-type", dest="lib_type", type=str, 254 | default="sense", choices=['sense', 'antisense', 'unstranded'], 255 | help="The expected read strandness with transcription direction: sense, antisense, or unstranded; default: sense") 256 | 257 | return 258 | 259 | 260 | 261 | def add_permutation_callpeak_parser( subparsers ): 262 | ag_peakcaller = subparsers.add_parser("permutation_callpeak", help="CLAM permutation peakcaller: call peaks using permutation (as in v1.0.0)") 263 | 264 | # input/output 265 | ag_peakcaller.add_argument("-i", "--input", dest="in_bam", nargs='+', type=str, required=True, 266 | help="Filepaths for CLIP bam, e.g ubam mbam") 267 | 268 | ag_peakcaller.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, 269 | help="Output folder") 270 | 271 | ag_peakcaller.add_argument("--gtf", dest="gtf_fp", type=str, required=True, 272 | help="GTF filepath") 273 | 274 | # processing 275 | ag_peakcaller.add_argument("-p", "--nthread", dest="nthread", type=int, default=8, 276 | help="Number of threads; default: 8") 277 | 278 | ag_peakcaller.add_argument("--random-state", dest="random_state", type=int, default=777, 279 | help="Seed for random number generator in permutations; default: 777") 280 
| 281 | # callpeak 282 | ag_peakcaller.add_argument("--qval-cutoff", dest="qval_cutoff", type=float, default=0.005, 283 | help="Cutoff for adjusted p-values; default: 0.005") 284 | 285 | ag_peakcaller.add_argument("--merge-size", dest="merge_size", type=int, default=50, 286 | help="Select best peak within this size; default: 50") 287 | 288 | ag_peakcaller.add_argument("--extend", dest="extend", type=int, default=50, 289 | help="Extend peak to this size if less than it; default: 50") 290 | 291 | ag_peakcaller.add_argument("--lib-type", dest="lib_type", type=str, 292 | default="sense", choices=['sense', 'antisense', 'unstranded'], 293 | help="The expected read strandness with transcription direction: sense, antisense, or unstranded; default: sense") 294 | 295 | return 296 | 297 | 298 | def add_peak_annotator_parser(subparsers): 299 | ag_anno = subparsers.add_parser( 300 | "peak_annotator", help="CLAM peak annotator: assign peaks to genomic regions") 301 | 302 | # input/output 303 | ag_anno.add_argument("-i", "--input", dest="peak_in", type=str, required=True, 304 | help="Input peak file") 305 | 306 | ag_anno.add_argument("-g", "--genome", dest="genome", choices=('hg19', 'hg38', 'mm10'), type=str, required=True, 307 | help="Genome version (hg19, hg38, mm10 avaiable)") 308 | 309 | ag_anno.add_argument("-o", "--out-file", dest="out_file", type=str, required=True, 310 | help="Output file") 311 | 312 | return 313 | 314 | 315 | def add_data_downloader_parser(subparsers): 316 | ag_down = subparsers.add_parser( 317 | "data_downloader", help="CLAM data downloader: download data of genomic regions") 318 | 319 | # input/output 320 | ag_down.add_argument("-g", "--genome", dest="genome", choices=('hg19', 'hg38', 'mm10'), type=str, required=True, 321 | help="Genome version (hg19, hg38, mm10 avaiable)") 322 | 323 | return 324 | 325 | 326 | 327 | if __name__ == '__main__': 328 | try: 329 | main() 330 | except KeyboardInterrupt: 331 | sys.stderr.write("User interrupted; program terminated.") 332 | sys.exit(0) 333 | -------------------------------------------------------------------------------- /check/compare_realign.py: -------------------------------------------------------------------------------- 1 | """Compare the realigner outputs for different 2 | version 3 | ZZJ 4 | 2019.5.27 5 | """ 6 | 7 | import sys 8 | import pysam 9 | 10 | 11 | 12 | def read_as_score(bfile): 13 | s1 = {} 14 | with pysam.Samfile(bfile, 'rb') as bam: 15 | i = 0 16 | for r1 in bam: 17 | i += 1 18 | #if i>30000: 19 | # break 20 | s1[(r1.qname, r1.rname, r1.pos)] = r1.opt('AS') 21 | return s1 22 | 23 | 24 | def plot_scatter(new, old): 25 | import matplotlib.pyplot as plt 26 | import seaborn as sns 27 | ax = sns.jointplot(new, old, kind="reg") 28 | ax.set_axis_labels('New', 'Old') 29 | plt.savefig('realign_check.png') 30 | 31 | def compare(): 32 | s1 = read_as_score('new_out/realigned.sorted.bam') 33 | s2 = read_as_score('old_out/realigned.sorted.bam') 34 | k = list([x for x in s1 if x in s2]) 35 | old = [] 36 | new = [] 37 | print("ID\tnew\told\n") 38 | for k_ in k: 39 | print("%s\t%s\t%s\n"%(k_, s1[k_], s2[k_] ) ) 40 | new.append(s1[k_]) 41 | old.append(s2[k_]) 42 | plot_scatter(new, old) 43 | 44 | 45 | if __name__ == '__main__': 46 | compare() -------------------------------------------------------------------------------- /deprecated/CLAM.fdr_peak.MP.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | This peak-caller script is part of the CLAM pipeline. 
5 | 6 | It takes input from re-aligner output, and use permutation to call peaks. 7 | 8 | Tested under python 2.7.3 9 | """ 10 | 11 | __author__ = 'Zijun Zhang' 12 | __version__ = '1.0.0' 13 | __email__ = 'zj.z@ucla.edu' 14 | 15 | 16 | from optparse import OptionParser 17 | import os, subprocess, sys 18 | from collections import defaultdict 19 | from statsmodels.sandbox.stats.multicomp import multipletests 20 | from time import strftime 21 | import cPickle as pickle 22 | import bisect, random 23 | import pysam 24 | import pybedtools 25 | from multiprocessing import Pool 26 | 27 | def main(): 28 | """ 29 | The main wrapper for CLAM peak-caller. 30 | """ 31 | # options parsing 32 | usage='usage: %prog ' 33 | parser=OptionParser(usage) 34 | 35 | parser.add_option('--resume', dest='resume', action='store_true', default=False, help='Resume mode - skipping pre-processing [Default: %default]') 36 | parser.add_option('--verbose', dest='verbose', action='store_true', default=False, help='Verbose mode - print out all intermediate steps [Default: %default]') 37 | parser.add_option('-o', dest='output_dir', default='./out_CLAM', help='Output file folder [Default %default]') 38 | parser.add_option('-t', dest='tmp_dir', default='./tmp_CLAM', help='Temporary file folder [Default %default]') 39 | parser.add_option('-p', dest='peak_file', default=None, help='Output peak calling filename; if None then do not call peaks [Default %default]') 40 | parser.add_option('--is-stranded', dest='is_stranded', default=False, action='store_true', help='Indicates if the reads are mapped with strand information. [Default: %default]') 41 | parser.add_option('--extend', dest='extend', type='int', default=50, help='Extend to given nucleotides symmetrically at peak calling [Default: %default]') 42 | parser.add_option('--pval-cutoff', dest='pval_cutoff', type='float', default=0.001, help='Corrected p-value threshold at peak calling [Default: %default]') 43 | parser.add_option('--merge-size', dest='merge_size', type='int', default=50, help='merging window size at peak calling [Default: %default]') 44 | parser.add_option('--max-iter', dest='max_iter', type='int', default=1000, help='maximum iterations for permutation tests [Default: %default]') 45 | parser.add_option('-g', dest='gtf', default='./GTF/hg19_ensembl.sorted_gene.bed', help='GTF file [Default: %default]') 46 | parser.add_option('--ThreadN', dest='nb_proc', type='int', default=4, help='Number of threads when doing permutations. [Default: %default]') 47 | parser.add_option('--seed', dest='seed', type='int', default=100, help='Random seed for permutations. [Default: %default]') 48 | parser.add_option('--merge-method', dest='merge_method', type='int', default=1, help='Peak merging method. 1: Narrow peak 2: Broad peak [Default: %default]') 49 | parser.add_option('--pval-method', dest='correction_method', type='int', default=1, help='Multiple testing correction method. 1: Bonferroni 2: BH FDR [Default: %default]') 50 | parser.add_option('--call-transcriptome', dest='call_all', action='store_true', default=False, help='Call peaks on transcriptome instead of genes with multi-mappers. 
[Default: %default]') 51 | 52 | (options,args)=parser.parse_args() 53 | 54 | output_dir=os.path.abspath(options.output_dir) 55 | tmp_dir=os.path.abspath(options.tmp_dir) 56 | verbose=options.verbose 57 | 58 | #random.seed(options.seed) 59 | 60 | write_parameter_log(options, output_dir) 61 | 62 | # find transcripts associated with multi-mapped reads 63 | if verbose: 64 | print_time_stamp('Finding transcripts with multimapped reads.') 65 | if not os.path.isfile(output_dir + '/CLAM_mapper.sorted.out'): 66 | subprocess.call(''' sort -k1,1 -k2,2n %s/CLAM_mapper.out | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6}' > %s/CLAM_mapper.sorted.out ''' % (output_dir, output_dir), shell=True) 67 | # Note: tid_list: tid -> [chr:strand, start, end] 68 | tid_list=read_aligner_output(output_dir + '/CLAM_mapper.sorted.out', options.gtf, options.is_stranded, tmp_dir, options.resume, options.call_all) 69 | 70 | # make bam file for re-aligner output, if non-exist 71 | if not (options.resume and os.path.isfile(output_dir + '/assigned_multimapped_reads.bam')): 72 | if verbose: 73 | print_time_stamp('Making bamfile for aligner output.') 74 | header_cmd='samtools view -H ' + tmp_dir + '/filter100.sorted.bam > ' + output_dir + '/sam_header.sam' 75 | subprocess.call(header_cmd, shell=True) 76 | body_cmd = ''' awk '{if($6=="+"){print $4"\t256\t"$1"\t"$2+1"\t0\t"$3-$2+1"M\t*\t0\t0\t*\t*\tAS:f:"$5}else{print $4"\t272\t"$1"\t"$2+1"\t0\t"$3-$2+1"M\t*\t0\t0\t*\t*\tAS:f:"$5 }}' ''' + output_dir + '/CLAM_mapper.sorted.out > ' + output_dir + '/CLAM_mapper.sorted.sam' 77 | subprocess.call(body_cmd, shell=True) 78 | makeBam_cmd = 'cat %s/sam_header.sam %s/CLAM_mapper.sorted.sam | samtools view -bS - > %s/assigned_multimapped_reads.bam' % (output_dir, output_dir,output_dir) 79 | subprocess.call(makeBam_cmd, shell=True) 80 | index_cmd = 'samtools index %s/assigned_multimapped_reads.bam' % output_dir 81 | subprocess.call(index_cmd, shell=True) 82 | 83 | # multi-processing peak-caller 84 | if not (options.resume and os.path.isfile(tmp_dir+'/unique_to_qval.pdata') and os.path.isfile(tmp_dir+'/combined_to_qval.pdata')): 85 | child_transcr_ind = list(chunkify(range(len(tid_list)), options.nb_proc)) 86 | 87 | pool = Pool(processes=options.nb_proc) 88 | 89 | unibam_file=tmp_dir+'/filter100.sorted.bam' 90 | multibam_file=output_dir+'/assigned_multimapped_reads.bam' 91 | tid_to_qval_compact = pool.map(get_permutation_fdr, [ (unibam_file, multibam_file, tid_list, child_transcr_ind[i], options.pval_cutoff, options.max_iter, options.is_stranded, verbose, options.correction_method, options.seed) for i in range(options.nb_proc) ]) 92 | 93 | pool.terminate() 94 | pool.join() 95 | 96 | unique_tid_to_qval, combined_tid_to_qval = unpack_tid_to_qval(tid_to_qval_compact) 97 | pickle.dump(unique_tid_to_qval, open(tmp_dir+'/unique_to_qval.pdata','wb'), -1) 98 | pickle.dump(combined_tid_to_qval, open(tmp_dir+'/combined_to_qval.pdata','wb'), -1) 99 | else: 100 | print_time_stamp('Resume mode, found qval data files.') 101 | unique_tid_to_qval=pickle.load(open(tmp_dir+'/unique_to_qval.pdata','rb')) 102 | combined_tid_to_qval=pickle.load(open(tmp_dir+'/combined_to_qval.pdata','rb')) 103 | 104 | # merge peaks 105 | if options.merge_method==1: 106 | merge_peaks=merge_peaks_singleNucl 107 | mm='singleNucl' 108 | elif options.merge_method==2: 109 | merge_peaks=merge_peaks_broadPeak 110 | mm='broadPeak' 111 | else: 112 | merge_peaks=merge_peaks_singleNucl 113 | mm='unknown selection, using default singleNucl' 114 | 115 | if verbose: 116 | 
print_time_stamp('Merging peaks within ' + str(options.merge_size) + 'bp, using ' + mm + '..') 117 | 118 | unique_peaks=merge_peaks(unique_tid_to_qval, options.merge_size, options.pval_cutoff) 119 | combined_peaks=merge_peaks(combined_tid_to_qval, options.merge_size, options.pval_cutoff) 120 | 121 | print_time_stamp('Comparing results and writing to file..') 122 | 123 | # write peak-calling results to file. 124 | with open(output_dir + '/all_peaks.txt', 'w') as f: 125 | for peak in unique_peaks: # peak = ['chr\tstart\tend\tstrand', 'height\tqval\t', tid] 126 | if options.extend is None: 127 | wt_loc=peak[0] 128 | else: 129 | wt_loc=extend_peak_region(peak[0], options.extend) 130 | f.write(wt_loc + '\t' + '\t'.join([str(x) for x in peak[1]]) + '\t' + peak[2] + '\tunique\n') 131 | for peak in combined_peaks: 132 | if options.extend is None: 133 | wt_loc=peak[0] 134 | else: 135 | wt_loc=extend_peak_region(peak[0], options.extend) 136 | f.write(wt_loc + '\t' + '\t'.join([str(x) for x in peak[1]]) + '\t' + peak[2] + '\tcombined\n') 137 | subprocess.call(''' sort -k1,1 -k2,2n %s/all_peaks.txt | awk '{print $1"\t"$2"\t"$3"\t"$5";"$6";"$7"\t"$8"\t"$4}' | bedtools merge -s -d -1 -i stdin -c 4,5,6, -o collapse,collapse,distinct > %s''' % (output_dir, options.peak_file), shell=True) 138 | 139 | print_time_stamp('Peak-calling done.') 140 | 141 | def write_parameter_log(options, output_dir): 142 | """ 143 | Write paramter values to a log file, named by current time. 144 | """ 145 | merge_method_dict={1:'narrowPeak', 2:'broadPeak'} 146 | correction_method_dict={1:'Bonferroni', 2:'BH_FDR'} 147 | with open(output_dir+'/CLAM_Peaker.Parameters.'+ strftime("%Y%m%d_%H%M") + '.txt', 'w') as log: 148 | log.write('CLAM Peaker ' + __version__ + '\n') 149 | log.write('resume: ' + str(options.resume) + '\n') 150 | log.write('verbose: ' + str(options.verbose) + '\n') 151 | log.write('output_dir:' + str(options.output_dir) + '\n') 152 | log.write('tmp_dir: ' + str(options.tmp_dir) + '\n') 153 | log.write('peak_file: ' + str(options.peak_file) + '\n') 154 | log.write('is_stranded: ' + str(options.is_stranded) + '\n') 155 | log.write('extend: ' + str(options.extend) + '\n') 156 | log.write('pval_cutoff: ' + str(options.pval_cutoff) + '\n') 157 | log.write('merge_size: ' + str(options.merge_size) + '\n') 158 | log.write('max_iter: ' + str(options.max_iter) + '\n') 159 | log.write('gtf: ' + str(options.gtf) + '\n') 160 | log.write('seed: ' + str(options.seed) + '\n') 161 | log.write('merge_method: ' + merge_method_dict[options.merge_method] + '\n') 162 | log.write('correction_method: ' + correction_method_dict[options.correction_method] + '\n') 163 | log.write('thread: ' + str(options.nb_proc) + '\n') 164 | 165 | def chunkify(a, n): 166 | """ 167 | Separate a list (a) into consecutive n chunks. 168 | Returns the chunkified index 169 | """ 170 | k, m = len(a) / n, len(a) % n 171 | return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in xrange(n)) 172 | 173 | def unpack_tid_to_qval(compact): 174 | """ 175 | Unpacks the returned values from multi-processing. 
176 | """ 177 | unique_tid_to_qval=defaultdict(list) 178 | combined_tid_to_qval=defaultdict(list) 179 | for item in compact: 180 | unique, combined = item[0], item[1] 181 | for tid in combined: 182 | if len(unique[tid])>0: 183 | unique_tid_to_qval[tid]=unique[tid] 184 | if len(combined[tid])>1: 185 | combined_tid_to_qval[tid]=combined[tid] 186 | return unique_tid_to_qval,combined_tid_to_qval 187 | 188 | def get_permutation_fdr((unibam_file, multibam_file, tid_list, tid_ind, pval_cutoff, max_iter, is_stranded, verbose, correction_method, seed)): 189 | """ 190 | General permutation wrapper for a list of genes. Gets called by multi-processing generated by Pool() 191 | Returns packed FDRs from each child process. 192 | """ 193 | random.seed(seed) 194 | 195 | unique_tid_to_qval=defaultdict(list) 196 | combined_tid_to_qval=defaultdict(list) 197 | 198 | unibam=pysam.Samfile(unibam_file, 'rb') 199 | multibam=pysam.Samfile(multibam_file, 'rb') 200 | 201 | processed=0 202 | pid=os.getpid() 203 | 204 | for ind in tid_ind: 205 | processed+=1 206 | if verbose and not processed % 100: 207 | print_time_stamp(str(processed) + '/' + str(len(tid_ind)) + ' finished in pid ' + str(pid)) 208 | tid, chr, strand, start, end = tid_list[ind] 209 | unique_reads = read_tid_frag_from_bam(tid_list[ind], unibam, is_stranded, True) 210 | multi_reads = read_tid_frag_from_bam(tid_list[ind], multibam, is_stranded, False) 211 | 212 | this_unique_to_qval = do_permutation(tid_list[ind], unique_reads, max_iter, pval_cutoff, correction_method) 213 | this_combined_to_qval = do_permutation(tid_list[ind], unique_reads+multi_reads, max_iter, pval_cutoff, correction_method) 214 | 215 | unique_tid_to_qval[tid].extend(this_unique_to_qval) 216 | combined_tid_to_qval[tid].extend(this_combined_to_qval) 217 | unibam.close() 218 | multibam.close() 219 | return unique_tid_to_qval, combined_tid_to_qval 220 | 221 | def do_permutation(transcr, read_transcript, max_iter, pval_cutoff, correction_method): 222 | """ 223 | Permutes the reads along a given gene length, sub-routine that get called by get_permutation_fdr(..). 224 | Returns the locally corrected p-values for each observed height on the given gene. 225 | """ 226 | tid, chr, strand, tstart, tend = transcr 227 | tid_length=tend-tstart+1 228 | obs_heights_count=count_pileup_heights(tid_length, read_transcript) 229 | 230 | tid_to_qval=[] 231 | 232 | rand_heights_dist=defaultdict(int) 233 | rand_sum=0 234 | # need to account for the 'observed' data, since permutation tests should never report p-value as 0. 
3/22/16 235 | for i in obs_heights_count: 236 | if i==0: 237 | continue 238 | else: 239 | rand_heights_dist[int(i)]+=1 240 | rand_sum+=1 241 | for B in range(max_iter): 242 | new_heights_count=permutate_heights(tid_length, read_transcript) 243 | for i in new_heights_count: 244 | if i==0: 245 | continue 246 | else: 247 | rand_heights_dist[i]+=1 248 | rand_sum+=1 249 | height_to_pval={} 250 | for h in set(obs_heights_count): 251 | if h < 1: 252 | continue 253 | else: 254 | lefter=0 255 | for j in range(int(h), max(rand_heights_dist)+1): 256 | lefter+=rand_heights_dist[j] 257 | height_to_pval[h]=lefter/float(rand_sum) 258 | pval_list=[] 259 | for i in obs_heights_count: 260 | if i<1: 261 | continue 262 | pval_list.append(height_to_pval[i]) 263 | if len(pval_list)<=1: 264 | return [] 265 | 266 | if correction_method==2: 267 | qval_list=multipletests(pval_list, method='fdr_bh')[1] 268 | else: 269 | qval_list=[min(x*(len(set([int(y) for y in height_to_pval if y!=0]))), 1.0) for x in pval_list] 270 | 271 | ind=0 272 | last_height=0 273 | for j in range(len(obs_heights_count)): 274 | this_height=obs_heights_count[j] 275 | if this_height<1: 276 | last_height=0 277 | continue 278 | if qval_list[ind] <= pval_cutoff: 279 | if this_height==last_height: 280 | chr, last_start, last_end, last_strand, last_height, last_qval=tid_to_qval[-1] 281 | tid_to_qval[-1]=[chr, last_start, tstart+j+1, strand, last_height, last_qval] 282 | else: 283 | tid_to_qval.append([chr, tstart+j, tstart+j+1, strand, obs_heights_count[j], qval_list[ind]]) # chr, start, end, strand, height, this_qval 284 | last_height=this_height 285 | ind+=1 286 | return tid_to_qval 287 | 288 | def heights_to_dist(rand_heights): 289 | """ 290 | sub-routine 291 | """ 292 | rand_heights_dist=defaultdict(int) 293 | rand_sum=0 294 | for new_heights_count in rand_heights: 295 | for i in new_heights_count: 296 | if i==0: 297 | continue 298 | else: 299 | rand_heights_dist[i]+=1 300 | rand_sum+=1 301 | return rand_heights_dist, rand_sum 302 | 303 | def permutate_heights(tlen, reads): 304 | """ 305 | Sub-routine for do_permutation(...) 306 | Randomly allocate the read locations. 307 | """ 308 | loc_heights=[0] * tlen 309 | for id, pos, read_len, score in reads: 310 | if score<1 and random.random() > score: 311 | continue 312 | rand_pos=random.randint(1, max(1, tlen-read_len)) 313 | for i in range(rand_pos, min(rand_pos + read_len, tlen)): 314 | loc_heights[i]+=1 315 | return loc_heights 316 | 317 | def count_pileup_heights(tlen, reads): 318 | """ 319 | Sub-routine for do_permutation(...) 320 | Counts the distribution of pile-up heights for a given gene/permutation 321 | """ 322 | loc_heights=[0] * tlen 323 | for id, pos, read_len, score in reads: 324 | for i in range(pos, min(pos+read_len-1, tlen)): 325 | loc_heights[i]+=score 326 | return loc_heights 327 | 328 | def merge_peaks_broadPeak(transcript_to_qval, merge_size, pval_cutoff): 329 | """ 330 | Merge called peaks on a gene using option 2, 331 | i.e. if two peaks close to each other, region 332 | between two peaks are also called as peaks 333 | Retuns a list of merged peaks. 
334 | """ 335 | peaks=[] 336 | last_qval=[0,1] 337 | for tid in transcript_to_qval: 338 | init=True 339 | for chr, start, end, strand, height, this_qval in transcript_to_qval[tid]: 340 | loc=[chr, str(start), str(end), strand] 341 | this_qval=[height, this_qval] # this_qval=[height, qval] so that when qval=0, we can compare height 342 | if this_qval[1] > pval_cutoff: 343 | continue 344 | if init: 345 | last_qval=this_qval 346 | last_pos=[start, end] 347 | last_loc=loc 348 | last_chr=chr 349 | write_out=False 350 | init=False 351 | continue 352 | if int(start) - int(last_pos[1]) > merge_size: 353 | write_out=True 354 | else: 355 | last_pos=[last_pos[0], end] 356 | last_qval=this_qval if last_qval[0] pval_cutoff: 386 | continue 387 | if init: 388 | last_qval=this_qval 389 | last_pos=[start, end] 390 | last_loc=loc 391 | last_chr=chr 392 | write_out=False 393 | init=False 394 | continue 395 | if last_chr == chr: 396 | if abs( int(start) - int(last_pos[0]) ) > merge_size: 397 | write_out=True 398 | elif last_qval[0] < this_qval[0]: 399 | last_pos=[start, end] 400 | last_qval=this_qval 401 | last_loc=loc 402 | write_out=False 403 | else: 404 | write_out=True 405 | 406 | if write_out and last_qval[1] < pval_cutoff: 407 | #peaks[last_loc]=last_qval 408 | peaks.append([last_loc, last_qval, tid]) 409 | last_qval=this_qval 410 | last_pos=[start, end] 411 | last_loc=loc 412 | last_chr=chr 413 | write_out=False 414 | if last_qval[1] < pval_cutoff: 415 | peaks.append([last_loc, last_qval, tid]) 416 | return peaks 417 | 418 | def extend_peak_region(loc, target_len): 419 | """ 420 | Extends peak symmetrically if peak is smaller than target_len. 421 | """ 422 | chr, start, end, strand = loc.split('\t') 423 | start = int(start) 424 | end = int(end) 425 | old_len = end - start 426 | if old_len > target_len: 427 | return loc 428 | else: 429 | center = int((start + end)/2) 430 | start = center - int(target_len /2) 431 | end = center + int(target_len/2) 432 | return '\t'.join([chr, str(start), str(end), strand]) 433 | 434 | def read_aligner_output(rm_out, gtffile, is_stranded, tmp_dir, resume, call_all): 435 | """ 436 | Use bedtools to get transcripts/genes with multi-mapped reads. 437 | Returns a list of transcripts/genes. 438 | """ 439 | if not (resume and os.path.isfile(tmp_dir + '/gtf2multireads.bed')): 440 | rm_bed=pybedtools.BedTool(rm_out) 441 | gtf=pybedtools.BedTool(gtffile) 442 | gtf_bed_rm = gtf.intersect(rm_bed, s=True, u=True) if is_stranded else gtf.intersect(rm_bed, u=True) 443 | gtf_bed_rm.saveas(tmp_dir + '/gtf2multireads.bed') 444 | pybedtools.cleanup() 445 | 446 | tid_list=[] 447 | if call_all: 448 | gtf_to_read=gtffile 449 | else: 450 | gtf_to_read=tmp_dir+'/gtf2multireads.bed' 451 | with open(gtf_to_read,'r') as f: 452 | for line in f: 453 | ele=line.rstrip().split('\t') 454 | gene_id=ele[3] 455 | gene_chr, gene_start, gene_end=ele[0], int(ele[1]), int(ele[2]) 456 | gene_strand=ele[5] 457 | tid_list.append([gene_id, gene_chr, gene_strand, gene_start, gene_end]) 458 | print_time_stamp('Read transcripts with multi-reads: ' + str(len(tid_list))) 459 | return tid_list 460 | 461 | def read_tid_frag_from_bam(tid, bamfile, is_stranded, is_unique): 462 | """ 463 | Use pysam to fetch reads info for a given gene and its loci. 464 | Returns reads, read weights and its mapped loci. 
465 | """ 466 | tid_reads=[] 467 | gene, chr, strand, start, end=tid 468 | if strand=='-': 469 | is_reverse=True 470 | else: 471 | is_reverse=False 472 | reads=[x for x in bamfile.fetch(chr, int(start), int(end)) if x.is_reverse==is_reverse or not is_stranded] 473 | reads=[x for x in reads if x.pos>=int(start) and x.pos<=int(end)] 474 | for read in reads: 475 | if is_unique: 476 | try: 477 | opt_NH=read.opt('NH') 478 | if opt_NH > 1: 479 | continue 480 | except: 481 | pass 482 | score=1 483 | else: 484 | try: 485 | opt_AS=read.opt('AS') 486 | if isinstance(opt_AS, float): 487 | score=opt_AS 488 | else: 489 | continue 490 | except: 491 | continue 492 | read_length = read.qlen if read.qlen > 0 else read.positions[-1] - read.positions[0] + 1 493 | if read.pos-start>=0 and read_length<500: # to avoid junction reads 494 | tid_reads.append([read.qname, read.pos-start, read_length, score]) 495 | return tid_reads 496 | 497 | def print_time_stamp(msg): 498 | """ 499 | Reporter function for logging. 500 | """ 501 | current_time='[' + strftime("%Y-%m-%d %H:%M:%S") + '] ' 502 | print >> sys.stderr, current_time + msg 503 | 504 | if __name__=='__main__': 505 | main() -------------------------------------------------------------------------------- /deprecated/README.md: -------------------------------------------------------------------------------- 1 | # CLAM Version 1.0.0 2 | # CLIP-seq Analysis of Multi-mapped reads 3 | 4 | ## Download the latest version [here](https://github.com/Xinglab/CLAM/releases/download/1.0.0/CLAM-v1.zip). 5 | 6 | ## Requirements 7 | 8 | CLAM is a two-stage algorithm implemented in Python. It is intended to be used in Unix-based environment. It has been tested on Linux with Python 2.7.3. 9 | 10 | CLAM depends on several commonly-used Python libraries, including [pysam](http://pysam.readthedocs.io/en/latest/) and [pybedtools](https://daler.github.io/pybedtools/index.html). 11 | 12 | A detailed dependency requirements (with version info) can be found in "requirements.txt". Alternatively, just run 13 | ``` 14 | pip install -r requirements.txt 15 | ``` 16 | and you will be good to go. 17 | 18 | ## Usage 19 | We provide a general shell script wrapper that runs the whole pipeline sequentially with default parameters for CLIP-seq. You only need to give the paths to input bam file and output folder, and a binary (0/1) indicator for strandness: 20 | ``` 21 | $ sh runCLAM_git.sh $bam $output_dir $temp_dir $is_stranded 22 | ``` 23 | ..and the CLAM pipeline's output will be generated in $output_dir as specified. Check "Output" section below to understand the file formats in the CLAM output folder. 24 | 25 | 26 | Alternatively, if you want to dig more into the parameters, you can run the pipeline with "--help" in command line and check the options. The following should be printed to the screen: 27 | 28 | For CLAM re-aligner, 29 | ``` 30 | $ python CLAM.lite_aligner.py --help 31 | Usage: CLAM.lite_aligner.py input_file.bam 32 | 33 | Options: 34 | -h, --help show this help message and exit 35 | -o OUTPUT_DIR Output file folder [Default ./out_CLAM] 36 | -t TMP_DIR Temporary file folder [Default ./tmp_CLAM] 37 | -w WINDOW_SIZE Local window size for EM [Default: 50] 38 | --max-multihits=MAX_MULTIHITS 39 | Discard reads mapped to more than 40 | locations. [Default: 100] 41 | --min-unique-reads=MIN_UNIQUE_READS 42 | Discard genomic regions with less than 43 | of unique reads. [Default: 0] 44 | --is-stranded Indicates if the reads are mapped with strand 45 | information. 
[Default: False] 46 | --resume Resume mode - skipping pre-processing [Default: False] 47 | --verbose Verbose mode - print out all intermediate steps 48 | [Default: False] 49 | --max-gap=MAX_GAPS Maximum distance allowed in grouping reads. [Default: 50 | 50] 51 | ``` 52 | 53 | For CLAM peak-caller, 54 | ``` 55 | $ python CLAM.fdr_peak.MP.py --help 56 | Usage: CLAM.fdr_peak.MP.py 57 | 58 | Options: 59 | -h, --help show this help message and exit 60 | --resume Resume mode - skipping pre-processing [Default: False] 61 | --verbose Verbose mode - print out all intermediate steps 62 | [Default: False] 63 | -o OUTPUT_DIR Output file folder [Default ./out_CLAM] 64 | -t TMP_DIR Temporary file folder [Default ./tmp_CLAM] 65 | -p PEAK_FILE Output peak calling filename; if None then do not call 66 | peaks [Default none] 67 | --is-stranded Indicates if the reads are mapped with strand 68 | information. [Default: False] 69 | --extend=EXTEND Extend to given nucleotides symmetrically at peak 70 | calling [Default: 50] 71 | --pval-cutoff=PVAL_CUTOFF 72 | Corrected p-value threshold at peak calling [Default: 73 | 0.001] 74 | --merge-size=MERGE_SIZE 75 | merging window size at peak calling [Default: 50] 76 | --max-iter=MAX_ITER maximum iterations for permutation tests [Default: 77 | 1000] 78 | -g GTF GTF file [Default: ./GTF/hg19_ensembl.sorted_gene.bed] 79 | --ThreadN=NB_PROC Number of threads when doing permutations. [Default: 80 | 4] 81 | --seed=SEED Random seed for permutations. [Default: 100] 82 | --merge-method=MERGE_METHOD 83 | Peak merging method. 1: Narrow peak 2: Broad peak 84 | [Default: 1] 85 | --pval-method=CORRECTION_METHOD 86 | Multiple testing correction method. 1: Bonferroni 2: 87 | BH FDR [Default: 1] 88 | --call-transcriptome Call peaks on transcriptome instead of genes with 89 | multi-mappers. [Default: False] 90 | ``` 91 | And you can specify your own parameters accordingly. For example, for **m6A RIP-seq**, window size parameter (-w) and maximum gaps (--max-gap) for re-aligner should be set to 100. 92 | 93 | ## Output 94 | The output of the re-aligner is "assigned_multimapped_reads.bam", which is a customized BAM file following SAM format. Note that the re-aligned weights are stored in "AS:f" tag, so please be aware and do not change/omit it. 95 | Output of re-aligner could also be seen as an intermediate file for CLAM pipeline. 96 | 97 | The output of the peak-caller is a bed file whose name is specified by user. It is a 6-column [BED](https://genome.ucsc.edu/FAQ/FAQformat.html#format1) format file, separated by tabulate and ordered as 98 | ``` 99 | chr start end height;fdr;gene unique/combined strand 100 | ``` 101 | Hence a peak with "combined" but no "unique" on the fifth column indicates this is a rescued peak; both "unique" and "combined" as common peak; or lost peak otherwise. 102 | 103 | ## Testing data 104 | Once downloaded the CLAM source code, please download the hnRNPC iCLIP dataset from [here](http://www.mimg.ucla.edu/faculty/xing/CLAM/hnRNPC_iCLIP_rep1_E-MAT-1371_novoalign.sorted.bam). 105 | 106 | Then run CLAM on the dataset; if finished correctly, you should have rescued peaks at these two loci: 107 | 108 | chr11:82,624,179-82,626,008 109 | 110 | chr20:37,054,180-37,055,310 111 | 112 | 113 | 114 | ## Contacts 115 | Yi Xing [yxing@ucla.edu](mailto:yxing@ucla.edu) 116 | 117 | Zijun Zhang [zj.z@ucla.edu](mailto:zj.z@ucla.edu) 118 | 119 | If you found a bug or mistake in this project, we would like to know about it. 
Before you send us the bug report though, please check the following: 120 | 121 | 1. Are you using the latest version? The bug you found may already have been fixed. 122 | 2. Check that your input is in the correct format and you have selected the correct options. 123 | 3. Please reduce your input to the smallest possible size that still produces the bug; we will need your input data to reproduce the problem, and the smaller you can make it, the easier it will be. 124 | 125 | ## Copyright and License Information 126 | Copyright (C) 2016 University of California, Los Angeles (UCLA) Zijun Zhang and Yi Xing 127 | 128 | Authors: Zijun Zhang and Yi Xing 129 | 130 | This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 131 | 132 | This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 133 | 134 | You should have received a copy of the GNU General Public License along with this program. If not, see [http://www.gnu.org/licenses/](http://www.gnu.org/licenses/). 135 | -------------------------------------------------------------------------------- /deprecated/requirements.txt: -------------------------------------------------------------------------------- 1 | pysam==0.9.0 2 | pybedtools==0.7.4 3 | multiprocessing 4 | optparse 5 | statsmodels.sandbox.stats.multicomp 6 | operator -------------------------------------------------------------------------------- /deprecated/runCLAM_git.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # -- our name --- 3 | #$ -S /bin/bash 4 | #$ -R y 5 | #$ -V 6 | # Make sure that the .e and .o file arrive in the 7 | # working directory 8 | #$ -cwd 9 | #Merge the standard out and standard error to one file 10 | #$ -j y 11 | # 12 | # Send mail at submission and completion of script 13 | #$ -m be 14 | #$ -M your@email.com 15 | # 16 | 17 | # change the script dir to your own if necessary 18 | script_dir="./" 19 | 20 | echo "bamfile: $1" 21 | echo "output folder: $2" 22 | echo "tmp folder: $3" 23 | echo "is_stranded: $4" 24 | 25 | if [ $4 -eq 1 ] 26 | then 27 | echo "is stranded" 28 | python $script_dir"/CLAM.lite_aligner.py" --verbose -o $2 -t $3 --is-stranded $1 29 | else 30 | echo "unstranded" 31 | python $script_dir"/CLAM.lite_aligner.py" --verbose -o $2 -t $3 $1 32 | fi 33 | 34 | if [ $4 -eq 1 ] 35 | then 36 | python $script_dir"/CLAM.fdr_peak.MP.py" --verbose --is-stranded -o $2 -t $3 --pval-cutoff=0.001 --pval-method=2 -p CLAM_peak.bed --ThreadN=30 --max-iter=1000 37 | else 38 | python $script_dir"/CLAM.fdr_peak.MP.py" --verbose -o $2 -t $3 --pval-cutoff=0.001 --pval-method=2 -p CLAM_peak.bed --ThreadN=30 --max-iter=1000 39 | fi 40 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xinglab/CLAM/aed16a4d4e56535e17302448c48f32d35cba14cd/docs/.nojekyll -------------------------------------------------------------------------------- /docs/CLAM.rst: -------------------------------------------------------------------------------- 1 | CLAM package 2 | ============ 3 | 4 | Subpackages 5 | ----------- 6 
| 7 | .. toctree:: 8 | 9 | CLAM.stats 10 | 11 | Submodules 12 | ---------- 13 | 14 | CLAM.peakcaller module 15 | ---------------------- 16 | 17 | .. automodule:: CLAM.peakcaller 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | CLAM.permutation\_peakcaller module 23 | ----------------------------------- 24 | 25 | .. automodule:: CLAM.permutation_peakcaller 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | CLAM.preprocessor module 31 | ------------------------ 32 | 33 | .. automodule:: CLAM.preprocessor 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | CLAM.realigner module 39 | --------------------- 40 | 41 | .. automodule:: CLAM.realigner 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | 47 | Module contents 48 | --------------- 49 | 50 | .. automodule:: CLAM 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | -------------------------------------------------------------------------------- /docs/CLAM.stats.rst: -------------------------------------------------------------------------------- 1 | CLAM.stats package 2 | ================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | CLAM.stats.bin\_test\_alternatives module 8 | ----------------------------------------- 9 | 10 | .. automodule:: CLAM.stats.bin_test_alternatives 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | CLAM.stats.ztnb\_em module 16 | -------------------------- 17 | 18 | .. automodule:: CLAM.stats.ztnb_em 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: CLAM.stats 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = CLAM 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'CLAM' 23 | copyright = '2018, Zijun Zhang' 24 | author = 'Zijun Zhang' 25 | 26 | # The short X.Y version 27 | version = '' 28 | # The full version, including alpha/beta/rc tags 29 | release = '' 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | 'sphinx.ext.autodoc', 43 | 'sphinx.ext.todo', 44 | 'sphinx.ext.coverage', 45 | 'sphinx.ext.viewcode', 46 | 'sphinx.ext.githubpages', 47 | ] 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ['_templates'] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | # 55 | # source_suffix = ['.rst', '.md'] 56 | source_suffix = '.rst' 57 | 58 | # The master toctree document. 59 | master_doc = 'index' 60 | 61 | # The language for content autogenerated by Sphinx. Refer to documentation 62 | # for a list of supported languages. 63 | # 64 | # This is also used if you do content translation via gettext catalogs. 65 | # Usually you set "language" from the command line for these cases. 66 | language = None 67 | 68 | # List of patterns, relative to source directory, that match files and 69 | # directories to ignore when looking for source files. 70 | # This pattern also affects html_static_path and html_extra_path . 71 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 72 | 73 | # The name of the Pygments (syntax highlighting) style to use. 74 | pygments_style = 'sphinx' 75 | 76 | 77 | # -- Options for HTML output ------------------------------------------------- 78 | 79 | # The theme to use for HTML and HTML Help pages. See the documentation for 80 | # a list of builtin themes. 81 | # 82 | html_theme = 'alabaster' 83 | 84 | # Theme options are theme-specific and customize the look and feel of a theme 85 | # further. For a list of options available for each theme, see the 86 | # documentation. 87 | # 88 | # html_theme_options = {} 89 | 90 | # Add any paths that contain custom static files (such as style sheets) here, 91 | # relative to this directory. They are copied after the builtin static files, 92 | # so a file named "default.css" will overwrite the builtin "default.css". 93 | html_static_path = ['_static'] 94 | 95 | # Custom sidebar templates, must be a dictionary that maps document names 96 | # to template names. 97 | # 98 | # The default sidebars (for documents that don't match any pattern) are 99 | # defined by theme itself. Builtin themes are using these templates by 100 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 101 | # 'searchbox.html']``. 102 | # 103 | # html_sidebars = {} 104 | 105 | 106 | # -- Options for HTMLHelp output --------------------------------------------- 107 | 108 | # Output file base name for HTML help builder. 109 | htmlhelp_basename = 'CLAMdoc' 110 | 111 | 112 | # -- Options for LaTeX output ------------------------------------------------ 113 | 114 | latex_elements = { 115 | # The paper size ('letterpaper' or 'a4paper'). 
116 | # 117 | # 'papersize': 'letterpaper', 118 | 119 | # The font size ('10pt', '11pt' or '12pt'). 120 | # 121 | # 'pointsize': '10pt', 122 | 123 | # Additional stuff for the LaTeX preamble. 124 | # 125 | # 'preamble': '', 126 | 127 | # Latex figure (float) alignment 128 | # 129 | # 'figure_align': 'htbp', 130 | } 131 | 132 | # Grouping the document tree into LaTeX files. List of tuples 133 | # (source start file, target name, title, 134 | # author, documentclass [howto, manual, or own class]). 135 | latex_documents = [ 136 | (master_doc, 'CLAM.tex', 'CLAM Documentation', 137 | 'Zijun Zhang', 'manual'), 138 | ] 139 | 140 | 141 | # -- Options for manual page output ------------------------------------------ 142 | 143 | # One entry per manual page. List of tuples 144 | # (source start file, name, description, authors, manual section). 145 | man_pages = [ 146 | (master_doc, 'clam', 'CLAM Documentation', 147 | [author], 1) 148 | ] 149 | 150 | 151 | # -- Options for Texinfo output ---------------------------------------------- 152 | 153 | # Grouping the document tree into Texinfo files. List of tuples 154 | # (source start file, target name, title, author, 155 | # dir menu entry, description, category) 156 | texinfo_documents = [ 157 | (master_doc, 'CLAM', 'CLAM Documentation', 158 | author, 'CLAM', 'One line description of project.', 159 | 'Miscellaneous'), 160 | ] 161 | 162 | 163 | # -- Extension configuration ------------------------------------------------- 164 | # -- Options for todo extension ---------------------------------------------- 165 | 166 | # If true, `todo` and `todoList` produce output, else they produce nothing. 167 | todo_include_todos = True 168 | -------------------------------------------------------------------------------- /docs/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xinglab/CLAM/aed16a4d4e56535e17302448c48f32d35cba14cd/docs/image.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. CLAM documentation master file, created by 2 | sphinx-quickstart on Mon Jun 25 23:36:42 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to CLAM's documentation! 7 | ================================ 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | CLAM 2 | ==== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | CLAM 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pysam>=0.9.0 2 | multiprocessing 3 | statsmodels 4 | tqdm 5 | pybedtools 6 | mpmath -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | from CLAM.config import __version__ 5 | 6 | def main(): 7 | setup(name='CLAM', 8 | version=__version__, 9 | description='CLIP-seq Analysis of Multi-mapped reads', 10 | author='Zijun Zhang', 11 | author_email='zj.z@ucla.edu', 12 | url='https://github.com/Xinglab/CLAM', 13 | packages=['CLAM', 'CLAM.stats'], 14 | scripts=['bin/CLAM'], 15 | install_requires=[ 16 | 'scipy', 17 | 'pysam', 18 | 'numpy', 19 | 'statsmodels', 20 | 'tqdm', 21 | 'pybedtools', 22 | 'mpmath'] 23 | ) 24 | return 25 | 26 | if __name__ == '__main__': 27 | main() 28 | --------------------------------------------------------------------------------
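
A short usage note on the packaging files above: `setup.py` installs the `CLAM` and `CLAM.stats` packages with setuptools, places the `bin/CLAM` wrapper on the PATH via `scripts=['bin/CLAM']`, and takes its dependency list from `install_requires` rather than from the top-level `requirements.txt`. The following is a minimal install-and-check sketch, not the project's documented procedure: the `CLAM --help` call is an assumption about the installed wrapper's interface (the wrapper script itself is not reproduced in this dump), while the version check uses only `CLAM.config.__version__`, which `setup.py` itself imports.

```
$ pip install .        # from the repository root; or: python setup.py install
$ python -c "from CLAM.config import __version__; print(__version__)"
$ CLAM --help          # hypothetical: assumes the installed bin/CLAM wrapper prints its usage
```

Installing with `pip install .` keeps dependency resolution consistent with `install_requires` (which includes `scipy` and `numpy`); the top-level `requirements.txt` omits those two and additionally lists `multiprocessing`, which is a standard-library module on current Pythons, so the setuptools route is the safer of the two.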