├── .gitignore ├── CLAM ├── __init__.py ├── bak │ ├── CLAM.fdr_peak.MP.py │ ├── CLAM.lite_aligner.py │ ├── deep_getsizeof.py │ ├── peakcaller.bak.py │ ├── peakcaller.bak2.py │ ├── realigner.bak.py │ ├── sim_callpeak.r │ └── utils.py ├── config.py ├── download_data.py ├── peak_annotator.py ├── peakcaller.py ├── permutation_peakcaller.py ├── preprocessor.py ├── realigner.py ├── stats │ ├── __init__.py │ ├── bin_test_alternatives.py │ └── ztnb_em.py └── utils │ └── parseBAM.py ├── LICENSE ├── README.md ├── _config.yml ├── bin └── CLAM ├── check └── compare_realign.py ├── deprecated ├── CLAM.fdr_peak.MP.py ├── CLAM.lite_aligner.py ├── GTF │ └── hg19_ensembl.sorted_gene.bed ├── README.md ├── requirements.txt └── runCLAM_git.sh ├── docs ├── .nojekyll ├── CLAM.rst ├── CLAM.stats.rst ├── Makefile ├── conf.py ├── image.png ├── index.rst └── modules.rst ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .vscode/ 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /CLAM/__init__.py: -------------------------------------------------------------------------------- 1 | # metadata 2 | __author__ = 'Zijun Zhang' 3 | __version__ = '1.1.3' 4 | __email__ = 'zj.z@ucla.edu' 5 | -------------------------------------------------------------------------------- /CLAM/bak/CLAM.fdr_peak.MP.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | This peak-caller script is part of the CLAM pipeline. 5 | 6 | It takes input from re-aligner output, and use permutation to call peaks. 
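# A minimal, self-contained sketch of the permutation idea used in this
# script (see do_permutation / permutate_heights below): read start
# positions are shuffled uniformly along the gene, pile-up heights are
# recomputed, and the empirical p-value of an observed height is the
# fraction of pooled (observed + permuted) non-zero heights at least
# that large. All names and numbers here are hypothetical.
import random

def toy_pileup(positions, read_len, gene_len):
	heights = [0] * gene_len
	for p in positions:
		for i in range(p, min(p + read_len, gene_len)):
			heights[i] += 1
	return heights

def toy_permutation_pvalue(positions, read_len, gene_len, obs_height, n_iter=100):
	pooled = [h for h in toy_pileup(positions, read_len, gene_len) if h > 0]
	for _ in range(n_iter):
		rand_pos = [random.randint(0, gene_len - read_len) for _ in positions]
		pooled += [h for h in toy_pileup(rand_pos, read_len, gene_len) if h > 0]
	return sum(1 for h in pooled if h >= obs_height) / float(len(pooled))

# e.g. toy_permutation_pvalue([10, 12, 15, 300], read_len=30, gene_len=1000, obs_height=3)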
7 | 8 | Tested under python 2.7.3 9 | """ 10 | 11 | __author__ = 'Zijun Zhang' 12 | __version__ = '1.0.0' 13 | __email__ = 'zj.z@ucla.edu' 14 | 15 | 16 | from optparse import OptionParser 17 | import os, subprocess, sys 18 | from collections import defaultdict 19 | from statsmodels.sandbox.stats.multicomp import multipletests 20 | from time import strftime 21 | import cPickle as pickle 22 | import bisect, random 23 | import pysam 24 | import pybedtools 25 | from multiprocessing import Pool 26 | 27 | def main(): 28 | """ 29 | The main wrapper for CLAM peak-caller. 30 | """ 31 | # options parsing 32 | usage='usage: %prog ' 33 | parser=OptionParser(usage) 34 | 35 | parser.add_option('--resume', dest='resume', action='store_true', default=False, help='Resume mode - skipping pre-processing [Default: %default]') 36 | parser.add_option('--verbose', dest='verbose', action='store_true', default=False, help='Verbose mode - print out all intermediate steps [Default: %default]') 37 | parser.add_option('-o', dest='output_dir', default='./out_CLAM', help='Output file folder [Default %default]') 38 | parser.add_option('-t', dest='tmp_dir', default='./tmp_CLAM', help='Temporary file folder [Default %default]') 39 | parser.add_option('-p', dest='peak_file', default=None, help='Output peak calling filename; if None then do not call peaks [Default %default]') 40 | parser.add_option('--is-stranded', dest='is_stranded', default=False, action='store_true', help='Indicates if the reads are mapped with strand information. [Default: %default]') 41 | parser.add_option('--extend', dest='extend', type='int', default=50, help='Extend to given nucleotides symmetrically at peak calling [Default: %default]') 42 | parser.add_option('--pval-cutoff', dest='pval_cutoff', type='float', default=0.001, help='Corrected p-value threshold at peak calling [Default: %default]') 43 | parser.add_option('--merge-size', dest='merge_size', type='int', default=50, help='merging window size at peak calling [Default: %default]') 44 | parser.add_option('--max-iter', dest='max_iter', type='int', default=1000, help='maximum iterations for permutation tests [Default: %default]') 45 | parser.add_option('-g', dest='gtf', default='./GTF/hg19_ensembl.sorted_gene.bed', help='GTF file [Default: %default]') 46 | parser.add_option('--ThreadN', dest='nb_proc', type='int', default=4, help='Number of threads when doing permutations. [Default: %default]') 47 | parser.add_option('--seed', dest='seed', type='int', default=100, help='Random seed for permutations. [Default: %default]') 48 | parser.add_option('--merge-method', dest='merge_method', type='int', default=1, help='Peak merging method. 1: Narrow peak 2: Broad peak [Default: %default]') 49 | parser.add_option('--pval-method', dest='correction_method', type='int', default=1, help='Multiple testing correction method. 1: Bonferroni 2: BH FDR [Default: %default]') 50 | parser.add_option('--call-transcriptome', dest='call_all', action='store_true', default=False, help='Call peaks on transcriptome instead of genes with multi-mappers. 
[Default: %default]') 51 | 52 | (options,args)=parser.parse_args() 53 | 54 | output_dir=os.path.abspath(options.output_dir) 55 | tmp_dir=os.path.abspath(options.tmp_dir) 56 | verbose=options.verbose 57 | 58 | #random.seed(options.seed) 59 | 60 | write_parameter_log(options, output_dir) 61 | 62 | # find transcripts associated with multi-mapped reads 63 | if verbose: 64 | print_time_stamp('Finding transcripts with multimapped reads.') 65 | if not os.path.isfile(output_dir + '/CLAM_mapper.sorted.out'): 66 | subprocess.call(''' sort -k1,1 -k2,2n %s/CLAM_mapper.out | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6}' > %s/CLAM_mapper.sorted.out ''' % (output_dir, output_dir), shell=True) 67 | # Note: tid_list: tid -> [chr:strand, start, end] 68 | tid_list=read_aligner_output(output_dir + '/CLAM_mapper.sorted.out', options.gtf, options.is_stranded, tmp_dir, options.resume, options.call_all) 69 | 70 | # make bam file for re-aligner output, if non-exist 71 | if not (options.resume and os.path.isfile(output_dir + '/assigned_multimapped_reads.bam')): 72 | if verbose: 73 | print_time_stamp('Making bamfile for aligner output.') 74 | header_cmd='samtools view -H ' + tmp_dir + '/filter100.sorted.bam > ' + output_dir + '/sam_header.sam' 75 | subprocess.call(header_cmd, shell=True) 76 | body_cmd = ''' awk '{if($6=="+"){print $4"\t256\t"$1"\t"$2+1"\t0\t"$3-$2+1"M\t*\t0\t0\t*\t*\tAS:f:"$5}else{print $4"\t272\t"$1"\t"$2+1"\t0\t"$3-$2+1"M\t*\t0\t0\t*\t*\tAS:f:"$5 }}' ''' + output_dir + '/CLAM_mapper.sorted.out > ' + output_dir + '/CLAM_mapper.sorted.sam' 77 | subprocess.call(body_cmd, shell=True) 78 | makeBam_cmd = 'cat %s/sam_header.sam %s/CLAM_mapper.sorted.sam | samtools view -bS - > %s/assigned_multimapped_reads.bam' % (output_dir, output_dir,output_dir) 79 | subprocess.call(makeBam_cmd, shell=True) 80 | index_cmd = 'samtools index %s/assigned_multimapped_reads.bam' % output_dir 81 | subprocess.call(index_cmd, shell=True) 82 | 83 | # multi-processing peak-caller 84 | if not (options.resume and os.path.isfile(tmp_dir+'/unique_to_qval.pdata') and os.path.isfile(tmp_dir+'/combined_to_qval.pdata')): 85 | child_transcr_ind = list(chunkify(range(len(tid_list)), options.nb_proc)) 86 | 87 | pool = Pool(processes=options.nb_proc) 88 | 89 | unibam_file=tmp_dir+'/filter100.sorted.bam' 90 | multibam_file=output_dir+'/assigned_multimapped_reads.bam' 91 | tid_to_qval_compact = pool.map(get_permutation_fdr, [ (unibam_file, multibam_file, tid_list, child_transcr_ind[i], options.pval_cutoff, options.max_iter, options.is_stranded, verbose, options.correction_method, options.seed) for i in range(options.nb_proc) ]) 92 | 93 | pool.terminate() 94 | pool.join() 95 | 96 | unique_tid_to_qval, combined_tid_to_qval = unpack_tid_to_qval(tid_to_qval_compact) 97 | pickle.dump(unique_tid_to_qval, open(tmp_dir+'/unique_to_qval.pdata','wb'), -1) 98 | pickle.dump(combined_tid_to_qval, open(tmp_dir+'/combined_to_qval.pdata','wb'), -1) 99 | else: 100 | print_time_stamp('Resume mode, found qval data files.') 101 | unique_tid_to_qval=pickle.load(open(tmp_dir+'/unique_to_qval.pdata','rb')) 102 | combined_tid_to_qval=pickle.load(open(tmp_dir+'/combined_to_qval.pdata','rb')) 103 | 104 | # merge peaks 105 | if options.merge_method==1: 106 | merge_peaks=merge_peaks_singleNucl 107 | mm='singleNucl' 108 | elif options.merge_method==2: 109 | merge_peaks=merge_peaks_broadPeak 110 | mm='broadPeak' 111 | else: 112 | merge_peaks=merge_peaks_singleNucl 113 | mm='unknown selection, using default singleNucl' 114 | 115 | if verbose: 116 | 
print_time_stamp('Merging peaks within ' + str(options.merge_size) + 'bp, using ' + mm + '..') 117 | 118 | unique_peaks=merge_peaks(unique_tid_to_qval, options.merge_size, options.pval_cutoff) 119 | combined_peaks=merge_peaks(combined_tid_to_qval, options.merge_size, options.pval_cutoff) 120 | 121 | print_time_stamp('Comparing results and writing to file..') 122 | 123 | # write peak-calling results to file. 124 | with open(output_dir + '/all_peaks.txt', 'w') as f: 125 | for peak in unique_peaks: # peak = ['chr\tstart\tend\tstrand', 'height\tqval\t', tid] 126 | if options.extend is None: 127 | wt_loc=peak[0] 128 | else: 129 | wt_loc=extend_peak_region(peak[0], options.extend) 130 | f.write(wt_loc + '\t' + '\t'.join([str(x) for x in peak[1]]) + '\t' + peak[2] + '\tunique\n') 131 | for peak in combined_peaks: 132 | if options.extend is None: 133 | wt_loc=peak[0] 134 | else: 135 | wt_loc=extend_peak_region(peak[0], options.extend) 136 | f.write(wt_loc + '\t' + '\t'.join([str(x) for x in peak[1]]) + '\t' + peak[2] + '\tcombined\n') 137 | subprocess.call(''' sort -k1,1 -k2,2n %s/all_peaks.txt | awk '{print $1"\t"$2"\t"$3"\t"$5";"$6";"$7"\t"$8"\t"$4}' | bedtools merge -s -d -1 -i stdin -c 4,5,6, -o collapse,collapse,distinct > %s''' % (output_dir, options.peak_file), shell=True) 138 | 139 | print_time_stamp('Peak-calling done.') 140 | 141 | def write_parameter_log(options, output_dir): 142 | """ 143 | Write paramter values to a log file, named by current time. 144 | """ 145 | merge_method_dict={1:'narrowPeak', 2:'broadPeak'} 146 | correction_method_dict={1:'Bonferroni', 2:'BH_FDR'} 147 | with open(output_dir+'/CLAM_Peaker.Parameters.'+ strftime("%Y%m%d_%H%M") + '.txt', 'w') as log: 148 | log.write('CLAM Peaker ' + __version__ + '\n') 149 | log.write('resume: ' + str(options.resume) + '\n') 150 | log.write('verbose: ' + str(options.verbose) + '\n') 151 | log.write('output_dir:' + str(options.output_dir) + '\n') 152 | log.write('tmp_dir: ' + str(options.tmp_dir) + '\n') 153 | log.write('peak_file: ' + str(options.peak_file) + '\n') 154 | log.write('is_stranded: ' + str(options.is_stranded) + '\n') 155 | log.write('extend: ' + str(options.extend) + '\n') 156 | log.write('pval_cutoff: ' + str(options.pval_cutoff) + '\n') 157 | log.write('merge_size: ' + str(options.merge_size) + '\n') 158 | log.write('max_iter: ' + str(options.max_iter) + '\n') 159 | log.write('gtf: ' + str(options.gtf) + '\n') 160 | log.write('seed: ' + str(options.seed) + '\n') 161 | log.write('merge_method: ' + merge_method_dict[options.merge_method] + '\n') 162 | log.write('correction_method: ' + correction_method_dict[options.correction_method] + '\n') 163 | log.write('thread: ' + str(options.nb_proc) + '\n') 164 | 165 | def chunkify(a, n): 166 | """ 167 | Separate a list (a) into consecutive n chunks. 168 | Returns the chunkified index 169 | """ 170 | k, m = len(a) / n, len(a) % n 171 | return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in xrange(n)) 172 | 173 | def unpack_tid_to_qval(compact): 174 | """ 175 | Unpacks the returned values from multi-processing. 
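# chunkify() above splits the gene index list into nb_proc consecutive
# slices, one per worker in the multiprocessing Pool; the per-worker
# results are then re-packed by unpack_tid_to_qval(). A small usage
# sketch (Python 2, since chunkify relies on integer division of
# len(a)/n and on xrange):
#
#   >>> list(chunkify(range(10), 3))
#   [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]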
176 | """ 177 | unique_tid_to_qval=defaultdict(list) 178 | combined_tid_to_qval=defaultdict(list) 179 | for item in compact: 180 | unique, combined = item[0], item[1] 181 | for tid in combined: 182 | if len(unique[tid])>0: 183 | unique_tid_to_qval[tid]=unique[tid] 184 | if len(combined[tid])>1: 185 | combined_tid_to_qval[tid]=combined[tid] 186 | return unique_tid_to_qval,combined_tid_to_qval 187 | 188 | def get_permutation_fdr((unibam_file, multibam_file, tid_list, tid_ind, pval_cutoff, max_iter, is_stranded, verbose, correction_method, seed)): 189 | """ 190 | General permutation wrapper for a list of genes. Gets called by multi-processing generated by Pool() 191 | Returns packed FDRs from each child process. 192 | """ 193 | random.seed(seed) 194 | 195 | unique_tid_to_qval=defaultdict(list) 196 | combined_tid_to_qval=defaultdict(list) 197 | 198 | unibam=pysam.Samfile(unibam_file, 'rb') 199 | multibam=pysam.Samfile(multibam_file, 'rb') 200 | 201 | processed=0 202 | pid=os.getpid() 203 | 204 | for ind in tid_ind: 205 | processed+=1 206 | if verbose and not processed % 100: 207 | print_time_stamp(str(processed) + '/' + str(len(tid_ind)) + ' finished in pid ' + str(pid)) 208 | tid, chr, strand, start, end = tid_list[ind] 209 | unique_reads = read_tid_frag_from_bam(tid_list[ind], unibam, is_stranded, True) 210 | multi_reads = read_tid_frag_from_bam(tid_list[ind], multibam, is_stranded, False) 211 | 212 | this_unique_to_qval = do_permutation(tid_list[ind], unique_reads, max_iter, pval_cutoff, correction_method) 213 | this_combined_to_qval = do_permutation(tid_list[ind], unique_reads+multi_reads, max_iter, pval_cutoff, correction_method) 214 | 215 | unique_tid_to_qval[tid].extend(this_unique_to_qval) 216 | combined_tid_to_qval[tid].extend(this_combined_to_qval) 217 | unibam.close() 218 | multibam.close() 219 | return unique_tid_to_qval, combined_tid_to_qval 220 | 221 | def do_permutation(transcr, read_transcript, max_iter, pval_cutoff, correction_method): 222 | """ 223 | Permutes the reads along a given gene length, sub-routine that get called by get_permutation_fdr(..). 224 | Returns the locally corrected p-values for each observed height on the given gene. 225 | """ 226 | tid, chr, strand, tstart, tend = transcr 227 | tid_length=tend-tstart+1 228 | obs_heights_count=count_pileup_heights(tid_length, read_transcript) 229 | 230 | tid_to_qval=[] 231 | 232 | rand_heights_dist=defaultdict(int) 233 | rand_sum=0 234 | # need to account for the 'observed' data, since permutation tests should never report p-value as 0. 
3/22/16 235 | for i in obs_heights_count: 236 | if i==0: 237 | continue 238 | else: 239 | rand_heights_dist[int(i)]+=1 240 | rand_sum+=1 241 | for B in range(max_iter): 242 | new_heights_count=permutate_heights(tid_length, read_transcript) 243 | for i in new_heights_count: 244 | if i==0: 245 | continue 246 | else: 247 | rand_heights_dist[i]+=1 248 | rand_sum+=1 249 | height_to_pval={} 250 | for h in set(obs_heights_count): 251 | if h < 1: 252 | continue 253 | else: 254 | lefter=0 255 | for j in range(int(h), max(rand_heights_dist)+1): 256 | lefter+=rand_heights_dist[j] 257 | height_to_pval[h]=lefter/float(rand_sum) 258 | pval_list=[] 259 | for i in obs_heights_count: 260 | if i<1: 261 | continue 262 | pval_list.append(height_to_pval[i]) 263 | if len(pval_list)<=1: 264 | return [] 265 | 266 | if correction_method==2: 267 | qval_list=multipletests(pval_list, method='fdr_bh')[1] 268 | else: 269 | qval_list=[min(x*(len(set([int(y) for y in height_to_pval if y!=0]))), 1.0) for x in pval_list] 270 | 271 | ind=0 272 | last_height=0 273 | for j in range(len(obs_heights_count)): 274 | this_height=obs_heights_count[j] 275 | if this_height<1: 276 | last_height=0 277 | continue 278 | if qval_list[ind] <= pval_cutoff: 279 | if this_height==last_height: 280 | chr, last_start, last_end, last_strand, last_height, last_qval=tid_to_qval[-1] 281 | tid_to_qval[-1]=[chr, last_start, tstart+j+1, strand, last_height, last_qval] 282 | else: 283 | tid_to_qval.append([chr, tstart+j, tstart+j+1, strand, obs_heights_count[j], qval_list[ind]]) # chr, start, end, strand, height, this_qval 284 | last_height=this_height 285 | ind+=1 286 | return tid_to_qval 287 | 288 | def heights_to_dist(rand_heights): 289 | """ 290 | sub-routine 291 | """ 292 | rand_heights_dist=defaultdict(int) 293 | rand_sum=0 294 | for new_heights_count in rand_heights: 295 | for i in new_heights_count: 296 | if i==0: 297 | continue 298 | else: 299 | rand_heights_dist[i]+=1 300 | rand_sum+=1 301 | return rand_heights_dist, rand_sum 302 | 303 | def permutate_heights(tlen, reads): 304 | """ 305 | Sub-routine for do_permutation(...) 306 | Randomly allocate the read locations. 307 | """ 308 | loc_heights=[0] * tlen 309 | for id, pos, read_len, score in reads: 310 | if score<1 and random.random() > score: 311 | continue 312 | rand_pos=random.randint(1, max(1, tlen-read_len)) 313 | for i in range(rand_pos, min(rand_pos + read_len, tlen)): 314 | loc_heights[i]+=1 315 | return loc_heights 316 | 317 | def count_pileup_heights(tlen, reads): 318 | """ 319 | Sub-routine for do_permutation(...) 320 | Counts the distribution of pile-up heights for a given gene/permutation 321 | """ 322 | loc_heights=[0] * tlen 323 | for id, pos, read_len, score in reads: 324 | for i in range(pos, min(pos+read_len-1, tlen)): 325 | loc_heights[i]+=score 326 | return loc_heights 327 | 328 | def merge_peaks_broadPeak(transcript_to_qval, merge_size, pval_cutoff): 329 | """ 330 | Merge called peaks on a gene using option 2, 331 | i.e. if two peaks close to each other, region 332 | between two peaks are also called as peaks 333 | Retuns a list of merged peaks. 
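# Sketch of the broad-peak behaviour described above: significant
# positions whose gap is at most merge_size are joined, and the region
# between them is reported as part of one peak. Minimal illustration
# over hypothetical (start, end) pairs:
def toy_merge_broad(intervals, merge_size):
	merged = []
	for start, end in sorted(intervals):
		if merged and start - merged[-1][1] <= merge_size:
			merged[-1][1] = max(merged[-1][1], end)
		else:
			merged.append([start, end])
	return merged

# toy_merge_broad([(100, 101), (130, 131), (400, 401)], merge_size=50)
# -> [[100, 131], [400, 401]]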
334 | """ 335 | peaks=[] 336 | last_qval=[0,1] 337 | for tid in transcript_to_qval: 338 | init=True 339 | for chr, start, end, strand, height, this_qval in transcript_to_qval[tid]: 340 | loc=[chr, str(start), str(end), strand] 341 | this_qval=[height, this_qval] # this_qval=[height, qval] so that when qval=0, we can compare height 342 | if this_qval[1] > pval_cutoff: 343 | continue 344 | if init: 345 | last_qval=this_qval 346 | last_pos=[start, end] 347 | last_loc=loc 348 | last_chr=chr 349 | write_out=False 350 | init=False 351 | continue 352 | if int(start) - int(last_pos[1]) > merge_size: 353 | write_out=True 354 | else: 355 | last_pos=[last_pos[0], end] 356 | last_qval=this_qval if last_qval[0] pval_cutoff: 386 | continue 387 | if init: 388 | last_qval=this_qval 389 | last_pos=[start, end] 390 | last_loc=loc 391 | last_chr=chr 392 | write_out=False 393 | init=False 394 | continue 395 | if last_chr == chr: 396 | if abs( int(start) - int(last_pos[0]) ) > merge_size: 397 | write_out=True 398 | elif last_qval[0] < this_qval[0]: 399 | last_pos=[start, end] 400 | last_qval=this_qval 401 | last_loc=loc 402 | write_out=False 403 | else: 404 | write_out=True 405 | 406 | if write_out and last_qval[1] < pval_cutoff: 407 | #peaks[last_loc]=last_qval 408 | peaks.append([last_loc, last_qval, tid]) 409 | last_qval=this_qval 410 | last_pos=[start, end] 411 | last_loc=loc 412 | last_chr=chr 413 | write_out=False 414 | if last_qval[1] < pval_cutoff: 415 | peaks.append([last_loc, last_qval, tid]) 416 | return peaks 417 | 418 | def extend_peak_region(loc, target_len): 419 | """ 420 | Extends peak symmetrically if peak is smaller than target_len. 421 | """ 422 | chr, start, end, strand = loc.split('\t') 423 | start = int(start) 424 | end = int(end) 425 | old_len = end - start 426 | if old_len > target_len: 427 | return loc 428 | else: 429 | center = int((start + end)/2) 430 | start = center - int(target_len /2) 431 | end = center + int(target_len/2) 432 | return '\t'.join([chr, str(start), str(end), strand]) 433 | 434 | def read_aligner_output(rm_out, gtffile, is_stranded, tmp_dir, resume, call_all): 435 | """ 436 | Use bedtools to get transcripts/genes with multi-mapped reads. 437 | Returns a list of transcripts/genes. 438 | """ 439 | if not (resume and os.path.isfile(tmp_dir + '/gtf2multireads.bed')): 440 | rm_bed=pybedtools.BedTool(rm_out) 441 | gtf=pybedtools.BedTool(gtffile) 442 | gtf_bed_rm = gtf.intersect(rm_bed, s=True, u=True) if is_stranded else gtf.intersect(rm_bed, u=True) 443 | gtf_bed_rm.saveas(tmp_dir + '/gtf2multireads.bed') 444 | pybedtools.cleanup() 445 | 446 | tid_list=[] 447 | if call_all: 448 | gtf_to_read=gtffile 449 | else: 450 | gtf_to_read=tmp_dir+'/gtf2multireads.bed' 451 | with open(gtf_to_read,'r') as f: 452 | for line in f: 453 | ele=line.rstrip().split('\t') 454 | gene_id=ele[3] 455 | gene_chr, gene_start, gene_end=ele[0], int(ele[1]), int(ele[2]) 456 | gene_strand=ele[5] 457 | tid_list.append([gene_id, gene_chr, gene_strand, gene_start, gene_end]) 458 | print_time_stamp('Read transcripts with multi-reads: ' + str(len(tid_list))) 459 | return tid_list 460 | 461 | def read_tid_frag_from_bam(tid, bamfile, is_stranded, is_unique): 462 | """ 463 | Use pysam to fetch reads info for a given gene and its loci. 464 | Returns reads, read weights and its mapped loci. 
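# For orientation, each element returned by read_tid_frag_from_bam() has
# the form [read_name, offset_within_gene, read_length, score], where
# score is 1 for unique reads and the fractional AS weight assigned by
# the re-aligner for multi-mapped reads. Hypothetical example:
#
#   [['r1', 120, 36, 1],       # unique read
#    ['r2', 118, 36, 0.73]]    # multi-read carrying 73% of its weight here
#
# count_pileup_heights() above then adds `score` to every covered position.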
465 | """ 466 | tid_reads=[] 467 | gene, chr, strand, start, end=tid 468 | if strand=='-': 469 | is_reverse=True 470 | else: 471 | is_reverse=False 472 | reads=[x for x in bamfile.fetch(chr, int(start), int(end)) if x.is_reverse==is_reverse or not is_stranded] 473 | reads=[x for x in reads if x.pos>=int(start) and x.pos<=int(end)] 474 | for read in reads: 475 | if is_unique: 476 | try: 477 | opt_NH=read.opt('NH') 478 | if opt_NH > 1: 479 | continue 480 | except: 481 | pass 482 | score=1 483 | else: 484 | try: 485 | opt_AS=read.opt('AS') 486 | if isinstance(opt_AS, float): 487 | score=opt_AS 488 | else: 489 | continue 490 | except: 491 | continue 492 | read_length = read.qlen if read.qlen > 0 else read.positions[-1] - read.positions[0] + 1 493 | if read.pos-start>=0 and read_length<500: # to avoid junction reads 494 | tid_reads.append([read.qname, read.pos-start, read_length, score]) 495 | return tid_reads 496 | 497 | def print_time_stamp(msg): 498 | """ 499 | Reporter function for logging. 500 | """ 501 | current_time='[' + strftime("%Y-%m-%d %H:%M:%S") + '] ' 502 | print >> sys.stderr, current_time + msg 503 | 504 | if __name__=='__main__': 505 | main() -------------------------------------------------------------------------------- /CLAM/bak/deep_getsizeof.py: -------------------------------------------------------------------------------- 1 | from collections import Mapping, Container 2 | from sys import getsizeof 3 | 4 | def deep_getsizeof(o, ids): 5 | """Find the memory footprint of a Python object 6 | 7 | This is a recursive function that drills down a Python object graph 8 | like a dictionary holding nested dictionaries with lists of lists 9 | and tuples and sets. 10 | 11 | The sys.getsizeof function does a shallow size of only. It counts each 12 | object inside a container as pointer only regardless of how big it 13 | really is. 14 | 15 | :param o: the object 16 | :param ids: 17 | :return: 18 | """ 19 | d = deep_getsizeof 20 | if id(o) in ids: 21 | return 0 22 | 23 | r = getsizeof(o) 24 | ids.add(id(o)) 25 | 26 | if isinstance(o, str) or isinstance(0, unicode): 27 | return r 28 | 29 | if isinstance(o, Mapping): 30 | return r + sum(d(k, ids) + d(v, ids) for k, v in o.iteritems()) 31 | 32 | if isinstance(o, Container): 33 | return r + sum(d(x, ids) for x in o) 34 | 35 | return r -------------------------------------------------------------------------------- /CLAM/bak/peakcaller.bak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This peak-caller script is part of the CLAM pipeline. 5 | 6 | It takes input from re-aligner output, and use permutation to call peaks. 
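# The docstring above still mentions permutation, but this module scores
# each bin with a likelihood-ratio test: a constrained model (enrichment
# term delta fixed at 0) is compared with an unconstrained model, and
# 2 * (LL_unconstrained - LL_constrained) is referred to a chi-square
# distribution with one degree of freedom. A minimal numeric sketch of
# that last step, with hypothetical log-likelihood values:
from scipy.stats import chi2

neg_ll_constrained = 105.2     # hypothetical minimized negative log-likelihoods
neg_ll_unconstrained = 98.7
lrt_stat = 2 * (neg_ll_constrained - neg_ll_unconstrained)
pval = 1 - chi2.cdf(lrt_stat, 1)    # same form as in the test_bin_* functions below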
7 | 8 | Tested under python 2.7.3 9 | """ 10 | 11 | __author__ = 'Zijun Zhang' 12 | __version__ = '1.1.0' 13 | __email__ = 'zj.z@ucla.edu' 14 | 15 | 16 | import os 17 | import sys 18 | from collections import defaultdict 19 | from statsmodels.sandbox.stats.multicomp import multipletests 20 | import pysam 21 | import logging 22 | import numpy as np 23 | from collections import defaultdict 24 | import re 25 | from scipy.stats import fisher_exact, poisson, chi2 26 | import scipy.optimize as optimize 27 | from tqdm import tqdm 28 | import datetime 29 | from stats import ztnb_em 30 | 31 | 32 | ### get logger 33 | ### 34 | logger = logging.getLogger('CLAM.Peakcaller') 35 | ### 36 | 37 | def read_gtf(fn): 38 | """read in the gene annotation from GTF file 39 | """ 40 | logger.info('read GTF from "%s" '% fn) 41 | gene_annot = {} 42 | with open(fn, 'r') as f: 43 | for line in f: 44 | if line.startswith('#'): 45 | continue 46 | ele = line.strip().split('\t') 47 | if ele[2] != 'gene': 48 | continue 49 | chr, start, end, strand = ele[0], int(ele[3]), int(ele[4]), ele[6] 50 | try: 51 | gene_id = re.search(r'gene_id "(.+?)"', ele[-1]).group(1) 52 | except AttributeError: 53 | continue 54 | gene_annot[gene_id] = [chr, start, end, strand] 55 | return gene_annot 56 | 57 | 58 | def count_gene_read_tags(bam_list, (chr, start, end, strand), is_unique=True, unstranded=False): 59 | """ count the tagger positions for all reads in a given genomic interval 60 | Args: 61 | Returns: 62 | """ 63 | # placeholder for interval: 'num of replicate' x 'interval length' 64 | interval = np.zeros( (len(bam_list), end-start+1) ) 65 | is_reverse = True if strand=='-' else False 66 | # construct the (tag, score) pairs 67 | for i in range(len(bam_list)): 68 | bam = bam_list[i] 69 | if is_unique: 70 | read_tags = [ (x.opt('RT'), 1.0) for x in bam.fetch(chr, start, end) \ 71 | if unstranded or x.is_reverse==is_reverse] 72 | else: 73 | read_tags = [ (x.opt('RT'), x.opt('AS')) for x in bam.fetch(chr, start, end) \ 74 | if unstranded or x.is_reverse==is_reverse] 75 | 76 | for tag in read_tags: 77 | if tag[0]=end: 78 | continue 79 | interval[i, tag[0]-start] += tag[1] 80 | return interval 81 | 82 | 83 | def bin_interval_counts(interval, winsize=50): 84 | bins = np.zeros( ( interval.shape[0], int(np.ceil(interval.shape[1]/float(winsize))) ) ) 85 | for i in range(bins.shape[1]): 86 | for j in range(interval.shape[0]): 87 | start, end = i*winsize, (i+1)*winsize-1 88 | bins[j, i] = np.sum(interval[j, start:end]) 89 | return bins 90 | 91 | 92 | def test_bin_negbinom(intv_bin_ip, intv_bin_con, correction_method='fdr_bh'): 93 | """DOCSTRING 94 | Args 95 | Returns 96 | """ 97 | def _par_to_vec(par, data, is_constrained): 98 | if is_constrained: 99 | beta = par[0] 100 | mu_vec = par[1::] 101 | delta = 0 102 | else: 103 | beta, delta = par[0], par[1] 104 | mu_vec = par[2::] 105 | ip_counter = data['this_ip'].shape[0] 106 | con_counter = data['this_con'].shape[0] 107 | mu0 = np.asarray(mu_vec[0:con_counter]) 108 | mu1 = np.asarray(mu_vec[con_counter::]) 109 | lamb1_this = np.exp(mu1 + beta + delta) 110 | lamb1_others = np.exp(mu1) 111 | lamb0_this = np.exp(mu0 + beta) 112 | lamb0_others = np.exp(mu0) 113 | return (lamb1_this, lamb1_others, lamb0_this, lamb0_others) 114 | 115 | def _negative_binom_logpmf(y, mu, alpha): 116 | y = np.asarray(y) 117 | ll = np.empty(len(y)) 118 | for i in range(len(y)): 119 | alpha_inv = 1.0/alpha[i] 120 | alpha_mu = float(alpha[i] * mu[i]) 121 | ll[i] = y[i]* np.log(alpha_mu/(1+alpha_mu))- \ 122 | 
alpha_inv*np.log(1+alpha_mu) 123 | return ll 124 | 125 | def _neg_loglik_unconstrain(par, data): 126 | (l1, l2, l3, l4) = _par_to_vec(par, data, False) 127 | ll = np.sum( _negative_binom_logpmf(data['this_ip'], mu=l1, alpha=alpha_ip_vec)) 128 | ll += np.sum( _negative_binom_logpmf(data['others_ip'], mu=l2, alpha=alpha_ip_vec)) 129 | ll += np.sum( _negative_binom_logpmf(data['this_con'], mu=l3, alpha=alpha_con_vec)) 130 | ll += np.sum( _negative_binom_logpmf(data['others_con'], mu=l4, alpha=alpha_con_vec)) 131 | return -ll 132 | 133 | def _neg_loglik_constrain(par, data): 134 | (l1, l2, l3, l4) = _par_to_vec(par, data, True) 135 | ll = np.sum(_negative_binom_logpmf(data['this_ip'], mu=l1, alpha=alpha_ip_vec)) + \ 136 | np.sum(_negative_binom_logpmf(data['others_ip'], mu=l2, alpha=alpha_ip_vec)) + \ 137 | np.sum(_negative_binom_logpmf(data['this_con'], mu=l3, alpha=alpha_con_vec)) + \ 138 | np.sum(_negative_binom_logpmf(data['others_con'], mu=l4, alpha=alpha_con_vec)) 139 | return -ll 140 | 141 | # initialize placeholders 142 | intv_counter = intv_bin_ip.shape[1] 143 | assert intv_counter == intv_bin_con.shape[1] 144 | binscore = np.empty(intv_counter) 145 | binsignal = np.empty(intv_counter) 146 | alpha_ip_vec = np.empty(intv_bin_ip.shape[0]) 147 | alpha_con_vec = np.empty(intv_bin_con.shape[0]) 148 | ip_sum = np.apply_along_axis(np.sum, 1, intv_bin_ip) 149 | con_sum = np.apply_along_axis(np.sum, 1, intv_bin_con) 150 | 151 | 152 | # compute the dispersion parameters 153 | for i in range(intv_bin_con.shape[0]): 154 | height = ztnb_em.collapse_data(intv_bin_con[i,]) 155 | height[0] = 0 156 | ll, mu, alpha = ztnb_em.EM_estim_params(height, max_iter=100, verbose=False) 157 | alpha_con_vec[i] = alpha 158 | 159 | max_alpha = np.max(alpha_con_vec) 160 | for i in range(intv_bin_ip.shape[0]): 161 | height = ztnb_em.collapse_data(intv_bin_ip[i,]) 162 | height[0] = 0 163 | ll, mu, alpha = ztnb_em.EM_estim_params(height, max_iter=100, verbose=False) 164 | alpha = max_alpha if alpha>max_alpha else alpha 165 | alpha_ip_vec[i] = alpha 166 | 167 | 168 | # perform test on each bin 169 | for i in range(intv_counter): 170 | this_ip = intv_bin_ip[:, i] 171 | others_ip = ip_sum - this_ip 172 | this_con = intv_bin_con[:, i] 173 | others_con = con_sum - this_con 174 | if np.sum(this_ip) == 0: 175 | binsignal[i], binscore[i] = np.nan, np.nan 176 | continue 177 | data = { 178 | 'this_ip':np.round(this_ip), 179 | 'others_ip':np.round(others_ip), 180 | 'this_con':np.round(this_con), 181 | 'others_con':np.round(others_con) 182 | } 183 | ## constrained likelihood 184 | res_constrain = optimize.minimize( 185 | x0=np.ones(1+this_ip.shape[0]+others_ip.shape[0]), 186 | fun=_neg_loglik_constrain, 187 | args=(data), 188 | method='bfgs', 189 | options={'disp':False} 190 | ) 191 | ## unconstrained likelihood 192 | res_unconstrain = optimize.minimize( 193 | x0=np.ones(2+this_ip.shape[0]+others_ip.shape[0]), 194 | fun=_neg_loglik_unconstrain, 195 | args=(data), 196 | method='bfgs', 197 | options={'disp':False} 198 | ) 199 | 200 | delta_mle = res_unconstrain.x[1] 201 | pval = 1 - chi2.cdf(2*(res_constrain.fun - res_unconstrain.fun), 1) 202 | binscore[i] = pval 203 | binsignal[i] = delta_mle 204 | 205 | # correcting for multiple-testing 206 | adj = multipletests(binscore[~ np.isnan(binscore)], alpha=0.05, method=correction_method) 207 | binscore_adj = np.asarray(binscore) 208 | binscore_adj[ ~ np.isnan(binscore) ] = adj[1] 209 | return binsignal, binscore_adj 210 | 211 | 212 | def test_bin_poisson(intv_bin_ip, intv_bin_con, 
correction_method='fdr_bh'): 213 | """DOCSTRING 214 | Args 215 | Returns 216 | """ 217 | def _par_to_vec(par, data, is_constrained): 218 | if is_constrained: 219 | beta = par[0] 220 | mu_vec = par[1::] 221 | delta = 0 222 | else: 223 | beta, delta = par[0], par[1] 224 | mu_vec = par[2::] 225 | ip_counter = data['this_ip'].shape[0] 226 | con_counter = data['this_con'].shape[0] 227 | mu0 = np.asarray(mu_vec[0:con_counter]) 228 | mu1 = np.asarray(mu_vec[con_counter::]) 229 | lamb1_this = np.exp(mu1 + beta + delta) 230 | lamb1_others = np.exp(mu1) 231 | lamb0_this = np.exp(mu0 + beta) 232 | lamb0_others = np.exp(mu0) 233 | return (lamb1_this, lamb1_others, lamb0_this, lamb0_others) 234 | 235 | def _neg_loglik_unconstrain(par, data): 236 | (l1, l2, l3, l4) = _par_to_vec(par, data, False) 237 | ll = np.sum(poisson.logpmf(data['this_ip'], mu=l1)) + \ 238 | np.sum(poisson.logpmf(data['others_ip'], mu=l2)) + \ 239 | np.sum(poisson.logpmf(data['this_con'], mu=l3)) + \ 240 | np.sum(poisson.logpmf(data['others_con'], mu=l4)) 241 | return -ll 242 | 243 | def _neg_loglik_constrain(par, data): 244 | (l1, l2, l3, l4) = _par_to_vec(par, data, True) 245 | ll = np.sum(poisson.logpmf(data['this_ip'], mu=l1)) + \ 246 | np.sum(poisson.logpmf(data['others_ip'], mu=l2)) + \ 247 | np.sum(poisson.logpmf(data['this_con'], mu=l3)) + \ 248 | np.sum(poisson.logpmf(data['others_con'], mu=l4)) 249 | return -ll 250 | 251 | intv_counter = intv_bin_ip.shape[1] 252 | assert intv_counter == intv_bin_con.shape[1] 253 | binscore = np.empty(intv_counter) 254 | binsignal = np.empty(intv_counter) 255 | ip_sum = np.apply_along_axis(np.sum, 1, intv_bin_ip) 256 | con_sum = np.apply_along_axis(np.sum, 1, intv_bin_con) 257 | for i in range(intv_counter): 258 | this_ip = intv_bin_ip[:, i] 259 | others_ip = ip_sum - this_ip 260 | this_con = intv_bin_con[:, i] 261 | others_con = con_sum - this_con 262 | if this_ip == 0: 263 | binsignal[i], binscore[i] = np.nan, 1.0 264 | continue 265 | ## because Poisson (and other count-based methods) only 266 | ## takes integers, here we take the floor of the fractional 267 | ## multi-reads as a conservative approach 268 | data = { 269 | 'this_ip':np.floor(this_ip), 270 | 'others_ip':np.floor(others_ip), 271 | 'this_con':np.floor(this_con), 272 | 'others_con':np.floor(others_con) 273 | } 274 | 275 | res_constrain = optimize.minimize( 276 | x0=np.ones(1+this_ip.shape[0]+others_ip.shape[0]), 277 | fun=_neg_loglik_constrain, 278 | args=(data), 279 | method='Nelder-Mead', 280 | options={'disp':False} 281 | ) 282 | 283 | res_unconstrain = optimize.minimize( 284 | x0=np.ones(2+this_ip.shape[0]+others_ip.shape[0]), 285 | fun=_neg_loglik_unconstrain, 286 | args=(data), 287 | method='Nelder-Mead', 288 | options={'disp':False} 289 | ) 290 | 291 | delta_mle = res_unconstrain.x[1] 292 | pval = 1 - chi2.cdf(2*(res_constrain.fun - res_unconstrain.fun), 1) 293 | binscore[i] = pval 294 | binsignal[i] = delta_mle 295 | adj = multipletests(binscore, alpha=0.05, method=correction_method) 296 | binscore_adj = adj[1] 297 | return binsignal, binscore_adj 298 | 299 | 300 | def test_bin_fisher(intv_bin_ip, intv_bin_con, with_control=True, correction_method='fdr_bh'): 301 | """DOCSTRING 302 | Args 303 | Returns 304 | """ 305 | if intv_bin_ip.shape[0] != 1: 306 | raise Exception('Fisher exact test does not deal with replicates.') 307 | intv_counter = intv_bin_ip.shape[1] 308 | assert intv_counter == intv_bin_con.shape[1] 309 | binscore = np.empty(intv_counter) 310 | binsignal = np.empty(intv_counter) 311 | ip_sum = 
np.sum(intv_bin_ip[0,]) 312 | con_sum = np.sum(intv_bin_con[0,]) 313 | for i in range(intv_counter): 314 | this_ip = intv_bin_ip[0, i] 315 | others_ip = ip_sum - this_ip 316 | this_con = intv_bin_con[0, i] 317 | others_con = con_sum - this_con 318 | if this_ip == 0: 319 | binsignal[i], binscore[i] = np.nan, 1.0 320 | continue 321 | _, binscore[i] = fisher_exact([[this_ip, others_ip], [this_con, others_con]], alternative='greater') 322 | if with_control: 323 | binsignal[i] = this_ip/others_ip / this_con*others_con 324 | else: 325 | binsignal[i] = this_ip 326 | 327 | adj = multipletests(binscore, alpha=0.05, method=correction_method) 328 | binscore_adj = adj[1] 329 | return binsignal, binscore_adj 330 | 331 | 332 | def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, winsize=50, unstranded=False): 333 | """DOCSTRING 334 | Args 335 | Returns 336 | """ 337 | # fetch the IP tag counts to gene regions 338 | if unique_only: 339 | interval_ip = \ 340 | count_gene_read_tags(bam_dict['ubam.ip'], gene, is_unique=True, unstranded=unstranded) 341 | else: 342 | interval_ip = \ 343 | count_gene_read_tags(bam_dict['ubam.ip'], gene, is_unique=True, unstranded=unstranded) + \ 344 | count_gene_read_tags(bam_dict['mbam.ip'], gene, is_unique=False, unstranded=unstranded) 345 | 346 | # skip if there are no reads 347 | if np.sum(interval_ip) == 0: 348 | #print "no reads" 349 | return '' 350 | 351 | # fetch/construct the input tag counts 352 | if with_control: 353 | ## count control tags if available 354 | if unique_only: 355 | interval_con = \ 356 | count_gene_read_tags(bam_dict['ubam.con'], gene, is_unique=True, unstranded=unstranded) 357 | else: 358 | interval_con = \ 359 | count_gene_read_tags(bam_dict['ubam.con'], gene, is_unique=True, unstranded=unstranded) + \ 360 | count_gene_read_tags(bam_dict['mbam.con'], gene, is_unique=False, unstranded=unstranded) 361 | else: 362 | ## otherwise, construct a uniform *fake* control 363 | interval_con = \ 364 | np.ones((1, interval_ip.shape[1]))*np.sum(interval_ip)/interval_ip.shape[1] 365 | 366 | # bin tag counts into bins 367 | intv_bin_ip = bin_interval_counts(interval_ip, winsize=winsize) 368 | intv_bin_con = bin_interval_counts(interval_con, winsize=winsize) 369 | 370 | # perform statistical test 371 | signal_val, binscore_adj = test_bin_negbinom(intv_bin_ip, intv_bin_con) 372 | #signal_val, binscore_adj = test_bin_poisson(intv_bin_ip, intv_bin_con) 373 | #signal_val, binscore_adj = test_bin_fisher(bin_interval_ip, bin_interval_con, with_control=with_control) 374 | 375 | # build human-readable outputs 376 | ## "narrowPeak" format from 377 | ## https://genome.ucsc.edu/FAQ/FAQformat.html#format12 378 | ## chr start end name 1000 strand signalValue pVal qVal peak 379 | narrowPeak_formatter = "%s\t%i\t%i\t.\t1000\t%s\t%s\t.\t%.3f\t.\n" 380 | BED = '' 381 | for i in range(len(binscore_adj)): 382 | qval = binscore_adj[i] 383 | signal = signal_val[i] 384 | if qval<0.05: 385 | chr = gene[0] 386 | binstart = gene[1] + i*winsize 387 | binend = gene[1] + (i+1)*winsize-1 388 | strand = gene[3] 389 | BED += narrowPeak_formatter % (chr, binstart, binend, strand, signal, qval) 390 | return BED 391 | 392 | 393 | 394 | def peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=False, with_replicates=False, with_control=False, unstranded=False): 395 | """DOCSTRING 396 | Args: 397 | Returns: 398 | """ 399 | # file handlers 400 | mbam = pysam.Samfile(os.path.join(out_dir, 'realigned.sorted.bam'),'rb') 401 | ubam = pysam.Samfile(os.path.join(tmp_dir, 
'unique.sorted.bam'),'rb') 402 | bam_dict = {'ubam.ip':[ubam], 'mbam.ip':[mbam]} 403 | if unique_only: 404 | ofile = open(os.path.join(out_dir, 'narrow_peaks.unique.bed'), 'w') 405 | else: 406 | ofile = open(os.path.join(out_dir, 'narrow_peaks.combined.bed'), 'w') 407 | 408 | # read in GTF 409 | gene_annot = read_gtf(gtf_fp) 410 | 411 | # iteratively call peaks in each gene 412 | peak_counter = 0 413 | for gene_name in tqdm(gene_annot): 414 | gene = gene_annot[gene_name] 415 | BED = call_gene_peak(bam_dict, gene, 416 | unique_only=unique_only, with_control=with_control, 417 | unstranded=unstranded) 418 | ofile.write(BED) 419 | #print BED 420 | peak_counter += len(BED.split('\n')) 421 | ofile.close() 422 | logger.info('called %i peaks'%peak_counter) 423 | return 424 | 425 | 426 | def chunkify(a, n): 427 | """Separate a list (a) into consecutive n chunks. 428 | Args: 429 | Returns: 430 | the chunkified index 431 | """ 432 | k, m = len(a) / n, len(a) % n 433 | return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in xrange(n)) 434 | 435 | 436 | if __name__ == '__main__': 437 | ### set up logger 438 | logger = logging.getLogger('CLAM') 439 | logger.setLevel(logging.DEBUG) 440 | # create file handler which logs even debug messages 441 | fh = logging.FileHandler( 442 | 'CLAM.Peakcaller.'+'-'.join(str(datetime.datetime.now()).replace(':','-').split()) + '.log') 443 | fh.setLevel(logging.DEBUG) 444 | # create console handler with a higher log level 445 | ch = logging.StreamHandler() 446 | ch.setLevel(logging.DEBUG) 447 | # create formatter and add it to the handlers 448 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -\n %(message)s') 449 | fh.setFormatter(formatter) 450 | ch.setFormatter(formatter) 451 | # add the handlers to the logger 452 | logger.addHandler(fh) 453 | logger.addHandler(ch) 454 | ### 455 | logger.info('start') 456 | logger.info('run info: %s'%(' '.join(sys.argv))) 457 | 458 | tmp_dir, out_dir, unique_only = sys.argv[1], sys.argv[2], sys.argv[3] 459 | unique_only = False if unique_only=='0' else True 460 | gtf_fp = '/u/nobackup/yxing/NOBACKUP/frankwoe/hg19/gencode.v19.annotation.gtf' 461 | peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=unique_only) 462 | logger.info('end') 463 | -------------------------------------------------------------------------------- /CLAM/bak/peakcaller.bak2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This peak-caller script is part of the CLAM pipeline. 5 | 6 | It takes input from re-aligner output, and use permutation to call peaks. 
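# As in the previous version of this script, per-position tag counts are
# summed into fixed-size windows (bin_interval_counts below) before any
# test is run. A minimal numpy sketch of the same binning on one
# hypothetical track; note that the original slices interval[j, start:end]
# with end = (i+1)*winsize-1, so half-open slicing drops the last
# position of every window.
import numpy as np

track = np.arange(10, dtype=float)    # hypothetical per-position counts
winsize = 4
n_bins = int(np.ceil(track.shape[0] / float(winsize)))
bins = np.array([track[i * winsize:(i + 1) * winsize].sum() for i in range(n_bins)])
# bins -> array([ 6., 22., 17.])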
7 | 8 | Tested under python 2.7.3 9 | """ 10 | 11 | __author__ = 'Zijun Zhang' 12 | __version__ = '1.1.0' 13 | __email__ = 'zj.z@ucla.edu' 14 | 15 | 16 | import os 17 | import sys 18 | from collections import defaultdict 19 | from statsmodels.sandbox.stats.multicomp import multipletests 20 | import pysam 21 | import logging 22 | import numpy as np 23 | from collections import defaultdict 24 | import re 25 | from scipy.stats import fisher_exact, poisson, chi2 26 | import scipy.optimize as optimize 27 | from tqdm import tqdm 28 | import datetime 29 | from stats import ztnb_em 30 | 31 | 32 | ### get logger 33 | ### 34 | logger = logging.getLogger('CLAM.Peakcaller') 35 | ### 36 | 37 | def read_gtf(fn): 38 | """read in the gene annotation from GTF file 39 | """ 40 | gene_annot = {} 41 | with open(fn, 'r') as f: 42 | for line in f: 43 | if line.startswith('#'): 44 | continue 45 | ele = line.strip().split('\t') 46 | if ele[2] != 'gene': 47 | continue 48 | chr, start, end, strand = ele[0], int(ele[3]), int(ele[4]), ele[6] 49 | try: 50 | gene_id = re.search(r'gene_id "(.+?)"', ele[-1]).group(1) 51 | except AttributeError: 52 | continue 53 | gene_annot[gene_id] = [chr, start, end, strand] 54 | return gene_annot 55 | 56 | 57 | def count_gene_read_tags(bam_list, (chr, start, end, strand), is_unique=True, unstranded=False): 58 | """ count the tagger positions for all reads in a given genomic interval 59 | Args: 60 | Returns: 61 | """ 62 | # placeholder for interval: 'num of replicate' x 'interval length' 63 | interval = np.zeros( (len(bam_list), end-start+1) ) 64 | is_reverse = True if strand=='-' else False 65 | # construct the (tag, score) pairs 66 | for i in range(len(bam_list)): 67 | bam = bam_list[i] 68 | if is_unique: 69 | read_tags = [ (x.opt('RT'), 1.0) for x in bam.fetch(chr, start, end) \ 70 | if unstranded or x.is_reverse==is_reverse] 71 | else: 72 | read_tags = [ (x.opt('RT'), x.opt('AS')) for x in bam.fetch(chr, start, end) \ 73 | if unstranded or x.is_reverse==is_reverse] 74 | 75 | for tag in read_tags: 76 | if tag[0]=end: 77 | continue 78 | interval[i, tag[0]-start] += tag[1] 79 | return interval 80 | 81 | 82 | def bin_interval_counts(interval, winsize=50): 83 | bins = np.zeros( ( interval.shape[0], int(np.ceil(interval.shape[1]/float(winsize))) ) ) 84 | for i in range(bins.shape[1]): 85 | for j in range(interval.shape[0]): 86 | start, end = i*winsize, (i+1)*winsize-1 87 | bins[j, i] = np.sum(interval[j, start:end]) 88 | return bins 89 | 90 | 91 | def test_bin_poisson(intv_bin_ip, intv_bin_con, correction_method='fdr_bh'): 92 | """DOCSTRING 93 | Args 94 | Returns 95 | """ 96 | def _par_to_vec(par, data, is_constrained): 97 | if is_constrained: 98 | beta = par[0] 99 | mu_vec = par[1::] 100 | delta = 0 101 | else: 102 | beta, delta = par[0], par[1] 103 | mu_vec = par[2::] 104 | ip_counter = data['this_ip'].shape[0] 105 | con_counter = data['this_con'].shape[0] 106 | mu0 = np.asarray(mu_vec[0:con_counter]) 107 | mu1 = np.asarray(mu_vec[con_counter::]) 108 | lamb1_this = np.exp(mu1 + beta + delta) 109 | lamb1_others = np.exp(mu1) 110 | lamb0_this = np.exp(mu0 + beta) 111 | lamb0_others = np.exp(mu0) 112 | return (lamb1_this, lamb1_others, lamb0_this, lamb0_others) 113 | 114 | def _neg_loglik_unconstrain(par, data): 115 | (l1, l2, l3, l4) = _par_to_vec(par, data, False) 116 | ll = np.sum(poisson.logpmf(data['this_ip'], mu=l1)) + \ 117 | np.sum(poisson.logpmf(data['others_ip'], mu=l2)) + \ 118 | np.sum(poisson.logpmf(data['this_con'], mu=l3)) + \ 119 | np.sum(poisson.logpmf(data['others_con'], 
mu=l4)) 120 | return -ll 121 | 122 | def _neg_loglik_constrain(par, data): 123 | (l1, l2, l3, l4) = _par_to_vec(par, data, True) 124 | ll = np.sum(poisson.logpmf(data['this_ip'], mu=l1)) + \ 125 | np.sum(poisson.logpmf(data['others_ip'], mu=l2)) + \ 126 | np.sum(poisson.logpmf(data['this_con'], mu=l3)) + \ 127 | np.sum(poisson.logpmf(data['others_con'], mu=l4)) 128 | return -ll 129 | 130 | intv_counter = intv_bin_ip.shape[1] 131 | assert intv_counter == intv_bin_con.shape[1] 132 | binscore = np.empty(intv_counter) 133 | binsignal = np.empty(intv_counter) 134 | ip_sum = np.apply_along_axis(np.sum, 1, intv_bin_ip) 135 | con_sum = np.apply_along_axis(np.sum, 1, intv_bin_con) 136 | for i in range(intv_counter): 137 | this_ip = intv_bin_ip[:, i] 138 | others_ip = ip_sum - this_ip 139 | this_con = intv_bin_con[:, i] 140 | others_con = con_sum - this_con 141 | if np.sum(this_ip) == 0: 142 | binsignal[i], binscore[i] = np.nan, 1.0 143 | continue 144 | data = { 145 | 'this_ip':np.round(this_ip), 146 | 'others_ip':np.round(others_ip), 147 | 'this_con':np.round(this_con), 148 | 'others_con':np.round(others_con) 149 | } 150 | 151 | res_constrain = optimize.minimize( 152 | x0=np.ones(1+this_ip.shape[0]+others_ip.shape[0]), 153 | fun=_neg_loglik_constrain, 154 | args=(data), 155 | method='bfgs', 156 | options={'disp':False} 157 | ) 158 | 159 | res_unconstrain = optimize.minimize( 160 | x0=np.ones(2+this_ip.shape[0]+others_ip.shape[0]), 161 | fun=_neg_loglik_unconstrain, 162 | args=(data), 163 | method='bfgs', 164 | options={'disp':False} 165 | ) 166 | 167 | delta_mle = res_unconstrain.x[1] 168 | pval = 1 - chi2.cdf(2*(res_constrain.fun - res_unconstrain.fun), 1) 169 | binscore[i] = pval 170 | binsignal[i] = delta_mle 171 | adj = multipletests(binscore, alpha=0.05, method=correction_method) 172 | binscore_adj = adj[1] 173 | return binsignal, binscore_adj 174 | 175 | 176 | 177 | def test_bin_negbinom(intv_bin_ip, intv_bin_con, alpha_ip_vec, alpha_con_vec, correction_method='fdr_bh'): 178 | """DOCSTRING 179 | Args 180 | Returns 181 | """ 182 | def _par_to_vec(par, data, is_constrained): 183 | if is_constrained: 184 | beta = par[0] 185 | mu_vec = par[1::] 186 | delta = 0 187 | else: 188 | beta, delta = par[0], par[1] 189 | mu_vec = par[2::] 190 | ip_counter = data['this_ip'].shape[0] 191 | con_counter = data['this_con'].shape[0] 192 | mu0 = np.asarray(mu_vec[0:con_counter]) 193 | mu1 = np.asarray(mu_vec[con_counter::]) 194 | lamb1_this = np.exp(mu1 + beta + delta) 195 | lamb1_others = np.exp(mu1) 196 | lamb0_this = np.exp(mu0 + beta) 197 | lamb0_others = np.exp(mu0) 198 | return (lamb1_this, lamb1_others, lamb0_this, lamb0_others) 199 | 200 | def _negative_binom_logpmf(y, mu, alpha): 201 | y = np.asarray(y) 202 | ll = np.empty(len(y)) 203 | for i in range(len(y)): 204 | alpha_inv = 1.0/alpha[i] 205 | alpha_mu = float(alpha[i] * mu[i]) 206 | ll[i] = y[i]* np.log(alpha_mu/(1+alpha_mu))- \ 207 | alpha_inv*np.log(1+alpha_mu) 208 | return ll 209 | 210 | def _neg_loglik_unconstrain(par, data): 211 | (l1, l2, l3, l4) = _par_to_vec(par, data, False) 212 | ll = np.sum( _negative_binom_logpmf(data['this_ip'], mu=l1, alpha=alpha_ip_vec)) 213 | ll += np.sum( _negative_binom_logpmf(data['others_ip'], mu=l2, alpha=alpha_ip_vec)) 214 | ll += np.sum( _negative_binom_logpmf(data['this_con'], mu=l3, alpha=alpha_con_vec)) 215 | ll += np.sum( _negative_binom_logpmf(data['others_con'], mu=l4, alpha=alpha_con_vec)) 216 | return -ll 217 | 218 | def _neg_loglik_constrain(par, data): 219 | (l1, l2, l3, l4) = _par_to_vec(par, data, 
True) 220 | ll = np.sum(_negative_binom_logpmf(data['this_ip'], mu=l1, alpha=alpha_ip_vec)) + \ 221 | np.sum(_negative_binom_logpmf(data['others_ip'], mu=l2, alpha=alpha_ip_vec)) + \ 222 | np.sum(_negative_binom_logpmf(data['this_con'], mu=l3, alpha=alpha_con_vec)) + \ 223 | np.sum(_negative_binom_logpmf(data['others_con'], mu=l4, alpha=alpha_con_vec)) 224 | return -ll 225 | 226 | intv_counter = intv_bin_ip.shape[1] 227 | assert intv_counter == intv_bin_con.shape[1] 228 | binscore = np.empty(intv_counter) 229 | binsignal = np.empty(intv_counter) 230 | ip_sum = np.apply_along_axis(np.sum, 1, intv_bin_ip) 231 | con_sum = np.apply_along_axis(np.sum, 1, intv_bin_con) 232 | for i in range(intv_counter): 233 | this_ip = intv_bin_ip[:, i] 234 | others_ip = ip_sum - this_ip 235 | this_con = intv_bin_con[:, i] 236 | others_con = con_sum - this_con 237 | if np.sum(this_ip) == 0: 238 | binsignal[i], binscore[i] = np.nan, 1.0 239 | continue 240 | data = { 241 | 'this_ip':np.round(this_ip), 242 | 'others_ip':np.round(others_ip), 243 | 'this_con':np.round(this_con), 244 | 'others_con':np.round(others_con) 245 | } 246 | 247 | res_constrain = optimize.minimize( 248 | x0=np.ones(1+this_ip.shape[0]+others_ip.shape[0]), 249 | fun=_neg_loglik_constrain, 250 | args=(data), 251 | method='Nelder-Mead', 252 | options={'disp':False} 253 | ) 254 | 255 | res_unconstrain = optimize.minimize( 256 | x0=np.ones(2+this_ip.shape[0]+others_ip.shape[0]), 257 | fun=_neg_loglik_unconstrain, 258 | args=(data), 259 | method='bfgs', 260 | options={'disp':False} 261 | ) 262 | 263 | delta_mle = res_unconstrain.x[1] 264 | pval = 1 - chi2.cdf(2*(res_constrain.fun - res_unconstrain.fun), 1) 265 | binscore[i] = pval 266 | binsignal[i] = delta_mle 267 | adj = multipletests(binscore, alpha=0.05, method=correction_method) 268 | binscore_adj = adj[1] 269 | return binsignal, binscore_adj 270 | 271 | 272 | 273 | def test_bin_fisher(intv_bin_ip, intv_bin_con, with_control=True, correction_method='fdr_bh'): 274 | """DOCSTRING 275 | Args 276 | Returns 277 | """ 278 | if intv_bin_ip.shape[0] != 1: 279 | raise Exception('Fisher exact test does not deal with replicates.') 280 | intv_counter = intv_bin_ip.shape[1] 281 | assert intv_counter == intv_bin_con.shape[1] 282 | binscore = np.empty(intv_counter) 283 | binsignal = np.empty(intv_counter) 284 | ip_sum = np.sum(intv_bin_ip[0,]) 285 | con_sum = np.sum(intv_bin_con[0,]) 286 | for i in range(intv_counter): 287 | this_ip = intv_bin_ip[0, i] 288 | others_ip = ip_sum - this_ip 289 | this_con = intv_bin_con[0, i] 290 | others_con = con_sum - this_con 291 | if this_ip == 0: 292 | binsignal[i], binscore[i] = np.nan, 1.0 293 | continue 294 | _, binscore[i] = fisher_exact([[this_ip, others_ip], [this_con, others_con]], alternative='greater') 295 | if with_control: 296 | binsignal[i] = this_ip/others_ip / this_con*others_con 297 | else: 298 | binsignal[i] = this_ip 299 | 300 | adj = multipletests(binscore, alpha=0.05, method=correction_method) 301 | binscore_adj = adj[1] 302 | return binsignal, binscore_adj 303 | 304 | 305 | def gene_to_count(bam_dict, gene, unique_only=False, with_control=False, winsize=50, unstranded=False): 306 | """DOCSTRING 307 | Args 308 | Returns 309 | """ 310 | # fetch the IP tag counts to gene regions 311 | if unique_only: 312 | interval_ip = \ 313 | count_gene_read_tags(bam_dict['ubam.ip'], gene, is_unique=True, unstranded=unstranded) 314 | else: 315 | interval_ip = \ 316 | count_gene_read_tags(bam_dict['ubam.ip'], gene, is_unique=True, unstranded=unstranded) + \ 317 | 
count_gene_read_tags(bam_dict['mbam.ip'], gene, is_unique=False, unstranded=unstranded) 318 | 319 | # skip if there are no reads 320 | if np.sum(interval_ip) == 0: 321 | #print "no reads" 322 | return None, None 323 | 324 | # fetch/construct the input tag counts 325 | if with_control: 326 | ## count control tags if available 327 | if unique_only: 328 | interval_con = \ 329 | count_gene_read_tags(bam_dict['ubam.con'], gene, is_unique=True, unstranded=unstranded) 330 | else: 331 | interval_con = \ 332 | count_gene_read_tags(bam_dict['ubam.con'], gene, is_unique=True, unstranded=unstranded) + \ 333 | count_gene_read_tags(bam_dict['mbam.con'], gene, is_unique=False, unstranded=unstranded) 334 | else: 335 | ## otherwise, construct a uniform *fake* control 336 | interval_con = \ 337 | np.ones((1, interval_ip.shape[1]))*np.sum(interval_ip)/interval_ip.shape[1] 338 | 339 | # bin tag counts into bins 340 | bin_interval_ip = bin_interval_counts(interval_ip, winsize=winsize) 341 | bin_interval_con = bin_interval_counts(interval_con, winsize=winsize) 342 | 343 | return bin_interval_ip, bin_interval_con 344 | 345 | 346 | def test_gene_bin(gene, bin_interval_ip, bin_interval_con, alpha_ip_vec, alpha_con_vec): 347 | # perform statistical test 348 | signal_val, binscore_adj = test_bin_poisson(bin_interval_ip, bin_interval_con) 349 | #signal_val, binscore_adj = test_bin_fisher(bin_interval_ip, bin_interval_con, with_control=with_control) 350 | 351 | # build human-readable outputs 352 | ## "narrowPeak" format from 353 | ## https://genome.ucsc.edu/FAQ/FAQformat.html#format12 354 | ## chr start end name 1000 strand signalValue pVal qVal peak 355 | narrowPeak_formatter = "%s\t%i\t%i\t.\t1000\t%s\t%s\t.\t%.3f\t.\n" 356 | BED = '' 357 | for i in range(len(binscore_adj)): 358 | qval = binscore_adj[i] 359 | signal = signal_val[i] 360 | if qval<0.05: 361 | chr = gene[0] 362 | binstart = gene[1] + i*winsize 363 | binend = gene[1] + (i+1)*winsize-1 364 | strand = gene[3] 365 | BED += narrowPeak_formatter % (chr, binstart, binend, strand, signal, qval) 366 | return BED 367 | 368 | 369 | def estim_dispersion_param(alpha_len, gene_count, type): 370 | """DOCSTRING 371 | Args 372 | Returns 373 | """ 374 | alpha_vec = np.zeros(alpha_len) 375 | for i in range(len(alpha_ip_vec)): 376 | dist = defaultdict(int) 377 | for gene_name in gene_count: 378 | this_height = ztnb_em.collapse_data(gene_count[gene_name][type][i,]) 379 | for h in this_height: 380 | #if h==0 or h>50: 381 | # continue 382 | dist[int(h)] += this_height[h] 383 | dist[0] = 0 ## truncate all zeros 384 | ll, mu, alpha = EM_estim_params(dist, max_iter=1000, verbose=True) 385 | alpha_vec[i] = alpha 386 | return alpha_vec 387 | 388 | 389 | def peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=False, with_replicates=False, with_control=False, unstranded=False): 390 | """DOCSTRING 391 | Args: 392 | Returns: 393 | """ 394 | # file handlers 395 | mbam = pysam.Samfile(os.path.join(out_dir, 'realigned.sorted.bam'),'rb') 396 | ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'),'rb') 397 | bam_dict = {'ubam.ip':[ubam], 'mbam.ip':[mbam]} 398 | if unique_only: 399 | ofile = open(os.path.join(out_dir, 'narrow_peaks.unique.bed'), 'w') 400 | else: 401 | ofile = open(os.path.join(out_dir, 'narrow_peaks.combined.bed'), 'w') 402 | 403 | # read in GTF 404 | logger.info('read gtf from "%s" '% fn) 405 | gene_annot = read_gtf(gtf_fp) 406 | 407 | # fetch tag counts in each gene 408 | ## gene_name: { 'ip': bin_interval_ip, 'con': bin_interval_con } 409 | logger.info('reading 
gene counts') 410 | gene_count = defaultdict(dict) 411 | for gene_name in tqdm(gene_annot): 412 | gene = gene_annot[gene_name] 413 | bin_interval_ip, bin_interval_con = \ 414 | gene_to_count(bam_dict, gene, 415 | unique_only=unique_only, with_control=with_control, 416 | unstranded=unstranded) 417 | if bin_interval_ip is None: ## if no IP reads in this gene 418 | continue 419 | gene_count[gene_name]['ip'] = bin_interval_ip 420 | gene_count[gene_name]['con'] = bin_interval_con 421 | 422 | # estimate global overdispersion param. for each dataset 423 | logger.info('estimating dispersion parameter') 424 | alpha_ip_vec = estim_dispersion_param(len(bam_dict['ubam.ip']), gene_count, 'ip' ) 425 | if with_control: 426 | alpha_con_vec = estim_dispersion_param(len(bam_dict['ubam.con']), gene_count, 'con' ) 427 | else: 428 | alpha_con_vec = np.asarray(alpha_ip_vec) 429 | 430 | # perform statistical test 431 | logger.info('calling peaks') 432 | for gene_name in gene_to_count: 433 | gene = gene_annot[gene_name] 434 | BED = test_gene_bin(gene, gene_count[gene_name]['ip'], gene_count[gene_name]['con'], 435 | alpha_ip_vec, alpha_con_vec) 436 | ofile.write(BED) 437 | peak_counter += len(BED.split('\n')) 438 | ofile.close() 439 | logger.info('called %i peaks'%peak_counter) 440 | 441 | return 442 | 443 | 444 | def chunkify(a, n): 445 | """Separate a list (a) into consecutive n chunks. 446 | Args: 447 | Returns: 448 | the chunkified index 449 | """ 450 | k, m = len(a) / n, len(a) % n 451 | return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in xrange(n)) 452 | 453 | 454 | if __name__ == '__main__': 455 | ### set up logger 456 | logger = logging.getLogger('CLAM') 457 | logger.setLevel(logging.DEBUG) 458 | # create file handler which logs even debug messages 459 | fh = logging.FileHandler( 460 | 'CLAM.Peakcaller.'+'-'.join(str(datetime.datetime.now()).replace(':','-').split()) + '.log') 461 | fh.setLevel(logging.DEBUG) 462 | # create console handler with a higher log level 463 | ch = logging.StreamHandler() 464 | ch.setLevel(logging.DEBUG) 465 | # create formatter and add it to the handlers 466 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -\n %(message)s') 467 | fh.setFormatter(formatter) 468 | ch.setFormatter(formatter) 469 | # add the handlers to the logger 470 | logger.addHandler(fh) 471 | logger.addHandler(ch) 472 | ### 473 | logger.info('start') 474 | logger.info('run info: %s'%(' '.join(sys.argv))) 475 | 476 | tmp_dir, out_dir, unique_only = sys.argv[1], sys.argv[2], sys.argv[3] 477 | unique_only = False if unique_only=='0' else True 478 | gtf_fp = '/u/nobackup/yxing/NOBACKUP/frankwoe/hg19/gencode.v19.annotation.gtf' 479 | peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=unique_only) 480 | logger.info('end') 481 | -------------------------------------------------------------------------------- /CLAM/bak/realigner.bak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This re-aligner script is part of the CLAM pipeline. 5 | 6 | It takes bam file as input, and outputs a weighed bam file for multi-mapped reads. 
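# The re-assignment idea behind this script, stripped to a toy example:
# a multi-mapped read starts with equal weight at each candidate locus,
# and every EM iteration re-weights it in proportion to the local read
# coverage around each locus, so better-supported loci absorb more of
# the read's weight. All values below are hypothetical.
def toy_em_weights(local_coverage, n_iter=10):
	# local_coverage[k]: coverage near candidate locus k from other reads
	weights = [1.0 / len(local_coverage)] * len(local_coverage)
	for _ in range(n_iter):
		support = [c + w for c, w in zip(local_coverage, weights)]
		total = float(sum(support))
		weights = [s / total for s in support]
	return weights

# toy_em_weights([9.0, 1.0]) -> approximately [0.9, 0.1]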
7 | 8 | Tested under python 2.7.3 9 | """ 10 | 11 | __author__ = 'Zijun Zhang' 12 | __version__ = '1.1.0' 13 | __email__ = 'zj.z@ucla.edu' 14 | 15 | 16 | import os 17 | import sys 18 | import pysam 19 | import numpy as np 20 | from collections import defaultdict 21 | from tqdm import tqdm 22 | import logging 23 | import datetime 24 | 25 | ### get logger 26 | ### 27 | logger = logging.getLogger('CLAM.Realigner') 28 | ### 29 | 30 | 31 | class Bit: 32 | """ Binary Indexed Tree to store values in genomic intervals. 33 | Implementation modified from http://www.geeksforgeeks.org/binary-indexed-tree-or-fenwick-tree-2/ 34 | Args: 35 | Returns: 36 | """ 37 | 38 | def __init__(self, n): 39 | sz = 1 40 | while n >= sz: 41 | sz *= 2 42 | self.size = sz 43 | self.array_size = n 44 | self.data = [0]*sz 45 | 46 | def sum(self, i): 47 | assert i >= 0 48 | if i==0: 49 | return 0 50 | if i > self.array_size: 51 | i = self.array_size 52 | s = 0 53 | while i > 0: 54 | s += self.data[i] 55 | i -= i & -i 56 | return s 57 | 58 | def add(self, i, x): 59 | assert i > 0 60 | while i < self.size: 61 | self.data[i] += x 62 | i += i & -i 63 | 64 | 65 | def construct_BIT_track(subgraph, read_to_locations, ubam, unstranded=False): 66 | """Construct BIT for each genomic region / node. 67 | Args: 68 | Returns: 69 | Returns a node-track dictionary and a dictionary for multi-mapped reads. 70 | """ 71 | node_track = {} 72 | total_len = 0 73 | 74 | # initialized BIT tracks, add mreads to the tracks, 75 | # and keep a dict of read scores 76 | multi_reads_weights = defaultdict(dict) 77 | obs_reads = read_to_locations.keys() 78 | for read_x_qname in obs_reads: 79 | read_x_nodes = read_to_locations[read_x_qname] 80 | read_x_score = 1.0 / len(read_x_nodes) 81 | for node in read_x_nodes: 82 | chr, strand, start, end = node.split(':') 83 | start, end = int(start), int(end) 84 | if not node in node_track: 85 | this_len = end - start + 1 86 | node_track[node] = Bit(this_len) 87 | read_x_tag = read_x_nodes[node].opt('RT') 88 | node_locus = read_x_tag - start + 1 89 | node_track[node].add(node_locus, read_x_score) 90 | multi_reads_weights[read_x_qname][node]=[read_x_score, node_locus] 91 | #del read_to_locations[read_x_qname][node] 92 | #del read_to_locations[read_x_qname] 93 | 94 | # now add ureads by fetching from ubam 95 | for node in node_track: 96 | chr, strand, start, end = node.split(':') 97 | start, end = int(start), int(end) 98 | is_reverse = True if strand=='-' else False 99 | uread_tagger = [x.opt('RT') for x in ubam.fetch(chr, start, end) \ 100 | if unstranded or x.is_reverse==is_reverse] 101 | for uread_x_tagger in uread_tagger: 102 | if uread_x_tagger>=start and uread_x_tagger<=end: 103 | node_locus = uread_x_tagger - start + 1 104 | node_track[node].add(node_locus, 1) 105 | 106 | return node_track, multi_reads_weights 107 | 108 | 109 | 110 | def run_EM(node_track, multi_reads_weights, w=50, epsilon=1e-6, max_iter=100, verbose=True): 111 | """ EM implementation for re-assigning multi-mapped reads, given the 112 | compatibility matrix of a subgraph. 
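	As a rough worked example of one iteration: if a read has two candidate loci whose windowed coverages (the quantities computed in the M-step below) are 3 and 1, the E-step re-normalizes its weights to 0.75 and 0.25 and adds the change back onto each locus track before the next iteration.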
113 | Args: 114 | Returns: 115 | """ 116 | iter=1 117 | residue=1 118 | while iter < max_iter and residue > epsilon: 119 | residue = 0 120 | reweight=defaultdict(dict) 121 | ## calculate re-distribute probability; M-step 122 | for read in multi_reads_weights: 123 | for nd in multi_reads_weights[read]: 124 | track_len=node_track[nd].array_size 125 | old_score, read_tag = multi_reads_weights[read][nd] 126 | reweight[read][nd] = max( 0, node_track[nd].sum(min(track_len, read_tag + w)) - node_track[nd].sum(max(0,read_tag - w)) ) 127 | ## update track by expectation; E-step 128 | for read in reweight: 129 | dn=sum([reweight[read][x] for x in reweight[read]]) 130 | if dn==0: 131 | logger.debug('Error: no read weight found @ %s.'%read ) 132 | dn=1 133 | for nd in reweight[read]: 134 | old_score, read_tag = multi_reads_weights[read][nd] 135 | new_score = reweight[read][nd] / float(dn) 136 | node_track[nd].add(read_tag, new_score - old_score) 137 | residue += (old_score - new_score)**2 138 | multi_reads_weights[read][nd][0] = new_score 139 | if verbose and (not iter % 10 or iter == max_iter): 140 | logger.debug('Iter %d, residue = %f' % (iter, residue)) 141 | iter += 1 142 | return multi_reads_weights 143 | 144 | 145 | def build_read_cluster(alignment, chr_list, mbam, unstranded=False, winsize=50): 146 | """DOCSTRING 147 | Args: 148 | Returns: 149 | """ 150 | chrom = chr_list[alignment.reference_id] 151 | site = alignment.opt('RT') 152 | is_reverse = alignment.is_reverse 153 | this_mread_dict = {} 154 | this_mread_dict_set = defaultdict(set) 155 | discarded_mread_alignments = [] 156 | ## note to me: need to be more careful with 157 | ## one read mapped to *multiple-locations* within one cluster 158 | ## currently tossing away those alignments.. (in `discarded_mread_alignments`) 159 | 160 | # find the right boundary 161 | current = site 162 | while True: 163 | mread_list = [x for x in mbam.fetch(chrom, current, current+winsize) \ 164 | if unstranded or x.is_reverse==is_reverse] 165 | for x in mread_list: 166 | this_mread_dict_set[x.qname].add(x) 167 | mread_tagger = [x.opt('RT') for x in mread_list] 168 | if len(mread_tagger)==0: 169 | break 170 | #return (None,None,discarded_mread_alignments) 171 | end = max(mread_tagger) + winsize 172 | if max(mread_tagger)!=current: 173 | ## has to restrict step-size to smaller than winsize; 174 | ## in order to avoid missing of very short reads 175 | current = max(mread_tagger) if max(mread_tagger)current-winsize else current-winsize 193 | else: 194 | break 195 | 196 | strand = '+' if unstranded or is_reverse==False else '-' 197 | for read_x_qname in this_mread_dict_set: 198 | if len(this_mread_dict_set[read_x_qname])>1: 199 | discarded_mread_alignments.extend( [ x for x in list(this_mread_dict_set[read_x_qname]) ]) 200 | else: 201 | this_mread_dict[read_x_qname] = list(this_mread_dict_set[read_x_qname])[0] 202 | 203 | genomic_cluster = (chrom, strand, start, end) 204 | 205 | return genomic_cluster, this_mread_dict, discarded_mread_alignments 206 | 207 | 208 | def construct_subgraph(mbam, read_qname, mread_dict, processed_mreads, chr_list, winsize=50, unstranded=False): 209 | """DOCSTRING 210 | Args: 211 | Returns: 212 | """ 213 | # record of processed alignments only need kept on within-subgraph level 214 | processed_mread_alignments = set() 215 | counter = 0 216 | # a list of `pysam.AlignedSegment` objects 217 | # note that all taggers are already stored in `pysam.AlignedSegment.opt('RT')` 218 | read_aln_list = [x for x in mread_dict[read_qname]] 219 | 
processed_mreads.add(read_qname) 220 | read_to_locations = defaultdict(dict) # read_qname -> {node_name1:segment1, node_name2:segment2} 221 | 222 | # enumerate all connected components 223 | while True: 224 | counter+=1; print "%i: %i"%(counter, len(read_aln_list)) 225 | next_read_aln_list = [] 226 | 227 | gen = read_aln_list if len(read_aln_list)<200 else tqdm(read_aln_list) 228 | for alignment in gen: 229 | ## build a node for this mread alignment 230 | ## (if not already processed, i.e. built before) 231 | if alignment in processed_mread_alignments: 232 | continue 233 | 234 | genomic_cluster, this_mread_dict, discarded_mread_list = \ 235 | build_read_cluster(alignment, chr_list, mbam, unstranded=unstranded, winsize=winsize) 236 | _ = map(processed_mread_alignments.add, discarded_mread_list) 237 | if genomic_cluster is None: # this cluster is invald (only double-mappers) 238 | continue 239 | 240 | ## update loc2read, read2loc 241 | node_name = ':'.join([str(x) for x in genomic_cluster]) 242 | #if node_name in subgraph: 243 | # logger.debug("I revisited '%s' at read '%s'."%(node_name, read_qname)) 244 | # break 245 | #subgraph.add(node_name) 246 | for x_qname in this_mread_dict: 247 | read_to_locations[x_qname].update({node_name : this_mread_dict[x_qname]}) 248 | 249 | ## then add new alignments(edges) to generate connected nodes 250 | ## in the next iteration 251 | _ = map(processed_mread_alignments.add, this_mread_dict.values()) 252 | for read_x_qname in this_mread_dict: 253 | if read_x_qname in processed_mreads: 254 | continue 255 | x_aln_list = [aln for aln in mread_dict[read_x_qname] if not aln in processed_mread_alignments] 256 | next_read_aln_list.extend(x_aln_list) 257 | 258 | ## .. and record to processed reads since we have generated 259 | ## the nodes for them 260 | _ = map(processed_mreads.add, this_mread_dict.keys()) 261 | 262 | # if no more connected nodes can be found, break loop 263 | if len(next_read_aln_list)==0: 264 | break 265 | read_aln_list = next_read_aln_list 266 | return read_to_locations, processed_mreads 267 | 268 | 269 | def realigner(out_dir, tmp_dir, winsize=50, unstranded=False): 270 | """DOCSTRING 271 | Args: 272 | Returns: 273 | """ 274 | # file handlers 275 | mbam = pysam.Samfile(os.path.join(tmp_dir, 'multi.sorted.bam'),'rb') 276 | ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'),'rb') 277 | obam = pysam.Samfile(os.path.join(out_dir, 'realigned.bam'), 'wb', template = mbam) 278 | chr_list=[x['SN'] for x in ubam.header['SQ']] 279 | 280 | # construct the mread_dict; this will be needed throughout 281 | mread_dict = defaultdict(list) 282 | for alignment in mbam: 283 | mread_dict[alignment.qname].append(alignment) 284 | 285 | # keep a record of processed reads 286 | processed_mreads = set() 287 | 288 | # iterate through all mreads 289 | for read_qname in mread_dict: 290 | if read_qname in processed_mreads: 291 | continue 292 | 293 | ## construct the fully-connected subgraph for each read 294 | read_to_locations, processed_mreads = \ 295 | construct_subgraph(mbam, read_qname, mread_dict, processed_mreads, chr_list, winsize=winsize, unstranded=unstranded) 296 | subgraph = set() 297 | for read in read_to_locations: 298 | _ = map(subgraph.add, read_to_locations[read].keys()) 299 | subgraph = list(subgraph) 300 | 301 | ## build the BIT tracks 302 | node_track, multi_reads_weights = \ 303 | construct_BIT_track(subgraph, read_to_locations, ubam, unstranded) 304 | 305 | ## run EM 306 | multi_reads_weights = \ 307 | run_EM(node_track, 
multi_reads_weights, w=winsize) 308 | 309 | ## write to obam 310 | for read in multi_reads_weights: 311 | for node in multi_reads_weights[read]: 312 | alignment = read_to_locations[read][node] 313 | score = round(multi_reads_weights[read][node][0], 3) 314 | alignment.set_tag('AS', score) 315 | alignment.set_tag('PG', 'CLAM') 316 | obam.write(alignment) 317 | # sort the final output 318 | logger.info('sorting output') 319 | obam.close() 320 | ubam.close() 321 | mbam.close() 322 | obam_sorted_fn = os.path.join(out_dir, 'realigned.sorted.bam') 323 | pysam.sort('-o', obam_sorted_fn, os.path.join(out_dir, 'realigned.bam')) 324 | pysam.index(obam_sorted_fn) 325 | os.remove(os.path.join(out_dir, 'realigned.bam')) 326 | return 327 | 328 | def read_tagger(alignment, method='median'): 329 | """ tag a read alignment to a genomic locus 330 | Args: 331 | Returns: 332 | """ 333 | tagger_func = { 334 | 'median': lambda x: int(np.median(x.positions))+1, 335 | 'start': lambda x: x.positions[-1] if x.is_reverse else x.positions[0]+1 336 | } 337 | try: 338 | tag=tagger_func[method](alignment) 339 | except: 340 | tag=-1 341 | return tag 342 | 343 | 344 | def filter_bam_multihits(filename, max_hits, tmp_dir, read_tagger, omit_detail=True): 345 | """Pre-processing function for cleaning up the input bam file. 346 | Args: 347 | Returns: 348 | """ 349 | logger.info('Filtering input bam..') 350 | 351 | in_bam = pysam.Samfile(filename,'rb') 352 | # unique read bam 353 | ubam_fn = os.path.join(tmp_dir, 'unique.bam') 354 | sorted_ubam_fn = os.path.join(tmp_dir, 'unique.sorted.bam') 355 | ubam=pysam.Samfile(ubam_fn, 'wb', template=in_bam) 356 | unique_counter = 0 357 | 358 | # multi-read bam 359 | mbam_fn = os.path.join(tmp_dir, 'multi.bam') 360 | sorted_mbam_fn = os.path.join(tmp_dir, 'multi.sorted.bam') 361 | mbam=pysam.Samfile(mbam_fn, 'wb', template=in_bam) 362 | mread_set = set() 363 | 364 | # splitting unique and multi- reads 365 | # and add the read taggers we need 366 | for read in tqdm(in_bam): 367 | read_tag = read_tagger(read) 368 | ## skip reads with unassigned tagger 369 | if read_tag==-1: 370 | continue 371 | read.tags += [('RT', read_tag)] ## add the tag 372 | ## omit the details in read sequence and quality 373 | ## recommended for larger bam because this 374 | ## can save some memory/storage for large bams 375 | if omit_detail: 376 | read.query_sequence = '*' 377 | read.query_qualities = [0] 378 | if read.is_secondary or (read.has_tag('NH') and read.opt("NH")>1): 379 | try: 380 | if read.opt("NH") < max_hits: 381 | mbam.write(read) 382 | mread_set.add(read.qname) 383 | except KeyError: 384 | #print read 385 | raise Exception('%s: missing NH tag when is_secondary=%s'%(read.qname,read.is_secondary)) 386 | else: 387 | ubam.write(read) 388 | unique_counter += 1 389 | 390 | in_bam.close() 391 | ubam.close() 392 | mbam.close() 393 | 394 | # sorting 395 | pysam.sort('-o', sorted_ubam_fn, ubam_fn) 396 | os.remove(ubam_fn) 397 | pysam.sort('-o', sorted_mbam_fn, mbam_fn) 398 | os.remove(mbam_fn) 399 | pysam.index(sorted_ubam_fn) 400 | pysam.index(sorted_mbam_fn) 401 | 402 | # log the statistics 403 | multi_counter = len(mread_set) 404 | logger.info( 405 | 'Unique reads = %s; ' % unique_counter + \ 406 | 'Multi reads = %s (%.2f %%)' % \ 407 | ( multi_counter, float(multi_counter)/(multi_counter+unique_counter)*100 ) 408 | ) 409 | return 410 | 411 | 412 | if __name__=='__main__': 413 | ### set up logger 414 | logger = logging.getLogger('CLAM') 415 | logger.setLevel(logging.DEBUG) 416 | # create file handler which 
logs even debug messages 417 | fh = logging.FileHandler( 418 | 'CLAM.Realigner.'+'-'.join(str(datetime.datetime.now()).replace(':','-').split()) + '.log') 419 | fh.setLevel(logging.INFO) 420 | # create console handler with a higher log level 421 | ch = logging.StreamHandler() 422 | ch.setLevel(logging.DEBUG) 423 | # create formatter and add it to the handlers 424 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -\n %(message)s') 425 | fh.setFormatter(formatter) 426 | ch.setFormatter(formatter) 427 | # add the handlers to the logger 428 | logger.addHandler(fh) 429 | logger.addHandler(ch) 430 | logger.info('start') 431 | 432 | logger.info('run info: %s'%(' '.join(sys.argv))) 433 | bam, tmp_dir, out_dir = sys.argv[1:4] 434 | retag = False 435 | if len(sys.argv)>4: 436 | tagger_method = sys.argv[4] 437 | retag = True 438 | logger.info('Retag with "%s"'%tagger_method) 439 | else: 440 | tagger_method = 'median' 441 | 442 | if retag or not ( 443 | os.path.isfile(os.path.join(tmp_dir,'unique.sorted.bam')) and \ 444 | os.path.isfile(os.path.join(tmp_dir,'multi.sorted.bam')) \ 445 | ) : 446 | filter_bam_multihits(bam, max_hits=100, tmp_dir=tmp_dir, read_tagger=lambda x: read_tagger(x, tagger_method)) 447 | realigner(out_dir, tmp_dir, winsize=50, unstranded=False) 448 | logger.info('end') -------------------------------------------------------------------------------- /CLAM/bak/sim_callpeak.r: -------------------------------------------------------------------------------- 1 | ## simulate read counts in bins and 2 | ## perform LRT test as peak calling 3 | ## *-- prototyping --* 4 | ## Zijun Zhang 5 | ## 9.1.2017 6 | 7 | 8 | sim_bin_counts = function(mu_vec, beta, delta) 9 | { 10 | others = c( 11 | rpois(n=1, lambda=exp(mu_vec[1])), 12 | rpois(n=1, lambda=exp(mu_vec[2])) 13 | ) 14 | this = c( 15 | rpois(n=1, lambda=exp(mu_vec[1]+beta+delta)), 16 | rpois(n=1, lambda=exp(mu_vec[2]+beta)) 17 | ) 18 | res = matrix(c(this,others), nrow=2, byrow=T) 19 | rownames(res) = c('this','others') 20 | colnames(res) = c('IP','Input') 21 | return(as.data.frame(res)) 22 | } 23 | 24 | 25 | loglik_constrain = function(par, data) 26 | { 27 | ll = 0 28 | mu1=par[1]; mu0=par[2]; beta = par[3] 29 | lamb1.this = exp(mu1 + beta) 30 | lamb1.others = exp(mu1) 31 | lamb0.this = exp(mu0+beta) 32 | lamb0.others = exp(mu0) 33 | ll = ll + dpois(data['this','IP'], lamb1.this, log=T) 34 | ll = ll + dpois(data['this','Input'], lamb0.this, log=T) 35 | ll = ll + dpois(data['others','IP'], lamb1.others, log=T) 36 | ll = ll + dpois(data['others','Input'], lamb0.others, log=T) 37 | return(ll) 38 | } 39 | 40 | loglik_unconstrain = function(par, data) 41 | { 42 | ll = 0 43 | mu1=par[1]; mu0=par[2]; beta = par[3]; delta=par[4] 44 | lamb1.this = exp(mu1 + beta + delta) 45 | lamb1.others = exp(mu1) 46 | lamb0.this = exp(mu0+beta) 47 | lamb0.others = exp(mu0) 48 | ll = ll + dpois(data['this','IP'], lamb1.this, log=T) 49 | ll = ll + dpois(data['this','Input'], lamb0.this, log=T) 50 | ll = ll + dpois(data['others','IP'], lamb1.others, log=T) 51 | ll = ll + dpois(data['others','Input'], lamb0.others, log=T) 52 | return(ll) 53 | } 54 | 55 | 56 | callpeak_LRT = function(data) 57 | { 58 | ll0 = optim(rep(1,3), loglik_constrain, control=list(fnscale=-1), data=data) 59 | ll1 = optim(rep(1,4), loglik_unconstrain, control=list(fnscale=-1), data=data) 60 | pval = 1-pchisq(2*(ll1$value-ll0$value),1) 61 | pval 62 | } 63 | 64 | 65 | 66 | B=200 67 | res = matrix(NA, nrow=B, ncol=2) 68 | colnames(res) = c('fisher', 'lrt') 69 | for(b in 1:B) 
{ 70 | data = sim_bin_counts(c(2.5,2), -0.5, 1) 71 | p1=fisher.test(data)$p.value 72 | p2=callpeak_LRT(data) 73 | res[b,] = c(p1, p2) 74 | } 75 | 76 | plot(res[,'fisher'], res[,'lrt']) 77 | abline(0,1) 78 | mean(res[,'fisher']<0.05) 79 | mean(res[,'lrt']<0.05) 80 | -------------------------------------------------------------------------------- /CLAM/bak/utils.py: -------------------------------------------------------------------------------- 1 | 2 | class CLAM_mread(object): 3 | """ 4 | object to store a read alignment 5 | """ 6 | def __init__(self, alignment, read_tagger_func, tag_of_interest = ['NH']): 7 | self.reference_id = alignment.reference_id 8 | self.is_reverse = alignment.is_reverse 9 | self.cigarstring = alignment.cigarstring 10 | self.pos = alignment.pos 11 | self.qname = alignment.qname 12 | self.tag = read_tagger_func(alignment) 13 | self.flag = alignment.flag 14 | self.alignment_tags = [x for x in alignment.tags if x[0] in tag_of_interest] 15 | 16 | def __eq__(self, other): 17 | this = (self.reference_id, self.is_reverse, self.pos, self.qname) 18 | that = (other.reference_id, other.is_reverse, other.pos, other.qname) 19 | return this == that 20 | 21 | def __hash__(self): 22 | return hash((self.reference_id, self.is_reverse, self.pos, self.qname)) 23 | 24 | def __str__(self): 25 | s = "\t".join([ 26 | self.qname, 27 | str(self.flag), 28 | str(self.reference_id), 29 | str(self.pos), 30 | self.cigarstring]) 31 | return s -------------------------------------------------------------------------------- /CLAM/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | """ General Version and other info 4 | """ 5 | 6 | __version__ = '1.2.3' 7 | __author__ = 'Zijun Zhang' 8 | __email__ = 'zj.z@ucla.edu' -------------------------------------------------------------------------------- /CLAM/download_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | 5 | 6 | def parser(args): 7 | """DOCSTRING 8 | Args 9 | Returns 10 | """ 11 | try: 12 | genome = args.genome 13 | download_genome(genome) 14 | except KeyboardInterrupt(): 15 | sys.exit(0) 16 | 17 | 18 | def download_genome(genome): 19 | curr_dir = os.path.abspath('.') 20 | 21 | admin = (os.getuid() == 0) 22 | cmd = [] 23 | home = os.environ['HOME'] 24 | if admin: 25 | profile = '/etc/profile' 26 | else: 27 | profile = '{home}/.bashrc'.format(home=home) 28 | 29 | if not os.path.isdir('{home}/.clam_data'.format(home=home)): 30 | os.mkdir('{home}/.clam_data'.format(home=home)) 31 | os.chdir('{home}/.clam_data'.format(home=home)) 32 | 33 | if 'CLAM_DAT' not in os.environ or not os.environ['CLAM_DAT'] == '{home}/.clam_data'.format(home=home): 34 | cmd.append('echo "export CLAM_DAT=\'{clam_data}\'" >> {profile}'.format( 35 | clam_data=os.path.abspath('.'), profile=profile)) 36 | cmd.append('source {profile}'.format(profile=profile)) 37 | os.environ['CLAM_DAT'] = os.path.abspath('.') 38 | 39 | if not check_genome_data(genome): 40 | cmd.append('chmod -R 755 {home}/.clam_data'.format(home=home)) 41 | cmd.append( 42 | 'wget https://raw.githubusercontent.com/wkdeng/clam_data/master/{genome}.zip'.format(genome=genome)) 43 | cmd.append('unzip -o {genome}.zip'.format(genome=genome)) 44 | cmd.append('rm {genome}.zip'.format(genome=genome)) 45 | for item in cmd: 46 | subprocess.call(item, shell=True, executable='/bin/bash') 47 | print('Download finished') 48 | os.chdir(curr_dir) 49 | 50 | def 
check_genome_data(genome): 51 | if not os.path.isdir(os.environ['CLAM_DAT'] + '/' + genome): 52 | return False 53 | if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/3UTRs.bed'): 54 | return False 55 | if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/5UTRs.bed'): 56 | return False 57 | if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/cds.bed'): 58 | return False 59 | if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/exons.bed'): 60 | return False 61 | if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/introns.bed'): 62 | return False 63 | if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/proximal200_intron.bed'): 64 | return False 65 | if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/proximal500_intron.bed'): 66 | return False 67 | return True 68 | 69 | if __name__ == '__main__': 70 | download_genome('hg38') 71 | -------------------------------------------------------------------------------- /CLAM/peak_annotator.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import pybedtools 4 | import argparse as ap 5 | import logging 6 | from . import download_data, config 7 | 8 | ''' 9 | Assign peaks to genomic regions 10 | Zijun Zhang 11 | 8.1.2018 12 | 10.25.2018: wrapped to a function with document 13 | 14 | DWK 15 | modified to output annotation file 16 | 6.12.2019 17 | ''' 18 | 19 | # pylint: disable-msg=too-many-function-args 20 | # pylint: disable-msg=unexpected-keyword-arg 21 | 22 | 23 | def parser(args): 24 | """DOCSTRING 25 | Args 26 | Returns 27 | """ 28 | try: 29 | peak_in = args.peak_in 30 | genome = args.genome 31 | out_file = args.out_file 32 | if 'CLAM_DAT' not in os.environ or not download_data.check_genome_data(genome): 33 | print("Unable to locate CLAM data folder for genomic regions, will try to download.") 34 | print("Downloading...") 35 | download_data.download_genome(genome) 36 | genome_data = os.environ['CLAM_DAT'] 37 | intersect_gtf_regions( 38 | peak_in, out_file, os.path.join(genome_data, genome)) 39 | except KeyboardInterrupt(): 40 | sys.exit(0) 41 | 42 | 43 | def intersect_gtf_regions(peak_fp, outfn, gtf_dir): 44 | '''function: intersect_gtf_regions(peak_fp, outfn, gtf_dir) 45 | Intersect a peak BED file with a list of genomic region annotations (e.g. start/stop codon, UTR, intron), 46 | output the peak-region annotations. 
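	A minimal usage sketch (file names and genome are illustrative, mirroring the `__main__` block at the bottom of this module):

	```
	intersect_gtf_regions('narrow_peak.unique.bed', 'annotated_peaks.txt',
	                      os.path.join(os.environ['CLAM_DAT'], 'hg38'))
	```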
47 | :param peak_fp: filepath to a BED-format peakquit 48 | :param outfn: filepath to output count file, has to end with ".txt"; annotation will be "NNN.annot.txt" 49 | 50 | ''' 51 | # input arguments 52 | 53 | # make pybedtools objects 54 | print("Loading peaks...") 55 | peaks = pybedtools.BedTool(peak_fp) 56 | print("Peak file loaded.") 57 | print("Loading genome annotation...") 58 | ref_dict = { 59 | 'exon': pybedtools.BedTool(os.path.join(gtf_dir, 'exons.bed')), 60 | '3UTR': pybedtools.BedTool(os.path.join(gtf_dir, '3UTRs.bed')), 61 | '5UTR': pybedtools.BedTool(os.path.join(gtf_dir, '5UTRs.bed')), 62 | 'cds': pybedtools.BedTool(os.path.join(gtf_dir, 'cds.bed')), 63 | 'intron': pybedtools.BedTool(os.path.join(gtf_dir, 'introns.bed')), 64 | 'proximal200': pybedtools.BedTool(os.path.join(gtf_dir, 'proximal200_intron.bed')), 65 | 'proximal500': pybedtools.BedTool(os.path.join(gtf_dir, 'proximal500_intron.bed')) 66 | } 67 | print("Genome annotation loaded.") 68 | 69 | # # process reference for use 70 | target = { 71 | "3UTR": ref_dict['3UTR'], 72 | "5UTR": ref_dict['5UTR'], 73 | "CDS": ref_dict['cds'], 74 | "other_exon": ref_dict['exon']-ref_dict['3UTR']-ref_dict['5UTR']-ref_dict['cds'], 75 | "px200_intron": ref_dict['proximal200'], 76 | "px500_intron": ref_dict['proximal500'].subtract(ref_dict['proximal200']), 77 | "distal_intron": ref_dict['intron'].subtract(ref_dict['exon']).subtract(ref_dict['proximal500']) 78 | } 79 | category_list = ['3UTR', '5UTR', 'CDS', 80 | 'other_exon', "px200_intron", "px500_intron", "distal_intron"] 81 | init = True 82 | 83 | print("Intersecting peaks with genome annotation...") 84 | for cat in category_list: 85 | bed_arr = [] 86 | for interval in target[cat]: 87 | bed_arr.append('\t'.join([str(x) for x in interval.fields])) 88 | bed_arr[-1] = bed_arr[-1] + '\t' + cat 89 | bed_arr = list(dict.fromkeys(bed_arr)) 90 | for i in range(len(bed_arr)): 91 | bed_arr[i] = bed_arr[i].split('\t') 92 | target[cat] = pybedtools.BedTool(bed_arr) 93 | 94 | if init: 95 | init = False 96 | result_bed = peaks.intersect(target[cat], wa=True, wb=True) 97 | else: 98 | result_bed = result_bed.cat(peaks.intersect( 99 | target[cat], wa=True, wb=True), postmerge=False) 100 | result_bed = result_bed.sort() 101 | 102 | print("Preparing output...") 103 | result_bed.saveas(outfn + '_') 104 | prepend = ['## Annotation peaks to genomic regions, all intersected genomic regions are presented.', 105 | '## CLAM version: %s'%config.__version__, 106 | '## Column 1: Peak chromosome', 107 | '## Column 2: Peak start', 108 | '## Column 3: Peak end', 109 | '## Column 4: Peak name', 110 | '## Column 5: Peak score', 111 | '## Column 6: Peak strand', 112 | '## Column 7: Peak signal value', 113 | '## Column 8: Peak pValue', 114 | '## Column 9: Peak qValue', 115 | '## Column 10: Point-source called for this peak', 116 | '## Column 11: Genomic region chromosome', 117 | '## Column 12: Genomic region start', 118 | '## Column 13: Genomic region end', 119 | '## Column 14: Gene ID', 120 | '## Column 15: Quality score', 121 | '## Column 16: Genomic region strand', 122 | '## Column 17: Genomic region type'] 123 | if os.path.exists(outfn): 124 | os.remove(outfn) 125 | for line in prepend: 126 | cmd = 'echo "{prepend}" >> {outfn}'.format( 127 | prepend=line, outfn=outfn) 128 | os.system(cmd) 129 | os.system('cat {outtmp} >> {outfn}'.format( 130 | outtmp=outfn + '_', outfn=outfn)) 131 | os.remove(outfn+'_') 132 | print("DONE") 133 | 134 | 135 | if __name__ == '__main__': 136 | # peak_fp, genome, outfn = sys.argv[1], 
sys.argv[2], sys.argv[3] 137 | os.chdir('/mnt/h/yi_lab/m6a/src/scripts/peakComposition') 138 | peak_in, genome, out_file = 'narrow_peak.unique.bed', 'mm10', 'annotate_peak.bed' 139 | if 'CLAM_DAT' not in os.environ or not download_data.check_genome_data(genome): 140 | print("Unable to find CLAM data folder for genomic regions, please try to download it using download_genome command.") 141 | print("Downloading...") 142 | download_data.download_genome(genome) 143 | genome_data = os.environ['CLAM_DAT'] 144 | intersect_gtf_regions( 145 | peak_in, out_file, os.path.join(genome_data, genome)) 146 | -------------------------------------------------------------------------------- /CLAM/permutation_peakcaller.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """This peak-caller script is part of the CLAM pipeline. 4 | 5 | This subcommand will call peaks using permutation by randomly placing reads along the gene. 6 | More details about the permutation procedure is described in our NAR paper. 7 | 8 | Example run: 9 | ``` 10 | CLAM permutation_callpeak -i path/to/outdir/unique.sorted.bam path/to/outdir/realigned.sorted.bam \ 11 | -o path/to/peaks/outdir -p 8 \ 12 | --gtf path/to/gencode.v19.annotation.gtf 13 | ``` 14 | Author: 15 | Zijun Zhang 16 | Wankun Deng 17 | Tested under python 3.7.6 18 | """ 19 | 20 | from . import config 21 | __version__ = config.__version__ 22 | 23 | 24 | import os 25 | import sys 26 | from collections import defaultdict 27 | from statsmodels.sandbox.stats.multicomp import multipletests 28 | import logging 29 | import bisect 30 | import random 31 | import pysam 32 | import re 33 | from multiprocessing import Pool 34 | 35 | 36 | ###setup logger 37 | logger = logging.getLogger('CLAM.permutation_peakcaller') 38 | ### 39 | 40 | def parser(args): 41 | """The main wrapper for CLAM peak-caller. 
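	Writes two files under `args.out_dir`: `all_permutation_peaks.bed` (per-gene peaks called from unique reads and, when a multi-read bam is given, from unique+multi reads, labeled 'unique'/'combined') and `narrow_peak.permutation.bed` (the same peaks collapsed with bedtools merge).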
42 | """ 43 | # logging info 44 | logger.info('start') 45 | logger.info('run info: %s'%(' '.join(sys.argv))) 46 | # some back-reference (to v1.0.0) parameters here 47 | random_state = args.random_state 48 | merge_size = args.merge_size 49 | output_dir = os.path.abspath(args.out_dir) 50 | nthread = args.nthread 51 | max_iter = 200 52 | 53 | # read in gtf gene annotations 54 | gene_annot = read_gtf(args.gtf_fp) 55 | 56 | # read in GTF 57 | gene_list = gene_annot.keys() 58 | child_gene_list = [x for x in chunkify(list(gene_list), nthread)] 59 | 60 | # call peaks 61 | unibam_file=args.in_bam[0] 62 | multibam_file=args.in_bam[1] if len(args.in_bam)>=2 else None 63 | 64 | if nthread>1: 65 | pool = Pool(processes=args.nthread) 66 | # assert len(args.in_bam)==2 67 | tid_to_qval_compact = pool.map( 68 | _child_get_permutation_fdr, 69 | [ (unibam_file, multibam_file, child_gene_list[i], gene_annot, args.qval_cutoff, max_iter, ~(args.lib_type=='unstranded'), 'fdr', random_state) 70 | for i in range(args.nthread) 71 | ]) 72 | 73 | pool.terminate() 74 | pool.join() 75 | 76 | unique_tid_to_qval, combined_tid_to_qval = unpack_tid_to_qval(tid_to_qval_compact) 77 | else: 78 | unique_tid_to_qval, combined_tid_to_qval = _child_get_permutation_fdr( 79 | (unibam_file, multibam_file, gene_list, gene_annot, args.qval_cutoff, max_iter, ~( 80 | args.lib_type == 'unstranded'), 'fdr', random_state)) 81 | 82 | 83 | #pickle.dump(unique_tid_to_qval, open(tmp_dir+'/unique_to_qval.pdata','wb'), -1) 84 | #pickle.dump(combined_tid_to_qval, open(tmp_dir+'/combined_to_qval.pdata','wb'), -1) 85 | merge_peaks = merge_peaks_singleNucl 86 | #if args.merge_method==1: 87 | # merge_peaks=merge_peaks_singleNucl 88 | # mm='singleNucl' 89 | #elif args.merge_method==2: 90 | # merge_peaks=merge_peaks_broadPeak 91 | # mm='broadPeak' 92 | #else: 93 | # merge_peaks=merge_peaks_singleNucl 94 | # mm='unknown selection, using default singleNucl' 95 | 96 | 97 | unique_peaks=merge_peaks(unique_tid_to_qval, merge_size, args.qval_cutoff) 98 | combined_peaks=merge_peaks(combined_tid_to_qval, merge_size, args.qval_cutoff) if multibam_file is not None else None 99 | 100 | # write peak-calling results to file. 
101 | narrowPeak_formatter = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t.\t%.3e\t.\n" 102 | ## chr start end name unique/combined strand signalValue pVal qVal peak 103 | with open(output_dir + '/all_permutation_peaks.bed', 'w') as f: 104 | for peak in unique_peaks: # peak = ['chr\tstart\tend\tstrand', 'height\tqval\t', tid] 105 | if args.extend is None: 106 | wt_loc=peak[0] 107 | else: 108 | wt_loc=extend_peak_region(peak[0], args.extend) 109 | #f.write(wt_loc + '\t' + '\t'.join([str(x) for x in peak[1]]) + '\t' + peak[2] + '\tunique\n') 110 | chr, start, end, strand = wt_loc.split('\t') 111 | _, signal_qval, gene_name = peak 112 | signal, qval = signal_qval 113 | f.write( narrowPeak_formatter % (chr, start, end, gene_name, 'unique', strand, signal, qval) ) 114 | if combined_peaks is not None: 115 | for peak in combined_peaks: 116 | if args.extend is None: 117 | wt_loc=peak[0] 118 | else: 119 | wt_loc=extend_peak_region(peak[0], args.extend) 120 | #f.write(wt_loc + '\t' + '\t'.join([str(x) for x in peak[1]]) + '\t' + peak[2] + '\tcombined\n') 121 | chr, start, end, strand = wt_loc.split('\t') 122 | _, signal_qval, gene_name = peak 123 | signal, qval = signal_qval 124 | f.write( narrowPeak_formatter % (chr, start, end, gene_name, 'combined', strand, signal, qval) ) 125 | if args.lib_type=='unstranded': 126 | cmd = ''' sort -k1,1 -k2,2n %s/all_permutation_peaks.bed |awk '{OFS="\t"; print $1,$2,$3,$4":"$7":"$9,$5,$6}'| \ 127 | bedtools merge -d -1 -i stdin -c 4,5,6 -o collapse,collapse,distinct > %s''' % (output_dir, os.path.join(output_dir,'narrow_peak.permutation.bed') ) 128 | else: 129 | cmd = ''' sort -k1,1 -k2,2n %s/all_permutation_peaks.bed |awk '{OFS="\t"; print $1,$2,$3,$4":"$7":"$9,$5,$6}'| \ 130 | bedtools merge -s -d -1 -i stdin -c 4,5,6 -o collapse,collapse,distinct > %s''' % (output_dir, os.path.join(output_dir,'narrow_peak.permutation.bed') ) 131 | os.system( cmd ) 132 | logger.info('end') 133 | return 134 | 135 | 136 | def chunkify(a, n): 137 | """ 138 | Separate a list (a) into consecutive n chunks. 139 | Returns the chunkified index 140 | """ 141 | k, m = len(a) / n, len(a) % n 142 | return (a[int(i * k + min(i, m)):int((i + 1) * k + min(i + 1, m))] for i in range(n)) 143 | 144 | 145 | def unpack_tid_to_qval(compact): 146 | """ 147 | Unpacks the returned values from multi-processing. 148 | """ 149 | unique_tid_to_qval=defaultdict(list) 150 | combined_tid_to_qval=defaultdict(list) 151 | for item in compact: 152 | unique, combined = item[0], item[1] 153 | if combined is None: 154 | combined_tid_to_qval=None 155 | for tid in unique: 156 | if len(unique[tid]) > 0: 157 | unique_tid_to_qval[tid] = unique[tid] 158 | else: 159 | for tid in combined: 160 | if len(unique[tid])>0: 161 | unique_tid_to_qval[tid]=unique[tid] 162 | if len(combined[tid])>1: 163 | combined_tid_to_qval[tid]=combined[tid] 164 | return unique_tid_to_qval,combined_tid_to_qval 165 | 166 | 167 | def _child_get_permutation_fdr(args): 168 | """ 169 | General permutation wrapper for a list of genes. Gets called by multi-processing generated by Pool() 170 | Returns packed FDRs from each child process. 
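	The single `args` tuple is unpacked as (unibam_file, multibam_file, child_gene_list, gene_annot, pval_cutoff, max_iter, is_stranded, correction_method, seed), so the same function serves both the Pool.map path and the single-thread path.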
171 | """ 172 | (unibam_file, multibam_file, child_gene_list, gene_annot, pval_cutoff, max_iter, is_stranded, correction_method,seed)=args 173 | random.seed(seed) 174 | 175 | unique_tid_to_qval=defaultdict(list) 176 | combined_tid_to_qval = defaultdict( 177 | list) if multibam_file is not None else None 178 | 179 | unibam=pysam.Samfile(unibam_file, 'rb') 180 | multibam=pysam.Samfile(multibam_file, 'rb') if multibam_file is not None else None 181 | 182 | pid = os.getpid() 183 | tot = len(child_gene_list) 184 | 185 | for i in range(len(child_gene_list)): 186 | if not i % 200: 187 | logger.debug('pid %s : %i / %i (%.2f%%)'% (pid, i, tot, float(i)/float(tot)*100)) 188 | gene_name = child_gene_list[i] 189 | gene = gene_annot[gene_name] 190 | chr, start, end, strand, tid = gene[0:5] 191 | unique_reads = read_tid_frag_from_bam(gene, unibam, is_stranded, True) 192 | multi_reads = read_tid_frag_from_bam(gene, multibam, is_stranded, False) if multibam_file is not None else None 193 | 194 | this_unique_to_qval = do_permutation(gene, unique_reads, max_iter, pval_cutoff, correction_method) 195 | this_combined_to_qval = do_permutation(gene, unique_reads+multi_reads, max_iter, pval_cutoff, correction_method) if multibam_file is not None else None 196 | 197 | unique_tid_to_qval[tid].extend(this_unique_to_qval) 198 | if multibam_file is not None: 199 | combined_tid_to_qval[tid].extend(this_combined_to_qval) 200 | unibam.close() 201 | if multibam_file is not None: 202 | multibam.close() 203 | return unique_tid_to_qval, combined_tid_to_qval 204 | 205 | 206 | def do_permutation(transcr, read_transcript, max_iter, pval_cutoff, correction_method): 207 | """ 208 | Permutes the reads along a given gene length, sub-routine that get called by get_permutation_fdr(..). 209 | Returns the locally corrected p-values for each observed height on the given gene. 210 | """ 211 | chr, tstart, tend, strand, tid = transcr[0:5] 212 | tid_length=tend-tstart+1 213 | obs_heights_count=count_pileup_heights(tid_length, read_transcript) 214 | 215 | tid_to_qval=[] 216 | 217 | rand_heights_dist=defaultdict(int) 218 | rand_sum=0 219 | # need to account for the 'observed' data, since permutation tests should never report p-value as 0. 
3/22/16 220 | for i in obs_heights_count: 221 | if i==0: 222 | continue 223 | else: 224 | rand_heights_dist[int(i)]+=1 225 | rand_sum+=1 226 | for B in range(max_iter): 227 | new_heights_count=permutate_heights(tid_length, read_transcript) 228 | for i in new_heights_count: 229 | if i==0: 230 | continue 231 | else: 232 | rand_heights_dist[i]+=1 233 | rand_sum+=1 234 | height_to_pval={} 235 | for h in set(obs_heights_count): 236 | if h < 1: 237 | continue 238 | else: 239 | lefter=0 240 | for j in range(int(h), max(rand_heights_dist)+1): 241 | lefter+=rand_heights_dist[j] 242 | height_to_pval[h]=lefter/float(rand_sum) 243 | pval_list=[] 244 | for i in obs_heights_count: 245 | if i<1: 246 | continue 247 | pval_list.append(height_to_pval[i]) 248 | if len(pval_list)<=1: 249 | return [] 250 | 251 | qval_list=multipletests(pval_list, method='fdr_bh')[1] 252 | #if correction_method==2 or correction_method.lower()=='fdr': 253 | # qval_list=multipletests(pval_list, method='fdr_bh')[1] 254 | #else: 255 | # qval_list=[min(x*(len(set([int(y) for y in height_to_pval if y!=0]))), 1.0) for x in pval_list] 256 | 257 | ind=0 258 | last_height=0 259 | for j in range(len(obs_heights_count)): 260 | this_height=obs_heights_count[j] 261 | if this_height<1: 262 | last_height=0 263 | continue 264 | if qval_list[ind] <= pval_cutoff: 265 | if this_height==last_height: 266 | chr, last_start, last_end, last_strand, last_height, last_qval=tid_to_qval[-1] 267 | tid_to_qval[-1]=[chr, last_start, tstart+j+1, strand, last_height, last_qval] 268 | else: 269 | tid_to_qval.append([chr, tstart+j, tstart+j+1, strand, obs_heights_count[j], qval_list[ind]]) # chr, start, end, strand, height, this_qval 270 | last_height=this_height 271 | ind+=1 272 | return tid_to_qval 273 | 274 | 275 | def heights_to_dist(rand_heights): 276 | """ 277 | sub-routine 278 | """ 279 | rand_heights_dist=defaultdict(int) 280 | rand_sum=0 281 | for new_heights_count in rand_heights: 282 | for i in new_heights_count: 283 | if i==0: 284 | continue 285 | else: 286 | rand_heights_dist[i]+=1 287 | rand_sum+=1 288 | return rand_heights_dist, rand_sum 289 | 290 | 291 | def permutate_heights(tlen, reads): 292 | """ 293 | Sub-routine for do_permutation(...) 294 | Randomly allocate the read locations. 295 | """ 296 | loc_heights=[0] * tlen 297 | for id, pos, read_len, score in reads: 298 | if score<1 and random.random() > score: 299 | continue 300 | rand_pos=random.randint(1, max(1, tlen-read_len)) 301 | for i in range(rand_pos, min(rand_pos + read_len, tlen)): 302 | loc_heights[i]+=1 303 | return loc_heights 304 | 305 | 306 | def count_pileup_heights(tlen, reads): 307 | """ 308 | Sub-routine for do_permutation(...) 309 | Counts the distribution of pile-up heights for a given gene/permutation 310 | """ 311 | loc_heights=[0] * tlen 312 | for id, pos, read_len, score in reads: 313 | for i in range(pos, min(pos+read_len-1, tlen)): 314 | loc_heights[i]+=score 315 | return loc_heights 316 | 317 | 318 | def merge_peaks_broadPeak(transcript_to_qval, merge_size, pval_cutoff): 319 | """ 320 | Merge called peaks on a gene using option 2, 321 | i.e. if two peaks close to each other, region 322 | between two peaks are also called as peaks 323 | Retuns a list of merged peaks. 
324 | """ 325 | peaks=[] 326 | last_qval=[0,1] 327 | for tid in transcript_to_qval: 328 | init=True 329 | for chr, start, end, strand, height, this_qval in transcript_to_qval[tid]: 330 | loc=[chr, str(start), str(end), strand] 331 | this_qval=[height, this_qval] # this_qval=[height, qval] so that when qval=0, we can compare height 332 | if this_qval[1] > pval_cutoff: 333 | continue 334 | if init: 335 | last_qval=this_qval 336 | last_pos=[start, end] 337 | last_loc=loc 338 | last_chr=chr 339 | write_out=False 340 | init=False 341 | continue 342 | if int(start) - int(last_pos[1]) > merge_size: 343 | write_out=True 344 | else: 345 | last_pos=[last_pos[0], end] 346 | last_qval=this_qval if last_qval[0] pval_cutoff: 398 | continue 399 | if init: 400 | last_qval=this_qval 401 | last_pos=[start, end] 402 | last_loc=loc 403 | last_chr=chr 404 | write_out=False 405 | init=False 406 | continue 407 | if last_chr == chr: 408 | if abs( int(start) - int(last_pos[0]) ) > merge_size: 409 | write_out=True 410 | elif last_qval[0] < this_qval[0]: 411 | last_pos=[start, end] 412 | last_qval=this_qval 413 | last_loc=loc 414 | write_out=False 415 | else: 416 | write_out=True 417 | 418 | if write_out and last_qval[1] < pval_cutoff: 419 | #peaks[last_loc]=last_qval 420 | peaks.append([last_loc, last_qval, tid]) 421 | last_qval=this_qval 422 | last_pos=[start, end] 423 | last_loc=loc 424 | last_chr=chr 425 | write_out=False 426 | if last_qval[1] < pval_cutoff: 427 | peaks.append([last_loc, last_qval, tid]) 428 | return peaks 429 | 430 | 431 | def extend_peak_region(loc, target_len): 432 | """ 433 | Extends peak symmetrically if peak is smaller than target_len. 434 | """ 435 | chr, start, end, strand = loc.split('\t') 436 | start = int(start) 437 | end = int(end) 438 | old_len = end - start 439 | if old_len > target_len: 440 | return loc 441 | else: 442 | center = int((start + end)/2) 443 | start = center - int(target_len /2) 444 | end = center + int(target_len/2) 445 | return '\t'.join([chr, str(start), str(end), strand]) 446 | 447 | 448 | def read_tid_frag_from_bam(tid, bamfile, is_stranded, is_unique): 449 | """ 450 | Use pysam to fetch reads info for a given gene and its loci. 451 | Returns reads, read weights and its mapped loci. 
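	Each element of the returned list is [qname, offset_from_gene_start, read_length, score], where score is 1 for unique reads and the re-aligner's 'AS' weight for multi-mapped reads.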
452 | """ 453 | tid_reads=[] 454 | #gene, chr, strand, start, end=tid 455 | chr, start, end, strand, gene = tid[0:5] 456 | if strand=='-': 457 | is_reverse=True 458 | else: 459 | is_reverse=False 460 | reads=[x for x in bamfile.fetch(chr, int(start), int(end)) if x.is_reverse==is_reverse or not is_stranded] 461 | reads=[x for x in reads if x.pos>=int(start) and x.pos<=int(end)] 462 | for read in reads: 463 | if is_unique: 464 | try: 465 | opt_NH=read.opt('NH') 466 | if opt_NH > 1: 467 | continue 468 | except: 469 | pass 470 | score=1 471 | else: 472 | try: 473 | opt_AS=read.opt('AS') 474 | if isinstance(opt_AS, float): 475 | score=opt_AS 476 | else: 477 | continue 478 | except: 479 | continue 480 | try: 481 | read_length = read.opt('RL') 482 | except: 483 | read_length = read.positions[-1] - read.positions[0] + 1 484 | 485 | if (not 'N' in read.cigarstring) and \ 486 | (read.pos-start>=0 and read_length<500): # to avoid junction reads 487 | tid_reads.append([read.qname, read.pos-start, read_length, score]) 488 | return tid_reads 489 | 490 | 491 | -------------------------------------------------------------------------------- /CLAM/preprocessor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """This preprocessing script is part of the CLAM pipeline. 4 | 5 | This subcommand (new v1.1) will prepare the input files for CLAM pipeline. As of the current version (v1.1), it looks for 6 | reads passing QC, splits the input bam file by sorting them into `unique.sorted.bam` and `multi.sorted.bam`, 7 | and adding an additional tag "RT" (short for Read Tag) to each alignment based which read tagger function the user supplied. 8 | 9 | Note that you can also run `CLAM realigner` directly, which will call `preprocessor` and automatically determine 10 | if `preprocessor` has been called in the output folder. 11 | 12 | If you don't want to run `realigner`, you can also run `peakcaller` directly after `preprocessor`. 13 | 14 | Example run: 15 | ``` 16 | CLAM preprocessor -i path/to/input/Aligned.out.bam -o path/to/clam/outdir/ --read-tagger-method median 17 | ``` 18 | Author: 19 | Zijun Zhang 20 | 21 | Tested under python 2.7 22 | """ 23 | from . 
import config 24 | __version__ = config.__version__ 25 | 26 | import os 27 | import sys 28 | import pysam 29 | import numpy as np 30 | from collections import defaultdict 31 | #from tqdm import tqdm 32 | import logging 33 | import datetime 34 | import bisect 35 | import argparse as ap 36 | import inspect 37 | import hashlib 38 | 39 | 40 | logger = logging.getLogger('CLAM.Preprocessor') 41 | 42 | 43 | def alignment_mutation(x, mut_ref, mut_obs): 44 | """DOCSTRING 45 | Need to read reference genome 46 | NotImplemented 47 | """ 48 | raise NotImplementedError() 49 | 50 | 51 | def read_tagger_collection(alignment, method='median', **kwargs): 52 | """ tag a read alignment to a genomic locus 53 | Args: 54 | Returns: 55 | """ 56 | tagger_func = { 57 | # center of the read; must dicard junction reads 58 | 'median': lambda x: -1 if 'N' in x.cigarstring else int(np.median(x.positions))+1, 59 | # start site of the read; truncation in iCLIP/eCLIP 60 | 'start': lambda x: -1 if 'N' in x.cigarstring else x.positions[-1]+1 if x.is_reverse else x.positions[0]+1, 61 | # extend from 5' site to certain length; need kwargs 62 | 'extend': lambda x: -1 if 'N' in x.cigarstring else x.positions[-1]-kwargs['ext_len'] if x.is_reverse else x.positions[0]+kwargs['ext_len'], 63 | # mutation tag a specific mutation type 64 | 'mutation': lambda x: alignment_mutation(x, kwargs['mut_ref'], kwargs['mut_obs']) 65 | } 66 | try: 67 | tag=tagger_func[method](alignment) 68 | except: 69 | tag=-1 70 | return tag 71 | 72 | 73 | def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_method, lib_type): 74 | """Pre-processing function for cleaning up the input bam file. 75 | Args: 76 | Returns: 77 | """ 78 | # logging the parameter values 79 | frame = inspect.currentframe() 80 | args, _, _, values = inspect.getargvalues(frame) 81 | msg = 'Params:\n' 82 | for i in args: 83 | msg += "%s = %s \n"%(i, values[i]) 84 | logger.info(msg) 85 | read_tagger=lambda x: read_tagger_collection(x, method=read_tagger_method) 86 | logger.info('filtering input bam') 87 | 88 | in_bam = pysam.Samfile(filename,'rb') 89 | # unique read bam 90 | ubam_fn = os.path.join(out_dir, 'unique.bam') 91 | sorted_ubam_fn = os.path.join(out_dir, 'unique.sorted.bam') 92 | ubam=pysam.Samfile(ubam_fn, 'wb', template=in_bam) 93 | unique_counter = 0 94 | 95 | # multi-read bam 96 | mbam_fn = os.path.join(out_dir, 'multi.bam') 97 | sorted_mbam_fn = os.path.join(out_dir, 'multi.sorted.bam') 98 | mbam=pysam.Samfile(mbam_fn, 'wb', template=in_bam) 99 | mread_set = set() 100 | 101 | # splitting unique and multi- reads 102 | # and add the read taggers we need 103 | if not \ 104 | (os.path.isfile( os.path.join(out_dir,'unique.sorted.bam') ) and \ 105 | os.path.isfile( os.path.join(out_dir,'multi.sorted.bam')) ): 106 | 107 | #for read in tqdm(in_bam): 108 | counter = 0 109 | for read in in_bam: 110 | # poor man's progress bar 111 | counter += 1 112 | if not counter % 10**6: 113 | logger.debug('tagged %i alignments'%counter) 114 | read_tag = read_tagger(read) 115 | ## skip reads with unassigned tagger 116 | if read_tag==-1: 117 | continue 118 | read.tags += [('RT', read_tag)] ## add the tag 119 | 120 | tagged_read = pysam.AlignedSegment() 121 | tagged_read.query_name = read.query_name 122 | tagged_read.query_sequence = 'N' 123 | tagged_read.flag = read.flag 124 | tagged_read.reference_id = read.reference_id 125 | tagged_read.reference_start = read_tag - 1 # 0-based leftmost coordinate 126 | tagged_read.mapping_quality = read.mapping_quality 127 | tagged_read.cigar 
= ((0, 1),) 128 | tagged_read.template_length = read.template_length 129 | tagged_read.query_qualities = pysam.qualitystring_to_array("<") 130 | tagged_read.tags = read.tags 131 | read_len = sum([i[1] for i in read.cigar if i[0] == 0]) 132 | tagged_read.tags += [('RL', read_len)] 133 | if len(read.query_sequence) >= 32: 134 | tagged_read.tags += [('SQ', 135 | hashlib.md5(read.query_sequence.encode('utf-8')).hexdigest())] 136 | else: 137 | tagged_read.tags += [('SQ', read.query_sequence)] 138 | 139 | 140 | # add lib_type check 141 | if lib_type != "unstranded": 142 | tagged_read.is_reverse = (read.is_reverse) ^ (lib_type!="sense") 143 | 144 | if read.is_secondary or (read.has_tag('NH') and read.opt("NH")>1): 145 | #try: 146 | if read.opt("NH") < max_hits: 147 | mbam.write(tagged_read) 148 | mread_set.add(read.qname) 149 | #except KeyError: 150 | # #print read 151 | # raise Exception('%s: missing NH tag when is_secondary=%s'%(read.qname,read.is_secondary)) 152 | else: 153 | ubam.write(tagged_read) 154 | unique_counter += 1 155 | 156 | ubam.close() 157 | mbam.close() 158 | 159 | # sorting 160 | pysam.sort('-m', '4G', '-@', '3', '-T', os.path.dirname(sorted_ubam_fn), '-o', sorted_ubam_fn, ubam_fn) 161 | os.remove(ubam_fn) 162 | pysam.sort('-m', '4G', '-@', '3', '-T', os.path.dirname(sorted_mbam_fn), '-o', sorted_mbam_fn, mbam_fn) 163 | os.remove(mbam_fn) 164 | pysam.index(sorted_ubam_fn) 165 | pysam.index(sorted_mbam_fn) 166 | 167 | # log the statistics 168 | multi_counter = len(mread_set) 169 | logger.info( 170 | 'Unique reads = %s; ' % unique_counter + \ 171 | 'Multi reads = %s (%.2f %%)' % \ 172 | ( multi_counter, float(multi_counter)/(multi_counter+unique_counter)*100 ) 173 | ) 174 | else: 175 | logger.info('found previously sorted tag-bam. checking if need collapsing.') 176 | 177 | # filter redundant tags if turned on 178 | if max_tags>0: 179 | logger.info('collapsing unique') 180 | filter_bam_maxtags(os.path.join(out_dir, 'unique.sorted.collapsed.bam'), os.path.join(out_dir, 'unique.sorted.bam'), max_tags) 181 | logger.info('collapsing multi') 182 | filter_bam_maxtags(os.path.join(out_dir, 'multi.sorted.collapsed.bam'), os.path.join(out_dir, 'multi.sorted.bam'), max_tags) 183 | 184 | in_bam.close() 185 | return 186 | 187 | 188 | def collapse_stack(stack, collapse_dict, max_tags): 189 | """DOCSTRING 190 | Args 191 | Returns 192 | """ 193 | new_alignment_list = [] 194 | new_alignment_dict = defaultdict(list) 195 | for aln in stack: 196 | new_alignment_dict[aln.tags[-1][1]].append(aln) 197 | 198 | # TODO 2017.10.21: 199 | # further collapse `new_alignment_dict` 200 | # based on degeneracy and/or read tags 201 | 202 | for seq in new_alignment_dict: 203 | this_alignment_qname_list = [x.qname for x in new_alignment_dict[seq] ] 204 | is_collapsed = [True if x in collapse_dict else False for x in this_alignment_qname_list] 205 | ## if any of the alignment is collapsed before, 206 | ## we require all of them to be collapsed 207 | if any(is_collapsed): 208 | assert all(is_collapsed) 209 | target_alignment_qname = collapse_dict[this_alignment_qname_list[0]][0:max_tags] 210 | assert len(collapse_dict[this_alignment_qname_list[0]]) <= max_tags 211 | target_alignment = [new_alignment_dict[seq][this_alignment_qname_list.index(x)] for x in target_alignment_qname] 212 | else: 213 | target_alignment = new_alignment_dict[seq][0:max_tags] 214 | for aln_qname in this_alignment_qname_list: 215 | collapse_dict[aln_qname] = [x.qname for x in target_alignment] 216 | for read in target_alignment: 217 | 
read.tags=read.tags[:-1] 218 | new_alignment_list.append( read ) 219 | return new_alignment_list, collapse_dict 220 | 221 | 222 | def filter_bam_maxtags(obam_fn, ibam_fn, max_tags=1): 223 | """DOCSTRING 224 | Args 225 | Returns 226 | """ 227 | assert max_tags>0 228 | # prepare files 229 | ibam = pysam.Samfile(ibam_fn, 'rb') 230 | obam = pysam.Samfile(obam_fn, 'wb', template=ibam) 231 | # init 232 | collapse_dict = defaultdict(list) 233 | chr_list=[x['SN'] for x in ibam.header['SQ']] 234 | input_counter = 0 235 | output_counter = 0 236 | 237 | for chr in chr_list: 238 | # empty stack for each new chromosome 239 | stack = [] 240 | last_pos = -1 241 | for read in ibam.fetch(chr): 242 | input_counter += 1 243 | if not (input_counter % (5*(10**6)) ): 244 | logger.debug('collapsed %i alignments'%input_counter) 245 | if read.positions[0] > last_pos: 246 | new_alignment_list, collapse_dict = collapse_stack(stack, collapse_dict, max_tags) 247 | output_counter += len(new_alignment_list) 248 | last_pos = read.positions[0] 249 | stack = [read] 250 | for new_alignment in new_alignment_list: 251 | #new_alignment.query_sequence = '*' 252 | #new_alignment.query_qualities = '0' 253 | _ = obam.write(new_alignment) 254 | else: 255 | stack.append(read) 256 | new_alignment_list, collapse_dict = collapse_stack(stack, collapse_dict, max_tags) 257 | output_counter += len(new_alignment_list) 258 | last_pos = read.positions[0] 259 | for new_alignment in new_alignment_list: 260 | #new_alignment.query_sequence = '*' 261 | #new_alignment.query_qualities = '0' 262 | _ = obam.write(new_alignment) 263 | ibam.close() 264 | obam.close() 265 | #os.rename(obam_fn, ibam_fn) 266 | #pysam.sort(obam_fn) 267 | pysam.index(obam_fn) 268 | logger.info('Input = %s; Output = %s; Redundancy = %.2f'%(input_counter,output_counter, 1-float(output_counter)/input_counter)) 269 | return 270 | 271 | 272 | 273 | 274 | def parser(args): 275 | """DOCSTRING 276 | Args 277 | Returns 278 | """ 279 | try: 280 | in_bam = args.in_bam 281 | out_dir = args.out_dir 282 | if not os.path.isdir(out_dir): 283 | os.mkdir(out_dir) 284 | tag_method = args.tag_method 285 | max_hits = args.max_hits 286 | ## Note: if specified max_tags, need pre-sorted bam 287 | max_tags = args.max_tags 288 | lib_type = args.lib_type 289 | 290 | #logger = logging.getLogger('CLAM.Preprocessor') 291 | logger.info('start') 292 | logger.info('run info: %s'%(' '.join(sys.argv))) 293 | 294 | filter_bam_multihits(in_bam, max_hits=max_hits, max_tags=max_tags, out_dir=out_dir, 295 | read_tagger_method=tag_method, lib_type=lib_type) 296 | 297 | logger.info('end') 298 | except KeyboardInterrupt: 299 | sys.exit(0) 300 | return 301 | -------------------------------------------------------------------------------- /CLAM/realigner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """This re-aligner script is part of the CLAM pipeline. 4 | 5 | This subcommand will run expectation-maxmization to assign the multi-mapped reads in a probablistic framework. 6 | More details about the EM model is described in our NAR paper. 7 | 8 | Note when `--retag` is specified, `realigner` will re-run `preprocessor` regardless; otherwise, it will use 9 | the prepared files in `outdir` if available. 10 | 11 | Example run: 12 | ``` 13 | CLAM realigner -i path/to/input/Aligned.out.bam -o path/to/clam/outdir/ --read-tagger-method start --retag 14 | ``` 15 | Author: 16 | Zijun Zhang 17 | 18 | Tested under python 2.7 19 | """ 20 | 21 | from . 
import config 22 | __version__ = config.__version__ 23 | 24 | import os 25 | import sys 26 | import pysam 27 | import numpy as np 28 | from collections import defaultdict, deque 29 | #from tqdm import tqdm 30 | import logging 31 | import datetime 32 | import bisect 33 | import argparse as ap 34 | import inspect 35 | from .preprocessor import * 36 | 37 | logger = logging.getLogger('CLAM.Realigner') 38 | 39 | class Bit: 40 | """ Binary Indexed Tree to store values in genomic intervals. 41 | Implementation modified from http://www.geeksforgeeks.org/binary-indexed-tree-or-fenwick-tree-2/ 42 | Args: 43 | n (int): length of the interval to construct 44 | Returns: 45 | a BIT object with `add` and `sum` method over arbitrary sub-intervals 46 | with O(log(n)) time 47 | """ 48 | 49 | def __init__(self, n): 50 | sz = 1 51 | while n >= sz: 52 | sz *= 2 53 | self.size = sz 54 | self.array_size = n 55 | self.data = [0]*sz 56 | 57 | def sum(self, i): 58 | assert i >= 0 59 | if i==0: 60 | return 0 61 | if i > self.array_size: 62 | i = self.array_size 63 | s = 0 64 | while i > 0: 65 | s += self.data[i] 66 | i -= i & -i 67 | return s 68 | 69 | def add(self, i, x): 70 | assert i > 0 71 | while i < self.size: 72 | self.data[i] += x 73 | i += i & -i 74 | 75 | 76 | def construct_BIT_track(subgraph, read_to_locations, ubam, unstranded=False): 77 | """Construct BIT for each genomic region / node. 78 | Args: 79 | subgraph (list): a list of node names 80 | read_to_locations (dict): 81 | Returns: 82 | node_track (dict/BIT): a node-track dictionary; node_name => BIT 83 | multi_reads_weights (dict): a dictionary for multi-mapped reads; read_qname => node => [score, locus] 84 | """ 85 | node_track = {} 86 | total_len = 0 87 | 88 | # initialized BIT tracks, add mreads to the tracks, 89 | # and keep a dict of read scores 90 | multi_reads_weights = defaultdict(dict) 91 | obs_reads = read_to_locations.keys() 92 | for read_x_qname in obs_reads: 93 | read_x_nodes = read_to_locations[read_x_qname] 94 | read_x_score = 1.0 / len(read_x_nodes) 95 | for node in read_x_nodes: 96 | chr, strand, start, end = node.split(':') 97 | start, end = int(start), int(end) 98 | if not node in node_track: 99 | this_len = end - start + 1 100 | node_track[node] = Bit(this_len) 101 | read_x_tag = read_x_nodes[node].opt('RT') 102 | node_locus = read_x_tag - start + 1 103 | node_track[node].add(node_locus, read_x_score) 104 | multi_reads_weights[read_x_qname][node]=[read_x_score, node_locus] 105 | #del read_to_locations[read_x_qname][node] 106 | #del read_to_locations[read_x_qname] 107 | 108 | # now add ureads by fetching from ubam; 109 | # we don't need to keep track of them, just add the weights 110 | for node in node_track: 111 | chr, strand, start, end = node.split(':') 112 | start, end = int(start), int(end) 113 | is_reverse = True if strand=='-' else False 114 | uread_tagger = [x.opt('RT') for x in ubam.fetch(chr, start, end) \ 115 | if unstranded or x.is_reverse==is_reverse] 116 | for uread_x_tagger in uread_tagger: 117 | if uread_x_tagger>=start and uread_x_tagger<=end: 118 | node_locus = uread_x_tagger - start + 1 119 | node_track[node].add(node_locus, 1) 120 | 121 | return node_track, multi_reads_weights 122 | 123 | 124 | 125 | def run_EM(node_track, multi_reads_weights, w=50, epsilon=1e-6, max_iter=100, verbose=True): 126 | """ EM implementation for re-assigning multi-mapped reads, given the 127 | compatibility matrix of a subgraph. 128 | Args: 129 | node_track (dict): dict. 
of BIT returned from `construct_BIT_track` 130 | multi_reads_weights (dict): dict of mread qname and locus returned from `construct_BIT_track` 131 | w (int): window size for search vicinity reads 132 | epsilon (float): a small number for testing convergence between iterations 133 | max_iter (int): maximum iterations of EM 134 | verbose (bool, options): prints status in verbose mode 135 | Returns: 136 | multi_reads_weights (dict): the mread weight after EM 137 | """ 138 | iter = 1 139 | residue = 1 140 | #n_est = sum([1. for r in multi_reads_weights for n in multi_reads_weights[r] ]) 141 | n_est = sum([1. for r in multi_reads_weights ]) 142 | while iter < max_iter and residue > epsilon: 143 | residue = 0 144 | reweight=defaultdict(dict) 145 | ## calculate re-distribute probability; M-step 146 | for read in multi_reads_weights: 147 | for nd in multi_reads_weights[read]: 148 | track_len=node_track[nd].array_size 149 | old_score, read_tag = multi_reads_weights[read][nd] 150 | reweight[read][nd] = max( 0, node_track[nd].sum(min(track_len, read_tag + w)) - node_track[nd].sum(max(0,read_tag - w)) ) 151 | ## update track by expectation; E-step 152 | for read in reweight: 153 | dn=sum([reweight[read][x] for x in reweight[read]]) 154 | if dn==0: 155 | logger.debug('Error: no read weight found @ %s.'%read ) 156 | dn=1 157 | for nd in reweight[read]: 158 | old_score, read_tag = multi_reads_weights[read][nd] 159 | new_score = reweight[read][nd] / float(dn) 160 | node_track[nd].add(read_tag, new_score - old_score) 161 | residue += (old_score - new_score)**2 162 | multi_reads_weights[read][nd][0] = new_score 163 | residue /= n_est 164 | if verbose and (not iter % 10 or iter == max_iter): 165 | logger.debug('Iter %d, residue = %f' % (iter, residue)) 166 | iter += 1 167 | return multi_reads_weights 168 | 169 | 170 | def build_read_cluster(alignment, chr_dict, location_to_reads, genomic_cluster_dict, unstranded=False, winsize=50): 171 | """Given an alignment, find its genomic cluster, and all other mreads 172 | in that cluster 173 | Args: 174 | alignment (pysam.AlignedSegment): pysam alignment object 175 | chr_dict (dict): a dict of chrom name and sizes 176 | location_to_reads (dict): stores all mreads indexed by aligned locus; cluster name => mread alignments 177 | genomic_cluster_dict (dict): stores genomic clusters; chrom => [intv1, intv2, ..] 178 | unstranded (bool): if true, don't use the strand info in alignment 179 | winsize (int): window size for search ureads 180 | Returns: 181 | genomic_cluster (tuple): the target chrom and coordinates after expanding the window size 182 | this_mread_dict (dict): dict. of mread alignments in the target cluster indexed by read_qname 183 | discarded_mread_alignments (list): discarded mread alignments because of multiple occurences within one cluster 184 | """ 185 | chr_list = chr_dict['name'] 186 | chr_size = chr_dict['size'] 187 | chrom = chr_list[alignment.reference_id] 188 | chr_len = chr_size[alignment.reference_id] 189 | site = alignment.opt('RT') 190 | is_reverse = alignment.is_reverse 191 | strand = '+' if unstranded or is_reverse==False else '-' 192 | this_mread_dict = {} 193 | this_mread_dict_set = defaultdict(set) 194 | discarded_mread_alignments = [] 195 | 196 | ## note to me: need to be more careful with 197 | ## one read mapped to *multiple-locations* within one cluster 198 | ## currently tossing away those alignments.. 
(in `discarded_mread_alignments`) 199 | 200 | ## check junction reads; should be filtered out in tagging step 201 | #if 'N' in alignment.cigarstring: 202 | # return None, None, [alignment] 203 | 204 | ## find the corresponding genomic cluster from `genomic_cluster_dict` 205 | chr_strand = chrom+':'+strand 206 | idx = bisect.bisect_right(genomic_cluster_dict[chr_strand], site) 207 | if not idx%2: 208 | print(alignment) 209 | raise Exception('%s falls out of region %s'%(alignment.qname, chr_strand+':'+str(site)) ) 210 | start = genomic_cluster_dict[chr_strand][idx-1] - winsize 211 | start = 1 if start<1 else start 212 | end = genomic_cluster_dict[chr_strand][idx] + winsize 213 | end = chr_len-1 if end>=chr_len else end 214 | genomic_cluster = (chrom, strand, start, end) 215 | 216 | ## fetch the reads 217 | cluster_name = ':'.join([chrom, strand, str(genomic_cluster_dict[chr_strand][idx-1]), str(genomic_cluster_dict[chr_strand][idx])]) 218 | if not cluster_name in location_to_reads: 219 | raise Exception("cannot find cluster '%s' associated with read '%s' in `location_to_reads` of len %i"%(cluster_name, alignment.qname, len(location_to_reads))) 220 | mread_list = location_to_reads[cluster_name] 221 | #print(alignment, cluster_name) 222 | for x in mread_list: 223 | this_mread_dict_set[x.qname].add(x) 224 | del location_to_reads[cluster_name] 225 | 226 | ## find other mreads in this cluster 227 | for read_x_qname in this_mread_dict_set: 228 | if len(this_mread_dict_set[read_x_qname])>1: 229 | discarded_mread_alignments.extend( [ x for x in list(this_mread_dict_set[read_x_qname]) ]) 230 | else: 231 | this_mread_dict[read_x_qname] = list(this_mread_dict_set[read_x_qname])[0] 232 | 233 | return genomic_cluster, this_mread_dict, discarded_mread_alignments 234 | 235 | 236 | def construct_subgraph(location_to_reads, read_qname, mread_dict, processed_mreads, chr_dict, genomic_cluster_dict, winsize=50, unstranded=False): 237 | """Given a mread_qname, find exhaustively all other connected mreads. 238 | Args: 239 | location_to_reads (dict): genomic cluster => Alignment 240 | read_qname (str): target read ID 241 | mread_dict (dict): stores all read ID => Alignment 242 | processed_mreads (set): 243 | chr_dict (dict): map ref_id to chrom_name, chrom_size 244 | genomic_cluster (dict): chrom:strand => [interval1, interval2, ..] 245 | Returns: 246 | read_to_locations (dict): collect a subset of mread alignments in the same 247 | subgraph starting with read_qname 248 | processed_mreads (set): record all processed mread_qname to avoid re-processing 249 | """ 250 | # record of processed alignments only need kept on within-subgraph level 251 | processed_mread_alignments = set() 252 | counter = 0 253 | # a list of `pysam.AlignedSegment` objects 254 | # note that all taggers are already stored in `pysam.AlignedSegment.opt('RT')` 255 | read_aln_list = [x for x in mread_dict[read_qname]] 256 | processed_mreads.add(read_qname) 257 | read_to_locations = defaultdict(dict) # read_qname -> {node_name1:segment1, node_name2:segment2} 258 | 259 | # enumerate all connected components 260 | while True: 261 | counter+=1; #print "%i: %i"%(counter, len(read_aln_list)) 262 | next_read_aln_list = [] 263 | 264 | #gen = read_aln_list if len(read_aln_list)<200 else tqdm(read_aln_list) 265 | gen = read_aln_list 266 | for alignment in gen: 267 | ## build a node for this mread alignment 268 | ## (if not already processed, i.e. 
built before) 269 | if alignment in processed_mread_alignments: 270 | continue 271 | 272 | genomic_cluster, this_mread_dict, discarded_mread_list = \ 273 | build_read_cluster(alignment, chr_dict, 274 | location_to_reads, genomic_cluster_dict, 275 | unstranded=unstranded, winsize=winsize) 276 | ## record those discarded alignments/reads 277 | ## note: we mark discarded_mread as processed as well, 278 | ## so as not to create a bias to less clustered regions. 279 | # THIS IS PYTHON3 INCOMPATIBLE 280 | #_ = map(processed_mread_alignments.add, discarded_mread_list) 281 | #_ = map(processed_mreads.add, [x.qname for x in discarded_mread_list]) 282 | for x in discarded_mread_list: 283 | processed_mread_alignments.add(x) 284 | for x in discarded_mread_list: 285 | processed_mreads.add(x.qname) 286 | if genomic_cluster is None: # this cluster is invald (only double-mappers) 287 | continue 288 | 289 | ## update read_to_locations 290 | node_name = ':'.join([str(x) for x in genomic_cluster]) 291 | #if node_name in subgraph: 292 | #logger.debug("I revisited '%s' at read '%s'."%(node_name, read_qname)) 293 | #print("I revisited '%s' at read '%s'."%(node_name, read_qname)) 294 | #break 295 | for x_qname in this_mread_dict: 296 | read_to_locations[x_qname].update({node_name : this_mread_dict[x_qname]}) 297 | 298 | ## then add new alignments(edges) to generate connected nodes 299 | ## in the next iteration 300 | # THIS IS PYTHON3 INCOMPATIBLE 301 | #_ = map(processed_mread_alignments.add, this_mread_dict.values()) 302 | for x in list(this_mread_dict.values()): 303 | processed_mread_alignments.add(x) 304 | for read_x_qname in this_mread_dict: 305 | if read_x_qname in processed_mreads: 306 | continue 307 | x_aln_list = [aln for aln in mread_dict[read_x_qname] if not aln in processed_mread_alignments] 308 | next_read_aln_list.extend(x_aln_list) 309 | 310 | ## .. and record to processed reads since we have generated 311 | ## the nodes for them 312 | #_ = map(processed_mreads.add, this_mread_dict.keys()) # this is python3 incompatible 313 | for x in list(this_mread_dict.keys()): 314 | processed_mreads.add(x) 315 | 316 | # if no more connected nodes can be found, break loop 317 | if len(next_read_aln_list)==0: 318 | break 319 | read_aln_list = next_read_aln_list 320 | return read_to_locations, processed_mreads 321 | 322 | 323 | def get_genomic_clusters(mbam, winsize=50, unstranded=False): 324 | """Parsing the mbam to cluster the mread, and construct interval=>alignment. 325 | Using the same object in difference references, and just keep one copy of 326 | the mread-alignments to minimize memory usage. 327 | Args: 328 | mbam (pysam.Samfile): multi-read bam file handler 329 | winsize (int): window size for search mreads 330 | unstranded (bool): if turned on, all reads will be pushed to forward strand 331 | Returns: 332 | genomic_cluster_dict (dict): chrom:+/- => [intv1, intv2, ..] 333 | mread_dict (dict): read_qname => [aln1, aln2, ..] 334 | location_to_reads (dict): chrom:strand:start:end => [read1_aln, real2_aln, ..] 335 | """ 336 | # chrom:+/- => [intv1_1, intv1_2, intv2_1, intv2_2] 337 | genomic_cluster_dict = defaultdict(list) 338 | # read_qname => [aln1, aln2, ..] 
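	# An illustrative note on the flattened layout used here (values are made up):
	# `genomic_cluster_dict[chrom:strand]` stores cluster boundaries as one flat,
	# sorted list [start1, end1, start2, end2, ...], so that
	# `bisect.bisect_right(boundaries, site)` returns an odd index exactly when
	# `site` falls inside a cluster; that is the `idx % 2` check in
	# `build_read_cluster`. For example:
	#   boundaries = [100, 151, 300, 381]      # clusters spanning 100..151 and 300..381
	#   bisect.bisect_right(boundaries, 120)   # -> 1, odd: inside the first cluster
	#   bisect.bisect_right(boundaries, 200)   # -> 2, even: between clusters
	# (the per-read alignment lists themselves are collected in `mread_dict` just below)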
339 | mread_dict = defaultdict(list) 340 | # chrom:+/-:start:end => [read1_aln, read2_aln,] 341 | location_to_reads = defaultdict(list) 342 | chr_list=[x['SN'] for x in mbam.header['SQ']] 343 | chr_size=[x['LN'] for x in mbam.header['SQ']] 344 | chr_dict = {'name':chr_list, 'size':chr_size} 345 | logger.info('finding genomic clusters') 346 | for chrom in chr_list: 347 | ## initialze the placeholder for current positive/negative strand clusters 348 | ## pos/neg: [start, end, tag_counter] 349 | cur_cluster = {'+':[0,0,0], '-':[0,0,0]} 350 | cur_cluster_aln = {'+':[], '-':[]} 351 | for read_alignment in mbam.fetch(chrom): 352 | ## should filter out junction reads in tagging step 353 | #if 'N' in read_alignment.cigarstring: 354 | # continue 355 | ## add current alignment to mread_dict 356 | mread_dict[read_alignment.qname].append(read_alignment) 357 | site = read_alignment.opt('RT') 358 | strand = '-' if read_alignment.is_reverse and not unstranded else '+' 359 | ### if this read is within the window size 360 | if site <= cur_cluster[strand][1]+winsize: 361 | if site < cur_cluster[strand][0]: 362 | cur_cluster[strand][0] = site 363 | if site > cur_cluster[strand][1]: 364 | cur_cluster[strand][1] = site 365 | cur_cluster[strand][2] += 1 366 | cur_cluster_aln[strand].append(read_alignment) 367 | ### otherwise, push the current cluster to `genomic_cluster_dict` 368 | else: 369 | if cur_cluster[strand][2] > 0: 370 | genomic_cluster_dict[chrom+':'+strand].extend( 371 | [ 372 | cur_cluster[strand][0], 373 | cur_cluster[strand][1]+1 374 | ] 375 | ) 376 | genomic_cluster_str = ':'.join([chrom, strand, str(cur_cluster[strand][0]), str(cur_cluster[strand][1]+1) ]) 377 | location_to_reads[genomic_cluster_str].extend(cur_cluster_aln[strand]) 378 | cur_cluster[strand] = [site, site+1, 1] 379 | cur_cluster_aln[strand] = [read_alignment] 380 | ## remember to push the last genomic cluster to dict 381 | if cur_cluster['+'][2] > 0: 382 | genomic_cluster_dict[chrom+':+'].extend([cur_cluster['+'][0],cur_cluster['+'][1]+1]) 383 | genomic_cluster_str = ':'.join([chrom, '+', str(cur_cluster['+'][0]), str(cur_cluster['+'][1]+1) ]) 384 | location_to_reads[genomic_cluster_str].extend(cur_cluster_aln['+']) 385 | if cur_cluster['-'][2] > 0: 386 | genomic_cluster_dict[chrom+':-'].extend([cur_cluster['-'][0],cur_cluster['-'][1]+1]) 387 | genomic_cluster_str = ':'.join([chrom, '-', str(cur_cluster['-'][0]), str(cur_cluster['-'][1]+1) ]) 388 | location_to_reads[genomic_cluster_str].extend(cur_cluster_aln['-']) 389 | 390 | return genomic_cluster_dict, mread_dict, location_to_reads 391 | 392 | 393 | 394 | def realigner(in_bam, out_dir, max_hits=100, max_tags=-1, read_tagger_method='median', 395 | winsize=50, unstranded=False, retag=False, lib_type="sense"): 396 | """The main entry for CLAM-realigner. 397 | 398 | Args: 399 | in_bam (str): filepath for input bam 400 | out_dir (str): filepath for CLAM output folder 401 | max_hits (int): maximum number of aligned loci allowed for mreads 402 | max_tags (int): maximum number of identical alignments allowed for each 403 | genomic locus, more amount will be collapsed; -1 is no collapsing 404 | read_tagger_method (str): the tagger function type 405 | winsize (int): window size 406 | unstranded (bool): ignore alignment strand info if turned on 407 | retag (bool): force to call `preprocessor` to process `in_bam` if turned on 408 | lib_type (str): specifies if the expected read alignment strand is `sense` with 409 | transcript strand, or `antisense`, or `unstranded`. 
410 | 411 | Returns: 412 | None 413 | """ 414 | # logging the parameter values 415 | frame = inspect.currentframe() 416 | args, _, _, values = inspect.getargvalues(frame) 417 | msg = 'Params:\n' 418 | for i in args: 419 | msg += "%s = %s \n"%(i, values[i]) 420 | logger.info(msg) 421 | # preprocessing 422 | if retag or not ( 423 | os.path.isfile(os.path.join(out_dir,'unique.sorted.bam')) and \ 424 | os.path.isfile(os.path.join(out_dir,'multi.sorted.bam')) \ 425 | ) : 426 | filter_bam_multihits(in_bam, max_tags=max_tags, max_hits=max_hits, out_dir=out_dir, read_tagger_method=read_tagger_method, 427 | lib_type=lib_type) 428 | else: 429 | logger.info("found existing bams; skipped tagging.") 430 | 431 | # file handlers 432 | if max_tags>0: 433 | mbam = pysam.Samfile(os.path.join(out_dir, 'multi.sorted.collapsed.bam'),'rb') 434 | ubam = pysam.Samfile(os.path.join(out_dir, 'unique.sorted.collapsed.bam'),'rb') 435 | else: 436 | mbam = pysam.Samfile(os.path.join(out_dir, 'multi.sorted.bam'),'rb') 437 | ubam = pysam.Samfile(os.path.join(out_dir, 'unique.sorted.bam'),'rb') 438 | obam = pysam.Samfile(os.path.join(out_dir, 'realigned.bam'), 'wb', template = mbam) 439 | chr_list=[x['SN'] for x in ubam.header['SQ']] 440 | chr_size=[x['LN'] for x in mbam.header['SQ']] 441 | chr_dict = {'name':chr_list, 'size':chr_size} 442 | 443 | # construct the `mread_dict`, this will be needed throughout the program; 444 | # also construct the genomic cluster dict and cluster to alignment, 445 | # by going through all mreads at once 446 | genomic_cluster_dict, mread_dict, location_to_reads = get_genomic_clusters(mbam, winsize=winsize, unstranded=unstranded) 447 | logger.debug('found %i mreads @ %i locations' % ( len(mread_dict), len(location_to_reads) ) ) 448 | 449 | # keep a record of processed reads 450 | processed_mreads = set() 451 | 452 | # iterate through all mreads 453 | logger.info('running em') 454 | subg_counter = 0 455 | for read_qname in mread_dict: 456 | if read_qname in processed_mreads: 457 | continue 458 | 459 | ## construct the fully-connected subgraph for each read 460 | read_to_locations, processed_mreads = \ 461 | construct_subgraph(location_to_reads, read_qname, mread_dict, processed_mreads, chr_dict, \ 462 | genomic_cluster_dict, winsize=winsize, unstranded=unstranded) 463 | subgraph = set() 464 | for read in read_to_locations: 465 | _ = deque(map(subgraph.add, read_to_locations[read].keys())) 466 | subgraph = list(subgraph) 467 | #if len(subgraph)==1 and len(read_to_locations)>10: 468 | # raise Exception('Incorrect mread assigned to one location') 469 | if len(subgraph)==0: 470 | continue 471 | subg_counter += 1 472 | logger.debug("subgraph %i: |e|=%i, |v|=%i"%(subg_counter, len(read_to_locations), len(subgraph)) ) 473 | 474 | ## build the BIT tracks 475 | node_track, multi_reads_weights = \ 476 | construct_BIT_track(subgraph, read_to_locations, ubam, unstranded) 477 | 478 | ## run EM 479 | multi_reads_weights = \ 480 | run_EM(node_track, multi_reads_weights, w=winsize) 481 | 482 | ## write to obam 483 | for read in multi_reads_weights: 484 | for node in multi_reads_weights[read]: 485 | alignment = read_to_locations[read][node] 486 | score = round(multi_reads_weights[read][node][0], 3) 487 | alignment.set_tag('AS', score) 488 | #alignment.set_tag('PG', 'CLAM') 489 | obam.write(alignment) 490 | # sort the final output 491 | logger.info('sorting output') 492 | obam.close() 493 | ubam.close() 494 | mbam.close() 495 | obam_sorted_fn = os.path.join(out_dir, 'realigned.sorted.bam') 496 | 
pysam.sort('-o', obam_sorted_fn, os.path.join(out_dir, 'realigned.bam')) 497 | pysam.index(obam_sorted_fn) 498 | os.remove(os.path.join(out_dir, 'realigned.bam')) 499 | return 500 | 501 | 502 | 503 | def parser(args): 504 | """The command-line parser for CLAM-realigner 505 | Args: 506 | args (argparse.ArgumentParser): receives commandline arguments 507 | Returns: 508 | None 509 | """ 510 | try: 511 | in_bam = args.in_bam 512 | out_dir = args.out_dir 513 | if not os.path.isdir(out_dir): 514 | os.mkdir(out_dir) 515 | tag_method = args.tag_method 516 | max_hits = args.max_hits 517 | max_tags = args.max_tags 518 | retag = args.retag 519 | winsize = args.winsize 520 | lib_type = args.lib_type 521 | unstranded = lib_type == "unstranded" 522 | 523 | logger.info('start') 524 | logger.info('run info: %s'%(' '.join(sys.argv))) 525 | 526 | realigner(in_bam, out_dir, max_hits=max_hits, max_tags=max_tags, read_tagger_method=tag_method, 527 | winsize=winsize, unstranded=unstranded, retag=retag, lib_type=lib_type) 528 | 529 | logger.info('end') 530 | except KeyboardInterrupt: 531 | sys.exit(0) 532 | return 533 | -------------------------------------------------------------------------------- /CLAM/stats/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xinglab/CLAM/aed16a4d4e56535e17302448c48f32d35cba14cd/CLAM/stats/__init__.py -------------------------------------------------------------------------------- /CLAM/stats/bin_test_alternatives.py: -------------------------------------------------------------------------------- 1 | """ A script for storing alternative peak calling 2 | statistical models / bin tests other than negative binomial 3 | Zijun Zhang 4 | Last revisited: 9.6.2017 5 | """ 6 | 7 | 8 | def test_bin_poisson(intv_bin_ip, intv_bin_con, correction_method='fdr_bh'): 9 | """DOCSTRING 10 | Args 11 | Returns 12 | """ 13 | def _par_to_vec(par, data, is_constrained): 14 | if is_constrained: 15 | beta = par[0] 16 | mu_vec = par[1::] 17 | delta = 0 18 | else: 19 | beta, delta = par[0], par[1] 20 | mu_vec = par[2::] 21 | ip_counter = data['this_ip'].shape[0] 22 | con_counter = data['this_con'].shape[0] 23 | mu0 = np.asarray(mu_vec[0:con_counter]) 24 | mu1 = np.asarray(mu_vec[con_counter::]) 25 | lamb1_this = np.exp(mu1 + beta + delta) 26 | lamb1_others = np.exp(mu1) 27 | lamb0_this = np.exp(mu0 + beta) 28 | lamb0_others = np.exp(mu0) 29 | return (lamb1_this, lamb1_others, lamb0_this, lamb0_others) 30 | 31 | def _neg_loglik_unconstrain(par, data): 32 | (l1, l2, l3, l4) = _par_to_vec(par, data, False) 33 | ll = np.sum(poisson.logpmf(data['this_ip'], mu=l1)) + \ 34 | np.sum(poisson.logpmf(data['others_ip'], mu=l2)) + \ 35 | np.sum(poisson.logpmf(data['this_con'], mu=l3)) + \ 36 | np.sum(poisson.logpmf(data['others_con'], mu=l4)) 37 | return -ll 38 | 39 | def _neg_loglik_constrain(par, data): 40 | (l1, l2, l3, l4) = _par_to_vec(par, data, True) 41 | ll = np.sum(poisson.logpmf(data['this_ip'], mu=l1)) + \ 42 | np.sum(poisson.logpmf(data['others_ip'], mu=l2)) + \ 43 | np.sum(poisson.logpmf(data['this_con'], mu=l3)) + \ 44 | np.sum(poisson.logpmf(data['others_con'], mu=l4)) 45 | return -ll 46 | 47 | intv_counter = intv_bin_ip.shape[1] 48 | assert intv_counter == intv_bin_con.shape[1] 49 | binscore = np.empty(intv_counter) 50 | binsignal = np.empty(intv_counter) 51 | ip_sum = np.apply_along_axis(np.sum, 1, intv_bin_ip) 52 | con_sum = np.apply_along_axis(np.sum, 1, intv_bin_con) 53 | for i in range(intv_counter): 54 | this_ip = 
intv_bin_ip[:, i] 55 | others_ip = ip_sum - this_ip 56 | this_con = intv_bin_con[:, i] 57 | others_con = con_sum - this_con 58 | if this_ip == 0: 59 | binsignal[i], binscore[i] = np.nan, 1.0 60 | continue 61 | ## because Poisson (and other count-based methods) only 62 | ## takes integers, here we take the floor of the fractional 63 | ## multi-reads as a conservative approach 64 | data = { 65 | 'this_ip':np.floor(this_ip), 66 | 'others_ip':np.floor(others_ip), 67 | 'this_con':np.floor(this_con), 68 | 'others_con':np.floor(others_con) 69 | } 70 | 71 | res_constrain = optimize.minimize( 72 | x0=np.ones(1+this_ip.shape[0]+others_ip.shape[0]), 73 | fun=_neg_loglik_constrain, 74 | args=(data), 75 | method='Nelder-Mead', 76 | options={'disp':False} 77 | ) 78 | 79 | res_unconstrain = optimize.minimize( 80 | x0=np.ones(2+this_ip.shape[0]+others_ip.shape[0]), 81 | fun=_neg_loglik_unconstrain, 82 | args=(data), 83 | method='Nelder-Mead', 84 | options={'disp':False} 85 | ) 86 | 87 | delta_mle = res_unconstrain.x[1] 88 | pval = 1 - chi2.cdf(2*(res_constrain.fun - res_unconstrain.fun), 1) 89 | binscore[i] = pval 90 | binsignal[i] = delta_mle 91 | adj = multipletests(binscore, alpha=0.05, method=correction_method) 92 | binscore_adj = adj[1] 93 | return binsignal, binscore_adj 94 | 95 | 96 | def test_bin_fisher(intv_bin_ip, intv_bin_con, with_control=True, correction_method='fdr_bh'): 97 | """DOCSTRING 98 | Args 99 | Returns 100 | """ 101 | if intv_bin_ip.shape[0] != 1: 102 | raise Exception('Fisher exact test does not deal with replicates.') 103 | intv_counter = intv_bin_ip.shape[1] 104 | assert intv_counter == intv_bin_con.shape[1] 105 | binscore = np.empty(intv_counter) 106 | binsignal = np.empty(intv_counter) 107 | ip_sum = np.sum(intv_bin_ip[0,]) 108 | con_sum = np.sum(intv_bin_con[0,]) 109 | for i in range(intv_counter): 110 | this_ip = intv_bin_ip[0, i] 111 | others_ip = ip_sum - this_ip 112 | this_con = intv_bin_con[0, i] 113 | others_con = con_sum - this_con 114 | if this_ip == 0: 115 | binsignal[i], binscore[i] = np.nan, 1.0 116 | continue 117 | _, binscore[i] = fisher_exact([[this_ip, others_ip], [this_con, others_con]], alternative='greater') 118 | if with_control: 119 | binsignal[i] = this_ip/others_ip / this_con*others_con 120 | else: 121 | binsignal[i] = this_ip 122 | 123 | adj = multipletests(binscore, alpha=0.05, method=correction_method) 124 | binscore_adj = adj[1] 125 | return binsignal, binscore_adj 126 | -------------------------------------------------------------------------------- /CLAM/stats/ztnb_em.py: -------------------------------------------------------------------------------- 1 | import scipy.special as special 2 | import numpy as np 3 | from numpy.random import negative_binomial 4 | import mpmath 5 | from collections import defaultdict 6 | 7 | 8 | ############################## 9 | ## distribution characterizing functions 10 | ############################## 11 | 12 | def trunc_logLik(data, mu, alpha): 13 | log_1_plus_a_mu = np.log(1 + alpha*mu) 14 | log_1_minus_prob_zero = np.log(1.0 - np.exp(-np.log(1.0+alpha*mu)/alpha)) 15 | alpha_inv = 1.0/alpha 16 | lim = int(np.max(data.keys())) 17 | holding_val=0.0 18 | log_L=0.0 19 | for i in range(1, lim+1): 20 | holding_val += np.log(1+alpha*(i-1)) 21 | log_L += data[i]* (holding_val - special.gammaln(i) + i*np.log(mu)-(i+alpha_inv)*log_1_plus_a_mu - log_1_minus_prob_zero) 22 | return log_L 23 | 24 | def ztnb_pmf(y, mu, alpha): 25 | r = 1.0 / alpha 26 | if y <= 0: 27 | raise Exception('y must be larger than 0.') 28 | p = 
mu/(mu+r+0.0) 29 | ztnbin_mpmath = lambda y, p, r: mpmath.gamma(y + r)/(mpmath.gamma(y+1)*mpmath.gamma(r))*np.power(1-p, r)*np.power(p, y)/(1-np.power(1-p, r)) 30 | ztnbin = np.frompyfunc(ztnbin_mpmath, 3, 1) 31 | return float(ztnbin(y, p, r)) 32 | 33 | def ztnb_cdf(y, mu, alpha): 34 | r = 1.0/alpha 35 | if y <= 0: 36 | raise Exception('y must be larger than 0.') 37 | p = mu/(mu+r+0.0) 38 | F_ztnb = ( 1 - special.btdtr(y+1, r, p) - np.power(1-p, r) ) / (1-np.power(1-p,r)) 39 | return F_ztnb 40 | 41 | def ztnb_pval(y, mu, alpha): 42 | pval = 1 - ztnb_cdf(y, mu, alpha) + ztnb_pmf(y, mu, alpha) 43 | if pval <= 10**-5: 44 | return 0 45 | else: 46 | return pval 47 | 48 | def rztnb(mu=3, alpha=0.5, size=100): 49 | r = 1.0/alpha 50 | p = mu/(mu+r+0.0) 51 | ztnb=[] 52 | while(len(ztnb)0: 55 | ztnb.append(x) 56 | return ztnb 57 | 58 | def collapse_data(data): 59 | col_data = defaultdict(int) 60 | for i in data: 61 | col_data[i] += 1 62 | return col_data 63 | 64 | ############################## 65 | ## parameter estimation functions 66 | ############################## 67 | 68 | def EM_estim_params(height, tol = 10**-4, max_iter = 1000, verbose = False, mu = None, alpha = None): 69 | tot_size = np.sum([height[x] for x in height if x>0]) 70 | error = 10000 71 | prev_score = 10000 72 | score = 0.0 73 | if mu is None or alpha is None: 74 | mu = np.sum([x*height[x] for x in height])/(tot_size+0.0) 75 | var = np.sum([ height[x]*(x-mu)**2 for x in height]) / tot_size 76 | alpha = (var - mu) / (mu * mu) 77 | #mu = 0.01 78 | #alpha = 100 79 | for h in range(1, int(max(height.keys()))+1): 80 | if not h in height: 81 | height[h]=0 82 | 83 | for i in range(1, (max_iter+1)): 84 | height[0] = expected_zeros(tot_size, mu, alpha) 85 | mu, alpha = estim_params(height, tol) 86 | height[0] = 0 87 | score = trunc_logLik(height, mu, alpha) 88 | if score == 0: 89 | raise ZeroDivisionError('invalid loglik function value') 90 | error = abs((score - prev_score)/score) 91 | if verbose: 92 | print('Iter ' + str(i) + ': eps = ' + str(error) + '; mu = ' + str(mu) + '; alpha = ' + str(alpha)) 93 | if(error < tol): 94 | break 95 | prev_score = score 96 | return (trunc_logLik(height, mu, alpha), mu, alpha) 97 | 98 | def expected_zeros(pseudo_size, mu, alpha): 99 | min_allowed_alpha=10**-4 100 | max_allowed_prob_zero=0.99 101 | if alpha < min_allowed_alpha: 102 | prob_zero = max_allowed_prob_zero 103 | else: 104 | prob_zero = np.min([np.power(1.0+alpha*mu, -1.0/alpha), 0.99]) 105 | expected_zeros = int(pseudo_size*(prob_zero/(1-prob_zero))) 106 | return expected_zeros 107 | 108 | 109 | def estim_params(pseudo_hist, tolerance = 10**-4): 110 | min_allowed_alpha = 10**-3 111 | max_allowed_alpha = 1000 112 | 113 | mu = compute_mean(pseudo_hist) 114 | pseudo_size = np.sum(pseudo_hist.values()) 115 | 116 | a_low = min_allowed_alpha 117 | a_high = max_allowed_alpha 118 | 119 | diff = 10000 120 | prev_val = 10000 121 | 122 | while diff > tolerance and movement(a_high, a_low) > tolerance: 123 | a_mid = (a_low + a_high)/2 124 | mid_val = alpha_score_function(pseudo_hist, mu, a_mid, pseudo_size) 125 | #print str(a_mid) + '; ' + str(mid_val) + '; ' + str(trunc_logLik(pseudo_hist, mu, a_mid)) 126 | if (mid_val < 0): 127 | a_high = a_mid 128 | else: 129 | a_low = a_mid 130 | diff = np.abs((prev_val - mid_val)/prev_val) 131 | prev_val = mid_val 132 | 133 | alpha = a_mid 134 | return mu, alpha 135 | 136 | def alpha_score_function(vals_hist, mean, a_mid, vals_count): 137 | one_plus_alpha_mu = 1.0 + a_mid*mean 138 | return 
(score_fun_first_term(vals_hist, a_mid)/(vals_count+0.0) + (np.log(one_plus_alpha_mu)/(a_mid+0.0) - mean)/(a_mid+0.0)) 139 | 140 | def score_fun_first_term(vals_hist,a_mid): 141 | sum = 0.0 142 | lim = int(np.max(vals_hist.keys())) 143 | for i in range(0, lim+1): 144 | if (vals_hist[i] > 0): 145 | inner_sum = 0.0 146 | for j in range(0, i): 147 | inner_sum += j/(1.0 + a_mid*j) 148 | sum += vals_hist[i]*inner_sum 149 | 150 | return sum 151 | 152 | 153 | ############################## 154 | ## in-line functions 155 | ############################## 156 | 157 | def compute_mean(height): 158 | tot_size = np.sum(height.values()) 159 | mean = np.sum([x*height[x] for x in height])/(tot_size + 0.0) 160 | return(mean) 161 | 162 | def movement(a, b): 163 | return abs(a - b)/max(a, b) 164 | 165 | ############################## 166 | ## testing function 167 | ############################## 168 | 169 | def test(size=10**3, mu=0.01, alpha=50, max_iter=100): 170 | data=rztnb(mu, alpha, size) 171 | height=collapse_data(data) 172 | return EM_estim_params(height, max_iter=max_iter, verbose=True) -------------------------------------------------------------------------------- /CLAM/utils/parseBAM.py: -------------------------------------------------------------------------------- 1 | """ parseBAM from Yan Gao 2 | https://github.com/yangao07/pyParseBAM/blob/master/parse_bam.py 3 | Author: Yan Gao 4 | Date: 1.9.2018 5 | """ 6 | 7 | import sys, re 8 | import pysam as ps 9 | import utils as ut 10 | 11 | #### cigar operation: 12 | BAM_CMATCH = 0 # M 13 | BAM_CINS = 1 # I 14 | BAM_CDEL = 2 # D 15 | BAM_CREF_SKIP = 3 # N 16 | BAM_CSOFT_CLIP = 4 # S 17 | BAM_CHARD_CLIP = 5 # H 18 | BAM_CPAD = 6 # P 19 | BAM_CEQUAL = 7 # = 20 | BAM_CDIFF = 8 # X 21 | BAM_CBACK = 9 # B 22 | 23 | 24 | # cigar stats: 25 | # M BAM_CMATCH 0 26 | # I BAM_CINS 1 27 | # D BAM_CDEL 2 28 | # N BAM_CREF_SKIP 3 29 | # S BAM_CSOFT_CLIP 4 30 | # H BAM_CHARD_CLIP 5 31 | # P BAM_CPAD 6 32 | # = BAM_CEQUAL 7 33 | # X BAM_CDIFF 8 34 | # B BAM_CBACK 9 35 | # NM NM tag 10 36 | def get_ref_op_length(cigar_stats=[]): 37 | # get op length for MDNP=X 38 | op_len = cigar_stats[0] + cigar_stats[2] + cigar_stats[3] + cigar_stats[6] + cigar_stats[7] + cigar_stats[8] 39 | return op_len 40 | 41 | 42 | def get_read_op_length(cigar_stats): 43 | # get op length for MISH=X 44 | op_len = cigar_stats[0] + cigar_stats[1] + cigar_stats[4] + cigar_stats[5] + cigar_stats[7] + cigar_stats[8] 45 | return op_len 46 | 47 | 48 | def get_aligned_read_length(cigar_stats): 49 | # get op length for MI=X 50 | op_len = cigar_stats[0] + cigar_stats[1] + cigar_stats[7] + cigar_stats[8] 51 | return op_len 52 | 53 | 54 | def minipulate_cigar(r=ps.AlignedSegment, old='', new=''): 55 | r.cigarstring = re.sub(r'%s' % old, new, r.cigarstring) 56 | 57 | 58 | def get_spec_MD(mdstr='', start=0, end=0): # '23AC20T', 10, 30 => ' 13AC5 59 | mSub = re.sub(r'([ACGTNacgtn])', ' \\1 ', mdstr) 60 | mSplit = re.split('[ ]+', mSub) 61 | start_remain_len = start 62 | end_remain_len = end - start 63 | ret_md = [] 64 | mi = 0 65 | # print mSplit, start_remain_len, end_remain_len 66 | while start_remain_len > 0: 67 | if mSplit[mi].isdigit(): 68 | if int(mSplit[mi]) > start_remain_len: 69 | mSplit[mi] = str(int(mSplit[mi]) - start_remain_len) 70 | start_remain_len = 0 71 | break 72 | else: 73 | start_remain_len -= int(mSplit[mi]) 74 | else: # isalpha() 75 | start_remain_len -= 1 76 | mi += 1 77 | while end_remain_len > 0: 78 | if mSplit[mi].isdigit(): 79 | if int(mSplit[mi]) >= end_remain_len: 80 | 
ret_md.append(str(end_remain_len)) 81 | end_remain_len = 0 82 | break 83 | else: 84 | end_remain_len -= int(mSplit[mi]) 85 | ret_md.append(mSplit[mi]) 86 | else: # isalpha() 87 | end_remain_len -= 1 88 | ret_md.append(mSplit[mi]) 89 | mi += 1 90 | # print ret_md 91 | return ret_md 92 | 93 | 94 | # MISMATCH: read_pos(first), ref_pos(first), len, read_base, ref_base 95 | # INSERTION: ins_read_pos(first), ins_ref_pos(left), len, ins_base 96 | # DELETION: del_read_pos(left), del_ref_pos(first), len, del_base 97 | def get_error_from_MD(cigartuples=[], mdstr='', full_query_seq='', ref_start=0): 98 | mis, ins, dele = [], [], [] 99 | last_error = '' 100 | md_i, m_pos = 0, 0 101 | mdSub = re.sub(r'([\\^][ACGTNacgtn]+)[0]*', ' \\1 ', mdstr) 102 | mdSplit = mdSub.rsplit() 103 | ref_pos, query_pos = ref_start, 0 104 | 105 | for tuples in cigartuples: 106 | if tuples[0] == BAM_CMATCH: 107 | m = mdSplit[md_i] 108 | 109 | if m.startswith('^'): 110 | ut.format_time(sys.stderr, 'get_error_from_MD', 'Unexpected MD string: {}\n'.format(mdstr)) 111 | sys.exit(1) 112 | mSub = re.sub(r'([ACGTNacgtn])', ' \\1 ', m) 113 | m_len = sum(map(int, (re.sub(r'([ACGTNacgtn])', '1', mSub)).rsplit())) 114 | 115 | # from m_pos to m_pos + tuples[1] 116 | sub_ms = get_spec_MD(m, m_pos, m_pos + tuples[1]) 117 | for ms in sub_ms: 118 | if ms.isalpha(): # MISMATCH 119 | if last_error != 'MIS' or mis[-1][0] != query_pos - 1: 120 | mis_error = [query_pos, ref_pos, 1, full_query_seq[query_pos], ms] 121 | mis.append(mis_error) 122 | else: # last_error == 'MIS' and mis[-1][1] == ap[0] - 1: 123 | mis[-1][-3] += 1 124 | mis[-1][-2] += full_query_seq[query_pos] 125 | mis[-1][-1] += ms 126 | query_pos += 1 127 | ref_pos += 1 128 | last_error = 'MIS' 129 | elif ms.isdigit(): # MATCH 130 | query_pos += int(ms) 131 | ref_pos += int(ms) 132 | 133 | if m_pos + tuples[1] == m_len: 134 | md_i += 1 135 | m_pos = 0 136 | elif m_pos + tuples[1] < m_len: 137 | m_pos += tuples[1] 138 | else: # 139 | ut.format_time(sys.stderr, 'get_error_from_MD', 'Unexpected MD string: {}\n'.format(mdstr)) 140 | sys.exit(1) 141 | elif tuples[0] == BAM_CDEL: 142 | m = mdSplit[md_i] 143 | if not m.startswith('^'): 144 | ut.format_time(sys.stderr, 'get_error_from_MD', 'Unexpected MD string: {}\n'.format(mdstr)) 145 | sys.exit(1) 146 | del_error = [query_pos - 1, ref_pos, tuples[1], m[1:]] 147 | dele.append(del_error) 148 | ref_pos += tuples[1] 149 | last_error = 'DEL' 150 | md_i += 1 151 | elif tuples[0] == BAM_CINS: 152 | ins_error = [query_pos, ref_pos - 1, tuples[1], full_query_seq[query_pos:query_pos + tuples[1]]] 153 | ins.append(ins_error) 154 | query_pos += tuples[1] 155 | last_error = 'INS' 156 | elif tuples[0] == BAM_CSOFT_CLIP or tuples[0] == BAM_CHARD_CLIP: 157 | query_pos += tuples[1] 158 | elif tuples[0] == BAM_CREF_SKIP: 159 | ref_pos += tuples[1] 160 | else: 161 | ut.format_time(sys.stderr, 'get_error_from_MD', 'Unexpected cigar: {}\n'.format(cigartuples)) 162 | sys.exit(1) 163 | 164 | return mis, ins, dele 165 | 166 | 167 | def get_error_from_Cigar(cigartuples=[], full_query_seq='', align_ref_seq='', ref_start=0): 168 | mis, ins, dele = [], [], [] 169 | last_error = '' 170 | ref_pos, query_pos = ref_start, 0 171 | for tuples in cigartuples: 172 | if tuples[0] == BAM_CMATCH: 173 | for q, r in zip(full_query_seq[query_pos:query_pos + tuples[1]], 174 | align_ref_seq[ref_pos - ref_start:ref_pos - ref_start + tuples[1]]): 175 | if q != r: # MISMATCH 176 | if last_error != 'MIS' or mis[-1][0] != query_pos - 1: 177 | mis_error = [query_pos, ref_pos, 1, q, r] 
178 | mis.append(mis_error) 179 | else: # last_error == 'MIS' and mis[-1][1] == ap[0] - 1: 180 | mis[-1][-3] += 1 181 | mis[-1][-2] += q 182 | mis[-1][-1] += r 183 | last_error = 'MIS' 184 | ref_pos += 1 185 | query_pos += 1 186 | elif tuples[0] == BAM_CDEL: 187 | del_error = [query_pos - 1, ref_pos, tuples[1], 188 | align_ref_seq[ref_pos - ref_start:ref_pos - ref_start + tuples[1]]] 189 | dele.append(del_error) 190 | last_error = 'DEL' 191 | ref_pos += tuples[1] 192 | elif tuples[0] == BAM_CINS: 193 | ins_error = [query_pos, ref_pos - 1, tuples[1], full_query_seq[query_pos:query_pos + tuples[1]]] 194 | ins.append(ins_error) 195 | last_error = 'INS' 196 | query_pos += tuples[1] 197 | elif tuples[0] == BAM_CHARD_CLIP or tuples[0] == BAM_CSOFT_CLIP: 198 | query_pos += tuples[1] 199 | elif tuples[0] == BAM_CREF_SKIP: 200 | ref_pos += tuples[1] 201 | else: 202 | ut.format_time(sys.stderr, 'get_error_from_Cigar', 'Unexpected cigar: {}\n'.format(cigartuples)) 203 | sys.exit(1) 204 | 205 | return mis, ins, dele 206 | 207 | 208 | def get_align_block(cigartuples=[], ref_start=0): 209 | align_block = [] 210 | start = ref_start 211 | end = ref_start - 1 212 | for tuples in cigartuples: 213 | if tuples[0] == BAM_CMATCH or tuples[0] == BAM_CDEL: 214 | end += tuples[1] 215 | elif tuples[0] == BAM_CREF_SKIP: 216 | align_block.append((start, end)) 217 | start = end + tuples[1] + 1 218 | end = start -1 219 | align_block.append((start, end)) 220 | 221 | return align_block 222 | 223 | 224 | # MISMATCH: read_pos(first), ref_pos(first), len, read_base, ref_base 225 | # INSERTION: ins_read_pos(first), ins_ref_pos(left), len, ins_base 226 | # DELETION: del_read_pos(left), del_ref_pos(first), len, del_base 227 | def get_align_detial(r, ref_fa): 228 | if r.cigartuples[0][0] == BAM_CSOFT_CLIP or r.cigartuples[0][0] == BAM_CHARD_CLIP: 229 | left_clip = r.cigartuples[0][1] 230 | else: 231 | left_clip = 0 232 | if r.cigartuples[-1][0] == BAM_CSOFT_CLIP or r.cigartuples[-1][0] == BAM_CHARD_CLIP: 233 | right_clip = r.cigartuples[-1][1] 234 | else: 235 | right_clip = 0 236 | 237 | if r.has_tag('MD'): 238 | mdstr = r.get_tag('MD') 239 | mis_err, ins_err, dele_err = get_error_from_MD(r.cigartuples, mdstr, r.query_sequence, r.reference_start) 240 | else: 241 | ref = ps.FastaFile(ref_fa) 242 | align_ref_seq = ref.fetch(r.reference_name, r.reference_start, r.reference_start + r.reference_length) 243 | mis_err, ins_err, dele_err = get_error_from_Cigar(r.cigartuples, r.query_sequence, align_ref_seq, 244 | r.reference_start) 245 | return [r.is_reverse, r.infer_read_length(), r.reference_start, r.reference_length, left_clip, right_clip, mis_err, 246 | ins_err, dele_err] -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate -------------------------------------------------------------------------------- /bin/CLAM: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | """CLIP-seq Analysis of Multi-mapped reads 5 | 6 | This is the main entry for CLAM. CLAM is a comprehensive peak caller for CLIP/RIP-seq 7 | data that considers both uniquely-mapped and multi-mapped reads. 
8 | 9 | Example: 10 | $ CLAM [realigner|peakcaller] 11 | 12 | Authors: 13 | Zijun Zhang 14 | Yi Xing 15 | 16 | Citation: 17 | @article{zhang2017clip, 18 | title={CLIP-seq analysis of multi-mapped reads discovers novel functional RNA regulatory sites in the human transcriptome}, 19 | author={Zhang, Zijun and Xing, Yi}, 20 | journal={Nucleic Acids Research}, 21 | year={2017} 22 | } 23 | 24 | Todo: 25 | add `visualize` and `evaluate` subcommands 26 | 27 | This program is free software: you can redistribute it and/or modify it under 28 | the terms of the GNU General Public License as published by the Free Software 29 | Foundation, either version 3 of the License, or (at your option) any later 30 | version 31 | """ 32 | 33 | from CLAM import config 34 | import os 35 | import sys 36 | import logging 37 | import argparse as ap 38 | import datetime 39 | 40 | 41 | def main(): 42 | """main entry for CLAM 43 | This function setup the logging and handle the input options 44 | Args 45 | None 46 | Returns 47 | None 48 | """ 49 | logger = setup_logger() 50 | argparser = get_arg_parser() 51 | args = argparser.parse_args() 52 | 53 | subcommand = args.subcommand 54 | 55 | if subcommand == 'preprocessor': 56 | from CLAM import preprocessor 57 | preprocessor.parser( args ) 58 | elif subcommand == 'realigner': 59 | from CLAM import realigner 60 | #print args 61 | realigner.parser( args ) 62 | 63 | elif subcommand == 'peakcaller': 64 | from CLAM import peakcaller 65 | #print args 66 | peakcaller.parser( args ) 67 | 68 | elif subcommand == 'permutation_callpeak': 69 | from CLAM import permutation_peakcaller 70 | permutation_peakcaller.parser( args ) 71 | 72 | elif subcommand == 'peak_annotator': 73 | from CLAM import peak_annotator 74 | peak_annotator.parser(args) 75 | 76 | elif subcommand == 'data_downloader': 77 | from CLAM import download_data 78 | download_data.parser(args) 79 | 80 | 81 | def setup_logger(): 82 | """Set up the logger for the whole pipeline 83 | Args 84 | None 85 | Returns 86 | logger: logging object 87 | """ 88 | # setup logger 89 | logger = logging.getLogger('CLAM') 90 | logger.setLevel(logging.DEBUG) 91 | # create file handler which logs even debug messages 92 | #fh = logging.FileHandler( 93 | # 'log.CLAM.'+'-'.join(str(datetime.datetime.now()).replace(':','-').split()) + '.txt') 94 | fh = logging.FileHandler('log.CLAM.txt') 95 | fh.setLevel(logging.INFO) 96 | # create console handler with a higher log level 97 | ch = logging.StreamHandler() 98 | ch.setLevel(logging.DEBUG) 99 | # create formatter and add it to the handlers 100 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -\n %(message)s') 101 | fh.setFormatter(formatter) 102 | ch.setFormatter(formatter) 103 | # add the handlers to the logger 104 | logger.addHandler(fh) 105 | logger.addHandler(ch) 106 | return logger 107 | 108 | 109 | def get_arg_parser(): 110 | """DOCSTRING 111 | Args 112 | Returns 113 | """ 114 | description = "%(prog)s -- CLip-seq Analysis of Multi-mapped reads" 115 | epilog = "For command line options of each sub-command, type: %(prog)s COMMAND -h" 116 | 117 | argparser = ap.ArgumentParser(description=description, epilog=epilog) 118 | argparser.add_argument("--version", action="version", version="%(prog)s "+config.__version__) 119 | 120 | subparsers = argparser.add_subparsers( dest="subcommand" ) 121 | 122 | # preprocessing 123 | add_preprocessor_parser(subparsers) 124 | 125 | # realigner 126 | add_realigner_parser(subparsers) 127 | 128 | # peakcaller 129 | add_peakcaller_parser(subparsers) 
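	# A sketch of how the subcommands registered here are typically chained on the
	# command line (file paths are placeholders; the peak filename written under
	# the peakcaller output folder depends on its options):
	#   CLAM realigner -i IP.bam -o clam_ip/ --read-tagger-method median
	#   CLAM realigner -i control.bam -o clam_ctrl/ --read-tagger-method median
	#   CLAM peakcaller -i clam_ip/unique.sorted.bam clam_ip/realigned.sorted.bam \
	#       -c clam_ctrl/unique.sorted.bam clam_ctrl/realigned.sorted.bam \
	#       -o clam_peaks/ --gtf annotation.gtf
	#   CLAM peak_annotator -i clam_peaks/<peak_file> -g hg19 -o annotated_peaks.txt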
130 | 131 | # permutation_callpeak 132 | add_permutation_callpeak_parser(subparsers) 133 | 134 | # peak_annotator 135 | add_peak_annotator_parser(subparsers) 136 | 137 | # data_downloader 138 | add_data_downloader_parser(subparsers) 139 | 140 | return argparser 141 | 142 | 143 | def add_preprocessor_parser( subparsers ): 144 | ag_prep = subparsers.add_parser("preprocessor", help="CLAM Preprocessor: tag read alignments to specific locus") 145 | 146 | # input/output 147 | ag_prep.add_argument("-i", "--input", dest="in_bam", type=str, required=True, 148 | help="Input bam file") 149 | 150 | ag_prep.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, 151 | help="Output folder") 152 | 153 | # processing 154 | ag_prep.add_argument("--read-tagger-method", dest="tag_method", type=str, 155 | choices= ('median', 'start'), default='median', 156 | help="Read tagger method, 'median' for read center, 'start' for read start site; default: median") 157 | 158 | ag_prep.add_argument("--max-multihits", dest="max_hits", type=int, default=100, 159 | help="The maximum hits allowed for multi-mapped reads; default: 100") 160 | 161 | ag_prep.add_argument("--max-tags", dest="max_tags", type=int, default=-1, 162 | help="The maximum identical tags at given location; default: -1, no filter") 163 | 164 | # lib_type 165 | ag_prep.add_argument("--lib-type", dest="lib_type", type=str, 166 | default="sense", choices=['sense', 'antisense', 'unstranded'], 167 | help="The expected read strandness with transcription direction: sense, antisense, or unstranded; default: sense") 168 | 169 | return 170 | 171 | 172 | def add_realigner_parser( subparsers ): 173 | ag_realigner = subparsers.add_parser("realigner", help="CLAM Realigner: realign multi-mapped reads using expectation-maximization") 174 | 175 | # input/output 176 | ag_realigner.add_argument("-i", "--input", dest="in_bam", type=str, required=True, 177 | help="Input bam file") 178 | 179 | ag_realigner.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, 180 | help="Output folder") 181 | 182 | # processing 183 | ag_realigner.add_argument("--read-tagger-method", dest="tag_method", type=str, 184 | choices= ('median', 'start'), default='median', 185 | help="Read tagger method, 'median' for read center, 'start' for read start site; default: median") 186 | 187 | ag_realigner.add_argument("--max-multihits", dest="max_hits", type=int, default=100, 188 | help="The maximum hits allowed for multi-mapped reads; default: 100") 189 | 190 | ag_realigner.add_argument("--max-tags", dest="max_tags", type=int, default=-1, 191 | help="The maximum identical tags at given location; default: -1, no filter") 192 | 193 | ag_realigner.add_argument("--retag", dest="retag", default=False, action='store_true', 194 | help="Retag the bam regardless when turned on; invalid when no previous files found") 195 | 196 | # realign 197 | ag_realigner.add_argument("--winsize", dest="winsize", type=int, default=50, 198 | help="Local window size for em computations; default: 50") 199 | 200 | #ag_realigner.add_argument("--unstranded", dest="unstranded", default=False, action="store_true", 201 | # help="Unstranded alignments if turned on") 202 | 203 | # lib_type 204 | ag_realigner.add_argument("--lib-type", dest="lib_type", type=str, 205 | default="sense", choices=['sense', 'antisense', 'unstranded'], 206 | help="The expected read strandness with transcription direction: sense, antisense, or unstranded; default: sense") 207 | 208 | return 209 | 210 | 211 | def 
add_peakcaller_parser( subparsers ): 212 | ag_peakcaller = subparsers.add_parser("peakcaller", help="CLAM Peakcaller: negative binomial model-based peak calling combining unique- and multi-reads") 213 | 214 | # input/output 215 | ag_peakcaller.add_argument("-i", "--input", dest="in_bam", nargs='+', type=str, required=True, 216 | help="Filepaths for IP bam files, e.g ubam1,ubam2 mbam1,mbam2") 217 | 218 | ag_peakcaller.add_argument("-c", "--control-dir", dest="con_bam", nargs='+', type=str, required=True, 219 | help="Filepaths for control bam files") 220 | 221 | ag_peakcaller.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, 222 | help="Output folder") 223 | 224 | ag_peakcaller.add_argument("--gtf", dest="gtf_fp", type=str, required=True, 225 | help="GTF filepath") 226 | 227 | # processing 228 | ag_peakcaller.add_argument("-p", "--nthread", dest="nthread", type=int, default=8, 229 | help="Number of threads; default: 8") 230 | 231 | ag_peakcaller.add_argument("-u", "--unique-only", dest="unique_only", default=False, action='store_true', 232 | help="Call peaks using only unique-mapped reads when turned on") 233 | 234 | ag_peakcaller.add_argument("--pool", dest="pooling", default=False, action="store_true", 235 | help="Pool the read counts if provided with multiple replicates; default: False") 236 | 237 | ag_peakcaller.add_argument("--min-clip-cov", dest="min_clip_cov", type=int, default=4, 238 | help="Minimum CLIP reads per gene to perform analysis; default: 4") 239 | 240 | # callpeak 241 | ag_peakcaller.add_argument("--qval-cutoff", dest="qval_cutoff", type=float, default=0.05, 242 | help="Cutoff for adjusted p-values; default: 0.05") 243 | 244 | ag_peakcaller.add_argument("--fold-change", dest="fold_change", nargs='+', type=float, default=[2.], 245 | help="Threasholds for signal range (fold change w/ control; tag count w/o control); default: 2-inf") 246 | 247 | ag_peakcaller.add_argument("--normalize-lib", dest="norm_lib", action="store_true", default=False, 248 | help="use total library size to normalize signal and control, instead of gene-by-gene basis; default: False") 249 | 250 | ag_peakcaller.add_argument("-b", "--binsize", dest="binsize", type=int, default=50, 251 | help="Bin size for calling peaks; default: 50") 252 | 253 | ag_peakcaller.add_argument("--lib-type", dest="lib_type", type=str, 254 | default="sense", choices=['sense', 'antisense', 'unstranded'], 255 | help="The expected read strandness with transcription direction: sense, antisense, or unstranded; default: sense") 256 | 257 | return 258 | 259 | 260 | 261 | def add_permutation_callpeak_parser( subparsers ): 262 | ag_peakcaller = subparsers.add_parser("permutation_callpeak", help="CLAM permutation peakcaller: call peaks using permutation (as in v1.0.0)") 263 | 264 | # input/output 265 | ag_peakcaller.add_argument("-i", "--input", dest="in_bam", nargs='+', type=str, required=True, 266 | help="Filepaths for CLIP bam, e.g ubam mbam") 267 | 268 | ag_peakcaller.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, 269 | help="Output folder") 270 | 271 | ag_peakcaller.add_argument("--gtf", dest="gtf_fp", type=str, required=True, 272 | help="GTF filepath") 273 | 274 | # processing 275 | ag_peakcaller.add_argument("-p", "--nthread", dest="nthread", type=int, default=8, 276 | help="Number of threads; default: 8") 277 | 278 | ag_peakcaller.add_argument("--random-state", dest="random_state", type=int, default=777, 279 | help="Seed for random number generator in permutations; default: 777") 280 
| 281 | # callpeak 282 | ag_peakcaller.add_argument("--qval-cutoff", dest="qval_cutoff", type=float, default=0.005, 283 | help="Cutoff for adjusted p-values; default: 0.005") 284 | 285 | ag_peakcaller.add_argument("--merge-size", dest="merge_size", type=int, default=50, 286 | help="Select best peak within this size; default: 50") 287 | 288 | ag_peakcaller.add_argument("--extend", dest="extend", type=int, default=50, 289 | help="Extend peak to this size if less than it; default: 50") 290 | 291 | ag_peakcaller.add_argument("--lib-type", dest="lib_type", type=str, 292 | default="sense", choices=['sense', 'antisense', 'unstranded'], 293 | help="The expected read strandness with transcription direction: sense, antisense, or unstranded; default: sense") 294 | 295 | return 296 | 297 | 298 | def add_peak_annotator_parser(subparsers): 299 | ag_anno = subparsers.add_parser( 300 | "peak_annotator", help="CLAM peak annotator: assign peaks to genomic regions") 301 | 302 | # input/output 303 | ag_anno.add_argument("-i", "--input", dest="peak_in", type=str, required=True, 304 | help="Input peak file") 305 | 306 | ag_anno.add_argument("-g", "--genome", dest="genome", choices=('hg19', 'hg38', 'mm10'), type=str, required=True, 307 | help="Genome version (hg19, hg38, mm10 avaiable)") 308 | 309 | ag_anno.add_argument("-o", "--out-file", dest="out_file", type=str, required=True, 310 | help="Output file") 311 | 312 | return 313 | 314 | 315 | def add_data_downloader_parser(subparsers): 316 | ag_down = subparsers.add_parser( 317 | "data_downloader", help="CLAM data downloader: download data of genomic regions") 318 | 319 | # input/output 320 | ag_down.add_argument("-g", "--genome", dest="genome", choices=('hg19', 'hg38', 'mm10'), type=str, required=True, 321 | help="Genome version (hg19, hg38, mm10 avaiable)") 322 | 323 | return 324 | 325 | 326 | 327 | if __name__ == '__main__': 328 | try: 329 | main() 330 | except KeyboardInterrupt: 331 | sys.stderr.write("User interrupted; program terminated.") 332 | sys.exit(0) 333 | -------------------------------------------------------------------------------- /check/compare_realign.py: -------------------------------------------------------------------------------- 1 | """Compare the realigner outputs for different 2 | version 3 | ZZJ 4 | 2019.5.27 5 | """ 6 | 7 | import sys 8 | import pysam 9 | 10 | 11 | 12 | def read_as_score(bfile): 13 | s1 = {} 14 | with pysam.Samfile(bfile, 'rb') as bam: 15 | i = 0 16 | for r1 in bam: 17 | i += 1 18 | #if i>30000: 19 | # break 20 | s1[(r1.qname, r1.rname, r1.pos)] = r1.opt('AS') 21 | return s1 22 | 23 | 24 | def plot_scatter(new, old): 25 | import matplotlib.pyplot as plt 26 | import seaborn as sns 27 | ax = sns.jointplot(new, old, kind="reg") 28 | ax.set_axis_labels('New', 'Old') 29 | plt.savefig('realign_check.png') 30 | 31 | def compare(): 32 | s1 = read_as_score('new_out/realigned.sorted.bam') 33 | s2 = read_as_score('old_out/realigned.sorted.bam') 34 | k = list([x for x in s1 if x in s2]) 35 | old = [] 36 | new = [] 37 | print("ID\tnew\told\n") 38 | for k_ in k: 39 | print("%s\t%s\t%s\n"%(k_, s1[k_], s2[k_] ) ) 40 | new.append(s1[k_]) 41 | old.append(s2[k_]) 42 | plot_scatter(new, old) 43 | 44 | 45 | if __name__ == '__main__': 46 | compare() -------------------------------------------------------------------------------- /deprecated/CLAM.fdr_peak.MP.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | This peak-caller script is part of the CLAM pipeline. 
5 | 6 | It takes input from re-aligner output, and use permutation to call peaks. 7 | 8 | Tested under python 2.7.3 9 | """ 10 | 11 | __author__ = 'Zijun Zhang' 12 | __version__ = '1.0.0' 13 | __email__ = 'zj.z@ucla.edu' 14 | 15 | 16 | from optparse import OptionParser 17 | import os, subprocess, sys 18 | from collections import defaultdict 19 | from statsmodels.sandbox.stats.multicomp import multipletests 20 | from time import strftime 21 | import cPickle as pickle 22 | import bisect, random 23 | import pysam 24 | import pybedtools 25 | from multiprocessing import Pool 26 | 27 | def main(): 28 | """ 29 | The main wrapper for CLAM peak-caller. 30 | """ 31 | # options parsing 32 | usage='usage: %prog ' 33 | parser=OptionParser(usage) 34 | 35 | parser.add_option('--resume', dest='resume', action='store_true', default=False, help='Resume mode - skipping pre-processing [Default: %default]') 36 | parser.add_option('--verbose', dest='verbose', action='store_true', default=False, help='Verbose mode - print out all intermediate steps [Default: %default]') 37 | parser.add_option('-o', dest='output_dir', default='./out_CLAM', help='Output file folder [Default %default]') 38 | parser.add_option('-t', dest='tmp_dir', default='./tmp_CLAM', help='Temporary file folder [Default %default]') 39 | parser.add_option('-p', dest='peak_file', default=None, help='Output peak calling filename; if None then do not call peaks [Default %default]') 40 | parser.add_option('--is-stranded', dest='is_stranded', default=False, action='store_true', help='Indicates if the reads are mapped with strand information. [Default: %default]') 41 | parser.add_option('--extend', dest='extend', type='int', default=50, help='Extend to given nucleotides symmetrically at peak calling [Default: %default]') 42 | parser.add_option('--pval-cutoff', dest='pval_cutoff', type='float', default=0.001, help='Corrected p-value threshold at peak calling [Default: %default]') 43 | parser.add_option('--merge-size', dest='merge_size', type='int', default=50, help='merging window size at peak calling [Default: %default]') 44 | parser.add_option('--max-iter', dest='max_iter', type='int', default=1000, help='maximum iterations for permutation tests [Default: %default]') 45 | parser.add_option('-g', dest='gtf', default='./GTF/hg19_ensembl.sorted_gene.bed', help='GTF file [Default: %default]') 46 | parser.add_option('--ThreadN', dest='nb_proc', type='int', default=4, help='Number of threads when doing permutations. [Default: %default]') 47 | parser.add_option('--seed', dest='seed', type='int', default=100, help='Random seed for permutations. [Default: %default]') 48 | parser.add_option('--merge-method', dest='merge_method', type='int', default=1, help='Peak merging method. 1: Narrow peak 2: Broad peak [Default: %default]') 49 | parser.add_option('--pval-method', dest='correction_method', type='int', default=1, help='Multiple testing correction method. 1: Bonferroni 2: BH FDR [Default: %default]') 50 | parser.add_option('--call-transcriptome', dest='call_all', action='store_true', default=False, help='Call peaks on transcriptome instead of genes with multi-mappers. 
[Default: %default]') 51 | 52 | (options,args)=parser.parse_args() 53 | 54 | output_dir=os.path.abspath(options.output_dir) 55 | tmp_dir=os.path.abspath(options.tmp_dir) 56 | verbose=options.verbose 57 | 58 | #random.seed(options.seed) 59 | 60 | write_parameter_log(options, output_dir) 61 | 62 | # find transcripts associated with multi-mapped reads 63 | if verbose: 64 | print_time_stamp('Finding transcripts with multimapped reads.') 65 | if not os.path.isfile(output_dir + '/CLAM_mapper.sorted.out'): 66 | subprocess.call(''' sort -k1,1 -k2,2n %s/CLAM_mapper.out | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6}' > %s/CLAM_mapper.sorted.out ''' % (output_dir, output_dir), shell=True) 67 | # Note: tid_list: tid -> [chr:strand, start, end] 68 | tid_list=read_aligner_output(output_dir + '/CLAM_mapper.sorted.out', options.gtf, options.is_stranded, tmp_dir, options.resume, options.call_all) 69 | 70 | # make bam file for re-aligner output, if non-exist 71 | if not (options.resume and os.path.isfile(output_dir + '/assigned_multimapped_reads.bam')): 72 | if verbose: 73 | print_time_stamp('Making bamfile for aligner output.') 74 | header_cmd='samtools view -H ' + tmp_dir + '/filter100.sorted.bam > ' + output_dir + '/sam_header.sam' 75 | subprocess.call(header_cmd, shell=True) 76 | body_cmd = ''' awk '{if($6=="+"){print $4"\t256\t"$1"\t"$2+1"\t0\t"$3-$2+1"M\t*\t0\t0\t*\t*\tAS:f:"$5}else{print $4"\t272\t"$1"\t"$2+1"\t0\t"$3-$2+1"M\t*\t0\t0\t*\t*\tAS:f:"$5 }}' ''' + output_dir + '/CLAM_mapper.sorted.out > ' + output_dir + '/CLAM_mapper.sorted.sam' 77 | subprocess.call(body_cmd, shell=True) 78 | makeBam_cmd = 'cat %s/sam_header.sam %s/CLAM_mapper.sorted.sam | samtools view -bS - > %s/assigned_multimapped_reads.bam' % (output_dir, output_dir,output_dir) 79 | subprocess.call(makeBam_cmd, shell=True) 80 | index_cmd = 'samtools index %s/assigned_multimapped_reads.bam' % output_dir 81 | subprocess.call(index_cmd, shell=True) 82 | 83 | # multi-processing peak-caller 84 | if not (options.resume and os.path.isfile(tmp_dir+'/unique_to_qval.pdata') and os.path.isfile(tmp_dir+'/combined_to_qval.pdata')): 85 | child_transcr_ind = list(chunkify(range(len(tid_list)), options.nb_proc)) 86 | 87 | pool = Pool(processes=options.nb_proc) 88 | 89 | unibam_file=tmp_dir+'/filter100.sorted.bam' 90 | multibam_file=output_dir+'/assigned_multimapped_reads.bam' 91 | tid_to_qval_compact = pool.map(get_permutation_fdr, [ (unibam_file, multibam_file, tid_list, child_transcr_ind[i], options.pval_cutoff, options.max_iter, options.is_stranded, verbose, options.correction_method, options.seed) for i in range(options.nb_proc) ]) 92 | 93 | pool.terminate() 94 | pool.join() 95 | 96 | unique_tid_to_qval, combined_tid_to_qval = unpack_tid_to_qval(tid_to_qval_compact) 97 | pickle.dump(unique_tid_to_qval, open(tmp_dir+'/unique_to_qval.pdata','wb'), -1) 98 | pickle.dump(combined_tid_to_qval, open(tmp_dir+'/combined_to_qval.pdata','wb'), -1) 99 | else: 100 | print_time_stamp('Resume mode, found qval data files.') 101 | unique_tid_to_qval=pickle.load(open(tmp_dir+'/unique_to_qval.pdata','rb')) 102 | combined_tid_to_qval=pickle.load(open(tmp_dir+'/combined_to_qval.pdata','rb')) 103 | 104 | # merge peaks 105 | if options.merge_method==1: 106 | merge_peaks=merge_peaks_singleNucl 107 | mm='singleNucl' 108 | elif options.merge_method==2: 109 | merge_peaks=merge_peaks_broadPeak 110 | mm='broadPeak' 111 | else: 112 | merge_peaks=merge_peaks_singleNucl 113 | mm='unknown selection, using default singleNucl' 114 | 115 | if verbose: 116 | 
print_time_stamp('Merging peaks within ' + str(options.merge_size) + 'bp, using ' + mm + '..') 117 | 118 | unique_peaks=merge_peaks(unique_tid_to_qval, options.merge_size, options.pval_cutoff) 119 | combined_peaks=merge_peaks(combined_tid_to_qval, options.merge_size, options.pval_cutoff) 120 | 121 | print_time_stamp('Comparing results and writing to file..') 122 | 123 | # write peak-calling results to file. 124 | with open(output_dir + '/all_peaks.txt', 'w') as f: 125 | for peak in unique_peaks: # peak = ['chr\tstart\tend\tstrand', 'height\tqval\t', tid] 126 | if options.extend is None: 127 | wt_loc=peak[0] 128 | else: 129 | wt_loc=extend_peak_region(peak[0], options.extend) 130 | f.write(wt_loc + '\t' + '\t'.join([str(x) for x in peak[1]]) + '\t' + peak[2] + '\tunique\n') 131 | for peak in combined_peaks: 132 | if options.extend is None: 133 | wt_loc=peak[0] 134 | else: 135 | wt_loc=extend_peak_region(peak[0], options.extend) 136 | f.write(wt_loc + '\t' + '\t'.join([str(x) for x in peak[1]]) + '\t' + peak[2] + '\tcombined\n') 137 | subprocess.call(''' sort -k1,1 -k2,2n %s/all_peaks.txt | awk '{print $1"\t"$2"\t"$3"\t"$5";"$6";"$7"\t"$8"\t"$4}' | bedtools merge -s -d -1 -i stdin -c 4,5,6, -o collapse,collapse,distinct > %s''' % (output_dir, options.peak_file), shell=True) 138 | 139 | print_time_stamp('Peak-calling done.') 140 | 141 | def write_parameter_log(options, output_dir): 142 | """ 143 | Write paramter values to a log file, named by current time. 144 | """ 145 | merge_method_dict={1:'narrowPeak', 2:'broadPeak'} 146 | correction_method_dict={1:'Bonferroni', 2:'BH_FDR'} 147 | with open(output_dir+'/CLAM_Peaker.Parameters.'+ strftime("%Y%m%d_%H%M") + '.txt', 'w') as log: 148 | log.write('CLAM Peaker ' + __version__ + '\n') 149 | log.write('resume: ' + str(options.resume) + '\n') 150 | log.write('verbose: ' + str(options.verbose) + '\n') 151 | log.write('output_dir:' + str(options.output_dir) + '\n') 152 | log.write('tmp_dir: ' + str(options.tmp_dir) + '\n') 153 | log.write('peak_file: ' + str(options.peak_file) + '\n') 154 | log.write('is_stranded: ' + str(options.is_stranded) + '\n') 155 | log.write('extend: ' + str(options.extend) + '\n') 156 | log.write('pval_cutoff: ' + str(options.pval_cutoff) + '\n') 157 | log.write('merge_size: ' + str(options.merge_size) + '\n') 158 | log.write('max_iter: ' + str(options.max_iter) + '\n') 159 | log.write('gtf: ' + str(options.gtf) + '\n') 160 | log.write('seed: ' + str(options.seed) + '\n') 161 | log.write('merge_method: ' + merge_method_dict[options.merge_method] + '\n') 162 | log.write('correction_method: ' + correction_method_dict[options.correction_method] + '\n') 163 | log.write('thread: ' + str(options.nb_proc) + '\n') 164 | 165 | def chunkify(a, n): 166 | """ 167 | Separate a list (a) into consecutive n chunks. 168 | Returns the chunkified index 169 | """ 170 | k, m = len(a) / n, len(a) % n 171 | return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in xrange(n)) 172 | 173 | def unpack_tid_to_qval(compact): 174 | """ 175 | Unpacks the returned values from multi-processing. 
176 | """ 177 | unique_tid_to_qval=defaultdict(list) 178 | combined_tid_to_qval=defaultdict(list) 179 | for item in compact: 180 | unique, combined = item[0], item[1] 181 | for tid in combined: 182 | if len(unique[tid])>0: 183 | unique_tid_to_qval[tid]=unique[tid] 184 | if len(combined[tid])>1: 185 | combined_tid_to_qval[tid]=combined[tid] 186 | return unique_tid_to_qval,combined_tid_to_qval 187 | 188 | def get_permutation_fdr((unibam_file, multibam_file, tid_list, tid_ind, pval_cutoff, max_iter, is_stranded, verbose, correction_method, seed)): 189 | """ 190 | General permutation wrapper for a list of genes. Gets called by multi-processing generated by Pool() 191 | Returns packed FDRs from each child process. 192 | """ 193 | random.seed(seed) 194 | 195 | unique_tid_to_qval=defaultdict(list) 196 | combined_tid_to_qval=defaultdict(list) 197 | 198 | unibam=pysam.Samfile(unibam_file, 'rb') 199 | multibam=pysam.Samfile(multibam_file, 'rb') 200 | 201 | processed=0 202 | pid=os.getpid() 203 | 204 | for ind in tid_ind: 205 | processed+=1 206 | if verbose and not processed % 100: 207 | print_time_stamp(str(processed) + '/' + str(len(tid_ind)) + ' finished in pid ' + str(pid)) 208 | tid, chr, strand, start, end = tid_list[ind] 209 | unique_reads = read_tid_frag_from_bam(tid_list[ind], unibam, is_stranded, True) 210 | multi_reads = read_tid_frag_from_bam(tid_list[ind], multibam, is_stranded, False) 211 | 212 | this_unique_to_qval = do_permutation(tid_list[ind], unique_reads, max_iter, pval_cutoff, correction_method) 213 | this_combined_to_qval = do_permutation(tid_list[ind], unique_reads+multi_reads, max_iter, pval_cutoff, correction_method) 214 | 215 | unique_tid_to_qval[tid].extend(this_unique_to_qval) 216 | combined_tid_to_qval[tid].extend(this_combined_to_qval) 217 | unibam.close() 218 | multibam.close() 219 | return unique_tid_to_qval, combined_tid_to_qval 220 | 221 | def do_permutation(transcr, read_transcript, max_iter, pval_cutoff, correction_method): 222 | """ 223 | Permutes the reads along a given gene length, sub-routine that get called by get_permutation_fdr(..). 224 | Returns the locally corrected p-values for each observed height on the given gene. 225 | """ 226 | tid, chr, strand, tstart, tend = transcr 227 | tid_length=tend-tstart+1 228 | obs_heights_count=count_pileup_heights(tid_length, read_transcript) 229 | 230 | tid_to_qval=[] 231 | 232 | rand_heights_dist=defaultdict(int) 233 | rand_sum=0 234 | # need to account for the 'observed' data, since permutation tests should never report p-value as 0. 
3/22/16 235 | for i in obs_heights_count: 236 | if i==0: 237 | continue 238 | else: 239 | rand_heights_dist[int(i)]+=1 240 | rand_sum+=1 241 | for B in range(max_iter): 242 | new_heights_count=permutate_heights(tid_length, read_transcript) 243 | for i in new_heights_count: 244 | if i==0: 245 | continue 246 | else: 247 | rand_heights_dist[i]+=1 248 | rand_sum+=1 249 | height_to_pval={} 250 | for h in set(obs_heights_count): 251 | if h < 1: 252 | continue 253 | else: 254 | lefter=0 255 | for j in range(int(h), max(rand_heights_dist)+1): 256 | lefter+=rand_heights_dist[j] 257 | height_to_pval[h]=lefter/float(rand_sum) 258 | pval_list=[] 259 | for i in obs_heights_count: 260 | if i<1: 261 | continue 262 | pval_list.append(height_to_pval[i]) 263 | if len(pval_list)<=1: 264 | return [] 265 | 266 | if correction_method==2: 267 | qval_list=multipletests(pval_list, method='fdr_bh')[1] 268 | else: 269 | qval_list=[min(x*(len(set([int(y) for y in height_to_pval if y!=0]))), 1.0) for x in pval_list] 270 | 271 | ind=0 272 | last_height=0 273 | for j in range(len(obs_heights_count)): 274 | this_height=obs_heights_count[j] 275 | if this_height<1: 276 | last_height=0 277 | continue 278 | if qval_list[ind] <= pval_cutoff: 279 | if this_height==last_height: 280 | chr, last_start, last_end, last_strand, last_height, last_qval=tid_to_qval[-1] 281 | tid_to_qval[-1]=[chr, last_start, tstart+j+1, strand, last_height, last_qval] 282 | else: 283 | tid_to_qval.append([chr, tstart+j, tstart+j+1, strand, obs_heights_count[j], qval_list[ind]]) # chr, start, end, strand, height, this_qval 284 | last_height=this_height 285 | ind+=1 286 | return tid_to_qval 287 | 288 | def heights_to_dist(rand_heights): 289 | """ 290 | sub-routine 291 | """ 292 | rand_heights_dist=defaultdict(int) 293 | rand_sum=0 294 | for new_heights_count in rand_heights: 295 | for i in new_heights_count: 296 | if i==0: 297 | continue 298 | else: 299 | rand_heights_dist[i]+=1 300 | rand_sum+=1 301 | return rand_heights_dist, rand_sum 302 | 303 | def permutate_heights(tlen, reads): 304 | """ 305 | Sub-routine for do_permutation(...) 306 | Randomly allocate the read locations. 307 | """ 308 | loc_heights=[0] * tlen 309 | for id, pos, read_len, score in reads: 310 | if score<1 and random.random() > score: 311 | continue 312 | rand_pos=random.randint(1, max(1, tlen-read_len)) 313 | for i in range(rand_pos, min(rand_pos + read_len, tlen)): 314 | loc_heights[i]+=1 315 | return loc_heights 316 | 317 | def count_pileup_heights(tlen, reads): 318 | """ 319 | Sub-routine for do_permutation(...) 320 | Counts the distribution of pile-up heights for a given gene/permutation 321 | """ 322 | loc_heights=[0] * tlen 323 | for id, pos, read_len, score in reads: 324 | for i in range(pos, min(pos+read_len-1, tlen)): 325 | loc_heights[i]+=score 326 | return loc_heights 327 | 328 | def merge_peaks_broadPeak(transcript_to_qval, merge_size, pval_cutoff): 329 | """ 330 | Merge called peaks on a gene using option 2, 331 | i.e. if two peaks close to each other, region 332 | between two peaks are also called as peaks 333 | Retuns a list of merged peaks. 
334 | """ 335 | peaks=[] 336 | last_qval=[0,1] 337 | for tid in transcript_to_qval: 338 | init=True 339 | for chr, start, end, strand, height, this_qval in transcript_to_qval[tid]: 340 | loc=[chr, str(start), str(end), strand] 341 | this_qval=[height, this_qval] # this_qval=[height, qval] so that when qval=0, we can compare height 342 | if this_qval[1] > pval_cutoff: 343 | continue 344 | if init: 345 | last_qval=this_qval 346 | last_pos=[start, end] 347 | last_loc=loc 348 | last_chr=chr 349 | write_out=False 350 | init=False 351 | continue 352 | if int(start) - int(last_pos[1]) > merge_size: 353 | write_out=True 354 | else: 355 | last_pos=[last_pos[0], end] 356 | last_qval=this_qval if last_qval[0] pval_cutoff: 386 | continue 387 | if init: 388 | last_qval=this_qval 389 | last_pos=[start, end] 390 | last_loc=loc 391 | last_chr=chr 392 | write_out=False 393 | init=False 394 | continue 395 | if last_chr == chr: 396 | if abs( int(start) - int(last_pos[0]) ) > merge_size: 397 | write_out=True 398 | elif last_qval[0] < this_qval[0]: 399 | last_pos=[start, end] 400 | last_qval=this_qval 401 | last_loc=loc 402 | write_out=False 403 | else: 404 | write_out=True 405 | 406 | if write_out and last_qval[1] < pval_cutoff: 407 | #peaks[last_loc]=last_qval 408 | peaks.append([last_loc, last_qval, tid]) 409 | last_qval=this_qval 410 | last_pos=[start, end] 411 | last_loc=loc 412 | last_chr=chr 413 | write_out=False 414 | if last_qval[1] < pval_cutoff: 415 | peaks.append([last_loc, last_qval, tid]) 416 | return peaks 417 | 418 | def extend_peak_region(loc, target_len): 419 | """ 420 | Extends peak symmetrically if peak is smaller than target_len. 421 | """ 422 | chr, start, end, strand = loc.split('\t') 423 | start = int(start) 424 | end = int(end) 425 | old_len = end - start 426 | if old_len > target_len: 427 | return loc 428 | else: 429 | center = int((start + end)/2) 430 | start = center - int(target_len /2) 431 | end = center + int(target_len/2) 432 | return '\t'.join([chr, str(start), str(end), strand]) 433 | 434 | def read_aligner_output(rm_out, gtffile, is_stranded, tmp_dir, resume, call_all): 435 | """ 436 | Use bedtools to get transcripts/genes with multi-mapped reads. 437 | Returns a list of transcripts/genes. 438 | """ 439 | if not (resume and os.path.isfile(tmp_dir + '/gtf2multireads.bed')): 440 | rm_bed=pybedtools.BedTool(rm_out) 441 | gtf=pybedtools.BedTool(gtffile) 442 | gtf_bed_rm = gtf.intersect(rm_bed, s=True, u=True) if is_stranded else gtf.intersect(rm_bed, u=True) 443 | gtf_bed_rm.saveas(tmp_dir + '/gtf2multireads.bed') 444 | pybedtools.cleanup() 445 | 446 | tid_list=[] 447 | if call_all: 448 | gtf_to_read=gtffile 449 | else: 450 | gtf_to_read=tmp_dir+'/gtf2multireads.bed' 451 | with open(gtf_to_read,'r') as f: 452 | for line in f: 453 | ele=line.rstrip().split('\t') 454 | gene_id=ele[3] 455 | gene_chr, gene_start, gene_end=ele[0], int(ele[1]), int(ele[2]) 456 | gene_strand=ele[5] 457 | tid_list.append([gene_id, gene_chr, gene_strand, gene_start, gene_end]) 458 | print_time_stamp('Read transcripts with multi-reads: ' + str(len(tid_list))) 459 | return tid_list 460 | 461 | def read_tid_frag_from_bam(tid, bamfile, is_stranded, is_unique): 462 | """ 463 | Use pysam to fetch reads info for a given gene and its loci. 464 | Returns reads, read weights and its mapped loci. 
465 | """ 466 | tid_reads=[] 467 | gene, chr, strand, start, end=tid 468 | if strand=='-': 469 | is_reverse=True 470 | else: 471 | is_reverse=False 472 | reads=[x for x in bamfile.fetch(chr, int(start), int(end)) if x.is_reverse==is_reverse or not is_stranded] 473 | reads=[x for x in reads if x.pos>=int(start) and x.pos<=int(end)] 474 | for read in reads: 475 | if is_unique: 476 | try: 477 | opt_NH=read.opt('NH') 478 | if opt_NH > 1: 479 | continue 480 | except: 481 | pass 482 | score=1 483 | else: 484 | try: 485 | opt_AS=read.opt('AS') 486 | if isinstance(opt_AS, float): 487 | score=opt_AS 488 | else: 489 | continue 490 | except: 491 | continue 492 | read_length = read.qlen if read.qlen > 0 else read.positions[-1] - read.positions[0] + 1 493 | if read.pos-start>=0 and read_length<500: # to avoid junction reads 494 | tid_reads.append([read.qname, read.pos-start, read_length, score]) 495 | return tid_reads 496 | 497 | def print_time_stamp(msg): 498 | """ 499 | Reporter function for logging. 500 | """ 501 | current_time='[' + strftime("%Y-%m-%d %H:%M:%S") + '] ' 502 | print >> sys.stderr, current_time + msg 503 | 504 | if __name__=='__main__': 505 | main() -------------------------------------------------------------------------------- /deprecated/README.md: -------------------------------------------------------------------------------- 1 | # CLAM Version 1.0.0 2 | # CLIP-seq Analysis of Multi-mapped reads 3 | 4 | ## Download the latest version [here](https://github.com/Xinglab/CLAM/releases/download/1.0.0/CLAM-v1.zip). 5 | 6 | ## Requirements 7 | 8 | CLAM is a two-stage algorithm implemented in Python. It is intended to be used in Unix-based environment. It has been tested on Linux with Python 2.7.3. 9 | 10 | CLAM depends on several commonly-used Python libraries, including [pysam](http://pysam.readthedocs.io/en/latest/) and [pybedtools](https://daler.github.io/pybedtools/index.html). 11 | 12 | A detailed dependency requirements (with version info) can be found in "requirements.txt". Alternatively, just run 13 | ``` 14 | pip install -r requirements.txt 15 | ``` 16 | and you will be good to go. 17 | 18 | ## Usage 19 | We provide a general shell script wrapper that runs the whole pipeline sequentially with default parameters for CLIP-seq. You only need to give the paths to input bam file and output folder, and a binary (0/1) indicator for strandness: 20 | ``` 21 | $ sh runCLAM_git.sh $bam $output_dir $temp_dir $is_stranded 22 | ``` 23 | ..and the CLAM pipeline's output will be generated in $output_dir as specified. Check "Output" section below to understand the file formats in the CLAM output folder. 24 | 25 | 26 | Alternatively, if you want to dig more into the parameters, you can run the pipeline with "--help" in command line and check the options. The following should be printed to the screen: 27 | 28 | For CLAM re-aligner, 29 | ``` 30 | $ python CLAM.lite_aligner.py --help 31 | Usage: CLAM.lite_aligner.py input_file.bam 32 | 33 | Options: 34 | -h, --help show this help message and exit 35 | -o OUTPUT_DIR Output file folder [Default ./out_CLAM] 36 | -t TMP_DIR Temporary file folder [Default ./tmp_CLAM] 37 | -w WINDOW_SIZE Local window size for EM [Default: 50] 38 | --max-multihits=MAX_MULTIHITS 39 | Discard reads mapped to more than 40 | locations. [Default: 100] 41 | --min-unique-reads=MIN_UNIQUE_READS 42 | Discard genomic regions with less than 43 | of unique reads. [Default: 0] 44 | --is-stranded Indicates if the reads are mapped with strand 45 | information. 
[Default: False] 46 | --resume Resume mode - skipping pre-processing [Default: False] 47 | --verbose Verbose mode - print out all intermediate steps 48 | [Default: False] 49 | --max-gap=MAX_GAPS Maximum distance allowed in grouping reads. [Default: 50 | 50] 51 | ``` 52 | 53 | For CLAM peak-caller, 54 | ``` 55 | $ python CLAM.fdr_peak.MP.py --help 56 | Usage: CLAM.fdr_peak.MP.py 57 | 58 | Options: 59 | -h, --help show this help message and exit 60 | --resume Resume mode - skipping pre-processing [Default: False] 61 | --verbose Verbose mode - print out all intermediate steps 62 | [Default: False] 63 | -o OUTPUT_DIR Output file folder [Default ./out_CLAM] 64 | -t TMP_DIR Temporary file folder [Default ./tmp_CLAM] 65 | -p PEAK_FILE Output peak calling filename; if None then do not call 66 | peaks [Default none] 67 | --is-stranded Indicates if the reads are mapped with strand 68 | information. [Default: False] 69 | --extend=EXTEND Extend to given nucleotides symmetrically at peak 70 | calling [Default: 50] 71 | --pval-cutoff=PVAL_CUTOFF 72 | Corrected p-value threshold at peak calling [Default: 73 | 0.001] 74 | --merge-size=MERGE_SIZE 75 | merging window size at peak calling [Default: 50] 76 | --max-iter=MAX_ITER maximum iterations for permutation tests [Default: 77 | 1000] 78 | -g GTF GTF file [Default: ./GTF/hg19_ensembl.sorted_gene.bed] 79 | --ThreadN=NB_PROC Number of threads when doing permutations. [Default: 80 | 4] 81 | --seed=SEED Random seed for permutations. [Default: 100] 82 | --merge-method=MERGE_METHOD 83 | Peak merging method. 1: Narrow peak 2: Broad peak 84 | [Default: 1] 85 | --pval-method=CORRECTION_METHOD 86 | Multiple testing correction method. 1: Bonferroni 2: 87 | BH FDR [Default: 1] 88 | --call-transcriptome Call peaks on transcriptome instead of genes with 89 | multi-mappers. [Default: False] 90 | ``` 91 | And you can specify your own parameters accordingly. For example, for **m6A RIP-seq**, window size parameter (-w) and maximum gaps (--max-gap) for re-aligner should be set to 100. 92 | 93 | ## Output 94 | The output of the re-aligner is "assigned_multimapped_reads.bam", which is a customized BAM file following SAM format. Note that the re-aligned weights are stored in "AS:f" tag, so please be aware and do not change/omit it. 95 | Output of re-aligner could also be seen as an intermediate file for CLAM pipeline. 96 | 97 | The output of the peak-caller is a bed file whose name is specified by user. It is a 6-column [BED](https://genome.ucsc.edu/FAQ/FAQformat.html#format1) format file, separated by tabulate and ordered as 98 | ``` 99 | chr start end height;fdr;gene unique/combined strand 100 | ``` 101 | Hence a peak with "combined" but no "unique" on the fifth column indicates this is a rescued peak; both "unique" and "combined" as common peak; or lost peak otherwise. 102 | 103 | ## Testing data 104 | Once downloaded the CLAM source code, please download the hnRNPC iCLIP dataset from [here](http://www.mimg.ucla.edu/faculty/xing/CLAM/hnRNPC_iCLIP_rep1_E-MAT-1371_novoalign.sorted.bam). 105 | 106 | Then run CLAM on the dataset; if finished correctly, you should have rescued peaks at these two loci: 107 | 108 | chr11:82,624,179-82,626,008 109 | 110 | chr20:37,054,180-37,055,310 111 | 112 | 113 | 114 | ## Contacts 115 | Yi Xing [yxing@ucla.edu](mailto:yxing@ucla.edu) 116 | 117 | Zijun Zhang [zj.z@ucla.edu](mailto:zj.z@ucla.edu) 118 | 119 | If you found a bug or mistake in this project, we would like to know about it. 
Before you send us the bug report though, please check the following: 120 | 121 | 1. Are you using the latest version? The bug you found may already have been fixed. 122 | 2. Check that your input is in the correct format and you have selected the correct options. 123 | 3. Please reduce your input to the smallest possible size that still produces the bug; we will need your input data to reproduce the problem, and the smaller you can make it, the easier it will be. 124 | 125 | ## Copyright and License Information 126 | Copyright (C) 2016 University of California, Los Angeles (UCLA) Zijun Zhang and Yi Xing 127 | 128 | Authors: Zijun Zhang and Yi Xing 129 | 130 | This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 131 | 132 | This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 133 | 134 | You should have received a copy of the GNU General Public License along with this program. If not, see [http://www.gnu.org/licenses/](http://www.gnu.org/licenses/). 135 | -------------------------------------------------------------------------------- /deprecated/requirements.txt: -------------------------------------------------------------------------------- 1 | pysam==0.9.0 2 | pybedtools==0.7.4 3 | multiprocessing 4 | optparse 5 | statsmodels.sandbox.stats.multicomp 6 | operator -------------------------------------------------------------------------------- /deprecated/runCLAM_git.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # -- our name --- 3 | #$ -S /bin/bash 4 | #$ -R y 5 | #$ -V 6 | # Make sure that the .e and .o file arrive in the 7 | # working directory 8 | #$ -cwd 9 | #Merge the standard out and standard error to one file 10 | #$ -j y 11 | # 12 | # Send mail at submission and completion of script 13 | #$ -m be 14 | #$ -M your@email.com 15 | # 16 | 17 | # change the script dir to your own if necessary 18 | script_dir="./" 19 | 20 | echo "bamfile: $1" 21 | echo "output folder: $2" 22 | echo "tmp folder: $3" 23 | echo "is_stranded: $4" 24 | 25 | if [ $4 -eq 1 ] 26 | then 27 | echo "is stranded" 28 | python $script_dir"/CLAM.lite_aligner.py" --verbose -o $2 -t $3 --is-stranded $1 29 | else 30 | echo "unstranded" 31 | python $script_dir"/CLAM.lite_aligner.py" --verbose -o $2 -t $3 $1 32 | fi 33 | 34 | if [ $4 -eq 1 ] 35 | then 36 | python $script_dir"/CLAM.fdr_peak.MP.py" --verbose --is-stranded -o $2 -t $3 --pval-cutoff=0.001 --pval-method=2 -p CLAM_peak.bed --ThreadN=30 --max-iter=1000 37 | else 38 | python $script_dir"/CLAM.fdr_peak.MP.py" --verbose -o $2 -t $3 --pval-cutoff=0.001 --pval-method=2 -p CLAM_peak.bed --ThreadN=30 --max-iter=1000 39 | fi 40 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xinglab/CLAM/aed16a4d4e56535e17302448c48f32d35cba14cd/docs/.nojekyll -------------------------------------------------------------------------------- /docs/CLAM.rst: -------------------------------------------------------------------------------- 1 | CLAM package 2 | ============ 3 | 4 | Subpackages 5 | ----------- 6 
| 7 | .. toctree:: 8 | 9 | CLAM.stats 10 | 11 | Submodules 12 | ---------- 13 | 14 | CLAM.peakcaller module 15 | ---------------------- 16 | 17 | .. automodule:: CLAM.peakcaller 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | CLAM.permutation\_peakcaller module 23 | ----------------------------------- 24 | 25 | .. automodule:: CLAM.permutation_peakcaller 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | CLAM.preprocessor module 31 | ------------------------ 32 | 33 | .. automodule:: CLAM.preprocessor 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | CLAM.realigner module 39 | --------------------- 40 | 41 | .. automodule:: CLAM.realigner 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | 47 | Module contents 48 | --------------- 49 | 50 | .. automodule:: CLAM 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | -------------------------------------------------------------------------------- /docs/CLAM.stats.rst: -------------------------------------------------------------------------------- 1 | CLAM.stats package 2 | ================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | CLAM.stats.bin\_test\_alternatives module 8 | ----------------------------------------- 9 | 10 | .. automodule:: CLAM.stats.bin_test_alternatives 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | CLAM.stats.ztnb\_em module 16 | -------------------------- 17 | 18 | .. automodule:: CLAM.stats.ztnb_em 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: CLAM.stats 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = CLAM 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'CLAM' 23 | copyright = '2018, Zijun Zhang' 24 | author = 'Zijun Zhang' 25 | 26 | # The short X.Y version 27 | version = '' 28 | # The full version, including alpha/beta/rc tags 29 | release = '' 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | 'sphinx.ext.autodoc', 43 | 'sphinx.ext.todo', 44 | 'sphinx.ext.coverage', 45 | 'sphinx.ext.viewcode', 46 | 'sphinx.ext.githubpages', 47 | ] 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ['_templates'] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | # 55 | # source_suffix = ['.rst', '.md'] 56 | source_suffix = '.rst' 57 | 58 | # The master toctree document. 59 | master_doc = 'index' 60 | 61 | # The language for content autogenerated by Sphinx. Refer to documentation 62 | # for a list of supported languages. 63 | # 64 | # This is also used if you do content translation via gettext catalogs. 65 | # Usually you set "language" from the command line for these cases. 66 | language = None 67 | 68 | # List of patterns, relative to source directory, that match files and 69 | # directories to ignore when looking for source files. 70 | # This pattern also affects html_static_path and html_extra_path . 71 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 72 | 73 | # The name of the Pygments (syntax highlighting) style to use. 74 | pygments_style = 'sphinx' 75 | 76 | 77 | # -- Options for HTML output ------------------------------------------------- 78 | 79 | # The theme to use for HTML and HTML Help pages. See the documentation for 80 | # a list of builtin themes. 81 | # 82 | html_theme = 'alabaster' 83 | 84 | # Theme options are theme-specific and customize the look and feel of a theme 85 | # further. For a list of options available for each theme, see the 86 | # documentation. 87 | # 88 | # html_theme_options = {} 89 | 90 | # Add any paths that contain custom static files (such as style sheets) here, 91 | # relative to this directory. They are copied after the builtin static files, 92 | # so a file named "default.css" will overwrite the builtin "default.css". 93 | html_static_path = ['_static'] 94 | 95 | # Custom sidebar templates, must be a dictionary that maps document names 96 | # to template names. 97 | # 98 | # The default sidebars (for documents that don't match any pattern) are 99 | # defined by theme itself. Builtin themes are using these templates by 100 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 101 | # 'searchbox.html']``. 102 | # 103 | # html_sidebars = {} 104 | 105 | 106 | # -- Options for HTMLHelp output --------------------------------------------- 107 | 108 | # Output file base name for HTML help builder. 109 | htmlhelp_basename = 'CLAMdoc' 110 | 111 | 112 | # -- Options for LaTeX output ------------------------------------------------ 113 | 114 | latex_elements = { 115 | # The paper size ('letterpaper' or 'a4paper'). 
116 | # 117 | # 'papersize': 'letterpaper', 118 | 119 | # The font size ('10pt', '11pt' or '12pt'). 120 | # 121 | # 'pointsize': '10pt', 122 | 123 | # Additional stuff for the LaTeX preamble. 124 | # 125 | # 'preamble': '', 126 | 127 | # Latex figure (float) alignment 128 | # 129 | # 'figure_align': 'htbp', 130 | } 131 | 132 | # Grouping the document tree into LaTeX files. List of tuples 133 | # (source start file, target name, title, 134 | # author, documentclass [howto, manual, or own class]). 135 | latex_documents = [ 136 | (master_doc, 'CLAM.tex', 'CLAM Documentation', 137 | 'Zijun Zhang', 'manual'), 138 | ] 139 | 140 | 141 | # -- Options for manual page output ------------------------------------------ 142 | 143 | # One entry per manual page. List of tuples 144 | # (source start file, name, description, authors, manual section). 145 | man_pages = [ 146 | (master_doc, 'clam', 'CLAM Documentation', 147 | [author], 1) 148 | ] 149 | 150 | 151 | # -- Options for Texinfo output ---------------------------------------------- 152 | 153 | # Grouping the document tree into Texinfo files. List of tuples 154 | # (source start file, target name, title, author, 155 | # dir menu entry, description, category) 156 | texinfo_documents = [ 157 | (master_doc, 'CLAM', 'CLAM Documentation', 158 | author, 'CLAM', 'One line description of project.', 159 | 'Miscellaneous'), 160 | ] 161 | 162 | 163 | # -- Extension configuration ------------------------------------------------- 164 | # -- Options for todo extension ---------------------------------------------- 165 | 166 | # If true, `todo` and `todoList` produce output, else they produce nothing. 167 | todo_include_todos = True 168 | -------------------------------------------------------------------------------- /docs/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xinglab/CLAM/aed16a4d4e56535e17302448c48f32d35cba14cd/docs/image.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. CLAM documentation master file, created by 2 | sphinx-quickstart on Mon Jun 25 23:36:42 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to CLAM's documentation! 7 | ================================ 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | CLAM 2 | ==== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | CLAM 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pysam>=0.9.0 2 | multiprocessing 3 | statsmodels 4 | tqdm 5 | pybedtools 6 | mpmath -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | from CLAM.config import __version__ 5 | 6 | def main(): 7 | setup(name='CLAM', 8 | version=__version__, 9 | description='CLIP-seq Analysis of Multi-mapped reads', 10 | author='Zijun Zhang', 11 | author_email='zj.z@ucla.edu', 12 | url='https://github.com/Xinglab/CLAM', 13 | packages=['CLAM', 'CLAM.stats'], 14 | scripts=['bin/CLAM'], 15 | install_requires=[ 16 | 'scipy', 17 | 'pysam', 18 | 'numpy', 19 | 'statsmodels', 20 | 'tqdm', 21 | 'pybedtools', 22 | 'mpmath'] 23 | ) 24 | return 25 | 26 | if __name__ == '__main__': 27 | main() 28 | --------------------------------------------------------------------------------
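
A short usage note on the packaging files above: `setup.py` installs the `CLAM` and `CLAM.stats` packages with setuptools, places the `bin/CLAM` wrapper on the PATH via `scripts=['bin/CLAM']`, and takes its dependency list from `install_requires` rather than from the top-level `requirements.txt`. The following is a minimal install-and-check sketch, not the project's documented procedure: the `CLAM --help` call is an assumption about the installed wrapper's interface (the wrapper script itself is not reproduced in this dump), while the version check uses only `CLAM.config.__version__`, which `setup.py` itself imports.

```
$ pip install .        # from the repository root; or: python setup.py install
$ python -c "from CLAM.config import __version__; print(__version__)"
$ CLAM --help          # hypothetical: assumes the installed bin/CLAM wrapper prints its usage
```

Installing with `pip install .` keeps dependency resolution consistent with `install_requires` (which includes `scipy` and `numpy`); the top-level `requirements.txt` omits those two and additionally lists `multiprocessing`, which is a standard-library module on current Pythons, so the setuptools route is the safer of the two.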