├── README.md ├── callpeaks2.pl ├── py_peak_calling.py └── spike_in_calibration.csh /README.md: -------------------------------------------------------------------------------- 1 | # Cut-and-Run -------------------------------------------------------------------------------- /callpeaks2.pl: --------------------------------------------------------------------------------
#!/usr/bin/perl

# Siva Kasinathan, Henikoff Lab/FHCRC (skasin@uw.edu)
# Improved peakcaller: requires peaks to have a minimum width (i.e.,
# a minimum number of continuous bases above threshold). This should
# allow the usage of a lower threshold and calling broader peaks
#
# Input BED file should be sorted: sort -k1,1 -k2,2n BED_file > BED_file_sorted
#
# Output:
#   STDOUT - one line per called peak: chr, start, end, summed value
#            (tab separated)
#   STDERR - the run parameters and the final called / discarded counts

use strict;
use warnings;
use autodie;
use Scalar::Util qw(looks_like_number);

my $USAGE = "Usage: callpeaks2.pl [BED file] [threshold] [interpeak distance] "
          . "[min width] [max width]\n";

die $USAGE if (! defined $ARGV[4]);

my ($bed_file, $threshold, $inter_peak, $minw, $maxw) = @ARGV;

# All four tuning parameters are used in numeric comparisons below, so
# reject anything that is not a number up front instead of silently
# treating it as 0.
for my $numeric_arg ($threshold, $inter_peak, $minw, $maxw) {
    die "Non-numeric argument '$numeric_arg'\n", $USAGE
        unless looks_like_number($numeric_arg);
}

print STDERR "BED File: ", $bed_file, "\n";
print STDERR "Threshold: ", $threshold, "\n";
print STDERR "Inter-peak distance: ", $inter_peak, "\n";
print STDERR "Minimum peak width: ", $minw, "\n";
print STDERR "Maximum peak width: ", $maxw, "\n";

# Track number of peaks and peaks
# that are thrown out
my $npeaks = 0;
my $nfail  = 0;

# The peak currently being assembled: (chr, start, end, summed value).
# A chromosome of "*" marks "no peak started yet".
my @peak;
reset_peak(\@peak);    # Initialize peak

# autodie turns a failed open/close into an exception carrying the file
# name and the system error, so no explicit "or die" is needed.
open my $bed_fh, '<', $bed_file;

LINE:
while (my $line = <$bed_fh>) {
    chomp $line;
    next LINE if $line !~ /\S/;    # blank line

    # Assume bedgraph: chr start end value
    # (can change these depending on input BED format)
    my ($chr, $start, $end, $value) = split ' ', $line;

    # Anything that is not a four-column data line is skipped.  Header
    # and comment lines (track / browser / #) are skipped quietly;
    # any other malformed line is reported so that bad input is noticed.
    if (   !defined $value
        || $start !~ /\A\d+\z/
        || $end   !~ /\A\d+\z/
        || !looks_like_number($value))
    {
        warn "Skipping non-data line $.: $line\n"
            unless $line =~ /\A\s*(?:#|track\s|browser\s)/;
        next LINE;
    }

    next LINE if $value < $threshold;    # only process if above thresh

    my @current = ($chr, $start, $end, $value);

    # If the entry cannot extend the open peak (different chromosome or
    # too far away), emit the open peak and start a new one from it.
    if (!update_peak(\@peak, \@current, \$inter_peak)) {
        print_peak(\@peak, \$minw, \$maxw, \$npeaks, \$nfail);
        reset_peak(\@peak);
        update_peak(\@peak, \@current, \$inter_peak);
    }
}

close $bed_fh;

# Print last peak (print_peak ignores the "no peak started" placeholder)
print_peak(\@peak, \$minw, \$maxw, \$npeaks, \$nfail);

print STDERR "Number of called peaks: ", $npeaks, "\n";
print STDERR "Number of peaks thrown out: ", $nfail, "\n";
print STDERR "DONE.\n";

exit;

# update_peak(\@peak, \@current, \$inter_peak)
#
# Try to add the bedgraph entry @current (chr, start, end, value) to the
# peak being assembled in @peak.  An empty peak (chr "*") always accepts
# the entry and takes its chromosome and start.  Otherwise the entry is
# accepted only if it is on the same chromosome and starts less than
# $inter_peak bases after the current peak end.
#
# Returns 1 if the entry was added, 0 if it was rejected (in which case
# @peak is left untouched).
sub update_peak {
    my ($peak, $current, $inter_peak) = @_;

    if ($peak->[0] ne "*") {
        # Return failure if peak and the current entry are not on the
        # same chromosome or if the current entry is too far from peak
        return 0 if ($peak->[0] ne $current->[0]);
        return 0 if ($current->[1] - $peak->[2] >= $$inter_peak);
    }
    else {
        $peak->[0] = $current->[0];
        $peak->[1] = $current->[1];
    }

    # Add current entry to peak
    $peak->[2] = $current->[2];     # Extend peak end
    $peak->[3] += $current->[3];    # Increment occupancy

    return 1;
}

# print_peak(\@peak, \$minw, \$maxw, \$npeaks, \$nfail)
#
# Write @peak to STDOUT as "chr<TAB>start<TAB>end<TAB>value" when its
# width (end - start) lies within [$minw, $maxw] and increment $npeaks;
# otherwise increment $nfail.  The "no peak started" placeholder
# (chr "*") is neither printed nor counted.
sub print_peak {
    my ($peak, $minw, $maxw, $npeaks, $nfail) = @_;

    # Nothing has been accumulated (e.g. no entry was above threshold)
    return if $peak->[0] eq "*";

    my $width = $peak->[2] - $peak->[1];

    # Only print if peak width is within
    # bound specified by minw and maxw
    if ($width >= $$minw && $width <= $$maxw) {
        $$npeaks++;
        print join("\t", @{$peak}[0 .. 3]), "\n";
    }
    else {
        $$nfail++;
    }

    return;
}

# reset_peak(\@peak)
#
# Put @peak back into the "no peak started" state: chr "*", and start,
# end and value all 0.
sub reset_peak {
    my ($peak) = @_;
    @{$peak}[0 .. 3] = ("*", 0, 0, 0);
    return;
}

-------------------------------------------------------------------------------- /py_peak_calling.py: 
-------------------------------------------------------------------------------- 1 | # %load /home/pskene/bin/py_peak_calling.py 2 | def py_peak_calling(bedgraph, threshold, min_length, inter_peak_distance, merge_close_peaks=True, keep_highest_close_peak=False, max_length=10000, 3 | generate_ID=True, output_name = None, delete_overlap_bed=None): 4 | """ 5 | Created by Pete Skene 6 | - need to install a more up-to-date varsion of bedtools before invoking Jupyter 7 | type: module load bedtools/2.21.0 8 | (1) filters bedgraph based on threshold; 9 | 10 | (2) merges adjacent basepairs that are over threshold; 11 | 12 | (3) retains peaks that satisfy min/max length criteria; 13 | 14 | (4) merges any peaks that are closer than the inter-peak distance cutoff -or- 15 | alternatively keeps just the highest peak (this is beta functionality) 16 | 17 | - max length is typically defaulted to be very large 18 | - outputs a bed file (default col4 is the sum of the bedgraph scores; sorted by chrom;start;stop) 19 | - generate ID: will auto generate a integer list as a ID number (1... number of peaks). This will 20 | be reported as column 4 and the bedgraph scores will be shifted to column 5 as per standard bed format 21 | - note the peak score for merged peak is the *just* the sum of the two individual peaks not the 22 | total score in the merged region (i.e. there could be some sub-threshold scores in the intervening 23 | space that won't be included) 24 | -assumes bedgraph in standard format 25 | -output_name = option for user defined name (type with '...'), otherwise will generate name bedgraph_peaks.bed 26 | -delete_overlap_bed = option to add path to bedfile (as string), whereby any peaks that overlap this bed file will be discarded 27 | """ 28 | 29 | import pybedtools 30 | import glob 31 | from pybedtools import BedTool 32 | import pandas as pd 33 | import csv 34 | 35 | if merge_close_peaks==keep_highest_close_peak: 36 | return 'Exiting... 
merge_close_peaks and keep_highest_close_peak set the same' 37 | 38 | #generate name for output 39 | bedgraph_name = glob.glob(bedgraph) 40 | 41 | if output_name != None: 42 | filename = output_name 43 | 44 | elif output_name == None: 45 | filename = bedgraph_name[0].replace('.bg', '_peaks.bed') 46 | 47 | print 'input bedgraph file: ' + bedgraph_name[0] 48 | print 'output filename: ' + filename 49 | 50 | #import data as BedTool 51 | data = BedTool(bedgraph) 52 | 53 | #retains intervals above threshold 54 | above_thresh = data.filter(lambda b: float(b.name) >= threshold) 55 | 56 | #merge adjacent above threshold regions and sum bedgraph scores (assumes bedgraph score in col 4) 57 | #by increasing d value can allow for 58 | merge_regions= above_thresh.merge(d=0, c=4, o='sum' ) 59 | 60 | #filter based on length criteria 61 | peaks = BedTool(merge_regions.filter(lambda x: len(x) >= min_length and len(x) <= max_length)) 62 | 63 | # print 'number of regions identified before merging or filtering: ' + str(peaks.count()) 64 | 65 | if merge_close_peaks==True: 66 | #merge the bonafide peaks if they they are shorter than the inter peak distance and sum scores and sort 67 | print 'merging peaks that are closer than: ' + str(inter_peak_distance) 68 | merge_peaks = peaks.merge(d=inter_peak_distance, c= 4, o='sum').sort() 69 | 70 | if keep_highest_close_peak==True: 71 | #need to read each line to find close peaks and throw away the one with the lowest score out of the two 72 | print 'entering loop' 73 | 74 | peaks.saveas('temp_input.bed') 75 | 76 | print 'before keeping highest, number of regions identified: ' + str(BedTool('temp_input.bed').count()) 77 | 78 | last_line = [str(item) for item in (BedTool('temp_input.bed').to_dataframe().tail(n=1).iloc[0,:].tolist())] 79 | 80 | with open('temp_input.bed') as myfile: 81 | with open('test_output.bed', 'w') as output: 82 | file_output = csv.writer(output, delimiter='\t') 83 | 84 | prev_line = None 85 | 86 | for line in 
csv.reader(myfile, delimiter='\t'): 87 | # print 'testing line: ' +str(line) 88 | 89 | if prev_line is None: 90 | prev_line = line 91 | # print 92 | 93 | elif float(prev_line[2])+float(inter_peak_distance) <= float(line[1]): 94 | # print 'prev_line: ' + str(prev_line) 95 | # print 'line: ' + str(line) 96 | # print 'features far apart, so adding' 97 | # print 98 | file_output.writerow(prev_line) 99 | prev_line = line 100 | 101 | else: 102 | # print 'prev_line: ' + str(prev_line) 103 | # print 'line: ' + str(line) 104 | # print 'features must be close' 105 | # print 106 | if float(prev_line[3]) < float(line[3]): 107 | prev_line = line 108 | # print 'prev_line smaller, so new prev_line' 109 | # print 'prev_line: ' + str(prev_line) 110 | # print 111 | 112 | # print 'finished reading lines' 113 | # print line 114 | # print last_line 115 | if line==last_line: 116 | # print 'must be last line' 117 | file_output.writerow(prev_line) 118 | 119 | merge_peaks = BedTool('test_output.bed') 120 | 121 | print 'number of peaks found: ' + str(merge_peaks.count()) 122 | 123 | if delete_overlap_bed!=None: 124 | print 'delete_overlap_bed provided: ' + delete_overlap_bed 125 | merge_peaks = merge_peaks.intersect(b=delete_overlap_bed, v=True) 126 | print 'number of peaks retained: ' + str(merge_peaks.count()) 127 | 128 | if not generate_ID: 129 | print 'saving sorted peak bed file with no ID' 130 | 131 | merge_peaks.saveas(filename) 132 | 133 | if generate_ID: 134 | print 'saving sorted peak bed file with ID names' 135 | 136 | #change to pandas dataframe 137 | DF_peaks = merge_peaks.to_dataframe() 138 | 139 | #insert new column with id: 1.... 
# of peaks 140 | DF_peaks.insert(3, 'id', ['id' + str(item) for item in range(1, (len(DF_peaks)+1))]) 141 | 142 | ['id' + str(item) for item in range(1, 5)] 143 | #save output 144 | DF_peaks.to_csv(filename, sep = '\t', header = False, index = False) 145 | 146 | return 'Finished' 147 | 148 | -------------------------------------------------------------------------------- /spike_in_calibration.csh: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | 3 | #Spike-in calibration example C-shell script 4 | #This is an EXAMPLE of one way to do spike-in calibration 5 | 6 | #Seven arguments 7 | # 1 2 3 4 5 6 7 8 | #spike_calibrate.csh genome.bed spike_genome.bed scale output(bg|bga|d) genome_chr_lens_file min_len max_len 9 | 10 | #Assumes Illumina paired-end sequencing 11 | #Packages used: An alignment program (e.g. bowtie2, BWA, etc.), picard and bedtools 12 | #Requires bed files of alignments to the experimental genome as well as to the spike-in genome 13 | #If there is no spike-in, put "none" in argument 2. 14 | #The only information used from the spike-in alignment is the number of fragments aligned 15 | #(the number of lines in spike_genome.bed). 16 | #min_len and max_len refer to the fragment lengths in genome.bed 17 | 18 | #Format for genome_chr_lens_file is (this information is in the sam file headers): 19 | #chr1 249250621 20 | #chr2 243199373 21 | #... 22 | 23 | #Before running this script: 24 | #1. Align Illumina fastq read files to both genomes producing two sam files. 25 | # Any alignment program that produces a sam/bam file can be used. 26 | # However the alignments are done, be careful that not too many reads are aligned 27 | # to both genomes. You may want to map to a masked version of the spike-in genome 28 | # to avoid cross-mapping to rDNA and mitochondrial DNA, but be aware that masked genomes 29 | # eliminate all repeated regions (e.g. 
centromeric, transposons, etc.), some of which 30 | # may be of interest. 31 | #2. Optionally remove duplicate reads (e.g. use picard MarkDuplicates) from the sam file. 32 | # Be aware that all duplicates will be removed since the program cannot determine the 33 | # origin of duplicate reads. You should only remove duplicates when you are starting with 34 | # very low amounts of material, such as when there are few cells. Unlike ChIP-seq, where 35 | # sonication fragments DNA more-or-less at random, CUT&RUN cuts DNA precisely, and so some 36 | # degree of identical cleavage events are expected, especially for sequence-specific DNA 37 | # binding proteins. 38 | #3. Extract properly aligned fragments from both genomes. 39 | # a. Convert sam to bam format (picard or samtools) 40 | # b. Extract aligned fragments from bam file (bedtools bamtobed) 41 | # For subsequent processing, the length of each fragment is 42 | # required in the bed file, this will probably need to be 43 | # added. awk can be used to add fragment length, e.g.: 44 | # cat bamtobed.bed | awk -v OFS='\t' '{len = $3 - $2; print $0, len }' 45 | # Assuming that $2 is the actual start position - 1 46 | 47 | #---------------------------------------------------------------------------------------------- 48 | #4. Spike-in calibration 49 | # NOTE: spike-in calibration may not be appropriate for your sample. You DO need to do it 50 | # when you need to compare samples, such as comparing treatment to control. The primary 51 | # genome to spike-in genome ratio per cell is expected to be the same for all samples. 52 | # The per base pair calculation used here to make a track is: 53 | # scale * (primary_genome_mapped_count_at_bp)/(spike-in_genome_total_of_mapped_fragments) 54 | # You can use any scale multiplier and any program you like to do the calculation, the 55 | # example here uses bedtools genomecov. 

# Check the argument count and print the usage text when it is too low.
if ($#argv < 7) then
    echo "USAGE spike_calibrate.csh genome.bed spike_genome.bed scale output(bg|bga|d) genome_chr_lens min_len max_len"
    echo "Spike-in calibration using bedtools genomecov"
    echo "scale is an arbitrary large number used as multiplier (e.g. 10000)"
    echo "If there is no spike-in, use spike_genome.bed = none"
    echo "min_len and max_len refer to the lengths of fragments in genome.bed to calibrate"
    echo "To calibrate all fragments use min_len = 1 and max_len = 1000"
    echo "Output will be placed in the current directory"
    exit(-1)
endif

set genome_bed = $1
set spike_bed = $2
set scale = $3
set report = $4
set genome_len = $5
set min_len = $6
set max_len = $7

echo $genome_bed $spike_bed $scale $report $min_len $max_len
if (!(-e $genome_bed) || (-z $genome_bed)) then
    echo "$genome_bed not found or is empty"
    exit(-1)
endif
if (!(-e $genome_len) || (-z $genome_len)) then
    echo "$genome_len not found or is empty"
    exit(-1)
endif

#Output name is <genome.bed without directory and extension>.<min_len>-<max_len>.<suffix>
#The suffix is "bedgraph" for bg and bga reports (this is for IGV, which doesn't
#recognize .bg or .bga files) and the report type itself otherwise.
#The name is settled here, before it is announced, so the message names the file
#that is really written.
set temp = $genome_bed:t
set name = $temp:r
if ($report == "bg" || $report == "bga") then
    set output = $name.${min_len}-${max_len}.bedgraph
else
    set output = $name.${min_len}-${max_len}.$report
endif
echo "Output is in file $output in the current directory"

#scale_factor = scale / (number of lines in spike_genome.bed), or just scale
#when there is no usable spike-in file
if (-e $spike_bed && !(-z $spike_bed)) then
    set spike_count = `wc -l $spike_bed | awk '{print $1}'`
    set scale_factor = `echo "$scale / $spike_count" | bc -l`
else
    #"none" is the documented way to ask for no spike-in; anything else that
    #is missing or empty is most likely a mistyped path, so say so.
    if ("$spike_bed" != "none") then
        echo "WARNING: $spike_bed not found or is empty, no spike-in calibration is applied"
    endif
    set spike_count = 0
    set scale_factor = $scale
endif
echo scale_factor=$scale_factor

#Select fragments within the length range, assumes fragment length is in column 4 of the bed file
#(>! overwrites even when the user's startup files set noclobber)
awk -v min=$min_len -v max=$max_len '{if ($4 >= min && $4 <= max) print}' $genome_bed >! $$.temp.bed

#Use genomecov to compute spike-in calibrated genome coverage
#see http://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html
#The first position is start-1 and the last is end for bed files, -bg and -bga
#-bga prints zero intervals -bg doesn't, -d prints each bp starting from 1 (not 0)
#NOTE(review): the genomecov documentation asks for -i input grouped by chromosome;
#this script relies on genome.bed already being sorted - confirm for your data.

if ($report == "bg" || $report == "bga") then
    #bedGraph output starts with a track line, coverage is appended after it
    echo track type=bedGraph name=$name >! $output
    bedtools genomecov -$report -scale $scale_factor -i $$.temp.bed -g $genome_len >> $output
else
    #No header for the other report types: start from an empty file so that a
    #rerun does not append to the output of a previous run
    bedtools genomecov -$report -scale $scale_factor -i $$.temp.bed -g $genome_len >! $output
endif

#Remove only the temporary file made above (unalias in case rm is aliased to rm -i)
unalias rm
rm -f $$.temp.bed
exit

--------------------------------------------------------------------------------