├── README.md ├── calculate_indel.py ├── count_spacers.py ├── design_library.py └── design_targeted_library.py /README.md: -------------------------------------------------------------------------------- 1 | # Screening_Protocols_manuscript 2 | Scripts from the Joung et al Nature Protocols 2016 manuscript on knockout and transcriptional activation screening 3 | -------------------------------------------------------------------------------- /calculate_indel.py: -------------------------------------------------------------------------------- 1 | # Supplementary Data 4: calculate_indel.py 2 | 3 | import difflib 4 | import numpy as np 5 | from scipy.stats import binom 6 | from Bio import SeqIO 7 | import argparse 8 | import itertools 9 | 10 | 11 | READ_TRUNCATION = 20 12 | HASH_READ_TRUNCATION = 0 13 | MIN_READ_LENGTH = 56 14 | MAX_AMBIGUOUS_BASES = 5 15 | MAX_INDEL_MISMATCH = 6 16 | ERROR_TOLERANCE_THRESHOLD = 0.15 17 | 18 | INITIAL_SEARCH_WINDOW = 20 # 20 works well 19 | SEARCH_INCREMENT = 3 20 | MAX_SEARCH_WINDOW = 50 21 | 22 | KMER_SIZE = 15 23 | 24 | SINGLE_FILE_STRUCTURE = '{}_out.csv' 25 | 26 | 27 | def find_loc(guide, target): 28 | loc = target.find(guide) 29 | return (loc, loc + len(guide)) 30 | 31 | 32 | def rc(seq): 33 | base_pairs = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'} 34 | return ''.join(base_pairs[i] for i in seq[::-1]) 35 | 36 | 37 | def generate_hash(seq): 38 | kmer_index = {} 39 | for i in range(len(seq) - KMER_SIZE): 40 | kmer = seq[i:i + KMER_SIZE] 41 | if kmer in kmer_index: 42 | kmer_index[kmer] = None 43 | else: 44 | kmer_index[kmer] = i 45 | return kmer_index 46 | 47 | 48 | def calc_mle(total_reads, indel_counts, background): 49 | indel_range = np.array(range(indel_counts)) 50 | distrib = binom.pmf(indel_counts - indel_range, total_reads - 51 | indel_range, background) 52 | if len(distrib) == 0: 53 | mle_freq = 0 54 | else: 55 | mle_freq = distrib.argmax() / float(total_reads) 56 | 57 | z = 1.96 58 | upper_bound = (total_reads * mle_freq + z**2 / 2 + z * np.sqrt(total_reads * 59 | mle_freq * (1 - mle_freq) + z**4 / 4)) / (total_reads + z**2) 60 | lower_bound = (total_reads * mle_freq + z**2 / 2 - z * np.sqrt(total_reads * 61 | mle_freq * (1 - mle_freq) + z**4 / 4)) / (total_reads + z**2) 62 | 63 | return mle_freq, lower_bound, upper_bound 64 | 65 | 66 | def write_mle(sample_sheet, output_file, verbose, quiet): 67 | if not quiet: 68 | print 'Applying MLE correction' 69 | with open(output_file) as start_output_file: 70 | file_read = start_output_file.read().split('\n') 71 | output_header = file_read[0] 72 | output_text = file_read[1:-1] 73 | controls = [] 74 | with open(sample_sheet) as in_handle: 75 | for i, l in enumerate(in_handle): 76 | if len(l.strip().split(',')) < 5: 77 | print 'Sample and Control flags not detected' 78 | break 79 | elif l.strip().split(',')[4][0].upper().strip() == 'C': 80 | controls.append(i) 81 | background_list = [float(output_text[i].split(',')[ 82 | 7]) / 100 for i in controls] 83 | background = sum(background_list) / len(background_list) 84 | 85 | with open(output_file, 'w') as out_handle: 86 | out_handle.write(output_header+'\n') 87 | for i, l in enumerate(output_text): 88 | if i in controls: 89 | out_handle.write('{},{},{},{}\n'.format(l, 'NA', 'NA', 'NA')) 90 | else: 91 | samp_data = [int(l.split(',')[i]) for i in (1, 2, 5, 6)] 92 | mle_percentage, lower_bound, upper_bound = calc_mle( 93 | sum(samp_data), samp_data[1], background) 94 | 95 | out_handle.write('{},{},{},{}\n'.format( 96 | l, mle_percentage, lower_bound, upper_bound)) 97 | 98 | 99 | def op_ver(opcodes): 100 | ''' 101 | Designed to parse the opcodes from difflib.SequenceMatcher to generate edits. Detects if there are an odd number of edits 102 | and if there are edits with intervening equal regions. 103 | ''' 104 | ops = [x[0][0] for x in opcodes] 105 | if len(ops) % 2: 106 | # assumes read is longer than target 107 | if not (ops[0] == 'd' and ops[-1] == 'd' and set(ops[1::2]) == set(['e'])): 108 | return False 109 | else: 110 | proc_ops = [(x[0][0], x[3], x[4] - x[3], x[1], x[1] - x[2]) 111 | for x in opcodes[2:-2:2]] 112 | return proc_ops 113 | else: 114 | return False 115 | 116 | 117 | def indel_calc_window_hash(seq_handle, target): 118 | ''' 119 | Iterates through a SeqRecord iterator and calculates statistics about each read for a given window with hash algorithm 120 | ''' 121 | perf_total, indel_total, err_total, rejected_total, miscall_total, replace_total = ( 122 | 0,) * 6 123 | target_index = generate_hash(target) 124 | for readout in seq_handle: 125 | read = str(readout.seq)[HASH_READ_TRUNCATION:] 126 | if len(read) < MIN_READ_LENGTH or read.count('N') > MAX_AMBIGUOUS_BASES: # filtering for junk 127 | rejected_total += 1 128 | elif target in read: 129 | perf_total += 1 130 | else: 131 | read_index = generate_hash(read) 132 | mapping = {} 133 | for kmer in read_index: 134 | if read_index[kmer] is not None and kmer in target_index and target_index[kmer] is not None: 135 | mapping[read_index[kmer]] = target_index[kmer] 136 | if len(mapping) == 0: 137 | err_total += 1 138 | else: 139 | index_diff = ( 140 | mapping[i] - i if i in mapping else None for i in range(len(read) + KMER_SIZE + 1)) 141 | collapsed_dif = [[k, len(list(g))] 142 | for k, g in itertools.groupby(index_diff)] 143 | 144 | start = True 145 | indels = 0 146 | sing_mismatch = 0 147 | mult_mismatch = 0 148 | offset = 0 149 | if collapsed_dif[-1][0] is not None: 150 | err_total += 1 151 | else: 152 | for el in collapsed_dif[:-1]: 153 | 154 | if start: 155 | # advance to first non nan location (trim back from 156 | # start of read to first alignment) 157 | if el[0] is not None: 158 | offset = el[0] 159 | start = False 160 | if el[0] is not None: 161 | doff = el[0] - offset 162 | # append indel start loc to iloc and length of indel to ilen 163 | # insertion deletion combinations are summarized as follows for computaitonal simplicity 164 | # insertion deletion with len(ins)>len(del) = insertion 165 | # insertion deletion with len(ins) (KMER_SIZE): 174 | mult_mismatch += 1 175 | if indels > 0: 176 | indel_total += 1 177 | elif mult_mismatch > 0: 178 | replace_total += 1 179 | # print collapsed_dif 180 | elif sing_mismatch > 0: 181 | miscall_total += 1 182 | else: 183 | err_total += 1 184 | 185 | return (perf_total, indel_total, err_total, rejected_total, miscall_total, replace_total) 186 | 187 | 188 | def indel_calc_window(seq_handle, target): 189 | ''' 190 | Iterates through a SeqRecord iterator and calculates statistics about each read for a given window 191 | ''' 192 | perf_total, indel_total, err_total, rejected_total, miscall_total, replace_total = ( 193 | 0,) * 6 194 | for readout in seq_handle: 195 | read = str(readout.seq)[READ_TRUNCATION:] 196 | if len(read) < MIN_READ_LENGTH or read.count('N') > MAX_AMBIGUOUS_BASES: # filtering for junk 197 | rejected_total += 1 198 | elif target in read: 199 | perf_total += 1 200 | else: 201 | opcodes = difflib.SequenceMatcher( 202 | None, read, target, autojunk=False).get_opcodes() 203 | # filter out any reads with more than allowed indels + mismatches 204 | if len(opcodes) > 3 + MAX_INDEL_MISMATCH * 2: 205 | err_total += 1 206 | else: 207 | # if there are not an odd number of edits, try to shift 208 | # sequence and reattempt 209 | if not len(opcodes) % 2: 210 | opcodes = difflib.SequenceMatcher( 211 | None, read, target[1:-1], autojunk=False).get_opcodes() 212 | indel_list = op_ver(opcodes) 213 | if not indel_list: 214 | err_total += 1 215 | else: 216 | # check if only single mismatched bases, interpreted as 217 | # miscalled bases 218 | miscall = set.union(set(x[2] for x in indel_list), set( 219 | x[4] for x in indel_list), set(x[0] for x in indel_list)) == set(['r', 1, -1]) 220 | # check for larger replacement regions (not 221 | # insertions/deletions) 222 | mismatch = set(x[0] for x in indel_list) == set('r') 223 | 224 | if miscall: 225 | miscall_total += 1 226 | elif mismatch: 227 | replace_total += 1 228 | else: 229 | indel_total += 1 230 | return (perf_total, indel_total, err_total, rejected_total, miscall_total, replace_total) 231 | 232 | 233 | def file_calc(f_name, guide_loc, target, file_type, hash_flag): 234 | ''' 235 | Attempts different windows to pass error threshold 236 | ''' 237 | error_flag = True 238 | window_size = INITIAL_SEARCH_WINDOW 239 | min_error = 100 240 | min_total = [] 241 | note = '' 242 | 243 | if hash_flag: 244 | algorithm = indel_calc_window_hash 245 | else: 246 | algorithm = indel_calc_window 247 | 248 | while error_flag: # attempt windows while above threshold 249 | target_window = target[guide_loc[0] - 250 | window_size:guide_loc[1] + window_size] 251 | with open(f_name, 'rU') as f_handle: 252 | total_list = algorithm( 253 | SeqIO.parse(f_handle, file_type), target_window) 254 | 255 | err_total = total_list[2] 256 | rejected_total = total_list[3] 257 | error_percentage = float(err_total) / \ 258 | (sum(total_list) - rejected_total) * 100 259 | 260 | if error_percentage < min_error: # check if better than previously achieved 261 | min_error = error_percentage 262 | min_total = total_list 263 | 264 | error_flag = (error_percentage > ERROR_TOLERANCE_THRESHOLD) and ( 265 | window_size > MAX_SEARCH_WINDOW) 266 | window_size += SEARCH_INCREMENT 267 | 268 | if error_percentage > ERROR_TOLERANCE_THRESHOLD: 269 | note = 'Error threshold not met returning best attempt' 270 | return min_total, note 271 | 272 | 273 | def prep_entry(f_name, guide, target, file_type, hash_flag): 274 | ''' 275 | Finds guide location 276 | ''' 277 | 278 | if guide in target: 279 | total_list, note = file_calc( 280 | f_name, find_loc(guide, target), target, file_type, hash_flag) 281 | elif rc(guide) in target: 282 | total_list, note = file_calc(f_name, find_loc( 283 | rc(guide), target), target, file_type, hash_flag) 284 | else: 285 | total_list = (0,) * 6 286 | note = 'Guide not found in target sequence' 287 | return total_list, note 288 | 289 | 290 | def whole_file_read(sample_sheet, file_type, output_file, hash_flag, mle, verbose, quiet): 291 | ''' 292 | Reads through a complete file and constructs corresponding output file 293 | ''' 294 | if not quiet: 295 | print 'Reading input sheet from {}'.format(sample_sheet) 296 | if mle: 297 | mle_string = '' 298 | else: 299 | mle_string = ', MLE corrected rate, lower bound, upper bound' 300 | with open(sample_sheet) as in_handle, open(output_file, 'w') as out_handle: 301 | out_handle.write( 302 | 'sample,perfect matches,indels,misaligned reads,reads below threshold, reads with miscalled bases, reads with replacements,indel percentage, notes{}\n'.format(mle_string)) 303 | for l in in_handle: 304 | sample_name, file_name, guide, target = l.strip().split(',')[:4] 305 | if verbose: 306 | print 'Analyzing sample {} from {}'.format(sample_name, file_name) 307 | guide = guide.upper().strip() 308 | target = target.upper().strip() 309 | total_list, note = prep_entry( 310 | file_name, guide, target, file_type, hash_flag) 311 | indel_total = total_list[1] 312 | rejected_total = total_list[2] + total_list[3] 313 | indel_rate = float(indel_total) / \ 314 | (sum(total_list) - rejected_total) 315 | total_list_string = ','.join(str(s) for s in total_list) 316 | out_handle.write('{},{},{},{}\n'.format( 317 | sample_name, total_list_string, indel_rate, note)) 318 | if not mle: 319 | write_mle(sample_sheet, output_file, verbose, quiet) 320 | 321 | 322 | def single_entry_read(sample_sheet, file_type, input_name, hash_flag, verbose, quiet): 323 | ''' 324 | Reads through a single sample 325 | ''' 326 | 327 | with open(sample_sheet) as in_handle: 328 | for l in in_handle: 329 | sample_name, file_name, guide, target = l.strip().split(',')[:4] 330 | if sample_name.strip() == input_name.strip(): 331 | with open(SINGLE_FILE_STRUCTURE.format(input_name.strip()), 'w') as out_handle: 332 | guide = guide.upper().strip() 333 | target = target.upper().strip() 334 | total_list, note = prep_entry( 335 | file_name, guide, target, file_type, hash_flag) 336 | 337 | indel_total = total_list[1] 338 | rejected_total = total_list[2] + total_list[3] 339 | indel_rate = float(indel_total) / \ 340 | (sum(total_list) - rejected_total) 341 | 342 | total_list_string = ','.join(str(s) for s in total_list) 343 | 344 | out_handle.write('{},{},{},{}\n'.format( 345 | sample_name, total_list_string, indel_rate, note)) 346 | 347 | 348 | def combine_files(sample_sheet, file_type, output_file, mle, verbose, quiet): 349 | ''' 350 | Combines separately processed files 351 | ''' 352 | if mle: 353 | mle_string = '' 354 | else: 355 | mle_string = ', MLE corrected rate, lower bound, upper bound' 356 | 357 | with open(sample_sheet) as in_handle, open(output_file, 'w') as out_handle: 358 | out_handle.write( 359 | 'sample,perfect matches,indels,misaligned reads,reads below threshold, reads with miscalled bases, reads with replacements,indel percentage, notes{}\n'.format(mle_string)) 360 | for l in in_handle: 361 | sample_name, file_name, guide, target = l.strip().split(',')[:4] 362 | with open(SINGLE_FILE_STRUCTURE.format(sample_name.strip()), 'w') as samp_handle: 363 | out_handle.write(samp_handle.readline()) 364 | if not mle: 365 | write_mle(sample_sheet, output_file) 366 | 367 | if __name__ == '__main__': 368 | parser = argparse.ArgumentParser( 369 | description='Analyze sequencing data for the presence of indels') 370 | combine = parser.add_mutually_exclusive_group() 371 | verbosity = parser.add_mutually_exclusive_group() 372 | combine.add_argument( 373 | '-c', '--combine', help='combines files generated by individual samples', action='store_true') 374 | parser.add_argument( 375 | '-f', '--fasta', help='reads fasta files (default is fastq)', action='store_true') 376 | parser.add_argument( 377 | '-no-m', '--no-mle', dest='nomle', help='does not calculate MLE', action='store_true') 378 | parser.add_argument('-o', '--output', dest='output_file', 379 | help='output file name', default='calc_indel_out.csv') 380 | parser.add_argument( 381 | '-a', '--hash', help='uses alternative hashing algorithm', action='store_true') 382 | parser.add_argument('-i', '--input', dest='sample_sheet', 383 | help='input file name', default='sample_sheet.csv') 384 | combine.add_argument('-s', '--sample', dest='input_name', 385 | help='sample name for running in single sample mode') 386 | verbosity.add_argument( 387 | '-v', '--verbose', help='outputs verbose', action='store_true') 388 | verbosity.add_argument( 389 | '-q', '--quiet', help='supresses output', action='store_true') 390 | 391 | args = parser.parse_args() 392 | 393 | file_type = 'fasta' if args.fasta else 'fastq' 394 | if args.combine: 395 | combine_files(args.sample_sheet, file_type, 396 | args.output_file, args.nomle, args.verbose, args.quiet) 397 | elif args.input_name: 398 | single_entry_read(args.sample_sheet, file_type, 399 | args.input_name, args.hash, args.verbose, args.quiet) 400 | else: 401 | whole_file_read(args.sample_sheet, file_type, 402 | args.output_file, args.hash, args.nomle, args.verbose, args.quiet) 403 | -------------------------------------------------------------------------------- /count_spacers.py: -------------------------------------------------------------------------------- 1 | #Supplementary Data 3: count_spacers.py 2 | 3 | from Bio import SeqIO 4 | import csv 5 | from collections import OrderedDict 6 | import numpy as np 7 | import sys 8 | import argparse 9 | 10 | KEY_REGION_START = 30 #start index of key region 11 | KEY_REGION_END = 55 #end index of key region 12 | KEY = "CGAAACACC" #identifies sequence before guide to determine guide position 13 | 14 | def count_spacers(input_file, fastq_file, output_file, guide_g): 15 | """ 16 | creates a dictionary with guide counts from fastq_file, writes to output_file 17 | fastq_file: forward read fastq file 18 | output_file: csv file to write guide dictionary to 19 | dictionary: guide sequence as key, guide count as entry 20 | """ 21 | 22 | num_reads = 0 #total number of reads processed 23 | perfect_matches = 0 # guides with perfect match to library 24 | non_perfect_matches = 0 #number of guides without a perfect match to the library 25 | key_not_found = 0 #count of reads where key was not found 26 | 27 | # add 'G' to key sequence if included in library 28 | if guide_g: 29 | global KEY 30 | KEY += "G" 31 | 32 | # open library sequences and initiate dictionary of read counts for each guide 33 | try: 34 | with open(input_file, mode='rU') as infile: #rU mode is necessary for excel! 35 | reader = csv.reader(infile) 36 | dictionary = {rows[0]:0 for rows in reader} 37 | except: 38 | print 'could not open', input_file 39 | 40 | # open fastq file 41 | try: 42 | handle = open(fastq_file, "rU") 43 | except: 44 | print "could not find fastq file" 45 | return 46 | 47 | # process reads in fastq file 48 | readiter = SeqIO.parse(handle, "fastq") 49 | for record in readiter: #contains the seq and Qscore etc. 50 | num_reads += 1 51 | read_sequence = str.upper(str(record.seq)) 52 | key_region = read_sequence[KEY_REGION_START:KEY_REGION_END] 53 | key_index = key_region.find(KEY) 54 | if key_index >= 0: 55 | start_index = key_index + KEY_REGION_START + len(KEY) 56 | guide = read_sequence[start_index:(start_index + 20)] 57 | if guide in dictionary: 58 | dictionary[guide] += 1 59 | perfect_matches += 1 60 | else: 61 | non_perfect_matches += 1 62 | else: 63 | key_not_found += 1 64 | 65 | # create ordered dictionary with guides and respective counts and output as a csv file 66 | dict_sorted = OrderedDict(sorted(dictionary.items(), key=lambda t: t[0])) 67 | with open(output_file, 'w') as csvfile: 68 | mywriter = csv.writer(csvfile, delimiter=',') 69 | for guide in dict_sorted: 70 | count = dict_sorted[guide] 71 | mywriter.writerow([guide,count]) 72 | 73 | # percentage of guides that matched perfectly 74 | percent_matched = round(perfect_matches/float(perfect_matches + non_perfect_matches) * 100, 1) 75 | # percentage of undetected guides with no read counts 76 | guides_with_reads = np.count_nonzero(dictionary.values()) 77 | guides_no_reads = len(dictionary.values()) - guides_with_reads 78 | percent_no_reads = round(guides_no_reads/float(len(dictionary.values())) * 100, 1) 79 | # skew ratio of top 10% to bottom 10% of guide counts 80 | top_10 = np.percentile(dictionary.values(), 90) 81 | bottom_10 = np.percentile(dictionary.values(), 10) 82 | if top_10 != 0 and bottom_10 != 0: 83 | skew_ratio = top_10/bottom_10 84 | else: 85 | skew_ratio = 'Not enough perfect matches to determine skew ratio' 86 | 87 | # write analysis statistics to statistics.txt 88 | with open('statistics.txt', 'w') as infile: 89 | infile.write('Number of perfect guide matches: ' + str(perfect_matches) + '\n') 90 | infile.write('Number of nonperfect guide matches: ' + str(non_perfect_matches) + '\n') 91 | infile.write('Number of reads where key was not found: ' + str(key_not_found) + '\n') 92 | infile.write('Number of reads processed: ' + str(num_reads) + '\n') 93 | infile.write('Percentage of guides that matched perfectly: ' + str(percent_matched) + '\n') 94 | infile.write('Percentage of undetected guides: ' + str(percent_no_reads) + '\n') 95 | infile.write('Skew ratio of top 10% to bottom 10%: ' + str(skew_ratio)) 96 | infile.close() 97 | 98 | handle.close() 99 | return 100 | 101 | 102 | if __name__ == '__main__': 103 | parser = argparse.ArgumentParser( 104 | description='Analyze sequencing data for sgRNA library distribution') 105 | parser.add_argument('-f', '--fastq', type=str, dest='fastq_file', 106 | help='fastq file name', default='NGS.fastq') 107 | parser.add_argument('-o', '--output', type=str, dest='output_file', 108 | help='output file name', default='library_count.csv') 109 | parser.add_argument('-i', '--input', type=str, dest='input_file', 110 | help='input file name', default='library_sequences.csv') 111 | parser.add_argument('-no-g', dest='guide_g', help='presence of guanine before spacer', action='store_false') 112 | parser.set_defaults(guide_g=True) 113 | args = parser.parse_args() 114 | 115 | count_spacers(args.input_file, args.fastq_file, args.output_file, args.guide_g) 116 | -------------------------------------------------------------------------------- /design_library.py: -------------------------------------------------------------------------------- 1 | #Supplementary Data 1: design_library.py 2 | 3 | import argparse, tempfile, os, itertools, subprocess 4 | import twobitreader 5 | import sqlite3 6 | import numpy 7 | import time 8 | import math 9 | from operator import itemgetter 10 | import csv 11 | 12 | #guide design parameters 13 | GUIDE_LENGTH = 20 14 | PAM_LIST = ['AGG', 'TGG', 'GGG', 'CGG'] 15 | PAM_LENGTH = len(PAM_LIST[0]) 16 | CLEAVAGE_SITE = 17 #distance to 5' end of guide 17 | 18 | #seqmap parameters 19 | N_PROBES = 50000 20 | MAX_PROCESSES = 1 21 | MAX_MISMATCHES = 3 22 | tf_counter = 0 23 | 24 | #weights for off-target score calculations 25 | weights = numpy.array([0,0,0.014,0,0,0.395,0.317,0,0.389,0.079,0.445,0.508,0.613,0.851,0.732,0.828,0.615,0.804,0.685,0.583]) 26 | 27 | #flanking sequences around spacer for gecko and sam libraries 28 | gecko_flank = ['TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCG', 'GTTTTAGAGCTAGAAATAGCAAGTTAAAATAAGGCTAGTCCGT'] 29 | sam_flank = ['TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCG', 'GTTTTAGAGCTAGGCCAACATGAGGATCACC'] 30 | 31 | def revcomp(sequence): 32 | """ 33 | returns the reverse complement of sequence 34 | """ 35 | basecomplement = {'A':'T', 'C':'G', 'T':'A', 'G':'C', 'N':'N'} 36 | letters = list(sequence) 37 | letters.reverse() 38 | dna = '' 39 | for base in letters: 40 | dna += basecomplement[base] 41 | return dna 42 | 43 | 44 | def indexList(s, item, i=0): 45 | """ 46 | make a list of indexes of item in s 47 | """ 48 | i_list = [] 49 | while True: 50 | try: 51 | i = s.index(item, i) 52 | i_list.append(i) 53 | i += 1 54 | except: 55 | break 56 | return i_list 57 | 58 | def Target_stretch(guide): 59 | """ 60 | returns true if guide does not contain any of the following homopolymer target stretches 61 | """ 62 | st1 = ('AAAA') 63 | st2 = ('TTTT') 64 | st3 = ('GGGG') 65 | st4 = ('CCCC') 66 | 67 | if not ((st1 in guide) or (st2 in guide) or (st3 in guide) or (st4 in guide)): 68 | return 'true' 69 | 70 | 71 | def GC_content(GC_cutoff, guide): 72 | """ 73 | takes guide sequence as input, returns true if GC content above threshold defined above 74 | """ 75 | N = guide.count("G") 76 | N += guide.count("C") 77 | percent = float(N)/len(guide)*100 78 | if percent > GC_cutoff: 79 | return True 80 | else: 81 | return False 82 | 83 | def get_b_guides(region, GC_cutoff, index_list): 84 | """ 85 | takes a sequence s and and a list of indices in sequence that indicate the start of the 86 | reverse complement of the PAM sequence and returns a list of 20bp bottom guide sequences 87 | that have been filtered for GC content and target stretch 88 | """ 89 | guides = [] 90 | for i in index_list: 91 | if len(region) > i + GUIDE_LENGTH + PAM_LENGTH: 92 | guide = (revcomp(region[i+PAM_LENGTH:i+PAM_LENGTH+GUIDE_LENGTH])) 93 | if 'N' not in guide: 94 | if GC_content(GC_cutoff, guide) and Target_stretch(guide): 95 | guides.append([guide, i]) 96 | return guides 97 | 98 | def get_t_guides(region, GC_cutoff, index_list): 99 | """ 100 | takes a sequence s and and a list of indices in sequence that indicate the start of the 101 | PAM sequence and returns a list of 20bp top guide sequences that have been filtered 102 | for GC content and target stretch 103 | """ 104 | guides = [] 105 | for i in index_list: 106 | if i > GUIDE_LENGTH: 107 | guide = (region[i-GUIDE_LENGTH:i]) 108 | if 'N' not in guide: 109 | if GC_content(GC_cutoff, guide) and Target_stretch(guide): 110 | guides.append([guide, i]) 111 | return guides 112 | 113 | 114 | def get_location(gene, b_guides, t_guides): 115 | """ 116 | returns lists of bottom guide and top guide cleavage site distances from the start of the 117 | target genomic region 118 | """ 119 | b_guide_loc = [(long(gene["start"]) + GUIDE_LENGTH + PAM_LENGTH - CLEAVAGE_SITE + x[1]) for x in b_guides] 120 | t_guide_loc = [(long(gene["start"]) - GUIDE_LENGTH + CLEAVAGE_SITE + x[1]) for x in t_guides] 121 | return b_guide_loc, t_guide_loc 122 | 123 | def get_guides(region, GC_cutoff): 124 | """ 125 | finds all top and bottom guides in region and returns them with indices 126 | """ 127 | #find all the indices in region with pam sequence 128 | i_list_b = [] 129 | i_list_t = [] 130 | for pam in PAM_LIST: 131 | i_list_b.extend(indexList(region, revcomp(pam))) 132 | i_list_t.extend(indexList(region, pam)) 133 | 134 | #find all the guides that correspond with pam indices 135 | b_guides = get_b_guides(region, GC_cutoff, i_list_b) 136 | t_guides = get_t_guides(region, GC_cutoff, i_list_t) 137 | 138 | return b_guides, t_guides 139 | 140 | def generate_fa_files(input_prefix, genes_file, GC_cutoff): 141 | """ 142 | generate a genome fa file from a genome 2bit file 143 | generate a guide fa file that contains filtered unique guides that target the genome 144 | at the regions specified in genes file 145 | """ 146 | genome_2bit_file = input_prefix + '.2bit' 147 | genome_fa_file = input_prefix + '.fa' 148 | guide_fa_file = input_prefix + '_all_guides.fa' 149 | tbf = twobitreader.TwoBitFile(genome_2bit_file) 150 | all_guides = set([]) 151 | 152 | #iterate through the chromosomes in the genome 2bit file and write to genome fa file 153 | with open(genome_fa_file, 'wb') as genome_fa: 154 | for chrom in tbf: 155 | if '_' not in chrom: 156 | region = tbf[chrom][0:].upper() 157 | genome_fa.write('>{0}\n'.format(chrom)) 158 | genome_fa.write(region+'\n') 159 | 160 | #for each genomic region specified in genes file identify filtered unique guides 161 | #and write to guide fa file 162 | with open(genes_file, 'rb') as gf: 163 | f = [row for row in csv.reader(gf.read().splitlines())] 164 | for i,l in enumerate(f): # i is index, l is entry 165 | if i == 0: 166 | columns = l 167 | continue 168 | 169 | #fetch the current gene and region 170 | gene = dict([(columns[i],e) for i,e in enumerate(l)]) 171 | region_bounds = [long(gene["start"]), long(gene["end"]) + 1] 172 | region = tbf[gene["chrom"]][region_bounds[0]:region_bounds[1]] 173 | region = region.upper() 174 | if "N" in region: 175 | print "found N in target region of", gene["name"] 176 | continue 177 | 178 | #identify and filter guides that target region 179 | (b_guides, t_guides) = get_guides(region, GC_cutoff) 180 | current_guides = set([g[0] for g in (b_guides + t_guides)]) 181 | all_guides = all_guides | current_guides 182 | 183 | #write filtered unique guides to an output guide_fa_file 184 | guide_count = 0 185 | with open(guide_fa_file, 'wb') as guide_fa: 186 | for guide in all_guides: 187 | guide_count += 1 188 | guide_fa.write('>{0}\n'.format(guide_count)) 189 | guide_fa.write(guide+'\n') 190 | 191 | 192 | def find_offtargets(input_prefix): 193 | """ 194 | calls seqmap to find all close matches to a given sgRNA listed in guide fa file in genome fa file 195 | and prints results to an offtargets file 196 | """ 197 | global tf_counter 198 | genome_fa_file = input_prefix + '.fa' 199 | guide_fa_file = input_prefix + '_all_guides.fa' 200 | offtargets_file = input_prefix + '_offtargets.tsv' 201 | 202 | #break up sgrnas by n_probes 203 | tempfiles_in = [] 204 | with open(guide_fa_file) as guide_file_pointer: 205 | guide_file_pointer.seek(0) 206 | for k,g in itertools.groupby(enumerate(guide_file_pointer), 207 | key = lambda x:int(x[0]/(N_PROBES * 2))): 208 | lines = list(e for i,e in g) 209 | if len(lines) == 0 : continue 210 | f_in = tempfile.NamedTemporaryFile(mode='w', suffix='.{0}.probes.input.fa'.format(tf_counter), prefix='temp.', delete=False) 211 | tf_counter +=1 212 | for line in lines: f_in.writelines(line) 213 | tempfiles_in.append(f_in) 214 | f_in.close() 215 | tempfiles_out = [] 216 | 217 | #submit sgnra / probe scans {MAX_PROCESSES} at a time and wait for completion in groups 218 | for k,tfs_group in itertools.groupby(enumerate(tempfiles_in), key= lambda x:int(x[0] / MAX_PROCESSES)): 219 | f_out = tempfile.NamedTemporaryFile(mode='w', suffix='.{0}.seqmap.output'.format(tf_counter), prefix='temp.', delete=False) 220 | tf_counter +=1 221 | tempfiles_out.append(f_out) 222 | f_out.close() 223 | processes = [] 224 | 225 | for k,f_in in tfs_group: 226 | print "PROCESSING IN PARALLEL!!!" 227 | cmd = "seqmap-1.0.13-src/seqmap {0} {1} {2} {3} /output_all_matches /do_not_output_probe_without_match".format( 228 | MAX_MISMATCHES, f_in.name, genome_fa_file, f_out.name) 229 | processes.append(subprocess.Popen(cmd, shell=True)) 230 | for p in processes: 231 | p.communicate() 232 | 233 | #merge output and clean up 234 | for tf in tempfiles_in: 235 | try: 236 | os.remove(tf.name) 237 | except: 238 | continue 239 | with open(offtargets_file, "w") as offtargets_file_pointer: 240 | for tf in tempfiles_out: 241 | with open(tf.name) as f_out: 242 | lines = [] 243 | for i,l in enumerate(f_out): 244 | if i == 0: continue 245 | lines.append(l) 246 | if l[-1] != '\n': 247 | print l 248 | raise Exception() 249 | print "sorting {0} offtarget hits".format(len(lines)) 250 | lines_sorted = sorted(lines, key = lambda x:x.split('\t')[4]) 251 | os.remove(tf.name) 252 | offtargets_file_pointer.writelines(lines_sorted) 253 | return 254 | 255 | def make_db(input_prefix): 256 | """ 257 | Creates a sqlite database of guides and offtarget scores based on the offtargets file 258 | and outputs to database file 259 | """ 260 | offtargets_file = input_prefix + '_offtargets.tsv' 261 | offtarget_scores_file = input_prefix + '_offtarget_scores.csv' 262 | database_file = input_prefix + '_database.sqlite' 263 | to_db =[] 264 | ot_number = 0 265 | 266 | with open(offtargets_file) as otf: 267 | for ontarget_sequence,g in itertools.groupby(otf, key = lambda x:x.split("\t")[4]): 268 | rows = list(g) 269 | offtarget_sequences = [e.split('\t')[2] for e in rows] 270 | 271 | if len(ontarget_sequence) != GUIDE_LENGTH: 272 | print rows[0].split('\t') 273 | raise Exception("improperly formatted SGRNA") 274 | for i,s in enumerate(offtarget_sequences): 275 | if len(s) != GUIDE_LENGTH: 276 | raise Exception("improperly formatted OT") 277 | 278 | scores = [score_one_offtarget(ontarget_sequence, e) for e in offtarget_sequences] 279 | ot_number += 1 280 | if ot_number%10000 == 0: 281 | print ot_number, "guides scored" 282 | if ot_number == 10000: 283 | tic = time.clock() 284 | elif ot_number ==20000: 285 | toc = time.clock() 286 | print "time to process 10000 guides:", toc - tic 287 | if len(scores) == 0: raise Exception('ERROR: no matches for target.') 288 | total_score = score_sgrna(scores, has_ontarget = True) 289 | to_db.append((ontarget_sequence, total_score)) 290 | 291 | # make a csv file with unique isoforms 292 | os.system('touch ' + offtarget_scores_file) 293 | with open(offtarget_scores_file,'wb') as csvfile: 294 | mywriter = csv.writer(csvfile) 295 | for guide in to_db: 296 | mywriter.writerow(guide) 297 | 298 | print "opening sqlite connection to {0}".format(database_file) 299 | dbpath = os.path.abspath(database_file) 300 | if os.path.isfile(database_file): 301 | os.remove(database_file) 302 | dbaddress = "//{0}".format(dbpath) 303 | print dbaddress 304 | con = sqlite3.connect(dbaddress) 305 | cur = con.cursor() 306 | cur.execute("CREATE TABLE sgrnas (seq TEXT PRIMARY KEY, score NUM);") 307 | cur.executemany("INSERT INTO sgrnas (seq, score) VALUES (?, ?);", to_db) 308 | print "committing" 309 | con.commit() 310 | print "done creating databse with {0} entries".format(len(to_db)) 311 | 312 | def score_one_offtarget(sgrna_sequence, offtarget_sequence): 313 | """ 314 | scores a single offtarget match 315 | """ 316 | mismatches = numpy.array([i for i in range(GUIDE_LENGTH) 317 | if sgrna_sequence[i] != offtarget_sequence[i]]) 318 | if len(mismatches) == 0: 319 | score = 100 320 | else: 321 | score = 100 * (1 - weights[mismatches]).prod() 322 | if len(mismatches) > 1: 323 | mean_pairwise =float(sum(mismatches[1:] - mismatches[:-1])) / (len(mismatches)-1) 324 | mpw_factor = ((float((19-mean_pairwise))/19)*4 + 1) 325 | scl_factor = pow(len(mismatches),2) 326 | score = score / ( mpw_factor * scl_factor ) 327 | score = max([score,0]) 328 | return score 329 | 330 | def score_sgrna(scores, has_ontarget = True): 331 | """ 332 | computes a total score for an sgRNA guide sequence from all offtargets 333 | """ 334 | sum_scores = float(sum(scores)) 335 | norm_score = 100 / sum_scores 336 | return norm_score 337 | 338 | def remove_overlap(all_guides_sorted, spacing): 339 | """ 340 | removes guides that have cleavage site distances that are less than specified by spacing 341 | and returns a filtered list of guides 342 | """ 343 | all_guides_filtered = [all_guides_sorted[0]] 344 | prev_guide = all_guides_sorted[0] 345 | prev_loc = prev_guide[-1] 346 | 347 | for guide in all_guides_sorted[1:]: 348 | loc = guide[-1] 349 | if abs(loc - prev_loc) > spacing: 350 | all_guides_filtered.append(guide) 351 | prev_guide = guide 352 | prev_loc = loc 353 | 354 | return all_guides_filtered 355 | 356 | def get_sorted_guides(region, gene, GC_cutoff, spacing, input_prefix): 357 | """ 358 | returns a list of filtered guides in region sorted by distance to the start of the 359 | targeted region in the form: 360 | [name, spacer sequence, strand (b/t), chromosome, and cleavage site location] 361 | """ 362 | all_b_guides = [] 363 | all_t_guides = [] 364 | (b_guides, t_guides) = get_guides(region, GC_cutoff) 365 | #add location of cleavage site to each guide 366 | (b_guide_loc, t_guide_loc) = get_location(gene, b_guides, t_guides) 367 | for i, loc in enumerate(b_guide_loc): 368 | all_b_guides.append([gene["name"], b_guides[i][0], "b", gene["chrom"], loc]) 369 | for i, loc in enumerate(t_guide_loc): 370 | all_t_guides.append([gene["name"], t_guides[i][0], "t", gene["chrom"], loc]) 371 | all_guides = all_b_guides + all_t_guides #makes one nested list of bottom and top guides 372 | all_guides_sorted = sorted(all_guides, key=itemgetter(-1)) #sorts by location 373 | if len(all_guides_sorted) > 0: 374 | all_guides_filtered = remove_overlap(all_guides_sorted, spacing) 375 | else: 376 | return [] 377 | 378 | return all_guides_filtered 379 | 380 | def get_ot_guides(guide_list, input_prefix): 381 | """ 382 | connects to the offtarget database to fetch ot scores for guides in guide_list 383 | returns a list of guides with respective off-target scores 384 | """ 385 | database_file = input_prefix + '_database.sqlite' 386 | guides_scored = [] 387 | sequences = [] 388 | 389 | for guide in guide_list: 390 | sequences.append(guide[1]) 391 | 392 | dbpath = os.path.abspath(database_file) 393 | dbaddress = "//{0}".format(dbpath) 394 | conn = sqlite3.connect(dbaddress) 395 | c = conn.cursor() 396 | query = "select * FROM sgrnas WHERE seq IN ({0}) ORDER BY score".format(','.join(['?']*len(sequences))) 397 | try: 398 | c.execute(query, sequences) 399 | guides_scored = c.fetchall() 400 | 401 | except: 402 | print "did not find guides" 403 | 404 | return guides_scored 405 | 406 | def list_sgrnas(genes_file, input_prefix, GC_cutoff, spacing, guides_per_gene, gecko, sam): 407 | """ 408 | Returns a list of (ontarget) sgrna sequences using a genome file and list of 409 | transcription start sites form a .csv file. 410 | """ 411 | genome_2bit_file = input_prefix + '.2bit' 412 | tbf = twobitreader.TwoBitFile(genome_2bit_file) 413 | final_guides = [] 414 | 415 | with open(genes_file, 'rb') as gf: 416 | f = [row for row in csv.reader(gf.read().splitlines())] 417 | for i,l in enumerate(f): 418 | if i == 0: 419 | columns = l 420 | continue 421 | 422 | #fetch the current gene and region 423 | gene = dict([(columns[i],e) for i,e in enumerate(l)]) 424 | region_bounds = [long(gene["start"]), long(gene["end"]) + 1] 425 | region = tbf[gene["chrom"]][region_bounds[0]:region_bounds[1]] 426 | region = region.upper() 427 | if "N" in region: 428 | print "found N in target region of", gene["name"] 429 | continue 430 | 431 | #identify and filter guides that target region 432 | guides = get_sorted_guides(region, gene, GC_cutoff, spacing, input_prefix) 433 | if len(guides) == 0: 434 | continue 435 | 436 | #add offtarget scores to filtered guides and select guides with higher offtarget scores 437 | ot_guides_sql = get_ot_guides(guides, input_prefix) 438 | ot_guides_dict = dict(ot_guides_sql) 439 | for g in guides: 440 | spacer = g[1] 441 | g.append(ot_guides_dict[spacer]) 442 | 443 | #sort and add guides with the highest offtarget scores to final guides 444 | guides = sorted(guides, key=itemgetter(-1), reverse=True) 445 | 446 | if len(guides) <= guides_per_gene: 447 | final_guides.extend(guides) 448 | else: 449 | final_guides.extend(guides[:guides_per_gene]) 450 | 451 | # add gecko or sam flanking sequences to the spacer for the oligo library 452 | if sam or gecko: 453 | for guide in final_guides: 454 | spacer = guide[1] 455 | if gecko: 456 | oligo = gecko_flank[0] + spacer + gecko_flank[1] 457 | if sam: 458 | oligo = sam_flank[0] + spacer + sam_flank[1] 459 | guide.append(oligo) 460 | 461 | return final_guides 462 | 463 | def writecsv(data, filename): 464 | """ 465 | write data to a csv file 466 | """ 467 | with open(filename, 'wb') as csvfile: 468 | csvwriter = csv.writer(csvfile) 469 | for row in data: 470 | csvwriter.writerow(row) 471 | 472 | def __main__(): 473 | parser = argparse.ArgumentParser( 474 | description='Design oligo library sequences for custom library cloning') 475 | parser.add_argument('-o', '--output', type=str, dest='guide_file', 476 | help='output file name', default='final_guides.csv') 477 | parser.add_argument('-i', '--input', type=str, dest='input_prefix', 478 | help='input genome prefix', default='hg19') 479 | parser.add_argument('-g', '--genes', type=str, dest='genes_file', 480 | help='input gene file name', default='genes.csv') 481 | parser.add_argument('-gc', '--gc', type=int, dest='GC_cutoff', 482 | help='gc content cutoff', default=25) 483 | parser.add_argument('-s', '--spacing', type=int, dest='spacing', 484 | help='minimum spacing between cleavage sites', default=20) 485 | parser.add_argument('-n', '--guides-per-gene', type=int, dest='guides_per_gene', 486 | help='maximum number of guides per gene', default=3) 487 | parser.add_argument('-db', dest='db', help='use existing off-target database', action='store_false') 488 | parser.set_defaults(db=True) 489 | parser.add_argument('-gecko', dest='gecko', help='add gecko flanking sequences', action='store_true') 490 | parser.set_defaults(gecko=False) 491 | parser.add_argument('-sam', dest='sam', help='add sam flanking sequences', action='store_true') 492 | parser.set_defaults(sam=False) 493 | args = parser.parse_args() 494 | 495 | if args.db: 496 | generate_fa_files(args.input_prefix, args.genes_file, args.GC_cutoff) 497 | find_offtargets(args.input_prefix) 498 | make_db(args.input_prefix) 499 | final_guides = list_sgrnas(args.genes_file, args.input_prefix, args.GC_cutoff, args.spacing, args.guides_per_gene, args.gecko, args.sam) 500 | writecsv(final_guides, args.guide_file) 501 | 502 | if __name__ == "__main__": 503 | __main__() 504 | -------------------------------------------------------------------------------- /design_targeted_library.py: -------------------------------------------------------------------------------- 1 | #Supplementary Data 2: design_targeted_library.py 2 | 3 | import csv 4 | import sys 5 | import argparse 6 | 7 | # flanking sequences around spacer for gecko and sam libraries 8 | gecko_flank = ['TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCG', 'GTTTTAGAGCTAGAAATAGCAAGTTAAAATAAGGCTAGTCCGT'] 9 | sam_flank = ['TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCG', 'GTTTTAGAGCTAGGCCAACATGAGGATCACC'] 10 | 11 | def design_oligos(output_file, library_file, genes_file, gecko, sam): 12 | """ 13 | creates a list of spacers corresponding to the target genes, writes to output_file 14 | output_file: subset of spacers corresponding to the target genes 15 | library_file: all RefSeq genes with corresponding spacers 16 | genes_file: target genes 17 | """ 18 | # list of spacer sequences for the library 19 | spacer_list = [] 20 | 21 | # open gene file for list of target genes 22 | with open(genes_file, 'rb') as infile: 23 | target_genes = [row[0] for row in csv.reader(infile.read().splitlines())] 24 | 25 | # open library file for all RefSeq genes 26 | with open(library_file, 'rb') as infile: 27 | for row in csv.reader(infile.read().splitlines()): 28 | gene = row[0] 29 | spacer = row[1] 30 | 31 | # check if each gene is in the list of target genes 32 | if gene in target_genes: 33 | spacer_list.append(row) 34 | 35 | # add gecko or sam flanking sequences to the spacer for the oligo library 36 | if gecko: 37 | oligo = gecko_flank[0] + spacer + gecko_flank[1] 38 | spacer_list[-1].append(oligo) 39 | if sam: 40 | oligo = sam_flank[0] + spacer + sam_flank[1] 41 | spacer_list[-1].append(oligo) 42 | # sort spacer sequences in gene order 43 | spacer_list = sorted(spacer_list, key=lambda t: t[0]) 44 | 45 | # write spacer list to output file 46 | with open(output_file, 'wb') as outfile: 47 | csvwriter = csv.writer(outfile) 48 | for s in spacer_list: 49 | csvwriter.writerow(s) 50 | 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser( 53 | description='Design oligo library sequences for targeted library cloning') 54 | parser.add_argument('-o', '--output', type=str, dest='output_file', 55 | help='output file name', default='oligos.csv') 56 | parser.add_argument('-l', '--library', type=str, dest='library_file', 57 | help='input file name', default='annotated_library.csv') 58 | parser.add_argument('-g', '--genes', type=str, dest='genes_file', 59 | help='input file name', default='target_genes.csv') 60 | parser.add_argument('-gecko', dest='gecko', help='add gecko flanking sequences', action='store_true') 61 | parser.set_defaults(gecko=False) 62 | parser.add_argument('-sam', dest='sam', help='add sam flanking sequences', action='store_true') 63 | parser.set_defaults(sam=False) 64 | args = parser.parse_args() 65 | 66 | design_oligos(args.output_file, args.library_file, args.genes_file, args.gecko, args.sam) 67 | --------------------------------------------------------------------------------