├── README.md
├── calculate_indel.py
├── count_spacers.py
├── design_library.py
└── design_targeted_library.py


/README.md:
--------------------------------------------------------------------------------
1 | # Screening_Protocols_manuscript
2 | Scripts from the Joung et al Nature Protocols 2016 manuscript on knockout and transcriptional activation screening
3 | 


--------------------------------------------------------------------------------
/calculate_indel.py:
--------------------------------------------------------------------------------
  1 | # Supplementary Data 4: calculate_indel.py
  2 | 
  3 | import difflib
  4 | import numpy as np
  5 | from scipy.stats import binom
  6 | from Bio import SeqIO
  7 | import argparse
  8 | import itertools
  9 | 
 10 | 
# Read filtering and alignment limits used by the indel_calc_window* functions.
READ_TRUNCATION = 20  # leading bases dropped from every read by indel_calc_window
HASH_READ_TRUNCATION = 0  # leading bases dropped from every read by indel_calc_window_hash
MIN_READ_LENGTH = 56  # truncated reads shorter than this are counted as rejected
MAX_AMBIGUOUS_BASES = 5  # reads with more 'N' characters than this are counted as rejected
MAX_INDEL_MISMATCH = 6  # indel_calc_window: more than 3 + 2 * this opcodes counts as misaligned
ERROR_TOLERANCE_THRESHOLD = 0.15  # compared with the misaligned-read percentage in file_calc

# Flank (in bases) kept on each side of the guide when file_calc cuts the
# target window, and how that flank is stepped between attempts.
INITIAL_SEARCH_WINDOW = 20  # 20 works well
SEARCH_INCREMENT = 3  # added to the window size after each attempt
MAX_SEARCH_WINDOW = 50  # window size limit referenced by file_calc's retry condition

KMER_SIZE = 15  # k-mer length used by generate_hash and indel_calc_window_hash

# File name template for single-sample output; '{}' receives the sample name.
SINGLE_FILE_STRUCTURE = '{}_out.csv'
 25 | 
 26 | 
 27 | def find_loc(guide, target):
 28 |     loc = target.find(guide)
 29 |     return (loc, loc + len(guide))
 30 | 
 31 | 
 32 | def rc(seq):
 33 |     base_pairs = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
 34 |     return ''.join(base_pairs[i] for i in seq[::-1])
 35 | 
 36 | 
 37 | def generate_hash(seq):
 38 |     kmer_index = {}
 39 |     for i in range(len(seq) - KMER_SIZE):
 40 |         kmer = seq[i:i + KMER_SIZE]
 41 |         if kmer in kmer_index:
 42 |             kmer_index[kmer] = None
 43 |         else:
 44 |             kmer_index[kmer] = i
 45 |     return kmer_index
 46 | 
 47 | 
 48 | def calc_mle(total_reads, indel_counts, background):
 49 |     indel_range = np.array(range(indel_counts))
 50 |     distrib = binom.pmf(indel_counts - indel_range, total_reads -
 51 |                         indel_range, background)
 52 |     if len(distrib) == 0:
 53 |         mle_freq = 0
 54 |     else:
 55 |         mle_freq = distrib.argmax() / float(total_reads)
 56 | 
 57 |     z = 1.96
 58 |     upper_bound = (total_reads * mle_freq + z**2 / 2 + z * np.sqrt(total_reads *
 59 |                                                                    mle_freq * (1 - mle_freq) + z**4 / 4)) / (total_reads + z**2)
 60 |     lower_bound = (total_reads * mle_freq + z**2 / 2 - z * np.sqrt(total_reads *
 61 |                                                                    mle_freq * (1 - mle_freq) + z**4 / 4)) / (total_reads + z**2)
 62 | 
 63 |     return mle_freq, lower_bound, upper_bound
 64 | 
 65 | 
 66 | def write_mle(sample_sheet, output_file, verbose, quiet):
 67 |     if not quiet:
 68 |         print 'Applying MLE correction'
 69 |     with open(output_file) as start_output_file:
 70 |         file_read = start_output_file.read().split('\n')
 71 |         output_header = file_read[0]
 72 |         output_text = file_read[1:-1]
 73 |     controls = []
 74 |     with open(sample_sheet) as in_handle:
 75 |         for i, l in enumerate(in_handle):
 76 |             if len(l.strip().split(',')) < 5:
 77 |                 print 'Sample and Control flags not detected'
 78 |                 break
 79 |             elif l.strip().split(',')[4][0].upper().strip() == 'C':
 80 |                 controls.append(i)
 81 |     background_list = [float(output_text[i].split(',')[
 82 |                              7]) / 100 for i in controls]
 83 |     background = sum(background_list) / len(background_list)
 84 | 
 85 |     with open(output_file, 'w') as out_handle:
 86 |         out_handle.write(output_header+'\n')
 87 |         for i, l in enumerate(output_text):
 88 |             if i in controls:
 89 |                 out_handle.write('{},{},{},{}\n'.format(l, 'NA', 'NA', 'NA'))
 90 |             else:
 91 |                 samp_data = [int(l.split(',')[i]) for i in (1, 2, 5, 6)]
 92 |                 mle_percentage, lower_bound, upper_bound = calc_mle(
 93 |                     sum(samp_data), samp_data[1], background)
 94 | 
 95 |                 out_handle.write('{},{},{},{}\n'.format(
 96 |                     l, mle_percentage, lower_bound, upper_bound))
 97 | 
 98 | 
 99 | def op_ver(opcodes):
100 |     '''
101 |     Designed to parse the opcodes from difflib.SequenceMatcher to generate edits. Detects if there are an odd number of edits
102 |     and if there are edits with intervening equal regions.
103 |     '''
104 |     ops = [x[0][0] for x in opcodes]
105 |     if len(ops) % 2:
106 |         # assumes read is longer than target
107 |         if not (ops[0] == 'd' and ops[-1] == 'd' and set(ops[1::2]) == set(['e'])):
108 |             return False
109 |         else:
110 |             proc_ops = [(x[0][0], x[3], x[4] - x[3], x[1], x[1] - x[2])
111 |                         for x in opcodes[2:-2:2]]
112 |             return proc_ops
113 |     else:
114 |         return False
115 | 
116 | 
def indel_calc_window_hash(seq_handle, target):
    '''
    Iterates through a SeqRecord iterator and classifies each read against the
    target window using the k-mer hash algorithm.

    seq_handle: iterable of records exposing a .seq attribute
    target: target window sequence
    returns: tuple of counts in the order (perfect matches, indels,
             misaligned reads, rejected reads, reads with miscalled bases,
             reads with replacements)
    '''
    perf_total, indel_total, err_total, rejected_total, miscall_total, replace_total = (
        0,) * 6
    # k-mer -> start position in the target (None for repeated k-mers)
    target_index = generate_hash(target)
    for readout in seq_handle:
        read = str(readout.seq)[HASH_READ_TRUNCATION:]
        if len(read) < MIN_READ_LENGTH or read.count('N') > MAX_AMBIGUOUS_BASES:  # filtering for junk
            rejected_total += 1
        elif target in read:
            perf_total += 1
        else:
            read_index = generate_hash(read)
            # read position -> target position, for k-mers that have a usable
            # (non-None) position in both the read and the target
            mapping = {}
            for kmer in read_index:
                if read_index[kmer] is not None and kmer in target_index and target_index[kmer] is not None:
                    mapping[read_index[kmer]] = target_index[kmer]
            if len(mapping) == 0:
                # no shared unambiguous k-mer: read cannot be placed on the target
                err_total += 1
            else:
                # per read position: (target position - read position) of the
                # k-mer starting there, or None when that position is unmapped;
                # the range runs past the end of the read
                index_diff = (
                    mapping[i] - i if i in mapping else None for i in range(len(read) + KMER_SIZE + 1))
                # run-length encode the offsets as [offset, run length]
                collapsed_dif = [[k, len(list(g))]
                                 for k, g in itertools.groupby(index_diff)]

                start = True
                indels = 0
                sing_mismatch = 0
                mult_mismatch = 0
                offset = 0
                if collapsed_dif[-1][0] is not None:
                    # the final run is expected to be unmapped; anything else is an error
                    err_total += 1
                else:
                    for el in collapsed_dif[:-1]:

                        if start:
                            # advance to first non nan location (trim back from
                            # start of read to first alignment)
                            if el[0] is not None:
                                offset = el[0]
                                start = False
                        if el[0] is not None:
                            # shift of this run relative to the first mapped run
                            doff = el[0] - offset
                            # append indel start loc to iloc and length of indel to ilen
                            # insertion deletion combinations are summarized as follows for computational simplicity
                            # insertion deletion with len(ins)>len(del) = insertion
                            # insertion deletion with len(ins)<len(del) = deletion
                            # insertion deletion with len(ins)==len(del) =
                            # mismatches (currently not considered indel)
                            if doff != 0:
                                indels += 1
                        else:
                            # unmapped run: a run of at most KMER_SIZE positions
                            # is tallied as a single mismatch, a longer run as a
                            # multi-base mismatch
                            if el[1] < (KMER_SIZE + 1):
                                sing_mismatch += 1
                            elif el[1] > (KMER_SIZE):
                                mult_mismatch += 1
                    # classify the read; precedence is indel, then replacement,
                    # then miscall, otherwise misaligned
                    if indels > 0:
                        indel_total += 1
                    elif mult_mismatch > 0:
                        replace_total += 1
                        # print collapsed_dif
                    elif sing_mismatch > 0:
                        miscall_total += 1
                    else:
                        err_total += 1

    return (perf_total, indel_total, err_total, rejected_total, miscall_total, replace_total)
186 | 
187 | 
def indel_calc_window(seq_handle, target):
    '''
    Classifies every read of a SeqRecord iterator against the target window
    using difflib alignments.

    Returns the counts as a tuple in the order (perfect matches, indels,
    misaligned reads, rejected reads, reads with miscalled bases, reads with
    replacements).
    '''
    perfect = indel = misaligned = rejected = miscalled = replaced = 0
    opcode_limit = 3 + MAX_INDEL_MISMATCH * 2
    for record in seq_handle:
        read = str(record.seq)[READ_TRUNCATION:]

        # junk filter: too short or too many ambiguous bases
        if len(read) < MIN_READ_LENGTH or read.count('N') > MAX_AMBIGUOUS_BASES:
            rejected += 1
            continue
        if target in read:
            perfect += 1
            continue

        opcodes = difflib.SequenceMatcher(
            None, read, target, autojunk=False).get_opcodes()
        # too many edits (indels + mismatches) to be trusted
        if len(opcodes) > opcode_limit:
            misaligned += 1
            continue

        # an even number of opcodes cannot be parsed by op_ver; retry with
        # the target trimmed by one base on each side
        if len(opcodes) % 2 == 0:
            opcodes = difflib.SequenceMatcher(
                None, read, target[1:-1], autojunk=False).get_opcodes()

        edits = op_ver(opcodes)
        if not edits:
            misaligned += 1
            continue

        tags = set(edit[0] for edit in edits)
        target_lengths = set(edit[2] for edit in edits)
        read_lengths = set(edit[4] for edit in edits)
        if (target_lengths | read_lengths | tags) == set(['r', 1, -1]):
            # only single-base replacements: interpreted as miscalled bases
            miscalled += 1
        elif tags == set('r'):
            # larger replacement regions (not insertions/deletions)
            replaced += 1
        else:
            indel += 1
    return (perfect, indel, misaligned, rejected, miscalled, replaced)
231 | 
232 | 
def file_calc(f_name, guide_loc, target, file_type, hash_flag):
    '''
    Attempts different windows to pass error threshold

    The target window is the guide plus window_size bases on each side. The
    read file is re-analysed with a progressively wider window until the
    percentage of misaligned reads drops to ERROR_TOLERANCE_THRESHOLD or the
    window would exceed MAX_SEARCH_WINDOW (or already spans the whole target).

    f_name: path of the read file
    guide_loc: (start, end) of the guide within target
    target: full target sequence
    file_type: format name passed to SeqIO.parse
    hash_flag: selects indel_calc_window_hash when true, else indel_calc_window
    returns: (counts tuple of the best attempt, note string); the note is
             empty when the threshold was met
    '''
    if hash_flag:
        algorithm = indel_calc_window_hash
    else:
        algorithm = indel_calc_window

    window_size = INITIAL_SEARCH_WINDOW
    min_error = None
    min_total = (0,) * 6

    while True:  # attempt windows while above threshold
        # clamp at 0: a negative start would wrap around to the end of target
        window_start = max(guide_loc[0] - window_size, 0)
        window_end = guide_loc[1] + window_size
        target_window = target[window_start:window_end]
        with open(f_name, 'rU') as f_handle:
            total_list = algorithm(
                SeqIO.parse(f_handle, file_type), target_window)

        err_total = total_list[2]
        rejected_total = total_list[3]
        usable_total = sum(total_list) - rejected_total
        if usable_total:
            error_percentage = float(err_total) / usable_total * 100
        else:
            # no usable reads at all: treat as the worst possible attempt
            error_percentage = 100.0

        # check if better than previously achieved (the first attempt always is)
        if min_error is None or error_percentage < min_error:
            min_error = error_percentage
            min_total = total_list

        if error_percentage <= ERROR_TOLERANCE_THRESHOLD:
            break
        if window_start == 0 and window_end >= len(target):
            # the window already covers the whole target; widening is pointless
            break
        window_size += SEARCH_INCREMENT
        if window_size > MAX_SEARCH_WINDOW:
            break

    note = ''
    if min_error > ERROR_TOLERANCE_THRESHOLD:
        note = 'Error threshold not met returning best attempt'
    return min_total, note
271 | 
272 | 
def prep_entry(f_name, guide, target, file_type, hash_flag):
    '''
    Locates the guide (or, failing that, its reverse complement) in the target
    and runs file_calc on that location.

    Returns (counts, note). When neither orientation is found the counts are
    all zero and the note says so.
    '''
    if guide in target:
        site = guide
    elif rc(guide) in target:
        site = rc(guide)
    else:
        return (0,) * 6, 'Guide not found in target sequence'

    total_list, note = file_calc(
        f_name, find_loc(site, target), target, file_type, hash_flag)
    return total_list, note
288 | 
289 | 
def whole_file_read(sample_sheet, file_type, output_file, hash_flag, mle, verbose, quiet):
    '''
    Reads through a complete file and constructs corresponding output file

    sample_sheet: csv with sample name, read file, guide and target in the
                  first four columns
    file_type: format name passed on to the read parser
    output_file: path of the csv to write
    hash_flag: selects the k-mer hash algorithm when true
    mle: when true the MLE correction is skipped (the command line passes the
         --no-mle flag here)
    verbose: prints one line per sample when true
    quiet: suppresses progress messages when true
    '''
    if not quiet:
        print('Reading input sheet from {}'.format(sample_sheet))
    if mle:
        mle_string = ''
    else:
        mle_string = ', MLE corrected rate, lower bound, upper bound'
    with open(sample_sheet) as in_handle, open(output_file, 'w') as out_handle:
        out_handle.write(
            'sample,perfect matches,indels,misaligned reads,reads below threshold, reads with miscalled bases, reads with replacements,indel percentage, notes{}\n'.format(mle_string))
        for l in in_handle:
            sample_name, file_name, guide, target = l.strip().split(',')[:4]
            if verbose:
                print('Analyzing sample {} from {}'.format(sample_name, file_name))
            guide = guide.upper().strip()
            target = target.upper().strip()
            total_list, note = prep_entry(
                file_name, guide, target, file_type, hash_flag)
            indel_total = total_list[1]
            # misaligned and rejected reads are excluded from the denominator
            rejected_total = total_list[2] + total_list[3]
            usable_total = sum(total_list) - rejected_total
            if usable_total:
                indel_rate = float(indel_total) / usable_total
            else:
                # e.g. guide not found in target: all counts are zero, so
                # report a rate of 0 instead of dividing by zero
                indel_rate = 0.0
            total_list_string = ','.join(str(s) for s in total_list)
            out_handle.write('{},{},{},{}\n'.format(
                sample_name, total_list_string, indel_rate, note))
    if not mle:
        write_mle(sample_sheet, output_file, verbose, quiet)
320 | 
321 | 
def single_entry_read(sample_sheet, file_type, input_name, hash_flag, verbose, quiet):
    '''
    Reads through a single sample

    Every sample-sheet row whose sample name equals input_name is analysed and
    its result row is written to the file named by SINGLE_FILE_STRUCTURE.

    sample_sheet: csv with sample name, read file, guide and target in the
                  first four columns
    file_type: format name passed on to the read parser
    input_name: name of the sample to process
    hash_flag: selects the k-mer hash algorithm when true
    verbose, quiet: accepted for call compatibility; not used here
    '''

    with open(sample_sheet) as in_handle:
        for l in in_handle:
            sample_name, file_name, guide, target = l.strip().split(',')[:4]
            if sample_name.strip() != input_name.strip():
                continue
            with open(SINGLE_FILE_STRUCTURE.format(input_name.strip()), 'w') as out_handle:
                guide = guide.upper().strip()
                target = target.upper().strip()
                total_list, note = prep_entry(
                    file_name, guide, target, file_type, hash_flag)

                indel_total = total_list[1]
                # misaligned and rejected reads are excluded from the denominator
                rejected_total = total_list[2] + total_list[3]
                usable_total = sum(total_list) - rejected_total
                if usable_total:
                    indel_rate = float(indel_total) / usable_total
                else:
                    # no usable reads (e.g. guide not found): avoid dividing by zero
                    indel_rate = 0.0

                total_list_string = ','.join(str(s) for s in total_list)

                out_handle.write('{},{},{},{}\n'.format(
                    sample_name, total_list_string, indel_rate, note))
346 | 
347 | 
def combine_files(sample_sheet, file_type, output_file, mle, verbose, quiet):
    '''
    Combines separately processed files

    For every sample in the sample sheet, the first line of its single-sample
    output file (named by SINGLE_FILE_STRUCTURE) is copied into output_file
    below a common header.

    sample_sheet: csv whose first column holds the sample names
    file_type: unused, accepted for call compatibility
    output_file: path of the combined csv to write
    mle: when true the MLE correction is skipped (the command line passes the
         --no-mle flag here)
    verbose, quiet: forwarded to write_mle
    '''
    if mle:
        mle_string = ''
    else:
        mle_string = ', MLE corrected rate, lower bound, upper bound'

    with open(sample_sheet) as in_handle, open(output_file, 'w') as out_handle:
        out_handle.write(
            'sample,perfect matches,indels,misaligned reads,reads below threshold, reads with miscalled bases, reads with replacements,indel percentage, notes{}\n'.format(mle_string))
        for l in in_handle:
            sample_name = l.strip().split(',')[0]
            # open for reading: the per-sample result must be copied, not truncated
            with open(SINGLE_FILE_STRUCTURE.format(sample_name.strip())) as samp_handle:
                row = samp_handle.readline()
            if row and not row.endswith('\n'):
                # keep one sample per line even if the file lacks a final newline
                row += '\n'
            out_handle.write(row)
    if not mle:
        write_mle(sample_sheet, output_file, verbose, quiet)
366 | 
# Command-line entry point: parses the options and dispatches to one of three
# modes (combine per-sample files, analyse a single sample, analyse the whole
# sample sheet).
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Analyze sequencing data for the presence of indels')
    # --combine and --sample cannot be given together; neither can
    # --verbose and --quiet
    combine = parser.add_mutually_exclusive_group()
    verbosity = parser.add_mutually_exclusive_group()
    combine.add_argument(
        '-c', '--combine', help='combines files generated by individual samples', action='store_true')
    parser.add_argument(
        '-f', '--fasta', help='reads fasta files (default is fastq)', action='store_true')
    parser.add_argument(
        '-no-m', '--no-mle', dest='nomle', help='does not calculate MLE', action='store_true')
    parser.add_argument('-o', '--output', dest='output_file',
                        help='output file name', default='calc_indel_out.csv')
    parser.add_argument(
        '-a', '--hash', help='uses alternative hashing algorithm', action='store_true')
    parser.add_argument('-i', '--input', dest='sample_sheet',
                        help='input file name', default='sample_sheet.csv')
    combine.add_argument('-s', '--sample', dest='input_name',
                         help='sample name for running in single sample mode')
    verbosity.add_argument(
        '-v', '--verbose', help='outputs verbose', action='store_true')
    verbosity.add_argument(
        '-q', '--quiet', help='supresses output', action='store_true')

    args = parser.parse_args()

    file_type = 'fasta' if args.fasta else 'fastq'
    # note: args.nomle is passed as the `mle` parameter, so a true value
    # there means "skip the MLE correction"
    if args.combine:
        combine_files(args.sample_sheet, file_type,
                      args.output_file, args.nomle, args.verbose, args.quiet)
    elif args.input_name:
        single_entry_read(args.sample_sheet, file_type,
                          args.input_name, args.hash, args.verbose, args.quiet)
    else:
        whole_file_read(args.sample_sheet, file_type,
                        args.output_file, args.hash, args.nomle, args.verbose, args.quiet)
403 | 


--------------------------------------------------------------------------------
/count_spacers.py:
--------------------------------------------------------------------------------
  1 | #Supplementary Data 3: count_spacers.py
  2 | 
  3 | from Bio import SeqIO
  4 | import csv
  5 | from collections import OrderedDict
  6 | import numpy as np
  7 | import sys
  8 | import argparse
  9 | 
 10 | KEY_REGION_START = 30 #start index of key region
 11 | KEY_REGION_END = 55 #end index of key region
 12 | KEY = "CGAAACACC" #identifies sequence before guide to determine guide position
 13 | 
 14 | def count_spacers(input_file, fastq_file, output_file, guide_g): 
 15 | 	"""
 16 | 	creates a dictionary with guide counts from fastq_file, writes to output_file
 17 | 	fastq_file: forward read fastq file
 18 | 	output_file: csv file to write guide dictionary to
 19 | 	dictionary: guide sequence as key, guide count as entry
 20 | 	"""
 21 | 
 22 | 	num_reads = 0 #total number of reads processed
 23 | 	perfect_matches = 0 # guides with perfect match to library
 24 | 	non_perfect_matches = 0 #number of guides without a perfect match to the library
 25 | 	key_not_found = 0 #count of reads where key was not found
 26 | 
 27 | 	# add 'G' to key sequence if included in library
 28 | 	if guide_g:
 29 | 		global KEY
 30 | 		KEY += "G"
 31 | 
 32 | 	# open library sequences and initiate dictionary of read counts for each guide
 33 | 	try:
 34 | 		with open(input_file, mode='rU') as infile: #rU mode is necessary for excel!  
 35 | 			reader = csv.reader(infile)
 36 | 			dictionary = {rows[0]:0 for rows in reader}
 37 | 	except:
 38 | 		print  'could not open', input_file
 39 | 	  
 40 | 	# open fastq file
 41 | 	try:
 42 | 		handle = open(fastq_file, "rU")
 43 | 	except:
 44 | 		print "could not find fastq file"
 45 | 		return
 46 | 
 47 | 	# process reads in fastq file
 48 | 	readiter = SeqIO.parse(handle, "fastq")
 49 | 	for record in readiter: #contains the seq and Qscore etc.
 50 | 		num_reads += 1
 51 | 		read_sequence = str.upper(str(record.seq))
 52 | 		key_region = read_sequence[KEY_REGION_START:KEY_REGION_END]
 53 | 		key_index = key_region.find(KEY)
 54 | 		if key_index >= 0:
 55 | 			start_index = key_index + KEY_REGION_START + len(KEY)
 56 | 			guide = read_sequence[start_index:(start_index + 20)]
 57 | 			if guide in dictionary:
 58 | 				dictionary[guide] += 1
 59 | 				perfect_matches += 1
 60 | 			else:
 61 | 				non_perfect_matches += 1
 62 | 		else:
 63 | 			key_not_found += 1
 64 | 
 65 | 	# create ordered dictionary with guides and respective counts and output as a csv file                      
 66 | 	dict_sorted = OrderedDict(sorted(dictionary.items(), key=lambda t: t[0]))
 67 | 	with open(output_file, 'w') as csvfile:
 68 | 		mywriter = csv.writer(csvfile, delimiter=',')
 69 | 		for guide in dict_sorted:
 70 | 			count = dict_sorted[guide]
 71 | 			mywriter.writerow([guide,count])
 72 | 
 73 | 	# percentage of guides that matched perfectly
 74 | 	percent_matched = round(perfect_matches/float(perfect_matches + non_perfect_matches) * 100, 1)
 75 | 	# percentage of undetected guides with no read counts
 76 | 	guides_with_reads = np.count_nonzero(dictionary.values())
 77 | 	guides_no_reads = len(dictionary.values()) - guides_with_reads
 78 | 	percent_no_reads = round(guides_no_reads/float(len(dictionary.values())) * 100, 1)
 79 | 	# skew ratio of top 10% to bottom 10% of guide counts
 80 | 	top_10 = np.percentile(dictionary.values(), 90)
 81 | 	bottom_10 = np.percentile(dictionary.values(), 10)
 82 | 	if top_10 != 0 and bottom_10 != 0:
 83 | 		skew_ratio = top_10/bottom_10
 84 | 	else:
 85 | 		skew_ratio = 'Not enough perfect matches to determine skew ratio'
 86 | 
 87 | 	# write analysis statistics to statistics.txt
 88 | 	with open('statistics.txt', 'w') as infile:
 89 | 		infile.write('Number of perfect guide matches: ' + str(perfect_matches) + '\n')
 90 | 		infile.write('Number of nonperfect guide matches: ' + str(non_perfect_matches) + '\n')
 91 | 		infile.write('Number of reads where key was not found: ' + str(key_not_found) + '\n')
 92 | 		infile.write('Number of reads processed: ' + str(num_reads) + '\n')
 93 | 		infile.write('Percentage of guides that matched perfectly: ' + str(percent_matched) + '\n')
 94 | 		infile.write('Percentage of undetected guides: ' + str(percent_no_reads) + '\n')
 95 | 		infile.write('Skew ratio of top 10% to bottom 10%: ' + str(skew_ratio))
 96 | 		infile.close()
 97 | 
 98 | 	handle.close()           
 99 | 	return 
100 | 	
101 | 
# Command-line entry point: parses the file names and the -no-g switch, then
# runs count_spacers once.
if __name__ == '__main__':
	parser = argparse.ArgumentParser(
		description='Analyze sequencing data for sgRNA library distribution')
	parser.add_argument('-f', '--fastq', type=str, dest='fastq_file',
						help='fastq file name', default='NGS.fastq')
	parser.add_argument('-o', '--output', type=str, dest='output_file',
						help='output file name', default='library_count.csv')
	parser.add_argument('-i', '--input', type=str, dest='input_file',
						help='input file name', default='library_sequences.csv')
	# guide_g defaults to True; passing -no-g stores False
	parser.add_argument('-no-g', dest='guide_g', help='presence of guanine before spacer', action='store_false')
	parser.set_defaults(guide_g=True)
	args = parser.parse_args()

	count_spacers(args.input_file, args.fastq_file, args.output_file, args.guide_g)
116 | 


--------------------------------------------------------------------------------
/design_library.py:
--------------------------------------------------------------------------------
  1 | #Supplementary Data 1: design_library.py
  2 | 
  3 | import argparse, tempfile, os, itertools, subprocess
  4 | import twobitreader
  5 | import sqlite3
  6 | import numpy
  7 | import time
  8 | import math
  9 | from operator import itemgetter
 10 | import csv
 11 | 
#guide design parameters
GUIDE_LENGTH = 20 #number of bases sliced out of a region for each guide
PAM_LIST = ['AGG', 'TGG', 'GGG', 'CGG'] #PAMs searched as-is for top guides and reverse-complemented for bottom guides
PAM_LENGTH = len(PAM_LIST[0]) #all PAMs are assumed to share the length of the first entry
CLEAVAGE_SITE = 17 #distance to 5' end of guide

#seqmap parameters
N_PROBES = 50000 #find_offtargets splits the guide fa file into temp files of N_PROBES * 2 lines
MAX_PROCESSES = 1 #number of temp input files grouped into one batch of seqmap processes
MAX_MISMATCHES = 3 #first argument on the seqmap command line
tf_counter = 0 #running counter embedded in temp file suffixes by find_offtargets
 23 | 
 24 | #weights for off-target score calculations
 25 | weights = numpy.array([0,0,0.014,0,0,0.395,0.317,0,0.389,0.079,0.445,0.508,0.613,0.851,0.732,0.828,0.615,0.804,0.685,0.583])
 26 | 
 27 | #flanking sequences around spacer for gecko and sam libraries
 28 | gecko_flank = ['TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCG', 'GTTTTAGAGCTAGAAATAGCAAGTTAAAATAAGGCTAGTCCGT']
 29 | sam_flank = ['TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCG', 'GTTTTAGAGCTAGGCCAACATGAGGATCACC']
 30 | 
 31 | def revcomp(sequence):
 32 | 	"""
 33 | 	returns the reverse complement of sequence
 34 | 	"""
 35 | 	basecomplement = {'A':'T', 'C':'G', 'T':'A', 'G':'C', 'N':'N'} 
 36 | 	letters = list(sequence) 
 37 | 	letters.reverse() 
 38 | 	dna = ''
 39 | 	for base in letters:
 40 | 		dna += basecomplement[base] 
 41 | 	return dna 
 42 | 
 43 | 
 44 | def indexList(s, item, i=0):
 45 | 	"""
 46 | 	make a list of indexes of item in s
 47 | 	"""
 48 | 	i_list = []
 49 | 	while True:
 50 | 		try:
 51 | 			i = s.index(item, i)
 52 | 			i_list.append(i)
 53 | 			i += 1
 54 | 		except:
 55 | 			break
 56 | 	return i_list
 57 | 
 58 | def Target_stretch(guide):
 59 | 	"""
 60 | 	returns true if guide does not contain any of the following homopolymer target stretches
 61 | 	"""
 62 | 	st1 = ('AAAA')
 63 | 	st2 = ('TTTT')
 64 | 	st3 = ('GGGG')
 65 | 	st4 = ('CCCC')
 66 | 
 67 | 	if not ((st1 in guide) or (st2 in guide) or (st3 in guide) or (st4 in guide)):
 68 | 		return 'true'
 69 | 
 70 | 
 71 | def GC_content(GC_cutoff, guide):
 72 | 	"""
 73 | 	takes guide sequence as input, returns true if GC content above threshold defined above
 74 | 	"""
 75 | 	N = guide.count("G")
 76 | 	N += guide.count("C")
 77 | 	percent = float(N)/len(guide)*100
 78 | 	if percent > GC_cutoff:
 79 | 		return True
 80 | 	else: 
 81 | 		return False
 82 | 
 83 | def get_b_guides(region, GC_cutoff, index_list):
 84 | 	"""
 85 | 	takes a sequence s and and a list of indices in sequence that indicate the start of the
 86 | 	reverse complement of the PAM sequence and returns a list of 20bp bottom guide sequences
 87 | 	that have been filtered for GC content and target stretch
 88 | 	"""
 89 | 	guides = []
 90 | 	for i in index_list:
 91 | 		if len(region) > i + GUIDE_LENGTH + PAM_LENGTH:
 92 | 			guide = (revcomp(region[i+PAM_LENGTH:i+PAM_LENGTH+GUIDE_LENGTH]))
 93 | 			if 'N' not in guide:
 94 | 				if  GC_content(GC_cutoff, guide) and Target_stretch(guide):
 95 | 					guides.append([guide, i])
 96 | 	return guides 
 97 | 
 98 | def get_t_guides(region, GC_cutoff, index_list):
 99 | 	"""
100 | 	takes a sequence s and and a list of indices in sequence that indicate the start of the
101 | 	PAM sequence and returns a list of 20bp top guide sequences that have been filtered
102 | 	for GC content and target stretch
103 | 	"""
104 | 	guides = []
105 | 	for i in index_list:
106 | 		if i > GUIDE_LENGTH:
107 | 			guide = (region[i-GUIDE_LENGTH:i])
108 | 			if 'N' not in guide:
109 | 				if  GC_content(GC_cutoff, guide) and Target_stretch(guide):
110 | 					guides.append([guide, i])
111 | 	return guides 
112 | 
113 | 
def get_location(gene, b_guides, t_guides):
	"""
	converts the region-relative index stored with each guide into a cleavage
	site position by adding gene["start"] and a strand-specific shift
	returns (bottom guide positions, top guide positions)
	"""
	bottom_shift = GUIDE_LENGTH + PAM_LENGTH - CLEAVAGE_SITE
	top_shift = CLEAVAGE_SITE - GUIDE_LENGTH

	b_guide_loc = []
	for entry in b_guides:
		b_guide_loc.append(long(gene["start"]) + bottom_shift + entry[1])

	t_guide_loc = []
	for entry in t_guides:
		t_guide_loc.append(long(gene["start"]) + top_shift + entry[1])

	return b_guide_loc, t_guide_loc
122 | 
def get_guides(region, GC_cutoff):
	"""
	collects the filtered bottom and top guides of region
	returns (bottom guides, top guides), each a list of [guide, index] pairs
	"""
	#locate every PAM (top strand) and every reverse-complemented PAM (bottom strand)
	bottom_hits = []
	top_hits = []
	for motif in PAM_LIST:
		bottom_hits += indexList(region, revcomp(motif))
		top_hits += indexList(region, motif)

	#turn the PAM positions into filtered guides
	bottom = get_b_guides(region, GC_cutoff, bottom_hits)
	top = get_t_guides(region, GC_cutoff, top_hits)
	return bottom, top
139 | 
def generate_fa_files(input_prefix, genes_file, GC_cutoff):
	"""
	generate a genome fa file from a genome 2bit file
	generate a guide fa file that contains filtered unique guides that target the genome
	at the regions specified in genes file

	input_prefix: path prefix; reads <prefix>.2bit, writes <prefix>.fa and
	<prefix>_all_guides.fa
	genes_file: csv file with a header row; the columns "chrom", "start", "end"
	and "name" are read
	GC_cutoff: GC percentage passed on to get_guides
	"""
	genome_2bit_file = input_prefix + '.2bit'
	genome_fa_file = input_prefix + '.fa'
	guide_fa_file = input_prefix + '_all_guides.fa'
	tbf = twobitreader.TwoBitFile(genome_2bit_file)
	all_guides = set([])

	#iterate through the chromosomes in the genome 2bit file and write to genome fa file
	#(sequences whose name contains '_' are skipped)
	with open(genome_fa_file, 'wb') as genome_fa:
		for chrom in tbf:
			if '_' not in chrom:
				region = tbf[chrom][0:].upper()
				genome_fa.write('>{0}\n'.format(chrom))
				genome_fa.write(region+'\n')

	#for each genomic region specified in genes file identify filtered unique guides
	#and write to guide fa file
	with open(genes_file, 'rb') as gf:
		f = [row for row in csv.reader(gf.read().splitlines())]
		for i,l in enumerate(f): # i is index, l is entry
			#the first row holds the column names
			if i == 0: 
				columns = l
				continue

			#fetch the current gene and region
			gene = dict([(columns[i],e) for i,e in enumerate(l)])
			#"end" is treated as inclusive, hence the + 1 for slicing
			region_bounds = [long(gene["start"]), long(gene["end"]) + 1]
			region = tbf[gene["chrom"]][region_bounds[0]:region_bounds[1]]
			region = region.upper()
			#regions containing any N are skipped entirely
			if "N" in region: 
				print "found N in target region of", gene["name"]
				continue

			#identify and filter guides that target region
			(b_guides, t_guides) = get_guides(region, GC_cutoff)
			current_guides = set([g[0] for g in (b_guides + t_guides)])
			all_guides = all_guides | current_guides

	#write filtered unique guides to an output guide_fa_file
	#(guides are numbered from 1 in the iteration order of the set)
	guide_count = 0
	with open(guide_fa_file, 'wb') as guide_fa:
		for guide in all_guides:
			guide_count += 1
			guide_fa.write('>{0}\n'.format(guide_count))
			guide_fa.write(guide+'\n')
190 | 
191 | 
def find_offtargets(input_prefix):
	"""
	calls seqmap to find all close matches to each sgRNA listed in the guide fa file
	within the genome fa file and writes the merged results to an offtargets file

	input_prefix: prefix shared by all files; reads <input_prefix>.fa and
		<input_prefix>_all_guides.fa and writes <input_prefix>_offtargets.tsv

	The guides are split into temporary fasta files of N_PROBES records each, which
	are scanned by at most MAX_PROCESSES seqmap processes at a time. Every process
	writes to its own temporary output file so that concurrent scans cannot
	overwrite each other's results.

	raises Exception if a seqmap output line is not newline-terminated
	"""
	global tf_counter
	genome_fa_file = input_prefix + '.fa'
	guide_fa_file = input_prefix + '_all_guides.fa'
	offtargets_file = input_prefix + '_offtargets.tsv'

	#break up sgrnas by n_probes (each fasta record spans two lines: header and sequence)
	tempfiles_in = []
	with open(guide_fa_file) as guide_file_pointer:
		for k,g in itertools.groupby(enumerate(guide_file_pointer),
									 key = lambda x:x[0] // (N_PROBES * 2)):
			lines = [e for i,e in g]
			if len(lines) == 0 : continue
			f_in = tempfile.NamedTemporaryFile(mode='w', suffix='.{0}.probes.input.fa'.format(tf_counter), prefix='temp.', delete=False)
			tf_counter +=1
			f_in.writelines(lines)
			f_in.close()
			tempfiles_in.append(f_in.name)

	#submit sgrna / probe scans {MAX_PROCESSES} at a time and wait for completion in groups
	tempfiles_out = []
	for start in range(0, len(tempfiles_in), MAX_PROCESSES):
		processes = []
		for in_name in tempfiles_in[start:start + MAX_PROCESSES]:
			#one output file per scan: seqmap processes running in parallel must not share one
			f_out = tempfile.NamedTemporaryFile(mode='w', suffix='.{0}.seqmap.output'.format(tf_counter), prefix='temp.', delete=False)
			tf_counter +=1
			f_out.close()
			tempfiles_out.append(f_out.name)

			print("PROCESSING IN PARALLEL!!!")
			#argument list (no shell) so that file names are never re-parsed by a shell
			cmd = ["seqmap-1.0.13-src/seqmap", str(MAX_MISMATCHES), in_name, genome_fa_file,
				   f_out.name, "/output_all_matches", "/do_not_output_probe_without_match"]
			processes.append(subprocess.Popen(cmd))
		for p in processes:
			p.communicate()

	#merge output and clean up
	for in_name in tempfiles_in:
		try:
			os.remove(in_name)
		except OSError:
			#best-effort cleanup: a leftover temporary input file is harmless
			continue
	with open(offtargets_file, "w") as offtargets_file_pointer:
		for out_name in tempfiles_out:
			lines = []
			with open(out_name) as f_out:
				for i,l in enumerate(f_out):
					#the first line of every output file is skipped (presumably seqmap's header row)
					if i == 0: continue
					if l[-1] != '\n':
						print(l)
						raise Exception("truncated seqmap output line in {0}".format(out_name))
					lines.append(l)
			os.remove(out_name)
			print("sorting {0} offtarget hits".format(len(lines)))
			#sort on column 4 so that all hits of one guide are adjacent in the merged file
			lines_sorted = sorted(lines, key = lambda x:x.split('\t')[4])
			offtargets_file_pointer.writelines(lines_sorted)
	return
254 | 
def make_db(input_prefix):
	"""
	Creates a sqlite database of guides and offtarget scores based on the offtargets file
	and outputs to database file

	input_prefix: prefix shared by all files; reads <input_prefix>_offtargets.tsv and
		writes <input_prefix>_offtarget_scores.csv and <input_prefix>_database.sqlite
		(an existing database file is deleted and rebuilt)

	The offtargets file is consumed in runs of consecutive lines sharing column 4
	(the guide sequence); column 2 of each line is the matched sequence that gets scored.

	raises Exception if a guide or matched sequence is not GUIDE_LENGTH long
	"""
	offtargets_file = input_prefix + '_offtargets.tsv'
	offtarget_scores_file = input_prefix + '_offtarget_scores.csv'
	database_file = input_prefix + '_database.sqlite'
	to_db =[]
	ot_number = 0

	with open(offtargets_file) as otf:
		for ontarget_sequence,g in itertools.groupby(otf, key = lambda x:x.split("\t")[4]):
			rows = list(g)
			offtarget_sequences = [e.split('\t')[2] for e in rows]

			if len(ontarget_sequence) != GUIDE_LENGTH:
				print(rows[0].split('\t'))
				raise Exception("improperly formatted SGRNA")
			for s in offtarget_sequences:
				if len(s) != GUIDE_LENGTH:
					raise Exception("improperly formatted OT")

			scores = [score_one_offtarget(ontarget_sequence, e) for e in offtarget_sequences]
			ot_number += 1
			if ot_number%10000 == 0:
				print("{0} guides scored".format(ot_number))
				#time the second batch of 10000 guides as a progress estimate
				if ot_number == 10000:
					tic = time.clock()
				elif ot_number ==20000:
					toc = time.clock()
					print("time to process 10000 guides: {0}".format(toc - tic))
			#groupby never yields an empty group, so scores always has at least one entry
			total_score = score_sgrna(scores, has_ontarget = True)
			to_db.append((ontarget_sequence, total_score))

	# make a csv file with one (guide sequence, total score) row per guide
	# (opening for writing creates the file, no separate touch is needed)
	with open(offtarget_scores_file,'wb') as csvfile:
		mywriter = csv.writer(csvfile)
		for guide in to_db:
			mywriter.writerow(guide)

	print("opening sqlite connection to {0}".format(database_file))
	dbpath = os.path.abspath(database_file)
	if os.path.isfile(database_file):
		os.remove(database_file)
	dbaddress = "//{0}".format(dbpath)
	print(dbaddress)
	con = sqlite3.connect(dbaddress)
	try:
		cur = con.cursor()
		cur.execute("CREATE TABLE sgrnas (seq TEXT PRIMARY KEY, score NUM);")
		cur.executemany("INSERT INTO sgrnas (seq, score) VALUES (?, ?);", to_db)
		print("committing")
		con.commit()
	finally:
		#release the database file even when table creation or insertion fails
		con.close()
	print("done creating databse with {0} entries".format(len(to_db)))
311 | 	
def score_one_offtarget(sgrna_sequence, offtarget_sequence):
	"""
	scores a single offtarget match by comparing the first GUIDE_LENGTH positions of
	the sgRNA and the offtarget sequence

	returns 100 for a perfect match; otherwise 100 times the product of
	(1 - weights[position]) over all mismatched positions, which for two or more
	mismatches is further divided by a spacing factor and by the squared mismatch count
	"""
	mismatch_positions = numpy.array([pos for pos in range(GUIDE_LENGTH)
				  if sgrna_sequence[pos] != offtarget_sequence[pos]])
	n_mismatches = len(mismatch_positions)

	if n_mismatches == 0:
		return 100

	#weights is the module-level array indexed by mismatch position
	score = 100 * (1 - weights[mismatch_positions]).prod()
	if n_mismatches == 1:
		return score

	#mean distance between neighbouring mismatches: closely spaced mismatches
	#give a larger divisor and therefore a lower score
	gaps = mismatch_positions[1:] - mismatch_positions[:-1]
	mean_pairwise = float(sum(gaps)) / (n_mismatches - 1)
	mpw_factor = (float(19 - mean_pairwise) / 19) * 4 + 1
	scl_factor = pow(n_mismatches, 2)
	return max([score / (mpw_factor * scl_factor), 0])
329 | 
def score_sgrna(scores, has_ontarget = True):
	"""
	collapses the offtarget scores of one sgRNA guide sequence into a total score:
	100 divided by the sum of all scores (has_ontarget is accepted but not used)
	"""
	return 100 / float(sum(scores))
337 | 
def remove_overlap(all_guides_sorted, spacing):
	"""
	removes guides that have cleavage site distances that are less than specified by spacing
	and returns a filtered list of guides

	all_guides_sorted: guides ordered by cleavage site location, which is the last
		element of every guide
	spacing: a guide is kept only when its location differs from the previously kept
		guide's location by more than this value

	The first guide is always kept; an empty input returns an empty list.
	"""
	all_guides_filtered = []
	prev_loc = None

	for guide in all_guides_sorted:
		loc = guide[-1]
		#distances are measured against the last guide that was kept, not the last one seen
		if prev_loc is None or abs(loc - prev_loc) > spacing:
			all_guides_filtered.append(guide)
			prev_loc = loc

	return all_guides_filtered
355 | 
def get_sorted_guides(region, gene, GC_cutoff, spacing, input_prefix):
	"""
	finds the guides in region, orders them by cleavage site location and drops guides
	that lie too close to the previously kept one (see spacing)

	returns a list with one entry per remaining guide in the form:
	[name, spacer sequence, strand (b/t), chromosome, and cleavage site location]
	or an empty list when the region contains no guides (input_prefix is not used)
	"""
	(b_guides, t_guides) = get_guides(region, GC_cutoff)
	#cleavage site of every bottom and top strand guide
	(b_guide_loc, t_guide_loc) = get_location(gene, b_guides, t_guides)

	#bottom strand guides first, then top strand guides, each annotated with its site
	annotated = [[gene["name"], b_guides[idx][0], "b", gene["chrom"], site]
				 for idx, site in enumerate(b_guide_loc)]
	annotated += [[gene["name"], t_guides[idx][0], "t", gene["chrom"], site]
				  for idx, site in enumerate(t_guide_loc)]

	#order by cleavage site location (last element)
	annotated.sort(key=itemgetter(-1))
	if not annotated:
		return []
	return remove_overlap(annotated, spacing)
379 | 
def get_ot_guides(guide_list, input_prefix):
	"""
	connects to the offtarget database to fetch ot scores for guides in guide_list
	returns a list of guides with respective off-target scores

	guide_list: guides whose element [1] is the spacer sequence
	input_prefix: the database is read from <input_prefix>_database.sqlite

	returns a list of (spacer sequence, score) rows in ascending score order; guides
	missing from the database are left out, and an empty list is returned (after
	printing a notice) when the database cannot be queried
	"""
	database_file = input_prefix + '_database.sqlite'
	#one lookup per distinct spacer, in a fixed order so equal scores sort reproducibly
	sequences = sorted(set(guide[1] for guide in guide_list))
	if not sequences:
		return []

	#SQLite caps the number of bound parameters per statement (999 by default in
	#older versions), so large guide lists are looked up in batches
	batch_size = 500

	dbpath = os.path.abspath(database_file)
	dbaddress = "//{0}".format(dbpath)
	conn = sqlite3.connect(dbaddress)
	guides_scored = []
	try:
		c = conn.cursor()
		for start in range(0, len(sequences), batch_size):
			batch = sequences[start:start + batch_size]
			query = "select * FROM sgrnas WHERE seq IN ({0})".format(','.join(['?']*len(batch)))
			c.execute(query, batch)
			guides_scored.extend(c.fetchall())
	except sqlite3.Error:
		print("did not find guides")
		guides_scored = []
	finally:
		conn.close()

	#batches arrive unordered relative to each other: restore the score ordering
	guides_scored.sort(key=lambda row: row[1])
	return guides_scored
405 | 
406 | def list_sgrnas(genes_file, input_prefix, GC_cutoff, spacing, guides_per_gene, gecko, sam):
407 | 	"""
408 | 	Returns a list of (ontarget) sgrna sequences using a genome file and list of
409 | 	transcription start sites form a .csv file.
410 | 	"""
411 | 	genome_2bit_file = input_prefix + '.2bit'
412 | 	tbf = twobitreader.TwoBitFile(genome_2bit_file)
413 | 	final_guides = []
414 | 
415 | 	with open(genes_file, 'rb') as gf:
416 | 		f = [row for row in csv.reader(gf.read().splitlines())]
417 | 		for i,l in enumerate(f):
418 | 			if i == 0: 
419 | 				columns = l
420 | 				continue
421 | 
422 | 			#fetch the current gene and region
423 | 			gene = dict([(columns[i],e) for i,e in enumerate(l)])
424 | 			region_bounds = [long(gene["start"]), long(gene["end"]) + 1]
425 | 			region = tbf[gene["chrom"]][region_bounds[0]:region_bounds[1]]
426 | 			region = region.upper()
427 | 			if "N" in region: 
428 | 				print "found N in target region of", gene["name"]
429 | 				continue
430 | 
431 | 			#identify and filter guides that target region
432 | 			guides = get_sorted_guides(region, gene, GC_cutoff, spacing, input_prefix)
433 | 			if len(guides) == 0:
434 | 				continue
435 | 
436 | 			#add offtarget scores to filtered guides and select guides with higher offtarget scores
437 | 			ot_guides_sql = get_ot_guides(guides, input_prefix) 
438 | 			ot_guides_dict = dict(ot_guides_sql)
439 | 			for g in guides:
440 | 				spacer = g[1]
441 | 				g.append(ot_guides_dict[spacer])
442 | 
443 | 			#sort and add guides with the highest offtarget scores to final guides
444 | 			guides = sorted(guides, key=itemgetter(-1), reverse=True)
445 | 
446 | 			if len(guides) <= guides_per_gene:
447 | 				final_guides.extend(guides)
448 | 			else:
449 | 				final_guides.extend(guides[:guides_per_gene])
450 | 
451 | 	# add gecko or sam flanking sequences to the spacer for the oligo library
452 | 	if sam or gecko:
453 | 		for guide in final_guides:
454 | 			spacer = guide[1]
455 | 			if gecko:
456 | 				oligo = gecko_flank[0] + spacer + gecko_flank[1]
457 | 			if sam:
458 | 				oligo = sam_flank[0] + spacer + sam_flank[1]
459 | 			guide.append(oligo)
460 | 
461 | 	return final_guides
462 | 
def writecsv(data, filename):
	"""
	writes every row of data to filename in csv format, replacing an existing file
	"""
	with open(filename, 'wb') as handle:
		csv.writer(handle).writerows(data)
471 | 
def __main__():
	"""
	command-line entry point: parses the options, rebuilds the off-target database
	unless -db is given, then selects the guides and writes them to the output csv
	"""
	parser = argparse.ArgumentParser(
		description='Design oligo library sequences for custom library cloning')

	#options that take a value: (flags, type, dest, help, default)
	value_options = [
		(('-o', '--output'), str, 'guide_file', 'output file name', 'final_guides.csv'),
		(('-i', '--input'), str, 'input_prefix', 'input genome prefix', 'hg19'),
		(('-g', '--genes'), str, 'genes_file', 'input gene file name', 'genes.csv'),
		(('-gc', '--gc'), int, 'GC_cutoff', 'gc content cutoff', 25),
		(('-s', '--spacing'), int, 'spacing', 'minimum spacing between cleavage sites', 20),
		(('-n', '--guides-per-gene'), int, 'guides_per_gene', 'maximum number of guides per gene', 3),
	]
	for flags, value_type, dest, text, default in value_options:
		parser.add_argument(*flags, type=value_type, dest=dest, help=text, default=default)

	#switches: (flag, dest, help, action, default)
	#db defaults to True (build the database); passing -db stores False and reuses it
	switches = [
		('-db', 'db', 'use existing off-target database', 'store_false', True),
		('-gecko', 'gecko', 'add gecko flanking sequences', 'store_true', False),
		('-sam', 'sam', 'add sam flanking sequences', 'store_true', False),
	]
	for flag, dest, text, action, default in switches:
		parser.add_argument(flag, dest=dest, help=text, action=action, default=default)

	options = parser.parse_args()
	prefix = options.input_prefix

	if options.db:
		generate_fa_files(prefix, options.genes_file, options.GC_cutoff)
		find_offtargets(prefix)
		make_db(prefix)
	selected = list_sgrnas(options.genes_file, prefix, options.GC_cutoff, options.spacing,
						   options.guides_per_gene, options.gecko, options.sam)
	writecsv(selected, options.guide_file)
501 | 
#run the command-line entry point only when this file is executed as a script
if __name__ == "__main__":
	__main__()
504 | 


--------------------------------------------------------------------------------
/design_targeted_library.py:
--------------------------------------------------------------------------------
 1 | #Supplementary Data 2: design_targeted_library.py
 2 | 
 3 | import csv
 4 | import sys
 5 | import argparse
 6 | 
# flanking sequences around spacer for gecko and sam libraries
# each list is [sequence placed before the spacer, sequence placed after the spacer]
gecko_flank = ['TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCG', 'GTTTTAGAGCTAGAAATAGCAAGTTAAAATAAGGCTAGTCCGT']
sam_flank = ['TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCG', 'GTTTTAGAGCTAGGCCAACATGAGGATCACC']
10 | 
def _open_csv(path, mode):
	"""
	opens path for csv reading (mode 'r') or writing (mode 'w') the way the csv module
	expects on the running interpreter: binary mode on Python 2, text mode without
	newline translation on Python 3
	"""
	if sys.version_info[0] >= 3:
		return open(path, mode, newline='')
	return open(path, mode + 'b')

def design_oligos(output_file, library_file, genes_file, gecko, sam):
	"""
	creates a list of spacers corresponding to the target genes, writes to output_file
	output_file: subset of spacers corresponding to the target genes
	library_file: all RefSeq genes with corresponding spacers
	genes_file: target genes
	gecko: when set, append the spacer flanked by the gecko sequences to each row
	sam: when set, append the spacer flanked by the sam sequences to each row

	The first column of genes_file and library_file is the gene, the second column of
	library_file is the spacer. Blank lines in either file are ignored. Output rows
	are sorted by gene and keep their library order within one gene.
	"""
	# open gene file for list of target genes (a set, for constant-time lookups)
	with _open_csv(genes_file, 'r') as infile:
		target_genes = set(row[0] for row in csv.reader(infile.read().splitlines()) if row)

	# list of spacer sequences for the library
	spacer_list = []

	# open library file for all RefSeq genes
	with _open_csv(library_file, 'r') as infile:
		for row in csv.reader(infile.read().splitlines()):
			# skip blank lines and genes that are not targeted
			if not row or row[0] not in target_genes:
				continue
			spacer = row[1]
			spacer_list.append(row)

			# add gecko or sam flanking sequences to the spacer for the oligo library
			if gecko:
				row.append(gecko_flank[0] + spacer + gecko_flank[1])
			if sam:
				row.append(sam_flank[0] + spacer + sam_flank[1])

	# sort spacer sequences in gene order
	spacer_list = sorted(spacer_list, key=lambda t: t[0])

	# write spacer list to output file
	with _open_csv(output_file, 'w') as outfile:
		csv.writer(outfile).writerows(spacer_list)
50 | 
if __name__ == '__main__':
	# command-line entry point: collect the file names and library flags, then
	# write the oligos for the target genes
	parser = argparse.ArgumentParser(
		description='Design oligo library sequences for targeted library cloning')

	# file name options: (flags, dest, help, default)
	for flags, dest, text, default in (
			(('-o', '--output'), 'output_file', 'output file name', 'oligos.csv'),
			(('-l', '--library'), 'library_file', 'input file name', 'annotated_library.csv'),
			(('-g', '--genes'), 'genes_file', 'input file name', 'target_genes.csv')):
		parser.add_argument(*flags, type=str, dest=dest, help=text, default=default)

	# library switches, both off unless given: (flag, dest, help)
	for flag, dest, text in (
			('-gecko', 'gecko', 'add gecko flanking sequences'),
			('-sam', 'sam', 'add sam flanking sequences')):
		parser.add_argument(flag, dest=dest, help=text, action='store_true', default=False)

	args = parser.parse_args()

	design_oligos(args.output_file, args.library_file, args.genes_file, args.gecko, args.sam)
67 | 


--------------------------------------------------------------------------------