└── Genome_Recoding.py


/Genome_Recoding.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | @Author: TS 27/11/2017
  3 |  
  4 | This script can automatically:
  5 |     1. identify all targeted codons & overlapping genes;
  6 |     2. report any embedded genes;
  7 |     3. split only overlapping genes where recoding will introduce mutation(s);
  8 |     4. identify the orientation of upstream and downstream overlapping genes;
  9 |     5. add synthetic insertion of customized size (20bp by default) in all split genes except when upstream gene is forward and downstream gene is reverse;
 10 |     6. re-annotate positions of all features in the genbank file after split;
 11 |     7. annotate synthetic insertion;
  12 |     8. recode the genome and annotate all recoded codons;
 13 |     9. export a final genbank file as output;
 14 |     10. print the total processing time.  
 15 | 
  16 | Usage: Just fill in the input section and click run. That is it!    
 17 | 
 18 | '''
 19 | #==============================================================================
 20 | # Input
 21 | #==============================================================================
 22 | Input_GenBank_File = 'mds AP012306 2.gb'
 23 | Output_GenBank_File = 'recoded_genome.gb'
 24 | 
 25 | recoding_scheme = [{'target': 'TAG', 'recode': 'TAA'},
 26 |                    {'target': 'TCG', 'recode': 'AGC'},
 27 |                    {'target': 'TCA', 'recode': 'AGT'}]
 28 | 
 29 | synthetic_insertion_size= 20
 30 | 
 31 | #==============================================================================
 32 | # Import Packages
 33 | #==============================================================================
 34 | from Bio import SeqIO
 35 | from Bio.Seq import Seq
 36 | import itertools 
 37 | import time
 38 | 
 39 | 
 40 | #==============================================================================
 41 | # Define functions 
 42 | #==============================================================================
 43 | # get a list of codon properties: 0 = codon, 1 = codon start, 2 = codon end, 3 = codon position set; 4 = strand ('f' or 'r'), 5= new genbank annotation
 44 | def get_codon(genbank, target_codon):  
 45 |     codon_pos_list = []
 46 |     if type(target_codon) != list:
 47 |         target_codon = [target_codon] 
 48 |     for feature in genbank.features:
 49 |         if feature.type == "CDS":
 50 |             start=feature.location.start.position
 51 |             end=feature.location.end.position
 52 |             sense=feature.strand   
 53 |             if sense == 1:                                                  
 54 |                 orf = genbank.seq[start:end]   
 55 |                 codon = [orf[i:i+3] for i in range(0,len(orf),3)]              # read codon in triplet
 56 |                 codon_pos = [{'codon': str(x),
 57 |                               'start': (i*3)+start,
 58 |                               'end': (i*3)+start+3,
 59 |                               'position': set(range((i*3)+start, (i*3)+start+3)), # set(range(start, end))
 60 |                               'strand': 'f'} for i, x in enumerate(codon) if x in target_codon] # extract codon info       
 61 |             else:
 62 |                 orf = genbank.seq[start:end].reverse_complement()              # if antisense, do reverse transcription
 63 |                 codon = [orf[i:i+3] for i in range(0,len(orf),3)]                          
 64 |                 codon_pos = [{'codon': str(x),
 65 |                               'start': ((len(codon)-i)*3)+start,
 66 |                               'end': ((len(codon)-i)*3)+start-3,
 67 |                               'position': set(range(((len(codon)-i)*3)+start-3, ((len(codon)-i)*3)+start)), # set(range(start, end))
 68 |                               'strand': 'r'} for i, x in enumerate(codon) if x in target_codon] 
 69 |             codon_pos_list.append(codon_pos)    
 70 |     return list(itertools.chain.from_iterable(codon_pos_list))
 71 | 
 72 | # get CDS propertie: 0 = index; 1 = gene; 2= start; 3 = end; 4 = strand
 73 | def get_CDS(genbank):
 74 |     count = 0
 75 |     CDS_info = []
 76 |     for feature in genbank.features:
 77 |         if feature.type == "CDS":
 78 |             start = feature.location.start.position
 79 |             end = feature.location.end.position
 80 |             sense = feature.strand
 81 |             strand = ['f' if x == 1 else 'r' for x in [sense]][0]
 82 |             try: 
 83 |                 gene = feature.qualifiers["gene"][0]
 84 |             except KeyError:
 85 |                 gene = 'NA'
 86 |             if sense == 1:
 87 |                 aa = genbank.seq[start:end].translate()
 88 |             else:
 89 |                 aa = genbank.seq[start:end].reverse_complement().translate()              
 90 |             CDS_info.append({'index': count, 'gene': gene, 'start': start, 'end': end,
 91 |                              'position': set(range(start, end)), 'strand': strand, 'aa': aa})
 92 |             count += 1
 93 |     return CDS_info
 94 | 
 95 | def report_embedded_genes(CDS): # CDS = output from get_CDS
 96 |     for i in range(1,len(CDS)):
 97 |         if len(CDS[i].get('position') & CDS[i-1].get('position')) == len(CDS[i].get('position')):
 98 |             print('Warning: {} at position {} is an embedded gene. '.format(CDS[i].get('gene'),CDS[i].get('start')) + 
 99 |                       'Please consider how this gene should be split')
100 |             
def get_overlap(codon_list, cds=None):
    """List consecutive CDS pairs whose overlap touches at least one target codon.

    Args:
        codon_list: Output of ``get_codon`` - dicts with at least ``'start'``
            and ``'position'`` (set of indices).
        cds: Output of ``get_CDS``.  When omitted, the module-level ``CDS``
            list is used, which is how the script section calls this function.

    Returns:
        A list with one dict per reported pair, with keys:

        * ``'index'``           - index of the downstream CDS in ``cds``.
        * ``'codon'``           - ``'start'`` values of the target codons
          that touch the overlap.
        * ``'upstream gene'`` / ``'downstream gene'`` - gene names.
        * ``'start'`` / ``'end'`` - lowest and highest index of the overlap.
          Both gene ends are treated as inclusive here, so ``'end'`` equals
          the smaller of the two CDS ``'end'`` values.
        * ``'orientation'``     - upstream strand letter followed by the
          downstream strand letter (e.g. ``'fr'``).
    """
    if cds is None:
        cds = CDS  # module-level list built by the script section

    overlap_list = []
    for index in range(1, len(cds)):
        upstream = cds[index - 1]
        downstream = cds[index]

        # Intersection of the two inclusive intervals [start, end]; this is
        # the same region the original obtained by intersecting two
        # range(start, end + 1) sets, without materialising whole genes.
        first = max(upstream.get('start'), downstream.get('start'))
        last = min(upstream.get('end'), downstream.get('end'))
        if first > last:
            continue

        overlap_region = set(range(first, last + 1))
        overlap_codon = [codon.get('start') for codon in codon_list
                         if codon.get('position') & overlap_region]
        if not overlap_codon:
            continue

        overlap_list.append({'index': index,
                             'codon': overlap_codon,
                             'upstream gene': upstream.get('gene'),
                             'downstream gene': downstream.get('gene'),
                             'start': first,
                             'end': last,
                             'orientation': upstream.get('strand') + downstream.get('strand')})
    return overlap_list
121 |                 
def recoding(seq, codon_list, scheme=None):
    """Return a copy of ``seq`` in which every listed codon is replaced.

    Args:
        seq: Sequence to recode (the script passes a Biopython ``Seq``).  It
            is not modified.
        codon_list: Output of ``get_codon`` - dicts with ``'codon'``,
            ``'start'``, ``'end'`` and ``'strand'``.  For strand ``'f'`` the
            slice ``[start:end]`` is overwritten; for any other strand the
            slice ``[end:start]`` is overwritten with the reverse complement
            of the replacement codon.
        scheme: List of ``{'target': ..., 'recode': ...}`` dicts.  Defaults
            to the module-level ``recoding_scheme``.

    Returns:
        The recoded sequence, converted back to an immutable sequence.

    Raises:
        KeyError: If a codon in ``codon_list`` has no entry in the scheme.
    """
    if scheme is None:
        scheme = recoding_scheme

    # Index the scheme once instead of scanning it for every codon.
    # setdefault keeps the first entry when a target is listed twice.
    replacement = {}
    for entry in scheme:
        replacement.setdefault(entry.get('target'), entry.get('recode'))

    # Seq.tomutable() is not available on every Biopython release; fall back
    # to constructing a MutableSeq directly when it is missing.
    to_mutable = getattr(seq, 'tomutable', None)
    if to_mutable is not None:
        mutable_seq = to_mutable()
    else:
        from Bio.Seq import MutableSeq
        mutable_seq = MutableSeq(seq)

    for codon in codon_list:
        start = codon.get('start')
        end = codon.get('end')
        recode = replacement[codon.get('codon')]
        if codon.get('strand') == 'f':
            mutable_seq[start:end] = recode
        else:
            # Reverse-strand codons carry swapped bounds (start > end) and
            # are stored on the record as their reverse complement.
            mutable_seq[end:start] = str(Seq(recode).reverse_complement())

    # Same compatibility concern for MutableSeq.toseq().
    to_seq = getattr(mutable_seq, 'toseq', None)
    if to_seq is not None:
        return to_seq()
    return Seq(mutable_seq)
133 | 
134 | 
135 |  
if __name__ == '__main__':
    from Bio.SeqFeature import FeatureLocation
    from Bio.SeqFeature import SeqFeature

    start_time = time.time()

    #==============================================================================
    # Get essential information of the genbank file
    #==============================================================================
    mds = SeqIO.read(Input_GenBank_File, "genbank")
    target_codon = [entry.get('target') for entry in recoding_scheme]
    codon_list = get_codon(mds, target_codon)       # every target codon inside a CDS
    # NOTE: keep CDS as a module-level name; get_overlap() looks it up as a global.
    CDS = get_CDS(mds)
    report_embedded_genes(CDS)
    overlap = get_overlap(codon_list)               # overlapping CDS pairs that contain target codons

    #==============================================================================
    # Identify overlaps that will introduce mutation after recoding
    #==============================================================================
    # Recode a second, throw-away copy of the record and compare translations.
    test_recoding = SeqIO.read(Input_GenBank_File, 'genbank')
    test_recoding.seq = recoding(test_recoding.seq, codon_list)
    CDS_test_recoding = get_CDS(test_recoding)

    # An overlap is kept when the protein of either flanking CDS changed.
    overlap_mutated = []
    for entry in overlap:
        index = entry.get('index')
        upstream_changed = CDS_test_recoding[index - 1].get('aa') != CDS[index - 1].get('aa')
        downstream_changed = CDS_test_recoding[index].get('aa') != CDS[index].get('aa')
        if upstream_changed or downstream_changed:
            overlap_mutated.append(index)

    #==============================================================================
    # Split overlapping genes
    #==============================================================================
    split = mds.seq
    # Walk the overlaps from the last one to the first so that coordinates of
    # overlaps not yet processed are unaffected by the bases added behind them.
    for entry in reversed(overlap):
        if entry.get('index') not in overlap_mutated:
            continue
        start = entry.get('start')
        end = entry.get('end')
        orientation = entry.get('orientation')
        # No synthetic insertion when upstream is forward and downstream is reverse.
        insertion = 0 if orientation == 'fr' else synthetic_insertion_size
        shift = (end - start) + insertion

        # Duplicate the overlap: everything up to `end`, then the insertion
        # (a copy of the `insertion` bases preceding `start`, empty for 'fr'),
        # then the sequence again from `start` onwards.
        split = split[:end] + split[start - insertion:start] + split[start:]

        # Every feature starting at or after the overlap moves right by `shift`.
        # The location is rebuilt from plain integer bounds; int() is used
        # because the legacy `.position` attribute is not available on every
        # Biopython release.
        for feature in mds.features:
            feature_start = int(feature.location.start)
            if feature_start >= start:
                feature_end = int(feature.location.end)
                feature.location = FeatureLocation(feature_start + shift,
                                                   feature_end + shift,
                                                   feature.location.strand)

        if orientation != 'fr':
            # Appended after the shift above, so it is only moved by overlaps
            # processed later (i.e. located further upstream).
            mds.features.append(SeqFeature(FeatureLocation(end, end + synthetic_insertion_size),
                                           qualifiers={"note": 'Synthetic Insertion'},
                                           type="misc_feature"))

    mds.seq = split

    #==============================================================================
    # Recode and annotate the genome
    #==============================================================================
    codon_list_after_split = get_codon(mds, target_codon)
    mds.seq = recoding(mds.seq, codon_list_after_split)

    # target codon -> replacement, first scheme entry wins.
    recode_for = {}
    for entry in recoding_scheme:
        recode_for.setdefault(entry.get('target'), entry.get('recode'))

    for codon in codon_list_after_split:
        annotation = codon.get('codon') + ' to ' + recode_for[codon.get('codon')]
        if codon.get('strand') == 'f':
            location = FeatureLocation(codon.get('start'), codon.get('end'))
        else:
            # reverse-strand codons carry swapped bounds (start > end)
            location = FeatureLocation(codon.get('end'), codon.get('start'))
        mds.features.append(SeqFeature(location,
                                       qualifiers={"note": annotation},
                                       type="misc_feature"))

    #==============================================================================
    # Export recoded genbank file and calculate processing time
    #==============================================================================
    SeqIO.write(mds, Output_GenBank_File, "genbank")

    end_time = time.time() - start_time
    print('Total processing time = {}'.format(time.strftime('%H:%M:%S', time.gmtime(end_time))))
216 |     
217 | 
218 | 
219 | 
220 | 
221 | 


--------------------------------------------------------------------------------