└── Genome_Recoding.py /Genome_Recoding.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author: TS 27/11/2017 3 | 4 | This script can automatically: 5 | 1. identify all targeted codons & overlapping genes; 6 | 2. report any embedded genes; 7 | 3. split only overlapping genes where recoding will introduce mutation(s); 8 | 4. identify the orientation of upstream and downstream overlapping genes; 9 | 5. add synthetic insertion of customized size (20bp by default) in all split genes except when upstream gene is forward and downstream gene is reverse; 10 | 6. re-annotate positions of all features in the genbank file after split; 11 | 7. annotate synthetic insertion; 12 | 8. recode the genome and annotate all recoded codons; 13 | 9. export a final genbank file as output; 14 | 10. print the total processing time. 15 | 16 | Usage: Just fill in the input section and click run. That is it! 17 | 18 | ''' 19 | #============================================================================== 20 | # Input 21 | #============================================================================== 22 | Input_GenBank_File = 'mds AP012306 2.gb' 23 | Output_GenBank_File = 'recoded_genome.gb' 24 | 25 | recoding_scheme = [{'target': 'TAG', 'recode': 'TAA'}, 26 | {'target': 'TCG', 'recode': 'AGC'}, 27 | {'target': 'TCA', 'recode': 'AGT'}] 28 | 29 | synthetic_insertion_size= 20 30 | 31 | #============================================================================== 32 | # Import Packages 33 | #============================================================================== 34 | from Bio import SeqIO 35 | from Bio.Seq import Seq 36 | import itertools 37 | import time 38 | 39 | 40 | #============================================================================== 41 | # Define functions 42 | #============================================================================== 43 | # get a list of codon properties: 0 = codon, 1 = codon start, 2 = codon end, 3 
#   = codon position set; 4 = strand ('f' or 'r')
def get_codon(genbank, target_codon):
    """Locate every in-frame occurrence of the target codons in CDS features.

    Each CDS is read in triplets starting from its own 5' end: forward genes
    from ``location.start``, reverse genes from ``location.end`` on the
    reverse-complemented sequence.

    Args:
        genbank: record exposing ``features`` and a sliceable ``seq`` (the
            script passes the object returned by ``SeqIO.read``).
        target_codon: a single codon or a list/tuple/set of codons.

    Returns:
        A flat list of dicts, in feature order, with the keys:
          'codon'    - the matched codon as read on the coding strand;
          'start'    - forward: first base of the codon;
                       reverse: exclusive upper bound of the codon;
          'end'      - forward: exclusive upper bound of the codon;
                       reverse: first (lowest) base of the codon;
          'position' - set of the three record coordinates covered;
          'strand'   - 'f' or 'r'.
        For reverse codons ``seq[end:start]`` therefore holds the reverse
        complement of 'codon'.
    """
    if not isinstance(target_codon, (list, tuple, set, frozenset)):
        target_codon = [target_codon]
    targets = {str(codon) for codon in target_codon}

    codon_pos_list = []
    for feature in genbank.features:
        if feature.type != "CDS":
            continue
        start = int(feature.location.start)
        end = int(feature.location.end)
        if feature.location.strand == 1:
            orf = genbank.seq[start:end]
            for offset in range(0, len(orf), 3):  # read codons in triplets
                codon = str(orf[offset:offset + 3])
                if codon in targets:
                    low = start + offset
                    codon_pos_list.append({'codon': codon,
                                           'start': low,
                                           'end': low + 3,
                                           'position': set(range(low, low + 3)),
                                           'strand': 'f'})
        else:
            # antisense gene: read the reverse complement from its 5' end
            orf = genbank.seq[start:end].reverse_complement()
            for offset in range(0, len(orf), 3):
                codon = str(orf[offset:offset + 3])
                if codon in targets:
                    # Anchor on the feature end, not on the codon count, so
                    # the coordinates stay correct even when the CDS length
                    # is not a multiple of three.
                    high = end - offset
                    codon_pos_list.append({'codon': codon,
                                           'start': high,
                                           'end': high - 3,
                                           'position': set(range(high - 3, high)),
                                           'strand': 'r'})
    return codon_pos_list


# get CDS properties: index, gene, start, end, position set, strand, aa
def get_CDS(genbank):
    """Summarise every CDS feature of a record.

    Args:
        genbank: record exposing ``features`` and a sliceable ``seq``.

    Returns:
        A list of dicts, one per CDS in feature order, with the keys
        'index' (running CDS number from 0), 'gene' (first value of the
        "gene" qualifier, or 'NA' when the qualifier is missing), 'start',
        'end', 'position' (``set(range(start, end))``), 'strand' ('f' when
        the location strand is 1, otherwise 'r') and 'aa' (translation of
        the coding strand).
    """
    CDS_info = []
    for feature in genbank.features:
        if feature.type != "CDS":
            continue
        start = int(feature.location.start)
        end = int(feature.location.end)
        forward = feature.location.strand == 1
        coding = genbank.seq[start:end]
        if not forward:
            coding = coding.reverse_complement()
        CDS_info.append({'index': len(CDS_info),
                         'gene': feature.qualifiers.get("gene", ['NA'])[0],
                         'start': start,
                         'end': end,
                         'position': set(range(start, end)),
                         'strand': 'f' if forward else 'r',
                         'aa': coding.translate()})
    return CDS_info


def report_embedded_genes(CDS):  # CDS = output from get_CDS
    """Warn about every CDS that lies entirely inside the CDS listed before it.

    Only neighbouring entries of ``CDS`` are compared.

    Args:
        CDS: list of dicts as produced by ``get_CDS``.

    Returns:
        The list of embedded entries (one warning line is printed for each).
    """
    embedded = []
    for previous, current in zip(CDS, CDS[1:]):
        if current.get('position') <= previous.get('position'):  # subset test
            print('Warning: {} at position {} is an embedded gene. '.format(current.get('gene'), current.get('start')) +
                  'Please consider how this gene should be split')
            embedded.append(current)
    return embedded


# get overlapping info: index, overlapping codons, upstream gene,
# downstream gene, start position, end position, orientation
def get_overlap(codon_list, cds=None):  # codon_list = output from get_codon
    """Find neighbouring CDS pairs whose overlap contains a target codon.

    Args:
        codon_list: list of dicts as produced by ``get_codon``.
        cds: list of dicts as produced by ``get_CDS``.  When omitted, the
            module-level ``CDS`` table created by the script section is used
            (the original calling convention).

    Returns:
        A list of dicts, one per reported pair, with the keys
          'index'           - position of the downstream CDS in ``cds``;
          'codon'           - 'start' values of the target codons touching
                              the overlap;
          'upstream gene' / 'downstream gene' - gene names of the pair;
          'start'           - the larger of the two CDS start coordinates;
          'end'             - the smaller of the two CDS end coordinates;
          'orientation'     - upstream strand + downstream strand, e.g. 'fr'.
        CDS end coordinates are treated as inclusive here, so two genes that
        merely abut are examined as a one-position overlap.
    """
    if cds is None:
        cds = CDS  # legacy behaviour: table built by the script section

    overlap_list = []
    for index in range(1, len(cds)):
        upstream = cds[index - 1]
        downstream = cds[index]
        # Intersection of two contiguous ranges, without materialising them.
        low = max(upstream.get('start'), downstream.get('start'))
        high = min(upstream.get('end'), downstream.get('end'))
        if low > high:
            continue  # the two genes do not touch
        overlap_codon = [codon.get('start') for codon in codon_list
                         if any(low <= base <= high for base in codon.get('position'))]
        if overlap_codon:
            overlap_list.append({'index': index,
                                 'codon': overlap_codon,
                                 'upstream gene': upstream.get('gene'),
                                 'downstream gene': downstream.get('gene'),
                                 'start': low,
                                 'end': high,
                                 'orientation': upstream.get('strand') + downstream.get('strand')})
    return overlap_list


# IUPAC nucleotide complements, used to write recoded codons on the reverse strand
_COMPLEMENT = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A',
               'M': 'K', 'K': 'M', 'R': 'Y', 'Y': 'R',
               'W': 'W', 'S': 'S', 'V': 'B', 'B': 'V',
               'H': 'D', 'D': 'H', 'N': 'N'}


def _reverse_complement(codon):
    """Return the reverse complement of a short DNA string (case preserved)."""
    complemented = []
    for base in reversed(codon):
        partner = _COMPLEMENT.get(base.upper(), base)
        complemented.append(partner if base.isupper() else partner.lower())
    return ''.join(complemented)


def _scheme_lookup(scheme):
    """Map target codon -> recoded codon; the first entry wins on duplicates."""
    lookup = {}
    for entry in scheme:
        lookup.setdefault(entry.get('target'), entry.get('recode'))
    return lookup


def _to_mutable(seq):
    """Return an editable copy of ``seq``.

    Uses ``seq.tomutable()`` when the object offers it and otherwise falls
    back to ``MutableSeq(seq)`` (newer Biopython releases no longer provide
    ``tomutable``).
    """
    tomutable = getattr(seq, 'tomutable', None)
    if tomutable is not None:
        return tomutable()
    from Bio.Seq import MutableSeq
    return MutableSeq(seq)


def _to_immutable(mutable_seq):
    """Convert the edited sequence back to a read-only sequence object."""
    toseq = getattr(mutable_seq, 'toseq', None)
    if toseq is not None:
        return toseq()
    return Seq(mutable_seq)


def recoding(seq, codon_list, scheme=None):
    """Return a copy of ``seq`` in which every listed codon is recoded.

    Args:
        seq: the record sequence; it is not modified.
        codon_list: list of dicts as produced by ``get_codon``.
        scheme: list of ``{'target': ..., 'recode': ...}`` dicts.  When
            omitted, the module-level ``recoding_scheme`` is used (the
            original calling convention).

    Returns:
        The recoded sequence.

    Raises:
        KeyError: if a codon in ``codon_list`` has no entry in the scheme.
    """
    if scheme is None:
        scheme = recoding_scheme
    forward = _scheme_lookup(scheme)
    reverse = {target: _reverse_complement(recode) for target, recode in forward.items()}

    mutable_seq = _to_mutable(seq)
    for codon in codon_list:
        target = codon.get('codon')
        if codon.get('strand') == 'f':
            mutable_seq[codon.get('start'):codon.get('end')] = forward[target]
        else:
            # reverse codons store the upper bound in 'start', so the slice
            # runs end:start and receives the reverse-complemented codon
            mutable_seq[codon.get('end'):codon.get('start')] = reverse[target]
    return _to_immutable(mutable_seq)


if __name__ == '__main__':
    from Bio.SeqFeature import FeatureLocation
    from Bio.SeqFeature import SeqFeature

    start_time = time.time()
    #==============================================================================
    # Get essential information of the genbank file
    #==============================================================================
    mds = SeqIO.read(Input_GenBank_File, "genbank")           # read the genbank file
    target_codon = [i.get('target') for i in recoding_scheme]  # list of target codons
    codon_list = get_codon(mds, target_codon)                  # info on the target codons
    CDS = get_CDS(mds)                                         # info on all CDS in the genbank file
    report_embedded_genes(CDS)                                 # check for embedded gene(s)
    overlap = get_overlap(codon_list, CDS)                     # overlapping CDS that contain target codons

    #==============================================================================
    # Identify overlaps that will introduce mutation after recoding
    #==============================================================================
    # Recode a second, untouched copy of the record and compare translations:
    # an overlap only needs splitting when recoding changes the protein of
    # either gene of the pair.
    test_recoding = SeqIO.read(Input_GenBank_File, 'genbank')
    test_recoding.seq = recoding(test_recoding.seq, codon_list, recoding_scheme)
    CDS_test_recoding = get_CDS(test_recoding)

    overlap_mutated = []
    for entry in overlap:
        index = entry.get('index')
        if (CDS_test_recoding[index - 1].get('aa') != CDS[index - 1].get('aa')
                or CDS_test_recoding[index].get('aa') != CDS[index].get('aa')):
            overlap_mutated.append(index)

    #==============================================================================
    # Split overlapping genes
    #==============================================================================
    split = mds.seq
    # Work from the last overlap to the first so that coordinates of overlaps
    # still to be processed are not shifted by earlier insertions.
    for entry in reversed(overlap):
        if entry.get('index') not in overlap_mutated:
            continue  # recoding is silent here, no split needed
        start = entry.get('start')
        end = entry.get('end')
        orientation = entry.get('orientation')
        length = end - start

        if orientation == 'fr':  # no synthetic insertion
            insertion_length = 0
            split = split[:end] + split[start:]
        else:
            # Copy the bases directly upstream of the overlap as the synthetic
            # insertion; clamp at 0 so an overlap close to the origin cannot
            # produce a negative (wrapping) slice index.
            insertion = split[max(0, start - synthetic_insertion_size):start]
            insertion_length = len(insertion)
            split = split[:end] + insertion + split[start:]
        shift = length + insertion_length

        # After the split, all annotations at or beyond the split point move.
        for feature in mds.features:
            feature_start = int(feature.location.start)
            feature_end = int(feature.location.end)
            strand = feature.location.strand
            if feature_start >= start:
                feature.location = FeatureLocation(feature_start + shift, feature_end + shift, strand)
            elif feature_end > end:
                # Feature begins before the split point but runs past the
                # overlap (e.g. a record-wide feature): it now also covers
                # the duplicated and inserted bases.
                feature.location = FeatureLocation(feature_start, feature_end + shift, strand)

        if insertion_length:  # annotate synthetic insertion
            mds.features.append(SeqFeature(FeatureLocation(end, end + insertion_length),
                                           qualifiers={"note": 'Synthetic Insertion'}, type="misc_feature"))

    mds.seq = split

    #==============================================================================
    # Recode and annotate the genome
    #==============================================================================
    codon_list_after_split = get_codon(mds, target_codon)
    mds.seq = recoding(mds.seq, codon_list_after_split, recoding_scheme)  # recoding

    recode_lookup = _scheme_lookup(recoding_scheme)
    for codon in codon_list_after_split:
        annotation = codon.get('codon') + ' to ' + recode_lookup[codon.get('codon')]
        # reverse codons carry their bounds swapped; order them for the location
        low, high = sorted((codon.get('start'), codon.get('end')))
        mds.features.append(SeqFeature(FeatureLocation(low, high),
                                       qualifiers={"note": annotation}, type="misc_feature"))

    #==============================================================================
    # Export recoded genbank file and calculate processing time
    #==============================================================================
    SeqIO.write(mds, Output_GenBank_File, "genbank")

    end_time = time.time() - start_time
    print('Total processing time = {}'.format(time.strftime('%H:%M:%S', time.gmtime(end_time))))