├── LICENSE ├── README.md └── dna2proteins.py /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Patricio Rodrigo Estévez Soto 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.34023.svg)](http://dx.doi.org/10.5281/zenodo.34023) 2 | # dna2proteins 3 | This Python script was produced as part of the course *Introduction to Scientific Programming in Python* of the UCL Graduate School. 4 | 5 | More information on the course can be found on its [home page](http://www.cs.ucl.ac.uk/scipython/). 
6 | 7 | ## The script 8 | 9 | The script puts together a collection of functions that read a FASTA file containing DNA sequences and produce a FASTA file with the most likely protein sequence for each DNA sequence. 10 | 11 | The steps in the script are roughly the following: 12 | 1. Reads in the FASTA file 13 | 2. Stores the sequences in a dictionary 14 | 3. Generates the six possible frames for each sequence (+1, +2, +3 and -1, -2, -3) 15 | 4. Translates the DNA sequences into protein sequences 16 | 5. Finds the longest protein sequence between a start marker (`M`) and a stop marker (`_`) 17 | 6. Stores the longest protein sequence for each DNA sequence in a dictionary 18 | 7. Can save the protein sequences to a FASTA file or print the sequences on the terminal 19 | 20 | ### Usage 21 | 22 | The script is quite simple. It accepts four options that can be passed from the command line: 23 | - -h prints a very simple help 24 | - -i (--ifile) must be followed by the FASTA file 25 | - -o (--ofile) must be followed by the name of the file where the protein sequences will be stored 26 | - -p is an option that allows printing the protein sequences on the terminal 27 | 28 | To use the script enter the following in the terminal: 29 | 30 | `$ python dna2proteins.py -i sequences.fa -o proteins.fa -p` 31 | 32 | Replace `sequences.fa` and `proteins.fa` with the appropriate filenames and paths. 33 | 34 | ### Credits 35 | 36 | The code for this script was developed jointly by: 37 | - Erin Vehstedt 38 | - Johanna Fischer 39 | - Maragatham Kumar 40 | - Andrés Calderón 41 | - Marya Koleva 42 | - Patricio R. Estévez Soto 43 | - With the guidance and help of Fabian Zimmer 44 | 45 | **This project is not maintained. We make no assurances nor offer any guarantees regarding its performance. 
"""Translate DNA sequences from a FASTA file into their longest proteins.

Pipeline (see the ``__main__`` section at the bottom):

1. ``read_fasta`` loads the records into a dictionary.
2. ``gen_frames`` translates the six reading frames of every record.
3. ``find_prots`` keeps the longest start-to-stop stretch per record.
4. The result is printed and/or written with ``write_fasta``.

This project is not maintained; it was developed as an effort to learn
Python.

The code sticks to constructs that behave the same on Python 2.7 and
Python 3; in particular ``print`` is always called with exactly one
parenthesised argument.
"""

import getopt
import sys
import textwrap

# Codon -> one-letter amino acid. '_' marks the stop codons TAA, TAG, TGA.
# Built once at import time instead of on every swap_dna() call.
_CODON_TABLE = {
    'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
    'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
    'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
    'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
    'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
    'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
    'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W',
}

# Placeholder emitted for codons that are not in _CODON_TABLE.
# NOTE(review): "N" is also the table's code for AAC/AAT, so unknown codons
# are indistinguishable from that residue in the output. Kept as-is so that
# existing output does not change.
_UNKNOWN_RESIDUE = "N"

# Base -> complementary base; anything not listed is passed through as-is.
_COMPLEMENT = {
    'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A',
    'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
}

# Maximum number of sequence characters per line written by write_fasta().
_FASTA_WIDTH = 60

_USAGE = 'dna2proteins.py -i -o -p'


## Function to read fasta files

def read_fasta(fastafile):
    """
    Reads a fasta file and returns a dictionary with the header lines
    (including the leading ">") as keys and the sequences as values.

    The sequence lines that follow a header are joined into one string.
    Blank lines are skipped, a trailing "\\r" is removed, and lines that
    appear before the first header are ignored. When a header occurs more
    than once, the last record with that header is the one kept.
    """
    records = {}
    current = None
    with open(fastafile, "r") as handle:
        for raw in handle:
            line = raw.rstrip("\r\n")
            if not line.strip():
                # Indexing line[0] on an empty line would raise IndexError.
                continue
            if line[0] == ">":
                current = line
                records[current] = []
            elif current is not None:
                records[current].append(line)

    return {header: "".join(chunks) for header, chunks in records.items()}


## Writes a dictionary to a fasta file

def write_fasta(dictionary, filename):
    """
    Takes a dictionary and writes it to a fasta file.
    Must specify the filename when calling the function.

    Every key is written on its own line, followed by its value wrapped
    at 60 characters per line. Prints a confirmation when done.
    """
    with open(filename, "w") as outfile:
        for header, sequence in dictionary.items():
            outfile.write(header + "\n")
            outfile.write("\n".join(textwrap.wrap(sequence, _FASTA_WIDTH)))
            outfile.write("\n")

    print("Success! File written")


## Swaps DNA sequences for proteins

def swap_dna(dnastring):
    """
    Translates a DNA string into a protein string, codon by codon.

    Translation starts at the first character; one or two leftover bases
    at the end are dropped. Input is upper-cased first so lower-case bases
    translate as well. Stop codons become "_" and codons that are not in
    the table become "N".
    """
    dna = dnastring.upper()
    usable = len(dna) - len(dna) % 3  # length of the whole-codon prefix
    return "".join(
        _CODON_TABLE.get(dna[start:start + 3], _UNKNOWN_RESIDUE)
        for start in range(0, usable, 3)
    )


## Generates the six possible frames per one sequence

def frame_id(seq):
    """
    Usage: frame_id(dictionary['key'])
           frame_id(sequences['>Seq1'])
           six_frames = frame_id(sequences['>Seq1'])

    Returns a dictionary with the keys '+1', '+2', '+3', '-1', '-2', '-3'.
    '+k' is the translation of seq starting at offset k-1, and '-k' is the
    same for the reverse complement of seq.
    """
    frames = {}
    for sign, strand in (("+", seq), ("-", rev_seq(seq))):
        for offset in range(3):
            frames["%s%d" % (sign, offset + 1)] = swap_dna(strand[offset:])

    return frames


## Required function for the previous frame_id function

def rev_seq(seq):
    """
    Returns the reverse complement of seq.

    A/T and C/G are swapped (case is preserved); any other character is
    left unchanged. The result is read in the opposite direction.
    """
    return "".join(_COMPLEMENT.get(base, base) for base in reversed(seq))


## Generates all the frames for all the sequences

def gen_frames(dictionary):
    """
    Applies frame_id to every value of dictionary and returns a new
    dictionary with the same keys.
    """
    return {key: frame_id(seq) for key, seq in dictionary.items()}


## Find the open frames in the protein sequences

def oframe(amino):
    """
    Returns one candidate per "M" found in amino, in order of position.

    Each candidate runs from that "M" up to and including the next "_".
    When no "_" follows the "M", the candidate is the empty string, so
    the list always has as many items as there are "M" characters.
    """
    oframes = []
    for position, residue in enumerate(amino):
        if residue != 'M':
            continue
        # Search in place rather than copying the remainder for every "M".
        stop = amino.find('_', position)
        oframes.append(amino[position:stop + 1] if stop != -1 else "")
    return oframes


## Finds the longest proteins in each sequence

def find_prots(dictionary):
    """
    Picks the longest candidate protein for every sequence.

    dictionary maps each key to a dictionary of translated frames, as
    produced by gen_frames. The candidates of all frames are collected
    with oframe; the longest one is stored under the same key, or "" when
    there is none. Frames are visited in sorted key order so that ties
    are always resolved the same way (first candidate found wins).
    """
    prots_dict = {}
    for key, frames in dictionary.items():
        candidates = []
        for label in sorted(frames):
            candidates.extend(oframe(frames[label]))
        # max() returns the first of several equally long candidates.
        prots_dict[key] = max(candidates, key=len) if candidates else ""

    return prots_dict


## For command line Usage

def main(argv):
    """
    Parses the command line arguments (without the program name).

    Returns the tuple (inputfile, outputfile, printprots). Options that
    were not given are "" for the two file names and False for
    printprots. Prints the usage and exits with status 2 on an invalid
    option; "-h" prints the help and exits.
    """
    inputfile = ""
    outputfile = ""
    printprots = False
    try:
        opts, _args = getopt.getopt(argv, "hi:o:p", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(_USAGE)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(_USAGE)
            print("-p prints the protein sequences in the terminal")
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt == "-p":
            printprots = True

    return inputfile, outputfile, printprots


if __name__ == "__main__":
    inputfile, outputfile, printprots = main(sys.argv[1:])

    if not inputfile:
        # open("") would fail with a traceback; show the usage instead.
        print(_USAGE)
        sys.exit(2)

    sequences = read_fasta(inputfile)

    sequences_frames = gen_frames(sequences)

    proteins = find_prots(sequences_frames)

    if printprots:
        for key, values in proteins.items():
            print(key)
            print(values)

    if outputfile:
        write_fasta(proteins, outputfile)