├── README
└── scripts
    ├── motifs
        ├── fasta.py
        ├── pymotGUI.py
        └── pymotif.py
    └── original_scripts
        ├── 3.0
            ├── AY162388.seq
            ├── code_01.py
            ├── code_02.py
            ├── code_03.py
            ├── code_04.py
            ├── code_05.py
            └── code_05.py.bak
        ├── code_01.py
        ├── code_02.py
        ├── code_03.py
        ├── code_04.py
        ├── code_05.py
        ├── code_06.py
        ├── code_07.py
        ├── code_08.py
        ├── code_09.py
        ├── code_10.py
        ├── code_11.py
        ├── code_12.py
        ├── code_13.py
        ├── code_14.py
        ├── code_15.py
        ├── code_16.py
        ├── code_17.py
        ├── code_18.py
        ├── code_18a.py
        ├── code_19.py
        ├── code_20.py
        ├── code_21.py
        ├── code_22.py
        ├── code_23.py
        ├── code_24.py
        ├── code_25.py
        ├── code_26.py
        ├── code_27.py
        ├── code_28.py
        ├── code_29.py
        ├── code_30.py
        ├── code_31.py
        ├── code_32.py
        ├── code_33.py
        ├── code_34.py
        ├── code_35.py
        ├── code_36.py
        ├── code_37.py
        ├── code_38.py
        ├── code_39.py
        ├── code_40.py
        ├── code_41.py
        ├── code_42.py
        ├── code_43.py
        ├── code_44.py
        ├── code_45.py
        ├── code_46.py
        ├── code_47.py
        ├── code_48.py
        ├── dnatranslate.py
        └── fasta.py


/README:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nuin/beginning-python-for-bioinformatics/b33813f4ec11a59a5c6381cc5b78044824d25e3f/README


--------------------------------------------------------------------------------
/scripts/motifs/fasta.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | class Fasta:
 4 |     def __init__(self, name, sequence):
 5 |         self.name = name
 6 |         self.sequence = sequence
 7 | 
 8 | def read_fasta(file):
 9 |     items = []
10 |     index = 0
11 |     for line in file:
12 |         if line.startswith(">"):
13 |            if index >= 1:
14 |                items.append(aninstance)
15 |            index+=1
16 |            name = line.strip()
17 |            seq = ''
18 |            aninstance = Fasta(name, seq)
19 |         else:
20 |            seq += line.strip()
21 |            aninstance = Fasta(name, seq)
22 | 
23 |     items.append(aninstance)
24 |     return items
25 | 
def read_seqs(file):
    """Return the sequences of a FASTA input as a list of plain strings.

    file -- any iterable of text lines (an open file or a list of strings)

    Header lines (starting with '>') only mark record boundaries; their text
    is discarded.  Sequence lines are stripped of surrounding whitespace, so
    a final line without a newline keeps its last base and '\\r\\n' line
    endings leave no stray '\\r'.  Lines before the first header are joined
    to the first sequence.  Input with neither a header nor sequence text
    gives an empty list.
    """
    items = []
    chunks = []          # pieces of the sequence being collected
    seen_header = False
    for line in file:
        if line.startswith(">"):
            # the first header has nothing to close yet
            if seen_header:
                items.append(''.join(chunks))
                chunks = []
            seen_header = True
        else:
            chunks.append(line.strip())

    last = ''.join(chunks)
    # keep a trailing record even when it is empty, but do not invent one
    # for input that contained nothing at all
    if seen_header or last:
        items.append(last)
    return items
41 | 
def format_output(sequence, length):
    """Break sequence into rows of at most length characters.

    The rows are returned as one string, separated by newlines; the last
    row holds whatever is left over.
    """
    starts = range(0, len(sequence), length)
    return '\n'.join([sequence[begin:begin + length] for begin in starts])
47 | 
48 | 


--------------------------------------------------------------------------------
/scripts/motifs/pymotGUI.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import wx
 4 | import pymot
 5 | import pymotif
 6 | import fasta
 7 | import os
 8 | 
class pymot(wx.App):
    """Application object of the motif finder GUI.

    NOTE(review): the module also runs ``import pymot``; from this class
    statement on, the name ``pymot`` refers to this class instead of that
    module.
    """

    def __init__(self, redirect=False):
        # hand ``redirect`` through unchanged to the wx.App constructor
        wx.App.__init__(self, redirect)
13 |         
14 | 
class pymotGUI(wx.Frame):
    """Main window of the Python Motif Finder.

    The File menu picks a foreground and a background sequence file, the
    text box holds the motif width, and the Run button passes all three to
    ``pymotif.calculate_motifs`` and lists the returned lines in the result
    box.
    """

    # paths picked through the File menu; they are stored on the class
    # (not on the instance) and read back from there by run_finder
    fore_file = ''
    back_file = ''

    def __init__(self, parent, id):
        wx.Frame.__init__(self, parent, id,  'Python Motif Finder', style=wx.DEFAULT_FRAME_STYLE)
        self.__do_layout()

    def __do_layout(self):
        """Create the menu bar and the controls, then bind their events."""
        #adding the panel
        panel = wx.Panel(self)

        #file menu: two file pickers, a separator and Quit
        menubar = wx.MenuBar()
        filemenu = wx.Menu()
        foreground_menu = filemenu.Append(-1, 'Select foreground file')
        background_menu = filemenu.Append(-1, 'Select background file')
        filemenu.AppendSeparator()
        quitmenu = filemenu.Append(-1, 'Quit')

        #appends the menu to the menubar and attaches it to the frame
        menubar.Append(filemenu, 'File')
        self.SetMenuBar(menubar)

        #input box for motif width, and label
        self.one_label = wx.StaticText(panel, -1, 'Motif width', (10,50))
        self.motif_width = wx.TextCtrl(panel, -1, '10', (95, 50), (40,18))
        #result textbox
        self.results = wx.TextCtrl(panel, -1, '', (150, 50), (200, 100), wx.TE_MULTILINE | wx.TE_AUTO_SCROLL | wx.HSCROLL)

        #run button
        self.run_button = wx.Button(panel, -1, 'Run', (10, 80))

        #labels that later show the selected paths
        self.fore_label = wx.StaticText(panel, -1, 'Select the foreground file', (10, 10))
        self.back_label = wx.StaticText(panel, -1, 'Select the background file', (10, 30))

        #binding the menus and the button to their handlers
        self.Bind(wx.EVT_MENU, self.on_foreground, foreground_menu)
        self.Bind(wx.EVT_MENU, self.on_background, background_menu)
        #the Quit entry was created but had no handler
        self.Bind(wx.EVT_MENU, self.on_quit, quitmenu)
        self.Bind(wx.EVT_BUTTON, self.run_finder, self.run_button)

    def _choose_file(self):
        """Show a file-open dialog; return the chosen path, or None on cancel."""
        # NOTE(review): newer wxPython releases name this style wx.FD_OPEN
        dialog = wx.FileDialog(self, style=wx.OPEN)
        try:
            if dialog.ShowModal() == wx.ID_OK:
                return dialog.GetPath()
            return None
        finally:
            #dialogs are not released automatically, destroy them explicitly
            dialog.Destroy()

    def on_foreground(self, event):
        """Let the user pick the foreground file and show its path."""
        path = self._choose_file()
        if path is not None:
            pymotGUI.fore_file = path
            self.fore_label.SetLabel(path)

    def on_background(self, event):
        """Let the user pick the background file and show its path."""
        path = self._choose_file()
        if path is not None:
            pymotGUI.back_file = path
            self.back_label.SetLabel(path)

    def on_quit(self, event):
        """Close the window when Quit is chosen from the File menu."""
        self.Close()

    def run_finder(self, event):
        """Run the motif search with the current settings and list the hits."""
        caption = 'Python Motif Finder'

        #the width box is free text, so it may hold anything
        try:
            width = int(self.motif_width.GetValue())
        except ValueError:
            width = 0
        if width < 1:
            wx.MessageBox('Motif width must be a positive whole number.', caption, wx.OK | wx.ICON_ERROR)
            return

        if not pymotGUI.fore_file or not pymotGUI.back_file:
            wx.MessageBox('Select a foreground and a background file first.', caption, wx.OK | wx.ICON_ERROR)
            return

        try:
            result = pymotif.calculate_motifs(pymotGUI.fore_file, pymotGUI.back_file, width)
        except (IOError, OSError):
            wx.MessageBox('The selected files could not be read.', caption, wx.OK | wx.ICON_ERROR)
            return

        #start from an empty box so output of earlier runs does not pile up
        self.results.Clear()
        for motif in result:
            self.results.WriteText(motif + '\n')
79 | 
80 | 
# The __main__ guard below is commented out, so the statements that follow
# run whenever this module is executed or imported.
#if __name__ == '__main__':
app = pymot()
frame = pymotGUI(parent=None, id = -1)
#frame.CentreOnScreen()
frame.Show()
# hand control to the application's main loop
app.MainLoop()


--------------------------------------------------------------------------------
/scripts/motifs/pymotif.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import fasta
 4 | import sys
 5 | from collections import defaultdict
 6 | 
 7 | def choose(n, k):
 8 |     if 0 <= k <= n:
 9 |         ntok = 1
10 |         ktok = 1
11 |         for t in xrange(1, min(k, n - k) + 1):
12 |             ntok *= n
13 |             ktok *= t
14 |             n -= 1
15 |         return ntok // ktok
16 |     else:
17 |         return 0
18 |     
def get_quorums(seqs, mlen):
    """Count every window of mlen characters found in the sequences.

    seqs -- iterable of sequence strings
    mlen -- window (motif) length

    Returns a defaultdict(int) mapping each window to its total number of
    occurrences over all sequences; windows never seen read as 0.
    Sequences shorter than mlen contribute nothing.

    TODO (kept from the original notes): add seq id_no to a set and use an
    explicit counter to create seq_no.
    """
    quorum = defaultdict(int)
    for seq in seqs:
        # a sequence of length L has L - mlen + 1 windows; the "+ 1" makes
        # sure the window that ends on the last character is counted too
        for n in range(len(seq) - mlen + 1):
            quorum[seq[n:n + mlen]] += 1
    return quorum
29 |     
def calculate_motifs(input_seqs, input_seqs2, width):
    """Score the motifs of a foreground file against a background file.

    input_seqs  -- path of the foreground FASTA file
    input_seqs2 -- path of the background FASTA file
    width       -- motif length handed to get_quorums

    Both paths are echoed to standard output, as before.  Every motif of
    the foreground gets a value p computed from three binomial terms; those
    with 0 < p <= 0.0001 are reported.

    Returns a sorted list of tab-separated strings:
    motif, foreground count, background count, p.

    NOTE(review): get_quorums counts occurrences while N counts sequences,
    and term1 is choose(background count, foreground count) -- confirm that
    this is the intended statistic.
    """
    # same text as the former print statement, valid on Python 2 and 3
    print('%s %s' % (input_seqs, input_seqs2))

    # the with-statements close both files as soon as they have been read
    with open(input_seqs) as handle:
        fore_seqs = fasta.read_seqs(handle.readlines())
    with open(input_seqs2) as handle:
        back_seqs = fasta.read_seqs(handle.readlines())

    foreground = get_quorums(fore_seqs, width)
    background = get_quorums(back_seqs, width)

    n_fore = len(fore_seqs)
    N = n_fore + len(back_seqs)
    # identical for every motif, so it is computed once
    term3 = choose(N, n_fore)

    res_motifs = []
    for motif in foreground:
        in_fore = foreground[motif]
        in_back = background[motif]     # reads as 0 for motifs absent from the background
        term1 = choose(in_back, in_fore)
        term2 = choose(N - in_back, n_fore - 1)
        p = (float(term1) * float(term2)) / term3
        if 0 < p <= 0.0001:
            res_motifs.append('\t'.join([motif, str(in_fore), str(in_back), str(p)]))

    res_motifs.sort()
    return res_motifs


--------------------------------------------------------------------------------
/scripts/original_scripts/3.0/AY162388.seq:
--------------------------------------------------------------------------------
1 | GTGACTTTGTTCAACGGCCGCGGTATCCTAACCGTGCGAAGGTAGCGTAATCACTTGTTC
2 | TTTAAATAAGGACTAGTATGAATGGCATCACGAGGGCTTTACTGTCTCCTTTTTCTAATC
3 | AGTGAAACTAATCTCCCGTGAAGAAGCGGGAATTAACTTATAAGACGAGAAGACCCTATG
4 | GAGCTTTAAACCAAATAACATTTGCTATTTTACAACATTCAGATATCTAATCTTTATAGC
5 | ACTATGATTACAAGTTTTAGGTTGGGGTGACCGCGGAGTAAAAATTAACCTCCACATTGA
6 | AGGAATTTCTAAGCAAAAAGCTACAACTTTAAGCATCAACAAATTGACACTTATTGACCC
7 | AATATTTTGATCAACGAACCATTACCCTAGGGATAACAGCGCAATCCATTATGAGAGCTA
8 | TTATCGACAAGTGGGCTTACGACCTCGATGTTGGATCAGGG
9 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/3.0/code_01.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | 
3 | '''first script featured on the website
4 | like a hello world'''
5 | 
# a short DNA sequence kept in a plain string
dna = "ACGTACGTACGTACGTACGTACGT"
# write the sequence to standard output
print(dna)
8 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/3.0/code_02.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | '''second script available, shows a simple way to concatenate
 4 | two DNA sequences, strings'''
 5 | 
 6 | dna = "ACGTACGTACGTACGTACGTACGT"
 7 | dna2 = "TCGATCGATCGATCGATCGA"
 8 | print("First and Second sequences")
 9 | print(dna, dna2)
10 | dna3 = dna + dna2
11 | print("Concatenated sequence")
12 | print(dna3)
13 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/3.0/code_03.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | '''third script, shows how to import a module and use the 
 4 | regex module to transcribe DNA to RNA
 5 | '''
 6 | 
 7 | #import regular expression module
 8 | import re
 9 | 
#the DNA string to transcribe
dna = 'ACGTTGCAACGTTGCAACGTTGCA'

#re.sub takes the pattern directly: every T found in the
#DNA is replaced by a U, the result is the RNA string
rna = re.sub('T', 'U', dna)

print(rna)
22 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/3.0/code_04.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | '''simple script to open a file and print all lines'''
 4 | 
 5 | #assigning a filename to a variable
 6 | dnafile = "AY162388.seq"
 7 | 
 8 | #opening the file
 9 | file = open(dnafile, 'r')
10 | 
11 | #printing each line of the file
12 | for line in file:
13 |     print(line, end='')
14 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/3.0/code_05.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''
 4 | simple script to find motifs on DNA sequences using regex
 5 | the script is interactive 
 6 | '''
 7 | 
 8 | # we use the RegEx module
 9 | import re
10 | import string
11 | 
#still keep the file fixed
dnafile = "AY162388.seq"

#reading the whole file and removing the line breaks, so the
#sequence ends up in one string; the with-statement closes the file
with open(dnafile, 'r') as handle:
    sequence = handle.read().replace('\n', '')

#we start to deal with user input
#a boolean variable keeps the loop running
inputfromuser = True

#while loop: it continues until an empty motif is entered
while inputfromuser:
    #input returns what the user typed as a string
    inmotif = input('Enter motif to search: ')
    #now we check for the size of the input
    if len(inmotif) >= 1:
        #we compile a regex with the input given; the user may type
        #something that is not a valid pattern, so we ask again then
        try:
            motif = re.compile(inmotif)
        except re.error:
            print('Sorry, that is not a valid pattern')
            continue
        #looking to see if the entered motif is in the sequence
        if motif.search(sequence):
            print('Yep, I found it')
        else:
            print('Sorry, try another one')
    else:
        print('Done, thanks for using motif_search')
        inputfromuser = False
46 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/3.0/code_05.py.bak:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''
 4 | simple script to find motifs on DNA sequences using regex
 5 | the script is interactive 
 6 | '''
 7 | 
 8 | # we use the RegEx module
 9 | import re
10 | import string
11 | 
12 | #still keep the file fixed
13 | dnafile = "AY162388.seq"
14 | 
15 | #opening the file, reading the sequence and storing in a list
16 | seqlist = open(dnafile, 'r').readlines()
17 | 
18 | #let's join the the lines in a temporary string
19 | temp = ''.join(seqlist)
20 | 
21 | #assigning our sequence, with no carriage returns to our
22 | #final variable/object
23 | sequence = temp.replace('\n', '')
24 | 
25 | #we start to deal with user input
26 | #first we use a boolean variable to check for valid input
27 | inputfromuser = True
28 | 
29 | #while loop: while there is an motif larger than 0
30 | #the loop continues
31 | while inputfromuser:
32 |     #raw_input received the user input as string
33 |     inmotif = raw_input('Enter motif to search: ')
34 |     #now we check for the size of the input
35 |     if len(inmotif) >= 1:
36 |         #we compile a regex with the input given
37 |         motif = re.compile('%s' % inmotif)
38 |         #looking to see if the entered motif is in the sequence
39 |         if re.search(motif, sequence):
40 |             print 'Yep, I found it'
41 |         else:
42 |             print 'Sorry, try another one'
43 |     else:
44 |         print 'Done, thanks for using motif_search'
45 |         inputfromuser = False
46 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_01.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | 
3 | '''first script featured on the website
4 | like a hello world'''
5 | 
# a short DNA sequence kept in a plain string
dna = "ACGTACGTACGTACGTACGTACGT"
# write the sequence to standard output
print dna
8 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_02.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | '''second script available, shows a simple way to concatenate
 4 | two DNA sequences, strings'''
 5 | 
 6 | dna = "ACGTACGTACGTACGTACGTACGT"
 7 | dna2 = "TCGATCGATCGATCGATCGA"
 8 | print "First and Second sequences"
 9 | print dna, dna2
10 | dna3 = dna + dna2
11 | print "Concatenated sequence"
12 | print dna3
13 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_03.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | '''third script, shows how to import a module and use the 
 4 | regex module to transcribe DNA to RNA
 5 | '''
 6 | 
 7 | #import regular expression module
 8 | import re
 9 | 
10 | #setting the DNA string
11 | dna = 'ACGTTGCAACGTTGCAACGTTGCA'
12 | 
13 | #assigning a new regex and compiling it 
14 | #to find all Ts
15 | regexp = re.compile('T')
16 | 
17 | #create a new string tha will receive 
18 | #the regex result with Us replacing Ts
19 | rna = regexp.sub('U', dna)
20 | 
21 | print rna
22 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_04.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | '''simple script to open a file and print all lines'''
 4 | 
 5 | #assigning a filename to a variable
 6 | dnafile = "AY162388.seq"
 7 | 
 8 | #opening the file
 9 | file = open(dnafile, 'r')
10 | 
11 | #printing each line of the file
12 | for line in file:
13 |     print line,


--------------------------------------------------------------------------------
/scripts/original_scripts/code_05.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''
 4 | simple script to find motifs on DNA sequences using regex
 5 | the script is interactive 
 6 | '''
 7 | 
 8 | # we use the RegEx module
 9 | import re
10 | import string
11 | 
12 | #still keep the file fixed
13 | dnafile = "AY162388.seq"
14 | 
15 | #opening the file, reading the sequence and storing in a list
16 | seqlist = open(dnafile, 'r').readlines()
17 | 
18 | #let's join the the lines in a temporary string
19 | temp = ''.join(seqlist)
20 | 
21 | #assigning our sequence, with no carriage returns to our
22 | #final variable/object
23 | sequence = temp.replace('\n', '')
24 | 
25 | #we start to deal with user input
26 | #first we use a boolean variable to check for valid input
27 | inputfromuser = True
28 | 
29 | #while loop: while there is an motif larger than 0
30 | #the loop continues
31 | while inputfromuser:
32 |     #raw_input received the user input as string
33 |     inmotif = raw_input('Enter motif to search: ')
34 |     #now we check for the size of the input
35 |     if len(inmotif) >= 1:
36 |         #we compile a regex with the input given
37 |         motif = re.compile('%s' % inmotif)
38 |         #looking to see if the entered motif is in the sequence
39 |         if re.search(motif, sequence):
40 |             print 'Yep, I found it'
41 |         else:
42 |             print 'Sorry, try another one'
43 |     else:
44 |         print 'Done, thanks for using motif_search'
45 |         inputfromuser = False
46 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_06.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''
 4 | another script to find motifs on DNA sequences with more features than
 5 | code_05.py
 6 | '''
 7 | 
 8 | # we use the RegEx module
 9 | import re
10 | import string
11 | #we also import the sys module
12 | import sys
13 | 
14 | #set the variable to control the loop
15 | fileinput = True
16 | while fileinput == True:
17 |     #ask user for the input
18 |     filename = raw_input('Enter file name:')
19 |     if len(filename) > 0:
20 |         #we try to open the file
21 |         try:
22 |             dnafile = open(filename, 'r')
23 |             #success! we finish the loop and move to the next input
24 |             fileinput = False
25 |         except:
26 |             #no dice, file does not exist
27 |             #keep the loop on and ask again
28 |             print 'File does not exist'
29 |     else:
30 | #        fileinput = False
31 |         sys.exit()
32 | 
33 | 
34 | #opening the file, reading the sequence and storing in a list
35 | seqlist = open(filename, 'r').readlines()
36 | 
37 | #let's join the the lines in a temporary string
38 | temp = ''.join(seqlist)
39 | 
40 | #assigning our sequence, with no carriage returns to our
41 | #final variable/object
42 | sequence = temp.replace('\n', '')
43 | 
44 | #we start to deal with user input
45 | #first we use a boolean variable to check for valid input
46 | inputfromuser = True
47 | 
48 | #while loop: while there is an motif larger than 0
49 | #the loop continues
50 | while inputfromuser:
51 |     #raw_input received the user input as string
52 |     inmotif = raw_input('Enter motif to search: ')
53 |     #now we check for the size of the input
54 |     if len(inmotif) >= 1:
55 |         #we compile a regex with the input given
56 |         motif = re.compile('%s' % inmotif)
57 |         #looking to see if the entered motif is in the sequence
58 |         if re.search(motif, sequence):
59 |             print 'Yep, I found it'
60 |         else:
61 |             print 'Sorry, try another one'
62 |     else:
63 |         print 'Done, thanks for using motif_search'
64 |         inputfromuser = False
65 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_07.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''
 4 | counting the nucleotides in a sequence, iterating
 5 | through lists
 6 | '''
 7 | 
 8 | #let's keep the file fixed for now
 9 | dnafile = "AY162388.seq"
10 | 
11 | #opening the file, reading the sequence and storing in a list
12 | file = open(dnafile, 'r')
13 | 
14 | #initialize a string to receive the data
15 | sequence = ''
16 | for line in file:
17 |     sequence += line.strip() #notice the strip, to remove \n
18 | 
19 | #"exploding" the sequence in a list
20 | seqlist = list(sequence)
21 | 
22 | #initializing integers to store the counts
23 | total_a = 0
24 | total_c = 0
25 | total_g = 0
26 | total_t = 0
27 | 
28 | #checking each item in the list and updating counts	
29 | for base in seqlist:
30 |     if base == 'A':
31 |         total_a += 1
32 |     elif base == 'C':
33 |         total_c += 1
34 |     elif base == 'G':
35 |         total_g += 1
36 |     elif base == 'T':
37 |         total_t += 1
38 | 
39 | print str(total_a) + ' As found'
40 | print str(total_c) + ' Cs found'
41 | print str(total_g) + ' Gs found'
42 | print str(total_t) + ' Ts found'
43 | 
44 | 
45 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_08.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''script that counts the number of bases in a DNA sequence
 4 | showing the string.count() method'''
 5 | 
 6 | #still keep the file fixed
 7 | dnafile = "AY162388.seq"
 8 | 
 9 | #opening the file, reading the sequence and storing in a list
10 | seqlist = open(dnafile, 'r').readlines()
11 | 
12 | #let's join the the lines in a temporary string
13 | temp = ''.join(seqlist)
14 | 
15 | #counting
16 | total_a = temp.count('A')
17 | total_c = temp.count('C')
18 | total_g = temp.count('G')
19 | total_t = temp.count('T')
20 | 
21 | #printing results
22 | print str(total_a) + ' As found'
23 | print str(total_c) + ' Cs found'
24 | print str(total_g) + ' Gs found'
25 | print str(total_t) + ' Ts found'
26 | 
27 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_09.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''
 4 | script that counts the number of each nucleotide in a sequence using 
 5 | user input and saving to a file.
 6 | '''
 7 | 
 8 | import sys
 9 | import re
10 | 
11 | 
12 | fileentered = True #flag that determines if a filename has been entered
13 | 
14 | while fileentered == True:
15 |     #ask the user to input a filename
16 |     filename = raw_input('Please enter a file to check: ')
17 |     #if a filename was entered, go ...
18 |     if len(filename) >= 1:
19 |         try:
20 |             #open the file
21 |             seqlist = open(filename, 'r').readlines()
22 |             #sequence is read as a list, convert to string
23 |             sequence = ''.join(seqlist)
24 |             #remove carriage returns
25 |             sequence = sequence.replace('\n', '')
26 |             #counting
27 |             total_a = sequence.count('A')
28 |             total_c = sequence.count('C')
29 |             total_g = sequence.count('G')
30 |             total_t = sequence.count('T')
31 |             #create a regex object with non-nucleotide letters to check for "errors"
32 |             otherletter = re.compile('[BDEFHIJKLMNOPQRSUVXZ]+')
33 |             #find possible non-nucleotides
34 |             extra = re.findall(otherletter, sequence)
35 |             #open an output filename to output counts
36 |             output = open(filename + '.count', 'w')
37 |             #writing the output
38 |             output.write('Count report for file ' + filename + '\n')
39 |             output.write('A = ' + str(total_a) + '\n')
40 |             output.write('C = ' + str(total_c) + '\n')
41 |             output.write('G = ' + str(total_g) + '\n')
42 |             output.write('T = ' + str(total_t) + '\n')
43 |             #if there are non-nucleotides in the sequence, report them
44 |             if len(extra) > 0:
45 |                 output.write('Also were found ' + str(len(extra)) + ' errors\n')
46 |                 for i in extra:
47 |                     output.write(i + ' ')
48 |             else:
49 |                 output.write('No error found')
50 |             print 'Result file saved on ' + filename + '.count'
51 |         except:
52 |             print 'File not found. Please try again.'
53 |     else:
54 |         #if no filename entered, exit
55 |         fileentered = False
56 |         sys.exit()
57 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_10.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | '''
 4 | Python functions
 5 | '''
 6 | 
 7 | def add_tail(seq):
 8 |     '''function that adds a poly-T tail to sequences'''
 9 |     result = seq + 'TTTTTTTTTTTTTTTTTTTTT'
10 |     return result
11 | 
12 | #opening the file
13 | dnafile = 'AY162388.seq'
14 | file = open(dnafile, 'r')
15 | 
16 | #reading the sequence from the file
17 | sequence = ''
18 | for line in file:
19 |     sequence += line.strip()
20 | 
21 | #printing result
22 | print sequence
23 | #calling the function to add the tail
24 | sequence = add_tail(sequence)
25 | #printing new sequence
26 | print sequence
27 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_11.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''
 4 | making a function to count nucleotides
 5 | '''
 6 | 
 7 | import sys
 8 | 
 9 | def count_nucleotide_types(seq):
10 |     '''counting nucleotides and returning a list with counts'''
11 |     result = []
12 |     total_a = seq.count('A')
13 |     total_c = seq.count('C')
14 |     total_g = seq.count('G')
15 |     total_t = seq.count('T')
16 | 
17 |     result.append(total_a)
18 |     result.append(total_c)
19 |     result.append(total_g)
20 |     result.append(total_t)
21 | 
22 |     return result
23 | 
24 | #opening the file
25 | sequencefile = open(sys.argv[1], 'r').readlines()
26 | #joining a sequence as a list into a string
27 | sequence = ''.join(sequencefile)
28 | #replacing carriage returns
29 | sequence = sequence.replace('\n', '')
30 | #counting the nucleotides
31 | values = count_nucleotide_types(sequence)
32 | #printing the results
33 | print values
34 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_12.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''
 4 | an extremely simple dice game
 5 | '''
 6 | 
 7 | #we need the random module
 8 | import random
 9 | import string
10 | 
11 | #generating two dices, between 1 and 6 for the human player
12 | dice1 = random.randint(1, 6)
13 | dice2 = random.randint(1, 6)
14 | 
15 | #generating two dices, between 1 and 6 for the computer player
16 | computerdice1 = random.randint(1, 6)
17 | computerdice2 = random.randint(1, 6)
18 | 
19 | #summing up both dices for each player
20 | mine = dice1 + dice2
21 | his_hers = computerdice1 + computerdice2
22 | 
23 | #printing the values
24 | print 'mine = ' + str(mine) + ' vs. computer = ' + str(his_hers)
25 | 
26 | #chdking the results and proclaiming the winner
27 | if mine > his_hers:
28 |     print "I won"
29 | elif mine < his_hers:
30 |     print "Computer won"
31 | else:
32 |     print "Tie. Try again"
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_13.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''
 4 | very simple script to generate random DNA sequences
 5 | '''
 6 | 
 7 | #random module is needed
 8 | import random
 9 | import sys
10 | 
11 | #sequence length is a parameter
12 | length = int(sys.argv[1])
13 | 
14 | #template DNA is a list with ACGT repeats
15 | dnaseq = list('ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT')
16 | 
17 | #print the template
18 | print dnaseq
19 | 
20 | result = ''
21 | for i in range(length):
22 |     #for the simulated sequence we use random.choice
23 |     #that randonly selects items of a list
24 |     result += random.choice(dnaseq)
25 | 
26 | print result


--------------------------------------------------------------------------------
/scripts/original_scripts/code_14.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''
 4 | a more elaborated script to generate random DNA sequences
 5 | '''
 6 | 
 7 | import random
 8 | import sys
 9 | 
def simulate_sequence(length):
    '''returns a random DNA string with length nucleotides'''
    #the four nucleotides to draw from
    dna = ['A', 'C', 'G', 'T']
    #one random draw per position, glued together at the end
    return ''.join([random.choice(dna) for position in range(length)])
22 | 
23 | #first parameter is the number of sequences to generate
24 | setsize = int(sys.argv[1])
25 | #minimum and maximum sequence lengths
26 | minlength = int(sys.argv[2])
27 | maxlength = int(sys.argv[3])
28 | 
29 | #initializes a list to store the sequence set
30 | sequenceset = []
31 | for i in range(setsize):
32 |     #generate a random integer between min and max seq lenght
33 |     rlength = random.randint(minlength, maxlength)
34 |     #appending to the sequence set and calling simulated sequence
35 |     #function
36 |     sequenceset.append(simulate_sequence(rlength))
37 | 
38 | #printing output
39 | for sequence in sequenceset:
40 |     print sequence
41 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_15.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''
 4 | a even more elaborated DNA sequence simulation script with
 5 | sequence identity calculation (not overall, just neighbours)
 6 | '''
 7 | 
 8 | import random
 9 | import sys
10 | 
def simulate_sequence(length):
    '''builds a random sequence of length nucleotides'''
    #nucleotides list
    dna = ['A', 'C', 'G', 'T']
    drawn = []
    #one random pick from the nucleotide list per position
    while len(drawn) < length:
        drawn.append(random.choice(dna))
    return ''.join(drawn)
20 | 
def nucleotide_percentage(sequence):
    '''Print how often A, C, G and T occur in `sequence`, on one line.

    Despite the name, the printed figures are absolute counts and not
    percentages. Nothing is returned.
    '''
    #base to count and the label printed right after the number
    labels = (('A', ' As '), ('C', ' Cs '), ('G', ' Gs '), ('T', ' Ts '))
    fields = [str(sequence.count(base)) + label for base, label in labels]
    #every field ends in a blank and one more blank separates the fields,
    #e.g. "3 As  1 Cs  0 Gs  2 Ts "
    print(' '.join(fields))
27 | 
def sequence_identity(seqset):
    '''Return the identity of every sequence to its successor in the set.

    seqset -- list of sequence strings

    The result holds len(seqset) - 1 floats (an empty list for fewer than
    two sequences). Entry x is the number of positions at which sequence x
    and sequence x + 1 carry the same nucleotide, divided by the length of
    sequence x. Only the positions both sequences have are compared, so
    sequences of different lengths no longer raise an IndexError; an empty
    sequence scores 0.0 instead of dividing by zero.

    The indices of each compared pair are printed as a side effect.
    '''
    iden = []
    #walk over all sequences but the last, each one has a right neighbour
    for x in range(len(seqset) - 1):
        print('%d %d' % (x, x + 1))
        current, following = seqset[x], seqset[x + 1]
        if not current:
            #nothing to compare and nothing to divide by
            iden.append(0.0)
            continue
        #zip stops at the shorter sequence, positions beyond it cannot match
        matches = sum(1 for a, b in zip(current, following) if a == b)
        iden.append(float(matches) / len(current))
    return iden
43 | 
#input parameters: set size, minimum and maximum sequence length
setsize = int(sys.argv[1])
minlength = int(sys.argv[2])
maxlength = int(sys.argv[3])

#simulate the whole set, every sequence with its own random length
sequenceset = [simulate_sequence(random.randint(minlength, maxlength))
               for i in range(setsize)]

#identity of each sequence to the one that follows it
identity = sequence_identity(sequenceset)

#report: sequence, identity to the next one (the last sequence has no
#successor), nucleotide counts and a blank separator line
last = len(sequenceset) - 1
for i, sequence in enumerate(sequenceset):
    print(sequence)
    if i < last:
        print('sequence identity to next sequence : ' + str(identity[i]))
    nucleotide_percentage(sequence)
    print('')
65 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_16.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | '''
extremely simple script for DNA transcription
 5 | '''
 6 | 
 7 | 
 8 | dna = 'ACGTTGCAACGTTGCAACGTTGCA'
 9 | #string buil-in replace method
10 | rna = dna.replace('T', 'U')
11 | print rna
12 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_17.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | '''
 4 | reading a sequence file and printing first and last lines
 5 | '''
 6 | 
 7 | dnafile = "AY162388.seq"
 8 | file = open(dnafile, 'r').readlines()
 9 | print 'I want the first line'
10 | print file[0]
11 | print 'now the last line'
12 | #print file[len(file)-1]
13 | print file[-1]
14 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_18.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''
 4 | translating DNA into proteins
 5 | '''
 6 | 
 7 | #import our homemade module that has the DNA
 8 | #translating function
 9 | import dnatranslate
10 | 
#OK, we are using the same DNA file; the with block closes it after reading
with open("AY162388.seq", 'r') as handle:
    dnafile = handle.readlines()

#strip the line ends and join all lines into one sequence string
sequence = ''.join([line.strip() for line in dnafile])

#call the function in our module to translate the sequence
protein = dnatranslate.translate_dna(sequence)

#output, simple, we could make it better: each value with its length
print('%s %s' % (sequence, len(sequence)))
print('')
print('%s %s' % (protein, len(protein)))
26 | 
27 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_18a.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import dnatranslate
 4 | 
 5 | dnafile = open("AY162388.seq", 'r').readlines()
 6 | 
 7 | sequence = ''
 8 | for line in dnafile:
 9 |     sequence += line.strip()
10 | 
11 | 
12 | protein = dnatranslate.translate_dna(sequence)
13 | 
14 | print sequence, len(sequence)
15 | print
16 | print protein, len(protein)
17 | 
18 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_19.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''
 4 | using the fasta module to read sequences
 5 | '''
 6 | 
 7 | #import our freshly created module
 8 | import fasta
 9 | import sys
10 | 
#open the file named on the command line, hand its lines to the fasta
#reading function and close the file again
with open(sys.argv[1], 'r') as handle:
    sequences = fasta.read_fasta(handle.readlines())

for i in sequences:
    #print the sequence name
    print(i.name)
    #use range with a step of 80, printing 80 characters at
    #a time. The value could be set by an input parameter
    for j in range(0, len(i.sequence), 80):
        print(i.sequence[j:j+80])
23 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_20.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''
 4 | simple to translate dna into proteins
 5 | '''
 6 | 
 7 | #importing the dnatranslate module
 8 | import dnatranslate
 9 | import sys
10 | import fasta
11 | 
#read the fasta file named on the command line; the with block closes it
with open(sys.argv[1], 'r') as handle:
    dna = fasta.read_fasta(handle.readlines())

#iterate over the sequences, translate them and print name and protein
for item in dna:
    protein = dnatranslate.translate_dna(item.sequence)
    print(item.name)
    print(protein)


--------------------------------------------------------------------------------
/scripts/original_scripts/code_21.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | #import two modules
 4 | import dnatranslate
 5 | import fasta
 6 | import sys
 7 | 
 8 | 
 9 | #read the fasta file in one line: open the file, read the contents
10 | #and send it to the fasta reading function
11 | dna = fasta.read_fasta(open(sys.argv[1], 'r').readlines())
12 | 
13 | for item in dna:
14 |     #translate the DNA
15 |     protein = dnatranslate.translate_dna(item.sequence)
16 |     print item.name
17 |     #format and print the protein
18 |     print fasta.format_output(protein, 60)


--------------------------------------------------------------------------------
/scripts/original_scripts/code_22.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import re
 3 | import fasta
 4 | 
 5 | #function that reads the enzyme list
 6 | def read_enzymes(file):
 7 |     #initialize dictionary
 8 |     resenz = {}
 9 |     start = False
10 |     for line in file:
11 |         #if we found the string we jump a line and start reading the list
12 |         if line.find('Rich Roberts') >= 0:
13 |             start = True
14 |             line = file.next()
15 |         if start == True and len(line) > 10:
16 |             buffer = line.split()
17 |             #populating the dictionary
18 |             resenz[buffer[0]] = buffer[-1].replace('^', '')
19 |     return resenz
20 | 
21 | #function that checks if the input enzyme name was found in the list
def check_enzyme(input, set):
    '''Return True if the enzyme name `input` is a key of `set`, else False.

    input -- enzyme name to look for
    set   -- dictionary of enzymes as returned by read_enzymes
    '''
    #the in operator replaces dict.has_key, which Python 3 no longer has
    return input in set
27 | 
def find_sites(input, set, sequence):
    '''Locate the recognition site of one enzyme in a sequence.

    input    -- enzyme name, must be a key of `set`
    set      -- dictionary enzyme name -> recognition site in IUPAC code
    sequence -- string to search

    Returns a tuple (sites, positions): the matched substrings and the
    index at which each match starts, both in order of appearance.
    Matches do not overlap. A KeyError is raised for an unknown enzyme
    name or a site letter outside the IUPAC table below.
    '''
    #IUPAC codes and the character class each one stands for
    iupacdict = {'A':'[A]',
    'C':'[C]',
    'G':'[G]',
    'T':'[T]',
    'M':'[AC]',
    'R':'[AG]',
    'W':'[AT]',
    'S':'[CG]',
    'Y':'[CT]',
    'K':'[GT]',
    'V':'[ACG]',
    'H':'[ACT]',
    'D':'[AGT]',
    'B':'[CGT]',
    'X':'[ACGT]',
    'N':'[ACGT]'}

    #translate the site letter by letter into one regular expression
    regex = re.compile(''.join([iupacdict[code] for code in set[input]]))
    #collect the matches once, then split them into text and start index
    hits = list(regex.finditer(sequence))
    sites = [hit.group() for hit in hits]
    positions = [hit.start() for hit in hits]
    return sites, positions
64 | 
#read the enzyme name
enzyme = sys.argv[1]
#read the list; the with block closes the file after parsing
with open('bionet.709', 'r') as handle:
    enzymeset = read_enzymes(handle)
isname = check_enzyme(enzyme, enzymeset)

if isname:
    print('Name found')
    #if we found the enzyme name we read the sequence file
    with open(sys.argv[2], 'r') as handle:
        sequences = fasta.read_fasta(handle.readlines())
    for item in sequences:
        #let's search
        sites, positions = find_sites(enzyme, enzymeset, item.sequence)
        #print the first 20 characters of the sequence name
        print(item.name[:20]+'...')
        #and use the zip function to pair each site with its position
        for i in zip(sites,positions):
            print('%s -> %s' % (i[0], i[1]))
#if the name is not found, we bail out
else:
    print('Enzyme name not found, please try again')
86 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_23.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import re
 3 | 
 4 | gbfile = open(sys.argv[1], 'r').readlines()
 5 | 
 6 | sequence = ''
 7 | issequence = False
 8 | for line in gbfile:
 9 |     if issequence == True:
10 |         sequence += line
11 |     elif line.find('ORIGIN') >= 0:
12 |         issequence = True
13 | 
14 | print sequence


--------------------------------------------------------------------------------
/scripts/original_scripts/code_24.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | gbfile = open(sys.argv[1], 'r').readlines()
 4 | 
 5 | sequence = ''
 6 | issequence = False
 7 | for line in gbfile:
 8 |     if issequence == True and not line.find('/') == 0:
 9 |         sequence += line.lstrip('0123456789 ').replace(' ', '')
10 |     elif line.find('ORIGIN') >= 0:
11 |         issequence = True
12 | 
13 | print sequence


--------------------------------------------------------------------------------
/scripts/original_scripts/code_25.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | gbfile = open(sys.argv[1], 'r').readlines()
 4 | 
 5 | locus = ''
 6 | organism = ''
 7 | accession = ''
 8 | for line in gbfile:
 9 |     if line.find('LOCUS') >= 0:
10 |         locus = line
11 |     elif line.find('ACCESSION') >= 0:
12 |         accession = line
13 |     elif line.find('ORGANISM') >= 0:
14 |         organism = line
15 | 
16 | print locus.strip()
17 | print organism.strip()
18 | print accession.strip()


--------------------------------------------------------------------------------
/scripts/original_scripts/code_26.py:
--------------------------------------------------------------------------------
 1 | class Fasta:
 2 |     def __init__(self, name, sequence):
 3 |         self.name = name
 4 |         self.sequence = sequence
 5 | 
 6 | def read_fasta(file):
 7 |     items = []
 8 |     index = 0
 9 |     for line in file:
10 |         if line.startswith(">"):
11 |            if index >= 1:
12 |                items.append(aninstance)
13 |            index += 1
14 |            name = line[:-1]
15 |            seq = ''
16 |            aninstance = Fasta(name, seq)
17 |         else:
18 |            seq += line[:-1]
19 |            aninstance = Fasta(name, seq)
20 | 
21 |     items.append(aninstance)
22 |     return items
23 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_27.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import fasta
 5 | 
 6 | file = sys.argv[1]
 7 | temp = file.split('.')
 8 | filename_base = temp[0]
 9 | tag = temp[1]
10 | 
11 | sequences = fasta.read_fasta(open(file, 'r').readlines())
12 | 
13 | count = 1
14 | for i in sequences:
15 |     f = filename_base + '_' + str(count) + '.' + tag
16 |     output = open(f, 'w')
17 |     output.write(i.name + '\n')
18 |     output.write(i.sequence)
19 |     count += 1
20 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_28.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import random
 4 | import sys
 5 | 
 6 | def simulate_sequence(length):
 7 |     dna = ['A', 'C', 'G', 'T']
 8 |     sequence = ''
 9 |     for i in range(length):
10 |         sequence += random.choice(dna)
11 |     return sequence
12 | 
#command line: sequences per set, minimum length, maximum length and
#the number of sets to generate
setsize = int(sys.argv[1])
minlength = int(sys.argv[2])
maxlength = int(sys.argv[3])
nsets = int(sys.argv[4])

for set_number in range(nsets):
    #every sequence of the set gets its own random length
    sequenceset = [simulate_sequence(random.randint(minlength, maxlength))
                   for _ in range(setsize)]

    for sequence in sequenceset:
        print(sequence)

    #blank line after each set
    print('')
28 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_29.py:
--------------------------------------------------------------------------------
 1 | def read_seqs(file):
 2 |     items = []
 3 |     seq = ''
 4 |     index = 0
 5 |     for line in file:
 6 |         if line.startswith(">"):
 7 |             if index >= 1:
 8 |                 items.append(seq)
 9 |                 seq = ''
10 |             index += 1
11 |         else:
12 |             seq += line[:-1]
13 | 
14 |     items.append(seq)
15 |     return items
16 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_30.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | 
3 | import fasta
4 | import sys
5 | 
#read the sequences of the file named on the command line; the with
#block closes the file again
with open(sys.argv[1], 'r') as handle:
    data = fasta.read_seqs(handle.readlines())
#print the list of sequence lengths
print([len(seq) for seq in data])
8 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_31.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | import sys
 4 | 
#command line: <fasta file> <start> <end>
file = sys.argv[1]
start = int(sys.argv[2])
end = int(sys.argv[3])

#running count of the characters seen on non-header lines; len(line) is
#the whole line as read from the file, so a line end is counted as well
size = 0
#the lines collected for the output
segment = ''
for line in open(file, 'r'):
    if not line.startswith('>'):
       size += len(line)
    else:
        #header line: remembered for the output, a later header overwrites it
        name = line
    #the test runs for every line, header lines included, and uses the
    #count AFTER the current line was added; a matching line is kept whole
    if size >= start and size <= end:
        segment += line

#NOTE(review): name is only bound once a '>' line was read - a file without
#a header presumably ends here with a NameError; confirm inputs are FASTA
print name, segment
20 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_32.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | import sys
 4 | 
#command line: <fasta file> <start> <end>
file = sys.argv[1]
start = int(sys.argv[2])
end = int(sys.argv[3])

#running count of the characters on non-header lines (line ends included)
size = 0
#length of the most recently collected line without surrounding whitespace
linesize = 0
#collected lines, stripped
segment = []
for line in open(file, 'r'):
    if not line.startswith('>'):
        size += len(line)
    else:
        name = line
        #NOTE(review): this test sits inside the else branch, so as written
        #only header lines can be appended and linesize is taken from a
        #header - presumably it was meant to run for sequence lines; confirm
        if size >= start and size <= end+linesize:
            segment.append(line.strip())
            linesize = len(line.strip())

#NOTE(review): linesize is still 0 when no line was collected above, which
#makes the divisions below fail with ZeroDivisionError
#number of the line that holds start / end (1-based); under Python 2 the
#division of two ints is an integer division
startline = (start / linesize) + 1
endline = (end / linesize) + 1

#trim the first and/or the last collected line when start or end is not a
#multiple of the line length
if not start % linesize == 0 and not end % linesize == 0:
    segment[0] = segment[0][startline*linesize-start:]
    segment[-1] = segment[-1][endline*linesize-end:]
elif not start % linesize == 0:
    segment[0] = segment[0][startline*linesize-start:]
elif not end % linesize == 0:
    segment[-1] = segment[-1][endline*linesize-end:]

#NOTE(review): name is only bound once a '>' line was read
print name, '\n'.join(segment)
33 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_33.py:
--------------------------------------------------------------------------------
 1 | def parse_entry(gene_data):
 2 |     #changes a string to list, splitting at line ends
 3 |     gene_data = gene_data.split('\n')
 4 |     start, end = 0, 0
 5 |     gi_id = ''
 6 |     id = ''
 7 |     complement = False
 8 |     for line in gene_data:
 9 |         if line.find('  CDS  ') >=0:
10 |             temp = line.split()
11 |             if temp[1].find('complement') >= 0:
12 |                 complement = True
13 |                 temp[1] = temp[1].replace('complement(', '')
14 |                 temp[1] = temp[1].replace(')', '')
15 |             temp2 = temp[1].split('..')
16 |             start = temp2[0]
17 |             end = temp2[1]
18 |         elif line.find('GI:') >= 0:
19 |             gi_id = 'gi' + line[line.find('GI:')+3:-1]
20 |         elif line.find('/product') >=0:
21 |             id = line[line.find('=') + 2:-1]
22 |         elif line.find('protein_id') >= 0:
23 |             id += '\t' + line[line.find('=') + 2: -1]
24 | 
25 |     return CDSinfo(gi_id, id, start, end, complement)
26 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_34.py:
--------------------------------------------------------------------------------
 1 | def parse_entry(gene_data):
 2 |     #changes a string to list, splitting at line ends
 3 |     gene_data = gene_data.split('\n')
 4 |     start, end = 0, 0
 5 |     gi_id = ''
 6 |     id = ''
 7 |     complement = False
 8 |     for line in gene_data:
 9 |         if line.find('  CDS  ') >=0:
10 |             temp = line.split()
11 |             if temp[1].find('complement') >= 0:
12 |                 complement = True
13 |                 temp[1] = temp[1].replace('complement(', '')
14 |                 temp[1] = temp[1].replace(')', '')
15 |             temp2 = temp[1].split('..')
16 |             start = temp2[0]
17 |             end = temp2[1]
18 |         elif line.find('GI:') >= 0:
19 |             gi_id = 'gi' + line[line.find('GI:')+3:-1]
20 |         elif line.find('/product') >=0:
21 |             id = line[line.find('=') + 2:-1]
22 |         elif line.find('protein_id') >= 0:
23 |             id += '\t' + line[line.find('=') + 2: -1]
24 | 
25 |     return CDSinfo(gi_id, id, start, end, complement)
26 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_35.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | 
  3 | '''
  4 | script that extracts sequences from a GenBank file. Script reads the gene CDS from the file and
  5 | builds a list of start and end positions, and if gene is complement outputs the 5'3' sequence and
  6 | its reverse complement
  7 | only input is a GenBank file
  8 | outputs a fasta file with the GI ID as its name.
  9 | '''
 10 | 
 11 | import sys
 12 | import fasta
 13 | 
 14 | class CDSinfo:
 15 |     '''
 16 |     CDSinf class to store all the information from CDS
 17 |     '''
 18 |     def __init__(self, gi_id, id, start, end, complement):
 19 |         self.gi_id = gi_id
 20 |         self.id = id
 21 |         self.start = start
 22 |         self.end = end
 23 |         self.complement = complement
 24 | 
 25 | def parse_entry(gene_data):
 26 |     '''
 27 |     each CDS entry is obtained in the main function and a string of lines with
 28 |     information is passed to parse_entry to be parsed and have information extracted
 29 |     '''
 30 | 
 31 |     gene_data = gene_data.splitlines() #changes a string to list, splitting at line ends
 32 |     start, end = 0, 0
 33 |     gi_id = ''
 34 |     id = ''
 35 |     complement = False
 36 |     for line in gene_data: #searches for regions annotated as CDS
 37 |         if line.find('  CDS  ') >=0:
 38 |             temp = line.split()
 39 |             #checks for complement sequence, if true remove extra characters
 40 |             if temp[1].find('complement') >= 0:
 41 |                 complement = True
 42 |                 temp[1] = temp[1].replace('complement(', '')
 43 |                 temp[1] = temp[1].replace(')', '')
 44 |             temp2 = temp[1].split('..')
 45 |             start = temp2[0]
 46 |             end = temp2[1]
 47 |         #checks for GI IDs
 48 |         elif line.find('GI:') >= 0:
 49 |             gi_id = 'gi' + line[line.find('GI:')+3:-1]
 50 |         #get the gene name/function
 51 |         elif line.find('/product') >=0:
 52 |             id = line[line.find('=') + 2:-1]
 53 |         #and adds the protein id
 54 |         elif line.find('protein_id') >= 0:
 55 |             id += '\t' + line[line.find('=') + 2: -1]
 56 | 
 57 |     return CDSinfo(gi_id, id, start, end, complement)
 58 | 
 59 | #only input is the genbank file with annotation and sequence
 60 | gbfile = open(sys.argv[1])
 61 | 
 62 | index = 0
 63 | entry = ''
 64 | sequence = []
 65 | is_seq = False
 66 | 
 67 | genes = []
 68 | for line in gbfile:
 69 |     #reads the genbank file and whenever finds a gene annotation
 70 |     #concatenate the lines up to the next gene
 71 |     if line.find('  gene ') >= 0:
 72 |         #if an entry is complete, send it to parse
 73 |         if index >= 1:
 74 |             #appends to a list of CDSinfo objects
 75 |             genes.append(parse_entry(entry))
 76 |             entry = ''
 77 |         index += 1
 78 |         entry += line
 79 |     elif line.find('ORIGIN') >= 0:
 80 |         #found sequence start, set the flag on and parses the last entry
 81 |         is_seq = True
 82 |         genes.append(parse_entry(entry))
 83 |     elif is_seq == True:
 84 |         #if flag is true keep going, usually sequences are store at the end of the file
 85 |         line = line.split()
 86 |         sequence.append(line)
 87 |     else:
 88 |         #this is an entry so append line
 89 |         entry += line
 90 | 
 91 | str_seq = ''
 92 | #make the sequence a string
 93 | for i in sequence:
 94 |     str_seq += ''.join(i[1:]).upper()
 95 | 
 96 | for i in genes:
 97 |     if len(i.gi_id) > 2:
 98 |         print i.id, i.start, i.end
 99 |         output = open(i.gi_id + '.DNA.fasta', 'w')
100 |         output.write('>' + i.gi_id + '\t' + i.id + '\n')
101 |         # if this is a complement, print both 5'-3' and reverse complement sequences
102 |         if i.complement == True:
103 |             output.write(fasta.format_output(fasta.invert(str_seq[int(i.start)-1:int(i.end)]), 80) + '\n')
104 |         else:
105 |             if not i.start.find('join') >= 0:
106 |                 output.write(fasta.format_output(str_seq[int(i.start)-1:int(i.end)], 80))
107 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_36.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | '''
 4 | input is a GenBank file. The script searches for gene annotations, extract all lines
 5 | from the file and then parses these lines in order to extract protein sequences
 6 | Ribosomal genes and other non-coding genes are not extracted - plan to do it later
 7 | output is a fasta formatted file
 8 | '''
 9 | 
10 | import sys
11 | import fasta
12 | 
class Protein:
    '''
    plain record holding what is known about one protein
    '''
    def __init__(self, gi, id, sequence):
        #'gi' followed by the GI number, '' if none was found
        self.gi = gi
        #product text, optionally followed by a tab and the protein id
        self.id = id
        #translation text as collected from the entry
        self.sequence = sequence

def parse_entry(gene_data):
    '''
    parses one entry, passed in as a single string by the main loop, and
    returns a Protein with the gi, the product/protein id and the text of
    the /translation qualifier
    '''
    prot_id = ''
    sequence = ''
    gi_id = ''
    rows = gene_data.splitlines()
    for row in rows:
        if '/product' in row:
            #text two characters after '=', last character of the line dropped
            prot_id = row[row.find('=') + 2:-1]
        elif 'protein_id' in row:
            prot_id += '\t' + row[row.find('=') + 2:-1]
        elif 'GI:' in row:
            gi_id = 'gi' + row[row.find('GI:') + 3:-1]
        elif '/translation' in row:
            #the translation starts on this line ...
            sequence = row[row.find('=') + 2:]
            #... and continues on the lines after the first line equal to
            #this one, up to a sig_peptide line or the end of the entry
            for follower in rows[rows.index(row) + 1:]:
                if 'sig_peptide' in follower:
                    break
                sequence += follower.strip()

    return Protein(gi_id, prot_id, sequence)
49 | 
proteins = []
index = 0
entry = ''
#only input is a genbank file; the with block closes it again, also when
#the loop is left early
with open(sys.argv[1]) as gbfile:
    for line in gbfile:
        if line.find('  gene ') >= 0:
            #a new gene starts: the lines collected so far are one entry
            if index >= 1:
                #parses the entry and appends the result to the list
                proteins.append(parse_entry(entry))
                entry = ''
            index += 1
            entry += line
        elif line.find('ORIGIN') >= 0:
            #found the DNA sequence, we can stop now
            break
        else:
            entry += line

#parses the last entry after leaving the loop
proteins.append(parse_entry(entry))

#output: one fasta file per protein that has a GI id
for i in proteins:
    if len(i.gi) > 2:
        print('%s %s' % (i.gi, i.id))
        #remove the quote characters left over from the qualifier value
        i.sequence = i.sequence.replace('\"', '')
        #the with block flushes and closes every output file
        with open(i.gi + '.fasta', 'w') as output:
            output.write('>' + i.gi + '\t' + i.id + '\n')
            output.write(fasta.format_output(i.sequence, 80))
        print(i.id)
82 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_37.py:
--------------------------------------------------------------------------------
 1 | def merge_seqs(data1, data2):
 2 | 
 3 |     myset1, myset2 = Set([]), Set([])
 4 | 
 5 |     for i in data1:
 6 |         myset1.add(i.name[i.name.find('|')+1:i.name.find('/')])
 7 | 
 8 |     for i in data2:
 9 |         myset2.add(i.name[i.name.find('|')+1:i.name.find('/')])
10 | 
11 |     mylist = Set.intersection(myset1, myset2)
12 | 
13 |     flist = []
14 |     for i in mylist:
15 |         for j in data1:
16 |             if j.name[j.name.find('|')+1:j.name.find('/')] == i:
17 |                 for k in data2:
18 |                     if k.name[k.name.find('|')+1:k.name.find('/')] == j.name[j.name.find('|')+1:j.name.find('/')]:
19 |                         tempname = j.name + '-' + k.name + '->' + str(len(j.sequence))
20 |                         tempseq = j.sequence + k.sequence
21 |                         flist.append(tempname + '\n' + tempseq)
22 | 
23 |     return flist
24 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_38.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nuin/beginning-python-for-bioinformatics/b33813f4ec11a59a5c6381cc5b78044824d25e3f/scripts/original_scripts/code_38.py


--------------------------------------------------------------------------------
/scripts/original_scripts/code_39.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nuin/beginning-python-for-bioinformatics/b33813f4ec11a59a5c6381cc5b78044824d25e3f/scripts/original_scripts/code_39.py


--------------------------------------------------------------------------------
/scripts/original_scripts/code_40.py:
--------------------------------------------------------------------------------
 1 | def merge_seqs(data1, data2):
 2 |     first, second = dict(), dict()
 3 |     for i in data1:
 4 |         first[i.name[i.name.find('|') + 1:i.name.find('/')]] = i
 5 | 
 6 |     for i in data2:
 7 |         second[i.name[i.name.find('|') + 1:i.name.find('/')]] = i
 8 | 
 9 |     shared_ids = set(first).intersection(set(second))
10 | 
11 |     flist = []
12 |     for i in shared_ids:
13 |         j = first[i]
14 |         k = second[i]
15 |         tempname = j.name + '-' + k.name + '-&gt;' + str(len(j.sequence))
16 |         tempseq = j.sequence + k.sequence
17 |         flist.append(tempname + '\n' + tempseq)
18 | 
19 |     return flist
20 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_41.py:
--------------------------------------------------------------------------------
 1 | def merge_seqs(data1, data2):
 2 |     from itertools import chain, groupby
 3 |     format = "%s-%s->%d\n%s%s"
 4 |     flist = []
 5 |     keyfunc = lambda it: it.name[it.name.find('|') + 1 : it.name.find('/')]
 6 |     for it, g in groupby(sorted(chain(data1, data2), key=keyfunc), keyfunc):
 7 |         values = list(g)
 8 |         if len(values) == 2:
 9 |             jname, jseq = values[0].name, values[0].sequence
10 |             kname, kseq = values[1].name, values[1].sequence
11 |             flist.append(format % (jname, kname, len(jseq), jseq, kseq) )
12 | 
13 |     return flist
14 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_42.py:
--------------------------------------------------------------------------------
 1 | import fasta
 2 | import sys
 3 | 
 4 | def permutations(items, n):
 5 |     if n == 0:
 6 |         yield ''
 7 |     else:
 8 |         for i in range(len(items)):
 9 |             for base in permutations(items, n - 1):
10 |                 yield str(items[i]) + str(base)
11 | 
#read the sequences of the file named on the command line and close it
with open(sys.argv[1]) as handle:
    seqs = fasta.get_seqs(handle.readlines())
#word length, converted to a number once
length = int(sys.argv[2])

nucleotides = ['A', 'C', 'G', 'T']

#all sequences joined into one string
merged_seqs = ''.join([i.sequence for i in seqs])

#one line per possible word: the word, a tab and its count
for i in permutations(nucleotides, length):
    #NOTE: str.count counts non-overlapping occurrences only; the count is
    #converted with str, a string and a number cannot be added
    print(i + '\t' + str(merged_seqs.count(i)))
23 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_43.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from collections import defaultdict
 4 | import sys
 5 | import fasta
 6 | 
 7 | seqs = fasta.get_seqs(open(sys.argv[1]).readlines())
 8 | length = int(sys.argv[2])
 9 | 
10 | #for a missing key, the dict entry is initialized to zero
11 | counts = defaultdict(int)
12 | 
13 | #count the length-element subsequences in each sequence
14 | for i in seqs:
15 | 	for n in range(len(i.sequence) - length):
16 | 		counts[i.sequence[n : n + length]] += 1
17 | 
18 | #counts.keys() will then return the nucleotide sequences
19 | #that were actually in merged_seqs
20 | 
21 | #print out the sequences that occur more than once
22 | for count in counts:
23 |         print ''.join(count), counts[count]
24 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_44.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from collections import defaultdict
 4 | import sys
 5 | import fasta
 6 | 
 7 | seqs = fasta.get_seqs(open(sys.argv[1]).readlines())
 8 | length = int(sys.argv[2])
 9 | 
10 | quorum = defaultdict(list)
11 | 
12 | seq_number = 0
13 | for i in seqs:
14 |     seq_number += 1
15 |     for n in range(len(i.sequence) - int(length)):
16 |         if not seq_number in quorum[i.sequence[n : n + length]]:
17 |             quorum[i.sequence[n : n + length]].append(seq_number)
18 | 
19 | for i in quorum:
20 |     print ''.join(i).upper(), len(quorum[i])
21 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_45.py:
--------------------------------------------------------------------------------
 1 | def fac_01(n):
 2 |     result = 1
 3 |     for i in xrange(2, n+1):
 4 |         result *= i
 5 |     return result
 6 | 
 7 | def fac_02(n):
 8 |     value = reduce(lambda i, j : i * j, range(1, n + 1))
 9 |     return value
10 | 
def fac_03(n):
    """Return n! by folding operator.mul over 2..n."""
    import operator
    # reduce is a builtin only on Python 2; functools has it on both 2 and 3
    from functools import reduce
    # Without the initial value 1 the empty range for n < 2 raises TypeError.
    value = reduce(operator.mul, range(2, n + 1), 1)
    return value
15 | 
def fac_04(n):
    """Return n! using a recursive lambda.

    Values of n below 2 return 1. Recursion depth grows linearly with n.
    """
    # A conditional expression replaces the `cond and a or b` idiom, and the
    # Python-2-only long() is dropped: integers grow to arbitrary size anyway.
    fac = lambda n: fac(n - 1) * n if n > 1 else 1
    return fac(n)
19 | 
def fac_05(n):
    """Return n! using a recursive lambda with list-index selection."""
    # [1, 0][n > 0] is 0 (falsy) when n > 0, so `or` falls through to the
    # recursive product; for n <= 0 it is 1 and the recursion stops.
    # (The original line contained the HTML entity "&gt;" instead of ">".)
    fac = lambda n: [1, 0][n > 0] or fac(n - 1) * n
    return fac(n)
23 | 
def fac_06(n):
    """Return n! by folding over 0..n-1 and multiplying by each index + 1."""
    # reduce is a builtin only on Python 2; functools has it on both 2 and 3
    from functools import reduce
    # The explicit initial value 1 covers n <= 0 (empty range).
    fac = lambda n: reduce(lambda a, b: a * (b + 1), range(n), 1)
    return fac(n)
27 | 
def fac_07(n):
    """Return n! using list-index selection in front of a reduce."""
    # reduce is a builtin only on Python 2; functools has it on both 2 and 3
    from functools import reduce
    # [1, 0][n > 0] yields 1 for n <= 0 and short-circuits; for n > 0 it is
    # 0 (falsy), so the reduce over 1..n supplies the result.
    # (The original line contained the HTML entity "&gt;" instead of ">".)
    fac = lambda n: [1, 0][n > 0] or reduce(lambda x, y: x * y, range(1, n + 1))
    return fac(n)
31 | 
def fac_08(n):
    """Return n! using a guarded reduce; n <= 0 gives 1."""
    # reduce is a builtin only on Python 2; functools has it on both 2 and 3
    from functools import reduce
    # `n <= 0 or reduce(...)` would return the bool True for n <= 0; the
    # conditional expression returns the integer 1 instead.
    # (The original line contained the HTML entity "&lt;" instead of "<".)
    fac = lambda n: 1 if n <= 0 else reduce(lambda a, b: a * b, range(1, n + 1))
    return fac(n)
35 | 
def fac_09(n):
    """Return n! as the last element of a list of running products.

    The nested-comprehension form this replaces rebinds ``j`` from an inner
    comprehension, which only accumulates where comprehension variables leak
    into the enclosing scope (Python 2), and it indexes an empty list for
    n < 2. Building the running products explicitly avoids both problems.
    """
    products = [1]  # seed so that n < 2 yields 1
    for i in range(2, n + 1):
        products.append(products[-1] * i)
    return products[-1]
39 | 
def fac_10(n):
    """Return n! from a single comprehension that rebinds its accumulator."""
    # Inside one comprehension the trailing `for j in [j * i]` rebinds j to
    # the running product, so the comprehension lists 2!, 3!, ..., n!.
    # The leading [1] keeps the [-1] lookup valid when n < 2, where the
    # comprehension is empty.
    fac = lambda n: ([1] + [j for j in [1] for i in range(2, n + 1) for j in [j * i]])[-1]
    return fac(n)
43 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_46.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import timeit
 4 | 
 5 | def fac(n, m):
 6 |     result1 = 1
 7 |     for i in xrange(2, n + 1):
 8 |         result1 *= i
 9 |     result2 = 1
10 |     for i in xrange(2, m + 1):
11 |         result2 *= i
12 |     result3 = 1
13 |     for i in xrange(2, (n - m) + 1):
14 |         result3 *= i 
15 | 
16 |     return  result1 / (result2 * result3) 
17 | 
def binom(n, m):
    """Return C(n, m) by building rows of Pascal's triangle in place.

    Returns 0 when ``m`` lies outside ``0..n``, matching ``choose``; without
    the guard a negative ``m`` would index the row from its end.
    """
    if not 0 <= m <= n:
        return 0
    b = [0] * (n + 1)
    b[0] = 1
    for i in range(1, n + 1):
        b[i] = 1
        # Walk right-to-left so b[j - 1] still holds the previous row's value
        # when it is added into b[j].
        # (The original loop condition contained "&gt;" instead of ">".)
        for j in range(i - 1, 0, -1):
            b[j] += b[j - 1]
    return b[m]
28 | 
def choose(n, k):
    """Return the binomial coefficient C(n, k); 0 when k is outside 0..n.

    Only min(k, n - k) factors are multiplied, using C(n, k) == C(n, n - k).
    """
    if not 0 <= k <= n:
        return 0
    k = min(k, n - k)
    numerator = 1
    denominator = 1
    # range (not the Python-2-only xrange) keeps this runnable on 2 and 3
    for step in range(k):
        numerator *= n - step      # n * (n - 1) * ... * (n - k + 1)
        denominator *= step + 1    # 1 * 2 * ... * k
    return numerator // denominator
41 | 
if __name__ == "__main__":

    # (output format, function to time, number of repetitions); each
    # function is called as f(3000, 7) and the mean time per call is printed.
    benchmarks = [
        ('fac: %.9f', 'fac', 100),
        ('binom: %.2f', 'binom', 10),
        ('choose %.9f', 'choose', 100),
    ]

    for fmt, func_name, runs in benchmarks:
        timer = timeit.Timer(stmt='%s(3000, 7)' % func_name,
                             setup='from __main__ import %s' % func_name)
        print(fmt % (timer.timeit(runs) / runs))
54 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_47.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import fasta
 4 | import sys
 5 | from collections import defaultdict
 6 | 
 7 | def choose(n, k):
 8 |     if 0 <= k <= n:
 9 |         ntok = 1
10 |         ktok = 1
11 |         for t in xrange(1, min(k, n - k) + 1):
12 |             ntok *= n
13 |             ktok *= t
14 |             n -= 1
15 |         #print ntok // ktok
16 |         return ntok // ktok
17 |     else:
18 |         return 0
19 | 
def get_quorums(seqs, mlen):
    """Map every ``mlen``-long word to the set of sequences containing it.

    seqs: iterable of sequence strings.
    mlen: word (window) length.

    Returns a ``defaultdict(set)`` whose values hold the 1-based positions
    (within ``seqs``) of the sequences in which the word occurs. Sequences
    shorter than ``mlen`` contribute nothing.
    """
    quorum = defaultdict(set)
    for id_no, seq in enumerate(seqs, 1):
        # "+ 1" so the window ending at the last character is included
        for start in range(len(seq) - mlen + 1):
            quorum[seq[start:start + mlen]].add(id_no)
    return quorum
32 | 
# Usage: <script> <foreground_fasta> <background_fasta>
# Reports 10-mers of the foreground set whose computed p value is at most
# 0.0001, together with the number of foreground and background sequences
# that contain them.
with open(sys.argv[1]) as handle:
    input_seqs = fasta.read_seqs(handle.readlines())
with open(sys.argv[2]) as handle:
    input_seqs2 = fasta.read_seqs(handle.readlines())

foreground = get_quorums(input_seqs, 10)
background = get_quorums(input_seqs2, 10)

N = len(input_seqs) + len(input_seqs2)

# Independent of the motif, so it is computed once outside the loop.
term3 = choose(N, len(input_seqs))

for i in foreground:
    n_fore = len(foreground[i])
    # background is a defaultdict: a motif absent from it reads as an empty set
    n_back = len(background[i])
    term1 = choose(n_back, n_fore)
    # NOTE(review): the second argument is len(input_seqs) - 1 for every
    # motif -- confirm this matches the intended statistic.
    term2 = choose(N - n_back, len(input_seqs) - 1)
    p = (float(term1) * float(term2)) / term3
    if 0 < p <= 0.0001:
        print('%s %s %s %s' % (i, n_fore, n_back, p))
48 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/code_48.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import fasta
 4 | import sys
 5 | from collections import defaultdict
 6 | 
 7 | def choose(n, k):
 8 |     if 0 <= k <= n:
 9 |         ntok = 1
10 |         ktok = 1
11 |         for t in xrange(1, min(k, n - k) + 1):
12 |             ntok *= n
13 |             ktok *= t
14 |             n -= 1
15 |         return ntok // ktok
16 |     else:
17 |         return 0
18 | 
def get_quorums(seqs, mlen):
    """Count the occurrences of every ``mlen``-long word across ``seqs``.

    seqs: iterable of sequence strings.
    mlen: word (window) length.

    Returns a ``defaultdict(int)`` mapping each word to the total number of
    (overlapping) windows, summed over all sequences, in which it appears.
    Sequences shorter than ``mlen`` contribute nothing.
    """
    quorum = defaultdict(int)
    for seq in seqs:
        # "+ 1" so the window ending at the last character is included
        for start in range(len(seq) - mlen + 1):
            quorum[seq[start:start + mlen]] += 1
    return quorum
29 | 
# Usage: <script> <foreground_fasta> <background_fasta>
# Prints, sorted, the 10-mers of the foreground set whose computed p value
# is at most 0.0001, as: motif, foreground count, background count, p.
with open(sys.argv[1]) as handle:
    input_seqs = fasta.read_seqs(handle.readlines())
with open(sys.argv[2]) as handle:
    input_seqs2 = fasta.read_seqs(handle.readlines())

foreground = get_quorums(input_seqs, 10)
background = get_quorums(input_seqs2, 10)

N = len(input_seqs) + len(input_seqs2)

# Independent of the motif, so it is computed once outside the loop.
term3 = choose(N, len(input_seqs))

res_motifs = []
for i in foreground:
    # get_quorums returns integer counts here, so the values are used
    # directly; calling len() on them raises TypeError.
    fore_count = foreground[i]
    back_count = background[i]  # defaultdict: 0 when the motif is absent
    # NOTE(review): N is a number of sequences while the counts are numbers
    # of occurrences -- confirm the statistic is meant to mix the two.
    term1 = choose(back_count, fore_count)
    term2 = choose(N - back_count, len(input_seqs) - 1)
    p = (float(term1) * float(term2)) / term3
    if 0 < p <= 0.0001:
        res_motifs.append('\t'.join([i, str(fore_count), str(back_count), str(p)]))

res_motifs.sort()
for i in res_motifs:
    print(i)
50 | 


--------------------------------------------------------------------------------
/scripts/original_scripts/dnatranslate.py:
--------------------------------------------------------------------------------
 1 | def translate_dna(sequence):
 2 | 
 3 |     #dictionary with the genetic code
 4 |     gencode = {
 5 |     'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
 6 |     'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
 7 |     'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
 8 |     'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
 9 |     'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 
10 |     'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
11 |     'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
12 |     'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
13 |     'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
14 |     'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
15 |     'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
16 |     'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
17 |     'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
18 |     'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
19 |     'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
20 |     'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
21 |     }
22 |     
23 |     proteinseq = ''
24 |     #loop to read DNA sequence in codons, 3 nucleotides at a time
25 |     for n in range(0,len(sequence),3):
26 |         #checking to see if the dictionary has the key
27 |         if gencode.has_key(sequence[n:n+3]) == True:
28 |             proteinseq += gencode[sequence[n:n+3]]
29 |     #return protein sequence
30 |     return proteinseq


--------------------------------------------------------------------------------
/scripts/original_scripts/fasta.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | class Fasta:
 4 |     def __init__(self, name, sequence):
 5 |         self.name = name
 6 |         self.sequence = sequence
 7 | 
 8 | def read_fasta(file):
 9 |     items = []
10 |     index = 0
11 |     for line in file:
12 |         if line.startswith(">"):
13 |            if index >= 1:
14 |                items.append(aninstance)
15 |            index+=1
16 |            name = line.strip()
17 |            seq = ''
18 |            aninstance = Fasta(name, seq)
19 |         else:
20 |            seq += line.strip()
21 |            aninstance = Fasta(name, seq)
22 | 
23 |     items.append(aninstance)
24 |     return items
25 | 
def read_seqs(file):
    """Return the sequences of FASTA-formatted lines as plain strings.

    file: iterable of text lines (an open file or a list of strings).

    Header lines (starting with '>') only separate records; their text is
    discarded. Lines before the first header are merged into the first
    sequence. Empty input yields an empty list.
    """
    items = []
    chunks = []
    seen_header = False
    for line in file:
        if line.startswith(">"):
            if seen_header:
                items.append(''.join(chunks))
                chunks = []
            seen_header = True
        else:
            # strip() removes the line ending whatever its form; slicing off
            # the last character would leave '\r' on CRLF files and eat a
            # base when the final line has no newline.
            chunks.append(line.strip())

    seq = ''.join(chunks)
    # Avoid reporting a phantom empty sequence for empty input.
    if seen_header or seq:
        items.append(seq)
    return items
41 | 
def format_output(sequence, length):
    """Break ``sequence`` into lines of at most ``length`` characters.

    The pieces are joined with newlines; no newline is appended at the end.
    """
    starts = range(0, len(sequence), length)
    return '\n'.join(sequence[pos:pos + length] for pos in starts)
47 | 
48 | 
def complement(seq):
    """Return the complementary bases of ``seq`` as a list.

    Only upper-case A, C, G and T are known; any other symbol raises
    ``KeyError``.
    """
    pairs = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    complseq = []
    for base in seq:
        complseq.append(pairs[base])
    return complseq
53 | 
def reverse_complement(seq):
    """Return the reverse complement of ``seq`` as a string."""
    # Reverse first, then complement each base via complement().
    reversed_bases = list(seq)[::-1]
    return ''.join(complement(reversed_bases))
58 | 
def transcribe(seq):
    """Return ``seq`` with every upper-case 'T' replaced by 'U'."""
    return seq.replace('T', 'U')
62 | 


--------------------------------------------------------------------------------