├── README └── scripts ├── motifs ├── fasta.py ├── pymotGUI.py └── pymotif.py └── original_scripts ├── 3.0 ├── AY162388.seq ├── code_01.py ├── code_02.py ├── code_03.py ├── code_04.py ├── code_05.py └── code_05.py.bak ├── code_01.py ├── code_02.py ├── code_03.py ├── code_04.py ├── code_05.py ├── code_06.py ├── code_07.py ├── code_08.py ├── code_09.py ├── code_10.py ├── code_11.py ├── code_12.py ├── code_13.py ├── code_14.py ├── code_15.py ├── code_16.py ├── code_17.py ├── code_18.py ├── code_18a.py ├── code_19.py ├── code_20.py ├── code_21.py ├── code_22.py ├── code_23.py ├── code_24.py ├── code_25.py ├── code_26.py ├── code_27.py ├── code_28.py ├── code_29.py ├── code_30.py ├── code_31.py ├── code_32.py ├── code_33.py ├── code_34.py ├── code_35.py ├── code_36.py ├── code_37.py ├── code_38.py ├── code_39.py ├── code_40.py ├── code_41.py ├── code_42.py ├── code_43.py ├── code_44.py ├── code_45.py ├── code_46.py ├── code_47.py ├── code_48.py ├── dnatranslate.py └── fasta.py /README: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nuin/beginning-python-for-bioinformatics/b33813f4ec11a59a5c6381cc5b78044824d25e3f/README -------------------------------------------------------------------------------- /scripts/motifs/fasta.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | class Fasta: 4 | def __init__(self, name, sequence): 5 | self.name = name 6 | self.sequence = sequence 7 | 8 | def read_fasta(file): 9 | items = [] 10 | index = 0 11 | for line in file: 12 | if line.startswith(">"): 13 | if index >= 1: 14 | items.append(aninstance) 15 | index+=1 16 | name = line.strip() 17 | seq = '' 18 | aninstance = Fasta(name, seq) 19 | else: 20 | seq += line.strip() 21 | aninstance = Fasta(name, seq) 22 | 23 | items.append(aninstance) 24 | return items 25 | 26 | def read_seqs(file): 27 | items = [] 28 | seq = '' 29 | index = 0 30 | for line in file: 31 | if line.startswith(">"): 32 | if index >= 1: 33 | items.append(seq) 34 | seq = '' 35 | index += 1 36 | else: 37 | seq += line[:-1] 38 | 39 | items.append(seq) 40 | return items 41 | 42 | def format_output(sequence, length): 43 | temp = [] 44 | for j in range(0,len(sequence),length): 45 | temp.append(sequence[j:j+length]) 46 | return '\n'.join(temp) 47 | 48 | -------------------------------------------------------------------------------- /scripts/motifs/pymotGUI.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import wx 4 | import pymot 5 | import pymotif 6 | import fasta 7 | import os 8 | 9 | class pymot(wx.App): 10 | 11 | def __init__(self, redirect=False): 12 | wx.App.__init__(self, redirect) 13 | 14 | 15 | class pymotGUI(wx.Frame): 16 | 17 | fore_file = '' 18 | back_file = '' 19 | 20 | def __init__(self, parent, id): 21 | wx.Frame.__init__(self, parent, id, 'Python Motif Finder', style=wx.DEFAULT_FRAME_STYLE) 22 | self.__do_layout() 23 | 24 | def __do_layout(self): 25 | 26 | #adding the panel 27 | panel = wx.Panel(self) 28 | 29 | #defines the menubar 30 | menubar = wx.MenuBar() 31 | 32 | #file menu 33 | filemenu = wx.Menu() 34 | foreground_menu = filemenu.Append(-1, 'Select foreground file') 35 | background_menu = filemenu.Append(-1, 'Select background file') 36 | sep = 
filemenu.AppendSeparator() 37 | quitmenu = filemenu.Append(-1, 'Quit') 38 | 39 | #appends the menu to the menubar and creates it 40 | menubar.Append(filemenu, 'File') 41 | self.SetMenuBar(menubar) 42 | 43 | #input box for motif width, and label 44 | self.one_label = wx.StaticText(panel, -1, 'Motif width', (10,50)) 45 | self.motif_width = wx.TextCtrl(panel, -1, '10', (95, 50), (40,18)) 46 | #result textbox 47 | self.results = wx.TextCtrl(panel, -1, '', (150, 50), (200, 100), wx.TE_MULTILINE | wx.TE_AUTO_SCROLL | wx.HSCROLL) 48 | 49 | #run bbutton 50 | self.run_button = wx.Button(panel, -1, 'Run', (10, 80)) 51 | 52 | #labels 53 | self.fore_label = wx.StaticText(panel, -1, 'Select the foreground file', (10, 10)) 54 | self.back_label = wx.StaticText(panel, -1, 'Select the background file', (10, 30)) 55 | 56 | #binding the menus to functions 57 | self.Bind(wx.EVT_MENU, self.on_foreground, foreground_menu) 58 | self.Bind(wx.EVT_MENU, self.on_background, background_menu) 59 | self.Bind(wx.EVT_BUTTON, self.run_finder, self.run_button) 60 | 61 | 62 | def on_foreground(self, event): 63 | dialog = wx.FileDialog(self, style=wx.OPEN) 64 | if dialog.ShowModal() == wx.ID_OK: 65 | pymotGUI.fore_file = dialog.GetPath() 66 | self.fore_label.SetLabel(pymotGUI.fore_file) 67 | 68 | def on_background(self, event): 69 | dialog = wx.FileDialog(self, style=wx.OPEN) 70 | if dialog.ShowModal() == wx.ID_OK: 71 | pymotGUI.back_file = dialog.GetPath() 72 | self.back_label.SetLabel(pymotGUI.back_file) 73 | 74 | def run_finder(self, event): 75 | width = self.motif_width.GetValue() 76 | result = pymotif.calculate_motifs(pymotGUI.fore_file, pymotGUI.back_file, int(width)) 77 | for motif in result: 78 | self.results.WriteText(motif + '\n') 79 | 80 | 81 | #if __name__ == '__main__': 82 | app = pymot() 83 | frame = pymotGUI(parent=None, id = -1) 84 | #frame.CentreOnScreen() 85 | frame.Show() 86 | app.MainLoop() -------------------------------------------------------------------------------- 
# /scripts/motifs/pymotif.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
"""Count fixed-width motifs in a foreground and a background sequence file
and report the foreground motifs whose score is at or below 1e-4."""

import sys
from collections import defaultdict


def choose(n, k):
    """Return the binomial coefficient C(n, k).

    Returns 0 when k is negative or greater than n.  Only integer
    arithmetic is used, so the result is exact for arbitrarily large
    inputs.
    """
    if not 0 <= k <= n:
        return 0
    ntok = 1
    ktok = 1
    # C(n, k) == C(n, n - k), so multiply over the shorter of the two runs.
    # range() instead of xrange() keeps this working on Python 2 and 3.
    for t in range(1, min(k, n - k) + 1):
        ntok *= n
        ktok *= t
        n -= 1
    return ntok // ktok


def get_quorums(seqs, mlen):
    """Count every overlapping window of length ``mlen`` in ``seqs``.

    seqs -- iterable of sequence strings
    mlen -- window (motif) width; expected to be a positive integer

    Returns a ``defaultdict(int)`` mapping each window to the total number
    of times it occurs across all sequences; windows that never occur read
    as 0.  Sequences shorter than ``mlen`` contribute nothing.
    """
    # TODO (original author's note): add seq id_no to a set; use an explicit
    # counter to create seq_no.
    # NOTE(review): presumably the intent is to count sequences containing
    # the motif rather than total occurrences -- confirm before relying on
    # the name "quorum".
    quorum = defaultdict(int)
    for seq in seqs:
        # "+ 1" so the final window, seq[len(seq) - mlen:], is counted too;
        # without it the last motif of every sequence was silently dropped.
        for start in range(len(seq) - mlen + 1):
            quorum[seq[start:start + mlen]] += 1
    return quorum


def calculate_motifs(input_seqs, input_seqs2, width):
    """Score the foreground motifs of width ``width`` against a background.

    input_seqs  -- path of the foreground sequence file
    input_seqs2 -- path of the background sequence file
    width       -- motif width passed to get_quorums()

    Both files are parsed with ``fasta.read_seqs``.  Returns a sorted list
    of tab-separated strings ``motif<TAB>foreground count<TAB>background
    count<TAB>p`` for every foreground motif with ``0 < p <= 0.0001``.
    The two paths are echoed to stdout, as the original code did.
    """
    # Imported here rather than at module level so that choose() and
    # get_quorums() stay importable when the project-local fasta module is
    # not on the path.
    import fasta

    # Single-argument call form prints the same text on Python 2 and 3.
    print('%s %s' % (input_seqs, input_seqs2))

    # Context managers close the files; the original left both handles open.
    with open(input_seqs) as handle:
        fore_seqs = fasta.read_seqs(handle.readlines())
    with open(input_seqs2) as handle:
        back_seqs = fasta.read_seqs(handle.readlines())

    foreground = get_quorums(fore_seqs, width)
    background = get_quorums(back_seqs, width)

    n_fore = len(fore_seqs)
    N = n_fore + len(back_seqs)
    # Does not depend on the motif, so compute it once outside the loop.
    term3 = choose(N, n_fore)

    res_motifs = []
    for motif in foreground:
        fore_count = foreground[motif]
        back_count = background[motif]
        term1 = choose(back_count, fore_count)
        # NOTE(review): this resembles a hypergeometric term but draws
        # n_fore - 1 rather than n_fore - fore_count; formula kept exactly
        # as written -- confirm against the intended statistic.
        term2 = choose(N - back_count, n_fore - 1)
        p = (float(term1) * float(term2)) / term3
        if 0 < p <= 0.0001:
            res_motifs.append(
                '\t'.join([motif, str(fore_count), str(back_count), str(p)]))

    res_motifs.sort()
    return res_motifs

# --------------------------------------------------------------------------------
# /scripts/original_scripts/3.0/AY162388.seq:
# --------------------------------------------------------------------------------
# 1 | GTGACTTTGTTCAACGGCCGCGGTATCCTAACCGTGCGAAGGTAGCGTAATCACTTGTTC
# 2 | TTTAAATAAGGACTAGTATGAATGGCATCACGAGGGCTTTACTGTCTCCTTTTTCTAATC
# 3 | AGTGAAACTAATCTCCCGTGAAGAAGCGGGAATTAACTTATAAGACGAGAAGACCCTATG
# 4 | GAGCTTTAAACCAAATAACATTTGCTATTTTACAACATTCAGATATCTAATCTTTATAGC
# 5 |
ACTATGATTACAAGTTTTAGGTTGGGGTGACCGCGGAGTAAAAATTAACCTCCACATTGA 6 | AGGAATTTCTAAGCAAAAAGCTACAACTTTAAGCATCAACAAATTGACACTTATTGACCC 7 | AATATTTTGATCAACGAACCATTACCCTAGGGATAACAGCGCAATCCATTATGAGAGCTA 8 | TTATCGACAAGTGGGCTTACGACCTCGATGTTGGATCAGGG 9 | -------------------------------------------------------------------------------- /scripts/original_scripts/3.0/code_01.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | '''first script featured on the website 4 | like a hello world''' 5 | 6 | dna = "ACGTACGTACGTACGTACGTACGT" 7 | print(dna) 8 | -------------------------------------------------------------------------------- /scripts/original_scripts/3.0/code_02.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | '''second script available, shows a simple way to concatenate 4 | two DNA sequences, strings''' 5 | 6 | dna = "ACGTACGTACGTACGTACGTACGT" 7 | dna2 = "TCGATCGATCGATCGATCGA" 8 | print("First and Second sequences") 9 | print(dna, dna2) 10 | dna3 = dna + dna2 11 | print("Concatenated sequence") 12 | print(dna3) 13 | -------------------------------------------------------------------------------- /scripts/original_scripts/3.0/code_03.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | '''third script, shows how to import a module and use the 4 | regex module to transcribe DNA to RNA 5 | ''' 6 | 7 | #import regular expression module 8 | import re 9 | 10 | #setting the DNA string 11 | dna = 'ACGTTGCAACGTTGCAACGTTGCA' 12 | 13 | #assigning a new regex and compiling it 14 | #to find all Ts 15 | regexp = re.compile('T') 16 | 17 | #create a new string tha will receive 18 | #the regex result with Us replacing Ts 19 | rna = regexp.sub('U', dna) 20 | 21 | print(rna) 22 | -------------------------------------------------------------------------------- /scripts/original_scripts/3.0/code_04.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | '''simple script to open a file and print all lines''' 4 | 5 | #assigning a filename to a variable 6 | dnafile = "AY162388.seq" 7 | 8 | #opening the file 9 | file = open(dnafile, 'r') 10 | 11 | #printing each line of the file 12 | for line in file: 13 | print(line, end='') 14 | -------------------------------------------------------------------------------- /scripts/original_scripts/3.0/code_05.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | simple script to find motifs on DNA sequences using regex 5 | the script is interactive 6 | ''' 7 | 8 | # we use the RegEx module 9 | import re 10 | import string 11 | 12 | #still keep the file fixed 13 | dnafile = "AY162388.seq" 14 | 15 | #opening the file, reading the sequence and storing in a list 16 | seqlist = open(dnafile, 'r').readlines() 17 | 18 | #let's join the the lines in a temporary string 19 | temp = ''.join(seqlist) 20 | 21 | #assigning our sequence, with no carriage returns to our 22 | #final variable/object 23 | sequence = temp.replace('\n', '') 24 | 25 | #we start to deal with user input 26 | #first we use a boolean variable to check for valid input 27 | inputfromuser = True 28 | 29 | #while 
loop: while there is an motif larger than 0 30 | #the loop continues 31 | while inputfromuser: 32 | #raw_input received the user input as string 33 | inmotif = input('Enter motif to search: ') 34 | #now we check for the size of the input 35 | if len(inmotif) >= 1: 36 | #we compile a regex with the input given 37 | motif = re.compile('%s' % inmotif) 38 | #looking to see if the entered motif is in the sequence 39 | if re.search(motif, sequence): 40 | print('Yep, I found it') 41 | else: 42 | print('Sorry, try another one') 43 | else: 44 | print('Done, thanks for using motif_search') 45 | inputfromuser = False 46 | -------------------------------------------------------------------------------- /scripts/original_scripts/3.0/code_05.py.bak: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | simple script to find motifs on DNA sequences using regex 5 | the script is interactive 6 | ''' 7 | 8 | # we use the RegEx module 9 | import re 10 | import string 11 | 12 | #still keep the file fixed 13 | dnafile = "AY162388.seq" 14 | 15 | #opening the file, reading the sequence and storing in a list 16 | seqlist = open(dnafile, 'r').readlines() 17 | 18 | #let's join the the lines in a temporary string 19 | temp = ''.join(seqlist) 20 | 21 | #assigning our sequence, with no carriage returns to our 22 | #final variable/object 23 | sequence = temp.replace('\n', '') 24 | 25 | #we start to deal with user input 26 | #first we use a boolean variable to check for valid input 27 | inputfromuser = True 28 | 29 | #while loop: while there is an motif larger than 0 30 | #the loop continues 31 | while inputfromuser: 32 | #raw_input received the user input as string 33 | inmotif = raw_input('Enter motif to search: ') 34 | #now we check for the size of the input 35 | if len(inmotif) >= 1: 36 | #we compile a regex with the input given 37 | motif = re.compile('%s' % inmotif) 38 | #looking to see if the entered motif is in the 
sequence 39 | if re.search(motif, sequence): 40 | print 'Yep, I found it' 41 | else: 42 | print 'Sorry, try another one' 43 | else: 44 | print 'Done, thanks for using motif_search' 45 | inputfromuser = False 46 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_01.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | '''first script featured on the website 4 | like a hello world''' 5 | 6 | dna = "ACGTACGTACGTACGTACGTACGT" 7 | print dna 8 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_02.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | '''second script available, shows a simple way to concatenate 4 | two DNA sequences, strings''' 5 | 6 | dna = "ACGTACGTACGTACGTACGTACGT" 7 | dna2 = "TCGATCGATCGATCGATCGA" 8 | print "First and Second sequences" 9 | print dna, dna2 10 | dna3 = dna + dna2 11 | print "Concatenated sequence" 12 | print dna3 13 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_03.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | '''third script, shows how to import a module and use the 4 | regex module to transcribe DNA to RNA 5 | ''' 6 | 7 | #import regular expression module 8 | import re 9 | 10 | #setting the DNA string 11 | dna = 'ACGTTGCAACGTTGCAACGTTGCA' 12 | 13 | #assigning a new regex and compiling it 14 | #to find all Ts 15 | regexp = re.compile('T') 16 | 17 | #create a new string tha will receive 18 | #the regex result with Us replacing Ts 19 | rna = regexp.sub('U', dna) 20 | 21 | print rna 22 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_04.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | '''simple script to open a file and print all lines''' 4 | 5 | #assigning a filename to a variable 6 | dnafile = "AY162388.seq" 7 | 8 | #opening the file 9 | file = open(dnafile, 'r') 10 | 11 | #printing each line of the file 12 | for line in file: 13 | print line, -------------------------------------------------------------------------------- /scripts/original_scripts/code_05.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | simple script to find motifs on DNA sequences using regex 5 | the script is interactive 6 | ''' 7 | 8 | # we use the RegEx module 9 | import re 10 | import string 11 | 12 | #still keep the file fixed 13 | dnafile = "AY162388.seq" 14 | 15 | #opening the file, reading the sequence and storing in a list 16 | seqlist = open(dnafile, 'r').readlines() 17 | 18 | #let's join the the lines in a temporary string 19 | temp = ''.join(seqlist) 20 | 21 | #assigning our sequence, with no carriage returns to our 22 | #final variable/object 23 | sequence = temp.replace('\n', '') 24 | 25 | #we start to deal with user input 26 | #first we use a boolean variable to check for valid input 27 | inputfromuser = True 28 | 29 | #while loop: while there is an 
motif larger than 0 30 | #the loop continues 31 | while inputfromuser: 32 | #raw_input received the user input as string 33 | inmotif = raw_input('Enter motif to search: ') 34 | #now we check for the size of the input 35 | if len(inmotif) >= 1: 36 | #we compile a regex with the input given 37 | motif = re.compile('%s' % inmotif) 38 | #looking to see if the entered motif is in the sequence 39 | if re.search(motif, sequence): 40 | print 'Yep, I found it' 41 | else: 42 | print 'Sorry, try another one' 43 | else: 44 | print 'Done, thanks for using motif_search' 45 | inputfromuser = False 46 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_06.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | another script to find motifs on DNA sequences with more features than 5 | code_05.py 6 | ''' 7 | 8 | # we use the RegEx module 9 | import re 10 | import string 11 | #we also import the sys module 12 | import sys 13 | 14 | #set the variable to control the loop 15 | fileinput = True 16 | while fileinput == True: 17 | #ask user for the input 18 | filename = raw_input('Enter file name:') 19 | if len(filename) > 0: 20 | #we try to open the file 21 | try: 22 | dnafile = open(filename, 'r') 23 | #success! 
we finish the loop and move to the next input 24 | fileinput = False 25 | except: 26 | #no dice, file does not exist 27 | #keep the loop on and ask again 28 | print 'File does not exist' 29 | else: 30 | # fileinput = False 31 | sys.exit() 32 | 33 | 34 | #opening the file, reading the sequence and storing in a list 35 | seqlist = open(filename, 'r').readlines() 36 | 37 | #let's join the the lines in a temporary string 38 | temp = ''.join(seqlist) 39 | 40 | #assigning our sequence, with no carriage returns to our 41 | #final variable/object 42 | sequence = temp.replace('\n', '') 43 | 44 | #we start to deal with user input 45 | #first we use a boolean variable to check for valid input 46 | inputfromuser = True 47 | 48 | #while loop: while there is an motif larger than 0 49 | #the loop continues 50 | while inputfromuser: 51 | #raw_input received the user input as string 52 | inmotif = raw_input('Enter motif to search: ') 53 | #now we check for the size of the input 54 | if len(inmotif) >= 1: 55 | #we compile a regex with the input given 56 | motif = re.compile('%s' % inmotif) 57 | #looking to see if the entered motif is in the sequence 58 | if re.search(motif, sequence): 59 | print 'Yep, I found it' 60 | else: 61 | print 'Sorry, try another one' 62 | else: 63 | print 'Done, thanks for using motif_search' 64 | inputfromuser = False 65 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_07.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | counting the nucleotides in a sequence, iterating 5 | through lists 6 | ''' 7 | 8 | #let's keep the file fixed for now 9 | dnafile = "AY162388.seq" 10 | 11 | #opening the file, reading the sequence and storing in a list 12 | file = open(dnafile, 'r') 13 | 14 | #initialize a string to receive the data 15 | sequence = '' 16 | for line in file: 17 | sequence += line.strip() #notice the strip, to remove 
\n 18 | 19 | #"exploding" the sequence in a list 20 | seqlist = list(sequence) 21 | 22 | #initializing integers to store the counts 23 | total_a = 0 24 | total_c = 0 25 | total_g = 0 26 | total_t = 0 27 | 28 | #checking each item in the list and updating counts 29 | for base in seqlist: 30 | if base == 'A': 31 | total_a += 1 32 | elif base == 'C': 33 | total_c += 1 34 | elif base == 'G': 35 | total_g += 1 36 | elif base == 'T': 37 | total_t += 1 38 | 39 | print str(total_a) + ' As found' 40 | print str(total_c) + ' Cs found' 41 | print str(total_g) + ' Gs found' 42 | print str(total_t) + ' Ts found' 43 | 44 | 45 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_08.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | '''script that counts the number of bases in a DNA sequence 4 | showing the string.count() method''' 5 | 6 | #still keep the file fixed 7 | dnafile = "AY162388.seq" 8 | 9 | #opening the file, reading the sequence and storing in a list 10 | seqlist = open(dnafile, 'r').readlines() 11 | 12 | #let's join the the lines in a temporary string 13 | temp = ''.join(seqlist) 14 | 15 | #counting 16 | total_a = temp.count('A') 17 | total_c = temp.count('C') 18 | total_g = temp.count('G') 19 | total_t = temp.count('T') 20 | 21 | #printing results 22 | print str(total_a) + ' As found' 23 | print str(total_c) + ' Cs found' 24 | print str(total_g) + ' Gs found' 25 | print str(total_t) + ' Ts found' 26 | 27 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_09.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | script that counts the number of each nucleotide in a sequence using 5 | user input and saving to a file. 
6 | ''' 7 | 8 | import sys 9 | import re 10 | 11 | 12 | fileentered = True #flag that determines if a filename has been entered 13 | 14 | while fileentered == True: 15 | #ask the user to input a filename 16 | filename = raw_input('Please enter a file to check: ') 17 | #if a filename was entered, go ... 18 | if len(filename) >= 1: 19 | try: 20 | #open the file 21 | seqlist = open(filename, 'r').readlines() 22 | #sequence is read as a list, convert to string 23 | sequence = ''.join(seqlist) 24 | #remove carriage returns 25 | sequence = sequence.replace('\n', '') 26 | #counting 27 | total_a = sequence.count('A') 28 | total_c = sequence.count('C') 29 | total_g = sequence.count('G') 30 | total_t = sequence.count('T') 31 | #create a regex object with non-nucleotide letters to check for "errors" 32 | otherletter = re.compile('[BDEFHIJKLMNOPQRSUVXZ]+') 33 | #find possible non-nucleotides 34 | extra = re.findall(otherletter, sequence) 35 | #open an output filename to output counts 36 | output = open(filename + '.count', 'w') 37 | #writing the output 38 | output.write('Count report for file ' + filename + '\n') 39 | output.write('A = ' + str(total_a) + '\n') 40 | output.write('C = ' + str(total_c) + '\n') 41 | output.write('G = ' + str(total_g) + '\n') 42 | output.write('T = ' + str(total_t) + '\n') 43 | #if there are non-nucleotides in the sequence, report them 44 | if len(extra) > 0: 45 | output.write('Also were found ' + str(len(extra)) + ' errors\n') 46 | for i in extra: 47 | output.write(i + ' ') 48 | else: 49 | output.write('No error found') 50 | print 'Result file saved on ' + filename + '.count' 51 | except: 52 | print 'File not found. Please try again.' 53 | else: 54 | #if no filename entered, exit 55 | fileentered = False 56 | sys.exit() 57 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_10.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | ''' 4 | Python functions 5 | ''' 6 | 7 | def add_tail(seq): 8 | '''function that adds a poly-T tail to sequences''' 9 | result = seq + 'TTTTTTTTTTTTTTTTTTTTT' 10 | return result 11 | 12 | #opening the file 13 | dnafile = 'AY162388.seq' 14 | file = open(dnafile, 'r') 15 | 16 | #reading the sequence from the file 17 | sequence = '' 18 | for line in file: 19 | sequence += line.strip() 20 | 21 | #printing result 22 | print sequence 23 | #calling the function to add the tail 24 | sequence = add_tail(sequence) 25 | #printing new sequence 26 | print sequence 27 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_11.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | making a function to count nucleotides 5 | ''' 6 | 7 | import sys 8 | 9 | def count_nucleotide_types(seq): 10 | '''counting nucleotides and returning a list with counts''' 11 | result = [] 12 | total_a = seq.count('A') 13 | total_c = seq.count('C') 14 | total_g = seq.count('G') 15 | total_t = seq.count('T') 16 | 17 | result.append(total_a) 18 | result.append(total_c) 19 | result.append(total_g) 20 | result.append(total_t) 21 | 22 | return result 23 | 24 | #opening the file 25 | sequencefile = open(sys.argv[1], 'r').readlines() 26 | #joining a sequence as a list into a string 27 | sequence = ''.join(sequencefile) 28 | #replacing carriage returns 29 | sequence = sequence.replace('\n', '') 30 | #counting the nucleotides 31 | values = count_nucleotide_types(sequence) 32 | #printing the results 33 | print values 34 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_12.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | an extremely simple dice game 5 | ''' 6 | 7 | #we need the random module 8 | import random 9 | 
import string 10 | 11 | #generating two dices, between 1 and 6 for the human player 12 | dice1 = random.randint(1, 6) 13 | dice2 = random.randint(1, 6) 14 | 15 | #generating two dices, between 1 and 6 for the computer player 16 | computerdice1 = random.randint(1, 6) 17 | computerdice2 = random.randint(1, 6) 18 | 19 | #summing up both dices for each player 20 | mine = dice1 + dice2 21 | his_hers = computerdice1 + computerdice2 22 | 23 | #printing the values 24 | print 'mine = ' + str(mine) + ' vs. computer = ' + str(his_hers) 25 | 26 | #chdking the results and proclaiming the winner 27 | if mine > his_hers: 28 | print "I won" 29 | elif mine < his_hers: 30 | print "Computer won" 31 | else: 32 | print "Tie. Try again" 33 | 34 | 35 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_13.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | very simple script to generate random DNA sequences 5 | ''' 6 | 7 | #random module is needed 8 | import random 9 | import sys 10 | 11 | #sequence length is a parameter 12 | length = int(sys.argv[1]) 13 | 14 | #template DNA is a list with ACGT repeats 15 | dnaseq = list('ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT') 16 | 17 | #print the template 18 | print dnaseq 19 | 20 | result = '' 21 | for i in range(length): 22 | #for the simulated sequence we use random.choice 23 | #that randonly selects items of a list 24 | result += random.choice(dnaseq) 25 | 26 | print result -------------------------------------------------------------------------------- /scripts/original_scripts/code_14.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | a more elaborated script to generate random DNA sequences 5 | ''' 6 | 7 | import random 8 | import sys 9 | 10 | def simulate_sequence(length): 11 | '''function the generates the 
simulations''' 12 | #list with nucleotides 13 | dna = ['A', 'C', 'G', 'T'] 14 | #initializing the sequence 15 | sequence = '' 16 | #iterates over the input sequence length ... 17 | for i in range(length): 18 | #and chooses randomly the nucletides 19 | sequence += random.choice(dna) 20 | #returns simulated sequence 21 | return sequence 22 | 23 | #first parameter is the number of sequences to generate 24 | setsize = int(sys.argv[1]) 25 | #minimum and maximum sequence lengths 26 | minlength = int(sys.argv[2]) 27 | maxlength = int(sys.argv[3]) 28 | 29 | #initializes a list to store the sequence set 30 | sequenceset = [] 31 | for i in range(setsize): 32 | #generate a random integer between min and max seq lenght 33 | rlength = random.randint(minlength, maxlength) 34 | #appending to the sequence set and calling simulated sequence 35 | #function 36 | sequenceset.append(simulate_sequence(rlength)) 37 | 38 | #printing output 39 | for sequence in sequenceset: 40 | print sequence 41 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_15.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | a even more elaborated DNA sequence simulation script with 5 | sequence identity calculation (not overall, just neighbours) 6 | ''' 7 | 8 | import random 9 | import sys 10 | 11 | def simulate_sequence(length): 12 | '''function that simulates the sequences''' 13 | #nucleotides list 14 | dna = ['A', 'C', 'G', 'T'] 15 | sequence = '' 16 | #randomly picking from the nucleotide list 17 | for i in range(length): 18 | sequence += random.choice(dna) 19 | return sequence 20 | 21 | def nucleotide_percentage(sequence): 22 | #counting the nucleotides 23 | print str(sequence.count('A')) + ' As ', 24 | print str(sequence.count('C')) + ' Cs ', 25 | print str(sequence.count('G')) + ' Gs ', 26 | print str(sequence.count('T')) + ' Ts ' 27 | 28 | def sequence_identity(seqset): 
29 | '''function that calculates sequence identies''' 30 | iden = [] 31 | count = 0.0 32 | #iterates through the sequences in the set -1 33 | #and calculates sequence identities 34 | for x in range(len(seqset) - 1): 35 | print str(x), str(x+1) 36 | for n in range(len(seqset[x])): 37 | #iterates over all nucleotides and checks for identical ones 38 | if seqset[x][n] == seqset[x + 1][n]: 39 | count += 1 40 | iden.append(count / len(seqset[x])) 41 | count = 0.0 42 | return iden 43 | 44 | #input parameters 45 | setsize = int(sys.argv[1]) 46 | minlength = int(sys.argv[2]) 47 | maxlength = int(sys.argv[3]) 48 | 49 | #generates simulated sequence sets 50 | sequenceset = [] 51 | for i in range(setsize): 52 | rlength = random.randint(minlength, maxlength) 53 | sequenceset.append(simulate_sequence(rlength)) 54 | 55 | #calculate sequence identities 56 | identity = sequence_identity(sequenceset) 57 | 58 | #prints the results 59 | for i in range(len(sequenceset)): 60 | print sequenceset[i] 61 | if i < len(sequenceset) - 1: 62 | print 'sequence identity to next sequence : ' + str(identity[i]) 63 | nucleotide_percentage(sequenceset[i]) 64 | print 65 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_16.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | ''' 4 | extremely simple script to DNA transcription 5 | ''' 6 | 7 | 8 | dna = 'ACGTTGCAACGTTGCAACGTTGCA' 9 | #string buil-in replace method 10 | rna = dna.replace('T', 'U') 11 | print rna 12 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_17.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | ''' 4 | reading a sequence file and printing first and last lines 5 | ''' 6 | 7 | dnafile = "AY162388.seq" 8 | file = open(dnafile, 'r').readlines() 9 | print 'I want the first line' 10 | print file[0] 11 | print 'now the last line' 12 | #print file[len(file)-1] 13 | print file[-1] 14 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_18.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | translating DNA into proteins 5 | ''' 6 | 7 | #import our homemade module that has the DNA 8 | #translating function 9 | import dnatranslate 10 | 11 | #OK, we are using the same DNA file 12 | dnafile = open("AY162388.seq", 'r').readlines() 13 | 14 | #opening the file and stripping and joining the lines 15 | sequence = '' 16 | for line in dnafile: 17 | sequence += line.strip() 18 | 19 | #call the function in our module and translating the sequence 20 | protein = dnatranslate.translate_dna(sequence) 21 | 22 | #output, simple, we could make it better 23 | print sequence, len(sequence) 24 | print 25 | print protein, len(protein) 26 | 27 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_18a.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import dnatranslate 4 | 5 | dnafile = open("AY162388.seq", 'r').readlines() 6 | 7 | sequence = '' 8 | for line in dnafile: 9 | sequence += line.strip() 10 | 11 | 12 | protein = dnatranslate.translate_dna(sequence) 13 | 14 | print sequence, len(sequence) 15 | print 16 | print protein, len(protein) 17 | 18 | -------------------------------------------------------------------------------- /scripts/original_scripts/code_19.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | using the 
'''
Translate every sequence of a FASTA file and print the proteins wrapped
at 60 characters per line.

usage: code_21.py <fasta file>
'''

import sys

import dnatranslate
import fasta

if len(sys.argv) < 2:
    sys.exit('usage: %s <fasta file>' % sys.argv[0])

# Read the whole file and hand the lines to the fasta reading function;
# the with-block makes sure the file is closed afterwards.
with open(sys.argv[1], 'r') as handle:
    dna = fasta.read_fasta(handle.readlines())

for item in dna:
    # translate the DNA
    protein = dnatranslate.translate_dna(item.sequence)
    print(item.name)
    # format and print the protein
    print(fasta.format_output(protein, 60))
'''
Look up a restriction enzyme in a REBASE "bionet" list and report every
occurrence of its recognition site in the sequences of a FASTA file.

usage: code_22.py <enzyme name> <fasta file>
'''

import re
import sys

# IUPAC nucleotide codes expressed as regular-expression character classes
_IUPAC = {
    'A': '[A]', 'C': '[C]', 'G': '[G]', 'T': '[T]',
    'M': '[AC]', 'R': '[AG]', 'W': '[AT]', 'S': '[CG]',
    'Y': '[CT]', 'K': '[GT]', 'V': '[ACG]', 'H': '[ACT]',
    'D': '[AGT]', 'B': '[CGT]', 'X': '[ACGT]', 'N': '[ACGT]',
}


def read_enzymes(file):
    '''
    Read the enzyme list and return a dict {enzyme name: recognition site}.

    file -- any iterable of lines (an open file or a list of strings).
    Parsing starts after the line containing 'Rich Roberts'. Every later
    line longer than 10 characters contributes its first field as the name
    and its last field, with the '^' cut marker removed, as the site.
    '''
    resenz = {}
    start = False
    # One explicit iterator lets us skip a line inside the loop, and works
    # for lists as well as file objects.
    lines = iter(file)
    for line in lines:
        if 'Rich Roberts' in line:
            start = True
            # jump over the marker line; '' if the marker was the last line
            line = next(lines, '')
        if start and len(line) > 10:
            fields = line.split()
            if fields:  # a long whitespace-only line has no fields
                resenz[fields[0]] = fields[-1].replace('^', '')
    return resenz


def check_enzyme(input, set):
    '''Return True if the enzyme name `input` is a key of `set`.'''
    return input in set


def find_sites(input, set, sequence):
    '''
    Find all non-overlapping matches of an enzyme's site in `sequence`.

    input    -- enzyme name, must be a key of `set` (KeyError otherwise)
    set      -- dict {enzyme name: IUPAC recognition site}
    sequence -- string to search

    Returns (sites, positions): the matched substrings and their 0-based
    start offsets, in the same order.
    '''
    # translate the site from IUPAC codes to a regular expression
    pattern = re.compile(''.join(_IUPAC[code] for code in set[input]))
    sites = []
    positions = []
    # a single pass yields both the matched text and its position
    for match in pattern.finditer(sequence):
        sites.append(match.group())
        positions.append(match.start())
    return sites, positions


def main(argv):
    '''Run the search for the enzyme argv[1] on the FASTA file argv[2].'''
    # project-local module, only needed when running as a script
    import fasta

    enzyme = argv[1]
    with open('bionet.709', 'r') as handle:
        enzymeset = read_enzymes(handle)

    # if the name is not found, we bail out
    if not check_enzyme(enzyme, enzymeset):
        print('Enzyme name not found, please try again')
        return

    print('Name found')
    with open(argv[2], 'r') as handle:
        sequences = fasta.read_fasta(handle.readlines())
    for item in sequences:
        sites, positions = find_sites(enzyme, enzymeset, item.sequence)
        print(item.name[:20] + '...')
        for site, position in zip(sites, positions):
            print('%s -> %s' % (site, position))


if __name__ == '__main__':
    main(sys.argv)
'''
Print the nucleotide sequence stored after the ORIGIN line of a GenBank
file, with the position numbers and blanks removed.

usage: code_24.py <genbank file>
'''

import sys


def extract_sequence(lines):
    '''
    Return the sequence text that follows the first 'ORIGIN' line.

    lines -- iterable of lines of a GenBank file.
    Leading digits and blanks are stripped from every sequence line and
    inner blanks are removed. Line ends are kept, so the result still has
    one line per input line. Lines starting with '/' (the '//' terminator)
    are ignored.
    '''
    pieces = []
    issequence = False
    for line in lines:
        if issequence and not line.startswith('/'):
            pieces.append(line.lstrip('0123456789 ').replace(' ', ''))
        elif 'ORIGIN' in line:
            issequence = True
    return ''.join(pieces)


if __name__ == '__main__':
    # the with-block closes the file once the sequence has been collected
    with open(sys.argv[1], 'r') as gbfile:
        print(extract_sequence(gbfile))
class Fasta:
    '''One FASTA record: the header line (name) and its sequence.'''

    def __init__(self, name, sequence):
        self.name = name
        self.sequence = sequence


def read_fasta(file):
    '''
    Parse FASTA formatted lines into a list of Fasta objects.

    file -- iterable of lines (an open file or the result of readlines()).
    The name of each record is its header line including the leading '>'
    but without the line end. The sequence is the concatenation of the
    following lines. Lines before the first header are ignored, and an
    input without any header yields an empty list.
    '''
    items = []
    name = None
    chunks = []
    for line in file:
        if line.startswith('>'):
            if name is not None:
                items.append(Fasta(name, ''.join(chunks)))
            # Strip only the line end: slicing off the last character would
            # lose a base when the final line has no trailing newline.
            name = line.rstrip('\r\n')
            chunks = []
        elif name is not None:
            chunks.append(line.rstrip('\r\n'))

    if name is not None:
        items.append(Fasta(name, ''.join(chunks)))
    return items
def read_seqs(file):
    '''
    Return the sequences of FASTA formatted lines as a list of strings.

    file -- iterable of lines (an open file or the result of readlines()).
    Header lines (starting with '>') only separate the records; their text
    is discarded. Sequence lines found before the first header are added
    to the first record. An input with no lines at all yields an empty list.
    '''
    items = []
    chunks = []
    headers = 0
    saw_line = False
    for line in file:
        saw_line = True
        if line.startswith('>'):
            # a new header closes the previous record, if there was one
            if headers >= 1:
                items.append(''.join(chunks))
                chunks = []
            headers += 1
        else:
            # Strip only the line end: slicing off the last character would
            # lose a base when the final line has no trailing newline.
            chunks.append(line.rstrip('\r\n'))

    if saw_line:
        items.append(''.join(chunks))
    return items
'''
Print the lines of a single-sequence FASTA file that fall inside a range.

usage: code_31.py <fasta file> <start> <end>
'''

import sys


def extract_segment(lines, start, end):
    '''
    Collect the sequence lines whose running size lies within [start, end].

    lines      -- iterable of lines of a FASTA file
    start, end -- inclusive bounds, compared with the number of characters
                  read so far. Line ends are counted, exactly as they are
                  stored in the file.

    Returns (name, segment): the last header line seen ('' if there is
    none) and the selected lines concatenated with their line ends. Header
    lines are never part of the segment.
    '''
    name = ''
    size = 0
    kept = []
    for line in lines:
        if line.startswith('>'):
            name = line
            continue
        size += len(line)
        if start <= size <= end:
            kept.append(line)
    return name, ''.join(kept)


if __name__ == '__main__':
    with open(sys.argv[1], 'r') as handle:
        name, segment = extract_segment(handle, int(sys.argv[2]),
                                        int(sys.argv[3]))
    # the header keeps its own line end, so the segment starts on a new line
    sys.stdout.write(name + segment + '\n')
def parse_entry(gene_data):
    '''
    Build a CDSinfo object from the annotation text of one gene.

    gene_data -- string holding all lines of the entry, separated by '\n'.
    The CDS line provides start, end and the complement flag, the 'GI:'
    line the gi identifier, and the /product and protein_id lines the
    tab-separated description.
    '''
    is_complement = False
    first, last = 0, 0
    gi = ''
    label = ''
    for row in gene_data.split('\n'):
        if ' CDS ' in row:
            location = row.split()[1]
            if 'complement' in location:
                # drop the complement( ... ) wrapper around the positions
                is_complement = True
                location = location.replace('complement(', '').replace(')', '')
            bounds = location.split('..')
            first = bounds[0]
            last = bounds[1]
        elif 'GI:' in row:
            # text after 'GI:' without the last character of the line
            gi = 'gi' + row[row.find('GI:') + 3:-1]
        elif '/product' in row:
            # text after '=' and the character following it, minus the
            # last character of the line
            label = row[row.find('=') + 2:-1]
        elif 'protein_id' in row:
            label += '\t' + row[row.find('=') + 2:-1]

    return CDSinfo(gi, label, first, last, is_complement)
'''
Script that extracts gene sequences from a GenBank file. It reads every
gene/CDS annotation, builds a list of start and end positions and writes
one FASTA file per gene, named after the GI id. For genes annotated on the
complementary strand the reverse complement of the region is written.

Only input is a GenBank file.
'''

import sys


class CDSinfo:
    '''
    CDSinfo class to store all the information from a CDS
    '''
    def __init__(self, gi_id, id, start, end, complement):
        self.gi_id = gi_id
        self.id = id
        self.start = start
        self.end = end
        self.complement = complement


def parse_entry(gene_data):
    '''
    Parse the annotation lines of one gene and return a CDSinfo object.

    gene_data -- string with all lines of the entry.
    start and end are returned as the strings found in the CDS location.
    They stay at the integer 0 when the entry has no CDS line.
    '''
    start, end = 0, 0
    gi_id = ''
    id = ''
    complement = False
    for line in gene_data.splitlines():
        # searches for regions annotated as CDS
        if ' CDS ' in line:
            location = line.split()[1]
            # checks for complement sequence, if true remove extra characters
            if 'complement' in location:
                complement = True
                location = location.replace('complement(', '').replace(')', '')
            bounds = location.split('..')
            start = bounds[0]
            end = bounds[1]
        # checks for GI IDs
        elif 'GI:' in line:
            gi_id = 'gi' + line[line.find('GI:') + 3:-1]
        # get the gene name/function (text between the quotes)
        elif '/product' in line:
            id = line[line.find('=') + 2:-1]
        # and adds the protein id
        elif 'protein_id' in line:
            id += '\t' + line[line.find('=') + 2:-1]

    return CDSinfo(gi_id, id, start, end, complement)


def read_genbank(lines):
    '''
    Split a GenBank file into gene entries and the nucleotide sequence.

    lines -- iterable of lines of the file.
    Returns (genes, sequence): a list of CDSinfo objects, one per ' gene '
    annotation, and the upper-cased sequence found after the ORIGIN line.
    '''
    genes = []
    chunks = []
    entry = ''
    seen_gene = False
    is_seq = False
    for line in lines:
        if ' gene ' in line:
            # A new gene closes the previous entry. The text collected
            # before the very first gene stays part of that first entry.
            if seen_gene:
                genes.append(parse_entry(entry))
                entry = ''
            seen_gene = True
            entry += line
        elif 'ORIGIN' in line:
            # found sequence start, set the flag and parse the last entry
            is_seq = True
            genes.append(parse_entry(entry))
        elif is_seq:
            # the first field of a sequence line is the position number
            chunks.append(''.join(line.split()[1:]).upper())
        else:
            entry += line
    return genes, ''.join(chunks)


def main(argv):
    '''Write one <gi id>.DNA.fasta file per gene of the file argv[1].'''
    # project-local module, only needed when running as a script
    import fasta

    with open(argv[1]) as gbfile:
        genes, str_seq = read_genbank(gbfile)

    for gene in genes:
        if len(gene.gi_id) <= 2:
            continue
        print('%s %s %s' % (gene.id, gene.start, gene.end))
        with open(gene.gi_id + '.DNA.fasta', 'w') as output:
            output.write('>' + gene.gi_id + '\t' + gene.id + '\n')
            # join(...) locations are not supported: only the header is
            # written for them
            if 'join' in str(gene.start):
                continue
            region = str_seq[int(gene.start) - 1:int(gene.end)]
            if gene.complement:
                # the fasta module offers reverse_complement for this
                region = fasta.reverse_complement(region)
                output.write(fasta.format_output(region, 80) + '\n')
            else:
                output.write(fasta.format_output(region, 80))


if __name__ == '__main__':
    main(sys.argv)
'''
Input is a GenBank file. The script searches for gene annotations, collects
the lines of every gene and parses them in order to extract the protein
sequences. Ribosomal genes and other non-coding genes are not extracted.
Output is one FASTA formatted file per protein, named after the GI id.
'''

import sys


class Protein:
    '''
    class that stores protein information
    '''
    def __init__(self, gi, id, sequence):
        self.gi = gi
        self.id = id
        self.sequence = sequence


def _read_translation(lines, first):
    '''Return the /translation value that starts on lines[first].'''
    line = lines[first]
    parts = [line[line.find('=') + 2:].strip()]
    # The value ends at its closing quote. Without this check the lines of
    # the following features would be appended to the protein.
    if not parts[0].endswith('"'):
        for extra in lines[first + 1:]:
            if 'sig_peptide' in extra:
                break
            parts.append(extra.strip())
            if parts[-1].endswith('"'):
                break
    return ''.join(parts).replace('"', '')


def parse_entry(gene_data):
    '''
    Parse the annotation lines of one gene and return a Protein object.

    gene_data -- string with all lines of the entry.
    The id is '<product>\\t<protein_id>', the gi is 'gi' followed by the
    number of the 'GI:' cross reference, and the sequence is the
    /translation value without quotes or line breaks.
    '''
    prot_id = ''
    sequence = ''
    gi_id = ''
    lines = gene_data.splitlines()
    # enumerate gives the position of the current line directly, which
    # list.index would get wrong if an identical line appeared earlier
    for number, line in enumerate(lines):
        if '/product' in line:
            prot_id = line[line.find('=') + 2:-1]
        elif 'protein_id' in line:
            prot_id += '\t' + line[line.find('=') + 2:-1]
        elif 'GI:' in line:
            gi_id = 'gi' + line[line.find('GI:') + 3:-1]
        elif '/translation' in line:
            sequence = _read_translation(lines, number)

    return Protein(gi_id, prot_id, sequence)


def main(argv):
    '''Write one <gi id>.fasta file per protein of the file argv[1].'''
    # project-local module, only needed when running as a script
    import fasta

    proteins = []
    entry = ''
    seen_gene = False
    with open(argv[1]) as gbfile:
        for line in gbfile:
            if ' gene ' in line:
                if seen_gene:
                    # parses the previous entry and appends it to the list
                    proteins.append(parse_entry(entry))
                    entry = ''
                seen_gene = True
                entry += line
            elif 'ORIGIN' in line:
                # found the DNA sequence, we can stop now
                break
            else:
                entry += line

    # parses the last entry after leaving the loop
    proteins.append(parse_entry(entry))

    for protein in proteins:
        if len(protein.gi) > 2:
            print('%s %s' % (protein.gi, protein.id))
            with open(protein.gi + '.fasta', 'w') as output:
                output.write('>' + protein.gi + '\t' + protein.id + '\n')
                output.write(fasta.format_output(protein.sequence, 80))
        # NOTE(review): the nesting of this print could not be recovered
        # from the flattened source; it is kept at loop level -- confirm.
        print(protein.id)


if __name__ == '__main__':
    main(sys.argv)
def merge_seqs(data1, data2):
    '''
    Concatenate the sequences of two data sets that share an identifier.

    data1, data2 -- iterables of objects with `name` and `sequence`
    attributes. The identifier of a record is the part of its name between
    the first '|' and the first '/'.

    Returns a list of strings '<name1>-<name2>-><len(seq1)>\\n<seq1><seq2>',
    one per matching pair, in the order of data1 (then data2).
    '''
    def seq_id(record):
        name = record.name
        return name[name.find('|') + 1:name.find('/')]

    # Index the second set once. This replaces the nested scans and the
    # removed `sets.Set` class with plain dictionaries.
    by_id = {}
    for record in data2:
        by_id.setdefault(seq_id(record), []).append(record)

    flist = []
    for j in data1:
        for k in by_id.get(seq_id(j), ()):
            tempname = j.name + '-' + k.name + '->' + str(len(j.sequence))
            tempseq = j.sequence + k.sequence
            flist.append(tempname + '\n' + tempseq)

    return flist
'''
Count the occurrences of every possible word of a given length in the
sequences of a FASTA file.

usage: code_42.py <fasta file> <word length>
'''

import sys
from itertools import product


def permutations(items, n):
    '''
    Yield every string of n elements drawn from items, with repetition.

    The strings come in the order of items, e.g. ['A', 'C'] with n=2 gives
    'AA', 'AC', 'CA', 'CC'. n=0 yields the empty string once.
    '''
    # itertools.product builds each word once instead of recomputing the
    # shorter words for every leading element
    for combo in product(items, repeat=n):
        yield ''.join(str(item) for item in combo)


def main(argv):
    '''Print '<word>\\t<count>' for all words of length argv[2].'''
    # project-local module, only needed when running as a script
    import fasta

    with open(argv[1]) as handle:
        seqs = fasta.read_fasta(handle.readlines())
    length = int(argv[2])

    nucleotides = ['A', 'C', 'G', 'T']
    merged_seqs = ''.join(record.sequence for record in seqs)

    for word in permutations(nucleotides, length):
        # count() returns an int, so it must be converted before joining
        print(word + '\t' + str(merged_seqs.count(word)))


if __name__ == '__main__':
    main(sys.argv)
'''
Count every subsequence of a given length in the sequences of a FASTA file.

usage: code_43.py <fasta file> <length>
'''

import sys
from collections import defaultdict


def count_kmers(sequences, length):
    '''
    Count all (overlapping) substrings of `length` characters.

    sequences -- iterable of strings
    length    -- substring length, must be at least 1 (ValueError otherwise)

    Returns a defaultdict {substring: number of occurrences}. Only
    substrings that actually occur are present.
    '''
    if length < 1:
        raise ValueError('length must be at least 1')
    # for a missing key, the dict entry is initialized to zero
    counts = defaultdict(int)
    for sequence in sequences:
        # + 1 so that the window ending at the last character is counted
        for n in range(len(sequence) - length + 1):
            counts[sequence[n:n + length]] += 1
    return counts


def main(argv):
    '''Print '<subsequence> <count>' for the FASTA file argv[1].'''
    # project-local module, only needed when running as a script
    import fasta

    with open(argv[1]) as handle:
        seqs = fasta.read_fasta(handle.readlines())
    counts = count_kmers((record.sequence for record in seqs), int(argv[2]))

    # print every subsequence that was found with its count
    for word in counts:
        print('%s %d' % (word, counts[word]))


if __name__ == '__main__':
    main(sys.argv)
'''
Ten ways to compute a factorial. Every function returns n! as an integer
and returns 1 for n <= 1. All of them run on Python 2 and Python 3.
'''

import operator
from functools import reduce


def fac_01(n):
    '''Plain loop.'''
    result = 1
    for i in range(2, n + 1):
        result *= i
    return result


def fac_02(n):
    '''reduce with a lambda; the initial value 1 covers the empty range.'''
    return reduce(lambda i, j: i * j, range(1, n + 1), 1)


def fac_03(n):
    '''reduce with operator.mul; the initial value 1 covers n < 2.'''
    return reduce(operator.mul, range(2, n + 1), 1)


def fac_04(n):
    '''Recursive lambda.'''
    fac = lambda n: fac(n - 1) * n if n > 1 else 1
    return fac(n)


def fac_05(n):
    '''Recursive lambda that selects the base case by indexing a list.'''
    # [1, 0][n > 0] is 1 for n <= 0 and 0 (false) otherwise
    fac = lambda n: [1, 0][n > 0] or fac(n - 1) * n
    return fac(n)


def fac_06(n):
    '''reduce over range(n), multiplying by b + 1.'''
    return reduce(lambda a, b: a * (b + 1), range(n), 1)


def fac_07(n):
    '''List-indexed base case, reduce for the rest.'''
    fac = lambda n: [1, 0][n > 0] or reduce(lambda x, y: x * y,
                                            range(1, n + 1))
    return fac(n)


def fac_08(n):
    '''Conditional expression plus reduce; always returns an integer.'''
    fac = lambda n: 1 if n <= 0 else reduce(lambda a, b: a * b,
                                            range(1, n + 1))
    return fac(n)


def fac_09(n):
    '''Running list of partial products.'''
    # this does not depend on comprehension variables leaking into the
    # enclosing scope, which only happens in Python 2
    products = [1]
    for i in range(2, n + 1):
        products.append(products[-1] * i)
    return products[-1]


def fac_10(n):
    '''Single comprehension that rebinds its own accumulator.'''
    # the leading [1] keeps the list non-empty when n < 2
    fac = lambda n: ([1] + [j for j in [1]
                            for i in range(2, n + 1)
                            for j in [j * i]])[-1]
    return fac(n)
'''
Search for motifs of 10 nucleotides that are over-represented in a set of
foreground sequences compared with a set of background sequences.

usage: code_47.py <foreground fasta> <background fasta>
'''

import sys
from collections import defaultdict


def choose(n, k):
    '''Return the binomial coefficient "n choose k" (0 if k is outside 0..n).'''
    if 0 <= k <= n:
        ntok = 1
        ktok = 1
        # n choose k equals n choose n-k, so use the shorter product
        for t in range(1, min(k, n - k) + 1):
            ntok *= n
            ktok *= t
            n -= 1
        return ntok // ktok
    else:
        return 0


def get_quorums(seqs, mlen):
    '''
    Map every motif of mlen characters to the set of sequences holding it.

    seqs -- iterable of strings
    mlen -- motif length
    Sequences are numbered from 1 in the order they are given.
    '''
    quorum = defaultdict(set)
    for id_no, seq in enumerate(seqs, 1):
        # + 1 so that the window ending at the last character is included
        for n in range(len(seq) - mlen + 1):
            quorum[seq[n:n + mlen]].add(id_no)
    return quorum


def main(argv):
    '''Print the motifs of argv[1] with 0 < p <= 0.0001 against argv[2].'''
    # project-local module, only needed when running as a script
    import fasta

    with open(argv[1]) as handle:
        input_seqs = fasta.read_seqs(handle.readlines())
    with open(argv[2]) as handle:
        input_seqs2 = fasta.read_seqs(handle.readlines())

    foreground = get_quorums(input_seqs, 10)
    background = get_quorums(input_seqs2, 10)

    N = len(input_seqs) + len(input_seqs2)
    # the same for every motif, so computed once
    term3 = choose(N, len(input_seqs))

    for motif in foreground:
        n_fore = len(foreground[motif])
        # .get avoids adding an empty entry for motifs missing from the
        # background
        n_back = len(background.get(motif, ()))
        term1 = choose(n_back, n_fore)
        term2 = choose(N - n_back, len(input_seqs) - 1)
        p = (float(term1) * float(term2)) / term3
        if 0 < p <= 0.0001:
            print('%s %s %s %s' % (motif, n_fore, n_back, p))


if __name__ == '__main__':
    main(sys.argv)
'''
Search for motifs of 10 nucleotides that are over-represented in a set of
foreground sequences compared with a set of background sequences, and print
them sorted.

usage: code_48.py <foreground fasta> <background fasta>
'''

import sys
from collections import defaultdict


def choose(n, k):
    '''Return the binomial coefficient "n choose k" (0 if k is outside 0..n).'''
    if 0 <= k <= n:
        ntok = 1
        ktok = 1
        # n choose k equals n choose n-k, so use the shorter product
        for t in range(1, min(k, n - k) + 1):
            ntok *= n
            ktok *= t
            n -= 1
        return ntok // ktok
    else:
        return 0


def get_quorums(seqs, mlen):
    '''
    Map every motif of mlen characters to the number of sequences holding it.

    seqs -- iterable of strings
    mlen -- motif length
    A motif is counted once per sequence, however often it occurs in it.
    '''
    quorum = defaultdict(int)
    for seq in seqs:
        # + 1 so that the window ending at the last character is included;
        # the set removes repeats within one sequence
        motifs = set(seq[n:n + mlen] for n in range(len(seq) - mlen + 1))
        for motif in motifs:
            quorum[motif] += 1
    return quorum


def main(argv):
    '''Print the motifs of argv[1] with 0 < p <= 0.0001 against argv[2].'''
    # project-local module, only needed when running as a script
    import fasta

    with open(argv[1]) as handle:
        input_seqs = fasta.read_seqs(handle.readlines())
    with open(argv[2]) as handle:
        input_seqs2 = fasta.read_seqs(handle.readlines())

    foreground = get_quorums(input_seqs, 10)
    background = get_quorums(input_seqs2, 10)

    N = len(input_seqs) + len(input_seqs2)
    # the same for every motif, so computed once
    term3 = choose(N, len(input_seqs))

    res_motifs = []
    for motif in foreground:
        # The quorums are plain integers here, so they are used directly;
        # .get avoids adding entries for motifs missing from the background.
        n_fore = foreground[motif]
        n_back = background.get(motif, 0)
        term1 = choose(n_back, n_fore)
        term2 = choose(N - n_back, len(input_seqs) - 1)
        p = (float(term1) * float(term2)) / term3
        if 0 < p <= 0.0001:
            res_motifs.append(motif + '\t' + str(n_fore) + '\t' +
                              str(n_back) + '\t' + str(p))

    res_motifs.sort()
    for line in res_motifs:
        print(line)


if __name__ == '__main__':
    main(sys.argv)
# dictionary with the genetic code; '_' marks a stop codon
_GENCODE = {
    'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
    'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
    'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
    'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
    'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
    'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
    'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W',
}


def translate_dna(sequence):
    '''
    Translate a DNA sequence into a protein sequence.

    sequence -- string of nucleotides, upper or lower case.
    The sequence is read in codons from its first character. Codons that
    are not in the genetic code (ambiguous bases, an incomplete codon at
    the end) are skipped. Stop codons are written as '_'.
    '''
    # GenBank files store sequences in lower case, the table uses upper case
    sequence = sequence.upper()
    amino_acids = []
    # loop to read the DNA sequence in codons, 3 nucleotides at a time
    for n in range(0, len(sequence), 3):
        amino_acid = _GENCODE.get(sequence[n:n + 3])
        if amino_acid is not None:
            amino_acids.append(amino_acid)
    return ''.join(amino_acids)
'''
Small helper module to read FASTA files and manipulate DNA sequences.
'''


class Fasta:
    '''One FASTA record: the header line (name) and its sequence.'''

    def __init__(self, name, sequence):
        self.name = name
        self.sequence = sequence


def read_fasta(file):
    '''
    Parse FASTA formatted lines into a list of Fasta objects.

    file -- iterable of lines (an open file or the result of readlines()).
    The name of a record is its stripped header line, including the '>'.
    Lines before the first header are ignored, and an input without any
    header yields an empty list.
    '''
    items = []
    name = None
    chunks = []
    for line in file:
        if line.startswith(">"):
            if name is not None:
                items.append(Fasta(name, ''.join(chunks)))
            name = line.strip()
            chunks = []
        elif name is not None:
            chunks.append(line.strip())

    if name is not None:
        items.append(Fasta(name, ''.join(chunks)))
    return items


def read_seqs(file):
    '''
    Return only the sequences of FASTA formatted lines, as strings.

    Header lines separate the records and their text is discarded.
    Sequence lines found before the first header are added to the first
    record. An input with no lines at all yields an empty list.
    '''
    items = []
    chunks = []
    headers = 0
    saw_line = False
    for line in file:
        saw_line = True
        if line.startswith(">"):
            if headers >= 1:
                items.append(''.join(chunks))
                chunks = []
            headers += 1
        else:
            # strip() instead of line[:-1]: the last line of a file may
            # have no line end, and its final base must not be cut off
            chunks.append(line.strip())

    if saw_line:
        items.append(''.join(chunks))
    return items


def format_output(sequence, length):
    '''Return sequence broken into lines of at most `length` characters.'''
    return '\n'.join(sequence[j:j + length]
                     for j in range(0, len(sequence), length))


# complement of every base; lower case and the unknown base N are kept as is
_COMPLEMENT = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N',
               'a': 't', 'c': 'g', 'g': 'c', 't': 'a', 'n': 'n'}


def complement(seq):
    '''
    Return the complement of seq as a list of bases, in the same order.

    Raises KeyError for characters other than A, C, G, T and N.
    '''
    return [_COMPLEMENT[base] for base in seq]


def reverse_complement(seq):
    '''Return the reverse complement of seq as a string.'''
    return ''.join(complement(seq[::-1]))


def transcribe(seq):
    '''Return the RNA transcript of seq (every T replaced by U).'''
    return seq.replace('T', 'U')


# Names used by other scripts of this collection.
# NOTE(review): invert is assumed to mean the reverse complement and
# get_seqs the record reader (its callers use `.sequence`) -- confirm.
invert = reverse_complement
get_seqs = read_fasta