├── Common Text Files ├── AA_Variant_Count.txt ├── AA_Mass_Table.txt ├── RNA_Codon_Table.txt ├── DNA_Codon_Table.txt └── Peptide_Scoring_Matrix.txt ├── Simple-Programs ├── K-mer Generator.py ├── Finding k-mers in DNA.py ├── Frequency Array.py ├── Transition to Transversion Ratio.py ├── Converting RNA into Protein.py ├── Consensus Scoring.py ├── Entropy Scoring.py ├── Frequent k-mers in DNA with Mismatches.py ├── Finding Protein Motifs.py ├── Determining Consensus Strand.py ├── RNA Splicing.py ├── Open Reading Frames.py ├── Finding Shared DNA Motifs.py └── Finding Shared k-mers.py ├── Eulerian Cycle.py ├── Genomic Suffix Tree.py ├── Eulerian Path.py ├── Viterbi Path.py ├── Burrows–Wheeler Transform.py ├── Multiple Alignment.py ├── Cyclopeptide Sequencing (Ideal Spectrum).py ├── Evolutionary Tree Reconstruction (Neighbor Joining).py ├── Genome Assembly with Single Reads.py ├── Genome Assembly with Paired Reads.py ├── Genome Graph Functions.py ├── Viterbi Learning.py ├── Local Alignment.py ├── Gibbs Motif Sampler.py ├── Global Alignment.py └── Cyclopeptide Sequencing (Non-Ideal Spectrum).py /Common Text Files/AA_Variant_Count.txt: -------------------------------------------------------------------------------- 1 | F 2 2 | L 2 3 | S 6 4 | Y 2 5 | C 2 6 | W 1 7 | L 6 8 | P 4 9 | H 2 10 | Q 2 11 | R 6 12 | I 3 13 | M 1 14 | T 4 15 | N 2 16 | K 2 17 | V 4 18 | A 4 19 | D 2 20 | E 2 21 | G 4 22 | -------------------------------------------------------------------------------- /Common Text Files/AA_Mass_Table.txt: -------------------------------------------------------------------------------- 1 | G 57 2 | A 71 3 | S 87 4 | P 97 5 | V 99 6 | T 101 7 | C 103 8 | I 113 9 | L 113 10 | N 114 11 | D 115 12 | K 128 13 | Q 128 14 | E 129 15 | M 131 16 | H 137 17 | F 147 18 | R 156 19 | Y 163 20 | W 186 21 | -------------------------------------------------------------------------------- /Simple-Programs/K-mer Generator.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | def GenerateKmers(k): 4 | nts = ['A','C','G','T'] 5 | kmers = ['A','C','G','T'] 6 | count = 0 7 | while count < k-1: 8 | kmers_temp = [] 9 | for kmer in kmers: 10 | for nt in nts: 11 | kmers_temp.append(kmer+nt) 12 | kmers = copy.deepcopy(kmers_temp) 13 | count += 1 14 | return kmers 15 | 16 | print GenerateKmers(4) 17 | 18 | #Output ->['AAAA', 'AAAC', 'AAAG', 'AAAT',...,'TTTA', 'TTTC', 'TTTG', 'TTTT'] 19 | -------------------------------------------------------------------------------- /Common Text Files/RNA_Codon_Table.txt: -------------------------------------------------------------------------------- 1 | UUU F 2 | CUU L 3 | AUU I 4 | GUU V 5 | UUC F 6 | CUC L 7 | AUC I 8 | GUC V 9 | UUA L 10 | CUA L 11 | AUA I 12 | GUA V 13 | UUG L 14 | CUG L 15 | AUG M 16 | GUG V 17 | UCU S 18 | CCU P 19 | ACU T 20 | GCU A 21 | UCC S 22 | CCC P 23 | ACC T 24 | GCC A 25 | UCA S 26 | CCA P 27 | ACA T 28 | GCA A 29 | UCG S 30 | CCG P 31 | ACG T 32 | GCG A 33 | UAU Y 34 | CAU H 35 | AAU N 36 | GAU D 37 | UAC Y 38 | CAC H 39 | AAC N 40 | GAC D 41 | UAA / 42 | CAA Q 43 | AAA K 44 | GAA E 45 | UAG / 46 | CAG Q 47 | AAG K 48 | GAG E 49 | UGU C 50 | CGU R 51 | AGU S 52 | GGU G 53 | UGC C 54 | CGC R 55 | AGC S 56 | GGC G 57 | UGA / 58 | CGA R 59 | AGA R 60 | GGA G 61 | UGG W 62 | CGG R 63 | AGG R 64 | GGG G 65 | -------------------------------------------------------------------------------- /Common Text Files/DNA_Codon_Table.txt: -------------------------------------------------------------------------------- 1 | TTT F 2 | TTC F 3 | TTA L 4 | TTG L 5 | CTT L 6 | CTC L 7 | CTA L 8 | CTG L 9 | ATT I 10 | ATC I 11 | ATA I 12 | ATG M 13 | GTT V 14 | GTC V 15 | GTA V 16 | GTG V 17 | TCT S 18 | TCC S 19 | TCA S 20 | TCG S 21 | CCT P 22 | CCC P 23 | CCA P 24 | CCG P 25 | ACT T 26 | ACC T 27 | ACA T 28 | ACG T 29 | GCT A 30 | GCC A 31 | GCA A 32 | GCG A 33 | TAT Y 34 | TAC Y 35 | TAA / 36 | TAG / 37 | CAT H 38 | CAC H 39 | CAA Q 40 | CAG Q 41 | AAT N 42 | AAC N 43 | AAA K 44 | AAG K 45 | GAT D 46 | GAC D 47 | GAA E 48 | GAG E 49 | TGT C 50 | TGC C 51 | TGA / 52 | TGG W 53 | CGT R 54 | CGC R 55 | CGA R 56 | CGG R 57 | AGT S 58 | AGC S 59 | AGA R 60 | AGG R 61 | GGT G 62 | GGC G 63 | GGA G 64 | GGG G 65 | -------------------------------------------------------------------------------- /Simple-Programs/Finding k-mers in DNA.py: -------------------------------------------------------------------------------- 1 | DNA = 'TCACGCAGCATCACGCAGCATGCTTCCGAGTGTAAAGGCCTGCTTCCGAGTGTAAAGGCCTGCTTCCGAGTCACGCAGCAAATCGATCCCTGCTTCCGAGAATCGATCCCAATCGATCCCTGCTTCCGAGTGCTTCCGAG' 2 | k = 3 #length of k-mer 3 | kmers = {} 4 | 5 | for i in range(0,len(DNA)-k): 6 | kmers[DNA[i:k+i]] = 0 7 | 8 | for kmer in kmers: 9 | for i in range(0,len(DNA)): 10 | if kmer == DNA[i:len(kmer)+i]: 11 | kmers[kmer] += 1 12 | 13 | import operator 14 | max_count = kmers[max(kmers.iteritems(), key=operator.itemgetter(1))[0]] 15 | print "Count: " + str(max_count) 16 | print "k-mers:", 17 | 18 | for kmer in kmers: 19 | if kmers[kmer] == max_count: 20 | print kmer, 21 | ''' 22 | 23 | Output 24 | Count: 9 25 | k-mers: CGA TCC 26 | 27 | ''' 28 | -------------------------------------------------------------------------------- /Simple-Programs/Frequency Array.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | string = 'ATCGTGTAACCGGTGGGACGCCCGGCGAGGCTGATCTGTCTATGCTCATTTCAGACCACTCGTAGCGAGCCTGTTCGTC' 4 | k = 2 #length of k-mer 5 | 6 | nts = ['A','C','G','T'] 7 | kmers_one = ['A','C','G','T'] 8 | kmers_two = [] 9 | while k > 1: 10 | kmers_two = [] 11 | for str in kmers_one: 12 | for nt in nts: 13 | temp = copy.deepcopy(str) 14 | temp += nt 15 | kmers_two.append(temp) 16 | k -= 1 17 | kmers_one = copy.deepcopy(kmers_two) 18 | 19 | kmer_dict = {} 20 | for kmer in kmers_one: 21 | kmer_dict[kmer] = 0 22 | 23 | k = 2 24 | i = 0 25 | while i < len(string)-k+1: 26 | kmer_dict[string[i:i+k]] += 1 27 | i += 1 28 | 29 | for kmer in kmers_one: 30 | print kmer_dict[kmer], 31 | 32 | #Output -> 1 4 4 4 3 5 8 6 5 6 5 7 3 8 6 3 33 | #Lexicographically ordered -> AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT 34 | -------------------------------------------------------------------------------- /Simple-Programs/Transition to Transversion Ratio.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Text file containing two strings to be compared 3 | >String_1 4 | CGCGAATTAAGTAACTATCACACAAGTCACGTAGACCTAATAGCACTCCAGCGCTGCAGC 5 | GACCATATAATCGTGACGGCGAAGGACCCCTGCGCTTTAAGGTTTTCACGTTCCTTACAT 6 | >String_2 7 | CGCGAACTAAGCAGCCATCATACAGGTCTCTAGGACCTGACCGCGCTTAAACTCTGCAGC 8 | GACTACATAATCGCCGTGACAAAGGACCCCTGCATTCTAACGTCTTCACGTACTTTATAT 9 | 10 | ''' 11 | 12 | infile = open('s1s2.txt', 'r') 13 | s1 = '' 14 | s2 = '' 15 | switch = 0 16 | for line in infile: 17 | if line[0] == '>': 18 | switch += 1 19 | if line[0] != '>': 20 | if switch == 1: 21 | s1 = s1 + line.rstrip('\n') 22 | if switch == 2: 23 | s2 = s2 + line.rstrip('\n') 24 | 25 | list = {'G' : 'Pyr', 'A' : 'Pyr', 'C' : 'Pur', 'T' : 'Pur'} 26 | 27 | transi = 0.00000 28 | transv = 0.00000 29 | 30 | for pos in range(0,len(s1)): 31 | if s1[pos] != s2[pos]: 32 | if list[s1[pos]] == list[s2[pos]]: 33 | transi += 1 34 | elif list[s1[pos]] != list[s2[pos]]: 35 | transv += 1 36 | 37 | print (transi/transv) 38 | 39 | #Output -> 2.77777777778 40 | -------------------------------------------------------------------------------- /Simple-Programs/Converting RNA into Protein.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Text file containing codon/protein pairs (per line) 3 | For example... 4 | 5 | UUU F CUU L AUU I GUU V 6 | UUC F CUC L AUC I GUC V 7 | UUA L CUA L AUA I GUA V 8 | UUG L CUG L AUG M GUG V 9 | UCU S CCU P ACU T GCU A 10 | UCC S CCC P ACC T GCC A 11 | UCA S CCA P ACA T GCA A 12 | UCG S CCG P ACG T GCG A 13 | UAU Y CAU H AAU N GAU D 14 | UAC Y CAC H AAC N GAC D 15 | UAA * CAA Q AAA K GAA E 16 | UAG * CAG Q AAG K GAG E 17 | UGU C CGU R AGU S GGU G 18 | UGC C CGC R AGC S GGC G 19 | UGA * CGA R AGA R GGA G 20 | UGG W CGG R AGG R GGG G 21 | 22 | ''' 23 | 24 | infile = open("codons.txt", "r") 25 | 26 | #Load Dictionary 27 | codons = {} 28 | for line in infile: 29 | codons[line[0:3]] = line[4] 30 | 31 | RNA = "AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGAUGA" 32 | Protein = "" 33 | 34 | x = 0 35 | y = 3 36 | 37 | while y < len(RNA): 38 | if codons[RNA[x:y]] == '*': 39 | break 40 | Protein = Protein + codons[RNA[x:y]] 41 | x += 3 42 | y += 3 43 | 44 | print Protein 45 | 46 | infile.close() 47 | 48 | #Output -> MAMAPRTEINSTRING 49 | -------------------------------------------------------------------------------- /Simple-Programs/Consensus Scoring.py: -------------------------------------------------------------------------------- 1 | k = 4 2 | list = ['CAGT', 'GTGT', 'TACC', 'ATCA', 'TCCA'] 3 | 4 | def score(list,k): 5 | AA = [0] * k 6 | CC = [0] * k 7 | GG = [0] * k 8 | TT = [0] * k 9 | x = 0 10 | y = 0 11 | consensus = '' 12 | 13 | for kmer in list: 14 | for i in range(0,len(kmer)): 15 | if kmer[i] == "A": 16 | AA[i] += 1 17 | elif kmer[i] == "C": 18 | CC[i] += 1 19 | elif kmer[i] == "G": 20 | GG[i] += 1 21 | elif kmer[i] == "T": 22 | TT[i] += 1 23 | 24 | for i in range(0,k): 25 | if AA[i] == max(AA[i],CC[i],GG[i],TT[i]): 26 | consensus = consensus + "A" 27 | elif CC[i] == max(AA[i],CC[i],GG[i],TT[i]): 28 | consensus = consensus + "C" 29 | elif GG[i] == max(AA[i],CC[i],GG[i],TT[i]): 30 | consensus = consensus + "G" 31 | elif TT[i] == max(AA[i],CC[i],GG[i],TT[i]): 32 | consensus = consensus + "T" 33 | 34 | score = 0 35 | pos = 0 36 | for nt in consensus: 37 | for kmer in list: 38 | if kmer[pos] == nt: 39 | pass 40 | else: 41 | score += 1 42 | pos += 1 43 | return score 44 | 45 | print score(list,k) 46 | 47 | #Output -> 11 48 | -------------------------------------------------------------------------------- /Simple-Programs/Entropy Scoring.py: -------------------------------------------------------------------------------- 1 | k = 4 2 | list = ['CAGT', 'GTGT', 'TACC', 'ATCA', 'TCCA'] 3 | 4 | def score(list,k): 5 | AA = [0.0] * k 6 | CC = [0.0] * k 7 | GG = [0.0] * k 8 | TT = [0.0] * k 9 | 10 | aa = [0.0] * k 11 | cc = [0.0] * k 12 | gg = [0.0] * k 13 | tt = [0.0] * k 14 | TTotal = [0.0] * k 15 | 16 | for kmer in list: 17 | for i in range(0,len(kmer)): 18 | if kmer[i] == "A": 19 | AA[i] += 1 20 | elif kmer[i] == "C": 21 | CC[i] += 1 22 | elif kmer[i] == "G": 23 | GG[i] += 1 24 | elif kmer[i] == "T": 25 | TT[i] += 1 26 | 27 | for i in range(0,k): 28 | TTotal[i] = AA[i]+CC[i]+GG[i]+TT[i] 29 | aa[i] = AA[i]/TTotal[i] 30 | cc[i] = CC[i]/TTotal[i] 31 | gg[i] = GG[i]/TTotal[i] 32 | tt[i] = TT[i]/TTotal[i] 33 | 34 | from math import log 35 | score = 0 36 | for i in range(0,k): 37 | if aa[i] == 0.0: 38 | score += 0 39 | else: 40 | score += (aa[i]*log(aa[i],2)) 41 | 42 | if cc[i] == 0.0: 43 | score += 0 44 | else: 45 | score += (cc[i]*log(cc[i],2)) 46 | 47 | if gg[i] == 0.0: 48 | score += 0 49 | else: 50 | score += (gg[i]*log(gg[i],2)) 51 | 52 | if tt[i] == 0.0: 53 | score += 0 54 | else: 55 | score += (tt[i]*log(tt[i],2)) 56 | 57 | return score*-1. 58 | 59 | print score(list,k) 60 | 61 | #Output -> 5.93673487912 62 | -------------------------------------------------------------------------------- /Eulerian Cycle.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | from copy import deepcopy 3 | 4 | infile = open('Cycle.txt', 'r') 5 | 6 | #Create adjacency list 7 | def CreateAdjacencyList(infile): 8 | 9 | adj_list = {} 10 | circuit_max = 0 11 | for line in infile: 12 | node = line.strip('\n') 13 | node = node.replace(' -> ', ' ') 14 | node = node.split(' ') 15 | adj_list.setdefault(node[0], []) 16 | for number in node[1].split(','): 17 | adj_list[node[0]].append(number) 18 | circuit_max += 1 19 | 20 | return adj_list, circuit_max 21 | 22 | def FindEulerianCycle(infile): 23 | 24 | #Create adjacency list 25 | adj_list, circuit_max = CreateAdjacencyList(infile) 26 | 27 | #Reduced adjacency list to keep track of traveled edges 28 | red_adj_list = {} 29 | red_adj_list = deepcopy(adj_list) 30 | 31 | #Arbitrary starting point (if graph is directed/balanced) 32 | start = '6' 33 | curr_vrtx = '6' 34 | 35 | stack = [] 36 | circuit = [] 37 | while len(circuit) != circuit_max: 38 | 39 | if red_adj_list[curr_vrtx] != []: #If neighbors exist 40 | stack.append(curr_vrtx) 41 | pick = randint(0,len(red_adj_list[curr_vrtx])-1) 42 | temp = deepcopy(curr_vrtx) 43 | curr_vrtx = red_adj_list[temp][pick] 44 | red_adj_list[temp].remove(curr_vrtx) 45 | 46 | else: 47 | circuit.append(curr_vrtx) 48 | curr_vrtx = stack[len(stack)-1] 49 | stack.pop() 50 | 51 | #Formatting 52 | path = start + '->' 53 | for vrtx in circuit[::-1]: 54 | path += (vrtx + '->') 55 | return path.strip('->') 56 | 57 | print FindEulerianCycle(infile) 58 | -------------------------------------------------------------------------------- /Genomic Suffix Tree.py: -------------------------------------------------------------------------------- 1 | class Node(object): 2 | def __init__(self, value, ID): 3 | self.value = value 4 | self.children = {} 5 | self.ID = ID #Label nodes with ID 6 | 7 | class Trie(object): 8 | def __init__(self): 9 | self.root = Node(None, 0) #Root is labeled 0 10 | 11 | def AddString(self, pattern, ID): 12 | current = self.root #Start at root 13 | for nt in pattern: 14 | if nt in current.children: 15 | current = current.children[nt] 16 | else: 17 | new_node = Node(nt, ID) 18 | current.children[nt] = new_node 19 | current = new_node 20 | ID += 1 21 | if None not in current.children: 22 | current.children[None] = None 23 | return ID 24 | 25 | def Traverse(self, suffix, branch_ids, edges): 26 | current = self.root 27 | edge_temp = '' 28 | for nt in suffix: 29 | if nt in current.children: 30 | current = current.children[nt] 31 | if len(current.children) >= 2: 32 | if current.ID in branch_ids: 33 | edge_temp = '' 34 | else: 35 | edge_temp += nt 36 | branch_ids.append(current.ID) 37 | edges.append(edge_temp) 38 | edge_temp = '' 39 | if len(current.children) == 1: 40 | edge_temp += nt 41 | edges.append(edge_temp) 42 | edge_temp = '' 43 | return branch_ids, edges 44 | 45 | def SuffixTree(genome): 46 | #Trie loading 47 | Tree = Trie() 48 | ID = 1 49 | i = 0 50 | while i < len(genome): 51 | ID = Trie.AddString(Tree, genome[i:], ID) 52 | i += 1 53 | 54 | #Trie traversal 55 | branch_ids = [] 56 | edges = [] 57 | i = 0 58 | while i < len(genome): 59 | branch_ids, edges = Trie.Traverse(Tree, genome[i:], branch_ids, edges) 60 | i += 1 61 | edges.remove('') 62 | 63 | return sorted(edges) 64 | 65 | genome = 'CTTGTTCGTAAA' 66 | edges = SuffixTree(genome) 67 | for edge in edges: 68 | print edge 69 | -------------------------------------------------------------------------------- /Simple-Programs/Frequent k-mers in DNA with Mismatches.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | def GenerateKmers(k): 4 | nts = ['A','C','G','T'] 5 | kmers = ['A','C','G','T'] 6 | count = 0 7 | while count < k-1: 8 | kmers_temp = [] 9 | for kmer in kmers: 10 | for nt in nts: 11 | kmers_temp.append(kmer+nt) 12 | kmers = copy.deepcopy(kmers_temp) 13 | count += 1 14 | return kmers 15 | 16 | def HammingDistance(genome, pattern): 17 | mismatch = 0 18 | i = 0 19 | while i < len(genome): 20 | if genome[i] != pattern[i]: 21 | mismatch += 1 22 | i += 1 23 | return mismatch 24 | 25 | def ReverseComplement(DNAin): 26 | y = len(DNAin) - 1 27 | DNAout = '' 28 | while y > -1: 29 | if DNAin[y] == "A": 30 | DNAout = DNAout + "T" 31 | elif DNAin[y] == "T": 32 | DNAout = DNAout + "A" 33 | elif DNAin[y] == "C": 34 | DNAout = DNAout + "G" 35 | elif DNAin[y] == "G": 36 | DNAout = DNAout + "C" 37 | y = y - 1 38 | return DNAout 39 | 40 | genome = 'ACGTTGCATGTCGCATGAGCTAGCTTTGATGAGGATGAGCTTTGGAGCGCCCAAACTGCATGAGAGCT' 41 | k = 5 #Length of k-mer 42 | d = 2 #Max mismatches allowed 43 | kmers = GenerateKmers(k) #Generate k-mers 44 | 45 | #Compute number of appearances of k-mer and its reverse compliment (with no more than d mismatches) throughout genome 46 | most_frequent = {} 47 | for kmer in kmers: 48 | j = 0 49 | while j < len(genome)-len(kmer)+1: 50 | kmer_rev = ReverseComplement(kmer) 51 | if HammingDistance(genome[j:j+len(kmer)],kmer) <= d: 52 | most_frequent.setdefault((kmer, kmer_rev),0) 53 | most_frequent[(kmer, kmer_rev)] += 1 54 | if HammingDistance(genome[j:j+len(kmer_rev)],kmer_rev) <= d: 55 | most_frequent.setdefault((kmer, kmer_rev),0) 56 | most_frequent[(kmer, kmer_rev)] += 1 57 | j += 1 58 | 59 | #Find maximum values 60 | output_ordered = [] 61 | values = [] 62 | for kmer in most_frequent: 63 | values.append(most_frequent[kmer]) 64 | for kmer in most_frequent: 65 | if most_frequent[kmer] == max(values): 66 | output_ordered.append(kmer) 67 | 68 | #Seperate copies of (kmer/kmer_rev) and (kmer_rev/kmer) 69 | seperate = {} 70 | for pair in sorted(output_ordered): 71 | for kmer in pair: 72 | seperate[kmer] = 0 73 | for kmer in seperate: 74 | print kmer, 75 | 76 | #Output -> TGCTC GAGCA 77 | -------------------------------------------------------------------------------- /Simple-Programs/Finding Protein Motifs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Text file containg protein IDs 3 | For example... 4 | 5 | Q640N1 6 | Q8PV50 7 | P0AAM4 8 | Q8CE94 9 | P49286 10 | 11 | ''' 12 | 13 | #Get primary structure and store as string 14 | import urllib2 15 | def find_protein(protein_id): 16 | give_protein = "" 17 | return_protein = "" 18 | 19 | for line in urllib2.urlopen("http://www.uniprot.org/uniprot/%s.fasta" % protein_id): 20 | if line[0] != ">": 21 | give_protein = give_protein + line 22 | 23 | for char in give_protein: 24 | if char != "\n": 25 | return_protein = return_protein + char 26 | 27 | print protein_id 28 | return return_protein 29 | 30 | 31 | #Finding the positions of N-glycosylation motif = N{P}[ST]{P} 32 | def find_motif(protein): 33 | a = 0 34 | b = 1 35 | c = 2 36 | d = 3 37 | 38 | positions = [] 39 | 40 | for i in range(0, len(protein)-4): 41 | if protein[a] == "N": 42 | if protein[b] != "P": 43 | if (protein[c] == "S") or (protein[c] == "T"): 44 | if protein[d] != "P": 45 | positions.append(i+1) 46 | a += 1 47 | b += 1 48 | c += 1 49 | d += 1 50 | else: 51 | a += 1 52 | b += 1 53 | c += 1 54 | d += 1 55 | else: 56 | a += 1 57 | b += 1 58 | c += 1 59 | d += 1 60 | else: 61 | a += 1 62 | b += 1 63 | c += 1 64 | d += 1 65 | else: 66 | a += 1 67 | b += 1 68 | c += 1 69 | d += 1 70 | 71 | pos = "" 72 | for num in positions: 73 | pos = pos + str(num) + " " 74 | print pos 75 | 76 | infile = open("proteins.txt", "r") 77 | 78 | listy = [] 79 | 80 | for line in infile: 81 | listy.append(line.strip()) 82 | 83 | for line in listy: 84 | find_motif(find_protein(line)) 85 | 86 | infile.close() 87 | 88 | ''' 89 | Output 90 | 91 | Q640N1 92 | 471 519 913 1030 93 | Q8PV50 94 | 188 195 95 | P0AAM4 96 | Q8CE94 97 | 369 98 | P49286 99 | 4 130 100 | 101 | ''' 102 | -------------------------------------------------------------------------------- /Simple-Programs/Determining Consensus Strand.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Text file containing ID and Sequence (one line each) 3 | Simple example...(sequences can be much longer) 4 | 5 | >Seq_1 6 | ATCCAGCTGTCCGACCCT 7 | >Seq_2 8 | GGGCAACTTTCCGACTTT 9 | >Seq_3 10 | ATGGATCTATCGGACTCT 11 | >Seq_4 12 | AAGCAACCATCGGACTTT 13 | >Seq_5 14 | TTGGAACTCTGGTACTCT 15 | >Seq_6 16 | ATGCCATTTCCGATCACA 17 | >Seq_7 18 | ATGGCACTACTGGGTCCA 19 | 20 | ''' 21 | 22 | infile = open("in.txt", "r") 23 | 24 | A = 0 25 | C = 0 26 | G = 0 27 | T = 0 28 | 29 | x = 0 30 | y = 0 31 | 32 | consensus = "" 33 | a = "" 34 | c = "" 35 | g = "" 36 | t = "" 37 | 38 | lines = [] 39 | 40 | start = 0 41 | z = -1 42 | 43 | for line in infile: 44 | if line[0] == ">": 45 | lines.append(line) 46 | start = 1 47 | z += 2 48 | else: 49 | if start == 1: 50 | lines.append(line) 51 | start = 0 52 | elif start == 0: 53 | lines[z] = lines[z] + line 54 | 55 | 56 | while x < (len(lines[1])-1): 57 | while y < len(lines): 58 | if lines[y][0] != ">": 59 | if lines[y][x] == "A": 60 | A += 1 61 | elif lines[y][x] == "C": 62 | C += 1 63 | elif lines[y][x] == "G": 64 | G += 1 65 | elif lines[y][x] == "T": 66 | T += 1 67 | y += 1 68 | else: 69 | y += 1 70 | 71 | if A == max(A,C,G,T): 72 | consensus = consensus + "A" 73 | elif C == max(A,C,G,T): 74 | consensus = consensus + "C" 75 | elif G == max(A,C,G,T): 76 | consensus = consensus + "G" 77 | elif T == max(A,C,G,T): 78 | consensus = consensus + "T" 79 | 80 | a = a + " " + str(A) 81 | c = c + " " + str(C) 82 | g = g + " " + str(G) 83 | t = t + " " + str(T) 84 | 85 | x += 1 86 | y = 0 87 | A = 0 88 | C = 0 89 | G = 0 90 | T = 0 91 | 92 | print("Consensus: "+ consensus + "\n") 93 | 94 | topofmatrix = " " 95 | for base in consensus: 96 | topofmatrix = topofmatrix + base + " " 97 | print(topofmatrix + "\n") 98 | print("A:" + a) 99 | print("C:" + c) 100 | print("G:" + g) 101 | print("T:" + t) 102 | 103 | infile.close() 104 | 105 | ''' 106 | Output 107 | 108 | Consensus: ATGCAACTATCGGACTCT 109 | 110 | A T G C A A C T A T C G G A C T C T 111 | 112 | A: 5 1 0 0 5 5 0 0 3 0 0 0 1 5 0 1 0 2 113 | C: 0 0 1 4 2 0 6 1 1 2 5 2 0 0 6 2 5 0 114 | G: 1 1 6 3 0 1 0 0 1 0 1 5 5 1 0 0 0 0 115 | T: 1 5 0 0 0 1 1 6 2 5 1 0 1 1 1 4 2 5 116 | 117 | ''' 118 | -------------------------------------------------------------------------------- /Simple-Programs/RNA Splicing.py: -------------------------------------------------------------------------------- 1 | 2 | ''' 3 | Text file containing codon/protein pairs (per line) 4 | For example... 5 | TTT F CTT L ATT I GTT V 6 | TTC F CTC L ATC I GTC V 7 | TTA L CTA L ATA I GTA V 8 | TTG L CTG L ATG M GTG V 9 | TCT S CCT P ACT T GCT A 10 | TCC S CCC P ACC T GCC A 11 | TCA S CCA P ACA T GCA A 12 | TCG S CCG P ACG T GCG A 13 | TAT Y CAT H AAT N GAT D 14 | TAC Y CAC H AAC N GAC D 15 | TAA * CAA Q AAA K GAA E 16 | TAG * CAG Q AAG K GAG E 17 | TGT C CGT R AGT S GGT G 18 | TGC C CGC R AGC S GGC G 19 | TGA * CGA R AGA R GGA G 20 | TGG W CGG R AGG R GGG G 21 | 22 | Text file containing pre-RNA and splice sites 23 | >RNA_1 24 | ATGGTCTACATAGCTGACAAACAGCACGTAGCAATCGGTCGAATCTCGAGAGGCATATGGTCACATGATCGGTCGAGCGTGTTTCAAAGTTTGCGCCTAG 25 | >Intron_1 26 | ATCGGTCGAA 27 | >Intron_2 28 | ATCGGTCGAGCGTGT 29 | 30 | ''' 31 | 32 | RNA = open("splicing.txt", "r") 33 | SPLICE = open("introns.txt", "w") 34 | 35 | #Seperate the introns 36 | x = 0 37 | y = 0 38 | for line in RNA: 39 | if y != 2: 40 | if line[0] == ">": 41 | y += 1 42 | if y == 2: 43 | SPLICE.write(line) 44 | 45 | SPLICE.close() 46 | RNA.close() 47 | 48 | #Create mRNA (pre-RNA at this point) string 49 | RNA = open("splicing.txt", "r") 50 | mRNA = "" 51 | for line in RNA: 52 | if x != 2: 53 | if line[0] == ">": 54 | x += 1 55 | else: 56 | mRNA = mRNA + line 57 | 58 | #Remove internal line breaks 59 | mRNA_listy = list(mRNA) 60 | for pos in range(0,len(mRNA_listy)-1): 61 | if mRNA_listy[pos] == "\n": 62 | mRNA_listy[pos] = "" 63 | mRNA = "".join(mRNA_listy) 64 | 65 | #Load introns into list 66 | SPLICE = open("introns.txt", "r") 67 | introns = [] 68 | for line in SPLICE: 69 | if line[0] != '>': 70 | introns.append(line.rstrip('\n')) 71 | 72 | #Splice those introns out to get your mRNA 73 | for intron in introns: 74 | a = 0 75 | b = len(intron) 76 | while b < len(mRNA): 77 | if mRNA[a:b] == intron: 78 | mRNA = mRNA[:a] + mRNA[b:] 79 | a += 1 80 | b += 1 81 | else: 82 | a += 1 83 | b += 1 84 | 85 | #Text file containing codon/protein pairs 86 | infile = open("DNA_Codon_Table.txt", "r") 87 | 88 | #Load dictionary 89 | codons = {} 90 | for line in infile: 91 | codons[line[0:3]] = line[4] 92 | 93 | #Convert mRNA into Protein 94 | Protein = "" 95 | x = 0 96 | y = 3 97 | while y < len(mRNA): 98 | Protein = Protein + codons[mRNA[x:y]] 99 | x += 3 100 | y += 3 101 | 102 | print Protein 103 | 104 | #Output -> MVYIADKQHVASREAYGHMFKVCA* 105 | 106 | 107 | -------------------------------------------------------------------------------- /Eulerian Path.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | from copy import deepcopy 3 | 4 | infile = open('Path.txt', 'r') 5 | 6 | #Create adjacency list 7 | def CreateAdjacencyList(infile): 8 | 9 | adj_list = {} 10 | circuit_max = 0 11 | for line in infile: 12 | node = line.strip('\n') 13 | node = node.replace(' -> ', ' ') 14 | node = node.split(' ') 15 | adj_list.setdefault(node[0], []) 16 | for number in node[1].split(','): 17 | adj_list[node[0]].append(number) 18 | circuit_max += 1 19 | 20 | return adj_list, circuit_max 21 | 22 | #Find start/end nodes 23 | def FindStart(red_adj_list): 24 | 25 | start = {} 26 | for one in red_adj_list: 27 | start.setdefault(one, 0) 28 | start[one] += len(red_adj_list[one]) 29 | 30 | end = {} 31 | for one in red_adj_list: 32 | for two in red_adj_list[one]: 33 | end.setdefault(two, 0) 34 | end[two] += 1 35 | 36 | for one in end: 37 | try: 38 | if start[one] != end[one]: 39 | if start[one] > end[one]: 40 | start_node = one 41 | if start[one] < end[one]: 42 | end_node = one 43 | except KeyError: 44 | end_node = one 45 | 46 | for one in start: 47 | try: 48 | if end[one] != start[one]: 49 | if end[one] < start[one]: 50 | start_node = one 51 | if end[one] > start: 52 | end_node = one 53 | except KeyError: 54 | start_node = one 55 | 56 | red_adj_list[end_node] = [] 57 | 58 | return red_adj_list, start_node 59 | 60 | def FindEulerianPath(infile): 61 | 62 | #Create adjacency list 63 | adj_list, circuit_max = CreateAdjacencyList(infile) 64 | 65 | #Reduced adjacency list to keep track of traveled edges 66 | red_adj_list = {} 67 | red_adj_list = deepcopy(adj_list) 68 | 69 | #Find start node (graph must be directed/ubalanced) 70 | red_adj_list, start_node= FindStart(red_adj_list) 71 | 72 | start = start_node 73 | curr_vrtx = start_node 74 | path = [curr_vrtx] 75 | stack = [] 76 | circuit = [] 77 | while len(circuit) != circuit_max: 78 | 79 | if red_adj_list[curr_vrtx] != []: #If neighbors exist 80 | stack.append(curr_vrtx) 81 | pick = randint(0,len(red_adj_list[curr_vrtx])-1) 82 | temp = deepcopy(curr_vrtx) 83 | curr_vrtx = red_adj_list[temp][pick] 84 | red_adj_list[temp].remove(curr_vrtx) 85 | 86 | else: 87 | circuit.append(curr_vrtx) 88 | curr_vrtx = stack[len(stack)-1] 89 | stack.pop() 90 | 91 | #Formatting 92 | path = start + '->' 93 | for vrtx in circuit[::-1]: 94 | path += (vrtx + '->') 95 | return path.strip('->') 96 | 97 | print FindEulerianPath(infile) 98 | -------------------------------------------------------------------------------- /Viterbi Path.py: -------------------------------------------------------------------------------- 1 | infile = open('TransitionMatrix.txt', 'r') 2 | TransitionMatrix = [] 3 | for line in infile: 4 | TransitionMatrix.append(line.strip('\n').split('\t')) 5 | infile.close() 6 | 7 | infile = open('EmissionMatrix.txt', 'r') 8 | EmissionMatrix = [] 9 | for line in infile: 10 | EmissionMatrix.append(line.strip('\n').split('\t')) 11 | infile.close() 12 | 13 | string = 'xzzxzxxxyxxzxxyzxzyxzyzzxyzyyxxyyyzxzxxyxxyxxzxzyyzzxxzyxzxyxxyyzzzxzyyzyzzxyzyxyxyyxzxyxzyzzzyyyzzz' 14 | alphabet = 'x y z'.split() #Observation space 15 | states = 'A B C D'.split() #State space 16 | 17 | def Viterbi(string, alphabet, states, TransitionMatrix, EmissionMatrix): 18 | 19 | TM = {} #Setup transition matrix 20 | i = 0 21 | for state1 in states: 22 | TM[state1] = {} 23 | j = 1 24 | for state2 in states: 25 | TM[state1][state2] = float(TransitionMatrix[i][j]) 26 | j += 1 27 | i += 1 28 | 29 | EM = {} #Setup emission matrix 30 | i = 0 31 | for state in states: 32 | EM[state] = {} 33 | j = 1 34 | for letter in alphabet: 35 | EM[state][letter] = float(EmissionMatrix[i][j]) 36 | j += 1 37 | i += 1 38 | 39 | Pi = 1 #Probability of initial state set to equally likely = 1/|states| 40 | 41 | T1 = {} #Scoring matrix 42 | T2 = {} #Backtracking matrix 43 | for state in states: 44 | T1[state] = [0]*len(string) 45 | T2[state] = [0]*len(string) 46 | 47 | #Initalize scores at source 48 | for state in states: 49 | T1[state][0] = Pi*EM[state][string[0]] 50 | 51 | #Fill in remaining scores through dynamic programming 52 | i = 1 53 | while i < len(string): 54 | for j in states: 55 | values = [] 56 | K = [] 57 | for k in states: 58 | values.append(T1[k][i-1]*TM[k][j]*EM[j][string[i]]) 59 | K.append((T1[k][i-1]*TM[k][j]*EM[j][string[i]], k)) 60 | 61 | #Choose max weight of possible edges (k->j) 62 | T1[j][i] = max(values) 63 | #Remember which edge was chosen for backtracking 64 | for k in K: 65 | if k[0] == max(values): 66 | T2[j][i] = k[1] 67 | i += 1 68 | 69 | #Find maximum sink value and begin most probable path 70 | values = [] 71 | K = [] 72 | for state in T1: 73 | values.append(T1[state][len(string)-1]) 74 | for state in T1: 75 | if T1[state][len(string)-1] == max(values): 76 | path = state 77 | last = state 78 | 79 | #Backtracking 80 | i = len(string)-1 81 | while i > 0: 82 | for state in states: 83 | if T2[last][i] == state: 84 | last = state 85 | path += last 86 | i -= 1 87 | break 88 | 89 | return path[::-1] 90 | 91 | print Viterbi(string, alphabet, states, TransitionMatrix, EmissionMatrix) 92 | -------------------------------------------------------------------------------- /Burrows–Wheeler Transform.py: -------------------------------------------------------------------------------- 1 | def BWTConstruction(Text): 2 | #Find cyclic suffixes 3 | Suffixes = [] 4 | i = 0 5 | while i < len(Text): 6 | Suffixes.append(Text[i:]+Text[:i]) 7 | i += 1 8 | 9 | #Construct BWT from last char of sorted cyclic suffixes 10 | BWT = '' 11 | Suffixes = sorted(Suffixes) 12 | for suffix in Suffixes: 13 | BWT += suffix[len(suffix)-1] 14 | 15 | return BWT 16 | 17 | def BWTReconstruction(BWT): 18 | #Sort BWT to find FirstColumn 19 | BWT_list = [] 20 | for nt in BWT: 21 | BWT_list.append(nt) 22 | BWT_list = sorted(BWT_list) 23 | 24 | #Assign IDs to nts 25 | FirstColumn = [] 26 | LastColumn = [] 27 | Index = {'A':1, 'C':1, 'G':1, 'T':1, '$':1} 28 | for nt in BWT_list: 29 | FirstColumn.append(nt+str(Index[nt])) 30 | Index[nt] += 1 31 | Index = {'A':1, 'C':1, 'G':1, 'T':1, '$':1} 32 | for nt in BWT: 33 | LastColumn.append(nt+str(Index[nt])) 34 | Index[nt] += 1 35 | 36 | #Reconstruct 37 | Invert = [] 38 | Length = len(FirstColumn)-1 39 | Find = '$1' 40 | while len(Invert) < Length: 41 | for i in range(len(LastColumn)): 42 | if LastColumn[i] == Find: 43 | Invert.append(FirstColumn[i]) 44 | Find = FirstColumn[i] 45 | 46 | #Formatting 47 | Text = '' 48 | for char in Invert: 49 | Text += char[0] 50 | Text += '$' 51 | 52 | return Text 53 | 54 | def Compress(Text): 55 | #Run-length encoding 56 | i = 1 57 | k = 1 58 | S = text[0] 59 | TextCompressed = '' 60 | TextCompressedList = [] 61 | while i < len(text): 62 | if text[i] == S: 63 | k += 1 64 | S = text[i] 65 | i += 1 66 | else: 67 | TextCompressed += str(k)+S 68 | TextCompressedList.append((k,S)) 69 | k = 1 70 | S = text[i] 71 | i += 1 72 | TextCompressedList.append((k,S)) 73 | TextCompressed += str(k)+S 74 | 75 | return TextCompressed, TextCompressedList 76 | 77 | def Decompress(CompressedTextList): 78 | #Run-length decoding 79 | TextDecompressed = '' 80 | for pair in CompressedTextList: 81 | TextDecompressed += pair[0]*pair[1] 82 | 83 | return TextDecompressed 84 | 85 | '''==============[Demonstration]==============''' 86 | 87 | #Short string of nts with CpG repeats to display utility 88 | Text = 'ATTATCCCTCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCCGCGCTTATATACGCCTGGTCA$' 89 | print 'Original Text' 90 | print Text 91 | 92 | #A BWT naturally converts CpG repeats into C and G runs 93 | #Very useful for run-length encoding a full genome 94 | BWT = BWTConstruction(Text) 95 | print 'Constructed BWT' 96 | print BWT 97 | 98 | #Run-length encoding 99 | BWTCompressed, BWTCompressedList = Compress(BWT) 100 | print 'Compressed BWT' 101 | print BWTCompressed 102 | 103 | #Run-length decoding 104 | BWTDecompressed = Decompress(BWTCompressedList) 105 | print 'Decompressed BWT' 106 | print BWTDecompressed 107 | 108 | #Inverse BWT 109 | TextReconstructed = BWTReconstruction(BWT) 110 | print 'Reconstructed Text' 111 | print TextReconstructed 112 | -------------------------------------------------------------------------------- /Simple-Programs/Open Reading Frames.py: -------------------------------------------------------------------------------- 1 | #Setup DNA Codon Table 2 | infile = open("DNA_Codon_Table.txt", "r") 3 | 4 | codons = {} 5 | for line in infile: 6 | codons[line[0:3]] = line[4] 7 | 8 | #Find reverse compliment 9 | DNAin = "AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG" 10 | DNAout = "" 11 | 12 | y = len(DNAin) - 1 13 | 14 | while y > -1: 15 | if DNAin[y] == "A": 16 | DNAout = DNAout + "T" 17 | elif DNAin[y] == "T": 18 | DNAout = DNAout + "A" 19 | elif DNAin[y] == "C": 20 | DNAout = DNAout + "G" 21 | elif DNAin[y] == "G": 22 | DNAout = DNAout + "C" 23 | y = y - 1 24 | 25 | DNA_f = DNAin 26 | DNA_r = DNAout 27 | 28 | Proteins = [] 29 | Protein = "" 30 | 31 | #Start with forward 32 | x = 0 33 | y = 3 34 | z = 0 35 | w = 0 36 | 37 | while y < len(DNA_f): 38 | if codons[DNA_f[x:y]] == "/": 39 | if w == 1: 40 | break 41 | if codons[DNA_f[x:y]] == "M": 42 | z = 1 43 | w = 1 44 | if z == 1: 45 | Protein = Protein + codons[DNA_f[x:y]] 46 | x += 3 47 | y += 3 48 | Proteins.append(Protein) 49 | Protein = "" 50 | 51 | x = 1 52 | y = 4 53 | z = 0 54 | w = 0 55 | 56 | while y < len(DNA_f): 57 | if codons[DNA_f[x:y]] == "/": 58 | if w == 1: 59 | break 60 | if codons[DNA_f[x:y]] == "M": 61 | z = 1 62 | w = 1 63 | if z == 1: 64 | Protein = Protein + codons[DNA_f[x:y]] 65 | x += 3 66 | y += 3 67 | Proteins.append(Protein) 68 | Protein = "" 69 | 70 | x = 2 71 | y = 5 72 | z = 0 73 | w = 0 74 | 75 | while y < len(DNA_f): 76 | if codons[DNA_f[x:y]] == "/": 77 | if w == 1: 78 | break 79 | if codons[DNA_f[x:y]] == "M": 80 | z = 1 81 | w = 1 82 | if z == 1: 83 | Protein = Protein + codons[DNA_f[x:y]] 84 | x += 3 85 | y += 3 86 | Proteins.append(Protein) 87 | Protein = "" 88 | 89 | #Now with reverse 90 | x = 0 91 | y = 3 92 | z = 0 93 | w = 0 94 | 95 | while y < len(DNA_r): 96 | if codons[DNA_r[x:y]] == "/": 97 | if w == 1: 98 | break 99 | if codons[DNA_r[x:y]] == "M": 100 | z = 1 101 | w = 1 102 | if z == 1: 103 | Protein = Protein + codons[DNA_r[x:y]] 104 | x += 3 105 | y += 3 106 | Proteins.append(Protein) 107 | Protein = "" 108 | 109 | x = 1 110 | y = 4 111 | z = 0 112 | w = 0 113 | 114 | while y < len(DNA_r): 115 | if codons[DNA_r[x:y]] == "/": 116 | if w == 1: 117 | break 118 | if codons[DNA_r[x:y]] == "M": 119 | z = 1 120 | w = 1 121 | if z == 1: 122 | Protein = Protein + codons[DNA_r[x:y]] 123 | x += 3 124 | y += 3 125 | Proteins.append(Protein) 126 | Protein = "" 127 | 128 | x = 2 129 | y = 5 130 | z = 0 131 | w = 0 132 | 133 | while y < len(DNA_r): 134 | if codons[DNA_r[x:y]] == "/": 135 | if w == 1: 136 | break 137 | if codons[DNA_r[x:y]] == "M": 138 | z = 1 139 | w = 1 140 | if z == 1: 141 | Protein = Protein + codons[DNA_r[x:y]] 142 | x += 3 143 | y += 3 144 | Proteins.append(Protein) 145 | 146 | print Proteins 147 | 148 | #Output -> ['MGMTPRLGLESLLE', 'M', '', 'M', 'M', 'MLLGSFRLIPKETLIQVAGSSPCNLS'] 149 | -------------------------------------------------------------------------------- /Multiple Alignment.py: -------------------------------------------------------------------------------- 1 | #Simple scoring method (+1 match / -1 mismatch) 2 | def score(vwu): 3 | if vwu[0] == vwu[1] and vwu[0] == vwu[2]: 4 | return 1 5 | else: 6 | return -1 7 | 8 | def MultipleAlignment(v, w, u): 9 | #Initialize 3D scoring matrix 10 | s = [[[0 for j in range(len(w)+1)] for i in range(len(v)+1)] for k in range(len(u)+1)] 11 | backtrack = [[[0 for j in range(len(w)+1)] for i in range(len(v)+1)] for k in range(len(u)+1)] 12 | 13 | #3D scoring / s[k][i][j] 14 | for k in range(1,len(u)+1): 15 | for i in range(1,len(v)+1): 16 | for j in range(1,len(w)+1): 17 | 18 | values = [ 19 | s[k][i-1][j], #0 20 | s[k][i][j-1], #1 21 | s[k-1][i][j], #2 22 | s[k][i-1][j-1], #3 23 | s[k-1][i-1][j], #4 24 | s[k-1][i][j-1], #5 25 | s[k-1][i-1][j-1]+score(u[k-1]+v[i-1]+w[j-1]) #6 26 | ] 27 | 28 | s[k][i][j] = max(values) 29 | backtrack[k][i][j] = values.index(s[k][i][j]) 30 | 31 | #Backtracking and output 32 | i = len(v) 33 | j = len(w) 34 | k = len(u) 35 | V = '' 36 | W = '' 37 | U = '' 38 | while i+j+k != 0: 39 | if i*j*k == 0: 40 | if i == 0: 41 | V += '-' 42 | else: 43 | V += v[i-1] 44 | i -= 1 45 | if j == 0: 46 | W += '-' 47 | else: 48 | W += w[j-1] 49 | j -= 1 50 | if k == 0: 51 | U += '-' 52 | else: 53 | U += u[k-1] 54 | k -= 1 55 | 56 | elif backtrack[k][i][j] == 0: 57 | V += v[i-1] 58 | W += '-' 59 | U += '-' 60 | i = i-1 61 | elif backtrack[k][i][j] == 1: 62 | V += '-' 63 | W += w[j-1] 64 | U += '-' 65 | j = j-1 66 | elif backtrack[k][i][j] == 2: 67 | V += '-' 68 | W += '-' 69 | U += u[k-1] 70 | k = k-1 71 | elif backtrack[k][i][j] == 3: 72 | V += v[i-1] 73 | W += w[j-1] 74 | U += '-' 75 | i = i-1 76 | j = j-1 77 | elif backtrack[k][i][j] == 4: 78 | V += v[i-1] 79 | W += '-' 80 | U += u[k-1] 81 | i = i-1 82 | k = k-1 83 | elif backtrack[k][i][j] == 5: 84 | V += '-' 85 | W += w[j-1] 86 | U += u[k-1] 87 | j = j-1 88 | k = k-1 89 | elif backtrack[k][i][j] == 6: 90 | V += v[i-1] 91 | W += w[j-1] 92 | U += u[k-1] 93 | i = i-1 94 | j = j-1 95 | k = k-1 96 | 97 | return int(s[len(u)][len(v)][len(w)]), V[::-1], W[::-1], U[::-1] 98 | 99 | score, V, W, U = MultipleAlignment('ACGATACGT', 'CCCATTAAGT', 'GACTATAGAA') 100 | 101 | print 'Score: ' + str(score) 102 | print V 103 | print W 104 | print U 105 | -------------------------------------------------------------------------------- /Simple-Programs/Finding Shared DNA Motifs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Text file containg DNA sequences 3 | For example... 4 | 5 | >Seq_1 6 | AGATGAGCACGTGCATTCTAAATAATCAAGCGCGATGTCCTACGTATAGTTGAGGTCTAC 7 | GTACTATCAACCTACCCTTATGAATCGCTGTTCCATATCCAGAGGTCGTCTAGTGAACTC 8 | TTAAGCAGGTTGCATGCCAAGATAGCGCCATACACGATGGTTTTCGGGTCGGCTCCGCGC 9 | GGGTACACATATA 10 | >Seq_2 11 | CTGGCGGCAAAGCACCGAACCGTATACCGACGTGACGGTCGCATGCTTGATGCATTGCAC 12 | GACAGAGGAGGAACTCCCTTTAGTGTAAGCACGAGCTCGACAGATAATTATAGATTTCTG 13 | GCCTTTGACTATAACCCTTATGAATCGCTGTTCCATGGGCTCGCTCACAGAAGGTGTAGG 14 | TGAACGTTGCGTGTGACGACGGGACGCAGGCCGCACGCCCCTACGTCGAGACACAGCTTA 15 | GTCTACAAGTAGTTTGATGCATGTGTGGACTTGAGCCCCTTGGTAGGCTGTGTGGAAGGG 16 | GGTTGATCATCGTTGTCATAATGCCGCGGAACAAACATATCCCCCATGTTCGGTCACCAA 17 | >Seq_3 18 | AGAAACCTCGTCGCCCCTCAGAATCACAGACCGAGGAGATGATAAATCCGTTAGTATAAA 19 | CTCAGGTCTTACCCTTATGAATCGCTGTTCCATTACGCTGAGATTCTCTGCTCGAGAGAC 20 | TGATTAATCCCTCCTTGTCTTTAACAAGATGCGCAGCCCTCGATTTACGCGGATTTACAC 21 | >Seq_4 22 | TTGCCATTGTCACCGGATCTACTATTTGATCCAGCATGGGGTTTCTTAATTCATTAAACA 23 | ATATCCGGCTCTATCCCTCGGACGGCGTGAATGAACACTTTTGAGGGGACTTAATCCGAT 24 | TTTTTATGGCCACTGACACCCTTATGAATCGCTGTTCCATCTTCATGGGCCGTTAGTATA 25 | AACTCAGGTCTTTGCTCCACTTTGTTGTTGGTCTCTTTATTGATTTCCTACGCTGAGATT 26 | GTGAGCTTGCTCGCTTGGAGCGAAAAA 27 | 28 | ''' 29 | 30 | infile = open("strings.txt", "r") 31 | 32 | temp_string = "" 33 | strings = [] 34 | x = 0 35 | 36 | #Load strings from text file 37 | for string in infile: 38 | if string[0] != ">": 39 | x = 1 40 | temp_string = temp_string + string 41 | if string[0] == ">": 42 | if x == 1: 43 | strings.append(temp_string) 44 | temp_string = "" 45 | 46 | 47 | #Remove midstring line breaks 48 | temp_strings = [] 49 | for string in strings: 50 | listing = list(string) 51 | for pos in range(0,len(listing)-1): 52 | if listing[pos] == "\n": 53 | listing[pos] = "" 54 | temp_string = "".join(listing) 55 | temp_strings.append(temp_string) 56 | strings = temp_strings 57 | ref = strings[0] 58 | strings.remove(strings[0]) 59 | 60 | #Setup list for appending of all motifs 61 | all_motifs = [] 62 | 63 | #Function for finding substrings between str and ref 64 | def find_substrings(str, ref): 65 | motifs = [] 66 | match = 0 67 | x = 0 68 | y = 1 69 | for j in range(0, 2): 70 | while y < len(str): 71 | for i in range(0, len(str)): 72 | if str[i:i+len(ref[x:y])] == ref[x:y]: 73 | match = 1 74 | if match == 1: 75 | if ref[x:y] != "\n": 76 | motifs.append(ref[x:y]) 77 | y += 1 78 | match = 0 79 | elif match == 0: 80 | x += 1 81 | y = x + 1 82 | return motifs 83 | 84 | 85 | for seq in strings: 86 | all_motifs.append(find_substrings(seq, ref)) 87 | 88 | #Setup list for appending common motifs 89 | common_motifs = [] 90 | 91 | for substring in all_motifs[0]: 92 | w = 0 93 | for part in all_motifs: 94 | w = 0 95 | for sub in part: 96 | if substring == sub: 97 | w = 1 98 | if w == 1: 99 | common_motifs.append(substring) 100 | 101 | print max(common_motifs, key = len) 102 | 103 | infile.close() 104 | 105 | #Output looks something like this -> ACCCTTATGAATCGCTGTTCCAT 106 | -------------------------------------------------------------------------------- /Cyclopeptide Sequencing (Ideal Spectrum).py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | infile = open('AA_MassTablet', 'r') 4 | aa = {} 5 | for line in infile: 6 | aa[line[0]] = line[2:].strip('\n') 7 | infile.close() 8 | 9 | #Ideal experimental spectrum 10 | SpectrumIdealTemp = '0 71 97 99 101 101 113 114 129 131 163 186 202 202 211 227 228 230 230 234 260 287 299 301 324 329 331 331 359 365 374 388 400 413 430 430 445 460 462 464 487 501 510 514 517 531 558 561 561 576 593 611 615 616 630 632 673 675 689 690 694 712 729 744 744 747 774 788 791 795 804 818 841 843 845 860 875 875 892 905 917 931 940 946 974 974 976 981 1004 1006 1018 1045 1071 1075 1075 1077 1078 1094 1103 1103 1119 1142 1174 1176 1191 1192 1204 1204 1206 1208 1234 1305' 11 | SpectrumIdeal = SpectrumIdealTemp.split() 12 | 13 | def LinearSpectrum(peptide): 14 | #Find the linear spectrum of the peptide 15 | peptides = [] 16 | sizes = ['0'] 17 | peptides.append(peptide) 18 | n = len(peptide) 19 | for window in range(1,n): 20 | for i in range(0,n-window+1): 21 | peptides.append(peptide[i:i+window]) 22 | for string in peptides: 23 | total = 0 24 | for aminoacid in string: 25 | total += int(aa[aminoacid]) 26 | sizes.append(str(total)) 27 | return sizes 28 | 29 | def CheckCompatibility(SpectrumCheck, SpectrumIdeal): 30 | #Check compatibility with the ideal spectrum 31 | temp = deepcopy(SpectrumIdeal) 32 | compatibility = True 33 | for SpecProt in SpectrumCheck: 34 | present = 0 35 | for SpecNum in temp: 36 | if SpecProt == SpecNum: 37 | present = 1 38 | if present == 1: 39 | temp.remove(SpecProt) 40 | if present == 0: 41 | compatibility = False 42 | return compatibility 43 | return compatibility 44 | 45 | def FindMass(spectrum): 46 | temp = [] 47 | for num in spectrum: 48 | temp.append(int(num)) 49 | temp = sorted(temp) 50 | return temp[len(temp)-1] 51 | 52 | def IdealSequencing(SpectrumIdeal): 53 | #Find correct mass of the ideal spectrum 54 | CorrectMass = FindMass(SpectrumIdeal) 55 | 56 | Combos = [] #Store potential peptides 57 | Correct = [] #Store correct peptides 58 | for letter in aa: 59 | Combos.append(letter) 60 | 61 | #Recursive method for building and checking peptides 62 | while len(Combos) != 0: 63 | for string in Combos: 64 | for aminoacid in aa: 65 | if CheckCompatibility(LinearSpectrum(string+aminoacid), SpectrumIdeal) == True: 66 | 67 | #If compatible thus far, and mass is correct, add to correct pile 68 | if FindMass(LinearSpectrum(string+aminoacid)) == CorrectMass: 69 | Correct.append(string+aminoacid) 70 | 71 | #If compatible thus far, but mass is too small, retry with another amino acid appended next round 72 | elif FindMass(LinearSpectrum(string+aminoacid)) < CorrectMass: 73 | Combos.append(string+aminoacid) 74 | 75 | Combos.remove(string) 76 | 77 | #Selects unique peptides 78 | unique = {} 79 | for peptide in Correct: 80 | string = '' 81 | for aminoacid in peptide: 82 | if aminoacid == 'L' or aminoacid == 'I': 83 | string += '(L/I)' 84 | else: 85 | string += aminoacid 86 | unique[string] = 1 87 | return unique 88 | 89 | Peptides = IdealSequencing(SpectrumIdeal) 90 | 91 | for peptide in Peptides: 92 | print peptide 93 | -------------------------------------------------------------------------------- /Evolutionary Tree Reconstruction (Neighbor Joining).py: -------------------------------------------------------------------------------- 1 | import copy 2 | import operator 3 | 4 | infile = open('Matrix.txt', 'r') 5 | n = 9 #Size of matrix 6 | m = 9 #Start numbering unkown ancestors 7 | species = ['Cow', 'Pig', 'Horse', 'Mouse', 'Dog', 'Cat', 'Turkey', 'Civet', 'Human'] 8 | 9 | #Setup starting matrix D and fill it in 10 | D = {} 11 | for i in species: 12 | D[i] = {} 13 | for j in species: 14 | D[i][j] = 0 15 | i = 0 16 | for line in infile: 17 | j = 0 18 | for number in line.split(): 19 | D[species[i]][species[j]] = int(number) 20 | j += 1 21 | i += 1 22 | infile.close() 23 | 24 | #Use to visualize matrix D/D* 25 | def Visualize(D): 26 | temp = ' ' 27 | for cluster in D: 28 | temp += str(cluster) + ' ' 29 | print temp 30 | for line in D: 31 | print D[line] 32 | 33 | #Transform matrix D to D* and find closest neighbors Ni and Nj 34 | def TransformMatrix(D): 35 | #Find Total Distance 36 | TotalDistance = {} 37 | for i in D: 38 | total = 0 39 | for j in D: 40 | total += D[i][j] 41 | TotalDistance[i] = total 42 | 43 | #Create D* 44 | Dstar = {} 45 | for i in D: 46 | Dstar[i] = {} 47 | for j in D: 48 | Dstar[i][j] = 0 49 | for i in D: 50 | for j in D: 51 | if i == j: 52 | Dstar[i][j] = 0 53 | else: 54 | Dstar[i][j] = (n-2)*D[i][j]-TotalDistance[i]-TotalDistance[j] 55 | 56 | #Find minimum neighbors 57 | Dstar_values = [] 58 | for i in Dstar: 59 | for j in Dstar: 60 | if i != j: 61 | Dstar_values.append([Dstar[i][j],[i,j]]) 62 | Ni = min(Dstar_values)[1][0] 63 | Nj = min(Dstar_values)[1][1] 64 | return Ni, Nj, Dstar, TotalDistance 65 | 66 | #Remove Ni and Nj from matrix D 67 | def RemoveClusters(D,Ni,Nj): 68 | del D[Ni] 69 | del D[Nj] 70 | for i in D: 71 | del D[i][Ni] 72 | del D[i][Nj] 73 | return D 74 | 75 | #Add unkown ancestor m to matrix D 76 | def AddCluster(D,D_temp,m,Ni,Nj): 77 | D[m] = {} 78 | for k in D: 79 | if k != m: 80 | D[m][k] = (0.5)*(D_temp[k][Ni]+D_temp[k][Nj]-D_temp[Ni][Nj]) 81 | for k in D: 82 | if k != m: 83 | D[k][m] = (0.5)*(D_temp[k][Ni]+D_temp[k][Nj]-D_temp[Ni][Nj]) 84 | if k == m: 85 | D[m][m] = 0 86 | return D 87 | 88 | #Add new edges m->Ni and m->Nj to adjacency list 89 | def MakeLimbs(D_temp,m,Ni,Nj,TotalDistance): 90 | delta = (TotalDistance[Ni]-TotalDistance[Nj])/(n-2) 91 | LimbLength_Ni = (0.5)*(D_temp[Ni][Nj]+delta) 92 | LimbLength_Nj = (0.5)*(D_temp[Ni][Nj]-delta) 93 | adj_list.append([m, str(m) + '->' + str(Ni) + ':' + str(int(LimbLength_Ni))]) 94 | adj_list.append([Ni, str(Ni) + '->' + str(m) + ':' + str(int(LimbLength_Ni))]) 95 | adj_list.append([m, str(m) + '->' + str(Nj) + ':' + str(int(LimbLength_Nj))]) 96 | adj_list.append([Nj, str(Nj) + '->' + str(m) + ':' + str(int(LimbLength_Nj))]) 97 | 98 | #Neighbor Joining 99 | adj_list = [] 100 | while n != 2: 101 | Ni, Nj, Dstar, TotalDistance = TransformMatrix(D) 102 | D_temp = copy.deepcopy(D) 103 | RemoveClusters(D,Ni,Nj) 104 | D = AddCluster(D,D_temp,m,Ni,Nj) 105 | MakeLimbs(D_temp,m,Ni,Nj,TotalDistance) 106 | m += 1 107 | n -= 1 108 | 109 | if n == 2: 110 | for Mi in D: 111 | for Mj in D: 112 | if Mi != Mj: 113 | adj_list.append([Mi, str(Mi) + '->' + str(Mj) + ':' + str(int(D[Mi][Mj]))]) 114 | 115 | #Print adjacency list 116 | for edge in sorted(adj_list): 117 | print edge[1] 118 | -------------------------------------------------------------------------------- /Genome Assembly with Single Reads.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from random import randint 3 | from copy import deepcopy 4 | 5 | infile = open('SingleReads.txt', 'r') 6 | DNA = [] 7 | for line in infile: 8 | DNA.append(line.strip('\n')) 9 | DNA = sorted(DNA) 10 | infile.close() 11 | 12 | def suffix(kmer): 13 | suffix = '' 14 | for i in range (1,len(kmer)): 15 | suffix += kmer[i] 16 | return suffix 17 | 18 | def prefix(kmer): 19 | prefix = '' 20 | for i in range (0,len(kmer)-1): 21 | prefix += kmer[i] 22 | return prefix 23 | 24 | def AssembleGenome(DNA): 25 | #Represent every k-mer in Patterns as an isolated edge between its prefix and suffix 26 | edges = [] # edged = [kmer : [prefix(kmer), suffix(kmer)]] 27 | for kmer in DNA: 28 | edges.append(kmer) 29 | edges.append([prefix(kmer),suffix(kmer)]) 30 | 31 | #Find prefixes/suffixes 32 | prefixes = [] 33 | suffixes = [] 34 | i = 0 35 | while i < len(edges): 36 | prefixes.append(edges[i+1][0]) 37 | suffixes.append(edges[i+1][1]) 38 | i += 2 39 | 40 | #Sort pairs 41 | pairs = {} 42 | i = 0 43 | while i < len(prefixes): 44 | pairs.setdefault(prefixes[i],[]) 45 | pairs[prefixes[i]].append(suffixes[i]) 46 | i += 1 47 | pairs = collections.OrderedDict(sorted(pairs.items())) 48 | 49 | #Assign strings to unique IDs 50 | numbers = {} 51 | rev_numbers = {} 52 | i = 0 53 | for pair in pairs: 54 | numbers[pair] = str(i) 55 | rev_numbers[str(i)] = pair 56 | i += 1 57 | 58 | for pair in pairs: 59 | for end in pairs[pair]: 60 | numbers.setdefault(end, 'empty') 61 | if numbers[end] == 'empty': 62 | numbers[end] = str(i) 63 | rev_numbers[str(i)] = end 64 | i += 1 65 | 66 | adj_list = {} 67 | circuit_max = 0 68 | for start in pairs: 69 | for end in pairs[start]: 70 | adj_list.setdefault(numbers[start], []) 71 | adj_list[numbers[start]].append(numbers[end]) 72 | circuit_max += 1 73 | 74 | #Reduced adjacency list to keep track of traveled edges 75 | red_adj_list = {} 76 | red_adj_list = deepcopy(adj_list) 77 | 78 | #Find start and end node 79 | start = {} 80 | for one in red_adj_list: 81 | start.setdefault(one, 0) 82 | start[one] += len(red_adj_list[one]) 83 | end = {} 84 | for one in red_adj_list: 85 | for two in red_adj_list[one]: 86 | end.setdefault(two, 0) 87 | end[two] += 1 88 | for one in end: 89 | try: 90 | if start[one] != end[one]: 91 | if start[one] > end[one]: 92 | start_node = one 93 | if start[one] < end[one]: 94 | end_node = one 95 | except KeyError: 96 | end_node = one 97 | for one in start: 98 | try: 99 | if end[one] != start[one]: 100 | if end[one] < start[one]: 101 | start_node = one 102 | if end[one] > start: 103 | end_node = one 104 | except KeyError: 105 | start_node = one 106 | 107 | #Set end 108 | red_adj_list[end_node] = [] 109 | 110 | #Starting node (if graph is directed/ubalanced) 111 | start = start_node 112 | curr_vrtx = start_node 113 | 114 | path = [] 115 | path.append(curr_vrtx) 116 | stack = [] 117 | circuit = [] 118 | while len(circuit) != circuit_max: 119 | 120 | if red_adj_list[curr_vrtx] != []: #If neighbors exist 121 | stack.append(curr_vrtx) 122 | pick = randint(0,len(red_adj_list[curr_vrtx])-1) 123 | temp = deepcopy(curr_vrtx) 124 | curr_vrtx = red_adj_list[temp][pick] 125 | red_adj_list[temp].remove(curr_vrtx) 126 | 127 | else: 128 | circuit.append(curr_vrtx) 129 | curr_vrtx = stack[len(stack)-1] 130 | stack.pop() 131 | 132 | #Match IDs to strings 133 | circuit = [start] + circuit[::-1] 134 | corr_order = [] 135 | for vrtx in circuit: 136 | corr_order.append(rev_numbers[vrtx]) 137 | 138 | #Formatting 139 | genome = '' 140 | genome += corr_order[0] 141 | k = len(DNA[0]) 142 | i = 1 143 | while i < len(corr_order): 144 | genome += corr_order[i][k-2] 145 | i += 1 146 | 147 | return genome 148 | 149 | print AssembleGenome(DNA) 150 | -------------------------------------------------------------------------------- /Genome Assembly with Paired Reads.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from random import randint 3 | from copy import deepcopy 4 | 5 | infile = open('PairedReads.txt', 'r') 6 | DNA = [] 7 | for line in infile: 8 | DNA.append(line.strip('\n')) 9 | infile.close() 10 | 11 | k = 30 #Read length 12 | d = 100 #Distance between pairs 13 | 14 | def prefix(kmer_pair): 15 | pref = '' 16 | f = (len(kmer_pair)-1)/2 17 | pref += (kmer_pair[:f-1]) 18 | pref += '|' 19 | pref += (kmer_pair[f+1:len(kmer_pair)-1]) 20 | return pref 21 | 22 | def suffix(kmer_pair): 23 | suff = '' 24 | f = (len(kmer_pair)-1)/2 25 | suff += (kmer_pair[1:f]) 26 | suff += '|' 27 | suff += (kmer_pair[len(kmer_pair)-f+1:]) 28 | return suff 29 | 30 | def AssembleGenome(DNA, k, d): 31 | #Represent every k-mer in Patterns as an isolated edge between its prefix and suffix 32 | edges = [] # edged = [kmer : [prefix(kmer), suffix(kmer)]] 33 | for kmer in DNA: 34 | edges.append(kmer) 35 | edges.append([prefix(kmer),suffix(kmer)]) 36 | 37 | #Find prefixes/suffixes 38 | prefixes = [] 39 | suffixes = [] 40 | i = 0 41 | while i < len(edges): 42 | prefixes.append(edges[i+1][0]) 43 | suffixes.append(edges[i+1][1]) 44 | i += 2 45 | 46 | #Sort pairs 47 | pairs = {} 48 | i = 0 49 | while i < len(prefixes): 50 | pairs.setdefault(prefixes[i],[]) 51 | pairs[prefixes[i]].append(suffixes[i]) 52 | i += 1 53 | pairs = collections.OrderedDict(sorted(pairs.items())) 54 | 55 | #Assign strings to unique IDs 56 | numbers = {} 57 | rev_numbers = {} 58 | i = 0 59 | for pair in pairs: 60 | numbers[pair] = str(i) 61 | rev_numbers[str(i)] = pair 62 | i += 1 63 | 64 | for pair in pairs: 65 | for end in pairs[pair]: 66 | numbers.setdefault(end, 'empty') 67 | if numbers[end] == 'empty': 68 | numbers[end] = str(i) 69 | rev_numbers[str(i)] = end 70 | i += 1 71 | 72 | adj_list = {} 73 | circuit_max = 0 74 | for start in pairs: 75 | for end in pairs[start]: 76 | adj_list.setdefault(numbers[start], []) 77 | adj_list[numbers[start]].append(numbers[end]) 78 | circuit_max += 1 79 | 80 | #Reduced adjacency list to keep track of traveled edges 81 | red_adj_list = {} 82 | red_adj_list = deepcopy(adj_list) 83 | 84 | #Find start and end node 85 | start = {} 86 | for one in red_adj_list: 87 | start.setdefault(one, 0) 88 | start[one] += len(red_adj_list[one]) 89 | end = {} 90 | for one in red_adj_list: 91 | for two in red_adj_list[one]: 92 | end.setdefault(two, 0) 93 | end[two] += 1 94 | for one in end: 95 | try: 96 | if start[one] != end[one]: 97 | if start[one] > end[one]: 98 | start_node = one 99 | if start[one] < end[one]: 100 | end_node = one 101 | except KeyError: 102 | end_node = one 103 | for one in start: 104 | try: 105 | if end[one] != start[one]: 106 | if end[one] < start[one]: 107 | start_node = one 108 | if end[one] > start: 109 | end_node = one 110 | except KeyError: 111 | start_node = one 112 | 113 | #Set end 114 | red_adj_list[end_node] = [] 115 | 116 | #Starting node (if graph is directed/ubalanced) 117 | start = start_node 118 | curr_vrtx = start_node 119 | 120 | path = [] 121 | path.append(curr_vrtx) 122 | stack = [] 123 | circuit = [] 124 | while len(circuit) != circuit_max: 125 | 126 | if red_adj_list[curr_vrtx] != []: #If neighbors exist 127 | stack.append(curr_vrtx) 128 | pick = randint(0,len(red_adj_list[curr_vrtx])-1) 129 | temp = deepcopy(curr_vrtx) 130 | curr_vrtx = red_adj_list[temp][pick] 131 | red_adj_list[temp].remove(curr_vrtx) 132 | 133 | else: 134 | circuit.append(curr_vrtx) 135 | curr_vrtx = stack[len(stack)-1] 136 | stack.pop() 137 | 138 | #Match IDs to strings 139 | circuit = [start] + circuit[::-1] 140 | corr_order = [] 141 | for vrtx in circuit: 142 | corr_order.append(rev_numbers[vrtx]) 143 | 144 | #Formatting 145 | a = [] 146 | b = [] 147 | for kmer in corr_order: 148 | a.append(kmer[:k-1]) 149 | b.append(kmer[k:]) 150 | prefixstring = '' 151 | for kmer in a: 152 | prefixstring += kmer[0] 153 | suffixstring = '' 154 | for kmer in b: 155 | suffixstring += kmer[0] 156 | genome = '' 157 | genome += prefixstring[0:k+d] 158 | genome += suffixstring 159 | genome += b[len(b)-1][1:] 160 | 161 | return genome 162 | 163 | print AssembleGenome(DNA, k, d) 164 | -------------------------------------------------------------------------------- /Genome Graph Functions.py: -------------------------------------------------------------------------------- 1 | print ChromosomeToCycle('(+1 -2 -3 +4)') #Output -> (1 2 4 3 6 5 7 8) 2 | def ChromosomeToCycle(Chromosome): 3 | chr_for = [] 4 | active = 0 5 | temp = '' 6 | for char in Chromosome: 7 | if char == ' ' or char == ')': 8 | active = 2 9 | if active == 1: 10 | temp += char 11 | if char == '+' or char == '-': 12 | active = 1 13 | if char == '-': 14 | temp += '-' 15 | if active == 2: 16 | chr_for.append(int(temp)) 17 | temp = ' ' 18 | active = 0 19 | 20 | P_und = [] 21 | P_cyc = [] 22 | for num in chr_for: 23 | if num > 0: 24 | P_cyc.append(2*num-1) 25 | P_cyc.append(2*num) 26 | elif num < 0: 27 | P_cyc.append(2*(-1*num)) 28 | P_cyc.append(2*(-1*num)-1) 29 | 30 | cycle = '(' 31 | for num in P_cyc: 32 | cycle += str(num) + ' ' 33 | cycle = cycle.strip(' ') 34 | cycle += ')' 35 | return cycle 36 | 37 | 38 | print CycleToChromosome('(1 2 4 3 6 5 7 8)') #Output -> (+1 -2 -3 +4) 39 | def CycleToChromosome(Cycle): 40 | cyc_for = [] 41 | active = 0 42 | temp = '' 43 | for char in Cycle: 44 | if active == 1: 45 | if char == ' ' or char == ')': 46 | active = 2 47 | if active == 1: 48 | temp += char 49 | if active == 0: 50 | if char == '(' or char == ' ': 51 | active = 1 52 | if active == 2: 53 | cyc_for.append(int(temp)) 54 | temp = ' ' 55 | active = 1 56 | 57 | chrom_unf = [] 58 | for i in range(1,len(cyc_for)/2+1): 59 | if cyc_for[2*i-2] < cyc_for[2*i-1]: 60 | chrom_unf.append('+' + str(cyc_for[2*i-1]/2)) 61 | else: 62 | chrom_unf.append('-' + str(cyc_for[2*i-2]/2)) 63 | 64 | chrom_for = '(' 65 | for num in chrom_unf: 66 | chrom_for += num + ' ' 67 | chrom_for = chrom_for.strip(' ') 68 | chrom_for += ')' 69 | return chrom_for 70 | 71 | 72 | print ColoredEdges('(+1 -2 -3)(-4 +5 -6)') #Output -> [(2, 4), (3, 6), (5, 1), (7, 9), (10, 12), (11, 8)] 73 | def ColoredEdges(Genome): 74 | genomes = [] 75 | genome_temp = '' 76 | for char in Genome: 77 | if char == ')': 78 | genome_temp +=char 79 | genomes.append(genome_temp) 80 | genome_temp = '' 81 | if char != ')': 82 | genome_temp += char 83 | 84 | nodes = [] 85 | for chrom in genomes: 86 | nodes.append(ChromosomeToCycle(chrom)) 87 | nodes_lst = [] 88 | for chrom in nodes: 89 | nodes_tmp = [] 90 | active = 0 91 | temp = '' 92 | for char in chrom: 93 | if active == 1: 94 | if char == ' ' or char == ')': 95 | active = 2 96 | if active == 1: 97 | temp += char 98 | if active == 0: 99 | if char == '(' or char == ' ': 100 | active = 1 101 | if active == 2: 102 | nodes_tmp.append(int(temp)) 103 | temp = ' ' 104 | active = 1 105 | nodes_lst.append(nodes_tmp) 106 | 107 | colored_edges = [] 108 | for chrom in nodes_lst: 109 | j = 1 110 | while j < len(chrom)-2: 111 | colored_edges.append((chrom[j],chrom[j+1])) 112 | j += 2 113 | colored_edges.append((chrom[len(chrom)-1], chrom[0])) 114 | return colored_edges 115 | 116 | 117 | print GraphToGenome([(2, 4), (3, 6), (5, 1), (7, 9), (10, 12), (11, 8)]) #Output -> (+1 -2 -3)(-4 +5 -6) 118 | def GraphToGenome(Graph): 119 | edges = [] 120 | for pair in Graph: 121 | if pair[0] % 2 == 0: 122 | x = pair[0]/2 123 | else: 124 | x = ((pair[0]+1)/2)*-1 125 | if pair[1] % 2 == 0: 126 | y = pair[1]/2 127 | else: 128 | y = ((pair[1]+1)/2)*-1 129 | edges.append((x,y)) 130 | 131 | chromosomes = [] 132 | while len(edges) != 0: 133 | start = edges[0][0] 134 | end = edges[0][1] 135 | edges.remove((start,end)) 136 | chromosome = [start] 137 | 138 | cyc_end = start*-1 139 | while end != cyc_end: 140 | end = end*-1 141 | for pair in edges: 142 | if pair[0] == end: 143 | start = pair[0] 144 | end = pair[1] 145 | break 146 | edges.remove((start,end)) 147 | chromosome.append(start) 148 | chromosomes += [chromosome] 149 | 150 | genome_temp = '' 151 | for chromosome_temp in chromosomes: 152 | genome_temp += '(' 153 | for char in chromosome_temp: 154 | if char > 0: 155 | genome_temp += '+' + str(char) + ' ' 156 | else: 157 | genome_temp += str(char) + ' ' 158 | genome_temp = genome_temp.strip(' ') 159 | genome_temp += ')' 160 | return genome_temp 161 | -------------------------------------------------------------------------------- /Viterbi Learning.py: -------------------------------------------------------------------------------- 1 | infile = open('TransitionMatrix.txt', 'r') 2 | TransitionMatrix = [] 3 | for line in infile: 4 | TransitionMatrix.append(line.strip('\n').split('\t')) 5 | infile.close() 6 | 7 | infile = open('EmissionMatrix.txt', 'r') 8 | EmissionMatrix = [] 9 | for line in infile: 10 | EmissionMatrix.append(line.strip('\n').split('\t')) 11 | infile.close() 12 | 13 | string = 'zzxzzxyxzyyyyxzxxxyxyxzzzxyxxyyxxxyxyyxzyzxyyyzyzxyyzzxzxzyzyyzxyxyxyzzxyzzxxxzyyyzzyxzzxxyzzyxyzyxx' 14 | alphabet = 'x y z'.split() #Observation space 15 | states = 'A B C D'.split() #State space 16 | 17 | TM = {} #Setup transition matrix 18 | i = 0 19 | for state1 in states: 20 | TM[state1] = {} 21 | j = 1 22 | for state2 in states: 23 | TM[state1][state2] = float(TransitionMatrix[i][j]) 24 | j += 1 25 | i += 1 26 | 27 | EM = {} #Setup emission matrix 28 | i = 0 29 | for state in states: 30 | EM[state] = {} 31 | j = 1 32 | for letter in alphabet: 33 | EM[state][letter] = float(EmissionMatrix[i][j]) 34 | j += 1 35 | i += 1 36 | 37 | def Viterbi(string, alphabet, states, TM, EM): 38 | 39 | Pi = 1 #Probability of initial state set to equally likely = 1/|states| 40 | 41 | T1 = {} #Scoring matrix 42 | T2 = {} #Backtracking matrix 43 | for state in states: 44 | T1[state] = [0]*len(string) 45 | T2[state] = [0]*len(string) 46 | 47 | #Initalize scores at source 48 | for state in states: 49 | T1[state][0] = Pi*EM[state][string[0]] 50 | 51 | #Fill in remaining scores through dynamic programming 52 | i = 1 53 | while i < len(string): 54 | for j in states: 55 | values = [] 56 | K = [] 57 | for k in states: 58 | values.append(T1[k][i-1]*TM[k][j]*EM[j][string[i]]) 59 | K.append((T1[k][i-1]*TM[k][j]*EM[j][string[i]], k)) 60 | 61 | #Choose max weight of possible edges (k->j) 62 | T1[j][i] = max(values) 63 | #Remember which edge was chosen for backtracking 64 | for k in K: 65 | if k[0] == max(values): 66 | T2[j][i] = k[1] 67 | i += 1 68 | 69 | #Find maximum sink value and begin most probable path 70 | values = [] 71 | K = [] 72 | for state in T1: 73 | values.append(T1[state][len(string)-1]) 74 | for state in T1: 75 | if T1[state][len(string)-1] == max(values): 76 | path = state 77 | last = state 78 | 79 | #Backtracking 80 | i = len(string)-1 81 | while i > 0: 82 | for state in states: 83 | if T2[last][i] == state: 84 | last = state 85 | path += last 86 | i -= 1 87 | break 88 | 89 | return path[::-1] 90 | 91 | def EstimateParameters(string, alphabet, path, states): 92 | 93 | #Count TM instances 94 | TM_counts = {} 95 | i = 0 96 | while i < len(path)-1: 97 | TM_counts.setdefault(path[i], {}) 98 | TM_counts[path[i]].setdefault(path[i+1], 0) 99 | TM_counts[path[i]][path[i+1]] += 1 100 | i += 1 101 | 102 | #Count EM instances 103 | EM_counts = {} 104 | i = 0 105 | while i < len(path): 106 | EM_counts.setdefault(path[i], {}) 107 | EM_counts[path[i]].setdefault(string[i], 0) 108 | EM_counts[path[i]][string[i]] += 1 109 | i += 1 110 | 111 | #Calculate TM matrix 112 | TM_matrix = {} 113 | for state in states: 114 | TM_matrix.setdefault(state,{}) 115 | total = 0.0 116 | for column in states: 117 | try: 118 | total += TM_counts[state][column] 119 | except KeyError: 120 | pass 121 | if total == 0: 122 | for column in states: 123 | TM_matrix[state][column] = 1.0/len(states) 124 | else: 125 | for column in states: 126 | try: 127 | TM_matrix[state][column] = TM_counts[state][column]/total 128 | except KeyError: 129 | TM_matrix[state][column] = 0 130 | 131 | #Calculate EM matrix 132 | EM_matrix = {} 133 | for state in states: 134 | EM_matrix.setdefault(state,{}) 135 | total = 0.0 136 | for column in alphabet: 137 | try: 138 | total += EM_counts[state][column] 139 | except KeyError: 140 | pass 141 | if total == 0: 142 | for column in alphabet: 143 | EM_matrix[state][column] = 1.0/len(alphabet) 144 | else: 145 | for column in alphabet: 146 | try: 147 | EM_matrix[state][column] = EM_counts[state][column]/total 148 | except KeyError: 149 | EM_matrix[state][column] = 0 150 | 151 | return TM_matrix, EM_matrix 152 | 153 | #Viterbi Learning 154 | iteration = 0 155 | while iteration < 100: 156 | 157 | #Find the hidden path using parameters 158 | path = Viterbi(string, alphabet, states, TM, EM) 159 | 160 | #Estimate new parameters based on the hidden path 161 | TM, EM = EstimateParameters(string, alphabet, path, states) 162 | 163 | iteration += 1 164 | 165 | #Excluded from this point is the code necessary to format/output the final TM/EM 166 | -------------------------------------------------------------------------------- /Local Alignment.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | #Scoring Matrix 4 | pam = {'A': {'A': 2, 'C': -2, 'E': 0, 'D': 0, 'G': 1, 'F': -3, 'I': -1, 'H': -1, 'K': -1, 'M': -1, 'L': -2, 'N': 0, 'Q': 0, 'P': 1, 'S': 1, 'R': -2, 'T': 1, 'W': -6, 'V': 0, 'Y': -3}, 'C': {'A': -2, 'C': 12, 'E': -5, 'D': -5, 'G': -3, 'F': -4, 'I': -2, 'H': -3, 'K': -5, 'M': -5, 'L': -6, 'N': -4, 'Q': -5, 'P': -3, 'S': 0, 'R': -4, 'T': -2, 'W': -8, 'V': -2, 'Y': 0}, 'E': {'A': 0, 'C': -5, 'E': 4, 'D': 3, 'G': 0, 'F': -5, 'I': -2, 'H': 1, 'K': 0, 'M': -2, 'L': -3, 'N': 1, 'Q': 2, 'P': -1, 'S': 0, 'R': -1, 'T': 0, 'W': -7, 'V': -2, 'Y': -4}, 'D': {'A': 0, 'C': -5, 'E': 3, 'D': 4, 'G': 1, 'F': -6, 'I': -2, 'H': 1, 'K': 0, 'M': -3, 'L': -4, 'N': 2, 'Q': 2, 'P': -1, 'S': 0, 'R': -1, 'T': 0, 'W': -7, 'V': -2, 'Y': -4}, 'G': {'A': 1, 'C': -3, 'E': 0, 'D': 1, 'G': 5, 'F': -5, 'I': -3, 'H': -2, 'K': -2, 'M': -3, 'L': -4, 'N': 0, 'Q': -1, 'P': 0, 'S': 1, 'R': -3, 'T': 0, 'W': -7, 'V': -1, 'Y': -5}, 'F': {'A': -3, 'C': -4, 'E': -5, 'D': -6, 'G': -5, 'F': 9, 'I': 1, 'H': -2, 'K': -5, 'M': 0, 'L': 2, 'N': -3, 'Q': -5, 'P': -5, 'S': -3, 'R': -4, 'T': -3, 'W': 0, 'V': -1, 'Y': 7}, 'I': {'A': -1, 'C': -2, 'E': -2, 'D': -2, 'G': -3, 'F': 1, 'I': 5, 'H': -2, 'K': -2, 'M': 2, 'L': 2, 'N': -2, 'Q': -2, 'P': -2, 'S': -1, 'R': -2, 'T': 0, 'W': -5, 'V': 4, 'Y': -1}, 'H': {'A': -1, 'C': -3, 'E': 1, 'D': 1, 'G': -2, 'F': -2, 'I': -2, 'H': 6, 'K': 0, 'M': -2, 'L': -2, 'N': 2, 'Q': 3, 'P': 0, 'S': -1, 'R': 2, 'T': -1, 'W': -3, 'V': -2, 'Y': 0}, 'K': {'A': -1, 'C': -5, 'E': 0, 'D': 0, 'G': -2, 'F': -5, 'I': -2, 'H': 0, 'K': 5, 'M': 0, 'L': -3, 'N': 1, 'Q': 1, 'P': -1, 'S': 0, 'R': 3, 'T': 0, 'W': -3, 'V': -2, 'Y': -4}, 'M': {'A': -1, 'C': -5, 'E': -2, 'D': -3, 'G': -3, 'F': 0, 'I': 2, 'H': -2, 'K': 0, 'M': 6, 'L': 4, 'N': -2, 'Q': -1, 'P': -2, 'S': -2, 'R': 0, 'T': -1, 'W': -4, 'V': 2, 'Y': -2}, 'L': {'A': -2, 'C': -6, 'E': -3, 'D': -4, 'G': -4, 'F': 2, 'I': 2, 'H': -2, 'K': -3, 'M': 4, 'L': 6, 'N': -3, 'Q': -2, 'P': -3, 'S': -3, 'R': -3, 'T': -2, 'W': -2, 'V': 2, 'Y': -1}, 'N': {'A': 0, 'C': -4, 'E': 1, 'D': 2, 'G': 0, 'F': -3, 'I': -2, 'H': 2, 'K': 1, 'M': -2, 'L': -3, 'N': 2, 'Q': 1, 'P': 0, 'S': 1, 'R': 0, 'T': 0, 'W': -4, 'V': -2, 'Y': -2}, 'Q': {'A': 0, 'C': -5, 'E': 2, 'D': 2, 'G': -1, 'F': -5, 'I': -2, 'H': 3, 'K': 1, 'M': -1, 'L': -2, 'N': 1, 'Q': 4, 'P': 0, 'S': -1, 'R': 1, 'T': -1, 'W': -5, 'V': -2, 'Y': -4}, 'P': {'A': 1, 'C': -3, 'E': -1, 'D': -1, 'G': 0, 'F': -5, 'I': -2, 'H': 0, 'K': -1, 'M': -2, 'L': -3, 'N': 0, 'Q': 0, 'P': 6, 'S': 1, 'R': 0, 'T': 0, 'W': -6, 'V': -1, 'Y': -5}, 'S': {'A': 1, 'C': 0, 'E': 0, 'D': 0, 'G': 1, 'F': -3, 'I': -1, 'H': -1, 'K': 0, 'M': -2, 'L': -3, 'N': 1, 'Q': -1, 'P': 1, 'S': 2, 'R': 0, 'T': 1, 'W': -2, 'V': -1, 'Y': -3}, 'R': {'A': -2, 'C': -4, 'E': -1, 'D': -1, 'G': -3, 'F': -4, 'I': -2, 'H': 2, 'K': 3, 'M': 0, 'L': -3, 'N': 0, 'Q': 1, 'P': 0, 'S': 0, 'R': 6, 'T': -1, 'W': 2, 'V': -2, 'Y': -4}, 'T': {'A': 1, 'C': -2, 'E': 0, 'D': 0, 'G': 0, 'F': -3, 'I': 0, 'H': -1, 'K': 0, 'M': -1, 'L': -2, 'N': 0, 'Q': -1, 'P': 0, 'S': 1, 'R': -1, 'T': 3, 'W': -5, 'V': 0, 'Y': -3}, 'W': {'A': -6, 'C': -8, 'E': -7, 'D': -7, 'G': -7, 'F': 0, 'I': -5, 'H': -3, 'K': -3, 'M': -4, 'L': -2, 'N': -4, 'Q': -5, 'P': -6, 'S': -2, 'R': 2, 'T': -5, 'W': 17, 'V': -6, 'Y': 0}, 'V': {'A': 0, 'C': -2, 'E': -2, 'D': -2, 'G': -1, 'F': -1, 'I': 4, 'H': -2, 'K': -2, 'M': 2, 'L': 2, 'N': -2, 'Q': -2, 'P': -1, 'S': -1, 'R': -2, 'T': 0, 'W': -6, 'V': 4, 'Y': -2}, 'Y': {'A': -3, 'C': 0, 'E': -4, 'D': -4, 'G': -5, 'F': 7, 'I': -1, 'H': 0, 'K': -4, 'M': -2, 'L': -1, 'N': -2, 'Q': -4, 'P': -5, 'S': -3, 'R': -4, 'T': -3, 'W': 0, 'V': -2, 'Y': 10}} 5 | v = 'HFQFICLLWFWVESAVQTCSYYWEAFVPENWCFNEGCIMMKSNLMYFNHNMPFNWNHNIG' #x 6 | w = 'GQGIWDTICMGNICWENIMWHFQMNVSWFSPRHNKYIGFYPAKFFDPPKEKL' #y 7 | penalty = 5 8 | 9 | def LocalAlignment(v, w, matrix, penalty): 10 | #Initialize scoring and backtracking matrices 11 | s = numpy.zeros((len(v)+1,len(w)+1)) 12 | backtrack = numpy.zeros((len(v)+1,len(w)+1)) 13 | 14 | #Fill in scoring and backtracking matrices 15 | max_val = 0 16 | for i in range(1,len(v)+1): 17 | for j in range(1,len(w)+1): 18 | 19 | values = [0, #0 20 | s[i-1][j] - penalty, #1 21 | s[i][j-1] - penalty, #2 22 | s[i-1][j-1] + matrix[v[i-1]][w[j-1]] #3 23 | ] 24 | 25 | s[i][j] = max(values) 26 | backtrack[i][j] = values.index(s[i][j]) 27 | 28 | #Find max value 29 | if max(values) >= max_val: 30 | max_val = max(values) 31 | 32 | #Find maximum cell 33 | for i in range(1,len(v)+1): 34 | for j in range(1,len(w)+1): 35 | if s[i][j] == max_val: 36 | max_i = i 37 | max_j = j 38 | break 39 | 40 | #Backtracking and output 41 | i = max_i 42 | j = max_j 43 | V = '' 44 | W = '' 45 | while i*j != 0 and backtrack[i][j] != 0: 46 | 47 | if backtrack[i][j] == 1: 48 | V += v[i-1] 49 | W += '-' 50 | i = i-1 51 | 52 | elif backtrack[i][j] == 2: 53 | V += '-' 54 | W += w[j-1] 55 | j = j-1 56 | 57 | elif backtrack[i][j] == 3: 58 | V += v[i-1] 59 | W += w[j-1] 60 | i = i-1 61 | j = j-1 62 | 63 | return int(s[max_i][max_j]), V[::-1], W[::-1] 64 | 65 | score, v, w = LocalAlignment(v, w, pam, penalty) 66 | 67 | print 'Score: ' + str(score) 68 | print v 69 | print w 70 | -------------------------------------------------------------------------------- /Gibbs Motif Sampler.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | 3 | DNA = [] 4 | infile = open('DNA.txt','r') 5 | for line in infile: 6 | DNA.append(line.strip('\n')) 7 | 8 | k = 15 #Length of k-mer 9 | size = len(DNA) 10 | 11 | #Returns the most probable kmer 12 | def ProfileMostProb(string, k, a, c, g, t): 13 | KmersProb = [] 14 | Probs = [] 15 | for i in range(0,len(string)-k+1): 16 | kmer = string[i:i+k] 17 | prob = 1 18 | pos = 0 19 | for nt in kmer: 20 | if nt == 'A': 21 | prob = prob*a[pos] 22 | elif nt == 'C': 23 | prob = prob*c[pos] 24 | elif nt == 'G': 25 | prob = prob*g[pos] 26 | elif nt == 'T': 27 | prob = prob*t[pos] 28 | pos += 1 29 | KmersProb.append({kmer:prob}) 30 | Probs.append(prob) 31 | for pair in KmersProb: 32 | for kmer in pair: 33 | if pair[kmer] == max(Probs): 34 | return kmer 35 | 36 | #Scores a set of kmers based on consensus scoring method 37 | def Score(list, k): 38 | AA = [0]*k 39 | CC = [0]*k 40 | GG = [0]*k 41 | TT = [0]*k 42 | consensus = '' 43 | for kmer in list: 44 | for i in range(0,len(kmer)): 45 | if kmer[i] == 'A': 46 | AA[i] += 1 47 | elif kmer[i] == 'C': 48 | CC[i] += 1 49 | elif kmer[i] == 'G': 50 | GG[i] += 1 51 | elif kmer[i] == 'T': 52 | TT[i] += 1 53 | for i in range(0,k): 54 | if AA[i] == max(AA[i],CC[i],GG[i],TT[i]): 55 | consensus = consensus + 'A' 56 | elif CC[i] == max(AA[i],CC[i],GG[i],TT[i]): 57 | consensus = consensus + 'C' 58 | elif GG[i] == max(AA[i],CC[i],GG[i],TT[i]): 59 | consensus = consensus + 'G' 60 | elif TT[i] == max(AA[i],CC[i],GG[i],TT[i]): 61 | consensus = consensus + 'T' 62 | score = 0 63 | pos = 0 64 | for nt in consensus: 65 | for kmer in list: 66 | if kmer[pos] == nt: 67 | pass 68 | else: 69 | score += 1 70 | pos += 1 71 | return score 72 | 73 | def GibbsMotifSampler(DNA, k, size): 74 | 75 | DNA_Ordered = {} 76 | numbers = 1 77 | for seq in DNA: 78 | DNA_Ordered[numbers] = seq 79 | numbers += 1 80 | 81 | #Pseudocounts 82 | A = [float(1)]*k 83 | C = [float(1)]*k 84 | G = [float(1)]*k 85 | T = [float(1)]*k 86 | 87 | #Probabilities 88 | a = [float(0)]*k 89 | c = [float(0)]*k 90 | g = [float(0)]*k 91 | t = [float(0)]*k 92 | Total = [float(0)]*k 93 | 94 | #Randomly select k-mers 95 | RandomKmers = [] 96 | number = 1 97 | for seq in DNA: 98 | i = randint(0,len(seq)-k) 99 | RandomKmers.append({seq[i:i+k]:number}) 100 | number += 1 101 | 102 | BestKmers = [] 103 | for pair in RandomKmers: 104 | for kmer in pair: 105 | BestKmers.append(kmer) 106 | BestScore = Score(BestKmers,k) 107 | 108 | runs = 0 109 | while runs < 100: 110 | GoodKmers = [] 111 | number = 1 112 | for seq in DNA: 113 | i = randint(0,len(seq)-k) 114 | GoodKmers.append({seq[i:i+k]:number}) 115 | number += 1 116 | 117 | count = 0 118 | while count < 10: 119 | 120 | #Random roll 121 | roll = randint(1,size) 122 | 123 | #Adjust Frequencies 124 | A = [float(1)]*k 125 | C = [float(1)]*k 126 | G = [float(1)]*k 127 | T = [float(1)]*k 128 | for pair in GoodKmers: 129 | for kmer in pair: 130 | if pair[kmer] != roll: 131 | pos = 0 132 | for nt in kmer: 133 | if nt == 'A': 134 | A[pos] += 1 135 | elif nt == 'C': 136 | C[pos] += 1 137 | elif nt == 'G': 138 | G[pos] += 1 139 | elif nt == 'T': 140 | T[pos] += 1 141 | pos += 1 142 | 143 | #Reprofile 144 | Total = [float(0)]*k 145 | for i in range(0,k): 146 | Total[i] = A[i]+C[i]+G[i]+T[i] 147 | a[i] = A[i]/Total[i] 148 | c[i] = C[i]/Total[i] 149 | g[i] = G[i]/Total[i] 150 | t[i] = T[i]/Total[i] 151 | 152 | for pair in GoodKmers: 153 | for kmer in pair: 154 | if pair[kmer] == roll: 155 | del pair[kmer] 156 | pair[ProfileMostProb(DNA_Ordered[roll], k, a, c, g, t)] = roll 157 | Challenger = [] 158 | for pair in GoodKmers: 159 | for kmer in pair: 160 | Challenger.append(kmer) 161 | 162 | if Score(Challenger,k) < BestScore: 163 | BestKmers = Challenger 164 | BestScore = Score(BestKmers,k) 165 | 166 | count += 1 167 | runs += 1 168 | 169 | return BestKmers 170 | 171 | BestKmers = GibbsMotifSampler(DNA, k, size) 172 | 173 | #Formatting 174 | for kmer in BestKmers: 175 | print kmer 176 | print 'Score: ' + str(Score(BestKmers,k)) 177 | -------------------------------------------------------------------------------- /Global Alignment.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | #Scoring Matrix 4 | pam = {'A': {'A': 2, 'C': -2, 'E': 0, 'D': 0, 'G': 1, 'F': -3, 'I': -1, 'H': -1, 'K': -1, 'M': -1, 'L': -2, 'N': 0, 'Q': 0, 'P': 1, 'S': 1, 'R': -2, 'T': 1, 'W': -6, 'V': 0, 'Y': -3}, 'C': {'A': -2, 'C': 12, 'E': -5, 'D': -5, 'G': -3, 'F': -4, 'I': -2, 'H': -3, 'K': -5, 'M': -5, 'L': -6, 'N': -4, 'Q': -5, 'P': -3, 'S': 0, 'R': -4, 'T': -2, 'W': -8, 'V': -2, 'Y': 0}, 'E': {'A': 0, 'C': -5, 'E': 4, 'D': 3, 'G': 0, 'F': -5, 'I': -2, 'H': 1, 'K': 0, 'M': -2, 'L': -3, 'N': 1, 'Q': 2, 'P': -1, 'S': 0, 'R': -1, 'T': 0, 'W': -7, 'V': -2, 'Y': -4}, 'D': {'A': 0, 'C': -5, 'E': 3, 'D': 4, 'G': 1, 'F': -6, 'I': -2, 'H': 1, 'K': 0, 'M': -3, 'L': -4, 'N': 2, 'Q': 2, 'P': -1, 'S': 0, 'R': -1, 'T': 0, 'W': -7, 'V': -2, 'Y': -4}, 'G': {'A': 1, 'C': -3, 'E': 0, 'D': 1, 'G': 5, 'F': -5, 'I': -3, 'H': -2, 'K': -2, 'M': -3, 'L': -4, 'N': 0, 'Q': -1, 'P': 0, 'S': 1, 'R': -3, 'T': 0, 'W': -7, 'V': -1, 'Y': -5}, 'F': {'A': -3, 'C': -4, 'E': -5, 'D': -6, 'G': -5, 'F': 9, 'I': 1, 'H': -2, 'K': -5, 'M': 0, 'L': 2, 'N': -3, 'Q': -5, 'P': -5, 'S': -3, 'R': -4, 'T': -3, 'W': 0, 'V': -1, 'Y': 7}, 'I': {'A': -1, 'C': -2, 'E': -2, 'D': -2, 'G': -3, 'F': 1, 'I': 5, 'H': -2, 'K': -2, 'M': 2, 'L': 2, 'N': -2, 'Q': -2, 'P': -2, 'S': -1, 'R': -2, 'T': 0, 'W': -5, 'V': 4, 'Y': -1}, 'H': {'A': -1, 'C': -3, 'E': 1, 'D': 1, 'G': -2, 'F': -2, 'I': -2, 'H': 6, 'K': 0, 'M': -2, 'L': -2, 'N': 2, 'Q': 3, 'P': 0, 'S': -1, 'R': 2, 'T': -1, 'W': -3, 'V': -2, 'Y': 0}, 'K': {'A': -1, 'C': -5, 'E': 0, 'D': 0, 'G': -2, 'F': -5, 'I': -2, 'H': 0, 'K': 5, 'M': 0, 'L': -3, 'N': 1, 'Q': 1, 'P': -1, 'S': 0, 'R': 3, 'T': 0, 'W': -3, 'V': -2, 'Y': -4}, 'M': {'A': -1, 'C': -5, 'E': -2, 'D': -3, 'G': -3, 'F': 0, 'I': 2, 'H': -2, 'K': 0, 'M': 6, 'L': 4, 'N': -2, 'Q': -1, 'P': -2, 'S': -2, 'R': 0, 'T': -1, 'W': -4, 'V': 2, 'Y': -2}, 'L': {'A': -2, 'C': -6, 'E': -3, 'D': -4, 'G': -4, 'F': 2, 'I': 2, 'H': -2, 'K': -3, 'M': 4, 'L': 6, 'N': -3, 'Q': -2, 'P': -3, 'S': -3, 'R': -3, 'T': -2, 'W': -2, 'V': 2, 'Y': -1}, 'N': {'A': 0, 'C': -4, 'E': 1, 'D': 2, 'G': 0, 'F': -3, 'I': -2, 'H': 2, 'K': 1, 'M': -2, 'L': -3, 'N': 2, 'Q': 1, 'P': 0, 'S': 1, 'R': 0, 'T': 0, 'W': -4, 'V': -2, 'Y': -2}, 'Q': {'A': 0, 'C': -5, 'E': 2, 'D': 2, 'G': -1, 'F': -5, 'I': -2, 'H': 3, 'K': 1, 'M': -1, 'L': -2, 'N': 1, 'Q': 4, 'P': 0, 'S': -1, 'R': 1, 'T': -1, 'W': -5, 'V': -2, 'Y': -4}, 'P': {'A': 1, 'C': -3, 'E': -1, 'D': -1, 'G': 0, 'F': -5, 'I': -2, 'H': 0, 'K': -1, 'M': -2, 'L': -3, 'N': 0, 'Q': 0, 'P': 6, 'S': 1, 'R': 0, 'T': 0, 'W': -6, 'V': -1, 'Y': -5}, 'S': {'A': 1, 'C': 0, 'E': 0, 'D': 0, 'G': 1, 'F': -3, 'I': -1, 'H': -1, 'K': 0, 'M': -2, 'L': -3, 'N': 1, 'Q': -1, 'P': 1, 'S': 2, 'R': 0, 'T': 1, 'W': -2, 'V': -1, 'Y': -3}, 'R': {'A': -2, 'C': -4, 'E': -1, 'D': -1, 'G': -3, 'F': -4, 'I': -2, 'H': 2, 'K': 3, 'M': 0, 'L': -3, 'N': 0, 'Q': 1, 'P': 0, 'S': 0, 'R': 6, 'T': -1, 'W': 2, 'V': -2, 'Y': -4}, 'T': {'A': 1, 'C': -2, 'E': 0, 'D': 0, 'G': 0, 'F': -3, 'I': 0, 'H': -1, 'K': 0, 'M': -1, 'L': -2, 'N': 0, 'Q': -1, 'P': 0, 'S': 1, 'R': -1, 'T': 3, 'W': -5, 'V': 0, 'Y': -3}, 'W': {'A': -6, 'C': -8, 'E': -7, 'D': -7, 'G': -7, 'F': 0, 'I': -5, 'H': -3, 'K': -3, 'M': -4, 'L': -2, 'N': -4, 'Q': -5, 'P': -6, 'S': -2, 'R': 2, 'T': -5, 'W': 17, 'V': -6, 'Y': 0}, 'V': {'A': 0, 'C': -2, 'E': -2, 'D': -2, 'G': -1, 'F': -1, 'I': 4, 'H': -2, 'K': -2, 'M': 2, 'L': 2, 'N': -2, 'Q': -2, 'P': -1, 'S': -1, 'R': -2, 'T': 0, 'W': -6, 'V': 4, 'Y': -2}, 'Y': {'A': -3, 'C': 0, 'E': -4, 'D': -4, 'G': -5, 'F': 7, 'I': -1, 'H': 0, 'K': -4, 'M': -2, 'L': -1, 'N': -2, 'Q': -4, 'P': -5, 'S': -3, 'R': -4, 'T': -3, 'W': 0, 'V': -2, 'Y': 10}} 5 | v = 'HFQFICLLWFWVESAVQTCSYYWEAFVPENWCFNEGCIMMKSNLMYFNHNMPFNWNHNIG' #x 6 | w = 'GQGIWDTICMGNICWENIMWHFQMNVSWFSPRHNKYIGFYPAKFFDPPKEKL' #y 7 | penalty = 5 8 | 9 | def GlobalAlignment(v, w, matrix, penalty): 10 | #Initialize scoring and backtracking matrices 11 | s = numpy.zeros((len(v)+1,len(w)+1)) 12 | backtrack = numpy.zeros((len(v)+1,len(w)+1)) 13 | 14 | #Set starting penalties 15 | j = 1 16 | sub = -penalty 17 | while j < len(w)+1: 18 | s[0][j] = sub 19 | sub -= penalty 20 | j += 1 21 | i = 1 22 | sub = -penalty 23 | while i < len(v)+1: 24 | s[i][0] = sub 25 | sub -= penalty 26 | i += 1 27 | 28 | #Fill in scoring and backtracking matrices 29 | for i in range(1,len(v)+1): 30 | for j in range(1,len(w)+1): 31 | 32 | values = [ 33 | s[i-1][j] - penalty, #0 34 | s[i][j-1] - penalty, #1 35 | s[i-1][j-1] + matrix[v[i-1]][w[j-1]] #2 36 | ] 37 | 38 | s[i][j] = max(values) 39 | backtrack[i][j] = values.index(s[i][j]) 40 | 41 | #Backtracking and output 42 | i = len(v) 43 | j = len(w) 44 | V = '' 45 | W = '' 46 | while i+j != 0: 47 | 48 | if i == 0: 49 | V += '-' 50 | W += w[j-1] 51 | j = j-1 52 | 53 | elif j == 0: 54 | V += v[i-1] 55 | W += '-' 56 | i = i-1 57 | 58 | elif backtrack[i][j] == 0: 59 | V += v[i-1] 60 | W += '-' 61 | i = i-1 62 | 63 | elif backtrack[i][j] == 1: 64 | V += '-' 65 | W += w[j-1] 66 | j = j-1 67 | 68 | elif backtrack[i][j] == 2: 69 | V += v[i-1] 70 | W += w[j-1] 71 | i = i-1 72 | j = j-1 73 | 74 | return int(s[len(v)][len(w)]), V[::-1], W[::-1] 75 | 76 | score, v, w = GlobalAlignment(v, w, pam, penalty) 77 | 78 | print 'Score: ' + str(score) 79 | print v 80 | print w 81 | -------------------------------------------------------------------------------- /Common Text Files/Peptide_Scoring_Matrix.txt: -------------------------------------------------------------------------------- 1 | A C D E F G H I K L M N P Q R S T V W Y 2 | A 4 0 -2 -1 -2 0 -2 -1 -1 -1 -1 -2 -1 -1 -1 1 0 0 -3 -2 3 | C 0 9 -3 -4 -2 -3 -3 -1 -3 -1 -1 -3 -3 -3 -3 -1 -1 -1 -2 -2 4 | D -2 -3 6 2 -3 -1 -1 -3 -1 -4 -3 1 -1 0 -2 0 -1 -3 -4 -3 5 | E -1 -4 2 5 -3 -2 0 -3 1 -3 -2 0 -1 2 0 0 -1 -2 -3 -2 6 | F -2 -2 -3 -3 6 -3 -1 0 -3 0 0 -3 -4 -3 -3 -2 -2 -1 1 3 7 | G 0 -3 -1 -2 -3 6 -2 -4 -2 -4 -3 0 -2 -2 -2 0 -2 -3 -2 -3 8 | H -2 -3 -1 0 -1 -2 8 -3 -1 -3 -2 1 -2 0 0 -1 -2 -3 -2 2 9 | I -1 -1 -3 -3 0 -4 -3 4 -3 2 1 -3 -3 -3 -3 -2 -1 3 -3 -1 10 | K -1 -3 -1 1 -3 -2 -1 -3 5 -2 -1 0 -1 1 2 0 -1 -2 -3 -2 11 | L -1 -1 -4 -3 0 -4 -3 2 -2 4 2 -3 -3 -2 -2 -2 -1 1 -2 -1 12 | M -1 -1 -3 -2 0 -3 -2 1 -1 2 5 -2 -2 0 -1 -1 -1 1 -1 -1 13 | N -2 -3 1 0 -3 0 1 -3 0 -3 -2 6 -2 0 0 1 0 -3 -4 -2 14 | P -1 -3 -1 -1 -4 -2 -2 -3 -1 -3 -2 -2 7 -1 -2 -1 -1 -2 -4 -3 15 | Q -1 -3 0 2 -3 -2 0 -3 1 -2 0 0 -1 5 1 0 -1 -2 -2 -1 16 | R -1 -3 -2 0 -3 -2 0 -3 2 -2 -1 0 -2 1 5 -1 -1 -3 -3 -2 17 | S 1 -1 0 0 -2 0 -1 -2 0 -2 -1 1 -1 0 -1 4 1 -2 -3 -2 18 | T 0 -1 -1 -1 -2 -2 -2 -1 -1 -1 -1 0 -1 -1 -1 1 5 0 -2 -2 19 | V 0 -1 -3 -2 -1 -3 -3 3 -2 1 1 -3 -2 -2 -3 -2 0 4 -3 -1 20 | W -3 -2 -4 -3 1 -2 -2 -3 -3 -2 -1 -4 -4 -2 -3 -3 -2 -3 11 2 21 | Y -2 -2 -3 -2 3 -3 2 -1 -2 -1 -1 -2 -3 -1 -2 -2 -2 -1 2 7 22 | 23 | 24 | Shortcut for loading into a Python dictionary (2-way lookup) 25 | blosum_62 = {'GW': -2, 'GV': -3, 'GT': -2, 'GS': 0, 'GR': -2, 'GQ': -2, 'GP': -2, 'GY': -3, 'GG': 6, 'GF': -3, 'GE': -2, 'GD': -1, 26 | 'GC': -3, 'GA': 0, 'GN': 0, 'GM': -3, 'GL': -4, 'GK': -2, 'GI': -4, 'GH': -2, 'ME': -2, 'MD': -3, 'MG': -3, 'MF': 0, 27 | 'MA': -1, 'MC': -1, 'MM': 5, 'ML': 2, 'MN': -2, 'MI': 1, 'MH': -2, 'MK': -1, 'MT': -1, 'MW': -1, 'MV': 1, 'MQ': 0, 28 | 'MP': -2, 'MS': -1, 'MR': -1, 'MY': -1, 'FP': -4, 'FQ': -3, 'FR': -3, 'FS': -2, 'FT': -2, 'FV': -1, 'FW': 1, 'FY': 3, 29 | 'FA': -2, 'FC': -2, 'FD': -3, 'FE': -3, 'FF': 6, 'FG': -3, 'FH': -1, 'FI': 0, 'FK': -3, 'FL': 0, 'FM': 0, 'FN': -3, 30 | 'SY': -2, 'SS': 4, 'SR': -1, 'SQ': 0, 'SP': -1, 'SW': -3, 'SV': -2, 'ST': 1, 'SK': 0, 'SI': -2, 'SH': -1, 'SN': 1, 31 | 'SM': -1, 'SL': -2, 'SC': -1, 'SA': 1, 'SG': 0, 'SF': -2, 'SE': 0, 'SD': 0, 'YI': -1, 'YH': 2, 'YK': -2, 'YM': -1, 32 | 'YL': -1, 'YN': -2, 'YA': -2, 'YC': -2, 'YE': -2, 'YD': -3, 'YG': -3, 'YF': 3, 'YY': 7, 'YQ': -1, 'YP': -3, 'YS': -2, 33 | 'YR': -2, 'YT': -2, 'YW': 2, 'YV': -1, 'LF': 0, 'LG': -4, 'LD': -4, 'LE': -3, 'LC': -1, 'LA': -1, 'LN': -3, 'LL': 4, 34 | 'LM': 2, 'LK': -2, 'LH': -3, 'LI': 2, 'LV': 1, 'LW': -2, 'LT': -1, 'LR': -2, 'LS': -2, 'LP': -3, 'LQ': -2, 'LY': -1, 35 | 'RT': -1, 'RV': -3, 'RW': -3, 'RP': -2, 'RQ': 1, 'RR': 5, 'RS': -1, 'RY': -2, 'RD': -2, 'RE': 0, 'RF': -3, 'RG': -2, 36 | 'RA': -1, 'RC': -3, 'RL': -2, 'RM': -1, 'RN': 0, 'RH': 0, 'RI': -3, 'RK': 2, 'VH': -3, 'VI': 3, 'EM': -2, 'EL': -3, 37 | 'EN': 0, 'EI': -3, 'EH': 0, 'EK': 1, 'EE': 5, 'ED': 2, 'EG': -2, 'EF': -3, 'EA': -1, 'EC': -4, 'VM': 1, 'EY': -2, 38 | 'VN': -3, 'ET': -1, 'EW': -3, 'EV': -2, 'EQ': 2, 'EP': -1, 'ES': 0, 'ER': 0, 'VP': -2, 'VQ': -2, 'VR': -3, 'VT': 0, 39 | 'VW': -3, 'KC': -3, 'KA': -1, 'KG': -2, 'KF': -3, 'KE': 1, 'KD': -1, 'KK': 5, 'KI': -3, 'KH': -1, 'KN': 0, 'KM': -1, 40 | 'KL': -2, 'KS': 0, 'KR': 2, 'KQ': 1, 'KP': -1, 'KW': -3, 'KV': -2, 'KT': -1, 'KY': -2, 'DN': 1, 'DL': -4, 'DM': -3, 41 | 'DK': -1, 'DH': -1, 'DI': -3, 'DF': -3, 'DG': -1, 'DD': 6, 'DE': 2, 'DC': -3, 'DA': -2, 'DY': -3, 'DV': -3, 'DW': -4, 42 | 'DT': -1, 'DR': -2, 'DS': 0, 'DP': -1, 'DQ': 0, 'QQ': 5, 'QP': -1, 'QS': 0, 'QR': 1, 'QT': -1, 'QW': -2, 'QV': -2, 43 | 'QY': -1, 'QA': -1, 'QC': -3, 'QE': 2, 'QD': 0, 'QG': -2, 'QF': -3, 'QI': -3, 'QH': 0, 'QK': 1, 'QM': 0, 'QL': -2, 44 | 'QN': 0, 'WG': -2, 'WF': 1, 'WE': -3, 'WD': -4, 'WC': -2, 'WA': -3, 'WN': -4, 'WM': -1, 'WL': -2, 'WK': -3, 'WI': -3, 45 | 'WH': -2, 'WW': 11, 'WV': -3, 'WT': -2, 'WS': -3, 'WR': -3, 'WQ': -2, 'WP': -4, 'WY': 2, 'PR': -2, 'PS': -1, 'PP': 7, 46 | 'PQ': -1, 'PV': -2, 'PW': -4, 'PT': -1, 'PY': -3, 'PC': -3, 'PA': -1, 'PF': -4, 'PG': -2, 'PD': -1, 'PE': -1, 'PK': -1, 47 | 'PH': -2, 'PI': -3, 'PN': -2, 'PL': -3, 'PM': -2, 'CK': -3, 'CI': -1, 'CH': -3, 'CN': -3, 'CM': -1, 'CL': -1, 'CC': 9, 48 | 'CA': 0, 'CG': -3, 'CF': -2, 'CE': -4, 'CD': -3, 'CY': -2, 'CS': -1, 'CR': -3, 'CQ': -3, 'CP': -3, 'CW': -2, 'CV': -1, 49 | 'CT': -1, 'IY': -1, 'VA': 0, 'VC': -1, 'VD': -3, 'VE': -2, 'VF': -1, 'VG': -3, 'IQ': -3, 'IP': -3, 'IS': -2, 'IR': -3, 50 | 'VL': 1, 'IT': -1, 'IW': -3, 'IV': 3, 'II': 4, 'IH': -3, 'IK': -3, 'VS': -2, 'IM': 1, 'IL': 2, 'VV': 4, 'IN': -3, 'IA': -1, 51 | 'VY': -1, 'IC': -1, 'IE': -3, 'ID': -3, 'IG': -4, 'IF': 0, 'HY': 2, 'HR': 0, 'HS': -1, 'HP': -2, 'HQ': 0, 'HV': -3, 'HW': -2, 52 | 'HT': -2, 'HK': -1, 'HH': 8, 'HI': -3, 'HN': 1, 'HL': -3, 'HM': -2, 'HC': -3, 'HA': -2, 'HF': -1, 'HG': -2, 'HD': -1, 53 | 'HE': 0, 'NH': 1, 'NI': -3, 'NK': 0, 'NL': -3, 'NM': -2, 'NN': 6, 'NA': -2, 'NC': -3, 'ND': 1, 'NE': 0, 'NF': -3, 'NG': 0, 54 | 'NY': -2, 'NP': -2, 'NQ': 0, 'NR': 0, 'NS': 1, 'NT': 0, 'NV': -3, 'NW': -4, 'TY': -2, 'TV': 0, 'TW': -2, 'TT': 5, 'TR': -1, 55 | 'TS': 1, 'TP': -1, 'TQ': -1, 'TN': 0, 'TL': -1, 'TM': -1, 'TK': -1, 'TH': -2, 'TI': -1, 'TF': -2, 'TG': -2, 'TD': -1, 56 | 'TE': -1, 'TC': -1, 'TA': 0, 'AA': 4, 'AC': 0, 'AE': -1, 'AD': -2, 'AG': 0, 'AF': -2, 'AI': -1, 'AH': -2, 'AK': -1, 'AM': -1, 57 | 'AL': -1, 'AN': -2, 'AQ': -1, 'AP': -1, 'AS': 1, 'AR': -1, 'AT': 0, 'AW': -3, 'AV': 0, 'AY': -2, 'VK': -2} 58 | -------------------------------------------------------------------------------- /Cyclopeptide Sequencing (Non-Ideal Spectrum).py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import heapq 3 | import operator 4 | 5 | #Non-ideal experimental spectrum 6 | SpectrumNonIdealTemp = '456 783 658 508 156 361 426 1177 593 554 1137 245 1493 1390 1280 114 797 97 981 606 740 721 496 817 1232 753 583 1242 342 584 1293 1379 211 147 338 137 1356 1436 1048 867 469 1080 1076 359 996 512 887 527 1356 314 696 720 997 1248 1282 1248 1008 757 485 200 1362 261 103 1346 1024 1132 1396 622 773 892 676 772 559 622 415 626 1280 1037 417 626 270 114 1379 835 934 867 1122 497 398 1151 1394 910 710 1155 312 1095 939 360 137 813 1406 680 920 382 99 1078 1269 1181 371 114 900 871 284 852 245 964 1111 445 156 1209 871 1223 966 213 213 251 1269 1179 470 87 131 1134 1337 1133 529 715 573 57 601 1018 413 736 985 0 1067 224 1234 909 316 1337 1023 778 475 1379 259 641 356 224' 7 | SpectrumNonIdealTemp = SpectrumNonIdealTemp.split() 8 | SpectrumNonIdeal = [] 9 | for number in SpectrumNonIdealTemp: 10 | SpectrumNonIdeal.append(int(number)) 11 | SpectrumNonIdeal = sorted(SpectrumNonIdeal) 12 | 13 | m = 18 #Number of high frequency peptides present in convolution 14 | MinMass = 57 #Minimum mass of possible amino acid 15 | MaxMass = 200 #Maximum mass of possible amino acid 16 | Boundary = 5 #How many rounds in before Upper Score cutoff is enforced 17 | LowerScore = 3 #Lower Score cutoff 18 | UpperScore = 25 #Upper Score cutoff 19 | 20 | def FindConvolution(SpectrumNonIdeal, m, MinMass, MaxMass): 21 | #Find convolution of the peptide 22 | matrix = [] 23 | for number in SpectrumNonIdeal: 24 | matrix.append(number) 25 | matrix.append([0]*len(SpectrumNonIdeal)) 26 | k = 1 27 | i = 3 28 | while i < len(SpectrumNonIdeal)*2: 29 | j = 0 30 | k += 1 31 | while j < k-1: 32 | matrix[i][j] = matrix[i-1]-SpectrumNonIdeal[j] 33 | j += 1 34 | i += 2 35 | elements = [] 36 | i = 3 37 | while i < len(SpectrumNonIdeal)*2: 38 | for element in matrix[i]: 39 | if element != 0: 40 | elements.append(element) 41 | i += 2 42 | frequency = {} 43 | for element in sorted(elements): 44 | if element >= MinMass and element <= MaxMass: 45 | frequency.setdefault(element,0) 46 | frequency[element] += 1 47 | numbers = [] 48 | for element in frequency: 49 | numbers.append(frequency[element]) 50 | minimum = min(heapq.nlargest(m, numbers)) 51 | output = [] 52 | for element in frequency: 53 | if frequency[element] >= minimum: 54 | output.append(str(element)) 55 | return output 56 | 57 | def LinearSpectrum(peptide, CharToNum): 58 | #Find the linear spectrum of the peptide 59 | peptides = [] 60 | sizes = ['0'] 61 | peptides.append(peptide) 62 | n = len(peptide) 63 | for window in range(1,n): 64 | for i in range(0,n-window+1): 65 | peptides.append(peptide[i:i+window]) 66 | for string in peptides: 67 | total = 0 68 | for aminoacid in string: 69 | total += int(CharToNum[aminoacid]) 70 | sizes.append(str(total)) 71 | return sizes 72 | 73 | def CycloSpectrum(peptide, CharToNum): 74 | #Find the cyclo spectrum of the peptide 75 | peptides = [] 76 | sizes = ['0'] 77 | peptides.append(peptide) 78 | cyclic_protein = peptide*2 79 | n = len(peptide) 80 | for window in range(1,n): 81 | for i in range(0,n): 82 | peptides.append(cyclic_protein[i:i+window]) 83 | for string in peptides: 84 | total = 0 85 | for aminoacid in string: 86 | total += int(CharToNum[aminoacid]) 87 | sizes.append(str(total)) 88 | return sizes 89 | 90 | def FindMass(spectrum): 91 | temp = [] 92 | for num in spectrum: 93 | temp.append(int(num)) 94 | temp = sorted(temp) 95 | return temp[len(temp)-1] 96 | 97 | def Score(Experimental, Reference): 98 | temp = deepcopy(Reference) 99 | score = 0 100 | for Expaa in Experimental: 101 | present = 0 102 | for Refaa in temp: 103 | if Expaa == Refaa: 104 | present = 1 105 | if present == 1: 106 | score += 1 107 | temp.remove(Expaa) 108 | return score 109 | 110 | def NonIdealSequencing(SpectrumNonIdeal, m, MinMass, MaxMass, Boundary, LowerScore, UpperScore): 111 | #Give each mass an arbitrary ASCII character 112 | Frequency = FindConvolution(SpectrumNonIdeal, m, MinMass, MaxMass) 113 | NumToChar= {} 114 | CharToNum = {} 115 | i = 65 116 | for element in Frequency: 117 | if i == 91: 118 | i = 97 119 | NumToChar[element] = chr(i) 120 | CharToNum[chr(i)] = element 121 | i += 1 122 | 123 | #Convert integers to strings 124 | temp = [] 125 | for char in SpectrumNonIdeal: 126 | temp.append(str(char)) 127 | SpectrumNonIdeal = deepcopy(temp) 128 | 129 | #Find correct mass of the non-ideal spectrum 130 | CorrectMass = FindMass(SpectrumNonIdeal) 131 | 132 | combos = [] #Store potential peptides 133 | leaders = {} #Store lead scoring peptides 134 | for aminoacid in CharToNum: 135 | combos.append(aminoacid) 136 | 137 | #Recursive method for building and scoring peptides 138 | rounds = 1 139 | while len(combos) != 0: 140 | #Expand and score 141 | CombosTemp = {} 142 | Scores = [] 143 | for string in combos: 144 | for aminoacid in CharToNum: 145 | NewCombo = string+aminoacid 146 | NewSpec = LinearSpectrum(NewCombo, CharToNum) 147 | NewScore = Score(NewSpec, SpectrumNonIdeal) 148 | 149 | #If mass is too small, retry with another amino acid appended next round 150 | if FindMass(NewSpec) < CorrectMass: 151 | Scores.append(NewScore) 152 | CombosTemp[NewCombo] = NewScore 153 | 154 | #If mass is correct, score it 155 | elif FindMass(NewSpec) == CorrectMass: 156 | leaders[NewCombo] = Score(CycloSpectrum(NewCombo, CharToNum), SpectrumNonIdeal) 157 | 158 | if len(Scores) == 0: 159 | break 160 | 161 | #Trim and push forward 162 | if rounds >= Boundary: 163 | maximum = deepcopy(min(heapq.nlargest(LowerScore, Scores))) 164 | if rounds < Boundary: 165 | maximum = deepcopy(min(heapq.nlargest(UpperScore, Scores))) 166 | combos = [] 167 | counter = 0 168 | for string in CombosTemp: 169 | if CombosTemp[string] >= maximum: 170 | combos.append(string) 171 | counter += 1 172 | if rounds >= Boundary: 173 | if counter == 100: #Limit recursion 174 | break 175 | rounds += 1 176 | 177 | #Best scoring peptides with correct mass 178 | LeadingScores = [] 179 | for protein in leaders: 180 | LeadingScores.append(leaders[protein]) 181 | 182 | #Select best of the best 183 | TopScores = [] 184 | for protein in leaders: 185 | if leaders[protein] == max(LeadingScores): 186 | string = '' 187 | for amino_acid in protein: 188 | string += CharToNum[amino_acid] + '-' 189 | TopScores.append(string.strip('-')) 190 | return TopScores 191 | 192 | Peptides = NonIdealSequencing(SpectrumNonIdeal, m, MinMass, MaxMass, Boundary, LowerScore, UpperScore) 193 | 194 | for peptide in Peptides: 195 | print peptide 196 | -------------------------------------------------------------------------------- /Simple-Programs/Finding Shared k-mers.py: -------------------------------------------------------------------------------- 1 | def ReverseComp(DNAin): 2 | DNAout = '' 3 | y = len(DNAin) - 1 4 | while y > -1: 5 | if DNAin[y] == "A": 6 | DNAout = DNAout + "T" 7 | elif DNAin[y] == "T": 8 | DNAout = DNAout + "A" 9 | elif DNAin[y] == "C": 10 | DNAout = DNAout + "G" 11 | elif DNAin[y] == "G": 12 | DNAout = DNAout + "C" 13 | y = y - 1 14 | return DNAout 15 | 16 | k = 13 #Length of k-mer 17 | str_one = 'ACTTCATACCATAGGTGCGGGGGTAATGTAGACGGGACGTTTGGAGCTGGTCTTGTTCACTACCGACGCAAGGCATATATTTGAAGCTCACACCTTGGTCTCAGGATGAAATATGAGGAGGTCGTGTAGGAGGTTAAAGGGATAACCCGACCGTTTTGAGCTGCGGTGACCGAATATATGGCAAAAAGTGCTGATAATGAAAATCTTGGCCCGAATCATTAACGTGGTGCTATAATTCCCGGGTTTGCCGCAACAAGGGATCACGGTTCATTTGGGCACGAACAAGTAAAACAGCGTTCTCCTATTCGGTCGACACTCCATACCTCACATAGTGCAGCTAAACTTCACCATCTCTCCGCGACGCAGATCGCGCTACTGTTATTCAGCGTGGTGGATGCATGTTCGCGCCTACCCTCTTCGAAATTGGTGAGTTTTATAGAGTTGATGACCGTCTGCTAGTCAACAGTCTCATTATATTTGCCATTACTGAATGTCCACTTTCTGTCAGCGACGCGTCACGGACTGCTACCATTAGCGTGTAGTTCGAGCCCAGGGCATGACCGCCATTTGAACTGCCGACCCTTTGTGGCGGAATTATAAGGACATGTCCGTTACATGTAGTAGGACGGGATATCTAGCGTTATAGGACTGATCAGCTGTCGAGAAATCGATGTGGCCTACTAGGGTAGGATAGCCGAACCTTGTCTAAAATCCTCGGTCAGATCTTGCGTGTCACAGAGTAGCGCACCTTAACACTGATCATCGAGATGCCCAGTGCCGTGACCTCTGAGCCTGCAACCAAATGCCCATCTAACGGATACCGATCCAAGTGGCAACGAAGAAAACCTACCTCCACACACTAAGGGCGAAGCCAGGACCACCTGAAGAATGGCATGTACTTTGGGTGGCGATGTAACGACGCCAACTGTAGGCTGGCCATTATAGCGGTCTGTGCCAGCACTTTCCTGTGAAGACGGTCTTACCGGGGGACCTTCTAGGTTAACAGCTCAGTACTACAATTTGTAACGTTGTTCATCCGATGGCCACCTAGAATGTTGGTGGAGGGGAGATCACATAGGGGTTGAACATTACGCTAATACTGCCTATCCTATCGGTTTCCTATTCAGTGACGTCCGGTTCTACCCATTGGCGGATGATCAATATGGATTCCCTAATCATGATATCTCGGTACGGCTTAAATAGCCACAGAGGGCACCCTTTGACTCTCTGTGAAGGCGAGCCGGTGCGTTTTGCTACGTCCGATATGCCATGGTTTCGGCGTGCGCGTATTGCTATAACCCAGGACCTACCTAGGGGGCGTATTTCGTGGACTAGTTCACCAGACCGCCCTGAGTGGTAGAAGACGGCGTATATGAGTCAAGTGTTCGCCAACCCATTTCTCCGTGAGCACGAGACTACCAGGAAGTGCAGCCCAGCCAATAGCCTTTTAGGATCGCCATCATTATTGTTAAAGAGGCGTTCATGGTATCTTAAGGTTCGAATGACGTGAGCTCAGGACTTAGCTGTGCCCCCGCCAAAGAGTGATAGTCTACAGTAGCCACTAACCAATCGGATTACGGAAAGGGGATGTTTGTGGATTGACCACTCAGAAATTGCCCAATGAGAATTGAATCTAGTTTCAAGTCTACTCGCATCCACTAGTTCAGGCCGCGGCCCGGGCGTTACTTAAGATGCTGATGCACGGTCCGTTCAGGAAGATTGGTCTTAGACTGTAACTATATCTCCCAGTTAGCACTTCGAGGGTACCCATGGAGTTAAGAGTTAGTAAGCGTGTGCTACAAATACGTAATCAGTTCCGGCGTTCCTAATGATTGTTATTCTCCTGTCCTCGGGGATTTGTCCACGCCAAGCAGCTCTGCCGGGCTTCGGAGTTTCATTGGGGCCGGTGACGGTACGCAAGAGGGTCTTAACTTGTGCATCTGAGTCTGCTACCACGATAACGAATAAACTATGCAGCAGTAGTATCCTCTCCTTTTATAATCGGCAAATTAAGGAGGAGAATGTGGGTATGATATATCCCTAGATGTCAGGGTCGCCATGGGAGATTTATGCCTGTCTTAAGCATGCGCGCATTGTTGGTGAGGCGCCAAGATCGAAAAAATAGACTCTGGGTGGCTATGAGTTCCCTAGAAAGGTCGATCATATGGTAAAACCAATGATCACCCATAGCGCCGACGCCGGCCATTTCCTAGGGACGCAAATACCACCAGCGGCACAACTGCTACGGCTGGGTAGGTACTTTTTTGTTCGGTGTAAGTTAATCTACTGCGCCTGCCTGATGCTCGTTGGACTACCGTCCATTAGGTGATGGTGGGAATAATAACAGTTCTTTACCATCAATCGCAATAGTTGATTGGCCCCGTCTCCCGGTTCGAAGCGGTTGGTGGTCCAACAGTTTCGGTTAGGATCCCTGACAGATCGTCAAGGTGACACATCAGCGAGTCATCCCTGGTAATAATATCACGACCCGTGTCCGAGACATACTGATAAGCGCTACATCTTTACAATTAATTCGTTTTAGCACTCTATCTAATCGGGACACGTGTTCGCGTGGCTCATAGTTCCGTGTTATACTAGATTGTGCAACATACATCCTCACTGCAGAAGAAACTCCGAACGGGCGTCCAGGTATAGGTGGAAACAGCCAAGGGGTATTGTTCGAGATGCCGGACCGCGAAGTGGTTTATCCAATTCTCGGCGTACTGCCCTGCCGTGTAACTCTGAACTCTACGACTCAATTGTCCAGAACGGTCCGCATTGCTGTTTGGCCGCTGCTGTGCGAAGGTGGTCGATCTATATTACACTCGAGGCTCGTATCGAGCCTCTGGCTCTCCGCGGCTTGGCACGTGCGGTTACTCGATCGTAATATAAGACGTAAAGGAGGACATGACTATGTTGTTCTGTTTCTCTTTGTTATGTAAGCAGGGGGCGAGTTAGCAACCGTGGGTTGGATGGTCCACTACATTTTCATTGTAGCGTACACTCGGGCCAGCGTAGACATTATTTGAGTAGTTTATGTCTGGAAGTGCAAATTTCTCTGTTTCCGGAGGGCATTCGTCCTTTTTCCTAAAGAGCGGGTCTGAAACAGTTGGACGGTATATCCATCTGGCTGCTTATTGATATACTGGCCTCGGCGTGGCTAGAAGTATTTGCCGAGCCGGTTCACATTTTTCTGGTTTACAAGATCATGGGAGCTTTAAAGCTCAAGCCTCGCCGGGCGCGAGTCACGACCCTGTGTTTTTCGTGTTCGCCGGAAGCGTGAAGGAGAAGGCCTGATTAAGTGAGAACAAGAGTGGCTTCATTTAGAGTTACGGCCGTAAGTTAAAGTTTCCGAGTATTAGGGAAGAGCGATCAATTTAGTAGCGTTTTCCTAAATTTCGATTGTATTCTTGCTAAGAGACTTATAGGGGCTGACGATTGTAGGTATGACTCAATATCGGTGAGACCGGCTGATAACCTGCCTTCCCCACGCCCCGGTGTAGTATCGTGGGAGATAACGATGAATCACATCTTGGAAAACCGGTGGCAGGAGGGAGGGTTGACTTCCGCCTGGGATTTAGCTATACACTAGAGGGCGGCGATTAACCTGCGCCTACTTAGTGACCAACTCACTTCTCTAAGGACGATAACCTCAGATCGAAATACCCTCAAATGAGAGCATAAAAAAGTAATAACAGCGGTTTAGAGGAGTTTACGTAGACTCGATGAATGATCCGTAGTGCCTTCTCGAGAAATATGAAAGACTCTTACGGGATGTGGTAAGTTAACACTATAATGGATGCATTACTCGTACGTTTTACGGGTTGAGTTTATCGCCGTGACCAGTGTCTACTCTTGCGTCAGGTCCGCGGCCCTCCCAGCACCGCTGCTCCCCCAGCCTCAAGATCGATTTTGGAGAGAAAGACCATGTAAAAGTGGAGAGCTTCAACTGTTGCTCATCGTGAGAGGAATGTTGCATAAGGCATCCTATACTGGATAGAGCACCTGTGGGACGAACGTAGCATTACGGTGTCCGAGAACGTTGAGACGCTTGACGGATTTTTTTAAGATAGCCTTAGATTTAATTTCTGCCGTAAGAAGTCAGAAACTCAGACATCGATGGAGAATCCTGGTGTTGTAAGTACGGAAAAGCCCTAATTCCCACCTGCGTCGCGAAACGGGTGAAATGTATTACCGGGCTGCAAATATTCTTGTGCAACTTAACGGAGGTCACAGTACGCCGGGAAAGCCAAGATCGTTTGGGGGACCACAGAGGAAATATTGGTTCTCACGCTCCTCAGGCCGATACATAAGTTCGGGGCGCGGCCTCTTCGTGCGCCGACGGTTCAGCCAAACCCCCTAAGTTTTAATGTATTGGGCTCCACTGAACGACGACCATACCCTATATGGATGGCGCGTAGTGTCAAAGTGTACACGCATCTGATTGGCCGCGGCGTAAGGGGAATGCACCAGGGTGCCCGAACCGTCGGTTCCTGGGACCAAAGTATTTTCCCCGGCGATGATGTGGTCAAGAATGACGAATGGGCACTCACGTTTTTCCAAGTAAAGTTAAAAAAGGCTATAGGCGGCGGGGACTGCTGTGTTAGAGTTTGCCGACCAACAATTTCGTTACGGGGAGATCAGGTACTCGGCTCTGGCTAGTCACTTTGTAGCATCTGCTGTATTTCATACTCCTTCTGCTATGTGTCTCCGTTAGGTAACTGACGCGGGCGCACCTAGTTGACTCTAGAAGGCAACGCCTCGAGATTGCCATCGAATCTCGCTTACACTGTCAACAGAGTGTGCCTCCATATGTCCATCGGGTAGCCCATTACGCAAGAGACGGCCAGACGTACCCCAAAAATGGAAAGGTAAGCCGGAATCATAGTAACATCCTCTCGTCCCGTGAACAGCGTCATCGCAGCAGATACTCGGACCGGGTGGTCAGAGGGTTAAGGCGTGCAAACGCGTCCATCCAAGATCCGGACTCGTAAATTACAGACAGATGGACACGCGGGCCCAAATGGCACGACTCAGACCCGGGGACGCTGCCGGGCCGATTCGAAGGGGACTTACGCAGCGCGAGCCCAGTTTGATAGCTACTTTAGTGAGAAACTCCTTGTGTATCCGCCAGAGCGCAACTAAGTCGGAACCCTTATTCGTATGTAAGATGTCCGCCTTTGACCAAATCGGTCCGCACTTGCGCCGATAATAAGATCTGCATGGAGTGATACCCGTAACCGACCCCTGTCCAGCCAACCTTACTCACCGCATTACACGACCACCTCTAGAGCCCGACCCCTTGCTCCTTGGATAAGTATGTGGTCAGTGTATCCTCGACCTCAATTGGATGGCTAAACCACTACCGTGCGTGGCCGAGGTTGTGGGCTAATGCAGAGCTTGGATTAGAATCACGAGTCGTCGGTCAACGGTAACTCACGCCAAATGTCGACGGGTCACTCCTGCGCAAACTCGCGCTGGCGGAAACACCATATTGCATGGCTACTATCGCCTACGTTAGTCGTGTTTGTGGATCCGCGCCTTCTTGACTGAGTCAAGCTACCGATGTCCCGGTGATCCGCTCGACGCTTCGCTAAGGAAGAATAACGAAAACGACATGTGATGTGTAGGCGCAGGCTGTACGAGACCAGGCCAAGCCCCTCCAGAAAATCTTACTTAATAAATAGTGTGCGCCCCGTGTGAATTCCAATCCGACAACCGCCCGTTTGTCCCGTGACGGCAGCCCTAGTTCCCTTTGGGGTCTAGCTGCTTACTGCCGCATCACGGGATGCTTAGGATATCCTAAGCCACGTTTGCAATTGAGCATCTACGCATGGCTGTGGACACAACCGGGCCGTATATAGTCATGACTAGGTTTAGCACATCCACGCGTTACAGCCATAATCCGTGGAGGTGGGACAGCTTCTGAATTTAGCCTTTGTAAGTGTAGCATAAGCAATCGATCCTTAGGAACTTACCAATGTTCTCTTTTAACGTGAAAGTATTTCGCTACCGAACGTAGACTGCTTCAGACAGCCCTTCTGTAATGGTCAAATCGGGCGCCAGTCGGTAGGAAACTCCACGAAGTTAGCCAATCTGATAAGGGAGTAAAGAAAGCATGGATTATTTCGGGCGCGTGATATGGAACTCATGTTGGGACCAATCAGCTAATTAGTATAATGAACCTCTTTTTCTGCCTTCTGTGTGCCGCGCTCTAGCATGATTGGAACTGCGATGATGCCCAGCGCCAGGGGCACATGTGAGTCTCAGTGCTATGAGCGTGAAGTATCAGTCTTATTCTCGGGTCCTCGAGTAATCGGACGGTCGCGGCAAGCTACACTGTGCTTTCTCCGACGTGTACGATTGAGCAATACTCGGTAATTCCGTTGGAGTAATTTCCGGGCGAACTCATTCGATGAACGCTATCTAAGCTCGCATCGGTCTTCAAAGGACAACCGACCCCTGGGATCCTGCATAAGTAGTTATCTATAACCGACTTAAACAGGCTTGGCTCCAATTGGATCGCCTTACGTCACTCCGACACAATAAATATTAGCTAGAGCTCAACTAACGCCTGGTCGTGCGACCTGTGTAAGACGTCGTACAAGTGGCACTCTACCCGATGTAGAGTTAGAAGAAAGAACTATGCTGTGTACGTATTGTACAGCGATGGTGCAGTCCAACGGGAGTAAGTACTAAAATAGCCGTTCCCCTGTATACTGGGCGCGCATATTTCTATAGGTAAGCTGGTGTGTCCTAAGCAGGGGCGTGAATTGACGTTGACTTGACGTTCGCCTATACGTACATATTAACGCTCTCGCGAACAATCTAGAAGAGCCGAACGGTGACACCCCTGGGGAAGATTCATGGTACCAAGCAGCTAGCAAACTTGGACA' 18 | str_two = 'AAGTCCTAGATTCATTTCCCGTTTGTGCCGTATCGCCACGAGGATGACGGTAGACTCGGTATTCTTCATGATTGCCCTCTACTGACGAGCTCCCCAGGGTGATCAAGGAAAGACGTAAAACCAGTGGGATCGAACAGAAAGTTCAACTGGCCGGATTGCTGTCGGCTCGGCAAATTGGAATTTCGATAAACTTGCTCACATCATCCTAGACCAGCTTTGTTCCTAGCCCTAATCCAATTAGGACTTACACTACAATTAATTCAGTAGGCGCGTTTCGCTACGAAGTGTCACAACACTCTTAAATGACTATGATGTTTCGGGACGGTGGCCGGCGAATCTTATGAGGTTCTGATGATTAGTTAGATCAGCGCTCTCTTCTTGCTTCCCTGGTGACTTTCTCTCACCTTGCTGCCAGGGATAATTTTGCATGGGTGATCCAGCAGTACAATTAATTCAATATACAGTATTTTCTGCTATGCCAACCAAAACGCCTAGGGTATTCAACGACTCATCCTCCCTATGGAGATAGCACAGCGGCCCATGCTTTAACGAACTCCATAGTAAGTTTTGTACAACGGCATAAACATGACGCTCCCCAACTGGTCAGGTACAATAACCGATGAACCGGTCTCCTCTATTAAGTGAATCCCCTAATCTGACATCAGGCGCAACTTTCTGTTCGTCTATGAGTGATCGAAGCCTGGTAGGATAGATCCGCAGGCGTCGCGTGGCGTTCACGTGTGGATGAAATGGGGCAGCTAGCATGCTAGTAGCCAAGGACTGAAAGCAAGCGTGATCAACGGTTGGGTGGACGTTGAAACAGCATCACCAGGGAAATGTTGCAGCTTCGCACAAACCGTATGGCGAGAAGGAAGCACCTTACAATCCTCTACACCCCGGTAGTTCAATCAATGTCGTTTCTCAGCGAATGGATTAGGCTCAGTAAGGCACCAACATCAAGGAATTGTAGAATAATACGACGATACGGATGATAAATGTTTAGGGATAAACGCCATCCCGCCATTTGAGGTAACAAATGTCCCGATTGTTCTTGCATGGCAGGGCACTATCGATACAGTAGAGCATAACCTCTTCCTTGATGTTGGTAATATGTCCGCAAGGATTGGGCTCGATCGCCCAAAACTGCCGTTTGTACACTCCCAGCGAGATGTACTTCGTTATCCAATTTCCCTCTGAAGGTAGCGTGCGGTCCGCGCTTGCTTTCAGACTTCACTTGAACTGAGTTAAGCATTATTCTGCCTGCACGATAGGAAATACACCAAACTTGGACCCAAAACAACATCAAGGAACAACGGTTCCTTAACTCGGACTTGGTGATTGGAGAGCTGAAACATTCCTGCGTAGCCCACTAGTATAACCCCAGATTTGACTAGTGCCGTAGCCTGCTATAGTGATTAGCAGATATGCCACATGGGATTAACCACAGCGGCTCCGTAATGCTTGTAGCTTACAAAGAGAGACTAATGAAACTAACTACCTGGGTCTCAGGTAGTTTGTGCGAAAACAGTATGGGACATTATGCCCTTAGGCTGCTCATTGACCATGACCCCAAGAATAGTTGCCTGATTTAGGTGTCGAAAATTCTGCTTATCGGATCTGGATGGCGTTTGGGCGCGCCGCGGAGAGTTCTGAGCGCGAAGACCCCTCGAAAGTCGATAAATTTTCCGGGGATGCCCCTGGAAGCAGCCTAAGGGCTTCACAGCGGCACCTAACCTGCACATTAGCCAATAAGATCCACTAGTTGTCAGTCATATGAACGCGGGCGGATCTGACTCTGCTTCAGCCGAGTGCGTGATGTAAACTAAACTCAAAGCCGGGATCCTACCAACTAATCCTCTGCTGAAACGGGTAGACTTCGTGATGGTTAGTGCTCACTAATACTTATTTATGAGACTGTGAGACACTGCGGATGTCTCTTTGAGATGGACCACCTCAATATACCAGTGACAATGATCGCAAGATATAGGGATTTGGCTTCGAGAATAGTTGCCTATCGTGATTTTATTATCGCACAGCAAGAATGAACTAATTCCAATTGTGCACCCATATGAAATAGCAATGAGTTGGCTCACCACCAGGTCCAGGGCTGCTGCTGTCGTAGTGTCAACTCATGCGGTGAGTAAGCTTCGTCTAAGAGTGATCGAAGCCGAGCCGAATACTGAGAGTCGGGCTGTGATTAGTGCGTGATGTAATTTTGTAAGAACAATGTTCGGGTGGCCGTGTAGATAGCGGTCCCCAAGCTCCGGCTACTTGTCTGAAAACATAATCTCACACCTCGGCGACAACCGAAAGCCGGCCGGATTCCGGGTTTTAGTTCCCCCCCCTGTCCCGGAGATTTCTGTCTAAATCAGTTCCCCAGTAGTAAACAAGATGAGTTAGGGATGCCCTCATGATCCGCGACGTCGGAAAGACGAAGTTTTCGTGAGCAGACGTTTTTCAGAAATATTATGATGTCCTTATCGGTACTCGGGCGCGATACATCATGATCATTTGCTGCGGACGAGGCTATACCCCGGCTCACCCTTACTGAAGAGTTCAACGAACAAAGCCAACATCTAACACTCTAGCGTGATCAAATGTGGACAAGTGCCTTACTCGGGCGCTGGTTTATCCAATTTGCTTTCAGTACCATCGTGATCATGTCCACCAAGGAAAAGCAATGAAAGATGAGCCGCGGGAACTTTGTATATAAAGTGTAGGGAGCATTTGCCGAGCCGAAAAATACATTTACGTCGGCCCTGCAGTTAAACAACGGGGCCCCCAACACCTGCTGTTAAACCACTCAGCTGAAGAAAGCCTCGACTTCGAATGGTTTTCCTATCCCCCGCTTGTACCCTATGGTTCCAATTTCCCTTTCCAAACATGAGTCATAGAATATTGCAGATGCACAACTCCAGTCTGGTGTGCAGATAAGGTTGGCGCTACGAGCTTCAACCCTGCTCTAAAAGGCCGTTAGCTAAACTAGAGCTAGGTTACGGACATTAAGCCACAACGGTCGGTTTCTCCAAATGATAGTTCCTACTGGCCCTCGCAAACGCCTAAAAACCGGGCGGAGCTACTATTCCTCTCACTCATCATATCAGTCGATATATTCCGTGACAAGTTGACAAGTCCCCGTCCATTACTTTAAATAAGTATCACTCGAAGTAGACAGATACCATACGTTAGTTGATACGGTAGAGCTCTACTTCCCATAGCATGCGGCGAAACACCTAATTCCCATGGCGCAAGATATAGGGTATTAGTAGTGGCGTATGCGCGCCATGGTGCCATTTCTGGCAGCCAAAGGTCGGTGGTCCGCCGGGATAAAGTCCTTCTGATACGACCCTTACTCAATCGGCTCGGCAAATGTGATGTGCGCCACTGGTAGCCGAGCTACTATTCCAGTCTGCATTCACGCCGCGCTACAGAACTTATACCTTCAACATATTTTTGGAGTAACGGGTGTAAGTGACGAGTTTCTACTAGGACTTGTCAGTACAAAGTTGAATGACTGTCGCATCAACGTTGGCCTACGGAAACACACTCTAGGATTAGGACACCCCCAGCTGCGGACATATTACTTTCCTGGCGCGGGCGAGATCACCGCGCGTAGATGCTTCCTTCGCCAGGTTAGACATTAATACCGGAGGAGCCTGTATGACTAGGAATTTTGGACCGGAGTCCAAATTGGATAAACCCGTGAGGCGTATCCCTACGGCCGTAAACCCAGGTGTCCTTTACATATCAAGAAGGTAGCGTGCGGAAGTCCGATGGTAGCTTTTAGTGGTAGGATGACGCCTCTACCGCTTCTAATTGTTCTTACTATCGCGGAAATGCCACCAGACGGGCAATGGTGATGCAACTGAGGTTAAGTAACATGCTTCGTAGACCACGCTACATCGGTCTCGCGTCTACCTAGGGCGCGAAGCCCCCCGGGGGCGGGGAGGAGCGCAAGTTTGAAGGGTTGTTCGATCATCTATCCAGAGCATCACTGAAACCACGACATCCCCGGCTCATCCGCACATCACGTACTTTTCTGGTATGGGTCAATTATCTGGTCGGGGTAATAGTTCCCGTGGCCCCTATGCTACTAAGCGTAATCCGTGTCTCCGGGACATTGAAGCAGCCTAAGGGGGGTGACCTTTCTGTTCGTCACTGTACTGAACTTACGCGTAGGGCATGTACAGTTGGTAGGGCTACCTTGGCGTAAATTCCGGTGAAGAGTGTATGGTGTAAGGACGCTCTGCTTGCCCCGAAGCGCGGATGTCCCGGAGACAAGATCGAATTACAATTAATTCACGGAAGGGGCGAATTGTCAACGTCCTACTGTATATCTGCATCTGTGGTCAACGCAAGATATAGGGTTAAAGAAATAGCGACTGGTTAAGATCTAACCCTCCTGTCATCACTTCTTTATCTGGCCCTAATATGTCCGCAATATCATCTTTCTGTTCGTCCCAACATCATAATAAATAGCCCCTTTCAAGATTGTGCGCGGGCTCATTTATCCGACTCTTCGAGTAGTGTGGACCTATTACGGGTAGGAAGGGACGAACCGTCAGCCCTTGTGTAAGGACTTTAGTGAACAATTAATCTGTTCTTAAGGCGCATTCCCCATAGCAAGCCTCAATTGTTTTAGCACTCTCCCAGGGAGCAGATAGAATGACTAGGTACAGTATGGGTAGGGTACATTCTGAACTAACCACCGGGCCCTAGCGGCCTTCGACGACGGAAAGCTGCGCCACGCGTTACCTTTAAGACGGTAACCGGGGATAAGCTAGCCACTAAACGTCGACTAGTAGTCGGCGGGATAACATCGGTAGAAGCCGCGAGCTTTTTGTACAAAACCAGGATCTATTTACCTGAAACGGTTCTCTCTACTGTATATCTGTGTAGCCGAGCTCCGCTCAAATGGAGCACTGTCGGGTTCACCAGCGCCCGTGGCATTACTCGTCACAATAGCAGTTCTCACTGTTAAAAGCGTCGGCTCTGTTGGAAGCTGCAACATTGTCATCGAGATCAAGTTGTACCCTAATGTGCCAAGGACATACGATTCATCCACAACATCTCCAGTAGCTTCGACAGCAGCTTCCCTGGTGACAGACGGTAAAGATCGTCACGACTATTGCACGCGGCAATGGGTTACTAAGGCCGATCGGCCTCCCGAGAAGAACACTACCGAGAATCTAGACCCTTCCTGTGAGTAGCACTAAGAGTGCTAAAACTCGTTATTTGTTTTAGCACTCTTCGAACTCAGTAGGTGTGGAGCCCAACCCGGACCTTGGACATTTACTTTGCAACTTACCAAGTGCAGAAGGAGGCTGGTACAAGGTAGCGACCATAGTGCCGACCGTAACGGTGCTAGTTACCCGAGGGAAGGACAAGTGATGGGACTCGACGACTGACTACTACCAGCCGACGTGATTTATCCGCTAGAGCTATGACGAGACGATACGGACTTGGTGATCTCTGGAATGAAACGGATAATGTGTAAAGATTAATTGGCGAGGGGGGCAGTGAAGGAAATTTGCGAGCTCGGAAGAAGCCGCACGCTACCGCCGCAGCCTGACGCCTATGACATCTAACTCGCGCTCTTGGATCGAACGTAATCATTGCCTGCGTAAGATTACGCCACTGCATGCTGCGGGGGGTGAGCACAGGTAGCCCTAGCGCAGGGATACCGATAGTTTTCGGTATAAGAAGCCAGAGACTTTTTGCCATTTCACTTTGAAGGTCCGCTTCTAGTTTATGGTCTCGCAAAACGGGCAGAACTATGACACACACCCGTCGTCCCAACATCCAC' 19 | 20 | #Establish dictionary of kmers in the first sequence 21 | kmer_dict = {} 22 | i = 0 23 | while i < len(str_one)-k+1: 24 | kmer_dict.setdefault(str_one[i:i+k], []) 25 | kmer_dict[str_one[i:i+k]].append(i) 26 | i += 1 27 | 28 | #Scan through the second sequence / Lookup kmers and their reverse complement in the first sequence 29 | j = 0 30 | while j < len(str_two)-k+1: 31 | try: 32 | for pos in kmer_dict[str_two[j:j+k]]: 33 | print '(' + str(pos) + ', ' + str(j) + ')' 34 | except KeyError: 35 | pass 36 | try: 37 | for pos in kmer_dict[ReverseComp(str_two[j:j+k])]: 38 | print '(' + str(pos) + ', ' + str(j) + ')' 39 | except KeyError: 40 | pass 41 | j += 1 42 | 43 | ''' 44 | Output 45 | (3168, 162) 46 | (5261, 2131) 47 | (5260, 2132) 48 | (2700, 2624) 49 | (2701, 2625) 50 | (3168, 2725) 51 | (3168, 3358) 52 | (2701, 3688) 53 | (2522, 4288) 54 | (2535, 4635) 55 | (2535, 5215) 56 | (2535, 5237) 57 | ''' 58 | --------------------------------------------------------------------------------