├── .idea ├── .gitignore ├── vcs.xml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── Bioinformatics_Textbook_Track.iml └── modules.xml └── solutions ├── BA1G.py ├── BA9C.py ├── BA9I.py ├── BA3A.py ├── BA9R.py ├── BA1A.py ├── BA1L.py ├── BA1C.py ├── BA6B.py ├── BA1M.py ├── BA9G.py ├── BA3B.py ├── BA1D.py ├── BA3L.py ├── BA3E.py ├── BA4F.py ├── BA1B.py ├── BA1H.py ├── BA4K.py ├── BA3C.py ├── BA3H.py ├── BA9H.py ├── BA3K.py ├── BA4D.py ├── BA5A.py ├── BA4A.py ├── BA6G.py ├── BA8B.py ├── BA1F.py ├── BA3D.py ├── BA6F.py ├── BA11C.py ├── BA1K.py ├── BA2H.py ├── BA3I.py ├── BA4J.py ├── BA1N.py ├── BA6A.py ├── BA6H.py ├── BA9Q.py ├── BA4C.py ├── BA4L.py ├── BA4B.py ├── BA9K.py ├── BA5G.py ├── BA2C.py ├── BA7B.py ├── BA4H.py ├── BA11A.py ├── BA11D.py ├── BA1E.py ├── BA2A.py ├── BA2B.py ├── BA10A.py ├── BA9J.py ├── BA6J.py ├── BA5C.py ├── BA10B.py ├── BA5B.py ├── BA6I.py ├── BA6E.py ├── BA9D.py ├── BA3G.py ├── BA1I.py ├── BA11H.py ├── BA11I.py ├── BA6C.py ├── BA9F.py ├── BA8A.py ├── BA9B.py ├── BA9E.py ├── BA9A.py ├── BA3J.py ├── BA1J.py ├── BA11F.py ├── BA4G.py ├── BA4M.py ├── BA5N.py ├── BA8C.py ├── BA4I.py ├── BA9L.py ├── BA11E.py ├── BA3F.py ├── BA3M.py ├── BA4E.py ├── BA7A.py ├── BA9M.py ├── BA5H.py ├── BA2D.py ├── BA5I.py ├── BA10I.py ├── BA11B.py ├── BA2E.py ├── BA8E.py ├── BA11G.py ├── BA7C.py ├── BA10D.py ├── BA2F.py ├── BA8D.py ├── BA7E.py ├── BA9P.py ├── BA5D.py ├── BA9O.py ├── BA5M.py ├── BA2G.py ├── BA7D.py ├── BA9N.py ├── BA6K.py ├── BA10C.py ├── BA10J.py ├── BA10H.py ├── BA11J.py ├── BA6D.py ├── BA7F.py ├── BA5F.py └── BA5L.py /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/Bioinformatics_Textbook_Track.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /solutions/BA1G.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def hamming_dist(string1, string2): 5 | return sum([x != y for x, y in zip(string1, string2)]) 6 | 7 | 8 | if __name__ == "__main__": 9 | ''' 10 | Given: Two DNA strings. 11 | Return: An integer value representing the Hamming distance. 12 | ''' 13 | input_lines = sys.stdin.read().splitlines() 14 | dna1 = input_lines[0] 15 | dna2 = input_lines[1] 16 | 17 | print(hamming_dist(dna1, dna2)) 18 | -------------------------------------------------------------------------------- /solutions/BA9C.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from Tree_Trie_classes import Tree 3 | 4 | 5 | if __name__ == "__main__": 6 | ''' 7 | Given: A string Text. 8 | Return: The strings labeling the edges of SuffixTree(Text). (You may return these strings in any order.) 9 | ''' 10 | input_lines = sys.stdin.read().splitlines() 11 | Text = input_lines[0] 12 | 13 | tree = Tree() 14 | tree.PopulateSuffixTree(Text) 15 | 16 | result = tree.edge_labels(Text) 17 | print("\n".join(result)) -------------------------------------------------------------------------------- /solutions/BA9I.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def BurrowsWheelerTransform(Text): 5 | n = len(Text) 6 | rotations = sorted([Text[i:] + Text[:i] for i in range(n)]) 7 | bwt = ''.join([rot[-1] for rot in rotations]) 8 | return bwt 9 | 10 | 11 | if __name__ == "__main__": 12 | ''' 13 | Given: A string Text. 14 | Return: BWT(Text). 15 | ''' 16 | input_lines = sys.stdin.read().splitlines() 17 | Text = input_lines[0] 18 | 19 | print(BurrowsWheelerTransform(Text)) 20 | -------------------------------------------------------------------------------- /solutions/BA3A.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def composition(text, k): 5 | for i in range(len(text) - k + 1): 6 | yield text[i:i + k] 7 | 8 | 9 | if __name__ == "__main__": 10 | ''' 11 | Given: An integer k and a string Text. 12 | Return: Compositionk(Text) (the k-mers can be provided in any order). 13 | ''' 14 | input_lines = sys.stdin.read().splitlines() 15 | k = int(input_lines[0]) 16 | Text = input_lines[1] 17 | 18 | for kmer in composition(Text, k): 19 | print(kmer) 20 | -------------------------------------------------------------------------------- /solutions/BA9R.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from Tree_Trie_classes import Tree 3 | 4 | 5 | if __name__ == "__main__": 6 | ''' 7 | Given: A string Text, SuffixArray(Text), and LCP(Text). 8 | Return: The strings labeling the edges of SuffixTree(Text). (You may return these strings in any order.) 9 | ''' 10 | input_lines = sys.stdin.read().splitlines() 11 | Text = input_lines[0] 12 | 13 | tree = Tree() 14 | tree.PopulateSuffixTree(Text) 15 | 16 | result = tree.edge_labels(Text) 17 | print("\n".join(result)) -------------------------------------------------------------------------------- /solutions/BA1A.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def count_pattern(text, pattern): 5 | k = len(pattern) 6 | count = 0 7 | for i in range(len(text) - k + 1): 8 | if text[i:i+k] == pattern: 9 | count += 1 10 | return count 11 | 12 | 13 | if __name__ == "__main__": 14 | ''' 15 | Given: {DNA strings}} Text and Pattern. 16 | Return: Count(Text, Pattern). 17 | ''' 18 | input_lines = sys.stdin.read().splitlines() 19 | Text = input_lines[0] 20 | Pattern = input_lines[1] 21 | print(count_pattern(Text, Pattern)) -------------------------------------------------------------------------------- /solutions/BA1L.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def PatternToNumber(Pattern): 5 | indices = {'A': 0, 'C': 1, 'G': 2, 'T': 3} 6 | result = 0 7 | N = len(Pattern) 8 | for i in range(N): 9 | nuc = Pattern[i] 10 | result += indices[nuc] * 4 ** (N - i - 1) 11 | return result 12 | 13 | 14 | if __name__ == "__main__": 15 | ''' 16 | Given: A DNA string Pattern. 17 | Return: PatternToNumber(Pattern). 18 | ''' 19 | input_lines = sys.stdin.read().splitlines() 20 | Pattern = input_lines[0] 21 | 22 | print(PatternToNumber(Pattern)) -------------------------------------------------------------------------------- /solutions/BA1C.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def rev_comp(string): 5 | ''' Create reverse complement of the string 6 | :param string: DNA sequence to rev. comp. (string) 7 | :return: the reverse complement (string) 8 | ''' 9 | revc_seq = string[::-1].translate(str.maketrans("ACGT", "TGCA")) 10 | return revc_seq 11 | 12 | 13 | if __name__ == "__main__": 14 | ''' 15 | Given: A DNA string Pattern. 16 | Return: Pattern, the reverse complement of Pattern 17 | ''' 18 | DNA_string = sys.stdin.read().splitlines()[0] 19 | print(rev_comp(DNA_string)) 20 | -------------------------------------------------------------------------------- /solutions/BA6B.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def number_of_breakpoints(P): 5 | P = [0] + P 6 | P.append(max(P) + 1) 7 | num_bp = 0 8 | for i in range(1, len(P) - 1): 9 | if P[i] != P[i - 1] + 1: 10 | num_bp += 1 11 | return num_bp 12 | 13 | 14 | if __name__ == "__main__": 15 | ''' 16 | Given: A signed permutation P. 17 | Return: The number of breakpoints in P. 18 | ''' 19 | P = sys.stdin.readline().strip() 20 | P = P.replace("(", "").replace(")", "") 21 | P = [int(x) for x in P.split()] 22 | 23 | print(number_of_breakpoints(P)) 24 | -------------------------------------------------------------------------------- /solutions/BA1M.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def NumberToPattern(Number, k): 5 | reverse = ['A', 'C', 'G', 'T'] 6 | Pattern = '' 7 | for i in range(k - 1, -1, -1): 8 | current = Number // 4 ** i 9 | Pattern += reverse[current] 10 | Number %= 4 ** i 11 | return Pattern 12 | 13 | 14 | if __name__ == "__main__": 15 | ''' 16 | Given: Integers index and k. 17 | Return: NumberToPattern(index, k). 18 | ''' 19 | input_lines = sys.stdin.read().splitlines() 20 | index = int(input_lines[0]) 21 | k = int(input_lines[1]) 22 | 23 | print(NumberToPattern(index, k)) -------------------------------------------------------------------------------- /solutions/BA9G.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def SuffixArray(Text): 5 | suffixes = [] 6 | suffix_array = [] 7 | for i in range(len(Text)): 8 | suffixes.append(Text[i:]) 9 | suffix_array.append(i) 10 | 11 | suffix_array = [x for _, x in sorted(zip(suffixes, suffix_array), key=lambda pair: pair[0])] 12 | 13 | return suffix_array 14 | 15 | 16 | if __name__ == "__main__": 17 | ''' 18 | Input: A string Text. 19 | Output: SuffixArray(Text). 20 | ''' 21 | Text = sys.stdin.read().rstrip() 22 | suffix_array = SuffixArray(Text) 23 | 24 | print(', '.join(str(x) for x in suffix_array)) -------------------------------------------------------------------------------- /solutions/BA3B.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def reconstruct_string(pattern): 5 | string = pattern[0] 6 | for i in range(1, len(pattern)): 7 | string += pattern[i][-1] 8 | return string 9 | 10 | 11 | if __name__ == "__main__": 12 | ''' 13 | Given: A sequence of k-mers Pattern1, ... , Patternn such that the last k - 1 symbols of Patterni are equal to the 14 | first k - 1 symbols of Patterni+1 for i from 1 to n-1. 15 | Return: A string Text of length k+n-1 where the i-th k-mer in Text is equal to Patterni for all i. 16 | ''' 17 | Pattern = sys.stdin.read().splitlines() 18 | 19 | print(reconstruct_string(Pattern)) 20 | -------------------------------------------------------------------------------- /solutions/BA1D.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def positions_pattern(text, pattern): 5 | k = len(pattern) 6 | pos = [] 7 | for i in range(len(text) - k + 1): 8 | if text[i:i+k] == pattern: 9 | pos.append(i) 10 | return pos 11 | 12 | 13 | if __name__ == "__main__": 14 | ''' 15 | Given: Strings Pattern and Genome. 16 | Return: All starting positions in Genome where Pattern appears as a substring. Use 0-based indexing. 17 | ''' 18 | input_lines = sys.stdin.read().splitlines() 19 | Pattern = input_lines[0] 20 | Genome = input_lines[1] 21 | 22 | print(" ".join(map(str, positions_pattern(Genome, Pattern)))) 23 | -------------------------------------------------------------------------------- /solutions/BA3L.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA3J import StringSpelledByGappedPatterns 3 | 4 | 5 | if __name__ == "__main__": 6 | ''' 7 | Given: A sequence of (k, d)-mers (a1|b1), ... , (an|bn) such that Suffix(ai|bi) = Prefix(ai+1|bi+1) for all i 8 | from 1 to n-1. 9 | Return: A string Text where the i-th k-mer in Text is equal to Suffix(ai|bi) for all i from 1 to n, if such a 10 | string exists. 11 | ''' 12 | input_lines = sys.stdin.read().splitlines() 13 | k, d = [int(x) for x in input_lines[0].split()] 14 | Gapped_Patterns = [] 15 | for line in input_lines[1:]: 16 | Gapped_Patterns.append(line.split("|")) 17 | 18 | print(StringSpelledByGappedPatterns(Gapped_Patterns, k - 1, d)) 19 | -------------------------------------------------------------------------------- /solutions/BA3E.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def deBruijn_graph_kmers(patterns): 5 | adj_list = {} 6 | for pattern in patterns: 7 | if pattern[:-1] not in adj_list: 8 | adj_list[pattern[:-1]] = [pattern[1:]] 9 | else: 10 | adj_list[pattern[:-1]].append(pattern[1:]) 11 | return adj_list 12 | 13 | 14 | if __name__ == "__main__": 15 | ''' 16 | Given: A collection of k-mers Patterns. 17 | Return: The de Bruijn graph DeBruijn(Patterns), in the form of an adjacency list. 18 | ''' 19 | Patterns = sys.stdin.read().splitlines() 20 | 21 | adj_list = deBruijn_graph_kmers(Patterns) 22 | for key, val in adj_list.items(): 23 | print(key + ' -> ' + ",".join(val)) 24 | -------------------------------------------------------------------------------- /solutions/BA4F.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA4C import cyclospectrum 3 | 4 | 5 | def score(peptide, spectrum): 6 | pep_spec = cyclospectrum(peptide) 7 | result = 0 8 | unique_masses = set(pep_spec + spectrum) 9 | for mass in unique_masses: 10 | result += min(pep_spec.count(mass), spectrum.count(mass)) 11 | return result 12 | 13 | 14 | if __name__ == "__main__": 15 | ''' 16 | Given: An amino acid string Peptide and a collection of integers Spectrum. 17 | Return: The score of Peptide against Spectrum, Score(Peptide, Spectrum). 18 | ''' 19 | input_lines = sys.stdin.read().splitlines() 20 | Peptide = input_lines[0] 21 | Spectrum = [int(x) for x in input_lines[1].split()] 22 | 23 | print(score(Peptide, Spectrum)) 24 | -------------------------------------------------------------------------------- /solutions/BA1B.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def most_freq_kmers(text, k): 5 | count_dict = {} 6 | for i in range(len(text) - k + 1): 7 | kmer = text[i:i+k] 8 | if kmer not in count_dict: 9 | count_dict[kmer] = 1 10 | else: 11 | count_dict[kmer] += 1 12 | 13 | max_freq = max(count_dict.values()) 14 | return [kmer for kmer, count in count_dict.items() if count == max_freq] 15 | 16 | 17 | if __name__ == "__main__": 18 | ''' 19 | Given: A DNA string Text and an integer k. 20 | Return: All most frequent k-mers in Text (in any order). 21 | ''' 22 | input_lines = sys.stdin.read().splitlines() 23 | Text = input_lines[0] 24 | k = int(input_lines[1]) 25 | print(" ".join(most_freq_kmers(Text, k))) 26 | -------------------------------------------------------------------------------- /solutions/BA1H.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA1G import hamming_dist 3 | 4 | def positions_approx_pattern(text, pattern, d): 5 | k = len(pattern) 6 | pos = [] 7 | for i in range(len(text) - k + 1): 8 | if hamming_dist(text[i:i+k], pattern) <= d: 9 | pos.append(i) 10 | return pos 11 | 12 | 13 | if __name__ == "__main__": 14 | ''' 15 | Given: Strings Pattern and Text along with an integer d. 16 | Return: All starting positions where Pattern appears as a substring of Text with at most d mismatches. 17 | ''' 18 | input_lines = sys.stdin.read().splitlines() 19 | Pattern = input_lines[0] 20 | Genome = input_lines[1] 21 | d = int(input_lines[2]) 22 | 23 | print(" ".join(map(str, positions_approx_pattern(Genome, Pattern, d)))) 24 | -------------------------------------------------------------------------------- /solutions/BA4K.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA4J import LinearSpectrum 3 | 4 | 5 | def linear_score(peptide, spectrum): 6 | pep_spec = LinearSpectrum(peptide) 7 | result = 0 8 | unique_masses = set(pep_spec + spectrum) 9 | for mass in unique_masses: 10 | result += min(pep_spec.count(mass), spectrum.count(mass)) 11 | return result 12 | 13 | 14 | if __name__ == "__main__": 15 | ''' 16 | Given: An amino acid string Peptide and a collection of integers LinearSpectrum. 17 | Return: The linear score of Peptide against Spectrum, LinearScore(Peptide, Spectrum). 18 | ''' 19 | input_lines = sys.stdin.read().splitlines() 20 | Peptide = input_lines[0] 21 | Spectrum = [int(x) for x in input_lines[1].split()] 22 | 23 | print(linear_score(Peptide, Spectrum)) 24 | -------------------------------------------------------------------------------- /solutions/BA3C.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def overlap_graph(patterns): 5 | adj_list = [] 6 | for i in range(len(patterns) - 1): 7 | for j in range(i, len(patterns)): 8 | if patterns[i][1:] == patterns[j][:-1]: 9 | adj_list.append((patterns[i], patterns[j])) 10 | if patterns[j][1:] == patterns[i][:-1]: 11 | adj_list.append((patterns[j], patterns[i])) 12 | return adj_list 13 | 14 | 15 | if __name__ == "__main__": 16 | ''' 17 | Given: A collection Patterns of k-mers. 18 | Return: The overlap graph Overlap(Patterns), in the form of an adjacency list. 19 | ''' 20 | Patterns = sys.stdin.read().splitlines() 21 | 22 | adj_list = overlap_graph(Patterns) 23 | for edge in adj_list: 24 | print(" -> ".join(edge)) 25 | -------------------------------------------------------------------------------- /solutions/BA3H.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA3E import deBruijn_graph_kmers 3 | from BA3G import Eulerian_path 4 | 5 | 6 | def string_reconstruction(patterns): 7 | adj_list = deBruijn_graph_kmers(patterns) 8 | path = Eulerian_path(adj_list) 9 | ReconstructedString = path[0][:-1] 10 | for r in path: 11 | ReconstructedString += r[-1] 12 | return ReconstructedString 13 | 14 | 15 | if __name__ == "__main__": 16 | ''' 17 | Given: An integer k followed by a list of k-mers Patterns. 18 | Return: A string Text with k-mer composition equal to Patterns. (If multiple answers exist, you may return any one.) 19 | ''' 20 | input_lines = sys.stdin.read().splitlines() 21 | k = int(input_lines[0]) 22 | Patterns = input_lines[1:] 23 | 24 | print(string_reconstruction(Patterns)) 25 | -------------------------------------------------------------------------------- /solutions/BA9H.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def pattern_matching(Text, Patterns): 5 | # how to do this with suffix array? 6 | match_idx = [] 7 | for pattern in Patterns: 8 | for j in range(len(Text) - len(pattern) + 1): 9 | if pattern == Text[j:j + len(pattern)]: 10 | match_idx.append(j) 11 | match_idx = sorted(match_idx) 12 | return match_idx 13 | 14 | 15 | if __name__ == "__main__": 16 | ''' 17 | Given: A string Text and a collection of strings Patterns. 18 | Return: All starting positions in Text where a string from Patterns appears as a substring. 19 | ''' 20 | input_lines = sys.stdin.read().splitlines() 21 | Text = input_lines[0] 22 | Patterns = input_lines[1:] 23 | 24 | print(" ".join(map(str, pattern_matching(Text, Patterns)))) 25 | -------------------------------------------------------------------------------- /solutions/BA3K.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA3E import deBruijn_graph_kmers 3 | from BA3M import maximal_non_branching_paths 4 | 5 | 6 | def contig_generation(kmers): 7 | adj_list = deBruijn_graph_kmers(kmers) 8 | paths = maximal_non_branching_paths(adj_list) 9 | contigs = [] 10 | for path in paths: 11 | contig = path[0] 12 | for edge in path[1:]: 13 | contig += edge[-1] 14 | contigs.append(contig) 15 | return contigs 16 | 17 | 18 | if __name__ == "__main__": 19 | ''' 20 | Given: A collection of k-mers Patterns. 21 | Return: All contigs in DeBruijn(Patterns). (You may return the strings in any order.) 22 | ''' 23 | Patterns = sys.stdin.read().splitlines() 24 | contigs = contig_generation(Patterns) 25 | contigs.sort() 26 | print(" ".join(contigs)) 27 | -------------------------------------------------------------------------------- /solutions/BA4D.py: -------------------------------------------------------------------------------- 1 | import sys 2 | MASSES = [57, 71, 87, 97, 99, 101, 103, 113, 114, 115, 128, 129, 131, 137, 147, 156, 163, 186] 3 | 4 | 5 | def CountPeptides(Mass): 6 | NumPeptides={} 7 | for i in range(57): 8 | NumPeptides[i] = 0 9 | 10 | for mass in range(57, Mass + 1): 11 | NumPeptides[mass] = MASSES.count(mass) 12 | for int_mass in MASSES: 13 | if mass >= int_mass: 14 | if NumPeptides[mass - int_mass] > 0: 15 | NumPeptides[mass] += NumPeptides[mass - int_mass] 16 | 17 | return NumPeptides[Mass] 18 | 19 | 20 | if __name__ == "__main__": 21 | ''' 22 | Given: An integer m. 23 | Return: The number of linear peptides having integer mass m. 24 | ''' 25 | m = int(sys.stdin.readline().strip()) 26 | 27 | print(CountPeptides(m)) 28 | -------------------------------------------------------------------------------- /solutions/BA5A.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def dp_change(money, coins): 5 | MinNumCoins = [0] 6 | for m in range(1, money + 1): 7 | MinNumCoins.append(money + 1) 8 | for coin in coins: 9 | if m >= coin: 10 | current = MinNumCoins[m - coin] + 1 11 | if current < MinNumCoins[m]: 12 | MinNumCoins[m] = current 13 | return MinNumCoins[money] 14 | 15 | 16 | if __name__ == "__main__": 17 | ''' 18 | Given: An integer money and an array Coins of positive integers. 19 | Return: The minimum number of coins with denominations Coins that changes money. 20 | ''' 21 | input_lines = sys.stdin.read().splitlines() 22 | money = int(input_lines[0]) 23 | Coins = [int(x) for x in input_lines[1].split(",")] 24 | 25 | print(dp_change(money, Coins)) 26 | -------------------------------------------------------------------------------- /solutions/BA4A.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | bases = "UCAG" 4 | codons = [a + b + c for a in bases for b in bases for c in bases] 5 | amino_acids = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG' 6 | CODON_TABLE = dict(zip(codons, amino_acids)) 7 | 8 | 9 | def translate_rna(rna): 10 | protein = "" 11 | idx = 0 12 | codon = rna[idx:idx + 3] 13 | while CODON_TABLE[codon] != "*": 14 | protein += CODON_TABLE[codon] 15 | idx += 3 16 | codon = rna[idx:idx + 3] 17 | if idx == len(rna): 18 | break 19 | return protein 20 | 21 | 22 | if __name__ == "__main__": 23 | ''' 24 | Given: An RNA string Pattern. 25 | Return: The translation of Pattern into an amino acid string Peptide. 26 | ''' 27 | Pattern = sys.stdin.read().splitlines()[0] 28 | 29 | print(translate_rna(Pattern)) 30 | -------------------------------------------------------------------------------- /solutions/BA6G.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def cycle_to_chromosome(Nodes): 5 | Chromosome = [] 6 | for i in range(0, len(Nodes), 2): 7 | if Nodes[i] < Nodes[i + 1]: 8 | Chromosome.append(Nodes[i + 1] // 2) 9 | else: 10 | Chromosome.append(-Nodes[i] // 2) 11 | return Chromosome 12 | 13 | 14 | if __name__ == "__main__": 15 | ''' 16 | Given: A sequence Nodes of integers between 1 and 2n. 17 | Return: The chromosome Chromosome containing n synteny blocks resulting from applying CycleToChromosome to Nodes. 18 | ''' 19 | Nodes = sys.stdin.readline().strip() 20 | Nodes = Nodes.replace("(", "").replace(")", "") 21 | Nodes = [int(x) for x in Nodes.split()] 22 | 23 | chromosome = cycle_to_chromosome(Nodes) 24 | print("(" + " ".join(["+" + str(x) if x > 0 else str(x) for x in chromosome]) + ")") 25 | -------------------------------------------------------------------------------- /solutions/BA8B.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA8A import distance_from_centers 3 | 4 | 5 | def squared_error_distortion(data, centers): 6 | distortion = 0 7 | for point in data: 8 | distortion += distance_from_centers(centers, point) ** 2 9 | distortion /= len(data) 10 | return distortion 11 | 12 | 13 | if __name__ == "__main__": 14 | ''' 15 | Given: Integers k and m, followed by a set of centers Centers and a set of points Data. 16 | Return: The squared error distortion Distortion(Data, Centers). 17 | ''' 18 | input_lines = sys.stdin.read().splitlines() 19 | k, m = [int(x) for x in input_lines[0].split()] 20 | 21 | centers = [[float(x) for x in line.split()] for line in input_lines[1:k + 1]] 22 | data = [[float(x) for x in line.split()] for line in input_lines[k + 2:]] 23 | 24 | print(squared_error_distortion(data, centers)) -------------------------------------------------------------------------------- /solutions/BA1F.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def skew(dna_seq): 5 | return dna_seq.count("G") - dna_seq.count("C") 6 | 7 | 8 | def minimal_skew(dna_seq): 9 | all_skew = [] 10 | min_skew = 1e6 11 | for i in range(len(dna_seq) + 1): 12 | skw = skew(dna_seq[:i]) 13 | all_skew.append(skw) 14 | if skw < min_skew: 15 | min_skew = skw 16 | 17 | idx_list = [] 18 | for i, skw in enumerate(all_skew): 19 | if skw == min_skew: 20 | idx_list.append(i) 21 | return idx_list 22 | 23 | 24 | if __name__ == "__main__": 25 | ''' 26 | Given: A DNA string Genome. 27 | Return: All integer(s) i minimizing Skew(Prefixi (Text)) over all values of i (from 0 to |Genome|). 28 | ''' 29 | input_lines = sys.stdin.read().splitlines() 30 | Genome = input_lines[0] 31 | 32 | print(" ".join(map(str, minimal_skew(Genome)))) 33 | -------------------------------------------------------------------------------- /solutions/BA3D.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import OrderedDict 3 | 4 | 5 | def deBruijn_graph(text, k): 6 | adj_list = OrderedDict() 7 | for i in range(len(text) - k + 2): 8 | adj_list[text[i:i + k - 1]] = set() 9 | 10 | for i in range(len(text) - k + 1): 11 | pattern = text[i:i + k - 1] 12 | pattern2 = text[i + 1:i + k] 13 | adj_list[pattern].add(pattern2) 14 | 15 | return adj_list 16 | 17 | 18 | if __name__ == "__main__": 19 | ''' 20 | Given: An integer k and a string Text. 21 | Return:DeBruijnk(Text), in the form of an adjacency list. 22 | ''' 23 | input_lines = sys.stdin.read().splitlines() 24 | k = int(input_lines[0]) 25 | Text = input_lines[1] 26 | 27 | adj_list = deBruijn_graph(Text, k) 28 | for key, val in adj_list.items(): 29 | if val: 30 | print(key + ' -> ' + ",".join(val)) 31 | -------------------------------------------------------------------------------- /solutions/BA6F.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def chromosome_to_cycle(Chromosome): 5 | Nodes = [] 6 | for block in Chromosome: 7 | if block > 0: 8 | Nodes.append(2 * block - 1) 9 | Nodes.append(2 * block) 10 | else: 11 | Nodes.append(-2 * block) 12 | Nodes.append(-2 * block - 1) 13 | return Nodes 14 | 15 | 16 | if __name__ == "__main__": 17 | ''' 18 | Given: A chromosome Chromosome containing n synteny blocks. 19 | Return: The sequence Nodes of integers between 1 and 2n resulting from applying ChromosomeToCycle to Chromosome. 20 | ''' 21 | Chromosome = sys.stdin.readline().strip() 22 | Chromosome = Chromosome.replace("(", "").replace(")", "") 23 | Chromosome = [int(x) for x in Chromosome.split()] 24 | 25 | cycle = chromosome_to_cycle(Chromosome) 26 | print("(" + " ".join(map(str, cycle)) + ")") -------------------------------------------------------------------------------- /solutions/BA11C.py: -------------------------------------------------------------------------------- 1 | import sys 2 | aa_table = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131, 'L': 113, 3 | 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163, 'X': 4, 'Z': 5} 4 | 5 | 6 | def PeptideVector(peptide): 7 | prefixMasses = [] 8 | for i in range(len(peptide)): 9 | prefix = peptide[:i+1] 10 | mass = 0 11 | for aa in prefix: 12 | mass += aa_table[aa] 13 | prefixMasses.append(mass) 14 | 15 | vector = [0] * prefixMasses[-1] 16 | for mass in prefixMasses: 17 | vector[mass - 1] = 1 18 | return vector 19 | 20 | 21 | if __name__ == "__main__": 22 | ''' 23 | Given: A peptide P. 24 | Return: The peptide vector of P. 25 | ''' 26 | peptide = sys.stdin.read().rstrip() 27 | print(' '.join(str(x) for x in PeptideVector(peptide))) -------------------------------------------------------------------------------- /solutions/BA1K.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA1A import count_pattern 3 | 4 | 5 | def generate_all_kmers(k): 6 | if k == 1: 7 | return ["A", "C", "G", "T"] 8 | kmers = [] 9 | suff = generate_all_kmers(k - 1) 10 | for nuc in ["A", "C", "G", "T"]: 11 | for s in suff: 12 | kmers.append(nuc + s) 13 | return kmers 14 | 15 | 16 | def frequency_array(text, k): 17 | all_kmers = generate_all_kmers(k) 18 | freq_arr = [] 19 | for kmer in all_kmers: 20 | freq_arr.append(count_pattern(text, kmer)) 21 | return freq_arr 22 | 23 | 24 | if __name__ == "__main__": 25 | ''' 26 | Given: A DNA string Text and an integer k. 27 | Return: The frequency array of k-mers in Text. 28 | ''' 29 | input_lines = sys.stdin.read().splitlines() 30 | Text = input_lines[0] 31 | k = int(input_lines[1]) 32 | 33 | print(" ".join(map(str, frequency_array(Text, k)))) 34 | -------------------------------------------------------------------------------- /solutions/BA2H.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA1G import hamming_dist 3 | 4 | 5 | def distance(pattern, text): 6 | k = len(pattern) 7 | min_dist = float("Inf") 8 | for i in range(len(text) - k + 1): 9 | dist = hamming_dist(text[i:i + k], pattern) 10 | if dist < min_dist: 11 | min_dist = dist 12 | return min_dist 13 | 14 | 15 | def DistanceBetweenPatternAndStrings(dna_list, pattern): 16 | dist = 0 17 | for dna in dna_list: 18 | dist += distance(pattern, dna) 19 | return dist 20 | 21 | 22 | if __name__ == "__main__": 23 | ''' 24 | Given: A DNA string Pattern and a collection of DNA strings Dna. 25 | Return: DistanceBetweenPatternAndStrings(Pattern, Dna). 26 | ''' 27 | input_lines = sys.stdin.read().splitlines() 28 | Pattern = input_lines[0] 29 | DNA_list = input_lines[1].split() 30 | 31 | print(DistanceBetweenPatternAndStrings(DNA_list, Pattern)) 32 | -------------------------------------------------------------------------------- /solutions/BA3I.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA3E import deBruijn_graph_kmers 3 | from BA3F import Eulerian_cycle 4 | 5 | 6 | def k_universal_circular_string(k): 7 | kmers = [] 8 | for i in range(2 ** k): 9 | kmer = str(bin(i))[2:] 10 | if len(kmer) != k: 11 | kmer = '0' * (k - len(kmer)) + kmer 12 | kmers.append(kmer) 13 | 14 | adj_list = deBruijn_graph_kmers(kmers) 15 | cycle = Eulerian_cycle(adj_list) 16 | 17 | cycle = cycle[:len(cycle) - k + 1] 18 | string = cycle[0][:-1] 19 | for r in cycle: 20 | string += r[-1] 21 | return string 22 | 23 | 24 | if __name__ == "__main__": 25 | ''' 26 | Given: An integer k. 27 | Return: A k-universal circular string. (If multiple answers exist, you may return any one.) 28 | ''' 29 | input_lines = sys.stdin.read().splitlines() 30 | k = int(input_lines[0]) 31 | 32 | print(k_universal_circular_string(k)) 33 | -------------------------------------------------------------------------------- /solutions/BA4J.py: -------------------------------------------------------------------------------- 1 | import sys 2 | MASS_TABLE = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131, 3 | 'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163} 4 | 5 | 6 | def LinearSpectrum(Peptide): 7 | PrefixMass = [0] 8 | for i in range(len(Peptide)): 9 | temp = PrefixMass[i] + MASS_TABLE[Peptide[i]] 10 | PrefixMass.append(temp) 11 | LinearSpectrum = [0] 12 | for i in range(len(Peptide)): 13 | for j in range(i + 1, len(Peptide) + 1): 14 | LinearSpectrum.append(PrefixMass[j] - PrefixMass[i]) 15 | LinearSpectrum.sort() 16 | return LinearSpectrum 17 | 18 | 19 | if __name__ == "__main__": 20 | ''' 21 | Given: An amino acid string Peptide. 22 | Return: The linear spectrum of Peptide. 23 | ''' 24 | Peptide = sys.stdin.readline().strip() 25 | 26 | print(" ".join(map(str, LinearSpectrum(Peptide)))) 27 | -------------------------------------------------------------------------------- /solutions/BA1N.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA1G import hamming_dist 3 | 4 | 5 | def neighbors(pattern, d): 6 | if d == 0: 7 | return pattern 8 | if len(pattern) == 1: 9 | return ['A', 'C', 'G', 'T'] 10 | neighborhood = set() 11 | suffix_neighbors = neighbors(pattern[1:], d) 12 | for suffix in suffix_neighbors: 13 | if hamming_dist(pattern[1:], suffix) < d: 14 | for nuc in ['A', 'C', 'G', 'T']: 15 | neighborhood.add(nuc + suffix) 16 | else: 17 | neighborhood.add(pattern[0] + suffix) 18 | return neighborhood 19 | 20 | 21 | if __name__ == "__main__": 22 | ''' 23 | Given: A DNA string Pattern and an integer d. 24 | Return: The collection of strings Neighbors(Pattern, d). 25 | ''' 26 | input_lines = sys.stdin.read().splitlines() 27 | Pattern = input_lines[0] 28 | d = int(input_lines[1]) 29 | 30 | result = neighbors(Pattern, d) 31 | for r in result: 32 | print(r) 33 | -------------------------------------------------------------------------------- /solutions/BA6A.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def kSortingReversal(P, k): 5 | j = k 6 | while P[j] != k + 1 and P[j] != -(k + 1): 7 | j += 1 8 | P[k:j+1] = list(map(lambda x: -x, P[k:j+1][::-1])) 9 | return P 10 | 11 | 12 | def GreedySorting(P): 13 | reversals = [] 14 | for k in range(len(P)): 15 | while P[k] != k + 1: 16 | P = kSortingReversal(P, k) 17 | reversals.append(list(P)) 18 | return reversals 19 | 20 | 21 | if __name__ == "__main__": 22 | ''' 23 | Given: A signed permutation P. 24 | Return: The sequence of permutations corresponding to applying GreedySorting to P, ending with the identity 25 | permutation. 26 | ''' 27 | P = sys.stdin.readline().strip() 28 | P = P.replace("(", "").replace(")", "") 29 | P = [int(x) for x in P.split()] 30 | 31 | result = GreedySorting(P) 32 | for res in result: 33 | print("(" + " ".join(["+" + str(x) if x > 0 else str(x) for x in res]) + ")") 34 | -------------------------------------------------------------------------------- /solutions/BA6H.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA6F import chromosome_to_cycle 3 | 4 | 5 | def colored_edges(P): 6 | Edges = list() 7 | for chromosome in P: 8 | Nodes = chromosome_to_cycle(chromosome) 9 | for j in range(1, len(Nodes), 2): 10 | if j != len(Nodes) - 1: 11 | Edges.append([Nodes[j], Nodes[j + 1]]) 12 | else: 13 | Edges.append([Nodes[j], Nodes[0]]) 14 | return Edges 15 | 16 | 17 | if __name__ == "__main__": 18 | ''' 19 | Given: A genome P. 20 | Return: The collection of colored edges in the genome graph of P in the form (x, y). 21 | ''' 22 | P = sys.stdin.readline().strip() 23 | P = P[1:-1] 24 | P = P.split(')(') 25 | 26 | for i in range(len(P)): 27 | P[i] = [int(x) for x in P[i].split(' ')] 28 | 29 | result = colored_edges(P) 30 | for j in range(len(result)): 31 | result[j] = '(' + ', '.join(str(i) for i in result[j]) + ')' 32 | print(', '.join(result)) 33 | -------------------------------------------------------------------------------- /solutions/BA9Q.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def PartialSuffixArray(Text, K): 5 | suffixes = [] 6 | suffix_array = [] 7 | for i in range(len(Text)): 8 | suffixes.append(Text[i:]) 9 | suffix_array.append(i) 10 | 11 | suffix_array = [x for _, x in sorted(zip(suffixes, suffix_array), key=lambda pair: pair[0])] 12 | 13 | partial_suffix_array = [(i, x) for i, x in enumerate(suffix_array) if x % K == 0] 14 | 15 | return partial_suffix_array 16 | 17 | 18 | if __name__ == '__main__': 19 | ''' 20 | Given: A string Text and a positive integer K. 21 | Return: SuffixArrayK(Text), in the form of a list of ordered pairs (i, SuffixArray(i)) for all nonempty entries in 22 | the partial suffix array. 23 | ''' 24 | input_lines = sys.stdin.read().splitlines() 25 | Text = input_lines[0] 26 | K = int(input_lines[1]) 27 | 28 | partial_suffix_array = PartialSuffixArray(Text, K) 29 | for elem in partial_suffix_array: 30 | print(','.join(map(str, elem))) 31 | -------------------------------------------------------------------------------- /solutions/BA4C.py: -------------------------------------------------------------------------------- 1 | import sys 2 | MASS_TABLE = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131, 3 | 'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163} 4 | 5 | 6 | def cyclospectrum(peptide): 7 | full_mass = 0 8 | for aa in peptide: 9 | full_mass += MASS_TABLE[aa] 10 | spec = [0, full_mass] 11 | temp = peptide + peptide 12 | for k in range(1, len(peptide)): 13 | for i in range(len(peptide)): 14 | subpeptide = temp[i:i + k] 15 | mass = 0 16 | for aa in subpeptide: 17 | mass += MASS_TABLE[aa] 18 | spec.append(mass) 19 | spec.sort() 20 | return spec 21 | 22 | 23 | if __name__ == "__main__": 24 | ''' 25 | Given: An amino acid string Peptide. 26 | Return: Cyclospectrum(Peptide). 27 | ''' 28 | Peptide = sys.stdin.readline().strip() 29 | 30 | print(" ".join(map(str, cyclospectrum(Peptide)))) 31 | -------------------------------------------------------------------------------- /solutions/BA4L.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA4K import linear_score 3 | 4 | 5 | def Trim(leaderboard, spectrum, N): 6 | if len(leaderboard) <= N: 7 | return leaderboard 8 | 9 | scores = {} 10 | for i, peptide in enumerate(leaderboard): 11 | scores[i] = linear_score(peptide, spectrum) 12 | 13 | sorted_scores = sorted(scores.values(), reverse=True) 14 | threshold = sorted_scores[N - 1] 15 | 16 | return [leaderboard[idx] for idx, score in scores.items() if score >= threshold] 17 | 18 | 19 | if __name__ == "__main__": 20 | ''' 21 | Given: A leaderboard of linear peptides Leaderboard, a linear spectrum Spectrum, and an integer N. 22 | Return: The top N peptides from Leaderboard scored against Spectrum. Remember to use LinearScore. 23 | ''' 24 | input_lines = sys.stdin.read().splitlines() 25 | Leaderboard = input_lines[0].split() 26 | Spectrum = [int(x) for x in input_lines[1].split()] 27 | N = int(input_lines[2]) 28 | 29 | print(" ".join(Trim(Leaderboard, Spectrum, N))) 30 | -------------------------------------------------------------------------------- /solutions/BA4B.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA1C import rev_comp 3 | from BA4A import translate_rna 4 | 5 | 6 | def substrings_encoding_peptide(dna, peptide): 7 | k = len(peptide) * 3 8 | result = [] 9 | for i in range(len(dna) - k + 1): 10 | substring = dna[i:i + k] 11 | revc_substring = rev_comp(substring) 12 | 13 | substring = substring.replace("T", "U") 14 | revc_substring = revc_substring.replace("T", "U") 15 | 16 | if translate_rna(substring) == peptide or translate_rna(revc_substring) == peptide: 17 | result.append(substring.replace("U", "T")) 18 | return result 19 | 20 | 21 | if __name__ == "__main__": 22 | ''' 23 | Given: A DNA string Text and an amino acid string Peptide. 24 | Return: All substrings of Text encoding Peptide (if any such substrings exist). 25 | ''' 26 | input_lines = sys.stdin.read().splitlines() 27 | Text = input_lines[0] 28 | Peptide = input_lines[1] 29 | 30 | print("\n".join(substrings_encoding_peptide(Text, Peptide))) 31 | -------------------------------------------------------------------------------- /solutions/BA9K.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def LastToFirst(BWT, i): 5 | counts = {} 6 | BWT_list = [] 7 | for char in BWT: 8 | if char not in counts.keys(): 9 | counts[char] = 1 10 | else: 11 | counts[char] += 1 12 | tmp = char + str(counts[char]) 13 | BWT_list.append(tmp) 14 | 15 | first_col = sorted(BWT_list, key=lambda x: x[0]) 16 | 17 | last_to_first = [] 18 | for sym_last in BWT_list: 19 | for idx, sym_first in enumerate(first_col): 20 | if sym_first == sym_last: 21 | last_to_first.append(idx) 22 | 23 | return last_to_first[i] 24 | 25 | 26 | if __name__ == "__main__": 27 | ''' 28 | Given: A string Transform and an integer i. 29 | Return: The position LastToFirst(i) in FirstColumn in the Burrows-Wheeler matrix if LastColumn = Transform. 30 | ''' 31 | input_lines = sys.stdin.read().splitlines() 32 | Transform = input_lines[0] 33 | i = int(input_lines[1]) 34 | 35 | print(LastToFirst(Transform, i)) 36 | -------------------------------------------------------------------------------- /solutions/BA5G.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def edit_distance(str1, str2): 5 | str1 = "-" + str1 6 | str2 = "-" + str2 7 | 8 | score_mat = [[0 for _ in range(len(str2))] for _ in range(len(str1))] 9 | 10 | for j in range(len(str2)): 11 | score_mat[0][j] = j 12 | 13 | for i in range(len(str1)): 14 | score_mat[i][0] = i 15 | 16 | for i in range(1, len(str1)): 17 | for j in range(1, len(str2)): 18 | score1 = score_mat[i - 1][j - 1] + (1 if str1[i] != str2[j] else 0) 19 | score2 = score_mat[i - 1][j] + 1 20 | score3 = score_mat[i][j - 1] + 1 21 | score_mat[i][j] = min(score1, score2, score3) 22 | 23 | return score_mat[len(str1) - 1][len(str2) - 1] 24 | 25 | 26 | if __name__ == "__main__": 27 | ''' 28 | Given: Two amino acid strings. 29 | Return: The edit distance between these strings. 30 | ''' 31 | input_lines = sys.stdin.read().splitlines() 32 | string1 = input_lines[0] 33 | string2 = input_lines[1] 34 | 35 | print(edit_distance(string1, string2)) 36 | -------------------------------------------------------------------------------- /solutions/BA2C.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def probability(pattern, profile): 5 | indices = {"A": 0, "C": 1, "G": 2, "T": 3} 6 | prob = 1 7 | for i, nuc in enumerate(pattern): 8 | prob *= profile[indices[nuc]][i] 9 | return prob 10 | 11 | 12 | def profile_most_probable_kmer(text, profile, k): 13 | max_prob = 0 14 | for i in range(len(text) - k + 1): 15 | kmer = text[i:i + k] 16 | prob = probability(kmer, profile) 17 | if prob > max_prob: 18 | max_prob = prob 19 | result = kmer 20 | return result 21 | 22 | 23 | if __name__ == "__main__": 24 | ''' 25 | Given: A string Text, an integer k, and a 4 × k matrix Profile. 26 | Return: A Profile-most probable k-mer in Text. (If multiple answers exist, you may return any one.) 27 | ''' 28 | input_lines = sys.stdin.read().splitlines() 29 | Text = input_lines[0] 30 | k = int(input_lines[1]) 31 | profile = [[float(x) for x in line.split()] for line in input_lines[2:]] 32 | 33 | print(profile_most_probable_kmer(Text, profile, k)) 34 | -------------------------------------------------------------------------------- /solutions/BA7B.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def limb_length(distance_mat, j, n_leaves): 5 | other_leaves = [i for i in range(n_leaves) if i != j] 6 | 7 | ll = [] 8 | for idx_i in range(len(other_leaves) - 1): 9 | for idx_k in range(idx_i, len(other_leaves)): 10 | i = other_leaves[idx_i] 11 | k = other_leaves[idx_k] 12 | ll.append((distance_mat[i][j] + distance_mat[j][k] - distance_mat[i][k]) / 2) 13 | return min(ll) 14 | 15 | 16 | if __name__ == "__main__": 17 | ''' 18 | Given: An integer n, followed by an integer j between 0 and n - 1, followed by a space-separated additive distance 19 | matrix D (whose elements are integers). 20 | Return: The limb length of the leaf in Tree(D) corresponding to row j of this distance matrix (use 0-based indexing) 21 | ''' 22 | input_lines = sys.stdin.read().splitlines() 23 | n = int(input_lines[0]) 24 | j = int(input_lines[1]) 25 | distance_mat = [[int(x) for x in line.split()] for line in input_lines[2:]] 26 | 27 | print(limb_length(distance_mat, j, n)) 28 | -------------------------------------------------------------------------------- /solutions/BA4H.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def convolution(spectrum): 5 | spectrum.sort() 6 | conv = [] 7 | for i in range(len(spectrum) - 1): 8 | for j in range(i, len(spectrum)): 9 | if spectrum[j] - spectrum[i] != 0: 10 | conv.append(spectrum[j] - spectrum[i]) 11 | 12 | freq_dict = {} 13 | for mass in set(conv): 14 | freq_dict[mass] = conv.count(mass) 15 | 16 | sorted_mass_list = [k for k, _ in sorted(freq_dict.items(), key=lambda item: item[1], reverse=True)] 17 | conv = [] 18 | for mass in sorted_mass_list: 19 | conv += [mass] * freq_dict[mass] 20 | return conv 21 | 22 | 23 | if __name__ == "__main__": 24 | ''' 25 | Given: A collection of integers Spectrum. 26 | Return: The list of elements in the convolution of Spectrum in decreasing order of their multiplicities. If an 27 | element has multiplicity k, it should appear exactly k times. 28 | ''' 29 | spectrum = [int(x) for x in sys.stdin.readline().strip().split()] 30 | 31 | print(" ".join(map(str, convolution(spectrum)))) 32 | -------------------------------------------------------------------------------- /solutions/BA11A.py: -------------------------------------------------------------------------------- 1 | import sys 2 | MASS_TABLE = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131, 3 | 'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163} 4 | INV_MASS_TABLE = {v: k for k, v in MASS_TABLE.items()} 5 | 6 | 7 | def spectrum_graph(spectrum): 8 | adj_list = [] 9 | for i in range(len(spectrum)): 10 | for j in range(i, len(spectrum)): 11 | if (spectrum[j] - spectrum[i]) in INV_MASS_TABLE: 12 | adj_list.append([spectrum[i], spectrum[j], INV_MASS_TABLE[spectrum[j] - spectrum[i]]]) 13 | return adj_list 14 | 15 | 16 | if __name__ == "__main__": 17 | ''' 18 | Given: A space-delimited list of integers Spectrum. 19 | Return: Graph(Spectrum). 20 | ''' 21 | Spectrum = sys.stdin.readline().strip() 22 | Spectrum = [int(x) for x in Spectrum.split()] 23 | Spectrum = [0] + Spectrum 24 | 25 | adj_list = spectrum_graph(Spectrum) 26 | for edge in adj_list: 27 | print(str(edge[0]) + '->' + str(edge[1]) + ':' + str(edge[2])) -------------------------------------------------------------------------------- /solutions/BA11D.py: -------------------------------------------------------------------------------- 1 | import sys 2 | aa_table = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131, 3 | 'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163} 4 | mass_table = {v: k for k, v in aa_table.items()} 5 | mass_table[4] = 'X' 6 | mass_table[5] = 'Z' 7 | 8 | 9 | def ConvertPeptideVector(vector): 10 | prefixMasses = [] 11 | for i in range(len(vector)): 12 | if vector[i] == 1: 13 | prefixMasses.append(i + 1) 14 | 15 | peptide = mass_table[prefixMasses[0]] 16 | for i in range(1, len(prefixMasses)): 17 | mass = prefixMasses[i] - prefixMasses[i - 1] 18 | peptide += mass_table[mass] 19 | 20 | return peptide 21 | 22 | 23 | if __name__ == "__main__": 24 | ''' 25 | Given: A space-delimited binary vector P. 26 | Return: A peptide whose binary peptide vector matches P. For masses with more than one amino acid, any choice may 27 | be used. 28 | ''' 29 | vector = [int(x) for x in sys.stdin.read().rstrip().split()] 30 | 31 | print(ConvertPeptideVector(vector)) -------------------------------------------------------------------------------- /solutions/BA1E.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def most_freq_kmers(text, k, t): 5 | count_dict = {} 6 | for i in range(len(text) - k + 1): 7 | kmer = text[i:i + k] 8 | if kmer not in count_dict: 9 | count_dict[kmer] = 1 10 | else: 11 | count_dict[kmer] += 1 12 | 13 | max_freq = max(count_dict.values()) 14 | if max_freq < t: 15 | return [] 16 | return [kmer for kmer, count in count_dict.items() if count == max_freq] 17 | 18 | 19 | def find_clumping_kmers(text, k, L, t): 20 | result = set() 21 | for i in range(len(text) - L + 1): 22 | window = text[i:i + L] 23 | win_kmers = most_freq_kmers(window, k, t) 24 | for kmer in win_kmers: 25 | result.add(kmer) 26 | return result 27 | 28 | 29 | if __name__ == "__main__": 30 | ''' 31 | Given: A string Genome, and integers k, L, and t. 32 | Return: All distinct k-mers forming (L, t)-clumps in Genome. 33 | ''' 34 | input_lines = sys.stdin.read().splitlines() 35 | Genome = input_lines[0] 36 | k, L, t = [int(x) for x in input_lines[1].split()] 37 | 38 | print(" ".join(find_clumping_kmers(Genome, k, L, t))) 39 | -------------------------------------------------------------------------------- /solutions/BA2A.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA1G import hamming_dist 3 | from BA1N import neighbors 4 | 5 | 6 | def count_approx_pattern(text, pattern, d): 7 | k = len(pattern) 8 | count = 0 9 | for i in range(len(text) - k + 1): 10 | if hamming_dist(text[i:i+k], pattern) <= d: 11 | count += 1 12 | return count 13 | 14 | 15 | def motif_enumeration(dna_list, k, d): 16 | patterns = set() 17 | for dna in dna_list: 18 | for i in range(len(dna) - k + 1): 19 | pattern = dna[i:i + k] 20 | neighborhood = neighbors(pattern, d) 21 | for neighbor in neighborhood: 22 | if all(count_approx_pattern(pat, neighbor, d) > 0 for pat in dna_list): 23 | patterns.add(neighbor) 24 | return patterns 25 | 26 | 27 | if __name__ == "__main__": 28 | ''' 29 | Given: Integers k and d, followed by a collection of strings Dna. 30 | Return: All (k, d)-motifs in Dna. 31 | ''' 32 | input_lines = sys.stdin.read().splitlines() 33 | k, d = [int(x) for x in input_lines[0].split()] 34 | DNA_list = input_lines[1:] 35 | 36 | print(" ".join(motif_enumeration(DNA_list, k, d))) 37 | -------------------------------------------------------------------------------- /solutions/BA2B.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA1G import hamming_dist 3 | from BA1K import generate_all_kmers 4 | 5 | 6 | def distance(pattern, text): 7 | k = len(pattern) 8 | min_dist = float("Inf") 9 | for i in range(len(text) - k + 1): 10 | dist = hamming_dist(text[i:i + k], pattern) 11 | if dist < min_dist: 12 | min_dist = dist 13 | return min_dist 14 | 15 | 16 | def median_string(dna_list, k): 17 | all_kmers = generate_all_kmers(k) 18 | 19 | min_dist = float("Inf") 20 | for kmer in all_kmers: 21 | dist = 0 22 | for dna in dna_list: 23 | dist += distance(kmer, dna) 24 | if dist < min_dist: 25 | min_dist = dist 26 | med_str = kmer 27 | return med_str 28 | 29 | 30 | if __name__ == "__main__": 31 | ''' 32 | Given: An integer k and a collection of strings Dna. 33 | Return: A k-mer Pattern that minimizes d(Pattern, Dna) over all k-mers Pattern. (If multiple answers exist, 34 | you may return any one.) 35 | ''' 36 | input_lines = sys.stdin.read().splitlines() 37 | k = int(input_lines[0]) 38 | DNA_list = input_lines[1:] 39 | 40 | print(median_string(DNA_list, k)) 41 | -------------------------------------------------------------------------------- /solutions/BA10A.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def HiddenPathProbability(hidden_path, transition_matrix): 5 | # initial probability 6 | prob_path = .5 7 | 8 | # Calculate ∏ (i: 1 -> n) transition𝜋𝑖−1,𝜋𝑖 9 | for idx in range(1, len(hidden_path)): 10 | prob_path *= transition_matrix[hidden_path[idx - 1]][hidden_path[idx]] 11 | 12 | return prob_path 13 | 14 | 15 | if __name__ == "__main__": 16 | ''' 17 | Given: A hidden path π followed by the states States and transition matrix Transition of an HMM (Σ, States, 18 | Transition, Emission). 19 | Return: The probability of this path, Pr(π). You may assume that initial probabilities are equal. 20 | ''' 21 | tmp = sys.stdin.read().splitlines() 22 | 23 | hidden_path = tmp[0] 24 | states = tmp[2].split(' ') 25 | 26 | col_syms = tmp[4].split('\t')[1:] 27 | transition_matrix = {} 28 | for i in range(5, len(tmp)): 29 | current_line = tmp[i].rstrip().split('\t') 30 | row_sym = current_line[0] 31 | transition_matrix[row_sym] = {} 32 | for j in range(1, len(current_line)): 33 | transition_matrix[row_sym][col_syms[j - 1]] = float(current_line[j]) 34 | 35 | print(HiddenPathProbability(hidden_path, transition_matrix)) -------------------------------------------------------------------------------- /solutions/BA9J.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def InverseBurrowsWheelerTransform(BWT): 5 | lenText = len(BWT) 6 | 7 | counts = {} 8 | BWT_list = [] 9 | for char in BWT: 10 | if char not in counts.keys(): 11 | counts[char] = 1 12 | else: 13 | counts[char] += 1 14 | tmp = char + str(counts[char]) 15 | BWT_list.append(tmp) 16 | 17 | first_col = sorted(BWT_list, key=lambda x: x[0]) 18 | 19 | first_row = ['$1'] 20 | for i in range(1, lenText): 21 | prev_symbol = first_row[i - 1] 22 | for BWT_idx, char in enumerate(BWT_list): 23 | if char == prev_symbol: 24 | idx = BWT_idx 25 | break 26 | first_row.append(first_col[idx]) 27 | 28 | Text = '' 29 | for i in range(1, len(first_row)): 30 | Text += ''.join(x for x in first_row[i] if not x.isdigit()) 31 | Text += '$' 32 | return Text 33 | 34 | 35 | if __name__ == "__main__": 36 | ''' 37 | Given: A string Transform (with a single "$" sign). 38 | Return: The string Text such that BWT(Text) = Transform 39 | ''' 40 | input_lines = sys.stdin.read().splitlines() 41 | BWT = input_lines[0] 42 | 43 | print(InverseBurrowsWheelerTransform(BWT)) 44 | -------------------------------------------------------------------------------- /solutions/BA6J.py: -------------------------------------------------------------------------------- 1 | def TwoBreakOnGenomeGraph(GenomeGraph, i1 , i2 , i3 , i4): 2 | if [i1, i2] in GenomeGraph: 3 | GenomeGraph.remove([i1, i2]) 4 | else: 5 | GenomeGraph.remove([i2, i1]) 6 | if [i3, i4] in GenomeGraph: 7 | GenomeGraph.remove([i3, i4]) 8 | else: 9 | GenomeGraph.remove([i4, i3]) 10 | GenomeGraph += [[i1, i3]] + [[i2, i4]] 11 | return GenomeGraph 12 | 13 | 14 | if __name__ == "__main__": 15 | ''' 16 | Given: The colored edges of a genome graph GenomeGraph, followed by indices i, i', j, and j'. 17 | Return: The colored edges of the genome graph resulting from applying the 2-break operation. 18 | ''' 19 | GenomeGraph = input().rstrip() 20 | GenomeGraph = GenomeGraph[1:-1] 21 | GenomeGraph = GenomeGraph.split('), (') 22 | for i in range(len(GenomeGraph)): 23 | GenomeGraph[i] = GenomeGraph[i].split(', ') 24 | for j in range(len(GenomeGraph[i])): 25 | GenomeGraph[i][j] = int(GenomeGraph[i][j]) 26 | i1, i2, i3, i4 = map(int, input().rstrip().split(', ')) 27 | result = TwoBreakOnGenomeGraph(GenomeGraph, i1, i2, i3, i4) 28 | for j in range(len(result)): 29 | result[j] = '(' + ', '.join(str(i) for i in result[j]) + ')' 30 | print(', '.join(result)) -------------------------------------------------------------------------------- /solutions/BA5C.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def LCS(v, w): 5 | v = '-' + v 6 | w = '-' + w 7 | 8 | S = [[0 for _ in range(len(w))] for _ in range(len(v))] 9 | Backtrack = [[None for _ in range(len(w))] for _ in range(len(v))] 10 | 11 | for i in range(1, len(v)): 12 | for j in range(1, len(w)): 13 | tmp = S[i - 1][j - 1] + (1 if v[i] == w[j] else 0) 14 | S[i][j] = max(S[i - 1][j], S[i][j - 1], tmp) 15 | 16 | if S[i][j] == S[i - 1][j]: 17 | Backtrack[i][j] = "up" 18 | elif S[i][j] == S[i][j - 1]: 19 | Backtrack[i][j] = "left" 20 | else: 21 | Backtrack[i][j] = "diag" 22 | 23 | LCS = "" 24 | while i > 0 and j > 0: 25 | if Backtrack[i][j] == "diag": 26 | LCS = v[i] + LCS 27 | i -= 1 28 | j -= 1 29 | elif Backtrack[i][j] == "left": 30 | j -= 1 31 | else: 32 | i -= 1 33 | 34 | return LCS 35 | 36 | 37 | if __name__ == "__main__": 38 | ''' 39 | Given: Two strings. 40 | Return: A longest common subsequence of these strings. 41 | ''' 42 | input_lines = sys.stdin.read().splitlines() 43 | s = input_lines[0] 44 | t = input_lines[1] 45 | 46 | print(LCS(s,t)) -------------------------------------------------------------------------------- /solutions/BA10B.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def ProbabilityOutcomeGivenPath(x, hidden_path, emission_matrix): 5 | emission_prob = 1 6 | # Calculate ∏ (i: 1 -> n) emission𝜋𝑖(𝑥𝑖) 7 | for i in range(len(x)): 8 | emission_prob *= emission_matrix[hidden_path[i]][x[i]] 9 | return emission_prob 10 | 11 | 12 | if __name__ == "__main__": 13 | ''' 14 | Given: A string x, followed by the alphabet Σ from which x was constructed, followed by a hidden path π, followed by 15 | the states States and emission matrix Emission of an HMM (Σ, States, Transition, Emission). 16 | Return: The conditional probability Pr(x|π) that string x will be emitted by the HMM given the hidden path π. 17 | ''' 18 | input_lines = sys.stdin.read().splitlines() 19 | 20 | x = input_lines[0] 21 | alphabet = input_lines[2].split(' ') 22 | hidden_path = input_lines[4] 23 | 24 | col_syms = input_lines[8].split() 25 | emission_matrix = {} 26 | for line in input_lines[9:]: 27 | current_line = line.split() 28 | row_sym = current_line[0] 29 | emission_matrix[row_sym] = {} 30 | for j in range(1, len(current_line)): 31 | emission_matrix[row_sym][col_syms[j - 1]] = float(current_line[j]) 32 | 33 | print(ProbabilityOutcomeGivenPath(x, hidden_path, emission_matrix)) -------------------------------------------------------------------------------- /solutions/BA5B.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def ManhattanTouristProblem(n, m, Down, Right): 5 | S = [[0 for _ in range(m + 1)] for _ in range(n + 1)] 6 | 7 | for i in range(1, n + 1): 8 | S[i][0] = S[i - 1][0] + Down[i - 1][0] 9 | for j in range(1, m + 1): 10 | S[0][j] = S[0][j - 1] + Right[0][j - 1] 11 | 12 | for i in range(1, n + 1): 13 | for j in range(1, m + 1): 14 | S[i][j] = max(S[i - 1][j] + Down[i - 1][j], S[i][j - 1] + Right[i][j - 1]) 15 | 16 | return S[n][m] 17 | 18 | 19 | if __name__ == "__main__": 20 | ''' 21 | Given: Integers n and m, followed by an n × (m+1) matrix Down and an (n+1) × m matrix Right. The two matrices are 22 | separated by the "-" symbol. 23 | Return: The length of a longest path from source (0, 0) to sink (n, m) in the n × m rectangular grid whose edges are 24 | defined by the matrices Down and Right. 25 | ''' 26 | input_lines = sys.stdin.read().splitlines() 27 | n, m = [int(x) for x in input_lines[0].split()] 28 | 29 | Down = [] 30 | for idx in range(1, n + 1): 31 | Down.append([int(x) for x in input_lines[idx].split()]) 32 | 33 | Right = [] 34 | for idx in range(n + 2, len(input_lines)): 35 | Right.append([int(x) for x in input_lines[idx].split()]) 36 | 37 | 38 | print(ManhattanTouristProblem(m, m, Down, Right)) 39 | -------------------------------------------------------------------------------- /solutions/BA6I.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA6G import cycle_to_chromosome 3 | 4 | 5 | def graph_to_genome(GenomeGraph): 6 | P = [] 7 | Cycles = [] 8 | temp = [] 9 | for i in range(len(GenomeGraph)): 10 | if i == len(GenomeGraph) - 1: 11 | temp += GenomeGraph[i] 12 | Cycles.append(temp) 13 | elif GenomeGraph[i][1] == GenomeGraph[i + 1][0] + 1 or GenomeGraph[i][1] == GenomeGraph[i + 1][0] - 1: 14 | temp += GenomeGraph[i] 15 | else: 16 | temp += GenomeGraph[i] 17 | Cycles.append(temp) 18 | temp = [] 19 | for Cycle in Cycles: 20 | Chromosome = cycle_to_chromosome([Cycle[-1]] + Cycle[:-1]) 21 | P.append(Chromosome) 22 | return P 23 | 24 | 25 | if __name__ == "__main__": 26 | ''' 27 | Given: The colored edges of a genome graph. 28 | Return: A genome corresponding to the genome graph. 29 | ''' 30 | Edges = sys.stdin.readline().strip() 31 | Edges = Edges.split('), (') 32 | 33 | for i in range(len(Edges)): 34 | Edges[i] = Edges[i].replace("(", "").replace(")", "") 35 | Edges[i] = [int(x) for x in Edges[i].split(", ")] 36 | 37 | result = graph_to_genome(Edges) 38 | for j in range(len(result)): 39 | result[j] = '(' + ' '.join(('+' if i > 0 else '') + str(i) for i in result[j]) + ')' 40 | print(''.join(result)) 41 | -------------------------------------------------------------------------------- /solutions/BA6E.py: -------------------------------------------------------------------------------- 1 | def reverse_comp(Seq): 2 | return Seq[::-1].translate(Seq.maketrans('ATCG', 'TAGC')) 3 | 4 | 5 | def SharedKmers(k, seq1, seq2): 6 | result = [] 7 | seq1dict = {} 8 | for i in range(len(seq1) - k + 1): 9 | key = seq1[i:i+k] 10 | if key in seq1dict.keys(): 11 | seq1dict[key].append(i) 12 | elif reverse_comp(key) in seq1dict.keys(): 13 | seq1dict[reverse_comp(key)].append(i) 14 | else: 15 | seq1dict[key] = [i] 16 | for j in range(len(seq2) - k + 1): 17 | sub2 = seq2[j:j+k] 18 | if sub2 in seq1dict.keys(): 19 | for pos in seq1dict[sub2]: 20 | result.append([pos, j]) 21 | elif reverse_comp(sub2) in seq1dict.keys(): 22 | for pos in seq1dict[reverse_comp(sub2)]: 23 | result.append([pos, j]) 24 | return result 25 | 26 | 27 | if __name__ == "__main__": 28 | ''' 29 | Given: An integer k and two strings. 30 | Return: All k-mers shared by these strings, in the form of ordered pairs (x, y) corresponding to starting positions 31 | of these k-mers in the respective strings. 32 | ''' 33 | k = int(input().rstrip()) 34 | seq1 = input().rstrip() 35 | seq2 = input().rstrip() 36 | result = SharedKmers(k, seq1, seq2) 37 | for r in result: 38 | print('(' + ', '.join(map(str, r)) + ')') 39 | -------------------------------------------------------------------------------- /solutions/BA9D.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from Tree_Trie_classes import Tree 3 | 4 | 5 | def backtrace_path_from_node(tree, node): 6 | # if root is reached, stop 7 | if node.label == 0: 8 | return '' 9 | 10 | for edge in tree.all_edges: 11 | if edge.target_node == node: 12 | incoming_edge = edge 13 | break 14 | 15 | path_substring = Text[incoming_edge.position: incoming_edge.position + incoming_edge.length] 16 | path_substring = backtrace_path_from_node(tree, incoming_edge.from_node) + path_substring 17 | 18 | return path_substring 19 | 20 | 21 | def LongestRepeat(Text): 22 | suffix_tree = Tree() 23 | suffix_tree.PopulateSuffixTree(Text + '$') 24 | 25 | ## Find deepest internal node 26 | max_dep = -1 27 | for node in suffix_tree.all_nodes: 28 | if len(node.edges) != 0 and node.depth > max_dep: 29 | max_dep = node.depth 30 | max_dep_node = node 31 | 32 | longest_substring = backtrace_path_from_node(suffix_tree, max_dep_node) 33 | return longest_substring 34 | 35 | 36 | if __name__ == "__main__": 37 | ''' 38 | Given: A string Text. 39 | Return: A longest substring of Text that appears in Text more than once. (Multiple solutions may exist, in which 40 | case you may return any one.) 41 | ''' 42 | Text = sys.stdin.read().rstrip() 43 | print(LongestRepeat(Text)) -------------------------------------------------------------------------------- /solutions/BA3G.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA3F import parse_adj_list 3 | from BA3F import Eulerian_cycle 4 | 5 | 6 | def Eulerian_path(adj_list): 7 | deg_diffs = {} 8 | for source, targets in adj_list.items(): 9 | if source in deg_diffs: 10 | deg_diffs[source] += len(targets) 11 | else: 12 | deg_diffs[source] = len(targets) 13 | for target in targets: 14 | if target in deg_diffs: 15 | deg_diffs[target] -= 1 16 | else: 17 | deg_diffs[target] = -1 18 | 19 | to_add_s = [node for node, diff in deg_diffs.items() if diff == -1][0] 20 | to_add_t = [node for node, diff in deg_diffs.items() if diff == 1][0] 21 | if to_add_s in adj_list: 22 | adj_list[to_add_s].append(to_add_t) 23 | else: 24 | adj_list[to_add_s] = [to_add_t] 25 | 26 | cycle = Eulerian_cycle(adj_list) 27 | idx = 0 28 | while True: 29 | if cycle[idx] == to_add_s and cycle[idx + 1] == to_add_t: 30 | break 31 | idx += 1 32 | return cycle[idx + 1:] + cycle[1:idx + 1] 33 | 34 | 35 | if __name__ == "__main__": 36 | ''' 37 | Given: A directed graph that contains an Eulerian path, where the graph is given in the form of an adjacency list. 38 | Return: An Eulerian path in this graph. 39 | ''' 40 | input_lines = sys.stdin.read().splitlines() 41 | Adj_list = parse_adj_list(input_lines) 42 | 43 | print("->".join(Eulerian_path(Adj_list))) 44 | -------------------------------------------------------------------------------- /solutions/BA1I.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA1G import hamming_dist 3 | 4 | 5 | def neighbors(pattern, d): 6 | if d == 0: 7 | return pattern 8 | if len(pattern) == 1: 9 | return ['A', 'C', 'G', 'T'] 10 | neighborhood = set() 11 | suffix_neighbors = neighbors(pattern[1:], d) 12 | for suffix in suffix_neighbors: 13 | if hamming_dist(pattern[1:], suffix) < d: 14 | for nuc in ['A', 'C', 'G', 'T']: 15 | neighborhood.add(nuc + suffix) 16 | else: 17 | neighborhood.add(pattern[0] + suffix) 18 | return neighborhood 19 | 20 | 21 | def most_freq_kmers_mismatch(text, k, d): 22 | count_dict = {} 23 | for i in range(len(text) - k + 1): 24 | kmer = text[i:i+k] 25 | neighborhood = neighbors(kmer, d) 26 | for approx_pattern in neighborhood: 27 | if approx_pattern in count_dict: 28 | count_dict[approx_pattern] += 1 29 | else: 30 | count_dict[approx_pattern] = 1 31 | max_freq = max(count_dict.values()) 32 | return [kmer for kmer, count in count_dict.items() if count == max_freq] 33 | 34 | 35 | if __name__ == "__main__": 36 | ''' 37 | Given: A string Text as well as integers k and d. 38 | Return: All most frequent k-mers with up to d mismatches in Text. 39 | ''' 40 | input_lines = sys.stdin.read().splitlines() 41 | Text = input_lines[0] 42 | k, d = [int(x) for x in input_lines[1].split()] 43 | 44 | print(" ".join(most_freq_kmers_mismatch(Text, k, d))) 45 | -------------------------------------------------------------------------------- /solutions/BA11H.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | aa_table = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131, 4 | 'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163} 5 | masses = list(aa_table.values()) 6 | 7 | 8 | def SpectralDictionarySize(spectral_vector, threshold, max_score): 9 | m = len(spectral_vector) 10 | 11 | Size = {} 12 | Size[0] = {} 13 | Size[0][0] = 1 14 | 15 | for t in range(1, max_score + 1): 16 | Size[0][t] = 0 17 | 18 | for i in range(1, m + 1): 19 | Size[i] = {} 20 | for t in range(max_score + 1): 21 | Size[i][t] = 0 22 | for a in masses: 23 | if (i - a) >= 0 and (t - spectral_vector[i - 1]) >= 0 and (t - spectral_vector[i - 1]) <= max_score: 24 | Size[i][t] += Size[i - a][t - spectral_vector[i - 1]] 25 | 26 | final_size = 0 27 | for t in range(threshold, max_score + 1): 28 | final_size += Size[m][t] 29 | 30 | return final_size 31 | 32 | 33 | if __name__ == "__main__": 34 | ''' 35 | Given: A spectral vector Spectrum', an integer threshold, and an integer max_score. 36 | Return: The size of the dictionary Dictionarythreshold(Spectrum'). 37 | ''' 38 | tmp = sys.stdin.read().splitlines() 39 | spectral_vector = [int(x) for x in tmp[0].rstrip().split(' ')] 40 | threshold = int(tmp[1]) 41 | max_score = int(tmp[2]) 42 | 43 | print(SpectralDictionarySize(spectral_vector, threshold, max_score)) -------------------------------------------------------------------------------- /solutions/BA11I.py: -------------------------------------------------------------------------------- 1 | import sys 2 | aa_table = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131, 3 | 'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163} 4 | masses = list(aa_table.values()) 5 | 6 | 7 | def SpectralDictionaryProbability(spectral_vector, threshold, max_score): 8 | m = len(spectral_vector) 9 | 10 | Prob = {} 11 | Prob[0] = {} 12 | Prob[0][0] = 1 13 | 14 | for t in range(1, max_score + 1): 15 | Prob[0][t] = 0 16 | 17 | for i in range(1, m + 1): 18 | Prob[i] = {} 19 | for t in range(max_score + 1): 20 | Prob[i][t] = 0 21 | for a in masses: 22 | if (i - a) >= 0 and (t - spectral_vector[i - 1]) >= 0 and (t - spectral_vector[i - 1]) <= max_score: 23 | Prob[i][t] += Prob[i - a][t - spectral_vector[i - 1]] 24 | Prob[i][t] /= 20 25 | 26 | final_Prob = 0 27 | for t in range(threshold, max_score + 1): 28 | final_Prob += Prob[m][t] 29 | 30 | return final_Prob 31 | 32 | if __name__ == "__main__": 33 | ''' 34 | Given: A spectral vector Spectrum', an integer threshold, and an integer max_score. 35 | Return: The probability of the dictionary Dictionarythreshold(Spectrum'). 36 | ''' 37 | tmp = sys.stdin.read().splitlines() 38 | spectral_vector = [int(x) for x in tmp[0].rstrip().split(' ')] 39 | threshold = int(tmp[1]) 40 | max_score = int(tmp[2]) 41 | 42 | print(SpectralDictionaryProbability(spectral_vector, threshold, max_score)) -------------------------------------------------------------------------------- /solutions/BA6C.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA6H import colored_edges 3 | 4 | 5 | def find_next_edge(current, edges): 6 | if len(edges) == 0: 7 | return -1 8 | idx = 0 9 | while not (current[0] in edges[idx] or current[1] in edges[idx]): 10 | idx += 1 11 | if idx == len(edges): 12 | return -1 13 | return edges[idx] 14 | 15 | 16 | def two_break_distance(P, Q): 17 | edgesP = colored_edges(P) 18 | edgesQ = colored_edges(Q) 19 | edges = edgesP + edgesQ 20 | blocks = set() 21 | for edge in edges: 22 | blocks.add(edge[0]) 23 | blocks.add(edge[1]) 24 | Cycles = [] 25 | while len(edges) != 0: 26 | start = edges[0] 27 | edges.remove(edges[0]) 28 | Cycle = [start] 29 | current = find_next_edge(start, edges) 30 | while current != -1: 31 | Cycle.append(current) 32 | edges.remove(current) 33 | current = find_next_edge(current, edges) 34 | Cycles.append(Cycle) 35 | return len(blocks) // 2 - len(Cycles) 36 | 37 | 38 | if __name__ == "__main__": 39 | ''' 40 | Given: Two genomes with circular chromosomes on the same set of synteny blocks. 41 | Return: The 2-break distance between these two genomes. 42 | ''' 43 | input_lines = sys.stdin.read().splitlines() 44 | P = input_lines[0] 45 | P = P[1:-1] 46 | P = P.split(')(') 47 | for i in range(len(P)): 48 | P[i] = [int(x) for x in P[i].split(' ')] 49 | 50 | Q = input_lines[1] 51 | Q = Q[1:-1] 52 | Q = Q.split(')(') 53 | for i in range(len(Q)): 54 | Q[i] = [int(x) for x in Q[i].split(' ')] 55 | 56 | print(two_break_distance(P, Q)) 57 | -------------------------------------------------------------------------------- /solutions/BA9F.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from Tree_Trie_classes import Tree 3 | 4 | 5 | def backtrace_path_from_node(tree, node, Text): 6 | # if root is reached, stop 7 | if node.label == 0: 8 | return '' 9 | 10 | for edge in tree.all_edges: 11 | if edge.target_node == node: 12 | incoming_edge = edge 13 | break 14 | 15 | path_substring = Text[incoming_edge.position: incoming_edge.position + incoming_edge.length] 16 | path_substring = backtrace_path_from_node(tree, incoming_edge.from_node, Text) + path_substring 17 | return path_substring 18 | 19 | 20 | def Shortest_NonShared_Substring(Text1, Text2): 21 | suffix_tree = Tree() 22 | combined_Text = Text1 + '#' + Text2 + '$' 23 | suffix_tree.PopulateSuffixTree(combined_Text) 24 | suffix_tree.add_indicators() 25 | 26 | ## Find shallowest Text1 internal node 27 | min_dep = 1e6 28 | for node in suffix_tree.all_nodes: 29 | 30 | if node.indicator == '#': 31 | if len(node.edges) != 0 and node.depth <= min_dep: 32 | min_dep = node.depth 33 | min_dep_node = node 34 | 35 | non_shared_substr = backtrace_path_from_node(suffix_tree, min_dep_node, combined_Text) 36 | return non_shared_substr 37 | 38 | 39 | if __name__ == "__main__": 40 | ''' 41 | Given: Strings Text1 and Text2. 42 | Return: The shortest substring of Text1 that does not appear in Text2. (Multiple solutions may exist, in which case 43 | you may return any one.) 44 | ''' 45 | input_lines = sys.stdin.read().splitlines() 46 | Text1 = input_lines[0] 47 | Text2 = input_lines[1] 48 | 49 | print(Shortest_NonShared_Substring(Text1, Text2)) -------------------------------------------------------------------------------- /solutions/BA8A.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def euclidean_distance(pointA, pointB): 5 | distance = 0 6 | for i in range(len(pointA)): 7 | distance += (pointA[i] - pointB[i]) ** 2 8 | distance = distance ** 0.5 9 | return distance 10 | 11 | 12 | def distance_from_centers(centers, point): 13 | min_dist = float("Inf") 14 | for x in centers: 15 | current = euclidean_distance(x, point) 16 | if current < min_dist: 17 | min_dist = current 18 | return min_dist 19 | 20 | 21 | def max_distance_point(data, centers): 22 | max_dist = -1 23 | for point in data: 24 | current = distance_from_centers(centers, point) 25 | if current > max_dist: 26 | max_dist = current 27 | max_point = point 28 | return max_point 29 | 30 | 31 | def farthest_first_traversal(data, k): 32 | centers = [data[0]] 33 | while len(centers) < k: 34 | point = max_distance_point(data, centers) 35 | centers.append(point) 36 | return centers 37 | 38 | 39 | if __name__ == "__main__": 40 | ''' 41 | Given: Integers k and m followed by a set of points Data in m-dimensional space. 42 | Return: A set Centers consisting of k points (centers) resulting from applying FarthestFirstTraversal(Data, k), 43 | where the first point from Data is chosen as the first center to initialize the algorithm. 44 | ''' 45 | input_lines = sys.stdin.read().splitlines() 46 | k, m = [int(x) for x in input_lines[0].split()] 47 | data = [[float(x) for x in line.split()] for line in input_lines[1:]] 48 | 49 | center_points = farthest_first_traversal(data, k) 50 | 51 | for center in center_points: 52 | print(" ".join(map(str, center))) 53 | -------------------------------------------------------------------------------- /solutions/BA9B.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA9A import TrieConstruction 3 | 4 | 5 | def PrefixTrieMatching(Prefix, trie): 6 | ''' Prefix Trie Matching 7 | ''' 8 | symbol = Prefix[0] 9 | node = trie.root 10 | 11 | idx = 1 12 | pattern = '' 13 | 14 | while True: 15 | # if node is a leaf 16 | if len(node.edges) == 0: 17 | return pattern 18 | 19 | # if there is an edge (node, some other node) in Trie, 20 | # labeled by symbol, extend pattern 21 | found = False 22 | for edge in node.edges: 23 | if edge.label == symbol: 24 | found = True 25 | pattern += symbol 26 | node = edge.target_node 27 | if idx != len(Prefix): 28 | symbol = Prefix[idx] 29 | idx += 1 30 | break 31 | 32 | if not found: 33 | return None 34 | 35 | 36 | def TrieMatching(Text, trie): 37 | indices = [] 38 | idx = 0 39 | while len(Text) != 0: 40 | match = PrefixTrieMatching(Text, trie) 41 | if match != None: 42 | indices.append(idx) 43 | Text = Text[1:] 44 | idx += 1 45 | return indices 46 | 47 | 48 | if __name__ == "__main__": 49 | ''' 50 | Given: A string Text and a collection of strings Patterns. 51 | Return: All starting positions in Text where a string from Patterns appears as a substring. 52 | ''' 53 | tmp = sys.stdin.read().splitlines() 54 | Text = tmp[0] 55 | Patterns = [] 56 | for i in range(1, len(tmp)): 57 | Patterns.append(tmp[i]) 58 | 59 | trie = TrieConstruction(Patterns) 60 | result = TrieMatching(Text, trie) 61 | print(' '.join(str(x) for x in result)) -------------------------------------------------------------------------------- /solutions/BA9E.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from Tree_Trie_classes import Tree 3 | 4 | 5 | def backtrace_path_from_node(tree, node, Text): 6 | # if root is reached, stop 7 | if node.label == 0: 8 | return '' 9 | 10 | for edge in tree.all_edges: 11 | if edge.target_node == node: 12 | incoming_edge = edge 13 | break 14 | 15 | path_substring = Text[incoming_edge.position : incoming_edge.position + incoming_edge.length] 16 | path_substring = backtrace_path_from_node(tree, incoming_edge.from_node, Text) + path_substring 17 | return path_substring 18 | 19 | 20 | def LongestSharedSubstring(Text1, Text2): 21 | suffix_tree = Tree() 22 | combined_Text = Text1 + '#' + Text2 + '$' 23 | suffix_tree.PopulateSuffixTree(combined_Text) 24 | suffix_tree.add_indicators() 25 | 26 | ## Find deepest common internal node 27 | max_dep = -1 28 | for node in suffix_tree.all_nodes: 29 | if node.indicator == '*': 30 | # print(str(node.label) + ': '+ str(node.indicator) + ': ' + str(node.depth)) 31 | if len(node.edges) != 0 and node.depth >= max_dep: 32 | max_dep = node.depth 33 | max_dep_node = node 34 | 35 | longest_substring = backtrace_path_from_node(suffix_tree, max_dep_node, combined_Text) 36 | return longest_substring 37 | 38 | 39 | if __name__ == "__main__": 40 | ''' 41 | Given: Strings Text1 and Text2. 42 | Return: The longest substring that occurs in both Text1 and Text2. (Multiple solutions may exist, in which case you 43 | may return any one.) 44 | ''' 45 | input_lines = sys.stdin.read().splitlines() 46 | Text1 = input_lines[0] 47 | Text2 = input_lines[1] 48 | 49 | print(LongestSharedSubstring(Text1, Text2)) -------------------------------------------------------------------------------- /solutions/BA9A.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from Tree_Trie_classes import Trie 3 | 4 | def TrieConstruction(Pattern_list): 5 | trie = Trie() 6 | 7 | for Pattern in Pattern_list: 8 | currentNode = trie.root 9 | 10 | for currentSymbol in Pattern: 11 | # if there is an outgoing edge from currentNode with label currentSymbol, 12 | # change currentNode to target_node 13 | for edge in currentNode.edges: 14 | if edge.label == currentSymbol: 15 | currentNode = edge.target_node 16 | break 17 | else: 18 | # add a new node newNode to Trie 19 | newNode = trie.add_node() 20 | # add a new edge from currentNode to newNode with label currentSymbol 21 | trie.add_edge(currentNode, newNode, currentSymbol) 22 | currentNode = newNode 23 | return trie 24 | 25 | 26 | if __name__ == "__main__": 27 | ''' 28 | Given: A collection of strings Patterns. 29 | Return: The adjacency list corresponding to Trie(Patterns), in the following format. If Trie(Patterns) has n nodes, 30 | first label the root with 1 and then label the remaining nodes with the integers 2 through n in any order you like. 31 | Each edge of the adjacency list of Trie(Patterns) will be encoded by a triple: the first two members of the triple 32 | must be the integers labeling the initial and terminal nodes of the edge, respectively; the third member of the 33 | triple must be the symbol labeling the edge. 34 | ''' 35 | Patterns = sys.stdin.read().splitlines() 36 | trie = TrieConstruction(Patterns) 37 | 38 | for edge in trie.all_edges: 39 | print(str(edge.from_node.label) + '->' + str(edge.target_node.label) + ':' + str(edge.label)) -------------------------------------------------------------------------------- /solutions/BA3J.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | from BA3G import Eulerian_path 4 | 5 | 6 | def deBruijn_graph_paired_reads(paired_reads): 7 | adj_list = defaultdict(list) 8 | for pair in paired_reads: 9 | adj_list[(pair[0][:-1], pair[1][:-1])].append((pair[0][1:], pair[1][1:])) 10 | return adj_list 11 | 12 | 13 | def StringSpelledByGappedPatterns(GappedPatterns, k, d): 14 | prefix_string = '' 15 | suffix_string = '' 16 | for i, pattern_pair in enumerate(GappedPatterns): 17 | if i != len(GappedPatterns) - 1: 18 | prefix_string += pattern_pair[0][0] 19 | suffix_string += pattern_pair[1][0] 20 | else: 21 | prefix_string += pattern_pair[0] 22 | suffix_string += pattern_pair[1] 23 | for i in range(k + d + 1, len(prefix_string)): 24 | if prefix_string[i] != suffix_string[i - k - d - 1]: 25 | return -1 26 | return prefix_string + suffix_string[len(suffix_string) - k - d - 1:] 27 | 28 | 29 | def string_reconstruction_read_pairs(k, d, paired_reads): 30 | adj_list = deBruijn_graph_paired_reads(paired_reads) 31 | path = Eulerian_path(adj_list) 32 | return StringSpelledByGappedPatterns(path, k - 1, d) 33 | 34 | 35 | if __name__ == "__main__": 36 | ''' 37 | Given: Integers k and d followed by a collection of paired k-mers PairedReads. 38 | Return: A string Text with (k, d)-mer composition equal to PairedReads. (If multiple answers exist, you may return 39 | any one.) 40 | ''' 41 | input_lines = sys.stdin.read().splitlines() 42 | k, d = [int(x) for x in input_lines[0].split()] 43 | PairedReads = [] 44 | for line in input_lines[1:]: 45 | PairedReads.append(line.split("|")) 46 | 47 | print(string_reconstruction_read_pairs(k, d, PairedReads)) 48 | -------------------------------------------------------------------------------- /solutions/BA1J.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA1C import rev_comp 3 | from BA1G import hamming_dist 4 | 5 | 6 | def neighbors(pattern, d): 7 | if d == 0: 8 | return pattern 9 | if len(pattern) == 1: 10 | return ['A', 'C', 'G', 'T'] 11 | neighborhood = set() 12 | suffix_neighbors = neighbors(pattern[1:], d) 13 | for suffix in suffix_neighbors: 14 | if hamming_dist(pattern[1:], suffix) < d: 15 | for nuc in ['A', 'C', 'G', 'T']: 16 | neighborhood.add(nuc + suffix) 17 | else: 18 | neighborhood.add(pattern[0] + suffix) 19 | return neighborhood 20 | 21 | 22 | def most_freq_kmers_mismatch_revc(text, k, d): 23 | count_dict = {} 24 | for i in range(len(text) - k + 1): 25 | kmer = text[i:i+k] 26 | neighborhood = neighbors(kmer, d) 27 | for approx_pattern in neighborhood: 28 | if approx_pattern in count_dict: 29 | count_dict[approx_pattern] += 1 30 | else: 31 | count_dict[approx_pattern] = 1 32 | r_pattern = rev_comp(approx_pattern) 33 | if r_pattern in count_dict: 34 | count_dict[r_pattern] += 1 35 | else: 36 | count_dict[r_pattern] = 1 37 | max_freq = max(count_dict.values()) 38 | return [kmer for kmer, count in count_dict.items() if count == max_freq] 39 | 40 | 41 | if __name__ == "__main__": 42 | ''' 43 | Given: A DNA string Text as well as integers k and d. 44 | Return: All k-mers Pattern maximizing the sum Countd(Text, Pattern) + Countd(Text, Pattern) over all possible 45 | k-mers. 46 | ''' 47 | input_lines = sys.stdin.read().splitlines() 48 | Text = input_lines[0] 49 | k, d = [int(x) for x in input_lines[1].split()] 50 | 51 | print(" ".join(most_freq_kmers_mismatch_revc(Text, k, d))) 52 | -------------------------------------------------------------------------------- /solutions/BA11F.py: -------------------------------------------------------------------------------- 1 | import sys 2 | aa_table = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131, 3 | 'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163} 4 | 5 | 6 | def PeptideVector(peptide): 7 | prefixMasses = [] 8 | for i in range(len(peptide)): 9 | prefixMasses.append(sum(peptide[:i + 1])) 10 | vector = [0] * prefixMasses[-1] 11 | for mass in prefixMasses: 12 | vector[mass - 1] = 1 13 | return vector 14 | 15 | 16 | def PeptideIdentification(spectral_vector, proteome): 17 | max_score = -1e6 18 | mass_list = [] 19 | for aa in proteome: 20 | mass_list.append(aa_table[aa]) 21 | 22 | for i in range(len(mass_list)): 23 | k = 2 24 | while i + k < len(mass_list): 25 | peptide = mass_list[i:i + k] 26 | pep_vec = PeptideVector(peptide) 27 | if len(pep_vec) > len(spectral_vector): 28 | break 29 | if len(pep_vec) == len(spectral_vector): 30 | score = 0 31 | for idx in range(len(pep_vec)): 32 | if pep_vec[idx] == 1: 33 | score += spectral_vector[idx] 34 | if score > max_score: 35 | max_score = score 36 | best_peptide = proteome[i:i + k] 37 | k += 1 38 | return best_peptide 39 | 40 | 41 | if __name__ == "__main__": 42 | ''' 43 | Given: A space-delimited spectral vector S and an amino acid string Proteome. 44 | Return: A peptide in Proteome with maximum score against S. 45 | ''' 46 | tmp = sys.stdin.read().splitlines() 47 | spectral_vector = [int(x) for x in tmp[0].rstrip().split(' ')] 48 | proteome = tmp[1].rstrip() 49 | 50 | print(PeptideIdentification(spectral_vector, proteome)) 51 | 52 | -------------------------------------------------------------------------------- /solutions/BA4G.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA4E import expand, cyclospectrum_mass_peptide 3 | 4 | 5 | def Score(peptide, spectrum): 6 | pep_spec = cyclospectrum_mass_peptide(peptide) 7 | result = 0 8 | unique_masses = set(pep_spec + spectrum) 9 | for mass in unique_masses: 10 | result += min(pep_spec.count(mass), spectrum.count(mass)) 11 | return result 12 | 13 | 14 | def Trim(leaderboard, spectrum, N): 15 | if len(leaderboard) <= N: 16 | return leaderboard 17 | 18 | scores = {} 19 | for i, peptide in enumerate(leaderboard): 20 | scores[i] = Score(peptide, spectrum) 21 | 22 | sorted_scores = sorted(scores.values(), reverse=True) 23 | threshold = sorted_scores[N - 1] 24 | 25 | return [leaderboard[idx] for idx, score in scores.items() if score >= threshold] 26 | 27 | 28 | def leaderboard_cyclopeptide_sequencing(spectrum, N): 29 | leaderboard = [[]] 30 | leader_peptide = [] 31 | 32 | while leaderboard: 33 | leaderboard = expand(leaderboard) 34 | for peptide in leaderboard: 35 | if sum(peptide) == spectrum[-1]: 36 | if Score(peptide, spectrum) > Score(leader_peptide, spectrum): 37 | leader_peptide = peptide 38 | elif sum(peptide) > spectrum[-1]: 39 | leaderboard = [pep for pep in leaderboard if pep != peptide] 40 | leaderboard = Trim(leaderboard, spectrum, N) 41 | return leader_peptide 42 | 43 | 44 | if __name__ == "__main__": 45 | ''' 46 | Given: An integer N and a collection of integers Spectrum. 47 | Return: LeaderPeptide after running LeaderboardCyclopeptideSequencing(Spectrum, N). 48 | ''' 49 | input_lines = sys.stdin.read().splitlines() 50 | N = int(input_lines[0]) 51 | Spectrum = [int(x) for x in input_lines[1].split()] 52 | 53 | print("-".join(map(str, leaderboard_cyclopeptide_sequencing(Spectrum, N)))) 54 | -------------------------------------------------------------------------------- /solutions/BA4M.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def delta2(S, T): 5 | A = list(S) 6 | B = list(T) 7 | result = [] 8 | for a in A: 9 | for b in B: 10 | result.append(abs(a - b)) 11 | return result 12 | 13 | 14 | def is_multi_subset(A, B): 15 | for elem in A: 16 | if A.count(elem) > B.count(elem): 17 | return False 18 | return True 19 | 20 | 21 | def difference(A, B): 22 | # A - B 23 | diffset = [] 24 | all_elems = set(A) 25 | for elem in all_elems: 26 | n = A.count(elem) - B.count(elem) 27 | if n > 0: 28 | for _ in range(n): 29 | diffset.append(elem) 30 | diffset.sort() 31 | return diffset 32 | 33 | 34 | def Place(dist_l): 35 | if not dist_l: 36 | return X 37 | 38 | y = dist_l[-1] 39 | # place on left 40 | tmp = delta2({y}, X) 41 | if is_multi_subset(tmp, dist_l): 42 | X.add(y) 43 | L_left = difference(dist_l, tmp) 44 | res_left = Place(L_left) 45 | if res_left: 46 | return res_left 47 | X.remove(y) 48 | 49 | # place on right 50 | tmp = delta2({width - y}, X) 51 | if is_multi_subset(tmp, dist_l): 52 | X.add(width - y) 53 | L_right = difference(dist_l, tmp) 54 | res_right = Place(L_right) 55 | if res_right: 56 | return res_right 57 | X.remove(width - y) 58 | 59 | return {} 60 | 61 | 62 | if __name__ == "__main__": 63 | ''' 64 | Given: A collection of integers L. 65 | Return: A set A such that ∆A = L. 66 | ''' 67 | # Zhang Z. An exponential example for a partial digest mapping algorithm. J Comput Biol. 1994;1(3):235-9. 68 | L = [int(x) for x in sys.stdin.readline().strip().split()] 69 | L = [x for x in L if x > 0] 70 | 71 | width = L.pop(-1) 72 | X = {0, width} 73 | 74 | result = Place(L) 75 | 76 | print(" ".join(map(str, result))) 77 | -------------------------------------------------------------------------------- /solutions/BA5N.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | class Node: 5 | def __init__(self, lbl): 6 | self.label = lbl 7 | self.parent_nodes = [] 8 | self.target_nodes = [] 9 | self.visited = False 10 | 11 | 12 | class DAG: 13 | def __init__(self): 14 | self.nodes_dict = {} 15 | self.distances = {} 16 | self.backtrack = {} 17 | 18 | def add_node(self, lbl): 19 | if lbl in self.nodes_dict: 20 | return self.nodes_dict[lbl] 21 | 22 | new_node = Node(lbl) 23 | self.nodes_dict[lbl] = new_node 24 | return new_node 25 | 26 | def contruct_dag(self, adj_list_text): 27 | for line in adj_list_text: 28 | 29 | nodeA, tmp = line.split(" -> ") 30 | nodeB_list = tmp.split(",") 31 | 32 | from_node = self.add_node(nodeA) 33 | 34 | for nodeB in nodeB_list: 35 | to_node = self.add_node(nodeB) 36 | 37 | from_node.target_nodes.append(to_node) 38 | to_node.parent_nodes.append(from_node) 39 | 40 | def topological_sort_util(self, node, stack): 41 | node.visited = True 42 | for node2 in node.target_nodes: 43 | if not node2.visited: 44 | self.topological_sort_util(node2, stack) 45 | stack.insert(0, node.label) 46 | 47 | def topological_sort(self): 48 | stack = [] 49 | for node in self.nodes_dict.values(): 50 | if not node.visited: 51 | self.topological_sort_util(node, stack) 52 | return stack 53 | 54 | 55 | if __name__ == "__main__": 56 | ''' 57 | Given: The adjacency list of a graph (with nodes represented by integers). 58 | Return: A topological ordering of this graph. 59 | ''' 60 | adj_list_text = sys.stdin.read().splitlines() 61 | 62 | graph = DAG() 63 | graph.contruct_dag(adj_list_text) 64 | print(", ".join(graph.topological_sort())) -------------------------------------------------------------------------------- /solutions/BA8C.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | from BA8A import euclidean_distance 4 | 5 | 6 | def closest_center(point, centers): 7 | min_dist = float("Inf") 8 | for x in centers: 9 | current = euclidean_distance(x, point) 10 | if current < min_dist: 11 | min_dist = current 12 | closest = x 13 | return closest 14 | 15 | 16 | def cluster_mean(cluster): 17 | m = len(cluster[0]) 18 | center = [0] * m 19 | for point in cluster: 20 | for i in range(m): 21 | center[i] += point[i] 22 | center = [x / len(cluster) for x in center] 23 | return center 24 | 25 | 26 | def lloyd_k_means(data, k): 27 | centers = data[:k] 28 | 29 | while True: 30 | # Centers to clusters 31 | cluster_assignments = defaultdict(list) 32 | for point in data: 33 | center = closest_center(point, centers) 34 | cluster_assignments[tuple(center)].append(point) 35 | 36 | # Clusters to centers 37 | new_centers = [[]] * k 38 | for i in range(k): 39 | new_centers[i] = cluster_mean(cluster_assignments[tuple(centers[i])]) 40 | 41 | if new_centers == centers: 42 | break 43 | centers = new_centers[:] 44 | 45 | return centers 46 | 47 | 48 | if __name__ == "__main__": 49 | ''' 50 | Given: Integers k and m followed by a set of points Data in m-dimensional space. 51 | Return: A set Centers consisting of k points (centers) resulting from applying the Lloyd algorithm to Data and 52 | Centers, where the first k points from Data are selected as the first k centers. 53 | ''' 54 | input_lines = sys.stdin.read().splitlines() 55 | k, m = [int(x) for x in input_lines[0].split()] 56 | data = [[float(x) for x in line.split()] for line in input_lines[1:]] 57 | 58 | centers = lloyd_k_means(data, k) 59 | for center in centers: 60 | print(" ".join(map(str, center))) 61 | -------------------------------------------------------------------------------- /solutions/BA4I.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA4H import convolution 3 | from BA4G import Score, Trim 4 | 5 | 6 | def find_masses(spectrum, M): 7 | conv = convolution(spectrum) 8 | conv = [x for x in conv if 57 <= x <= 200] 9 | 10 | freq_dict = {} 11 | for mass in set(conv): 12 | freq_dict[mass] = conv.count(mass) 13 | 14 | sorted_elems = sorted(freq_dict.items(), key=lambda kv: kv[1], reverse=True) 15 | masses = [mass for mass, freq in sorted_elems if freq >= sorted_elems[M][1]] 16 | masses.sort() 17 | return masses 18 | 19 | 20 | def Expand(peptides, masses): 21 | new_peptides = [] 22 | for pep in peptides: 23 | for mass in masses: 24 | new_peptides.append(pep + [mass]) 25 | return new_peptides 26 | 27 | 28 | def convolution_cyclopeptide_sequencing(spectrum, M, N): 29 | masses = find_masses(spectrum, M) 30 | leaderboard = [[]] 31 | leader_peptide = [] 32 | 33 | while leaderboard: 34 | leaderboard = Expand(leaderboard, masses) 35 | for peptide in leaderboard: 36 | if sum(peptide) == spectrum[-1]: 37 | if Score(peptide, spectrum) > Score(leader_peptide, spectrum): 38 | leader_peptide = peptide 39 | elif sum(peptide) > spectrum[-1]: 40 | leaderboard = [pep for pep in leaderboard if pep != peptide] 41 | leaderboard = Trim(leaderboard, spectrum, N) 42 | return leader_peptide 43 | 44 | 45 | if __name__ == "__main__": 46 | ''' 47 | Given: An integer M, an integer N, and a collection of (possibly repeated) integers Spectrum. 48 | Return: A cyclic peptide LeaderPeptide with amino acids taken only from the top M elements (and ties) of the 49 | convolution of Spectrum that fall between 57 and 200, and where the size of Leaderboard is restricted to the top N 50 | (and ties). 51 | ''' 52 | input_lines = sys.stdin.read().splitlines() 53 | M = int(input_lines[0]) 54 | N = int(input_lines[1]) 55 | Spectrum = [int(x) for x in input_lines[2].split()] 56 | 57 | print("-".join(map(str, convolution_cyclopeptide_sequencing(Spectrum, M, N)))) 58 | -------------------------------------------------------------------------------- /solutions/BA9L.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def BWMatching_wrapper(BWT, pattern_list): 5 | counts = {} 6 | BWT_list = [] 7 | for char in BWT: 8 | if char not in counts.keys(): 9 | counts[char] = 1 10 | else: 11 | counts[char] += 1 12 | tmp = char + str(counts[char]) 13 | BWT_list.append(tmp) 14 | 15 | first_col = sorted(BWT_list, key=lambda x: x[0]) 16 | 17 | last_to_first = [] 18 | for sym_last in BWT_list: 19 | for idx, sym_first in enumerate(first_col): 20 | if sym_first == sym_last: 21 | last_to_first.append(idx) 22 | 23 | result = [] 24 | for pattern in pattern_list: 25 | result.append(BWMatching(BWT, pattern, last_to_first)) 26 | 27 | return result 28 | 29 | 30 | def BWMatching(last_column, pattern, last_to_first): 31 | top = 0 32 | bottom = len(last_column) - 1 33 | 34 | while top <= bottom: 35 | if len(pattern) != 0: 36 | symbol = pattern[-1] 37 | pattern = pattern[:-1] 38 | 39 | # if positions from top to bottom in LastColumn 40 | # contain any occurrence of symbol 41 | 42 | match_positions = [] 43 | for idx in range(top, bottom + 1): 44 | if last_column[idx] == symbol: 45 | match_positions.append(idx) 46 | 47 | if len(match_positions) != 0: 48 | top = last_to_first[min(match_positions)] 49 | bottom = last_to_first[max(match_positions)] 50 | else: 51 | return 0 52 | else: 53 | return bottom - top + 1 54 | 55 | 56 | if __name__ == "__main__": 57 | ''' 58 | Given: A string BWT(Text), followed by a collection of strings Patterns. 59 | Return: A list of integers, where the i-th integer corresponds to the number of substring matches of the i-th member 60 | of Patterns in Text. 61 | ''' 62 | input_lines = sys.stdin.read().splitlines() 63 | BWT = input_lines[0] 64 | pattern_list = input_lines[1].split() 65 | 66 | match_nums = BWMatching_wrapper(BWT, pattern_list) 67 | print(' '.join(map(str, match_nums))) -------------------------------------------------------------------------------- /solutions/BA11E.py: -------------------------------------------------------------------------------- 1 | import sys 2 | aa_table = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131, 3 | 'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163} 4 | mass_table = {v: k for k, v in aa_table.items()} 5 | 6 | 7 | def PeptideSequencing(spectral_vector): 8 | spectral_vector = [0] + spectral_vector 9 | 10 | adj_list = [] 11 | for i in range(len(spectral_vector)): 12 | for j in range(i, len(spectral_vector)): 13 | if (j - i) in mass_table.keys(): 14 | adj_list.append([i, j]) 15 | 16 | adj_dict = {} 17 | for i in range(len(spectral_vector)): 18 | for j in range(i, len(spectral_vector)): 19 | if (j - i) in mass_table.keys(): 20 | tmp = [i, mass_table[j - i]] 21 | if not j in adj_dict.keys(): 22 | adj_dict[j] = [tmp] 23 | else: 24 | adj_dict[j].append(tmp) 25 | 26 | scores = {0: [0, '-']} 27 | for node in adj_dict.keys(): 28 | scores[node] = [-1e6, '-'] 29 | tmp = adj_dict[node] 30 | for x in tmp: 31 | if x[0] != 0: 32 | scores[x[0]] = [-1e6, '-'] 33 | 34 | for node in adj_dict.keys(): 35 | max_score = -1e6 36 | bold_edge = '-' 37 | for parent in adj_dict[node]: 38 | score = scores[parent[0]][0] 39 | if score > max_score: 40 | max_score = score 41 | bold_edge = parent 42 | scores[node] = [max_score + spectral_vector[node], bold_edge] 43 | 44 | node = list(scores.keys())[-1] 45 | peptide = '' 46 | while node != 0: 47 | peptide = scores[node][1][1] + peptide 48 | node = scores[node][1][0] 49 | 50 | return peptide 51 | 52 | 53 | if __name__ == "__main__": 54 | ''' 55 | Given: A space-delimited spectral vector S. 56 | Return: A peptide with maximum score against S. For masses with more than one amino acid, any choice may be used. 57 | ''' 58 | spectral_vector = [int(x) for x in sys.stdin.read().rstrip().split(' ')] 59 | 60 | print(PeptideSequencing(spectral_vector)) 61 | -------------------------------------------------------------------------------- /solutions/BA3F.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from re import split 3 | from random import choice 4 | 5 | 6 | def parse_adj_list(adj_list_text): 7 | adj_list = {} 8 | for elem in adj_list_text: 9 | temp = split(' -> ', elem) 10 | adj_list[temp[0]] = temp[1].split(',') 11 | return adj_list 12 | 13 | 14 | def remove_edge(adj_list, from_node, to_node): 15 | adj_list[from_node].remove(to_node) 16 | if not adj_list[from_node]: 17 | del adj_list[from_node] 18 | return adj_list 19 | 20 | 21 | def Eulerian_cycle(adj_list): 22 | # form a cycle Cycle by randomly walking in Graph 23 | start_node, edges = choice(list(adj_list.items())) 24 | target_node = choice(edges) 25 | adj_list = remove_edge(adj_list, start_node, target_node) 26 | 27 | Cycle = [start_node, target_node] 28 | current_node = target_node 29 | while current_node != start_node: 30 | edges = adj_list[current_node] 31 | target_node = choice(edges) 32 | adj_list = remove_edge(adj_list, current_node, target_node) 33 | current_node = target_node 34 | Cycle.append(current_node) 35 | 36 | while adj_list: 37 | potential_starts = [(idx, node) for idx, node in enumerate(Cycle) if node in adj_list] 38 | idx, new_start = choice(potential_starts) 39 | 40 | # form Cycle’ by traversing Cycle (starting at newStart) and then randomly walking 41 | new_cycle = Cycle[idx:] + Cycle[1:idx + 1] 42 | 43 | target_node = choice(adj_list[new_start]) 44 | adj_list = remove_edge(adj_list, new_start, target_node) 45 | current_node = target_node 46 | new_cycle.append(current_node) 47 | while current_node != new_start: 48 | edges = adj_list[current_node] 49 | target_node = choice(edges) 50 | adj_list = remove_edge(adj_list, current_node, target_node) 51 | current_node = target_node 52 | new_cycle.append(current_node) 53 | Cycle = new_cycle 54 | return Cycle 55 | 56 | 57 | if __name__ == "__main__": 58 | ''' 59 | Given: An Eulerian directed graph, in the form of an adjacency list. 60 | Return: An Eulerian cycle in this graph. 61 | ''' 62 | input_lines = sys.stdin.read().splitlines() 63 | Adj_list = parse_adj_list(input_lines) 64 | 65 | print("->".join(Eulerian_cycle(Adj_list))) 66 | -------------------------------------------------------------------------------- /solutions/BA3M.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA3F import parse_adj_list, remove_edge 3 | 4 | 5 | def maximal_non_branching_paths(adj_list): 6 | paths = [] 7 | 8 | # in and out degrees 9 | in_out_degrees = {} 10 | for source, targets in adj_list.items(): 11 | if source not in in_out_degrees: 12 | in_out_degrees[source] = [0, len(targets)] 13 | else: 14 | in_out_degrees[source][1] += len(targets) 15 | 16 | for target in targets: 17 | if target not in in_out_degrees: 18 | in_out_degrees[target] = [1, 0] 19 | else: 20 | in_out_degrees[target][0] += 1 21 | 22 | # find all non-branching paths 23 | for v in list(in_out_degrees): 24 | if in_out_degrees[v] != [1, 1]: 25 | if in_out_degrees[v][1] > 0: 26 | while v in adj_list: 27 | w = adj_list[v][0] 28 | non_branching_path = [v, w] 29 | adj_list = remove_edge(adj_list, v, w) 30 | while in_out_degrees[w] == [1, 1]: 31 | u = adj_list[w][0] 32 | non_branching_path.append(u) 33 | adj_list = remove_edge(adj_list, w, u) 34 | w = u 35 | paths.append(non_branching_path) 36 | 37 | # find isolated cycles 38 | while adj_list: 39 | start_node = list(adj_list)[0] 40 | current_node = adj_list[start_node][0] 41 | adj_list = remove_edge(adj_list, start_node, current_node) 42 | cycle = [start_node, current_node] 43 | while current_node != start_node: 44 | target_node = adj_list[current_node][0] 45 | cycle.append(target_node) 46 | adj_list = remove_edge(adj_list, current_node, target_node) 47 | current_node = target_node 48 | paths.append(cycle) 49 | 50 | return paths 51 | 52 | 53 | if __name__ == "__main__": 54 | ''' 55 | Given: The adjacency list of a graph whose nodes are integers. 56 | Return: The collection of all maximal non-branching paths in the graph. 57 | ''' 58 | input_lines = sys.stdin.read().splitlines() 59 | Adj_list = parse_adj_list(input_lines) 60 | 61 | result = maximal_non_branching_paths(Adj_list) 62 | for r in result: 63 | print("->".join(r)) 64 | -------------------------------------------------------------------------------- /solutions/BA4E.py: -------------------------------------------------------------------------------- 1 | import sys 2 | MASSES = [57, 71, 87, 97, 99, 101, 103, 113, 114, 115, 128, 129, 131, 137, 147, 156, 163, 186] 3 | 4 | 5 | def cyclospectrum_mass_peptide(peptide): 6 | spec = [0, sum(peptide)] 7 | temp = peptide + peptide 8 | for k in range(1, len(peptide)): 9 | for i in range(len(peptide)): 10 | subpeptide = temp[i:i + k] 11 | spec.append(sum(subpeptide)) 12 | spec.sort() 13 | return spec 14 | 15 | 16 | def LinearSpectrum(Peptide): 17 | PrefixMass = [0] 18 | for i in range(len(Peptide)): 19 | temp = PrefixMass[i] + Peptide[i] 20 | PrefixMass.append(temp) 21 | LinearSpectrum = [0] 22 | for i in range(len(Peptide)): 23 | for j in range(i + 1, len(Peptide) + 1): 24 | LinearSpectrum.append(PrefixMass[j] - PrefixMass[i]) 25 | LinearSpectrum.sort() 26 | return LinearSpectrum 27 | 28 | 29 | def expand(peptides): 30 | new_peptides = [] 31 | for pep in peptides: 32 | for mass in MASSES: 33 | new_peptides.append(pep + [mass]) 34 | return new_peptides 35 | 36 | 37 | def Consistent(Peptide, Spectrum): 38 | if sum(Peptide) > Spectrum[-1] - MASSES[0]: 39 | return False 40 | spec = LinearSpectrum(Peptide) 41 | for mass in spec: 42 | if mass not in Spectrum: 43 | return False 44 | return True 45 | 46 | 47 | def cyclopeptide_sequencing(spectrum): 48 | result = set() 49 | peptides = [[]] 50 | while peptides: 51 | peptides = expand(peptides) 52 | for peptide in peptides: 53 | if sum(peptide) == spectrum[-1]: 54 | if cyclospectrum_mass_peptide(peptide) == spectrum: 55 | result.add("-".join(map(str, peptide))) 56 | peptides = [pep for pep in peptides if pep != peptide] 57 | elif not Consistent(peptide, spectrum): 58 | peptides = [pep for pep in peptides if pep != peptide] 59 | return result 60 | 61 | 62 | if __name__ == "__main__": 63 | ''' 64 | Given: A collection of (possibly repeated) integers Spectrum corresponding to an ideal experimental spectrum. 65 | Return: Every amino acid string Peptide such that Cyclospectrum(Peptide) = Spectrum (if such a string exists). 66 | ''' 67 | spectrum = [int(x) for x in sys.stdin.readline().strip().split()] 68 | 69 | print(" ".join(cyclopeptide_sequencing(spectrum))) 70 | -------------------------------------------------------------------------------- /solutions/BA7A.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import queue 3 | 4 | 5 | class Node: 6 | def __init__(self, label): 7 | self.label = label 8 | self.linked_nodes = set() 9 | 10 | 11 | class Tree: 12 | def __init__(self): 13 | self.nodes_dict = {} 14 | 15 | def add_node(self, label): 16 | if label in self.nodes_dict: 17 | return self.nodes_dict[label] 18 | 19 | node = Node(label) 20 | self.nodes_dict[label] = node 21 | return node 22 | 23 | def construct_tree(self, adj_list): 24 | for line in adj_list: 25 | labels, weight = line.split(":") 26 | weight = int(weight) 27 | label1, label2 = [int(x) for x in labels.split("->")] 28 | 29 | node1 = self.add_node(label1) 30 | node2 = self.add_node(label2) 31 | 32 | node1.linked_nodes.add((label2, weight)) 33 | node2.linked_nodes.add((label1, weight)) 34 | 35 | def distance(self, label_a, label_b): 36 | visited = [False] * len(self.nodes_dict) 37 | distance = [0] * len(self.nodes_dict) 38 | 39 | Q = queue.Queue() 40 | distance[label_a] = 0 41 | 42 | Q.put(label_a) 43 | visited[label_a] = True 44 | while not Q.empty(): 45 | x = Q.get() 46 | for label2, weight in self.nodes_dict[x].linked_nodes: 47 | if not visited[label2]: 48 | distance[label2] = distance[x] + weight 49 | Q.put(label2) 50 | visited[label2] = True 51 | return distance[label_b] 52 | 53 | 54 | def distance_matrix_between_leaves(self, n_leaves): 55 | distance_mat = [[0 for _ in range(n_leaves)] for _ in range(n_leaves)] 56 | for i in range(n_leaves): 57 | for j in range(n_leaves): 58 | distance_mat[i][j] = self.distance(i, j) 59 | return distance_mat 60 | 61 | 62 | if __name__ == "__main__": 63 | ''' 64 | Given: An integer n followed by the adjacency list of a weighted tree with n leaves. 65 | Return: A space-separated n x n (di, j), where di, j is the length of the path between leaves i and j. 66 | ''' 67 | input_lines = sys.stdin.read().splitlines() 68 | n = int(input_lines[0]) 69 | adj_list = input_lines[1:] 70 | 71 | t = Tree() 72 | t.construct_tree(adj_list) 73 | result = t.distance_matrix_between_leaves(n) 74 | for row in result: 75 | print(" ".join(map(str, row))) -------------------------------------------------------------------------------- /solutions/BA9M.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def BetterBWMatching_wrapper(BWT, pattern_list): 5 | FirstOccurrence = {} 6 | for idx, symbol in enumerate(sorted(BWT)): 7 | if symbol not in FirstOccurrence.keys(): 8 | FirstOccurrence[symbol] = idx 9 | 10 | result = [] 11 | for pattern in pattern_list: 12 | result.append(BetterBWMatching(FirstOccurrence, BWT, pattern)) 13 | 14 | return result 15 | 16 | 17 | def Countsymbol(idx, LastColumn, symbol): 18 | return LastColumn[:idx].count(symbol) 19 | 20 | 21 | def BetterBWMatching(FirstOccurrence, LastColumn, pattern): 22 | ''' Better Burrows Wheeler Matching 23 | BetterBWMatching(FirstOccurrence, LastColumn, Pattern) 24 | top ← 0 25 | bottom ← |LastColumn| − 1 26 | while top ≤ bottom 27 | if Pattern is nonempty 28 | symbol ← last letter in Pattern 29 | remove last letter from Pattern 30 | if positions from top to bottom in LastColumn contain an occurrence of symbol 31 | top ← FirstOccurrence(symbol) + Countsymbol(top, LastColumn) 32 | bottom ← FirstOccurrence(symbol) + Countsymbol(bottom + 1, LastColumn) − 1 33 | else 34 | return 0 35 | else 36 | return bottom − top + 1 37 | ''' 38 | top = 0 39 | bottom = len(LastColumn) - 1 40 | 41 | while top <= bottom: 42 | if len(pattern) != 0: 43 | symbol = pattern[-1] 44 | pattern = pattern[:-1] 45 | 46 | # if positions from top to bottom in LastColumn 47 | # contain any occurrence of symbol 48 | if symbol in LastColumn[top:bottom + 1]: 49 | top = FirstOccurrence[symbol] + Countsymbol(top, LastColumn, symbol) 50 | bottom = FirstOccurrence[symbol] + Countsymbol(bottom + 1, LastColumn, symbol) - 1 51 | else: 52 | return 0 53 | else: 54 | return bottom - top + 1 55 | 56 | 57 | if __name__ == "__main__": 58 | ''' 59 | Given: A string BWT(Text), followed by a collection of strings Patterns. 60 | Return: A list of integers, where the i-th integer corresponds to the number of substring matches of the i-th member 61 | of Patterns in Text. 62 | ''' 63 | tmp = sys.stdin.read().splitlines() 64 | BWT = tmp[0] 65 | pattern_list = tmp[1].split(' ') 66 | 67 | match_nums = BetterBWMatching_wrapper(BWT, pattern_list) 68 | print(' '.join(str(num) for num in match_nums)) 69 | -------------------------------------------------------------------------------- /solutions/BA5H.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def fitting_alignment(str1, str2, indel_penalty=1): 5 | str1 = "-" + str1 6 | str2 = "-" + str2 7 | 8 | score_mat = [[0 for _ in range(len(str2))] for _ in range(len(str1))] 9 | backtrack_mat = [[None for _ in range(len(str2))] for _ in range(len(str1))] 10 | 11 | for i in range(1, len(str1)): 12 | for j in range(1, len(str2)): 13 | score1 = score_mat[i - 1][j - 1] + (1 if str1[i] == str2[j] else - 1) 14 | score2 = score_mat[i - 1][j] - indel_penalty 15 | score3 = score_mat[i][j - 1] - indel_penalty 16 | score_mat[i][j] = max(score1, score2, score3) 17 | if score_mat[i][j] == score1: 18 | backtrack_mat[i][j] = "d" 19 | elif score_mat[i][j] == score2: 20 | backtrack_mat[i][j] = "u" 21 | elif score_mat[i][j] == score3: 22 | backtrack_mat[i][j] = "l" 23 | 24 | j = len(str2) - 1 25 | i = max(enumerate([score_mat[row][j] for row in range(len(str2) - 1, len(str1) - 1)]), key=lambda x: x[1])[0] + len(str2) - 1 26 | max_score = score_mat[i][j] 27 | 28 | aligned_1 = aligned_2 = "" 29 | while backtrack_mat[i][j] is not None: 30 | direction = backtrack_mat[i][j] 31 | if direction == "d": 32 | aligned_1 = str1[i] + aligned_1 33 | aligned_2 = str2[j] + aligned_2 34 | i -= 1 35 | j -= 1 36 | elif direction == "u": 37 | aligned_1 = str1[i] + aligned_1 38 | aligned_2 = "-" + aligned_2 39 | i -= 1 40 | else: 41 | aligned_1 = "-" + aligned_1 42 | aligned_2 = str2[j] + aligned_2 43 | j -= 1 44 | 45 | return max_score, aligned_1, aligned_2 46 | 47 | 48 | if __name__ == "__main__": 49 | ''' 50 | Given: Two DNA strings v and w, where v has length at most 10000 and w has length at most 1000. 51 | Return: The maximum score of a fitting alignment of v and w, followed by a fitting alignment achieving this maximum 52 | score. Use the simple scoring method in which matches count +1 and both the mismatch and indel penalties are equal 53 | to 1. (If multiple fitting alignments achieving the maximum score exist, you may return any one.) 54 | ''' 55 | input_lines = sys.stdin.read().splitlines() 56 | string1 = input_lines[0] 57 | string2 = input_lines[1] 58 | 59 | score, alignment1, alignment2 = fitting_alignment(string1, string2) 60 | print(score) 61 | print(alignment1) 62 | print(alignment2) 63 | -------------------------------------------------------------------------------- /solutions/BA2D.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA1G import hamming_dist 3 | 4 | 5 | def probability(pattern, profile): 6 | prob = 1 7 | for i, nuc in enumerate(pattern): 8 | prob *= profile[nuc][i] 9 | return prob 10 | 11 | 12 | def profile_most_probable_kmer(text, profile, k): 13 | max_prob = -1 14 | for i in range(len(text) - k + 1): 15 | kmer = text[i:i + k] 16 | prob = probability(kmer, profile) 17 | if prob > max_prob: 18 | max_prob = prob 19 | result = kmer 20 | return result 21 | 22 | def GreedyMotifSearch(DNA_list, k, t): 23 | BestMotifs = [dna[0:k] for dna in DNA_list] 24 | LowestScore = CalculateScore(BestMotifs) 25 | DNA = DNA_list[0] 26 | for i in range(len(DNA) - k + 1): 27 | Motifs = [DNA[i:i + k]] 28 | for j in range(1, t): 29 | profile = FormProfile(Motifs) 30 | Motifs.append(profile_most_probable_kmer(DNA_list[j], profile, k)) 31 | CurrentScore = CalculateScore(Motifs) 32 | if CurrentScore < LowestScore: 33 | BestMotifs = Motifs 34 | LowestScore = CurrentScore 35 | return BestMotifs 36 | 37 | 38 | def FormProfile(TextList): 39 | if type(TextList) != list: 40 | TextList = [TextList] 41 | t = len(TextList) 42 | k = len(TextList[0]) 43 | profile = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0]*k} 44 | for i in range(k): 45 | for j in range(t): 46 | profile[TextList[j][i]][i] += 1 47 | return profile 48 | 49 | 50 | def CalculateScore(Motifs): 51 | k = len(Motifs[0]) 52 | profile = FormProfile(Motifs) 53 | consensus = '' 54 | for i in range(k): 55 | most_freq = 0 56 | for nuc in ['A', 'C', 'G', 'T']: 57 | if profile[nuc][i] > most_freq: 58 | most_freq = profile[nuc][i] 59 | to_add = nuc 60 | consensus += to_add 61 | score = 0 62 | for motif in Motifs: 63 | score += hamming_dist(consensus, motif) 64 | return score 65 | 66 | 67 | if __name__ == "__main__": 68 | ''' 69 | Given: Integers k and t, followed by a collection of strings Dna. 70 | Return: A collection of strings BestMotifs resulting from running GreedyMotifSearch(Dna, k, t). If at any step you 71 | find more than one Profile-most probable k-mer in a given string, use the one occurring first. 72 | ''' 73 | input_lines = sys.stdin.read().splitlines() 74 | k, t = [int(x) for x in input_lines[0].split()] 75 | DNA_list = input_lines[1:] 76 | print("\n".join(GreedyMotifSearch(DNA_list, k, t))) 77 | -------------------------------------------------------------------------------- /solutions/BA5I.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def overlap_alignment(str1, str2): 5 | str1 = "-" + str1 6 | str2 = "-" + str2 7 | 8 | score_mat = [[0 for j in range(len(str2))] for i in range(len(str1))] 9 | backtrack_mat = [[None for j in range(len(str2))] for i in range(len(str1))] 10 | 11 | for j in range(1, len(str2)): 12 | score_mat[0][j] = score_mat[0][j - 1] - 2 13 | backtrack_mat[0][j] = "l" 14 | 15 | for i in range(1, len(str1)): 16 | for j in range(1, len(str2)): 17 | 18 | score1 = score_mat[i - 1][j - 1] + (1 if str1[i] == str2[j] else -2) 19 | score2 = score_mat[i - 1][j] - 2 20 | score3 = score_mat[i][j - 1] - 2 21 | score_mat[i][j] = max(score1, score2, score3) 22 | if score_mat[i][j] == score1: 23 | backtrack_mat[i][j] = "d" 24 | elif score_mat[i][j] == score2: 25 | backtrack_mat[i][j] = "u" 26 | elif score_mat[i][j] == score3: 27 | backtrack_mat[i][j] = "l" 28 | 29 | i = len(str1) - 1 30 | j = max(range(len(str2)), key=lambda x: score_mat[i][x]) 31 | max_score = score_mat[i][j] 32 | 33 | aligned_1 = aligned_2 = "" 34 | while backtrack_mat[i][j] is not None: 35 | direction = backtrack_mat[i][j] 36 | if direction == "d": 37 | aligned_1 = str1[i] + aligned_1 38 | aligned_2 = str2[j] + aligned_2 39 | i -= 1 40 | j -= 1 41 | elif direction == "u": 42 | aligned_1 = str1[i] + aligned_1 43 | aligned_2 = "-" + aligned_2 44 | i -= 1 45 | else: 46 | aligned_1 = "-" + aligned_1 47 | aligned_2 = str2[j] + aligned_2 48 | j -= 1 49 | 50 | return max_score, aligned_1, aligned_2 51 | 52 | 53 | if __name__ == "__main__": 54 | ''' 55 | Given: Two protein strings v and w, each of length at most 1000. 56 | Return: The score of an optimal overlap alignment of v and w, followed by an alignment of a suffix v’ of v and a 57 | prefix w’ of w achieving this maximum score. Use an alignment score in which matches count +1 and both the mismatch 58 | and indel penalties are 2. (If multiple overlap alignments achieving the maximum score exist, you may return any 59 | one.) 60 | ''' 61 | input_lines = sys.stdin.read().splitlines() 62 | string1 = input_lines[0] 63 | string2 = input_lines[1] 64 | 65 | score, alignment1, alignment2 = overlap_alignment(string1, string2) 66 | print(score) 67 | print(alignment1) 68 | print(alignment2) 69 | -------------------------------------------------------------------------------- /solutions/BA10I.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA10H import print_matrices, HMMParameterEstimation 3 | from BA10C import Viterbi 4 | 5 | 6 | def Viterbi_Learning(x, init_transition_matrix, init_emission_matrix, alphabet, all_states, max_iterations): 7 | transition_matrix = init_transition_matrix 8 | emission_matrix = init_emission_matrix 9 | 10 | for iteration in range(max_iterations): 11 | # Step 1: Determine path using current parameters 12 | path = Viterbi(x, all_states, transition_matrix, emission_matrix) 13 | 14 | # Step 2: Determine new parameters using path 15 | transition_matrix, emission_matrix = HMMParameterEstimation(x, path, alphabet, all_states) 16 | 17 | return transition_matrix, emission_matrix 18 | 19 | 20 | if __name__ == "__main__": 21 | ''' 22 | Given: A sequence of emitted symbols x = x1 ... xn in an alphabet A, generated by a k-state HMM with unknown 23 | transition and emission probabilities, initial Transition and Emission matrices and a number of iterations i. 24 | Return: A matrix of transition probabilities Transition and a matrix of emission probabilities Emission that 25 | maximizes Pr(x, π) over all possible transition and emission matrices and over all hidden paths π. 26 | ''' 27 | 28 | tmp = sys.stdin.read().splitlines() 29 | 30 | j = tmp[0] 31 | x = tmp[2] 32 | alphabet = tmp[4].split() 33 | all_states = tmp[6].split() 34 | 35 | init_transition_matrix = {} 36 | init_emission_matrix = {} 37 | 38 | # initial transition matrix 39 | col_syms = tmp[8].split('\t')[1:] 40 | transition_end = 8 + len(all_states) 41 | 42 | for i in range(9, transition_end + 1): 43 | current_line = tmp[i].rstrip().split('\t') 44 | row_sym = current_line[0] 45 | init_transition_matrix[row_sym] = {} 46 | for j in range(1, len(current_line)): 47 | init_transition_matrix[row_sym][col_syms[j - 1]] = float(current_line[j]) 48 | 49 | # emission matrix 50 | col_syms = tmp[transition_end + 2].split('\t')[1:] 51 | 52 | for i in range(transition_end + 3, len(tmp)): 53 | current_line = tmp[i].rstrip().split('\t') 54 | row_sym = current_line[0] 55 | init_emission_matrix[row_sym] = {} 56 | for j in range(1, len(current_line)): 57 | init_emission_matrix[row_sym][col_syms[j - 1]] = float(current_line[j]) 58 | 59 | transition_matrix, emission_matrix = Viterbi_Learning(x, init_transition_matrix, init_emission_matrix, alphabet, 60 | all_states, j) 61 | 62 | print_matrices(transition_matrix, emission_matrix) -------------------------------------------------------------------------------- /solutions/BA11B.py: -------------------------------------------------------------------------------- 1 | import sys 2 | aa_table = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131, 3 | 'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163} 4 | mass_table = {v: k for k, v in aa_table.items()} 5 | 6 | 7 | def SpectrumGraph(spectrum): 8 | adj_list = [] 9 | for i in range(len(spectrum)): 10 | for j in range(i, len(spectrum)): 11 | if spectrum[j] - spectrum[i] in mass_table.keys(): 12 | adj_list.append([spectrum[i], spectrum[j], mass_table[spectrum[j] - spectrum[i]]]) 13 | return adj_list 14 | 15 | 16 | def IdealSpectrum(Peptide): 17 | PrefixMass = [0] 18 | for i in range(len(Peptide)): 19 | temp = PrefixMass[i] + aa_table[Peptide[i]] 20 | PrefixMass.append(temp) 21 | LinearSpectrum = [0] 22 | for i in range(len(Peptide)): 23 | for j in range(i + 1, len(Peptide) + 1): 24 | LinearSpectrum.append(PrefixMass[j] - PrefixMass[i]) 25 | LinearSpectrum.sort() 26 | return LinearSpectrum 27 | 28 | 29 | def Paths(adj_list): 30 | node = 0 31 | peptide_list = [] 32 | tmp_edges = [] 33 | peptide = '' 34 | tmp_peps = [] 35 | 36 | while any([len(x) != 0 for x in tmp_edges]) or len(tmp_edges) == 0: 37 | next_edges = [e for e in adj_list if e[0] == node] 38 | if len(next_edges) > 1: 39 | tmp = next_edges[1:] 40 | tmp_edges.append(tmp) 41 | tmp_peps.append(peptide) 42 | 43 | next_edge = next_edges[0] 44 | peptide += next_edge[2] 45 | node = next_edge[1] 46 | 47 | if len([e for e in adj_list if e[0] == node]) == 0: 48 | tmp = [x for x in tmp_edges if len(x) != 0][-1] 49 | next_edge = tmp.pop() 50 | node = next_edge[1] 51 | peptide_list.append(peptide) 52 | tmp_pep = tmp_peps.pop() 53 | peptide = tmp_pep + next_edge[2] 54 | 55 | return peptide_list 56 | 57 | 58 | def DecodingIdealSpectrum(spectrum): 59 | adj_list = SpectrumGraph(spectrum) 60 | all_paths = Paths(adj_list) 61 | for peptide in all_paths: 62 | if set(spectrum).issubset(IdealSpectrum(peptide)): 63 | return peptide 64 | 65 | 66 | if __name__ == "__main__": 67 | ''' 68 | Given: A space-delimited list of integers, Spectrum. 69 | Return: An amino acid string with an ideal spectrum that matches Spectrum. 70 | ''' 71 | spectrum = sys.stdin.read().rstrip() 72 | spectrum = [int(s) for s in spectrum.split()] 73 | spectrum = [0] + spectrum 74 | 75 | print(DecodingIdealSpectrum(spectrum)) -------------------------------------------------------------------------------- /solutions/BA2E.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA1G import hamming_dist 3 | 4 | 5 | def probability(pattern, profile): 6 | prob = 1 7 | for i, nuc in enumerate(pattern): 8 | prob *= profile[nuc][i] 9 | return prob 10 | 11 | 12 | def profile_most_probable_kmer(text, profile, k): 13 | max_prob = -1 14 | for i in range(len(text) - k + 1): 15 | kmer = text[i:i + k] 16 | prob = probability(kmer, profile) 17 | if prob > max_prob: 18 | max_prob = prob 19 | result = kmer 20 | return result 21 | 22 | def GreedyMotifSearch(DNA_list, k, t): 23 | BestMotifs = [dna[0:k] for dna in DNA_list] 24 | LowestScore = CalculateScore(BestMotifs) 25 | DNA = DNA_list[0] 26 | for i in range(len(DNA) - k + 1): 27 | Motifs = [DNA[i:i + k]] 28 | for j in range(1, t): 29 | profile = FormProfile(Motifs) 30 | Motifs.append(profile_most_probable_kmer(DNA_list[j], profile, k)) 31 | CurrentScore = CalculateScore(Motifs) 32 | if CurrentScore < LowestScore: 33 | BestMotifs = Motifs 34 | LowestScore = CurrentScore 35 | return BestMotifs 36 | 37 | 38 | def FormProfile(TextList, pseudocount=1): 39 | if type(TextList) != list: 40 | TextList = [TextList] 41 | t = len(TextList) 42 | k = len(TextList[0]) 43 | profile = {'A': [pseudocount] * k, 'C': [pseudocount] * k, 'G': [pseudocount] * k, 'T': [pseudocount] * k} 44 | for i in range(k): 45 | for j in range(t): 46 | profile[TextList[j][i]][i] += 1 47 | return profile 48 | 49 | 50 | def CalculateScore(Motifs): 51 | k = len(Motifs[0]) 52 | profile = FormProfile(Motifs) 53 | consensus = '' 54 | for i in range(k): 55 | most_freq = 0 56 | for nuc in ['A', 'C', 'G', 'T']: 57 | if profile[nuc][i] > most_freq: 58 | most_freq = profile[nuc][i] 59 | to_add = nuc 60 | consensus += to_add 61 | score = 0 62 | for motif in Motifs: 63 | score += hamming_dist(consensus, motif) 64 | return score 65 | 66 | 67 | if __name__ == "__main__": 68 | ''' 69 | Given: Integers k and t, followed by a collection of strings Dna. 70 | Return: A collection of strings BestMotifs resulting from running GreedyMotifSearch(Dna, k, t) with pseudocounts. 71 | If at any step you find more than one Profile-most probable k-mer in a given string, use the one occurring first. 72 | ''' 73 | input_lines = sys.stdin.read().splitlines() 74 | k, t = [int(x) for x in input_lines[0].split()] 75 | DNA_list = input_lines[1:] 76 | print("\n".join(GreedyMotifSearch(DNA_list, k, t))) 77 | -------------------------------------------------------------------------------- /solutions/BA8E.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def HierarchicalClustering(distance_matrix, agg_method='average'): 5 | clusters = [[i] for i in range(len(distance_matrix))] 6 | 7 | new_clusters_list = [] 8 | while len(clusters) != 1: 9 | 10 | ## Find the two closest clusters 11 | min_dist = float('inf') 12 | for i in range(len(clusters) - 1): 13 | for j in range(i + 1, len(clusters)): 14 | if agg_method == 'average': 15 | dist = 0 16 | for idx1 in clusters[i]: 17 | for idx2 in clusters[j]: 18 | dist += distance_matrix[idx1][idx2] 19 | dist /= (len(clusters[i]) * len(clusters[j])) 20 | elif agg_method == 'min': 21 | dist = float('inf') 22 | for idx1 in clusters[i]: 23 | for idx2 in clusters[j]: 24 | current = distance_matrix[idx1][idx2] 25 | if current < dist: 26 | dist = current 27 | elif agg_method == 'max': 28 | dist = -1 29 | for idx1 in clusters[i]: 30 | for idx2 in clusters[j]: 31 | current = distance_matrix[idx1][idx2] 32 | if current > dist: 33 | dist = current 34 | else: 35 | raise Exception('Agglomeration method not implemented!') 36 | if dist < min_dist: 37 | min_dist = dist 38 | closest_idx1 = i 39 | closest_idx2 = j 40 | 41 | ## Merge the two closeet clusters 42 | new_cluster = clusters[closest_idx1] + clusters[closest_idx2] 43 | clusters = [clu for clu in clusters if clu not in [clusters[closest_idx1], clusters[closest_idx2]]] 44 | clusters.append(new_cluster) 45 | new_clusters_list.append(new_cluster) 46 | return new_clusters_list 47 | 48 | 49 | if __name__ == "__main__": 50 | ''' 51 | Given: An integer n, followed by an nxn distance matrix. 52 | Return: The result of applying HierarchicalClustering to this distance matrix (using Davg), with each newly created 53 | cluster listed on each line. 54 | ''' 55 | tmp = sys.stdin.read().splitlines() 56 | n = int(tmp[0]) 57 | 58 | distance_matrix = [] 59 | for i in range(1, len(tmp)): 60 | distance_matrix.append([float(d) for d in tmp[i].split(' ')]) 61 | 62 | new_clusters_list = HierarchicalClustering(distance_matrix, 'average') 63 | for clu in new_clusters_list: 64 | print(' '.join([str(x + 1) for x in clu])) -------------------------------------------------------------------------------- /solutions/BA11G.py: -------------------------------------------------------------------------------- 1 | import sys 2 | aa_table = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131, 3 | 'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163} 4 | 5 | 6 | def is_number(n): 7 | try: 8 | float(n) # Type-casting the string to `float`. 9 | # If string is not a valid `float`, 10 | # it'll raise `ValueError` exception 11 | except ValueError: 12 | return False 13 | return True 14 | 15 | 16 | def PeptideVector(peptide): 17 | prefixMasses = [] 18 | for i in range(len(peptide)): 19 | prefixMasses.append(sum(peptide[:i + 1])) 20 | vector = [0] * prefixMasses[-1] 21 | for mass in prefixMasses: 22 | vector[mass - 1] = 1 23 | return vector 24 | 25 | 26 | def PeptideIdentification(spectral_vector, proteome): 27 | max_score = -1e6 28 | mass_list = [] 29 | for aa in proteome: 30 | mass_list.append(aa_table[aa]) 31 | 32 | best_peptide = '' 33 | 34 | for i in range(len(mass_list)): 35 | k = 2 36 | while i + k < len(mass_list): 37 | peptide = mass_list[i:i + k] 38 | pep_vec = PeptideVector(peptide) 39 | if len(pep_vec) > len(spectral_vector): 40 | break 41 | if len(pep_vec) == len(spectral_vector): 42 | score = 0 43 | for idx in range(len(pep_vec)): 44 | if pep_vec[idx] == 1: 45 | score += spectral_vector[idx] 46 | if score > max_score: 47 | max_score = score 48 | best_peptide = proteome[i:i + k] 49 | k += 1 50 | return [best_peptide, max_score] 51 | 52 | 53 | def PSMSearch(spectral_vectors, proteome, threshold): 54 | PSMSet = set() 55 | for vec in spectral_vectors: 56 | peptide, score = PeptideIdentification(vec, proteome) 57 | if score >= threshold: 58 | PSMSet.add(peptide) 59 | return PSMSet 60 | 61 | 62 | if __name__ == "__main__": 63 | ''' 64 | Given: A set of space-delimited spectral vectors SpectralVectors, an amino acid string Proteome, and a score 65 | threshold T. 66 | Return: All unique Peptide-Spectrum Matches scoring at least as high as T. 67 | ''' 68 | tmp = sys.stdin.read().splitlines() 69 | 70 | spectral_vectors = [] 71 | idx = 0 72 | while is_number(tmp[idx][0]) or is_number(tmp[idx][:2]): 73 | vec = [int(x) for x in tmp[idx].rstrip().split(' ')] 74 | spectral_vectors.append(vec) 75 | idx += 1 76 | 77 | proteome = tmp[idx].rstrip() 78 | threshold = int(tmp[idx + 1]) 79 | 80 | result = PSMSearch(spectral_vectors, proteome, threshold) 81 | 82 | for res in result: 83 | print(res) 84 | 85 | -------------------------------------------------------------------------------- /solutions/BA7C.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import networkx as nx 3 | from BA7B import limb_length 4 | 5 | 6 | def additive_phylogeny(dist_mat, num_leaves, graph, int_node): 7 | 8 | if num_leaves == 2: 9 | graph.add_edge(0, 1, weight = dist_mat[0][1]) 10 | return graph 11 | 12 | n = num_leaves - 1 13 | len_limb = limb_length(dist_mat, n, num_leaves) 14 | 15 | for j in range(n): 16 | dist_mat[j][n] -= len_limb 17 | dist_mat[n][j] = dist_mat[j][n] 18 | 19 | other_leaves = [i for i in range(num_leaves) if i != n] 20 | selected_i = -1 21 | selected_k = -1 22 | for idx1 in range(len(other_leaves) - 1): 23 | i = other_leaves[idx1] 24 | for idx2 in range(idx1 + 1, len(other_leaves)): 25 | k = other_leaves[idx2] 26 | if dist_mat[i][n] + dist_mat[n][k] == dist_mat[i][k]: 27 | selected_i = i 28 | selected_k = k 29 | x = dist_mat[selected_i][n] 30 | 31 | del dist_mat[-1] 32 | for i in range(len(dist_mat)): 33 | del dist_mat[i][-1] 34 | 35 | while int_node in list(graph.nodes): 36 | int_node += 1 37 | T = additive_phylogeny(dist_mat, num_leaves - 1, graph, int_node) 38 | 39 | V = -1 40 | spath = nx.shortest_path(T, source=selected_i, target=selected_k) 41 | dist = 0 42 | for j in range(1, len(spath) - 1): 43 | dist += T[spath[j - 1]][spath[j]]['weight'] 44 | if dist == x: 45 | V = spath[j] 46 | 47 | if V == -1: 48 | V = int_node 49 | while V in list(T.nodes): 50 | V += 1 51 | dist = 0 52 | j = 0 53 | while dist < x: 54 | j += 1 55 | pdist = dist 56 | dist += T[spath[j - 1]][spath[j]]['weight'] 57 | T.remove_edge(spath[j - 1], spath[j]) 58 | T.add_edge(V, spath[j], weight=dist - x) 59 | T.add_edge(V, spath[j - 1], weight=x - pdist) 60 | 61 | T.add_edge(V, n, weight=len_limb) 62 | 63 | return T 64 | 65 | 66 | if __name__ == "__main__": 67 | ''' 68 | Given: n and a tab-delimited n x n additive matrix. 69 | Return: A weighted adjacency list for the simple tree fitting this matrix. 70 | ''' 71 | input_lines = sys.stdin.read().splitlines() 72 | num_leaves = int(input_lines[0]) 73 | distance_matrix = [[int(x) for x in line.split()] for line in input_lines[1:]] 74 | 75 | graph = nx.Graph() 76 | result = additive_phylogeny(distance_matrix, num_leaves, graph, num_leaves) 77 | adj_dict = nx.to_dict_of_lists(result) 78 | answer = [] 79 | for key, value in adj_dict.items(): 80 | for val in value: 81 | temp = str(key) + '->' + str(val) + ':' + str(int(result[key][val]['weight'])) 82 | answer.append(temp) 83 | answer.sort() 84 | for l in answer: 85 | print(l) 86 | -------------------------------------------------------------------------------- /solutions/BA10D.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def OutcomeLikelihood(x, all_states, transition_matrix, emission_matrix): 5 | init_transition_prob = 1 / len(all_states) 6 | 7 | ## calculate all scores 8 | Forward_dict = {} 9 | for i in range(len(x)): 10 | for current_state in all_states: 11 | if current_state not in Forward_dict.keys(): 12 | Forward_dict[current_state] = {} 13 | ## if the leftmost column, initialize the recurrence 14 | # (every node in the leftmost column is connected to source) 15 | if i == 0: 16 | # Forward[source] is 1 17 | Forward_dict[current_state][i] = 1 * init_transition_prob * emission_matrix[current_state][x[i]] 18 | 19 | # forward𝑘,𝑖 = ∑all states 𝑙forward𝑙,𝑖−1⋅Weight𝑖(𝑙,𝑘) 20 | else: 21 | Forward_dict[current_state][i] = 0 22 | for state in all_states: 23 | Forward_dict[current_state][i] += Forward_dict[state][i - 1] * transition_matrix[state][ 24 | current_state] * emission_matrix[current_state][x[i]] 25 | 26 | outcome_probability = 0 27 | for state in all_states: 28 | outcome_probability += Forward_dict[state][len(x) - 1] 29 | 30 | return outcome_probability 31 | 32 | 33 | if __name__ == "__main__": 34 | ''' 35 | Given: A string x, followed by the alphabet Σ from which x was constructed, followed by the states States, 36 | transition matrix Transition, and emission matrix Emission of an HMM (Σ, States, Transition, Emission). 37 | Return: The probability Pr(x) that the HMM emits x. 38 | ''' 39 | tmp = sys.stdin.read().splitlines() 40 | 41 | x = tmp[0] # the emitted string 42 | alphabet = tmp[2].split() # the alphabet from which x was constructed 43 | states = tmp[4].split() # the states of HMM 44 | 45 | # transition matrix 46 | col_syms = tmp[6].split() 47 | transition_end = 6 + len(states) 48 | 49 | transition_matrix = {} 50 | for i in range(7, transition_end + 1): 51 | current_line = tmp[i].split() 52 | row_sym = current_line[0] 53 | transition_matrix[row_sym] = {} 54 | for j in range(1, len(current_line)): 55 | transition_matrix[row_sym][col_syms[j - 1]] = float(current_line[j]) 56 | 57 | # emission matrix 58 | col_syms = tmp[transition_end + 2].split() 59 | emission_matrix = {} 60 | for i in range(transition_end + 3, len(tmp)): 61 | current_line = tmp[i].rstrip().split() 62 | row_sym = current_line[0] 63 | emission_matrix[row_sym] = {} 64 | for j in range(1, len(current_line)): 65 | emission_matrix[row_sym][col_syms[j - 1]] = float(current_line[j]) 66 | print("{:.11E}".format(OutcomeLikelihood(x, states, transition_matrix, emission_matrix))) -------------------------------------------------------------------------------- /solutions/BA2F.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from random import randint 3 | 4 | from BA1G import hamming_dist 5 | 6 | 7 | def probability(pattern, profile): 8 | prob = 1 9 | for i, nuc in enumerate(pattern): 10 | prob *= profile[nuc][i] 11 | return prob 12 | 13 | 14 | def profile_most_probable_kmer(text, profile, k): 15 | max_prob = -1 16 | for i in range(len(text) - k + 1): 17 | kmer = text[i:i + k] 18 | prob = probability(kmer, profile) 19 | if prob > max_prob: 20 | max_prob = prob 21 | result = kmer 22 | return result 23 | 24 | 25 | def FormProfile(TextList, pseudocount=1): 26 | if type(TextList) != list: 27 | TextList = [TextList] 28 | t = len(TextList) 29 | k = len(TextList[0]) 30 | profile = {'A': [pseudocount] * k, 'C': [pseudocount] * k, 'G': [pseudocount] * k, 'T': [pseudocount] * k} 31 | for i in range(k): 32 | for j in range(t): 33 | profile[TextList[j][i]][i] += 1 34 | return profile 35 | 36 | 37 | def CalculateScore(Motifs): 38 | k = len(Motifs[0]) 39 | profile = FormProfile(Motifs) 40 | consensus = '' 41 | for i in range(k): 42 | most_freq = 0 43 | for nuc in ['A', 'C', 'G', 'T']: 44 | if profile[nuc][i] > most_freq: 45 | most_freq = profile[nuc][i] 46 | to_add = nuc 47 | consensus += to_add 48 | score = 0 49 | for motif in Motifs: 50 | score += hamming_dist(consensus, motif) 51 | return score 52 | 53 | 54 | def RandomizedMotifSearch(DNA_list, k, t): 55 | Motifs = [] 56 | for dna in DNA_list: 57 | idx = randint(0, len(dna) - k) 58 | Motifs.append(dna[idx:idx + k]) 59 | BestMotifs = Motifs 60 | while True: 61 | profile = FormProfile(Motifs) 62 | Motifs = [] 63 | for dna in DNA_list: 64 | Motifs.append(profile_most_probable_kmer(dna, profile, k)) 65 | if CalculateScore(Motifs) < CalculateScore(BestMotifs): 66 | BestMotifs = Motifs 67 | else: 68 | return BestMotifs 69 | 70 | 71 | if __name__ == "__main__": 72 | ''' 73 | Given: Positive integers k and t, followed by a collection of strings Dna. 74 | Return: A collection BestMotifs resulting from running RandomizedMotifSearch(Dna, k, t) 1000 times. Remember to use 75 | pseudocounts! 76 | ''' 77 | input_lines = sys.stdin.read().splitlines() 78 | k, t = [int(x) for x in input_lines[0].split()] 79 | DNA_list = input_lines[1:] 80 | 81 | best_score = float("Inf") 82 | for _ in range(1000): 83 | result = RandomizedMotifSearch(DNA_list, k, t) 84 | current_score = CalculateScore(result) 85 | if current_score <= best_score: 86 | best_score = current_score 87 | best_result = result 88 | 89 | print("\n".join(best_result)) 90 | -------------------------------------------------------------------------------- /solutions/BA8D.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from math import exp 3 | from random import random 4 | 5 | 6 | def Euclidean_distance(PointA, PointB): 7 | if len(PointA) != len(PointB): 8 | raise ValueError('The dimensions are not the same!') 9 | dist = 0 10 | for i in range(len(PointA)): 11 | dist += ((PointA[i] - PointB[i]) ** 2) 12 | dist **= 1/2 13 | return dist 14 | 15 | 16 | def dist_from_centers(DataPoint, Centers): 17 | min_d = float("inf") 18 | for C in Centers: 19 | distance = Euclidean_distance(DataPoint, C) 20 | if distance < min_d: 21 | min_d = distance 22 | return min_d 23 | 24 | 25 | def Random(prob_list): 26 | tot = sum(prob_list) 27 | massDist = map(lambda x: x/tot, prob_list) 28 | randRoll = random() 29 | cum = 0 30 | result = 0 31 | for mass in massDist: 32 | cum += mass 33 | if randRoll < cum: 34 | return result 35 | result += 1 36 | 37 | 38 | def Hidden_Matrix(Data, Centers, beta): 39 | hidden_mat = [[0 for _ in range(len(Data))] for _ in range(len(Centers))] 40 | for j in range(len(Data)): 41 | tot = 0 42 | for i in range(len(Centers)): 43 | tot += exp(-beta * Euclidean_distance(Centers[i], Data[j])) 44 | for i in range(len(Centers)): 45 | hidden_mat[i][j] = exp(-beta * Euclidean_distance(Centers[i], Data[j])) / tot 46 | return hidden_mat 47 | 48 | 49 | def clu_to_center(hidden_mat, Data): 50 | k = len(hidden_mat) 51 | m = len(Data[0]) 52 | n = len(Data) 53 | new_centers = [[0 for j in range(m)] for i in range(k)] 54 | for i in range(k): 55 | for j in range(m): 56 | product = 0 57 | for idx in range(n): 58 | product += Data[idx][j] * hidden_mat[i][idx] 59 | new_centers[i][j] = product / sum(hidden_mat[i]) 60 | return new_centers 61 | 62 | 63 | def soft_kmeans(Data, k, beta, N=100): 64 | Centers = Data[:k] 65 | for _ in range(N): 66 | hidden_mat = Hidden_Matrix(Data, Centers, beta) 67 | Centers = clu_to_center(hidden_mat, Data) 68 | return Centers 69 | 70 | 71 | if __name__ == "__main__": 72 | ''' 73 | Given: Integers k and m, followed by a stiffness parameter β, followed by a set of points Data in m-dimensional 74 | space. 75 | Return: A set Centers consisting of k points (centers) resulting from applying the soft k-means clustering 76 | algorithm. Select the first k points from Data as the first centers for the algorithm and run the algorithm for 100 77 | steps. Results should be accurate up to three decimal places. 78 | ''' 79 | input_lines = sys.stdin.read().splitlines() 80 | k, m = [int(x) for x in input_lines[0].split(' ')] 81 | beta = float(input_lines[1]) 82 | data = [[float(x) for x in line.split()] for line in input_lines[2:]] 83 | 84 | Centers = soft_kmeans(data, k, beta) 85 | for C in Centers: 86 | print(" ".join(map(str, C))) 87 | -------------------------------------------------------------------------------- /solutions/BA7E.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def TotalDistance(dist_dict, i): 4 | return sum(dist_dict[i].values()) 5 | 6 | def ConstructNJMatrix(dist_dict): 7 | D_NJ = {} 8 | for key1, val1 in dist_dict.items(): 9 | for key2, val in dist_dict[key1].items(): 10 | if not key1 in D_NJ: 11 | D_NJ[key1] = {} 12 | if key1 == key2: 13 | D_NJ[key1][key2] = 0 14 | else: 15 | D_NJ[key1][key2] = (len(dist_dict) - 2) * val - TotalDistance(dist_dict, key1) - TotalDistance(dist_dict, key2) 16 | return D_NJ 17 | 18 | 19 | def NeighborJoining(dist_dict, num_leaves): 20 | if num_leaves == 2: 21 | idx1 = list(dist_dict.keys())[0] 22 | idx2 = list(dist_dict.keys())[1] 23 | T = [[idx1, idx2, dist_dict[idx1][idx2]], [idx2, idx1, dist_dict[idx1][idx2]]] 24 | return T 25 | 26 | D_NJ = ConstructNJMatrix(dist_dict) 27 | 28 | min_dist = 1e6 29 | for key1, val1 in D_NJ.items(): 30 | for key2, val in D_NJ[key1].items(): 31 | if key1 != key2 and val < min_dist: 32 | idx1 = key1 33 | idx2 = key2 34 | min_dist = val 35 | 36 | delta = (TotalDistance(dist_dict, idx1) - TotalDistance(dist_dict, idx2)) / (num_leaves - 2) 37 | LimbLength1 = (dist_dict[idx1][idx2] + delta) / 2 38 | LimbLength2 = (dist_dict[idx1][idx2] - delta) / 2 39 | 40 | m = max(list(dist_dict.keys())) + 1 41 | 42 | for k in dist_dict.keys(): 43 | dist_dict[k][m] = (dist_dict[idx1][k] + dist_dict[k][idx2] - dist_dict[idx1][idx2]) / 2 44 | 45 | dist_dict[m] = {} 46 | for k in dist_dict.keys(): 47 | dist_dict[m][k] = (dist_dict[idx1][k] + dist_dict[k][idx2] - dist_dict[idx1][idx2]) / 2 48 | 49 | dist_dict[m][m] = 0.0 50 | 51 | del dist_dict[idx1] 52 | del dist_dict[idx2] 53 | 54 | for k in dist_dict.keys(): 55 | del dist_dict[k][idx1] 56 | del dist_dict[k][idx2] 57 | 58 | T = NeighborJoining(dist_dict, num_leaves - 1) 59 | 60 | T.append([idx1, m, LimbLength1]) 61 | T.append([m, idx1, LimbLength1]) 62 | T.append([idx2, m, LimbLength2]) 63 | T.append([m, idx2, LimbLength2]) 64 | 65 | return T 66 | 67 | 68 | if __name__ == "__main__": 69 | ''' 70 | Given: An integer n, followed by a space-separated n x n distance matrix. 71 | Return: An adjacency list for the tree resulting from applying the neighbor-joining algorithm. Edge-weights should 72 | be accurate to two decimal places (they are provided to three decimal places in the sample output below). 73 | ''' 74 | lines = sys.stdin.read().splitlines() 75 | num_leaves = int(lines[0]) 76 | dist_dict = {} 77 | for i in range(len(lines[1:])): 78 | row = lines[i + 1] 79 | temp = row.rstrip().split() 80 | temp_d = {} 81 | for j in range(len(temp)): 82 | temp_d[j] = int(temp[j]) 83 | dist_dict[i] = temp_d 84 | 85 | result = NeighborJoining(dist_dict, num_leaves) 86 | 87 | result.sort(key=lambda x: x[0]) 88 | 89 | for edge in result: 90 | print(str(edge[0]) + '->' + str(edge[1]) + ':' + '%.3f' % edge[2]) -------------------------------------------------------------------------------- /solutions/BA9P.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from re import split 3 | 4 | 5 | class colored_Tree_node: 6 | ''' Colored Tree Node Structure 7 | Each node has children(list) and color (string) attributes. 8 | ''' 9 | 10 | def __init__(self): 11 | self.children = [] 12 | self.color = 'gray' 13 | 14 | 15 | def is_node_ripe(tree_dict, node): 16 | ''' Check if node is ripe 17 | A node in a tree is called ripe if it is "gray" but has no "gray" children. 18 | ''' 19 | if node.color != 'gray': 20 | return False 21 | 22 | for child_idx in node.children: 23 | if tree_dict[child_idx].color == 'gray': 24 | return False 25 | 26 | return True 27 | 28 | 29 | def return_ripe_nodes(tree_dict): 30 | ''' Return ripe nodes in tree 31 | ''' 32 | ripe_nodes = [] 33 | for node_idx, node in tree_dict.items(): 34 | if is_node_ripe(tree_dict, node): 35 | ripe_nodes.append(node) 36 | return ripe_nodes 37 | 38 | 39 | def TreeColoring(tree_dict): 40 | ''' Tree Coloring 41 | # TreeColoring(ColoredTree) 42 | # while ColoredTree has ripe nodes 43 | # for each ripe node v in ColoredTree 44 | # if there exist differently colored children of v 45 | # Color(v) ← "purple" 46 | # else 47 | # Color(v) ← color of all children of v 48 | # return ColoredTree 49 | ''' 50 | 51 | ripe_list = return_ripe_nodes(tree_dict) 52 | while len(ripe_list) != 0: ## while there are ripe nodes 53 | 54 | for ripe_node in ripe_list: 55 | ## collect colors for all children 56 | children_cols = [] 57 | for child_idx in ripe_node.children: 58 | children_cols.append(tree_dict[child_idx].color) 59 | 60 | # if there exist differently colored children of v 61 | children_cols = list(set(children_cols)) 62 | 63 | if len(children_cols) != 1: 64 | ripe_node.color = "purple" 65 | else: 66 | ripe_node.color = children_cols[0] # color of all children 67 | 68 | ripe_list = return_ripe_nodes(tree_dict) 69 | 70 | return tree_dict 71 | 72 | 73 | if __name__ == '__main__': 74 | ''' 75 | Given: An adjacency list, followed by color labels for leaf nodes. 76 | Return: Color labels for all nodes, in any order. 77 | ''' 78 | tmp = sys.stdin.read().splitlines() 79 | 80 | lst_flag = True 81 | 82 | tree_dict = {} 83 | for line in tmp: 84 | if line == '-': 85 | lst_flag = False 86 | 87 | elif lst_flag: 88 | tmp2 = split(' -> ', line) 89 | if tmp2[1] == "{}": 90 | tmp2[1] = [] 91 | else: 92 | tmp2[1] = tmp2[1].split(',') 93 | 94 | node = colored_Tree_node() 95 | node.children = tmp2[1] 96 | tree_dict[tmp2[0]] = node 97 | 98 | else: 99 | tmp2 = split(': ', line) 100 | node = tree_dict[tmp2[0]] 101 | node.color = tmp2[1] 102 | 103 | tree_dict = TreeColoring(tree_dict) 104 | 105 | for node_idx, node in tree_dict.items(): 106 | print(node_idx + ': ' + node.color) -------------------------------------------------------------------------------- /solutions/BA5D.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | class node: 5 | def __init__(self, lbl): 6 | self.label = lbl 7 | self.parent_nodes = [] 8 | self.target_nodes = [] 9 | self.visited = False 10 | 11 | 12 | class DAG: 13 | def __init__(self): 14 | self.nodes_dict = {} 15 | self.distances = {} 16 | self.backtrack = {} 17 | 18 | def add_node(self, lbl): 19 | if lbl in self.nodes_dict: 20 | return self.nodes_dict[lbl] 21 | 22 | new_node = node(lbl) 23 | self.nodes_dict[lbl] = new_node 24 | return new_node 25 | 26 | def contruct_dag(self, adj_list_text): 27 | for line in adj_list_text: 28 | nodeA, tmp = line.split("->") 29 | nodeB, weight = tmp.split(":") 30 | weight = int(weight) 31 | 32 | from_node = self.add_node(nodeA) 33 | to_node = self.add_node(nodeB) 34 | 35 | from_node.target_nodes.append((to_node, weight)) 36 | to_node.parent_nodes.append((from_node, weight)) 37 | 38 | def topological_sort_util(self, node, stack): 39 | node.visited = True 40 | for node2,_ in node.target_nodes: 41 | if not node2.visited: 42 | self.topological_sort_util(node2, stack) 43 | stack.insert(0, node.label) 44 | 45 | def topological_sort(self): 46 | stack = [] 47 | for node in self.nodes_dict.values(): 48 | if not node.visited: 49 | self.topological_sort_util(node, stack) 50 | return stack 51 | 52 | def longest_path(self, source, sink): 53 | for label in self.nodes_dict: 54 | self.distances[label] = -float("Inf") 55 | 56 | self.distances[source] = 0 57 | self.backtrack[source] = None 58 | 59 | top_order = self.topological_sort() 60 | for label in top_order: 61 | current_node = self.nodes_dict[label] 62 | for v, weight in current_node.target_nodes: 63 | if self.distances[v.label] < self.distances[label] + weight: 64 | self.distances[v.label] = self.distances[label] + weight 65 | self.backtrack[v.label] = label 66 | 67 | path = [sink] 68 | curr = self.backtrack[sink] 69 | while curr != source: 70 | path = [curr] + path 71 | curr = self.backtrack[curr] 72 | path = [source] + path 73 | return self.distances[sink], path 74 | 75 | 76 | 77 | 78 | if __name__ == "__main__": 79 | ''' 80 | Given: An integer representing the source node of a graph, followed by an integer representing the sink node of the 81 | graph, followed by an edge-weighted graph. The graph is represented by a modified adjacency list in which the 82 | notation "0->1:7" indicates that an edge connects node 0 to node 1 with weight 7. 83 | Return: The length of a longest path in the graph, followed by a longest path. (If multiple longest paths exist, 84 | you may return any one.) 85 | ''' 86 | input_lines = sys.stdin.read().splitlines() 87 | source = input_lines[0] 88 | sink = input_lines[1] 89 | adj_list_text = input_lines[2:] 90 | 91 | graph = DAG() 92 | graph.contruct_dag(adj_list_text) 93 | longest_dist, longest_path = graph.longest_path(source, sink) 94 | print(longest_dist) 95 | print("->".join(longest_path)) -------------------------------------------------------------------------------- /solutions/BA9O.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA9I import BurrowsWheelerTransform 3 | from BA9N import create_check_point_array, Count_symbol, PartialSuffixArray, MultiplePatternMatching 4 | 5 | 6 | def pattern_to_seeds(pattern, d): 7 | minsize = len(pattern) // (d + 1) 8 | 9 | cut_points = list(range(0, len(pattern) - minsize + 1, minsize)) 10 | cut_points.append(len(pattern)) 11 | 12 | seeds = [] 13 | offsets = [] 14 | for i in range(1, len(cut_points)): 15 | seeds.append(pattern[cut_points[i - 1]: cut_points[i]]) 16 | offsets.append(cut_points[i - 1]) 17 | return seeds, offsets 18 | 19 | 20 | def find_seed_positions(seed, FirstOccurrence, BWT, check_point_array, partial_suffix_array): 21 | seed_pos_list = [] 22 | top, bottom = MultiplePatternMatching(FirstOccurrence, BWT, seed, check_point_array) 23 | if top: 24 | for idx in range(top, bottom + 1): 25 | to_add = 0 26 | while idx not in partial_suffix_array.keys(): 27 | idx = FirstOccurrence[BWT[idx]] + Count_symbol(check_point_array, idx, BWT, BWT[idx]) 28 | to_add += 1 29 | seed_pos_list.append(partial_suffix_array[idx] + to_add) 30 | return seed_pos_list 31 | 32 | 33 | def wrapper(Text, pattern_list, d, C): 34 | BWT = BurrowsWheelerTransform(Text + '$') 35 | 36 | FirstOccurrence = {} 37 | for idx, symbol in enumerate(sorted(BWT)): 38 | if symbol not in FirstOccurrence.keys(): 39 | FirstOccurrence[symbol] = idx 40 | 41 | check_point_array = create_check_point_array(BWT, C) 42 | partial_suffix_array = PartialSuffixArray(Text + '$', C) 43 | 44 | positions_list = [] 45 | for pattern in pattern_list: 46 | ## break pattern into seeds 47 | seeds_list, offsets_list = pattern_to_seeds(pattern, d) 48 | 49 | # find exact matches and try to extend each seed 50 | pattern_pos_list = set() 51 | for candidate_seed, offset in zip(seeds_list, offsets_list): 52 | seed_pos_list = find_seed_positions(candidate_seed, FirstOccurrence, BWT, check_point_array, 53 | partial_suffix_array) 54 | 55 | for candidate_pos in seed_pos_list: 56 | pattern_position = candidate_pos - offset 57 | 58 | if pattern_position >= 0 and pattern_position + len(pattern) <= len(Text): 59 | approximate_match_flag = True 60 | num_mismatch = 0 61 | for idx, symbol in enumerate(pattern): 62 | if symbol != Text[pattern_position + idx]: 63 | num_mismatch += 1 64 | if num_mismatch > d: 65 | approximate_match_flag = False 66 | break 67 | if approximate_match_flag: 68 | pattern_pos_list.add(pattern_position) 69 | 70 | positions_list += list(pattern_pos_list) 71 | 72 | return sorted(positions_list) 73 | 74 | 75 | if __name__ == "__main__": 76 | ''' 77 | Given: A string Text, a collection of strings Patterns, and an integer d. 78 | Return: All positions in Text where a string from Patterns appears as a substring with at most d mismatches. 79 | ''' 80 | tmp = sys.stdin.read().splitlines() 81 | Text = tmp[0] 82 | pattern_list = [pattern for pattern in tmp[1].split(' ')] 83 | d = int(tmp[2]) 84 | 85 | positions_list = wrapper(Text, pattern_list, d, C=100) 86 | print(' '.join(str(pos) for pos in positions_list)) 87 | -------------------------------------------------------------------------------- /solutions/BA5M.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from itertools import product 3 | 4 | 5 | def multiple_alignment(str_list): 6 | str_list = ["-" + string for string in str_list] 7 | 8 | score_mat = {} 9 | backtrack_mat = {} 10 | 11 | def add_tuples_elemwise(t1, t2): 12 | return tuple(sum(x) for x in zip(t1, t2)) 13 | 14 | # all possible "moves" 15 | perm_list = list(product([0, -1], repeat=len(str_list)))[1:] 16 | 17 | # fill n-dimensional score and backtrack matrices 18 | for index in product(*map(range, map(lambda s: len(s) + 1, str_list))): 19 | if index.count(0) >= len(str_list) - 1: 20 | if sum(index) == 0: 21 | score_mat[index] = 0 22 | else: 23 | score_mat[index] = 0 24 | move = tuple(0 if id == 0 else -1 for id in index) 25 | bck = -1 26 | for idx, perm in enumerate(perm_list): 27 | if perm == move: 28 | bck = idx 29 | break 30 | backtrack_mat[index] = bck 31 | else: 32 | possible_scores = [] 33 | for perm_idx, move in enumerate(perm_list): 34 | prev_idx = add_tuples_elemwise(index, move) 35 | if -1 not in prev_idx: 36 | prev_score = score_mat[prev_idx] 37 | chars = [str_list[i][index[i] - 1] if val == -1 else "-" for i, val in enumerate(move)] 38 | # score of an alignment column is 1 if all three symbols are identical and 0 otherwise 39 | 40 | current = 1 if all(elem == chars[0] for elem in chars) and chars[0] != "-" else 0 41 | possible_scores.append((prev_score + current, perm_idx)) 42 | score_mat[index], backtrack_mat[index] = max(possible_scores, key=lambda p: p[0]) 43 | 44 | # backtrack 45 | alignment = ["" for _ in str_list] 46 | current_index = list(map(len, str_list)) 47 | max_score = score_mat[tuple(current_index)] 48 | 49 | while sum(current_index) != 0: 50 | back_perm_idx = backtrack_mat[tuple(current_index)] 51 | permutation = perm_list[back_perm_idx] 52 | for i, perm_value in enumerate(permutation): 53 | if perm_value == 0: 54 | alignment[i] = "-" + alignment[i] 55 | else: 56 | alignment[i] = str_list[i][current_index[i] - 1] + alignment[i] 57 | 58 | current_index = add_tuples_elemwise(tuple(current_index), permutation) 59 | 60 | # remove all "-" columns 61 | to_rm_idx = [] 62 | for pos in range(len(alignment[0])): 63 | temp = [x[pos] for x in alignment] 64 | if all(x == "-" for x in temp): 65 | to_rm_idx.append(pos) 66 | 67 | for i in range(len(alignment)): 68 | alignment[i] = "".join([char for idx, char in enumerate(alignment[i]) if idx not in to_rm_idx]) 69 | 70 | return max_score, alignment 71 | 72 | 73 | if __name__ == "__main__": 74 | ''' 75 | Given: Three DNA strings. 76 | Return: The maximum score of a multiple alignment of these three strings, followed by a multiple alignment of the 77 | three strings achieving this maximum. Use a scoring function in which the score of an alignment column is 1 if all 78 | three symbols are identical and 0 otherwise. (If more than one multiple alignment achieve the maximum, you may 79 | return any one.) 80 | ''' 81 | DNA_strings_list = sys.stdin.read().splitlines() 82 | 83 | score, alignment = multiple_alignment(DNA_strings_list) 84 | 85 | print(score) 86 | 87 | for aligned in alignment: 88 | print(aligned) -------------------------------------------------------------------------------- /solutions/BA2G.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from random import randint 3 | from random import random 4 | 5 | 6 | def GibbsSampler(Dna, k, t, N): 7 | Motifs = [] 8 | for dna in Dna: 9 | idx = randint(0, len(dna) - k) 10 | Motifs.append(dna[idx:idx + k]) 11 | BestMotifs = Motifs 12 | min_score = CalculateScore(BestMotifs) 13 | for ITER in range(N): 14 | idx = randint(0, t - 1) 15 | profile = FormProfileWithPseudoCounts([motif for i, motif in enumerate(Motifs) if i != idx]) 16 | Motifs[idx] = ProfileRandomlyGeneratedKmer(Dna[idx], profile) 17 | current_score = CalculateScore(Motifs) 18 | if current_score < min_score: 19 | BestMotifs = Motifs 20 | min_score = current_score 21 | return BestMotifs 22 | 23 | 24 | def FormProfileWithPseudoCounts(TextList, pseudocount = 1): 25 | if type(TextList) != list: 26 | TextList = [TextList] 27 | t = len(TextList) 28 | k = len(TextList[0]) 29 | profile = {'A': [pseudocount]*k, 'C': [pseudocount]*k, 'G': [pseudocount]*k, 'T': [pseudocount]*k} 30 | for i in range(k): 31 | for j in range(t): 32 | profile[TextList[j][i]][i] += 1 33 | return profile 34 | 35 | 36 | def ProfileRandomlyGeneratedKmer(Text, profile): 37 | L = len(Text) 38 | k = len(profile['A']) 39 | probs = [] 40 | tot = profile['A'][0] + profile['C'][0] + profile['G'][0] + profile['T'][0] 41 | for i in range(L - k + 1): 42 | Motif = Text[i:i + k] 43 | current_prob = 1.0 44 | for j, nuc in enumerate(Motif): 45 | current_prob *= float(profile[nuc][j]) / tot 46 | probs.append(current_prob) 47 | selected_start = Random(probs) 48 | return Text[selected_start:selected_start + k] 49 | 50 | 51 | def Random(prob_list): 52 | tot = sum(prob_list) 53 | massDist = map(lambda x: x/tot, prob_list) 54 | randRoll = random() 55 | cum = 0 56 | result = 0 57 | for mass in massDist: 58 | cum += mass 59 | if randRoll < cum: 60 | return result 61 | result += 1 62 | 63 | 64 | def HammingDistance(p, q): 65 | mm = [p[i] != q[i] for i in range(len(p))] 66 | return sum(mm) 67 | 68 | 69 | def CalculateScore(Motifs): 70 | k = len(Motifs[0]) 71 | profile = FormProfileWithPseudoCounts(Motifs, 0) 72 | consensus = '' 73 | for i in range(k): 74 | most_freq = 0 75 | for nuc in ['A', 'C', 'G', 'T']: 76 | if profile[nuc][i] > most_freq: 77 | most_freq = profile[nuc][i] 78 | to_add = nuc 79 | consensus += to_add 80 | score = 0 81 | for motif in Motifs: 82 | score += HammingDistance(consensus, motif) 83 | return score 84 | 85 | 86 | def wrapper(Dna, k, t, N, nstart = 20): 87 | min_score = 1e6 88 | for i in range(nstart): 89 | res = GibbsSampler(Dna, k, t, N) 90 | current_score = CalculateScore(res) 91 | # print current_score 92 | if current_score < min_score: 93 | min_score = current_score 94 | result = res 95 | return result 96 | 97 | 98 | if __name__ == "__main__": 99 | ''' 100 | Given: Integers k, t, and N, followed by a collection of strings Dna. 101 | Return: The strings BestMotifs resulting from running GibbsSampler(Dna, k, t, N) with 20 random starts. Remember to use pseudocounts! 102 | ''' 103 | input_lines = sys.stdin.read().splitlines() 104 | k, t, N = [int(x) for x in input_lines[0].split()] 105 | DNA_list = input_lines[1:] 106 | 107 | print("\n".join(wrapper(DNA_list, k, t, N))) -------------------------------------------------------------------------------- /solutions/BA7D.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | class Node: 5 | def __init__(self, label): 6 | self.label = label 7 | self.age = 0 8 | 9 | 10 | class Tree: 11 | def __init__(self): 12 | self.nodes = {} 13 | self.edges = [] 14 | 15 | def add_node(self, label): 16 | if label in self.nodes: 17 | return self.nodes[label] 18 | 19 | node = Node(label) 20 | self.nodes[label] = node 21 | return node 22 | 23 | def UPGMA(self, dist_mat, n): 24 | dist_dict = {} 25 | for i in range(len(dist_mat)): 26 | dist_dict[i] = {} 27 | for j in range(len(dist_mat[i])): 28 | dist_dict[i][j] = dist_mat[i][j] 29 | 30 | Clusters = {i: [i] for i in range(n)} 31 | 32 | for i in range(n): 33 | self.add_node(i) 34 | 35 | new_node_label = n 36 | T = [] 37 | while len(dist_dict) > 1: 38 | min_dist = float("Inf") 39 | nodes = list(dist_dict.keys()) 40 | for i in range(len(nodes) - 1): 41 | for j in range(i + 1, len(nodes)): 42 | if dist_dict[nodes[i]][nodes[j]] < min_dist: 43 | min_dist = dist_dict[nodes[i]][nodes[j]] 44 | node_i = nodes[i] 45 | node_j = nodes[j] 46 | 47 | new_cluster = Clusters[node_i] + Clusters[node_j] 48 | 49 | new_node = self.add_node(new_node_label) 50 | T.append([new_node_label, node_i]) 51 | T.append([new_node_label, node_j]) 52 | 53 | new_node.age = dist_dict[node_i][node_j] / 2 54 | 55 | dist_dict[new_node_label] = {} 56 | dist_dict[new_node_label][new_node_label] = 0 57 | for old_node in nodes: 58 | total = 0 59 | count = 0 60 | for init_node in Clusters[old_node]: 61 | for node in new_cluster: 62 | total += dist_mat[init_node][node] 63 | count += 1 64 | dist_dict[old_node][new_node_label] = total / count 65 | dist_dict[new_node_label][old_node] = total / count 66 | 67 | Clusters[new_node_label] = new_cluster 68 | new_node_label += 1 69 | 70 | del dist_dict[node_i] 71 | del dist_dict[node_j] 72 | for key in dist_dict.keys(): 73 | del dist_dict[key][node_i] 74 | 75 | for key in dist_dict.keys(): 76 | del dist_dict[key][node_j] 77 | 78 | for edge in T: 79 | length = self.nodes[edge[0]].age - self.nodes[edge[1]].age 80 | self.edges.append(edge + [length]) 81 | self.edges.append(edge[::-1] + [length]) 82 | 83 | self.edges.sort(key=lambda x: x[1]) 84 | self.edges.sort(key=lambda x: x[0]) 85 | 86 | return self.edges 87 | 88 | 89 | if __name__ == "__main__": 90 | ''' 91 | Given: An integer n followed by a space-delimited n x n distance matrix. 92 | Return: An adjacency list for the ultrametric tree output by UPGMA. Weights should be accurate to three decimal 93 | places. 94 | ''' 95 | input_lines = sys.stdin.read().splitlines() 96 | n = int(input_lines[0]) 97 | distance_matrix = [[int(x) for x in line.split()] for line in input_lines[1:]] 98 | 99 | t = Tree() 100 | adj_list = t.UPGMA(distance_matrix, n) 101 | 102 | for node1, node2, weight in adj_list: 103 | temp = str(node1) + '->' + str(node2) + ':' + str(round(weight, 3)) 104 | print(temp) 105 | 106 | -------------------------------------------------------------------------------- /solutions/BA9N.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA9I import BurrowsWheelerTransform 3 | 4 | 5 | def create_check_point_array(BWT, C): 6 | symbol_list = list(set(BWT)) 7 | check_point_array = {} 8 | for idx in range(0, len(BWT), C): 9 | check_point_array[idx] = {} 10 | for symbol in symbol_list: 11 | check_point_array[idx][symbol] = BWT[:idx].count(symbol) 12 | return check_point_array 13 | 14 | 15 | def Count_symbol(check_point_array, idx, LastColumn, symbol): 16 | vals = [x for x in check_point_array.keys() if x <= idx] 17 | nearest_idx = min(vals, key=lambda x: abs(x - idx)) 18 | 19 | count = check_point_array[nearest_idx][symbol] 20 | count += LastColumn[nearest_idx:idx].count(symbol) 21 | return count 22 | 23 | 24 | def PartialSuffixArray(Text, K): 25 | suffixes = [] 26 | suffix_array = [] 27 | for i in range(len(Text)): 28 | suffixes.append(Text[i:]) 29 | suffix_array.append(i) 30 | 31 | suffix_array = [x for _, x in sorted(zip(suffixes, suffix_array), key=lambda pair: pair[0])] 32 | 33 | partial_suffix_array = {i: x for i, x in enumerate(suffix_array) if x % K == 0} 34 | 35 | return partial_suffix_array 36 | 37 | 38 | def MultiplePatternMatching(FirstOccurrence, LastColumn, pattern, check_point_array): 39 | ''' Multiple Pattern Matching with BWT 40 | ''' 41 | top = 0 42 | bottom = len(LastColumn) - 1 43 | 44 | while top <= bottom: 45 | if len(pattern) != 0: 46 | symbol = pattern[-1] 47 | pattern = pattern[:-1] 48 | 49 | # if positions from top to bottom in LastColumn 50 | # contain any occurrence of symbol 51 | if symbol in LastColumn[top: bottom + 1]: 52 | top = FirstOccurrence[symbol] + Count_symbol(check_point_array, top, LastColumn, symbol) 53 | bottom = FirstOccurrence[symbol] + Count_symbol(check_point_array, bottom + 1, LastColumn, symbol) - 1 54 | else: 55 | return False, False 56 | else: 57 | return top, bottom 58 | 59 | 60 | def wrapper(Text, pattern_list, C): 61 | BWT = BurrowsWheelerTransform(Text + '$') 62 | 63 | FirstOccurrence = {} 64 | for idx, symbol in enumerate(sorted(BWT)): 65 | if symbol not in FirstOccurrence.keys(): 66 | FirstOccurrence[symbol] = idx 67 | 68 | check_point_array = create_check_point_array(BWT, C) 69 | partial_suffix_array = PartialSuffixArray(Text + '$', C) 70 | 71 | positions_list = [] 72 | for pattern in pattern_list: 73 | top, bottom = MultiplePatternMatching(FirstOccurrence, BWT, pattern, check_point_array) 74 | if top: 75 | for idx in range(top, bottom + 1): 76 | 77 | to_add = 0 78 | while idx not in partial_suffix_array.keys(): 79 | idx = FirstOccurrence[BWT[idx]] + Count_symbol(check_point_array, idx, BWT, BWT[idx]) 80 | to_add += 1 81 | 82 | positions_list.append(partial_suffix_array[idx] + to_add) 83 | 84 | return sorted(positions_list) 85 | 86 | 87 | if __name__ == "__main__": 88 | ''' Multiple Patterns Matching Implementation (with BWT) 89 | Given: A string Text and a collection of strings Patterns. 90 | Return: All starting positions in Text where a string from Patterns appears as a substring. 91 | ''' 92 | tmp = sys.stdin.read().splitlines() 93 | Text = tmp[0] 94 | pattern_list = [] 95 | for i in range(1, len(tmp)): 96 | pattern_list.append(tmp[i]) 97 | 98 | positions_list = wrapper(Text, pattern_list, C=100) 99 | print(' '.join(str(pos) for pos in positions_list)) 100 | -------------------------------------------------------------------------------- /solutions/BA6K.py: -------------------------------------------------------------------------------- 1 | def ProcessInput(P): 2 | P = P[1:-1] 3 | P = P.split(')(') 4 | for i in range(len(P)): 5 | P[i] = P[i].split(' ') 6 | for j in range(len(P[i])): 7 | P[i][j] = int(P[i][j]) 8 | return P 9 | 10 | 11 | def ChromosomeToCycle(Chromosome): 12 | Nodes = [] 13 | for block in Chromosome: 14 | if block > 0: 15 | Nodes.append(2 * block - 1) 16 | Nodes.append(2 * block) 17 | else: 18 | Nodes.append(-2 * block) 19 | Nodes.append(-2 * block - 1) 20 | return Nodes 21 | 22 | 23 | def ColoredEdges(P): 24 | Edges = [] 25 | for Chromosome in P: 26 | Nodes = ChromosomeToCycle(Chromosome) 27 | for j in range(1, len(Nodes), 2): 28 | if j != len(Nodes) - 1: 29 | Edges.append([Nodes[j], Nodes[j + 1]]) 30 | else: 31 | Edges.append([Nodes[j], Nodes[0]]) 32 | return Edges 33 | 34 | 35 | def TwoBreakOnGenomeGraph(GenomeGraph, i1 , i2 , i3 , i4): 36 | if [i1, i2] in GenomeGraph: 37 | for i in range(len(GenomeGraph)): 38 | if GenomeGraph[i] == [i1, i2]: 39 | GenomeGraph[i] = [i1, i3] 40 | else: 41 | for i in range(len(GenomeGraph)): 42 | if GenomeGraph[i] == [i2, i1]: 43 | GenomeGraph[i] = [i3, i1] 44 | if [i3, i4] in GenomeGraph: 45 | for i in range(len(GenomeGraph)): 46 | if GenomeGraph[i] == [i3, i4]: 47 | GenomeGraph[i] = [i2, i4] 48 | else: 49 | for i in range(len(GenomeGraph)): 50 | if GenomeGraph[i] == [i4, i3]: 51 | GenomeGraph[i] = [i4, i2] 52 | return GenomeGraph 53 | 54 | 55 | def CycleToChromosome(Nodes): 56 | Chromosome = [] 57 | for i in range(0, len(Nodes), 2): 58 | if Nodes[i] < Nodes[i + 1]: 59 | Chromosome.append(Nodes[i + 1] // 2) 60 | else: 61 | Chromosome.append(-Nodes[i] // 2) 62 | return Chromosome 63 | 64 | 65 | def FindNextEdge(current, edges): 66 | if len(edges) == 0: 67 | return -1 68 | idx = 0 69 | while not (current[1] + 1 == edges[idx][0] or current[1] - 1 == edges[idx][0]): 70 | idx += 1 71 | if idx == len(edges): 72 | return -1 73 | return edges[idx] 74 | 75 | 76 | def GraphToGenome(GenomeGraph): 77 | Q = [] 78 | Cycles = [] 79 | idx = 0 80 | while len(GenomeGraph) != 0: 81 | Cycle = [] 82 | current = GenomeGraph[0] 83 | while current != -1: 84 | Cycle += current 85 | GenomeGraph.remove(current) 86 | current = FindNextEdge(current, GenomeGraph) 87 | Cycles.append(Cycle) 88 | for Cycle in Cycles: 89 | Cycle = Cycle[-3:] + Cycle[:-3] 90 | Chromosome = CycleToChromosome(Cycle) 91 | Q.append(Chromosome) 92 | return Q 93 | 94 | 95 | def TwoBreakOnGenome(P, i1 , i2 , i3 , i4): 96 | GenomeGraph = ColoredEdges(P) 97 | GenomeGraph = TwoBreakOnGenomeGraph(GenomeGraph, i1 , i2 , i3 , i4) 98 | Q = GraphToGenome(GenomeGraph) 99 | return Q 100 | 101 | 102 | if __name__ == "__main__": 103 | ''' 104 | Given: The colored edges of a genome graph GenomeGraph, followed by indices i, i', j, and j'. 105 | Return: The colored edges of the genome graph resulting from applying the 2-break operation. 106 | ''' 107 | P = input().rstrip() 108 | P = ProcessInput(P) 109 | i1, i2, i3, i4 = map(int, input().rstrip().split(', ')) 110 | result = TwoBreakOnGenome(P, i1, i2, i3, i4) 111 | for j in range(len(result)): 112 | result[j] = '(' + ' '.join(('+' if i > 0 else '') + str(i) for i in result[j]) + ')' 113 | print(''.join(result)) 114 | -------------------------------------------------------------------------------- /solutions/BA10C.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def Viterbi(x, all_states, transition_matrix, emission_matrix): 5 | init_transition_prob = 1 / len(all_states) 6 | 7 | ## calculate all scores 8 | backtrace = {} 9 | Score_dict = {} 10 | for i in range(len(x)): 11 | backtrace[i] = {} 12 | for current_state in all_states: 13 | if current_state not in Score_dict.keys(): 14 | Score_dict[current_state] = {} 15 | ## if the leftmost column, initialize the recurrence 16 | # (every node in the leftmost column is connected to source) 17 | if i == 0: 18 | # Score[source] is 1 19 | Score_dict[current_state][i] = 1 * init_transition_prob * emission_matrix[current_state][x[i]] 20 | # print(str(i) + ': '+ 'source' + '>>' + current_state + ':\t' + '{:.5f}'.format(init_transition_prob * emission_matrix[current_state][x[i]])) 21 | 22 | # 𝑠𝑘,𝑖 = max𝑎𝑙𝑙 𝑠𝑡𝑎𝑡𝑒𝑠 𝑙{𝑠𝑙,𝑖−1⋅𝑡𝑟𝑎𝑛𝑠𝑖𝑡𝑖𝑜𝑛𝑙,𝑘⋅𝑒𝑚𝑖𝑠𝑠𝑖𝑜𝑛𝑘(𝑥𝑖)} 23 | else: 24 | Score_dict[current_state][i] = -1e6 25 | for state in all_states: 26 | tmp_score = Score_dict[state][i - 1] * transition_matrix[state][current_state] * \ 27 | emission_matrix[current_state][x[i]] 28 | # print(str(i) + ': '+ state + '>>' + current_state + ':\t' + '{:.5f}'.format(transition_matrix[state][current_state] * emission_matrix[current_state][x[i]])) 29 | if tmp_score > Score_dict[current_state][i]: 30 | Score_dict[current_state][i] = tmp_score 31 | backtrace[i][current_state] = state 32 | 33 | ## Backtrace the maximum scoring path 34 | max_score_state = max(Score_dict.keys(), key=lambda state: Score_dict[state][len(x) - 1]) 35 | most_probable_path = max_score_state 36 | 37 | current_state = max_score_state 38 | for i in range(len(x) - 1, 0, -1): 39 | prev_state = backtrace[i][current_state] 40 | most_probable_path = prev_state + most_probable_path 41 | current_state = prev_state 42 | 43 | # print('\t' + '\t'.join([str(i) for i in range(len(x))])) 44 | # to_print = '' 45 | # for state in all_states: 46 | # to_print += state + '\t' 47 | # for i in range(len(x)): 48 | # to_print += '{:.5f}'.format(Score_dict[state][i]) + '\t' 49 | # to_print += '\n' 50 | # print(to_print) 51 | 52 | return most_probable_path 53 | 54 | 55 | if __name__ == "__main__": 56 | ''' 57 | Given: A string x, followed by the alphabet Σ from which x was constructed, followed by the states States, 58 | transition matrix Transition, and emission matrix Emission of an HMM (Σ, States, Transition, Emission). 59 | Return: A path that maximizes the (unconditional) probability Pr(x, π) over all possible paths π. 60 | ''' 61 | 62 | tmp = sys.stdin.read().splitlines() 63 | 64 | x = tmp[0] # the emitted string 65 | alphabet = tmp[2].split() # the alphabet from which x was constructed 66 | states = tmp[4].split() # the states of HMM 67 | 68 | # transition matrix 69 | col_syms = tmp[6].split() 70 | transition_end = 6 + len(states) 71 | 72 | transition_matrix = {} 73 | for i in range(7, transition_end + 1): 74 | current_line = tmp[i].split() 75 | row_sym = current_line[0] 76 | transition_matrix[row_sym] = {} 77 | for j in range(1, len(current_line)): 78 | transition_matrix[row_sym][col_syms[j - 1]] = float(current_line[j]) 79 | 80 | # emission matrix 81 | col_syms = tmp[transition_end + 2].split() 82 | emission_matrix = {} 83 | for i in range(transition_end + 3, len(tmp)): 84 | current_line = tmp[i].split() 85 | row_sym = current_line[0] 86 | emission_matrix[row_sym] = {} 87 | for j in range(1, len(current_line)): 88 | emission_matrix[row_sym][col_syms[j - 1]] = float(current_line[j]) 89 | 90 | print(Viterbi(x, states, transition_matrix, emission_matrix)) -------------------------------------------------------------------------------- /solutions/BA10J.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def SoftDecoding(x, transition_matrix, emission_matrix, alphabet, all_states): 5 | # assume that transitions from the source node and to sink node occur with equal probability 6 | init_transition_prob = 1 / len(all_states) 7 | 8 | ## calculate all forward values 9 | Forward_dict = {} 10 | for i in range(len(x)): 11 | for current_state in all_states: 12 | if current_state not in Forward_dict.keys(): 13 | Forward_dict[current_state] = {} 14 | # initialize the recurrence 15 | # (every node in the leftmost column is connected to source) 16 | if i == 0: 17 | # Forward[source] is 1 18 | Forward_dict[current_state][i] = 1 * init_transition_prob * emission_matrix[current_state][x[i]] 19 | 20 | # forward𝑘,𝑖 = ∑all states 𝑙forward𝑙,𝑖−1⋅Weight𝑖(𝑙,𝑘) 21 | else: 22 | Forward_dict[current_state][i] = 0 23 | for state in all_states: 24 | Forward_dict[current_state][i] += Forward_dict[state][i - 1] * transition_matrix[state][ 25 | current_state] * emission_matrix[current_state][x[i]] 26 | 27 | Pr_x = 0 28 | for state in all_states: 29 | Pr_x += Forward_dict[state][len(x) - 1] 30 | 31 | Backward_dict = {} 32 | for i in range(len(x) - 1, -1, -1): 33 | for current_state in all_states: 34 | if current_state not in Backward_dict.keys(): 35 | Backward_dict[current_state] = {} 36 | 37 | # initialize the recurrence 38 | # (every node in the rightmost column is connected to sink) 39 | if i == len(x) - 1: 40 | Backward_dict[current_state][i] = 1 41 | # backward𝑘,𝑖 = ∑all states 𝑙backward𝑙,𝑖+1⋅Weight𝑖(𝑙,𝑘) 42 | else: 43 | Backward_dict[current_state][i] = 0 44 | for state in all_states: 45 | Backward_dict[current_state][i] += Backward_dict[state][i + 1] * transition_matrix[current_state][ 46 | state] * emission_matrix[state][x[i + 1]] 47 | 48 | cond_prob_matrix = {} 49 | for i in range(len(x)): 50 | for state in all_states: 51 | if state not in cond_prob_matrix.keys(): 52 | cond_prob_matrix[state] = {} 53 | 54 | cond_prob_matrix[state][i] = Forward_dict[state][i] * Backward_dict[state][i] / Pr_x 55 | 56 | return cond_prob_matrix 57 | 58 | 59 | if __name__ == "__main__": 60 | ''' 61 | Given: A string x, followed by the alphabet Σ from which x was constructed, followed by the states States, 62 | transition matrix Transition, and emission matrix Emission of an HMM (Σ, States, Transition, Emission). 63 | Return: The probability Pr(πi = k|x) that the HMM was in state k at step i (for each state k and each step i). 64 | ''' 65 | tmp = sys.stdin.read().splitlines() 66 | 67 | x = tmp[0] 68 | alphabet = tmp[2].split() 69 | all_states = tmp[4].split() 70 | 71 | transition_matrix = {} 72 | emission_matrix = {} 73 | 74 | # initial transition matrix 75 | col_syms = tmp[6].split() 76 | transition_end = 6 + len(all_states) 77 | 78 | for i in range(7, transition_end + 1): 79 | current_line = tmp[i].split() 80 | row_sym = current_line[0] 81 | transition_matrix[row_sym] = {} 82 | for j in range(1, len(current_line)): 83 | transition_matrix[row_sym][col_syms[j - 1]] = float(current_line[j]) 84 | 85 | # emission matrix 86 | col_syms = tmp[transition_end + 2].split() 87 | 88 | for i in range(transition_end + 3, len(tmp)): 89 | current_line = tmp[i].split() 90 | row_sym = current_line[0] 91 | emission_matrix[row_sym] = {} 92 | for j in range(1, len(current_line)): 93 | emission_matrix[row_sym][col_syms[j - 1]] = float(current_line[j]) 94 | 95 | cond_prob_matrix = SoftDecoding(x, transition_matrix, emission_matrix, alphabet, all_states) 96 | 97 | to_print = '\t'.join(all_states) + '\n' 98 | for i in range(len(x)): 99 | for state in all_states: 100 | to_print += str(round(cond_prob_matrix[state][i], 4)).rstrip('0') + '\t' 101 | to_print += '\n' 102 | 103 | print(to_print) 104 | -------------------------------------------------------------------------------- /solutions/BA10H.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def print_matrices(*argv, delim="\t", separator = "--------"): 5 | ''' Function for printing multiple matrices 6 | Prints each matrix (stored as a dictionary) in 7 | tab-delimited format (default). Separates the 8 | matrices with '--------' (default). 9 | ''' 10 | 11 | for idx, matrix in enumerate(argv): 12 | 13 | row_labels = list(matrix.keys()) 14 | col_labels = list(matrix[row_labels[0]].keys()) 15 | if idx == 0: 16 | to_print = delim + delim.join(col_labels) + '\n' 17 | # to_print = delim + delim.join(col_labels) + '\t\n' # fix for required format 18 | else: 19 | to_print = delim + delim.join(col_labels) + '\n' 20 | 21 | for r_label in row_labels: 22 | tmp = [r_label] 23 | for c_label in col_labels: 24 | val = matrix[r_label][c_label] 25 | if val == 0: 26 | val_str = '0' 27 | elif val == int(val): 28 | val_str = '{:.1f}'.format(val) 29 | else: 30 | val_str = '{:.3f}'.format(val).rstrip('0') 31 | tmp.append(val_str) 32 | to_print += delim.join(tmp) 33 | if r_label != row_labels[-1]: 34 | to_print += '\n' 35 | 36 | print(to_print) 37 | if idx != len(argv) - 1: 38 | print(separator) 39 | 40 | return None 41 | 42 | 43 | def HMMParameterEstimation(x, path, alphabet, all_states): 44 | transitions = {} 45 | for i in range(1, len(path)): 46 | current = (path[i - 1], path[i]) 47 | if current in transitions.keys(): 48 | transitions[current] += 1 49 | else: 50 | transitions[current] = 1 51 | 52 | transition_matrix = {} 53 | for state1 in all_states: 54 | 55 | transition_matrix[state1] = {} 56 | 57 | total_transitions = 0 58 | for state2 in all_states: 59 | if (state1, state2) in transitions.keys(): 60 | total_transitions += transitions[(state1, state2)] 61 | 62 | for state2 in all_states: 63 | # assume uniform transition probability 64 | # if no transition from state 1 was observed 65 | if total_transitions == 0: 66 | transition_matrix[state1][state2] = 1 / len(all_states) 67 | else: 68 | if (state1, state2) in transitions.keys(): 69 | transition_matrix[state1][state2] = transitions[(state1, state2)] / total_transitions 70 | else: 71 | transition_matrix[state1][state2] = 0 72 | 73 | emissions = {} 74 | for state, symbol in zip(path, x): 75 | current = (state, symbol) 76 | if current in emissions.keys(): 77 | emissions[current] += 1 78 | else: 79 | emissions[current] = 1 80 | 81 | emission_matrix = {} 82 | for state in all_states: 83 | emission_matrix[state] = {} 84 | 85 | total_emissions = 0 86 | for symbol in alphabet: 87 | if (state, symbol) in emissions.keys(): 88 | total_emissions += emissions[(state, symbol)] 89 | 90 | for symbol in alphabet: 91 | # assume uniform emission probability 92 | # if state was not observed in path 93 | if total_emissions == 0: 94 | emission_matrix[state][symbol] = 1 / len(alphabet) 95 | else: 96 | if (state, symbol) in emissions.keys(): 97 | emission_matrix[state][symbol] = emissions[(state, symbol)] / total_emissions 98 | else: 99 | emission_matrix[state][symbol] = 0 100 | 101 | return transition_matrix, emission_matrix 102 | 103 | 104 | if __name__ == "__main__": 105 | ''' 106 | Given: A sequence of emitted symbols x = x1 . . . xn in an alphabet ∑ and a path π = π1 . . . πn generated by a 107 | k-state HMM with unknown transition and emission probabilities. 108 | Return: A matrix of transition probabilities Transition and a matrix of emission probabilities Emission that 109 | maximize Pr(x,π) over all possible matrices of transition and emission probabilities. 110 | ''' 111 | tmp = sys.stdin.read().splitlines() 112 | 113 | x = tmp[0] 114 | alphabet = tmp[2].split() 115 | path = tmp[4] 116 | all_states = tmp[6].split() 117 | transition_matrix, emission_matrix = HMMParameterEstimation(x, path, alphabet, all_states) 118 | print_matrices(transition_matrix, emission_matrix) -------------------------------------------------------------------------------- /solutions/BA11J.py: -------------------------------------------------------------------------------- 1 | import sys 2 | aa_table = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131, 3 | 'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163} 4 | 5 | 6 | def printScoreMat(Score, prefixMasses, spectral_vector, k): 7 | for t in range(-2, k + 1): 8 | if t == -2: 9 | tmp = [str(x) for x in spectral_vector] 10 | for idx in range(len(tmp)): 11 | s = tmp[idx] 12 | if len(s) == 1: 13 | s = ' ' + s 14 | tmp[idx] = s 15 | tmp = [' '] + tmp 16 | print(' '.join(tmp)) 17 | print(' ') 18 | print(' ') 19 | elif t == -1: 20 | tmp = [str(i) for i in range(len(spectral_vector))] 21 | for idx in range(len(tmp)): 22 | s = tmp[idx] 23 | if len(s) == 1: 24 | s = ' ' + s 25 | tmp[idx] = s 26 | tmp = [' '] + tmp 27 | print(' '.join(tmp)) 28 | else: 29 | for i in prefixMasses: 30 | row = [] 31 | for j in range(-1, len(spectral_vector)): 32 | if j == -1: 33 | row.append((str(i) if i >= 10 else ' ' + str(i)) + ' ') 34 | else: 35 | score = Score[i][j][t] 36 | if score < -1e5: 37 | score = 'XX' 38 | elif len(str(score)) == 1: 39 | score = ' ' + str(score) 40 | row.append(str(score)) 41 | print(' '.join(row)) 42 | print(' ') 43 | return None 44 | 45 | 46 | def SpectralAlignment(peptide, spectral_vector, k): 47 | spectral_vector.insert(0, 0) 48 | 49 | ## Calculate prefix masses 50 | prefixMasses = [0] 51 | for i in range(len(peptide)): 52 | prefix = peptide[:i + 1] 53 | mass = 0 54 | for aa in prefix: 55 | mass += aa_table[aa] 56 | prefixMasses.append(mass) 57 | 58 | ## Create diff array 59 | diff = {} 60 | for i in range(1, len(prefixMasses)): 61 | diff[prefixMasses[i]] = prefixMasses[i] - prefixMasses[i - 1] 62 | 63 | ## Initiliaze scores 64 | Score = {} 65 | for i in prefixMasses: 66 | Score[i] = {} 67 | for j in range(len(spectral_vector)): 68 | Score[i][j] = {} 69 | for t in range(k + 1): 70 | Score[i][j][t] = -float("inf") 71 | Score[0][0][0] = 0 72 | 73 | ## Calculate scores 74 | for i in prefixMasses[1:]: 75 | for j in range(len(spectral_vector)): 76 | for t in range(k + 1): 77 | if (t == 0) and (i - diff[i] >= 0) and (j - diff[i] >= 0): 78 | Score[i][j][t] = spectral_vector[j] + Score[i - diff[i]][j - diff[i]][t] 79 | elif (t > 0) and (i - diff[i] >= 0) and (j - diff[i] >= 0): 80 | Score[i][j][t] = spectral_vector[j] + max(Score[i - diff[i]][j - diff[i]][t], max( 81 | [Score[i - diff[i]][j_star][t - 1] for j_star in range(j)])) 82 | elif (t > 0) and (i - diff[i] >= 0) and (j > 0): 83 | Score[i][j][t] = spectral_vector[j] + max( 84 | [Score[i - diff[i]][j_star][t - 1] for j_star in range(j)]) 85 | 86 | # printScoreMat(Score, prefixMasses, spectral_vector, k) 87 | 88 | ## Find max score layer 89 | max_score = -float("inf") 90 | for t in range(k + 1): 91 | current = Score[prefixMasses[-1]][len(spectral_vector) - 1][t] 92 | if current > max_score: 93 | max_score = current 94 | max_layer = t 95 | 96 | ## Backtrace 97 | layer = max_layer 98 | column = len(spectral_vector) - 1 99 | 100 | result = '' 101 | for i in range(len(peptide), 0, -1): 102 | pre = prefixMasses[i] 103 | if (column - diff[pre] >= 0) and ( 104 | Score[pre][column][layer] == spectral_vector[column] + Score[pre - diff[pre]][column - diff[pre]][ 105 | layer]): 106 | column -= diff[pre] 107 | result = peptide[i - 1] + result 108 | else: 109 | tmp = [Score[pre - diff[pre]][j_star][layer - 1] for j_star in range(column)] 110 | idx = tmp.index((max(tmp))) 111 | modif = column - idx - diff[pre] 112 | if modif > 0: 113 | result = peptide[i - 1] + '(+' + str(modif) + ')' + result 114 | else: 115 | result = peptide[i - 1] + '(' + str(modif) + ')' + result 116 | column = idx 117 | layer -= 1 118 | 119 | return result 120 | 121 | 122 | if __name__ == "__main__": 123 | ''' 124 | Given: A peptide Peptide, a spectral vector Spectrum', and an integer k. 125 | Return: A peptide Peptide' related to Peptide by up to k modifications with maximal score against Spectrum' out of 126 | all possibilities. 127 | ''' 128 | tmp = sys.stdin.read().splitlines() 129 | peptide = tmp[0] 130 | spectral_vector = [int(x) for x in tmp[1].rstrip().split(' ')] 131 | k = int(tmp[2]) 132 | 133 | print(SpectralAlignment(peptide, spectral_vector, k)) 134 | -------------------------------------------------------------------------------- /solutions/BA6D.py: -------------------------------------------------------------------------------- 1 | def ProcessInput(P): 2 | P = P[1:-1] 3 | P = P.split(')(') 4 | for i in range(len(P)): 5 | P[i] = P[i].split(' ') 6 | for j in range(len(P[i])): 7 | P[i][j] = int(P[i][j]) 8 | return P 9 | 10 | 11 | def ChromosomeToCycle(Chromosome): 12 | Nodes = [] 13 | for block in Chromosome: 14 | if block > 0: 15 | Nodes.append(2 * block - 1) 16 | Nodes.append(2 * block) 17 | else: 18 | Nodes.append(-2 * block) 19 | Nodes.append(-2 * block - 1) 20 | return Nodes 21 | 22 | 23 | def ColoredEdges(P): 24 | Edges = [] 25 | for Chromosome in P: 26 | Nodes = ChromosomeToCycle(Chromosome) 27 | for j in range(1, len(Nodes), 2): 28 | if j != len(Nodes) - 1: 29 | Edges.append([Nodes[j], Nodes[j + 1]]) 30 | else: 31 | Edges.append([Nodes[j], Nodes[0]]) 32 | return Edges 33 | 34 | 35 | def FindNextEdge(current, edges): 36 | if len(edges) == 0: 37 | return -1 38 | idx = 0 39 | while not (current[0] in edges[idx] or current[1] in edges[idx]): 40 | idx += 1 41 | if idx == len(edges): 42 | return -1 43 | return edges[idx] 44 | 45 | 46 | def FindCycles(edges): 47 | Cycles = [] 48 | while len(edges) != 0: 49 | start = edges[0] 50 | edges.remove(edges[0]) 51 | Cycle = [start] 52 | current = FindNextEdge(start, edges) 53 | while current != -1: 54 | Cycle.append(current) 55 | edges.remove(current) 56 | current = FindNextEdge(current, edges) 57 | if len(Cycle) > 2: 58 | Cycles.append(Cycle) 59 | return Cycles 60 | 61 | 62 | def TwoBreakOnGenomeGraph(GenomeGraph, i1 , i2 , i3 , i4): 63 | if [i1, i2] in GenomeGraph: 64 | for i in range(len(GenomeGraph)): 65 | if GenomeGraph[i] == [i1, i2]: 66 | GenomeGraph[i] = [i1, i3] 67 | else: 68 | for i in range(len(GenomeGraph)): 69 | if GenomeGraph[i] == [i2, i1]: 70 | GenomeGraph[i] = [i3, i1] 71 | if [i3, i4] in GenomeGraph: 72 | for i in range(len(GenomeGraph)): 73 | if GenomeGraph[i] == [i3, i4]: 74 | GenomeGraph[i] = [i2, i4] 75 | else: 76 | for i in range(len(GenomeGraph)): 77 | if GenomeGraph[i] == [i4, i3]: 78 | GenomeGraph[i] = [i4, i2] 79 | return GenomeGraph 80 | 81 | 82 | def TwoBreakOnGenome(P, i1 , i2 , i3 , i4): 83 | GenomeGraph = ColoredEdges(P) 84 | GenomeGraph = TwoBreakOnGenomeGraph(GenomeGraph, i1, i2, i3, i4) 85 | Q = GraphToGenome(GenomeGraph) 86 | return Q 87 | 88 | 89 | def FindNextEdge2(current, edges): 90 | if len(edges) == 0: 91 | return -1 92 | idx = 0 93 | val = current[1] 94 | if val % 2 == 0: 95 | val -= 1 96 | else: 97 | val += 1 98 | while not val in edges[idx]: 99 | idx += 1 100 | if idx == len(edges): 101 | return -1 102 | if val == edges[idx][1]: 103 | edges[idx].reverse() 104 | return edges[idx] 105 | 106 | 107 | def CycleToChromosome(Nodes): 108 | Chromosome = [] 109 | for i in range(0, len(Nodes), 2): 110 | if Nodes[i] < Nodes[i + 1]: 111 | Chromosome.append(Nodes[i + 1] // 2) 112 | else: 113 | Chromosome.append(-Nodes[i] // 2) 114 | return Chromosome 115 | 116 | 117 | def GraphToGenome(GenomeGraph): 118 | Q = [] 119 | Cycles = [] 120 | idx = 0 121 | while len(GenomeGraph) != 0: 122 | Cycle = [] 123 | current = GenomeGraph[0] 124 | while current != -1: 125 | Cycle += current 126 | GenomeGraph.remove(current) 127 | next_edge = FindNextEdge2(current, GenomeGraph) 128 | current = next_edge 129 | Cycles.append(Cycle) 130 | for Cycle in Cycles: 131 | Cycle = [Cycle[-1]] + Cycle[:-1] 132 | Chromosome = CycleToChromosome(Cycle) 133 | Q.append(Chromosome) 134 | return Q 135 | 136 | 137 | def ShortestRearrangementScenario(P, Q): 138 | result = [P] 139 | RedEdges = ColoredEdges(P) 140 | BlueEdges = ColoredEdges(Q) 141 | BreakpointGraph = BlueEdges + RedEdges 142 | NonTrivialCycles = FindCycles(BreakpointGraph) 143 | while len(NonTrivialCycles) != 0: 144 | Cycle = NonTrivialCycles[0] 145 | for i in range(len(Cycle) - 1): 146 | if Cycle[i][0] in Cycle[i + 1]: 147 | Cycle[i].reverse() 148 | if Cycle[i + 1][1] in Cycle[i]: 149 | Cycle[i+1].reverse() 150 | idx = 0 151 | while not Cycle[idx] in RedEdges: 152 | idx += 1 153 | i1, i2 = Cycle[idx] 154 | if idx + 2 != len(Cycle): 155 | i3, i4 = Cycle[idx + 2] 156 | else: 157 | i3, i4 = Cycle[0] 158 | RedEdges.remove([i1, i2]) 159 | RedEdges.remove([i3, i4]) 160 | RedEdges.append([i1, i4]) 161 | RedEdges.append([i2, i3]) 162 | BreakpointGraph = BlueEdges + RedEdges 163 | NonTrivialCycles = FindCycles(BreakpointGraph) 164 | P = TwoBreakOnGenome(P, i1 , i2 , i4 , i3) 165 | result.append(P) 166 | return result 167 | 168 | if __name__ == "__main__": 169 | ''' 170 | Given: Two genomes with circular chromosomes on the same set of synteny blocks. 171 | Return: The sequence of genomes resulting from applying a shortest sequence of 2-breaks transforming one genome into 172 | the other. 173 | ''' 174 | P = input().rstrip() 175 | P = ProcessInput(P) 176 | Q = input().rstrip() 177 | Q = ProcessInput(Q) 178 | answer = ShortestRearrangementScenario(P, Q) 179 | for result in answer: 180 | for j in range(len(result)): 181 | result[j] = '(' + ' '.join(('+' if i > 0 else '') + str(i) for i in result[j]) + ')' 182 | print(''.join(result)) -------------------------------------------------------------------------------- /solutions/BA7F.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | ALPHABET = ['A', 'C', 'G', 'T'] 4 | 5 | 6 | def HammingDistance(p, q): 7 | mm = [p[i] != q[i] for i in range(len(p))] 8 | return sum(mm) 9 | 10 | 11 | def RepresentsInt(s): 12 | try: 13 | int(s) 14 | return True 15 | except ValueError: 16 | return False 17 | 18 | 19 | def SmallParsimony(adj_list): 20 | ## initialize 21 | Tag = {} 22 | S = {} 23 | 24 | nodes = [item for sublist in adj_list for item in sublist] 25 | nodes = list(set(nodes)) 26 | 27 | for v in nodes: 28 | S[v] = {} 29 | Tag[v] = 0 30 | if not RepresentsInt(v): 31 | Tag[v] = 1 32 | len_dna = len(v) 33 | for pos in range(len_dna): 34 | S[v][pos] = {} 35 | char = v[pos] 36 | for k in ALPHABET: 37 | if char == k: 38 | S[v][pos][k] = 0 39 | else: 40 | S[v][pos][k] = 1e6 41 | 42 | ## calculate scores 43 | while any(x == 0 for x in list(Tag.values())): 44 | zero_nodes = [node for node, tag in Tag.items() if tag == 0] 45 | for zn in zero_nodes: 46 | children = [child for parent, child in adj_list if parent == zn] 47 | if all([Tag[child] == 1 for child in children]): 48 | v = zn 49 | break 50 | Tag[v] = 1 51 | S[v] = {} 52 | for pos in range(len_dna): 53 | S[v][pos] = {} 54 | for k in ALPHABET: 55 | temp = [] 56 | for i, score in S[children[0]][pos].items(): 57 | if i == k: 58 | temp.append(score) 59 | else: 60 | temp.append(score + 1) 61 | score_daughter = min(temp) 62 | 63 | temp = [] 64 | for i, score in S[children[1]][pos].items(): 65 | if i == k: 66 | temp.append(score) 67 | else: 68 | temp.append(score + 1) 69 | score_son = min(temp) 70 | 71 | S[v][pos][k] = score_daughter + score_son 72 | return S 73 | 74 | 75 | def FinalTree(adj_list, score_dict): 76 | nodes = [item for sublist in adj_list for item in sublist] 77 | nodes = list(set(nodes)) 78 | child_nodes = [child for parent, child in adj_list] 79 | 80 | ## Find root 81 | root = nodes[0] 82 | idx = 1 83 | while root in child_nodes: 84 | root = nodes[idx] 85 | idx += 1 86 | 87 | ## Root's label and min parsimony score 88 | label_dict = {} 89 | label_dict[root] = '' 90 | min_pars_score = 0 91 | for pos, scores in score_dict[root].items(): 92 | label_dict[root] += min(scores, key=scores.get) 93 | min_pars_score += min(scores.values()) 94 | 95 | ## Backtrace 96 | Tag = {} 97 | for node in nodes: 98 | if not RepresentsInt(node): 99 | Tag[node] = 1 100 | else: 101 | Tag[node] = 0 102 | 103 | Tag[root] = 1 104 | 105 | while any(x == 0 for x in list(Tag.values())): 106 | 107 | one_nodes = [node for node, tag in Tag.items() if tag == 1] 108 | for node in one_nodes: 109 | children = [child for parent, child in adj_list if parent == node] 110 | if RepresentsInt(node) and all([Tag[child] == 0 for child in children]): 111 | v = node 112 | break 113 | 114 | daughter_label = '' 115 | daughter_scores = score_dict[children[0]] 116 | for pos, daughter_score in daughter_scores.items(): 117 | parent_letter = label_dict[v][pos] 118 | # parent_score = score_dict[v][pos] 119 | # parent_score = parent_score[parent_letter] 120 | min_nucs = [nuc for nuc, val in daughter_score.items() if val == min(daughter_score.values())] 121 | if parent_letter in min_nucs: 122 | daughter_label += parent_letter 123 | else: 124 | daughter_label += min_nucs[0] 125 | 126 | label_dict[children[0]] = daughter_label 127 | Tag[children[0]] = 1 128 | 129 | son_label = '' 130 | son_scores = score_dict[children[1]] 131 | for pos, son_score in son_scores.items(): 132 | parent_letter = label_dict[v][pos] 133 | # parent_score = score_dict[v][pos] 134 | # parent_score = parent_score[parent_letter] 135 | min_nucs = [nuc for nuc, val in son_score.items() if val == min(son_score.values())] 136 | if parent_letter in min_nucs: 137 | son_label += parent_letter 138 | else: 139 | son_label += min_nucs[0] 140 | 141 | label_dict[children[1]] = son_label 142 | Tag[children[1]] = 1 143 | 144 | ## Create final adjacency list 145 | final_adj_list = [] 146 | for edge in adj_list: 147 | if RepresentsInt(edge[0]): 148 | node0 = label_dict[edge[0]] 149 | else: 150 | node0 = edge[0] 151 | if RepresentsInt(edge[1]): 152 | node1 = label_dict[edge[1]] 153 | else: 154 | node1 = edge[1] 155 | final_adj_list.append([node0, node1, HammingDistance(node0, node1)]) 156 | final_adj_list.append([node1, node0, HammingDistance(node0, node1)]) 157 | 158 | return [final_adj_list, min_pars_score] 159 | 160 | 161 | if __name__ == "__main__": 162 | ''' 163 | Given: An integer n followed by an adjacency list for a rooted binary tree with n leaves labeled by DNA strings. 164 | Return: The minimum parsimony score of this tree, followed by the adjacency list of the tree corresponding to 165 | labeling internal nodes by DNA strings in order to minimize the parsimony score of the tree. 166 | ''' 167 | lines = sys.stdin.read().splitlines() 168 | num_leaves = int(lines[0]) 169 | 170 | adj_list = [] 171 | for row in lines[1:]: 172 | temp = row.rstrip().split('->') 173 | adj_list.append(temp) 174 | 175 | score_dict = SmallParsimony(adj_list) 176 | 177 | final_adj_list, min_pars_score = FinalTree(adj_list, score_dict) 178 | 179 | print(min_pars_score) 180 | 181 | for edge in final_adj_list: 182 | print(str(edge[0]) + '->' + str(edge[1]) + ':' + str(edge[2])) -------------------------------------------------------------------------------- /solutions/BA5F.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | PAM250 = {'A': {'A': 2, 'C': -2, 'D': 0, 'E': 0, 'F': -3, 'G': 1, 'H': -1, 'I': -1, 'K': -1, 'L': -2, 'M': -1, 'N': 0, 4 | 'P': 1, 'Q': 0, 'R': -2, 'S': 1, 'T': 1, 'V': 0, 'W': -6, 'Y': -3}, 5 | 'C': {'A': -2, 'C': 12, 'D': -5, 'E': -5, 'F': -4, 'G': -3, 'H': -3, 'I': -2, 'K': -5, 'L': -6, 'M': -5, 6 | 'N': -4, 'P': -3, 'Q': -5, 'R': -4, 'S': 0, 'T': -2, 'V': -2, 'W': -8, 'Y': 0}, 7 | 'D': {'A': 0, 'C': -5, 'D': 4, 'E': 3, 'F': -6, 'G': 1, 'H': 1, 'I': -2, 'K': 0, 'L': -4, 'M': -3, 'N': 2, 8 | 'P': -1, 'Q': 2, 'R': -1, 'S': 0, 'T': 0, 'V': -2, 'W': -7, 'Y': -4}, 9 | 'E': {'A': 0, 'C': -5, 'D': 3, 'E': 4, 'F': -5, 'G': 0, 'H': 1, 'I': -2, 'K': 0, 'L': -3, 'M': -2, 'N': 1, 10 | 'P': -1, 'Q': 2, 'R': -1, 'S': 0, 'T': 0, 'V': -2, 'W': -7, 'Y': -4}, 11 | 'F': {'A': -3, 'C': -4, 'D': -6, 'E': -5, 'F': 9, 'G': -5, 'H': -2, 'I': 1, 'K': -5, 'L': 2, 'M': 0, 'N': -3, 12 | 'P': -5, 'Q': -5, 'R': -4, 'S': -3, 'T': -3, 'V': -1, 'W': 0, 'Y': 7}, 13 | 'G': {'A': 1, 'C': -3, 'D': 1, 'E': 0, 'F': -5, 'G': 5, 'H': -2, 'I': -3, 'K': -2, 'L': -4, 'M': -3, 'N': 0, 14 | 'P': 0, 'Q': -1, 'R': -3, 'S': 1, 'T': 0, 'V': -1, 'W': -7, 'Y': -5}, 15 | 'H': {'A': -1, 'C': -3, 'D': 1, 'E': 1, 'F': -2, 'G': -2, 'H': 6, 'I': -2, 'K': 0, 'L': -2, 'M': -2, 'N': 2, 16 | 'P': 0, 'Q': 3, 'R': 2, 'S': -1, 'T': -1, 'V': -2, 'W': -3, 'Y': 0}, 17 | 'I': {'A': -1, 'C': -2, 'D': -2, 'E': -2, 'F': 1, 'G': -3, 'H': -2, 'I': 5, 'K': -2, 'L': 2, 'M': 2, 'N': -2, 18 | 'P': -2, 'Q': -2, 'R': -2, 'S': -1, 'T': 0, 'V': 4, 'W': -5, 'Y': -1}, 19 | 'K': {'A': -1, 'C': -5, 'D': 0, 'E': 0, 'F': -5, 'G': -2, 'H': 0, 'I': -2, 'K': 5, 'L': -3, 'M': 0, 'N': 1, 20 | 'P': -1, 'Q': 1, 'R': 3, 'S': 0, 'T': 0, 'V': -2, 'W': -3, 'Y': -4}, 21 | 'L': {'A': -2, 'C': -6, 'D': -4, 'E': -3, 'F': 2, 'G': -4, 'H': -2, 'I': 2, 'K': -3, 'L': 6, 'M': 4, 'N': -3, 22 | 'P': -3, 'Q': -2, 'R': -3, 'S': -3, 'T': -2, 'V': 2, 'W': -2, 'Y': -1}, 23 | 'M': {'A': -1, 'C': -5, 'D': -3, 'E': -2, 'F': 0, 'G': -3, 'H': -2, 'I': 2, 'K': 0, 'L': 4, 'M': 6, 'N': -2, 24 | 'P': -2, 'Q': -1, 'R': 0, 'S': -2, 'T': -1, 'V': 2, 'W': -4, 'Y': -2}, 25 | 'N': {'A': 0, 'C': -4, 'D': 2, 'E': 1, 'F': -3, 'G': 0, 'H': 2, 'I': -2, 'K': 1, 'L': -3, 'M': -2, 'N': 2, 26 | 'P': 0, 'Q': 1, 'R': 0, 'S': 1, 'T': 0, 'V': -2, 'W': -4, 'Y': -2}, 27 | 'P': {'A': 1, 'C': -3, 'D': -1, 'E': -1, 'F': -5, 'G': 0, 'H': 0, 'I': -2, 'K': -1, 'L': -3, 'M': -2, 'N': 0, 28 | 'P': 6, 'Q': 0, 'R': 0, 'S': 1, 'T': 0, 'V': -1, 'W': -6, 'Y': -5}, 29 | 'Q': {'A': 0, 'C': -5, 'D': 2, 'E': 2, 'F': -5, 'G': -1, 'H': 3, 'I': -2, 'K': 1, 'L': -2, 'M': -1, 'N': 1, 30 | 'P': 0, 'Q': 4, 'R': 1, 'S': -1, 'T': -1, 'V': -2, 'W': -5, 'Y': -4}, 31 | 'R': {'A': -2, 'C': -4, 'D': -1, 'E': -1, 'F': -4, 'G': -3, 'H': 2, 'I': -2, 'K': 3, 'L': -3, 'M': 0, 'N': 0, 32 | 'P': 0, 'Q': 1, 'R': 6, 'S': 0, 'T': -1, 'V': -2, 'W': 2, 'Y': -4}, 33 | 'S': {'A': 1, 'C': 0, 'D': 0, 'E': 0, 'F': -3, 'G': 1, 'H': -1, 'I': -1, 'K': 0, 'L': -3, 'M': -2, 'N': 1, 34 | 'P': 1, 'Q': -1, 'R': 0, 'S': 2, 'T': 1, 'V': -1, 'W': -2, 'Y': -3}, 35 | 'T': {'A': 1, 'C': -2, 'D': 0, 'E': 0, 'F': -3, 'G': 0, 'H': -1, 'I': 0, 'K': 0, 'L': -2, 'M': -1, 'N': 0, 36 | 'P': 0, 'Q': -1, 'R': -1, 'S': 1, 'T': 3, 'V': 0, 'W': -5, 'Y': -3}, 37 | 'V': {'A': 0, 'C': -2, 'D': -2, 'E': -2, 'F': -1, 'G': -1, 'H': -2, 'I': 4, 'K': -2, 'L': 2, 'M': 2, 'N': -2, 38 | 'P': -1, 'Q': -2, 'R': -2, 'S': -1, 'T': 0, 'V': 4, 'W': -6, 'Y': -2}, 39 | 'W': {'A': -6, 'C': -8, 'D': -7, 'E': -7, 'F': 0, 'G': -7, 'H': -3, 'I': -5, 'K': -3, 'L': -2, 'M': -4, 40 | 'N': -4, 'P': -6, 'Q': -5, 'R': 2, 'S': -2, 'T': -5, 'V': -6, 'W': 17, 'Y': 0}, 41 | 'Y': {'A': -3, 'C': 0, 'D': -4, 'E': -4, 'F': 7, 'G': -5, 'H': 0, 'I': -1, 'K': -4, 'L': -1, 'M': -2, 'N': -2, 42 | 'P': -5, 'Q': -4, 'R': -4, 'S': -3, 'T': -3, 'V': -2, 'W': 0, 'Y': 10}} 43 | 44 | 45 | def local_alignment(str1, str2, indel_penalty=5): 46 | str1 = "-" + str1 47 | str2 = "-" + str2 48 | 49 | score_mat = [[0 for i in range(len(str2))] for j in range(len(str1))] 50 | backtrack_mat = [[None for i in range(len(str2))] for j in range(len(str1))] 51 | 52 | for i in range(1, len(str1)): 53 | for j in range(1, len(str2)): 54 | if str1[i] in PAM250.keys(): 55 | key1 = str1[i] 56 | key2 = str2[j] 57 | else: 58 | key1 = str2[j] 59 | key2 = str1[i] 60 | 61 | score1 = score_mat[i - 1][j - 1] + PAM250[key1][key2] 62 | score2 = score_mat[i - 1][j] - indel_penalty 63 | score3 = score_mat[i][j - 1] - indel_penalty 64 | score_mat[i][j] = max(score1, score2, score3, 0) 65 | if score_mat[i][j] == score1: 66 | backtrack_mat[i][j] = "d" 67 | elif score_mat[i][j] == score2: 68 | backtrack_mat[i][j] = "u" 69 | elif score_mat[i][j] == score3: 70 | backtrack_mat[i][j] = "l" 71 | 72 | max_score = -1 73 | for i in range(len(str1)): 74 | for j in range(len(str2)): 75 | if score_mat[i][j] > max_score: 76 | max_score = score_mat[i][j] 77 | max_i, max_j = i, j 78 | 79 | i = max_i 80 | j = max_j 81 | aligned_1 = "" 82 | aligned_2 = "" 83 | while backtrack_mat[i][j] is not None: 84 | direction = backtrack_mat[i][j] 85 | if direction == "d": 86 | aligned_1 = str1[i] + aligned_1 87 | aligned_2 = str2[j] + aligned_2 88 | i -= 1 89 | j -= 1 90 | elif direction == "u": 91 | aligned_1 = str1[i] + aligned_1 92 | aligned_2 = "-" + aligned_2 93 | i -= 1 94 | else: 95 | aligned_1 = "-" + aligned_1 96 | aligned_2 = str2[j] + aligned_2 97 | j -= 1 98 | 99 | return max_score, aligned_1, aligned_2 100 | 101 | 102 | if __name__ == "__main__": 103 | ''' 104 | Given: Two amino acid strings. 105 | Return: The maximum score of a local alignment of the strings, followed by a local alignment of these strings 106 | achieving the maximum score. Use the PAM250 scoring matrix and indel penalty σ = 5. (If multiple local alignments 107 | achieving the maximum score exist, you may return any one.) 108 | ''' 109 | input_lines = sys.stdin.read().splitlines() 110 | string1 = input_lines[0] 111 | string2 = input_lines[1] 112 | 113 | score, alignment1, alignment2 = local_alignment(string1, string2) 114 | print(score) 115 | print(alignment1) 116 | print(alignment2) 117 | -------------------------------------------------------------------------------- /solutions/BA5L.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from BA5K import middle_edge 3 | 4 | BLOSUM62 = { 5 | ('W', 'F'): 1, ('L', 'R'): -2, ('S', 'P'): -1, ('V', 'T'): 0, 6 | ('Q', 'Q'): 5, ('N', 'A'): -2, ('Z', 'Y'): -2, ('W', 'R'): -3, 7 | ('Q', 'A'): -1, ('S', 'D'): 0, ('H', 'H'): 8, ('S', 'H'): -1, 8 | ('H', 'D'): -1, ('L', 'N'): -3, ('W', 'A'): -3, ('Y', 'M'): -1, 9 | ('G', 'R'): -2, ('Y', 'I'): -1, ('Y', 'E'): -2, ('B', 'Y'): -3, 10 | ('Y', 'A'): -2, ('V', 'D'): -3, ('B', 'S'): 0, ('Y', 'Y'): 7, 11 | ('G', 'N'): 0, ('E', 'C'): -4, ('Y', 'Q'): -1, ('Z', 'Z'): 4, 12 | ('V', 'A'): 0, ('C', 'C'): 9, ('M', 'R'): -1, ('V', 'E'): -2, 13 | ('T', 'N'): 0, ('P', 'P'): 7, ('V', 'I'): 3, ('V', 'S'): -2, 14 | ('Z', 'P'): -1, ('V', 'M'): 1, ('T', 'F'): -2, ('V', 'Q'): -2, 15 | ('K', 'K'): 5, ('P', 'D'): -1, ('I', 'H'): -3, ('I', 'D'): -3, 16 | ('T', 'R'): -1, ('P', 'L'): -3, ('K', 'G'): -2, ('M', 'N'): -2, 17 | ('P', 'H'): -2, ('F', 'Q'): -3, ('Z', 'G'): -2, ('X', 'L'): -1, 18 | ('T', 'M'): -1, ('Z', 'C'): -3, ('X', 'H'): -1, ('D', 'R'): -2, 19 | ('B', 'W'): -4, ('X', 'D'): -1, ('Z', 'K'): 1, ('F', 'A'): -2, 20 | ('Z', 'W'): -3, ('F', 'E'): -3, ('D', 'N'): 1, ('B', 'K'): 0, 21 | ('X', 'X'): -1, ('F', 'I'): 0, ('B', 'G'): -1, ('X', 'T'): 0, 22 | ('F', 'M'): 0, ('B', 'C'): -3, ('Z', 'I'): -3, ('Z', 'V'): -2, 23 | ('S', 'S'): 4, ('L', 'Q'): -2, ('W', 'E'): -3, ('Q', 'R'): 1, 24 | ('N', 'N'): 6, ('W', 'M'): -1, ('Q', 'C'): -3, ('W', 'I'): -3, 25 | ('S', 'C'): -1, ('L', 'A'): -1, ('S', 'G'): 0, ('L', 'E'): -3, 26 | ('W', 'Q'): -2, ('H', 'G'): -2, ('S', 'K'): 0, ('Q', 'N'): 0, 27 | ('N', 'R'): 0, ('H', 'C'): -3, ('Y', 'N'): -2, ('G', 'Q'): -2, 28 | ('Y', 'F'): 3, ('C', 'A'): 0, ('V', 'L'): 1, ('G', 'E'): -2, 29 | ('G', 'A'): 0, ('K', 'R'): 2, ('E', 'D'): 2, ('Y', 'R'): -2, 30 | ('M', 'Q'): 0, ('T', 'I'): -1, ('C', 'D'): -3, ('V', 'F'): -1, 31 | ('T', 'A'): 0, ('T', 'P'): -1, ('B', 'P'): -2, ('T', 'E'): -1, 32 | ('V', 'N'): -3, ('P', 'G'): -2, ('M', 'A'): -1, ('K', 'H'): -1, 33 | ('V', 'R'): -3, ('P', 'C'): -3, ('M', 'E'): -2, ('K', 'L'): -2, 34 | ('V', 'V'): 4, ('M', 'I'): 1, ('T', 'Q'): -1, ('I', 'G'): -4, 35 | ('P', 'K'): -1, ('M', 'M'): 5, ('K', 'D'): -1, ('I', 'C'): -1, 36 | ('Z', 'D'): 1, ('F', 'R'): -3, ('X', 'K'): -1, ('Q', 'D'): 0, 37 | ('X', 'G'): -1, ('Z', 'L'): -3, ('X', 'C'): -2, ('Z', 'H'): 0, 38 | ('B', 'L'): -4, ('B', 'H'): 0, ('F', 'F'): 6, ('X', 'W'): -2, 39 | ('B', 'D'): 4, ('D', 'A'): -2, ('S', 'L'): -2, ('X', 'S'): 0, 40 | ('F', 'N'): -3, ('S', 'R'): -1, ('W', 'D'): -4, ('V', 'Y'): -1, 41 | ('W', 'L'): -2, ('H', 'R'): 0, ('W', 'H'): -2, ('H', 'N'): 1, 42 | ('W', 'T'): -2, ('T', 'T'): 5, ('S', 'F'): -2, ('W', 'P'): -4, 43 | ('L', 'D'): -4, ('B', 'I'): -3, ('L', 'H'): -3, ('S', 'N'): 1, 44 | ('B', 'T'): -1, ('L', 'L'): 4, ('Y', 'K'): -2, ('E', 'Q'): 2, 45 | ('Y', 'G'): -3, ('Z', 'S'): 0, ('Y', 'C'): -2, ('G', 'D'): -1, 46 | ('B', 'V'): -3, ('E', 'A'): -1, ('Y', 'W'): 2, ('E', 'E'): 5, 47 | ('Y', 'S'): -2, ('C', 'N'): -3, ('V', 'C'): -1, ('T', 'H'): -2, 48 | ('P', 'R'): -2, ('V', 'G'): -3, ('T', 'L'): -1, ('V', 'K'): -2, 49 | ('K', 'Q'): 1, ('R', 'A'): -1, ('I', 'R'): -3, ('T', 'D'): -1, 50 | ('P', 'F'): -4, ('I', 'N'): -3, ('K', 'I'): -3, ('M', 'D'): -3, 51 | ('V', 'W'): -3, ('W', 'W'): 11, ('M', 'H'): -2, ('P', 'N'): -2, 52 | ('K', 'A'): -1, ('M', 'L'): 2, ('K', 'E'): 1, ('Z', 'E'): 4, 53 | ('X', 'N'): -1, ('Z', 'A'): -1, ('Z', 'M'): -1, ('X', 'F'): -1, 54 | ('K', 'C'): -3, ('B', 'Q'): 0, ('X', 'B'): -1, ('B', 'M'): -3, 55 | ('F', 'C'): -2, ('Z', 'Q'): 3, ('X', 'Z'): -1, ('F', 'G'): -3, 56 | ('B', 'E'): 1, ('X', 'V'): -1, ('F', 'K'): -3, ('B', 'A'): -2, 57 | ('X', 'R'): -1, ('D', 'D'): 6, ('W', 'G'): -2, ('Z', 'F'): -3, 58 | ('S', 'Q'): 0, ('W', 'C'): -2, ('W', 'K'): -3, ('H', 'Q'): 0, 59 | ('L', 'C'): -1, ('W', 'N'): -4, ('S', 'A'): 1, ('L', 'G'): -4, 60 | ('W', 'S'): -3, ('S', 'E'): 0, ('H', 'E'): 0, ('S', 'I'): -2, 61 | ('H', 'A'): -2, ('S', 'M'): -1, ('Y', 'L'): -1, ('Y', 'H'): 2, 62 | ('Y', 'D'): -3, ('E', 'R'): 0, ('X', 'P'): -2, ('G', 'G'): 6, 63 | ('G', 'C'): -3, ('E', 'N'): 0, ('Y', 'T'): -2, ('Y', 'P'): -3, 64 | ('T', 'K'): -1, ('A', 'A'): 4, ('P', 'Q'): -1, ('T', 'C'): -1, 65 | ('V', 'H'): -3, ('T', 'G'): -2, ('I', 'Q'): -3, ('Z', 'T'): -1, 66 | ('C', 'R'): -3, ('V', 'P'): -2, ('P', 'E'): -1, ('M', 'C'): -1, 67 | ('K', 'N'): 0, ('I', 'I'): 4, ('P', 'A'): -1, ('M', 'G'): -3, 68 | ('T', 'S'): 1, ('I', 'E'): -3, ('P', 'M'): -2, ('M', 'K'): -1, 69 | ('I', 'A'): -1, ('P', 'I'): -3, ('R', 'R'): 5, ('X', 'M'): -1, 70 | ('L', 'I'): 2, ('X', 'I'): -1, ('Z', 'B'): 1, ('X', 'E'): -1, 71 | ('Z', 'N'): 0, ('X', 'A'): 0, ('B', 'R'): -1, ('B', 'N'): 3, 72 | ('F', 'D'): -3, ('X', 'Y'): -1, ('Z', 'R'): 0, ('F', 'H'): -1, 73 | ('B', 'F'): -3, ('F', 'L'): 0, ('X', 'Q'): -1, ('B', 'B'): 4 74 | } 75 | 76 | 77 | def alignment_score(str1, str2, indel_penalty=5): 78 | score = 0 79 | for i in range(len(str1)): 80 | if str1[i] == '-' or str2[i] == '-': 81 | score -= indel_penalty 82 | else: 83 | if (str1[i], str2[i]) in BLOSUM62: 84 | key = (str1[i], str2[i]) 85 | else: 86 | key = (str2[i], str1[i]) 87 | score += BLOSUM62[key] 88 | return score 89 | 90 | 91 | def linear_space_alignment(str1, str2, top=0, bottom=None, left=0, right=None): 92 | if bottom is None: 93 | bottom = len(str1) 94 | if right is None: 95 | right = len(str2) 96 | 97 | if left == right: 98 | return "V" * (bottom - top) 99 | if top == bottom: 100 | return "H" * (right - left) 101 | 102 | mid_edge, mid_from, mid_to = middle_edge(str1, str2, top, bottom, left, right) 103 | # Left 104 | midNode, middle = mid_from 105 | pathL = linear_space_alignment(str1, str2, top, midNode, left, middle) 106 | # Right 107 | midNode, middle = mid_to 108 | pathR = linear_space_alignment(str1, str2, midNode, bottom, middle, right) 109 | return pathL + mid_edge + pathR 110 | 111 | 112 | def backtrack_path(path, str1, str2): 113 | aligned1 = '' 114 | aligned2 = '' 115 | i = 0 116 | j = 0 117 | for arrow in path: 118 | if arrow == "D": 119 | aligned1 += str1[i] 120 | aligned2 += str2[j] 121 | i += 1 122 | j += 1 123 | elif arrow == "V": 124 | aligned1 += str1[i] 125 | aligned2 += '-' 126 | i += 1 127 | else: 128 | aligned1 += '-' 129 | aligned2 += str2[j] 130 | j += 1 131 | return aligned1, aligned2 132 | 133 | 134 | if __name__ == "__main__": 135 | ''' 136 | Given: Two long amino acid strings (of length approximately 10,000). 137 | Return: The maximum alignment score of these strings, followed by an alignment achieving this maximum score. Use the BLOSUM62 scoring matrix and indel penalty σ = 5. 138 | ''' 139 | input_lines = sys.stdin.read().splitlines() 140 | string1 = input_lines[0] 141 | string2 = input_lines[1] 142 | 143 | path = linear_space_alignment(string1, string2) 144 | 145 | alignment1, alignment2 = backtrack_path(path, string1, string2) 146 | 147 | print(alignment_score(alignment1, alignment2)) 148 | print(alignment1) 149 | print(alignment2) 150 | --------------------------------------------------------------------------------