├── .idea
├── .gitignore
├── vcs.xml
├── inspectionProfiles
│ └── profiles_settings.xml
├── misc.xml
├── Bioinformatics_Textbook_Track.iml
└── modules.xml
└── solutions
├── BA1G.py
├── BA9C.py
├── BA9I.py
├── BA3A.py
├── BA9R.py
├── BA1A.py
├── BA1L.py
├── BA1C.py
├── BA6B.py
├── BA1M.py
├── BA9G.py
├── BA3B.py
├── BA1D.py
├── BA3L.py
├── BA3E.py
├── BA4F.py
├── BA1B.py
├── BA1H.py
├── BA4K.py
├── BA3C.py
├── BA3H.py
├── BA9H.py
├── BA3K.py
├── BA4D.py
├── BA5A.py
├── BA4A.py
├── BA6G.py
├── BA8B.py
├── BA1F.py
├── BA3D.py
├── BA6F.py
├── BA11C.py
├── BA1K.py
├── BA2H.py
├── BA3I.py
├── BA4J.py
├── BA1N.py
├── BA6A.py
├── BA6H.py
├── BA9Q.py
├── BA4C.py
├── BA4L.py
├── BA4B.py
├── BA9K.py
├── BA5G.py
├── BA2C.py
├── BA7B.py
├── BA4H.py
├── BA11A.py
├── BA11D.py
├── BA1E.py
├── BA2A.py
├── BA2B.py
├── BA10A.py
├── BA9J.py
├── BA6J.py
├── BA5C.py
├── BA10B.py
├── BA5B.py
├── BA6I.py
├── BA6E.py
├── BA9D.py
├── BA3G.py
├── BA1I.py
├── BA11H.py
├── BA11I.py
├── BA6C.py
├── BA9F.py
├── BA8A.py
├── BA9B.py
├── BA9E.py
├── BA9A.py
├── BA3J.py
├── BA1J.py
├── BA11F.py
├── BA4G.py
├── BA4M.py
├── BA5N.py
├── BA8C.py
├── BA4I.py
├── BA9L.py
├── BA11E.py
├── BA3F.py
├── BA3M.py
├── BA4E.py
├── BA7A.py
├── BA9M.py
├── BA5H.py
├── BA2D.py
├── BA5I.py
├── BA10I.py
├── BA11B.py
├── BA2E.py
├── BA8E.py
├── BA11G.py
├── BA7C.py
├── BA10D.py
├── BA2F.py
├── BA8D.py
├── BA7E.py
├── BA9P.py
├── BA5D.py
├── BA9O.py
├── BA5M.py
├── BA2G.py
├── BA7D.py
├── BA9N.py
├── BA6K.py
├── BA10C.py
├── BA10J.py
├── BA10H.py
├── BA11J.py
├── BA6D.py
├── BA7F.py
├── BA5F.py
└── BA5L.py
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/Bioinformatics_Textbook_Track.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/solutions/BA1G.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def hamming_dist(string1, string2):
    """Count the positions at which the two strings differ.

    Only positions present in both strings are compared (the longer
    string's tail is ignored).
    """
    mismatches = 0
    for left, right in zip(string1, string2):
        if left != right:
            mismatches += 1
    return mismatches
6 |
7 |
if __name__ == "__main__":
    # Given: Two DNA strings (first two lines of stdin).
    # Return: An integer value representing the Hamming distance.
    lines = sys.stdin.read().splitlines()
    print(hamming_dist(lines[0], lines[1]))
18 |
--------------------------------------------------------------------------------
/solutions/BA9C.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from Tree_Trie_classes import Tree
3 |
4 |
if __name__ == "__main__":
    # Given: A string Text (first line of stdin).
    # Return: The strings labeling the edges of SuffixTree(Text), one per line.
    Text = sys.stdin.read().splitlines()[0]

    suffix_tree = Tree()
    suffix_tree.PopulateSuffixTree(Text)

    print("\n".join(suffix_tree.edge_labels(Text)))
--------------------------------------------------------------------------------
/solutions/BA9I.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def BurrowsWheelerTransform(Text):
    """Return the Burrows-Wheeler transform of Text.

    All cyclic rotations of Text are sorted lexicographically and the
    last character of each rotation is concatenated.
    """
    size = len(Text)
    # Every rotation is a length-`size` window of Text written twice.
    doubled = Text + Text
    ordered = sorted(doubled[start:start + size] for start in range(size))
    return ''.join(rotation[-1] for rotation in ordered)
9 |
10 |
if __name__ == "__main__":
    # Given: A string Text (first line of stdin).
    # Return: BWT(Text).
    Text = sys.stdin.read().splitlines()[0]
    print(BurrowsWheelerTransform(Text))
20 |
--------------------------------------------------------------------------------
/solutions/BA3A.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def composition(text, k):
    """Yield every length-k substring of text, left to right."""
    start = 0
    last_start = len(text) - k
    while start <= last_start:
        yield text[start:start + k]
        start += 1
7 |
8 |
if __name__ == "__main__":
    # Given: An integer k (line 1) and a string Text (line 2).
    # Return: Composition_k(Text), one k-mer per line.
    lines = sys.stdin.read().splitlines()
    k = int(lines[0])
    Text = lines[1]

    for piece in composition(Text, k):
        print(piece)
20 |
--------------------------------------------------------------------------------
/solutions/BA9R.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from Tree_Trie_classes import Tree
3 |
4 |
if __name__ == "__main__":
    # Given: A string Text, SuffixArray(Text), and LCP(Text).
    # Return: The strings labeling the edges of SuffixTree(Text), one per line.
    # NOTE: only the first input line (Text) is read; the suffix array and
    # LCP lines are not used by this script.
    Text = sys.stdin.read().splitlines()[0]

    suffix_tree = Tree()
    suffix_tree.PopulateSuffixTree(Text)

    print("\n".join(suffix_tree.edge_labels(Text)))
--------------------------------------------------------------------------------
/solutions/BA1A.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def count_pattern(text, pattern):
    """Count occurrences of pattern in text, overlapping matches included."""
    width = len(pattern)
    windows = (text[start:start + width] for start in range(len(text) - width + 1))
    return sum(1 for window in windows if window == pattern)
11 |
12 |
if __name__ == "__main__":
    # Given: DNA strings Text (line 1) and Pattern (line 2).
    # Return: Count(Text, Pattern).
    lines = sys.stdin.read().splitlines()
    Text, Pattern = lines[0], lines[1]
    print(count_pattern(Text, Pattern))
--------------------------------------------------------------------------------
/solutions/BA1L.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def PatternToNumber(Pattern):
    """Interpret Pattern as a base-4 number with A=0, C=1, G=2, T=3.

    Raises KeyError for any symbol other than A, C, G or T.
    """
    digit_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    value = 0
    # Horner's scheme: shift the accumulated value one base-4 digit left,
    # then add the next symbol's digit.
    for symbol in Pattern:
        value = value * 4 + digit_of[symbol]
    return value
12 |
13 |
if __name__ == "__main__":
    # Given: A DNA string Pattern (first line of stdin).
    # Return: PatternToNumber(Pattern).
    Pattern = sys.stdin.read().splitlines()[0]
    print(PatternToNumber(Pattern))
--------------------------------------------------------------------------------
/solutions/BA1C.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def rev_comp(string):
    """Return the reverse complement of a DNA sequence.

    :param string: DNA sequence to reverse-complement (string)
    :return: the reverse complement (string); symbols other than
             A, C, G, T are kept as they are
    """
    pairing = {"A": "T", "C": "G", "G": "C", "T": "A"}
    return "".join(pairing.get(base, base) for base in reversed(string))
11 |
12 |
if __name__ == "__main__":
    # Given: A DNA string Pattern (first line of stdin).
    # Return: The reverse complement of Pattern.
    first_line = sys.stdin.read().splitlines()[0]
    print(rev_comp(first_line))
20 |
--------------------------------------------------------------------------------
/solutions/BA6B.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def number_of_breakpoints(P):
    """Count the breakpoints of a signed permutation.

    The permutation is framed as (0, p1, ..., pn, n + 1); a breakpoint is
    any consecutive pair (a, b) in that framed sequence with b != a + 1.

    :param P: signed permutation as a sequence of non-zero integers
    :return: number of breakpoints (int)
    """
    # The closing sentinel must be n + 1 (not max(P) + 1, which is wrong
    # when the largest block is negative), and the final pair
    # (pn, n + 1) must be examined as well.
    extended = [0] + list(P) + [len(P) + 1]
    return sum(1 for prev, cur in zip(extended, extended[1:]) if cur != prev + 1)
12 |
13 |
if __name__ == "__main__":
    # Given: A signed permutation P, e.g. "(+3 +4 -1 ...)".
    # Return: The number of breakpoints in P.
    raw = sys.stdin.readline().strip()
    raw = raw.replace("(", "").replace(")", "")
    P = list(map(int, raw.split()))

    print(number_of_breakpoints(P))
24 |
--------------------------------------------------------------------------------
/solutions/BA1M.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def NumberToPattern(Number, k):
    """Convert Number to its k-symbol base-4 DNA string (A=0, C=1, G=2, T=3).

    Digits are extracted from the most significant position down.
    """
    alphabet = "ACGT"
    letters = []
    for exponent in reversed(range(k)):
        digit, Number = divmod(Number, 4 ** exponent)
        letters.append(alphabet[digit])
    return "".join(letters)
12 |
13 |
if __name__ == "__main__":
    # Given: Integers index (line 1) and k (line 2).
    # Return: NumberToPattern(index, k).
    lines = sys.stdin.read().splitlines()
    index = int(lines[0])
    k = int(lines[1])

    print(NumberToPattern(index, k))
--------------------------------------------------------------------------------
/solutions/BA9G.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def SuffixArray(Text):
    """Return the start positions of Text's suffixes in lexicographic order."""
    return sorted(range(len(Text)), key=lambda start: Text[start:])
14 |
15 |
if __name__ == "__main__":
    # Input: A string Text (all of stdin, trailing whitespace removed).
    # Output: SuffixArray(Text), comma-separated.
    Text = sys.stdin.read().rstrip()
    positions = SuffixArray(Text)

    print(', '.join(map(str, positions)))
--------------------------------------------------------------------------------
/solutions/BA3B.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def reconstruct_string(pattern):
    """Spell the string from a genome path of k-mers.

    The result is the first k-mer followed by the last symbol of every
    subsequent k-mer.  Raises IndexError when `pattern` is empty.
    """
    tail = "".join(kmer[-1] for kmer in pattern[1:])
    return pattern[0] + tail
9 |
10 |
if __name__ == "__main__":
    # Given: A sequence of k-mers Pattern_1, ..., Pattern_n (one per line) such
    #        that the last k - 1 symbols of Pattern_i equal the first k - 1
    #        symbols of Pattern_{i+1} for i from 1 to n - 1.
    # Return: A string Text of length k + n - 1 whose i-th k-mer is Pattern_i.
    kmers = sys.stdin.read().splitlines()

    print(reconstruct_string(kmers))
20 |
--------------------------------------------------------------------------------
/solutions/BA1D.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def positions_pattern(text, pattern):
    """List the 0-based start positions where pattern occurs in text.

    Overlapping occurrences are all reported, in increasing order.
    """
    width = len(pattern)
    return [
        start
        for start in range(len(text) - width + 1)
        if text[start:start + width] == pattern
    ]
11 |
12 |
if __name__ == "__main__":
    # Given: Strings Pattern (line 1) and Genome (line 2).
    # Return: All starting positions in Genome where Pattern appears as a
    #         substring, using 0-based indexing.
    lines = sys.stdin.read().splitlines()
    Pattern, Genome = lines[0], lines[1]

    print(" ".join(map(str, positions_pattern(Genome, Pattern))))
23 |
--------------------------------------------------------------------------------
/solutions/BA3L.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA3J import StringSpelledByGappedPatterns
3 |
4 |
if __name__ == "__main__":
    # Given: A sequence of (k, d)-mers (a1|b1), ..., (an|bn) such that
    #        Suffix(ai|bi) = Prefix(ai+1|bi+1) for all i from 1 to n - 1.
    # Return: A string Text where the i-th k-mer in Text is equal to
    #         Suffix(ai|bi) for all i from 1 to n, if such a string exists.
    lines = sys.stdin.read().splitlines()
    k, d = [int(token) for token in lines[0].split()]
    # Each remaining line is "a|b"; split it into its two halves.
    Gapped_Patterns = [line.split("|") for line in lines[1:]]

    print(StringSpelledByGappedPatterns(Gapped_Patterns, k - 1, d))
19 |
--------------------------------------------------------------------------------
/solutions/BA3E.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def deBruijn_graph_kmers(patterns):
    """Build the de Bruijn graph of a collection of k-mers.

    Each k-mer contributes one edge from its prefix (all but the last
    symbol) to its suffix (all but the first symbol).  Repeated k-mers
    produce repeated entries in the successor list.

    :return: dict mapping prefix -> list of suffixes, in input order
    """
    graph = {}
    for kmer in patterns:
        graph.setdefault(kmer[:-1], []).append(kmer[1:])
    return graph
12 |
13 |
if __name__ == "__main__":
    # Given: A collection of k-mers Patterns (one per line).
    # Return: The de Bruijn graph DeBruijn(Patterns) as an adjacency list.
    Patterns = sys.stdin.read().splitlines()

    graph = deBruijn_graph_kmers(Patterns)
    for node, successors in graph.items():
        print(node + ' -> ' + ",".join(successors))
24 |
--------------------------------------------------------------------------------
/solutions/BA4F.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA4C import cyclospectrum
3 |
4 |
def score(peptide, spectrum):
    """Score a peptide against an experimental spectrum.

    The score is the number of masses shared between the peptide's
    cyclospectrum and `spectrum`, counted with multiplicity.
    """
    theoretical = cyclospectrum(peptide)
    return sum(
        min(theoretical.count(mass), spectrum.count(mass))
        for mass in set(theoretical + spectrum)
    )
12 |
13 |
if __name__ == "__main__":
    # Given: An amino acid string Peptide (line 1) and a collection of
    #        integers Spectrum (line 2, whitespace-separated).
    # Return: The score of Peptide against Spectrum, Score(Peptide, Spectrum).
    lines = sys.stdin.read().splitlines()
    Peptide = lines[0]
    Spectrum = list(map(int, lines[1].split()))

    print(score(Peptide, Spectrum))
24 |
--------------------------------------------------------------------------------
/solutions/BA1B.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def most_freq_kmers(text, k):
    """Return every k-mer that occurs most often in text.

    K-mers are returned in order of first appearance.  Raises ValueError
    when text contains no k-mer at all (len(text) < k).
    """
    tally = {}
    for start in range(len(text) - k + 1):
        window = text[start:start + k]
        tally[window] = tally.get(window, 0) + 1

    top = max(tally.values())
    return [window for window in tally if tally[window] == top]
15 |
16 |
if __name__ == "__main__":
    # Given: A DNA string Text (line 1) and an integer k (line 2).
    # Return: All most frequent k-mers in Text (in any order).
    lines = sys.stdin.read().splitlines()
    Text = lines[0]
    k = int(lines[1])
    print(" ".join(most_freq_kmers(Text, k)))
26 |
--------------------------------------------------------------------------------
/solutions/BA1H.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA1G import hamming_dist
3 |
def positions_approx_pattern(text, pattern, d):
    """List start positions where pattern matches text with at most d mismatches.

    Positions are 0-based and reported in increasing order.
    """
    width = len(pattern)
    return [
        start
        for start in range(len(text) - width + 1)
        if hamming_dist(text[start:start + width], pattern) <= d
    ]
11 |
12 |
if __name__ == "__main__":
    # Given: Strings Pattern (line 1) and Text (line 2) along with an
    #        integer d (line 3).
    # Return: All starting positions where Pattern appears as a substring of
    #         Text with at most d mismatches.
    lines = sys.stdin.read().splitlines()
    Pattern = lines[0]
    Genome = lines[1]
    d = int(lines[2])

    print(" ".join(map(str, positions_approx_pattern(Genome, Pattern, d))))
24 |
--------------------------------------------------------------------------------
/solutions/BA4K.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA4J import LinearSpectrum
3 |
4 |
def linear_score(peptide, spectrum):
    """Score a peptide against a spectrum using its linear spectrum.

    The score is the number of masses shared between LinearSpectrum(peptide)
    and `spectrum`, counted with multiplicity.
    """
    theoretical = LinearSpectrum(peptide)
    return sum(
        min(theoretical.count(mass), spectrum.count(mass))
        for mass in set(theoretical + spectrum)
    )
12 |
13 |
if __name__ == "__main__":
    # Given: An amino acid string Peptide (line 1) and a collection of
    #        integers Spectrum (line 2, whitespace-separated).
    # Return: The linear score of Peptide against Spectrum,
    #         LinearScore(Peptide, Spectrum).
    lines = sys.stdin.read().splitlines()
    Peptide = lines[0]
    Spectrum = list(map(int, lines[1].split()))

    print(linear_score(Peptide, Spectrum))
24 |
--------------------------------------------------------------------------------
/solutions/BA3C.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def overlap_graph(patterns):
    """Build the overlap graph of a collection of k-mers.

    There is a directed edge (a, b) whenever the suffix of a (all but its
    first symbol) equals the prefix of b (all but its last symbol).

    :param patterns: list of k-mers
    :return: list of (source, target) tuples; each ordered pair of list
             positions contributes at most one edge
    """
    edges = []
    for i, first in enumerate(patterns):
        # A k-mer that overlaps itself (e.g. "AAAA") yields exactly one
        # self-loop; it must not be reported twice, and the last pattern
        # must be checked too.
        if first[1:] == first[:-1]:
            edges.append((first, first))
        for second in patterns[i + 1:]:
            if first[1:] == second[:-1]:
                edges.append((first, second))
            if second[1:] == first[:-1]:
                edges.append((second, first))
    return edges
13 |
14 |
if __name__ == "__main__":
    # Given: A collection Patterns of k-mers (one per line).
    # Return: The overlap graph Overlap(Patterns) as an adjacency list.
    Patterns = sys.stdin.read().splitlines()

    for source_target in overlap_graph(Patterns):
        print(" -> ".join(source_target))
25 |
--------------------------------------------------------------------------------
/solutions/BA3H.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA3E import deBruijn_graph_kmers
3 | from BA3G import Eulerian_path
4 |
5 |
def string_reconstruction(patterns):
    """Reconstruct a string from its k-mer composition.

    Builds the de Bruijn graph of the k-mers, takes the path returned by
    Eulerian_path, and spells it: the first node minus its last symbol,
    followed by the last symbol of every node on the path.
    """
    walk = Eulerian_path(deBruijn_graph_kmers(patterns))
    return walk[0][:-1] + "".join(node[-1] for node in walk)
13 |
14 |
if __name__ == "__main__":
    # Given: An integer k (line 1) followed by a list of k-mers Patterns.
    # Return: A string Text with k-mer composition equal to Patterns.
    #         (If multiple answers exist, any one may be returned.)
    lines = sys.stdin.read().splitlines()
    k = int(lines[0])  # parsed for validation only; not used below
    Patterns = lines[1:]

    print(string_reconstruction(Patterns))
25 |
--------------------------------------------------------------------------------
/solutions/BA9H.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def pattern_matching(Text, Patterns):
    """Find every start position in Text where any of Patterns occurs.

    This is a direct window-by-window scan.  Positions for all patterns
    are merged into one ascending list; a position matched by several
    patterns appears once per matching pattern.
    """
    hits = [
        start
        for pattern in Patterns
        for start in range(len(Text) - len(pattern) + 1)
        if Text[start:start + len(pattern)] == pattern
    ]
    hits.sort()
    return hits
13 |
14 |
if __name__ == "__main__":
    # Given: A string Text (line 1) and a collection of strings Patterns
    #        (remaining lines).
    # Return: All starting positions in Text where a string from Patterns
    #         appears as a substring.
    lines = sys.stdin.read().splitlines()
    Text, Patterns = lines[0], lines[1:]

    print(" ".join(map(str, pattern_matching(Text, Patterns))))
25 |
--------------------------------------------------------------------------------
/solutions/BA3K.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA3E import deBruijn_graph_kmers
3 | from BA3M import maximal_non_branching_paths
4 |
5 |
def contig_generation(kmers):
    """Generate contigs from a collection of k-mers.

    Each path returned by maximal_non_branching_paths over the de Bruijn
    graph of the k-mers is spelled as its first node followed by the last
    symbol of every later node.
    """
    graph = deBruijn_graph_kmers(kmers)
    return [
        path[0] + "".join(node[-1] for node in path[1:])
        for path in maximal_non_branching_paths(graph)
    ]
16 |
17 |
if __name__ == "__main__":
    # Given: A collection of k-mers Patterns (one per line).
    # Return: All contigs in DeBruijn(Patterns), sorted, space-separated.
    Patterns = sys.stdin.read().splitlines()
    ordered_contigs = sorted(contig_generation(Patterns))
    print(" ".join(ordered_contigs))
27 |
--------------------------------------------------------------------------------
/solutions/BA4D.py:
--------------------------------------------------------------------------------
1 | import sys
MASSES = [57, 71, 87, 97, 99, 101, 103, 113, 114, 115, 128, 129, 131, 137, 147, 156, 163, 186]


def CountPeptides(Mass):
    """Count the linear peptides whose total integer mass equals Mass.

    Dynamic programme over masses: a peptide of mass m is either a single
    residue of mass m or a shorter peptide extended by one residue.
    No peptide is lighter than 57, the smallest residue mass.
    """
    ways = dict.fromkeys(range(57), 0)

    for mass in range(57, Mass + 1):
        extensions = sum(ways[mass - residue] for residue in MASSES if residue <= mass)
        ways[mass] = MASSES.count(mass) + extensions

    return ways[Mass]
18 |
19 |
if __name__ == "__main__":
    # Given: An integer m (first line of stdin).
    # Return: The number of linear peptides having integer mass m.
    first_line = sys.stdin.readline().strip()
    m = int(first_line)

    print(CountPeptides(m))
28 |
--------------------------------------------------------------------------------
/solutions/BA5A.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def dp_change(money, coins):
    """Return the minimum number of coins from `coins` that sum to `money`.

    Bottom-up dynamic programme over every amount from 1 to money.
    `money + 1` serves as the "not reachable" marker, and it is also the
    value returned when `money` cannot be formed from `coins`.
    """
    unreachable = money + 1
    fewest = [0]
    for amount in range(1, money + 1):
        fewest.append(unreachable)
        options = [fewest[amount - coin] + 1 for coin in coins if coin <= amount]
        fewest[amount] = min(options + [unreachable])
    return fewest[money]
14 |
15 |
if __name__ == "__main__":
    # Given: An integer money (line 1) and an array Coins of positive
    #        integers (line 2, comma-separated).
    # Return: The minimum number of coins with denominations Coins that
    #         changes money.
    lines = sys.stdin.read().splitlines()
    money = int(lines[0])
    Coins = list(map(int, lines[1].split(",")))

    print(dp_change(money, Coins))
26 |
--------------------------------------------------------------------------------
/solutions/BA4A.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
# Standard genetic code, built by enumerating codons in U, C, A, G order;
# "*" marks a stop codon.
bases = "UCAG"
codons = [a + b + c for a in bases for b in bases for c in bases]
amino_acids = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
CODON_TABLE = dict(zip(codons, amino_acids))


def translate_rna(rna):
    """Translate an RNA string into an amino acid string.

    Translation starts at position 0, reads consecutive codons, and stops
    at the first stop codon or at the end of the string.  A trailing
    fragment shorter than three bases is ignored, and an empty string
    translates to an empty peptide.

    :param rna: RNA sequence over the alphabet U, C, A, G
    :return: the peptide (string), without a stop marker
    :raises KeyError: if a complete codon contains a symbol outside UCAG
    """
    residues = []
    # Stepping only over complete codons avoids looking up a partial
    # (or empty) codon when the sequence has no stop codon.
    for idx in range(0, len(rna) - 2, 3):
        amino_acid = CODON_TABLE[rna[idx:idx + 3]]
        if amino_acid == "*":
            break
        residues.append(amino_acid)
    return "".join(residues)
20 |
21 |
if __name__ == "__main__":
    # Given: An RNA string Pattern (first line of stdin).
    # Return: The translation of Pattern into an amino acid string Peptide.
    first_line = sys.stdin.read().splitlines()[0]

    print(translate_rna(first_line))
30 |
--------------------------------------------------------------------------------
/solutions/BA6G.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def cycle_to_chromosome(Nodes):
    """Convert a node sequence back into a signed chromosome.

    Nodes are consumed in consecutive pairs.  An increasing pair
    (2b - 1, 2b) encodes block +b; a decreasing pair (2b, 2b - 1)
    encodes block -b.
    """
    blocks = []
    idx = 0
    while idx < len(Nodes):
        head, tail = Nodes[idx], Nodes[idx + 1]
        blocks.append(tail // 2 if head < tail else -head // 2)
        idx += 2
    return blocks
12 |
13 |
if __name__ == "__main__":
    # Given: A sequence Nodes of integers between 1 and 2n, e.g. "(1 2 4 3)".
    # Return: The chromosome Chromosome containing n synteny blocks resulting
    #         from applying CycleToChromosome to Nodes.
    raw = sys.stdin.readline().strip()
    raw = raw.replace("(", "").replace(")", "")
    Nodes = list(map(int, raw.split()))

    chromosome = cycle_to_chromosome(Nodes)
    # Positive blocks are printed with an explicit "+" sign.
    signed = ["+" + str(block) if block > 0 else str(block) for block in chromosome]
    print("(" + " ".join(signed) + ")")
25 |
--------------------------------------------------------------------------------
/solutions/BA8B.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA8A import distance_from_centers
3 |
4 |
def squared_error_distortion(data, centers):
    """Return the mean of squared distance_from_centers values over data.

    Raises ZeroDivisionError when `data` is empty.
    """
    total = sum(distance_from_centers(centers, point) ** 2 for point in data)
    return total / len(data)
11 |
12 |
if __name__ == "__main__":
    # Given: Integers k and m, followed by a set of centers Centers and a
    #        set of points Data.
    # Return: The squared error distortion Distortion(Data, Centers).
    lines = sys.stdin.read().splitlines()
    k, m = [int(token) for token in lines[0].split()]

    # Lines 1..k hold the centers; line k + 1 is skipped and everything
    # after it is read as data points.
    centers = [list(map(float, line.split())) for line in lines[1:k + 1]]
    data = [list(map(float, line.split())) for line in lines[k + 2:]]

    print(squared_error_distortion(data, centers))
--------------------------------------------------------------------------------
/solutions/BA1F.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def skew(dna_seq):
    """Return (#G - #C) for the given sequence."""
    balance = 0
    for base in dna_seq:
        if base == "G":
            balance += 1
        elif base == "C":
            balance -= 1
    return balance
6 |
7 |
def minimal_skew(dna_seq):
    """Return every prefix length i at which Skew(Prefix_i) is minimal.

    Skew is (#G - #C) of the first i symbols, for i from 0 to
    len(dna_seq) inclusive.

    :param dna_seq: DNA string
    :return: list of positions (ints) in increasing order
    """
    # Maintain a running skew instead of recounting each prefix from
    # scratch, which made the original quadratic in the genome length.
    running = 0
    prefix_skews = [0]
    for base in dna_seq:
        if base == "G":
            running += 1
        elif base == "C":
            running -= 1
        prefix_skews.append(running)

    lowest = min(prefix_skews)
    return [i for i, value in enumerate(prefix_skews) if value == lowest]
22 |
23 |
if __name__ == "__main__":
    # Given: A DNA string Genome (first line of stdin).
    # Return: All integer(s) i minimizing Skew(Prefix_i(Genome)) over all
    #         values of i (from 0 to |Genome|).
    Genome = sys.stdin.read().splitlines()[0]

    print(" ".join(map(str, minimal_skew(Genome))))
33 |
--------------------------------------------------------------------------------
/solutions/BA3D.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from collections import OrderedDict
3 |
4 |
def deBruijn_graph(text, k):
    """Build the de Bruijn graph of the k-mers of text.

    Nodes are the (k - 1)-mers of text, kept in order of first appearance;
    each k-mer adds an edge from its prefix node to its suffix node.
    Successors are stored in a set, so repeated edges collapse.

    :return: OrderedDict mapping node -> set of successor nodes
    """
    node_len = k - 1
    nodes = [text[start:start + node_len] for start in range(len(text) - k + 2)]
    graph = OrderedDict((node, set()) for node in nodes)

    # Consecutive (k - 1)-mers are exactly the prefix/suffix of one k-mer.
    for source, target in zip(nodes, nodes[1:]):
        graph[source].add(target)

    return graph
16 |
17 |
if __name__ == "__main__":
    # Given: An integer k (line 1) and a string Text (line 2).
    # Return: DeBruijn_k(Text), in the form of an adjacency list.
    lines = sys.stdin.read().splitlines()
    k = int(lines[0])
    Text = lines[1]

    graph = deBruijn_graph(Text, k)
    for node, successors in graph.items():
        # Nodes without outgoing edges are not printed.
        if successors:
            print(node + ' -> ' + ",".join(successors))
31 |
--------------------------------------------------------------------------------
/solutions/BA6F.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def chromosome_to_cycle(Chromosome):
    """Convert a signed chromosome into its node sequence.

    Block +b becomes the pair (2b - 1, 2b); any other block b becomes the
    pair (-2b, -2b - 1), i.e. (2|b|, 2|b| - 1) for a negative block.
    """
    Nodes = []
    for block in Chromosome:
        head = 2 * abs(block)
        tail = head - 1
        Nodes.extend((tail, head) if block > 0 else (head, tail))
    return Nodes
14 |
15 |
if __name__ == "__main__":
    # Given: A chromosome Chromosome containing n synteny blocks,
    #        e.g. "(+1 -2 -3 +4)".
    # Return: The sequence Nodes of integers between 1 and 2n resulting from
    #         applying ChromosomeToCycle to Chromosome.
    raw = sys.stdin.readline().strip()
    raw = raw.replace("(", "").replace(")", "")
    Chromosome = list(map(int, raw.split()))

    nodes = chromosome_to_cycle(Chromosome)
    print("(" + " ".join(map(str, nodes)) + ")")
--------------------------------------------------------------------------------
/solutions/BA11C.py:
--------------------------------------------------------------------------------
1 | import sys
# Integer residue masses; 'X' (4) and 'Z' (5) are extra small-mass symbols
# defined alongside the twenty standard amino acids.
aa_table = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131, 'L': 113,
            'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163, 'X': 4, 'Z': 5}


def PeptideVector(peptide):
    """Return the peptide vector of `peptide`.

    The vector has one entry per integer mass unit up to the total peptide
    mass; entry m - 1 is 1 exactly when m is a prefix mass of the peptide.

    :param peptide: amino acid string over the keys of aa_table
    :return: list of 0/1 ints; an empty peptide gives an empty vector
    :raises KeyError: if the peptide contains an unknown symbol
    """
    # Accumulate prefix masses in one pass instead of re-summing each prefix.
    prefix_masses = []
    running = 0
    for aa in peptide:
        running += aa_table[aa]
        prefix_masses.append(running)

    if not prefix_masses:
        return []

    vector = [0] * prefix_masses[-1]
    for mass in prefix_masses:
        vector[mass - 1] = 1
    return vector
19 |
20 |
if __name__ == "__main__":
    # Given: A peptide P (all of stdin, trailing whitespace removed).
    # Return: The peptide vector of P, space-separated.
    peptide = sys.stdin.read().rstrip()
    print(' '.join(map(str, PeptideVector(peptide))))
--------------------------------------------------------------------------------
/solutions/BA1K.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA1A import count_pattern
3 |
4 |
def generate_all_kmers(k):
    """Return all 4**k DNA k-mers in lexicographic order.

    :param k: k-mer length (non-negative int); k == 0 yields [""]
    :return: list of strings over the alphabet A, C, G, T
    :raises ValueError: if k is negative
    """
    # The recursive version never reached its base case for k <= 0;
    # reject negative lengths explicitly and build the list iteratively.
    if k < 0:
        raise ValueError("k must be non-negative")
    kmers = [""]
    for _ in range(k):
        # Extending every prefix by A, C, G, T keeps lexicographic order.
        kmers = [prefix + nuc for prefix in kmers for nuc in "ACGT"]
    return kmers
14 |
15 |
def frequency_array(text, k):
    """Return the occurrence count in text of each k-mer.

    Counts follow the order produced by generate_all_kmers(k).
    """
    return [count_pattern(text, kmer) for kmer in generate_all_kmers(k)]
22 |
23 |
if __name__ == "__main__":
    # Given: A DNA string Text (line 1) and an integer k (line 2).
    # Return: The frequency array of k-mers in Text.
    lines = sys.stdin.read().splitlines()
    Text = lines[0]
    k = int(lines[1])

    print(" ".join(map(str, frequency_array(Text, k))))
34 |
--------------------------------------------------------------------------------
/solutions/BA2H.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA1G import hamming_dist
3 |
4 |
def distance(pattern, text):
    """Return the smallest Hamming distance between pattern and any window of text.

    Windows have the same length as pattern.  When text is shorter than
    pattern there is no window and float("Inf") is returned.
    """
    width = len(pattern)
    window_distances = (
        hamming_dist(text[start:start + width], pattern)
        for start in range(len(text) - width + 1)
    )
    return min(window_distances, default=float("Inf"))
13 |
14 |
def DistanceBetweenPatternAndStrings(dna_list, pattern):
    """Sum distance(pattern, dna) over every string in dna_list."""
    return sum(distance(pattern, dna) for dna in dna_list)
20 |
21 |
if __name__ == "__main__":
    # Given: A DNA string Pattern (line 1) and a collection of DNA strings
    #        Dna (line 2, whitespace-separated).
    # Return: DistanceBetweenPatternAndStrings(Pattern, Dna).
    lines = sys.stdin.read().splitlines()
    Pattern = lines[0]
    DNA_list = lines[1].split()

    print(DistanceBetweenPatternAndStrings(DNA_list, Pattern))
32 |
--------------------------------------------------------------------------------
/solutions/BA3I.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA3E import deBruijn_graph_kmers
3 | from BA3F import Eulerian_cycle
4 |
5 |
def k_universal_circular_string(k):
    """Return a k-universal circular binary string.

    All 2**k binary k-mers are generated (zero-padded to width k), their
    de Bruijn graph is built, and the cycle returned by Eulerian_cycle is
    spelled after dropping its last k - 1 nodes.
    """
    binary_kmers = [bin(value)[2:].zfill(k) for value in range(2 ** k)]

    cycle = Eulerian_cycle(deBruijn_graph_kmers(binary_kmers))

    trimmed = cycle[:len(cycle) - k + 1]
    # Spell the walk: first node minus its last symbol, then the last
    # symbol of every node.
    return trimmed[0][:-1] + "".join(node[-1] for node in trimmed)
22 |
23 |
if __name__ == "__main__":
    # Given: An integer k.
    # Return: A k-universal circular string. (If multiple answers exist, any one may be returned.)
    first_line = sys.stdin.read().splitlines()[0]
    k = int(first_line)

    print(k_universal_circular_string(k))
33 |
--------------------------------------------------------------------------------
/solutions/BA4J.py:
--------------------------------------------------------------------------------
1 | import sys
# Integer mass of each amino acid, keyed by its one-letter code.
MASS_TABLE = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
    'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163,
}


def LinearSpectrum(Peptide):
    """Return the sorted linear spectrum of ``Peptide``.

    The spectrum holds 0 plus the mass of every contiguous sub-peptide, each
    computed as a difference of two prefix masses taken from ``MASS_TABLE``.
    """
    prefix_masses = [0]
    for residue in Peptide:
        prefix_masses.append(prefix_masses[-1] + MASS_TABLE[residue])

    size = len(Peptide)
    fragment_masses = [
        prefix_masses[end] - prefix_masses[start]
        for start in range(size)
        for end in range(start + 1, size + 1)
    ]
    return sorted([0] + fragment_masses)
17 |
18 |
if __name__ == "__main__":
    # Given: An amino acid string Peptide.
    # Return: The linear spectrum of Peptide.
    Peptide = sys.stdin.readline().strip()

    spectrum = LinearSpectrum(Peptide)
    print(" ".join(str(mass) for mass in spectrum))
27 |
--------------------------------------------------------------------------------
/solutions/BA1N.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA1G import hamming_dist
3 |
4 |
def neighbors(pattern, d):
    """Return the d-neighborhood of ``pattern``.

    The neighborhood is the collection of all DNA strings of the same length
    as ``pattern`` that differ from it in at most ``d`` positions.

    Args:
        pattern: DNA string over the alphabet A, C, G, T.
        d: maximum number of mismatches allowed.

    Returns:
        An iterable collection of strings (a list for the base cases, a set
        otherwise).
    """
    if d == 0 or not pattern:
        # Return a one-element collection rather than the bare string, so that
        # callers iterating over the result get the pattern itself and not its
        # individual characters.  The empty-pattern guard also stops the
        # recursion below from never terminating.
        return [pattern]
    if len(pattern) == 1:
        return ['A', 'C', 'G', 'T']

    neighborhood = set()
    suffix = pattern[1:]
    for candidate in neighbors(suffix, d):
        # Hamming distance between the suffix and its neighbor (equal lengths).
        mismatches = sum(a != b for a, b in zip(suffix, candidate))
        if mismatches < d:
            # There is still budget left: any first nucleotide is allowed.
            for nuc in ['A', 'C', 'G', 'T']:
                neighborhood.add(nuc + candidate)
        else:
            # Budget exhausted: the first nucleotide must match the pattern.
            neighborhood.add(pattern[0] + candidate)
    return neighborhood
19 |
20 |
if __name__ == "__main__":
    # Given: A DNA string Pattern and an integer d.
    # Return: The collection of strings Neighbors(Pattern, d).
    lines = sys.stdin.read().splitlines()
    Pattern = lines[0]
    d = int(lines[1])

    for neighbor in neighbors(Pattern, d):
        print(neighbor)
33 |
--------------------------------------------------------------------------------
/solutions/BA6A.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def kSortingReversal(P, k):
    """Apply, in place, the reversal that brings element ``k + 1`` to index ``k``.

    Starting at index ``k`` the permutation is scanned for ``+(k + 1)`` or
    ``-(k + 1)``; the segment from ``k`` up to that position is reversed and
    every sign in it flipped.  The (mutated) list ``P`` is returned.
    """
    target = k + 1
    end = k
    while abs(P[end]) != target:
        end += 1
    P[k:end + 1] = [-value for value in reversed(P[k:end + 1])]
    return P


def GreedySorting(P):
    """Sort the signed permutation ``P`` greedily and record each step.

    For every index in turn, reversals are applied until that index holds
    its positive identity value.  A snapshot of the permutation is stored after
    every reversal; the list of snapshots is returned.  ``P`` is modified in
    place.
    """
    history = []
    for position in range(len(P)):
        expected = position + 1
        while P[position] != expected:
            P = kSortingReversal(P, position)
            history.append(P[:])
    return history
19 |
20 |
if __name__ == "__main__":
    # Given: A signed permutation P.
    # Return: The sequence of permutations corresponding to applying GreedySorting to P, ending with the
    # identity permutation.
    raw = sys.stdin.readline().strip()
    tokens = raw.replace("(", "").replace(")", "").split()
    P = [int(token) for token in tokens]

    for permutation in GreedySorting(P):
        signed = ["+" + str(x) if x > 0 else str(x) for x in permutation]
        print("(" + " ".join(signed) + ")")
34 |
--------------------------------------------------------------------------------
/solutions/BA6H.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA6F import chromosome_to_cycle
3 |
4 |
def colored_edges(P):
    """Return the colored edges of the genome ``P`` as ``[x, y]`` pairs.

    Each chromosome is converted to its node sequence with
    ``chromosome_to_cycle``; every node at an odd index is then paired with the
    node that follows it, wrapping around to the first node at the end.
    """
    edges = []
    for chromosome in P:
        nodes = chromosome_to_cycle(chromosome)
        count = len(nodes)
        # (j + 1) % count wraps the final node back to index 0.
        edges.extend([nodes[j], nodes[(j + 1) % count]] for j in range(1, count, 2))
    return edges
15 |
16 |
if __name__ == "__main__":
    # Given: A genome P.
    # Return: The collection of colored edges in the genome graph of P in the form (x, y).
    genome_line = sys.stdin.readline().strip()
    # Drop the outermost parentheses, then split between adjacent chromosomes.
    P = [
        [int(x) for x in chromosome.split(' ')]
        for chromosome in genome_line[1:-1].split(')(')
    ]

    formatted = [
        '(' + ', '.join(str(node) for node in edge) + ')'
        for edge in colored_edges(P)
    ]
    print(', '.join(formatted))
33 |
--------------------------------------------------------------------------------
/solutions/BA9Q.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def PartialSuffixArray(Text, K):
    """Return the partial suffix array of ``Text`` for multiples of ``K``.

    The suffix start positions are ordered lexicographically by suffix; the
    result lists ``(rank, start)`` pairs for every start position divisible by
    ``K``, in rank order.
    """
    suffix_order = sorted(range(len(Text)), key=lambda start: Text[start:])
    return [(rank, start) for rank, start in enumerate(suffix_order) if start % K == 0]
16 |
17 |
if __name__ == '__main__':
    # Given: A string Text and a positive integer K.
    # Return: SuffixArrayK(Text), in the form of a list of ordered pairs (i, SuffixArray(i)) for all
    # nonempty entries in the partial suffix array.
    lines = sys.stdin.read().splitlines()
    Text = lines[0]
    K = int(lines[1])

    for entry in PartialSuffixArray(Text, K):
        print(','.join(str(value) for value in entry))
31 |
--------------------------------------------------------------------------------
/solutions/BA4C.py:
--------------------------------------------------------------------------------
1 | import sys
# Integer mass of each amino acid, keyed by its one-letter code.
MASS_TABLE = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
    'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163,
}


def cyclospectrum(peptide):
    """Return the sorted theoretical spectrum of the cyclic ``peptide``.

    The spectrum contains 0, the mass of the whole peptide, and the mass of
    every sub-peptide of length 1 .. len(peptide) - 1 starting at each
    position, wrapping around the end of the peptide.
    """
    size = len(peptide)
    doubled = peptide + peptide  # lets wrap-around fragments be plain slices
    masses = [0, sum(MASS_TABLE[aa] for aa in peptide)]
    for length in range(1, size):
        for start in range(size):
            fragment = doubled[start:start + length]
            masses.append(sum(MASS_TABLE[aa] for aa in fragment))
    masses.sort()
    return masses
21 |
22 |
if __name__ == "__main__":
    # Given: An amino acid string Peptide.
    # Return: Cyclospectrum(Peptide).
    Peptide = sys.stdin.readline().strip()

    spectrum = cyclospectrum(Peptide)
    print(" ".join(str(mass) for mass in spectrum))
31 |
--------------------------------------------------------------------------------
/solutions/BA4L.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA4K import linear_score
3 |
4 |
def Trim(leaderboard, spectrum, N):
    """Keep the top ``N`` peptides of ``leaderboard``, including ties.

    Peptides are scored against ``spectrum`` with ``linear_score``.  Every
    peptide whose score is at least the N-th highest score is kept, in the
    original leaderboard order.  A leaderboard with at most ``N`` entries is
    returned unchanged.
    """
    if len(leaderboard) <= N:
        return leaderboard

    scored = [(peptide, linear_score(peptide, spectrum)) for peptide in leaderboard]
    cutoff = sorted((score for _, score in scored), reverse=True)[N - 1]

    return [peptide for peptide, score in scored if score >= cutoff]
17 |
18 |
if __name__ == "__main__":
    # Given: A leaderboard of linear peptides Leaderboard, a linear spectrum Spectrum, and an integer N.
    # Return: The top N peptides from Leaderboard scored against Spectrum. Remember to use LinearScore.
    lines = sys.stdin.read().splitlines()
    Leaderboard = lines[0].split()
    Spectrum = [int(mass) for mass in lines[1].split()]
    N = int(lines[2])

    kept = Trim(Leaderboard, Spectrum, N)
    print(" ".join(kept))
30 |
--------------------------------------------------------------------------------
/solutions/BA4B.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA1C import rev_comp
3 | from BA4A import translate_rna
4 |
5 |
def substrings_encoding_peptide(dna, peptide):
    """Return every substring of ``dna`` that encodes ``peptide``.

    A window of ``3 * len(peptide)`` nucleotides is reported when either it or
    its reverse complement (``rev_comp``), transcribed by replacing T with U,
    is translated to ``peptide`` by ``translate_rna``.
    """
    window = 3 * len(peptide)
    hits = []
    for start in range(len(dna) - window + 1):
        fragment = dna[start:start + window]
        forward_rna = fragment.replace("T", "U")
        reverse_rna = rev_comp(fragment).replace("T", "U")

        if translate_rna(forward_rna) == peptide or translate_rna(reverse_rna) == peptide:
            hits.append(forward_rna.replace("U", "T"))
    return hits
19 |
20 |
if __name__ == "__main__":
    # Given: A DNA string Text and an amino acid string Peptide.
    # Return: All substrings of Text encoding Peptide (if any such substrings exist).
    lines = sys.stdin.read().splitlines()
    Text = lines[0]
    Peptide = lines[1]

    matches = substrings_encoding_peptide(Text, Peptide)
    print("\n".join(matches))
31 |
--------------------------------------------------------------------------------
/solutions/BA9K.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def LastToFirst(BWT, i):
    """Return the first-column row of the symbol at row ``i`` of the last column.

    ``BWT`` is the last column of a Burrows-Wheeler matrix.  The first column
    is the stable sort of the last column, so the k-th occurrence of a symbol
    in the last column is also its k-th occurrence in the first column.  The
    answer is therefore (number of symbols smaller than ``BWT[i]``) +
    (number of occurrences of ``BWT[i]`` before row ``i``), computed in a
    single linear pass instead of building the whole mapping.

    Args:
        BWT: the transform string (last column).
        i: row index into the last column; negative indices count from the
            end, as with ordinary sequence indexing.

    Returns:
        The 0-based row of the same symbol occurrence in the first column.

    Raises:
        IndexError: if ``i`` is outside the bounds of ``BWT``.
    """
    symbol = BWT[i]  # also validates the index
    if i < 0:
        i += len(BWT)

    smaller = sum(1 for ch in BWT if ch < symbol)
    earlier_same = sum(1 for ch in BWT[:i] if ch == symbol)
    return smaller + earlier_same
24 |
25 |
if __name__ == "__main__":
    # Given: A string Transform and an integer i.
    # Return: The position LastToFirst(i) in FirstColumn in the Burrows-Wheeler matrix if
    # LastColumn = Transform.
    lines = sys.stdin.read().splitlines()
    Transform = lines[0]
    i = int(lines[1])

    position = LastToFirst(Transform, i)
    print(position)
36 |
--------------------------------------------------------------------------------
/solutions/BA5G.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def edit_distance(str1, str2):
    """Return the edit (Levenshtein) distance between ``str1`` and ``str2``.

    Insertions, deletions and substitutions each cost 1.  The dynamic
    programming table is filled row by row; only the previous row is kept.
    """
    # Row 0: turning the empty prefix of str1 into each prefix of str2.
    previous = list(range(len(str2) + 1))

    for i, char1 in enumerate(str1, start=1):
        current = [i]  # column 0: delete the first i characters of str1
        for j, char2 in enumerate(str2, start=1):
            substitute = previous[j - 1] + (1 if char1 != char2 else 0)
            delete = previous[j] + 1
            insert = current[j - 1] + 1
            current.append(min(substitute, delete, insert))
        previous = current

    return previous[-1]
24 |
25 |
if __name__ == "__main__":
    # Given: Two amino acid strings.
    # Return: The edit distance between these strings.
    lines = sys.stdin.read().splitlines()
    string1, string2 = lines[0], lines[1]

    print(edit_distance(string1, string2))
36 |
--------------------------------------------------------------------------------
/solutions/BA2C.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def probability(pattern, profile):
    """Return the probability of ``pattern`` under ``profile``.

    ``profile`` is indexed as ``profile[row][column]`` with rows in the order
    A, C, G, T; the result is the product of the entries selected by each
    nucleotide of ``pattern`` at its position.
    """
    indices = {"A": 0, "C": 1, "G": 2, "T": 3}
    prob = 1
    for i, nuc in enumerate(pattern):
        prob *= profile[indices[nuc]][i]
    return prob


def profile_most_probable_kmer(text, profile, k):
    """Return a ``profile``-most probable k-mer of ``text``.

    Ties are resolved in favour of the leftmost k-mer.  The leftmost k-mer is
    also returned when every k-mer has probability 0, and the empty string is
    returned when ``text`` is shorter than ``k`` (both cases previously failed
    with an unbound local variable).
    """
    kmers = (text[i:i + k] for i in range(len(text) - k + 1))
    # max() keeps the first of several equally probable k-mers.
    return max(kmers, key=lambda kmer: probability(kmer, profile), default="")
21 |
22 |
if __name__ == "__main__":
    # Given: A string Text, an integer k, and a 4 × k matrix Profile.
    # Return: A Profile-most probable k-mer in Text. (If multiple answers exist, any one may be returned.)
    lines = sys.stdin.read().splitlines()
    Text = lines[0]
    k = int(lines[1])
    profile = []
    for row in lines[2:]:
        profile.append([float(entry) for entry in row.split()])

    print(profile_most_probable_kmer(Text, profile, k))
34 |
--------------------------------------------------------------------------------
/solutions/BA7B.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def limb_length(distance_mat, j, n_leaves):
    """Return the limb length of leaf ``j`` for the distance matrix ``distance_mat``.

    The value ``(D[i][j] + D[j][k] - D[i][k]) / 2`` is evaluated for pairs of
    leaves ``i``, ``k`` other than ``j`` (``k`` ranging from ``i`` onwards in
    leaf order) and the minimum is returned.  Raises ``ValueError`` when there
    are no such pairs.
    """
    others = [leaf for leaf in range(n_leaves) if leaf != j]

    return min(
        (distance_mat[i][j] + distance_mat[j][k] - distance_mat[i][k]) / 2
        for pos, i in enumerate(others[:-1])
        for k in others[pos:]
    )
14 |
15 |
if __name__ == "__main__":
    # Given: An integer n, followed by an integer j between 0 and n - 1, followed by a space-separated
    # additive distance matrix D (whose elements are integers).
    # Return: The limb length of the leaf in Tree(D) corresponding to row j of this distance matrix
    # (use 0-based indexing).
    lines = sys.stdin.read().splitlines()
    n = int(lines[0])
    j = int(lines[1])
    distance_mat = []
    for row in lines[2:]:
        distance_mat.append([int(entry) for entry in row.split()])

    print(limb_length(distance_mat, j, n))
28 |
--------------------------------------------------------------------------------
/solutions/BA4H.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def convolution(spectrum):
    """Return the convolution of ``spectrum`` ordered by decreasing multiplicity.

    The convolution is the multiset of positive differences between pairs of
    masses.  Each distinct difference appears as many times as it occurs, and
    more frequent differences come first.

    Args:
        spectrum: collection of integer masses.  It is not modified; a sorted
            copy is used internally.

    Returns:
        A list of the positive pairwise differences grouped by value, most
        frequent value first.
    """
    ordered = sorted(spectrum)  # work on a copy instead of sorting the caller's list
    size = len(ordered)
    differences = [
        ordered[j] - ordered[i]
        for i in range(size - 1)
        for j in range(i + 1, size)
        if ordered[j] != ordered[i]
    ]

    # Single pass tally instead of calling list.count once per distinct mass.
    multiplicity = {}
    for mass in differences:
        multiplicity[mass] = multiplicity.get(mass, 0) + 1

    # The sort is stable, so equally frequent masses keep the set's iteration order.
    by_frequency = sorted(set(differences), key=multiplicity.__getitem__, reverse=True)

    result = []
    for mass in by_frequency:
        result.extend([mass] * multiplicity[mass])
    return result
21 |
22 |
if __name__ == "__main__":
    # Given: A collection of integers Spectrum.
    # Return: The list of elements in the convolution of Spectrum in decreasing order of their
    # multiplicities. If an element has multiplicity k, it should appear exactly k times.
    tokens = sys.stdin.readline().strip().split()
    spectrum = [int(token) for token in tokens]

    print(" ".join(str(mass) for mass in convolution(spectrum)))
32 |
--------------------------------------------------------------------------------
/solutions/BA11A.py:
--------------------------------------------------------------------------------
1 | import sys
# Integer mass of each amino acid, keyed by its one-letter code.
MASS_TABLE = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
    'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163,
}
# Reverse lookup mass -> amino acid; for masses shared by two amino acids the later table entry wins.
INV_MASS_TABLE = {mass: amino_acid for amino_acid, mass in MASS_TABLE.items()}


def spectrum_graph(spectrum):
    """Return the edges of the spectrum graph as ``[low, high, amino_acid]`` lists.

    An edge is emitted for every pair of masses, taken in list order, whose
    difference ``high - low`` is the mass of an amino acid in
    ``INV_MASS_TABLE``.
    """
    edges = []
    for index, low in enumerate(spectrum):
        for high in spectrum[index:]:
            gap = high - low
            if gap in INV_MASS_TABLE:
                edges.append([low, high, INV_MASS_TABLE[gap]])
    return edges
14 |
15 |
if __name__ == "__main__":
    # Given: A space-delimited list of integers Spectrum.
    # Return: Graph(Spectrum).
    line = sys.stdin.readline().strip()
    # Mass 0 is prepended before the graph is built.
    Spectrum = [0] + [int(token) for token in line.split()]

    for edge in spectrum_graph(Spectrum):
        print(str(edge[0]) + '->' + str(edge[1]) + ':' + str(edge[2]))
--------------------------------------------------------------------------------
/solutions/BA11D.py:
--------------------------------------------------------------------------------
1 | import sys
# Integer mass of each amino acid, keyed by its one-letter code.
aa_table = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
            'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163}
# Reverse lookup mass -> amino acid; for masses shared by two amino acids the later table entry wins.
mass_table = {v: k for k, v in aa_table.items()}
# Two extra symbols with masses 4 and 5.
mass_table[4] = 'X'
mass_table[5] = 'Z'


def ConvertPeptideVector(vector):
    """Convert a binary peptide vector into a peptide string.

    Every position ``i`` (0-based) holding a 1 marks a prefix mass ``i + 1``.
    Consecutive prefix masses are differenced (the first one against 0) and
    each difference is looked up in ``mass_table``.

    Args:
        vector: sequence of 0/1 integers.

    Returns:
        The peptide as a string; the empty string when ``vector`` contains no
        1 (this case previously raised ``IndexError``).

    Raises:
        KeyError: if a difference between prefix masses is not a known mass.
    """
    prefix_masses = [index + 1 for index, bit in enumerate(vector) if bit == 1]

    residues = []
    previous = 0
    for mass in prefix_masses:
        residues.append(mass_table[mass - previous])
        previous = mass

    return "".join(residues)
21 |
22 |
if __name__ == "__main__":
    # Given: A space-delimited binary vector P.
    # Return: A peptide whose binary peptide vector matches P. For masses with more than one amino acid,
    # any choice may be used.
    tokens = sys.stdin.read().rstrip().split()
    vector = [int(token) for token in tokens]

    print(ConvertPeptideVector(vector))
--------------------------------------------------------------------------------
/solutions/BA1E.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def most_freq_kmers(text, k, t):
    """Return the most frequent k-mers of ``text`` if they occur at least ``t`` times.

    All k-mers sharing the highest occurrence count are returned in order of
    first appearance; the result is empty when that count is below ``t``.
    Raises ``ValueError`` when ``text`` contains no k-mer at all.
    """
    tally = {}
    for start in range(len(text) - k + 1):
        kmer = text[start:start + k]
        tally[kmer] = tally.get(kmer, 0) + 1

    top = max(tally.values())
    return [kmer for kmer, count in tally.items() if count == top] if top >= t else []
17 |
18 |
def find_clumping_kmers(text, k, L, t):
    """Return all distinct k-mers forming an (L, t)-clump in ``text``.

    A k-mer forms an (L, t)-clump when some window of ``text`` of length ``L``
    contains it at least ``t`` times.  Every k-mer reaching the threshold in a
    window is reported, not only the most frequent one of that window (the
    previous implementation dropped k-mers that met ``t`` but were outnumbered
    by another k-mer in the same window).

    Args:
        text: the genome string.
        k: k-mer length.
        L: window length.
        t: minimum number of occurrences inside one window.

    Returns:
        A set of k-mers; empty when ``text`` is shorter than ``L``.
    """
    clumps = set()
    for window_start in range(len(text) - L + 1):
        window = text[window_start:window_start + L]

        counts = {}
        for i in range(len(window) - k + 1):
            kmer = window[i:i + k]
            counts[kmer] = counts.get(kmer, 0) + 1

        clumps.update(kmer for kmer, count in counts.items() if count >= t)
    return clumps
27 |
28 |
if __name__ == "__main__":
    # Given: A string Genome, and integers k, L, and t.
    # Return: All distinct k-mers forming (L, t)-clumps in Genome.
    lines = sys.stdin.read().splitlines()
    Genome = lines[0]
    k, L, t = (int(token) for token in lines[1].split())

    clumps = find_clumping_kmers(Genome, k, L, t)
    print(" ".join(clumps))
39 |
--------------------------------------------------------------------------------
/solutions/BA2A.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA1G import hamming_dist
3 | from BA1N import neighbors
4 |
5 |
def count_approx_pattern(text, pattern, d):
    """Count the windows of ``text`` within Hamming distance ``d`` of ``pattern``.

    Each substring of ``text`` with the length of ``pattern`` is compared
    using ``hamming_dist``.
    """
    width = len(pattern)
    return sum(
        1
        for start in range(len(text) - width + 1)
        if hamming_dist(text[start:start + width], pattern) <= d
    )
13 |
14 |
def motif_enumeration(dna_list, k, d):
    """Return the set of (k, d)-motifs of ``dna_list``.

    For every k-mer of every string, each member of its ``neighbors(…, d)``
    collection is kept when it occurs with at most ``d`` mismatches in all
    strings of ``dna_list``.
    """
    motifs = set()
    for dna in dna_list:
        for start in range(len(dna) - k + 1):
            for candidate in neighbors(dna[start:start + k], d):
                in_every_string = all(
                    count_approx_pattern(other, candidate, d) > 0 for other in dna_list
                )
                if in_every_string:
                    motifs.add(candidate)
    return motifs
25 |
26 |
if __name__ == "__main__":
    # Given: Integers k and d, followed by a collection of strings Dna.
    # Return: All (k, d)-motifs in Dna.
    lines = sys.stdin.read().splitlines()
    k, d = (int(token) for token in lines[0].split())
    DNA_list = lines[1:]

    motifs = motif_enumeration(DNA_list, k, d)
    print(" ".join(motifs))
37 |
--------------------------------------------------------------------------------
/solutions/BA2B.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA1G import hamming_dist
3 | from BA1K import generate_all_kmers
4 |
5 |
def distance(pattern, text):
    """Return the minimum ``hamming_dist`` between ``pattern`` and any equally long window of ``text``.

    The result is ``float("Inf")`` when ``text`` is shorter than ``pattern``.
    """
    size = len(pattern)
    windows = (text[offset:offset + size] for offset in range(len(text) - size + 1))
    return min((hamming_dist(window, pattern) for window in windows), default=float("Inf"))
14 |
15 |
def median_string(dna_list, k):
    """Return a k-mer minimising the total ``distance`` to the strings of ``dna_list``.

    All candidates come from ``generate_all_kmers(k)``; the first candidate
    with the smallest total is returned.
    """
    best_total = float("Inf")
    for kmer in generate_all_kmers(k):
        total = sum(distance(kmer, dna) for dna in dna_list)
        if total < best_total:
            best_total, med_str = total, kmer
    return med_str
28 |
29 |
if __name__ == "__main__":
    # Given: An integer k and a collection of strings Dna.
    # Return: A k-mer Pattern that minimizes d(Pattern, Dna) over all k-mers Pattern. (If multiple
    # answers exist, any one may be returned.)
    lines = sys.stdin.read().splitlines()
    k = int(lines[0])
    DNA_list = lines[1:]

    print(median_string(DNA_list, k))
41 |
--------------------------------------------------------------------------------
/solutions/BA10A.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def HiddenPathProbability(hidden_path, transition_matrix):
    """Return the probability of ``hidden_path`` under equal initial probabilities.

    Args:
        hidden_path: sequence of state symbols.
        transition_matrix: nested mapping ``transition_matrix[from][to]`` with
            one row per state.

    Returns:
        The initial probability multiplied by the transition probability of
        every consecutive pair of states in ``hidden_path``.
    """
    # Equal initial probabilities: 1 / number of states (0.5 for the two-state
    # case that used to be hard-coded).  The 0.5 fallback keeps the old result
    # for an empty matrix.
    prob_path = 1 / len(transition_matrix) if transition_matrix else .5

    for previous, current in zip(hidden_path, hidden_path[1:]):
        prob_path *= transition_matrix[previous][current]

    return prob_path
13 |
14 |
if __name__ == "__main__":
    # Given: A hidden path π followed by the states States and transition matrix Transition of an HMM
    # (Σ, States, Transition, Emission).
    # Return: The probability of this path, Pr(π). You may assume that initial probabilities are equal.
    tmp = sys.stdin.read().splitlines()

    hidden_path = tmp[0].strip()
    states = tmp[2].split()

    # Split on any whitespace so that tab- and space-separated matrices both parse.
    col_syms = tmp[4].split()
    transition_matrix = {}
    for line in tmp[5:]:
        fields = line.split()
        if not fields:
            # A blank line must not become an empty-named state.
            continue
        transition_matrix[fields[0]] = {
            col_sym: float(value) for col_sym, value in zip(col_syms, fields[1:])
        }

    print(HiddenPathProbability(hidden_path, transition_matrix))
--------------------------------------------------------------------------------
/solutions/BA9J.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def InverseBurrowsWheelerTransform(BWT):
    """Reconstruct the text whose Burrows-Wheeler transform is ``BWT``.

    Each symbol of the last column is labelled with its occurrence number; the
    first column is the stable sort of those labels by symbol.  Starting from
    the first ``$``, the row of the current label in the last column gives the
    next label of the text in the first column.

    Labels are ``(symbol, occurrence)`` tuples looked up through a dictionary,
    which keeps each step O(1) and leaves texts containing digit characters
    intact (string labels with digit stripping would drop them).

    Args:
        BWT: transform string containing a single ``$``.

    Returns:
        The original text, ending with ``$``.

    Raises:
        KeyError: if ``BWT`` has more than one symbol and no ``$``.
    """
    occurrences = {}
    last_col = []
    for char in BWT:
        occurrences[char] = occurrences.get(char, 0) + 1
        last_col.append((char, occurrences[char]))

    # Stable sort by symbol only, so occurrence numbers stay in order.
    first_col = sorted(last_col, key=lambda label: label[0])
    row_of = {label: row for row, label in enumerate(last_col)}

    symbols = []
    label = ('$', 1)
    for _ in range(len(BWT) - 1):
        label = first_col[row_of[label]]
        symbols.append(label[0])

    return ''.join(symbols) + '$'
33 |
34 |
if __name__ == "__main__":
    # Given: A string Transform (with a single "$" sign).
    # Return: The string Text such that BWT(Text) = Transform
    BWT = sys.stdin.read().splitlines()[0]

    restored = InverseBurrowsWheelerTransform(BWT)
    print(restored)
44 |
--------------------------------------------------------------------------------
/solutions/BA6J.py:
--------------------------------------------------------------------------------
def TwoBreakOnGenomeGraph(GenomeGraph, i1 , i2 , i3 , i4):
    """Apply a 2-break to ``GenomeGraph`` in place and return it.

    The edges joining ``i1``/``i2`` and ``i3``/``i4`` (stored in either
    orientation) are removed, then ``[i1, i3]`` and ``[i2, i4]`` are appended.
    ``ValueError`` is raised if an edge to remove is absent.
    """
    for left, right in ((i1, i2), (i3, i4)):
        edge = [left, right] if [left, right] in GenomeGraph else [right, left]
        GenomeGraph.remove(edge)
    GenomeGraph.extend([[i1, i3], [i2, i4]])
    return GenomeGraph
12 |
13 |
if __name__ == "__main__":
    # Given: The colored edges of a genome graph GenomeGraph, followed by indices i, i', j, and j'.
    # Return: The colored edges of the genome graph resulting from applying the 2-break operation.
    edges_line = input().rstrip()
    # Strip the outermost parentheses, then split between adjacent edges.
    GenomeGraph = [
        [int(node) for node in edge.split(', ')]
        for edge in edges_line[1:-1].split('), (')
    ]
    i1, i2, i3, i4 = map(int, input().rstrip().split(', '))

    result = TwoBreakOnGenomeGraph(GenomeGraph, i1, i2, i3, i4)
    formatted = ['(' + ', '.join(str(node) for node in edge) + ')' for edge in result]
    print(', '.join(formatted))
--------------------------------------------------------------------------------
/solutions/BA5C.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def LCS(v, w):
    """Return a longest common subsequence of the strings ``v`` and ``w``.

    A score table and a table of backtracking pointers are filled by dynamic
    programming; the subsequence is then read off by following the pointers
    from the bottom-right cell.  When several moves give the same score, "up"
    is preferred over "left", and "diag" is used only for a scoring match.

    Args:
        v: first string.
        w: second string.

    Returns:
        A longest common subsequence; the empty string when either input is
        empty.
    """
    # Pad both strings so that string index == table index.
    v = '-' + v
    w = '-' + w

    S = [[0 for _ in range(len(w))] for _ in range(len(v))]
    Backtrack = [[None for _ in range(len(w))] for _ in range(len(v))]

    for i in range(1, len(v)):
        for j in range(1, len(w)):
            tmp = S[i - 1][j - 1] + (1 if v[i] == w[j] else 0)
            S[i][j] = max(S[i - 1][j], S[i][j - 1], tmp)

            if S[i][j] == S[i - 1][j]:
                Backtrack[i][j] = "up"
            elif S[i][j] == S[i][j - 1]:
                Backtrack[i][j] = "left"
            else:
                Backtrack[i][j] = "diag"

    # Start explicitly at the bottom-right cell instead of relying on the loop
    # variables, which are undefined when either string is empty.
    i, j = len(v) - 1, len(w) - 1
    reversed_chars = []
    while i > 0 and j > 0:
        move = Backtrack[i][j]
        if move == "diag":
            reversed_chars.append(v[i])
            i -= 1
            j -= 1
        elif move == "left":
            j -= 1
        else:
            i -= 1

    return "".join(reversed(reversed_chars))
35 |
36 |
if __name__ == "__main__":
    # Given: Two strings.
    # Return: A longest common subsequence of these strings.
    lines = sys.stdin.read().splitlines()
    s, t = lines[0], lines[1]

    print(LCS(s, t))
--------------------------------------------------------------------------------
/solutions/BA10B.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def ProbabilityOutcomeGivenPath(x, hidden_path, emission_matrix):
    """Return Pr(x | hidden_path): the product of the emission probabilities.

    For each position, the factor is ``emission_matrix[state][symbol]`` where
    ``state`` is the entry of ``hidden_path`` and ``symbol`` the entry of ``x``
    at that position.
    """
    emission_prob = 1
    for position, symbol in enumerate(x):
        state = hidden_path[position]
        emission_prob *= emission_matrix[state][symbol]
    return emission_prob
10 |
11 |
if __name__ == "__main__":
    # Given: A string x, followed by the alphabet Σ from which x was constructed, followed by a hidden
    # path π, followed by the states States and emission matrix Emission of an HMM
    # (Σ, States, Transition, Emission).
    # Return: The conditional probability Pr(x|π) that string x will be emitted by the HMM given the
    # hidden path π.
    input_lines = sys.stdin.read().splitlines()

    x = input_lines[0]
    alphabet = input_lines[2].split(' ')
    hidden_path = input_lines[4]

    # Line 8 is the matrix header (column symbols); the rows follow it.
    col_syms = input_lines[8].split()
    emission_matrix = {}
    for row in input_lines[9:]:
        fields = row.split()
        row_sym = fields[0]
        emission_matrix[row_sym] = {}
        for offset, value in enumerate(fields[1:]):
            emission_matrix[row_sym][col_syms[offset]] = float(value)

    print(ProbabilityOutcomeGivenPath(x, hidden_path, emission_matrix))
--------------------------------------------------------------------------------
/solutions/BA5B.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
4 | def ManhattanTouristProblem(n, m, Down, Right):
5 | S = [[0 for _ in range(m + 1)] for _ in range(n + 1)]
6 |
7 | for i in range(1, n + 1):
8 | S[i][0] = S[i - 1][0] + Down[i - 1][0]
9 | for j in range(1, m + 1):
10 | S[0][j] = S[0][j - 1] + Right[0][j - 1]
11 |
12 | for i in range(1, n + 1):
13 | for j in range(1, m + 1):
14 | S[i][j] = max(S[i - 1][j] + Down[i - 1][j], S[i][j - 1] + Right[i][j - 1])
15 |
16 | return S[n][m]
17 |
18 |
if __name__ == "__main__":
    # Given: Integers n and m, followed by an n × (m+1) matrix Down and an (n+1) × m matrix Right. The
    # two matrices are separated by the "-" symbol.
    # Return: The length of a longest path from source (0, 0) to sink (n, m) in the n × m rectangular
    # grid whose edges are defined by the matrices Down and Right.
    input_lines = sys.stdin.read().splitlines()
    n, m = [int(x) for x in input_lines[0].split()]

    # Lines 1..n hold Down; line n + 1 is the "-" separator; the rest is Right.
    Down = [[int(x) for x in line.split()] for line in input_lines[1:n + 1]]
    Right = [[int(x) for x in line.split()] for line in input_lines[n + 2:]]

    # The grid has n rows and m columns (the call previously passed m for both).
    print(ManhattanTouristProblem(n, m, Down, Right))
39 |
--------------------------------------------------------------------------------
/solutions/BA6I.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA6G import cycle_to_chromosome
3 |
4 |
def graph_to_genome(GenomeGraph):
    """Convert the colored edges ``GenomeGraph`` into a list of chromosomes.

    Edges are consumed in the order given.  Consecutive edges are collected
    into the same cycle while the end node of one edge and the start node of
    the next differ by exactly 1; otherwise (or at the last edge) the cycle is
    closed.  Each cycle is rotated so its last node comes first and passed to
    ``cycle_to_chromosome``.

    NOTE(review): this appears to rely on the edges of each cycle being listed
    contiguously and in traversal order - confirm against callers.
    """
    cycles = []
    current = []
    last_index = len(GenomeGraph) - 1
    for index, edge in enumerate(GenomeGraph):
        current += edge
        if index == last_index:
            cycles.append(current)
        elif abs(edge[1] - GenomeGraph[index + 1][0]) != 1:
            cycles.append(current)
            current = []

    return [cycle_to_chromosome([cycle[-1]] + cycle[:-1]) for cycle in cycles]
23 |
24 |
if __name__ == "__main__":
    # Given: The colored edges of a genome graph.
    # Return: A genome corresponding to the genome graph.
    line = sys.stdin.readline().strip()
    Edges = []
    for chunk in line.split('), ('):
        cleaned = chunk.replace("(", "").replace(")", "")
        Edges.append([int(x) for x in cleaned.split(", ")])

    chromosomes = graph_to_genome(Edges)
    formatted = [
        '(' + ' '.join(('+' if i > 0 else '') + str(i) for i in chromosome) + ')'
        for chromosome in chromosomes
    ]
    print(''.join(formatted))
41 |
--------------------------------------------------------------------------------
/solutions/BA6E.py:
--------------------------------------------------------------------------------
def reverse_comp(Seq):
    """Return the reverse complement of the DNA string ``Seq``."""
    complemented = Seq.translate(Seq.maketrans('ATCG', 'TAGC'))
    return complemented[::-1]


def SharedKmers(k, seq1, seq2):
    """Return ``[x, y]`` start positions of k-mers shared by ``seq1`` and ``seq2``.

    A k-mer of ``seq2`` starting at ``y`` is shared with the k-mer of ``seq1``
    starting at ``x`` when the two are equal or reverse complements of each
    other.  Pairs are listed by increasing ``y``.
    """
    # Positions in seq1, grouped under whichever orientation was seen first.
    positions = {}
    for i in range(len(seq1) - k + 1):
        kmer = seq1[i:i + k]
        if kmer in positions:
            positions[kmer].append(i)
            continue
        flipped = reverse_comp(kmer)
        if flipped in positions:
            positions[flipped].append(i)
        else:
            positions[kmer] = [i]

    shared = []
    for j in range(len(seq2) - k + 1):
        kmer = seq2[j:j + k]
        if kmer in positions:
            hits = positions[kmer]
        else:
            hits = positions.get(reverse_comp(kmer), [])
        shared.extend([pos, j] for pos in hits)
    return shared
25 |
26 |
if __name__ == "__main__":
    # Given: An integer k and two strings.
    # Return: All k-mers shared by these strings, in the form of ordered pairs (x, y) corresponding to
    # starting positions of these k-mers in the respective strings.
    k = int(input().rstrip())
    seq1 = input().rstrip()
    seq2 = input().rstrip()

    for pair in SharedKmers(k, seq1, seq2):
        print('(' + ', '.join(str(position) for position in pair) + ')')
39 |
--------------------------------------------------------------------------------
/solutions/BA9D.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from Tree_Trie_classes import Tree
3 |
4 |
def backtrace_path_from_node(tree, node):
    """Spell the string on the path from the root of ``tree`` down to ``node``.

    NOTE: edge labels are sliced out of the module-level ``Text`` variable,
    which is only assigned when this file is run as a script.
    """
    # The root (label 0) contributes nothing to the path.
    if node.label != 0:
        # Linear scan for the edge that ends in this node.
        for candidate in tree.all_edges:
            if candidate.target_node == node:
                incoming_edge = candidate
                break

        start = incoming_edge.position
        own_label = Text[start: start + incoming_edge.length]
        parent_path = backtrace_path_from_node(tree, incoming_edge.from_node)
        return parent_path + own_label

    return ''
19 |
20 |
def LongestRepeat(Text):
    """Return a longest substring of ``Text`` that occurs more than once.

    Builds a suffix tree of ``Text + '$'`` and spells the path to the
    internal node (a node with outgoing edges) that has the largest
    ``depth``; the first such node wins ties.
    """
    suffix_tree = Tree()
    suffix_tree.PopulateSuffixTree(Text + '$')

    deepest = -1
    for candidate in suffix_tree.all_nodes:
        if len(candidate.edges) == 0:
            # Leaves correspond to single suffixes, not repeats.
            continue
        if candidate.depth > deepest:
            deepest = candidate.depth
            max_dep_node = candidate

    return backtrace_path_from_node(suffix_tree, max_dep_node)
34 |
35 |
if __name__ == "__main__":
    '''
    Given: A string Text.
    Return: A longest substring of Text that appears in Text more than once. (Multiple solutions may exist, in which
    case you may return any one.)
    '''
    # Text must remain a module-level name: backtrace_path_from_node reads it
    # as a global when slicing edge labels.
    raw_input_text = sys.stdin.read()
    Text = raw_input_text.rstrip()
    print(LongestRepeat(Text))
--------------------------------------------------------------------------------
/solutions/BA3G.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA3F import parse_adj_list
3 | from BA3F import Eulerian_cycle
4 |
5 |
def Eulerian_path(adj_list):
    """Return an Eulerian path of the directed graph ``adj_list``.

    The graph is closed into an Eulerian cycle by adding one edge from the
    node with one surplus incoming edge to the node with one surplus
    outgoing edge; the cycle is then rotated so that the added edge is cut.

    NOTE: ``adj_list`` is modified in place (the closing edge is appended).
    """
    # balance[node] = out-degree minus in-degree.
    balance = {}
    for source, targets in adj_list.items():
        balance[source] = balance.get(source, 0) + len(targets)
        for target in targets:
            balance[target] = balance.get(target, 0) - 1

    to_add_s = [node for node, diff in balance.items() if diff == -1][0]
    to_add_t = [node for node, diff in balance.items() if diff == 1][0]
    adj_list.setdefault(to_add_s, []).append(to_add_t)

    cycle = Eulerian_cycle(adj_list)

    # Locate the artificial edge inside the cycle.
    idx = 0
    while not (cycle[idx] == to_add_s and cycle[idx + 1] == to_add_t):
        idx += 1

    # Start right after the artificial edge and wrap around; cycle[0] repeats
    # the final node, so the wrapped part starts at index 1.
    return cycle[idx + 1:] + cycle[1:idx + 1]
33 |
34 |
if __name__ == "__main__":
    '''
    Given: A directed graph that contains an Eulerian path, where the graph is given in the form of an adjacency list.
    Return: An Eulerian path in this graph.
    '''
    Adj_list = parse_adj_list(sys.stdin.read().splitlines())
    path = Eulerian_path(Adj_list)

    print("->".join(path))
44 |
--------------------------------------------------------------------------------
/solutions/BA1I.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA1G import hamming_dist
3 |
4 |
def neighbors(pattern, d):
    """Return every string within Hamming distance ``d`` of ``pattern``.

    The result is a collection (list or set) of strings over the alphabet
    A/C/G/T, each the same length as ``pattern``.

    With ``d == 0`` (or an empty pattern) the only neighbour is the pattern
    itself; it is returned wrapped in a list so that callers iterating over
    the result see one whole string rather than its individual characters.
    """
    if d == 0 or not pattern:
        return [pattern]
    if len(pattern) == 1:
        return ['A', 'C', 'G', 'T']

    tail = pattern[1:]
    neighborhood = set()
    for suffix in neighbors(tail, d):
        # Hamming distance between the original tail and this variant.
        mismatches = sum(a != b for a, b in zip(tail, suffix))
        if mismatches < d:
            # Budget left: the first symbol may be anything.
            for nuc in ['A', 'C', 'G', 'T']:
                neighborhood.add(nuc + suffix)
        else:
            # Budget exhausted: the first symbol must stay as it is.
            neighborhood.add(pattern[0] + suffix)
    return neighborhood
19 |
20 |
def most_freq_kmers_mismatch(text, k, d):
    """Return the k-mers occurring most often in ``text`` with up to ``d`` mismatches.

    Every k-mer window of ``text`` votes for each member of its
    d-neighbourhood; all patterns tied for the highest vote are returned.
    """
    tallies = {}
    for start in range(len(text) - k + 1):
        window = text[start:start + k]
        for candidate in neighbors(window, d):
            tallies[candidate] = tallies.get(candidate, 0) + 1

    best = max(tallies.values())
    return [pattern for pattern, votes in tallies.items() if votes == best]
33 |
34 |
if __name__ == "__main__":
    '''
    Given: A string Text as well as integers k and d.
    Return: All most frequent k-mers with up to d mismatches in Text.
    '''
    input_lines = sys.stdin.read().splitlines()
    # First line: the text; second line: "k d".
    Text = input_lines[0]
    k, d = map(int, input_lines[1].split())

    frequent = most_freq_kmers_mismatch(Text, k, d)
    print(" ".join(frequent))
45 |
--------------------------------------------------------------------------------
/solutions/BA11H.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
# Integer masses of the 20 amino acids, keyed by one-letter code.
aa_table = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115,
    'G': 57, 'F': 147, 'I': 113, 'H': 137,
    'K': 128, 'M': 131, 'L': 113, 'N': 114,
    'Q': 128, 'P': 97, 'S': 87, 'R': 156,
    'T': 101, 'W': 186, 'V': 99, 'Y': 163,
}
# One entry per amino acid, so masses shared by two residues (113, 128)
# appear twice.
masses = list(aa_table.values())
6 |
7 |
def SpectralDictionarySize(spectral_vector, threshold, max_score):
    """Count peptides scoring between ``threshold`` and ``max_score`` against the vector.

    ``Size[i][t]`` is the number of peptides of total mass ``i`` whose score
    against the first ``i`` entries of ``spectral_vector`` is exactly ``t``.
    The return value is the sum of ``Size[m][t]`` over
    ``threshold <= t <= max_score`` with ``m = len(spectral_vector)``.
    """
    m = len(spectral_vector)

    # Base case: only the empty peptide has mass 0, and it scores 0.
    Size = {0: {0: 1}}
    for t in range(1, max_score + 1):
        Size[0][t] = 0

    for i in range(1, m + 1):
        amplitude = spectral_vector[i - 1]
        row = {}
        for t in range(max_score + 1):
            previous_t = t - amplitude
            if 0 <= previous_t <= max_score:
                # Extend every shorter peptide by one amino acid of mass a.
                row[t] = sum(Size[i - a][previous_t] for a in masses if i - a >= 0)
            else:
                row[t] = 0
        Size[i] = row

    return sum(Size[m][t] for t in range(threshold, max_score + 1))
31 |
32 |
if __name__ == "__main__":
    '''
    Given: A spectral vector Spectrum', an integer threshold, and an integer max_score.
    Return: The size of the dictionary Dictionarythreshold(Spectrum').
    '''
    tmp = sys.stdin.read().splitlines()
    # Line 1: space-separated vector; line 2: threshold; line 3: max_score.
    spectral_vector = list(map(int, tmp[0].rstrip().split(' ')))
    threshold, max_score = int(tmp[1]), int(tmp[2])

    print(SpectralDictionarySize(spectral_vector, threshold, max_score))
--------------------------------------------------------------------------------
/solutions/BA11I.py:
--------------------------------------------------------------------------------
1 | import sys
# Integer masses of the 20 amino acids, keyed by one-letter code.
aa_table = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115,
    'G': 57, 'F': 147, 'I': 113, 'H': 137,
    'K': 128, 'M': 131, 'L': 113, 'N': 114,
    'Q': 128, 'P': 97, 'S': 87, 'R': 156,
    'T': 101, 'W': 186, 'V': 99, 'Y': 163,
}
# One entry per amino acid, so masses shared by two residues (113, 128)
# appear twice.
masses = list(aa_table.values())
5 |
6 |
def SpectralDictionaryProbability(spectral_vector, threshold, max_score):
    """Return the probability of the spectral dictionary of the vector.

    ``Prob[i][t]`` is the probability mass of peptides of total mass ``i``
    scoring exactly ``t`` against the first ``i`` entries of
    ``spectral_vector``, where each appended amino acid contributes a factor
    of 1/20.  The result sums ``Prob[m][t]`` over
    ``threshold <= t <= max_score`` with ``m = len(spectral_vector)``.
    """
    m = len(spectral_vector)

    # Base case: the empty peptide has mass 0 and score 0 with probability 1.
    Prob = {0: {0: 1}}
    for t in range(1, max_score + 1):
        Prob[0][t] = 0

    for i in range(1, m + 1):
        amplitude = spectral_vector[i - 1]
        Prob[i] = {}
        for t in range(max_score + 1):
            previous_t = t - amplitude
            accumulated = 0
            if 0 <= previous_t <= max_score:
                # Plain left-to-right accumulation keeps float results stable.
                for a in masses:
                    if i - a >= 0:
                        accumulated += Prob[i - a][previous_t]
            Prob[i][t] = accumulated / 20

    final_Prob = 0
    for t in range(threshold, max_score + 1):
        final_Prob += Prob[m][t]
    return final_Prob
31 |
if __name__ == "__main__":
    '''
    Given: A spectral vector Spectrum', an integer threshold, and an integer max_score.
    Return: The probability of the dictionary Dictionarythreshold(Spectrum').
    '''
    tmp = sys.stdin.read().splitlines()
    # Line 1: space-separated vector; line 2: threshold; line 3: max_score.
    spectral_vector = list(map(int, tmp[0].rstrip().split(' ')))
    threshold, max_score = int(tmp[1]), int(tmp[2])

    print(SpectralDictionaryProbability(spectral_vector, threshold, max_score))
--------------------------------------------------------------------------------
/solutions/BA6C.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA6H import colored_edges
3 |
4 |
def find_next_edge(current, edges):
    """Return the first edge in ``edges`` sharing an endpoint with ``current``.

    Returns ``-1`` when ``edges`` is empty or no edge touches either
    endpoint of ``current``.
    """
    for candidate in edges:
        if current[0] in candidate or current[1] in candidate:
            return candidate
    return -1
14 |
15 |
def two_break_distance(P, Q):
    """Return the 2-break distance between genomes ``P`` and ``Q``.

    Computed as (number of endpoints in the coloured edges) // 2 minus the
    number of cycles found in the union of both genomes' coloured edges.
    """
    edges = colored_edges(P) + colored_edges(Q)
    blocks = {endpoint for edge in edges for endpoint in (edge[0], edge[1])}

    cycle_count = 0
    while len(edges) != 0:
        # Start a new cycle from the first unused edge, then keep following
        # any unused edge that touches the current one.
        current = edges.pop(0)
        current = find_next_edge(current, edges)
        while current != -1:
            edges.remove(current)
            current = find_next_edge(current, edges)
        cycle_count += 1

    return len(blocks) // 2 - cycle_count
36 |
37 |
if __name__ == "__main__":
    '''
    Given: Two genomes with circular chromosomes on the same set of synteny blocks.
    Return: The 2-break distance between these two genomes.
    '''
    input_lines = sys.stdin.read().splitlines()

    # Each genome is written as "(+1 -2 ...)(+3 ...)": strip the outer
    # parentheses, split the chromosomes, then parse the signed integers.
    P = [[int(x) for x in chromosome.split(' ')]
         for chromosome in input_lines[0][1:-1].split(')(')]
    Q = [[int(x) for x in chromosome.split(' ')]
         for chromosome in input_lines[1][1:-1].split(')(')]

    print(two_break_distance(P, Q))
57 |
--------------------------------------------------------------------------------
/solutions/BA9F.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from Tree_Trie_classes import Tree
3 |
4 |
def backtrace_path_from_node(tree, node, Text):
    """Spell the string on the path from the root of ``tree`` down to ``node``.

    Each edge stores a ``position`` and ``length`` into ``Text``; the labels
    of the edges on the root-to-node path are concatenated in order.
    """
    # The root (label 0) contributes nothing to the path.
    if node.label != 0:
        # Linear scan for the edge that ends in this node.
        for candidate in tree.all_edges:
            if candidate.target_node == node:
                incoming_edge = candidate
                break

        start = incoming_edge.position
        own_label = Text[start: start + incoming_edge.length]
        parent_path = backtrace_path_from_node(tree, incoming_edge.from_node, Text)
        return parent_path + own_label

    return ''
18 |
19 |
def Shortest_NonShared_Substring(Text1, Text2):
    """Return the path string of the shallowest '#'-marked internal node.

    A suffix tree is built over ``Text1 + '#' + Text2 + '$'`` and annotated
    with ``add_indicators()``.  Among nodes whose indicator is ``'#'`` and
    that have outgoing edges, the one with the smallest ``depth`` is chosen
    (the last such node wins ties) and its root path is spelled out.
    """
    combined_Text = Text1 + '#' + Text2 + '$'
    suffix_tree = Tree()
    suffix_tree.PopulateSuffixTree(combined_Text)
    suffix_tree.add_indicators()

    shallowest = 1e6
    for candidate in suffix_tree.all_nodes:
        if candidate.indicator != '#':
            continue
        if len(candidate.edges) != 0 and candidate.depth <= shallowest:
            shallowest = candidate.depth
            min_dep_node = candidate

    return backtrace_path_from_node(suffix_tree, min_dep_node, combined_Text)
37 |
38 |
if __name__ == "__main__":
    '''
    Given: Strings Text1 and Text2.
    Return: The shortest substring of Text1 that does not appear in Text2. (Multiple solutions may exist, in which case
    you may return any one.)
    '''
    input_lines = sys.stdin.read().splitlines()
    Text1, Text2 = input_lines[0], input_lines[1]

    print(Shortest_NonShared_Substring(Text1, Text2))
--------------------------------------------------------------------------------
/solutions/BA8A.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def euclidean_distance(pointA, pointB):
    """Return the Euclidean distance between two points of equal dimension.

    The dimension is taken from ``pointA``; ``pointB`` must have at least as
    many coordinates.
    """
    squared_total = 0
    for axis, coordinate in enumerate(pointA):
        squared_total += (coordinate - pointB[axis]) ** 2
    return squared_total ** 0.5
10 |
11 |
def distance_from_centers(centers, point):
    """Return the distance from ``point`` to its nearest center.

    Yields ``float("Inf")`` when ``centers`` is empty.
    """
    # Seeding the candidates with infinity reproduces "start at Inf and keep
    # any strictly smaller distance".
    candidates = [float("Inf")]
    candidates.extend(euclidean_distance(center, point) for center in centers)
    return min(candidates)
19 |
20 |
def max_distance_point(data, centers):
    """Return the point of ``data`` farthest from its nearest center.

    The first point reaching the maximum distance wins ties.
    """
    farthest_so_far = -1
    for candidate in data:
        separation = distance_from_centers(centers, candidate)
        if separation > farthest_so_far:
            farthest_so_far, max_point = separation, candidate
    return max_point
29 |
30 |
def farthest_first_traversal(data, k):
    """Pick ``k`` centers from ``data`` by farthest-first traversal.

    The first data point seeds the centers; each following center is the
    data point farthest from all centers chosen so far.
    """
    centers = [data[0]]
    # One center is already chosen, so k - 1 more picks are needed.
    for _ in range(k - 1):
        centers.append(max_distance_point(data, centers))
    return centers
37 |
38 |
if __name__ == "__main__":
    '''
    Given: Integers k and m followed by a set of points Data in m-dimensional space.
    Return: A set Centers consisting of k points (centers) resulting from applying FarthestFirstTraversal(Data, k),
    where the first point from Data is chosen as the first center to initialize the algorithm.
    '''
    input_lines = sys.stdin.read().splitlines()
    # Header line: "k m"; every following line is one point.
    k, m = map(int, input_lines[0].split())
    data = [list(map(float, line.split())) for line in input_lines[1:]]

    for center in farthest_first_traversal(data, k):
        print(" ".join(str(coordinate) for coordinate in center))
53 |
--------------------------------------------------------------------------------
/solutions/BA9B.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA9A import TrieConstruction
3 |
4 |
def PrefixTrieMatching(Prefix, trie):
    """Match a prefix of ``Prefix`` against the patterns stored in ``trie``.

    Walks down from ``trie.root`` following the symbols of ``Prefix``.

    Returns the pattern spelled on the way to the first leaf reached, or
    ``None`` when a symbol has no matching edge or ``Prefix`` runs out
    before a leaf is reached.  (Previously an exhausted ``Prefix`` kept
    re-using its last symbol, so text "AB" could falsely match pattern
    "ABB", and an empty ``Prefix`` raised IndexError.)
    """
    node = trie.root
    pattern = ''
    idx = 0

    while True:
        # A leaf marks the end of a stored pattern: report the match.
        if len(node.edges) == 0:
            return pattern

        # The text ended in the middle of a pattern: no match.
        if idx == len(Prefix):
            return None

        symbol = Prefix[idx]
        for edge in node.edges:
            if edge.label == symbol:
                pattern += symbol
                node = edge.target_node
                idx += 1
                break
        else:
            # No outgoing edge is labelled with the current symbol.
            return None
34 |
35 |
def TrieMatching(Text, trie):
    """Return every start index in ``Text`` at which a trie pattern matches.

    Each suffix of ``Text`` is tested with ``PrefixTrieMatching``; the
    indices of the suffixes that match are returned in increasing order.
    """
    return [
        start
        for start in range(len(Text))
        if PrefixTrieMatching(Text[start:], trie) is not None
    ]
46 |
47 |
if __name__ == "__main__":
    '''
    Given: A string Text and a collection of strings Patterns.
    Return: All starting positions in Text where a string from Patterns appears as a substring.
    '''
    tmp = sys.stdin.read().splitlines()
    # First line is the text; every remaining line is one pattern.
    Text = tmp[0]
    Patterns = tmp[1:]

    trie = TrieConstruction(Patterns)
    result = TrieMatching(Text, trie)
    print(' '.join(map(str, result)))
--------------------------------------------------------------------------------
/solutions/BA9E.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from Tree_Trie_classes import Tree
3 |
4 |
def backtrace_path_from_node(tree, node, Text):
    """Return the string spelled by the edges from the root to ``node``.

    Edge labels are substrings of ``Text`` described by each edge's
    ``position`` and ``length``; they are concatenated root-first.
    """
    if node.label == 0:
        # Reached the root: nothing more to prepend.
        return ''

    # Find the single edge pointing at this node.
    for candidate in tree.all_edges:
        if candidate.target_node == node:
            incoming_edge = candidate
            break

    begin = incoming_edge.position
    end = begin + incoming_edge.length
    edge_text = Text[begin:end]
    return backtrace_path_from_node(tree, incoming_edge.from_node, Text) + edge_text
18 |
19 |
def LongestSharedSubstring(Text1, Text2):
    """Return the path string of the deepest '*'-marked internal node.

    A suffix tree is built over ``Text1 + '#' + Text2 + '$'`` and annotated
    with ``add_indicators()``.  Among nodes whose indicator is ``'*'`` and
    that have outgoing edges, the one with the greatest ``depth`` is chosen
    (the last such node wins ties) and its root path is spelled out.
    """
    combined_Text = Text1 + '#' + Text2 + '$'
    suffix_tree = Tree()
    suffix_tree.PopulateSuffixTree(combined_Text)
    suffix_tree.add_indicators()

    deepest = -1
    for candidate in suffix_tree.all_nodes:
        if candidate.indicator != '*':
            continue
        if len(candidate.edges) != 0 and candidate.depth >= deepest:
            deepest = candidate.depth
            max_dep_node = candidate

    return backtrace_path_from_node(suffix_tree, max_dep_node, combined_Text)
37 |
38 |
if __name__ == "__main__":
    '''
    Given: Strings Text1 and Text2.
    Return: The longest substring that occurs in both Text1 and Text2. (Multiple solutions may exist, in which case you
    may return any one.)
    '''
    input_lines = sys.stdin.read().splitlines()
    Text1, Text2 = input_lines[0], input_lines[1]

    print(LongestSharedSubstring(Text1, Text2))
--------------------------------------------------------------------------------
/solutions/BA9A.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from Tree_Trie_classes import Trie
3 |
def TrieConstruction(Pattern_list):
    """Build and return a ``Trie`` containing every string in ``Pattern_list``.

    Each pattern is threaded from the root: existing edges are followed
    where the next symbol already has one, otherwise a new node and edge
    are added.
    """
    trie = Trie()

    for Pattern in Pattern_list:
        currentNode = trie.root

        for currentSymbol in Pattern:
            # First outgoing edge labelled with the symbol, if there is one.
            matching_edge = next(
                (edge for edge in currentNode.edges if edge.label == currentSymbol),
                None,
            )
            if matching_edge is None:
                # Branch off: new node plus an edge labelled currentSymbol.
                newNode = trie.add_node()
                trie.add_edge(currentNode, newNode, currentSymbol)
                currentNode = newNode
            else:
                currentNode = matching_edge.target_node
    return trie
24 |
25 |
if __name__ == "__main__":
    '''
    Given: A collection of strings Patterns.
    Return: The adjacency list corresponding to Trie(Patterns), in the following format. If Trie(Patterns) has n nodes,
    first label the root with 1 and then label the remaining nodes with the integers 2 through n in any order you like.
    Each edge of the adjacency list of Trie(Patterns) will be encoded by a triple: the first two members of the triple
    must be the integers labeling the initial and terminal nodes of the edge, respectively; the third member of the
    triple must be the symbol labeling the edge.
    '''
    Patterns = sys.stdin.read().splitlines()
    trie = TrieConstruction(Patterns)

    # One "from->to:symbol" line per edge.
    for edge in trie.all_edges:
        print(f"{edge.from_node.label}->{edge.target_node.label}:{edge.label}")
--------------------------------------------------------------------------------
/solutions/BA3J.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from collections import defaultdict
3 | from BA3G import Eulerian_path
4 |
5 |
def deBruijn_graph_paired_reads(paired_reads):
    """Build the paired de Bruijn graph of ``paired_reads``.

    Every read pair ``(a, b)`` adds an edge from the node
    ``(a[:-1], b[:-1])`` to the node ``(a[1:], b[1:])``.  The graph is
    returned as a ``defaultdict(list)`` adjacency list.
    """
    graph = defaultdict(list)
    for pair in paired_reads:
        first_read, second_read = pair[0], pair[1]
        prefix_node = (first_read[:-1], second_read[:-1])
        suffix_node = (first_read[1:], second_read[1:])
        graph[prefix_node].append(suffix_node)
    return graph
11 |
12 |
def StringSpelledByGappedPatterns(GappedPatterns, k, d):
    """Spell the string described by a path of gapped pattern pairs.

    The first components of the pairs spell a prefix string and the second
    components a suffix string (first symbol of every pair, plus the whole
    of the last pair).  The two must agree where they overlap at an offset
    of ``k + d + 1``.

    Returns the merged string, or ``-1`` when the overlap is inconsistent.
    """
    final_index = len(GappedPatterns) - 1
    first_pieces = []
    second_pieces = []
    for position, pattern_pair in enumerate(GappedPatterns):
        if position == final_index:
            # The last pair contributes both patterns in full.
            first_pieces.append(pattern_pair[0])
            second_pieces.append(pattern_pair[1])
        else:
            first_pieces.append(pattern_pair[0][0])
            second_pieces.append(pattern_pair[1][0])
    prefix_string = ''.join(first_pieces)
    suffix_string = ''.join(second_pieces)

    offset = k + d + 1
    for i in range(offset, len(prefix_string)):
        if prefix_string[i] != suffix_string[i - offset]:
            return -1
    return prefix_string + suffix_string[len(suffix_string) - offset:]
27 |
28 |
def string_reconstruction_read_pairs(k, d, paired_reads):
    """Reconstruct a string from its (k, d)-mer composition ``paired_reads``.

    Builds the paired de Bruijn graph, takes an Eulerian path through it and
    spells the path; the path nodes are (k-1)-mer pairs, hence ``k - 1``.
    """
    graph = deBruijn_graph_paired_reads(paired_reads)
    node_path = Eulerian_path(graph)
    return StringSpelledByGappedPatterns(node_path, k - 1, d)
33 |
34 |
if __name__ == "__main__":
    '''
    Given: Integers k and d followed by a collection of paired k-mers PairedReads.
    Return: A string Text with (k, d)-mer composition equal to PairedReads. (If multiple answers exist, you may return
    any one.)
    '''
    input_lines = sys.stdin.read().splitlines()
    k, d = map(int, input_lines[0].split())
    # Each remaining line is one read pair written as "first|second".
    PairedReads = [line.split("|") for line in input_lines[1:]]

    print(string_reconstruction_read_pairs(k, d, PairedReads))
48 |
--------------------------------------------------------------------------------
/solutions/BA1J.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA1C import rev_comp
3 | from BA1G import hamming_dist
4 |
5 |
def neighbors(pattern, d):
    """Return every string within Hamming distance ``d`` of ``pattern``.

    The result is a collection (list or set) of strings over the alphabet
    A/C/G/T, each the same length as ``pattern``.

    With ``d == 0`` (or an empty pattern) the only neighbour is the pattern
    itself; it is returned wrapped in a list so that callers iterating over
    the result see one whole string rather than its individual characters.
    """
    if d == 0 or not pattern:
        return [pattern]
    if len(pattern) == 1:
        return ['A', 'C', 'G', 'T']

    tail = pattern[1:]
    neighborhood = set()
    for suffix in neighbors(tail, d):
        # Hamming distance between the original tail and this variant.
        mismatches = sum(a != b for a, b in zip(tail, suffix))
        if mismatches < d:
            # Budget left: the first symbol may be anything.
            for nuc in ['A', 'C', 'G', 'T']:
                neighborhood.add(nuc + suffix)
        else:
            # Budget exhausted: the first symbol must stay as it is.
            neighborhood.add(pattern[0] + suffix)
    return neighborhood
20 |
21 |
def most_freq_kmers_mismatch_revc(text, k, d):
    """Return the most frequent k-mers with mismatches and reverse complements.

    Every k-mer window of ``text`` votes for each member of its
    d-neighbourhood and for that member's reverse complement; all patterns
    tied for the highest vote are returned.
    """
    count_dict = {}
    for start in range(len(text) - k + 1):
        window = text[start:start + k]
        for approx_pattern in neighbors(window, d):
            for variant in (approx_pattern, rev_comp(approx_pattern)):
                count_dict[variant] = count_dict.get(variant, 0) + 1

    best = max(count_dict.values())
    return [pattern for pattern, votes in count_dict.items() if votes == best]
39 |
40 |
if __name__ == "__main__":
    '''
    Given: A DNA string Text as well as integers k and d.
    Return: All k-mers Pattern maximizing the sum Countd(Text, Pattern) + Countd(Text, Pattern) over all possible
    k-mers.
    '''
    input_lines = sys.stdin.read().splitlines()
    # First line: the text; second line: "k d".
    Text = input_lines[0]
    k, d = map(int, input_lines[1].split())

    frequent = most_freq_kmers_mismatch_revc(Text, k, d)
    print(" ".join(frequent))
52 |
--------------------------------------------------------------------------------
/solutions/BA11F.py:
--------------------------------------------------------------------------------
1 | import sys
# Integer masses of the 20 amino acids, keyed by one-letter code.
aa_table = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115,
    'G': 57, 'F': 147, 'I': 113, 'H': 137,
    'K': 128, 'M': 131, 'L': 113, 'N': 114,
    'Q': 128, 'P': 97, 'S': 87, 'R': 156,
    'T': 101, 'W': 186, 'V': 99, 'Y': 163,
}
4 |
5 |
def PeptideVector(peptide):
    """Convert a peptide, given as a list of masses, into its peptide vector.

    The vector has one entry per unit of total mass and holds a 1 at index
    ``prefix_mass - 1`` for every prefix mass of the peptide, 0 elsewhere.
    An empty peptide yields an empty vector.
    """
    # Running total gives all prefix masses in one linear pass.
    prefix_masses = []
    running_mass = 0
    for mass in peptide:
        running_mass += mass
        prefix_masses.append(running_mass)

    if not prefix_masses:
        return []

    vector = [0] * prefix_masses[-1]
    for mass in prefix_masses:
        vector[mass - 1] = 1
    return vector
14 |
15 |
def PeptideIdentification(spectral_vector, proteome, mass_table=None):
    """Return the substring of ``proteome`` scoring highest against the vector.

    Only peptides whose total mass equals ``len(spectral_vector)`` are
    candidates.  A peptide's score is the sum of the vector entries at its
    prefix masses.  Every substring is considered, including single residues
    and peptides ending at the last residue of the proteome.  The first
    peptide reaching the best score wins ties.

    Parameters:
        spectral_vector: list of integer amplitudes, one per unit of mass.
        proteome: amino acid string to search.
        mass_table: optional mapping from residue symbol to positive integer
            mass; defaults to the module-level ``aa_table``.

    Raises:
        ValueError: if no substring of ``proteome`` has the required mass.
    """
    if mass_table is None:
        mass_table = aa_table

    target_mass = len(spectral_vector)
    mass_list = [mass_table[aa] for aa in proteome]

    best_peptide = None
    best_score = None
    for start in range(len(mass_list)):
        prefix_mass = 0
        score = 0
        for end in range(start, len(mass_list)):
            prefix_mass += mass_list[end]
            if prefix_mass > target_mass:
                # Masses are positive, so longer peptides only get heavier.
                break
            score += spectral_vector[prefix_mass - 1]
            if prefix_mass == target_mass:
                if best_score is None or score > best_score:
                    best_score = score
                    best_peptide = proteome[start:end + 1]
                break

    if best_peptide is None:
        raise ValueError("no peptide in proteome matches the mass of the spectral vector")
    return best_peptide
39 |
40 |
if __name__ == "__main__":
    '''
    Given: A space-delimited spectral vector S and an amino acid string Proteome.
    Return: A peptide in Proteome with maximum score against S.
    '''
    tmp = sys.stdin.read().splitlines()
    # Line 1: the spectral vector; line 2: the proteome string.
    spectral_vector = list(map(int, tmp[0].rstrip().split(' ')))
    proteome = tmp[1].rstrip()

    print(PeptideIdentification(spectral_vector, proteome))
51 |
52 |
--------------------------------------------------------------------------------
/solutions/BA4G.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA4E import expand, cyclospectrum_mass_peptide
3 |
4 |
def Score(peptide, spectrum):
    """Score ``peptide`` against ``spectrum``.

    The score is the number of masses shared, counting multiplicity, between
    the peptide's cyclospectrum and ``spectrum``.
    """
    theoretical = cyclospectrum_mass_peptide(peptide)
    distinct_masses = set(theoretical + spectrum)
    return sum(
        min(theoretical.count(mass), spectrum.count(mass))
        for mass in distinct_masses
    )
12 |
13 |
def Trim(leaderboard, spectrum, N):
    """Keep the ``N`` best-scoring peptides of ``leaderboard``, with ties.

    A leaderboard of at most ``N`` peptides is returned unchanged.
    Otherwise every peptide scoring at least as much as the N-th best is
    kept, in its original order.
    """
    if len(leaderboard) <= N:
        return leaderboard

    peptide_scores = [Score(peptide, spectrum) for peptide in leaderboard]
    # Score of the N-th best peptide is the admission cutoff.
    cutoff = sorted(peptide_scores, reverse=True)[N - 1]

    return [
        peptide
        for peptide, score in zip(leaderboard, peptide_scores)
        if score >= cutoff
    ]
26 |
27 |
def leaderboard_cyclopeptide_sequencing(spectrum, N):
    """Search for the cyclic peptide that best explains ``spectrum``.

    Peptides are grown one residue at a time.  Each round records the best
    scorer among peptides whose mass equals the parent mass
    (``spectrum[-1]``), drops peptides that are too heavy, and trims the
    rest to the top ``N`` (with ties).  Returns the best peptide found as a
    list of masses.
    """
    leaderboard = [[]]
    leader_peptide = []

    while leaderboard:
        candidates = expand(leaderboard)

        # Candidates with exactly the parent mass compete for the lead.
        for peptide in candidates:
            if sum(peptide) == spectrum[-1]:
                if Score(peptide, spectrum) > Score(leader_peptide, spectrum):
                    leader_peptide = peptide

        # Anything heavier than the parent mass can never fit.
        within_mass = [pep for pep in candidates if sum(pep) <= spectrum[-1]]
        leaderboard = Trim(within_mass, spectrum, N)
    return leader_peptide
42 |
43 |
if __name__ == "__main__":
    '''
    Given: An integer N and a collection of integers Spectrum.
    Return: LeaderPeptide after running LeaderboardCyclopeptideSequencing(Spectrum, N).
    '''
    input_lines = sys.stdin.read().splitlines()
    N = int(input_lines[0])
    Spectrum = list(map(int, input_lines[1].split()))

    leader = leaderboard_cyclopeptide_sequencing(Spectrum, N)
    print("-".join(str(mass) for mass in leader))
54 |
--------------------------------------------------------------------------------
/solutions/BA4M.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def delta2(S, T):
    """Return the absolute differences between every element of S and of T.

    The result is a list with one entry per (s, t) pair, ordered by the
    iteration order of ``S`` and then of ``T``.
    """
    left_values = list(S)
    right_values = list(T)
    return [abs(a - b) for a in left_values for b in right_values]
12 |
13 |
def is_multi_subset(A, B):
    """Return True when list ``A`` is a sub-multiset of list ``B``.

    That is, every element occurs in ``B`` at least as often as in ``A``.
    """
    return all(A.count(elem) <= B.count(elem) for elem in A)
19 |
20 |
def difference(A, B):
    """Return the multiset difference ``A - B`` as a sorted list.

    Each element is kept as many times as it occurs in ``A`` beyond its
    occurrences in ``B``.
    """
    remaining = []
    for elem in set(A):
        surplus = A.count(elem) - B.count(elem)
        if surplus > 0:
            remaining.extend([elem] * surplus)
    remaining.sort()
    return remaining
32 |
33 |
def Place(dist_l):
    """Backtracking step of the partial-digest search.

    ``dist_l`` is the sorted list of distances still unexplained.  The
    largest one, ``y``, is tried as a point at ``y`` and then at
    ``width - y``; a placement is kept only if all its distances to the
    points already placed are still available in ``dist_l``.

    Works on the module-level set ``X`` (points placed so far, modified in
    place) and the module-level ``width``.  Returns ``X`` once every
    distance is explained, or an empty dict when this branch fails.
    """
    if not dist_l:
        return X

    y = dist_l[-1]
    # Try the left placement first, then its mirror image on the right.
    for candidate in (y, width - y):
        tmp = delta2({candidate}, X)
        if is_multi_subset(tmp, dist_l):
            X.add(candidate)
            outcome = Place(difference(dist_l, tmp))
            if outcome:
                return outcome
            # Dead end: undo the placement before trying the alternative.
            X.remove(candidate)

    return {}
60 |
61 |
if __name__ == "__main__":
    '''
    Given: A collection of integers L.
    Return: A set A such that ∆A = L.
    '''
    # Zhang Z. An exponential example for a partial digest mapping algorithm. J Comput Biol. 1994;1(3):235-9.
    tokens = sys.stdin.readline().strip().split()
    # Only the positive differences are needed.
    L = [value for value in map(int, tokens) if value > 0]

    # X and width are module-level names read by Place(); the last remaining
    # difference is taken as the width.
    width = L.pop()
    X = {0, width}

    result = Place(L)

    print(" ".join(str(point) for point in result))
77 |
--------------------------------------------------------------------------------
/solutions/BA5N.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
class Node:
    """A graph node carrying a label, its neighbours and a visited flag."""

    def __init__(self, lbl):
        self.label = lbl
        # Nodes with an edge into / out of this node, respectively.
        self.parent_nodes, self.target_nodes = [], []
        # Set once a traversal has processed this node.
        self.visited = False
10 |
11 |
class DAG:
    """A directed graph of ``Node`` objects supporting topological sorting."""

    def __init__(self):
        # label -> Node, in order of first appearance.
        self.nodes_dict = {}
        self.distances = {}
        self.backtrack = {}

    def add_node(self, lbl):
        """Return the node labelled ``lbl``, creating it on first use."""
        if lbl not in self.nodes_dict:
            self.nodes_dict[lbl] = Node(lbl)
        return self.nodes_dict[lbl]

    def contruct_dag(self, adj_list_text):
        """Populate the graph from lines of the form ``"a -> b,c"``."""
        for line in adj_list_text:
            source_label, successors_text = line.split(" -> ")
            from_node = self.add_node(source_label)

            for successor_label in successors_text.split(","):
                to_node = self.add_node(successor_label)
                from_node.target_nodes.append(to_node)
                to_node.parent_nodes.append(from_node)

    def topological_sort_util(self, node, stack):
        """Depth-first visit of ``node``; prepends labels to ``stack`` in post-order."""
        node.visited = True
        for successor in node.target_nodes:
            if successor.visited:
                continue
            self.topological_sort_util(successor, stack)
        # All successors are already in the stack, so this node goes in front.
        stack.insert(0, node.label)

    def topological_sort(self):
        """Return the node labels in a topological order."""
        stack = []
        for node in self.nodes_dict.values():
            if not node.visited:
                self.topological_sort_util(node, stack)
        return stack
53 |
54 |
if __name__ == "__main__":
    '''
    Given: The adjacency list of a graph (with nodes represented by integers).
    Return: A topological ordering of this graph.
    '''
    adj_list_text = sys.stdin.read().splitlines()

    graph = DAG()
    graph.contruct_dag(adj_list_text)
    ordering = graph.topological_sort()
    print(", ".join(ordering))
--------------------------------------------------------------------------------
/solutions/BA8C.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from collections import defaultdict
3 | from BA8A import euclidean_distance
4 |
5 |
def closest_center(point, centers):
    """Return the center nearest to ``point`` by Euclidean distance.

    The first center reaching the minimum distance wins ties.
    """
    best_distance = float("Inf")
    for candidate in centers:
        separation = euclidean_distance(candidate, point)
        if separation < best_distance:
            best_distance, closest = separation, candidate
    return closest
14 |
15 |
def cluster_mean(cluster):
    """Return the coordinate-wise mean of the points in ``cluster``.

    The dimension is taken from the first point; ``cluster`` must not be
    empty.
    """
    dimensions = range(len(cluster[0]))
    totals = [0 for _ in dimensions]
    for point in cluster:
        # Accumulate point by point so float rounding follows input order.
        totals = [totals[axis] + point[axis] for axis in dimensions]
    size = len(cluster)
    return [total / size for total in totals]
24 |
25 |
def lloyd_k_means(data, k):
    """Run the Lloyd algorithm on ``data``, seeded with its first ``k`` points.

    Alternates between assigning every point to its closest center and
    moving each center to the mean of its cluster, until the centers stop
    changing.  Returns the final list of centers.
    """
    centers = data[:k]

    while True:
        # Centers to clusters: group points under their closest center.
        cluster_assignments = defaultdict(list)
        for point in data:
            nearest = closest_center(point, centers)
            cluster_assignments[tuple(nearest)].append(point)

        # Clusters to centers: each center moves to its cluster's mean.
        new_centers = [
            cluster_mean(cluster_assignments[tuple(centers[i])])
            for i in range(k)
        ]

        if new_centers == centers:
            return centers
        centers = new_centers[:]
46 |
47 |
if __name__ == "__main__":
    '''
    Given: Integers k and m followed by a set of points Data in m-dimensional space.
    Return: A set Centers consisting of k points (centers) resulting from applying the Lloyd algorithm to Data and
    Centers, where the first k points from Data are selected as the first k centers.
    '''
    input_lines = sys.stdin.read().splitlines()
    # Header line: "k m"; every following line is one point.
    k, m = map(int, input_lines[0].split())
    data = [list(map(float, line.split())) for line in input_lines[1:]]

    centers = lloyd_k_means(data, k)
    for center in centers:
        print(" ".join(str(coordinate) for coordinate in center))
61 |
--------------------------------------------------------------------------------
/solutions/BA4I.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA4H import convolution
3 | from BA4G import Score, Trim
4 |
5 |
def find_masses(spectrum, M):
    """Return the ``M`` most frequent convolution masses, with ties.

    Only elements of the convolution of ``spectrum`` between 57 and 200
    (inclusive) are considered.  The cutoff is the frequency of the M-th
    most frequent mass (index ``M - 1``); previously index ``M`` was used,
    which admitted one rank too many and raised IndexError when there were
    at most ``M`` distinct masses.

    Returns a sorted list of masses; empty when ``M <= 0`` or when no
    convolution element falls in range.
    """
    conv = [x for x in convolution(spectrum) if 57 <= x <= 200]

    # Single pass frequency count.
    freq_dict = {}
    for mass in conv:
        freq_dict[mass] = freq_dict.get(mass, 0) + 1

    if not freq_dict or M <= 0:
        return []

    ranked_freqs = sorted(freq_dict.values(), reverse=True)
    # Clamp so that asking for more masses than exist returns all of them.
    cutoff = ranked_freqs[min(M, len(ranked_freqs)) - 1]

    return sorted(mass for mass, freq in freq_dict.items() if freq >= cutoff)
18 |
19 |
def Expand(peptides, masses):
    """Return every peptide of ``peptides`` extended by each mass in ``masses``.

    Peptides are lists of masses; the input lists are not modified.
    """
    return [pep + [mass] for pep in peptides for mass in masses]
26 |
27 |
def convolution_cyclopeptide_sequencing(spectrum, M, N):
    """Leaderboard cyclopeptide sequencing over convolution-derived masses.

    The amino-acid alphabet is ``find_masses(spectrum, M)``.  Peptides are
    grown one mass at a time; each round records the best scorer among
    peptides whose mass equals the parent mass (``spectrum[-1]``), drops
    peptides that are too heavy, and trims the rest to the top ``N`` (with
    ties).  Returns the best peptide found as a list of masses.
    """
    masses = find_masses(spectrum, M)
    leaderboard = [[]]
    leader_peptide = []

    while leaderboard:
        candidates = Expand(leaderboard, masses)

        # Candidates with exactly the parent mass compete for the lead.
        for peptide in candidates:
            if sum(peptide) == spectrum[-1]:
                if Score(peptide, spectrum) > Score(leader_peptide, spectrum):
                    leader_peptide = peptide

        # Anything heavier than the parent mass can never fit.
        within_mass = [pep for pep in candidates if sum(pep) <= spectrum[-1]]
        leaderboard = Trim(within_mass, spectrum, N)
    return leader_peptide
43 |
44 |
if __name__ == "__main__":
    '''
    Given: An integer M, an integer N, and a collection of (possibly repeated) integers Spectrum.
    Return: A cyclic peptide LeaderPeptide with amino acids taken only from the top M elements (and ties) of the
    convolution of Spectrum that fall between 57 and 200, and where the size of Leaderboard is restricted to the top N
    (and ties).
    '''
    input_lines = sys.stdin.read().splitlines()
    M, N = int(input_lines[0]), int(input_lines[1])
    Spectrum = list(map(int, input_lines[2].split()))

    leader = convolution_cyclopeptide_sequencing(Spectrum, M, N)
    print("-".join(str(mass) for mass in leader))
58 |
--------------------------------------------------------------------------------
/solutions/BA9L.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def BWMatching_wrapper(BWT, pattern_list):
    """Count the matches of every pattern using the Burrows-Wheeler transform.

    Args:
        BWT: the last column (Burrows-Wheeler transform) as a string.
        pattern_list: iterable of patterns to look up.

    Returns:
        List with one ``BWMatching`` result per pattern, in input order.
    """
    # Label each symbol with its occurrence rank so that repeated symbols in
    # the last column can be told apart.
    seen = {}
    labelled_last = []
    for char in BWT:
        seen[char] = seen.get(char, 0) + 1
        labelled_last.append((char, seen[char]))

    # sorted() is stable, so sorting on the symbol alone keeps the occurrence
    # ranks of equal symbols in last-column order.
    labelled_first = sorted(labelled_last, key=lambda pair: pair[0])

    # Labels are unique, so a dict lookup replaces the nested linear scan
    # that made building the map quadratic.
    first_index = {label: idx for idx, label in enumerate(labelled_first)}
    last_to_first = [first_index[label] for label in labelled_last]

    return [BWMatching(BWT, pattern, last_to_first) for pattern in pattern_list]
28 |
29 |
def BWMatching(last_column, pattern, last_to_first):
    """Count the occurrences of ``pattern`` using backward search.

    Args:
        last_column: the Burrows-Wheeler transform (last column).
        pattern: the pattern to count; consumed from its last symbol to its
            first.
        last_to_first: list mapping a position in the last column to the
            position of the same symbol occurrence in the first column.

    Returns:
        Number of matches as an int. An empty ``last_column`` yields 0 (the
        previous implementation fell off the loop and returned ``None``).
    """
    top = 0
    bottom = len(last_column) - 1

    for symbol in reversed(pattern):
        # Positions of ``symbol`` inside the current [top, bottom] window,
        # in ascending order.
        hits = [idx for idx in range(top, bottom + 1) if last_column[idx] == symbol]
        if not hits:
            return 0
        top = last_to_first[hits[0]]
        bottom = last_to_first[hits[-1]]

    return bottom - top + 1
54 |
55 |
if __name__ == "__main__":
    '''
    Given: A string BWT(Text), followed by a collection of strings Patterns.
    Return: A list of integers, where the i-th integer corresponds to the number of substring matches of the i-th member
    of Patterns in Text.
    '''
    # stdin layout: line 1 = BWT, line 2 = whitespace-separated patterns.
    input_lines = sys.stdin.read().splitlines()
    BWT = input_lines[0]
    pattern_list = input_lines[1].split()

    match_nums = BWMatching_wrapper(BWT, pattern_list)
    print(' '.join(str(count) for count in match_nums))
--------------------------------------------------------------------------------
/solutions/BA11E.py:
--------------------------------------------------------------------------------
1 | import sys
aa_table = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
            'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163}
# Inverse lookup; for masses shared by two amino acids the later entry wins.
mass_table = {v: k for k, v in aa_table.items()}


def PeptideSequencing(spectral_vector):
    """Return a peptide with maximum score against a spectral vector.

    Node ``i`` (1-based) carries weight ``spectral_vector[i - 1]`` and node 0
    is the source with weight 0. An edge ``i -> j`` exists when ``j - i`` is
    an amino-acid mass. The result spells a maximum-weight path from node 0
    to the last node.

    Args:
        spectral_vector: list of integer node weights.

    Returns:
        The peptide as a string of amino-acid letters, or ``''`` when the
        vector is empty or the last node cannot be reached from node 0.
    """
    weights = [0] + list(spectral_vector)
    sink = len(weights) - 1
    unreachable = float('-inf')

    best = [unreachable] * (sink + 1)
    back = [None] * (sink + 1)
    best[0] = 0

    # Largest mass first means predecessors are visited in ascending order;
    # with the strict ">" below the lowest-numbered best predecessor wins ties.
    masses_desc = sorted(mass_table, reverse=True)

    # Visiting nodes in ascending order is a topological order because every
    # edge goes from a smaller node to a larger one.
    for node in range(1, sink + 1):
        top_score, top_edge = unreachable, None
        for mass in masses_desc:
            parent = node - mass
            if parent >= 0 and best[parent] > top_score:
                top_score = best[parent]
                top_edge = (parent, mass_table[mass])
        if top_edge is not None:
            best[node] = top_score + weights[node]
            back[node] = top_edge

    if back[sink] is None:
        # Either sink == 0 (empty vector) or no path reaches the sink.
        return ''

    # Backtrack explicitly from the sink (the last node of the vector).
    letters = []
    node = sink
    while node != 0:
        node, letter = back[node]
        letters.append(letter)
    return ''.join(reversed(letters))
51 |
52 |
if __name__ == "__main__":
    '''
    Given: A space-delimited spectral vector S.
    Return: A peptide with maximum score against S. For masses with more than one amino acid, any choice may be used.
    '''
    # The whole of stdin is one space-delimited list of integers.
    raw_input_text = sys.stdin.read().rstrip()
    spectral_vector = [int(token) for token in raw_input_text.split(' ')]

    print(PeptideSequencing(spectral_vector))
61 |
--------------------------------------------------------------------------------
/solutions/BA3F.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from re import split
3 | from random import choice
4 |
5 |
def parse_adj_list(adj_list_text):
    """Parse lines of the form ``"a -> b,c"`` into ``{"a": ["b", "c"]}``.

    A source that appears on several lines keeps only its last line.
    """
    adj_list = {}
    for line in adj_list_text:
        pieces = line.split(' -> ')
        source, targets = pieces[0], pieces[1]
        adj_list[source] = targets.split(',')
    return adj_list
12 |
13 |
def remove_edge(adj_list, from_node, to_node):
    """Remove one ``from_node -> to_node`` edge from ``adj_list`` in place.

    When the source has no outgoing edges left its key is deleted. The same
    (mutated) dictionary is returned.
    """
    targets = adj_list[from_node]
    targets.remove(to_node)
    if len(targets) == 0:
        adj_list.pop(from_node)
    return adj_list
19 |
20 |
def Eulerian_cycle(adj_list):
    """Build an Eulerian cycle by repeatedly splicing random closed walks.

    ``adj_list`` maps a node to the list of its successors. Edges are consumed
    through ``remove_edge`` as they are walked, so the dictionary passed in is
    emptied by the time the function returns.

    Returns:
        The cycle as a list of nodes whose first and last entries coincide.
    """
    # Initial closed walk from a randomly chosen node.
    origin, out_edges = choice(list(adj_list.items()))
    Cycle = [origin]
    here = origin
    step = choice(out_edges)
    while True:
        adj_list = remove_edge(adj_list, here, step)
        Cycle.append(step)
        here = step
        if here == origin:
            break
        step = choice(adj_list[here])

    # While unused edges remain, restart from a cycle node that still has some.
    while adj_list:
        candidates = [(pos, node) for pos, node in enumerate(Cycle) if node in adj_list]
        pos, restart = choice(candidates)

        # Rotate the cycle so that it begins and ends at ``restart``.
        rotated = Cycle[pos:] + Cycle[1:pos + 1]

        here = restart
        while True:
            step = choice(adj_list[here])
            adj_list = remove_edge(adj_list, here, step)
            rotated.append(step)
            here = step
            if here == restart:
                break
        Cycle = rotated
    return Cycle
55 |
56 |
if __name__ == "__main__":
    '''
    Given: An Eulerian directed graph, in the form of an adjacency list.
    Return: An Eulerian cycle in this graph.
    '''
    # Every stdin line is one "node -> successors" adjacency entry.
    Adj_list = parse_adj_list(sys.stdin.read().splitlines())

    cycle = Eulerian_cycle(Adj_list)
    print("->".join(cycle))
66 |
--------------------------------------------------------------------------------
/solutions/BA3M.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA3F import parse_adj_list, remove_edge
3 |
4 |
def maximal_non_branching_paths(adj_list):
    """Collect every maximal non-branching path of a directed graph.

    ``adj_list`` maps a node to the list of its successors. Edges are removed
    through ``remove_edge`` while paths are extracted, so the dictionary is
    consumed by the call.

    Returns:
        List of paths (each a list of nodes); isolated cycles come last and
        start and end on the same node.
    """
    paths = []

    # degrees[node] == [in_degree, out_degree]
    degrees = {}
    for source, targets in adj_list.items():
        degrees.setdefault(source, [0, 0])[1] += len(targets)
        for target in targets:
            degrees.setdefault(target, [0, 0])[0] += 1

    one_in_one_out = [1, 1]

    # Paths start at every node that is not 1-in-1-out and has outgoing edges.
    for v in list(degrees):
        if degrees[v] == one_in_one_out or degrees[v][1] <= 0:
            continue
        while v in adj_list:
            w = adj_list[v][0]
            path = [v, w]
            adj_list = remove_edge(adj_list, v, w)
            # Keep extending through 1-in-1-out nodes.
            while degrees[w] == one_in_one_out:
                u = adj_list[w][0]
                path.append(u)
                adj_list = remove_edge(adj_list, w, u)
                w = u
            paths.append(path)

    # Whatever edges remain belong to isolated cycles.
    while adj_list:
        start = list(adj_list)[0]
        cycle = [start]
        node = start
        while True:
            following = adj_list[node][0]
            cycle.append(following)
            adj_list = remove_edge(adj_list, node, following)
            node = following
            if node == start:
                break
        paths.append(cycle)

    return paths
51 |
52 |
if __name__ == "__main__":
    '''
    Given: The adjacency list of a graph whose nodes are integers.
    Return: The collection of all maximal non-branching paths in the graph.
    '''
    # Every stdin line is one "node -> successors" adjacency entry.
    Adj_list = parse_adj_list(sys.stdin.read().splitlines())

    for path in maximal_non_branching_paths(Adj_list):
        print("->".join(path))
64 |
--------------------------------------------------------------------------------
/solutions/BA4E.py:
--------------------------------------------------------------------------------
1 | import sys
2 | MASSES = [57, 71, 87, 97, 99, 101, 103, 113, 114, 115, 128, 129, 131, 137, 147, 156, 163, 186]
3 |
4 |
def cyclospectrum_mass_peptide(peptide):
    """Return the sorted cyclic spectrum of a peptide given as a list of masses.

    The spectrum holds 0, the total mass, and the mass of every cyclic
    subpeptide of length 1 .. len(peptide) - 1.
    """
    size = len(peptide)
    doubled = peptide + peptide  # lets wrap-around subpeptides be plain slices
    spec = [0, sum(peptide)]
    spec.extend(
        sum(doubled[start:start + length])
        for length in range(1, size)
        for start in range(size)
    )
    return sorted(spec)
14 |
15 |
def LinearSpectrum(Peptide):
    """Return the sorted linear spectrum of a peptide given as a list of masses.

    The spectrum holds 0 plus the mass of every contiguous subpeptide,
    computed as differences of prefix masses.
    """
    prefix = [0]
    for mass in Peptide:
        prefix.append(prefix[-1] + mass)

    size = len(Peptide)
    spectrum = [0] + [
        prefix[end] - prefix[begin]
        for begin in range(size)
        for end in range(begin + 1, size + 1)
    ]
    return sorted(spectrum)
27 |
28 |
def expand(peptides):
    """Extend every peptide by every mass in the module-level ``MASSES``.

    Returns a new list holding ``pep + [mass]`` for each peptide (outer
    order) and each mass (inner order).
    """
    return [pep + [mass] for pep in peptides for mass in MASSES]
35 |
36 |
def Consistent(Peptide, Spectrum):
    """Tell whether a linear peptide may still grow into a solution.

    A peptide is rejected when its mass exceeds the parent mass
    (``Spectrum[-1]``) minus the smallest amino-acid mass, or when any mass
    of its linear spectrum is absent from ``Spectrum``.
    """
    too_heavy = sum(Peptide) > Spectrum[-1] - MASSES[0]
    if too_heavy:
        return False
    return all(mass in Spectrum for mass in LinearSpectrum(Peptide))
45 |
46 |
def cyclopeptide_sequencing(spectrum):
    """Branch-and-bound search for peptides whose cyclospectrum is ``spectrum``.

    Args:
        spectrum: sorted list of integer masses; the last element is used as
            the parent mass.

    Returns:
        Set of peptides, each written as its masses joined by ``"-"``.
    """
    result = set()
    parent_mass = spectrum[-1]
    peptides = [[]]
    while peptides:
        # Classify each expanded candidate once instead of rebuilding the
        # whole candidate list for every peptide that is dropped.
        survivors = []
        for peptide in expand(peptides):
            if sum(peptide) == parent_mass:
                # Full-mass peptides are final: report on a match, never extend.
                if cyclospectrum_mass_peptide(peptide) == spectrum:
                    result.add("-".join(map(str, peptide)))
            elif Consistent(peptide, spectrum):
                survivors.append(peptide)
        peptides = survivors
    return result
60 |
61 |
if __name__ == "__main__":
    '''
    Given: A collection of (possibly repeated) integers Spectrum corresponding to an ideal experimental spectrum.
    Return: Every amino acid string Peptide such that Cyclospectrum(Peptide) = Spectrum (if such a string exists).
    '''
    # Only the first stdin line is read; it holds the spectrum.
    first_line = sys.stdin.readline().strip()
    spectrum = [int(token) for token in first_line.split()]

    found = cyclopeptide_sequencing(spectrum)
    print(" ".join(found))
70 |
--------------------------------------------------------------------------------
/solutions/BA7A.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import queue
3 |
4 |
class Node:
    """A tree vertex: its label plus a set of ``(neighbour_label, weight)`` pairs."""

    def __init__(self, label):
        self.label = label
        self.linked_nodes = set()


class Tree:
    """Undirected weighted tree stored as a ``label -> Node`` dictionary."""

    def __init__(self):
        self.nodes_dict = {}

    def add_node(self, label):
        """Return the node registered under ``label``, creating it if needed."""
        if label in self.nodes_dict:
            return self.nodes_dict[label]

        node = Node(label)
        self.nodes_dict[label] = node
        return node

    def construct_tree(self, adj_list):
        """Populate the tree from lines of the form ``"a->b:weight"``.

        Labels and weights are parsed as integers and every edge is stored in
        both directions. Blank lines are ignored.
        """
        for line in adj_list:
            if not line.strip():
                continue
            labels, weight = line.split(":")
            weight = int(weight)
            label1, label2 = [int(x) for x in labels.split("->")]

            node1 = self.add_node(label1)
            node2 = self.add_node(label2)

            node1.linked_nodes.add((label2, weight))
            node2.linked_nodes.add((label1, weight))

    def _distances_from(self, source):
        """Return ``{label: path length from source}`` for all reachable nodes."""
        # Dictionaries keyed by label work for any hashable labels, not only
        # for labels that happen to be 0 .. len(nodes) - 1.
        distance = {source: 0}
        frontier = [source]
        head = 0  # index of the next node to expand (FIFO without pop(0))
        while head < len(frontier):
            current = frontier[head]
            head += 1
            for neighbour, weight in self.nodes_dict[current].linked_nodes:
                if neighbour not in distance:
                    distance[neighbour] = distance[current] + weight
                    frontier.append(neighbour)
        return distance

    def distance(self, label_a, label_b):
        """Return the path length between two nodes (0 if ``label_b`` is unreachable)."""
        return self._distances_from(label_a).get(label_b, 0)

    def distance_matrix_between_leaves(self, n_leaves):
        """Return the ``n_leaves x n_leaves`` distance matrix for labels 0 .. n_leaves - 1."""
        distance_mat = []
        for i in range(n_leaves):
            # One traversal per row instead of one per matrix entry.
            from_i = self._distances_from(i)
            distance_mat.append([from_i.get(j, 0) for j in range(n_leaves)])
        return distance_mat
60 |
61 |
if __name__ == "__main__":
    '''
    Given: An integer n followed by the adjacency list of a weighted tree with n leaves.
    Return: A space-separated n x n (di, j), where di, j is the length of the path between leaves i and j.
    '''
    # stdin layout: line 1 = n, remaining lines = "a->b:weight" edges.
    input_lines = sys.stdin.read().splitlines()
    n = int(input_lines[0])
    adj_list = input_lines[1:]

    t = Tree()
    t.construct_tree(adj_list)
    for row in t.distance_matrix_between_leaves(n):
        print(" ".join(str(entry) for entry in row))
--------------------------------------------------------------------------------
/solutions/BA9M.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def BetterBWMatching_wrapper(BWT, pattern_list):
    """Run ``BetterBWMatching`` for every pattern against one BWT string.

    ``FirstOccurrence`` maps each symbol to the index of its first
    appearance in the sorted BWT (the first column).

    Returns:
        List with one match count per pattern, in input order.
    """
    FirstOccurrence = {}
    for idx, symbol in enumerate(sorted(BWT)):
        # setdefault keeps the earliest index seen for each symbol.
        FirstOccurrence.setdefault(symbol, idx)

    return [BetterBWMatching(FirstOccurrence, BWT, pattern) for pattern in pattern_list]
15 |
16 |
def Countsymbol(idx, LastColumn, symbol):
    """Return how many times ``symbol`` occurs in ``LastColumn[:idx]``."""
    prefix = LastColumn[:idx]
    return prefix.count(symbol)
19 |
20 |
def BetterBWMatching(FirstOccurrence, LastColumn, pattern):
    """Count the occurrences of ``pattern`` with first-occurrence backward search.

    Pseudocode being implemented:

        top <- 0
        bottom <- |LastColumn| - 1
        while top <= bottom
            if Pattern is nonempty
                symbol <- last letter in Pattern
                remove last letter from Pattern
                if positions top..bottom in LastColumn contain symbol
                    top <- FirstOccurrence(symbol) + Count_symbol(top, LastColumn)
                    bottom <- FirstOccurrence(symbol) + Count_symbol(bottom + 1, LastColumn) - 1
                else
                    return 0
            else
                return bottom - top + 1

    Args:
        FirstOccurrence: dict mapping a symbol to the index of its first
            appearance in the first column.
        LastColumn: the Burrows-Wheeler transform (last column).
        pattern: the pattern to count.

    Returns:
        Number of matches as an int. An empty ``LastColumn`` yields 0 (the
        previous implementation fell off the loop and returned ``None``).
    """
    top = 0
    bottom = len(LastColumn) - 1

    for symbol in reversed(pattern):
        if symbol not in LastColumn[top:bottom + 1]:
            return 0
        first = FirstOccurrence[symbol]
        # Both counts refer to the window before this update; neither
        # depends on the other's new value.
        new_top = first + LastColumn[:top].count(symbol)
        new_bottom = first + LastColumn[:bottom + 1].count(symbol) - 1
        top, bottom = new_top, new_bottom

    return bottom - top + 1
55 |
56 |
if __name__ == "__main__":
    '''
    Given: A string BWT(Text), followed by a collection of strings Patterns.
    Return: A list of integers, where the i-th integer corresponds to the number of substring matches of the i-th member
    of Patterns in Text.
    '''
    # stdin layout: line 1 = BWT, line 2 = patterns separated by single spaces.
    tmp = sys.stdin.read().splitlines()
    BWT, patterns_line = tmp[0], tmp[1]
    pattern_list = patterns_line.split(' ')

    match_nums = BetterBWMatching_wrapper(BWT, pattern_list)
    print(' '.join(map(str, match_nums)))
69 |
--------------------------------------------------------------------------------
/solutions/BA5H.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def fitting_alignment(str1, str2, indel_penalty=1):
    """Best fitting alignment of all of ``str2`` against a substring of ``str1``.

    Matches score +1, mismatches -1 and each indel costs ``indel_penalty``.

    Args:
        str1: the longer text; the alignment may start and end anywhere in it.
        str2: the pattern; it must be aligned in full.
        indel_penalty: cost of a gap symbol in either string.

    Returns:
        ``(score, aligned_str1, aligned_str2)`` where the aligned strings
        use ``"-"`` for gaps.
    """
    str1 = "-" + str1
    str2 = "-" + str2
    rows, cols = len(str1), len(str2)

    score_mat = [[0] * cols for _ in range(rows)]
    backtrack_mat = [[None] * cols for _ in range(rows)]

    # Column 0 stays 0 / None: starting anywhere in str1 is free.
    # Row 0 must pay for every leading symbol of str2, because str2 has to be
    # aligned completely; leaving this row at 0 let prefixes be skipped for free.
    for j in range(1, cols):
        score_mat[0][j] = score_mat[0][j - 1] - indel_penalty
        backtrack_mat[0][j] = "l"

    for i in range(1, rows):
        for j in range(1, cols):
            diagonal = score_mat[i - 1][j - 1] + (1 if str1[i] == str2[j] else -1)
            up = score_mat[i - 1][j] - indel_penalty
            left = score_mat[i][j - 1] - indel_penalty
            best = max(diagonal, up, left)
            score_mat[i][j] = best
            if best == diagonal:
                backtrack_mat[i][j] = "d"
            elif best == up:
                backtrack_mat[i][j] = "u"
            else:
                backtrack_mat[i][j] = "l"

    # The alignment ends in the last column (all of str2 consumed) at the
    # best-scoring row. Every row is a candidate, including the final one.
    j = cols - 1
    i = max(range(rows), key=lambda row: score_mat[row][j])
    max_score = score_mat[i][j]

    # Backtrack until a free-start cell (column 0) is reached.
    pieces_1, pieces_2 = [], []
    while backtrack_mat[i][j] is not None:
        direction = backtrack_mat[i][j]
        if direction == "d":
            pieces_1.append(str1[i])
            pieces_2.append(str2[j])
            i -= 1
            j -= 1
        elif direction == "u":
            pieces_1.append(str1[i])
            pieces_2.append("-")
            i -= 1
        else:
            pieces_1.append("-")
            pieces_2.append(str2[j])
            j -= 1

    aligned_1 = "".join(reversed(pieces_1))
    aligned_2 = "".join(reversed(pieces_2))
    return max_score, aligned_1, aligned_2
46 |
47 |
if __name__ == "__main__":
    '''
    Given: Two DNA strings v and w, where v has length at most 10000 and w has length at most 1000.
    Return: The maximum score of a fitting alignment of v and w, followed by a fitting alignment achieving this maximum
    score. Use the simple scoring method in which matches count +1 and both the mismatch and indel penalties are equal
    to 1. (If multiple fitting alignments achieving the maximum score exist, you may return any one.)
    '''
    # stdin layout: line 1 = v, line 2 = w.
    input_lines = sys.stdin.read().splitlines()
    string1, string2 = input_lines[0], input_lines[1]

    score, alignment1, alignment2 = fitting_alignment(string1, string2)
    for output_line in (score, alignment1, alignment2):
        print(output_line)
63 |
--------------------------------------------------------------------------------
/solutions/BA2D.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA1G import hamming_dist
3 |
4 |
def probability(pattern, profile):
    """Multiply the profile entries selected by ``pattern``.

    ``profile[nuc][i]`` is the weight of nucleotide ``nuc`` at column ``i``;
    the product over all columns of ``pattern`` is returned (1 for an empty
    pattern).
    """
    product = 1
    for column in range(len(pattern)):
        product = product * profile[pattern[column]][column]
    return product
10 |
11 |
def profile_most_probable_kmer(text, profile, k):
    """Return the first k-mer of ``text`` with the highest profile probability.

    Ties keep the earliest k-mer because only a strictly larger probability
    replaces the current best.
    """
    max_prob = -1
    last_start = len(text) - k
    start = 0
    while start <= last_start:
        candidate = text[start:start + k]
        candidate_prob = probability(candidate, profile)
        if candidate_prob > max_prob:
            max_prob, result = candidate_prob, candidate
        start += 1
    return result
21 |
def GreedyMotifSearch(DNA_list, k, t):
    """Greedy motif search over ``t`` DNA strings for motifs of length ``k``.

    Starts from the first k-mer of every string, then for each k-mer of the
    first string greedily picks the profile-most-probable k-mer in each of
    the remaining strings, keeping the motif set with the lowest score.
    """
    BestMotifs = [dna[:k] for dna in DNA_list]
    LowestScore = CalculateScore(BestMotifs)

    first_string = DNA_list[0]
    for start in range(len(first_string) - k + 1):
        Motifs = [first_string[start:start + k]]
        for row in range(1, t):
            current_profile = FormProfile(Motifs)
            Motifs.append(profile_most_probable_kmer(DNA_list[row], current_profile, k))

        CurrentScore = CalculateScore(Motifs)
        if CurrentScore < LowestScore:
            BestMotifs, LowestScore = Motifs, CurrentScore
    return BestMotifs
36 |
37 |
def FormProfile(TextList):
    """Count nucleotides per column over a list of equal-length motifs.

    A single motif that is not wrapped in a list is accepted as well.
    Returns ``{'A': [...], 'C': [...], 'G': [...], 'T': [...]}`` with one
    count per column.
    """
    if type(TextList) is not list:
        TextList = [TextList]
    width = len(TextList[0])
    profile = {nuc: [0] * width for nuc in 'ACGT'}
    for motif in TextList:
        for column in range(width):
            profile[motif[column]][column] += 1
    return profile
48 |
49 |
def CalculateScore(Motifs):
    """Sum of Hamming distances between each motif and the consensus string.

    The consensus takes, per column, the most frequent nucleotide; ties go
    to the first of A, C, G, T.
    """
    width = len(Motifs[0])
    profile = FormProfile(Motifs)

    # max() returns the first maximal item, which gives the A, C, G, T
    # priority on ties.
    consensus = ''.join(
        max('ACGT', key=lambda nuc: profile[nuc][column])
        for column in range(width)
    )
    return sum(hamming_dist(consensus, motif) for motif in Motifs)
65 |
66 |
if __name__ == "__main__":
    '''
    Given: Integers k and t, followed by a collection of strings Dna.
    Return: A collection of strings BestMotifs resulting from running GreedyMotifSearch(Dna, k, t). If at any step you
    find more than one Profile-most probable k-mer in a given string, use the one occurring first.
    '''
    # stdin layout: line 1 = "k t", remaining lines = DNA strings.
    input_lines = sys.stdin.read().splitlines()
    k, t = (int(token) for token in input_lines[0].split())
    DNA_list = input_lines[1:]

    best_motifs = GreedyMotifSearch(DNA_list, k, t)
    print("\n".join(best_motifs))
77 |
--------------------------------------------------------------------------------
/solutions/BA5I.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def overlap_alignment(str1, str2):
    """Best overlap alignment of a suffix of ``str1`` with a prefix of ``str2``.

    Matches score +1; mismatches and indels cost 2.

    Returns:
        ``(score, aligned_suffix_of_str1, aligned_prefix_of_str2)`` where
        the aligned strings use ``"-"`` for gaps.
    """
    str1 = "-" + str1
    str2 = "-" + str2
    n_rows, n_cols = len(str1), len(str2)

    score_mat = [[0] * n_cols for _ in range(n_rows)]
    backtrack_mat = [[None] * n_cols for _ in range(n_rows)]

    # Row 0: the prefix of str2 has to be paid for with gaps.
    # Column 0 stays 0 / None so the suffix of str1 may start anywhere.
    for col in range(1, n_cols):
        score_mat[0][col] = score_mat[0][col - 1] - 2
        backtrack_mat[0][col] = "l"

    for row in range(1, n_rows):
        for col in range(1, n_cols):
            match_bonus = 1 if str1[row] == str2[col] else -2
            options = (
                (score_mat[row - 1][col - 1] + match_bonus, "d"),
                (score_mat[row - 1][col] - 2, "u"),
                (score_mat[row][col - 1] - 2, "l"),
            )
            # max() keeps the first maximal option: diagonal, then up, then left.
            best_score, best_move = max(options, key=lambda option: option[0])
            score_mat[row][col] = best_score
            backtrack_mat[row][col] = best_move

    # End in the last row at the first column holding the best score.
    i = n_rows - 1
    last_row = score_mat[i]
    max_score = max(last_row)
    j = last_row.index(max_score)

    pieces_1, pieces_2 = [], []
    while backtrack_mat[i][j] is not None:
        move = backtrack_mat[i][j]
        if move == "d":
            pieces_1.append(str1[i])
            pieces_2.append(str2[j])
            i -= 1
            j -= 1
        elif move == "u":
            pieces_1.append(str1[i])
            pieces_2.append("-")
            i -= 1
        else:
            pieces_1.append("-")
            pieces_2.append(str2[j])
            j -= 1

    aligned_1 = "".join(reversed(pieces_1))
    aligned_2 = "".join(reversed(pieces_2))
    return max_score, aligned_1, aligned_2
51 |
52 |
if __name__ == "__main__":
    '''
    Given: Two protein strings v and w, each of length at most 1000.
    Return: The score of an optimal overlap alignment of v and w, followed by an alignment of a suffix v’ of v and a
    prefix w’ of w achieving this maximum score. Use an alignment score in which matches count +1 and both the mismatch
    and indel penalties are 2. (If multiple overlap alignments achieving the maximum score exist, you may return any
    one.)
    '''
    # stdin layout: line 1 = v, line 2 = w.
    input_lines = sys.stdin.read().splitlines()
    string1, string2 = input_lines[0], input_lines[1]

    score, alignment1, alignment2 = overlap_alignment(string1, string2)
    for output_line in (score, alignment1, alignment2):
        print(output_line)
69 |
--------------------------------------------------------------------------------
/solutions/BA10I.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA10H import print_matrices, HMMParameterEstimation
3 | from BA10C import Viterbi
4 |
5 |
def Viterbi_Learning(x, init_transition_matrix, init_emission_matrix, alphabet, all_states, max_iterations):
    """Alternate Viterbi decoding and parameter estimation.

    Starting from the initial matrices, each of the ``max_iterations`` rounds
    decodes a hidden path for ``x`` with ``Viterbi`` and re-estimates both
    matrices from that path with ``HMMParameterEstimation``.

    Returns:
        ``(transition_matrix, emission_matrix)`` after the last round.
    """
    parameters = (init_transition_matrix, init_emission_matrix)

    for _ in range(max_iterations):
        current_transition, current_emission = parameters
        # Decode with the current parameters, then refit them to the path.
        path = Viterbi(x, all_states, current_transition, current_emission)
        parameters = HMMParameterEstimation(x, path, alphabet, all_states)

    transition_matrix, emission_matrix = parameters
    return transition_matrix, emission_matrix
18 |
19 |
if __name__ == "__main__":
    '''
    Given: A sequence of emitted symbols x = x1 ... xn in an alphabet A, generated by a k-state HMM with unknown
    transition and emission probabilities, initial Transition and Emission matrices and a number of iterations i.
    Return: A matrix of transition probabilities Transition and a matrix of emission probabilities Emission that
    maximizes Pr(x, π) over all possible transition and emission matrices and over all hidden paths π.
    '''

    tmp = sys.stdin.read().splitlines()

    # The iteration count is parsed as an int and kept under its own name.
    # It used to live in ``j`` as a string and was then overwritten by the
    # column loops below, so the wrong number of iterations was passed on.
    max_iterations = int(tmp[0])
    x = tmp[2]
    alphabet = tmp[4].split()
    all_states = tmp[6].split()

    init_transition_matrix = {}
    init_emission_matrix = {}

    # initial transition matrix: header on line 8, one row per state after it
    col_syms = tmp[8].split('\t')[1:]
    transition_end = 8 + len(all_states)

    for i in range(9, transition_end + 1):
        current_line = tmp[i].rstrip().split('\t')
        row_sym = current_line[0]
        init_transition_matrix[row_sym] = {}
        for col in range(1, len(current_line)):
            init_transition_matrix[row_sym][col_syms[col - 1]] = float(current_line[col])

    # emission matrix: header two lines after the transition block
    col_syms = tmp[transition_end + 2].split('\t')[1:]

    for i in range(transition_end + 3, len(tmp)):
        current_line = tmp[i].rstrip().split('\t')
        row_sym = current_line[0]
        init_emission_matrix[row_sym] = {}
        for col in range(1, len(current_line)):
            init_emission_matrix[row_sym][col_syms[col - 1]] = float(current_line[col])

    transition_matrix, emission_matrix = Viterbi_Learning(x, init_transition_matrix, init_emission_matrix, alphabet,
                                                          all_states, max_iterations)

    print_matrices(transition_matrix, emission_matrix)
--------------------------------------------------------------------------------
/solutions/BA11B.py:
--------------------------------------------------------------------------------
1 | import sys
aa_table = {'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
            'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163}
# Inverse lookup; for masses shared by two amino acids the later entry wins.
mass_table = {v: k for k, v in aa_table.items()}


def SpectrumGraph(spectrum):
    """Build the spectrum graph as a list of ``[from_mass, to_mass, amino_acid]``.

    An edge is emitted for every ordered pair of spectrum entries (by
    position) whose mass difference is the mass of an amino acid.
    """
    adj_list = []
    for position, low in enumerate(spectrum):
        for high in spectrum[position:]:
            gap = high - low
            if gap in mass_table:
                adj_list.append([low, high, mass_table[gap]])
    return adj_list
14 |
15 |
def IdealSpectrum(Peptide):
    """Return the sorted masses of all contiguous subpeptides of ``Peptide``.

    ``Peptide`` is a string of amino-acid letters looked up in ``aa_table``.
    The result always contains 0.
    """
    prefix = [0]
    for letter in Peptide:
        prefix.append(prefix[-1] + aa_table[letter])

    size = len(Peptide)
    masses = [0] + [
        prefix[end] - prefix[begin]
        for begin in range(size)
        for end in range(begin + 1, size + 1)
    ]
    return sorted(masses)
27 |
28 |
def Paths(adj_list):
    """Spell every maximal path that starts at node 0 of a spectrum graph.

    Args:
        adj_list: list of ``[from_node, to_node, label]`` edges. The graph is
            expected to be acyclic, which holds when every edge leads to a
            strictly larger mass.

    Returns:
        List of strings, one per path from node 0 to a node without outgoing
        edges, each made of the edge labels along the path. Paths appear in
        depth-first order with edges tried in ``adj_list`` order. An empty
        list is returned when node 0 has no outgoing edge.
    """
    # Group the edges by source once instead of rescanning the edge list at
    # every step.
    outgoing = {}
    for edge in adj_list:
        outgoing.setdefault(edge[0], []).append((edge[1], edge[2]))

    if 0 not in outgoing:
        return []

    peptide_list = []
    # Each stack entry carries its own prefix, so a branch can never resume
    # with the prefix that belongs to another branch.
    stack = [(0, '')]
    while stack:
        node, peptide = stack.pop()
        edges = outgoing.get(node)
        if not edges:
            # Dead end: the spelled prefix is a complete path.
            peptide_list.append(peptide)
            continue
        # Push in reverse so that the first edge is explored first.
        for target, label in reversed(edges):
            stack.append((target, peptide + label))
    return peptide_list
56 |
57 |
def DecodingIdealSpectrum(spectrum):
    """Return the first path peptide whose ideal spectrum covers ``spectrum``.

    Candidate peptides come from ``Paths`` over the spectrum graph; ``None``
    is returned when no candidate qualifies.
    """
    candidates = Paths(SpectrumGraph(spectrum))
    matching = (
        peptide for peptide in candidates
        if set(spectrum).issubset(IdealSpectrum(peptide))
    )
    return next(matching, None)
64 |
65 |
if __name__ == "__main__":
    '''
    Given: A space-delimited list of integers, Spectrum.
    Return: An amino acid string with an ideal spectrum that matches Spectrum.
    '''
    # Mass 0 is prepended so the spectrum graph has its source node.
    raw_input_text = sys.stdin.read().rstrip()
    spectrum = [0] + [int(token) for token in raw_input_text.split()]

    print(DecodingIdealSpectrum(spectrum))
--------------------------------------------------------------------------------
/solutions/BA2E.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA1G import hamming_dist
3 |
4 |
def probability(pattern, profile):
    """Multiply the profile entries selected by ``pattern``.

    ``profile[nuc][i]`` is the weight of nucleotide ``nuc`` at column ``i``;
    the product over all columns of ``pattern`` is returned (1 for an empty
    pattern).
    """
    product = 1
    for column in range(len(pattern)):
        product = product * profile[pattern[column]][column]
    return product
10 |
11 |
def profile_most_probable_kmer(text, profile, k):
    """Return the first k-mer of ``text`` with the highest profile probability.

    Ties keep the earliest k-mer because only a strictly larger probability
    replaces the current best.
    """
    max_prob = -1
    last_start = len(text) - k
    start = 0
    while start <= last_start:
        candidate = text[start:start + k]
        candidate_prob = probability(candidate, profile)
        if candidate_prob > max_prob:
            max_prob, result = candidate_prob, candidate
        start += 1
    return result
21 |
def GreedyMotifSearch(DNA_list, k, t):
    """Greedy motif search (profiles built with pseudocounts).

    Starts from the first k-mer of every string, then for each k-mer of the
    first string greedily picks the profile-most-probable k-mer in each of
    the remaining strings, keeping the motif set with the lowest score.
    """
    BestMotifs = [dna[:k] for dna in DNA_list]
    LowestScore = CalculateScore(BestMotifs)

    first_string = DNA_list[0]
    for start in range(len(first_string) - k + 1):
        Motifs = [first_string[start:start + k]]
        for row in range(1, t):
            current_profile = FormProfile(Motifs)
            Motifs.append(profile_most_probable_kmer(DNA_list[row], current_profile, k))

        CurrentScore = CalculateScore(Motifs)
        if CurrentScore < LowestScore:
            BestMotifs, LowestScore = Motifs, CurrentScore
    return BestMotifs
36 |
37 |
def FormProfile(TextList, pseudocount=1):
    """Count nucleotides per column, starting every cell at ``pseudocount``.

    A single motif that is not wrapped in a list is accepted as well.
    Returns ``{'A': [...], 'C': [...], 'G': [...], 'T': [...]}`` with one
    count per column.
    """
    if type(TextList) is not list:
        TextList = [TextList]
    width = len(TextList[0])
    profile = {nuc: [pseudocount] * width for nuc in 'ACGT'}
    for motif in TextList:
        for column in range(width):
            profile[motif[column]][column] += 1
    return profile
48 |
49 |
def CalculateScore(Motifs):
    """Sum of Hamming distances between each motif and the consensus string.

    The consensus takes, per column, the most frequent nucleotide; ties go
    to the first of A, C, G, T.
    """
    width = len(Motifs[0])
    profile = FormProfile(Motifs)

    # max() returns the first maximal item, which gives the A, C, G, T
    # priority on ties.
    consensus = ''.join(
        max('ACGT', key=lambda nuc: profile[nuc][column])
        for column in range(width)
    )
    return sum(hamming_dist(consensus, motif) for motif in Motifs)
65 |
66 |
if __name__ == "__main__":
    '''
    Given: Integers k and t, followed by a collection of strings Dna.
    Return: A collection of strings BestMotifs resulting from running GreedyMotifSearch(Dna, k, t) with pseudocounts.
    If at any step you find more than one Profile-most probable k-mer in a given string, use the one occurring first.
    '''
    # stdin layout: line 1 = "k t", remaining lines = DNA strings.
    input_lines = sys.stdin.read().splitlines()
    k, t = (int(token) for token in input_lines[0].split())
    DNA_list = input_lines[1:]

    best_motifs = GreedyMotifSearch(DNA_list, k, t)
    print("\n".join(best_motifs))
77 |
--------------------------------------------------------------------------------
/solutions/BA8E.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def HierarchicalClustering(distance_matrix, agg_method='average'):
    """Agglomerative clustering over a square distance matrix.

    Args:
        distance_matrix: ``n x n`` list of lists of pairwise distances.
        agg_method: ``'average'``, ``'min'`` or ``'max'`` linkage.

    Returns:
        The clusters created by the successive merges, in creation order;
        each cluster is a list of 0-based element indices. Empty for
        matrices with fewer than two elements.

    Raises:
        Exception: when two clusters have to be compared and ``agg_method``
            is not one of the supported names.
    """
    def linkage(left, right):
        # Distance between two clusters under the selected linkage.
        pairwise = [distance_matrix[a][b] for a in left for b in right]
        if agg_method == 'average':
            return sum(pairwise) / (len(left) * len(right))
        if agg_method == 'min':
            return min(pairwise)
        if agg_method == 'max':
            return max(pairwise)
        raise Exception('Agglomeration method not implemented!')

    clusters = [[i] for i in range(len(distance_matrix))]
    new_clusters_list = []

    # "> 1" rather than "!= 1" so that an empty matrix simply yields no merges.
    while len(clusters) > 1:
        # Find the two closest clusters; the first pair wins ties.
        min_dist = float('inf')
        closest = None
        for i in range(len(clusters) - 1):
            for j in range(i + 1, len(clusters)):
                dist = linkage(clusters[i], clusters[j])
                if dist < min_dist:
                    min_dist = dist
                    closest = (i, j)
        if closest is None:
            # No finite distance is left to compare; nothing more to merge.
            break

        # Merge the two closest clusters, dropping them by position.
        first, second = closest
        new_cluster = clusters[first] + clusters[second]
        clusters = [clu for pos, clu in enumerate(clusters) if pos not in closest]
        clusters.append(new_cluster)
        new_clusters_list.append(new_cluster)
    return new_clusters_list
47 |
48 |
if __name__ == "__main__":
    # Given: An integer n, followed by an nxn distance matrix.
    # Return: The result of applying HierarchicalClustering to this distance
    # matrix (using Davg), with each newly created cluster listed on each line.
    raw_lines = sys.stdin.read().splitlines()
    n = int(raw_lines[0])

    # Rows are split on single spaces, exactly as the input format provides.
    distance_matrix = [[float(cell) for cell in row.split(' ')]
                       for row in raw_lines[1:]]

    new_clusters_list = HierarchicalClustering(distance_matrix, 'average')
    for merged in new_clusters_list:
        # Output is 1-based.
        labels = [str(member + 1) for member in merged]
        print(' '.join(labels))
--------------------------------------------------------------------------------
/solutions/BA11G.py:
--------------------------------------------------------------------------------
1 | import sys
# Integer mass used for each one-letter amino acid code.
aa_table = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57,
    'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
    'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87,
    'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163,
}
4 |
5 |
def is_number(n):
    """Return True when ``float(n)`` succeeds, False when it raises ValueError."""
    try:
        float(n)
    except ValueError:
        # Not parseable as a floating point literal.
        return False
    else:
        return True
14 |
15 |
def PeptideVector(peptide):
    """Convert a peptide, given as a sequence of masses, to its peptide vector.

    The vector has one entry per unit of total mass; entry ``m - 1`` is 1 for
    every prefix mass ``m`` of the peptide and 0 elsewhere.

    Args:
        peptide: sequence of integer amino-acid masses.

    Returns:
        A list of 0/1 integers whose length equals the total peptide mass.
        An empty peptide yields an empty vector.
    """
    if len(peptide) == 0:
        return []

    # Running total gives every prefix mass in a single pass.
    prefix_masses = []
    running_mass = 0
    for mass in peptide:
        running_mass += mass
        prefix_masses.append(running_mass)

    vector = [0] * prefix_masses[-1]
    for mass in prefix_masses:
        vector[mass - 1] = 1
    return vector
24 |
25 |
def PeptideIdentification(spectral_vector, proteome):
    """Find the substring of ``proteome`` that best matches ``spectral_vector``.

    Every candidate peptide of length >= 2 whose peptide vector has the same
    length as ``spectral_vector`` is scored as the sum of the spectral entries
    at the positions where the peptide vector is 1.

    Args:
        spectral_vector: sequence of integer intensities.
        proteome: amino-acid string; each symbol must be a key of ``aa_table``.

    Returns:
        ``[best_peptide, max_score]``. When no candidate has the right mass,
        this is ``['', -1e6]``.
    """
    max_score = -1e6
    best_peptide = ''
    target_len = len(spectral_vector)
    mass_list = [aa_table[aa] for aa in proteome]

    for i in range(len(mass_list)):
        # NOTE(review): candidates start at length 2, so single-residue
        # peptides are never considered — kept as in the original.
        k = 2
        # "<=" so that peptides ending on the final residue are scored too;
        # the slice mass_list[i:i + k] is valid up to i + k == len(mass_list).
        while i + k <= len(mass_list):
            pep_vec = PeptideVector(mass_list[i:i + k])
            if len(pep_vec) > target_len:
                # Longer peptides from this start can only be heavier.
                break
            if len(pep_vec) == target_len:
                score = 0
                for bit, intensity in zip(pep_vec, spectral_vector):
                    if bit == 1:
                        score += intensity
                if score > max_score:
                    max_score = score
                    best_peptide = proteome[i:i + k]
            k += 1
    return [best_peptide, max_score]
51 |
52 |
def PSMSearch(spectral_vectors, proteome, threshold):
    """Collect the best-matching peptides whose score reaches ``threshold``.

    For each spectral vector the best peptide in ``proteome`` is looked up with
    ``PeptideIdentification``; it is kept when its score is at least
    ``threshold``. Returns the set of kept peptides.
    """
    identified = (PeptideIdentification(vector, proteome) for vector in spectral_vectors)
    return {peptide for peptide, score in identified if score >= threshold}
60 |
61 |
if __name__ == "__main__":
    # Given: A set of space-delimited spectral vectors SpectralVectors, an
    # amino acid string Proteome, and a score threshold T.
    # Return: All unique Peptide-Spectrum Matches scoring at least as high as T.
    lines = sys.stdin.read().splitlines()

    # Leading lines that start with a number (one char, or two chars to allow
    # a sign) are spectral vectors; the first other line is the proteome.
    spectral_vectors = []
    cursor = 0
    while is_number(lines[cursor][0]) or is_number(lines[cursor][:2]):
        fields = lines[cursor].rstrip().split(' ')
        spectral_vectors.append([int(field) for field in fields])
        cursor += 1

    proteome = lines[cursor].rstrip()
    threshold = int(lines[cursor + 1])

    for peptide in PSMSearch(spectral_vectors, proteome, threshold):
        print(peptide)
84 |
85 |
--------------------------------------------------------------------------------
/solutions/BA7C.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import networkx as nx
3 | from BA7B import limb_length
4 |
5 |
def additive_phylogeny(dist_mat, num_leaves, graph, int_node):
    """Recursively build a weighted tree for ``dist_mat`` inside ``graph``.

    ``dist_mat`` is modified in place: the last leaf's limb length is
    subtracted from its row/column, and that row/column is then removed before
    recursing. ``int_node`` is the first candidate label for new internal
    nodes. Returns the graph with the last leaf attached.
    """
    # Two leaves: the tree is a single edge.
    if num_leaves == 2:
        graph.add_edge(0, 1, weight=dist_mat[0][1])
        return graph

    n = num_leaves - 1
    len_limb = limb_length(dist_mat, n, num_leaves)

    # Shorten every distance to leaf n by its limb length.
    for other in range(n):
        dist_mat[other][n] -= len_limb
        dist_mat[n][other] = dist_mat[other][n]

    # Look for leaves (i, k) with D[i][n] + D[n][k] == D[i][k]; the scan does
    # not stop early, so the last qualifying pair is the one kept.
    selected_i = -1
    selected_k = -1
    for i in range(n - 1):
        for k in range(i + 1, n):
            if dist_mat[i][n] + dist_mat[n][k] == dist_mat[i][k]:
                selected_i, selected_k = i, k
    x = dist_mat[selected_i][n]

    # Drop leaf n from the matrix.
    del dist_mat[-1]
    for row in dist_mat:
        del row[-1]

    while int_node in graph.nodes:
        int_node += 1
    T = additive_phylogeny(dist_mat, num_leaves - 1, graph, int_node)

    # Search the interior of the i..k path for a node at distance exactly x.
    V = -1
    spath = nx.shortest_path(T, source=selected_i, target=selected_k)
    dist = 0
    for prev_node, node in zip(spath, spath[1:-1]):
        dist += T[prev_node][node]['weight']
        if dist == x:
            V = node

    if V == -1:
        # No such node: split the edge that crosses distance x with a fresh
        # internal node.
        V = int_node
        while V in T.nodes:
            V += 1
        dist = 0
        j = 0
        while dist < x:
            j += 1
            pdist = dist
            dist += T[spath[j - 1]][spath[j]]['weight']
        near, far = spath[j - 1], spath[j]
        T.remove_edge(near, far)
        T.add_edge(V, far, weight=dist - x)
        T.add_edge(V, near, weight=x - pdist)

    # Hang leaf n off the attachment node.
    T.add_edge(V, n, weight=len_limb)

    return T
64 |
65 |
if __name__ == "__main__":
    # Given: n and a tab-delimited n x n additive matrix.
    # Return: A weighted adjacency list for the simple tree fitting this matrix.
    input_lines = sys.stdin.read().splitlines()
    num_leaves = int(input_lines[0])
    distance_matrix = []
    for line in input_lines[1:]:
        distance_matrix.append([int(cell) for cell in line.split()])

    graph = nx.Graph()
    result = additive_phylogeny(distance_matrix, num_leaves, graph, num_leaves)

    # One "u->v:w" line per direction of every edge, sorted as strings.
    adj_dict = nx.to_dict_of_lists(result)
    answer = sorted(
        str(key) + '->' + str(val) + ':' + str(int(result[key][val]['weight']))
        for key, value in adj_dict.items()
        for val in value
    )
    for entry in answer:
        print(entry)
86 |
--------------------------------------------------------------------------------
/solutions/BA10D.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def OutcomeLikelihood(x, all_states, transition_matrix, emission_matrix):
    """Probability that the HMM emits ``x``, via the forward recurrence.

    The initial transition from the source is uniform over ``all_states``.
    ``transition_matrix[l][k]`` and ``emission_matrix[k][symbol]`` are nested
    mappings. Returns the sum of the forward values in the last column.
    """
    uniform_start = 1 / len(all_states)

    # Only the previous column is needed by the recurrence:
    # forward[k][i] = sum over l of forward[l][i-1] * T[l][k] * E[k][x[i]]
    previous = {}
    for position, symbol in enumerate(x):
        column = {}
        for target in all_states:
            if position == 0:
                # Leftmost column: every state hangs off the source (value 1).
                column[target] = 1 * uniform_start * emission_matrix[target][symbol]
            else:
                acc = 0
                for origin in all_states:
                    acc += previous[origin] * transition_matrix[origin][target] * \
                        emission_matrix[target][symbol]
                column[target] = acc
        previous = column

    likelihood = 0
    for state in all_states:
        likelihood += previous[state]
    return likelihood
31 |
32 |
if __name__ == "__main__":
    # Given: A string x, followed by the alphabet Σ from which x was
    # constructed, followed by the states States, transition matrix Transition,
    # and emission matrix Emission of an HMM (Σ, States, Transition, Emission).
    # Return: The probability Pr(x) that the HMM emits x.
    lines = sys.stdin.read().splitlines()

    x = lines[0]                 # the emitted string
    alphabet = lines[2].split()  # the alphabet from which x was constructed
    states = lines[4].split()    # the states of the HMM

    # Transition matrix: header on line 6, one row per state after it.
    col_syms = lines[6].split()
    transition_end = 6 + len(states)
    transition_matrix = {}
    for line_no in range(7, transition_end + 1):
        fields = lines[line_no].split()
        row = {}
        for offset, cell in enumerate(fields[1:]):
            row[col_syms[offset]] = float(cell)
        transition_matrix[fields[0]] = row

    # Emission matrix: header two lines after the transition rows, then every
    # remaining line is a row.
    col_syms = lines[transition_end + 2].split()
    emission_matrix = {}
    for line_no in range(transition_end + 3, len(lines)):
        fields = lines[line_no].rstrip().split()
        row = {}
        for offset, cell in enumerate(fields[1:]):
            row[col_syms[offset]] = float(cell)
        emission_matrix[fields[0]] = row

    print("{:.11E}".format(OutcomeLikelihood(x, states, transition_matrix, emission_matrix)))
--------------------------------------------------------------------------------
/solutions/BA2F.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from random import randint
3 |
4 | from BA1G import hamming_dist
5 |
6 |
def probability(pattern, profile):
    """Multiply ``profile[symbol][position]`` over every position of ``pattern``."""
    result = 1
    for position in range(len(pattern)):
        result = result * profile[pattern[position]][position]
    return result
12 |
13 |
def profile_most_probable_kmer(text, profile, k):
    """Return the k-mer of ``text`` with the highest ``probability`` under ``profile``.

    Ties are resolved in favour of the leftmost k-mer, because a later
    candidate must be strictly better to replace the current one.
    """
    best_prob = -1
    last_start = len(text) - k
    start = 0
    while start <= last_start:
        candidate = text[start:start + k]
        candidate_prob = probability(candidate, profile)
        if candidate_prob > best_prob:
            best_prob = candidate_prob
            result = candidate
        start += 1
    return result
23 |
24 |
def FormProfile(TextList, pseudocount=1):
    """Count nucleotides per column of the given motifs.

    A non-list argument is treated as a single motif. Every count starts at
    ``pseudocount``; the width is taken from the first motif. Returns a dict
    mapping 'A', 'C', 'G', 'T' to per-column count lists.
    """
    if type(TextList) is not list:
        TextList = [TextList]
    width = len(TextList[0])
    profile = {nuc: [pseudocount] * width for nuc in ('A', 'C', 'G', 'T')}
    for motif in TextList:
        for column in range(width):
            profile[motif[column]][column] += 1
    return profile
35 |
36 |
def CalculateScore(Motifs):
    """Total Hamming distance between each motif and the consensus string.

    The consensus takes, per column, the first nucleotide in A, C, G, T order
    that has the highest count in ``FormProfile(Motifs)``.
    """
    width = len(Motifs[0])
    profile = FormProfile(Motifs)

    consensus_chars = []
    for column in range(width):
        top_count = 0
        for nuc in ('A', 'C', 'G', 'T'):
            count = profile[nuc][column]
            if count > top_count:
                top_count = count
                to_add = nuc
        consensus_chars.append(to_add)
    consensus = ''.join(consensus_chars)

    score = 0
    for motif in Motifs:
        score += hamming_dist(consensus, motif)
    return score
52 |
53 |
def RandomizedMotifSearch(DNA_list, k, t):
    """One run of randomized motif search.

    Starts from a uniformly random k-mer in each string, then repeatedly
    rebuilds the profile and picks the profile-most-probable k-mer in every
    string, stopping at the first round that does not lower the score.
    ``t`` is accepted but not used in the body. Returns the best motifs found.
    """
    Motifs = []
    for dna in DNA_list:
        start = randint(0, len(dna) - k)
        Motifs.append(dna[start:start + k])

    BestMotifs = Motifs
    while True:
        profile = FormProfile(Motifs)
        Motifs = [profile_most_probable_kmer(dna, profile, k) for dna in DNA_list]
        if not CalculateScore(Motifs) < CalculateScore(BestMotifs):
            return BestMotifs
        BestMotifs = Motifs
69 |
70 |
if __name__ == "__main__":
    # Given: Positive integers k and t, followed by a collection of strings Dna.
    # Return: A collection BestMotifs resulting from running
    # RandomizedMotifSearch(Dna, k, t) 1000 times. Remember to use pseudocounts!
    raw_lines = sys.stdin.read().splitlines()
    k, t = [int(field) for field in raw_lines[0].split()]
    DNA_list = raw_lines[1:]

    # Keep the lowest-scoring run; on ties the later run replaces the earlier.
    best_score = float("Inf")
    for _attempt in range(1000):
        candidate = RandomizedMotifSearch(DNA_list, k, t)
        candidate_score = CalculateScore(candidate)
        if candidate_score <= best_score:
            best_score = candidate_score
            best_result = candidate

    print("\n".join(best_result))
90 |
--------------------------------------------------------------------------------
/solutions/BA8D.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from math import exp
3 | from random import random
4 |
5 |
def Euclidean_distance(PointA, PointB):
    """Euclidean distance between two points of equal dimension.

    Raises:
        ValueError: if the points have different lengths.
    """
    dimension = len(PointA)
    if dimension != len(PointB):
        raise ValueError('The dimensions are not the same!')
    squared = 0
    for coord_a, coord_b in zip(PointA, PointB):
        squared += (coord_a - coord_b) ** 2
    return squared ** (1/2)
14 |
15 |
def dist_from_centers(DataPoint, Centers):
    """Smallest Euclidean distance from ``DataPoint`` to any of ``Centers``.

    Returns ``float("inf")`` when ``Centers`` is empty.
    """
    closest = float("inf")
    for center in Centers:
        # min keeps the current value unless the new distance is strictly smaller.
        closest = min(closest, Euclidean_distance(DataPoint, center))
    return closest
23 |
24 |
def Random(prob_list):
    """Draw an index with probability proportional to its weight.

    Args:
        prob_list: re-iterable sequence of non-negative weights; they do not
            need to sum to 1.

    Returns:
        The selected 0-based index, or ``None`` when ``prob_list`` is empty.

    Raises:
        ZeroDivisionError: if the weights are non-empty and sum to zero.
    """
    tot = sum(prob_list)
    randRoll = random()
    cum = 0
    last_index = None
    for index, weight in enumerate(prob_list):
        cum += weight / tot
        if randRoll < cum:
            return index
        last_index = index
    # Rounding can leave the accumulated mass a hair below 1.0, so a roll very
    # close to 1.0 may pass every bucket; fall back to the final index instead
    # of implicitly returning None.
    return last_index
36 |
37 |
def Hidden_Matrix(Data, Centers, beta):
    """Soft assignment matrix with one row per center and one column per point.

    Entry ``[i][j]`` is ``exp(-beta * d(Centers[i], Data[j]))`` divided by the
    sum of that quantity over all centers, so every column sums to 1.
    """
    n_centers = len(Centers)
    hidden_mat = [[0] * len(Data) for _ in range(n_centers)]
    for col, point in enumerate(Data):
        # Unnormalised weights of this point under each center.
        weights = []
        normaliser = 0
        for center in Centers:
            weight = exp(-beta * Euclidean_distance(center, point))
            weights.append(weight)
            normaliser += weight
        for row in range(n_centers):
            hidden_mat[row][col] = weights[row] / normaliser
    return hidden_mat
47 |
48 |
def clu_to_center(hidden_mat, Data):
    """Recompute centers as responsibility-weighted means of the data.

    ``hidden_mat[i][idx]`` is the weight of data point ``idx`` for center
    ``i``. Each coordinate of a center is the weighted sum of that coordinate
    over all points divided by the center's total weight.
    """
    dims = len(Data[0])
    n_points = len(Data)
    new_centers = []
    for weights in hidden_mat:
        center = [0 for _ in range(dims)]
        for j in range(dims):
            weighted = 0
            for idx in range(n_points):
                weighted += Data[idx][j] * weights[idx]
            center[j] = weighted / sum(weights)
        new_centers.append(center)
    return new_centers
61 |
62 |
def soft_kmeans(Data, k, beta, N=100):
    """Soft k-means: start from the first ``k`` points and run ``N`` rounds.

    Each round computes the soft assignment matrix for the current centers and
    then replaces the centers with the resulting weighted means. Returns the
    final centers.
    """
    centers = Data[:k]
    for _round in range(N):
        responsibilities = Hidden_Matrix(Data, centers, beta)
        centers = clu_to_center(responsibilities, Data)
    return centers
69 |
70 |
if __name__ == "__main__":
    # Given: Integers k and m, followed by a stiffness parameter β, followed by
    # a set of points Data in m-dimensional space.
    # Return: A set Centers consisting of k points (centers) resulting from
    # applying the soft k-means clustering algorithm. Select the first k points
    # from Data as the first centers for the algorithm and run the algorithm
    # for 100 steps. Results should be accurate up to three decimal places.
    raw_lines = sys.stdin.read().splitlines()
    k, m = [int(field) for field in raw_lines[0].split(' ')]
    beta = float(raw_lines[1])
    data = []
    for row in raw_lines[2:]:
        data.append([float(cell) for cell in row.split()])

    Centers = soft_kmeans(data, k, beta)
    for center in Centers:
        print(" ".join(str(coord) for coord in center))
87 |
--------------------------------------------------------------------------------
/solutions/BA7E.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
def TotalDistance(dist_dict, i):
    """Sum of all distances stored in row ``i`` of ``dist_dict``."""
    row = dist_dict[i]
    return sum(row.values())
5 |
def ConstructNJMatrix(dist_dict):
    """Build the neighbor-joining matrix from a nested distance dict.

    Off-diagonal entries are
    ``(n - 2) * D[i][j] - TotalDistance(i) - TotalDistance(j)`` with ``n`` the
    number of rows; diagonal entries are 0.
    """
    scale = len(dist_dict) - 2
    D_NJ = {}
    for key1, row in dist_dict.items():
        for key2, val in row.items():
            nj_row = D_NJ.setdefault(key1, {})
            if key1 != key2:
                nj_row[key2] = scale * val - TotalDistance(dist_dict, key1) - \
                    TotalDistance(dist_dict, key2)
            else:
                nj_row[key2] = 0
    return D_NJ
17 |
18 |
def NeighborJoining(dist_dict, num_leaves):
    """Neighbor-joining on a nested distance dict keyed by integer labels.

    ``dist_dict`` is modified in place: the two joined nodes are removed and a
    new node labelled ``max(label) + 1`` is added. Returns a list of
    ``[from, to, weight]`` entries, with every edge listed in both directions.
    """
    # Two nodes left: connect them directly.
    if num_leaves == 2:
        labels = list(dist_dict.keys())
        idx1 = labels[0]
        idx2 = labels[1]
        span = dist_dict[idx1][idx2]
        return [[idx1, idx2, span], [idx2, idx1, span]]

    D_NJ = ConstructNJMatrix(dist_dict)

    # Pick the off-diagonal pair with the smallest NJ value (first one wins).
    min_dist = 1e6
    for key1, nj_row in D_NJ.items():
        for key2, val in nj_row.items():
            if key1 != key2 and val < min_dist:
                idx1, idx2, min_dist = key1, key2, val

    pair_dist = dist_dict[idx1][idx2]
    delta = (TotalDistance(dist_dict, idx1) - TotalDistance(dist_dict, idx2)) / (num_leaves - 2)
    LimbLength1 = (pair_dist + delta) / 2
    LimbLength2 = (pair_dist - delta) / 2

    m = max(dist_dict) + 1

    # Add the column for the new node m to every existing row.
    existing = list(dist_dict.keys())
    for k in existing:
        dist_dict[k][m] = (dist_dict[idx1][k] + dist_dict[k][idx2] - pair_dist) / 2

    # The row for m mirrors that column (same formula, same operands).
    dist_dict[m] = {k: dist_dict[k][m] for k in existing}
    dist_dict[m][m] = 0.0

    # Remove the two joined nodes from rows and columns.
    del dist_dict[idx1]
    del dist_dict[idx2]
    for row in dist_dict.values():
        del row[idx1]
        del row[idx2]

    T = NeighborJoining(dist_dict, num_leaves - 1)

    T.extend([
        [idx1, m, LimbLength1],
        [m, idx1, LimbLength1],
        [idx2, m, LimbLength2],
        [m, idx2, LimbLength2],
    ])

    return T
66 |
67 |
if __name__ == "__main__":
    # Given: An integer n, followed by a space-separated n x n distance matrix.
    # Return: An adjacency list for the tree resulting from applying the
    # neighbor-joining algorithm. Edge-weights should be accurate to two
    # decimal places (they are provided to three decimal places in the sample
    # output).
    lines = sys.stdin.read().splitlines()
    num_leaves = int(lines[0])

    # Row i / column j of the matrix become dist_dict[i][j].
    dist_dict = {}
    for i, row in enumerate(lines[1:]):
        cells = row.rstrip().split()
        dist_dict[i] = {j: int(cell) for j, cell in enumerate(cells)}

    result = NeighborJoining(dist_dict, num_leaves)
    result.sort(key=lambda edge: edge[0])

    for source, target, weight in result:
        print(str(source) + '->' + str(target) + ':' + '%.3f' % weight)
--------------------------------------------------------------------------------
/solutions/BA9P.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from re import split
3 |
4 |
class colored_Tree_node:
    ''' Node of a colored tree.
    children: list of child identifiers, empty on creation.
    color: color label (string), 'gray' on creation.
    '''

    def __init__(self):
        self.color = 'gray'
        self.children = []
13 |
14 |
def is_node_ripe(tree_dict, node):
    ''' Tell whether node is ripe.
    A node is ripe when it is "gray" and none of its children is "gray".
    '''
    if node.color != 'gray':
        return False
    has_gray_child = any(tree_dict[child_idx].color == 'gray' for child_idx in node.children)
    return not has_gray_child
27 |
28 |
def return_ripe_nodes(tree_dict):
    ''' List the ripe nodes of the tree, in dictionary order.
    '''
    return [node for node in tree_dict.values() if is_node_ripe(tree_dict, node)]
37 |
38 |
39 | def TreeColoring(tree_dict):
40 | ''' Tree Coloring
41 | # TreeColoring(ColoredTree)
42 | # while ColoredTree has ripe nodes
43 | # for each ripe node v in ColoredTree
44 | # if there exist differently colored children of v
45 | # Color(v) ← "purple"
46 | # else
47 | # Color(v) ← color of all children of v
48 | # return ColoredTree
49 | '''
50 |
51 | ripe_list = return_ripe_nodes(tree_dict)
52 | while len(ripe_list) != 0: ## while there are ripe nodes
53 |
54 | for ripe_node in ripe_list:
55 | ## collect colors for all children
56 | children_cols = []
57 | for child_idx in ripe_node.children:
58 | children_cols.append(tree_dict[child_idx].color)
59 |
60 | # if there exist differently colored children of v
61 | children_cols = list(set(children_cols))
62 |
63 | if len(children_cols) != 1:
64 | ripe_node.color = "purple"
65 | else:
66 | ripe_node.color = children_cols[0] # color of all children
67 |
68 | ripe_list = return_ripe_nodes(tree_dict)
69 |
70 | return tree_dict
71 |
72 |
if __name__ == '__main__':
    # Given: An adjacency list, followed by color labels for leaf nodes.
    # Return: Color labels for all nodes, in any order.
    lines = sys.stdin.read().splitlines()

    # Lines before the '-' separator are adjacency entries, lines after it are
    # "label: color" assignments.
    reading_adjacency = True
    tree_dict = {}
    for line in lines:
        if line == '-':
            reading_adjacency = False
            continue

        if reading_adjacency:
            fields = split(' -> ', line)
            children = [] if fields[1] == "{}" else fields[1].split(',')
            entry = colored_Tree_node()
            entry.children = children
            tree_dict[fields[0]] = entry
        else:
            fields = split(': ', line)
            tree_dict[fields[0]].color = fields[1]

    tree_dict = TreeColoring(tree_dict)

    for label, entry in tree_dict.items():
        print(label + ': ' + entry.color)
--------------------------------------------------------------------------------
/solutions/BA5D.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
class node:
    """Vertex of the DAG, identified by its label."""

    def __init__(self, lbl):
        self.label = lbl
        # Incoming edges as (source node, weight) pairs.
        self.parent_nodes = []
        # Outgoing edges as (target node, weight) pairs.
        self.target_nodes = []
        # Scratch flag used by the depth-first topological sort.
        self.visited = False


class DAG:
    """Edge-weighted directed acyclic graph with longest-path search."""

    def __init__(self):
        self.nodes_dict = {}  # label -> node
        self.distances = {}   # label -> best distance found by longest_path
        self.backtrack = {}   # label -> predecessor label on the best path

    def add_node(self, lbl):
        """Return the node labelled ``lbl``, creating it if necessary."""
        if lbl in self.nodes_dict:
            return self.nodes_dict[lbl]

        new_node = node(lbl)
        self.nodes_dict[lbl] = new_node
        return new_node

    def contruct_dag(self, adj_list_text):
        """Add the edges described by lines of the form ``"A->B:weight"``.

        Blank lines are ignored, so a trailing empty line in the input does
        not abort parsing.
        """
        for line in adj_list_text:
            if not line.strip():
                continue
            nodeA, tmp = line.split("->")
            nodeB, weight = tmp.split(":")
            weight = int(weight)

            from_node = self.add_node(nodeA)
            to_node = self.add_node(nodeB)

            from_node.target_nodes.append((to_node, weight))
            to_node.parent_nodes.append((from_node, weight))

    def topological_sort_util(self, node, stack):
        """Depth-first visit that puts ``node`` in front of its descendants."""
        node.visited = True
        for node2, _ in node.target_nodes:
            if not node2.visited:
                self.topological_sort_util(node2, stack)
        stack.insert(0, node.label)

    def topological_sort(self):
        """Return the node labels in a topological order."""
        # Clear flags left by an earlier sort; otherwise every node would
        # look visited and a second call would return an empty order.
        for current in self.nodes_dict.values():
            current.visited = False

        stack = []
        for current in self.nodes_dict.values():
            if not current.visited:
                self.topological_sort_util(current, stack)
        return stack

    def longest_path(self, source, sink):
        """Return ``(length, path)`` of a longest path from source to sink.

        ``path`` is the list of labels from ``source`` to ``sink``. When both
        are the same label the result is ``(0, [source])``.

        Raises:
            KeyError: if ``sink`` is not reachable from ``source``.
        """
        # Start from a clean slate so results of an earlier query cannot leak in.
        self.distances.clear()
        self.backtrack.clear()
        for label in self.nodes_dict:
            self.distances[label] = -float("Inf")

        self.distances[source] = 0
        self.backtrack[source] = None

        for label in self.topological_sort():
            current_node = self.nodes_dict[label]
            for v, weight in current_node.target_nodes:
                # Unreachable nodes stay at -inf and never relax a neighbour.
                if self.distances[v.label] < self.distances[label] + weight:
                    self.distances[v.label] = self.distances[label] + weight
                    self.backtrack[v.label] = label

        if source == sink:
            return self.distances[sink], [source]
        if sink not in self.backtrack:
            raise KeyError("no path from %r to %r" % (source, sink))

        # Walk predecessors from the sink back to the source.
        path = [sink]
        curr = self.backtrack[sink]
        while curr != source:
            path = [curr] + path
            curr = self.backtrack[curr]
        path = [source] + path
        return self.distances[sink], path
74 |
75 |
76 |
77 |
if __name__ == "__main__":
    # Given: An integer representing the source node of a graph, followed by an
    # integer representing the sink node of the graph, followed by an
    # edge-weighted graph. The graph is represented by a modified adjacency
    # list in which the notation "0->1:7" indicates that an edge connects node
    # 0 to node 1 with weight 7.
    # Return: The length of a longest path in the graph, followed by a longest
    # path. (If multiple longest paths exist, you may return any one.)
    lines = sys.stdin.read().splitlines()
    source, sink = lines[0], lines[1]
    adj_list_text = lines[2:]

    graph = DAG()
    graph.contruct_dag(adj_list_text)
    longest_dist, longest_path = graph.longest_path(source, sink)
    print(longest_dist)
    print("->".join(longest_path))
--------------------------------------------------------------------------------
/solutions/BA9O.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA9I import BurrowsWheelerTransform
3 | from BA9N import create_check_point_array, Count_symbol, PartialSuffixArray, MultiplePatternMatching
4 |
5 |
def pattern_to_seeds(pattern, d):
    """Cut ``pattern`` into consecutive seeds for seed-and-extend matching.

    The base seed length is ``len(pattern) // (d + 1)``; the final seed absorbs
    whatever remains. Returns ``(seeds, offsets)`` where ``offsets[i]`` is the
    start of ``seeds[i]`` inside ``pattern``.
    """
    total = len(pattern)
    seed_len = total // (d + 1)

    starts = list(range(0, total - seed_len + 1, seed_len))
    ends = starts[1:] + [total]

    seeds = [pattern[begin:end] for begin, end in zip(starts, ends)]
    offsets = list(starts)
    return seeds, offsets
18 |
19 |
def find_seed_positions(seed, FirstOccurrence, BWT, check_point_array, partial_suffix_array):
    """Positions at which ``seed`` occurs exactly, resolved via the partial suffix array.

    For every matrix row in the ``top..bottom`` range reported by
    ``MultiplePatternMatching``, the row is stepped back through the
    last-to-first mapping until it is present in ``partial_suffix_array``; the
    number of steps is added to the stored position. Returns an empty list
    when ``top`` is falsy.
    """
    top, bottom = MultiplePatternMatching(FirstOccurrence, BWT, seed, check_point_array)
    positions = []
    if not top:
        return positions

    for row in range(top, bottom + 1):
        steps = 0
        while row not in partial_suffix_array:
            symbol = BWT[row]
            row = FirstOccurrence[symbol] + Count_symbol(check_point_array, row, BWT, symbol)
            steps += 1
        positions.append(partial_suffix_array[row] + steps)
    return positions
31 |
32 |
def wrapper(Text, pattern_list, d, C):
    """Find where each pattern occurs in ``Text`` with at most ``d`` mismatches.

    Every pattern is cut into seeds; each exact seed hit proposes a start
    position for the whole pattern, which is then verified against ``Text``
    directly. ``C`` is passed to the checkpoint-array and partial-suffix-array
    builders. Returns the sorted list of start positions over all patterns
    (a position is listed once per pattern that matches there).
    """
    terminated = Text + '$'
    BWT = BurrowsWheelerTransform(terminated)

    # Row of the first occurrence of each symbol in the sorted first column.
    FirstOccurrence = {}
    for row, symbol in enumerate(sorted(BWT)):
        FirstOccurrence.setdefault(symbol, row)

    check_point_array = create_check_point_array(BWT, C)
    partial_suffix_array = PartialSuffixArray(terminated, C)

    text_len = len(Text)
    positions_list = []
    for pattern in pattern_list:
        # Break the pattern into seeds.
        seeds_list, offsets_list = pattern_to_seeds(pattern, d)

        # Find exact seed matches and try to extend each one.
        hits = set()
        for seed, offset in zip(seeds_list, offsets_list):
            seed_hits = find_seed_positions(seed, FirstOccurrence, BWT, check_point_array,
                                            partial_suffix_array)
            for seed_pos in seed_hits:
                start = seed_pos - offset
                if start < 0 or start + len(pattern) > text_len:
                    continue

                mismatches = 0
                for shift, symbol in enumerate(pattern):
                    if symbol != Text[start + shift]:
                        mismatches += 1
                        if mismatches > d:
                            break
                else:
                    # Loop finished without exceeding d mismatches.
                    hits.add(start)

        positions_list += list(hits)

    return sorted(positions_list)
73 |
74 |
if __name__ == "__main__":
    # Given: A string Text, a collection of strings Patterns, and an integer d.
    # Return: All positions in Text where a string from Patterns appears as a
    # substring with at most d mismatches.
    lines = sys.stdin.read().splitlines()
    Text = lines[0]
    pattern_list = lines[1].split(' ')
    d = int(lines[2])

    positions_list = wrapper(Text, pattern_list, d, C=100)
    print(' '.join(map(str, positions_list)))
87 |
--------------------------------------------------------------------------------
/solutions/BA5M.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from itertools import product
3 |
4 |
5 | def multiple_alignment(str_list):
6 | str_list = ["-" + string for string in str_list]
7 |
8 | score_mat = {}
9 | backtrack_mat = {}
10 |
11 | def add_tuples_elemwise(t1, t2):
12 | return tuple(sum(x) for x in zip(t1, t2))
13 |
14 | # all possible "moves"
15 | perm_list = list(product([0, -1], repeat=len(str_list)))[1:]
16 |
17 | # fill n-dimensional score and backtrack matrices
18 | for index in product(*map(range, map(lambda s: len(s) + 1, str_list))):
19 | if index.count(0) >= len(str_list) - 1:
20 | if sum(index) == 0:
21 | score_mat[index] = 0
22 | else:
23 | score_mat[index] = 0
24 | move = tuple(0 if id == 0 else -1 for id in index)
25 | bck = -1
26 | for idx, perm in enumerate(perm_list):
27 | if perm == move:
28 | bck = idx
29 | break
30 | backtrack_mat[index] = bck
31 | else:
32 | possible_scores = []
33 | for perm_idx, move in enumerate(perm_list):
34 | prev_idx = add_tuples_elemwise(index, move)
35 | if -1 not in prev_idx:
36 | prev_score = score_mat[prev_idx]
37 | chars = [str_list[i][index[i] - 1] if val == -1 else "-" for i, val in enumerate(move)]
38 | # score of an alignment column is 1 if all three symbols are identical and 0 otherwise
39 |
40 | current = 1 if all(elem == chars[0] for elem in chars) and chars[0] != "-" else 0
41 | possible_scores.append((prev_score + current, perm_idx))
42 | score_mat[index], backtrack_mat[index] = max(possible_scores, key=lambda p: p[0])
43 |
44 | # backtrack
45 | alignment = ["" for _ in str_list]
46 | current_index = list(map(len, str_list))
47 | max_score = score_mat[tuple(current_index)]
48 |
49 | while sum(current_index) != 0:
50 | back_perm_idx = backtrack_mat[tuple(current_index)]
51 | permutation = perm_list[back_perm_idx]
52 | for i, perm_value in enumerate(permutation):
53 | if perm_value == 0:
54 | alignment[i] = "-" + alignment[i]
55 | else:
56 | alignment[i] = str_list[i][current_index[i] - 1] + alignment[i]
57 |
58 | current_index = add_tuples_elemwise(tuple(current_index), permutation)
59 |
60 | # remove all "-" columns
61 | to_rm_idx = []
62 | for pos in range(len(alignment[0])):
63 | temp = [x[pos] for x in alignment]
64 | if all(x == "-" for x in temp):
65 | to_rm_idx.append(pos)
66 |
67 | for i in range(len(alignment)):
68 | alignment[i] = "".join([char for idx, char in enumerate(alignment[i]) if idx not in to_rm_idx])
69 |
70 | return max_score, alignment
71 |
72 |
if __name__ == "__main__":
    # Given: Three DNA strings.
    # Return: The maximum score of a multiple alignment of these three strings,
    # followed by a multiple alignment of the three strings achieving this
    # maximum. Use a scoring function in which the score of an alignment column
    # is 1 if all three symbols are identical and 0 otherwise. (If more than
    # one multiple alignment achieve the maximum, you may return any one.)
    DNA_strings_list = sys.stdin.read().splitlines()

    score, alignment = multiple_alignment(DNA_strings_list)

    print(score)
    for row in alignment:
        print(row)
--------------------------------------------------------------------------------
/solutions/BA2G.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from random import randint
3 | from random import random
4 |
5 |
def GibbsSampler(Dna, k, t, N):
    """One run of Gibbs sampling for motif finding.

    Args:
        Dna: list of DNA strings.
        k: motif length.
        t: number of strings; a random index in ``0..t-1`` is resampled each step.
        N: number of sampling iterations.

    Returns:
        The lowest-scoring list of motifs seen during the run (one k-mer per
        string).
    """
    # Start from a uniformly random k-mer in every string.
    Motifs = []
    for dna in Dna:
        idx = randint(0, len(dna) - k)
        Motifs.append(dna[idx:idx + k])

    # Motifs is updated in place below, so the best state must be a copy;
    # a plain reference would always mirror the latest state.
    BestMotifs = list(Motifs)
    min_score = CalculateScore(BestMotifs)
    for _ in range(N):
        idx = randint(0, t - 1)
        # Profile of every motif except the one being resampled.
        profile = FormProfileWithPseudoCounts(
            [motif for i, motif in enumerate(Motifs) if i != idx])
        Motifs[idx] = ProfileRandomlyGeneratedKmer(Dna[idx], profile)
        current_score = CalculateScore(Motifs)
        if current_score < min_score:
            BestMotifs = list(Motifs)
            min_score = current_score
    return BestMotifs
22 |
23 |
def FormProfileWithPseudoCounts(TextList, pseudocount = 1):
    """Per-column nucleotide counts of the motifs, each starting at ``pseudocount``.

    A non-list argument is treated as a single motif; the width is taken from
    the first motif. Returns a dict mapping 'A', 'C', 'G', 'T' to count lists.
    """
    if type(TextList) is not list:
        TextList = [TextList]
    width = len(TextList[0])
    profile = {nuc: [pseudocount] * width for nuc in ('A', 'C', 'G', 'T')}
    for motif in TextList:
        for column in range(width):
            profile[motif[column]][column] += 1
    return profile
34 |
35 |
def ProfileRandomlyGeneratedKmer(Text, profile):
    """Pick a k-mer of ``Text`` at random, weighted by its probability under ``profile``.

    ``profile`` holds per-column counts; they are normalised by the total of
    the first column. The start index is drawn with ``Random``.
    """
    k = len(profile['A'])
    tot = profile['A'][0] + profile['C'][0] + profile['G'][0] + profile['T'][0]

    probs = []
    for start in range(len(Text) - k + 1):
        window_prob = 1.0
        for column, nuc in enumerate(Text[start:start + k]):
            window_prob *= float(profile[nuc][column]) / tot
        probs.append(window_prob)

    selected_start = Random(probs)
    return Text[selected_start:selected_start + k]
49 |
50 |
def Random(prob_list):
    """Draw an index with probability proportional to its weight.

    Args:
        prob_list: re-iterable sequence of non-negative weights; they do not
            need to sum to 1.

    Returns:
        The selected 0-based index, or ``None`` when ``prob_list`` is empty.

    Raises:
        ZeroDivisionError: if the weights are non-empty and sum to zero.
    """
    tot = sum(prob_list)
    randRoll = random()
    cum = 0
    last_index = None
    for index, weight in enumerate(prob_list):
        cum += weight / tot
        if randRoll < cum:
            return index
        last_index = index
    # Rounding can leave the accumulated mass a hair below 1.0, so a roll very
    # close to 1.0 may pass every bucket; fall back to the final index instead
    # of implicitly returning None (which a caller would then use as a slice
    # bound).
    return last_index
62 |
63 |
def HammingDistance(p, q):
    """Number of positions of ``p`` at which ``q`` holds a different symbol."""
    mismatches = 0
    for position in range(len(p)):
        if p[position] != q[position]:
            mismatches += 1
    return mismatches
67 |
68 |
def CalculateScore(Motifs):
    """Return the summed Hamming distance of Motifs to their consensus.

    The consensus takes, at each position, the nucleotide with the highest
    count; ties go to the first of A, C, G, T.
    """
    k = len(Motifs[0])
    profile = FormProfileWithPseudoCounts(Motifs, 0)
    consensus = ''
    for i in range(k):
        # max() returns the first maximal item, matching a strict '>' scan.
        consensus += max('ACGT', key=lambda nuc: profile[nuc][i])
    return sum(HammingDistance(consensus, motif) for motif in Motifs)
84 |
85 |
def wrapper(Dna, k, t, N, nstart = 20):
    """Run GibbsSampler ``nstart`` times and return the lowest-scoring result."""
    best_score = 1e6
    for _ in range(nstart):
        candidate = GibbsSampler(Dna, k, t, N)
        candidate_score = CalculateScore(candidate)
        if candidate_score < best_score:
            best_score, result = candidate_score, candidate
    return result
96 |
97 |
if __name__ == "__main__":
    # Given: Integers k, t, and N, followed by a collection of strings Dna.
    # Return: The strings BestMotifs resulting from running
    # GibbsSampler(Dna, k, t, N) with 20 random starts (pseudocounts are used).
    input_lines = sys.stdin.read().splitlines()
    k, t, N = map(int, input_lines[0].split())
    DNA_list = input_lines[1:]

    best_motifs = wrapper(DNA_list, k, t, N)
    print("\n".join(best_motifs))
--------------------------------------------------------------------------------
/solutions/BA7D.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
class Node:
    """Tree vertex carrying a label and an age (0 until assigned)."""

    def __init__(self, label):
        self.age = 0
        self.label = label


class Tree:
    """Collects nodes and weighted edges of a tree built by UPGMA."""

    def __init__(self):
        self.edges = []
        self.nodes = {}

    def add_node(self, label):
        """Return the node stored under ``label``, creating it if absent."""
        try:
            return self.nodes[label]
        except KeyError:
            created = self.nodes[label] = Node(label)
            return created

    def UPGMA(self, dist_mat, n):
        """Cluster ``n`` leaves with UPGMA and return the adjacency list.

        dist_mat -- square matrix (list of rows) of leaf-to-leaf distances
        n        -- number of leaves; internal nodes are labelled n, n+1, ...

        Each merge sets the new node's age to half the distance between the
        merged clusters.  The returned list (also kept in ``self.edges``)
        holds ``[from, to, length]`` entries in both directions, ordered by
        ``from`` and then ``to``.
        """
        # Working distance table keyed by cluster label.
        dist_dict = {row: dict(enumerate(values))
                     for row, values in enumerate(dist_mat)}
        Clusters = {leaf: [leaf] for leaf in range(n)}
        for leaf in range(n):
            self.add_node(leaf)

        next_label = n
        merges = []
        while len(dist_dict) > 1:
            active = list(dist_dict)

            # Closest pair; the first pair in scan order wins ties.
            closest = float("Inf")
            for pos, first in enumerate(active[:-1]):
                for second in active[pos + 1:]:
                    if dist_dict[first][second] < closest:
                        closest = dist_dict[first][second]
                        node_i, node_j = first, second

            merged = Clusters[node_i] + Clusters[node_j]
            parent = self.add_node(next_label)
            merges.append([next_label, node_i])
            merges.append([next_label, node_j])
            parent.age = dist_dict[node_i][node_j] / 2

            # Distance to the new cluster = mean over all leaf pairs,
            # always taken from the original matrix.
            dist_dict[next_label] = {next_label: 0}
            for other in active:
                total = 0
                count = 0
                for member in Clusters[other]:
                    for leaf in merged:
                        total += dist_mat[member][leaf]
                        count += 1
                dist_dict[other][next_label] = total / count
                dist_dict[next_label][other] = total / count

            Clusters[next_label] = merged
            next_label += 1

            # Retire the two merged clusters from rows and columns.
            del dist_dict[node_i]
            del dist_dict[node_j]
            for row in dist_dict.values():
                del row[node_i]
            for row in dist_dict.values():
                del row[node_j]

        for parent_label, child_label in merges:
            length = self.nodes[parent_label].age - self.nodes[child_label].age
            self.edges.append([parent_label, child_label, length])
            self.edges.append([child_label, parent_label, length])

        self.edges.sort(key=lambda edge: (edge[0], edge[1]))
        return self.edges
87 |
88 |
if __name__ == "__main__":
    # Given: An integer n followed by a space-delimited n x n distance matrix.
    # Return: An adjacency list for the ultrametric tree output by UPGMA,
    # with weights rounded to three decimal places.
    input_lines = sys.stdin.read().splitlines()
    n = int(input_lines[0])
    distance_matrix = [[int(x) for x in line.split()] for line in input_lines[1:]]

    t = Tree()
    adj_list = t.UPGMA(distance_matrix, n)

    for node1, node2, weight in adj_list:
        print(f"{node1}->{node2}:{round(weight, 3)}")
105 |
106 |
--------------------------------------------------------------------------------
/solutions/BA9N.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA9I import BurrowsWheelerTransform
3 |
4 |
def create_check_point_array(BWT, C):
    """Return symbol counts of BWT prefixes at every C-th index.

    The result maps each index ``idx`` in 0, C, 2C, ... to a dict giving,
    for every symbol of BWT, its number of occurrences in ``BWT[:idx]``.
    """
    symbols = set(BWT)
    check_point_array = {}
    for idx in range(0, len(BWT), C):
        prefix = BWT[:idx]
        check_point_array[idx] = {symbol: prefix.count(symbol) for symbol in symbols}
    return check_point_array
13 |
14 |
def Count_symbol(check_point_array, idx, LastColumn, symbol):
    """Count occurrences of ``symbol`` in ``LastColumn[:idx]``.

    Starts from the closest checkpoint at or below ``idx`` and scans only
    the remaining stretch of LastColumn.
    """
    # Among checkpoints <= idx, the nearest one is simply the largest.
    nearest_idx = max(key for key in check_point_array if key <= idx)
    tail_count = LastColumn[nearest_idx:idx].count(symbol)
    return check_point_array[nearest_idx][symbol] + tail_count
22 |
23 |
def PartialSuffixArray(Text, K):
    """Return the suffix-array entries of Text that are multiples of K.

    The result maps a row of the suffix array to the suffix start stored
    there, restricted to starts divisible by K.
    """
    ordered_starts = sorted(range(len(Text)), key=lambda start: Text[start:])
    return {row: start for row, start in enumerate(ordered_starts) if start % K == 0}
36 |
37 |
def MultiplePatternMatching(FirstOccurrence, LastColumn, pattern, check_point_array):
    """Match ``pattern`` backwards against a Burrows-Wheeler last column.

    Returns ``(top, bottom)``, the inclusive row range whose suffixes start
    with pattern, or ``(False, False)`` when the pattern does not occur.
    """
    top, bottom = 0, len(LastColumn) - 1

    while top <= bottom:
        if len(pattern) == 0:
            return top, bottom

        # Consume the pattern from its last symbol.
        symbol, pattern = pattern[-1], pattern[:-1]
        if symbol not in LastColumn[top: bottom + 1]:
            return False, False

        first = FirstOccurrence[symbol]
        new_top = first + Count_symbol(check_point_array, top, LastColumn, symbol)
        bottom = first + Count_symbol(check_point_array, bottom + 1, LastColumn, symbol) - 1
        top = new_top
58 |
59 |
def wrapper(Text, pattern_list, C):
    """Return the sorted start positions in Text of all given patterns.

    Builds the BWT of ``Text + '$'``, its first-occurrence table, count
    checkpoints every C rows and a partial suffix array with step C, then
    locates every pattern through them.
    """
    BWT = BurrowsWheelerTransform(Text + '$')

    FirstOccurrence = {}
    for idx, symbol in enumerate(sorted(BWT)):
        FirstOccurrence.setdefault(symbol, idx)

    check_point_array = create_check_point_array(BWT, C)
    partial_suffix_array = PartialSuffixArray(Text + '$', C)

    positions_list = []
    for pattern in pattern_list:
        top, bottom = MultiplePatternMatching(FirstOccurrence, BWT, pattern, check_point_array)
        if not top:
            continue
        for row in range(top, bottom + 1):
            # Step backwards through the text until a row with a stored
            # suffix-array value is reached, counting the steps taken.
            steps = 0
            while row not in partial_suffix_array:
                symbol = BWT[row]
                row = FirstOccurrence[symbol] + Count_symbol(check_point_array, row, BWT, symbol)
                steps += 1
            positions_list.append(partial_suffix_array[row] + steps)

    return sorted(positions_list)
85 |
86 |
if __name__ == "__main__":
    # Multiple Patterns Matching Implementation (with BWT)
    # Given: A string Text and a collection of strings Patterns.
    # Return: All starting positions in Text where a string from Patterns
    # appears as a substring.
    tmp = sys.stdin.read().splitlines()
    Text = tmp[0]
    pattern_list = list(tmp[1:])

    positions_list = wrapper(Text, pattern_list, C=100)
    print(' '.join(map(str, positions_list)))
100 |
--------------------------------------------------------------------------------
/solutions/BA6K.py:
--------------------------------------------------------------------------------
def ProcessInput(P):
    """Parse a genome such as '(+1 -2)(+3)' into lists of signed integers."""
    chromosomes = P[1:-1].split(')(')
    return [[int(block) for block in chromosome.split(' ')]
            for chromosome in chromosomes]
9 |
10 |
def ChromosomeToCycle(Chromosome):
    """Expand signed blocks into the node sequence of the chromosome cycle.

    A positive block b contributes (2b - 1, 2b); any other block contributes
    (-2b, -2b - 1).
    """
    Nodes = []
    for block in Chromosome:
        size = abs(block)
        head, tail = 2 * size - 1, 2 * size
        Nodes.extend((head, tail) if block > 0 else (tail, head))
    return Nodes
21 |
22 |
def ColoredEdges(P):
    """Return the colored edges of genome P as two-element lists.

    Each edge links the second node of one block to the first node of the
    following block, wrapping around at the end of every chromosome.
    """
    Edges = []
    for Chromosome in P:
        Nodes = ChromosomeToCycle(Chromosome)
        last = len(Nodes) - 1
        for j in range(1, len(Nodes), 2):
            successor = Nodes[0] if j == last else Nodes[j + 1]
            Edges.append([Nodes[j], successor])
    return Edges
33 |
34 |
def TwoBreakOnGenomeGraph(GenomeGraph, i1 , i2 , i3 , i4):
    """Apply the 2-break (i1, i2, i3, i4) to the edge list in place.

    Edge (i1, i2) becomes (i1, i3) and edge (i3, i4) becomes (i2, i4); an
    edge stored in reversed orientation keeps that orientation.  The same
    list object is returned.
    """
    # First edge: use the stored orientation, forward if present.
    if [i1, i2] in GenomeGraph:
        target, replacement = [i1, i2], [i1, i3]
    else:
        target, replacement = [i2, i1], [i3, i1]
    for pos, edge in enumerate(GenomeGraph):
        if edge == target:
            GenomeGraph[pos] = list(replacement)

    # Second edge, checked against the already updated graph.
    if [i3, i4] in GenomeGraph:
        target, replacement = [i3, i4], [i2, i4]
    else:
        target, replacement = [i4, i3], [i4, i2]
    for pos, edge in enumerate(GenomeGraph):
        if edge == target:
            GenomeGraph[pos] = list(replacement)

    return GenomeGraph
53 |
54 |
def CycleToChromosome(Nodes):
    """Collapse consecutive node pairs back into signed blocks."""
    Chromosome = []
    for i in range(0, len(Nodes), 2):
        head, tail = Nodes[i], Nodes[i + 1]
        # Ascending pair -> positive block, otherwise negative block.
        Chromosome.append(tail // 2 if head < tail else (-head) // 2)
    return Chromosome
63 |
64 |
def FindNextEdge(current, edges):
    """Return the first edge whose first node is ``current[1]`` +/- 1.

    Returns -1 when ``edges`` is empty or holds no such edge.
    """
    end_node = current[1]
    for edge in edges:
        if edge[0] == end_node + 1 or edge[0] == end_node - 1:
            return edge
    return -1
74 |
75 |
def GraphToGenome(GenomeGraph):
    """Turn a list of colored edges into a genome (list of chromosomes).

    Edges are consumed from ``GenomeGraph`` (the list is emptied) and chained
    with FindNextEdge into cycles; each cycle is rotated by three nodes
    before being converted with CycleToChromosome.
    """
    Cycles = []
    while GenomeGraph:
        nodes = []
        edge = GenomeGraph[0]
        while edge != -1:
            nodes.extend(edge)
            GenomeGraph.remove(edge)
            edge = FindNextEdge(edge, GenomeGraph)
        Cycles.append(nodes)
    return [CycleToChromosome(nodes[-3:] + nodes[:-3]) for nodes in Cycles]
93 |
94 |
def TwoBreakOnGenome(P, i1 , i2 , i3 , i4):
    """Apply the 2-break (i1, i2, i3, i4) to genome P and return the result."""
    broken_graph = TwoBreakOnGenomeGraph(ColoredEdges(P), i1, i2, i3, i4)
    return GraphToGenome(broken_graph)
100 |
101 |
if __name__ == "__main__":
    # Reads a genome in parenthesised signed-block notation, then the four
    # comma-separated 2-break indices, and prints the genome produced by
    # TwoBreakOnGenome in the same notation.
    P = ProcessInput(input().rstrip())
    i1, i2, i3, i4 = map(int, input().rstrip().split(', '))
    result = TwoBreakOnGenome(P, i1, i2, i3, i4)
    formatted = [
        '(' + ' '.join(('+' if i > 0 else '') + str(i) for i in chromosome) + ')'
        for chromosome in result
    ]
    print(''.join(formatted))
114 |
--------------------------------------------------------------------------------
/solutions/BA10C.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def Viterbi(x, all_states, transition_matrix, emission_matrix):
    """Return the most probable hidden path for the emitted string ``x``.

    all_states        -- list of state labels (strings)
    transition_matrix -- transition_matrix[from_state][to_state]
    emission_matrix   -- emission_matrix[state][symbol]

    The initial state is taken as uniformly distributed.  Ties are resolved
    in favour of the state listed first in ``all_states``.
    """
    init_transition_prob = 1 / len(all_states)

    backtrace = {}
    Score_dict = {}
    for i, symbol in enumerate(x):
        backtrace[i] = {}
        for current_state in all_states:
            scores = Score_dict.setdefault(current_state, {})
            if i == 0:
                # Leftmost column: reached directly from the source (score 1).
                scores[i] = 1 * init_transition_prob * emission_matrix[current_state][symbol]
                continue

            # s(k, i) = max over l of s(l, i-1) * transition(l, k) * emission_k(x_i)
            scores[i] = -1e6
            for state in all_states:
                tmp_score = Score_dict[state][i - 1] * transition_matrix[state][current_state] * \
                            emission_matrix[current_state][symbol]
                if tmp_score > scores[i]:
                    scores[i] = tmp_score
                    backtrace[i][current_state] = state

    # Walk back from the best final state.
    last = len(x) - 1
    current_state = max(Score_dict.keys(), key=lambda state: Score_dict[state][last])
    most_probable_path = current_state
    for i in range(last, 0, -1):
        current_state = backtrace[i][current_state]
        most_probable_path = current_state + most_probable_path

    return most_probable_path
53 |
54 |
if __name__ == "__main__":
    # Given: A string x, followed by the alphabet from which x was constructed,
    # followed by the states, transition matrix and emission matrix of an HMM.
    # Return: A path that maximizes the (unconditional) probability Pr(x, pi)
    # over all possible paths pi.
    tmp = sys.stdin.read().splitlines()

    x = tmp[0]                  # the emitted string
    alphabet = tmp[2].split()   # the alphabet from which x was constructed
    states = tmp[4].split()     # the states of the HMM

    # Transition matrix: header on line 6, one row per state after it.
    transition_end = 6 + len(states)
    col_syms = tmp[6].split()
    transition_matrix = {}
    for i in range(7, transition_end + 1):
        fields = tmp[i].split()
        transition_matrix[fields[0]] = {
            col_syms[j - 1]: float(fields[j]) for j in range(1, len(fields))
        }

    # Emission matrix: header two lines after the transition block.
    col_syms = tmp[transition_end + 2].split()
    emission_matrix = {}
    for i in range(transition_end + 3, len(tmp)):
        fields = tmp[i].split()
        emission_matrix[fields[0]] = {
            col_syms[j - 1]: float(fields[j]) for j in range(1, len(fields))
        }

    print(Viterbi(x, states, transition_matrix, emission_matrix))
--------------------------------------------------------------------------------
/solutions/BA10J.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def SoftDecoding(x, transition_matrix, emission_matrix, alphabet, all_states):
    """Return Pr(state at step i | x) for every state and step.

    The result is ``cond_prob_matrix[state][i]``, computed with the
    forward-backward recurrences.  Initial states are taken as equally
    likely.  ``alphabet`` is accepted but not used.
    """
    init_transition_prob = 1 / len(all_states)
    last = len(x) - 1

    # Forward pass: forward(k, i) = sum over l of forward(l, i-1) * weight(l, k)
    Forward_dict = {}
    for i, symbol in enumerate(x):
        for current_state in all_states:
            forward = Forward_dict.setdefault(current_state, {})
            if i == 0:
                # Leftmost column hangs off the source, whose value is 1.
                forward[i] = 1 * init_transition_prob * emission_matrix[current_state][symbol]
                continue
            forward[i] = 0
            for state in all_states:
                forward[i] += Forward_dict[state][i - 1] * transition_matrix[state][
                    current_state] * emission_matrix[current_state][symbol]

    Pr_x = 0
    for state in all_states:
        Pr_x += Forward_dict[state][last]

    # Backward pass: backward(k, i) = sum over l of backward(l, i+1) * weight(k, l)
    Backward_dict = {}
    for i in range(last, -1, -1):
        for current_state in all_states:
            backward = Backward_dict.setdefault(current_state, {})
            if i == last:
                # Rightmost column connects straight to the sink.
                backward[i] = 1
                continue
            backward[i] = 0
            for state in all_states:
                backward[i] += Backward_dict[state][i + 1] * transition_matrix[current_state][
                    state] * emission_matrix[state][x[i + 1]]

    cond_prob_matrix = {}
    for i in range(len(x)):
        for state in all_states:
            cond_prob_matrix.setdefault(state, {})[i] = \
                Forward_dict[state][i] * Backward_dict[state][i] / Pr_x

    return cond_prob_matrix
57 |
58 |
if __name__ == "__main__":
    # Given: A string x, followed by the alphabet from which x was constructed,
    # followed by the states, transition matrix and emission matrix of an HMM.
    # Return: The probability Pr(pi_i = k | x) that the HMM was in state k at
    # step i (for each state k and each step i).
    tmp = sys.stdin.read().splitlines()

    x = tmp[0]
    alphabet = tmp[2].split()
    all_states = tmp[4].split()

    # Transition matrix: header on line 6, one row per state after it.
    transition_end = 6 + len(all_states)
    col_syms = tmp[6].split()
    transition_matrix = {}
    for i in range(7, transition_end + 1):
        fields = tmp[i].split()
        transition_matrix[fields[0]] = {
            col_syms[j - 1]: float(fields[j]) for j in range(1, len(fields))
        }

    # Emission matrix: header two lines after the transition block.
    col_syms = tmp[transition_end + 2].split()
    emission_matrix = {}
    for i in range(transition_end + 3, len(tmp)):
        fields = tmp[i].split()
        emission_matrix[fields[0]] = {
            col_syms[j - 1]: float(fields[j]) for j in range(1, len(fields))
        }

    cond_prob_matrix = SoftDecoding(x, transition_matrix, emission_matrix, alphabet, all_states)

    # One tab-terminated cell per state, one row per step.
    rows = ['\t'.join(all_states)]
    for i in range(len(x)):
        rows.append(''.join(
            str(round(cond_prob_matrix[state][i], 4)).rstrip('0') + '\t'
            for state in all_states))
    print('\n'.join(rows) + '\n')
104 |
--------------------------------------------------------------------------------
/solutions/BA10H.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def print_matrices(*argv, delim="\t", separator = "--------"):
    ''' Function for printing multiple matrices
    Prints each matrix (stored as a dictionary of row dictionaries) in
    tab-delimited format (default). Separates the matrices with
    '--------' (default).

    Values are shown with at most three decimals: '0' for zero, one decimal
    for whole numbers, trailing zeros stripped otherwise.  Returns None.
    '''
    last_idx = len(argv) - 1
    for idx, matrix in enumerate(argv):
        row_labels = list(matrix.keys())
        col_labels = list(matrix[row_labels[0]].keys())

        lines = [delim + delim.join(col_labels)]
        for r_label in row_labels:
            cells = [r_label]
            for c_label in col_labels:
                # Round before choosing the format: a value such as 0.9999 or
                # 0.0001 would otherwise be formatted as '1.000' / '0.000'
                # and stripped down to the malformed '1.' / '0.'.
                val = round(matrix[r_label][c_label], 3)
                if val == 0:
                    val_str = '0'
                elif val == int(val):
                    val_str = '{:.1f}'.format(val)
                else:
                    val_str = '{:.3f}'.format(val).rstrip('0')
                cells.append(val_str)
            lines.append(delim.join(cells))

        print('\n'.join(lines))
        if idx != last_idx:
            print(separator)

    return None
41 |
42 |
def HMMParameterEstimation(x, path, alphabet, all_states):
    """Estimate HMM parameters from an emitted string and its hidden path.

    Returns ``(transition_matrix, emission_matrix)`` as nested dictionaries
    of relative frequencies.  A state with no observed outgoing transitions
    (or no emissions) gets a uniform row.
    """
    # Observed state-to-state steps along the path.
    transitions = {}
    for step in zip(path, path[1:]):
        transitions[step] = transitions.get(step, 0) + 1

    transition_matrix = {}
    for state1 in all_states:
        total_transitions = sum(
            transitions.get((state1, state2), 0) for state2 in all_states)
        row = {}
        for state2 in all_states:
            if total_transitions == 0:
                # nothing observed leaving state1: assume uniform
                row[state2] = 1 / len(all_states)
            elif (state1, state2) in transitions:
                row[state2] = transitions[(state1, state2)] / total_transitions
            else:
                row[state2] = 0
        transition_matrix[state1] = row

    # Observed (state, symbol) pairs.
    emissions = {}
    for pair in zip(path, x):
        emissions[pair] = emissions.get(pair, 0) + 1

    emission_matrix = {}
    for state in all_states:
        total_emissions = sum(
            emissions.get((state, symbol), 0) for symbol in alphabet)
        row = {}
        for symbol in alphabet:
            if total_emissions == 0:
                # state never visited: assume uniform
                row[symbol] = 1 / len(alphabet)
            elif (state, symbol) in emissions:
                row[symbol] = emissions[(state, symbol)] / total_emissions
            else:
                row[symbol] = 0
        emission_matrix[state] = row

    return transition_matrix, emission_matrix
102 |
103 |
if __name__ == "__main__":
    # Given: A sequence of emitted symbols x = x1 . . . xn in an alphabet and a
    # path pi = pi1 . . . pin generated by a k-state HMM with unknown transition
    # and emission probabilities.
    # Return: A matrix of transition probabilities and a matrix of emission
    # probabilities that maximize Pr(x, pi) over all possible matrices.
    tmp = sys.stdin.read().splitlines()

    x, path = tmp[0], tmp[4]
    alphabet = tmp[2].split()
    all_states = tmp[6].split()

    transition_matrix, emission_matrix = HMMParameterEstimation(x, path, alphabet, all_states)
    print_matrices(transition_matrix, emission_matrix)
--------------------------------------------------------------------------------
/solutions/BA11J.py:
--------------------------------------------------------------------------------
1 | import sys
# Integer mass used for each amino acid letter.
aa_table = dict(
    A=71, C=103, E=129, D=115, G=57,
    F=147, I=113, H=137, K=128, M=131,
    L=113, N=114, Q=128, P=97, S=87,
    R=156, T=101, W=186, V=99, Y=163,
)
4 |
5 |
def printScoreMat(Score, prefixMasses, spectral_vector, k):
    """Debug helper: print the score table layer by layer.

    Score is indexed as Score[prefix_mass][column][layer].  The spectral
    vector is printed first, then the column indices, then one block of rows
    (one row per prefix mass) for every layer 0..k.  Returns None.
    """
    for t in range(-2, k + 1):
        if t == -2:
            # Header 1: the spectral vector values, single characters padded.
            tmp = [str(x) for x in spectral_vector]
            for idx in range(len(tmp)):
                s = tmp[idx]
                if len(s) == 1:
                    s = ' ' + s
                tmp[idx] = s
            tmp = [' '] + tmp
            print(' '.join(tmp))
            print(' ')
            print(' ')
        elif t == -1:
            # Header 2: the column indices, padded the same way.
            tmp = [str(i) for i in range(len(spectral_vector))]
            for idx in range(len(tmp)):
                s = tmp[idx]
                if len(s) == 1:
                    s = ' ' + s
                tmp[idx] = s
            tmp = [' '] + tmp
            print(' '.join(tmp))
        else:
            # Layer t: one row per prefix mass, starting with the mass itself.
            for i in prefixMasses:
                row = []
                for j in range(-1, len(spectral_vector)):
                    if j == -1:
                        row.append((str(i) if i >= 10 else ' ' + str(i)) + ' ')
                    else:
                        score = Score[i][j][t]
                        if score < -1e5:
                            # very negative (e.g. -inf) cells are shown as 'XX'
                            score = 'XX'
                        elif len(str(score)) == 1:
                            score = ' ' + str(score)
                        row.append(str(score))
                print(' '.join(row))
            print(' ')
    return None
44 |
45 |
def SpectralAlignment(peptide, spectral_vector, k):
    """Align ``peptide`` to a spectral vector using up to ``k`` modifications.

    peptide         -- amino acid string (letters must be keys of aa_table)
    spectral_vector -- list of integer amplitudes; it is not modified
    k               -- maximum number of modifications

    Returns the peptide as a string in which every modified residue is
    followed by its mass shift in parentheses, e.g. 'X(+3)' or 'X(-2)'.
    """
    # Prepend the entry for mass 0 on a copy; inserting in place would shift
    # the caller's list and corrupt any further use of it.
    spectral_vector = [0] + list(spectral_vector)
    n_cols = len(spectral_vector)

    ## Calculate prefix masses
    prefixMasses = [0]
    for aa in peptide:
        prefixMasses.append(prefixMasses[-1] + aa_table[aa])

    ## Create diff array: prefix mass -> mass of the residue that ends it
    diff = {}
    for i in range(1, len(prefixMasses)):
        diff[prefixMasses[i]] = prefixMasses[i] - prefixMasses[i - 1]

    ## Initialize scores: Score[prefix mass][column][layer]
    Score = {}
    for i in prefixMasses:
        Score[i] = {}
        for j in range(n_cols):
            Score[i][j] = {}
            for t in range(k + 1):
                Score[i][j][t] = -float("inf")
    Score[0][0][0] = 0

    ## Calculate scores
    for i in prefixMasses[1:]:
        for j in range(n_cols):
            for t in range(k + 1):
                if (t == 0) and (i - diff[i] >= 0) and (j - diff[i] >= 0):
                    # no modification allowed: only the diagonal step
                    Score[i][j][t] = spectral_vector[j] + Score[i - diff[i]][j - diff[i]][t]
                elif (t > 0) and (i - diff[i] >= 0) and (j - diff[i] >= 0):
                    # diagonal step in this layer, or a jump from any earlier
                    # column of the layer below (one more modification)
                    Score[i][j][t] = spectral_vector[j] + max(Score[i - diff[i]][j - diff[i]][t], max(
                        [Score[i - diff[i]][j_star][t - 1] for j_star in range(j)]))
                elif (t > 0) and (i - diff[i] >= 0) and (j > 0):
                    # diagonal predecessor is out of range: jump only
                    Score[i][j][t] = spectral_vector[j] + max(
                        [Score[i - diff[i]][j_star][t - 1] for j_star in range(j)])

    # printScoreMat(Score, prefixMasses, spectral_vector, k)

    ## Find max score layer at the final cell
    max_score = -float("inf")
    for t in range(k + 1):
        current = Score[prefixMasses[-1]][n_cols - 1][t]
        if current > max_score:
            max_score = current
            max_layer = t

    ## Backtrace
    layer = max_layer
    column = n_cols - 1

    result = ''
    for i in range(len(peptide), 0, -1):
        pre = prefixMasses[i]
        if (column - diff[pre] >= 0) and (
                Score[pre][column][layer] == spectral_vector[column] + Score[pre - diff[pre]][column - diff[pre]][
                    layer]):
            # unmodified residue: diagonal step within the same layer
            column -= diff[pre]
            result = peptide[i - 1] + result
        else:
            # modified residue: came from the best column of the layer below
            tmp = [Score[pre - diff[pre]][j_star][layer - 1] for j_star in range(column)]
            idx = tmp.index((max(tmp)))
            modif = column - idx - diff[pre]
            if modif > 0:
                result = peptide[i - 1] + '(+' + str(modif) + ')' + result
            else:
                result = peptide[i - 1] + '(' + str(modif) + ')' + result
            column = idx
            layer -= 1

    return result
120 |
121 |
if __name__ == "__main__":
    # Given: A peptide Peptide, a spectral vector Spectrum', and an integer k.
    # Return: A peptide Peptide' related to Peptide by up to k modifications
    # with maximal score against Spectrum' out of all possibilities.
    tmp = sys.stdin.read().splitlines()
    peptide = tmp[0]
    spectral_vector = list(map(int, tmp[1].rstrip().split(' ')))
    k = int(tmp[2])

    print(SpectralAlignment(peptide, spectral_vector, k))
134 |
--------------------------------------------------------------------------------
/solutions/BA6D.py:
--------------------------------------------------------------------------------
def ProcessInput(P):
    """Convert '(+1 -2)(+3)'-style text into a list of integer lists."""
    genome = []
    for chromosome_text in P[1:-1].split(')('):
        genome.append(list(map(int, chromosome_text.split(' '))))
    return genome
9 |
10 |
def ChromosomeToCycle(Chromosome):
    """Return the cycle node list for a chromosome of signed blocks.

    Positive block b yields 2b - 1 then 2b; any other block yields -2b then
    -2b - 1.
    """
    Nodes = []
    for block in Chromosome:
        pair = (2 * block - 1, 2 * block) if block > 0 else (-2 * block, -2 * block - 1)
        Nodes += pair
    return Nodes
21 |
22 |
def ColoredEdges(P):
    """List the colored edges of genome P as [node, node] pairs.

    For every chromosome, the node at each odd cycle position is joined to
    the node that follows it, the last one wrapping to the first.
    """
    Edges = []
    for Chromosome in P:
        Nodes = ChromosomeToCycle(Chromosome)
        for j in range(1, len(Nodes), 2):
            if j == len(Nodes) - 1:
                partner = Nodes[0]
            else:
                partner = Nodes[j + 1]
            Edges.append([Nodes[j], partner])
    return Edges
33 |
34 |
def FindNextEdge(current, edges):
    """Return the first edge sharing a node with ``current``, else -1."""
    for candidate in edges:
        if current[0] in candidate or current[1] in candidate:
            return candidate
    return -1
44 |
45 |
def FindCycles(edges):
    """Group edges into chains and return those with more than two edges.

    ``edges`` is consumed: every edge is removed from it as it is placed in
    a chain.  Chains are grown with FindNextEdge.
    """
    Cycles = []
    while edges:
        Cycle = [edges.pop(0)]
        following = FindNextEdge(Cycle[0], edges)
        while following != -1:
            Cycle.append(following)
            edges.remove(following)
            following = FindNextEdge(following, edges)
        if len(Cycle) > 2:
            Cycles.append(Cycle)
    return Cycles
60 |
61 |
def TwoBreakOnGenomeGraph(GenomeGraph, i1 , i2 , i3 , i4):
    """Perform the 2-break (i1, i2, i3, i4) on the edge list, in place.

    (i1, i2) is replaced by (i1, i3) and (i3, i4) by (i2, i4); edges stored
    reversed stay reversed.  Returns the same list object.
    """
    # Each step: (edge as stored forward, its replacement,
    #             edge as stored reversed, its replacement).
    steps = (
        ([i1, i2], [i1, i3], [i2, i1], [i3, i1]),
        ([i3, i4], [i2, i4], [i4, i3], [i4, i2]),
    )
    for forward, forward_new, backward, backward_new in steps:
        # The membership test sees the changes made by the previous step.
        if forward in GenomeGraph:
            old_edge, new_edge = forward, forward_new
        else:
            old_edge, new_edge = backward, backward_new
        for pos in range(len(GenomeGraph)):
            if GenomeGraph[pos] == old_edge:
                GenomeGraph[pos] = list(new_edge)
    return GenomeGraph
80 |
81 |
def TwoBreakOnGenome(P, i1 , i2 , i3 , i4):
    """Return the genome obtained from P by the 2-break (i1, i2, i3, i4)."""
    edges_after_break = TwoBreakOnGenomeGraph(ColoredEdges(P), i1, i2, i3, i4)
    return GraphToGenome(edges_after_break)
87 |
88 |
def FindNextEdge2(current, edges):
    """Return the edge continuing from the partner node of ``current[1]``.

    The partner of an even node v is v - 1, of an odd node v + 1.  The first
    edge containing the partner is returned, reversed in place if needed so
    that the partner comes first.  Returns -1 if there is no such edge.
    """
    end_node = current[1]
    partner = end_node + 1 if end_node % 2 else end_node - 1
    for edge in edges:
        if partner in edge:
            if partner == edge[1]:
                edge.reverse()
            return edge
    return -1
105 |
106 |
def CycleToChromosome(Nodes):
    """Rebuild signed blocks from consecutive pairs of cycle nodes."""
    Chromosome = []
    for pos in range(0, len(Nodes), 2):
        first, second = Nodes[pos], Nodes[pos + 1]
        if first < second:
            block = second // 2
        else:
            block = (-first) // 2
        Chromosome.append(block)
    return Chromosome
115 |
116 |
def GraphToGenome(GenomeGraph):
    """Convert colored edges into a genome (list of chromosomes).

    Edges are removed from ``GenomeGraph`` as they are chained together with
    FindNextEdge2.  Each resulting node cycle is rotated right by one node
    and converted with CycleToChromosome.
    """
    Cycles = []
    while GenomeGraph:
        nodes = []
        edge = GenomeGraph[0]
        while edge != -1:
            nodes.extend(edge)
            GenomeGraph.remove(edge)
            edge = FindNextEdge2(edge, GenomeGraph)
        Cycles.append(nodes)
    return [CycleToChromosome([nodes[-1]] + nodes[:-1]) for nodes in Cycles]
135 |
136 |
def ShortestRearrangementScenario(P, Q):
    """Return the list of genomes visited while transforming P toward Q.

    The list starts with P; each further entry is the genome obtained by
    applying one 2-break.  The loop ends once FindCycles reports no more
    cycles for the combined edges of the current genome and Q.
    """
    result = [P]
    RedEdges = ColoredEdges(P)
    BlueEdges = ColoredEdges(Q)
    # Concatenation makes a new outer list but keeps the same inner edge
    # lists, so reversing an edge below is visible through RedEdges/BlueEdges.
    BreakpointGraph = BlueEdges + RedEdges
    NonTrivialCycles = FindCycles(BreakpointGraph)
    while len(NonTrivialCycles) != 0:
        Cycle = NonTrivialCycles[0]
        # Flip edges in place so neighbouring edges of the cycle line up.
        for i in range(len(Cycle) - 1):
            if Cycle[i][0] in Cycle[i + 1]:
                Cycle[i].reverse()
            if Cycle[i + 1][1] in Cycle[i]:
                Cycle[i+1].reverse()
        # First edge of the cycle that belongs to the red (current) genome.
        idx = 0
        while not Cycle[idx] in RedEdges:
            idx += 1
        i1, i2 = Cycle[idx]
        # Second edge to break: two positions further on, wrapping to the
        # start when idx is the second-to-last position.
        if idx + 2 != len(Cycle):
            i3, i4 = Cycle[idx + 2]
        else:
            i3, i4 = Cycle[0]
        # Swap (i1, i2), (i3, i4) for (i1, i4), (i2, i3) among the red edges.
        RedEdges.remove([i1, i2])
        RedEdges.remove([i3, i4])
        RedEdges.append([i1, i4])
        RedEdges.append([i2, i3])
        BreakpointGraph = BlueEdges + RedEdges
        NonTrivialCycles = FindCycles(BreakpointGraph)
        # Note the argument order (i4 before i3) in the genome-level 2-break.
        P = TwoBreakOnGenome(P, i1 , i2 , i4 , i3)
        result.append(P)
    return result
167 |
if __name__ == "__main__":
    # Given: Two genomes with circular chromosomes on the same set of synteny
    # blocks.
    # Return: The sequence of genomes resulting from applying a shortest
    # sequence of 2-breaks transforming one genome into the other.
    P = ProcessInput(input().rstrip())
    Q = ProcessInput(input().rstrip())
    answer = ShortestRearrangementScenario(P, Q)
    for result in answer:
        # Replace every chromosome by its '(+a -b ...)' text, in place.
        for j, chromosome in enumerate(result):
            result[j] = '(' + ' '.join(('+' if i > 0 else '') + str(i) for i in chromosome) + ')'
        print(''.join(result))
--------------------------------------------------------------------------------
/solutions/BA7F.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | ALPHABET = ['A', 'C', 'G', 'T']
4 |
5 |
def HammingDistance(p, q):
    """Return the number of positions at which sequences ``p`` and ``q`` differ.

    :param p: first sequence (for example a DNA string).
    :param q: second sequence, of the same length as ``p``.
    :return: count of mismatching positions as an int.
    :raises ValueError: if the two sequences have different lengths; the
        Hamming distance is only defined for equal-length sequences.
    """
    if len(p) != len(q):
        raise ValueError(
            "HammingDistance requires sequences of equal length "
            "(got {} and {})".format(len(p), len(q))
        )
    # Booleans sum as 0/1, so this counts the mismatching pairs.
    return sum(a != b for a, b in zip(p, q))
9 |
10 |
def RepresentsInt(s):
    """Tell whether ``int(s)`` succeeds, i.e. whether ``s`` parses as an integer.

    Only ``ValueError`` is treated as "not an integer"; any other exception
    raised by ``int`` propagates to the caller.
    """
    try:
        int(s)
    except ValueError:
        return False
    else:
        return True
17 |
18 |
def _cheapest_child_score(child_scores, k):
    """Return min over the child's nucleotides of its score plus 1 if it differs from ``k``."""
    return min(
        score if nuc == k else score + 1
        for nuc, score in child_scores.items()
    )


def SmallParsimony(adj_list):
    """Compute the small-parsimony score tables for every node of a rooted tree.

    Nodes whose label parses as an integer (``RepresentsInt``) are internal
    nodes; every other label is taken to be the DNA string of a leaf.

    :param adj_list: list of ``[parent, child]`` edges.
    :return: dict ``S`` with ``S[node][pos][nucleotide]`` equal to the minimum
        parsimony score of the subtree rooted at ``node`` when ``node``
        carries ``nucleotide`` at position ``pos``.  For a leaf the score is 0
        for its own character and 1e6 (a stand-in for infinity) otherwise.
    :raises ValueError: if an internal node has no children, or if the
        internal nodes cannot be ordered children-first (the edges contain a
        cycle).
    """
    # Index the children once instead of rescanning adj_list for every node.
    children_of = {}
    for parent, child in adj_list:
        children_of.setdefault(parent, []).append(child)

    S = {}
    unscored = set()
    len_dna = 0
    for v in {node for edge in adj_list for node in edge}:
        if RepresentsInt(v):
            S[v] = {}
            unscored.add(v)
        else:
            len_dna = len(v)
            S[v] = {
                pos: {k: 0 if char == k else 1e6 for k in ALPHABET}
                for pos, char in enumerate(v)
            }

    # Score internal nodes bottom-up: a node is ripe once all of its children
    # already have a score table.
    while unscored:
        ripe = [
            v for v in unscored
            if not any(child in unscored for child in children_of.get(v, ()))
        ]
        if not ripe:
            # Without this check a cyclic input would never terminate.
            raise ValueError("adjacency list does not describe a tree (cycle found)")
        for v in ripe:
            kids = children_of.get(v)
            if not kids:
                raise ValueError("internal node {!r} has no children".format(v))
            # Sum the best contribution of every child; for a binary tree this
            # is the usual daughter + son recurrence.
            S[v] = {
                pos: {
                    k: sum(_cheapest_child_score(S[kid][pos], k) for kid in kids)
                    for k in ALPHABET
                }
                for pos in range(len_dna)
            }
            unscored.remove(v)
    return S
73 |
74 |
def FinalTree(adj_list, score_dict):
    """Label the internal nodes of the tree and build the scored adjacency list.

    The root (the only node that is never a child) gets, at every position,
    the nucleotide with the smallest score.  Labels are then propagated from
    the root downwards: an internal child keeps its parent's nucleotide when
    that nucleotide is among the child's minimum-score choices, and otherwise
    takes its own first minimum-score nucleotide.  Leaves keep their DNA
    string.

    The traversal visits every internal node reachable from the root, so
    unbalanced trees (an internal node with one leaf child and one internal
    child) are labelled correctly.

    :param adj_list: list of ``[parent, child]`` edges.
    :param score_dict: score tables as returned by ``SmallParsimony``,
        ``score_dict[node][pos][nucleotide]``.
    :return: ``[final_adj_list, min_pars_score]`` where ``final_adj_list``
        holds, for every input edge, the two entries
        ``[label_a, label_b, distance]`` and ``[label_b, label_a, distance]``
        (distance being the Hamming distance of the two labels), and
        ``min_pars_score`` is the sum over positions of the root's minimum
        score.
    :raises ValueError: if the edges do not have exactly one root.
    """
    children_of = {}
    has_parent = set()
    for parent, child in adj_list:
        children_of.setdefault(parent, []).append(child)
        has_parent.add(child)

    ## Find root
    roots = [node for node in children_of if node not in has_parent]
    if len(roots) != 1:
        raise ValueError(
            "expected exactly one root in the adjacency list, found {}".format(len(roots))
        )
    root = roots[0]

    ## Root's label and min parsimony score
    root_letters = []
    min_pars_score = 0
    for scores in score_dict[root].values():
        root_letters.append(min(scores, key=scores.get))
        min_pars_score += min(scores.values())
    label_dict = {root: ''.join(root_letters)}

    ## Backtrace, top-down from the root
    stack = [root]
    while stack:
        parent = stack.pop()
        parent_label = label_dict[parent]
        for child in children_of.get(parent, ()):
            # Leaves keep their own string; an already labelled node is not
            # revisited, which also guards against cyclic input.
            if not RepresentsInt(child) or child in label_dict:
                continue
            letters = []
            for pos, child_score in score_dict[child].items():
                parent_letter = parent_label[pos]
                if child_score.get(parent_letter) == min(child_score.values()):
                    # Keeping the parent's nucleotide costs no mutation.
                    letters.append(parent_letter)
                else:
                    letters.append(min(child_score, key=child_score.get))
            label_dict[child] = ''.join(letters)
            stack.append(child)

    ## Create final adjacency list
    final_adj_list = []
    for parent, child in adj_list:
        node0 = label_dict[parent] if RepresentsInt(parent) else parent
        node1 = label_dict[child] if RepresentsInt(child) else child
        distance = HammingDistance(node0, node1)
        final_adj_list.append([node0, node1, distance])
        final_adj_list.append([node1, node0, distance])

    return [final_adj_list, min_pars_score]
159 |
160 |
if __name__ == "__main__":
    '''
    Given: An integer n followed by an adjacency list for a rooted binary tree with n leaves labeled by DNA strings.
    Return: The minimum parsimony score of this tree, followed by the adjacency list of the tree corresponding to
    labeling internal nodes by DNA strings in order to minimize the parsimony score of the tree.
    '''
    lines = sys.stdin.read().splitlines()
    # The leaf count is parsed (and thereby validated) but not needed further.
    num_leaves = int(lines[0])

    # Every remaining line is one "parent->child" edge.
    adj_list = [row.rstrip().split('->') for row in lines[1:]]

    score_dict = SmallParsimony(adj_list)
    final_adj_list, min_pars_score = FinalTree(adj_list, score_dict)

    print(min_pars_score)
    for edge in final_adj_list:
        source, target, distance = edge[0], edge[1], edge[2]
        print(str(source) + '->' + str(target) + ':' + str(distance))
--------------------------------------------------------------------------------
/solutions/BA5F.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | PAM250 = {'A': {'A': 2, 'C': -2, 'D': 0, 'E': 0, 'F': -3, 'G': 1, 'H': -1, 'I': -1, 'K': -1, 'L': -2, 'M': -1, 'N': 0,
4 | 'P': 1, 'Q': 0, 'R': -2, 'S': 1, 'T': 1, 'V': 0, 'W': -6, 'Y': -3},
5 | 'C': {'A': -2, 'C': 12, 'D': -5, 'E': -5, 'F': -4, 'G': -3, 'H': -3, 'I': -2, 'K': -5, 'L': -6, 'M': -5,
6 | 'N': -4, 'P': -3, 'Q': -5, 'R': -4, 'S': 0, 'T': -2, 'V': -2, 'W': -8, 'Y': 0},
7 | 'D': {'A': 0, 'C': -5, 'D': 4, 'E': 3, 'F': -6, 'G': 1, 'H': 1, 'I': -2, 'K': 0, 'L': -4, 'M': -3, 'N': 2,
8 | 'P': -1, 'Q': 2, 'R': -1, 'S': 0, 'T': 0, 'V': -2, 'W': -7, 'Y': -4},
9 | 'E': {'A': 0, 'C': -5, 'D': 3, 'E': 4, 'F': -5, 'G': 0, 'H': 1, 'I': -2, 'K': 0, 'L': -3, 'M': -2, 'N': 1,
10 | 'P': -1, 'Q': 2, 'R': -1, 'S': 0, 'T': 0, 'V': -2, 'W': -7, 'Y': -4},
11 | 'F': {'A': -3, 'C': -4, 'D': -6, 'E': -5, 'F': 9, 'G': -5, 'H': -2, 'I': 1, 'K': -5, 'L': 2, 'M': 0, 'N': -3,
12 | 'P': -5, 'Q': -5, 'R': -4, 'S': -3, 'T': -3, 'V': -1, 'W': 0, 'Y': 7},
13 | 'G': {'A': 1, 'C': -3, 'D': 1, 'E': 0, 'F': -5, 'G': 5, 'H': -2, 'I': -3, 'K': -2, 'L': -4, 'M': -3, 'N': 0,
14 | 'P': 0, 'Q': -1, 'R': -3, 'S': 1, 'T': 0, 'V': -1, 'W': -7, 'Y': -5},
15 | 'H': {'A': -1, 'C': -3, 'D': 1, 'E': 1, 'F': -2, 'G': -2, 'H': 6, 'I': -2, 'K': 0, 'L': -2, 'M': -2, 'N': 2,
16 | 'P': 0, 'Q': 3, 'R': 2, 'S': -1, 'T': -1, 'V': -2, 'W': -3, 'Y': 0},
17 | 'I': {'A': -1, 'C': -2, 'D': -2, 'E': -2, 'F': 1, 'G': -3, 'H': -2, 'I': 5, 'K': -2, 'L': 2, 'M': 2, 'N': -2,
18 | 'P': -2, 'Q': -2, 'R': -2, 'S': -1, 'T': 0, 'V': 4, 'W': -5, 'Y': -1},
19 | 'K': {'A': -1, 'C': -5, 'D': 0, 'E': 0, 'F': -5, 'G': -2, 'H': 0, 'I': -2, 'K': 5, 'L': -3, 'M': 0, 'N': 1,
20 | 'P': -1, 'Q': 1, 'R': 3, 'S': 0, 'T': 0, 'V': -2, 'W': -3, 'Y': -4},
21 | 'L': {'A': -2, 'C': -6, 'D': -4, 'E': -3, 'F': 2, 'G': -4, 'H': -2, 'I': 2, 'K': -3, 'L': 6, 'M': 4, 'N': -3,
22 | 'P': -3, 'Q': -2, 'R': -3, 'S': -3, 'T': -2, 'V': 2, 'W': -2, 'Y': -1},
23 | 'M': {'A': -1, 'C': -5, 'D': -3, 'E': -2, 'F': 0, 'G': -3, 'H': -2, 'I': 2, 'K': 0, 'L': 4, 'M': 6, 'N': -2,
24 | 'P': -2, 'Q': -1, 'R': 0, 'S': -2, 'T': -1, 'V': 2, 'W': -4, 'Y': -2},
25 | 'N': {'A': 0, 'C': -4, 'D': 2, 'E': 1, 'F': -3, 'G': 0, 'H': 2, 'I': -2, 'K': 1, 'L': -3, 'M': -2, 'N': 2,
26 | 'P': 0, 'Q': 1, 'R': 0, 'S': 1, 'T': 0, 'V': -2, 'W': -4, 'Y': -2},
27 | 'P': {'A': 1, 'C': -3, 'D': -1, 'E': -1, 'F': -5, 'G': 0, 'H': 0, 'I': -2, 'K': -1, 'L': -3, 'M': -2, 'N': 0,
28 | 'P': 6, 'Q': 0, 'R': 0, 'S': 1, 'T': 0, 'V': -1, 'W': -6, 'Y': -5},
29 | 'Q': {'A': 0, 'C': -5, 'D': 2, 'E': 2, 'F': -5, 'G': -1, 'H': 3, 'I': -2, 'K': 1, 'L': -2, 'M': -1, 'N': 1,
30 | 'P': 0, 'Q': 4, 'R': 1, 'S': -1, 'T': -1, 'V': -2, 'W': -5, 'Y': -4},
31 | 'R': {'A': -2, 'C': -4, 'D': -1, 'E': -1, 'F': -4, 'G': -3, 'H': 2, 'I': -2, 'K': 3, 'L': -3, 'M': 0, 'N': 0,
32 | 'P': 0, 'Q': 1, 'R': 6, 'S': 0, 'T': -1, 'V': -2, 'W': 2, 'Y': -4},
33 | 'S': {'A': 1, 'C': 0, 'D': 0, 'E': 0, 'F': -3, 'G': 1, 'H': -1, 'I': -1, 'K': 0, 'L': -3, 'M': -2, 'N': 1,
34 | 'P': 1, 'Q': -1, 'R': 0, 'S': 2, 'T': 1, 'V': -1, 'W': -2, 'Y': -3},
35 | 'T': {'A': 1, 'C': -2, 'D': 0, 'E': 0, 'F': -3, 'G': 0, 'H': -1, 'I': 0, 'K': 0, 'L': -2, 'M': -1, 'N': 0,
36 | 'P': 0, 'Q': -1, 'R': -1, 'S': 1, 'T': 3, 'V': 0, 'W': -5, 'Y': -3},
37 | 'V': {'A': 0, 'C': -2, 'D': -2, 'E': -2, 'F': -1, 'G': -1, 'H': -2, 'I': 4, 'K': -2, 'L': 2, 'M': 2, 'N': -2,
38 | 'P': -1, 'Q': -2, 'R': -2, 'S': -1, 'T': 0, 'V': 4, 'W': -6, 'Y': -2},
39 | 'W': {'A': -6, 'C': -8, 'D': -7, 'E': -7, 'F': 0, 'G': -7, 'H': -3, 'I': -5, 'K': -3, 'L': -2, 'M': -4,
40 | 'N': -4, 'P': -6, 'Q': -5, 'R': 2, 'S': -2, 'T': -5, 'V': -6, 'W': 17, 'Y': 0},
41 | 'Y': {'A': -3, 'C': 0, 'D': -4, 'E': -4, 'F': 7, 'G': -5, 'H': 0, 'I': -1, 'K': -4, 'L': -1, 'M': -2, 'N': -2,
42 | 'P': -5, 'Q': -4, 'R': -4, 'S': -3, 'T': -3, 'V': -2, 'W': 0, 'Y': 10}}
43 |
44 |
def local_alignment(str1, str2, indel_penalty=5):
    """Compute a maximum-score local alignment of two amino acid strings.

    Substitutions are scored with the module-level ``PAM250`` table and every
    gap costs ``indel_penalty``; a cell's score never drops below 0.

    :param str1: first string (rows of the score matrix).
    :param str2: second string (columns of the score matrix).
    :param indel_penalty: amount subtracted for each inserted or deleted symbol.
    :return: tuple ``(max_score, aligned_1, aligned_2)``; gaps appear as "-".
    """
    # A leading pad character makes row 0 / column 0 the empty prefix.
    str1 = "-" + str1
    str2 = "-" + str2
    n_rows, n_cols = len(str1), len(str2)

    score_mat = [[0] * n_cols for _ in range(n_rows)]
    backtrack_mat = [[None] * n_cols for _ in range(n_rows)]

    # The border cells all score 0, so the best cell starts out as (0, 0) and
    # is only replaced by a strictly larger score, visited in row-major order.
    max_score, max_i, max_j = 0, 0, 0
    for i in range(1, n_rows):
        for j in range(1, n_cols):
            row_key, col_key = str1[i], str2[j]
            if row_key not in PAM250:
                row_key, col_key = col_key, row_key

            diagonal = score_mat[i - 1][j - 1] + PAM250[row_key][col_key]
            up = score_mat[i - 1][j] - indel_penalty
            left = score_mat[i][j - 1] - indel_penalty
            best = max(diagonal, up, left, 0)
            score_mat[i][j] = best

            # Ties prefer diagonal, then up, then left; a cell whose score
            # comes only from the 0 floor keeps its None pointer.
            if best == diagonal:
                backtrack_mat[i][j] = "d"
            elif best == up:
                backtrack_mat[i][j] = "u"
            elif best == left:
                backtrack_mat[i][j] = "l"

            if best > max_score:
                max_score, max_i, max_j = best, i, j

    # Walk the pointers back from the best cell, collecting the columns of
    # the alignment in reverse order.
    i, j = max_i, max_j
    rev_1, rev_2 = [], []
    direction = backtrack_mat[i][j]
    while direction is not None:
        if direction == "d":
            rev_1.append(str1[i])
            rev_2.append(str2[j])
            i -= 1
            j -= 1
        elif direction == "u":
            rev_1.append(str1[i])
            rev_2.append("-")
            i -= 1
        else:
            rev_1.append("-")
            rev_2.append(str2[j])
            j -= 1
        direction = backtrack_mat[i][j]

    return max_score, "".join(reversed(rev_1)), "".join(reversed(rev_2))
100 |
101 |
if __name__ == "__main__":
    '''
    Given: Two amino acid strings.
    Return: The maximum score of a local alignment of the strings, followed by a local alignment of these strings
    achieving the maximum score. Use the PAM250 scoring matrix and indel penalty σ = 5. (If multiple local alignments
    achieving the maximum score exist, you may return any one.)
    '''
    # The two strings arrive on the first two lines of standard input.
    input_lines = sys.stdin.read().splitlines()
    string1, string2 = input_lines[0], input_lines[1]

    score, alignment1, alignment2 = local_alignment(string1, string2)
    for output_line in (score, alignment1, alignment2):
        print(output_line)
117 |
--------------------------------------------------------------------------------
/solutions/BA5L.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from BA5K import middle_edge
3 |
4 | BLOSUM62 = {
5 | ('W', 'F'): 1, ('L', 'R'): -2, ('S', 'P'): -1, ('V', 'T'): 0,
6 | ('Q', 'Q'): 5, ('N', 'A'): -2, ('Z', 'Y'): -2, ('W', 'R'): -3,
7 | ('Q', 'A'): -1, ('S', 'D'): 0, ('H', 'H'): 8, ('S', 'H'): -1,
8 | ('H', 'D'): -1, ('L', 'N'): -3, ('W', 'A'): -3, ('Y', 'M'): -1,
9 | ('G', 'R'): -2, ('Y', 'I'): -1, ('Y', 'E'): -2, ('B', 'Y'): -3,
10 | ('Y', 'A'): -2, ('V', 'D'): -3, ('B', 'S'): 0, ('Y', 'Y'): 7,
11 | ('G', 'N'): 0, ('E', 'C'): -4, ('Y', 'Q'): -1, ('Z', 'Z'): 4,
12 | ('V', 'A'): 0, ('C', 'C'): 9, ('M', 'R'): -1, ('V', 'E'): -2,
13 | ('T', 'N'): 0, ('P', 'P'): 7, ('V', 'I'): 3, ('V', 'S'): -2,
14 | ('Z', 'P'): -1, ('V', 'M'): 1, ('T', 'F'): -2, ('V', 'Q'): -2,
15 | ('K', 'K'): 5, ('P', 'D'): -1, ('I', 'H'): -3, ('I', 'D'): -3,
16 | ('T', 'R'): -1, ('P', 'L'): -3, ('K', 'G'): -2, ('M', 'N'): -2,
17 | ('P', 'H'): -2, ('F', 'Q'): -3, ('Z', 'G'): -2, ('X', 'L'): -1,
18 | ('T', 'M'): -1, ('Z', 'C'): -3, ('X', 'H'): -1, ('D', 'R'): -2,
19 | ('B', 'W'): -4, ('X', 'D'): -1, ('Z', 'K'): 1, ('F', 'A'): -2,
20 | ('Z', 'W'): -3, ('F', 'E'): -3, ('D', 'N'): 1, ('B', 'K'): 0,
21 | ('X', 'X'): -1, ('F', 'I'): 0, ('B', 'G'): -1, ('X', 'T'): 0,
22 | ('F', 'M'): 0, ('B', 'C'): -3, ('Z', 'I'): -3, ('Z', 'V'): -2,
23 | ('S', 'S'): 4, ('L', 'Q'): -2, ('W', 'E'): -3, ('Q', 'R'): 1,
24 | ('N', 'N'): 6, ('W', 'M'): -1, ('Q', 'C'): -3, ('W', 'I'): -3,
25 | ('S', 'C'): -1, ('L', 'A'): -1, ('S', 'G'): 0, ('L', 'E'): -3,
26 | ('W', 'Q'): -2, ('H', 'G'): -2, ('S', 'K'): 0, ('Q', 'N'): 0,
27 | ('N', 'R'): 0, ('H', 'C'): -3, ('Y', 'N'): -2, ('G', 'Q'): -2,
28 | ('Y', 'F'): 3, ('C', 'A'): 0, ('V', 'L'): 1, ('G', 'E'): -2,
29 | ('G', 'A'): 0, ('K', 'R'): 2, ('E', 'D'): 2, ('Y', 'R'): -2,
30 | ('M', 'Q'): 0, ('T', 'I'): -1, ('C', 'D'): -3, ('V', 'F'): -1,
31 | ('T', 'A'): 0, ('T', 'P'): -1, ('B', 'P'): -2, ('T', 'E'): -1,
32 | ('V', 'N'): -3, ('P', 'G'): -2, ('M', 'A'): -1, ('K', 'H'): -1,
33 | ('V', 'R'): -3, ('P', 'C'): -3, ('M', 'E'): -2, ('K', 'L'): -2,
34 | ('V', 'V'): 4, ('M', 'I'): 1, ('T', 'Q'): -1, ('I', 'G'): -4,
35 | ('P', 'K'): -1, ('M', 'M'): 5, ('K', 'D'): -1, ('I', 'C'): -1,
36 | ('Z', 'D'): 1, ('F', 'R'): -3, ('X', 'K'): -1, ('Q', 'D'): 0,
37 | ('X', 'G'): -1, ('Z', 'L'): -3, ('X', 'C'): -2, ('Z', 'H'): 0,
38 | ('B', 'L'): -4, ('B', 'H'): 0, ('F', 'F'): 6, ('X', 'W'): -2,
39 | ('B', 'D'): 4, ('D', 'A'): -2, ('S', 'L'): -2, ('X', 'S'): 0,
40 | ('F', 'N'): -3, ('S', 'R'): -1, ('W', 'D'): -4, ('V', 'Y'): -1,
41 | ('W', 'L'): -2, ('H', 'R'): 0, ('W', 'H'): -2, ('H', 'N'): 1,
42 | ('W', 'T'): -2, ('T', 'T'): 5, ('S', 'F'): -2, ('W', 'P'): -4,
43 | ('L', 'D'): -4, ('B', 'I'): -3, ('L', 'H'): -3, ('S', 'N'): 1,
44 | ('B', 'T'): -1, ('L', 'L'): 4, ('Y', 'K'): -2, ('E', 'Q'): 2,
45 | ('Y', 'G'): -3, ('Z', 'S'): 0, ('Y', 'C'): -2, ('G', 'D'): -1,
46 | ('B', 'V'): -3, ('E', 'A'): -1, ('Y', 'W'): 2, ('E', 'E'): 5,
47 | ('Y', 'S'): -2, ('C', 'N'): -3, ('V', 'C'): -1, ('T', 'H'): -2,
48 | ('P', 'R'): -2, ('V', 'G'): -3, ('T', 'L'): -1, ('V', 'K'): -2,
49 | ('K', 'Q'): 1, ('R', 'A'): -1, ('I', 'R'): -3, ('T', 'D'): -1,
50 | ('P', 'F'): -4, ('I', 'N'): -3, ('K', 'I'): -3, ('M', 'D'): -3,
51 | ('V', 'W'): -3, ('W', 'W'): 11, ('M', 'H'): -2, ('P', 'N'): -2,
52 | ('K', 'A'): -1, ('M', 'L'): 2, ('K', 'E'): 1, ('Z', 'E'): 4,
53 | ('X', 'N'): -1, ('Z', 'A'): -1, ('Z', 'M'): -1, ('X', 'F'): -1,
54 | ('K', 'C'): -3, ('B', 'Q'): 0, ('X', 'B'): -1, ('B', 'M'): -3,
55 | ('F', 'C'): -2, ('Z', 'Q'): 3, ('X', 'Z'): -1, ('F', 'G'): -3,
56 | ('B', 'E'): 1, ('X', 'V'): -1, ('F', 'K'): -3, ('B', 'A'): -2,
57 | ('X', 'R'): -1, ('D', 'D'): 6, ('W', 'G'): -2, ('Z', 'F'): -3,
58 | ('S', 'Q'): 0, ('W', 'C'): -2, ('W', 'K'): -3, ('H', 'Q'): 0,
59 | ('L', 'C'): -1, ('W', 'N'): -4, ('S', 'A'): 1, ('L', 'G'): -4,
60 | ('W', 'S'): -3, ('S', 'E'): 0, ('H', 'E'): 0, ('S', 'I'): -2,
61 | ('H', 'A'): -2, ('S', 'M'): -1, ('Y', 'L'): -1, ('Y', 'H'): 2,
62 | ('Y', 'D'): -3, ('E', 'R'): 0, ('X', 'P'): -2, ('G', 'G'): 6,
63 | ('G', 'C'): -3, ('E', 'N'): 0, ('Y', 'T'): -2, ('Y', 'P'): -3,
64 | ('T', 'K'): -1, ('A', 'A'): 4, ('P', 'Q'): -1, ('T', 'C'): -1,
65 | ('V', 'H'): -3, ('T', 'G'): -2, ('I', 'Q'): -3, ('Z', 'T'): -1,
66 | ('C', 'R'): -3, ('V', 'P'): -2, ('P', 'E'): -1, ('M', 'C'): -1,
67 | ('K', 'N'): 0, ('I', 'I'): 4, ('P', 'A'): -1, ('M', 'G'): -3,
68 | ('T', 'S'): 1, ('I', 'E'): -3, ('P', 'M'): -2, ('M', 'K'): -1,
69 | ('I', 'A'): -1, ('P', 'I'): -3, ('R', 'R'): 5, ('X', 'M'): -1,
70 | ('L', 'I'): 2, ('X', 'I'): -1, ('Z', 'B'): 1, ('X', 'E'): -1,
71 | ('Z', 'N'): 0, ('X', 'A'): 0, ('B', 'R'): -1, ('B', 'N'): 3,
72 | ('F', 'D'): -3, ('X', 'Y'): -1, ('Z', 'R'): 0, ('F', 'H'): -1,
73 | ('B', 'F'): -3, ('F', 'L'): 0, ('X', 'Q'): -1, ('B', 'B'): 4
74 | }
75 |
76 |
def alignment_score(str1, str2, indel_penalty=5):
    """Score an alignment given as two gapped rows.

    Every column containing "-" costs ``indel_penalty``; every other column
    adds the ``BLOSUM62`` entry of its two symbols, looked up in either order.

    :param str1: first row of the alignment; its length sets the column count.
    :param str2: second row of the alignment.
    :param indel_penalty: cost of one gap column.
    :return: total score of the alignment.
    """
    total = 0
    for pos, upper in enumerate(str1):
        # str2 is only read when the first row has no gap in this column.
        if upper == '-' or str2[pos] == '-':
            total -= indel_penalty
            continue
        pair = (upper, str2[pos])
        # The table stores each unordered pair once; fall back to the
        # swapped key when this orientation is absent.
        if pair not in BLOSUM62:
            pair = (pair[1], pair[0])
        total += BLOSUM62[pair]
    return total
89 |
90 |
def linear_space_alignment(str1, str2, top=0, bottom=None, left=0, right=None):
    """Build an alignment path for a sub-rectangle of the alignment grid.

    The rectangle spans rows ``top..bottom`` (positions in ``str1``) and
    columns ``left..right`` (positions in ``str2``).  It is split at the edge
    returned by ``middle_edge`` and both halves are solved recursively.

    :param str1: first string.
    :param str2: second string.
    :param top: first row of the rectangle.
    :param bottom: last row; defaults to ``len(str1)``.
    :param left: first column of the rectangle.
    :param right: last column; defaults to ``len(str2)``.
    :return: path string made of "V" and "H" runs from the base cases and the
        edge symbols reported by ``middle_edge``.
    """
    bottom = len(str1) if bottom is None else bottom
    right = len(str2) if right is None else right

    # Degenerate rectangles: no columns left means only vertical moves,
    # no rows left means only horizontal moves.
    if left == right:
        return "V" * (bottom - top)
    if top == bottom:
        return "H" * (right - left)

    mid_edge, mid_from, mid_to = middle_edge(str1, str2, top, bottom, left, right)

    # Part of the rectangle before the middle edge's start node.
    from_row, from_col = mid_from
    left_path = linear_space_alignment(str1, str2, top, from_row, left, from_col)

    # Part of the rectangle after the middle edge's end node.
    to_row, to_col = mid_to
    right_path = linear_space_alignment(str1, str2, to_row, bottom, to_col, right)

    return left_path + mid_edge + right_path
110 |
111 |
def backtrack_path(path, str1, str2):
    """Turn an alignment path into the two gapped alignment rows.

    "D" consumes one symbol from each string, "V" consumes a symbol of
    ``str1`` against a gap, and any other arrow consumes a symbol of ``str2``
    against a gap.

    :param path: sequence of arrows.
    :param str1: first string.
    :param str2: second string.
    :return: tuple ``(aligned1, aligned2)``.
    """
    row1, row2 = [], []
    i = j = 0
    for arrow in path:
        if arrow == "D":
            row1.append(str1[i])
            row2.append(str2[j])
            i, j = i + 1, j + 1
        elif arrow == "V":
            row1.append(str1[i])
            row2.append('-')
            i += 1
        else:
            row1.append('-')
            row2.append(str2[j])
            j += 1
    return ''.join(row1), ''.join(row2)
132 |
133 |
if __name__ == "__main__":
    '''
    Given: Two long amino acid strings (of length approximately 10,000).
    Return: The maximum alignment score of these strings, followed by an alignment achieving this maximum score. Use the BLOSUM62 scoring matrix and indel penalty σ = 5.
    '''
    # The two strings arrive on the first two lines of standard input.
    input_lines = sys.stdin.read().splitlines()
    string1, string2 = input_lines[0], input_lines[1]

    # Find the path first, then expand it into the two gapped rows.
    path = linear_space_alignment(string1, string2)
    alignment1, alignment2 = backtrack_path(path, string1, string2)

    # The score is recomputed from the finished alignment.
    for output_line in (alignment_score(alignment1, alignment2), alignment1, alignment2):
        print(output_line)
150 |
--------------------------------------------------------------------------------