├── week5-6 ├── universal_string.txt ├── 12 - k-Universal Circular String Problem.py ├── 13 - String Reconstruction from Read-Pairs Problem.py ├── 14 - Contig Generation Problem.py ├── 11 - String Reconstruction Problem.py ├── 15 - Gapped Genome Path String Problem.py ├── 123.py ├── 111.py └── help.py ├── week7-8 ├── counting_peptides.txt ├── integer_mass_table.txt ├── cyclopeptide_sequencing.txt ├── test_dataset.txt ├── codon_table.txt ├── leaderboard_cyclopeptide_sequencing.txt ├── convolution_cyclopeptide_sequencing.txt ├── 17 - Compute the Number of Peptides of Given Total Mass.py ├── 16 - Find Substrings of a Genome Encoding a Given Amino Acid String.py ├── data.txt ├── 18 - Cyclopeptide Sequencing1.py ├── 19 - LeaderboardCyclopeptideSequencing.py ├── 20 - ConvolutionCyclopeptideSequencing.py └── peptide_encoding.txt ├── week9-10 ├── change_problem.txt ├── Manhattan_tourist.txt ├── 21 - The Change Problem.py ├── 22 - Manhattan Tourist Problem.py ├── 24 - Local_Alignment_Problem.py ├── 23 - Global_Alignment_Problem.py ├── PAM250.txt ├── 25 - Global_Alignment_in_Linear_Space_Problem.py ├── BLOSUM62.txt ├── local_alignment.txt ├── global_alignment.txt └── linear_space_alignment.txt ├── week11-12 ├── 2BreakSorting.txt ├── 2BreakOnGenome.txt ├── 27.py ├── GreedySorting.txt ├── 26 - Implement GreedySorting to Sort a Permutation by Reversals.py ├── 28.py ├── 26.py ├── 30.py ├── 29.py └── test.py ├── week3-4 ├── output_BA2F.txt ├── 10-Implement DistanceBetweenPatternAndStrings.py ├── distance_between_pattern_and_strings.txt ├── randomized_motif_search.txt ├── greedy_motif_search.txt ├── greedy_motif_search_pseudocounts.txt ├── 9-Implement GibbsSampler.py ├── 6-Implement GreedyMotifSearch.py ├── 7-Implement GreedyMotifSearch with Pseudocounts.py ├── 8-Implement RandomizedMotifSearch.py └── gibbs.txt ├── week1-2 ├── 2-Minimum Skew Problem.py ├── 3-Approximate Pattern Matching Problem.py ├── frequent_words_mismatch.txt ├── 1-Clump Finding Problem.py ├── 
frequent_words_mismatch_complements.txt ├── 4-Frequent Words with Mismatches Problem(new).py ├── 5-Reverse Complements Problem.py ├── 4-Frequent Words with Mismatches Problem.py ├── clump_finding.txt ├── rosalind_ba1e.txt └── approximate_match.txt └── README.md /week5-6/universal_string.txt: -------------------------------------------------------------------------------- 1 | 9 -------------------------------------------------------------------------------- /week7-8/counting_peptides.txt: -------------------------------------------------------------------------------- 1 | 1412 2 | -------------------------------------------------------------------------------- /week9-10/change_problem.txt: -------------------------------------------------------------------------------- 1 | 18817 2 | 1,3,5,20,21,22,23 3 | -------------------------------------------------------------------------------- /week11-12/2BreakSorting.txt: -------------------------------------------------------------------------------- 1 | (-6 -4 +9 +1 +8 -3 -5 +2 -12 +10 -7 +13 +11) 2 | (-11 +7 +8 -6 -1 +4 -5 +13 +3 -10 +12 +9 +2) -------------------------------------------------------------------------------- /week7-8/integer_mass_table.txt: -------------------------------------------------------------------------------- 1 | G 57 2 | A 71 3 | S 87 4 | P 97 5 | V 99 6 | T 101 7 | C 103 8 | I 113 9 | L 113 10 | N 114 11 | D 115 12 | K 128 13 | Q 128 14 | E 129 15 | M 131 16 | H 137 17 | F 147 18 | R 156 19 | Y 163 20 | W 186 -------------------------------------------------------------------------------- /week11-12/2BreakOnGenome.txt: -------------------------------------------------------------------------------- 1 | (-1 -2 -3 +4 +5 +6 -7 +8 -9 -10 -11 +12 -13 -14 -15 +16 +17 -18 +19 -20 +21 +22 -23 +24 -25 -26 -27 +28 +29 -30 +31 -32 +33 -34 -35 +36 +37 -38 -39 +40 -41 -42 +43 -44 -45 +46 +47 +48 +49 +50 +51 -52 +53 +54 +55 -56 -57 +58 +59 +60 -61) 2 | 41, 39, 11, 10 
-------------------------------------------------------------------------------- /week9-10/Manhattan_tourist.txt: -------------------------------------------------------------------------------- 1 | 6 10 2 | 0 0 2 4 1 4 0 1 2 1 0 3 | 0 4 2 0 3 4 0 4 0 0 2 4 | 0 2 3 2 1 3 4 0 3 0 4 5 | 3 1 3 1 4 3 0 3 1 2 2 6 | 0 3 4 4 3 3 3 2 1 4 0 7 | 4 4 0 4 1 0 4 0 4 0 2 8 | - 9 | 4 4 4 0 2 3 0 2 2 1 10 | 1 2 2 2 3 1 3 3 1 1 11 | 2 4 4 2 1 2 1 0 4 1 12 | 3 1 1 1 1 1 1 2 0 0 13 | 2 4 3 1 0 1 2 2 3 0 14 | 1 1 4 3 1 0 4 4 3 0 15 | 2 3 4 0 3 0 2 4 1 2 16 | -------------------------------------------------------------------------------- /week7-8/cyclopeptide_sequencing.txt: -------------------------------------------------------------------------------- 1 | 0 71 97 99 103 113 113 114 115 131 137 196 200 202 208 214 226 227 228 240 245 299 311 311 316 327 337 339 340 341 358 408 414 424 429 436 440 442 453 455 471 507 527 537 539 542 551 554 556 566 586 622 638 640 651 653 657 664 669 679 685 735 752 753 754 756 766 777 782 782 794 848 853 865 866 867 879 885 891 893 897 956 962 978 979 980 980 990 994 996 1022 1093 -------------------------------------------------------------------------------- /week3-4/output_BA2F.txt: -------------------------------------------------------------------------------- 1 | GATCACAGCACGATT 2 | GGTGAGTGCAATATT 3 | GGGAATTGCAATATT 4 | CCTCGTTGCAATATC 5 | GGTCCGGGCAATATT 6 | GCGGGTTGCAATATT 7 | GGTCGAGCCAATATT 8 | GGTCGCGACAATATT 9 | GGTCGTTAAGATATT 10 | GGTCGTTGCAAGTCT 11 | CCACGTTGCAATATT 12 | GGTCGTTGCAATGGG 13 | GGTCGTAAGAATATT 14 | GGTCGTTGCCCCATT 15 | GGTCGTTGCACCGTT 16 | GGTCGTTGATGTATT 17 | GGTTTGTGCAATATT 18 | GGTCCGAGCAATATT 19 | GGTCGTCAGAATATT 20 | AGTCGTTGCAATAGG 21 | -------------------------------------------------------------------------------- /week1-2/2-Minimum Skew Problem.py: -------------------------------------------------------------------------------- 1 | data = open('minimum_skew.txt', 'r') 2 | 3 | string = data.read() 4 | result = 
def breakpoints(P):
    """Return the number of breakpoints in the signed permutation ``P``.

    The permutation is framed as ``0, P[0], ..., P[n-1], n + 1``.  A pair of
    neighbours ``(a, b)`` in that framed sequence is an adjacency when
    ``b - a == 1``; every other pair is a breakpoint.

    Args:
        P: list of signed integers (a permutation of ``1..n`` up to sign).

    Returns:
        The breakpoint count.  The empty permutation has none, because the
        frame ``0, 1`` is itself an adjacency.
    """
    # Framing with 0 and n + 1 folds the two boundary checks (P[0] == 1 and
    # P[-1] == n) into the general rule and makes the empty input well defined.
    extended = [0, *P, len(P) + 1]
    return sum(1 for left, right in zip(extended, extended[1:])
               if right - left != 1)


if __name__ == '__main__':
    with open('NumberOfBreakpoints.txt') as f:
        P = f.readline().strip()
    # Input looks like "(+3 +4 -1 ...)": drop the parentheses, then parse.
    P = [int(i) for i in P[1:-1].split()]
    print(P)
    print(breakpoints(P))
def dp_change(money, coins):
    """Return the minimum number of coins needed to change ``money``.

    Classic bottom-up dynamic programme: ``min_num_coins[m]`` holds the
    fewest coins that sum to exactly ``m``.

    Args:
        money: non-negative integer amount to change.
        coins: iterable of positive integer denominations (any order).
            Non-positive denominations are ignored.

    Returns:
        The minimum number of coins, or ``float("inf")`` when ``money``
        cannot be formed from ``coins``.

    Raises:
        ValueError: if ``money`` is negative.
    """
    if money < 0:
        raise ValueError("money must be non-negative")

    unreachable = float("inf")
    min_num_coins = [0] + [unreachable] * money
    for m in range(1, money + 1):
        for coin in coins:
            if 0 < coin <= m and min_num_coins[m - coin] + 1 < min_num_coins[m]:
                min_num_coins[m] = min_num_coins[m - coin] + 1
    # The answer is the entry for the requested amount, not the largest
    # entry in the table (smaller amounts can need more coins).
    return min_num_coins[money]


if __name__ == "__main__":
    with open('change_problem.txt') as f:
        data = f.read().split()
    money = int(data[0])
    coins = list(map(int, data[1].split(',')))
    print(dp_change(money, coins))
def find_position(pattern, string, maxerror):
    """Print and return every start index where ``pattern`` approximately occurs.

    A window of ``string`` matches when it has the same length as
    ``pattern`` and differs from it in at most ``maxerror`` positions.

    Args:
        pattern: the pattern to look for.
        string: the text to scan.
        maxerror: maximum number of mismatches allowed (expected >= 0).

    Returns:
        The list of 0-based start indices, in increasing order.  The same
        indices are printed space-separated, as before.
    """
    k = len(pattern)
    # Windows that would run past the end of the text can never match, so
    # stop at the last full-length window.  max(k, 1) keeps the scan range
    # for an empty pattern identical to the original loop.
    last_start = len(string) - max(k, 1)
    position = [i for i in range(last_start + 1)
                if func(string[i:i + k], pattern, maxerror) == 1]
    print(*position)
    return position


def func(str, pattern, maxerror):
    """Return 1 if ``str`` and ``pattern`` differ in at most ``maxerror`` places.

    Returns 0 when the lengths differ or the mismatch budget is exceeded.
    The first parameter keeps its historical name (which shadows the
    builtin) so existing keyword callers keep working.
    """
    if len(str) != len(pattern):
        return 0
    errorcount = 0
    for left, right in zip(str, pattern):
        if left != right:
            errorcount += 1
            # Bail out as soon as the budget is blown.
            if errorcount > maxerror:
                return 0
    return 1


if __name__ == "__main__":
    with open('approximate_match.txt') as f:
        data = f.read().split()
    find_position(data[0], data[1], int(data[2]))
import itertools
from collections import Counter


def find_frequent(string, k, t):
    """Count every k-mer of ``string``.

    Args:
        string: text to scan.
        k: k-mer length (must be positive; otherwise nothing is counted).
        t: unused; kept so existing callers keep working.

    Returns:
        ``(k-mer, count)`` pairs ordered from most to least frequent, as
        produced by ``Counter.most_common()``.
    """
    if k < 1:
        return []
    kmers = (string[i:i + k] for i in range(len(string) - k + 1))
    return Counter(kmers).most_common()


def clump_finding_problem(string, k, L, t):
    """Print and return the k-mers forming an (L, t)-clump in ``string``.

    A k-mer forms a clump when some window of length ``L`` contains it at
    least ``t`` times.

    Args:
        string: genome text.
        k: k-mer length.
        L: window length.
        t: minimum number of occurrences inside one window.

    Returns:
        The set of clump-forming k-mers (also printed, sorted and
        space-separated).
    """
    clumps = set()
    n = len(string)
    if 1 <= k <= L <= n:
        # Count the first window once, then slide it: each step removes the
        # k-mer that fell off the left edge and adds the one that entered on
        # the right, instead of recounting the whole window.
        counts = Counter(string[i:i + k] for i in range(L - k + 1))
        clumps.update(kmer for kmer, count in counts.items() if count >= t)
        for start in range(1, n - L + 1):
            counts[string[start - 1:start - 1 + k]] -= 1
            entering = string[start + L - k:start + L]
            counts[entering] += 1
            # Only the k-mer that just entered can have newly reached t.
            if counts[entering] >= t:
                clumps.add(entering)
    print(*sorted(clumps))
    return clumps


if __name__ == "__main__":
    with open('rosalind_ba1e.txt') as f:
        data = f.read().split()
    clump_finding_problem(data[0], int(data[1]), int(data[2]), int(data[3]))
def hamming_distance(str1, str2):
    """Return the number of positions at which ``str1`` and ``str2`` differ.

    Comparison stops at the end of the shorter string.
    """
    return sum(s1 != s2 for s1, s2 in zip(str1, str2))


def distance_between_pattern_and_strings(pattern, dna):
    """Sum, over every string in ``dna``, its best match distance to ``pattern``.

    For each string the distance is the minimum Hamming distance between
    ``pattern`` and any window of the same length in that string.

    Args:
        pattern: the k-mer to compare.
        dna: iterable of strings.

    Returns:
        The total distance.  A string shorter than ``pattern`` has no
        window and contributes ``float("inf")``.
    """
    k = len(pattern)
    distance = 0
    for string in dna:
        # One distance computation per window; ``default`` covers strings
        # that are too short to hold a single window.
        distance += min(
            (hamming_distance(pattern, string[i:i + k])
             for i in range(len(string) - k + 1)),
            default=float("inf"),
        )
    return distance


if __name__ == "__main__":
    with open('distance_between_pattern_and_strings.txt') as f:
        data = f.read().split()
    distance = distance_between_pattern_and_strings(data[0], data[1:])
    print(distance)
GGCCTAACAGACCCCCACGGCCTAACAGCAGCAATCGGCCTAACAGCGCAACTCGCGATCGACCGCAACTCGCGATCGACCAGCAATCACCCCCACCAGCAATCGCGATCGACGGCCTAACAGGCGATCGACACCCCCACCAGCAATCGCGATCGACCAGCAATCCGCAACTCCAGCAATCGGCCTAACAGGCGATCGACACCCCCACGCGATCGACCAGCAATCGCGATCGACGCGATCGACGGCCTAACAGGCGATCGACCAGCAATCGGCCTAACAGACCCCCACGCGATCGACCGCAACTCGCGATCGACGCGATCGACACCCCCACGGCCTAACAGCGCAACTCGCGATCGACACCCCCACACCCCCACCGCAACTCCGCAACTCCGCAACTCACCCCCACCGCAACTCGGCCTAACAGCAGCAATCCAGCAATCCAGCAATCGCGATCGACGGCCTAACAGGGCCTAACAGCAGCAATCACCCCCACACCCCCACGGCCTAACAGGGCCTAACAGGCGATCGACCGCAACTCACCCCCACGGCCTAACAGCGCAACTCGCGATCGACGGCCTAACAGGGCCTAACAGGCGATCGACCAGCAATCCAGCAATCGCGATCGACGCGATCGACCGCAACTCACCCCCACCGCAACTCCAGCAATCGGCCTAACAGGGCCTAACAGACCCCCACGCGATCGACCAGCAATCCAGCAATCCGCAACTCGGCCTAACAGGCGATCGACCGCAACTCCGCAACTCCAGCAATCCGCAACTCGCGATCGACGCGATCGACCAGCAATCCGCAACTCACCCCCACGGCCTAACAGCGCAACTCGGCCTAACAGCGCAACTCCAGCAATCACCCCCACCAGCAATCCAGCAATCCGCAACTCCAGCAATCCAGCAATCCGCAACTCGCGATCGACCGCAACTC 2 | 6 3 -------------------------------------------------------------------------------- /week7-8/convolution_cyclopeptide_sequencing.txt: -------------------------------------------------------------------------------- 1 | 19 2 | 363 3 | 0 71 97 97 99 103 103 113 114 114 115 128 129 147 156 184 186 200 206 210 213 218 225 228 229 232 242 261 271 281 283 303 328 331 331 332 335 339 342 342 357 360 385 396 428 431 431 432 434 439 442 445 445 454 457 467 489 513 531 542 544 545 548 552 554 557 560 560 571 592 610 614 623 641 645 659 660 663 667 668 673 674 695 699 713 728 738 766 770 770 773 773 776 781 789 792 796 796 816 827 852 863 873 879 884 885 887 888 893 899 909 920 945 956 976 976 980 983 991 996 999 999 1002 1002 1006 1034 1044 1059 1073 1077 1098 1099 1104 1105 1109 1112 1113 1127 1131 1149 1158 1162 1180 1201 1212 1212 1215 1218 1220 1224 1227 1228 1230 1241 1259 1283 1305 1315 1318 1327 1327 1330 1333 1338 1340 1341 1341 1344 1376 1387 1412 1415 1430 1430 1433 1437 1440 1441 1441 1444 1469 1489 1491 1501 1511 1530 1540 1543 1544 
1547 1554 1559 1562 1566 1572 1586 1588 1616 1625 1643 1644 1657 1658 1658 1659 1669 1669 1673 1675 1675 1701 1772 4 | -------------------------------------------------------------------------------- /week9-10/22 - Manhattan Tourist Problem.py: -------------------------------------------------------------------------------- 1 | def manhatan_tourist_problem(n, m, down, right): 2 | s = [[0] * (m + 1) for i in range(n + 1)] 3 | for i in range(1, n + 1): 4 | s[i][0] = s[i - 1][0] + down[i - 1][0] 5 | for j in range(1, m + 1): 6 | s[0][j] = s[0][j - 1] + right[0][j - 1] 7 | 8 | for i in range(1, n + 1): 9 | for j in range(1, m + 1): 10 | s[i][j] = max(s[i - 1][j] + down[i - 1][j], s[i][j - 1] + right[i][j - 1]) 11 | return s[n][m] 12 | 13 | 14 | if __name__ == "__main__": 15 | with open('Manhattan_tourist.txt') as f: 16 | line = f.readline().strip().split() 17 | n = int(line[0]) 18 | m = int(line[1]) 19 | down = [] 20 | for i in range(n): 21 | line = f.readline().strip().split() 22 | down.append([int(i) for i in line]) 23 | f.readline() 24 | right = [] 25 | for i in range(n + 1): 26 | line = f.readline().strip().split() 27 | right.append([int(i) for i in line]) 28 | print(manhatan_tourist_problem(n, m, down, right)) 29 | -------------------------------------------------------------------------------- /week11-12/26 - Implement GreedySorting to Sort a Permutation by Reversals.py: -------------------------------------------------------------------------------- 1 | from operator import neg 2 | 3 | 4 | def GreedySorting(P): 5 | 6 | permSeq = [] 7 | 8 | kInd = lambda p, k: map(abs, p).index(k) 9 | 10 | kSort = lambda p, i, j: p[:i] + map(neg, p[i:j+1][::-1]) + p[j+1:] 11 | 12 | i = 0 13 | while i < len(P): 14 | if P[i] == i+1: 15 | i += 1 16 | else: 17 | P = kSort(P, i, kInd(P, i+1)) 18 | permSeq.append(P) 19 | 20 | return permSeq 21 | 22 | '''Input/Output''' 23 | if __name__ == "__main__": 24 | with open('GreedySorting.txt') as infile: 25 | p = map(int, 
# Distinct integer masses of the 20 amino acids.  I/L (113) and K/Q (128)
# share a mass, so only 18 values remain.
AMINO_ACID_MASSES = (57, 71, 87, 97, 99, 101, 103, 113, 114, 115,
                     128, 129, 131, 137, 147, 156, 163, 186)


def count_peptide(m, masses=AMINO_ACID_MASSES):
    """Count linear peptides whose total integer mass is exactly ``m``.

    Peptides are sequences of masses, so different orderings of the same
    masses count separately.

    Args:
        m: target total mass.
        masses: positive integer masses to build peptides from; defaults to
            the 18 distinct amino-acid masses.

    Returns:
        The number of peptides of mass ``m``.  Returns 0 for ``m <= 0``.
    """
    if m <= 0:
        return 0
    # ways[x] = number of peptides of mass x; the empty prefix (mass 0)
    # seeds the recurrence.  Bottom-up, so no module-level memo table and
    # no dependence on the order in which the masses are listed.
    ways = [0] * (m + 1)
    ways[0] = 1
    for total in range(1, m + 1):
        ways[total] = sum(ways[total - mass] for mass in masses
                          if 0 < mass <= total)
    return ways[m]


if __name__ == '__main__':
    with open('counting_peptides.txt') as f:
        m = int(f.read())
    print(count_peptide(m))
import copy


def greedy_sorting(P):
    """Sort the signed permutation ``P`` by reversals with the greedy heuristic.

    For each position ``i`` (1-based) the element ``+/-i`` is brought into
    place by one reversal, then its sign is fixed by a second one if needed.

    Args:
        P: list of signed integers, a permutation of ``1..n`` up to sign.
            It is not modified.

    Returns:
        ``(approx_reversal_distance, permutations)``, where ``permutations``
        lists the permutation after every reversal and the distance is the
        number of reversals performed.

    Raises:
        ValueError: if some ``+/-i`` is missing from ``P``.
    """
    P = list(P)  # work on a copy so the caller's list is left untouched
    permutations = []
    for i in range(1, len(P) + 1):
        if abs(P[i - 1]) != i:
            index = next(
                (pos for pos, value in enumerate(P) if abs(value) == i), None)
            if index is None:
                raise ValueError("%d is missing from the permutation" % i)
            # A reversal flips both the order and the signs of the segment.
            P[i - 1:index + 1] = [-value
                                  for value in reversed(P[i - 1:index + 1])]
            permutations.append(copy.copy(P))
        if P[i - 1] == -i:
            # Length-one reversal: just flip the sign.
            P[i - 1] = i
            permutations.append(copy.copy(P))
    return len(permutations), permutations


if __name__ == '__main__':
    with open('GreedySorting.txt') as f:
        P = f.readline().strip()
    # Input looks like "(+4 -74 ...)": drop the parentheses, then parse.
    P = [int(i) for i in P[1:-1].split()]
    print(P)
    a, permutations = greedy_sorting(P)
def hamming_distance(str1, str2):
    """Return the number of positions at which ``str1`` and ``str2`` differ."""
    return sum(s1 != s2 for s1, s2 in zip(str1, str2))


def neighbors(pattern, d):
    """Return all DNA strings within Hamming distance ``d`` of ``pattern``.

    Args:
        pattern: a string over the alphabet A, C, G, T.
        d: maximum number of mismatches.

    Returns:
        A list of distinct strings of the same length as ``pattern``.
    """
    # Always return a list: returning the bare string for d == 0 made
    # callers that iterate the result see individual characters.  An empty
    # pattern has only itself as neighbour (and would otherwise recurse
    # without end).
    if d == 0 or not pattern:
        return [pattern]
    if len(pattern) == 1:
        return ["A", "C", "G", "T"]
    neighbor = []
    suffix = pattern[1:]
    for text in neighbors(suffix, d):
        if hamming_distance(suffix, text) < d:
            # Budget left: the first symbol may be anything.
            neighbor.extend(x + text for x in "ACGT")
        else:
            # Budget spent on the suffix: the first symbol must match.
            neighbor.append(pattern[0] + text)
    return neighbor


def find_frequent(string, k, d):
    """Return the most frequent k-mers of ``string`` with up to ``d`` mismatches.

    A candidate's frequency is the number of k-mer windows of ``string``
    within Hamming distance ``d`` of it.  Assumes ``string`` is over
    A, C, G, T.

    Returns:
        A list of all candidates with maximal frequency (order
        unspecified); empty when ``string`` has no k-mer window.
    """
    # A window w is within distance d of a candidate exactly when the
    # candidate is in neighbors(w, d), so tallying neighbours gives the
    # frequencies without comparing every candidate against every window.
    counts = {}
    for i in range(len(string) - k + 1):
        for candidate in neighbors(string[i:i + k], d):
            counts[candidate] = counts.get(candidate, 0) + 1
    if not counts:
        return []
    best = max(counts.values())
    return [word for word, count in counts.items() if count == best]


if __name__ == "__main__":
    with open('frequent_words_mismatch.txt') as f:
        data = f.read().split()
    print(*find_frequent(data[0], int(data[1]), int(data[2])))
def peptide_encoding_problem(dna, peptide, codon_table=None):
    """Return the substrings of ``dna`` that encode ``peptide``.

    A substring encodes the peptide when either it or its reverse
    complement, transcribed to RNA, translates to ``peptide``.

    Args:
        dna: DNA text.
        peptide: amino-acid string to look for.
        codon_table: mapping from RNA codon to amino acid (falsy value for
            stop codons).  Defaults to the module-level ``rna_codons``
            table loaded by the script entry point.

    Returns:
        The matching substrings, in order of their position in ``dna``.
    """
    size = 3 * len(peptide)
    sequence = []
    for i in range(len(dna) - size + 1):
        window = dna[i:i + size]
        forward = protein_translation(dna_rna(window), codon_table)
        if forward == peptide or protein_translation(
                dna_rna(reverse_sequence(window)), codon_table) == peptide:
            sequence.append(window)
    return sequence


def protein_translation(rna, codon_table=None):
    """Translate ``rna`` codon by codon until a stop codon.

    Translation also stops at a codon that is missing from the table,
    which covers a trailing partial codon.

    Args:
        rna: RNA text.
        codon_table: see ``peptide_encoding_problem``.
    """
    if codon_table is None:
        # Fall back to the table loaded under ``__main__``.
        codon_table = rna_codons
    protein = []
    for i in range(0, len(rna), 3):
        amino = codon_table.get(rna[i:i + 3])
        if not amino:
            break
        protein.append(amino)
    return "".join(protein)


def dna_rna(dna):
    """Transcribe DNA to RNA by replacing every T with U."""
    return dna.replace('T', 'U')


def reverse_sequence(seq):
    """Return the reverse complement of ``seq``.

    Symbols other than A, C, G, T are dropped, as before.
    """
    pairs = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    return ''.join(pairs[base] for base in reversed(seq) if base in pairs)


if __name__ == '__main__':
    with open('peptide_encoding.txt') as f:
        data = f.read().split()
    rna_codons = dict()
    with open('codon_table.txt') as f:
        for i in f:
            i = i.split()
            if len(i) > 1:
                rna_codons[i[0]] = i[1]
            else:
                # Stop codons have no amino-acid column.
                rna_codons[i[0]] = ""
    for i in peptide_encoding_problem(data[0], data[1]):
        print(i)
def complement(x):
    """Return the complementary nucleotide of ``x`` (KeyError if not A/C/G/T)."""
    return {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}[x]


def reversecomplement(x):
    """Return the reverse complement of the DNA string ``x``."""
    return ''.join(complement(base) for base in reversed(x))


def hamming_distance(str1, str2):
    """Return the number of positions at which ``str1`` and ``str2`` differ."""
    return sum(s1 != s2 for s1, s2 in zip(str1, str2))


def neighbors(pattern, d):
    """Return all DNA strings within Hamming distance ``d`` of ``pattern``.

    Args:
        pattern: a string over the alphabet A, C, G, T.
        d: maximum number of mismatches.

    Returns:
        A list of distinct strings of the same length as ``pattern``.
    """
    # Always return a list: returning the bare string for d == 0 made
    # callers that iterate the result see individual characters.  An empty
    # pattern has only itself as neighbour (and would otherwise recurse
    # without end).
    if d == 0 or not pattern:
        return [pattern]
    if len(pattern) == 1:
        return ["A", "C", "G", "T"]
    neighbor = []
    suffix = pattern[1:]
    for text in neighbors(suffix, d):
        if hamming_distance(suffix, text) < d:
            # Budget left: the first symbol may be anything.
            neighbor.extend(x + text for x in "ACGT")
        else:
            # Budget spent on the suffix: the first symbol must match.
            neighbor.append(pattern[0] + text)
    return neighbor


def find_frequent(string, k, d):
    """Return the most frequent k-mers with mismatches and reverse complements.

    A candidate's score is the number of k-mer windows of ``string``, plus
    the number of reverse-complemented windows, that lie within Hamming
    distance ``d`` of it.  Assumes ``string`` is over A, C, G, T.

    Returns:
        A list of all candidates with maximal score (order unspecified);
        empty when ``string`` has no k-mer window.
    """
    words = [string[i:i + k] for i in range(len(string) - k + 1)]
    words += [reversecomplement(word) for word in words]

    # A word w is within distance d of a candidate exactly when the
    # candidate is in neighbors(w, d), so tallying neighbours gives the
    # scores without comparing every candidate against every word.
    counts = {}
    for word in words:
        for candidate in neighbors(word, d):
            counts[candidate] = counts.get(candidate, 0) + 1
    if not counts:
        return []
    best = max(counts.values())
    return [word for word, count in counts.items() if count == best]


if __name__ == "__main__":
    with open('frequent_words_mismatch_complements.txt') as f:
        data = f.read().split()
    print(*find_frequent(data[0], int(data[1]), int(data[2])))
def hamming_distance(str1, str2, d):
    """Return 1 if ``str1`` and ``str2`` differ in at most ``d`` positions.

    Returns 0 as soon as more than ``d`` mismatches have been seen.  Only
    the common prefix of the two sequences is compared.
    """
    counter = 0
    for s1, s2 in zip(str1, str2):
        if s1 != s2:
            counter += 1
            if counter > d:
                return 0
    return 1


def make_word(num, k):
    """Return ``num`` written in base 4, left-padded with zeros to ``k`` digits."""
    digits = ''
    while num > 0:
        num, remainder = divmod(num, 4)
        digits = str(remainder) + digits
    return "0" * (k - len(digits)) + digits


def find_frequent(string, k, d):
    """Return the most frequent k-mers with up to ``d`` mismatches.

    ``string`` is the text encoded as base-4 digit characters
    (``A=0, C=1, T=2, G=3``), either as a ``str`` or as a list of one-char
    strings.  All ``4 ** k`` candidate k-mers are scored by the number of
    windows of ``string`` within Hamming distance ``d``.  The best-scoring
    candidates are returned as nucleotide strings, in increasing order of
    their base-4 code.
    """
    words = [string[i:i + k] for i in range(len(string) - k + 1)]

    best = 0
    result = []
    # range(4 ** k) covers every candidate; stopping at 4 ** k - 1 skipped
    # the last one (the all-"3", i.e. all-G, word).
    for number in range(4 ** k):
        candidate = make_word(number, k)
        count = sum(hamming_distance(candidate, word, d) for word in words)
        if count > best:
            best = count
            result = [candidate]
        elif count == best:
            result.append(candidate)

    decode = {"0": "A", "1": "C", "2": "T", "3": "G"}
    return ["".join(decode[digit] for digit in word) for word in result]
def string_reconstruction_from_read_pairs(patterns, d):
    """Reconstruct a string from its ``k|k`` read pairs with gap ``d``.

    ``patterns`` is an iterable of ``"READ1|READ2"`` strings.
    """
    return genome_path_problem(
        eulerian_path_problem(debruijn_from_read_pairs(patterns)), d)


def genome_path_problem(path, d):
    """Spell the string described by a path of paired (k-1)-mers.

    ``path`` is a sequence of ``(first, second)`` tuples.  The result is
    the first ``k + d`` symbols spelled by the first components, followed
    by the whole string spelled by the second components.  The path is
    expected to hold at least ``d + 2`` nodes.  An empty path yields an
    empty string.
    """
    if not path:
        return ""
    head = path[0][0] + "".join(pair[0][-1] for pair in path[1:d + 2])
    tail = path[0][1] + "".join(pair[1][-1] for pair in path[1:])
    return head + tail


def eulerian_path_problem(dict):
    """Return an Eulerian path of the graph as a list of nodes.

    ``dict`` maps each node to the list of its successors.  The walk starts
    at the node with more outgoing than incoming edges.  If the graph is
    balanced it starts at the smallest key.  The adjacency lists are
    consumed in place.  An empty graph yields an empty list.
    """
    if not dict:
        return []

    # surplus = out-degree - in-degree; an Eulerian path must start at the
    # node whose surplus is positive.
    surplus = {}
    for node, successors in dict.items():
        surplus[node] = surplus.get(node, 0) + len(successors)
        for successor in successors:
            surplus[successor] = surplus.get(successor, 0) - 1
    sources = sorted(node for node, extra in surplus.items() if extra > 0)
    start = sources[0] if sources else min(dict)

    stack = [start]
    path = []
    while stack:
        unused = dict.get(stack[-1])
        if unused:
            stack.append(unused.pop(0))
        else:
            # Dead end (or a node with no outgoing edges): emit it.
            path.append(stack.pop())
    return path[::-1]


def paired_prefix(pair):
    """Return the pair of prefixes (all but the last symbol) of both reads."""
    return (pair[0][:-1], pair[1][:-1])


def paired_suffix(pair):
    """Return the pair of suffixes (all but the first symbol) of both reads."""
    return (pair[0][1:], pair[1][1:])


def debruijn_from_read_pairs(read_pairs):
    """Build the paired de Bruijn graph of ``"READ1|READ2"`` strings.

    Returns a dict mapping each paired prefix to the list of paired
    suffixes it is connected to, in input order.
    """
    graph = {}
    for read_pair in read_pairs:
        pair = read_pair.split('|')
        graph.setdefault(paired_prefix(pair), []).append(paired_suffix(pair))
    return graph
def generate_contigs_from_reads(kmers):
    """Return the sorted contigs of the de Bruijn graph of ``kmers``.

    A contig is spelled by a path that starts at a node that is not
    1-in-1-out and runs through 1-in-1-out nodes until the next node that
    is not.  Cycles made only of 1-in-1-out nodes are not reported.
    """
    graph = debrujin_graph_from_kmers(kmers)
    degrees = graph_degrees(graph)
    contigs = []

    for v, successors in graph.items():
        if degrees[v] == [1, 1]:
            continue
        for w in successors:
            contig = v + w[-1]
            # Keep extending while the current node is non-branching.
            while degrees[w] == [1, 1]:
                w = graph[w][0]
                contig += w[-1]
            contigs.append(contig)
    return sorted(contigs)


def debrujin_graph_from_kmers(patterns):
    """Build the de Bruijn graph of ``patterns`` as an adjacency dict.

    Every k-mer contributes one edge from its prefix to its suffix.  Only
    prefixes appear as keys.
    """
    graph = {prefix(kmer): [] for kmer in patterns}
    for kmer in patterns:
        graph[prefix(kmer)].append(suffix(kmer))
    return graph


def graph_degrees(graph):
    """Return ``{node: [in_degree, out_degree]}`` for every node of ``graph``.

    Nodes that only occur as successors are included with out-degree 0.
    """
    degrees = {}
    for node, neighbors in graph.items():
        degrees.setdefault(node, [0, 0])[1] = len(neighbors)
        for neighbor in neighbors:
            degrees.setdefault(neighbor, [0, 0])[0] += 1
    return degrees


def suffix(string):
    """Return ``string`` without its first symbol."""
    return string[1:]


def prefix(string):
    """Return ``string`` without its last symbol."""
    return string[0:-1]


def suffix_composition(k, text, uniq=False):
    """Return the sorted (k-1)-mers starting each k-window of ``text``.

    With ``uniq=True`` duplicates are removed.
    """
    kmers = [text[i:i + k - 1] for i in range(len(text) + 1 - k)]
    return sorted(set(kmers)) if uniq else sorted(kmers)


if __name__ == "__main__":
    # Close the input file deterministically instead of leaking the handle.
    with open('contig_generation.txt') as reads_file:
        data = reads_file.read().split()
    print(*generate_contigs_from_reads(data))
# StringReconstruction(Patterns)
#     dB ← DeBruijn(Patterns)
#     path ← EulerianPath(dB)
#     Text ← PathToGenome(path)
#     return Text
def string_reconstruction_problem(patterns):
    """Reconstruct a string from its k-mer composition ``patterns``."""
    return genome_path_problem(
        eulerian_path_problem(debrujin_graph_from_kmers(patterns)))


def debrujin_graph_from_kmers(patterns):
    """Build the de Bruijn graph of ``patterns`` as an adjacency dict.

    Every k-mer contributes one edge from its prefix to its suffix.  Only
    prefixes appear as keys.
    """
    graph = {prefix(kmer): [] for kmer in patterns}
    for kmer in patterns:
        graph[prefix(kmer)].append(suffix(kmer))
    return graph


def genome_path_problem(kmers, apppend_last=True):
    """Spell the string described by a sequence of overlapping k-mers.

    The first symbol of every k-mer is concatenated.  When ``apppend_last``
    is true the remaining symbols of the last k-mer are appended as well.
    An empty sequence yields an empty string.
    """
    if not kmers:
        return ''
    genome = ''.join(kmer[0] for kmer in kmers)
    if apppend_last:
        genome += kmers[-1][1:]
    return genome


def eulerian_path_problem(dict):
    """Return an Eulerian path of the graph as a list of nodes.

    ``dict`` maps each node to the list of its successors.  The walk starts
    at the first node with one more outgoing than incoming edge.  If the
    graph is balanced it starts at the smallest key instead of failing.
    The adjacency lists are consumed in place.  An empty graph yields an
    empty list.
    """
    if not dict:
        return []
    balanced_count = get_balance_count(dict)
    start = next(
        (node for node, balance in balanced_count.items() if balance == -1),
        None)
    if start is None:
        start = min(dict)

    stack = [start]
    path = []
    while stack:
        unused = dict.get(stack[-1])
        if unused:
            stack.append(unused.pop(0))
        else:
            # Dead end (or a node with no outgoing edges): emit it.
            path.append(stack.pop())
    return path[::-1]


def suffix_composition(k, text, uniq=False):
    """Return the sorted (k-1)-mers starting each k-window of ``text``.

    With ``uniq=True`` duplicates are removed.
    """
    kmers = [text[i:i + k - 1] for i in range(len(text) + 1 - k)]
    return sorted(set(kmers)) if uniq else sorted(kmers)


def get_balance_count(adj_list):
    """Return ``{node: in_degree - out_degree}`` for every node.

    Nodes that only occur as successors are included.
    """
    balanced_count = dict.fromkeys(adj_list.keys(), 0)
    for node, successors in adj_list.items():
        for out in successors:
            balanced_count[node] -= 1
            balanced_count[out] = balanced_count.get(out, 0) + 1
    return balanced_count


def suffix(string):
    """Return ``string`` without its first symbol."""
    return string[1:]


def prefix(string):
    """Return ``string`` without its last symbol."""
    return string[0:-1]


if __name__ == "__main__":
    # Close the input file deterministically instead of leaking the handle.
    with open('StringReconstructionProblem.txt') as patterns_file:
        data = patterns_file.read().split()
    # The first token is k; the k-mers follow.
    print(string_reconstruction_problem(data[1:]))
CTTTAAAACCAGTCTTATGAACAATTGACGCCTTTCCTACGGCGAAAAGGTGTCGATTCTCAACCCGCATAGCCTCCTTGCCCTCGCCAG TAAGTTGTCGCGTTGGGATGTTCCGGATGCCTCAAACTAGCATTCAAATTAGTCGCCAAGAATATTGGTTAACAGAATATCAAGGCAGCT TCCCCTCGAGCATGTCTGTGGCTTCACATTGCATGGAAGGTTTACTGCGGGGCCTACTAAAGGCACCCAGGCAGACAAAAGCGCACAATA ATGATAGACGTGTCAGGCGACGGCATTAGTGCCAATTGTTAGAACAGTTCACTTATGTCACTCGTCCAGAGTCCTAAGTCATGGAGAATT CAACTACTGCAGTTGGATGACCAACACAGAACTGCATTTCTGTTTCTGCGTACGAGCTAACCTCAAGTTGCACTCATAATAACGACGCTT CTAAAATGCCCTGACCAAACAAGATTATCCGCCAATCTGCGGTTAGCTTCCTTGCGCCTGCGCATTTACATTTCTCGTATGGGGTAGCAC GCGTTCGTATGCAACACAGCGTTAGGGACTAGGAGGTGATTCTCAACCAGATAAGCGTGCAAGTGTGCGCGGTCCCTTGTAGAGACGCTT TTACGCCATGGTTCGAATGGACCTGAATGTGGCGGATGCTACTGAGCATCCATTATGATAAAGGGCGTGCTTTGCGATTCATACAAGCGC CTACATGCTATTGTAAACCTAAAGAATTCCGTACCGTGATCTCAAACAACAGCTTTGCGAAGTATGCTTACTTCACCAATCGGCGGCTCT CCCAGGCATACCCGAAACACTAATATTCTCCGTATATCGACTGTTGTGACGACGAAGTGTTCATCGGCTATTGCCTACGTGCCAGACGCT ACTTGCTAAGCAAAAAGTACTAAGCGTCGTCTTTTGTCACTGCTTAGAGAGCGAAAGGGTCCCCTGACTGCTGGCGCCCAATAGGGGGCC TCCAGGAGCAATGTGGCACAAAGGTAAGTGACCCGTAGCCGGCGACAAAAGGAATGCAAAGGCGTGAAACGTCTGCTACATGCTTTTCCA ACCACTTGCTAGTGGGGGATTACCTGTCCTTCGGTAATCTGACCATCCTGAATTGGACTGCGCATCATCGGTACGCGGTACCTCCCTTAG CCCTGCTAATACTCGTCAAAATAACGCTGCGTAGGCCCCTGAGCTGCAGAGTCTAGCATTTCCTCCATTCGAACAAATATAACGATATAA AATACAAAGTACTTCTGCCCAACTTAGGGCTAGTGCAATGAATGGCTACAGGCGGAGAAACGCCCAAGGCATCGAAGCGCGCGAGTGCCT GTTACCCGCTCAGACCTTCATCTACCGGAATAAATTGACCCGCCGGTCATGAATTCCGGTATCGTTCTGCCCATCACCTAACGCGTAAAC CCACTCGGCGAAACGGCTAAGGCGTCAAATTCACCGCATTCAAAGACGCACCCACAGAGGGCCCCTAGCGCCGTTTGAGGCTTCGTACAC TGGTACCGGTGTAGTGCCTTTAGTGTCAAGAACCTTGCTTAGAACGCCTCCTCCGACCCAAGATTACAGCTAACATCTATGCTAAGTGTC AGGGACGGGCGATGCCTCGGCTGCGTCTGTTGGATTGTTCGAACTGACGCATCCTATTCTCCTTGCTTACCGATTGCCATATAGGGATGC CCAACGGAGGATATAGGCTAGGTAAGATAATGTCAGTAATTTGCAGTGTGACGGCAAACGTTTGATCTTGGGACAGCTTTCATCTCTCGG TATCGGGGAACCGTCAAGGACGCGGTCGCAAAGTTTCGCGTACCTCCCTTCGCGGGGACCGCCCAAGGTCACTCCTCCGTGACGATAAGC 3 | -------------------------------------------------------------------------------- 
def two_break_on_genome(genome, i, j, k, l):
    """Apply the 2-break ``(i, j, k, l)`` to ``genome``.

    ``genome`` is a list of chromosomes, each a list of signed block ids.
    The colored edges ``(i, j)`` and ``(k, l)`` are replaced by ``(i, k)``
    and ``(j, l)``, and the resulting genome is returned in the same form.
    """
    g = colored_edges(genome)
    g = two_break_on_genome_graph(g, i, j, k, l)
    return graph_to_genome(g)


def two_break_on_genome_graph(g, i, j, k, l):
    """Return the edge list ``g`` after the 2-break ``(i, j, k, l)``.

    Edges ``(i, j)`` and ``(k, l)`` are dropped in either orientation, and
    ``(i, k)`` and ``(j, l)`` are appended.  ``g`` itself is not modified.
    """
    rem = ((i, j), (j, i), (k, l), (l, k))
    bg = [edge for edge in g if edge not in rem]
    bg.append((i, k))
    bg.append((j, l))
    return bg


def colored_edges(genome):
    """Return the colored edges of ``genome`` as a list of node pairs.

    Each edge joins the exit node of a block to the entry node of the next
    block on the same (circular) chromosome.
    """
    g = []
    for chromosome in genome:
        nodes = chromosome_to_cycle(chromosome)
        for j in range(len(nodes) // 2):
            head = nodes[1 + 2 * j]
            tail = nodes[(2 + 2 * j) % len(nodes)]  # wraps to the first node
            g.append((head, tail))
    return g


def chromosome_to_cycle(p):
    """Return the node sequence of the chromosome ``p``.

    A block ``+i`` becomes the nodes ``2i-1, 2i``; every other block value
    ``i`` becomes ``-2i, -2i-1``.
    """
    nodes = []
    for i in p:
        if i > 0:
            nodes.extend((2 * i - 1, 2 * i))
        else:
            nodes.extend((-2 * i, -2 * i - 1))
    return nodes


def cycle_to_chromosome(nodes):
    """Return the signed chromosome encoded by the node sequence ``nodes``.

    Inverse of ``chromosome_to_cycle``: an ascending node pair is a
    positive block, a descending pair a negative one.  The block id is half
    the larger node of the pair.
    """
    p = []
    for j in range(len(nodes) // 2):
        first, second = nodes[2 * j], nodes[2 * j + 1]
        # The block id comes from the node values, not from the position
        # of the pair in the sequence.
        if first < second:
            p.append(second // 2)
        else:
            p.append(-(first // 2))
    return p


def graph_to_genome(g):
    """Return the genome whose colored edges are ``g``.

    ``g`` is a list of node pairs.  Each cycle is walked from the first
    node of its first unvisited edge, emitting one signed block per step,
    until the walk reaches the other node of the starting block.

    Raises:
        ValueError: if a walk does not close, i.e. ``g`` is not the
            colored-edge set of a genome.
    """
    adj = {}
    for a, b in g:
        adj[a] = b
        adj[b] = a

    genome = []
    visited = set()
    for edge in g:
        start = edge[0]
        if start in visited:
            continue
        # The walk is finished when it arrives at the other end of the
        # block it started from.
        closing = start - 1 if start % 2 == 0 else start + 1
        chromosome = []
        orig = start
        # A cycle has at most len(g) colored edges, so this bound replaces
        # the former hard-coded 100-step bailout that returned None.
        for _ in range(len(g)):
            visited.add(orig)
            if orig % 2 == 0:
                chromosome.append(orig // 2)
            else:
                chromosome.append(-((orig + 1) // 2))
            dest = adj[orig]
            visited.add(dest)
            if dest == closing:
                break
            # Cross the block: leave through the partner of the entry node.
            orig = dest - 1 if dest % 2 == 0 else dest + 1
        else:
            raise ValueError("colored edges do not close into a cycle")
        genome.append(chromosome)
    return genome


def format_sequence(s):
    """Return every chromosome of ``s`` formatted with ``permutation_list_to_str``."""
    return [permutation_list_to_str(p) for p in s]


def permutation_list_to_str(p):
    """Format a signed permutation as ``(+1 -2 ...)``.

    Positive entries get an explicit ``+``; zero is written as ``0``.
    """
    ps = []
    for i in p:
        if i > 0:
            ps.append('+' + str(i))
        elif i == 0:
            ps.append('0')
        else:
            ps.append(str(i))
    return '(' + ' '.join(ps) + ')'


if __name__ == '__main__':
    with open('2BreakOnGenome.txt') as f:
        genome = [list(map(int, f.readline().strip()[1:-1].split(' ')))]
        [i, j, k, l] = list(map(int, f.readline().strip().split(', ')))
    genome = two_break_on_genome(genome, i, j, k, l)
    print(''.join(format_sequence(genome)))
CAAGCTGGGTAGAGTCTCAAATCTGGATGCTTGGCGGGCGTCTGGGGCTTGGTCTGGTAACTTCAAGTACTGCTACCGCACGGGGTCTGCGACTTTGAGTTATTGGCACTTCTATATGGAGCCATGGAGAAGTTATCACACTGAGCGAGTTCCTGCAGTAGACGTAATAGTGACGAGGCTCC 8 | ATGTTAATCTGTGTTAGCTTATGAAGCGGGTGCACTATTTGCGCCAATTCCACGGGATCGACAATGTATCCCCTCTAAGTGCAAAAGTGGTAGAGTCTCGTCTCTCACTGTGGGGGGTTCACGAATTGTCTATTATGTCATACTACTACATTGAGAGTTTAATCACGCTAGTAATGTAACGC 9 | CGCGGGTTAACGTTATCTTGTCATCCGAGTAGGCGTGAGTCTCGAATAATAAGGTATACGAAGCCACCTGGTGACGGTTGGATGGTGATCACTCAGATTGCCAGGGTACTTTTTTAAAGTGCAGATACGATCCATATGTCGATTCATAGTTCTGGGTGCAAGGTCCCCGGCTGCGACTGCTT 10 | CCGACCAAACTAGAAGACTGGAATATTGGGCTTCGGCTATCTGATGCCGGGTTTTGTGCTACTAGAAAACTCCAGCGACTGACTTATCTTGAGTGCTGGCTCGCGGACCAAAACCTAGGGTTAGGTCTCGGCTTCAGATTGGCACATTGTAAACCCGCCATAATTGTGACGTCGTTTAGGCA 11 | GGGCCGTAGGCGCAAATTAAGGTGGTACGCGCGAGGACCAGAACCGCATAATTGGTCAAGCTGAAGGGGGCGCAAGGTCACGAGAATGGTCGTGATGCTAGACCTAGGGTAGAGTCCACGCCGAATGACATGGTTTTACAGACCTGCTTAGAGGATTAATAAGCGATCAAGTAACGGTGCGA 12 | CCCCACTGGCACTCTCCCGGATCGGCCATCCTCTCTAGTTAGTTGCCTGATCGTTCCCGCGCAGATTTAGCGCACGGTATACAGCGGAACGCTTCGTTGCCTCGTTGTGGTTGGTAGGCACGAGTCTCGCATAAAGCAGTTACCATTCACAACTCCTCACGGCACTCATGCCAAGTGTCTTC 13 | AGGTATTGCTCTGTTCATTAATCTCAGGCACGTGAACATTATAGAGGCCCATGGTGTCATCGCGATTTGGGGGAACCAGTGCACCCTGGGGCACTGGTTCGGGATTAAATAGTTAAGAGTCTCGTGTCTCGGGTTTAGGCTGAGGCCCACCTCTTACCCACTTTCAATAACCTGAATGGCGA 14 | CCAGATCATTATGGGGCCGTTGCGCTGTTACTAGGAGCATAGTTAAAAATCATATCGTATTCTATCGGGCCCCTCCACATAGGGTAGATAGTCGCTTTGCCAGAGTCCCCGACATTCCGTCGTTCGCATTTCTTAACATTCGGGGAATCTGACGAGGTCTAGCAATAAGCGATGCCGTTCGC 15 | ACTCTTCGTGCCGCGGTAGCACACACTGTATTATACTACCAGCCAGGGAAAGTCCTAGCACGCCGGCTTAACCACGTTAACCAGAACTACTTAGGCATCGATTCGTTGGTGCAAGATCAGCACACAAGAGCCAGCTCGAGTCGGCGTTAAGTAGAGTCTCGAAGCTTAGGATACCTAAACGG 16 | GCTCACTGTACTAACCTCCACTTGCGCTGTCTCCATTGCCGCTTGAGGCCCGTTAGGCTTCTTCCTCCAAGAAGACATGGGTATGGAGTTTCTCGAGCTATTCGAGTTACGCTCCGCGGAGGGGGCAAATTTAATTTAGAGTCTCGTGCGTTAGCGAAACACACGCTCTTGATTTTTCGATA 17 | 
# StringSpelledByGappedPatterns(GappedPatterns, k, d)
#     first_patterns ← the sequence of initial k-mers from GappedPatterns
#     second_patterns ← the sequence of terminal k-mers from GappedPatterns
#     PrefixString ← StringSpelledByPatterns(first_patterns, k)
#     SuffixString ← StringSpelledByPatterns(second_patterns, k)
#     for i = k + d + 1 to |PrefixString|
#         if the i-th symbol in PrefixString does not equal the (i - k - d)-th symbol in SuffixString
#             return "there is no string spelled by the gapped patterns"
#     return PrefixString concatenated with the last k + d symbols of SuffixString


def string_spelled_by_gapped_patterns(path, k, d):
    """Spell the string described by a path of gapped pattern pairs.

    ``path`` is a sequence of ``(first, second)`` string pairs.  The
    strings spelled by the first and by the second components must agree
    where they overlap, i.e. from offset ``k + d`` of the first string.  If
    they do not, an explanatory message is returned instead of a genome.
    """
    first_patterns = [n for n, m in path]
    second_patterns = [m for n, m in path]
    prefix_string = string_spelled_by_patterns(first_patterns, k)
    suffix_string = string_spelled_by_patterns(second_patterns, k)
    # The pseudocode is 1-indexed: its first checked position k + d + 1 is
    # index k + d here.  Starting one later skipped the first overlapping
    # symbol.
    for i in range(k + d, len(prefix_string)):
        if prefix_string[i] != suffix_string[i - k - d]:
            return "There is no string spelled by the gapped patterns"
    return prefix_string + suffix_string[-k - d:]


def string_spelled_by_patterns(patterns, k):
    """Return the first pattern followed by the last symbol of each later one.

    ``k`` is accepted for symmetry with the pseudocode and is not used.
    """
    return patterns[0] + "".join(pattern[-1] for pattern in patterns[1:])


def debruijn_from_read_pairs(read_pairs):
    """Build the paired de Bruijn graph of ``"READ1|READ2"`` strings.

    Returns a dict mapping each paired prefix to the list of paired
    suffixes it is connected to, in input order.
    """
    graph = {}
    for read_pair in read_pairs:
        pair = read_pair.split('|')
        graph.setdefault(paired_prefix(pair), []).append(paired_suffix(pair))
    return graph


def paired_prefix(pair):
    """Return the pair of prefixes (all but the last symbol) of both reads."""
    return (pair[0][:-1], pair[1][:-1])


def paired_suffix(pair):
    """Return the pair of suffixes (all but the first symbol) of both reads."""
    return (pair[0][1:], pair[1][1:])


def eulerian_path_problem(dict):
    """Return an Eulerian path of the graph as a list of nodes.

    ``dict`` maps each node to the list of its successors.  The walk starts
    at the first node with one more outgoing than incoming edge.  If the
    graph is balanced it starts at the smallest key instead of failing.
    The adjacency lists are consumed in place.  An empty graph yields an
    empty list.
    """
    if not dict:
        return []
    balanced_count = get_balance_count(dict)
    start = next(
        (node for node, balance in balanced_count.items() if balance == -1),
        None)
    if start is None:
        start = min(dict)

    stack = [start]
    path = []
    while stack:
        unused = dict.get(stack[-1])
        if unused:
            stack.append(unused.pop(0))
        else:
            # Dead end (or a node with no outgoing edges): emit it.
            path.append(stack.pop())
    return path[::-1]


def suffix_composition(k, text, uniq=False):
    """Return the sorted (k-1)-mers starting each k-window of ``text``.

    With ``uniq=True`` duplicates are removed.
    """
    kmers = [text[i:i + k - 1] for i in range(len(text) + 1 - k)]
    return sorted(set(kmers)) if uniq else sorted(kmers)


def get_balance_count(adj_list):
    """Return ``{node: in_degree - out_degree}`` for every node.

    Nodes that only occur as successors are included.
    """
    balanced_count = dict.fromkeys(adj_list.keys(), 0)
    for node, successors in adj_list.items():
        for out in successors:
            balanced_count[node] -= 1
            balanced_count[out] = balanced_count.get(out, 0) + 1
    return balanced_count


def suffix(string):
    """Return ``string`` without its first symbol."""
    return string[1:]


def prefix(string):
    """Return ``string`` without its last symbol."""
    return string[0:-1]


if __name__ == "__main__":
    # Close the input file deterministically instead of leaking the handle.
    with open('string_spelled_by_gapped_patterns.txt') as pairs_file:
        data = pairs_file.read().split()
    # The first two tokens are k and d; the read pairs follow.
    print(string_spelled_by_gapped_patterns(
        eulerian_path_problem(debruijn_from_read_pairs(data[2:])),
        int(data[0]), int(data[1])))
57-57-147-129-129-66-65-163-97-128-114-115-113-129 57-57-147-129-129-131-97-66-97-128-114-115-113-129 66-97-128-114-115-113-129-57-57-147-129-129-131-97 66-97-131-129-129-147-57-57-129-113-115-114-128-97 97-128-114-115-113-129-57-57-147-129-129-131-66-97 97-128-114-115-113-129-57-57-147-129-129-131-97-66 97-131-129-129-147-57-57-129-113-115-114-128-97-66 97-163-131-129-129-81-66-57-57-129-113-115-114-128 113-129-57-57-147-129-129-66-65-163-97-128-114-115 113-129-57-57-147-129-129-131-97-66-97-128-114-115 114-115-113-129-57-57-147-129-129-131-97-66-97-128 115-113-129-57-57-147-129-129-66-65-163-97-128-114 115-113-129-57-57-147-129-129-131-97-66-97-128-114 115-113-129-57-57-147-129-129-131-163-97-128-57-57 128-97-163-131-129-129-81-66-57-57-129-113-115-114 128-97-163-131-129-129-147-57-57-129-113-115-57-57 128-114-115-113-129-57-57-147-129-129-131-97-66-97 129-57-57-147-129-129-66-65-163-97-128-114-115-113 129-57-57-147-129-129-131-97-66-97-128-114-115-113 129-113-115-57-57-128-97-163-131-129-129-147-57-57 129-129-131-163-97-128-114-115-113-129-57-57-66-81 129-129-131-163-97-128-114-115-113-129-57-57-81-66 129-129-147-57-57-129-113-115-114-128-97-66-97-131 129-129-147-57-57-129-113-115-114-128-97-163-65-66 129-129-147-57-57-129-113-115-114-128-97-163-66-65 129-131-163-97-128-114-115-113-129-57-57-66-81-129 129-147-57-57-129-113-115-114-128-97-66-97-131-129 129-147-57-57-129-113-115-114-128-97-163-65-66-129 131-129-129-147-57-57-129-113-115-114-128-97-66-97 131-129-129-147-57-57-129-113-115-114-128-97-97-66 131-163-97-128-114-115-113-129-57-57-66-81-129-129 147-57-57-129-113-115-114-128-97-66-97-131-129-129 147-57-57-129-113-115-114-128-97-163-65-66-129-129 163-97-128-114-115-113-129-57-57-66-81-129-129-131 163-97-128-114-115-113-129-57-57-147-129-129-65-66 163-97-128-114-115-113-129-57-57-147-129-129-66-65 163-131-129-129-81-66-57-57-129-113-115-114-128-97 57-57-129-113-115-114-128-97-163-65-66-129-129-66-81 57-57-129-113-115-114-128-97-163-65-66-129-129-81-66 
66-97-128-114-115-113-129-57-57-147-129-129-65-66-97 115-113-129-57-57-147-129-129-66-65-163-97-128-57-57 115-113-129-57-57-147-129-129-131-97-66-97-128-57-57 128-97-163-131-129-129-81-66-57-57-129-113-115-57-57 129-129-147-57-57-129-113-115-114-128-97-66-97-65-66 129-129-147-57-57-129-113-115-114-128-97-66-97-66-65 129-147-57-57-129-113-115-114-128-97-66-97-65-66-129 163-97-128-114-115-113-129-57-57-66-81-129-129-65-66 163-97-128-114-115-113-129-57-57-66-81-129-129-66-65 -------------------------------------------------------------------------------- /week1-2/clump_finding.txt: -------------------------------------------------------------------------------- 1 | GCGGTTATGCACCGTTCAAATTAGCAAACCACTAAGCGACGTAGTCTGGATTGATTTCTCCCTACCAGTGACCCAAGACGCGTTAGTGAGTTAAGTTCATATCCAGTACCTGCCGCCCTCTGTACTTGGGCGTCCGATTCGCATGCTTACTCAGGTGGAGGACACGATAATCTGATTAAACTGAGCTAAACCAGGTGGAACCAGAAACCAGGTGGGGAGTCTCGCTTCAAGCCGTTCTTGCGATCAAACCAGGTGGTCCATTATGAAACCAGGTGGCTAAACCAGGTGGTCCAGATCCTCGAATGATGTCGGTGCACATCAAAACCAGGTGGGGTGGTGGAACGTAAAACCAGGTGGCATAAACCAGGTGGGCCGGTTCGTAAACCAGGTGAAACCAGGTGGGGTGGAAACCAGGTGGGTTACAAATTACGTTGAGATGGCCCAAACCAGGTGGTGGGCTTCACCCATGTCAACAAACCACCCTATGGAACTAAACCAGGTGGAACCAGGTGGTGAAGGCTTATCCTCAGGAAAAACCAGGTGGAGGTGGTGAAATAAAACCAGGTGGACCAGGTGGATAACCCTCGCCTCGCTTCTCAACCGAGACCTGGATAAACCAGGTGGGGTGGTCCACCGATTTTTGAGACACTAGAAACCAGGTGGGCGGGGAAACCAGGTGGCAAACCAGGTGGGGTGGACGGAAACCAGGTGGATATGTCATAAAACCAAACCAGGTGGTGCACCCCCATGGTGTGTCTTATCCGTGCGTATAAACCAGGTGGTCGCACGGCTTCCACTTGCTGAGAATAGGCCCGCAGGGTCAGTGCCATGCCCTCCGTCACTCGATATGTGTTGTAAGAGTGGTTACCCCTTCATTGAAGTCGCCCACAGCCCCACCTGCATTGCTAGACTATCACCCTACAGTAGGCCTTTTCGCCTTCTTCAAGCAGCAATCTCTTATCCGCGGATGGGCGCGGCGAGCGTGGCGTCCCCGAACATTTTTACCTAACGTGTTTTGTTGGCCGCAAGCCTTCCCTCTAGTCCACCTCAGCCATTCAGCCTAGTAGCTTTCAAGCCGAGCCTTCCATATCTAATGGACCGTCCAGAATTTCACACGTTTCACAGGGCTGTGTTCGACCGCCCGTAATGCTGTTTCACAGGCGATCGCCTTGCGGTTTTTTCACAGATCGCAGCCGATGGACATGCCAACTCGATTTTCACAGAGTTTTTCACAGCGGTTTCACAGCACAGCAGTGATTGTTTCACAGCAATTTTCACTTTCACAGGGGCCCTTTTCACAGCTCAGGGCTCTTTTCACTTTCACAGTTT
CACAGCGCTCCTTTCACAGAGCGGGGAAATTTAAGGGAACACTCAAGGGAACAAGGGAACACACAAAGGGAACACAACACAACACATAAGGGAACACTTTCACAGAACACAAAAGTCCGAAATCATCAGCGGCGAAGGGATTTCACAGACAGACACTTTCACAGCGCATTTCACAGATACGTACTTTCACAGGCGTACTTTCACAGACTTTCACAGAGGACAAGCTCAATTTTCACAGACAGGCTGGATAAATTTCACAGCGGTAAGGGTTTCACAGCACACATAAGGGAACACGAATTTCACAGCAGGGAACACCTCTACGAGTAATCTATTACTCTACCTACTGAAGGGAACACACCGAAGACCTACTATTACCTATTACTCTTAAAGGGAACACATTACAAGGGAACACACTCTCTCGTCATATCTCACCTCTCTATTACTCTTAAGGGAACACCTTCTCGATCAACCTATTACTCTATGGAGATAGAGATATTCCAGACATATGGAGATAACATGGAGATATGGAGATAATGGAGATGGAGATAGCTCTTATATTTATCCTATGGAGATATGATACTATTAATGGAGATAATTCTAATGGAGATATAATTACTCTAAGAGGATGGGATCTCGGGCTATTACTCTAATGGAGATAAGCACTATTACTCTAGGAAATGGAGATATGTCAATGGAGATATGTAATGGAGATAGAGGGAGATGGAGTCGCCATTTCATAATCGCCATTTCATAGTTCAGGAATCGCCATTTCCGCCATTTCTAAGATGGAGTCGCCATTTCTACGTATGGAGATAGGATCGCCATTTCATACGACCCGTTGGATATCGCCATTTCCTCGCCATTTCTGGTGACATTTCTCGCCATTTCATTTCTGGAGATAGATGGATCTCGCCATTTCATAGGAATCGCCATTTCCACGTAGGGGGGGCCACAATCCGTAGGTCGGAATTCAGACTCGCCATTTCCCATCGCCATTTCTTCACCTGTATGCCGATCCCTTCGCCATTTCTCATGGAGATAACTCTCTCTCGCCATTTCTCGCCATTTCCATTTCACTCTCATTCGCCATCGCCATTTCCATTCGCCATTTCATCGCCATTTCTTCAGGATAAGATATCGCCATTTCGACTCTCATTCGCATACTGACTCTCATTCTCATCTCGCCATTTCTCATCTGACTCTCATCCTGGGGGAAACTTGCGACTCTCATCACACTTCCGTCGACTCTCATACTGGCGGATAGCATAGGAGCCATTTAAAGACTCTCATTCTCATTCGAGACTCTCATTCAAATCCTACGAGGACTCTCATATAGACTCTCATATCATTACGAGGACTCTCATATACGAGCCATGCATGTGGCGACGACTCTCATCTACGAGCCATGCAAGCAGAATCTACGAGCGACTCTCATTACGAGCCATGTGACCGTACGAGCCATGCATGCATGCCATGCTGACTCTCATCGAGTACGAGCCATGGAAGTTCTTGTTGGTTCGTAGCCCAAGAGCTGAAGTTACGAGCCTACGAGCCATGAAGTTACTTTTACGAGCCATGAAGCTTACGATACGAGCCATGCGAGCCATGCATCCGCGCTACGAGCCATGTTCCAGTACGAGCCATGTTAGTTGCTGAAGTTAAGTTTGGCGCTGAAGTTTGTACGAGCCATGTGCCCGCTGAAGTTTGTTGTACGAGCCATGCATGCTGAAGTTAATGGCTGAAGTTAGCGTTTGCGGGCAGATCCTCATTCTACGATACGAGCCATGCCATGCAGCTGAAGTTAAGTTGGGTTACGAGCCATGCGAGCCATGTGAAGTACGAGCCATGCTGGCTGAAGTTGTTTGTGCTGCTGAAGTTGCTCTTGTCTCTAGCTGAAGTTGCCAACAGGGCTGAAGCTGAAGTTTAAGCTGAAGTTGCGAGCAGGCTGAAGTTATCGGATTGGGGCTGAAGTTCAACCTCCCGTCCCCCCACACTATATT
CCCGTCCCCCCCCGCGCACGCGCCGTCTCCCGTCCCCCCTATCCCGTGCGCACGCGACGCGATCCCGTCCCCCCAGAGTGCGCGCACGCGTCCCCCTTCCCGTCCCCCTCTCCCGGGCGCACGCGTCGCTCAACATTTCCGCGCACGCGTCGCGCACGCGGGCGCACGCGGGTCCCGTCCCCCCCCCTCTTCGGCGCACGCGGAATTCCCGTCGCGCACGCGTCCCGTCCCGCGCACGCGTCGCGCACGCGACTGCCCTAACCAACAGTGCGCACGCGCCGGTAACCCGGTAACCCGGTAACCGCGCACGCGGGCGCACGCGCGTAACCCGCGCACGCGCCGCGCACGCGGCCCGGTTCCCGTCCCCCCCGGTAACCCGGTAACTCCCGTCCCCCGTAACCCGGTGCGCACGCGCCCGGCGCACGCGGAGCGCACGCGCCCCCCCCGGTAATAGCGCACGCGCCCGGGCGCACGCGCCCGGTAACCCGGTAACCCGGGCGCGCGCACGCGGCGGCGCACGCGGCGCACGCGGCGCACGCG 2 | 11 566 18 3 | 4 | -------------------------------------------------------------------------------- /week5-6/123.py: -------------------------------------------------------------------------------- 1 | CCAATTGTTGGCAACAAAGAATCGCTTATGCTAGGGTGACGTGCCAATCGACTGATTTGACTGGCCGGGGGATCGGCTGCGTAAAACCGGTGTCAGAATAAATAGTCATGGCCGGCGTCGACAGGCGCCCCGAGGGATAGGTAACGGGCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTGTCAACTACGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTGAGCCTGAGGCCCGTGAAGAAGCCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTGGTTCTGGGTGCATAGCCGCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTGACGCCACGAAGTGTCAACTAGTGTTGTCATGAGAGAGTTATTATAGCAGGCCTACTTGTAGGTAAATACACTCTAGGTTATTCGCTCTGCTCCCCTCCTGCGTAACCCCTACCGTGAAGAAGCGGTCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTGTGTTACTACCCATAGCGTCGGCCTCGTGAAGAAGCGGTTCTGGGCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTTGCATAGCCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGACGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTAGTGTCAACTCGGACGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTCGCCACGAAGTGTCAACTACGTGGCAATCATCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTGTACTAGTTTAGCTGTAGGGCTTGAGGCAATTCCACGATCAGCGGGAACAGCGATATAACCCTTACATATCTAAACGCTGGACTGCATAAAGTAAGCAAGGAAATTGACTGAGGCGCTTACCCCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTCCAGTATCAAGCCGCAACCGGGCCCGTGACTCATCCTCCTGCATACCCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAG
TGTCAACTGAACGGGGCCTGGTCCCGTTTTCGAAGGGTGAGTTCTGCTTAGCGTTGTCTTTCATTCGCTCAAAAGTCCCGCGTAAGAGCATCCTGGATTGTTCGCCCTGTAAGCGGGACTACGCGTGCCGATGGTGGGCTTGCAATTATCATAGCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTTCCTGTTCCGTCAATTCCTCTCTAAATACTATCTAACCTGGTCGCAGAACTCGAAGAACTACCGGCCGTCAGCAATTCTAGCTTAATACCTCGTCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTTGAATAGTGCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTGCCCCTCGGAACGGTATGTACTGCAAGCGTAGAAACCCTGATAGCTTGGATGACGAAACTGTTAGATGTACTGCCAACGGTTAGTCGCGCTGTCGGTTTCGTTAACGATGCATTAAGTCGAACTCGTACCTAGAAACGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTAGTGGGATATTGGTGAAGCAGAGGACGAATTGCGATATCCAAGATGAGAACTGTTTGTCAGTCGGGGAAGACCCAGCTGACTACGCTCAGAGCCCGGTCATGTGTCTGAATCAATCTAAAAACGTATAGTTTGGCTACTGGGGCGCTAGGTGC 2 | CCAATTGTTGGCAACAAAGAATCGCTTATGCTAGGGTGACGTGCCAATCGACTGATTTGACTGGCCGGGGGATCGGCTGCGTAAAACCGGTGTCAGAATAAATAGTCATGGCCGGCGTCGACAGGCGCCCCGAGGGATAGGTAACGGGCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTGTCAACTACGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTGAGCCTGAGGCCCGTGAAGAAGCCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTGGTTCTGGGTGCATAGCCGCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTGACGCCACGAAGTGTCAACTAGTGTTGTCATGAGAGAGTTATTATAGCAGGCCTACTTGTAGGTAAATACACTCTAGGTTATTCGCTCTGCTCCCCTCCTGCGTAACCCCTACCGTGAAGAAGCGGTCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTGTGTTACTACCCATAGCGTCGGCCTCGTGAAGAAGCGGTTCTGGGCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTTGCATAGCCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGACGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTAGTGTCAACTCGGACGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTCGCCACGAAGTGTCAACTACGTGGCAATCATCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTGTACTAGTTTAGCTGTAGGGCTTGAGGCAATTCCACGATCAGCGGGAACAGCGATATAACCCTTACATATCTAAACGCTGGACTGCATAAAGTAAGCAAGGAAATTGACTGAGGCGCTTACCCCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTCCAGTATCAAGCCGCAACCGGGCCCGTGACTCATCCTCCTGCATACCCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCA
CGAAGTGTCAACTGAACGGGGCCTGGTCCCGTTTTCGAAGGGTGAGTTCTGCTTAGCGTTGTCTTTCATTCGCTCAAAAGTCCCGCGTAAGAGCATCCTGGATTGTTCGCCCTGTAAGCGGGACTACGCGTGCCGATGGTGGGCTTGCAATTATCATAGCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTTCCTGTTCCGTCAATTCCTCTCTAAATACTATCTAACCTGGTCGCAGAACTCGAAGAACTACCGGCCGTCAGCAATTCTAGCTTAATACCTCGTCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTTGAATAGTGCGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTGCCCCTCGGAACGGTATGTACTGCAAGCGTAGAAACCCTGATAGCTTGGATGACGAAACTGTTAGATGTACTGCCAACGGTTAGTCGCGCTGTCGGTTTCGTTAACGATGCATTAAGTCGAACTCGTACCTAGAAACGTGAAGAAGCGGTTCTGGGTGCATAGCCGGACGCCACGAAGTGTCAACTAGTGGGATATTGGTGAAGCAGAGGACGAATTGCGATATCCAAGATGAGAACTGTTTGTCAGTCGGGGAAGACCCAGCTGACTACGCTCAGAGCCCGGTCATGTGTCTGAATCAATCTAAAAACGTATAGTTTGGCTACTGGGGCGCTAGGTGC -------------------------------------------------------------------------------- /week3-4/greedy_motif_search.txt: -------------------------------------------------------------------------------- 1 | 12 25 2 | AACGATAACGATCTGTGGGGGGCGCACGAAAACCCGTAAGCTCACTGCTGTGAGGGCCGCTTTGAAAGTTCTAGGAGCGCTCCTACGTCGAGTCGCACAGTACTATATCTAAATTAAAAGCACTCGACTCACCAGATTTGGGCAATTGCGACGGTA 3 | GTTGCATCGACTTGTGGTTCGCAACCACACATTCGCAGCTCTTAACTCGAGGAGTGAAGCGCTAATGGTTCCAGATGAGAGCATCTAAAATAAGATGGATGTAACTTTGGTTCCAGACGTTTCGGGTCTCTGAGGTAACGGTGCTCAGTTGTAGGG 4 | TCGACCTCTAGCTTAAACTAATAGACATGGTGAGATAATTGGGCCTGTTGTTTTATTGTTTTTTTGCATCGCGTTACGTAGGCAACAAAAGCCATAGATACATCACTATTTTGAGATAAACGAAAGCTTCTCATACTCTGATGCCGTGCCTGCGTA 5 | GTGTCCCAGACATGGAAGCGGCAGGTAAAGTAATATGTACGTAGACTGCCATACCTCGCGCTTTGGTTACTTGTGAACTTGGGTGAATCCGGGTAGCATCCTTGTTATGCTGAGAACTGTTGAAGGGCGCCGTGAGCCATTCATTTCACTTCCGTA 6 | CGAGATTAAAATGCTTAGGGTGTACTTGCTCCCCAACGTGTTACATCTCATAGAGCTTCGTGTGGGAAGAAGCTACTCAAAGTGTTGCCCGCCTCTATAAAGTAATACTTAGATTTTTGCGCTTGCCCTCAGTACTGCCGCCTCTGAGTGGTACTG 7 | TTCTGAATGCAGACCTTAATCAGGCCCGACCTTGTAGAAGGATTTGATGTCAGCAGTCGGCTCAGTTGCACGTCGGAGTCTGGCCCTAGCGACATACTAAAGTAAAAGGCACGTCTCGTCCAACACAAAGTTAAATATTATTTGCGCACGAGAGCG 8 | 
CAGGGGAGCTTACTCTAATCATGCTCTTTAACATTCTACCATAATACGTATTGGTGACACCACAAGCACTGGCTGACCTCCTGATAGCGTGCTTATCGTAGCGTGTCGCTAAATTAAGATGCAGCGGCGGACAGTCTCCTATAGGTATTTCTCTCT 9 | GTTGCTACAAAGGTATACCTGTCGGCTTATCTTGCTCTGCCGCCCATAACTACTGGATGGTTGCGCCCGGCACCTGGTATTCCTCTCTAGAAGGGATGTAGTACAGTGGTACTCATAGCCTTAAAATAAAAGTGCCGGAATCTGCGTTTTTTAGGG 10 | CTCGATTCTGACCGACCAAGTGGGCTATAGTTCCGAGCCTAGCACAGTGCCCGTTTGGGGCATGACGGCAGGGCACTTCACAATATCCTAAGAGGCTTAAACTAACATTGGACTGTTTGGCGTACCGTTTCAAACGCTTGCTTTCAAATGTGGGCC 11 | CTATGCACCGGTCTTTGGCAGAACATTAGCATCCTGCCTGGTGGTCTTTGAATCGTCTCAAGGGGCTCTATTCCGCTCGATATTCTAAAGTAACATCGTATGACTGCTTGGCCAAGTCTATGAAAGTCAGTCTGGATTGTCACGGAAGGTTGCAAG 12 | ACGCGCGATATAGTCCTTCTAACGCTATCGGACGACCAGGTGAGAATGATTCTTATTGTACTAAATTAAAATCGGGGAAGTTTGTCTGGCATGACTTATACTACAGGAAAGGGCAAGAAGTGCACAGCTGGGTGTAGCTTTATAGCCAAGTGAACA 13 | GGACGCAGATTCTGTAAGGACAACTGGATTTACATTTTAAAGAATGATAGCTCAGGATTCCTAAAGTAATACATGCAGTTGATGGTTGGGGTAGATTTACCGCTAGCCTCCATTCATCGGTCCACGTGATATAGCACTTTTGTAACACGCGACGCA 14 | TCAAAACACCTAGGGGTCTTAGTCTGCTTCGGAAAAACTAGCCGGGGGACCTTTACTCCGGGATTGTTTATTGCAGCCCGTTTGTCGGTTGTGTGATTCGGGTTACTGTTAAAATAATAAAAGAGCTGTGAACTGGGGAAAGTTTAGGTACGTTTG 15 | CAGGCTTAACCATCAGCATAACCCTACTCTAACAGAAACAATTTTAACATGGCAATGGTAGCTAAAGGACATTTAAAATAAAATCCATATTGATAGGCTTCGAAAAAAGCGACATACCCCTCGAGGATATACGGGCCGTCCGTTTTGCCAGAGGGC 16 | CTGCCAAACGAATCGCTCGCAGGCTAATGGGATGGGTTTACATGGAAGAGCAGCCGGGACTATGATGTCCAAATCCACGATCTCCTAAACTAAAAACTTTCAACTACGGGAGCCGTACGCGGTACTAGTAAATCAGCTCGAGGCTGGAAATCAGCA 17 | AGTACCTGACCTGTCCGCCTCGTTAGGAAGGATATCCGGAGCCCCAACGAGGTGCCGCGACGAAACGATGCTCTAAACTAATACTCAACTAGGCTGGAGCCAATTTACCCCGCACCTACCAACGTCGAAAGTAATCCGTTCAAGGCCCGCAAGCGA 18 | CGCGTGTGCGAAGTTTAGTGGTTTATGTTCCTTTTGTGAGGGGTGACGACGGCTAGTCGCTGCCATTTCTCCATAAATTAAGAGCCGCATTGTTCCACCAGGCCCAGAGTAGGCTTGACAGAGTCGCCTATATATCCCCATAAGGACCTCTGTTAT 19 | GAGTTAGTTGACTCTCGCTAACACACGTCGGACGGGCTGAAATGCCATGATGCCATACTGTCCACTAGATGCAAAACGATAGGGATAAAATAATAGTACCCAAAAAAGTTATTTGTAACTGAATTCGTTGCAAGGAAAGTTATCACACTCCGCCTC 20 | 
CCTGGCAATGCGCGATCCGATCAGTCCTGGGATTCCAAGAATAAGCTTTCCGCCAACTACTCCTAGGTACGGATTATAAAACTCGTAAATTAAAAATCCCGCTACAAGCTCCACCGGAATGCAGGTCGACTTCGGATACTGACCACAGTCATCTGG 21 | TTAAAATAAAACCGGGTTTGTGCAGAATGGTAAAAGGCTAACTGCCGCGACGTGAACCGGCTATAAGGGTAGACCAATTTTTTAGAACGCCAGCAGGTCGTAGCAGAGTCATATCAGTCCTGTTCCAGGCTGCACCTCCAACGAGTGACTCATGGG 22 | CTGATTCACATCTGGCACTCGCTTTTTTTTCCCGTATGGCTACAGAAATGCCAGTAGATCCGCTGAACGTGATTCGATACCCGAAGCCGCGTCGATTGTTTGCAGGAGTCGCCATTCTCAGAGGGAATGAATATGCTGGACCTTGTAAATTAACAT 23 | AAAACATAATTATTAAAATAATAGGTCCGGTAAAAGCGGCAAAAATGAGTCGGCCGAATTTCTTAACTACTCAGCCATTCGAGACCGACCACCTACTTGTGTTTGGTGAGGTGATGAAGAGCAAACCCACAACGACGCCCCTCGAGCGATTCTTCA 24 | GCCGCGTAACCTCGGCTCCGCCATAGTTGTAGACCGGTTAGCAGGATCACTCTTCAGGCGAATTAGCACCAACGATTGCATGCCGACCCAAACCCTCACGACCAGAGACTCCACTACCAGTCCGGTTTCTCGGTAAAGTAAAACTGAGAAACTAGG 25 | GGCACCGGCTAGCTCTGTCGGTATTTAAATTAACACCAGACTGATGCATGCAGCCCGTAGATTCAATATACTTCACCTCCGTAGCTACTACCTCGGAGTGACCAAACGCCAACTCGAGTACTACATGACACCTAGATATCCACTGCATAGCCCTCG 26 | CAGCATTACCTGGTGGCTTAGACGCAGGAACGCGTAATAAAGTAACACTCCGGGGGATAGGCCGGATTGGGAGATTCTGGGATGGGCCTGGGCGACATAAGATCGGAGTCACTCGGTGCTCGGGTCCCTAGAAAACCCGTTCAGGGGGGGACTGCC -------------------------------------------------------------------------------- /week3-4/greedy_motif_search_pseudocounts.txt: -------------------------------------------------------------------------------- 1 | 12 25 2 | ACGGTTGGGACCTGCAGAATCAGCACGTGATAATATGCCGACCAAAATAGCATAGCGGATCTCTAGTATTCTTCTATAACGCGGTAACTCGTCGCGGTGCTATCTGACGACCTGGTAACTCACAAGGTTTGCCAAGTTGTGCAATTTACAGGTTGA 3 | ACGTGATGATCTGTGAGTGGGTGGTTTAGATCAACCATCCATGATTCTAAGCTTCCTGTATCTCATGAAGCAAGTCGATTCCACCCAGGACGTCAACACTATGAACTTTTTCGCGGAAGTTAACGATTCGCAGGGGACCCAGCATACACGGTCCGA 4 | CGTTAGCTTCTGACCACTCAGAGAACGGGATTATGTAAAGGGTCGTCCAAAAAGACGTTAGATTAAGCAGGCCTGATCCTCTCCCCTAACTTTCGGTATTCAACTCGGGTGCAATGTGATCGTTTGAGCCGATCCGTAATTCAGTCGGATTCACAG 5 | ATGCACCTTCATTTAGAAATGCTCACCCCTTGGGATCTCACAATGCTCCGCTGCCTCTAAATTTGCAAGTAGCCCACCGTTGGGGCGTGATGATGTATACATCTCTAGTCCGATGCTTTTCTGATCTCATGCAATTCCAACCAAATGCATCGGTAG 6 | 
TGAACCTTGAATGTAGTATTGGCGCCGGGATAATCTTCATTTTTAGTGCCGGCATGATCTCGGGGGGAACAACTCGCTCTCTGCATGCTTGCCTATAGCGGTTTATAGGTGTTAAGATAGCCGCTCAGCCGTTTCACTCTATAGTTGGTCTGATTC 7 | AACGTGGAAACCGCTTTTGGGTTGCCGAGATTATGTCTAATGGGGGCAGCACCCGCACTGGTGCGGATAGTCCCCCTAAACTCTGACGCGAACTACCACCATACAGCTGTATGCCCTATGTCATCGCACATTAATCACTTGGAGTCAAGAAAGTAA 8 | CTTTTGGCACGCTGAAATAGCGTTAACCTGCTTGAAGTAAGCAAACAGACGTGATAATTTAGTTGCTGTGAAGTCGGCCTTTCTAGATTGCATGCGTCCACGATCTCTCAGATAAGTACCTAAGCAAGGCCGGCAGGCCATCGGACGTACACTTGC 9 | GGGGTGAGTAGCGTACACGGTAATCTCTGGTATGCCTGGACACTACTTCCTACGGATTAATTTTCGCACTCGAGATAAAGAGCAAGCCCTAGCTGTGGCCGCTCTTGGTCGAGATCATCTTAGAATTTTCTCTTCCGCTCCGCTGAATAACCGAGG 10 | AAAAGCAGGGCCATGCTCGAACTTTGTACACTTGCTGGGGCCCCCAAGAGGTAGCGGTCTCCGTGATGATATATATCAGCAGAACCACCTAGTACGACTACACCCCATCGCGCGTTTGAGGGTTTTCTTCTTCTCGCCCTGGGTGAGAATTTTCAT 11 | CACGTAATCGTAACCGACCTCTGTCCGGGATCATGTCGATCTACGAGTTATACAAAGACCCGCCAATTATAAGACGGACAGTGGGACTTGAGGGGCAAGAGCCCGATATTCTAAAGAAAGGCCTTGGGGGAACACCGCATTTCGAAATACACGGAA 12 | GCCTCTGTCTTCAGTCAACATCGTCATGCGCTTAAAGCCTGTACTATGATAATTTCGCCAGGCTTCGTCCTATGTGGTAGATGTGCGAGATCATCTGGGCCCTCCGAGTTAAGATACCAGTTGAAGCGCTTTAGGCCGTCCTAAATTTATCATCCA 13 | AGGGGTGGGTCGGGACTGTATTACCAACATCGTGGGGGCGAGCCATGTCCGGGATTATCTCATCAACCGAAAGTCTTGAATGCAATCGACCCCAAGGTGCGGTGGGGTCCGCAAGGACTACCTATTCTAAAACGGTTCACGTTTCACAGGAGGGTG 14 | CCCCTCTGTAGATGGCCCGGCCCTTTGTATTCGCACTTGAACCGCTCCGCTCAGGGGTTCGCGAGATGATGTAATTCAGATTGTTAGAGCCCGGTCGTTTATTTACATAGGTAGATGAGGGCCCCATATTTCATTAATCTACCTTAGTCACCACAC 15 | CAGACCGGCAGGGCTGCCAACGTGAGGTGAGGAGGAGGAATGATATGGCTCAGCTGTTTCCAAACGCGAGCATCTCCTAAGTCCGAGATCGATTTTCATCCACCAAGAGCAGGCGAGACCACGCGATCATGTGGTGAACGTACCACGTGTACACGA 16 | GCGCTAGAAAATGGCCATACGAGAGACTATCAACTCCCGGGATAATGTTTATGATATCGAATGGAGGACACTAGCCTGAAGCCGTAGGGAGTCACCATTAGTTGGCCCTCCCGTCGGACACCCATATCCACGGTAAGAGCGAATCATTCTCTAGCA 17 | TGACCGCCGTATTACAACCGTATATACCGGATACGCACGCATCTCGGAATGCGTTTCCTCTGGCCACGCCGTGGCCCACGAGAAACGCGATTATGTATCTCTTTCGGGCGCATTCTTCGACTTCTTCATTGCGTAAAGTTATGTTGCGTTTCACTA 18 | 
ACATAGGCGGATACGTGATTATGTCGAATTTCACCTACATGCTCCCGCCAGCCAAAACTATGTTGTGTACTACCCACTGCGCTCAGCACCATCTGTGGTACTTATCGGTTAGCCGTAAGGTCACCTTATTTAGGGCATGGCTTGGTAGCCTCGTCC 19 | TCATCTTCTTGAGGGGCCGCTCGTTTAAGGGCAACAACTCAGATAGGCACGTGATCATCTGAGGGAAAAGGTATAAATTAGGTAAGAAAGCTGGGACTGGCCTTGCACGACAGGCCAAGCACAGCGCTAGCATCGGCGTGATGGGTTGCCGTTTCG 20 | GTTAAGCGAGAGCCCTGAGCTATGCAGTCCACCAAAGCTGGGACTACCAAACATAATGACTCTGACGAGACCTCGCAAGCCACTAGTCGCTGTGAACCATGTAAGTATACGTGATCATCTCTTCCCATTTGTGTTAGGTGAGCGGGGGAGCGCTGT 21 | CATACTTGTACACCGACAAGCGCTTGCATAAGACGGTCGGGATAATTTTAGAAAGCGGTTCCCCACTCGCCTCACCGCAGCCGACGATTTTCGTGATCGCGGGCTATTTTCGATTTAGTCCCACAGTCGCGCCAAGATCTGCAGGAGAATCAGCCG 22 | AACACGCGGTAGGAGCTGACCGAACAGGGGATATATTCGGGATTATATAGACTTTCACGATGTCGACGACACTGCTAAGATGATTTCAAATCATCTATCGTCCTCTGTGTCCGAGCCATTATAATTCTACTGCGTGATAGGTTGCTTCACATCGGA 23 | TGTCTATTAATGTTAATCCCCAAGGATGATAACTGGTGGCGCAAGTAAGTCTAATGCCATGGGATCTTACGTGGTACAGAAAAGGCGCGATGATGTACGGCCAGGGTCCCAGCTCTCGGTACAGCACCACTGACCCTTGTGTTACCTTGATGATCT 24 | CCGAGATAATCTGCTGCACCGTCTAACTAGTCGTCGTCCGTTATCTAAGGACAAAACGGGCCCACCTGCGAGGTGTGTGATATTGATAACGTGACGTAGGCCAAATGGCTCATGAACTTAATCGGAATAGGGAACGCAAACCCTCGGTATAGTATA 25 | TCACAGGGTCAGCGTGAGGGTTCTGCGAGATTATATGTCGCTATCTTTAACTGGAGAGAAAGCGGCTAGTTCGGACTGTACTACAGTCGAGTTTCAACTTGCTAAAGGTCTCAAGGTTGGATCATCCCGGGAGCTCGCCGGGCCTTCTGGGGGTAC 26 | CGAATCGGGAATCACGTAACCAACTTTCTTAGGATGGAGTGAGACGCCAATCAAAGGGAGACGGTTATTGACCGATACGAACCATCGTGATCATATAGCCAGCCCGAAAAAAACCGGGTGATATATAGCTATTGATGAAGGGAGCATGGCTATGCG 27 | -------------------------------------------------------------------------------- /week3-4/9-Implement GibbsSampler.py: -------------------------------------------------------------------------------- 1 | from random import randint, choice 2 | import sys 3 | import numpy 4 | 5 | def score(motifs): 6 | score = 0 7 | for i in range(len(motifs[0])): 8 | motif = ''.join([motifs[j][i] for j in range(len(motifs))]) 9 | score += min([hamming_distance(motif, homogeneous*len(motif)) for homogeneous in 'ACGT']) 10 | return score 11 | 12 | def 
def profile_with_pseudocounts(motifs):
    """Return the position-wise nucleotide profile of ``motifs`` with pseudocounts.

    Each row is ``[P(A), P(C), P(G), P(T)]`` for one motif position.  Every
    count is incremented by one (Laplace smoothing), so no entry is zero.
    """
    prof = []
    for column in zip(*motifs):
        total = float(len(column) + 4)
        prof.append([(column.count(nuc) + 1) / total for nuc in 'ACGT'])
    return prof


def profile_most_probable_kmer(dna, k, prof):
    """Return the k-mer of ``dna`` with the highest probability under ``prof``.

    Ties are resolved in favour of the left-most k-mer.  Returns ``None`` when
    ``dna`` is shorter than ``k``.
    """
    nuc_loc = {nucleotide: index for index, nucleotide in enumerate('ACGT')}
    best_prob, best_kmer = -1, None
    for i in range(len(dna) - k + 1):
        kmer = dna[i:i + k]
        current_prob = 1
        for j, nucleotide in enumerate(kmer):
            current_prob *= prof[j][nuc_loc[nucleotide]]
        if current_prob > best_prob:
            best_prob, best_kmer = current_prob, kmer
    return best_kmer


def motifs_from_profile(profile, dna, k):
    """Return the profile-most-probable k-mer of every sequence in ``dna``."""
    return [profile_most_probable_kmer(seq, k, profile) for seq in dna]


def randomized_motif_search(dna_list, k, t):
    """Run one pass of randomized motif search.

    Starts from one random k-mer per sequence and keeps re-deriving motifs
    from their own profile until the score stops improving.

    Returns ``[best_score, best_motifs]``.
    """
    rand_ints = [randint(0, len(dna_list[0]) - k) for _ in range(t)]
    motifs = [dna_list[i][r:r + k] for i, r in enumerate(rand_ints)]

    # The random starting motifs are the best ones seen so far.
    best_score = [score(motifs), motifs]

    while True:
        current_profile = profile_with_pseudocounts(motifs)
        motifs = motifs_from_profile(current_profile, dna_list, k)
        current_score = score(motifs)
        if current_score < best_score[0]:
            best_score = [current_score, motifs]
        else:
            return best_score


def hamming_distance(str1, str2):
    """Return the number of mismatching positions over the common length."""
    return sum(1 for s1, s2 in zip(str1, str2) if s1 != s2)


def profile_randomized_kmer(dna, k, prof):
    """Sample a k-mer of ``dna`` with probability proportional to ``prof``.

    Every one of the ``len(dna) - k + 1`` k-mers is a candidate, including the
    last one (the original loop stopped one position early, which also made
    ``len(dna) == k`` fail with an empty candidate list).
    """
    nuc_loc = {nucleotide: index for index, nucleotide in enumerate('ACGT')}
    probs = []
    for i in range(len(dna) - k + 1):
        current_prob = 1.
        for j, nucleotide in enumerate(dna[i:i + k]):
            current_prob *= prof[j][nuc_loc[nucleotide]]
        probs.append(current_prob)

    weights = numpy.array(probs)
    i = numpy.random.choice(len(probs), p=weights / weights.sum())
    return dna[i:i + k]


def gibbs_sampling_motif_search(dna_list, k, t, N, init_motifs=None):
    """Run ``N`` Gibbs-sampling steps and return ``[best_score, best_motifs]``.

    ``init_motifs`` optionally supplies the starting motifs; it is copied, so
    the caller's list is never modified.  Otherwise one random k-mer per
    sequence is used.
    """
    if init_motifs:
        motifs = list(init_motifs)
    else:
        rand_ints = [randint(0, len(dna_list[0]) - k) for _ in range(t)]
        motifs = [dna_list[i][r:r + k] for i, r in enumerate(rand_ints)]

    best_score = [score(motifs), list(motifs)]

    for _ in range(N):
        i = randint(0, t - 1)
        # Build the profile from every motif except the one being resampled.
        others = [motif for index, motif in enumerate(motifs) if index != i]
        current_profile = profile_with_pseudocounts(others)
        motifs[i] = profile_randomized_kmer(dna_list[i], k, current_profile)
        current_score = score(motifs)
        if current_score < best_score[0]:
            best_score = [current_score, list(motifs)]

    return best_score


if __name__ == '__main__':
    with open('gibbs.txt') as handle:
        data = handle.read().split()
    k, t, N = int(data[0]), int(data[1]), int(data[2])
    dna_list = data[3:]
    # k * t is larger than any reachable score, so the first run always wins.
    best_motifs = [k * t, None]
    for repeat in range(20):
        current_motifs = gibbs_sampling_motif_search(dna_list, k, t, N)
        if current_motifs[0] < best_motifs[0]:
            best_motifs = current_motifs
    print('\n'.join(best_motifs[1]))
# -*- coding: utf-8 -*-

def StringSpelledByPatterns(patterns, k):
    """Spell the string formed by a path of patterns overlapping by all but one symbol.

    ``k`` is accepted for interface compatibility and is not used.
    """
    return patterns[0] + ''.join(pattern[-1] for pattern in patterns[1:])


def StringSpelledByGappedPatterns(patterns, k, d):
    """Spell the string described by a path of gapped pattern pairs.

    The second components trail the first ones by ``k + d`` positions, so
    ``PrefixString[i]`` must equal ``SuffixString[i - k - d]`` for every
    ``i >= k + d``.  The check now starts at ``k + d`` (it used to start one
    position late and could accept inconsistent paths).

    Returns the reconstructed string, or an explanatory message when the two
    spellings disagree.
    """
    FirstPatterns = [u for u, v in patterns]
    SecondPatterns = [v for u, v in patterns]
    PrefixString = StringSpelledByPatterns(FirstPatterns, k)
    SuffixString = StringSpelledByPatterns(SecondPatterns, k)
    for i in range(k + d, len(PrefixString)):
        if PrefixString[i] != SuffixString[i - k - d]:
            return "there is no string spelled by the gapped patterns"
    return PrefixString + SuffixString[-k - d:]


def PairedDeBruijnGraph(patterns):
    """Build the paired de Bruijn graph of ``patterns`` as an adjacency dict.

    Each pattern pair contributes one edge from its (prefix, prefix) node to
    its (suffix, suffix) node.
    """
    result = dict()
    for pattern in patterns:
        PrefixPattern = (pattern[0][:-1], pattern[1][:-1])
        SuffixPattern = (pattern[0][1:], pattern[1][1:])
        result.setdefault(PrefixPattern, []).append(SuffixPattern)
    return result


def EulerianCycle(graph):
    """Return an Eulerian cycle of ``graph`` as a list of nodes.

    ``graph`` maps a node to the list of its successors.  It is consumed:
    every edge is removed as it is traversed.

    Raises ``ValueError`` when some edges cannot be reached from the cycle
    built so far (the original code looped forever in that case).
    """
    if not graph:
        return []

    def FormCycle(node):
        # Walk unused edges until reaching a node with none left.
        cycle = [node]
        while node in graph:
            successors = graph[node]
            cycle.append(successors.pop())
            if not successors:
                del graph[node]
            node = cycle[-1]
        return cycle

    path = FormCycle(next(iter(graph)))
    while graph:
        progressed = False
        for i in range(len(path)):
            if path[i] in graph:
                # Splice a new cycle in at a node that still has unused edges.
                path = path[:i] + FormCycle(path[i]) + path[i + 1:]
                progressed = True
        if not progressed:
            raise ValueError("graph is not connected: some edges are unreachable")
    return path


def _unbalanced_nodes(graph):
    """Return ``(start, end)``: the nodes with one surplus outgoing / incoming edge.

    Returns ``(None, None)`` when every node is balanced.
    """
    balance = {}
    for node, successors in graph.items():
        balance[node] = balance.get(node, 0) + len(successors)
        for successor in successors:
            balance[successor] = balance.get(successor, 0) - 1
    starts = [node for node, diff in balance.items() if diff > 0]
    ends = [node for node, diff in balance.items() if diff < 0]
    if not starts and not ends:
        return None, None
    if len(starts) != 1 or len(ends) != 1 or balance[starts[0]] != 1 or balance[ends[0]] != -1:
        raise ValueError("graph has no Eulerian path")
    return starts[0], ends[0]


def EulerianPath(graph):
    """Return an Eulerian path of ``graph`` as a list of nodes.

    The start and end nodes are found from in/out degree balance, so the end
    node may have outgoing edges of its own.  A balanced graph yields its
    Eulerian cycle.  ``graph`` is consumed in the process.
    """
    start, end = _unbalanced_nodes(graph)
    if start is None:
        return EulerianCycle(graph)

    # Close the path into a cycle, then cut the cycle at the added edge.
    graph.setdefault(end, []).append(start)
    path = EulerianCycle(graph)
    divide_point = next(i for i in range(len(path) - 1) if path[i:i + 2] == [end, start])
    return path[divide_point + 1:] + path[1:divide_point + 1]


if __name__ == '__main__':
    patterns = []
    with open('./data/probelmset3.txt') as f:
        for line in f:
            pattern = line.strip().split('|')
            patterns.append([pattern[0][1:], pattern[1][:-1]])
    print(patterns)
    k = 3
    d = 1

    graph = PairedDeBruijnGraph(patterns)
    path = EulerianPath(graph)
    s = StringSpelledByGappedPatterns(path, k, d)
    print(s)
'''
GREEDYMOTIFSEARCH(Dna, k, t)
    BestMotifs ← motif matrix formed by first k-mers in each string
                  from Dna
    for each k-mer Motif in the first string from Dna
        Motif1 ← Motif
        for i = 2 to t
            form Profile from motifs Motif1, …, Motifi - 1
            Motifi ← Profile-most probable k-mer in the i-th string
                      in Dna
        Motifs ← (Motif1, …, Motift)
        if Score(Motifs) < Score(BestMotifs)
            BestMotifs ← Motifs
    return BestMotifs
'''


def greedy_motif_search(dna, k, t):
    """Return the best motifs found by greedy motif search (no pseudocounts).

    ``dna`` is a list of at least ``t`` sequences and ``k`` the motif length.
    Each k-mer of the first sequence seeds one candidate motif set; the set
    with the lowest score wins, the earliest one on ties.
    """
    best_motifs = [dna[i][:k] for i in range(t)]
    best_score = score(best_motifs)  # computed once instead of every iteration

    for i in range(len(dna[0]) - k + 1):
        motifs = [dna[0][i: i + k]]
        for j in range(1, t):
            profile = create_profile(motifs)
            motifs.append(profile_most_probable(dna[j], profile, k))
        current_score = score(motifs)
        if current_score < best_score:
            best_motifs, best_score = motifs, current_score
    return best_motifs


def calculate_probablity(profile, string):
    """Return the probability of ``string`` under ``profile``.

    ``profile[i]`` holds the probabilities of A, C, G, T at position ``i``.
    Symbols other than A/C/G/T leave the product unchanged.
    """
    column_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    probablity = 1
    for position, nucleotide in enumerate(string):
        column = column_of.get(nucleotide)
        if column is not None:
            probablity = probablity * profile[position][column]
    return probablity


def profile_most_probable(dna_string, profile, k):
    """Return the profile-most-probable k-mer of ``dna_string``.

    The first k-mer is returned when every k-mer has probability zero, and
    the left-most one wins ties.
    """
    best_pattern = dna_string[0:k]
    best_probability = 0
    for i in range(len(dna_string) - k + 1):
        string = dna_string[i:i + k]
        new_probablity = calculate_probablity(profile, string)
        if new_probablity > best_probability:
            best_pattern = string
            best_probability = new_probablity
    return best_pattern


def create_profile(motifs):
    """Return the frequency profile of ``motifs``: one ``[A, C, G, T]`` row per position."""
    total = len(motifs)
    return [[column.count(nucleotide) / total for nucleotide in 'ACGT']
            for column in zip(*motifs)]


def hamming_distance(str1, str2):
    """Return the number of mismatching positions over the common length."""
    return sum(1 for s1, s2 in zip(str1, str2) if s1 != s2)


def find_consensus(motifs):
    """Return the consensus string of ``motifs``.

    Ties between equally frequent nucleotides are resolved in the order
    A, C, G, T.
    """
    # max() returns the first maximal item, which gives the A, C, G, T priority.
    return ''.join(max('ACGT', key=column.count) for column in zip(*motifs))


def score(motifs):
    """Return the total Hamming distance between ``motifs`` and their consensus."""
    consensus = find_consensus(motifs)
    return sum(hamming_distance(consensus, motif) for motif in motifs)


if __name__ == "__main__":
    with open('greedy_motif_search.txt') as handle:
        data = handle.read().split()
    print(*greedy_motif_search(data[2:], int(data[0]), int(data[1])))
'''
GREEDYMOTIFSEARCH(Dna, k, t)
    BestMotifs ← motif matrix formed by first k-mers in each string
                  from Dna
    for each k-mer Motif in the first string from Dna
        Motif1 ← Motif
        for i = 2 to t
            form Profile from motifs Motif1, …, Motifi - 1
            Motifi ← Profile-most probable k-mer in the i-th string
                      in Dna
        Motifs ← (Motif1, …, Motift)
        if Score(Motifs) < Score(BestMotifs)
            BestMotifs ← Motifs
    return BestMotifs
'''


def greedy_motif_search(dna, k, t):
    """Return the best motifs found by greedy motif search with pseudocounts.

    ``dna`` is a list of at least ``t`` sequences and ``k`` the motif length.
    Each k-mer of the first sequence seeds one candidate motif set; the set
    with the lowest score wins, the earliest one on ties.
    """
    best_motifs = [dna[i][:k] for i in range(t)]
    best_score = score(best_motifs)  # computed once instead of every iteration

    for i in range(len(dna[0]) - k + 1):
        motifs = [dna[0][i: i + k]]
        for j in range(1, t):
            profile = create_profile(motifs)
            motifs.append(profile_most_probable(dna[j], profile, k))
        current_score = score(motifs)
        if current_score < best_score:
            best_motifs, best_score = motifs, current_score
    return best_motifs


def calculate_probablity(profile, string):
    """Return the probability of ``string`` under ``profile``.

    ``profile[i]`` holds the probabilities of A, C, G, T at position ``i``.
    Symbols other than A/C/G/T leave the product unchanged.
    """
    column_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    probablity = 1
    for position, nucleotide in enumerate(string):
        column = column_of.get(nucleotide)
        if column is not None:
            probablity = probablity * profile[position][column]
    return probablity


def profile_most_probable(dna_string, profile, k):
    """Return the profile-most-probable k-mer of ``dna_string`` (left-most on ties)."""
    best_pattern = dna_string[0:k]
    best_probability = 0
    for i in range(len(dna_string) - k + 1):
        string = dna_string[i:i + k]
        new_probablity = calculate_probablity(profile, string)
        if new_probablity > best_probability:
            best_pattern = string
            best_probability = new_probablity
    return best_pattern


def create_profile(motifs):
    """Return the profile of ``motifs`` with pseudocounts.

    Every nucleotide count starts at one, so each row ``[A, C, G, T]`` is
    divided by ``len(motifs) + 4`` and contains no zero.
    """
    total = len(motifs) + 4
    return [[(column.count(nucleotide) + 1) / total for nucleotide in 'ACGT']
            for column in zip(*motifs)]


def hamming_distance(str1, str2):
    """Return the number of mismatching positions over the common length."""
    return sum(1 for s1, s2 in zip(str1, str2) if s1 != s2)


def find_consensus(motifs):
    """Return the consensus string of ``motifs``.

    Ties between equally frequent nucleotides are resolved in the order
    A, C, G, T.
    """
    # max() returns the first maximal item, which gives the A, C, G, T priority.
    return ''.join(max('ACGT', key=column.count) for column in zip(*motifs))


def score(motifs):
    """Return the total Hamming distance between ``motifs`` and their consensus."""
    consensus = find_consensus(motifs)
    return sum(hamming_distance(consensus, motif) for motif in motifs)


if __name__ == "__main__":
    with open('greedy_motif_search_pseudocounts.txt') as handle:
        data = handle.read().split()
    print(*greedy_motif_search(data[2:], int(data[0]), int(data[1])))
'''
CyclopeptideSequencing(Spectrum)
    candidate_peptides ← a set containing only the empty peptide
    FinalPeptides ← empty list of strings
    while candidate_peptides is nonempty
        candidate_peptides ← Expand(candidate_peptides)
        for each peptide Peptide in candidate_peptides
            if Mass(Peptide) = ParentMass(Spectrum)
                if Cyclospectrum(Peptide) = Spectrum and Peptide is not in FinalPeptides
                    append Peptide to FinalPeptides
                remove Peptide from candidate_peptides
            else if Peptide is not consistent with Spectrum
                remove Peptide from candidate_peptides
    return FinalPeptides
'''
from collections import Counter

# Integer masses of the 20 amino acids.  Defined at module level so the
# functions below also work when this file is imported, not only when it is
# run as a script.
amino_acid_mass_table = {'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99, 'T': 101, 'C': 103, 'I': 113, 'L': 113,
                         'N': 114, 'D': 115, 'K': 128, 'Q': 128, 'E': 129, 'M': 131, 'H': 137, 'F': 147, 'R': 156,
                         'Y': 163, 'W': 186}


def cyclopeptide_sequencing(spectrum):
    """Return every cyclic peptide whose cyclospectrum equals ``spectrum``.

    The result is a space-separated string of peptides written as
    dash-separated amino-acid masses.  Peptides that differ only by
    equal-mass residues (I/L, K/Q) produce the same mass string, so each mass
    string is reported once.
    """
    parent_mass = max(spectrum)
    target = sorted(spectrum)
    candidate_peptides = ['']
    final_peptides = []
    while candidate_peptides:
        survivors = []
        for peptide in expand(candidate_peptides):
            if get_peptide_mass(peptide) == parent_mass:
                # A full-mass peptide is either a solution or a dead end.
                if cyclic_spectrum(peptide, amino_acid_mass_table) == target:
                    final_peptides.append(peptide)
            elif consistent(peptide, spectrum):
                survivors.append(peptide)
        candidate_peptides = survivors

    mass_strings = ('-'.join(str(amino_acid_mass_table[amino_acid]) for amino_acid in peptide)
                    for peptide in final_peptides)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return ' '.join(dict.fromkeys(mass_strings))


def expand(peptides):
    """Return every peptide of ``peptides`` extended by each amino acid."""
    return [peptide + key for peptide in peptides for key in amino_acid_mass_table]


def get_peptide_mass(peptide):
    """Return the total integer mass of ``peptide``."""
    return sum(amino_acid_mass_table[amino_acid] for amino_acid in peptide)


def linear_spectrum(peptide):
    """Return the sorted masses of all subpeptides of a linear ``peptide``, plus 0."""
    prefix_mass = [0]
    for amino_acid in peptide:
        prefix_mass.append(prefix_mass[-1] + amino_acid_mass_table[amino_acid])
    length = len(peptide)
    lin_spectrum = [0]
    for i in range(length):
        for j in range(i + 1, length + 1):
            lin_spectrum.append(prefix_mass[j] - prefix_mass[i])
    return sorted(lin_spectrum)


def cyclic_spectrum(peptide, amino_acid_mass_table):
    """Return the sorted masses of all subpeptides of a cyclic ``peptide``, plus 0.

    ``amino_acid_mass_table`` maps each amino-acid letter to its integer mass.
    """
    prefix_mass = [0]
    for amino_acid in peptide:
        prefix_mass.append(prefix_mass[-1] + amino_acid_mass_table[amino_acid])
    peptide_mass = prefix_mass[-1]
    length = len(peptide)
    cycl_spectrum = [0]
    for i in range(length):
        for j in range(i + 1, length + 1):
            sub_mass = prefix_mass[j] - prefix_mass[i]
            cycl_spectrum.append(sub_mass)
            # Interior fragments also define the complementary wrap-around one.
            if i > 0 and j < length:
                cycl_spectrum.append(peptide_mass - sub_mass)
    return sorted(cycl_spectrum)


def consistent(peptide, spectrum):
    """Return True when no mass of the linear spectrum of ``peptide`` occurs
    more often than it does in ``spectrum``."""
    available = Counter(spectrum)
    needed = Counter(linear_spectrum(peptide))
    return all(count <= available[mass] for mass, count in needed.items())


if __name__ == '__main__':
    with open('cyclopeptide_sequencing.txt') as handle:
        spectrum = list(map(int, handle.readline().split()))
    print(cyclopeptide_sequencing(spectrum))
"""
LeaderboardCyclopeptideSequencing(Spectrum, N)
    Leaderboard ← set containing only the empty peptide
    LeaderPeptide ← empty peptide
    while Leaderboard is non-empty
        Leaderboard ← Expand(Leaderboard)
        for each Peptide in Leaderboard
            if Mass(Peptide) = ParentMass(Spectrum)
                if Score(Peptide, Spectrum) > Score(LeaderPeptide, Spectrum)
                    LeaderPeptide ← Peptide
            else if Mass(Peptide) > ParentMass(Spectrum)
                remove Peptide from Leaderboard
        Leaderboard ← Trim(Leaderboard, Spectrum, N)
    output LeaderPeptide
"""
from collections import Counter

# Integer masses of the 20 amino acids.  Defined at module level so the
# functions below also work when this file is imported, not only when it is
# run as a script.
amino_acid_mass_table = {'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99, 'T': 101, 'C': 103, 'I': 113, 'L': 113,
                         'N': 114, 'D': 115, 'K': 128, 'Q': 128, 'E': 129, 'M': 131, 'H': 137, 'F': 147, 'R': 156,
                         'Y': 163, 'W': 186}


def expand(peptides):
    """Return every peptide of ``peptides`` extended by each amino acid."""
    return [peptide + key for peptide in peptides for key in amino_acid_mass_table]


def get_peptide_mass(peptide):
    """Return the total integer mass of ``peptide``."""
    return sum(amino_acid_mass_table[amino_acid] for amino_acid in peptide)


def get_parent_mass(spectrum):
    """Return the last entry of ``spectrum`` (its largest mass when the spectrum is sorted)."""
    return spectrum[-1]


def linear_spectrum(peptide):
    """Return the sorted masses of all subpeptides of a linear ``peptide``, plus 0."""
    prefix_mass = [0]
    for amino_acid in peptide:
        prefix_mass.append(prefix_mass[-1] + amino_acid_mass_table[amino_acid])
    length = len(peptide)
    lin_spectrum = [0]
    for i in range(length):
        for j in range(i + 1, length + 1):
            lin_spectrum.append(prefix_mass[j] - prefix_mass[i])
    return sorted(lin_spectrum)


def linear_score(peptide, spectrum):
    """Return the number of masses shared, with multiplicity, by the linear
    spectrum of ``peptide`` and ``spectrum``."""
    shared = Counter(linear_spectrum(peptide)) & Counter(spectrum)
    return sum(shared.values())


def trim(leaderboard, spectrum, N):
    """Keep the ``N`` best-scoring peptides of ``leaderboard``, ties included.

    A leaderboard with fewer than ``N`` entries is returned unchanged.  The
    original order is preserved and each peptide is scored only once.
    """
    if len(leaderboard) < N:
        return leaderboard
    scores = [linear_score(pep, spectrum) for pep in leaderboard]
    score_min = sorted(scores, reverse=True)[N - 1]
    return [pep for pep, pep_score in zip(leaderboard, scores) if pep_score >= score_min]


def leaderboard_cyclopeptide_sequencing(spectrum, n):
    """Return the best-scoring peptide whose mass equals the parent mass of ``spectrum``.

    The leaderboard is expanded by one amino acid per round, peptides heavier
    than the parent mass are discarded, and the board is trimmed to the ``n``
    best (with ties).  Returns the empty string when no peptide reaches the
    parent mass with a positive score.
    """
    parent_mass = get_parent_mass(spectrum)
    leaderboard = ['']
    leader_peptide = ''
    leader_peptide_score = 0
    while leaderboard:
        survivors = []
        for peptide in expand(leaderboard):
            mass = get_peptide_mass(peptide)
            if mass > parent_mass:
                continue
            if mass == parent_mass:
                score = linear_score(peptide, spectrum)
                if score > leader_peptide_score:
                    leader_peptide = peptide
                    leader_peptide_score = score
            survivors.append(peptide)
        leaderboard = trim(survivors, spectrum, n)
    return leader_peptide
trim(leaderboard, spectrum, n) 97 | return leader_peptide 98 | 99 | 100 | def leaderboard_cyclopeptide(spectrum, n): 101 | leader_peptide = leaderboard_cyclopeptide_sequencing(spectrum, n) 102 | return [amino_acid_mass_table[amino_acid] for amino_acid in leader_peptide] 103 | 104 | 105 | if __name__ == '__main__': 106 | amino_acid_mass_table = {'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99, 'T': 101, 'C': 103, 'I': 113, 'L': 113, 107 | 'N': 114, 'D': 115, 'K': 128, 'Q': 128, 'E': 129, 'M': 131, 'H': 137, 'F': 147, 'R': 156, 108 | 'Y': 163, 'W': 186} 109 | with open('leaderboard_cyclopeptide_sequencing.txt') as f: 110 | n = int(f.readline()) 111 | spectrum = list(map(int, f.readline().split())) 112 | leader_peptide = leaderboard_cyclopeptide_sequencing(spectrum, n) 113 | leader_peptide_mass = [] 114 | for i in leader_peptide: 115 | leader_peptide_mass.append(amino_acid_mass_table[i]) 116 | print('-'.join([str(i) for i in leader_peptide_mass])) 117 | # 71-101-99-128-163-186-128-113-147-113-131-87-115-131 -------------------------------------------------------------------------------- /week9-10/PAM250.txt: -------------------------------------------------------------------------------- 1 | A A 2 2 | A C -2 3 | A D 0 4 | A E 0 5 | A F -3 6 | A G 1 7 | A H -1 8 | A I -1 9 | A K -1 10 | A L -2 11 | A M -1 12 | A N 0 13 | A P 1 14 | A Q 0 15 | A R -2 16 | A S 1 17 | A T 1 18 | A V 0 19 | A W -6 20 | A Y -3 21 | C A -2 22 | C C 12 23 | C D -5 24 | C E -5 25 | C F -4 26 | C G -3 27 | C H -3 28 | C I -2 29 | C K -5 30 | C L -6 31 | C M -5 32 | C N -4 33 | C P -3 34 | C Q -5 35 | C R -4 36 | C S 0 37 | C T -2 38 | C V -2 39 | C W -8 40 | C Y 0 41 | D A 0 42 | D C -5 43 | D D 4 44 | D E 3 45 | D F -6 46 | D G 1 47 | D H 1 48 | D I -2 49 | D K 0 50 | D L -4 51 | D M -3 52 | D N 2 53 | D P -1 54 | D Q 2 55 | D R -1 56 | D S 0 57 | D T 0 58 | D V -2 59 | D W -7 60 | D Y -4 61 | E A 0 62 | E C -5 63 | E D 3 64 | E E 4 65 | E F -5 66 | E G 0 67 | E H 1 68 | E I -2 69 | E K 0 70 | E L 
-3 71 | E M -2 72 | E N 1 73 | E P -1 74 | E Q 2 75 | E R -1 76 | E S 0 77 | E T 0 78 | E V -2 79 | E W -7 80 | E Y -4 81 | F A -3 82 | F C -4 83 | F D -6 84 | F E -5 85 | F F 9 86 | F G -5 87 | F H -2 88 | F I 1 89 | F K -5 90 | F L 2 91 | F M 0 92 | F N -3 93 | F P -5 94 | F Q -5 95 | F R -4 96 | F S -3 97 | F T -3 98 | F V -1 99 | F W 0 100 | F Y 7 101 | G A 1 102 | G C -3 103 | G D 1 104 | G E 0 105 | G F -5 106 | G G 5 107 | G H -2 108 | G I -3 109 | G K -2 110 | G L -4 111 | G M -3 112 | G N 0 113 | G P 0 114 | G Q -1 115 | G R -3 116 | G S 1 117 | G T 0 118 | G V -1 119 | G W -7 120 | G Y -5 121 | H A -1 122 | H C -3 123 | H D 1 124 | H E 1 125 | H F -2 126 | H G -2 127 | H H 6 128 | H I -2 129 | H K 0 130 | H L -2 131 | H M -2 132 | H N 2 133 | H P 0 134 | H Q 3 135 | H R 2 136 | H S -1 137 | H T -1 138 | H V -2 139 | H W -3 140 | H Y 0 141 | I A -1 142 | I C -2 143 | I D -2 144 | I E -2 145 | I F 1 146 | I G -3 147 | I H -2 148 | I I 5 149 | I K -2 150 | I L 2 151 | I M 2 152 | I N -2 153 | I P -2 154 | I Q -2 155 | I R -2 156 | I S -1 157 | I T 0 158 | I V 4 159 | I W -5 160 | I Y -1 161 | K A -1 162 | K C -5 163 | K D 0 164 | K E 0 165 | K F -5 166 | K G -2 167 | K H 0 168 | K I -2 169 | K K 5 170 | K L -3 171 | K M 0 172 | K N 1 173 | K P -1 174 | K Q 1 175 | K R 3 176 | K S 0 177 | K T 0 178 | K V -2 179 | K W -3 180 | K Y -4 181 | L A -2 182 | L C -6 183 | L D -4 184 | L E -3 185 | L F 2 186 | L G -4 187 | L H -2 188 | L I 2 189 | L K -3 190 | L L 6 191 | L M 4 192 | L N -3 193 | L P -3 194 | L Q -2 195 | L R -3 196 | L S -3 197 | L T -2 198 | L V 2 199 | L W -2 200 | L Y -1 201 | M A -1 202 | M C -5 203 | M D -3 204 | M E -2 205 | M F 0 206 | M G -3 207 | M H -2 208 | M I 2 209 | M K 0 210 | M L 4 211 | M M 6 212 | M N -2 213 | M P -2 214 | M Q -1 215 | M R 0 216 | M S -2 217 | M T -1 218 | M V 2 219 | M W -4 220 | M Y -2 221 | N A 0 222 | N C -4 223 | N D 2 224 | N E 1 225 | N F -3 226 | N G 0 227 | N H 2 228 | N I -2 229 | N K 1 230 | N L -3 231 | 
N M -2 232 | N N 2 233 | N P 0 234 | N Q 1 235 | N R 0 236 | N S 1 237 | N T 0 238 | N V -2 239 | N W -4 240 | N Y -2 241 | P A 1 242 | P C -3 243 | P D -1 244 | P E -1 245 | P F -5 246 | P G 0 247 | P H 0 248 | P I -2 249 | P K -1 250 | P L -3 251 | P M -2 252 | P N 0 253 | P P 6 254 | P Q 0 255 | P R 0 256 | P S 1 257 | P T 0 258 | P V -1 259 | P W -6 260 | P Y -5 261 | Q A 0 262 | Q C -5 263 | Q D 2 264 | Q E 2 265 | Q F -5 266 | Q G -1 267 | Q H 3 268 | Q I -2 269 | Q K 1 270 | Q L -2 271 | Q M -1 272 | Q N 1 273 | Q P 0 274 | Q Q 4 275 | Q R 1 276 | Q S -1 277 | Q T -1 278 | Q V -2 279 | Q W -5 280 | Q Y -4 281 | R A -2 282 | R C -4 283 | R D -1 284 | R E -1 285 | R F -4 286 | R G -3 287 | R H 2 288 | R I -2 289 | R K 3 290 | R L -3 291 | R M 0 292 | R N 0 293 | R P 0 294 | R Q 1 295 | R R 6 296 | R S 0 297 | R T -1 298 | R V -2 299 | R W 2 300 | R Y -4 301 | S A 1 302 | S C 0 303 | S D 0 304 | S E 0 305 | S F -3 306 | S G 1 307 | S H -1 308 | S I -1 309 | S K 0 310 | S L -3 311 | S M -2 312 | S N 1 313 | S P 1 314 | S Q -1 315 | S R 0 316 | S S 2 317 | S T 1 318 | S V -1 319 | S W -2 320 | S Y -3 321 | T A 1 322 | T C -2 323 | T D 0 324 | T E 0 325 | T F -3 326 | T G 0 327 | T H -1 328 | T I 0 329 | T K 0 330 | T L -2 331 | T M -1 332 | T N 0 333 | T P 0 334 | T Q -1 335 | T R -1 336 | T S 1 337 | T T 3 338 | T V 0 339 | T W -5 340 | T Y -3 341 | V A 0 342 | V C -2 343 | V D -2 344 | V E -2 345 | V F -1 346 | V G -1 347 | V H -2 348 | V I 4 349 | V K -2 350 | V L 2 351 | V M 2 352 | V N -2 353 | V P -1 354 | V Q -2 355 | V R -2 356 | V S -1 357 | V T 0 358 | V V 4 359 | V W -6 360 | V Y -2 361 | W A -6 362 | W C -8 363 | W D -7 364 | W E -7 365 | W F 0 366 | W G -7 367 | W H -3 368 | W I -5 369 | W K -3 370 | W L -2 371 | W M -4 372 | W N -4 373 | W P -6 374 | W Q -5 375 | W R 2 376 | W S -2 377 | W T -5 378 | W V -6 379 | W W 17 380 | W Y 0 381 | Y A -3 382 | Y C 0 383 | Y D -4 384 | Y E -4 385 | Y F 7 386 | Y G -5 387 | Y H 0 388 | Y I -1 389 | Y K -4 390 | 
def global_alignment(seq1, seq2, score_matrix, penalty):
    """Globally align two sequences under a linear gap penalty.

    Args:
        seq1: first sequence (string).
        seq2: second sequence (string).
        score_matrix: mapping ``(a, b) -> score`` for aligning symbol ``a``
            of ``seq1`` with symbol ``b`` of ``seq2``.
        penalty: amount subtracted from the score for every gap column.

    Returns:
        ``(score, aligned_seq1, aligned_seq2)``.  The score is returned as a
        string; gaps are written as ``'-'``.
    """
    len1, len2 = len(seq1), len(seq2)
    s = [[0] * (len2 + 1) for _ in range(len1 + 1)]
    backtrack = [[0] * (len2 + 1) for _ in range(len1 + 1)]
    for i in range(1, len1 + 1):
        s[i][0] = -i * penalty
    for j in range(1, len2 + 1):
        s[0][j] = -j * penalty
    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            # The candidate order fixes tie-breaking (first maximum wins):
            # 0 = gap in seq2, 1 = gap in seq1, 2 = match/mismatch.
            candidates = [s[i - 1][j] - penalty,
                          s[i][j - 1] - penalty,
                          s[i - 1][j - 1] + score_matrix[seq1[i - 1], seq2[j - 1]]]
            s[i][j] = max(candidates)
            backtrack[i][j] = candidates.index(s[i][j])

    # Walk back from the bottom-right corner collecting alignment columns in
    # reverse; appending to lists avoids re-slicing the strings on every gap.
    column1, column2 = [], []
    a, b = len1, len2
    while a > 0 and b > 0:
        move = backtrack[a][b]
        if move == 0:
            a -= 1
            column1.append(seq1[a])
            column2.append('-')
        elif move == 1:
            b -= 1
            column1.append('-')
            column2.append(seq2[b])
        else:
            a -= 1
            b -= 1
            column1.append(seq1[a])
            column2.append(seq2[b])
    # Whatever is left on one side can only be aligned against gaps.
    while a > 0:
        a -= 1
        column1.append(seq1[a])
        column2.append('-')
    while b > 0:
        b -= 1
        column1.append('-')
        column2.append(seq2[b])
    return str(s[len1][len2]), ''.join(reversed(column1)), ''.join(reversed(column2))


def mid_column_score(v, w, score_matrix, penalty):
    """Score column ``len(w) // 2`` of the alignment grid using two columns.

    Returns:
        ``(column, backtrack)`` where ``column[i]`` is the value stored for
        row ``i`` after the last processed column and ``backtrack[i]`` is the
        index of the winning candidate for that cell: 0 = diagonal,
        1 = horizontal (same row, previous column), 2 = vertical (previous
        row, same column).  ``backtrack[0]`` is never written and stays 0.
    """
    # s[i] holds [previous column, current column] for row i; the previous
    # column starts as the first grid column, i.e. -i * penalty.
    s = [[i * j * penalty for i in range(-1, 1)] for j in range(len(v) + 1)]
    s[0][1] = -penalty
    backtrack = [0] * (len(v) + 1)
    for j in range(1, len(w) // 2 + 1):
        for i in range(0, len(v) + 1):
            if i == 0:
                s[i][1] = -j * penalty
            else:
                scores = [s[i - 1][0] + score_matrix[v[i - 1], w[j - 1]], s[i][0] - penalty, s[i - 1][1] - penalty]
                s[i][1] = max(scores)
                backtrack[i] = scores.index(s[i][1])
        if j != len(w) // 2:
            # Shift: the column just computed becomes the "previous" column.
            s = [[row[1]] * 2 for row in s]
    return [i[1] for i in s], backtrack
def mid_edge(v, w, score_matrix, penalty):
    """Find a middle edge of the alignment grid of ``v`` against ``w``.

    Returns:
        ``(mid_node, next_node)``: two ``(row, column)`` pairs, the first in
        column ``len(w) // 2``, the second the node the chosen edge leads to.
    """
    source = mid_column_score(v, w, score_matrix, penalty)[0]
    # For odd len(w) > 1 the reversed string is padded with one character so
    # that the reversed pass processes len(w) // 2 + 1 columns; the padding
    # character sits beyond the processed columns and is never scored.
    padding = ['', '$'][len(w) % 2 == 1 and len(w) > 1]
    mid_to_sink, backtrack = list(map(
        lambda l: l[::-1],
        mid_column_score(v[::-1], w[::-1] + padding, score_matrix, penalty)))
    scores = list(map(sum, zip(source, mid_to_sink)))
    max_mid = max(range(len(scores)), key=lambda i: scores[i])
    if max_mid == len(scores) - 1:
        # Already in the last row: the only way forward is horizontal.
        next_node = (max_mid, len(w) // 2 + 1)
    else:
        next_node = [(max_mid + 1, len(w) // 2 + 1), (max_mid, len(w) // 2 + 1), (max_mid + 1, len(w) // 2), ][
            backtrack[max_mid]]
    return (max_mid, len(w) // 2), next_node


def linear_space_alignment(top, bottom, left, right, score_matrix, v=None, w=None, gap_penalty=None):
    """Align ``v[top:bottom]`` with ``w[left:right]`` by divide and conquer.

    Args:
        top, bottom: slice bounds into ``v``.
        left, right: slice bounds into ``w``.
        score_matrix: mapping ``(a, b) -> score``.
        v, w: the full sequences.  When omitted, the module globals ``seq1``
            and ``seq2`` are used (the original behaviour).
        gap_penalty: cost per gap column.  When omitted, the module global
            ``penalty`` is used (the original behaviour).

    Returns:
        A two-element sequence ``[aligned_v, aligned_w]``.
    """
    if v is None:
        v = seq1
    if w is None:
        w = seq2
    if gap_penalty is None:
        gap_penalty = penalty
    if left == right:
        return [v[top:bottom], '-' * (bottom - top)]
    elif top == bottom:
        return ['-' * (right - left), w[left:right]]
    elif bottom - top == 1 or right - left == 1:
        return global_alignment(v[top:bottom], w[left:right], score_matrix, gap_penalty)[1:]
    else:
        mid_node, next_node = mid_edge(v[top:bottom], w[left:right], score_matrix, gap_penalty)
        # mid_edge works on the slices; translate back to absolute indices.
        mid_node = tuple(map(sum, zip(mid_node, [top, left])))
        next_node = tuple(map(sum, zip(next_node, [top, left])))
        # The row/column delta of the middle edge (0 or 1) selects between a
        # gap and the symbol consumed by that edge.
        current = [['-', v[mid_node[0] % len(v)]][next_node[0] - mid_node[0]],
                   ['-', w[mid_node[1] % len(w)]][next_node[1] - mid_node[1]]]
        a = linear_space_alignment(top, mid_node[0], left, mid_node[1], score_matrix, v, w, gap_penalty)
        b = linear_space_alignment(next_node[0], bottom, next_node[1], right, score_matrix, v, w, gap_penalty)
        return [a[i] + current[i] + b[i] for i in range(2)]


def linear_space_global_alignment(v, w, score_matrix, penalty):
    """Align ``v`` and ``w`` via linear_space_alignment and score the result.

    Returns:
        ``(score, aligned_v, aligned_w)`` with the score as a string.  The
        score is recomputed column by column from the finished alignment.
    """
    # Pass the sequences and penalty explicitly so the result does not depend
    # on globals set by the script entry point.
    align1, align2 = linear_space_alignment(0, len(v), 0, len(w), score_matrix, v, w, penalty)
    p = []
    for i in zip(align1, align2):
        if '-' in i:
            p.append(-penalty)
        else:
            p.append(score_matrix[i])
    score = sum(p)
    return str(score), align1, align2


if __name__ == '__main__':
    with open('linear_space_alignment.txt') as f:
        seq1 = f.readline().strip()
        seq2 = f.readline().strip()
    with open('BLOSUM62.txt') as f1:
        # One "A B score" triple per line; blank lines are skipped.
        lines = [line.strip().split() for line in f1 if line.strip()]
        matrix = {(i[0], i[1]): int(i[2]) for i in lines}
    penalty = 5
    alignment = '\n'.join(linear_space_global_alignment(seq1, seq2, matrix, penalty))
    print(alignment)
'''
    RANDOMIZEDMOTIFSEARCH(Dna, k, t)
        randomly select k-mers Motifs = (Motif1, …, Motift) in each string
            from Dna
        BestMotifs ← Motifs
        while forever
            Profile ← Profile(Motifs)
            Motifs ← Motifs(Profile, Dna)
            if Score(Motifs) < Score(BestMotifs)
                BestMotifs ← Motifs
            else
                return BestMotifs
'''
import random


def iterate_randomized_motif_search(dna, k, t, iterations=1000):
    """Run randomized_motif_search repeatedly and keep the best-scoring motifs.

    Args:
        dna: list of DNA strings.
        k: motif length.
        t: number of strings of ``dna`` used to seed the initial motifs.
        iterations: number of random restarts (default 1000).

    Returns:
        The motif list with the lowest ``score`` seen; initially the first
        k-mer of each of the first ``t`` strings.
    """
    best_motifs = [dna[index][:k] for index in range(t)]
    best_score = score(best_motifs)
    # A dedicated loop variable: the original reused the seeding index as the
    # restart counter, so the number of restarts depended on t.
    for _ in range(iterations):
        motifs = randomized_motif_search(dna, k, t)
        current_score = score(motifs)
        if current_score < best_score:
            best_motifs, best_score = motifs, current_score
    return best_motifs


def randomized_motif_search(dna, k, t):
    """Perform one randomized motif search from a random start.

    Starts from random k-mers, then alternates profile construction and
    profile-most-probable selection until the score stops improving.
    ``t`` is accepted for signature compatibility and is not used.
    """
    best_motifs = random_motifs(dna, k)
    best_score = score(best_motifs)
    while True:
        profile = create_profile(best_motifs)
        motifs = create_motifs(profile, dna)
        current_score = score(motifs)
        if current_score < best_score:
            best_motifs, best_score = motifs, current_score
        else:
            return best_motifs


def create_motifs(profile, dna):
    """Return the profile-most-probable k-mer of every string in ``dna``.

    The motif length is ``len(profile)`` (one profile row per position).
    """
    return [profile_most_probable(text, profile, len(profile)) for text in dna]
# Column order used by every profile row: [A, C, G, T].
_NUCLEOTIDES = 'ACGT'
_NUCLEOTIDE_INDEX = {nucleotide: index for index, nucleotide in enumerate(_NUCLEOTIDES)}


def calculate_probablity(profile, string):
    """Return the probability of ``string`` under ``profile``.

    ``profile[i]`` is ``[P(A), P(C), P(G), P(T)]`` for position ``i``.
    Symbols other than A/C/G/T contribute no factor.
    """
    probability = 1
    for position, symbol in enumerate(string):
        column = _NUCLEOTIDE_INDEX.get(symbol)
        if column is not None:
            probability = probability * profile[position][column]
    return probability


def profile_most_probable(dna_string, profile, k):
    """Return the k-mer of ``dna_string`` with the highest profile probability.

    Ties go to the leftmost k-mer; if every k-mer has probability 0 the
    first k-mer is returned.
    """
    best_pattern = dna_string[:k]
    best_probability = 0
    for start in range(len(dna_string) - k + 1):
        candidate = dna_string[start:start + k]
        probability = calculate_probablity(profile, candidate)
        if probability > best_probability:
            best_pattern = candidate
            best_probability = probability
    return best_pattern


def create_profile(motifs):
    """Build a profile matrix from equal-length motifs using pseudocounts.

    Every count starts at 1, so each row is
    ``(count + 1) / (len(motifs) + 4)`` in the order A, C, G, T.
    """
    total = len(motifs) + 4
    profile = []
    for position in range(len(motifs[0])):
        counts = [1, 1, 1, 1]
        for motif in motifs:
            column = _NUCLEOTIDE_INDEX.get(motif[position])
            if column is not None:
                counts[column] += 1
        profile.append([count / total for count in counts])
    return profile


def hamming_distance(str1, str2):
    """Count positions where the two strings differ (up to the shorter length)."""
    return sum(1 for s1, s2 in zip(str1, str2) if s1 != s2)


def find_consensus(motifs):
    """Return the consensus string of equal-length motifs.

    For every position the most frequent nucleotide wins; ties are broken
    in the order A, C, G, T.
    """
    consensus = ''
    for position in range(len(motifs[0])):
        counts = {nucleotide: 0 for nucleotide in _NUCLEOTIDES}
        for motif in motifs:
            if motif[position] in counts:
                counts[motif[position]] += 1
        # max() returns the first maximal item, which preserves the
        # A > C > G > T tie-breaking of the original if/elif chain.
        consensus += max(_NUCLEOTIDES, key=counts.get)
    return consensus
def score(motifs):
    """Return the total Hamming distance between the motifs and their consensus."""
    consensus = find_consensus(motifs)
    return sum(hamming_distance(consensus, motif) for motif in motifs)


def random_motifs(dna, k):
    """Pick one uniformly random k-mer from every string in ``dna``."""
    motifs = []
    for text in dna:
        start = random.randint(0, len(text) - k)
        motifs.append(text[start:start + k])
    return motifs


if __name__ == "__main__":
    # Input layout: k, t, then the DNA strings, all whitespace-separated.
    with open('randomized_motif_search.txt') as f:
        data = f.read().split()
    best_motifs = iterate_randomized_motif_search(data[2:], int(data[0]), int(data[1]))
    for i in best_motifs:
        print(i)
F 6 86 | F G -3 87 | F H -1 88 | F I 0 89 | F K -3 90 | F L 0 91 | F M 0 92 | F N -3 93 | F P -4 94 | F Q -3 95 | F R -3 96 | F S -2 97 | F T -2 98 | F V -1 99 | F W 1 100 | F Y 3 101 | G A 0 102 | G C -3 103 | G D -1 104 | G E -2 105 | G F -3 106 | G G 6 107 | G H -2 108 | G I -4 109 | G K -2 110 | G L -4 111 | G M -3 112 | G N 0 113 | G P -2 114 | G Q -2 115 | G R -2 116 | G S 0 117 | G T -2 118 | G V -3 119 | G W -2 120 | G Y -3 121 | H A -2 122 | H C -3 123 | H D -1 124 | H E 0 125 | H F -1 126 | H G -2 127 | H H 8 128 | H I -3 129 | H K -1 130 | H L -3 131 | H M -2 132 | H N 1 133 | H P -2 134 | H Q 0 135 | H R 0 136 | H S -1 137 | H T -2 138 | H V -3 139 | H W -2 140 | H Y 2 141 | I A -1 142 | I C -1 143 | I D -3 144 | I E -3 145 | I F 0 146 | I G -4 147 | I H -3 148 | I I 4 149 | I K -3 150 | I L 2 151 | I M 1 152 | I N -3 153 | I P -3 154 | I Q -3 155 | I R -3 156 | I S -2 157 | I T -1 158 | I V 3 159 | I W -3 160 | I Y -1 161 | K A -1 162 | K C -3 163 | K D -1 164 | K E 1 165 | K F -3 166 | K G -2 167 | K H -1 168 | K I -3 169 | K K 5 170 | K L -2 171 | K M -1 172 | K N 0 173 | K P -1 174 | K Q 1 175 | K R 2 176 | K S 0 177 | K T -1 178 | K V -2 179 | K W -3 180 | K Y -2 181 | L A -1 182 | L C -1 183 | L D -4 184 | L E -3 185 | L F 0 186 | L G -4 187 | L H -3 188 | L I 2 189 | L K -2 190 | L L 4 191 | L M 2 192 | L N -3 193 | L P -3 194 | L Q -2 195 | L R -2 196 | L S -2 197 | L T -1 198 | L V 1 199 | L W -2 200 | L Y -1 201 | M A -1 202 | M C -1 203 | M D -3 204 | M E -2 205 | M F 0 206 | M G -3 207 | M H -2 208 | M I 1 209 | M K -1 210 | M L 2 211 | M M 5 212 | M N -2 213 | M P -2 214 | M Q 0 215 | M R -1 216 | M S -1 217 | M T -1 218 | M V 1 219 | M W -1 220 | M Y -1 221 | N A -2 222 | N C -3 223 | N D 1 224 | N E 0 225 | N F -3 226 | N G 0 227 | N H 1 228 | N I -3 229 | N K 0 230 | N L -3 231 | N M -2 232 | N N 6 233 | N P -2 234 | N Q 0 235 | N R 0 236 | N S 1 237 | N T 0 238 | N V -3 239 | N W -4 240 | N Y -2 241 | P A -1 242 | P C -3 243 | P D -1 
244 | P E -1 245 | P F -4 246 | P G -2 247 | P H -2 248 | P I -3 249 | P K -1 250 | P L -3 251 | P M -2 252 | P N -2 253 | P P 7 254 | P Q -1 255 | P R -2 256 | P S -1 257 | P T -1 258 | P V -2 259 | P W -4 260 | P Y -3 261 | Q A -1 262 | Q C -3 263 | Q D 0 264 | Q E 2 265 | Q F -3 266 | Q G -2 267 | Q H 0 268 | Q I -3 269 | Q K 1 270 | Q L -2 271 | Q M 0 272 | Q N 0 273 | Q P -1 274 | Q Q 5 275 | Q R 1 276 | Q S 0 277 | Q T -1 278 | Q V -2 279 | Q W -2 280 | Q Y -1 281 | R A -1 282 | R C -3 283 | R D -2 284 | R E 0 285 | R F -3 286 | R G -2 287 | R H 0 288 | R I -3 289 | R K 2 290 | R L -2 291 | R M -1 292 | R N 0 293 | R P -2 294 | R Q 1 295 | R R 5 296 | R S -1 297 | R T -1 298 | R V -3 299 | R W -3 300 | R Y -2 301 | S A 1 302 | S C -1 303 | S D 0 304 | S E 0 305 | S F -2 306 | S G 0 307 | S H -1 308 | S I -2 309 | S K 0 310 | S L -2 311 | S M -1 312 | S N 1 313 | S P -1 314 | S Q 0 315 | S R -1 316 | S S 4 317 | S T 1 318 | S V -2 319 | S W -3 320 | S Y -2 321 | T A 0 322 | T C -1 323 | T D -1 324 | T E -1 325 | T F -2 326 | T G -2 327 | T H -2 328 | T I -1 329 | T K -1 330 | T L -1 331 | T M -1 332 | T N 0 333 | T P -1 334 | T Q -1 335 | T R -1 336 | T S 1 337 | T T 5 338 | T V 0 339 | T W -2 340 | T Y -2 341 | V A 0 342 | V C -1 343 | V D -3 344 | V E -2 345 | V F -1 346 | V G -3 347 | V H -3 348 | V I 3 349 | V K -2 350 | V L 1 351 | V M 1 352 | V N -3 353 | V P -2 354 | V Q -2 355 | V R -3 356 | V S -2 357 | V T 0 358 | V V 4 359 | V W -3 360 | V Y -1 361 | W A -3 362 | W C -2 363 | W D -4 364 | W E -3 365 | W F 1 366 | W G -2 367 | W H -2 368 | W I -3 369 | W K -3 370 | W L -2 371 | W M -1 372 | W N -4 373 | W P -4 374 | W Q -2 375 | W R -3 376 | W S -3 377 | W T -2 378 | W V -3 379 | W W 11 380 | W Y 2 381 | Y A -2 382 | Y C -2 383 | Y D -3 384 | Y E -2 385 | Y F 3 386 | Y G -3 387 | Y H 2 388 | Y I -1 389 | Y K -2 390 | Y L -1 391 | Y M -1 392 | Y N -2 393 | Y P -3 394 | Y Q -1 395 | Y R -2 396 | Y S -2 397 | Y T -2 398 | Y V -1 399 | Y W 2 400 | Y Y 7 
from sys import argv
import random
import fileinput
from itertools import product


def Composition(k, text):
    """Return all k-mers of ``text`` in sorted order (duplicates kept)."""
    return sorted(text[i:i + k] for i in range(len(text) + 1 - k))


def PairedComposition(k, d, text):
    """Return the sorted (k, d)-mer composition of ``text``.

    Each item is a pair of k-mers whose start positions are ``k + d`` apart.
    """
    return sorted((text[i:i + k], text[i + k + d:i + k + d + k])
                  for i in range(len(text) + 1 - k - d - k))


def PairedPrefix(pair):
    """Drop the last symbol of both reads of a read pair."""
    return (pair[0][:-1], pair[1][:-1])


def PairedSuffix(pair):
    """Drop the first symbol of both reads of a read pair."""
    return (pair[0][1:], pair[1][1:])


def SuffixComposition(k, text, uniq=False):
    """Return the sorted (k-1)-symbol prefixes of every k-mer of ``text``.

    Despite the name, the slice taken is ``text[i:i + k - 1]``, i.e. the
    prefix of the k-mer starting at ``i``.  With ``uniq=True`` duplicates
    are removed.
    """
    kmers = [text[i:i + k - 1] for i in range(len(text) + 1 - k)]
    if uniq:
        return sorted(set(kmers))
    return sorted(kmers)


def GenomePathProblem(kmers, apppend_last=True):
    """Spell the string described by a path of overlapping k-mers.

    Takes the first symbol of every k-mer; when ``apppend_last`` is true the
    rest of the final k-mer is appended.  Returns ``""`` for an empty path.
    """
    if not kmers:
        return ""
    genome = "".join(kmer[0] for kmer in kmers)
    if apppend_last:
        genome += kmers[-1][1:]
    return genome


def Suffix(text):
    """Return ``text`` without its first symbol."""
    return text[1:]


def Prefix(text):
    """Return ``text`` without its last symbol."""
    return text[0:-1]


# Combining this with integer to pattern will be more efficient
def Overlap(patterns):
    """Return the overlap graph of ``patterns`` as a list of edges.

    An edge ``(k1, k2)`` is emitted when ``Suffix(k1) == Prefix(k2)``.  Edges
    are ordered by sorted ``k1`` and then by sorted ``k2``.
    """
    kmers = sorted(patterns)
    # Group k-mers by prefix once instead of comparing every pair.
    by_prefix = {}
    for kmer in kmers:
        by_prefix.setdefault(Prefix(kmer), []).append(kmer)
    return [(k1, k2) for k1 in kmers for k2 in by_prefix.get(Suffix(k1), [])]


def deBrujin(k, text):
    """Build the de Bruijn graph of the k-mers of ``text``.

    Returns a dict mapping each (k-1)-mer that starts a k-mer to the list of
    (k-1)-mers that follow it, one entry per k-mer occurrence.
    """
    # Initialise the dictionary with every node that has an outgoing edge.
    adj_list = {kmer: [] for kmer in SuffixComposition(k, text)}
    for i in range(len(text) + 1 - k):
        adj_list[text[i:i + k - 1]].append(text[i + 1:i + k])
    return adj_list
def deBrujinGraphFromKmers(kmers_in):
    """Build a de Bruijn graph from a collection of k-mers.

    Every k-mer contributes one edge from its prefix (all but the last
    symbol) to its suffix (all but the first symbol).  Nodes without
    outgoing edges do not get a key.
    """
    adj_dict = {}
    for kmer in kmers_in:
        adj_dict.setdefault(kmer[:-1], []).append(kmer[1:])
    return adj_dict


def _walk_all_edges(adj_list, start):
    """Follow unused edges from ``start`` with an explicit stack.

    Edges are consumed from ``adj_list`` (the lists are emptied in place).
    Returns the visited nodes in traversal order.
    """
    stack = [start]
    path = []
    while stack:
        edges = adj_list.get(stack[-1])
        if edges:
            # Take the first remaining edge and mark it used by removing it.
            stack.append(edges.pop(0))
        else:
            # Dead end (no key or no edges left): this node is finished.
            path.append(stack.pop())
    return path[::-1]


def EulerianCycleProblem(adj_list):
    """Return an Eulerian cycle starting at the smallest node key.

    NOTE: consumes ``adj_list`` -- every visited edge is removed from it.
    Returns ``[]`` for an empty graph.
    """
    if not adj_list:
        return []
    return _walk_all_edges(adj_list, min(adj_list))


def getBalanceCount(adj_list):
    """Return ``in-degree - out-degree`` for every node of the graph.

    Nodes that only appear as edge targets are included as well.
    """
    balanced_count = dict.fromkeys(adj_list, 0)
    for node, successors in adj_list.items():
        for out in successors:
            balanced_count[node] -= 1
            balanced_count[out] = balanced_count.get(out, 0) + 1
    return balanced_count


def EulerianPathProblem(adj_list):
    """Return an Eulerian path through the graph.

    The walk starts at the first node whose balance is -1 (one more outgoing
    than incoming edge).  If no such node exists the walk starts at the
    smallest key, as EulerianCycleProblem does.

    NOTE: consumes ``adj_list`` -- every visited edge is removed from it.
    Returns ``[]`` for an empty graph.
    """
    if not adj_list:
        return []
    balanced_count = getBalanceCount(adj_list)
    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
    start = next((node for node, balance in balanced_count.items() if balance == -1), None)
    if start is None:
        start = min(adj_list)
    return _walk_all_edges(adj_list, start)
def StringReconstructionProblem(kmers):
    """Reconstruct a string from its k-mers via an Eulerian path."""
    return GenomePathProblem(EulerianPathProblem(deBrujinGraphFromKmers(kmers)))


def BinaryStrings(k):
    """Return all binary strings of length ``k`` in lexicographic order."""
    # product() over "01" already yields tuples in lexicographic order.
    return ["".join(bits) for bits in product("01", repeat=k)]


def kUniversalStringProblem(k):
    """Return a k-universal circular binary string.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    if k == 1:
        # The general path below slices with -(k - 1) == 0, which would empty
        # the cycle; the 1-universal circular string is simply both symbols.
        return "01"
    cycle = EulerianCycleProblem(deBrujinGraphFromKmers(BinaryStrings(k)))
    # The last k - 1 nodes wrap around onto the start of the circular string.
    cycle = cycle[:-(k - 1)]
    return cycle[0][:-1] + "".join(node[-1] for node in cycle)
TMYIPAGLLRQNPACRQVTEYHYGIITEMHVGEDCRQNMFKGTRLDCRKDIIHIYVKVGPYPVTENKYLGVEIHSIGHDMFNETDMYRYCQVCGASFTYGYFCRHQKAAYHGAIWQKGCERINKMCFFYDFWFHHSWYKTREKGAHDYYWPHPGFDNPNHKYMGLHRGYCKFRNRSRGSEKKVKRDGKENSAHERQVPTDHYFLLSKDNHKLHSCIMNYYFKLRECQHPAWETCHSMQWFDWQGWGYSEPANLHQHWCDMQECCYNHIWPHLNLTIPSSDHLYTCDYPYCIQLPKCQFFKHYWCKGVRKPQCPFTPEHDVFKYNSDWGMPHERLAGSKPHWSNTIYTHATYAGFNYSLQPICLGGIISKECKAAECRWIRNDMEYWPLTWEQQFEHRGSMIPTYMLDRDHLKSKNIMAMGPALMFCWTWAVASVFYVIRPPVQNEGYFNSLTPKWFATQHIWMTWTIKENYCMWLHRPYQQREHPTLYLNDRGAFKREKKWSMMRQHLKEMILPVYGRKILQENRILWKNAVFADYSYMCPCFVPMPGPKKTKFTFAGSGELHTWKSCMDWNGASDIGGVWMLIWITWKQPTNPFFISMRPIMKQGLHHQYYKFGCDDMQLSFQFNSVGLKDKTYMQGYEQNFFHYYIAIWVLVDENCLFWEINCKFHPFLINTTNCQRFDFCHEDEAASQMNNMDFDCMCDFMDPTCRCYWCNQTSQGCICQRALEYENFQTGVKYEQLECQHSWLIEYKVVFNCNMETNQPTVPEREWNRGVGDSRRTCFHDAVKFLAHFTECWLCRVMLGGLETCWGTHMFEQQWSALLCPDYAKNSIRLQQNWKVAGYYWSNNSYDRMTMSQDKTEFNEFGFLPVRNGIGAYMSQALPCMRSNVLHHTNYWMTNRADLDYMMDFRVVRDKHKAKPQHSEKNTHVHHEHIIYSPKLMNMFHYRNCLYHIIRPRIHGVQCKDGKRNMFTHTAHSATISMKIEVHAFEWVPGSRVWSCFKHQLTCINVQTCVHPHENCNFGRGNDNVATRAEQWFIFTAFNFGEFKKKERYRPTIMMCWDHVNPKFALKEKACRLPGNEENSSAYYLGYTKFGQGGHHKSNAFISLWNYKMAAKKFDDDYKIEMNLGWIEMGWPDDEWQFKEKTWIIVISGLRMQQMTTCWQSDRDQNTQFGNYMKPKVAVSWEMCANEVKYSDMFVHGFEFQQNSHNVVDYCQEHCEELEVNISSHSRPWGTIVKAVEFIHLDRRISGYNIMFINDFCIEPTQYVIRTISPVNGYIFNYGMNVPEINTKGRPFFILGVQALSERHAAEESCKDIAPQCPINETAVEIQLDWITNESVHFYNCNSKHFIYKCCIVINLMWDYRDIYRENFADREWSENQWRDQTERQWINEWRTQIYHQLYICIRSHWYITQWHWGFQKWMVRPPCTNCSAQHKEAATIMYINRMQTDAMVGDHPFHFCCVYFMWTTSYTLTKCMPIQTQKMAWFRKQQQPDYNFQLDIMDNPYLYAYYPNDMQQTHPKNEWGPPTHPNYNAEDGRMPRGIWLVYGMMQRSKTVYLCGSYGTLPWCRQNTWSDFTPNNCFMAYWAMYAFTRDLVVFFSLFTKTGQTNQICRQYAATQHDCYYIISVFDWR 2 | 
HQRQCNKMNHVCYRTYYMYSWQWARINNNQGLFYECSGCTDVPCGNCFWAYRCLSVMMHCCYRKNSACIAVIIEMNWEIGLAGTVYPFDPGGGQVPIPEIFWVFMAQWIFPMKQRVDWNTLPGYGVAVHNLWPMVQDSSKSNCMHRECARWVQRQVPQQGETDWWIIMIVGPFGKQNPLPNCVSEPQDTSGVPNHFDGTMEVDIKWDYHMCPDERSGWTSKNHAEWNRFTCWHSKSGNEFFSTAKWKKDFISMQVPWAECYKPFKRTMYYMQVVRSTLVIHTEPVWWQPYNKYVIWTIFAHKHTASPYQQHPYRDVDFSKWKCCCLCHHIGKEWFFSWCAACKVCVNTFWVHQKPKDTCITLWCTSLWPPIHEHRWLDDKEYCIPMNHFTPHWGWWPIERGWHMYLDWLPCGMAFAGVKFDQFRAKKDDLDLQGPAVLKLSFIESTVNIRNIDYYQPVYSTLYPRFYCEMNQCSNMHNPCRRFSLKNVLHWFYSDQRSAQRYINENMGCWVITAMDAGNESSKTFPWQPNDWWAHCCGCVEGANRDYCWTTINNDISHNKYNQHTAVESISFWFQVHFCNLMGTYFVALQSANEPFQFRTQDNCPSQEVDSKKDSDQNFYQEDKQIQNDRENFYDRPNACDHPKDGYTHRQGCDWRPKTASVDWAEWLSKYTIGHPRAVYARERSYQNNMNNHIDFILCEAPPKEKDSVDWADKEPEFIENTFAARGQKTESGNVCDLNLTYCCFVKVHESERSDGAPGQKGMQTVRDNQKEECEYQCSEREAPMMWACNCPLIAMTESFCQMLYLEGMELYGNFIFRDWKDPPDIKVAPCYIALHPFPNATFSHTFIDKAWCPLQHCMRRSYASNWRWAMSVDCVIDFMEHQQLWSYYTKDNFMAFSICCEHFTYMEGRWDTWDPETCTWNFLGNQQCFNQWKEGMDVTDTPHAMWYPWNLDLAWVCNLCRFQPFPFRMIRIFPALCPVNMDTHRDTICKIYFLTYYHYYRTWRQHVMEASGPRRFTSFMSWCVKPVKHNHANSVNNFDVKQNVTHTTIFYVVNTWGKQLQQMWMAPPTMTDFECNWHVPKCWSFKLRQNPHEYMRQSTHMIIIEMTVGEDCRQNMFKGIRAVTEICGEPDILHSIGHDMFNETTMYRYWMGPHHRQVCIWCKISASFWCHRAYGAAYHGAIWQKGCERINKQCFFHWYTTREKGAHDPYYPHTSERCWGFDDIYCHFMWFPNHKYMGLHRGVCWLCSRSRGCEKKSAHEVDLGRHCHQVPTDHYFLLSKDNHKKWCKISPAAHSMIMNYYTRECQHPAWEKCHSMKWFEWQGWGYSLPANLHQHWCDMQECCYNHITITVWYDWTSSSDNSFFSMTCDYATKYCIQLPKCQFFKHYYGHMIPSCPFDPEHDSFLWTVKYYNSEISCCMPHERLAGLKQVPCIPSRYETIYATYAGFNYSLQPGRIPAKGYVLGGIISKECAECRWIRNDMEEQAYWPLTWEQQFEHRGSMITRKYHWSCTYMMGVTHHGAKRDHLKSKHMIHHWIEIMAMGPALMFCWTLAVASVFYVRMSRPPRQGEGYCSSLEGEPTPKWRATHDWPDDVIWMTWIATHNYYQHPLYLNDRGAFQVPWHMEKKMSMMRQHLKEPPLFGQARTILENLVSGGADYSYVYYWQFMPASTQPKDEYPSQLTKFTFAGSDELHTERPEKSCMDWNGDIGGNWMRPSWMWNIWITWKTDPGNPRPICKQGLHHHYWIRRCYSFQFNSVGGYEQNFFDYYIAIHTCYSNNYDVLVDENCLFINCKTNCQRFDFCHEDAIHTALFPSMVMNNMDFDCMCDFPIPTCRWCNQENKFDTMMWMEQLECDHKVVFNCNMETNPIREWNYGVGDSRRTFHDAVKFLAHFTECWLCRVMLGGLETCGGTHMFEQQWSYLLCPWPYCHQKVYRAGYYWSNNSYDRMTMSQDKTEFNEFGFLPGRNGQGAYARGTGEWCASMLVNCSERGQMMPVWRPLS
def get_spectral_convolution_dict(spectrum, min_mass=57, max_mass=200):
    """Count the pairwise mass differences of ``spectrum``.

    Args:
        spectrum: iterable of integer masses (need not be sorted).
        min_mass, max_mass: inclusive range of differences that are kept;
            the defaults 57 and 200 are the bounds the original hard-coded.

    Returns:
        dict mapping each kept difference to its multiplicity.
    """
    ordered = sorted(spectrum)
    convolution_dict = {}
    for i, low in enumerate(ordered):
        for high in ordered[i + 1:]:
            mass = high - low
            if mass < min_mass or mass > max_mass:
                continue
            convolution_dict[mass] = convolution_dict.get(mass, 0) + 1
    return convolution_dict


def get_top_m_elements(convolution_dict, m):
    """Return the ``m`` most frequent masses of the convolution, with ties.

    Masses are ordered by decreasing multiplicity (stable for equal counts).
    Every mass whose multiplicity equals that of the m-th entry is included,
    so the result can be longer than ``m``.  Returns ``[]`` when ``m <= 0``.
    """
    ranked = sorted(convolution_dict.items(), key=lambda entry: entry[1], reverse=True)
    if m <= 0:
        return []
    if m >= len(ranked):
        return [mass for mass, _ in ranked]
    cutoff = ranked[m - 1][1]
    # ranked is sorted by count, so this keeps a prefix that extends through
    # the complete run of entries tied with the m-th one.
    return [mass for mass, count in ranked if count >= cutoff]


def expand(peptides, amino_acid_mass_list):
    """Extend every peptide by every mass in ``amino_acid_mass_list``.

    Returns new lists; the input peptides are not modified.
    """
    return [list(peptide) + [mass] for peptide in peptides for mass in amino_acid_mass_list]
def get_parent_mass(spectrum):
    """Return the largest mass of ``spectrum`` (its last entry when sorted)."""
    return max(spectrum)


def _count_shared_masses(theoretical, experimental):
    """Return the size of the multiset intersection of two mass lists."""
    remaining = {}
    for mass in experimental:
        remaining[mass] = remaining.get(mass, 0) + 1
    shared = 0
    for mass in theoretical:
        if remaining.get(mass, 0) > 0:
            remaining[mass] -= 1
            shared += 1
    return shared


def make_score(peptide, spectrum):
    """Score ``peptide`` against ``spectrum`` using its cyclic spectrum."""
    return _count_shared_masses(cyclospectrum(peptide), spectrum)


def cyclospectrum(peptide):
    """Return the sorted theoretical spectrum of a cyclic peptide.

    ``peptide`` is a sequence of integer masses.
    """
    prefix_mass = [0]
    for i in range(len(peptide)):
        prefix_mass.append(prefix_mass[i] + peptide[i])

    theoretical_spectrum = [0]
    for i in range(len(prefix_mass) - 1):
        for j in range(i + 1, len(prefix_mass)):
            theoretical_spectrum.append(prefix_mass[j] - prefix_mass[i])
            if i > 0 and j < len(prefix_mass) - 1:
                # Complement of an interior fragment: the subpeptide that
                # wraps around the end of the list.
                theoretical_spectrum.append(prefix_mass[-1] - (prefix_mass[j] - prefix_mass[i]))
    return sorted(theoretical_spectrum)


def linear_spectrum(peptide):
    """Return the sorted theoretical spectrum of a linear peptide."""
    prefix_mass = [0]
    for mass in peptide:
        prefix_mass.append(prefix_mass[-1] + mass)
    lin_spectrum = [0]
    for i in range(len(peptide)):
        for j in range(i + 1, len(peptide) + 1):
            lin_spectrum.append(prefix_mass[j] - prefix_mass[i])
    return sorted(lin_spectrum)


def linearscore(peptide, spectrum):
    """Score ``peptide`` against ``spectrum`` using its linear spectrum."""
    return _count_shared_masses(linear_spectrum(peptide), spectrum)


def trim(leaderboard, spectrum, N):
    """Keep the ``N`` best peptides of ``leaderboard``, including ties.

    Peptides are ranked by ``linearscore``.  If the leaderboard holds fewer
    than ``N`` peptides it is returned unchanged; otherwise every peptide
    scoring at least as well as the N-th best is kept, in original order.
    """
    if len(leaderboard) < N:
        return leaderboard
    # Score each peptide once and reuse the value for ranking and filtering.
    scores = [linearscore(peptide, spectrum) for peptide in leaderboard]
    score_min = sorted(scores, reverse=True)[N - 1]
    return [peptide for peptide, peptide_score in zip(leaderboard, scores) if peptide_score >= score_min]
def leaderboard_cyclopeptide_sequencing(spectrum, n, amino_acid_mass_list):
    """Search for the cyclic peptide that best explains ``spectrum``.

    Args:
        spectrum: experimental spectrum (list of integer masses).
        n: leaderboard size passed to ``trim``.
        amino_acid_mass_list: masses a peptide may be built from.

    Returns:
        The best-scoring peptide as a list of masses, or an empty list when
        no candidate reaches the parent mass with a positive score.
    """
    parent_mass = get_parent_mass(spectrum)
    leaderboard = [[]]
    leader_peptide = []
    leader_peptidescore = 0
    while leaderboard:
        survivors = []
        for peptide in expand(leaderboard, amino_acid_mass_list):
            mass = sum(peptide)
            if mass > parent_mass:
                # Too heavy: can never match the parent mass again.
                continue
            if mass == parent_mass:
                score = make_score(peptide, spectrum)
                if score > leader_peptidescore:
                    leader_peptide = peptide
                    leader_peptidescore = score
            survivors.append(peptide)
        leaderboard = trim(survivors, spectrum, n)
    return leader_peptide


def convolution_cyclopeptide_sequencing(m, n, spectrum):
    """Sequence a cyclic peptide using masses taken from the convolution.

    The ``m`` most frequent convolution masses (with ties) form the alphabet
    for leaderboard sequencing with leaderboard size ``n``.
    """
    spectrum = sorted(spectrum)
    convolution_dict = get_spectral_convolution_dict(spectrum)
    top_amino_acid_mass = get_top_m_elements(convolution_dict, m)
    return leaderboard_cyclopeptide_sequencing(spectrum, n, top_amino_acid_mass)


if __name__ == '__main__':
    with open('convolution_cyclopeptide_sequencing.txt') as f:
        m = int(f.readline())
        n = int(f.readline())
        spectrum = list(map(int, f.readline().split()))
    print('-'.join([str(i) for i in convolution_cyclopeptide_sequencing(m, n, spectrum)]))
i[1], i[3], i[2]) 12 | path.append(P) 13 | break 14 | return path 15 | 16 | 17 | def two_break_on_genome(genome, i, j, k, l): 18 | g = colored_edges(genome) 19 | g = two_break_on_genome_graph(g, i, j, k, l) 20 | genome = graph_to_genome(g) 21 | return genome 22 | 23 | 24 | def two_break_on_genome_graph(g, i, j, k, l): 25 | rem = ((i, j), (j, i), (k, l), (l, k)) 26 | bg = [t for t in g if t not in rem] 27 | bg.append((i, k)) 28 | bg.append((j, l)) 29 | return bg 30 | 31 | 32 | def two_break_distance(P, Q): 33 | blue = colored_edges(P) 34 | red = colored_edges(Q) 35 | size = len(blue) + len(red) 36 | l = colored_edges_cycles(blue, red) 37 | return size // 2 - len(l) 38 | 39 | 40 | def permutation_list_to_str(p): 41 | ps = [] 42 | for i in p: 43 | if i > 0: 44 | ps.append('+' + str(i)) 45 | elif i == 0: 46 | ps.append('0') 47 | elif i < 0: 48 | ps.append(str(i)) 49 | return '(' + ' '.join(ps) + ')' 50 | 51 | 52 | def permutation_str_to_list(str_p): 53 | p = list(map(int, str_p.strip()[1:-1].split(' '))) 54 | return p 55 | 56 | 57 | def format_sequence(s): 58 | fs = [] 59 | for i in s: 60 | str_p = permutation_list_to_str(i) 61 | fs.append(str_p) 62 | return fs 63 | 64 | 65 | def chromosome_to_cycle(p): 66 | nodes = [] 67 | for i in p: 68 | if i > 0: 69 | nodes.append(2 * i - 1) 70 | nodes.append(2 * i) 71 | else: 72 | nodes.append(-2 * i) 73 | nodes.append(-2 * i - 1) 74 | return nodes 75 | 76 | 77 | def cycle_to_chromosome(nodes): 78 | p = [] 79 | for j in range(0, len(nodes) // 2): 80 | if nodes[2 * j] < nodes[2 * j + 1]: 81 | s = j + 1 82 | else: 83 | s = -(j + 1) 84 | p.append(s) 85 | return p 86 | 87 | 88 | def genome_str_to_list(genome): 89 | lp = [] 90 | for p in genome.split('(')[1:]: 91 | p = permutation_str_to_list('(' + p) 92 | lp.append(p) 93 | return lp 94 | 95 | 96 | def colored_edges(genome): 97 | g = [] 98 | for p in genome: 99 | s = chromosome_to_cycle(p) 100 | for j in range(len(s) // 2): 101 | head = 1 + 2 * j 102 | tail = (2 + 2 * j) % len(s) 103 | 
e = (s[head], s[tail]) 104 | g.append(e) 105 | return g 106 | 107 | 108 | def graph_to_genome(g): 109 | genome = [] 110 | visited = [] 111 | adj = np.zeros(len(g) * 2, dtype=np.int) 112 | for t in g: 113 | adj[t[0] - 1] = t[1] - 1 114 | adj[t[1] - 1] = t[0] - 1 115 | 116 | for t in g: 117 | orig = t[0] 118 | if orig in visited: 119 | continue 120 | visited.append(orig) 121 | if (orig % 2 == 0): 122 | closing = orig - 1 123 | else: 124 | closing = orig + 1 125 | p = [] 126 | i = 0 127 | while (True): 128 | if (orig % 2 == 0): 129 | p.append(orig // 2) 130 | else: 131 | p.append(-(orig + 1) // 2) 132 | dest = adj[orig - 1] + 1 133 | i = i + 1 134 | if (i > 100): 135 | 136 | return 137 | visited.append(dest) 138 | if (dest == closing): 139 | genome.append(p) 140 | break 141 | if (dest % 2 == 0): 142 | orig = dest - 1 143 | else: 144 | orig = dest + 1 145 | assert orig > 0 146 | visited.append(orig) 147 | return genome 148 | 149 | 150 | def colored_edges_cycles(blue, red): 151 | cycles = [] 152 | size = len(blue) + len(red) 153 | adj = np.zeros(shape=(size, 2), dtype=np.int) 154 | visiteded = np.zeros(shape=size, dtype=np.bool) 155 | for e in blue: 156 | adj[e[0] - 1, 0] = e[1] - 1 157 | adj[e[1] - 1, 0] = e[0] - 1 158 | for e in red: 159 | adj[e[0] - 1, 1] = e[1] - 1 160 | adj[e[1] - 1, 1] = e[0] - 1 161 | 162 | for node in range(size): 163 | if not visiteded[node]: 164 | visiteded[node] = True 165 | head = node 166 | cycle = [head + 1] 167 | color = 0 168 | while True: 169 | node = adj[node, color] 170 | if node == head: 171 | cycles.append(cycle) 172 | break 173 | cycle.append(node + 1) 174 | visiteded[node] = True 175 | color = (color + 1) % 2 176 | return cycles 177 | 178 | 179 | if __name__ == '__main__': 180 | with open('2BreakSorting.txt') as f: 181 | P = [list(map(int, f.readline().strip()[1:-1].split(' ')))] 182 | Q = [list(map(int, f.readline().strip()[1:-1].split(' ')))] 183 | path = two_break_sorting(P, Q) 184 | result = '' 185 | for p in path: 186 | 
print(''.join(format_sequence(p))) 187 | -------------------------------------------------------------------------------- /week11-12/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def two_break_sorting(P, Q): 5 | red = colored_edges(Q) 6 | path = [P] 7 | while two_break_distance(P, Q) > 0: 8 | cycles = colored_edges_cycles(colored_edges(P), red) 9 | for i in cycles: 10 | if len(i) >= 4: 11 | P = two_break_on_genome(P, i[0], i[1], i[3], i[2]) 12 | path.append(P) 13 | break 14 | return path 15 | 16 | 17 | def two_break_on_genome(genome, i, j, k, l): 18 | g = colored_edges(genome) 19 | g = two_break_on_genome_graph(g, i, j, k, l) 20 | genome = graph_to_genome(g) 21 | return genome 22 | 23 | 24 | def two_break_on_genome_graph(g, i, j, k, l): 25 | rem = ((i, j), (j, i), (k, l), (l, k)) 26 | bg = [t for t in g if t not in rem] 27 | bg.append((i, k)) 28 | bg.append((j, l)) 29 | return bg 30 | 31 | 32 | def two_break_distance(P, Q): 33 | blue = colored_edges(P) 34 | red = colored_edges(Q) 35 | size = len(blue) + len(red) 36 | l = colored_edges_cycles(blue, red) 37 | return size // 2 - len(l) 38 | 39 | 40 | def permutation_list_to_str(p): 41 | ps = [] 42 | for i in p: 43 | if i > 0: 44 | ps.append('+' + str(i)) 45 | elif i == 0: 46 | ps.append('0') 47 | elif i < 0: 48 | ps.append(str(i)) 49 | return '(' + ' '.join(ps) + ')' 50 | 51 | 52 | def permutation_str_to_list(str_p): 53 | p = list(map(int, str_p.strip()[1:-1].split(' '))) 54 | return p 55 | 56 | 57 | def format_sequence(s): 58 | fs = [] 59 | for i in s: 60 | str_p = permutation_list_to_str(i) 61 | fs.append(str_p) 62 | return fs 63 | 64 | 65 | def chromosome_to_cycle(p): 66 | nodes = [] 67 | for i in p: 68 | if i > 0: 69 | nodes.append(2 * i - 1) 70 | nodes.append(2 * i) 71 | else: 72 | nodes.append(-2 * i) 73 | nodes.append(-2 * i - 1) 74 | return nodes 75 | 76 | 77 | def cycle_to_chromosome(nodes): 78 | p = [] 79 | for j in range(0, 
len(nodes) // 2): 80 | if nodes[2 * j] < nodes[2 * j + 1]: 81 | s = j + 1 82 | else: 83 | s = -(j + 1) 84 | p.append(s) 85 | return p 86 | 87 | 88 | def genome_str_to_list(genome): 89 | lp = [] 90 | for p in genome.split('(')[1:]: 91 | p = permutation_str_to_list('(' + p) 92 | lp.append(p) 93 | return lp 94 | 95 | 96 | def colored_edges(genome): 97 | g = [] 98 | for p in genome: 99 | s = chromosome_to_cycle(p) 100 | for j in range(len(s) // 2): 101 | head = 1 + 2 * j 102 | tail = (2 + 2 * j) % len(s) 103 | e = (s[head], s[tail]) 104 | g.append(e) 105 | return g 106 | 107 | 108 | def graph_to_genome(g): 109 | genome = [] 110 | visited = [] 111 | adj = np.zeros(len(g) * 2, dtype=np.int) 112 | for t in g: 113 | adj[t[0] - 1] = t[1] - 1 114 | adj[t[1] - 1] = t[0] - 1 115 | 116 | for t in g: 117 | orig = t[0] 118 | if orig in visited: 119 | continue 120 | visited.append(orig) 121 | if (orig % 2 == 0): 122 | closing = orig - 1 123 | else: 124 | closing = orig + 1 125 | p = [] 126 | i = 0 127 | while (True): 128 | if (orig % 2 == 0): 129 | p.append(orig // 2) 130 | else: 131 | p.append(-(orig + 1) // 2) 132 | dest = adj[orig - 1] + 1 133 | i = i + 1 134 | if (i > 100): 135 | 136 | return 137 | visited.append(dest) 138 | if (dest == closing): 139 | genome.append(p) 140 | break 141 | if (dest % 2 == 0): 142 | orig = dest - 1 143 | else: 144 | orig = dest + 1 145 | assert orig > 0 146 | visited.append(orig) 147 | return genome 148 | 149 | 150 | def colored_edges_cycles(blue, red): 151 | cycles = [] 152 | size = len(blue) + len(red) 153 | adj = np.zeros(shape=(size, 2), dtype=np.int) 154 | visiteded = np.zeros(shape=size, dtype=np.bool) 155 | for e in blue: 156 | adj[e[0] - 1, 0] = e[1] - 1 157 | adj[e[1] - 1, 0] = e[0] - 1 158 | for e in red: 159 | adj[e[0] - 1, 1] = e[1] - 1 160 | adj[e[1] - 1, 1] = e[0] - 1 161 | 162 | for node in range(size): 163 | if not visiteded[node]: 164 | visiteded[node] = True 165 | head = node 166 | cycle = [head + 1] 167 | color = 0 168 | while 
True: 169 | node = adj[node, color] 170 | if node == head: 171 | cycles.append(cycle) 172 | break 173 | cycle.append(node + 1) 174 | visiteded[node] = True 175 | color = (color + 1) % 2 176 | return cycles 177 | 178 | 179 | if __name__ == '__main__': 180 | with open('2BreakSorting.txt') as f: 181 | P = [list(map(int, f.readline().strip()[1:-1].split(' ')))] 182 | Q = [list(map(int, f.readline().strip()[1:-1].split(' ')))] 183 | path = two_break_sorting(P, Q) 184 | result = '' 185 | for p in path: 186 | print(''.join(format_sequence(p))) 187 | -------------------------------------------------------------------------------- /week3-4/gibbs.txt: -------------------------------------------------------------------------------- 1 | 15 20 2000 2 | CTCTTGATTAAGGAGATGTAAAACTCTTTCCGGACATTAACTTGTCGATTGGTTCGTTTTATGATTGTTAGCCCATACAACGAGTGCTACTTTCGACGATTACCTGGCAACAATAGACAAGTCAGGGCCGCGGAAGACTGATCCCCTATACAGACCGTTATCATGCTACGAGAACGGTTGTCTAGCAACTCTTAGCTACGTGTGACGTCCACCGGCGTCGAGCCTGGCGACTATTAAATTCGCATGCGCTAAAAGCACCTGTTATAAACGGCTGTCAGCGATGTTCGGCCGATATGCGCATCTTCGTTTCCTCTTGATTAAGGAG 3 | ATGTAAAACTCTTTCCGGACATTAACTTGTCGATTGGTTCGTTTTATGATTGTTAGCCCATACAACGAGTGCTACTTTCGACGATTACCTGGCAACAATAGACAAGTCAGGGCCGCGGAAGACTGATCCCCTATACAGACCGTTATCATGCTACGAGAACGGTTGTCTAGCAACTCTTAGCTACGTGTGACGTCCACCGGCGTCGAGCCTGGCGACTATTAAATTCGCATGCGCTAAAAGCACCTGTTATAAACGGCTGTCAGCGATGTTCGGCCGATATGCGCATCTTCGTAAGCGCACCGGGGTGTTCCTCTTGATTAAGGAG 4 | GAGATGATAGGTTGGCCGGTTCGCCTCGATACGGTCCACGCCTGCTGGAATCTAGCTAGACAATTGCTTAGTGGATTCATTCTCCTCACCCCTGTAATTTACCCTTACCGGGGTGGGGAGGAAATACTCCACGTAGAACACGTTTACGAGCCTAAGGGCCGAGAATCACATAAGGCGTCTAACTATTAAGTGCCTTTGGTATCGATTATTGTGTTTTTCCCCATGCCCGCAGTCCTCCACTTAATAGACTGCTATCAACTATGGTAAATCAATTTCCACGATCGGGCTCTCGAACTTCTGTGTTATCCGATACGTCGCCGAAATC 5 | 
GCCTAATTGAATTATAAAGTATTTCGTCCGACATATCGCCATGTTGACTGTATGCGCATGGAATTCGCTTCGAGAAGTTCCTCGGGGTGAGGCACGTTTTGAAGAACCCGGAAGCTCCTTCGGTTGAGCCTAAGTTTACTCTATAGGCAATCTCACCATCCGCGTCCACCCAATCGCGTGAGGTAAGATCTAAGTCCGGCTGCAAGTATCCATAAGGCCCCTTGCGGATGGTCACGTCTCTTAGCAAGGAGTCAATGAGATCGGCCCTCCCTACCCTTAGTCTATGTTTTGGCATAAGCATTGGGAATTGTGTAGGATATGTGAG 6 | CGTTTCATCTACATGACATTGCTGCTACGACATGCGTGTCGCCCTCCTGGAGCCCAGTGTTGATCACCGTGGGAACGTTCCTAATAGCTGAAGTGAGGACTGGGAATTCGTTCACTTGACGTCTCACCTGTCGATTTATGCATTTGAAGCTCAATTTGGGGGTAAATTGGAATGAGAGCGAAGAGACGTTTACCTATCCTTCTAATAGGAAACTTCTAGTTGGATGATGAGATAAGTTTTATGGGGTGTATATTGGGCGTCAATGAACCCTCGCCAGTGTAAACACCAATTTCCATTGAGGTTGGGTGGTAGAGTCCGCGGGACA 7 | TAGACTAACCCACACGTAACCAATTGGTTTTTCGGACAGGGTGAAGGGATGTGTGCATCGAAAGTTTTTAGCTACGACTGTAATATCCACTTCACCTCTGTCCACCAGTACAATCCAGGTAATAAATCTCCTCTGGCTGGTGCTTTAAAGGGAGTCTGTTTCACGATCCTTGAACAGGTGCGTCTCACGAGGACGTGTATGAATTTTCATAATAGACGTGTTCCCGAGCCACCAACAGGAGCGTGCCTGATTCGGAAGATGCAAAGCCAATTGCATACCACCTGCACAGGAGGAGGCATGGATCGCAAGTTTACCGGGTGCAAGG 8 | CCTACTTGACAAGCGTAGGCGCGGTACGCAAGTGTTGCGTTCTCCCTCGCAACACCCGTCAGTGCTACGGGGACGGGTTTTACGACTTGACGCTCTTCCGGCCACCTGCATTAACTCGACGGAATGAGCACGGCTCGGTAGGCGATCGAGTATGCGTCATGGGAAAATAGGAATCGGACGCCCCTCGGGCATATTAAGCCTGCGTTCGTGTTGTCCTTACGATATTAGCCTACCAAGTTTCGAGGGGTGCCAAGCTCAAGTGATCCGGAACTTTGCTTTACCACCACCGCCATCCAGGGCATTATACATCGCTCCCTTGTGACCT 9 | TAATACACATCCTCGGACTCCACATGACGATACCACTAAAAAATCAACGACCTTTCGGCCGCATGATAGGTCATGAGGGGGCAGTTTATTCTCGGTTCCTGTTTACCGGGGTATGGTAAATCTGCAGGGTTGCACACCCGATCAGCTTGTAGGCTTTCGTGCTTTCAGATTTCTAACAATACGTTAAAGATTTTTGAGTTAGAGAAAGAGCGTCGAACATACTGTCGTACCAATTTACTCTTTACGATCATTCGCCCGCAGCATTCCGGTGCAATCGATTATTCGCATAGTCATTCCCCTGTTCCGTGGCTATTCTTCGTACCTT 10 | AATGGGATTGCTGAACAAGAAGGCGGCTTAGACTGTCTATGGCTTCCGATCGGACTAACGGCGAATAATAGTAAGATTACGGATCCCTGACAGCTTCAGTCCGCAAACGACACCACAGGCTCCTGTAGTAAAACAGACAGCCACTATAGCGCGATTGTTGGCCCCCCCTTAAGTTGCTCGGGGTGGTCCAACAGTCCCCAGAAGACATACGACGGGATGTATATAATGAAATTCGCCTTCTTTAAGAAGATGCTCTGGCAGTTTCATATAGGGGCCCGCTGTTGAAAATCGGATGAGTGAGGATACATGCGTTTGCGTTCGTGTC 11 | 
GATACTCCTATCGCGCAGTGACCTCCCTGCGTTCATATTTAGCCCTACTTTGACGAGACAGATAGCTGGGAAAGCCTATTCGACATATATACTGCGATGACTCCGGAACGTAAAAGAGTAAATCGACATATTTAGTGGCTTGGATTTGAGTAGTATCGCAACCTACGCCGATGCGGAAAATTAAACATACCGGGGTGTCCCATATGAGGGGGGCGAAATCTCCGAGGATTGAGTACTCGTGCCCCCGACTTTTTTTCGACTCGCGGCAATGAAAACCGAAGGAGGCACGAAGTGGTACATGTGTACCCCTCTTTGGTTACTCATG 12 | CGCAGGCTCATTCGTTCACGAACACACGGAACTACCCAGCGCGTTGATGCTCCAAAACGAGGCCACGTTCACAGAACCGAAACACCGATAAAAGCGCGCCAACAACCCGACGACGCACAGGGTGAAATGGCACTTACGGCTCTTTCATGATCTTCGACCGAAGGAATGGAGGGGGTCACCTGGCCCGGCCCGGTGAGTGCTTGTATAGGCGTTTGTACTGAGGTACCAGGACCGGGCGCTGCACAAGCTGCCATTCTAGCGTATTCTCATATCCAAATGGCTCGCAAGTTTAGGAGGGTGGGGCTCCCGCCAGGCCGTCATATCC 13 | ACGTTTCGCAGCTGAGGTAAGGAAACCGGGGTGGAATCACCCTCGAAGCTGGTCGCGCCGGCATCTATTGTTGAGCAGGTCATCACAATTCCTCTATTTCTATGATACAACTTCGACGATCCACGGGATATGTAACGCCGGAACACAGGAGTAAATGTGATTGACAGGGGCTCATCCGTCTGCCCAAACGGCATCTACGCAATGACTGCATAGGTTTTGTGTAAAAGAGTTTGTCATCTACCCAACCAGGACAAGTCAGCCCGCGCAAACGGCCCACGCGCACATATCAAGCCCGTCAGGCGCCCGCAGAAACAGATCCTAAGTT 14 | GTGGCTGTGCGTAACCGTCTAAATGTAAAAGCGCACATGAGGTAAGTTTACACAGGTGACCCAAGTGATCCTGATCGAGATGGGTAACCGCATTTCTGTGAGTCGGGACACTGGGTGTTACCAGTTGCCAGAAATTCGGCGGGGAGTGAGTTCGGTCGGTATTTATGACTAGGTCATTGGGCTGCAGCGCTCCGCAACAGTCCATGGTTTATAGTTGGAACAGACCGGGGTGATTCATTAAAAGAACATTCATCTGCTTAGAAAAATAGATTTACGTTCCGTAGAACCGTAAGAAATTACTGGCTAACCCAACATAAAAGCTGAG 15 | TCGAATCCGCCACATGCAAGGCTCAATGTTGACAACTCTTGTGGAGAAGACATTGCAAGACAGCTTGAAGGAGGTCCGCTAGAGCTAGTCTACGCTCCGTGTCAAAGCCTGGAGAACATACGATAATGAGTTAGACCGGGGCCAATTAGTTTACCGGGGATCGCTGAACAACCGGTCCGTGACCATACACTTAGTTGGGTAGCAATACATCTGGCCCGGTCAATTTCATCTAAGGCACCCGATATGAGGACGTGTGCAATACACATATTTTCGGTGCTGTCATGTCCTGTGAGGTTTGCATGGCTGACCGTACTAGTATTAACAG 16 | GGCCTTAATGAATCGCTCTGTCATGCATGCATTGGGATGGGGACCCCTCCGTTAGCTGTGATGGGTCGAGACGTACGATGTACCGCCCCTTTTACCGGGGTGAGCGATTCTCGTGCGAAATGTTCTCCCACTTTGTCCGGCCGTGCGCGCAGCATACTGGGCAGCCTGCGTTCCCCGCCCCCCCACATGACCACGACTGGGTTCGCCATCGTCAGCTTAAAATCCGTATGGTTAGGGAAGATAGCGTCCAAATGGGAAGCATGCACGTAATTCAGACTGAGTCCCTCTGTATTCTGTCTTGGACGTAATGAAACTCTATAAAACT 17 | 
CCCTAGCAAAAGCCCTCTTCAATCACTTGCAATTGTTCTTTACCCCCTTAACTCAGCTTGACCATCATCAATCCAAACCGAAGCTTCGGCCACATCCAGTATGCCGAACAAAGGCAGTAGATTATGCGATCATTCTGTTCTATAACTTTCTTTCTACCCTCACCGATCCACATATTAGCTGTGATTTCGAGTCATTCTGATTGACTATTCATGACTTCCGGCTAAAGACAGGTACATGAAGTGAGCCGGGGTGATACAGGAGTGGGATGCTTGGGCAGCGTTCAATTGGAGAAATCGGAGAGTTGCTACCATTCCTGCTGTCTGG 18 | GCGTGACGGCTCTATAAGAGAACTACGACCAATAGTACCGGTGGTCCTCAGCCTTAAATATAGTGTAAAGTCGTCCGGGGTGATTCAAATGGGTGTCTTTAAACTTATTTAGAAGTACATTGTGCCTAGGTTTCCGGGACTTGCCATAATTGAGAGTCCCTCATTCTCGGTGAGGAGCGCCGAAGTCCCGTTAATCTGGCGTGTCCCGTGATGCATCATCTAAGTTATCAGTCAGTCGCACGCACTCCTACATGACTGAACCAGTGCGCGCTGAGATGGTACGCGTGCTCACTGTCCAAGGAGACGGACACGTATCAACTGGCGC 19 | GCCTATGGAATTGCAATGGAGTTATGTCCAGTACAGAGGTGAAAGTTTACCGGACAGAGATTACCAACCCCGGGATTAGGGGAGATCGAGCTGCGGGCTCGTGGGCCAAGTATTCAACGAACAAAGCTTAAGTAAAGCAGCGAAACGCCTACCGGTACAACAGGCGGTTCAGGTGACTACCAATAAAGTAAATGTTCGGACGCAGACGTCTAAGCAAGTGACGGCCTAGGAGTTTACGCCCTACAACCCACCAGCCACCAACGGCAAATAAGTCCCTACTGACCGCGGCATTTTGCGCACCGAACTAGCCGTCAACCACATCACG 20 | CGGTCCATGTCTCTAGCGCAAATGGATAGGTTCTGTATATACGGCACCTGGCCCAGCACGTCTTTACACAATAAACAATAACCCGAGTGGTGTTAGGTGAGACTTACTAAGGGACCCGCGCAACAACGGGTCCAAGGTGACGGATTTTAATCGTTGCGTGTCGATATCTCGCAGCATCTAAGACTGAGAATGGCGGGATTCACTCCTCGGACTAGGACATCTTCCCAAAGTTTACCAATGTGAGAGACAGAGGTGCACCACTAGGCACGGATGTATGCGCGAGCAATTGAACAATACGGTCCTGTTATACAGTTCACGTTAACAC 21 | GTCGGTTCAGAGCAACTTTACATAGAGGAACGGAAAGAGCAACATTCTTCCCAAGTTTACCGTCATGGTTTGCGAGTACAGCGGCCGGCACTACTGGCGGAGTGAGCCACATCGTTGGCTGGGACCGAGAAACTGCGAGTCTTTAAACGGACCCGCGCCCCAGACACTAGTGTTTCCTATGCGCGCGCATAAAAAGCCAGTCCCGGTAACTGGAGTTCAGGACCAAGGAGTTTGGACAAGCTTGCTAATCGAAATACCATTTGTGTTGCGATCTTGGAGCGTGCGTAGCGCTTACGGTCGAAACGTACCCCGCAGTATTATACCC -------------------------------------------------------------------------------- /week9-10/global_alignment.txt: -------------------------------------------------------------------------------- 1 | 
LHAALSPKHCYDFISQKQKKWMHMPPQLWGQDPVEEYNHFRAKKLRVEAQCCNGGMCCLYRTCMDYWVGQRTEWKEDQQVPWFGYTKLDVQCKRFKPKIDKQPFGLYMYKWEYIRQLWIPMEVFYMDFTCVDWLSKCGAWNMKHYCNHCNNIMMIWFFKVVAGMHERRDTWRNETENFWWPWAPRMRCGAHLTQEWSRSMAAFEGLWHDWFIVDTDTDCEISSHFPGFINIVKNAPQTASTKAMRNSRRNANWAYADGFWCFIAKMFIGHPIHPHNQPDHEIDQVTQDYCFGVPFSHLISLFDWADQDWIIWFFRFSCWAICHIVEWMGPMWHSVHCLRKWVNLALEAKWSAGGCVPNFNIFNLWHMIPWCESKPCAHTEFDEYRLKDVVQCGRLFFEIVKMLKSNKFVGTVTDQWGAEMESIPARGLLSLEVPQEYRLNRATNYKVVFLSPWIDGPQCAGIDAMQFYYWLILFTLVVCNMLYHIWWSGINSKHLVIDCNDRNACVLWACCNAHQEVWDAPEMRNDAKDLFHEANCGSFTALAHENAHSGYINLCDDPPCMPCPEPCCCWAANTEMVHASPMKYQHWRKENYDFLVNFYTNCEYIDDFDCGFQGGCNFKRMNWIYEDRTKKFGEFYVAAAEEHQPDLESNPIENGDWVQDDEGMTSWELMMFHERHRRTDRWSSAHLLNEHKPDCKHKSEYQFLFTPTKMQYPHWIMRPKAREDPKGVRFHNHQYEDNMDMMFPEMNAPVQGSKEKEVGSTWPPPCGREIMCPHVDIVQHLRGAECVVAYLIIALYCVKKRAFSRYTCRGFMWHNMAVMWPMMRKFVWNQEAGYNHTMSIRYRENSVMGKDCNNTPQQKDCAVMYCRATWLFHNFQCLLPRDFIEGQSQESYMYVTQFPKVMIVDFQEINWKCTPGDCDVNEADWCMFLDMWCPIYTLINFQKMQQDCMMEYTMEPGLPVDIAMRPLHIRIVMHGMWKNTKFGRVRAACKNVTWQKFFWTGVMHWWHWEMHVSWISYTEMSFNSTIPWCVEYNLCDADKQVFDATKDGKYWGNLMWLKMAGHYEWHDMICMKCSNACVHFVLNDPDSRSQFNMYIIEYLIMRSNRSNQDGGAHINWTRHSFRAGWWNMQLNCFEKDEKEQEISSPGRRNNRQFMYDTWWWTVDIFAHRQCAFSTIPQKKAVAEWRLEFIWTHMLTANQQAWKNSWTKKEFLSTEDWLCNIHLPAYLTCTYDAIDTPPFNLKKPPPLFRHEVHPAKYYGLVTGYCITSQQWIPEWFRWTHGRTNQDKFSLDEVSCICCWATDRKRQNYQMYIRCLYIRQAWDRNWKWWSQAGSIRIDPGRELYRPMGKHKEDNAHGPRPNADHTAKAESPGKLRWYWQFTQWPYHPVTNWDPRHKCVTWQQVHMQIVVCCHHLQMRCAYCHIQILDAAKQKTAGWWIDVYRKDGAWSNLGEMYQKNYGWCTFSDYGTPGRKGWSICNHFVRKYSHCFQVVDPYECVLFLQNWCDPAGCTKWQIKKRDFEDLLDPCIEYFTIQIKNNDNAGWREQCYDTPDYGMWSVYGWCWWSCCREYACSGVTMMKGLFSYCRAWRREVDYMHWYNHGIWWWSKSHCVSNTMSYYEKVYTMCDVQWCYMYHQENASQKFGLQCMQQERPDDAKCTPCRWPLCSWQMDFINEQTIPMMKSWVREVATDPNTYNSTECRFHVSTETGIMNCESKGVNSKWFCCECWRAPNTMVIECMKQYSPPGTFARLLQPASDPLNMMRAHHKHQYMTHQKNEDNNNTYENLTALEDTEQRTAHSLDIEYREGQMNGKYFTFHTTDAVYYEQYETMIWAVMGPFMTMNKSIWNNQACFAQDHMWTISPFRQNSMIELCGPACTCSTEAFQRDWLPIRKEWPPIMFAWLLDWWAAWCPLPWMKSWSWLLDGTDHDGDNWLQAMTAEANGYLTAVGIVARCVQYVPHAWHCWICESCWDDDFRPIDIIVMCPTTEGRFASALAYYGNSIWPE
PRKVNHQTPVLEEPKQLRILEINAFNMAMRWDCTCLKETEIQTLRNSEDTWYDCFPIDKYVANDRDIGSPPAHEWNTCLDVHRMLIMQIHEYPSGMMSYVLDKQAAIQGYAHMNSALHHQSLDDFARQELLNVTWVSDYKSEAPFHVPPKWKLTGRYKAITNLMIRLPFKEFPVRMLTCYSSTGSCDDVIINMFYDLHEILWPQYCNIWNEPISPPHGEKHKCQDHYNMIYPSNAVRMLSDETTLNHDMDPHNRMPSDITQMWSMCYQIFSVCDEQHIGGSLVPYNRPKEMNPRVKFNQHVVPGRNTCMWVDTVRIAWRYWWIMQEMFCWTSKSSYWQTKLTYTGSINQFMISFGTERFRFFENMPKGKMARPIQPHHPLQENHEMAMMTIQRRSGYCLPFCCEAPIKTLKIYRPVWTDDHMCNKSSGPAFTINMAPMRTVMEFPAINRLGFTCYWMDSSTFYVDEEENIPHPYGGISRSHPHPIKQCCFLMESCLHMFKELRLIVASLMEFYVFCDCEHSVQSPGPWYWTGPMTPKGSINLITMKGKFHHDRQCVIYNAKEESIRNYLMDREWPHVCNQCECSSRCESDCTGEPATEHLTWPIARAPYQLQRWKTHLSCPNMSGIVRCLWDSVGHNWGKRWFVCWPTPQCWLCFHTVYETYPCKCGCISCFFERMDEAQYVVPVCVYHIGSNFMVICACNDYSEFLVSTMYTYDQNSRCVTLSQAWFAMVVWFGRYAGVDPIQETIWKSQHHICFTEDDQGNKLISFTCNREEDRETCFIRKWFGSHTTGSSWALCQMGPISYLQQSYCLDFEEYGKPWCWANAQFQMPRLVHSTLPIRHTRRIIAHQLMPADPGCEETGKQPNWSRKNSDRINAWGWCMRIMPNRMCTWCDFNIKKPSTVNPADEATDFSPLSCPYLAYSFHDWLILPVDVPKVWDQCNACMDTLVKEKAVPKDKWSYKFQYESFYGSHEMSQSTMRPQMQPYLSAMLNLEVRPEGSFMDKLFWHGEQGPYHLVLHGDALQLTMMLFHESCVFNVMERRMDSGAVEAQFGPAIPVSAMRLNSSEIHWSVHNFHQSVTCVKASNKQNQMSPEGQTWVSLWYHCRTKTVYSEKLEPQQADYYYWFYTALCKIMQRSEGFTVSCFASPCWLAMSHVRTEKMTHFFGINNMILKGGYCKSSILIKSETGYWNHPDEYDDEMWYWVEYWDDMRALSVDTYMTWGVHVYNVKCVEKRLDNRYKTANSFKNDCQPQKVIADSSHRAQRRSKHRMSGQQHDTMKVNSPYDLNSHGGKGWEILPRSIPEVTKLGGGSLTRYQGAAGVVDHRSFLELRKKPHQNILVSWIVGLRPQHCGERGEKAYHGKYQECPKYRQRYGLPVQRGKEVQLNRFKPVAKISDKCWNYYVEPPVEIPKAHDTVKTIYAQYGFHWKDLIKIHLKPDKWILVWQESSFQSDPACAMQTGAWMQASSSVNMRIRDKQANAFIQWIPNNCVGQAYHIPNWDEWAVQQASVYPSFEVQVFRDYMMHEPTIDPYITQEKDDLSPRKEDWPGRWEMHQMRKYVCTPRRTMWYLQYRKELHAMILKFDLYKGVSKNNEEDNQHLNPIMHPRWTAYCDTFIVDKFRVRMLTIGFSTYYPVCHHEKYALITLNTDMDNYIECSTYYTRMLHHESSFGKEADVYVRKCHRWYV 2 | 
KHWALDPKHCYDFISRKQKKWMHMPPQLLHIDLPGQDPVEEYNHFKEPFCGGWLRQERVCICQCLYRTERMVMDYWSGEVETHVFDGRTENMEDQQFPWFGYTKLDVQCKRFKHKIDNQPFGLYMYKWERMAKLWIRMEVTCDDWLCNLSVFRHGKCGAWNMKHCNDITDRKIMMIWFFKVVAQMTKEPRFERRDTWRNPPVKVDWAPRMRCGHWSRCMAAFEGLWHDWFIVDTDTDCEISSRFPGFINIVKNAIKVHDTQVRGTASTKAMRNSRFRFAASQMRNAHAWLTSPRKYADGFKCFWKWIIAKMFIGHPHNQWAGAPEWVDHIIQWNHAPIRYCFGVPEMHIKKVSHLIMLFDWAQPQTDGYWIWIIWFFGFECWCICHIVEWMPICMLPMWKANIRWHSVHVNLAFEDNWSAGGCMPNFNIQNLSHMPWCESKVDAHTEFDEYRLKDVVQCGRLFFEIVYMKWVGTVDTFKQQNMDWWAEDESIPAELQGLLSLEVPQEYRLNRATMPDQYVYKVVRLSPWIDGPQCAGIAAMQFYYWLILFTLVASNMLYHIWWSGSNSKHLVIDCNDRNACVTCETMKDQVWACCNQEVWDAPELRRKDLFHEANCYTFTHLAHENAHSGYAQNLIDDTPCMPYEGTWVYCEPEFACCCWAANTEMVSRHMIPTASPMKYQHWRDFLLMNFYTRRYDKEGRMCEKICGFQGGCNFKRMNWIYEDCFPMRKNRIICCEFHHYCCCPHNQPDLEVRTLSADVQRPLDDEGMTSWELRRKDRWSSAHLLNEHKPVCKFASEYQFLFTPTKYGHWIMRPKAREDPRGVKFHMHQYEDNMDMMWPEMNAPVQGSKEKEVGSQWPPPCCVQHERGAECVVPRCSDVCWVKLLDLIIALYCVRGVMWYQNQHNMAVMWPMMRKFVLNQEAGGNHTNSIRNNTPQQKFHNFQNLSCYLLLPRDFPERMQCQDAYCTQFPKVMIPDFQTVHQNEYCGDCDYNHKEHLYMGNADWCMFLDMMIDSSGDYLQYMQQDCMEEPGLPVDIAMRCLHIRIVSAQEVINHKGNPHEHFRNTKIWIYWAPLGRVHWAHQAAKNVTWQKFFWTGVATHCMWEWEMHVSWIRHTEMSTNSNQWRELCDADKQATKWYNGNLMWLVNTHIKIMAMHYEWHDTCWLCKICMKCSNACVHFVLNDPDYIIEVLIMSNRYHINWKLWHSFRAGWWNMQLNVFEKDSDSPSHRVCNRFFMTWWWTVDIFVPRQCAFPQKKAGAEWDLEFIHTPMLSANQQLGWKMSWIKVTNPMSLKKEFQSWRIDWLANIHLAAYLTCTYDGIDTPPFNLKKPPPLFRHAVHPAKYYGLVTGMFISSQQEWIEEIWMKKYCNMCLFMHIAYELVSYQDKFALDEVDRKRQNYQMYIRCGGREYCLRIRQAWDRNWKWWSQYGSIRIDPTKCAHMFIFYRPEEWGVIEPGNRHFFWPEHGPRSNASATTMTAKAEMRKPPGKLRWYWQQWPDHPVTNWDPRHKCVTWQQVHMAYYEIVVCCHEAAHLQMLDAAKCETALNKTAGWWLSYVTGGCTDVYRKDGADFNWVEASNLGEMYQKNYGWCTFSDYGTPGRKGWSWCNHPVRKYHCFQVVDPYEFKNCVPWNWCYDKARVDRMGCTKDSPLHTFEDLLIEYKNNDNAGWRKSGTAVRPMHRIKGMWSTNIAAFIYGWCWWSCCREYGCSGVTMMNGCDKPAAWAREVDYMHWYNHGIWWWSLVVKPYSHCVSNTMCCRYEKVYTMCDVYHQENASQKFGLQCMQQERPDDAKCTPCDIQWHWAMDTINNQTIPMRDYYEVGTCARCHEWMLQSRFHQNVETGIMNCESKGVNQMMKWFCCLCWRAQSGYFNTMVIECMKQYSPPGTFIRLLDPENMMTAHHKHQYMTHQKNEDEHNTYENLTALEDTEQRTAHPLDIEYREIALQMTQMMTDRDRQAQLRIHTTDAVYYEQYETSMDYGIIFMTMWRFVMNTSYWNPFRQ
NSMIELCVPACTWSTEFQRDWLPIRKEWPPIWTASWCPLPDLLDGTDHDGPMCHQWYPNWLQAMTAEANGYLTNVGIVARCVHAWHCWICESCWDINYDPWDFRSEWMIPTTEGRFASALAYYGNSIWPEPRSVNEETTQLAVPEQILEINAFNDAHKCTMKYRWDCTCLKGNAKTTEQNAKRVWYDCFPDHDWYVANNRDIGSPGGKFCGHHRWDVHRMLIMQIHCAPSGQAKIQGYAHMSWILTCVTKDIGARQELLNVTWVSAECIGVCKSEAPFHVPPKWKLTGRVDFEDGKAKEFPVMMLTCYSSTGSCDDVIINMFYDLIEILWPQYCNTWLEPPHGYKHKCQDHYYPSNAVRMLSDETTLNHCYKMDPHNRMPSDITIMWSMCLQGFSVCLNQHIGGSLVPYDRPKEMNWSSEGEMYQVPGRNTCMWVDTMRIAYGGSRYDWIMQEMFCWSVCSKSTYTGSIMDSLIQGQFMANFGVEMNYDYYPFRFFYYCLYQMPKGIEFLKSMARHNMWFQPHHLLQENHEMMMTIQKRSGYCLPFITLWIYRPVCNKSSGPASPTFWSHARTINMAPMRTVMELWLTCYWRDSSYWRHFFYVENIPHPYGGISRLHPHPIKQKELRLWAMWRKCVVASDQRTYWKEQYVFSDCEHSVQSPGPWYQRTGPMTPLITMKCKFHHDELTIVIYNAKEQMSIRNYLMDREWPAVCNQCECSSRGESDCTGEPATEHLTWPIDQHRIKGVMRAPYQLQRPRTHLSCPNMSGIMRVYLWDSVGHNWGKRWFWAIQCCCWLCFDTTFFPFASTWQVSPYETYPCKCGCISCFFERMDEAQYVVPVCVQHIGSNIMAICDQWLACNQYSEFLVSTMYTVDQNSRCMTLSMSRGWFGRYAETIWKSQFHICFTISTCNREEPGRETCFIRKGFGSHEALCQMGPISYLQQSYCLYFEGTRGKPWREFPQKGPIRHYRVSLGAHSTLPIHTRHIIAYRVIHVVLETHRTHIIMKQPNWSHKNSDRNAFCAMCTWCDFWIKKPSTVNPADEETDFSPLSTDRTNMYLAYSFHDWLILPADVYCLGKVWDQCNACNDRLVKEKAVPKDKWSYKFQDISFYGSHEMSQSTMRPGMQYCLQFADAMLNLEVRPEGSFMDKLFWHGEQGPYHLVAHGDARMMLVHESCVFNSPLTHTPPHDEFRMDFGAVEAQFGEESQWFLNSSEIHWQSVTCVKYSNKQNVTPSIMSPFIQTWVSLWYHCRTKTVAVHFSEFLADYYYWFYKALCKWCKALEMQRSEGFNKLVSCFASPCWPAGSHVRTEKMTDMHPFFGINTMILKLIKSETGWNHWHIDEYDDEMWYWVEYWALSVKSPKTYMTWGVHVYLVKCVEKHMGECDAKLDNRYKTANKYDTRHNCTFKNDCQPQKVIADSNHRLVRRSKHRMSGFQHDTMKVNSPYDLHNGKSIYVDKEGEIDPRSIPKLGGGSLTRYQGAAGVVDHIVMVMDMWSFLKKEASENGNILVGSQGERGEKAYHGKYQECPKYRQRYGKCYKTLGEEEQKNRFKPCWNYYVEPVVEIPKVHDTFKTICAQYGFPIFIWQDYIKIHLKPDKWILVWQESSITGAWPRCYFQTWQASSSVNMRIRDKQANTFIPQIWGQAYHDPNWDHWAVQQASVYPSFRSHKIAYCVIQGYTHTMHEPTIDPYITFEKDVLSPRFEDWPGRWRSTPRRTMWYLQYRKNMLHHLCQRLMILKFDIYKGQSKNWEEDNQHLNPIMHPRYCDTFILDKFRVHMLTIGFSTYYPVCHKYALIRKKKLNTDMDNYAVIKKNVCCSRMAYQESVGKCADVYVRKYHIWYV 3 | -------------------------------------------------------------------------------- /week7-8/peptide_encoding.txt: 
-------------------------------------------------------------------------------- 1 | GAGTGCCCAGGACGTGTTAACGTCTTACTGAACCCATTAAGCACAATATACAACAAGAGCTCGGATCATATGGTGGACGGTAACAAAGCTGACTTGTGACGCAAGAAAAGCCCCCCCGTAATGACAGTGTCGTTTAGAAGCCACCGCAAAACCTGCATCTGAGTTTGTAGCCCCGTGGCGATTAACGTCCATTATGATCTTACAGCTAGGGCGCGCCGCTCGTGCCGTATGAGCTCACATGTCGGGACCAATCTCTATACTCGAGACGAGTGGGGGACAGTGGGTTATCAAGTCCGTGCCTAGACACGTTTGTTACTCCCCAAGGCAAACGCTCGTCGGATCAGAGCACTAGAGGTCCGTTTCGTCCGAGGGCTTCACTGCCCTTCCCTTTTATTCACACCTAATCTTTTCTGAACCATCGTTAAGCACCACTTGAATCCAGTAGCGACGTTCGGAACCACGCCGTGACCGTCCACCAGATAATCCTGAAGGCCTTGAGGGAGACTGCAAAAGGGGTGATGTACCACTGTTAGCTAACTCTAATAAATGTCCGGGACATACCAGGCTGACGCCTTTAGCCGATCTGCGATGGAACGTCATATTTCGGGAGGCCATTAGATGTGCTTTGTCTCCAAGGTTCGTCTATCGTTCGCGACATTCCCGGGAGAGATCCAGTGGGCCGGTACTAAACTTAACGGGACCTGACTTTCTTCTCCGGGTGTTCTCCGTCCGATTGGGTAGTGGGATGCTAACAGCTCGGAACCTGTATCATGCTGTCGGCCCCAAGTTTACTGAGGAGGCCACGTCTAGGGGAGATCCCCAATTCAATCTGGTGTCTATACCCGAACGCGCTTTCACAACCACAGTGTGGTCCTCAAGAGGCACTACGGGGTGGTCTTACATGCGGCCAGGCTTAACGCGCACATGTAACTTGTCCTCTTACTACAATGTAGCCACTTACAGACCGACGTAGGAACGTGCCAGCAATGTCGATTCCACGTTGTCACTCGATGTTATTGGACCCGCATCCATGTTCCGACCGGCCGCAAATAACAATAGCGACGGTCTCCGCTATCCAACGCGGTGGACGAGGGAATTGTCACTGAATGTGTTTCAGTGCCTGAGGCAGATATGACAGTTGTAACTGTCCACCATATTATGCGGGCCGTGTTGGTAGCGCGCCGCGGACGAATAATCTGGTGGACAGTTACTAATGGCGAAATAAGTACAATAGGGGTGAGACCAAAGCTTAAAAAAAGATGTGAAGCAGGCAGAAGGGAACGCCGACCGCGTTAGACGCGTAAAGCTTAGAGTGCGTCACAGTCCACCAGATAATCCGTTTTAGATCTTGCACAGGGAGGTCAATATTCTTACACCTTGCCACCCGAACAACAGTAAGCGCTCGGCAGACAATCCAGCCGGATCATCTGGTGGACCGTAACAAGAACACTGTTCAAAGCGTGCGGTACCCAAACCGAGCTCTGAAGTTGATAGTGCGTTCCGAGGGTATTAAATAAGTCGATCTATCGCTAGAATCGATATGACTCTACAGGGGTAGGTCCGGATCCATCCTAGAACCATCGGCCCGGTAATCAGTCGAGAACGCACGCAGTGCTTGGTAGCGTGCTGAGCGTTAGTTATCAAGAACCTTGCTATAACTGAGGAATTGGTGGAGACCGCACGAGTAAACGCTGATTAAATCTATTCTACCGCTGAGCGACTATCGCGAGCTCGGTTGGCGCCGCCTAGAGCCTGCCCAATGCTAACCTTTTGAGCATAGCCAGTGGGGCATGTGATGAGCACATTGGTCTTCATGTCCTCTCGTAGCAATGACATTGTGTGACGGTCCACCAGATGATTCTGATGAGATGTCCGTTGCGTGTTCCTTCCAAATCTGCGCTATAAACGCTGGGGC
CATGCAGGAATTCTTTGTTATGCTCTATATTGGGGACGCATCTATAGTCACCGGGATTAGAGTAATGAAGTTGTTACATGATTCCCATGAGCCCAACTCTCCCTGCATGTGAAAGCTCGAACTTCGGGCCAACTAACCAATACGAATGATGGTTGTCGCCTTTCAACGCGCAACAAACTGTTCTGTTTATTAAAACCGGGATATATCTGGAAATTGACGTTCCCCCTTCCAACGGCTCCGATATCTAAGTGAGACCCCCATACCTTATTCGCGGTCAGTGTTAGGTCATCTATCTTCATTTGTATCTGAGGCAGCATCCTGGTAACGCCTTTTCAAGTACAGTGCCGAGCAAAGGAACCCTCCATGGACAGTCCGACTAATATGTGCGATTAGAACTGCTTTTACCCAGGCTGCTAGGTCCGCACATTGGTATGACCGGTCAGTTCTTGCTGGTGTCACTTCAAACCTGGTCTTAGTCCGACACTTACTTCCGTGTCTTGTGGGAGTTTAAAACTTGCATAGACCGGTTCCTTCTACTTTCGGCTTAATCTTGACCTGTGAGTGATTCATCTACGAAGAGCGCTGCCTAGGAAGGGACAGGCATACAGGAGGCTTGATACGTTTCTCGAGGAAGGACATTGTCCGCATAATATGGTGGACTGTAACTCGGAGGATTATATGGTGGACGGTGACTACCTGCCCGCCTGCGCTTGCTTTTGGGGCATATTGTGGTGAAAACCCAGTTAGCGATTTCAGACAAGAGGTTCTTCGATATAGGATAATGACGTCACTCTTTTCAGCCAGAATGTTGTTTTGCGTGATTTCGTACCTAATCAGACGGAACGTGTGCACCCTCTTTTGGTTCGGCTACGTGTTGAGCTTCACCTGGAAGGTTGAACAGTACGATGCGACGCACTAAGGATTGTTCCCCGAGGGCGTGGCCTCTCCTAATACCGATCATGCGTACATCTTATTCCCCCAGATTTACTGTTTTTTTACGGAGGGATAGGTCAAGTCTAGACGACCTCCCCTCAGGCGAGGTGGAACCCAACTTGATGAATTATTTACTTTAACGCCGCTCGTCAGCACCTTAACACTTGTTATTATAGAAGCCCGTCATCAAACTGACGCCCCGGGCTGCCTTAAGCCTTAGGGATCGTTCGTCCTTCCTTCTGGGCTTTCAGCGAGACAAGAGCGGCCATCATAGTCACAATGCTCTACATTAAAACCCCAACCGGACCTGAGCATCATGTCTTCTACTGACGAACTCCCTTCCGAATCAGTTTGGTGACTGTCCACCAAATAATTCGGGAGGGGGCAACGTTTGGCCCCGGGGAAGTAGCCTTCTCTATATTCTCTAAGCTAGTATAGCACCACCCAGAATCCAAGCCCCGACAGCGCGCACCTTCACGGATTATTTCAGACCAACTCAGATTTTCTTTTTGCGCATTTCACCTAAGATTTAAATTTAACGTTGGGCTGGTCCGGGTCTTGCGCGCAGGACCGACTTTCACCATGATCCCACGAATAACTGATAGTTATTCGGGCCGTTCGTTGACTCGAGGCAGTACGATGCTAGGTCGTAGAAGCCCTCTAAAAAAGGATCTAGAAGTACCGGACAACTGGGCAACCTGCAAGCTCTCGTTATGTTCTGTATTATACGTCCGACTGTAGTTCGATCACAACGTCATAGGGGTGTATCACAGCAACTGCGATGAAGCATTACATTGACGGCCAGTGACTTTATCAATCATCACAATATACGCCGGTCAGGATCGGGAAACTTGCTATGGTAGGGTTCGCAGGAGTGCCGTCGCATGAAACGCTCATATTAGACCACAACGGTAAGACAGGTAACCGTCCACCATATTATGCGATGAAAATTGATACAGTATCGAATCGCAGAGGTTAATGATTTCCACGCTTTCCGAGTCCTCGCAACGCTCAGTTCGAGGTAGGCGTTAAAGTCATGCAAGCTAATATTGCGGGTACGTTATACC
TCTTTAGAATCAGCTAGGGGGTTATAGTGAGCGTCGAGGCTTTATGTAACCGAGAAGGGAGGCCACACCGTTATGCCGATGACTAAATAAGAGATGATTCGCCATAGTAGGTAATTCCACGTTTGCCTCCCTTTATTACGACAGATAGTTCAACCTGGTTCTCGGCTCACGAGTTGACTGTATCATGATAGCGGTAACTAGGTTTCTAACCTGCGCCCCAGACAGGTCGTCCGACGTCATTCCTGGCGTGGGGGCCTAGGCGATTTGCCGTGGCTTGAGTCGACCCAGATCTCCTCGCGTAAGTTCACTTCGCATGATTCACCGGCCAGCCGATTGTCGGACTACATAAGCGGGACTGACAGTCTGGACCAAACCCGGTAAGGGTGACAGAAACTGCCACGCTCCTCTGCACAGGTGCACCACGATTCGACCTTACTAAGTAGCTGGTTGAGCATCCCTCTGACCGAAAGGAGACTGTCTACACTCAGTTCCCGTTGCTAAACACTACAGAGGGGTATTCGCTCTCATAATGAGTATATGTGACCAGTAGTAGGTATGGGGCGTCGAAATCCTATCCCCTATATTCCTTTTAATCTCGCGTTGAGGCAGAGCTGTAATCTTACATGGTCCCTCCAGGTCGCTGTCCTGCATTGTACAAGAAAACTTGACTTGTTACGATTGGACTCTCGGCGACTTCAACCGCATCATCTGGTGGACCGTTACGCAGGTTGTTTAGTGATTCGTAGCGAAAAGCGGTCGGGGTATAATCCCTTTGCTACAATGACATAGAGCGCGTCACTGAGCATATTGACCGTGTTCAAGACGCGTGTAATTTTTTACTAGACTGGCGCTGCAGAGCATGAAGTCGTCAACCCTTACAAGGCCTCTATTGAAGTACCCACAGTATGGCACACTAACGCCAATATAACTGTAGGTGCCCGAGCACTCATATGCACGGACTGAACAATACTGAATGCAGTTACCGTCCACCAAATGATTCGTGGTCCCGGCAGAGTAACGGGAAGGACCCTCATCGTGGTTCAACTAACTGCTTAACGGTATAGCTTCGATACGTGACCACAAGTAGGTGACTCGACGCGTATAATTTGGTGGACGGTAACCTTCGGGGGTATGTTCGGAGGGACAGACACGGGCAAGCAGTGGCTCAAATGATATAGTTGATTTACGGAAGCGAAACAGTCTGCTTCGAGGTAGTAGGCTGTTTGTCTAGTTTTGGATGACTGCTACGAACTAACTTAGGAAACATAGGCAACCCGCCTGCCGGGCAGCACCCCTCGTCGCTCTCTAGGCAATTCGTATACGGCGAATCTGGTCCGTAGGTTAAGGCCCTTATAGACATTTGGGCCCGAGACAATGGGGCTCCGAAACCCAGTACCTTTATAACGTGCACCCGGCTTTAGGACACCCACGGACAAGCGGGGGGTAGGCCTTCCGAAGCCACCCCCAGCGAACCCCCGCCTTTGCCGTGGCTTCATGTTAGAGAGCTCCGCCCGCACTCACTAGGACGAACACATTCCCCTCTTATGACGACGGTCGCCAAGTAGCCGCTCCTTTTGATATTGTACGCGAATCGTTACAGGATTCTATTCAATTCCTCTAATCCGTCGTCGACGCGAAAGGTAGCGAAGAGATTGCAACGCGAGAGCCGGGTGGCATCGGTGAAATGGTATTGGTGAGCTTTAAGTGGTTCAACGGTAGCAATGCAACATAACAGCGTATTGCATGTTTGACAGACAGAGCTGGCCCCTTACGCACTTAGCGTTATAAAGGGTAATTAGGCATGCCTGGATGGCTCAAGCTTGAACATCCCAGTTCATGCGTTACTCGAGAATTTAGTATCGTATCTGTGTGTCATTACGGGAAGGACGATTGTTTAGCGGCGCTCACCAACCTGCAGTCGGTTATATAGGCGTTTGCCCATGTGGGAGCGCAGCTGACAGCATAAAAGAATTTTTTCACATCTTCGGGATTCTCGCCTT
ACCGAGGCCTATGGAATCCGGCCGAGGTCACGGCTCGTGAGTTCGTGTCTTCAGAAATTATTTAGGAGTGCTTTGTGACCCGTCTCCCGGAATAGGATTGGGTACTAGTCCCAGATCAGCCTTATACCGGACAGCTATAGGAAGTCCCTCGGCCAAGGATCATTTGGTGGACAGTTACGGTTTGGACCTTTTCCTGCTGTTATCTGCGGGCATAGGTATCGTAAGTCTCGGAACAACACCTCGGCATGACTGTAGGTAGTCAAACTTGCGGAAACTCGATCTGTGCTTATCTATACCAGGATAATATGGTGGACCGTGACATAATTACCATATACTGCACTGACCCATGCCAGGTTCTTCTCTACCGCTGACGTAGCGGCGTCGGGGCTTACGGAATACAGTTGTTTGGCCCGCCGTCCGCCGTTGATACGTATCGGCGGTACTAGGCCTCGCGGGTTATCAAATTAGGCATAAACGTTACAACCCCTACGGCCTTAACTCCTTCAGGAGAGGCCGAAACTGTTTGCGCAGAATTGCTTGCTTATGCCACTAGAAGTCATAGAGGGTAACGGTCCACCATATTATTCGATGCTTCTTATTGCTGCTGCTAGTGCTTGTGCATTGGGGGGTCGTGAAATACAGCACCCGTCGTGACTGACTAACAACTGTTATAAGGGCTGTTACTGAATAGTCTAACAGCGAGGACATATTCGATTACCAAAGCTTCGCATAGCTTTCGGGGGCGAGACCTTAATTAAACGTCGTCCAAAGTGTGGAATGGCTACCTGCGACCAACGGGTCAGTAGACGTCCATACATTTGTCACTTCTTGAAGCAGGTGCATTTGTTGAACCATAAATAAATCGTACATCTCCCCAATAACATGGTGTGTTAGGTATAAGCTTGCACAGTTAATAGTTTCGGTACTGAACCCCTGGAGTCACAAAACGCCTTGTGCGCGAAATGACCCCTGCTCAAGGTCAGGAGGGCACCTATTAATGCAGGGTGCTAGCATATCCCATCCTCCGCCGGTGCACTTTCTCCATACGGACCGTTCCTACAGATGACTGTGCAGCCCGTCTCAATAAGAGACATCTTATGTCTCTAAAACCTCCTTAAGCCGAGTAGGTCTCCGGAGCCACGAACCAGTTCGCCCTACTTCAAGATATGTGGGTCGCTCGTAAGATGACTGCAGAATTCTGCCAGAGGATCCCGGTCGCGCATGATGACCCTTGAGTTATGATACCACGGGCACAGGCTCTAAGTCTTTACCGAATGCGCAGGTGAATATCCCTTTAAAGCCCGAATCTCTGAAGGCACGGTGCCGACTATGTTCGGTCGTCCCGTTGACACGTAAGAGTAGAGGATTCTTAGGGTGCTTCCTGTCGTTCTAGGTTATACATTGCCCAATCCAACTTGGGACATGAACCACAATCGACGCACTGTCCGAACACGGGGACGCCGCCGAGCGTACTAACGGAAAGTTCACCAAAACCTAGCTGATACTTTAGGTGCTGTGGTGTCGCTCAATGCGTCGATAATACTGTAGCGCCGAGACTCCTCTCGTGATTTTGTTGATTGAGACTGCAGGAACGTAATTGCATGCGTTATTCGCTCGTGCGCGACATAGCCGAGGGACCCGTAGTGTATTCGCATACCAATGGTCGGTTAGTTGTTACTGACCCGGGTCCCTGGTGTAATATGTCTTAACTTGCCCGGCTCGATGGGGTGATGCCAACCGTGATTTGGATAGGTAATCACCGCGTGGTTGCGACCTAGTGAAAACACTGTTCCGCATTGACTGCCGATGTGGTTCCTTAATCTGCAGTTCTTCCGGGAATCTCGAGACTATACACCGTGGGGCAATGGACAGTAACTGTCCACCAGATTATCCGCTCTTATTTGCACCTCCCTGATGTAATACCGGTGACCCAGCTCCGTCGCCCGCACAGCACGATTACGCAAGCTAAGTCTGATATCGCGATGGCTTTAGATATTCCC
AGTTGTTATTTGCTACACAATACGTGTATAAATGTGTCGCTGAAACGAACCATTGATGTCTGCTTCTTTTTCTATGGCATCAAATGAGTAAGCAACGGAACGTAAAAGCCTTTACCGCTACTAGTGCTATACAGTTTGAATTCGTTGCGGTCAACGCATGCCGAGTTTCTGAAAGTACCGGACATTCAGATGATTGCTACTACGTCACACATGATATTGTCCTTCATCTGCCCCACGAGTAATGCGGCCACTGAAGCAGTGTATCTGAGAGACCACCCACAGGTCTGGCGAGGAGCGATGATTGCTAGGACTTAAATCCCTTGGCTATCTACAGCTATATCGTCATGGCAGCCTTTTTACTATGCCTAATCAAATCGGCATGCCCTCTGGGCTGCTTGAGAATAGGAAAGTTTTCCGTGACAGTCCACCATATTATTCTGTCTCTTAATAGATCTTCCTGTAAGACACCGCCACACGCTGCGGGCCTTTACTATCCCTAGGCAGATGC 2 | RIIWWTVT 3 | -------------------------------------------------------------------------------- /week1-2/rosalind_ba1e.txt: -------------------------------------------------------------------------------- 1 | GTAACAGGAAAACGAGAGCTCCACGGATCCTTTTGGAAGTATTAATAGTGACCGCCCCCTGCATACATATTTCGATTGCGGTCAATCGGCCCCCGGATTTGTCCCGATCTGGCTTAGATCATGCGCCGGGCACATGGTAAAGTCTGTTCGACGCCACGAATTGCCAAGCTGAGGTGTGGGTCCTTAAGGAAGTAGCTTAGGAGCAACACGAGAGAAAACTGTAGGGTCGTCTACCGCCGTCACACAGCCAAGTCGTTTTACGTCCTCCATCAACAAATTCGCGGTATTCGCCCTTCGCCAACCGGCTAAGAGCCTTGCAGAACAGCTGCTAGCGGCAAGACAAGCCCCAGTGAAAACAGGTATCGCACCCAAACAACTAAATTGGAACTATGCGTTGCACTGCGACAGCCTACCCGCGATTGGTTTCCCTCCACGAAAACGGGGCCTTCCTCCCCACCCGAACGAGATTCTGACCGCTGCGATCCTGCACCACAGCAGCAACAGTACATAACTACCCATATATGACAGGATCGACGAAGGGATACAAGTATACTCTTTTAATGCGCACAGTCCGCGGGGATGACAGGGGTCATGAGTACAGGCGCTTAGTATAGCGGACTACTTCTAGCACAAGATTACTTTCGAACCGGGGGTTACCCGCTTAGGAGCTCTTGCCGTCGAAAGCACCCAGTTCCGATCTGCATGCTGGCTTTGTCGGGTGATATGCTCTGTCGGGGACGCTGGCGTGTTTATACGCAACACGAGTAGATAAATAGTTTGGAAGCCATGGATAAACACTGAGGCCATCCCCGTTGGGAGTAGTGAAGAGAGACAAGCATGCAAGTTAGGCGTAACGTACTGACATGGAGGCGTAACGTACTTCTTAACTTGCCATTTGGCTAGGGCTTAGGCGTAACGTAGTATAAAGAATCGGACGGCCCACGATAGGCGTAACGTAGCGTAACGTAGGCGTAACGTAATAGGCGTAACGTAAGGCGTAACGTAAAGGCGTAACGTACGTAATAGCCCGGAGGCGTAACGTAACGTCGAGGTTTTAAGACAAAGGATAGCTAGAGCGTAATGGTCCGTGGCTTTCCAGAACCTACACTGCGCAATTAACAAAGCTCGTAGAGGAGCTCGTAGAGGAGGCGTAACGTATAGGCGTAACAGGCGTAACGTAACCGTACAGGGAAGGCGTAACGTAGTAACGTAAGCGAGGCGTAACGTATCTATAGACTTTAAGAGGTCCCATTCCAGAAGCTCGTAGAGGTCGTAGAGGC
CTGCGTGTAAGTAGCTCGTAGAGGACGGTTGACTTAGGCGTAACGTAATTCGGAGACGTATGAATGCACAGGCGTAACGTATAACGTAACGTAAGTAAGCTCGTAGAGGGAGGAACGTATAGTCGCCGTGAATCTGCACCAAGCTCGAAGACTTATCAAATATCAAACAGCTCGTAGAGGTGTAAAGAGGGGAGCTCGTAGAGGCTAGAGTTATGGGTGAGACTTATCAAAGAGGGCTCGTAGAAGACTTAGACTTATCAAACCTGAGCAGCTCGTAGAGGTAGAGGAGACTTATCAAATAAGACTTATCAAACTAGACTTATCAAACAAATAGCAGCTCGTAGAGGATGAAAACTAGCTCGTAGAGGAAGAGACTTATCAAAAGAGAGCTCGTAGAAGACTTATCAAAAGAGGGTAGACCTGCGCACAACAAAGACTTATCAAAAAAGACTTATCAAAAAACGATGCCCGAATTAGAAGACTTATCAAAACTTATCAAAATCCGTATACAACAGACTTATCAAAGATAGAATTTCCGCAGAAGACTTATCAAACACTAAACTGGCTTACCAGACTTATCAAATTGGGGTGGTGCTGATTGACACCGTTCACCGAGACTTATAGACTTATCAAACAAAACAGGCAGTACCAGTGGGCGTGAGACTTATCAAATATCTCTTCGTACTTCGTATCTATGGACCAATCACCGATTACTTCGAGACTTATCAAACCTTCGTATCTATTCGTATCTATTCGCGTTTCCAGCTAGAGTAGCGAGTTGGTCTTGCCGCTCTTCGTATCTATTGCTGGTTTCTTCGTATCTATACACAGCCTTCGTATCTATGATCTTGAGCAAAAAGTCACTTCGTATCTATCGTATCTATCTATTCGTATCTATCGACTTCGTATCTATGCTGCTTCGTATCTATTCGTATCTACTTCGTATCTTCGTATCTATATCTATAAGCTTCGTATCTATACGAAAAAGGTACTCTAGAGTACGAAAACTTCGTATCTATTATCTATAGTACGACTTCGTACCCGATAATCCACCATGGCGATAATCCACCGATAATCCGATAATCCACCCTTTGAATCCAGTACGAAAAAGCGATAATCCACCGCGCGATAATCCACCTCGGAACAGCGGACCGTAAGAACAGCGGACCGTAGACCGTACACTTCGTATCCGATAATCCACCACGTACCGATAATCCACCCACGATAATCCACCAACAGCGGACCGTAAAAACAGCCGACGATCGATAATCCACCCAGCGGACCGTAAACAGCGCGATAATCCACCAATCCACCGATAATCCGATAATCCACCTACGAAAGTACGAAAAAGAATAAGTCAGCGGACCGTAGAATCGATAATCCACCATAATCCACCTAGTGTGCCAATGCCGATAATCCACCACGAAAACAGCGGACCGTACGGCCGTGCCGATAATCCACCTGAACGAGGCGTGGATCAACAGCGATAATCCACCCGGACCGATAATCCACCCCACCATAATCCACCGCTCAGCGGACCGTAGCACTCGAGTCACAAGCGATAATCCACCCCGTACCCGATAATCCACCGCGGACCGATAATCCCGCGATAATCCACCATCCACCTAAGAGTCAGCCAGCGGACCGTACACAGCCATCACGACACGGTCTCCCCCCCGAACAGCGGACCGTAAAACGAGCCCATCTAAGAATGGCCATAGGGCTGGGACGGGTGAGTAATTACCAAGGTTGACCCCAGTACACCGTTTTGAAAGCCAGCAAAGAGACTACGTATTCAGAGGGTTGCTGACAGAGTCTTCACAAAAGCATACCGTTCGGTTCCTAAACCGTTTGTCTGACATCCTGCCCAGTAGATCTGAACTTGGCGCTTTACCTCACGTCAGTGCAATTCTTATGTCGGAGGGTGATCCCGATCGTTCCATAGTGTCCCAGGCGACCTTTGCGGTCACCATCCCAATGGCGGGGGCTCGAGTTCCCAT
GAATAAATGCATACGCAGTCGAGCATCGCTTTGTGAGACATGTGTAGGTAGGAAAAAGTTTCTCTTGGCCAGCGTGAACTAGCCCTTCTTAGGCGTAAATCTATTCCTTATAGCGATTGTCGCAGGAGGCTGGTCGAGTATACATCAGGGCCCGTGGGGGACGACGATCATCCCATGGCCAGCCTTCAAACACTTTGGCGCGCCAGAGTAAAACGCTTTAATACGCCGACTTAAGATTGCCGAACGGACCATGTTTACAGCGGATTTGCCCGCGCGAGGCTTGGAGAAGGCAGTTGTGAGGTGCAGAACCACGTAGTTAGACGAAGGCCCGTAATTAACATTTGGGTCAAGTGCCAAGACGAAGGCCCACGAAGGCAGACGAAGGCCCCCCGTATAGCCCCTCCTGTCTAGATCAGAGGATGTTCAAGACGAAGGCCCCAGCAGACGAAGGCCCTATAATGACGTACACAGACAGACGAAGGCCCACCAGCAGACGAAGGCCCATGATAAGAGACGAAGGCCCCAAGGCCCGTCTCCCGTCGTGGAGAGACGAAGGCCCAAAGACGAAGGCCCAGCGGGCGGAGACGAAAGACGAAGGCCCCGAAGGCCCATCAGCATTTGTAGTTCGCAAGACCTCGTGCCTGGGATGTAAAGACTGCAGTAAGACGAAGGCCCCATGCTACTGGTCATAGCTAGACGAAGGCCCCACGAAGGTGGAAGACGAAGGCCCGAGAGACGAAGGCCCACATATAGTAAGACGAAGGCCAGACGAAGGCCCCGCACCTAGACGAAGGCCCTTGAAGACGCGTCTAGACGAGACGAAGGCCCAGGTTGCCGGCCCCTTGATAGGCAGGCCCTCCGTTTCTGCCGTTTCTGTCCCTCCGTTTCTGGTAAGACGAAGGCCCCCCACAATGCTACTGCCCTCCGTTTCTCCTCCGTTTCTGCGAGCCAGTCTGGGGTTGCTTCTGGCGCCATCCTCCGTTCCTCCGTTTCTGAAAGCGGCAGACCTCCGTTTCTGTCCACCTTAGCACGGGGTTTAGCCGATACTGACCTCCGTTTCTGTGGGCGCGCGGAGCGGGAGCATGATCACGCCGACTAGCCTCTCGAGCACTATACCACAGCTATGTTAGGAGGATGTCCCCCTCCGTTTCTGGGCGCTCATGCGGTGGGTCCTCCGTTTCTGCGTTTCTGATTACCATGGATGGTCCTCCGTTTCTGCCCTCCGTTTCTGGACGTCGTTTCGACTCCTCCGTTTCTGAGGTTAGTGAGGCTGTACACGACTGGGGCCCCTCCGTTTCTGGCCTCCTCCGTTTCTGCCTCCGTTTCTGTACACCTCTAGTGACTCCTCCCTCCGTTTCTGCTGGTGTTCCCTCCGCCTCCGTTTCTGACCTCCGTTTCTGTCCTTGGCGTCCCTCCGTTTCTGCTCCGTTTCTGAGCCTCCGACCTCCCGCTTCGGACCTCAAAGCCGGTAATGTGTTTCGCCGATGCGATGGCGGGGAACCATAAACCGACAGCCAATAACTGAGCTCCGGCCTATTGCCCGATAGGACGTACGGTCCCGACGATCCGAAGGGTCTCCAGGACTTTTTGTGGAATTGGGTCACAACCCGTGACTTGTTGCGTAATGATGCCTGAAAGCTGGAACCTCCTGTTGGCTTGCGTGTGGGGGCGTCCGGACGAACGGCACCACCATATGAATGATCAACCGGTGGTTTACCGAGGTATCGCAGTCGCAAACGCGTAGCCGCGGTATCCCCCTGGCACTGGCCCTTAGGTGGGCACGGAACCTCTAGTAGCGGACAAACGGTAAAAGTCATAAGTAAAAGACTATGTAAGACTCAACAGACAGGATTCCGTATAAGAACCCGGGTGACCAGTGTCTCTGCTTCGGACAATGTTAGTAAGGGATCAGTCTCTCTAAGATCAAAGCGCGCTACGTTTCACTGTTGGATCGTACTCGTGCTACGCCAACGCCGAGATACACGCTAAACCCTAGATTC
CACTACTGTCATCGTACTGGAGGAAAGGTTCTAGAAACGCACCTAATCGACTTGGTCACGGGTTTCCCAGGCTCATGCCCAGAGTATCTTGCCCTTGAGCAACGTCGGGCACGAACCCTGGAACTACTACGATCGTCTGCACAAGTAGCCGTCCAATCCAACTATCGTTGCCAAATTGCACCCTTCAACTTCCAGAAGATTCAGTCCCGTTCTTATGCACATCTCTGATAGAAGTCAGGCAGCTAAGTATGGGACTGATTCACTGGACACGAACGCTGGACTCGTGCCTAAAACCGGTGCCTTACGAGGGATTCACTGGATTCACTGGACTTGATTCACTGGAGATTCACTGGACTCCCTTGTTGCAAAAGTGATTCACTGGACCGCTCCGCAGATTCACTGGATTCACTGGACGTCGGTAAGGTGGATTCACTGGACTCCTTCGATCCGATTCAGATTCGATTCACTGGACTCACTGGACGATATGCCGAGGTAGTCACAAAAGATTCGATTCACTGGACGCGTTGGCTGGAACACGATTCACTGGACCACTGGACAAGGAGATTCACTGGACATGTTAGAGATTGGGCCGCCGATTCACTGGACCCGTTAGTAAAAGTCGGGCGCACCTGGCATGATCGCAAATATGGATTCACTGGACCATAACGGATTCACTGGAGATTCACTGGACTCTCCGCCTTTAGAGCCACGGCCGCTGCACATGCATAAACCAGTGGAAACGTATTCCACTTGATTCACTGGAATTTCACCGGGAATTTCACGAATTTCACCGGACCGATAGATTCACGAATTTCACCGGTTTATGAATTTCACGAATTTCACCGGTACCCGATACGAATTTCACCGGAGGTACGCCAAAAATGAATTTCACCGGCGGTCAACCGGTGCACCATTACGATGTGGAGGGAATTTCACCGGCTAGGAGGTTATCAAAAGAATTTCACCGGTAGAGTACGAATTTCACCGGGTGCGTTAGAATTTCACCGGTGTTGTCGATGTGCCATTGCAAACCCTTCCGCTTCCGATGGTTCCAAGCAGGTTTCAAGAATTTCACCGGTTATCTTCTTTGGGTATGTAGAGAGCGAGAATTTCACCGGATGAATTTCACCGGATTGAATTTCACCGGAATTCGGGAATTTCACCGGTCACCGGCTACGGTACAGTGGTTAGAGCCAGCATCGCCATTTCGCGCCCATGAATTTCACCGGTTTTTGAATTTGAATTTCGAATTTCACCGGATTTCACCGGACGGAATTTCACCGGTTAGGAATTTCACCGGGAATTTCACCGGTCATCTTCATGACGCGTGAATTTCACCGGTGCCTTGTCTCCTAAGTCGTCGAGTAGCAATTGATAGATAGACACGGAAAAAAATCGTATGAGTGATTTCGTAATAGGAGGGTAGCTAGTCGTATGCTCATGACCTTTCGCGACAGATCTCGGACTAGGTGAGGCGCGCCGACTAGGTGAGGTCGACTAGGTGAGGTTGGTCGCGACTAGGTGAGGGACTAGGTGAGGCTTTGAAGATTATACGTAGGTGATGCTTGTGACTGACTAGGTGAGGAGGCTTACGACTAGGTGAGGGACTAGGTGAGGGTTAAAATTTCTTTTGGTGAGCAAACTCCATGACTAGGGACTAGGTGAGGCGACTAGGTGAGGGGAGTATGAGAATGACTAGGTGAGGCCTGCATCGGGGGGACTAGGTGAGGACGCCGTAGACTAGGTGAGGGGCTCCCCTTGGATGTCACCCTTGACTAGGTGAGGGACTAGGTGAGGCCGTCCGAGGGACTAGGTGAGGCAAGACGCTACGACTAGACTAGGTGAGGCCGGAGACTAGGTGAGACTAGGTGAGGAGCGACGACTAGGTGACTAGGTGAGGACTAGGTGAGGCAATTTTTGACTAGGTGAGGCTAGGTGAGGAACCGGCGAGAATACAGGGGCGCGCTACCTTGGGATGCCTAAAGGACGCTTCGAGATCC
TGTTATAGGCCATATCCGCGGACTAGGTGAGGCCGGACTAGGTGAGGCCTCGTTCGCTTCTTCCGCTCCCGTCGCATGGCGGGGAAGCCTCGGCCCTACCAATGTCGGTGAGTACGTGTCGAAACACACGAACCACCGCCTATTACACCCAACTAAACAACCCTCCCCAAATGTGGTGCTTCAGCCTTTTCGCAACGTGGGTGAAGGCGGCCCTGGCCAGGCTGTATGGGGTTGGTATTTGATCCGTTGTTTGTCACCTAATTAGAAATGTCACCACGTCAAATTGTGTACTACAAGTGGCATCTAAGTCGGTCGAATAGAATTAATATATGCCGGTAATGCATTGTGTACATTTTTGTAAAAGGGGTTCGATGCCGTTATATGGTCCGCACCGAGACTCTAGAAAATCTCAGCTGGTCTCGTGACCGTATTTCAAGCAACAACTTTCGGTGATAGATGAGTGAGTTTAGGTCGCGTATGAGAGGGTCAGCTCGTAACGGATGAACTTCACACGTCTTGTGGTTTCCAGCGGCGGGCTCTCACCCAATTCCCTATATTTAAGTACGTAATTCGTATAAATAACGGTCCGCTAACGGTCCGCAAAACAATAACGGTCCGCGCCTTCCTAATGTCCCCAATAACGGTCCGCATAAATAACGGTCCGCCGGACCTTGAGTTTTACGGTGGACGCACATAACGGTCCGCACGGATAACGGTCCGCTCCATAACGGTCCGCTGCTCGAGAAGATAACGGTCCGCGATAACGGTCCGCTCCGCCATTTCCTCCCTAATTCCCTTCTGCGAACGTGTCCATAACGGTCCGCGTGATAACGGTCCGCTCCGATAACGGTCCGCTTCGATAACGGTCCGCACCCTTCAACTTCCTTAAGGCTTCCCTAATCCCTAATTCCCGCTGTTATGCGGCCCGGCCTCACGGAGTTCCCTAATATAACGGTCCGCCCCGTCTCCCTATAACGGTCCGCCTATAACGGTCCGCCTGCTTAGCCCTGGGAACTACCACAACTGGTTCGCAGATAACGGTCCGCGGTCCGCATAAATTCAATTCGAGATAACGGTCCGCGTCCGCAATTCCCCCCTACCGGCTCCCTAATTCCCGTTAATAACGGTCCGCAACGGTCCGCATAACGGTCCGCCTATCCCTAATTCCCAGACTGCATCCCTAATTCCCGCGTGCGGTTGTTCCCTAATTCCCTTTAATGTTACCCAGAGGCCACGCCCAGAGGCCACGCCCCTCCCTAATTCCCTTAGCCACTTCCCTAATTCCCATCCCAGAGGCCACGCTTCCCAATTCAGAGGCCACGCAGAGGCCACGCAGAGGCCACGCCAGAGGCCACGCGAGGCCACGCGATAGTGGAGGAACAGAGGCCACGCAGCAATAGTCAGAGGCCACGCGGAGGACCCTCCTGTCAGAGGCCACAGAGGCCACGCGACATTGCAGAGGCCACGCAGACAGAGGCCACGCCGCGGAGGACCAGAGGCCACGCGCAGAGGCCAGAGGCCACGCAGTGGAGGACCAACCTCTCTTAGTCTACAATAATAGTGGAGGACTCGGGTGACTGGCAGAGGCCACGCCAGATAGTGGAGGACTGGAGGACCCCATACAGAGGCCACGCCAGAGGCCACGCCCATAGTGGAGGCAGAGGCCACGCCAGAGGCCACGCATACAGAGGCCACGCCTACCAGACGGCATTCTGGCACAGAGGCCACGCGAGGACCGAAATCCCGATGTATCAGGGTCGAGGTCTCTGCCCAGAGCAGAGGCCACGCGCAGAGGCCACGCCACGCGGAGGACATAGTGGAGGACGAATAGTGGAGGACTCTCTCTCACGTTTTTTGCCATGACTAGACCTCGAATCGAGCCGCGCGGATGCGAGTCACAGACTCCAGGCGCCACATATGCTGTTAGAGCCGACGACATATGCCGGCAAGTTTCCGAACGATTTTCCGAACGATTTTCCGAACGATTTTCCGAACGATTT
TCCGAACGATCCGAACGATGATTTTCCGATTTCCGAACGATACGATTTTCCGAACGATTTTCCGAACGATTTTCCGAACGATTTTTCCGAACGATTTTCCGAACGATTTCCGAACGATTTTTTCCGAACGATTCCGAACTTTCCGAACGATGATTTTCCGAACGATTTTCCGTTTCCGAACGATAACGATTTTCCGAACGATTTTCCGAACGATTTTCCGAACGATTTTCCGAACGAT 2 | 12 501 16 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bioinformatics-Algorithms 2 | Authors: 3 | 4 | • Pavel Pevzner (University of California, San Diego) 5 | 6 | • Phillip E. C. Compeau (University of California, San Diego) 7 | 8 | Resources: 9 | 10 | • bioinformaticsalgorithms.com – Lecture Videos 11 | 12 | • Stepik.org – Interactive Text 13 | 14 | • Rosalind.info – Programming Exercises 15 | 16 | 17 | # 1 - Find Patterns Forming Clumps in a String 18 | Given integers L and t, a string Pattern forms an (L, t)-clump inside a (larger) string Genome if there is an interval of Genome of length L in which Pattern appears at least t times. For example, TGCA forms a (25,3)-clump in the following Genome: gatcagcataagggtcccTGCAATGCATGACAAGCCTGCAgttgttttac. 19 | 20 | Clump Finding Problem 21 | Find patterns forming clumps in a string. 22 | 23 | Given: A string Genome, and integers k, L, and t. 24 | 25 | Return: All distinct k-mers forming (L, t)-clumps in Genome. 26 | 27 | Define the skew of a DNA string Genome, denoted Skew(Genome), as the difference between the total number of occurrences of 'G' and 'C' in Genome. Let Prefixi (Genome) denote the prefix (i.e., initial substring) of Genome of length i. For example, the values of Skew(Prefixi ("CATGGGCATCGGCCATACGCC")) are: 28 | 29 | 0 -1 -1 -1 0 1 2 1 1 1 0 1 2 1 0 0 0 0 -1 0 -1 -2 30 | 31 | # 2 - Find a Position in a Genome Minimizing the Skew 32 | Minimum Skew Problem 33 | Find a position in a genome minimizing the skew. 34 | 35 | Given: A DNA string Genome. 
36 | 37 | Return: All integer(s) i minimizing Skew(Prefixi (Genome)) over all values of i (from 0 to |Genome|). 38 | 39 | Sample Dataset 40 | CCTATCGGTGGATTAGCATGTCCCTGTACGTTTCGCCGCGAACTAGTTCACACGGCTTGATGGCAAATGGTTTTTCCGGCGACCGTAATCGTCCACCGAG 41 | 42 | Sample Output 43 | 53 97 44 | 45 | # 3 - Find All Approximate Occurrences of a Pattern in a String 46 | We say that a k-mer Pattern appears as a substring of Text with at most d mismatches if there is some k-mer substring Pattern' of Text having d or fewer mismatches with Pattern, i.e., HammingDistance(Pattern, Pattern') ≤ d. Our observation that a DnaA box may appear with slight variations leads to the following generalization of the Pattern Matching Problem. 47 | 48 | Approximate Pattern Matching Problem 49 | Find all approximate occurrences of a pattern in a string. 50 | 51 | Given: Strings Pattern and Text along with an integer d. 52 | 53 | Return: All starting positions where Pattern appears as a substring of Text with at most d mismatches. 54 | 55 | Sample Dataset 56 | ATTCTGGA 57 | CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAATGCCTAGCGGCTTGTGGTTTCTCCTACGCTCC 58 | 3 59 | 60 | Sample Output 61 | 6 7 26 27 78 62 | 63 | # 4 - Find the Most Frequent Words with Mismatches in a String 64 | We defined a mismatch in “Compute the Hamming Distance Between Two Strings”. We now generalize “Find the Most Frequent Words in a String” to incorporate mismatches as well. 65 | 66 | Given strings Text and Pattern as well as an integer d, we define Countd(Text, Pattern) as the total number of occurrences of Pattern in Text with at most d mismatches. For example, Count1(AACAAGCTGATAAACATTTAAAGAG, AAAAA) = 4 because AAAAA appears four times in this string with at most one mismatch: AACAA, ATAAA, AAACA, and AAAGA. Note that two of these occurrences overlap. 67 | 68 | A most frequent k-mer with up to d mismatches in Text is simply a string Pattern maximizing Countd(Text, Pattern) among all k-mers. 
Note that Pattern does not need to actually appear as a substring of Text; for example, AAAAA is the most frequent 5-mer with 1 mismatch in AACAAGCTGATAAACATTTAAAGAG, even though AAAAA does not appear exactly in this string. Keep this in mind while solving the following problem. 69 | 70 | Frequent Words with Mismatches Problem 71 | Find the most frequent k-mers with mismatches in a string. 72 | 73 | Given: A string Text as well as integers k and d. 74 | 75 | Return: All most frequent k-mers with up to d mismatches in Text. 76 | 77 | Sample Dataset 78 | ACGTTGCATGTCGCATGATGCATGAGAGCT 79 | 4 1 80 | 81 | Sample Output 82 | GATG ATGC ATGT 83 | # 5 - Find Frequent Words with Mismatches and Reverse Complements 84 | We now extend “Find the Most Frequent Words with Mismatches in a String” to find frequent words with both mismatches and reverse complements. Recall that $\overline{Pattern}$ refers to the reverse complement of Pattern. 85 | 86 | Frequent Words with Mismatches and Reverse Complements Problem 87 | Find the most frequent k-mers (with mismatches and reverse complements) in a DNA string. 88 | 89 | Given: A DNA string Text as well as integers k and d. 90 | 91 | Return: All k-mers Pattern maximizing the sum Countd(Text, Pattern) + Countd(Text, $\overline{Pattern}$) over all possible k-mers. 
92 | 93 | Sample Dataset 94 | ACGTTGCATGTCGCATGATGCATGAGAGCT 95 | 4 1 96 | 97 | Sample Output 98 | ATGT ACAT 99 | # 6 - Implement GreedyMotifSearch 100 | GREEDYMOTIFSEARCH(Dna, k, t) 101 | BestMotifs ← motif matrix formed by first k-mers in each string 102 | from Dna 103 | for each k-mer Motif in the first string from Dna 104 | Motif1 ← Motif 105 | for i = 2 to t 106 | form Profile from motifs Motif1, …, Motifi - 1 107 | Motifi ← Profile-most probable k-mer in the i-th string 108 | in Dna 109 | Motifs ← (Motif1, …, Motift) 110 | if Score(Motifs) < Score(BestMotifs) 111 | BestMotifs ← Motifs 112 | return BestMotifs 113 | Implement GreedyMotifSearch 114 | Given: Integers k and t, followed by a collection of strings Dna. 115 | 116 | Return: A collection of strings BestMotifs resulting from running GreedyMotifSearch(Dna, k, t). If at any step you find more than one Profile-most probable k-mer in a given string, use the one occurring first. 117 | 118 | Sample Dataset 119 | 3 5 120 | GGCGTTCAGGCA 121 | AAGAATCAGTCA 122 | CAAGGAGTTCGC 123 | CACGTCAATCAC 124 | CAATAATATTCG 125 | 126 | Sample Output 127 | CAG 128 | CAG 129 | CAA 130 | CAA 131 | CAA 132 | # 7 - Implement GreedyMotifSearch with Pseudocounts 133 | Implement GreedyMotifSearch with Pseudocounts 134 | Given: Integers k and t, followed by a collection of strings Dna. 135 | 136 | Return: A collection of strings BestMotifs resulting from running GreedyMotifSearch(Dna, k, t) with pseudocounts. If at any step you find more than one Profile-most probable k-mer in a given string, use the one occurring first. 
137 | 138 | Sample Dataset 139 | 3 5 140 | GGCGTTCAGGCA 141 | AAGAATCAGTCA 142 | CAAGGAGTTCGC 143 | CACGTCAATCAC 144 | CAATAATATTCG 145 | 146 | Sample Output 147 | TTC 148 | ATC 149 | TTC 150 | ATC 151 | TTC 152 | # 8 - Implement RandomizedMotifSearch 153 | RANDOMIZEDMOTIFSEARCH(Dna, k, t) 154 | randomly select k-mers Motifs = (Motif1, …, Motift) in each string 155 | from Dna 156 | BestMotifs ← Motifs 157 | while forever 158 | Profile ← Profile(Motifs) 159 | Motifs ← Motifs(Profile, Dna) 160 | if Score(Motifs) < Score(BestMotifs) 161 | BestMotifs ← Motifs 162 | else 163 | return BestMotifs 164 | Implement RandomizedMotifSearch 165 | Given: Positive integers k and t, followed by a collection of strings Dna. 166 | 167 | Return: A collection BestMotifs resulting from running RandomizedMotifSearch(Dna, k, t) 1000 times. Remember to use pseudocounts! 168 | 169 | Sample Dataset 170 | 8 5 171 | CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA 172 | GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG 173 | TAGTACCGAGACCGAAAGAAGTATACAGGCGT 174 | TAGATCAAGTTTCAGGTGCACGTCGGTGAACC 175 | AATCCACCAGCTCCACGTGCAATGTTGGCCTA 176 | 177 | Sample Output 178 | TCTCGGGG 179 | CCAAGGTG 180 | TACAGGCG 181 | TTCAGGTG 182 | TCCACGTG 183 | # 9 - Implement GibbsSampler 184 | GIBBSSAMPLER(Dna, k, t, N) 185 | randomly select k-mers Motifs = (Motif1, …, Motift) in each string 186 | from Dna 187 | BestMotifs ← Motifs 188 | for j ← 1 to N 189 | i ← Random(t) 190 | Profile ← profile matrix constructed from all strings in Motifs 191 | except for Motifi 192 | Motifi ← Profile-randomly generated k-mer in the i-th sequence 193 | if Score(Motifs) < Score(BestMotifs) 194 | BestMotifs ← Motifs 195 | return BestMotifs 196 | Implement GibbsSampler 197 | Given: Integers k, t, and N, followed by a collection of strings Dna. 198 | 199 | Return: The strings BestMotifs resulting from running GibbsSampler(Dna, k, t, N) with 20 random starts. Remember to use pseudocounts! 
200 | 201 | Sample Dataset 202 | 8 5 100 203 | CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA 204 | GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG 205 | TAGTACCGAGACCGAAAGAAGTATACAGGCGT 206 | TAGATCAAGTTTCAGGTGCACGTCGGTGAACC 207 | AATCCACCAGCTCCACGTGCAATGTTGGCCTA 208 | 209 | Sample Output 210 | TCTCGGGG 211 | CCAAGGTG 212 | TACAGGCG 213 | TTCAGGTG 214 | TCCACGTG 215 | # 10 - Implement DistanceBetweenPatternAndStrings 216 | DistanceBetweenPatternAndStrings(Pattern, Dna) 217 | k ← |Pattern| 218 | distance ← 0 219 | for each string Text in Dna 220 | HammingDistance ← ∞ 221 | for each k-mer Pattern’ in Text 222 | if HammingDistance > HammingDistance(Pattern, Pattern’) 223 | HammingDistance ← HammingDistance(Pattern, Pattern’) 224 | distance ← distance + HammingDistance 225 | return distance 226 | 227 | Compute DistanceBetweenPatternAndStrings 228 | Find the distance between a pattern and a set of strings. 229 | 230 | Given: A DNA string Pattern and a collection of DNA strings Dna. 231 | 232 | Return: DistanceBetweenPatternAndStrings(Pattern, Dna). 233 | 234 | Sample Dataset 235 | AAA 236 | TTACCTTAAC GATATCTGTC ACGGCGTTCG CCCTAAAGAG CGTCAGAGGT 237 | 238 | Sample Output 239 | 5 240 | # 11 - Reconstruct a String from its k-mer Composition 241 | String Reconstruction Problem 242 | Reconstruct a string from its k-mer composition. 243 | 244 | Given: An integer k followed by a list of k-mers Patterns. 245 | 246 | Return: A string Text with k-mer composition equal to Patterns. (If multiple answers exist, you may return any one.) 247 | 248 | Sample Dataset 249 | 4 250 | CTTA 251 | ACCA 252 | TACC 253 | GGCT 254 | GCTT 255 | TTAC 256 | 257 | Sample Output 258 | GGCTTACCA 259 | # 12 - Find a k-Universal Circular String 260 | A k-universal circular string is a circular string that contains every possible k-mer constructed over a given alphabet. 261 | 262 | k-Universal Circular String Problem 263 | Find a k-universal circular binary string. 264 | 265 | Given: An integer k. 
266 | 267 | Return: A k-universal circular string. (If multiple answers exist, you may return any one.) 268 | 269 | Sample Dataset 270 | 4 271 | 272 | Sample Output 273 | 0000110010111101 274 | # 13 - Reconstruct a String from its Paired Composition 275 | Given a string Text, a (k,d)-mer is a pair of k-mers in Text separated by distance d. We use the notation (Pattern1|Pattern2) to refer to a (k,d)-mer whose k-mers are Pattern1 and Pattern2. The (k,d)-mer composition of Text, denoted PairedCompositionk,d(Text), is the collection of all (k,d)-mers in Text (including repeated (k,d)-mers). 276 | 277 | String Reconstruction from Read-Pairs Problem 278 | Reconstruct a string from its paired composition. 279 | 280 | Given: Integers k and d followed by a collection of paired k-mers PairedReads. 281 | 282 | Return: A string Text with (k, d)-mer composition equal to PairedReads. (If multiple answers exist, you may return any one.) 283 | 284 | Sample Dataset 285 | 4 2 286 | GAGA|TTGA 287 | TCGT|GATG 288 | CGTG|ATGT 289 | TGGT|TGAG 290 | GTGA|TGTT 291 | GTGG|GTGA 292 | TGAG|GTTG 293 | GGTC|GAGA 294 | GTCG|AGAT 295 | 296 | Sample Output 297 | GTGGTCGTGAGATGTTGA 298 | # 14 - Generate Contigs from a Collection of Reads 299 | Contig Generation Problem 300 | Generate the contigs from a collection of reads (with imperfect coverage). 301 | 302 | Given: A collection of k-mers Patterns. 303 | 304 | Return: All contigs in DeBruijn(Patterns). (You may return the strings in any order.) 305 | 306 | Sample Dataset 307 | ATG 308 | ATG 309 | TGT 310 | TGG 311 | CAT 312 | GGA 313 | GAT 314 | AGA 315 | 316 | Sample Output 317 | AGA ATG ATG CAT GAT TGGA TGT 318 | # 15 - Construct a String Spelled by a Gapped Genome Path 319 | Gapped Genome Path String Problem 320 | Reconstruct a string from a sequence of (k,d)-mers corresponding to a path in a paired de Bruijn graph. 321 | 322 | Given: A sequence of (k, d)-mers (a1|b1), ... 
, (an|bn) such that Suffix(ai|bi) = Prefix(ai+1|bi+1) for all i from 1 to n-1. 323 | 324 | Return: A string Text where the i-th k-mer in Text is equal to Suffix(ai|bi) for all i from 1 to n, if such a string exists. 325 | 326 | Sample Dataset 327 | 4 2 328 | GACC|GCGC 329 | ACCG|CGCC 330 | CCGA|GCCG 331 | CGAG|CCGG 332 | GAGC|CGGA 333 | 334 | Sample Output 335 | GACCGAGCGCCGGA 336 | -------------------------------------------------------------------------------- /week9-10/linear_space_alignment.txt: -------------------------------------------------------------------------------- 1 | ECFNCNNVQKGAANITMTEFIRWAKHPPSYTSAYAAMWCAAPNSENILYRIECMCAHNWCTTVSDESIHILEQYTYDMAASHVWFIDGAYAACTFYCHFIDAYKDRYGTCRDVLVLAFDCVVNKCHSGCDNWCPDGSWRIHRVWYCPKSCCIEEYFMAKMHACCVAESDYCEVAHSVKGGYMKQRKMAAEQVNGDIGIMCEVLEVDMMWAAYRSQKKVPEFILKANSMIKSEYPEYHEQDKDHTEIMGVYVTFNNIICVIKQQFDCDRMMYMSRHSFCAGDFNQAWNGCLRWCIFDYDVIVVCWLIKPVEENNVTKSCNVDKVEMGSVTCICADHEIKCCTHYTDYGHAFHTGVNPIMQGKVGKQWVAIWWGIHDGSMRAFKCHDPICSPGCAPREDKFLVALAFPWKDQDPTNAKDATTMEKTTDWLWKQDMPHMFHMSMFSDSWVEVITLMALNKKGGCHLEQRMLEVALACDYIWCCKYQKLLLCSLQRNTCTMIYVVMGYVYQGLTTVTKNCKPILAMQEYWTNRDKFKCHTPWEFKKTWMICTPMGYCALHFCTCPWPIMCQYRPRIDYFVNPSMLYNFWADIDPTIKSTYENLMDGEAFPLTNANRVFLEEYMNTNHRWRVHIIVHNERAMMDRMRGACCANSLEVQIYSGVSEYQNTKHHTKKPDTEQIEWATSRYHEYMWDNDTDIQYCFDNHCCSNMSYKYGHVLWERQACNEKRFKKTQDDCHFIIGEGCGQYRDWFSMPLQHPQIRSCYIWIPSKVDAKMLRHWLKYGQIHWEFMDVCMHYTFFDDAIAENKCLERKCMDHVVVHYDASPFQYTMNGKPFRVEFMVCATHNCVPVDFKRVFVCAGNNNATVVDEMKMNEKGTDKFHRLCRCPAVYGCARPCSERALYASQICDTMTCNRGYSEQLWINNYGIRLKQETIFSYQPWMVVNGVLAPWRVFEATKDMIGVSLTTDSKVDRQYLAYAELVLGPSCPSYQKIVQFQDLASQNRSCKLDVNHEPIHNQFHEVIAVCRFIVIHQETGWMSYYPVTPWNSGAQRNKIAQQWGNAGVGQRAKCHELDFAYTCDFANVSAAYHGKPSVHALIVGPCKDLGNYASGWCQTCTQDNGVWKYNRSTQMLLVEKQGKISFIVKRIMGNTYIHYAEKQRLGFVRIVGIGSMDRHNVVIASWHCYMEINLNPWSWKQGPQADEYRTYPPPYKRNKIMFKYQWVYGWVGNDVEVQTKENLQQEKQAPAMARMRFYFLAKDQWDKRKYTPVRGHCFDRVFDQGAVEKYYPKFSVKDGNWTCDQCREFYNDMVWVFEEDICYERDDKPTAPGQNNTHNDKEPYDNGFYQSKYCENCMTTYIVSIFAIYQWQAQWFDDRHCRDWNRSNWMCCRPTFNYDSNNHYIDSINQLSIMWILFH
YTAYIWDYWASHIQHGDICMNYPIDIMFCNYHDTDDNPHNHEFYVSGFHQTDHDFPGKTRDCFKYGMGGKRETIHHQTIAIEFSEYDREYSSAECSWEHHTIIYAFCSCAHDKGPWDMAYNVVTTPCAGVEHTESLRPNFDAAMYWVQYWNIRLTREKFTVPEKPMTLWMDQTLIEHNVKQSHTNTQDNCRMMENRMIHINVSQYQKPNLPDTVDATAYFAMVMTQDDPWYRTNPLGIWELYMEPLQSYDEIKIKYICFGNSIVMPGDVFYLTCIFNEQMDQHSGFPTMNHISAPIGFVTISANHYPISGDQIMLWFYAGEWRASDSRIKCTTQWICGAARSYSCDPCCSVFWISTVLEHYSINTHHVQAFHRPTQHISCKIIDDCMASKIYCPIWMTGDWPVENWRCFPFRVYTLWEKACRRVCVYQFRLWGCWFLHECHTPMQNPWFRCKLYAWMNTQDETQAVYCDNLACIIAMMNFGTPWSQFIPSAEVEKPLEWGGIIQQTIQRNIATLDYKWDWVKKSAVDWAEWRAVTDHEKFSHTVFDSKHNLVEYSMSFGGESWLIKTTNSRPGCWGLGVSWFVGNELMKNQTLAHRVLVTRWMQARWWSRVRTKSRQCKAHGRWGNKDDNRFCPYFDKLKFHRMFMMGNMWLHYIMPAFTNFVVAHQDQGQRYEPHCLQKFPPHDDPQRPVKALQLHMQWIVPTCVQKSGSWERSSHDAFVWMEVYPIDCGCRPRQVTQVAMWDVPKQRVLHLHFRVNWVLMEDQPSNTNQGQALPFWFSVDQWGIWMPDAYVPLRTYIECRFKLKRMRVQWKITDFFPTRYMINWLNQMHRWISFHNDKLDILWNGEWMSQCQYRVMEQRKKLMPQDMRYNDRCPNGHNHNLDMWIRETKGLPDFWHASHGHPDFWGAHQCCNVNWNEYQTWVQGLCDRQNYCMDASKEHAHWCEGMCKLPYCSTFSMCLISMEPPDYFSMHQMFDVCQTKCMYKRQTTPSCQINIFSALQKHSDVKSSCDPHIEVIQWLYAMASGCQSSETSSQRQYPPHACPHDKIRDINYLWTAADFTRMRKMPHAHIKRRWLFPPRIAHYRPEARQAWHGDGTYPHRPQKHWFTECTIVCHFSQLSMAEDCLAWLVIPWYMITLMDLLILNIIFLMYRTYGKDNFWKNPERHHKGFQSKNVFFVDWSHQNIGRWQYPDTMNPANSYSSHCCEDTVWTVYIWHVCWFGVMQVVQTDRRENNTHSFEEVLCTNYWWFNAEHKYSMSAWCKNTNFMCDFGINFWEVAHPGSHSPTHPDVRAAFEHTKHPMPHDATQIRDNDNILRISLTLHKDEERWNLPRWFTNRRNRMHAAHEQDTAKYNTWCKMTNMGSEKRTWTFARPVVWLCKGQEFKKCQDYIYPDRQLGCFIDLCKTMVILAPEFHDWDMVGRCWCNQYLHQCSYFDWMSFWIAENSNVSCPNSPHKWRYFYRVQIWTYWPLCKYAAAHMSGYDCVGGVPEFGNEPWMAFAWRPPAPIRMRTPVCHLAVENAKAVGPKVLHARRSASNWMWVARFKQIMCDIMGPRKAVNNFMWGTIDCFEMEAKRNYIDVWQYNINRGYSQFNYMDADRFASFCTNTNHIWWNWRPQQLRYKGNAFLNAICVIESDCGMCNPRVEAFMQNYRDCEKFIDKGCCKRLWHRCQGNPCRCNQPCRSWGSQIMRCAVWMMIVCINVSISVTRIGGCKMKEMGMMMQYNQRACVHEGHQNENVYACHPGWPEIFSMFEVCCVCWMNTLCNLLEGAELSRPNVQKSNELQEDMWRVETRGSLFRSGEHRKWCKAKAWSANTKNVEMPAQRIWEVISKQWWYNNIIRPDDWIVHGPTLNFQEDRSVRTWLKACGMQRRMDHWCCHGDLPNGDDKVLCGGPRFLQFHAMGNFGCSMAAIARCTYLNDQRRGPFNTREYWQWVEMHPHYGRHVRMGRPTDKSSPFPRCDCHMCMCPSEESDPWKPLQSIPTGAYKQFQFDTPDHYCRKD
QISKNLKVRCNYDVWFFGFEMQKVIYTWCVHPFGNRHYETHYGNWQCSELRTIVQIHTQFIPWGEQEAWFRPKSYNQLALACVFESLQKILWHANISIRYEPPEWYTEHLLHGHWKEIVSFCAWFERQHICQPWNSMETWSAQRMQIQHSDRRDCYEPDDEETCYMCGVRSYRSYQSRNGLIENDWVANLPYHYCFVVYQFVMVLHQQWHNGKTLMYTKDPEAWCEICFLGDFILQTDNMQWKWSWFCPHQHASFPEPFFLFRRTCWKKYLLQGKRKHTNDFIMTVDVFWAMGQLVEWRRYNTWIGPVMEQDPMVRCFQTHKGELGNQWESYEMKLQHNAQVIRGIPPIISKTNTLKKEDDCSEDNCEYKTRNRDKWGLQSPIPHQYHMSFCACWDWSIDNTLILRFYMIKVYANIMLWMLNSCANLHMCHGILAKWNDFQFMIEAAIVSHTEWAAVYRPAIMRNKYQVRFQLCESLQGWYHWWTFMEMMSVRIVDAVSMCLYAWVCEVEMHEHGTPTVHFGSMTRWLSHLSDGWVLWLQHPDFKLGVAHPFPDCNMFQPVQPLGQNTTWCRQPWWPSEYHPHVWSEGWSAFIFPFAYHDRIVDLALQLHARRYYCFTGNRFKQLIKVRRVNFGTNLRSYATHSHYFAPHIARCRQDVGDELAKRHIPSELLKASRRGTCVIYDWYHNFADARTMPLFWNRGNMSSCSFCTKRSTMFQMPRMKQSEMFFMFLDSHMEAFCGSCNLVRDWKCLVMPEWKMKASRAAYLQCTVFNPTCTNQQTVCMPDALEPYCSPNAICCRCHQNRKGPAFQEKITRPHNQEAGKMINPNSDYHMHQRPWWQKAKGGDTVFSVERHRLNKHIYKGTAVLEFYSTCIWERGPKCPYEMSSHMVTMNWYVDNCVSSWGIDTSTTDDIIADAHYLTMYHTNAQEECTVQALEVTCSCIAYVVYLVMGLKFINNKYFMCQAVHVLREDQQEENWNAQTHDEDEYNCRSITRMCYFMKIMVAAWCHDFTCQVPKMSWHVMDMHMDWYVCPYNVCIKMEITSHIHVIDLHPLSARKMIRIDGRVPWISYICTDQHQQNEESRHFNDEQLLNRDLYHLCTRLFSVNRGVSGCKGLEFMQLAKIHHKKLCFIMNMEMSHGNNYGDYIIARFHCSPMNLEPRHFQYTYYTDFHIDRRAWYGEHMCCINTVCEVYRNYILNIMKSMHEGGGSCVIGSIYTQLRNSHNAMSYSDFCCEFDGVKYVTYFHWRRTLPICQHKPNMHYSDCHFNDFQGNILKFQPGQQTWGSDKLFGGDFNENRYFEVSVDCNPFDPHCCCMLRYWAYAFMDWFMGGGCGYWCLKVFGGWEHQVSWNTQGMIISKCNMKTRECDYSIVFAIWKHAKFDNNCRCGHPAKQGTNRNWHQCIIQIQKDAHFKYVTLQIATDQSNPREPMRQLKWIHSQINHFHARGQQKTQCHSKELEAQCNYGFCGTKIQYINCRYHLHQYNQEYREYRVHRWPWPDPQVDGGQYRNMIWLCIAVSEGCQCTFFTMDNGYMELQPAIAQRFSQSSGNALCHCCRLVKHHQIEEHHPNDAIVSHSVCGIALDNIDTGQKADQLKKYQGKLLDYWCCMPDKNGYICMSDNAERMNTCPATFKNAWWKPVGEFYRRTLAPTMIHCCFKWQHRPHDHYTERMAQNNVLAGLQRETIQWPQQLMTIENMALNHACVQPFVKTEYFWNFALPLICRQWYMPHQRSNEFGSDKWKRRKWAVNQVGDQICLMFCECCRRPMADMTMGAQHGTVCTAWLAEQPSDMQYRMETFHKGTIGAFLVFWNAREDCNNLARSYHNERKGTYANSTVHMSGRGCCQECDQILAGRAPLKMERRYCWKDHMLNYLKAHHYQIYVMGQGNWFEHKNSPPMKDLSTCLYFEILYGVHGEKPVPRVYEGIKNYAADTGAVASAGDWRKFPQDEIFQKKQYVKLWPCVVMCVPYSENKSAYHEPVTTVSYPDVGFRILRYANRLMMM
PGSMQMNAVQWDAEKCLEMGIQHKTNNGYGVTGIQTPTWDMHQAESENFEMVFYYHHAHNWPQHKLVQGRDREYQWSCRDNAFTQDLFETYKQVQTYTMKITVQCMESSMARCAGFPCTWGRKTSKQKGRWCAHIMKTRMSNASVQRWEMHDWIHTNYYPPWMARVRTYSRPCCREAARYIVFMPEFHVVHCGKGQNYEPMFFHIASWISMETCYVYELNNFTHCSNPYAIVDSVHTFAAFNQAKYKIKLDPSLSPNKILNNELSQLRQNMLCPVLVIPSICYILYHGTDIDVQMQSQWASQWMEGHINTLYRMEFVLGTTTVDHFLLGSQTQILRWWCWFIGELQVIMSLVQMRFVNQGLMSGYYCVQKAKWDTGFPYEIYRGMYNHFAVHKQPSWFAQYNDQNRYLWCETMFCGYIGINSGRDVRGKDHQGWYDFCRATRQDWTKCFNKENNIMSMPDKWKENCCAFGMPCHWRSHNRWIVECKIRYWWHDVEDGFHWLDGEHDTFHTPPLAKVDVYEQNIVRLMHWHGWKCMRPEHYNESFVGTHIILDEPHMNCAFFNSWPLKMFCSATEHKKFTIQLDIENFWQNTVMKGFFQDSVISHCWFSKNKQACPEPHPLVQPPSHLVQQWCWKETMTFHGCTPPRFYISGQENHRCQHNFATASCRDHPECSNHVWMIWTFMHEDMFQVNEVPKEPTVPATTTWIGWISNAVANCRDNTHFAPKFYEECSMVFDRHADRPINLSGGMKWDTLVRMSWHLNQLDIVEQMAKRIIMIDHLHVTMEFLKVDLPNPQYWKEQTFLGKIAPRVAMNLVPDFRTLVRGGEAQNYRFMHGTVMMKISVNIWDLPTKQYFQCWTCEEKKGMLCLNYWIRIFGELGTWVEGANALCKCSMNPFCSWDENFVRMLDRVWTSGQGYNDTNMTNELWCTEDQAPCQCVSKSNKNKKMELENKREVLPNDDKQAWRQDYPGILTHGETYYNGLGWNYPIHNWPIVQYVPSPLIAPDRYWKQNIFFCELIWKGFVHCLDCSEKRTCVASCREYPCYGFAGYAKHDNWIRMGNFTWHAHVFGVKEWYHPTTCFAWVFRCGYQDSCHFHKTNNYLMKQNGLWCYEATTSARSGNIWMGVWLVMWDEKNEGPLFHNRSHHMYLAFWPKEYHQYSEQEFMQGHDKTYSCMNKNKATIMHNRKAWRSWWATGYHSRVRLEKGHKCALVFVRHKSFRQVDLKISYENSKCFLCNPFNSNIYTQGQIKTDVWMEACKVWQYGKGDNEPHHNKFECYWYLFAICGHPDVWQYEDRADYQQSWDQPWYFFKTYQQELYFWTFGVRSPALHQVCDHKQFNQSHTDSSLIIALPLKPWNDNWINYFIDQYDCSNVSYQFMSIVHERFMNIIRYDHIRCLEYSDCVNIKCGNPGPLLHYGFCLSVYRQDADNHKFDDCQPFHCGDHTAQDFWSPMEIHRNQRKSCDMRNVQIAHFYALGCGATYIHMGMFTSREYGEEIADLSISKTIQIGEFVWQQRMSAWLQAGANGWVFCSTKYDRIYHSSTTFMDQFNRFRVEVTCKLRVPITVGALVVLLNIQHVALLLVMHIHWAHHERPGVAYHAFEHDLFRTRAVRQDLPYVWQMQCCCVNPYEFTEAMYANHQTYFFTFATAFQPYPETYMHVQVSWITIGVIGIKVCNCWKGTMFMPPGHPHYDHLTVILYKHLQGFYDFKAQNEAERSHADFKFKVTTCDKGRWSFHWWESMQVDRQRRIYSLSTQYILYHKGGHSLEASKMMIVADMGLPHFIWKGEEGFIDG 2 | 
ECKSVTKSYWWFEKDMRQSFQKGAANITMTEFIDRWAKHNQGYLRDFNMFMIDDAAPNSENILYNDPTTVIDESIHITYDMAASDVWFIDGACAYCMRCCMKYFYEWMYMCIDARKDRDRVVHSGCPNDDNWCPRKDRYQLPKNCCIEEYFMAKMHACCVHQETHSVKGGYMKQGDTFKNECSCDSVDFMWAAYRSQQKVEEDRNILEMIKSRRYPEYHEQDKDHTEIMTVYGTFNNIICVIKQQTDCFRMFKMSRNQAWNGCKKPVEENNVTCNMMRRNTGDMGSVTCICADHEIKCCTHYTDYGHKGIYVSTTGVNEIMQGKVGKQWVANCKNKWWGIHDGSETHRAFKCHCSGGCVALAFPWKDQDPTNGLMLVMVGSKDATLMEKTPHIHPHTCEVQTLMALNKKGGKMHQQIEHCDYKFTKDCWCCKEGRYDKLLLCSLCRAIVVMGYVYFWECGLTTWTLDAWQCKHNCKPILAAQEYNEDKWKCHCQFDPWEFKIDTWMICTPMGYCALHFCTCPWENLMFNLIMCQYRPRIDYFVNPSMLYCVQPTIKSTYENLMDGEAFPQDFTNANRVFLYMNTNHRVHIIVRMSGACKCANSRGMALELERTPQDSQIYSGVSEGQNDTEQIEWATSRYVIEGGEYMDGEDNDYDQVMNDFLRDCIGWGGCFNHCCKVLNREPQGNMKYRYRGKKSLHPGQLWERQACLERFKKVWPDHWWQDDCHFIIGEGCGQSHPLQHPQSKVDAKMLRHWLKYGQIHSKHMHYTFFDDWIAENKCLNREDHVVVHYDASPFQYTGNGMVCATHNCVPVGYKRVFVCSAGNCKMNEQYTPKIYPAVYPCARPHSFWPSISMYDRDTYYMTCGSQGRGYSEQLWFNYYGIRLKQSYHPWMVVNVDVLIEKGVMGRDATKDMIGVSLTTDSKVDRHVPHEVSKCYDAYMELVLIPSRPSMQKMWVQAQKMQQNRSQNKLDVNHEPIHQQFHEVIAVCRFKVIHQETGWMSYYPVTFWNSGAQRNKGKVHELAFAYTADFANVFGVARDQPSSHTLIVGPCKWCQTCTQSAQVSKMCPCRNYGKISFIGDVQRRIMGKQWDMMPKQRLCLVRIVGGRCVGSMGRHQPKEPVQFHAHMEINLNSWKQGPPPRTRNKWVYGFQNYVGNDTDVFDEVQCKENLQQERQNEGIIWYSPAMARMRFYFLAKDDWDKRKYTPVRGHCFPQAGIAVNRQWLEQPLIFNQGAVEFSVKCRQGANYQDGNWTCDQCKVFIARKNEFYNDMVEVFEEDKCYERDPIMDVYLDNTHNKCEPYDNGFYQSKYCENCTDEIVPIFYRHAIYQWQARHCRDWRSNRVKKWIGGGMCCRQTFNYDSESHYIDSINQLSIMTILFHYTAYIWFYWAYHIQGDICMNRGWLPIDPMFWAGMNYPDIDDNPHNNEFYVSHHNNQFFHQRDTSQRFKNGMGGKYRTIMIEFSESSQYRQWECSWEHHTIIYRFCGPSTPCAGVERLRPNEDAAMYWVPYWWIRLMREKQTRPEKPMPLWKMNGITCCSGFEIDMTLIEHCVKQSHYNTQDNCAMMEPNDRWRWPNMIHINVSQYQKPNIPDTAMVMTQDEPWYRTNPLGIWELPMEPLQADVFDEAMRDEIKIKYICFGNGISVAFENQVMPGDHFEPWILTCIFNEQPTQHSGKYANFVTNHYAISIMLWFYWRASDSRIKCTQWICIVWAARSYSCDPCCSVFWISTVLEHYSINTHHVKNFKDVAFHRPDVDQHKIIDDIMASKIFLGRELCPMWMWMEQGGDPGDWPVENWRCFPPRVYTQWETVYQFRCQHQTRNSLGCCHTPMQPWNDTLKYSNTNLACCIAMMNFGTPWSQFCAIFREPSAEVEKPLKRIKSIQRHIATLYMGLGHQVKKSAVDWKMCIKYWDCRAVTDHFSHTVFDSKYNLVEYSMSFGGEVPLIWCVLNQTTNSRPGCWGEGVSWFVGLAHAVLVTRWMQARKQEKWTVMPWSRVRTKS
RQTKAHGRWGNDHRFTPLFDKLKFHRMFMYGNMWGKPFVVAHQDCGHRYEPHCLQKFPPHDRQKRAVQKPVKALQLHMQWIVPTCVQKSGSWERSSHDAFRPRTCTQVAQRYLHLHFRVNWVGLEGGPSNTNQGQALPFYFSDQHGIWMPDAYVPLRTYIECRFIQDRVPCITLYRMRVQWKITDFFPTRYMINWLNQFLDQDACGTHEPVGVMSRFAYHNDKLDILWNGEWMSQCQYRYPLPPQDHQFEFAHWRYNMRCPNGHNTNLDMWIRETKGLPDFSHGHPCIQQGQSGWFWPALNKHRVMPVNHNLCQRPYTYQMGWVQGLCDRPMHWLEGMCKIPYCSTFSMCLISMTPPDYFSMHQAFDVCQTKCMYKRQTAPQYLYKIFDNALQKHGDQWAGMSPFIKGSCDPHIEVRQWLYARMRHKASGRQSDETSSQQQHYQSPPHACPHDKIRDSNYLWNMKRMRKMPHAHIGQRQSRLEIRRWLFPPRCAHYPDDNLRFRQHRPQKHWMTLRTIVCMGFVNKVLFSQLSMAIDCLAWLVIPWYMITLMDLLILRQMPKIRFLMYRAYGKDNLVRYWWKNPERCHFEYGFQSKNVFFVDWSHQNWQYPDTLNPANSLPWHVTWFGVMQCVQTDRRWNNPIHSFQCIAMHKWVEVLCTNYWWFNLEHKYSGRAAGSTCPSAWMKNTNFCPLNMSQFPMEFGINFWEVAHPGSHSPTANVVMVANKPDVRAAFEHWQCLSLQIRDNACILTLHKDTERWNTNRRNRMHARQIIQWHEQFYGVQPAKCNTWCKMTNMGSEKKWFTTCHRRAVVTIECRLALCKGQIYPVHWMLVVSQRGIKGNTCYKEGSNFINSIAIEFWCNQYLHQCTYMWHYVCYFDWQQEDYNSNKSCKSMYKHRWYYYQGNKVTSFYHVQIWTYWPLCKYAAAHMSGIVFVCFGNEPWVCTPLYNLPIRMRTPICHFAVEYAKAVGPKCLWGKFNWARFKQIMCDIMGARKAVNNVSNRSTMCGTIDCFEMEAVINYIDWYNRDPWHIIETSCPQYSQANYMDADFASFPWWYPRPQQWRYKGFLNAICVIESDCGMCNPRCEAFMANYRDCEKFIKGCCKRGNPCRCNQPSNGSSIEIKIETSDRDAVSIFVTRQGGCNQRMKRMGMMMQYNHEHLSPNGWYNEHVRQVYACHPGVLFFFVESRVNNIWAPQKSFELQHQGTLIHMWWLVENLFRSGLKSCAWVLWYDGANFVQVEIWEVQWYKWYVISTQWWYNNIIRPDHCHENWGYHVHHFGPTLNDSVRTLLQACGMQRRMDHWERYKLCHRSKDYNNPRFLQFQNCEDVKMGNFGCSMALTINIARCTYLNHQRRGPFNTRQMHVHDGRHVRMGRPTDCSKPPPRCDCHMCMCPSEMFTALAYFMNRNASIPTGARHFQFDTPDHYCRKDSKNLKVRCNYDAWFFGFEMGKVCCHPFGNRHYSETRTIVQIHTQWGEQVKWMEKWFVAWSYNQLALACVFESLQKILWHANISIKSVQKYTEHIGHGHFKEIVDFCAWFERAHNSMETGGFGQVFSDQRMQISYKVIDCYEPDDEECMPRQWHQACYMMQYQSWVAHIHRFRLTLPYHYCMVVYEFVMSLHQIMHNTKMNDQKSMSLMYTKDPEAVQTETPLHIQQDSDMRRTCWKKYELQGKAKHTNYKGENEDVFWAMGQLGEWERYNTWIGPVMFQDGMCRLNNMVRLWNQWLLAESYEMKLQHNAQYIRGIPPIILKKEDDCWHMFKGGAHMDKAPCRGMGWTRDGLWSDIPHCYHMSNIYCACWDWSIDNTLFNSILRANIMFYLVDWMLNGCANWNMCHGILALWMICAFFFMIEAAIVKPHTEWSCTWRAVYRPAIMRNKYQLGKFNLQGWYHWWTRFRDEHIRHCSCMSVRIQDAVSMCLYASWHDRIVCEVEHHHGTMTRMLSHGGCFCSDGWVLWDQHPDFKLGVAHIMPRGHYDVQPLGQRTTWCRQPFHYRAWWENF
YHPHFFPFAHFSDKKLFHCGVCQHRIVDLALQLHARRYYCFTGNRFKQLIKVEGNRVNFGTNYRSYRTHSHYFHMYYVGICNDIHIGYVPEDYRGVHQRQWVAVWYYLRVGDEPSELLKAGKGPVEGGDEAGHCHTDCVDHPNAEWYDWYKNFADARWRRHNMSSCSFCTKRSTRDWTVIQMGHNERMKQSLDFCGSCWLVRDWKQAHKHKLYQFKVIDLRRCAYLQCTVFNPTDTIQQPVCMQDTAVIHGTTPNAICCRCHQNWKYDSKGTAFQEKITRPHPQEAGKMINPNSDYHMHQRPWGALCRGDQKAKGGDTVFWVTRMLIRGNNKHIYKGCIERGPKCISHMVKLQGPCDSTMNWYVDNCVSSWGIDTSTTDDIIIHHHIRWKVYHTNAQFECTTFPTCSIIAKGVVYLVMGLKFCNNKYFMCQAVHRQEEESCVSWRSIRPGRMRYFHDFPPICIDIKQVPKMSWHGDSYNKWYVCHLTACQYNVCIKMEITHVVWSHIHVIDLHPLSARKMIRIDGSVPSMDQHQQTSRHFNDEQLLNRDLYHLSHKMLYGVEDCMVFEVNHRKIAHKKLCFIMNMEMSHGTHNVYIIARNLEDFTHCSPMNLRMVFRKPRHFAYTDRRAAGQVLIWHNTVCEDRNPKCWQWEQNYILNITGPMHEGGQKAPVRRDMCVYYFKFKWFGGEIIVMIYTILWKRNSHNAQSVNSYSDFCCEFDGGLRQDYHWRRTLPPCQMFYSDCHFNCASNSVFQGAILIFQPGQSMDTDYVWTSGSDKLFGGDPLPTSFPNPCCMLRYWPPGRHRWTYAFMVWFMGGGCIPPFKIHYWCLKMDWIPNTYSFNGWEHQVSWNNMQTRECDLSGQWRTARWKVFAIWHGKFMDMYPYNNCRCGHPAKQGTNRNWPETDCQIQKDAHFKYVIAQIHDTETDQSNPIVNCWSDTFHSQINMHSAMFHARGQDKTAWLENDWTKELVAQCNYGFCAMCDLEKLTKIQKINCRYNQWILCEVCHQYNQLYREYVPMSMHFEPNTPDPQIDYGQYRNMIWLCIAWSEGCQFTMDNGYMELQPAIAQHAAVSESSGNQWHIGIKEVCVEDHHPNDARRVSHSFCRIALDNIDTGNAADQLKKYQGWCGSLDPCEEMMCCMPDKNGYICMSDNAERMNKNWKPVGAKNFYRRFLAWSEEDWHSKMIHCCFKWFIGCDHAGATNERMACNHVLAGLQIETDQRPWENMALNHACTGPFVKTEYFWNMALLLELICRQWYMNWHEMISMHQRSNEFGSDKWFRRHWAVNQVKDQICLMFCECCRRPMHIRRFDMGTVCTAWLNEQPGDTQYRMRTFHKGTIGAFLTFWNAREDCNNLARSYHNERKGTHAVSTVYNVMTLMGEKAHRHKECKQILRLYCWKDHMLNYLKAHHGDTRQIYVMCEDQGDWFEHKNQWCMGVDPPMCDYSTCLYFEILYGVHGEKPVPTGVKEGIKNYTGAVGSCVCNIRGDWRHFWMSQHNWFPQDEIFQEKQYVKTWPCAVMCVPASERSAYHEPVTTVSYPDVGFRILWPWNQTMMPGSMYMMNEWFLVNFDCCNQKLEMGYQHQTNNGYGVTGIQTATQGEDMHQAESENFEMVFYYLHAHNQGRDREYQWKMEEVPGKTCNDNAFLQDHMNVLYMFESMIDADATYKTVQAQRQRKMESSMERCARFPCTLGRHQTSKQKGRWCAHNTWVAKPTTRESNMRAHDWIHTYYYPPWMARVRDYSRPCCREAARYIIYVSPEFHVVHCGKGHSGFNVFNYEPMFFGIASWISMELCHIKTDVYCLNRFTHCSNPYAIVDSVHTFAAFNQAKPKIKLDPSLSPDKILSQVRQNMLCPVTVHPSICYILYHAQNDLTDYDVQMQVQEAFILYMQTICILKRMEFVLHETTVDHFLLGSYTPTELWILRWWCWFREYAFYGELQVIMKMVYMRFVNQLWPWMISEEQKAKWDTGFPYEIYRGMYNHFAVKWHEKQPKWFAQYNDY
VAWCETMFCGYIGINSGRDVRGKDHQGWYDFCKFIEKFWKFNKENRGSYIMSMPDKWKEDFVSSMKFCCAFGMPRPLWIVECKIPYDGFHWLDGEDDTFDTPQWWMNFLCFSWCPIVRLFMRPEHYNMSFVDEPHMFSSWPLKMFDSKKFTIQLDHRHHTKFWFSYVMKGFFQDKNPSHLSQQWVRTWKETMTFHRCTFRFKTKPYYISVQENHRCQHQFATASCRVTDHRMIWTFRHEDMFLPWFEHAFQVREWTDWMYKEPTVPGWANCRDNTHFAPVFFPIHTYYFYEESHGSMVFDRHADVATNVRMSWHLNQMAKRIIMIDHLHPMWEFLEVDLPEQTKLGKIAPRVAFLVPDFRTLVRGGEAQNRFMHGTVMMKISMNSWDLPTKGYFQCWTCEEKKGMLCQELVKVDGTWGANAKGDDCKCSMVPYENFVRGVLDRVWTSGQGYNDTNMSIALDSFKWCSEDQAPCQCVSTNNKNKKMELENKRGVLPNDDKQAWRQDYPGNVLYANQNLTHGETYYELLGWNYPIHNWPIIAPDRYWIPFCTLINDALMCSEKRTASCREYPCYGFAGYAKHPWEEGNWIRMGDYYTLDFNWHAHVFGVKEWEHPTTCKAWVFRCGTPEMETYLTLSSTYMCELWCYEATTSWRSGNIWMSVWCLVMWQAKNEGPLFGNRPHHMYLAFWKKCYSEQEFMQGHDKAHIMHEYCWWATGYHSRVRLEKGLKCAVRHKSFRQECLKISCENWKCFLCNPHNSNIYTQGQIKNDVWMCACKVWQYGKGDNEPHHNKFPPGWYYFAICGHPDVWYEDRADYQFVNTKTLFVPQSLFWEKKYKYQQELYFWTFLFFIIGGPCGDERSRYLLFVYALHKQFNQSLPLKYWANNNDNWINYGIDQNWDCENVSSQFMSIVHERFMNIIRYDHIRCLEMKCSMTKCGGPSRENQFSPLLHYGFCKSVYRQDAHNHKFDDCQPFHYPYKHPDFVIGTPGEWSMWEGIHRNLPWWVVQIASALPLLAPTYIGTSREYGEEIEDLPSISKTIQIGEFVWYPLIESAWLQAGANGWVFCSTKAEVDTFMMLKRCWDSQFNRFRVEVTCKYHLFQMQCLPITVCQGHVEWGYNPNYDHQSPTRDMAALLVMHINFPIQIYYIWTHRPGDAWSWHHAVHHDMFRTRVADVNKDIGPYVWQMQCTEVNPYCFTEAMYGIYWHVRVANHQTDEHENFFQPSPETYMHVQVSWITCLADGCKKGTMFCGAHYIQPPWKHPYYDELTVILYKHLQGFYMFKNEIERSHAPACAGCLIKFKVTTCDKGRWSSHWWESMWVDRQRRIYSLPTQNAYIILYHKVPVFQFPQPMHSGSKMGKLGPIVADVKCMDSLGLPHFCECGEAHWKGEEGFSTG 3 | -------------------------------------------------------------------------------- /week1-2/approximate_match.txt: -------------------------------------------------------------------------------- 1 | TCTCCAGGC 2 | 
TTTAATCTGGCGCCGTCGAGCGAACCCTTTAACAACTGTCGCTTCGATGTACCAGATCTGACATGTTCCCTAAGGTAGGTTTTGGACTGATGAAGACCCAGTGGAAACCGCTCCGAATCGGGTCACTTGATCGTGAGCCTCGGCCGTGAGAAGTGGACCCATCTGGCCCAGGCAGTCAAACTTCACGACAGCCATTCAACGCTAGCTAGGCCCTATCGGCCGGGAGCGGTGAGAGCTGTTGCGTCACTGTGATCGGGCGGCCCTATTTCCGGGCATACGTTGCCGGGCATGCCACCCACTGCTGAGCCTTAGCTCATAACATACTCGAGATCTAGAATGGACGTGACTGCTGGTAAAATTGGGACAGTGGCGGTCAGCTAGCGCGCGCTTTGCAGATTATATGATCAGCAGAAAAACCTTTATAAACGCCCATGGGGGGCGGAGGATTTGATGTGCCCCTTCCTGCCCACCAAGATTATTCTGTACAAAGAGAGGTCTATTAGCAGCACCCGGGAGAATTATTCTAGGGGTATCCAGGCGTTTTAGAGTTGTGCACAACTTCAGGGCTTAATGCTGCCCGTCCTGTGGCCACTGGGACTGTCCCCCTGTTACATTACTTCAGTTCCAATAGGCACAATGCCTATTCTGTTAACTGAACCGCCAATTGTTGCAACCCACAATATGGGCTGTGCAAGATTGATGTATGACTATTTACGTCGGTTGTCGGAGGTCGCCAACCCTTCGGCCATTCTCTACCGGAGGCGAGATTGATGCTTCTCAGATACGCTTGAAATCCCTAATGCCGGCTTTAGGTTTATATGTTTGAAATCTCCCGGGCATGGGAATTGCTACGCTTCCGATCAAACTCAGACGACAGAGCTAAAGGCGCCTTATACGCCTTTAACGGCTCTGGACCTTCCCCTATTGCTAACAGTTACAGCTATGTCATCATGGAGAAATGGTCATGACCAAGGTGTGACCGCCCCCGCCGTTCACGCTCTGCGTATGTTGCAGGAGGGTCAGTCCTCCACTAGCACATTGAGATATGGGTACACAAGTGGTATCAATAACCAAGAGAAGCCCTCCTGGTTCTATCCGGTACACTTTATCCGGTCCTTGAAGTCACGGGGGGCGATCCATTTCGTTAATGACACCTTCCATTTTGTGAGTCAGGTTAGAGCTCACACAGGTGTTCATTTGAGATCTCACTTTAAATCGCCTAGTAGTCCAAGTCGCTTGTGCAACGTGAAATTCTGGTATGCTACTTCGGATTCCCCTAATACGCTTTTGTAGTCGCCCGCGTAAACTGTTCCACGGTCGTTGAGTGAAGTTGGTCAGAGCCAAAGGCCCATTAAGTACAATTGTACGCGTTGGAAAGGGGACGACGATGTTAACCAGGCTGCCATCGTGCGACCTAGCATGAAGGGGCACTTAGATATCATGGAACTCGTTTTCGCCTTCGCTGACAGCCGCAAACCCTGACGACGGTTAAGTCGCCACGTCTACCTATCGCGTGACGGCCGCGGCAGATCCCCGGCCCTAACGTTAGGGGGTAAGGCGTTCGAAGTTCAAACTGGACGTAGCGAGGCGACAATGATGATGGATCTAACCAAACTGAATTGAACCCACCATATGCAGGGTAGAGCCACAGAAGTAGTGCAATCATTGTCGCCCCCTTGGATGTAGCTAGCACTGACTTCTAAAAGGCACGCCATCCGGTAGGGTCCGCCTACCAAATTATCGAGGGGCCTCTGTCCGACCGGATGCAGTTGGCTTCTGCCCGGAGTTAATTCAGATTGTACTAGTAAAGATGTCCTAAGCCACTTGTGGAGGTAGCCGCCACCGCGTAAGGATAGGCGTATGGTGCCGTTAAATTCGCTATAGGAACCGGCAATTGTAGCTGCCCTAACGAGTCTTTAATATTGAATATTTTCTCTTAAGACAGTGCATGCTTCACTTTATTCACCACTACCGAATAACGCCATACCCAAACCTATAGG
AATGAATCCGCTGTAAAGCGGATTAATGTCGTCAATACCTCGGCATTGAGGCTGCACGTTTTCTGCCGTCATATGACCAAGAGATAGTAATTTAGTTGCGTACGATTTGTGTGACACAGCCCTTACTGATAGGCAAGTTTTATCGTGATAACCCGGGCAGGGGGAGACCAAGGTCAGTGAAGAGCAGCTACTCGATGATTCCCAACAAAGTAATTGGATGTTTGAATTTCACATACAGTTGGAATCTCACTGCCTGGTGCTGAGAAAACGTGTGTGGAGTTTTAACCGTGTAAACTAACGAAGTGCGACAGAGCACTTACCGTGGACACCCCCGGGGATAGATGCCGCGGATGTAGAAACATACAGATGTTCCTGTTCTGGTAACTATTCTGATTGCGAGCCCGTGAGGGTAGCACGTGTAGTGAATTCCTTGTGATACTAGTAATGGTATGCATTTCTCTACTCAGTGTACGTCCTAGTATACGTCCTCGAGAGAAGGCGAACGAACTCTTGACACGTTAGAAAGATTTAAATCCCGCCCGATAAGACTAATGTAACACTGCTGCTGGAGACCCACGTCCAAGCAGCCAGTCCATGAATCTAGATCACAGCAGGAATTCTGATATCCACATAGTGTAGCAGCCCATCCGATTCTCTCAGCATTCAACGTTGCCTCGAAAGAACACGTCTTGGCACGTCCAGCGCTTTCGCTACATTCAAACACAGGACACGTGAAGCTTAGCCCGCATTGTATAAGGGGCATACGTACCTTCCGTTAAAACAGTAATCAACCATCTATCGCTGTGATTATGGGGGGCATGCCGTCACGCAGGACGACCCTGCGACAAATCCTCCCACAGCCCAAGGGTGGACTTAGCCTCAATATAAAGCGAGGTTCCGTCGCATATTAGCCTCAATCGGGCCAGAATAACCAGAGCATCGGCCGTGTGTCAGTCTTTGTCCTGAATCTTATTTTTATGCCATTACAAATTGTAGTTCCCGGTCTATTAGCCCCTCACTCAATAATTTACCGGACCTAATTAAAAGGTCGTCGCGAACACTCTCACCGCGTTTAACAGTCGTTTAGTCGCAGTAGGTATAACGAGGTTGTAAAATGGGCACCTTCGTGGCGTTCCTATTAGTGCAATACCGGGGGTAATAACCTCGAGCAACTGCAACCAGTCGCCAAGCTAGAGCCACCGCTCAGACCATGAAAGTTGACGTGATAACTATGGGGTGTCAAATCTGTCTAGGGTGTGCGTTCGTGAGAGGTTGGGAACTGTCTTGCACACCGAAGCCCGCTGCCAGTCTTCTTACGTACCCCACAGGGCAATCGGAAACGGCAGCAATAAGGCAACCAAACCGTTTAGCTAGGTTGGCTTATGTAAGACCGAAGTAATAACGTTCCATACTTACCGTGCCAATACGTGACATCGTACTCAATACTCCGACAAAATCAAATCCTATTCGCACGATATTCAGGGTTTCTAGCCATCGTGGAGGGCGTTGCAGTTCCCTTGGGTTCGTATACGTGAGTACATTGGAGAGGAGTGGAAGATCCAGCCAACACCCGTTACGAGCGTAAGCGACCGATGCTGGTCGAAGACGCCGGCGCCGTCTTGAATCAACATCCTGAATAAACCTAGCGTCTGCACCTGGAGCGCTGCAACCATCAGCAGTCTCGATACCTCGTAATATTCGAGGCCTGAATCTGGCACGAATAGGGGTCGAAGACATCATCGCCATCTCGGCGAGGCCCGGACAAGGTGATATACGGCAACATCTGCAGAGGCTAGCACACTACAAACCTTTAACTAGGGCCATGTACGGAATTCATCTGGGAAGGGGCCTCTGCTACGACACATCTAAGTTAGTAAGGCCTATCGTCTTCATCATTGGCGACCATGGGGGTTCTCGGGTATTAGTTGAGTAATATTAATGAGAAACTCGATATTCTCCAGTTCCTCGAAATATCTCAATTTACTAGAGCTGTATAGCAT
GCACAAGGTGACTCCTATTCCAAATAAGAGCTAGTACGGGCAAACCGTTCGGGTACATCCTCTAAGGATGTGCCTGGACTTAGCCCAGGAACATTTTAGACTCACCGTACATTCCGGTGGACGCTGCCTCCAAGCTCGGCCATACGTACGTCCTTGCATGACGTATAGGCCTCAACAGTCTTACCCCAGTAGACCTGCATTCGCAGCCGGCCGACATGAGTATAGAAAGCCCTTTTTGACGGATCTCGGTATGCACCGCGAGAGAGATTCGGCCTGTGTGCAATATAGCATTAGGTCCCAACAGCCACCTGACGAGAAGTCTCGTACTACTTTTGAACCGTACAGCAGTATGTCACGGGAAAGCTTTCGGTTGAATACTCCGTTTCCTTCTCTATGTATGTGCGGGGATATCTCTAGGGGACTCGCGTTAGGTAAGCTCTCCGTGCAGAGGGTTGGCACCAATGTGTCCAAGTGTGCTGATTGCGTGCCGCGCGCCGGTGGATTGAGTTCGCCGACTTGCCGCGTGACATTCAAACTTGCAAGTACCCACACGGCTCTCGCCACACCTAACGGATTCGCATAGCCCGTGGAATGCCTGACAGGACGGCTGTGGACGTGGGTGCAGAGGTACGACTGCACGGTGGCTTAAGTTGGTCTGGCCTCTCGTGTGTCACGTAGCGGGCCCCACACGGTGTTGTGTAACTGGACCTCGCCGTGGGCGAGGGAACGAAACTCGGTTGAAGCTTTCCGCGCCTTCTTCAATGGTTACGCAGATCAGTGGAGCCGCACTTACGCTAAACACTTTAGGGCCAACAGAACCCATGTCTCACGCTGCCCCGTCCCAGGTAGATACGTCTGTACTCCTCGGCATATCCTTTTACAACTCGTCCCTAGTCAATGTTTGCCAGGCTCTGTCGTCAAAAATTCCACAAGACTTGATCAGACGTCGCCCGTACGGAACCCAAGTTCGTTACTTGGGCGAAACGGGCTCATTTAGATTACATCGTCAGTCCATATTGAATATATCTCGTCAGAAGATAAACAGGCGTAAATTCAACCCTCTGCTAAGTCCGGTGACCGAGCACTCATTCATAAAATGTGGGAGGGCCGTGCTCCCTACGGGCGACAATGTCAGTGCTCTATGTTGCATATCTGCAAGAGTAAGGAGCCCCCGGGCACCACGGATCTTGATTCAATCATATTCCGAGGACAGACCGCGGTCAAGACGCTACGGTCTCTATTTATGCGGCGGAGTTACCCGTTGCTAGCCGGCTCTTTCCTGCCGTCATTTGCACTTGTAACCCAATTCCCCAGCGGCTGATGCGTGACACATGTGCGACCTGGGAAACCTATACTCCCACTCGTTACCTGATTACACGCATGTAGTCGTCTTTAAGTCCAGTGCCTGTCAAATTCTACTTGTCAGCCAGAGTTCTCTTCAGCTTGGTTTAACAATCGTGTGGATCTTTGTGGTTTGCAGGGGTCCTGGTCCATACGAAGTACGTTACCTGCGGATGTGTGCTAGCCACGCTGTGCTTCGCTCCCTTTCAACCATGGGTCCCGCAGGGTCTGCTGGTTACTGAGACTGACCTTCAGCCACCCCACCAACTAGCTGTTACTAGGACGAAATATAGAGGTCACACGCCTTGCCCCCTTGGACATGGTTGAAACTGCGACACTGGGAGTCGGCCAATTTCGCGCCTGACTTTAGAGACTAGAGATATCAAGCAGTCGGCACACAATTTTACCTCCTCGGTTGCTTGAGCCTCGGGATTGCCTAACCTTGCGATCACACTAAGGTCACCGGCTCGCGCCGCTCCGTATTTATAGACCCATGGTGACGGCCGAGCTAGTTACCTCTATATCAACGGGCCTGTCTATTTGGCGTCAACTTATATCCGATAGTGCCCATTCTCTGTAGGTCGGGTTTTACTTCACTTTGGTTATGTGTTATCTGGGTGATTGGGAACGGGAGACGAGGGTAGTTAGCAGGAATGAAG
GAGCGAAGGTGTGACCCATGTGATCAACAAAATCTCTTGTAGTCAGTGTGCCGTAATTGCTATCGCAGCTTAACAGGTGAACCAAGACGAGTAGGGGCATTTTCACTGTCCGTTGGAGGTCGACCATTTCGCCTCGAGTGTCTGTGCCTTCTTTGCGCCGTAGCTACACTCGCTTTGACAATCCCTAATAGATAGGTTTACGGGCAACGGACTTAGTAGCCCGATCCCGTAATCGCTGGACTTCTGTAGATTCTAGGCCGCCCCTCCTTTCTGTTGCCGACAAGCGAGTAGTGCCCATTTAACCAAAATATCTGAATTCCGTAAGCCGTAAGTCGCCTCGGGTCAGGGTGGTAGGAATCCTCTTCTTCTTTTCTTGGTGCCCGGCTCGCGAACTCCGGGCCCTCACGTGAGTCTAACGTACTCACGTTATCGCTACCACTGGGTTAGGCTTGCTGGACGTGCTGAATATACCTGTCTTCACACTCGCGCATCTGGTTGAAACGATGAAACTTAGAAATGTCAAACATGAATACCAGAGCTCAGATATATAGTTGGCGCGCCACTTATGTACGGGCGGTGATTCGCTATTAGCTGACTGCTTTGTTGTTTGCCAGGAGTTTCTTGAAGAATCCGTCTTCTGGCCTACTCCTAAATGGATCATCGAGGTGCGGCGGATTGCTGATCAGAGCTCCCAATCTGCAGATGTGGCGGAAATCGCGCTCACGGCATCGAATTCACACACCGTGGGATTCGGGAAGCTACCCGCTTGTCGCGCCGCTGACGCACCACTATAGATGACTACCAATATAAGCTGAGGTCTTCATTTGTACCTAGATCTCAGGTCCGTCTACGGTGTCATAGGCTAGAGGTTCTTTTAATAACCTTTCGATGCTTCAGCGGCAAGCGATTGCAAAGTCGCGATATTCAATGATTTGTCACACTATGTATAGATCGTTACACAGAAGTCGTTTAGCAACTACTATTAACAATTCGCTAGCCGAGCAGGAGGAAAGCGACGTCGGAAAGTTAAGTGTTAGCAACCCGTATGTACCAGATTGCCTCGCCGGTTTAGCACAAAAGTCTTCTCGGCAAGAGTTGGAGTACGGAATTACTTGTATGCGACCGGGGAGGCTTTTTATTGGCGCCGCCGCCGCGCTTGAAAATGTGCGTCGTCTCTGCACGCTCACATAGGCTCACGCTAACCCCGGACACTAGGCGCCGTCTGCCCAGGAGATGACTGCAATATACGACCAAAGACACCACAAGACGGAGGTATATTCGCATCAGTACACGTGCCTAGCCCTCAAAGTCTATCAGTAACTGGTGGTGAGGTTAGAGCTAAGTTCGCGTAGACGACAATCGATAAAGTTAGTTCTATTGACTAAGGTATTCGCCTCGGCTGTCTGACGCCACTAGATGCCCACCCTCCCGTGGCCAAGCTCTGAGAAGTTGGTGGTATTACCGCCCTTTATTGATCTTGTCCGTGCAATCCTCGGAGTACCGGATAGGGTTTTCATAAACCCCTTGTGACCGTGTCATCGCAACTAGAGCTTGTCCACGAGGTGTAATTTTCCTTGTCATTCCCGCGCTCCTTTTGGGTCAGGGACAACTGGAGATTCCAGATAAGGTTACATTATACGTCAGGTCGCGTCGCCAGTGTTGCCTCCTACTGCGCACCGAACTAACACTGATTCTCGTCGAGCTCCAGCATTCGATCCAATTTGGCCTTGTTGATCGTTTCAAGGGCTAGGCGACAATCGAATGAAGGCTGTCCTCCCCCGCAAGTGGTAACGCCATCGCGCACAGTTTCTATGGAGAACGCGCCCTGCCGACCTCGGAACCCACGACGTATACTCTCTATATTAGTACTAATATATATTCAGCGGGTGTTCGTGTTCCTACTTACTGCAATTTGACGGTTTAAAGCTCGTTCTATTACCTTGGCTTTTGGGTTAGACGCAGCCTCAGTTAATAAGGACCTACAGATACTAGCGGGATGT
AAGGCTCATGTTGCGAGGGCAAGATGAAGGACACGATTAACGCCCCAAGCATCTATCGGAAGTAGTGTGCAATCTCACTGTTTCCTCTATGTGATAGATGATGTTCTCAGTTTTTGCGTCGACTACAAGGAGCCGCAAACGAACATAGCCGCTCTTGGAAGTGATCTCGAGGTTACTGCCAGGCCCGTACATCAAGTTTCAACTAAAAGCGTGACTCCGGCTGGGAACGCTATATCATGGTGTGGTGGTCGGGTATCCAGCCTCTTACTCTGCCGCCCCGACATAGAAAGGTGCGTGGCTCCCTACTTAGTAGCCTAGGGATGCCACCTTTCCCGTTCCTGGGTTGACGGTTCTTGAGGTGTTCGGCCCCTGAAAGCCAATCGGAACAGTCCTGTAGTCCGCACGATGGATAGAGCTATAAGTGACACCGTCTGGTACCGCCTAAGTTGGCGTGGGGACGCCGTAAGCCCCAAGGGTACCTTACCAAAGGATACAACACGTTTAAGCTGACTTGGCAGATCCGACGTCAATAATGATAGCCGCATAGGTGACCAGCGAAGCTCGCCTCAAGTGGTTCACTCTCTTTCGTTGGACAGCGCGTGTATACCTTCGCTGCTACTGCTTGTCCGGGATTTTTGCGGTTACCATGACATTAGGCACAGTAAAGACGGAGGTCTTGATCCAGGTTTCCGTCCGGGATGACCGCACTACCCGTGGATTGGCCGCGTGTGCCGTTCGTGTGTAGACCGGCTGGGGGATTGGGATAAAACTAGTGTTCTCATTACTATGCCTGCCACCGAAAGAATTCGCGGCCCGGAGAAATGCGTGAGTATAATTCCTATCCCTTACATAGGGAATAAGCGGCTGAGGCGGGGTGGGGCCTTGTACCCAAGATTTGTCTCCGGACCGACAGTGGTTAGCTGATATGATGGACGAGTCGCGAGCGCCGTTGGAGATTGATGTTCCATAATACTGGCGGGCCACCACTAGCTAAAACGCACAGTGAGACGCACAATATTTTGATGGGTTGAGGAAAACGACTAGGAGCCCCAGCATGGCTTAGTCGCAACGATCTTGTGTTAAGTTGCGTCTTCTGCCGGTCTTGGAGTCCCTTGCGTCTGGGTCCACTCTAAAATAGCCCTAACGGGTGTCTATCTGGGTACATGTGGGAAACATACTCTCGTAAAGGAGCCTTCTACCATTCTAAACTGGTCATCTCATAAATTAGACATGGGTCCGACAGGTCATGGACGGACTCGGATTCACTATATAACTGCAATTAGGATAAACCCATCTTCCGTTCTCCACTACCGTTCATTCATTGAAGATCTCCGGACCTACTCTGACCAGTACGTCCTGCTAACGTCTGAACCGCGATATGCAAACTTCGCTTGCCGATTACATTGTGACATTTCAATCGTCGACCTAAGGAATGGAATGGTCGGTATACCGAATGTTTGTGCAGTGCTGACCCACTGCTAGTATTAAACTTCAGAGCTTTAATCATAACTAAACGCGCATTCGCCGCTTGCACCGAGGTACTCTCGCAACTTGTGAGATCGAGAAGGCATTCGGCGATACATTTCTCTGATGGTGCCTGATAGGACGATGGGATCCTTTGGTGTAGCGGATAACTCGTGCAAACGTGGGACCTGGCGAGGGTACGGATCGGTCGGGGAAGGGAATACAGTATAATGAAACCCGATCTAAGGACGCGTCACCCGGACCTTCGCATTAAAGATCCCATAAGGCACTGTATGGAGGAACTCTGGGATTCTTTAGATCCGCATTCCGTAGCCAACCCAAGGCCCGAGGTGTGGCGCGGCGATATTTCTATCACGCATACGGCTCCTTCTATACGCACCTACGGCTCCCAACGCCTCACAGACATGATATGACTTTTGCATTAGCGAGGTGCCGCCTCTAACCGGAGTGGTAGTGAAACAGTTAAACCGCCGGTGGCTACCTCAAATAGCAGAGGAATGCGTTCTGCGGGTCCG
ATAACGACCCACCATCGCCCCGAACTTAATCGCTTAAGTAAAGGGTCGTGCCATTGTATAAAGGAGCGACCCCTGATGGTCATGTGTCCTCTGCCCGATCAACCGCGTGTACGTGACCGCAACTAGTCATATGGGACTTTGTTCTTCGCTTCGGATGCGAAGAGGCGCAGAGAATGAGCAAGGTTGCCGCCGACGGCGAATGTGTGATACCGCGGACTCGATCTACCATCGGTCACCCGTCTTCACCTGACATGAGGTATCATGACAGATCTAGAAGCTCGCTCAGGGGGTACCGTTGGACTGATGGGAAGCGGAATTTACTATCGCAAACAGTGGAGCAGAGTGAGCGTTGGTCACAAATGACGGCTACATTACTTCAAAAATGGCCCAGCTTCGAGGGCCACTTTTGTGGGTTAGCGTCTACAGCGAAGTGCCAAGCACGCGGGTTGTGCACAGGAGCAAGCATCAGGGAGATGACATCCGGGTATTAACTTAGTCGGTGTACACGATCTAGGCAGGGTATCCCGGGGAAAAGATCTACTGGGATTACGTTTACCGCAAATAGTGAGATAACTTCTGTCCTCAGTTTTCACTAGCCCACTCCTATGGTCCAGAATCCTTCCGGAGTGAGCTAGTGTCAATGGGTCTCAGACACAACTCGGAGTGCGCCTCCACACGCTCACTTCAGGGAGTGCGGCAGTACGTACAGTAGTCGCGCCGGTGACTTCTTTCATCGATCTTCACCGCAATTAGGCTCCTTGTGTGGGTTTTCAGAGGGCGATACGGGCACGGTGTAAATGTGGATACTTTCACCACCCACATACCTGTTAGTTGCATGTTATAATTATTCCTTCCTCTTTCCGGATCGGGTAGGATACATTCAGCCTCCCCGGCCCTTGGCTACATGAAGTCCACTATTTGCCGTTTCCCCTAACCATGAAGATCTGTAGAGCTCTAAGGAATAACGCCAGGTATTAAATGTGCGGATACCGATCACGCCCGCTATATGATATTCCCTCTTCCCATTCTCTGATTACGGGTGTCCGGATTCCGTCGCCACGCCGCGGTGCGGTATCCCAAATTTAACGTTCCCCGTGTGTCCCAAGAAACGCGTCCCCTCTACGGTCTGTTGGTGTCTCACTCTAACTGTACAAAAGACCTGCATATGGCCCCACGGGTTTGATCTTAAGCGGCCGCCGAGCCTCAAGGGTCCCTTTTTTTCCGGCAGGTAGATCTGCGGCAATACACCTAGCGTGGAGGTATAATTGCCGGTGGTAGAGTTCGGGATGGTAGAAGAAACAAATTCGGCGCTCCAATGCCAGGCAAAACTCCAGGGTACCATAGAACATTATTCACCTTCGGACCCGCGGGAGAACATAAAGGAACGCCGCCCCAATTTCGAGTCTGTTCAACGGGTATGAGATAGAATCTTGACGCAGTCCCGTTTTTGTCGAGTGTGCGCGTGGGGAGGCATTTGCAATTGACAGACACCTCGATTTTTACCCAACGACTTAACGAATATAACAATGGGAGGTAGCCCCATTCAGGAGGAGTTCGGTGATGCGATGGACCTTCCTGGCACTATCCAAGGTGACCACCGTCGGCCAAGCATACTCAGCTTCGACAGTTTGCTAGCGTGACTTCAGTTTGTGACGAAGCTGGCAGTCCCACCAGGCTGTTACCACTCCAAGACTAATGACCCTGCCTATAGTCTGGTGGGGGTCTCCGTTCGGGAGAATTGGAGCGAACTTCTTGAAGGCCTGCGAGTCCCTGGCTACTGGGGCCACGCTTCTAGGAATCCACACTAAGAAAGGTAAGTGTGGTGTACCTACCGACGTTATGCTGCCCTGGTAACTCGGGGACGTTTACGCTGCGACACGTCAACATAAATGGTCAATAGGGGACTTCGCTCTCCCCCGCGCTACACGTCATACATCAGCCGGAGAAAACTTAATTATTATACTGTCTTCACCGTGAAGGAGAACGGACAGGACTGCCCG
GGTTTGAGACTAGGCGCTCTCTAAAGTTGACGTACTACCATGCGTCTGAACCCCATAGACATGGCCCGACCAACATCCGATACAAGGACAGCGAGTTGCGTAAAGACTCAAGAGCGTTACCGGTTATGGGCTCAACATATCTAGCGCGGGGAAATAGTTGCACCTCTCATGCCATATCAAGTCGGCATAAAGGCTCACGTAATTCGTCGACGAGCAACCGGCCCCACCTAGCGAGCACTGTTCCTAGGGTTACCGAACTGTAAGCGCGTAACATAACGTGGTGGTAGCACGTCAATAATCGCAATGAGTTGAGCAGGTGCCGAAACTTGTGGATTGACCTCACATAGTCGCTCGCCTGGTGCTCTGTACGTGATAGATTTCACGCTAGACTTTAACTACGTAGATCGTACTTCTGATACACTCAGGATACCTGCTTGAGCCGGGAAGTCTTATCCGTGGTCGCTCGCTAGCACTTGAACATGAGACAGGGCTAGTGCTCGGAAGTCTTTTTTGATCGAATTTGTAGTGCATATTCATATAGGGTTTTCAGGTGTGTCCAACTTGGAAGCCTTTCTCCATAACTAGGGCAGCTCCGTTTGAGGCAGTTTGCTCGGAGCCGTTTGGTCCCAGGTCAGCGCTTGATTCCCTCTTCCTTACCTAGATTAGAAACGTACGATACGTCCAATCATTGAACTCGAGCCTCTGTATCGGGAGAATAGGTGCATGAGATGCTAGTATCGGCGGGACCTAGCCCTGGTGGACGCTGTATAGCGTCCTGCCAGGGCTCTTTACGATTACCGCAAACGTATACGTGCGACCTGGGAGGTCAACAAGTTGGTCTATAGTATGTTCTAGAGGCTTATATTCAATTCCGATATCTGACAACCTATGAATGCGCTGGCTTCGTAAAAAGCATACACTGATGAGTCTTTTGCTGCGTGAAGAATCCGGGTGGGGTGGAATAATGGCAGGGGTACCCAGTACAGCAGAGTAGACTGTCACCCTCACCCCCAATAACATCGTTACTGCAAGTTCGTCAAAAGAGTTCTAGCGGAGGGAACTGGTGGACTCCAGAGTTCCAAACCGAGGGATATCGAAAGATCAGAATGGCGCGCTCTATCAAGACACGTCTACATCACACCCTTCCATGGTTTGAAATGACGGTGAGAAACGTGGTCTGTTTAACGATGCTTTCGTGGATTGGTATGCGATAGATAATCCATACTGACGTAGGTCCTGGGTACGGCACGAGCGGTAGATTGTGGAGAGACCTTGACGTAGTTGACTCCCGCGCTACTTCAGTGGTTAAGAAAGTATCGATACCTACGTGAACATTGTACACCATAAGATTCCAACAGACAAAAAGAACGAGTACCCGCTTATTCTGATGTGAGTGGACAACTTGTGTCTCTAGCTCGCATGCAAGTAACATTGACGTTCGATTACCGGACGCAAGCTGCGGGTGTTGATCCTGCCACTCATGTCGTCGTGATTCCAATCATCTCCAGTGTGACTTACGGCGCGCAAATGCAACCGTTTGGATTCCTGCGTTCTTCGAGCTTGTGTGGCACTACGAACTTTGCCGAGAACCGTCCATCAGGGGCCAGAATCATTTTGCCGGTCGGTCGAATTACGGCGCCACGCCCACGCTCGACTTAGCTTTTTAACAAGATCCTTCCGAATCTTTCGATGGAAACCAACGAGAGTCCCCACGTCGTTTTTTGATACGATAGCGAGACGGAACACTGAATTAATACGCCGCAATGCTGGAAACGCAAGTTGCCTCGCCGACTTTAGCTAGGCCTTAGAGGGGAGAGGGGCCGACGGCCATCATTGACTACGTATTATCTGGCACCGTCTAGACTTGGCCGCATGGATCGCAAACCTTTCCCACTTACGGTTCAGGCCCTAGGCTGCATAACCTTTCTATTTCCGATATGTTAACGGAACCTGTTAGATAATGCAGCGCACAACCCGCTCACTTCCATTTCCAGCAGGTT
AAATTGGTGAGCCATTTGACTGTTGGAATCCTTGACTAATGAACCACTAGCATCGTAGTTTAACTTTCATGAAAAATCGTCACGCACGCACCAAGTTGGCGTTCCGCTATGGTGAGCGGTATATGTAGAGAAGCATGAACGGCCGTCGTGCTCTCGTCTTCGAGGTGAGAGATGTCGGTAAGACGTTAAAGCCGTAAACGTAAAACGGAGATAGTGCCGGTTGTACAAGCGTAGGACCGGCGGTCGCGACGGACAAACTATCACCGGGGTCTGAGGGGGTGTCTAAGGTATACTCAGATTACGTCCATTCGAACACCTCATTCGGTGCCCTCATAACGCATAAACTCGAACCGTAACCGACCAACCCACCCAACAGAGTCTGTGACTCAGCAGGACACTAAATAAGGTCTCTTTGTGCCAGCTCTGTGTCGTGATTTCTTTACGGACGAGGGTGCAGAAGACGGGAACTGCCGTCACTCCTCAGAATACCCTAAGTGTGGTCAAGTTGCTGGGTACCTCTAGTTACTCACTTTCCCATATTAGTGTGGATCATAAGTGGCCGCTGGAGAGAGGAGAGGGCCTTCCAACCCGTCGCGCCGTCCAATTTTGTCGACATCCAAACAATTTGTGCGCTCGGCAAAAGCGACTGACTGACTAAGGTTTATAATATTGCTGCCGTCCGGGTTTTCCGAGGCCGAGCTGCTTTAGGAGCCGAACGAGTACATGGTGTTACGAGACCTAGTCGGTCCGTGAGTTCTGCACATCCGCAGAGCGGAGAATCTACACCCTCATCTTTTCAGATCTTGTCTCGAACATGTAGCCTATAACCCACGACACCCACAGCAGGGCTCCTCTCCTTTCGGCATCTTCGCACCCCCATACTAACGTCGCCCGTGCTGACTCTTGCTTAGAAACACAGGTGACGTTCGTTTGACCTTAGAGGAGTGGCACTGATAGCCTCACCTTTGCGTGTGTTAGGGCTCAATCGACATGTGATTATACCGCGGTGGTTCTGAACAGTAGCCTGTCAGTGACATCCACTCCTCCCCCAGTCTGGTATTACAAAGGCTCTCAGGCTTATTCAGGCAGTGGTCTACTACCGAACTCACTAGAATGGAACCGGATACATACGACTGTGTTACTGGTGACTCTCAGTGACTTCGCTGCATGGACCCGGAGGATAAAGCTACTGGGTCCCTTGACACACCAACGATGGCGTAAGCGATCCGTAGTCGTGAATTCCCACAGTTAGACCCCCTTAACGACGCAAAGGCTTCAGGGACCACCGGCTGCTAGCGTGAGATTCCCTGATGTTCTAGTTTGAACCGTTTTTTGAGCACTGGCTTCAAAATCAGAAGTAGAGAAACTGACTAGCAGCCCGAGAAACCGATAGTATTACTTTTAGTGAGTGCATTTCCCGGTGATTGGGATTGGCTCAACTATGATTTAGACTTTATCTAGATAGGGGACGGGTAGACGGGTATATCGTGATTAGCGCGTGGAGGAGGACGGGTTCAGCCGTAAGAATGCGCCCGTCGTTCATAGGCCCAGAAGAATCCGTTCGGTACATCGATCACTTGTACATGCTCCCGCACTGGGAAGCTCAAATGCTCGCAAGGTTGAGGAGACTACAGTTGCGACCTTTGCTACGCCACCTAAGTCTGGAAGGCGAAGCGGGTTCTAGAATCCTGCGGTCCGCGCAGAGCCGCGGTTCCTGGAGCCACGATTAATCCCGAGGTACAGGCGTTGACTCTCGATACTCAACATCTTTGATATATTTCAGGCCCAAGGAGGCGGTTTGTAATCCTCAACTCTGTCCACTGCCCGCACGAACAGTATTTCGAGCCTCGCTAGTTTTCGCTTTGAATATGAGTTGCGCGAAGCTCAGCCATGATTGCCGTCACATTTTTCTGACAGACAAGTGTACACAAGATGGTGCTCCTATTCGCCTGTGCGAGCACTTACAGTGCGAGGTATTCTAGTCGCACATGGGTCAGTTC
TGGGGAGCGAACTCGAGAGATCTGTGCAGGGTCTGAGTTGTCGCCGTGGTCTCACAGGAATGTAGGAGTAGGCCCGTAAATACGAATTCTGTGGATAGGGAATCTCACTGCAAGAAGTCGCGCCTCACGGTGGGTTCGGCGTTGTTAAGTTTAGTACAA 3 | 6 --------------------------------------------------------------------------------