├── README.txt ├── chapter1 ├── C1_2 │ ├── 2.py │ ├── stepic_dataset.txt │ └── test.txt ├── C1_3 │ ├── 3_1.py │ ├── 3_2.py │ ├── 3_3.py │ ├── Vibrio_cholerae.txt │ ├── output.3_1.txt │ ├── output.3_2.txt │ ├── output.3_3.txt │ ├── stepic_dataset.3_1.txt │ ├── stepic_dataset.3_2.txt │ ├── test.3_1.txt │ └── test.3_2.txt ├── C1_4 │ ├── .4_3.py.swp │ ├── .4_3b.py.swo │ ├── 4_1.py │ ├── 4_2.py │ ├── E-coli.txt │ ├── Thermotoga-petrophila.txt │ ├── all_9mers.txt │ ├── find_3_kmers.py │ ├── find_4_kmers.py │ ├── find_5_kmers.py │ ├── find_6_kmers.py │ ├── find_9_kmers.py │ ├── find_9a_kmers.py │ ├── find_9b_kmers.py │ ├── find_precalculate_9mers.py │ ├── find_short_kmers.pyc │ ├── output.find_3_kmers.txt │ ├── output.find_4_kmers.txt │ ├── output.find_5_kmers.txt │ ├── output.find_6_kmers.txt │ ├── output.find_9_kmers.txt │ ├── output.find_9b_kmers.txt │ ├── stepic_dataset.4_2.txt │ └── test.4_2.txt ├── C1_7 │ ├── 7_1.py │ ├── 7_2.py │ ├── dataset_7_6.txt │ ├── test.7_1.txt │ └── test.7_2.txt └── C1_8 │ ├── .1g.py.swp │ ├── .8_4.py.swp │ ├── 8_3 │ ├── approximate_pattern_match.py │ ├── dataset_8_3.txt │ ├── output.dataset_8_3.txt │ └── test.8_3.txt │ ├── 8_4 │ ├── 8_4.py │ ├── dataset_8_4.extra.txt │ ├── dataset_8_4.txt │ ├── output.dataset_8_4.extra.txt │ ├── output.dataset_8_4.txt │ ├── output.test.8_4.txt │ ├── test.8_4.txt │ └── test.8_4b.txt │ └── 8_5 │ ├── 8_5.py │ ├── dataset_8_5.txt │ ├── output.dataset_8_5.txt │ ├── output.test.8_5.extra.txt │ ├── output.test.8_5.txt │ ├── test.8_5.extra.txt │ └── test.8_5.txt ├── chapter2 ├── C2_18 │ ├── .peptide_encoding.dna_condon_table.py.swp │ ├── .peptide_encoding.py.swp │ ├── dataset_18_3.txt │ ├── dataset_18_6.txt │ ├── output.txt │ ├── peptide_encoding.py │ ├── protein_translation.py │ ├── test.peptide_encoding.txt │ └── test.protein_translation.txt ├── C2_20 │ ├── .theoretical_spectrum.py.swp │ ├── dataset_20_3.extra.txt │ ├── dataset_20_3.txt │ ├── integer_mass_table.txt │ ├── output.dataset_20_3.txt │ ├── 
output.test.theoretical_spectrum.extra.txt │ ├── output.test.theoretical_spectrum.txt │ ├── test.theoretical_spectrum.extra.txt │ ├── test.theoretical_spectrum.txt │ ├── theoretical_spectrum.py │ └── theoretical_spectrum.py~ ├── C2_22 │ ├── cyclo_peptide_sequencing.py │ ├── dataset_22_4.txt │ ├── integer_mass_table.txt │ ├── leaderboard_data.txt │ ├── output.dataset_22_4.txt │ ├── output.test.cyclo_peptide_sequencing.extra.txt │ ├── output.test.cyclo_peptide_sequencing.txt │ ├── test.cyclo_peptide_sequencing.extra.txt │ └── test.cyclo_peptide_sequencing.txt ├── C2_24 │ ├── dataset_24_4.txt │ ├── leaderboardcyclopeptide_sequencing.py │ ├── test.leaderboard_data.extra.txt │ └── test.leaderboard_data.txt └── C2_26 │ ├── 26_4 │ ├── dataset_26_4.txt │ ├── output.dataset_26_4.txt │ ├── output.test.spectral_convolution.txt │ ├── spectral_convolution.py │ └── test.spectral_convolution.txt │ └── 26_7 │ ├── convolutioncyclopeptidesequencing.py │ ├── dataset_26_7.txt │ ├── spectralconvolution.py │ ├── test.convolutioncyclopeptidesequencing.extra.txt │ └── test.convolutioncyclopeptidesequencing.txt ├── chapter3 ├── C3_36 │ ├── dataset_36_7.txt │ ├── motifenumeration.py │ ├── output.dataset_36_7.txt │ ├── output.test.motifenumeration.extra.txt │ ├── output.test.motifenumeration.txt │ ├── test.motifenumeration.extra.txt │ └── test.motifenumeration.txt ├── C3_38 │ ├── dataset_38_7.txt │ ├── medianstring.py │ ├── test.medianstring.extra.txt │ └── test.medianstring.txt ├── C3_39 │ ├── 39_3 │ │ ├── dataset_39_3.txt │ │ ├── pmpkp.py │ │ ├── test.pmpkp.extra.txt │ │ └── test.pmpkp.txt │ └── 39_5 │ │ ├── dataset_39_5.txt │ │ ├── greedymotifsearch.py │ │ ├── output.dataset_39_5.txt │ │ ├── output.test.greedymotifsearch.extra.txt │ │ ├── output.test.greedymotifsearch.txt │ │ ├── test.greedymotifsearch.extra.output.txt │ │ ├── test.greedymotifsearch.extra.txt │ │ └── test.greedymotifsearch.txt ├── C3_40 │ ├── dataset_40_9.txt │ ├── gmswp.py │ ├── output.dataset_40_9.txt │ ├── 
output.test.gmswp.extra.txt │ ├── output.test.gmswp.txt │ ├── test.gmswp.extra.txt │ └── test.gmswp.txt ├── C3_41 │ ├── .haha.txt.swp │ ├── dataset_41_4.txt │ ├── output.dataset_41_4.txt │ ├── output.test.randomizedmotifsearch.extra.txt │ ├── output.test.randomizedmotifsearch.txt │ ├── randomizedmotifsearch.py │ ├── test.randomizedmotifsearch.extra.answer.txt │ ├── test.randomizedmotifsearch.extra.txt │ └── test.randomizedmotifsearch.txt └── C3_43 │ ├── dataset_43_4.txt │ ├── gibbssampler.py │ ├── output.dataset_43_4.txt │ ├── output.test.gibbssampler.txt │ └── test.gibbssampler.txt ├── chapter4 ├── C4_51 │ ├── dataset_51_3.txt │ ├── output.dataset_51_3.txt │ ├── output.test.string_composition.txt │ ├── string_composition.py │ └── test.string_composition.txt ├── C4_52 │ ├── dataset_52_7.txt │ ├── output.dataset_52_7.txt │ ├── output.test.overlap_graph.txt │ ├── overlap_graph.py │ └── test.overlap_graph.txt ├── C4_53 │ ├── answer.test.de_bruijn_graph.extra.txt │ ├── dataset_53_6.txt │ ├── de_bruijn_graph.py │ ├── haha.txt │ ├── output.dataset_53_6.txt │ ├── output.test.de_bruijn_graph.extra.txt │ ├── output.test.de_bruijn_graph.txt │ ├── test.de_bruijn_graph.extra.txt │ └── test.de_bruijn_graph.txt ├── C4_54 │ ├── answer.test.debruijn_graph_from_kmers.extra.txt │ ├── dataset_54_7.txt │ ├── debruijn_graph_from_kmers.py │ ├── haha.txt │ ├── output.dataset_54_7.txt │ ├── output.test.debruijn_graph_from_kmers.extra.txt │ ├── output.test.debruijn_graph_from_kmers.txt │ ├── test.debruijn_graph_from_kmers.extra.txt │ └── test.debruijn_graph_from_kmers.txt └── C4_57 │ ├── answer.test.eulerian_cycle.extra.txt │ ├── dataset_57_2.txt │ ├── eulerian_cycle.py │ ├── output.dataset_57_2.txt │ ├── output.test.eulerian_cycle.3.txt │ ├── output.test.eulerian_cycle.extra.txt │ ├── output.test.eulerian_cycle.txt │ ├── test.eulerian_cycle.3.txt │ ├── test.eulerian_cycle.extra.txt │ └── test.eulerian_cycle.txt └── chapter5 ├── C5_57 ├── 57_10 │ ├── 
answer.test.k-universal_circular_string.extra.txt │ ├── dataset_57_10.txt │ ├── k-universal_circular_string.py │ ├── output.dataset_57_10.txt │ ├── output.test.k-universal_circular_string.14.txt │ ├── output.test.k-universal_circular_string.4.txt │ └── test.k-universal_circular_string.4.txt ├── 57_5 │ ├── answer.test.eulerian_path.extra.txt │ ├── dataset_57_5.txt │ ├── eulerian_path.py │ ├── output.dataset_57_5.txt │ ├── output.test.eulerian_path.extra.txt │ ├── output.test.eulerian_path.txt │ ├── test.eulerian_path.extra.txt │ └── test.eulerian_path.txt └── 57_6 │ ├── answer.test.string_reconstruction.extra.txt │ ├── dataset_57_6.txt │ ├── output.dataset_57_6.txt │ ├── output.test.string_reconstruction.extra.txt │ ├── output.test.string_reconstruction.txt │ ├── string_reconstruction.py │ ├── test.string_reconstruction.extra.txt │ └── test.string_reconstruction.txt ├── C5_58 ├── answer.test.srfrp.extra.txt ├── dataset_58_14.txt ├── output.dataset_58_14.txt ├── output.test.srfrp.extra.txt ├── output.test.srfrp.txt ├── srfrp.py ├── test.srfrp.extra.txt └── test.srfrp.txt └── C5_59 ├── .contig_generation.py.swp ├── answer.test.contig_generation.extra.txt ├── contig_generation.py ├── contig_generation.py~ ├── debruijn_graph_from_kmers.py ├── debruijn_graph_from_kmers.pyc ├── output.test.contig_generation.txt ├── test.contig_generation.extra.txt ├── test.contig_generation.txt └── well.txt /README.txt: -------------------------------------------------------------------------------- 1 | The directory stores all learning materials from Coursera Bioinformatics Algorithms 1 2 | -------------------------------------------------------------------------------- /chapter1/C1_2/2.py: -------------------------------------------------------------------------------- 1 | ################################################################# 2 | # 3 | # Author: Min Wang (san-heng-yi-shu@163.com) 4 | # 5 | # Date Created: 6 | # 21 Oct 2013 7 | # 8 | # Coursera - Bioinformatics Algorithms 9 | 
# - Hidden Messages in the Replication Origin
#
# Frequent Words Problem: Find the most frequent k-mers in a string.
# Input: A string Text and an integer k.
# Output: All most frequent k-mers in Text.
#
# Example:
#
# Sample Input:
# ACGTTGCATGTCGCATGATGCATGAGAGCT
# 4
#
# Sample Output:
# CATG GCAT
#
####################################################################

import sys


def read_file(filename):
    """Return every line of *filename* as a list of strings (newlines kept)."""
    # The original returned before its f.close() line, so the handle was
    # never closed explicitly; the context manager closes it on every path.
    with open(filename, 'r') as f:
        return f.readlines()


def possible_kmers(seq, n):
    """Return the set of distinct substrings of *seq* that have length *n*.

    The set is empty when *n* is larger than len(seq).
    """
    return {seq[i:i + n] for i in range(len(seq) - n + 1)}


def _count_overlapping(seq, pattern):
    """Count occurrences of *pattern* in *seq*, overlapping ones included."""
    # str.count() skips overlapping hits ("AAAA".count("AAA") == 1), which
    # under-counts self-overlapping k-mers, so step one position at a time.
    count = 0
    start = seq.find(pattern)
    while start != -1:
        count += 1
        start = seq.find(pattern, start + 1)
    return count


def most_frequent_kmers(seq, kmers):
    """Return the members of *kmers* that occur most often in *seq*.

    Occurrences are counted with overlaps.  All k-mers tied for the highest
    count are returned, in the iteration order of *kmers*.  An empty *kmers*
    yields an empty list instead of raising ValueError from max().
    """
    kmers = list(kmers)
    if not kmers:
        return []
    # Count each distinct k-mer once instead of once per pass.
    counts = {kmer: _count_overlapping(seq, kmer) for kmer in set(kmers)}
    maximum = max(counts.values())
    return [kmer for kmer in kmers if counts[kmer] == maximum]


if __name__ == '__main__':

    # Line 1 of the input file is the text, line 2 is k; read the file once.
    lines = read_file(sys.argv[-1])
    seq, n = lines[0].strip(), int(lines[1].strip())
    kmers = possible_kmers(seq, n)
    result = most_frequent_kmers(seq, kmers)
    # Parenthesised single-argument print works on Python 2 and Python 3.
    print(' '.join(result))

# --------------------------------------------------------------------------------
# /chapter1/C1_2/stepic_dataset.txt:
# --------------------------------------------------------------------------------
ACTATTCGCGCGTAGGTTACAGGGAGAATCCCCCAGGTTACAGACTATTCATCCCCCAGGTTACAGGCGCGTAGGAGAGCGCGTAGCGCGTAATCCCCCAACTATTCGGAGAACTATTCGGTTACAGACTATTCACTATTCGCGCGTAGGAGAGGAGAATCCCCCAGGTTACAGGCGCGTAGGAGAGGTTACAGGGAGAATCCCCCAATCCCCCAACTATTCATCCCCCAGGAGAGGTTACAGGCGCGTAGGTTACAGGCGCGTAACTATTCATCCCCCAATCCCCCAGGTTACAGGGAGAATCCCCCAACTATTCATCCCCCAGGTTACAGGGAGAACTATTCGGTTACAGGGTTACAGGGAGAGCGCGTAATCCCCCAGGAGAGCGCGTAGCGCGTAACTATTCATCCCCCAATCCCCCAGGAGAATCCCCCAGGTTACAGACTATTCACTATTCATCCCCCAATCCCCCAGCGCGTAATCCCCCAGCGCGTAACTATTCACTATTCATCCCCCAGGTTACAGGCGCGTAGGAGAATCCCCCAGCGCGTAATCCCCCAACTATTCGGTTACAGGGAGAGGTTACAGACTATTCATCCCCCAGGAGAACTATTCATCCCCCAGGAGAACTATTCGGAGAGGTTACAGACTATTCACTATTCGCGCGTAACTATTCGCGCGTAGGTTACAGGGAGAGGAGAGCGCGTAGGAGAGGTTACAGGGTTACAGGCGCGTAGCGCGTAATCCCCCAGCGCGTAGCGCGTAGGAGAATCCCCCAACTATTCACTATTCGGAGAGCGCGTAGCGCGTAGCGCGTAATCCCCCAGGTTACAGGGAGAGGAGAGGAGAGGTTACAGACTATTCGCGCGTA 2 | 11 3 | -------------------------------------------------------------------------------- /chapter1/C1_2/test.txt: -------------------------------------------------------------------------------- 1 | ACGTTGCATGTCGCATGATGCATGAGAGCT 2 | 4 3 | -------------------------------------------------------------------------------- /chapter1/C1_3/3_1.py: -------------------------------------------------------------------------------- 1 | ################################################################# 2 | # 3 | # Author: Min Wang (san-heng-yi-shu@163.com) 4 | # 5 | # Date Created: 6 | # 21 Oct 2013 7 | # 8 | # Coursera - Bioinformatics Algorithms 9 | # - Some Hidden Messages are More Surprising than Others 10 | # 11 | # Reverse Complement Problem: Reverse complement a nucleotide pattern. 12 | # Input: A DNA string Pattern. 13 | # Output: Pattern, the reverse complement of Pattern. 
#
# Example:
#
# Sample Input:
# AAAACCCGGT
#
# Sample Output:
# ACCGGGTTTT
####################################################################

import sys

# Watson-Crick pairing for the four upper-case DNA bases.
complement = {'A':'T', 'T':'A', 'C':'G', 'G':'C'}


def read_file(filename):
    """Return the first line of *filename* with surrounding whitespace removed."""
    # The original returned before its f.close() line; the context manager
    # guarantees the handle is closed.
    with open(filename, 'r') as f:
        return f.readline().strip()


def reverse_complement(seq):
    """Return the reverse complement of the DNA string *seq*.

    Raises KeyError for any character other than upper-case A, C, G or T.
    """
    # Complementing while walking the string backwards avoids building an
    # intermediate list and a second reversed copy.
    return ''.join(complement[ch] for ch in reversed(seq))


if __name__ == '__main__':

    seq = read_file(sys.argv[-1])

    # 'with' closes the output file even if the write raises.
    with open('./output.3_1.txt', 'w') as fw:
        fw.write(reverse_complement(seq))

# --------------------------------------------------------------------------------
# /chapter1/C1_3/3_2.py:
# --------------------------------------------------------------------------------
#################################################################
#
# Author: Min Wang (san-heng-yi-shu@163.com)
#
# Date Created:
# 21 Oct 2013
#
# Coursera - Bioinformatics Algorithms
# - Some Hidden Messages are More Surprising than Others
#
# Pattern Matching Problem: Find all occurrences of a pattern in a string.
# Input: Two strings, Pattern and Text.
# Output: All starting positions where Pattern appears as a substring of Text.
#
# Example:
#
# Sample Input:
# ATAT
# GATATATGCATATACTT
#
# Sample Output:
# 1 3 9
###################################################################

import sys
import re


def read_file(filename):
    """Return every line of *filename* as a list of strings (newlines kept)."""
    # The original returned before its f.close() line; the context manager
    # guarantees the handle is closed.
    with open(filename, 'r') as f:
        return f.readlines()


def occurrences(pattern, genome):
    """Return the 0-based start positions of *pattern* in *genome*.

    Overlapping occurrences are included.  *pattern* is matched literally:
    regex metacharacters in it are escaped.
    """
    # A zero-width lookahead consumes no text, so the scan advances one
    # character at a time and overlapping matches are reported.
    finder = re.compile(r'(?=(%s))' % re.escape(pattern))
    return [m.start(1) for m in finder.finditer(genome)]


if __name__ == '__main__':

    # Line 1 of the input file is the pattern, line 2 the genome.  Read the
    # file once instead of three times.
    lines = read_file(sys.argv[-1])
    # Echo of the raw input, as in the original; the parenthesised form
    # works on Python 2 and Python 3.
    print(lines)
    pattern, genome = lines[0].strip(), lines[1].strip()
    result = ' '.join(map(str, occurrences(pattern, genome)))

    # 'with' closes the output file even if the write raises.
    with open('./output.3_2.txt', 'w') as fw:
        fw.write(result)

# --------------------------------------------------------------------------------
# /chapter1/C1_3/3_3.py:
# --------------------------------------------------------------------------------
#################################################################
#
# Author: Min Wang (san-heng-yi-shu@163.com)
#
# Date Created:
# 21 Oct 2013
#
# Coursera - Bioinformatics Algorithms
# - Some Hidden Messages are More Surprising than Others
#
# Pattern Matching Problem: Find all occurrences of a pattern in a string.
# Input: Two strings, Pattern and Text.
# Output: All starting positions where Pattern appears as a substring of Text.
14 | # 15 | # Example: 16 | # 17 | # Sample Input: 18 | # ATAT 19 | # GATATATGCATATACTT 20 | # 21 | # Sample Output: 22 | # 1 3 9 23 | ################################################################### 24 | 25 | import sys 26 | import re 27 | 28 | def read_file(filename): 29 | f = open(filename, 'r') 30 | genome = f.readline().strip() 31 | return genome 32 | f.close() 33 | 34 | def occurrences(pattern, genome): 35 | matches = re.finditer(r'(?=(%s))' % re.escape(pattern), genome) 36 | return [m.start(1) for m in matches] 37 | 38 | if __name__ == '__main__': 39 | 40 | argv = sys.argv[-1] 41 | genome = read_file(argv) 42 | result = ' '.join(map(str,occurrences('CTTGATCAT', genome))) 43 | 44 | fw = open('./output.3_3.txt', 'w') 45 | fw.write(result) 46 | fw.close() 47 | -------------------------------------------------------------------------------- /chapter1/C1_3/output.3_2.txt: -------------------------------------------------------------------------------- 1 | 8 22 96 101 114 148 153 158 204 218 223 252 265 284 300 309 314 322 327 333 363 382 424 446 452 487 502 507 513 534 593 606 613 618 632 644 657 702 707 712 738 743 761 772 777 782 799 824 837 870 875 888 911 917 948 955 960 965 979 1001 1025 1037 1049 1054 1059 1074 1095 1115 1134 1139 1160 1178 1183 1188 1193 1198 1205 1210 1221 1226 1284 1296 1301 1317 1344 1349 1354 1359 1364 1380 1441 1457 1470 1507 1518 1549 1572 1584 1601 1631 1636 1658 1667 1691 1699 1734 1750 1755 1777 1803 1808 1818 1837 1853 1858 1864 1869 1875 1880 1887 1892 1906 1921 1926 1938 1946 1966 1980 1999 2023 2029 2045 2069 2135 2154 2181 2200 2214 2240 2245 2278 2311 2327 2351 2356 2385 2390 2430 2435 2463 2479 2525 2530 2572 2601 2606 2631 2636 2641 2672 2687 2692 2701 2706 2734 2745 2757 2777 2798 2846 2851 2856 2867 2879 2914 2960 2986 2991 3005 3039 3057 3068 3073 3084 3099 3104 3114 3119 3157 3168 3173 3205 3211 3229 3260 3269 3291 3296 3316 3321 3335 3351 3368 3405 3417 3422 3444 3459 3471 3494 3499 3506 3512 3538 3607 3633 3661 
3708 3734 3761 3766 3772 3799 3812 3818 3838 3868 3886 3891 3905 3910 3940 3966 3971 3998 4018 4040 4045 4069 4088 4095 4106 4111 4116 4149 4173 4181 4188 4212 4233 4242 4269 4275 4280 4294 4311 4348 4385 4406 4413 4418 4486 4519 4550 4560 4600 4608 4613 4643 4667 4687 4698 4750 4778 4787 4828 4858 4869 4877 4890 4935 4951 4968 4985 4996 5022 5034 5054 5080 5091 5103 5108 5113 5118 5134 5168 5191 5205 5218 5224 5321 5326 5344 5351 5359 5364 5369 5388 5400 5405 5410 5415 5427 5432 5452 5459 5511 5578 5583 5606 5612 5617 5624 5630 5635 5662 5702 5713 5730 5735 5753 5778 5785 5802 5807 5854 5879 5900 5905 5921 5929 5934 5955 5960 5974 6002 6046 6058 6079 6091 6104 6124 6129 6140 6147 6152 6157 6179 6201 6221 6252 6257 6268 6279 6284 6310 6372 6379 6397 6404 6413 6418 6460 6523 6528 6535 6563 6574 6580 6600 6605 6610 6623 6628 6634 6641 6646 6657 6685 6703 6716 6759 6806 6811 6816 6823 6841 6868 6906 6930 6941 6948 6966 6971 6982 7023 7037 7062 7067 7119 7125 7131 7154 7180 7239 7244 7296 7301 7310 7332 7380 7399 7423 7448 7491 7525 7532 7540 7552 7562 7579 7590 7595 7604 7617 7646 7651 7719 7751 7815 7844 7849 7889 7902 7942 7957 7989 7994 7999 8004 8009 8122 8140 8171 8206 8211 8218 8229 8263 8274 8287 8310 8315 8321 8326 8351 8363 8368 8382 8387 8394 8409 8416 8455 8467 8502 8543 8548 8553 8588 8593 8598 8624 8695 8744 8781 8794 8799 8810 8815 8826 8831 8859 8868 -------------------------------------------------------------------------------- /chapter1/C1_3/output.3_3.txt: -------------------------------------------------------------------------------- 1 | 60039 98409 129189 152283 152354 152411 163207 197028 200160 357976 376771 392723 532935 600085 622755 1065555 -------------------------------------------------------------------------------- /chapter1/C1_3/test.3_1.txt: -------------------------------------------------------------------------------- 1 | AAAACCCGGT 2 | -------------------------------------------------------------------------------- 
/chapter1/C1_3/test.3_2.txt: -------------------------------------------------------------------------------- 1 | ATAT 2 | GATATATGCATATACTT 3 | -------------------------------------------------------------------------------- /chapter1/C1_4/.4_3.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minw2828/Coursera---Bioinformatics-Algorithms/9a51f7ca1fa9ab5fd246dc971648ebe0acf9b308/chapter1/C1_4/.4_3.py.swp -------------------------------------------------------------------------------- /chapter1/C1_4/.4_3b.py.swo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minw2828/Coursera---Bioinformatics-Algorithms/9a51f7ca1fa9ab5fd246dc971648ebe0acf9b308/chapter1/C1_4/.4_3b.py.swo -------------------------------------------------------------------------------- /chapter1/C1_4/4_1.py: -------------------------------------------------------------------------------- 1 | ################################################################# 2 | # 3 | # Author: Min Wang (san-heng-yi-shu@163.com) 4 | # 5 | # Date Created: 6 | # 21 Oct 2013 7 | # 8 | # Coursera - Bioinformatics Algorithms 9 | # - An Explosion of Hidden Messages 10 | # 11 | # Pattern Matching Problem: 12 | # Find all occurrences of a pattern in a string. 13 | # Input: Two strings, Pattern and Text. 14 | # Output: All starting positions where Pattern appears as a 15 | # substring of Text. 
#
# Example:
#
# Sample Input:
# ATAT
# GATATATGCATATACTT
#
# Sample Output:
# 1 3 9
#
#################################################################

import sys
import re


def read_file(filename):
    """Return the first line of *filename* with surrounding whitespace removed."""
    # The original returned before its f.close() line; the context manager
    # guarantees the handle is closed.
    with open(filename, 'r') as f:
        return f.readline().strip()


def occurrences(pattern, genome):
    """Return the 0-based start positions of *pattern* in *genome*.

    Overlapping occurrences are included.  *pattern* is matched literally:
    regex metacharacters in it are escaped.
    """
    # A zero-width lookahead consumes no text, so the scan advances one
    # character at a time and overlapping matches are reported.
    finder = re.compile(r'(?=(%s))' % re.escape(pattern))
    return [m.start(1) for m in finder.finditer(genome)]


if __name__ == '__main__':

    # Usage: <script> <genome file> <pattern>
    filename, pattern = sys.argv[-2], sys.argv[-1]
    genome = read_file(filename)
    result = ' '.join(map(str, occurrences(pattern, genome)))

    # The pattern is part of the output file name; 'with' closes the file
    # even if the write raises.
    with open('./output.4_1.'+pattern+'.txt', 'w') as fw:
        fw.write(result)

# --------------------------------------------------------------------------------
# /chapter1/C1_4/4_2.py:
# --------------------------------------------------------------------------------
##############################################################################
#
# Author: Min Wang (san-heng-yi-shu@163.com)
#
# Date Created:
# 21 Oct 2013
#
# Coursera - Bioinformatics Algorithms
# - An Explosion of Hidden Messages
#
# Clump Finding Problem: Find patterns forming clumps in a string.
# Input: A string Genome, and integers k, L, and t.
# Output: All distinct k-mers forming (L, t)-clumps in Genome.
#
# Example:
#
# Sample Input:
# CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA
# 5 50 4
#
# Sample Output:
# CGACA GAAGA
#
#################################################################################

import sys


def read_file(filename):
    """Return every line of *filename* as a list of strings (newlines kept)."""
    # The original returned before its f.close() line; the context manager
    # guarantees the handle is closed.
    with open(filename, 'r') as f:
        return f.readlines()


def seq_window(seq, L):
    """Return the set of distinct length-L substrings (windows) of *seq*.

    Empty when *seq* is shorter than L.
    """
    return {seq[i:i + L] for i in range(len(seq) - L + 1)}


def possible_kmers(seq, k):
    """Return the set of distinct length-k substrings of *seq*."""
    return {seq[i:i + k] for i in range(len(seq) - k + 1)}


def _count_overlapping(seq, pattern):
    """Count occurrences of *pattern* in *seq*, overlapping ones included."""
    # str.count() skips overlapping hits ("AAAA".count("AAA") == 1), which
    # under-counts self-overlapping k-mers, so step one position at a time.
    count = 0
    start = seq.find(pattern)
    while start != -1:
        count += 1
        start = seq.find(pattern, start + 1)
    return count


def clump(seq, kmers, t):
    """Return the members of *kmers* occurring at least *t* times in *seq*.

    Occurrences are counted with overlaps.
    """
    return [item for item in kmers if _count_overlapping(seq, item) >= t]


def result(seq, k, L, t):
    """Find k-mers forming (L, t)-clumps in *seq*.

    Returns a list with one entry per length-L window that contains a
    clump; each entry is the space-joined k-mers found in that window.
    The same k-mer can therefore appear in several entries, and callers
    de-duplicate.  Returns [] when *seq* is shorter than L.
    """
    kmer = []
    for window in seq_window(seq, L):
        # Evaluate clump() once per window; the original computed it twice.
        found = clump(window, possible_kmers(window, k), t)
        if found:
            kmer.append(' '.join(found))
    return kmer


if __name__ == '__main__':

    import os

    filename = sys.argv[-1]
    data = read_file(filename)
    # Line 1 is the genome, line 2 holds "k L t".
    seq = data[0].strip()
    k, L, t = [int(value) for value in data[1].split()[:3]]

    # Flatten the per-window strings into the set of distinct k-mers.
    final_result = set(' '.join(result(seq, k, L, t)).split())

    # Use only the base name so an input path with directories still
    # produces a valid output file in the current directory.
    with open('./output.' + os.path.basename(filename), 'w') as fw:
        fw.write(' '.join(final_result))

# --------------------------------------------------------------------------------
# /chapter1/C1_4/find_3_kmers.py:
# --------------------------------------------------------------------------------
#################################################################
#
# Author: Min Wang (san-heng-yi-shu@163.com)
#
# Date Created:
# 21 Oct 2013
#
#
Coursera - Bioinformatics Algorithms 9 | # - An Explosion of Hidden Messages 10 | # 11 | # Clump Finding Problem: Find patterns forming clumps in a string. 12 | # Input: A string Genome, and integers k, L, and t. 13 | # Output: All distinct k-mers forming (L, t)-clumps in Genome. 14 | # 15 | # Example: 16 | # 17 | # Sample Input: 18 | # CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA 19 | # 5 50 4 20 | # 21 | # Sample Output: 22 | # CGACA GAAGA 23 | # 24 | #################################################################### 25 | 26 | import sys 27 | import re 28 | import itertools 29 | 30 | def read_file(filename): 31 | f = open(filename, 'r') 32 | data = f.read() 33 | return data 34 | f.close() 35 | 36 | def generate_bases(k): 37 | base = 'ACTG' 38 | i = 1 39 | bases = '' 40 | while len(bases) < k: 41 | bases = base*i 42 | i += 1 43 | return bases 44 | 45 | def possible_kmers(k): 46 | bases = generate_bases(k) 47 | possible = set() 48 | for p in itertools.permutations(bases,k): 49 | possible.add(''.join(p)) 50 | return possible 51 | 52 | def frequency(seq,kmer,t): 53 | if seq.count(kmer) < t: 54 | return False 55 | return True 56 | 57 | def filter_kmer(raw_kmers): 58 | fil_kmers = set() 59 | for kmer in raw_kmers: 60 | if frequency(seq,kmer,t) == True: 61 | fil_kmers.add(kmer) 62 | return fil_kmers 63 | 64 | def kmer_position(seq,fil_kmer,t,L): 65 | positions = [m.start() for m in re.finditer(fil_kmer,seq)] 66 | if positions[t-1] - positions[0] < L: 67 | return True 68 | return False 69 | 70 | def result(fil_kmers): 71 | result = set() 72 | for kmer in fil_kmers: 73 | if kmer_position(seq,kmer,t,L) == True: 74 | result.add(kmer) 75 | return result 76 | 77 | if __name__ == '__main__': 78 | 79 | filename = sys.argv[-1] 80 | seq = read_file(filename).strip() 81 | k = 3 82 | L = 500 83 | t = 3 84 | # k,L,t = map(int,data[1].strip().split(' ')) 85 | raw_kmers = possible_kmers(k) 86 | fil_kmers = filter_kmer(raw_kmers) 87 | final_result = 
result(fil_kmers) 88 | 89 | fw = open('./output.'+sys.argv[-2][:-3]+'.txt','w') 90 | fw.write(' '.join(final_result)) 91 | fw.close() 92 | -------------------------------------------------------------------------------- /chapter1/C1_4/find_4_kmers.py: -------------------------------------------------------------------------------- 1 | ################################################################# 2 | # 3 | # Author: Min Wang (san-heng-yi-shu@163.com) 4 | # 5 | # Date Created: 6 | # 21 Oct 2013 7 | # 8 | # Coursera - Bioinformatics Algorithms 9 | # - An Explosion of Hidden Messages 10 | # 11 | # Clump Finding Problem: Find patterns forming clumps in a string. 12 | # Input: A string Genome, and integers k, L, and t. 13 | # Output: All distinct k-mers forming (L, t)-clumps in Genome. 14 | # 15 | # Example: 16 | # 17 | # Sample Input: 18 | # CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA 19 | # 5 50 4 20 | # 21 | # Sample Output: 22 | # CGACA GAAGA 23 | # 24 | #################################################################### 25 | 26 | import sys 27 | import re 28 | import itertools 29 | 30 | def read_file(filename): 31 | f = open(filename, 'r') 32 | data = f.read() 33 | return data 34 | f.close() 35 | 36 | def generate_bases(k): 37 | base = 'ACTG' 38 | i = 1 39 | bases = '' 40 | while len(bases) < k: 41 | bases = base*i 42 | i += 1 43 | return bases 44 | 45 | def possible_kmers(k): 46 | bases = generate_bases(k) 47 | possible = set() 48 | for p in itertools.permutations(bases,k): 49 | possible.add(''.join(p)) 50 | return possible 51 | 52 | def frequency(seq,kmer,t): 53 | if seq.count(kmer) < t: 54 | return False 55 | return True 56 | 57 | def filter_kmer(raw_kmers): 58 | fil_kmers = set() 59 | for kmer in raw_kmers: 60 | if frequency(seq,kmer,t) == True: 61 | fil_kmers.add(kmer) 62 | return fil_kmers 63 | 64 | def kmer_position(seq,fil_kmer,t,L): 65 | positions = [m.start() for m in re.finditer(fil_kmer,seq)] 66 | if positions[t-1] - 
positions[0] < L: 67 | return True 68 | return False 69 | 70 | def result(fil_kmers): 71 | result = set() 72 | for kmer in fil_kmers: 73 | if kmer_position(seq,kmer,t,L) == True: 74 | result.add(kmer) 75 | return result 76 | 77 | if __name__ == '__main__': 78 | 79 | filename = sys.argv[-1] 80 | seq = read_file(filename).strip() 81 | k = 4 82 | L = 500 83 | t = 3 84 | # k,L,t = map(int,data[1].strip().split(' ')) 85 | raw_kmers = possible_kmers(k) 86 | fil_kmers = filter_kmer(raw_kmers) 87 | final_result = result(fil_kmers) 88 | 89 | fw = open('./output.'+sys.argv[-2][:-3]+'.txt','w') 90 | fw.write(' '.join(final_result)) 91 | fw.close() 92 | -------------------------------------------------------------------------------- /chapter1/C1_4/find_5_kmers.py: -------------------------------------------------------------------------------- 1 | ################################################################# 2 | # 3 | # Author: Min Wang (san-heng-yi-shu@163.com) 4 | # 5 | # Date Created: 6 | # 21 Oct 2013 7 | # 8 | # Coursera - Bioinformatics Algorithms 9 | # - An Explosion of Hidden Messages 10 | # 11 | # Clump Finding Problem: Find patterns forming clumps in a string. 12 | # Input: A string Genome, and integers k, L, and t. 13 | # Output: All distinct k-mers forming (L, t)-clumps in Genome. 
#
# Example:
#
# Sample Input:
# CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA
# 5 50 4
#
# Sample Output:
# CGACA GAAGA
#
####################################################################

import sys
import re
import itertools


def read_file(filename):
    """Return the entire contents of *filename* as one string."""
    # The original returned before its f.close() line; the context manager
    # guarantees the handle is closed.
    with open(filename, 'r') as f:
        return f.read()


def generate_bases(k):
    """Return 'ACTG' repeated just often enough to reach length *k*.

    Returns '' for k <= 0.  Kept for backward compatibility only:
    possible_kmers() no longer needs it.
    """
    base = 'ACTG'
    i = 1
    bases = ''
    while len(bases) < k:
        bases = base*i
        i += 1
    return bases


def possible_kmers(k):
    """Return the set of all 4**k DNA strings of length *k*.

    The original drew permutations from 'ACTG' repeated ceil(k/4) times,
    which can never use one base more often than that repeat count, so
    k-mers such as 'AAAAA' were silently missing.  The Cartesian product
    enumerates every k-mer.
    """
    return {''.join(p) for p in itertools.product('ACGT', repeat=k)}


def _occurrence_starts(seq, kmer):
    """Return every start index of *kmer* in *seq*, overlaps included."""
    # str.count() and a plain re.finditer() both skip overlapping hits.
    starts = []
    pos = seq.find(kmer)
    while pos != -1:
        starts.append(pos)
        pos = seq.find(kmer, pos + 1)
    return starts


def frequency(seq, kmer, t):
    """Return True when *kmer* occurs at least *t* times in *seq*."""
    return len(_occurrence_starts(seq, kmer)) >= t


def filter_kmer(raw_kmers, seq=None, t=None):
    """Return the subset of *raw_kmers* occurring at least *t* times in *seq*.

    *seq* and *t* default to the module-level globals of the same name, as
    set by the script entry point, so the original one-argument call still
    works.
    """
    if seq is None:
        seq = globals()['seq']
    if t is None:
        t = globals()['t']
    return {kmer for kmer in raw_kmers if frequency(seq, kmer, t)}


def kmer_position(seq, fil_kmer, t, L):
    """Return True when *fil_kmer* forms an (L, t)-clump in *seq*.

    That is, some stretch of *seq* of length L fully contains at least *t*
    occurrences (t >= 1 expected).  Every run of t consecutive occurrences
    is tested, not just the first one, and fewer than t occurrences gives
    False instead of IndexError.
    """
    positions = _occurrence_starts(seq, fil_kmer)
    width = len(fil_kmer)
    # Pair each occurrence with the one t-1 hits later; the last hit must
    # END inside the window, hence the + width.
    for first, last in zip(positions, positions[t - 1:]):
        if last + width - first <= L:
            return True
    return False


def result(fil_kmers, seq=None, t=None, L=None):
    """Return the subset of *fil_kmers* that form an (L, t)-clump in *seq*.

    *seq*, *t* and *L* default to the module-level globals of the same
    name, so the original one-argument call still works.
    """
    if seq is None:
        seq = globals()['seq']
    if t is None:
        t = globals()['t']
    if L is None:
        L = globals()['L']
    return {kmer for kmer in fil_kmers if kmer_position(seq, kmer, t, L)}


if __name__ == '__main__':

    filename = sys.argv[-1]
    seq = read_file(filename).strip()
    # Search parameters are fixed in this script rather than read from the
    # input file.
    k = 5
    L = 500
    t = 3
    raw_kmers = possible_kmers(k)
    fil_kmers = filter_kmer(raw_kmers, seq, t)
    final_result = result(fil_kmers, seq, t, L)

    # sys.argv[-2] is the script name when run as
    # "python find_5_kmers.py <genome>"; [:-3] strips ".py".
    with open('./output.'+sys.argv[-2][:-3]+'.txt','w') as fw:
        fw.write(' '.join(final_result))

# --------------------------------------------------------------------------------
# /chapter1/C1_4/find_6_kmers.py:
# --------------------------------------------------------------------------------
#################################################################
#
# Author: Min Wang (san-heng-yi-shu@163.com)
#
# Date Created:
# 21 Oct 2013
#
# Coursera - Bioinformatics Algorithms
# - An Explosion of Hidden Messages
#
# Clump Finding Problem: Find patterns forming clumps in a string.
# Input: A string Genome, and integers k, L, and t.
# Output: All distinct k-mers forming (L, t)-clumps in Genome.
#
# Example:
#
# Sample Input:
# CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA
# 5 50 4
#
# Sample Output:
# CGACA GAAGA
#
####################################################################

import sys
import re
import itertools


def read_file(filename):
    """Return the entire contents of *filename* as one string."""
    # The original returned before its f.close() line; the context manager
    # guarantees the handle is closed.
    with open(filename, 'r') as f:
        return f.read()


def generate_bases(k):
    """Return 'ACTG' repeated just often enough to reach length *k*.

    Returns '' for k <= 0.  Kept for backward compatibility only:
    possible_kmers() no longer needs it.
    """
    base = 'ACTG'
    i = 1
    bases = ''
    while len(bases) < k:
        bases = base*i
        i += 1
    return bases


def possible_kmers(k):
    """Return the set of all 4**k DNA strings of length *k*.

    The original drew permutations from 'ACTG' repeated ceil(k/4) times,
    which can never use one base more often than that repeat count, so
    k-mers such as 'AAAAAA' were silently missing.  The Cartesian product
    enumerates every k-mer.
    """
    return {''.join(p) for p in itertools.product('ACGT', repeat=k)}


def _occurrence_starts(seq, kmer):
    """Return every start index of *kmer* in *seq*, overlaps included."""
    # str.count() and a plain re.finditer() both skip overlapping hits.
    starts = []
    pos = seq.find(kmer)
    while pos != -1:
        starts.append(pos)
        pos = seq.find(kmer, pos + 1)
    return starts


def frequency(seq, kmer, t):
    """Return True when *kmer* occurs at least *t* times in *seq*."""
    return len(_occurrence_starts(seq, kmer)) >= t


def filter_kmer(raw_kmers, seq=None, t=None):
    """Return the subset of *raw_kmers* occurring at least *t* times in *seq*.

    *seq* and *t* default to the module-level globals of the same name, as
    set by the script entry point, so the original one-argument call still
    works.
    """
    if seq is None:
        seq = globals()['seq']
    if t is None:
        t = globals()['t']
    return {kmer for kmer in raw_kmers if frequency(seq, kmer, t)}


def kmer_position(seq, fil_kmer, t, L):
    """Return True when *fil_kmer* forms an (L, t)-clump in *seq*.

    That is, some stretch of *seq* of length L fully contains at least *t*
    occurrences (t >= 1 expected).  Every run of t consecutive occurrences
    is tested, not just the first one, and fewer than t occurrences gives
    False instead of IndexError.
    """
    positions = _occurrence_starts(seq, fil_kmer)
    width = len(fil_kmer)
    # Pair each occurrence with the one t-1 hits later; the last hit must
    # END inside the window, hence the + width.
    for first, last in zip(positions, positions[t - 1:]):
        if last + width - first <= L:
            return True
    return False


def result(fil_kmers, seq=None, t=None, L=None):
    """Return the subset of *fil_kmers* that form an (L, t)-clump in *seq*.

    *seq*, *t* and *L* default to the module-level globals of the same
    name, so the original one-argument call still works.
    """
    if seq is None:
        seq = globals()['seq']
    if t is None:
        t = globals()['t']
    if L is None:
        L = globals()['L']
    return {kmer for kmer in fil_kmers if kmer_position(seq, kmer, t, L)}


if __name__ == '__main__':

    filename = sys.argv[-1]
    seq = read_file(filename).strip()
    # Search parameters are fixed in this script rather than read from the
    # input file.
    k = 6
    L = 500
    t = 3
    raw_kmers = possible_kmers(k)
    fil_kmers = filter_kmer(raw_kmers, seq, t)
    final_result = result(fil_kmers, seq, t, L)

    # sys.argv[-2] is the script name when run as
    # "python find_6_kmers.py <genome>"; [:-3] strips ".py".
    with open('./output.'+sys.argv[-2][:-3]+'.txt','w') as fw:
        fw.write(' '.join(final_result))

# --------------------------------------------------------------------------------
# /chapter1/C1_4/find_9_kmers.py:
# --------------------------------------------------------------------------------
#################################################################
#
# Author: Min Wang (san-heng-yi-shu@163.com)
#
# Date Created:
# 21 Oct 2013
#
# Coursera - Bioinformatics Algorithms
# - An Explosion of Hidden Messages
#
# Clump Finding Problem: Find patterns forming clumps in a string.
# Input: A string Genome, and integers k, L, and t.
# Output: All distinct k-mers forming (L, t)-clumps in Genome.
#
# Example:
#
# Sample Input:
#     CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA
#     5 50 4
#
# Sample Output:
#     CGACA GAAGA
#
####################################################################

import sys
import re
import itertools


def read_file(filename):
    """Return the full contents of *filename* as one string."""
    # The context manager closes the handle; the previous version called
    # close() after `return`, so that call never ran.
    with open(filename, 'r') as f:
        return f.read()


def possible_kmers(base1, base2, k, length=9):
    """Join shorter k-mers into candidate strings of a fixed total length.

    *base1* and *base2* are lists of strings.  Every ordered selection of
    *k* distinct entries of ``base1 + base2`` is concatenated, and the
    concatenations that are exactly *length* characters long are returned
    as a set.  *length* defaults to 9, the value that was hard-coded before.
    """
    bases = base1 + base2
    possible = set()
    for parts in itertools.permutations(bases, k):
        candidate = ''.join(parts)
        if len(candidate) == length:
            possible.add(candidate)
    return possible


def _occurrences(seq, kmer):
    """Return the start index of every occurrence of *kmer* in *seq*.

    A zero-width lookahead is used so that overlapping occurrences are all
    reported; *kmer* is escaped and therefore matched literally.
    """
    pattern = '(?=' + re.escape(kmer) + ')'
    return [m.start() for m in re.finditer(pattern, seq)]


def frequency(seq, kmer, t):
    """Return True when *kmer* occurs at least *t* times in *seq*.

    Overlapping occurrences count ('AAAAA' occurs 3 times in 'AAAAAAA').
    """
    return len(_occurrences(seq, kmer)) >= t


def filter_kmer(raw_kmers, seq=None, t=None):
    """Return the subset of *raw_kmers* occurring at least *t* times in *seq*.

    *seq* and *t* default to the module-level variables of the same name,
    which is how the script section below calls this function.
    """
    if seq is None:
        seq = globals()['seq']
    if t is None:
        t = globals()['t']
    return set(kmer for kmer in raw_kmers if frequency(seq, kmer, t))


def kmer_position(seq, fil_kmer, t, L):
    """Return True when *fil_kmer* forms an (L, t)-clump in *seq*.

    Some run of *t* consecutive occurrences must fit completely inside a
    window of length *L*: the end of the last occurrence minus the start of
    the first one may not exceed *L*.  Every run is tested, not only the
    first *t* occurrences.
    """
    positions = _occurrences(seq, fil_kmer)
    k = len(fil_kmer)
    # Pair each occurrence with the one t-1 places after it.
    for first, last in zip(positions, positions[t - 1:]):
        if last + k - first <= L:
            return True
    return False


def result(fil_kmers, seq=None, t=None, L=None):
    """Return the k-mers in *fil_kmers* that form an (L, t)-clump in *seq*.

    *seq*, *t* and *L* default to the module-level variables of the same
    name, which is how the script section below calls this function.
    """
    if seq is None:
        seq = globals()['seq']
    if t is None:
        t = globals()['t']
    if L is None:
        L = globals()['L']
    return set(kmer for kmer in fil_kmers if kmer_position(seq, kmer, t, L))


if __name__ == '__main__':

    # Arguments, from the end: sequence file, then two files that each hold
    # space-separated shorter k-mers.
    seq_filename = sys.argv[-3]
    seq = read_file(seq_filename).strip()
    base1_filename = sys.argv[-2]
    base1 = read_file(base1_filename).strip().split(' ')
    base2_filename = sys.argv[-1]
    base2 = read_file(base2_filename).strip().split(' ')
    # Here k is the number of pieces joined per candidate, not a k-mer length.
    k = 2
    L = 500
    t = 3
    raw_kmers = possible_kmers(base1, base2, k)
    fil_kmers = filter_kmer(raw_kmers)
    final_result = result(fil_kmers)
    # NOTE(review): this variant computes final_result but does not write it
    # anywhere; the file-writing code was already disabled in the original.
#################################################################
#
# Author: Min Wang (san-heng-yi-shu@163.com)
#
# Date Created:
#     21 Oct 2013
#
# Coursera - Bioinformatics Algorithms
#     - An Explosion of Hidden Messages
#
# Clump Finding Problem: Find patterns forming clumps in a string.
#     Input: A string Genome, and integers k, L, and t.
#     Output: All distinct k-mers forming (L, t)-clumps in Genome.
#
# Example:
#
# Sample Input:
#     CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA
#     5 50 4
#
# Sample Output:
#     CGACA GAAGA
#
####################################################################

import sys
import re
import itertools


def read_file(filename):
    """Return the full contents of *filename* as one string."""
    # The context manager closes the handle; the previous version called
    # close() after `return`, so that call never ran.
    with open(filename, 'r') as f:
        return f.read()


def generate_bases(k):
    """Return 'ACTG' repeated the fewest whole times that reaches length *k*.

    possible_kmers() no longer needs this; it is kept so existing callers
    continue to work.
    """
    base = 'ACTG'
    bases = ''
    i = 1
    while len(bases) < k:
        bases = base * i
        i += 1
    return bases


def possible_kmers(k):
    """Return the set of all 4**k strings of length *k* over 'ACTG'.

    Built from the Cartesian product of the alphabet.  Permutations of a
    repeated 'ACTG' string can never use a base more often than it was
    repeated, so k-mers such as 'AAAAAAAAA' used to be missing.
    """
    return set(''.join(p) for p in itertools.product('ACTG', repeat=k))


def _occurrences(seq, kmer):
    """Return the start index of every occurrence of *kmer* in *seq*.

    A zero-width lookahead is used so that overlapping occurrences are all
    reported; *kmer* is escaped and therefore matched literally.
    """
    pattern = '(?=' + re.escape(kmer) + ')'
    return [m.start() for m in re.finditer(pattern, seq)]


def frequency(seq, kmer, t):
    """Return True when *kmer* occurs at least *t* times in *seq*.

    Overlapping occurrences count ('AAAAA' occurs 3 times in 'AAAAAAA').
    """
    return len(_occurrences(seq, kmer)) >= t


def filter_kmer(raw_kmers, seq=None, t=None):
    """Return the subset of *raw_kmers* occurring at least *t* times in *seq*.

    *seq* and *t* default to the module-level variables of the same name,
    which is how the script section below calls this function.
    """
    if seq is None:
        seq = globals()['seq']
    if t is None:
        t = globals()['t']
    return set(kmer for kmer in raw_kmers if frequency(seq, kmer, t))


def kmer_position(seq, fil_kmer, t, L):
    """Return True when *fil_kmer* forms an (L, t)-clump in *seq*.

    Some run of *t* consecutive occurrences must fit completely inside a
    window of length *L*: the end of the last occurrence minus the start of
    the first one may not exceed *L*.  Every run is tested, not only the
    first *t* occurrences.
    """
    positions = _occurrences(seq, fil_kmer)
    k = len(fil_kmer)
    # Pair each occurrence with the one t-1 places after it.
    for first, last in zip(positions, positions[t - 1:]):
        if last + k - first <= L:
            return True
    return False


def result(fil_kmers, seq=None, t=None, L=None):
    """Return the k-mers in *fil_kmers* that form an (L, t)-clump in *seq*.

    *seq*, *t* and *L* default to the module-level variables of the same
    name, which is how the script section below calls this function.
    """
    if seq is None:
        seq = globals()['seq']
    if t is None:
        t = globals()['t']
    if L is None:
        L = globals()['L']
    return set(kmer for kmer in fil_kmers if kmer_position(seq, kmer, t, L))


if __name__ == '__main__':

    filename = sys.argv[-1]
    seq = read_file(filename).strip()
    # k, L and t are fixed here; they are not parsed from the input file.
    k = 9
    L = 500
    t = 3
    raw_kmers = possible_kmers(k)
    fil_kmers = filter_kmer(raw_kmers)
    final_result = result(fil_kmers)

    # The output stem is the second-to-last argv entry without its last
    # three characters (the script name minus '.py' when the script is
    # given exactly one argument).
    with open('./output.' + sys.argv[-2][:-3] + '.txt', 'w') as fw:
        fw.write(' '.join(final_result))
#
# Example:
#
# Sample Input:
#     CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA
#     5 50 4
#
# Sample Output:
#     CGACA GAAGA
#
####################################################################

import sys
import re
import itertools


def read_file(filename):
    """Return the full contents of *filename* as one string."""
    # The context manager closes the handle; the previous version called
    # close() after `return`, so that call never ran.
    with open(filename, 'r') as f:
        return f.read()


def possible_kmers(base1, base2, k, length=9):
    """Join shorter k-mers into candidate strings of a fixed total length.

    *base1* and *base2* are lists of strings.  Every ordered selection of
    *k* distinct entries of ``base1 + base2`` is concatenated, and the
    concatenations that are exactly *length* characters long are returned
    as a set.  *length* defaults to 9, the value that was hard-coded before.
    """
    bases = base1 + base2
    possible = set()
    for parts in itertools.permutations(bases, k):
        candidate = ''.join(parts)
        if len(candidate) == length:
            possible.add(candidate)
    return possible


def _occurrences(seq, kmer):
    """Return the start index of every occurrence of *kmer* in *seq*.

    A zero-width lookahead is used so that overlapping occurrences are all
    reported; *kmer* is escaped and therefore matched literally.
    """
    pattern = '(?=' + re.escape(kmer) + ')'
    return [m.start() for m in re.finditer(pattern, seq)]


def frequency(seq, kmer, t):
    """Return True when *kmer* occurs at least *t* times in *seq*.

    Overlapping occurrences count ('AAAAA' occurs 3 times in 'AAAAAAA').
    """
    return len(_occurrences(seq, kmer)) >= t


def filter_kmer(raw_kmers, seq=None, t=None):
    """Return the subset of *raw_kmers* occurring at least *t* times in *seq*.

    *seq* and *t* default to the module-level variables of the same name,
    which is how the script section below calls this function.
    """
    if seq is None:
        seq = globals()['seq']
    if t is None:
        t = globals()['t']
    return set(kmer for kmer in raw_kmers if frequency(seq, kmer, t))


def kmer_position(seq, fil_kmer, t, L):
    """Return True when *fil_kmer* forms an (L, t)-clump in *seq*.

    Some run of *t* consecutive occurrences must fit completely inside a
    window of length *L*: the end of the last occurrence minus the start of
    the first one may not exceed *L*.  Every run is tested, not only the
    first *t* occurrences.
    """
    positions = _occurrences(seq, fil_kmer)
    k = len(fil_kmer)
    # Pair each occurrence with the one t-1 places after it.
    for first, last in zip(positions, positions[t - 1:]):
        if last + k - first <= L:
            return True
    return False


def result(fil_kmers, seq=None, t=None, L=None):
    """Return the k-mers in *fil_kmers* that form an (L, t)-clump in *seq*.

    *seq*, *t* and *L* default to the module-level variables of the same
    name, which is how the script section below calls this function.
    """
    if seq is None:
        seq = globals()['seq']
    if t is None:
        t = globals()['t']
    if L is None:
        L = globals()['L']
    return set(kmer for kmer in fil_kmers if kmer_position(seq, kmer, t, L))


if __name__ == '__main__':

    # Arguments, from the end: sequence file, then two files that each hold
    # space-separated shorter k-mers.
    seq_filename = sys.argv[-3]
    seq = read_file(seq_filename).strip()
    base1_filename = sys.argv[-2]
    base1 = read_file(base1_filename).strip().split(' ')
    base2_filename = sys.argv[-1]
    base2 = read_file(base2_filename).strip().split(' ')
    # Here k is the number of pieces joined per candidate, not a k-mer length.
    k = 2
    L = 500
    t = 3
    raw_kmers = possible_kmers(base1, base2, k)
    fil_kmers = filter_kmer(raw_kmers)
    final_result = result(fil_kmers)

    # The output stem is the fourth-from-last argv entry without its last
    # three characters (the script name minus '.py' when the script is
    # given exactly three arguments).
    with open('./output.' + sys.argv[-4][:-3] + '.txt', 'w') as fw:
        fw.write(' '.join(final_result))
#################################################################
#
# Author: Min Wang (san-heng-yi-shu@163.com)
#
# Date Created:
#     21 Oct 2013
#
# Coursera - Bioinformatics Algorithms
#     - An Explosion of Hidden Messages
#
# Clump Finding Problem: Find patterns forming clumps in a string.
#     Input: A string Genome, and integers k, L, and t.
#     Output: All distinct k-mers forming (L, t)-clumps in Genome.
#
# Example:
#
# Sample Input:
#     CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA
#     5 50 4
#
# Sample Output:
#     CGACA GAAGA
#
####################################################################

import sys
import re
import itertools


def read_file(filename):
    """Return the full contents of *filename* as one string."""
    # The context manager closes the handle; the previous version called
    # close() after `return`, so that call never ran.
    with open(filename, 'r') as f:
        return f.read()


def _occurrences(seq, kmer):
    """Return the start index of every occurrence of *kmer* in *seq*.

    A zero-width lookahead is used so that overlapping occurrences are all
    reported; *kmer* is escaped and therefore matched literally.
    """
    pattern = '(?=' + re.escape(kmer) + ')'
    return [m.start() for m in re.finditer(pattern, seq)]


def frequency(seq, kmer, t):
    """Return True when *kmer* occurs at least *t* times in *seq*.

    Overlapping occurrences count ('AAAAA' occurs 3 times in 'AAAAAAA').
    """
    return len(_occurrences(seq, kmer)) >= t


def filter_kmer(raw_kmers, seq=None, t=None):
    """Return the subset of *raw_kmers* occurring at least *t* times in *seq*.

    *seq* and *t* default to the module-level variables of the same name,
    which is how the script section below calls this function.
    """
    if seq is None:
        seq = globals()['seq']
    if t is None:
        t = globals()['t']
    return set(kmer for kmer in raw_kmers if frequency(seq, kmer, t))


def kmer_position(seq, fil_kmer, t, L):
    """Return True when *fil_kmer* forms an (L, t)-clump in *seq*.

    Some run of *t* consecutive occurrences must fit completely inside a
    window of length *L*: the end of the last occurrence minus the start of
    the first one may not exceed *L*.  Every run is tested, not only the
    first *t* occurrences.
    """
    positions = _occurrences(seq, fil_kmer)
    k = len(fil_kmer)
    # Pair each occurrence with the one t-1 places after it.
    for first, last in zip(positions, positions[t - 1:]):
        if last + k - first <= L:
            return True
    return False


def result(fil_kmers, seq=None, t=None, L=None):
    """Return the k-mers in *fil_kmers* that form an (L, t)-clump in *seq*.

    *seq*, *t* and *L* default to the module-level variables of the same
    name, which is how the script section below calls this function.
    """
    if seq is None:
        seq = globals()['seq']
    if t is None:
        t = globals()['t']
    if L is None:
        L = globals()['L']
    return set(kmer for kmer in fil_kmers if kmer_position(seq, kmer, t, L))


if __name__ == '__main__':

    # Arguments, from the end: sequence file, then a file of
    # space-separated candidate k-mers.
    seq_filename = sys.argv[-2]
    nine_mers_filename = sys.argv[-1]
    seq = read_file(seq_filename).strip()
    nine_mers = read_file(nine_mers_filename).strip().split(' ')
    L = 500
    t = 3
    fil_kmers = filter_kmer(nine_mers)
    final_result = result(fil_kmers)

    # NOTE(review): sys.argv[-2] is the sequence file name here, so the
    # output is named after the input minus its last three characters
    # (e.g. 'seq.txt' -> 'output.seq..txt').  Kept as-is; confirm whether
    # the script name was intended.
    with open('./output.' + sys.argv[-2][:-3] + '.txt', 'w') as fw:
        fw.write(' '.join(final_result))
-------------------------------------------------------------------------------- 1 | CGTGAA ACTGGC AGCCTT TTGCCG TCTCGG CATTAG GTCGAT CCAGTG TCCAGT GCTGAT ATCTCG TCGATC ACCATT ACTTGC GGTAAC TGCCGA CCATTA -------------------------------------------------------------------------------- /chapter1/C1_4/output.find_9_kmers.txt: -------------------------------------------------------------------------------- 1 | GCTGTCGAT AGTATCTCG CTGACTTGC GTCGATTAG TCGATCGAT CGAGTCGAT CAGGCTGAT GTATGCCGA TGCCGATGC GATCATTAG TGCCGATGA GACTGCCGA GATTCGATC TGCCGAACT ATGCATTAG GCTCATTAG TCGATCGAC GTCGATTAC GCTGATATG TGCTGCCGA CATGCTGAT GCTGATATC AGTACTTGC GCTGATGAC ACGACTTGC TGCCGAGTC GCTGATTAC GACGCTGAT TGCCGAGCA GCTGATTAG GTCGATGTC GTCGATGTA TGCCGAACG TGCCGAGCT TACCATTAG GACACTTGC GCTGATGAT GCAATCTCG AGTCATTAG GCATCGATC CATTAGACT ACTTGCATG ATCTCGCAG ATGGCTGAT CGTACTTGC TCACATTAG ACTTGCCGT ACTACTTGC CGTTGCCGA TGAATCTCG GATATCTCG TCGTGCCGA ATCTCGACT GCTGATCAT TAGACTTGC CATTAGGCT TCGATCCAT CAGATCTCG GCTGATCAG TGATCGATC ATCTCGACG ATCTCGCGT TGCACTTGC ACTTGCGTA CAGTCGATC TCGATCTCG CATTAGTCA TCGATCTCA CATTAGGCA GTCGATTGA ATCTCGGCT ATCTCGGTC ATCTCGGTA GCAACTTGC AGTTCGATC GCACATTAG CGTTCGATC GCTGATCGA GTCGATAGC TCGATCGCA CAGTGCCGA ATCTCGCTG CGAGCTGAT GCTGATCGT CATTAGGAC CAGACTTGC GTCTCGATC ATGACTTGC TGATGCCGA AGCATCTCG CGTGTCGAT GTCGATATG TACGTCGAT ACGTGCCGA GCTGATAGC TGCCGATAG ATCCATTAG CATTAGGTA GCTGATTGC TGCCGATAC GATGCTGAT ACTTGCCAG ACGTCGATC ACTTGCGTC CATTAGACG ACTATCTCG TACGCTGAT AGCGCTGAT GTCGATCGT GCAGCTGAT GCTACTTGC ACGGCTGAT TCGATCACT GTAGTCGAT GTCGATCTG ACTTGCACT AGCACTTGC TGAGTCGAT GCATGCCGA ACTTGCGCA ATCTCGTGA TCGATCACG TCGATCATC ACTTGCACG CATTAGAGC CGATCGATC GTCTGCCGA GTATCGATC AGTGTCGAT ACTTGCGCT CGATGCCGA CATGTCGAT ATCTGCCGA GTCATCTCG TAGTGCCGA TAGTCGATC TCGATCATG TCGATCCAG GTCGCTGAT TGCCGACTG CATTAGTGA TCGATCCTG TACACTTGC TGCCGACGA ATCTCGCGA CATATCTCG CATTAGTAC ATCTCGTGC GTCGATCAT TCGTCGATC TGCCGACGT TCGATCGTC GCTTCGATC TCGGCTGAT ACGCATTAG CGTATCTCG GCTGATTCG GACTCGATC ACTGCTGAT 
TACTCGATC GTCGATCAG TGCCATTAG CATCATTAG TGCCGATCA GCTGATGTC TGCCGATCG CATTCGATC CGAATCTCG ATGGTCGAT GTCGATGAT TCGATCGCT TGACATTAG GATACTTGC TCAATCTCG GTAATCTCG GACGTCGAT ACGGTCGAT CATTAGGTC TGCCGAATG TCATCGATC GCTGATGTA ATCATCTCG GTCGATGAC GCTGATTGA GCTGCTGAT CATACTTGC CATTAGTAG ATCTCGTAG ATCTCGGCA ACGATCTCG TCGCATTAG CTGATCTCG ATGATCTCG CTGTGCCGA TAGGTCGAT CATTGCCGA CTGGTCGAT TGCGTCGAT TACATCTCG TCGGTCGAT GTCGATACT CATTAGCAT TCGATCTGA TCGATCTGC GTCGTCGAT ACTTGCATC TGCCGAAGT TGAGCTGAT CATTAGCGT TGAACTTGC CTGTCGATC CATTAGAGT AGCCATTAG CATTAGCTG TGCCGAAGC CAGCATTAG ATCTCGTCG AGCTCGATC ATCTCGAGC TCAACTTGC CATTAGATC GCTGATAGT GTCACTTGC ACTTGCGAT GACATCTCG ATCGCTGAT TGCCGAGAT ATCTCGATC TCGACTTGC ATCTCGATG AGTTGCCGA ATCTCGCAT TCGATCCGA GCTGATACT ATGTCGATC CTGCATTAG TGCCGAGTA ATCTCGGAT GCAGTCGAT GCTGATTCA GCTGATGCA TGCCGAGAC GTCGATAGT GCTGATACG ACTCATTAG CATTAGCAG GTCGATTCG ACTTGCCAT GTCGATTCA TCAGCTGAT TCGATCCGT ACTGTCGAT TGCATCTCG TCATGCCGA TGCGCTGAT CTGGCTGAT CATTAGTGC TAGCATTAG TCAGTCGAT GTCGATGCA ACTTGCTGC CGTGCTGAT GCTTGCCGA GATTGCCGA AGTGCTGAT CAGGTCGAT GATGTCGAT CGACATTAG ACTTCGATC AGCTGCCGA TCGATCGTA ACTTGCGAC GTCGATATC ACTTGCCGA GTAGCTGAT AGCGTCGAT ATGTGCCGA ACTTGCTCA GTAACTTGC ACTTGCTCG TGCTCGATC ATCGTCGAT ATCTCGTCA GTACATTAG ACTTGCCTG ACTTGCTAC CATTAGTCG CGAACTTGC CATTAGCGA TGCCGAATC TCGATCTAC TGCCGACAT TAGGCTGAT ACTTGCAGC GTCGATTGC GTCGATGCT TACTGCCGA CATTAGGAT TCGATCAGC ACTTGCTGA ATCACTTGC GCTGATCTG CATTAGATG TGCCGACAG GTCGATACG GCTGATGCT TCGATCAGT ATCTCGGAC GCTATCTCG CGTCATTAG GTCGATCGA ACTTGCAGT -------------------------------------------------------------------------------- /chapter1/C1_4/output.find_9b_kmers.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minw2828/Coursera---Bioinformatics-Algorithms/9a51f7ca1fa9ab5fd246dc971648ebe0acf9b308/chapter1/C1_4/output.find_9b_kmers.txt -------------------------------------------------------------------------------- 
###################################################################
#
# Author: Min Wang (san-heng-yi-shu@163.com)
#
# Date Created:
#     13 Nov 2013
#
# Coursera - Bioinformatics Algorithms
#     - Peculiar Statistics of the Forward and Reverse Half-Strands
#
# EXERCISE BREAK: Give all values of Skew(Prefix_i(GAGCCACCGCGATA))
# for i ranging from 0 to 14.
#
# Sample Input:
#     CATGGGCATCGGCCATACGCC
#
# Sample Output:
#     0 -1 -1 -1 0 1 2 1 1 1 0 1 2 1 0 0 0 0 -1 0 -1 -2
#
###################################################################

import sys


def read_file(filename):
    """Return the full contents of *filename* as one string."""
    # The context manager closes the handle; the previous version called
    # close() after `return`, so that call never ran.
    with open(filename, 'r') as f:
        return f.read()


def skew(genome):
    """Return the number of 'G' minus the number of 'C' in *genome*."""
    return genome.count('G') - genome.count('C')


def prefix(genome, i):
    """Return the first *i* characters of *genome*."""
    return genome[:i]


def skew_diagram(genome):
    """Return Skew(Prefix_i(genome)) for every i from 0 to len(genome).

    The list has len(genome) + 1 entries and starts with 0.  It is built
    with a running total in one pass.  The previous version always produced
    15 entries, which only fits a 14-base genome and disagrees with the
    22-value sample above.
    """
    result = [0]
    current = 0
    for base in genome:
        if base == 'G':
            current += 1
        elif base == 'C':
            current -= 1
        result.append(current)
    return result


if __name__ == '__main__':

    # The genome of the exercise is fixed here instead of being read from
    # the file named on the command line.
    genome = 'GAGCCACCGCGATA'
    result = skew_diagram(genome)
    print(' '.join(map(str, result)))
# Date Created:
#     13 Nov 2013
#
# Coursera - Bioinformatics Algorithms
#     - Peculiar Statistics of the Forward and Reverse Half-Strands
#
# Minimum Skew Problem:
#     Find a position in a genome minimizing the skew.
#     Input: A DNA string Genome.
#     Output: All integer(s) i minimizing Skew(Prefix_i(Text))
#             among all values of i (from 0 to |Genome|).
#
# CODE CHALLENGE: Solve the Minimum Skew Problem.
#
# Sample Input:
#     TAAAGACTGCCGAGAGGCCAACACGAGTGCTAGAACGAGGGGCGTAAACGCGGGTCCGAT
#
# Sample Output:
#     11 24
#
###################################################################

import sys


def read_file(filename):
    """Return the full contents of *filename* as one string."""
    # The context manager closes the handle; the previous version called
    # close() after `return`, so that call never ran.
    with open(filename, 'r') as f:
        return f.read()


def skew(genome):
    """Return the number of 'G' minus the number of 'C' in *genome*."""
    return genome.count('G') - genome.count('C')


def prefix(genome, i):
    """Return the first *i* characters of *genome*."""
    return genome[:i]


def skew_diagram(genome):
    """Return Skew(Prefix_i(genome)) for every i from 0 to len(genome).

    Built with a running total in one pass instead of re-counting every
    prefix from scratch; the values are the same.
    """
    skew_values = [0]
    current = 0
    for base in genome:
        if base == 'G':
            current += 1
        elif base == 'C':
            current -= 1
        skew_values.append(current)
    return skew_values


def minimum_skew(skew_values):
    """Return every index at which *skew_values* attains its minimum.

    Indices are in increasing order; an empty input gives an empty list.
    """
    if not skew_values:
        return []
    # Compute the minimum once rather than on every loop iteration.
    lowest = min(skew_values)
    return [i for i, value in enumerate(skew_values) if value == lowest]


if __name__ == '__main__':

    genome = read_file(sys.argv[-1]).strip().upper()
    skew_values = skew_diagram(genome)
    result = minimum_skew(skew_values)
    print(' '.join(map(str, result)))
TAAAGACTGCCGAGAGGCCAACACGAGTGCTAGAACGAGGGGCGTAAACGCGGGTCCGAT 2 | -------------------------------------------------------------------------------- /chapter1/C1_8/.1g.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minw2828/Coursera---Bioinformatics-Algorithms/9a51f7ca1fa9ab5fd246dc971648ebe0acf9b308/chapter1/C1_8/.1g.py.swp -------------------------------------------------------------------------------- /chapter1/C1_8/.8_4.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minw2828/Coursera---Bioinformatics-Algorithms/9a51f7ca1fa9ab5fd246dc971648ebe0acf9b308/chapter1/C1_8/.8_4.py.swp -------------------------------------------------------------------------------- /chapter1/C1_8/8_3/approximate_pattern_match.py: -------------------------------------------------------------------------------- 1 | ####################################################################################### 2 | # 3 | # Author: Min Wang (san-heng-yi-shu@163.com) 4 | # 5 | # Date Created: 6 | # 13 Nov 2013 7 | # 8 | # Coursera - Bioinformatics Algorithms 9 | # - Some Hidden Messages are More Elusive than Others 10 | # 11 | # Approximate Pattern Matching Problem: 12 | # Find all approximate occurrences of a pattern in a string. 13 | # Input: Two strings Pattern and Text along with an integer d. 14 | # Output: All positions where Pattern appears in Text with at most d mismatches. 
#
# CODE CHALLENGE: Solve the Approximate Pattern Matching Problem
#
# Sample Input:
#     ATTCTGGA
#     CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAAT
#     3
#
# Sample Output:
#     6 7 26 27
#
########################################################################################

import sys


def read_file(filename):
    """Return the lines of *filename* as a list of strings."""
    # The context manager closes the handle; the previous version called
    # close() after `return`, so that call never ran.
    with open(filename, 'r') as f:
        return f.readlines()


def comparison(pattern, text):
    """Return the number of positions at which *pattern* and *text* differ.

    Only the first len(pattern) characters of *text* are compared.  When
    *text* is shorter than *pattern*, each missing position counts as a
    mismatch (this used to raise IndexError).
    """
    mismatches = sum(1 for a, b in zip(pattern, text) if a != b)
    return mismatches + max(0, len(pattern) - len(text))


def approximate(pattern, text, d):
    """Return the start positions where *pattern* matches *text* approximately.

    A position i is reported when the window text[i:i+len(pattern)] differs
    from *pattern* in at most *d* characters.  Positions are in increasing
    order.
    """
    width = len(pattern)
    return [i for i in range(len(text) - width + 1)
            if comparison(pattern, text[i:i + width]) <= d]


if __name__ == '__main__':

    # Blank lines are skipped so a trailing empty line in the input no
    # longer breaks the three-way unpacking.
    fields = [item.strip() for item in read_file(sys.argv[-1])]
    pattern, text, d = [item for item in fields if item][:3]
    d = float(d)
    result = approximate(pattern, text, d)

    # The output stem is the input file name without its last four
    # characters.
    with open('output.' + sys.argv[-1][:-4] + '.txt', 'w') as fw:
        fw.write(' '.join(map(str, result)))
3127 3132 3135 3149 3175 3188 3194 3199 3204 3225 3237 3263 3272 3277 3318 3417 3442 3448 3454 3487 3491 3527 3531 3562 3588 3614 3623 3633 3644 3703 3729 3758 3760 3762 3836 3876 3909 3917 3956 3963 3971 3978 3988 4014 4031 4061 4067 4087 4097 4123 4158 4231 4249 4288 4300 4314 4325 4328 4360 4365 4388 4401 4409 4437 4478 4491 4502 4525 4576 4602 4619 4633 4639 4644 4647 4703 4706 4714 4722 4735 4798 4805 4816 4849 4914 4933 4941 4957 4961 4966 4985 5007 5048 5053 5064 5066 5071 5105 5114 5125 5136 5146 5149 5160 5176 5177 5189 5192 5196 5199 5200 5204 5205 5242 5246 5256 5268 5280 5283 5316 5335 5340 5344 5359 5385 5420 5432 5439 5471 5517 5521 5533 5579 5582 5588 5594 5600 5610 5615 5632 5642 5645 5677 5689 5694 5695 5747 5755 5772 5774 5778 5852 5861 5883 5938 5945 5959 5965 5970 6041 6057 6075 6095 6102 6104 6109 6113 6141 6177 6182 6219 6221 6223 6337 6343 6347 6350 6356 6409 6478 6490 6511 6519 6608 6612 6630 6671 6672 6711 6724 6725 6742 6771 6806 6817 6865 6868 6898 6900 6943 6965 6969 6993 6999 7059 7062 7072 7077 7088 7101 7114 7132 7162 7183 7191 7257 7259 7264 7275 7284 7301 7319 7337 7341 7411 7458 7505 7566 7575 7579 7624 7648 7661 7673 7690 7751 7763 7770 7823 7870 7874 7898 7911 7939 7953 7955 7956 7970 7981 7986 8005 8006 8013 8016 8021 8050 8057 8062 8066 8073 8078 8102 8123 8152 8157 8160 8175 8218 8272 8280 8284 8297 8325 8358 8505 8547 8563 8579 8620 8690 8695 8729 8732 8733 8737 8764 8768 8769 8773 8778 8795 8859 8875 8919 8928 8940 8957 8964 8973 8997 9025 9062 9066 9119 9128 9135 9150 9154 9170 9176 9185 9199 9229 9237 9264 9288 9297 9334 9376 9392 9409 9417 9529 9544 9551 9569 9573 9579 9593 9594 9618 9643 9654 9664 9668 9678 9682 9686 9753 9762 9765 9774 9776 9792 9799 9815 9819 9830 9834 9839 9854 9881 9886 9906 9948 9990 10002 10017 10022 10027 10069 10075 10080 10101 10146 10149 10152 10156 10161 10196 10202 10206 10223 10226 10258 10262 10327 10348 10361 10377 10392 10405 10449 10483 10496 10557 10570 10597 10598 10632 10648 10678 
10710 10732 10742 10750 10757 10773 10780 10809 10847 10911 10945 11004 11035 11036 11064 11079 11126 11137 11149 11156 11201 11206 11242 11246 11272 11331 11334 11381 11438 11457 11461 11481 11499 11543 11560 11654 11687 11705 11715 11828 11903 11907 11920 11926 11953 11977 11980 11984 11985 12020 12023 12093 12112 12204 12233 12237 12247 12259 12353 12358 12367 12417 12423 12455 12477 12482 12496 12501 12562 12578 12624 12638 12662 12679 12698 12707 12778 12829 12836 12872 12880 12925 12964 12975 12987 12988 13028 13035 13063 13090 13096 13112 13125 13197 13202 13245 13265 13270 13274 13278 13288 13294 13310 13329 13333 13360 13366 13376 13387 13425 13439 13442 13447 13468 13555 13562 13565 13575 13578 13582 13587 13609 13638 13666 13691 13695 13702 13716 13760 13795 13802 13814 13817 13820 13837 13871 13925 13956 13977 14000 14004 14030 14033 14044 14047 14051 14080 14111 14118 14150 14154 14176 14207 14216 14242 14270 14306 14338 14360 14493 14496 14516 14526 14533 14538 14543 14561 14582 14620 14625 14655 14704 14737 14742 14798 14812 14897 14905 14918 14933 14938 14987 15004 15015 15021 15060 15130 15160 15201 15210 15236 15264 15268 15271 15280 15287 15328 15430 15446 15460 15463 15485 15494 15504 15511 15524 15535 15540 15570 15601 15630 15660 15665 15681 15684 15690 15700 15712 15714 15741 15746 15759 15795 15820 15836 15839 15845 15855 15906 15948 15978 15987 16001 16020 16034 16050 16068 16075 16090 16137 16150 16177 16214 16240 16262 16290 16293 16414 16499 16538 16605 16611 16637 16643 16652 16657 16659 16680 -------------------------------------------------------------------------------- /chapter1/C1_8/8_3/test.8_3.txt: -------------------------------------------------------------------------------- 1 | ATTCTGGA 2 | CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAAT 3 | 3 4 | -------------------------------------------------------------------------------- /chapter1/C1_8/8_4/8_4.py: 
#################################################################
#
# Author: Min Wang (san-heng-yi-shu@163.com)
#
# Date Created:
#     21 Nov 2013
#
# Coursera - Bioinformatics Algorithms
#     - Some Hidden Messages are More Elusive than Others
#
# Frequent Words with Mismatches Problem:
#     Find the most frequent k-mers with mismatches in a string.
#     Input: A string Text as well as integers k and d. (You may
#            assume k <= 12 and d <= 3.)
#     Output: All most frequent k-mers with up to d mismatches in Text.
#
# CODE CHALLENGE: Solve the Frequent Words with Mismatches Problem.
#
# Sample Input:
#     ACGTTGCATGTCGCATGATGCATGAGAGCT 4 1
# Sample Output:
#     GATG ATGC ATGT
#
###################################################################

import sys
import timeit
from itertools import combinations, product


def read_file(filename):
    """Return the full contents of *filename* as one string."""
    with open(filename, 'r') as f:
        return f.read()


def correct_kmers(seq, k):
    """Return the set of all length-*k* substrings of *seq*."""
    return set(seq[i:i + k] for i in range(len(seq) - k + 1))


def generate(s, d):
    """Yield every string that differs from *s* in exactly *d* positions.

    Each choice of *d* positions is combined with each choice of *d*
    replacement letters from 'ACGT'; choices that would leave a chosen
    position unchanged are skipped.  With d == 0 the only string yielded
    is *s* itself.
    """
    letters = 'ACGT'
    pool = list(s)
    for indices in combinations(range(len(pool)), d):
        for replacements in product(letters, repeat=d):
            # A replacement equal to the original base is not a mismatch.
            if any(pool[i] == a for i, a in zip(indices, replacements)):
                continue
            variant = list(pool)
            for i, a in zip(indices, replacements):
                variant[i] = a
            yield ''.join(variant)


def possible_kmers(seq, k, d):
    """Return every k-mer within *d* mismatches of some k-mer of *seq*.

    The distance runs from 0 to *d*, so the k-mers that occur in *seq*
    are candidates themselves.  The previous version started at 1 and
    could miss a k-mer that only occurs exactly.
    """
    possibles = set()
    correct = correct_kmers(seq, k)
    for dd in range(d + 1):
        for s in correct:
            possibles.update(generate(s, dd))
    return possibles


def find_kmer(seq, kmer, d):
    """Return the windows of *seq* that match *kmer* with at most *d* mismatches.

    Every start position is examined, so overlapping windows are all
    reported, in order of position.  This is a plain Hamming-distance scan;
    it replaces the third-party `regex` fuzzy pattern
    ``(?=(kmer){s,e<=d})``, which allows substitutions only.
    """
    k = len(kmer)
    matches = []
    for i in range(len(seq) - k + 1):
        window = seq[i:i + k]
        mismatches = sum(1 for a, b in zip(window, kmer) if a != b)
        if mismatches <= d:
            matches.append(window)
    return matches


def kmer_composition(seq, k, d):
    """Return a dict mapping each candidate k-mer to its approximate count.

    The count of a k-mer is the number of windows of *seq* within *d*
    mismatches of it.
    """
    return dict((kmer, len(find_kmer(seq, kmer, d)))
                for kmer in possible_kmers(seq, k, d))


def result(filename):
    """Read 'Text k d' from *filename* and return the most frequent k-mers.

    The three fields may be separated by any whitespace, including line
    breaks.  Returns an empty list when there are no candidates (Text
    shorter than k).
    """
    seq, k, d = read_file(filename).split()
    k, d = int(k), int(d)
    kmers = kmer_composition(seq, k, d)
    if not kmers:
        return []
    maximum = max(kmers.values())
    return [kmer for kmer in kmers if kmers[kmer] == maximum]


if __name__ == '__main__':

    start = timeit.default_timer()
    results = result(sys.argv[-1])
    stop = timeit.default_timer()
    # Elapsed wall-clock time of the search, in seconds.
    print(stop - start)
    # The output stem is the input file name without its last four
    # characters.
    with open('output.' + sys.argv[-1][:-4] + '.txt', 'w') as fw:
        fw.write(' '.join(map(str, results)))
GCTCCAGTAGATGCTCCAGATGCTACAGCTCCAGTAGTAACAGTAGATGTAGCTGATACACCAACACCACCAGCTGCTACAGCTCCAGCTGATCCAGATGATACAACACCAGTAGATGTAGTAGATGATGTAGATGATGTAGTAACAACAGCTGTAGTAACAGTACCAGTAGATACAGTACCAACAACAGATGATACAACAACAGATCCAGTAACAGCTACAGTAACAGATGATGTAGCTGCTGCTGTAACAGATGATGTAACAGCTGCTGCTGTAGCTACAGCTACAGTAGCTACAGCTCCAGATACACCAGCTACACCAACACCAGTAGCTGCTCCAGCTGCTACAGATCCACCACCACCAGCTCCAGTAGCTGTAGTAACACCA 10 2 2 | -------------------------------------------------------------------------------- /chapter1/C1_8/8_4/output.dataset_8_4.extra.txt: -------------------------------------------------------------------------------- 1 | GCACACAGAC GCGCACACAC -------------------------------------------------------------------------------- /chapter1/C1_8/8_4/output.dataset_8_4.txt: -------------------------------------------------------------------------------- 1 | AGCTGCTGTA -------------------------------------------------------------------------------- /chapter1/C1_8/8_4/output.test.8_4.txt: -------------------------------------------------------------------------------- 1 | GATG ATGC ATGT -------------------------------------------------------------------------------- /chapter1/C1_8/8_4/test.8_4.txt: -------------------------------------------------------------------------------- 1 | ACGTTGCATGTCGCATGATGCATGAGAGCT 4 1 2 | -------------------------------------------------------------------------------- /chapter1/C1_8/8_4/test.8_4b.txt: -------------------------------------------------------------------------------- 1 | GCACTACTGATCGCCATTGCCAGCACTACTTTGCTTACTCAACTCAACTCAACTGCACTGCACTTTACTCATTCATTGCTTGATCGATCACTCAGATCCAGATCGCTTACTGATCGCGATCACTGATCGCTTGATCCAGCGCGATCACTACTACTGCGCTTGCACTGCGCACTGATCACTCACAGATCGATCTTCACAACTCAGATCACTGATCTTTTGCTTCAGATCTTCACAGATCCATTACTGATCACTGATCACTACTGCGCTTGATCTTCAGCGCGATCCAGCACTCAGATCCAGATCACTACTCATTTTCAGCTTGATCACTGCCAACTTTGCACTTTACTTTGATCTTCACAGATCTTTTGATCGATCACTCACAGATC 10 2 2 | -------------------------------------------------------------------------------- 
def read_file(filename):
    """Return the lines of the text file *filename* as a list of strings.

    Line terminators are kept, exactly as file.readlines() returns them.
    """
    # The with-block closes the handle even when reading raises.
    with open(filename, 'r') as f:
        return f.readlines()
def possible_kmers(seq,k,d):
    """Return the set of candidate k-mers for the mismatch search.

    The candidates are every k-mer that occurs in seq plus every string
    obtained from one of them by changing between 1 and d positions
    (as produced by generate_mismatches).

    The k-mers occurring in seq are included explicitly: a pattern that is
    present in the text matches itself with zero mismatches, so it must be a
    candidate even when no other k-mer of the text lies within d mismatches
    of it.
    """
    observed = correct_kmers(seq, k)
    possibles = set(observed)
    for dd in range(1, d + 1):
        for s in observed:
            possibles.update(generate_mismatches(s, dd))
    return possibles
TGCTGCGCGCTCAGCGCGCTCTAATGCAGCATGCTAGCTATCGCGCTCTTGCGCGCTCGCAGCGCATCTCTCTGCTATCGCTGCTGCAGCTCGCAGCAGCGCAGCGCTGCGCTCAGCGCTCGCGCGCATCGCGCGCTGCGCATTCAAAGCGCATTGCTGCTCTGCTGCGCTTGCATCGCAGCGCGCAGCGCGCGCTCATCGCGCGCAGCTA 2 | 9 2 3 | -------------------------------------------------------------------------------- /chapter1/C1_8/8_5/output.dataset_8_5.txt: -------------------------------------------------------------------------------- 1 | GCGCGCGCC GGCGCGCGC -------------------------------------------------------------------------------- /chapter1/C1_8/8_5/output.test.8_5.extra.txt: -------------------------------------------------------------------------------- 1 | AGCGCCGCT AGCGGCGCT -------------------------------------------------------------------------------- /chapter1/C1_8/8_5/output.test.8_5.txt: -------------------------------------------------------------------------------- 1 | ACAT ATGT -------------------------------------------------------------------------------- /chapter1/C1_8/8_5/test.8_5.extra.txt: -------------------------------------------------------------------------------- 1 | CTTGCCGGCGCCGATTATACGATCGCGGCCGCTTGCCTTCTTTATAATGCATCGGCGCCGCGATCTTGCTATATACGTACGCTTCGCTTGCATCTTGCGCGCATTACGTACTTATCGATTACTTATCTTCGATGCCGGCCGGCATATGCCGCTTTAGCATCGATCGATCGTACTTTACGCGTATAGCCGCTTCGCTTGCCGTACGCGATGCTAGCATATGCTAGCGCTAATTACTTAT 2 | 9 3 3 | -------------------------------------------------------------------------------- /chapter1/C1_8/8_5/test.8_5.txt: -------------------------------------------------------------------------------- 1 | ACGTTGCATGTCGCATGATGCATGAGAGCT 2 | 4 1 3 | -------------------------------------------------------------------------------- /chapter2/C2_18/.peptide_encoding.dna_condon_table.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minw2828/Coursera---Bioinformatics-Algorithms/9a51f7ca1fa9ab5fd246dc971648ebe0acf9b308/chapter2/C2_18/.peptide_encoding.dna_condon_table.py.swp 
-------------------------------------------------------------------------------- /chapter2/C2_18/.peptide_encoding.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minw2828/Coursera---Bioinformatics-Algorithms/9a51f7ca1fa9ab5fd246dc971648ebe0acf9b308/chapter2/C2_18/.peptide_encoding.py.swp -------------------------------------------------------------------------------- /chapter2/C2_18/output.txt: -------------------------------------------------------------------------------- 1 | MRFGASGTLGQDSKQGTVILALPSPSPILMGRMLRVDVDGSLEFSCDLAPLLTMGTFETTPRTCRTSNRSYNKLNPSPHVCVATARRFYRTCPCEWRNKPVAFSLVPRVHLLDRFSLPRGLVQGKRAPDAQLPKACGSGSFHLGSTTLHTLVRAVGNDDTPRVTAVHRYISILLILSFLQSQKLVKLRKLPIRSAVVTDSLALCCSLSGFLVIGSFHCSYRQYFHMVSSCTEAGRHDNLNLRLSGPIDTDILNCFRVLRAAFHIGGLPSVAHSKLLFARPRRIGSELLSVVRLRSTSWRPERTFFESERLPLERGCDRGSEVAHLWIWFRITTRVCESTEVVPGKPPSCGIGGSPIIHVRRSLSSKLNVCQLINSKLKLRTSHAPAHMTYCALAVSAKCALLLQRLHGYKISSFLVSGGRQYEILEYPDSSVTGRTRNIIRPATCVLWRPLWGRKDGKTKLNLELNTYGQKGDRAPVCSILQHPIQWARKLKHVYTCSDVRNFYSPTYGESLRGPPTTRRHDTIGAVQASSLLYEQKSALGSEAASTRLYNPNKADEPNEVVSYSGCVKGVTRRRVFIRVGEAPPYPDRLSHDAGRPLALALQHKSIVSQTSSALASNGVGTQDAAELPLEVILYAVYPKEHHMFWTRSYASAELKGRFQSLDLARFPNPAGLQKYLALRREGVKALGDSLLDTRPYSGQNLRLAILSKTGYRQRPACLVSFSPSLIAGPFLHAFVYLQYQAGLRRLLYGFLSSLTILIVRKLVYSSNPSLYSPKRALSLIQAPNLSHCGPCEFETMLAAVAGGQGKCMTKLLDLGFLLALRSGLKLSLPRELLYLTLNFDSLVAPTHLNSELRCGCRPPPHEDISDPRHDRGIIRTLLGSDHMQLLGRWRSKSPSGAVRIRRSSESNTQLRQVTFHTHCLIFNPKNTESTKSPRSTTRPIFTSDLCTVMRQFSNARCQGIPHIIHTERRIQRCVLSCERWNMHVQANHKSPSRNIFKTYIYAWTQVRCQSIAARHPRAQDTSYVNMLLYVRVAWRCSICGATSEARSGDCSSLDGVRESIPQIIYTSSIEGGPLSTRQPRCSCTSSGERVATLSRDRESINIGYRMVSNLWGGGDPISTLKVANLCGLNRPGISLNHCGVLSNYGGWGVFPEKRISRAVLRRRAVGSLTLLCQNLHSTITLGTVTPHGIHLRSSRDPVRRVAMSPPLEPDACPKSDPQTCHGLGDSHAMCEANRSDFTTVKLHLELFFFVERRAASLTTSLQDDPIPERFKSIYSQTSEVLVSTSRYDGEPIGRAGGRRGFCRALLPQVPFLNAKGLSGPLFTMFHNSCRRRYPYNWWTPLASHPKFRRNGVRDGFDYCCQKAPFGGVRYARLSRSLFRRNRSDISPETWLGIGFDGDILMWAMVLKPGCCSIWQNHPCAHPSHLRNLSVVNHQPGKRPV
RLSGVNSFKRHTLKVPLRRSLVVTSATQLSEIRRVNRFSMVLGEGALLLVADRVSNLVTLRAGPLGNKVRSVVSCAVQCQARRGHRPLMINNAYTRRRTQTPRIRGATVFSPPLFFRPAPANDPAFGAYVGTMMGLWSPIATNAALLRIFLYPPSLHLVVFRPPNDAFIEGTSPYGGDNILFADKYSLLVRAITHLLLALGGRYYTLVLFIESHARLPQPRYTPDGGLVAPDLARGRPHPRLAARPRSTVDGLHYSWKTFCYSNRSIFGNRYHKPLDVRHTLCRVVFILRTASELANESSPPYRYFNFDVRVSRGRSRDSRTAYVFNRCYVVSSGSVTWIMAMNSLAMPLVGRPMSRYIFADVACASLSLSYVCRRQNSSHLALGCYRWTASVIHRVKQWLVSPSLERFRVCGRAAQLLTAALRIYVSSALPSAGIQYGKSCYRTPHVNRKEIFYALIPKFSDIEYSYDPSLYGTVVAYVSSETHLSEQRVYFVYSDRRFLPPRMGERQRPSCDRSEIPDVSSGSPGGHRAGITKPSVNNDSPIASAEAGEARVHRKNRSDHCSIWYVGDERASHFGMSRRVVRPGSESSPHRNSQYRLPECVMLARWITLRVPDIEDLDTQTTGTLFSTVCVVNATPRALVFYKGADGPGHQLRDPSPPSTPWTKALTRRHHTDSSADYPPVHWDQQHAVRSYRPHRTYKFLSERLLSSSSGKLTVGGKARLLPRHKSEKAGTIKSPRSSLYATYICAPGAHAHYLFSFSAIVRSTRWTSDSIYRLCFAFGGGLGPPRLSSHIDAIHLNLLPAYLHGGGLLDFGVGSMLTQLLSGKHPTTYELNDDDGVDSASMGQHCCRTRIGIRYHIRCAFLGTYVNVFEHISDCRLQRDTAEAIPMSMEKVVGSIAMGSLVHVRVLIIQPFYRIVEFRSAGCPVSVSSSRHRLSEALNTKGGRDIKTYNSLMLWTSAIQGLPPRDRIAEIRCVVPRPWLPMRYHVPRATSSKRCKRIRHRGASLDQLSRFSNMSRNAMYRLTIACVHSSFRINHNSASKCPYHYSPSVILYRPLSCAGCRSVGACSNEVPVAAPCTTYLRKEAFIIFLNETNIKQCYCNSLVPSESTHNNIHFKEYILPHLETRYMNPTRDRYSLFRSFNARTVRKRIVCINQPFHKQIFVSFLVSPNTDLTRQSVVLCLRLYQSVSGESLGWVEAIATVYIQPPSGGERKCPTDQAECYVCYLDKHCFSRPMVTIALKDMRLDLTRRATQGRQWTGVSGAQTNVNSTLTQIVNTCAKCVIPQGSRPTKKNGNSGETYKGFRNSEQLRAHRRFITEFYTRHSAVLPVDMRTSLLGHVNGRCIRPRCERPAYSNSMYILHSCLNPARSSPCLRRGLNSNNRGSGCTSEHGRFDMILLTVSLVPGARAMNRILNPLEPGIQSFPIGIEVQTLPVPFAVPFRLSSKGELTDCRHGCSLPIRRRTPLILSARRRCTRTAVNQTNPQYPALLVSGLVSSQTPARSQRDFGHKSILPDYSAWIFLPQPDSVSIWYVVHRKQAFVGAMLTRGVLLGKGGIPPRCVILHTTEVWVISRPKILRLYGLSRTTCWYSDARKQHLCLLGIWNVSKRVQDSDSAVSTTPVGLRKRPAVSLAAILMGQLVRPRFYTSTYWHPTIDNRCLGAGYNPLTRGRKAAHGFRCSSQTTLCNLFAPSRSTVRGHPLQALSATSIIPSLWAVGWGSLDMLESAWQRSSPPGMCLRLMRPYNRWCQPPKMVYQVCPQPGRFLIRVTKPL -------------------------------------------------------------------------------- /chapter2/C2_18/peptide_encoding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 
| ############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 | # 9 | # Date created: 10 | # 11 | # 18 Nov 2013 12 | # 13 | # Peptide Encoding Problem: 14 | # Find substrings of a genome encoding a given amino acid sequence. 15 | # Input: A DNA string Text and an amino acid string Peptide. 16 | # Output: All substrings of Text encoding Peptide (if any such substrings exist). 17 | # 18 | # CODE CHALLENGE: Solve the Peptide Encoding Problem. 19 | # 20 | # Sample Input: 21 | # ATGGCCATGGCCCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA 22 | # MA 23 | # 24 | # Sample Output: 25 | # ATGGCC 26 | # GGCCAT 27 | # ATGGCC 28 | # 29 | # Note: The solution may contain repeated strings if the same string occurs 30 | # more than once as a substring of Text and encodes Peptide. 31 | # 32 | ############################################################################### 33 | 34 | import sys 35 | import re 36 | from itertools import product 37 | from Bio.Seq import Seq 38 | from Bio.Alphabet import generic_dna 39 | 40 | dna_codon_table = { 41 | 'TTT': 'F', 'CTT': 'L', 'ATT': 'I', 'GTT': 'V', 42 | 'TTC': 'F', 'CTC': 'L', 'ATC': 'I', 'GTC': 'V', 43 | 'TTA': 'L', 'CTA': 'L', 'ATA': 'I', 'GTA': 'V', 44 | 'TTG': 'L', 'CTG': 'L', 'ATG': 'M', 'GTG': 'V', 45 | 'TCT': 'S', 'CCT': 'P', 'ACT': 'T', 'GCT': 'A', 46 | 'TCC': 'S', 'CCC': 'P', 'ACC': 'T', 'GCC': 'A', 47 | 'TCA': 'S', 'CCA': 'P', 'ACA': 'T', 'GCA': 'A', 48 | 'TCG': 'S', 'CCG': 'P', 'ACG': 'T', 'GCG': 'A', 49 | 'TAT': 'Y', 'CAT': 'H', 'AAT': 'N', 'GAT': 'D', 50 | 'TAC': 'Y', 'CAC': 'H', 'AAC': 'N', 'GAC': 'D', 51 | 'TAA': 'Stop', 'CAA': 'Q', 'AAA': 'K', 'GAA': 'E', 52 | 'TAG': 'Stop', 'CAG': 'Q', 'AAG': 'K', 'GAG': 'E', 53 | 'TGT': 'C', 'CGT': 'R', 'AGT': 'S', 'GGT': 'G', 54 | 'TGC': 'C', 'CGC': 'R', 'AGC': 'S', 'GGC': 'G', 55 | 'TGA': 'Stop', 'CGA': 'R', 'AGA': 'R', 'GGA': 'G', 56 | 'TGG': 'W', 'CGG': 'R', 'AGG': 'R', 'GGG': 'G' } 57 | 58 | def 
def generate_kmers(dna, peptide, codon_table=None):
    """Collect the substrings of dna that might encode peptide.

    A candidate starts at every (possibly overlapping) occurrence of a codon
    that codes for the first amino acid of peptide and spans
    3 * len(peptide) nucleotides.  Candidates are grouped by codon, in the
    iteration order of the codon table, and sorted by position within a group.

    dna         -- DNA string to scan.
    peptide     -- amino acid string; an empty peptide yields no candidates.
    codon_table -- optional mapping codon -> amino acid; defaults to the
                   module-level dna_codon_table.

    Windows that would run past the end of dna are skipped: they are shorter
    than a full encoding, so they can never translate to peptide.
    """
    if not peptide:
        return []
    if codon_table is None:
        codon_table = dna_codon_table
    width = 3 * len(peptide)
    last_start = len(dna) - width
    kmers = []
    # items() is available on Python 2 and 3, unlike iteritems().
    for codon, amino_acid in codon_table.items():
        if amino_acid != peptide[0]:
            continue
        # Zero-width lookahead so overlapping occurrences are all reported.
        for match in re.finditer('(?=%s)' % re.escape(codon), dna):
            start = match.start()
            if start <= last_start:
                kmers.append(dna[start:start + width])
    return kmers
def read_file(input_file):
    """Return the whole content of *input_file* with surrounding whitespace removed."""
    # The with-block closes the handle even when reading raises; the local
    # name no longer shadows the Python 2 builtin raw_input.
    with open(input_file) as f:
        return f.read().strip()
227 231 244 259 260 266 271 286 298 298 310 312 328 330 330 372 385 391 394 399 399 399 401 413 423 426 443 443 470 493 498 502 513 519 526 527 541 554 556 557 564 569 590 598 616 626 640 654 657 658 665 670 682 697 697 703 711 729 729 753 753 771 779 785 785 800 812 817 824 825 828 842 856 866 884 892 913 918 925 926 928 941 955 956 963 969 980 984 989 1012 1039 1039 1056 1059 1069 1081 1083 1083 1083 1088 1091 1097 1110 1152 1152 1154 1170 1172 1184 1184 1196 1211 1216 1222 1223 1238 1251 1255 1255 1267 1283 1298 1310 1312 1319 1335 1351 1354 1354 1368 1369 1369 1379 1381 1383 1411 1411 1482 2 | -------------------------------------------------------------------------------- /chapter2/C2_20/dataset_20_3.txt: -------------------------------------------------------------------------------- 1 | WGMKDKLRRDTLVE 2 | -------------------------------------------------------------------------------- /chapter2/C2_20/integer_mass_table.txt: -------------------------------------------------------------------------------- 1 | G 57 2 | A 71 3 | S 87 4 | P 97 5 | V 99 6 | T 101 7 | C 103 8 | I 113 9 | L 113 10 | N 114 11 | D 115 12 | K 128 13 | Q 128 14 | E 129 15 | M 131 16 | H 137 17 | F 147 18 | R 156 19 | Y 163 20 | W 186 -------------------------------------------------------------------------------- /chapter2/C2_20/output.dataset_20_3.txt: -------------------------------------------------------------------------------- 1 | 0 57 99 101 113 113 115 115 128 128 129 131 156 156 186 188 212 214 216 228 241 243 243 243 259 269 271 312 313 315 316 329 341 356 371 372 372 374 374 397 414 425 427 428 431 442 471 484 485 502 502 503 512 527 528 540 553 557 559 584 584 602 615 617 628 631 640 641 641 668 668 672 685 713 715 730 740 743 745 746 754 769 771 783 796 800 816 828 843 845 853 858 869 874 882 884 899 911 927 931 944 956 958 973 981 982 984 987 997 1012 1014 1042 1055 1059 1059 1086 1086 1087 1096 1099 1110 1112 1125 1143 1143 1168 1170 1174 1187 1199 1200 1215 1224 1225 
1225 1242 1243 1256 1285 1296 1299 1300 1302 1313 1330 1353 1353 1355 1355 1356 1371 1386 1398 1411 1412 1414 1415 1456 1458 1468 1484 1484 1484 1486 1499 1511 1513 1515 1539 1541 1571 1571 1596 1598 1599 1599 1612 1612 1614 1614 1626 1628 1670 1727 -------------------------------------------------------------------------------- /chapter2/C2_20/output.test.theoretical_spectrum.extra.txt: -------------------------------------------------------------------------------- 1 | 0 71 71 99 101 103 113 113 114 128 128 131 147 163 170 172 184 199 215 227 227 231 244 259 260 266 271 286 298 298 310 312 328 330 330 372 385 391 394 399 399 399 401 413 423 426 443 443 470 493 498 502 513 519 526 527 541 554 556 557 564 569 590 598 616 626 640 654 657 658 665 670 682 697 697 703 711 729 729 753 753 771 779 785 785 800 812 817 824 825 828 842 856 866 884 892 913 918 925 926 928 941 955 956 963 969 980 984 989 1012 1039 1039 1056 1059 1069 1081 1083 1083 1083 1088 1091 1097 1110 1152 1152 1154 1170 1172 1184 1184 1196 1211 1216 1222 1223 1238 1251 1255 1255 1267 1283 1298 1310 1312 1319 1335 1351 1354 1354 1368 1369 1369 1379 1381 1383 1411 1411 1482 -------------------------------------------------------------------------------- /chapter2/C2_20/output.test.theoretical_spectrum.txt: -------------------------------------------------------------------------------- 1 | 0 113 114 128 129 227 242 242 257 355 356 370 371 484 -------------------------------------------------------------------------------- /chapter2/C2_20/test.theoretical_spectrum.extra.txt: -------------------------------------------------------------------------------- 1 | IAQMLFYCKVATN 2 | -------------------------------------------------------------------------------- /chapter2/C2_20/test.theoretical_spectrum.txt: -------------------------------------------------------------------------------- 1 | LEQN 2 | -------------------------------------------------------------------------------- 
def generate_subpeptides(peptide):
    """Return all subpeptides of the cyclic peptide *peptide*.

    The list starts with the empty subpeptide and the full peptide, followed
    by every contiguous fragment of length 1 .. len(peptide)-1 read around
    the cycle, ordered by start position and then by length.  Fragments that
    wrap around the end are obtained by slicing the doubled string.

    An empty peptide has the empty string as its only subpeptide.
    """
    size = len(peptide)
    if size == 0:
        # Avoid listing '' twice (once as "empty", once as "full peptide").
        return ['']
    looped = peptide + peptide
    subpeptides = ['', peptide]
    subpeptides.extend(looped[start:start + length]
                       for start in range(size)
                       for length in range(1, size))
    return subpeptides
def confirm_tranlation(dna,peptide):
    """Return the candidate windows of dna whose translation equals peptide.

    Candidates come from generate_kmers(dna, peptide); each one is kept,
    in the same order, when translation() of it is exactly peptide.
    """
    return [candidate
            for candidate in generate_kmers(dna, peptide)
            if translation(candidate) == peptide]
242 242 243 244 244 256 257 262 269 270 287 298 299 301 328 331 340 340 343 345 345 356 358 359 370 370 372 375 383 385 397 400 401 429 430 442 453 454 454 459 462 468 471 472 473 474 485 486 487 498 499 501 512 514 514 542 561 567 570 573 575 581 583 585 590 599 600 600 601 602 610 615 615 616 627 627 630 658 695 696 698 698 698 701 703 704 713 723 728 728 728 728 730 730 731 741 744 747 758 761 769 799 810 817 827 829 831 832 841 841 844 844 851 854 854 857 859 862 872 882 884 886 889 928 928 944 945 947 955 955 958 959 960 966 967 972 972 982 985 990 996 997 1000 1000 1003 1041 1056 1059 1062 1068 1068 1068 1073 1075 1075 1084 1087 1089 1095 1097 1103 1113 1114 1128 1128 1131 1152 1172 1172 1181 1182 1184 1189 1190 1190 1196 1197 1199 1200 1202 1210 1212 1227 1231 1242 1259 1259 1283 1295 1298 1303 1303 1303 1303 1304 1311 1312 1317 1318 1325 1325 1328 1330 1338 1340 1345 1355 1356 1388 1396 1416 1426 1426 1427 1431 1432 1432 1434 1440 1442 1443 1445 1451 1453 1453 1454 1458 1459 1459 1469 1489 1497 1529 1530 1540 1545 1547 1555 1557 1560 1560 1567 1568 1573 1574 1581 1582 1582 1582 1582 1587 1590 1602 1626 1626 1643 1654 1658 1673 1675 1683 1685 1686 1688 1689 1695 1695 1695 1696 1701 1703 1704 1713 1713 1733 1754 1757 1757 1771 1772 1782 1788 1790 1796 1798 1801 1810 1810 1812 1817 1817 1817 1823 1826 1829 1844 1882 1885 1885 1888 1889 1895 1900 1903 1913 1913 1918 1919 1925 1926 1927 1930 1930 1938 1940 1941 1957 1957 1996 1999 2001 2003 2013 2023 2026 2028 2031 2031 2034 2041 2041 2044 2044 2053 2054 2056 2058 2068 2075 2086 2116 2124 2127 2138 2141 2144 2154 2155 2155 2157 2157 2157 2157 2162 2172 2181 2182 2184 2187 2187 2187 2189 2190 2227 2255 2258 2258 2269 2270 2270 2275 2283 2284 2285 2285 2286 2295 2300 2302 2304 2310 2312 2315 2318 2324 2343 2371 2371 2373 2384 2386 2387 2398 2399 2400 2411 2412 2413 2414 2417 2423 2426 2431 2431 2432 2443 2455 2456 2484 2485 2488 2500 2502 2510 2513 2515 2515 2526 2527 2529 2540 2540 2542 2545 2545 2554 2557 2584 
2586 2587 2598 2615 2616 2623 2628 2629 2641 2641 2642 2643 2643 2644 2654 2655 2657 2658 2658 2671 2685 2699 2699 2701 2729 2729 2754 2754 2754 2756 2757 2757 2757 2757 2770 2771 2771 2772 2772 2772 2772 2782 2784 2788 2814 2885 4 | Output 5 | 156-71-113-114-131-156-113-101-129-128-128-114-128-103-97-131-131-113-131-113-128-115-128-113 6 | -------------------------------------------------------------------------------- /chapter2/C2_22/output.dataset_22_4.txt: -------------------------------------------------------------------------------- 1 | 87-103-103-128-186-128-163-114-113 87-113-114-163-128-186-128-103-103 103-87-113-114-163-128-186-128-103 103-103-87-113-114-163-128-186-128 103-103-128-186-128-163-114-113-87 103-128-186-128-163-114-113-87-103 113-87-103-103-128-186-128-163-114 113-114-163-128-186-128-103-103-87 114-113-87-103-103-128-186-128-163 114-163-128-186-128-103-103-87-113 128-103-103-87-113-114-163-128-186 128-163-114-113-87-103-103-128-186 128-186-128-103-103-87-113-114-163 128-186-128-163-114-113-87-103-103 163-114-113-87-103-103-128-186-128 163-128-186-128-103-103-87-113-114 186-128-103-103-87-113-114-163-128 186-128-163-114-113-87-103-103-128 -------------------------------------------------------------------------------- /chapter2/C2_22/output.test.cyclo_peptide_sequencing.extra.txt: -------------------------------------------------------------------------------- 1 | 71-131-114-113-113-115-99-97-103-137 71-137-103-97-99-115-113-113-114-131 97-99-115-113-113-114-131-71-137-103 97-103-137-71-131-114-113-113-115-99 99-97-103-137-71-131-114-113-113-115 99-115-113-113-114-131-71-137-103-97 103-97-99-115-113-113-114-131-71-137 103-137-71-131-114-113-113-115-99-97 113-113-114-131-71-137-103-97-99-115 113-113-115-99-97-103-137-71-131-114 113-114-131-71-137-103-97-99-115-113 113-115-99-97-103-137-71-131-114-113 114-113-113-115-99-97-103-137-71-131 114-131-71-137-103-97-99-115-113-113 115-99-97-103-137-71-131-114-113-113 
115-113-113-114-131-71-137-103-97-99 131-71-137-103-97-99-115-113-113-114 131-114-113-113-115-99-97-103-137-71 137-71-131-114-113-113-115-99-97-103 137-103-97-99-115-113-113-114-131-71 -------------------------------------------------------------------------------- /chapter2/C2_22/output.test.cyclo_peptide_sequencing.txt: -------------------------------------------------------------------------------- 1 | 113-128-186 113-186-128 128-113-186 128-186-113 186-113-128 186-128-113 -------------------------------------------------------------------------------- /chapter2/C2_22/test.cyclo_peptide_sequencing.extra.txt: -------------------------------------------------------------------------------- 1 | 0 71 97 99 103 113 113 114 115 131 137 196 200 202 208 214 226 227 228 240 245 299 311 311 316 327 337 339 340 341 358 408 414 424 429 436 440 442 453 455 471 507 527 537 539 542 551 554 556 566 586 622 638 640 651 653 657 664 669 679 685 735 752 753 754 756 766 777 782 782 794 848 853 865 866 867 879 885 891 893 897 956 962 978 979 980 980 990 994 996 1022 1093 2 | -------------------------------------------------------------------------------- /chapter2/C2_22/test.cyclo_peptide_sequencing.txt: -------------------------------------------------------------------------------- 1 | 0 113 128 186 241 299 314 427 2 | -------------------------------------------------------------------------------- /chapter2/C2_24/dataset_24_4.txt: -------------------------------------------------------------------------------- 1 | 352 2 | 0 87 87 99 101 103 103 103 113 113 114 114 114 115 115 129 131 137 147 156 163 186 186 190 200 201 213 216 218 218 228 230 238 245 245 250 262 273 276 278 285 285 289 289 303 305 314 315 321 344 351 353 359 359 363 367 376 376 386 388 391 399 402 404 408 418 425 428 432 446 454 458 473 477 478 479 480 489 491 502 503 507 523 530 533 538 539 547 559 560 572 581 583 591 592 592 594 594 594 606 617 625 626 633 636 640 644 648 659 670 673 693 694 697 706 707 710 720 
722 727 729 739 739 747 748 758 769 772 780 781 783 785 807 810 811 823 830 836 837 844 845 851 853 856 861 862 879 883 886 895 898 910 924 925 935 938 945 947 948 948 951 958 959 965 976 982 984 993 996 1001 1009 1011 1012 1024 1048 1050 1052 1061 1061 1061 1063 1072 1074 1079 1095 1096 1098 1115 1121 1124 1125 1133 1138 1140 1148 1159 1164 1166 1166 1174 1176 1177 1187 1197 1209 1224 1227 1230 1232 1234 1236 1238 1239 1253 1254 1261 1262 1262 1269 1274 1277 1279 1333 1333 1339 1339 1340 1341 1346 1350 1352 1353 1363 1364 1366 1370 1375 1376 1377 1377 1383 1383 1437 1439 1442 1447 1454 1454 1455 1462 1463 1477 1478 1480 1482 1484 1486 1489 1492 1507 1519 1529 1539 1540 1542 1550 1550 1552 1557 1568 1576 1578 1583 1591 1592 1595 1601 1618 1620 1621 1637 1642 1644 1653 1655 1655 1655 1664 1666 1668 1692 1704 1705 1707 1715 1720 1723 1732 1734 1740 1751 1757 1758 1765 1768 1768 1769 1771 1778 1781 1791 1792 1806 1818 1821 1830 1833 1837 1854 1855 1860 1863 1865 1871 1872 1879 1880 1886 1893 1905 1906 1909 1931 1933 1935 1936 1944 1947 1958 1968 1969 1977 1977 1987 1989 1994 1996 2006 2009 2010 2019 2022 2023 2043 2046 2057 2068 2072 2076 2080 2083 2090 2091 2099 2110 2122 2122 2122 2124 2124 2125 2133 2135 2144 2156 2157 2169 2177 2178 2183 2186 2193 2209 2213 2214 2225 2227 2236 2237 2239 2243 2258 2262 2270 2284 2288 2291 2298 2308 2312 2314 2317 2325 2328 2330 2340 2340 2349 2353 2357 2357 2363 2365 2372 2395 2401 2402 2411 2413 2427 2427 2431 2431 2438 2440 2443 2454 2466 2471 2471 2478 2486 2488 2498 2498 2500 2503 2515 2516 2526 2530 2530 2553 2560 2569 2579 2585 2587 2601 2601 2602 2602 2602 2603 2603 2613 2613 2613 2615 2617 2629 2629 2716 3 | -------------------------------------------------------------------------------- /chapter2/C2_24/leaderboardcyclopeptide_sequencing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | 
############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 | # 9 | # Date created: 10 | # 11 | # 27 Nov 2013 12 | # 13 | # CODE CHALLENGE: Implement LEADERBOARDCYCLOPEPTIDESEQUENCING. 14 | # 15 | # Input: Integer N and a collection of integers Spectrum. 16 | # 17 | # Output: LeaderPeptide after running LEADERBOARDCYCLOPEPTIDESEQUENCING(Spectrum, N). 18 | # 19 | # Sample Input: 20 | # 10 21 | # 0 71 113 129 147 200 218 260 313 331 347 389 460 22 | # 23 | # Sample Output: 24 | # 113-147-71-129 25 | # 26 | ############################################################################### 27 | 28 | import sys 29 | import timeit 30 | import heapq 31 | from itertools import product,izip,ifilter 32 | from collections import Counter 33 | 34 | table = [57, 71, 87, 97, 99, 101, 103, 113, 113, 114, 115, 128, 128, 129, 131, 137, 147, 156, 163, 186] 35 | 36 | def read_file(input_file): 37 | f = open(input_file) 38 | N,Spectrum = [item.strip() for item in f.readlines()] 39 | f.close() 40 | return (int(N),map(int,Spectrum.split(' '))) 41 | 42 | def score(theorectical_spectrum,experimental_spectrum): 43 | return len(list((Counter(theorectical_spectrum) & Counter(experimental_spectrum)).elements())) 44 | 45 | def generate_subspectrums(peptide): 46 | l = len(peptide) 47 | looped = peptide + peptide 48 | return [0,sum(peptide)]+[sum(looped[start:start+length]) for start,length in product(range(0,l),range(1,l))] 49 | 50 | def cut(Leaderboard,Spectrum,N): 51 | if len(Leaderboard) > N: 52 | results = [] 53 | for Peptide in Leaderboard: 54 | try: 55 | Peptide_experimental_spectrum = generate_subspectrums(Peptide) 56 | except: 57 | Peptide = Peptide[0]+[Peptide[1]] 58 | Peptide_experimental_spectrum = generate_subspectrums(Peptide) 59 | results.append((Peptide,score(Spectrum,Peptide_experimental_spectrum))) 60 | tie = heapq.nlargest(N,results,key=lambda x: x[1])[-1][1] 61 | res = list(ifilter(lambda x: 
x[1]>=tie,results)) 62 | return list(izip(*res))[0] 63 | else: 64 | return Leaderboard 65 | 66 | def LeaderboardCyclopeptideSequencing(Spectrum,N): 67 | Leaderboard = [0] 68 | LeaderPeptide = [] 69 | while Leaderboard != []: 70 | Leaderboard = [list(pt) for pt in product(Leaderboard,table)] 71 | for Peptide in Leaderboard: 72 | try: 73 | Peptide_experimental_spectrum = generate_subspectrums(Peptide) 74 | except: 75 | Leaderboard = [Peptide[0]+[Peptide[1]] if x == Peptide else x for x in Leaderboard] 76 | Peptide = Peptide[0]+[Peptide[1]] 77 | Peptide_experimental_spectrum = generate_subspectrums(Peptide) 78 | if max(Peptide_experimental_spectrum) == max(Spectrum): 79 | LeaderPeptide_experimental_spectrum = generate_subspectrums(LeaderPeptide) 80 | if score(Spectrum,Peptide_experimental_spectrum) > score(Spectrum,LeaderPeptide_experimental_spectrum): 81 | LeaderPeptide = Peptide 82 | elif max(Peptide_experimental_spectrum) > max(Spectrum): 83 | Leaderboard.remove(Peptide) 84 | Leaderboard = cut(Leaderboard,Spectrum,N) 85 | return LeaderPeptide 86 | 87 | def result(filename): 88 | N,Spectrum = read_file(filename) 89 | results = LeaderboardCyclopeptideSequencing(Spectrum,N) 90 | return results[1:] 91 | 92 | if __name__ == "__main__": 93 | 94 | start = timeit.default_timer() 95 | results = result(sys.argv[-1]) 96 | print '-'.join(map(str,results)) 97 | print '' 98 | stop = timeit.default_timer() 99 | print stop - start 100 | -------------------------------------------------------------------------------- /chapter2/C2_24/test.leaderboard_data.extra.txt: -------------------------------------------------------------------------------- 1 | 26 2 | 0 71 97 101 103 113 113 113 113 114 114 115 128 128 128 128 129 131 131 131 156 156 184 186 186 200 214 227 227 228 230 231 241 242 242 243 244 244 256 257 262 269 270 287 298 299 301 328 331 340 340 343 345 345 356 358 359 370 370 372 375 383 385 397 400 401 429 430 442 453 454 454 459 462 468 471 472 473 474 485 486 487 498 499 
501 512 514 514 542 561 567 570 573 575 581 583 585 590 599 600 600 601 602 610 615 615 616 627 627 630 658 695 696 698 698 698 701 703 704 713 723 728 728 728 728 730 730 731 741 744 747 758 761 769 799 810 817 827 829 831 832 841 841 844 844 851 854 854 857 859 862 872 882 884 886 889 928 928 944 945 947 955 955 958 959 960 966 967 972 972 982 985 990 996 997 1000 1000 1003 1041 1056 1059 1062 1068 1068 1068 1073 1075 1075 1084 1087 1089 1095 1097 1103 1113 1114 1128 1128 1131 1152 1172 1172 1181 1182 1184 1189 1190 1190 1196 1197 1199 1200 1202 1210 1212 1227 1231 1242 1259 1259 1283 1295 1298 1303 1303 1303 1303 1304 1311 1312 1317 1318 1325 1325 1328 1330 1338 1340 1345 1355 1356 1388 1396 1416 1426 1426 1427 1431 1432 1432 1434 1440 1442 1443 1445 1451 1453 1453 1454 1458 1459 1459 1469 1489 1497 1529 1530 1540 1545 1547 1555 1557 1560 1560 1567 1568 1573 1574 1581 1582 1582 1582 1582 1587 1590 1602 1626 1626 1643 1654 1658 1673 1675 1683 1685 1686 1688 1689 1695 1695 1695 1696 1701 1703 1704 1713 1713 1733 1754 1757 1757 1771 1772 1782 1788 1790 1796 1798 1801 1810 1810 1812 1817 1817 1817 1823 1826 1829 1844 1882 1885 1885 1888 1889 1895 1900 1903 1913 1913 1918 1919 1925 1926 1927 1930 1930 1938 1940 1941 1957 1957 1996 1999 2001 2003 2013 2023 2026 2028 2031 2031 2034 2041 2041 2044 2044 2053 2054 2056 2058 2068 2075 2086 2116 2124 2127 2138 2141 2144 2154 2155 2155 2157 2157 2157 2157 2162 2172 2181 2182 2184 2187 2187 2187 2189 2190 2227 2255 2258 2258 2269 2270 2270 2275 2283 2284 2285 2285 2286 2295 2300 2302 2304 2310 2312 2315 2318 2324 2343 2371 2371 2373 2384 2386 2387 2398 2399 2400 2411 2412 2413 2414 2417 2423 2426 2431 2431 2432 2443 2455 2456 2484 2485 2488 2500 2502 2510 2513 2515 2515 2526 2527 2529 2540 2540 2542 2545 2545 2554 2557 2584 2586 2587 2598 2615 2616 2623 2628 2629 2641 2641 2642 2643 2643 2644 2654 2655 2657 2658 2658 2671 2685 2699 2699 2701 2729 2729 2754 2754 2754 2756 2757 2757 2757 2757 2770 2771 2771 2772 2772 2772 2772 
2782 2784 2788 2814 2885 3 | -------------------------------------------------------------------------------- /chapter2/C2_24/test.leaderboard_data.txt: -------------------------------------------------------------------------------- 1 | 10 2 | 0 71 113 129 147 200 218 260 313 331 347 389 460 3 | -------------------------------------------------------------------------------- /chapter2/C2_26/26_4/dataset_26_4.txt: -------------------------------------------------------------------------------- 1 | 57 718 928 489 228 504 757 744 414 228 974 214 186 332 188 717 531 428 0 1017 732 1089 543 527 528 229 1033 87 1015 131 402 603 429 447 945 960 804 619 1146 586 113 1047 831 803 201 389 830 218 529 871 303 917 918 400 115 642 847 814 299 918 1032 699 732 129 115 560 875 390 642 114 756 618 932 843 746 1059 1031 99 342 271 414 617 316 958 615 172 657 504 315 1031 275 343 2 | -------------------------------------------------------------------------------- /chapter2/C2_26/26_4/output.test.spectral_convolution.txt: -------------------------------------------------------------------------------- 1 | 49 137 137 186 186 323 -------------------------------------------------------------------------------- /chapter2/C2_26/26_4/spectral_convolution.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 | # 9 | # Date created: 10 | # 11 | # 19 Nov 2013 12 | # 13 | # Spectral Convolution Problem: Compute the convolution of a spectrum. 14 | # Input: A collection of integers Spectrum. 15 | # Output: The list of elements in the convolution of Spectrum. If an element 16 | # has multiplicity k, it should appearexactly k times; you may return 17 | # the elements in any order. 18 | # 19 | # CODE CHALLENGE: Solve the Spectral Convolution Problem. 
20 | # 21 | # Sample Input: 22 | # 0 137 186 323 23 | # 24 | # Sample Output: 25 | # 137 137 186 186 323 49 26 | # 27 | ############################################################################### 28 | 29 | import sys 30 | from itertools import product 31 | 32 | def read_file(input_file): 33 | f = open(input_file) 34 | data = [int(item) for item in f.read().strip().split(' ')] 35 | f.close() 36 | return data 37 | 38 | def result(filename): 39 | Spectrum = read_file(filename) 40 | return [pt1 - pt2 for pt1,pt2 in product(Spectrum,Spectrum) if pt1 != pt2 and pt1-pt2>=0] 41 | 42 | if __name__ == "__main__": 43 | 44 | fw = open('output.'+sys.argv[-1][:-4]+'.txt','w') 45 | fw.write(' '.join(map(str,sorted(result(sys.argv[-1]))))) 46 | fw.close() 47 | -------------------------------------------------------------------------------- /chapter2/C2_26/26_4/test.spectral_convolution.txt: -------------------------------------------------------------------------------- 1 | 0 137 186 323 2 | -------------------------------------------------------------------------------- /chapter2/C2_26/26_7/convolutioncyclopeptidesequencing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 | # 9 | # Date created: 10 | # 11 | # 27 Nov 2013 12 | # 13 | # CODE CHALLENGE: Implement CONVOLUTIONCYCLOPEPTIDESEQUENCING. 14 | # 15 | # Input: An integer M, an integer N, and a collection of (possibly repeated) 16 | # integers Spectrum. 17 | # 18 | # Output: A cyclic peptide LeaderPeptide with amino acids taken only from the 19 | # top M elements (and ties) of the convolution of Spectrum that fall 20 | # between 57 and 200, and where the size of Leaderboard is restricted 21 | # to the top N (and ties). 
22 | # 23 | # Sample Input: 24 | # 20 25 | # 60 26 | # 57 57 71 99 129 137 170 186 194 208 228 265 285 299 307 323 356 364 394 422 493 27 | # 28 | # Sample Output: 29 | # 99-71-137-57-72-57 30 | # 31 | ############################################################################### 32 | 33 | import sys 34 | import timeit 35 | import heapq 36 | from itertools import product,izip,ifilter 37 | from collections import Counter 38 | 39 | def read_file(input_file): 40 | f = open(input_file) 41 | M,N,Spectrum = [item.strip() for item in f.readlines()] 42 | f.close() 43 | return (int(M),int(N),map(int,Spectrum.split(' '))) 44 | 45 | def select_spectrum(Spectrum,M): 46 | poten = [pt1-pt2 for pt1,pt2 in product(Spectrum,Spectrum) if pt1-pt2 >= 57 and pt1-pt2 <= 200] 47 | lst = Counter(poten).most_common() 48 | tie = heapq.nlargest(M,lst,key=lambda x:x[1])[-1][1] 49 | res = list(ifilter(lambda x: x[1]>=tie, lst)) 50 | return list(izip(*res))[0] 51 | 52 | def score(theorectical_spectrum,experimental_spectrum): 53 | return len(list((Counter(theorectical_spectrum) & Counter(experimental_spectrum)).elements())) 54 | 55 | def generate_subspectrums(peptide): 56 | l = len(peptide) 57 | looped = peptide + peptide 58 | return [0,sum(peptide)]+[sum(looped[start:start+length]) for start,length in product(range(0,l),range(1,l))] 59 | 60 | def cut(Leaderboard,Spectrum,N): 61 | if len(Leaderboard) > N: 62 | results = [] 63 | for Peptide in Leaderboard: 64 | try: 65 | Peptide_experimental_spectrum = generate_subspectrums(Peptide) 66 | except: 67 | Peptide = Peptide[0]+[Peptide[1]] 68 | Peptide_experimental_spectrum = generate_subspectrums(Peptide) 69 | results.append((Peptide,score(Spectrum,Peptide_experimental_spectrum))) 70 | tie = heapq.nlargest(N,results,key=lambda x: x[1])[-1][1] 71 | res = list(ifilter(lambda x: x[1]>=tie,results)) 72 | return list(izip(*res))[0] 73 | else: 74 | return Leaderboard 75 | 76 | def LeaderboardCyclopeptideSequencing(M,N,Spectrum): 77 | Leaderboard = [0] 78 | 
LeaderPeptide = [] 79 | table = select_spectrum(Spectrum,M) 80 | while Leaderboard != []: 81 | Leaderboard = [list(pt) for pt in product(Leaderboard,table)] 82 | for Peptide in Leaderboard: 83 | try: 84 | Peptide_experimental_spectrum = generate_subspectrums(Peptide) 85 | except: 86 | Leaderboard = [Peptide[0]+[Peptide[1]] if x == Peptide else x for x in Leaderboard] 87 | Peptide = Peptide[0]+[Peptide[1]] 88 | Peptide_experimental_spectrum = generate_subspectrums(Peptide) 89 | if max(Peptide_experimental_spectrum) == max(Spectrum): 90 | LeaderPeptide_experimental_spectrum = generate_subspectrums(LeaderPeptide) 91 | if score(Spectrum,Peptide_experimental_spectrum) > score(Spectrum,LeaderPeptide_experimental_spectrum): 92 | LeaderPeptide = Peptide 93 | elif max(Peptide_experimental_spectrum) > max(Spectrum): 94 | Leaderboard.remove(Peptide) 95 | Leaderboard = cut(Leaderboard,Spectrum,N) 96 | return LeaderPeptide 97 | 98 | def result(filename): 99 | M,N,Spectrum = read_file(filename) 100 | results = LeaderboardCyclopeptideSequencing(M,N,Spectrum) 101 | return results[1:] 102 | 103 | if __name__ == "__main__": 104 | 105 | start = timeit.default_timer() 106 | results = result(sys.argv[-1]) 107 | print '-'.join(map(str,results)) 108 | print '' 109 | stop = timeit.default_timer() 110 | print stop - start 111 | -------------------------------------------------------------------------------- /chapter2/C2_26/26_7/dataset_26_7.txt: -------------------------------------------------------------------------------- 1 | 19 2 | 376 3 | 659 499 933 759 1553 895 551 1563 236 113 1216 612 1069 438 300 241 353 1332 147 1061 1390 1376 1232 1316 982 1665 218 312 1691 1665 1757 516 1828 137 266 1566 163 679 1174 816 1431 1715 250 887 1369 615 560 1440 778 1429 1729 541 1578 633 1608 1562 919 349 998 1312 1379 1032 204 791 1050 1771 683 1268 137 1461 1450 1009 1329 262 796 378 163 979 1108 938 1691 675 367 216 399 1700 1153 1294 1174 323 1266 1213 782 1277 846 1681 541 1156 1082 496 1644 
1392 512 1012 1610 819 612 1255 57 654 796 1195 71 128 163 1213 1665 697 849 1175 428 1594 573 909 1037 1715 516 654 1145 1050 397 1216 113 1587 583 1287 99 1003 615 388 825 816 1479 672 941 436 265 904 1166 371 1245 835 1358 1725 1594 379 1131 425 1431 830 1400 993 1497 470 449 1169 1457 1053 1505 1502 1449 1032 234 0 331 1516 775 778 1403 1624 890 326 653 720 452 562 1046 924 596 1475 1012 534 163 1149 220 184 275 746 1592 662 767 103 1612 234 1757 459 1312 1287 1665 397 1528 71 4 | -------------------------------------------------------------------------------- /chapter2/C2_26/26_7/spectralconvolution.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 | # 9 | # Date created: 10 | # 11 | # 20 Nov 2013 12 | # 13 | # CODE CHALLENGE: Implement CONVOLUTIONCYCLOPEPTIDESEQUENCING. 14 | # 15 | # Input: An integer M, an integer N, and a collection of (possibly repeated) 16 | # integers Spectrum. 17 | # 18 | # Output: A cyclic peptide LeaderPeptide with amino acids taken only from the 19 | # top M elements (and ties) of the convolution of Spectrum that fall 20 | # between 57 and 200, and where the size of Leaderboard is restricted 21 | # to the top N (and ties). 
22 | # 23 | # Sample Input: 24 | # 20 25 | # 60 26 | # 57 57 71 99 129 137 170 186 194 208 228 265 285 299 307 323 356 364 394 422 493 27 | # 28 | # Sample Output: 29 | # 99-71-137-57-72-57 30 | # 31 | ############################################################################### 32 | 33 | import sys 34 | 35 | def read_file(input_file): 36 | f = open(input_file) 37 | raw_input = f.read() 38 | f.close() 39 | return raw_input 40 | 41 | def result(filename): 42 | data = [int(item) for item in read_file(filename).strip().split(' ')] 43 | results = [] 44 | for item1 in data: 45 | for item2 in data: 46 | if item1 != item2 and item1-item2 >= 0: 47 | results.append(item1-item2) 48 | return results 49 | 50 | if __name__ == "__main__": 51 | 52 | fw = open('output.'+sys.argv[-1][:-4]+'.txt','w') 53 | fw.write(' '.join(map(str,sorted(result(sys.argv[-1]))))) 54 | fw.close() 55 | -------------------------------------------------------------------------------- /chapter2/C2_26/26_7/test.convolutioncyclopeptidesequencing.extra.txt: -------------------------------------------------------------------------------- 1 | 16 2 | 84 3 | 672 669 1075 761 690 464 579 344 841 1142 1291 633 1204 438 474 882 399 853 879 301 637 766 303 1275 985 216 595 113 781 668 1191 1086 1249 458 87 273 601 1248 486 1176 729 511 417 975 546 224 1261 1116 682 596 71 214 938 87 295 1275 1138 966 1174 367 568 725 759 1233 816 1305 1263 896 658 129 1215 300 396 924 325 287 0 1067 171 945 1089 466 704 1154 434 881 1158 1305 995 898 99 580 1199 276 382 158 612 1029 509 693 851 603 928 186 767 888 1037 805 270 57 963 295 521 204 1092 680 360 876 377 783 208 147 794 1146 387 1225 483 980 1148 480 137 1061 581 246 188 417 750 101 694 1018 481 904 333 945 1067 1059 57 213 1362 1002 424 114 163 1149 1062 220 782 557 4 | -------------------------------------------------------------------------------- /chapter2/C2_26/26_7/test.convolutioncyclopeptidesequencing.txt: 
-------------------------------------------------------------------------------- 1 | 20 2 | 60 3 | 57 57 71 99 129 137 170 186 194 208 228 265 285 299 307 323 356 364 394 422 493 4 | -------------------------------------------------------------------------------- /chapter3/C3_36/dataset_36_7.txt: -------------------------------------------------------------------------------- 1 | 5 2 2 | CGTCGCGAAAGGCATTTTTATGCGT 3 | CTGCCCCGATTTGAGACCCGTGTCG 4 | CGGCGGCACGTTGCGTAGTGCCGCC 5 | TAAGCGGTCGCTGGAAGCTCTAACA 6 | TATGCCAGAGTACAAGGACGTGGGT 7 | GACGGACAGGCGTCGGACGCAGTTC 8 | -------------------------------------------------------------------------------- /chapter3/C3_36/motifenumeration.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 | # 9 | # Date created: 10 | # 11 | # 27 Nov 2013 12 | # 13 | # CODE CHALLENGE: Implement MOTIFENUMERATION (reproduced below). 14 | # Input: Integers k and d, followed by a collection of strings Dna. 15 | # Output: All (k, d)-motifs in Dna. 16 | # 17 | # Sample Input: 18 | # 3 1 19 | # ATTTGGC 20 | # TGCCTTA 21 | # CGGTATC 22 | # GAAAATT 23 | # 24 | # Sample Output: 25 | # ATA ATT GTT TTT 26 | # 27 | # Note: 28 | # Given a collection of strings Dna and an integer d, a k-mer is a (k,d)-motif 29 | # if it appears in every string from Dna with at most d mismatches. 
#
###############################################################################

import sys
import timeit
from itertools import combinations,product

try:
    # Third-party module; this script no longer needs it (find_kmer is pure
    # Python), so a missing installation must not stop the script.
    import regex
except ImportError:
    regex = None


def read_file(input_file):
    """Parse a motif-enumeration input file.

    The first non-blank line holds the two integers k and d; every other
    non-blank line is one DNA string.  Blank lines are skipped so that a
    trailing empty line is not mistaken for an (unmatchable) empty string.

    Returns the tuple (k, d, Dna) where Dna is a list of strings.
    """
    with open(input_file) as f:
        data = [line.strip() for line in f if line.strip()]
    k, d = map(int, data[0].split())
    return (k, d, data[1:])


def correct(seq, k):
    """Return the set of all k-mers that occur in seq."""
    return set(seq[i:i + k] for i in range(len(seq) - k + 1))


def correct_kmers(Dna, k):
    """Return a frozenset of the k-mers that occur in any string of Dna."""
    return frozenset().union(*[correct(seq, k) for seq in Dna])


def generate(s, d):
    """Yield every string that differs from s in exactly d positions.

    Replacement letters are drawn from 'ACGT'.  For d == 0 the only string
    yielded is s itself; for d > len(s) nothing is yielded.
    """
    letters = 'ACGT'
    for indices in combinations(range(len(s)), d):
        # Only letters that really change the position are candidates.
        alternatives = [[c for c in letters if c != s[i]] for i in indices]
        for replacements in product(*alternatives):
            chars = list(s)
            for i, a in zip(indices, replacements):
                chars[i] = a
            yield ''.join(chars)


def possible_kmers(k, d, Dna):
    """Return the candidate motifs: all k-mers within d mismatches of some
    k-mer of Dna.

    The range starts at 0 mismatches so that k-mers occurring verbatim in
    Dna are candidates too (required for d == 0, and for motifs that appear
    unchanged in every string).
    """
    possibles = set()
    for s in correct_kmers(Dna, k):
        for dd in range(d + 1):
            possibles.update(generate(s, dd))
    return possibles


def find_kmer(seq, kmer, d):
    """Return the windows of seq (overlapping, left to right) that match
    kmer with at most d mismatches."""
    k = len(kmer)
    hits = []
    for i in range(len(seq) - k + 1):
        window = seq[i:i + k]
        if sum(a != b for a, b in zip(window, kmer)) <= d:
            hits.append(window)
    return hits


def kmer_composition(k, d, Dna):
    """Return the sorted list of all (k, d)-motifs of Dna.

    A k-mer is a (k, d)-motif when it appears in every string of Dna with
    at most d mismatches.
    """
    return sorted(kmer for kmer in possible_kmers(k, d, Dna)
                  if all(find_kmer(seq, kmer, d) for seq in Dna))


def result(filename):
    """Read the input file and return its (k, d)-motifs."""
    k,d,Dna = read_file(filename)
    return kmer_composition(k,d,Dna)


if __name__ == "__main__":

    # NOTE: this section is unchanged; its remaining statements (closing the
    # output file and reporting the elapsed time) follow this block.
    start = timeit.default_timer()
    results = result(sys.argv[-1])
    fw = open('output.'+sys.argv[-1][:-4]+'.txt','w')
    fw.write(' '.join(map(str,results)))
fw.close() 104 | stop = timeit.default_timer() 105 | print stop - start 106 | -------------------------------------------------------------------------------- /chapter3/C3_36/output.dataset_36_7.txt: -------------------------------------------------------------------------------- 1 | GCGTT AAATG GCCCG GCCCA GAGAC GCCCC GCCTA GCCCT AAATT GCGTG GCGTA CCTAG TGGTG AGTCT GTGGG GTGGA TGGTA GTGGC CGCAC GATAG CGCAA GCCTG CGCAG GATAC AGACG CTGCA GTGGT TGGTT CCCCA TTACA CCCCG TCGCC AGCAC TCGCG AGCAG CCAAC TCGCT GATCC GAAAG GGGTA TGGAG TGTTG TGTTA TGTTC GACTG TATAG GACTC CTAGC GCACC CCTTG TGGCG TGGCC TGGCA TGGCT GCTTG GCTTA CGGCA CGGCC CACTA CACTG CGGCG AATGG CGGCT CACTT TGACT ATACG AGACA GGGCG GGGCC TGACG TGACA AGTTT TGAGA GCCGA GCCGC GCCGG AGTTA AGTTC AGTTG AGGTC GCCGT TAACG GTGCA GTGCC CGGGT CGATT GTGCG CACAG CACAA ATTGG CGATA GCTCG ACATC CTGCT TATGC ATGAC GGCGC ATGAG AATTG CTGCG ACCCG TTACG CTGCC GGTGG GGTGC GGTGA ACTTT GAACC GGTGT CGTAG CGTAA CGTAC GCGTC TGAGT ACTTG TAGCG ACTTA ACTTC TAGCC TACCG CAACT ACGAT GTAAG CAACG GGAGC AGGAG GGAGG AGGAC ATCCG TGGGC TGGGA TGGGG ATGTC AGGAT ATGTA ATGTG GGAGT TGGGT TTAGA AAGTC ACAAG TTAGG AAGTG GGAGA GAGCC TTGCG GAGCA AAGTT TTAGT TGCGT ATTGC TTGGA CACCT TTGGG GGGGT TGCGG TAGGA TGCGC TGCGA GTCTA TTGGT TAGGC CCAAG TAGGG AGATT TCCAG CCGCG AGATG CCGCA CCGCC CGGTT TAAGG CCACT CGGTG CGGTA TAAGT CGAAG GTCCG AAGAG CGAAC GCTGC CGAAT AAGAT TCATA GCTGT CTGGA CTGGC CTGGG ACCGG GGTCC GGTCA GGTCG ACCGA ACCGT TGTAG CCTGG TGTAC TACAC CCTGT CAAGA TTGCA AGTCG AGTCA AGTCC GCGAG GCGAC GCGAA GTCCT GGACG GAAAT CAAGT GGACA ACGCT GCGAT TGAAC GAAGT AACGT CGAAA GAAGA GTTAG GAAGC ATTGA AACGG AACGA AACGC GAGGG GAGGC GAGGA ATTCG TGCCT GTATC GTATA GAGGT GTATG ACGAA GAGCG TGCCA TGCCG ACGAG GTCCA GTCCC AATCC GGCAC CCGGA CCGGC CCGTA CCGGG AGCGT AGCGC AGGGC AGCGA AGCGG GATTC AGGGA GATTG TCAGT CAGGA CAGGG CTTCC CTTCA TCAGG CTTCG TCAGC TTGTT TTGGC GACAC GACAG TTGTC TTGTA GGATT CGTTG CGTTC CGTTA GTCTT CATTG CCTCA CATTA CCTCG CATTC GGATA CGTTT AGTGA CGCCG AGTGC CGCCA 
CGCCC AGTGG CAGGC TTCGT CGCCT AGTGT ATGGG ATGGC CCAGG GAACT GGGGA GAACA GGCGA GGGGC GAACG CGCGT ATGTT ATCGT GTCTG ATCGG CAGGT GCCTC GTCGT GTAGC GTAGA CTAAG GCACT GTCGA GTCGC GCCTT GCACA GCACG CGCGA GGGTT CAGCT CAGCG CAGCC CAGCA ACTGT TCACA TCACG CTTGG GAGTC CTTGC GAGTG TGAAG ACTGG GAGTT TACGA GCCAC GCCAA GCCAG GCCAT GTGAC TGAGG GTGAG AGAAG CGCGC GATCG AGAAC CGCGG TCGAA GCGCA TCGAC AGCCG ATGCC CAGTT ATGCA AGCCC AGGCA AGCCA ATGCG CTGAG CTGAA CAGTG TCGAG CAGTC CCACA CAGTA TAGAT CGTCG CGTCC CGTCA TAGAG TAGAC TACGT TAGAA TATCG AAACG CTACG GTACC TCCTA TGCTG GCGGG GCGGA ACGTC AGCCT CTACA TCGTG GCAGA ATGCT TGGAT AGCTT GCAGT TCGTT GTTGC GTTGA CGGAT GTTGG CGGAG ACACT GAGAG GAGAA CGGAA GTTGT CCATG CCATC GGGAT GAATG GAATA GGGAC GGGAA ATTTC GGGAG GAATT TGTGG TGTGA TGTGC ACTCG TATGG GTGTC GGCGT CAATG CGAGG GGCTT TAAAG GATGT CACCG GATGA GATGC GATGG AACTT AAGCG GCTGA AAGCC GCTAC AACTG GTGTA CGTCT TTCGG GTTCG TTCGC TTCGA CGTGC CGTGA CGTGG TGTCA TGTCC TGTCG CCGTT CAAAT TGTCT TACGC AGTAG CCGTG AGGCC GCGCC AGGCG TCTGT GCGCG GGAAA GGAAC CATGG CATGA CATGC TCTGC AGGCT GGAAT ACAGC ACAGG ATTAC CTTTG GTTCA ACAGT CTCGT ACGGT TACCC AATAG TCCCA TTGAA ACGGA CTCGG ACGGG CTCGA GCGGT GTCAG TACTC GGCCT TGATG CCGAT GGCCA GGCCG CCGAC CGGGA CCGAA TCGGT TTGAC AGGTG GTTTG GCGGC TCGGG TCGGA CGACC CGACA CGACG AAGGC TTTGG TTTGA AACAC CTTAG CTTAC AATTC CAGAA GGTAT GACGC GACGA GACGG GGTTA AAAGC TCGTC GGTAC GGTAG CGAGC TCGTA GACGT ACGCG TGCAA TGCAC ACGCC CAGAG ACGCA TGCAG GCAGC TAGTC TAGTG TGGAA TGCAT GCATG TAGTT GTCAC GGATG CCCGT AGAGG AGGGT AGAGA AGAGC AGAGT CCCGG AGACC CCCGA CCCGC CATCG CCAGT CACGG CACGC CACGA CCTGC GTGTT TCCGT CTGTA GGCCC CTGTG CGATG TCCGC TCCGA TCCGG TGCTC ACGTG GCAAT ACACG ACGTT TGCTT GGATC ACACA GCAAC GCTCA AGGTT TCGAT CGGGG CAGAT CGGGC CGCTA CGCTG CAGAC GGACT GGTTG CGAGA TCAAG ATAGG ATAGA ATAGC GACCT TATTC CATAC GACCG CGGAC GACCC -------------------------------------------------------------------------------- /chapter3/C3_36/output.test.motifenumeration.extra.txt: 
-------------------------------------------------------------------------------- 1 | GCGTT CCTAT GCGTC TATGC CCTAC AGTCT GGCTA TGGTA AGACA TTACT GTAGA CTGCA TTACG TTACA AGACT TTACC TCGCC TTCAA AGCAT TCGCT GCACT GATCC TCATG TATAT GACTT AACAC GAAAT TGTTA AACAT TATAA TATAC CTAGC CTAGA CTAGG CCTTT CTAAT CCTTC CCTTA CCTTG CTAGT TAATC TTCAT CGTAA GCTTA CGGCA CACTA GCTTT AATGC CACTT TAGCG ACTAT TGACT ATACA ATACC ACTAA ACTAG GATAG TGACA TGACC GGTAC GATAC AGTTG AGGTC AGGTA GCCGT TAACC TAACA GTGCA CGATT GTGCG CACAG ATTGG CACAC CGATA CGATG ACATC ATGAT TCTTA ACCCT ATGAC AATTG TCTTT GATAT CTGCC CTCAT ACTTT CGTAC ATCTC CGCAT ACTTA ACTTC TAGCC TGAGA CAACT GTAAG ATCTG GTAAA TGAGT ATCTA CAATA CAACC TATGT ATCCG TCTAC TCTAG ATGTC AGGAT ATGTA TCTAT TTAGA TTAGC AAGTC TTAGG GAGCT GTTAA GTTAC TCCAC GTTAG ATTCG ATTCA TTAGT ATTCC AATCC GTTAT GTCGT TGCGT TTGGA TTGGG TAGGA TGCGA GTCTA GTCTG TTCAC CCGCT TCCAG AGATA AGATC CGGTT TAAGC TAAGA CGGTA CGGTC AAGAA GTCCG CGAAA ATAAC GCTGA CCCTA TCACC CCCTT TCATA GCTGT TCACG GGTCG CTATT ACCGT CCAAG ACAAC TGTAG TGTAA TACAC TACAG TGTAT CAAGA CAAGC AGTCC CATAC CATAA CATAG TACTC GGACA GCGAT CATAT TGAAC GAAGT ACTGC ATTGT AATCT ATTGA ATTGC TCCAT ATCCT GTATT ACGAT TGCCT GTATC GTATA GTATG TCCAA TTGCG ACGAC TGCCA TGCCG TTGCA GTCCT TACTG TACTA CCGTA AGCGT CCGGT GATTA GATTC GATTG TTTAC TTTAA TTTAG GATTT CTTCA TCAGA TTTAT CTCTT GACAC TTGTG CTCTA CATTT GGATT CGTTA CCTCC CATTG CCTCA CATTA GGATA CGTTT AGTGA CGCCA TTTTG TTTTA ACGAG GCCTA ATGGC ATGGA GAACT GTGAA GAACA TAATA AACCT ATCGC ATCGA GTGAG CAGGT CTAAG CTAAC GTCGA GTCGC GCCTT CGCGA TTATT TTATA TTATC TTATG ACCTT GATAA ACCTC ACCTA ACCTG GACAT ACTGT TGAAT ATAAA CTGTT CTTGA TCACT ACTGA TGAAA GCCAT GATCT AGAAT TTAAT AGAAG GATCG AGAAC GATCA AGAAA ACCAT TTAAC TTAAA CCGTG TTAAG TCGAC TTCCT ATGCC AGCCA TTCCC CTGAG TTCCA CTGAA TCGAT CAGTA TAGAT CGTCG CGTCA TAGAG TAGAC TAGAA TATCC CGTCT TATCG CTACG TCCTG GCGGT GTACA GTACG TCCTA TCCTT CTACC TCGTC AGCTG AGCTC TCTCC AGCTA CTATA TGGAT TCTCT AGCTT GAGAT ACACG CGGAT ACACC CCATT 
ATATA ATATC ACACT CGGAC ATATT CCATA ATTTT GGGAT GAATG ACTCT ACGTT ATTTA ATTTC ATTTG GAATT ACTCA TGTGG ACTCC ACTCG TATGG TCGAA CAATT CGAAG TATGA CAGTC CACCT TAAAG GATGT TAAAC AACTT GCTAG GCTAA GCTAC AAGCT AACTG CGCAA AACTA AACTC GCTAT TATCA TTCGG ATTAC TTCGA TTCGT GTTCC TACGT TGTCC TGTCG AGTAT CCGTT TACGA TACGC CCGTC CAAAG AGTAC AGTAA TCTGT CATGA TCTGG GTACC CATGT GGAAT CTTTT ATTAT ACAGA ACAGC GTTCG CTTTC ATTAA CTTTA ATTAG CTTTG GTTCA CTCGT AATAC TTGAT TACCC ACGGT AATAT TTGAG TCCCT GTCAG GGTCA GTCAA TGATA TGATC TGATG CCGAT CCGAC CCGAA TCCCA TGATT TTGAA ACACA TCGGT TTGAC GTTTA TCGGG TCGGC TCGGA CGACA CTTAT TTTGG TTTGA TACCG CGACT CTTAG CTTAA AAGGT CTTAC AATTC CAGAA GGTAT GGTTA TTTCC GGTAG GACGT CTATG TGCAA TGCAC CTATC GCATT TGCAG TAGTC TACCT TGGAA GCATC ACGCT TGCAT CCCGT CATCT TAACT CATCC CACGT CCATG TCCGT CTGTA TTCTA TTCTG TCCGC TCCGA ATCCC ATCAG CTCCG ACGTC CTACA GCAAT GTTGA CTACT AAACT TGCTT GGATC ACATG ATATG CGAGT CAGAT CGCTA ATGCA CGCTC TCAAT TTTCA CAGAC AGTCG TTTCG TCAAC CGCTT AAATC ATAGA GACCT TGAGC TATTA TATTG GAGAA GACCA -------------------------------------------------------------------------------- /chapter3/C3_36/output.test.motifenumeration.txt: -------------------------------------------------------------------------------- 1 | ATA ATT TTT GTT -------------------------------------------------------------------------------- /chapter3/C3_36/test.motifenumeration.extra.txt: -------------------------------------------------------------------------------- 1 | 5 2 2 | TCTGAGCTTGCGTTATTTTTAGACC 3 | GTTTGACGGGAACCCGACGCCTATA 4 | TTTTAGATTTCCTCAGTCCACTATA 5 | CTTACAATTTCGTTATTTATCTAAT 6 | CAGTAGGAATAGCCACTTTGTTGTA 7 | AAATCCATTAAGGAAAGACGACCGT 8 | -------------------------------------------------------------------------------- /chapter3/C3_36/test.motifenumeration.txt: -------------------------------------------------------------------------------- 1 | 3 1 2 | ATTTGGC 3 | TGCCTTA 4 | CGGTATC 5 | GAAAATT 6 | 
-------------------------------------------------------------------------------- /chapter3/C3_38/dataset_38_7.txt: -------------------------------------------------------------------------------- 1 | 6 2 | TGGGAACCAGTGTGCACACGAACGGGGACCGATGCGAATGGC 3 | CTAGTGGACAAAAAACTGCTGATTCATAAGGGGACGTGGTAT 4 | GGGACCAGATCCTAAGACAAAAGATGGGAGGGTCTGGGTCAG 5 | GGGACGGGGAAACTAAAAAACATTGTACGTCAAGAGTCCCGT 6 | GCGTGGCTGGCTAGTTGCACCCAGGTCGGGGGGACACCAAGC 7 | GGGTGGACCGCTAGCTTCTCGATGGGCCTAGGGACACCCTTA 8 | ATACAGGGCGAATAACTTACCGCCCGCACCGGGACTGGATCG 9 | GGGATATTGCGGGGGACTGACGGCCTGTGGCCAAAGCTGCCC 10 | GAGTAATTCAACGGGACAACAAGTGGCTTCCTAGGGGACATC 11 | ACGGGACCGCGGCAACATCTCAGCGGGACACCAAAATAGACG 12 | -------------------------------------------------------------------------------- /chapter3/C3_38/medianstring.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 | # 9 | # Date created: 10 | # 11 | # 28 Nov 2013 12 | # 13 | # CODE CHALLENGE: Implement MEDIANSTRING. 14 | # Input: An integer k, followed by a collection of strings Dna. 15 | # Output: A k-mer Pattern that minimizes d(Pattern, Dna) among all k-mers Pattern. 
16 | # 17 | # Sample Input: 18 | # 3 19 | # AAATTGACGCAT 20 | # GACGACCACGTT 21 | # CGTCAGCGCCTG 22 | # GCTGAGCACCGG 23 | # AGTACGGGACAG 24 | # 25 | # Sample Output: 26 | # GAC 27 | # 28 | # Note: 29 | # d(Pattern, Text): the minimum Hamming distance between Pattern and any k-mer in Text 30 | # 31 | ############################################################################### 32 | 33 | import sys 34 | import timeit 35 | import regex 36 | import heapq 37 | import operator 38 | from itertools import combinations,product,izip,ifilter,chain 39 | from collections import Counter,defaultdict 40 | 41 | def read_file(input_file): 42 | f = open(input_file) 43 | data = [item.strip() for item in f.readlines()] 44 | f.close() 45 | return (int(data[0]),data[1:]) 46 | 47 | def correct(seq,k): 48 | return set(seq[i:i+k] for i in range(len(seq)-k+1)) 49 | 50 | def hamming_distance(pattern,text,k): 51 | return sum([1 for i in range(k) if pattern[i] != text[i]]) 52 | 53 | def possibles(k): 54 | ''' 55 | >>> possibles(k) 56 | ['AAA', 'AAC', 'AAT', 'AAG', 'ACA', 'ACC', 'ACT', 'ACG', 'ATA', 'ATC', 'ATT', 'ATG', 'AGA', 'AGC', 'AGT', 'AGG', 57 | 'CAA', 'CAC', 'CAT', 'CAG', 'CCA', 'CCC', 'CCT', 'CCG', 'CTA', 'CTC', 'CTT', 'CTG', 'CGA', 'CGC', 'CGT', 'CGG', 58 | 'TAA', 'TAC', 'TAT', 'TAG', 'TCA', 'TCC', 'TCT', 'TCG', 'TTA', 'TTC', 'TTT', 'TTG', 'TGA', 'TGC', 'TGT', 'TGG', 59 | 'GAA', 'GAC', 'GAT', 'GAG', 'GCA', 'GCC', 'GCT', 'GCG', 'GTA', 'GTC', 'GTT', 'GTG', 'GGA', 'GGC', 'GGT', 'GGG'] 60 | ''' 61 | return [''.join(item) for item in product('ACTG',repeat=k)] 62 | 63 | def distance(kmer,dna_string,k): 64 | ''' 65 | >>> distance('AAA','AAATTGACGCAT',k) 66 | ('AAA', 0) 67 | >>> distance('GGC','AAATTGACGCAT',k) 68 | ('GGC', 1) 69 | ''' 70 | return (kmer,min([hamming_distance(kmer,sk,k) for sk in correct(dna_string,k)])) 71 | 72 | def kmer_Dna_distance(kmer,Dna,k): 73 | ''' 74 | Note: calculate d(Pattern, Dna) 75 | >>> Dna 76 | ['AAATTGACGCAT', 'GACGACCACGTT', 'CGTCAGCGCCTG', 'GCTGAGCACCGG', 
'AGTACGGGACAG'] 77 | >>> kmer_Dna_distance('AAA',Dna,k) 78 | {'ACC': [('GACGACCACGTT', 0), ('GCTGAGCACCGG', 0), ('AAATTGACGCAT', 1), ('CGTCAGCGCCTG', 1), ('AGTACGGGACAG', 1)], 79 | 'ATG': [('AAATTGACGCAT', 1), ('GACGACCACGTT', 1), ('CGTCAGCGCCTG', 1), ('GCTGAGCACCGG', 1), ('AGTACGGGACAG', 1)], 80 | ... 81 | 'TCT': [('CGTCAGCGCCTG', 1), ('GCTGAGCACCGG', 1), ('AAATTGACGCAT', 2), ('GACGACCACGTT', 2), ('AGTACGGGACAG', 2)]} 82 | ''' 83 | return [(distance(kmer,dna_string,k)[0],(dna_string,distance(kmer,dna_string,k)[1])) for dna_string in Dna] 84 | 85 | def all_kmers_Dna_distance(Dna,k): 86 | apks = possibles(k) 87 | d = list(chain(*[kmer_Dna_distance(kmer,Dna,k) for kmer in apks])) 88 | df = defaultdict(tuple) 89 | for tup in d: 90 | df[tup[0]] += (tup[1],) 91 | return dict([(key,sorted(value,key=lambda x:x[1])) for key,value in df.iteritems()]) 92 | 93 | def medianstring(Dna,k): 94 | ''' 95 | Note: calculate d(Pattern, Dna) 96 | ''' 97 | akDd = all_kmers_Dna_distance(Dna,k) 98 | result = [(key,sum([item[1] for item in value])) for key,value in akDd.iteritems()] 99 | return sorted(result,key=lambda x:x[1])[0] 100 | 101 | def result(filename): 102 | k,Dna = read_file(filename) 103 | return medianstring(Dna,k)[0] 104 | 105 | if __name__ == "__main__": 106 | 107 | start = timeit.default_timer() 108 | results = result(sys.argv[-1]) 109 | print results 110 | stop = timeit.default_timer() 111 | print stop - start 112 | -------------------------------------------------------------------------------- /chapter3/C3_38/test.medianstring.extra.txt: -------------------------------------------------------------------------------- 1 | 6 2 | TGATGATAACGTGACGGGACTCAGCGGCGATGAAGGATGAGT 3 | CAGCGACAGACAATTTCAATAATATCCGCGGTAAGCGGCGTA 4 | TGCAGAGGTTGGTAACGCCGGCGACTCGGAGAGCTTTTCGCT 5 | TTTGTCATGAACTCAGATACCATAGAGCACCGGCGAGACTCA 6 | ACTGGGACTTCACATTAGGTTGAACCGCGAGCCAGGTGGGTG 7 | TTGCGGACGGGATACTCAATAACTAAGGTAGTTCAGCTGCGA 8 | TGGGAGGACACACATTTTCTTACCTCTTCCCAGCGAGATGGC 9 | 
GAAAAAACCTATAAAGTCCACTCTTTGCGGCGGCGAGCCATA 10 | CCACGTCCGTTACTCCGTCGCCGTCAGCGATAATGGGATGAG 11 | CCAAAGCTGCGAAATAACCATACTCTGCTCAGGAGCCCGATG 12 | -------------------------------------------------------------------------------- /chapter3/C3_38/test.medianstring.txt: -------------------------------------------------------------------------------- 1 | 3 2 | AAATTGACGCAT 3 | GACGACCACGTT 4 | CGTCAGCGCCTG 5 | GCTGAGCACCGG 6 | AGTACGGGACAG 7 | -------------------------------------------------------------------------------- /chapter3/C3_39/39_3/dataset_39_3.txt: -------------------------------------------------------------------------------- 1 | CTTTTGTCGAAGTAGCTCATGACCCCTGGTGCAATTGCAATCTATATCTTGGTAAATAATCGTCGGTAGGTGCGCCCCCACAACAAGGAGGGTGAGGAGATCCAGCGTGATTCATTACGACTGTGGTTTTCAGGAGCTGTCTCTAAAAGCTCAGAGGAAAGTCCAAGGACTCCCCAGCCTCTAAGGACTTGACTTCTATC 2 | 7 3 | A C G T 4 | 0.357 0.25 0.214 0.179 5 | 0.214 0.25 0.214 0.321 6 | 0.25 0.25 0.286 0.214 7 | 0.179 0.25 0.179 0.393 8 | 0.25 0.107 0.179 0.464 9 | 0.179 0.321 0.214 0.286 10 | 0.179 0.214 0.286 0.321 11 | -------------------------------------------------------------------------------- /chapter3/C3_39/39_3/pmpkp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 | # 9 | # Date created: 10 | # 11 | # 28 Nov 2013 12 | # 13 | # Profile-most Probable k-mer Problem: 14 | # Find a Profile-most probable k-mer in a string. 15 | # Input: A string Text, an integer k, and a k * 4 matrix Profile. 16 | # Output: A Profile-most probable k-mer in Text. 17 | # 18 | # CODE CHALLENGE: Solve the Profile-most Probable k-mer Problem. 
#
# Sample Input:
#     ACCTGTTTATTGCCTAAGTTCCGAACAAACCCAATATAGCCCGAGGGCCT
#     5
#     A C G T
#     0.2 0.4 0.3 0.1
#     0.2 0.3 0.3 0.2
#     0.3 0.1 0.5 0.1
#     0.2 0.5 0.2 0.1
#     0.3 0.1 0.4 0.2
#
# Sample Output:
#     CCGAG
#
###############################################################################

import sys
import timeit
try:
    # Not used by this script; guarded so a missing third-party package
    # does not stop the module from loading.
    import regex
except ImportError:
    regex = None
import heapq
import operator
import numpy as np
from itertools import combinations, product, chain
try:
    # Python 2 only names; aliased on Python 3 so the import block still loads.
    from itertools import izip, ifilter
except ImportError:
    izip, ifilter = zip, filter
from collections import Counter, defaultdict


def read_file(input_file):
    '''
    Parse a Profile-most Probable k-mer dataset.

    Non-blank lines, in order: Text, the integer k, the column symbols
    (e.g. "A C G T"), then one whitespace-separated row of probabilities
    per k-mer position. Blank lines are ignored and any run of whitespace
    separates fields.

    Returns (text, k, order, profile) where order is a list of symbols and
    profile is a 2-D numpy array of floats built from the probability rows.
    '''
    with open(input_file) as handle:
        rows = [line.strip() for line in handle]
    rows = [row for row in rows if row]
    # A list comprehension (not map) so each row is a real list on Python 3.
    matrix = [[float(cell) for cell in row.split()] for row in rows[3:]]
    return (rows[0], int(rows[1]), rows[2].split(), np.asarray(matrix))


def correct(seq, k):
    '''
    Return the set of distinct k-mers (length-k windows) of seq.

    >>> sorted(correct('AAAT', 3))
    ['AAA', 'AAT']
    '''
    return set(seq[i:i + k] for i in range(len(seq) - k + 1))


def compute(kmer, order, profile):
    '''
    Return the probability of kmer under profile.

    profile[i][j] is read as the probability of symbol order[j] at position
    i; profile may be a numpy array or a nested list.

    Raises ValueError (from list.index) for a symbol that is not in order.

    >>> order = ['A', 'C', 'G', 'T']
    >>> profile = [[0.2, 0.4, 0.3, 0.1],
    ...            [0.2, 0.3, 0.3, 0.2],
    ...            [0.3, 0.1, 0.5, 0.1],
    ...            [0.2, 0.5, 0.2, 0.1],
    ...            [0.3, 0.1, 0.4, 0.2]]
    >>> abs(compute('GCCTA', order, profile) - 0.00027) < 1e-12
    True
    '''
    probability = 1
    for position, symbol in enumerate(kmer):
        probability *= float(profile[position][order.index(symbol)])
    return probability


def pmpkp(text, k, order, profile):
    '''
    Score every distinct k-mer of text against profile.

    Returns a list of (kmer, probability) tuples, one per distinct k-mer,
    ordered by first occurrence in text so that callers selecting the best
    entry break ties deterministically.
    '''
    seen = set()
    scored = []
    for start in range(len(text) - k + 1):
        kmer = text[start:start + k]
        if kmer in seen:
            continue
        seen.add(kmer)
        scored.append((kmer, compute(kmer, order, profile)))
    return scored


def result(filename):
    '''
    Read a dataset file and return its Profile-most probable k-mer.

    When several k-mers share the highest probability, the one occurring
    first in Text is returned. Raises ValueError when Text has no k-mer
    (Text shorter than k).
    '''
    text, k, order, profile = read_file(filename)
    scored = pmpkp(text, k, order, profile)
    if not scored:
        raise ValueError('Text contains no k-mer of the requested length')
    # max() returns the first maximal item, i.e. the earliest k-mer in Text.
    return max(scored, key=operator.itemgetter(1))[0]


if __name__ == "__main__":

    start = timeit.default_timer()
    results = result(sys.argv[-1])
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(results)
    stop = timeit.default_timer()
    print(stop - start)
-------------------------------------------------------------------------------- /chapter3/C3_39/39_3/test.pmpkp.extra.txt: -------------------------------------------------------------------------------- 1 | GGTATGCGCACTTCCGAAGAAGGATGCTCAATCATACAAGACACATTCCATCGAGGTAGTTTGACTGGCGAAGTCCCGACTCGCTCACAACTAGTATCCTGTGAAGTCCAGCGTTGAACGACGTGTTGGCTTTAAGCGCCCTGCTTTTCACCAGTTTCTCTCCTAAGTTCGTTCCAGGTCCAAACTGTGGCACTGCAAAT 2 | 7 3 | A C G T 4 | 0.357 0.357 0.179 0.107 5 | 0.393 0.179 0.143 0.286 6 | 0.179 0.179 0.321 0.321 7 | 0.214 0.179 0.321 0.286 8 | 0.286 0.25 0.25 0.214 9 | 0.286 0.25 0.179 0.286 10 | 0.393 0.143 0.214 0.25 11 | -------------------------------------------------------------------------------- /chapter3/C3_39/39_3/test.pmpkp.txt: -------------------------------------------------------------------------------- 1 | ACCTGTTTATTGCCTAAGTTCCGAACAAACCCAATATAGCCCGAGGGCCT 2 | 5 3 | A C G T 4 | 0.2 0.4 0.3 0.1 5 | 0.2 0.3 0.3 0.2 6 | 0.3 0.1 0.5 0.1 7 | 0.2 0.5 0.2 0.1 8 | 0.3 0.1 0.4 0.2 9 | -------------------------------------------------------------------------------- /chapter3/C3_39/39_5/dataset_39_5.txt: -------------------------------------------------------------------------------- 1 | 12 25 2 | ATGCTAGTTTGCCTTGAACTCCTGAACTAACCTTCCGTCGTCAGCCAAAACATGAGCTAATTCATGAGGAACATTTTTGTTGCCTGAGATGGGAAATCTGCGCGCTTGGGCCATGAAGAAAAGGTGAGTTCACGTATTTCATCGAATAATGGTCAG 3 | CCGTCCCTCCCGACGACGTGAATCTTATGTATGGGACGACTGCTCCACGACGTTTGTAATAAAAGTGCTTTCACGATGAGTTCACTGAGCCTTTTTGTTCCTGTTCGACATCATGATCAGTTCGCGTTGTTTCCTTGTAATCCGATCAAACTACGA 4 | TGCCTTGGAGCGCGCCCTTCCGCGTGAAGTCTCACGGGTAGGTGTAGTTGTTGAAACTATTGGACTCGTTAATATGATTTCATGACGCGGATCCGGAGCGTCGCCTCAGTCTCGTGGTAGATCGGAGAGAGGACGCTGAGTACAGTGTGCAACACG 5 | ATGAATTGCGAACGGACAGCTGAAGAACTCAGTATAAGGATGAGTACATAAAGAACCTCTCACCTCTTTCGTTTACAGCGGGGGATGCGTATAATGGACCCTTACACAGCTTTGCAGTAGAAAGGTCACAAAGTTGAGTATTTTGTCCTGCCGCCA 6 | 
GCAGATGAAATGACCTATATACCCGGGTGCCATTAGATTATACGTCTTCGCCCGTTGGGGCAAAACCCCAAGGCCTGTGCTCCCTGCGTAGCACTGACTTAGGACGTATCTATCCCTGTGCATATACAAATCATGTTGAGTACACCCTTGCGTTAC 7 | GTGTGACCATGTGGTTGGGTGCTGTGGTTCGCGTCAAATTCCAGAAACCGCGTAATGCTCCGACGTAGATTACGTCATCTGTGCACTAGCCGTCCGACGCTGAGCGCAATAGCGGGATGCTCAGCAGCGGAAACCCCCTATGGGAGTAGTTAGTTC 8 | AGTGTATCCCATTACATTGAGGCGCACGCCTAGCATAGGGCACACCTCCACCCCGGTGACCACAATTAAACTAGGCTGAGGGCATTAGAGTTCCGGTTATAATGGGTAGTATCAATCCGTCGCCGTAGCTATAACTGCCTGACGCAGTATTTCTAG 9 | GGGTCCGGTCGCACTTACCGAGATATGCTGAGATCAGCGTGAGGTGACCTCCCTTTCTTCGAGTTCGCATCTTGTTTACTACCGACCCACGTCACTTAAGCCGCCCGAAACATGGTTCCAACAACCATCGCGGGCGTATCCATGAGAGGACACGCG 10 | CTTCCGCATTTGATGGTGAGTACAAACACTGTAGAACAATGTACGGAAATTGTTCACGTCTGCCAAATCATGAAAGAAGCGCGATTGTCTCTGCTGGCACAGATAGTTCAGAAGACCGGTGGTCAAGCAAAAAGACTTTATTCAGCCGAGTATATT 11 | TGATCAGAGATGAGTAAAGAGTCTACCTGCACGGTAGTCGCTCAAGAACAATTTAGGGTGTGAAAGCTTATGTAGATACGATCTACGCTGAGTTCATGAAGTCCCAGGGCTGAATGCCGGGCCATTTTTGGCACGCCCAACAGCCGCTTGCTCCCT 12 | ACGTTGAGTACACAGGTAAGCTGTCGCTGCCACCCTCAAGGGTCTGCGGGTTAGACTCATCTCTAACAAGGTCCCGTATAGACTTCAATATCTCCCGAACTGCGCGGACTTACGTTCGACGATCGGTCCTCGTGCAGCTCGTTTGTCACAGGGGGA 13 | GGAAACCTCAGCAAACGGTCTATGTCGGTAACTGAGTAACCGTGCATTATGCTGCGCTAGAGGGTGAGTCCACGCCGTGCATCCGTATCACAATGCGTGAGCGTGACAAGCCGAGAGGCGATAACGTCGATACTTCGCAAATGTCGGGCAAAGCCC 14 | TACACCGAGACCTTTCTGGCGCGGAAATTGGTACCCATTGCGTTTGTATTCGGTTGGAACAAGGTGAGGCCATGTATGGCCAGTCGAGGTGACAGTCGACCTGCTATAAATGCTATAGTAACTAGGTTGTCGGCCAATCTAAGGGGGAGTACGAGC 15 | AGGGTTTGGGTCCGAGGCGGCAGGACTCGCATACCCTCTAGTGCCCGCAGCGATCTCATAACGAACAGTACATATTACCCAGAAATGAAAAGTCCTTTATAGGTCTGCTTGACATGGACTTGCACGGCCAAATGGCGCTGCGGGACGCTGAGCACA 16 | TTCCGGTTAAGCTATGTTGAGCTTCTTATATTCAAGTCTACTTCCTCCACGATGAGACCAGATTGCCGTAATTCGACGGGAATCCGTAAGTGTAGGTTGTATATATGTTCCGGCATCCGCGACGTTTATTGGCGAACCAATTCGGGGAGCAAGTCC 17 | ACGCTGAGACCAACGACTAAGCCACGTTTCAATAAAGTCTGCATTCTGTAGTGTCGTTAAGAGCGCTGGTGCGTCAAATCCCACAATGTCGAGGATGCCCCACGTCGATATGTTGACACTTGGTATCAAATGACTGTTATCAGTCCGTGTAATCCT 18 | 
CAGTCTTAAACTTGATGAACCTCCAATTGTTGTTTTGGTATCATCCCCCCAAGACTGTCATTCGAACCGCAAGACCTTCCGTACAGACTAGCAATTTCAAGGTGCAACACGCTGAGCGCATGGGAAATGACTACGAACACTATCCACACCCCATTA 19 | TAATGAGGGTGTCTCAAACCACTACAAGTGATGAATTAGAGCCAAATACATGGAAAGGATGTGCCGCTTGCACTATGCCGGCAGGTCTTCCTAGCTTACCTGGTCCAAAGGTTGAGCGCAACCAAGTCCGGGGCAGTGACCAACCCAACTCACGAT 20 | CGACTGGCGATGATACGCTATCTATGCGGTATGCTCTCTACCCCGACATGAATCACGAGCTTAGGTCGATCCGGCTCTAATAAAAGTGGAGCTTACAATCGTACTCTATCCGTCGGCTACCAGGTTGGAAACAAGGTGAGTTCAGAGTCTATATGG 21 | CCATGTGCATCAAAGGAGAGGGCTCCGCATCATATTATGTTGAGTGCAAGTCAAGAAAGCCCTCTATTAATTTGTCGTCACCATACTCGTCATGCTTACGTGCCCGAGGGACACTGCCGATGGGCCTTTAGATGTTCATCCACGTCGGCCACATGC 22 | ATGATGAGCGCAGGCCGATTGCGGTGATTGAATCTACAGCCTTGAATACCCTGAATAGGCATGGCATTTCCGCGGAAACTCCATAGCTTGGAAATGAGTCCAGTTAATTTTAAGCAGCTGCGTAGAAGCACGGTGCATTGCAATCAACCGCCCGCT 23 | ATGAGGACCGTTTACCAACAAATTCGGGGTGCCACTCCCTTAGCGCGCACTAACCATGCAATATCGATTATTGTGATGCAACACAAGGTGAGATCACGATTGGCAAGGACCCCCATCTTCCACCAAGACTTGAATGCTAGACTGCCAACCCGACCG 24 | GTGACCCCAGTGCCGCATTCTTTTCGGATCGGTTGAGGGGTTGCCATTAGGGCCTCTTGTCCGCTTGATTATGTGTGCTTTACAATGGGTGCACAAGGTATAAAAGTTCCATTTCCCCGCAGGCTGAGATCATAAAACTGCACTCCTGCTGGAAAC 25 | GTTGTCTGGAGTACGGTGAGGACAGGGATACCTCTAGCTAGGCAGTGCGAGCATAAACGATCCTTGGCGTTGGGTGAACGAGATAGTTCCTTCTTCGTGGCGAACCGCTAATTAGTCGCAAGATTCCCACTGTGAAGCAATTTAGTTAGGTGTCTG 26 | GAGTGCGACGATTCGCTACCCAGAAGTAGATCCTCACTATGTAGATGTCGAGACCGCTCGGTCGAACGCATCAGTCCCGCTGGCGTCTGCTCCCCAAAGGTTGTGCCCAGATGCAGCGGAGAGGACGCCTGTATGGTGAGCACACCTGTCCACCCC 27 | -------------------------------------------------------------------------------- /chapter3/C3_39/39_5/output.dataset_39_5.txt: -------------------------------------------------------------------------------- 1 | TTCATGAGGAAC 2 | CCGTCCCTCCCG 3 | TGCCTTGGAGCG 4 | ATGAATTGCGAA 5 | CCGTTGGGGCAA 6 | ACGCTGAGCGCA 7 | AGGCTGAGGGCA 8 | TCGCACTTACCG 9 | CTTCCGCATTTG 10 | ACGCTGAGTTCA 11 | ACGTTGAGTACA 12 | ATGCTGCGCTAG 13 | ATGTATGGCCAG 14 | ACGCTGAGCACA 15 | ACGATGAGACCA 16 | ACGCTGAGACCA 17 | ACGCTGAGCGCA 18 | AGGTTGAGCGCA 19 | AGGTTGGAAACA 
20 | ATGTTGAGTGCA 21 | ATGATGAGCGCA 22 | ACGATTGGCAAG 23 | AGGCTGAGATCA 24 | AGGCAGTGCGAG 25 | AGGTTGTGCCCA -------------------------------------------------------------------------------- /chapter3/C3_39/39_5/output.test.greedymotifsearch.extra.txt: -------------------------------------------------------------------------------- 1 | AGTGGGTATCTC 2 | TAAAAAGGTATA 3 | AACCACGAGTAC 4 | TGTCATGTGCGG 5 | AACCTAAACCCT 6 | AGTCGTTATCCC 7 | AGTAATATGTAC 8 | AGTGGTTATCAC 9 | AGTGGTTATCCC 10 | AGTGGCTATCGC 11 | AGTGGATATCCC 12 | AGTGAGAAGCAA 13 | AGTGACTAGACA 14 | TAAGACTAGTTA 15 | TATGAAGGGTGA 16 | AGTCGGGATAAC 17 | AGTGGGTATCTC 18 | AGCGGTTAGTCA 19 | AGTGAAATTCCT 20 | TGTGGATGGCTT 21 | TGTAGGTATCAC 22 | TGCAGATATCCA 23 | TGTGGTTATCAC 24 | TGTCATTATTCA 25 | TGCGTAGATCAA -------------------------------------------------------------------------------- /chapter3/C3_39/39_5/output.test.greedymotifsearch.txt: -------------------------------------------------------------------------------- 1 | CAG 2 | CAG 3 | CAA 4 | CAA 5 | CAA -------------------------------------------------------------------------------- /chapter3/C3_39/39_5/test.greedymotifsearch.extra.output.txt: -------------------------------------------------------------------------------- 1 | AGTGGGTATCTC 2 | TAAAAAGGTATA 3 | AACCACGAGTAC 4 | TGTCATGTGCGG 5 | AACCTAAACCCT 6 | AGTCGTTATCCC 7 | AGTAATATGTAC 8 | AGTGGTTATCAC 9 | AGTGGTTATCCC 10 | AGTGGCTATCGC 11 | AGTGGATATCCC 12 | AGTGAGAAGCAA 13 | AGTGACTAGACA 14 | TAAGACTAGTTA 15 | TATGAAGGGTGA 16 | AGTCGGGATAAC 17 | AGTGGGTATCTC 18 | AGCGGTTAGTCA 19 | AGTGAAATTCCT 20 | TGTGGATGGCTT 21 | TGTAGGTATCAC 22 | TGCAGATATCCA 23 | TGTGGTTATCAC 24 | TGTCATTATTCA 25 | TGCGTAGATCAA 26 | -------------------------------------------------------------------------------- /chapter3/C3_39/39_5/test.greedymotifsearch.extra.txt: -------------------------------------------------------------------------------- 1 | 12 25 2 | 
GATGGACCGGGCCATACATGGTGACACGCATCAGAAAGCTGTCCCCGTGCCTTATGCGCTGTCTGTTAGTACATCTCTCTCAATGGCCGTATTTTCAGAACAAGTATCACTTGGATCATCATCTACTCGACGGAGGGCGCGCAAGTGGGTATCTCG 3 | TAAAAAGGTATAAGGGAGTCATATCCGCAGTCCTAGTGACCTTTCCCGGCCCTAGCAGTGCTCCGATAGCCCATGGATGAGACGTAACTCGGCTACTGTTTGTGACTCAAGATAGTTGCCGTCGATATCTCGGATTCTGCTTATCGTGTTACGAGC 4 | AACCACGAGTACCTCTGTCGTGGTCCTTCACCAGGACTCGAAATTTGGCTCACGCCCAACGCCAAGATTACGTCGATCGTTCCTGTTGATATCTCGCCGCATATCAGGTTTATACTGATCGGCTCAGTGATTGTTAATCATCGGCGCGGGTTGTCA 5 | TGTCATGTGCGGATACAGTGGTGACACCAGGTCACCTCCGACCTAAAAGCGTTCAAGGTATGGCCCGAAGAGGTAGGTATCTCTTGTGCCCGCTGCTGCTATCACTCCCTGTGAGCCCCGACACGAATGTTAAACCAGTTTATATTCGCTCGTCAT 6 | AACCTAAACCCTTGCTTCCCACCATCCCTGCGAAGCAACCTTATCCGTAGTTCAGTCGCGCTGAACTCGGAAAGTGCGCTCACGAGATTCTAACTTACACTTGTTTAAGTACCACGTCCGGTGGATATCGCTGGCGTTGAAGGATAGTTCTGGTTA 7 | GTGCCCGATATGCCCGTCAGGGTTGAAGTCTGGGAAGTCGTTATCCCAAACGAAATACCCCGTTCGCAGGCGTGATGTATATCCTGATTAGTACCGCGTATAATATTAGTTTCGCACAGGAGCGTGCTTGTTTTGGGTCATGGATTGGGTACAGCA 8 | TAGTCTTCGAGGCATCTACCTGCGACCGAGCTTGCGATCCAAAAGCATACCCATGAAGTTTGCAAATCTTTCTTAGAGCTACAGGTAGATATCCCTGGGCCGGTAACTAGTAATATGTACAGTTTAGTTGTCCTGTACAATACTGATTGGAGTCAG 9 | AGGAGACGTGTTTGGTAGGCGCTCGACCCTTACCCCCCTATCTCGACTGGCATTGGACGTCCTCAGACTGGTGGATTTGACCTAGTGGTTATCACGCACATGGGAGAACCCGGTCAGAATACATCCTGTTACAGTCAGACGCCGGGTCATTCGCAA 10 | TGTGGGGATCGTCTGATTCATCGACGTCATGGAAACGGGGACGGGCCAGTGGTTATCCCATGCTGCCTTTTAGGAATTCTAGGAGGCGTTCCTAAATAGCTCGCCCGACGATATCCTACTTGATTGTGGCGGTATACCTTGCTGAACCTCGCAGTA 11 | CGCAGTGCACTAGTGGCTATCGCCTTTCTATCCCTGAGGCGTTTGCGTTATAAGATCACTCTGTCGGTGTGAAACCACTGAGTCACACTGACCGGTCGACTGGCCCGTCATATGCAAGTTGAAACGCTTACTGCCGGGTATCGCTCTAAGCTCGAC 12 | TACTCGTAACTTCAAAATCTCGTAGGGTCTGCAGAGTAAGAATCCCGGATACTTCAACATTATCGATTCGTCAAGACGTGCGGAGTGGATATCCCTTATTCCTATACGTCACAAGCCGCGGTCAAGTCGCTTACGCACGGTAGAGCGGGAAACCCT 13 | GCTCTAGTACGCCACAGTGCCAGTACATGACCTCACGAGCCGCCACTTACGTTGATATGTTATAAATCACTAGTTTCGTTGGTACAAACAATAAGTGAGAAGCAATGAGCAACTTATCTCATAAAAAGTGTGGTCGTTATCACACATGACATAAAC 14 | 
GAGACGGTCGTAGAAATTTGCGCTTGCTCGTATCTCAGTATCCTTCAAAGATTGAACCGGATCGCGGCGGCTAATATTGAAATCCTTAGACTTAACGTTGGTATCACTTTAGAATTTCTGACTTGAGGGTAGTGACTAGACAATCATGATGGAAGA 15 | GATCGGTGGCAGTTGAATTAAGACTAGTTATCCCCTGCTTACACTTTTTCCGCCCGGACACGTGTGACGGTAGTTGATATCTCTCAAGTATCCCGACTTCACGTACGATGCCACCCATCTCCGGCAAACACACTTCTATAATATCGTGGAGCCGAA 16 | GTAGGTATCACCAGAGTTCTCTCGGTATGTGGCGCTAAAACCTCTTAGAGTATGAAGGGTGAAGACCAAGCTCATACCACCCCTATATAGGGCTAATTAATTCCACATCCAGGCAAGATGTCACCCTACAGGTCGCTCACTTGTGGAGAACATCAC 17 | GTGGCTATCGCTGTGATCCGTCACACAAAATCAACTTTGTAGTATTTTGGGTAGTCGGGATAACGCGTGGTCTAAGGTGAGCTGCCTTTGATCCGTTGGGTGGCTACCTTGCAAGTTACCGGCTTGTCACTAGATATAACCGGAAGTCTCTGCAAA 18 | TGAGCAGACCCGACAAAGGCCATGACTTCAAAACCGGTTGTGCAGCGACAGGTACTTTAAGTACGGCACCATTAATATCGCTATACTTACGAGTTAGTGGGTATCTCATGCAAACAAACTGCTACTAGGAACTTAGACGAACTTACCAGGAGGATT 19 | AGTGATCTGAGCATAGTATACTGAGAACGTGTGTGTGACCCCTCCAGCGTCCCCGGCTACTGTTCCAGATCCTAACTAATTACTGCTAGCGATCTGGTCGATATCGCTACGGAACGAAGCCGTACAATCGCCCAGAAGCGGTTAGTCAAGGGCGTT 20 | CAAAATGGGAGTACTTCTCGTAGAATAATTCGTCTGTAATTCCTAGGTTCCATAGATAGGCCTCGATAGTGAAATTCCTTCATGCCCCGAGGTGGCGTTGATATCTCCTTTCTAAGCGGTACCCTTAAGAGTACTCGCGATGGGCTTATCCTCCTC 21 | GTTGGTATCACCCCCAATCCTCTTAGTTTACACTGTAAGATTAACGTCAGGGTGTTGTGGATGGCTTTTCTAATTTAGGTCCTCGGGATGCTCAGGTGTTACTATCGGACTGAGTGAATGTAAGTCCGGGTATCAGCCATGAAACCACTGGAGATC 22 | CCTCGGAAAACGTCGTAAGTGGTATCACAATCCACTAGTCACGGAGGGCGGTGAGTGTCTCGATGCTCAACCCCCAACCCTCAGATGAGGCTATCTGTAGGTATCACTTACGTCACTACGGGGAACAGCTCGCCTTAAGTGTAGGCTAGGTATATG 23 | GGCGGCTCCATCCGATGTGACGGATTTCATGAGGCACAAGCGCTTCACTCCCTATTTGGCTCGTGACAAAGTTCAACGGCTTTGCAGATATCCATGGTCGTTATCCCGGTGGTGAACCTACCGTCAAGTCTCTACAGTGCGCGAAGTGTCCCGGGC 24 | TACTAGTATAAGTTCCGAGCCAAATGGATGGCCCAGGCGCACTAGTGTTAGAAGGATTCGGGCTGTAGGTACATCAAGCTCGAATTTGTCCCACGTCATTCTGGGCCACCCGGACCACAGAAGACCTCTCTCTGTGCCGACGGTGTGGTTATCACG 25 | GTGGTTATCACCGGGATCGAATACAGATACGTCTGGTACATGCCTTGTCATTATTCATACGCCCCCTGGGCCAACAATCCTTTGTCAACCGCGGTCAAAAAGGTAACTAGGATCGGCTTGATCTCTAATTCCGGACTGTTCACCACGGGTCGACCG 26 | 
CATCACGCAATGCGAACGACTGAAGAAGGCAAGGACAGTTACGCAACCTATCATGCGTAGATCAAGGTAATCGGGACCGGTCTGGAATTTAGGAGTGTTGTTATCGCAACCTGCGATCATAACATCCTCTTATTGCCTATAAACCGACCCTGACCG 27 | -------------------------------------------------------------------------------- /chapter3/C3_39/39_5/test.greedymotifsearch.txt: -------------------------------------------------------------------------------- 1 | 3 5 2 | GGCGTTCAGGCA 3 | AAGAATCAGTCA 4 | CAAGGAGTTCGC 5 | CACGTCAATCAC 6 | CAATAATATTCG 7 | -------------------------------------------------------------------------------- /chapter3/C3_40/dataset_40_9.txt: -------------------------------------------------------------------------------- 1 | 12 25 2 | TACCGGCTTAAAGACGGATCTGCAGTGTTCGATGTGGCGGAGCGTTTAGTCGGAACTGCAGGCACTTCCGCCATCGGGCCGTCCCGCAAATCGTCCGACGCCTCCCGCTAGTTCTGGGTTGATACCCAGATCGGCCTCCGGCAGTCGAGCAGGTAG 3 | GAATTATCTAGAGCGTAGCGTTAGATATCTAAACGCTGGAACTGGTAGGAGGGCAGCCGCGTAGAATTGCTGGCGCTTGCAGGCATCGCAACGACAAAGTCGATAACTTAATTCCTCACTCACTCATCGTGGACGGAGTTCCTTGAACTGCGTTCA 4 | AAACAAGAGCTTGACGTTTTTAGCAAGACGCTGTTTGTATCGGCCTAACGGCTTGGATATATAAGATCACGCGGGCGGGAAAAAGCCGGTCCACATGAAGTCGTCCGCCTGCTGAAGGCGTGTTGAAGGGGCCCGCACATTGTAGCATTCTCTGAG 5 | ACTGCGAGCAACGACGGTGGCGAAATAGTCGACGAGTGCATAAGGGTAGACGGCTTCCGCCAAGCTACCATGCCCGGGCATGTTTTCCCATCGTCAGCCACCAACGGTTACACCGATCTGTGGGTTGTTCTAACATACTCCCTGGAAGAGAATTTT 6 | AGATGGCGACTAACCACTGCGGTAGAGTAAGAAAGGGTAATAAGGTGACTCGCTAGGACCAGACGACGTCGCCGAGTAATAACCAACCCCTACTTTTCTGGGCTGCCCAAAACGCGTGAGGTAGGCTGCTGCCACCTGCATACCGATGTCCACCGC 7 | TGAAAGTAGTTGCGTGTTGAAACCGCCCACTCGCACGCCCATAGCTGGATACACCCACGAGACAACGCAGACGAAGCCAGCCGCAGAACGCATGTCCCCGGCGGGGAGCACCATTAAGGGGTCCCAGCGGCTCATTCGCACGCGGGGCTGTCAGGT 8 | GATGGCAACCGCTTGCAAAACCACAATGCTGTCATCACTAAAATTTTATCTCATACGGGACAACGAATGCTCATAGTAAAATAAATTTACACGAGAAATCCACACTACACAGACACTAAATGCCTTCCTGGGGCAGTGGTCCAGGTCAGTACTGGG 9 | TGGAGCACCGCCGTGGATGCCCTTGATGATATCGGGACGAGCAGAGCCGAAGGCGCCCGCCTCCCGTAAACTGCTTAGTCAACGTATTTACCGACCTGTTTGTCAAGCTAATCCGTCAAGGGCTTCGCTAGTCGGCCTTTTTGTCCGAAAGACTTG 10 | 
GCTTTTGCGACGACCATAGCGATGGGACACGCCTACTAATATAGTGACTCCGCCTGGAACGCCCCGGCAGATTCTTCCGACATCGATGTCGGCCGCTCGGAGTCATGGTACTGGCGGAACGCGGTATTACCGGGCTAGCAAGTAACTGTTGCTATT 11 | CTATGTACGTATATGGGTTGCGGCGGTCAGTTTCGAACCCATTACGTCTAGGGTGTGAAAGATTTCCCTATCACGTACGGTGGTTGCGGAGTCCAGGGTGATAATTATGGGAGGTGAAGGGGCGAATTGTCTGAGGTCCCCCGCGTACAGACCAGA 12 | TGTGTAAGCAACACGATTGTGTACCATTACTTGGGTGCTACGAAATGGTATATAGCTCATAGGTAGAGCATGTCGGACCGCCTAACGTCAACTCTAGTTGGAGTACGCGTTGTCTGTCTCGAAGGCATCCGCTTGTAGCGATCCTCATCCTCGCTT 13 | CGCCTAGCCCGTGAAAATATTCTATCCAGATGGGTTGCCTACGCATATCCCAGCGCAAAGGTGATCACTCAATATTGGTTTCATGAGGACCACCGCATACTTCTTGTCGAACAGTGCATGAAGCACTTCGCAGATTCCTTTCGAGATGAGCCATCG 14 | TCCCATAATTCGGACGCCTCCCGCTAAGGCGAACGTTTTCCTCAATTACCCATTGCGCACGACTATCGGAGGGATGGGCGGCTCCACGCCACGCACCCCACCTCCAGAGTATGCTGGGTTTGCCGTGCTGGTGTGCAGGGGGTTTCGAACTGGCTT 15 | GGGGGGAATTTAAGCTGATTGAATGGATCCGACCCATGACCATCCTAACGGCACCTCAAGACCGTGAGTTACCGCAATAGCTCGGTGCCTATTTAAGAAGGCAGCCGCTTTCCACCCTCATTACGTTCGAACTAAATCTGTTTGTCACTCCCCTTT 16 | CTCTATTCCACTGACGACACCCGCCTGCAACTGACCTTGCTTAGAGATCGGTTGAACATTGGGGAACCTATGGTCCAAATAGAGCGGACTACGTTCATAGGTATCTGCCTACTCGTTGGTAGCAGCAGGAAATCGATCTACCGGAGTCATCCAATT 17 | TCTATTTTAGCGGACGCCCCCCGCAATCTCGGGTAAGCTCAGAGATGTGTCTGTTAATTGTGTATACTCGTTTCGACGATACACTGACACTGCGCGACTTTTTTTGGCCTCTGCCTAAGTTACATCCTCCAATAAGGCGTAGAAGAGAAGACTATC 18 | TGCCGCGTTGTTCAGGATGGCAACTAACAGGACGACACCTCAGTTCGACCGCTTGGAGATCTTACGCCCCAGCAACAGGTGCACGAAGCCACCCGCACGACAGTGATCATGTCGACGCGAGTTGTACATCTGACTCATCCCGTGAGACCCGAGACC 19 | ATCAGTCGAGCACCATATACAACCAAACCAGTCTAAAATGCGAATGATGTTCAATGGTCAGGGAATACTTGTGTCCAGGCTCTCGCACGTTGGACTAGTACGAAAGGTGACCCTTCAGTGTGTGTTAGTCTGGACGACTACCGCCCGCTGTAGAGT 20 | TATGTCGTGGGCGCAAGGTTAGGTCGGTAGAAGGCCGTCTTCCCCATACAGCACTGAGCTTTAGCTGGCTCTCACTCAACAACCTCATCCCTGGTTTATAGTAACCGGGGCCACACAACTATCGGTTTGAATGAAGACGCCCGCGGGTTTTTCCAG 21 | GGGCGCTTAGTAACGGGTTAAAATAGCCAATGGGGTAAGAATATGTAATGTAAGCAGGATTCGTCTTGATTATCGTTGAAATTGTGCATAATAGGTTGTACAAGTATTGAGGACGACCGCCATCCGTTCCCCGAGAAACTTGTGCAGCGTCTAGTG 22 | 
GGAGCTTCTTCCCATCCAAATCTGGTTCGCGGATCAACAGCACAGACGATGCCATCGCTGGGAGCACAATGTCATTCCCGTTGGCATGCGTCATTGACTTAATAGGCCACAAGCAAAAGTGAGGCTCGGTCAGAAGTCGCCCGCCGAATGTGCATT 23 | TGCATTCAGTTAATGCAGCAGAGATTTTCCAAAAATGGAGCCCCGCTATTAGTCAGTGCACTGATGTCGGCAATCTACGAGCGTGATGGCCACCGCGTGAACACATTAGTTCCTCCTTGAGGGGGCGAACGTAATTAGACACTCCATTTGTTATGC 24 | CCCGGATGAATTATGAGCCTCTGACATAGGTTGGAGGTAAGACGCGGTCCACTGTCACCCCTGAAGGAATATGTGTTTTTTTACTTTCCCGATCCTATGATGCCTTTGTAGTTGGCGATTTTATTTGCTTAGATCTTCGAATATGAGGCCAACCGC 25 | CTCCGCTTGTTACACCAACTGGAAATAGACGAATATACAAGTTGGGTGACACGGCGATAGCTGTGGTGCTTTGGGTAGGTGTTCGACGCCCCCCGCAACACGATTTAAAACCGAGAGCGGCCGCCTCTCCGAAGTAAGCGCCAGTCAGACAGCGTT 26 | CTCTCGAACTGAGTCCACACGACTGGTACGTCGTATGAGTTAAGTCGACTTGTGATCCGGTTCCATAAGTTTGACGGCGGCCGCACAAACTGAGCGAGGTGGGGATGATCCAACCAACAATGTAAATGGAGTTGAGTAACGGTAGCTTCTCAACGC 27 | -------------------------------------------------------------------------------- /chapter3/C3_40/output.dataset_40_9.txt: -------------------------------------------------------------------------------- 1 | GACGCCTCCCGC 2 | GAGGGCAGCCGC 3 | GAAGGGGCCCGC 4 | GACGGCTTCCGC 5 | GTAGGCTGCTGC 6 | GAAGCCAGCCGC 7 | GATGGCAACCGC 8 | GAAGGCGCCCGC 9 | GATGTCGGCCGC 10 | GAGGTCCCCCGC 11 | GAAGGCATCCGC 12 | GAGGACCACCGC 13 | GACGCCTCCCGC 14 | GAAGGCAGCCGC 15 | GACGACACCCGC 16 | GACGCCCCCCGC 17 | GAAGCCACCCGC 18 | GACGACTACCGC 19 | GAAGACGCCCGC 20 | GAGGACGACCGC 21 | GAAGTCGCCCGC 22 | GATGGCCACCGC 23 | GAGGCCAACCGC 24 | GACGCCCCCCGC 25 | GACGGCGGCCGC -------------------------------------------------------------------------------- /chapter3/C3_40/output.test.gmswp.extra.txt: -------------------------------------------------------------------------------- 1 | CATCGCTTAACT 2 | CCTCACTGAACT 3 | CGTCACTACACT 4 | CTTCTCTCGACT 5 | CTTCACTCCACT 6 | CCTCGCTAAACT 7 | CTTCACTCCACT 8 | CTTCGCTAGACT 9 | CTTCACTGAACT 10 | CGTCCCTGGACT 11 | CCTCGCTGAACT 12 | CTTCACTTAACT 13 | CGTCACTTAACT 14 | CATCTCTTTACT 15 | CGTCGCTGGACT 16 | CTTCTCTGCACT 17 | CCTCTCTGCACT 18 | CGTCTCTAGACT 19 | CATCACTTCACT 20 | 
CATCGCTCAACT 21 | CATCACTAGACT 22 | CATCACTCGACT 23 | CGTCCCTACACT 24 | CTTCGCTTGACT 25 | CTTCCCTGAACT -------------------------------------------------------------------------------- /chapter3/C3_40/output.test.gmswp.txt: -------------------------------------------------------------------------------- 1 | TTC 2 | ATC 3 | TTC 4 | ATC 5 | TTC -------------------------------------------------------------------------------- /chapter3/C3_40/test.gmswp.extra.txt: -------------------------------------------------------------------------------- 1 | 12 25 2 | ACGAGAACTGTAATGGAGACCAATCGGGTCGTATGTACGACACGGATCTTCTGTATCGATCATCGCTTAACTTATACGATCTCATTCTCACGACGATCCTCAACCCCGGATACCCGCACTGCCTCAATCCGAAGTACTGCGTAGTACTTTACCCTT 3 | ACCTGTGATCACTCAGAGAACAGAGGATCCGGGTTGGATGTCAGTGTTATGCCAAGAAACGAGACCTAAGGTGCCGTCCCCGGCGGAATGCTTCTCGCTTCCCCTTCTAAAGGGTCCTGGCAAAATGCTTGTGACTTTGAATGCCCTCACTGAACT 4 | AAATGTGAAACCTATATCAGTCATTATACCGGCGCGATGTTAACTCGCACCTGATTGCAAGGTCACTGATCGCGTCACTACACTAGAGTTTATTCATACCTGCATGGGGGGCATGATGGATGAATTTTAACTAGGTGATCGTGACCAATGTTCACC 5 | TTTAACCGAATGAGAAGGGTTTCGTTTTAGGCCGTCGATCCGCCGCTTCTTCTCTCGACTGAATCGGAGGTTTTAATGTGTGTGCTAAAGTGAACTGCCTAATAACCCGCTAGGGGGATGATTTATTGTATCTGCAATGTACGGTGGTACATCCCG 6 | GCTTTCCCGTGTTCTTCGTGCGCTCGGGACCAGAGATCAGAACTAGTGCTAAAGCCATGGAAGCATTTAGCCGGCCGATGTAGTAAAGTTGCCCATATTTCTCCCTAAACCAGCGTATACGTCGAAAACTTTCTTCACTCCACTATGTATAGATGG 7 | CGCTTAGCGCATCCTCGCTAAACTAATTTGTAGGAGCAATCGCATGTCGACACCGAGGAAGACAACAAGTACATTAGTTACACGCTTCTTACTGGGGATAAAAATCTAGGATCGCGTGATCGGCGTGCTCCGGCGTCAGGTGACTTGATGGCCCCA 8 | AGTCATGACAATTCGCACGAGGTGACTTTCAATCGACTTACGTACTGCGCGTCGATGTCGCTTCACTCCACTAATACCCATATTCCTAACACGCCAGTACGGTTCAGAGTCGGCGGCTGAGGGGCCCTAGAAACGAGACACCCTAGAGGCTCTGGG 9 | ATAATAAACGAGATTTAGTGCTCCAGGTCCCTCTAACTTCGCTAGACTGTTCACGGTACTTAGAGTAGCTCAGAAATCGCCTGTCTTCGGGTTGGTTTTTTCAGGAGGTGCTCTGTGCGTTAACATACCAAGCCATAGCTGCTTTTCCTCTACAAA 10 | 
CCTACCGGAGGACTTCACTGAACTAAGTACTGGGGTGCTAAGGTCGAACGAGATGACTGGACCCTACTCTCTACGGGACCGACGCCCCAGGGCTTAATTCATATTGACTAGATTTATGATAATAATAGACTCGGCGGTTTGTAGCTTTCCCCTGAA 11 | CCGTTACTCCGCCGCTGCTGATCCTACTCCACGTGGGGCCCCCCAATATGCACATCTTATCGTCCCTGGACTGGCAGTTGATGCGAAATAATATTCGTGGGTATGATAACGCGCTATACTGATAGAACCACGGGGACTCCTGTATTCGTCTCGCCA 12 | ATTCGTGGGTTGAAGCCTTTAAACGGGATGGCCAAGTTGATTGGGTCTAATTGATATTAATTCTGGTGTACTGTACGAGACCGGTGCAGCACGGACGGGCGGTTTCAACAATACTCGTGCGCCTGGAGGGTACCTCGCTGAACTCGCATATCAGGT 13 | TGGTTACCCTCTCTTCACTTAACTCTTGTATCAAGACGTTTCTGTGAGACAAAGCAATGGCCGGACTTTGGGGCGCGTGCTCTGGAGATCCAAGCACTCGAGGTCAGCGGTATAATATAACGCATCCAACATGCAGACTGTGCGTGGGGGCCCAAA 14 | CCTTAGCGGTGTCGGCCATCATTTATCGAGCTGGAACTCCGGTGGGTAACGTGGATCCGCAGCAGGCCTTACCGTCACTTAACTTGTTACTAGAACATACGTGGAACCTATGCATGATCGAGATAGAGTGCGCTCCGCGGTACACCGCGCTCTATA 15 | AGTTACAACACGACAGGAACTATTTGCTAGGCGTTACATCTCTTTACTTTTAGCCCCTGGATTTTGAACGCATGTCAACACGTTCCACCATGGGTATAAGAATGCATGGACAGGGTTAATGAATGTGTCTCGGTCCGTTAGCTGTTACAATATACC 16 | CTTAGCAGCCCACGTCGCTGGACTGTGTTATATTACGAGGTCGAATAGTGAGGTTAACAGTCTCCGTTGTAACTTAATCCCGATATCACGCAGTGTATATGGTCGCTGTAGCTTTCTGGGCAGCTCGCACACCGCCAATTCGCAGAGGCGACCAGA 17 | TGGTAATAGGCTTCGAAATAACTCTTGGATTGCAACGAAGGTCCGAGCCTTCTCTGCACTTGGATACATTTTGGACATATGAAAGGATGGGTGCTCGGGATGGGACTTTGGTTGCCTGCAAGACGGCGAGACCACCTTACTGAAACCAACATCTTA 18 | TTACCGGTAATCTCTGATCGCCCATGCCGTCAGGTGCCTTAATTTAAGCGAGAGCTAAATAGAAAGCTGCGCGGTTTTAGCAAAATGAAGTATCAGGAATAACATGGGTTAATGTCACAAACCTGAGGGTTACCTCTCTGCACTCCAGGTCCAGCA 19 | AATTCAGCTGGTCAGTCACCACAACGTCTCTAGACTCGCACACCCAACTATTATATCACGTACAAGCCGCCCCACAACCGGCATGATAATGTCTGCACGGCCCAACTAACACGCCAGATGACGTACTTTTCGCGGCAGAGCAGTATTCGAACTCAA 20 | CGTACCCGTTATACAAGCACCATCTACAAAACGTTAGGTGTCAACGATCGTGGGCCGGACTTAGGGGTGAGACCTTAAAGCACACATTCCCTCCCACATCACTTCACTTGTCAAAAATAAAGTCGAATGATGACTACCTCAATTCCTCGCGAAAGC 21 | GCTGACACGTATTACGAACCGAGACCAGGCGCGACCCCAACCTGGACTAACAGCTATCCCTTTGTTACTTGGCACGTACGGTCAAATCCCGGTGGAGTTATTTACCGAGGGTGCGCCATGCATCGCTCAACTCCTAATGGTCGCCTGTACTTGGTC 22 | 
CGGCCTTGGTCCAATTCATCGTAACATTCTGTGAATCTACGGGTACTACTCGACCACGCTTGCTTCGCTGAGCTGTACCGCAAAATCGAATGGACCCAATAATCTGAATCCTTCGGTATACATCACTAGACTCACGATTCAGTATGCCCTCAATCA 23 | ATCCGAAACATTCAGTTCCGATGGCAACGACGACCACCCAGCACGCATCATCACTCGACTGACCCTGCTCGAATACAGGCGTATCTAACAACCAGCGGATCCAGGGCCCATGCTGAGGCTATTGTCACTCCCGCCCACCCTGTATGTATTCGGATT 24 | CCCGGATCGCCCGGTCAAGCACTCCAGGGCTTCAGCGCTGTGTACCTCTCTGCACACTCCGGTGGTGGGCCCCGTCCCTACACTGCGTATTCAAGTGATAGCTCGTACGATCCGCATCGAAGGCTGACTGCCCCTCTACAACAGTGCGCTCGCACT 25 | CCCGTATGTAGGTGATTAGACCCACCAAGCAATCCGCATGTTGCGTCACCGTAGATATATGCAGCGGTCATTCTTCGCTTGACTCTCTGACTTGCCCATTTAGAAACTATCCACTTAATTCCCCTTGGACTTGGGGCCTGTTTTCGGTCGCTTTGT 26 | CATTTCTATAAAGCTACAATAATAATCCGCGCTGTCGGCAGACGTGGTACCGACCCTACTCCTACCGTTTGAGAGATGGAGGGTCTTCCCTGAACTAACGGCATGCATGAGAGGGGTACGACCCTGGTACTTCTGAAACCAGCATCCGCGGCGACG 27 | -------------------------------------------------------------------------------- /chapter3/C3_40/test.gmswp.txt: -------------------------------------------------------------------------------- 1 | 3 5 2 | GGCGTTCAGGCA 3 | AAGAATCAGTCA 4 | CAAGGAGTTCGC 5 | CACGTCAATCAC 6 | CAATAATATTCG 7 | -------------------------------------------------------------------------------- /chapter3/C3_41/.haha.txt.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minw2828/Coursera---Bioinformatics-Algorithms/9a51f7ca1fa9ab5fd246dc971648ebe0acf9b308/chapter3/C3_41/.haha.txt.swp -------------------------------------------------------------------------------- /chapter3/C3_41/dataset_41_4.txt: -------------------------------------------------------------------------------- 1 | 15 20 2 | GATAGTTATGGCATAGATTAGCACCGTATTAAACTCGTTCGCTGTTGTATTATTACTACGACAAAACCAAGCCTAAAGCTTGGATGTCGCTACTTTAAACAGTGAAAATACACCTCGGTATCCCATAACTAGGGTCTAGCCTTCCTCATCATTTCTTTAGTGAGCTGACGGTGGACTAGGAAGACTCACAGGATAGTTATGGCATA 3 | 
GATTAGCACCGTATTAAACTCGTTCGCTGTTGTATTATTACTACGACAAAACCAAGCCTAAAGCTTGGATGTCGCTACTTTAAACAGTGAAAATACACCTCGGTATCCCATAACTAGGGTCTAGCCTTCCTCATCATTTCTTTAGTCTCATTGCATTGGTAGAGCTGACGGTGGACTAGGAAGACTCACAGGATAGTTATGGCATA 4 | GGCAGAAAGATGCGTAAGTCTACTATCACGATGACACATGCTCTTAGCTTGCCTCAAATTCCTCTGATAGTTTTTCACTGTCCTAGACTACCGAACATGACGCTCTATGTTCTCCACTCCCGAGGTTTGAGTAACGCTGTGATGGTTCGTGCCAACGGGGCTTTATGATCTGGCATACAGTGGTGAACGCCTCTCCCCATTGGTAG 5 | GAATAGCTGTCATGCAAGTTCATAAGATTTCTTCTCGGCGACGCTCACCACTTACTCGGTAGAGTGTCAGCATCCCACGTTGGTCTGAGCATTGGAGAGATGGTATTGAACAATCCCTCAATAACCCTTGATAACCGGCAATCTCGCTTCTCGCTATCTAGAATCCTTCTAATGGCTTCTATGTGAGTACAGGCCTCCTACAATCG 6 | GGGGCGGCCCTCTTGTCATTGGTAAGCATGATACATTCAGGGTGACGTTGTGGCCTGCCACGGAGGATCCCTCTAGTGATCACCCGAGGTAATGTACCATTGTAGTAGAGCTATATACCTGGCTGGAGGTAGGTGTGTCTAATAGAGGTTACCCGGCTGTTCCGATTTTAAGTCATTCCACGACCCACAAATCGAGTTGATTATCG 7 | GCGAAATCACCTACATATGCACGGTGTCGCCCTACTAAACTACGAGAGTACTTAAGGCGCGCTTCATGGTTGGTGGACGGATCGGCCTAATGTCCTAGGTTCTATTCAACCTCTGAGCAACCGTAAGGCCCTGTCGGCCACTGGAATACTTAGACGCGTCGGTCGGAAGGGGTTTACTAGATAAAGGACAATCGCTCGAAAGTGAC 8 | CCTGAATAAGCTCTACATAGTCTATGCGCTCAGATGCAGTTTGGGTGCTGAGCATTGGTTGCTGTCGACCCTCATGGCGACCGCCGTGATTGCACCCTTCTTCCTAAGCGAAGACCTTTTAAATCATCGGGCCGCTCGAGGTTTTCTAACGCCCGACACGCACTCTTTCGATTGGGGTTCAGATGGATTGACGCACCTATGTAATC 9 | ATAACCCGAACGCCCGGCATGTGTAGGGTTAACATTGCGGCAAGGCCCCAGGGTGCATCGAAGTCTTCGCTCACGAGCTGCAAGTGTTTCCTCATTGCTAAAGGAACGAAAGCAAAAAGCTTCTCTTTCTCTCTGAAATTTGGTAATAAGCAAACTTGAAGTTACAGTGGAGCTGCAGAGTTGCGAGGTAGTTAGTCACTTAGGAG 10 | GAACTTCAGCCTTCCAAGGGCGAGACGTACGGTGCATCTGACACCGCACAGTGCTACACCTCGTGAAGGAGCGTTCAAGAAAAGGGCCTATGATTCTCGATAATGTATAGTTACCCGTCAGGCGCGCTCTGTTAATTGGTAGTTGTGCACGGAGAGAAACGGCTAGTTCTGGCTGCCGATTCTTTGAATACAGGTATCATTGCTTC 11 | CATAGGAGTATGAGACACCACCCGATGCACATGGGCGGAGAACAATTTCTCGCTGCAAGGTACGAGCGCGCGTTGGTGCAACATGTTGCCCTCGCTAGGGTTCGTATAGCGCCGCGAGTTTCTACAGACTACGACAAGAAAGCTTGCGTATTCTCTGAGTCCTGGTACAAGGGGCCTCGATGTGGCCGTCATTCGCACGACAAGCT 12 | 
GCGGCCCACCTTTCACTCTGAGCATTTACACGCTAGACCGCTCGCAGCGCATCCCATCCCGAATTGCCAGGACCGGAGATCTTTCCGTTCTGCAGTGTGTTCTAAATGGCTAGTCAGCACGACCACACTCGCCCATTATGCTACACACTACTGCTGTTCACGGCCGAGTAATTTTCAGGTACGGCCGGCTTAATAGTCTCGCGCTC 13 | TCATCACAACCGCAACTCGCGTCATCCACCCCCTAGAATGTCAAAAAAGCGAAGTTAATCCAGGGTTCGGAATTCTATGCCAACCTCAATACTACCACCCGGGAGTCTCGCCACCTCCTATTAGGCCATGACACCTTTTCACTATGCGCAGTAGAAGAGAAGAAAGGGCCAATTCGATGGCCATTGCGCAAAGAGCATTGGTAAGT 14 | TTAGGCGCTGAACACGAGCGAAAGATATATACATAGGTCGTGGTGCCCCGCCCTTCCTCTAGTCACGTGTAGAATCAGTAAGAGGCGGAACGTGGAATTTCATTTCGTCTAGCGCCTAGAAAACCCTAATGAAGGAAAGGTGTCTAAGGGTTCACAGACTCTGAGCTAAGGTACGACCGAATCGTGCCCACGAGTCTTTCCGTCTC 15 | ATAGAGTGATTGCGGTAATAGGTAGCGCTTCTTGCACAGCGAGTTCACTTGCCTTCTACCGCACCAGAGAGATAACTCCTACAGCCTGCCAGTGTTATGATAACCACTAAGGGGCGATGTACGCACCCTCTGAGCATGTTTAAGTGCTTACACCCTGAATAAGTGTGATCTTGGTAGTGACGGGACAGATGCTTACGACATGTCGG 16 | AAGTTCGTAGAGTGACTCATCTTATCCTAACCCGCGAGAAATTCGGCGTTGATAAAGAAACGGGATCGTTATAGATTTATATGGTGCAACAACATTAATCGCGTTGCTGAGGCCTAATATGTGCGTGATCGCACAAAGACCCATTTTGGACTACTTGTGCACCCAGTGGAATAACGGCCCCACCCTGCTAGCATTGGTAGAACTCA 17 | CGGGAGCGAGGGCGTAGACCTTCAACGTTAAGTCCATAACGTCAAGGTAGGTCGAGGTAGCTGGCTCTCTGAATGTTGGTATGTGTGCCAGATGGAACTTTGGCAGGGCTAAACCAGCCGGCACGGAATTTTGCCTCTGTCGGAGCCTTGGAGGTTCATCCCTACTAGCACCAGCTGCTTATGTTCCCTTATAGGACTCATCCAAC 18 | GAAGAATTCCGTTATAACATGGACCAATAATCTGAACACGCGCTGACGCTGTTCTAGCGGAGAAAGAAAGGTCCGGCTCCCCTGCCTTACCGCCTTCACCCTTAGTTATGCCCATCTTACCGGATTACAGTAGGTGCAGGGGACTCTGAGCATTGCACGTTTTCACGGTACTAGTACGGAAAGTGAAAACCGGTCTGGGGCGTGCG 19 | TACGGAAAGAGGTCCTCGGAGACTGCCTGGATGCGGGTGGGCAGTCGTCTGGAAATAATCGAGTATACAAACTCCACTAGCGAAGGCAGAAGCAGTAAATGATGAGCATTGGTATTAACCGGCTTGTTGAGGACTTGTATAGTTCGACGCTGCTGAGCCCCAAATAGGATGGGGCCTCGAAACAGGTAACCGCCCACCTGAGTGTA 20 | CCTTATACAACGGTGACAGGGATTATACCTGTGAGAATGAGATGAACAGTCTGCTGCGTTTTAACTCTGTTGATTGGTAAAGCGCCGTGTGCATCCACTTCCTTTAGCTGACGGGCTCATTCCAGACTCTCCGCGGATCTACACATCTGGAGGGACGCCCAGGTAGTACGAAGTCTCATGGCGCGCCCCGGATCCTCTGTCGATGA 21 | 
GCGTATGGCTTATTCTCCCCTTTAAGATGTATGTTGATCGCAAGCATACCACATCTGATAAGAGGGTACGCCATGATAGGCCGTCACTCTCACCGCATTGGTAGCCGCGCGTCTTATCGGGCGTCAGTGGGCCAATCACTGGGACGGCCCGTATCAGTTCCAGGGCATGAAATCACCTTAGCTGAATTGGCGAGGCCCGTCGACCC 22 | -------------------------------------------------------------------------------- /chapter3/C3_41/output.dataset_41_4.txt: -------------------------------------------------------------------------------- 1 | CTCACAGGATAGTTA 2 | CTCATTGCATTGGTA 3 | CTCTCCCCATTGGTA 4 | GTCTGAGCATTGGAG 5 | CTCTTGTCATTGGTA 6 | CTCTGAGCAACCGTA 7 | TGCTGAGCATTGGTT 8 | CTCTGAAATTTGGTA 9 | CTCTGTTAATTGGTA 10 | CTCTGAGTCCTGGTA 11 | CTCTGAGCATTTACA 12 | CAAAGAGCATTGGTA 13 | CTCTGAGCTAAGGTA 14 | CTCTGAGCATGTTTA 15 | CTGCTAGCATTGGTA 16 | CTCTGAATGTTGGTA 17 | CTCTGAGCATTGCAC 18 | TGATGAGCATTGGTA 19 | CTCTGTTGATTGGTA 20 | CTCACCGCATTGGTA -------------------------------------------------------------------------------- /chapter3/C3_41/output.test.randomizedmotifsearch.extra.txt: -------------------------------------------------------------------------------- 1 | CATGGGGAAAACTGA 2 | CCTCTCGATCACCGA 3 | CCTATAGATCACCGA 4 | CCGATTGATCACCGA 5 | CCTTGTGCAGACCGA 6 | CCTTGCCTTCACCGA 7 | CCTTGTTGCCACCGA 8 | ACTTGTGATCACCTT 9 | CCTTGTGATCAATTA 10 | CCTTGTGATCTGTGA 11 | CCTTGTGATCACTCC 12 | AACTGTGATCACCGA 13 | CCTTAGTATCACCGA 14 | CCTTGTGAAATCCGA 15 | CCTTGTCGCCACCGA 16 | TGTTGTGATCACCGC 17 | CACCGTGATCACCGA 18 | CCTTGGTTTCACCGA 19 | CCTTTGCATCACCGA 20 | CCTTGTGATTTACGA -------------------------------------------------------------------------------- /chapter3/C3_41/output.test.randomizedmotifsearch.txt: -------------------------------------------------------------------------------- 1 | TCTCGGGG 2 | CCAAGGTG 3 | TACAGGCG 4 | TTCAGGTG 5 | TCCACGTG -------------------------------------------------------------------------------- /chapter3/C3_41/test.randomizedmotifsearch.extra.answer.txt: -------------------------------------------------------------------------------- 1 | CATGGGGAAAACTGA 2 | 
CCTCTCGATCACCGA 3 | CCTATAGATCACCGA 4 | CCGATTGATCACCGA 5 | CCTTGTGCAGACCGA 6 | CCTTGCCTTCACCGA 7 | CCTTGTTGCCACCGA 8 | ACTTGTGATCACCTT 9 | CCTTGTGATCAATTA 10 | CCTTGTGATCTGTGA 11 | CCTTGTGATCACTCC 12 | AACTGTGATCACCGA 13 | CCTTAGTATCACCGA 14 | CCTTGTGAAATCCGA 15 | CCTTGTCGCCACCGA 16 | TGTTGTGATCACCGC 17 | CACCGTGATCACCGA 18 | CCTTGGTTTCACCGA 19 | CCTTTGCATCACCGA 20 | CCTTGTGATTTACGA 21 | -------------------------------------------------------------------------------- /chapter3/C3_41/test.randomizedmotifsearch.extra.txt: -------------------------------------------------------------------------------- 1 | 15 20 2 | ACTTATATCTAGAGTAAAGCCCTGATTCCATTGACGCGATCCCTACCTCCATCATACTCCACAGGTTCTTCAATAGAACATGGGGAAAACTGAGGTACACCAGGTCTAACGGAGATTTCTGGCACTAACTACCCAAAATCGAGTGATTGAACTGACTTATATCTAGAGT 3 | AAAGCCCTGATTCCATTGACGCGATCCCTACCTCCATCATACTCCACAGGTTCTTCAATAGAACATGGGGAAAACTGAGGTACACCAGGTCTAACGGAGATTTCTGGCACTAACTACCCAAAATCCTCTCGATCACCGACGAGTGATTGAACTGACTTATATCTAGAGT 4 | CACTCCCGTCCGTCTGACGCCAGGTGCTCTACCCCGCTGATTGTCTGGTACATAGCAGCCTATAGATCACCGATGCAGAAACACTTCGAGGCAGCCGATTTCGCTTATCACAACGTGACGGAATTTGATAAACCACGTACTCTAATACCGTCACGGGCCCATCAACGAA 5 | ACAAGAACTGGTGGGGAGACTATGACACTCTAGCGGTCGCATAAGGGCCGGAAACCAGGACAAATCGATAAGATGAAGCGGGGATATAAGCCTTATACTGCGACTGGTTCCTTATATTATTTAGCCCCGATTGATCACCGATTAAAATATTCTGCGGTTTTCGAGACGG 6 | TAACCACACCTAAAATTTTTCTTGGTGAGATGGACCCCCGCCGTAAATATCAGGATTAAATGTACGGATACCCATGACCCTCCAGTCATCTACCTTCCCGTGGTGGTCGCTCAGCCTTGTGCAGACCGAACTAGCACCTGTCACATACAATGTTGCCCGCATAGATCGT 7 | ATCCGACAGAGGCAGTGAATAAGGTTTCGTTTCCTCAGAGAGTAGAACTGCGTGTGACCTTGCCTTCACCGACATCCGTTTCCAATTGAGCTTTTCAGGACGTTTAGGTAACTGATTGTCATTGCAATTGTCCGGGGGATTTAGATGGCCGGGTACCTCTCGGACTATA 8 | CCTTGTTGCCACCGATTCGCGAGCAACATCGGAGTGCTCTGATTCACGGCGATGCTCCACGAAGAGGACCGCGGCACGACACGCCCTGTACCTACGTTTCTGGATATCCTCCGGCGAGTTAATAGAGCAATACGACCTGGTCGTCGAGATCGTGTATCTAGCCCTACCT 9 | 
ATAGGTTAACGAATCAGGAGAGTTAATTTTACCTAGCTAGAGCGGACGGTGCCTGGCTGTATTCGCGTTTGACTTTCGGGCTCGCTGATAACTTGTGATCACCTTTTACGCTTACTGGATCCAACGATGGATCAAAGTTGAGAATTTCTGTGCCTTGGGTGTGAGCTGT 10 | CTGACGAAAGGACGGGCGGTGTACTTAGTTTGGGGTAAAATAGTTGGTATAATTCTGTGCGACAGACATTTGGTCAGGCCATACTGCCATATCGTGATGTAACTATCCACACTACGTCATAGGCCCTTGTGATCAATTAAACGTTCCTCATGCCAGGCTATCTGTTTAA 11 | GGCTTCGCGTTTAAGGCTGGATTAAGTACTCCGCCTTGTGATCTGTGATCCTCCGACCTGTGATCAGCAAGATTGGAACCTAGGTAGGCGGCGGGTCTACGCTGGCCCACAATCGTGAGTCCCCCACTCCGTAGGTTGTGGAATTTATAGACCCGCAAGGGGCACCACT 12 | AGGATGACACCCAGGATGAATCTGGATTAGGAACACCAACCCGACATATTTGTTACCGCTGCAGCATTTCGCTCTTGGACGCGTAACCCGAGATCCGTCTCGCGATCGTCACGGATCGGGATTATGCAGGCAATACCTTGTGATCACTCCGCGCTTGGTTTTGCTAGCG 13 | ACATCTCTAGTCACTTTTATTGAGCAGGTGGGCGGATTCATGATCCGGCTCTGTCGTACGTCCAACCACGGTGACATGTTCGGAGCTGTCGCCGTGGAGCAGAGATACATCGGATCTATCAATTTTACTAAGAGCAACTAGCCACGACAAACTGTGATCACCGATTGGA 14 | AATTTGCGTATCTCTAGGACTCCCTCATACAAATCAAAGCTTGGATGGGTAAGATGCCGCAGCAGCAGGTATCTCATATTGGCTATTAAGAGCCAGGCCCTATGGCCTTAGTATCACCGATCAGACGTCGCATGAGCGGGCCCGTTGTCCTATCTCTTTAGCTGCCGCA 15 | GAAGTAAAGGGGTTCCACTGCGTAGAGCGTGCCCCTCTGGTGTGCCGTACTGTTATGGTGATACAGCTTCCTTATACCCCTCGTAAAGCGGCTAATGGTCCTAATGAATGCCCTTGTGAAATCCGAATCGCTTTACAATTGCGTTCGGCGGAATGCAGTCACCAGTGTT 16 | TACACTACGCGTTATTTACTTTTACTGAGTCCTTGTCGCCACCGAACGAGGATTGTTCATTGTATCCGGAGATTAGGAGTTCGCATCGCTGACACAGCCAGTTCGTAGCAAATACCGCTGGCCCTGGGCACTCCAGATCAGAACTACTAGCCCTAAACTCTATGACACA 17 | TTGGGTCTCGATCCCTCTATGTTAAGCTGTTCCGTGGAGAATCTCCTGGGTTTTATGATTTGAATGACGAGAATTGGGAAGTCGGGATGTTGTGATCACCGCCGTTCGCTTTCATAAATGAACCCCTTTTTTTCAGCAGACGGTGGCCTTTCCCTTTCATCATTATACA 18 | TTTCAAGTTACTACCGCCCTCTAGCGATAGAACTGAGGCAAATCATACACCGTGATCACCGACCCATGGAGTTTGACTCAGATTTACACTTTTAGGGGAACATGTTTGTCGGTCAGAGGTGTCAATTATTAGCAGATATCCCCCAACGCAGCGAGAGAGCACGGAGTGA 19 | GATCCATTACCCTACGATATGTATATAGCGCCCTAGTACGGCTTCTCCCTTGCAGACACGCAGGCGCTGTGCGCTATCGGCTTCCTCGGACATTCCTGGATATAAGTAACGGCGAACTGGCTATCACTACCGCCGCTCCTTAAGCCTTGGTTTCACCGACGATTGTCGT 20 | 
TAGTAGATTATTACCTGTGGACCGTTAGCTTCAAGACCGAAACGTTGGTGATGCTACTTAAATGTCAAGAGTTGCGAAGTTGGGCGAAGCACATCCGTACTCCCAAGTGGACGATCGATAGATCCATGGAGTTTCCATCCATCTTAATCCGCCCTTTGCATCACCGACG 21 | TACAAGGCACAAACGAGACCTGATCGAACGGTGCACGGTCGAGGCAGCGAGATAAATGTACATTGAGAGCACCTTGTGATTTACGACCTGCATCGAAGGTTTCTTGGCACCCACCTGTCGTCCGCCAGGGCAGAGCCGACATTATATGACGCTGATGTACGAAGCCCCT 22 | -------------------------------------------------------------------------------- /chapter3/C3_41/test.randomizedmotifsearch.txt: -------------------------------------------------------------------------------- 1 | 8 5 2 | CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA 3 | GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG 4 | TAGTACCGAGACCGAAAGAAGTATACAGGCGT 5 | TAGATCAAGTTTCAGGTGCACGTCGGTGAACC 6 | AATCCACCAGCTCCACGTGCAATGTTGGCCTA 7 | -------------------------------------------------------------------------------- /chapter3/C3_43/output.dataset_43_4.txt: -------------------------------------------------------------------------------- 1 | TTAGCGGCAAACTGA 2 | TAACCAACCGTCTCA 3 | TCCCGTACCGTCTCA 4 | TAAAGTACGAGCTCA 5 | GAAAGTACCGTCTAC 6 | TAAAGAGGCGTCTCA 7 | TAAAGTTTAGTCTCA 8 | TAAAGTACCAATTCA 9 | TAATCCACCGTCTCA 10 | TAAAGTAGACTCTCA 11 | GTAAGTACCGTCTCT 12 | TAAAGTACCGTTAAA 13 | TAAAGTACCGGTACA 14 | TAAACACCCGTCTCA 15 | TAAATCGCCGTCTCA 16 | TAAAGTCTTGTCTCA 17 | TAAAGTACCGTCCTG 18 | GTGAGTACCGTCTCA 19 | TAGGATACCGTCTCA 20 | TAAAGAGGCGTCTCA -------------------------------------------------------------------------------- /chapter3/C3_43/output.test.gibbssampler.txt: -------------------------------------------------------------------------------- 1 | CCCTCTCG 2 | AGGTGCCA 3 | AGTATACA 4 | TTCAGGTG 5 | CCACGTGC -------------------------------------------------------------------------------- /chapter3/C3_43/test.gibbssampler.txt: -------------------------------------------------------------------------------- 1 | 8 5 100 2 | CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA 3 | GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG 4 | TAGTACCGAGACCGAAAGAAGTATACAGGCGT 5 | 
TAGATCAAGTTTCAGGTGCACGTCGGTGAACC 6 | AATCCACCAGCTCCACGTGCAATGTTGGCCTA 7 | -------------------------------------------------------------------------------- /chapter4/C4_51/output.test.string_composition.txt: -------------------------------------------------------------------------------- 1 | AATCC 2 | ATCCA 3 | CAATC 4 | CCAAC 5 | TCCAA -------------------------------------------------------------------------------- /chapter4/C4_51/string_composition.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 | # 9 | # Date created: 10 | # 11 | # 3 Dec 2013 12 | # 13 | # CODE CHALLENGE: Solve the String Composition Problem. 14 | # Input: An integer k and a string Text. 15 | # Output: Compositionk(Text), where the k-mers are written in lexicographic order. 16 | # 17 | # Sample Input: 18 | # 5 19 | # CAATCCAAC 20 | # 21 | # Sample Output: 22 | # AATCC 23 | # ATCCA 24 | # CAATC 25 | # CCAAC 26 | # TCCAA 27 | # 28 | ############################################################################### 29 | 30 | import sys 31 | import timeit 32 | import heapq 33 | import operator 34 | import random 35 | import numpy as np 36 | from scipy import stats 37 | from itertools import combinations,product,izip,ifilter,chain 38 | from collections import Counter,defaultdict 39 | 40 | def read_file(input_file): 41 | ''' 42 | >>> k,Text = read_file('test.string_composition.txt') 43 | ''' 44 | f = open(input_file) 45 | data = [item.strip() for item in f.readlines()] 46 | f.close() 47 | return (int(data[0]),data[1]) 48 | 49 | def correct(dna,k): 50 | l = len(dna)-k+1 51 | return [dna[i:i+k] for i in range(l)] 52 | 53 | def result(filename): 54 | k,Text = read_file(filename) 55 | results = sorted(correct(Text,k)) 56 | return results 57 | 58 | if __name__ == "__main__": 59 | 60 | start = 
timeit.default_timer() 61 | results = result(sys.argv[-1]) 62 | print '\n'.join(results) 63 | fw = open('output.'+sys.argv[-1][:-4]+'.txt','w') 64 | fw.write('\n'.join(results)) 65 | fw.close() 66 | stop = timeit.default_timer() 67 | print stop - start 68 | -------------------------------------------------------------------------------- /chapter4/C4_51/test.string_composition.txt: -------------------------------------------------------------------------------- 1 | 5 2 | CAATCCAAC 3 | -------------------------------------------------------------------------------- /chapter4/C4_52/output.test.overlap_graph.txt: -------------------------------------------------------------------------------- 1 | GCATG -> CATGC 2 | CATGC -> ATGCG 3 | AGGCA -> GGCAT 4 | GGCAT -> GCATG 5 | -------------------------------------------------------------------------------- /chapter4/C4_52/overlap_graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 | # 9 | # Date created: 10 | # 11 | # 3 Dec 2013 12 | # 13 | # CODE CHALLENGE: Solve the Overlap Graph Problem (restated below). 14 | # Input: A collection Patterns of k-mers. 15 | # Output: The overlap graph Overlap(Patterns), in the form of an adjacency list. 
16 | # 17 | # Sample Input: 18 | # ATGCG 19 | # GCATG 20 | # CATGC 21 | # AGGCA 22 | # GGCAT 23 | # 24 | # Sample Output: 25 | # AGGCA -> GGCAT 26 | # CATGC -> ATGCG 27 | # GCATG -> CATGC 28 | # GGCAT -> GCATG 29 | # 30 | ############################################################################### 31 | 32 | import sys 33 | import timeit 34 | import heapq 35 | import operator 36 | import random 37 | import numpy as np 38 | from scipy import stats 39 | from itertools import combinations,product,izip,ifilter,chain 40 | from collections import Counter,defaultdict 41 | 42 | def read_file(input_file): 43 | ''' 44 | >>> kmers = read_file('test.overlap_graph.txt') 45 | ''' 46 | f = open(input_file) 47 | data = [item.strip() for item in f.readlines()] 48 | f.close() 49 | return data 50 | 51 | def overlap_graph(data): 52 | n = len(data[0])-1 53 | return [(k1,k2) for k1,k2 in product(data,data) if k1 != k2 and k1.endswith(k2[:n])] 54 | 55 | def result(filename): 56 | kmers = read_file(filename) 57 | results = overlap_graph(kmers) 58 | return results 59 | 60 | if __name__ == "__main__": 61 | 62 | start = timeit.default_timer() 63 | results = result(sys.argv[-1]) 64 | fw = open('output.'+sys.argv[-1][:-4]+'.txt','w') 65 | for r in results: 66 | fw.write('{0} -> {1}'.format(r[0],r[1])+'\n') 67 | fw.close() 68 | stop = timeit.default_timer() 69 | print stop - start 70 | -------------------------------------------------------------------------------- /chapter4/C4_52/test.overlap_graph.txt: -------------------------------------------------------------------------------- 1 | ATGCG 2 | GCATG 3 | CATGC 4 | AGGCA 5 | GGCAT 6 | -------------------------------------------------------------------------------- /chapter4/C4_53/dataset_53_6.txt: -------------------------------------------------------------------------------- 1 | 12 2 | 
TACCGATACATACAGATGGGACCGGTTTGGCCCGCGACCTGTTAGTTTGTTCCGTACACGCACTGTCCAGTAGCAGCCAGCAATCCGATGATTACGACCTCCTAAAGGGTCCTTACCTAGTTAGCCCAGCTCTAACCAGGCACTCATCACAGACAGAAAAATCGTCTGGAGTGGGGTATGGAGGCAAGACTGATGGCTTTTTTTTATGTCGGTATGTCTGGTCTGAAGCACTCATAATGGTTGGGGCTCAGTGTATTGCAAGCGTATGACCCGCGTGCCTTTCCTCAGCCCACAGGTCTTAGCTCATAGGACCTACCAATCTCGCATACCCCCGATTGTTCACTGTTGACGCATGCCGTATGCATTGGTGTGCAAACAAACTGAGTTAGGGCACTGTTGCATTGGGAACACAATTCCGTGGGTTTTCGTAACACTTCGACTGCAACATACCGTACACGCTTTTCTTGGCCATTCTTCTCGGTTTGTCACATATTGGAGAGCATTCACAGATGCGTCCATAGATAATTGGGTCATTCTACCCAATTAGATCACATTTCGCAAAGGGTCGGCGCAAATCTATAGCGGGACTCGCGACGGTTCACGGCATTGCGATCGATGATGGACGATATTGGCCCGGTCCCCTTCAGAGTTTAGCACGAGAAAATCACTGAGCAAAGCAGATACCCTGATAGCAAATTTTGCGTCTAACCCCCATTCGGTTTCTAAAGAATTGCCGCGGAGCATTTCCCGATAGCGACGACCTATTTTTGCTGCGCGTACATCCACCTACACTTGGGATCGGCGGACGAGGGCTAGCTTGCTGCGGAATCTTAATGGCCAACCTTGTTTTTGAGTGGACATACGTAATACGCTGACATGCCCGGCCGGTGCGACCAGGAGTATTGTGCCGTCTCAGTGGACCCATCTTCAACTTACGGTCCGGAAAAAGGGTGAGGGTATAGAACGCCTACGTGGGGGTATCGTTAACGAGACGGTTATTTGACTTCGTTCTGTTTTGGCATACCCCACTATGAACTGATAAGACTGGCTAACTCTGCCTGGGCCGGCGAAAGACCTCCGAGTGCCGAGAGAATCCAATTAGCTTGATGGCCAATTAAGGGCAAGTGAAGCCTTTTGCACGTAATTGGTCTCCAGCTAACTAATCTATTGCGGAAACTCGAGGCATGTTTAGGGATCCAGAGGCTCGGCTTAGTAATGCAACTTTACAGTATTTCGTCAGTCAGTCGGACTTCAGACGGCGAGAGTGGATACGTCATGGCGTGCTGTTCATTTGTCCCTGGCATAGTTTTGGTCGCATGTCGGCAAAAGTTCCAACTTTATTCCTTCAGTTGCGGTACGGGGCACGCGGAGTTAGGCACGAGTGGTACGTTCGGTACATCCCTCAGACGTCAGAACAGGATTGCATCCCCGCGAGTCTCTCTGAGCAACCATTAGTCGATTCGAGTTTGGAAGTTTAGAGATGTTTTAGAGTTCGGACTTCAGTGTTGACCCAACCCGGGACAAACTTCCTACTCGCATACGGGGTCAAGAATGTCGGCTTGGCGCCATTGATAACTCAGGTGGACGAGCGCAGATCGTCTACAATTATTTCCTTCATCGAGCCACTGAAGTTCTAGCGTTCTGGCTTGCTCCCAAGCGGCGCAAAGCGGCTCGCTGTTGCATTCACGTCCTCGCTCTGTGTACCTTTGGTATCATAGACGGCTCAAGGTCGCGGGGCATTTCACCACGGGAAAGCAACC 3 | -------------------------------------------------------------------------------- /chapter4/C4_53/de_bruijn_graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | 
############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 | # 9 | # Date created: 10 | # 11 | # 3 Dec 2013 12 | # 13 | # CODE CHALLENGE: Solve the De Bruijn Graph from a String Problem. 14 | # Input: An integer k and a string Text. 15 | # Output: DeBruijnk(Text). 16 | # 17 | # Sample Input: 18 | # 4 19 | # AAGATTCTCTAC 20 | # 21 | # Sample Output: 22 | # AAG -> AGA 23 | # AGA -> GAT 24 | # ATT -> TTC 25 | # CTA -> TAC 26 | # CTC -> TCT 27 | # GAT -> ATT 28 | # TCT -> CTA,CTC 29 | # TTC -> TCT 30 | # 31 | ############################################################################### 32 | 33 | import sys 34 | import timeit 35 | import heapq 36 | import operator 37 | import random 38 | import numpy as np 39 | from scipy import stats 40 | from itertools import combinations,product,izip,ifilter,chain 41 | from collections import Counter,defaultdict 42 | 43 | def read_file(input_file): 44 | ''' 45 | >>> k,Text = read_file('test.de_bruijn_graph.txt') 46 | ''' 47 | f = open(input_file) 48 | data = [item.strip() for item in f.readlines()] 49 | f.close() 50 | return (int(data[0]),data[1]) 51 | 52 | def correct(dna,k): 53 | l = len(dna)-k+1 54 | return [dna[i:i+k] for i in range(l)] 55 | 56 | def overlap_graph(k,Text): 57 | data = correct(Text,k) 58 | n = len(data[0])-1 59 | return [(k1,k2) for k1,k2 in product(data,data) if k1 != k2 and k1.endswith(k2[:n])] 60 | 61 | def de_bruijn_graph(k,Text): 62 | og = overlap_graph(k,Text) 63 | nodes = [Text[:k-1]]+[item[0][1:] for item in og]+[Text[-(k-1):]] 64 | raw_dbg = [(nodes[i],nodes[i+1]) for i in range(len(nodes)-1) if nodes[i] != nodes[i+1]] 65 | d = defaultdict(tuple) 66 | for tup in raw_dbg: 67 | d[tup[0]] += (tup[1],) 68 | for k,v in d.iteritems(): 69 | if len(v) > 1: 70 | v = [(item,Text.index(item)) for item in v] 71 | v = sorted(v,key=lambda x:x[1],reverse=True) 72 | d[k] = [item[0] for item in v] 73 | return d 74 | 75 | def 
result(filename): 76 | k,Text = read_file(filename) 77 | results = de_bruijn_graph(k,Text) 78 | return results 79 | 80 | if __name__ == "__main__": 81 | 82 | start = timeit.default_timer() 83 | results = result(sys.argv[-1]) 84 | fw = open('output.'+sys.argv[-1][:-4]+'.txt','w') 85 | for k,v in results.iteritems(): 86 | fw.write('{0} -> {1}'.format(k,','.join(v))+'\n') 87 | fw.close() 88 | stop = timeit.default_timer() 89 | print stop - start 90 | -------------------------------------------------------------------------------- /chapter4/C4_53/haha.txt: -------------------------------------------------------------------------------- 1 | def compare(filename1,filename2): 2 | f1 = open(filename1) 3 | data1 = f1.readlines() 4 | f1.close() 5 | data1 = [item.strip().split(' -> ') for item in data1] 6 | f2 = open(filename2) 7 | data2 = f2.readlines() 8 | f2.close() 9 | data2 = [item.strip().split(' -> ') for item in data2] 10 | length1 = len(data1) 11 | length2 = len(data2) 12 | print 'Number of lines in '+filename1 + ' :'+str(length1) 13 | print 'Number of lines in '+filename2 + ' :'+str(length2) 14 | for item in data1: 15 | if item not in data2: 16 | print str(item) + ' in ' + filename1 + ' not in ' + filename2 17 | for item in data2: 18 | if item not in data1: 19 | print str(item) + ' in ' + filename2 + ' not in ' + filename1 20 | return None 21 | -------------------------------------------------------------------------------- /chapter4/C4_53/output.test.de_bruijn_graph.txt: -------------------------------------------------------------------------------- 1 | AAG -> AGA 2 | TCT -> CTA,CTC 3 | GAT -> ATT 4 | AGA -> GAT 5 | ATT -> TTC 6 | CTA -> TAC 7 | CTC -> TCT 8 | TTC -> TCT 9 | -------------------------------------------------------------------------------- /chapter4/C4_53/test.de_bruijn_graph.extra.txt: -------------------------------------------------------------------------------- 1 | 10 2 | 
CGCTTCTACACTGGCTTGAAATCCCAAGCACCATACGTCTTCTGTAACGCATGTGGAGTACGGGCCCATTGCTTGCATAGACGGGGCATTCCTGTAGCTATCACACAATCTCATCGACAATCGCGGCTGACAAGATGTAAGAATTTCAACGAATTACTTTGACAACATTACATTCTCCACCAGTGACGCACTGAGTCCACGAGGAAGATGAGTATAAATAGAGGCGTACGATCTAGAGCATAGGTTGCCTCCATCGAAGGCGCGGTGCACAGTATCTAGGTAGACCTTTCACGCGTAGCAGAGTGAGATGTTAAGATGTGGTAGATACCAGTGCCGCGCACACCCCTTCAAAGTCAATACGAATTGTATCTAGAAACAAGCACACAAGTCACTGCCGCGTGCGCTTGATGAGGTGGTGATCTTGCTAGTTAAGTGCGGTAAGTCAGCTATTTCTAGCCCGTTCAACGTCTATATCAGGTGGGGAATACCCAGTTAAAGATTGAAGGTTAGGCTAGGCGTAGTGCGCGTAGACCCGTAGATCGCTGTCTACTCCCTTACGCGGAATAAATCCTTCGTCCGGCGAGTGCTGCAATTCCATTGTCCTCATCGCAGACGGAACAGTCCAGAGCGATACTTTGGTACCTTGATTGCATTAATCCGTCGATTGGACCTGAGATTGAATAGGCCGATGGTCATACGGTCGTACCAGTACGAGGACTGGGTGCCCCAGCTTGAATGAGGATGCGGATGAACTCTTACCTACGCATGGTGATCTAGGCCGTATGTTTGCGGCCCCCCGGGCGGACAGCTCACCTAGCACAAACGAGTGGGCTTGGTTTTTCTCCGCAGCACGCACTGCATAGGGGGGTTCATAGAAGGAGCAATAAC 3 | -------------------------------------------------------------------------------- /chapter4/C4_53/test.de_bruijn_graph.txt: -------------------------------------------------------------------------------- 1 | 4 2 | AAGATTCTCTAC 3 | -------------------------------------------------------------------------------- /chapter4/C4_54/debruijn_graph_from_kmers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 | # 9 | # Date created: 10 | # 11 | # 4 Dec 2013 12 | # 13 | # DeBruijn Graph from k-mers Problem: Construct the de Bruijn graph from a set of k-mers. 14 | # Input: A collection of k-mers Patterns. 15 | # Output: The adjacency list of the de Bruijn graph DeBruijn(Patterns). 16 | # 17 | # CODE CHALLENGE: Solve the de Bruijn Graph from k-mers Problem. 
18 | # 19 | # Sample Input: 20 | # GAGG 21 | # GGGG 22 | # GGGA 23 | # CAGG 24 | # AGGG 25 | # GGAG 26 | # 27 | # Sample Output: 28 | # AGG -> GGG 29 | # CAG -> AGG 30 | # GAG -> AGG 31 | # GGA -> GAG 32 | # GGG -> GGA,GGG 33 | # 34 | ############################################################################### 35 | 36 | import sys 37 | import timeit 38 | import heapq 39 | import operator 40 | import random 41 | import numpy as np 42 | from scipy import stats 43 | from itertools import combinations,product,izip,ifilter,chain 44 | from collections import Counter,defaultdict 45 | 46 | def read_file(input_file): 47 | ''' 48 | >>> kmers = read_file('test.debruijn_graph_from_kmers.txt') 49 | >>> kmers = read_file('test.debruijn_graph_from_kmers.extra.txt') 50 | ''' 51 | f = open(input_file) 52 | data = [item.strip() for item in f.readlines()] 53 | f.close() 54 | return data 55 | 56 | def overlap_graph(kmers): 57 | return [(k1,k2) for k1,k2 in product(kmers,kmers) if k1 != k2 and k1.endswith(k2[:-1])] 58 | 59 | def de_bruijn_graph(kmers): 60 | og = overlap_graph(kmers) 61 | nodes = [(item[0][:-1],item[1][:-1]) for item in og] 62 | potentials = [(item[0][1:],item[1][1:]) for item in og] 63 | for item in potentials: 64 | if item not in nodes: 65 | nodes.append(item) 66 | d = defaultdict(tuple) 67 | for tup in nodes: 68 | if tup[1] not in d[tup[0]]: 69 | d[tup[0]] += (tup[1],) 70 | return d 71 | 72 | def result(filename): 73 | kmers = read_file(filename) 74 | results = de_bruijn_graph(kmers) 75 | return results 76 | 77 | if __name__ == "__main__": 78 | 79 | start = timeit.default_timer() 80 | results = result(sys.argv[-1]) 81 | fw = open('output.'+sys.argv[-1][:-4]+'.txt','w') 82 | for k,v in results.iteritems(): 83 | fw.write('{0} -> {1}'.format(k,','.join(v))+'\n') 84 | fw.close() 85 | stop = timeit.default_timer() 86 | print stop - start 87 | -------------------------------------------------------------------------------- /chapter4/C4_54/haha.txt: 
-------------------------------------------------------------------------------- 1 | def compare(filename1,filename2): 2 | f1 = open(filename1) 3 | data1 = f1.readlines() 4 | f1.close() 5 | data1 = [item.strip().split(' -> ') for item in data1] 6 | f2 = open(filename2) 7 | data2 = f2.readlines() 8 | f2.close() 9 | data2 = [item.strip().split(' -> ') for item in data2] 10 | length1 = len(data1) 11 | length2 = len(data2) 12 | print 'Number of lines in '+filename1 + ' :'+str(length1) 13 | print 'Number of lines in '+filename2 + ' :'+str(length2) 14 | for item in data1: 15 | if item not in data2: 16 | print str(item) + ' in ' + filename1 + ' not in ' + filename2 17 | for item in data2: 18 | if item not in data1: 19 | print str(item) + ' in ' + filename2 + ' not in ' + filename1 20 | return None 21 | -------------------------------------------------------------------------------- /chapter4/C4_54/output.test.debruijn_graph_from_kmers.txt: -------------------------------------------------------------------------------- 1 | GAG -> AGG 2 | AGG -> GGG 3 | GGG -> GGG,GGA 4 | CAG -> AGG 5 | GGA -> GAG 6 | -------------------------------------------------------------------------------- /chapter4/C4_54/test.debruijn_graph_from_kmers.txt: -------------------------------------------------------------------------------- 1 | GAGG 2 | GGGG 3 | GGGA 4 | CAGG 5 | AGGG 6 | GGAG 7 | -------------------------------------------------------------------------------- /chapter4/C4_57/eulerian_cycle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 | # 9 | # Date created: 10 | # 11 | # 4 Dec 2013 12 | # 13 | # CODE CHALLENGE: Solve the Eulerian Cycle Problem. 14 | # Input: The adjacency list of an Eulerian directed graph. 15 | # Output: An Eulerian cycle in this graph. 
16 | # 17 | # Sample Input: 18 | # 0 -> 3 19 | # 1 -> 0 20 | # 2 -> 1,6 21 | # 3 -> 2 22 | # 4 -> 2 23 | # 5 -> 4 24 | # 6 -> 5,8 25 | # 7 -> 9 26 | # 8 -> 7 27 | # 9 -> 6 28 | # 29 | # Sample Output: 30 | # 6->8->7->9->6->5->4->2->1->0->3->2->6 31 | # 32 | ############################################################################### 33 | 34 | import sys 35 | import timeit 36 | import re 37 | import heapq 38 | import random 39 | import numpy as np 40 | from scipy import stats 41 | from itertools import combinations,product,izip,ifilter,chain 42 | from collections import Counter,defaultdict 43 | 44 | def read_file(input_file): 45 | ''' 46 | >>> data = read_file('test.eulerian_cycle.txt') 47 | >>> data 48 | [('0', '3'), ('1', '0'), ('2', '1'), ('2', '6'), ('3', '2'), ('4', '2'), ('5', '4'), ('6', '5'), ('6', '8'), ('7', '9'), ('8', '7'), ('9', '6')] 49 | >>> data = read_file('test.eulerian_cycle.3.txt') 50 | >>> data = read_file('test.eulerian_cycle.extra.txt') 51 | ''' 52 | f = open(input_file) 53 | data = [item.strip() for item in f.readlines()] 54 | f.close() 55 | result = [] 56 | for line in data: 57 | dp = data_process(line) 58 | if isinstance(dp,tuple): 59 | result.append(dp) 60 | elif isinstance(dp,list): 61 | result += dp 62 | return result 63 | 64 | def data_process(line): 65 | line = [item.strip() for item in line.split('->')] 66 | if ',' in line[1]: 67 | for item in line[1].split(','): 68 | line.append((line[0],item)) 69 | line.pop(0) 70 | line.pop(0) 71 | else: 72 | line = tuple(line) 73 | return line 74 | 75 | def form_cycle(data): 76 | rgk,rgv = random.choice(data) 77 | cycle = [rgk] 78 | while len(data) != 0: 79 | try: 80 | cycle.append(rgv) 81 | data.remove((rgk,rgv)) 82 | rgk = rgv 83 | rgv = [item2 for (item1,item2) in data if rgv == item1][0] 84 | except: 85 | break 86 | return (cycle,data) 87 | 88 | def form_unCycle(data,rgk): 89 | rgv = [item2 for (item1,item2) in data if rgk == item1][0] 90 | cycle = [rgk] 91 | while len(data) != 0: 92 | try: 
93 | cycle.append(rgv) 94 | data.remove((rgk,rgv)) 95 | rgk = rgv 96 | rgv = [item2 for (item1,item2) in data if rgv == item1][0] 97 | except: 98 | break 99 | return (cycle,data) 100 | 101 | def fuse(Cycle,new_Cycle): 102 | fuse_index = Cycle.index(new_Cycle[0]) 103 | return Cycle[:fuse_index]+new_Cycle+Cycle[fuse_index+1:] 104 | 105 | def eulerian_cycle(data): 106 | Cycle,unCycle = form_cycle(data) 107 | while len(unCycle) != 0: 108 | potential = [item for item in Cycle if item in chain(*unCycle)] 109 | newStart = random.choice(potential) 110 | new_Cycle,unCycle = form_unCycle(unCycle,newStart) 111 | Cycle = fuse(Cycle,new_Cycle) 112 | return Cycle 113 | 114 | def result(filename): 115 | data = read_file(filename) 116 | results = eulerian_cycle(data) 117 | return results 118 | 119 | if __name__ == "__main__": 120 | 121 | start = timeit.default_timer() 122 | results = result(sys.argv[-1]) 123 | fw = open('output.'+sys.argv[-1][:-4]+'.txt','w') 124 | fw.write('->'.join(results)) 125 | fw.close() 126 | stop = timeit.default_timer() 127 | print stop - start 128 | -------------------------------------------------------------------------------- /chapter4/C4_57/output.test.eulerian_cycle.3.txt: -------------------------------------------------------------------------------- 1 | 4->2->10->11->12->2->1->0->3->2->6->8->7->9->6->5->4 -------------------------------------------------------------------------------- /chapter4/C4_57/output.test.eulerian_cycle.txt: -------------------------------------------------------------------------------- 1 | 0->3->2->6->8->7->9->6->5->4->2->1->0 -------------------------------------------------------------------------------- /chapter4/C4_57/test.eulerian_cycle.3.txt: -------------------------------------------------------------------------------- 1 | 0 -> 3 2 | 1 -> 0 3 | 2 -> 1,6,10 4 | 3 -> 2 5 | 4 -> 2 6 | 5 -> 4 7 | 6 -> 5,8 8 | 7 -> 9 9 | 8 -> 7 10 | 9 -> 6 11 | 10 -> 11 12 | 11 -> 12 13 | 12 -> 2 14 | 
-------------------------------------------------------------------------------- /chapter4/C4_57/test.eulerian_cycle.txt: -------------------------------------------------------------------------------- 1 | 0 -> 3 2 | 1 -> 0 3 | 2 -> 1,6 4 | 3 -> 2 5 | 4 -> 2 6 | 5 -> 4 7 | 6 -> 5,8 8 | 7 -> 9 9 | 8 -> 7 10 | 9 -> 6 11 | -------------------------------------------------------------------------------- /chapter5/C5_57/57_10/dataset_57_10.txt: -------------------------------------------------------------------------------- 1 | 19 2 | -------------------------------------------------------------------------------- /chapter5/C5_57/57_10/k-universal_circular_string.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 | # 9 | # Date created: 10 | # 11 | # 5 Dec 2013 12 | # 13 | # CODE CHALLENGE: Solve the k-Universal Circular String Problem. 14 | # Input: An integer k. 15 | # Output: A k-universal circular string. 
16 | # 17 | # Sample Input: 18 | # 4 19 | # 20 | # Sample Output: 21 | # 0000110010111101 22 | # 23 | ############################################################################### 24 | 25 | import sys 26 | import timeit 27 | import random 28 | from itertools import combinations,product,izip,ifilter,chain 29 | from collections import Counter,defaultdict 30 | 31 | def read_file(input_file): 32 | ''' 33 | >>> k = read_file('test.k-universal_circular_string.4.txt') 34 | ''' 35 | f = open(input_file) 36 | data = f.read().strip() 37 | f.close() 38 | return int(data) 39 | 40 | def generate_binary(k): 41 | return [bin(i)[2:].zfill(k) for i in range(2**k)] 42 | 43 | def form_relation(k): 44 | nodes = generate_binary(k-1) 45 | d = {} 46 | for item in nodes: 47 | add1 = item[1:]+'0' 48 | add2 = item[1:]+'1' 49 | d[item] = [add1,add2] 50 | return d 51 | 52 | def form_cycle(data): 53 | rgk = random.choice(data.keys()) 54 | rgv = random.choice(data[rgk]) 55 | cycle = [rgk] 56 | while len(data) != 0: 57 | try: 58 | cycle.append(rgv) 59 | if len(data[rgk]) > 1: 60 | data[rgk].remove(rgv) 61 | else: 62 | del data[rgk] 63 | rgk = rgv 64 | rgv = random.choice(data[rgk]) 65 | if rgv == cycle[0] and rgv == cycle[-1]: 66 | break 67 | except: 68 | break 69 | return (cycle,data) 70 | 71 | def form_unCycle(data,rgk): 72 | choose = data[rgk] 73 | rgv = random.choice(choose) 74 | cycle = [rgk] 75 | while len(data) != 0: 76 | try: 77 | cycle.append(rgv) 78 | if len(data[rgk]) > 1: 79 | data[rgk].remove(rgv) 80 | else: 81 | del data[rgk] 82 | rgk = rgv 83 | rgv = random.choice(data[rgk]) 84 | if rgv == cycle[0] and rgv == cycle[-1]: 85 | break 86 | except: 87 | break 88 | return (cycle,data) 89 | 90 | def fuse(Cycle,new_Cycle): 91 | fuse_index = Cycle.index(new_Cycle[0]) 92 | return Cycle[:fuse_index]+new_Cycle+Cycle[fuse_index+1:] 93 | 94 | def eulerian_cycle(data,k): 95 | Cycle,unCycle = form_cycle(data) 96 | while len(unCycle) != 0: 97 | keys = unCycle.keys() 98 | potential = 
list(set(Cycle)&set(keys)) 99 | newStart = random.choice(potential) 100 | new_Cycle,unCycle = form_unCycle(unCycle,newStart) 101 | Cycle = fuse(Cycle,new_Cycle) 102 | return Cycle 103 | 104 | def form_string(path): 105 | string = ''.join([item[-1] for item in path[1:]]) 106 | return string 107 | 108 | def result(filename): 109 | k = read_file(filename) 110 | data = form_relation(k) 111 | path = eulerian_cycle(data,k) 112 | results = form_string(path) 113 | return results 114 | 115 | if __name__ == "__main__": 116 | 117 | start = timeit.default_timer() 118 | results = result(sys.argv[-1]) 119 | fw = open('output.'+sys.argv[-1][:-4]+'.txt','w') 120 | fw.write(results) 121 | fw.close() 122 | stop = timeit.default_timer() 123 | print stop - start 124 | -------------------------------------------------------------------------------- /chapter5/C5_57/57_10/output.test.k-universal_circular_string.4.txt: -------------------------------------------------------------------------------- 1 | 1101011001000011 -------------------------------------------------------------------------------- /chapter5/C5_57/57_10/test.k-universal_circular_string.4.txt: -------------------------------------------------------------------------------- 1 | 4 2 | -------------------------------------------------------------------------------- /chapter5/C5_57/57_5/eulerian_path.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 | # 9 | # Date created: 10 | # 11 | # 5 Dec 2013 12 | # 13 | # CODE CHALLENGE: Solve the Eulerian Path Problem. 14 | # Input: The adjacency list of a directed graph that has an Eulerian path. 15 | # Output: An Eulerian path in this graph. 
#
# Sample Input:
#     0 -> 2
#     1 -> 3
#     2 -> 1
#     3 -> 0,4
#     6 -> 3,7
#     7 -> 8
#     8 -> 9
#     9 -> 6
#
# Sample Output:
#     6->7->8->9->6->3->0->2->1->3->4
#
###############################################################################

import sys
import timeit
import re
import heapq
import random
import numpy as np
from itertools import combinations,product,chain
try:
    from itertools import izip,ifilter
except ImportError:
    # Python 3 dropped izip/ifilter because zip/filter are already lazy there.
    izip,ifilter = zip,filter
from collections import Counter,defaultdict

def data_process(line):
    '''
    Parse one adjacency-list line such as "3 -> 0,4".

    Returns a single (source, target) tuple when the line names one target
    and a list of such tuples when it names several; read_file() dispatches
    on that difference.  Raises ValueError when the line contains no "->".

    >>> data_process('0 -> 2')
    ('0', '2')
    >>> data_process('3 -> 0,4')
    [('3', '0'), ('3', '4')]
    '''
    source, targets = [item.strip() for item in line.split('->', 1)]
    edges = [(source, target.strip()) for target in targets.split(',')]
    if len(edges) == 1:
        return edges[0]
    return edges

def read_file(input_file):
    '''
    Read an adjacency list and return it as a flat list of (source, target)
    edges, in file order.  Blank lines are ignored.

    >>> data = read_file('test.eulerian_path.txt')
    >>> data
    [('0', '2'), ('1', '3'), ('2', '1'), ('3', '0'), ('3', '4'), ('6', '3'), ('6', '7'), ('7', '8'), ('8', '9'), ('9', '6')]
    >>> data = read_file('test.eulerian_path.extra.txt')
    '''
    result = []
    with open(input_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            dp = data_process(line)
            if isinstance(dp,tuple):
                result.append(dp)
            else:
                result.extend(dp)
    return result

def find_ender(data):
    '''
    Return the node at which an Eulerian path through *data* has to end,
    i.e. the node with more incoming than outgoing edges.

    When every node is balanced (the graph has an Eulerian cycle) any node
    can serve as the end, so the target of the first edge is returned.
    Raises IndexError when *data* is empty.
    '''
    outdegree = Counter(edge[0] for edge in data)
    indegree = Counter(edge[1] for edge in data)
    for edge in data:
        if outdegree[edge[1]] < indegree[edge[1]]:
            return edge[1]
    return data[0][1]

def _take_edge(edges, position, node):
    '''
    Remove and return the first edge whose endpoint at *position*
    (0 = source, 1 = target) equals *node*; return None if there is none.
    '''
    for index, edge in enumerate(edges):
        if edge[position] == node:
            del edges[index]
            return edge
    return None

def _walk_backward(edges, end):
    '''
    Follow unused edges backwards from *end* until no edge leads into the
    current node, consuming them from *edges*.  Returns the visited nodes in
    forward order, so the list always finishes with *end*.
    '''
    walk = [end]
    while True:
        edge = _take_edge(edges, 1, walk[-1])
        if edge is None:
            break
        walk.append(edge[0])
    walk.reverse()
    return walk

def find_path(data):
    '''
    Build a first trail that finishes at the end node of the graph.

    Returns (path, data, initial_ender): the trail as a list of nodes, the
    same *data* list with the used edges removed, and the end node.
    '''
    initial_ender = find_ender(data)
    path = _walk_backward(data, initial_ender)
    return (path,data,initial_ender)

def form_cycle(unpath,initial_ender):
    '''
    Build a closed walk through *initial_ender* from the unused edges.

    Returns (cycle, unpath).  When no unused edge leads into
    *initial_ender* the cycle is just [initial_ender]; the previous
    implementation raised IndexError in that case, which happens whenever the
    first trail already used every edge into the end node.
    '''
    return (_walk_backward(unpath, initial_ender),unpath)

def fuse(path,cycle):
    '''
    Splice *cycle* into *path* at the first occurrence of the cycle's first
    node.  *cycle* is expected to start and end on the same node, so the
    single path entry it replaces is preserved at both ends.
    '''
    fuse_index = path.index(cycle[0])
    return path[:fuse_index]+cycle+path[fuse_index+1:]

def eulerian_path(data):
    '''
    Return an Eulerian path through *data* as a list of nodes.

    *data* is a list of (source, target) edges and is emptied as edges are
    used.  An empty edge list yields an empty path.  Raises ValueError when
    some edges cannot be reached from the path (disconnected graph).
    '''
    if not data:
        return []
    path,unpath,initial_ender = find_path(data)
    cycle,unpath = form_cycle(unpath,initial_ender)
    path = fuse(path,cycle)
    while unpath:
        # Restart from the first path node that still has an unused outgoing
        # edge; a fixed choice keeps the output reproducible between runs.
        sources = set(edge[0] for edge in unpath)
        for newStart in path:
            if newStart in sources:
                break
        else:
            raise ValueError('graph is not connected: %d edge(s) unreachable' % len(unpath))
        cycle,unpath = form_unCycle(unpath,newStart)
        path = fuse(path,cycle)
    return path

def form_unCycle(data,rgk):
    '''
    Follow unused edges forwards from *rgk* until the current node has no
    unused outgoing edge.  Returns (cycle, data) where cycle starts at *rgk*.
    '''
    cycle = [rgk]
    while True:
        edge = _take_edge(data, 0, cycle[-1])
        if edge is None:
            break
        cycle.append(edge[1])
    return (cycle,data)

def result(filename):
    '''Read the adjacency list in *filename* and return its Eulerian path.'''
    data = read_file(filename)
    results = eulerian_path(data)
    return results

if __name__ == "__main__":

    start = timeit.default_timer()
    results = result(sys.argv[-1])
    # The output name is the input name with its 4-character extension replaced.
    with open('output.'+sys.argv[-1][:-4]+'.txt','w') as fw:
        fw.write('->'.join(results))
    stop = timeit.default_timer()
    print(stop - start)

# -------------------------------------------------------------------------------- /chapter5/C5_57/57_5/output.test.eulerian_path.txt:
-------------------------------------------------------------------------------- 1 | 6->7->8->9->6->3->0->2->1->3->4 -------------------------------------------------------------------------------- /chapter5/C5_57/57_5/test.eulerian_path.txt: -------------------------------------------------------------------------------- 1 | 0 -> 2 2 | 1 -> 3 3 | 2 -> 1 4 | 3 -> 0,4 5 | 6 -> 3,7 6 | 7 -> 8 7 | 8 -> 9 8 | 9 -> 6 9 | -------------------------------------------------------------------------------- /chapter5/C5_57/57_6/answer.test.string_reconstruction.extra.txt: -------------------------------------------------------------------------------- 1 | GAGGTAAAGGACGGACATTGAACTGCCTTTCACATCAAAAATCTTTCTTAAATAGACATGGTACCGCTTTATCATCTGGTCCCCAAGCCCTCAGGCTCTATGATTCCCAGGTCAACCATGCCTTCTCAGGGTCTGGAACATGCTGGACATTTTAATATCAGATGGTGGCACATTTTAACGCATCCCCAAAGGCTGAACAGTGGCTCGGAATTTGAAAACGTCTGAACCCATAGGTTATCACTCCTACCCCCATCCAAACCAAGGTCATACAAAGAAGGGGAGCACAGCTATTGTTTCTTTAGTGTGACCCGGGTGTTACTAGCGCAACTTCGAGTAACATCACAAGAACTCCCTCTTCAATTTTATTACTTGCCTTAGCTACCTTGGTCCGTTCTACTATACATGATGCATCTTATCGTAACCGTGGTCTGTACCCGAGTGAAACGGCAGCAGCGACTCCCTACAGCGGCGTCGCAAGAATCTCGTTTCCCTTTGAAAGGGCCGCAAGACGGCGCTTATGTCCCGCGTCTCTTATGCTCACCAGATTGTTCCAATACGTTCTTCGTCCCATATTCATATGCTTGCTGAAATATTCCTGGTGAGCCACGGCCGGCCATCCAATGTATACGACTACCTATATTACTGACACATGCCAACCTTGCTACAGCATTCCGCTTGTAAAAGCATGATCTCTGCCCGTTTGTACTCTTAAATGGGTGAGTTCATTGTAGGCGTACCTGTCCCGTCGGCAACCATGGGAAACTGTTAACCGTATAATTAGTTCTGTCTTATTATGCTCTCGCTTACTTAACGTAACGAATAGTCCGGGCGAATACATACCTTGGCCTTTGTACTCACCCTCGCAATTAGGATCTGGACCGCAATCAAGCGGTGGCGAGCGGTAAACAGAACTAAGAGACGTGCCACATTTCATATATGCGGGTGCCCGGCCGCACCACGGACTATAAGGATAACGAGGCGTGTATTCCTCCACCACAGAAAACTAGAAGAGGCCCTTCTATCCAGTTGTGTCCCTAATTCCGCCAGAAAAGCCTAAATCACACTGGGGGGATCTTCCTGAATCACCTCACTATTAGCCTGAGTGGGACGCTCACAGCAGTATGGGTGTATCGTGCAGAAAGGGATCTAACCTTTTCGTACCTCATTACACACGGGTCGGAGCCAAGCGCACAGTGGCTGCGCGCGTTCGAACGTCTACGGCCATGCTTCGGTCGCATACTCATATGTACGGAGAAAAACAGGAGTGACTGTCTATGATTGGTCCTGACGTGCTGAACCGTTAACGACGGCGGCGCCCTACCTGGATATTGGACCGTACCCCGACAGA
GTGCTTCCCTCTGTCAAGAAATCAATCGGAAGAACGGCTGAACCTATATTCAGGCGGTTGTCTAGATGGTTGATACAGTATCCTATATCACGAGTGACAGGATTGGTTCAAAAATAGAATACCCCACATACGGTTTACAGGGCCTAAGGCTATCGAAAACAAGCAAGGCCGGGTTACCTGGTCCCAGAATACTAAATGTTAACAGTAGACTCTCGAAGGATATGCGGAAAGCTCAAGAATCCCGTCCTAGGTTG 2 | -------------------------------------------------------------------------------- /chapter5/C5_57/57_6/output.dataset_57_6.txt: -------------------------------------------------------------------------------- 1 | AAGTCAGCTTATAGGGTTACGCTCTGCGTTCCCACCACGTCTTTCAAGTACCAACTGTTTACCGGGCCAGCGCGGCCATTCTATTGCAACTGGCACGCCGTACGATGTCAACTGCGTTTTCGTGCAACGGGTATCTCGAGCGTAGAAGTAGCTTGGATCGCGTAAGGGCACATGCGAGGAGATATTTATGCTACGCCCTGGTCCCTCGAACACGGCTACTTCGAAATCCGTCGCGAATCAGGGGCCGGCATAACCATTGTTCTTATATCGATCCGGATAGCACAGAACATACAGAACTGGCCGCGATACTTCGCGCCAGGCTGTATTTACAGAAAGACCCAGGGCAGTTCCGAGCCTACGCCAGGTCTTTCGACGTATAGTAGGGGTACTCGCTAGCCCTTCGGATCATGGTCGGCCGCCATCCAATCGCTCGCCTGACGGTTCGGCAGTAGTAGTGTGTTCTGAGGGTGAGAGACCTGGGACCGCCTATACTCTGCGTGCGCAAGTTAGCCTACGCATCTTACTATTCCTGACGGACTCCTACAGATCCGATTTTGGGCGGTGGTGAACTATGACCTAATAGTATAAGGAACTACACGCTGCTCGAATGCGTCTTGAATAGATACAAATCGGTGGACCAATTCTAGAATGCGTCATCATGATTGTGTGCTGTATACAGTCAATTCAGCTGCTCCGAAGGAACACTAGCGGTTCACCTGAACGAACGTACGCATCTGCTGTTTCCGCTGTGTGAGACTCGTAAAAATCGGAACGAGAAGGCTAGCTGCAATCGCGCGTCAAGAGCGTTGTAGGTCATTAATAAGGTGACTCTTAAAAAGCAAGGTAGAGAGGGTGATGGTGTACATCCGGGTCCTGTGAACGCCTAATGTCACGATGTAGGCCAGGAAAATTCGGTGTGCTACACCAGAAAGACTAGGCGGGTGACATTACTGCCGTGAGTCAATGCCTCATTGATGGTGTGGCTTAGTACGCGAGCATTTTTACCTCGCAATCTAACGGTCACTTCACGGCCAGCCTTTACTGGCACAAAAAACTATCGTACGTGCCTCGGCCCAAACGTGATATATCCGTTTCTTTACTTCCGGACACGCGCTCTCTCGGCATACTAGCACTTACACCAGGGGCGCACGCTCTATGGAAACCCTGAATCCGACGGAGGGCGTACATCTTGACCGGTTACTAAGAATCACCCAAGCATGCAGCGATCCGTGCACCCCTATCACGATCGTATTTGGTTTGCACTATTTTATCGCTTCGGGAGGTACACATTACCGAAAACTCGAATCATCCGGTTTAATGGCTCGCGCTATACAACAGAAAATACCGCCATGCGCGACCGCAAGCGTTGCCACACAGTATTGACATGGGGTAGTTCTTCTATATCCCGGAGACCATCGCGCAAAATCCCGTTCAATTATGTAAAGATCGACGGCTGGGTGACCAACTGGATCCATCGTTCTGAGACTGGCAACGGCATAAAGACGTCCTAACCATGGCCATATAAATAT
TTTCCCTGTTAGACCTAGCAAGGAGCACACGATATAGCTTATAGCTCACAGATTAATGGGCGTTCTATGGAAAGGTTGCGTAGTCTACTGCAAGGGGAGGCAGGCTTAGACGCGCAACGAAAGATCCAGTCACGTACCGTGATCCATGACACCTGTAGCCCGTGGGCCATCTCTTCTCGCAATATCTTGGACCCTGATCATTGCTTCTCCACTAGGTTTTCATTAAACAAACACCTTTTACTCGGAGCGCCGAGCCTCCTCATCAGGCGTTATCCCATAGCCCTTGTTACGCTA -------------------------------------------------------------------------------- /chapter5/C5_57/57_6/output.test.string_reconstruction.extra.txt: -------------------------------------------------------------------------------- 1 | GAGGTAAAGGACGGACATTGAACTGCCTTTCACATCAAAAATCTTTCTTAAATAGACATGGTACCGCTTTATCATCTGGTCCCCAAGCCCTCAGGCTCTATGATTCCCAGGTCAACCATGCCTTCTCAGGGTCTGGAACATGCTGGACATTTTAATATCAGATGGTGGCACATTTTAACGCATCCCCAAAGGCTGAACAGTGGCTCGGAATTTGAAAACGTCTGAACCCATAGGTTATCACTCCTACCCCCATCCAAACCAAGGTCATACAAAGAAGGGGAGCACAGCTATTGTTTCTTTAGTGTGACCCGGGTGTTACTAGCGCAACTTCGAGTAACATCACAAGAACTCCCTCTTCAATTTTATTACTTGCCTTAGCTACCTTGGTCCGTTCTACTATACATGATGCATCTTATCGTAACCGTGGTCTGTACCCGAGTGAAACGGCAGCAGCGACTCCCTACAGCGGCGTCGCAAGAATCTCGTTTCCCTTTGAAAGGGCCGCAAGACGGCGCTTATGTCCCGCGTCTCTTATGCTCACCAGATTGTTCCAATACGTTCTTCGTCCCATATTCATATGCTTGCTGAAATATTCCTGGTGAGCCACGGCCGGCCATCCAATGTATACGACTACCTATATTACTGACACATGCCAACCTTGCTACAGCATTCCGCTTGTAAAAGCATGATCTCTGCCCGTTTGTACTCTTAAATGGGTGAGTTCATTGTAGGCGTACCTGTCCCGTCGGCAACCATGGGAAACTGTTAACCGTATAATTAGTTCTGTCTTATTATGCTCTCGCTTACTTAACGTAACGAATAGTCCGGGCGAATACATACCTTGGCCTTTGTACTCACCCTCGCAATTAGGATCTGGACCGCAATCAAGCGGTGGCGAGCGGTAAACAGAACTAAGAGACGTGCCACATTTCATATATGCGGGTGCCCGGCCGCACCACGGACTATAAGGATAACGAGGCGTGTATTCCTCCACCACAGAAAACTAGAAGAGGCCCTTCTATCCAGTTGTGTCCCTAATTCCGCCAGAAAAGCCTAAATCACACTGGGGGGATCTTCCTGAATCACCTCACTATTAGCCTGAGTGGGACGCTCACAGCAGTATGGGTGTATCGTGCAGAAAGGGATCTAACCTTTTCGTACCTCATTACACACGGGTCGGAGCCAAGCGCACAGTGGCTGCGCGCGTTCGAACGTCTACGGCCATGCTTCGGTCGCATACTCATATGTACGGAGAAAAACAGGAGTGACTGTCTATGATTGGTCCTGACGTGCTGAACCGTTAACGACGGCGGCGCCCTACCTGGATATTGGACCGTACCCCGACAGAGTGCTTCCCTCTGTCAAGAAATCAATCGGAAGAACGGCTGAACCTATATTCAGGCGGTTGTCTAGATGGTTGATACAGTATCCTATATCACGAGTGACAGGATTGGTTCAAAAATAGAATACCCC
# ACATACGGTTTACAGGGCCTAAGGCTATCGAAAACAAGCAAGGCCGGGTTACCTGGTCCCAGAATACTAAATGTTAACAGTAGACTCTCGAAGGATATGCGGAAAGCTCAAGAATCCCGTCCTAGGTTG -------------------------------------------------------------------------------- /chapter5/C5_57/57_6/output.test.string_reconstruction.txt: -------------------------------------------------------------------------------- 1 | GGCTTACCA -------------------------------------------------------------------------------- /chapter5/C5_57/57_6/string_reconstruction.py: --------------------------------------------------------------------------------
#!/usr/bin/python

###############################################################################
#
# Author:
#
#     Sanyk28 (san-heng-yi-shu@163.com)
#
# Date created:
#
#     5 Dec 2013
#
# CODE CHALLENGE: Solve the String Reconstruction Problem.
#     Input: The adjacency list of a directed graph that has an Eulerian path.
#     Output: An Eulerian path in this graph.
#
# Sample Input:
#     CTT -> TTA
#     ACC -> CCA
#     TAC -> ACC
#     GGC -> GCT
#     GCT -> CTT
#     TTA -> TAC
#
# Sample Output:
#     GGCTTACCA
#
###############################################################################

import sys
import timeit
import re
import heapq
import random
import numpy as np
from itertools import combinations,product,chain
try:
    from itertools import izip,ifilter
except ImportError:
    # Python 3 dropped izip/ifilter because zip/filter are already lazy there.
    izip,ifilter = zip,filter
from collections import Counter,defaultdict

def read_file(input_file):
    '''
    Read an adjacency list into a list of [source, target] edges.

    Each line is "SRC -> DST" or "SRC -> DST1,DST2,..."; a line with several
    targets contributes one edge per target.  Blank lines are ignored.

    >>> data = read_file('test.string_reconstruction.txt')
    >>> data = read_file('test.string_reconstruction.extra.txt')
    '''
    data = []
    with open(input_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            source, targets = line.split('->', 1)
            for target in targets.split(','):
                data.append([source.strip(), target.strip()])
    return data

def find_ender(data):
    '''
    Return the node at which an Eulerian path through *data* has to end,
    i.e. the node with more incoming than outgoing edges.

    When every node is balanced any node can serve as the end, so the target
    of the first edge is returned.  Raises IndexError when *data* is empty.
    '''
    outdegree = Counter(edge[0] for edge in data)
    indegree = Counter(edge[1] for edge in data)
    for edge in data:
        if outdegree[edge[1]] < indegree[edge[1]]:
            return edge[1]
    return data[0][1]

def _take_edge_into(edges, node):
    '''Remove and return the first edge that ends at *node*, or None.'''
    for index, edge in enumerate(edges):
        if edge[1] == node:
            return edges.pop(index)
    return None

def find_path(data):
    '''
    Return the nodes of an Eulerian path through *data*, in path order.

    Edges are consumed from *data* as they are used.  The walk runs backwards
    from the end node and, whenever it gets stuck, backs up to the most
    recent node that still has an unused incoming edge (Hierholzer's
    algorithm on the reversed graph).  The earlier single greedy walk stopped
    at the first dead end and so dropped every cycle it had skipped, which
    truncated the result whenever a (k-1)-mer occurred more than once.
    '''
    stack = [find_ender(data)]
    path = []
    while stack:
        edge = _take_edge_into(data, stack[-1])
        if edge is None:
            # Dead end: this node's position in the final path is now fixed.
            path.append(stack.pop())
        else:
            stack.append(edge[0])
    return path

def form_string(path):
    '''
    Spell the string for a path of overlapping k-mers: the first node in
    full, then the last character of every following node.
    '''
    return path[0] + ''.join(item[-1] for item in path[1:])

def result(filename):
    '''Read the adjacency list in *filename* and return the reconstructed string.'''
    data = read_file(filename)
    path = find_path(data)
    results = form_string(path)
    return results

if __name__ == "__main__":

    start = timeit.default_timer()
    results = result(sys.argv[-1])
    # The output name is the input name with its 4-character extension replaced.
    with open('output.'+sys.argv[-1][:-4]+'.txt','w') as fw:
        fw.write(results)
    stop = timeit.default_timer()
    print(stop - start)

# -------------------------------------------------------------------------------- /chapter5/C5_57/57_6/test.string_reconstruction.txt: -------------------------------------------------------------------------------- 1 | CTT -> TTA 2 | ACC -> CCA 3 | TAC -> ACC 4 | GGC -> GCT 5 | GCT -> CTT 6 | TTA -> TAC 7 | -------------------------------------------------------------------------------- /chapter5/C5_58/answer.test.srfrp.extra.txt: -------------------------------------------------------------------------------- 1 |
AACAGAAAGCGAGCGAACCGTCTCTTGCATTTCGCCCTATGTTACTATGCATTACCATGGCTTAAGTACGAAGAATCTGTGACGGAAGCTCTTCGCAGTTCGCTAGGGCCTGATAGATCAGGATGGTACATCTCCACCAATTATCCGCAGCAATTACACGCCCAACCTATTTCGGGGATCTAGCACGCCAAGGTGTCCACTCGGCGTTCGGGGAGACCACTACGACGATAGCATATCTCACCAGTTCGGCCCTTTCGTCACTGTCACTTCCTACTTCTAGCGGTCTGATGATATTATACCTGACCGACTTGAAACGGTATCTACGGGCCATGGGCACGTAATCATTCCATCTACCGTTACTCCTATCGCCTCATGATGCTGCCTGCAACCTATCGTTGGACACACCCGCATCTGGTTGGCGGTACTGGCAGATTATCGCGCTGGTTGTGTATTCTTAGCGGCAAATCACATCGGTTAATAAGTTTACGAACAGCACCAGCTCAACCGTTCATTGCCCCACCATACGAGACAGCGACAGGCACATCTGATCCGTGTATCGTCTCATGGCTAAATACATTTCACCGTTGCGTGGGGAAGTCCTTGTGCCATGCCTGTACAAAATGGCGGCCGTAAAGTTCCCATGCGAAGGAACATTCCGGCAGTCTGTTTTGCGTCTTAGGCCCTTGGTAATTACTACTTAGGGGAAGGTGGTCCTTACGGTATTCGTATAGATAAACCTCCTCCCGACTCTCGCTGATATAAACTGGGGCCGTGAGAAACTGGGTGTCAAGCATAGTTCTAGCTACATAGTTCCAAGCGTATTTCGGGGCTAGGCCGGGGGTGCGCCCTGGGCAACGCTCAGGGTGTCCCCCGTGACATGACGAGGGAGTAGCCTAGGCACATCGTTACTTATAACCCAAGCAACGCGCTGTTTAGGCGAATGACGTAGTACTCTATGCCGCTGCAGGTCCCTGGGCGTCAGGCCTGTGACGGAAGCTCTTCGCAGTTCGCTAGTAGGTGCGTCATCTCATGACACCCAACCCTTGTCTGTAAATGTGTGACGGAAGCTCTTCGCAGTTCGCTAGATCGAGTCCCCAGGACATCAGTATCTAACCTCCACTTCCTGTTCGCCTCCCCTTTGTTTTCATGCAGTTGTCATCTGCATGTCATGTATACAACCTATGGAGGAAGTCTCGATGTCGGTTTCGGGGATGTGTGATCCCGTTGCGGTGACAGCGATCTTGTCATCGCTCACATAACTCATTAACTATTGAATTGGCGCCCTGGTTTCGCCGGTACGCTCAAGCATGCGACCCTGGAGGCAGCAGTACGAATTTTGAGGAAACCCAAGTACCGGAATCTTAAGGGCAGATAAGCCGTTAAAGCGATTGCTTTGATTGGCCCGGTAATCGGCCTGTGTGCAGCGCTTGTACCCTAAAAACATTCAAAAGAGTATACCAGTCGGTCGTACCTCATACTGTATGGCCCAACGGGATCTCTACTTATTTAATCTGACGCACTCATAATGCGAACTGATCTGCAACCTCATTAAGGCTACCACCTGCGGACGATAGGCGAACAATAACACTAGGGGGGTTAGATTCCCCAGGTTATGCATATCACCACTAGTGCGTATTGACAGCTGCCCCAGCCTCTAGAACATTTGGAGTTTTAATTTCAAACACTCGCACTCCATTCTAGGGGGCGCATATCATAGCGTATTTCTTGACGACCCTGATTGGACACGCAGCCCTAGGTAGACCGTGGGTAATTTCCTAACATTAAATTAACATAGTTCCTAATACCGACACAACCTCTGACTGGTCGCATGGACCTAGTGAGACCCCAGCAATGCCGTCGCGGACACACATCAGTTTTATGACAGCGTAAGGGTGGTACTAACCTGTATCGCCCGACAGGCGCATGTGTGATTCTCTCCGTAGAGTATGAGACTCTCGGGTAACATTATTAACGACCTGACCTCTTTCCTGCGCTCACCT
ACAGACGTTCAAAAGGGACCTCGGATAAATCTACATGAGAAGGGTAGCCATAATAACTAGTTGCATTCGCGCATCCATCACTGATGCGCGGCCACAAAGTTAATTGCCGCTAACACCTCGGTGACTCTACTCAATGTAAAGAATACCAGCCTAATCGAGGTTTTGACATGGCGCTCATGACTTAAAGGCCATGAGTCACTCACGCACAATGACTACCTCCCTCGGATGTGCCCTATGGGAGCCAGTATGAGTGTGCTGAGAAGAGCTCTCTCCGTATCGTCTCCGTAGAGGGTGCTTAACCTCCTATACTTTGCTTGGATAGGTGGGTGACGGAAGCTCTTCGCAGTTCGCTAGCAAATCGAATTATACGACTATGGGGGTTTCCCCCCCGGGCAATTCGTTTTATTGGGTGACGGAAGCTCTTCGCAGTTCGCTAGTTTACTTTTTGATTAAGTTCCGGCTACTAAAAAATGGCCCCGTAAAGTAAATTCCTAATACTCGACGGTAAACCCCTGATTAGAGTCCAATCGGCTCTTCACAAATACAGACTGTCCGGTTGGAGTGTTGCGTCATTCATGAGGTGCTAAACGTAACAGAACCACACATTGAACTGTTTGGGAACGGTAGGCTTCCTCACTTCAGAACCTAGCCCAAATCAGGCCTTGATACCTAACGTTCATACCGGGCTGACTACGCAACATAAGAGAATCTCTCCTCCTTAACGAGGTTTTGTGATTGACACCTGACAGGCTGTCAGTTTCCCACACCCTGGCGTATTAAGATATTAGTTTGCTTACGAGCATTGCGGATGTCTCAGAAAACACAGTAGCCAGTAAAATGTGATATTGACCAGAAGATCGCAAACTTCGCCAAGGTAAAGCTAAACTGAAGACCGTCTGCATAGGACCTCAATCGTTTGACTTCCAAGAAAGAAGGGTATACTACGACTTTCAAGTATTGATGCAGCCTTGCCATATCGAGTCGTCAGAGGATAGCATCTCGGGTGACCGTAGCTAAGATGCAGGTACTAGATCTAACAAGTATCCGTCCCCGTCCGATCTAAATCAAGTTAAAGGGTAGACCCAGGTGCTTTGACCAGACTTATTATCCGGTGACGGAAGCTCTTCGCAGGTGACGGAAGCTCTTCGCAGTTCGCTAGTTCGCTAGACGGGTGGAAGGAATTTGTATAGCGACAGGTTGTCATATCGATACAACGTAGACGCCCGCTCAAGAGTACGGTGGGATCAGGCACCCTTACTGCTTTCTACCCTTGCCAGAGGACCTGTAAGAACGGGCGGTTGCAGGAGGATTATAATCCCGAAAGCAGACGTTAAGTCGGACAAGAAGCATATTTTATATATAAGCGAGCTGGGCATAGTAGCCCGTGGGGCAAAGTGTCGCACCGCTATAAGCGAGCCCGCGGAAGTATGCTAACCCTAGTGTAGCCCCATCCAAGTGACTGTAGCAAGGAGCCGTGCTAGTGTGACGGAAGCTCTTCGCAGTTCGCTAGAGCGATCGTCCGTATCCTCGATCAACGAATTTAGAGTACCCGGAAGCCAGGGCCCTATGGCGTAACGACACGAGTTGGTCTACCAGAAAGACCGAGTAATTTTGCGTTCTTTCATTCCGTTTAAGGCGAATGAAACCAGCAACGAATGTCGTGTGGGCATAGGCAAATCCAACTGGAGGATTTACGCGCATGACGCATTCGTTGAACGAGGCCAAGCTAACATCGAAGTACTATTCTCTGACAGACGCTACTAGTTACCGCCTTTCATACCCATTCATGTGGCTCGGTCTGGAAATTCACATCTCGTGAATTTCACGTATCAGGGCAATTCCCCGAGCAGTGCTCGAAGACTACCCCAAACACTTCTAAAAGGTTCTGAGAGCTATCCTCAGGGGCACCATCGTTTGAAATCCACCCAAGACCTGATTTGTATGGCGCCCAGGGGAGGCGGTCCCCCCAACGCTCCGATTTAGCATTTCTGGGTGTTAAC
CCCTTAAACGAGGTCCCAGTGTAATAGAAACAGGTACGTAGCTTCCCGACTGGAGAGTCACCCTTACTCTAATCTGGTGCAGTCGCATGGTCCATACAACAAACAACCTTAAGATTCCTAAGTGGACCCTCGCCTTACATAGTCGGATTCTGACAGGTGTGCAAACCCCTCGCTAGTAGTCGTGACGGAAGCTCTTCGCAGTTCGCTAGGTTGCATGCATGAGACATTGACGGGGACCCGCAACCAGGTAACCAAATAGTTCGACCCCTCCCAGTTAAATATCTGCAAGCCGGCCAAGTCGCGCGGGTCAGGCGACGATTCCTAGGAAGAGGAGCAGTGTTGGAGAGTCTTATGAAGCCTAGGCACAGTCCTGGATTCCGTACCCCCCTTGCGATTACATCCACCGTTAGACTGATGTACGCACCCTGGTTATAGGAAAGGATAAGGGCCAGATGTTCGGTAAGCAAGGTGTTCGAGGCACCCTCCCCTCCTGGAGCTCGGTAGGCAACAGCCTTCTTATCGGCAAAGCCATAAACAAAATAGGAGTGAGTGATTAGTGGGGCAAGCCGTGACGGAAGCTCTTCGCAGTTCGCTAG 2 | -------------------------------------------------------------------------------- /chapter5/C5_58/output.test.srfrp.extra.txt: -------------------------------------------------------------------------------- 1 | AACAGAAAGCGAGCGAACCGTCTCTTGCATTTCGCCCTATGTTACTATGCATTACCATGGCTTAAGTACGAAGAATCTGTGACGGAAGCTCTTCGCAGTTCGCTAGGGCCTGATAGATCAGGATGGTACATCTCCACCAATTATCCGCAGCAATTACACGCCCAACCTATTTCGGGGATCTAGCACGCCAAGGTGTCCACTCGGCGTTCGGGGAGACCACTACGACGATAGCATATCTCACCAGTTCGGCCCTTTCGTCACTGTCACTTCCTACTTCTAGCGGTCTGATGATATTATACCTGACCGACTTGAAACGGTATCTACGGGCCATGGGCACGTAATCATTCCATCTACCGTTACTCCTATCGCCTCATGATGCTGCCTGCAACCTATCGTTGGACACACCCGCATCTGGTTGGCGGTACTGGCAGATTATCGCGCTGGTTGTGTATTCTTAGCGGCAAATCACATCGGTTAATAAGTTTACGAACAGCACCAGCTCAACCGTTCATTGCCCCACCATACGAGACAGCGACAGGCACATCTGATCCGTGTATCGTCTCATGGCTAAATACATTTCACCGTTGCGTGGGGAAGTCCTTGTGCCATGCCTGTACAAAATGGCGGCCGTAAAGTTCCCATGCGAAGGAACATTCCGGCAGTCTGTTTTGCGTCTTAGGCCCTTGGTAATTACTACTTAGGGGAAGGTGGTCCTTACGGTATTCGTATAGATAAACCTCCTCCCGACTCTCGCTGATATAAACTGGGGCCGTGAGAAACTGGGTGTCAAGCATAGTTCTAGCTACATAGTTCCAAGCGTATTTCGGGGCTAGGCCGGGGGTGCGCCCTGGGCAACGCTCAGGGTGTCCCCCGTGACATGACGAGGGAGTAGCCTAGGCACATCGTTACTTATAACCCAAGCAACGCGCTGTTTAGGCGAATGACGTAGTACTCTATGCCGCTGCAGGTCCCTGGGCGTCAGGCCTGTGACGGAAGCTCTTCGCAGTTCGCTAGTAGGTGCGTCATCTCATGACACCCAACCCTTGTCTGTAAATGTGTGACGGAAGCTCTTCGCAGTTCGCTAGATCGAGTCCCCAGGACATCAGTATCTAACCTCCACTTCCTGTTCGCCTCCCCTTTGTTTTCATGCAGTTGTCATCTGCATGTCATGTATACAACCTATGGAGG
AAGTCTCGATGTCGGTTTCGGGGATGTGTGATCCCGTTGCGGTGACAGCGATCTTGTCATCGCTCACATAACTCATTAACTATTGAATTGGCGCCCTGGTTTCGCCGGTACGCTCAAGCATGCGACCCTGGAGGCAGCAGTACGAATTTTGAGGAAACCCAAGTACCGGAATCTTAAGGGCAGATAAGCCGTTAAAGCGATTGCTTTGATTGGCCCGGTAATCGGCCTGTGTGCAGCGCTTGTACCCTAAAAACATTCAAAAGAGTATACCAGTCGGTCGTACCTCATACTGTATGGCCCAACGGGATCTCTACTTATTTAATCTGACGCACTCATAATGCGAACTGATCTGCAACCTCATTAAGGCTACCACCTGCGGACGATAGGCGAACAATAACACTAGGGGGGTTAGATTCCCCAGGTTATGCATATCACCACTAGTGCGTATTGACAGCTGCCCCAGCCTCTAGAACATTTGGAGTTTTAATTTCAAACACTCGCACTCCATTCTAGGGGGCGCATATCATAGCGTATTTCTTGACGACCCTGATTGGACACGCAGCCCTAGGTAGACCGTGGGTAATTTCCTAACATTAAATTAACATAGTTCCTAATACCGACACAACCTCTGACTGGTCGCATGGACCTAGTGAGACCCCAGCAATGCCGTCGCGGACACACATCAGTTTTATGACAGCGTAAGGGTGGTACTAACCTGTATCGCCCGACAGGCGCATGTGTGATTCTCTCCGTAGAGTATGAGACTCTCGGGTAACATTATTAACGACCTGACCTCTTTCCTGCGCTCACCTACAGACGTTCAAAAGGGACCTCGGATAAATCTACATGAGAAGGGTAGCCATAATAACTAGTTGCATTCGCGCATCCATCACTGATGCGCGGCCACAAAGTTAATTGCCGCTAACACCTCGGTGACTCTACTCAATGTAAAGAATACCAGCCTAATCGAGGTTTTGACATGGCGCTCATGACTTAAAGGCCATGAGTCACTCACGCACAATGACTACCTCCCTCGGATGTGCCCTATGGGAGCCAGTATGAGTGTGCTGAGAAGAGCTCTCTCCGTATCGTCTCCGTAGAGGGTGCTTAACCTCCTATACTTTGCTTGGATAGGTGGGTGACGGAAGCTCTTCGCAGTTCGCTAGCAAATCGAATTATACGACTATGGGGGTTTCCCCCCCGGGCAATTCGTTTTATTGGGTGACGGAAGCTCTTCGCAGTTCGCTAGTTTACTTTTTGATTAAGTTCCGGCTACTAAAAAATGGCCCCGTAAAGTAAATTCCTAATACTCGACGGTAAACCCCTGATTAGAGTCCAATCGGCTCTTCACAAATACAGACTGTCCGGTTGGAGTGTTGCGTCATTCATGAGGTGCTAAACGTAACAGAACCACACATTGAACTGTTTGGGAACGGTAGGCTTCCTCACTTCAGAACCTAGCCCAAATCAGGCCTTGATACCTAACGTTCATACCGGGCTGACTACGCAACATAAGAGAATCTCTCCTCCTTAACGAGGTTTTGTGATTGACACCTGACAGGCTGTCAGTTTCCCACACCCTGGCGTATTAAGATATTAGTTTGCTTACGAGCATTGCGGATGTCTCAGAAAACACAGTAGCCAGTAAAATGTGATATTGACCAGAAGATCGCAAACTTCGCCAAGGTAAAGCTAAACTGAAGACCGTCTGCATAGGACCTCAATCGTTTGACTTCCAAGAAAGAAGGGTATACTACGACTTTCAAGTATTGATGCAGCCTTGCCATATCGAGTCGTCAGAGGATAGCATCTCGGGTGACCGTAGCTAAGATGCAGGTACTAGATCTAACAAGTATCCGTCCCCGTCCGATCTAAATCAAGTTAAAGGGTAGACCCAGGTGCTTTGACCAGACTTATTATCCGGTGACGGAAGCTCTTCGCAGGTGACGGAAGCTCTTCGCAGTTCGCTAGTTCGCTAGACGGGTGGAAGGAATTTGTAT
AGCGACAGGTTGTCATATCGATACAACGTAGACGCCCGCTCAAGAGTACGGTGGGATCAGGCACCCTTACTGCTTTCTACCCTTGCCAGAGGACCTGTAAGAACGGGCGGTTGCAGGAGGATTATAATCCCGAAAGCAGACGTTAAGTCGGACAAGAAGCATATTTTATATATAAGCGAGCTGGGCATAGTAGCCCGTGGGGCAAAGTGTCGCACCGCTATAAGCGAGCCCGCGGAAGTATGCTAACCCTAGTGTAGCCCCATCCAAGTGACTGTAGCAAGGAGCCGTGCTAGTGTGACGGAAGCTCTTCGCAGTTCGCTAGAGCGATCGTCCGTATCCTCGATCAACGAATTTAGAGTACCCGGAAGCCAGGGCCCTATGGCGTAACGACACGAGTTGGTCTACCAGAAAGACCGAGTAATTTTGCGTTCTTTCATTCCGTTTAAGGCGAATGAAACCAGCAACGAATGTCGTGTGGGCATAGGCAAATCCAACTGGAGGATTTACGCGCATGACGCATTCGTTGAACGAGGCCAAGCTAACATCGAAGTACTATTCTCTGACAGACGCTACTAGTTACCGCCTTTCATACCCATTCATGTGGCTCGGTCTGGAAATTCACATCTCGTGAATTTCACGTATCAGGGCAATTCCCCGAGCAGTGCTCGAAGACTACCCCAAACACTTCTAAAAGGTTCTGAGAGCTATCCTCAGGGGCACCATCGTTTGAAATCCACCCAAGACCTGATTTGTATGGCGCCCAGGGGAGGCGGTCCCCCCAACGCTCCGATTTAGCATTTCTGGGTGTTAACCCCTTAAACGAGGTCCCAGTGTAATAGAAACAGGTACGTAGCTTCCCGACTGGAGAGTCACCCTTACTCTAATCTGGTGCAGTCGCATGGTCCATACAACAAACAACCTTAAGATTCCTAAGTGGACCCTCGCCTTACATAGTCGGATTCTGACAGGTGTGCAAACCCCTCGCTAGTAGTCGTGACGGAAGCTCTTCGCAGTTCGCTAGGTTGCATGCATGAGACATTGACGGGGACCCGCAACCAGGTAACCAAATAGTTCGACCCCTCCCAGTTAAATATCTGCAAGCCGGCCAAGTCGCGCGGGTCAGGCGACGATTCCTAGGAAGAGGAGCAGTGTTGGAGAGTCTTATGAAGCCTAGGCACAGTCCTGGATTCCGTACCCCCCTTGCGATTACATCCACCGTTAGACTGATGTACGCACCCTGGTTATAGGAAAGGATAAGGGCCAGATGTTCGGTAAGCAAGGTGTTCGAGGCACCCTCCCCTCCTGGAGCTCGGTAGGCAACAGCCTTCTTATCGGCAAAGCCATAAACAAAATAGGAGTGAGTGATTAGTGGGGCAAGCCGTGACGGAAGCTCTTCGCAGTTCGCTAG -------------------------------------------------------------------------------- /chapter5/C5_58/output.test.srfrp.txt: -------------------------------------------------------------------------------- 1 | GTGGTCGTGAGATGTTGA -------------------------------------------------------------------------------- /chapter5/C5_58/srfrp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 
#
# Date created:
#
#     10 Dec 2013
#
# CODE CHALLENGE: Solve the String Reconstruction from Read-Pairs Problem.
#     Input: An integer d followed by a collection of paired k-mers PairedReads.
#     Output: A string Text with (k, d)-mer composition equal to PairedReads.
#
# Sample Input:
#     2
#     GAGA|TTGA
#     TCGT|GATG
#     CGTG|ATGT
#     TGGT|TGAG
#     GTGA|TGTT
#     GTGG|GTGA
#     TGAG|GTTG
#     GGTC|GAGA
#     GTCG|AGAT
#
# Sample Output:
#     GTGGTCGTGAGATGTTGA
#
##############################################################################

import sys
import timeit
import math
import random
from itertools import combinations,product,chain
try:
    from itertools import izip,ifilter
except ImportError:
    # Python 3 dropped izip/ifilter because zip/filter are already lazy there.
    izip,ifilter = zip,filter
from collections import Counter,defaultdict

def seperate_pairwise(pairs):
    '''Turn "AAAA|BBBB" strings into ('AAAA', 'BBBB') tuples.'''
    return [tuple(item.split('|')) for item in pairs]

def read_file(input_file):
    '''
    Read the gap length d (first line) and the paired k-mers (remaining
    lines).  Blank lines are ignored.  Returns (d, list of pair tuples).

    >>> d,kmer_pairs = read_file('test.srfrp.txt')
    >>> d,kmer_pairs = read_file('test.srfrp.extra.txt')
    '''
    with open(input_file) as f:
        data = [line.strip() for line in f if line.strip()]
    return (int(data[0]),seperate_pairwise(data[1:]))

def prefix(kmer_pair):
    '''Return each member of *kmer_pair* without its last character, as a list.'''
    return [item[:-1] for item in kmer_pair]

def suffix(kmer_pair):
    '''Return each member of *kmer_pair* without its first character, as a list.'''
    return [item[1:] for item in kmer_pair]

def path_graph(kmer_pairs):
    '''
    Order the read-pairs into a path in which every pair's suffix equals the
    next pair's prefix.

    Returns (edges, nodes): the ordered read-pairs, and the prefix of each
    ordered pair followed by the suffix of the last one.
    '''
    # Index the pairs by prefix so that successors are found by lookup rather
    # than by comparing every pair with every other pair.  The resulting edge
    # list has the same content and order as the all-pairs comparison.
    by_prefix = defaultdict(list)
    for kmer_pair in kmer_pairs:
        by_prefix[tuple(prefix(kmer_pair))].append(kmer_pair)
    data = [(kmer_pair1,kmer_pair2)
            for kmer_pair1 in kmer_pairs
            for kmer_pair2 in by_prefix.get(tuple(suffix(kmer_pair1)), ())]
    edges = find_path(data)
    nodes = [prefix(item) for item in edges]+[suffix(edges[-1])]
    return (edges,nodes)

def find_ender(edges):
    '''
    Return a read-pair that occurs more often as an edge target than as an
    edge source, i.e. a possible last element of the path.  One is picked at
    random when several qualify; random.choice raises IndexError when none
    does.
    '''
    c0 = Counter(item[0] for item in edges)
    c1 = Counter(item[1] for item in edges)
    result = [item for item in c1 if c0[item] < c1[item]]
    return random.choice(result)

def inverted_dictionary(edges):
    '''
    Map every edge target to its source.  A target with several sources keeps
    only the one listed last.
    '''
    return {item2: item1 for item1,item2 in edges}

def find_path(data):
    '''
    Walk predecessor links backwards from the end read-pair and return the
    visited read-pairs in forward order.
    '''
    ivd = inverted_dictionary(data)
    ender = find_ender(data)
    path = [ender]
    # Popping each link as it is followed guarantees termination on a cycle.
    while ender in ivd:
        ender = ivd.pop(ender)
        path.append(ender)
    return path[::-1]

def form_string(List):
    '''
    Spell the string for a list of overlapping k-mers: the first one in full,
    then the last character of every following one.
    '''
    string = List[0] + ''.join([item[-1] for item in List[1:]])
    return string

def result(filename):
    '''
    Read d and the read-pairs from *filename* and return the reconstructed
    text: the first k+d characters spelled by the first reads followed by
    everything spelled by the second reads.
    '''
    d,kmer_pairs = read_file(filename)
    edges,nodes = path_graph(kmer_pairs)
    k = len(kmer_pairs[0][0])
    former = [item[0] for item in edges]
    latter = [item[1] for item in edges]
    results = form_string(former)[:k+d]+form_string(latter)
    return results

if __name__ == "__main__":

    start = timeit.default_timer()
    results = result(sys.argv[-1])
    # The output name is the input name with its 4-character extension replaced.
    with open('output.'+sys.argv[-1][:-4]+'.txt','w') as fw:
        fw.write(results)
    stop = timeit.default_timer()
    print(stop - start)

# -------------------------------------------------------------------------------- /chapter5/C5_58/test.srfrp.txt: -------------------------------------------------------------------------------- 1 | 2 2 | GAGA|TTGA 3 | TCGT|GATG 4 | CGTG|ATGT 5 | TGGT|TGAG 6 | GTGA|TGTT 7 | GTGG|GTGA 8 | TGAG|GTTG 9 | GGTC|GAGA 10 | GTCG|AGAT 11 | -------------------------------------------------------------------------------- /chapter5/C5_59/.contig_generation.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minw2828/Coursera---Bioinformatics-Algorithms/9a51f7ca1fa9ab5fd246dc971648ebe0acf9b308/chapter5/C5_59/.contig_generation.py.swp -------------------------------------------------------------------------------- /chapter5/C5_59/contig_generation.py: -------------------------------------------------------------------------------- 1 |
#!/usr/bin/python

###############################################################################
#
# Author:
#
#     Sanyk28 (san-heng-yi-shu@163.com)
#
# Date created:
#
#     10 Dec 2013
#
# Contig Generation Problem:
#     Generate the contigs from a collection of reads (with imperfect coverage).
#     Input: A collection of k-mers Patterns.
#     Output: All contigs in DeBruijn(Patterns).
#
# CODE CHALLENGE: Solve the Contig Generation Problem.
#
# Sample Input:
#     ATG
#     ATG
#     TGT
#     TGG
#     CAT
#     GGA
#     GAT
#     AGA
#
# Sample Output:
#     AGA ATG ATG CAT GAT TGGA TGT
#
##############################################################################

import sys
import timeit
import math
import re
import heapq
import random
import numpy as np
from itertools import combinations,product,chain
try:
    from itertools import izip,ifilter
except ImportError:
    # Python 3 dropped izip/ifilter because zip/filter are already lazy there.
    izip,ifilter = zip,filter
from collections import Counter,defaultdict
try:
    import debruijn_graph_from_kmers
except ImportError:
    # Only build_de_bruijn_graph_from_kmers() needs this sibling module, so a
    # failed import must not stop contig generation itself from working.
    debruijn_graph_from_kmers = None

def read_file(input_file):
    '''
    Read one k-mer per line and return them as a list, skipping blank lines.

    >>> pattern = read_file('test.contig_generation.txt')
    >>> pattern
    ['ATG', 'ATG', 'TGT', 'TGG', 'CAT', 'GGA', 'GAT', 'AGA']
    >>> pattern = read_file('test.contig_generation.extra.txt')
    '''
    with open(input_file) as f:
        return [line.strip() for line in f if line.strip()]

def build_de_bruijn_graph_from_kmers(pattern):
    '''
    Return the (node, successors) items of the de Bruijn graph that the
    sibling module debruijn_graph_from_kmers builds for *pattern*.
    Raises ImportError when that module could not be imported.

    >>> build_de_bruijn_graph_from_kmers(pattern)
    [('AG', ('GA',)), ('CA', ('AT',)), ('GG', ('GA',)), ('AT', ('TG',)), ('GA', ('AT',)), ('TG', ('GG', 'GT'))]
    '''
    if debruijn_graph_from_kmers is None:
        raise ImportError('debruijn_graph_from_kmers could not be imported')
    return list(debruijn_graph_from_kmers.de_bruijn_graph(pattern).items())

def inout_degree(pattern):
    '''
    Return {node: (indegree, outdegree)} for the de Bruijn graph of *pattern*.

    Every k-mer is one edge from its (k-1)-prefix to its (k-1)-suffix, so
    repeated k-mers count once per occurrence.  The earlier version counted
    dictionary keys and comma-joined successor strings, which produced
    neither degree.
    '''
    indegree = Counter(suffix(pattern))
    outdegree = Counter(prefix(pattern))
    d = {}
    for key in set(indegree) | set(outdegree):
        d[key] = (indegree[key],outdegree[key])
    return d

def prefix(kmer_pair):
    '''Return each string in *kmer_pair* without its last character, as a list.'''
    return [item[:-1] for item in kmer_pair]

def suffix(kmer_pair):
    '''Return each string in *kmer_pair* without its first character, as a list.'''
    return [item[1:] for item in kmer_pair]

def select(pattern):
    '''
    Return the nodes whose indegree is not 1 or whose outdegree exceeds 1.
    The order of the returned list is unspecified.
    '''
    d = inout_degree(pattern)
    r = []
    for key,value in d.items():
        value1,value2 = value
        if value1 != 1 or value2 > 1:
            r.append(key)
    return r

def path_graph(kmer_pairs):
    '''
    Order paired k-mers into a path in which every pair's suffix equals the
    next pair's prefix.  Returns (edges, nodes): the ordered pairs, and the
    prefix of each ordered pair followed by the suffix of the last one.
    '''
    # Prefix index: same edges, same order as comparing all pairs pairwise.
    by_prefix = defaultdict(list)
    for kmer_pair in kmer_pairs:
        by_prefix[tuple(prefix(kmer_pair))].append(kmer_pair)
    data = [(kmer_pair1,kmer_pair2)
            for kmer_pair1 in kmer_pairs
            for kmer_pair2 in by_prefix.get(tuple(suffix(kmer_pair1)), ())]
    edges = find_path(data)
    nodes = [prefix(item) for item in edges]+[suffix(edges[-1])]
    return (edges,nodes)

def find_ender(edges):
    '''
    Return an item that occurs more often as an edge target than as an edge
    source, chosen at random when several qualify; random.choice raises
    IndexError when none does.
    '''
    c0 = Counter(item[0] for item in edges)
    c1 = Counter(item[1] for item in edges)
    result = [item for item in c1 if c0[item] < c1[item]]
    return random.choice(result)

def inverted_dictionary(edges):
    '''Map every edge target to its source (last source wins on duplicates).'''
    return {item2: item1 for item1,item2 in edges}

def find_path(data):
    '''
    Walk predecessor links backwards from the end item and return the visited
    items in forward order.
    '''
    ivd = inverted_dictionary(data)
    ender = find_ender(data)
    path = [ender]
    # Popping each link as it is followed guarantees termination on a cycle.
    while ender in ivd:
        ender = ivd.pop(ender)
        path.append(ender)
    return path[::-1]

def list_duplicates(data_list):
    '''
    Yield (item, positions) for every item that occurs more than once in
    *data_list*; items are converted to tuples before comparison.
    '''
    data_list = [tuple(item) for item in data_list]
    tally = defaultdict(list)
    for i,item in enumerate(data_list):
        tally[item].append(i)
    return ((key,locs) for key,locs in tally.items() if len(locs) > 1)

def form_string(List):
    '''
    Spell the string for a list of overlapping k-mers: the first one in full,
    then the last character of every following one.
    '''
    string = List[0] + ''.join([item[-1] for item in List[1:]])
    return string

def _contigs(pattern):
    '''
    Return the contigs of DeBruijn(pattern): the string spelled by every
    maximal non-branching path, plus one string per isolated cycle.
    '''
    simple = set(node for node,degree in inout_degree(pattern).items() if degree == (1,1))
    successors = defaultdict(list)
    for kmer in pattern:
        successors[kmer[:-1]].append(kmer[1:])
    contigs = []
    visited = set()
    # Every edge leaving a branching node starts a contig, which is extended
    # for as long as it runs through 1-in-1-out nodes.
    for start in list(successors):
        if start in simple:
            continue
        for node in successors[start]:
            contig = [start]
            while node in simple:
                visited.add(node)
                contig.append(node[-1])
                node = successors[node][0]
            contig.append(node[-1])
            contigs.append(''.join(contig))
    # A 1-in-1-out node that no contig passed through lies on an isolated cycle.
    for start in list(successors):
        if start not in simple or start in visited:
            continue
        visited.add(start)
        contig = [start]
        node = successors[start][0]
        while node != start:
            visited.add(node)
            contig.append(node[-1])
            node = successors[node][0]
        contig.append(start[-1])
        contigs.append(''.join(contig))
    return contigs

def result(filename):
    '''
    Read the k-mers in *filename* and return all contigs of their de Bruijn
    graph as one string, sorted and separated by single spaces.
    '''
    pattern = read_file(filename)
    return ' '.join(sorted(_contigs(pattern)))

if __name__ == "__main__":

    start = timeit.default_timer()
    results = result(sys.argv[-1])
    # The output name is the input name with its 4-character extension replaced.
    with open('output.'+sys.argv[-1][:-4]+'.txt','w') as fw:
        fw.write(results)
    stop = timeit.default_timer()
    print(stop - start)

# -------------------------------------------------------------------------------- /chapter5/C5_59/contig_generation.py~: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ############################################################################### 4 | # 5 | # Author: 6 | # 7 | # Sanyk28 (san-heng-yi-shu@163.com) 8 | # 9 | # Date created: 10 | # 11 | # 10 Dec 2013 12 | # 13 | # Contig Generation Problem: 14 | # Generate the contigs from a collection of reads (with imperfect coverage). 15 | # Input: A collection of k-mers Patterns. 16 | # Output: All contigs in DeBruijn(Patterns). 17 | # 18 | # CODE CHALLENGE: Solve the Contig Generation Problem.
#
# Sample Input:
#     ATG
#     ATG
#     TGT
#     TGG
#     CAT
#     GGA
#     GAT
#     AGA
#
# Sample Output:
#     AGA ATG ATG CAT GAT TGGA TGT
#
##############################################################################

import sys
import timeit
import math
import re
import heapq
import random
import numpy as np
from itertools import combinations,product,chain
try:
    from itertools import izip,ifilter
except ImportError:
    # Python 3 dropped izip/ifilter because zip/filter are already lazy there.
    izip,ifilter = zip,filter
from collections import Counter,defaultdict
try:
    import debruijn_graph_from_kmers
except ImportError:
    # Only build_de_bruijn_graph_from_kmers() needs this sibling module, so a
    # failed import must not stop contig generation itself from working.
    debruijn_graph_from_kmers = None

def read_file(input_file):
    '''
    Read one k-mer per line and return them as a list, skipping blank lines.

    >>> pattern = read_file('test.contig_generation.txt')
    >>> pattern
    ['ATG', 'ATG', 'TGT', 'TGG', 'CAT', 'GGA', 'GAT', 'AGA']
    >>> pattern = read_file('test.contig_generation.extra.txt')
    '''
    with open(input_file) as f:
        return [line.strip() for line in f if line.strip()]

def build_de_bruijn_graph_from_kmers(pattern):
    '''
    Return the (node, successors) items of the de Bruijn graph that the
    sibling module debruijn_graph_from_kmers builds for *pattern*.
    Raises ImportError when that module could not be imported.

    >>> build_de_bruijn_graph_from_kmers(pattern)
    [('AG', ('GA',)), ('CA', ('AT',)), ('GG', ('GA',)), ('AT', ('TG',)), ('GA', ('AT',)), ('TG', ('GG', 'GT'))]
    '''
    if debruijn_graph_from_kmers is None:
        raise ImportError('debruijn_graph_from_kmers could not be imported')
    return list(debruijn_graph_from_kmers.de_bruijn_graph(pattern).items())

def prefix(kmer_pair):
    '''Return each string in *kmer_pair* without its last character, as a list.'''
    return [item[:-1] for item in kmer_pair]

def suffix(kmer_pair):
    '''Return each string in *kmer_pair* without its first character, as a list.'''
    return [item[1:] for item in kmer_pair]

def inout_degree(pattern):
    '''
    Return {node: (indegree, outdegree)} for the de Bruijn graph of *pattern*.
    Every k-mer is one edge from its (k-1)-prefix to its (k-1)-suffix, so
    repeated k-mers count once per occurrence.
    '''
    indegree = Counter(suffix(pattern))
    outdegree = Counter(prefix(pattern))
    d = {}
    # Set union instead of keys()+keys(), which only works on Python 2 lists.
    for key in set(indegree) | set(outdegree):
        d[key] = (indegree[key],outdegree[key])
    return d

def select(pattern):
    '''
    Return the nodes whose indegree is not 1 or whose outdegree exceeds 1.
    The order of the returned list is unspecified.
    '''
    d = inout_degree(pattern)
    r = []
    for key,value in d.items():
        value1,value2 = value
        if value1 != 1 or value2 > 1:
            r.append(key)
    return r

def path_graph(kmer_pairs):
    '''
    Order paired k-mers into a path in which every pair's suffix equals the
    next pair's prefix.  Returns (edges, nodes): the ordered pairs, and the
    prefix of each ordered pair followed by the suffix of the last one.
    '''
    # Prefix index: same edges, same order as comparing all pairs pairwise.
    by_prefix = defaultdict(list)
    for kmer_pair in kmer_pairs:
        by_prefix[tuple(prefix(kmer_pair))].append(kmer_pair)
    data = [(kmer_pair1,kmer_pair2)
            for kmer_pair1 in kmer_pairs
            for kmer_pair2 in by_prefix.get(tuple(suffix(kmer_pair1)), ())]
    edges = find_path(data)
    nodes = [prefix(item) for item in edges]+[suffix(edges[-1])]
    return (edges,nodes)

def find_ender(edges):
    '''
    Return an item that occurs more often as an edge target than as an edge
    source, chosen at random when several qualify; random.choice raises
    IndexError when none does.
    '''
    c0 = Counter(item[0] for item in edges)
    c1 = Counter(item[1] for item in edges)
    result = [item for item in c1 if c0[item] < c1[item]]
    return random.choice(result)

def inverted_dictionary(edges):
    '''Map every edge target to its source (last source wins on duplicates).'''
    return {item2: item1 for item1,item2 in edges}

def find_path(data):
    '''
    Walk predecessor links backwards from the end item and return the visited
    items in forward order.
    '''
    ivd = inverted_dictionary(data)
    ender = find_ender(data)
    path = [ender]
    # Popping each link as it is followed guarantees termination on a cycle.
    while ender in ivd:
        ender = ivd.pop(ender)
        path.append(ender)
    return path[::-1]

def list_duplicates(data_list):
    '''
    Yield (item, positions) for every item that occurs more than once in
    *data_list*; items are converted to tuples before comparison.
    '''
    data_list = [tuple(item) for item in data_list]
    tally = defaultdict(list)
    for i,item in enumerate(data_list):
        tally[item].append(i)
    return ((key,locs) for key,locs in tally.items() if len(locs) > 1)

def form_string(List):
    '''
    Spell the string for a list of overlapping k-mers: the first one in full,
    then the last character of every following one.
    '''
    string = List[0] + ''.join([item[-1] for item in List[1:]])
    return string

def _contigs(pattern):
    '''
    Return the contigs of DeBruijn(pattern): the string spelled by every
    maximal non-branching path, plus one string per isolated cycle.
    '''
    simple = set(node for node,degree in inout_degree(pattern).items() if degree == (1,1))
    successors = defaultdict(list)
    for kmer in pattern:
        successors[kmer[:-1]].append(kmer[1:])
    contigs = []
    visited = set()
    # Every edge leaving a branching node starts a contig, which is extended
    # for as long as it runs through 1-in-1-out nodes.
    for start in list(successors):
        if start in simple:
            continue
        for node in successors[start]:
            contig = [start]
            while node in simple:
                visited.add(node)
                contig.append(node[-1])
                node = successors[node][0]
            contig.append(node[-1])
            contigs.append(''.join(contig))
    # A 1-in-1-out node that no contig passed through lies on an isolated cycle.
    for start in list(successors):
        if start not in simple or start in visited:
            continue
        visited.add(start)
        contig = [start]
        node = successors[start][0]
        while node != start:
            visited.add(node)
            contig.append(node[-1])
            node = successors[node][0]
        contig.append(start[-1])
        contigs.append(''.join(contig))
    return contigs

def result(filename):
    '''
    Read the k-mers in *filename* and return all contigs of their de Bruijn
    graph as one string, sorted and separated by single spaces.  The earlier
    body referred to the undefined names kmer_pairs and d and could only
    raise NameError.
    '''
    pattern = read_file(filename)
    return ' '.join(sorted(_contigs(pattern)))

if __name__ == "__main__":

    start = timeit.default_timer()
    results = result(sys.argv[-1])
    # The output name is the input name with its 4-character extension replaced.
    with open('output.'+sys.argv[-1][:-4]+'.txt','w') as fw:
        fw.write(results)
    stop = timeit.default_timer()
    print(stop - start)

# -------------------------------------------------------------------------------- /chapter5/C5_59/debruijn_graph_from_kmers.py:
# --------------------------------------------------------------------------------
#!/usr/bin/python

###############################################################################
#
# Author:
#
# Sanyk28 (san-heng-yi-shu@163.com)
#
# Date created:
#
# 4 Dec 2013
#
# DeBruijn Graph from k-mers Problem: Construct the de Bruijn graph from a set of k-mers.
#     Input: A collection of k-mers Patterns.
#     Output: The adjacency list of the de Bruijn graph DeBruijn(Patterns).
#
# CODE CHALLENGE: Solve the de Bruijn Graph from k-mers Problem.
#
# Sample Input:
#     GAGG
#     GGGG
#     GGGA
#     CAGG
#     AGGG
#     GGAG
#
# Sample Output:
#     AGG -> GGG
#     CAG -> AGG
#     GAG -> AGG
#     GGA -> GAG
#     GGG -> GGA,GGG
#
###############################################################################

import sys
import timeit
import heapq
import operator
import random
from itertools import combinations, product, chain
from collections import Counter, defaultdict

try:
    # These two names only exist on Python 2.
    from itertools import izip, ifilter
except ImportError:
    # On Python 3 the built-ins are already lazy, so they are drop-in stand-ins.
    izip, ifilter = zip, filter

try:
    # Neither package is used by anything in this module; do not require them.
    import numpy as np
    from scipy import stats
except ImportError:
    np = None
    stats = None


def read_file(input_file):
    '''
    Read one k-mer per line from input_file.

    Surrounding whitespace is stripped and blank lines are skipped, so a
    trailing newline at the end of the file does not produce an empty k-mer.

    >>> kmers = read_file('test.debruijn_graph_from_kmers.txt')
    >>> kmers = read_file('test.debruijn_graph_from_kmers.extra.txt')
    '''
    with open(input_file) as f:
        stripped = [line.strip() for line in f]
    return [item for item in stripped if item]


def overlap_graph(kmers):
    '''
    Return every ordered pair (k1, k2) of different k-mers in which k1 ends
    with k2 minus its last character.

    Pairs are listed in the order of itertools.product(kmers, kmers), and a
    k-mer that occurs several times in kmers contributes several pairs.
    '''
    return [(k1, k2) for k1, k2 in product(kmers, kmers)
            if k1 != k2 and k1.endswith(k2[:-1])]


def de_bruijn_graph(kmers):
    '''
    Return the de Bruijn graph of kmers as a mapping from every node that
    has an outgoing edge to the sorted tuple of its distinct successors.

    Each k-mer contributes the edge prefix -> suffix directly, so k-mers
    that overlap with no other k-mer are kept and the work is linear in the
    number of k-mers. Repeated edges are stored once.

    >>> dict(de_bruijn_graph(['GGGG', 'GGGA']))
    {'GGG': ('GGA', 'GGG')}
    '''
    successors = defaultdict(set)
    for kmer in kmers:
        if not kmer:
            # An empty string is not a k-mer and has no prefix or suffix.
            continue
        successors[kmer[:-1]].add(kmer[1:])
    # Still a defaultdict(tuple): looking up a node with no outgoing edge
    # yields an empty tuple instead of raising KeyError.
    d = defaultdict(tuple)
    for node in sorted(successors):
        d[node] = tuple(sorted(successors[node]))
    return d


def result(filename):
    '''Read the k-mers in filename and return their de Bruijn graph.'''
    kmers = read_file(filename)
    results = de_bruijn_graph(kmers)
    return results


if __name__ == "__main__":

    start = timeit.default_timer()
    results = result(sys.argv[-1])
    # The output name drops the last four characters (".txt") of the input name.
    with open('output.'+sys.argv[-1][:-4]+'.txt', 'w') as fw:
        for k, v in results.items():
            fw.write('{0} -> {1}'.format(k, ','.join(v))+'\n')
    stop = timeit.default_timer()
    print(stop - start)

# Remainder of the repository dump that shared these source lines; it is not
# part of this module and is kept here verbatim, one dump line per comment.
# --------------------------------------------------------------------------------
# /chapter5/C5_59/debruijn_graph_from_kmers.pyc:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/minw2828/Coursera---Bioinformatics-Algorithms/9a51f7ca1fa9ab5fd246dc971648ebe0acf9b308/chapter5/C5_59/debruijn_graph_from_kmers.pyc
# --------------------------------------------------------------------------------
# /chapter5/C5_59/output.test.contig_generation.txt:
# --------------------------------------------------------------------------------
# 1 | AG -> GA
# 2 | CA -> AT
# 3 | GG -> GA
# 4 | AT -> TG
# 5 | GA -> AT
# 6 | TG -> GG,GT
# 7 |
# --------------------------------------------------------------------------------
# /chapter5/C5_59/test.contig_generation.txt:
# --------------------------------------------------------------------------------
# 1 | ATG
# 2 | ATG
# 3 | TGT
# 4 | TGG
# 5 | CAT
# 6 | GGA
# 7 | GAT
# 8 | AGA
# 9 |
# --------------------------------------------------------------------------------
# /chapter5/C5_59/well.txt:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/minw2828/Coursera---Bioinformatics-Algorithms/9a51f7ca1fa9ab5fd246dc971648ebe0acf9b308/chapter5/C5_59/well.txt
# --------------------------------------------------------------------------------