├── .gitattributes ├── .gitignore ├── Assignment_01A.py ├── Assignment_01B.py ├── Assignment_01C.py ├── Assignment_01D.py ├── Assignment_01E.py ├── Assignment_01F.py ├── Assignment_01G.py ├── Assignment_01H.py ├── Assignment_02A.py ├── Assignment_02B.py ├── Assignment_02C.py ├── Assignment_02D.py ├── Assignment_02E.py ├── Assignment_02F.py ├── Assignment_02G.py ├── Assignment_03A.py ├── Assignment_03B.py ├── Assignment_03C.py ├── Assignment_03D.py ├── Assignment_03E.py ├── Assignment_03F.py ├── Assignment_03G.py ├── Assignment_04A.py ├── Assignment_04B.py ├── Assignment_04C.py ├── Assignment_04D.py ├── Assignment_04E.py ├── Assignment_04E_with_NetworkX.py ├── Assignment_05A.py ├── Assignment_05B.py ├── Assignment_05C.py ├── Assignment_05D.py ├── Assignment_05E.py ├── Assignment_06A.py ├── Assignment_06B.py ├── Assignment_06C.py ├── Assignment_06D.py ├── Assignment_06E.py ├── Assignment_06F.py ├── Assignment_07A.py ├── Assignment_07B.py ├── Assignment_07C.py ├── Assignment_07D.py ├── Assignment_07E.py ├── Assignment_07F.py ├── Assignment_07G.py ├── Assignment_08A.py ├── Assignment_08B.py ├── Assignment_08C.py ├── Assignment_08D.py ├── Assignment_09A.py ├── Assignment_09B.py ├── Assignment_09D.py ├── Assignment_09E.py ├── Assignment_09F.py ├── README.md ├── data ├── stepic_1a.txt ├── stepic_1b.txt ├── stepic_1c.txt ├── stepic_1d.txt ├── stepic_1e.txt ├── stepic_1f.txt ├── stepic_1g.txt ├── stepic_1h.txt ├── stepic_2a.txt ├── stepic_2b.txt ├── stepic_2c.txt ├── stepic_2d.txt ├── stepic_2e.txt ├── stepic_2f.txt ├── stepic_2g.txt ├── stepic_3a.txt ├── stepic_3b.txt ├── stepic_3c.txt ├── stepic_3d.txt ├── stepic_3e.txt ├── stepic_3f.txt ├── stepic_3g.txt ├── stepic_4a.txt ├── stepic_4b.txt ├── stepic_4c.txt ├── stepic_4d.txt ├── stepic_4e.txt ├── stepic_5a.txt ├── stepic_5b.txt ├── stepic_5c.txt ├── stepic_5d.txt ├── stepic_5e.txt ├── stepic_6a.txt ├── stepic_6b.txt ├── stepic_6c.txt ├── stepic_6d.txt ├── stepic_6e.txt ├── stepic_6f.txt ├── stepic_7a.txt ├── 
stepic_7b.txt ├── stepic_7c.txt ├── stepic_7d.txt ├── stepic_7e.txt ├── stepic_7f.txt ├── stepic_7g.txt ├── stepic_8a.txt ├── stepic_8b.txt ├── stepic_8c.txt ├── stepic_8d.txt ├── stepic_9a.txt ├── stepic_9b.txt ├── stepic_9d.txt ├── stepic_9e.txt └── stepic_9f.txt ├── output ├── Assignment_01A.txt ├── Assignment_01B.txt ├── Assignment_01C.txt ├── Assignment_01D.txt ├── Assignment_01E.txt ├── Assignment_01F.txt ├── Assignment_01G.txt ├── Assignment_01H.txt ├── Assignment_02A.txt ├── Assignment_02B.txt ├── Assignment_02C.txt ├── Assignment_02D.txt ├── Assignment_02E.txt ├── Assignment_02F.txt ├── Assignment_02G.txt ├── Assignment_03A.txt ├── Assignment_03B.txt ├── Assignment_03C.txt ├── Assignment_03D.txt ├── Assignment_03E.txt ├── Assignment_03F.txt ├── Assignment_03G.txt ├── Assignment_04A.txt ├── Assignment_04B.txt ├── Assignment_04C.txt ├── Assignment_04D.txt ├── Assignment_04E.txt ├── Assignment_05A.txt ├── Assignment_05B.txt ├── Assignment_05C.txt ├── Assignment_05D.txt ├── Assignment_05E.txt ├── Assignment_06A.txt ├── Assignment_06B.txt ├── Assignment_06C.txt ├── Assignment_06D.txt ├── Assignment_06E.txt ├── Assignment_06F.txt ├── Assignment_07A.txt ├── Assignment_07B.txt ├── Assignment_07C.txt ├── Assignment_07D.txt ├── Assignment_07E.txt ├── Assignment_07F.txt ├── Assignment_07G.txt ├── Assignment_08A.txt ├── Assignment_08B.txt ├── Assignment_08C.txt ├── Assignment_08D.txt ├── Assignment_09A.txt ├── Assignment_09B.txt ├── Assignment_09D.txt ├── Assignment_09E.txt └── Assignment_09F.txt └── scripts ├── DNA_RNA_Operations.py ├── Protein_Dictionaries.py ├── __init__.py ├── data ├── BLOSUM62.txt └── PAM250.txt ├── generalized_suffix_tree.py ├── scoring_matrices.py └── trie.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | *.sln merge=union 7 | *.csproj merge=union 8 | *.vbproj 
merge=union 9 | *.fsproj merge=union 10 | *.dbproj merge=union 11 | 12 | # Standard to msysgit 13 | *.doc diff=astextplain 14 | *.DOC diff=astextplain 15 | *.docx diff=astextplain 16 | *.DOCX diff=astextplain 17 | *.dot diff=astextplain 18 | *.DOT diff=astextplain 19 | *.pdf diff=astextplain 20 | *.PDF diff=astextplain 21 | *.rtf diff=astextplain 22 | *.RTF diff=astextplain 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## Eclipse 3 | ################# 4 | 5 | *.pydevproject 6 | .project 7 | .metadata 8 | bin/ 9 | tmp/ 10 | *.tmp 11 | *.bak 12 | *.swp 13 | *~.nib 14 | local.properties 15 | .classpath 16 | .settings/ 17 | .loadpath 18 | 19 | # External tool builders 20 | .externalToolBuilders/ 21 | 22 | # Locally stored "Eclipse launch configurations" 23 | *.launch 24 | 25 | # CDT-specific 26 | .cproject 27 | 28 | # PDT-specific 29 | .buildpath 30 | 31 | 32 | ################# 33 | ## Visual Studio 34 | ################# 35 | 36 | ## Ignore Visual Studio temporary files, build results, and 37 | ## files generated by popular Visual Studio add-ons. 
38 | 39 | # User-specific files 40 | *.suo 41 | *.user 42 | *.sln.docstates 43 | 44 | # Build results 45 | 46 | [Dd]ebug/ 47 | [Rr]elease/ 48 | x64/ 49 | build/ 50 | [Bb]in/ 51 | [Oo]bj/ 52 | 53 | # MSTest test Results 54 | [Tt]est[Rr]esult*/ 55 | [Bb]uild[Ll]og.* 56 | 57 | *_i.c 58 | *_p.c 59 | *.ilk 60 | *.meta 61 | *.obj 62 | *.pch 63 | *.pdb 64 | *.pgc 65 | *.pgd 66 | *.rsp 67 | *.sbr 68 | *.tlb 69 | *.tli 70 | *.tlh 71 | *.tmp 72 | *.tmp_proj 73 | *.log 74 | *.vspscc 75 | *.vssscc 76 | .builds 77 | *.pidb 78 | *.log 79 | *.scc 80 | 81 | # Visual C++ cache files 82 | ipch/ 83 | *.aps 84 | *.ncb 85 | *.opensdf 86 | *.sdf 87 | *.cachefile 88 | 89 | # Visual Studio profiler 90 | *.psess 91 | *.vsp 92 | *.vspx 93 | 94 | # Guidance Automation Toolkit 95 | *.gpState 96 | 97 | # ReSharper is a .NET coding add-in 98 | _ReSharper*/ 99 | *.[Rr]e[Ss]harper 100 | 101 | # TeamCity is a build add-in 102 | _TeamCity* 103 | 104 | # DotCover is a Code Coverage Tool 105 | *.dotCover 106 | 107 | # NCrunch 108 | *.ncrunch* 109 | .*crunch*.local.xml 110 | 111 | # Installshield output folder 112 | [Ee]xpress/ 113 | 114 | # DocProject is a documentation generator add-in 115 | DocProject/buildhelp/ 116 | DocProject/Help/*.HxT 117 | DocProject/Help/*.HxC 118 | DocProject/Help/*.hhc 119 | DocProject/Help/*.hhk 120 | DocProject/Help/*.hhp 121 | DocProject/Help/Html2 122 | DocProject/Help/html 123 | 124 | # Click-Once directory 125 | publish/ 126 | 127 | # Publish Web Output 128 | *.Publish.xml 129 | *.pubxml 130 | 131 | # NuGet Packages Directory 132 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line 133 | #packages/ 134 | 135 | # Windows Azure Build Output 136 | csx 137 | *.build.csdef 138 | 139 | # Windows Store app package directory 140 | AppPackages/ 141 | 142 | # Others 143 | sql/ 144 | *.Cache 145 | ClientBin/ 146 | [Ss]tyle[Cc]op.* 147 | ~$* 148 | *~ 149 | *.dbmdl 150 | *.[Pp]ublish.xml 151 | *.pfx 152 | *.publishsettings 153 | 154 | # RIA/Silverlight 
projects 155 | Generated_Code/ 156 | 157 | # Backup & report files from converting an old project file to a newer 158 | # Visual Studio version. Backup files are not needed, because we have git ;-) 159 | _UpgradeReport_Files/ 160 | Backup*/ 161 | UpgradeLog*.XML 162 | UpgradeLog*.htm 163 | 164 | # SQL Server files 165 | App_Data/*.mdf 166 | App_Data/*.ldf 167 | 168 | ############# 169 | ## Windows detritus 170 | ############# 171 | 172 | # Windows image file caches 173 | Thumbs.db 174 | ehthumbs.db 175 | 176 | # Folder config file 177 | Desktop.ini 178 | 179 | # Recycle Bin used on file shares 180 | $RECYCLE.BIN/ 181 | 182 | # Mac crap 183 | .DS_Store 184 | 185 | 186 | ############# 187 | ## Python 188 | ############# 189 | 190 | *.py[co] 191 | 192 | # Packages 193 | *.egg 194 | *.egg-info 195 | dist/ 196 | build/ 197 | eggs/ 198 | parts/ 199 | var/ 200 | sdist/ 201 | develop-eggs/ 202 | .installed.cfg 203 | 204 | # Installer logs 205 | pip-log.txt 206 | 207 | # Unit test / coverage reports 208 | .coverage 209 | .tox 210 | 211 | #Translations 212 | *.mo 213 | 214 | #Mr Developer 215 | .mr.developer.cfg 216 | -------------------------------------------------------------------------------- /Assignment_01A.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera. 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner. 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic. 
6 | 7 | Problem Title: Creating a Distance Matrix 8 | Assignment #: 01 9 | Problem ID: A 10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Hidden-Messages-in-the-Replication-Origin-2/#step-4 11 | ''' 12 | 13 | with open('data/stepic_1a.txt') as input_data: 14 | dna, k = [line.strip() for line in input_data.readlines()] 15 | k = int(k) 16 | 17 | kmer_dict = dict() 18 | 19 | for i in xrange(len(dna)-k+1): 20 | if dna[i:i+k] in kmer_dict: 21 | kmer_dict[dna[i:i+k]] += 1 22 | else: 23 | kmer_dict[dna[i:i+k]] = 1 24 | 25 | kmers = [item[0] for item in kmer_dict.items() if item[1] == max(kmer_dict.values())] 26 | 27 | print ' '.join(kmers) 28 | with open('output/Assignment_01A.txt', 'w') as output_data: 29 | output_data.write(' '.join(kmers)) 30 | -------------------------------------------------------------------------------- /Assignment_01B.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera. 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner. 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic 6 | 7 | Problem Title: Reverse Complement Problem 8 | Assignment #: 01 9 | Problem ID: B 10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Some-Hidden-Messages-are-More-Surprising-than-Others-3/#step-2 11 | ''' 12 | 13 | from scripts import ReverseComplementDNA as RevComp 14 | 15 | with open('data/stepic_1b.txt') as input_data: 16 | dna = input_data.read().strip() 17 | 18 | # The script I previously wrote solves the problem... 
19 | with open('output/Assignment_01B.txt', 'w') as output_data: 20 | output_data.write(RevComp(dna)) 21 | -------------------------------------------------------------------------------- /Assignment_01C.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera. 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner. 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic 6 | 7 | Problem Title: Pattern Matching Problem 8 | Assignment #: 01 9 | Problem ID: C 10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Some-Hidden-Messages-are-More-Surprising-than-Others-3/#step-5 11 | ''' 12 | 13 | with open('data/stepic_1c.txt') as input_data: 14 | pattern, text = [line.strip() for line in input_data.readlines()] 15 | 16 | pattern_loc = [] 17 | for i in xrange(len(text)-len(pattern)+1): 18 | if text[i:i+len(pattern)] == pattern: 19 | pattern_loc.append(str(i)) 20 | 21 | print ' '.join(pattern_loc) 22 | with open('output/Assignment_01C.txt', 'w') as output_data: 23 | output_data.write(' '.join(pattern_loc)) 24 | -------------------------------------------------------------------------------- /Assignment_01D.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera. 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner. 
def CheckClumpLength(indicies, t, L, k=0):
    '''Checks whether some t occurrences of a k-mer fit inside a window of length L.

    indicies -- start positions of the k-mer occurrences, in ascending order
                (the parameter spelling is kept so existing callers keep working).
    t        -- number of occurrences required inside a single window.
    L        -- length of the window (clump).
    k        -- length of the k-mer.  An occurrence starting at position p covers
                p..p+k-1, so t occurrences fit in a window of length L only if
                last_start - first_start <= L - k.  The default of 0 reproduces
                the older, more permissive check that compares start positions only.

    Returns True if any t consecutive occurrences satisfy the window condition,
    otherwise False (including when fewer than t occurrences are given).
    '''
    max_spread = L - k
    # Only consecutive runs of t occurrences need checking because the positions are sorted.
    return any(indicies[first+t-1] - indicies[first] <= max_spread
               for first in range(len(indicies)-t+1))
4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner. 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic 6 | 7 | Problem Title: Minimum Skew Problem 8 | Assignment #: 01 9 | Problem ID: E 10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Peculiar-Statistics-of-the-Forward-and-Reverse-Half-Strands-7/#step-6 11 | ''' 12 | 13 | with open('data/stepic_1e.txt') as input_data: 14 | dna = input_data.read().strip() 15 | 16 | skew_value, min_skew, min_ind = 0, 1, [] 17 | for index, nucleotide in enumerate(dna): 18 | # Determine the skew value. 19 | if nucleotide == 'C': 20 | skew_value -= 1 21 | elif nucleotide == 'G': 22 | skew_value += 1 23 | # Check if it matches the current minimum, or is a new minimum. 24 | if skew_value == min_skew: 25 | min_ind.append(str(index+1)) 26 | elif skew_value < min_skew: 27 | min_skew = skew_value 28 | min_ind = [str(index+1)] 29 | 30 | print ' '.join(min_ind) 31 | with open('output/Assignment_01E.txt', 'w') as output_data: 32 | output_data.write(' '.join(min_ind)) 33 | -------------------------------------------------------------------------------- /Assignment_01F.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera. 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner. 
def MismatchList(kmer, d):
    '''Returns a list of all k-mers that mismatch a given k-mer by at most d characters.

    The original k-mer comes first, followed by the k-mers with exactly 1 mismatch,
    then exactly 2, and so on.  If d exceeds len(kmer) the extra distances simply
    contribute nothing.
    '''
    kmer_mismatches = [kmer]
    for i in range(1, d+1):
        # Each combination gives the indices we want to mismatch.
        kmer_mismatches += CreateMismatches([[kmer, list(combo)] for combo in combinations(range(len(kmer)), i)])
    return kmer_mismatches


def CreateMismatches(swap_list):
    '''Generates k-mer mismatches by replacing the characters at given indices with mismatching characters.

    swap_list -- list of [kmer, indices] pairs; every index listed for a k-mer is
                 replaced by each of the three other nucleotides.  As in the
                 original implementation, all pairs are assumed to carry the same
                 number of indices (only the first pair is inspected).

    Returns a flat list of the mismatched k-mers, or an empty list when
    swap_list is empty.
    '''
    nucleotides = 'ACGT'

    # Nothing to swap, e.g. when asking for more mismatches than the k-mer has characters.
    if not swap_list:
        return []

    def swap(string, ch, i):
        '''Swap the i-th character of string with the character ch.'''
        return string[:i] + ch + string[i+1:]

    # While more than one index is left to mismatch, consume the first index of every entry.
    while len(swap_list[0][1]) > 1:
        swap_list = [[swap(kmer, nuc, indicies[0]), indicies[1:]]
                     for kmer, indicies in swap_list
                     for nuc in nucleotides if nuc != kmer[indicies[0]]]

    # On the final mismatch return the list of k-mers.
    return [swap(kmer, nuc, index)
            for kmer, [index] in swap_list
            for nuc in nucleotides if nuc != kmer[index]]
64 | max_val = max(mismatch_dict.values()) 65 | kmers = [item[0] for item in mismatch_dict.items() if item[1] == max_val] 66 | 67 | print ' '.join(kmers) 68 | with open('output/Assignment_01G.txt', 'w') as output_data: 69 | output_data.write(' '.join(kmers)) 70 | -------------------------------------------------------------------------------- /Assignment_01H.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera. 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner. 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic 6 | 7 | Problem Title: Frequent Words with Mismatches and Reverse Complements Problem 8 | Assignment #: 01 9 | Problem ID: H 10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Some-Hidden-Messages-are-More-Elusive-than-Others-9/#step-5 11 | ''' 12 | 13 | from scripts import ReverseComplementDNA as RevComp 14 | from Assignment_01G import MismatchList 15 | 16 | with open('data/stepic_1h.txt') as input_data: 17 | dna, [k, d] = [line.strip() if index == 0 else map(int, line.strip().split()) for index, line in enumerate(input_data.readlines())] 18 | 19 | # Use a dictionary to count the occurence of each k-mer and its reverse complement with up to d mismatches on each. 20 | mismatch_dict = {} 21 | for i in xrange(len(dna)-k+1): 22 | for kmer in MismatchList(dna[i:i+k], d)+MismatchList(RevComp(dna[i:i+k]), d): 23 | if kmer in mismatch_dict: 24 | mismatch_dict[kmer] += 1 25 | else: 26 | mismatch_dict[kmer] = 1 27 | 28 | # Computing the maximum value is somewhat time consuming to repeat, so only do it once! 
29 | max_val = max(mismatch_dict.values()) 30 | kmers = [item[0] for item in mismatch_dict.items() if item[1] == max_val] 31 | 32 | print ' '.join(kmers) 33 | with open('output/Assignment_01H.txt', 'w') as output_data: 34 | output_data.write(' '.join(kmers)) 35 | -------------------------------------------------------------------------------- /Assignment_02A.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera. 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner. 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic 6 | 7 | Problem Title: Protein Translation Problem 8 | Assignment #: 02 9 | Problem ID: A 10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/How-Do-Bacteria-Make-Antibiotics-96/#step-3 11 | ''' 12 | 13 | # This is a repeat of Rosalind Problem 008: Translating RNA into Protein. 14 | from scripts import ProteinDictRNA 15 | 16 | with open('data/stepic_2a.txt') as input_data: 17 | s = input_data.read().strip() 18 | 19 | # Dictionary translating RNA to Protein 20 | rna_dict = ProteinDictRNA() 21 | 22 | s_protein = '' 23 | for i in range(0,len(s),3): 24 | if rna_dict[s[i:i+3]] != 'Stop': 25 | s_protein += rna_dict[s[i:i+3]] 26 | 27 | print s_protein 28 | 29 | with open('output/Assignment_02A.txt', 'w') as output_data: 30 | output_data.write(s_protein) 31 | -------------------------------------------------------------------------------- /Assignment_02B.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera. 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner. 
5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic 6 | 7 | Problem Title: Peptide Encoding Problem 8 | Assignment #: 02 9 | Problem ID: B 10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/How-Do-Bacteria-Make-Antibiotics-96/#step-6 11 | ''' 12 | 13 | from scripts import ProteinDictDNA 14 | from scripts import ReverseComplementDNA as RevComp 15 | 16 | with open('data/stepic_2b.txt') as input_data: 17 | dna, peptide = [line.strip() for line in input_data.readlines()] 18 | 19 | # Dictionary translating RNA to Protein 20 | dna_dict = ProteinDictDNA() 21 | 22 | encodings = [] 23 | for i in range(0,len(dna)-3*len(peptide)+1): 24 | # Get translate the current slice and its reverse complement to protein. 25 | dna_slice = dna[i:i+3*len(peptide)] 26 | proteins = [dna_dict[dna_slice[3*(j-1):3*j]] for j in range(1,len(peptide)+1)] 27 | proteins_rc =[dna_dict[RevComp(dna_slice)[3*(j-1):3*j]] for j in range(1,len(peptide)+1)] 28 | 29 | # Check if either translation matches the peptide. 30 | if ''.join(proteins) == peptide or ''.join(proteins_rc) == peptide: 31 | encodings.append(dna_slice) 32 | 33 | print '\n'.join(encodings) 34 | with open('output/Assignment_02B.txt', 'w') as output_data: 35 | output_data.write('\n'.join(encodings)) 36 | -------------------------------------------------------------------------------- /Assignment_02C.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera. 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner. 
def cyclospectrum(peptide):
    '''Returns the theoretical spectrum of a cyclic peptide.

    peptide -- string of amino acid letters, each a key of ProteinWeightDict().

    Returns a list of strings: the integer masses of 0, the whole peptide, and every
    cyclic subpeptide of length 1..len(peptide)-1, sorted in ascending numeric order.
    '''
    # Dictionary giving the weight of each amino acid.
    weight = ProteinWeightDict()

    # Look up each residue mass once; every window below reuses these integers.
    masses = [int(weight[protein]) for protein in peptide]
    n = len(masses)

    # Doubling the list lets windows wrap around the end of the cyclic peptide.
    doubled = masses * 2

    # Initialize as the mass 0 and the mass of the entire peptide.
    cyclospec = [0, sum(masses)]

    # Find the masses of the adjacent intermediary subpeptides.
    cyclospec += [sum(doubled[start:start+length]) for length in range(1, n) for start in range(n)]

    # Sort the list in ascending order and convert to strings (always a real list).
    return [str(mass) for mass in sorted(cyclospec)]
def append_char(add_list, add_chars):
    '''Returns a list containing all words possible from add_list with suffixes from add_chars.

    Duplicate entries in add_chars are ignored; each distinct suffix is appended
    once to every word of add_list.
    '''
    # Deduplicate the suffixes once instead of rebuilding the set for every word.
    suffixes = set(add_chars)
    return [item+ch for item in add_list for ch in suffixes]

def spectrum(peptide):
    '''Returns the linear spectrum of a given peptide.

    peptide -- string of amino acid letters, each a key of ProteinWeightDict().

    Returns a list of strings: the integer masses of 0, the whole peptide, and every
    contiguous subpeptide of length 1..len(peptide)-1, sorted in ascending numeric order.
    '''
    # Dictionary giving the weight of each amino acid.
    weight = ProteinWeightDict()
    # Look up each residue mass once; every window below reuses these integers.
    masses = [int(weight[protein]) for protein in peptide]
    n = len(masses)
    # Initialize as the mass 0 and the mass of the entire peptide.
    spec = [0, sum(masses)]
    # Find the masses of the adjacent intermediary subpeptides (no wrap-around: linear peptide).
    spec += [sum(masses[start:start+length]) for length in range(1, n) for start in range(n-length+1)]
    # Sort the list in ascending order and convert to strings (always a real list).
    return [str(mass) for mass in sorted(spec)]
49 | protein, i = [], 1 50 | while len(protein) != n: 51 | if int(cyclospec[i]) in map(int,weight.values()): 52 | protein.append(cyclospec[i]) 53 | i += 1 54 | 55 | # Get the name of each protein corresponding to a given weight (if multiple, only take one). 56 | names = [] 57 | for w in protein: 58 | names.append([items[0] for items in weight.items() if int(items[1])==int(w)][0]) 59 | 60 | # Build the possible sequences. 61 | seq = append_char(names,names) 62 | for repeat in xrange(1,n): 63 | seq = filter(lambda subpeptide:set(spectrum(subpeptide)) < set(cyclospec), set(seq)) 64 | if repeat != n-1: 65 | seq = append_char(seq,names) 66 | 67 | # Convert each protein to the proper format. 68 | cyclopeptide_sequence = ['-'.join([str(int(weight[protein])) for protein in peptide]) for peptide in seq] 69 | 70 | # Print and save the answer. 71 | print ' '.join(cyclopeptide_sequence) 72 | with open('output/Assignment_02D.txt', 'w') as output_data: 73 | output_data.write(' '.join(cyclopeptide_sequence)) 74 | -------------------------------------------------------------------------------- /Assignment_02E.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera. 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner. 
def append_protein(add_list):
    '''Returns a list containing all peptides from add_list with every possible protein suffix.'''
    # Build the amino acid alphabet once instead of once per peptide.
    alphabet = list(ProteinWeightDict().keys())
    return [item+ch for item in add_list for ch in alphabet]

def spectrum(peptide):
    '''Returns the circular spectrum of a given peptide as a sorted list of integer masses.

    Reads the module-level `weight` dictionary (amino acid letter -> mass), which the
    script creates in its __main__ section before calling this function.
    '''
    # Look up each residue mass once; every window below reuses these integers.
    masses = [int(weight[protein]) for protein in peptide]
    n = len(masses)
    # Doubling the list lets windows wrap around the end of the cyclic peptide.
    doubled = masses * 2
    # Initialize as the mass 0 and the mass of the entire peptide.
    spec = [0, sum(masses)]
    # Find the masses of the adjacent intermediary subpeptides.
    spec += [sum(doubled[start:start+length]) for length in range(1, n) for start in range(n)]

    return sorted(spec)

def spectrum_score(peptide, exp_spec):
    '''Returns the number of matching masses from the spectrum of peptide when compared with the spectrum exp_spec.

    peptide  -- string of amino acid letters.
    exp_spec -- experimental spectrum as a list of integer masses; its last entry is
                taken as the largest mass, so the list is expected in ascending order.

    Returns -1 if the peptide is heavier than the last mass of exp_spec, otherwise the
    size of the multiset intersection of the two spectra.
    '''
    # Imported locally so the function carries its own dependency.
    from collections import Counter

    pep_spec = spectrum(peptide)
    # Return -1 if the peptide has more mass than exp_spec.
    if pep_spec[-1] > exp_spec[-1]:
        return -1
    # Counter & Counter keeps, for each mass, the smaller of the two multiplicities.
    return sum((Counter(pep_spec) & Counter(exp_spec)).values())
49 | seq = filter(lambda L: L[0] != -1, [[spectrum_score(peptide,spec), peptide] for peptide in append_protein(weight.keys())]) 50 | 51 | # Build the sequence until the masses all grow too large. 52 | while seq != []: 53 | # Store the scores of the current sequence in a dictionary. 54 | scores = dict() 55 | for item in seq: 56 | if item[0] in scores: 57 | scores[item[0]].append(item[1]) 58 | else: 59 | scores[item[0]] = [item[1]] 60 | 61 | # Get the n leading scores with ties, remove lower scores from dictionary. 62 | leaders, leader_scores = [], [] 63 | if sum(len(peptides) for peptides in scores.values()) < n: 64 | leaders = scores[max(scores.keys())] 65 | else: 66 | while len(leaders) < n: 67 | leaders += scores[max(scores.keys())] 68 | del scores[max(scores.keys())] 69 | 70 | # Use this line to reduce runtime, removes excess ties. 71 | # leaders = leaders[:100] 72 | 73 | # Generate a new sequence of scores from the leaders. 74 | seq = filter(lambda L: L[0] != -1, [[spectrum_score(peptide,spec), peptide] for peptide in append_protein(leaders)]) 75 | 76 | # By construction, the scores are listed in descending order, so take the first peptide as the leader peptide. 77 | leader_peptide = '-'.join([str(int(weight[protein])) for protein in leaders[0]]) 78 | 79 | # Print and save the answer. 80 | print leader_peptide 81 | with open('output/Assignment_02E.txt', 'w') as output_data: 82 | output_data.write(leader_peptide) 83 | -------------------------------------------------------------------------------- /Assignment_02F.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera. 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner. 
# The course is run on Coursera and the assignments and textbook are hosted on Stepic
#
# Problem Title: Spectral Convolution Problem
# Assignment #: 02
# Problem ID: F
# URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/The-Spectral-Convolution-Saves-the-Day-104/#step-4

with open('data/stepic_2f.txt') as input_data:
    # A real list: the spectrum is iterated many times by the nested comprehension below.
    spec = [int(mass) for mass in input_data.read().split()]

# The spectrum isn't sorted, so find all differences and filter out the non-positive.
convolution = [str(i-j) for i in spec for j in spec if i-j > 0]

# Print and save the answer.
print(' '.join(convolution))
with open('output/Assignment_02F.txt', 'w') as output_data:
    output_data.write(' '.join(convolution))

# --------------------------------------------------------------------------------
# /Assignment_02G.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
'''
A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
The course is run on Coursera and the assignments and textbook are hosted on Stepic

Problem Title: Convolution Cyclopeptide Sequencing
Assignment #: 02
Problem ID: G
URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/The-Spectral-Convolution-Saves-the-Day-104/#step-7
'''

from collections import Counter


def append_protein(add_list, protein_alphabet):
    '''Returns a list containing all peptides from add_list with every possible protein suffix.'''
    return [item + [p] for item in add_list for p in protein_alphabet]


def spectrum(peptide):
    '''Returns the sorted circular spectrum of a given peptide (a list of masses).'''
    size = len(peptide)
    # Initialize as the mass 0 and the mass of the entire peptide.
    spect = [0, sum(peptide)]
    # Doubling the peptide lets a plain slice wrap around the end of the cycle.
    doubled = peptide * 2
    # Find the masses of the adjacent intermediary subpeptides.
    spect += [sum(doubled[j:j+i]) for i in range(1, size) for j in range(size)]
    return sorted(spect)


def spectrum_score(peptide, exp_spec):
    '''Returns the number of matching masses from the spectrum of peptide when compared with the spectrum exp_spec.

    Returns -1 if the peptide is heavier than the largest mass in exp_spec.
    '''
    pep_spec = spectrum(peptide)
    if pep_spec[-1] > max(exp_spec):
        return -1
    # Multiset intersection: each mass counts min(occurrences in both spectra) times.
    return sum((Counter(pep_spec) & Counter(exp_spec)).values())


def _leaderboard_cut(scored, n):
    '''Returns the peptides of the n best [score, peptide] entries in scored, ties included, best first.'''
    by_score = {}
    for points, peptide in scored:
        by_score.setdefault(points, []).append(peptide)

    leaders = []
    for points in sorted(by_score, reverse=True):
        if len(leaders) >= n:
            break
        leaders += by_score[points]
    return leaders


if __name__ == '__main__':

    with open('data/stepic_2g.txt') as input_data:
        m = int(input_data.readline().strip())
        n = int(input_data.readline().strip())
        spec = sorted(int(mass) for mass in input_data.readline().split())

    # Get the convolution.
    convolution = [i-j for i in spec for j in spec if i-j > 0]

    # Group the convolution elements between 57 and 200 by how often they occur.
    convo_dict = dict()
    for mass, num_c in sorted(Counter(c for c in convolution if 57 <= c <= 200).items()):
        convo_dict.setdefault(num_c, []).append(mass)

    # Get the top M elements (with ties); stop quietly if fewer than M exist.
    alphabet = []
    for num_c in sorted(convo_dict, reverse=True):
        if len(alphabet) >= m:
            break
        alphabet += convo_dict[num_c]

    # Initialize the overall leader.
    overall_leader = [-1, -1]
    # Build the initial peptides.
    seq = [[spectrum_score([mass], spec), [mass]] for mass in alphabet]
    seq = [entry for entry in seq if entry[0] != -1]

    # Build the sequence until the masses all grow too large.
    while seq:
        # If we have less than n total items, then use all of them.
        if len(seq) < n:
            leaders = [entry[1] for entry in seq]
        # Otherwise, get the n leading scores with ties.
        else:
            leaders = _leaderboard_cut(seq, n)

        # Use this line to reduce runtime, removes excess ties.
        # leaders = leaders[:100]

        # If necessary, update the overall leader with the first best-scoring peptide of this round.
        round_best = max(entry[0] for entry in seq)
        if overall_leader[0] <= round_best:
            best_peptide = next(entry[1] for entry in seq if entry[0] == round_best)
            overall_leader = [round_best, '-'.join(map(str, best_peptide))]

        # Generate a new sequence of scores from the leaders.
        seq = [[spectrum_score(peptide, spec), peptide] for peptide in append_protein(leaders, alphabet)]
        seq = [entry for entry in seq if entry[0] != -1]

    # Print and save the answer (str() keeps the -1 "nothing found" sentinel writable).
    print(overall_leader[1])
    with open('output/Assignment_02G.txt', 'w') as output_data:
        output_data.write(str(overall_leader[1]))

# --------------------------------------------------------------------------------
# /Assignment_03A.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
# The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
# The course is run on Coursera and the assignments and textbook are hosted on Stepic
#
# Problem Title: Motif Enumeration
# Assignment #: 03
# Problem ID: A
# URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Motif-Finding-Is-More-Difficult-Than-You-Think-156/#step-7

from Assignment_01G import MismatchList

with open('data/stepic_3a.txt') as input_data:
    k, d = map(int, input_data.readline().split())
    # Skip blank lines: an empty sequence has no k-mers and would empty the whole intersection.
    dna_list = [line.strip() for line in input_data if line.strip()]

# Generate sets of (k,d)-motifs for each dna sequence in the list.
motif_sets = [{kmer for i in range(len(dna)-k+1) for kmer in MismatchList(dna[i:i+k], d)} for dna in dna_list]

# Intersect all sets to get the common elements. The answers are displayed as sorted, so we'll sort too.
motifs = sorted(set.intersection(*motif_sets)) if motif_sets else []

# Print and save the answer.
print(' '.join(motifs))
with open('output/Assignment_03A.txt', 'w') as output_data:
    output_data.write(' '.join(motifs))

# --------------------------------------------------------------------------------
# /Assignment_03B.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
# The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
# The course is run on Coursera and the assignments and textbook are hosted on Stepic
#
# Problem Title: Median String Problem
# Assignment #: 03
# Problem ID: B
# URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/From-Motif-Finding-to-Finding-a-Median-String-158/#step-7

from itertools import product
from scripts import HammingDistance


def motif_score(pattern, motif):
    '''Returns the score of d(pattern, motif).

    This is the smallest HammingDistance (as computed by scripts.HammingDistance) between
    pattern and any window of motif that has the same length as pattern.
    '''
    width = len(pattern)
    return min(HammingDistance(motif[i:i+width], pattern) for i in range(len(motif)-width+1))


with open('data/stepic_3b.txt') as input_data:
    k = int(input_data.readline())
    # Skip blank lines: an empty sequence has no windows to score.
    dna_list = [line.strip() for line in input_data if line.strip()]

# Initialize the best pattern score as one greater than the maximum possible score.
best_pattern = [k*len(dna_list) + 1, None]

# Check the scores of all k-mers.
for letters in product('ACGT', repeat=k):
    # Join once per k-mer rather than once per dna sequence.
    pattern = ''.join(letters)
    current_score = sum(motif_score(pattern, dna) for dna in dna_list)
    if current_score < best_pattern[0]:
        best_pattern = [current_score, pattern]

# Print and save the answer.
print(best_pattern[1])
with open('output/Assignment_03B.txt', 'w') as output_data:
    output_data.write(best_pattern[1])

# --------------------------------------------------------------------------------
# /Assignment_03C.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
# The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
# The course is run on Coursera and the assignments and textbook are hosted on Stepic
#
# Problem Title: Profile-most Probable k-mer Problem
# Assignment #: 03
# Problem ID: C
# URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Greedy-Motif-Search-159/#step-3

with open('data/stepic_3c.txt') as input_data:
    dna = input_data.readline().strip()
    k = int(input_data.readline())
    # profile[0] is the header row of nucleotide letters; profile[1:] are the probability rows.
    # The rows are real lists so they can be indexed below.
    profile = [input_data.readline().split()]
    profile += [[float(value) for value in line.split()] for line in input_data]

# A dictionary relating nucleotides to their position within the profile.
nuc_loc = {nucleotide: index for index, nucleotide in enumerate(profile[0])}

# Initialize the maximum probability.
max_prob = [-1, None]

# Compute the probability of each k-mer, store it if it's currently a maximum.
for i in range(len(dna)-k+1):
    current_prob = 1
    for j, nucleotide in enumerate(dna[i:i+k]):
        current_prob *= profile[j+1][nuc_loc[nucleotide]]
    # Strict comparison: on ties the earliest k-mer is kept.
    if current_prob > max_prob[0]:
        max_prob = [current_prob, dna[i:i+k]]

# Print and save the answer.
print(max_prob[1])
with open('output/Assignment_03C.txt', 'w') as output_data:
    output_data.write(max_prob[1])

# --------------------------------------------------------------------------------
# /Assignment_03D.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
# The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
# The course is run on Coursera and the assignments and textbook are hosted on Stepic
#
# Problem Title: Greedy Motif Search
# Assignment #: 03
# Problem ID: D
# URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Greedy-Motif-Search-159/#step-5

from scripts import HammingDistance


def score(motifs):
    '''Returns the score of the dna list motifs.

    For every column, takes the smallest HammingDistance between the column and a string
    made of a single repeated nucleotide, and sums these over all columns.
    '''
    total = 0
    # zip(*motifs) walks the motifs column by column.
    for column in zip(*motifs):
        col = ''.join(column)
        total += min(HammingDistance(col, homogeneous*len(col)) for homogeneous in 'ACGT')
    return total


def profile(motifs):
    '''Returns the profile of the dna list motifs: one [A, C, G, T] frequency row per column.'''
    prof = []
    for column in zip(*motifs):
        col = ''.join(column)
        prof.append([float(col.count(nuc))/float(len(col)) for nuc in 'ACGT'])
    return prof


def profile_most_probable_kmer(dna, k, prof):
    '''Return the profile most probable k-mer in a given dna sequence.

    On ties the earliest k-mer wins; returns None if dna is shorter than k.
    '''
    # A dictionary relating nucleotides to their position within the profile.
    nuc_loc = {nucleotide: index for index, nucleotide in enumerate('ACGT')}
    # Initialize the maximum probability.
    max_prob = [-1, None]
    # Compute the probability of each k-mer, store it if it's currently a maximum.
    for i in range(len(dna)-k+1):
        current_prob = 1
        for j, nucleotide in enumerate(dna[i:i+k]):
            current_prob *= prof[j][nuc_loc[nucleotide]]
        if current_prob > max_prob[0]:
            max_prob = [current_prob, dna[i:i+k]]

    return max_prob[1]


if __name__ == '__main__':

    with open('data/stepic_3d.txt') as input_data:
        k, t = map(int, input_data.readline().split())
        dna_list = [line.strip() for line in input_data.readlines()]

    # Initialize the best score as a score higher than the highest possible score.
    best_score = [t*k, None]

    # Run the greedy motif search.
    for i in range(len(dna_list[0])-k+1):
        # Initialize the motifs as each k-mer from the first dna sequence.
        motifs = [dna_list[0][i:i+k]]
        current_profile = profile(motifs)

        # Find the most probable k-mer in the next string.
        for j in range(1, t):
            motifs.append(profile_most_probable_kmer(dna_list[j], k, current_profile))
            current_profile = profile(motifs)

        # Check to see if we have a new best scoring list of motifs.
        current_score = score(motifs)
        if current_score < best_score[0]:
            best_score = [current_score, motifs]

    # Print and save the answer.
    print('\n'.join(best_score[1]))
    with open('output/Assignment_03D.txt', 'w') as output_data:
        output_data.write('\n'.join(best_score[1]))

# --------------------------------------------------------------------------------
# /Assignment_03E.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
# The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
# The course is run on Coursera and the assignments and textbook are hosted on Stepic
#
# Problem Title: Greedy Motif Search with Pseudocounts
# Assignment #: 03
# Problem ID: E
# URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Motif-Finding-Meets-Oliver-Cromwell-160/#step-9

from Assignment_03D import score, profile_most_probable_kmer


def profile_with_pseudocounts(motifs):
    '''Returns the profile of the dna list motifs, adding one pseudocount to every nucleotide count.'''
    prof = []
    # zip(*motifs) walks the motifs column by column.
    for column in zip(*motifs):
        col = ''.join(column)
        # +1 per nucleotide in the numerator, hence +4 in the denominator.
        prof.append([float(col.count(nuc)+1)/float(len(col)+4) for nuc in 'ACGT'])
    return prof


if __name__ == '__main__':

    with open('data/stepic_3e.txt') as input_data:
        k, t = map(int, input_data.readline().split())
        dna_list = [line.strip() for line in input_data.readlines()]

    # Initialize the best score as a score higher than the highest possible score.
    best_score = [t*k, None]

    # Run the greedy motif search.
    for i in range(len(dna_list[0])-k+1):
        # Initialize the motifs as each k-mer from the first dna sequence.
        motifs = [dna_list[0][i:i+k]]
        current_profile = profile_with_pseudocounts(motifs)

        # Find the most probable k-mer in the next string, using pseudocounts.
        for j in range(1, t):
            motifs.append(profile_most_probable_kmer(dna_list[j], k, current_profile))
            current_profile = profile_with_pseudocounts(motifs)

        # Check to see if we have a new best scoring list of motifs.
        current_score = score(motifs)
        if current_score < best_score[0]:
            best_score = [current_score, motifs]

    # Print and save the answer.
    print('\n'.join(best_score[1]))
    with open('output/Assignment_03E.txt', 'w') as output_data:
        output_data.write('\n'.join(best_score[1]))

# --------------------------------------------------------------------------------
# /Assignment_03F.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
'''
A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
The course is run on Coursera and the assignments and textbook are hosted on Stepic

Problem Title: Randomized Motif Search
Assignment #: 03
Problem ID: F
URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Randomized-Motif-Search-161/#step-3
'''

from random import randint
from Assignment_03D import score, profile_most_probable_kmer
from Assignment_03E import profile_with_pseudocounts


def motifs_from_profile(profile, dna, k):
    '''Returns the profile most probable k-mer of every sequence in dna.'''
    return [profile_most_probable_kmer(seq, k, profile) for seq in dna]


def randomized_motif_search(dna, k, t):
    '''Runs one randomized motif search over dna and returns [score, motifs] for the best motifs seen.

    Starts from one random k-mer in each of the first t sequences and keeps re-deriving
    the motifs from their own profile until the score stops improving.
    '''
    # Randomly pick a k-mer from each sequence of the dna argument, sized to that sequence's own length.
    motifs = []
    for seq in dna[:t]:
        start = randint(0, len(seq)-k)
        motifs.append(seq[start:start+k])

    # Initialize the best score with the score of the random motifs.
    best_score = [score(motifs), motifs]

    # Iterate motifs.
    while True:
        current_profile = profile_with_pseudocounts(motifs)
        motifs = motifs_from_profile(current_profile, dna, k)
        current_score = score(motifs)
        if current_score >= best_score[0]:
            return best_score
        best_score = [current_score, motifs]


if __name__ == '__main__':

    with open('data/stepic_3f.txt') as input_data:
        k, t = map(int, input_data.readline().split())
        dna_list = [line.strip() for line in input_data.readlines()]

    # Initialize the best scoring motifs as a score higher than the highest possible score.
    best_motifs = [k*t, None]

    # Repeat the randomized motif search 1000 times.
    for repeat in range(1000):
        current_motifs = randomized_motif_search(dna_list, k, t)
        if current_motifs[0] < best_motifs[0]:
            best_motifs = current_motifs

    # Print and save the answer.
    print('\n'.join(best_motifs[1]))
    with open('output/Assignment_03F.txt', 'w') as output_data:
        output_data.write('\n'.join(best_motifs[1]))

# --------------------------------------------------------------------------------
# /Assignment_03G.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
# The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
# The course is run on Coursera and the assignments and textbook are hosted on Stepic.
#
# Problem Title: Gibbs Sampler
# Assignment #: 03
# Problem ID: G
# URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Gibbs-Sampling-163/#step-4

from random import randint
from Assignment_03D import score, profile_most_probable_kmer
from Assignment_03E import profile_with_pseudocounts


def gibbs_sampler(dna, k, t, N):
    '''Runs N sampling steps over dna and returns [score, motifs] for the best motifs seen.

    Each step drops one randomly chosen motif, builds a pseudocount profile from the
    rest, and replaces the dropped motif with the profile most probable k-mer of its sequence.
    '''
    # NOTE(review): the replacement k-mer is the most probable one, not a profile-weighted
    # random draw -- confirm this deviation from a textbook Gibbs sampler is intended.

    # Randomly generate k-mers from each sequence of the dna argument.
    rand_ints = [randint(0, len(dna[0])-k) for a in range(t)]
    motifs = [dna[i][r:r+k] for i, r in enumerate(rand_ints)]

    # Initialize the best score with the score of the random motifs.
    best_score = [score(motifs), motifs]

    # Iterate motifs.
    for i in range(N):
        r = randint(0, t-1)
        current_profile = profile_with_pseudocounts(motifs[:r] + motifs[r+1:])
        # Build a new list each step so the motifs stored in best_score are never altered.
        motifs = motifs[:r] + [profile_most_probable_kmer(dna[r], k, current_profile)] + motifs[r+1:]
        current_score = score(motifs)
        if current_score < best_score[0]:
            best_score = [current_score, motifs]

    return best_score


if __name__ == '__main__':

    with open('data/stepic_3g.txt') as input_data:
        k, t, N = map(int, input_data.readline().split())
        dna_list = [line.strip() for line in input_data.readlines()]

    # Initialize the best scoring motifs as a score higher than the highest possible score.
    best_motifs = [k*t, None]

    # Repeat the Gibbs sampler 20 times.
    for repeat in range(20):
        current_motifs = gibbs_sampler(dna_list, k, t, N)
        if current_motifs[0] < best_motifs[0]:
            best_motifs = current_motifs

    # Print and save the answer.
    print('\n'.join(best_motifs[1]))
    with open('output/Assignment_03G.txt', 'w') as output_data:
        output_data.write('\n'.join(best_motifs[1]))

# --------------------------------------------------------------------------------
# /Assignment_04A.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
'''
A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
The course is run on Coursera and the assignments and textbook are hosted on Stepic

Problem Title: String Composition Problem
Assignment #: 04
Problem ID: A
URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/The-String-Reconstruction-Problem-197/#step-3
'''

with open('data/stepic_4a.txt') as input_data:
    k = int(input_data.readline().strip())
    text = input_data.readline().strip()

# Generate the list of all k-mers in text and sort them lexicographically.
composition = sorted(text[i:i+k] for i in range(len(text)-k+1))

# Print and save the answer.
print('\n'.join(composition))
with open('output/Assignment_04A.txt', 'w') as output_data:
    output_data.write('\n'.join(composition))

# --------------------------------------------------------------------------------
# /Assignment_04B.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
# The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
# The course is run on Coursera and the assignments and textbook are hosted on Stepic
#
# Problem Title: Overlap Graph Problem
# Assignment #: 04
# Problem ID: B
# URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/String-Reconstruction-as-a-Walk-Through-the-Overlap-Graph-198/#step-7

with open('data/stepic_4b.txt') as input_data:
    dna = [line.strip() for line in input_data if line.strip()]


def check_overlap(pair):
    '''Returns True if the suffix of pair[0] equals the prefix of pair[1].'''
    return pair[0][1:] == pair[1][:-1]


def print_overlap(pair):
    '''Returns the pair formatted as "first -> second".'''
    return ' -> '.join(pair)


# Get all pairs, filter out non-overlapping pairs, print overlapping pairs appropriately.
pairs = ([dna1, dna2] for i, dna1 in enumerate(dna) for j, dna2 in enumerate(dna) if i != j)
# A real list: it is joined twice below, and a one-shot iterator would be empty the second time.
overlaps = [print_overlap(pair) for pair in pairs if check_overlap(pair)]

# Print and save the answer.
print('\n'.join(overlaps))
with open('output/Assignment_04B.txt', 'w') as output_data:
    output_data.write('\n'.join(overlaps))

# --------------------------------------------------------------------------------
# /Assignment_04C.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
'''
A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
The course is run on Coursera and the assignments and textbook are hosted on Stepic

Problem Title: De Bruijn Graph from a String Problem
Assignment #: 04
Problem ID: C
URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Another-Graph-for-String-Reconstruction-199/#step-6
'''

# Read the input data.
with open('data/stepic_4c.txt') as input_data:
    k = int(input_data.readline())
    dna = input_data.readline().strip()

# Create a dictionary matching (k-1)-mers to their followers.
# Followers are kept in a list, not a set: a k-mer that occurs twice contributes two
# edges to the De Bruijn graph, and a set would silently merge them.
de_bruijn_dict = dict()
for i in range(len(dna)-k+1):
    kmer = dna[i:i+k]
    de_bruijn_dict.setdefault(kmer[:-1], []).append(kmer[1:])

# Write the De Bruijn Graph in the specified format (sorted so the output is reproducible).
de_buijn = [' -> '.join([prefix, ','.join(sorted(followers))]) for prefix, followers in sorted(de_bruijn_dict.items())]

# Print and save the answer.
print('\n'.join(de_buijn))
with open('output/Assignment_04C.txt', 'w') as output_data:
    output_data.write('\n'.join(de_buijn))

# --------------------------------------------------------------------------------
# /Assignment_04D.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
'''
A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
The course is run on Coursera and the assignments and textbook are hosted on Stepic

Problem Title: De Bruijn Graph from k-mers Problem
Assignment #: 4
Problem ID: D
URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Another-Walk-200/#step-7
'''

# Read the input data.
with open('data/stepic_4d.txt') as input_data:
    kmers = [line.strip() for line in input_data.readlines()]

# Create a dictionary matching (k-1)-mers to their followers.
# Followers are kept in a list, not a set: a k-mer that is given twice contributes two
# edges to the De Bruijn graph, and a set would silently merge them.
de_bruijn_dict = dict()
for kmer in kmers:
    de_bruijn_dict.setdefault(kmer[:-1], []).append(kmer[1:])

# Write the De Bruijn Graph in the specified format (sorted so the output is reproducible).
de_buijn = [' -> '.join([prefix, ','.join(sorted(followers))]) for prefix, followers in sorted(de_bruijn_dict.items())]

# Print and save the answer.
print('\n'.join(de_buijn))
with open('output/Assignment_04D.txt', 'w') as output_data:
    output_data.write('\n'.join(de_buijn))

# --------------------------------------------------------------------------------
# /Assignment_04E.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
'''
A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
The course is run on Coursera and the assignments and textbook are hosted on Stepic

Problem Title: Eulerian Cycle Problem
Assignment #: 04
Problem ID: E
URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/From-Eulers-Theorem-to-an-Algorithm-for-Finding-Eulerian-Cycles-203/#step-2
'''


def eulerian_cycle(edge_dict):
    '''Generates an Eulerian cycle from the given edges.

    edge_dict maps each node to the list of nodes its outgoing edges lead to. It is left
    untouched; the traversal consumes a private copy. Returns the cycle as a list of nodes
    whose first and last entries coincide, or an empty list if there are no edges.
    Raises ValueError if some edges cannot be reached from the starting node.
    '''
    # Unused edges, stored reversed so that pop() hands them out in their original order.
    unused = {node: list(targets)[::-1] for node, targets in edge_dict.items() if targets}
    if not unused:
        return []

    def walk(start):
        '''Follows unused edges from start until reaching a node with none left.'''
        trail = [start]
        while trail[-1] in unused:
            targets = unused[trail[-1]]
            trail.append(targets.pop())
            if not targets:
                del unused[trail[-2]]
        return trail

    # Get the initial cycle.
    path = walk(next(iter(unused)))

    # Continually expand the initial cycle until we're out of unused edges.
    while unused:
        for i, node in enumerate(path):
            if node in unused:
                # Splice the detour in place of the single node it starts and ends on.
                path[i:i+1] = walk(node)
                break
        else:
            # No node on the path has an unused edge left: the rest is unreachable.
            raise ValueError('graph is not connected; no Eulerian cycle exists')
    return path


if __name__ == '__main__':

    # Read the input data.
    with open('data/stepic_4e.txt') as input_data:
        edges = {}
        for line in input_data:
            if not line.strip():
                continue
            edge = line.strip().split(' -> ')
            edges[int(edge[0])] = [int(node) for node in edge[1].split(',')]

    # Get the Eulerian cycle.
    path = eulerian_cycle(edges)

    # Print and save the answer.
    print('->'.join(map(str, path)))
    with open('output/Assignment_04E.txt', 'w') as output_data:
        output_data.write('->'.join(map(str, path)))

# --------------------------------------------------------------------------------
# /Assignment_04E_with_NetworkX.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
'''
A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
The course is run on Coursera and the assignments and textbook are hosted on Stepic

Problem Title: Eulerian Cycle Problem
Assignment #: 04
Problem ID: E
URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/From-Eulers-Theorem-to-an-Algorithm-for-Finding-Eulerian-Cycles-203/#step-2
'''

import networkx as nx

# Read the input data.
with open('data/stepic_4e.txt') as input_data:
    edges = [line.strip().split(' -> ') for line in input_data if line.strip()]

# Properly format the edges as (source, target) integer pairs.
edges2 = []
for edge in edges:
    for node in edge[1].split(','):
        edges2.append((int(edge[0]), int(node)))

# Create the graph. A multigraph keeps repeated edges, which a plain DiGraph would merge.
G = nx.MultiDiGraph()
G.add_edges_from(edges2)

# Find an eulerian cycle.
path = [str(e[0]) for e in nx.eulerian_circuit(G)]
path.append(path[0])

# Print and save the answer.
print('->'.join(path))
with open('output/Assignment_04E.txt', 'w') as output_data:
    output_data.write('->'.join(path))

# --------------------------------------------------------------------------------
# /Assignment_05A.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
'''
A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
The course is run on Coursera and the assignments and textbook are hosted on Stepic

Problem Title: Eulerian Path Problem
Assignment #: 05
Problem ID: A
URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/From-Eulers-Theorem-to-an-Algorithm-for-Finding-Eulerian-Cycles-203/#step-5
'''

from collections import Counter

from Assignment_04E import eulerian_cycle


def eulerian_path(edge_dict):
    '''Returns an Eulerian path from the given edges.

    edge_dict maps each node to the list of nodes its outgoing edges lead to; it is not
    modified. If every node is already balanced, the Eulerian cycle is returned as is.
    '''
    # Work on a copy so neither the added edge nor the cycle search alters the caller's dict.
    graph = {node: list(targets) for node, targets in edge_dict.items()}

    # Determine the unbalanced nodes by comparing incoming and outgoing edge counts.
    incoming = Counter(target for targets in graph.values() for target in targets)
    unbalanced_from = unbalanced_to = None
    for node in set(incoming) | set(graph):
        outgoing = len(graph.get(node, []))
        if outgoing < incoming[node]:
            # More edges arrive than leave: this is where the path ends.
            unbalanced_from = node
        elif incoming[node] < outgoing:
            # More edges leave than arrive: this is where the path starts.
            unbalanced_to = node

    if unbalanced_from is None or unbalanced_to is None:
        return eulerian_cycle(graph)

    # Add an edge connecting the unbalanced nodes (path end -> path start).
    graph.setdefault(unbalanced_from, []).append(unbalanced_to)

    # Get the Eulerian Cycle from the edges, including the added edge.
    cycle = eulerian_cycle(graph)

    # Find the location of the added edge in the eulerian cycle.
    divide_point = next(i for i in range(len(cycle)-1) if cycle[i:i+2] == [unbalanced_from, unbalanced_to])

    # Remove the added edge, and shift appropriately, overlapping the head and tail.
    return cycle[divide_point+1:] + cycle[1:divide_point+1]


if __name__ == '__main__':

    # Read the input data.
    with open('data/stepic_5a.txt') as input_data:
        edges = {}
        for line in input_data:
            if not line.strip():
                continue
            edge = line.strip().split(' -> ')
            edges[int(edge[0])] = [int(node) for node in edge[1].split(',')]

    # Get the Eulerian path associated with the edges.
    path = eulerian_path(edges)

    # Print and save the answer.
    print('->'.join(map(str, path)))
    with open('output/Assignment_05A.txt', 'w') as output_data:
        output_data.write('->'.join(map(str, path)))

# --------------------------------------------------------------------------------
# /Assignment_05B.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
# Read the input data: one 'source -> target' overlap edge per line.
with open('data/stepic_5b.txt') as input_data:
    string_dict = {}
    for line in input_data:
        # Skip blank lines (e.g. a trailing newline) instead of crashing on them.
        if line.strip():
            parts = line.strip().split(' -> ')
            string_dict[parts[0]] = parts[1]

# Find the head and tail strings of the reconstructed string.
# The head is never a target; the tail is never a source.
targets = set(string_dict.values())
head = next(source for source in string_dict if source not in targets)
tail = next(target for target in string_dict.values() if target not in string_dict)

# Walk the chain from head to tail, collecting the first character of each string.
pieces = [head[0]]
current_str = head
while current_str != tail:
    current_str = string_dict[current_str]
    pieces.append(current_str[0])

# Complete the reconstruction by adding the end of the tail.
pieces.append(tail[1:])
reconstructed_str = ''.join(pieces)

# Print and save the answer.
print(reconstructed_str)
with open('output/Assignment_05B.txt', 'w') as output_data:
    output_data.write(reconstructed_str)
from Assignment_04E import eulerian_cycle
from itertools import product

# Read the input data.
with open('data/stepic_5c.txt') as input_data:
    k = int(input_data.read().strip())

# Create the edges: every binary k-mer is an edge from its prefix to its suffix.
universal_dict = {}
for bits in product('01', repeat=k):
    kmer = ''.join(bits)
    universal_dict.setdefault(kmer[:-1], []).append(kmer[1:])

# Get the cycle, remove the repeated last entry for the associated path.
path = eulerian_cycle(universal_dict)

# Spell the universal string once and reuse it for both outputs.
universal_string = ''.join([item[0] for item in path[:-1]])

# Print and save the answer.
print(universal_string)
with open('output/Assignment_05C.txt', 'w') as output_data:
    output_data.write(universal_string)
from collections import Counter

# Read the input data (blank lines are ignored).
with open('data/stepic_5e.txt') as input_data:
    kmers = [line.strip() for line in input_data if line.strip()]

# Construct a dictionary of edges: each k-mer is an edge from its prefix to its suffix.
edges = {}
for kmer in kmers:
    edges.setdefault(kmer[:-1], []).append(kmer[1:])

# Determine the balanced nodes, i.e. those with exactly one incoming and one outgoing edge.
# A set gives constant-time membership tests while contigs are extended.
in_degree = Counter(target for targets in edges.values() for target in targets)
balanced = set(node for node, targets in edges.items()
               if len(targets) == 1 and in_degree[node] == 1)

# Generate the contigs.  Each contig starts with an edge leaving a non-balanced node
# and is extended through balanced nodes until a non-balanced node is reached.
# The walk is iterative, so long contigs cannot exhaust the recursion limit.
contigs = []
for start in edges:
    if start in balanced:
        continue
    for node in edges[start]:
        pieces = [start, node[-1]]
        while node in balanced:
            node = edges[node][0]
            pieces.append(node[-1])
        contigs.append(''.join(pieces))
contigs.sort()

# Print and save the answer.
print('\n'.join(contigs))
with open('output/Assignment_05E.txt', 'w') as output_data:
    output_data.write('\n'.join(contigs))
def DPChange(amount, coin_list):
    '''Gives the minimum number of coins of denominations in coin_list necessary to create the given amount.

    amount is a non-negative integer and coin_list a non-empty sequence of
    positive integer denominations.  If the amount cannot be formed from the
    denominations, the value returned is amount // min(coin_list) + 1, which is
    larger than any achievable coin count.
    '''
    # Sentinel strictly above the largest possible answer.  Floor division keeps
    # it an integer on both Python 2 and Python 3.
    unreachable = amount // min(coin_list) + 1
    min_coins = [0] + [unreachable] * amount

    # Use dynamic programming to build up to the desired amount.
    for m in range(1, amount + 1):
        for coin in coin_list:
            if coin <= m and min_coins[m - coin] + 1 < min_coins[m]:
                min_coins[m] = min_coins[m - coin] + 1

    return min_coins[amount]
def manhattan_tourist(n, m, down, right):
    '''Returns the length of the longest path from (0,0) to (n,m) moving only down or right.

    down[i][j] is the weight of the edge from (i,j) to (i+1,j), so it has n rows
    of m+1 entries; right[i][j] is the weight of the edge from (i,j) to (i,j+1),
    so it has n+1 rows of m entries.
    '''
    # S[i][j] holds the best score of any path ending at (i,j).
    S = [[0] * (m + 1) for _ in range(n + 1)]

    # The first column and first row can each be reached in only one way.
    for i in range(1, n + 1):
        S[i][0] = S[i - 1][0] + down[i - 1][0]
    for j in range(1, m + 1):
        S[0][j] = S[0][j - 1] + right[0][j - 1]

    # Every interior node is entered either from above or from the left.
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            S[i][j] = max(S[i - 1][j] + down[i - 1][j], S[i][j - 1] + right[i][j - 1])

    return S[n][m]
# No need for two functions, as in the problem description. The recursive function is
# replaced with a while loop. Also, no need for the backtrack array, as all of that
# information is easily recoverable from the score array.
def longest_common_subsequence(v, w):
    '''Returns a longest common subsequence of strings v and w.'''
    # S[i][j] is the length of a longest common subsequence of v[:i] and w[:j].
    S = [[0] * (len(w) + 1) for _ in range(len(v) + 1)]
    for i, v_char in enumerate(v):
        for j, w_char in enumerate(w):
            if v_char == w_char:
                S[i + 1][j + 1] = S[i][j] + 1
            else:
                S[i + 1][j + 1] = max(S[i + 1][j], S[i][j + 1])

    # Recover a maximum subsequence by walking back from the bottom-right corner.
    # Characters are collected in reverse and joined once at the end.
    reversed_chars = []
    i, j = len(v), len(w)
    while i > 0 and j > 0:
        if S[i][j] == S[i - 1][j]:
            i -= 1
        elif S[i][j] == S[i][j - 1]:
            j -= 1
        else:
            # Neither neighbour carries the score, so v[i-1] == w[j-1] was matched here.
            reversed_chars.append(v[i - 1])
            i -= 1
            j -= 1

    return ''.join(reversed(reversed_chars))
def topological_ordering(graph):
    '''Returns a topological ordering of the nodes of the given graph.

    graph is an iterable of edges, each indexable as (tail, head).  Only nodes
    that appear in some edge are ordered; nodes lying on a cycle are left out.
    '''
    from collections import deque

    # Build successor lists and in-degree counts once, instead of rescanning
    # the remaining edges for every candidate node.
    successors, in_degree = {}, {}
    for edge in set(graph):
        successors.setdefault(edge[0], []).append(edge[1])
        in_degree.setdefault(edge[0], 0)
        in_degree[edge[1]] = in_degree.get(edge[1], 0) + 1

    # Start from the nodes with no incoming edges.
    candidates = deque(node for node in in_degree if in_degree[node] == 0)

    ordering = []
    while candidates:
        node = candidates.popleft()
        ordering.append(node)

        # "Remove" the outgoing edges; a successor becomes a candidate once
        # its last incoming edge is gone.
        for successor in successors.get(node, ()):
            in_degree[successor] -= 1
            if in_degree[successor] == 0:
                candidates.append(successor)

    return ordering


def longest_path(graph, edges, source, sink):
    '''Returns the length and path of the longest path from source to sink.

    graph maps each edge (tail, head) to its weight.  The edges argument is
    unused and is kept only so existing callers keep working.  Returns a tuple
    (length, path) where path is the list of nodes from source to sink.

    Raises ValueError if sink cannot be reached from source.
    '''
    # Index the incoming edges of every node once.
    predecessors = {}
    for edge in graph:
        predecessors.setdefault(edge[1], []).append(edge[0])

    # S holds the best distance of every node reachable from source.  Nodes
    # absent from S are unreachable and must never contribute to a maximum.
    S = {source: 0}
    backtrack = {}

    # Iterate through the topological order to get the distances, storing predecessors in backtrack.
    for node in topological_ordering(graph.keys()):
        if node == source:
            continue
        best = None
        for previous in predecessors.get(node, ()):
            if previous in S:
                candidate = S[previous] + graph[previous, node]
                if best is None or candidate > best[0]:
                    best = (candidate, previous)
        if best is not None:
            S[node], backtrack[node] = best

    if sink not in S:
        raise ValueError('sink is not reachable from source')

    # Backtrack to get the longest path.
    path = [sink]
    while path[-1] != source:
        path.append(backtrack[path[-1]])
    path.reverse()

    return S[sink], path
def global_alignment(v, w, scoring_matrix, sigma):
    '''Returns the score and a global alignment of strings v and w.

    scoring_matrix is indexed as scoring_matrix[a, b] for characters a of v and
    b of w; sigma is the (positive) penalty subtracted for every indel.
    Returns a tuple (score as a string, aligned v, aligned w), where gaps in
    the aligned strings are written as '-'.
    '''
    rows, cols = len(v) + 1, len(w) + 1

    # Initialize the matrices.
    S = [[0] * cols for _ in range(rows)]
    backtrack = [[0] * cols for _ in range(rows)]

    # Initialize the edges with the given penalties.
    for i in range(1, rows):
        S[i][0] = -i * sigma
    for j in range(1, cols):
        S[0][j] = -j * sigma

    # Fill in the Score and Backtrack matrices.
    # Backtrack codes: 0 = gap in w (move up), 1 = gap in v (move left), 2 = diagonal.
    for i in range(1, rows):
        for j in range(1, cols):
            scores = [S[i - 1][j] - sigma,
                      S[i][j - 1] - sigma,
                      S[i - 1][j - 1] + scoring_matrix[v[i - 1], w[j - 1]]]
            S[i][j] = max(scores)
            backtrack[i][j] = scores.index(S[i][j])

    # The global score sits in the bottom-right cell.
    i, j = len(v), len(w)
    max_score = str(S[i][j])

    # Backtrack to the edge of the matrix, collecting the aligned columns in reverse.
    v_reversed, w_reversed = [], []
    while i > 0 and j > 0:
        move = backtrack[i][j]
        if move == 0:
            i -= 1
            v_reversed.append(v[i])
            w_reversed.append('-')
        elif move == 1:
            j -= 1
            v_reversed.append('-')
            w_reversed.append(w[j])
        else:
            i -= 1
            j -= 1
            v_reversed.append(v[i])
            w_reversed.append(w[j])

    # Whatever remains of one string is aligned against leading indels to get to (0,0).
    while i > 0:
        i -= 1
        v_reversed.append(v[i])
        w_reversed.append('-')
    while j > 0:
        j -= 1
        v_reversed.append('-')
        w_reversed.append(w[j])

    return max_score, ''.join(reversed(v_reversed)), ''.join(reversed(w_reversed))
def local_alignment(v, w, scoring_matrix, sigma):
    '''Returns the score and local alignment with the given scoring matrix and indel penalty sigma for strings v, w.

    scoring_matrix is indexed as scoring_matrix[a, b] for characters a of v and
    b of w.  Returns a tuple (score as a string, aligned piece of v, aligned
    piece of w), where gaps are written as '-'.
    '''
    rows, cols = len(v) + 1, len(w) + 1

    # Initialize the matrices.
    S = [[0] * cols for _ in range(rows)]
    backtrack = [[0] * cols for _ in range(rows)]

    # Fill in the Score and Backtrack matrices, remembering the first cell
    # (in row-major order) that holds the highest score.
    # Backtrack codes: 0 = gap in w, 1 = gap in v, 2 = diagonal, 3 = start of the alignment.
    best_i, best_j = 0, 0
    for i in range(1, rows):
        for j in range(1, cols):
            scores = [S[i - 1][j] - sigma,
                      S[i][j - 1] - sigma,
                      S[i - 1][j - 1] + scoring_matrix[v[i - 1], w[j - 1]],
                      0]
            S[i][j] = max(scores)
            backtrack[i][j] = scores.index(S[i][j])
            if S[i][j] > S[best_i][best_j]:
                best_i, best_j = i, j

    # Start from the highest scoring cell.
    i, j = best_i, best_j
    max_score = str(S[i][j])

    # Backtrack to the start of the local alignment, collecting the aligned columns in reverse.
    v_reversed, w_reversed = [], []
    while i > 0 and j > 0 and backtrack[i][j] != 3:
        move = backtrack[i][j]
        if move == 0:
            i -= 1
            v_reversed.append(v[i])
            w_reversed.append('-')
        elif move == 1:
            j -= 1
            v_reversed.append('-')
            w_reversed.append(w[j])
        else:
            i -= 1
            j -= 1
            v_reversed.append(v[i])
            w_reversed.append(w[j])

    return max_score, ''.join(reversed(v_reversed)), ''.join(reversed(w_reversed))
def fitting_alignment(v,w):
    '''Returns the fitting alignment of strings v and w.

    All of w is aligned against a substring of v, scoring +1 for a match and -1
    for a mismatch or an indel.  Returns a tuple (score as a string, aligned
    piece of v, aligned w), where gaps are written as '-'.
    '''
    rows, cols = len(v) + 1, len(w) + 1

    # Initialize the matrices.
    S = [[0] * cols for _ in range(rows)]
    backtrack = [[0] * cols for _ in range(rows)]

    # Starting anywhere in v is free (column 0 stays 0), but every character of
    # w must be aligned, so skipping a prefix of w costs one indel per character.
    for j in range(1, cols):
        S[0][j] = -j

    # Fill in the Score and Backtrack matrices.
    # Backtrack codes: 0 = gap in w, 1 = gap in v, 2 = diagonal.
    for i in range(1, rows):
        for j in range(1, cols):
            scores = [S[i - 1][j] - 1,
                      S[i][j - 1] - 1,
                      S[i - 1][j - 1] + [-1, 1][v[i - 1] == w[j - 1]]]
            S[i][j] = max(scores)
            backtrack[i][j] = scores.index(S[i][j])

    # Get the position of the highest scoring cell in the last column, i.e. at the
    # end of w.  The search includes the final row, so the alignment may end at the
    # last character of v.
    j = len(w)
    first_row = min(len(v), len(w))
    i = max(range(first_row, rows), key=lambda row: S[row][j])
    max_score = str(S[i][j])

    # Backtrack to the start of the fitting alignment, collecting the aligned columns in reverse.
    v_reversed, w_reversed = [], []
    while i > 0 and j > 0:
        move = backtrack[i][j]
        if move == 0:
            i -= 1
            v_reversed.append(v[i])
            w_reversed.append('-')
        elif move == 1:
            j -= 1
            v_reversed.append('-')
            w_reversed.append(w[j])
        else:
            i -= 1
            j -= 1
            v_reversed.append(v[i])
            w_reversed.append(w[j])

    # If the top row was reached first, the rest of w is aligned against indels.
    while j > 0:
        j -= 1
        v_reversed.append('-')
        w_reversed.append(w[j])

    return max_score, ''.join(reversed(v_reversed)), ''.join(reversed(w_reversed))
4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner. 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic 6 | 7 | Problem Title: Overlap Alignment Problem 8 | Assignment #: 07 9 | Problem ID: C 10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/The-Changing-Faces-of-Sequence-Alignment-248/step/7 11 | ''' 12 | 13 | 14 | def overlap_alignment(v, w): 15 | '''Returns the overlap alignment of strings v and w.''' 16 | 17 | # Initialize the arrays. 18 | S = [[0 for repeat_j in xrange(len(w)+1)] for repeat_i in xrange(len(v)+1)] 19 | backtrack = [[0 for repeat_j in xrange(len(w)+1)] for repeat_i in xrange(len(v)+1)] 20 | 21 | # Initialize the max score. 22 | max_score = -3*(len(v) + len(w)) 23 | 24 | # Fill in the Score and Backtrack arrays. 25 | for i in xrange(1, len(v)+1): 26 | for j in xrange(1, len(w)+1): 27 | # Match score = 1, Mismatch and Indels = -2. 28 | scores = [S[i-1][j-1] + [-2, 1][v[i-1] == w[j-1]], S[i-1][j] - 2, S[i][j-1] - 2] 29 | S[i][j] = max(scores) 30 | backtrack[i][j] = scores.index(S[i][j]) 31 | 32 | # Check if we have a new maximum along the last row or column and update accordingly. 33 | if i == len(v) or j == len(w): 34 | if S[i][j] > max_score: 35 | max_score = S[i][j] 36 | max_indices = (i, j) 37 | 38 | # Initialize i and j as their corresponding index of the maximum score. 39 | i, j = max_indices 40 | 41 | # Initialize the aligned strings as the input strings, removing the unused tails. 42 | v_aligned, w_aligned = v[:i], w[:j] 43 | 44 | # Quick lambda function to insert indels. 45 | insert_indel = lambda word, i: word[:i] + '-' + word[i:] 46 | 47 | # Backtrack to the first row or column from the highest score in the last row or column. 
def global_alignment_affine_gap_penalty(v, w, scoring_matrix, sigma, epsilon):
    '''Returns the global alignment of v and w under an affine gap penalty.

    A gap costs sigma to open and epsilon for every further extension;
    aligned symbols are scored with scoring_matrix[a, b].

    Returns a tuple (score, v_aligned, w_aligned) with '-' marking indels.

    Three score matrices are kept: "lower" (currently inside a gap in w),
    "upper" (currently inside a gap in v) and "middle" (no constraint).  The
    traceback follows pointers in all three levels, so that an extended gap is
    reconstructed as one run and the alignment always matches the score.
    '''
    n, m = len(v), len(w)
    neg_inf = float('-inf')

    # Codes stored in middle_from; they match the candidate order used below.
    LOWER, DIAGONAL, UPPER = 0, 1, 2

    lower = [[neg_inf]*(m+1) for _ in range(n+1)]
    middle = [[neg_inf]*(m+1) for _ in range(n+1)]
    upper = [[neg_inf]*(m+1) for _ in range(n+1)]

    # True where the best way into a gap cell extends an already open gap.
    lower_extends = [[False]*(m+1) for _ in range(n+1)]
    upper_extends = [[False]*(m+1) for _ in range(n+1)]
    middle_from = [[DIAGONAL]*(m+1) for _ in range(n+1)]

    # Edges: a single gap run from the origin.  Unreachable states keep -inf
    # rather than a finite stand-in, which could win on long inputs.
    middle[0][0] = 0
    for i in range(1, n+1):
        lower[i][0] = middle[i][0] = -sigma - (i-1)*epsilon
        lower_extends[i][0] = i > 1
        middle_from[i][0] = LOWER
    for j in range(1, m+1):
        upper[0][j] = middle[0][j] = -sigma - (j-1)*epsilon
        upper_extends[0][j] = j > 1
        middle_from[0][j] = UPPER

    for i in range(1, n+1):
        for j in range(1, m+1):
            extend = lower[i-1][j] - epsilon
            start = middle[i-1][j] - sigma
            lower_extends[i][j] = extend >= start
            lower[i][j] = max(extend, start)

            extend = upper[i][j-1] - epsilon
            start = middle[i][j-1] - sigma
            upper_extends[i][j] = extend >= start
            upper[i][j] = max(extend, start)

            candidates = [lower[i][j],
                          middle[i-1][j-1] + scoring_matrix[v[i-1], w[j-1]],
                          upper[i][j]]
            middle[i][j] = max(candidates)
            middle_from[i][j] = candidates.index(middle[i][j])

    max_score = middle[n][m]

    # Traceback from the bottom-right corner of the middle level, collecting
    # the aligned columns in reverse order.
    v_columns, w_columns = [], []
    i, j, level = n, m, 'middle'
    while i > 0 or j > 0:
        if level == 'middle':
            move = middle_from[i][j]
            if move == DIAGONAL:
                i -= 1
                j -= 1
                v_columns.append(v[i])
                w_columns.append(w[j])
            elif move == LOWER:
                # Zero-cost switch of level; the cell stays the same.
                level = 'lower'
            else:
                level = 'upper'
        elif level == 'lower':
            stay = lower_extends[i][j]
            i -= 1
            v_columns.append(v[i])
            w_columns.append('-')
            if not stay:
                level = 'middle'
        else:
            stay = upper_extends[i][j]
            j -= 1
            v_columns.append('-')
            w_columns.append(w[j])
            if not stay:
                level = 'middle'

    return max_score, ''.join(reversed(v_columns)), ''.join(reversed(w_columns))
def middle_column_score(v, w, scoring_matrix, sigma):
    '''Returns the scores of the middle column for the alignment of v and w.

    The middle column is column len(w)//2 of the global alignment matrix
    (linear indel penalty sigma, substitutions from scoring_matrix[a, b]).
    Only two columns are held in memory at any time.

    Returns a tuple (scores, backtrack).  scores[i] is the score of node
    (i, len(w)//2).  backtrack[i] records how that node was reached:
    0 = diagonal, 1 = horizontal (from the previous column), 2 = vertical
    (from the node above).  backtrack[0] is always 0, and when the middle
    column is column 0 the whole backtrack list is zeros.
    '''
    middle = len(w) // 2
    rows = len(v) + 1

    # Column 0 of the alignment matrix: only indels.
    current = [-i*sigma for i in range(rows)]
    backtrack = [0]*rows

    for j in range(1, middle+1):
        previous, current = current, [-j*sigma]
        for i in range(1, rows):
            scores = [previous[i-1] + scoring_matrix[v[i-1], w[j-1]],
                      previous[i] - sigma,
                      current[i-1] - sigma]
            best = max(scores)
            current.append(best)
            backtrack[i] = scores.index(best)

    return current, backtrack
def middle_edge(v, w, scoring_matrix, sigma):
    '''Returns the middle edge in the alignment graph of v and w.

    The result is a pair of nodes ((i, middle), next_node) with
    middle = len(w)//2, where (i, middle) is the best-scoring node of the
    middle column and next_node is the node the optimal path visits next.

    NOTE(review): for len(w) == 1 no padding is added below, so the reversed
    pass covers zero columns; callers in this repository only use this
    function when len(w) >= 2 -- confirm before relying on shorter inputs.
    '''
    middle = len(w) // 2

    # Scores from the source to the middle column; the backtrack is not needed.
    source_to_middle = middle_column_score(v, w, scoring_matrix, sigma)[0]

    # Scores from the middle column to the sink, computed on the reversed
    # strings.  When len(w) is odd the reversed pass must cover one column
    # more than len(w)//2; padding with '$' (which is never scored, as it lies
    # beyond the processed half) makes the integer division round up.
    padding = '$' if len(w) % 2 == 1 and len(w) > 1 else ''
    reversed_scores, reversed_backtrack = middle_column_score(
        v[::-1], w[::-1] + padding, scoring_matrix, sigma)

    # Flip both lists so that index i refers to row i of the original problem.
    middle_to_sink = reversed_scores[::-1]
    backtrack = reversed_backtrack[::-1]

    # Total score of the best path through each node of the middle column;
    # the first maximal row wins ties.
    scores = [head + tail for head, tail in zip(source_to_middle, middle_to_sink)]
    max_middle = scores.index(max(scores))

    if max_middle == len(scores) - 1:
        # Last row: the path can only continue horizontally.
        next_node = (max_middle, middle + 1)
    else:
        # Backtrack codes of the reversed problem: 0 = diagonal,
        # 1 = horizontal, 2 = vertical.
        next_node = [(max_middle + 1, middle + 1),
                     (max_middle, middle + 1),
                     (max_middle + 1, middle)][backtrack[max_middle]]

    return (max_middle, middle), next_node
def space_efficient_global_alignment(v, w, scoring_matrix, sigma):
    '''Return the global alignment of v and w using a linear space algorithm.

    The alignment is built by divide and conquer around the middle edge of
    each sub-rectangle.  Returns (score, v_aligned, w_aligned) with the score
    given as a string; the score is recomputed from the finished alignment
    using scoring_matrix and the indel penalty sigma.
    '''
    from Assignment_06E import global_alignment
    from Assignment_07E import middle_edge

    def linear_space_alignment2(top, bottom, left, right):
        '''Aligns v[top:bottom] with w[left:right]; returns [v_part, w_part].'''

        # Empty slice of w: everything left of v is matched against indels.
        if left == right:
            return [v[top:bottom], '-'*(bottom - top)]

        # Empty slice of v: the mirror case.
        if top == bottom:
            return ['-'*(right - left), w[left:right]]

        # A single row or column is handed to the quadratic-space aligner;
        # its first return value (the score) is dropped.
        if bottom - top == 1 or right - left == 1:
            return global_alignment(v[top:bottom], w[left:right], scoring_matrix, sigma)[1:]

        # Middle edge of the sub-rectangle, shifted back to global coordinates.
        mid_node, next_node = middle_edge(v[top:bottom], w[left:right], scoring_matrix, sigma)
        mid_row, mid_col = mid_node[0] + top, mid_node[1] + left
        next_row, next_col = next_node[0] + top, next_node[1] + left

        # Alignment column described by the middle edge itself.  The index is
        # taken modulo the string length so that an edge starting at the very
        # end of a string cannot raise an IndexError.
        v_symbol = v[mid_row % len(v)] if next_row - mid_row else '-'
        w_symbol = w[mid_col % len(w)] if next_col - mid_col else '-'

        # Solve both halves and glue them around the middle column.
        before = linear_space_alignment2(top, mid_row, left, mid_col)
        after = linear_space_alignment2(next_row, bottom, next_col, right)
        return [before[0] + v_symbol + after[0], before[1] + w_symbol + after[1]]

    v_aligned, w_aligned = linear_space_alignment2(0, len(v), 0, len(w))

    # Re-score the alignment column by column.
    score = 0
    for pair in zip(v_aligned, w_aligned):
        if '-' in pair:
            score += -sigma
        else:
            score += scoring_matrix[pair]

    return str(score), v_aligned, w_aligned
def multiple_alignment_3(v, w, u):
    '''Returns a multiple alignment of v, w and u that maximises full matches.

    A column scores 1 when all three strings contribute the same symbol and
    0 otherwise.  Returns (score, v_aligned, w_aligned, u_aligned) with the
    score as a string and '-' marking indels.
    '''
    # Each move states which of (i, j, k) steps back by one.  The list order
    # is the tie-break priority, and the same table drives both the scoring
    # and the traceback so the two can never disagree.
    moves = [(1, 1, 1),
             (1, 0, 0), (0, 1, 0), (0, 0, 1),
             (1, 1, 0), (1, 0, 1), (0, 1, 1)]

    rows, cols, layers = len(v) + 1, len(w) + 1, len(u) + 1
    S = [[[0]*layers for _ in range(cols)] for _ in range(rows)]
    backtrack = [[[0]*layers for _ in range(cols)] for _ in range(rows)]

    for i in range(1, rows):
        for j in range(1, cols):
            for k in range(1, layers):
                scores = [S[i-di][j-dj][k-dk] for di, dj, dk in moves]
                # Only the full diagonal can score, and only on a 3-way match.
                scores[0] += int(v[i-1] == w[j-1] == u[k-1])
                best = max(scores)
                S[i][j][k] = best
                backtrack[i][j][k] = scores.index(best)

    i, j, k = len(v), len(w), len(u)
    max_score = S[i][j][k]

    # Walk back until one string is exhausted, collecting columns in reverse.
    columns = []
    while i and j and k:
        di, dj, dk = moves[backtrack[i][j][k]]
        i -= di
        j -= dj
        k -= dk
        columns.append((v[i] if di else '-',
                        w[j] if dj else '-',
                        u[k] if dk else '-'))
    columns.reverse()

    # The unread prefixes are right-aligned against each other and padded
    # with leading indels so that all three rows get the same length.
    head = max(i, j, k)
    aligned = []
    for position, (text, unread) in enumerate(((v, i), (w, j), (u, k))):
        tail = ''.join(column[position] for column in columns)
        aligned.append('-'*(head - unread) + text[:unread] + tail)

    return str(max_score), aligned[0], aligned[1], aligned[2]
def greedy_sorting(permutation):
    '''A greedy algorithm to sort a signed permutation by reversals.

    Position by position, the element with absolute value i+1 is brought to
    index i by reversing (and negating) the segment between them; if it then
    carries a minus sign, a one-element reversal flips it.

    Returns the list of all intermediate permutations, one per reversal.  Its
    length is the approximate reversal distance.  The input is not modified.
    Raises ValueError if a required value is missing from the permutation.
    '''
    current = list(permutation)
    transformation_list = []

    i = 0
    while i < len(current):
        if current[i] == i + 1:
            i += 1
            continue

        # Locate i+1 irrespective of sign.  When current[i] == -(i+1) this is
        # index i itself, so the reversal below simply flips that one sign.
        j = [abs(value) for value in current].index(i + 1)

        # Reverse the segment i..j and negate every element in it.
        current = current[:i] + [-value for value in reversed(current[i:j+1])] + current[j+1:]
        transformation_list.append(current)

    return transformation_list
def breakpoint_count(permutation):
    '''Returns the number of breakpoints in a given permutation.

    The permutation is framed by 0 on the left and len(permutation)+1 on the
    right so that misplaced endpoints are counted as well.  Every adjacent
    pair whose difference (right minus left) is not exactly 1 is a breakpoint.
    '''
    framed = [0] + permutation + [len(permutation)+1]

    breakpoints = 0
    for left, right in zip(framed, framed[1:]):
        if right - left != 1:
            breakpoints += 1

    return breakpoints
def two_break_dist(P, Q):
    '''Returns the 2-Break Distance of Circular Chromosomes P and Q.

    P and Q are lists of circular chromosomes, each chromosome being a list
    of signed integers (synteny blocks).

    Theorem: d(P, Q) = blocks(P, Q) - cycles(P, Q), where cycles are counted
    in the breakpoint graph of P and Q.
    '''
    # Build the breakpoint graph as an undirected multigraph.  Every block x
    # is joined to the negation of its successor; the modulo wraps the last
    # element around to the first because chromosomes are circular.
    edges = {}
    for block in P + Q:
        size = len(block)
        for index, gene in enumerate(block):
            neighbour = -block[(index + 1) % size]
            edges.setdefault(gene, []).append(neighbour)
            # Store the reverse direction too, so that a walk can always
            # continue no matter from which side an edge is entered.
            edges.setdefault(neighbour, []).append(gene)

    # Count cycles by walking along edges and erasing them as they are used.
    cycles = 0
    while edges:
        cycles += 1
        current = next(iter(edges))
        while current in edges:
            neighbour = edges[current].pop(0)
            if not edges[current]:
                del edges[current]

            # Remove the complementary copy of the edge just traversed.
            edges[neighbour].remove(current)
            if not edges[neighbour]:
                del edges[neighbour]

            current = neighbour

    return sum(len(block) for block in P) - cycles
def shared_kmers(dna1, dna2, k):
    '''Returns the positions of k-mers shared by dna1 and dna2.

    Two k-mers count as shared when they are equal or when one equals the
    reverse complement of the other.  The result is a set of pairs (x, j):
    x is a start index in dna1 and j a start index in dna2.
    '''
    from scripts import ReverseComplementDNA as rev_comp

    # Map every k-mer of dna1, and its reverse complement, to the list of
    # start indices at which it occurs.
    positions = {}
    for start in range(len(dna1) - k + 1):
        kmer = dna1[start:start+k]
        positions.setdefault(kmer, []).append(start)
        positions.setdefault(rev_comp(kmer), []).append(start)

    # A set absorbs the duplicates produced by looking up both orientations.
    matches = set()
    for j in range(len(dna2) - k + 1):
        kmer = dna2[j:j+k]
        for candidate in (kmer, rev_comp(kmer)):
            for x in positions.get(candidate, ()):
                matches.add((x, j))

    return matches
def trie_edges(words):
    '''Returns the edges of a trie built from the given words in adjacency format.

    Each edge of the trie is a dictionary item ((parent, child), symbol) and
    is rendered as the string 'parent child symbol', e.g.
    ((1, 2), 'A') --> '1 2 A'.
    '''
    trie = Trie(words)

    formatted = []
    for nodes, symbol in trie.edges.items():
        formatted.append(' '.join(str(node) for node in nodes) + ' ' + symbol)

    return formatted
def trie_pattern_matching(word, patterns):
    '''Returns the starting indices in word at which some pattern occurs.

    A trie is built from all patterns and every suffix of word is tested
    against it.  Start positions that leave fewer characters than the
    shortest pattern are not examined, as nothing can match there.
    '''
    trie = Trie(patterns)
    shortest = min(len(pattern) for pattern in patterns)

    matches = []
    for start in range(len(word) - shortest + 1):
        if trie.prefix_in_trie(word[start:]) is True:
            matches.append(start)

    return matches
def suffix_tree_edges(word):
    '''Returns the edge substrings of the suffix tree for the given word.

    The tree is a generalized suffix tree (see the scripts folder), which
    terminates word number N with '$N' to tell the words apart.  For a plain
    suffix tree the terminator should read '$', so the last character of any
    label containing '$' is dropped ('$0' --> '$').
    '''
    tree = GeneralizedSuffixTree(word)

    labels = []
    for edge in tree.edges.values():
        label = tree.edge_substring(edge)
        if '$' in label:
            label = label[:-1]
        labels.append(label)

    return labels
def longest_common_substring(string_list):
    '''Returns the longest common substring among all strings in string_list.'''
    tree = GeneralizedSuffixTree(string_list)
    word_count = len(string_list)

    # Keep the nodes traversed by every word; per the original comments, the
    # substring spelled out up to such a node occurs in all of the words.
    shared = [index for index in range(len(tree.nodes))
              if len(tree.nodes[index].words) == word_count]

    # Depth corresponds to substring length, so the deepest shared node wins.
    deepest = max(shared, key=tree.node_depth)

    return tree.node_substring(deepest)
def shortest_nonshared_substring(string_list):
    '''Returns the shortest substring of the first word that occurs in no other word.'''
    tree = GeneralizedSuffixTree(string_list)

    def incoming_label(index):
        '''Substring on the edge leading from the node's parent into the node.'''
        return tree.edge_substring(tree.edges[tree.nodes[index].parent, index])

    # Keep nodes traversed by the first word only.  Nodes whose incoming edge
    # is just the terminator '$0' are skipped: they are trivially unique to
    # the first word, while their parent is still traversed by another word.
    candidates = []
    for index in range(len(tree.nodes)):
        if tree.nodes[index].words == {0} and incoming_label(index) != '$0':
            candidates.append(index)

    # One character of the incoming edge is enough to leave the shared part
    # of the tree, so a candidate's length is the depth of its parent plus 1.
    best = min(candidates, key=lambda index: tree.node_depth(tree.nodes[index].parent) + 1)
    parent = tree.nodes[best].parent

    return tree.node_substring(parent) + tree.edge_substring(tree.edges[parent, best])[0]
6 | 7 | Coursera Class Website: [https://www.coursera.org/course/bioinformatics](https://www.coursera.org/course/bioinformatics) 8 | 9 | Stepic Textbook Website: [https://beta.stepic.org/Bioinformatics-Algorithms-2/](https://beta.stepic.org/Bioinformatics-Algorithms-2/) 10 | 11 | ### Repository Structure 12 | --- 13 | ***Top Level Directory*** 14 | 15 | The top level directory contains problem solutions. The problems are organized by the week in which they are assigned and by their position in the given week's assignment. Specifically, Assignment_XXY denotes week XX, problem Y. 16 | 17 | ***Scripts Directory*** 18 | 19 | The scripts directory contains scripts for various processes which appear in multiple programming problems but are not solutions to actual problems. 20 | 21 | ***Data and Output Directories*** 22 | 23 | The data and output directories hold text files containing the data for each problem supplied by Stepic, and the associated output from my solutions. 24 | -------------------------------------------------------------------------------- /data/stepic_1a.txt: -------------------------------------------------------------------------------- 1 | 
GCCGGGCCCCGCAGGTTCCGCAGGTTCTTCCGCGTTCGCAGGTTCGTGGATACGCAGGTTCTCATCGGGGGCCGGGCCCTCATCGGGGCGCAGGTTCCGCAGGTTCTCATCGGGGGTGGATAGCCGGGCCCGTGGATATTCCGCGTTGCCGGGCCCCGCAGGTTCGTGGATAGCCGGGCCCTTCCGCGTTCGCAGGTTCGCCGGGCCCGTGGATATCATCGGGGTCATCGGGGGTGGATAGTGGATACGCAGGTTCTCATCGGGGGTGGATATTCCGCGTTCGCAGGTTCTTCCGCGTTTTCCGCGTTTTCCGCGTTTTCCGCGTTGCCGGGCCCTCATCGGGGTCATCGGGGGCCGGGCCCTCATCGGGGGTGGATAGTGGATATCATCGGGGCGCAGGTTCGTGGATATTCCGCGTTTTCCGCGTTCGCAGGTTCTTCCGCGTTGTGGATAGCCGGGCCCCGCAGGTTCTCATCGGGGTCATCGGGGTCATCGGGGCGCAGGTTCTTCCGCGTTGCCGGGCCCGTGGATACGCAGGTTCTTCCGCGTTCGCAGGTTCTCATCGGGGTCATCGGGGCGCAGGTTCGCCGGGCCCGCCGGGCCCTTCCGCGTTGTGGATATTCCGCGTTGTGGATAGTGGATATTCCGCGTTTCATCGGGGTTCCGCGTTCGCAGGTTCGTGGATATTCCGCGTTCGCAGGTTCGTGGATAGCCGGGCCCTTCCGCGTTGCCGGGCCCGCCGGGCCCTCATCGGGGGTGGATAGTGGATAGTGGATAGTGGATATTCCGCGTTGCCGGGCCCGTGGATAGTGGATACGCAGGTTCTTCCGCGTTTCATCGGGGGTGGATATCATCGGGGGCCGGGCCCGCCGGGCCCTTCCGCGTT 2 | 11 3 | -------------------------------------------------------------------------------- /data/stepic_1g.txt: -------------------------------------------------------------------------------- 1 | 
GTACAGACGGACAAAAATTGTACAGACGAGTGTTGCCCAAGTTAGAACATTCAACTGTACAGACGATTCAACTAAGTTAGAACAGTGTTGCCCATTCAACTAAGTTAGAACGACAAAAATTATTCAACTAAGTTAGAACATTCAACTGTACAGACGAGTGTTGCCCAGTGTTGCCCAGTGTTGCCCAGTGTTGCCCATTCAACTAGTGTTGCCCATTCAACTAGTGTTGCCCGACAAAAATTAGTGTTGCCCAGTGTTGCCCAGTGTTGCCCAAGTTAGAACAAGTTAGAACGACAAAAATTATTCAACTAGTGTTGCCCAGTGTTGCCCGTACAGACGGTACAGACGAAGTTAGAACAAGTTAGAACAAGTTAGAACGACAAAAATTAAGTTAGAACGACAAAAATTAAGTTAGAACGTACAGACGAAGTTAGAACAGTGTTGCCCAGTGTTGCCCGTACAGACGGTACAGACGAGTGTTGCCCGTACAGACGATTCAACTGACAAAAATTATTCAACTAGTGTTGCCCGTACAGACGGTACAGACGATTCAACTATTCAACTAGTGTTGCCCGTACAGACGGTACAGACGATTCAACTATTCAACTAAGTTAGAACAGTGTTGCCCGTACAGACGGTACAGACGATTCAACTATTCAACTAGTGTTGCCCATTCAACTAGTGTTGCCCGTACAGACGATTCAACTAGTGTTGCCCATTCAACTGTACAGACGGTACAGACGATTCAACTAGTGTTGCCCGTACAGACGATTCAACTAGTGTTGCCCGACAAAAATTAAGTTAGAACGACAAAAATTATTCAACTATTCAACTGACAAAAATTATTCAACTAAGTTAGAACATTCAACTGACAAAAATTAAGTTAGAACGACAAAAATTATTCAACTAAGTTAGAACAGTGTTGCCCATTCAACTATTCAACTATTCAACTGTACAGACG 2 | 10 2 3 | -------------------------------------------------------------------------------- /data/stepic_1h.txt: -------------------------------------------------------------------------------- 1 | 
CCTAGTGTCAGCGGAATTTCTGCTGCCTAGTGTCTCTGCTGCGCCGTCCAGCGGAATTGTTCTTAAAGCGGAATTGTTCTTAAAGCGGAATTCGCCGTCCCGCCGTCCTCTGCTGTCTGCTGGTTCTTAACGCCGTCCAGCGGAATTGTTCTTAAGTTCTTAATCTGCTGCCTAGTGTCAGCGGAATTTCTGCTGTCTGCTGAGCGGAATTCCTAGTGTCAGCGGAATTCGCCGTCCGTTCTTAAAGCGGAATTCGCCGTCCGTTCTTAAAGCGGAATTCCTAGTGTCCCTAGTGTCTCTGCTGGTTCTTAACGCCGTCCCCTAGTGTCCGCCGTCCTCTGCTGAGCGGAATTGTTCTTAATCTGCTGCGCCGTCCTCTGCTGTCTGCTGAGCGGAATTCCTAGTGTCGTTCTTAAAGCGGAATTTCTGCTGAGCGGAATTGTTCTTAAAGCGGAATTAGCGGAATTTCTGCTGCGCCGTCCGTTCTTAACGCCGTCCCCTAGTGTCTCTGCTGCGCCGTCCTCTGCTGAGCGGAATTCGCCGTCCCGCCGTCCAGCGGAATTGTTCTTAACGCCGTCCCGCCGTCCAGCGGAATTAGCGGAATTCCTAGTGTCGTTCTTAACCTAGTGTCTCTGCTGCGCCGTCCGTTCTTAACGCCGTCCCGCCGTCCCCTAGTGTCTCTGCTGAGCGGAATTCGCCGTCCAGCGGAATTAGCGGAATTCGCCGTCCGTTCTTAACGCCGTCCTCTGCTGTCTGCTGCGCCGTCCCCTAGTGTCGTTCTTAACCTAGTGTCTCTGCTGCGCCGTCCTCTGCTGGTTCTTAACGCCGTCCCCTAGTGTCCGCCGTCCCGCCGTCCTCTGCTGAGCGGAATT 2 | 9 2 3 | -------------------------------------------------------------------------------- /data/stepic_2c.txt: -------------------------------------------------------------------------------- 1 | RRNQKRGCLSQQCFL 2 | -------------------------------------------------------------------------------- /data/stepic_2d.txt: -------------------------------------------------------------------------------- 1 | 0 97 113 113 131 131 137 147 156 163 226 234 244 253 268 269 278 294 310 357 365 366 382 390 407 415 441 441 479 503 512 513 520 521 554 572 578 610 616 634 667 668 675 676 685 709 747 747 773 781 798 806 822 823 831 878 894 910 919 920 935 944 954 962 1025 1032 1041 1051 1057 1057 1075 1075 1091 1188 2 | -------------------------------------------------------------------------------- /data/stepic_2e.txt: -------------------------------------------------------------------------------- 1 | 459 2 | 0 71 71 71 71 71 97 99 113 113 114 114 115 115 128 129 129 142 147 147 156 163 168 184 184 184 185 185 199 214 226 227 228 243 244 246 255 255 256 270 275 276 278 297 298 299 310 312 324 341 341 346 357 358 361 369 369 
370 373 375 377 383 412 412 425 427 428 429 438 438 444 453 459 471 472 483 484 490 497 498 499 509 509 524 524 525 530 542 542 542 543 553 567 580 587 596 600 613 613 613 613 622 624 626 638 640 644 652 653 655 658 671 684 693 693 699 709 711 714 723 726 726 727 737 741 750 769 773 785 797 797 798 799 800 807 808 808 813 814 821 822 836 840 854 856 868 870 883 884 888 897 907 911 922 925 928 928 936 937 937 954 954 955 968 969 977 982 983 996 999 999 1001 1021 1025 1025 1051 1051 1051 1054 1066 1067 1068 1070 1072 1084 1091 1096 1096 1110 1112 1122 1124 1138 1138 1143 1150 1162 1164 1166 1167 1168 1180 1183 1183 1183 1209 1209 1213 1233 1235 1235 1238 1251 1252 1257 1265 1266 1279 1280 1280 1297 1297 1298 1306 1306 1309 1312 1323 1327 1337 1346 1350 1351 1364 1366 1378 1380 1394 1398 1412 1413 1420 1421 1426 1426 1427 1434 1435 1436 1437 1437 1449 1461 1465 1484 1493 1497 1507 1508 1508 1511 1520 1523 1525 1535 1541 1541 1550 1563 1576 1579 1581 1582 1590 1594 1596 1608 1610 1612 1621 1621 1621 1621 1634 1638 1647 1654 1667 1681 1691 1692 1692 1692 1704 1709 1710 1710 1725 1725 1735 1736 1737 1744 1750 1751 1762 1763 1775 1781 1790 1796 1796 1805 1806 1807 1809 1822 1822 1851 1857 1859 1861 1864 1865 1865 1873 1876 1877 1888 1893 1893 1910 1920 1922 1924 1935 1936 1937 1956 1958 1959 1964 1978 1979 1979 1988 1990 1991 2006 2007 2008 2020 2035 2049 2049 2050 2050 2050 2066 2071 2078 2087 2087 2092 2105 2105 2106 2119 2119 2120 2120 2121 2121 2135 2137 2163 2163 2163 2163 2163 2234 3 | -------------------------------------------------------------------------------- /data/stepic_2f.txt: -------------------------------------------------------------------------------- 1 | 225 456 363 584 756 731 878 97 866 981 650 503 397 753 884 331 356 687 372 228 115 638 113 471 850 228 250 735 128 765 625 868 216 834 519 706 0 103 753 294 462 147 834 525 622 510 559 131 609 422 343 275 246 618 359 853 478 147 2 | 
-------------------------------------------------------------------------------- /data/stepic_2g.txt: -------------------------------------------------------------------------------- 1 | 16 2 | 371 3 | 550 1624 113 220 529 1551 788 732 773 535 1492 1008 888 760 0 819 1145 1070 317 1232 253 1175 269 1346 259 1461 1038 691 657 1014 1192 625 829 1605 415 776 610 942 747 884 1648 1502 454 488 1241 472 1470 1436 422 932 616 916 278 1664 432 1630 1079 1029 156 1541 1289 1098 820 1598 1098 500 1211 941 1069 1389 1395 357 366 886 291 212 985 325 1485 1508 194 1646 469 357 875 115 584 569 1329 473 1501 276 1308 1404 1633 1404 1144 1761 1345 512 1483 163 453 338 1261 163 1339 163 1549 128 1598 300 692 749 845 1357 1136 97 732 663 1249 1567 57 1273 260 1126 1704 372 404 1226 663 1502 1664 529 387 810 1423 1104 1177 1520 113 128 1633 712 520 1288 1164 560 1648 617 1386 1001 375 137 875 1201 97 1012 1232 416 1444 1598 1151 988 241 131 1374 877 873 597 973 1307 1029 210 723 682 753 1049 951 635 886 1292 259 586 4 | -------------------------------------------------------------------------------- /data/stepic_3a.txt: -------------------------------------------------------------------------------- 1 | 5 1 2 | TCCGTCTTGCGGTAGCGCACCTCTG 3 | GGAGCCATATGGCAGGTTATGACAA 4 | CTGCATATGGTTCTACCGTCGGTAG 5 | TATGAGGAAGACAATGTCTTCACGG 6 | CGCGTACCCTTCCTAGGGTTGGTAG 7 | AACCAGGCAGAAGTCTGTCTTACTT 8 | -------------------------------------------------------------------------------- /data/stepic_3b.txt: -------------------------------------------------------------------------------- 1 | 6 2 | GTGCGTATTAAAATAGGAACGGATCATCTGATTAAACAGGGG 3 | GTCCGAAGATCGCTAATAGCCAAGCAGGAGAGTCTACTGGAC 4 | GAGCTACAGGGGTCAGCTCTGCGGGTAGTGCTCAGATAACGA 5 | GGGATCCAGGAGCAACTGTTCTCACTCCTCAGTGTCCACTCT 6 | ACGCATCCCGGGCAGGTGCACCATAGTGCGAGAAGCAATCAT 7 | CGGATCATTTACAAGTCCCAAGTCAGTTTCCAGGCGTCGTAG 8 | CGATTGGATTGGTACCCTGGCCCTGGTAACGAGGGTCAGGAG 9 | CAGGCGGCTGGATGTGTATACTATGGAAAACCAGGAAGCAGT 10 | 
GATGTTCCTAGACAGGCGTTTTACAAGTTCCGTTAATCTGTG 11 | ACCGTTTGTTGAAGATTGAGATTGACTAGACAGGAGTCAACC 12 | -------------------------------------------------------------------------------- /data/stepic_3c.txt: -------------------------------------------------------------------------------- 1 | TGTACGCGACCTCTCCGAGAGTAGTAATTGCTATCTACCTAGCTCCTCGGGCTCGGGTCATCAATAGCTGTCTTAAGCATCGTAGGGAAGACTACTACTTATGCTGGATATATTCACACATGGGATGGGTACACGTGGTAGACCGGTATAAGGGACCTATGGAATCGATGAAACATAGTAACAGCTGGGAACACGTGCCG 2 | 6 3 | A C G T 4 | 0.333 0.212 0.212 0.242 5 | 0.242 0.273 0.394 0.091 6 | 0.242 0.242 0.242 0.273 7 | 0.273 0.242 0.303 0.182 8 | 0.333 0.333 0.152 0.182 9 | 0.394 0.182 0.152 0.273 10 | -------------------------------------------------------------------------------- /data/stepic_3d.txt: -------------------------------------------------------------------------------- 1 | 12 25 2 | GAGCAGAACGTACATAAGACTATCGAAGAGTAATCGTATTTCATCAGTCTAGCAGCCTCTCTTCGGTTTGGACGGGCGTTTCCGCCCGGTTTCACCGGCCAAGGCTTACGATATGGTGCCATCATGGTCCATTTGTTGGGTTACAGTACAAGTGAG 3 | TCCTGAACTCGCGACCAAGCCTCATCTGTTATATTCACGTTCTAATGTTGCAGAGGGAGTCGTTGCGCGTGCTGTCTGGGGCGGGTATGTTGTGTAGTAAGAGGTAATGATTGAGGCACTGATTTACACTTCAAGCTGTCATGTGAGCTGAGAAGG 4 | ACCATAGATCAAGGAGCTATTACCGACCACGCCTAAAAGTAGCACGAAATTAGCCCAGTTTCGACGCCAGAAAGCAACTCAGGCGAAGCCAATCATCTGGAGAGTTATGAACAGCCAGTTCGATATTCGGTAAAGTTACTTGGCACTAGAGCACCT 5 | GGCCAAGACTCAAGAGGCCCTATGCAGGCTCCCCACCGTAATTGTTTATAAAATGCTATGTCATAGTGTACTGAGAATACGGCCTAGTCGAAGGCTATATAATAGCTCCAATCGCGAAACGATCGGAATCGGCGAAAGAATAGCAAAGCAACATGT 6 | GACTGATCCCCGGACTGATATTGAGCGTGTGACCCCGGCCATGGCTCATATGTTGATCTACCCCATTACGGCGTCAATCCTCTTTATTCGTCCCCACTCGTCGTCGGGCTGAGTTTAAAATAGCTTGATACCCGTGATCTCACACCTAGTGCAGCT 7 | GACCAAGGCTAACTTGAGTATTCTCAGTGCGTTGCACACTCAGAGACTAGATTGCACGCCGTGAGGGTGTTTCTGGTTGCTTATGCGCGCGTGGTGCATCGTACTATAACTCGACGAAAAAAAAAGTACTCGTTGAACAACTTACAGTCGGTTTAG 8 | GGTGCCAAGTTAAGTCCGGTTATGCGAGCTTAGGCGCGCTTGGCGTACGTCCACGCCTGAGCGGTTGTCGAGTATCTTTAGGGCACGGCGACTTGCATACATGTGCCAACATAAGGTGGACAAGCGCATGGTCACCCAAACTGGTGAGCAACCATG 9 | 
AAGACACCCCAGTATGTCTTTGCAGGTCGGGTGGGCATACCCGTTCTCGAACACCTCAACACAAATGTTAGCGAAAGAGAGTTGCCTTCGCTTTTCACCCGCCAGAAGGGTATGAAATGTGCTCGGGCGTAGGTCCAAGCCTGAAGCTAAATTCGG 10 | AACCCCGTCGTCGCAGCAGAATCAAGACGGTGCCATTGACAATTAGAAGAAAACACATTTCATTGTTTCCTCCAGATGGGTACCCCGTCAGGCCGAGAAACACCTGCCTCATGCTAAGTCGTCCATGACTCATCCCACCTTGCCTACGGCCTACAG 11 | GCCGCACCCCTGGTTTAAATCCGCGTAGACCATTTGAGGGATGGCCAAGCAAAGTCGGCGTTCCCTTGACTAAGGGTCTCTTGCCCCATGGACCTGGAGGCGGTCCCCACCAGAATTTATTTACCTTCTTTTTGGTGAAATGCAGACCAGGACTGA 12 | CGTTTAGTATCCTGAACCTGATAATTCTTTAGTTAGTTAGCCCGGTACGACCATGCCTCAATGGCTTATGAAGTCCCGCAGGTACGGATTTCGAGCTAAAACTCTCCGCTTTTGAAAAGGAAGACATTAAGCCCTTGTCAATGACGATTTCGCAAA 13 | GTCCACGCCTTAATAGAGGAACGGTTCGCCATTAGTGGCGGTCGGGAGAGCACCCTATTAGTTAGTCTTTTGCGCACATCGCAGCAACACAAATACTCCGGGGAGGATGCACCAGAGGGGTTTTATTTCGCCCCGGTATTGACTTACGACGGAATT 14 | AGGTTTCACAGTTGTTGTGCTCACGTCCAGGTCTAATGAGTTTATTGAACGCCGAGTTGGAACTTCATTGTCCTGCCCAATAGTTGCGTGCCTGCTATGGCCCTGTCGCATGCTCTCACAGTACCGTGGCTCGCTATAGGAGCTGCCTTAGGATTG 15 | TTTATGGTGCGGCCCTTGATACGTCGCCTCACTTATTAATTCTCGGGACCCGTAATTTGGACGGCCCCTGAAGCCATCATTGGCTAGTTGAGCGACAGGAAGTATTCAATCCACGTTGACCTCGCTTGGGTCCCGACCCATGGGGCCCAAGACTAA 16 | CCCGAGTAATGTTAAGACATAGCATCGTGGAGGACTACAAGACTGGTAGTTCCAGCTGAGTGTCCTATACAAGGCCACGGCTGAGTAGTACATATACCATCAGTGAGGTAGTGCAAGATTCTGCGCATTTGGAATAAGCAGTCTAAACGGACTAAT 17 | GAGACGCTTAGCGTCCAAGCCCCCGAGCGCACGGAATCTAAACTATCTCACGCGTGTTGCGGCCAAGTCTGAGCGCGTCTTAGGGATCTTGTGTTATGGTAGGCAGCGGCTATACGTATACGGTGCCGGAACCCGGGGGCCTAGCGCATAGTTAGA 18 | GCCCAAGACTCAACTCACTCTGGCTGAGCCCTACATGTGAGTGTAAATGAGGCCCAAGGTCCAACTGCTAGAGCTGACTTTAAATGCTCATAGTCAGATGCTCAGGAGTAGCGGATCTGCAATAGGAAGCACTCGAGCCGCATCGACTGACGGCTC 19 | AGAACATTTTGAAAGGCTGGTTAAGGTAATCTCTCGGGTCACGCTCCCGTAGAGGCAACTCGGGTAGCATTCATTTTAAGGTCATGCTGTACATGGGGCCAGGGCTAAATGGTTTTAGACTCTCATTCGATGCGTGCACGCGCCGCACTCAGTCGA 20 | GTATGGTCTCTAGACCATGTCTTACCTGCGCTTCGAAGCAGTTATGGACTTCGGGAAACAAGGACTCATTTCCGATAGTGTATTTTATCACACTGGAATGATTCAAGGGCCATCCCACGACGATTAAGCTATAGCGTACAGTCGAATTGGTGCATC 21 | 
CGTACACTAGCGACAGGCGGATGCTCCCTCCACCCATAGTCGATTGGGAAAGGCTAGCAGCTTCAGGTCCTTGTCCAGGTCTCAAGAACCATTCAAGCAATCGCAGCCATGCGCGCCGTTGTCCTAGAGCTGGGTCTTGTTTACATAGTGGGCATA 22 | TGACAAACCTTGTGAGGATATGGTTACCTTAGTTAATTGAGATATGAAGAGGCGGACCCAAGAGATCTGATACTCGAGCATTTTATTAACTCAGGGAAACTGGCCACAGCCCAGGCCTGAACGCCGTAGCTTAGTATTGGCTGGAACACTACATCT 23 | ACTACCGATGGCGGCATCTGGAACGAGCTAGGAGTACGTAGTCAGATGGCCCATGTCTCAATCTTATATCTTGTGATTTGACGTAAGCGCATGGCGACAGGCCATCGCTTAGCCCGTATTGCGACAAGACTGATGTTCCAGCCCCCGTGATCTTAG 24 | CATAAGTAACACTTGACAATAAACATCGGTAAGTGTCCCCTTCGGCGATCAGGTGGCATAAAAAACAACCAAGGCCACGCCTTAAGTCCTGTATATCCCTACTTCCCAGTATAATGCCATTAAACGGAGTCTAACTAACTCGTAGAAAGCGCGAGC 25 | CGATTGAAGGTTCGGATTTTAAAGATTGGTGCATCCGTAATAACCTACCACACGAAGGGGCTCCGTTGTGACTGCGTCCCCCTTGCAAGATTCATAAGCACGAAACCATCATCGCATATTTAAAAGCACGGCGTCCACGGCTTAGCGGGATTCATC 26 | ACGATAGTGTTCTGTAGAGTGACCATGTACATTTGCTCCGGGTGGGGCGAATCGCGACCTGGCAGATCTGCGATAATAAACAAGGCCCAAGTCTCAGTCCCGGTTTTCGGGTGTATGAATGTACAGGAACCAATAACCGTACAAACGTATGGGCCA 27 | -------------------------------------------------------------------------------- /data/stepic_3e.txt: -------------------------------------------------------------------------------- 1 | 12 25 2 | TGATTGAAATTTACTTACGACACAATGGTGTTTGGGGTAGATACGAGGAGGGAGTTTTGCGGCTTTTTCTTAACCGTGTGCTAGTGCTTCCGCCTTTTAAGTTACATGCCCGATGCAACGACCTCTTCTTTGTTTGCCCCCGCGGAGTATTGGCTG 3 | ACCGCGACTTATCACTCGTATCTAGGTGCAACAAGTGCTAATAAGTTGCTTGGAAAACACCATATGGTGCCAGATCCCGGGCTGGATGTTTAACGATTACAATCAGAACAAGAGCGGTCAAGCCCTTCTTTAATGCATCACGTGACTGATTATGTA 4 | ACTTAGCTGGAACCTGCTTGCTTAATGCAACTAGCATTTCGCTAGCCTAGAACATCGTACTCCCTTCTGGTAAGGGCTCAGGACGTGTTGAAGTGCACTTGTACCCTGAGCCCTTCATTCTTATGGCATACTCTGATAACGGCAGGCTGTCTGTGG 5 | AAACCACAGTTGAATGTATCTACAGGAATGATAACATACATGCCGGGACTGGACGCCTGTCCGTTATATATACCCGGAACAGGAACTGTACTTTACAGTCAGCTAGAATAGATGCGGGAGAACGCTTCGTTCTAGCGAACTCGCAATCCCAGGTAA 6 | CGTGCCGCCAAAGGAGTTTAGATGAATAAACGACTTGGTATTTGCGAAGAGCATACTTCGCCAGCGCTTGGCCGGGACTTGCCGCCAGGTCGCCATGGTACGTATAAGTGGAAACTACCGAACCCTTCATTATCCAGGAAGCATAACCACTCTTGA 7 | 
GAGCACCGCAAAACATAAAGTAGGATCACTTCGTTGACCTATACACCGGTCTTGTGCCTGGGGTGAAAATGGAGGCAAAAGGTGAGCTGCTCCTCCTGTTCGATCGCCTGCAAAAGAAATTCTGGGAGAGAGACGGGGAATTTTGAGAGGTACTTT 8 | CCCTGTGATTGCCTTCTAATCGGGCGCTGACCGATCTACAGTCTATGCCGCTGAACTCACCAAACTTTGCGACCGGCCTACTCGAATCGGTAGCCAGCGTAAGGTTAAATCCGTATCATAATCTCTTCATTCTGTCGACTTTTACTTCGTTTCACC 9 | TCTATGGTATACATCTCTTCTTTCACTCATACAAACGGTGGTGTGGGTCACTGCAGTTACCCGTTAAAGGAGAGTTAGCCGGCAGCTACAACCCTTCAACCACGATAAGACCATAGTGCACCGATGAAGCTAGTATGATGATATCCACACTGTGAG 10 | TTGCCTCGCACGATCGCTTCGTTCGTTGTCCAGAAACTATACGAATAGGAGGCCAACAATATCGTTTACTAAATATCTGGTTTACTGCCCCAGAAGGGCATGGAGATCGGAGACGGGGGGAACCTAAAGGGGCGATCAGAAGGCCAATTACAAAGT 11 | CTAAGTAGATTAGTGACTTTCAGCCAATTGATGGCAGCTACTCGATCACTGAAATGGTCATGGCCAAAGCTGCTAAAATCGTACTAAATCAGGTCTGACACCCATTGAATCACTTCATTTTGCCTCGCGAGCCTATTCCAGTAGGGAAGTCTTCAC 12 | ATTCCGGGCCAAGCTGTGTCATTTCATGAACGCGGGGAACCGACCAGTGGAAGTCCTACCCCATCTCCTGTATATCGCTCTCGTTTTGCTAGTACAAACCCTTCGTTAGGTTTTCGTCACAAGAGTCCGTTTCCAGATAACAAGGCGTACGCGCCT 13 | AGGGAACGAGTTGAATCCACTAGGGGCGGACTCCAAAACGCTTCATTTCATCATCATCCTTGGATTCGAACAAGCCAGAGAGCCGGGGGCTACGTCGAGAAAATTCTCTTTTAAGAACCTGGCCAGGTCTTGATGTGACGGGAACCATAAATGCTG 14 | TTTATCCGGCTGCATCCCGACTGCGCACTACACACCGCAACGTGGTAACCTGGACCACCAAGCAAAGTATGCCAACCGCGATCGCATAGTCCGGACGACTCGTGGCTGGAGTTGTAAGTCCTCTACCAAGTTACCACTTCTTTGTAAGGTGGTACC 15 | ATGATATTGCCTACGGCCCTGCGCGCATTGGATCTCTAAAGCCGAGAGGAGAGCAGCCTGTGAATTGTACGCATGGGGTTCTAGACCACTTCGTTGAGTTGACTCATCTCCGGGATGCATGTCACTTGGTATAACCACACTTATGCCTATGGTTGA 16 | AGACTATACCCTATCGCTTCGTTAAATAAAATCATCAAGTTTATTCGTTCCTCGTGGGAACCTGGTCAAAATTAACCATGCCTGAAGTCCAATTAGCGATTGGATATGTGGATGTCAGAATTTGCAGCTCTGATTGCACTGACTAAGATGCCGGAC 17 | GTACAAGAGAAGCCTAGCGCTTTTCCGCTCATTCCTACCCCTTCGTTCACAAAAAGCCACTACATGCCATCGGAGACTATCACTGAACTACTACTCTCTGATCCTATCGTGATATCCTGTAGTATCATATGGTTACCACGAATAGCGCTTCGACTA 18 | ACCCCTTCCTTGCGGTTCCTAGTGAGGGCTGCCGAACGCTCACTGAAGGAGTCAATAAGTAATGGGGGTTGAATACGACATGCACTACATAATGCTACCAACAAGGGGTGACCGAATGCTTTGCGCCACTGTGGGATAACAGCTACCTCCAGGTGC 19 | 
CCTTGATAGGTAAAGGCGTAACATATTACTCAGAAGAACGCTTCATTTCACGGAAATTTCACACTATTCTACAGTGTATCGGCTGATGTAGGAGGCCCCCGCCCCCTATCCCAGAACATGATGAAATGTATCTCCTCTGCGGCATCAAACTCTACC 20 | GCCAAAAGTACACATTTGATGAAAGGACCCCCGCTTGTCTTCTCATTCAACACTTCTTTCTATATCCGGGTAAAAAATCTCCAGTCCCAGGTACTCGGACCGTCGTGGCACGCTACCTCGTGCGTTACTGAAGGTCTGCAAACGCATTAACCCAAC 21 | GAAATCCGTTATTGGATTTGGTGAGTAATGATTGGGATCGCTTCGTTCTACTCCATAACACCATTCATAAAGGGCCGGCAAAGTTGGGTCAACTTCGGCCCGGTCACATCCGTAATCCATATACGGCGGGGAGGCGGACTAAATTAGATCGTTCCG 22 | ACGCCAAACGGTAACTCTTCTTTGGGTTGGAGTCTTGGTGGGGTATGTGGCGTCGGAGTTGTACTTTAATCCTACCCTGAAGTGGGACTTTCGCTGGCTGGTTGGCTGAGGTAAACAGGCATTGGACGTCTCTAATTAATCACTTATCCGGTGTAA 23 | GCTTGTAATTTTCCACGTCGGGTTTATTAGAATACGAGAGGACAGTGTCGGGGTGTGCTAATCTCTTCCTTATTAACGCCTTTTTTGCCCGTTCTATCCTGATACCACACAATTCAAAAACTGCTCGATCAAGACATACCTCATGTTATTCGTGCG 24 | CGCATGACAAACAAGTGTCACTGTGGAGTAGTGAATTTGCGTTCGTACACAATGCAAGCGACCGCTTCCTTGATGCCTCCCTAGGTACTTAGAAGTGGATTCATCCATTGGGACACATAAGTCAACCGATAGGAAACTTAGCTGAGGTGTACTTCT 25 | TGTCGGACTGAAATGAGAAAGCTAACTGATAGGGCGGGAGGCCGGTGCTCCAATGGATCTTCGAGCATTTTTAGCACTTCTTTGCGCACTAAGATAATCCCTATGTCCCACTACTAGATATTTACCCGGGTAACACCTGCGACTCCTAGCCTTAAA 26 | AGTCCGGTAGTGAAGCCGGAGTGTCCTCGCGACTTCGCCATAGTTCGACCATTACCTTTTTCTGCCGGCTAGCAGTTGTATGTAGCTGGCGTCTGGATACGTCCGTACGCTGCTGTTTTCAACCCTTCGTTCAGACTTATTGCAACAAAGGGTGCA 27 | -------------------------------------------------------------------------------- /data/stepic_3f.txt: -------------------------------------------------------------------------------- 1 | 15 20 2 | CGTCTTCTTACCGATTAAAAGGGCAACCCTCCAATGACACTCGCAATTACACAGGCGATAGACGCACCCTGATTCCATCCTCGAGCCTGATGTACGATCGATCCGTGTAGCATTTGCTAAGATCCCACCCACTATAGCGTTGAACATATCTGGAAAAATGGTCGGCGTCTTCTTACCGAT 3 | TAAAAGGGCAACCCTCCAATGACACTCGCAATTACACAGGCGATAGACGCACCCTGATTCCATCCTCGAGCCTGATGTACGATCGATCCGTGTAGCATTTGCTAAGATCCCACCCACTATAGCGTTGAACATATCCTTTGGGGTTTGGCCTGGAAAAATGGTCGGCGTCTTCTTACCGAT 4 | 
TGAGGTATTACGGTCGCAAAGCATCATCTCGGACGGTACGCAAAAAGAACCCGACCTTAGAATGACGTCGTCCAGTCATTCAGTGAAATTGATCGTACGGCTAGGCCTTCCTGGTAACGCCAGGCCCTGCACGGGTCGCAGGGGAGCCTTATAAGCCAACCAAGCCCTCAGATTATCTAA 5 | GGCGTCATAACGCGGGCGCCTAATTGGTGCCCGTCACCTGGGTGTCCCCACAGCTTATACTTCCTAAGTTGGCCTAGGTCGATAGCGGTTTTGGTGGTTTACAACCCTAAATCGCCAGAGCGAAGGAGACCTGCAGTCTATGGAAAGGAACTCGTTCTTTTACTTAAAGGCCCGGTAAGA 6 | GTCCGTCTAGCATCCTCGATACTTCGGCGCGATCTTTAACTGATAGTGCGGACATACGAGCTGGTTTGGCCTTTACGAATCAGCGAATAGCTGAATCCATTACTTATTTTTTTGCTAATCTCTATAACACACCGAACTGCAGATACATGGACTTAAGTCAACGCACCCCTTATGGCGAGT 7 | TCGCGATGGAACGCGTATACCGTTCCAGATTCGAAGACGTTGGCTATCGGCCCAGCAATCTGAGTTAGGGAGCTTCCACTTTTGGCCGTGTACACTTTTCGATTCTGCTGTGACGTGTAGCAGCGTGAATGCCTATATTCCTACTCTGGTGTCGTCATTCTTGTAGCCGGTCTAGCGGCG 8 | CAGTATGCGTTTTCCTGGTGCAAAGCTGTTACTCGTCGCCCGCTTCCTATCTTGGCCCTTCTAAATAGACGGCCTACCTGTACCCTCCAGCCTCGTCGGGGAACTTTGCTGGTTATAACTCACGAGTAATCGCCAGAACTTCCATCCGGAGACCTAAGAACCGTCTTGTCCAGCAATGCG 9 | CTTACCTCTGTTAGAAACGCGAAGCATATCCGGCCACCTGGAGACCTGGGGGAAACACACACGCGGATCTTTTGTAGGACTTCCTGGTTTGAGGAGACTTCGGTCTTCTGCCAAGTAGCGTACGATCCGTCTATGGGAGTGAGTGGGTCTACCGAACGAGATAGGCGTTGATCGCTATCA 10 | TGATTTGACCTAAGCTGTGTGGTTTGGCCGACAAGACTTGCACCACTTGCCAAAAGGTGGGCTCCAGGAGTCCACACAGATATGTGCGGTACAGGCGTAATGTGTACGAGCTGGGAAGAATACGTGACGGTCAATACCCCGCCGTCTTGCAGGCCATTTTCCCGCGGAGGTTCACCGAGC 11 | GCTCGGTCACCGGAAATGGTCTACTTGCGTGTCTGGCTAGCTGTACCCCTGGTTTGGCCATCACGTTAGTAAGATCACCCGCCAACCCCATTATAGGGGTTCCCTACTAATCCCAAGCTCGGATGTCTGTCGGCCTCAGCTTCCTCGCACGTAGACCTGTACTTCGTTCAAGCTTCAAAT 12 | TATCACGGGTGGGGTGTGACACAAGTCAGAGTCTGTACTCGCTGGGCTGCGCCCGATCTTTCACATTGAGTGGGCGCCTACGATTTTGGTACTTCTACGTTTGGCCGCGAGATCCAACTTGAAACGCGTTCTCTTACATAACGCTCGCACCGGTATCATATTTTGCTATCTGGTCCTGGA 13 | AGCTAATTGTAGCCATCCTGGTTATGTTATACAATCAGCTCTCTTCCTGGTTTCTACTAAAGATACATCCCGAGAGCGCTTGATGTGCCGCTCAAACGCCACATTCCGCGCTTGAACTAATACCGAATTAGCGAGCTCTATGGACGCAGGCATGCTACAAGGCAATGGCCTCAATGTCAC 14 | 
TTGATCGGCTTGAAGGTTTGGCCATTCACCCTTTCGAGAAGGCTCGAAATCGCCGGGCCGCGAATCTTAAGCAGGACTCGCACCCGAACGTGGGTCCGGGTATCTCATGCGGACGATTCTACCGACGTACGCCGAGGGCACATGGATGGACTCTCGAGATGAATACCTTTCAGGTAGGAC 15 | TATATGGCTGGGCCCCTGTCTACCGCTAAAGCATATGAATTCGGCATAGTGCTTCCTTCCTGGAGGGGCCAGGCGAGTTTCATTGTATAATTGCTAGTTGGGTCGCTATTGGTTACGATAGAGGTTGGTTACCTCGATTATGTTGCGCTACTAGCCTTCCGGCCGCGTTTACAGACATGA 16 | TAATAAGTTAGGCGATAGCCACCACGTAAACACGTGCCTCCGGATTGAGTTGGTAAGACCACCGCACCGCGACACCGGATCTTCATGTGGTGGGATCTCATATTCCTGGTTTGGGAGAGTGAGGAAGACTTGATAGACCGCAGAGTTCGACAATGAGCCACTAGTCTTAACAGACAGTTT 17 | TCGGGGTAATCCTTCGATGTTTGGCCGGACTGGGACGCCTTGATCCTGGTGCTCCGAATAGGCTCTGAGAGTGTATTATTTCTGTCTAGCTTGGAGAGACCTTGGCCGAATCGGTCCCCGCTCACCTTTTGGCATTGTATAGGGAGGGCTCTCACAGGGCTTCAGAGGGAATGGTCAGCT 18 | GAAAGAATGCGTGTAGGTTACGTCCCCGGTGAGGTTTTACTCACCGGGTGGGCGGAACTTGAGCATCCACAAACCACTATTGTCGGGACTCTTAAATAAGACTATGGCTTGGCTCGTTCGAAAGAGCTGGCCATTCGATAGTCTTCTTCCTGGTTGAACCATAGGGGCCCCCGTAATTAC 19 | CCTCCGACGTATGAAATGTTGTGGTCATAAGTGGTATTGATTTGAGGGAGAATGTCTTTAACGTAGGAAAGCTGCACACCTGTCGTAACGGTCTGTGTTGGGTAATACAGCTCATGACGTTGCTTTACTCGTAGCACGTGTTAGTCCTTCCTGCGATGGCCCTCATTAAAGCGTCTCCTT 20 | GGGCGAAGAAATCCTGGTTTGGCTCTCTCCCCTTTTCATGCACCGACTAGTGCCGGTCCGATTAGAATGCTAATAGGAAGCCGGGAACACCGAGCGTATCGAGCACACGACTTACGCGAGGTACGACTGAAATTTTCAGTTTCTCAGAAGGTACTCCCACGTGAGGTCCCTAGATCGGGA 21 | CGTTATCGCGTGCCGGCGCGTTATATATCGTCGAAGAAGCTTCCGACTTTGGCCGCCCTCTCGAGATATGAGCCCCATCAATCGTTACCCTAGTTAACGACGCCGAACATATAGTTCATCGGCGAACTAATGTTGTATGACAACTCAAGCACTTGGTGTTTTAGACGCAGCACATTGCCC 22 | -------------------------------------------------------------------------------- /data/stepic_3g.txt: -------------------------------------------------------------------------------- 1 | 15 20 2000 2 | 
AGCCATCATAAAGGCTGCCAACTTCAATGTCATTAAGCCCGAGGCCGCATACGGGGCCGGAGCGACCGGGAACCAATCCAGTAGTTGGCCCGCAGCCTCAGGAGGGACAGGCCTCATGCGTGACACTTGAGTAAAAGTTACACACTCGACTAAGGTTTACCGAGTTGAAATAACCGAGTGACAACTGCGCAACTCAATGCTAGTGGAAGGACCACCACCGGCAAGCTAGCCGTGCGACTTCCCTGAGCCGGCCCGTTATGCCCAACCAATTTTTTGCTGAGCACCTGAGGTGCACTCCGGCTAGGTGAAGAAGCCATCATAAAGGC 3 | TGCCAACTTCAATGTCATTAAGCCCGAGGCCGCATACGGGGCCGGAGCGACCGGGAACCAATCCAGTAGTTGGCCCGCAGCCTCAGGAGGGACAGGCCTCATGCGTGACACTTGAGTAAAAGTTACACACTCGACTAAGGTTTACCGAGTTGAAATAACCGAGTGACAACTGCGCAACTCAATGCTAGTGGAAGGACCACCACCATGATACGAAGTGAGGGCAAGCTAGCCGTGCGACTTCCCTGAGCCGGCCCGTTATGCCCAACCAATTTTTTGCTGAGCACCTGAGGTGCACTCCGGCTAGGTGAAGAAGCCATCATAAAGGC 4 | GACCGCTTGCGTCGAAGTGCCTCCTCGGACAACCCAATAACATAGACATGGCTTATCGCCCAGAAGCGGGGACCGTAACTACAGTTTAGCCCGTTGTCAATACAGTGTCAAACTCACATTAAAGGTTCAGGGGGTATAAAGCAGGGACTCTCACCCCATTTTACGCACCGGCCAGAGGACGTATTCTGGCGATGCCAGGGAAATCTGAAAGAATGTGGCCTCTACCACATGCAGAAAGTATCGATTTTGTGAGAGGTGTTAGCAACCTCAGGCTAGACAGTGGCCACGCAATCGAAATCAGATTTGATTGAGCTCCAGAGCGGGGA 5 | TATCCGGCGAATGACCAAACACCATCATACCCTAGAACGTCCGATCTCCCGTACCCGCACCGCGTTGAATGGTCCAGCACACTGTCCACCCGCTTGTCATGCTTAACAGGAGCTTCTAATGCGTCTTTGTGAGTGTGTATAGTTCCGACGTCTAGCCGGTTAGACCAGTACCCCTCGCGCCAATCAAACATTTCGGATGCCGTTGATTTGATATAGCCCGCTATGATGCCGCCTCTTGACCCTGCCCCGAGGCATCTACGTGATCGACTGTAAAAGAAATTCAGATGACCGCACTCCTGAGGTGGATTAGGCTATATAAATCAAGG 6 | TCCGACGGGAGCCTGTGTGTTACCGACTTCCCGTAAAGAGATAAGGTTCCGGGTGACAAACGTGAAGCACGAGAAGGACCAATTTGAACACCTTATCGAGTCTAGGGTGTCATGTGTGGGAGTGCCCGGTATCTGCGTATGCGTGCTAGTGAGGCGGTACTTTTCTTTCGTCTCGAACCCCGACTAGCTCGATGCACACTATGATAGGCGATACCACTTCTCCTTTGCATCACAACTGAAAAGGATTTCGATTTGGCTGTTTCCGCGGGGGTGAATCCGATTCGGATGTCACACATCCAGCAGTGGACCATGCTTGATTATGGCAC 7 | GGATAAATCCTAGCGGGTTACTGCGTCTTCGTTATAAAGCCTTTTGCCGAGCACGTTCGTGCAGCTGCTCTGGGGTTAGGCGCTGTCCTAGGCGGCGTTTAATTGTCTTTGCTAAAGAAACCGTAAGCGTTCAAGCGCTGTTGTCGATAGGGGGTGCGTGTGAACCTAAATATAGTTACACGATGCGTCGCTTTGAGATAAGCACTGGTTCCCCCGCGACCAGATGTAGGCTGGACACAAGATCCATTAAGATCAATGCTGATTTAAGGTACAGACATTTAATAGAGCACGGGACGTTAGTTCGTTTTGCTATGAGACAGGACGCT 8 | 
CCTTAAAATCTTCTAGACATAAAATCGAAATCATCCTTACGTCGACTAGCCATACGTTACCTGATCGTAAGGGGTTACCAAGTGCTTTATATATGACTGGCAGTACATCGCGAATGGTGCGAAGTGAGGTCTCTTACTCAGGCGTCGGGGAAGTTTCCAGGGAAAGTCGCCTGATCTGGGCCGCTTGTATCATTCGTTATTCCGTGGGCTGTAACAAGTAGGGCTTGACTTAGTGTCCGGCTGAAGGAATTAAAGGATCAGCGGGGATCTAAAGATTGTTGGGAGTGCCAGTTGAATGCTGTCCGGACGGTTGACTCTTAAGGAGG 9 | GGGACCAGCTGATGAGGAAAGCCTACGGGTCGATAAAACTTTTGAGTCGAGGATTCTTAGGATTCGGCAATTCGCACAGAAGTCCCAAACTTGGTACTGTCATCGCTTAGCTCGAGACACATACACTCTCATCTCCGTAGCCATGGATTTCTCCGAGAAGCCCCCTTAGCCTTGCGCCCCAGCCTGGTCATAACGTTACGGTGCGAATTTCGTCGTTTGGCAAGTATGCATATGCGTCGAGCAGAGCAACCTGGCTGTCCTAATGGCGGTCGGCATCCTATATTATTAGCGTGCTTGGGGCGGCCGGTAAAGAAAAGACCCGTACC 10 | ATTACTCCAGTGTCAACCACCGGAAAGTATTAGCTTCGTACCAACAACAAGTCGAAGTGAGTAATGGACCGACAACCGTTGCCGGGAACTTATGGCTCACCATCAGGGACCTCTGTTATAAGGCATCAGAACCCGAGCGGAATCGCGTGGGAGACTATATGTTAAAGAGATGTAATGACCGCGCGCAAACAGAAATGCAGGGTAAGGCCGTCCATGCGTCTGGACCCAGAATTTGGTGTGCCTCATCCTGTTTTACTTAATAGAGCATGGCTTTACCCTTGTGGGGTGTCCGGCCCGTGAGCCGCCTTTGGACTTTCTACAGCTTT 11 | GTCCAGTATCGTCGCAGAACTACTCTGGAGAATTTAAACAAGAATTGCACGGGTCAACACCTTTCCGCCGGGTAGGGCGAGGCCCAGACGACAACATATGTATGCTAAGAAGTGAGAGCGCGCGACTTTGTAAACCGTTGAAAATGCGAACAAAGCTTCATGTGGACATTACACGGTGCGACACAGAAAAAAGGCAGAGGAACGAGGCTCCTTTCTCTTAAAAACGAGGCATAACATTCCTGCCTCTCGTACTCAGGACTTGGGGCCACGCCTTCGGCAGCAAGATGACGCAGACAAACTGGTGTGACCGCTTAGGTGCACAGGAT 12 | GTATACTCGCTGACAAATATCCGCGGACCCACCTTTTCTCCAGTAGGTGGTAGTATGGCATGGTCCGATCTCCGAGTCCTAAAGACAACAGTTGGTGAGTCTCATCCATTGGACCCCGCGGTACCTTCCATCGTACAACACATGCGTCGAAGTATCAGAGGGAATTGAAGGGTATTAATAATCAGCGTCTAGTGCATCCCGCAGAACGATTATTAATCAAGTACGTCTATCGGGGGTCGTCGCCGTCTCTTGAGAATTTAGCTCTAATCTTTGGGGAGGTGTTTCCCAGGAACGAGCAGTCGATTAACTACGGCGTAAGTTCCAAA 13 | ATTGTTTTTTTAATGGTCTCGCGACAACGCGGTGCAAAAGCTGAATTTCGCGGCTCGGGTCCAGAGGTAAATTCCGACTATGGGTTTACTCAGACGCTCTGAGCTGATCATTGTTTTCTGGAATGAGAGGTGGAGCGTCCTTCCAAACTGCTGAGAGGTCTGTGTACGGGCTGCCTTGCTTAAAACAGCGTATGATTCTCAAGTTAACTTGCGCTTGGAACGTATTACACTGTCCTGCTGCCCCACATCAATAACTCGAAGTGAGGATACTTAGCCTGAGGGCAGCTCGGCCGGAGGCTAAGCGGCGCTCATGGACGTACGGGTTT 14 | 
TCTGTGCTCTGAAACGTTACCGAGGCATCGTCGTTTAAATGGCCTGAATTCCGTTAAGCCTTATTACCAACGACTTAATATTGTACCATATTTATTTAAACATAGGTTTATTTGTATGAGGATCATCTACCTTCAGGTCCGAAGAAAGGAAACTAATATATGGCTAAGTCACCAAGCACGTGCAATGATACACAATGCGAAAAAGTGAGTCAGCTATAGCACAAGACTCTAAATTACTTTTCAGTGGGGGTAAAATCGGTGACCGGCTTGAACAGGCGGACCACGTTTGCAAAGGCGGCAGAACAAGAAATGACCGGGGTCTTTAT 15 | GTGTGGACATTCCGACTGAGTACGAACCGGAGTTAACGGGGAGCCTATATATGACCGGCTATTCAGTGCGGGCCTTCATGTTATAACGCTGCATTTACGATTGACGTTGTAGTTTCAGTTTGCTCTGATTCTGAGTTGCACAATCAGGGCCTTTGCTAGTGGCGGAGAGCAAGATGCATGGTATTAGCCACCAAGGACGGTGTCTGCGCAAAGTTAGATGGCCTAACTCAGAACATAGCGTACTCATGTGGGCATAAGGAGCTATTAGACCGGATCGAGACTTCGTACATCGATTATATTTATGCGTGCTAGTGAGGCTCGGATAT 16 | TCGTCTAGCCCGCTACGTTGGCGTCCGGGTTAAATAGCTATGAATTTTTGCTAACCTCAGGGTCCCTTGAGCCGCTTAGTCGAAACGCAGGTATATGGACGGTGACCGCACCAACTGGGTGGCTCCCTTTTTAAAGGAGTCAATCTGCGTTGTCATAAACATGAACATCTCTGTGAGCGCAGGGAGGATTACCTTAATAGCCGAACAACCGACACGTCCAAAAATTTGAATTGATTTACCAACAATGGTAGGATGAGGTATAGATGCGTCGAAAGAAGGACTAGTTCCCCCCAAGACGCCTCACGACGCTAATGCCTCGGAGAGTT 17 | TCAATCGCTTCACGCAAGCAATGTGTTATGGGATCTCTAAACTATGCGGGCTATTTCTATGCGTAACAGATCGGAGTGTCGACTTTGATTTTCTGGGAGACATGACCGAGCTTATTACTGAACGCAATGCGGTTAAGTGAGGACCCTGGACGAAATCGCAACGACCATGGAGCAGAATCTTGTCTCGAGTATCCGTCACCGATGCCGCGTGACGCCCCGAGTTTCTGTATATGAACCCTGGTTAACGGTCCGGACAGGCTTGCCTTCTCTGCGACTATAGAGGACCCAGATTCAGAAGTACGGAAGAAGGTAACCTCCCGCTTAGG 18 | CGCAGAGAGTATAAGGCGGTAACACATCATTCCCATATGAACGGCATACTTTAATGGAACGTGAGTCGGACGTGCTTGTACTCCGGCATTCCGCATACGTTGACCTAATGGGGGTACGTGTCTGGCAGTCCGATTCCCAGCCGCGTCGAAGTGACGTTCCCTTGTACAGTCGAACACGACTGTAGATCACTGGTCACCCGTGGTCAGATGCCCCATGCAATGGATGCATAAATTTGCATGGCGCGAGTTAATGGTCCCTCGAGTAGACGGTTGCAACTGTCGGGATCCCTACTGACATCATTACTGGGATCCGCCACCCCAGGCAC 19 | AGGAGCGTGGATGGAGAGTGCGTAGTGTGTTTTTACCGGACTGGATGCCTCTGCCAACGTCGATGCGTCGAAGGTTGAATTTCATGCGCTATGAGCGGAAGTTGAGCAAGCTCTATTGCATTACTACAAGGCCTAACACTCAAACTTACAGGGGTCCGCAATTAAATTTTCAATCACTGCCCACCTCTCTGAAAACGGGACTCCCTCGAACATAGAAGTCCACGGGATTACGGCCCCGGGGTACATGGGGTGTTCCCTTTTTGGCGGACACGACTTAGTAGATGAAACAATCCAATTCCGAGGACAATTACACTGGAGTTCTGCAG 20 | 
GCCAACTGACGGATTTGCTCCGATGTTTTATGGGCGTTCGACCGGTCCCTCGTCGAAGTGAGCGCGGGCTGAATACCTATGTGTGGAAGTACGGTGTGGGGATTGAGATAGTGTGCTTAGTCCCACGATTTTTCAGTTTGTTGCCTGATGGACAACTCTTATGAGCACCCTTATAGCCATGATACCTATTACAACCGGTCTTAGCTACGCTGTGAGGTGCCGGTTATTGGCGTACTCTGCGACGTTTCACGCGGACGGGAGAATCCCCGTAACCCTTAAAGGTCCTGCATTAAGATCACCTTAGGTTTTCTTTACGTTCTACTGAA 21 | CCACGTCACAGTGCTGCTGTGGGAATTAAGATAAGTTTCGGCTTGAGCATGGATTAATCTTTGCACGCTGGTCTTAGCCTTAAGCGAGGTACGTCAAGCGTACTCGGCTGAGTAGCCGTCATCCTGAGCGTATCAGTCTGGCTGTCGGCTATTGCCCATGCGTTTCGTGGCCATTGGCCTCTTGATGCGCGTAGACGAATCACGAGCAGGACCGAAATTATCCTTCCTCCTCCCCGTCATCATCTTACATTGTGGTCAGCTATTTAGCGACATGCTATGAAGTGAGTGGATTGCCGTATTTAGGATTGCTAACCAAATGCTTCCGC 22 | -------------------------------------------------------------------------------- /data/stepic_4c.txt: -------------------------------------------------------------------------------- 1 | 12 2 | GACCCTTCGGCAGGATTCTAATAACTACTGACATATCAGATTCGGTTGCCTATCTAGCGTGAGCTCATATCCAATGCTCTACATCCAGCTTTTAAGCCGAAGGCAGGTCGCTTCTCCGCTTCCCTGAATTCACGGGCCGACACGATACGGTGACTAAAGTTTGGCCCGCCACCAAGTTCCCAAGGCTATCACCGAAAGAAAGCGGACGGTATACCTGAGATATAACTATCTTTAAATAGGTAATTAGCGCGAATTGATTTCCGCCCCGACTGTTTGTCTAGGGCGTGGTAGACTGCGTCATTAGTAGACAACTCCCGATGCAGGTTAAGAGGGACTCTCATACTAACCCCGGATACGACAGACGACCAAGCCCGCACTGGAGGGGTATGCGAGTATATGGTCGGACTTAGAGACTTTTGCGCTAGCATTAGTCGACGTATCAAGAAGCGTGACGTCATGATCATCTGACGTCGAGCGCGCTGATACCTGCTGAGTAGATCCCGCTCACTCCGCGGTTCTTCTCCGTGGCAAGTCCAACCACAATGTTCCGTTGGGTACGCGACTTACCGGACTCTCCGGGCTTTTAGGCCCTGGCAATTGCTAAGATACAATTGGGAATCGCCCCTTTAAGCCCAAGCTTTCCTTCGCTGACCGCGAATATTGAGCCGGTTGTACCATCTCTAGGAAGACCATCCTCACCGGACGCCTCTGTGTGTTAATTATCTCCCCAGGTGAGATGAATAAGTGCGGGTAGCCCAGTCCAACTTGAAAGCCCTATAAGATGGCGCGGTATGTTACAGATGTCCGAGGCGGGCCCCCCGTCCCACTTCTTGGGGGATAAGCCGTCGGATTTGTATTACTGATCTTTGTGTCTGCGGGGACGGCTGCGCTTCGGTTGCAAGGTCGCAGCAAGTGTTGTAGTGACCTCAGTTAAAGTTCATTGACCCTGCTCGCACGGGCAAACGGTTCGGCCGGGAATAGCTGGATCCGATGGGACATATACGGGTACCAAGGGTACGATTGAGGCAGCTAGCGGTTGCGTGTGGAATTATCCGCGTAGCAGCGGCACAACTATCTCATTGCAGTGGTGGGGCTAGGTCCGGAACCATAAGCTGTCATTCACTGAGATGAGCAGACGGTTA
ACCCAGGTCTTTGCGTGCTAAGCTATAGGCAGGTCTCTGACGAGGGTAGAGGAAGGTGCGTCCAAATATTCGCCCATTATCTTAGGTGCGTCTGATCGTGGGTATGTTCCTCAGTTGGAGCCGGATTCTACCCATGATACTGGTTTGAAACAGGATGGGGTCGACAAACACTCAGGTAAGACCACAGTAGAGTCGACTGTCGTCCTCACAAGAAATTCGCAAGTTATTACCCTGGCGAATTACAGATAGGTCCGAGAACTTATAAAGTTCGGCCGTTCACATGGCGGTTCAGACCCCTTTTCTTCGATCGAATGACGCTAGGAAGCATGTTGCACCCTTGAGGATGGAGGCGGCTCTTAGAGCTACATTCTAATCAACTCTCTTGTGACTTTGACTGACAGGCCAGAACTCCACTTACGCTGGCGCTAGCATTCAATTTAAGTACATTCTGTCTAGGAAGAGAGAACGTACTTTGAGGACAATAATGAACGTCCATGACGGACTAAGAGGAAGTAGATCGAACGCAGACGCGGTATGTCTGGCGCGGCGCGACTACATTACACCCGTTCAAGTGGAGGGTATCTTGGCGCCTAGAACTGAAATCGTGACTTTCATAACCCCTATTTGATTGAACCGAGGAATTCCGCATTCGGATCACCCGGGAGTAGAAAGCTTATATGGTTAGGGACGGTCAAAATAGAGTTAGAAGGCGATTGTTAGCCCAGCGTGTGGGACTGAACGCGAATGTGTGCGTAATCTGAGGAACGGCCAACTGGTTGAGGTGACTATGCCTGGTGTGCGAAATAACTTTGTCGGTAGGGAGAGTAGATTAAGATTTCTGC 3 | -------------------------------------------------------------------------------- /data/stepic_5c.txt: -------------------------------------------------------------------------------- 1 | 17 2 | -------------------------------------------------------------------------------- /data/stepic_6a.txt: -------------------------------------------------------------------------------- 1 | 16730 2 | 22,13,11,5,3,1 3 | -------------------------------------------------------------------------------- /data/stepic_6b.txt: -------------------------------------------------------------------------------- 1 | 17 2 | 11 3 | 1 2 1 4 2 1 4 2 3 4 2 1 4 | 4 3 3 4 2 4 1 1 1 3 1 1 5 | 4 2 2 3 4 3 2 0 2 2 0 0 6 | 3 1 0 1 0 2 0 3 4 4 1 0 7 | 1 3 4 2 2 4 1 2 2 2 0 4 8 | 0 0 3 4 1 0 0 2 4 1 0 1 9 | 3 2 1 1 1 1 4 3 3 4 0 4 10 | 4 4 0 2 0 1 2 3 0 0 3 3 11 | 4 4 3 0 3 2 3 2 0 0 2 0 12 | 2 3 1 4 2 0 2 0 1 1 4 4 13 | 1 0 1 1 2 1 1 2 2 3 0 4 14 | 2 0 3 2 2 2 3 1 1 0 4 3 15 | 2 0 2 4 2 3 3 3 1 2 4 4 16 | 3 1 4 1 3 1 1 2 3 2 4 3 17 | 2 4 2 2 1 4 1 4 3 1 1 4 18 | 1 1 2 2 2 0 1 3 4 0 0 3 19 | 2 4 0 3 1 4 3 4 4 3 1 0 20 | - 21 | 2 3 4 2 3 3 4 3 1 4 2 22 | 4 3 4 0 3 3 1 4 1 0 
2 23 | 2 4 4 3 3 1 2 3 2 0 3 24 | 0 4 0 0 2 1 1 2 2 0 4 25 | 2 0 0 1 2 4 1 4 1 3 1 26 | 3 0 2 3 4 0 2 1 0 4 0 27 | 4 3 1 3 3 2 4 1 2 0 4 28 | 3 4 2 1 2 0 4 3 0 3 3 29 | 0 2 4 0 4 4 1 3 4 1 1 30 | 3 3 3 3 1 3 1 2 2 2 3 31 | 2 1 3 1 4 0 4 4 3 1 0 32 | 2 0 4 4 3 3 2 1 4 3 2 33 | 2 2 4 0 0 2 4 3 3 2 0 34 | 2 3 1 3 4 4 4 0 3 4 4 35 | 0 2 3 1 0 4 3 3 2 0 0 36 | 3 3 2 2 0 0 1 0 3 2 1 37 | 1 3 1 3 3 1 4 0 0 3 1 38 | 2 4 3 4 2 3 3 2 0 4 4 39 | -------------------------------------------------------------------------------- /data/stepic_6c.txt: -------------------------------------------------------------------------------- 1 | GCATTATTGGTGACTTCTCTTACAACTCTGGCCACCGGGAAGATGGGCTATGTCAAGAGGCTTGCCTTGCACCTCCGAGGCCTGCTCGCCGGTATGCTTCGACAACGAAACCAGGACCGACCACAACGTAGCCCCCCCCGTCGTGCGTTCACAGTTACTCTAAATGTCAGACCGTTTCCGTGTGGTCCCTTAGGACTTGACCCTCGGAAAAATAAACTAATCATCGGCCTATGGTTGACTAGTCTCTGCGAATAATCATACAACTACTGGCCGTGCAGAACATTATCTATAGAACGTTAGCAGAGGATTTTAGTTTGCGTTTTCGAGTCGTGTTTTCAGGGATAACCCTTTCCCGGTTGTTGCACATAGGACCCCAATTCCAGTGGCTGTATGCATCCTTCCAGGATAGTAAGCTGCGTACGTTCCGCCGTGGCCGGGATGCCTATAGATTACACAGAAAGGCGTCAGTTCTTGAGACCGCACTGGATTCGACCCGCCGTCCACATTACGACAAAAGTTCACGAATCACCGTGTCGTGTATAAGGCCAGCTGGCGGTCACATCACGGGGGTATCAACAGCCCTCCCACCTACATAGAAGTGCGAGTATAGCGTGTGGTACATGTGTAGAGCATCCACATCGAATGAGCCCAATAGGTGCTGCCTACTCTAAATCTATCAGATAGGGACATGCCTACCACAGGGGATTTTTCCGACGTACTGTGAGATTTTACCAATGATCGAGCCGCCCCATTCTGCCTCGGCTGATAACCCTGTCTGCGCAGTGTTGCGCCCCACCATAGGAAGAATGTACCGGCTTCATTCCCGAACGGAACAGGGCACATTACGCTGCATTTATGCTATCGATTAAGAGTTTCTTTTCTATTAACATTACATGTTGACAGGTCCGAAAATCGGCCATCTGTTAGAACGCGCAGATGCCGAAGCTCATTTCACTGTACGGGGGCCCTCATGGCGATTTCCAGCTTACTATGAC 2 | 
AGTGAGGCCTCGCAGCTATGACGCAGACGCAGCCCAGACGTACGTCCAAGTCACTGAAGAGAATGTATAGTATTGTGAAGAAATCGACTGGGAAGTGCGGATGCGTAGTTCCGTAGGGATCAGCCCTTAAGAGGCACAACGGACGTCTGCGGGTGGTGGAAGGCTAGGGTGCATGGAGGATGGGGTAGGCCGTAGGACGTCCACGGATCTCCTGGACACCAAAGGAAACCGATCAAAATCTATCTAGATGAAGGCAATAAGTTGGTGAAGGACGGGTTCCGGTTATGGCACTACACAACCGGCCGGTCCATGCAATTTAGAACAACGTTTGAATAGCGAGTGGGGGAACTTAAGCGGCAACCAAAATCTAAATGTGACCGGACATGTCGTATGTTTCGGGCCCCTTTTCGACACTGAATAGGTGCAGGGTGCTACTCCTCTTATGGCTTAGACATGAGGAATCCAAGCATCCTGGCGCCGTAGTTTAGCCGCCTGGGACAAGGGTTTTTTCAAACCGTTTTCAGTATAGAGTGAGCGCCCTGCTTAGCTCATAACGTGCGATGAGGAGTATACAACTACAAGGCAGCAGGATGAGTCTAGGAAATAAGCTGCCTAGAAATCTACTGATGCGCGCACGCTGAATTCGTTGGGTAACCAATGCAACGTTTCCCAAACCAAGAACGCAGACGTGACTCTTTTTATCCGGTTGCCATCGCACCAGATAGCCCACCTGGTAGCTACAGGGAATCGCCGCAGTAATCAGAGCTATAAGATGATCGGCTGCCAGGGGGCTCGACGTTCTTTGAGAGATACAACGTGATCGAATTTAATTCTGTATGGAGTGATACCCTCGCCATAAACTCGCTTCAGCTGACCTATGATGCTACGGTCGACCTA 3 | -------------------------------------------------------------------------------- /data/stepic_6d.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 21 3 | 10->14:25 4 | 10->17:20 5 | 1->19:6 6 | 18->20:7 7 | 15->21:38 8 | 11->22:19 9 | 4->14:39 10 | 1->21:14 11 | 4->12:26 12 | 10->11:32 13 | 9->17:23 14 | 9->15:8 15 | 9->12:29 16 | 9->11:14 17 | 5->21:28 18 | 2->21:0 19 | 9->19:19 20 | 8->20:7 21 | 14->20:34 22 | 19->20:39 23 | 13->19:3 24 | 6->8:3 25 | 12->23:0 26 | 12->22:19 27 | 6->12:15 28 | 2->5:10 29 | 6->16:37 30 | 1->18:9 31 | 6->18:18 32 | 6->19:13 33 | 2->8:20 34 | 1->10:31 35 | 5->18:28 36 | 0->21:33 37 | 20->21:6 38 | 5->13:17 39 | 5->17:23 40 | 5->15:3 41 | 3->14:11 42 | 14->15:2 43 | 5->9:33 44 | 14->17:7 45 | 1->3:8 46 | 13->16:30 47 | 13->15:15 48 | 1->6:10 49 | 1->4:13 50 | 3->20:32 51 | 3->21:15 52 | 18->23:13 53 | 18->22:10 54 | 4->5:0 55 | 6->20:33 56 | 5->6:38 57 | 0->1:7 58 | 0->6:19 59 | 12->17:23 60 | 12->15:10 61 | 7->15:35 62 | 13->22:4 63 | 13->20:33 64 | 5->8:13 65 | 11->12:33 66 | 11->15:6 67 | 11->14:0 68 | 
11->16:0 69 | 11->19:20 70 | 11->18:39 71 | 3->12:9 72 | 3->15:37 73 | 7->21:21 74 | 3->16:19 75 | 9->20:12 76 | 7->8:31 77 | 10->20:5 78 | 10->23:22 79 | 13->18:9 80 | 3->9:0 81 | 14->18:22 82 | 14->19:23 83 | 3->5:28 84 | 16->18:31 85 | 5->23:36 86 | 9->23:31 87 | 8->18:25 88 | 8->13:35 89 | 2->14:35 90 | 7->17:4 91 | 7->16:18 92 | 2->11:13 93 | 7->18:6 94 | -------------------------------------------------------------------------------- /data/stepic_6e.txt: -------------------------------------------------------------------------------- 1 | IWWRDFMAEFMWQNSGSRAMCFFNRIVCWNARALNYKNCSLQVKERLAAKRCYATHPEAITDGGIFAECEQTNYDFREHKSIFMFCPTWYGEQNHEVLGRHDHCHEMCHTCKCFRWRCHSSAKKLGCFPWYIPKFHYIKPMVVYHHMLTIHYKIPSQNKSDDALIGNAKIVMHLTGCQYNNYPTYSFEMPPCDTWPDAQVVKVTMFPIFIAFQSKTWWSMILSSSSGYLPYLMNCPNNLVWQVPLRCCYCCGNQNLEKPMQHSCGKPAAPDIRSRSQDFHWNRLQRVQEDIVSIEFKWSEELIEQWTFWVFYNLAHMGYAPDGYVTEFIWHGTYCIKFYKLGKQRQWKSQQHYCTMWAVRRVRWRIVVHYETKRRENYWIKFPPMCGERTWHRYCREKRVEGLVNFEIHQWQNQPLFKYMFHRCKMMINCREDGPSHTSQKNQHNLHFDMRNYSTQGMYDFTNITAKLPYISCTYHKMLWPMPARQMTHVGRESYKWSHCRMRGNTSWPQYSEHRFYFPPSMQWWMTIKNMLCATNRKKEVFNHDSVDKFRDVSPHMPYDIIQEQWGGPFMYDTMEFEVLTQMDIFYMYDAYYSRCCSMFKTFALWNRSKYMQMDWNNNNQTIPIAKWASHMYWQELLDVTMRACCDKYWSTYIMIKANYVEPRMGVLANVGPRHQHWAHCKHNVAIWQSFTHRMAGSMAKEEPWFACLYLIREVIICGETCDLLGCVQMQWKDWCRGQGKKYGMVMGKMIEALKFFFTLFVGVVWQQSCQEEQYGDMVEEYQIVMWHDSWLTILQYKQIMRQWGKQPMRSMYPAPFERYLWCCFNFEWDVQIVDYVLIRTFCPLVNQMWIETCNIQLQASDHNHCCDCCQATMLMDTHHPHDYNKDDQEPCMCDHCEV 2 | 
IWWMWQNSGSRAMCFFNSDGDIIVCWNAGAWMMSGTSVGNKRCYATHPEHEMFATDGGIFAECEQTNYDFREHKSIFMFCPWMVVNWYGEQNHEVLGRHDHCHEMCMCKCFRWCCHSSAKKLGCFPPYIPIFHYIKPHHMLMDFGNIHYMIPSDLIGNAHIVMQYPTMPPCTTQNMIMPDAQVVKVTMFPIFIGPIQAEYVFQGNLPYMNHRFQVEGPNNLVWQVTTVELGQYCCNWNQNLIKPMQHSCGKPAAPDIRSRSLQRVQFAIVIIEFKQSEELEGMAPDGYVTDTQIFWSEFIWHGTYCIKFGSQQHDCTMWAQRRVRWRIVVMRYPYETKARENYWIKFPPICGERTGWWHRSCREKRGLYTWQFEIHGMFHSPASSAKVNCKMMINCGDIAPCLREDGPSHTSQKNPHNLHFCMRNFTNITMHKLMVMRLDYISCWYRHMLHVGRENYKWSKCRMRGLTSLEEQNPQYSEHFLLFLMQQLLSDIYFPPSGMHTIKNMLCATNRKKAVFNHVAMEWDCSVDKDRCVAHDGGAPCKPYDIIQEMYDTMVFEVWMDIFYMYDAYYSRCCSFHQFVTRSKCAWKWASHMYWQELWTAIYEKGNTPANKEPRMGVLANVGPRHQHWGHCKHNVAQSFTHRMAGSMAKEEFDTWSFETVDLLGCVQMFWKDWCRGQGKKYGMVMGGSYINMLEALKWPYYWIQQMKFTLFVGVFLIRFDRWQQSFQEEVYGDMVEEYQIVMWHDSWLTILQYKQIMRQWGKSMYPAYQNLEMYTWEEMVRSAEDCFNFEWDYPSWPMSERTFCPLDTAVNHMWIETHGNIQLQASDGNHCCDCCQATMLMDTHHPHDYNKDDQEPCMCDHCEV 3 | -------------------------------------------------------------------------------- /data/stepic_6f.txt: -------------------------------------------------------------------------------- 1 | HDDVNYHHWLSRTYMHLWYFNFYTRPMTMSDKFAPAFNRYCNDVLYMRWNQICNPPHFFKFNGRCRTMTKSDWTGAVVQMSQTMMDKVTTPKFATVVCFGGQLGKIWEPENWVIIKPVDLTTRCPWCLRNEDIDTWCLAELKMIETGIGSPLAICRSSGVPCLYQWNLMALIPDMPPRGRGMGEKTPTGMNVLAHGHFQHCYYDYNRQSTGLVFPRTFEWSNRFHHTVQQDMQFLWICKVDADDETERLVYGGKPCPRKDIEYVDPMQNQASLYEWVWEFGDLEATTRYPPITCMKAEIFSIAGLWFSSSRARQYDHDYNMGTFTSIQSQGACKICAARTHWCCGTGTPFKLASDTLFQRKAAVMGFETCKIYFAPWRNIQMATVGSYEIHHFYTYNAESYPQCGTHRTGTTEWSDNPSGNFIVLTYCHPENYFMSYYDLKGAMASECICKETKEPHSLWIHYANNFHFPQCCLVNHSRVGQCERKCVKNPLIFWREVMKGTHPGCQNLCMGVVGQVTETRCHFNNFHKETRSPKALNQPMKPPAEYSRKTVCDIASILIVKLSPFEQFEMQSPQQCHTVHASWKGQVGWMRWWGMMIQGIKPQTHAPYEHFSDFTHEPATARDELCETRWIEYVRKALFTCHEGRHQYHWLTHPVVQMQQGFCESNDQHPYDFLFKHGFDQHWHWMIGLCPVPRGLQPWTMRAKYMPMLPQFLTNAKGDYTHGFSFIDHCQMICMMCMQTDMQQQQASTHSPQQDHYSNTKMFKASCMGQEPQEMCWNRVAGIRWPEFDAPDWADQFPKFVGTPKCNLDALYYQDGQHKDEREFVDWAQFQQGTCAEYWVPAKFEHWLDEDGQAKSKFFTQYMYENQVFTPMERWNVKWCSGHWAHRAEQDRDTKMHSI 2 | 
FGPEFAHQWAQGYDYEHCCSPTQENNDYFIRCAALIIEMVFYDKTFSRIDMNRTDEHHDTPWHFPHKCNYQFPERPIPDTIDNTVNENWFWPDMNNGKCSSTHPGVLEGEKDMVHLVDNGTNNLGSFWVFCDTNWYYWHAWPNTPNGLPNPTMDLRMNLKPCAQPPAPESHHQETCFCYHCSYVTEIQEPDVHGHYVSKSTKDGEIWCKVQCWTDLACWFACEKMNKLYLIWYGPRDFITIDPCYPDECQNVKERTDKPEWNSPAWNNNHPFEAGITGSFWNHATCDFGDGGSSMFHDYNMGTFTDNDFGVDIQQWACKIGALLTHWCMGTGKLASDTSFQRKAAVDGEVGTIYFAPWRNNQMATERYGSYEIHHWYTYNAESYPNTTVMKEVYEWCVMAHRTGTTESAHLHSDNPSGLFIVRTYEKACACHYYDLKGAMASECICKETHRFDQMEPSSLWIHYAYDFQTNDFPQCCLVGHSRVGQCERKCVKNPLIFNSVMKGTHPGCQNLCMGTVGQCPETRCHFNNFHKETWSCMDGQTSAKALNGPMKPPAEYSRKTICDIASILIIKLSPFEQFEMQVHASWPGQVCFIDMYVMMIQIKPQTFSDAMWIEYVRKALFTKHEGRHCRYGRYMPYHWLTCWNDVVQMQYGFCENWMLTWVHMKYKENHPCVALHDKPHQHSHMKTVIFKEKMAMMNPMDQELRYTSQWYKRARCDQQVKRRFIKNMAWLHLDTQEIIPTVRCFKNVVPLYFWTFFCFFVAASFFKIFKCWTPWTYQWPVTCSWMNYDGRILPKGHAQAKEKQSTSDWETFKRKQPWYDQSYNQYEAWPRCNFGWPWASHRTAHFEKHTSVFIYMPPACCSLMMPKPHCCKFPKMQGQCEHMFVRPAFHQWWNKKSLELPKEEEDKMHAIVYANYKYPQIRWNYQLYIVMCYS 3 | -------------------------------------------------------------------------------- /data/stepic_7a.txt: -------------------------------------------------------------------------------- 1 | MWCPGWPAVANTRMCSLIVPYGMHGMQSMLRMTPTVWHKNGCMHWETTTAAWAMVWSMRKFWLCWFSFKSQLLREMCFLQHMKPIARYQKNNVMHIIIPISSWEWLTWAFYGGSCSMMGIDSRGMNPEIIILRTPSTIVPGNSWITCFYKQSCSKFWMMQGGGFARRMGVDVHEDSMLTNMIKKMKFGIQFSEFFEGLHLDFNVDDRRTIWVLSICWQIFHVAMWKFIFFPNHEIDDELTCQVTLTFEQRVRTSDTQPKMWLVHDAMRGDNRAMKSGNEAPNKHKQSMYQQETRDRDAWMVHPNYRTDQHSFLQFRAPKVDQMLGWINQIYDKSEFCGKVVEPCLMKHDLPDRWFSPPECYVKKDMIFSICQLERHLQRDYTEGHDNDRPPPSRAQPCYNPQAICNASEDRPTKINPCGFGDQDKTNFVTGITFDETVKGHTGRMWGRQAWLISHKQSGERLFCQQEWFWLQMSMVDAFQGICRMKQFKNQVCDFIAEETALTIWQRRDSLCTLADPLNMDLLRTNMPRKWQCSVGTKGHCFSYPHCSMGWDHEHMYPQAIGLDRPVMHTPFGDCIMRFVACIYHTSSYPDNTSRQLLDQVSGQANFDSDRVPVEWGYDNGLDLTQFNACFKGFFLWKFLRENCTCIWREPKLQNWVFFYRFDAALHAWIADYYSMYCNCLEFRQEIGCFVFDCSMDREYEMYKMFAICNYERDCCMGKQYNYCGNGGWNDDEFTCHKISNKFNDELRRADKNPSFWKSFLEACALYSMVLICFWNERECMRMINWLQLPECNGPSDTCDWNSYYENYTFAKTFDEDPS 2 | 
MWCWWGFSLPNGWPAKRMIEANTRVPYGMHGMQSKEAMVHKNLCMHWETTTAAWAMVWSMRKFWLCWKSQLLREMCFLWHMKIIARYQNNVMHIIRYLIPISSFDEAANKQMRDPYSCSMMGIDSRGMNLEIIILRTPSTIVPWNSWITCFGKQSCSKFWMMCLDAPDGMAKDWCYGVSMLTNMIYKQASDAQSNADFRVDDRTHWMIFHVQEEGLTYGWKFCHGFDRYCNIIMDTDEITLTFEQRVRTSDTCHRIWWHQRGDNRARKSGNEAPNKHKQSMYQQETRDRDAWMVHPYRTDQHSFLQFRAPKVDQMFFWINQIEFCVWICLMKWFSPPECYVMLKLASSCGQLERHRDYTEGVDNDRPYIANCTMQNPSRAQPVYASEDGCGFGDQDKTNFVTGITDDETCKAQRFQAWLTSHKVSGERLFCQCNFSNQEWFWLQMSMVDAFQGICRTKQFKQQVCDFWYHRRDSLCTLADPLNPFETTKDGRVKVGAHKIACSVGTDGHCFSYPHCSMGVDHEHMYPQAIGLDRPVMHTPFGEAVARGEACIMLDNFVLCEYRSSDYPDNMSRQLLDQVSGQHLWNMWFDSCRVDVRDVGMWGYDAYMTGLDLTDWTKGIDNQQKLWCFAGNFLWKFLRENCTLNMQIWREAERAALQNFFYRWCSMTKIYYSMYCNCLEFRQEIGFDCSMDYEYEMYKMFAICCMGNYCGNGGDWDDEFKCCDNFNDELMGPTFYMDYSFWKSFLEACALYAAKHFEDMVLICFWNERMADRTALVNWLQLTMDITAEGDWCDWVSYYENYTFAKTFDEDPD 3 | -------------------------------------------------------------------------------- /data/stepic_7b.txt: -------------------------------------------------------------------------------- 1 | GAGAGTTCGCGAACTCACCTTGATGGGCAGGTGTATTGACGGCTCCGATCACACAGGCTGTGGCCAGCCCGTAATTGCCCCCTAGATGGTTGGGTAGCAGCGCTTCTGAACTACCGTCATACATTGCCTTTATGTAACTTACACGAACGAAAACCATCCGACGATGCGGTAACGATATCACCGTGTTCACACCAGATGCGCGCTTAAGTACATGAGGGGGTTACAATATATCTTAGCAACCAGTCATAATAATTCGCCCTAAGATTCGGTTAGGATAATACGGATCAGCAGATGTACGTTTGGGGTACGGTTGTTATCAGAATTGGACCTTTACCTTTAGCGTGATTCGCCCCGTAGCGTAAGGCAATCTCGGGGATCAGAACCTTGGTGGGACTTCCAGACCGTTAGACCATACAGAAGAAATCCCGTGGCGATGCACTTCTGATTACCACTGAGTTGGTCAGGGGAGCCTTAGCCGCTTCCCTCGAGCACACATTAGAGATTCCTCCCTAGGGGTCCTGATAGAGTATGGAGGGGGAGGCAGGATATACTCCCAACCTCGAGTTTAAAATGTGCCGCGTTCATCGATACCGCTCCTCTTACTAATACTAATTGGGGAAGTAGGCTTTAGGTTGTATACTCGCACTGCACGCGGTGCGAACCTACCCCTGGTCAGTACCACTGGGGCAATCGAGTATGCAATGCTGGCCTCAATATCGTGCGGTCAATTCGGTCCAGATACCCGATTCGCTAGGGGGTGTTAAGACTCATCCTGAATTACTTTTATTGCACTAGCCGGTGCTAGTCTAATACTGCCTTTCGGGTCTCCACCTTTGCTAGGTGAGCCGGCAGTGAAAAACTGACAGCCTTCTAGCGCCCCCTAACGGACAAATTATGCACTGAAGGTGCATCAATCAGTTGTGCTTCACATCTTCCCTCGAGTCGTAATTTGACAGCTAGCT 2 | 
TCTCAATTGGGCTCTCTTATTTTCCTTATCGGGACATCCTAGATTCAGGCTCTGGTGTGCCAGGTACCCATTGAGCTGGT 3 | -------------------------------------------------------------------------------- /data/stepic_7c.txt: -------------------------------------------------------------------------------- 1 | ACCTGTGCACTGTGACACAGGTCTCCGTAGGCTCTCACTTACCACAGTGAACAACTTAACCTTAATCGTCGAATGTCATTCCAAGGATGTCGGGTAAACTCGAGCTCACGAACAACGGTGACAATTGTCGGAAGGCAGTAAACAGCCCGATGTCGCAGTTGGTATACTGCCTCAGTAGAAGGCCATTCGTCCGTTTTGCTGCGCGGGTCTTGAGCTGACGGCATAACTAATCGTCGTCTGGAAGTTTTCTCGTTCATCTCCTGTCCAACCGACTAGACATAAAGTGAAGATACTTGGGGAGAAACGCACGATTCCACGCGGCGGCCGGCAAATTACACGTAAGGCTCCTACCCGTTAGGGAAGTGACCTTAATATCGCGATTGACACGATCCGACTGATGAAACCCTTAGTTGCAGTATCTGGATAGACGTTCTAAGGCAAACCCTGTACCATACGGGACGACATACCTGCGACGCATAAGACCCGATCTGAAACGTGTGCGTGGTCTTGTCTGGGGAGAACCTGTGAATTAATTAGGTATTCGGTGTCTTGCCTCGCTCCAAAATGCACTCTGGTGTTACGCATGGATAATGCGCTTATTCCTAGCATAATTTATAAAGGATATGGGCTACGAACAACCGCGACCCATGCGGGATTGAAGCTCAACTACATCCCCATAATGTTACCGGACGGAAACAGGGAAAACGAAGATGGACATTGACGGTATCTCCATGGGCTAAGAATCGATGCCAGGGAGACTCGATCTCTACTCCACATCGTATTAAGCCTAAATGTATAATCAGGTTACACAGCGGAGCGCAATAACCCGTGGCTGCCAAGGCAGTCAGTGAGTCTAGGAATGTCAGATCGAACTATTATAAGGATGGTCCGGCGTGGTCGCGCTATCGGCCACCATCTTATTTGTCCACCTTGC 2 | 
AAAATCGAAGATGGCATCGACGTGTACTCCGTGAGCTAGGTATCGCCGCCAGGGGAACTCGATCCTCTACTTTTAAGATCATATTACAGCCTAAAATTTATAATCGGGTTACACAGCGTGAGAATAACTCGTTGCTGCTAAGGCACCAGTATTCGGTATCATTAAAATGAATAACTCATAAGGGATGGTCCGACGCTGGTCGCGCCACCGGCACCATCCTTCACCCCTTTACATGCGATGCGTGGGTGGGTAACCGCGCAAATGACACGTACTGCTTTCTGGTATTGCTAGCGGCCAAGATTTACCGTAGGTTTGCTGTATCCGCTACTGTAAATTGTCCGATCCAGAAGTGGGGCATCATACCGAGGTAATAAGGGTCATCTACCGGTCGAGCTGTTTGAAACACCGGACGGCTACTGACTGGCACCGTTTAGTGTCTCAAGCTTACATTTCTGAGGAAAAAAACGAACGGTATCCGTAAGGGACACAATGAAAATCGCTCCTACCTAATCATGATCAGTCTTCGCGCGTACGCAGTATAGCTGAAATCCTACCTCTGTACACTACAGAACATGGATGTAGACACTTGACCACACGTAAGACCGCTCCGTAAGATATAAAACTACATCTCGTACCGACAACTAAAAGCCCAGCAATTGTCAGGGCATTCCGAAGGGCCCAATTATCCGATCTCACGAGGCCCTTAATCGGTGACGCGCGTGGTAAAATGCTAGCGATATAGTACGTTGCAAAAGTTTCTTACTCCGGTGTCCGTGTCCTCCAGCGTGTTTGGAGGATCCTGCACTGACTCGGGGAGCAAGAGGTCCGTAGAGACGGGGTTACAGTGAGAGGC 3 | -------------------------------------------------------------------------------- /data/stepic_7d.txt: -------------------------------------------------------------------------------- 1 | YETDVPSCFQRPQAHRQSSTPMRKGIMYEREKHSSGFPNDWWADLCTMTYDDCCDWCECCFCSNYEAGIQMIC 2 | YETDVPSCFQWIQWHPMWSTPMRKGIMYEREKHSSGFPNDWWAPQFVTLYYHLDDGSMDWCECCFCSCYEAGIKMTRQMIC 3 | -------------------------------------------------------------------------------- /data/stepic_7e.txt: -------------------------------------------------------------------------------- 1 | 
DVRGTAASLQLWRDGDLHFSVIGPSYLKCRISGAKQQMIKTRKNWTDGLTCWMHTHECEENTAEVMYHHLYYMRNMLMYMSFFWWQDFLSQNQFMNWSGVANEVATLELSNQFNLTWRAVHCLVWPCANYMCDGQVLHMIELPETHQAWPCSKHVDENLTGANMTHSWHEIMLVFMAWIYARRQYWVWGFVAHTAFREKMRQMPCPMWCHIAGWGITGINTDLDIVRGHKVPNCSVPDNNECIMGTYIKQWEVSCRNNVIVGHNLDYNHGYQHQQQRSFNMDVYPNLTRNVMFPMEPHRGALKNSWKQWGQSIHATMMEHYSRVDQTYQAVQREQTSGDLQINHREMMNDKCYAKCCRNNGIMELFMMLEQLPFPEAEEHQDRWMMARFTLYNPDVGMSDHTQSHAMNYPGGWTILWPACNHKFTWVPVCQTPGAFASTHKYDTRSHMIVKWDTIHEDKNRKDHRLTHSTRTRLKRRMYVNQEFQNGYLQDKCEEDYQKIEWTTSWRTVLIQTHVDKMHMIVPGCRPICKSEHLYYYFDCIMCNHAMRRGREVDWDGRLTHDYNSAVKTSTQCIMRRSWPRDQVGLQDPGKNNLWTLNMVKVINMYMCNCYGLNWGWLCKSWHYEFSMWDHCMVWGTQMDDHLFLPFHPATGCYDFTEIGTQNKRYLRWKAIIMNDCADALALCWPWCYNYLGRGMAKHLCQDWTDYNVNPQYFITPVKWNCKVRHVMLIPRLCIRRKVTKWQPQQCQVSPWSPMTVSTRVCWRAGKCQKIFIRRMLKTNCDVYMDRQQRENAKCWCVKPGDQSVVSNHHPMGWPRDEHMRCAHGDKGHIMILSLEQNDAMMGKRSQANDELTQKKTVEWDTTIFQWKWDMSRACSWRWHPYLNIAYCVICHIDENCTWSHHPAEYDKACMNMLMNNVGCRMKLHAQFLGVNSSCVCAWLHQYGGEQRLAQQIDFQLFKIPTHALWSIFTKKDHRKHRPHCGGCCIANYIEFTKSELNGYTHRAHSTQKYQKQLKNAFIHLSAIFAMPTCETVGL 2 | 
DELGSYEVAACWKSAVGSVFQCKFTNSWSDKGGSAFDRPYTQKPQNDSDCHYNYHPKEMPISIHIYCIPHHNSDFFWDVFFSKYNPDLRAERMARRIAIWIVWMSREQCADGQVHPTINQARCRVRGHLRYCVRFWFRQVPQAVMLLPCQGKPYKDQHHPKFQYIHQYWLCDEMNNTIVPLSHNFCGECRILLQLESWICKFNCLDLTSPCDHLLITEMFCQAWFVMLHAQNFARTWMLHVMNRYKPFNNLMENNMIHNPSCLAWHFKWPRRDNLDCYDVIESQFDCSKNNYKLFQYGFAGGDAITNQAWNKLYCAGGVINRGMFFMWDLCCMMVGPTQCDTVDKVVTCLGTELGEMQAELFYESVCISVISQCGHFLYDWFLIAPVRKGMTQPHPMYWCSTHMAFWISCRCGEADGYNRCILSFFCSYLPKMTSIAAALMVKNYDSERGKWDTIHEDKNRKYHRATHTTIFTFSVTRTRMKCRMYVNQEEQKCEEDYQKIEWTTTWATVLPQTHDAPRYTDKMHMTEYKTFGPGCRPAFMNCQSEHLYYYFDGFPYEVYHCNHAMRRGREVDWDGRLTHQYEYRSAVHNQLRLTRDQVGAYYCEERAQATQHSMFTEVMRIMCIAYMGCDVITSRFYGHDTKRKAFDFLSLMKYYSAIYGHKNGFRIIQIARDFWANIWRQPPGKYDWLAGQATEQRGFSTGRTLIAKCCTQYDCAWHHYKMEEAGDWWPKWFRVTGDYFYVWYPAQYMITKCDAVCGQHPVVDIISCQNEWALRFDPHHNHSTNGYDGQYMLMFSAMMFYGAICFNAQHVMKRIPGYHENARIYVWKFTLQHHLNCDQERTHQRMWYFCHNPCHTRTNFCETKMFYLKWHGSPLMDNQGRIIRMHVYMVLSYQMVEQYCQAVGIRLKWIFHCKDIFLIVENRQFVAAWRCMCWYMRPPTEFLYGVGLKVCHQPPIIASWCFCRLRFWECWNSHRHDDFWHGKEQFLQIYEYEYSAIDMMPCNMQLQNRNCRPQVRHHFTPKVMPICTCYLANLSRIVWGWQECQYVFPL 3 | -------------------------------------------------------------------------------- /data/stepic_7g.txt: -------------------------------------------------------------------------------- 1 | CTCATGTCG 2 | TTGGTGGCCT 3 | ATGATAAAC 4 | -------------------------------------------------------------------------------- /data/stepic_8a.txt: -------------------------------------------------------------------------------- 1 | (-42 -247 -285 -32 +44 -251 +269 +53 -156 -15 -151 +312 +321 +134 +370 +43 -50 -96 -4 +320 +36 -167 -252 +325 +280 -218 -215 -159 +257 -150 -226 +240 -124 -229 -338 -307 +367 +153 +270 -2 +212 -258 +379 +209 -57 -264 -208 -273 +100 +89 -396 -221 +319 -133 -222 +200 -350 -267 -121 +392 -259 -293 +31 -5 -277 +138 -326 -281 -163 -108 +117 +380 -394 +79 +172 +189 -171 -73 -292 +180 -114 +81 -238 -190 -72 +389 -67 -196 +130 +168 +181 -223 +228 +262 -174 -231 -369 -93 +331 +38 +56 +302 -219 -112 -400 +237 -20 +256 +162 -271 
+358 +166 +91 +207 +364 +248 -12 -272 +35 -398 +88 +353 -283 -145 -18 +146 -135 -356 +158 +317 +6 -282 +104 -214 -199 +274 +120 +101 +80 -60 +69 -11 -136 +268 +25 +232 +241 +129 +191 +78 +288 +51 -366 -10 +110 +349 -40 -377 +233 +315 +182 +22 -63 -362 -375 -59 -352 +245 +333 -47 +310 -105 -3 -90 -14 +393 +249 +286 -103 -83 +46 -34 +204 -74 -384 +71 -260 -55 +294 -344 -175 +220 +118 -378 +39 -155 +244 +122 -8 +289 +246 +165 +210 +123 +250 -391 +299 +48 +341 +303 +261 +340 -217 -202 -314 +328 -235 -147 +144 +92 -99 +297 -16 +243 -52 +336 +143 +9 -291 +141 -345 +311 +161 -115 +254 -62 +323 -335 +361 +387 +304 -137 -351 -21 -278 +192 -154 +327 +382 -372 -383 +186 +198 -65 -339 +225 -371 +178 +354 -29 -313 -255 -177 -305 +119 -86 -401 +295 -179 -266 +106 +395 -324 -388 -276 -385 -113 +206 +236 -98 +348 +66 -279 +359 -107 -230 +131 -169 +28 +365 +176 +164 -381 +342 +126 +27 +54 +173 -287 +85 +77 +125 +227 +160 -184 -185 +82 +239 +402 -205 +58 +334 +376 +197 +149 -203 -64 -97 +298 +148 -213 -242 -33 +132 +296 +7 -76 +87 +194 +111 +94 -322 +170 -374 +26 +263 -19 -195 +157 +127 -75 -337 +357 +13 +373 -301 -363 -109 +347 -211 +49 +95 +140 -284 +128 -346 +116 +68 +290 +253 -265 +368 +390 +309 +329 -360 +318 -234 -142 -24 -330 +332 -61 -152 +23 -397 +84 -386 -275 -343 -308 -316 +193 -224 -306 -399 +30 -201 -37 -1 +45 -188 +139 -70 -187 -183 -216 -17 +41 -102 -355 -300) 2 | -------------------------------------------------------------------------------- /data/stepic_9d.txt: -------------------------------------------------------------------------------- 1 | 
TGCATAGAGAGTGCCCGATGCGATGGTTGGTTGAGACCTGCCATTTGTGGAGATGAGAACAACTTGCTGCCCCACCTTGTCAACTGGATTTTGCATGGTTTTGGGTCACATTCTCCCATGCTATCACACACTCAATCAGGGGTCTTTATTCTAACTCAGACGTATATGTAGAGCTCTCTGTTGTGACACATTACTATTTTTGGGTTTTGCTCATTGGTTAGTCACAATCCCCTGTGGATTTCAATAAGGCCTCATTCCATGTAGGTGGAACAGTAAGACCGCGATTCAAACGTCAGTTAGCATCGGAGAGATAGCATCTTGGGCATGGAAACGGATCCCAGGACTGACTAGCGACTCTCTCTGTCTGTTTATATACAATGAAAACACTGTCGCGAGACTCCTGTTTACAAGTTCCTAGTTATGGCTCAGATGGCGTCGTATGCGAAAAACCGTGGAAGAAGACAAGGCTCACAATATCCATAAGTGGCGGTTCCTTGAGGGTGGCCCAAGGAAGGAAATGTGCACTCGCGGAAACGGGGGTACGTGTCCCACATGTCTCGTTGCCACATGGCTGGCTCTGAGGTTGACCTTAGTCGGAATCGTAGAGTTGCTACCACCGTGGGCCCGAAAGTCCTGGAAAAATCAGTGCATAGACGGGGCCTTCGATGTGTCCTGCGTTTTTACATATTGTATGAAGTTCACAACCGTTCGTAGTGTCTACAGCTCCTAAGTTATTTCCTGCGTATAGTCAGAGGTCGTCGTAAGCTCCTGACGTAGACGTTTATCAACCGTTAATAGTATTTTTACGCCTAGGCCATCGTGCCAGATCCAAGCGCACCGCGTTGCGGATCTTGAGACACTCAATGCGCGAAGCGGTGCTATAAACTACGTCCTCGCTCATTTGATCATACAACGACATTCACGGGATGAAAACAGATATTCGTACTCCTCAAAAGGCTGTGTGCGTAAACGAACAATACGAGGAAGAAACTTGTACCCTCGGCGGTACAACTCTAGTGCGCCGAAGTACGTCTAGGTTACGATAAGTTGCTTGAGTAGAGAGTTAGCGCTCTCACCCACGGGGTGGGGCGCTACCCTGTCCTTGCAGCCTAAGCCAGTTCTGTCGCACTGTTTGCTCACGCTAGGGGACATTTCCTGGTCTGAATCGCGCGCATCTACATTAGTCGAATGTGGGGGCCTGGTGGACAACAGTTATCGCAGGACGGTAGAGACAGCGCATTCCACAGGCGTAGTGTCTAGTA$ 2 | -------------------------------------------------------------------------------- /output/Assignment_01A.txt: -------------------------------------------------------------------------------- 1 | GTGGATATTCC GTTCGCAGGTT GCGTTCGCAGG GATATTCCGCG GATAGTGGATA CGCGTTCGCAG GGATATTCCGC TCCGCGTTCGC TTCGCAGGTTC TATTCCGCGTT ATATTCCGCGT CCGCGTTCGCA CGTTCGCAGGT GGATAGTGGAT TTCCGCGTTCG TGGATATTCCG GTGGATAGTGG TGGATAGTGGA -------------------------------------------------------------------------------- /output/Assignment_01C.txt: -------------------------------------------------------------------------------- 1 | 21 36 51 77 107 163 192 213 220 269 276 292 344 376 383 394 409 430 458 534 557 564 572 631 682 725 734 741 748 755 762 769 776 
867 874 920 935 942 961 968 1009 1025 1032 1102 1129 1161 1168 1212 1227 1269 1292 1329 1348 1365 1384 1468 1517 1542 1558 1658 1665 1672 1701 1751 1758 1808 1815 1832 1839 1855 1865 1902 1954 2008 2060 2084 2112 2120 2138 2197 2226 2233 2263 2335 2362 2379 2416 2431 2438 2510 2517 2527 2583 2591 2599 2639 2738 2755 2771 2794 2874 3111 3171 3241 3248 3265 3319 3326 3357 3436 3443 3450 3549 3556 3592 3681 3705 3714 3740 3756 3763 3770 3845 3852 3925 3981 3997 4023 4040 4047 4067 4089 4099 4115 4216 4223 4266 4273 4290 4324 4360 4481 4488 4499 4552 4559 4617 4636 4884 4892 4908 4915 4923 4930 4948 4966 4983 4990 5049 5056 5072 5179 5186 5223 5230 5264 5366 5399 5430 5452 5459 5466 5473 5491 5528 5548 5612 5741 5772 5787 5794 5801 5871 5924 5952 5959 5988 6047 6055 6170 6186 6193 6200 6232 6275 6290 6410 6418 6425 6459 6488 6524 6531 6547 6638 6735 6775 6800 6878 6894 6916 6939 6954 7000 7040 7087 7102 7111 7118 7138 7178 7193 7248 7304 7311 7351 7417 7425 7432 7465 7497 7554 7571 7579 7600 7608 7615 7623 7721 7728 7763 7792 7822 7885 7960 7967 7974 7981 7988 8114 8169 8176 8211 8218 8225 8300 8315 8335 8364 8371 8437 8462 8515 8522 8538 8590 8644 8651 8658 8688 8696 8703 8734 8750 8767 8804 8871 8878 8963 8970 8977 9020 9035 9114 9121 9172 9184 9279 9318 9406 9431 9522 9562 9569 9599 9614 9642 9657 -------------------------------------------------------------------------------- /output/Assignment_01D.txt: -------------------------------------------------------------------------------- 1 | CCTTTGAGC CCCCGATGT CGAGGTGAG TCATATGAC TTTGGCCCG TAGCCGGCC -------------------------------------------------------------------------------- /output/Assignment_01E.txt: -------------------------------------------------------------------------------- 1 | 12 13 -------------------------------------------------------------------------------- /output/Assignment_01F.txt: -------------------------------------------------------------------------------- 1 | 566 659 1387 1949 2098 3069 3650 
3961 4343 4424 5220 5854 6258 6259 6581 6924 7076 7285 7532 7686 7702 7843 8849 9200 9516 9732 9884 9940 10197 10517 10526 11009 11041 11802 11999 12823 12947 13313 13748 14110 14284 15180 16065 16279 16800 16894 -------------------------------------------------------------------------------- /output/Assignment_01G.txt: -------------------------------------------------------------------------------- 1 | ACCATTCAAC TTATTCAACT CCATTCAACT AACATTCAAC GATTCAACTA GAATTCAACT ACTATTCAAC ATTCAACTGT ATTCAACTGA ATTCAACTGC ATTCAACTGG GATTCAACTT GATTCAACTC GATTCAACTG ACAATTCAAC GCATTCAACT TTCAACTAAA CATTCAACTT CATTCAACTG CATTCAACTC CATTCAACTA AACCATTCAA TCATTCAACT ACGATTCAAC ATTCAACTAT GACTATTCAA ATTCAACTAC ATTCAACTAA ATTCAACTAG TTCAACTATC ATCATTCAAC ACATTCAACT AAGATTGAAC TATTCAACTA AGATTCAACT AGCATTCAAC GGATTCAACT TATTCAACTC TATTCAACTT ATTCAACTTT ATTCAACTTG ATTCAACTTA ATTCAACTTC TATTCAACTG ATTCAACTCG ATTCAACTCA ATTCAACTCC ATTCAACTCT TGATTCAACT CGATTCAACT TTCAACTGAT AAATTCAACT AATTCAACTT AATTCAACTG AATTCAACTA AATTCAACTC GTATTCAACT CAATTCAACT CTATTCAACT ATATTCAACT TAATTCAACT -------------------------------------------------------------------------------- /output/Assignment_01H.txt: -------------------------------------------------------------------------------- 1 | ATTCTGCTG CAGCAGAAT -------------------------------------------------------------------------------- /output/Assignment_02A.txt: -------------------------------------------------------------------------------- 1 | 
MSSSATGPLFLPVIPNPQIKGCVDFILLGPPHSHIIRSPGVLSPRPVVRTLSTLCYLQRPCVISESSVGELRLRRHVYIQLVNCARYLFVHSLIRDPAKLTCLPRGSVTSLSRYSSTTTILTSRSRHNFHSPVCQLLSTFASVEGTHVIMFFTLPGTLSLTQGRLNRRRLSPVPTELALSSLFPVGGLRRASLNLRQPRHRAWRNHCHQALRYIIQAVPAESQPRYKPFVGRVLCRSTKLLGADVKKGKTLWSTEKQYKFEMKLSTEISNRHPYSSRPSGLLEDTWRLHLSHIAWFSCYVVGRGGVLHYGLAQSRRNPKSFFCGLLLGVWVPNSSESKNSSGGCNCPPGLVGNDHAKCLICQPPQMPLLFAGAYYVDKSSAFLSCGLSERHIQSILDAREAELPTERPAANGPQSTPERRIGAGCIGGSHAKAFDPSVETATCDARVSKIVCRSLKGTNRVLATCSLRSDSETTLVLTISVSPTGRQLGERKTKQPLNQVFAGFVHSRGHAVYPIRRMFLILLVGVYEIVNVCGVEHAGPRRYFQMRVISIPWVYRCQPLWSATTFNRSEPLLILQLNSRHQEETSRSPDRGKPIEGAGSDQRDYLEVGEADKSTTGPIEQGPIFYSFNDDLHTGLHRKDFYRWYPDVYCRQKNTGTTGAPTYKRSPLHRIAESHWQACTYSSIRLLLQAGSVGPNLSFGPVTNTAVSQEASFQCLTFLYPRKSTGIDPIGAGPKQNTRSRQRSDTSRVRGHGDYLAPVSSRRGMFRQSRICRAGARADLGNEYGILVRDRQYTANPIHTREVGNLSRRLLTLYRQSIDVEVECESIRSSPARQERAINRWFLWVLLDMQTSYRPFNVNGRRGDLRLPGFVHRNLYPWLNIDNSKHRRYTVSFHYRTSSQPNGNGYAIVLHNGVNYLCSICIAIFDNTVSITTVVTPKPPGGTGAYVIGPRMDVRTTVRRLLCSKKTCARSAIKSFNELTYSEASSIEPGAKIAPSSLSCRIASITRPLIADADTYNGLAKVRQLICCRVFLKRRVVRTTQVQRSGTIIRISPHPLITPFMSMVPSSKTGCYEALLVSGRGPDTPAHHKKLGVRIALLVSVLSKRSCVPQDLTEPNAIPPRMPRSEDLGVGNVRPWMEDHTYDTDRSPVIQSDMNEEPHTNPRTLGAAPVPLSYSGSRYIFRRLKWDLPDRVFSNPARRFQRKVNSPVLGGGLGEGSGPGGTIPLRSIILDFEEMRLDCMLSCMSPYSAEKHNRPLACIEPRFPESLDSTVYGDCTPLLEIWFPRGVSPPSSLNISSLPSKHKPQRWLFPSEGAPRYLANRHISSPDTCWDVKHALVREVSLVCFQSSSTLDRMFLLSIPCQTPDPSAESMSMLCTGGRVHLGSSSSVIFTPHPGFFVPYTSIRNEFVRCRLLKHRYAWIHNRAVSCDGLGRRPNKTSPALYVISPIEASADFPRCSAINPDMCESPSRNRVMALEVRWVSGVLVRLAIPDYVTHATAESYSLRSCKESHDCAIVIPHVLCATSHAECEANMIAYPGGPTASQTHLKHSLTTDYACYFVFVLWTNACTGTLPGSEIHEAFVSSLWRSTLKSVTLPRSASSRYGVFHTPVTRLDSHLFLIRKLSWPFGAGDEERLWVTTLGLLPLAYRSSFRPQANSLYHSGTLDSPASTPIGCSLAGMSSILREEYFPCPLRTPNYVAHSKVAQCKEKLSLRIARWFVSQGRYGVLVVPSYLLKNTCRHPYLKKIVARVTRWAYPNARTSAVGYVAACPLYGCPGVCLVVVTLIMVIQDILGLGGGLHVLTLRVDYVNRVLLTGNEGVNGEVLWLRGFFFIKVAVLPHPCIPLLGTTHIRLGNGLAPRCEGLVVPPLSTTSGSDRRTQHGGASENQVAPVRHKCTVGTSYSDVTMSPKVSNGLAPTAVPSSRTRLHFPIVFSHPQCHRGQIKFCLGLYPRLFHLSESVRTAHSLVVSGSQQGCGYFAGCDLTCAPRLSHRIHRQIQTQDT
SIRDCTIIPTESTTHKAPVTRASGLAVSTYKESTDLVDFPWHVHECSAFSRSIGTTTSPDNLPPLGRQGDVLPCRKKSCNVTVAGPTDRSKLSQGVGYPPAQSPCCYVSCETNISAFSPDGHVLPARQHYHLQTRQTFHPIENRVQILHKKRGYKGNSISKPEAFSLSRLLIHHFSLVILPHVQNLVRMQNTTKFGRDPATSIGSFIQCQYSEFFGYPITHRPSVWSSNKDISSSKGLGRLNFYCDIRQSFILLFGRGPANLSRWVTSRLTIVRTECRTSAHNSPSVNRHKPVGGAITSGVPNWESWKTKVLGSSVIPSRGLWIIIPQEQRESRRTLIIVCENERLTAIVIPIVFDVLFGNLEGGQSHSAYVSHSLHGNTHIETVFRPEWAGSKQGRVIAMNVKDPAFEAFVYRGLKQLSGLSHRKKQSNTGTHHSIGRDRRKTRGPARVNAPGGAVIVRRVCHLNIAFEEKAYHLWFCVHNQIWVALPSLRQQRGTLRVRWTRLTQTYRSTVMGYKTPPCTPVLSLRGVLRGKSSSLGKDKIHVSTDEADYPSNLSKTVPPLTRTSSLNAIVIIVHCPAKQARQFAESLYFHYVRRSSHKIWYNNTLYRITCRLAPVMPSSSLGLELWSRLPLPRLNSHLSSLGGPVTCSGPLRRSHVDAMPYATTALDIHTHVSMLVACGFLAADAVQPPEEGYCEDVGPALIVVTMCKSSAAVLRVLFNVVLPMPRLTTTPTRGLLLAFTRITPVTESRHGRSAVLQHCSGTSVVSCGTVDMCVRAAMPTPAMIVSALSRLARPFTTPQRDPTR -------------------------------------------------------------------------------- /output/Assignment_02B.txt: -------------------------------------------------------------------------------- 1 | CATGGGCCCCAATGTTTTGTTAATGTA 2 | CATGGGCCACAGTGTTTCGTTAATGTC 3 | CACATTAACAAAGCATTGCGGTCCATG 4 | CATGGGCCACAGTGTTTCGTAAACGTC 5 | CACATTGACAAAGCACTGGGGCCCGTG 6 | CATGGACCGCAATGCTTTGTGAATGTT 7 | CACGGCCCTCAATGTTTCGTTAACGTA 8 | CACGGACCCCAGTGCTTTGTAAACGTA 9 | CATGGACCCCAATGCTTCGTTAACGTA 10 | CATGGACCACAATGTTTTGTTAACGTC 11 | CATGGTCCACAATGCTTTGTTAATGTA 12 | CACGGACCGCAATGTTTTGTCAACGTA 13 | CATGGCCCCCAATGTTTTGTCAATGTA 14 | CATGGGCCACAGTGCTTTGTGAATGTT 15 | CACGGCCCACAATGTTTCGTGAACGTT 16 | GACATTGACAAAACACTGAGGTCCATG 17 | CATGGACCGCAATGTTTCGTGAATGTC 18 | GACATTGACAAAACATTGAGGACCGTG -------------------------------------------------------------------------------- /output/Assignment_02C.txt: -------------------------------------------------------------------------------- 1 | 0 57 87 103 103 113 113 114 128 128 128 128 147 156 156 156 160 200 213 215 216 231 242 250 256 256 260 269 270 273 284 303 312 316 328 341 343 359 360 363 370 378 398 412 416 425 426 429 431 444 446 456 469 488 491 506 
516 519 526 526 539 554 557 559 559 572 572 583 593 616 619 644 644 647 662 667 675 682 682 685 686 686 706 706 719 739 772 772 772 775 789 795 799 803 809 814 819 838 842 862 866 875 886 895 900 900 917 917 922 931 942 951 955 975 979 998 1003 1008 1014 1018 1022 1028 1042 1045 1045 1045 1078 1098 1111 1111 1131 1131 1132 1135 1135 1142 1150 1155 1170 1173 1173 1198 1201 1224 1234 1245 1245 1258 1258 1260 1263 1278 1291 1291 1298 1301 1311 1326 1329 1348 1361 1371 1373 1386 1388 1391 1392 1401 1405 1419 1439 1447 1454 1457 1458 1474 1476 1489 1501 1505 1514 1533 1544 1547 1548 1557 1561 1561 1567 1575 1586 1601 1602 1604 1617 1657 1661 1661 1661 1670 1689 1689 1689 1689 1703 1704 1704 1714 1714 1730 1760 1817 -------------------------------------------------------------------------------- /output/Assignment_02D.txt: -------------------------------------------------------------------------------- 1 | 156-113-113-131-163-147-131-137-97 97-156-113-113-131-163-147-131-137 163-147-131-137-97-156-113-113-131 113-113-156-97-137-131-147-163-131 97-137-131-147-163-131-113-113-156 131-163-147-131-137-97-156-113-113 147-131-137-97-156-113-113-131-163 156-97-137-131-147-163-131-113-113 137-97-156-113-113-131-163-147-131 163-131-113-113-156-97-137-131-147 131-147-163-131-113-113-156-97-137 113-131-163-147-131-137-97-156-113 113-113-131-163-147-131-137-97-156 137-131-147-163-131-113-113-156-97 113-156-97-137-131-147-163-131-113 147-163-131-113-113-156-97-137-131 131-113-113-156-97-137-131-147-163 131-137-97-156-113-113-131-163-147 -------------------------------------------------------------------------------- /output/Assignment_02E.txt: -------------------------------------------------------------------------------- 1 | 71-114-156-71-97-129-147-99-115-163-147-128-71-113-71-114-129-115-113-71 -------------------------------------------------------------------------------- /output/Assignment_02F.txt: 
-------------------------------------------------------------------------------- 1 | 128 110 112 97 9 225 122 78 94 78 231 93 359 59 125 100 84 228 341 343 228 206 328 240 456 353 162 309 325 34 113 181 210 97 309 138 266 32 7 135 248 250 135 113 235 147 363 260 69 216 232 20 88 117 4 216 359 128 221 487 81 187 253 228 212 356 469 471 113 356 334 456 368 65 584 481 290 122 437 59 74 25 453 162 241 309 338 225 106 437 531 300 393 172 25 659 106 253 359 3 425 400 69 384 528 641 118 643 285 528 506 21 628 131 540 237 50 756 653 3 462 294 609 231 134 246 197 625 147 334 413 481 510 138 397 278 609 506 275 368 147 634 81 228 334 400 375 44 359 503 616 93 618 260 503 481 603 106 515 212 25 731 628 437 269 584 206 109 221 172 600 122 309 388 456 485 113 372 253 584 653 422 515 294 122 147 781 12 228 375 481 125 547 522 191 506 650 763 240 765 407 28 650 628 143 750 113 253 10 662 44 359 172 878 775 125 584 416 731 44 353 256 368 319 747 269 456 535 603 632 260 519 25 400 731 97 641 410 503 282 110 135 769 216 363 469 113 535 510 179 494 638 751 228 753 395 16 638 616 131 738 101 241 650 32 347 160 866 763 113 572 404 719 32 341 244 356 307 735 257 444 523 591 620 248 507 13 388 719 756 525 618 397 225 250 103 884 115 331 478 584 228 97 650 625 294 609 753 866 343 868 510 131 753 731 246 853 216 356 113 765 147 462 275 981 878 228 687 519 834 147 456 359 471 422 850 372 559 638 706 735 363 622 128 503 834 425 194 287 66 553 147 253 319 294 278 422 535 12 537 179 422 400 522 25 434 131 650 547 356 188 503 125 28 140 91 519 41 228 307 375 404 32 291 172 503 278 47 140 406 106 172 147 131 275 388 390 32 275 253 375 287 503 400 209 41 356 372 81 160 228 257 144 25 356 172 34 300 66 41 25 169 282 284 169 147 269 181 397 294 103 250 266 54 122 151 38 250 528 297 390 169 22 656 103 250 356 422 397 66 381 525 638 115 640 282 525 503 18 625 128 537 234 47 753 650 459 291 606 228 131 243 194 622 144 331 410 478 507 135 394 275 606 659 428 521 300 128 153 6 787 18 234 381 487 131 553 
528 197 512 656 769 246 771 413 34 656 634 149 756 119 259 16 668 50 365 178 884 781 131 590 422 737 50 359 262 374 325 753 275 462 541 609 638 266 525 31 406 737 106 234 103 216 218 103 81 203 115 331 228 37 184 200 56 85 184 131 259 25 128 241 243 128 106 228 140 356 253 62 209 225 13 81 110 209 462 231 324 103 590 37 184 290 356 331 315 459 572 49 574 216 459 437 559 62 471 168 687 584 393 225 540 162 65 177 128 556 78 265 344 412 441 69 328 209 540 147 9 275 41 16 144 257 259 144 122 244 156 372 269 78 225 241 29 97 126 13 225 3 131 113 115 100 12 228 125 81 97 81 18 2 115 12 413 182 275 54 541 135 241 307 282 266 410 523 525 167 410 388 510 13 422 119 638 535 344 176 491 113 16 128 79 507 29 216 295 363 392 20 279 160 491 16 113 10 246 15 108 374 74 140 115 99 243 356 358 243 221 343 255 471 368 177 9 324 340 49 128 196 225 112 324 625 394 487 266 94 119 753 200 347 453 97 519 494 163 478 622 735 212 737 379 622 600 115 722 85 225 634 16 331 144 850 747 97 556 388 703 16 325 228 340 291 719 241 428 507 575 604 232 491 372 703 3 131 113 115 100 12 228 125 81 97 81 25 153 22 135 137 22 122 34 250 147 103 119 4 103 510 279 372 151 4 638 85 232 338 404 379 48 363 507 620 97 622 264 507 485 607 110 519 216 29 735 632 441 273 588 210 113 225 176 604 126 313 392 460 489 117 376 257 588 31 13 15 128 25 540 309 402 181 9 34 668 115 262 368 12 434 409 78 393 537 650 127 652 294 537 515 30 637 140 549 246 59 765 662 12 471 303 618 240 143 255 206 634 156 343 422 490 519 147 406 287 618 400 169 262 41 528 122 228 294 269 253 397 510 512 154 397 375 497 409 106 625 522 331 163 478 100 3 115 66 494 16 203 282 350 379 7 266 147 478 643 412 505 284 112 137 771 2 218 365 471 115 537 512 181 496 640 753 230 755 397 18 640 618 133 740 103 243 652 34 349 162 868 765 115 574 406 721 34 343 246 358 309 737 259 446 525 593 622 250 509 15 390 721 119 101 103 88 216 113 69 85 69 609 378 471 250 78 103 737 184 331 437 81 503 478 147 462 606 719 196 721 363 606 584 99 706 69 209 618 315 
128 834 731 81 540 372 687 309 212 324 275 703 225 412 491 559 588 216 475 356 687 294 63 156 422 16 122 188 163 147 291 404 406 48 291 269 391 303 519 416 225 57 372 9 388 97 176 244 273 160 41 372 481 250 343 122 609 56 203 309 375 350 19 334 478 591 68 593 235 478 456 578 81 490 187 706 603 412 244 559 181 84 196 147 575 97 284 363 431 460 88 347 228 559 6 103 528 297 390 169 22 656 103 250 356 422 397 66 381 525 638 115 640 282 525 503 18 625 128 537 234 47 753 650 459 291 606 228 131 243 194 622 144 331 410 478 507 135 394 275 606 69 197 66 179 181 66 44 166 78 294 191 147 163 19 48 147 237 6 99 365 65 131 106 90 234 347 349 234 212 334 246 462 359 168 315 331 40 119 187 216 103 315 50 32 34 19 147 44 16 609 378 471 250 78 103 737 184 331 437 81 503 478 147 462 606 719 196 721 363 606 584 99 706 69 209 618 315 128 834 731 81 540 372 687 309 212 324 275 703 225 412 491 559 588 216 475 356 687 300 69 162 428 22 128 194 169 153 297 410 412 54 297 275 397 309 6 525 422 231 63 378 15 394 103 182 250 279 166 47 378 397 166 259 38 525 119 225 291 266 250 394 507 509 151 394 372 494 406 103 622 519 328 160 475 97 112 63 491 13 200 279 347 376 4 263 144 475 285 54 147 413 7 113 179 154 138 282 395 397 39 282 260 382 294 510 407 216 48 363 379 88 167 235 264 151 32 363 334 103 196 462 56 162 228 203 187 331 444 446 88 331 309 431 343 40 559 456 265 97 412 34 49 428 137 216 284 313 200 81 412 34 16 18 3 131 28 384 153 246 25 512 106 212 278 253 237 381 494 496 138 381 359 481 393 90 609 506 315 147 462 84 99 50 478 187 266 334 363 250 131 462 197 59 325 25 91 66 50 194 307 309 194 172 294 206 422 319 128 275 291 79 147 176 63 275 118 246 12 115 228 230 115 93 215 127 343 240 49 196 212 68 97 196 50 178 47 160 162 47 25 147 59 275 172 128 144 29 128 21 149 18 131 133 18 118 30 246 143 99 115 99 393 162 255 34 521 115 221 287 262 246 390 503 505 147 390 368 490 402 99 618 515 324 156 471 93 108 59 487 9 196 275 343 372 259 140 471 134 262 28 3 131 244 246 131 109 231 143 
359 256 65 212 228 16 84 113 212 628 397 490 269 97 122 756 203 350 456 100 522 497 166 481 625 738 215 740 382 3 625 603 118 725 88 228 637 19 334 147 853 750 100 559 391 706 19 328 231 343 294 722 244 431 510 578 607 235 494 375 706 253 22 115 381 81 147 122 106 250 363 365 7 250 228 350 262 478 375 184 16 331 347 56 135 203 232 119 331 50 32 34 19 147 44 16 -------------------------------------------------------------------------------- /output/Assignment_02G.txt: -------------------------------------------------------------------------------- 1 | 57-137-163-115-97-113-128-131-128-163-113-156-97-66-97 -------------------------------------------------------------------------------- /output/Assignment_03A.txt: -------------------------------------------------------------------------------- 1 | AGGTA GGAAG GGCAG GGGAG GGTAG -------------------------------------------------------------------------------- /output/Assignment_03B.txt: -------------------------------------------------------------------------------- 1 | CAGGAG -------------------------------------------------------------------------------- /output/Assignment_03C.txt: -------------------------------------------------------------------------------- 1 | AGGGAA -------------------------------------------------------------------------------- /output/Assignment_03D.txt: -------------------------------------------------------------------------------- 1 | CGCCCGGTTTCA 2 | TCCTGAACTCGC 3 | ACCATAGATCAA 4 | GGCCAAGACTCA 5 | GGCTGAGTTTAA 6 | GACCAAGGCTAA 7 | GGTGCCAAGTTA 8 | GCCTGAAGCTAA 9 | GCCTACGGCCTA 10 | GACCAGGACTGA 11 | TCCTGAACCTGA 12 | GTCCACGCCTTA 13 | GTCCAGGTCTAA 14 | GCCCAAGACTAA 15 | GGCCACGGCTGA 16 | GGCCAAGTCTGA 17 | GCCCAAGACTCA 18 | GGCCAGGGCTAA 19 | AACAAGGACTCA 20 | GTCCAGGTCTCA 21 | GCCCAGGCCTGA 22 | GTCTCAATCTTA 23 | GGCCACGCCTTA 24 | GTCCACGGCTTA 25 | GCCCAAGTCTCA -------------------------------------------------------------------------------- /output/Assignment_03E.txt: 
-------------------------------------------------------------------------------- 1 | ACCTCTTCTTTG 2 | AGCCCTTCTTTA 3 | AGCCCTTCATTC 4 | AACGCTTCGTTC 5 | AACCCTTCATTA 6 | ATCACTTCGTTG 7 | ATCTCTTCATTC 8 | ATCTCTTCTTTC 9 | ATCGCTTCGTTC 10 | ATCACTTCATTT 11 | AACCCTTCGTTA 12 | AACGCTTCATTT 13 | ACCACTTCTTTG 14 | ACCACTTCGTTG 15 | ATCGCTTCGTTA 16 | ACCCCTTCGTTC 17 | ACCCCTTCCTTG 18 | AACGCTTCATTT 19 | AACACTTCTTTC 20 | ATCGCTTCGTTC 21 | AACTCTTCTTTG 22 | ATCTCTTCCTTA 23 | ACCGCTTCCTTG 24 | AGCACTTCTTTG 25 | AACCCTTCGTTC -------------------------------------------------------------------------------- /output/Assignment_03F.txt: -------------------------------------------------------------------------------- 1 | CACCCTGATTCCATC 2 | CTTTGGGGTTTGGCC 3 | CTTCCTGGTAACGCC 4 | CTTCCTAAGTTGGCC 5 | CGAGCTGGTTTGGCC 6 | CTTCCACTTTTGGCC 7 | CTTCCTATCTTGGCC 8 | CTTCCTGGTTTGAGG 9 | CTGTGTGGTTTGGCC 10 | ACCCCTGGTTTGGCC 11 | CTTCTACGTTTGGCC 12 | CTTCCTGGTTTCTAC 13 | CTTGAAGGTTTGGCC 14 | CTTCCTGGAGGGGCC 15 | ATTCCTGGTTTGGGA 16 | CTTCGATGTTTGGCC 17 | CTTCCTGGTTGAACC 18 | CTTCCTGCGATGGCC 19 | AATCCTGGTTTGGCT 20 | CTTCCGACTTTGGCC -------------------------------------------------------------------------------- /output/Assignment_03G.txt: -------------------------------------------------------------------------------- 1 | TTGAGTAAAAGTTAC 2 | ATGATACGAAGTGAG 3 | TTGCGTCGAAGTGCC 4 | ATGCGTCTTTGTGAG 5 | ATGCGTGCTAGTGAG 6 | ATGCGTCGCTTTGAG 7 | ATGGTGCGAAGTGAG 8 | ATGCGTCGAGCAGAG 9 | ACAAGTCGAAGTGAG 10 | ATGCTAAGAAGTGAG 11 | ATGCGTCGAAGTATC 12 | ATAACTCGAAGTGAG 13 | ATGCGAAAAAGTGAG 14 | ATGCGTGCTAGTGAG 15 | ATGCGTCGAAAGAAG 16 | ATGCGGTTAAGTGAG 17 | CCGCGTCGAAGTGAC 18 | ATGCGTCGAAGGTTG 19 | CCTCGTCGAAGTGAG 20 | ATGCTATGAAGTGAG -------------------------------------------------------------------------------- /output/Assignment_05B.txt: -------------------------------------------------------------------------------- 1 | 
TCTCTATAGCATTCAAAGGGGACGCGCCACTTTTAAACCTTGTGTTGTCCGCTAAACTTTGGGGCCTCTGAGAGCTGGATTGTGCCCGCTCTAACTAGTCGATCAAGAGCTATTTGGCAGACGCGCTAGGATTTCAACGGCAAACACAACGCGGTCACTCTAGTAAATACCTGTGCACACAATGTCAGCGCCGTCACCAATGCGTGTGATTGGCAAGGGGTTTCACTAGTTCCTTCCGGGGCCCGGTTGTAATATGATCATGGAGCCCCTCTTCGCCATCAGCAGACCATGTTATTTAAATGTATACTCCGGTGTCACAGAGAAGCCCACCTGATGGTGCGGGCTGGGGTGCGCCGATGCCCTAAAGTAGGGAGCACGTCCTCCCAGTCTACTGTGGTACATGGACCCTTTTCGAAGGGTGCGAAACGACTGCGATCCGCACTCATTAGGTTGGGAGTTCAGGAACGAGGACGGTAGTAGCTAGACTTAGCTAACGGGACGCGCCGAACCCTCTTGACTGTGGGGTTCTCAATGCTTTATCATAAAAACTAGGTACACGGTTGTGCTCCGATAGCGCAGTGGAGCAACACCAACAGATGGGCCCCGGCGGGGAGATGCTGCTATTATTTTTTACATGACTCTAAAGTCTTATAACGGCTGCCAAGGCGTTTTAACTCATGACAACACAGGATCCGGCTCGACCAGGAGAAAACCCCCCTATCTCGTTTATCTAATGACAGCTCATTAGCTGTTCAAACGACCGTGCGACTCGTGTGGCAGCCCCCGGGACCTGCCGAGAGCGATCTAGCCATGCGCTCCTAATCGCTACTTTTTATGGGAAGCAACATTCCATCAGCTTGTGATGCCATTCTTTCAGCCCGTTATTCCCAACAATATCCCCAAATATGAGTGGGCTTGCGCTTATCGGGATTATGGGTCACGTGTAGTGCAGACGTGTCCTTACGAAAGACGGGTCGCTCCCACGCTACGGATTAACAAGCATGCGTTCCTCCTGTCACACTTACCCCTCAACTACACACGGCAGGAACGCCCTGCGAACACCCATACTTGTTTCAAGCTGCTTTAGCCCCGGGGCTACGGCAAAGACAGCCTGACCACACTCCGTACAGTTTGGAATTATGAGGAGTGATGACATAGAGCATTGTCTTGTGCCCTGTCTACCGGGCTACGAGGGATTTTAGATGCAATCGTTCTGCGAATCAATTGGTTGGATGCCCACAAGTTGCTTTTATCTAGGACCCCTAGCCTGCGCCGGTATTTAAAAGAAGGCGGCTCCGCGATAGCGCGTTCCTCAAAGTTGAGGGGAGTATCAGTCTTCGGAACCTCCACCCACAGGCAGTTGGGACTTCGTAGACGGAGCAGACAAATGAATCCCGGAGTCACCCCTGCGACGGCGTCCTTCTTCATGGGCCGTCAATGAGGTATTTCCCATATGCAGCGTTCGCGATGCAGCGGGATAATTTGGCTTAACCCTCACGAAAGACACGTTTCCGCTGTGGAAGACATTGCCTCATTCTTGATCGATACGATTCGCGTTGTCCCCTCCTTACTGCACATAACCTACGGTGAAACGCGAACCCTGGAGTCGGTATATCTTCCATTTTCGCATTGCATGCATGAACGTGGAGTTGAAGGGGGATCTTCAACGGAAGATTTCTATCACGGAGCCGGGCGTTTCGACTGTTAGTCAAATGATGGATCCTATAGGACACGCGGAGCCTCTGGCTAAGACTGCACCACCCTAACCAATGGTGGACATAATGGTTTCGATAGCGGCTCTACGACGTGCTACATGGCATAAGAGCTGCCGTATACCTGCAAGTCCACTCATCTACCCACAGCTTCCTCCATGTCCGTCCTACGATCTTTTGCCACACCTCGGCAGTGATATGCTTTCTCCACAATCTATACAAGTAACATGTCAATTTTCCAGTTGGGAGTCGTAACACTAGAAATATGGTATGTCTGAGCATTAGTGA
 -------------------------------------------------------------------------------- /output/Assignment_06A.txt: -------------------------------------------------------------------------------- 1 | 762 -------------------------------------------------------------------------------- /output/Assignment_06B.txt: -------------------------------------------------------------------------------- 1 | 85 -------------------------------------------------------------------------------- /output/Assignment_06C.txt: -------------------------------------------------------------------------------- 1 | GTAGGTGACTTCCACAACTCTCCACCGGGAAGATGTATGTAAGATCGACTGGGGTGCGGATGCTTCGACAGCCAGAGCACAACGACGTCTGCGTTAAGTATCTAATGTAGGCCGTGGGTCCCGGACTTGACCCGGAAAATAAACTATCATGGCTATGGTGAAGCTTCGTATCATACACACGGCCGTCAGCAATTTAGAACGTTAGCGAGGATTAGGCTTTTCGAGTCGTGTTTCGGGCCCCTTTTGCACATAGGTCAGGGTGTACTCCTCTTATGGCTTGCTGGGATCCAAGATCCGGGCGTAGTTTAGCCGCCTGGACAGGTCAAACGCAAAAGTACGCCCTGCTGTATAACGTGCGATAGGGTATACAACTACAAGAGCAGATAGCTGGTAAGTGTAGAATCACTGATGCGCGCACCTAATTCGTGGGACATGCCTCCACAGACCGACGTACTTTTTTACCTGATCGACCGCCCACTGCTCGGGAATCGCGCAGTTCGACATAAGATGTCGGCTCCAGGGGGCCACGTCTTTAGAGATAAAGTTCTTTAATTCTGTTGAGGTCCTCGCCATAAACTCGCTTCACTGACCTATGATGCTACTATA -------------------------------------------------------------------------------- /output/Assignment_06D.txt: -------------------------------------------------------------------------------- 1 | 193 2 | 0->1->3->5->6->8->13->16->18->20->21 -------------------------------------------------------------------------------- /output/Assignment_06E.txt: -------------------------------------------------------------------------------- 1 | 2280 2 | 
IWWRDFMAEFMWQNSGSRAMCFFNR---I-VCWNARALNYKNCSLQVKERLAAKRCYATHPE-AI--TDGGIFAECEQTNYDFREHKSIFMFCP--T--WYGEQNHEVLGRHDHCHEMCHTCKCFRWRCHSSAKKLGCFPWYIPKFHYIKPMVVYHHMLTIHYKIPSQNKSDDALIGNAKIVMHLTGCQYNNYPTYSFEMPPCDT--W--PDAQVVKVTMFPIFIA-FQSKTWWSMILSSSSGYLPYLMNCPNNLVWQVP-LRCC-YCCG-NQNLEKPMQHSCGKPAAPDIRSRSQDFHWNRLQRVQEDIVSIEFKWSEELIEQWTFWVFYNLAHMGYAPDGYV--TEFIWHGTYCIKFYKLGKQRQWKSQQHYCTMWAVRRVRWRIVV--H-YETKRRENYWIKFPPMCGERT-W-HRYCREKRVEGL-V-NFEIHQWQNQPLFKYMFHRCKMMINC-------REDGPSHTSQKNQHNLHFDMRNYSTQGMYDFTNITAKLPYISCTYHKMLWPMPARQMTHVGRESYKWSHCRMRGNTSW----PQYSEH-R-F---------YFPPS-MQWWMTIKNMLCATNRKKEVFNH-----D-SVDKFR----DV-SPHMPYDIIQEQWGGPFMYDTMEFEVLTQMDIFYMYDAYYSRCCSMFKTFALWNRSKYMQMDWNNNNQTIPIAKWASHMYWQELLDVTMRACCDKYWSTYIMIKANYVEPRMGVLANVGPRHQHWAHCKHNVAIWQSFTHRMAGSMAKEEPWFACLYLIREVIICGETCDLLGCVQMQWKDWCRGQGKKYGMVMG-K---MIEALK--FF------FTLFVGV--V----WQQSCQEEQYGDMVEEYQIVMWHDSWLTILQYKQIMRQWGKQ--PM-RS--MYP-APFERYLWCCFNFEWDVQIVDYVLI-RTFCPL---VNQMWIETC-NIQLQASDHNHCCDCCQATMLMDTHHPHDYNKDDQEPCMCDHCEV 3 | IWW---M----WQNSGSRAMCFFNSDGDIIVCWNAGA--WM-MS-GTS--VGNKRCYATHPEHEMFATDGGIFAECEQTNYDFREHKSIFMFCPWMVVNWYGEQNHEVLGRHDHCHEMC-MCKCFRWCCHSSAKKLGCFPPYIPIFHYIKPHHMLMDFGNIHYMIPS----D--LIGNAHIVM-----QY---PT----MPPCTTQNMIMPDAQVVKVTMFPIFIGPIQAEYVFQGNLPYMN-HR-FQVEGPNNLVWQVTTVELGQYCCNWNQNLIKPMQHSCGKPAAPDIRSRS-------LQRVQFAIVIIEFKQSEEL-E--------GMAPDGYVTDTQIFWSEFIWHGTYCIKF---G------SQQHDCTMWAQRRVRWRIVVMRYPYETKARENYWIKFPPICGERTGWWHRSCREKR--GLYTWQFEIHGMFHSPASSAKVN-CKMMINCGDIAPCLREDGPSHTSQKNPHNLHFCMRNFTNITMHKLM-VM-RLDYISC------W-Y--RHMLHVGRENYKWSKCRMRGLTSLEEQNPQYSEHFLLFLMQQLLSDIYFPPSGMH---TIKNMLCATNRKKAVFNHVAMEWDCSVDKDRCVAHDGGAPCKPYDIIQE------MYDTMVFEVW--MDIFYMYDAYYSRCCS-FHQF-V-TRSK-CA--W----------KWASHMYWQELW--T--AIYEK-GNT----PANK-EPRMGVLANVGPRHQHWGHCKHNVA--QSFTHRMAGSMAKEE--FDT-W---S-F---ETVDLLGCVQMFWKDWCRGQGKKYGMVMGGSYINMLEALKWPYYWIQQMKFTLFVGVFLIRFDRWQQSFQEEVYGDMVEEYQIVMWHDSWLTILQYKQIMRQWGKSMYPAYQNLEMYTWEEMVRSAEDCFNFEWDYP--SWPMSERTFCPLDTAVNHMWIETHGNIQLQASDGNHCCDCCQATMLMDTHHPHDYNKDDQEPCMCDHCEV 
-------------------------------------------------------------------------------- /output/Assignment_06F.txt: -------------------------------------------------------------------------------- 1 | 1188 2 | FAPAF-NRYCNDVLYMRWNQICNPPH-FFKFNGRCRTMT-K-S--DWTGAVVQMSQTMMDKVTTP-KFATVVC---FGGQ-L-GKI-WE-PENWVIIKPVDLTT-RCPWCLRNEDIDTWCLAEL-KMIETGIGSPLAICRS-SGVPCLYQWNLMALIPD-MP-P-RG-RGMGEKTPTGMNVLAHGHFQH-CY-YD--Y-NR-QSTGL-V-FPRTFEWSNRFHHTVQ--QDMQFLWI-CKVDADDETERLV-YGGKPCPRKDIEYVDPMQNQASLYEWV-WEFGDLEATTRYPPITCMKAEIFSIAGLWFSSSRARQYDHDYNMGTFT-S---IQSQG-ACKICAARTHWCCGTGTPFKLASDTLFQRKAAVMGFETCKIYFAPWRNIQMATV--GSYEIHHFYTYNAESYP------Q----CG-THRTGTTEW----SDNPSGNFIVLTYCHPENYFMSYYDLKGAMASECICKET-K----EPHSLWIHYANNFH---FPQCCLVNHSRVGQCERKCVKNPLIFWREVMKGTHPGCQNLCMGVVGQVTETRCHFNNFHKETRS------P-KALNQPMKPPAEYSRKTVCDIASILIVKLSPFEQFEMQSPQQCHTVHASWKGQVGWMRWWGMMIQGIKPQTHAPYEHFSDFTHEPATARDELCETRWIEYVRKALFTCHEGRH-QY------HWLT--HPVVQMQQGFCESNDQHPYDFLFKHGFDQHWHWMIGLC--PVPRG-LQPWTMRAKYMPMLPQFLTNAKGDYTHGFSFI-DHC-QMICMMCMQTDMQQQQASTHS--PQQDHYSNT-KM-FKAS-CM-GQEP--QEM-CWNRVAGIRWPEFDAPDWADQF-PKFVGTPKCNLDALYYQDGQHKDERE-FV--D-WAQFQQGTCAEY--WVPAKFE-HWLDE-DGQ-AK-SKFFTQYM 3 | 
FGPEFAHQW-AQG-Y-DYEHCCSPTQENNDYFIRCAALIIEMVFYDKTFSRIDMNRTD-EHHDTPWHFPHK-CNYQFPERPIPDTIDNTVNENW-F-WP-DMNNGKCS-STHPGVLEG--EKDMVHLVDNGTNN-LGSFWVFCDTN-WYYWHAWPNTPNGLPNPTMDLR-MNLK-PCAQPPAPESHHQETCFCYHCSYVTEIQEPDVHGHYVSKSTKDGEIWCKVQCWTDLAC-WFACE-KMNKLY--LIWYGPRDFITIDPCYPDECQNVKERTDKPEWNSPAWNNNHPFEA-G-ITGSFWNHATCDFGDGGSSMF-HDYNMGTFTDNDFGVDIQQWACKIGALLTHWCMGTG---KLASDTSFQRKAAVDG-EVGTIYFAPWRNNQMATERYGSYEIHHWYTYNAESYPNTTVMKEVYEWCVMAHRTGTTESAHLHSDNPSGLFIVRTY-E-KACACHYYDLKGAMASECICKETHRFDQMEPSSLWIHYAYDFQTNDFPQCCLVGHSRVGQCERKCVKNPLIF-NSVMKGTHPGCQNLCMGTVGQCPETRCHFNNFHKETWSCMDGQTSAKALNGPMKPPAEYSRKTICDIASILIIKLSPFEQFEMQ-------VHASWPGQVCFIDMYVMMIQ-IKPQT------FSD-----A-----M----WIEYVRKALFTKHEGRHCRYGRYMPYHWLTCWNDVVQMQYGFCENW-MLTWVHM-K--YKEN-HPCVALHDKPHQHSHMKTVIFKEK-MAMMNP-MDQEL-RYTSQW-YKRARCDQQVKRRFIK-NMAWLHLDTQEIIPTVRCFKNVVPLYFWTFFCFFVAASFFKIFKCWTPWT-YQWP-VTCS-WMN-YDGRIL--PKGHAQA---KEKQSTSDWETFKRKQPW--YDQS-YNQYEAWPRCNFGWPWASHRTAHFEKHTSVFI-YM -------------------------------------------------------------------------------- /output/Assignment_07A.txt: -------------------------------------------------------------------------------- 1 | 342 -------------------------------------------------------------------------------- /output/Assignment_07B.txt: -------------------------------------------------------------------------------- 1 | 20 2 | TCATCGATACCGCTCCTCTTACTAATACTAATTGGGG-A----AG-T--AGGCTTTAGGT-TGT-A--TACTCGCACTGCACGC-GGT 3 | TC-TCAATTGGGCTC-TCTTATTT-TCCTTATCGGGACATCCTAGATTCAGGCTCT-GGTGTGCCAGGTAC-C-CATTG-A-GCTGGT -------------------------------------------------------------------------------- /output/Assignment_07C.txt: -------------------------------------------------------------------------------- 1 | 61 2 | AAAA-CGAAGATGGACATTGACG-GTATCTCCATGGGCTAAGAATCGATGCCA-GGGAGACTCGAT-CTCTAC--TCCACATCGTATTA-AGCCT-AAATGTATAATCAGGTTACACAGCG-GAGCGCAATAACCCGTGGCTGCCAAGGCAGTCAGTGAGTCTAGG-A--ATGTCAGATCGAACT-A-TTATAA-GGATGGTCCGGCG-TGGTCGCGCTATCGGCCACCATCTTATTTGTCCACC-TTGC 3 | 
AAAATCGAAGATGG-CATCGACGTGTA-CTCCGTGAGCTAGGTATCGCCGCCAGGGGA-ACTCGATCCTCTACTTTTAAGATCATATTACAGCCTAAAATTTATAATCGGGTTACACAGCGTGA--G-AATAACTCGTTGCTGCTAAGGCA-CCAGT-A-T-TCGGTATCAT-TAAAAT-GAA-TAACTCATAAGGGATGGTCCGACGCTGGTCGCGCCACCGG-CACCATC---CTTCACC-CCTTTAC -------------------------------------------------------------------------------- /output/Assignment_07D.txt: -------------------------------------------------------------------------------- 1 | 293 2 | YETDVPSCFQRPQAHRQSSTPMRKGIMYEREKHSSGFPNDWWA-DLCTMTY---DDCCDWCECCFCSNYEAGI----QMIC 3 | YETDVPSCFQWIQWHPMWSTPMRKGIMYEREKHSSGFPNDWWAPQFVTLYYHLDDGSMDWCECCFCSCYEAGIKMTRQMIC -------------------------------------------------------------------------------- /output/Assignment_07E.txt: -------------------------------------------------------------------------------- 1 | (519, 525) (520, 526) -------------------------------------------------------------------------------- /output/Assignment_07G.txt: -------------------------------------------------------------------------------- 1 | 4 2 | CTCATG--T------C-G 3 | ---TTG-GT---GGCCT- 4 | ---ATGA-TAAA---C-- -------------------------------------------------------------------------------- /output/Assignment_08B.txt: -------------------------------------------------------------------------------- 1 | 166 -------------------------------------------------------------------------------- /output/Assignment_08C.txt: -------------------------------------------------------------------------------- 1 | 8971 -------------------------------------------------------------------------------- /output/Assignment_08D.txt: -------------------------------------------------------------------------------- 1 | (3101, 8976) 2 | (3101, 19060) 3 | (3312, 1506) 4 | (3312, 13771) 5 | (9180, 5713) 6 | (9181, 5714) 7 | (9181, 16928) 8 | (9182, 16929) 9 | (9622, 15514) 10 | (14233, 1837) 11 | (14233, 14775) 12 | (14233, 15891) 13 | (14383, 1556) 14 | (14383, 4129) 15 | (14383, 6978) 16 | (17714, 8237) 17 | (17715, 
8236) 18 | (17715, 8740) 19 | (18451, 7648) 20 | (18451, 10875) 21 | (18451, 16548) 22 | (21500, 11515) 23 | (21501, 11516) 24 | (21501, 19082) 25 | (23126, 12554) 26 | (23127, 2982) 27 | (23127, 12555) 28 | (23128, 2981) 29 | (23128, 12556) 30 | (23129, 12557) 31 | (23826, 10609) 32 | (23826, 13960) 33 | (23826, 18178) 34 | (26499, 6943) 35 | (35242, 12833) 36 | (35243, 1364) 37 | (35243, 12832) 38 | (35244, 12831) 39 | (36141, 7648) 40 | (36141, 10875) 41 | (36141, 16548) 42 | (36971, 15514) 43 | (41047, 1837) 44 | (41047, 14775) 45 | (41047, 15891) 46 | (41138, 10574) 47 | (41139, 10573) 48 | (41140, 8922) 49 | (41140, 10572) 50 | (41140, 10738) 51 | (41141, 8921) 52 | (41141, 10571) 53 | (41142, 8920) 54 | (41142, 10570) 55 | (41143, 10569) 56 | (41144, 10568) 57 | (42857, 7145) 58 | (42857, 9550) 59 | (42857, 14153) 60 | (42858, 7144) 61 | (47548, 8236) 62 | (47548, 8740) 63 | (48000, 850) 64 | (48001, 849) 65 | (49205, 10819) 66 | (49205, 19926) 67 | (49206, 19925) 68 | (50555, 5713) 69 | (50556, 5714) 70 | (50556, 16928) 71 | (59633, 2814) 72 | (59634, 2140) 73 | (59634, 2813) 74 | (59634, 8208) 75 | (59635, 2812) 76 | (61593, 18177) 77 | (61594, 10609) 78 | (61594, 13960) 79 | (61594, 18178) 80 | (61767, 7649) 81 | (61767, 16549) 82 | (61768, 7648) 83 | (61768, 10875) 84 | (61768, 16548) 85 | (61769, 7647) 86 | (61769, 10874) 87 | (61769, 16547) 88 | (61770, 16546) 89 | (67029, 16511) 90 | (67030, 3173) 91 | (67030, 16510) 92 | (68070, 1837) 93 | (68070, 14775) 94 | (68070, 15891) 95 | (68071, 1838) 96 | (68071, 15890) 97 | (69651, 7585) 98 | (73329, 8922) 99 | (73329, 10572) 100 | (73329, 10738) 101 | (73330, 10739) 102 | (81821, 16511) 103 | (81822, 3173) 104 | (81822, 16510) 105 | (81823, 3172) 106 | (86287, 6477) 107 | (86288, 6478) 108 | (86289, 2231) 109 | (86289, 6479) 110 | (87799, 8975) 111 | (87800, 8976) 112 | (87800, 19060) 113 | (90479, 1558) 114 | (90480, 1557) 115 | (90481, 1556) 116 | (90481, 4129) 117 | (90481, 6978) 118 | (90482, 1555) 119 
| (90482, 4130) 120 | (90482, 6977) 121 | (90483, 1554) 122 | (90484, 1553) 123 | (90485, 1552) 124 | (90486, 1551) 125 | (90834, 14331) 126 | (97463, 2983) 127 | (97464, 2982) 128 | (97464, 12555) 129 | (97465, 2981) 130 | (97465, 12556) 131 | (100991, 73) 132 | (100991, 4236) 133 | (100992, 74) 134 | (100992, 4235) 135 | (100993, 4234) 136 | (101660, 11168) 137 | (101661, 11169) 138 | (103607, 2183) 139 | (103607, 11622) 140 | (106485, 2089) 141 | (110479, 4310) 142 | (110480, 3229) 143 | (110480, 4311) 144 | (110481, 3228) 145 | (116572, 1506) 146 | (116572, 13771) 147 | (116731, 11567) 148 | (116732, 11568) 149 | (116733, 8706) 150 | (116733, 11569) 151 | (119468, 11912) 152 | (119468, 18358) 153 | (119637, 9266) 154 | (120783, 12945) 155 | (121023, 3914) 156 | (121023, 16412) 157 | (122187, 15277) 158 | (122188, 15082) 159 | (122188, 15276) 160 | (123008, 2139) 161 | (123009, 2140) 162 | (123009, 2813) 163 | (123009, 8208) 164 | (125462, 3914) 165 | (125462, 16412) 166 | (126911, 19084) 167 | (126912, 19083) 168 | (126913, 11516) 169 | (126913, 19082) 170 | (126914, 11517) 171 | (138848, 1509) 172 | (138849, 1508) 173 | (138850, 1507) 174 | (138851, 1506) 175 | (138851, 13771) 176 | (139512, 19957) 177 | (140958, 9554) 178 | (140959, 9553) 179 | (140960, 9552) 180 | (140961, 9551) 181 | (140962, 7145) 182 | (140962, 9550) 183 | (140962, 14153) 184 | (142363, 4309) 185 | (142364, 4310) 186 | (142365, 3229) 187 | (142365, 4311) 188 | (142366, 4312) 189 | (148879, 73) 190 | (148879, 4236) 191 | (149290, 11568) 192 | (149291, 8706) 193 | (149291, 11569) 194 | (149292, 8705) 195 | (152155, 4127) 196 | (152156, 4128) 197 | (152156, 6979) 198 | (152157, 1556) 199 | (152157, 4129) 200 | (152157, 6978) 201 | (156885, 18356) 202 | (156886, 18357) 203 | (156887, 11912) 204 | (156887, 18358) 205 | (160278, 6211) 206 | (160278, 12265) 207 | (164471, 19612) 208 | (165915, 7145) 209 | (165915, 9550) 210 | (165915, 14153) 211 | (167383, 2231) 212 | (167383, 6479) 213 | 
(167384, 6480) 214 | (170530, 8922) 215 | (170530, 10572) 216 | (170530, 10738) 217 | (173043, 7079) 218 | (173043, 9036) 219 | (178246, 2184) 220 | (178247, 2183) 221 | (178247, 11622) 222 | (178248, 2182) 223 | (184381, 6212) 224 | (184382, 6211) 225 | (184382, 12265) 226 | (188452, 10609) 227 | (188452, 13960) 228 | (188452, 18178) 229 | (188453, 10610) 230 | (188453, 13959) 231 | (189341, 16043) 232 | (190643, 15278) 233 | (190644, 15277) 234 | (190645, 15082) 235 | (190645, 15276) 236 | (190646, 15083) 237 | (195082, 2138) 238 | (195083, 2139) 239 | (195084, 2140) 240 | (195084, 2813) 241 | (195084, 8208) 242 | (195085, 8207) 243 | (195238, 7079) 244 | (195238, 9036) 245 | (195239, 7080) 246 | (196374, 12809) 247 | (196375, 12808) 248 | (196789, 15082) 249 | (196789, 15276) 250 | (196790, 15275) 251 | (198444, 10819) 252 | (198444, 19926) 253 | (198445, 19925) 254 | (199587, 1364) 255 | (199587, 12832) -------------------------------------------------------------------------------- /output/Assignment_09B.txt: -------------------------------------------------------------------------------- 1 | 36 43 618 1082 1089 1286 1293 2030 2224 2231 2329 2430 2577 2584 2827 2952 2967 3142 3149 3239 3410 3803 3810 4102 4109 4290 4297 4857 4864 5287 5386 5418 5500 6335 6342 6615 6636 6643 6880 7117 7124 7291 7298 7707 7714 8267 8576 8583 8676 9094 9145 -------------------------------------------------------------------------------- /output/Assignment_09E.txt: -------------------------------------------------------------------------------- 1 | AAAGTCTGGTAATT -------------------------------------------------------------------------------- /output/Assignment_09F.txt: -------------------------------------------------------------------------------- 1 | GAAAAAAT -------------------------------------------------------------------------------- /scripts/DNA_RNA_Operations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 
'''ROSALIND bioinformatics scripts that operate on DNA and RNA.'''

try:
    # Python 2: maketrans is a function of the string module.
    from string import maketrans
except ImportError:
    # Python 3: string.maketrans is gone; the equivalent is str.maketrans.
    # Binding it to the same name keeps the module importable on both.
    maketrans = str.maketrans

# Complement tables are constant, so build them once at import time instead
# of on every call.
_DNA_COMPLEMENT = maketrans('ATCG', 'TAGC')
_RNA_COMPLEMENT = maketrans('AUCG', 'UAGC')


def _reverse_complement(nucleic_acid, transtab):
    '''Complements nucleic_acid with transtab, then reverses the result.

    Characters that are not in the table are passed through unchanged.
    Reversing moves any trailing whitespace of the input (e.g. the newline
    left over from reading a file) to the front, where lstrip() drops it.
    '''
    return nucleic_acid.translate(transtab)[::-1].lstrip()


# Kind of pointless, as it's so simple.
def DNA_to_RNA(dna):
    '''Translates DNA to RNA by replacing every 'T' with 'U'.'''
    return dna.replace('T', 'U')


# Kind of pointless, as it's so simple.
def RNA_to_DNA(rna):
    '''Translates RNA to DNA by replacing every 'U' with 'T'.'''
    return rna.replace('U', 'T')


def ReverseComplementDNA(nucleic_acid):
    '''Returns the reverse complement of a given DNA strand.'''
    return _reverse_complement(nucleic_acid, _DNA_COMPLEMENT)


def ReverseComplementRNA(nucleic_acid):
    '''Returns the reverse complement of a given RNA strand.'''
    return _reverse_complement(nucleic_acid, _RNA_COMPLEMENT)


def HammingDistance(seq1, seq2):
    '''Returns the Hamming distance between equal-length sequences.

    Raises ValueError if the two sequences differ in length.
    '''
    if len(seq1) != len(seq2):
        raise ValueError('Undefined for sequences of unequal length.')
    # Each mismatching position contributes True (== 1) to the sum.
    return sum(ch1 != ch2 for ch1, ch2 in zip(seq1, seq2))


# --------------------------------------------------------------------------------
# /scripts/Protein_Dictionaries.py:
# --------------------------------------------------------------------------------
# #!/usr/bin/env python
# A ROSALIND bioinformatics script to create RNA and DNA to Protein dictionaries.


def _parse_codon_table(table):
    '''Splits a raw "CODON AMINO_ACID" table into a list of [codon, amino_acid] lists.'''
    return [row.strip().split() for row in table.split('\n')]


def ProteinDictDNA():
    '''Returns a dictionary that translates DNA codons to Protein.

    Stop codons map to the string 'Stop'.
    '''
    return {codon: amino_acid for codon, amino_acid in CodonTableDNA()}


def ProteinDictRNA():
    '''Returns a dictionary that translates RNA codons to Protein.

    Stop codons map to the string 'Stop'.
    '''
    return {codon: amino_acid for codon, amino_acid in CodonTableRNA()}


def ProteinWeightDict():
    '''Returns a dictionary that translates Protein to Monoisotopic Mass.

    Keys are single-letter amino acid codes; values are floats.
    '''
    table = '''A 71.03711
C 103.00919
D 115.02694
E 129.04259
F 147.06841
G 57.02146
H 137.05891
I 113.08406
K 128.09496
L 113.08406
M 131.04049
N 114.04293
P 97.05276
Q 128.05858
R 156.10111
S 87.03203
T 101.04768
V 99.06841
W 186.07931
Y 163.06333'''

    protein_weight_dict = dict()

    for row in table.split('\n'):
        # split() with no argument discards any surrounding tabs/spaces, so
        # each row only needs to be tokenized once.
        fields = row.split()
        protein_weight_dict[fields[0]] = float(fields[1])

    return protein_weight_dict


def CodonTableDNA():
    '''Returns a DNA Codon translation list of [codon, amino_acid] pairs.'''
    table = '''TTT F
CTT L
ATT I
GTT V
TTC F
CTC L
ATC I
GTC V
TTA L
CTA L
ATA I
GTA V
TTG L
CTG L
ATG M
GTG V
TCT S
CCT P
ACT T
GCT A
TCC S
CCC P
ACC T
GCC A
TCA S
CCA P
ACA T
GCA A
TCG S
CCG P
ACG T
GCG A
TAT Y
CAT H
AAT N
GAT D
TAC Y
CAC H
AAC N
GAC D
TAA Stop
CAA Q
AAA K
GAA E
TAG Stop
CAG Q
AAG K
GAG E
TGT C
CGT R
AGT S
GGT G
TGC C
CGC R
AGC S
GGC G
TGA Stop
CGA R
AGA R
GGA G
TGG W
CGG R
AGG R
GGG G'''

    return _parse_codon_table(table)


def CodonTableRNA():
    '''Returns an RNA Codon translation list of [codon, amino_acid] pairs.'''
    table = '''UUU F
UUC F
UUA L
UUG L
UCU S
UCC S
UCA S
UCG S
UAU Y
UAC Y
UAA Stop
UAG Stop
UGU C
UGC C
UGA Stop
UGG W
CUU L
CUC L
CUA L
CUG L
CCU P
CCC P
CCA P
CCG P
CAU H
CAC H
CAA Q
CAG Q
CGU R
CGC R
CGA R
CGG R
AUU I
AUC I
AUA I
AUG M
ACU T
ACC T
ACA T
ACG T
AAU N
AAC N
AAA K
AAG K
AGU S
AGC S
AGA R
AGG R
GUU V
GUC V
GUA V
GUG V
GCU A
GCC A
GCA A
GCG A
GAU D
GAC D
GAA E
GAG E
GGU G
GGC G
GGA G
GGG G'''

    return _parse_codon_table(table)


# --------------------------------------------------------------------------------
# /scripts/__init__.py:
# --------------------------------------------------------------------------------
# NOTE: only the head of this file falls inside this span; it is kept here as
# comments and the remainder of the file follows below.
# #!/usr/bin/env python
# '''
# Scripts for functions common to multiple programming assignments for Bioinformatics Algorithms on Coursera.
4 | ''' 5 | 6 | from DNA_RNA_Operations import DNA_to_RNA, RNA_to_DNA, ReverseComplementDNA, ReverseComplementRNA, HammingDistance 7 | from generalized_suffix_tree import GeneralizedSuffixTree 8 | from Protein_Dictionaries import ProteinDictDNA, ProteinDictRNA, ProteinWeightDict 9 | from scoring_matrices import BLOSUM62, PAM250 10 | from trie import Trie 11 | -------------------------------------------------------------------------------- /scripts/data/BLOSUM62.txt: -------------------------------------------------------------------------------- 1 | A A 4 2 | A C 0 3 | A D -2 4 | A E -1 5 | A F -2 6 | A G 0 7 | A H -2 8 | A I -1 9 | A K -1 10 | A L -1 11 | A M -1 12 | A N -2 13 | A P -1 14 | A Q -1 15 | A R -1 16 | A S 1 17 | A T 0 18 | A V 0 19 | A W -3 20 | A Y -2 21 | C A 0 22 | C C 9 23 | C D -3 24 | C E -4 25 | C F -2 26 | C G -3 27 | C H -3 28 | C I -1 29 | C K -3 30 | C L -1 31 | C M -1 32 | C N -3 33 | C P -3 34 | C Q -3 35 | C R -3 36 | C S -1 37 | C T -1 38 | C V -1 39 | C W -2 40 | C Y -2 41 | D A -2 42 | D C -3 43 | D D 6 44 | D E 2 45 | D F -3 46 | D G -1 47 | D H -1 48 | D I -3 49 | D K -1 50 | D L -4 51 | D M -3 52 | D N 1 53 | D P -1 54 | D Q 0 55 | D R -2 56 | D S 0 57 | D T -1 58 | D V -3 59 | D W -4 60 | D Y -3 61 | E A -1 62 | E C -4 63 | E D 2 64 | E E 5 65 | E F -3 66 | E G -2 67 | E H 0 68 | E I -3 69 | E K 1 70 | E L -3 71 | E M -2 72 | E N 0 73 | E P -1 74 | E Q 2 75 | E R 0 76 | E S 0 77 | E T -1 78 | E V -2 79 | E W -3 80 | E Y -2 81 | F A -2 82 | F C -2 83 | F D -3 84 | F E -3 85 | F F 6 86 | F G -3 87 | F H -1 88 | F I 0 89 | F K -3 90 | F L 0 91 | F M 0 92 | F N -3 93 | F P -4 94 | F Q -3 95 | F R -3 96 | F S -2 97 | F T -2 98 | F V -1 99 | F W 1 100 | F Y 3 101 | G A 0 102 | G C -3 103 | G D -1 104 | G E -2 105 | G F -3 106 | G G 6 107 | G H -2 108 | G I -4 109 | G K -2 110 | G L -4 111 | G M -3 112 | G N 0 113 | G P -2 114 | G Q -2 115 | G R -2 116 | G S 0 117 | G T -2 118 | G V -3 119 | G W -2 120 | G Y -3 121 | H A -2 122 | H C -3 
123 | H D -1 124 | H E 0 125 | H F -1 126 | H G -2 127 | H H 8 128 | H I -3 129 | H K -1 130 | H L -3 131 | H M -2 132 | H N 1 133 | H P -2 134 | H Q 0 135 | H R 0 136 | H S -1 137 | H T -2 138 | H V -3 139 | H W -2 140 | H Y 2 141 | I A -1 142 | I C -1 143 | I D -3 144 | I E -3 145 | I F 0 146 | I G -4 147 | I H -3 148 | I I 4 149 | I K -3 150 | I L 2 151 | I M 1 152 | I N -3 153 | I P -3 154 | I Q -3 155 | I R -3 156 | I S -2 157 | I T -1 158 | I V 3 159 | I W -3 160 | I Y -1 161 | K A -1 162 | K C -3 163 | K D -1 164 | K E 1 165 | K F -3 166 | K G -2 167 | K H -1 168 | K I -3 169 | K K 5 170 | K L -2 171 | K M -1 172 | K N 0 173 | K P -1 174 | K Q 1 175 | K R 2 176 | K S 0 177 | K T -1 178 | K V -2 179 | K W -3 180 | K Y -2 181 | L A -1 182 | L C -1 183 | L D -4 184 | L E -3 185 | L F 0 186 | L G -4 187 | L H -3 188 | L I 2 189 | L K -2 190 | L L 4 191 | L M 2 192 | L N -3 193 | L P -3 194 | L Q -2 195 | L R -2 196 | L S -2 197 | L T -1 198 | L V 1 199 | L W -2 200 | L Y -1 201 | M A -1 202 | M C -1 203 | M D -3 204 | M E -2 205 | M F 0 206 | M G -3 207 | M H -2 208 | M I 1 209 | M K -1 210 | M L 2 211 | M M 5 212 | M N -2 213 | M P -2 214 | M Q 0 215 | M R -1 216 | M S -1 217 | M T -1 218 | M V 1 219 | M W -1 220 | M Y -1 221 | N A -2 222 | N C -3 223 | N D 1 224 | N E 0 225 | N F -3 226 | N G 0 227 | N H 1 228 | N I -3 229 | N K 0 230 | N L -3 231 | N M -2 232 | N N 6 233 | N P -2 234 | N Q 0 235 | N R 0 236 | N S 1 237 | N T 0 238 | N V -3 239 | N W -4 240 | N Y -2 241 | P A -1 242 | P C -3 243 | P D -1 244 | P E -1 245 | P F -4 246 | P G -2 247 | P H -2 248 | P I -3 249 | P K -1 250 | P L -3 251 | P M -2 252 | P N -2 253 | P P 7 254 | P Q -1 255 | P R -2 256 | P S -1 257 | P T -1 258 | P V -2 259 | P W -4 260 | P Y -3 261 | Q A -1 262 | Q C -3 263 | Q D 0 264 | Q E 2 265 | Q F -3 266 | Q G -2 267 | Q H 0 268 | Q I -3 269 | Q K 1 270 | Q L -2 271 | Q M 0 272 | Q N 0 273 | Q P -1 274 | Q Q 5 275 | Q R 1 276 | Q S 0 277 | Q T -1 278 | Q V -2 279 | Q W -2 280 | 
Q Y -1 281 | R A -1 282 | R C -3 283 | R D -2 284 | R E 0 285 | R F -3 286 | R G -2 287 | R H 0 288 | R I -3 289 | R K 2 290 | R L -2 291 | R M -1 292 | R N 0 293 | R P -2 294 | R Q 1 295 | R R 5 296 | R S -1 297 | R T -1 298 | R V -3 299 | R W -3 300 | R Y -2 301 | S A 1 302 | S C -1 303 | S D 0 304 | S E 0 305 | S F -2 306 | S G 0 307 | S H -1 308 | S I -2 309 | S K 0 310 | S L -2 311 | S M -1 312 | S N 1 313 | S P -1 314 | S Q 0 315 | S R -1 316 | S S 4 317 | S T 1 318 | S V -2 319 | S W -3 320 | S Y -2 321 | T A 0 322 | T C -1 323 | T D -1 324 | T E -1 325 | T F -2 326 | T G -2 327 | T H -2 328 | T I -1 329 | T K -1 330 | T L -1 331 | T M -1 332 | T N 0 333 | T P -1 334 | T Q -1 335 | T R -1 336 | T S 1 337 | T T 5 338 | T V 0 339 | T W -2 340 | T Y -2 341 | V A 0 342 | V C -1 343 | V D -3 344 | V E -2 345 | V F -1 346 | V G -3 347 | V H -3 348 | V I 3 349 | V K -2 350 | V L 1 351 | V M 1 352 | V N -3 353 | V P -2 354 | V Q -2 355 | V R -3 356 | V S -2 357 | V T 0 358 | V V 4 359 | V W -3 360 | V Y -1 361 | W A -3 362 | W C -2 363 | W D -4 364 | W E -3 365 | W F 1 366 | W G -2 367 | W H -2 368 | W I -3 369 | W K -3 370 | W L -2 371 | W M -1 372 | W N -4 373 | W P -4 374 | W Q -2 375 | W R -3 376 | W S -3 377 | W T -2 378 | W V -3 379 | W W 11 380 | W Y 2 381 | Y A -2 382 | Y C -2 383 | Y D -3 384 | Y E -2 385 | Y F 3 386 | Y G -3 387 | Y H 2 388 | Y I -1 389 | Y K -2 390 | Y L -1 391 | Y M -1 392 | Y N -2 393 | Y P -3 394 | Y Q -1 395 | Y R -2 396 | Y S -2 397 | Y T -2 398 | Y V -1 399 | Y W 2 400 | Y Y 7 401 | -------------------------------------------------------------------------------- /scripts/data/PAM250.txt: -------------------------------------------------------------------------------- 1 | A C D E F G H I K L M N P Q R S T V W Y 2 | A 2 -2 0 0 -3 1 -1 -1 -1 -2 -1 0 1 0 -2 1 1 0 -6 -3 3 | C -2 12 -5 -5 -4 -3 -3 -2 -5 -6 -5 -4 -3 -5 -4 0 -2 -2 -8 0 4 | D 0 -5 4 3 -6 1 1 -2 0 -4 -3 2 -1 2 -1 0 0 -2 -7 -4 5 | E 0 -5 3 4 -5 0 1 -2 0 -3 -2 1 -1 2 -1 0 0 -2 
class BLOSUM62(object):
    """The BLOSUM62 scoring matrix class.

    The scores are read from ``data/BLOSUM62.txt`` (resolved relative to this
    module) into ``self.scoring_matrix``, a dictionary keyed by pairs of
    letters.  Each non-blank line of the data file holds two letters followed
    by their integer score, e.g. ``R K 2``.
    """

    def __init__(self):
        """Initialize the scoring matrix from the bundled data file.

        Raises:
            IOError / OSError: if the data file cannot be opened.
            IndexError: if a non-blank line has fewer than three fields.
            ValueError: if the score field of a line is not an integer.
        """
        import os

        matrix_path = os.path.join(os.path.dirname(__file__), 'data/BLOSUM62.txt')

        self.scoring_matrix = {}
        with open(matrix_path) as input_data:
            # Iterate over the file directly; there is no need to hold every
            # line in memory via readlines().
            for line in input_data:
                fields = line.split()
                # Blank (or whitespace-only) lines, such as a trailing empty
                # line, carry no score; skip them instead of failing on
                # fields[0].
                if not fields:
                    continue
                self.scoring_matrix[fields[0], fields[1]] = int(fields[2])

    def __getitem__(self, pair):
        """Returns the score of the given pair of proteins.

        ``pair`` is any indexable object whose first two items are the two
        letters to look up, e.g. ``matrix['A', 'C']`` or ``matrix['AC']``.

        Raises:
            KeyError: if the pair is not present in the scoring matrix.
        """
        return self.scoring_matrix[pair[0], pair[1]]
class PAM250(object):
    """Scoring matrix for PAM250, held as a pandas data frame."""

    def __init__(self):
        """Load the PAM250 table stored alongside this module."""
        import os
        import pandas as pd

        # Locate the space-separated matrix file relative to this script,
        # then parse it into a data frame.
        module_dir = os.path.dirname(__file__)
        matrix_file = os.path.join(module_dir, 'data/PAM250.txt')
        self.scoring_matrix = pd.read_table(matrix_file, sep=' ')

    def __getitem__(self, pair):
        """Look up the score for the given pair of proteins.

        The first item of the pair selects the column of the data frame and
        the second item selects the entry within that column.
        """
        column = self.scoring_matrix[pair[0]]
        return column[pair[1]]
class Trie(object):
    """Constructs a trie for the given words.

    Nodes are numbered consecutively starting from 1, which is the root.
    ``self.nodes`` maps a node number to its node dictionary (see
    ``create_node``) and ``self.edges`` maps a ``(parent, child)`` pair of
    node numbers to the single character labelling that edge.
    """

    def __init__(self, words):
        """Initialize the nodes and edges and add the given words.

        ``words`` is either a single string or an iterable of strings.
        """
        # Initialize nodes and edges.  The root is node 1; its parent is the
        # non-existent node 0, which marks the top of the trie.
        self.nodes = {1: self.create_node(0, 0)}
        self.edges = {}

        # Construct the trie by adding the words.
        if isinstance(words, str):
            self._add_word(words)
        else:
            for word in words:
                self._add_word(word)

    @staticmethod
    def create_node(p, d):
        """Returns a fresh node dictionary.

        'parent'   = parent node number (``p``).
        'children' = list of children node numbers.
        'depth'    = length of substring up to the node (``d``).
        'end'      = boolean to determine if the node corresponds to the last
                     character of an inserted word.
        """
        return {'parent': p, 'children': [], 'depth': d, 'end': False}

    def _find_child(self, node_num, symbol):
        """Returns the child of node_num reached by an edge labelled symbol, or None."""
        for child_node in self.nodes[node_num]['children']:
            if self.edges[node_num, child_node] == symbol:
                return child_node
        return None

    def _add_word(self, current_word):
        """Adds a word to the trie."""
        # Get the insertion node and portion of the word to insert.
        insertion_node, insertion_substring = self._insert_location(current_word)

        # Begin inserting at the insertion node, one new node per character.
        for symbol in insertion_substring:
            # Node numbers are consecutive, so the next free number is the
            # current node count plus one.
            new_node = len(self.nodes) + 1

            # Add the new node to the trie, and add parent/depth/child information.
            self.nodes[new_node] = self.create_node(insertion_node, self.nodes[insertion_node]['depth'] + 1)
            self.nodes[insertion_node]['children'].append(new_node)

            # Add the new edge to the trie.
            self.edges[insertion_node, new_node] = symbol

            # Move to the new node and continue insertion.
            insertion_node = new_node

        # Mark the last node as an end node, as it is the end of the word added.
        self.nodes[insertion_node]['end'] = True

    def _insert_location(self, word_to_add, current_node=1):
        """Traverses the trie to determine the insertion point of the given word.

        Returns a tuple ``(node, remainder)``: the deepest node reachable from
        ``current_node`` by following the characters of ``word_to_add``, and
        the part of the word that is not yet in the trie.  The remainder is
        empty when the word is already a prefix of an added word.
        """
        # Walk iteratively rather than recursively so that long words neither
        # hit the recursion limit nor get re-sliced at every step.
        for index, symbol in enumerate(word_to_add):
            child_node = self._find_child(current_node, symbol)
            if child_node is None:
                # No character match: insertion starts here.
                return current_node, word_to_add[index:]
            current_node = child_node

        # The whole word is already present along a path in the trie.
        return current_node, word_to_add[len(word_to_add):]

    def word_up_to_node(self, node_num):
        """Returns the word associated with a traversal up to the given node."""
        symbols = []
        parent = self.nodes[node_num]['parent']
        while parent != 0:
            symbols.append(self.edges[parent, node_num])
            node_num = parent
            parent = self.nodes[node_num]['parent']

        # We travelled backwards, so reverse the collected characters.
        symbols.reverse()
        return ''.join(symbols)

    def prefix_in_trie(self, word_to_check, current_node=1):
        """Traverses the trie to determine if a prefix of the given word matches a pattern in the trie.

        Returns True as soon as an end node is reached while following the
        characters of ``word_to_check`` from ``current_node``; returns False
        if a character has no matching edge or the word is exhausted first.
        """
        # Iterative walk: avoids the recursion limit on long inputs.
        for symbol in word_to_check:
            if self.nodes[current_node]['end'] is True:
                # If we hit an end node then we've found a matching pattern as a prefix.
                return True

            current_node = self._find_child(current_node, symbol)
            if current_node is None:
                # No character match, and hence no prefix matching a pattern in the trie.
                return False

        # The word is exhausted: it matches only if we stopped on an end node.
        return self.nodes[current_node]['end'] is True