├── .gitattributes
├── .gitignore
├── Assignment_01A.py
├── Assignment_01B.py
├── Assignment_01C.py
├── Assignment_01D.py
├── Assignment_01E.py
├── Assignment_01F.py
├── Assignment_01G.py
├── Assignment_01H.py
├── Assignment_02A.py
├── Assignment_02B.py
├── Assignment_02C.py
├── Assignment_02D.py
├── Assignment_02E.py
├── Assignment_02F.py
├── Assignment_02G.py
├── Assignment_03A.py
├── Assignment_03B.py
├── Assignment_03C.py
├── Assignment_03D.py
├── Assignment_03E.py
├── Assignment_03F.py
├── Assignment_03G.py
├── Assignment_04A.py
├── Assignment_04B.py
├── Assignment_04C.py
├── Assignment_04D.py
├── Assignment_04E.py
├── Assignment_04E_with_NetworkX.py
├── Assignment_05A.py
├── Assignment_05B.py
├── Assignment_05C.py
├── Assignment_05D.py
├── Assignment_05E.py
├── Assignment_06A.py
├── Assignment_06B.py
├── Assignment_06C.py
├── Assignment_06D.py
├── Assignment_06E.py
├── Assignment_06F.py
├── Assignment_07A.py
├── Assignment_07B.py
├── Assignment_07C.py
├── Assignment_07D.py
├── Assignment_07E.py
├── Assignment_07F.py
├── Assignment_07G.py
├── Assignment_08A.py
├── Assignment_08B.py
├── Assignment_08C.py
├── Assignment_08D.py
├── Assignment_09A.py
├── Assignment_09B.py
├── Assignment_09D.py
├── Assignment_09E.py
├── Assignment_09F.py
├── README.md
├── data
    ├── stepic_1a.txt
    ├── stepic_1b.txt
    ├── stepic_1c.txt
    ├── stepic_1d.txt
    ├── stepic_1e.txt
    ├── stepic_1f.txt
    ├── stepic_1g.txt
    ├── stepic_1h.txt
    ├── stepic_2a.txt
    ├── stepic_2b.txt
    ├── stepic_2c.txt
    ├── stepic_2d.txt
    ├── stepic_2e.txt
    ├── stepic_2f.txt
    ├── stepic_2g.txt
    ├── stepic_3a.txt
    ├── stepic_3b.txt
    ├── stepic_3c.txt
    ├── stepic_3d.txt
    ├── stepic_3e.txt
    ├── stepic_3f.txt
    ├── stepic_3g.txt
    ├── stepic_4a.txt
    ├── stepic_4b.txt
    ├── stepic_4c.txt
    ├── stepic_4d.txt
    ├── stepic_4e.txt
    ├── stepic_5a.txt
    ├── stepic_5b.txt
    ├── stepic_5c.txt
    ├── stepic_5d.txt
    ├── stepic_5e.txt
    ├── stepic_6a.txt
    ├── stepic_6b.txt
    ├── stepic_6c.txt
    ├── stepic_6d.txt
    ├── stepic_6e.txt
    ├── stepic_6f.txt
    ├── stepic_7a.txt
    ├── stepic_7b.txt
    ├── stepic_7c.txt
    ├── stepic_7d.txt
    ├── stepic_7e.txt
    ├── stepic_7f.txt
    ├── stepic_7g.txt
    ├── stepic_8a.txt
    ├── stepic_8b.txt
    ├── stepic_8c.txt
    ├── stepic_8d.txt
    ├── stepic_9a.txt
    ├── stepic_9b.txt
    ├── stepic_9d.txt
    ├── stepic_9e.txt
    └── stepic_9f.txt
├── output
    ├── Assignment_01A.txt
    ├── Assignment_01B.txt
    ├── Assignment_01C.txt
    ├── Assignment_01D.txt
    ├── Assignment_01E.txt
    ├── Assignment_01F.txt
    ├── Assignment_01G.txt
    ├── Assignment_01H.txt
    ├── Assignment_02A.txt
    ├── Assignment_02B.txt
    ├── Assignment_02C.txt
    ├── Assignment_02D.txt
    ├── Assignment_02E.txt
    ├── Assignment_02F.txt
    ├── Assignment_02G.txt
    ├── Assignment_03A.txt
    ├── Assignment_03B.txt
    ├── Assignment_03C.txt
    ├── Assignment_03D.txt
    ├── Assignment_03E.txt
    ├── Assignment_03F.txt
    ├── Assignment_03G.txt
    ├── Assignment_04A.txt
    ├── Assignment_04B.txt
    ├── Assignment_04C.txt
    ├── Assignment_04D.txt
    ├── Assignment_04E.txt
    ├── Assignment_05A.txt
    ├── Assignment_05B.txt
    ├── Assignment_05C.txt
    ├── Assignment_05D.txt
    ├── Assignment_05E.txt
    ├── Assignment_06A.txt
    ├── Assignment_06B.txt
    ├── Assignment_06C.txt
    ├── Assignment_06D.txt
    ├── Assignment_06E.txt
    ├── Assignment_06F.txt
    ├── Assignment_07A.txt
    ├── Assignment_07B.txt
    ├── Assignment_07C.txt
    ├── Assignment_07D.txt
    ├── Assignment_07E.txt
    ├── Assignment_07F.txt
    ├── Assignment_07G.txt
    ├── Assignment_08A.txt
    ├── Assignment_08B.txt
    ├── Assignment_08C.txt
    ├── Assignment_08D.txt
    ├── Assignment_09A.txt
    ├── Assignment_09B.txt
    ├── Assignment_09D.txt
    ├── Assignment_09E.txt
    └── Assignment_09F.txt
└── scripts
    ├── DNA_RNA_Operations.py
    ├── Protein_Dictionaries.py
    ├── __init__.py
    ├── data
        ├── BLOSUM62.txt
        └── PAM250.txt
    ├── generalized_suffix_tree.py
    ├── scoring_matrices.py
    └── trie.py


/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | * text=auto
 3 | 
 4 | # Custom for Visual Studio
 5 | *.cs     diff=csharp
 6 | *.sln    merge=union
 7 | *.csproj merge=union
 8 | *.vbproj merge=union
 9 | *.fsproj merge=union
10 | *.dbproj merge=union
11 | 
12 | # Standard to msysgit
13 | *.doc	 diff=astextplain
14 | *.DOC	 diff=astextplain
15 | *.docx diff=astextplain
16 | *.DOCX diff=astextplain
17 | *.dot  diff=astextplain
18 | *.DOT  diff=astextplain
19 | *.pdf  diff=astextplain
20 | *.PDF	 diff=astextplain
21 | *.rtf	 diff=astextplain
22 | *.RTF	 diff=astextplain
23 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | #################
  2 | ## Eclipse
  3 | #################
  4 | 
  5 | *.pydevproject
  6 | .project
  7 | .metadata
  8 | bin/
  9 | tmp/
 10 | *.tmp
 11 | *.bak
 12 | *.swp
 13 | *~.nib
 14 | local.properties
 15 | .classpath
 16 | .settings/
 17 | .loadpath
 18 | 
 19 | # External tool builders
 20 | .externalToolBuilders/
 21 | 
 22 | # Locally stored "Eclipse launch configurations"
 23 | *.launch
 24 | 
 25 | # CDT-specific
 26 | .cproject
 27 | 
 28 | # PDT-specific
 29 | .buildpath
 30 | 
 31 | 
 32 | #################
 33 | ## Visual Studio
 34 | #################
 35 | 
 36 | ## Ignore Visual Studio temporary files, build results, and
 37 | ## files generated by popular Visual Studio add-ons.
 38 | 
 39 | # User-specific files
 40 | *.suo
 41 | *.user
 42 | *.sln.docstates
 43 | 
 44 | # Build results
 45 | 
 46 | [Dd]ebug/
 47 | [Rr]elease/
 48 | x64/
 49 | build/
 50 | [Bb]in/
 51 | [Oo]bj/
 52 | 
 53 | # MSTest test Results
 54 | [Tt]est[Rr]esult*/
 55 | [Bb]uild[Ll]og.*
 56 | 
 57 | *_i.c
 58 | *_p.c
 59 | *.ilk
 60 | *.meta
 61 | *.obj
 62 | *.pch
 63 | *.pdb
 64 | *.pgc
 65 | *.pgd
 66 | *.rsp
 67 | *.sbr
 68 | *.tlb
 69 | *.tli
 70 | *.tlh
 71 | *.tmp
 72 | *.tmp_proj
 73 | *.log
 74 | *.vspscc
 75 | *.vssscc
 76 | .builds
 77 | *.pidb
 78 | *.log
 79 | *.scc
 80 | 
 81 | # Visual C++ cache files
 82 | ipch/
 83 | *.aps
 84 | *.ncb
 85 | *.opensdf
 86 | *.sdf
 87 | *.cachefile
 88 | 
 89 | # Visual Studio profiler
 90 | *.psess
 91 | *.vsp
 92 | *.vspx
 93 | 
 94 | # Guidance Automation Toolkit
 95 | *.gpState
 96 | 
 97 | # ReSharper is a .NET coding add-in
 98 | _ReSharper*/
 99 | *.[Rr]e[Ss]harper
100 | 
101 | # TeamCity is a build add-in
102 | _TeamCity*
103 | 
104 | # DotCover is a Code Coverage Tool
105 | *.dotCover
106 | 
107 | # NCrunch
108 | *.ncrunch*
109 | .*crunch*.local.xml
110 | 
111 | # Installshield output folder
112 | [Ee]xpress/
113 | 
114 | # DocProject is a documentation generator add-in
115 | DocProject/buildhelp/
116 | DocProject/Help/*.HxT
117 | DocProject/Help/*.HxC
118 | DocProject/Help/*.hhc
119 | DocProject/Help/*.hhk
120 | DocProject/Help/*.hhp
121 | DocProject/Help/Html2
122 | DocProject/Help/html
123 | 
124 | # Click-Once directory
125 | publish/
126 | 
127 | # Publish Web Output
128 | *.Publish.xml
129 | *.pubxml
130 | 
131 | # NuGet Packages Directory
132 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line
133 | #packages/
134 | 
135 | # Windows Azure Build Output
136 | csx
137 | *.build.csdef
138 | 
139 | # Windows Store app package directory
140 | AppPackages/
141 | 
142 | # Others
143 | sql/
144 | *.Cache
145 | ClientBin/
146 | [Ss]tyle[Cc]op.*
147 | ~$*
148 | *~
149 | *.dbmdl
150 | *.[Pp]ublish.xml
151 | *.pfx
152 | *.publishsettings
153 | 
154 | # RIA/Silverlight projects
155 | Generated_Code/
156 | 
157 | # Backup & report files from converting an old project file to a newer
158 | # Visual Studio version. Backup files are not needed, because we have git ;-)
159 | _UpgradeReport_Files/
160 | Backup*/
161 | UpgradeLog*.XML
162 | UpgradeLog*.htm
163 | 
164 | # SQL Server files
165 | App_Data/*.mdf
166 | App_Data/*.ldf
167 | 
168 | #############
169 | ## Windows detritus
170 | #############
171 | 
172 | # Windows image file caches
173 | Thumbs.db
174 | ehthumbs.db
175 | 
176 | # Folder config file
177 | Desktop.ini
178 | 
179 | # Recycle Bin used on file shares
180 | $RECYCLE.BIN/
181 | 
182 | # Mac crap
183 | .DS_Store
184 | 
185 | 
186 | #############
187 | ## Python
188 | #############
189 | 
190 | *.py[co]
191 | 
192 | # Packages
193 | *.egg
194 | *.egg-info
195 | dist/
196 | build/
197 | eggs/
198 | parts/
199 | var/
200 | sdist/
201 | develop-eggs/
202 | .installed.cfg
203 | 
204 | # Installer logs
205 | pip-log.txt
206 | 
207 | # Unit test / coverage reports
208 | .coverage
209 | .tox
210 | 
211 | #Translations
212 | *.mo
213 | 
214 | #Mr Developer
215 | .mr.developer.cfg
216 | 


--------------------------------------------------------------------------------
/Assignment_01A.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic.
 6 | 
 7 | Problem Title: Frequent Words Problem
 8 | Assignment #: 01
 9 | Problem ID: A 
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Hidden-Messages-in-the-Replication-Origin-2/#step-4
11 | '''
12 | 
# Read the DNA string (first line) and the k-mer length k (second line).
with open('data/stepic_1a.txt') as input_data:
    dna, k = [line.strip() for line in input_data.readlines()]
    k = int(k)

# Count how many times each k-mer occurs in dna (overlapping occurrences included).
kmer_dict = dict()
for i in xrange(len(dna)-k+1):
    kmer = dna[i:i+k]
    kmer_dict[kmer] = kmer_dict.get(kmer, 0) + 1

# Compute the maximum count once instead of once per dictionary item.
# An empty dictionary (dna shorter than k) simply yields no k-mers.
max_count = max(kmer_dict.values()) if kmer_dict else 0
kmers = [kmer for kmer, count in kmer_dict.items() if count == max_count]

# Print and save the most frequent k-mers.
print(' '.join(kmers))
with open('output/Assignment_01A.txt', 'w') as output_data:
    output_data.write(' '.join(kmers))
30 | 


--------------------------------------------------------------------------------
/Assignment_01B.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Reverse Complement Problem
 8 | Assignment #: 01
 9 | Problem ID: B
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Some-Hidden-Messages-are-More-Surprising-than-Others-3/#step-2
11 | '''
12 | 
13 | from scripts import ReverseComplementDNA as RevComp
14 | 
# Load the DNA string whose reverse complement is requested.
with open('data/stepic_1b.txt') as source_file:
    dna = source_file.read().strip()

# The reverse complement helper imported from the scripts package does all
# of the work; only the result needs to be saved.
with open('output/Assignment_01B.txt', 'w') as result_file:
    result_file.write(RevComp(dna))
21 | 


--------------------------------------------------------------------------------
/Assignment_01C.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Pattern Matching Problem
 8 | Assignment #: 01
 9 | Problem ID: C 
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Some-Hidden-Messages-are-More-Surprising-than-Others-3/#step-5
11 | '''
12 | 
# The first line holds the pattern, the second line the text to search.
with open('data/stepic_1c.txt') as input_data:
    pattern, text = [line.strip() for line in input_data.readlines()]

# Collect every starting index (as a string) where the pattern occurs in the
# text; overlapping occurrences are reported as well.
width = len(pattern)
pattern_loc = [str(start) for start in xrange(len(text)-width+1)
               if text[start:start+width] == pattern]

# Print and save the solution.
print(' '.join(pattern_loc))
with open('output/Assignment_01C.txt', 'w') as output_data:
    output_data.write(' '.join(pattern_loc))
24 | 


--------------------------------------------------------------------------------
/Assignment_01D.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Clump Finding Problem
 8 | Assignment #: 01
 9 | Problem ID: D
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/An-Explosion-of-Hidden-Messages-4/#step-4
11 | '''
12 | 
def CheckClumpLength(indicies, t, L):
    '''Returns True if some run of t consecutive entries of indicies spans at most L.

    For each starting offset, the entry t-1 places further along is compared with the
    entry at the offset; the check succeeds as soon as one difference is <= L.
    Returns False when indicies holds fewer than t entries.
    '''
    return any(indicies[start+t-1] - indicies[start] <= L
               for start in range(len(indicies)-t+1))
19 | 
# The first line holds the genome, the second line the integers k, L and t.
with open('data/stepic_1d.txt') as input_data:
    dna, params = [line.strip() for line in input_data.readlines()]
k, L, t = map(int, params.split())

# Record the starting index of every occurrence of each k-mer.
# Indices are appended in increasing order, so each list is already sorted.
kmer_positions = dict()
for i in xrange(len(dna)-k+1):
    kmer_positions.setdefault(dna[i:i+k], []).append(i)

# A k-mer forms an (L, t)-clump when t of its occurrences fit entirely inside a
# window of length L.  An occurrence starting at index s ends at s+k-1, so the
# first and last of the t starting indices may differ by at most L-k (not L):
# comparing against L would accept occurrences that run past the window.
kmer_clumps = [kmer for kmer, positions in kmer_positions.items()
               if len(positions) >= t and CheckClumpLength(positions, t, L-k)]

# Print and save the solution.
print(' '.join(kmer_clumps))
with open('output/Assignment_01D.txt', 'w') as output_data:
    output_data.write(' '.join(kmer_clumps))
45 | 


--------------------------------------------------------------------------------
/Assignment_01E.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Minimum Skew Problem
 8 | Assignment #: 01
 9 | Problem ID: E
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Peculiar-Statistics-of-the-Forward-and-Reverse-Half-Strands-7/#step-6
11 | '''
12 | 
with open('data/stepic_1e.txt') as input_data:
    dna = input_data.read().strip()

# The skew of the empty prefix (position 0) is 0 and is itself a candidate for the
# minimum, so the search starts from that state rather than from a skew of 1.
skew_value, min_skew, min_ind = 0, 0, ['0']
for position, nucleotide in enumerate(dna, 1):
    # Update the skew (#G - #C) of the prefix of length `position`.
    if nucleotide == 'C':
        skew_value -= 1
    elif nucleotide == 'G':
        skew_value += 1
    # Start a new list on a strictly smaller skew, extend it on a tie.
    if skew_value < min_skew:
        min_skew = skew_value
        min_ind = [str(position)]
    elif skew_value == min_skew:
        min_ind.append(str(position))

# Print and save the solution.
print(' '.join(min_ind))
with open('output/Assignment_01E.txt', 'w') as output_data:
    output_data.write(' '.join(min_ind))
33 | 


--------------------------------------------------------------------------------
/Assignment_01F.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Approximate Pattern Matching Problem
 8 | Assignment #: 01
 9 | Problem ID: F
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Some-Hidden-Messages-are-More-Elusive-than-Others-9/#step-3
11 | '''
12 | 
# Lines in order: the pattern, the DNA string, the maximum number of mismatches.
with open('data/stepic_1f.txt') as input_data:
    pattern, dna, n = [line.strip() for line in input_data.readlines()]
n = int(n)

# Slide a window of the pattern's length across the DNA string and keep the
# starting indices whose window differs from the pattern in at most n places.
width = len(pattern)
approx_match = []
for start in xrange(len(dna)-width+1):
    mismatches = sum(1 for a, b in zip(dna[start:start+width], pattern) if a != b)
    if mismatches <= n:
        approx_match.append(str(start))

# Print and save the solution.
print(' '.join(approx_match))
with open('output/Assignment_01F.txt', 'w') as output_data:
    output_data.write(' '.join(approx_match))
29 | 


--------------------------------------------------------------------------------
/Assignment_01G.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Frequent Words with Mismatches Problem
 8 | Assignment #: 01
 9 | Problem ID: G
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Some-Hidden-Messages-are-More-Elusive-than-Others-9/#step-4
11 | '''
12 | 
13 | from itertools import combinations
14 | 
def MismatchList(kmer, d):
    '''Returns a list of all k-mers that mismatch a given k-mer by at most d characters.

    The given k-mer itself is always the first entry.  Values of d larger than
    len(kmer) are treated as len(kmer), since no more positions can differ.
    '''
    kmer_mismatches = [kmer]
    # Cap the number of mismatched positions at len(kmer): beyond that there are no
    # index combinations, and CreateMismatches would be handed an empty list.
    for i in range(1, min(d, len(kmer))+1):
        # Each combination gives the indices we want to mismatch.
        kmer_mismatches += CreateMismatches([[kmer, list(combo)] for combo in combinations(range(len(kmer)), i)])
    return kmer_mismatches
22 | 
23 | 
def CreateMismatches(swap_list):
    '''Generates k-mer mismatches by replacing the characters at given indices with mismatching characters.

    swap_list is a list of [kmer, indices] pairs.  For every pair, each listed index
    is replaced by each of the three nucleotides that differ from the current one.
    Returns the flat list of resulting k-mers; an empty swap_list gives an empty list.
    The length of the first pair's index list decides whether another round is needed,
    so all pairs are expected to carry the same number of indices.
    '''
    if not swap_list:
        return []

    nucleotides = 'ACGT'

    def swap(string, ch, i):
        # Replace the i-th character of string with the character ch.
        return string[:i]+ch+string[i+1:]

    # If we have more than one index left to mismatch, substitute at the first index
    # and repeat the process on the remaining indices.
    if len(swap_list[0][1]) > 1:
        mismatch_list = []
        for kmer, indicies in swap_list:
            index = indicies[0]
            for nuc in nucleotides:
                if nuc != kmer[index]:
                    mismatch_list.append([swap(kmer, nuc, index), indicies[1:]])
        return CreateMismatches(mismatch_list)

    # Otherwise, on the final mismatch return the list of k-mers.
    return [swap(kmer, nuc, index)
            for kmer, (index,) in swap_list
            for nuc in nucleotides if nuc != kmer[index]]
47 | 
48 | 
if __name__ == '__main__':

    # The first line holds the DNA string, the second line the integers k and d.
    with open('data/stepic_1g.txt') as input_data:
        dna, params = [line.strip() for line in input_data.readlines()]
    k, d = map(int, params.split())

    # Tally, for every window of length k, each k-mer within d mismatches of it.
    mismatch_dict = {}
    for start in xrange(len(dna)-k+1):
        for neighbour in MismatchList(dna[start:start+k], d):
            mismatch_dict[neighbour] = mismatch_dict.get(neighbour, 0) + 1

    # The maximum is computed a single time and then reused for the selection.
    max_val = max(mismatch_dict.values())
    kmers = [candidate for candidate, count in mismatch_dict.items() if count == max_val]

    # Print and save the solution.
    print(' '.join(kmers))
    with open('output/Assignment_01G.txt', 'w') as output_data:
        output_data.write(' '.join(kmers))
70 | 


--------------------------------------------------------------------------------
/Assignment_01H.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Frequent Words with Mismatches and Reverse Complements Problem
 8 | Assignment #: 01
 9 | Problem ID: H
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Some-Hidden-Messages-are-More-Elusive-than-Others-9/#step-5
11 | '''
12 | 
13 | from scripts import ReverseComplementDNA as RevComp
14 | from Assignment_01G import MismatchList
15 | 
# The first line holds the DNA string, the second line the integers k and d.
with open('data/stepic_1h.txt') as input_data:
    dna, params = [line.strip() for line in input_data.readlines()]
k, d = map(int, params.split())

# For every window of length k, tally each k-mer within d mismatches of the window
# and each k-mer within d mismatches of the window's reverse complement.
mismatch_dict = {}
for start in xrange(len(dna)-k+1):
    window = dna[start:start+k]
    for neighbour in MismatchList(window, d)+MismatchList(RevComp(window), d):
        mismatch_dict[neighbour] = mismatch_dict.get(neighbour, 0) + 1

# The maximum is computed a single time and then reused for the selection.
max_val = max(mismatch_dict.values())
kmers = [candidate for candidate, count in mismatch_dict.items() if count == max_val]

# Print and save the solution.
print(' '.join(kmers))
with open('output/Assignment_01H.txt', 'w') as output_data:
    output_data.write(' '.join(kmers))
35 | 


--------------------------------------------------------------------------------
/Assignment_02A.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Protein Translation Problem
 8 | Assignment #: 02
 9 | Problem ID: A
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/How-Do-Bacteria-Make-Antibiotics-96/#step-3
11 | '''
12 | 
13 | # This is a repeat of Rosalind Problem 008: Translating RNA into Protein.
14 | from scripts import ProteinDictRNA
15 | 
with open('data/stepic_2a.txt') as input_data:
    s = input_data.read().strip()

# Dictionary translating RNA codons to amino acids ('Stop' marks a stop codon).
rna_dict = ProteinDictRNA()

# Translate codon by codon.  Only complete codons are read (a trailing fragment of
# one or two bases is ignored), and translation ends at the first stop codon rather
# than skipping over it and continuing.
amino_acids = []
for i in range(0, len(s)-2, 3):
    amino_acid = rna_dict[s[i:i+3]]
    if amino_acid == 'Stop':
        break
    amino_acids.append(amino_acid)
s_protein = ''.join(amino_acids)

# Print and save the solution.
print(s_protein)

with open('output/Assignment_02A.txt', 'w') as output_data:
    output_data.write(s_protein)
31 | 


--------------------------------------------------------------------------------
/Assignment_02B.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Peptide Encoding Problem
 8 | Assignment #: 02
 9 | Problem ID: B
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/How-Do-Bacteria-Make-Antibiotics-96/#step-6
11 | '''
12 | 
13 | from scripts import ProteinDictDNA
14 | from scripts import ReverseComplementDNA as RevComp
15 | 
# The first line holds the DNA string, the second line the peptide.
with open('data/stepic_2b.txt') as input_data:
    dna, peptide = [line.strip() for line in input_data.readlines()]

# Dictionary translating DNA codons to amino acids.
dna_dict = ProteinDictDNA()

# Every substring that could encode the peptide is three bases per amino acid long.
window = 3*len(peptide)
codon_starts = range(0, window, 3)

encodings = []
for i in range(0, len(dna)-window+1):
    # Translate the current slice and its reverse complement to protein.
    # The reverse complement is computed once per slice, not once per codon.
    dna_slice = dna[i:i+window]
    rc_slice = RevComp(dna_slice)
    proteins = [dna_dict[dna_slice[j:j+3]] for j in codon_starts]
    proteins_rc = [dna_dict[rc_slice[j:j+3]] for j in codon_starts]

    # Check if either translation matches the peptide.
    if ''.join(proteins) == peptide or ''.join(proteins_rc) == peptide:
        encodings.append(dna_slice)

# Print and save the solution.
print('\n'.join(encodings))
with open('output/Assignment_02B.txt', 'w') as output_data:
    output_data.write('\n'.join(encodings))
36 | 


--------------------------------------------------------------------------------
/Assignment_02C.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Generating Theoretical Spectrum Problem
 8 | Assignment #: 02
 9 | Problem ID: C
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Sequencing-Antibiotics-by-Shattering-Them-into-Pieces-98/#step-3
11 | '''
12 | 
13 | from scripts import ProteinWeightDict
14 | 
def cyclospectrum(peptide):
    '''Returns the theoretical spectrum of a cyclic peptide as an ascending list of mass strings.'''
    # Dictionary giving the weight of each amino acid; weights are truncated with int().
    weight = ProteinWeightDict()
    masses = [int(weight[protein]) for protein in peptide]
    length = len(masses)

    # Start with the mass 0 and the mass of the entire peptide.
    cyclospec = [0, sum(masses)]

    # Add the mass of every proper subpeptide; doubling the mass list lets a
    # slice wrap around the end of the cyclic peptide.
    doubled = masses*2
    for size in range(1, length):
        for start in range(length):
            cyclospec.append(sum(doubled[start:start+size]))

    # Sort in ascending order and convert to strings.
    cyclospec.sort()
    return [str(mass) for mass in cyclospec]
29 | 
if __name__ == '__main__':
    # The input file holds a single peptide string.
    with open('data/stepic_2c.txt') as source_file:
        peptide = source_file.read().strip()

    # Join the spectrum once and reuse the text for both outputs.
    cyclospec = cyclospectrum(peptide)
    answer = ' '.join(cyclospec)

    # Print and save the answer.
    print(answer)
    with open('output/Assignment_02C.txt', 'w') as output_data:
        output_data.write(answer)
40 | 


--------------------------------------------------------------------------------
/Assignment_02D.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Cyclopeptide Sequencing
 8 | Assignment #: 02
 9 | Problem ID: D
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/A-Faster-Algorithm-for-Cyclopeptide-Sequencing-100/#step-4
11 | '''
12 | 
13 | from math import sqrt
14 | from scripts import ProteinWeightDict
15 | 
def append_char(add_list, add_chars):
    '''Returns every word of add_list extended by every distinct suffix in add_chars.

    Words appear in the order of add_list; for each word the suffixes follow the
    iteration order of set(add_chars), so duplicate suffixes are used only once.
    '''
    suffixes = set(add_chars)
    return [word+suffix for word in add_list for suffix in suffixes]
22 | 
def spectrum(peptide):
    '''Returns the linear spectrum of a given peptide as an ascending list of mass strings.'''
    # Dictionary giving the weight of each amino acid; weights are truncated with int().
    weight = ProteinWeightDict()
    masses = [int(weight[protein]) for protein in peptide]
    length = len(masses)

    # Start with the mass 0 and the mass of the entire peptide.
    spec = [0, sum(masses)]

    # Add the mass of every contiguous proper subpeptide (no wrap-around).
    for size in range(1, length):
        for start in range(length-size+1):
            spec.append(sum(masses[start:start+size]))

    # Sort in ascending order and convert to strings.
    spec.sort()
    return [str(mass) for mass in spec]
35 | 
# The input is a single line of whitespace-separated masses (kept as strings).
with open('data/stepic_2d.txt') as input_data:
    cyclospec = input_data.read().strip().split()

# Create the protein weight dictionary and the set of integer masses it contains.
weight = ProteinWeightDict()
known_masses = set(map(int, weight.values()))

# Let n be the length of a given peptide, and L be the length of its cyclospectrum.  Then L = n(n-1) + 2.
# Using the quadratic formula to solve for n:  n = (sqrt(4L-7) + 1)/2
n = int((sqrt(4*len(cyclospec)-7)+1)/2)

# Find the first n proteins in the peptide by scanning the spectrum (skipping the
# leading 0) for entries that equal the mass of a single protein.
# Need to be careful: two small proteins can add to be less than a larger one, so we can't just take the first n nonzero entries.
# NOTE(review): the scan assumes no two small protein masses add up to the mass of a
# larger protein.  That looks questionable for the standard integer mass table
# (e.g. 57 + 57 = 114) -- confirm against ProteinWeightDict.
protein, i = [], 1
while len(protein) != n:
    if int(cyclospec[i]) in known_masses:
        protein.append(cyclospec[i])
    i += 1

# Get the name of each protein corresponding to a given weight (if multiple, only take one).
names = []
for w in protein:
    names.append([items[0] for items in weight.items() if int(items[1]) == int(w)][0])

# Build the possible sequences one protein at a time, keeping only the candidates
# whose linear spectrum is consistent with the given spectrum.  Consistency is a
# subset test (<=): a proper-subset test would discard a peptide whose linear
# spectrum contains exactly the same masses as the given spectrum.
observed = set(cyclospec)
seq = append_char(names, names)
for repeat in xrange(1, n):
    seq = [subpeptide for subpeptide in set(seq) if set(spectrum(subpeptide)) <= observed]
    if repeat != n-1:
        seq = append_char(seq, names)

# Convert each peptide to the proper format: its protein masses joined by dashes.
cyclopeptide_sequence = ['-'.join([str(int(weight[protein])) for protein in peptide]) for peptide in seq]

# Print and save the answer.
print(' '.join(cyclopeptide_sequence))
with open('output/Assignment_02D.txt', 'w') as output_data:
    output_data.write(' '.join(cyclopeptide_sequence))
74 | 


--------------------------------------------------------------------------------
/Assignment_02E.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Leaderboard Cyclopeptide Sequencing
 8 | Assignment #: 02
 9 | Problem ID: E
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Adapting-Cyclopeptide-Sequencing-for-Spectra-with-Errors-102/#step-3
11 | '''
12 | 
13 | from scripts import ProteinWeightDict
14 | 
def append_protein(add_list):
    '''Returns a list containing all peptides from add_list with every possible protein suffix.

    Peptides appear in the order of add_list; each is extended by every key of the
    protein weight dictionary.
    '''
    # Build the weight dictionary once instead of once per peptide.
    proteins = list(ProteinWeightDict().keys())
    return [item+ch for item in add_list for ch in proteins]
21 | 
def spectrum(peptide):
	'''Returns the sorted circular spectrum of a given peptide.

	Relies on the module-level dictionary `weight` (created in the main section) to
	translate each protein letter into its mass.
	'''
	size = len(peptide)
	masses = [int(weight[protein]) for protein in peptide]
	# Doubling the mass list lets a plain slice wrap around the end of the cycle.
	wrapped = masses*2

	# Start with the mass 0 and the mass of the entire peptide.
	spec = [0, sum(masses)]
	# Add the mass of every proper subpeptide: each length from every start position.
	for length in range(1, size):
		for start in range(size):
			spec.append(sum(wrapped[start:start+length]))

	spec.sort()
	return spec
30 | 
def spectrum_score(peptide, exp_spec):
	'''Returns the number of matching masses from the spectrum of peptide when compared with the spectrum exp_spec.

	peptide: the candidate peptide, in whatever form spectrum() accepts.
	exp_spec: non-empty list of integer masses of the experimental spectrum (any order).
	Returns -1 if the peptide is heavier than the largest mass in exp_spec, otherwise the
	size of the multiset intersection of the two spectra.
	'''
	from collections import Counter

	pep_spec = spectrum(peptide)
	# spectrum() returns a sorted list, so its last entry is the total peptide mass.
	# exp_spec is not guaranteed to be sorted, so take its maximum explicitly.
	if pep_spec[-1] > max(exp_spec):
		return -1
	# Multiset intersection keeps min(count_in_pep, count_in_exp) for every mass.
	shared = Counter(pep_spec) & Counter(exp_spec)
	return sum(shared.values())
38 | 
if __name__ == '__main__':

	# First line is the leaderboard size n, second line is the experimental spectrum.
	with open('data/stepic_2e.txt') as input_data:
		lines = [line.strip() for line in input_data.readlines() if line.strip()]
	n = int(lines[0])
	spec = [int(mass) for mass in lines[1].split()]

	# Create the protein weight dictionary (read as a global by spectrum()).
	weight = ProteinWeightDict()

	def scored(peptides):
		'''Pairs each peptide with its score, dropping peptides that are too heavy (score -1).'''
		pairs = [[spectrum_score(peptide, spec), peptide] for peptide in peptides]
		return [pair for pair in pairs if pair[0] != -1]

	# Build the initial peptides.
	leaders = []
	seq = scored(append_protein(weight.keys()))

	# Build the sequence until the masses all grow too large.
	while seq:
		# Group the peptides of the current round by score.
		scores = dict()
		for score, peptide in seq:
			scores.setdefault(score, []).append(peptide)
		ranked = sorted(scores, reverse=True)

		if len(seq) < n:
			# Fewer than n candidates: the whole board survives, best scores first.
			leaders = [peptide for score in ranked for peptide in scores[score]]
		else:
			# Otherwise take whole score groups, best first, until at least n are kept (ties included).
			leaders = []
			for score in ranked:
				if len(leaders) >= n:
					break
				leaders += scores[score]

		# Use this line to reduce runtime, removes excess ties.
		# leaders = leaders[:100]

		# Generate a new sequence of scores from the leaders.
		seq = scored(append_protein(leaders))

	# The leaders are listed in descending score order, so take the first peptide as the leader peptide.
	# If no peptide ever fit under the spectrum's mass there is no leader; emit an empty answer.
	if leaders:
		leader_peptide = '-'.join([str(int(weight[protein])) for protein in leaders[0]])
	else:
		leader_peptide = ''

	# Print and save the answer.
	print(leader_peptide)
	with open('output/Assignment_02E.txt', 'w') as output_data:
		output_data.write(leader_peptide)
83 | 


--------------------------------------------------------------------------------
/Assignment_02F.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Spectral Convolution Problem
 8 | Assignment #: 02
 9 | Problem ID: F
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/The-Spectral-Convolution-Saves-the-Day-104/#step-4
11 | '''
12 | 
# Read the whitespace-separated masses of the spectrum.
with open('data/stepic_2f.txt') as input_data:
	spec = [int(mass) for mass in input_data.read().split()]

# The spectrum isn't sorted, so take every ordered pair and keep only the positive differences.
convolution = []
for minuend in spec:
	for subtrahend in spec:
		difference = minuend - subtrahend
		if difference > 0:
			convolution.append(str(difference))

# Print and save the answer.
answer = ' '.join(convolution)
print(answer)
with open('output/Assignment_02F.txt', 'w') as output_data:
	output_data.write(answer)
23 | 


--------------------------------------------------------------------------------
/Assignment_02G.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | '''
  3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
  4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
  5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
  6 | 
  7 | Problem Title: Convolution Cyclopeptide Sequencing
  8 | Assignment #: 02
  9 | Problem ID: G
 10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/The-Spectral-Convolution-Saves-the-Day-104/#step-7
 11 | '''
 12 | 
 13 | def append_protein(add_list, protein_alphabet):
 14 | 	'''Returns a list containing all peptides from add_list with every possible protein suffix.'''
 15 | 	newlist = []
 16 | 	for item in add_list:
 17 | 		for p in protein_alphabet:
 18 | 			newlist.append(item+[p])
 19 | 
 20 | 	return newlist
 21 | 
 22 | def spectrum(peptide):
 23 | 	'''Returns the circular spectrum of a given peptide.'''
 24 | 	# Initialize as the mass 0 and the mass of the entire peptide.
 25 | 	spect = [0, sum(peptide)]
 26 | 	# Find the masses of the adjacent intermediary subpeptides
 27 | 	spect += [sum([protein for protein in (peptide*2)[j:j+i]]) for i in xrange(1,len(peptide)) for j in xrange(len(peptide))]
 28 | 
 29 | 	return sorted(spect)
 30 | 
 31 | def spectrum_score(peptide, exp_spec):
 32 | 	'''Returns the number of matching masses from the spectrum of peptide when compared with the spectrum exp_spec.'''
 33 | 	pep_spec = spectrum(peptide)
 34 | 	# Return -1 if the peptide has more mass than exp_spec.
 35 | 	if pep_spec[-1] > exp_spec[-1]:
 36 | 		return -1
 37 | 	return sum([min(pep_spec.count(protein),exp_spec.count(protein)) for protein in set(pep_spec)])
 38 | 
 39 | if __name__ == '__main__':
 40 | 
 41 | 	with open('data/stepic_2g.txt') as input_data:
 42 | 		m, n, spec = [int(line.strip()) if i <= 1 else sorted(map(int,line.strip().split())) for i, line in enumerate(input_data.readlines())]
 43 | 
 44 | 	# Get the convolution.
 45 | 	convolution = [i-j for i in spec for j in spec if i-j > 0]
 46 | 
 47 | 	# Get the top M elements from the convolution that are between 57 and 200.
 48 | 	convo_dict = dict()
 49 | 	for c in set(filter(lambda c: 57<=c<=200, convolution)):
 50 | 		num_c = convolution.count(c)
 51 | 		if num_c in convo_dict:
 52 | 			convo_dict[num_c].append(c) 
 53 | 		else:
 54 | 			convo_dict[num_c] = [c]
 55 | 
 56 | 	alphabet = []
 57 | 	while len(alphabet) < m:
 58 | 		alphabet += convo_dict[max(convo_dict.keys())]
 59 | 		del convo_dict[max(convo_dict.keys())]
 60 | 
 61 | 	# Initialize the overall leader.
 62 | 	overall_leader = [-1,-1]
 63 | 	# Build the intial peptides.
 64 | 	seq = filter(lambda L: L[0] != -1, [[spectrum_score([peptide],spec), [peptide]] for peptide in alphabet]) 
 65 | 
 66 | 	# Build the sequence until the masses all grow too large.
 67 | 	while seq != []:
 68 | 
 69 | 		# Add the peptides and scores from the current round to the scores dictonary.
 70 | 		scores = dict()
 71 | 		for item in seq:
 72 | 			if item[0] in scores:
 73 | 				scores[item[0]].append(item[1])
 74 | 			else:
 75 | 				scores[item[0]] = [item[1]]
 76 | 
 77 | 		# If we have less than n total items, then use all of them.
 78 | 		if len(seq) < n:
 79 | 			leaders = [item[1] for item in seq]
 80 | 			leader_scores = [min(item[0] for item in seq)]
 81 | 
 82 | 		# Otherwise, get the n leading scores with ties, remove lower scores from dictionary.
 83 | 		else:
 84 | 			leaders, leader_scores = [], []
 85 | 			while len(leaders) < n:
 86 | 				current_max = max(filter(lambda s: s not in leader_scores, scores.keys()))
 87 | 				leaders += scores[current_max]
 88 | 				leader_scores.append(current_max)
 89 | 
 90 | 		# Use this line to reduce runtime, removes excess ties.
 91 | 		# leaders = leaders[:100]
 92 | 
 93 | 		# If necessary, update the overall leader.
 94 | 		if overall_leader[0] <= max(scores.keys()):
 95 | 			overall_leader = [max(scores.keys()), '-'.join(map(str, scores[max(scores.keys())][0]))]
 96 | 
 97 | 		# Generate a new sequence of scores from the leaders.
 98 | 		seq = filter(lambda L: L[0] != -1, [[spectrum_score(peptide,spec), peptide] for peptide in append_protein(leaders, alphabet)])
 99 | 
100 | 	# Print and save the answer.
101 | 	print overall_leader[1]
102 | 	with open('output/Assignment_02G.txt', 'w') as output_data:
103 | 		output_data.write(overall_leader[1])
104 | 


--------------------------------------------------------------------------------
/Assignment_03A.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Motif Enumeration
 8 | Assignment #: 03
 9 | Problem ID: A
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Motif-Finding-Is-More-Difficult-Than-You-Think-156/#step-7
11 | '''
12 | 
13 | from Assignment_01G import MismatchList
14 | 
# First line holds k and d; every following line is one DNA string.
with open('data/stepic_3a.txt') as input_data:
	k, d = [int(value) for value in input_data.readline().split()]
	dna_list = [line.strip() for line in input_data.readlines()]

# For each dna sequence, collect what MismatchList yields for every k-mer window.
motif_sets = []
for dna in dna_list:
	neighbours = set()
	for start in range(len(dna)-k+1):
		neighbours.update(MismatchList(dna[start:start+k], d))
	motif_sets.append(neighbours)

# Intersect all sets to get the common elements.  The answers are displayed as sorted, so we'll sort too.
common = None
for motif_set in motif_sets:
	common = motif_set if common is None else common & motif_set
motifs = sorted(common)

# Print and save the answer.
answer = ' '.join(motifs)
print(answer)
with open('output/Assignment_03A.txt', 'w') as output_data:
	output_data.write(answer)
29 | 


--------------------------------------------------------------------------------
/Assignment_03B.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Median String Problem
 8 | Assignment #: 03
 9 | Problem ID: B
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/From-Motif-Finding-to-Finding-a-Median-String-158/#step-7
11 | '''
12 | 
13 | from itertools import product
14 | from scripts import HammingDistance
15 | 
def motif_score(pattern, motif):
	'''Returns d(pattern, motif): the smallest HammingDistance between pattern and any window of motif of the same length.'''
	width = len(pattern)
	distances = []
	for start in range(len(motif)-width+1):
		distances.append(HammingDistance(motif[start:start+width], pattern))
	return min(distances)
19 | 
# First line is k; every following line is one DNA string.
with open('data/stepic_3b.txt') as input_data:
	k = int(input_data.readline())
	dna_list = [line.strip() for line in input_data.readlines()]

# Start from a sentinel score of k*len(dna_list) + 1, with no pattern chosen yet.
best_pattern = [k*len(dna_list) + 1, None]

# Check the scores of all k-mers; only a strictly better score replaces the current best.
for letters in product('ACGT', repeat=k):
	candidate = ''.join(letters)
	current_score = sum(motif_score(candidate, dna) for dna in dna_list)
	if current_score < best_pattern[0]:
		best_pattern = [current_score, candidate]

# Print and save the answer.
print(best_pattern[1])
with open('output/Assignment_03B.txt', 'w') as output_data:
	output_data.write(best_pattern[1])
37 | 


--------------------------------------------------------------------------------
/Assignment_03C.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Profile-most Probable k-mer Problem
 8 | Assignment #: 03
 9 | Problem ID: C
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Greedy-Motif-Search-159/#step-3
11 | '''
12 | 
# Lines: the dna string, k, a header row of nucleotide labels, then one row of probabilities per position.
with open('data/stepic_3c.txt') as input_data:
	dna = input_data.readline().strip()
	k = int(input_data.readline())
	rows = [line.strip().split() for line in input_data.readlines()]

# Keep the header row as strings and convert every other row to floats.
profile = [rows[0]] + [[float(value) for value in row] for row in rows[1:]]

# A dictionary relating nucleotides to their column within the profile.
nuc_loc = dict((nucleotide, index) for index, nucleotide in enumerate(profile[0]))

# Initialize the maximum probability below any real probability.
max_prob = [-1, None]

# Compute the probability of each k-mer; keep it only if it strictly beats the current maximum.
for start in range(len(dna)-k+1):
	kmer = dna[start:start+k]
	current_prob = 1
	for position, nucleotide in enumerate(kmer):
		# Row 0 is the header, so position j of the k-mer lives in row j+1.
		current_prob *= profile[position+1][nuc_loc[nucleotide]]
	if current_prob > max_prob[0]:
		max_prob = [current_prob, kmer]

# Print and save the answer.
print(max_prob[1])
with open('output/Assignment_03C.txt', 'w') as output_data:
	output_data.write(max_prob[1])
36 | 


--------------------------------------------------------------------------------
/Assignment_03D.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Greedy Motif Search
 8 | Assignment #: 03
 9 | Problem ID: D
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Greedy-Motif-Search-159/#step-5
11 | '''
12 | 
13 | from scripts import HammingDistance
14 | 
def score(motifs):
	'''Returns the score of the dna list motifs.

	For each column, adds the smallest HammingDistance between that column and a
	same-length run of a single nucleotide.
	'''
	total = 0
	for position in range(len(motifs[0])):
		column = ''.join(motif[position] for motif in motifs)
		distances = [HammingDistance(column, nucleotide*len(column)) for nucleotide in 'ACGT']
		total += min(distances)
	return total
22 | 
def profile(motifs):
	'''Returns the profile of the dna list motifs.

	motifs: non-empty list of equal-length strings over 'ACGT'.
	Returns one row per column of the motifs; each row holds the fraction of
	A, C, G and T (in that order) found in that column.
	Raises IndexError if motifs is empty or a motif is shorter than the first one.
	'''
	prof = []
	# range (not xrange) keeps the function usable on both Python 2 and 3.
	for i in range(len(motifs[0])):
		col = ''.join([motifs[j][i] for j in range(len(motifs))])
		prof.append([float(col.count(nuc))/float(len(col)) for nuc in 'ACGT'])
	return prof
30 | 
def profile_most_probable_kmer(dna, k, prof):
	'''Return the profile most probable k-mer in a given dna sequence.

	dna: string over 'ACGT'.
	k: length of the k-mer.
	prof: list of at least k rows, each holding the probabilities of A, C, G, T in that order.
	Returns the first k-mer with the highest probability, or None if dna is shorter than k.
	'''
	# A dictionary relating nucleotides to their position within a profile row.
	nuc_loc = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
	# Start below any real probability so that an all-zero profile still yields the first k-mer.
	best_prob, best_kmer = -1, None
	# range (not xrange) keeps the function usable on both Python 2 and 3.
	for i in range(len(dna)-k+1):
		kmer = dna[i:i+k]
		current_prob = 1
		for j, nucleotide in enumerate(kmer):
			current_prob *= prof[j][nuc_loc[nucleotide]]
		# Strict comparison: on a tie the earliest k-mer is kept.
		if current_prob > best_prob:
			best_prob, best_kmer = current_prob, kmer

	return best_kmer
46 | 
if __name__ == '__main__':

	# First line holds k and t; every following line is one DNA string.
	with open('data/stepic_3d.txt') as input_data:
		k, t = [int(value) for value in input_data.readline().split()]
		dna_list = [line.strip() for line in input_data.readlines()]

	# Sentinel starting score of t*k with no motifs chosen yet.
	best_score = [t*k, None]

	# Run the greedy motif search, seeding with each k-mer of the first dna sequence in turn.
	first_dna = dna_list[0]
	for start in range(len(first_dna)-k+1):
		motifs = [first_dna[start:start+k]]

		# Extend with the k-mer of each following string that is most probable under the profile so far.
		for index in range(1, t):
			motifs.append(profile_most_probable_kmer(dna_list[index], k, profile(motifs)))

		# Check to see if we have a new best scoring list of motifs.
		current_score = score(motifs)
		if current_score < best_score[0]:
			best_score = [current_score, motifs]

	# Print and save the answer.
	answer = '\n'.join(best_score[1])
	print(answer)
	with open('output/Assignment_03D.txt', 'w') as output_data:
		output_data.write(answer)
76 | 


--------------------------------------------------------------------------------
/Assignment_03E.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Greedy Motif Search with Pseudocounts
 8 | Assignment #: 03
 9 | Problem ID: E
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Motif-Finding-Meets-Oliver-Cromwell-160/#step-9
11 | '''
12 | 
13 | from Assignment_03D import score, profile_most_probable_kmer
14 | 
def profile_with_pseudocounts(motifs):
	'''Returns the profile of the dna list motifs, with a pseudocount of 1 added to every nucleotide.

	motifs: non-empty list of equal-length strings over 'ACGT'.
	Returns one row per column of the motifs; each row holds (count + 1) / (number of motifs + 4)
	for A, C, G and T in that order, so no entry is ever zero.
	Raises IndexError if motifs is empty or a motif is shorter than the first one.
	'''
	prof = []
	# range (not xrange) keeps the function usable on both Python 2 and 3.
	for i in range(len(motifs[0])):
		col = ''.join([motifs[j][i] for j in range(len(motifs))])
		prof.append([float(col.count(nuc)+1)/float(len(col)+4) for nuc in 'ACGT'])
	return prof
22 | 
if __name__ == '__main__':

	# First line holds k and t; every following line is one DNA string.
	with open('data/stepic_3e.txt') as input_data:
		k, t = [int(value) for value in input_data.readline().split()]
		dna_list = [line.strip() for line in input_data.readlines()]

	# Sentinel starting score of t*k with no motifs chosen yet.
	best_score = [t*k, None]

	# Run the greedy motif search, seeding with each k-mer of the first dna sequence in turn.
	first_dna = dna_list[0]
	for start in range(len(first_dna)-k+1):
		motifs = [first_dna[start:start+k]]

		# Extend with the k-mer of each following string that is most probable under the pseudocount profile so far.
		for index in range(1, t):
			current_profile = profile_with_pseudocounts(motifs)
			motifs.append(profile_most_probable_kmer(dna_list[index], k, current_profile))

		# Check to see if we have a new best scoring list of motifs.
		current_score = score(motifs)
		if current_score < best_score[0]:
			best_score = [current_score, motifs]

	# Print and save the answer.
	answer = '\n'.join(best_score[1])
	print(answer)
	with open('output/Assignment_03E.txt', 'w') as output_data:
		output_data.write(answer)
52 | 


--------------------------------------------------------------------------------
/Assignment_03F.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Randomized Motif Search
 8 | Assignment #: 03
 9 | Problem ID: F
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Randomized-Motif-Search-161/#step-3
11 | '''
12 | 
13 | from random import randint
14 | from Assignment_03D import score, profile_most_probable_kmer
15 | from Assignment_03E import profile_with_pseudocounts
16 | 
def motifs_from_profile(profile, dna, k):
	'''Returns, for each sequence in dna, its most probable k-mer under the given profile.'''
	motifs = []
	for seq in dna:
		motifs.append(profile_most_probable_kmer(seq, k, profile))
	return motifs
19 | 
def randomized_motif_search(dna,k,t):
	'''Runs one pass of randomized motif search.

	dna: list of at least t DNA strings, each of length >= k.
	k: motif length.
	t: number of sequences to draw motifs from.
	Returns [score, motifs] for the best motif set seen before the score stopped improving.
	'''
	# Randomly pick one k-mer from each sequence, using that sequence's own length.
	# (Reads the dna argument, not the dna_list global of the script section.)
	motifs = []
	for i in range(t):
		r = randint(0, len(dna[i])-k)
		motifs.append(dna[i][r:r+k])

	# The randomly chosen motifs are the best seen so far.
	best_score = [score(motifs), motifs]

	# Alternate between building a profile and re-picking motifs until the score no longer improves.
	while True:
		current_profile = profile_with_pseudocounts(motifs)
		motifs = motifs_from_profile(current_profile, dna, k)
		current_score = score(motifs)
		if current_score < best_score[0]:
			best_score = [current_score, motifs]
		else:
			return best_score
37 | 
if __name__ == '__main__':

	# First line holds k and t; every following line is one DNA string.
	with open('data/stepic_3f.txt') as input_data:
		k, t = [int(value) for value in input_data.readline().split()]
		dna_list = [line.strip() for line in input_data.readlines()]

	# Sentinel starting score of k*t with no motifs chosen yet.
	best_motifs = [k*t, None]

	# Repeat the randomized motif search 1000 times, keeping the strictly best result.
	for _ in range(1000):
		current_motifs = randomized_motif_search(dna_list, k, t)
		if current_motifs[0] < best_motifs[0]:
			best_motifs = current_motifs

	# Print and save the answer.
	answer = '\n'.join(best_motifs[1])
	print(answer)
	with open('output/Assignment_03F.txt', 'w') as output_data:
		output_data.write(answer)
57 | 


--------------------------------------------------------------------------------
/Assignment_03G.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic.
 6 | 
 7 | Problem Title: Gibbs Sampler
 8 | Assignment #: 03
 9 | Problem ID: G
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Gibbs-Sampling-163/#step-4
11 | '''
12 | 
13 | from random import randint
14 | from Assignment_03D import score, profile_most_probable_kmer
15 | from Assignment_03E import profile_with_pseudocounts
16 | 
def gibbs_sampler(dna,k,t,N):
	'''Runs one pass of the sampler for N iterations.

	dna: list of at least t DNA strings, each of length >= k.
	k: motif length.
	t: number of sequences to draw motifs from.
	N: number of resampling iterations.
	Returns [score, motifs] for the best motif set seen.
	'''
	# Randomly pick one k-mer from each sequence, using that sequence's own length.
	# (Reads the dna argument, not the dna_list global of the script section.)
	motifs = []
	for i in range(t):
		start = randint(0, len(dna[i])-k)
		motifs.append(dna[i][start:start+k])

	# The randomly chosen motifs are the best seen so far.
	best_score = [score(motifs), motifs]

	# Iterate motifs.
	for _ in range(N):
		# Pick one sequence and rebuild the profile from all the other motifs.
		r = randint(0,t-1)
		current_profile = profile_with_pseudocounts(motifs[:r] + motifs[r+1:])
		# NOTE(review): this replaces motif r with the single most probable k-mer, which is
		# deterministic; a Gibbs sampler normally draws the k-mer at random according to the
		# profile probabilities. Confirm whether that is intended.
		replacement = profile_most_probable_kmer(dna[r],k,current_profile)
		# Build a new list each round so the list stored in best_score is never altered.
		motifs = motifs[:r] + [replacement] + motifs[r+1:]
		current_score = score(motifs)
		if current_score < best_score[0]:
			best_score = [current_score, motifs]

	return best_score
37 | 
if __name__ == '__main__':

	# First line holds k, t and N; every following line is one DNA string.
	with open('data/stepic_3g.txt') as input_data:
		k, t, N = [int(value) for value in input_data.readline().split()]
		dna_list = [line.strip() for line in input_data.readlines()]

	# Sentinel starting score of k*t with no motifs chosen yet.
	best_motifs = [k*t, None]

	# Repeat the sampler 20 times, keeping the strictly best result.
	for _ in range(20):
		current_motifs = gibbs_sampler(dna_list, k, t, N)
		if current_motifs[0] < best_motifs[0]:
			best_motifs = current_motifs

	# Print and save the answer.
	answer = '\n'.join(best_motifs[1])
	print(answer)
	with open('output/Assignment_03G.txt', 'w') as output_data:
		output_data.write(answer)
57 | 


--------------------------------------------------------------------------------
/Assignment_04A.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: String Composition Problem
 8 | Assignment #: 04
 9 | Problem ID: A
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/The-String-Reconstruction-Problem-197/#step-3
11 | '''
12 | 
# First line is k, second line is the text.
with open('data/stepic_4a.txt') as input_data:
	k = int(input_data.readline().strip())
	text = input_data.readline().strip()

# Collect every k-mer window of text, then sort them lexicographically.
composition = [text[start:start+k] for start in range(len(text)-k+1)]
composition.sort()

# Print and save the answer.
answer = '\n'.join(composition)
print(answer)
with open('output/Assignment_04A.txt', 'w') as output_data:
	output_data.write(answer)
24 | 


--------------------------------------------------------------------------------
/Assignment_04B.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Overlap Graph Problem
 8 | Assignment #: 04
 9 | Problem ID: B
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/String-Reconstruction-as-a-Walk-Through-the-Overlap-Graph-198/#step-7
11 | '''
12 | 
# Every line of the input is one string.
with open('data/stepic_4b.txt') as input_data:
    dna = [line.strip() for line in input_data.readlines()]


def check_overlap(pair):
    '''True when the first string minus its first character equals the second minus its last.'''
    return pair[0][1:] == pair[1][:-1]


def print_overlap(pair):
    '''Formats a pair as "first -> second".'''
    return ' -> '.join(pair)


# Walk all ordered pairs of distinct positions and keep the formatted overlapping ones.
overlaps = []
for i, dna1 in enumerate(dna):
    for j, dna2 in enumerate(dna):
        if i != j and check_overlap([dna1, dna2]):
            overlaps.append(print_overlap([dna1, dna2]))

# Print and save the answer.
answer = '\n'.join(overlaps)
print(answer)
with open('output/Assignment_04B.txt', 'w') as output_data:
    output_data.write(answer)
28 | 


--------------------------------------------------------------------------------
/Assignment_04C.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: De Bruijn Graph from a String Problem
 8 | Assignment #: 04
 9 | Problem ID: C
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Another-Graph-for-String-Reconstruction-199/#step-6
11 | '''
12 | 
13 | # Read the input data.
# First line is k, second line is the dna string.
with open('data/stepic_4c.txt') as input_data:
    k = int(input_data.readline())
    dna = input_data.readline().strip()

# Map the prefix of every k-mer window to the set of suffixes that follow it.
de_bruijn_dict = dict()
for start in range(len(dna)-k+1):
    kmer = dna[start:start+k]
    de_bruijn_dict.setdefault(kmer[:-1], set()).add(kmer[1:])

# Write the De Bruijn Graph as "prefix -> suffix,suffix,..." lines.
de_bruijn = []
for prefix, followers in de_bruijn_dict.items():
    de_bruijn.append(' -> '.join([prefix, ','.join(followers)]))

# Print and save the answer.
answer = '\n'.join(de_bruijn)
print(answer)
with open('output/Assignment_04C.txt', 'w') as output_data:
    output_data.write(answer)
33 | 


--------------------------------------------------------------------------------
/Assignment_04D.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: De Bruijn Graph from k-mers Problem
 8 | Assignment #: 4
 9 | Problem ID: D
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Another-Walk-200/#step-7
11 | '''
12 | 
13 | # Read the input data.
# Every line of the input is one k-mer.
with open('data/stepic_4d.txt') as input_data:
    kmers = [line.strip() for line in input_data.readlines()]

# Map the prefix of every k-mer to the set of suffixes that follow it.
# NOTE(review): because followers are stored in a set, a k-mer that appears more than once
# contributes a single edge; confirm whether repeated edges should be kept.
de_bruijn_dict = dict()
for kmer in kmers:
    de_bruijn_dict.setdefault(kmer[:-1], set()).add(kmer[1:])

# Write the De Bruijn Graph as "prefix -> suffix,suffix,..." lines.
de_bruijn = []
for prefix, followers in de_bruijn_dict.items():
    de_bruijn.append(' -> '.join([prefix, ','.join(followers)]))

# Print and save the answer.
answer = '\n'.join(de_bruijn)
print(answer)
with open('output/Assignment_04D.txt', 'w') as output_data:
    output_data.write(answer)
32 | 


--------------------------------------------------------------------------------
/Assignment_04E.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Eulerian Cycle Problem
 8 | Assignment #: 04
 9 | Problem ID: E
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/From-Eulers-Theorem-to-an-Algorithm-for-Finding-Eulerian-Cycles-203/#step-2
11 | '''
12 | 
13 | 
def eulerian_cycle(edge_dict):
    '''Generates an Eulerian cycle from the given edges.

    edge_dict: maps each node to the list of nodes its outgoing edges lead to.
    Returns the cycle as a list of nodes, starting and ending at the first key of
    edge_dict, or an empty list when edge_dict is empty.
    Raises ValueError if some edges cannot be reached from the walk built so far
    (for example, a disconnected graph).
    The caller's dictionary is left unmodified.
    '''
    if not edge_dict:
        return []

    start = next(iter(edge_dict))
    # Work on a private copy: edges are consumed as they are walked.
    remaining = {node: list(targets) for node, targets in edge_dict.items()}

    def walk(node):
        '''Follows unused edges from node until reaching a node with none left.'''
        trail = [node]
        while node in remaining:
            successors = remaining[node]
            node = successors.pop(0)
            # Drop exhausted nodes so that membership means "still has unused edges".
            if not successors:
                del remaining[trail[-1]]
            trail.append(node)
        return trail

    # Get the initial cycle.
    path = walk(start)

    # Continually expand the cycle until every edge has been used.
    while remaining:
        for i, node in enumerate(path):
            if node in remaining:
                # Splice a detour starting and ending at this node into the path.
                path[i:i+1] = walk(node)
                break
        else:
            # No node on the path has unused edges, so the rest can never be reached.
            raise ValueError('graph has edges that are not reachable from the cycle')
    return path
55 | 
if __name__ == '__main__':

    # Parse the adjacency list; every line reads "node -> target[,target...]".
    edges = {}
    with open('data/stepic_4e.txt') as input_data:
        for line in input_data.readlines():
            edge = line.strip().split(' -> ')
            edges[int(edge[0])] = [int(target) for target in edge[1].split(',')]

    # Build the Eulerian cycle from the parsed edges.
    path = eulerian_cycle(edges)

    # Show the cycle and store the same text in the output file.
    answer = '->'.join(str(node) for node in path)
    print(answer)
    with open('output/Assignment_04E.txt', 'w') as output_data:
        output_data.write(answer)
74 | 


--------------------------------------------------------------------------------
/Assignment_04E_with_NetworkX.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Eulerian Cycle Problem
 8 | Assignment #: 04
 9 | Problem ID: E
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/From-Eulers-Theorem-to-an-Algorithm-for-Finding-Eulerian-Cycles-203/#step-2
11 | '''
12 | 
13 | import networkx as nx
14 | 
# Load every line of the adjacency list as [source, "target[,target...]"].
edges = []
with open('data/stepic_4e.txt') as input_data:
    for line in input_data.readlines():
        edges.append(line.strip().split(' -> '))

# Expand each adjacency line into one [source, target] integer pair per edge.
edges2 = []
for edge in edges:
    targets = edge[1].split(',')
    if len(targets) > 1:
        edges2.extend([int(edge[0]), int(node)] for node in targets)
    else:
        edges2.append([int(part) for part in edge])

# Build the directed graph from the edge pairs.
G = nx.DiGraph()
G.add_edges_from(edges2)

# Take the first node of every circuit edge, then repeat the first node to close the cycle.
path = []
for circuit_edge in nx.eulerian_circuit(G):
    path.append(str(circuit_edge[0]))
path.append(path[0])

# Show the cycle and store the same text in the output file.
answer = '->'.join(path)
print(answer)
with open('output/Assignment_04E.txt', 'w') as output_data:
    output_data.write(answer)
40 | 


--------------------------------------------------------------------------------
/Assignment_05A.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Eulerian Path Problem
 8 | Assignment #: 05
 9 | Problem ID: A
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/From-Eulers-Theorem-to-an-Algorithm-for-Finding-Eulerian-Cycles-203/#step-5
11 | '''
12 | 
13 | from Assignment_04E import eulerian_cycle
14 | 
15 | 
def eulerian_path(edge_dict):
    '''Returns an Eulerian path from the given edges.

    edge_dict maps each node to the list of nodes that its outgoing edges lead
    to.  The path is found by adding one edge from the node where the path has
    to end to the node where it has to start, building an Eulerian cycle with
    eulerian_cycle, and cutting that cycle open at the added edge.

    The caller's dictionary is not modified.  When every node already has as
    many incoming as outgoing edges, the Eulerian cycle itself is returned.
    ValueError is raised if the added edge cannot be found in the cycle.
    '''
    # Count the incoming edges of every node in a single pass.
    in_degree = {}
    for targets in edge_dict.values():
        for target in targets:
            in_degree[target] = in_degree.get(target, 0) + 1

    # A node with more incoming than outgoing edges is where the path ends;
    # a node with more outgoing than incoming edges is where it starts.
    path_ends, path_starts = [], []
    for node in set(in_degree) | set(edge_dict):
        out_count = len(edge_dict.get(node, ()))
        in_count = in_degree.get(node, 0)
        if out_count < in_count:
            path_ends.append(node)
        elif in_count < out_count:
            path_starts.append(node)

    # Work on a copy so that neither the extra edge nor the cycle construction
    # touches the dictionary owned by the caller.
    augmented = {node: list(targets) for node, targets in edge_dict.items()}

    # Degree surpluses and deficits sum to zero, so both lists are empty
    # together: the graph is balanced and its Eulerian cycle is the path.
    if not path_ends:
        return eulerian_cycle(augmented)

    # Add an edge connecting the end of the path back to its start.
    unbalanced_from, unbalanced_to = path_ends[-1], path_starts[-1]
    augmented.setdefault(unbalanced_from, []).append(unbalanced_to)

    # Get the Eulerian cycle from the edges, including the added edge.
    cycle = eulerian_cycle(augmented)

    # Find the location of the added edge in the Eulerian cycle.
    for divide_point in range(len(cycle) - 1):
        if cycle[divide_point] == unbalanced_from and cycle[divide_point+1] == unbalanced_to:
            break
    else:
        raise ValueError('the balancing edge does not occur in the Eulerian cycle')

    # Remove the added edge and rotate the cycle so that it starts right after
    # it; cycle[0] repeats cycle[-1], so the second slice starts at index 1.
    return cycle[divide_point+1:] + cycle[1:divide_point+1]
46 | 
if __name__ == '__main__':

    # Parse the adjacency list; every line reads "node -> target[,target...]".
    edges = {}
    with open('data/stepic_5a.txt') as input_data:
        for line in input_data.readlines():
            edge = line.strip().split(' -> ')
            edges[int(edge[0])] = [int(target) for target in edge[1].split(',')]

    # Build the Eulerian path from the parsed edges.
    path = eulerian_path(edges)

    # Show the path and store the same text in the output file.
    answer = '->'.join(str(node) for node in path)
    print(answer)
    with open('output/Assignment_05A.txt', 'w') as output_data:
        output_data.write(answer)
65 | 


--------------------------------------------------------------------------------
/Assignment_05B.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: String Reconstruction Problem
 8 | Assignment #: 05
 9 | Problem ID: B
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/From-Eulers-Theorem-to-an-Algorithm-for-Finding-Eulerian-Cycles-203/#step-6
11 | '''
12 | 
# Read the input data: every non-blank line reads "string -> next string".
with open('data/stepic_5b.txt') as input_data:
    string_dict = {}
    for line in input_data.readlines():
        if line.strip():
            parts = line.strip().split(' -> ')
            string_dict[parts[0]] = parts[1]

# Find the head and tail strings of the reconstructed string.  The head is
# never the target of an edge and the tail never the source of one; the sets
# are built once so each membership test is O(1).
sources = set(string_dict)
targets = set(string_dict.values())
head = next(key for key in string_dict if key not in targets)
tail = next(value for value in string_dict.values() if value not in sources)

# Walk from the head to the tail, collecting the first character of every string.
pieces = [head[0]]
current_str = head
while current_str != tail:
    current_str = string_dict[current_str]
    pieces.append(current_str[0])

# Complete the reconstruction by adding the end of the tail.
pieces.append(tail[1:])
reconstructed_str = ''.join(pieces)

# Print and save the answer.
print(reconstructed_str)
with open('output/Assignment_05B.txt', 'w') as output_data:
    output_data.write(reconstructed_str)
37 | 


--------------------------------------------------------------------------------
/Assignment_05C.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Universal String Problem
 8 | Assignment #: 05
 9 | Problem ID: C
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/From-Eulers-Theorem-to-an-Algorithm-for-Finding-Eulerian-Cycles-203/#step-8
11 | '''
12 | 
13 | from Assignment_04E import eulerian_cycle
14 | from itertools import product
15 | 
# The input file holds the single integer k.
with open('data/stepic_5c.txt') as input_data:
    k = int(input_data.read().strip())

# Every binary k-mer is an edge from its (k-1)-prefix to its (k-1)-suffix.
universal_dict = {}
for bits in product('01', repeat=k):
    kmer = ''.join(bits)
    universal_dict.setdefault(kmer[:-1], []).append(kmer[1:])

# Build an Eulerian cycle through all of these edges.
path = eulerian_cycle(universal_dict)

# Spell the string from the first character of each node, leaving out the
# last node because the cycle repeats its first node there.
universal_string = ''.join(node[0] for node in path[:-1])

# Show the string and store the same text in the output file.
print(universal_string)
with open('output/Assignment_05C.txt', 'w') as output_data:
    output_data.write(universal_string)
35 | 


--------------------------------------------------------------------------------
/Assignment_05D.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: String Construction from Read-Pairs Problem
 8 | Assignment #: 05
 9 | Problem ID: D
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Assembling-Read-Pairs-204/#step-14
11 | '''
12 | 
13 | from Assignment_05A import eulerian_path
14 | 
# The first line is d; every further line is a read pair "read1|read2".
with open('data/stepic_5d.txt') as input_data:
    d = int(input_data.readline())
    paired_reads = [line.strip().split('|') for line in input_data.readlines()]
    k = len(paired_reads[0][0])

# Each read pair is an edge from its pair of prefixes to its pair of suffixes.
paired_dict = {}
for pair in paired_reads:
    prefixes = (pair[0][:-1], pair[1][:-1])
    suffixes = (pair[0][1:], pair[1][1:])
    paired_dict.setdefault(prefixes, []).append(suffixes)

# Order the paired nodes along an Eulerian path.
paired_path = eulerian_path(paired_dict)

# Spell one string per side of the pairs: the first node in full, then the
# last character of every following node.
first_node, later_nodes = paired_path[0], paired_path[1:]
strings = []
for i in range(2):
    strings.append(first_node[i] + ''.join(node[i][-1] for node in later_nodes))

# Join the two strings: the first k+d characters of the first, then the second.
text = strings[0][:k+d] + strings[1]

# Show the text and store it in the output file.
print(text)
with open('output/Assignment_05D.txt', 'w') as output_data:
    output_data.write(text)
40 | 


--------------------------------------------------------------------------------
/Assignment_05E.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Contig Generation Problem
 8 | Assignment #: 05
 9 | Problem ID: E
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Epilogue-Genome-Assembly-Faces-Additional-Practical-Hurdles-205/#step-5
11 | '''
12 | 
13 | from compiler.ast import flatten
14 | 
# Read the input data, one k-mer per non-blank line.
with open('data/stepic_5e.txt') as input_data:
    kmers = [line.strip() for line in input_data.readlines() if line.strip()]

# Construct a dictionary of edges: each k-mer links its prefix to its suffix.
edges = {}
for kmer in kmers:
    edges.setdefault(kmer[:-1], []).append(kmer[1:])

# Count the incoming edges of every node once, instead of rescanning the
# complete edge list for each node.
in_degree = {}
for suffixes in edges.values():
    for suffix in suffixes:
        in_degree[suffix] = in_degree.get(suffix, 0) + 1

# Determine the balanced and unbalanced nodes.  A node is balanced when
# exactly one edge enters it and exactly one edge leaves it.
balanced, unbalanced = [], []
for node in set(in_degree) | set(edges):
    if in_degree.get(node, 0) == len(edges.get(node, ())) == 1:
        balanced.append(node)
    else:
        unbalanced.append(node)
balanced_nodes = set(balanced)


def get_contigs(s, c):
    '''Returns the contigs that leave node s, each one starting with the string c.

    Every edge out of s is extended through balanced nodes until the first
    unbalanced node.  The extension is a loop rather than a recursion so that
    long contigs cannot exceed the recursion limit.
    '''
    found = []
    for e in edges[s]:
        pieces = [c, e[-1]]
        # A balanced node has exactly one outgoing edge, so the walk is forced.
        while e in balanced_nodes:
            e = edges[e][0]
            pieces.append(e[-1])
        found.append(''.join(pieces))
    return found

# Generate the contigs, starting from every unbalanced node that has outgoing edges.
contigs = sorted(contig for start in unbalanced if start in edges for contig in get_contigs(start, start))

# Print and save the answer.
answer = '\n'.join(contigs)
print(answer)
with open('output/Assignment_05E.txt', 'w') as output_data:
    output_data.write(answer)
50 | 


--------------------------------------------------------------------------------
/Assignment_06A.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Change Problem
 8 | Assignment #: 06
 9 | Problem ID: A
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/An-Introduction-to-Dynamic-Programming-The-Change-Problem-243/#step-8
11 | '''
12 | 
13 | 
def DPChange(amount, coin_list):
    '''Gives the minimum number of coins of denominations in coin_list necessary to create the given amount.

    Raises ValueError if amount is negative, if a denomination is not
    positive, or if the amount cannot be formed from the denominations at all.
    '''
    if amount < 0:
        raise ValueError('amount must not be negative')
    if any(coin <= 0 for coin in coin_list):
        raise ValueError('coin denominations must be positive')

    # With positive denominations no amount needs more than `amount` coins,
    # so amount + 1 marks sums that have not been formed (yet).
    unreachable = amount + 1
    min_coins = [0] + [unreachable]*amount

    # Use dynamic programming to build up to the desired amount.
    for m in range(1, amount+1):
        for coin in coin_list:
            if m >= coin and min_coins[m-coin] + 1 < min_coins[m]:
                min_coins[m] = min_coins[m-coin] + 1

    # Do not hand the marker back as if it were a real answer.
    if min_coins[amount] == unreachable:
        raise ValueError('the amount cannot be formed from the given coins')
    return min_coins[amount]
25 | 
if __name__ == '__main__':

    # The first line holds the amount, the second the comma-separated coins.
    with open('data/stepic_6a.txt') as input_data:
        money = int(input_data.readline().strip())
        coins = [int(coin) for coin in input_data.readline().strip().split(',')]

    # Compute the minimum number of coins and keep it as text.
    min_number = str(DPChange(money, coins))

    # Show the answer and store it in the output file.
    print(min_number)
    with open('output/Assignment_06A.txt', 'w') as output_data:
        output_data.write(min_number)
40 | 


--------------------------------------------------------------------------------
/Assignment_06B.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Manhattan Tourist
 8 | Assignment #: 06
 9 | Problem ID: B
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/The-Manhattan-Tourist-Problem-Revisited-261/#step-8
11 | '''
12 | 
13 | 
def manhattan_tourist(n, m, down, right):
    '''Returns the length of a longest path from (0,0) to (n,m) that only moves down or right.

    down[i][j] is the weight of the edge from (i,j) to (i+1,j) and
    right[i][j] the weight of the edge from (i,j) to (i,j+1).
    '''
    from numpy import zeros

    # S[i, j] holds the length of a longest path from (0,0) to (i,j).
    S = zeros((n+1, m+1), dtype=int)

    # The first column is reachable only by moving down, the first row only by moving right.
    for i in range(1, n+1):
        S[i, 0] = S[i-1, 0] + down[i-1][0]
    for j in range(1, m+1):
        S[0, j] = S[0, j-1] + right[0][j-1]

    # Every interior node is entered either from above or from the left.
    for i in range(1, n+1):
        for j in range(1, m+1):
            S[i, j] = max(S[i-1, j] + down[i-1][j], S[i, j-1] + right[i][j-1])

    return S[n, m]
33 | 
if __name__ == '__main__':

    # The file holds n, then m, then the two weight matrices separated by a "-" line.
    with open('data/stepic_6b.txt') as input_data:
        n = int(input_data.readline())
        m = int(input_data.readline())
        matrices = []
        for matrix in input_data.read().strip().split('\n-\n'):
            matrices.append([[int(value) for value in row.split()] for row in matrix.split('\n')])
        down, right = matrices

    # Compute the length of the longest path and keep it as text.
    max_dist = str(manhattan_tourist(n, m, down, right))

    # Show the answer and store it in the output file.
    print(max_dist)
    with open('output/Assignment_06B.txt', 'w') as output_data:
        output_data.write(max_dist)
48 | 


--------------------------------------------------------------------------------
/Assignment_06C.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Longest Common Subsequence Problem
 8 | Assignment #: 06
 9 | Problem ID: C
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Backtracking-in-the-Alignment-Graph-245/#step-5
11 | '''
12 | 
13 | 
14 | # No need for two functions, as in the problem description. I replaced the recursive function
15 | # with a while loop in the first function.  Also, no need for the backtrack array, as all of
16 | # that information is easily recoverable from the original array.
def longest_common_subsequence(v, w):
    '''Returns a longest common subsequence of strings v and w.'''
    from numpy import zeros

    # S[i, j] is the length of a longest common subsequence of v[:i] and w[:j].
    S = zeros((len(v)+1, len(w)+1), dtype=int)
    for i, v_char in enumerate(v):
        for j, w_char in enumerate(w):
            if v_char == w_char:
                S[i+1, j+1] = S[i, j] + 1
            else:
                S[i+1, j+1] = max(S[i+1, j], S[i, j+1])

    # Recover one longest subsequence by walking back from the bottom-right
    # corner.  Characters are collected back to front and joined once at the
    # end instead of prepending to a string on every match.
    reversed_sseq = []
    i, j = len(v), len(w)
    while i > 0 and j > 0:
        if S[i, j] == S[i-1, j]:
            i -= 1
        elif S[i, j] == S[i, j-1]:
            j -= 1
        else:
            # The value differs from both neighbours, so v[i-1] is part of the subsequence.
            reversed_sseq.append(v[i-1])
            i -= 1
            j -= 1

    return ''.join(reversed(reversed_sseq))
44 | 
if __name__ == '__main__':

    # The input file holds the two strings, one per line.
    with open('data/stepic_6c.txt') as input_data:
        dna1, dna2 = (line.strip() for line in input_data.readlines())

    # Compute a longest common subsequence of the two strings.
    longest_subseq = longest_common_subsequence(dna1, dna2)

    # Show the answer and store it in the output file.
    print(longest_subseq)
    with open('output/Assignment_06C.txt', 'w') as output_data:
        output_data.write(longest_subseq)
58 | 


--------------------------------------------------------------------------------
/Assignment_06D.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Longest Path in a DAG Problem
 8 | Assignment #: 06
 9 | Problem ID: D
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/Backtracking-in-the-Alignment-Graph-245/#step-7
11 | '''
12 | 
13 | 
def topological_ordering(graph):
    '''Returns a topological ordering for the given graph.

    graph is an iterable of edges whose first two items are the tail and the
    head node.  Only nodes that occur in an edge are ordered.  A node whose
    incoming edges are never all removed (which happens on a cycle) is left
    out of the result.
    '''
    # Remove duplicate edges, then index successors and incoming-edge counts
    # once so that the edges do not have to be rescanned for every node.
    graph = set(graph)
    successors, in_degree = {}, {}
    for edge in graph:
        successors.setdefault(edge[0], []).append(edge[1])
        in_degree[edge[1]] = in_degree.get(edge[1], 0) + 1

    # Start with the nodes that have no incoming edge.  The ordering doubles
    # as the work queue: position marks the next node to process.
    ordering = list({edge[0] for edge in graph} - {edge[1] for edge in graph})
    position = 0
    while position < len(ordering):
        node = ordering[position]
        position += 1

        # Removing the outgoing edges of node frees every successor whose
        # last incoming edge this was.
        for successor in successors.get(node, ()):
            in_degree[successor] -= 1
            if in_degree[successor] == 0:
                ordering.append(successor)

    return ordering
41 | 
42 | 
def longest_path(graph, edges, source, sink):
    '''Returns the length and path of the longest path from source to sink.

    graph maps each edge, given as a (tail, head) tuple, to its weight.  The
    edges argument is not used; it is kept so that existing calls keep working.
    Raises ValueError if the sink cannot be reached from the source (a source
    or sink that occurs in no edge makes the index lookup below raise
    ValueError as well).
    '''
    # Get the topological ordering from the source to sink, not including the source.
    top_order = topological_ordering(graph.keys())
    top_order = top_order[top_order.index(source)+1:top_order.index(sink)+1]

    # Index the incoming edges of every node once instead of scanning all edges per node.
    incoming = {}
    for edge in graph:
        incoming.setdefault(edge[1], []).append(edge)

    # S holds the longest distance from the source for every node reached so
    # far; backtrack holds the predecessor that achieved it.  Nodes missing
    # from S count as minus infinity, so no edge weight can make a node that
    # the source does not reach look reachable.
    unreachable = float('-inf')
    S = {source: 0}
    backtrack = {}

    # Iterate through the topological order to get the distances, store predecessors in backtrack.
    for node in top_order:
        best_score, best_edge = unreachable, None
        for edge in incoming.get(node, ()):
            score = S.get(edge[0], unreachable) + graph[edge]
            # The strict comparison keeps the first of several equally good predecessors.
            if score > best_score:
                best_score, best_edge = score, edge
        # The ordering can include nodes the source does not reach; they stay out of S.
        if best_edge is not None:
            S[node] = best_score
            backtrack[node] = best_edge[0]

    if sink not in S:
        raise ValueError('the sink is not reachable from the source')

    # Backtrack to get the longest path, collecting it from the sink backwards.
    path = [sink]
    while path[-1] != source:
        path.append(backtrack[path[-1]])
    path.reverse()

    return S[sink], path
69 | 
if __name__ == '__main__':

    # The file holds the source, then the sink, then one "tail->head:weight" edge per line.
    with open('data/stepic_6d.txt') as input_data:
        source = int(input_data.readline())
        sink = int(input_data.readline())

        # Collect the adjacency lists and the weight of every edge.
        edges, edge_weight = {}, {}
        for line in input_data.readlines():
            pair = line.strip().split('->')
            tail = int(pair[0])
            target = pair[1].split(':')
            head = int(target[0])
            edges.setdefault(tail, []).append(head)
            edge_weight[tail, head] = int(target[1])

    # Compute the longest path and its length.
    length, path = longest_path(edge_weight, edges, source, sink)

    # Put the length on the first line and the arrow-joined path on the second.
    answer = '\n'.join([str(length), '->'.join(str(node) for node in path)])

    # Show the answer and store it in the output file.
    print(answer)
    with open('output/Assignment_06D.txt', 'w') as output_data:
        output_data.write(answer)
97 | 


--------------------------------------------------------------------------------
/Assignment_06E.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Global Alignment
 8 | Assignment #: 06
 9 | Problem ID: E
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/From-Global-to-Local-Alignment-247/#step-3
11 | '''
12 | 
13 | 
def global_alignment(v, w, scoring_matrix, sigma):
    '''Returns the score and a global alignment of strings v and w.

    scoring_matrix is looked up as scoring_matrix[a, b] for a character a of v
    and a character b of w, and sigma is subtracted for every indel.  The
    result is the tuple (score as a string, aligned v, aligned w), in which
    '-' marks an indel.
    '''
    rows, cols = len(v) + 1, len(w) + 1

    # S[i][j] is the best score for aligning v[:i] with w[:j]; backtrack[i][j]
    # records the move that produced it: 0 from above, 1 from the left and
    # 2 along the diagonal.
    S = [[0]*cols for _ in range(rows)]
    backtrack = [[0]*cols for _ in range(rows)]

    # Along the first row and column everything is aligned against indels.
    for i in range(1, rows):
        S[i][0] = -i*sigma
    for j in range(1, cols):
        S[0][j] = -j*sigma

    # Fill in the Score and Backtrack matrices.
    for i in range(1, rows):
        for j in range(1, cols):
            scores = [S[i-1][j] - sigma, S[i][j-1] - sigma, S[i-1][j-1] + scoring_matrix[v[i-1], w[j-1]]]
            S[i][j] = max(scores)
            # index() finds the first maximum, so ties prefer move 0, then 1, then 2.
            backtrack[i][j] = scores.index(S[i][j])

    # The score of the global alignment is found in the bottom-right corner.
    i, j = len(v), len(w)
    max_score = str(S[i][j])

    # Backtrack to the edge of the matrix, collecting the alignment columns
    # back to front; they are joined once at the end instead of re-slicing
    # the strings for every indel.
    v_columns, w_columns = [], []
    while i > 0 and j > 0:
        move = backtrack[i][j]
        if move == 0:
            i -= 1
            v_columns.append(v[i])
            w_columns.append('-')
        elif move == 1:
            j -= 1
            v_columns.append('-')
            w_columns.append(w[j])
        else:
            i -= 1
            j -= 1
            v_columns.append(v[i])
            w_columns.append(w[j])

    # Whatever is left of one string is aligned against leading indels to get to (0,0).
    while i > 0:
        i -= 1
        v_columns.append(v[i])
        w_columns.append('-')
    while j > 0:
        j -= 1
        v_columns.append('-')
        w_columns.append(w[j])

    v_columns.reverse()
    w_columns.reverse()
    return max_score, ''.join(v_columns), ''.join(w_columns)
62 | 
if __name__ == '__main__':
    from scripts import BLOSUM62

    # The input file holds the two strings, one per line.
    with open('data/stepic_6e.txt') as input_data:
        word1, word2 = (line.strip() for line in input_data.readlines())

    # Align the strings using BLOSUM62 and an indel penalty of 5.
    alignment = global_alignment(word1, word2, BLOSUM62(), 5)

    # Show the score and both aligned strings, then store the same text.
    answer = '\n'.join(alignment)
    print(answer)
    with open('output/Assignment_06E.txt', 'w') as output_data:
        output_data.write(answer)
77 | 


--------------------------------------------------------------------------------
/Assignment_06F.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Local Alignment
 8 | Assignment #: 06
 9 | Problem ID: F
10 | URL: https://beta.stepic.org/Bioinformatics-Algorithms-2/From-Global-to-Local-Alignment-247/#step-3
11 | '''
12 | 
13 | from scripts import PAM250
14 | 
15 | 
def local_alignment(v, w, scoring_matrix, sigma):
    '''Returns the score and local alignment with the given scoring matrix and indel penalty sigma for strings v, w.

    scoring_matrix is looked up as scoring_matrix[a, b] for a character a of v
    and a character b of w.  The result is the tuple (score as a string,
    aligned part of v, aligned part of w), in which '-' marks an indel.
    '''
    from numpy import unravel_index, zeros

    # S[i, j] is the best score of an alignment ending at v[:i], w[:j];
    # backtrack[i, j] records the move that produced it: 0 from above, 1 from
    # the left, 2 along the diagonal and 3 for starting afresh at this cell.
    S = zeros((len(v)+1, len(w)+1), dtype=int)
    backtrack = zeros((len(v)+1, len(w)+1), dtype=int)

    # Fill in the Score and Backtrack matrices.
    for i in range(1, len(v)+1):
        for j in range(1, len(w)+1):
            scores = [S[i-1, j] - sigma, S[i, j-1] - sigma, S[i-1, j-1] + scoring_matrix[v[i-1], w[j-1]], 0]
            S[i, j] = max(scores)
            # index() finds the first maximum, so ties prefer the lower move number.
            backtrack[i, j] = scores.index(S[i, j])

    # Get the position of the highest scoring cell in the matrix and the high score.
    i, j = unravel_index(S.argmax(), S.shape)
    i, j = int(i), int(j)
    max_score = str(S[i, j])

    # Backtrack from the highest scoring cell to the start of the local
    # alignment, collecting the alignment columns back to front; they are
    # joined once at the end instead of re-slicing the strings for every indel.
    v_columns, w_columns = [], []
    while i > 0 and j > 0 and backtrack[i, j] != 3:
        move = backtrack[i, j]
        if move == 0:
            i -= 1
            v_columns.append(v[i])
            w_columns.append('-')
        elif move == 1:
            j -= 1
            v_columns.append('-')
            w_columns.append(w[j])
        else:
            i -= 1
            j -= 1
            v_columns.append(v[i])
            w_columns.append(w[j])

    v_columns.reverse()
    w_columns.reverse()
    return max_score, ''.join(v_columns), ''.join(w_columns)
58 | 
if __name__ == '__main__':

    # The input file holds the two strings, one per line.
    with open('data/stepic_6f.txt') as input_data:
        word1, word2 = (line.strip() for line in input_data.readlines())

    # Align the strings locally using PAM250 and an indel penalty of 5.
    alignment = local_alignment(word1, word2, PAM250(), 5)

    # Show the score and both aligned strings, then store the same text.
    answer = '\n'.join(alignment)
    print(answer)
    with open('output/Assignment_06F.txt', 'w') as output_data:
        output_data.write(answer)
72 | 


--------------------------------------------------------------------------------
/Assignment_07A.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Edit Distance
 8 | Assignment #: 07
 9 | Problem ID: A
10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/The-Changing-Faces-of-Sequence-Alignment-248/step/3
11 | '''
12 | 
13 | 
def edit_distance(v,w):
    '''Returns the edit distance of strings v and w.'''
    from numpy import zeros

    # M[i, j] is the edit distance between v[:i] and w[:j].  Turning a prefix
    # into the empty string takes one operation per character.
    M = zeros((len(v)+1, len(w)+1), dtype=int)
    for i in range(1, len(v)+1):
        M[i, 0] = i
    for j in range(1, len(w)+1):
        M[0, j] = j

    # Compute each entry of M.
    for i in range(1, len(v)+1):
        for j in range(1, len(w)+1):
            if v[i-1] == w[j-1]:
                # Equal characters cost nothing.
                M[i, j] = M[i-1, j-1]
            else:
                # One more than the cheapest of the three neighbouring entries.
                M[i, j] = min(M[i-1, j]+1, M[i, j-1]+1, M[i-1, j-1]+1)

    # Return the edit distance of the complete strings.
    return M[len(v), len(w)]
35 | 
if __name__ == '__main__':

    # Read the two input strings, one per line.
    with open('data/stepic_7a.txt') as input_data:
        word1, word2 = [line.strip() for line in input_data.readlines()]

    # Get the edit distance.
    e_dist = edit_distance(word1, word2)

    # Print and save the answer.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    answer = str(e_dist)
    print(answer)
    with open('output/Assignment_07A.txt', 'w') as output_data:
        output_data.write(answer)
49 | 


--------------------------------------------------------------------------------
/Assignment_07B.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Fitting Alignment Problem
 8 | Assignment #: 07
 9 | Problem ID: B
10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/The-Changing-Faces-of-Sequence-Alignment-248/step/5
11 | '''
12 | 
13 | 
def fitting_alignment(v,w):
    '''Returns the fitting alignment of strings v and w.

    A fitting alignment aligns ALL of w against the best-scoring substring of
    v (match +1, mismatch -1, indel -1).

    Returns a tuple (score, v_aligned, w_aligned) where score is a string and
    the two aligned strings have equal length.
    '''
    rows, cols = len(v) + 1, len(w) + 1

    # Backtrack codes: 0 = up (gap in w), 1 = left (gap in v), 2 = diagonal.
    S = [[0] * cols for _ in range(rows)]
    backtrack = [[0] * cols for _ in range(rows)]

    # Column 0 stays 0: the alignment may start at any position of v for free.
    # Row 0 is penalised: every symbol of w must be aligned, so reaching
    # (0, j) means j symbols of w are paired with gaps.
    for j in range(1, cols):
        S[0][j] = -j
        backtrack[0][j] = 1

    # Fill in the Score and Backtrack matrices.
    for i in range(1, rows):
        for j in range(1, cols):
            diagonal = S[i-1][j-1] + (1 if v[i-1] == w[j-1] else -1)
            scores = [S[i-1][j] - 1, S[i][j-1] - 1, diagonal]
            S[i][j] = max(scores)
            backtrack[i][j] = scores.index(S[i][j])

    # The alignment must consume all of w, so it ends in the last column.
    # Every row is a candidate end point, including the last row of v.
    j = len(w)
    i = max(range(rows), key=lambda row: S[row][j])
    max_score = str(S[i][j])

    # Initialize the aligned strings as the input strings up to the position of the high score.
    v_aligned, w_aligned = v[:i], w

    # Quick lambda function to insert indels.
    insert_indel = lambda word, pos: word[:pos] + '-' + word[pos:]

    # Backtrack until all of w has been consumed.
    while j > 0:
        move = backtrack[i][j]
        if move == 0:
            i -= 1
            w_aligned = insert_indel(w_aligned, j)
        elif move == 1:
            j -= 1
            v_aligned = insert_indel(v_aligned, i)
        else:
            i -= 1
            j -= 1

    # Cut off the unused head of v.  Gaps were only inserted at positions >= i,
    # so the first i characters are still the untouched prefix v[:i].
    v_aligned = v_aligned[i:]

    return max_score, v_aligned, w_aligned
56 | 
if __name__ == '__main__':

    # Read the two input strings, one per line.
    with open('data/stepic_7b.txt') as input_data:
        word1, word2 = [line.strip() for line in input_data.readlines()]

    # Get the fitting alignment.
    alignment = fitting_alignment(word1, word2)

    # Print and save the answer.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    answer = '\n'.join(alignment)
    print(answer)
    with open('output/Assignment_07B.txt', 'w') as output_data:
        output_data.write(answer)
70 | 


--------------------------------------------------------------------------------
/Assignment_07C.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Overlap Alignment Problem
 8 | Assignment #: 07
 9 | Problem ID: C
10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/The-Changing-Faces-of-Sequence-Alignment-248/step/7
11 | '''
12 | 
13 | 
def overlap_alignment(v, w):
    '''Returns the overlap alignment of strings v and w.

    Scoring: match +1, mismatch and indels -2.  The alignment may start
    anywhere on the first row or column of the score matrix (leading
    characters are free) and ends at the best cell of the last row or last
    column; ties are resolved in favour of the first such cell in row-major
    order.

    Returns a tuple (score, v_aligned, w_aligned) with score as a string.
    If either input is empty the overlap is empty and ('0', '', '') is
    returned.
    '''

    # With an empty string no cell of the matrix is ever scored, so there is
    # no end point to backtrack from.
    if not v or not w:
        return '0', '', ''

    rows, cols = len(v) + 1, len(w) + 1

    # Backtrack codes: 0 = diagonal, 1 = up (gap in w), 2 = left (gap in v).
    S = [[0] * cols for _ in range(rows)]
    backtrack = [[0] * cols for _ in range(rows)]

    max_score = None
    max_indices = None

    # Fill in the Score and Backtrack arrays.
    for i in range(1, rows):
        for j in range(1, cols):
            # Match score = 1, Mismatch and Indels = -2.
            diagonal = S[i-1][j-1] + (1 if v[i-1] == w[j-1] else -2)
            scores = [diagonal, S[i-1][j] - 2, S[i][j-1] - 2]
            S[i][j] = max(scores)
            backtrack[i][j] = scores.index(S[i][j])

            # Track the best cell along the last row or column.  The strict
            # comparison keeps the first cell that reaches the maximum.
            if i == len(v) or j == len(w):
                if max_score is None or S[i][j] > max_score:
                    max_score = S[i][j]
                    max_indices = (i, j)

    # Start the backtrack from the best end point.
    i, j = max_indices

    # Initialize the aligned strings as the input strings, removing the unused tails.
    v_aligned, w_aligned = v[:i], w[:j]

    # Quick lambda function to insert indels.
    insert_indel = lambda word, pos: word[:pos] + '-' + word[pos:]

    # Backtrack to the first row or column.
    while i > 0 and j > 0:
        move = backtrack[i][j]
        if move == 1:
            i -= 1
            w_aligned = insert_indel(w_aligned, j)
        elif move == 2:
            j -= 1
            v_aligned = insert_indel(v_aligned, i)
        else:
            i -= 1
            j -= 1

    # Remove the unused heads of the aligned strings.
    v_aligned, w_aligned = v_aligned[i:], w_aligned[j:]

    return str(max_score), v_aligned, w_aligned
63 | 
if __name__ == '__main__':

    # Read the two input strings, one per line.
    with open('data/stepic_7c.txt') as input_data:
        word1, word2 = [line.strip() for line in input_data.readlines()]

    # Get the alignment.
    alignment = overlap_alignment(word1, word2)

    # Print and save the answer.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    answer = '\n'.join(alignment)
    print(answer)
    with open('output/Assignment_07C.txt', 'w') as output_data:
        output_data.write(answer)
77 | 


--------------------------------------------------------------------------------
/Assignment_07D.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Alignment with Affine Gap Penalties Problem
 8 | Assignment #: 07
 9 | Problem ID: D
10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/Penalizing-Insertions-and-Deletions-in-Sequence-Alignments-249/step/8
11 | '''
12 | 
13 | 
def global_alignment_affine_gap_penalty(v, w, scoring_matrix, sigma, epsilon):
    '''Returns the global alignment of v and w with affine gap penalties.

    scoring_matrix is indexed with a pair of symbols, scoring_matrix[a, b].
    sigma is the gap opening penalty and epsilon the gap extension penalty,
    so a gap of length k costs sigma + (k-1)*epsilon.

    Returns a tuple (score, v_aligned, w_aligned).

    Three score levels are used:
      lower  - best score of alignments ending with a symbol of v over a gap,
      upper  - best score of alignments ending with a gap over a symbol of w,
      middle - best score overall.
    Each level keeps its own back pointers so that the traceback can follow a
    gap through all of its extensions.
    '''
    # A true minus infinity marks unreachable states; a finite stand-in can be
    # beaten by a legitimately very negative score.
    neg_inf = float('-inf')
    rows, cols = len(v) + 1, len(w) + 1

    lower = [[neg_inf] * cols for _ in range(rows)]
    middle = [[neg_inf] * cols for _ in range(rows)]
    upper = [[neg_inf] * cols for _ in range(rows)]

    # Back pointers.  middle_from: 0 = lower level, 1 = diagonal, 2 = upper level.
    # *_extends is True when the gap at that cell continues an existing gap.
    middle_from = [[1] * cols for _ in range(rows)]
    lower_extends = [[False] * cols for _ in range(rows)]
    upper_extends = [[False] * cols for _ in range(rows)]

    # Initialize the edges: the only way along an edge is a single long gap.
    middle[0][0] = 0
    for i in range(1, rows):
        lower[i][0] = middle[i][0] = -sigma - (i-1)*epsilon
    for j in range(1, cols):
        upper[0][j] = middle[0][j] = -sigma - (j-1)*epsilon

    # Fill in the scores for the lower, upper and middle levels.
    for i in range(1, rows):
        for j in range(1, cols):
            extended, opened = lower[i-1][j] - epsilon, middle[i-1][j] - sigma
            lower_extends[i][j] = extended > opened
            lower[i][j] = max(extended, opened)

            extended, opened = upper[i][j-1] - epsilon, middle[i][j-1] - sigma
            upper_extends[i][j] = extended > opened
            upper[i][j] = max(extended, opened)

            middle_scores = [lower[i][j], middle[i-1][j-1] + scoring_matrix[v[i-1], w[j-1]], upper[i][j]]
            middle[i][j] = max(middle_scores)
            middle_from[i][j] = middle_scores.index(middle[i][j])

    # Start at the bottom right corner; its middle score is the maximum score.
    i, j = len(v), len(w)
    max_score = middle[i][j]

    # Alignment columns are collected back to front and reversed at the end.
    v_columns, w_columns = [], []

    # Current level of the traceback: 0 = lower, 1 = middle, 2 = upper.
    level = 1
    while i > 0 or j > 0:
        if level == 1:
            # On an edge only one kind of gap can lead back to (0, 0).
            if i == 0:
                level = 2
            elif j == 0:
                level = 0
            else:
                level = middle_from[i][j]
            if level == 1:
                v_columns.append(v[i-1])
                w_columns.append(w[j-1])
                i -= 1
                j -= 1
            # Otherwise stay on this cell and continue in the gap level.
        elif level == 0:
            v_columns.append(v[i-1])
            w_columns.append('-')
            # Return to the middle level where this gap was opened.
            if j > 0 and not lower_extends[i][j]:
                level = 1
            i -= 1
        else:
            v_columns.append('-')
            w_columns.append(w[j-1])
            if i > 0 and not upper_extends[i][j]:
                level = 1
            j -= 1

    v_aligned = ''.join(reversed(v_columns))
    w_aligned = ''.join(reversed(w_columns))

    return max_score, v_aligned, w_aligned
70 | 
if __name__ == '__main__':
    from scripts import BLOSUM62

    # Read the two protein strings, one per line.
    with open('data/stepic_7d.txt') as input_data:
        protein1, protein2 = [line.strip() for line in input_data.readlines()]

    # Get the alignment score and the aligned strings (sigma = 11, epsilon = 1).
    result = global_alignment_affine_gap_penalty(protein1, protein2, BLOSUM62(), 11, 1)

    # Build the answer once: a lazily evaluated map() would be exhausted after
    # the first join on Python 3 and leave the output file empty.
    answer = '\n'.join([str(item) for item in result])

    # Print and save the answer.
    print(answer)
    with open('output/Assignment_07D.txt', 'w') as output_data:
        output_data.write(answer)
85 | 


--------------------------------------------------------------------------------
/Assignment_07E.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Middle Edge in Linear Space Problem
 8 | Assignment #: 07
 9 | Problem ID: E
10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/SpaceEfficient-Sequence-Alignment-250/step/12
11 | '''
12 | 
13 | 
def middle_column_score(v, w, scoring_matrix, sigma):
    '''Returns the scores of the middle column for the alignment of v and w.

    The middle column is column len(w)//2 of the alignment graph.  Only two
    columns are held in memory at a time.

    Returns a pair (scores, backtrack).  scores[i] is the best score from the
    source to node (i, len(w)//2).  backtrack[i] records which edge produced
    that score: 0 = diagonal, 1 = horizontal, 2 = vertical.  backtrack[0] is
    always 0, and every entry is 0 when the middle column is column 0.
    '''
    middle = len(w) // 2

    # Column 0 is reached from the source by deletions only.
    previous = [-i*sigma for i in range(len(v)+1)]
    backtrack = [0]*(len(v)+1)

    # Sweep column by column up to the middle.  If the middle is column 0 the
    # loop does not run and column 0 itself is returned.
    for j in range(1, middle+1):
        # Row 0 is reached from the source by insertions only.
        current = [-j*sigma]
        for i in range(1, len(v)+1):
            scores = [previous[i-1] + scoring_matrix[v[i-1], w[j-1]], previous[i] - sigma, current[i-1] - sigma]
            best = max(scores)
            current.append(best)
            backtrack[i] = scores.index(best)
        previous = current

    return previous, backtrack


def middle_edge(v, w, scoring_matrix, sigma):
    '''Returns the middle edge in the alignment graph of v and w.

    The result is a pair of nodes ((i, j), (i2, j2)) where j = len(w)//2 and
    the edge between them lies on a highest-scoring source-to-sink path.
    w is expected to be a non-empty string.
    '''
    middle = len(w) // 2

    # Get the score of the middle column from the source to the middle.  The backtrack list is unnecessary here.
    source_to_middle = middle_column_score(v, w, scoring_matrix, sigma)[0]

    # For odd len(w) the sink is one column further from the middle than the
    # source is.  Padding the reversed string with a placeholder makes
    # middle_column_score sweep that extra column; the placeholder itself is
    # never scored because only the first half of the string is read.
    padding = '$' if len(w) % 2 == 1 else ''
    sink_scores, sink_backtrack = middle_column_score(v[::-1], w[::-1] + padding, scoring_matrix, sigma)

    # Reverse the order as the computations were done in the opposite orientation.
    middle_to_sink = sink_scores[::-1]
    backtrack = sink_backtrack[::-1]

    # Get the componentwise sum of the middle column scores.
    scores = [a + b for a, b in zip(source_to_middle, middle_to_sink)]

    # Get the position of the maximum score (first one on ties).
    max_middle = max(range(len(scores)), key=lambda i: scores[i])

    if max_middle == len(scores) - 1:
        # From the last row the path can only continue horizontally.
        next_node = (max_middle, middle + 1)
    else:
        # An edge entering the node in the reversed graph is the edge leaving
        # it in the forward graph: 0 = diagonal, 1 = horizontal, 2 = vertical.
        next_node = [(max_middle + 1, middle + 1), (max_middle, middle + 1), (max_middle + 1, middle)][backtrack[max_middle]]

    return (max_middle, middle), next_node
59 | 
60 | 
if __name__ == '__main__':
    from scripts import BLOSUM62

    # Read the two input strings, one per line.
    with open('data/stepic_7e.txt') as input_data:
        word1, word2 = [line.strip() for line in input_data.readlines()]

    # Get the middle edge (sigma = 5).
    middle = middle_edge(word1, word2, BLOSUM62(), 5)

    # Print and save the answer.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    answer = ' '.join([str(node) for node in middle])
    print(answer)
    with open('output/Assignment_07E.txt', 'w') as output_data:
        output_data.write(answer)
75 | 


--------------------------------------------------------------------------------
/Assignment_07F.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Linear Space Alignment
 8 | Assignment #: 07
 9 | Problem ID: F
10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/SpaceEfficient-Sequence-Alignment-250/step/14
11 | '''
12 | 
13 | 
def space_efficient_global_alignment(v, w, scoring_matrix, sigma):
    '''Return the global alignment of v and w using a linear space algorithm.

    The alignment graph is split recursively at its middle edge.  Returns a
    tuple (score, v_aligned, w_aligned) with the score as a string.
    '''
    from Assignment_06E import global_alignment
    from Assignment_07E import middle_edge

    def linear_space_alignment2(top, bottom, left, right):
        '''Aligns v[top:bottom] with w[left:right]; returns the two aligned strings.'''

        # No columns left: everything remaining in v faces gaps.
        if left == right:
            return [v[top:bottom], '-'*(bottom - top)]

        # No rows left: everything remaining in w faces gaps.
        if top == bottom:
            return ['-'*(right - left), w[left:right]]

        # A single row or column is handed to the ordinary global alignment.
        if bottom - top == 1 or right - left == 1:
            return global_alignment(v[top:bottom], w[left:right], scoring_matrix, sigma)[1:]

        # Find the middle edge of the sub-problem; its nodes are relative to
        # (top, left), so shift them back into coordinates of the full graph.
        mid_node, next_node = middle_edge(v[top:bottom], w[left:right], scoring_matrix, sigma)
        mid_row, mid_col = mid_node[0] + top, mid_node[1] + left
        next_row, next_col = next_node[0] + top, next_node[1] + left

        # The middle edge consumes a symbol of a string only if it advances
        # along that string; otherwise that string gets a gap.
        v_symbol = v[mid_row] if next_row != mid_row else '-'
        w_symbol = w[mid_col] if next_col != mid_col else '-'

        # Divide and conquer: solve the part before and the part after the edge.
        before = linear_space_alignment2(top, mid_row, left, mid_col)
        after = linear_space_alignment2(next_row, bottom, next_col, right)
        return [before[0] + v_symbol + after[0], before[1] + w_symbol + after[1]]

    # Get the alignment, then score it column by column.
    v_aligned, w_aligned = linear_space_alignment2(0, len(v), 0, len(w))
    score = 0
    for pair in zip(v_aligned, w_aligned):
        if '-' in pair:
            score += -sigma
        else:
            score += scoring_matrix[pair]

    return str(score), v_aligned, w_aligned
53 | 
54 | 
if __name__ == '__main__':
    from scripts import BLOSUM62

    # Read the two input strings, one per line.
    with open('data/stepic_7f.txt') as input_data:
        word1, word2 = [line.strip() for line in input_data.readlines()]

    # Get the alignment (sigma = 5).
    alignment = space_efficient_global_alignment(word1, word2, BLOSUM62(), 5)

    # Print and save the answer.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    answer = '\n'.join(alignment)
    print(answer)
    with open('output/Assignment_07F.txt', 'w') as output_data:
        output_data.write(answer)
69 | 


--------------------------------------------------------------------------------
/Assignment_07G.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Multiple Longest Common Subsequence Problem
 8 | Assignment #: 07
 9 | Problem ID: G
10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/Epilogue-Multiple-Sequence-Alignment-251/step/5
11 | '''
12 | 
13 | 
def multiple_alignment_3(v, w, u):
    '''Returns a multiple alignment of v, w and u maximising fully matching columns.

    A column scores 1 when all three symbols are equal and 0 otherwise, so
    the score is the length of a longest common subsequence of the three
    strings.

    Returns a tuple (score, v_aligned, w_aligned, u_aligned) with the score
    as a string and the aligned strings padded to equal length.
    '''

    # Initialize the matrices.
    S = [[[0] * (len(u)+1) for _ in range(len(w)+1)] for _ in range(len(v)+1)]
    backtrack = [[[0] * (len(u)+1) for _ in range(len(w)+1)] for _ in range(len(v)+1)]

    # Fill in the Score and Backtrack matrices.
    # Backtrack codes: 0 = advance all three, 1 = advance v only,
    # 2 = advance w only, 3 = advance u only.
    # Moves advancing exactly two strings score 0 and can never beat the
    # single-string moves (S never decreases along an index), so they are
    # not listed.
    for i in range(1, len(v)+1):
        for j in range(1, len(w)+1):
            for k in range(1, len(u)+1):
                scores = [S[i-1][j-1][k-1] + int(v[i-1] == w[j-1] == u[k-1]), S[i-1][j][k], S[i][j-1][k], S[i][j][k-1]]
                best = max(scores)
                S[i][j][k] = best
                # index() returns the first maximum, which fixes the tie order.
                backtrack[i][j][k] = scores.index(best)

    # Quick lambda function to insert indels.
    insert_indel = lambda word, pos: word[:pos] + '-' + word[pos:]

    # Initialize the aligned strings as the input strings.
    v_aligned, w_aligned, u_aligned = v, w, u

    # Start at the far corner of the matrix, which holds the final score.
    i, j, k = len(v), len(w), len(u)
    max_score = S[i][j][k]

    # Backtrack until one of the strings is used up.
    while i > 0 and j > 0 and k > 0:
        move = backtrack[i][j][k]
        if move == 1:
            i -= 1
            w_aligned = insert_indel(w_aligned, j)
            u_aligned = insert_indel(u_aligned, k)
        elif move == 2:
            j -= 1
            v_aligned = insert_indel(v_aligned, i)
            u_aligned = insert_indel(u_aligned, k)
        elif move == 3:
            k -= 1
            v_aligned = insert_indel(v_aligned, i)
            w_aligned = insert_indel(w_aligned, j)
        else:
            i -= 1
            j -= 1
            k -= 1

    # Prepend the indels needed to bring all three strings to the same length.
    width = max(len(v_aligned), len(w_aligned), len(u_aligned))
    v_aligned = '-' * (width - len(v_aligned)) + v_aligned
    w_aligned = '-' * (width - len(w_aligned)) + w_aligned
    u_aligned = '-' * (width - len(u_aligned)) + u_aligned

    return str(max_score), v_aligned, w_aligned, u_aligned
77 | 
if __name__ == '__main__':

    # Read the three input strings, one per line.
    with open('data/stepic_7g.txt') as input_data:
        word1, word2, word3 = [line.strip() for line in input_data.readlines()]

    # Get the alignment.
    alignment = multiple_alignment_3(word1, word2, word3)

    # Print and save the answer.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    answer = '\n'.join(alignment)
    print(answer)
    with open('output/Assignment_07G.txt', 'w') as output_data:
        output_data.write(answer)
91 | 


--------------------------------------------------------------------------------
/Assignment_08A.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Greedy Sorting
 8 | Assignment #: 08
 9 | Problem ID: A
10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/A-Greedy-Algorithm-for-Sorting-by-Reversals-286/step/2
11 | '''
12 | 
13 | 
def greedy_sorting(permutation):
    '''A greedy algorithm to sort by reversals.

    permutation is a list of signed integers containing each of
    1..len(permutation) exactly once, up to sign.  The input list is not
    modified.

    Returns the list of intermediate permutations, one per reversal applied.
    Raises ValueError if an expected element is missing from the permutation.
    '''

    # Initialize the transformation list, which stores all intermediate transformations.
    transformation_list = []

    # Put element i+1 in place (with positive sign) for each position in turn.
    i = 0
    while i < len(permutation):
        if permutation[i] == i+1:
            i += 1
            continue

        if permutation[i] == -(i+1):
            # Right place, wrong sign: reverse the single element.
            j = i
        else:
            # Positions before i are already sorted, so the element must be
            # in the tail.  Searching only the tail makes a malformed input
            # fail with ValueError.
            j = i + [abs(value) for value in permutation[i:]].index(i+1)

        # Reverse the segment i..j and flip the sign of every element in it.
        permutation = permutation[:i] + [-value for value in reversed(permutation[i:j+1])] + permutation[j+1:]
        transformation_list.append(permutation)

    # Note: the approximate reversal distance is the length of the transformation list.
    return transformation_list
39 | 
40 | 
if __name__ == '__main__':

    # Read the input data: one signed permutation in parentheses.
    with open('data/stepic_8a.txt') as input_data:
        # A real list is required because greedy_sorting slices the permutation.
        perm = [int(value) for value in input_data.read().strip().lstrip('(').rstrip(')').split()]

    # Get the list of reversals necessary to sort the given permutation.
    reversal_list = greedy_sorting(perm)
    # Write each permutation in the output form expected by stepic:
    # parenthesised, with an explicit '+' on positive values.
    reversal_list = ['('+' '.join([('+' if value > 0 else '') + str(value) for value in step])+')' for step in reversal_list]

    # Print and save the answer.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    answer = '\n'.join(reversal_list)
    print(answer)
    with open('output/Assignment_08A.txt', 'w') as output_data:
        output_data.write(answer)
56 | 


--------------------------------------------------------------------------------
/Assignment_08B.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Number of Breakpoints Problem
 8 | Assignment #: 08
 9 | Problem ID: B
10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/Breakpoints-287/step/1
11 | '''
12 | 
13 | 
def breakpoint_count(permutation):
    '''Counts the breakpoints of a signed permutation given as a list.'''

    # Frame the permutation with 0 and n+1 so that misplaced endpoints count too.
    framed = [0] + permutation + [len(permutation)+1]

    # A breakpoint is any adjacent pair that does not increase by exactly one.
    return sum(1 for left, right in zip(framed, framed[1:]) if right - left != 1)
21 | 
22 | 
if __name__ == '__main__':

    # Read the input data: one signed permutation in parentheses.
    with open('data/stepic_8b.txt') as input_data:
        # A real list is required because breakpoint_count concatenates it with lists.
        perm = [int(value) for value in input_data.read().strip().lstrip('(').rstrip(')').split()]

    # Get the number of breakpoints.
    num_of_breakpoints = breakpoint_count(perm)

    # Print and save the answer.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    answer = str(num_of_breakpoints)
    print(answer)
    with open('output/Assignment_08B.txt', 'w') as output_data:
        output_data.write(answer)
36 | 


--------------------------------------------------------------------------------
/Assignment_08C.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: 2-Break Distance Problem
 8 | Assignment #: 08
 9 | Problem ID: C
10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/Computing-the-2Break-Distance-288/step/1
11 | '''
12 | 
13 | 
def two_break_dist(P, Q):
    '''Returns the 2-Break Distance of Circular Chromosomes P and Q.

    P and Q are genomes given as lists of circular chromosomes, each
    chromosome being a list of signed integers.  Both genomes are expected to
    contain the same set of blocks.
    '''

    # Construct the break point graph of P and Q as an undirected multigraph:
    # every adjacency of a chromosome joins a block to the negation of the
    # block that follows it.
    edges = {}
    for block in P+Q:
        L = len(block)
        for i, current in enumerate(block):
            # Note: Modulo L closes the circle between the last and first elements.
            following = -1*block[(i+1) % L]
            # Store the edge in both directions so cycles can be walked either way.
            edges.setdefault(current, []).append(following)
            edges.setdefault(following, []).append(current)

    # Count the cycles by walking along unused edges and deleting them.
    cycles = 0
    while edges:
        cycles += 1
        # Any remaining node will do as a starting point.
        current = next(iter(edges))
        while current in edges:
            neighbor = edges[current].pop(0)
            if not edges[current]:
                del edges[current]
            # Remove the complementary edge.
            edges[neighbor].remove(current)
            if not edges[neighbor]:
                del edges[neighbor]

            current = neighbor

    # Theorem: d(P,Q) = blocks(P,Q) - cycles(P,Q)
    return sum(len(block) for block in P) - cycles
55 | 
56 | 
if __name__ == '__main__':

    # Read the input data: one genome per line, chromosomes in parentheses.
    with open('data/stepic_8c.txt') as input_data:
        P, Q = [line.strip().lstrip('(').rstrip(')').split(')(') for line in input_data.readlines()]
        # Real lists are required because two_break_dist indexes and measures each chromosome.
        P = [[int(gene) for gene in block.split()] for block in P]
        Q = [[int(gene) for gene in block.split()] for block in Q]

    # Get the 2-Break Distance.
    dist = two_break_dist(P, Q)

    # Print and save the answer.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    answer = str(dist)
    print(answer)
    with open('output/Assignment_08C.txt', 'w') as output_data:
        output_data.write(answer)
72 | 


--------------------------------------------------------------------------------
/Assignment_08D.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Shared k-mers Problem
 8 | Assignment #: 08
 9 | Problem ID: D
10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/Synteny-Block-Construction-289/step/2
11 | '''
12 | 
13 | 
def shared_kmers(dna1, dna2, k):
    '''Returns the set of positions of shared kmers (up to reverse complement) in dna1 and dna2.

    Each element is a pair (x, y): the kmer starting at index x of dna1 equals
    the kmer starting at index y of dna2 or its reverse complement.
    '''
    # NOTE(review): rev_comp is assumed to return the reverse complement of
    # its argument as a string without side effects - confirm in scripts.
    from scripts import ReverseComplementDNA as rev_comp

    # Map every kmer of dna1, and its reverse complement, to the list of
    # starting indices at which it occurs in dna1.
    dna_dict = {}
    for i in range(len(dna1) - k + 1):
        kmer = dna1[i:i+k]
        dna_dict.setdefault(kmer, []).append(i)
        dna_dict.setdefault(rev_comp(kmer), []).append(i)

    # Use a set to remove possible duplicate entries.
    common_kmers = set()

    # Check kmers in dna2 against those in dna1, adding matching indices to common_kmers.
    for j in range(len(dna2) - k + 1):
        kmer = dna2[j:j+k]
        # Look up the jth kmer and its reverse complement, each computed once.
        for candidate in (kmer, rev_comp(kmer)):
            for x in dna_dict.get(candidate, ()):
                common_kmers.add((x,j))

    return common_kmers
51 | 
if __name__ == '__main__':

    # Read the input data: k on the first line, followed by the two DNA strings.
    with open('data/stepic_8d.txt') as input_data:
        k = int(input_data.readline().strip())
        dna1, dna2 = [line.strip() for line in input_data.readlines()]

    # Get the shared kmers.  Sorting doesn't add significant time and makes the result more readable.
    # A real list is built (not a lazy map object) because the result is used for both printing and saving.
    common = [str(pair) for pair in sorted(shared_kmers(dna1, dna2, k))]
    answer = '\n'.join(common)

    # Print and save the answer.
    print(answer)
    with open('output/Assignment_08D.txt', 'w') as output_data:
        output_data.write(answer)
66 | 


--------------------------------------------------------------------------------
/Assignment_09A.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Trie Construction Problem
 8 | Assignment #: 09
 9 | Problem ID: A
10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/Preprocessing-Patterns-into-a-Trie-294/step/3
11 | '''
12 | from scripts import Trie
13 | 
14 | 
def trie_edges(words):
    '''Returns the edges of a trie constructed from the given words in adjacency format.

    The result is a list with one string per edge, e.g. the edge item
    ((1, 2), 'A') becomes '1 2 A'.
    '''

    # Construct the trie.
    t = Trie(words)

    # Trie edges are dictionary items of the form (nodes, label); convert each to adjacency form.
    # A list is returned (not a lazy map object) so callers can safely iterate the result more than once.
    return [' '.join(map(str, nodes)) + ' ' + label for nodes, label in t.edges.items()]
27 | 
28 | 
def main():
    '''Main call. Reads, runs, and saves problem specific data.'''

    # Read the input data: one word per line.
    with open('data/stepic_9a.txt') as input_data:
        words = [line.strip() for line in input_data.readlines()]

    # Get the adjacency list and build the output text once, so it is identical on screen and on disk.
    adjacency_text = '\n'.join(trie_edges(words))

    # Print and save the answer.
    print(adjacency_text)
    with open('output/Assignment_09A.txt', 'w') as output_file:
        output_file.write(adjacency_text)

if __name__ == '__main__':
    main()
46 | 


--------------------------------------------------------------------------------
/Assignment_09B.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Multiple Pattern Matching Problem
 8 | Assignment #: 09
 9 | Problem ID: B
10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/Preprocessing-Patterns-into-a-Trie-294/step/6
11 | '''
12 | 
13 | from scripts import Trie
14 | 
15 | 
def trie_pattern_matching(word, patterns):
    '''Returns the starting index of all locations in word where a string in patterns is a substring.

    An empty collection of patterns yields an empty list.
    '''

    # Without patterns nothing can match (and min() below would fail on an empty sequence).
    if not patterns:
        return []

    # Construct a trie from all of the given patterns.
    t = Trie(patterns)

    # No match can start once the remainder of word is shorter than the shortest pattern.
    last_start = len(word) - min(map(len, patterns))

    # Check each remaining index to see if a pattern occurs starting at that index.
    return [i for i in range(last_start + 1) if t.prefix_in_trie(word[i:]) is True]
27 | 
28 | 
def main():
    '''Main call. Reads, runs, and saves problem specific data.'''
    # Read the input data: the text on the first line, then one pattern per line.
    with open('data/stepic_9b.txt') as input_data:
        word = input_data.readline().strip()
        patterns = [line.strip() for line in input_data.readlines()]

    # Get the matching pattern indices and format them once as a space separated string.
    pattern_indices = trie_pattern_matching(word, patterns)
    answer = ' '.join(map(str, pattern_indices))

    # Print and save the answer.
    print(answer)
    with open('output/Assignment_09B.txt', 'w') as output_data:
        output_data.write(answer)

if __name__ == '__main__':
    main()
46 | 


--------------------------------------------------------------------------------
/Assignment_09D.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Suffix Tree Construction Problem
 8 | Assignment #: 09
 9 | Problem ID: D
10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/Suffix-Trees-296/step/4
11 | '''
12 | from scripts import GeneralizedSuffixTree
13 | 
14 | 
def suffix_tree_edges(word):
    '''Returns the edge substrings associated with the suffix tree for the given word.'''

    # The generalized suffix tree script (see scripts folder) does the heavy lifting.
    tree = GeneralizedSuffixTree(word)

    # Collect the substring for every edge of the tree.
    raw_labels = []
    for edge in tree.edges.values():
        raw_labels.append(tree.edge_substring(edge))

    # The generalized suffix tree marks the end of word 0, word 1, ..., word N with $0, $1, ..., $N.
    # Plain suffix tree format only wants '$', so drop the final character of any label containing '$'.
    labels = []
    for label in raw_labels:
        if '$' in label:
            label = label[:-1]
        labels.append(label)

    return labels
28 | 
29 | 
def main():
    '''Main call. Reads, runs, and saves problem specific data.'''

    # Read the input data: the whole file is the text.
    with open('data/stepic_9d.txt') as input_data:
        text = input_data.read().strip()

    # Get the edge substrings and build the output text once, one edge per line.
    edges = suffix_tree_edges(text)
    answer = '\n'.join(edges)

    # Print and save the answer.
    print(answer)
    with open('output/Assignment_09D.txt', 'w') as output_data:
        output_data.write(answer)

if __name__ == '__main__':
    main()
47 | 


--------------------------------------------------------------------------------
/Assignment_09E.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Longest Shared Repeat Problem
 8 | Assignment #: 09
 9 | Problem ID: E
10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/Suffix-Trees-296/step/5
11 | '''
12 | from scripts import GeneralizedSuffixTree
13 | 
14 | 
def longest_common_substring(string_list):
    '''Returns the longest common substring among all strings in string_list.'''
    # Construct the generalized suffix tree for the input text.
    gst = GeneralizedSuffixTree(string_list)

    # Find all nodes that are traversed by all words in text, meaning that the substring up to that node is in all words in text.
    total_words = len(string_list)
    candidate_nodes = [i for i in range(len(gst.nodes)) if len(gst.nodes[i].words) == total_words]

    # Get the deepest of the candidate nodes, where depth corresponds to substring length.
    deepest_node = max(candidate_nodes, key=gst.node_depth)

    # Return the substring corresponding to a traversal up to the deepest node.
    return gst.node_substring(deepest_node)
28 | 
29 | 
def main():
    '''Reads, runs, and saves problem specific data.'''
    # Read the input data: one string per line.
    with open('data/stepic_9e.txt') as input_data:
        text = [line.strip() for line in input_data.readlines()]

    # Get the longest shared repeat.
    longest_shared_repeat = longest_common_substring(text)

    # Print and save the answer.
    print(longest_shared_repeat)
    with open('output/Assignment_09E.txt', 'w') as output_data:
        output_data.write(longest_shared_repeat)

if __name__ == '__main__':
    main()
46 | 


--------------------------------------------------------------------------------
/Assignment_09F.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | A solution to a programming assignment for the Bioinformatics Algorithms (Part 1) on Coursera.
 4 | The associated textbook is Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
 5 | The course is run on Coursera and the assignments and textbook are hosted on Stepic
 6 | 
 7 | Problem Title: Shortest Non-Shared Substring Problem
 8 | Assignment #: 09
 9 | Problem ID: F
10 | URL: https://stepic.org/Bioinformatics-Algorithms-2/Suffix-Trees-296/step/6
11 | '''
12 | from scripts import GeneralizedSuffixTree
13 | 
14 | 
def shortest_nonshared_substring(string_list):
    '''Returns the shortest nonshared substring unique to the first word in string_list.'''

    # Construct the generalized suffix tree for the input text.
    gst = GeneralizedSuffixTree(string_list)

    def parent_edge(i):
        '''Returns the substring on the edge leading from the parent of node i to node i.'''
        return gst.edge_substring(gst.edges[gst.nodes[i].parent, i])

    # Keep nodes that are traversed only by the first word in text, meaning that the substring up to that node is only in the first word.
    # Skip nodes reached by an edge holding just the out of alphabet marker '$0' of the first word, as these are trivially
    # only traversed by the first word: if '$0' is the only thing on the edge, then its parent must be traversed by another word.
    candidate_nodes = [i for i in range(len(gst.nodes))
                       if gst.nodes[i].words == {0} and parent_edge(i) != '$0']

    # To get the shortest substring, only take the first character of the last edge, hence the substring has length parent_length + 1.
    shortest = min(candidate_nodes, key=lambda i: gst.node_depth(gst.nodes[i].parent) + 1)

    # Shortest nonshared substring is the substring up to the first character of the edge leading to the optimal node.
    return gst.node_substring(gst.nodes[shortest].parent) + parent_edge(shortest)[0]
33 | 
34 | 
def main():
    '''Solves Problem 9F: reads, runs, and saves problem specific data.'''

    # Read the input data: one string per line.
    with open('data/stepic_9f.txt') as input_data:
        text = [line.strip() for line in input_data.readlines()]

    # Get the shortest nonshared substring unique to the first word.
    minimal_unique_substring = shortest_nonshared_substring(text)

    # Print and save the answer.
    print(minimal_unique_substring)
    with open('output/Assignment_09F.txt', 'w') as output_data:
        output_data.write(minimal_unique_substring)

if __name__ == '__main__':
    main()
52 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Bioinformatics Algorithms (Coursera) #
 2 | 
 3 | ### Description
 4 | ---
 5 | My solutions to programming assignments for the Bioinformatics Algorithms course on Coursera, worked primarily in Python.
 6 | 
 7 | Coursera Class Website: [https://www.coursera.org/course/bioinformatics](https://www.coursera.org/course/bioinformatics)
 8 | 
 9 | Stepic Textbook Website: [https://beta.stepic.org/Bioinformatics-Algorithms-2/](https://beta.stepic.org/Bioinformatics-Algorithms-2/)
10 | 
11 | ### Repository Structure
12 | ---
13 | ***Top Level Directory***
14 | 
15 | The top level directory contains problem solutions.  The problems are organized by the week in which they are assigned, and their position in the given week's assignment.  Specifically, Assignment_XXY denotes week XX, problem Y.
16 | 
17 | ***Scripts Directory***
18 | 
19 | The scripts directory contains scripts for various processes which appear in multiple programming problems but are not solutions to actual problems.
20 | 
21 | ***Data and Output Directories***
22 | 
23 | The data and output directories hold text files containing the data for each problem supplied by Stepic, and the associated output from my solutions.  
24 | 


--------------------------------------------------------------------------------
/data/stepic_1a.txt:
--------------------------------------------------------------------------------
1 | GCCGGGCCCCGCAGGTTCCGCAGGTTCTTCCGCGTTCGCAGGTTCGTGGATACGCAGGTTCTCATCGGGGGCCGGGCCCTCATCGGGGCGCAGGTTCCGCAGGTTCTCATCGGGGGTGGATAGCCGGGCCCGTGGATATTCCGCGTTGCCGGGCCCCGCAGGTTCGTGGATAGCCGGGCCCTTCCGCGTTCGCAGGTTCGCCGGGCCCGTGGATATCATCGGGGTCATCGGGGGTGGATAGTGGATACGCAGGTTCTCATCGGGGGTGGATATTCCGCGTTCGCAGGTTCTTCCGCGTTTTCCGCGTTTTCCGCGTTTTCCGCGTTGCCGGGCCCTCATCGGGGTCATCGGGGGCCGGGCCCTCATCGGGGGTGGATAGTGGATATCATCGGGGCGCAGGTTCGTGGATATTCCGCGTTTTCCGCGTTCGCAGGTTCTTCCGCGTTGTGGATAGCCGGGCCCCGCAGGTTCTCATCGGGGTCATCGGGGTCATCGGGGCGCAGGTTCTTCCGCGTTGCCGGGCCCGTGGATACGCAGGTTCTTCCGCGTTCGCAGGTTCTCATCGGGGTCATCGGGGCGCAGGTTCGCCGGGCCCGCCGGGCCCTTCCGCGTTGTGGATATTCCGCGTTGTGGATAGTGGATATTCCGCGTTTCATCGGGGTTCCGCGTTCGCAGGTTCGTGGATATTCCGCGTTCGCAGGTTCGTGGATAGCCGGGCCCTTCCGCGTTGCCGGGCCCGCCGGGCCCTCATCGGGGGTGGATAGTGGATAGTGGATAGTGGATATTCCGCGTTGCCGGGCCCGTGGATAGTGGATACGCAGGTTCTTCCGCGTTTCATCGGGGGTGGATATCATCGGGGGCCGGGCCCGCCGGGCCCTTCCGCGTT
2 | 11
3 | 


--------------------------------------------------------------------------------
/data/stepic_1g.txt:
--------------------------------------------------------------------------------
1 | GTACAGACGGACAAAAATTGTACAGACGAGTGTTGCCCAAGTTAGAACATTCAACTGTACAGACGATTCAACTAAGTTAGAACAGTGTTGCCCATTCAACTAAGTTAGAACGACAAAAATTATTCAACTAAGTTAGAACATTCAACTGTACAGACGAGTGTTGCCCAGTGTTGCCCAGTGTTGCCCAGTGTTGCCCATTCAACTAGTGTTGCCCATTCAACTAGTGTTGCCCGACAAAAATTAGTGTTGCCCAGTGTTGCCCAGTGTTGCCCAAGTTAGAACAAGTTAGAACGACAAAAATTATTCAACTAGTGTTGCCCAGTGTTGCCCGTACAGACGGTACAGACGAAGTTAGAACAAGTTAGAACAAGTTAGAACGACAAAAATTAAGTTAGAACGACAAAAATTAAGTTAGAACGTACAGACGAAGTTAGAACAGTGTTGCCCAGTGTTGCCCGTACAGACGGTACAGACGAGTGTTGCCCGTACAGACGATTCAACTGACAAAAATTATTCAACTAGTGTTGCCCGTACAGACGGTACAGACGATTCAACTATTCAACTAGTGTTGCCCGTACAGACGGTACAGACGATTCAACTATTCAACTAAGTTAGAACAGTGTTGCCCGTACAGACGGTACAGACGATTCAACTATTCAACTAGTGTTGCCCATTCAACTAGTGTTGCCCGTACAGACGATTCAACTAGTGTTGCCCATTCAACTGTACAGACGGTACAGACGATTCAACTAGTGTTGCCCGTACAGACGATTCAACTAGTGTTGCCCGACAAAAATTAAGTTAGAACGACAAAAATTATTCAACTATTCAACTGACAAAAATTATTCAACTAAGTTAGAACATTCAACTGACAAAAATTAAGTTAGAACGACAAAAATTATTCAACTAAGTTAGAACAGTGTTGCCCATTCAACTATTCAACTATTCAACTGTACAGACG
2 | 10 2
3 | 


--------------------------------------------------------------------------------
/data/stepic_1h.txt:
--------------------------------------------------------------------------------
1 | CCTAGTGTCAGCGGAATTTCTGCTGCCTAGTGTCTCTGCTGCGCCGTCCAGCGGAATTGTTCTTAAAGCGGAATTGTTCTTAAAGCGGAATTCGCCGTCCCGCCGTCCTCTGCTGTCTGCTGGTTCTTAACGCCGTCCAGCGGAATTGTTCTTAAGTTCTTAATCTGCTGCCTAGTGTCAGCGGAATTTCTGCTGTCTGCTGAGCGGAATTCCTAGTGTCAGCGGAATTCGCCGTCCGTTCTTAAAGCGGAATTCGCCGTCCGTTCTTAAAGCGGAATTCCTAGTGTCCCTAGTGTCTCTGCTGGTTCTTAACGCCGTCCCCTAGTGTCCGCCGTCCTCTGCTGAGCGGAATTGTTCTTAATCTGCTGCGCCGTCCTCTGCTGTCTGCTGAGCGGAATTCCTAGTGTCGTTCTTAAAGCGGAATTTCTGCTGAGCGGAATTGTTCTTAAAGCGGAATTAGCGGAATTTCTGCTGCGCCGTCCGTTCTTAACGCCGTCCCCTAGTGTCTCTGCTGCGCCGTCCTCTGCTGAGCGGAATTCGCCGTCCCGCCGTCCAGCGGAATTGTTCTTAACGCCGTCCCGCCGTCCAGCGGAATTAGCGGAATTCCTAGTGTCGTTCTTAACCTAGTGTCTCTGCTGCGCCGTCCGTTCTTAACGCCGTCCCGCCGTCCCCTAGTGTCTCTGCTGAGCGGAATTCGCCGTCCAGCGGAATTAGCGGAATTCGCCGTCCGTTCTTAACGCCGTCCTCTGCTGTCTGCTGCGCCGTCCCCTAGTGTCGTTCTTAACCTAGTGTCTCTGCTGCGCCGTCCTCTGCTGGTTCTTAACGCCGTCCCCTAGTGTCCGCCGTCCCGCCGTCCTCTGCTGAGCGGAATT
2 | 9 2
3 | 


--------------------------------------------------------------------------------
/data/stepic_2c.txt:
--------------------------------------------------------------------------------
1 | RRNQKRGCLSQQCFL
2 | 


--------------------------------------------------------------------------------
/data/stepic_2d.txt:
--------------------------------------------------------------------------------
1 | 0 97 113 113 131 131 137 147 156 163 226 234 244 253 268 269 278 294 310 357 365 366 382 390 407 415 441 441 479 503 512 513 520 521 554 572 578 610 616 634 667 668 675 676 685 709 747 747 773 781 798 806 822 823 831 878 894 910 919 920 935 944 954 962 1025 1032 1041 1051 1057 1057 1075 1075 1091 1188
2 | 


--------------------------------------------------------------------------------
/data/stepic_2e.txt:
--------------------------------------------------------------------------------
1 | 459
2 | 0 71 71 71 71 71 97 99 113 113 114 114 115 115 128 129 129 142 147 147 156 163 168 184 184 184 185 185 199 214 226 227 228 243 244 246 255 255 256 270 275 276 278 297 298 299 310 312 324 341 341 346 357 358 361 369 369 370 373 375 377 383 412 412 425 427 428 429 438 438 444 453 459 471 472 483 484 490 497 498 499 509 509 524 524 525 530 542 542 542 543 553 567 580 587 596 600 613 613 613 613 622 624 626 638 640 644 652 653 655 658 671 684 693 693 699 709 711 714 723 726 726 727 737 741 750 769 773 785 797 797 798 799 800 807 808 808 813 814 821 822 836 840 854 856 868 870 883 884 888 897 907 911 922 925 928 928 936 937 937 954 954 955 968 969 977 982 983 996 999 999 1001 1021 1025 1025 1051 1051 1051 1054 1066 1067 1068 1070 1072 1084 1091 1096 1096 1110 1112 1122 1124 1138 1138 1143 1150 1162 1164 1166 1167 1168 1180 1183 1183 1183 1209 1209 1213 1233 1235 1235 1238 1251 1252 1257 1265 1266 1279 1280 1280 1297 1297 1298 1306 1306 1309 1312 1323 1327 1337 1346 1350 1351 1364 1366 1378 1380 1394 1398 1412 1413 1420 1421 1426 1426 1427 1434 1435 1436 1437 1437 1449 1461 1465 1484 1493 1497 1507 1508 1508 1511 1520 1523 1525 1535 1541 1541 1550 1563 1576 1579 1581 1582 1590 1594 1596 1608 1610 1612 1621 1621 1621 1621 1634 1638 1647 1654 1667 1681 1691 1692 1692 1692 1704 1709 1710 1710 1725 1725 1735 1736 1737 1744 1750 1751 1762 1763 1775 1781 1790 1796 1796 1805 1806 1807 1809 1822 1822 1851 1857 1859 1861 1864 1865 1865 1873 1876 1877 1888 1893 1893 1910 1920 1922 1924 1935 1936 1937 1956 1958 1959 1964 1978 1979 1979 1988 1990 1991 2006 2007 2008 2020 2035 2049 2049 2050 2050 2050 2066 2071 2078 2087 2087 2092 2105 2105 2106 2119 2119 2120 2120 2121 2121 2135 2137 2163 2163 2163 2163 2163 2234
3 | 


--------------------------------------------------------------------------------
/data/stepic_2f.txt:
--------------------------------------------------------------------------------
1 | 225 456 363 584 756 731 878 97 866 981 650 503 397 753 884 331 356 687 372 228 115 638 113 471 850 228 250 735 128 765 625 868 216 834 519 706 0 103 753 294 462 147 834 525 622 510 559 131 609 422 343 275 246 618 359 853 478 147
2 | 


--------------------------------------------------------------------------------
/data/stepic_2g.txt:
--------------------------------------------------------------------------------
1 | 16
2 | 371
3 | 550 1624 113 220 529 1551 788 732 773 535 1492 1008 888 760 0 819 1145 1070 317 1232 253 1175 269 1346 259 1461 1038 691 657 1014 1192 625 829 1605 415 776 610 942 747 884 1648 1502 454 488 1241 472 1470 1436 422 932 616 916 278 1664 432 1630 1079 1029 156 1541 1289 1098 820 1598 1098 500 1211 941 1069 1389 1395 357 366 886 291 212 985 325 1485 1508 194 1646 469 357 875 115 584 569 1329 473 1501 276 1308 1404 1633 1404 1144 1761 1345 512 1483 163 453 338 1261 163 1339 163 1549 128 1598 300 692 749 845 1357 1136 97 732 663 1249 1567 57 1273 260 1126 1704 372 404 1226 663 1502 1664 529 387 810 1423 1104 1177 1520 113 128 1633 712 520 1288 1164 560 1648 617 1386 1001 375 137 875 1201 97 1012 1232 416 1444 1598 1151 988 241 131 1374 877 873 597 973 1307 1029 210 723 682 753 1049 951 635 886 1292 259 586
4 | 


--------------------------------------------------------------------------------
/data/stepic_3a.txt:
--------------------------------------------------------------------------------
1 | 5 1
2 | TCCGTCTTGCGGTAGCGCACCTCTG
3 | GGAGCCATATGGCAGGTTATGACAA
4 | CTGCATATGGTTCTACCGTCGGTAG
5 | TATGAGGAAGACAATGTCTTCACGG
6 | CGCGTACCCTTCCTAGGGTTGGTAG
7 | AACCAGGCAGAAGTCTGTCTTACTT
8 | 


--------------------------------------------------------------------------------
/data/stepic_3b.txt:
--------------------------------------------------------------------------------
 1 | 6
 2 | GTGCGTATTAAAATAGGAACGGATCATCTGATTAAACAGGGG
 3 | GTCCGAAGATCGCTAATAGCCAAGCAGGAGAGTCTACTGGAC
 4 | GAGCTACAGGGGTCAGCTCTGCGGGTAGTGCTCAGATAACGA
 5 | GGGATCCAGGAGCAACTGTTCTCACTCCTCAGTGTCCACTCT
 6 | ACGCATCCCGGGCAGGTGCACCATAGTGCGAGAAGCAATCAT
 7 | CGGATCATTTACAAGTCCCAAGTCAGTTTCCAGGCGTCGTAG
 8 | CGATTGGATTGGTACCCTGGCCCTGGTAACGAGGGTCAGGAG
 9 | CAGGCGGCTGGATGTGTATACTATGGAAAACCAGGAAGCAGT
10 | GATGTTCCTAGACAGGCGTTTTACAAGTTCCGTTAATCTGTG
11 | ACCGTTTGTTGAAGATTGAGATTGACTAGACAGGAGTCAACC
12 | 


--------------------------------------------------------------------------------
/data/stepic_3c.txt:
--------------------------------------------------------------------------------
 1 | TGTACGCGACCTCTCCGAGAGTAGTAATTGCTATCTACCTAGCTCCTCGGGCTCGGGTCATCAATAGCTGTCTTAAGCATCGTAGGGAAGACTACTACTTATGCTGGATATATTCACACATGGGATGGGTACACGTGGTAGACCGGTATAAGGGACCTATGGAATCGATGAAACATAGTAACAGCTGGGAACACGTGCCG
 2 | 6
 3 | A C G T
 4 | 0.333 0.212 0.212 0.242
 5 | 0.242 0.273 0.394 0.091
 6 | 0.242 0.242 0.242 0.273
 7 | 0.273 0.242 0.303 0.182
 8 | 0.333 0.333 0.152 0.182
 9 | 0.394 0.182 0.152 0.273
10 | 


--------------------------------------------------------------------------------
/data/stepic_3d.txt:
--------------------------------------------------------------------------------
 1 | 12 25
 2 | GAGCAGAACGTACATAAGACTATCGAAGAGTAATCGTATTTCATCAGTCTAGCAGCCTCTCTTCGGTTTGGACGGGCGTTTCCGCCCGGTTTCACCGGCCAAGGCTTACGATATGGTGCCATCATGGTCCATTTGTTGGGTTACAGTACAAGTGAG
 3 | TCCTGAACTCGCGACCAAGCCTCATCTGTTATATTCACGTTCTAATGTTGCAGAGGGAGTCGTTGCGCGTGCTGTCTGGGGCGGGTATGTTGTGTAGTAAGAGGTAATGATTGAGGCACTGATTTACACTTCAAGCTGTCATGTGAGCTGAGAAGG
 4 | ACCATAGATCAAGGAGCTATTACCGACCACGCCTAAAAGTAGCACGAAATTAGCCCAGTTTCGACGCCAGAAAGCAACTCAGGCGAAGCCAATCATCTGGAGAGTTATGAACAGCCAGTTCGATATTCGGTAAAGTTACTTGGCACTAGAGCACCT
 5 | GGCCAAGACTCAAGAGGCCCTATGCAGGCTCCCCACCGTAATTGTTTATAAAATGCTATGTCATAGTGTACTGAGAATACGGCCTAGTCGAAGGCTATATAATAGCTCCAATCGCGAAACGATCGGAATCGGCGAAAGAATAGCAAAGCAACATGT
 6 | GACTGATCCCCGGACTGATATTGAGCGTGTGACCCCGGCCATGGCTCATATGTTGATCTACCCCATTACGGCGTCAATCCTCTTTATTCGTCCCCACTCGTCGTCGGGCTGAGTTTAAAATAGCTTGATACCCGTGATCTCACACCTAGTGCAGCT
 7 | GACCAAGGCTAACTTGAGTATTCTCAGTGCGTTGCACACTCAGAGACTAGATTGCACGCCGTGAGGGTGTTTCTGGTTGCTTATGCGCGCGTGGTGCATCGTACTATAACTCGACGAAAAAAAAAGTACTCGTTGAACAACTTACAGTCGGTTTAG
 8 | GGTGCCAAGTTAAGTCCGGTTATGCGAGCTTAGGCGCGCTTGGCGTACGTCCACGCCTGAGCGGTTGTCGAGTATCTTTAGGGCACGGCGACTTGCATACATGTGCCAACATAAGGTGGACAAGCGCATGGTCACCCAAACTGGTGAGCAACCATG
 9 | AAGACACCCCAGTATGTCTTTGCAGGTCGGGTGGGCATACCCGTTCTCGAACACCTCAACACAAATGTTAGCGAAAGAGAGTTGCCTTCGCTTTTCACCCGCCAGAAGGGTATGAAATGTGCTCGGGCGTAGGTCCAAGCCTGAAGCTAAATTCGG
10 | AACCCCGTCGTCGCAGCAGAATCAAGACGGTGCCATTGACAATTAGAAGAAAACACATTTCATTGTTTCCTCCAGATGGGTACCCCGTCAGGCCGAGAAACACCTGCCTCATGCTAAGTCGTCCATGACTCATCCCACCTTGCCTACGGCCTACAG
11 | GCCGCACCCCTGGTTTAAATCCGCGTAGACCATTTGAGGGATGGCCAAGCAAAGTCGGCGTTCCCTTGACTAAGGGTCTCTTGCCCCATGGACCTGGAGGCGGTCCCCACCAGAATTTATTTACCTTCTTTTTGGTGAAATGCAGACCAGGACTGA
12 | CGTTTAGTATCCTGAACCTGATAATTCTTTAGTTAGTTAGCCCGGTACGACCATGCCTCAATGGCTTATGAAGTCCCGCAGGTACGGATTTCGAGCTAAAACTCTCCGCTTTTGAAAAGGAAGACATTAAGCCCTTGTCAATGACGATTTCGCAAA
13 | GTCCACGCCTTAATAGAGGAACGGTTCGCCATTAGTGGCGGTCGGGAGAGCACCCTATTAGTTAGTCTTTTGCGCACATCGCAGCAACACAAATACTCCGGGGAGGATGCACCAGAGGGGTTTTATTTCGCCCCGGTATTGACTTACGACGGAATT
14 | AGGTTTCACAGTTGTTGTGCTCACGTCCAGGTCTAATGAGTTTATTGAACGCCGAGTTGGAACTTCATTGTCCTGCCCAATAGTTGCGTGCCTGCTATGGCCCTGTCGCATGCTCTCACAGTACCGTGGCTCGCTATAGGAGCTGCCTTAGGATTG
15 | TTTATGGTGCGGCCCTTGATACGTCGCCTCACTTATTAATTCTCGGGACCCGTAATTTGGACGGCCCCTGAAGCCATCATTGGCTAGTTGAGCGACAGGAAGTATTCAATCCACGTTGACCTCGCTTGGGTCCCGACCCATGGGGCCCAAGACTAA
16 | CCCGAGTAATGTTAAGACATAGCATCGTGGAGGACTACAAGACTGGTAGTTCCAGCTGAGTGTCCTATACAAGGCCACGGCTGAGTAGTACATATACCATCAGTGAGGTAGTGCAAGATTCTGCGCATTTGGAATAAGCAGTCTAAACGGACTAAT
17 | GAGACGCTTAGCGTCCAAGCCCCCGAGCGCACGGAATCTAAACTATCTCACGCGTGTTGCGGCCAAGTCTGAGCGCGTCTTAGGGATCTTGTGTTATGGTAGGCAGCGGCTATACGTATACGGTGCCGGAACCCGGGGGCCTAGCGCATAGTTAGA
18 | GCCCAAGACTCAACTCACTCTGGCTGAGCCCTACATGTGAGTGTAAATGAGGCCCAAGGTCCAACTGCTAGAGCTGACTTTAAATGCTCATAGTCAGATGCTCAGGAGTAGCGGATCTGCAATAGGAAGCACTCGAGCCGCATCGACTGACGGCTC
19 | AGAACATTTTGAAAGGCTGGTTAAGGTAATCTCTCGGGTCACGCTCCCGTAGAGGCAACTCGGGTAGCATTCATTTTAAGGTCATGCTGTACATGGGGCCAGGGCTAAATGGTTTTAGACTCTCATTCGATGCGTGCACGCGCCGCACTCAGTCGA
20 | GTATGGTCTCTAGACCATGTCTTACCTGCGCTTCGAAGCAGTTATGGACTTCGGGAAACAAGGACTCATTTCCGATAGTGTATTTTATCACACTGGAATGATTCAAGGGCCATCCCACGACGATTAAGCTATAGCGTACAGTCGAATTGGTGCATC
21 | CGTACACTAGCGACAGGCGGATGCTCCCTCCACCCATAGTCGATTGGGAAAGGCTAGCAGCTTCAGGTCCTTGTCCAGGTCTCAAGAACCATTCAAGCAATCGCAGCCATGCGCGCCGTTGTCCTAGAGCTGGGTCTTGTTTACATAGTGGGCATA
22 | TGACAAACCTTGTGAGGATATGGTTACCTTAGTTAATTGAGATATGAAGAGGCGGACCCAAGAGATCTGATACTCGAGCATTTTATTAACTCAGGGAAACTGGCCACAGCCCAGGCCTGAACGCCGTAGCTTAGTATTGGCTGGAACACTACATCT
23 | ACTACCGATGGCGGCATCTGGAACGAGCTAGGAGTACGTAGTCAGATGGCCCATGTCTCAATCTTATATCTTGTGATTTGACGTAAGCGCATGGCGACAGGCCATCGCTTAGCCCGTATTGCGACAAGACTGATGTTCCAGCCCCCGTGATCTTAG
24 | CATAAGTAACACTTGACAATAAACATCGGTAAGTGTCCCCTTCGGCGATCAGGTGGCATAAAAAACAACCAAGGCCACGCCTTAAGTCCTGTATATCCCTACTTCCCAGTATAATGCCATTAAACGGAGTCTAACTAACTCGTAGAAAGCGCGAGC
25 | CGATTGAAGGTTCGGATTTTAAAGATTGGTGCATCCGTAATAACCTACCACACGAAGGGGCTCCGTTGTGACTGCGTCCCCCTTGCAAGATTCATAAGCACGAAACCATCATCGCATATTTAAAAGCACGGCGTCCACGGCTTAGCGGGATTCATC
26 | ACGATAGTGTTCTGTAGAGTGACCATGTACATTTGCTCCGGGTGGGGCGAATCGCGACCTGGCAGATCTGCGATAATAAACAAGGCCCAAGTCTCAGTCCCGGTTTTCGGGTGTATGAATGTACAGGAACCAATAACCGTACAAACGTATGGGCCA
27 | 


--------------------------------------------------------------------------------
/data/stepic_3e.txt:
--------------------------------------------------------------------------------
 1 | 12 25
 2 | TGATTGAAATTTACTTACGACACAATGGTGTTTGGGGTAGATACGAGGAGGGAGTTTTGCGGCTTTTTCTTAACCGTGTGCTAGTGCTTCCGCCTTTTAAGTTACATGCCCGATGCAACGACCTCTTCTTTGTTTGCCCCCGCGGAGTATTGGCTG
 3 | ACCGCGACTTATCACTCGTATCTAGGTGCAACAAGTGCTAATAAGTTGCTTGGAAAACACCATATGGTGCCAGATCCCGGGCTGGATGTTTAACGATTACAATCAGAACAAGAGCGGTCAAGCCCTTCTTTAATGCATCACGTGACTGATTATGTA
 4 | ACTTAGCTGGAACCTGCTTGCTTAATGCAACTAGCATTTCGCTAGCCTAGAACATCGTACTCCCTTCTGGTAAGGGCTCAGGACGTGTTGAAGTGCACTTGTACCCTGAGCCCTTCATTCTTATGGCATACTCTGATAACGGCAGGCTGTCTGTGG
 5 | AAACCACAGTTGAATGTATCTACAGGAATGATAACATACATGCCGGGACTGGACGCCTGTCCGTTATATATACCCGGAACAGGAACTGTACTTTACAGTCAGCTAGAATAGATGCGGGAGAACGCTTCGTTCTAGCGAACTCGCAATCCCAGGTAA
 6 | CGTGCCGCCAAAGGAGTTTAGATGAATAAACGACTTGGTATTTGCGAAGAGCATACTTCGCCAGCGCTTGGCCGGGACTTGCCGCCAGGTCGCCATGGTACGTATAAGTGGAAACTACCGAACCCTTCATTATCCAGGAAGCATAACCACTCTTGA
 7 | GAGCACCGCAAAACATAAAGTAGGATCACTTCGTTGACCTATACACCGGTCTTGTGCCTGGGGTGAAAATGGAGGCAAAAGGTGAGCTGCTCCTCCTGTTCGATCGCCTGCAAAAGAAATTCTGGGAGAGAGACGGGGAATTTTGAGAGGTACTTT
 8 | CCCTGTGATTGCCTTCTAATCGGGCGCTGACCGATCTACAGTCTATGCCGCTGAACTCACCAAACTTTGCGACCGGCCTACTCGAATCGGTAGCCAGCGTAAGGTTAAATCCGTATCATAATCTCTTCATTCTGTCGACTTTTACTTCGTTTCACC
 9 | TCTATGGTATACATCTCTTCTTTCACTCATACAAACGGTGGTGTGGGTCACTGCAGTTACCCGTTAAAGGAGAGTTAGCCGGCAGCTACAACCCTTCAACCACGATAAGACCATAGTGCACCGATGAAGCTAGTATGATGATATCCACACTGTGAG
10 | TTGCCTCGCACGATCGCTTCGTTCGTTGTCCAGAAACTATACGAATAGGAGGCCAACAATATCGTTTACTAAATATCTGGTTTACTGCCCCAGAAGGGCATGGAGATCGGAGACGGGGGGAACCTAAAGGGGCGATCAGAAGGCCAATTACAAAGT
11 | CTAAGTAGATTAGTGACTTTCAGCCAATTGATGGCAGCTACTCGATCACTGAAATGGTCATGGCCAAAGCTGCTAAAATCGTACTAAATCAGGTCTGACACCCATTGAATCACTTCATTTTGCCTCGCGAGCCTATTCCAGTAGGGAAGTCTTCAC
12 | ATTCCGGGCCAAGCTGTGTCATTTCATGAACGCGGGGAACCGACCAGTGGAAGTCCTACCCCATCTCCTGTATATCGCTCTCGTTTTGCTAGTACAAACCCTTCGTTAGGTTTTCGTCACAAGAGTCCGTTTCCAGATAACAAGGCGTACGCGCCT
13 | AGGGAACGAGTTGAATCCACTAGGGGCGGACTCCAAAACGCTTCATTTCATCATCATCCTTGGATTCGAACAAGCCAGAGAGCCGGGGGCTACGTCGAGAAAATTCTCTTTTAAGAACCTGGCCAGGTCTTGATGTGACGGGAACCATAAATGCTG
14 | TTTATCCGGCTGCATCCCGACTGCGCACTACACACCGCAACGTGGTAACCTGGACCACCAAGCAAAGTATGCCAACCGCGATCGCATAGTCCGGACGACTCGTGGCTGGAGTTGTAAGTCCTCTACCAAGTTACCACTTCTTTGTAAGGTGGTACC
15 | ATGATATTGCCTACGGCCCTGCGCGCATTGGATCTCTAAAGCCGAGAGGAGAGCAGCCTGTGAATTGTACGCATGGGGTTCTAGACCACTTCGTTGAGTTGACTCATCTCCGGGATGCATGTCACTTGGTATAACCACACTTATGCCTATGGTTGA
16 | AGACTATACCCTATCGCTTCGTTAAATAAAATCATCAAGTTTATTCGTTCCTCGTGGGAACCTGGTCAAAATTAACCATGCCTGAAGTCCAATTAGCGATTGGATATGTGGATGTCAGAATTTGCAGCTCTGATTGCACTGACTAAGATGCCGGAC
17 | GTACAAGAGAAGCCTAGCGCTTTTCCGCTCATTCCTACCCCTTCGTTCACAAAAAGCCACTACATGCCATCGGAGACTATCACTGAACTACTACTCTCTGATCCTATCGTGATATCCTGTAGTATCATATGGTTACCACGAATAGCGCTTCGACTA
18 | ACCCCTTCCTTGCGGTTCCTAGTGAGGGCTGCCGAACGCTCACTGAAGGAGTCAATAAGTAATGGGGGTTGAATACGACATGCACTACATAATGCTACCAACAAGGGGTGACCGAATGCTTTGCGCCACTGTGGGATAACAGCTACCTCCAGGTGC
19 | CCTTGATAGGTAAAGGCGTAACATATTACTCAGAAGAACGCTTCATTTCACGGAAATTTCACACTATTCTACAGTGTATCGGCTGATGTAGGAGGCCCCCGCCCCCTATCCCAGAACATGATGAAATGTATCTCCTCTGCGGCATCAAACTCTACC
20 | GCCAAAAGTACACATTTGATGAAAGGACCCCCGCTTGTCTTCTCATTCAACACTTCTTTCTATATCCGGGTAAAAAATCTCCAGTCCCAGGTACTCGGACCGTCGTGGCACGCTACCTCGTGCGTTACTGAAGGTCTGCAAACGCATTAACCCAAC
21 | GAAATCCGTTATTGGATTTGGTGAGTAATGATTGGGATCGCTTCGTTCTACTCCATAACACCATTCATAAAGGGCCGGCAAAGTTGGGTCAACTTCGGCCCGGTCACATCCGTAATCCATATACGGCGGGGAGGCGGACTAAATTAGATCGTTCCG
22 | ACGCCAAACGGTAACTCTTCTTTGGGTTGGAGTCTTGGTGGGGTATGTGGCGTCGGAGTTGTACTTTAATCCTACCCTGAAGTGGGACTTTCGCTGGCTGGTTGGCTGAGGTAAACAGGCATTGGACGTCTCTAATTAATCACTTATCCGGTGTAA
23 | GCTTGTAATTTTCCACGTCGGGTTTATTAGAATACGAGAGGACAGTGTCGGGGTGTGCTAATCTCTTCCTTATTAACGCCTTTTTTGCCCGTTCTATCCTGATACCACACAATTCAAAAACTGCTCGATCAAGACATACCTCATGTTATTCGTGCG
24 | CGCATGACAAACAAGTGTCACTGTGGAGTAGTGAATTTGCGTTCGTACACAATGCAAGCGACCGCTTCCTTGATGCCTCCCTAGGTACTTAGAAGTGGATTCATCCATTGGGACACATAAGTCAACCGATAGGAAACTTAGCTGAGGTGTACTTCT
25 | TGTCGGACTGAAATGAGAAAGCTAACTGATAGGGCGGGAGGCCGGTGCTCCAATGGATCTTCGAGCATTTTTAGCACTTCTTTGCGCACTAAGATAATCCCTATGTCCCACTACTAGATATTTACCCGGGTAACACCTGCGACTCCTAGCCTTAAA
26 | AGTCCGGTAGTGAAGCCGGAGTGTCCTCGCGACTTCGCCATAGTTCGACCATTACCTTTTTCTGCCGGCTAGCAGTTGTATGTAGCTGGCGTCTGGATACGTCCGTACGCTGCTGTTTTCAACCCTTCGTTCAGACTTATTGCAACAAAGGGTGCA
27 | 


--------------------------------------------------------------------------------
/data/stepic_3f.txt:
--------------------------------------------------------------------------------
 1 | 15 20
 2 | CGTCTTCTTACCGATTAAAAGGGCAACCCTCCAATGACACTCGCAATTACACAGGCGATAGACGCACCCTGATTCCATCCTCGAGCCTGATGTACGATCGATCCGTGTAGCATTTGCTAAGATCCCACCCACTATAGCGTTGAACATATCTGGAAAAATGGTCGGCGTCTTCTTACCGAT
 3 | TAAAAGGGCAACCCTCCAATGACACTCGCAATTACACAGGCGATAGACGCACCCTGATTCCATCCTCGAGCCTGATGTACGATCGATCCGTGTAGCATTTGCTAAGATCCCACCCACTATAGCGTTGAACATATCCTTTGGGGTTTGGCCTGGAAAAATGGTCGGCGTCTTCTTACCGAT
 4 | TGAGGTATTACGGTCGCAAAGCATCATCTCGGACGGTACGCAAAAAGAACCCGACCTTAGAATGACGTCGTCCAGTCATTCAGTGAAATTGATCGTACGGCTAGGCCTTCCTGGTAACGCCAGGCCCTGCACGGGTCGCAGGGGAGCCTTATAAGCCAACCAAGCCCTCAGATTATCTAA
 5 | GGCGTCATAACGCGGGCGCCTAATTGGTGCCCGTCACCTGGGTGTCCCCACAGCTTATACTTCCTAAGTTGGCCTAGGTCGATAGCGGTTTTGGTGGTTTACAACCCTAAATCGCCAGAGCGAAGGAGACCTGCAGTCTATGGAAAGGAACTCGTTCTTTTACTTAAAGGCCCGGTAAGA
 6 | GTCCGTCTAGCATCCTCGATACTTCGGCGCGATCTTTAACTGATAGTGCGGACATACGAGCTGGTTTGGCCTTTACGAATCAGCGAATAGCTGAATCCATTACTTATTTTTTTGCTAATCTCTATAACACACCGAACTGCAGATACATGGACTTAAGTCAACGCACCCCTTATGGCGAGT
 7 | TCGCGATGGAACGCGTATACCGTTCCAGATTCGAAGACGTTGGCTATCGGCCCAGCAATCTGAGTTAGGGAGCTTCCACTTTTGGCCGTGTACACTTTTCGATTCTGCTGTGACGTGTAGCAGCGTGAATGCCTATATTCCTACTCTGGTGTCGTCATTCTTGTAGCCGGTCTAGCGGCG
 8 | CAGTATGCGTTTTCCTGGTGCAAAGCTGTTACTCGTCGCCCGCTTCCTATCTTGGCCCTTCTAAATAGACGGCCTACCTGTACCCTCCAGCCTCGTCGGGGAACTTTGCTGGTTATAACTCACGAGTAATCGCCAGAACTTCCATCCGGAGACCTAAGAACCGTCTTGTCCAGCAATGCG
 9 | CTTACCTCTGTTAGAAACGCGAAGCATATCCGGCCACCTGGAGACCTGGGGGAAACACACACGCGGATCTTTTGTAGGACTTCCTGGTTTGAGGAGACTTCGGTCTTCTGCCAAGTAGCGTACGATCCGTCTATGGGAGTGAGTGGGTCTACCGAACGAGATAGGCGTTGATCGCTATCA
10 | TGATTTGACCTAAGCTGTGTGGTTTGGCCGACAAGACTTGCACCACTTGCCAAAAGGTGGGCTCCAGGAGTCCACACAGATATGTGCGGTACAGGCGTAATGTGTACGAGCTGGGAAGAATACGTGACGGTCAATACCCCGCCGTCTTGCAGGCCATTTTCCCGCGGAGGTTCACCGAGC
11 | GCTCGGTCACCGGAAATGGTCTACTTGCGTGTCTGGCTAGCTGTACCCCTGGTTTGGCCATCACGTTAGTAAGATCACCCGCCAACCCCATTATAGGGGTTCCCTACTAATCCCAAGCTCGGATGTCTGTCGGCCTCAGCTTCCTCGCACGTAGACCTGTACTTCGTTCAAGCTTCAAAT
12 | TATCACGGGTGGGGTGTGACACAAGTCAGAGTCTGTACTCGCTGGGCTGCGCCCGATCTTTCACATTGAGTGGGCGCCTACGATTTTGGTACTTCTACGTTTGGCCGCGAGATCCAACTTGAAACGCGTTCTCTTACATAACGCTCGCACCGGTATCATATTTTGCTATCTGGTCCTGGA
13 | AGCTAATTGTAGCCATCCTGGTTATGTTATACAATCAGCTCTCTTCCTGGTTTCTACTAAAGATACATCCCGAGAGCGCTTGATGTGCCGCTCAAACGCCACATTCCGCGCTTGAACTAATACCGAATTAGCGAGCTCTATGGACGCAGGCATGCTACAAGGCAATGGCCTCAATGTCAC
14 | TTGATCGGCTTGAAGGTTTGGCCATTCACCCTTTCGAGAAGGCTCGAAATCGCCGGGCCGCGAATCTTAAGCAGGACTCGCACCCGAACGTGGGTCCGGGTATCTCATGCGGACGATTCTACCGACGTACGCCGAGGGCACATGGATGGACTCTCGAGATGAATACCTTTCAGGTAGGAC
15 | TATATGGCTGGGCCCCTGTCTACCGCTAAAGCATATGAATTCGGCATAGTGCTTCCTTCCTGGAGGGGCCAGGCGAGTTTCATTGTATAATTGCTAGTTGGGTCGCTATTGGTTACGATAGAGGTTGGTTACCTCGATTATGTTGCGCTACTAGCCTTCCGGCCGCGTTTACAGACATGA
16 | TAATAAGTTAGGCGATAGCCACCACGTAAACACGTGCCTCCGGATTGAGTTGGTAAGACCACCGCACCGCGACACCGGATCTTCATGTGGTGGGATCTCATATTCCTGGTTTGGGAGAGTGAGGAAGACTTGATAGACCGCAGAGTTCGACAATGAGCCACTAGTCTTAACAGACAGTTT
17 | TCGGGGTAATCCTTCGATGTTTGGCCGGACTGGGACGCCTTGATCCTGGTGCTCCGAATAGGCTCTGAGAGTGTATTATTTCTGTCTAGCTTGGAGAGACCTTGGCCGAATCGGTCCCCGCTCACCTTTTGGCATTGTATAGGGAGGGCTCTCACAGGGCTTCAGAGGGAATGGTCAGCT
18 | GAAAGAATGCGTGTAGGTTACGTCCCCGGTGAGGTTTTACTCACCGGGTGGGCGGAACTTGAGCATCCACAAACCACTATTGTCGGGACTCTTAAATAAGACTATGGCTTGGCTCGTTCGAAAGAGCTGGCCATTCGATAGTCTTCTTCCTGGTTGAACCATAGGGGCCCCCGTAATTAC
19 | CCTCCGACGTATGAAATGTTGTGGTCATAAGTGGTATTGATTTGAGGGAGAATGTCTTTAACGTAGGAAAGCTGCACACCTGTCGTAACGGTCTGTGTTGGGTAATACAGCTCATGACGTTGCTTTACTCGTAGCACGTGTTAGTCCTTCCTGCGATGGCCCTCATTAAAGCGTCTCCTT
20 | GGGCGAAGAAATCCTGGTTTGGCTCTCTCCCCTTTTCATGCACCGACTAGTGCCGGTCCGATTAGAATGCTAATAGGAAGCCGGGAACACCGAGCGTATCGAGCACACGACTTACGCGAGGTACGACTGAAATTTTCAGTTTCTCAGAAGGTACTCCCACGTGAGGTCCCTAGATCGGGA
21 | CGTTATCGCGTGCCGGCGCGTTATATATCGTCGAAGAAGCTTCCGACTTTGGCCGCCCTCTCGAGATATGAGCCCCATCAATCGTTACCCTAGTTAACGACGCCGAACATATAGTTCATCGGCGAACTAATGTTGTATGACAACTCAAGCACTTGGTGTTTTAGACGCAGCACATTGCCC
22 | 


--------------------------------------------------------------------------------
/data/stepic_3g.txt:
--------------------------------------------------------------------------------
 1 | 15 20 2000
 2 | AGCCATCATAAAGGCTGCCAACTTCAATGTCATTAAGCCCGAGGCCGCATACGGGGCCGGAGCGACCGGGAACCAATCCAGTAGTTGGCCCGCAGCCTCAGGAGGGACAGGCCTCATGCGTGACACTTGAGTAAAAGTTACACACTCGACTAAGGTTTACCGAGTTGAAATAACCGAGTGACAACTGCGCAACTCAATGCTAGTGGAAGGACCACCACCGGCAAGCTAGCCGTGCGACTTCCCTGAGCCGGCCCGTTATGCCCAACCAATTTTTTGCTGAGCACCTGAGGTGCACTCCGGCTAGGTGAAGAAGCCATCATAAAGGC
 3 | TGCCAACTTCAATGTCATTAAGCCCGAGGCCGCATACGGGGCCGGAGCGACCGGGAACCAATCCAGTAGTTGGCCCGCAGCCTCAGGAGGGACAGGCCTCATGCGTGACACTTGAGTAAAAGTTACACACTCGACTAAGGTTTACCGAGTTGAAATAACCGAGTGACAACTGCGCAACTCAATGCTAGTGGAAGGACCACCACCATGATACGAAGTGAGGGCAAGCTAGCCGTGCGACTTCCCTGAGCCGGCCCGTTATGCCCAACCAATTTTTTGCTGAGCACCTGAGGTGCACTCCGGCTAGGTGAAGAAGCCATCATAAAGGC
 4 | GACCGCTTGCGTCGAAGTGCCTCCTCGGACAACCCAATAACATAGACATGGCTTATCGCCCAGAAGCGGGGACCGTAACTACAGTTTAGCCCGTTGTCAATACAGTGTCAAACTCACATTAAAGGTTCAGGGGGTATAAAGCAGGGACTCTCACCCCATTTTACGCACCGGCCAGAGGACGTATTCTGGCGATGCCAGGGAAATCTGAAAGAATGTGGCCTCTACCACATGCAGAAAGTATCGATTTTGTGAGAGGTGTTAGCAACCTCAGGCTAGACAGTGGCCACGCAATCGAAATCAGATTTGATTGAGCTCCAGAGCGGGGA
 5 | TATCCGGCGAATGACCAAACACCATCATACCCTAGAACGTCCGATCTCCCGTACCCGCACCGCGTTGAATGGTCCAGCACACTGTCCACCCGCTTGTCATGCTTAACAGGAGCTTCTAATGCGTCTTTGTGAGTGTGTATAGTTCCGACGTCTAGCCGGTTAGACCAGTACCCCTCGCGCCAATCAAACATTTCGGATGCCGTTGATTTGATATAGCCCGCTATGATGCCGCCTCTTGACCCTGCCCCGAGGCATCTACGTGATCGACTGTAAAAGAAATTCAGATGACCGCACTCCTGAGGTGGATTAGGCTATATAAATCAAGG
 6 | TCCGACGGGAGCCTGTGTGTTACCGACTTCCCGTAAAGAGATAAGGTTCCGGGTGACAAACGTGAAGCACGAGAAGGACCAATTTGAACACCTTATCGAGTCTAGGGTGTCATGTGTGGGAGTGCCCGGTATCTGCGTATGCGTGCTAGTGAGGCGGTACTTTTCTTTCGTCTCGAACCCCGACTAGCTCGATGCACACTATGATAGGCGATACCACTTCTCCTTTGCATCACAACTGAAAAGGATTTCGATTTGGCTGTTTCCGCGGGGGTGAATCCGATTCGGATGTCACACATCCAGCAGTGGACCATGCTTGATTATGGCAC
 7 | GGATAAATCCTAGCGGGTTACTGCGTCTTCGTTATAAAGCCTTTTGCCGAGCACGTTCGTGCAGCTGCTCTGGGGTTAGGCGCTGTCCTAGGCGGCGTTTAATTGTCTTTGCTAAAGAAACCGTAAGCGTTCAAGCGCTGTTGTCGATAGGGGGTGCGTGTGAACCTAAATATAGTTACACGATGCGTCGCTTTGAGATAAGCACTGGTTCCCCCGCGACCAGATGTAGGCTGGACACAAGATCCATTAAGATCAATGCTGATTTAAGGTACAGACATTTAATAGAGCACGGGACGTTAGTTCGTTTTGCTATGAGACAGGACGCT
 8 | CCTTAAAATCTTCTAGACATAAAATCGAAATCATCCTTACGTCGACTAGCCATACGTTACCTGATCGTAAGGGGTTACCAAGTGCTTTATATATGACTGGCAGTACATCGCGAATGGTGCGAAGTGAGGTCTCTTACTCAGGCGTCGGGGAAGTTTCCAGGGAAAGTCGCCTGATCTGGGCCGCTTGTATCATTCGTTATTCCGTGGGCTGTAACAAGTAGGGCTTGACTTAGTGTCCGGCTGAAGGAATTAAAGGATCAGCGGGGATCTAAAGATTGTTGGGAGTGCCAGTTGAATGCTGTCCGGACGGTTGACTCTTAAGGAGG
 9 | GGGACCAGCTGATGAGGAAAGCCTACGGGTCGATAAAACTTTTGAGTCGAGGATTCTTAGGATTCGGCAATTCGCACAGAAGTCCCAAACTTGGTACTGTCATCGCTTAGCTCGAGACACATACACTCTCATCTCCGTAGCCATGGATTTCTCCGAGAAGCCCCCTTAGCCTTGCGCCCCAGCCTGGTCATAACGTTACGGTGCGAATTTCGTCGTTTGGCAAGTATGCATATGCGTCGAGCAGAGCAACCTGGCTGTCCTAATGGCGGTCGGCATCCTATATTATTAGCGTGCTTGGGGCGGCCGGTAAAGAAAAGACCCGTACC
10 | ATTACTCCAGTGTCAACCACCGGAAAGTATTAGCTTCGTACCAACAACAAGTCGAAGTGAGTAATGGACCGACAACCGTTGCCGGGAACTTATGGCTCACCATCAGGGACCTCTGTTATAAGGCATCAGAACCCGAGCGGAATCGCGTGGGAGACTATATGTTAAAGAGATGTAATGACCGCGCGCAAACAGAAATGCAGGGTAAGGCCGTCCATGCGTCTGGACCCAGAATTTGGTGTGCCTCATCCTGTTTTACTTAATAGAGCATGGCTTTACCCTTGTGGGGTGTCCGGCCCGTGAGCCGCCTTTGGACTTTCTACAGCTTT
11 | GTCCAGTATCGTCGCAGAACTACTCTGGAGAATTTAAACAAGAATTGCACGGGTCAACACCTTTCCGCCGGGTAGGGCGAGGCCCAGACGACAACATATGTATGCTAAGAAGTGAGAGCGCGCGACTTTGTAAACCGTTGAAAATGCGAACAAAGCTTCATGTGGACATTACACGGTGCGACACAGAAAAAAGGCAGAGGAACGAGGCTCCTTTCTCTTAAAAACGAGGCATAACATTCCTGCCTCTCGTACTCAGGACTTGGGGCCACGCCTTCGGCAGCAAGATGACGCAGACAAACTGGTGTGACCGCTTAGGTGCACAGGAT
12 | GTATACTCGCTGACAAATATCCGCGGACCCACCTTTTCTCCAGTAGGTGGTAGTATGGCATGGTCCGATCTCCGAGTCCTAAAGACAACAGTTGGTGAGTCTCATCCATTGGACCCCGCGGTACCTTCCATCGTACAACACATGCGTCGAAGTATCAGAGGGAATTGAAGGGTATTAATAATCAGCGTCTAGTGCATCCCGCAGAACGATTATTAATCAAGTACGTCTATCGGGGGTCGTCGCCGTCTCTTGAGAATTTAGCTCTAATCTTTGGGGAGGTGTTTCCCAGGAACGAGCAGTCGATTAACTACGGCGTAAGTTCCAAA
13 | ATTGTTTTTTTAATGGTCTCGCGACAACGCGGTGCAAAAGCTGAATTTCGCGGCTCGGGTCCAGAGGTAAATTCCGACTATGGGTTTACTCAGACGCTCTGAGCTGATCATTGTTTTCTGGAATGAGAGGTGGAGCGTCCTTCCAAACTGCTGAGAGGTCTGTGTACGGGCTGCCTTGCTTAAAACAGCGTATGATTCTCAAGTTAACTTGCGCTTGGAACGTATTACACTGTCCTGCTGCCCCACATCAATAACTCGAAGTGAGGATACTTAGCCTGAGGGCAGCTCGGCCGGAGGCTAAGCGGCGCTCATGGACGTACGGGTTT
14 | TCTGTGCTCTGAAACGTTACCGAGGCATCGTCGTTTAAATGGCCTGAATTCCGTTAAGCCTTATTACCAACGACTTAATATTGTACCATATTTATTTAAACATAGGTTTATTTGTATGAGGATCATCTACCTTCAGGTCCGAAGAAAGGAAACTAATATATGGCTAAGTCACCAAGCACGTGCAATGATACACAATGCGAAAAAGTGAGTCAGCTATAGCACAAGACTCTAAATTACTTTTCAGTGGGGGTAAAATCGGTGACCGGCTTGAACAGGCGGACCACGTTTGCAAAGGCGGCAGAACAAGAAATGACCGGGGTCTTTAT
15 | GTGTGGACATTCCGACTGAGTACGAACCGGAGTTAACGGGGAGCCTATATATGACCGGCTATTCAGTGCGGGCCTTCATGTTATAACGCTGCATTTACGATTGACGTTGTAGTTTCAGTTTGCTCTGATTCTGAGTTGCACAATCAGGGCCTTTGCTAGTGGCGGAGAGCAAGATGCATGGTATTAGCCACCAAGGACGGTGTCTGCGCAAAGTTAGATGGCCTAACTCAGAACATAGCGTACTCATGTGGGCATAAGGAGCTATTAGACCGGATCGAGACTTCGTACATCGATTATATTTATGCGTGCTAGTGAGGCTCGGATAT
16 | TCGTCTAGCCCGCTACGTTGGCGTCCGGGTTAAATAGCTATGAATTTTTGCTAACCTCAGGGTCCCTTGAGCCGCTTAGTCGAAACGCAGGTATATGGACGGTGACCGCACCAACTGGGTGGCTCCCTTTTTAAAGGAGTCAATCTGCGTTGTCATAAACATGAACATCTCTGTGAGCGCAGGGAGGATTACCTTAATAGCCGAACAACCGACACGTCCAAAAATTTGAATTGATTTACCAACAATGGTAGGATGAGGTATAGATGCGTCGAAAGAAGGACTAGTTCCCCCCAAGACGCCTCACGACGCTAATGCCTCGGAGAGTT
17 | TCAATCGCTTCACGCAAGCAATGTGTTATGGGATCTCTAAACTATGCGGGCTATTTCTATGCGTAACAGATCGGAGTGTCGACTTTGATTTTCTGGGAGACATGACCGAGCTTATTACTGAACGCAATGCGGTTAAGTGAGGACCCTGGACGAAATCGCAACGACCATGGAGCAGAATCTTGTCTCGAGTATCCGTCACCGATGCCGCGTGACGCCCCGAGTTTCTGTATATGAACCCTGGTTAACGGTCCGGACAGGCTTGCCTTCTCTGCGACTATAGAGGACCCAGATTCAGAAGTACGGAAGAAGGTAACCTCCCGCTTAGG
18 | CGCAGAGAGTATAAGGCGGTAACACATCATTCCCATATGAACGGCATACTTTAATGGAACGTGAGTCGGACGTGCTTGTACTCCGGCATTCCGCATACGTTGACCTAATGGGGGTACGTGTCTGGCAGTCCGATTCCCAGCCGCGTCGAAGTGACGTTCCCTTGTACAGTCGAACACGACTGTAGATCACTGGTCACCCGTGGTCAGATGCCCCATGCAATGGATGCATAAATTTGCATGGCGCGAGTTAATGGTCCCTCGAGTAGACGGTTGCAACTGTCGGGATCCCTACTGACATCATTACTGGGATCCGCCACCCCAGGCAC
19 | AGGAGCGTGGATGGAGAGTGCGTAGTGTGTTTTTACCGGACTGGATGCCTCTGCCAACGTCGATGCGTCGAAGGTTGAATTTCATGCGCTATGAGCGGAAGTTGAGCAAGCTCTATTGCATTACTACAAGGCCTAACACTCAAACTTACAGGGGTCCGCAATTAAATTTTCAATCACTGCCCACCTCTCTGAAAACGGGACTCCCTCGAACATAGAAGTCCACGGGATTACGGCCCCGGGGTACATGGGGTGTTCCCTTTTTGGCGGACACGACTTAGTAGATGAAACAATCCAATTCCGAGGACAATTACACTGGAGTTCTGCAG
20 | GCCAACTGACGGATTTGCTCCGATGTTTTATGGGCGTTCGACCGGTCCCTCGTCGAAGTGAGCGCGGGCTGAATACCTATGTGTGGAAGTACGGTGTGGGGATTGAGATAGTGTGCTTAGTCCCACGATTTTTCAGTTTGTTGCCTGATGGACAACTCTTATGAGCACCCTTATAGCCATGATACCTATTACAACCGGTCTTAGCTACGCTGTGAGGTGCCGGTTATTGGCGTACTCTGCGACGTTTCACGCGGACGGGAGAATCCCCGTAACCCTTAAAGGTCCTGCATTAAGATCACCTTAGGTTTTCTTTACGTTCTACTGAA
21 | CCACGTCACAGTGCTGCTGTGGGAATTAAGATAAGTTTCGGCTTGAGCATGGATTAATCTTTGCACGCTGGTCTTAGCCTTAAGCGAGGTACGTCAAGCGTACTCGGCTGAGTAGCCGTCATCCTGAGCGTATCAGTCTGGCTGTCGGCTATTGCCCATGCGTTTCGTGGCCATTGGCCTCTTGATGCGCGTAGACGAATCACGAGCAGGACCGAAATTATCCTTCCTCCTCCCCGTCATCATCTTACATTGTGGTCAGCTATTTAGCGACATGCTATGAAGTGAGTGGATTGCCGTATTTAGGATTGCTAACCAAATGCTTCCGC
22 | 


--------------------------------------------------------------------------------
/data/stepic_4c.txt:
--------------------------------------------------------------------------------
1 | 12
2 | GACCCTTCGGCAGGATTCTAATAACTACTGACATATCAGATTCGGTTGCCTATCTAGCGTGAGCTCATATCCAATGCTCTACATCCAGCTTTTAAGCCGAAGGCAGGTCGCTTCTCCGCTTCCCTGAATTCACGGGCCGACACGATACGGTGACTAAAGTTTGGCCCGCCACCAAGTTCCCAAGGCTATCACCGAAAGAAAGCGGACGGTATACCTGAGATATAACTATCTTTAAATAGGTAATTAGCGCGAATTGATTTCCGCCCCGACTGTTTGTCTAGGGCGTGGTAGACTGCGTCATTAGTAGACAACTCCCGATGCAGGTTAAGAGGGACTCTCATACTAACCCCGGATACGACAGACGACCAAGCCCGCACTGGAGGGGTATGCGAGTATATGGTCGGACTTAGAGACTTTTGCGCTAGCATTAGTCGACGTATCAAGAAGCGTGACGTCATGATCATCTGACGTCGAGCGCGCTGATACCTGCTGAGTAGATCCCGCTCACTCCGCGGTTCTTCTCCGTGGCAAGTCCAACCACAATGTTCCGTTGGGTACGCGACTTACCGGACTCTCCGGGCTTTTAGGCCCTGGCAATTGCTAAGATACAATTGGGAATCGCCCCTTTAAGCCCAAGCTTTCCTTCGCTGACCGCGAATATTGAGCCGGTTGTACCATCTCTAGGAAGACCATCCTCACCGGACGCCTCTGTGTGTTAATTATCTCCCCAGGTGAGATGAATAAGTGCGGGTAGCCCAGTCCAACTTGAAAGCCCTATAAGATGGCGCGGTATGTTACAGATGTCCGAGGCGGGCCCCCCGTCCCACTTCTTGGGGGATAAGCCGTCGGATTTGTATTACTGATCTTTGTGTCTGCGGGGACGGCTGCGCTTCGGTTGCAAGGTCGCAGCAAGTGTTGTAGTGACCTCAGTTAAAGTTCATTGACCCTGCTCGCACGGGCAAACGGTTCGGCCGGGAATAGCTGGATCCGATGGGACATATACGGGTACCAAGGGTACGATTGAGGCAGCTAGCGGTTGCGTGTGGAATTATCCGCGTAGCAGCGGCACAACTATCTCATTGCAGTGGTGGGGCTAGGTCCGGAACCATAAGCTGTCATTCACTGAGATGAGCAGACGGTTAACCCAGGTCTTTGCGTGCTAAGCTATAGGCAGGTCTCTGACGAGGGTAGAGGAAGGTGCGTCCAAATATTCGCCCATTATCTTAGGTGCGTCTGATCGTGGGTATGTTCCTCAGTTGGAGCCGGATTCTACCCATGATACTGGTTTGAAACAGGATGGGGTCGACAAACACTCAGGTAAGACCACAGTAGAGTCGACTGTCGTCCTCACAAGAAATTCGCAAGTTATTACCCTGGCGAATTACAGATAGGTCCGAGAACTTATAAAGTTCGGCCGTTCACATGGCGGTTCAGACCCCTTTTCTTCGATCGAATGACGCTAGGAAGCATGTTGCACCCTTGAGGATGGAGGCGGCTCTTAGAGCTACATTCTAATCAACTCTCTTGTGACTTTGACTGACAGGCCAGAACTCCACTTACGCTGGCGCTAGCATTCAATTTAAGTACATTCTGTCTAGGAAGAGAGAACGTACTTTGAGGACAATAATGAACGTCCATGACGGACTAAGAGGAAGTAGATCGAACGCAGACGCGGTATGTCTGGCGCGGCGCGACTACATTACACCCGTTCAAGTGGAGGGTATCTTGGCGCCTAGAACTGAAATCGTGACTTTCATAACCCCTATTTGATTGAACCGAGGAATTCCGCATTCGGATCACCCGGGAGTAGAAAGCTTATATGGTTAGGGACGGTCAAAATAGAGTTAGAAGGCGATTGTTAGCCCAGCGTGTGGGACTGAACGCGAATGTGTGCGTAATCTGAGGAACGGCCAACTGGTTGAGGTGACTATGCCTGGTGTGCGAAATAACTTTGTCGGTAGGGAGAGTAGATTAAGATTTCTGC
3 | 


--------------------------------------------------------------------------------
/data/stepic_5c.txt:
--------------------------------------------------------------------------------
1 | 17
2 | 


--------------------------------------------------------------------------------
/data/stepic_6a.txt:
--------------------------------------------------------------------------------
1 | 16730
2 | 22,13,11,5,3,1
3 | 


--------------------------------------------------------------------------------
/data/stepic_6b.txt:
--------------------------------------------------------------------------------
 1 | 17
 2 | 11
 3 | 1 2 1 4 2 1 4 2 3 4 2 1
 4 | 4 3 3 4 2 4 1 1 1 3 1 1
 5 | 4 2 2 3 4 3 2 0 2 2 0 0
 6 | 3 1 0 1 0 2 0 3 4 4 1 0
 7 | 1 3 4 2 2 4 1 2 2 2 0 4
 8 | 0 0 3 4 1 0 0 2 4 1 0 1
 9 | 3 2 1 1 1 1 4 3 3 4 0 4
10 | 4 4 0 2 0 1 2 3 0 0 3 3
11 | 4 4 3 0 3 2 3 2 0 0 2 0
12 | 2 3 1 4 2 0 2 0 1 1 4 4
13 | 1 0 1 1 2 1 1 2 2 3 0 4
14 | 2 0 3 2 2 2 3 1 1 0 4 3
15 | 2 0 2 4 2 3 3 3 1 2 4 4
16 | 3 1 4 1 3 1 1 2 3 2 4 3
17 | 2 4 2 2 1 4 1 4 3 1 1 4
18 | 1 1 2 2 2 0 1 3 4 0 0 3
19 | 2 4 0 3 1 4 3 4 4 3 1 0
20 | -
21 | 2 3 4 2 3 3 4 3 1 4 2
22 | 4 3 4 0 3 3 1 4 1 0 2
23 | 2 4 4 3 3 1 2 3 2 0 3
24 | 0 4 0 0 2 1 1 2 2 0 4
25 | 2 0 0 1 2 4 1 4 1 3 1
26 | 3 0 2 3 4 0 2 1 0 4 0
27 | 4 3 1 3 3 2 4 1 2 0 4
28 | 3 4 2 1 2 0 4 3 0 3 3
29 | 0 2 4 0 4 4 1 3 4 1 1
30 | 3 3 3 3 1 3 1 2 2 2 3
31 | 2 1 3 1 4 0 4 4 3 1 0
32 | 2 0 4 4 3 3 2 1 4 3 2
33 | 2 2 4 0 0 2 4 3 3 2 0
34 | 2 3 1 3 4 4 4 0 3 4 4
35 | 0 2 3 1 0 4 3 3 2 0 0
36 | 3 3 2 2 0 0 1 0 3 2 1
37 | 1 3 1 3 3 1 4 0 0 3 1
38 | 2 4 3 4 2 3 3 2 0 4 4
39 | 


--------------------------------------------------------------------------------
/data/stepic_6c.txt:
--------------------------------------------------------------------------------
1 | GCATTATTGGTGACTTCTCTTACAACTCTGGCCACCGGGAAGATGGGCTATGTCAAGAGGCTTGCCTTGCACCTCCGAGGCCTGCTCGCCGGTATGCTTCGACAACGAAACCAGGACCGACCACAACGTAGCCCCCCCCGTCGTGCGTTCACAGTTACTCTAAATGTCAGACCGTTTCCGTGTGGTCCCTTAGGACTTGACCCTCGGAAAAATAAACTAATCATCGGCCTATGGTTGACTAGTCTCTGCGAATAATCATACAACTACTGGCCGTGCAGAACATTATCTATAGAACGTTAGCAGAGGATTTTAGTTTGCGTTTTCGAGTCGTGTTTTCAGGGATAACCCTTTCCCGGTTGTTGCACATAGGACCCCAATTCCAGTGGCTGTATGCATCCTTCCAGGATAGTAAGCTGCGTACGTTCCGCCGTGGCCGGGATGCCTATAGATTACACAGAAAGGCGTCAGTTCTTGAGACCGCACTGGATTCGACCCGCCGTCCACATTACGACAAAAGTTCACGAATCACCGTGTCGTGTATAAGGCCAGCTGGCGGTCACATCACGGGGGTATCAACAGCCCTCCCACCTACATAGAAGTGCGAGTATAGCGTGTGGTACATGTGTAGAGCATCCACATCGAATGAGCCCAATAGGTGCTGCCTACTCTAAATCTATCAGATAGGGACATGCCTACCACAGGGGATTTTTCCGACGTACTGTGAGATTTTACCAATGATCGAGCCGCCCCATTCTGCCTCGGCTGATAACCCTGTCTGCGCAGTGTTGCGCCCCACCATAGGAAGAATGTACCGGCTTCATTCCCGAACGGAACAGGGCACATTACGCTGCATTTATGCTATCGATTAAGAGTTTCTTTTCTATTAACATTACATGTTGACAGGTCCGAAAATCGGCCATCTGTTAGAACGCGCAGATGCCGAAGCTCATTTCACTGTACGGGGGCCCTCATGGCGATTTCCAGCTTACTATGAC
2 | AGTGAGGCCTCGCAGCTATGACGCAGACGCAGCCCAGACGTACGTCCAAGTCACTGAAGAGAATGTATAGTATTGTGAAGAAATCGACTGGGAAGTGCGGATGCGTAGTTCCGTAGGGATCAGCCCTTAAGAGGCACAACGGACGTCTGCGGGTGGTGGAAGGCTAGGGTGCATGGAGGATGGGGTAGGCCGTAGGACGTCCACGGATCTCCTGGACACCAAAGGAAACCGATCAAAATCTATCTAGATGAAGGCAATAAGTTGGTGAAGGACGGGTTCCGGTTATGGCACTACACAACCGGCCGGTCCATGCAATTTAGAACAACGTTTGAATAGCGAGTGGGGGAACTTAAGCGGCAACCAAAATCTAAATGTGACCGGACATGTCGTATGTTTCGGGCCCCTTTTCGACACTGAATAGGTGCAGGGTGCTACTCCTCTTATGGCTTAGACATGAGGAATCCAAGCATCCTGGCGCCGTAGTTTAGCCGCCTGGGACAAGGGTTTTTTCAAACCGTTTTCAGTATAGAGTGAGCGCCCTGCTTAGCTCATAACGTGCGATGAGGAGTATACAACTACAAGGCAGCAGGATGAGTCTAGGAAATAAGCTGCCTAGAAATCTACTGATGCGCGCACGCTGAATTCGTTGGGTAACCAATGCAACGTTTCCCAAACCAAGAACGCAGACGTGACTCTTTTTATCCGGTTGCCATCGCACCAGATAGCCCACCTGGTAGCTACAGGGAATCGCCGCAGTAATCAGAGCTATAAGATGATCGGCTGCCAGGGGGCTCGACGTTCTTTGAGAGATACAACGTGATCGAATTTAATTCTGTATGGAGTGATACCCTCGCCATAAACTCGCTTCAGCTGACCTATGATGCTACGGTCGACCTA
3 | 


--------------------------------------------------------------------------------
/data/stepic_6d.txt:
--------------------------------------------------------------------------------
 1 | 0
 2 | 21
 3 | 10->14:25
 4 | 10->17:20
 5 | 1->19:6
 6 | 18->20:7
 7 | 15->21:38
 8 | 11->22:19
 9 | 4->14:39
10 | 1->21:14
11 | 4->12:26
12 | 10->11:32
13 | 9->17:23
14 | 9->15:8
15 | 9->12:29
16 | 9->11:14
17 | 5->21:28
18 | 2->21:0
19 | 9->19:19
20 | 8->20:7
21 | 14->20:34
22 | 19->20:39
23 | 13->19:3
24 | 6->8:3
25 | 12->23:0
26 | 12->22:19
27 | 6->12:15
28 | 2->5:10
29 | 6->16:37
30 | 1->18:9
31 | 6->18:18
32 | 6->19:13
33 | 2->8:20
34 | 1->10:31
35 | 5->18:28
36 | 0->21:33
37 | 20->21:6
38 | 5->13:17
39 | 5->17:23
40 | 5->15:3
41 | 3->14:11
42 | 14->15:2
43 | 5->9:33
44 | 14->17:7
45 | 1->3:8
46 | 13->16:30
47 | 13->15:15
48 | 1->6:10
49 | 1->4:13
50 | 3->20:32
51 | 3->21:15
52 | 18->23:13
53 | 18->22:10
54 | 4->5:0
55 | 6->20:33
56 | 5->6:38
57 | 0->1:7
58 | 0->6:19
59 | 12->17:23
60 | 12->15:10
61 | 7->15:35
62 | 13->22:4
63 | 13->20:33
64 | 5->8:13
65 | 11->12:33
66 | 11->15:6
67 | 11->14:0
68 | 11->16:0
69 | 11->19:20
70 | 11->18:39
71 | 3->12:9
72 | 3->15:37
73 | 7->21:21
74 | 3->16:19
75 | 9->20:12
76 | 7->8:31
77 | 10->20:5
78 | 10->23:22
79 | 13->18:9
80 | 3->9:0
81 | 14->18:22
82 | 14->19:23
83 | 3->5:28
84 | 16->18:31
85 | 5->23:36
86 | 9->23:31
87 | 8->18:25
88 | 8->13:35
89 | 2->14:35
90 | 7->17:4
91 | 7->16:18
92 | 2->11:13
93 | 7->18:6
94 | 


--------------------------------------------------------------------------------
/data/stepic_6e.txt:
--------------------------------------------------------------------------------
1 | IWWRDFMAEFMWQNSGSRAMCFFNRIVCWNARALNYKNCSLQVKERLAAKRCYATHPEAITDGGIFAECEQTNYDFREHKSIFMFCPTWYGEQNHEVLGRHDHCHEMCHTCKCFRWRCHSSAKKLGCFPWYIPKFHYIKPMVVYHHMLTIHYKIPSQNKSDDALIGNAKIVMHLTGCQYNNYPTYSFEMPPCDTWPDAQVVKVTMFPIFIAFQSKTWWSMILSSSSGYLPYLMNCPNNLVWQVPLRCCYCCGNQNLEKPMQHSCGKPAAPDIRSRSQDFHWNRLQRVQEDIVSIEFKWSEELIEQWTFWVFYNLAHMGYAPDGYVTEFIWHGTYCIKFYKLGKQRQWKSQQHYCTMWAVRRVRWRIVVHYETKRRENYWIKFPPMCGERTWHRYCREKRVEGLVNFEIHQWQNQPLFKYMFHRCKMMINCREDGPSHTSQKNQHNLHFDMRNYSTQGMYDFTNITAKLPYISCTYHKMLWPMPARQMTHVGRESYKWSHCRMRGNTSWPQYSEHRFYFPPSMQWWMTIKNMLCATNRKKEVFNHDSVDKFRDVSPHMPYDIIQEQWGGPFMYDTMEFEVLTQMDIFYMYDAYYSRCCSMFKTFALWNRSKYMQMDWNNNNQTIPIAKWASHMYWQELLDVTMRACCDKYWSTYIMIKANYVEPRMGVLANVGPRHQHWAHCKHNVAIWQSFTHRMAGSMAKEEPWFACLYLIREVIICGETCDLLGCVQMQWKDWCRGQGKKYGMVMGKMIEALKFFFTLFVGVVWQQSCQEEQYGDMVEEYQIVMWHDSWLTILQYKQIMRQWGKQPMRSMYPAPFERYLWCCFNFEWDVQIVDYVLIRTFCPLVNQMWIETCNIQLQASDHNHCCDCCQATMLMDTHHPHDYNKDDQEPCMCDHCEV
2 | IWWMWQNSGSRAMCFFNSDGDIIVCWNAGAWMMSGTSVGNKRCYATHPEHEMFATDGGIFAECEQTNYDFREHKSIFMFCPWMVVNWYGEQNHEVLGRHDHCHEMCMCKCFRWCCHSSAKKLGCFPPYIPIFHYIKPHHMLMDFGNIHYMIPSDLIGNAHIVMQYPTMPPCTTQNMIMPDAQVVKVTMFPIFIGPIQAEYVFQGNLPYMNHRFQVEGPNNLVWQVTTVELGQYCCNWNQNLIKPMQHSCGKPAAPDIRSRSLQRVQFAIVIIEFKQSEELEGMAPDGYVTDTQIFWSEFIWHGTYCIKFGSQQHDCTMWAQRRVRWRIVVMRYPYETKARENYWIKFPPICGERTGWWHRSCREKRGLYTWQFEIHGMFHSPASSAKVNCKMMINCGDIAPCLREDGPSHTSQKNPHNLHFCMRNFTNITMHKLMVMRLDYISCWYRHMLHVGRENYKWSKCRMRGLTSLEEQNPQYSEHFLLFLMQQLLSDIYFPPSGMHTIKNMLCATNRKKAVFNHVAMEWDCSVDKDRCVAHDGGAPCKPYDIIQEMYDTMVFEVWMDIFYMYDAYYSRCCSFHQFVTRSKCAWKWASHMYWQELWTAIYEKGNTPANKEPRMGVLANVGPRHQHWGHCKHNVAQSFTHRMAGSMAKEEFDTWSFETVDLLGCVQMFWKDWCRGQGKKYGMVMGGSYINMLEALKWPYYWIQQMKFTLFVGVFLIRFDRWQQSFQEEVYGDMVEEYQIVMWHDSWLTILQYKQIMRQWGKSMYPAYQNLEMYTWEEMVRSAEDCFNFEWDYPSWPMSERTFCPLDTAVNHMWIETHGNIQLQASDGNHCCDCCQATMLMDTHHPHDYNKDDQEPCMCDHCEV
3 | 


--------------------------------------------------------------------------------
/data/stepic_6f.txt:
--------------------------------------------------------------------------------
1 | HDDVNYHHWLSRTYMHLWYFNFYTRPMTMSDKFAPAFNRYCNDVLYMRWNQICNPPHFFKFNGRCRTMTKSDWTGAVVQMSQTMMDKVTTPKFATVVCFGGQLGKIWEPENWVIIKPVDLTTRCPWCLRNEDIDTWCLAELKMIETGIGSPLAICRSSGVPCLYQWNLMALIPDMPPRGRGMGEKTPTGMNVLAHGHFQHCYYDYNRQSTGLVFPRTFEWSNRFHHTVQQDMQFLWICKVDADDETERLVYGGKPCPRKDIEYVDPMQNQASLYEWVWEFGDLEATTRYPPITCMKAEIFSIAGLWFSSSRARQYDHDYNMGTFTSIQSQGACKICAARTHWCCGTGTPFKLASDTLFQRKAAVMGFETCKIYFAPWRNIQMATVGSYEIHHFYTYNAESYPQCGTHRTGTTEWSDNPSGNFIVLTYCHPENYFMSYYDLKGAMASECICKETKEPHSLWIHYANNFHFPQCCLVNHSRVGQCERKCVKNPLIFWREVMKGTHPGCQNLCMGVVGQVTETRCHFNNFHKETRSPKALNQPMKPPAEYSRKTVCDIASILIVKLSPFEQFEMQSPQQCHTVHASWKGQVGWMRWWGMMIQGIKPQTHAPYEHFSDFTHEPATARDELCETRWIEYVRKALFTCHEGRHQYHWLTHPVVQMQQGFCESNDQHPYDFLFKHGFDQHWHWMIGLCPVPRGLQPWTMRAKYMPMLPQFLTNAKGDYTHGFSFIDHCQMICMMCMQTDMQQQQASTHSPQQDHYSNTKMFKASCMGQEPQEMCWNRVAGIRWPEFDAPDWADQFPKFVGTPKCNLDALYYQDGQHKDEREFVDWAQFQQGTCAEYWVPAKFEHWLDEDGQAKSKFFTQYMYENQVFTPMERWNVKWCSGHWAHRAEQDRDTKMHSI
2 | FGPEFAHQWAQGYDYEHCCSPTQENNDYFIRCAALIIEMVFYDKTFSRIDMNRTDEHHDTPWHFPHKCNYQFPERPIPDTIDNTVNENWFWPDMNNGKCSSTHPGVLEGEKDMVHLVDNGTNNLGSFWVFCDTNWYYWHAWPNTPNGLPNPTMDLRMNLKPCAQPPAPESHHQETCFCYHCSYVTEIQEPDVHGHYVSKSTKDGEIWCKVQCWTDLACWFACEKMNKLYLIWYGPRDFITIDPCYPDECQNVKERTDKPEWNSPAWNNNHPFEAGITGSFWNHATCDFGDGGSSMFHDYNMGTFTDNDFGVDIQQWACKIGALLTHWCMGTGKLASDTSFQRKAAVDGEVGTIYFAPWRNNQMATERYGSYEIHHWYTYNAESYPNTTVMKEVYEWCVMAHRTGTTESAHLHSDNPSGLFIVRTYEKACACHYYDLKGAMASECICKETHRFDQMEPSSLWIHYAYDFQTNDFPQCCLVGHSRVGQCERKCVKNPLIFNSVMKGTHPGCQNLCMGTVGQCPETRCHFNNFHKETWSCMDGQTSAKALNGPMKPPAEYSRKTICDIASILIIKLSPFEQFEMQVHASWPGQVCFIDMYVMMIQIKPQTFSDAMWIEYVRKALFTKHEGRHCRYGRYMPYHWLTCWNDVVQMQYGFCENWMLTWVHMKYKENHPCVALHDKPHQHSHMKTVIFKEKMAMMNPMDQELRYTSQWYKRARCDQQVKRRFIKNMAWLHLDTQEIIPTVRCFKNVVPLYFWTFFCFFVAASFFKIFKCWTPWTYQWPVTCSWMNYDGRILPKGHAQAKEKQSTSDWETFKRKQPWYDQSYNQYEAWPRCNFGWPWASHRTAHFEKHTSVFIYMPPACCSLMMPKPHCCKFPKMQGQCEHMFVRPAFHQWWNKKSLELPKEEEDKMHAIVYANYKYPQIRWNYQLYIVMCYS
3 | 


--------------------------------------------------------------------------------
/data/stepic_7a.txt:
--------------------------------------------------------------------------------
1 | MWCPGWPAVANTRMCSLIVPYGMHGMQSMLRMTPTVWHKNGCMHWETTTAAWAMVWSMRKFWLCWFSFKSQLLREMCFLQHMKPIARYQKNNVMHIIIPISSWEWLTWAFYGGSCSMMGIDSRGMNPEIIILRTPSTIVPGNSWITCFYKQSCSKFWMMQGGGFARRMGVDVHEDSMLTNMIKKMKFGIQFSEFFEGLHLDFNVDDRRTIWVLSICWQIFHVAMWKFIFFPNHEIDDELTCQVTLTFEQRVRTSDTQPKMWLVHDAMRGDNRAMKSGNEAPNKHKQSMYQQETRDRDAWMVHPNYRTDQHSFLQFRAPKVDQMLGWINQIYDKSEFCGKVVEPCLMKHDLPDRWFSPPECYVKKDMIFSICQLERHLQRDYTEGHDNDRPPPSRAQPCYNPQAICNASEDRPTKINPCGFGDQDKTNFVTGITFDETVKGHTGRMWGRQAWLISHKQSGERLFCQQEWFWLQMSMVDAFQGICRMKQFKNQVCDFIAEETALTIWQRRDSLCTLADPLNMDLLRTNMPRKWQCSVGTKGHCFSYPHCSMGWDHEHMYPQAIGLDRPVMHTPFGDCIMRFVACIYHTSSYPDNTSRQLLDQVSGQANFDSDRVPVEWGYDNGLDLTQFNACFKGFFLWKFLRENCTCIWREPKLQNWVFFYRFDAALHAWIADYYSMYCNCLEFRQEIGCFVFDCSMDREYEMYKMFAICNYERDCCMGKQYNYCGNGGWNDDEFTCHKISNKFNDELRRADKNPSFWKSFLEACALYSMVLICFWNERECMRMINWLQLPECNGPSDTCDWNSYYENYTFAKTFDEDPS
2 | MWCWWGFSLPNGWPAKRMIEANTRVPYGMHGMQSKEAMVHKNLCMHWETTTAAWAMVWSMRKFWLCWKSQLLREMCFLWHMKIIARYQNNVMHIIRYLIPISSFDEAANKQMRDPYSCSMMGIDSRGMNLEIIILRTPSTIVPWNSWITCFGKQSCSKFWMMCLDAPDGMAKDWCYGVSMLTNMIYKQASDAQSNADFRVDDRTHWMIFHVQEEGLTYGWKFCHGFDRYCNIIMDTDEITLTFEQRVRTSDTCHRIWWHQRGDNRARKSGNEAPNKHKQSMYQQETRDRDAWMVHPYRTDQHSFLQFRAPKVDQMFFWINQIEFCVWICLMKWFSPPECYVMLKLASSCGQLERHRDYTEGVDNDRPYIANCTMQNPSRAQPVYASEDGCGFGDQDKTNFVTGITDDETCKAQRFQAWLTSHKVSGERLFCQCNFSNQEWFWLQMSMVDAFQGICRTKQFKQQVCDFWYHRRDSLCTLADPLNPFETTKDGRVKVGAHKIACSVGTDGHCFSYPHCSMGVDHEHMYPQAIGLDRPVMHTPFGEAVARGEACIMLDNFVLCEYRSSDYPDNMSRQLLDQVSGQHLWNMWFDSCRVDVRDVGMWGYDAYMTGLDLTDWTKGIDNQQKLWCFAGNFLWKFLRENCTLNMQIWREAERAALQNFFYRWCSMTKIYYSMYCNCLEFRQEIGFDCSMDYEYEMYKMFAICCMGNYCGNGGDWDDEFKCCDNFNDELMGPTFYMDYSFWKSFLEACALYAAKHFEDMVLICFWNERMADRTALVNWLQLTMDITAEGDWCDWVSYYENYTFAKTFDEDPD
3 | 


--------------------------------------------------------------------------------
/data/stepic_7b.txt:
--------------------------------------------------------------------------------
1 | GAGAGTTCGCGAACTCACCTTGATGGGCAGGTGTATTGACGGCTCCGATCACACAGGCTGTGGCCAGCCCGTAATTGCCCCCTAGATGGTTGGGTAGCAGCGCTTCTGAACTACCGTCATACATTGCCTTTATGTAACTTACACGAACGAAAACCATCCGACGATGCGGTAACGATATCACCGTGTTCACACCAGATGCGCGCTTAAGTACATGAGGGGGTTACAATATATCTTAGCAACCAGTCATAATAATTCGCCCTAAGATTCGGTTAGGATAATACGGATCAGCAGATGTACGTTTGGGGTACGGTTGTTATCAGAATTGGACCTTTACCTTTAGCGTGATTCGCCCCGTAGCGTAAGGCAATCTCGGGGATCAGAACCTTGGTGGGACTTCCAGACCGTTAGACCATACAGAAGAAATCCCGTGGCGATGCACTTCTGATTACCACTGAGTTGGTCAGGGGAGCCTTAGCCGCTTCCCTCGAGCACACATTAGAGATTCCTCCCTAGGGGTCCTGATAGAGTATGGAGGGGGAGGCAGGATATACTCCCAACCTCGAGTTTAAAATGTGCCGCGTTCATCGATACCGCTCCTCTTACTAATACTAATTGGGGAAGTAGGCTTTAGGTTGTATACTCGCACTGCACGCGGTGCGAACCTACCCCTGGTCAGTACCACTGGGGCAATCGAGTATGCAATGCTGGCCTCAATATCGTGCGGTCAATTCGGTCCAGATACCCGATTCGCTAGGGGGTGTTAAGACTCATCCTGAATTACTTTTATTGCACTAGCCGGTGCTAGTCTAATACTGCCTTTCGGGTCTCCACCTTTGCTAGGTGAGCCGGCAGTGAAAAACTGACAGCCTTCTAGCGCCCCCTAACGGACAAATTATGCACTGAAGGTGCATCAATCAGTTGTGCTTCACATCTTCCCTCGAGTCGTAATTTGACAGCTAGCT
2 | TCTCAATTGGGCTCTCTTATTTTCCTTATCGGGACATCCTAGATTCAGGCTCTGGTGTGCCAGGTACCCATTGAGCTGGT
3 | 


--------------------------------------------------------------------------------
/data/stepic_7c.txt:
--------------------------------------------------------------------------------
1 | ACCTGTGCACTGTGACACAGGTCTCCGTAGGCTCTCACTTACCACAGTGAACAACTTAACCTTAATCGTCGAATGTCATTCCAAGGATGTCGGGTAAACTCGAGCTCACGAACAACGGTGACAATTGTCGGAAGGCAGTAAACAGCCCGATGTCGCAGTTGGTATACTGCCTCAGTAGAAGGCCATTCGTCCGTTTTGCTGCGCGGGTCTTGAGCTGACGGCATAACTAATCGTCGTCTGGAAGTTTTCTCGTTCATCTCCTGTCCAACCGACTAGACATAAAGTGAAGATACTTGGGGAGAAACGCACGATTCCACGCGGCGGCCGGCAAATTACACGTAAGGCTCCTACCCGTTAGGGAAGTGACCTTAATATCGCGATTGACACGATCCGACTGATGAAACCCTTAGTTGCAGTATCTGGATAGACGTTCTAAGGCAAACCCTGTACCATACGGGACGACATACCTGCGACGCATAAGACCCGATCTGAAACGTGTGCGTGGTCTTGTCTGGGGAGAACCTGTGAATTAATTAGGTATTCGGTGTCTTGCCTCGCTCCAAAATGCACTCTGGTGTTACGCATGGATAATGCGCTTATTCCTAGCATAATTTATAAAGGATATGGGCTACGAACAACCGCGACCCATGCGGGATTGAAGCTCAACTACATCCCCATAATGTTACCGGACGGAAACAGGGAAAACGAAGATGGACATTGACGGTATCTCCATGGGCTAAGAATCGATGCCAGGGAGACTCGATCTCTACTCCACATCGTATTAAGCCTAAATGTATAATCAGGTTACACAGCGGAGCGCAATAACCCGTGGCTGCCAAGGCAGTCAGTGAGTCTAGGAATGTCAGATCGAACTATTATAAGGATGGTCCGGCGTGGTCGCGCTATCGGCCACCATCTTATTTGTCCACCTTGC
2 | AAAATCGAAGATGGCATCGACGTGTACTCCGTGAGCTAGGTATCGCCGCCAGGGGAACTCGATCCTCTACTTTTAAGATCATATTACAGCCTAAAATTTATAATCGGGTTACACAGCGTGAGAATAACTCGTTGCTGCTAAGGCACCAGTATTCGGTATCATTAAAATGAATAACTCATAAGGGATGGTCCGACGCTGGTCGCGCCACCGGCACCATCCTTCACCCCTTTACATGCGATGCGTGGGTGGGTAACCGCGCAAATGACACGTACTGCTTTCTGGTATTGCTAGCGGCCAAGATTTACCGTAGGTTTGCTGTATCCGCTACTGTAAATTGTCCGATCCAGAAGTGGGGCATCATACCGAGGTAATAAGGGTCATCTACCGGTCGAGCTGTTTGAAACACCGGACGGCTACTGACTGGCACCGTTTAGTGTCTCAAGCTTACATTTCTGAGGAAAAAAACGAACGGTATCCGTAAGGGACACAATGAAAATCGCTCCTACCTAATCATGATCAGTCTTCGCGCGTACGCAGTATAGCTGAAATCCTACCTCTGTACACTACAGAACATGGATGTAGACACTTGACCACACGTAAGACCGCTCCGTAAGATATAAAACTACATCTCGTACCGACAACTAAAAGCCCAGCAATTGTCAGGGCATTCCGAAGGGCCCAATTATCCGATCTCACGAGGCCCTTAATCGGTGACGCGCGTGGTAAAATGCTAGCGATATAGTACGTTGCAAAAGTTTCTTACTCCGGTGTCCGTGTCCTCCAGCGTGTTTGGAGGATCCTGCACTGACTCGGGGAGCAAGAGGTCCGTAGAGACGGGGTTACAGTGAGAGGC
3 | 


--------------------------------------------------------------------------------
/data/stepic_7d.txt:
--------------------------------------------------------------------------------
1 | YETDVPSCFQRPQAHRQSSTPMRKGIMYEREKHSSGFPNDWWADLCTMTYDDCCDWCECCFCSNYEAGIQMIC
2 | YETDVPSCFQWIQWHPMWSTPMRKGIMYEREKHSSGFPNDWWAPQFVTLYYHLDDGSMDWCECCFCSCYEAGIKMTRQMIC
3 | 


--------------------------------------------------------------------------------
/data/stepic_7e.txt:
--------------------------------------------------------------------------------
1 | DVRGTAASLQLWRDGDLHFSVIGPSYLKCRISGAKQQMIKTRKNWTDGLTCWMHTHECEENTAEVMYHHLYYMRNMLMYMSFFWWQDFLSQNQFMNWSGVANEVATLELSNQFNLTWRAVHCLVWPCANYMCDGQVLHMIELPETHQAWPCSKHVDENLTGANMTHSWHEIMLVFMAWIYARRQYWVWGFVAHTAFREKMRQMPCPMWCHIAGWGITGINTDLDIVRGHKVPNCSVPDNNECIMGTYIKQWEVSCRNNVIVGHNLDYNHGYQHQQQRSFNMDVYPNLTRNVMFPMEPHRGALKNSWKQWGQSIHATMMEHYSRVDQTYQAVQREQTSGDLQINHREMMNDKCYAKCCRNNGIMELFMMLEQLPFPEAEEHQDRWMMARFTLYNPDVGMSDHTQSHAMNYPGGWTILWPACNHKFTWVPVCQTPGAFASTHKYDTRSHMIVKWDTIHEDKNRKDHRLTHSTRTRLKRRMYVNQEFQNGYLQDKCEEDYQKIEWTTSWRTVLIQTHVDKMHMIVPGCRPICKSEHLYYYFDCIMCNHAMRRGREVDWDGRLTHDYNSAVKTSTQCIMRRSWPRDQVGLQDPGKNNLWTLNMVKVINMYMCNCYGLNWGWLCKSWHYEFSMWDHCMVWGTQMDDHLFLPFHPATGCYDFTEIGTQNKRYLRWKAIIMNDCADALALCWPWCYNYLGRGMAKHLCQDWTDYNVNPQYFITPVKWNCKVRHVMLIPRLCIRRKVTKWQPQQCQVSPWSPMTVSTRVCWRAGKCQKIFIRRMLKTNCDVYMDRQQRENAKCWCVKPGDQSVVSNHHPMGWPRDEHMRCAHGDKGHIMILSLEQNDAMMGKRSQANDELTQKKTVEWDTTIFQWKWDMSRACSWRWHPYLNIAYCVICHIDENCTWSHHPAEYDKACMNMLMNNVGCRMKLHAQFLGVNSSCVCAWLHQYGGEQRLAQQIDFQLFKIPTHALWSIFTKKDHRKHRPHCGGCCIANYIEFTKSELNGYTHRAHSTQKYQKQLKNAFIHLSAIFAMPTCETVGL
2 | DELGSYEVAACWKSAVGSVFQCKFTNSWSDKGGSAFDRPYTQKPQNDSDCHYNYHPKEMPISIHIYCIPHHNSDFFWDVFFSKYNPDLRAERMARRIAIWIVWMSREQCADGQVHPTINQARCRVRGHLRYCVRFWFRQVPQAVMLLPCQGKPYKDQHHPKFQYIHQYWLCDEMNNTIVPLSHNFCGECRILLQLESWICKFNCLDLTSPCDHLLITEMFCQAWFVMLHAQNFARTWMLHVMNRYKPFNNLMENNMIHNPSCLAWHFKWPRRDNLDCYDVIESQFDCSKNNYKLFQYGFAGGDAITNQAWNKLYCAGGVINRGMFFMWDLCCMMVGPTQCDTVDKVVTCLGTELGEMQAELFYESVCISVISQCGHFLYDWFLIAPVRKGMTQPHPMYWCSTHMAFWISCRCGEADGYNRCILSFFCSYLPKMTSIAAALMVKNYDSERGKWDTIHEDKNRKYHRATHTTIFTFSVTRTRMKCRMYVNQEEQKCEEDYQKIEWTTTWATVLPQTHDAPRYTDKMHMTEYKTFGPGCRPAFMNCQSEHLYYYFDGFPYEVYHCNHAMRRGREVDWDGRLTHQYEYRSAVHNQLRLTRDQVGAYYCEERAQATQHSMFTEVMRIMCIAYMGCDVITSRFYGHDTKRKAFDFLSLMKYYSAIYGHKNGFRIIQIARDFWANIWRQPPGKYDWLAGQATEQRGFSTGRTLIAKCCTQYDCAWHHYKMEEAGDWWPKWFRVTGDYFYVWYPAQYMITKCDAVCGQHPVVDIISCQNEWALRFDPHHNHSTNGYDGQYMLMFSAMMFYGAICFNAQHVMKRIPGYHENARIYVWKFTLQHHLNCDQERTHQRMWYFCHNPCHTRTNFCETKMFYLKWHGSPLMDNQGRIIRMHVYMVLSYQMVEQYCQAVGIRLKWIFHCKDIFLIVENRQFVAAWRCMCWYMRPPTEFLYGVGLKVCHQPPIIASWCFCRLRFWECWNSHRHDDFWHGKEQFLQIYEYEYSAIDMMPCNMQLQNRNCRPQVRHHFTPKVMPICTCYLANLSRIVWGWQECQYVFPL
3 | 


--------------------------------------------------------------------------------
/data/stepic_7g.txt:
--------------------------------------------------------------------------------
1 | CTCATGTCG
2 | TTGGTGGCCT
3 | ATGATAAAC
4 | 


--------------------------------------------------------------------------------
/data/stepic_8a.txt:
--------------------------------------------------------------------------------
1 | (-42 -247 -285 -32 +44 -251 +269 +53 -156 -15 -151 +312 +321 +134 +370 +43 -50 -96 -4 +320 +36 -167 -252 +325 +280 -218 -215 -159 +257 -150 -226 +240 -124 -229 -338 -307 +367 +153 +270 -2 +212 -258 +379 +209 -57 -264 -208 -273 +100 +89 -396 -221 +319 -133 -222 +200 -350 -267 -121 +392 -259 -293 +31 -5 -277 +138 -326 -281 -163 -108 +117 +380 -394 +79 +172 +189 -171 -73 -292 +180 -114 +81 -238 -190 -72 +389 -67 -196 +130 +168 +181 -223 +228 +262 -174 -231 -369 -93 +331 +38 +56 +302 -219 -112 -400 +237 -20 +256 +162 -271 +358 +166 +91 +207 +364 +248 -12 -272 +35 -398 +88 +353 -283 -145 -18 +146 -135 -356 +158 +317 +6 -282 +104 -214 -199 +274 +120 +101 +80 -60 +69 -11 -136 +268 +25 +232 +241 +129 +191 +78 +288 +51 -366 -10 +110 +349 -40 -377 +233 +315 +182 +22 -63 -362 -375 -59 -352 +245 +333 -47 +310 -105 -3 -90 -14 +393 +249 +286 -103 -83 +46 -34 +204 -74 -384 +71 -260 -55 +294 -344 -175 +220 +118 -378 +39 -155 +244 +122 -8 +289 +246 +165 +210 +123 +250 -391 +299 +48 +341 +303 +261 +340 -217 -202 -314 +328 -235 -147 +144 +92 -99 +297 -16 +243 -52 +336 +143 +9 -291 +141 -345 +311 +161 -115 +254 -62 +323 -335 +361 +387 +304 -137 -351 -21 -278 +192 -154 +327 +382 -372 -383 +186 +198 -65 -339 +225 -371 +178 +354 -29 -313 -255 -177 -305 +119 -86 -401 +295 -179 -266 +106 +395 -324 -388 -276 -385 -113 +206 +236 -98 +348 +66 -279 +359 -107 -230 +131 -169 +28 +365 +176 +164 -381 +342 +126 +27 +54 +173 -287 +85 +77 +125 +227 +160 -184 -185 +82 +239 +402 -205 +58 +334 +376 +197 +149 -203 -64 -97 +298 +148 -213 -242 -33 +132 +296 +7 -76 +87 +194 +111 +94 -322 +170 -374 +26 +263 -19 -195 +157 +127 -75 -337 +357 +13 +373 -301 -363 -109 +347 -211 +49 +95 +140 -284 +128 -346 +116 +68 +290 +253 -265 +368 +390 +309 +329 -360 +318 -234 -142 -24 -330 +332 -61 -152 +23 -397 +84 -386 -275 -343 -308 -316 +193 -224 -306 -399 +30 -201 -37 -1 +45 -188 +139 -70 -187 -183 -216 -17 +41 -102 -355 -300)
2 | 


--------------------------------------------------------------------------------
/data/stepic_9d.txt:
--------------------------------------------------------------------------------
1 | TGCATAGAGAGTGCCCGATGCGATGGTTGGTTGAGACCTGCCATTTGTGGAGATGAGAACAACTTGCTGCCCCACCTTGTCAACTGGATTTTGCATGGTTTTGGGTCACATTCTCCCATGCTATCACACACTCAATCAGGGGTCTTTATTCTAACTCAGACGTATATGTAGAGCTCTCTGTTGTGACACATTACTATTTTTGGGTTTTGCTCATTGGTTAGTCACAATCCCCTGTGGATTTCAATAAGGCCTCATTCCATGTAGGTGGAACAGTAAGACCGCGATTCAAACGTCAGTTAGCATCGGAGAGATAGCATCTTGGGCATGGAAACGGATCCCAGGACTGACTAGCGACTCTCTCTGTCTGTTTATATACAATGAAAACACTGTCGCGAGACTCCTGTTTACAAGTTCCTAGTTATGGCTCAGATGGCGTCGTATGCGAAAAACCGTGGAAGAAGACAAGGCTCACAATATCCATAAGTGGCGGTTCCTTGAGGGTGGCCCAAGGAAGGAAATGTGCACTCGCGGAAACGGGGGTACGTGTCCCACATGTCTCGTTGCCACATGGCTGGCTCTGAGGTTGACCTTAGTCGGAATCGTAGAGTTGCTACCACCGTGGGCCCGAAAGTCCTGGAAAAATCAGTGCATAGACGGGGCCTTCGATGTGTCCTGCGTTTTTACATATTGTATGAAGTTCACAACCGTTCGTAGTGTCTACAGCTCCTAAGTTATTTCCTGCGTATAGTCAGAGGTCGTCGTAAGCTCCTGACGTAGACGTTTATCAACCGTTAATAGTATTTTTACGCCTAGGCCATCGTGCCAGATCCAAGCGCACCGCGTTGCGGATCTTGAGACACTCAATGCGCGAAGCGGTGCTATAAACTACGTCCTCGCTCATTTGATCATACAACGACATTCACGGGATGAAAACAGATATTCGTACTCCTCAAAAGGCTGTGTGCGTAAACGAACAATACGAGGAAGAAACTTGTACCCTCGGCGGTACAACTCTAGTGCGCCGAAGTACGTCTAGGTTACGATAAGTTGCTTGAGTAGAGAGTTAGCGCTCTCACCCACGGGGTGGGGCGCTACCCTGTCCTTGCAGCCTAAGCCAGTTCTGTCGCACTGTTTGCTCACGCTAGGGGACATTTCCTGGTCTGAATCGCGCGCATCTACATTAGTCGAATGTGGGGGCCTGGTGGACAACAGTTATCGCAGGACGGTAGAGACAGCGCATTCCACAGGCGTAGTGTCTAGTA$
2 | 


--------------------------------------------------------------------------------
/output/Assignment_01A.txt:
--------------------------------------------------------------------------------
1 | GTGGATATTCC GTTCGCAGGTT GCGTTCGCAGG GATATTCCGCG GATAGTGGATA CGCGTTCGCAG GGATATTCCGC TCCGCGTTCGC TTCGCAGGTTC TATTCCGCGTT ATATTCCGCGT CCGCGTTCGCA CGTTCGCAGGT GGATAGTGGAT TTCCGCGTTCG TGGATATTCCG GTGGATAGTGG TGGATAGTGGA


--------------------------------------------------------------------------------
/output/Assignment_01C.txt:
--------------------------------------------------------------------------------
1 | 21 36 51 77 107 163 192 213 220 269 276 292 344 376 383 394 409 430 458 534 557 564 572 631 682 725 734 741 748 755 762 769 776 867 874 920 935 942 961 968 1009 1025 1032 1102 1129 1161 1168 1212 1227 1269 1292 1329 1348 1365 1384 1468 1517 1542 1558 1658 1665 1672 1701 1751 1758 1808 1815 1832 1839 1855 1865 1902 1954 2008 2060 2084 2112 2120 2138 2197 2226 2233 2263 2335 2362 2379 2416 2431 2438 2510 2517 2527 2583 2591 2599 2639 2738 2755 2771 2794 2874 3111 3171 3241 3248 3265 3319 3326 3357 3436 3443 3450 3549 3556 3592 3681 3705 3714 3740 3756 3763 3770 3845 3852 3925 3981 3997 4023 4040 4047 4067 4089 4099 4115 4216 4223 4266 4273 4290 4324 4360 4481 4488 4499 4552 4559 4617 4636 4884 4892 4908 4915 4923 4930 4948 4966 4983 4990 5049 5056 5072 5179 5186 5223 5230 5264 5366 5399 5430 5452 5459 5466 5473 5491 5528 5548 5612 5741 5772 5787 5794 5801 5871 5924 5952 5959 5988 6047 6055 6170 6186 6193 6200 6232 6275 6290 6410 6418 6425 6459 6488 6524 6531 6547 6638 6735 6775 6800 6878 6894 6916 6939 6954 7000 7040 7087 7102 7111 7118 7138 7178 7193 7248 7304 7311 7351 7417 7425 7432 7465 7497 7554 7571 7579 7600 7608 7615 7623 7721 7728 7763 7792 7822 7885 7960 7967 7974 7981 7988 8114 8169 8176 8211 8218 8225 8300 8315 8335 8364 8371 8437 8462 8515 8522 8538 8590 8644 8651 8658 8688 8696 8703 8734 8750 8767 8804 8871 8878 8963 8970 8977 9020 9035 9114 9121 9172 9184 9279 9318 9406 9431 9522 9562 9569 9599 9614 9642 9657


--------------------------------------------------------------------------------
/output/Assignment_01D.txt:
--------------------------------------------------------------------------------
1 | CCTTTGAGC CCCCGATGT CGAGGTGAG TCATATGAC TTTGGCCCG TAGCCGGCC


--------------------------------------------------------------------------------
/output/Assignment_01E.txt:
--------------------------------------------------------------------------------
1 | 12 13


--------------------------------------------------------------------------------
/output/Assignment_01F.txt:
--------------------------------------------------------------------------------
1 | 566 659 1387 1949 2098 3069 3650 3961 4343 4424 5220 5854 6258 6259 6581 6924 7076 7285 7532 7686 7702 7843 8849 9200 9516 9732 9884 9940 10197 10517 10526 11009 11041 11802 11999 12823 12947 13313 13748 14110 14284 15180 16065 16279 16800 16894


--------------------------------------------------------------------------------
/output/Assignment_01G.txt:
--------------------------------------------------------------------------------
1 | ACCATTCAAC TTATTCAACT CCATTCAACT AACATTCAAC GATTCAACTA GAATTCAACT ACTATTCAAC ATTCAACTGT ATTCAACTGA ATTCAACTGC ATTCAACTGG GATTCAACTT GATTCAACTC GATTCAACTG ACAATTCAAC GCATTCAACT TTCAACTAAA CATTCAACTT CATTCAACTG CATTCAACTC CATTCAACTA AACCATTCAA TCATTCAACT ACGATTCAAC ATTCAACTAT GACTATTCAA ATTCAACTAC ATTCAACTAA ATTCAACTAG TTCAACTATC ATCATTCAAC ACATTCAACT AAGATTGAAC TATTCAACTA AGATTCAACT AGCATTCAAC GGATTCAACT TATTCAACTC TATTCAACTT ATTCAACTTT ATTCAACTTG ATTCAACTTA ATTCAACTTC TATTCAACTG ATTCAACTCG ATTCAACTCA ATTCAACTCC ATTCAACTCT TGATTCAACT CGATTCAACT TTCAACTGAT AAATTCAACT AATTCAACTT AATTCAACTG AATTCAACTA AATTCAACTC GTATTCAACT CAATTCAACT CTATTCAACT ATATTCAACT TAATTCAACT


--------------------------------------------------------------------------------
/output/Assignment_01H.txt:
--------------------------------------------------------------------------------
1 | ATTCTGCTG CAGCAGAAT


--------------------------------------------------------------------------------
/output/Assignment_02A.txt:
--------------------------------------------------------------------------------
1 | MSSSATGPLFLPVIPNPQIKGCVDFILLGPPHSHIIRSPGVLSPRPVVRTLSTLCYLQRPCVISESSVGELRLRRHVYIQLVNCARYLFVHSLIRDPAKLTCLPRGSVTSLSRYSSTTTILTSRSRHNFHSPVCQLLSTFASVEGTHVIMFFTLPGTLSLTQGRLNRRRLSPVPTELALSSLFPVGGLRRASLNLRQPRHRAWRNHCHQALRYIIQAVPAESQPRYKPFVGRVLCRSTKLLGADVKKGKTLWSTEKQYKFEMKLSTEISNRHPYSSRPSGLLEDTWRLHLSHIAWFSCYVVGRGGVLHYGLAQSRRNPKSFFCGLLLGVWVPNSSESKNSSGGCNCPPGLVGNDHAKCLICQPPQMPLLFAGAYYVDKSSAFLSCGLSERHIQSILDAREAELPTERPAANGPQSTPERRIGAGCIGGSHAKAFDPSVETATCDARVSKIVCRSLKGTNRVLATCSLRSDSETTLVLTISVSPTGRQLGERKTKQPLNQVFAGFVHSRGHAVYPIRRMFLILLVGVYEIVNVCGVEHAGPRRYFQMRVISIPWVYRCQPLWSATTFNRSEPLLILQLNSRHQEETSRSPDRGKPIEGAGSDQRDYLEVGEADKSTTGPIEQGPIFYSFNDDLHTGLHRKDFYRWYPDVYCRQKNTGTTGAPTYKRSPLHRIAESHWQACTYSSIRLLLQAGSVGPNLSFGPVTNTAVSQEASFQCLTFLYPRKSTGIDPIGAGPKQNTRSRQRSDTSRVRGHGDYLAPVSSRRGMFRQSRICRAGARADLGNEYGILVRDRQYTANPIHTREVGNLSRRLLTLYRQSIDVEVECESIRSSPARQERAINRWFLWVLLDMQTSYRPFNVNGRRGDLRLPGFVHRNLYPWLNIDNSKHRRYTVSFHYRTSSQPNGNGYAIVLHNGVNYLCSICIAIFDNTVSITTVVTPKPPGGTGAYVIGPRMDVRTTVRRLLCSKKTCARSAIKSFNELTYSEASSIEPGAKIAPSSLSCRIASITRPLIADADTYNGLAKVRQLICCRVFLKRRVVRTTQVQRSGTIIRISPHPLITPFMSMVPSSKTGCYEALLVSGRGPDTPAHHKKLGVRIALLVSVLSKRSCVPQDLTEPNAIPPRMPRSEDLGVGNVRPWMEDHTYDTDRSPVIQSDMNEEPHTNPRTLGAAPVPLSYSGSRYIFRRLKWDLPDRVFSNPARRFQRKVNSPVLGGGLGEGSGPGGTIPLRSIILDFEEMRLDCMLSCMSPYSAEKHNRPLACIEPRFPESLDSTVYGDCTPLLEIWFPRGVSPPSSLNISSLPSKHKPQRWLFPSEGAPRYLANRHISSPDTCWDVKHALVREVSLVCFQSSSTLDRMFLLSIPCQTPDPSAESMSMLCTGGRVHLGSSSSVIFTPHPGFFVPYTSIRNEFVRCRLLKHRYAWIHNRAVSCDGLGRRPNKTSPALYVISPIEASADFPRCSAINPDMCESPSRNRVMALEVRWVSGVLVRLAIPDYVTHATAESYSLRSCKESHDCAIVIPHVLCATSHAECEANMIAYPGGPTASQTHLKHSLTTDYACYFVFVLWTNACTGTLPGSEIHEAFVSSLWRSTLKSVTLPRSASSRYGVFHTPVTRLDSHLFLIRKLSWPFGAGDEERLWVTTLGLLPLAYRSSFRPQANSLYHSGTLDSPASTPIGCSLAGMSSILREEYFPCPLRTPNYVAHSKVAQCKEKLSLRIARWFVSQGRYGVLVVPSYLLKNTCRHPYLKKIVARVTRWAYPNARTSAVGYVAACPLYGCPGVCLVVVTLIMVIQDILGLGGGLHVLTLRVDYVNRVLLTGNEGVNGEVLWLRGFFFIKVAVLPHPCIPLLGTTHIRLGNGLAPRCEGLVVPPLSTTSGSDRRTQHGGASENQVAPVRHKCTVGTSYSDVTMSPKVSNGLAPTAVPSSRTRLHFPIVFSHPQCHRGQIKFCLGLYPRLFHLSESVRTAHSLVVSGSQQGCGYFAGCDLTCAPRLSHRIHRQIQ
TQDTSIRDCTIIPTESTTHKAPVTRASGLAVSTYKESTDLVDFPWHVHECSAFSRSIGTTTSPDNLPPLGRQGDVLPCRKKSCNVTVAGPTDRSKLSQGVGYPPAQSPCCYVSCETNISAFSPDGHVLPARQHYHLQTRQTFHPIENRVQILHKKRGYKGNSISKPEAFSLSRLLIHHFSLVILPHVQNLVRMQNTTKFGRDPATSIGSFIQCQYSEFFGYPITHRPSVWSSNKDISSSKGLGRLNFYCDIRQSFILLFGRGPANLSRWVTSRLTIVRTECRTSAHNSPSVNRHKPVGGAITSGVPNWESWKTKVLGSSVIPSRGLWIIIPQEQRESRRTLIIVCENERLTAIVIPIVFDVLFGNLEGGQSHSAYVSHSLHGNTHIETVFRPEWAGSKQGRVIAMNVKDPAFEAFVYRGLKQLSGLSHRKKQSNTGTHHSIGRDRRKTRGPARVNAPGGAVIVRRVCHLNIAFEEKAYHLWFCVHNQIWVALPSLRQQRGTLRVRWTRLTQTYRSTVMGYKTPPCTPVLSLRGVLRGKSSSLGKDKIHVSTDEADYPSNLSKTVPPLTRTSSLNAIVIIVHCPAKQARQFAESLYFHYVRRSSHKIWYNNTLYRITCRLAPVMPSSSLGLELWSRLPLPRLNSHLSSLGGPVTCSGPLRRSHVDAMPYATTALDIHTHVSMLVACGFLAADAVQPPEEGYCEDVGPALIVVTMCKSSAAVLRVLFNVVLPMPRLTTTPTRGLLLAFTRITPVTESRHGRSAVLQHCSGTSVVSCGTVDMCVRAAMPTPAMIVSALSRLARPFTTPQRDPTR


--------------------------------------------------------------------------------
/output/Assignment_02B.txt:
--------------------------------------------------------------------------------
 1 | CATGGGCCCCAATGTTTTGTTAATGTA
 2 | CATGGGCCACAGTGTTTCGTTAATGTC
 3 | CACATTAACAAAGCATTGCGGTCCATG
 4 | CATGGGCCACAGTGTTTCGTAAACGTC
 5 | CACATTGACAAAGCACTGGGGCCCGTG
 6 | CATGGACCGCAATGCTTTGTGAATGTT
 7 | CACGGCCCTCAATGTTTCGTTAACGTA
 8 | CACGGACCCCAGTGCTTTGTAAACGTA
 9 | CATGGACCCCAATGCTTCGTTAACGTA
10 | CATGGACCACAATGTTTTGTTAACGTC
11 | CATGGTCCACAATGCTTTGTTAATGTA
12 | CACGGACCGCAATGTTTTGTCAACGTA
13 | CATGGCCCCCAATGTTTTGTCAATGTA
14 | CATGGGCCACAGTGCTTTGTGAATGTT
15 | CACGGCCCACAATGTTTCGTGAACGTT
16 | GACATTGACAAAACACTGAGGTCCATG
17 | CATGGACCGCAATGTTTCGTGAATGTC
18 | GACATTGACAAAACATTGAGGACCGTG


--------------------------------------------------------------------------------
/output/Assignment_02C.txt:
--------------------------------------------------------------------------------
1 | 0 57 87 103 103 113 113 114 128 128 128 128 147 156 156 156 160 200 213 215 216 231 242 250 256 256 260 269 270 273 284 303 312 316 328 341 343 359 360 363 370 378 398 412 416 425 426 429 431 444 446 456 469 488 491 506 516 519 526 526 539 554 557 559 559 572 572 583 593 616 619 644 644 647 662 667 675 682 682 685 686 686 706 706 719 739 772 772 772 775 789 795 799 803 809 814 819 838 842 862 866 875 886 895 900 900 917 917 922 931 942 951 955 975 979 998 1003 1008 1014 1018 1022 1028 1042 1045 1045 1045 1078 1098 1111 1111 1131 1131 1132 1135 1135 1142 1150 1155 1170 1173 1173 1198 1201 1224 1234 1245 1245 1258 1258 1260 1263 1278 1291 1291 1298 1301 1311 1326 1329 1348 1361 1371 1373 1386 1388 1391 1392 1401 1405 1419 1439 1447 1454 1457 1458 1474 1476 1489 1501 1505 1514 1533 1544 1547 1548 1557 1561 1561 1567 1575 1586 1601 1602 1604 1617 1657 1661 1661 1661 1670 1689 1689 1689 1689 1703 1704 1704 1714 1714 1730 1760 1817


--------------------------------------------------------------------------------
/output/Assignment_02D.txt:
--------------------------------------------------------------------------------
1 | 156-113-113-131-163-147-131-137-97 97-156-113-113-131-163-147-131-137 163-147-131-137-97-156-113-113-131 113-113-156-97-137-131-147-163-131 97-137-131-147-163-131-113-113-156 131-163-147-131-137-97-156-113-113 147-131-137-97-156-113-113-131-163 156-97-137-131-147-163-131-113-113 137-97-156-113-113-131-163-147-131 163-131-113-113-156-97-137-131-147 131-147-163-131-113-113-156-97-137 113-131-163-147-131-137-97-156-113 113-113-131-163-147-131-137-97-156 137-131-147-163-131-113-113-156-97 113-156-97-137-131-147-163-131-113 147-163-131-113-113-156-97-137-131 131-113-113-156-97-137-131-147-163 131-137-97-156-113-113-131-163-147


--------------------------------------------------------------------------------
/output/Assignment_02E.txt:
--------------------------------------------------------------------------------
1 | 71-114-156-71-97-129-147-99-115-163-147-128-71-113-71-114-129-115-113-71


--------------------------------------------------------------------------------
/output/Assignment_02F.txt:
--------------------------------------------------------------------------------
1 | 128 110 112 97 9 225 122 78 94 78 231 93 359 59 125 100 84 228 341 343 228 206 328 240 456 353 162 309 325 34 113 181 210 97 309 138 266 32 7 135 248 250 135 113 235 147 363 260 69 216 232 20 88 117 4 216 359 128 221 487 81 187 253 228 212 356 469 471 113 356 334 456 368 65 584 481 290 122 437 59 74 25 453 162 241 309 338 225 106 437 531 300 393 172 25 659 106 253 359 3 425 400 69 384 528 641 118 643 285 528 506 21 628 131 540 237 50 756 653 3 462 294 609 231 134 246 197 625 147 334 413 481 510 138 397 278 609 506 275 368 147 634 81 228 334 400 375 44 359 503 616 93 618 260 503 481 603 106 515 212 25 731 628 437 269 584 206 109 221 172 600 122 309 388 456 485 113 372 253 584 653 422 515 294 122 147 781 12 228 375 481 125 547 522 191 506 650 763 240 765 407 28 650 628 143 750 113 253 10 662 44 359 172 878 775 125 584 416 731 44 353 256 368 319 747 269 456 535 603 632 260 519 25 400 731 97 641 410 503 282 110 135 769 216 363 469 113 535 510 179 494 638 751 228 753 395 16 638 616 131 738 101 241 650 32 347 160 866 763 113 572 404 719 32 341 244 356 307 735 257 444 523 591 620 248 507 13 388 719 756 525 618 397 225 250 103 884 115 331 478 584 228 97 650 625 294 609 753 866 343 868 510 131 753 731 246 853 216 356 113 765 147 462 275 981 878 228 687 519 834 147 456 359 471 422 850 372 559 638 706 735 363 622 128 503 834 425 194 287 66 553 147 253 319 294 278 422 535 12 537 179 422 400 522 25 434 131 650 547 356 188 503 125 28 140 91 519 41 228 307 375 404 32 291 172 503 278 47 140 406 106 172 147 131 275 388 390 32 275 253 375 287 503 400 209 41 356 372 81 160 228 257 144 25 356 172 34 300 66 41 25 169 282 284 169 147 269 181 397 294 103 250 266 54 122 151 38 250 528 297 390 169 22 656 103 250 356 422 397 66 381 525 638 115 640 282 525 503 18 625 128 537 234 47 753 650 459 291 606 228 131 243 194 622 144 331 410 478 507 135 394 275 606 659 428 521 300 128 153 6 787 18 234 381 487 131 553 528 197 512 656 769 246 771 413 34 656 634 149 756 119 259 16 668 50 365 178 884 
781 131 590 422 737 50 359 262 374 325 753 275 462 541 609 638 266 525 31 406 737 106 234 103 216 218 103 81 203 115 331 228 37 184 200 56 85 184 131 259 25 128 241 243 128 106 228 140 356 253 62 209 225 13 81 110 209 462 231 324 103 590 37 184 290 356 331 315 459 572 49 574 216 459 437 559 62 471 168 687 584 393 225 540 162 65 177 128 556 78 265 344 412 441 69 328 209 540 147 9 275 41 16 144 257 259 144 122 244 156 372 269 78 225 241 29 97 126 13 225 3 131 113 115 100 12 228 125 81 97 81 18 2 115 12 413 182 275 54 541 135 241 307 282 266 410 523 525 167 410 388 510 13 422 119 638 535 344 176 491 113 16 128 79 507 29 216 295 363 392 20 279 160 491 16 113 10 246 15 108 374 74 140 115 99 243 356 358 243 221 343 255 471 368 177 9 324 340 49 128 196 225 112 324 625 394 487 266 94 119 753 200 347 453 97 519 494 163 478 622 735 212 737 379 622 600 115 722 85 225 634 16 331 144 850 747 97 556 388 703 16 325 228 340 291 719 241 428 507 575 604 232 491 372 703 3 131 113 115 100 12 228 125 81 97 81 25 153 22 135 137 22 122 34 250 147 103 119 4 103 510 279 372 151 4 638 85 232 338 404 379 48 363 507 620 97 622 264 507 485 607 110 519 216 29 735 632 441 273 588 210 113 225 176 604 126 313 392 460 489 117 376 257 588 31 13 15 128 25 540 309 402 181 9 34 668 115 262 368 12 434 409 78 393 537 650 127 652 294 537 515 30 637 140 549 246 59 765 662 12 471 303 618 240 143 255 206 634 156 343 422 490 519 147 406 287 618 400 169 262 41 528 122 228 294 269 253 397 510 512 154 397 375 497 409 106 625 522 331 163 478 100 3 115 66 494 16 203 282 350 379 7 266 147 478 643 412 505 284 112 137 771 2 218 365 471 115 537 512 181 496 640 753 230 755 397 18 640 618 133 740 103 243 652 34 349 162 868 765 115 574 406 721 34 343 246 358 309 737 259 446 525 593 622 250 509 15 390 721 119 101 103 88 216 113 69 85 69 609 378 471 250 78 103 737 184 331 437 81 503 478 147 462 606 719 196 721 363 606 584 99 706 69 209 618 315 128 834 731 81 540 372 687 309 212 324 275 703 225 412 491 559 588 216 475 356 
687 294 63 156 422 16 122 188 163 147 291 404 406 48 291 269 391 303 519 416 225 57 372 9 388 97 176 244 273 160 41 372 481 250 343 122 609 56 203 309 375 350 19 334 478 591 68 593 235 478 456 578 81 490 187 706 603 412 244 559 181 84 196 147 575 97 284 363 431 460 88 347 228 559 6 103 528 297 390 169 22 656 103 250 356 422 397 66 381 525 638 115 640 282 525 503 18 625 128 537 234 47 753 650 459 291 606 228 131 243 194 622 144 331 410 478 507 135 394 275 606 69 197 66 179 181 66 44 166 78 294 191 147 163 19 48 147 237 6 99 365 65 131 106 90 234 347 349 234 212 334 246 462 359 168 315 331 40 119 187 216 103 315 50 32 34 19 147 44 16 609 378 471 250 78 103 737 184 331 437 81 503 478 147 462 606 719 196 721 363 606 584 99 706 69 209 618 315 128 834 731 81 540 372 687 309 212 324 275 703 225 412 491 559 588 216 475 356 687 300 69 162 428 22 128 194 169 153 297 410 412 54 297 275 397 309 6 525 422 231 63 378 15 394 103 182 250 279 166 47 378 397 166 259 38 525 119 225 291 266 250 394 507 509 151 394 372 494 406 103 622 519 328 160 475 97 112 63 491 13 200 279 347 376 4 263 144 475 285 54 147 413 7 113 179 154 138 282 395 397 39 282 260 382 294 510 407 216 48 363 379 88 167 235 264 151 32 363 334 103 196 462 56 162 228 203 187 331 444 446 88 331 309 431 343 40 559 456 265 97 412 34 49 428 137 216 284 313 200 81 412 34 16 18 3 131 28 384 153 246 25 512 106 212 278 253 237 381 494 496 138 381 359 481 393 90 609 506 315 147 462 84 99 50 478 187 266 334 363 250 131 462 197 59 325 25 91 66 50 194 307 309 194 172 294 206 422 319 128 275 291 79 147 176 63 275 118 246 12 115 228 230 115 93 215 127 343 240 49 196 212 68 97 196 50 178 47 160 162 47 25 147 59 275 172 128 144 29 128 21 149 18 131 133 18 118 30 246 143 99 115 99 393 162 255 34 521 115 221 287 262 246 390 503 505 147 390 368 490 402 99 618 515 324 156 471 93 108 59 487 9 196 275 343 372 259 140 471 134 262 28 3 131 244 246 131 109 231 143 359 256 65 212 228 16 84 113 212 628 397 490 269 97 122 756 203 350 456 100 522 
497 166 481 625 738 215 740 382 3 625 603 118 725 88 228 637 19 334 147 853 750 100 559 391 706 19 328 231 343 294 722 244 431 510 578 607 235 494 375 706 253 22 115 381 81 147 122 106 250 363 365 7 250 228 350 262 478 375 184 16 331 347 56 135 203 232 119 331 50 32 34 19 147 44 16


--------------------------------------------------------------------------------
/output/Assignment_02G.txt:
--------------------------------------------------------------------------------
1 | 57-137-163-115-97-113-128-131-128-163-113-156-97-66-97


--------------------------------------------------------------------------------
/output/Assignment_03A.txt:
--------------------------------------------------------------------------------
1 | AGGTA GGAAG GGCAG GGGAG GGTAG


--------------------------------------------------------------------------------
/output/Assignment_03B.txt:
--------------------------------------------------------------------------------
1 | CAGGAG


--------------------------------------------------------------------------------
/output/Assignment_03C.txt:
--------------------------------------------------------------------------------
1 | AGGGAA


--------------------------------------------------------------------------------
/output/Assignment_03D.txt:
--------------------------------------------------------------------------------
 1 | CGCCCGGTTTCA
 2 | TCCTGAACTCGC
 3 | ACCATAGATCAA
 4 | GGCCAAGACTCA
 5 | GGCTGAGTTTAA
 6 | GACCAAGGCTAA
 7 | GGTGCCAAGTTA
 8 | GCCTGAAGCTAA
 9 | GCCTACGGCCTA
10 | GACCAGGACTGA
11 | TCCTGAACCTGA
12 | GTCCACGCCTTA
13 | GTCCAGGTCTAA
14 | GCCCAAGACTAA
15 | GGCCACGGCTGA
16 | GGCCAAGTCTGA
17 | GCCCAAGACTCA
18 | GGCCAGGGCTAA
19 | AACAAGGACTCA
20 | GTCCAGGTCTCA
21 | GCCCAGGCCTGA
22 | GTCTCAATCTTA
23 | GGCCACGCCTTA
24 | GTCCACGGCTTA
25 | GCCCAAGTCTCA


--------------------------------------------------------------------------------
/output/Assignment_03E.txt:
--------------------------------------------------------------------------------
 1 | ACCTCTTCTTTG
 2 | AGCCCTTCTTTA
 3 | AGCCCTTCATTC
 4 | AACGCTTCGTTC
 5 | AACCCTTCATTA
 6 | ATCACTTCGTTG
 7 | ATCTCTTCATTC
 8 | ATCTCTTCTTTC
 9 | ATCGCTTCGTTC
10 | ATCACTTCATTT
11 | AACCCTTCGTTA
12 | AACGCTTCATTT
13 | ACCACTTCTTTG
14 | ACCACTTCGTTG
15 | ATCGCTTCGTTA
16 | ACCCCTTCGTTC
17 | ACCCCTTCCTTG
18 | AACGCTTCATTT
19 | AACACTTCTTTC
20 | ATCGCTTCGTTC
21 | AACTCTTCTTTG
22 | ATCTCTTCCTTA
23 | ACCGCTTCCTTG
24 | AGCACTTCTTTG
25 | AACCCTTCGTTC


--------------------------------------------------------------------------------
/output/Assignment_03F.txt:
--------------------------------------------------------------------------------
 1 | CACCCTGATTCCATC
 2 | CTTTGGGGTTTGGCC
 3 | CTTCCTGGTAACGCC
 4 | CTTCCTAAGTTGGCC
 5 | CGAGCTGGTTTGGCC
 6 | CTTCCACTTTTGGCC
 7 | CTTCCTATCTTGGCC
 8 | CTTCCTGGTTTGAGG
 9 | CTGTGTGGTTTGGCC
10 | ACCCCTGGTTTGGCC
11 | CTTCTACGTTTGGCC
12 | CTTCCTGGTTTCTAC
13 | CTTGAAGGTTTGGCC
14 | CTTCCTGGAGGGGCC
15 | ATTCCTGGTTTGGGA
16 | CTTCGATGTTTGGCC
17 | CTTCCTGGTTGAACC
18 | CTTCCTGCGATGGCC
19 | AATCCTGGTTTGGCT
20 | CTTCCGACTTTGGCC


--------------------------------------------------------------------------------
/output/Assignment_03G.txt:
--------------------------------------------------------------------------------
 1 | TTGAGTAAAAGTTAC
 2 | ATGATACGAAGTGAG
 3 | TTGCGTCGAAGTGCC
 4 | ATGCGTCTTTGTGAG
 5 | ATGCGTGCTAGTGAG
 6 | ATGCGTCGCTTTGAG
 7 | ATGGTGCGAAGTGAG
 8 | ATGCGTCGAGCAGAG
 9 | ACAAGTCGAAGTGAG
10 | ATGCTAAGAAGTGAG
11 | ATGCGTCGAAGTATC
12 | ATAACTCGAAGTGAG
13 | ATGCGAAAAAGTGAG
14 | ATGCGTGCTAGTGAG
15 | ATGCGTCGAAAGAAG
16 | ATGCGGTTAAGTGAG
17 | CCGCGTCGAAGTGAC
18 | ATGCGTCGAAGGTTG
19 | CCTCGTCGAAGTGAG
20 | ATGCTATGAAGTGAG


--------------------------------------------------------------------------------
/output/Assignment_05B.txt:
--------------------------------------------------------------------------------
1 | TCTCTATAGCATTCAAAGGGGACGCGCCACTTTTAAACCTTGTGTTGTCCGCTAAACTTTGGGGCCTCTGAGAGCTGGATTGTGCCCGCTCTAACTAGTCGATCAAGAGCTATTTGGCAGACGCGCTAGGATTTCAACGGCAAACACAACGCGGTCACTCTAGTAAATACCTGTGCACACAATGTCAGCGCCGTCACCAATGCGTGTGATTGGCAAGGGGTTTCACTAGTTCCTTCCGGGGCCCGGTTGTAATATGATCATGGAGCCCCTCTTCGCCATCAGCAGACCATGTTATTTAAATGTATACTCCGGTGTCACAGAGAAGCCCACCTGATGGTGCGGGCTGGGGTGCGCCGATGCCCTAAAGTAGGGAGCACGTCCTCCCAGTCTACTGTGGTACATGGACCCTTTTCGAAGGGTGCGAAACGACTGCGATCCGCACTCATTAGGTTGGGAGTTCAGGAACGAGGACGGTAGTAGCTAGACTTAGCTAACGGGACGCGCCGAACCCTCTTGACTGTGGGGTTCTCAATGCTTTATCATAAAAACTAGGTACACGGTTGTGCTCCGATAGCGCAGTGGAGCAACACCAACAGATGGGCCCCGGCGGGGAGATGCTGCTATTATTTTTTACATGACTCTAAAGTCTTATAACGGCTGCCAAGGCGTTTTAACTCATGACAACACAGGATCCGGCTCGACCAGGAGAAAACCCCCCTATCTCGTTTATCTAATGACAGCTCATTAGCTGTTCAAACGACCGTGCGACTCGTGTGGCAGCCCCCGGGACCTGCCGAGAGCGATCTAGCCATGCGCTCCTAATCGCTACTTTTTATGGGAAGCAACATTCCATCAGCTTGTGATGCCATTCTTTCAGCCCGTTATTCCCAACAATATCCCCAAATATGAGTGGGCTTGCGCTTATCGGGATTATGGGTCACGTGTAGTGCAGACGTGTCCTTACGAAAGACGGGTCGCTCCCACGCTACGGATTAACAAGCATGCGTTCCTCCTGTCACACTTACCCCTCAACTACACACGGCAGGAACGCCCTGCGAACACCCATACTTGTTTCAAGCTGCTTTAGCCCCGGGGCTACGGCAAAGACAGCCTGACCACACTCCGTACAGTTTGGAATTATGAGGAGTGATGACATAGAGCATTGTCTTGTGCCCTGTCTACCGGGCTACGAGGGATTTTAGATGCAATCGTTCTGCGAATCAATTGGTTGGATGCCCACAAGTTGCTTTTATCTAGGACCCCTAGCCTGCGCCGGTATTTAAAAGAAGGCGGCTCCGCGATAGCGCGTTCCTCAAAGTTGAGGGGAGTATCAGTCTTCGGAACCTCCACCCACAGGCAGTTGGGACTTCGTAGACGGAGCAGACAAATGAATCCCGGAGTCACCCCTGCGACGGCGTCCTTCTTCATGGGCCGTCAATGAGGTATTTCCCATATGCAGCGTTCGCGATGCAGCGGGATAATTTGGCTTAACCCTCACGAAAGACACGTTTCCGCTGTGGAAGACATTGCCTCATTCTTGATCGATACGATTCGCGTTGTCCCCTCCTTACTGCACATAACCTACGGTGAAACGCGAACCCTGGAGTCGGTATATCTTCCATTTTCGCATTGCATGCATGAACGTGGAGTTGAAGGGGGATCTTCAACGGAAGATTTCTATCACGGAGCCGGGCGTTTCGACTGTTAGTCAAATGATGGATCCTATAGGACACGCGGAGCCTCTGGCTAAGACTGCACCACCCTAACCAATGGTGGACATAATGGTTTCGATAGCGGCTCTACGACGTGCTACATGGCATAAGAGCTGCCGTATACCTGCAAGTCCACTCATCTACCCACAGCTTCCTCCATGTCCGTCCTACGATCTTTTGCCACACCTCGGCAGTGATATGCTTTCTCCACAATCTATACAAGTAACATGTCAATTTTCCAGTTGGGAGTCGTAACACTAGAAATATGGTATGTCTGAGCATTA
GTGA


--------------------------------------------------------------------------------
/output/Assignment_06A.txt:
--------------------------------------------------------------------------------
1 | 762


--------------------------------------------------------------------------------
/output/Assignment_06B.txt:
--------------------------------------------------------------------------------
1 | 85


--------------------------------------------------------------------------------
/output/Assignment_06C.txt:
--------------------------------------------------------------------------------
1 | GTAGGTGACTTCCACAACTCTCCACCGGGAAGATGTATGTAAGATCGACTGGGGTGCGGATGCTTCGACAGCCAGAGCACAACGACGTCTGCGTTAAGTATCTAATGTAGGCCGTGGGTCCCGGACTTGACCCGGAAAATAAACTATCATGGCTATGGTGAAGCTTCGTATCATACACACGGCCGTCAGCAATTTAGAACGTTAGCGAGGATTAGGCTTTTCGAGTCGTGTTTCGGGCCCCTTTTGCACATAGGTCAGGGTGTACTCCTCTTATGGCTTGCTGGGATCCAAGATCCGGGCGTAGTTTAGCCGCCTGGACAGGTCAAACGCAAAAGTACGCCCTGCTGTATAACGTGCGATAGGGTATACAACTACAAGAGCAGATAGCTGGTAAGTGTAGAATCACTGATGCGCGCACCTAATTCGTGGGACATGCCTCCACAGACCGACGTACTTTTTTACCTGATCGACCGCCCACTGCTCGGGAATCGCGCAGTTCGACATAAGATGTCGGCTCCAGGGGGCCACGTCTTTAGAGATAAAGTTCTTTAATTCTGTTGAGGTCCTCGCCATAAACTCGCTTCACTGACCTATGATGCTACTATA


--------------------------------------------------------------------------------
/output/Assignment_06D.txt:
--------------------------------------------------------------------------------
1 | 193
2 | 0->1->3->5->6->8->13->16->18->20->21


--------------------------------------------------------------------------------
/output/Assignment_06E.txt:
--------------------------------------------------------------------------------
1 | 2280
2 | IWWRDFMAEFMWQNSGSRAMCFFNR---I-VCWNARALNYKNCSLQVKERLAAKRCYATHPE-AI--TDGGIFAECEQTNYDFREHKSIFMFCP--T--WYGEQNHEVLGRHDHCHEMCHTCKCFRWRCHSSAKKLGCFPWYIPKFHYIKPMVVYHHMLTIHYKIPSQNKSDDALIGNAKIVMHLTGCQYNNYPTYSFEMPPCDT--W--PDAQVVKVTMFPIFIA-FQSKTWWSMILSSSSGYLPYLMNCPNNLVWQVP-LRCC-YCCG-NQNLEKPMQHSCGKPAAPDIRSRSQDFHWNRLQRVQEDIVSIEFKWSEELIEQWTFWVFYNLAHMGYAPDGYV--TEFIWHGTYCIKFYKLGKQRQWKSQQHYCTMWAVRRVRWRIVV--H-YETKRRENYWIKFPPMCGERT-W-HRYCREKRVEGL-V-NFEIHQWQNQPLFKYMFHRCKMMINC-------REDGPSHTSQKNQHNLHFDMRNYSTQGMYDFTNITAKLPYISCTYHKMLWPMPARQMTHVGRESYKWSHCRMRGNTSW----PQYSEH-R-F---------YFPPS-MQWWMTIKNMLCATNRKKEVFNH-----D-SVDKFR----DV-SPHMPYDIIQEQWGGPFMYDTMEFEVLTQMDIFYMYDAYYSRCCSMFKTFALWNRSKYMQMDWNNNNQTIPIAKWASHMYWQELLDVTMRACCDKYWSTYIMIKANYVEPRMGVLANVGPRHQHWAHCKHNVAIWQSFTHRMAGSMAKEEPWFACLYLIREVIICGETCDLLGCVQMQWKDWCRGQGKKYGMVMG-K---MIEALK--FF------FTLFVGV--V----WQQSCQEEQYGDMVEEYQIVMWHDSWLTILQYKQIMRQWGKQ--PM-RS--MYP-APFERYLWCCFNFEWDVQIVDYVLI-RTFCPL---VNQMWIETC-NIQLQASDHNHCCDCCQATMLMDTHHPHDYNKDDQEPCMCDHCEV
3 | IWW---M----WQNSGSRAMCFFNSDGDIIVCWNAGA--WM-MS-GTS--VGNKRCYATHPEHEMFATDGGIFAECEQTNYDFREHKSIFMFCPWMVVNWYGEQNHEVLGRHDHCHEMC-MCKCFRWCCHSSAKKLGCFPPYIPIFHYIKPHHMLMDFGNIHYMIPS----D--LIGNAHIVM-----QY---PT----MPPCTTQNMIMPDAQVVKVTMFPIFIGPIQAEYVFQGNLPYMN-HR-FQVEGPNNLVWQVTTVELGQYCCNWNQNLIKPMQHSCGKPAAPDIRSRS-------LQRVQFAIVIIEFKQSEEL-E--------GMAPDGYVTDTQIFWSEFIWHGTYCIKF---G------SQQHDCTMWAQRRVRWRIVVMRYPYETKARENYWIKFPPICGERTGWWHRSCREKR--GLYTWQFEIHGMFHSPASSAKVN-CKMMINCGDIAPCLREDGPSHTSQKNPHNLHFCMRNFTNITMHKLM-VM-RLDYISC------W-Y--RHMLHVGRENYKWSKCRMRGLTSLEEQNPQYSEHFLLFLMQQLLSDIYFPPSGMH---TIKNMLCATNRKKAVFNHVAMEWDCSVDKDRCVAHDGGAPCKPYDIIQE------MYDTMVFEVW--MDIFYMYDAYYSRCCS-FHQF-V-TRSK-CA--W----------KWASHMYWQELW--T--AIYEK-GNT----PANK-EPRMGVLANVGPRHQHWGHCKHNVA--QSFTHRMAGSMAKEE--FDT-W---S-F---ETVDLLGCVQMFWKDWCRGQGKKYGMVMGGSYINMLEALKWPYYWIQQMKFTLFVGVFLIRFDRWQQSFQEEVYGDMVEEYQIVMWHDSWLTILQYKQIMRQWGKSMYPAYQNLEMYTWEEMVRSAEDCFNFEWDYP--SWPMSERTFCPLDTAVNHMWIETHGNIQLQASDGNHCCDCCQATMLMDTHHPHDYNKDDQEPCMCDHCEV


--------------------------------------------------------------------------------
/output/Assignment_06F.txt:
--------------------------------------------------------------------------------
1 | 1188
2 | FAPAF-NRYCNDVLYMRWNQICNPPH-FFKFNGRCRTMT-K-S--DWTGAVVQMSQTMMDKVTTP-KFATVVC---FGGQ-L-GKI-WE-PENWVIIKPVDLTT-RCPWCLRNEDIDTWCLAEL-KMIETGIGSPLAICRS-SGVPCLYQWNLMALIPD-MP-P-RG-RGMGEKTPTGMNVLAHGHFQH-CY-YD--Y-NR-QSTGL-V-FPRTFEWSNRFHHTVQ--QDMQFLWI-CKVDADDETERLV-YGGKPCPRKDIEYVDPMQNQASLYEWV-WEFGDLEATTRYPPITCMKAEIFSIAGLWFSSSRARQYDHDYNMGTFT-S---IQSQG-ACKICAARTHWCCGTGTPFKLASDTLFQRKAAVMGFETCKIYFAPWRNIQMATV--GSYEIHHFYTYNAESYP------Q----CG-THRTGTTEW----SDNPSGNFIVLTYCHPENYFMSYYDLKGAMASECICKET-K----EPHSLWIHYANNFH---FPQCCLVNHSRVGQCERKCVKNPLIFWREVMKGTHPGCQNLCMGVVGQVTETRCHFNNFHKETRS------P-KALNQPMKPPAEYSRKTVCDIASILIVKLSPFEQFEMQSPQQCHTVHASWKGQVGWMRWWGMMIQGIKPQTHAPYEHFSDFTHEPATARDELCETRWIEYVRKALFTCHEGRH-QY------HWLT--HPVVQMQQGFCESNDQHPYDFLFKHGFDQHWHWMIGLC--PVPRG-LQPWTMRAKYMPMLPQFLTNAKGDYTHGFSFI-DHC-QMICMMCMQTDMQQQQASTHS--PQQDHYSNT-KM-FKAS-CM-GQEP--QEM-CWNRVAGIRWPEFDAPDWADQF-PKFVGTPKCNLDALYYQDGQHKDERE-FV--D-WAQFQQGTCAEY--WVPAKFE-HWLDE-DGQ-AK-SKFFTQYM
3 | FGPEFAHQW-AQG-Y-DYEHCCSPTQENNDYFIRCAALIIEMVFYDKTFSRIDMNRTD-EHHDTPWHFPHK-CNYQFPERPIPDTIDNTVNENW-F-WP-DMNNGKCS-STHPGVLEG--EKDMVHLVDNGTNN-LGSFWVFCDTN-WYYWHAWPNTPNGLPNPTMDLR-MNLK-PCAQPPAPESHHQETCFCYHCSYVTEIQEPDVHGHYVSKSTKDGEIWCKVQCWTDLAC-WFACE-KMNKLY--LIWYGPRDFITIDPCYPDECQNVKERTDKPEWNSPAWNNNHPFEA-G-ITGSFWNHATCDFGDGGSSMF-HDYNMGTFTDNDFGVDIQQWACKIGALLTHWCMGTG---KLASDTSFQRKAAVDG-EVGTIYFAPWRNNQMATERYGSYEIHHWYTYNAESYPNTTVMKEVYEWCVMAHRTGTTESAHLHSDNPSGLFIVRTY-E-KACACHYYDLKGAMASECICKETHRFDQMEPSSLWIHYAYDFQTNDFPQCCLVGHSRVGQCERKCVKNPLIF-NSVMKGTHPGCQNLCMGTVGQCPETRCHFNNFHKETWSCMDGQTSAKALNGPMKPPAEYSRKTICDIASILIIKLSPFEQFEMQ-------VHASWPGQVCFIDMYVMMIQ-IKPQT------FSD-----A-----M----WIEYVRKALFTKHEGRHCRYGRYMPYHWLTCWNDVVQMQYGFCENW-MLTWVHM-K--YKEN-HPCVALHDKPHQHSHMKTVIFKEK-MAMMNP-MDQEL-RYTSQW-YKRARCDQQVKRRFIK-NMAWLHLDTQEIIPTVRCFKNVVPLYFWTFFCFFVAASFFKIFKCWTPWT-YQWP-VTCS-WMN-YDGRIL--PKGHAQA---KEKQSTSDWETFKRKQPW--YDQS-YNQYEAWPRCNFGWPWASHRTAHFEKHTSVFI-YM


--------------------------------------------------------------------------------
/output/Assignment_07A.txt:
--------------------------------------------------------------------------------
1 | 342


--------------------------------------------------------------------------------
/output/Assignment_07B.txt:
--------------------------------------------------------------------------------
1 | 20
2 | TCATCGATACCGCTCCTCTTACTAATACTAATTGGGG-A----AG-T--AGGCTTTAGGT-TGT-A--TACTCGCACTGCACGC-GGT
3 | TC-TCAATTGGGCTC-TCTTATTT-TCCTTATCGGGACATCCTAGATTCAGGCTCT-GGTGTGCCAGGTAC-C-CATTG-A-GCTGGT


--------------------------------------------------------------------------------
/output/Assignment_07C.txt:
--------------------------------------------------------------------------------
1 | 61
2 | AAAA-CGAAGATGGACATTGACG-GTATCTCCATGGGCTAAGAATCGATGCCA-GGGAGACTCGAT-CTCTAC--TCCACATCGTATTA-AGCCT-AAATGTATAATCAGGTTACACAGCG-GAGCGCAATAACCCGTGGCTGCCAAGGCAGTCAGTGAGTCTAGG-A--ATGTCAGATCGAACT-A-TTATAA-GGATGGTCCGGCG-TGGTCGCGCTATCGGCCACCATCTTATTTGTCCACC-TTGC
3 | AAAATCGAAGATGG-CATCGACGTGTA-CTCCGTGAGCTAGGTATCGCCGCCAGGGGA-ACTCGATCCTCTACTTTTAAGATCATATTACAGCCTAAAATTTATAATCGGGTTACACAGCGTGA--G-AATAACTCGTTGCTGCTAAGGCA-CCAGT-A-T-TCGGTATCAT-TAAAAT-GAA-TAACTCATAAGGGATGGTCCGACGCTGGTCGCGCCACCGG-CACCATC---CTTCACC-CCTTTAC


--------------------------------------------------------------------------------
/output/Assignment_07D.txt:
--------------------------------------------------------------------------------
1 | 293
2 | YETDVPSCFQRPQAHRQSSTPMRKGIMYEREKHSSGFPNDWWA-DLCTMTY---DDCCDWCECCFCSNYEAGI----QMIC
3 | YETDVPSCFQWIQWHPMWSTPMRKGIMYEREKHSSGFPNDWWAPQFVTLYYHLDDGSMDWCECCFCSCYEAGIKMTRQMIC


--------------------------------------------------------------------------------
/output/Assignment_07E.txt:
--------------------------------------------------------------------------------
1 | (519, 525) (520, 526)


--------------------------------------------------------------------------------
/output/Assignment_07G.txt:
--------------------------------------------------------------------------------
1 | 4
2 | CTCATG--T------C-G
3 | ---TTG-GT---GGCCT-
4 | ---ATGA-TAAA---C--


--------------------------------------------------------------------------------
/output/Assignment_08B.txt:
--------------------------------------------------------------------------------
1 | 166


--------------------------------------------------------------------------------
/output/Assignment_08C.txt:
--------------------------------------------------------------------------------
1 | 8971


--------------------------------------------------------------------------------
/output/Assignment_08D.txt:
--------------------------------------------------------------------------------
  1 | (3101, 8976)
  2 | (3101, 19060)
  3 | (3312, 1506)
  4 | (3312, 13771)
  5 | (9180, 5713)
  6 | (9181, 5714)
  7 | (9181, 16928)
  8 | (9182, 16929)
  9 | (9622, 15514)
 10 | (14233, 1837)
 11 | (14233, 14775)
 12 | (14233, 15891)
 13 | (14383, 1556)
 14 | (14383, 4129)
 15 | (14383, 6978)
 16 | (17714, 8237)
 17 | (17715, 8236)
 18 | (17715, 8740)
 19 | (18451, 7648)
 20 | (18451, 10875)
 21 | (18451, 16548)
 22 | (21500, 11515)
 23 | (21501, 11516)
 24 | (21501, 19082)
 25 | (23126, 12554)
 26 | (23127, 2982)
 27 | (23127, 12555)
 28 | (23128, 2981)
 29 | (23128, 12556)
 30 | (23129, 12557)
 31 | (23826, 10609)
 32 | (23826, 13960)
 33 | (23826, 18178)
 34 | (26499, 6943)
 35 | (35242, 12833)
 36 | (35243, 1364)
 37 | (35243, 12832)
 38 | (35244, 12831)
 39 | (36141, 7648)
 40 | (36141, 10875)
 41 | (36141, 16548)
 42 | (36971, 15514)
 43 | (41047, 1837)
 44 | (41047, 14775)
 45 | (41047, 15891)
 46 | (41138, 10574)
 47 | (41139, 10573)
 48 | (41140, 8922)
 49 | (41140, 10572)
 50 | (41140, 10738)
 51 | (41141, 8921)
 52 | (41141, 10571)
 53 | (41142, 8920)
 54 | (41142, 10570)
 55 | (41143, 10569)
 56 | (41144, 10568)
 57 | (42857, 7145)
 58 | (42857, 9550)
 59 | (42857, 14153)
 60 | (42858, 7144)
 61 | (47548, 8236)
 62 | (47548, 8740)
 63 | (48000, 850)
 64 | (48001, 849)
 65 | (49205, 10819)
 66 | (49205, 19926)
 67 | (49206, 19925)
 68 | (50555, 5713)
 69 | (50556, 5714)
 70 | (50556, 16928)
 71 | (59633, 2814)
 72 | (59634, 2140)
 73 | (59634, 2813)
 74 | (59634, 8208)
 75 | (59635, 2812)
 76 | (61593, 18177)
 77 | (61594, 10609)
 78 | (61594, 13960)
 79 | (61594, 18178)
 80 | (61767, 7649)
 81 | (61767, 16549)
 82 | (61768, 7648)
 83 | (61768, 10875)
 84 | (61768, 16548)
 85 | (61769, 7647)
 86 | (61769, 10874)
 87 | (61769, 16547)
 88 | (61770, 16546)
 89 | (67029, 16511)
 90 | (67030, 3173)
 91 | (67030, 16510)
 92 | (68070, 1837)
 93 | (68070, 14775)
 94 | (68070, 15891)
 95 | (68071, 1838)
 96 | (68071, 15890)
 97 | (69651, 7585)
 98 | (73329, 8922)
 99 | (73329, 10572)
100 | (73329, 10738)
101 | (73330, 10739)
102 | (81821, 16511)
103 | (81822, 3173)
104 | (81822, 16510)
105 | (81823, 3172)
106 | (86287, 6477)
107 | (86288, 6478)
108 | (86289, 2231)
109 | (86289, 6479)
110 | (87799, 8975)
111 | (87800, 8976)
112 | (87800, 19060)
113 | (90479, 1558)
114 | (90480, 1557)
115 | (90481, 1556)
116 | (90481, 4129)
117 | (90481, 6978)
118 | (90482, 1555)
119 | (90482, 4130)
120 | (90482, 6977)
121 | (90483, 1554)
122 | (90484, 1553)
123 | (90485, 1552)
124 | (90486, 1551)
125 | (90834, 14331)
126 | (97463, 2983)
127 | (97464, 2982)
128 | (97464, 12555)
129 | (97465, 2981)
130 | (97465, 12556)
131 | (100991, 73)
132 | (100991, 4236)
133 | (100992, 74)
134 | (100992, 4235)
135 | (100993, 4234)
136 | (101660, 11168)
137 | (101661, 11169)
138 | (103607, 2183)
139 | (103607, 11622)
140 | (106485, 2089)
141 | (110479, 4310)
142 | (110480, 3229)
143 | (110480, 4311)
144 | (110481, 3228)
145 | (116572, 1506)
146 | (116572, 13771)
147 | (116731, 11567)
148 | (116732, 11568)
149 | (116733, 8706)
150 | (116733, 11569)
151 | (119468, 11912)
152 | (119468, 18358)
153 | (119637, 9266)
154 | (120783, 12945)
155 | (121023, 3914)
156 | (121023, 16412)
157 | (122187, 15277)
158 | (122188, 15082)
159 | (122188, 15276)
160 | (123008, 2139)
161 | (123009, 2140)
162 | (123009, 2813)
163 | (123009, 8208)
164 | (125462, 3914)
165 | (125462, 16412)
166 | (126911, 19084)
167 | (126912, 19083)
168 | (126913, 11516)
169 | (126913, 19082)
170 | (126914, 11517)
171 | (138848, 1509)
172 | (138849, 1508)
173 | (138850, 1507)
174 | (138851, 1506)
175 | (138851, 13771)
176 | (139512, 19957)
177 | (140958, 9554)
178 | (140959, 9553)
179 | (140960, 9552)
180 | (140961, 9551)
181 | (140962, 7145)
182 | (140962, 9550)
183 | (140962, 14153)
184 | (142363, 4309)
185 | (142364, 4310)
186 | (142365, 3229)
187 | (142365, 4311)
188 | (142366, 4312)
189 | (148879, 73)
190 | (148879, 4236)
191 | (149290, 11568)
192 | (149291, 8706)
193 | (149291, 11569)
194 | (149292, 8705)
195 | (152155, 4127)
196 | (152156, 4128)
197 | (152156, 6979)
198 | (152157, 1556)
199 | (152157, 4129)
200 | (152157, 6978)
201 | (156885, 18356)
202 | (156886, 18357)
203 | (156887, 11912)
204 | (156887, 18358)
205 | (160278, 6211)
206 | (160278, 12265)
207 | (164471, 19612)
208 | (165915, 7145)
209 | (165915, 9550)
210 | (165915, 14153)
211 | (167383, 2231)
212 | (167383, 6479)
213 | (167384, 6480)
214 | (170530, 8922)
215 | (170530, 10572)
216 | (170530, 10738)
217 | (173043, 7079)
218 | (173043, 9036)
219 | (178246, 2184)
220 | (178247, 2183)
221 | (178247, 11622)
222 | (178248, 2182)
223 | (184381, 6212)
224 | (184382, 6211)
225 | (184382, 12265)
226 | (188452, 10609)
227 | (188452, 13960)
228 | (188452, 18178)
229 | (188453, 10610)
230 | (188453, 13959)
231 | (189341, 16043)
232 | (190643, 15278)
233 | (190644, 15277)
234 | (190645, 15082)
235 | (190645, 15276)
236 | (190646, 15083)
237 | (195082, 2138)
238 | (195083, 2139)
239 | (195084, 2140)
240 | (195084, 2813)
241 | (195084, 8208)
242 | (195085, 8207)
243 | (195238, 7079)
244 | (195238, 9036)
245 | (195239, 7080)
246 | (196374, 12809)
247 | (196375, 12808)
248 | (196789, 15082)
249 | (196789, 15276)
250 | (196790, 15275)
251 | (198444, 10819)
252 | (198444, 19926)
253 | (198445, 19925)
254 | (199587, 1364)
255 | (199587, 12832)


--------------------------------------------------------------------------------
/output/Assignment_09B.txt:
--------------------------------------------------------------------------------
1 | 36 43 618 1082 1089 1286 1293 2030 2224 2231 2329 2430 2577 2584 2827 2952 2967 3142 3149 3239 3410 3803 3810 4102 4109 4290 4297 4857 4864 5287 5386 5418 5500 6335 6342 6615 6636 6643 6880 7117 7124 7291 7298 7707 7714 8267 8576 8583 8676 9094 9145


--------------------------------------------------------------------------------
/output/Assignment_09E.txt:
--------------------------------------------------------------------------------
1 | AAAGTCTGGTAATT


--------------------------------------------------------------------------------
/output/Assignment_09F.txt:
--------------------------------------------------------------------------------
1 | GAAAAAAT


--------------------------------------------------------------------------------
/scripts/DNA_RNA_Operations.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''ROSALIND bioinformatics scripts that operate on DNA and RNA.'''
 3 | 
 4 | from string import maketrans 
 5 | 
 6 | # Kind of pointless, as it's so simple.
 7 | def DNA_to_RNA(dna):
 8 | 	'''Translates DNA to RNA'''
 9 | 	return dna.replace('T', 'U')
10 | 
11 | # Kind of pointless, as it's so simple.
def RNA_to_DNA(rna):
    '''Converts an RNA string back into DNA.

    Every 'U' in rna is swapped for 'T'; all other characters are
    returned untouched.
    '''
    # Cutting the strand at each 'U' and gluing the pieces back together
    # with 'T' substitutes every occurrence in a single pass.
    pieces = rna.split('U')
    return 'T'.join(pieces)
15 | 
16 | 
def _reverse_complement(strand, bases, complements):
    '''Complements every base of strand and returns the result reversed.

    bases and complements are equal-length strings: the character at
    position i of bases is replaced by the character at position i of
    complements. Lowercase forms of the same pairs are translated too, and
    any character not listed is passed through unchanged.
    '''
    # string.maketrans exists only on Python 2; Python 3 offers the same
    # two-argument form as str.maketrans. Resolving it here keeps the
    # function usable on either interpreter.
    try:
        from string import maketrans
    except ImportError:
        maketrans = str.maketrans

    # Pair the lowercase bases as well so lowercase sequence is complemented
    # instead of silently slipping through untranslated.
    transtab = maketrans(bases + bases.lower(),
                         complements + complements.lower())

    # Reversal moves any trailing whitespace of the input (for example the
    # newline left over from reading a file) to the front, where lstrip()
    # discards it.
    return strand.translate(transtab)[::-1].lstrip()


def ReverseComplementDNA(nucleic_acid):
    '''Returns the reverse complement of a given DNA strand.

    A<->T and C<->G are exchanged (in upper or lower case), the strand is
    reversed, and leading whitespace of the result is stripped.
    '''
    return _reverse_complement(nucleic_acid, 'ATCG', 'TAGC')


def ReverseComplementRNA(nucleic_acid):
    '''Returns the reverse complement of a given RNA strand.

    A<->U and C<->G are exchanged (in upper or lower case), the strand is
    reversed, and leading whitespace of the result is stripped.
    '''
    return _reverse_complement(nucleic_acid, 'AUCG', 'UAGC')
32 | 
def HammingDistance(seq1, seq2):
    '''Counts the positions at which two equal-length sequences differ.

    Raises ValueError when the sequences do not have the same length.
    '''
    if len(seq1) == len(seq2):
        mismatches = 0
        # Walk both sequences in lockstep and tally the differing positions.
        for left, right in zip(seq1, seq2):
            if left != right:
                mismatches += 1
        return mismatches

    raise ValueError('Undefined for sequences of unequal length.')


--------------------------------------------------------------------------------
/scripts/Protein_Dictionaries.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | '''A ROSALIND bioinformatics script to create RNA and DNA to Protein dictionary.'''
  3 | 
  4 | def ProteinDictDNA():
  5 | 	'''Returns a dictionary that translates DNA to Protein.'''
  6 | 	# Get the raw codon table.
  7 | 	dna2protein = CodonTableDNA()
  8 | 
  9 | 	# Convert to dictionary.
 10 | 	dna_dict = {}
 11 | 	for translation in dna2protein:
 12 | 	    dna_dict[translation[0]] = translation[1]
 13 | 
 14 | 	return dna_dict
 15 | 
 16 | 
 17 | def ProteinDictRNA():
 18 | 	'''Returns a dictionary that translates RNA to Protein.'''
 19 | 	# Get the raw codon table.
 20 | 	rna2protein = CodonTableRNA()
 21 | 
 22 | 	# Convert to dictionary.
 23 | 	rna_dict = {}
 24 | 	for translation in rna2protein:
 25 | 	    rna_dict[translation[0]] = translation[1]
 26 | 
 27 | 	return rna_dict
 28 | 
 29 | 
 30 | def ProteinWeightDict():
 31 | 	'''Returns a dictionary that translates Protein to Monoisotopic Mass.'''
 32 | 	table ='''A   71.03711
 33 | 	C   103.00919
 34 | 	D   115.02694
 35 | 	E   129.04259
 36 | 	F   147.06841
 37 | 	G   57.02146
 38 | 	H   137.05891
 39 | 	I   113.08406
 40 | 	K   128.09496
 41 | 	L   113.08406
 42 | 	M   131.04049
 43 | 	N   114.04293
 44 | 	P   97.05276
 45 | 	Q   128.05858
 46 | 	R   156.10111
 47 | 	S   87.03203
 48 | 	T   101.04768
 49 | 	V   99.06841
 50 | 	W   186.07931
 51 | 	Y   163.06333''' 
 52 | 
 53 | 	protein_weight_dict = dict()
 54 | 
 55 | 	for protein in table.split('\n'):
 56 | 		protein_weight_dict[protein.strip('\t').split()[0]] = float(protein.strip('\t').split()[1])
 57 | 
 58 | 	return protein_weight_dict
 59 | 
 60 | 
 61 | def CodonTableDNA():
 62 | 	'''Returns a DNA Codon translation list.'''
 63 | 	table = '''TTT F
 64 | 	CTT L      
 65 | 	ATT I      
 66 | 	GTT V
 67 | 	TTC F      
 68 | 	CTC L      
 69 | 	ATC I      
 70 | 	GTC V
 71 | 	TTA L     
 72 | 	CTA L      
 73 | 	ATA I      
 74 | 	GTA V
 75 | 	TTG L      
 76 | 	CTG L      
 77 | 	ATG M      
 78 | 	GTG V
 79 | 	TCT S      
 80 | 	CCT P      
 81 | 	ACT T      
 82 | 	GCT A
 83 | 	TCC S      
 84 | 	CCC P      
 85 | 	ACC T      
 86 | 	GCC A
 87 | 	TCA S      
 88 | 	CCA P      
 89 | 	ACA T      
 90 | 	GCA A
 91 | 	TCG S      
 92 | 	CCG P      
 93 | 	ACG T      
 94 | 	GCG A
 95 | 	TAT Y      
 96 | 	CAT H      
 97 | 	AAT N      
 98 | 	GAT D
 99 | 	TAC Y      
100 | 	CAC H      
101 | 	AAC N      
102 | 	GAC D
103 | 	TAA Stop   
104 | 	CAA Q      
105 | 	AAA K      
106 | 	GAA E
107 | 	TAG Stop   
108 | 	CAG Q      
109 | 	AAG K      
110 | 	GAG E
111 | 	TGT C      
112 | 	CGT R      
113 | 	AGT S      
114 | 	GGT G
115 | 	TGC C      
116 | 	CGC R      
117 | 	AGC S      
118 | 	GGC G
119 | 	TGA Stop   
120 | 	CGA R      
121 | 	AGA R      
122 | 	GGA G
123 | 	TGG W      
124 | 	CGG R      
125 | 	AGG R      
126 | 	GGG G'''
127 | 
128 | 	table = table.split('\n')
129 | 	for index, item in enumerate(table):
130 | 		table[index] = item.strip().split()
131 | 
132 | 	return table
133 | 
134 | 
def CodonTableRNA():
    '''Builds the RNA codon table as a list of [codon, amino_acid] pairs.

    Stop codons carry the label 'Stop' in place of an amino acid letter.
    Rows are returned in the order they are written in the table below.
    '''
    raw_table = '''UUU F
	UUC F
	UUA L
	UUG L
	UCU S
	UCC S
	UCA S
	UCG S
	UAU Y
	UAC Y
	UAA Stop
	UAG Stop
	UGU C
	UGC C
	UGA Stop
	UGG W
	CUU L
	CUC L
	CUA L
	CUG L
	CCU P
	CCC P
	CCA P
	CCG P
	CAU H
	CAC H
	CAA Q
	CAG Q
	CGU R
	CGC R
	CGA R
	CGG R
	AUU I
	AUC I
	AUA I
	AUG M
	ACU T
	ACC T
	ACA T
	ACG T
	AAU N
	AAC N
	AAA K
	AAG K
	AGU S
	AGC S
	AGA R
	AGG R
	GUU V
	GUC V
	GUA V
	GUG V
	GCU A
	GCC A
	GCA A
	GCG A
	GAU D
	GAC D
	GAA E
	GAG E
	GGU G
	GGC G
	GGA G
	GGG G'''

    # Each row reads "<codon> <amino acid>" preceded by the literal's
    # indentation; strip the padding and split the two fields apart.
    return [row.strip().split() for row in raw_table.split('\n')]
207 | 


--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''
 3 | Scripts for functions common to multiple programming assignments for Bioinformatics Algorithms on Coursera.
 4 | '''
 5 | 
 6 | from DNA_RNA_Operations import DNA_to_RNA, RNA_to_DNA, ReverseComplementDNA, ReverseComplementRNA, HammingDistance
 7 | from generalized_suffix_tree import GeneralizedSuffixTree
 8 | from Protein_Dictionaries import ProteinDictDNA, ProteinDictRNA, ProteinWeightDict
 9 | from scoring_matrices import BLOSUM62, PAM250
10 | from trie import Trie
11 | 


--------------------------------------------------------------------------------
/scripts/data/BLOSUM62.txt:
--------------------------------------------------------------------------------
  1 | A A 4
  2 | A C 0
  3 | A D -2
  4 | A E -1
  5 | A F -2
  6 | A G 0
  7 | A H -2
  8 | A I -1
  9 | A K -1
 10 | A L -1
 11 | A M -1
 12 | A N -2
 13 | A P -1
 14 | A Q -1
 15 | A R -1
 16 | A S 1
 17 | A T 0
 18 | A V 0
 19 | A W -3
 20 | A Y -2
 21 | C A 0
 22 | C C 9
 23 | C D -3
 24 | C E -4
 25 | C F -2
 26 | C G -3
 27 | C H -3
 28 | C I -1
 29 | C K -3
 30 | C L -1
 31 | C M -1
 32 | C N -3
 33 | C P -3
 34 | C Q -3
 35 | C R -3
 36 | C S -1
 37 | C T -1
 38 | C V -1
 39 | C W -2
 40 | C Y -2
 41 | D A -2
 42 | D C -3
 43 | D D 6
 44 | D E 2
 45 | D F -3
 46 | D G -1
 47 | D H -1
 48 | D I -3
 49 | D K -1
 50 | D L -4
 51 | D M -3
 52 | D N 1
 53 | D P -1
 54 | D Q 0
 55 | D R -2
 56 | D S 0
 57 | D T -1
 58 | D V -3
 59 | D W -4
 60 | D Y -3
 61 | E A -1
 62 | E C -4
 63 | E D 2
 64 | E E 5
 65 | E F -3
 66 | E G -2
 67 | E H 0
 68 | E I -3
 69 | E K 1
 70 | E L -3
 71 | E M -2
 72 | E N 0
 73 | E P -1
 74 | E Q 2
 75 | E R 0
 76 | E S 0
 77 | E T -1
 78 | E V -2
 79 | E W -3
 80 | E Y -2
 81 | F A -2
 82 | F C -2
 83 | F D -3
 84 | F E -3
 85 | F F 6
 86 | F G -3
 87 | F H -1
 88 | F I 0
 89 | F K -3
 90 | F L 0
 91 | F M 0
 92 | F N -3
 93 | F P -4
 94 | F Q -3
 95 | F R -3
 96 | F S -2
 97 | F T -2
 98 | F V -1
 99 | F W 1
100 | F Y 3
101 | G A 0
102 | G C -3
103 | G D -1
104 | G E -2
105 | G F -3
106 | G G 6
107 | G H -2
108 | G I -4
109 | G K -2
110 | G L -4
111 | G M -3
112 | G N 0
113 | G P -2
114 | G Q -2
115 | G R -2
116 | G S 0
117 | G T -2
118 | G V -3
119 | G W -2
120 | G Y -3
121 | H A -2
122 | H C -3
123 | H D -1
124 | H E 0
125 | H F -1
126 | H G -2
127 | H H 8
128 | H I -3
129 | H K -1
130 | H L -3
131 | H M -2
132 | H N 1
133 | H P -2
134 | H Q 0
135 | H R 0
136 | H S -1
137 | H T -2
138 | H V -3
139 | H W -2
140 | H Y 2
141 | I A -1
142 | I C -1
143 | I D -3
144 | I E -3
145 | I F 0
146 | I G -4
147 | I H -3
148 | I I 4
149 | I K -3
150 | I L 2
151 | I M 1
152 | I N -3
153 | I P -3
154 | I Q -3
155 | I R -3
156 | I S -2
157 | I T -1
158 | I V 3
159 | I W -3
160 | I Y -1
161 | K A -1
162 | K C -3
163 | K D -1
164 | K E 1
165 | K F -3
166 | K G -2
167 | K H -1
168 | K I -3
169 | K K 5
170 | K L -2
171 | K M -1
172 | K N 0
173 | K P -1
174 | K Q 1
175 | K R 2
176 | K S 0
177 | K T -1
178 | K V -2
179 | K W -3
180 | K Y -2
181 | L A -1
182 | L C -1
183 | L D -4
184 | L E -3
185 | L F 0
186 | L G -4
187 | L H -3
188 | L I 2
189 | L K -2
190 | L L 4
191 | L M 2
192 | L N -3
193 | L P -3
194 | L Q -2
195 | L R -2
196 | L S -2
197 | L T -1
198 | L V 1
199 | L W -2
200 | L Y -1
201 | M A -1
202 | M C -1
203 | M D -3
204 | M E -2
205 | M F 0
206 | M G -3
207 | M H -2
208 | M I 1
209 | M K -1
210 | M L 2
211 | M M 5
212 | M N -2
213 | M P -2
214 | M Q 0
215 | M R -1
216 | M S -1
217 | M T -1
218 | M V 1
219 | M W -1
220 | M Y -1
221 | N A -2
222 | N C -3
223 | N D 1
224 | N E 0
225 | N F -3
226 | N G 0
227 | N H 1
228 | N I -3
229 | N K 0
230 | N L -3
231 | N M -2
232 | N N 6
233 | N P -2
234 | N Q 0
235 | N R 0
236 | N S 1
237 | N T 0
238 | N V -3
239 | N W -4
240 | N Y -2
241 | P A -1
242 | P C -3
243 | P D -1
244 | P E -1
245 | P F -4
246 | P G -2
247 | P H -2
248 | P I -3
249 | P K -1
250 | P L -3
251 | P M -2
252 | P N -2
253 | P P 7
254 | P Q -1
255 | P R -2
256 | P S -1
257 | P T -1
258 | P V -2
259 | P W -4
260 | P Y -3
261 | Q A -1
262 | Q C -3
263 | Q D 0
264 | Q E 2
265 | Q F -3
266 | Q G -2
267 | Q H 0
268 | Q I -3
269 | Q K 1
270 | Q L -2
271 | Q M 0
272 | Q N 0
273 | Q P -1
274 | Q Q 5
275 | Q R 1
276 | Q S 0
277 | Q T -1
278 | Q V -2
279 | Q W -2
280 | Q Y -1
281 | R A -1
282 | R C -3
283 | R D -2
284 | R E 0
285 | R F -3
286 | R G -2
287 | R H 0
288 | R I -3
289 | R K 2
290 | R L -2
291 | R M -1
292 | R N 0
293 | R P -2
294 | R Q 1
295 | R R 5
296 | R S -1
297 | R T -1
298 | R V -3
299 | R W -3
300 | R Y -2
301 | S A 1
302 | S C -1
303 | S D 0
304 | S E 0
305 | S F -2
306 | S G 0
307 | S H -1
308 | S I -2
309 | S K 0
310 | S L -2
311 | S M -1
312 | S N 1
313 | S P -1
314 | S Q 0
315 | S R -1
316 | S S 4
317 | S T 1
318 | S V -2
319 | S W -3
320 | S Y -2
321 | T A 0
322 | T C -1
323 | T D -1
324 | T E -1
325 | T F -2
326 | T G -2
327 | T H -2
328 | T I -1
329 | T K -1
330 | T L -1
331 | T M -1
332 | T N 0
333 | T P -1
334 | T Q -1
335 | T R -1
336 | T S 1
337 | T T 5
338 | T V 0
339 | T W -2
340 | T Y -2
341 | V A 0
342 | V C -1
343 | V D -3
344 | V E -2
345 | V F -1
346 | V G -3
347 | V H -3
348 | V I 3
349 | V K -2
350 | V L 1
351 | V M 1
352 | V N -3
353 | V P -2
354 | V Q -2
355 | V R -3
356 | V S -2
357 | V T 0
358 | V V 4
359 | V W -3
360 | V Y -1
361 | W A -3
362 | W C -2
363 | W D -4
364 | W E -3
365 | W F 1
366 | W G -2
367 | W H -2
368 | W I -3
369 | W K -3
370 | W L -2
371 | W M -1
372 | W N -4
373 | W P -4
374 | W Q -2
375 | W R -3
376 | W S -3
377 | W T -2
378 | W V -3
379 | W W 11
380 | W Y 2
381 | Y A -2
382 | Y C -2
383 | Y D -3
384 | Y E -2
385 | Y F 3
386 | Y G -3
387 | Y H 2
388 | Y I -1
389 | Y K -2
390 | Y L -1
391 | Y M -1
392 | Y N -2
393 | Y P -3
394 | Y Q -1
395 | Y R -2
396 | Y S -2
397 | Y T -2
398 | Y V -1
399 | Y W 2
400 | Y Y 7
401 | 


--------------------------------------------------------------------------------
/scripts/data/PAM250.txt:
--------------------------------------------------------------------------------
 1 |    A  C  D  E  F  G  H  I  K  L  M  N  P  Q  R  S  T  V  W  Y
 2 | A  2  -2  0  0  -3  1  -1  -1  -1  -2  -1  0  1  0  -2  1  1  0  -6  -3
 3 | C  -2  12  -5  -5  -4  -3  -3  -2  -5  -6  -5  -4  -3  -5  -4  0  -2  -2  -8  0
 4 | D  0  -5  4  3  -6  1  1  -2  0  -4  -3  2  -1  2  -1  0  0  -2  -7  -4
 5 | E  0  -5  3  4  -5  0  1  -2  0  -3  -2  1  -1  2  -1  0  0  -2  -7  -4
 6 | F  -3  -4  -6  -5  9  -5  -2  1  -5  2  0  -3  -5  -5  -4  -3  -3  -1  0  7
 7 | G  1  -3  1  0  -5  5  -2  -3  -2  -4  -3  0  0  -1  -3  1  0  -1  -7  -5
 8 | H  -1  -3  1  1  -2  -2  6  -2  0  -2  -2  2  0  3  2  -1  -1  -2  -3  0
 9 | I  -1  -2  -2  -2  1  -3  -2  5  -2  2  2  -2  -2  -2  -2  -1  0  4  -5  -1
10 | K  -1  -5  0  0  -5  -2  0  -2  5  -3  0  1  -1  1  3  0  0  -2  -3  -4
11 | L  -2  -6  -4  -3  2  -4  -2  2  -3  6  4  -3  -3  -2  -3  -3  -2  2  -2  -1
12 | M  -1  -5  -3  -2  0  -3  -2  2  0  4  6  -2  -2  -1  0  -2  -1  2  -4  -2
13 | N  0  -4  2  1  -3  0  2  -2  1  -3  -2  2  0  1  0  1  0  -2  -4  -2
14 | P  1  -3  -1  -1  -5  0  0  -2  -1  -3  -2  0  6  0  0  1  0  -1  -6  -5
15 | Q  0  -5  2  2  -5  -1  3  -2  1  -2  -1  1  0  4  1  -1  -1  -2  -5  -4
16 | R  -2  -4  -1  -1  -4  -3  2  -2  3  -3  0  0  0  1  6  0  -1  -2  2  -4
17 | S  1  0  0  0  -3  1  -1  -1  0  -3  -2  1  1  -1  0  2  1  -1  -2  -3
18 | T  1  -2  0  0  -3  0  -1  0  0  -2  -1  0  0  -1  -1  1  3  0  -5  -3
19 | V  0  -2  -2  -2  -1  -1  -2  4  -2  2  2  -2  -1  -2  -2  -1  0  4  -6  -2
20 | W  -6  -8  -7  -7  0  -7  -3  -5  -3  -2  -4  -4  -6  -5  2  -2  -5  -6  17  0
21 | Y  -3  0  -4  -4  7  -5  0  -1  -4  -1  -2  -2  -5  -4  -4  -3  -3  -2  0  10


--------------------------------------------------------------------------------
/scripts/scoring_matrices.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''A Bioinformatics Algorithms script containing scoring matrices.'''
 3 | 
 4 | 
 5 | class BLOSUM62(object):
 6 |     """The BLOSUM62 scoring matrix class."""
 7 | 
 8 |     def __init__(self):
 9 |         """Initialize the scoring matrix."""
10 |         import os
11 | 
12 |         with open(os.path.join(os.path.dirname(__file__), 'data/BLOSUM62.txt')) as input_data:
13 |             items = [line.strip().split() for line in input_data.readlines()]
14 |             self.scoring_matrix = {(item[0], item[1]):int(item[2]) for item in items}
15 | 
16 |     def __getitem__(self, pair):
17 |         """Returns the score of the given pair of protein."""
18 |         return self.scoring_matrix[pair[0], pair[1]]
19 | 
20 | 
class PAM250(object):
    """The PAM250 scoring matrix class.

    The matrix is loaded with pandas and kept in ``self.scoring_matrix``.
    """

    def __init__(self):
        """Load data/PAM250.txt, located relative to this module, into a table."""
        import os
        import pandas as pd

        matrix_path = os.path.join(os.path.dirname(__file__), 'data/PAM250.txt')
        # The text file separates its fields with two spaces.
        self.scoring_matrix = pd.read_table(matrix_path, sep='  ')

    def __getitem__(self, pair):
        """Look up the score for the two symbols held in pair[0] and pair[1]."""
        first, second = pair[0], pair[1]
        # The first symbol selects an entry of the table and the second
        # symbol selects the score inside that entry.
        selected = self.scoring_matrix[first]
        return selected[second]
34 | 


--------------------------------------------------------------------------------
/scripts/trie.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | '''A Bioinformatics Algorithms script containing a trie data structure.'''
 3 | 
 4 | 
 5 | class Trie(object):
 6 |     """Constucts a trie for the given words."""
 7 |     def __init__(self, words):
 8 |         """Initialize the nodes and edges and add the given words."""
 9 | 
10 |         # A lambda function to create nodes.
11 |         # 'parent' = parent node number
12 |         # 'children' = list of children node numbers.
13 |         # 'depth' = length of substring up to the node.
14 |         # 'end' = boolean to determine if the node corresponds to the last character of an inserted word.
15 |         self.create_node = lambda p, d: {'parent':p, 'children':[], 'depth':d, 'end':False}
16 | 
17 |         # Initialize nodes and edges.
18 |         self.nodes = {1:self.create_node(0,0)}
19 |         self.edges = {}
20 | 
21 |         # Construct the trie by adding the words.
22 |         if type(words) is str:
23 |             self._add_word(words)
24 |         else:
25 |             for word in words:
26 |                 self._add_word(word)
27 | 
28 |     def _add_word(self, current_word):
29 |         """Adds a word to the trie."""
30 | 
31 |         # Get the insertion node and portion of the word to insert.
32 |         insertion_node, insertion_substring = self._insert_location(current_word)
33 | 
34 |         # Begin inserting at the insertion node.
35 |         for i in xrange(len(insertion_substring)):
36 | 
37 |             # Get the new node number.
38 |             new_node = len(self.nodes) + 1
39 | 
40 |             # Add the new node to the trie, and add parent/depth/child information.
41 |             self.nodes[new_node] = self.create_node(insertion_node, self.nodes[insertion_node]['depth']+1)
42 |             self.nodes[insertion_node]['children'].append(new_node)
43 | 
44 |             # Add the new edge to the trie.
45 |             self.edges[insertion_node, new_node] = insertion_substring[i]
46 | 
47 |             # Move to the new node and continue insertion.
48 |             insertion_node = new_node
49 | 
50 |         # Mark the last node as an end node, as it is the end of the word added.
51 |         self.nodes[insertion_node]['end'] = True
52 | 
53 |     def _insert_location(self, word_to_add, current_node=1):
54 |         """Traverses the trie to determine the insertion point of the given word."""
55 | 
56 |         # This happends if the word we're trying to add is already a substring of an added word.
57 |         if word_to_add == '':
58 |             return current_node, word_to_add
59 | 
60 |         # Search all child nodes for a match.
61 |         for child_node in self.nodes[current_node]['children']:
62 |             if self.edges[current_node, child_node] == word_to_add[0]:
63 |                 # Move to the child node if we have a match.
64 |                 return self._insert_location(word_to_add[1:], child_node)
65 | 
66 |         # If we reach this point, there is no character match.
67 |         return current_node, word_to_add
68 | 
69 |     def word_up_to_node(self, node_num):
70 |         """Returns the word associated with a traversal up to the given node."""
71 | 
72 |         node_word = ''
73 |         while self.nodes[node_num]['parent'] != 0:
74 |             node_word += self.edges[self.nodes[node_num]['parent'], node_num]
75 |             node_num = self.nodes[node_num]['parent']
76 | 
77 |         # We travelled backwards, so reverse the word.
78 |         return node_word[::-1]
79 | 
80 |     def prefix_in_trie(self, word_to_check, current_node=1):
81 |         """Traverses the trie to determine if a prefix of the given word matches a pattern in the trie."""
82 | 
83 |         if self.nodes[current_node]['end'] is True:
84 |             # If we hit an end node then we've found a matching pattern as a prefix.
85 |             return True
86 |         elif word_to_check == '':
87 |             # If we've exhausted the word_to_check then no prefix of it matches an entire pattern in the trie.
88 |             return False
89 | 
90 |         # Search all child nodes for a match.
91 |         for child_node in self.nodes[current_node]['children']:
92 |             if self.edges[current_node, child_node] == word_to_check[0]:
93 |                 # Move to the child node if we have a match.
94 |                 return self.prefix_in_trie(word_to_check[1:], child_node)
95 | 
96 |         # If we reach this point, there is no character match, and hence no prefix matching a pattern in the trie.
97 |         return False
98 | 


--------------------------------------------------------------------------------