├── README.md
├── Wolff_Presentations
├── Feng-Doolittle.pdf
├── Gotoh.pdf
├── Needleman-Wunsch n=3.pdf
├── Needleman-Wunsch.pdf
├── Nussinov.pdf
├── SumOfPairs.pdf
└── UPGMA_WPGM.pdf
├── bin
├── algorithmsInBioinformatics.py
├── blosum62.txt
└── fengDoolittle.fas
├── report.pdf
└── source
├── .idea
├── .name
├── encodings.xml
├── misc.xml
├── modules.xml
├── scopes
│ └── scope_settings.xml
├── source.iml
├── vcs.xml
└── workspace.xml
├── lib
├── __init__.py
├── helper
│ ├── IOHelper.py
│ ├── __init__.py
│ ├── mathHelper.py
│ ├── multipleAlignmentHelper.py
│ ├── pairwiseAlignmentHelper.py
│ └── test
│ │ └── IOHelperTest.py
├── multiple
│ ├── __init__.py
│ ├── fengDoolittle.py
│ ├── needlemanWunschN3.py
│ ├── sumOfPairs.py
│ ├── test
│ │ ├── fengDoolittleTest.py
│ │ ├── needlemanWunschN3Test.py
│ │ ├── sumOfPairsTest.py
│ │ └── upgmaWpgmaTest.py
│ └── upgmaWpgma.py
├── pairwise
│ ├── __init__.py
│ ├── gotoh.py
│ ├── needlemanWunsch.py
│ └── test
│ │ ├── __init__.py
│ │ ├── gotohTest.py
│ │ └── needlemanWunschTest.py
└── structurePrediction
│ ├── __init__.py
│ ├── nussinov.py
│ └── test
│ └── nussinovTest.py
└── sequences
/README.md:
--------------------------------------------------------------------------------
1 | # Algorithms In Bioinformatics
2 | To run the algorithms execute the file "algorithmsInBioinformatics.py" in the folder source/bin.
3 |
4 | ## Parameters
5 |
6 | #### Help
7 | -h, --help
8 |
9 | Show this help message and exit
10 |
11 | #### Algorithms
12 |
13 | -a {nw,gotoh,nw3,fengDoolittle,sumOfPairs,upgma,wpgma,nussinov},
14 |
15 | --algorithm {nw,gotoh,nw3,fengDoolittle,sumOfPairs,upgma,wpgma,nussinov}
16 |
17 | Define which algorithm should be executed. Options are:
18 |
19 | * 'nw' for the algorithm of Needleman and Wunsch.
20 | * 'gotoh' for the algorithm of Osamu Gotoh.
21 | * 'nw3' for the Needleman-Wunsch algorithm with three sequences.
22 | * 'fengDoolittle' for the heuristic multiple sequence alignment algorithm by Da-Fei Feng and Russell F. Doolittle.
23 | * 'sumOfPairs' for the scoring of a multiple sequence alignment by Humberto Carrillo and David Lipman.
24 | * 'upgma' or 'wpgma' for the clustering methods that generate phylogenetic trees.
25 | * 'nussinov' for the RNA secondary structure prediction algorithm by Ruth
26 | Nussinov.
27 |
28 | #### Input file
29 |
30 | -f INPUTFILE, --inputFile INPUTFILE
31 |
32 | Define the file in which the input sequences are defined. It has to be in FASTA format.
33 |
34 | #### Output file
35 |
36 | -o OUTPUTFILE, --outputFile OUTPUTFILE
37 |
38 | Define in which file the output should be written. If
39 | not defined, it is written to "outputFile.fas" in the
40 | local directory.
41 |
42 | #### Weight function
43 |
44 | -w WEIGHTFUNCTION, --weightFunction WEIGHTFUNCTION
45 |
46 | Name of a weight function defined in class
47 | PairwiseAlignmentHelper.
48 |
49 | #### Gap costs
50 |
51 | -gc GAPCOST, --gapCost GAPCOST
52 |
53 | Name of a gap function defined in class PairwiseAlignmentHelper.
54 |
55 | #### Number of solutions
56 |
57 | --numberOfSolutions NUMBEROFSOLUTIONS
58 |
59 | Define the number of optimal solutions the Needleman-Wunsch algorithm should compute.
60 |
61 | #### Output format
62 |
63 | --outputFormat {graphML,newickTree}
64 |
65 | Define the output format of the output file. This function is only parsed if you choose 'upgma' or 'wpgma' as an algorithm. Default is Newick tree.
66 |
67 | #### similarity score
68 |
69 | --scoring SIMILARITYSCORE
70 |
71 | Name of a similarity score defined in class PairwiseAlignmentHelper.
72 |
73 | ## Support
74 |
75 | If you are having issues, please let me know. Mail address: wolffj[at]informatik[dot]uni-freiburg[dot]de
--------------------------------------------------------------------------------
/Wolff_Presentations/Feng-Doolittle.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/Wolff_Presentations/Feng-Doolittle.pdf
--------------------------------------------------------------------------------
/Wolff_Presentations/Gotoh.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/Wolff_Presentations/Gotoh.pdf
--------------------------------------------------------------------------------
/Wolff_Presentations/Needleman-Wunsch n=3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/Wolff_Presentations/Needleman-Wunsch n=3.pdf
--------------------------------------------------------------------------------
/Wolff_Presentations/Needleman-Wunsch.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/Wolff_Presentations/Needleman-Wunsch.pdf
--------------------------------------------------------------------------------
/Wolff_Presentations/Nussinov.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/Wolff_Presentations/Nussinov.pdf
--------------------------------------------------------------------------------
/Wolff_Presentations/SumOfPairs.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/Wolff_Presentations/SumOfPairs.pdf
--------------------------------------------------------------------------------
/Wolff_Presentations/UPGMA_WPGM.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/Wolff_Presentations/UPGMA_WPGM.pdf
--------------------------------------------------------------------------------
/bin/algorithmsInBioinformatics.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2014 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 | #
12 | # main class
13 |
14 | import argparse
15 | import os, sys
16 | if os.name == "posix":
17 | lib_path = os.path.abspath('../lib')
18 | elif os.name == "nt":
19 | lib_path = os.path.abspath('..\lib')
20 | sys.path.append(lib_path)
21 |
22 | from helper import IOHelper as io
23 | from helper import MultipleAlignmentHelper as mah
24 | from pairwise import NeedlemanWunsch as nw
25 | from pairwise import Gotoh
26 | from multiple import NeedlemanWunschN3 as NW3
27 | from multiple import UpgmaWpgma
28 | from multiple import FengDoolittle
29 | from multiple import SumOfPairs
30 | from structurePrediction import Nussinov
31 | def main():
32 | """Method to parse the arguments and start the defined algorithms."""
33 | parser = argparse.ArgumentParser()
34 | parser.add_argument("-a", "--algorithm",
35 | choices=["nw", "gotoh", "nw3", "fengDoolittle", "sumOfPairs","upgma", "wpgma", "nussinov"],
36 | required=True,
37 | help="Define which algorithm should be executed. "
38 | "\nOptions are: 'nw' for the algorithm of Needleman and Wunsch,\n"
39 | "'gotoh' for the algorithm of Osamu Gotoh, \n"
40 | "'nw3' for the Needleman-Wunsch algorithm with three sequences, \n"
41 | "'fengDoolittle' for the heuristic multiple sequence alignment algorithm by Da-Fei Feng and Russell F. Doolittle,"
42 | "'sumOfPairs' for the scoring of a multiple sequence alignment by Humberto Carrillo and David Lipman."
43 | "'upgma' or 'wpgma' is a clustering method to generate phylogenetic trees, \n"
44 | "'nussinov' for the RNA secondary structure prediction algorithm by Ruth Nussinov.")
45 | parser.add_argument("-f", "--inputFile", dest="inputFile",
46 | help="Define the file in which the input sequences are defined. It have to be in fasta-format.")
47 | parser.add_argument("-o", "--outputFile", help="Define in which file the output should be written. "
48 | "If not defined, it is written to \"outputFile.fas\" in the local directory.")
49 | parser.add_argument("-gc", "--gapCost", dest="gapCost",
50 | help="Name of a gap function definde in class PairwiseAligmentHelper.")
51 | parser.add_argument("--numberOfSolutions", dest="numberOfSolutions",
52 | help="Define the number of optimal solutions the Needleman-Wunsch algorithm should compute.")
53 | parser.add_argument("--outputFormat", dest="outputFormat", choices=["graphML", "newickTree"],
54 | help="Define the output format of the output file. "
55 | "This function is only parsed if you choose 'upgma' or 'wpgma' as an algorithm. Default is"
56 | " Newick tree")
57 | parser.add_argument("--scoring", dest="similarityScore",
58 | help="Name of a similarity score defined in class PairwiseAligmentHelper. Per default "
59 | "\"pam\" and \"blosum\" (pam250 and blosum62) are implemented. Feel free to extend, you can find the "
60 | "file \"PairwiseAligmentHelper.py\" in lib/helper. If this option is not defined, the pam250 matrix is choosen.")
61 | parser.add_argument("--gapPenalty", dest="gapPenalty", help="Define a gap penalty. Default for pam is 8 and blosum 6.")
62 | args = parser.parse_args()
63 |
64 | outputFile = ""
65 | weightFunction = ""
66 | if args.outputFile:
67 | outputFile = args.outputFile
68 | if args.similarityScore:
69 | weightFunction = args.similarityScore
70 |
71 | sequences = getSequencesFromFile(args.inputFile)
72 | if len(sequences) > 1:
73 |
74 | # pairwise alignment
75 | if args.algorithm == "nw":
76 | if outputFile == "":
77 | outputFile = "needlemanWunsch.fas"
78 | if weightFunction == "":
79 | weightFunction = "pam"
80 | numberOfSolutions = -1
81 | if args.numberOfSolutions:
82 | numberOfSolutions = args.numberOfSolutions
83 | needlemanWunsch(sequences[0:2], scoreFunction = weightFunction, outputFile = outputFile, numberOfSolutions=numberOfSolutions)
84 |
85 | elif args.algorithm == "gotoh":
86 | if outputFile == "":
87 | outputFile = "gotoh.fas"
88 | if weightFunction == "":
89 | weightFunction = "pam"
90 | gapCost = "gapCost"
91 | if args.gapCost:
92 | gapCost = args.gapCost
93 | gotoh(sequences[0:2], scoreFunction = weightFunction, costFunction = gapCost, outputFile = outputFile)
94 |
95 | # multiple alignment
96 |
97 | elif args.algorithm == "upgma" or args.algorithm == "wpgma":
98 | newickTree = True
99 | if args.outputFormat == "graphML":
100 | newickTree = False
101 | if outputFile == "":
102 | if args.algorithm == "upgma":
103 | outputFile = "upgma"
104 | else:
105 | outputFile = "wpgma"
106 | upgmaWpgma(args.algorithm == "upgma", sequences, outputFile, newickTree)
107 |
108 | elif args.algorithm == "fengDoolittle":
109 | if outputFile == "":
110 | outputFile = "fengDoolittle.fas"
111 | if weightFunction == "":
112 | weightFunction = "pam"
113 | similarityScore = "pam"
114 | if args.similarityScore:
115 | similarityScore = args.similarityScore
116 | fengDoolittle(sequences, weightFunction, similarityScore, outputFile)
117 | elif args.algorithm == "sumOfPairs":
118 | similarityScore = "pam"
119 | if args.similarityScore:
120 | similarityScore = args.similarityScore
121 | if args.gapPenalty:
122 | sumOfPairs(sequences, similarityScore, args.gapPenalty)
123 | else:
124 | sumOfPairs(sequences, similarityScore)
125 |
126 | elif args.algorithm == "nw3":
127 | if not (len(sequences) == 3):
128 | print "Wrong number of input sequences. Needleman-Wunsch n=3 needs exactly three sequences; ", \
129 | len(sequences) , " sequences are given."
130 | sys.exit()
131 | if weightFunction == "":
132 | weightFunction = "pam"
133 | if outputFile == "":
134 | outputFile = "nw3.fas"
135 | needlemanWunschN3(sequences[0:3], weightFunction = weightFunction, outputFile = outputFile)
136 |
137 | # multiple alignment
138 |
139 | elif len(sequences) == 1:
140 | # structure prediction
141 | if args.algorithm == "nussinov":
142 | if outputFile == "":
143 | outputFile = "nussinov.dotBracket"
144 | nussinov(sequences[0:1], outputFile)
145 | else:
146 | print "You have defined only one input sequence, but your defined algorithm \'",\
147 | args.algorithm, "\' needs at least two sequences."
148 | else:
149 | print "No sequences in input file defined."
150 | sys.exit(0)
def getSequencesFromFile(inputFile):
    """Read the sequences of the given input file and return them as a list.

    inputFile: Path to a fasta formatted file with the input sequences.
    """
    reader = io()
    return reader.readFastaFile(inputFile)
156 | def needlemanWunsch(sequences, scoreFunction, outputFile, numberOfSolutions):
157 | """Executes the Needleman-Wunsch algorithm with a default score function defined as: a == b -> 0 and a !=b --> 1.\n
158 | Stores the alignments per default in file needlemanWunsch.fas.
159 | To change the score function define a function in class PairwiseAligmentHelper and define the name as an input paramter.
160 | scoreFunction: The name of the weigh function which is defined in class PairwiseAligmentHelper.
161 | outputFile: The path to the output file.
162 | numberOfSolutions: Maximal number of optimal solutions which should be computed."""
163 | print "\nThe following sequences are given:"
164 | for i in sequences:
165 | print i
166 | print "\nComputing solution...\n\n"
167 | result = nw().compute(sequences, scoreFunction, int(numberOfSolutions), scoringValue=True)
168 | print "\nScore: ", result[1]
169 | print "Number of optimal solutions: ", len(result[0])
170 | print "\nOne solution is:\n", result[0][0][0], "\n", result[0][0][1]
171 | print "\nFor more solutions look in the file \"needlemanWunsch.fas\" in the bin directory.\n"
172 | io().writeFastaFile(result[0], outputFile)
173 | def gotoh(sequences, scoreFunction="weightFunctionDifference", costFunction="gapCost", outputFile="gotoh.fas"):
174 | """Executes the Gotoh algorithm with a default score function defined as: a == b -> 0 and a !=b --> 1 and a cost function defined as: g(x) = 2 + k.\n
175 | Stores the alignments per default in file gotoh.fas.
176 | To change the score or cost function define a function in class PairwiseAligmentHelper and define the name as an input paramter.
177 | scoreFunction: The name of the weigh function which is defined in class PairwiseAligmentHelper.
178 | costFunction: The name of the gap cost function which is defined in class PairwiseAligmentHelper.
179 | outputFile: The path to the output file.
180 | """
181 | print "The following sequences are given:"
182 | for i in sequences:
183 | print i
184 | print "Computing solution..."
185 | gotoh = Gotoh(sequences[0], sequences[1], scoreFunction, costFunction)
186 | result = gotoh.compute()
187 | io().writeFastaFile(result, outputFile)
188 | print "Number of solutions: ", len(result)
189 | print "Score:", max(gotoh.computationMatrix[0][-1][-1], max(gotoh.computationMatrix[1][-1][-1], gotoh.computationMatrix[2][-1][-1]))
190 | print "One solution is:\n", result[0][0], "\n", result[0][1]
191 | print "For more solutions look in the file \"gotoh.fas\" in the bin directory."
192 |
193 | def needlemanWunschN3(sequences, weightFunction="weightFunctionDifference", outputFile="nw3.fas"):
194 | """Executes the Needleman-Wunsch algorithm with three sequences"""
195 | print "\nThe following sequences are given:"
196 | for i in sequences:
197 | print i
198 | print "\nComputing solution...\n\n"
199 | nw3 = NW3(sequences[0], sequences[1], sequences[2], weightFunction)
200 | result = nw3.execute()
201 |
202 | io().writeFastaFile(result, outputFile)
203 | print "\nScore: ", nw3.computation_matrix[-1][-1][-1]
204 | print "Number of optimal solutions: ", len(result)
205 | print "\nOne solution is:\n", result[0][0], "\n", result[0][1], "\n", result[0][2]
206 | print "\nFor more solutions look in the file \"nw3.fas\" in the bin directory.\n"
207 |
208 | def upgmaWpgma(upgmaWpgma, sequences, outputFile, fileFormat):
209 | """Executes the a phylogenetic clustering with a upgm or wpgm weighting.
210 | sequences: All defined input sequences as a list.
211 | outputFile: The name of the output file
212 | fileFormat: The file format of the output file"""
213 | #create
214 | print "The following sequences are given:"
215 | for i in sequences:
216 | print i
217 | print "Computing clustering..."
218 | data = mah().createDataForUpgmaWpgma(sequences)
219 | if upgmaWpgma:
220 | upgma = UpgmaWpgma(data[0], len(data[1]))
221 | upgma.compute_clustering()
222 | if not fileFormat:
223 | outputFile += ".graphML"
224 | io().writeGraphMLFile(upgma.mapping, outputFile)
225 | print "Clustering written as graphML file: ", os.path.abspath(outputFile)
226 | else:
227 | outputFile += ".newickTree"
228 | cluster = upgma.get_newick_tree(with_edge_weights=True)
229 | io().writeNewickTree(cluster, outputFile)
230 | print "Computed upgma cluster: ", cluster
231 | print "The clustering was also written to: ", os.path.abspath(outputFile)
232 | else:
233 | wpgma = UpgmaWpgma(data[0], len(data[1]), False, data[2])
234 | wpgma.compute_clustering()
235 | if not fileFormat:
236 | outputFile += ".graphML"
237 | io().writeGraphMLFile(wpgma.mapping, outputFile)
238 | print "Clustering written as graphML file: ", os.path.abspath(outputFile)
239 | else:
240 | outputFile += ".newickTree"
241 | cluster = wpgma.get_newick_tree(with_edge_weights=True)
242 | io().writeNewickTree(cluster, outputFile)
243 | print "Computed wpgma cluster: ", cluster
244 | print "The clustering was also written to: ", os.path.abspath(outputFile)
245 |
246 |
247 | def nussinov(sequence, outputFile):
248 | """Executes the RNA-folding algorithm from Nussinov.
249 | sequence: The RNA-sequnce as a list.
250 | outputFile: The name of the output file."""
251 | print "\nThe following sequence is given:"
252 | print sequence[0]
253 | print "\n"
254 | nussinov = Nussinov(sequence[0])
255 | nussinov.execute()
256 | print "\nDot-bracket: "
257 | io().writeRnaDotBracketNotation(sequence[0], nussinov.pairedBases, outputFile)
258 | print "The result was also written to: ", os.path.abspath(outputFile)
259 |
260 | def sumOfPairs(sequences, scoringFunction, gapPenalty=-1):
261 | """This method scores a multiple sequence alignment with the sum of pairs algorithm.
262 | sequences: The multiple sequence alignment.
263 | scoringFunction: Name of a similarity score defined in class PairwiseAligmentHelper."""
264 | print "The following sequences are given:"
265 | for i in sequences:
266 | print i
267 | if gapPenalty == -1:
268 | sof = SumOfPairs(sequences, scoringFunction)
269 | else:
270 | sof = SumOfPairs(sequences, scoringFunction, gapPenalty)
271 | print "Sum-of-pairs scoring: ", sof.execute()
272 | def fengDoolittle(sequences, weightFunction, similarityScore, outputFile):
273 | """Executes the heuristic multiple sequence alignment by Feng and Doolittle.
274 | sequences: All input sequnces to align.
275 | weightFunction: The weight function defined in class PairwiseAlignmentHelper for the Needleman-Wunsch algorithm to compute the optimal local alignment.
276 | similarityScore: Name of a similarity score defined in class PairwiseAligmentHelper.
277 | outputFile: The output file name."""
278 | fd = FengDoolittle(sequences, weightFunction, similarityScore)
279 | alignmentDict = fd.computeMultipleAlignment()
280 | alignment = [[]]
281 | for i in alignmentDict:
282 | alignment[0].append(alignmentDict[i])
283 | io().writeFastaFile(alignment, outputFile)
284 | print "Input sequences:\n"
285 | for i in sequences:
286 | print i
287 | print "\nAlignment:"
288 | for i in alignmentDict:
289 | print alignmentDict[i]
290 | print sumOfPairs(alignment[0], weightFunction)
291 |
292 |
if __name__ == "__main__":
    # main() runs without a surrounding try/except (an earlier catch-all
    # handler is disabled), so any exception reaches the user with its
    # traceback.
    main()
--------------------------------------------------------------------------------
/bin/blosum62.txt:
--------------------------------------------------------------------------------
1 | # Matrix made by matblas from blosum62.iij
2 | # * column uses minimum score
3 | # BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
4 | # Blocks Database = /data/blocks_5.0/blocks.dat
5 | # Cluster Percentage: >= 62
6 | # Entropy = 0.6979, Expected = -0.5209
7 |
8 | A R N D C Q E G H I L K M F P S T W Y V *
9 | A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -4
10 | R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -4
11 | N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 -4
12 | D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 -4
13 | C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -4
14 | Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 -4
15 | E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 -4
16 | G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -4
17 | H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 -4
18 | I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -4
19 | L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4
20 | K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 -4
21 | M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -4
22 | F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -4
23 | P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -4
24 | S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 -4
25 | T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -4
26 | W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4
27 | Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -4
28 | V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -4
29 | * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
--------------------------------------------------------------------------------
/bin/fengDoolittle.fas:
--------------------------------------------------------------------------------
1 | >Alignment 0 sequence 0
2 | ILDXXXMDVVEGSAARFDCKVEGXXXYPDPEVMWFKDDNPXXXXXXVKXXXXXESRHXXFQIDYXXDEXXEGXXXN
3 | >Alignment 0 sequence 1
4 | RRLXXIPAARGGEISILCQPRAXXAPKATILWSKGTEIXXXXLGXXXXXXNSTXXXXRVTVXXXXTXXXXSXXXXD
5 | >Alignment 0 sequence 2
6 | XXRDPXVKTHEGWGVMLPCNPPAHXYPGLSYRWLLNEFPXXNFIPXXXTDGXXRHFXXVSXXXXQXXTXXXXTXXX
7 | >Alignment 0 sequence 3
8 | ISDXXTEADIGSNLRWGCAAAGXXKPRPMVRWLRNGEPXXXXLAXXXXXXSQNXXXXRVEVXXXXLXXXXAXXXXXX
9 |
--------------------------------------------------------------------------------
/report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/report.pdf
--------------------------------------------------------------------------------
/source/.idea/.name:
--------------------------------------------------------------------------------
1 | source
--------------------------------------------------------------------------------
/source/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/source/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/source/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/source/.idea/scopes/scope_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/source/.idea/source.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/source/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/source/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 | 1422826067293
199 |
200 | 1422826067293
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
--------------------------------------------------------------------------------
/source/lib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/source/lib/__init__.py
--------------------------------------------------------------------------------
/source/lib/helper/IOHelper.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2015 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 | import os
12 | class IOHelper():
13 | """Helper class for reading an writing files in different formats."""
def readFastaFile(self, inputFileName):
    """Read the sequences of a fasta file and return them as a list.

    Header lines (starting with ">") and blank lines are skipped. Every
    other line is returned as one sequence with its line ending removed, so
    a sequence wrapped over several lines yields several entries. If the
    file does not exist, an empty list is returned.

    inputFileName: The path (relative or absolute) to the input fasta file.
    """
    if not os.path.exists(inputFileName):
        return []

    sequences = []
    with open(inputFileName, "r") as fileToRead:
        for line in fileToRead:
            # Remove Unix as well as Windows line endings.
            line = line.rstrip("\r\n")
            # A blank line must not count as an (empty) sequence, because
            # callers decide what to run based on the number of sequences.
            if not line or line.startswith(">"):
                continue
            sequences.append(line)
    return sequences
31 |
def writeFastaFile(self, sequences, outputFileName):
    """Write the given alignments to a file in the fasta format.

    Each sequence gets a header of the form
    ">Alignment <alignment index> sequence <position in the alignment>".

    sequences: All computed alignments as a list of lists of strings:
               [[seq, seq, ...], ..., [seq, seq, ...]].
    outputFileName: The path (relative or absolute) and the output file name,
                    e.g.: "/path/to/file" or "file" to write it in the local
                    directory. ".fas" is appended if the name does not end with it.
    """
    if not outputFileName.endswith(".fas"):
        outputFileName += ".fas"
    with open(outputFileName, "w") as fileToWrite:
        for alignmentIndex, alignment in enumerate(sequences):
            # enumerate gives the real position; looking the sequence up
            # with list.index numbered identical sequences identically.
            for sequenceIndex, sequence in enumerate(alignment):
                fileToWrite.write('>Alignment ' + str(alignmentIndex) + ' sequence ' + str(sequenceIndex) + '\n')
                fileToWrite.write(sequence + '\n')
48 |
49 | def writeGraphMLFile(self, clusteredNodesDictionary, outputFileName):
50 | """Writes a tree computed by the UpgmaWpgma class in graphML-format to specified outputFileName."""
51 | if not outputFileName.endswith(".graphml"):
52 | outputFileName += str(".graphml")
53 | fileToWrite = open(outputFileName, "w")
54 | fileToWrite.write(""
55 | +"\n"
59 | +"\n\t\n")
60 | for i in clusteredNodesDictionary:
61 | nodes = i.split(" ")
62 | fileToWrite.write("\t\t\n")
63 | fileToWrite.write("\t\t\n")
64 | fileToWrite.write("\t\t\n")
65 | j = 0
66 | for i in clusteredNodesDictionary:
67 | nodes = i.split(" ")
68 | fileToWrite.write("\t\t\n")
69 | j += 1
70 | fileToWrite.write("\t\t\n")
71 | j += 1
72 |
73 | fileToWrite.write("\t\n")
74 | fileToWrite.close()
def writeRnaDotBracketNotation(self, sequence, pairedBases, outputFileName):
    """Write an RNA sequence and its base pairs in dot-bracket notation to a file.

    The file contains the sequence on the first line and the dot-bracket
    string (without a trailing newline) on the second line.

    sequence: The RNA sequence as a string.
    pairedBases: A mapping from the position of a base to the position of
                 its partner; the key gets "(" and the value gets ")".
    outputFileName: The path of the file to write.
    """
    symbols = {}
    for position in range(len(sequence)):
        if position in pairedBases:
            symbols[position] = "("
            symbols[pairedBases[position]] = ")"
        elif position not in symbols:
            # Neither an opening nor an already marked closing position.
            symbols[position] = "."
    # The with-block closes the file; it used to be left open.
    with open(outputFileName, "w") as fileToWrite:
        fileToWrite.write(sequence + "\n")
        fileToWrite.write("".join(symbols[position] for position in sorted(symbols)))
def writeNewickTree(self, newickTree, outputFileName):
    """Write a tree given as a Newick formatted string to a file.

    newickTree: The tree as a string.
    outputFileName: The path of the file to write.
    """
    # The with-block closes the file; it used to be left open.
    with open(outputFileName, "w") as fileToWrite:
        fileToWrite.write(newickTree)
92 |
--------------------------------------------------------------------------------
/source/lib/helper/__init__.py:
--------------------------------------------------------------------------------
1 | from IOHelper import IOHelper
2 | from mathHelper import MathHelper
3 | from pairwiseAlignmentHelper import PairwiseAlignmentHelper
4 | from multipleAlignmentHelper import MultipleAlignmentHelper
--------------------------------------------------------------------------------
/source/lib/helper/mathHelper.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2014 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 | #
12 | # A math helper class. Some constants are defined here.
class MathHelper():
    """MathHelper class. Some constants are defined."""
    # Positive infinity, spelled out explicitly instead of relying on the
    # overflow of an out-of-range float literal.
    Inf = float("inf")
    # Not a number.
    NaN = float("nan")
--------------------------------------------------------------------------------
/source/lib/helper/multipleAlignmentHelper.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2014 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 | #
12 | from helper import PairwiseAlignmentHelper as pah
class MultipleAlignmentHelper():
    """Constants and helper methods for the multiple alignment algorithms."""

    # Symbols naming which of the three sequences A, B and C receive a gap
    # in one alignment column.
    noGap, gapA, gapB, gapC, gapAB, gapBC, gapAC = range(7)

    def weightFunctionDifference(self, a, b, c):
        """Return 0 if a, b and c are all equal, 1 if exactly one pair of them is equal and 2 otherwise."""
        if a == b and b == c:
            return 0
        if a == b or b == c or a == c:
            return 1
        return 2

    def createDataForUpgmaWpgma(self, sequences):
        """Prepare the input of the UPGMA/WPGMA clustering.

        sequences: A list of sequences (strings).
        Returns a list with three dictionaries:
        - the difference score of every sequence pair, stored under the key
          "i j" with i < j. Sequences are compared column by column and the
          shorter one is padded with "-".
        - the mapping from a sequence to its position in the list.
        - the mapping from a position in the list to the sequence length.
        """
        sequenceToIdMapping = {}
        sequenceToLengthMapping = {}
        for identifier, sequence in enumerate(sequences):
            sequenceToIdMapping[sequence] = identifier
            sequenceToLengthMapping[identifier] = len(sequence)

        differenceDictionary = {}
        numberOfSequences = len(sequences)
        for first in range(numberOfSequences):
            for second in range(first + 1, numberOfSequences):
                sequenceA = sequences[first]
                sequenceB = sequences[second]
                pairScore = 0
                for column in range(max(len(sequenceA), len(sequenceB))):
                    # Past the end of a sequence a gap symbol is compared instead.
                    symbolA = sequenceA[column] if column < len(sequenceA) else "-"
                    symbolB = sequenceB[column] if column < len(sequenceB) else "-"
                    pairScore += pah().weightFunctionDifference(symbolA, symbolB)
                differenceDictionary[str(first) + " " + str(second)] = pairScore
        return [differenceDictionary, sequenceToIdMapping, sequenceToLengthMapping]
60 |
61 |
62 |
63 |
64 |
65 |
--------------------------------------------------------------------------------
/source/lib/helper/pairwiseAlignmentHelper.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2014 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 |
class PairwiseAlignmentHelper():
    """Constants, weight functions and scores shared by the pairwise alignment algorithms Needleman-Wunsch and Gotoh."""
    # Traceback symbols.
    diagonalD = 0
    dotQ = 1
    dotP = 2
    upD = 3
    upP = 4
    leftD = 5
    leftQ = 6
    # Indices of the matrices D, P and Q.
    matrixIndexD = 0
    matrixIndexP = 1
    matrixIndexQ = 2
    # Directions of a traceback step.
    left = 0
    up = 1
    diagonal = 2

    # Row and column position of every amino acid in the table below.
    _PAM250_INDEX = dict(zip("ARNDCQEGHILKMFPSTWYV", range(20)))

    # Table used by pam250().
    # Source: http://www.icp.ucl.ac.be/~opperd/private/pam250.html
    _PAM250 = (
        (13, 6, 9, 9, 5, 8, 9, 12, 6, 8, 6, 7, 7, 4, 11, 11, 11, 2, 4, 9),
        (3, 17, 4, 3, 2, 5, 3, 2, 6, 3, 2, 9, 4, 1, 4, 4, 3, 7, 2, 2),
        (4, 4, 6, 7, 2, 5, 6, 4, 6, 3, 2, 5, 3, 2, 4, 5, 4, 2, 3, 3),
        (5, 4, 8, 11, 1, 7, 10, 5, 6, 3, 2, 5, 3, 1, 4, 5, 5, 1, 2, 3),
        (2, 1, 1, 1, 52, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 3, 2, 1, 4, 2),
        (3, 5, 5, 6, 1, 10, 7, 3, 7, 2, 3, 5, 3, 1, 4, 3, 3, 1, 2, 3),
        (5, 4, 7, 11, 1, 9, 12, 5, 6, 3, 2, 5, 3, 1, 4, 5, 5, 1, 2, 3),
        (12, 5, 10, 10, 4, 7, 9, 27, 5, 5, 4, 6, 5, 3, 8, 11, 9, 2, 3, 7),
        (2, 5, 5, 4, 2, 7, 4, 2, 15, 2, 2, 3, 2, 2, 3, 3, 2, 2, 3, 2),
        (3, 2, 2, 2, 2, 2, 2, 2, 2, 10, 6, 2, 6, 5, 2, 3, 4, 1, 3, 9),
        (6, 4, 4, 3, 2, 6, 4, 3, 5, 15, 34, 4, 20, 13, 5, 4, 6, 6, 7, 13),
        (6, 18, 10, 8, 2, 10, 8, 5, 8, 5, 4, 24, 9, 2, 6, 8, 8, 4, 3, 5),
        (1, 1, 1, 1, 0, 1, 1, 1, 1, 2, 3, 2, 6, 2, 1, 1, 1, 1, 1, 2),
        (2, 1, 2, 1, 1, 1, 1, 1, 3, 5, 6, 1, 4, 32, 1, 2, 2, 4, 20, 3),
        (7, 5, 5, 4, 3, 5, 4, 5, 5, 3, 3, 4, 3, 2, 20, 6, 5, 1, 2, 4),
        (9, 6, 8, 7, 7, 6, 7, 9, 6, 5, 4, 7, 5, 3, 9, 10, 9, 4, 4, 6),
        (8, 5, 6, 6, 4, 5, 5, 6, 4, 6, 4, 6, 5, 3, 6, 8, 11, 2, 3, 6),
        (0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 55, 1, 0),
        (1, 1, 2, 1, 3, 1, 1, 1, 3, 2, 2, 1, 2, 15, 1, 2, 2, 3, 31, 2),
        (7, 4, 4, 4, 4, 4, 4, 4, 5, 4, 15, 10, 4, 10, 5, 5, 5, 72, 4, 17),
    )

    def weightFunctionDifference(self, a, b):
        """Return 0 if a and b are equal and 1 if they differ."""
        if a == b:
            return 0
        if a != b:
            return 1

    def gapCost(self, x):
        """Return the gap cost g(x) = 2 + x."""
        return 2 + x

    def pam250(self, a, b):
        """Return the table entry for the amino acids a (row) and b (column).

        If a or b is not one of the twenty one-letter amino acid codes, for
        example because it is a gap symbol, 1 is returned.
        """
        if a in self._PAM250_INDEX and b in self._PAM250_INDEX:
            return self._PAM250[self._PAM250_INDEX[a]][self._PAM250_INDEX[b]]
        return 1
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
/source/lib/helper/test/IOHelperTest.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2014 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 |
12 | import unittest
13 | import os, sys
14 | lib_path = os.path.abspath('../../')
15 | sys.path.append(lib_path)
16 | from helper import IOHelper as io
17 |
class IOHelperTestClass(unittest.TestCase):
    """Test class to check the correctness of the methods in IOHelper."""

    def _removeIfPresent(self, fileName):
        """Delete fileName if it exists, so a test starts and ends without leftovers."""
        if os.path.exists(fileName):
            os.remove(fileName)

    def _readLines(self, fileName):
        """Return the lines of fileName without their trailing newline; the file is closed afterwards."""
        with open(fileName) as testInputFile:
            return [line.strip("\n") for line in testInputFile.readlines()]

    def test_readFastaFile(self):
        """Test method to test the correct reading of a fasta file."""
        fileName = "testReadFasta.fas"
        self._removeIfPresent(fileName)
        # try/finally removes the fixture even when an assertion fails.
        try:
            # first test case: two sequences
            sequenceToWrite = [["ACGT", "ACGTAATTA"]]
            expectedSequence = ["ACGT", "ACGTAATTA"]
            io().writeFastaFile(sequenceToWrite, fileName)
            readSequence = io().readFastaFile(fileName, multipleSequenceAlignment=False)
            self.assertEqual(expectedSequence, readSequence)

            # second test case: three sequences are stored, but a read with
            # multipleSequenceAlignment=False must not return all of them
            sequenceToWrite = [["ACGT", "ACGTAATTA", "AGTTG"]]
            expectedSequence = ["ACGT", "ACGTAATTA", "AGTTG"]
            io().writeFastaFile(sequenceToWrite, fileName)
            readSequence = io().readFastaFile(fileName, multipleSequenceAlignment=False)
            self.assertNotEqual(expectedSequence, readSequence)

            # third test case: multiple sequences
            readSequence = io().readFastaFile(fileName, multipleSequenceAlignment=True)
            self.assertEqual(expectedSequence, readSequence)
        finally:
            self._removeIfPresent(fileName)

    def test_writeFastaFile(self):
        """Test method to test the correct writing of a fasta file."""
        fileName = "testWriteFasta.fas"
        self._removeIfPresent(fileName)
        sequence = [["ACGT", "ACGTAATTA"]]
        expectedReadSequence = [">Alignment 0 sequence 0", "ACGT", ">Alignment 0 sequence 1", "ACGTAATTA"]
        try:
            # first test case, filename with extension
            io().writeFastaFile(sequence, "testWriteFasta.fas")
            self.assertEqual(expectedReadSequence, self._readLines(fileName))
            os.remove(fileName)

            # second test case, filename without extension; the output is
            # still expected in "testWriteFasta.fas"
            io().writeFastaFile(sequence, "testWriteFasta")
            self.assertEqual(expectedReadSequence, self._readLines(fileName))
        finally:
            self._removeIfPresent(fileName)
71 |
# Run the test cases of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main() # run all tests
--------------------------------------------------------------------------------
/source/lib/multiple/__init__.py:
--------------------------------------------------------------------------------
1 | from needlemanWunschN3 import NeedlemanWunschN3
2 | from upgmaWpgma import UpgmaWpgma
3 | from fengDoolittle import FengDoolittle
4 | from sumOfPairs import SumOfPairs
--------------------------------------------------------------------------------
/source/lib/multiple/fengDoolittle.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2015 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 | import sys
12 | from pairwise import NeedlemanWunsch
13 | from math import log10
14 | from helper import PairwiseAlignmentHelper as pah
15 | from multiple import UpgmaWpgma
16 | import random
17 |
18 |
class FengDoolittle():
    """This class computes the Feng-Doolittle algorithm by Da-Fei Feng and Russell F. Doolittle:
    Feng, Da-Fei, and Russell F. Doolittle.
    "Progressive sequence alignment as a prerequisite to correct phylogenetic trees."
    Journal of molecular evolution 25.4 (1987): 351-360.
    http://dna.bio.puc.cl/cardex/papersbio252/Grupo06-2013.pdf"""

    def __init__(self, sequences, weightFunction, similarityScore):
        """To initialize an object of class FengDoolittle you have to define:
        sequences: A list of sequences for the multiple alignment.
        weightFunction: A string containing the name of your preferred weight function.
        It is handed unchanged to NeedlemanWunsch.compute.
        similarityScore: A string containing the name of your preferred similarity score like pam250.
        The similarity score has to be a method of class PairwiseAlignmentHelper in package helper.
        If no such method exists, a message is printed and the program exits."""
        self.sequences = sequences
        self.alignments = []
        self.weightFunction = weightFunction
        if similarityScore in dir(pah) and callable(getattr(pah, similarityScore)):
            # getattr fetches the bound method directly; no eval on a
            # caller-supplied string is needed.
            similarityScoreObj = getattr(pah(), similarityScore)
        else:
            print("Score function not found!")
            sys.exit()
        self.similarityScore = similarityScoreObj
        self.alignmentToIndexMapping = {}
        self.sequenceToIndexMapping = {}
        self.distanceDictionary = {}
        self.newickTree = ""
        self.orderToAlign = []

    def computeAlignments(self):
        """This function computes the pairwise alignment of every pair of sequences with the Needleman-Wunsch algorithm.
        Every entry appended to self.alignments is [alignment, i, j] with the sequence indices i < j."""
        nw = NeedlemanWunsch()
        numberOfSequences = len(self.sequences)
        for i in range(0, numberOfSequences):
            for j in range(i + 1, numberOfSequences):
                computed = nw.compute([self.sequences[i], self.sequences[j]], self.weightFunction, 1)
                self.alignments.append([computed[0], i, j])

    def computeDistanceDictionary(self):
        """This function computes the distance of every pairwise alignment and stores it under the key "i j".
        The distances are used to generate a phylogenetic tree."""
        for alignment, first, second in self.alignments:
            index = str(first) + " " + str(second)
            self.distanceDictionary[index] = self.similarityToDistance(alignment)

    def similarityToDistance(self, alignment):
        """Converts the similarity of a pairwise alignment into a distance.
        alignment: The two aligned sequences, alignment[0] and alignment[1].
        With sMax as the mean self-similarity of both sequences and sRand as
        the similarity of randomly shuffled copies of them, the effective
        similarity is sEff = (similarity - sRand) / (sMax - sRand).
        Returns -log10(sEff), or 1 if sEff is not positive. Because of the
        shuffling the result may differ between calls."""
        # Divide by 2.0 so the mean is not truncated by Python 2 integer division.
        sMax = (self.similarity(alignment[0], alignment[0])
                + self.similarity(alignment[1], alignment[1])) / 2.0
        alignmentAsList0 = list(alignment[0])
        alignmentAsList1 = list(alignment[1])
        random.shuffle(alignmentAsList0)
        random.shuffle(alignmentAsList1)
        sRand = self.similarity("".join(alignmentAsList0), "".join(alignmentAsList1))
        if sMax == sRand:
            # Nudge sRand to avoid a division by zero. sEff has to be
            # computed in this case as well; it used to stay undefined.
            sRand = sRand - 0.0001
        sEff = (self.similarity(alignment[0], alignment[1]) - sRand) / float(sMax - sRand)
        if sEff <= 0.0:
            return 1
        return -log10(sEff)

    def similarity(self, a, b):
        """Returns the similarity of two sequences a and b with the similarity score defined at the initialization.
        The scores of the symbol pairs a[i], b[i] are summed for every index i of a."""
        return sum(self.similarityScore(a[i], b[i]) for i in range(0, len(a)))

    def buildTree(self):
        """This function clusters the distances with UpgmaWpgma and stores the tree in the Newick-Tree format."""
        upgma = UpgmaWpgma(self.distanceDictionary, len(self.sequences))
        upgma.compute_clustering()
        self.newickTree = upgma.get_newick_tree()

    def buildMultipleAlignment(self, group0, group1):
        """This function returns the best pairwise alignment out of all alignments between group0 and group1.
        group0, group1: Lists of [sequence, index] entries.
        Returns [aligned sequence of group0, aligned sequence of group1,
        index of the group0 member, index of the group1 member] for the pair
        with the highest similarity; the first such pair wins a tie. An empty
        list is returned only if one of the groups is empty."""
        # Start without a best score: starting at 0 rejected every pair whose
        # similarity was not positive and left the result empty.
        highestScore = None
        optimalAlignment = []
        for i in group0:
            for j in group1:
                nw = NeedlemanWunsch()
                alignment = nw.compute([i[0], j[0]], self.weightFunction, 1)
                score = self.similarity(alignment[0][0], alignment[0][1])
                if highestScore is None or highestScore < score:
                    highestScore = score
                    optimalAlignment = [alignment[0][0], alignment[0][1], i[1], j[1]]
        return optimalAlignment

    def computeOrderOfSequencesToAlign(self):
        """This function computes out of the phylogenetic tree in which order the sequences are aligned.
        The parenthesised groups of self.newickTree are visited from the last
        opening bracket to the first. For every group the sorted leaf indices
        of its two children are appended to self.orderToAlign."""
        tree = self.newickTree
        indexBegin = 0
        indexEnd = len(tree)
        while indexEnd != -1:
            indexBegin = tree.rfind("(", indexBegin, indexEnd)
            if indexBegin == -1:
                break
            # Walk right until the bracket opened at indexBegin is closed.
            i = indexBegin + 1
            stack = 0
            while stack >= 0 and i < len(tree):
                if tree[i] == "(":
                    stack += 1
                elif tree[i] == ")":
                    stack -= 1
                i += 1
            indexEnd = i

            substring = tree[indexBegin:indexEnd]
            if substring[1] != "(":
                # The first child is a leaf and ends at the first comma.
                splitIndex = substring.find(",")
            else:
                # The first child is a group and ends behind its closing bracket.
                splitIndex = 1
                stack = 0
                while splitIndex < len(substring):
                    if substring[splitIndex] == "(":
                        stack += 1
                    elif substring[splitIndex] == ")":
                        stack -= 1
                    splitIndex += 1
                    if stack <= 0:
                        break
            group0 = substring[0:splitIndex].strip(",")
            group1 = substring[splitIndex:-1].strip(",")
            list0 = [int(j.strip("(").strip(")").strip(",")) for j in group0.split(",")]
            list1 = [int(j.strip("(").strip(")").strip(",")) for j in group1.split(",")]

            self.orderToAlign.append(sorted([sorted(list0), sorted(list1)]))
            # Continue with the brackets left of the group just handled.
            indexEnd = indexBegin
            indexBegin = 0

    def computeMultipleAlignment(self):
        """This function returns the multiple sequence alignment.
        Returns a dictionary that maps the index of every sequence to its
        aligned form; gaps are written as "X"."""
        self.computeAlignments()
        self.computeDistanceDictionary()
        self.buildTree()
        self.computeOrderOfSequencesToAlign()
        indexAlignments = {}
        # create index to alignment relation
        for i, (group0, group1) in enumerate(self.orderToAlign):
            # Both lengths have to be compared with 1: the former test
            # "len(group1)" was true for every group, so a single sequence
            # joined with a larger group was treated like a plain pair.
            if len(group0) == 1 and len(group1) == 1:
                first = group0[0]
                second = group1[0]
                for alignment, left, right in self.alignments:
                    if left == first and right == second:
                        indexAlignments[first] = alignment[0]
                        indexAlignments[second] = alignment[1]
                        break
                    elif left == second and right == first:
                        indexAlignments[first] = alignment[1]
                        indexAlignments[second] = alignment[0]
                        break
            elif len(group0) == 1:
                indexAlignments[group0[0]] = self.sequences[group0[0]]
            elif len(group1) == 1:
                try:
                    indexAlignments[group1[0]] = self.sequences[group1[0]]
                except (IndexError, KeyError, TypeError):
                    # Best effort: report the unusable index and go on.
                    print("Exception!")
                    print("i:  %s" % (i,))
                    print("OrderToAlign:  %s" % (self.orderToAlign,))
                    print("orderAlign: %s" % (group1[0],))
                    print("%s" % (self.sequences,))

        for group0, group1 in self.orderToAlign:
            # one sequence with one sequence
            if len(group0) == 1 and len(group1) == 1:
                indexAlignments[group0[0]] = indexAlignments[group0[0]].replace("-", "X")
                indexAlignments[group1[0]] = indexAlignments[group1[0]].replace("-", "X")
            # one sequence with one group
            # two groups
            else:
                members0 = [[indexAlignments[j], j] for j in group0]
                members1 = [[indexAlignments[j], j] for j in group1]
                pairwiseAlignment = self.buildMultipleAlignment(members0, members1)
                indexAlignments[pairwiseAlignment[2]] = pairwiseAlignment[0].replace("-", "X")
                indexAlignments[pairwiseAlignment[3]] = pairwiseAlignment[1].replace("-", "X")

                # Align every member of a group against the representative
                # of its own group from the best pair.
                for j in group0:
                    nw = NeedlemanWunsch()
                    alignment = nw.compute([pairwiseAlignment[0], indexAlignments[j]], self.weightFunction, 1)
                    indexAlignments[j] = alignment[0][1]
                for j in group1:
                    nw = NeedlemanWunsch()
                    alignment = nw.compute([pairwiseAlignment[1], indexAlignments[j]], self.weightFunction, 1)
                    indexAlignments[j] = alignment[0][1]
                # Replace every gap by "X" before the next groups are merged.
                for j in indexAlignments:
                    indexAlignments[j] = indexAlignments[j].replace("-", "X")
        return indexAlignments
223 |
--------------------------------------------------------------------------------
/source/lib/multiple/needlemanWunschN3.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2015 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 | import sys
12 | from helper import MultipleAlignmentHelper as mah
13 |
14 |
class NeedlemanWunschN3():
    """This class computes the Needleman-Wunsch algorithm with three sequences."""

    # How far the indices (i, j, k) of the sequences a, b and c move back for
    # each of the seven possible alignment columns. A 0 means that sequence
    # receives a gap. Order: no gap, gap in a, gap in b, gap in c, gap in a
    # and b, gap in a and c, gap in b and c.
    _STEPS = ((1, 1, 1), (0, 1, 1), (1, 0, 1), (1, 1, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0))

    def __init__(self, sequence_a, sequence_b, sequence_c, score_function):
        """Initialize all variables needed to compute the Needleman-Wunsch algorithm with three sequences.
        sequence_a: A string with the first DNA sequence.
        sequence_b: A string with the second DNA sequence.
        sequence_c: A string with the third DNA sequence.
        score_function: The name of a weight function as a string. It has to
        be a method of class MultipleAlignmentHelper; otherwise a message is
        printed and the program exits.
        """
        if score_function in dir(mah) and callable(getattr(mah, score_function)):
            # getattr fetches the bound method directly; no eval on a
            # caller-supplied string is needed.
            score_function_obj = getattr(mah(), score_function)
        else:
            print("Score function not found!")
            sys.exit()

        self.computation_matrix = [[[]]]
        self.sequence_a = sequence_a
        self.sequence_b = sequence_b
        self.sequence_c = sequence_c
        self.score_function = score_function_obj
        self.i = 0
        self.j = 0
        self.traceback_stack = [[]]
        self.traceback_stack_index = 0
        self.indices_stack = [[]]
        self.computed_alignment = []

    def _gap_symbols(self):
        """Return the traceback symbols of MultipleAlignmentHelper in the order of _STEPS."""
        return (mah.noGap, mah.gapA, mah.gapB, mah.gapC, mah.gapAB, mah.gapAC, mah.gapBC)

    def _step_score(self, i, j, k, step):
        """Return the weight of the alignment column that enters cell (i, j, k) with the given step.
        A sequence that does not advance in this step contributes "" to the column."""
        move_a, move_b, move_c = step
        return self.score_function(self.sequence_a[i - 1] if move_a else "",
                                   self.sequence_b[j - 1] if move_b else "",
                                   self.sequence_c[k - 1] if move_c else "")

    def compute_matrix(self):
        """Computes the matrix which is needed by the Needleman-Wunsch algorithm for three sequences.
        Cell (0, 0, 0) is 0. Every other cell, including the cells on the
        boundary edges and faces, is the minimum over all predecessor cells
        that exist for it."""
        size_a = len(self.sequence_a) + 1
        size_b = len(self.sequence_b) + 1
        size_c = len(self.sequence_c) + 1
        self.computation_matrix = [[[0 for _ in range(size_c)] for _ in range(size_b)] for _ in range(size_a)]
        # In this iteration order all predecessors of a cell are filled
        # before the cell itself.
        for i in range(size_a):
            for j in range(size_b):
                for k in range(size_c):
                    if i or j or k:
                        self.computation_matrix[i][j][k] = self.compute_minimum(i, j, k)

    def compute_minimum(self, i, j, k):
        """Compute the minimal value for a given cell of the matrix.
        The minimum is chosen of the following values, where "-" is a gap
        and a term is skipped if one of its indices would be negative:
        D(i-1, j-1, k-1) + w(a_i, b_j, c_k)
        D(i, j-1, k-1) + w(-, b_j, c_k)
        D(i-1, j, k-1) + w(a_i, -, c_k)
        D(i-1, j-1, k) + w(a_i, b_j, -)
        D(i, j, k-1) + w(-, -, c_k)
        D(i, j-1, k) + w(-, b_j, -)
        D(i-1, j, k) + w(a_i, -, -)
        i: index of sequence A
        j: index of sequence B
        k: index of sequence C
        For cell (0, 0, 0), which has no predecessor, 0 is returned.
        """
        possible_values = []
        for step in self._STEPS:
            move_a, move_b, move_c = step
            if move_a > i or move_b > j or move_c > k:
                continue  # this predecessor lies outside of the matrix
            possible_values.append(self.computation_matrix[i - move_a][j - move_b][k - move_c]
                                   + self._step_score(i, j, k, step))
        if not possible_values:
            return 0
        return min(possible_values)

    def traceback(self, maximal_optimal_solutions=-1):
        """Computes the traceback for the Needleman-Wunsch n=3 matrix.
        maximal_optimal_solutions: Number of optimal paths to follow; -1
        follows all of them.
        The alignments of the followed paths replace the content of
        self.computed_alignment.
        Raises ValueError if a cell cannot be reached from any predecessor,
        i.e. the matrix was not filled by compute_matrix()."""
        matrix = self.computation_matrix
        moves = list(zip(self._gap_symbols(), self._STEPS))
        self.traceback_stack = [[]]
        self.indices_stack = [[len(matrix) - 1, len(matrix[0]) - 1, len(matrix[0][0]) - 1]]
        self.traceback_stack_index = 0
        # Start from scratch so a second call does not repeat old results.
        self.computed_alignment = []
        limited = maximal_optimal_solutions != -1
        optimal_solutions_count = 0
        traceback_done = False
        while not traceback_done:
            i, j, k = self.indices_stack[self.traceback_stack_index]
            optimal_solutions_count += 1
            while i > 0 or j > 0 or k > 0:
                successor = None
                for symbol, step in moves:
                    move_a, move_b, move_c = step
                    if move_a > i or move_b > j or move_c > k:
                        continue
                    if matrix[i][j][k] != matrix[i - move_a][j - move_b][k - move_c] \
                            + self._step_score(i, j, k, step):
                        continue
                    previous_cell = [i - move_a, j - move_b, k - move_c]
                    if successor is None:
                        # The first matching step continues the current path.
                        self.traceback_stack[self.traceback_stack_index].append(symbol)
                        successor = previous_cell
                    else:
                        # Every further matching step opens a new path.
                        # Formerly only the first two step kinds marked the
                        # cell as handled, so tied steps were applied twice
                        # to the same path.
                        self.split(previous_cell, symbol)
                if successor is None:
                    raise ValueError("No predecessor found for cell (%d, %d, %d)." % (i, j, k))
                i, j, k = successor
            if limited and optimal_solutions_count >= maximal_optimal_solutions:
                break
            self.indices_stack[self.traceback_stack_index][:] = [i, j, k]
            # Continue with the first path that has not reached the origin.
            pending = [number for number, cell in enumerate(self.indices_stack)
                       if any(value > 0 for value in cell)]
            if pending:
                self.traceback_stack_index = pending[0]
            else:
                traceback_done = True
        if limited and optimal_solutions_count >= maximal_optimal_solutions:
            for number in range(0, maximal_optimal_solutions):
                self.computed_alignment.append(self.build_alignment(self.traceback_stack[number]))
        else:
            for path in self.traceback_stack:
                self.computed_alignment.append(self.build_alignment(path))

    def split(self, index, gapSymbol):
        """Splits the actual traceback path into two paths.
        index: The index values for the next cell of the new path.
        gapSymbol: A symbol for the computed step for the new path.
        The new path is a copy of the current path in which the last step is
        replaced by gapSymbol."""
        branch = self.traceback_stack[self.traceback_stack_index][0:-1]
        branch.append(gapSymbol)
        self.traceback_stack.append(branch)
        self.indices_stack.append(index)

    def build_alignment(self, tracebackStack):
        """Builds the alignment for one traceback path.
        tracebackStack: The computed traceback path as a list; it is consumed
        from its end and is empty afterwards.
        Returns the list [aligned a, aligned b, aligned c]. Symbols of a
        sequence that the path did not cover are appended to the row of that
        sequence. If the path reads beyond the end of a sequence, a message
        is printed and the program exits.
        """
        sequences = (self.sequence_a, self.sequence_b, self.sequence_c)
        positions = [0, 0, 0]
        rows = [[], [], []]
        # For an empty path there is nothing to look up.
        steps = dict(zip(self._gap_symbols(), self._STEPS)) if tracebackStack else {}
        while len(tracebackStack) > 0:
            step = steps.get(tracebackStack.pop())
            if step is None:
                continue  # unknown symbols are skipped
            try:
                for row in range(3):
                    if step[row]:
                        rows[row].append(sequences[row][positions[row]])
                        positions[row] += 1
                    else:
                        rows[row].append("-")
            except IndexError:
                print("An error occured.")
                sys.exit()
        for row in range(3):
            # Every rest goes to the row of its own sequence; the rest of
            # sequence c used to be appended to the row of sequence b.
            rows[row].append(sequences[row][positions[row]:])
        return ["".join(row) for row in rows]

    def execute(self, maximalOptimalSolutions=-1):
        """Method to start the computation of the Needleman-Wunsch algorithm with three sequences. It returns the computed alignment.
        [maximalOptimalSolutions]: Define how many optimal solutions should be computed. If not defined, all optimal solutions are computed."""
        self.compute_matrix()
        self.traceback(maximalOptimalSolutions)
        return self.computed_alignment
315 |
--------------------------------------------------------------------------------
/source/lib/multiple/sumOfPairs.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2015 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 | #
12 | # Sum of pairs algorithm
13 | from helper import PairwiseAlignmentHelper as pah
14 | import sys
15 |
16 |
class SumOfPairs():
    """This class computes the Sum-of-pairs algorithm by Carillo and Lipman:
    Carrillo, Humberto, and David Lipman.
    "The multiple sequence alignment problem in biology."
    SIAM Journal on Applied Mathematics 48.5 (1988): 1073-1082.
    http://www.academia.edu/download/30855770/Articulo03.pdf"""

    def __init__(self, sequences, similarity_score):
        """To initialize an object of the SumOfPairs class please define a list with the multiple sequence alignment and
        a similarity score method which is defined in class PairwiseAlignmentHelper of package helper.
        sequences: The multiple alignment as a list.
        similarity_score: The scoring function's name as a string. If class
        PairwiseAlignmentHelper has no such method, a message is printed and
        the program exits."""
        self.sequences = sequences
        if similarity_score in dir(pah) and callable(getattr(pah, similarity_score)):
            # getattr fetches the bound method directly; no eval on a
            # caller-supplied string is needed.
            similarity_score_obj = getattr(pah(), similarity_score)
        else:
            print("Score function not found!")
            sys.exit()
        self.score_function = similarity_score_obj

    def execute(self):
        """Run this method to compute the sum of pairs scoring for the multiple alignment.
        Returns the sum of score() over every pair of sequences."""
        score_value = 0
        number_of_sequences = len(self.sequences)
        for i in range(0, number_of_sequences):
            for j in range(i + 1, number_of_sequences):
                score_value += self.score(self.sequences[i], self.sequences[j])
        return score_value

    def score(self, sequence_a, sequence_b):
        """Returns the score of sequence_a against sequence_b.
        The sequences are compared column by column; beyond the end of the
        shorter sequence the empty string is scored against the symbols of
        the longer one."""
        score_value = 0
        for i in range(0, max(len(sequence_a), len(sequence_b))):
            symbol_a = sequence_a[i] if i < len(sequence_a) else ""
            symbol_b = sequence_b[i] if i < len(sequence_b) else ""
            score_value += self.score_function(symbol_a, symbol_b)
        return score_value
56 |
--------------------------------------------------------------------------------
/source/lib/multiple/test/fengDoolittleTest.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2014 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 | #
12 | # Feng-Doolittle test class
13 | import unittest
14 | import os, sys
15 | lib_path = os.path.abspath('../../')
16 | sys.path.append(lib_path)
17 |
18 | from multiple import FengDoolittle
19 |
class FengDoolittleTestClass(unittest.TestCase):
    """Test class to test the correct computation of the Feng-Doolittle algorithm."""
    def test_computeAlignments(self):
        # The pairwise alignments are expected as [[aligned_a, aligned_b], index_a, index_b].
        sequences = ["ACTG", "AT", "ACG"]
        expectedAlignments = [[["ACTG", "A-T-"],0,1], [["ACTG", "AC-G"],0,2], [["AT-", "ACG"],1,2]]
        fd = FengDoolittle(sequences, "weightFunctionDifference", "pam250")
        fd.computeAlignments()
        self.assertEqual(expectedAlignments, fd.alignments)
    def test_computeDistanceDictionary(self):
        # NOTE(review): this test has no assertion; it only checks that both calls run
        # without raising. expectedAlignments is never used.
        sequences = ["ACCCAT", "ACGGAT", "AACCT"]
        expectedAlignments = [["AC-CAT", "ACGGAT"], ["ACGGAT", "AACCAT"], ["-ACCAT", "AACCAT"]]
        fd = FengDoolittle(sequences, "weightFunctionDifference", "pam250")
        fd.computeAlignments()
        fd.computeDistanceDictionary()
    def test_computeOrderOfSequencesToAlign(self):
        # The steps are called one by one in the order alignments -> distances -> tree
        # before the alignment order is computed.
        sequences = ["ACTG", "AT", "ACG"]
        fd = FengDoolittle(sequences, "weightFunctionDifference", "pam250")
        fd.computeAlignments()
        fd.computeDistanceDictionary()
        fd.buildTree()
        # Each entry pairs two groups of sequence indices: first 0 with 2, then [0, 2] with 1.
        expectedResult = [[[0],[2]], [[0,2],[1]]]
        fd.computeOrderOfSequencesToAlign()
        self.assertEqual(expectedResult, fd.orderToAlign)
    def test_computeMultipleAlignment(self):
        # The result maps sequence index -> aligned row; the expected rows use "X" where the
        # pairwise tests above use "-" (presumably the frozen gap symbol - TODO confirm).
        sequences = ["ACTG", "AT", "ACG"]
        expectedResult = {0: 'ACTG', 1: 'AXTX', 2: 'ACXG'}
        fd = FengDoolittle(sequences, "weightFunctionDifference", "pam250")
        # NOTE(review): computeMultipleAlignment() is called twice and only the second
        # result is asserted - confirm whether the first call is intentional.
        fd.computeMultipleAlignment()
        self.assertEqual(expectedResult, fd.computeMultipleAlignment())

        sequences = ["ACCAT", "ACGGAT", "AACCAT"]
        expectedResult = {0: 'AXCCXAT', 1: 'AXCGGAT', 2: 'AACCXAT'}
        fd2 = FengDoolittle(sequences, "weightFunctionDifference", "pam250")
        fd2.computeMultipleAlignment()
        self.assertEqual(expectedResult, fd2.computeMultipleAlignment())
58 | if __name__ == "__main__":
59 | unittest.main() # run all tests
60 |
--------------------------------------------------------------------------------
/source/lib/multiple/test/needlemanWunschN3Test.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2014 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 | #
12 | # Needleman-Wunsch with n=3 test class
13 | import unittest
14 | import os, sys
15 | lib_path = os.path.abspath('../../')
16 | sys.path.append(lib_path)
17 |
18 | from multiple import NeedlemanWunschN3 as nw
19 | from helper import MultipleAlignmentHelper as mah
20 | from helper import MathHelper as mathHelper
class NeelemanWunschN3TestClass(unittest.TestCase):
    """Test class to test the correct computation of the Needleman-Wunsch n=3 algorithm."""
    def test_computeMatrix(self):
        # Builds the expected three-dimensional matrix, indexed [i][j][k] for
        # sequenceA, sequenceB and sequenceC. Only the three axes and the three
        # boundary planes are filled in below; all interior cells keep the initial 0.
        sequenceA = "AC"
        sequenceB = "AGT"
        sequenceC = "AGT"
        expectedMatrix = [[[0 for i in range(len(sequenceC)+1) ] for j in range(len(sequenceB)+1)] for k in range(len(sequenceA)+1 )]
        # Axes: one sequence advances, the other two positions are passed as "".
        for i in range(1, len(sequenceA)+1):
            expectedMatrix[i][0][0] = expectedMatrix[i-1][0][0] + mah().weightFunctionDifference("", "", sequenceA[i-1])
        for i in range(1, len(sequenceB)+1):
            expectedMatrix[0][i][0] = expectedMatrix[0][i-1][0] + mah().weightFunctionDifference("", "", sequenceB[i-1])
        for i in range(1, len(sequenceC)+1):
            expectedMatrix[0][0][i] = expectedMatrix[0][0][i-1] + mah().weightFunctionDifference("", "", sequenceC[i-1])
        # Boundary planes: two sequences advance diagonally, the third is passed as "".
        for i in range(1, len(sequenceA)+1):
            for j in range(1, len(sequenceB)+1):
                expectedMatrix[i][j][0] = expectedMatrix[i-1][j-1][0] + mah().weightFunctionDifference(sequenceA[i-1], sequenceB[j-1], "")
        for i in range(1, len(sequenceA)+1):
            for k in range(1, len(sequenceC)+1):
                expectedMatrix[i][0][k] = expectedMatrix[i-1][0][k-1] + mah().weightFunctionDifference(sequenceA[i-1], "", sequenceC[k-1])
        for j in range(1, len(sequenceB)+1):
            for k in range(1, len(sequenceC)+1):
                expectedMatrix[0][j][k] = expectedMatrix[0][j-1][k-1] + mah().weightFunctionDifference("", sequenceB[j-1], sequenceC[k-1])

        # NOTE(review): the test is unfinished. assertEqual is referenced as a bare name
        # (not self.assertEqual) and without arguments, so this line raises NameError
        # whenever the test runs. The matrix computed by NeedlemanWunschN3 (imported as nw)
        # still has to be produced and compared with expectedMatrix.
        assertEqual()
45 |
46 | if __name__ == "__main__":
47 | unittest.main() # run all tests
48 |
--------------------------------------------------------------------------------
/source/lib/multiple/test/sumOfPairsTest.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/source/lib/multiple/test/sumOfPairsTest.py
--------------------------------------------------------------------------------
/source/lib/multiple/test/upgmaWpgmaTest.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2015 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 | #
12 | # UPGMA/WPGMA test class
13 | import unittest
14 | import os, sys
15 |
16 | lib_path = os.path.abspath('../../')
17 | sys.path.append(lib_path)
18 |
19 | from multiple import UpgmaWpgma
20 |
21 |
class UpgmaWpgmaTestClass(unittest.TestCase):
    """Test class to test the correct computation of the UPGMA/WPGMA algorithm."""

    def test_computeMinimalDistance(self):
        # "0 1" is the only entry with the smallest distance, so it must be returned
        # together with its distance.
        distanceDictionary = {"0 1": 1, "0 2": 2, "0 3": 3, "1 2": 2, "1 3": 3, "1 4": 3}
        upgma = UpgmaWpgma(distanceDictionary, 4)
        expectedValue = ["0 1", 1]
        self.assertEqual(expectedValue, upgma.compute_minimal_distance())

    def test_computeClustering(self):
        # mapping is expected as "child child" -> id of the new cluster; new ids continue
        # after the leaf ids (4 leaves -> first cluster is 4).
        distanceDictionary = {"0 1": 1, "0 2": 2, "0 3": 3, "1 2": 2, "1 3": 3, "2 3": 3}
        upgma = UpgmaWpgma(distanceDictionary, 4)
        expectedValue = {"0 1": 4, "2 4": 5, "3 5": 6}
        upgma.compute_clustering()
        print upgma.get_newick_tree()
        self.assertEqual(expectedValue, upgma.mapping)

        # Called only after the assertion: with edge weights get_newick_tree() replaces
        # upgma.mapping by a version whose keys carry the weights.
        print upgma.get_newick_tree(with_edge_weights=True)
        # NOTE(review): after merging "2 3" the entries "0 1" and "4 5" both have distance 6;
        # the expected ids below require "4 5" to be picked first, which looks dependent on
        # the dictionary iteration order - confirm.
        distanceDictionary = {"0 1": 6, "0 2": 10, "0 3": 10, "0 4": 10, "1 2": 10, "1 3": 10, "1 4": 10, "2 3": 2,
                              "2 4": 6, "3 4": 6}
        upgma2 = UpgmaWpgma(distanceDictionary, 5)
        expectedValue = {"2 3": 5, "0 1": 7, "4 5": 6, "6 7": 8}
        upgma2.compute_clustering()
        print upgma2.get_newick_tree(with_edge_weights=False)
        self.assertEqual(expectedValue, upgma2.mapping)
        print upgma2.get_newick_tree(with_edge_weights=True)




    def test_getNewickTree(self):
        # NOTE(review): no assertion - this only checks that get_newick_tree() runs on a
        # hand-written mapping without raising.
        mapping = {'1 3': 5, '4 6': 7, '5 7': 8, '0 2': 6}
        distanceDictionary = {}
        upgma = UpgmaWpgma(distanceDictionary, 5)
        upgma.mapping = mapping
        upgma.get_newick_tree()
58 |
59 |
60 | if __name__ == "__main__":
61 | unittest.main() # run all tests
62 |
--------------------------------------------------------------------------------
/source/lib/multiple/upgmaWpgma.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2015 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 |
12 | from helper import MathHelper
13 |
14 |
class UpgmaWpgma():
    """Upgma/Wpgma is a clustering method to generate phylogenetic trees.

    Clusters are identified by integers: the leaves are 0..node_count-1 and every merge
    creates the next free id. Distances are stored under string keys "a b".

    NOTE(review): upgma_distance() is the plain mean of the two old distances and
    wpgma_distance() the size-weighted mean. In the usual terminology the plain mean is the
    WPGMA update and the size-weighted mean the UPGMA update, so the two names look
    swapped - confirm before relying on the flag's name.
    """

    def __init__(self, distance_dictionary, node_count, upgma_wpgma=True, sequence_size_mapping=None):
        """To initialize an object of this class, please define the following:
            distance_dictionary: A dictionary with the distance between two sequences.
                                Should have the form \"Key0 key1\":distance. The key0 and key1 have to be integers.
                                The dictionary is modified in place by compute_clustering().
            node_count:         The number of sequences.
            upgma_wpgma:        If True, the upgma weighting is used, if False, wpgma.
            sequence_size_mapping: Only necessary if wpgma is executed. It defines the size of each sequence.
                                Should have the form: \"Key:len(sequence)\". A passed dictionary is extended
                                in place with the sizes of the new clusters."""
        self.distance_dictionary = distance_dictionary
        self.mapping = {}
        self.node_count = node_count
        self.number_of_nodes = node_count
        self.upgma_wpgma = upgma_wpgma
        # A fresh dictionary per instance: a shared default would leak cluster sizes
        # from one clustering run into the next.
        self.sequence_size_mapping = {} if sequence_size_mapping is None else sequence_size_mapping
        self.edge_weight = {}
        # Height (distance to the leaves) of every merged cluster; leaves are absent and count as 0.
        self._cluster_height = {}

    def compute_clustering(self):
        """This function computes the clustering to get the phylogenetic tree.

        Repeatedly merges the pair with the smallest distance, records it in self.mapping
        ("a b" -> id of the new cluster) and replaces the distances to the two merged
        clusters by one distance to the new cluster, until no pair is left."""
        while True:
            minimum_cluster = self.compute_minimal_distance()
            nodes = minimum_cluster[0].split(" ")
            if len(nodes) <= 1:
                # The empty key of an exhausted dictionary splits into one element: done.
                break
            self.mapping[minimum_cluster[0]] = self.node_count
            self.compute_edge_weight(minimum_cluster[1], nodes)

            if minimum_cluster[0] in self.distance_dictionary:
                del self.distance_dictionary[minimum_cluster[0]]

            for i in range(0, self.node_count + 1):
                key_value_0 = nodes[0] + " " + str(i)
                key_value_1 = nodes[1] + " " + str(i)
                key_value = self.key_in_dictionary(key_value_0, key_value_1)
                if key_value[0] != "":
                    key_for_new_cluster_distance = str(i) + " " + str(self.node_count)
                    self.distance_dictionary[key_for_new_cluster_distance] = self.compute_new_distance(
                        self.distance_dictionary[key_value[0]], self.distance_dictionary[key_value[1]],
                        nodes[0], nodes[1])
                    if not self.upgma_wpgma:
                        self.sequence_size_mapping[self.node_count] = \
                            self.sequence_size_mapping[int(nodes[0])] + self.sequence_size_mapping[int(nodes[1])]
                    del self.distance_dictionary[key_value[0]]
                    del self.distance_dictionary[key_value[1]]
            self.node_count += 1

    def _swap_key(self, key_value):
        """Returns the key with its space-separated parts in reverse order ("12 3" -> "3 12")."""
        return " ".join(reversed(key_value.split(" ")))

    def key_in_dictionary(self, key_value_0, key_value_1):
        """Returns the spellings under which both keys are stored in the distance dictionary.
            key_value_0: The first key value.
            key_value_1: The second key value.
        Each key is looked up as given and with its two ids swapped. The result is a list
        [stored_key_0, stored_key_1], or ["", ""] if at least one of the keys is not stored."""
        # Swap the ids, not the characters: reversing the string corrupts multi-digit ids.
        swapped_0 = self._swap_key(key_value_0)
        swapped_1 = self._swap_key(key_value_1)
        candidates = (
            (key_value_0, key_value_1),
            (swapped_0, key_value_1),
            (key_value_0, swapped_1),
            (swapped_0, swapped_1),
        )
        for candidate_0, candidate_1 in candidates:
            if candidate_0 in self.distance_dictionary and candidate_1 in self.distance_dictionary:
                return [candidate_0, candidate_1]
        return ["", ""]

    def compute_minimal_distance(self):
        """Returns the next two clusters for merging as [key, distance].

        The first entry that is strictly smaller than all previous ones wins; if no entry is
        smaller than MathHelper.Inf (e.g. empty dictionary) the result is ["", MathHelper.Inf]."""
        minimum = ["", MathHelper.Inf]
        for i in self.distance_dictionary:
            if minimum[1] > self.distance_dictionary[i]:
                minimum[0] = i
                minimum[1] = self.distance_dictionary[i]
        return minimum

    def compute_new_distance(self, distance_a_x, distance_b_x, index_a, index_b):
        """Returns the new distance between the new merged cluster and an other cluster.
            distance_a_x: The old distance between cluster a and x.
            distance_b_x: The old distance between cluster b and x.
            index_a: The index of a.
            index_b: The index of b."""
        if self.upgma_wpgma:
            return self.upgma_distance(distance_a_x, distance_b_x)
        return self.wpgma_distance(distance_a_x, distance_b_x, self.sequence_size_mapping[int(index_a)],
                                   self.sequence_size_mapping[int(index_b)])

    def upgma_distance(self, distance_a_x, distance_b_x):
        """Returns the upgma-distance between the new merged cluster a and an other cluster x.
            distance_a_x: The old distance between cluster a and x.
            distance_b_x: The old distance between cluster b and x."""
        # 2.0 keeps the mean exact for integer distances on Python 2 as well.
        return (distance_a_x + distance_b_x) / 2.0

    def wpgma_distance(self, distance_a_x, distance_b_x, length_of_a, length_of_b):
        """Returns the wpgma-distance between the new merged cluster a and an other cluster x.
            distance_a_x: The old distance between cluster a and x.
            distance_b_x: The old distance between cluster b and x.
            length_of_a: The size of a.
            length_of_b: The size of b."""
        return (length_of_a * distance_a_x + length_of_b * distance_b_x) / float(length_of_a + length_of_b)

    def compute_edge_weight(self, weight, nodes):
        """This method computes the new edge weight for a new cluster.
            weight: The edge weight equal to the distance of the to merged clusters.
            nodes:  A list containing the indices of the two merged clusters.
        The new cluster (id self.node_count) lies at height weight / 2. The edge to a child is
        that height minus the child's own height. The result is stored as
        self.edge_weight[id] = [edge to nodes[1], edge to nodes[0]], the slot order that
        get_newick_tree() reads."""
        node0 = int(nodes[0])
        node1 = int(nodes[1])
        height = weight / 2.0
        height_of_node0 = self._cluster_height.get(node0, 0.0)
        height_of_node1 = self._cluster_height.get(node1, 0.0)
        self._cluster_height[self.node_count] = height
        self.edge_weight[self.node_count] = [height - height_of_node1, height - height_of_node0]

    def get_newick_tree(self, with_edge_weights=False):
        """Returns the computed cluster in the Newick tree format.
            with_edge_weights: If True, edge weights are part of the output, if False, not.
        With edge weights self.mapping is replaced by a version whose keys carry the weights.
        Returns None if self.mapping is empty.

        NOTE(review): child ids are located by plain substring search, so an id can also match
        inside a longer id or inside a weight; a substituted first child is followed by ":"
        even without edge weights. The logic is kept as it is because callers may depend on
        the exact output - confirm before changing."""
        # Inverse view: cluster id -> "child child" description that is expanded step by step.
        newick_dictionary = dict([[v, k] for k, v in self.mapping.items()])
        if with_edge_weights:
            for i in newick_dictionary:
                if i in self.edge_weight:
                    nodesWithWeights = newick_dictionary[i].split(" ")
                    nodesWithWeights[0] = nodesWithWeights[0].strip(" ")
                    nodesWithWeights[0] += ":" + str(self.edge_weight[i][1])
                    nodesWithWeights[1] = nodesWithWeights[1].strip(" ")
                    nodesWithWeights[1] += ":" + str(self.edge_weight[i][0])
                    newick_dictionary[i] = nodesWithWeights[0] + " " + nodesWithWeights[1]
            self.mapping = dict([[v, k] for k, v in newick_dictionary.items()])
        for i in self.mapping:
            index = -1
            leading_sequence = True
            # Find the description that mentions cluster self.mapping[i] as a child and
            # remember whether it is the first (leading) or the second child.
            for j in newick_dictionary:
                string_to_find = " " + str(self.mapping[i]) + ""
                if newick_dictionary[j].find(string_to_find) != -1:
                    index = j
                    leading_sequence = False
                    break
                string_to_find = str(self.mapping[i]) + " "
                if newick_dictionary[j].find(string_to_find) != -1:
                    index = j
                    leading_sequence = True
                    break
                if with_edge_weights:
                    string_to_find = str(self.mapping[i]) + ":"
                else:
                    string_to_find = str(self.mapping[i]) + ","
                if newick_dictionary[j].find(string_to_find) != -1:
                    index = j
                    leading_sequence = True
                    break
                string_to_find = "," + str(self.mapping[i])
                if newick_dictionary[j].find(string_to_find) != -1:
                    index = j
                    leading_sequence = False
                    break

            if index != -1:
                # Substitute the child's own description for its id and drop the child entry.
                if leading_sequence:
                    stringToReplace = "(" + newick_dictionary[int(string_to_find.strip().strip(",").strip(":"))].replace(" ",
                                                                                                                      ",") + "):"
                else:
                    stringToReplace = ",(" + newick_dictionary[int(string_to_find.strip().strip(",").strip(":"))].replace(" ",
                                                                                                                       ",") + ")"
                newick_dictionary[index] = newick_dictionary[index].replace(string_to_find, stringToReplace).replace(
                    ",,", ",")
                del newick_dictionary[int(string_to_find.strip().strip(",").strip(":"))]

        # Only the first remaining description is returned.
        for i in newick_dictionary:
            return "(" + newick_dictionary[i] + ")"
200 |
--------------------------------------------------------------------------------
/source/lib/pairwise/__init__.py:
--------------------------------------------------------------------------------
1 | from gotoh import Gotoh
2 | from needlemanWunsch import NeedlemanWunsch
3 |
--------------------------------------------------------------------------------
/source/lib/pairwise/gotoh.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2014 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 | #
12 | # Gotoh algorithm
13 | from helper import PairwiseAlignmentHelper as pah
14 | from helper import MathHelper as mathHelper
15 | import sys
16 |
class Gotoh():
    """This class holds methods which are needed to compute the pairwise
    alignment algorithm from Osamu Gotoh, published in 1982:
        Osamu Gotoh (1982). "An improved algorithm for matching biological sequences".
        Journal of molecular biology 162: 705.
        https://www.cs.umd.edu/class/spring2003/cmsc838t/papers/gotoh1982.pdf

    The three matrices D, P and Q are stored in self.computationMatrix in this order.
    Every cell is computed as a minimum.
    """
    def __init__(self, sequenceA, sequenceB, scoreFunction, costFunction):
        """Initialize all variables and methods needed to compute the Gotoh algorithm.
            sequenceA:      A string with the first DNA sequence.
            sequenceB:      A string with the second DNA sequence.
            scoreFunction:  The name of a weight function as a String which is defined
                            in the pairwiseAlignmentHelper-class.
            costFunction:   The name of a gap cost function as a String which is defined
                            in the pairwiseAlignmentHelper-class.
        If one of the names is not a callable attribute of the helper class, a message is
        printed and the program exits via sys.exit().
        """
        # getattr() resolves the method directly; no source string has to be eval()-ed.
        if scoreFunction in dir(pah) and callable(getattr(pah, scoreFunction)):
            scoreFunctionObj = getattr(pah(), scoreFunction)
        else:
            print("Score function not found!")
            sys.exit()
        if costFunction in dir(pah) and callable(getattr(pah, costFunction)):
            costFunctionObj = getattr(pah(), costFunction)
        else:
            print("Gap cost function not found!")
            sys.exit()
        self.computationMatrix = [[], [], []]
        self.sequenceA = sequenceA
        self.sequenceB = sequenceB
        self.scoreFunction = scoreFunctionObj
        self.costFunction = costFunctionObj
        # Cost of extending a gap by one position.
        self.beta = self.costFunction(1) - self.costFunction(0)
        self.i = 0
        self.j = 0
        # One list of traceback moves per alignment path ...
        self.tracebackStack = [[]]
        self.tracebackStackIndex = 0
        # ... and for each path its current position as [i, j, index of the matrix].
        self.indiciesStack = [[]]
        self.computedAlignment = []

    def computeMatrix(self):
        """Initialize and fill the three matrices needed for the Gotoh-Algorithm.
        The sequences A and B, the weight function and the gap costs have to be defined
        by the creation of the object of this class. The result is stored in
        self.computationMatrix as [D, P, Q]."""
        rows = len(self.sequenceA) + 1
        columns = len(self.sequenceB) + 1
        computationMatrixD = [[0 for i in range(columns)] for j in range(rows)]
        computationMatrixP = [[0 for i in range(columns)] for j in range(rows)]
        computationMatrixQ = [[0 for i in range(columns)] for j in range(rows)]
        # initialize the first column and the first row
        for i in range(1, rows):
            computationMatrixD[i][0] = self.costFunction(i)
            computationMatrixP[i][0] = mathHelper.NaN
            computationMatrixQ[i][0] = mathHelper.Inf
        for i in range(1, columns):
            computationMatrixD[0][i] = self.costFunction(i)
            computationMatrixP[0][i] = mathHelper.Inf
            computationMatrixQ[0][i] = mathHelper.NaN

        for i in range(1, rows):
            for j in range(1, columns):
                # D[i][j] needs P[i][j] and Q[i][j], so these two are computed first.
                computationMatrixP[i][j] = self.computeP(computationMatrixD[i-1][j], computationMatrixP[i-1][j],
                                                         self.costFunction, self.beta)
                computationMatrixQ[i][j] = self.computeQ(computationMatrixD[i][j-1], computationMatrixQ[i][j-1],
                                                         self.costFunction, self.beta)
                computationMatrixD[i][j] = self.computeD(computationMatrixD[i-1][j-1], computationMatrixP[i][j],
                                                         computationMatrixQ[i][j], self.sequenceA[i-1],
                                                         self.sequenceB[j-1], self.scoreFunction)
        self.computationMatrix = [computationMatrixD, computationMatrixP, computationMatrixQ]

    def computeP(self, valueOfD, valueOfP, costFunction, beta):
        """Compute the values for matrix P.
        This is the minimum value of:
            matrix D of cell (i-1, j) + gap costs of a gap of length 1
            and
            matrix P of cell (i-1, j) + beta
            valueOfD: The value from matrix D of cell i-1, j.
            valueOfP: The value from matrix P of cell i-1, j.
            costFunction: The gap cost function defined at the object creation.
            beta: The beta value from the gap costs."""
        return min(valueOfD + costFunction(1), valueOfP + beta)

    def computeQ(self, valueOfD, valueOfQ, costFunction, beta):
        """Compute the values for matrix Q.
        This is the minimum value of:
            matrix D of cell (i, j-1) + gap costs of a gap of length 1
            and
            matrix Q of cell (i, j-1) + beta
            valueOfD: The value from matrix D of cell i, j-1.
            valueOfQ: The value from matrix Q of cell i, j-1.
            costFunction: The gap cost function defined at the object creation.
            beta: The beta value from the gap costs."""
        return min(valueOfD + costFunction(1), valueOfQ + beta)

    def computeD(self, valueOfD, valueOfP, valueOfQ, characterA, characterB, scoreFunction):
        """Compute the values for matrix D.
        This is the minimum value of:
            matrix D of cell (i-1, j-1) + w(a,b)
            and
            matrix P of cell (i, j)
            and
            matrix Q of cell (i, j)
            valueOfD: The value from matrix D of cell i-1, j-1.
            valueOfP: The value from matrix P of cell i, j.
            valueOfQ: The value from matrix Q of cell i, j.
            characterA: The character in sequence A at position i.
            characterB: The character in sequence B at position j.
            scoreFunction: The weight cost function defined at the object creation."""
        return min(valueOfP, min(valueOfQ, valueOfD + scoreFunction(characterA, characterB)))

    def traceback(self):
        """Computes the traceback for the Gotoh algorithm.

        Starts in the last cell of matrix D and follows each path until it reaches cell
        (0, 0). Paths that branch off are appended to the stacks and processed afterwards.
        Finally one alignment per path is appended to self.computedAlignment."""
        # Start from a clean state so that a repeated call does not reuse old paths.
        self.tracebackStack = [[]]
        self.indiciesStack = [[]]
        self.computedAlignment = []
        self.j = len(self.computationMatrix[0][0]) - 1
        self.i = len(self.computationMatrix[0]) - 1
        self.tracebackStackIndex = 0
        self.indiciesStack[self.tracebackStackIndex] = [self.i, self.j, pah.matrixIndexD]
        tracebackDone = False
        while not tracebackDone:
            while self.i > 0 or self.j > 0:
                currentMatrix = self.indiciesStack[self.tracebackStackIndex][2]
                if currentMatrix == pah.matrixIndexD:
                    self.tracebackD()
                elif currentMatrix == pah.matrixIndexP:
                    self.tracebackP()
                elif currentMatrix == pah.matrixIndexQ:
                    self.tracebackQ()
                self.i = self.indiciesStack[self.tracebackStackIndex][0]
                self.j = self.indiciesStack[self.tracebackStackIndex][1]
            # Continue with the first path that has not reached cell (0, 0) yet.
            tracebackDone = True
            for i in range(0, len(self.indiciesStack)):
                if self.indiciesStack[i][0] > 0 or self.indiciesStack[i][1] > 0:
                    self.tracebackStackIndex = i
                    tracebackDone = False
                    break
            self.i = self.indiciesStack[self.tracebackStackIndex][0]
            self.j = self.indiciesStack[self.tracebackStackIndex][1]
        for i in range(0, len(self.tracebackStack)):
            self.computedAlignment.append(self.buildAlignment(self.tracebackStack[i]))

    def tracebackD(self):
        """Computes the traceback for a cell of the matrix D."""
        split = 0
        pathVariableI = self.i
        pathVariableJ = self.j
        currentPath = self.tracebackStack[self.tracebackStackIndex]
        if self.j > 0 and self.i > 0:
            # The characters are read only inside the matrix; on the border the index -1
            # would wrap around and fail for an empty sequence.
            a = self.sequenceA[self.i - 1]
            b = self.sequenceB[self.j - 1]
            valueOfD = self.computationMatrix[pah.matrixIndexD][self.i][self.j]
            if valueOfD == self.computationMatrix[pah.matrixIndexD][self.i-1][self.j-1] + self.scoreFunction(a, b):
                currentPath.append(pah.diagonalD)
                pathVariableI -= 1
                pathVariableJ -= 1
                split = 1
            if valueOfD == self.computationMatrix[pah.matrixIndexQ][self.i][self.j]:
                if split == 0:
                    currentPath.append(pah.dotQ)
                    self.indiciesStack[self.tracebackStackIndex][2] = pah.matrixIndexQ
                    split = 1
                else:
                    # Branch: copy the path without the move just added and switch to Q there.
                    self.tracebackStack.append(currentPath[0:-1])
                    self.tracebackStack[len(self.tracebackStack)-1].append(pah.dotQ)
                    self.indiciesStack.append([self.i, self.j, pah.matrixIndexQ])
            if valueOfD == self.computationMatrix[pah.matrixIndexP][self.i][self.j]:
                if split == 0:
                    currentPath.append(pah.dotP)
                    self.indiciesStack[self.tracebackStackIndex][2] = pah.matrixIndexP
                else:
                    self.tracebackStack.append(currentPath[0:-1])
                    self.tracebackStack[len(self.tracebackStack)-1].append(pah.dotP)
                    self.indiciesStack.append([self.i, self.j, pah.matrixIndexP])

        # On the border only one direction is left.
        if self.i == 0:
            currentPath.append(pah.leftD)
            pathVariableJ -= 1
        if self.j == 0:
            currentPath.append(pah.upD)
            pathVariableI -= 1
        if self.i <= 0 or pathVariableI <= 0:
            pathVariableI = 0
        if self.j <= 0 or pathVariableJ <= 0:
            pathVariableJ = 0
        self.indiciesStack[self.tracebackStackIndex][0] = pathVariableI
        self.indiciesStack[self.tracebackStackIndex][1] = pathVariableJ

    def tracebackP(self):
        """Computes the traceback for a cell of the matrix P"""
        split = False
        if self.i > 0:
            valueOfP = self.computationMatrix[pah.matrixIndexP][self.i][self.j]
            if valueOfP == self.computationMatrix[pah.matrixIndexD][self.i-1][self.j] + self.costFunction(1):
                self.tracebackStack[self.tracebackStackIndex].append(pah.upD)
                self.indiciesStack[self.tracebackStackIndex][0] -= 1
                self.indiciesStack[self.tracebackStackIndex][2] = pah.matrixIndexD
                split = True
            if valueOfP == self.computationMatrix[pah.matrixIndexP][self.i-1][self.j] + self.beta:
                if split:
                    # Both predecessors are possible: continue the second one on a new path.
                    self.tracebackStack.append(self.tracebackStack[self.tracebackStackIndex][0:-1])
                    self.tracebackStack[len(self.tracebackStack)-1].append(pah.upP)
                    self.indiciesStack.append([self.i - 1, self.j, pah.matrixIndexP])
                else:
                    self.tracebackStack[self.tracebackStackIndex].append(pah.upP)
                    self.indiciesStack[self.tracebackStackIndex][0] -= 1
                    self.indiciesStack[self.tracebackStackIndex][2] = pah.matrixIndexP

    def tracebackQ(self):
        """Computes the traceback for a cell of the matrix Q"""
        split = False
        if self.j > 0:
            valueOfQ = self.computationMatrix[pah.matrixIndexQ][self.i][self.j]
            if valueOfQ == self.computationMatrix[pah.matrixIndexD][self.i][self.j-1] + self.costFunction(1):
                self.tracebackStack[self.tracebackStackIndex].append(pah.leftD)
                self.indiciesStack[self.tracebackStackIndex][1] -= 1
                self.indiciesStack[self.tracebackStackIndex][2] = pah.matrixIndexD
                split = True

            if valueOfQ == self.computationMatrix[pah.matrixIndexQ][self.i][self.j-1] + self.beta:
                if split:
                    # Both predecessors are possible: continue the second one on a new path.
                    self.tracebackStack.append(self.tracebackStack[self.tracebackStackIndex][0:-1])
                    self.tracebackStack[len(self.tracebackStack)-1].append(pah.leftQ)
                    self.indiciesStack.append([self.i, self.j - 1, pah.matrixIndexQ])
                else:
                    self.tracebackStack[self.tracebackStackIndex].append(pah.leftQ)
                    self.indiciesStack[self.tracebackStackIndex][1] -= 1
                    self.indiciesStack[self.tracebackStackIndex][2] = pah.matrixIndexQ

    def buildAlignment(self, tracebackStack):
        """A method to compute the alignment of a given traceback of the Gotoh algorithm.
            tracebackStack: The computed traceback path for one alignment as a list.
                            The list is consumed (it is empty afterwards).
        Returns [aligned sequence A, aligned sequence B]. If the path asks for more characters
        than a sequence has, a message is printed and the program exits via sys.exit()."""
        i = 0
        j = 0
        alignmentOfA = ""
        alignmentOfB = ""
        # The moves were recorded from the end of the alignment, so they are read backwards.
        while len(tracebackStack) > 0:
            tracebackElement = tracebackStack.pop()
            try:
                if pah.leftQ == tracebackElement or pah.leftD == tracebackElement:
                    alignmentOfA += "-"
                    alignmentOfB += self.sequenceB[j]
                    j += 1
                elif pah.upP == tracebackElement or pah.upD == tracebackElement:
                    alignmentOfA += self.sequenceA[i]
                    alignmentOfB += "-"
                    i += 1
                elif pah.diagonalD == tracebackElement:
                    alignmentOfA += self.sequenceA[i]
                    alignmentOfB += self.sequenceB[j]
                    i += 1
                    j += 1
                # All other elements only mark a change of the matrix and add no column.
            except IndexError:
                print("An error occured.")
                sys.exit()

        # Characters not covered by the path are appended unchanged.
        while i < len(self.sequenceA):
            alignmentOfA += self.sequenceA[i]
            i += 1
        while j < len(self.sequenceB):
            alignmentOfB += self.sequenceB[j]
            j += 1
        alignment = [alignmentOfA, alignmentOfB]
        return alignment

    def compute(self):
        """Method to start the computation of the Gotoh algorithm.
        Returns the list of computed alignments."""
        self.computeMatrix()
        self.traceback()
        return self.computedAlignment
278 |
279 |
--------------------------------------------------------------------------------
/source/lib/pairwise/needlemanWunsch.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2014 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 | #
12 | # Needleman-Wunsch algorithm
13 | import sys
14 | from helper import PairwiseAlignmentHelper as pah
15 |
16 |
class NeedlemanWunsch():
    """This class holds the methods needed to compute the pairwise alignment
    algorithm of Saul Needleman and Christian Wunsch, published in 1970:
    Needleman, Saul B.; and Wunsch, Christian D. (1970).
    A general method applicable to the search for similarities in the amino acid
    sequence of two proteins. Journal of Molecular Biology 48 (3): 443-53
    http://www.cise.ufl.edu/class/cis4930sp09rab/00052.pdf

    The scores are treated as costs: every matrix cell holds the minimum over
    its three predecessors and the traceback follows every predecessor that
    reproduces this minimum."""

    def computeMatrix(self, sequenceA, sequenceB, scoreFunction):
        """Initializes and computes the values of the Needleman-Wunsch matrix.

        sequenceA: A string with the first sequence (rows of the matrix).
        sequenceB: A string with the second sequence (columns of the matrix).
        scoreFunction: A callable taking two characters and returning their
            cost. The empty string "" stands for a gap; the character of
            sequence A is always passed first.

        Returns the matrix as a list of len(sequenceA) + 1 rows with
        len(sequenceB) + 1 entries each."""
        rowCount = len(sequenceA) + 1
        columnCount = len(sequenceB) + 1
        computationMatrix = [[0] * columnCount for _ in range(rowCount)]

        # First column: characters of A aligned to gaps. The argument order
        # matches computeMinimum() and traceback(), otherwise an asymmetric
        # score function would make the first column untraceable.
        for i in range(1, rowCount):
            computationMatrix[i][0] = computationMatrix[i - 1][0] + scoreFunction(sequenceA[i - 1], "")
        # First row: characters of B aligned to gaps.
        for j in range(1, columnCount):
            computationMatrix[0][j] = computationMatrix[0][j - 1] + scoreFunction("", sequenceB[j - 1])

        for i in range(1, rowCount):
            for j in range(1, columnCount):
                computationMatrix[i][j] = self.computeMinimum(sequenceA[i - 1], sequenceB[j - 1],
                                                              computationMatrix[i][j - 1],
                                                              computationMatrix[i - 1][j],
                                                              computationMatrix[i - 1][j - 1],
                                                              scoreFunction)
        return computationMatrix

    def computeMinimum(self, characterOfA, characterOfB, predecessorLeft, predecessorUp, predecessorDiagonal,
                       scoreFunction):
        """Computes the minimum of a given cell of the Needleman-Wunsch matrix.

        characterOfA: The character in sequence A at position i.
        characterOfB: The character in sequence B at position j.
        predecessorLeft: The value i, j-1 in the matrix.
        predecessorUp: The value i-1, j in the matrix.
        predecessorDiagonal: The value i-1, j-1 in the matrix.
        scoreFunction: The cost callable, see computeMatrix().

        Returns the smallest of the three candidate costs."""
        costUp = predecessorUp + scoreFunction(characterOfA, "")
        costDiagonal = predecessorDiagonal + scoreFunction(characterOfA, characterOfB)
        costLeft = predecessorLeft + scoreFunction("", characterOfB)
        return min(costUp, costDiagonal, costLeft)

    def traceback(self, sequenceA, sequenceB, computationMatrix, scoreFunction, maxOptimalSolutions=-1):
        """Computes the traceback for the Needleman-Wunsch matrix.

        sequenceA: A string with the first sequence.
        sequenceB: A string with the second sequence.
        computationMatrix: The matrix computed for the two sequences.
        scoreFunction: The cost callable that was used to compute the matrix.
        maxOptimalSolutions: Upper bound for the number of returned
            alignments; -1 (the default) returns all optimal alignments.

        Returns a list of alignments, each a list [alignmentOfA, alignmentOfB].
        Fewer than maxOptimalSolutions alignments are returned if fewer exist.
        Raises ValueError if a cell cannot be explained by any predecessor,
        i.e. the matrix does not belong to the sequences and score function.
        """
        # tracebackStack[n] is the n-th path, indiciesStack[n] the cell at
        # which that path has to be continued.
        tracebackStack = [[]]
        indiciesStack = [[len(computationMatrix) - 1, len(computationMatrix[0]) - 1]]

        pathIndex = 0
        while pathIndex < len(tracebackStack):
            if maxOptimalSolutions != -1 and pathIndex >= maxOptimalSolutions:
                # Enough paths are complete; later ones are never returned.
                break
            path = tracebackStack[pathIndex]
            i, j = indiciesStack[pathIndex]

            while i > 0 or j > 0:
                cell = computationMatrix[i][j]
                # Collect every predecessor that explains the cell value, in
                # the order left, up, diagonal.
                candidates = []
                if j > 0 and cell == computationMatrix[i][j - 1] + scoreFunction("", sequenceB[j - 1]):
                    candidates.append((pah.left, i, j - 1))
                if i > 0 and cell == computationMatrix[i - 1][j] + scoreFunction(sequenceA[i - 1], ""):
                    candidates.append((pah.up, i - 1, j))
                if i > 0 and j > 0 and cell == computationMatrix[i - 1][j - 1] + scoreFunction(sequenceA[i - 1],
                                                                                               sequenceB[j - 1]):
                    candidates.append((pah.diagonal, i - 1, j - 1))
                if not candidates:
                    raise ValueError("The matrix cell (%d, %d) has no valid predecessor." % (i, j))

                # Every additional candidate opens a new path which shares the
                # steps taken so far and is continued later.
                for direction, nextI, nextJ in candidates[1:]:
                    tracebackStack.append(path + [direction])
                    indiciesStack.append([nextI, nextJ])

                # The first candidate continues the current path.
                direction, i, j = candidates[0]
                path.append(direction)

            indiciesStack[pathIndex] = [0, 0]
            pathIndex += 1

        if maxOptimalSolutions == -1:
            finishedPaths = tracebackStack
        else:
            # Slicing never runs past the end of the list, so asking for more
            # alignments than exist simply returns all of them.
            finishedPaths = tracebackStack[:max(0, maxOptimalSolutions)]
        return [self.buildAlignment(finishedPath, sequenceA, sequenceB) for finishedPath in finishedPaths]

    def buildAlignment(self, tracebackStack, sequenceA, sequenceB):
        """Builds the alignment for one traceback path.

        The path is read from its last element to its first. Elements that
        match none of the known direction markers are ignored.

        tracebackStack: The computed traceback path as a list. The list is
            consumed: it is empty when the method returns.
        sequenceA: A string with the first sequence.
        sequenceB: A string with the second sequence.

        Returns a list [alignmentOfA, alignmentOfB]. If the path requests more
        characters than a sequence holds, "An error occured." is printed and
        the program exits with status 1.
        """
        i = 0
        j = 0
        alignmentOfA = []
        alignmentOfB = []
        try:
            for tracebackElement in reversed(tracebackStack):
                if pah.left == tracebackElement:
                    alignmentOfA.append("-")
                    alignmentOfB.append(sequenceB[j])
                    j += 1
                elif pah.up == tracebackElement:
                    alignmentOfA.append(sequenceA[i])
                    alignmentOfB.append("-")
                    i += 1
                elif pah.diagonal == tracebackElement:
                    alignmentOfA.append(sequenceA[i])
                    alignmentOfB.append(sequenceB[j])
                    i += 1
                    j += 1
        except IndexError:
            # Only an inconsistent path can run past the end of a sequence.
            # Exit with a non-zero status so that callers can detect the failure.
            print("An error occured.")
            sys.exit(1)

        # Keep the historical side effect: the path is used up by this call.
        del tracebackStack[:]

        # Characters that the path did not cover are appended unchanged.
        alignmentOfA.extend(sequenceA[i:])
        alignmentOfB.extend(sequenceB[j:])
        return ["".join(alignmentOfA), "".join(alignmentOfB)]

    def compute(self, sequences, scoreFunction, maxOptimalSolutions=-1):
        """Method to execute the Needleman-Wunsch algorithm.

        sequences: A list with two strings which represent the sequences.
        scoreFunction: The name of the weight function, as a string, defined
            in the class PairwiseAlignmentHelper.
        maxOptimalSolutions: Upper bound for the number of returned
            alignments; -1 (the default) returns all optimal alignments.

        Returns the list of alignments computed by traceback(). If the weight
        function does not exist, "Score function not found!" is printed and
        the program exits with status 1."""
        if scoreFunction in dir(pah) and callable(getattr(pah, scoreFunction)):
            # Look the bound method up directly instead of evaluating a string.
            scoreFunctionObj = getattr(pah(), scoreFunction)
        else:
            print("Score function not found!")
            sys.exit(1)
        computationMatrix = self.computeMatrix(sequences[0], sequences[1], scoreFunctionObj)
        return self.traceback(sequences[0], sequences[1], computationMatrix, scoreFunctionObj,
                              maxOptimalSolutions)
210 |
211 |
--------------------------------------------------------------------------------
/source/lib/pairwise/test/__init__.py:
--------------------------------------------------------------------------------
1 | from gotohTest import GotohTestClass
2 | from needlemanWunschTest import NeedlemanWunschTestClass
3 |
--------------------------------------------------------------------------------
/source/lib/pairwise/test/gotohTest.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2014 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 | #
12 | # Gotoh test class
13 | import unittest
14 | import os, sys
15 | lib_path = os.path.abspath('../../')
16 | sys.path.append(lib_path)
17 |
18 | from pairwise import Gotoh
19 | from helper import PairwiseAlignmentHelper as pah
20 | from helper import MathHelper as mathHelper
class GotohTestClass(unittest.TestCase):
    """Test class to test the correct computation of the Gotoh algorithm."""

    def test_computeMatrix(self):
        """Test method to test the correct computation of the three matrices D, P and Q."""
        a = "AGC"
        b = "AC"
        computedMatrixD = [[0 for i in range(len(b)+1)] for j in range(len(a)+1)]
        computedMatrixP = [[0 for i in range(len(b)+1)] for j in range(len(a)+1)]
        computedMatrixQ = [[0 for i in range(len(b)+1)] for j in range(len(a)+1)]
        # initialize the first column and the first row of the matrices
        for i in range(1, len(a)+1):
            computedMatrixD[i][0] = pah().gapCost(i)
            computedMatrixP[i][0] = mathHelper.NaN
            computedMatrixQ[i][0] = mathHelper.Inf
        for i in range(1, len(b)+1):
            computedMatrixD[0][i] = pah().gapCost(i)
            computedMatrixP[0][i] = mathHelper.Inf
            computedMatrixQ[0][i] = mathHelper.NaN

        # define values that should be computed by Gotoh algorithm
        # matrix D
        computedMatrixD[1][1] = 0
        computedMatrixD[2][1] = 3
        computedMatrixD[3][1] = 4

        computedMatrixD[1][2] = 3
        computedMatrixD[2][2] = 1
        computedMatrixD[3][2] = 3

        # matrix P
        computedMatrixP[1][1] = 6
        computedMatrixP[2][1] = 3
        computedMatrixP[3][1] = 4

        computedMatrixP[1][2] = 7
        computedMatrixP[2][2] = 6
        computedMatrixP[3][2] = 4

        # matrix Q
        computedMatrixQ[1][1] = 6
        computedMatrixQ[2][1] = 7
        computedMatrixQ[3][1] = 8

        computedMatrixQ[1][2] = 3
        computedMatrixQ[2][2] = 6
        computedMatrixQ[3][2] = 7

        computedMatrix = [computedMatrixD, computedMatrixP, computedMatrixQ]
        # check if the values computed by Gotoh are correct;
        # the method of the class Gotoh is named computeMatrix
        gotoh = Gotoh(a, b, "weightFunctionDifference", "gapCost")
        gotoh.computeMatrix()
        self.assertEqual(computedMatrix, gotoh.computationMatrix)

    def test_traceback(self):
        """Test method to test the correct computation of the traceback."""
        # test case with a single traceback
        a = "AGC"
        b = "AC"
        gotoh = Gotoh(a, b, "weightFunctionDifference", "gapCost")
        computedAlignment = [["AGC", "A-C"]]
        gotoh.computeMatrix()
        gotoh.traceback()
        self.assertEqual(computedAlignment, gotoh.computedAlignment)

        # test case with multiple tracebacks
        a = "CC"
        b = "ACCT"
        gotoh2 = Gotoh(a, b, "weightFunctionDifference", "gapCost")
        gotoh2.computeMatrix()
        computedAlignment = [["--CC", "ACCT"], ["CC--", "ACCT"]]
        gotoh2.traceback()
        self.assertEqual(computedAlignment, gotoh2.computedAlignment)


if __name__ == "__main__":
    unittest.main()  # run all tests
99 |
--------------------------------------------------------------------------------
/source/lib/pairwise/test/needlemanWunschTest.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2014 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 |
12 | # Test class for the Needleman-Wunsch algorithm
13 | # All test cases are written with PyUnit: http://pyunit.sourceforge.net/
14 |
15 | import unittest
16 | import os, sys
17 | lib_path = os.path.abspath('../../')
18 | sys.path.append(lib_path)
19 |
20 | from pairwise import NeedlemanWunsch as nw
21 | from helper import PairwiseAlignmentHelper as pah
22 |
class NeedlemanWunschTestClass(unittest.TestCase):
    """Class to test the correctness of the computation for the class NeedlemanWunsch."""

    def test_computeMatrix(self):
        """Test of the computation of the matrix."""
        a = "AGC"
        b = "AC"
        computedMatrix = [[0 for i in range(len(b)+1)] for j in range(len(a)+1)]

        # initialize the first column and the first row of the matrix
        for i in range(1, len(a)+1):
            computedMatrix[i][0] = computedMatrix[i-1][0] + pah().weightFunctionDifference("", a[i-1])
        for i in range(1, len(b)+1):
            computedMatrix[0][i] = computedMatrix[0][i-1] + pah().weightFunctionDifference("", b[i-1])

        # define values that should be computed by Needleman-Wunsch algorithm
        computedMatrix[1][1] = 0
        computedMatrix[2][1] = 1
        computedMatrix[3][1] = 2

        computedMatrix[1][2] = 1
        computedMatrix[2][2] = 1
        computedMatrix[3][2] = 1

        # check if the values computed by Needleman-Wunsch are correct;
        # the method of the class NeedlemanWunsch is named computeMatrix
        self.assertEqual(computedMatrix, nw().computeMatrix(a, b, pah().weightFunctionDifference))

    def test_traceback(self):
        """Test of the traceback computation."""
        # test case with a single traceback
        a = "AGC"
        b = "AC"
        computedAlignment = [["AGC", "A-C"]]
        computedMatrix = nw().computeMatrix(a, b, pah().weightFunctionDifference)
        self.assertEqual(computedAlignment,
                         nw().traceback(a, b, computedMatrix, pah().weightFunctionDifference))

        # test case with multiple tracebacks
        a = "AT"
        b = "AAGT"
        computedMatrix = nw().computeMatrix(a, b, pah().weightFunctionDifference)
        computedAlignment = [["A--T", "AAGT"], ["-A-T", "AAGT"]]
        self.assertEqual(computedAlignment,
                         nw().traceback(a, b, computedMatrix, pah().weightFunctionDifference))


if __name__ == "__main__":
    unittest.main()  # run all tests
69 |
--------------------------------------------------------------------------------
/source/lib/structurePrediction/__init__.py:
--------------------------------------------------------------------------------
1 | from nussinov import Nussinov
--------------------------------------------------------------------------------
/source/lib/structurePrediction/nussinov.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2015 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 | #
12 | # Nussinov algorithm
13 |
class Nussinov():
    """The algorithm of Nussinov is a RNA secondary structure folding algorithm. It was developed by Ruth Nussinov et al.
    and was published in 1978:
    Nussinov, Ruth, et al. "Algorithms for loop matchings."
    SIAM Journal on Applied mathematics 35.1 (1978): 68-82.
    http://rci.rutgers.edu/~piecze/GriggsNussinovKleitmanPieczenik.pdf

    Index convention: computationMatrix[i][j] holds the maximal number of
    base pairs within the slice sequence[i:j], i.e. the row is a 0-based start
    and the column an exclusive end. The positions stored in pairedBases are
    1-based.
    """

    # Ordered nucleotide pairs which are regarded as complementary.
    _COMPLEMENTARY_PAIRS = frozenset([("A", "U"), ("U", "A"), ("C", "G"), ("G", "C")])

    def __init__(self, rnaSequence):
        """rnaSequence: The RNA sequence for which the folding should be computed."""
        self.sequence = rnaSequence
        # maps the 1-based position of a base to the position of its partner
        self.pairedBases = {}
        self.computationMatrix = [[]]

    def computeMatrix(self):
        """This function computes the matrix which the Nussinov-algorithm is based on.
        The matrix has len(sequence) rows and len(sequence) + 1 columns and is
        filled diagonal by diagonal, shortest subsequences first."""
        length = len(self.sequence)
        self.computationMatrix = [[0] * (length + 1) for _ in range(length)]
        for span in range(2, length + 1):
            for start in range(0, length - span + 1):
                self.computeMatrixCell(start, start + span)

    def computeMatrixCell(self, i, j):
        """This function computes the value of one cell of the matrix for the Nussinov-algorithm.
        i: First index of cell of the Nussinov-matrix (0-based start of the subsequence)
        j: Second index of cell of the Nussinov-matrix (exclusive end of the subsequence)
        Every cell is the maximum of:
                        | N_(i, j-1)
        N_(i,j) = max   | max i <= k < j  N_(i, k-1) + N_(k+1, j-1) + 1
                        |                 S_k and S_j are complementary
        The formula uses 1-based, inclusive positions. With the index
        convention of the matrix the two summands are computationMatrix[i][k]
        and computationMatrix[k+1][j-1] for the 0-based partner index k.
        """
        lastBase = self.sequence[j - 1]
        # Case 1: the last base of the subsequence stays unpaired.
        maximumValue = self.computationMatrix[i][j - 1]
        # Case 2: the last base pairs with the base at index k. The last base
        # itself (k = j-1) is no candidate, it cannot pair with itself.
        for k in range(i, j - 1):
            if self.complementary(self.sequence[k], lastBase):
                pairingValue = self.computationMatrix[i][k] + self.computationMatrix[k + 1][j - 1] + 1
                if maximumValue < pairingValue:
                    maximumValue = pairingValue
        self.computationMatrix[i][j] = maximumValue

    def complementary(self, characterA, characterB):
        """Returns True if two RNA nucleotides are complementary, False otherwise.
        Nucleotides are complementary if they are "A" and "U" or "C" and "G".
        characterA: First nucleotide
        characterB: Second nucleotide"""
        return (characterA, characterB) in self._COMPLEMENTARY_PAIRS

    def traceback(self, i, j):
        """Computes the traceback for the Nussinov-algorithm and stores the
        base pairs found in self.pairedBases.
        i: First index of cell of the Nussinov-matrix
        j: Second index of cell of the Nussinov-matrix
        The matrix has to be computed with computeMatrix() beforehand.
        """
        # An explicit stack replaces the recursion, so long sequences cannot
        # exceed the recursion limit of the interpreter.
        pending = [(i, j)]
        while pending:
            start, end = pending.pop()
            # Drop the last base as long as it is unpaired.
            while end > start and self.computationMatrix[start][end] == self.computationMatrix[start][end - 1]:
                end -= 1
            if end <= start:
                continue
            # k is the 1-based position of the partner of base `end`; only
            # bases within the subsequence (start < k < end) are candidates.
            for k in range(start + 1, end):
                if not self.complementary(self.sequence[k - 1], self.sequence[end - 1]):
                    continue
                pairingValue = self.computationMatrix[start][k - 1] + self.computationMatrix[k][end - 1] + 1
                if self.computationMatrix[start][end] == pairingValue:
                    self.pairedBases[k] = end
                    # Continue inside the new pair and left of it.
                    pending.append((k, end - 1))
                    pending.append((start, k - 1))
                    break

    def execute(self):
        """To compute the Nussinov-algorithm execute this method. It prints the paired bases and
        their number and returns a dictionary with the paired bases."""
        self.computeMatrix()
        self.traceback(0, len(self.sequence))
        print(self.pairedBases)
        print(len(self.pairedBases))
        return self.pairedBases
--------------------------------------------------------------------------------
/source/lib/structurePrediction/test/nussinovTest.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # Copyright 2014 Joachim Wolff
3 | # Programming Course: Algorithms in Bioinformatics
4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi
5 | # Winter semester 2014/2015
6 | #
7 | # Chair of Bioinformatics
8 | # Department of Computer Science
9 | # Faculty of Engineering
10 | # Albert-Ludwig-University Freiburg im Breisgau
11 | #
12 | # Nussinov test class
13 | import unittest
14 | import os, sys
15 | lib_path = os.path.abspath('../../')
16 | sys.path.append(lib_path)
17 |
18 | from structurePrediction import Nussinov
19 |
class NussinovTestClass(unittest.TestCase):
    """Test class to test the correct computation of the Nussinov algorithm."""

    def test_computeMatrix(self):
        """Test of the computation of the matrix."""
        # example from the slides of Prof. Backofen
        expectedMatrix = [[0,0,1,1,1,2,2,2,3], [0,0,0,0,0,1,1,1,2], [0,0,0,0,0,1,1,1,2], [0,0,0,0,0,1,1,1,2], [0,0,0,0,0,0,0,1,1], [0,0,0,0,0,0,0,0,1], [0,0,0,0,0,0,0,0,1],[0,0,0,0,0,0,0,0,0]]
        rnaSequence = "GCACGACG"
        nussinov = Nussinov(rnaSequence)
        # the method of the class Nussinov is named computeMatrix
        nussinov.computeMatrix()
        self.assertEqual(expectedMatrix, nussinov.computationMatrix)

    def test_traceback(self):
        """Test of the traceback computation."""
        # example from the slides of Prof. Backofen
        expectedPairs = {1:2, 4:8, 5:7}
        rnaSequence = "GCACGACG"
        nussinov = Nussinov(rnaSequence)
        nussinov.computeMatrix()
        nussinov.traceback(0, len(rnaSequence))
        self.assertEqual(expectedPairs, nussinov.pairedBases)


if __name__ == "__main__":
    unittest.main()  # run all tests
--------------------------------------------------------------------------------
/source/sequences:
--------------------------------------------------------------------------------
1 | >sequence 0
2 | UUUGGUCCUCGGUAGUGGUUUCCGGAAAACGAUUUUCCGUGAACUUCGAUCGAAGAUCCAU
3 |
--------------------------------------------------------------------------------