├── README.md ├── Wolff_Presentations ├── Feng-Doolittle.pdf ├── Gotoh.pdf ├── Needleman-Wunsch n=3.pdf ├── Needleman-Wunsch.pdf ├── Nussinov.pdf ├── SumOfPairs.pdf └── UPGMA_WPGM.pdf ├── bin ├── algorithmsInBioinformatics.py ├── blosum62.txt └── fengDoolittle.fas ├── report.pdf └── source ├── .idea ├── .name ├── encodings.xml ├── misc.xml ├── modules.xml ├── scopes │ └── scope_settings.xml ├── source.iml ├── vcs.xml └── workspace.xml ├── lib ├── __init__.py ├── helper │ ├── IOHelper.py │ ├── __init__.py │ ├── mathHelper.py │ ├── multipleAlignmentHelper.py │ ├── pairwiseAlignmentHelper.py │ └── test │ │ └── IOHelperTest.py ├── multiple │ ├── __init__.py │ ├── fengDoolittle.py │ ├── needlemanWunschN3.py │ ├── sumOfPairs.py │ ├── test │ │ ├── fengDoolittleTest.py │ │ ├── needlemanWunschN3Test.py │ │ ├── sumOfPairsTest.py │ │ └── upgmaWpgmaTest.py │ └── upgmaWpgma.py ├── pairwise │ ├── __init__.py │ ├── gotoh.py │ ├── needlemanWunsch.py │ └── test │ │ ├── __init__.py │ │ ├── gotohTest.py │ │ └── needlemanWunschTest.py └── structurePrediction │ ├── __init__.py │ ├── nussinov.py │ └── test │ └── nussinovTest.py └── sequences /README.md: -------------------------------------------------------------------------------- 1 | # Algorithms In Bioinformatics 2 | To run the algorithms execute the file "algorithmsInBioinformatics.py" in the folder source/bin. 3 | 4 | ## Parameters 5 | 6 | #### Help 7 | -h, --help 8 | 9 | Show this help message and exit 10 | 11 | #### Algorithms 12 | 13 | -a {nw,gotoh,nw3,fengDoolittle,sumOfPairs,upgma,wpgma,nussinov}, 14 | 15 | --algorithm {nw,gotoh,nw3,fengDoolittle,sumOfPairs,upgma,wpgma,nussinov} 16 | 17 | Define which algorithm should be executed. Options are: 18 | 19 | * 'nw' for the algorithm of Needleman and Wunsch. 20 | * 'gotoh' for the algorithm of Osamu Gotoh. 21 | * 'nw3' for the Needleman-Wunsch algorithm with three sequences. 22 | * 'fengDoolittle' for the heuristic multiple sequence alignment algorithm by Da-Fei Feng and Russell F. 
Doolittle. 23 | * 'sumOfPairs' for the scoring of a multiple sequence alignment by Humberto Carrillo and David Lipman. 24 | * 'upgma' or 'wpgma' for a clustering method to generate phylogenetic trees. 25 | * 'nussinov' for the RNA secondary structure prediction algorithm by Ruth 26 | Nussinov. 27 | 28 | #### Input file 29 | 30 | -f INPUTFILE, --inputFile INPUTFILE 31 | 32 | Define the file in which the input sequences are defined. It has to be in fasta-format. 33 | 34 | #### Output file 35 | 36 | -o OUTPUTFILE, --outputFile OUTPUTFILE 37 | 38 | Define in which file the output should be written. If 39 | not defined, it is written to "outputFile.fas" in the 40 | local directory. 41 | 42 | #### Weight function 43 | 44 | -w WEIGHTFUNCTION, --weightFunction WEIGHTFUNCTION 45 | 46 | Name of a weight function defined in class 47 | PairwiseAlignmentHelper. 48 | 49 | #### Gap costs 50 | 51 | -gc GAPCOST, --gapCost GAPCOST 52 | 53 | Name of a gap function defined in class PairwiseAlignmentHelper. 54 | 55 | #### Number of solutions 56 | 57 | --numberOfSolutions NUMBEROFSOLUTIONS 58 | 59 | Define the number of optimal solutions the Needleman-Wunsch algorithm should compute. 60 | 61 | #### Output format 62 | 63 | --outputFormat {graphML,newickTree} 64 | 65 | Define the output format of the output file. This option is only used if you choose 'upgma' or 'wpgma' as an algorithm. Default is Newick tree. 66 | 67 | #### Similarity score 68 | 69 | --similarityScore SIMILARITYSCORE 70 | 71 | Name of a similarity score defined in class PairwiseAlignmentHelper. 72 | 73 | ## Support 74 | 75 | If you are having issues, please let me know. 
Mail adress: wolffj[at]informatik[dot]uni-freiburg[dot]de -------------------------------------------------------------------------------- /Wolff_Presentations/Feng-Doolittle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/Wolff_Presentations/Feng-Doolittle.pdf -------------------------------------------------------------------------------- /Wolff_Presentations/Gotoh.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/Wolff_Presentations/Gotoh.pdf -------------------------------------------------------------------------------- /Wolff_Presentations/Needleman-Wunsch n=3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/Wolff_Presentations/Needleman-Wunsch n=3.pdf -------------------------------------------------------------------------------- /Wolff_Presentations/Needleman-Wunsch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/Wolff_Presentations/Needleman-Wunsch.pdf -------------------------------------------------------------------------------- /Wolff_Presentations/Nussinov.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/Wolff_Presentations/Nussinov.pdf -------------------------------------------------------------------------------- /Wolff_Presentations/SumOfPairs.pdf: 
#! /usr/bin/python
# Copyright 2014 Joachim Wolff
# Programming Course: Algorithms in Bioinformatics
# Tutors: Robert Kleinkauf, Omer Alkhnbashi
# Winter semester 2014/2015
#
# Chair of Bioinformatics
# Department of Computer Science
# Faculty of Engineering
# Albert-Ludwig-University Freiburg im Breisgau
#
# main class

import argparse
import os
import sys

# The library directory is looked up as "../lib" relative to the current
# working directory.  os.path.join builds the same path on every platform, so
# lib_path is always defined (the former posix/nt switch left it unset on any
# other os.name and then failed with a NameError).
lib_path = os.path.abspath(os.path.join(os.pardir, "lib"))
sys.path.append(lib_path)

from helper import IOHelper as io
from helper import MultipleAlignmentHelper as mah
from pairwise import NeedlemanWunsch as nw
from pairwise import Gotoh
from multiple import NeedlemanWunschN3 as NW3
from multiple import UpgmaWpgma
from multiple import FengDoolittle
from multiple import SumOfPairs
from structurePrediction import Nussinov


def _withSuffix(fileName, suffix):
    """Return fileName, with suffix appended unless it already ends with it.

    This mirrors the (case-sensitive) rule the IOHelper writers apply to the
    names they are given, so that messages can name the file that was really
    written."""
    if fileName.endswith(suffix):
        return fileName
    return fileName + suffix


def main():
    """Parse the command line arguments and start the selected algorithm.

    The sequences are read from the file given with -f.  Every algorithm except
    'nussinov' needs at least two sequences ('nw3' exactly three); 'nussinov'
    needs exactly one.  When the sequence count does not fit, a message is
    printed and nothing is computed.  The process always ends via sys.exit."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--algorithm",
                        choices=["nw", "gotoh", "nw3", "fengDoolittle", "sumOfPairs", "upgma", "wpgma", "nussinov"],
                        required=True,
                        help="Define which algorithm should be executed. "
                        "\nOptions are: 'nw' for the algorithm of Needleman and Wunsch,\n"
                        "'gotoh' for the algorithm of Osamu Gotoh, \n"
                        "'nw3' for the Needleman-Wunsch algorithm with three sequences, \n"
                        "'fengDoolittle' for the heuristic multiple sequence alignment algorithm by Da-Fei Feng and Russell F. Doolittle,"
                        "'sumOfPairs' for the scoring of a multiple sequence alignment by Humberto Carrillo and David Lipman."
                        "'upgma' or 'wpgma' is a clustering method to generate phylogenetic trees, \n"
                        "'nussinov' for the RNA secondary structure prediction algorithm by Ruth Nussinov.")
    # The input file is mandatory: without it the reader used to be handed
    # None and crashed inside os.path.exists instead of reporting the problem.
    parser.add_argument("-f", "--inputFile", dest="inputFile", required=True,
                        help="Define the file in which the input sequences are defined. It have to be in fasta-format.")
    parser.add_argument("-o", "--outputFile", help="Define in which file the output should be written. "
                        "If not defined, it is written to \"outputFile.fas\" in the local directory.")
    parser.add_argument("-gc", "--gapCost", dest="gapCost",
                        help="Name of a gap function definde in class PairwiseAligmentHelper.")
    # type=int rejects non-numeric values at parse time; needlemanWunsch()
    # converts with int() anyway, so valid input behaves as before.
    parser.add_argument("--numberOfSolutions", dest="numberOfSolutions", type=int,
                        help="Define the number of optimal solutions the Needleman-Wunsch algorithm should compute.")
    parser.add_argument("--outputFormat", dest="outputFormat", choices=["graphML", "newickTree"],
                        help="Define the output format of the output file. "
                        "This function is only parsed if you choose 'upgma' or 'wpgma' as an algorithm. Default is"
                        " Newick tree")
    parser.add_argument("--scoring", dest="similarityScore",
                        help="Name of a similarity score defined in class PairwiseAligmentHelper. Per default "
                        "\"pam\" and \"blosum\" (pam250 and blosum62) are implemented. Feel free to extend, you can find the "
                        "file \"PairwiseAligmentHelper.py\" in lib/helper. If this option is not defined, the pam250 matrix is choosen.")
    # NOTE(review): the gap penalty is handed to SumOfPairs as the raw command
    # line string; SumOfPairs is not visible here - confirm it converts it.
    parser.add_argument("--gapPenalty", dest="gapPenalty", help="Define a gap penalty. Default for pam is 8 and blosum 6.")
    args = parser.parse_args()

    algorithm = args.algorithm
    # "" means "not given"; each algorithm below substitutes its own default.
    outputFile = args.outputFile or ""
    # Every algorithm that takes a scoring name falls back to "pam".
    scoring = args.similarityScore or "pam"

    sequences = getSequencesFromFile(args.inputFile)

    if not sequences:
        print("No sequences in input file defined.")

    # structure prediction
    elif algorithm == "nussinov":
        if len(sequences) == 1:
            nussinov(sequences[0:1], outputFile or "nussinov.dotBracket")
        else:
            # Previously this combination fell through without any output.
            print("Wrong number of input sequences. Nussinov needs exactly one sequence;  %d  sequences are given."
                  % len(sequences))

    elif len(sequences) == 1:
        print("You have defined only one input sequence, but your defined algorithm ' %s ' needs at least two sequences."
              % algorithm)

    # pairwise alignment
    elif algorithm == "nw":
        numberOfSolutions = -1
        if args.numberOfSolutions is not None:
            numberOfSolutions = args.numberOfSolutions
        needlemanWunsch(sequences[0:2], scoreFunction=scoring,
                        outputFile=outputFile or "needlemanWunsch.fas",
                        numberOfSolutions=numberOfSolutions)

    elif algorithm == "gotoh":
        gotoh(sequences[0:2], scoreFunction=scoring,
              costFunction=args.gapCost or "gapCost",
              outputFile=outputFile or "gotoh.fas")

    # multiple alignment
    elif algorithm in ("upgma", "wpgma"):
        # Newick is the default; only an explicit "graphML" switches it off.
        newickTree = args.outputFormat != "graphML"
        upgmaWpgma(algorithm == "upgma", sequences, outputFile or algorithm, newickTree)

    elif algorithm == "fengDoolittle":
        fengDoolittle(sequences, scoring, scoring, outputFile or "fengDoolittle.fas")

    elif algorithm == "sumOfPairs":
        if args.gapPenalty:
            sumOfPairs(sequences, scoring, args.gapPenalty)
        else:
            sumOfPairs(sequences, scoring)

    elif algorithm == "nw3":
        if len(sequences) != 3:
            print("Wrong number of input sequences. Needleman-Wunsch n=3 needs exactly three sequences;  %d  sequences are given."
                  % len(sequences))
            sys.exit()
        needlemanWunschN3(sequences[0:3], weightFunction=scoring, outputFile=outputFile or "nw3.fas")

    sys.exit(0)


def getSequencesFromFile(inputFile):
    """Parse the input file to get the sequences. Returns the sequences as an array.
    inputFile: A fasta format file with the input sequences."""
    return io().readFastaFile(inputFile)


def needlemanWunsch(sequences, scoreFunction, outputFile, numberOfSolutions):
    """Executes the Needleman-Wunsch algorithm on the given sequences, prints the score, the number of
    optimal solutions and one solution, and writes all computed alignments to outputFile in fasta format.
    sequences: The input sequences to align.
    scoreFunction: The name of the score function; it is passed on unchanged to the Needleman-Wunsch class.
    outputFile: The path to the output file.
    numberOfSolutions: Maximal number of optimal solutions which should be computed (converted with int())."""
    print("\nThe following sequences are given:")
    for sequence in sequences:
        print(sequence)
    print("\nComputing solution...\n\n")
    # result[0] holds the alignments, result[1] the score.
    result = nw().compute(sequences, scoreFunction, int(numberOfSolutions), scoringValue=True)
    alignments = result[0]
    print("\nScore:  %s" % (result[1],))
    print("Number of optimal solutions:  %d" % len(alignments))
    print("\nOne solution is:\n%s \n%s" % (alignments[0][0], alignments[0][1]))
    # Name the file that is really written instead of the hard-coded default.
    print("\nFor more solutions look in the file \"%s\".\n" % os.path.abspath(_withSuffix(outputFile, ".fas")))
    io().writeFastaFile(alignments, outputFile)


def gotoh(sequences, scoreFunction="weightFunctionDifference", costFunction="gapCost", outputFile="gotoh.fas"):
    """Executes the Gotoh algorithm on the first two sequences, writes all computed alignments to outputFile
    in fasta format and prints the number of solutions, the score and one solution.
    scoreFunction: The name of the score function; it is passed on unchanged to the Gotoh class.
    costFunction: The name of the gap cost function; it is passed on unchanged to the Gotoh class.
    outputFile: The path to the output file.
    """
    print("The following sequences are given:")
    for sequence in sequences:
        print(sequence)
    print("Computing solution...")
    aligner = Gotoh(sequences[0], sequences[1], scoreFunction, costFunction)
    result = aligner.compute()
    io().writeFastaFile(result, outputFile)
    print("Number of solutions:  %d" % len(result))
    # The score is the largest bottom-right entry of the three computation matrices.
    matrices = aligner.computationMatrix
    score = max(matrices[0][-1][-1], matrices[1][-1][-1], matrices[2][-1][-1])
    print("Score: %s" % (score,))
    print("One solution is:\n%s \n%s" % (result[0][0], result[0][1]))
    print("For more solutions look in the file \"%s\"." % os.path.abspath(_withSuffix(outputFile, ".fas")))


def needlemanWunschN3(sequences, weightFunction="weightFunctionDifference", outputFile="nw3.fas"):
    """Executes the Needleman-Wunsch algorithm with three sequences, writes all computed alignments to
    outputFile in fasta format and prints the score, the number of optimal solutions and one solution.
    weightFunction: The name of the weight function; it is passed on unchanged to the NeedlemanWunschN3 class.
    outputFile: The path to the output file."""
    print("\nThe following sequences are given:")
    for sequence in sequences:
        print(sequence)
    print("\nComputing solution...\n\n")
    nw3 = NW3(sequences[0], sequences[1], sequences[2], weightFunction)
    result = nw3.execute()

    io().writeFastaFile(result, outputFile)
    print("\nScore:  %s" % (nw3.computation_matrix[-1][-1][-1],))
    print("Number of optimal solutions:  %d" % len(result))
    print("\nOne solution is:\n%s \n%s \n%s" % (result[0][0], result[0][1], result[0][2]))
    print("\nFor more solutions look in the file \"%s\".\n" % os.path.abspath(_withSuffix(outputFile, ".fas")))


def upgmaWpgma(upgmaWpgma, sequences, outputFile, fileFormat):
    """Executes a phylogenetic clustering with upgma or wpgma weighting and writes the result to a file.
    upgmaWpgma: True to run upgma, False to run wpgma.
    sequences: All defined input sequences as a list.
    outputFile: The name of the output file; the format specific extension is appended.
    fileFormat: True to write a Newick tree, False to write a graphML file."""
    print("The following sequences are given:")
    for sequence in sequences:
        print(sequence)
    print("Computing clustering...")
    data = mah().createDataForUpgmaWpgma(sequences)
    if upgmaWpgma:
        methodName = "upgma"
        clustering = UpgmaWpgma(data[0], len(data[1]))
    else:
        methodName = "wpgma"
        clustering = UpgmaWpgma(data[0], len(data[1]), False, data[2])
    clustering.compute_clustering()

    if not fileFormat:
        outputFile += ".graphML"
        io().writeGraphMLFile(clustering.mapping, outputFile)
        # writeGraphMLFile appends ".graphml" itself (its check is case
        # sensitive), so report the name it really used.
        print("Clustering written as graphML file:  %s" % os.path.abspath(_withSuffix(outputFile, ".graphml")))
    else:
        outputFile += ".newickTree"
        cluster = clustering.get_newick_tree(with_edge_weights=True)
        io().writeNewickTree(cluster, outputFile)
        print("Computed %s cluster:  %s" % (methodName, cluster))
        print("The clustering was also written to:  %s" % os.path.abspath(outputFile))


def nussinov(sequence, outputFile):
    """Executes the RNA-folding algorithm from Nussinov and writes the result in dot-bracket notation.
    sequence: The RNA-sequence as a list; only the first element is used.
    outputFile: The name of the output file."""
    print("\nThe following sequence is given:")
    print(sequence[0])
    print("\n")
    folding = Nussinov(sequence[0])
    folding.execute()
    print("\nDot-bracket: ")
    io().writeRnaDotBracketNotation(sequence[0], folding.pairedBases, outputFile)
    print("The result was also written to:  %s" % os.path.abspath(outputFile))


def sumOfPairs(sequences, scoringFunction, gapPenalty=-1):
    """This method scores a multiple sequence alignment with the sum of pairs algorithm and prints the score.
    sequences: The multiple sequence alignment.
    scoringFunction: Name of a similarity score; it is passed on unchanged to the SumOfPairs class.
    gapPenalty: The gap penalty; -1 means "not given", then SumOfPairs is created without one."""
    print("The following sequences are given:")
    for sequence in sequences:
        print(sequence)
    if gapPenalty == -1:
        sof = SumOfPairs(sequences, scoringFunction)
    else:
        sof = SumOfPairs(sequences, scoringFunction, gapPenalty)
    print("Sum-of-pairs scoring:  %s" % (sof.execute(),))


def fengDoolittle(sequences, weightFunction, similarityScore, outputFile):
    """Executes the heuristic multiple sequence alignment by Feng and Doolittle, writes the alignment to
    outputFile in fasta format, prints it and prints its sum-of-pairs score.
    sequences: All input sequences to align.
    weightFunction: Name of the weight function; passed on to the FengDoolittle class and used for the sum-of-pairs scoring.
    similarityScore: Name of a similarity score; passed on to the FengDoolittle class.
    outputFile: The output file name."""
    fd = FengDoolittle(sequences, weightFunction, similarityScore)
    alignmentDict = fd.computeMultipleAlignment()
    alignedSequences = [alignmentDict[key] for key in alignmentDict]
    # writeFastaFile expects a list of alignments; there is exactly one here.
    io().writeFastaFile([alignedSequences], outputFile)
    print("Input sequences:\n")
    for sequence in sequences:
        print(sequence)
    print("\nAlignment:")
    for aligned in alignedSequences:
        print(aligned)
    # sumOfPairs prints the score itself and returns nothing; printing its
    # return value only added a stray "None" line to the output.
    sumOfPairs(alignedSequences, weightFunction)


if __name__ == "__main__":
    main()
-------------------------------------------------------------------------------- /bin/blosum62.txt: -------------------------------------------------------------------------------- 1 | # Matrix made by matblas from blosum62.iij 2 | # * column uses minimum score 3 | # BLOSUM Clustered Scoring Matrix in 1/2 Bit Units 4 | # Blocks Database = /data/blocks_5.0/blocks.dat 5 | # Cluster Percentage: >= 62 6 | # Entropy = 0.6979, Expected = -0.5209 7 | 8 | A R N D C Q E G H I L K M F P S T W Y V * 9 | A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -4 10 | R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -4 11 | N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 -4 12 | D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 -4 13 | C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -4 14 | Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 -4 15 | E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 -4 16 | G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -4 17 | H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 -4 18 | I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -4 19 | L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 20 | K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 -4 21 | M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -4 22 | F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -4 23 | P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -4 24 | S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 -4 25 | T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -4 26 | W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 27 | Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -4 28 | V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -4 29 | * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 -------------------------------------------------------------------------------- /bin/fengDoolittle.fas: 
-------------------------------------------------------------------------------- 1 | >Alignment 0 sequence 0 2 | ILDXXXMDVVEGSAARFDCKVEGXXXYPDPEVMWFKDDNPXXXXXXVKXXXXXESRHXXFQIDYXXDEXXEGXXXN 3 | >Alignment 0 sequence 1 4 | RRLXXIPAARGGEISILCQPRAXXAPKATILWSKGTEIXXXXLGXXXXXXNSTXXXXRVTVXXXXTXXXXSXXXXD 5 | >Alignment 0 sequence 2 6 | XXRDPXVKTHEGWGVMLPCNPPAHXYPGLSYRWLLNEFPXXNFIPXXXTDGXXRHFXXVSXXXXQXXTXXXXTXXX 7 | >Alignment 0 sequence 3 8 | ISDXXTEADIGSNLRWGCAAAGXXKPRPMVRWLRNGEPXXXXLAXXXXXXSQNXXXXRVEVXXXXLXXXXAXXXXXX 9 | -------------------------------------------------------------------------------- /report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/report.pdf -------------------------------------------------------------------------------- /source/.idea/.name: -------------------------------------------------------------------------------- 1 | source -------------------------------------------------------------------------------- /source/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /source/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /source/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /source/.idea/scopes/scope_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 
-------------------------------------------------------------------------------- /source/.idea/source.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /source/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /source/.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 14 | 15 | 16 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 34 | 35 | 36 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 71 | 72 | 75 | 76 | 79 | 80 | 81 | 82 | 85 | 86 | 89 | 90 | 91 | 92 | 95 | 96 | 99 | 100 | 103 | 104 | 105 | 106 | 109 | 110 | 113 | 114 | 117 | 118 | 121 | 122 | 123 | 124 | 127 | 128 | 131 | 132 | 135 | 136 | 139 | 140 | 141 | 142 | 145 | 146 | 149 | 150 | 153 | 154 | 157 | 158 | 159 | 160 | 163 | 164 | 167 | 168 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 1422826067293 199 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 233 | 236 | 237 | 238 | 240 | 241 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 
| 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | -------------------------------------------------------------------------------- /source/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/source/lib/__init__.py -------------------------------------------------------------------------------- /source/lib/helper/IOHelper.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # Copyright 2015 Joachim Wolff 3 | # Programming Course: Algorithms in Bioinformatics 4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi 5 | # Winter semester 2014/2015 6 | # 7 | # Chair of Bioinformatics 8 | # Department of Computer Science 9 | # Faculty of Engineering 10 | # Albert-Ludwig-University Freiburg im Breisgau 11 | import os 12 | class IOHelper(): 13 | """Helper class for reading an writing files in different formats.""" 14 | def readFastaFile(self, inputFileName): 15 | """Reads a given fasta file and returns it as a array. 16 | 17 | inputFileName: The path (relative or absolut) to the input fasta file.""" 18 | sequence = [] 19 | if not os.path.exists(inputFileName): 20 | return sequence 21 | 22 | fileToRead = open(inputFileName, "r") 23 | i = 0 24 | for line in fileToRead.readlines(): 25 | if line.startswith(">"): 26 | continue 27 | sequence.append(line.strip("\n")) 28 | i += 1 29 | fileToRead.close() 30 | return sequence 31 | 32 | def writeFastaFile(self, sequences, outputFileName): 33 | """Writes a the given sequences to a file in the fasta format. 34 | sequences: All computed alignemnts. 35 | A list of lists with two elements: [[,],...,[,]]. 
36 | outputFileName: The path (relative or absolut) and the output file name. 37 | e.g.: "/path/to/file" or "file" to write it in the local directory.""" 38 | if not outputFileName.endswith(".fas"): 39 | outputFileName += str(".fas") 40 | fileToWrite = open(outputFileName, "w") 41 | i = 0 42 | while i < len(sequences): 43 | for sequence in sequences[i]: 44 | fileToWrite.write('>Alignment '+ str(i) +' sequence ' + str(sequences[i].index(sequence)) + '\n') 45 | fileToWrite.write(sequence + '\n') 46 | i += 1 47 | fileToWrite.close() 48 | 49 | def writeGraphMLFile(self, clusteredNodesDictionary, outputFileName): 50 | """Writes a tree computed by the UpgmaWpgma class in graphML-format to specified outputFileName.""" 51 | if not outputFileName.endswith(".graphml"): 52 | outputFileName += str(".graphml") 53 | fileToWrite = open(outputFileName, "w") 54 | fileToWrite.write("" 55 | +"\n" 59 | +"\n\t\n") 60 | for i in clusteredNodesDictionary: 61 | nodes = i.split(" ") 62 | fileToWrite.write("\t\t\n") 63 | fileToWrite.write("\t\t\n") 64 | fileToWrite.write("\t\t\n") 65 | j = 0 66 | for i in clusteredNodesDictionary: 67 | nodes = i.split(" ") 68 | fileToWrite.write("\t\t\n") 69 | j += 1 70 | fileToWrite.write("\t\t\n") 71 | j += 1 72 | 73 | fileToWrite.write("\t\n") 74 | fileToWrite.close() 75 | def writeRnaDotBracketNotation(self, sequence, pairedBases, outputFileName): 76 | """Writes a given RNA sequence and the computed matching bases in dot-bracket notation to the file outputFileName.""" 77 | stack = {} 78 | for i in range (0, len(sequence)): 79 | if i in pairedBases: 80 | stack[i] = "(" 81 | stack[pairedBases[i]] = ")" 82 | else: 83 | if not i in stack: 84 | stack[i] = "." 
85 | fileToWrite = open(outputFileName, "w") 86 | fileToWrite.write(sequence+"\n") 87 | for i in sorted(stack): 88 | fileToWrite.write(stack[i]) 89 | def writeNewickTree(self, newickTree, outputFileName): 90 | fileToWrite = open(outputFileName, "w") 91 | fileToWrite.write(newickTree) 92 | -------------------------------------------------------------------------------- /source/lib/helper/__init__.py: -------------------------------------------------------------------------------- 1 | from IOHelper import IOHelper 2 | from mathHelper import MathHelper 3 | from pairwiseAlignmentHelper import PairwiseAlignmentHelper 4 | from multipleAlignmentHelper import MultipleAlignmentHelper -------------------------------------------------------------------------------- /source/lib/helper/mathHelper.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # Copyright 2014 Joachim Wolff 3 | # Programming Course: Algorithms in Bioinformatics 4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi 5 | # Winter semester 2014/2015 6 | # 7 | # Chair of Bioinformatics 8 | # Department of Computer Science 9 | # Faculty of Engineering 10 | # Albert-Ludwig-University Freiburg im Breisgau 11 | # 12 | # A math helper class. Some constants are defined here. 13 | class MathHelper(): 14 | """MathHelper class. Some constants are defined.""" 15 | Inf = 1e3000 16 | NaN = 0 * Inf -------------------------------------------------------------------------------- /source/lib/helper/multipleAlignmentHelper.py: -------------------------------------------------------------------------------- 1 | #! 
class MultipleAlignmentHelper():
    """Helper class for the algorithms that work on more than two sequences."""

    # Integer codes naming which of the sequences A, B, C carry a gap.
    noGap = 0
    gapA = 1
    gapB = 2
    gapC = 3
    gapAB = 4
    gapBC = 5
    gapAC = 6

    def weightFunctionDifference(self, a, b, c):
        """Weight function: 0 if all three symbols are equal, 1 if exactly two are equal, 2 else."""
        if a == b and b == c:
            return 0
        if a == b or b == c or a == c:
            return 1
        return 2

    def createDataForUpgmaWpgma(self, sequences):
        """Preprocessing of the sequences for the upgma/wpgma algorithm.

        Returns a list of three dictionaries:
        [0] the difference score of every pair i < j under the key "i j";
            positions are compared one by one and a shorter sequence is
            padded with "-",
        [1] a mapping from each sequence to its index in sequences,
        [2] a mapping from each index to the length of its sequence."""
        sequenceToIdMapping = {}
        sequenceToLengthMapping = {}
        # NOTE(review): identical sequences share one key in the first mapping,
        # so it can hold fewer entries than there are sequences - confirm the
        # callers are fine with that.
        for identifier, sequence in enumerate(sequences):
            sequenceToIdMapping[sequence] = identifier
            sequenceToLengthMapping[identifier] = len(sequence)

        differenceDictionary = {}
        count = len(sequences)
        for first in range(count):
            for second in range(first + 1, count):
                left = sequences[first]
                right = sequences[second]
                difference = pah().weightFunctionDifference
                overlap = min(len(left), len(right))
                score = 0
                # positions present in both sequences
                for position in range(overlap):
                    score += difference(left[position], right[position])
                # at most one of the two tails is non-empty; it faces gaps
                for symbol in left[overlap:]:
                    score += difference(symbol, "-")
                for symbol in right[overlap:]:
                    score += difference("-", symbol)
                differenceDictionary[str(first) + " " + str(second)] = score
        return [differenceDictionary, sequenceToIdMapping, sequenceToLengthMapping]
class PairwiseAlignmentHelper():
    """Class to support the pairwise alignment algorithms Needleman-Wunsch and Gotoh."""

    # NOTE(review): none of the constants below is used inside this class.
    # They look like traceback / matrix markers for the pairwise algorithms;
    # confirm their meaning against needlemanWunsch.py and gotoh.py.
    diagonalD = 0
    dotQ = 1
    dotP = 2
    upD = 3
    upP = 4
    leftD = 5
    leftQ = 6
    matrixIndexD = 0
    matrixIndexP = 1
    matrixIndexQ = 2
    left = 0
    up = 1
    diagonal = 2

    # PAM250 table, addressed as _PAM250[row of a][column of b] with the
    # amino-acid order given by _PAM250_INDEX. Built once at class creation;
    # pam250() is called per aligned character, so it must not rebuild it.
    # Source: http://www.icp.ucl.ac.be/~opperd/private/pam250.html
    # NOTE(review): columns A and R each sum to about 100, but the 72 in the
    # last row makes column W sum to 172. Possibly a transcription error;
    # verify against the cited source before changing the value.
    _PAM250 = (
        (13, 6, 9, 9, 5, 8, 9, 12, 6, 8, 6, 7, 7, 4, 11, 11, 11, 2, 4, 9),
        (3, 17, 4, 3, 2, 5, 3, 2, 6, 3, 2, 9, 4, 1, 4, 4, 3, 7, 2, 2),
        (4, 4, 6, 7, 2, 5, 6, 4, 6, 3, 2, 5, 3, 2, 4, 5, 4, 2, 3, 3),
        (5, 4, 8, 11, 1, 7, 10, 5, 6, 3, 2, 5, 3, 1, 4, 5, 5, 1, 2, 3),
        (2, 1, 1, 1, 52, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 3, 2, 1, 4, 2),
        (3, 5, 5, 6, 1, 10, 7, 3, 7, 2, 3, 5, 3, 1, 4, 3, 3, 1, 2, 3),
        (5, 4, 7, 11, 1, 9, 12, 5, 6, 3, 2, 5, 3, 1, 4, 5, 5, 1, 2, 3),
        (12, 5, 10, 10, 4, 7, 9, 27, 5, 5, 4, 6, 5, 3, 8, 11, 9, 2, 3, 7),
        (2, 5, 5, 4, 2, 7, 4, 2, 15, 2, 2, 3, 2, 2, 3, 3, 2, 2, 3, 2),
        (3, 2, 2, 2, 2, 2, 2, 2, 2, 10, 6, 2, 6, 5, 2, 3, 4, 1, 3, 9),
        (6, 4, 4, 3, 2, 6, 4, 3, 5, 15, 34, 4, 20, 13, 5, 4, 6, 6, 7, 13),
        (6, 18, 10, 8, 2, 10, 8, 5, 8, 5, 4, 24, 9, 2, 6, 8, 8, 4, 3, 5),
        (1, 1, 1, 1, 0, 1, 1, 1, 1, 2, 3, 2, 6, 2, 1, 1, 1, 1, 1, 2),
        (2, 1, 2, 1, 1, 1, 1, 1, 3, 5, 6, 1, 4, 32, 1, 2, 2, 4, 20, 3),
        (7, 5, 5, 4, 3, 5, 4, 5, 5, 3, 3, 4, 3, 2, 20, 6, 5, 1, 2, 4),
        (9, 6, 8, 7, 7, 6, 7, 9, 6, 5, 4, 7, 5, 3, 9, 10, 9, 4, 4, 6),
        (8, 5, 6, 6, 4, 5, 5, 6, 4, 6, 4, 6, 5, 3, 6, 8, 11, 2, 3, 6),
        (0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 55, 1, 0),
        (1, 1, 2, 1, 3, 1, 1, 1, 3, 2, 2, 1, 2, 15, 1, 2, 2, 3, 31, 2),
        (7, 4, 4, 4, 4, 4, 4, 4, 5, 4, 15, 10, 4, 10, 5, 5, 5, 72, 4, 17),
    )

    # One-letter amino-acid code -> row/column index in _PAM250.
    _PAM250_INDEX = {"A": 0, "R": 1, "N": 2, "D": 3, "C": 4, "Q": 5, "E": 6,
                     "G": 7, "H": 8, "I": 9, "L": 10, "K": 11, "M": 12,
                     "F": 13, "P": 14, "S": 15, "T": 16, "W": 17, "Y": 18,
                     "V": 19}

    def weightFunctionDifference(self, a, b):
        """Weight function with 0 if a == b and 1 else."""
        return 0 if a == b else 1

    def gapCost(self, x):
        """Returns a gap cost of g(x) = 2 + x."""
        return 2 + x

    def pam250(self, a, b):
        """Returns the PAM250 table value for the amino acids a and b.

        a, b: One-letter amino-acid codes (upper case).

        If either symbol is not one of the 20 known codes (for example a
        gap), 1 is returned.
        """
        index = self._PAM250_INDEX
        if a in index and b in index:
            return self._PAM250[index[a]][index[b]]
        return 1
/usr/bin/python 2 | # Copyright 2014 Joachim Wolff 3 | # Programming Course: Algorithms in Bioinformatics 4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi 5 | # Winter semester 2014/2015 6 | # 7 | # Chair of Bioinformatics 8 | # Department of Computer Science 9 | # Faculty of Engineering 10 | # Albert-Ludwig-University Freiburg im Breisgau 11 | 12 | import unittest 13 | import os, sys 14 | lib_path = os.path.abspath('../../') 15 | sys.path.append(lib_path) 16 | from helper import IOHelper as io 17 | 18 | class IOHelperTestClass(unittest.TestCase): 19 | """Test class to check the correctness of the methods in IOHelper.""" 20 | def test_readFastaFile(self): 21 | """Test method to test the correct reading of a fasta file.""" 22 | if os.path.exists("testReadFasta.fas"): 23 | os.remove("testReadFasta.fas") 24 | 25 | # first test case: two sequences 26 | sequenceToWrite = [["ACGT", "ACGTAATTA"]] 27 | expectedSequence = ["ACGT", "ACGTAATTA"] 28 | io().writeFastaFile(sequenceToWrite, "testReadFasta.fas") 29 | readSequence = io().readFastaFile("testReadFasta.fas", multipleSequenceAlignment=False) 30 | self.assertEqual(expectedSequence, readSequence) 31 | 32 | # second test case: two sequences but there are multilpe ones 33 | sequenceToWrite = [["ACGT", "ACGTAATTA", "AGTTG"]] 34 | expectedSequence = ["ACGT", "ACGTAATTA", "AGTTG"] 35 | io().writeFastaFile(sequenceToWrite, "testReadFasta.fas") 36 | readSequence = io().readFastaFile("testReadFasta.fas", multipleSequenceAlignment=False) 37 | self.assertNotEqual(expectedSequence, readSequence) 38 | 39 | # third test case: multiple sequences 40 | readSequence = io().readFastaFile("testReadFasta.fas", multipleSequenceAlignment=True) 41 | self.assertEqual(expectedSequence, readSequence) 42 | 43 | os.remove("testReadFasta.fas") 44 | 45 | def test_writeFastaFile(self): 46 | """Test method to test the correct writing of a fasta file.""" 47 | if os.path.exists("testWriteFasta.fas"): 48 | os.remove("testWriteFasta.fas") 49 | sequence = 
[["ACGT", "ACGTAATTA"]] 50 | expectedReadSequence = [">Alignment 0 sequence 0", "ACGT", ">Alignment 0 sequence 1", "ACGTAATTA"] 51 | readInputSequence = [] 52 | 53 | # first test case, filename with extension 54 | io().writeFastaFile(sequence, "testWriteFasta.fas") 55 | testInputFile = open("testWriteFasta.fas") 56 | for line in testInputFile.readlines(): 57 | readInputSequence.append(line.strip("\n")) 58 | self.assertEqual(expectedReadSequence, readInputSequence) 59 | testInputFile.close() 60 | os.remove("testWriteFasta.fas") 61 | 62 | # second test case, filename without extension 63 | readInputSequence = [] 64 | io().writeFastaFile(sequence, "testWriteFasta") 65 | testInputFile = open("testWriteFasta.fas") 66 | for line in testInputFile.readlines(): 67 | readInputSequence.append(line.strip("\n")) 68 | self.assertEqual(expectedReadSequence, readInputSequence) 69 | testInputFile.close() 70 | os.remove("testWriteFasta.fas") 71 | 72 | if __name__ == "__main__": 73 | unittest.main() # run all tests -------------------------------------------------------------------------------- /source/lib/multiple/__init__.py: -------------------------------------------------------------------------------- 1 | from needlemanWunschN3 import NeedlemanWunschN3 2 | from upgmaWpgma import UpgmaWpgma 3 | from fengDoolittle import FengDoolittle 4 | from sumOfPairs import SumOfPairs -------------------------------------------------------------------------------- /source/lib/multiple/fengDoolittle.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # Copyright 2015 Joachim Wolff 3 | # Programming Course: Algorithms in Bioinformatics 4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi 5 | # Winter semester 2014/2015 6 | # 7 | # Chair of Bioinformatics 8 | # Department of Computer Science 9 | # Faculty of Engineering 10 | # Albert-Ludwig-University Freiburg im Breisgau 11 | import sys 12 | from pairwise import NeedlemanWunsch 13 | from math import log10 14 | from helper import PairwiseAlignmentHelper as pah 15 | from multiple import UpgmaWpgma 16 | import random 17 | 18 | 19 | class FengDoolittle(): 20 | """This class computes the Feng-Doolittle algorithm by Da-Fei Feng and Russell F. Doolittle: 21 | Feng, Da-Fei, and Russell F. Doolittle. 22 | "Progressive sequence alignment as a prerequisitetto correct phylogenetic trees." 23 | Journal of molecular evolution 25.4 (1987): 351-360. 24 | http://dna.bio.puc.cl/cardex/papersbio252/Grupo06-2013.pdf""" 25 | def __init__(self, sequences, weightFunction, similarityScore): 26 | """To initialize an object of class FengDoolittle you have to define: 27 | sequences: A list of sequences for the multiple alignment. 28 | weightFunction: A string containing the name of your preferred weight function. 29 | The weight function have to be defined in package helper, class PairwiseAlignmentHelper. 30 | similarityScore: A string containing the name of your preferred similarity score like pam250. 31 | The similarity score have to be defined in package helper, class PairwiseAlignmentHelper.""" 32 | self.sequences = sequences 33 | self.alignments = [] 34 | self.weightFunction = weightFunction 35 | if similarityScore in dir(pah) and callable(getattr(pah, similarityScore)): 36 | similarityScoreObj = eval('pah().' + similarityScore) 37 | else: 38 | print "Score function not found!" 
39 | sys.exit() 40 | self.similarityScore = similarityScoreObj 41 | self.alignmentToIndexMapping = {} 42 | self.sequenceToIndexMapping = {} 43 | self.distanceDictionary = {} 44 | self.newickTree = "" 45 | self.orderToAlign = [] 46 | 47 | def computeAlignments(self): 48 | """This function computes all pairwise alignments between every sequence with the Needleman-Wunsch algorithm.""" 49 | nw = NeedlemanWunsch() 50 | alignmentsAppend = self.alignments.append 51 | for i in range(0, len(self.sequences)): 52 | for j in range(i + 1, len(self.sequences)): 53 | alignmentsAppend([nw.compute([self.sequences[i], self.sequences[j]], self.weightFunction,1)[0], i, j]) 54 | 55 | def computeDistanceDictionary(self): 56 | """This function computes the distance between every alignment. The distances are used to generate a phylogenetic tree.""" 57 | for i in range(0, len(self.alignments)): 58 | index = str(self.alignments[i][1]) + " " + str(self.alignments[i][2]) 59 | self.distanceDictionary[index] = self.similarityToDistance(self.alignments[i][0]) 60 | 61 | def similarityToDistance(self, alignment): 62 | """Computes from the given similarity the distance measure.""" 63 | sMax = self.similarity(alignment[0], alignment[0]) + self.similarity(alignment[1], alignment[1]) 64 | sMax /= 2 65 | alignmentAsList = list(alignment[0]) 66 | alignmentAsList1 = list(alignment[1]) 67 | random.shuffle(alignmentAsList) 68 | random.shuffle(alignmentAsList1) 69 | alignmentShuffel0 = "".join(alignmentAsList) 70 | alignmentShuffel1 = "".join(alignmentAsList1) 71 | 72 | sRand = self.similarity(alignmentShuffel0, alignmentShuffel1) 73 | if sMax == sRand: 74 | sRand = sRand - 0.0001 75 | else: 76 | sEff = (self.similarity(alignment[0], alignment[1]) - sRand) / float(sMax - sRand) 77 | if sEff <= 0.0: 78 | return 1 79 | distance = -log10(sEff) 80 | return distance 81 | 82 | def similarity(self, a, b): 83 | """Returns the similarity of two sequences a and b with the similarity score defined at the 
initialization.""" 84 | similarity = 0 85 | for i in range(0, len(a)): 86 | similarity += self.similarityScore(a[i], b[i]) 87 | return similarity 88 | 89 | def buildTree(self): 90 | """This function computes the phylogenetic tree with UPGMA and stores it in the Newick-Tree format.""" 91 | upgma = UpgmaWpgma(self.distanceDictionary, len(self.sequences)) 92 | upgma.compute_clustering() 93 | self.newickTree = upgma.get_newick_tree() 94 | 95 | def buildMultipleAlignment(self, group0, group1): 96 | """This function returns which is the best pairwise alignment out of all alignments of group0 and group1.""" 97 | highestScore = 0 98 | optimalAlignment = [] 99 | for i in group0: 100 | for j in group1: 101 | nw = NeedlemanWunsch() 102 | alignment = nw.compute([i[0], j[0]], self.weightFunction, 1) 103 | score = self.similarity(alignment[0][0], alignment[0][1]) 104 | if highestScore < score: 105 | highestScore = score 106 | optimalAlignment = [alignment[0][0], alignment[0][1], i[1], j[1]] 107 | return optimalAlignment 108 | 109 | 110 | def computeOrderOfSequencesToAlign(self): 111 | """This function computes out of the phylogenetic tree in which order the sequences are aligned.""" 112 | indexBegin = 0 113 | indexEnd = len(self.newickTree) 114 | while indexEnd != -1: 115 | indexBegin = self.newickTree.rfind("(", indexBegin, indexEnd) 116 | if indexBegin == -1: 117 | break 118 | i = indexBegin + 1 119 | stack = 0 120 | while stack >= 0 and i < len(self.newickTree): 121 | if self.newickTree[i] == "(": 122 | stack += 1 123 | elif self.newickTree[i] == ")": 124 | stack -= 1 125 | i += 1 126 | indexEnd = i 127 | 128 | group0 = "" 129 | group1 = "" 130 | substring = self.newickTree[indexBegin:indexEnd] 131 | if substring[1] != "(": 132 | indexGroup0 = substring.find(",") 133 | group0 = substring[0:indexGroup0].strip(",") 134 | group1 = substring[indexGroup0:-1].strip(",") 135 | else: 136 | k = 1 137 | stack = 0 138 | while k < len(substring): 139 | if substring[k] == "(": 140 | stack 
+= 1 141 | elif substring[k] == ")": 142 | stack -= 1 143 | k += 1 144 | if stack <= 0: 145 | break 146 | group0 = substring[0:k].strip(",") 147 | group1 = substring[k:-1].strip(",") 148 | group0List = group0.split(",") 149 | group1List = group1.split(",") 150 | list0 = [] 151 | list1 = [] 152 | for j in group0List: 153 | list0.append(int(j.strip("(").strip(")").strip(","))) 154 | for j in group1List: 155 | list1.append(int(j.strip("(").strip(")").strip(","))) 156 | 157 | self.orderToAlign.append(sorted([sorted(list0), sorted(list1)])) 158 | indexEnd = indexBegin 159 | indexBegin = 0 160 | 161 | def computeMultipleAlignment(self): 162 | """This function returns the multiple sequence alignment.""" 163 | self.computeAlignments() 164 | self.computeDistanceDictionary() 165 | self.buildTree() 166 | self.computeOrderOfSequencesToAlign() 167 | i = 0 168 | indexAlignments = {} 169 | # create index to algnment realation 170 | while i < len(self.orderToAlign): 171 | if len(self.orderToAlign[i][0]) == 1 and len(self.orderToAlign[i][1]): 172 | for j in self.alignments: 173 | if (j[1] == self.orderToAlign[i][0][0] and j[2] == self.orderToAlign[i][1][0]): 174 | indexAlignments[self.orderToAlign[i][0][0]] = j[0][0] 175 | indexAlignments[self.orderToAlign[i][1][0]] = j[0][1] 176 | break 177 | elif(j[1] == self.orderToAlign[i][1][0] and j[2] == self.orderToAlign[i][0][0]): 178 | indexAlignments[self.orderToAlign[i][0][0]] = j[0][1] 179 | indexAlignments[self.orderToAlign[i][1][0]] = j[0][0] 180 | break 181 | elif len(self.orderToAlign[i][0]) == 1: 182 | indexAlignments[self.orderToAlign[i][0][0]] = self.sequences[self.orderToAlign[i][0][0]] 183 | elif len(self.orderToAlign[i][1]) == 1: 184 | try: 185 | indexAlignments[self.orderToAlign[i][1][0]] = self.sequences[self.orderToAlign[i][1][0]] 186 | except: 187 | print "Exception!" 
188 | print "i: ", i 189 | print "OrderToAlign: ", self.orderToAlign 190 | print "orderAlign:", self.orderToAlign[i][1][0] 191 | print self.sequences 192 | i += 1 193 | 194 | for i in self.orderToAlign: 195 | # one sequence with one sequence 196 | if len(i[0]) == 1 and len(i[1]): 197 | indexAlignments[i[0][0]] = indexAlignments[i[0][0]].replace("-", "X") 198 | indexAlignments[i[1][0]] = indexAlignments[i[1][0]].replace("-", "X") 199 | # one sequence with one group 200 | # two groups 201 | else: 202 | group0 = [] 203 | group1 = [] 204 | for j in i[0]: 205 | group0.append([indexAlignments[j], j]) 206 | for j in i[1]: 207 | group1.append([indexAlignments[j],j]) 208 | pairwiseAlignment = self.buildMultipleAlignment(group0, group1) 209 | indexAlignments[pairwiseAlignment[2]] = pairwiseAlignment[0].replace("-", "X") 210 | indexAlignments[pairwiseAlignment[3]] = pairwiseAlignment[1].replace("-", "X") 211 | 212 | for j in i[0]: 213 | nw = NeedlemanWunsch() 214 | alignment = nw.compute([pairwiseAlignment[0], indexAlignments[j]], self.weightFunction, 1) 215 | indexAlignments[j] = alignment[0][1] 216 | for j in i[1]: 217 | nw = NeedlemanWunsch() 218 | alignment = nw.compute([pairwiseAlignment[1], indexAlignments[j]], self.weightFunction, 1) 219 | indexAlignments[j] = alignment[0][1] 220 | for j in indexAlignments: 221 | indexAlignments[j] = indexAlignments[j].replace("-", "X") 222 | return indexAlignments 223 | -------------------------------------------------------------------------------- /source/lib/multiple/needlemanWunschN3.py: -------------------------------------------------------------------------------- 1 | #! 
import sys


class NeedlemanWunschN3():
    """This class computes the Needleman-Wunsch algorithm with three sequences.

    The weight function is a distance: the matrix is filled by taking the
    minimum over the possible predecessor cells. A gap is passed to the
    weight function as the empty string.
    """

    def __init__(self, sequence_a, sequence_b, sequence_c, score_function):
        """Initialize all variables needed to compute the Needleman-Wunsch algorithm with three sequences.
            sequence_a: A string with the first DNA sequence.
            sequence_b: A string with the second DNA sequence.
            sequence_c: A string with the third DNA sequence.
            score_function: The name of a weight function as a string which is defined
                            in the MultipleAlignmentHelper class. It is called with
                            three symbols (a, b, c).

        Prints a message and terminates via sys.exit() if score_function
        does not name a callable attribute of MultipleAlignmentHelper.
        """
        # Imported here so that this module does not need the project
        # package before an object is created.
        from helper import MultipleAlignmentHelper as mah

        if score_function in dir(mah) and callable(getattr(mah, score_function)):
            # getattr instead of eval: same lookup without executing a string.
            score_function_obj = getattr(mah(), score_function)
        else:
            print("Score function not found!")
            sys.exit()

        self.computation_matrix = [[[]]]
        self.sequence_a = sequence_a
        self.sequence_b = sequence_b
        self.sequence_c = sequence_c
        self.score_function = score_function_obj
        self.i = 0
        self.j = 0
        self.traceback_stack = [[]]
        self.traceback_stack_index = 0
        self.indices_stack = [[]]
        self.computed_alignment = []

    def compute_matrix(self):
        """Computes the matrix which is needed by the Needleman-Wunsch algorithm for three sequences.

        The result is stored in self.computation_matrix and is addressed as
        matrix[i][j][k] with i, j, k the prefix lengths of sequence A, B, C.
        """
        seq_a = self.sequence_a
        seq_b = self.sequence_b
        seq_c = self.sequence_c
        score = self.score_function
        matrix = [[[0 for _ in range(len(seq_c) + 1)] for _ in range(len(seq_b) + 1)]
                  for _ in range(len(seq_a) + 1)]
        # compute_minimum reads the matrix from the object.
        self.computation_matrix = matrix

        # Axes: only one sequence has symbols, the other two are gaps. The
        # symbol is passed in the argument slot of its own sequence, exactly
        # as compute_minimum and traceback do it (it used to be passed in
        # the third slot for all three axes).
        for i in range(1, len(seq_a) + 1):
            matrix[i][0][0] = matrix[i - 1][0][0] + score(seq_a[i - 1], "", "")
        for j in range(1, len(seq_b) + 1):
            matrix[0][j][0] = matrix[0][j - 1][0] + score("", seq_b[j - 1], "")
        for k in range(1, len(seq_c) + 1):
            matrix[0][0][k] = matrix[0][0][k - 1] + score("", "", seq_c[k - 1])

        # Boundary planes: two sequences are aligned symbol by symbol, the
        # third one is a gap. Only the diagonal predecessor is used here.
        for i in range(1, len(seq_a) + 1):
            for j in range(1, len(seq_b) + 1):
                matrix[i][j][0] = matrix[i - 1][j - 1][0] + score(seq_a[i - 1], seq_b[j - 1], "")
        for i in range(1, len(seq_a) + 1):
            for k in range(1, len(seq_c) + 1):
                matrix[i][0][k] = matrix[i - 1][0][k - 1] + score(seq_a[i - 1], "", seq_c[k - 1])
        for j in range(1, len(seq_b) + 1):
            for k in range(1, len(seq_c) + 1):
                matrix[0][j][k] = matrix[0][j - 1][k - 1] + score("", seq_b[j - 1], seq_c[k - 1])

        # Inner cells.
        for i in range(1, len(seq_a) + 1):
            for j in range(1, len(seq_b) + 1):
                for k in range(1, len(seq_c) + 1):
                    matrix[i][j][k] = self.compute_minimum(i, j, k)

    def compute_minimum(self, i, j, k):
        """Compute the minimal value for a given cell of the matrix.
            The minimum is chosen of the following values ("-" is a gap):
            D(i-1, j-1, k-1) + w(a_i, b_j, c_k)
            D(i,   j-1, k-1) + w(-,   b_j, c_k)
            D(i-1, j,   k-1) + w(a_i, -,   c_k)
            D(i-1, j-1, k)   + w(a_i, b_j, -)
            D(i,   j,   k-1) + w(-,   -,   c_k)
            D(i-1, j,   k)   + w(a_i, -,   -)
            D(i,   j-1, k)   + w(-,   b_j, -)
            i: index of sequence A (1 <= i)
            j: index of sequence B (1 <= j)
            k: index of sequence C (1 <= k)
        """
        matrix = self.computation_matrix
        score = self.score_function
        a = self.sequence_a[i - 1]
        b = self.sequence_b[j - 1]
        c = self.sequence_c[k - 1]
        return min(
            # no gap
            matrix[i - 1][j - 1][k - 1] + score(a, b, c),
            # one gap
            matrix[i][j - 1][k - 1] + score("", b, c),
            matrix[i - 1][j][k - 1] + score(a, "", c),
            matrix[i - 1][j - 1][k] + score(a, b, ""),
            # two gaps
            matrix[i][j][k - 1] + score("", "", c),
            matrix[i - 1][j][k] + score(a, "", ""),
            matrix[i][j - 1][k] + score("", b, ""),
        )

    def traceback(self, maximal_optimal_solutions=-1):
        """Computes the traceback for the Needleman-Wunsch n=3 matrix.

        maximal_optimal_solutions: How many optimal alignments are built;
            -1 (the default) builds all of them.

        Every path starts in the last cell of the matrix and ends in cell
        (0, 0, 0). If more than one predecessor explains the value of a
        cell, the first one continues the current path and every further
        one is stored as a new path (see split). The alignments are
        appended to self.computed_alignment.

        Raises ValueError if no predecessor explains the value of a cell,
        i.e. if the matrix does not belong to the sequences and the weight
        function.
        """
        from helper import MultipleAlignmentHelper as mah

        # (symbol, step in A, step in B, step in C) in the order in which
        # the moves are tried.
        moves = (
            (mah.noGap, 1, 1, 1),
            (mah.gapA, 0, 1, 1),
            (mah.gapB, 1, 0, 1),
            (mah.gapC, 1, 1, 0),
            (mah.gapAB, 0, 0, 1),
            (mah.gapAC, 0, 1, 0),
            (mah.gapBC, 1, 0, 0),
        )
        matrix = self.computation_matrix
        score = self.score_function
        limited = maximal_optimal_solutions != -1

        self.traceback_stack = [[]]
        self.indices_stack = [[len(matrix) - 1, len(matrix[0]) - 1, len(matrix[0][0]) - 1]]
        self.traceback_stack_index = 0
        traceback_done = False
        optimal_solutions_count = 0
        while not traceback_done:
            i, j, k = self.indices_stack[self.traceback_stack_index]
            path = self.traceback_stack[self.traceback_stack_index]
            optimal_solutions_count += 1
            while i > 0 or j > 0 or k > 0:
                next_cell = None
                for symbol, step_a, step_b, step_c in moves:
                    if i < step_a or j < step_b or k < step_c:
                        continue  # this move would leave the matrix
                    previous = [i - step_a, j - step_b, k - step_c]
                    cost = score(self.sequence_a[i - 1] if step_a else "",
                                 self.sequence_b[j - 1] if step_b else "",
                                 self.sequence_c[k - 1] if step_c else "")
                    if matrix[i][j][k] != matrix[previous[0]][previous[1]][previous[2]] + cost:
                        continue
                    if next_cell is None:
                        # First matching move: it continues this path.
                        path.append(symbol)
                        next_cell = previous
                    else:
                        # Every further matching move opens a new path. This
                        # used to be decided by a flag that only the first
                        # two moves set, so two later matching moves were
                        # both appended to the same path.
                        self.split(previous, symbol)
                if next_cell is None:
                    raise ValueError("No predecessor found for cell (%d, %d, %d)." % (i, j, k))
                i, j, k = next_cell
            if limited and optimal_solutions_count >= maximal_optimal_solutions:
                break
            self.indices_stack[self.traceback_stack_index][0] = i
            self.indices_stack[self.traceback_stack_index][1] = j
            self.indices_stack[self.traceback_stack_index][2] = k
            # Go on with the first path which has not reached (0, 0, 0) yet.
            for position, cell in enumerate(self.indices_stack):
                if cell[0] != 0 or cell[1] != 0 or cell[2] != 0:
                    self.traceback_stack_index = position
                    break
            else:
                traceback_done = True

        if limited and optimal_solutions_count >= maximal_optimal_solutions:
            number_of_alignments = maximal_optimal_solutions
        else:
            number_of_alignments = len(self.traceback_stack)
        for position in range(0, number_of_alignments):
            self.computed_alignment.append(self.build_alignment(self.traceback_stack[position]))

    def split(self, index, gapSymbol):
        """Splits the actual traceback path into two paths.
            index: The index values for the next cell of the new path.
            gapSymbol: A symbol for the computed step for the new path.

        The new path is a copy of the current path in which the last step
        is replaced by gapSymbol."""
        new_path = self.traceback_stack[self.traceback_stack_index][0:-1]
        new_path.append(gapSymbol)
        self.traceback_stack.append(new_path)
        self.indices_stack.append(index)

    def build_alignment(self, tracebackStack):
        """Builds the alignment for one traceback path.
            tracebackStack: The computed traceback path as a list. It is read from
                            its end to its beginning and is empty afterwards.

        Returns [alignment of A, alignment of B, alignment of C]. Symbols of
        a sequence which are not covered by the path are appended to the
        alignment of that sequence.

        Prints a message and terminates via sys.exit() if the path needs
        more symbols than a sequence has.
        """
        from helper import MultipleAlignmentHelper as mah

        # Which sequences contribute a symbol (True) or a gap (False).
        steps = {
            mah.noGap: (True, True, True),
            mah.gapA: (False, True, True),
            mah.gapB: (True, False, True),
            mah.gapC: (True, True, False),
            mah.gapAB: (False, False, True),
            mah.gapAC: (False, True, False),
            mah.gapBC: (True, False, False),
        }
        i = 0
        j = 0
        k = 0
        alignment_of_a = []
        alignment_of_b = []
        alignment_of_c = []

        while len(tracebackStack) > 0:
            step = steps.get(tracebackStack.pop())
            if step is None:
                continue  # unknown symbols are skipped
            try:
                if step[0]:
                    alignment_of_a.append(self.sequence_a[i])
                    i += 1
                else:
                    alignment_of_a.append("-")
                if step[1]:
                    alignment_of_b.append(self.sequence_b[j])
                    j += 1
                else:
                    alignment_of_b.append("-")
                if step[2]:
                    alignment_of_c.append(self.sequence_c[k])
                    k += 1
                else:
                    alignment_of_c.append("-")
            except (IndexError, TypeError):
                print("An error occured.")
                sys.exit()
        # Rest of the sequences, if the path did not cover them completely.
        # The rest of sequence C used to be appended to the alignment of B.
        alignment_of_a.append(self.sequence_a[i:])
        alignment_of_b.append(self.sequence_b[j:])
        alignment_of_c.append(self.sequence_c[k:])
        return ["".join(alignment_of_a), "".join(alignment_of_b), "".join(alignment_of_c)]

    def execute(self, maximalOptimalSolutions=-1):
        """Method to start the computation of the Needleman-Wunsch algorithm with three sequences. It returns the computed alignment.
            [maximalOptimalSolutions]: Define how many optimal solutions should be computed. If not defined, all optimal solutions are computed."""
        self.compute_matrix()
        self.traceback(maximalOptimalSolutions)
        return self.computed_alignment


class SumOfPairs():
    """This class computes the Sum-of-pairs algorithm by Carillo and Lipman:
        Carrillo, Humberto, and David Lipman.
        "The multiple sequence alignment problem in biology."
        SIAM Journal on Applied Mathematics 48.5 (1988): 1073-1082.
        http://www.academia.edu/download/30855770/Articulo03.pdf"""

    def __init__(self, sequences, similarity_score):
        """To initialize an object of the SumOfPairs class please define a list with the multiple sequence alignment and
        a similarity score method which is defined in class PairwiseAlignmentHelper of package helper.
            sequences: The multiple alignment as a list.
            similarity_score: The scoring function's name as a string.

        Prints a message and terminates via sys.exit() if similarity_score
        does not name a callable attribute of PairwiseAlignmentHelper."""
        # Imported here so that this module does not need the project
        # package before an object is created.
        from helper import PairwiseAlignmentHelper as pah

        self.sequences = sequences
        if similarity_score in dir(pah) and callable(getattr(pah, similarity_score)):
            # getattr instead of eval: same lookup without executing a string.
            similarity_score_obj = getattr(pah(), similarity_score)
        else:
            print("Score function not found!")
            sys.exit()
        self.score_function = similarity_score_obj

    def execute(self):
        """Run this method to compute the sum of pairs scoring for multiple alignment.

        Returns the sum of score() over all pairs of sequences."""
        score_value = 0
        for i in range(len(self.sequences)):
            for j in range(i + 1, len(self.sequences)):
                score_value += self.score(self.sequences[i], self.sequences[j])
        return score_value

    def score(self, sequence_a, sequence_b):
        """Returns the score of the pairwise alignment of sequence_a and sequence_b.

        The score function is summed position by position. Behind the end
        of the shorter sequence the empty string is used as its symbol."""
        score_value = 0
        for i in range(max(len(sequence_a), len(sequence_b))):
            symbol_a = sequence_a[i] if i < len(sequence_a) else ""
            symbol_b = sequence_b[i] if i < len(sequence_b) else ""
            score_value += self.score_function(symbol_a, symbol_b)
        return score_value
class FengDoolittleTestClass(unittest.TestCase):
    """Test class to test the correct computation of the Feng-Doolittle algorithm."""

    def test_computeAlignments(self):
        """The pairwise alignments are stored together with the indices of both sequences."""
        sequences = ["ACTG", "AT", "ACG"]
        expectedAlignments = [[["ACTG", "A-T-"], 0, 1], [["ACTG", "AC-G"], 0, 2], [["AT-", "ACG"], 1, 2]]
        fd = FengDoolittle(sequences, "weightFunctionDifference", "pam250")
        fd.computeAlignments()
        self.assertEqual(expectedAlignments, fd.alignments)

    def test_computeDistanceDictionary(self):
        """Smoke test: the distance dictionary can be computed without an exception.

        NOTE(review): nothing is asserted here; an expected dictionary should
        be added once its values are confirmed."""
        sequences = ["ACCCAT", "ACGGAT", "AACCT"]
        fd = FengDoolittle(sequences, "weightFunctionDifference", "pam250")
        fd.computeAlignments()
        fd.computeDistanceDictionary()

    def test_computeOrderOfSequencesToAlign(self):
        """The guide tree determines the order in which the groups are aligned."""
        sequences = ["ACTG", "AT", "ACG"]
        fd = FengDoolittle(sequences, "weightFunctionDifference", "pam250")
        fd.computeAlignments()
        fd.computeDistanceDictionary()
        fd.buildTree()
        expectedResult = [[[0], [2]], [[0, 2], [1]]]
        fd.computeOrderOfSequencesToAlign()
        self.assertEqual(expectedResult, fd.orderToAlign)

    def test_computeMultipleAlignment(self):
        """The multiple alignment is returned as a dictionary: sequence index -> aligned row."""
        sequences = ["ACTG", "AT", "ACG"]
        expectedResult = {0: 'ACTG', 1: 'AXTX', 2: 'ACXG'}
        fd = FengDoolittle(sequences, "weightFunctionDifference", "pam250")
        # The method is deliberately invoked twice, as in the original test:
        # the value checked is the result of the second call.
        fd.computeMultipleAlignment()
        self.assertEqual(expectedResult, fd.computeMultipleAlignment())

        sequences = ["ACCAT", "ACGGAT", "AACCAT"]
        expectedResult = {0: 'AXCCXAT', 1: 'AXCGGAT', 2: 'AACCXAT'}
        fd2 = FengDoolittle(sequences, "weightFunctionDifference", "pam250")
        fd2.computeMultipleAlignment()
        self.assertEqual(expectedResult, fd2.computeMultipleAlignment())
expectedMatrix[0][i][0] = expectedMatrix[0][i-1][0] + mah().weightFunctionDifference("", "", sequenceB[i-1]) 32 | for i in range(1, len(sequenceC)+1): 33 | expectedMatrix[0][0][i] = expectedMatrix[0][0][i-1] + mah().weightFunctionDifference("", "", sequenceC[i-1]) 34 | for i in range(1, len(sequenceA)+1): 35 | for j in range(1, len(sequenceB)+1): 36 | expectedMatrix[i][j][0] = expectedMatrix[i-1][j-1][0] + mah().weightFunctionDifference(sequenceA[i-1], sequenceB[j-1], "") 37 | for i in range(1, len(sequenceA)+1): 38 | for k in range(1, len(sequenceC)+1): 39 | expectedMatrix[i][0][k] = expectedMatrix[i-1][0][k-1] + mah().weightFunctionDifference(sequenceA[i-1], "", sequenceC[k-1]) 40 | for j in range(1, len(sequenceB)+1): 41 | for k in range(1, len(sequenceC)+1): 42 | expectedMatrix[0][j][k] = expectedMatrix[0][j-1][k-1] + mah().weightFunctionDifference("", sequenceB[j-1], sequenceC[k-1]) 43 | 44 | assertEqual() 45 | 46 | if __name__ == "__main__": 47 | unittest.main() # run all tests 48 | -------------------------------------------------------------------------------- /source/lib/multiple/test/sumOfPairsTest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joachimwolff/algorithmsInBioinformatics/0d3d91b7cb2370426617c09d98796998b7c5d1d7/source/lib/multiple/test/sumOfPairsTest.py -------------------------------------------------------------------------------- /source/lib/multiple/test/upgmaWpgmaTest.py: -------------------------------------------------------------------------------- 1 | #! 
def test_computeClustering(self):
    """The clustering records every merge as "a b" -> id of the new cluster.

    The Newick strings are only printed for visual inspection; the
    assertions check the recorded mapping."""
    distanceDictionary = {"0 1": 1, "0 2": 2, "0 3": 3, "1 2": 2, "1 3": 3, "2 3": 3}
    upgma = UpgmaWpgma(distanceDictionary, 4)
    expectedValue = {"0 1": 4, "2 4": 5, "3 5": 6}
    upgma.compute_clustering()
    # print(...) with one argument behaves identically in Python 2 and 3.
    print(upgma.get_newick_tree())
    self.assertEqual(expectedValue, upgma.mapping)

    print(upgma.get_newick_tree(with_edge_weights=True))
    distanceDictionary = {"0 1": 6, "0 2": 10, "0 3": 10, "0 4": 10, "1 2": 10, "1 3": 10, "1 4": 10, "2 3": 2,
                          "2 4": 6, "3 4": 6}
    upgma2 = UpgmaWpgma(distanceDictionary, 5)
    expectedValue = {"2 3": 5, "0 1": 7, "4 5": 6, "6 7": 8}
    upgma2.compute_clustering()
    print(upgma2.get_newick_tree(with_edge_weights=False))
    self.assertEqual(expectedValue, upgma2.mapping)
    print(upgma2.get_newick_tree(with_edge_weights=True))
class UpgmaWpgma():
    """Upgma/Wpgma is a clustering method to generate phylogenetic trees.

    The two clusters with the smallest distance are merged repeatedly. Every
    merge is recorded in ``mapping`` ("a b" -> id of the new cluster) and in
    ``edge_weight`` (id of the new cluster -> list of two edge lengths).
    ``get_newick_tree`` renders the recorded merges."""

    def __init__(self, distance_dictionary, node_count, upgma_wpgma=True, sequence_size_mapping=None):
        """To initialize an object of this class, please define the following:

        distance_dictionary: A dictionary with the distance between two sequences.
            Should have the form "key0 key1": distance. key0 and key1 have to be
            integers. The dictionary is modified in place by compute_clustering.
        node_count: The number of sequences.
        upgma_wpgma: If True, upgma_distance is used, if False, wpgma_distance.
        sequence_size_mapping: Only necessary if upgma_wpgma is False. It defines
            the size of each sequence, in the form key: len(sequence). A fresh
            dictionary is created when it is omitted, so instances never share
            one default object."""
        self.distance_dictionary = distance_dictionary
        self.mapping = {}
        self.node_count = node_count
        self.number_of_nodes = node_count
        self.upgma_wpgma = upgma_wpgma
        self.sequence_size_mapping = {} if sequence_size_mapping is None else sequence_size_mapping
        self.edge_weight = {}

    def compute_clustering(self):
        """This function computes the clustering to get the phylogenetic tree.

        Entries of the merged pair are removed from distance_dictionary and the
        distances of the new cluster to all remaining clusters are inserted.
        The loop stops when compute_minimal_distance finds no pair anymore."""
        while True:
            minimum_cluster = self.compute_minimal_distance()
            nodes = minimum_cluster[0].split(" ")
            if len(nodes) <= 1:
                # An empty key means the distance dictionary holds no pair.
                break
            self.mapping[minimum_cluster[0]] = self.node_count
            self.compute_edge_weight(minimum_cluster[1], nodes)

            if minimum_cluster[0] in self.distance_dictionary:
                del self.distance_dictionary[minimum_cluster[0]]

            for i in range(0, self.node_count + 1):
                key_value_0 = nodes[0] + " " + str(i)
                key_value_1 = nodes[1] + " " + str(i)
                key_value = self.key_in_dictionary(key_value_0, key_value_1)
                if key_value[0] == "":
                    continue
                # Cluster i has a distance to both merged clusters: replace
                # the two old entries by one entry for the new cluster.
                key_for_new_cluster_distance = str(i) + " " + str(self.node_count)
                self.distance_dictionary[key_for_new_cluster_distance] = self.compute_new_distance(
                    self.distance_dictionary[key_value[0]], self.distance_dictionary[key_value[1]], nodes[0],
                    nodes[1])
                if not self.upgma_wpgma:
                    self.sequence_size_mapping[self.node_count] = self.sequence_size_mapping[int(nodes[0])] + \
                                                                  self.sequence_size_mapping[int(nodes[1])]
                del self.distance_dictionary[key_value[0]]
                del self.distance_dictionary[key_value[1]]
            self.node_count += 1

    @staticmethod
    def _swap_key(key):
        """Return the key with its node ids in reversed order ("2 13" -> "13 2")."""
        return " ".join(reversed(key.split(" ")))

    def key_in_dictionary(self, key_value_0, key_value_1):
        """Returns the spelling under which both keys are stored in the distance dictionary.

        Each key is looked up as given and with its two node ids swapped. The
        result is the list [stored_key_0, stored_key_1], or ["", ""] if at
        least one of the two pairs has no entry.

        key_value_0: The first key value.
        key_value_1: The second key value."""
        # Swap the ids, not the characters: reversing the string would turn
        # "13 2" into "2 31" for ids with more than one digit.
        swapped_0 = self._swap_key(key_value_0)
        swapped_1 = self._swap_key(key_value_1)
        candidates = ((key_value_0, key_value_1),
                      (swapped_0, key_value_1),
                      (key_value_0, swapped_1),
                      (swapped_0, swapped_1))
        for first, second in candidates:
            if first in self.distance_dictionary and second in self.distance_dictionary:
                return [first, second]
        return ["", ""]

    def compute_minimal_distance(self):
        """Returns the next two clusters for merging.

        The result is the list [key, distance] of the smallest entry of the
        distance dictionary, or ["", MathHelper.Inf] if no entry is smaller
        than MathHelper.Inf."""
        minimum = ["", MathHelper.Inf]
        for i in self.distance_dictionary:
            if minimum[1] > self.distance_dictionary[i]:
                minimum[0] = i
                minimum[1] = self.distance_dictionary[i]
        return minimum

    def compute_new_distance(self, distance_a_x, distance_b_x, index_a, index_b):
        """Returns the new distance between the new merged cluster and an other cluster.

        distance_a_x: The old distance between cluster a and x.
        distance_b_x: The old distance between cluster b and x.
        index_a: The index of a.
        index_b: The index of b."""
        if self.upgma_wpgma:
            return self.upgma_distance(distance_a_x, distance_b_x)
        return self.wpgma_distance(distance_a_x, distance_b_x, self.sequence_size_mapping[int(index_a)],
                                   self.sequence_size_mapping[int(index_b)])

    def upgma_distance(self, distance_a_x, distance_b_x):
        """Returns the plain average of the two old distances.

        NOTE(review): the literature usually calls the plain average WPGMA and
        the size-weighted average UPGMA; the method names are kept as they are
        for compatibility.

        distance_a_x: The old distance between cluster a and x.
        distance_b_x: The old distance between cluster b and x."""
        # 2.0 avoids the truncating integer division of Python 2.
        return (distance_a_x + distance_b_x) / 2.0

    def wpgma_distance(self, distance_a_x, distance_b_x, length_of_a, length_of_b):
        """Returns the average of the two old distances, weighted by the cluster sizes.

        distance_a_x: The old distance between cluster a and x.
        distance_b_x: The old distance between cluster b and x.
        length_of_a: The size of a.
        length_of_b: The size of b."""
        # float() avoids the truncating integer division of Python 2.
        return (length_of_a * distance_a_x + length_of_b * distance_b_x) / float(length_of_a + length_of_b)

    def compute_edge_weight(self, weight, nodes):
        """This method computes the new edge weight for a new cluster.

        The result is stored in edge_weight[self.node_count] as a list of two
        values. get_newick_tree attaches the value at index 1 to the first
        node of the pair and the value at index 0 to the second node.

        NOTE(review): if both merged clusters are inner nodes, index 0 is
        derived from the first node, unlike the mixed leaf/inner cases;
        behaviour is kept unchanged here and should be confirmed.

        weight: The edge weight equal to the distance of the two merged clusters.
        nodes: A list containing the indices of the two merged clusters."""
        node0 = int(nodes[0])
        node1 = int(nodes[1])
        half = weight / float(2)
        # Ids below number_of_nodes are the input sequences (leaves).
        node0_is_leaf = node0 < self.number_of_nodes
        node1_is_leaf = node1 < self.number_of_nodes
        if node0_is_leaf and node1_is_leaf:
            weights = [half, half]
        elif node0_is_leaf:
            weights = [half - self.edge_weight[node1][1], half]
        elif node1_is_leaf:
            weights = [half, half - self.edge_weight[node0][1]]
        else:
            weights = [half - self.edge_weight[node0][1], half - self.edge_weight[node1][1]]
        self.edge_weight[self.node_count] = weights

    def get_newick_tree(self, with_edge_weights=False):
        """Returns the computed cluster in the Newick tree format.

        The tree is assembled recursively from ``mapping`` instead of by text
        substitution, so an inner cluster may be the first or the second node
        of a pair and digits inside edge weights cannot be mistaken for node
        ids. ``mapping`` itself is not modified. Returns None if no merge has
        been recorded.

        with_edge_weights: If True, edge weights are part of the output, if False, not."""
        if not self.mapping:
            return None

        # Name of every inner cluster -> (cluster id, names of its two nodes).
        children = {}
        for key, cluster in self.mapping.items():
            children[str(cluster)] = (cluster, key.split(" "))
        used_as_child = set(name for _, names in children.values() for name in names)
        edge_weight = self.edge_weight

        def render(name):
            """Return the Newick text of the subtree rooted at the node called name."""
            if name not in children:
                return name
            cluster, names = children[name]
            parts = [render(child) for child in names]
            if with_edge_weights and cluster in edge_weight:
                parts[0] += ":" + str(edge_weight[cluster][1])
                parts[1] += ":" + str(edge_weight[cluster][0])
            return "(" + ",".join(parts) + ")"

        # The root is the cluster that is not part of any other cluster.
        for cluster in self.mapping.values():
            if str(cluster) not in used_as_child:
                return render(str(cluster))
        return None
def computeMatrix(self):
    """Fill the three matrices D, P and Q of the Gotoh algorithm.

    The sequences, the weight function and the gap cost function are taken
    from the attributes set when the object was created. The result is
    stored in self.computationMatrix as the list [D, P, Q]."""
    rowCount = len(self.sequenceA) + 1
    columnCount = len(self.sequenceB) + 1
    matrixD = [[0 for _ in range(columnCount)] for _ in range(rowCount)]
    matrixP = [[0 for _ in range(columnCount)] for _ in range(rowCount)]
    matrixQ = [[0 for _ in range(columnCount)] for _ in range(rowCount)]

    # first column: only gaps in sequence B
    for row in range(1, rowCount):
        matrixD[row][0] = self.costFunction(row)
        matrixP[row][0] = mathHelper.NaN
        matrixQ[row][0] = mathHelper.Inf
    # first row: only gaps in sequence A
    for column in range(1, columnCount):
        matrixD[0][column] = self.costFunction(column)
        matrixP[0][column] = mathHelper.Inf
        matrixQ[0][column] = mathHelper.NaN

    # inner cells, row by row; D needs P and Q of the same cell
    for row in range(1, rowCount):
        for column in range(1, columnCount):
            matrixP[row][column] = self.computeP(
                matrixD[row - 1][column], matrixP[row - 1][column], self.costFunction, self.beta)
            matrixQ[row][column] = self.computeQ(
                matrixD[row][column - 1], matrixQ[row][column - 1], self.costFunction, self.beta)
            matrixD[row][column] = self.computeD(
                matrixD[row - 1][column - 1], matrixP[row][column], matrixQ[row][column],
                self.sequenceA[row - 1], self.sequenceB[column - 1], self.scoreFunction)
    self.computationMatrix = [matrixD, matrixP, matrixQ]
def computeQ(self, valueOfD, valueOfQ, costFunction, beta):
    """Compute the value of one cell of matrix Q.

    This is the minimum of:
        matrix D of cell (i, j-1) + the gap costs of a gap of length 1
    and
        matrix Q of cell (i, j-1) + beta
    valueOfD: The value from matrix D of cell i, j-1.
    valueOfQ: The value from matrix Q of cell i, j-1.
    costFunction: The gap cost function defined at the object creation.
    beta: The beta value from the gap costs."""
    openNewGap = valueOfD + costFunction(1)
    extendGap = valueOfQ + beta
    return min(openNewGap, extendGap)
def traceback(self):
    """Computes the traceback for the Gotoh algorithm.

    The walk starts in matrix D at the cell of the last row and last column.
    Each step is delegated to tracebackD, tracebackP or tracebackQ, selected
    by the matrix index stored in the active entry of self.indiciesStack.
    When the active path has reached cell (0, 0), the next entry of
    self.indiciesStack that is not finished yet becomes the active one.
    Finally every path of self.tracebackStack is converted with
    buildAlignment and appended to self.computedAlignment."""
    # computationMatrix[0] is matrix D: start at its last column / last row.
    self.j = len(self.computationMatrix[0][0]) - 1
    self.i = len(self.computationMatrix[0]) - 1
    self.tracebackStackIndex = 0
    # Each entry of indiciesStack is [row, column, index of the current matrix].
    self.indiciesStack[self.tracebackStackIndex] = [self.i, self.j, pah.matrixIndexD]
    tracebackDone = False
    while not tracebackDone:
        # Follow the active path until it reaches cell (0, 0).
        while self.i > 0 or self.j > 0:
            if self.indiciesStack[self.tracebackStackIndex][2] == pah.matrixIndexD:
                self.tracebackD()
            elif self.indiciesStack[self.tracebackStackIndex][2] == pah.matrixIndexP:
                self.tracebackP()
            elif self.indiciesStack[self.tracebackStackIndex][2] == pah.matrixIndexQ:
                self.tracebackQ()
            # The step methods update indiciesStack; read the new position back.
            self.i = self.indiciesStack[self.tracebackStackIndex][0]
            self.j = self.indiciesStack[self.tracebackStackIndex][1]
        # Assume everything is finished unless an unfinished path is found.
        tracebackDone = True
        for i in range(0, len(self.indiciesStack)):
            if self.indiciesStack[i][0] > 0 or self.indiciesStack[i][1] > 0:
                self.tracebackStackIndex = i
                tracebackDone = False
                break
        # Continue at the position of the (possibly new) active path.
        self.i = self.indiciesStack[self.tracebackStackIndex][0]
        self.j = self.indiciesStack[self.tracebackStackIndex][1]
    # NOTE(review): computedAlignment is only appended to here, never reset.
    for i in range(0, len(self.tracebackStack)):
        self.computedAlignment.append(self.buildAlignment(self.tracebackStack[i]))
self.tracebackStack[self.tracebackStackIndex].append(pah.diagonalD) 160 | pathVariableI -= 1 161 | pathVariableJ -= 1 162 | split = 1 163 | if self.computationMatrix[pah.matrixIndexD][self.i][self.j] == self.computationMatrix[pah.matrixIndexQ][self.i][self.j]: 164 | if split == 0: 165 | self.tracebackStack[self.tracebackStackIndex].append(pah.dotQ) 166 | self.indiciesStack[self.tracebackStackIndex][2] = pah.matrixIndexQ 167 | split = 1 168 | else: 169 | self.tracebackStack.append(self.tracebackStack[self.tracebackStackIndex][0:-1]) 170 | self.tracebackStack[len(self.tracebackStack)-1].append(pah.dotQ) 171 | self.indiciesStack.append([self.i,self.j, pah.matrixIndexQ]) 172 | if self.computationMatrix[pah.matrixIndexD][self.i][self.j] == self.computationMatrix[pah.matrixIndexP][self.i][self.j]: 173 | if split == 0: 174 | self.tracebackStack[self.tracebackStackIndex].append(pah.dotP) 175 | self.indiciesStack[self.tracebackStackIndex][2] = pah.matrixIndexP 176 | else: 177 | self.tracebackStack.append(self.tracebackStack[self.tracebackStackIndex][0:-1]) 178 | self.tracebackStack[len(self.tracebackStack)-1].append(pah.dotP) 179 | self.indiciesStack.append([self.i, self.j, pah.matrixIndexP]) 180 | 181 | if self.i == 0: 182 | self.tracebackStack[self.tracebackStackIndex].append(pah.leftD) 183 | pathVariableJ -= 1 184 | if self.j == 0: 185 | self.tracebackStack[self.tracebackStackIndex].append(pah.upD) 186 | pathVariableI -= 1 187 | if self.i <= 0 or pathVariableI <= 0: 188 | pathVariableI = 0 189 | if self.j <= 0 or pathVariableJ <= 0: 190 | pathVariableJ = 0 191 | self.indiciesStack[self.tracebackStackIndex][0] = pathVariableI 192 | self.indiciesStack[self.tracebackStackIndex][1] = pathVariableJ 193 | 194 | 195 | def tracebackP(self): 196 | """Computes the traceback for a cell of the matrix P""" 197 | split = False 198 | if self.i > 0: 199 | if self.computationMatrix[pah.matrixIndexP][self.i][self.j] == self.computationMatrix[pah.matrixIndexD][self.i-1][self.j] + 
def buildAlignment(self, tracebackStack):
    """A method to compute the alignment of a given traceback of the Gotoh algorithm.

    The path is consumed from its last element to its first one, so the list
    passed in is empty afterwards. Left steps emit a gap in sequence A, up
    steps a gap in sequence B, diagonal steps a character of both sequences;
    any other element (for example the markers for a change of the matrix)
    emits nothing. Characters not reached by the path are appended unchanged.

    If an error occurs while the path is processed, "An error occured." is
    printed and the interpreter exits.

    tracebackStack: The computed traceback path for one alignment as a list.
    Returns the list [alignment of sequence A, alignment of sequence B]."""
    i = 0
    j = 0
    alignedA = []
    alignedB = []
    while tracebackStack:
        try:
            tracebackElement = tracebackStack.pop()
            if pah.leftQ == tracebackElement or pah.leftD == tracebackElement:
                alignedA.append("-")
                alignedB.append(self.sequenceB[j])
                j += 1
            elif pah.upP == tracebackElement or pah.upD == tracebackElement:
                alignedA.append(self.sequenceA[i])
                alignedB.append("-")
                i += 1
            elif pah.diagonalD == tracebackElement:
                alignedA.append(self.sequenceA[i])
                alignedB.append(self.sequenceB[j])
                i += 1
                j += 1
        except Exception:
            # Exception instead of a bare except: KeyboardInterrupt and
            # SystemExit are no longer swallowed by this handler.
            print("An error occured.")
            sys.exit()

    # Append whatever the path did not cover.
    alignedA.append(self.sequenceA[i:])
    alignedB.append(self.sequenceB[j:])
    return ["".join(alignedA), "".join(alignedB)]
/usr/bin/python 2 | # Copyright 2014 Joachim Wolff 3 | # Programming Course: Algorithms in Bioinformatics 4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi 5 | # Winter semester 2014/2015 6 | # 7 | # Chair of Bioinformatics 8 | # Department of Computer Science 9 | # Faculty of Engineering 10 | # Albert-Ludwig-University Freiburg im Breisgau 11 | # 12 | # Needleman-Wunsch algorithm 13 | import sys 14 | from helper import PairwiseAlignmentHelper as pah 15 | 16 | 17 | class NeedlemanWunsch(): 18 | """This class holds methods which are needed to compute the pairwise 19 | alignment algorithm from Saul Needleman and Christian Wunsch, published in 1970: 20 | Needleman, Saul B.; and Wunsch, Christian D. (1070). 21 | A general method applicable to search for similarities in the aminoacid 22 | sequence of two proteins. Journal of Molecular Biology 48 (3): 443-53 23 | http://www.cise.ufl.edu/class/cis4930sp09rab/00052.pdf""" 24 | 25 | def computeMatrix(self, sequenceA, sequenceB, scoreFunction): 26 | """Initalize and computes the values for the Needleman-Wunsch matrix. 27 | sequenceA: A string with the first DNA sequence. 28 | sequenceB: A string with the second DNA sequence. 
def traceback(self, sequenceA, sequenceB, computationMatrix, scoreFunction, maxOptimalSolutions=-1):
    """Computes the traceback for the Needleman-Wunsch matrix.

    Starting at the bottom-right cell, every predecessor whose value plus
    the step cost equals the value of the current cell is followed. The
    first matching predecessor continues the current path; every further
    match opens a copy of the path that is completed later.

    sequenceA: A string with the first DNA sequence.
    sequenceB: A string with the second DNA sequence.
    computationMatrix: The computed matrix for the two sequences.
    scoreFunction: The weight function; it is called with two characters,
        where the empty string stands for a gap.
    maxOptimalSolutions: Upper bound for the number of alignments to
        compute. With -1 (the default) all optimal alignments are computed.
        If fewer optimal alignments exist than requested, all of them are
        returned.
    Returns a list with one result of buildAlignment per computed path."""
    tracebackStack = [[]]
    indiciesStack = [[len(computationMatrix) - 1, len(computationMatrix[0]) - 1]]
    tracebackCount = 0
    tracebackDone = False
    optimalSolutionsCount = 0
    allTracebacksComputed = 0
    while not tracebackDone:
        optimalSolutionsCount += 1
        i = indiciesStack[tracebackCount][0]
        j = indiciesStack[tracebackCount][1]
        split = False
        currentPath = tracebackStack[tracebackCount]

        while i > 0 or j > 0:
            pathVariableI = i
            pathVariableJ = j
            # left arrow
            if j > 0:
                if computationMatrix[i][j] == computationMatrix[i][j - 1] + scoreFunction("", sequenceB[j - 1]):
                    currentPath.append(pah.left)
                    pathVariableJ -= 1
                    split = True

            # up arrow
            if i > 0:
                if computationMatrix[i][j] == computationMatrix[i - 1][j] + scoreFunction(sequenceA[i - 1], ""):
                    if not split:
                        currentPath.append(pah.up)
                        pathVariableI -= 1
                        split = True
                    else:
                        # Second optimal predecessor: branch off a copy of
                        # the path without the step that was just added.
                        tracebackStack.append(currentPath[0:-1])
                        tracebackStack[-1].append(pah.up)
                        indiciesStack.append([i - 1, j])

            # diagonal arrow
            if i > 0 and j > 0:
                if computationMatrix[i][j] == computationMatrix[i - 1][j - 1] + scoreFunction(sequenceA[i - 1],
                                                                                              sequenceB[j - 1]):
                    if not split:
                        currentPath.append(pah.diagonal)
                        pathVariableI -= 1
                        pathVariableJ -= 1
                    else:
                        tracebackStack.append(currentPath[0:-1])
                        tracebackStack[-1].append(pah.diagonal)
                        indiciesStack.append([i - 1, j - 1])
            split = False
            i = pathVariableI
            j = pathVariableJ

        indiciesStack[tracebackCount][0] = i
        indiciesStack[tracebackCount][1] = j
        # Count the paths finished since the last scan and select the next
        # unfinished path, if there is one.
        l = tracebackCount
        while l < len(indiciesStack):
            if indiciesStack[l][0] == 0 and indiciesStack[l][1] == 0:
                allTracebacksComputed += 1
            else:
                tracebackCount = l
                l = len(indiciesStack)
            l += 1
        if allTracebacksComputed >= len(indiciesStack):
            tracebackDone = True
        if maxOptimalSolutions != -1 and optimalSolutionsCount >= maxOptimalSolutions:
            tracebackDone = True

    # Never index past the paths that actually exist: fewer optimal paths
    # than maxOptimalSolutions used to raise an IndexError here.
    solutionCount = len(tracebackStack)
    if maxOptimalSolutions != -1:
        solutionCount = min(maxOptimalSolutions, solutionCount)
    computedAlignment = []
    for pathIndex in range(solutionCount):
        computedAlignment.append(self.buildAlignment(tracebackStack[pathIndex], sequenceA, sequenceB))
    return computedAlignment
157 | """ 158 | i = 0 159 | j = 0 160 | k = len(tracebackStack) - 1 161 | alignmentOfA = "" 162 | alignmentOfB = "" 163 | 164 | while len(tracebackStack) > 0: 165 | try: 166 | tracebackElement = tracebackStack.pop(k) 167 | if pah.left == tracebackElement: 168 | alignmentOfA += "-" 169 | alignmentOfB += sequenceB[j] 170 | j += 1 171 | elif pah.up == tracebackElement: 172 | alignmentOfA += sequenceA[i] 173 | alignmentOfB += "-" 174 | i += 1 175 | elif pah.diagonal == tracebackElement: 176 | alignmentOfA += sequenceA[i] 177 | alignmentOfB += sequenceB[j] 178 | i += 1 179 | j += 1 180 | k -= 1 181 | except: 182 | print "An error occured." 183 | sys.exit() 184 | while i < len(sequenceA): 185 | alignmentOfA += sequenceA[i] 186 | i += 1 187 | while j < len(sequenceB): 188 | alignmentOfB += sequenceB[j] 189 | j += 1 190 | alignment = [alignmentOfA, alignmentOfB] 191 | return alignment 192 | 193 | def compute(self, sequences, scoreFunction, maxOptimalSolutions=-1): 194 | """Method to execute the Needleman-Wunsch algorithm. 195 | sequences: A list with two strings which represents the DNA sequences. 196 | scoreFunction: The name of the weight function defined in 197 | class pairwiseAlignmentHelper.""" 198 | if scoreFunction in dir(pah) and callable(getattr(pah, scoreFunction)): 199 | scoreFunctionObj = eval('pah().' + scoreFunction) 200 | else: 201 | print "Score function not found!" 
202 | sys.exit() 203 | if maxOptimalSolutions == -1: 204 | return self.traceback(sequences[0], sequences[1], 205 | self.computeMatrix(sequences[0], sequences[1], scoreFunctionObj), scoreFunctionObj) 206 | else: 207 | return self.traceback(sequences[0], sequences[1], 208 | self.computeMatrix(sequences[0], sequences[1], scoreFunctionObj), scoreFunctionObj, 209 | maxOptimalSolutions) 210 | 211 | -------------------------------------------------------------------------------- /source/lib/pairwise/test/__init__.py: -------------------------------------------------------------------------------- 1 | from gotohTest import GotohTestClass 2 | from needlemanWunschTest import NeedlemanWunschTestClass 3 | -------------------------------------------------------------------------------- /source/lib/pairwise/test/gotohTest.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # Copyright 2014 Joachim Wolff 3 | # Programming Course: Algorithms in Bioinformatics 4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi 5 | # Winter semester 2014/2015 6 | # 7 | # Chair of Bioinformatics 8 | # Department of Computer Science 9 | # Faculty of Engineering 10 | # Albert-Ludwig-University Freiburg im Breisgau 11 | # 12 | # Gotoh test class 13 | import unittest 14 | import os, sys 15 | lib_path = os.path.abspath('../../') 16 | sys.path.append(lib_path) 17 | 18 | from pairwise import Gotoh 19 | from helper import PairwiseAlignmentHelper as pah 20 | from helper import MathHelper as mathHelper 21 | class GotohTestClass(unittest.TestCase): 22 | """Test class to test the correct computation of the Gotoh algorithm.""" 23 | def test_computeMatrix(self): 24 | """Test method to test the correct computation of the matrix.""" 25 | a = "AGC" 26 | b = "AC" 27 | computedMatrixD = [[0 for i in range(len(b)+1) ] for j in range(len(a)+1)] 28 | computedMatrixP = [[0 for i in range(len(b)+1) ] for j in range(len(a)+1)] 29 | computedMatrixQ = [[0 for i in 
range(len(b)+1) ] for j in range(len(a)+1)] 30 | # initalize matrix 31 | for i in range(1, len(a)+1): 32 | computedMatrixD[i][0] = pah().gapCost(i) 33 | computedMatrixP[i][0] = mathHelper.NaN 34 | computedMatrixQ[i][0] = mathHelper.Inf 35 | for i in range(1, len(b)+1): 36 | computedMatrixD[0][i] = pah().gapCost(i) 37 | computedMatrixP[0][i] = mathHelper.Inf 38 | computedMatrixQ[0][i] = mathHelper.NaN 39 | 40 | 41 | # define values that should be computed by Gotoh algorithm 42 | # matrix D 43 | computedMatrixD[1][1] = 0 44 | computedMatrixD[2][1] = 3 45 | computedMatrixD[3][1] = 4 46 | 47 | computedMatrixD[1][2] = 3 48 | computedMatrixD[2][2] = 1 49 | computedMatrixD[3][2] = 3 50 | 51 | # matrix P 52 | computedMatrixP[1][1] = 6 53 | computedMatrixP[2][1] = 3 54 | computedMatrixP[3][1] = 4 55 | 56 | computedMatrixP[1][2] = 7 57 | computedMatrixP[2][2] = 6 58 | computedMatrixP[3][2] = 4 59 | 60 | # matrix Q 61 | computedMatrixQ[1][1] = 6 62 | computedMatrixQ[2][1] = 7 63 | computedMatrixQ[3][1] = 8 64 | 65 | computedMatrixQ[1][2] = 3 66 | computedMatrixQ[2][2] = 6 67 | computedMatrixQ[3][2] = 7 68 | 69 | computedMatrix = [computedMatrixD, computedMatrixP, computedMatrixQ] 70 | # print "test: ", computedMatrix 71 | # check if the values computed by Gotoh are correct 72 | gotoh = Gotoh(a, b, "weightFunctionDifference", "gapCost") 73 | gotoh.compute_matrix() 74 | # print gotoh.computationMatrix 75 | self.assertEqual(computedMatrix, gotoh.computationMatrix) 76 | 77 | def test_traceback(self): 78 | """Test method to test the correct computation of the traceback.""" 79 | #test case with a single traceback 80 | a = "AGC" 81 | b = "AC" 82 | gotoh = Gotoh(a, b, "weightFunctionDifference", "gapCost") 83 | computedAlignment = [["AGC", "A-C"]] 84 | gotoh.compute_matrix() 85 | gotoh.traceback() 86 | self.assertEqual(computedAlignment, gotoh.computedAlignment) 87 | 88 | # test case with a multiple traceback 89 | a = "CC" 90 | b = "ACCT" 91 | gotoh2 = Gotoh(a, b, 
"weightFunctionDifference", "gapCost") 92 | gotoh2.compute_matrix() 93 | computedAlignment = [["--CC","ACCT"], ["CC--","ACCT"]] 94 | gotoh2.traceback() 95 | self.assertEqual(computedAlignment, gotoh2.computedAlignment) 96 | 97 | if __name__ == "__main__": 98 | unittest.main() # run all tests 99 | -------------------------------------------------------------------------------- /source/lib/pairwise/test/needlemanWunschTest.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # Copyright 2014 Joachim Wolff 3 | # Programming Course: Algorithms in Bioinformatics 4 | # Tutors: Robert Kleinkauf, Omer Alkhnbashi 5 | # Winter semester 2014/2015 6 | # 7 | # Chair of Bioinformatics 8 | # Department of Computer Science 9 | # Faculty of Engineering 10 | # Albert-Ludwig-University Freiburg im Breisgau 11 | 12 | # Test class for the Needleman-Wunsch algorithm 13 | # All test cases are written with PyUnit: http://pyunit.sourceforge.net/ 14 | 15 | import unittest 16 | import os, sys 17 | lib_path = os.path.abspath('../../') 18 | sys.path.append(lib_path) 19 | 20 | from pairwise import NeedlemanWunsch as nw 21 | from helper import PairwiseAlignmentHelper as pah 22 | 23 | class NeedlemanWunschTestClass(unittest.TestCase): 24 | """Class to test the correctness of the computation for the class NeedlemanWunsch.""" 25 | def test_computeMatrix(self): 26 | """Test of the computation of the matrix.""" 27 | a = "AGC" 28 | b = "AC" 29 | computedMatrix = [[0 for i in range(len(b)+1) ] for j in range(len(a)+1)] 30 | 31 | # initalize matrix 32 | for i in range(1, len(a)+1): 33 | computedMatrix[i][0] = computedMatrix[i-1][0] + pah().weightFunctionDifference("", a[i-1]) 34 | for i in range(1, len(b)+1): 35 | computedMatrix[0][i] = computedMatrix[0][i-1] + pah().weightFunctionDifference("", b[i-1]) 36 | 37 | # define values that should be computed by Needleman-Wunsch algorithm 38 | computedMatrix[1][1] = 0 39 | computedMatrix[2][1] = 1 40 | 
computedMatrix[3][1] = 2 41 | 42 | computedMatrix[1][2] = 1 43 | computedMatrix[2][2] = 1 44 | computedMatrix[3][2] = 1 45 | 46 | # check if the values computed by Needleman-Wunsch are correct 47 | self.assertEqual(computedMatrix, nw().compute_matrix(a, b, pah().weightFunctionDifference)) 48 | 49 | def test_traceback(self): 50 | """Test of the traceback computation.""" 51 | # test case with a single traceback 52 | a = "AGC" 53 | b = "AC" 54 | computedAlignment = [["AGC", "A-C"]] 55 | computedMatrix = nw().compute_matrix(a, b, pah().weightFunctionDifference) 56 | self.assertEqual(computedAlignment, 57 | nw().traceback(a, b, computedMatrix,pah().weightFunctionDifference)) 58 | 59 | # test case with a multiple traceback 60 | a = "AT" 61 | b = "AAGT" 62 | computedMatrix = nw().compute_matrix(a, b, pah().weightFunctionDifference) 63 | computedAlignment = [["A--T","AAGT"], ["-A-T","AAGT"]] 64 | self.assertEqual(computedAlignment, 65 | nw().traceback(a, b, computedMatrix,pah().weightFunctionDifference)) 66 | 67 | if __name__ == "__main__": 68 | unittest.main() # run all tests 69 | -------------------------------------------------------------------------------- /source/lib/structurePrediction/__init__.py: -------------------------------------------------------------------------------- 1 | from nussinov import Nussinov -------------------------------------------------------------------------------- /source/lib/structurePrediction/nussinov.py: -------------------------------------------------------------------------------- 1 | #! 
class Nussinov(object):
    """The algorithm of Nussinov is a RNA secondary structure folding algorithm. It was developed by
    Ruth Nussinov et al. and was published in 1978:
    Nussinov, Ruth, et al. "Algorithms for loop matchings."
    SIAM Journal on Applied mathematics 35.1 (1978): 68-82.
    http://rci.rutgers.edu/~piecze/GriggsNussinovKleitmanPieczenik.pdf

    Index convention: computationMatrix[i][j] is the maximal number of base
    pairs within sequence[i:j], so the matrix has len(sequence) rows and
    len(sequence) + 1 columns. pairedBases maps the 1-based position of the
    left base of a pair to the 1-based position of the right base.
    """

    # Ordered pairs of nucleotides that complementary() accepts.
    _COMPLEMENTARY_PAIRS = frozenset([("A", "U"), ("U", "A"), ("C", "G"), ("G", "C")])

    def __init__(self, rnaSequence):
        """rnaSequence: The RNA sequence for which the folding should be computed."""
        self.sequence = rnaSequence
        self.pairedBases = {}
        self.computationMatrix = [[]]

    def computeMatrix(self):
        """This function computes the matrix which the Nussinov-algorithm is based on.
        Cells are filled diagonal by diagonal, shortest subsequences first, so
        every cell only reads cells that are already final."""
        length = len(self.sequence)
        self.computationMatrix = [[0] * (length + 1) for _ in range(length)]
        for span in range(2, length + 1):
            for start in range(length - span + 1):
                self.computeMatrixCell(start, start + span)

    def compute_matrix(self):
        """Snake-case alias of computeMatrix(); the unit tests call this name."""
        self.computeMatrix()

    def computeMatrixCell(self, i, j):
        """This function computes the value of one cell of the matrix for the Nussinov-algorithm.
        i: First index of the cell (start of the subsequence, 0-based)
        j: Second index of the cell (end of the subsequence, exclusive)
        The last base of the subsequence, sequence[j-1], is either unpaired or
        paired with a complementary base sequence[k]:
                       | N[i][j-1]
        N[i][j] = max  | max over i <= k < j-1 of N[i][k] + N[k+1][j-1] + 1
                       |     where sequence[k] and sequence[j-1] are complementary
        """
        matrix = self.computationMatrix
        lastBase = self.sequence[j - 1]
        best = matrix[i][j - 1]
        # k stops at j-2: a base cannot pair with itself.
        for k in range(i, j - 1):
            if self.complementary(self.sequence[k], lastBase):
                # matrix[i][k] covers everything left of k, matrix[k+1][j-1]
                # everything enclosed by the new pair.
                best = max(best, matrix[i][k] + matrix[k + 1][j - 1] + 1)
        matrix[i][j] = best

    def complementary(self, characterA, characterB):
        """Returns True if two RNA nucleotides are complementary, False otherwise.
        Nucleotides are complementary if they are "A" and "U" or "C" and "G".
        characterA: First nucleotide
        characterB: Second nucleotide"""
        return (characterA, characterB) in self._COMPLEMENTARY_PAIRS

    def traceback(self, i, j):
        """Computes the traceback for the Nussinov-algorithm and stores the
        pairs found within sequence[i:j] in pairedBases.
        i: First index of cell of the Nussinov-matrix
        j: Second index of cell of the Nussinov-matrix
        """
        matrix = self.computationMatrix
        # Explicit work list instead of recursion, so the depth does not grow
        # with the sequence length.
        pending = [(i, j)]
        while pending:
            start, end = pending.pop()
            # Drop unpaired bases at the right end of the interval.
            while end > start and matrix[start][end] == matrix[start][end - 1]:
                end -= 1
            if end <= start:
                continue
            # sequence[end-1] is paired; look for its partner inside the
            # interval only.
            for k in range(start, end - 1):
                if (self.complementary(self.sequence[k], self.sequence[end - 1])
                        and matrix[start][end] == matrix[start][k] + matrix[k + 1][end - 1] + 1):
                    self.pairedBases[k + 1] = end
                    # Pushed in reverse so the left part is handled first.
                    pending.append((k + 1, end - 1))
                    pending.append((start, k))
                    break

    def execute(self):
        """To compute the Nussinov-algorithm execute this method. It prints the paired bases and
        their number and returns the dictionary with the paired bases."""
        self.computeMatrix()
        self.traceback(0, len(self.sequence))
        print(self.pairedBases)
        print(len(self.pairedBases))
        return self.pairedBases
Backofen 23 | expectedMatrix = [[0,0,1,1,1,2,2,2,3], [0,0,0,0,0,1,1,1,2], [0,0,0,0,0,1,1,1,2], [0,0,0,0,0,1,1,1,2], [0,0,0,0,0,0,0,1,1], [0,0,0,0,0,0,0,0,1], [0,0,0,0,0,0,0,0,1],[0,0,0,0,0,0,0,0,0]] 24 | rnaSequence = "GCACGACG" 25 | nussinov = Nussinov(rnaSequence) 26 | nussinov.compute_matrix() 27 | self.assertEqual(expectedMatrix, nussinov.computationMatrix) 28 | 29 | def test_traceback(self): 30 | # example for the slides of Prof. Backofen 31 | expectedMatrix = {1:2, 4:8, 5:7} 32 | rnaSequence = "GCACGACG" 33 | nussinov = Nussinov(rnaSequence) 34 | nussinov.compute_matrix() 35 | nussinov.traceback(0, len(rnaSequence)) 36 | self.assertEqual(expectedMatrix, nussinov.pairedBases) 37 | 38 | if __name__ == "__main__": 39 | unittest.main() # run all tests -------------------------------------------------------------------------------- /source/sequences: -------------------------------------------------------------------------------- 1 | >sequence 0 2 | UUUGGUCCUCGGUAGUGGUUUCCGGAAAACGAUUUUCCGUGAACUUCGAUCGAAGAUCCAU 3 | --------------------------------------------------------------------------------