├── .gitignore ├── Repeats.h ├── convert.h ├── LeftAlign.h ├── LICENSE ├── IndelAllele.h ├── IndelAllele.cpp ├── Mosaik.h ├── Repeats.cpp ├── Makefile ├── disorder.h ├── SmithWatermanGotoh.h ├── BandedSmithWaterman.h ├── SWMain.cpp ├── disorder.cpp ├── smithwaterman.cpp ├── examples.txt ├── libdisorder.LICENSE ├── BandedSmithWaterman.cpp ├── SmithWatermanGotoh.cpp └── LeftAlign.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | *.o 3 | smithwaterman 4 | -------------------------------------------------------------------------------- /Repeats.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | using namespace std; 6 | 7 | map repeatCounts(long int pos, const string& seq, int maxsize); 8 | bool isRepeatUnit(const string& seq, const string& unit); 9 | -------------------------------------------------------------------------------- /convert.h: -------------------------------------------------------------------------------- 1 | #ifndef __CONVERT_H 2 | #define __CONVERT_H 3 | 4 | #include 5 | 6 | // converts the string into the specified type, setting r to the converted 7 | // value and returning true/false on success or failure 8 | template 9 | bool convert(const std::string& s, T& r) { 10 | std::istringstream iss(s); 11 | iss >> r; 12 | return iss.eof() ? 
true : false; 13 | } 14 | 15 | template 16 | std::string convert(const T& r) { 17 | std::ostringstream iss; 18 | iss << r; 19 | return iss.str(); 20 | } 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /LeftAlign.h: -------------------------------------------------------------------------------- 1 | #ifndef __LEFTALIGN_H 2 | #define __LEFTALIGN_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "IndelAllele.h" 12 | #include "convert.h" 13 | 14 | #ifdef VERBOSE_DEBUG 15 | #define LEFTALIGN_DEBUG(msg) \ 16 | if (debug) { cerr << msg; } 17 | #else 18 | #define LEFTALIGN_DEBUG(msg) 19 | #endif 20 | 21 | using namespace std; 22 | 23 | bool leftAlign(string& alternateQuery, string& cigar, string& referenceSequence, int& offset, bool debug = false); 24 | bool stablyLeftAlign(string alternateQuery, string& cigar, string referenceSequence, int& offset, int maxiterations = 20, bool debug = false); 25 | int countMismatches(string& alternateQuery, string& cigar, string& referenceSequence); 26 | 27 | string mergeCIGAR(const string& c1, const string& c2); 28 | vector > splitCIGAR(const string& cigarStr); 29 | string joinCIGAR(const vector >& cigar); 30 | 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Erik Garrison 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | 
The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /IndelAllele.h: -------------------------------------------------------------------------------- 1 | #ifndef __INDEL_ALLELE_H 2 | #define __INDEL_ALLELE_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | /* Describes one insertion or deletion (indel); all data members are public. */ 10 | class IndelAllele { 11 | friend ostream& operator<<(ostream&, const IndelAllele&); 12 | friend bool operator==(const IndelAllele&, const IndelAllele&); 13 | friend bool operator!=(const IndelAllele&, const IndelAllele&); 14 | friend bool operator<(const IndelAllele&, const IndelAllele&); 15 | public: 16 | bool insertion; /* true = insertion, false = deletion */ 17 | int length; /* size of the indel */ 18 | int referenceLength(void); /* length for a deletion, 0 for an insertion */ 19 | int readLength(void); /* length for an insertion, 0 for a deletion */ 20 | int position; /* presumably the coordinate on the reference -- TODO confirm against callers */ 21 | int readPosition; /* presumably the coordinate within the read -- TODO confirm; NOTE: not compared by operator== */ 22 | string sequence; /* indel sequence text; homopolymer() tests whether all its characters are equal */ 23 | 24 | bool homopolymer(void); 25 | 26 | IndelAllele(bool i, int l, int p, int rp, string s) 27 | : insertion(i), length(l), position(p), readPosition(rp), sequence(s) 28 | { } 29 | }; 30 | 31 | bool homopolymer(string sequence); /* same all-characters-equal test for an arbitrary string */ 32 | ostream& operator<<(ostream& out, const IndelAllele& indel); 33 | bool operator==(const IndelAllele& a, const IndelAllele& b); 34 | bool operator!=(const IndelAllele& a, const IndelAllele& b); 35 | bool operator<(const IndelAllele& a, const IndelAllele& b); /* orders by the text that operator<< produces */ 36 | 37 | #endif 38 | 
-------------------------------------------------------------------------------- /IndelAllele.cpp: -------------------------------------------------------------------------------- 1 | #include "IndelAllele.h" 2 | 3 | using namespace std; 4 | 5 | /* true when every character of sequence is the same; an empty sequence is vacuously a homopolymer */ 6 | bool IndelAllele::homopolymer(void) { 7 | /* compare by index against the first character: unlike the former iterator walk this never dereferences begin() of an empty string */ 8 | for (string::size_type i = 1; i < sequence.size(); ++i) { 9 | if (sequence[i] != sequence[0]) return false; 10 | } 11 | return true; 12 | } 13 | 14 | /* returns length for an insertion and 0 for a deletion */ 15 | int IndelAllele::readLength(void) { 16 | if (insertion) { 17 | return length; 18 | } else { 19 | return 0; 20 | } 21 | } 22 | /* returns length for a deletion and 0 for an insertion */ 23 | int IndelAllele::referenceLength(void) { 24 | if (insertion) { 25 | return 0; 26 | } else { 27 | return length; 28 | } 29 | } 30 | /* free-function form of the same test: true when all characters of sequence are equal (empty => true) */ 31 | bool homopolymer(string sequence) { 32 | /* index-based scan, safe on an empty string */ 33 | for (string::size_type i = 1; i < sequence.size(); ++i) { 34 | if (sequence[i] != sequence[0]) return false; 35 | } 36 | return true; 37 | } 38 | 39 | /* writes the allele as i-or-d, position, readPosition, length and sequence, separated by ':' */ 40 | ostream& operator<<(ostream& out, const IndelAllele& indel) { 41 | string t = indel.insertion ? 
"i" : "d"; 42 | out << t << ":" << indel.position << ":" << indel.readPosition << ":" << indel.length << ":" << indel.sequence; 43 | return out; 44 | } 45 | /* NOTE(review): readPosition is not compared here, although operator< below does see it through operator<<; two alleles can compare equal and still be ordered */ 46 | bool operator==(const IndelAllele& a, const IndelAllele& b) { 47 | return (a.insertion == b.insertion 48 | && a.length == b.length 49 | && a.position == b.position 50 | && a.sequence == b.sequence); 51 | } 52 | 53 | bool operator!=(const IndelAllele& a, const IndelAllele& b) { 54 | return !(a==b); 55 | } 56 | /* orders alleles by comparing their operator<< text, so the numeric fields sort as strings rather than as numbers */ 57 | bool operator<(const IndelAllele& a, const IndelAllele& b) { 58 | ostringstream as, bs; 59 | as << a; 60 | bs << b; 61 | return as.str() < bs.str(); 62 | } 63 | -------------------------------------------------------------------------------- /Mosaik.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef WIN32 4 | //#include "SafeFunctions.h" 5 | #endif 6 | 7 | // ============== 8 | // MOSAIK version 9 | // ============== 10 | 11 | #define MOSAIK_VERSION_DATE "2009-02-11" 12 | 13 | // adopt a major.minor.build version number [1].[1].[3] 14 | const unsigned char MOSAIK_MAJOR_VERSION = 0; 15 | const unsigned char MOSAIK_MINOR_VERSION = 9; 16 | const unsigned short MOSAIK_BUILD_VERSION = 899; 17 | 18 | // ================================ 19 | // Platform specific variable sizes 20 | // ================================ 21 | 22 | // Windows Vista 32-bit 23 | // Fedora Core 7 32-bit 24 | // Fedora Core 6 64-bit 25 | // Itanium2 64-bit 26 | #define SIZEOF_CHAR 1 27 | #define SIZEOF_WCHAR 2 28 | #define SIZEOF_SHORT 2 29 | #define SIZEOF_INT 4 30 | #define SIZEOF_FLOAT 4 31 | #define SIZEOF_DOUBLE 8 32 | #define SIZEOF_UINT64 8 33 | #define MOSAIK_LITTLE_ENDIAN 1 34 | 35 | #ifdef WIN32 36 | typedef signed long long int64_t; 37 | typedef unsigned long long uint64_t; 38 | #endif 39 | 40 | #define NEGATIVE_ONE_INT 0xffffffff 41 | #define NEGATIVE_TWO_INT 0xfffffffe 42 | #define NEGATIVE_THREE_INT 0xfffffffd 43 | #define NEGATIVE_FOUR_INT 
0xfffffffc 44 | #define MAX_SHORT 0xffff 45 | 46 | // ========================== 47 | // Platform specific file I/O 48 | // ========================== 49 | 50 | #ifdef WIN32 51 | const char OS_DIRECTORY_SEPARATOR = '\\'; 52 | #else 53 | const char OS_DIRECTORY_SEPARATOR = '/'; 54 | #endif 55 | 56 | #define DIRECTORY_NAME_LENGTH 255 57 | 58 | // ==================================== 59 | // Enable unit test diagnostic messages 60 | // ==================================== 61 | 62 | #ifdef UNITTEST 63 | #define SILENTMODE if(0) 64 | #else 65 | #define SILENTMODE 66 | #endif 67 | 68 | // ================= 69 | // Aligner constants 70 | // ================= 71 | 72 | const double HASH_REGION_EXTENSION_PERCENT = 0.025; 73 | const unsigned char REFERENCE_SEQUENCE_QUALITY = 40; 74 | -------------------------------------------------------------------------------- /Repeats.cpp: -------------------------------------------------------------------------------- 1 | #include "Repeats.h" 2 | 3 | map repeatCounts(long int position, const string& sequence, int maxsize) { 4 | map counts; 5 | for (int i = 1; i <= maxsize; ++i) { 6 | // subseq here i bases 7 | string seq = sequence.substr(position, i); 8 | // go left. 9 | 10 | int j = position - i; 11 | int leftsteps = 0; 12 | while (j >= 0 && seq == sequence.substr(j, i)) { 13 | j -= i; 14 | ++leftsteps; 15 | } 16 | 17 | // go right. 
18 | j = position; 19 | 20 | int rightsteps = 0; 21 | while (j + i <= sequence.size() && seq == sequence.substr(j, i)) { 22 | j += i; 23 | ++rightsteps; 24 | } 25 | // record the unit only when it occurs at least twice in a row; the copy at position itself is counted by the right scan, 26 | if (leftsteps + rightsteps > 1) { 27 | counts[seq] = leftsteps + rightsteps; 28 | } 29 | } 30 | 31 | // filter out redundant repeat information 32 | if (counts.size() > 1) { 33 | map filteredcounts; 34 | map::iterator c = counts.begin(); 35 | string prev = c->first; 36 | filteredcounts[prev] = c->second; // shortest sequence 37 | ++c; 38 | for (; c != counts.end(); ++c) { 39 | int i = 0; 40 | string seq = c->first; 41 | while (i + prev.length() <= seq.length() && seq.substr(i, prev.length()) == prev) { 42 | i += prev.length(); 43 | } 44 | if (i < seq.length()) { 45 | filteredcounts[seq] = c->second; 46 | prev = seq; 47 | } 48 | } 49 | return filteredcounts; 50 | } else { 51 | return counts; 52 | } 53 | } 54 | /* true when seq consists of unit repeated a whole number of times (an empty seq counts as zero repeats); an empty unit never matches */ 55 | bool isRepeatUnit(const string& seq, const string& unit) { 56 | /* test for an empty unit first: it cannot tile anything and would make the modulo below divide by zero */ 57 | if (unit.empty() || seq.size() % unit.size() != 0) { 58 | return false; 59 | } else { 60 | int maxrepeats = seq.size() / unit.size(); 61 | for (int i = 0; i < maxrepeats; ++i) { 62 | if (seq.substr(i * unit.size(), unit.size()) != unit) { 63 | return false; 64 | } 65 | } 66 | return true; 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # ========================================= 2 | # MOSAIK Banded Smith-Waterman Makefile 3 | # (c) 2009 Michael Stromberg & Wan-Ping Lee 4 | # ========================================= 5 | 6 | # ---------------------------------- 7 | # define our source and object files 8 | # ---------------------------------- 9 | SOURCES= smithwaterman.cpp BandedSmithWaterman.cpp SmithWatermanGotoh.cpp Repeats.cpp LeftAlign.cpp IndelAllele.cpp 10 | OBJECTS= $(SOURCES:.cpp=.o) disorder.o 11 | OBJECTS_NO_MAIN= 
disorder.o BandedSmithWaterman.o SmithWatermanGotoh.o Repeats.o LeftAlign.o IndelAllele.o 12 | 13 | # ---------------- 14 | # compiler options 15 | # ---------------- 16 | 17 | # Use ?= to allow overriding from the env or command-line 18 | CXX?= c++ 19 | CXXFLAGS?= -O3 20 | OBJ?= sw.o 21 | 22 | # I don't think := is useful here, since there is nothing to expand 23 | LDFLAGS:= -Wl,-s 24 | #CXXFLAGS=-g 25 | EXE:= smithwaterman 26 | LIBS= 27 | 28 | all: $(EXE) $(OBJ) 29 | 30 | .PHONY: all 31 | 32 | libsw.a: smithwaterman.o BandedSmithWaterman.o SmithWatermanGotoh.o LeftAlign.o Repeats.o IndelAllele.o disorder.o 33 | ar rs $@ smithwaterman.o SmithWatermanGotoh.o disorder.o BandedSmithWaterman.o LeftAlign.o Repeats.o IndelAllele.o 34 | 35 | sw.o: BandedSmithWaterman.o SmithWatermanGotoh.o LeftAlign.o Repeats.o IndelAllele.o disorder.o 36 | ld -r $^ -o sw.o -L. 37 | #$(CXX) $(CFLAGS) -c -o smithwaterman.cpp $(OBJECTS_NO_MAIN) -I. 38 | 39 | ### @$(CXX) $(LDFLAGS) $(CFLAGS) -o $@ $^ -I. 40 | $(EXE): smithwaterman.o BandedSmithWaterman.o SmithWatermanGotoh.o disorder.o LeftAlign.o Repeats.o IndelAllele.o 41 | $(CXX) $(CFLAGS) $^ -I. -o $@ 42 | 43 | #smithwaterman: $(OBJECTS) 44 | # $(CXX) $(CXXFLAGS) -o $@ $< -I. 45 | 46 | smithwaterman.o: smithwaterman.cpp disorder.o 47 | $(CXX) $(CXXFLAGS) -c -o $@ smithwaterman.cpp -I. 48 | 49 | disorder.o: disorder.cpp disorder.h 50 | $(CXX) $(CXXFLAGS) -c -o $@ $< -I. 51 | BandedSmithWaterman.o: BandedSmithWaterman.cpp BandedSmithWaterman.h 52 | $(CXX) $(CXXFLAGS) -c -o $@ $< -I. 53 | SmithWatermanGotoh.o: SmithWatermanGotoh.cpp SmithWatermanGotoh.h disorder.o 54 | $(CXX) $(CXXFLAGS) -c -o $@ $< -I. 55 | Repeats.o: Repeats.cpp 56 | $(CXX) $(CXXFLAGS) -c -o $@ $< -I. 57 | LeftAlign.o: LeftAlign.cpp 58 | $(CXX) $(CXXFLAGS) -c -o $@ $< -I. 59 | IndelAllele.o: IndelAllele.cpp 60 | $(CXX) $(CXXFLAGS) -c -o $@ $< -I. 61 | 62 | .PHONY: clean 63 | 64 | clean: 65 | @echo "Cleaning up." 
66 | @rm -f *.o $(EXE) *~ 67 | # NOTE: the binary is named by $(EXE); this Makefile never sets $(PROGRAM), so the old recipe left the executable behind -------------------------------------------------------------------------------- /disorder.h: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * libdisorder: A Library for Measuring Byte Stream Entropy 3 | * Copyright (C) 2010 Michael E. Locasto 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation; either version 2 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the: 17 | * Free Software Foundation, Inc. 18 | * 59 Temple Place, Suite 330 19 | * Boston, MA 02111-1307 USA 20 | * 21 | * $Id$ 22 | **************************************************************************/ 23 | 24 | #ifndef __DISORDER_H_ 25 | #define __DISORDER_H_ 26 | 27 | /** Max number of bytes (i.e., tokens) */ 28 | #define LIBDO_MAX_BYTES 256 29 | 30 | /** A convenience value for clients of this library. Feel free to change 31 | * if you plan to use a larger buffer. You can also safely ignore it, as 32 | * libdisorder does not use this value internally; it relies on the 33 | * client-supplied `length' parameter. 34 | * 35 | * NB: Might become deprecated because it is potentially misleading and 36 | * has zero relationship to any library internal state. 
37 | */ 38 | #define LIBDO_BUFFER_LEN 16384 39 | 40 | /** 41 | * Given a pointer to an array of bytes, return a float indicating the 42 | * level of entropy in bits (a number between zero and eight), 43 | * assuming a space of 256 possible byte values. The second argument 44 | * indicates the number of bytes in the sequence. If this sequence 45 | * runs into unallocated memory, this function should fail with a 46 | * SIGSEGV. 47 | */ 48 | float shannon_H(char*, long long); 49 | 50 | /** Report the number of (unique) tokens seen. This is _not_ the 51 | number of individual events seen. For example, if the library sees 52 | the string `aaab', the number of events is 4 and the number of 53 | tokens is 2. */ 54 | int get_num_tokens(void); 55 | 56 | /** Returns maximum entropy for byte distributions log2(256)=8 bits*/ 57 | float get_max_entropy(void); 58 | 59 | /** Returns the ratio of entropy to maxentropy */ 60 | float get_entropy_ratio(void); 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /SmithWatermanGotoh.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | //#include "Alignment.h" 7 | #include "Mosaik.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "disorder.h" 13 | #include "Repeats.h" 14 | #include "LeftAlign.h" 15 | 16 | using namespace std; 17 | 18 | #define MOSAIK_NUM_NUCLEOTIDES 26 19 | #define GAP '-' 20 | 21 | class CSmithWatermanGotoh { 22 | public: 23 | // constructor 24 | CSmithWatermanGotoh(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty); 25 | // destructor 26 | ~CSmithWatermanGotoh(void); 27 | // aligns the query sequence to the reference using the Smith Waterman Gotoh algorithm 28 | void Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2); 29 | // enables homo-polymer scoring 30 | void 
EnableHomoPolymerGapPenalty(float hpGapOpenPenalty); 31 | // enables non-repeat gap open penalty 32 | void EnableEntropyGapPenalty(float enGapOpenPenalty); 33 | // enables repeat gap extension penalty 34 | void EnableRepeatGapExtensionPenalty(float rGapExtensionPenalty, float rMaxGapRepeatExtensionPenaltyFactor = 10); 35 | // record the best score for external use 36 | float BestScore; 37 | private: 38 | // creates a simple scoring matrix to align the nucleotides and the ambiguity code N 39 | void CreateScoringMatrix(void); 40 | // corrects the homopolymer gap order for forward alignments 41 | void CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches); 42 | // returns the maximum floating point number 43 | static inline float MaxFloats(const float& a, const float& b, const float& c); 44 | // our simple scoring matrix 45 | float mScoringMatrix[MOSAIK_NUM_NUCLEOTIDES][MOSAIK_NUM_NUCLEOTIDES]; 46 | // keep track of maximum initialized sizes 47 | unsigned int mCurrentMatrixSize; 48 | unsigned int mCurrentAnchorSize; 49 | unsigned int mCurrentQuerySize; 50 | unsigned int mCurrentAQSumSize; 51 | // define our traceback directions 52 | // N.B. 
This used to be defined as an enum, but gcc doesn't like being told 53 | // which storage class to use 54 | const static char Directions_STOP; 55 | const static char Directions_LEFT; 56 | const static char Directions_DIAGONAL; 57 | const static char Directions_UP; 58 | // repeat structure determination 59 | const static int repeat_size_max; 60 | // define scoring constants 61 | const float mMatchScore; 62 | const float mMismatchScore; 63 | const float mGapOpenPenalty; 64 | const float mGapExtendPenalty; 65 | // store the backtrace pointers 66 | char* mPointers; 67 | // store the vertical gap sizes - assuming gaps are not longer than 32767 bases (the most a 2-byte signed short can hold) 68 | short* mSizesOfVerticalGaps; 69 | // store the horizontal gap sizes - assuming gaps are not longer than 32767 bases (the most a 2-byte signed short can hold) 70 | short* mSizesOfHorizontalGaps; 71 | // score if xi aligns to a gap after yi 72 | float* mQueryGapScores; 73 | // best score of alignment x1...xi to y1...yi 74 | float* mBestScores; 75 | // our reversed alignment 76 | char* mReversedAnchor; 77 | char* mReversedQuery; 78 | // define static constants 79 | static const float FLOAT_NEGATIVE_INFINITY; 80 | // toggles the use of the homo-polymer gap open penalty 81 | bool mUseHomoPolymerGapOpenPenalty; 82 | // specifies the homo-polymer gap open penalty 83 | float mHomoPolymerGapOpenPenalty; 84 | // toggles the use of the entropy gap open penalty 85 | bool mUseEntropyGapOpenPenalty; 86 | // specifies the entropy gap open penalty (multiplier) 87 | float mEntropyGapOpenPenalty; 88 | // toggles the use of the repeat gap extension penalty 89 | bool mUseRepeatGapExtensionPenalty; 90 | // specifies the repeat gap extension penalty 91 | float mRepeatGapExtensionPenalty; 92 | // specifies the max repeat gap extension penalty 93 | float mMaxRepeatGapExtensionPenalty; 94 | }; 95 | 96 | // returns the largest of a, b and c, or 0.0f when none of them is positive (max starts at zero) 97 | inline float CSmithWatermanGotoh::MaxFloats(const float& a, const float& b, const float& c) { 98 | float max = 0.0f; 99 | 
if(a > max) max = a; 100 | if(b > max) max = b; 101 | if(c > max) max = c; 102 | return max; 103 | } 104 | -------------------------------------------------------------------------------- /BandedSmithWaterman.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | //#include "Alignment.h" 7 | #include "Mosaik.h" 8 | //#include "HashRegion.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | using namespace std; 15 | 16 | #define MOSAIK_NUM_NUCLEOTIDES 26 17 | #define GAP '-' 18 | 19 | typedef unsigned char DirectionType; 20 | typedef unsigned char PositionType; 21 | 22 | struct ElementInfo { 23 | unsigned int Direction : 2; 24 | unsigned int mSizeOfVerticalGaps : 15; 25 | unsigned int mSizeOfHorizontalGaps : 15; 26 | }; 27 | 28 | class CBandedSmithWaterman { 29 | public: 30 | // constructor 31 | CBandedSmithWaterman(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty, unsigned int bandWidth); 32 | // destructor 33 | ~CBandedSmithWaterman(void); 34 | // aligns the query sequence to the anchor using the Smith Waterman Gotoh algorithm 35 | void Align(unsigned int& referenceAl, string& stringAl, const string& s1, const string& s2, pair< pair, pair >& hr); 36 | // enables homo-polymer scoring 37 | void EnableHomoPolymerGapPenalty(float hpGapOpenPenalty); 38 | private: 39 | // calculates the score during the forward algorithm 40 | float CalculateScore(const string& s1, const string& s2, const unsigned int rowNum, const unsigned int columnNum, float& currentQueryGapScore, const unsigned int rowOffset, const unsigned int columnOffset); 41 | // creates a simple scoring matrix to align the nucleotides and the ambiguity code N 42 | void CreateScoringMatrix(void); 43 | // corrects the homopolymer gap order for forward alignments 44 | void CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches); 45 | // 
returns the maximum floating point number 46 | static inline float MaxFloats(const float& a, const float& b, const float& c); 47 | // reinitializes the matrices 48 | void ReinitializeMatrices(const PositionType& positionType, const unsigned int& s1Length, const unsigned int& s2Length, const pair< pair, pair > hr); 49 | // performs the backtrace algorithm 50 | void Traceback(unsigned int& referenceAl, string& stringAl, const string& s1, const string& s2, unsigned int bestRow, unsigned int bestColumn, const unsigned int rowOffset, const unsigned int columnOffset); 51 | // updates the best score during the forward algorithm 52 | inline void UpdateBestScore(unsigned int& bestRow, unsigned int& bestColumn, float& bestScore, const unsigned int rowNum, const unsigned int columnNum, const float score); 53 | // our simple scoring matrix 54 | float mScoringMatrix[MOSAIK_NUM_NUCLEOTIDES][MOSAIK_NUM_NUCLEOTIDES]; 55 | // keep track of maximum initialized sizes 56 | unsigned int mCurrentMatrixSize; 57 | unsigned int mCurrentAnchorSize; 58 | unsigned int mCurrentAQSumSize; 59 | unsigned int mBandwidth; 60 | // define our backtrace directions 61 | const static DirectionType Directions_STOP; 62 | const static DirectionType Directions_LEFT; 63 | const static DirectionType Directions_DIAGONAL; 64 | const static DirectionType Directions_UP; 65 | // store the backtrace pointers 66 | ElementInfo* mPointers; 67 | // define our position types 68 | const static PositionType Position_REF_AND_QUERY_ZERO; 69 | const static PositionType Position_REF_ZERO; 70 | const static PositionType Position_QUERY_ZERO; 71 | const static PositionType Position_REF_AND_QUERO_NONZERO; 72 | // define scoring constants 73 | const float mMatchScore; 74 | const float mMismatchScore; 75 | const float mGapOpenPenalty; 76 | const float mGapExtendPenalty; 77 | // score if xi aligns to a gap after yi 78 | float* mAnchorGapScores; 79 | // best score of alignment x1...xi to y1...yi 80 | float* mBestScores; 81 | // our 
reversed alignment 82 | char* mReversedAnchor; 83 | char* mReversedQuery; 84 | // define static constants 85 | static const float FLOAT_NEGATIVE_INFINITY; 86 | // toggles the use of the homo-polymer gap open penalty 87 | bool mUseHomoPolymerGapOpenPenalty; 88 | float mHomoPolymerGapOpenPenalty; 89 | }; 90 | 91 | // returns the largest of a, b and c, or 0.0f when none of them is positive (max starts at zero) 92 | inline float CBandedSmithWaterman::MaxFloats(const float& a, const float& b, const float& c) { 93 | float max = 0.0f; 94 | if(a > max) max = a; 95 | if(b > max) max = b; 96 | if(c > max) max = c; 97 | return max; 98 | } 99 | 100 | // updates the best score during the forward algorithm; only a strictly greater score replaces the stored best, so ties keep the earlier cell 101 | inline void CBandedSmithWaterman::UpdateBestScore(unsigned int& bestRow, unsigned int& bestColumn, float& bestScore, const unsigned int rowNum, const unsigned int columnNum, const float score) { 102 | 103 | //const unsigned int row = rowNum + rowOffset; 104 | //const unsigned int column = columnOffset - rowNum + columnNum; 105 | 106 | if(score > bestScore) { 107 | bestRow = rowNum; 108 | bestColumn = columnNum; 109 | bestScore = score; 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /SWMain.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | //#include "Alignment.h" 4 | //#include "Benchmark.h" 5 | //#include "HashRegion.h" 6 | #include "SmithWatermanGotoh.h" 7 | #include "BandedSmithWaterman.h" 8 | 9 | using namespace std; 10 | 11 | int main(int argc, char* argv[]) { 12 | /* 13 | printf("------------------------------------------------------------------------------\n"); 14 | printf("Banded Smith-Waterman Algorithm (worst case)\n"); 15 | printf("Michael Stromberg & Wan-Ping Lee Marth Lab, Boston College Biology Department\n"); 16 | printf("------------------------------------------------------------------------------\n\n"); 17 | */ 18 | // this version simulates the worst case of only a fragment hashing 
to the 19 | // reference sequence. Basically a non-centered diagonal in the Smith-Waterman 20 | // dynamic programming matrix. 21 | 22 | // here we simulate a region on the reference that occurs between position 4001 23 | // and position 4136. During hashing, only the first 20 bases in the query 24 | // matched perfectly. 25 | 26 | // define the start and end coordinates of the entire reference region 27 | //const unsigned int start = 4001; 28 | //const unsigned int end = 4136; 29 | 30 | //const unsigned int testStart = atoi(argv[1]); 31 | //const unsigned int testEnd = atoi(argv[2]); 32 | //const unsigned int testQueryStart = atoi(argv[3]); 33 | //const unsigned int testQueryEnd = atoi(argv[4]); 34 | 35 | //cout << endl<< "=====================================================" << endl; 36 | //cout << testStart << "\t" << testQueryStart << endl; 37 | 38 | // define the 20 bases 39 | // that matched perfectly 40 | //HashRegion hr; 41 | 42 | //===================================================== 43 | // define the hash region (the values set below cover reference 5-13 and query 0-8) 44 | // first.first: reference begin 45 | // first.second: reference end 46 | // second.first: query begin 47 | // second.second: query end 48 | //===================================================== 49 | 50 | pair< pair, pair > hr; 51 | hr.first.first = 5; 52 | hr.first.second = 13; 53 | hr.second.first = 0; 54 | hr.second.second = 8; 55 | 56 | //===================================================== 57 | 58 | // for 76 bp reads, we expect as many as 12 mismatches - however this does not 59 | // translate to a bandwidth of 12 * 2 + 1 since most of these will be 60 | // substitution errors 61 | const unsigned char bandwidth = 11; 62 | 63 | // initialize 64 | const char* pReference = "ATGGCGGGGATCGGGACACTCGCCGGTGCGGGTACCCTA"; 65 | const char* pQuery = "GGGGATCGGGACACTCGCTCTCCGGTGCGGGTA"; 66 | // the lengths are kept for reference only; Align() below is given the sequences themselves 67 | const unsigned int referenceLen = strlen(pReference); 68 | const unsigned int queryLen = strlen(pQuery); 69 | 70 | // 
============================================================================================== 71 | // benchmarking reference on koi.bc.edu when NUM_ITERATIONS = 38000 on 76 bp read (1 try): 72 | // CPU time: 23.920 s, wall time: 24.012 s (1582.6 alignments/s) 73 | // ============================================================================================== 74 | //const unsigned int NUM_ITERATIONS = 38000; 75 | //unsigned int NUM_ITERATIONS = 1; 76 | 77 | // create a new Smith-Waterman alignment object 78 | CSmithWatermanGotoh sw(10.0f, -9.0f, 15.0f, 6.66f); 79 | CBandedSmithWaterman bsw(10.0f, -9.0f, 15.0f, 6.66f, bandwidth); 80 | 81 | // start timing the algorithm 82 | //CBenchmark bench; 83 | //bench.Start(); 84 | 85 | // perform NUM_ITERATIONS alignments 86 | //Alignment bswAl; 87 | //Alignment swAl; 88 | // referenceBegin, referenceEnd 89 | unsigned int referenceSW, referenceBSW; 90 | string cigarSW, cigarBSW; 91 | //for(unsigned int i = 0; i < NUM_ITERATIONS; i++) { 92 | sw.Align(referenceSW, cigarSW, pReference, pQuery); // Align() is declared with the two sequences as const string&; it takes no length arguments 93 | bsw.Align(referenceBSW, cigarBSW, pReference, pQuery, hr); // the banded variant additionally takes the hash region 94 | //} 95 | 96 | // stop timing the algorithm 97 | //bench.Stop(); 98 | 99 | // calculate the alignments per second 100 | //double elapsedWallTime = bench.GetElapsedWallTime(); 101 | //double alignmentsPerSecond = (double)NUM_ITERATIONS / elapsedWallTime; 102 | 103 | // show our results 104 | //printf("%d\t%d\n", al.ReferenceBegin,al.QueryBegin); 105 | 106 | printf("Smith-Waterman\n"); 107 | printf("reference: %s %3u\n", cigarSW.c_str(), referenceSW); 108 | printf("Banded Smith-Waterman\n"); 109 | printf("reference: %s %3u\n", cigarBSW.c_str(), referenceBSW); 110 | /* 111 | printf("Smith-Waterman\n"); 112 | printf("reference: %s %3u %3u\n", swAl.Reference.CData(), swAl.ReferenceBegin, swAl.ReferenceEnd); 113 | printf("query: %s %3u %3u\n", swAl.Query.CData(), swAl.QueryBegin, swAl.QueryEnd); 114 | printf("mismatches: %u\n", 
swAl.NumMismatches); 115 | printf("\n"); 116 | printf("Banded Smith-Waterman\n"); 117 | printf("reference: %s %3u %3u\n", bswAl.Reference.CData(), bswAl.ReferenceBegin, bswAl.ReferenceEnd); 118 | printf("query: %s %3u %3u\n", bswAl.Query.CData(), bswAl.QueryBegin, bswAl.QueryEnd); 119 | printf("mismatches: %u\n", bswAl.NumMismatches); 120 | */ 121 | //printf("alignments/s: %.1f\n\n", alignmentsPerSecond); 122 | 123 | //bench.DisplayTime("BandedSmithWaterman"); 124 | 125 | return 0; 126 | } 127 | -------------------------------------------------------------------------------- /disorder.cpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * libdisorder: A Library for Measuring Byte Stream Entropy 3 | * Copyright (C) 2010 Michael E. Locasto 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation; either version 2 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the: 17 | * Free Software Foundation, Inc. 
18 | * 59 Temple Place, Suite 330 19 | * Boston, MA 02111-1307 USA 20 | * 21 | * $Id$ 22 | **************************************************************************/ 23 | 24 | #include //for log2() 25 | #include //for NULL 26 | #include "disorder.h" 27 | 28 | #if defined(__FreeBSD__) 29 | #define log2(x) (log((x)) * (1./M_LN2)) 30 | #endif 31 | 32 | /** Frequecies for each byte */ 33 | static int m_token_freqs[LIBDO_MAX_BYTES]; //frequency of each token in sample 34 | static float m_token_probs[LIBDO_MAX_BYTES]; //P(each token appearing) 35 | static int m_num_tokens = 0; //actual number of `seen' tokens, max 256 36 | static float m_maxent = 0.0; 37 | static float m_ratio = 0.0; 38 | static int LIBDISORDER_INITIALIZED = 0; 39 | 40 | static void 41 | initialize_lib() 42 | { 43 | int i = 0; 44 | if(1==LIBDISORDER_INITIALIZED) 45 | return; 46 | 47 | m_num_tokens = 0; 48 | 49 | for(i=0;iLIBDO_MAX_BYTES) 150 | { 151 | //report error somehow? 152 | return 0.0; 153 | } 154 | 155 | //iterate through whole m_token_freq array, but only count 156 | //spots that have a registered token (i.e., freq>0) 157 | for(i=0;i 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "SmithWatermanGotoh.h" 10 | #include "BandedSmithWaterman.h" 11 | 12 | using namespace std; 13 | 14 | /* Returns the Reverse Complement of a DNA Sequence, from the alphabet {A,T,C,G,N} */ 15 | string reverseComplement(string read) { 16 | 17 | // Declare the (empty) reverse complement read as a string 18 | string rc_read; 19 | 20 | // Reverse Read 21 | rc_read.assign(read.rbegin(), read.rend()); 22 | 23 | // Complement. Note that not IUPAC compliant. 
Uses the alphabet {A,T,C,G,N} 24 | string::iterator t; 25 | for (t = rc_read.begin(); t != rc_read.end(); ++t) { 26 | switch (*t) { 27 | case 'A': 28 | *t = 'T'; 29 | break; 30 | case 'T': 31 | *t = 'A'; 32 | break; 33 | case 'C': 34 | *t = 'G'; 35 | break; 36 | case 'G': 37 | *t = 'C'; 38 | break; 39 | case 'N': 40 | *t = 'N'; 41 | break; 42 | default: 43 | cout << "Unknown Nucleotide!"; 44 | break; 45 | } 46 | } 47 | 48 | // Return the Read (faster if done through pointers?) 49 | return rc_read; 50 | } 51 | 52 | 53 | void printSummary(void) { 54 | cerr << "usage: smithwaterman [options] " << endl 55 | << endl 56 | << "options:" << endl 57 | << " -m, --match-score the match score (default 10.0)" << endl 58 | << " -n, --mismatch-score the mismatch score (default -9.0)" << endl 59 | << " -g, --gap-open-penalty the gap open penalty (default 15.0)" << endl 60 | << " -z, --entropy-gap-open-penalty enable entropy scaling of the gap open penalty" << endl 61 | << " -e, --gap-extend-penalty the gap extend penalty (default 6.66)" << endl 62 | << " -r, --repeat-gap-extend-penalty use repeat information when generating gap extension penalties" << endl 63 | << " -b, --bandwidth bandwidth to use (default 0, or non-banded algorithm)" << endl 64 | << " -p, --print-alignment print out the alignment" << endl 65 | << " -R, --reverse-complement report the reverse-complement alignment if it scores better" << endl 66 | << endl 67 | << "When called with literal reference and query sequences, smithwaterman" << endl 68 | << "prints the cigar match positional string and the match position for the" << endl 69 | << "query sequence against the reference sequence." 
<< endl; 70 | } 71 | 72 | 73 | int main (int argc, char** argv) { 74 | 75 | int c; 76 | 77 | string reference; 78 | string query; 79 | 80 | int bandwidth = 0; 81 | 82 | float matchScore = 10.0f; 83 | float mismatchScore = -9.0f; 84 | float gapOpenPenalty = 15.0f; 85 | float gapExtendPenalty = 6.66f; 86 | float entropyGapOpenPenalty = 0.0f; 87 | bool useRepeatGapExtendPenalty = false; 88 | float repeatGapExtendPenalty = 1.0f; 89 | 90 | bool print_alignment = false; 91 | bool tryReverseComplement = false; 92 | 93 | while (true) { 94 | static struct option long_options[] = 95 | { 96 | {"help", no_argument, 0, 'h'}, 97 | {"match-score", required_argument, 0, 'm'}, 98 | {"mismatch-score", required_argument, 0, 'n'}, 99 | {"gap-open-penalty", required_argument, 0, 'g'}, 100 | {"entropy-gap-open-penalty", required_argument, 0, 'z'}, 101 | {"gap-extend-penalty", required_argument, 0, 'e'}, 102 | {"repeat-gap-extend-penalty", required_argument, 0, 'r'}, 103 | {"print-alignment", required_argument, 0, 'p'}, 104 | {"bandwidth", required_argument, 0, 'b'}, 105 | {"reverse-complement", no_argument, 0, 'R'}, 106 | {0, 0, 0, 0} 107 | }; 108 | int option_index = 0; 109 | 110 | c = getopt_long (argc, argv, "hpRzm:n:g:r:e:b:r:", 111 | long_options, &option_index); 112 | 113 | if (c == -1) 114 | break; 115 | 116 | switch (c) 117 | { 118 | case 0: 119 | /* If this option set a flag, do nothing else now. 
*/ 120 | if (long_options[option_index].flag != 0) 121 | break; 122 | printf ("option %s", long_options[option_index].name); 123 | if (optarg) 124 | printf (" with arg %s", optarg); 125 | printf ("\n"); 126 | break; 127 | 128 | case 'R': 129 | tryReverseComplement = true; 130 | break; 131 | 132 | case 'm': 133 | matchScore = atof(optarg); 134 | break; 135 | 136 | case 'n': 137 | mismatchScore = atof(optarg); 138 | break; 139 | 140 | case 'g': 141 | gapOpenPenalty = atof(optarg); 142 | break; 143 | 144 | case 'z': 145 | entropyGapOpenPenalty = 1; 146 | break; 147 | 148 | case 'r': 149 | useRepeatGapExtendPenalty = true; 150 | repeatGapExtendPenalty = atof(optarg); 151 | break; 152 | 153 | case 'e': 154 | gapExtendPenalty = atof(optarg); 155 | break; 156 | 157 | case 'b': 158 | bandwidth = atoi(optarg); 159 | break; 160 | 161 | case 'p': 162 | print_alignment = true; 163 | break; 164 | 165 | case 'h': 166 | printSummary(); 167 | exit(0); 168 | break; 169 | 170 | case '?': 171 | /* getopt_long already printed an error message. */ 172 | printSummary(); 173 | exit(1); 174 | break; 175 | 176 | default: 177 | abort (); 178 | } 179 | } 180 | 181 | /* Print any remaining command line arguments (not options). 
*/ 182 | if (optind == argc - 2) { 183 | //cerr << "fasta file: " << argv[optind] << endl; 184 | reference = string(argv[optind]); 185 | ++optind; 186 | query = string(argv[optind]); 187 | } else { 188 | cerr << "please specify a reference and query sequence" << endl 189 | << "execute " << argv[0] << " --help for command-line usage" << endl; 190 | exit(1); 191 | } 192 | 193 | // initialize 194 | 195 | unsigned int referencePos; 196 | string cigar; 197 | 198 | float bestScore = 0; 199 | 200 | bool alignedReverse = false; 201 | 202 | // create a new Smith-Waterman alignment object 203 | if (bandwidth > 0) { 204 | pair< pair, pair > hr; 205 | hr.first.first = 2; 206 | hr.first.second = 18; 207 | hr.second.first = 1; 208 | hr.second.second = 17; 209 | CBandedSmithWaterman bsw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty, bandwidth); 210 | bsw.Align(referencePos, cigar, reference, query, hr); 211 | } else { 212 | CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty); 213 | if (useRepeatGapExtendPenalty) 214 | sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty); 215 | if (entropyGapOpenPenalty > 0) 216 | sw.EnableEntropyGapPenalty(entropyGapOpenPenalty); 217 | sw.Align(referencePos, cigar, reference, query); 218 | bestScore = sw.BestScore; 219 | if (tryReverseComplement) { 220 | string queryRevC = reverseComplement(query); 221 | sw.Align(referencePos, cigar, reference, query); 222 | if (sw.BestScore > bestScore) { 223 | alignedReverse = true; 224 | bestScore = sw.BestScore; 225 | query = queryRevC; 226 | } 227 | } 228 | } 229 | 230 | printf("%s %3u %f %s\n", cigar.c_str(), referencePos, bestScore, (alignedReverse ? 
"-" : "+")); 231 | 232 | // optionally print out the alignment 233 | if (print_alignment) { 234 | int alignmentLength = 0; 235 | int len; 236 | string slen; 237 | vector > cigarData; 238 | for (string::iterator c = cigar.begin(); c != cigar.end(); ++c) { 239 | switch (*c) { 240 | case 'I': 241 | len = atoi(slen.c_str()); 242 | slen.clear(); 243 | cigarData.push_back(make_pair(len, *c)); 244 | break; 245 | case 'D': 246 | len = atoi(slen.c_str()); 247 | alignmentLength += len; 248 | slen.clear(); 249 | cigarData.push_back(make_pair(len, *c)); 250 | break; 251 | case 'M': 252 | len = atoi(slen.c_str()); 253 | alignmentLength += len; 254 | slen.clear(); 255 | cigarData.push_back(make_pair(len, *c)); 256 | break; 257 | case 'S': 258 | len = atoi(slen.c_str()); 259 | slen.clear(); 260 | cigarData.push_back(make_pair(len, *c)); 261 | break; 262 | default: 263 | len = 0; 264 | slen += *c; 265 | break; 266 | } 267 | } 268 | 269 | string gapped_ref = string(reference).substr(referencePos, alignmentLength); 270 | string gapped_query = string(query); 271 | 272 | int refpos = 0; 273 | int readpos = 0; 274 | for (vector >::iterator c = cigarData.begin(); c != cigarData.end(); ++c) { 275 | int len = c->first; 276 | switch (c->second) { 277 | case 'I': 278 | gapped_ref.insert(refpos, string(len, '-')); 279 | readpos += len; 280 | refpos += len; 281 | break; 282 | case 'D': 283 | gapped_query.insert(readpos, string(len, '-')); 284 | refpos += len; 285 | readpos += len; 286 | break; 287 | case 'M': 288 | readpos += len; 289 | refpos += len; 290 | break; 291 | case 'S': 292 | readpos += len; 293 | gapped_ref.insert(refpos, string(len, '*')); 294 | refpos += len; 295 | break; 296 | default: 297 | break; 298 | } 299 | } 300 | 301 | cout << gapped_ref << endl << gapped_query << endl; 302 | } 303 | 304 | return 0; 305 | 306 | } 307 | -------------------------------------------------------------------------------- /examples.txt: 
-------------------------------------------------------------------------------- 1 | TCTGTGACCTCAAAGCCCAACTGTGCATACACAAGCATACACACACACACACACACACACACACACACACACACATACACACACA TCTGTGACCTCAAAGCCCAACTGTGCATACACAAGCATACACACACACACACACACA 2 | AAAGGCTGGGGACCACTGATCTAAATACACCAATAAAAAGAAAAAGATTGTAAGATTGGATTTTAAAAGACCTGACTCTATACTGACCACAAAAAAAAACCCTCACT AAAGGCTGGGGACCACTGATCTAAATACACCAATAAAAAGAAAAAGATTGTAAGATTGGATTTTAAAAGACCTGACTCTATACTGACCACAAAAAAAACC 3 | TGTGACCTCAAAGCCCAACTGTGCATACACAAGCATACACACACACACACACACACACACACACACACACACATACACA TGTGACCTCAAAGCCCAACTGTGCATACACAAGCATACACACACACACGCACACACACACATACACACAC 4 | ATTTTTTAAATCAGGAATAACTTAGACCAGGGTGAACAAACTACTGCTGTCAGGGCAAATCCAGCCCATAGCCTGCTTTTGGAAATAAATTTGTATTAGAACACACACACACACACACACACACACACACACACACACACATACACACATACACACAAATATATCTTCACTAATGTTCTTTTTTTCTTGTTTTTC GGTGAACAAACTACTGCTGTCAGGGCAAATCCAGCCCATAGCCTGCTTTTGGAAATAAATTTGTATTAGAACACACACACACACACACACACATACACACATACACACAAAT 5 | ATGCATGCCTCTCTCTCTCTCTCTGTCGCTCTCTCTCTCTCTCTCTCTCTCTCTCT ATGCATGCCTCTCTCTCTCTCTCTCGCTCTCTCTCTCTCTCTCTCT 6 | ACACCAGCTGGGGTGTGTGTGTGTGTGTGTGTGTGTGCGTGTGTGTGTGTGTGTGTGATTCTCGTGCCT GGGGGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGATTCTCGTGC 7 | GCCTGGGCAACATAGTGAGACCTTGTCTCTACAAATAGTTAAAAAAAAAAAAAAAATTAGCCAGGTGTGGTGGTGCACACATGT GCCTGGGCAACATAGTGAGACCTTGTCTCTACAAATAGTTAAAAAAAAAAAAAATTAGCCAGGTGTGGTGGTGCACACATGT 8 | TTTTCTTTTCTTTTCTTTTCTTTTTTTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCTGGAGTGCAATGGTGTGATCTCGGCTCACGGCAACCTCCG TTTTCTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCCGGAGTGCAATGGTGTGCTCTCGGCTCCCGGCAACCTCCG 9 | AGTAATGGAAATACTGTTTATCATCATTAGAGTTGGGTGTATACTACTACATTACTCTCTCTCTCTCTCTATATATATATATATATATATATATATTTTTTTTT AGTAATGGAAATACTGTTTATCATCATTAGAGTTGGGTGTATACTACTACATTACTCTCTCTCTCTCTCTCTCTATATATATATATATATATTTTTTTTT 10 | AGCCTGGGCGACAGGGCGAGACTCCGTCTCAAATAATAATAATAATAATAATAATAATAATAATAATAATAATAAAATAAAATAAAATAAAATAAAAATACAAAAAT AGCCTGGGCGACAGGGCGAGACTCCGTCTCAAATAATAATAATAATAATAATAATAATAATAAAATAAAATAAAATAAAATAAAAATACAAAAAT 11 | 
CATACACACACACACACACACACACACACACACACACACACACACACACACACCTCATGTAGTGAACTTAATAAATTTAATCTGCAGCTCTGATGATTTCCTTAAGG CATACACACACACACACACACATACACACACACACACACACACACCTCATGTAGTGAACTTAATAAATTTAATCTGCAGCTCTGATGATTTCCTTAAGG 12 | GGGAGGCTGAGGCAGGAGGATCACACCACTGCACTTTAGCCTGAATACTGAGTAACAAAGCAAAACCCTGTCTCTCTTAAAAAAAAAATTGGGGGGAAGGACAAGTCTTTTTTCTTTTCTTTTCTTTTCTTTTCTTTTTTTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCTGGAGTGCAATGGTGTGATC AGTAACAAAGCAAAACCCTGTCTCTCTTAAAAAAAAAATTGGGGGGAAGGACAAGTCTTTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTTTTTTTTGTGATGGAATT 13 | ATTTTTTAAATCAGGAATAACTTAGACCAGGGTGAACAAACTACTGCTGTCAGGGCAAATCCAGCCCATAGCCTGCTTTTGGAAATAAATTTGTATTAGAACACACACACACACACACACACACACACACACACACACACATACACACATACACACAAATATATCTTCACTAATGTTCTTTTTTTCTTGTTTTTC GGTGAACAAACTACTGCTGTCAGGGCAAATCCAGCCCATAGCCTGCTTTTGGAAATAAATTTGTATTAGAACACACACACACACACACACACATACACACATACACACAAAT 14 | CATACACACACACACACACACACACACACACACACACACACACACACACACACCTCATGTAGTGAACTTAATAAATTTAATCTGCAGCTCTGATGATTTCCTTAAGGC CATACACACACACACACACACATACACACACACACACACACACACCTCATGTAGTGAACTTAATAAATTTAATCTGCAGCTCTGATGATTTCCTTAAGGC 15 | TATACAGATTACTTTTATAGCTGATGAGGCAAGTCCTTCTATCATTGTTTCAAAGATTCCATGGCTTTTACTGAACATTTTCTTTTTTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGAGACGGAGTTTC TTTTCTTTTTTCTTTTTTTTTTTTTGAGACGGAGTTTC 16 | GAGAGAGGCGCGCGCGCGCGTGCGCACGCACACACACACACACACACACATAACTAATATATATAAATACATATATATGTGGGTATATATTATTTATTTATG GAGAGAGGCGCGCGCGCGCGCGCGCACACACACACACACACACACACATAACTAATATATATAAATACATATATATGTGGGTATATATTATTTATTTATG 17 | TATATGTATATATATGTGTGTATATATATGTATATATATGTGTGTATATATATGTGTATATATATGTGTGTGTGTGTGTGTGTGTGTGTATATATATATATATATATATATATCAGTTTGCCCTTGCTGGATAACAAA TATATGTATATATATGTGTGTATATATATGTATATATATGTGTGTATATATATGTGTATATATGTGTGTGTGTGTGTATATATATATCAGTTTGCCCTTG 18 | AGCAAACACCTATTGTGCATTTTCTTTTCTTTCTTTCTTTCTTTCTTTTTTTTTTTTGAGACGGAGTTTCGCTCTTGTTGTCCAGGCTAGAGTACGATGG AGCAAACACCTATTGTGCATTTTCTTTTCCTTCTTTCTTTCTTTTTTTTTTTTTTTGAGACGGAGTTTCGCTCTTGTTGTCCAGTCTAGAGTCAGTGG 19 | 
GATTTTGGTATATTGGTCTTACATTTTTTCACTTTGCTGAACTCATTTATTAGTTCTAATTCATGGGGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTCTGTGTGTGTGTGTGTCTGTGTGTT GATTTTGGTATATTGGTCTTACATTTTTTCACTTTGCTGAACTCATTTATTAGTTCTAATTCATGGGGTGTGTGTGTGTGTGTGTGTGTGTCTGTGTGTG 20 | CCAGCCTGGGCGACAGAGTGAGACTCCATCTCAAAACAAACAAACAAACAAACAAACAAACAAACAACTCCACTAAGACTTTTGGGACAACCTGTACCTACCTGACCTGCCTTTCCATCTTTAATGCTCTTCT CCAGCCTGGGCGACAGAGTGAGACTCCATCTCAAAACAAACAAACAAACAAACAAACAAACAACTCCACTAAGACTTTTGGGACAACCTGTACCTATCT 21 | TATATGTATATATATGTGTGTATATATATGTATATATATGTGTGTATATATATGTGTATATATATGTGTGTGTGTGTGTGTGTGTGTGTATATATATATATATATATATATATCAGTTTGCCCTTGCTGGATAACAAA TATATGTATATATATGTGTGTATATATATGTATATATATGTGTGTATATATATGTGTATATATGTGTGTGTGTGTGTATATATATATCAGTTTGCCCTTG 22 | TAATATATATAATATCTAATATATATAATATCTAATATATATAATATATATATAGAGAGAGAGAGAGAGCGAGAGAGAGAGAGAGAGGGAGAGACGGAGTTTCGCTCTTGTTGCCCAGACTGGAGTGCAATGGCGCGATCTC AATATATATATAGAGAGAGAGAGAGCGAGAGAGAGAGAGAGAGAGGGAGAGACGGAGTTTCGCTCTTGTTGCCCAGACTGGAGTGCAATGGCGCGATCTC 23 | GAGAAAGAAACTATTTAGCATTGTGGCTTTCATAATTTCTTTCTTTCTTTTTTTTTTTTTTTTTGAAGCAGAGTCTAGCTCTGTCGCCCAGGCTGGAAAGCAGTGGTGCAAACTC TTAGCATTGTGGCTTTCATAATTTCTTTCTTTTTTTTTTTTTTTTTTTTTAAGCAGAGTCTAGCTCTGTCCCCCCGGCTGGAAAGCAGTGGTGCAACCTC 24 | GGTGACAGAGCAAGACTCCATCTCTCTCTCTCTCTCTCTCTCTCTATATATATATATATATATATACATATATATATGTATATATATGTATATATATGTGTATATATGTGTATATATATGTATATATGTGTATATATATAT GTGACAGAGCAAGACTCCATCTCTCTCTCTCTCTCTCTCTCTATATATATATATATACACACATATATATATGTATATATATGTATATATATATGTGTAT 25 | GAGAGAGGCGCGCGCGCGCGTGCGCACGCACACACACACACACACACACATAACTAATATATATAAATACATATATATGTGGGTATATATTATTTATTTATG GAGAGAGGCGCGCGCGCGCGCGCGCACACACACACACACACACACACATAACTAATATATATAAATACATATATATGTGGGTATATATTATTTATTTATG 26 | ATTTTTTAAATCAGGAATAACTTAGACCAGGGTGAACAAACTACTGCTGTCAGGGCAAATCCAGCCCATAGCCTGCTTTTGGAAATAAATTTGTATTAGAACACACACACACACACACACACACACACACACACACACACATACACACATACACACAAATATATCTTCACTAATGTTCTTTTTTTCTTGTTTTTC GGTGAACAAACTACTGCTGTCAGGGCAAATCCAGCCCATAGCCTGCTTTTGGAAATAAATTTGTATTAGAACACACACACACACACACACACATACACACATACACACAAAT 27 | 
TTTTCTTTTCTTTTCTTTTTTTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCTGGAGTGCAATGGTGTGATCTCGGCTCACGGCAACCTCCGCCTCC TTTTCTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCCGGAGTGCAATGGTGTGCTCTCGGCTCCCGGCAACCTCCGCCTCT 28 | GAGCCGAGATCGTGCCACTGCACTCCAGCCTGGGTGACAGAGCGAGACTCTGTCTCAAAAACAAAAAACAAGCAAACAAAAAAACAAAAAAAAACAAAAAATCCCCAGCA ATCGTGCCACTGCACTCCAGCCTGGGTGACAGAGCGAGACTCTGTCTCAAAAACAAAAAACAAGCAAACAAAAAGCAAAAAAAACAAAAAACCCCCAGCA 29 | GAGCCGAGATCGTGCCACTGCACTCCAGCCTGGGTGACAGAGCGAGACTCTGTCTCAAAAACAAAAAACAAGCAAACAAAAAAACAAAAAAAAACAAAAAATCCCCAGCA ATCGTGCCACTGCACTCCAGCCTGGGTGACAGAGCGAGACTCTGTCTCAAAAACAAAAAACAAGCAAACAAAAAGCAAAAAAAACAAAAAACCCCCAGCA 30 | TAATATATATAATATCTAATATATATAATATCTAATATATATAATATATATATAGAGAGAGAGAGAGAGCGAGAGAGAGAGAGAGAGGGAGAGACGGAGTTTCGCTCTTGTTGCCCAGACTGGAGTG AATATATATATAGAGAGAGAGAGAGCGAGAGAGAGAGAGAGAGAGGGAGAGACGGAGTTTCGCTCTTGTTGCCCAGACTGGAGTG 31 | AAACCCCATCTCTGCTACAAATACAAATACAAATAATAATAATAATAATAATAATAATAATAATAATAATAATAATAATAGCCAGGCATGC AAACCCCATCTCTGCTACAAATACAAATACAAATAATAATAATAATAATAATAATAATAATAATAATAATAATAGCCAGGCATGC 32 | TATATAATATCTAATATATATAATATATATATAGAGAGAGAGAGAGAGCGAGAGAGAGAGAGAGAGGGAGAGACGGAGTTTCGCTCTTGTTGCCCAGACTGGAGTGC TATATAATATCTAATATATATAATATATATATATAGAGAGAGAGAGCGAGAGAGAGAGAGAGAGGGAGAGA 33 | AGAGGCAGTAGATTTAGGGACCACTCAACCTAGTGAGACACCAGCTGGGGTGTGTGTGTGTGTGTGTGTGTGTGCGTGTGTGTGTGTGTGTGTGATTCTCGTGCCTC TAGATTTAGGGAGCACTCAACCTAGTGAGACACCAGCTGGGGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGATT 34 | ACACCAGCTGGGGTGTGTGTGTGTGTGTGTGTGTGTGCGTGTGTGTGTGTGTGTGTGATTCTCGTGCCTC GGGGGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGATTCTCGTGC 35 | CAAGGCTCCAAAATTTTCAGATGAAATGTACAGAAAGAATACAATCTCATTTTAATAGTTTTTTTTTTTTTTAAAAAGGTCCTTGACCAATTCCCCAAGGTCCAT CAAGGCTCCAAAATTTTCAGATGAAATGTACAGAAAGAATACAATCTCATTTTAATAGTTTTTTTTTTTTAAAAAAGGTCCTTGACCAATTCCCCAAGGT 36 | AAACCCCATCTCTGCTACAAATACAAATACAAATAATAATAATAATAATAATAATAATAATAATAATAATAATAATAATAGCCAGGCATGC AAACCCCATCTCTGCTACAAATACAAATACAAATAATAATAATAATAATAATAATAATAATAATAATAATAATAGCCAGGCATGC 37 | 
TACTTCATATGTTCCACGCTCTGGTTGTTTTGTGGGGAGCAAAAGAGAAGTTCCCATTTCTGTTTATGTTAGAAACAAAACAAGACAAAAAACAAATTAACAAGCAAATACATACTGATTATAAAATAATGTGATGTGAAGAAAAAAACTGCTAAGAAAGAATAATGAGGT TTCCCATTTCTGTTTATGTTAAAAACAAAACAAAACAAGACAAAAAACAAATTAACAAGCAAATACATACTGATTATAAAATAATGTGATGTGAAGAAAA 38 | GGTTGTTTTGTGGGGAGCAAAAGAGAAGTTCCCATTTCTGTTTATGTTAGAAACAAAACAAGACAAAAAACAAATTAACAAGCAAATACATACTGATTATAAAATAATGTGATGTGAAGAAAAAAACTGCTAAGAAAGAATAATGAGGTAGGAGGGGGCTACTTGAGATCAGGTGGTCACGCTGAGAGTTGA TTAAAAACAAAACGAAACAAGACAAAAAACAAATTAACAAGCAAATACATACTGATTATAAAATAATGTGATGTGAAGAAAAAAACTGCTAAGAAAGAAT 39 | TTAGAAACAAAACAAGACAAAAAACAAATTAACAAGCAAATACATACTGATTATAAAATAATGTGATGTGAAGAAAAAAACTGCTAAGAAAGAAT TTAAAAACAAAACGAAACAAGACAAAAAACAAATTAACAAGCAAATACATACTGATTATAAAATAATGTGATGTGAAGAAAAAAACTGCTAAGAAAGAAT 40 | ACCTTGAATATGAACACTCTAAATGCTCCACTTAAGAGGCACAGAGTAGCAAGTTGGAAAAGAAAAGAAAAGAAAAGAAAAGAAAACTCATCTGTCTGCAG ACCTTGAATATGAACACTCTAAATGCTCCACTTAAGAGGCACAGAGTAGCAAGTTGGAAAAGAAAAGAAAAGAAAAGAAAACAAAACAAAACTCATCTGT 41 | TTTTCTTTTCTTTTCTTTTCTTTTCTTTTTTTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCTGGAGTGCAATGGTGTGATCTCGGCTCACGGCAAC TTTTCTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCCGGAGTGCAATGGTGTGCTCTCGGCTCCCGGCAAC 42 | AGAAAGAAAGAAAAAGAAAAAGAACCAAGAAGAAAAAATAATCACCGGAGATTCCTCCCCTCCCCTAGAGCTAACTAGGCTAACATTTTGGTATATATCTTTCCAG AGAAAGAAAGAAAAAGAACCAAGAAGAAAAAATAATCACCGGAGATTCCTCCCCTCCCCTAGAGCTGACTAGGCTAACATTTTGGTATATATCTTTCCAG 43 | GACCTTTCTATATATGGTTTTACAATCGGATCAATCGAGATCCCCTCCCCTCCTTAGAGGCCACTAATAAAAAAGAAGAACCAAGAAAAAGAAAAAGAAAGA GACCTTTCTATATATGGTTTTACAATCGGATCAATCGAGATCCCCTCCCCTCCTTAGAGGCCACTAATAAAAAAGAAGAACCAAGAAAAAGAAAAAGA 44 | AGAAAGAAAGAAAAAGAAAAAGAACCAAGAAGAAAAAATAATCACCGGAGATTCCTCCCCTCCCCTAGAGCTAACTAGGCTAACATTTTGGTATATATCTTTCCAG AGAAAGAAAGAAAAAGAACCAAGAAGAAAAAATAATCACCGGAGATTCCTCCCCTCCCCTAGAGCTGACTAGGCTAACATTTTGGTATATATCTTTCCAG 45 | 
ATTTTTTAAATCAGGAATAACTTAGACCAGGGTGAACAAACTACTGCTGTCAGGGCAAATCCAGCCCATAGCCTGCTTTTGGAAATAAATTTGTATTAGAACACACACACACACACACACACACACACACACACACACACATACACACATACACACAAATATATCTTCACTAATGTTCTTTTTTTCTTGTTTTTC GGTGAACAAACTACTGCTGTCAGGGCAAATCCAGCCCATAGCCTGCTTTTGGAAATAAATTTGTATTAGAACACACACACACACACACACACATACACACATACACACAAAT 46 | GGTTGTTTTGTGGGGAGCAAAAGAGAAGTTCCCATTTCTGTTTATGTTAGAAACAAAACAAGACAAAAAACAAATTAACAAGCAAATACATACTGATTATAAAATAATGTGATGTGAAGAAAAAAACTGCTAAGAAAGAATAATGAGGTAGGAGGGGGCTACTTGAGATCAGGTGGTCACGCTGAGAGTTGA TTAAAAACAAAACGAAACAAGACAAAAAACAAATTAACAAGCAAATACATACTGATTATAAAATAATGTGATGTGAAGAAAAAAACTGCTAAGAAAGAAT 47 | TAAGAAAGAATCGTCAAAAAAAGAAGTGTAGTGTAATAAAATATTAGTCATACATAAACGAACAATTAAACAAAAAACAGAACAAAGCACACACACACACACA TAAGAAAGAATCGTCAAAAAAAGAAGTGTAGTGTAATAAAATATTAGTCATACATAAACGAACAATTAAACAAAAAACAGAACAAAGCACACACACACACACA 48 | TAATATATATAATATCTAATATATATAATATCTAATATATATAATATATATATAGAGAGAGAGAGAGAGCGAGAGAGAGAGAGAGAGGGAGAGACGGAGTTTCGCTCTTGTTGCCCAGACTGGAGTGCAATGGCGCGATCTC AATATATATATAGAGAGAGAGAGAGCGAGAGAGAGAGAGAGAGAGGGAGAGACGGAGTTTCGCTCTTGTTGCCCAGACTGGAGTGCAATGGCGCGATCTC 49 | AAAGAAAGAAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAAAAAGAACCAAGAAGAAAAAATAATCACCGGAGATTCCTCCCCTCCCCTAGAGCTAACTAGGCTAACATTTTGGTATATATCTTTCCAGTCCGGATCCTGTGTGACTGAGTGTGTATATGCATATGTATTATTTTCAAC AAGAAAGAAAGAAAAAGAACCAAGAAGAAAAAATAATCACCCGAGATTCCTCCCCTCCCCTAGAGCTAACTAGGCTAACATTTTGGTATATATCTTTCCA 50 | GAATGCACATTTGGTCCTTCATTAACTCATTTACTCACAATTTTTTTTTTTTTTTTTTTTTTGAGACGGAGTTTCACTCTTGTCGCCCAGGCTGGAGTGCAATGGTGTGA ACAATTTTTTTTTTTTTTTTTTTGAGACGGAGTTTCACTCTTGTCGCCCAGGCTGGAGTGCAATGGTGTGA 51 | CCACTGCACTCCAGCCTGGGCGACAGGGCGAGACTCCGTCTCAAATAATAATAATAATAATAATAATAATAATAATAATAATAATAAAATAAAATAAAA CCACTGCACTCCAGCCTGGGCGACAGGGCGAGACTCCGTCTCAAATAATAATAATAATAATAATAATAATAATAAAATAAAATAAAATAAAATAAAA 52 | ATTCCAATACTATTCAATTGTTCCACAGCACAGAAAAGAAAAAATTCTAAATTCTTTCTATGGAACAAAAAAATCATCAATGACACCTGACCAAAGATGGCACACACACACACACACACACACACACACACACACACACACTACAGTCCAACCCCACTAATGAATACAAAAATCCTAACACTAGCA 
GGGACACACACACACACACACACACACACACACTACAGTCCAACCCCACTAATGAATACAAAAATCCTAACACTAGCA 53 | GGCAGGAGAATTGCTTGAACCCAGGGGGCAGAGGTGGCAGTGAGCCGGGATCATGCCACTTCACTCCAGCCTGGGTGAAAGAGCAAAACTCTGTCTCAAAAAAAAAAAAAAAAAAAGACAGCTGCAACAAATGTCAAGTTCTGTGTGTTTTCTTTTCTTTTCTTTTTTTTCTATTTAATTAATTTATT AAAAAAAAAAAAAAAAAGACAGCTGCAACAAATGTCAAGTTCTGTGTGTTTTCTTTTCTTTTCTTTTTTTTCTATTTAATTAATTTATT 54 | GGCAGGAGAATTGCTTGAACCCAGGGGGCAGAGGTGGCAGTGAGCCGGGATCATGCCACTTCACTCCAGCCTGGGTGAAAGAGCAAAACTCTGTCTCAAAAAAAAAAAAAAAAAAAGACAGCTGCAACAAATGTCAAGTTCTGTGTGTTTTCTTTTCTTTTCTTTTTTTTCTATTTAATTAATTTATT CAAAAAAAAAAAAAAAAAGACAGCTGCAACAAATGTCAAGTTCTGTGTGTTTTCTTTTCTTTTCTTTTTTTTCTATTTAATTAATTTATT 55 | ATTGCTTGAGCCCAGGAGTTCAGGGCTGCAATGAGCTATGATCATGCCACTGCACTCCAGCCTGGGCAACAGAGTGAGATCCTGTCTCTAAAATATGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTATGTGTGTGTGTCTGTACATATACGTATATATATATGTGTGTATATATACAT AATATGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTATGTGTGTGTGTCTGTACATATACGTATATATATATGTGTGTATATATACAT 56 | GCCCACCACGCTGCAGATGAATATGCCACAATGTCAACAGTGTTTAGGCTCATATATATATATATATATATATATATATATATATATATATATATATATATTTTAAGACAGTCTTGCTCTGTCACCC TCTCATATATATATATATATATATATATATTTTAAGACAGTCTTGCTCTGTCACCC 57 | GCCCACCACGCTGCAGATGAATATGCCACAATGTCAACAGTGTTTAGGCTCATATATATATATATATATATATATATATATATATATATATATATATATATTTTAAGACAGTCTTGCTCTGTCACCC TATATATATATATATATATATATATATATATATATTGAGACAGTCTCGCTCTGTCACCC 58 | ATTCCAATACTATTCAATTGTTCCACAGCACAGAAAAGAAAAAATTCTAAATTCTTTCTATGGAACAAAAAAATCATCAATGACACCTGACCAAAGATGGCACACACACACACACACACACACACACACACACACACACACTACAGTCCAACCCCACTAATGAATACAAAAATCCTAACACTAGCA GGGACACACACACACACACACACACACACACACTACAGTCCAACCCCACTAATGAATACAAAAATCCTAACACTAGCA 59 | GCCCACCACGCTGCAGATGAATATGCCACAATGTCAACAGTGTTTAGGCTCATATATATATATATATATATATATATATATATATATATATATATATATATTTTAAGACAGTCTTGCTCTGTCACCC TATATATATATATATATATATATATATATATATATTGAGACAGTCTCGCTCTGTCACCC 60 | 
CACAAGAACTGCAATTCCTAGGCAACTGCTAGTGCTGTGCTGGGCTCAGAGGCAGTAGATTTAGGGACCACTCAACCTAGTGAGACACCAGCTGGGGTGTGTGTGTGTGTGTGTGTGTGTGCGTGTGTGTGTGTGTGTGTGATTCTCGTGCCTCAGCCTCCCAAGTAGCTGGTGATGGCAGTGGCAGCCCATCTGGAGTGGACGCTGCCATCAAGCCAGCTGCAGCAGGGAGGGACAGCTGGGGCTGCACAT GGTCAGAGAGAGTATATAGAGAGAGCACACACCAGAGAGAGACACGCTGGGGGGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGATTCTCGTGC 61 | CGCCTGTAGTCTCAGCTATTAATATTTGGGAGGCTGAGGCAGGAGGATCACACCACTGCACTTTAGCCTGAATACTGAGTAACAAAGCAAAACCCTGTCTCTCTTAAAAAAAAAATTGGGGGGAAGGACAAGTCTTTTTTCTTTTCTTTTCTTTTCTTTTCTTTTTTTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCTGGAGTGCAATGGTGTGATCTCGGCTCACGGCAACCTCCGCCTCCTGGGTTCAAGCAATTCTGCTTCAGCCTCCCGAGTGGCTGGGATTATAGT CTCTTAAAAAAAAAATTGGGGGGAAGGACAAGTCTTTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTTTTTTTTGGGTTGGAATTTTACTCTTGTTG 62 | CTCGGCTCACTGCAAGCTCCACCTCCCGGGTTCACGCCATTCTCCTGCCTCAGTCTCCCAAGTAGCTGGGACTACAGGTGCCCACCACCACGTCTGGCTAATTTTTTTGTATTTTTAGTAGAGATGGGGATTCACCGTGTTAGCCAGGATGGTCTCGATCTCCTGACTTTGTGATCCACCCGCCTCAGCCTCCCAAAGCCTCCTTCACTTTTCTTTATTAGCCTCAACCCCATGATTCACCACTCCAAGTACTCCCTTGCCAGCATCCTCAAATCCCAATACCATTTTTAAAATTTTTTAA ATTTTTTTGTATTTTTAGTAGAGATGGGGATTCACCGTGTTAGCCAGGATGGTCTCGATCTCCTGACTTTGTGATCCGCCCGCCTCAGCCTCCCAAAGCC 63 | TTGGGGGGAAGGACAAGTCTTTTTTCTTTTCTTTTCTTTTCTTTTCTTTTTTTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCTGGAGTGCAATG TTCTTTTTTTTTTTGAGATGGAATTTCACTCTTGTTGCCCAGGCCGGAGTGCAATG 64 | GGAGGCAGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGTGACAGAGCAAGACTCCATCTCTCTCTCTCTCTCTCTCTCTCTATATATATATATATATATATACATATATATATGTATATATATGTATATATATGTGTATATATGTGTATATATATGTATATATGTGTATATATATATGTAT CCTGTGTGACAGAGCAAGACTCCATCTCTCTCTCTCTCTCTCTCTCTCTATATATATATATATATATACATATATATATATATGTATATATATGTATATA 65 | TAATTTACAACAACACGTAAGTTGTTACTCTGTAAACCCTTGCCTCCCCCCCACCCCCCACCCAATTGGGTCTTTTTTTTTTTTCTCTCTCTCCATGCTTCTGCAGTGACTCTTAAGTAGCATTTTTAAAAACTTC CCACCCCCCACCCAATTGGGTCTTTTTTTTTTTTCTCTCTCTCCATGCTTCTGCAGTGACTCTTAAGTAGCATTTTTTAAAACTTCTATTTATTTTAAAA 66 | 
GTTCACACTTTTGGCCATACCCAGGGTCAGCCATGAAGATTGTTCTCAGAATGTTTTCTTTCTTCCTTCCTTTCTCTTTCTTTTCTTTCTTTCTTCCTTTCTTTCTTTCCTTTCTTTTTCTTTCTTCTCTTTCTTTTTTTCTTTTCTTCTTTCTCTCTCTTTCTTTCTCTCTCTCTCTCTCCTTCCTTCCTTCCTTCT ATGTTTTCTTTCTTCCTTCCTTTCTCTTTCTTTTCTTTCTTTCTTCCTTTCTTTCTTTCCTTTCTTTTTCTTTCTTCTCTTTCTTTCTTTTTTCTTTTCT 67 | GTTCACACTTTTGGCCATACCCAGGGTCAGCCATGAAGATTGTTCTCAGAATGTTTTCTTTCTTCCTTCCTTTCTCTTTCTTTTCTTTCTTTCTTCCTTTCTTTCTTTCCTTTCTTTTTCTTTCTTCTCTTTCTTTTTTTCTTTTCTTCTTTCTCTCTCTTTCTTTCTCTCTCTCTCTCTCCTTCCTTCCTTCCTTCT ATGTTTTCTTTCTTCCTTCCTTTCTCTTTCTTTTCTTTCTTTCTTCCTTTCTTTCTTTCCTTTCTTTTTCTTTCTTCTCTTTCTTTCTTTTTTCTTTTCT 68 | GGAGGCAGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGTGACAGAGCAAGACTCCATCTCTCTCTCTCTCTCTCTCTCTCTATATATATATATATATATATACATATATATATGTATATATATGTATATATATGTGTATATATGTGTATATATATGTATATATGTGTATATATATATGTAT CCTGTGTGACAGAGCAAGACTCCATCTCTCTCTCTCTCTCTCTCTCTCTATATATATATATATATATACATATATATATATATGTATATATATGTATATA 69 | CTCCCACTGGGGTAATCCATCTTTTCTTTTAATTATTTTCCTTTTGAGATATATTAAAAATGCAAAAAAAAAAATTTTTATTTTTTTGAGACGGAGTCTCGCTCTATCACCTAGGCTGGAGTGCAGTGGCATGATCTTGGCTCACTGCAACCTCCGCCTCCTGGGTTCAACTGATTCTCCTGCCTCAGCCTCCTGAATAG ATATTAAAAATGCAAAAAAAAATTTTTTTTATTTTTTTGAGACGGAGTCTCGCTCTATCGCCTAGGCTGGAGTGCAGTGGCATGATCTTGGCTCACTGCA 70 | GGCAGGAGAATTGCTTGAACCCAGGGGGCAGAGGTGGCAGTGAGCCGGGATCATGCCACTTCACTCCAGCCTGGGTGAAAGAGCAAAACTCTGTCTCAAAAAAAAAAAAAAAAAAAGACAGCTGCAACAAATGTCAAGTTCTGTGTGTTTTCTTTTCTTTTCTTTTTTTTCTATTTAATTAATTTATT CAAAAAAAAAAAAAAAAAGACAGCTGCAACAAATGTCAAGTTCTGTGTGTTTTCTTTTCTTTTCTTTTTTTTCTATTTAATTAATTTATT 71 | CAAGATCGTGCCGCTGCACTCCAGCCCAGGTGACAGAGCGAGACTTCATCTCAAAAAAAAAAAGGGCGCCAAACATCTACTGTGTACCCACAAAAATTAAAATTATAAAAAGACGGCATCAGCAATCCCAGGAGGTGATGTGTCCCTGGTTGGTGTACCTCAGGAGTTGCTGCATTTGCCTCACATCACCATGTGAGATAA CAAAAAAAAAAAAGGGCGCCAAACATCTACTGTGTACCCACAAAAATTAAAATTATAAAAAGACGGCATCAGCAATCCCAGGAGGTGATGTGTCCCTGGT 72 | 
CCTCTTAGGTGGTCACATCCTAGAGAGGGGGAAATTACATCAGAAAAGGACCAATGCCAAATTACAGCAACAAAGGGAAAGTAATCCTGGAAGCTGATTTAAGCTATGTGACTGTGTCTTCAATTAAAATATTCAGTCCCTTCCCCTCCCCCTCCCCCTCCCTTCCGTCTCCCTCTGTTGCTGAGGCTGGACTG CCAATGCCAAATTACAGCAACAAAGGGAAAGTAATCCTGGAAGCTGATTTAAGCTATGTGACTGTGTCTTCAATTAAAATATTCAGTCCCTCCCCCTCCC 73 | CCAAGTGACCCTTTCACCTCAGCTTCCCAAGTAGCTGGGATTACAGGTGCACACCAACTGTGCTTTGCAGTTTTGTTTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTATTTTGTTTTTTTTACAAGCCCACAACATCATAGC TCCTCAGCGTCCCAAGTAGCTGGGAGTAGAGATGCACACCAACTGTGCTTTGCAGTTTTGTTTGTGTGTGTGTGTGTGTGTGTGTGTGTATTTTGTTTTT 74 | CAAGATTTGCCACTGCACTCCAGCCTGGGTGACAGAGTGAGACTGTATCTCAAAAAAAAAAAATAAATAAATAAAGAGAATAAGGCATTTGATATAGTTTCTTTGCATAACAAAATACAAATAAAACCACATTCTATTTCATCTAAACACTTTCTCCCAAGTCATTCTCTCTTAGCCTCA GGACAGGGGGAGACTGTATCTCAAAAAAAAAAAAAAAATAAATAAAGAGAATAAGGCATTTGATATAGTTTCTTTGCATAACAAAATACAAATAAAACCA 75 | GATCATGCCACTGCACTCCAGCCTGGGCAAAAGAGCGAGACTCTGTCTCAAAAAAAAAAAAAAAATCCAGAAAGAATTGGCACACCTATGTTGTTAAGTTTTCCAATCCAAGAATACTGTATTCCTTATCATTTTT AAAAAAAAAAAAAAATCCAGAAAGAATTGGCACCCTTTTGTTGTAAAGTTTTCCATTCCAAGAAAACTGGATTCCTTCTTTTTTTTTTCTTTTTGTTTCT 76 | TCCCTCCCTTCCTCCCTTTCTCTCCCTCTCCCTCTCTTTCTTTCTCTCTCTCTCTTTCTCCCCTTCTTTCTTTCTTTCTCTCTCTCTTTTTCTTTCTTTCTTTCTTTCTTTC TCCCTCCCTTCCTCCCTTTCTCTCCCTCTCCCTCTCTTTCTTTCTCTCTCTCTCTTTCTCCCCTTCTTTTTTTCTCTCTCTCTCTTTTTTTTTCTTTCTC 77 | -------------------------------------------------------------------------------- /libdisorder.LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc. 5 | 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. 
By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Library General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. 
If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. 
You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 
113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 
165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. 
If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 
292 | 293 | <one line to give the program's name and a brief idea of what it does.> 294 | Copyright (C) <year> <name of author> 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License 307 | along with this program; if not, write to the Free Software 308 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Library General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /BandedSmithWaterman.cpp: -------------------------------------------------------------------------------- 1 | #include "BandedSmithWaterman.h" 2 | 3 | // define our static constants 4 | const float CBandedSmithWaterman::FLOAT_NEGATIVE_INFINITY = (float)-1e+30; 5 | 6 | const DirectionType CBandedSmithWaterman::Directions_STOP = 0; 7 | const DirectionType CBandedSmithWaterman::Directions_LEFT = 1; 8 | const DirectionType CBandedSmithWaterman::Directions_DIAGONAL = 2; 9 | const DirectionType CBandedSmithWaterman::Directions_UP = 3; 10 | 11 | const PositionType CBandedSmithWaterman::Position_REF_AND_QUERY_ZERO = 0; 12 | const PositionType CBandedSmithWaterman::Position_REF_ZERO = 1; 13 | const PositionType CBandedSmithWaterman::Position_QUERY_ZERO = 2; 14 | const PositionType CBandedSmithWaterman::Position_REF_AND_QUERO_NONZERO = 3; 15 | 16 | // constructor 17 | CBandedSmithWaterman::CBandedSmithWaterman(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty, unsigned int bandWidth) 18 | : mCurrentMatrixSize(0) 19 | , mCurrentAnchorSize(0) 20 | , mCurrentAQSumSize(0) 21 | , mBandwidth(bandWidth) 22 | , mPointers(NULL) 23 | , mMatchScore(matchScore) 24 | , mMismatchScore(mismatchScore) 25 | , mGapOpenPenalty(gapOpenPenalty) 26 | , mGapExtendPenalty(gapExtendPenalty) 27 | , mAnchorGapScores(NULL) 28 | , mBestScores(NULL) 29 | , mReversedAnchor(NULL) 30 | , mReversedQuery(NULL) 31 | , mUseHomoPolymerGapOpenPenalty(false) 32 | { 33 | 
CreateScoringMatrix(); 34 | 35 | //if((bandWidth % 2) != 1) { 36 | //printf("ERROR: The bandwidth must be an odd number.\n"); 37 | //exit(1); 38 | //} 39 | 40 | try { 41 | mBestScores = new float[bandWidth + 2]; 42 | mAnchorGapScores = new float[bandWidth + 2]; 43 | } catch(bad_alloc) { 44 | printf("ERROR: Unable to allocate enough memory for the banded Smith-Waterman algorithm.\n"); 45 | exit(1); 46 | } 47 | } 48 | 49 | // destructor 50 | CBandedSmithWaterman::~CBandedSmithWaterman(void) { 51 | if(mPointers) delete [] mPointers; 52 | if(mAnchorGapScores) delete [] mAnchorGapScores; 53 | if(mBestScores) delete [] mBestScores; 54 | if(mReversedAnchor) delete [] mReversedAnchor; 55 | if(mReversedQuery) delete [] mReversedQuery; 56 | } 57 | 58 | // aligns the query sequence to the anchor using the Smith Waterman Gotoh algorithm 59 | void CBandedSmithWaterman::Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2, pair< pair, pair >& hr) { 60 | 61 | 62 | 63 | unsigned int rowStart = min(hr.first.first, (unsigned int)hr.second.first); 64 | hr.first.first -= rowStart; 65 | hr.second.first -= rowStart; 66 | 67 | //bool isLegalBandWidth = (s2.length() - hr.QueryBegin) > (mBandwidth / 2); 68 | // isLegalBandWidth = isLegalBandWidth && ((s1.length() - hr.Begin) > (mBandwidth / 2)); 69 | 70 | 71 | 72 | // check the lengths of the input sequences 73 | //if( (s1.length() <= 0) || (s2.length() <= 0) || (s1.length() < s2.length()) ) { 74 | // printf("ERROR: An unexpected sequence length was encountered during pairwise alignment.\n"); 75 | // printf("Sequence lengths are listed as following:\n"); 76 | // printf("1. Reference length: %u\n2. Query length: %u\n", s1.length(), s2.length()); 77 | //printf("3. Hash region in reference:%4u-%4u\n", hr.Begin + rowStart, hr.End); 78 | //printf("4. 
Hash region in query: %4u-%4u\n", hr.QueryBegin + rowStart, hr.QueryEnd); 79 | // exit(1); 80 | //} 81 | 82 | 83 | // determine the hash region type 84 | unsigned int rowOffset; 85 | unsigned int columnOffset; 86 | PositionType positionType; 87 | 88 | if(hr.first.first == 0) { 89 | if(hr.second.first == 0) { 90 | rowOffset = 1; 91 | columnOffset = (mBandwidth / 2) + 1; 92 | positionType = Position_REF_AND_QUERY_ZERO; 93 | } else { 94 | rowOffset = 1 - hr.second.first; 95 | columnOffset = (mBandwidth / 2) + 1 + hr.second.first; 96 | positionType = Position_REF_ZERO; 97 | } 98 | } else { 99 | if(hr.second.first == 0) { 100 | rowOffset = 1; 101 | columnOffset = (mBandwidth / 2) + 1 - hr.first.first; 102 | positionType = Position_QUERY_ZERO; 103 | } else { 104 | rowOffset = 1 - hr.second.first; 105 | columnOffset = (mBandwidth / 2) + 1 + hr.second.first - hr.first.first; 106 | positionType = Position_REF_AND_QUERO_NONZERO; 107 | } 108 | } 109 | 110 | // ========================= 111 | // Reinitialize the matrices 112 | // ========================= 113 | 114 | ReinitializeMatrices(positionType, s1.length(), s2.length(), hr); 115 | 116 | // ======================================= 117 | // Banded Smith-Waterman forward algorithm 118 | // ======================================= 119 | 120 | unsigned int bestColumn = 0; 121 | unsigned int bestRow = 0; 122 | float bestScore = FLOAT_NEGATIVE_INFINITY; 123 | float currentQueryGapScore; 124 | 125 | // rowNum and column indicate the row and column numbers in the Smith-Waterman matrix respectively 126 | unsigned int rowNum = hr.second.first; 127 | unsigned int columnNum = hr.first.first; 128 | 129 | // indicates how many rows including blank elements in the Banded SmithWaterman 130 | int numBlankElements = (mBandwidth / 2) - columnNum; 131 | 132 | //cout << numBlankElements << endl; 133 | // upper triangle matrix in Banded Smith-Waterman 134 | for( ; numBlankElements > 0; numBlankElements--, rowNum++){ 135 | // in the upper 
triangle matrix, we always start at the 0th column 136 | columnNum = 0; 137 | 138 | // columnEnd indicates how many columns which should be dealt with in the current row 139 | unsigned int columnEnd = min((mBandwidth - numBlankElements), ((unsigned int) s1.length() - columnNum + 1) ); 140 | currentQueryGapScore = FLOAT_NEGATIVE_INFINITY; 141 | for( unsigned int j = 0; j < columnEnd; j++){ 142 | float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset); 143 | //cout << s1[columnNum] << s2[rowNum] << score << endl; 144 | UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score); 145 | columnNum++; 146 | } 147 | 148 | // replace the columnNum to the middle column in the Smith-Waterman matrix 149 | columnNum = columnNum - (mBandwidth / 2); 150 | } 151 | // complete matrix in Banded Smith-Waterman 152 | unsigned int completeNum = min((s1.length() - columnNum - (mBandwidth / 2)), (s2.length() - rowNum)); 153 | //cout << completeNum << endl; 154 | for(unsigned int i = 0; i < completeNum; i++, rowNum++){ 155 | columnNum = columnNum - (mBandwidth / 2); 156 | 157 | // there are mBandwidth columns which should be dealt with in each row 158 | currentQueryGapScore = FLOAT_NEGATIVE_INFINITY; 159 | 160 | for(unsigned int j = 0; j < mBandwidth; j++){ 161 | float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset); 162 | UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score); 163 | //cout << s1[columnNum] << s2[rowNum] << score << endl; 164 | columnNum++; 165 | } 166 | 167 | // replace the columnNum to the middle column in the Smith-Waterman matrix 168 | // because mBandwidth is an odd number, everytime the following equation shifts a column (pluses 1). 
169 | columnNum = columnNum - (mBandwidth / 2); 170 | } 171 | 172 | // lower triangle matrix 173 | numBlankElements = min(mBandwidth, ((unsigned int) s2.length() - rowNum)); 174 | columnNum = columnNum - (mBandwidth / 2); 175 | for(unsigned int i = 0; numBlankElements > 0; i++, rowNum++, numBlankElements--) { 176 | 177 | mBestScores[ mBandwidth - i ] = FLOAT_NEGATIVE_INFINITY;; 178 | // columnEnd indicates how many columns which should be dealt with 179 | currentQueryGapScore = FLOAT_NEGATIVE_INFINITY; 180 | 181 | for( unsigned int j = columnNum; j < s1.length(); j++){ 182 | float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset); 183 | UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score); 184 | //cout << s1[columnNum] << s2[rowNum] << score << endl; 185 | columnNum++; 186 | } 187 | 188 | // replace the columnNum to the middle column in the Smith-Waterman matrix 189 | columnNum = columnNum - mBandwidth + i + 2; 190 | } 191 | 192 | // ========================================= 193 | // Banded Smith-Waterman backtrace algorithm 194 | // ========================================= 195 | 196 | Traceback(referenceAl, cigarAl, s1, s2, bestRow, bestColumn, rowOffset, columnOffset); 197 | 198 | } 199 | 200 | // calculates the score during the forward algorithm 201 | float CBandedSmithWaterman::CalculateScore(const string& s1, const string& s2, const unsigned int rowNum, const unsigned int columnNum, float& currentQueryGapScore, const unsigned int rowOffset, const unsigned int columnOffset) { 202 | 203 | // initialize 204 | const unsigned int row = rowNum + rowOffset; 205 | const unsigned int column = columnOffset - rowNum + columnNum; 206 | const unsigned int position = row * (mBandwidth + 2) + column; 207 | 208 | // retrieve the similarity scores 209 | const float similarityScore = mScoringMatrix[s1[columnNum] - 'A'][s2[rowNum] - 'A']; 210 | const float totalSimilarityScore = mBestScores[column] + 
similarityScore; 211 | 212 | // ================================ 213 | // open a gap in the query sequence 214 | // ================================ 215 | 216 | float queryGapExtendScore = currentQueryGapScore - mGapExtendPenalty; 217 | float queryGapOpenScore = mBestScores[column - 1] - mGapOpenPenalty; 218 | 219 | // compute the homo-polymer gap score if enabled 220 | if(mUseHomoPolymerGapOpenPenalty) 221 | if((rowNum > 1) && (s2[rowNum] == s2[rowNum - 1])) 222 | queryGapOpenScore = mBestScores[column - 1] - mHomoPolymerGapOpenPenalty; 223 | 224 | if(queryGapExtendScore > queryGapOpenScore) { 225 | currentQueryGapScore = queryGapExtendScore; 226 | mPointers[position].mSizeOfHorizontalGaps = mPointers[position - 1].mSizeOfHorizontalGaps + 1; 227 | } else currentQueryGapScore = queryGapOpenScore; 228 | 229 | 230 | // ==================================== 231 | // open a gap in the reference sequence 232 | // ==================================== 233 | 234 | 235 | float anchorGapExtendScore = mAnchorGapScores[column + 1] - mGapExtendPenalty; 236 | float anchorGapOpenScore = mBestScores[column + 1] - mGapOpenPenalty; 237 | 238 | // compute the homo-polymer gap score if enabled 239 | if(mUseHomoPolymerGapOpenPenalty) 240 | if((columnNum > 1) && (s1[columnNum] == s1[columnNum - 1])) 241 | anchorGapOpenScore = mBestScores[column + 1] - mHomoPolymerGapOpenPenalty; 242 | 243 | if(anchorGapExtendScore > anchorGapOpenScore) { 244 | mAnchorGapScores[column] = anchorGapExtendScore; 245 | mPointers[position].mSizeOfVerticalGaps = mPointers[position - mBandwidth - 1].mSizeOfVerticalGaps + 1; 246 | } else mAnchorGapScores[column] = anchorGapOpenScore; 247 | 248 | // ====================================== 249 | // calculate the best score and direction 250 | // ====================================== 251 | 252 | //mBestScores[column] = MaxFloats(totalSimilarityScore, mAnchorGapScores[column], currentQueryGapScore); 253 | mBestScores[column] = MaxFloats(totalSimilarityScore, 
currentQueryGapScore, mAnchorGapScores[column]); 254 | 255 | // determine the traceback direction 256 | // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495) 257 | if(mBestScores[column] == 0) mPointers[position].Direction = Directions_STOP; 258 | else if(mBestScores[column] == totalSimilarityScore) mPointers[position].Direction = Directions_UP; 259 | else if(mBestScores[column] == currentQueryGapScore) mPointers[position].Direction = Directions_LEFT; 260 | else mPointers[position].Direction = Directions_DIAGONAL; 261 | 262 | return mBestScores[column]; 263 | } 264 | 265 | // corrects the homopolymer gap order for forward alignments 266 | void CBandedSmithWaterman::CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches) { 267 | 268 | 269 | // this is only required for alignments with mismatches 270 | //if(al.NumMismatches == 0) return; 271 | if ( numMismatches == 0 ) return; 272 | 273 | // localize the alignment data 274 | //char* pReference = al.Reference.Data(); 275 | //char* pQuery = al.Query.Data(); 276 | //const unsigned int numBases = al.Reference.Length(); 277 | char* pReference = mReversedAnchor; 278 | char* pQuery = mReversedQuery; 279 | 280 | // initialize 281 | bool hasReferenceGap = false, hasQueryGap = false; 282 | char* pNonGapSeq = NULL; 283 | char* pGapSeq = NULL; 284 | char nonGapBase = 'J'; 285 | 286 | // identify gapped regions 287 | for(unsigned int i = 0; i < numBases; i++) { 288 | 289 | // check if the current position is gapped 290 | hasReferenceGap = false; 291 | hasQueryGap = false; 292 | 293 | if(pReference[i] == GAP) { 294 | hasReferenceGap = true; 295 | pNonGapSeq = pQuery; 296 | pGapSeq = pReference; 297 | nonGapBase = pQuery[i]; 298 | } 299 | 300 | if(pQuery[i] == GAP) { 301 | hasQueryGap = true; 302 | pNonGapSeq = pReference; 303 | pGapSeq = pQuery; 304 | nonGapBase = pReference[i]; 305 | } 306 | 307 | // continue if we don't have any gaps 308 | if(!hasReferenceGap && 
!hasQueryGap) continue; 309 | 310 | // sanity check 311 | if(hasReferenceGap && hasQueryGap) { 312 | printf("ERROR: Found a gap in both the reference sequence and query sequence.\n"); 313 | exit(1); 314 | } 315 | 316 | // find the non-gapped length (forward) 317 | unsigned short numGappedBases = 0; 318 | unsigned short nonGapLength = 0; 319 | unsigned short testPos = i; 320 | while(testPos < numBases) { 321 | 322 | const char gs = pGapSeq[testPos]; 323 | const char ngs = pNonGapSeq[testPos]; 324 | 325 | bool isPartofHomopolymer = false; 326 | if(((gs == nonGapBase) || (gs == GAP)) && (ngs == nonGapBase)) isPartofHomopolymer = true; 327 | if(!isPartofHomopolymer) break; 328 | 329 | if(gs == GAP) numGappedBases++; 330 | else nonGapLength++; 331 | testPos++; 332 | } 333 | 334 | // fix the gap order 335 | if(numGappedBases != 0) { 336 | char* pCurrentSequence = pGapSeq + i; 337 | memset(pCurrentSequence, nonGapBase, nonGapLength); 338 | pCurrentSequence += nonGapLength; 339 | memset(pCurrentSequence, GAP, numGappedBases); 340 | } 341 | 342 | // increment 343 | i += numGappedBases + nonGapLength - 1; 344 | } 345 | } 346 | 347 | // creates a simple scoring matrix to align the nucleotides and the ambiguity code N 348 | void CBandedSmithWaterman::CreateScoringMatrix(void) { 349 | 350 | unsigned int nIndex = 13; 351 | unsigned int xIndex = 23; 352 | 353 | // define the N score to be 1/4 of the span between mismatch and match 354 | //const short nScore = mMismatchScore + (short)(((mMatchScore - mMismatchScore) / 4.0) + 0.5); 355 | 356 | // calculate the scoring matrix 357 | for(unsigned char i = 0; i < MOSAIK_NUM_NUCLEOTIDES; i++) { 358 | for(unsigned char j = 0; j < MOSAIK_NUM_NUCLEOTIDES; j++) { 359 | 360 | // N.B. matching N to everything (while conceptually correct) leads to some 361 | // bad alignments, lets make N be a mismatch instead. 
362 | 363 | // add the matches or mismatches to the hashtable (N is a mismatch) 364 | if((i == nIndex) || (j == nIndex)) mScoringMatrix[i][j] = mMismatchScore; 365 | else if((i == xIndex) || (j == xIndex)) mScoringMatrix[i][j] = mMismatchScore; 366 | else if(i == j) mScoringMatrix[i][j] = mMatchScore; 367 | else mScoringMatrix[i][j] = mMismatchScore; 368 | } 369 | } 370 | 371 | // add ambiguity codes 372 | mScoringMatrix['M' - 'A']['A' - 'A'] = mMatchScore; // M - A 373 | mScoringMatrix['A' - 'A']['M' - 'A'] = mMatchScore; 374 | // add ambiguity codes 375 | mScoringMatrix['M' - 'A']['A' - 'A'] = mMatchScore; // M - A 376 | mScoringMatrix['A' - 'A']['M' - 'A'] = mMatchScore; 377 | mScoringMatrix['M' - 'A']['C' - 'A'] = mMatchScore; // M - C 378 | mScoringMatrix['C' - 'A']['M' - 'A'] = mMatchScore; 379 | 380 | mScoringMatrix['R' - 'A']['A' - 'A'] = mMatchScore; // R - A 381 | mScoringMatrix['A' - 'A']['R' - 'A'] = mMatchScore; 382 | mScoringMatrix['R' - 'A']['G' - 'A'] = mMatchScore; // R - G 383 | mScoringMatrix['G' - 'A']['R' - 'A'] = mMatchScore; 384 | 385 | mScoringMatrix['W' - 'A']['A' - 'A'] = mMatchScore; // W - A 386 | mScoringMatrix['A' - 'A']['W' - 'A'] = mMatchScore; 387 | mScoringMatrix['W' - 'A']['T' - 'A'] = mMatchScore; // W - T 388 | mScoringMatrix['T' - 'A']['W' - 'A'] = mMatchScore; 389 | 390 | mScoringMatrix['S' - 'A']['C' - 'A'] = mMatchScore; // S - C 391 | mScoringMatrix['C' - 'A']['S' - 'A'] = mMatchScore; 392 | mScoringMatrix['S' - 'A']['G' - 'A'] = mMatchScore; // S - G 393 | mScoringMatrix['G' - 'A']['S' - 'A'] = mMatchScore; 394 | 395 | mScoringMatrix['Y' - 'A']['C' - 'A'] = mMatchScore; // Y - C 396 | mScoringMatrix['C' - 'A']['Y' - 'A'] = mMatchScore; 397 | mScoringMatrix['Y' - 'A']['T' - 'A'] = mMatchScore; // Y - T 398 | mScoringMatrix['T' - 'A']['Y' - 'A'] = mMatchScore; 399 | 400 | mScoringMatrix['K' - 'A']['G' - 'A'] = mMatchScore; // K - G 401 | mScoringMatrix['G' - 'A']['K' - 'A'] = mMatchScore; 402 | mScoringMatrix['K' - 'A']['T' 
- 'A'] = mMatchScore; // K - T 403 | mScoringMatrix['T' - 'A']['K' - 'A'] = mMatchScore; 404 | 405 | mScoringMatrix['V' - 'A']['A' - 'A'] = mMatchScore; // V - A 406 | mScoringMatrix['A' - 'A']['V' - 'A'] = mMatchScore; 407 | mScoringMatrix['V' - 'A']['C' - 'A'] = mMatchScore; // V - C 408 | mScoringMatrix['C' - 'A']['V' - 'A'] = mMatchScore; 409 | mScoringMatrix['V' - 'A']['G' - 'A'] = mMatchScore; // V - G 410 | mScoringMatrix['G' - 'A']['V' - 'A'] = mMatchScore; 411 | 412 | mScoringMatrix['H' - 'A']['A' - 'A'] = mMatchScore; // H - A 413 | mScoringMatrix['A' - 'A']['H' - 'A'] = mMatchScore; 414 | mScoringMatrix['H' - 'A']['C' - 'A'] = mMatchScore; // H - C 415 | mScoringMatrix['C' - 'A']['H' - 'A'] = mMatchScore; 416 | mScoringMatrix['H' - 'A']['T' - 'A'] = mMatchScore; // H - T 417 | mScoringMatrix['T' - 'A']['H' - 'A'] = mMatchScore; 418 | 419 | mScoringMatrix['D' - 'A']['A' - 'A'] = mMatchScore; // D - A 420 | mScoringMatrix['A' - 'A']['D' - 'A'] = mMatchScore; 421 | mScoringMatrix['D' - 'A']['G' - 'A'] = mMatchScore; // D - G 422 | mScoringMatrix['G' - 'A']['D' - 'A'] = mMatchScore; 423 | mScoringMatrix['D' - 'A']['T' - 'A'] = mMatchScore; // D - T 424 | mScoringMatrix['T' - 'A']['D' - 'A'] = mMatchScore; 425 | 426 | mScoringMatrix['B' - 'A']['C' - 'A'] = mMatchScore; // B - C 427 | mScoringMatrix['C' - 'A']['B' - 'A'] = mMatchScore; 428 | mScoringMatrix['B' - 'A']['G' - 'A'] = mMatchScore; // B - G 429 | mScoringMatrix['G' - 'A']['B' - 'A'] = mMatchScore; 430 | mScoringMatrix['B' - 'A']['T' - 'A'] = mMatchScore; // B - T 431 | mScoringMatrix['T' - 'A']['B' - 'A'] = mMatchScore; 432 | } 433 | 434 | // enables homo-polymer scoring 435 | void CBandedSmithWaterman::EnableHomoPolymerGapPenalty(float hpGapOpenPenalty) { 436 | mUseHomoPolymerGapOpenPenalty = true; 437 | mHomoPolymerGapOpenPenalty = hpGapOpenPenalty; 438 | } 439 | 440 | // reinitializes the matrices 441 | void CBandedSmithWaterman::ReinitializeMatrices(const PositionType& positionType, const 
unsigned int& s1Length, const unsigned int& s2Length, const pair< pair, pair > hr) { 442 | 443 | /* 444 | try { 445 | mBestScores = new float[mBandwidth + 2]; 446 | mAnchorGapScores = new float[mBandwidth + 2]; 447 | } catch(bad_alloc) { 448 | printf("ERROR: Unable to allocate enough memory for the banded Smith-Waterman algorithm.\n"); 449 | exit(1); 450 | } 451 | */ 452 | 453 | const unsigned int numColumns = mBandwidth + 2; 454 | unsigned int numRows = 0; 455 | 456 | switch(positionType) { 457 | case Position_REF_AND_QUERY_ZERO: 458 | numRows = s2Length + 1; 459 | break; 460 | case Position_REF_ZERO: 461 | numRows = s2Length - hr.second.first + 2; 462 | break; 463 | case Position_QUERY_ZERO: 464 | numRows = min(s2Length + 1, s1Length - hr.first.first + 2); 465 | break; 466 | case Position_REF_AND_QUERO_NONZERO: 467 | numRows = min(s1Length - hr.first.first + 2, s2Length - hr.second.first + 2); 468 | break; 469 | } 470 | 471 | // update the size of the backtrace matrix 472 | if((numColumns * numRows) > mCurrentMatrixSize) { 473 | 474 | mCurrentMatrixSize = numColumns * numRows; 475 | if(mPointers) delete [] mPointers; 476 | 477 | try { 478 | mPointers = new ElementInfo[mCurrentMatrixSize]; 479 | } catch(bad_alloc) { 480 | printf("ERROR: Unable to allocate enough memory for the banded Smith-Waterman algorithm.\n"); 481 | exit(1); 482 | } 483 | } 484 | 485 | // initialize our backtrace matrix 486 | ElementInfo defaultElement; 487 | defaultElement.Direction = Directions_STOP; 488 | defaultElement.mSizeOfHorizontalGaps = 1; 489 | defaultElement.mSizeOfVerticalGaps = 1; 490 | 491 | uninitialized_fill(mPointers, mPointers + mCurrentMatrixSize, defaultElement); 492 | 493 | // update the sequence character arrays 494 | if((s1Length + s2Length) > mCurrentAQSumSize) { 495 | 496 | mCurrentAQSumSize = s1Length + s2Length; 497 | if(mReversedAnchor) delete [] mReversedAnchor; 498 | if(mReversedQuery) delete [] mReversedQuery; 499 | 500 | try { 501 | mReversedAnchor = new 
char[mCurrentAQSumSize + 1]; // reversed sequence #1 502 | mReversedQuery = new char[mCurrentAQSumSize + 1]; // reversed sequence #2 503 | } catch(bad_alloc) { 504 | printf("ERROR: Unable to allocate enough memory for the banded Smith-Waterman algorithm.\n"); 505 | exit(1); 506 | } 507 | } 508 | 509 | // initialize the gap score and score vectors 510 | uninitialized_fill(mAnchorGapScores, mAnchorGapScores + mBandwidth + 2, FLOAT_NEGATIVE_INFINITY); 511 | memset((char*)mBestScores, 0, SIZEOF_FLOAT * (mBandwidth + 2)); 512 | mBestScores[0] = FLOAT_NEGATIVE_INFINITY; 513 | mBestScores[mBandwidth + 1] = FLOAT_NEGATIVE_INFINITY; 514 | } 515 | 516 | // performs the backtrace algorithm 517 | void CBandedSmithWaterman::Traceback(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2, unsigned int bestRow, unsigned int bestColumn, const unsigned int rowOffset, const unsigned int columnOffset){ 518 | 519 | 520 | unsigned int currentRow = bestRow; 521 | unsigned int currentColumn = bestColumn; 522 | unsigned int currentPosition = ((currentRow + rowOffset) * (mBandwidth + 2)) + (columnOffset - currentRow + currentColumn); 523 | 524 | 525 | // record the numbers of row and column before the current row and column 526 | unsigned int previousRow = bestRow; 527 | unsigned int previousColumn = bestColumn; 528 | 529 | unsigned int gappedAnchorLen = 0; 530 | unsigned int gappedQueryLen = 0; 531 | unsigned int numMismatches = 0; 532 | 533 | bool keepProcessing = true; 534 | while(keepProcessing) { 535 | unsigned int nVerticalGap = 0; 536 | unsigned int nHorizontalGap = 0; 537 | switch(mPointers[currentPosition].Direction){ 538 | case Directions_DIAGONAL: 539 | nVerticalGap = mPointers[currentPosition].mSizeOfVerticalGaps; 540 | for(unsigned int i = 0; i < nVerticalGap; i++){ 541 | mReversedAnchor[gappedAnchorLen++] = GAP; 542 | mReversedQuery[gappedQueryLen++] = s2[currentRow]; 543 | 544 | numMismatches++; 545 | 546 | previousRow = currentRow; 547 | 
previousColumn = currentColumn; 548 | 549 | currentRow--; 550 | } 551 | break; 552 | 553 | case Directions_STOP: 554 | keepProcessing = false; 555 | //mReversedAnchor[gappedAnchorLen+1]='\0'; 556 | //mReversedQuery [gappedQueryLen+1]='\0'; 557 | break; 558 | 559 | case Directions_UP: 560 | 561 | mReversedAnchor[gappedAnchorLen++] = s1[currentColumn]; 562 | mReversedQuery[gappedQueryLen++] = s2[currentRow]; 563 | 564 | if(s1[currentColumn] != s2[currentRow]) numMismatches++; 565 | previousRow = currentRow; 566 | previousColumn = currentColumn; 567 | 568 | currentRow--; 569 | currentColumn--; 570 | break; 571 | 572 | case Directions_LEFT: 573 | nHorizontalGap = mPointers[currentPosition].mSizeOfHorizontalGaps; 574 | for(unsigned int i = 0; i < nHorizontalGap; i++){ 575 | 576 | mReversedAnchor[gappedAnchorLen++] = s1[currentColumn]; 577 | mReversedQuery[gappedQueryLen++] = GAP; 578 | 579 | numMismatches++; 580 | 581 | previousRow = currentRow; 582 | previousColumn = currentColumn; 583 | 584 | 585 | currentColumn--; 586 | } 587 | break; 588 | } 589 | currentPosition = ((currentRow + rowOffset) * (mBandwidth + 2)) + (columnOffset - currentRow + currentColumn); 590 | } 591 | 592 | // correct the reference and query sequence order 593 | mReversedAnchor[gappedAnchorLen] = 0; 594 | mReversedQuery [gappedQueryLen] = 0; 595 | reverse(mReversedAnchor, mReversedAnchor + gappedAnchorLen); 596 | reverse(mReversedQuery, mReversedQuery + gappedQueryLen); 597 | 598 | //alignment.Reference = mReversedAnchor; 599 | //alignment.Query = mReversedQuery; 600 | 601 | // assign the alignment endpoints 602 | //alignment.ReferenceBegin = previousColumn; 603 | //alignment.ReferenceEnd = bestColumn; 604 | referenceAl = previousColumn; 605 | /* 606 | if(alignment.IsReverseComplement){ 607 | alignment.QueryBegin = s2.length() - bestRow - 1; 608 | alignment.QueryEnd = s2.length() - previousRow - 1; 609 | } else { 610 | alignment.QueryBegin = previousRow; 611 | alignment.QueryEnd = bestRow; 612 | } 
613 | */ 614 | 615 | //alignment.QueryLength = alignment.QueryEnd - alignment.QueryBegin + 1; 616 | //alignment.NumMismatches = numMismatches; 617 | 618 | const unsigned int alLength = strlen(mReversedAnchor); 619 | unsigned int m = 0, d = 0, i = 0; 620 | bool dashRegion = false; 621 | ostringstream oCigar; 622 | 623 | if ( previousRow != 0 ) 624 | oCigar << previousRow << 'S'; 625 | 626 | for ( unsigned int j = 0; j < alLength; j++ ) { 627 | // m 628 | if ( ( mReversedAnchor[j] != GAP ) && ( mReversedQuery[j] != GAP ) ) { 629 | if ( dashRegion ) { 630 | if ( d != 0 ) oCigar << d << 'D'; 631 | else oCigar << i << 'I'; 632 | } 633 | dashRegion = false; 634 | m++; 635 | d = 0; 636 | i = 0; 637 | } 638 | // I or D 639 | else { 640 | if ( !dashRegion ) 641 | oCigar << m << 'M'; 642 | dashRegion = true; 643 | m = 0; 644 | if ( mReversedAnchor[j] == GAP ) { 645 | if ( d != 0 ) oCigar << d << 'D'; 646 | i++; 647 | d = 0; 648 | } 649 | else { 650 | if ( i != 0 ) oCigar << i << 'I'; 651 | d++; 652 | i = 0; 653 | } 654 | } 655 | } 656 | 657 | if ( m != 0 ) oCigar << m << 'M'; 658 | else if ( d != 0 ) oCigar << d << 'D'; 659 | else if ( i != 0 ) oCigar << i << 'I'; 660 | 661 | if ( ( bestRow + 1 ) != s2.length() ) 662 | oCigar << s2.length() - bestRow - 1 << 'S'; 663 | 664 | cigarAl = oCigar.str(); 665 | 666 | 667 | // correct the homopolymer gap order 668 | CorrectHomopolymerGapOrder(alLength, numMismatches); 669 | 670 | } 671 | -------------------------------------------------------------------------------- /SmithWatermanGotoh.cpp: -------------------------------------------------------------------------------- 1 | #include "SmithWatermanGotoh.h" 2 | 3 | const float CSmithWatermanGotoh::FLOAT_NEGATIVE_INFINITY = (float)-1e+30; 4 | 5 | const char CSmithWatermanGotoh::Directions_STOP = 0; 6 | const char CSmithWatermanGotoh::Directions_LEFT = 1; 7 | const char CSmithWatermanGotoh::Directions_DIAGONAL = 2; 8 | const char CSmithWatermanGotoh::Directions_UP = 3; 9 | 10 | const 
int CSmithWatermanGotoh::repeat_size_max = 12; 11 | 12 | CSmithWatermanGotoh::CSmithWatermanGotoh(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty) 13 | : mCurrentMatrixSize(0) 14 | , mCurrentAnchorSize(0) 15 | , mCurrentQuerySize(0) 16 | , mCurrentAQSumSize(0) 17 | , mMatchScore(matchScore) 18 | , mMismatchScore(mismatchScore) 19 | , mGapOpenPenalty(gapOpenPenalty) 20 | , mGapExtendPenalty(gapExtendPenalty) 21 | , mPointers(NULL) 22 | , mSizesOfVerticalGaps(NULL) 23 | , mSizesOfHorizontalGaps(NULL) 24 | , mQueryGapScores(NULL) 25 | , mBestScores(NULL) 26 | , mReversedAnchor(NULL) 27 | , mReversedQuery(NULL) 28 | , mUseHomoPolymerGapOpenPenalty(false) 29 | , mUseEntropyGapOpenPenalty(false) 30 | , mUseRepeatGapExtensionPenalty(false) 31 | { 32 | CreateScoringMatrix(); 33 | } 34 | 35 | CSmithWatermanGotoh::~CSmithWatermanGotoh(void) { 36 | if(mPointers) delete [] mPointers; 37 | if(mSizesOfVerticalGaps) delete [] mSizesOfVerticalGaps; 38 | if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps; 39 | if(mQueryGapScores) delete [] mQueryGapScores; 40 | if(mBestScores) delete [] mBestScores; 41 | if(mReversedAnchor) delete [] mReversedAnchor; 42 | if(mReversedQuery) delete [] mReversedQuery; 43 | } 44 | 45 | // aligns the query sequence to the reference using the Smith Waterman Gotoh algorithm 46 | void CSmithWatermanGotoh::Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2) { 47 | 48 | if((s1.length() == 0) || (s2.length() == 0)) { 49 | cout << "ERROR: Found a read with a zero length." 
<< endl; 50 | exit(1); 51 | } 52 | 53 | unsigned int referenceLen = s1.length() + 1; 54 | unsigned int queryLen = s2.length() + 1; 55 | unsigned int sequenceSumLength = s1.length() + s2.length(); 56 | 57 | // reinitialize our matrices 58 | 59 | if((referenceLen * queryLen) > mCurrentMatrixSize) { 60 | 61 | // calculate the new matrix size 62 | mCurrentMatrixSize = referenceLen * queryLen; 63 | 64 | // delete the old arrays 65 | if(mPointers) delete [] mPointers; 66 | if(mSizesOfVerticalGaps) delete [] mSizesOfVerticalGaps; 67 | if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps; 68 | 69 | try { 70 | 71 | // initialize the arrays 72 | mPointers = new char[mCurrentMatrixSize]; 73 | mSizesOfVerticalGaps = new short[mCurrentMatrixSize]; 74 | mSizesOfHorizontalGaps = new short[mCurrentMatrixSize]; 75 | 76 | } catch(bad_alloc) { 77 | cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl; 78 | exit(1); 79 | } 80 | } 81 | 82 | // initialize the traceback matrix to STOP 83 | memset((char*)mPointers, 0, SIZEOF_CHAR * queryLen); 84 | for(unsigned int i = 1; i < referenceLen; i++) mPointers[i * queryLen] = 0; 85 | 86 | // initialize the gap matrices to 1 87 | uninitialized_fill(mSizesOfVerticalGaps, mSizesOfVerticalGaps + mCurrentMatrixSize, 1); 88 | uninitialized_fill(mSizesOfHorizontalGaps, mSizesOfHorizontalGaps + mCurrentMatrixSize, 1); 89 | 90 | 91 | // initialize our repeat counts if they are needed 92 | vector > referenceRepeats; 93 | vector > queryRepeats; 94 | int queryBeginRepeatBases = 0; 95 | int queryEndRepeatBases = 0; 96 | if (mUseRepeatGapExtensionPenalty) { 97 | for (unsigned int i = 0; i < queryLen; ++i) 98 | queryRepeats.push_back(repeatCounts(i, s2, repeat_size_max)); 99 | for (unsigned int i = 0; i < referenceLen; ++i) 100 | referenceRepeats.push_back(repeatCounts(i, s1, repeat_size_max)); 101 | 102 | // keep only the biggest repeat 103 | vector >::iterator q = queryRepeats.begin(); 104 | for (; q != 
queryRepeats.end(); ++q) { 105 | map::iterator biggest = q->begin(); 106 | map::iterator z = q->begin(); 107 | for (; z != q->end(); ++z) 108 | if (z->first.size() > biggest->first.size()) biggest = z; 109 | z = q->begin(); 110 | while (z != q->end()) { 111 | if (z != biggest) 112 | q->erase(z++); 113 | else ++z; 114 | } 115 | } 116 | 117 | q = referenceRepeats.begin(); 118 | for (; q != referenceRepeats.end(); ++q) { 119 | map::iterator biggest = q->begin(); 120 | map::iterator z = q->begin(); 121 | for (; z != q->end(); ++z) 122 | if (z->first.size() > biggest->first.size()) biggest = z; 123 | z = q->begin(); 124 | while (z != q->end()) { 125 | if (z != biggest) 126 | q->erase(z++); 127 | else ++z; 128 | } 129 | } 130 | 131 | // remove repeat information from ends of queries 132 | // this results in the addition of spurious flanking deletions in repeats 133 | map& qrend = queryRepeats.at(queryRepeats.size() - 2); 134 | if (!qrend.empty()) { 135 | int queryEndRepeatBases = qrend.begin()->first.size() * qrend.begin()->second; 136 | for (int i = 0; i < queryEndRepeatBases; ++i) 137 | queryRepeats.at(queryRepeats.size() - 2 - i).clear(); 138 | } 139 | 140 | map& qrbegin = queryRepeats.front(); 141 | if (!qrbegin.empty()) { 142 | int queryBeginRepeatBases = qrbegin.begin()->first.size() * qrbegin.begin()->second; 143 | for (int i = 0; i < queryBeginRepeatBases; ++i) 144 | queryRepeats.at(i).clear(); 145 | } 146 | 147 | } 148 | 149 | int entropyWindowSize = 8; 150 | vector referenceEntropies; 151 | vector queryEntropies; 152 | if (mUseEntropyGapOpenPenalty) { 153 | for (unsigned int i = 0; i < queryLen; ++i) 154 | queryEntropies.push_back( 155 | shannon_H((char*) &s2[max(0, min((int) i - entropyWindowSize / 2, (int) queryLen - entropyWindowSize - 1))], 156 | entropyWindowSize)); 157 | for (unsigned int i = 0; i < referenceLen; ++i) 158 | referenceEntropies.push_back( 159 | shannon_H((char*) &s1[max(0, min((int) i - entropyWindowSize / 2, (int) referenceLen - 
entropyWindowSize - 1))], 160 | entropyWindowSize)); 161 | } 162 | 163 | // normalize entropies 164 | /* 165 | float qsum = 0; 166 | float qnorm = 0; 167 | float qmax = 0; 168 | for (vector::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q) { 169 | qsum += *q; 170 | if (*q > qmax) qmax = *q; 171 | } 172 | qnorm = qsum / queryEntropies.size(); 173 | for (vector::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q) 174 | *q = *q / qsum + qmax; 175 | 176 | float rsum = 0; 177 | float rnorm = 0; 178 | float rmax = 0; 179 | for (vector::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r) { 180 | rsum += *r; 181 | if (*r > rmax) rmax = *r; 182 | } 183 | rnorm = rsum / referenceEntropies.size(); 184 | for (vector::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r) 185 | *r = *r / rsum + rmax; 186 | */ 187 | 188 | // 189 | // construct 190 | // 191 | 192 | // reinitialize our query-dependent arrays 193 | if(s2.length() > mCurrentQuerySize) { 194 | 195 | // calculate the new query array size 196 | mCurrentQuerySize = s2.length(); 197 | 198 | // delete the old arrays 199 | if(mQueryGapScores) delete [] mQueryGapScores; 200 | if(mBestScores) delete [] mBestScores; 201 | 202 | // initialize the arrays 203 | try { 204 | 205 | mQueryGapScores = new float[mCurrentQuerySize + 1]; 206 | mBestScores = new float[mCurrentQuerySize + 1]; 207 | 208 | } catch(bad_alloc) { 209 | cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." 
<< endl; 210 | exit(1); 211 | } 212 | } 213 | 214 | // reinitialize our reference+query-dependent arrays 215 | if(sequenceSumLength > mCurrentAQSumSize) { 216 | 217 | // calculate the new reference array size 218 | mCurrentAQSumSize = sequenceSumLength; 219 | 220 | // delete the old arrays 221 | if(mReversedAnchor) delete [] mReversedAnchor; 222 | if(mReversedQuery) delete [] mReversedQuery; 223 | 224 | // initialize the arrays 225 | try { 226 | 227 | mReversedAnchor = new char[mCurrentAQSumSize + 1]; // reversed sequence #1 228 | mReversedQuery = new char[mCurrentAQSumSize + 1]; // reversed sequence #2 229 | 230 | } catch(bad_alloc) { 231 | cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl; 232 | exit(1); 233 | } 234 | } 235 | 236 | // initialize the gap score and score vectors 237 | uninitialized_fill(mQueryGapScores, mQueryGapScores + queryLen, FLOAT_NEGATIVE_INFINITY); 238 | memset((char*)mBestScores, 0, SIZEOF_FLOAT * queryLen); 239 | 240 | float similarityScore, totalSimilarityScore, bestScoreDiagonal; 241 | float queryGapExtendScore, queryGapOpenScore; 242 | float referenceGapExtendScore, referenceGapOpenScore, currentAnchorGapScore; 243 | 244 | unsigned int BestColumn = 0; 245 | unsigned int BestRow = 0; 246 | BestScore = FLOAT_NEGATIVE_INFINITY; 247 | 248 | for(unsigned int i = 1, k = queryLen; i < referenceLen; i++, k += queryLen) { 249 | 250 | currentAnchorGapScore = FLOAT_NEGATIVE_INFINITY; 251 | bestScoreDiagonal = mBestScores[0]; 252 | 253 | for(unsigned int j = 1, l = k + 1; j < queryLen; j++, l++) { 254 | 255 | // calculate our similarity score 256 | similarityScore = mScoringMatrix[s1[i - 1] - 'A'][s2[j - 1] - 'A']; 257 | 258 | // fill the matrices 259 | totalSimilarityScore = bestScoreDiagonal + similarityScore; 260 | 261 | //cerr << "i: " << i << ", j: " << j << ", totalSimilarityScore: " << totalSimilarityScore << endl; 262 | 263 | queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty; 264 | 
queryGapOpenScore = mBestScores[j] - mGapOpenPenalty; 265 | 266 | // compute the homo-polymer gap score if enabled 267 | if(mUseHomoPolymerGapOpenPenalty) 268 | if((j > 1) && (s2[j - 1] == s2[j - 2])) 269 | queryGapOpenScore = mBestScores[j] - mHomoPolymerGapOpenPenalty; 270 | 271 | // compute the entropy gap score if enabled 272 | if (mUseEntropyGapOpenPenalty) { 273 | queryGapOpenScore = 274 | mBestScores[j] - mGapOpenPenalty 275 | * max(queryEntropies.at(j), referenceEntropies.at(i)) 276 | * mEntropyGapOpenPenalty; 277 | } 278 | 279 | int gaplen = mSizesOfVerticalGaps[l - queryLen] + 1; 280 | 281 | if (mUseRepeatGapExtensionPenalty) { 282 | map& repeats = queryRepeats[j]; 283 | // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in? 284 | if (!repeats.empty()) { 285 | 286 | const pair& repeat = *repeats.begin(); 287 | int repeatsize = repeat.first.size(); 288 | if (gaplen != repeatsize && gaplen % repeatsize != 0) { 289 | gaplen = gaplen / repeatsize + repeatsize; 290 | } 291 | 292 | if ((repeat.first.size() * repeat.second) > 3 && gaplen + i < s1.length()) { 293 | string gapseq = string(&s1[i], gaplen); 294 | if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) { 295 | queryGapExtendScore = mQueryGapScores[j] 296 | + mRepeatGapExtensionPenalty / (float) gaplen; 297 | // mMaxRepeatGapExtensionPenalty) 298 | } else { 299 | queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty; 300 | } 301 | } 302 | } else { 303 | queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty; 304 | } 305 | } 306 | 307 | if(queryGapExtendScore > queryGapOpenScore) { 308 | mQueryGapScores[j] = queryGapExtendScore; 309 | mSizesOfVerticalGaps[l] = gaplen; 310 | } else mQueryGapScores[j] = queryGapOpenScore; 311 | 312 | referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty; 313 | referenceGapOpenScore = mBestScores[j - 1] - mGapOpenPenalty; 314 | 315 | // compute the homo-polymer gap score 
if enabled 316 | if(mUseHomoPolymerGapOpenPenalty) 317 | if((i > 1) && (s1[i - 1] == s1[i - 2])) 318 | referenceGapOpenScore = mBestScores[j - 1] - mHomoPolymerGapOpenPenalty; 319 | 320 | // compute the entropy gap score if enabled 321 | if (mUseEntropyGapOpenPenalty) { 322 | referenceGapOpenScore = 323 | mBestScores[j - 1] - mGapOpenPenalty 324 | * max(queryEntropies.at(j), referenceEntropies.at(i)) 325 | * mEntropyGapOpenPenalty; 326 | } 327 | 328 | gaplen = mSizesOfHorizontalGaps[l - 1] + 1; 329 | 330 | if (mUseRepeatGapExtensionPenalty) { 331 | map& repeats = referenceRepeats[i]; 332 | // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in? 333 | if (!repeats.empty()) { 334 | 335 | const pair& repeat = *repeats.begin(); 336 | int repeatsize = repeat.first.size(); 337 | if (gaplen != repeatsize && gaplen % repeatsize != 0) { 338 | gaplen = gaplen / repeatsize + repeatsize; 339 | } 340 | 341 | if ((repeat.first.size() * repeat.second) > 3 && gaplen + j < s2.length()) { 342 | string gapseq = string(&s2[j], gaplen); 343 | if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) { 344 | referenceGapExtendScore = currentAnchorGapScore 345 | + mRepeatGapExtensionPenalty / (float) gaplen; 346 | //mMaxRepeatGapExtensionPenalty) 347 | } else { 348 | referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty; 349 | } 350 | } 351 | } else { 352 | referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty; 353 | } 354 | } 355 | 356 | if(referenceGapExtendScore > referenceGapOpenScore) { 357 | currentAnchorGapScore = referenceGapExtendScore; 358 | mSizesOfHorizontalGaps[l] = gaplen; 359 | } else currentAnchorGapScore = referenceGapOpenScore; 360 | 361 | bestScoreDiagonal = mBestScores[j]; 362 | mBestScores[j] = MaxFloats(totalSimilarityScore, mQueryGapScores[j], currentAnchorGapScore); 363 | 364 | 365 | // determine the traceback direction 366 | // diagonal (445364713) > stop 
(238960195) > up (214378647) > left (166504495) 367 | if(mBestScores[j] == 0) mPointers[l] = Directions_STOP; 368 | else if(mBestScores[j] == totalSimilarityScore) mPointers[l] = Directions_DIAGONAL; 369 | else if(mBestScores[j] == mQueryGapScores[j]) mPointers[l] = Directions_UP; 370 | else mPointers[l] = Directions_LEFT; 371 | 372 | // set the traceback start at the current cell i, j and score 373 | if(mBestScores[j] > BestScore) { 374 | BestRow = i; 375 | BestColumn = j; 376 | BestScore = mBestScores[j]; 377 | } 378 | } 379 | } 380 | 381 | // 382 | // traceback 383 | // 384 | 385 | // aligned sequences 386 | int gappedAnchorLen = 0; // length of sequence #1 after alignment 387 | int gappedQueryLen = 0; // length of sequence #2 after alignment 388 | int numMismatches = 0; // the mismatched nucleotide count 389 | 390 | char c1, c2; 391 | 392 | int ci = BestRow; 393 | int cj = BestColumn; 394 | int ck = ci * queryLen; 395 | 396 | // traceback flag 397 | bool keepProcessing = true; 398 | 399 | while(keepProcessing) { 400 | //cerr << ci << " " << cj << " " << ck << " ... 
" << gappedAnchorLen << " " << gappedQueryLen << endl; 401 | 402 | // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495) 403 | switch(mPointers[ck + cj]) { 404 | 405 | case Directions_DIAGONAL: 406 | c1 = s1[--ci]; 407 | c2 = s2[--cj]; 408 | ck -= queryLen; 409 | 410 | mReversedAnchor[gappedAnchorLen++] = c1; 411 | mReversedQuery[gappedQueryLen++] = c2; 412 | 413 | // increment our mismatch counter 414 | if(mScoringMatrix[c1 - 'A'][c2 - 'A'] == mMismatchScore) numMismatches++; 415 | break; 416 | 417 | case Directions_STOP: 418 | keepProcessing = false; 419 | break; 420 | 421 | case Directions_UP: 422 | for(unsigned int l = 0, len = mSizesOfVerticalGaps[ck + cj]; l < len; l++) { 423 | if (ci <= 0) { 424 | keepProcessing = false; 425 | break; 426 | } 427 | mReversedAnchor[gappedAnchorLen++] = s1[--ci]; 428 | mReversedQuery[gappedQueryLen++] = GAP; 429 | ck -= queryLen; 430 | numMismatches++; 431 | } 432 | break; 433 | 434 | case Directions_LEFT: 435 | for(unsigned int l = 0, len = mSizesOfHorizontalGaps[ck + cj]; l < len; l++) { 436 | if (cj <= 0) { 437 | keepProcessing = false; 438 | break; 439 | } 440 | mReversedAnchor[gappedAnchorLen++] = GAP; 441 | mReversedQuery[gappedQueryLen++] = s2[--cj]; 442 | numMismatches++; 443 | } 444 | break; 445 | } 446 | } 447 | 448 | // define the reference and query sequences 449 | mReversedAnchor[gappedAnchorLen] = 0; 450 | mReversedQuery[gappedQueryLen] = 0; 451 | 452 | // catch sequences with different lengths 453 | if(gappedAnchorLen != gappedQueryLen) { 454 | cout << "ERROR: The aligned sequences have different lengths after Smith-Waterman-Gotoh algorithm." 
<< endl; 455 | exit(1); 456 | } 457 | 458 | // reverse the strings and assign them to our alignment structure 459 | reverse(mReversedAnchor, mReversedAnchor + gappedAnchorLen); 460 | reverse(mReversedQuery, mReversedQuery + gappedQueryLen); 461 | 462 | //alignment.Reference = mReversedAnchor; 463 | //alignment.Query = mReversedQuery; 464 | 465 | // set the reference endpoints 466 | //alignment.ReferenceBegin = ci; 467 | //alignment.ReferenceEnd = BestRow - 1; 468 | referenceAl = ci; 469 | 470 | // set the query endpoints 471 | /* 472 | if(alignment.IsReverseComplement) { 473 | alignment.QueryBegin = s2Length - BestColumn; 474 | alignment.QueryEnd = s2Length - cj - 1; 475 | // alignment.QueryLength= alignment.QueryBegin - alignment.QueryEnd + 1; 476 | } else { 477 | alignment.QueryBegin = cj; 478 | alignment.QueryEnd = BestColumn - 1; 479 | // alignment.QueryLength= alignment.QueryEnd - alignment.QueryBegin + 1; 480 | } 481 | */ 482 | 483 | // set the query length and number of mismatches 484 | //alignment.QueryLength = alignment.QueryEnd - alignment.QueryBegin + 1; 485 | //alignment.NumMismatches = numMismatches; 486 | 487 | unsigned int alLength = strlen(mReversedAnchor); 488 | unsigned int m = 0, d = 0, i = 0; 489 | bool dashRegion = false; 490 | ostringstream oCigar (ostringstream::out); 491 | int insertedBases = 0; 492 | 493 | if ( cj != 0 ) { 494 | if ( cj > 0 ) { 495 | oCigar << cj << 'S'; 496 | } else { // how do we get negative cj's? 
497 | referenceAl -= cj; 498 | alLength += cj; 499 | } 500 | } 501 | 502 | for ( unsigned int j = 0; j < alLength; j++ ) { 503 | // m 504 | if ( ( mReversedAnchor[j] != GAP ) && ( mReversedQuery[j] != GAP ) ) { 505 | if ( dashRegion ) { 506 | if ( d != 0 ) oCigar << d << 'D'; 507 | else { oCigar << i << 'I'; insertedBases += i; } 508 | } 509 | dashRegion = false; 510 | m++; 511 | d = 0; 512 | i = 0; 513 | } 514 | else { 515 | if ( !dashRegion && m ) 516 | oCigar << m << 'M'; 517 | dashRegion = true; 518 | m = 0; 519 | if ( mReversedAnchor[j] == GAP ) { 520 | if ( d != 0 ) oCigar << d << 'D'; 521 | i++; 522 | d = 0; 523 | } 524 | else { 525 | if ( i != 0) { oCigar << i << 'I'; insertedBases += i; } 526 | d++; 527 | i = 0; 528 | } 529 | } 530 | } 531 | if ( m != 0 ) oCigar << m << 'M'; 532 | else if ( d != 0 ) oCigar << d << 'D'; 533 | else if ( i != 0 ) oCigar << i << 'I'; 534 | 535 | if ( BestColumn != s2.length() ) 536 | oCigar << s2.length() - BestColumn << 'S'; 537 | 538 | cigarAl = oCigar.str(); 539 | 540 | // fix the gap order 541 | CorrectHomopolymerGapOrder(alLength, numMismatches); 542 | 543 | if (mUseEntropyGapOpenPenalty || mUseRepeatGapExtensionPenalty) { 544 | int offset = 0; 545 | string oldCigar; 546 | try { 547 | oldCigar = cigarAl; 548 | stablyLeftAlign(s2, cigarAl, s1.substr(referenceAl, alLength - insertedBases), offset); 549 | } catch (...) 
{ 550 | cerr << "an exception occurred when left-aligning " << s1 << " " << s2 << endl; 551 | cigarAl = oldCigar; // undo the failed left-realignment attempt 552 | offset = 0; 553 | } 554 | referenceAl += offset; 555 | } 556 | 557 | } 558 | 559 | // creates a simple scoring matrix to align the nucleotides and the ambiguity code N 560 | void CSmithWatermanGotoh::CreateScoringMatrix(void) { 561 | 562 | unsigned int nIndex = 13; 563 | unsigned int xIndex = 23; 564 | 565 | // define the N score to be 1/4 of the span between mismatch and match 566 | //const short nScore = mMismatchScore + (short)(((mMatchScore - mMismatchScore) / 4.0) + 0.5); 567 | 568 | // calculate the scoring matrix 569 | for(unsigned char i = 0; i < MOSAIK_NUM_NUCLEOTIDES; i++) { 570 | for(unsigned char j = 0; j < MOSAIK_NUM_NUCLEOTIDES; j++) { 571 | 572 | // N.B. matching N to everything (while conceptually correct) leads to some 573 | // bad alignments, lets make N be a mismatch instead. 574 | 575 | // add the matches or mismatches to the hashtable (N is a mismatch) 576 | if((i == nIndex) || (j == nIndex)) mScoringMatrix[i][j] = mMismatchScore; 577 | else if((i == xIndex) || (j == xIndex)) mScoringMatrix[i][j] = mMismatchScore; 578 | else if(i == j) mScoringMatrix[i][j] = mMatchScore; 579 | else mScoringMatrix[i][j] = mMismatchScore; 580 | } 581 | } 582 | 583 | // add ambiguity codes 584 | mScoringMatrix['M' - 'A']['A' - 'A'] = mMatchScore; // M - A 585 | mScoringMatrix['A' - 'A']['M' - 'A'] = mMatchScore; 586 | mScoringMatrix['M' - 'A']['C' - 'A'] = mMatchScore; // M - C 587 | mScoringMatrix['C' - 'A']['M' - 'A'] = mMatchScore; 588 | 589 | mScoringMatrix['R' - 'A']['A' - 'A'] = mMatchScore; // R - A 590 | mScoringMatrix['A' - 'A']['R' - 'A'] = mMatchScore; 591 | mScoringMatrix['R' - 'A']['G' - 'A'] = mMatchScore; // R - G 592 | mScoringMatrix['G' - 'A']['R' - 'A'] = mMatchScore; 593 | 594 | mScoringMatrix['W' - 'A']['A' - 'A'] = mMatchScore; // W - A 595 | mScoringMatrix['A' - 'A']['W' - 'A'] = 
mMatchScore; 596 | mScoringMatrix['W' - 'A']['T' - 'A'] = mMatchScore; // W - T 597 | mScoringMatrix['T' - 'A']['W' - 'A'] = mMatchScore; 598 | 599 | mScoringMatrix['S' - 'A']['C' - 'A'] = mMatchScore; // S - C 600 | mScoringMatrix['C' - 'A']['S' - 'A'] = mMatchScore; 601 | mScoringMatrix['S' - 'A']['G' - 'A'] = mMatchScore; // S - G 602 | mScoringMatrix['G' - 'A']['S' - 'A'] = mMatchScore; 603 | 604 | mScoringMatrix['Y' - 'A']['C' - 'A'] = mMatchScore; // Y - C 605 | mScoringMatrix['C' - 'A']['Y' - 'A'] = mMatchScore; 606 | mScoringMatrix['Y' - 'A']['T' - 'A'] = mMatchScore; // Y - T 607 | mScoringMatrix['T' - 'A']['Y' - 'A'] = mMatchScore; 608 | 609 | mScoringMatrix['K' - 'A']['G' - 'A'] = mMatchScore; // K - G 610 | mScoringMatrix['G' - 'A']['K' - 'A'] = mMatchScore; 611 | mScoringMatrix['K' - 'A']['T' - 'A'] = mMatchScore; // K - T 612 | mScoringMatrix['T' - 'A']['K' - 'A'] = mMatchScore; 613 | 614 | mScoringMatrix['V' - 'A']['A' - 'A'] = mMatchScore; // V - A 615 | mScoringMatrix['A' - 'A']['V' - 'A'] = mMatchScore; 616 | mScoringMatrix['V' - 'A']['C' - 'A'] = mMatchScore; // V - C 617 | mScoringMatrix['C' - 'A']['V' - 'A'] = mMatchScore; 618 | mScoringMatrix['V' - 'A']['G' - 'A'] = mMatchScore; // V - G 619 | mScoringMatrix['G' - 'A']['V' - 'A'] = mMatchScore; 620 | 621 | mScoringMatrix['H' - 'A']['A' - 'A'] = mMatchScore; // H - A 622 | mScoringMatrix['A' - 'A']['H' - 'A'] = mMatchScore; 623 | mScoringMatrix['H' - 'A']['C' - 'A'] = mMatchScore; // H - C 624 | mScoringMatrix['C' - 'A']['H' - 'A'] = mMatchScore; 625 | mScoringMatrix['H' - 'A']['T' - 'A'] = mMatchScore; // H - T 626 | mScoringMatrix['T' - 'A']['H' - 'A'] = mMatchScore; 627 | 628 | mScoringMatrix['D' - 'A']['A' - 'A'] = mMatchScore; // D - A 629 | mScoringMatrix['A' - 'A']['D' - 'A'] = mMatchScore; 630 | mScoringMatrix['D' - 'A']['G' - 'A'] = mMatchScore; // D - G 631 | mScoringMatrix['G' - 'A']['D' - 'A'] = mMatchScore; 632 | mScoringMatrix['D' - 'A']['T' - 'A'] = mMatchScore; // D - T 633 | 
mScoringMatrix['T' - 'A']['D' - 'A'] = mMatchScore; 634 | 635 | mScoringMatrix['B' - 'A']['C' - 'A'] = mMatchScore; // B - C 636 | mScoringMatrix['C' - 'A']['B' - 'A'] = mMatchScore; 637 | mScoringMatrix['B' - 'A']['G' - 'A'] = mMatchScore; // B - G 638 | mScoringMatrix['G' - 'A']['B' - 'A'] = mMatchScore; 639 | mScoringMatrix['B' - 'A']['T' - 'A'] = mMatchScore; // B - T 640 | mScoringMatrix['T' - 'A']['B' - 'A'] = mMatchScore; 641 | } 642 | 643 | // enables homo-polymer scoring 644 | void CSmithWatermanGotoh::EnableHomoPolymerGapPenalty(float hpGapOpenPenalty) { 645 | mUseHomoPolymerGapOpenPenalty = true; 646 | mHomoPolymerGapOpenPenalty = hpGapOpenPenalty; 647 | } 648 | 649 | // enables entropy-based gap open penalty 650 | void CSmithWatermanGotoh::EnableEntropyGapPenalty(float enGapOpenPenalty) { 651 | mUseEntropyGapOpenPenalty = true; 652 | mEntropyGapOpenPenalty = enGapOpenPenalty; 653 | } 654 | 655 | // enables repeat-aware gap extension penalty 656 | void CSmithWatermanGotoh::EnableRepeatGapExtensionPenalty(float rGapExtensionPenalty, float rMaxGapRepeatExtensionPenaltyFactor) { 657 | mUseRepeatGapExtensionPenalty = true; 658 | mRepeatGapExtensionPenalty = rGapExtensionPenalty; 659 | mMaxRepeatGapExtensionPenalty = rMaxGapRepeatExtensionPenaltyFactor * rGapExtensionPenalty; 660 | } 661 | 662 | // corrects the homopolymer gap order for forward alignments 663 | void CSmithWatermanGotoh::CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches) { 664 | 665 | // this is only required for alignments with mismatches 666 | //if(al.NumMismatches == 0) return; 667 | if ( numMismatches == 0 ) return; 668 | 669 | // localize the alignment data 670 | //char* pReference = al.Reference.Data(); 671 | //char* pQuery = al.Query.Data(); 672 | //const unsigned int numBases = al.Reference.Length(); 673 | char* pReference = mReversedAnchor; 674 | char* pQuery = mReversedQuery; 675 | 676 | // initialize 677 | bool hasReferenceGap = false, 
hasQueryGap = false; 678 | char* pNonGapSeq = NULL; 679 | char* pGapSeq = NULL; 680 | char nonGapBase = 'J'; 681 | 682 | // identify gapped regions 683 | for(unsigned int i = 0; i < numBases; i++) { 684 | 685 | // check if the current position is gapped 686 | hasReferenceGap = false; 687 | hasQueryGap = false; 688 | 689 | if(pReference[i] == GAP) { 690 | hasReferenceGap = true; 691 | pNonGapSeq = pQuery; 692 | pGapSeq = pReference; 693 | nonGapBase = pQuery[i]; 694 | } 695 | 696 | if(pQuery[i] == GAP) { 697 | hasQueryGap = true; 698 | pNonGapSeq = pReference; 699 | pGapSeq = pQuery; 700 | nonGapBase = pReference[i]; 701 | } 702 | 703 | // continue if we don't have any gaps 704 | if(!hasReferenceGap && !hasQueryGap) continue; 705 | 706 | // sanity check 707 | if(hasReferenceGap && hasQueryGap) { 708 | printf("ERROR: Found a gap in both the reference sequence and query sequence.\n"); 709 | exit(1); 710 | } 711 | 712 | // find the non-gapped length (forward) 713 | unsigned short numGappedBases = 0; 714 | unsigned short nonGapLength = 0; 715 | unsigned short testPos = i; 716 | while(testPos < numBases) { 717 | 718 | const char gs = pGapSeq[testPos]; 719 | const char ngs = pNonGapSeq[testPos]; 720 | 721 | bool isPartofHomopolymer = false; 722 | if(((gs == nonGapBase) || (gs == GAP)) && (ngs == nonGapBase)) isPartofHomopolymer = true; 723 | if(!isPartofHomopolymer) break; 724 | 725 | if(gs == GAP) numGappedBases++; 726 | else nonGapLength++; 727 | testPos++; 728 | } 729 | 730 | // fix the gap order 731 | if(numGappedBases != 0) { 732 | char* pCurrentSequence = pGapSeq + i; 733 | memset(pCurrentSequence, nonGapBase, nonGapLength); 734 | pCurrentSequence += nonGapLength; 735 | memset(pCurrentSequence, GAP, numGappedBases); 736 | } 737 | 738 | // increment 739 | i += numGappedBases + nonGapLength - 1; 740 | } 741 | } 742 | -------------------------------------------------------------------------------- /LeftAlign.cpp: 
-------------------------------------------------------------------------------- 1 | #include "LeftAlign.h" 2 | 3 | //bool debug; 4 | #define VERBOSE_DEBUG 5 | 6 | // Attempts to left-realign all the indels represented by the alignment cigar. 7 | // 8 | // This is done by shifting all indels as far left as they can go without 9 | // mismatch, then merging neighboring indels of the same class. leftAlign 10 | // updates the alignment cigar with changes, and returns true if realignment 11 | // changed the alignment cigar. 12 | // 13 | // To left-align, we move multi-base indels left by their own length as long as 14 | // the preceding bases match the inserted or deleted sequence. After this 15 | // step, we handle multi-base homopolymer indels by shifting them one base to 16 | // the left until they mismatch the reference. 17 | // 18 | // To merge neighboring indels, we iterate through the set of left-stabilized 19 | // indels. For each indel we add a new cigar element to the new cigar. If a 20 | // deletion follows a deletion, or an insertion occurs at the same place as 21 | // another insertion, we merge the events by extending the previous cigar 22 | // element. 23 | // 24 | // In practice, we must call this function until the alignment is stabilized. 
25 | // 26 | bool leftAlign(string& querySequence, string& cigar, string& baseReferenceSequence, int& offset, bool debug) { 27 | 28 | debug = false; 29 | 30 | string referenceSequence = baseReferenceSequence.substr(offset); 31 | 32 | int arsOffset = 0; // pointer to insertion point in aligned reference sequence 33 | string alignedReferenceSequence, alignedQuerySequence; 34 | if (debug) alignedReferenceSequence = referenceSequence; 35 | if (debug) alignedQuerySequence = querySequence; 36 | int aabOffset = 0; 37 | 38 | // store information about the indels 39 | vector indels; 40 | 41 | int rp = 0; // read position, 0-based relative to read 42 | int sp = 0; // sequence position 43 | 44 | string softBegin; 45 | string softEnd; 46 | 47 | string cigarbefore = cigar; 48 | 49 | vector > cigarData = splitCIGAR(cigar); 50 | for (vector >::const_iterator c = cigarData.begin(); 51 | c != cigarData.end(); ++c) { 52 | unsigned int l = c->first; 53 | string t = c->second; 54 | if (debug) cerr << l << t << " " << sp << " " << rp << endl; 55 | if (t == "M") { // match or mismatch 56 | sp += l; 57 | rp += l; 58 | } else if (t == "D") { // deletion 59 | indels.push_back(IndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l))); 60 | if (debug) { cerr << indels.back() << endl; alignedQuerySequence.insert(rp + aabOffset, string(l, '-')); } 61 | aabOffset += l; 62 | sp += l; // update reference sequence position 63 | } else if (t == "I") { // insertion 64 | indels.push_back(IndelAllele(true, l, sp, rp, querySequence.substr(rp, l))); 65 | if (debug) { cerr << indels.back() << endl; alignedReferenceSequence.insert(sp + softBegin.size() + arsOffset, string(l, '-')); } 66 | arsOffset += l; 67 | rp += l; 68 | } else if (t == "S") { // soft clip, clipped sequence present in the read not matching the reference 69 | // remove these bases from the refseq and read seq, but don't modify the alignment sequence 70 | if (rp == 0) { 71 | alignedReferenceSequence = string(l, '*') + 
alignedReferenceSequence; 72 | //indels.push_back(IndelAllele(true, l, sp, rp, querySequence.substr(rp, l))); 73 | softBegin = querySequence.substr(0, l); 74 | } else { 75 | alignedReferenceSequence = alignedReferenceSequence + string(l, '*'); 76 | //indels.push_back(IndelAllele(true, l, sp, rp, querySequence.substr(rp, l))); 77 | softEnd = querySequence.substr(querySequence.size() - l, l); 78 | } 79 | rp += l; 80 | } else if (t == "H") { // hard clip on the read, clipped sequence is not present in the read 81 | } else if (t == "N") { // skipped region in the reference not present in read, aka splice 82 | sp += l; 83 | } 84 | } 85 | 86 | 87 | if (debug) cerr << "| " << cigarbefore << endl 88 | << "| " << alignedReferenceSequence << endl 89 | << "| " << alignedQuerySequence << endl; 90 | 91 | // if no indels, return the alignment 92 | if (indels.empty()) { return false; } 93 | 94 | if (debug) { 95 | for (vector::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " "; 96 | cerr << endl; 97 | } 98 | 99 | // for each indel, from left to right 100 | // while the indel sequence repeated to the left and we're not matched up with the left-previous indel 101 | // move the indel left 102 | 103 | vector::iterator previous = indels.begin(); 104 | for (vector::iterator id = indels.begin(); id != indels.end(); ++id) { 105 | 106 | // left shift by repeats 107 | // 108 | // from 1 base to the length of the indel, attempt to shift left 109 | // if the move would cause no change in alignment optimality (no 110 | // introduction of mismatches, and by definition no change in gap 111 | // length), move to the new position. 112 | // in practice this moves the indel left when we reach the size of 113 | // the repeat unit. 
114 | // 115 | int steppos, readsteppos; 116 | IndelAllele& indel = *id; 117 | int i = 1; 118 | 119 | while (i <= indel.length) { 120 | 121 | int steppos = indel.position - i; 122 | int readsteppos = indel.readPosition - i; 123 | 124 | if (debug) { 125 | if (steppos >= 0 && readsteppos >= 0) { 126 | cerr << "refseq flank " << referenceSequence.substr(steppos, indel.length) << endl; 127 | cerr << "qryseq flank " << querySequence.substr(readsteppos, indel.length) << endl; 128 | cerr << "indelseq " << indel.sequence << endl; 129 | } 130 | } 131 | 132 | while (steppos >= 0 && readsteppos >= 0 133 | && indel.sequence == referenceSequence.substr(steppos, indel.length) 134 | && indel.sequence == querySequence.substr(readsteppos, indel.length) 135 | && (id == indels.begin() 136 | || (previous->insertion && steppos >= previous->position) 137 | || (!previous->insertion && steppos >= previous->position + previous->length))) { 138 | LEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " shifting " << i << "bp left" << endl); 139 | indel.position -= i; 140 | indel.readPosition -= i; 141 | steppos = indel.position - i; 142 | readsteppos = indel.readPosition - i; 143 | } 144 | do { 145 | ++i; 146 | } while (i <= indel.length && indel.length % i != 0); 147 | } 148 | 149 | 150 | 151 | // left shift indels with exchangeable flanking sequence 152 | // 153 | // for example: 154 | // 155 | // GTTACGTT GTTACGTT 156 | // GT-----T ----> G-----TT 157 | // 158 | // GTGTGACGTGT GTGTGACGTGT 159 | // GTGTG-----T ----> GTG-----TGT 160 | // 161 | // GTGTG-----T GTG-----TGT 162 | // GTGTGACGTGT ----> GTGTGACGTGT 163 | // 164 | // 165 | 166 | steppos = indel.position - 1; 167 | readsteppos = indel.readPosition - 1; 168 | while (steppos >= 0 && readsteppos >= 0 169 | && querySequence.at(readsteppos) == referenceSequence.at(steppos) 170 | && referenceSequence.size() > steppos + indel.length 171 | && indel.sequence.at((int) indel.sequence.size() - 1) == 
referenceSequence.at(steppos + indel.length) // are the exchanged bases going to match wrt. the reference? 172 | && querySequence.at(readsteppos) == indel.sequence.at((int) indel.sequence.size() - 1) 173 | && (id == indels.begin() 174 | || (previous->insertion && indel.position - 1 >= previous->position) 175 | || (!previous->insertion && indel.position - 1 >= previous->position + previous->length))) { 176 | if (debug) cerr << (indel.insertion ? "insertion " : "deletion ") << indel << " exchanging bases " << 1 << "bp left" << endl; 177 | indel.sequence = indel.sequence.at(indel.sequence.size() - 1) + indel.sequence.substr(0, indel.sequence.size() - 1); 178 | indel.position -= 1; 179 | indel.readPosition -= 1; 180 | if (debug) cerr << indel << endl; 181 | steppos = indel.position - 1; 182 | readsteppos = indel.readPosition - 1; 183 | //if (debug && steppos && readsteppos) cerr << querySequence.at(readsteppos) << " ==? " << referenceSequence.at(steppos) << endl; 184 | //if (debug && steppos && readsteppos) cerr << indel.sequence.at((int) indel.sequence.size() - 1) << " ==? " << referenceSequence.at(steppos + indel.length) << endl; 185 | } 186 | // tracks previous indel, so we don't run into it with the next shift 187 | previous = id; 188 | } 189 | 190 | if (debug) { 191 | for (vector::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " "; 192 | cerr << endl; 193 | } 194 | 195 | if (debug) cerr << "bring together floating indels" << endl; 196 | 197 | // bring together floating indels 198 | // from left to right 199 | // check if we could merge with the next indel 200 | // if so, adjust so that we will merge in the next step 201 | if (indels.size() > 1) { 202 | previous = indels.begin(); 203 | for (vector::iterator id = (indels.begin() + 1); id != indels.end(); ++id) { 204 | IndelAllele& indel = *id; 205 | // parsimony: could we shift right and merge with the previous indel? 206 | // if so, do it 207 | int prev_end_ref = previous->insertion ? 
previous->position : previous->position + previous->length; 208 | int prev_end_read = !previous->insertion ? previous->readPosition : previous->readPosition + previous->length; 209 | if (previous->insertion == indel.insertion 210 | && ((previous->insertion 211 | && (previous->position < indel.position 212 | && previous->readPosition < indel.readPosition)) 213 | || 214 | (!previous->insertion 215 | && (previous->position + previous->length < indel.position) 216 | && (previous->readPosition < indel.readPosition) 217 | ))) { 218 | if (previous->homopolymer()) { 219 | string seq = referenceSequence.substr(prev_end_ref, indel.position - prev_end_ref); 220 | string readseq = querySequence.substr(prev_end_read, indel.position - prev_end_ref); 221 | if (debug) cerr << "seq: " << seq << endl << "readseq: " << readseq << endl; 222 | if (previous->sequence.at(0) == seq.at(0) 223 | && homopolymer(seq) 224 | && homopolymer(readseq)) { 225 | if (debug) cerr << "moving " << *previous << " right to " 226 | << (indel.insertion ? indel.position : indel.position - previous->length) << endl; 227 | previous->position = indel.insertion ? indel.position : indel.position - previous->length; 228 | previous->readPosition = !indel.insertion ? indel.readPosition : indel.readPosition - previous->readLength(); // should this be readLength? 
229 | } 230 | } 231 | /* 232 | else { 233 | int pos = previous->position; 234 | int readpos = previous->readPosition; 235 | while (pos < (int) referenceSequence.length() && 236 | ((previous->insertion && pos + previous->length <= indel.position) 237 | || 238 | (!previous->insertion && pos + previous->length < indel.position)) 239 | && previous->sequence == referenceSequence.substr(pos + previous->length, previous->length) 240 | && previous->sequence == querySequence.substr(readpos + previous->length, previous->length) 241 | ) { 242 | pos += previous->length; 243 | readpos += previous->length; 244 | } 245 | string seq = previous->sequence; 246 | if (pos > previous->position) { 247 | // wobble bases right to left as far as we can go 248 | int steppos = previous->position + seq.size(); 249 | int readsteppos = previous->readPosition + seq.size(); 250 | 251 | while (querySequence.at(readsteppos) == referenceSequence.at(steppos) 252 | && querySequence.at(readsteppos) == seq.at(0) 253 | && (id == indels.begin() 254 | || (indel.insertion && pos + seq.size() - 1 <= indel.position) 255 | || (!previous->insertion && indel.position - 1 >= pos + previous->length))) { 256 | seq = seq.substr(1) + seq.at(0); 257 | ++pos; 258 | ++readpos; 259 | steppos = pos + 1; 260 | readsteppos = readpos + 1; 261 | } 262 | 263 | if (((previous->insertion && pos + previous->length == indel.position) 264 | || 265 | (!previous->insertion && pos == indel.position - previous->length)) 266 | ) { 267 | if (debug) cerr << "right-merging tandem repeat: moving " << *previous << " right to " << pos << endl; 268 | previous->position = pos; 269 | previous->readPosition = readpos; 270 | previous->sequence = seq; 271 | } 272 | } 273 | } 274 | */ 275 | } 276 | previous = id; 277 | } 278 | } 279 | 280 | if (debug) { 281 | for (vector::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " "; 282 | cerr << endl; 283 | } 284 | 285 | 286 | if (debug) cerr << "bring in indels at ends of read" << endl; 
287 | 288 | // try to "bring in" repeat indels at the end, for maximum parsimony 289 | // 290 | // e.g. 291 | // 292 | // AGAAAGAAAGAAAAAGAAAAAGAACCAAGAAGAAAA 293 | // AGAAAG------AAAGAAAAAGAACCAAGAAGAAAA 294 | // 295 | // has no information which distinguishes it from: 296 | // 297 | // AGAAAGAAAAAGAAAAAGAACCAAGAAGAAAA 298 | // AGAAAG--AAAGAAAAAGAACCAAGAAGAAAA 299 | // 300 | // here we take the parsimonious explanation 301 | 302 | if (!indels.empty()) { 303 | // deal with the first indel 304 | // the first deletion ... or the biggest deletion 305 | vector::iterator a = indels.begin(); 306 | vector::iterator del = indels.begin(); 307 | for (; a != indels.end(); ++a) { 308 | //if (!a->insertion && a->length > biggestDel->length) biggestDel = a; 309 | if (!a->insertion && a->length) del = a; 310 | if (!del->insertion) { 311 | //if (!indel.insertion) { // only handle deletions like this for now 312 | //if (!indel.insertion && !(indels.size() > 1 && indel.readPosition == indels.at(1).readPosition)) { // only handle deletions like this for now 313 | int insertedBpBefore = 0; 314 | int deletedBpBefore = 0; 315 | for (vector::iterator i = indels.begin(); i != del; ++i) { 316 | if (i->insertion) insertedBpBefore += i->length; 317 | else deletedBpBefore += i->length; 318 | } 319 | IndelAllele& indel = *del; 320 | int minsize = indel.length; 321 | int flankingLength = indel.readPosition; 322 | if (debug) cerr << indel << endl; 323 | string flanking = querySequence.substr(0, flankingLength); 324 | if (debug) cerr << flanking << endl; 325 | 326 | size_t p = referenceSequence.substr(0, indel.position + indel.length).rfind(flanking); 327 | if (p == string::npos) { 328 | if (debug) cerr << "flanking not found" << endl; 329 | } else { 330 | if (debug) { 331 | cerr << "flanking is at " << p << endl; 332 | cerr << "minsize would be " << (indel.position + indel.length) - ((int) p + flankingLength) << endl; 333 | } 334 | minsize = (indel.position + indel.length) - ((int) p + 
flankingLength); 335 | } 336 | 337 | if (debug) cerr << minsize << endl; 338 | 339 | if (minsize >= 0 && minsize < indel.length) { 340 | 341 | int softdiff = softBegin.size(); 342 | if (!softBegin.empty()) { // remove soft clips if we can 343 | if (flankingLength < softBegin.size()) { 344 | softBegin = softBegin.substr(0, flankingLength - softBegin.size()); 345 | softdiff -= softBegin.size(); 346 | } else { 347 | softBegin.clear(); 348 | } 349 | } 350 | 351 | // the new read position == the current read position 352 | // the new reference position == the flanking length size 353 | // the positional offset of the reference sequence == the new position of the deletion - the flanking length 354 | 355 | int diff = indel.length - minsize - softdiff + deletedBpBefore - insertedBpBefore; 356 | //int querydiff = indel.length - minsize - softBegin.size() - insertedBpBefore + deletedBpBefore; 357 | if (debug) cerr << "adjusting " << indel.length <<" " << minsize << " " << softdiff << " " << diff << endl; 358 | offset += diff; 359 | /// 360 | indel.length = minsize; 361 | indel.sequence = indel.sequence.substr(indel.sequence.size() - minsize); 362 | indel.position = flankingLength; 363 | indel.readPosition = indel.position; // if we have removed all the sequence before, this should be == 364 | referenceSequence = referenceSequence.substr(diff); 365 | 366 | for (vector::iterator i = indels.begin(); i != indels.end(); ++i) { 367 | if (i < del) { 368 | i->length = 0; // remove 369 | } else if (i > del) { 370 | i->position -= diff; 371 | } 372 | } 373 | } 374 | if (debug) cerr << indel << endl; 375 | 376 | // now, do the same for the reverse 377 | if (indel.length > 0) { 378 | int minsize = indel.length + 1; 379 | int flankingLength = querySequence.size() - indel.readPosition + indel.readLength(); 380 | string flanking = querySequence.substr(indel.readPosition + indel.readLength(), flankingLength); 381 | int indelRefEnd = indel.position + indel.referenceLength(); 382 | 383 | 
size_t p = referenceSequence.find(flanking, indel.position); 384 | if (p == string::npos) { 385 | if (debug) 386 | cerr << "flanking not found" << endl; 387 | } else { 388 | if (debug) { 389 | cerr << "flanking is at " << p << endl; 390 | cerr << "minsize would be " << (int) p - indel.position << endl; 391 | } 392 | minsize = (int) p - indel.position; 393 | } 394 | 395 | if (debug) cerr << "minsize " << minsize << endl; 396 | if (minsize >= 0 && minsize <= indel.length) { 397 | //referenceSequence = referenceSequence.substr(0, referenceSequence.size() - (indel.length - minsize)); 398 | if (debug) cerr << "adjusting " << indel << endl; 399 | indel.length = minsize; 400 | indel.sequence = indel.sequence.substr(0, minsize); 401 | //cerr << indel << endl; 402 | if (!softEnd.empty()) { // remove soft clips if we can 403 | if (flankingLength < softEnd.size()) { 404 | softEnd = softEnd.substr(flankingLength - softEnd.size()); 405 | } else { 406 | softEnd.clear(); 407 | } 408 | } 409 | for (vector::iterator i = indels.begin(); i != indels.end(); ++i) { 410 | if (i > del) { 411 | i->length = 0; // remove 412 | } 413 | } 414 | } 415 | } 416 | } 417 | } 418 | } 419 | 420 | if (debug) { 421 | for (vector::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " "; 422 | cerr << endl; 423 | } 424 | 425 | if (debug) cerr << "parsing indels" << endl; 426 | 427 | // if soft clipping can be reduced by adjusting the tailing indels in the read, do it 428 | // TODO 429 | 430 | /* 431 | int numEmptyIndels = 0; 432 | 433 | if (!indels.empty()) { 434 | vector::iterator a = indels.begin(); 435 | while (a != indels.end()) { 436 | if (debug) cerr << "parsing " << *a << endl; 437 | if (!(a->length > 0 && a->position >= 0)) { 438 | ++numEmptyIndels; 439 | } 440 | ++a; 441 | } 442 | } 443 | */ 444 | 445 | for (vector::iterator i = indels.begin(); i != indels.end(); ++i) { 446 | if (i->length == 0) continue; 447 | if (i->insertion) { 448 | if (querySequence.substr(i->readPosition, 
i->readLength()) != i->sequence) { 449 | cerr << "failure: " << *i << " should be " << querySequence.substr(i->readPosition, i->readLength()) << endl; 450 | cerr << baseReferenceSequence << endl; 451 | cerr << querySequence << endl; 452 | throw 1; 453 | } 454 | } else { 455 | if (referenceSequence.substr(i->position, i->length) != i->sequence) { 456 | cerr << "failure: " << *i << " should be " << referenceSequence.substr(i->position, i->length) << endl; 457 | cerr << baseReferenceSequence << endl; 458 | cerr << querySequence << endl; 459 | throw 1; 460 | } 461 | } 462 | } 463 | 464 | if (indels.size() > 1) { 465 | vector::iterator id = indels.begin(); 466 | while ((id + 1) != indels.end()) { 467 | if (debug) { 468 | cerr << "indels: "; 469 | for (vector::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " "; 470 | cerr << endl; 471 | //for (vector::iterator a = newIndels.begin(); a != newIndels.end(); ++a) cerr << *a << " "; 472 | //cerr << endl; 473 | } 474 | 475 | // get the indels to try to merge 476 | while (id->length == 0 && (id + 1) != indels.end()) ++id; 477 | vector::iterator idn = (id + 1); 478 | while (idn != indels.end() && idn->length == 0) ++idn; 479 | if (idn == indels.end()) break; 480 | 481 | IndelAllele& indel = *idn; 482 | IndelAllele& last = *id; 483 | if (debug) cerr << "trying " << last << " against " << indel << endl; 484 | 485 | int lastend = last.insertion ? 
last.position : (last.position + last.length); 486 | if (indel.position == lastend) { 487 | if (debug) cerr << "indel.position " << indel.position << " lastend " << lastend << endl; 488 | if (indel.insertion == last.insertion) { 489 | last.length += indel.length; 490 | last.sequence += indel.sequence; 491 | indel.length = 0; 492 | indel.sequence.clear(); 493 | id = idn; 494 | } else if (last.length && indel.length) { // if the end of the previous == the start of the current, cut it off of both the ins and the del 495 | if (debug) cerr << "Merging " << last << " " << indel << endl; 496 | int matchsize = 1; 497 | int biggestmatchsize = 0; 498 | 499 | while (matchsize <= last.sequence.size() && matchsize <= indel.sequence.size()) { 500 | if (last.sequence.substr(last.sequence.size() - matchsize) == indel.sequence.substr(0, matchsize)) { 501 | biggestmatchsize = matchsize; 502 | } 503 | ++matchsize; 504 | } 505 | if (debug) cerr << "biggestmatchsize " << biggestmatchsize << endl; 506 | 507 | last.sequence = last.sequence.substr(0, last.sequence.size() - biggestmatchsize); 508 | last.length -= biggestmatchsize; 509 | indel.sequence = indel.sequence.substr(biggestmatchsize); 510 | indel.length -= biggestmatchsize; 511 | if (indel.insertion) indel.readPosition += biggestmatchsize; 512 | else indel.position += biggestmatchsize; 513 | 514 | if (indel.length > 0) { 515 | id = idn; 516 | } 517 | } 518 | } else { 519 | if (last.insertion != indel.insertion) { 520 | if (debug) cerr << "merging by overlap " << last << " " << indel << endl; 521 | // see if we can slide the sequence in between these two indels together 522 | string lastOverlapSeq; 523 | string indelOverlapSeq; 524 | 525 | if (last.insertion) { 526 | lastOverlapSeq = 527 | last.sequence 528 | + querySequence.substr(last.readPosition + last.readLength(), 529 | indel.readPosition - (last.readPosition + last.readLength())); 530 | indelOverlapSeq = 531 | referenceSequence.substr(last.position + last.referenceLength(), 
532 | indel.position - (last.position + last.referenceLength())) 533 | + indel.sequence; 534 | } else { 535 | lastOverlapSeq = 536 | last.sequence 537 | + referenceSequence.substr(last.position + last.referenceLength(), 538 | indel.position - (last.position + last.referenceLength())); 539 | indelOverlapSeq = 540 | querySequence.substr(last.readPosition + last.readLength(), 541 | indel.readPosition - (last.readPosition + last.readLength())) 542 | + indel.sequence; 543 | } 544 | 545 | if (debug) { 546 | if (!last.insertion) { 547 | if (last.insertion) cerr << string(last.length, '-'); 548 | cerr << lastOverlapSeq; 549 | if (indel.insertion) cerr << string(indel.length, '-'); 550 | cerr << endl; 551 | if (!last.insertion) cerr << string(last.length, '-'); 552 | cerr << indelOverlapSeq; 553 | if (!indel.insertion) cerr << string(indel.length, '-'); 554 | cerr << endl; 555 | } else { 556 | if (last.insertion) cerr << string(last.length, '-'); 557 | cerr << indelOverlapSeq; 558 | if (indel.insertion) cerr << string(indel.length, '-'); 559 | cerr << endl; 560 | if (!last.insertion) cerr << string(last.length, '-'); 561 | cerr << lastOverlapSeq; 562 | if (!indel.insertion) cerr << string(indel.length, '-'); 563 | cerr << endl; 564 | } 565 | } 566 | 567 | 568 | int dist = min(last.length, indel.length); 569 | int matchingInBetween = indel.position - (last.position + last.referenceLength()); 570 | int previousMatchingInBetween = matchingInBetween; 571 | //int matchingInBetween = indel.position - last.position; 572 | if (debug) cerr << "matchingInBetween " << matchingInBetween << endl; 573 | if (debug) cerr << "dist " << dist << endl; 574 | int mindist = matchingInBetween - dist; 575 | if (lastOverlapSeq == indelOverlapSeq) { 576 | matchingInBetween = lastOverlapSeq.size(); 577 | } else { 578 | // TODO change to use string::find() 579 | for (int i = dist; i > 0; --i) { 580 | if (debug) cerr << "lastoverlap: " 581 | << lastOverlapSeq.substr(lastOverlapSeq.size() - 
previousMatchingInBetween - i) 582 | << " thisoverlap: " 583 | << indelOverlapSeq.substr(0, i + previousMatchingInBetween) << endl; 584 | if (lastOverlapSeq.substr(lastOverlapSeq.size() - previousMatchingInBetween - i) 585 | == indelOverlapSeq.substr(0, i + previousMatchingInBetween)) { 586 | matchingInBetween = previousMatchingInBetween + i; 587 | break; 588 | } 589 | } 590 | } 591 | //cerr << last << " " << indel << endl; 592 | if (matchingInBetween > 0 && matchingInBetween > previousMatchingInBetween) { 593 | if (debug) cerr << "matching " << matchingInBetween << "bp between " << last << " " << indel 594 | << " was matching " << previousMatchingInBetween << endl; 595 | int diff = matchingInBetween - previousMatchingInBetween; 596 | last.length -= diff; 597 | last.sequence = last.sequence.substr(0, last.length); 598 | indel.length -= diff; 599 | indel.sequence = indel.sequence.substr(diff); 600 | if (!indel.insertion) indel.position += diff; 601 | else indel.readPosition += diff; 602 | if (debug) cerr << last << " " << indel << endl; 603 | }// else if (matchingInBetween == 0 || matchingInBetween == indel.position - last.position) { 604 | //if (!newIndels.empty()) newIndels.pop_back(); 605 | //} //else { newIndels.push_back(indel); } 606 | id = idn; 607 | //newIndels.push_back(indel); 608 | } else { 609 | id = idn; 610 | //newIndels.push_back(indel); 611 | } 612 | } 613 | } 614 | } 615 | 616 | vector newIndels; 617 | for (vector::iterator i = indels.begin(); i != indels.end(); ++i) { 618 | if (!i->insertion && i->position == 0) { offset += i->length; 619 | } else if (i->length > 0) newIndels.push_back(*i); // remove dels at front 620 | } 621 | 622 | // for each indel 623 | // if ( we're matched up to the previous insertion (or deletion) 624 | // and it's also an insertion or deletion ) 625 | // merge the indels 626 | // 627 | // and simultaneously reconstruct the cigar 628 | // 629 | // if there are spurious deletions at the start and end of the read, remove them 
630 | // if there are spurious insertions after soft-clipped bases, make them soft clips 631 | 632 | vector > newCigar; 633 | 634 | if (!softBegin.empty()) { 635 | newCigar.push_back(make_pair(softBegin.size(), "S")); 636 | } 637 | 638 | if (newIndels.empty()) { 639 | 640 | int remainingReadBp = querySequence.size() - softEnd.size() - softBegin.size(); 641 | newCigar.push_back(make_pair(remainingReadBp, "M")); 642 | 643 | if (!softEnd.empty()) { 644 | newCigar.push_back(make_pair(softEnd.size(), "S")); 645 | } 646 | 647 | cigar = joinCIGAR(newCigar); 648 | 649 | // check if we're realigned 650 | if (cigar == cigarbefore) { 651 | return false; 652 | } else { 653 | return true; 654 | } 655 | } 656 | 657 | vector::iterator id = newIndels.begin(); 658 | vector::iterator last = id++; 659 | 660 | if (last->position > 0) { 661 | newCigar.push_back(make_pair(last->position, "M")); 662 | newCigar.push_back(make_pair(last->length, (last->insertion ? "I" : "D"))); 663 | } else if (last->position == 0) { // discard floating indels 664 | if (last->insertion) newCigar.push_back(make_pair(last->length, "S")); 665 | else newCigar.push_back(make_pair(last->length, "D")); 666 | } else { 667 | cerr << "negative indel position " << *last << endl; 668 | } 669 | 670 | int lastend = last->insertion ? 
last->position : (last->position + last->length); 671 | LEFTALIGN_DEBUG(*last << ","); 672 | 673 | for (; id != newIndels.end(); ++id) { 674 | IndelAllele& indel = *id; 675 | if (indel.length == 0) continue; // remove 0-length indels 676 | if (debug) cerr << indel << " " << *last << endl; 677 | LEFTALIGN_DEBUG(indel << ","); 678 | if ((id + 1) == newIndels.end() 679 | && (indel.insertion && indel.position == referenceSequence.size() 680 | || (!indel.insertion && indel.position + indel.length == referenceSequence.size())) 681 | ) { 682 | if (indel.insertion) { 683 | if (!newCigar.empty() && newCigar.back().second == "S") { 684 | newCigar.back().first += indel.length; 685 | } else { 686 | newCigar.push_back(make_pair(indel.length, "S")); 687 | } 688 | } 689 | } else if (indel.position < lastend) { 690 | cerr << "impossibility?: indel realigned left of another indel" << endl; 691 | return false; 692 | } else if (indel.position == lastend) { 693 | // how? 694 | if (indel.insertion == last->insertion) { 695 | pair& op = newCigar.back(); 696 | op.first += indel.length; 697 | } else { 698 | newCigar.push_back(make_pair(indel.length, (indel.insertion ? "I" : "D"))); 699 | } 700 | } else if (indel.position > lastend) { // also catches differential indels, but with the same position 701 | if (!newCigar.empty() && newCigar.back().second == "M") newCigar.back().first += indel.position - lastend; 702 | else newCigar.push_back(make_pair(indel.position - lastend, "M")); 703 | newCigar.push_back(make_pair(indel.length, (indel.insertion ? "I" : "D"))); 704 | } 705 | 706 | last = id; 707 | lastend = last->insertion ? 
last->position : (last->position + last->length); 708 | 709 | if (debug) { 710 | for (vector >::iterator c = newCigar.begin(); c != newCigar.end(); ++c) 711 | cerr << c->first << c->second; 712 | cerr << endl; 713 | } 714 | 715 | } 716 | 717 | int remainingReadBp = querySequence.size() - (last->readPosition + last->readLength()) - softEnd.size(); 718 | if (remainingReadBp > 0) { 719 | if (debug) cerr << "bp remaining = " << remainingReadBp << endl; 720 | if (newCigar.back().second == "M") newCigar.back().first += remainingReadBp; 721 | else newCigar.push_back(make_pair(remainingReadBp, "M")); 722 | } 723 | 724 | if (newCigar.back().second == "D") newCigar.pop_back(); // remove trailing deletions 725 | 726 | if (!softEnd.empty()) { 727 | if (newCigar.back().second == "S") newCigar.back().first += softEnd.size(); 728 | else newCigar.push_back(make_pair(softEnd.size(), "S")); 729 | } 730 | 731 | LEFTALIGN_DEBUG(endl); 732 | 733 | cigar = joinCIGAR(newCigar); 734 | 735 | LEFTALIGN_DEBUG(cigar << endl); 736 | 737 | // check if we're realigned 738 | if (cigar == cigarbefore) { 739 | return false; 740 | } else { 741 | return true; 742 | } 743 | 744 | } 745 | 746 | int countMismatches(string& querySequence, string& cigar, string referenceSequence) { 747 | 748 | int mismatches = 0; 749 | int sp = 0; 750 | int rp = 0; 751 | vector > cigarData = splitCIGAR(cigar); 752 | for (vector >::const_iterator c = cigarData.begin(); 753 | c != cigarData.end(); ++c) { 754 | unsigned int l = c->first; 755 | string t = c->second; 756 | if (t == "M") { // match or mismatch 757 | for (int i = 0; i < l; ++i) { 758 | if (querySequence.at(rp) != referenceSequence.at(sp)) 759 | ++mismatches; 760 | ++sp; 761 | ++rp; 762 | } 763 | } else if (t == "D") { // deletion 764 | sp += l; // update reference sequence position 765 | } else if (t == "I") { // insertion 766 | rp += l; // update read position 767 | } else if (t == "S") { // soft clip, clipped sequence present in the read not matching the 
reference 768 | rp += l; 769 | } else if (t == "H") { // hard clip on the read, clipped sequence is not present in the read 770 | } else if (t == "N") { // skipped region in the reference not present in read, aka splice 771 | sp += l; 772 | } 773 | } 774 | 775 | return mismatches; 776 | 777 | } 778 | 779 | // Iteratively left-aligns the indels in the alignment until we have a stable 780 | // realignment. Returns true on realignment success or non-realignment. 781 | // Returns false if we exceed the maximum number of realignment iterations. 782 | // 783 | bool stablyLeftAlign(string querySequence, string& cigar, string referenceSequence, int& offset, int maxiterations, bool debug) { 784 | 785 | if (!leftAlign(querySequence, cigar, referenceSequence, offset)) { 786 | 787 | LEFTALIGN_DEBUG("did not realign" << endl); 788 | return true; 789 | 790 | } else { 791 | 792 | while (leftAlign(querySequence, cigar, referenceSequence, offset) && --maxiterations > 0) { 793 | LEFTALIGN_DEBUG("realigning ..." << endl); 794 | } 795 | 796 | if (maxiterations <= 0) { 797 | return false; 798 | } else { 799 | return true; 800 | } 801 | } 802 | } 803 | 804 | string mergeCIGAR(const string& c1, const string& c2) { 805 | vector > cigar1 = splitCIGAR(c1); 806 | vector > cigar2 = splitCIGAR(c2); 807 | // check if the middle elements are the same 808 | if (cigar1.back().second == cigar2.front().second) { 809 | cigar1.back().first += cigar2.front().first; 810 | cigar2.erase(cigar2.begin()); 811 | } 812 | for (vector >::iterator c = cigar2.begin(); c != cigar2.end(); ++c) { 813 | cigar1.push_back(*c); 814 | } 815 | return joinCIGAR(cigar1); 816 | } 817 | 818 | vector > splitCIGAR(const string& cigarStr) { 819 | vector > cigar; 820 | string number; 821 | string type; 822 | // strings go [Number][Type] ... 
823 | for (string::const_iterator s = cigarStr.begin(); s != cigarStr.end(); ++s) { 824 | char c = *s; 825 | if (isdigit(c)) { 826 | if (type.empty()) { 827 | number += c; 828 | } else { 829 | // signal for next token, push back the last pair, clean up 830 | cigar.push_back(make_pair(atoi(number.c_str()), type)); 831 | number.clear(); 832 | type.clear(); 833 | number += c; 834 | } 835 | } else { 836 | type += c; 837 | } 838 | } 839 | if (!number.empty() && !type.empty()) { 840 | cigar.push_back(make_pair(atoi(number.c_str()), type)); 841 | } 842 | return cigar; 843 | } 844 | 845 | string joinCIGAR(const vector >& cigar) { 846 | string cigarStr; 847 | for (vector >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) { 848 | if (c->first) { 849 | cigarStr += convert(c->first) + c->second; 850 | } 851 | } 852 | return cigarStr; 853 | } 854 | --------------------------------------------------------------------------------