├── .gitignore ├── CMakeLists.txt ├── src ├── mxml-utils │ ├── CMakeLists.txt │ ├── MXMLUtils.h │ └── MXMLUtils.C ├── mxml │ ├── CMakeLists.txt │ ├── ANNOUNCEMENT │ ├── mxml-private.h │ ├── config.h │ ├── README │ ├── mxml-search.c │ ├── mxml-private.c │ ├── mxml-attr.c │ └── mxml-set.c ├── libseq │ ├── CMakeLists.txt │ ├── Codon.h │ ├── ParseException.h │ ├── AlignmentAlgorithm.h │ ├── NeedlemanWunsh.h │ ├── CodingSequence.cpp │ ├── CodingSequence.h │ ├── AASequence.cpp │ ├── AminoAcid.cpp │ ├── CodonAlign.h │ ├── NTSequence.h │ ├── AASequence.h │ ├── AlignmentAlgorithm.cpp │ ├── AminoAcid.h │ ├── Nucleotide.h │ ├── NeedlemanWunsh.cpp │ ├── NTSequence.cpp │ ├── Nucleotide.cpp │ ├── CodonAlign.cpp │ └── Codon.cpp ├── CLIUtils.h ├── CMakeLists.txt ├── Utils.h ├── CLIUtils.cpp ├── ResultsExporter.h ├── ReferenceSequence.h ├── Utils.cpp ├── Alignment.h ├── ReferenceSequence.cpp ├── Virulign.cpp └── Alignment.cpp ├── references ├── HCV │ ├── README │ └── HCV2-FN666429.xml ├── SARS-CoV-2 │ ├── ORF10.xml │ ├── ORF7b.xml │ ├── ORF6.xml │ ├── E.xml │ ├── ORF7a.xml │ ├── ORF8.xml │ ├── M.xml │ ├── ORF3a.xml │ ├── N.xml │ └── S.xml ├── HIV │ ├── HIV-HXB2-gag.xml │ ├── HIV-HXB2-env.xml │ └── HIV-HXB2-pol.xml ├── CHIKV │ ├── CHIKV-NC004162-gp2.xml │ └── CHIKV-NC004162-gp1.xml └── DENV │ └── DENV4-NC002640.xml ├── BUILD.txt └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | **/.DS_Store 2 | build 3 | install 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # cmake_minimum_required() must precede project() so that policy defaults are established before the project is declared. 2 | CMAKE_MINIMUM_REQUIRED(VERSION 2.6) 3 | PROJECT(VIRULIGN) 4 | 5 | # All libraries and the virulign executable are defined under src/. 6 | ADD_SUBDIRECTORY(src) 7 | -------------------------------------------------------------------------------- /src/mxml-utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | SET(SOURCES 2 | MXMLUtils.C 3 | ) 4 | 5 | 
INCLUDE_DIRECTORIES( 6 | ${CMAKE_CURRENT_SOURCE_DIR}/..) 7 | 8 | ADD_LIBRARY(mxml-utils ${SOURCES}) 9 | 10 | TARGET_LINK_LIBRARIES(mxml-utils mxml) 11 | -------------------------------------------------------------------------------- /src/mxml/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | SET(SOURCES 2 | mxml-attr.c 3 | mxml-entity.c 4 | mxml-file.c 5 | mxml-index.c 6 | mxml-node.c 7 | mxml-private.c 8 | mxml-search.c 9 | mxml-set.c 10 | mxml-get.c 11 | mxml-string.c 12 | ) 13 | 14 | ADD_LIBRARY(mxml ${SOURCES}) 15 | -------------------------------------------------------------------------------- /references/HCV/README: -------------------------------------------------------------------------------- 1 | To get FN666429: 2 | efetch -db sequences -id FN666429 -format gpc > FN666429.insd.xml 3 | manually remove the outer XML tag (Set) 4 | python2 ~/projects/virulign-tools/annotations/genbank_to_virulign.py ../FN666429.insd.xml "HCV2" 314 9398 > HCV2-FN666429.xml 5 | -------------------------------------------------------------------------------- /src/libseq/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | SET(SOURCES 2 | AASequence.cpp 3 | AlignmentAlgorithm.cpp 4 | AminoAcid.cpp 5 | CodingSequence.cpp 6 | Codon.cpp 7 | CodonAlign.cpp 8 | NTSequence.cpp 9 | NeedlemanWunsh.cpp 10 | Nucleotide.cpp 11 | ) 12 | 13 | ADD_LIBRARY(seq ${SOURCES}) 14 | -------------------------------------------------------------------------------- /src/CLIUtils.h: -------------------------------------------------------------------------------- 1 | // This may look like C code, but it's really -*- C++ -*- 2 | #ifndef CLI_UTILS_H_ 3 | #define CLI_UTILS_H_ 4 | 5 | #include 6 | 7 | ReferenceSequence loadRefSeqFromFile(const char* refSeqFileName); 8 | bool equalsS(char* str1, char* str2); 9 | bool equalsString(std::string str1, std::string str2); 10 | 11 | #endif // CLI_UTILS_H_ 12 | 
-------------------------------------------------------------------------------- /references/SARS-CoV-2/ORF10.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /references/SARS-CoV-2/ORF7b.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /references/SARS-CoV-2/ORF6.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | SUBDIRS(libseq) 2 | SUBDIRS(mxml) 3 | SUBDIRS(mxml-utils) 4 | 5 | SET(LIB_SOURCES 6 | Alignment.cpp 7 | CLIUtils.cpp 8 | Utils.cpp 9 | ReferenceSequence.cpp 10 | ResultsExporter.cpp 11 | ) 12 | 13 | include_directories(libseq mxml mxml-utils) 14 | 15 | ADD_LIBRARY(virulignlib ${LIB_SOURCES}) 16 | ADD_EXECUTABLE(virulign Virulign.cpp) 17 | TARGET_LINK_LIBRARIES(virulign virulignlib seq mxml mxml-utils) 18 | 19 | install(TARGETS virulign DESTINATION bin) 20 | -------------------------------------------------------------------------------- /references/SARS-CoV-2/E.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/mxml-utils/MXMLUtils.h: -------------------------------------------------------------------------------- 1 | #ifndef MXML_UTILS 2 | #define MXML_UTILS 3 | 4 | #include 5 | #include 6 | 7 | typedef struct mxml_node_s mxml_node_t; 8 | 9 | mxml_node_t *singleChildElement(mxml_node_t *element, 10 | const std::string& tagName); 11 | 12 | bool attributeValue(mxml_node_t *element, const 
std::string& attributeName, 13 | std::string& result); 14 | 15 | std::vector 16 | childElements(mxml_node_t *element, const std::string& tagName); 17 | 18 | #endif //MXML_UTILS 19 | -------------------------------------------------------------------------------- /references/SARS-CoV-2/ORF7a.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /references/SARS-CoV-2/ORF8.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/mxml/ANNOUNCEMENT: -------------------------------------------------------------------------------- 1 | Mini-XML 2.7 is now available for download from: 2 | 3 | http://www.minixml.org/software.php 4 | 5 | Mini-XML 2.7 fixes some minor platform and XML issues. Changes include: 6 | 7 | - Updated the source headers to reference the Mini-XML license and its 8 | exceptions to the LGPL2 (STR #108) 9 | - The shared library did not include a destructor for the thread- 10 | specific data key on UNIX-based operating systems (STR #103) 11 | - mxmlLoad* did not error out on XML with multiple root nodes (STR #101) 12 | - Fixed an issue with the _mxml_vstrdupf function (STR #107) 13 | - mxmlSave* no longer write all siblings of the passed node, just that 14 | node and its children (STR #109) 15 | -------------------------------------------------------------------------------- /BUILD.txt: -------------------------------------------------------------------------------- 1 | virulign: fast codon-correct alignment for virus pathogens 2 | ------------------------------------------------------------ 3 | 4 | Requirements 5 | ------------ 6 | * We use CMake (cmake.org) for the build process, and tested this on GNU/Linux, MacOS and Windows (Visual Studio C++ Express). 7 | * C++ environment. 
8 | 9 | Build instructions 10 | ------------------ 11 | Create a temporary build directory (for example within the project root). 12 | To install virulign to the operating system's default location, use: 13 | $ mkdir build 14 | $ cd build 15 | $ cmake ../ -DCMAKE_BUILD_TYPE=Release 16 | $ make 17 | 18 | To install virulign to a custom location, use the CMAKE_INSTALL_PREFIX variable: 19 | $ mkdir build 20 | $ cd build 21 | $ cmake ../ -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/soft/virulign/ 22 | $ make 23 | 24 | To install 25 | ---------- 26 | $ make install 27 | -------------------------------------------------------------------------------- /src/Utils.h: -------------------------------------------------------------------------------- 1 | // This may look like C code, but it's really -*- C++ -*- 2 | #ifndef UTILS_H_ 3 | #define UTILS_H_ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | template 11 | T lexical_cast(const std::string& s) 12 | { 13 | std::stringstream ss(s); 14 | 15 | T result; 16 | if ((ss >> result).fail() || !(ss >> std::ws).eof()) 17 | { 18 | throw std::bad_cast(); 19 | } 20 | 21 | return result; 22 | } 23 | 24 | template 25 | std::string to_string(const T& t) 26 | { 27 | std::stringstream ss; 28 | ss << t; 29 | return ss.str(); 30 | } 31 | 32 | std::string to_upper_copy(const std::string& s); 33 | 34 | bool ends_with(const std::string& s, const std::string& p); 35 | 36 | long long current_time_ms(); 37 | 38 | std::string format_time(const long long& milliseconds); 39 | 40 | #endif // UTILS_H_ 41 | -------------------------------------------------------------------------------- /references/SARS-CoV-2/M.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/libseq/Codon.h: -------------------------------------------------------------------------------- 1 | // This may look like C code, 
but it's really -*- C++ -*- 2 | #ifndef CODON_H_ 3 | #define CODON_H_ 4 | 5 | #include 6 | #include 7 | 8 | #include "NTSequence.h" 9 | #include "AminoAcid.h" 10 | 11 | namespace seq { 12 | 13 | /** 14 | * Utility class that defines the genetic code. 15 | */ 16 | class Codon 17 | { 18 | public: 19 | /** 20 | * Translate a nucleotide triplet (given by the range starting and 21 | * the indicated start point in a NTSequence) into an AminoAcid. 22 | * 23 | * If the triplet is three gaps, then the result is AminoAcid::GAP. 24 | * If the triplet contains ambiguity codes or gaps, then the result 25 | * is AminoAcid::X. Otherwise, the result is the translated amino 26 | * acid. 27 | */ 28 | static AminoAcid translate(const NTSequence::const_iterator triplet); 29 | 30 | static std::set 31 | translateAll(const NTSequence::const_iterator triplet); 32 | 33 | static std::set codonsFor(AminoAcid a); 34 | }; 35 | 36 | }; 37 | 38 | #endif // CODON_H_ 39 | -------------------------------------------------------------------------------- /references/SARS-CoV-2/ORF3a.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **! See [here](https://github.com/rega-cev/virulign/tree/master/references/SARS-CoV-2) for xml files for alignment and annotation of SARS-CoV-2 genomes** 2 | 3 | 4 | 5 | ### Summary 6 | VIRULIGN is a tool for codon-correct pairwise alignments, with an augmented functionality to annotate the alignment according to the positions of the proteins. We present a [tutorial](https://github.com/rega-cev/virulign-tutorial) demonstrating VIRULIGN's functionality for three different pathogens (i.e., HIV-1, Dengue virus, Zika virus). 7 | 8 | Please download releases for GNU/Linux, MacOS and Windows [here](https://github.com/rega-cev/virulign/releases). 
To install the download, just put the virulign binary in a directory that is in your PATH environment. To build VIRULIGN from source have a look at the [BUILD.txt file](BUILD.txt). 9 | 10 | 11 | 12 | ### Manuscript 13 | For a detailed description of VIRULIGN and how to cite, please use: 14 | 15 | Libin PJK, Deforche K, Abecasis AB and Theys K., (2018), VIRULIGN: fast codon-correct alignment and annotation of viral genomes, Bioinformatics, bty851, https://doi.org/10.1093/bioinformatics/bty851 16 | 17 | -------------------------------------------------------------------------------- /src/mxml/mxml-private.h: -------------------------------------------------------------------------------- 1 | /* 2 | * "$Id: mxml-private.h 408 2010-09-19 05:26:46Z mike $" 3 | * 4 | * Private definitions for Mini-XML, a small XML-like file parsing library. 5 | * 6 | * Copyright 2003-2010 by Michael R Sweet. 7 | * 8 | * These coded instructions, statements, and computer programs are the 9 | * property of Michael R Sweet and are protected by Federal copyright 10 | * law. Distribution and use rights are outlined in the file "COPYING" 11 | * which should have been included with this file. If this file is 12 | * missing or damaged, see the license at: 13 | * 14 | * http://www.minixml.org/ 15 | */ 16 | 17 | /* 18 | * Include necessary headers... 19 | */ 20 | 21 | #include "config.h" 22 | #include "mxml.h" 23 | 24 | 25 | /* 26 | * Global, per-thread data... 27 | */ 28 | 29 | typedef struct _mxml_global_s 30 | { 31 | void (*error_cb)(const char *); 32 | int num_entity_cbs; 33 | int (*entity_cbs[100])(const char *name); 34 | int wrap; 35 | mxml_custom_load_cb_t custom_load_cb; 36 | mxml_custom_save_cb_t custom_save_cb; 37 | } _mxml_global_t; 38 | 39 | 40 | /* 41 | * Functions... 42 | */ 43 | 44 | extern _mxml_global_t *_mxml_global(void); 45 | extern int _mxml_entity_cb(const char *name); 46 | 47 | 48 | /* 49 | * End of "$Id: mxml-private.h 408 2010-09-19 05:26:46Z mike $". 
50 | */ 51 | -------------------------------------------------------------------------------- /src/CLIUtils.cpp: -------------------------------------------------------------------------------- 1 | // This may look like C code, but it's really -*- C++ -*- 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "ReferenceSequence.h" 8 | #include "CLIUtils.h" 9 | 10 | ReferenceSequence loadRefSeqFromFile(const char* refSeqFileName) { 11 | std::ifstream f_ref(refSeqFileName); 12 | if (!f_ref) { 13 | throw std::runtime_error(std::string("Could not open ") + refSeqFileName); 14 | } 15 | 16 | ReferenceSequence* ref; 17 | 18 | try { 19 | seq::NTSequence refNt; 20 | f_ref >> refNt; 21 | 22 | if (!f_ref) { 23 | throw std::runtime_error(std::string("RefSeq loading:: File ") + refSeqFileName + " does not contain a FASTA sequence ?"); 24 | } 25 | 26 | ref = new ReferenceSequence(refNt); 27 | ref->addRegion(ReferenceSequence::Region(0, refNt.size()/3, "P")); 28 | 29 | f_ref >> refNt; 30 | 31 | if (f_ref) { 32 | throw std::runtime_error(std::string("RefSeq loading:: File ") + refSeqFileName + " contains multiple sequences ?"); 33 | } 34 | return *ref; 35 | } catch (seq::ParseException& e) { 36 | throw std::runtime_error(std::string("RefSeq loading:: Fatal error: ") + e.message()); 37 | } 38 | } 39 | 40 | bool equalsS(char* str1, char* str2) { 41 | return strcmp(str1, str2) == 0; 42 | } 43 | 44 | bool equalsString(std::string str1, std::string str2){ 45 | return str1.compare(str2) == 0; 46 | } 47 | -------------------------------------------------------------------------------- /src/ResultsExporter.h: -------------------------------------------------------------------------------- 1 | // This may look like C code, but it's really -*- C++ -*- 2 | #ifndef RESULTS_EXPORTER_H_ 3 | #define RESULTS_EXPORTER_H_ 4 | 5 | #include 6 | #include 7 | 8 | class Alignment; 9 | 10 | enum ExportKind { Mutations, PairwiseAlignments, GlobalAlignment, 11 | PositionTable, MutationTable }; 12 | 
enum ExportAlphabet { Nucleotides, AminoAcids }; 13 | 14 | class ResultsExporter 15 | { 16 | public: 17 | ResultsExporter(const std::vector& results, ExportKind kind, 18 | ExportAlphabet alphabet, bool withInsertions = false); 19 | 20 | ExportKind kind() const { return kind_; } 21 | ExportAlphabet alphabet() const { return alphabet_; } 22 | 23 | void streamData(std::ostream& stream); 24 | void streamConsensusSequence(std::ostream& stream); 25 | 26 | private: 27 | const std::vector& results_; 28 | const ExportKind kind_; 29 | const ExportAlphabet alphabet_; 30 | const bool withInsertions_; 31 | 32 | void streamMutationsCsv(std::ostream& stream); 33 | void streamPairwiseAlignments(std::ostream& stream); 34 | void streamPositionTable(std::ostream& stream); 35 | void streamMutationTable(std::ostream& stream); 36 | 37 | void computeGlobalAlignment(seq::NTSequence& globalRef, 38 | std::vector& globalAlignment); 39 | void streamGlobalAlignment(std::ostream& stream); 40 | }; 41 | 42 | #endif // RESULTS_EXPORTER_H_ 43 | -------------------------------------------------------------------------------- /references/SARS-CoV-2/N.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/libseq/ParseException.h: -------------------------------------------------------------------------------- 1 | // This may look like C code, but it's really -*- C++ -*- 2 | #ifndef PARSE_EXCEPTION_H_ 3 | #define PARSE_EXCEPTION_H_ 4 | 5 | #include 6 | 7 | namespace seq { 8 | 9 | /** 10 | * Exception thrown when an error was encountered while parsing the 11 | * string representation of an nucleotide, nucleotide sequence, amino 12 | * acid, amino acid sequence, or a FASTA file. 
13 | * 14 | * \sa Nucleotide::Nucleotide(char), AminoAcid::AminoAcid(char), 15 | * NTSequence::NTSequence(const std::string, const std::string, const 16 | * std::string, bool), AASequence::AASequence(const std::string, const 17 | * std::string, const std::string), operator>> (std::istream&, 18 | * NTSequence&), operator>> (std::istream&, AASequence&) 19 | */ 20 | class ParseException 21 | { 22 | public: 23 | ParseException(const std::string name, 24 | const std::string message, bool recovered) 25 | : name_(name), message_(message), recovered_(recovered) { } 26 | 27 | /** 28 | * The sequence name. 29 | */ 30 | std::string name() const { return name_; } 31 | 32 | /** 33 | * The message describing the error. 34 | */ 35 | std::string message() const { return message_; } 36 | 37 | /** 38 | * Whether the parser attempted to recover and you could try parsing 39 | * the next sequence. 40 | */ 41 | bool recovered() const { return recovered_; } 42 | 43 | private: 44 | std::string name_; 45 | std::string message_; 46 | bool recovered_; 47 | }; 48 | 49 | }; 50 | 51 | #endif // PARSE_EXCEPTION_H_ 52 | -------------------------------------------------------------------------------- /src/mxml-utils/MXMLUtils.C: -------------------------------------------------------------------------------- 1 | #include "MXMLUtils.h" 2 | #include "mxml/mxml.h" 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | mxml_node_t *singleChildElement(mxml_node_t *element, 9 | const std::string& tagName) 10 | { 11 | mxml_node_t *result = mxmlFindElement(element, element, tagName.c_str(), 12 | 0, 0, MXML_DESCEND); 13 | 14 | if (result) { 15 | mxml_node_t *next = mxmlFindElement(result, element, tagName.c_str(), 16 | 0, 0, MXML_NO_DESCEND); 17 | if (next) { 18 | throw std::runtime_error(std::string("Expected only one child <") 19 | + tagName 20 | + "> in <" + element->value.element.name + ">"); 21 | } 22 | } 23 | 24 | if (result && result->type != MXML_ELEMENT) 25 | throw std::runtime_error("Expected an XML 
DOM element"); 26 | 27 | return result; 28 | } 29 | 30 | bool attributeValue(mxml_node_t *element, const std::string& attributeName, 31 | std::string& result) 32 | { 33 | const char *r = mxmlElementGetAttr(element, attributeName.c_str()); 34 | 35 | if (r) { 36 | result = r; 37 | 38 | return true; 39 | } else 40 | return false; 41 | } 42 | 43 | std::vector 44 | childElements(mxml_node_t *element, const std::string& tagName) 45 | { 46 | std::vector result; 47 | 48 | mxml_node_t *r = mxmlFindElement(element, element, tagName.c_str(), 49 | 0, 0, MXML_DESCEND); 50 | while (r) { 51 | result.push_back(r); 52 | r = mxmlFindElement(r, element, tagName.c_str(), 0, 0, MXML_NO_DESCEND); 53 | } 54 | 55 | return result; 56 | } 57 | -------------------------------------------------------------------------------- /src/ReferenceSequence.h: -------------------------------------------------------------------------------- 1 | // This may look like C code, but it's really -*- C++ -*- 2 | #ifndef REFERENCE_SEQUENCE_H_ 3 | #define REFERENCE_SEQUENCE_H_ 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | class ReferenceSequence : public seq::NTSequence 11 | { 12 | public: 13 | class Region { 14 | public: 15 | 16 | Region(int begin, int end, std::string prefix) 17 | : begin_(begin), 18 | end_(end), 19 | prefix_(prefix) 20 | { } 21 | 22 | int begin() const { return begin_; } // AA position [0 -- N[ 23 | int end() const { return end_; } // AA position [0 -- N[ 24 | std::string prefix() const { return prefix_; } 25 | 26 | // aligned positions of begin, end 27 | int alignedBegin, alignedEnd; // AA position [0 -- N[ 28 | // reference position of first/last non-gap in target within region 29 | int targetBegin, targetEnd; // AA position [0 -- N[ 30 | 31 | private: 32 | int begin_, end_; 33 | std::string prefix_; 34 | 35 | friend class ReferenceSequence; 36 | }; 37 | 38 | ReferenceSequence(const seq::NTSequence& seq); 39 | 40 | const std::vector& regions() const { return regions_; } 41 | 
std::vector& regions() { return regions_; } 42 | void addRegion(Region r) { regions_.push_back(r); } 43 | 44 | static std::map > 45 | parseProteinReferences(std::string genomesXmlFile); 46 | static ReferenceSequence 47 | parseOrfReferenceFile(const std::string& fileName); 48 | 49 | private: 50 | std::vector regions_; 51 | }; 52 | 53 | #endif // REFERENCE_SEQUENCE_H_ 54 | -------------------------------------------------------------------------------- /src/Utils.cpp: -------------------------------------------------------------------------------- 1 | #include "Utils.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #ifdef _WIN32 8 | #include 9 | #else 10 | #include 11 | #endif 12 | 13 | std::string to_upper_copy(const std::string& s) 14 | { 15 | std::string copy = s; 16 | std::transform(copy.begin(), copy.end(), copy.begin(), ::toupper); 17 | return copy; 18 | } 19 | 20 | bool ends_with(const std::string& s, const std::string& p) 21 | { 22 | if (p.size() > s.size()) 23 | return false; 24 | else { 25 | for (unsigned i = 0; i < p.size(); ++i) { 26 | if (p[i] != s[s.size() - p.size() + i]) 27 | return false; 28 | } 29 | return true; 30 | } 31 | } 32 | // Returns a timestamp in milliseconds: from the performance counter (or GetTickCount() as fallback) on Windows, from gettimeofday() elsewhere. 33 | long long current_time_ms() 34 | { 35 | #ifdef _WIN32 36 | static LARGE_INTEGER s_frequency; 37 | static BOOL s_use_qpc = QueryPerformanceFrequency(&s_frequency); 38 | //if there is no high resolution time stamp, use GetTickCount() 39 | if (s_use_qpc) { 40 | LARGE_INTEGER now; 41 | QueryPerformanceCounter(&now); 42 | return (1000LL * now.QuadPart) / s_frequency.QuadPart; 43 | } else { 44 | return GetTickCount(); 45 | } 46 | #else 47 | struct timeval tv; 48 | gettimeofday(&tv, NULL); 49 | return static_cast<long long>(tv.tv_sec) * 1000LL + tv.tv_usec / 1000; // widen before multiplying: a 32-bit tv_sec would overflow 50 | #endif 51 | } 52 | 53 | std::string format_time(const long long& milliseconds) 54 | { 55 | int seconds = (int) (milliseconds / 1000) % 60 ; 56 | int minutes = (int) ((milliseconds / (1000*60)) % 60); 57 | int hours = (int) ((milliseconds / (1000*60*60)) % 24); 58 | 59 | 
std::stringstream ss; 60 | if (hours != 0) 61 | ss << hours << "h"; 62 | if (minutes != 0 || hours != 0) 63 | ss << minutes << "m"; 64 | ss << seconds << "s"; 65 | 66 | return ss.str(); 67 | } 68 | -------------------------------------------------------------------------------- /src/libseq/AlignmentAlgorithm.h: -------------------------------------------------------------------------------- 1 | // This may look like C code, but it's really -*- C++ -*- 2 | #ifndef ALIGNMENT_ALGORITHM_H_ 3 | #define ALIGNMENT_ALGORITHM_H_ 4 | 5 | #include 6 | #include 7 | 8 | /** 9 | * libseq namespace 10 | */ 11 | namespace seq { 12 | 13 | class AlignmentAlgorithm { 14 | public: 15 | /** 16 | * Pair-wise align two nucleotide sequences. 17 | * 18 | * The two sequences seq1 and seq2 are aligned in-place: gaps are inserted 19 | * according to a global alignment, and they will have equal length. 20 | */ 21 | virtual double align(NTSequence& seq1, NTSequence& seq2) = 0; 22 | 23 | /** 24 | * Pair-wise align two amino acid sequences. 25 | * 26 | * The two sequences seq1 and seq2 are aligned in-place: gaps are inserted 27 | * according to a global alignment, and they will have equal length. 28 | */ 29 | virtual double align(AASequence& seq1, AASequence& seq2) = 0; 30 | 31 | virtual double computeAlignScore(const NTSequence& seq1, 32 | const NTSequence& seq2) = 0; 33 | 34 | /** 35 | * Similarity weights matrix for nucleotides. 36 | * 37 | * Compares also IUB ambiuguity codes, and is the matrix used by BLAST. 38 | * 39 | * Taken from: ftp://ftp.ncbi.nih.gov/blast/matrices/NUC.4.4 40 | */ 41 | static double** IUB(); 42 | 43 | /** 44 | * Similarity weights matrix for amino acids. 45 | * 46 | * This is from the famous BLOSUM series of weight matrices, the one 47 | * that is the default use by ClustalX. 
48 | * 49 | * From: ftp://ftp.ncbi.nih.gov/blast/matrices/BLOSUM30 50 | */ 51 | static double** BLOSUM30(); 52 | }; 53 | 54 | } 55 | 56 | #endif // ALIGNMENT_ALGORITHM_H_ 57 | -------------------------------------------------------------------------------- /src/Alignment.h: -------------------------------------------------------------------------------- 1 | // This may look like C code, but it's really -*- C++ -*- 2 | #ifndef ALIGNMENT_H_ 3 | #define ALIGNMENT_H_ 4 | 5 | #include "ReferenceSequence.h" 6 | #include 7 | 8 | class IsolateMutation; 9 | 10 | class Alignment 11 | { 12 | public: 13 | bool success; 14 | bool tooShort; 15 | bool failure; 16 | int correctedFrameshifts; 17 | double score; 18 | 19 | ReferenceSequence ref; 20 | seq::NTSequence target; 21 | 22 | std::string 23 | mutations(const ReferenceSequence::Region& region) const; 24 | std::string 25 | codonMutations(const ReferenceSequence::Region& region, 26 | int& start, 27 | int& end) const; 28 | void isolateMutations(const ReferenceSequence::Region& regioni, std::vector& mutations) const; 29 | 30 | /*! \brief Return the amino acid position of the given mutation, if there 31 | * is information on that mutation in the alignment 32 | * 33 | * The first result (bool) indicates if the target sequence contains 34 | * the mutation. 35 | * 36 | * The second result is the amino acid position. If the mutation is 37 | * an insertion which is not contained in the sequence, this value is 38 | * -1. 
39 | */ 40 | std::pair findAminoAcid(const ReferenceSequence::Region& region, 41 | int positionInRegion, int insertion) 42 | const; 43 | 44 | static Alignment compute(const ReferenceSequence& ref, 45 | const seq::NTSequence& target, 46 | seq::AlignmentAlgorithm* algorithm, 47 | int maxFrameShifts = 5); 48 | 49 | static Alignment given(const ReferenceSequence& ref, 50 | const seq::NTSequence& target); 51 | 52 | void revert(const IsolateMutation& mutation); 53 | 54 | private: 55 | Alignment(const ReferenceSequence& aref, 56 | const seq::NTSequence& atarget); 57 | 58 | void computeAlignedRanges(int referenceSequenceLength); 59 | int alignedPos(int refPos) const; 60 | int firstPos(int begin, int end) const; 61 | int lastPos(int begin, int end) const; 62 | }; 63 | 64 | #endif // ALIGNMENT_H_ 65 | -------------------------------------------------------------------------------- /references/HIV/HIV-HXB2-gag.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/mxml/config.h: -------------------------------------------------------------------------------- 1 | /* config.h. Generated by configure. */ 2 | /* 3 | * "$Id: config.h,v 1.2 2007/10/11 08:22:37 wimpie Exp $" 4 | * 5 | * Configuration file for Mini-XML, a small XML-like file parsing library. 6 | * 7 | * Copyright 2003-2007 by Michael Sweet. 8 | * 9 | * This program is free software; you can redistribute it and/or 10 | * modify it under the terms of the GNU Library General Public 11 | * License as published by the Free Software Foundation; either 12 | * version 2, or (at your option) any later version. 13 | * 14 | * This program is distributed in the hope that it will be useful, 15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 17 | * GNU General Public License for more details. 18 | */ 19 | 20 | /* 21 | * Include necessary headers... 22 | */ 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | 31 | /* 32 | * Version number... 33 | */ 34 | 35 | #define MXML_VERSION "Mini-XML v2.3" 36 | 37 | 38 | /* 39 | * Do we have the snprintf() and vsnprintf() functions? 40 | */ 41 | 42 | #define HAVE_SNPRINTF 1 43 | #define HAVE_VSNPRINTF 1 44 | #ifdef WIN32 45 | #define snprintf _snprintf 46 | #endif 47 | 48 | 49 | /* 50 | * Do we have the strXXX() functions? 51 | */ 52 | 53 | #define HAVE_STRDUP 1 54 | 55 | 56 | /* 57 | * Define prototypes for string functions as needed... 58 | */ 59 | 60 | # ifndef HAVE_STRDUP 61 | extern char *_mxml_strdup(const char *); 62 | # define strdup _mxml_strdup 63 | # endif /* !HAVE_STRDUP */ 64 | 65 | extern char *_mxml_strdupf(const char *, ...); 66 | extern char *_mxml_vstrdupf(const char *, va_list); 67 | 68 | # ifndef HAVE_SNPRINTF 69 | extern int _mxml_snprintf(char *, size_t, const char *, ...); 70 | # define snprintf _mxml_snprintf 71 | # endif /* !HAVE_SNPRINTF */ 72 | 73 | # ifndef HAVE_VSNPRINTF 74 | extern int _mxml_vsnprintf(char *, size_t, const char *, va_list); 75 | # define vsnprintf _mxml_vsnprintf 76 | # endif /* !HAVE_VSNPRINTF */ 77 | 78 | /* 79 | * End of "$Id: config.h,v 1.2 2007/10/11 08:22:37 wimpie Exp $". 
80 | */ 81 | -------------------------------------------------------------------------------- /src/libseq/NeedlemanWunsh.h: -------------------------------------------------------------------------------- 1 | // This may look like C code, but it's really -*- C++ -*- 2 | #ifndef NEEDLEMAN_WUNSH_H_ 3 | #define NEEDLEMAN_WUNSH_H_ 4 | 5 | #include 6 | 7 | /** 8 | * libseq namespace 9 | */ 10 | namespace seq { 11 | 12 | class NeedlemanWunsh : public AlignmentAlgorithm 13 | { 14 | public: 15 | NeedlemanWunsh(double gapOpenScore = -10, 16 | double gapExtensionScore = -3.3, 17 | double **ntWeightMatrix = 18 | AlignmentAlgorithm::IUB(), 19 | double **aaWeightMatrix = 20 | AlignmentAlgorithm::BLOSUM30()); 21 | /** 22 | * Pair-wise align two nucleotide sequences, using a modified 23 | * NeedleMan-Wunsh algorithm. 24 | * 25 | * The two sequences seq1 and seq2 are aligned in-place: gaps are inserted 26 | * according to a global alignment, and they will have equal length. 27 | * 28 | * The algorithm is NeedleMan-Wunsh, with two popular modifications: 29 | * - there is a different cost for opening a gap or for extending a gap. 30 | * - there is no gap open cost for a gap at the beginning or the end. 31 | */ 32 | virtual double align(NTSequence& seq1, NTSequence& seq2); 33 | 34 | /** 35 | * Pair-wise align two amino acid sequences, using a modified 36 | * NeedleMan-Wunsh algorithm. 37 | * 38 | * The two sequences seq1 and seq2 are aligned in-place: gaps are inserted 39 | * according to a global alignment, and they will have equal length. 40 | * 41 | * The algorithm is NeedleMan-Wunsh, with two popular modifications: 42 | * - there is a different cost for opening a gap or for extending a gap. 43 | * - there is no gap open cost for a gap at the beginning or the end. 
44 | */ 45 | virtual double align(AASequence& seq1, AASequence& seq2); 46 | 47 | virtual double computeAlignScore(const NTSequence& seq1, 48 | const NTSequence& seq2); 49 | 50 | private: 51 | double gapOpenScore_; 52 | double gapExtensionScore_; 53 | double **ntWeightMatrix_; 54 | double **aaWeightMatrix_; 55 | 56 | template 57 | double needlemanWunshAlign(std::vector& seq1, 58 | std::vector& seq2, 59 | double** weigthMatrix); 60 | }; 61 | 62 | } 63 | 64 | #endif // NEEDLEMAN_WUNSH_H_ 65 | -------------------------------------------------------------------------------- /src/libseq/CodingSequence.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "Codon.h" 4 | #include "CodingSequence.h" 5 | 6 | namespace seq { 7 | 8 | CodingSequence::CodingSequence() 9 | : ntSequence_(), 10 | aaSequence_(), 11 | dirty_(D_COMPLETE) 12 | { } 13 | 14 | CodingSequence::CodingSequence(const NTSequence& aNtSequence) 15 | : ntSequence_(aNtSequence), 16 | aaSequence_(aNtSequence.size() / 3), 17 | dirty_(D_COMPLETE) 18 | { } 19 | 20 | const AASequence& CodingSequence::aaSequence() const 21 | { 22 | if (isDirty()) 23 | updateAASequence(); 24 | 25 | return aaSequence_; 26 | } 27 | 28 | void CodingSequence::changeNucleotide(int pos, Nucleotide value) 29 | { 30 | // a small effort to avoid to avoid retranslation of the whole AA sequence. 
31 | if (isDirty() && (dirty_ != D_COMPLETE)) 32 | updateAASequence(); 33 | 34 | ntSequence_[pos] = value; 35 | 36 | if (isDirty()) 37 | dirty_ = D_COMPLETE; 38 | else 39 | dirty_ = pos; 40 | } 41 | 42 | int CodingSequence::whatIfMutation(int pos, Nucleotide value, 43 | AminoAcid& oldAA, 44 | AminoAcid& newAA) const 45 | { 46 | if (isDirty()) 47 | updateAASequence(); 48 | 49 | const int aaPos = pos / 3; 50 | const int codonPos = pos % 3; 51 | 52 | NTSequence newcodon(ntSequence_.begin() + aaPos * 3, 53 | ntSequence_.begin() + (aaPos * 3 + 3)); 54 | newcodon[codonPos] = value; 55 | 56 | oldAA = aaSequence_[aaPos]; 57 | newAA = Codon::translate(newcodon.begin()); 58 | 59 | return aaPos; 60 | } 61 | 62 | bool CodingSequence::isSynonymousMutation(int pos, Nucleotide value) const 63 | { 64 | AminoAcid oldAA, newAA; 65 | 66 | whatIfMutation(pos, value, oldAA, newAA); 67 | 68 | return (oldAA == newAA); 69 | } 70 | 71 | void CodingSequence::updateAASequence() const 72 | { 73 | if (dirty_ == D_COMPLETE) { 74 | aaSequence_ = AASequence::translate(ntSequence_); 75 | } else { 76 | dirty_ /= 3; 77 | aaSequence_[dirty_] 78 | = Codon::translate(ntSequence_.begin() + (dirty_ * 3)); 79 | } 80 | 81 | dirty_ = D_CLEAN; 82 | } 83 | 84 | void CodingSequence::allAASequences(std::vector >& result) 85 | const 86 | { 87 | for (unsigned i = 0; i < ntSequence_.size(); i += 3) { 88 | result.push_back(Codon::translateAll(ntSequence_.begin() + i)); 89 | } 90 | }; 91 | 92 | extern void printAmbiguousAASequence(std::ostream& out, 93 | const CodingSequence& cs) 94 | { 95 | std::vector > aas; 96 | cs.allAASequences(aas); 97 | 98 | for (unsigned i = 0; i < aas.size(); ++i) { 99 | if (aas[i].size() > 1) 100 | out << "{"; 101 | for (std::set::const_iterator j = aas[i].begin(); 102 | j != aas[i].end(); ++j) 103 | out << *j; 104 | if (aas[i].size() > 1) 105 | out << "}"; 106 | } 107 | } 108 | 109 | } 110 | -------------------------------------------------------------------------------- 
/references/HIV/HIV-HXB2-env.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /src/libseq/CodingSequence.h: -------------------------------------------------------------------------------- 1 | // This may look like C code, but it's really -*- C++ -*- 2 | #ifndef CODING_SEQUENCE_H_ 3 | #define CODING_SEQUENCE_H_ 4 | 5 | #include 6 | #include 7 | 8 | #include "NTSequence.h" 9 | #include "AASequence.h" 10 | 11 | namespace seq { 12 | 13 | /** 14 | * A coding sequence represents a nucleotide sequence that codes for 15 | * an amino acid sequence (an oligo- or polypeptide). 16 | * 17 | * It is useful when one wants to track the effect of changes in the 18 | * nucleotide sequence for the amino acid sequence, and to investigate 19 | * properties of nucleotide mutations. 20 | */ 21 | class CodingSequence 22 | { 23 | public: 24 | /** 25 | * Construct a coding sequence with empty nucleotide sequence. 26 | */ 27 | CodingSequence(); 28 | 29 | /** 30 | * Construct a coding sequence based on the given nucleotide 31 | * sequence. The sequence must be translatable as per 32 | * AASequence::translate(const NTSequence&). 33 | */ 34 | CodingSequence(const NTSequence& aNtSequence); 35 | 36 | /** 37 | * Get the nucleotide sequence. 38 | */ 39 | const NTSequence& ntSequence() const { return ntSequence_; } 40 | 41 | /** 42 | * Get the amino acid sequence. 43 | * 44 | * If needed, the amino acid sequence is updated to reflect changes 45 | * in the nucleotide sequence. 46 | */ 47 | const AASequence& aaSequence() const; 48 | 49 | /** 50 | * Change a nucleotide at a given position in the nucleotide sequence to 51 | * a new value. 52 | */ 53 | void changeNucleotide(int pos, Nucleotide value); 54 | 55 | /** 56 | * Investigate the effect of a nucleotide mutation on the amino acid 57 | * sequence. 
This returns both the old (oldAA) and new amino acid (newAA) 58 | * encoded by the mutation, as well as the position (return value). 59 | */ 60 | int whatIfMutation(int pos, Nucleotide value, 61 | AminoAcid& oldAA, AminoAcid& newAA) const; 62 | 63 | /** 64 | * Investigate whether a give nucleotide mutation is synonymous or 65 | * non-synonymous with respect to the amino acid sequence. 66 | */ 67 | bool isSynonymousMutation(int pos, Nucleotide value) const; 68 | 69 | /** 70 | * Get the amino acid sequence possibilities, taking into account 71 | * all ambiguities 72 | */ 73 | void allAASequences(std::vector >& result) const; 74 | 75 | protected: 76 | void updateAASequence() const; 77 | 78 | private: 79 | NTSequence ntSequence_; 80 | mutable AASequence aaSequence_; 81 | 82 | bool isDirty() const { return dirty_ != D_CLEAN; } 83 | 84 | mutable int dirty_; 85 | 86 | static const int D_CLEAN = -1; 87 | static const int D_COMPLETE = -2; 88 | }; 89 | 90 | /** 91 | * Write an amino acid sequence with all possible ambiguities 92 | * to the stream. 93 | * 94 | * The format is e.g. 
TW{LM}YS 95 | */ 96 | extern void printAmbiguousAASequence(std::ostream& out, 97 | const CodingSequence& cs); 98 | 99 | }; 100 | 101 | #endif // CODING_SEQUENCE_H_ 102 | -------------------------------------------------------------------------------- /src/libseq/AASequence.cpp: -------------------------------------------------------------------------------- 1 | #include "AASequence.h" 2 | #include "Codon.h" 3 | 4 | namespace seq { 5 | 6 | AASequence::AASequence() 7 | : std::vector() 8 | { } 9 | 10 | AASequence::AASequence(unsigned size) 11 | : std::vector(size) 12 | { } 13 | 14 | AASequence::AASequence(const const_iterator first, 15 | const const_iterator last) 16 | : std::vector(first, last) 17 | { } 18 | 19 | AASequence::AASequence(const std::string name, 20 | const std::string description, 21 | const std::string aSeqString) 22 | : std::vector(aSeqString.length()), 23 | name_(name), 24 | description_(description) 25 | { 26 | for (unsigned i = 0; i < aSeqString.length(); ++i) { 27 | (*this)[i] = AminoAcid(aSeqString[i]); 28 | } 29 | } 30 | 31 | std::string AASequence::asString() const 32 | { 33 | std::string result(size(), '-'); 34 | 35 | for (unsigned i = 0; i < size(); ++i) { 36 | result[i] = (*this)[i].toChar(); 37 | } 38 | 39 | return result; 40 | } 41 | 42 | inline bool contains(const std::set& possibilities, const AminoAcid& aa) 43 | { 44 | return possibilities.find(aa) != possibilities.end(); 45 | } 46 | 47 | AASequence AASequence::translate(const NTSequence::const_iterator begin, 48 | const NTSequence::const_iterator end) 49 | { 50 | const int size = end - begin; 51 | assert(size % 3 == 0); 52 | 53 | AASequence result(size / 3); 54 | 55 | for (NTSequence::const_iterator i = begin; i < end; i += 3) { 56 | std::set possibilities = Codon::translateAll(i); 57 | if (possibilities.size() > 2) { 58 | result[(i - begin)/3] = AminoAcid::X; 59 | } else if (possibilities.size() == 2) { 60 | if (contains(possibilities,AminoAcid::D) && 
contains(possibilities,AminoAcid::N)) 61 | result[(i - begin)/3] = AminoAcid::B; 62 | else if (contains(possibilities,AminoAcid::E) && contains(possibilities,AminoAcid::Q)) 63 | result[(i - begin)/3] = AminoAcid::Z; 64 | else if (contains(possibilities,AminoAcid::L) && contains(possibilities,AminoAcid::I)) 65 | result[(i - begin)/3] = AminoAcid::J; 66 | else 67 | result[(i - begin)/3] = AminoAcid::X; 68 | } else { 69 | result[(i - begin)/3] = *possibilities.begin(); 70 | } 71 | } 72 | 73 | return result; 74 | } 75 | 76 | AASequence AASequence::translate(const NTSequence& ntSequence) 77 | { 78 | return translate(ntSequence.begin(), ntSequence.end()); 79 | } 80 | 81 | // defined in NTSequence.C: 82 | extern void readFastaEntry(std::istream& i, 83 | std::string& name, 84 | std::string& description, 85 | std::string& sequence); 86 | extern void writeFastaEntry(std::ostream& o, 87 | const std::string& name, 88 | const std::string& description, 89 | const std::string& sequence); 90 | 91 | std::istream& operator>>(std::istream& i, AASequence& sequence) 92 | { 93 | std::string name, description, seqString; 94 | 95 | readFastaEntry(i, name, description, seqString); 96 | sequence = AASequence(name, description, seqString); 97 | 98 | return i; 99 | } 100 | 101 | std::ostream& operator<<(std::ostream& o, const AASequence& sequence) 102 | { 103 | writeFastaEntry(o, sequence.name(), sequence.description(), 104 | sequence.asString()); 105 | 106 | return o; 107 | } 108 | 109 | }; 110 | -------------------------------------------------------------------------------- /src/libseq/AminoAcid.cpp: -------------------------------------------------------------------------------- 1 | #include "AminoAcid.h" 2 | #include "ParseException.h" 3 | 4 | namespace seq { 5 | 6 | const char AminoAcid::AA_CHAR[] 7 | = { 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 8 | 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 9 | 'W', 'Y', '*', '-', 'Z', 'U', 'B', 'X', 'J' }; 10 | 11 | const char * const 
AminoAcid::AA_TLA[] 12 | = { "Ala", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile", "Lys", 13 | "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr", "Val", 14 | "Trp", "Tyr", "STP", "GAP", "Glu/Gln", "Sec", "Asp/Asn", "Any", 15 | "Leu/Ile" }; 16 | 17 | const AminoAcid AminoAcid::A(AminoAcid::AA_A); 18 | const AminoAcid AminoAcid::C(AminoAcid::AA_C); 19 | const AminoAcid AminoAcid::D(AminoAcid::AA_D); 20 | const AminoAcid AminoAcid::E(AminoAcid::AA_E); 21 | const AminoAcid AminoAcid::F(AminoAcid::AA_F); 22 | const AminoAcid AminoAcid::G(AminoAcid::AA_G); 23 | const AminoAcid AminoAcid::H(AminoAcid::AA_H); 24 | const AminoAcid AminoAcid::I(AminoAcid::AA_I); 25 | const AminoAcid AminoAcid::K(AminoAcid::AA_K); 26 | const AminoAcid AminoAcid::L(AminoAcid::AA_L); 27 | const AminoAcid AminoAcid::M(AminoAcid::AA_M); 28 | const AminoAcid AminoAcid::N(AminoAcid::AA_N); 29 | const AminoAcid AminoAcid::P(AminoAcid::AA_P); 30 | const AminoAcid AminoAcid::Q(AminoAcid::AA_Q); 31 | const AminoAcid AminoAcid::R(AminoAcid::AA_R); 32 | const AminoAcid AminoAcid::S(AminoAcid::AA_S); 33 | const AminoAcid AminoAcid::T(AminoAcid::AA_T); 34 | const AminoAcid AminoAcid::V(AminoAcid::AA_V); 35 | const AminoAcid AminoAcid::W(AminoAcid::AA_W); 36 | const AminoAcid AminoAcid::Y(AminoAcid::AA_Y); 37 | const AminoAcid AminoAcid::STP(AminoAcid::AA_STP); 38 | const AminoAcid AminoAcid::GAP(AminoAcid::AA_GAP); 39 | const AminoAcid AminoAcid::Z(AminoAcid::AA_Z); 40 | const AminoAcid AminoAcid::U(AminoAcid::AA_U); 41 | const AminoAcid AminoAcid::B(AminoAcid::AA_B); 42 | const AminoAcid AminoAcid::X(AminoAcid::AA_X); 43 | const AminoAcid AminoAcid::J(AminoAcid::AA_J); 44 | 45 | AminoAcid::AminoAcid() 46 | : rep_(AA_Z) 47 | { } 48 | 49 | AminoAcid::AminoAcid(char c) 50 | { 51 | switch (toupper(c)) { 52 | case 'A': rep_ = AA_A; break; 53 | case 'C': rep_ = AA_C; break; 54 | case 'D': rep_ = AA_D; break; 55 | case 'E': rep_ = AA_E; break; 56 | case 'F': rep_ = AA_F; break; 57 | case 'G': rep_ = AA_G; 
break; 58 | case 'H': rep_ = AA_H; break; 59 | case 'I': rep_ = AA_I; break; 60 | case 'K': rep_ = AA_K; break; 61 | case 'L': rep_ = AA_L; break; 62 | case 'M': rep_ = AA_M; break; 63 | case 'N': rep_ = AA_N; break; 64 | case 'P': rep_ = AA_P; break; 65 | case 'Q': rep_ = AA_Q; break; 66 | case 'R': rep_ = AA_R; break; 67 | case 'S': rep_ = AA_S; break; 68 | case 'T': rep_ = AA_T; break; 69 | case 'V': rep_ = AA_V; break; 70 | case 'W': rep_ = AA_W; break; 71 | case 'Y': rep_ = AA_Y; break; 72 | case '*': rep_ = AA_STP; break; 73 | case '-': rep_ = AA_GAP; break; 74 | case 'Z': rep_ = AA_Z; break; 75 | case 'U': rep_ = AA_U; break; 76 | case 'B': rep_ = AA_B; break; 77 | case 'X': rep_ = AA_X; break; 78 | case 'J': rep_ = AA_J; break; 79 | default: 80 | throw ParseException 81 | (std::string(), 82 | std::string("Invalid amino acid character: '") + c + "'", false); 83 | } 84 | } 85 | 86 | std::string AminoAcid::tla() const 87 | { 88 | return AA_TLA[rep_]; 89 | } 90 | 91 | std::ostream& operator<< (std::ostream& s, const AminoAcid aa) 92 | { 93 | return s << aa.toChar(); 94 | } 95 | 96 | }; 97 | -------------------------------------------------------------------------------- /references/HIV/HIV-HXB2-pol.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /src/ReferenceSequence.cpp: -------------------------------------------------------------------------------- 1 | #include "ReferenceSequence.h" 2 | 3 | #include "mxml-utils/MXMLUtils.h" 4 | #include "mxml/mxml.h" 5 | #include 6 | #include 7 | 8 | #include "Utils.h" 9 | 10 | ReferenceSequence::ReferenceSequence(const seq::NTSequence& seq) 11 | : seq::NTSequence(seq) 12 | { 13 | 14 | } 15 | 16 | /** 17 | * Build a ReferenceSequence from an XML node: the "name" attribute is used 18 | * as both name and description, "referenceSequence" supplies the sequence 19 | * string, and each node returned by childElements(node, "protein") adds a Region. 20 | * 21 | * startPosition and stopPosition are stored as (value - 1) / 3 - presumably 22 | * a 1-based nucleotide position turned into a 0-based codon index (TODO confirm). 23 | * 24 | * Throws std::runtime_error when a protein's abbreviation, startPosition or 25 | * stopPosition value comes back empty. 26 | */ 27 | ReferenceSequence parseOrfReference(mxml_node_t* node) 28 | { 29 | std::string orf; 30 | std::string refSeq; 31 | attributeValue(node, "name", orf); 32 | attributeValue(node,
"referenceSequence", refSeq); 33 | 34 | std::vector proteins = childElements(node, "protein"); 35 | 36 | ReferenceSequence seq 37 | (seq::NTSequence(orf, orf, refSeq)); 38 | for (int k = 0; k < proteins.size(); k++) { 39 | std::string protein; 40 | std::string start; 41 | std::string end; 42 | 43 | attributeValue(proteins[k], "abbreviation", protein); 44 | attributeValue(proteins[k], "startPosition", start); 45 | attributeValue(proteins[k], "stopPosition", end); 46 | 47 | if (protein == "") 48 | throw std::runtime_error("protein abbreviation is invalid"); 49 | if (start == "") 50 | throw std::runtime_error("protein start is invalid"); 51 | if (end == "") 52 | throw std::runtime_error("protein end is invalid"); 53 | 54 | int startPos = (atoi(start.c_str()) - 1) / 3; 55 | int endPos = (atoi(end.c_str()) - 1) / 3; 56 | 57 | ReferenceSequence::Region region(startPos, 58 | endPos, 59 | protein); 60 | seq.addRegion(region); 61 | } 62 | 63 | return seq; 64 | } 65 | 66 | namespace { 67 | 68 | /** 69 | * Frees an mxml tree with mxmlDelete() when it goes out of scope, so the 70 | * parse functions below release the tree on every return and throw path. 71 | */ 72 | struct MxmlTreeGuard 73 | { 74 | explicit MxmlTreeGuard(mxml_node_t *node) : node_(node) { } 75 | ~MxmlTreeGuard() { mxmlDelete(node_); } 76 | 77 | mxml_node_t *node_; 78 | }; 79 | 80 | } 81 | 82 | /** 83 | * Load the XML file fileName and parse its "orf" element with 84 | * parseOrfReference(). 85 | * 86 | * The file handle is closed once the document has been loaded and the mxml 87 | * tree is freed before returning or throwing. 88 | * 89 | * Throws std::runtime_error when the file cannot be opened or loaded. 90 | */ 91 | ReferenceSequence ReferenceSequence::parseOrfReferenceFile(const std::string& fileName) 92 | { 93 | FILE *fp = fopen(fileName.c_str(), "r"); 94 | if (fp) { 95 | mxml_node_t *top = mxmlNewElement(MXML_NO_PARENT, "top"); 96 | MxmlTreeGuard guard(top); 97 | 98 | mxml_node_t *first = mxmlLoadFile(top, fp, MXML_NO_CALLBACK); 99 | fclose(fp); /* the document is in memory now; release the handle */ 100 | 101 | if (first) { 102 | mxml_node_t *root = singleChildElement(top, "orf"); 103 | return parseOrfReference(root); 104 | } 105 | } 106 | 107 | throw std::runtime_error("Error parsing ORF reference file"); 108 | } 109 | 110 | /** 111 | * Parse the "genomes" element of genomesXmlFile into a map from organism 112 | * name to the reference sequences of its "openReadingFrame" elements. 113 | * 114 | * Returns an empty map when the file cannot be opened or loaded. Throws 115 | * std::runtime_error when a genome's organismName value comes back empty. 116 | */ 117 | std::map > 118 | ReferenceSequence::parseProteinReferences(std::string genomesXmlFile) 119 | { 120 | std::map > genomesMap; 121 | 122 | FILE *fp = fopen(genomesXmlFile.c_str(), "r"); 123 | if (fp) { 124 | mxml_node_t *top = mxmlNewElement(MXML_NO_PARENT, "top"); 125 | MxmlTreeGuard guard(top); 126 | 127 | mxml_node_t *first = mxmlLoadFile(top, fp, MXML_NO_CALLBACK); 128 | fclose(fp); /* the document is in memory now; release the handle */ 129 | 130 | if (first) { 131 | mxml_node_t *root = singleChildElement(top, "genomes"); 132 | 133 | std::vector genomes 134 | = childElements(root, "genome"); 135 | 136 | for (int i = 0; i <
genomes.size(); i++) { 137 | std::string organism; 138 | std::vector refs; 139 | attributeValue(genomes[i], "organismName", organism); 140 | if (organism == "") 141 | throw std::runtime_error("organism name is invalid"); 142 | 143 | std::vector orfs 144 | = childElements(genomes[i], "openReadingFrame"); 145 | for (int j = 0; j < orfs.size(); j++) { 146 | ReferenceSequence seq = parseOrfReference(orfs[j]); 147 | refs.push_back(seq); 148 | } 149 | genomesMap[organism] = refs; 150 | } 151 | } 152 | } 153 | 154 | return genomesMap; 155 | } 156 | 157 | -------------------------------------------------------------------------------- /references/CHIKV/CHIKV-NC004162-gp2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /src/libseq/CodonAlign.h: -------------------------------------------------------------------------------- 1 | // This may look like C code, but it's really -*- C++ -*- 2 | #ifndef CODON_ALIGN_H_ 3 | #define CODON_ALIGN_H_ 4 | 5 | #include 6 | 7 | /** 8 | * libseq namespace 9 | */ 10 | namespace seq { 11 | /** 12 | * Thrown when alignment failed. 13 | */ 14 | class AlignmentError : public std::exception 15 | { 16 | public: 17 | AlignmentError(double ntScore, double codonScore, 18 | const NTSequence& ntRef, const NTSequence& ntTarget, 19 | const std::string& message = std::string("Alignment error.")); 20 | virtual ~AlignmentError() throw(); 21 | 22 | /** %Nucleotide alignment score. 23 | */ 24 | double nucleotideAlignmentScore() const { return ntScore_; } 25 | 26 | /** Codon-based alignment score.
27 | */ 28 | double codonAlignmentScore() const { return codonScore_; } 29 | 30 | /** %Nucleotide aligned reference sequence 31 | */ 32 | const NTSequence& nucleotideAlignedRef() const { return ntRef_; } 33 | 34 | /** %Nucleotide aligned target sequence 35 | */ 36 | const NTSequence& nucleotideAlignedTarget() const { return ntTarget_; } 37 | 38 | /** Error message 39 | */ 40 | const std::string& message() const{ return message_; } 41 | 42 | private: 43 | std::string message_; 44 | double ntScore_, codonScore_; 45 | NTSequence ntRef_, ntTarget_; 46 | }; 47 | 48 | /** 49 | * Error thrown by CodonAlign when apparent frame shifts cannot be corrected. 50 | * 51 | * Details in CodonAlign. 52 | */ 53 | class FrameShiftError : public AlignmentError 54 | { 55 | public: 56 | FrameShiftError(double ntScore, double codonScore, 57 | const NTSequence& ntRef, const NTSequence& ntTarget); 58 | ~FrameShiftError() throw(); 59 | 60 | const char *what() const throw() { return "Frameshift error"; } 61 | }; 62 | 63 | 64 | class CodonAlign { 65 | public: 66 | /** 67 | * Constructor 68 | */ 69 | CodonAlign(AlignmentAlgorithm* algorithm); 70 | 71 | /** 72 | * Perform codon-based alignment of nucleotide sequences. 73 | * 74 | * Two nucleotide sequences are pair-wise aligned, but so that gaps are 75 | * at codon boundaries. Optionally, frameshifts may be detected and corrected. 76 | * 77 | * The reference sequence must be of length a multiple of 3, and is assumed 78 | * to represent an Open Reading Frame (ORF). 79 | * 80 | * The procedure translates the target sequence in the 3 ORFs, 81 | * and for each ORF performs an amino-acid alignment against the translated 82 | * reference sequence. The best alignment is used to create the nucleotide 83 | * alignment. 84 | * 85 | * Then, the score of the codon aligned nucleotide alignment is computed, and 86 | * compared with a direct nucleotide alignment of both nucleotide sequences. 
87 | * The codon alignment is accepted only if the difference is smaller than 100. 88 | * Otherwise, if maxFrameShifts > 0, the frameshift is searched, corrected 89 | * by inserting 1 or 2 'N' symbols in the target sequence, and repeating the 90 | * codon alignment. This is repeated for up to maxFrameShifts of times. 91 | * 92 | * The result is the nucleotide alignment score of the codon alignment, and 93 | * the number of frameshifts that have been corrected. 94 | * 95 | * @throws FrameShiftError when frameshifts could not be corrected, or 96 | * the number of detected frameshifts exceeds maxFrameShifts. 97 | */ 98 | std::pair 99 | align(NTSequence& ref, NTSequence& target, int maxFrameShifts = 1); 100 | 101 | private: 102 | bool haveGaps(const NTSequence& seq, int from, int to); 103 | double alignLikeAA(NTSequence& seq1, NTSequence& seq2, 104 | int ORF, 105 | const AASequence& seqAA1, const AASequence& seqAA2); 106 | bool noGapAt(const NTSequence& seq, unsigned int i) const; 107 | 108 | AlignmentAlgorithm* algorithm_; 109 | }; 110 | } 111 | 112 | #endif // CODON_ALIGN_H_ 113 | -------------------------------------------------------------------------------- /references/SARS-CoV-2/S.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/libseq/NTSequence.h: -------------------------------------------------------------------------------- 1 | // This may look like C code, but it's really -*- C++ -*- 2 | #ifndef NTSEQUENCE_H_ 3 | #define NTSEQUENCE_H_ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "ParseException.h" 11 | #include "Nucleotide.h" 12 | 13 | namespace seq { 14 | 15 | /** 16 | * A nucleotide sequence. 17 | * 18 | * The sequence may have a name and a description. 
19 | * 20 | * The sequence data is stored by publicly inheriting 21 | * std::vector, so you can use all 22 | * 23 | * std::vector manipulations to access the nucleotide data. 24 | */ 25 | class NTSequence : public std::vector 26 | { 27 | public: 28 | /** 29 | * Create an empty nucleotide sequence with empty name 30 | * and empty description. 31 | */ 32 | NTSequence(); 33 | 34 | /** 35 | * Create a nucleotide sequence of length size, filled with 36 | * Nucleotide::N, with empty name and empty description. 37 | */ 38 | NTSequence(unsigned size); 39 | 40 | /** 41 | * Create a nucleotide sequence with given name and description, and 42 | * with the given sequence string. Each character in the sequence 43 | * string will be interpreted as a Nucleotide using the 44 | * Nucleotide::Nucleotide(char) constructor. 45 | * 46 | * If sampleAmbiguities = true, then sampleAmbiguities() is 47 | * performed during construction. 48 | * 49 | * \sa sampleAmbiguities() 50 | */ 51 | NTSequence(const std::string name, 52 | const std::string description, 53 | const std::string aSeqString, 54 | bool sampleAmbiguities = false); 55 | 56 | /** 57 | * Create a nucleotide sequence with empty name and empty 58 | * description, and copy the sequence data from the range [first, last[. 59 | */ 60 | NTSequence(const const_iterator first, 61 | const const_iterator last); 62 | 63 | /** 64 | * Remove ambiguity nucleotide symbols by replacing them by sampling 65 | * a random non-ambiguous nucleotide that is represented by the 66 | * ambiguity symbol. 67 | * 68 | * \sa Nucleotide::sampleAmbiguity() 69 | */ 70 | void sampleAmbiguities(); 71 | 72 | NTSequence reverseComplement() const; 73 | 74 | /** 75 | * Add all the possible non-ambiguous sequences possibly represented by 76 | * this sequence to result. 77 | */ 78 | void nonAmbiguousSequences(std::vector& result) const; 79 | 80 | /** 81 | * Represent the sequence data as a string.
82 | */ 83 | std::string asString() const; 84 | 85 | /** 86 | * Get the name. 87 | */ 88 | std::string name() const { return name_; } 89 | 90 | /** 91 | * Get the description. 92 | */ 93 | std::string description() const { return description_; } 94 | 95 | /** 96 | * Set the name. 97 | */ 98 | void setName(std::string name) { name_ = name; } 99 | 100 | /** 101 | * Set the description. 102 | */ 103 | void setDescription(std::string description) { description_ = description; } 104 | 105 | private: 106 | std::string name_; 107 | std::string description_; 108 | 109 | void iterateNonAmbiguous(const NTSequence& head, 110 | std::vector& result) const; 111 | }; 112 | 113 | /** 114 | * Write a set of sequences to Stockholm format 115 | */ 116 | extern void writeStockholm(std::ostream& o, 117 | const std::vector& sequences, 118 | int length=10000, int labelsize=0, 119 | int seqsize=0, int pos=0); 120 | 121 | /** 122 | * Read a nucleotide sequence in FASTA format from the given stream. 123 | */ 124 | extern std::istream& operator>>(std::istream& i, NTSequence& sequence); 125 | 126 | /** 127 | * Write a nucleotide sequence to the given stream in FASTA format. 128 | */ 129 | extern std::ostream& operator<<(std::ostream& o, const NTSequence& sequence); 130 | 131 | }; 132 | 133 | #endif // NTSEQUENCE_H_ 134 | -------------------------------------------------------------------------------- /src/libseq/AASequence.h: -------------------------------------------------------------------------------- 1 | // This may look like C code, but it's really -*- C++ -*- 2 | #ifndef AASEQUENCE_H_ 3 | #define AASEQUENCE_H_ 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "NTSequence.h" 10 | #include "AminoAcid.h" 11 | 12 | namespace seq { 13 | 14 | /** 15 | * An amino acid sequence. 16 | * 17 | * The sequence may have a name and a description. 
18 | * 19 | * The sequence data is stored by publicly inheriting 20 | * std::vector, so you can use all 21 | * 22 | * std::vector manipulations to access the amino acid data. 23 | */ 24 | class AASequence : public std::vector 25 | { 26 | public: 27 | /** 28 | * Create an empty amino acid sequence with empty name and empty 29 | * description. 30 | */ 31 | AASequence(); 32 | 33 | /** 34 | * Create an amino acid sequence of length size, filled with 35 | * default-constructed AminoAcid values, with empty name and empty description. NOTE(review): this comment used to say AminoAcid::X, but AminoAcid::AminoAcid() in AminoAcid.cpp initialises to AA_Z - confirm which is intended. 36 | */ 37 | AASequence(unsigned size); 38 | 39 | /** 40 | * Create an amino acid sequence with given name and description, and 41 | * with the given sequence string. Each character in the sequence 42 | * string will be interpreted as an AminoAcid using the 43 | * AminoAcid::AminoAcid(char) constructor, which throws ParseException for a character it does not recognise. 44 | */ 45 | AASequence(const std::string name, 46 | const std::string description, 47 | const std::string aSeqString); 48 | 49 | /** 50 | * Create an amino acid sequence with empty name and empty 51 | * description, and copy the sequence data from the range [first, last[. 52 | */ 53 | AASequence(const const_iterator first, const const_iterator last); 54 | 55 | /** 56 | * Represent the sequence data as a string, one AminoAcid::toChar() character per element. 57 | */ 58 | std::string asString() const; 59 | 60 | /** 61 | * Get the name. 62 | */ 63 | std::string name() const { return name_; } 64 | 65 | /** 66 | * Get the description. 67 | */ 68 | std::string description() const { return description_; } 69 | 70 | /** 71 | * Set the name. 72 | */ 73 | void setName(std::string name) { name_ = name; } 74 | 75 | /** 76 | * Set the description. 77 | */ 78 | void setDescription(std::string description) { description_ = description; } 79 | 80 | /** 81 | * Translate a nucleotide sequence to an amino acid sequence. The 82 | * nucleotide sequence must have a length that is a multiple of 83 | * three.
84 | * 85 | * The resulting amino acid sequence will contain an amino acid for 86 | * every triplet of nucleotides in the nucleotide sequence. Note that 87 | * the implementation in AASequence.cpp forwards to the iterator-range 88 | * overload, so the result has an empty name and empty description. 89 | * 90 | * \sa translate(const NTSequence::const_iterator, const NTSequence::const_iterator), Codon::translate(const NTSequence::const_iterator) 91 | */ 92 | static AASequence translate(const NTSequence& ntSequence); 93 | 94 | /** 95 | * Translate a nucleotide sequence, defined by the range begin to 96 | * end, to an amino acid sequence. The nucleotide sequence must have a 97 | * length that is a multiple of three. 98 | * 99 | * The resulting amino acid sequence will contain an amino acid for 100 | * every triplet of nucleotides in the nucleotide sequence, and will 101 | * have an empty name and empty description. 102 | * 103 | * \sa translate(const NTSequence&), Codon::translate(const NTSequence::const_iterator) 104 | */ 105 | static AASequence translate(const NTSequence::const_iterator begin, 106 | const NTSequence::const_iterator end); 107 | private: 108 | std::string name_; 109 | std::string description_; 110 | }; 111 | 112 | /** 113 | * Read an amino acid sequence in FASTA format from the given stream. 114 | */ 115 | extern std::istream& operator>>(std::istream& i, AASequence& sequence); 116 | 117 | /** 118 | * Write an amino acid sequence to the given stream in FASTA format.
119 | */ 120 | extern std::ostream& operator<<(std::ostream& o, const AASequence& sequence); 121 | 122 | }; 123 | 124 | #endif // AASEQUENCE_H_ 125 | -------------------------------------------------------------------------------- /src/libseq/AlignmentAlgorithm.cpp: -------------------------------------------------------------------------------- 1 | #include "AlignmentAlgorithm.h" 2 | 3 | namespace seq { 4 | 5 | double** AlignmentAlgorithm::IUB() 6 | { 7 | static double rowA[] = { 5,-4,-4,-4,1,1,1,-4,-4,-4,-1,-1,-1,-4,-2 }; 8 | static double rowC[] = { -4,5,-4,-4,1,-4,-4,1,1,-4,-1,-1,-4,-1,-2 }; 9 | static double rowG[] = { -4,-4,5,-4,-4,1,-4,1,-4,1,-1,-4,-1,-1,-2 }; 10 | static double rowT[] = { -4,-4,-4,5,-4,-4,1,-4,1,1,-4,-1,-1,-1,-2 }; 11 | static double rowM[] = { 1,1,-4,-4,-1,-2,-2,-2,-2,-4,-1,-1,-3,-3,-1 }; 12 | static double rowR[] = { 1,-4,1,-4,-2,-1,-2,-2,-4,-2,-1,-3,-1,-3,-1 }; 13 | static double rowW[] = { 1,-4,-4,1,-2,-2,-1,-4,-2,-2,-3,-1,-1,-3,-1 }; 14 | static double rowS[] = { -4,1,1,-4,-2,-2,-4,-1,-2,-2,-1,-3,-3,-1,-1 }; 15 | static double rowY[] = { -4,1,-4,1,-2,-4,-2,-2,-1,-2,-3,-1,-3,-1,-1 }; 16 | static double rowK[] = { -4,-4,1,1,-4,-2,-2,-2,-2,-1,-3,-3,-1,-1,-1 }; 17 | static double rowV[] = { -1,-1,-1,-4,-1,-1,-3,-1,-3,-3,-1,-2,-2,-2,-1 }; 18 | static double rowH[] = { -1,-1,-4,-1,-1,-3,-1,-3,-1,-3,-2,-1,-2,-2,-1 }; 19 | static double rowD[] = { -1,-4,-1,-1,-3,-1,-1,-3,-3,-1,-2,-2,-1,-2,-1 }; 20 | static double rowB[] = { -4,-1,-1,-1,-3,-3,-3,-1,-1,-1,-2,-2,-2,-1,-1 }; 21 | static double rowN[] = { -2,-2,-2,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 }; 22 | 23 | static double *iub[] = { rowA, rowC, rowG, rowT, rowM, rowR, rowW, rowS, 24 | rowY, rowK, rowV, rowH, rowD, rowB, rowN }; 25 | 26 | return iub; 27 | } 28 | 29 | double** AlignmentAlgorithm::BLOSUM30() 30 | { 31 | static double rowA[] = { 4,-3,0,0,-2,0,-2,0,0,-1,1,0,-1,1,-1,1,1,1,-5,-4,-7,0,0,0,0,0 }; 32 | static double rowC[] = { 
-3,17,-3,1,-3,-4,-5,-2,-3,0,-2,-1,-3,-2,-2,-2,-2,-2,-2,-6,-7,0,0,0,-2,-2 }; 33 | static double rowD[] = { 0,-3,9,1,-5,-1,-2,-4,0,-1,-3,1,-1,-1,-1,0,-1,-2,-4,-1,-7,0,0,0,5,-1 }; 34 | static double rowE[] = { 0,1,1,6,-4,-2,0,-3,2,-1,-1,-1,1,2,-1,0,-2,-3,-1,-2,-7,0,5,0,0,-1 }; 35 | static double rowF[] = { -2,-3,-5,-4,10,-3,-3,0,-1,2,-2,-1,-4,-3,-1,-1,-2,1,1,3,-7,0,-4,0,-3,-1 }; 36 | static double rowG[] = { 0,-4,-1,-2,-3,8,-3,-1,-1,-2,-2,0,-1,-2,-2,0,-2,-3,1,-3,-7,0,-2,0,0,-1 }; 37 | static double rowH[] = { -2,-5,-2,0,-3,-3,14,-2,-2,-1,2,-1,1,0,-1,-1,-2,-3,-5,0,-7,0,0,0,-2,-1 }; 38 | static double rowI[] = { 0,-2,-4,-3,0,-1,-2,6,-2,2,1,0,-3,-2,-3,-1,0,4,-3,-1,-7,0,-3,0,-2,0 }; 39 | static double rowK[] = { 0,-3,0,2,-1,-1,-2,-2,4,-2,2,0,1,0,1,0,-1,-2,-2,-1,-7,0,1,0,0,0 }; 40 | static double rowL[] = { -1,0,-1,-1,2,-2,-1,2,-2,4,2,-2,-3,-2,-2,-2,0,1,-2,3,-7,0,-1,0,-1,0 }; 41 | static double rowM[] = { 1,-2,-3,-1,-2,-2,2,1,2,2,6,0,-4,-1,0,-2,0,0,-3,-1,-7,0,-1,0,-2,0 }; 42 | static double rowN[] = { 0,-1,1,-1,-1,0,-1,0,0,-2,0,8,-3,-1,-2,0,1,-2,-7,-4,-7,0,-1,0,4,0 }; 43 | static double rowP[] = { -1,-3,-1,1,-4,-1,1,-3,1,-3,-4,-3,11,0,-1,-1,0,-4,-3,-2,-7,0,0,0,-2,-1 }; 44 | static double rowQ[] = { 1,-2,-1,2,-3,-2,0,-2,0,-2,-1,-1,0,8,3,-1,0,-3,-1,-1,-7,0,4,0,-1,0 }; 45 | static double rowR[] = { -1,-2,-1,-1,-1,-2,-1,-3,1,-2,0,-2,-1,3,8,-1,-3,-1,0,0,-7,0,0,0,-2,-1 }; 46 | static double rowS[] = { 1,-2,0,0,-1,0,-1,-1,0,-2,-2,0,-1,-1,-1,4,2,-1,-3,-2,-7,0,-1,0,0,0 }; 47 | static double rowT[] = { 1,-2,-1,-2,-2,-2,-2,0,-1,0,0,1,0,0,-3,2,5,1,-5,-1,-7,0,-1,0,0,0 }; 48 | static double rowV[] = { 1,-2,-2,-3,1,-3,-3,4,-2,1,0,-2,-4,-3,-1,-1,1,5,-3,1,-7,0,-3,0,-2,0 }; 49 | static double rowW[] = { -5,-2,-4,-1,1,1,-5,-3,-2,-2,-3,-7,-3,-1,0,-3,-5,-3,20,5,-7,0,-1,0,-5,-2 }; 50 | static double rowY[] = { -4,-6,-1,-2,3,-3,0,-1,-1,3,-1,-4,-2,-1,0,-2,-1,1,5,9,-7,0,-2,0,-3,-1 }; 51 | static double rowSTP[] = { -7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,1,0,-7,0,-7,-7 }; 52 | 
static double rowGAP[] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; 53 | static double rowZ[] = { 0,0,0,5,-4,-2,0,-3,1,-1,-1,-1,0,4,0,-1,-1,-3,-1,-2,-7,0,4,0,0,0 }; 54 | static double rowU[] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; 55 | static double rowB[] = { 0,-2,5,0,-3,0,-2,-2,0,-1,-2,4,-2,-1,-2,0,0,-2,-5,-3,-7,0,0,0,5,-1 }; 56 | static double rowX[] = { 0,-2,-1,-1,-1,-1,-1,0,0,0,0,0,-1,0,-1,0,0,0,-2,-1,-7,0,0,0,-1,-1 }; 57 | 58 | static double *mat[] = { rowA, rowC, rowD, rowE, rowF, rowG, rowH, rowI, 59 | rowK, rowL, rowM, rowN, rowP, rowQ, rowR, rowS, 60 | rowT, rowV, rowW, rowY, rowSTP, rowGAP, 61 | rowZ, rowU, rowB, rowX }; 62 | 63 | return mat; 64 | } 65 | 66 | }; 67 | -------------------------------------------------------------------------------- /src/libseq/AminoAcid.h: -------------------------------------------------------------------------------- 1 | // This may look like C code, but it's really -*- C++ -*- 2 | #ifndef AMINO_ACID_H_ 3 | #define AMINO_ACID_H_ 4 | 5 | #include 6 | #include 7 | 8 | #include "ParseException.h" 9 | 10 | namespace seq { 11 | 12 | /** 13 | * An amino acid. 14 | * 15 | * The amino acid is represented internally using an integer 16 | * representation. This may be helpful for e.g. indexing into an 17 | * array. Therefore, it is possible to both retrieve this internal 18 | * representation with intRep() and construct an AminoAcid from an 19 | * internal representation directly with fromRep(int). 20 | */ 21 | class AminoAcid { 22 | public: 23 | /** 24 | * @name Constants used in the internal representation. 25 | * \sa intRep() and fromRep(int). 
26 | */ 27 | //@{ 28 | static const int AA_A = 0; 29 | static const int AA_C = 1; 30 | static const int AA_D = 2; 31 | static const int AA_E = 3; 32 | static const int AA_F = 4; 33 | static const int AA_G = 5; 34 | static const int AA_H = 6; 35 | static const int AA_I = 7; 36 | static const int AA_K = 8; 37 | static const int AA_L = 9; 38 | static const int AA_M = 10; 39 | static const int AA_N = 11; 40 | static const int AA_P = 12; 41 | static const int AA_Q = 13; 42 | static const int AA_R = 14; 43 | static const int AA_S = 15; 44 | static const int AA_T = 16; 45 | static const int AA_V = 17; 46 | static const int AA_W = 18; 47 | static const int AA_Y = 19; 48 | static const int AA_STP = 20; // translation stop 49 | static const int AA_GAP = 21; 50 | static const int AA_Z = 22; // glutamate (E) or glutamine (Q) 51 | static const int AA_U = 23; // selenocysteine 52 | static const int AA_B = 24; // asparatate (D) or asparagine (N) 53 | static const int AA_X = 25; // any 54 | static const int AA_J = 26; // leucine (L) or isoleucine (I) 55 | //@} 56 | 57 | /** 58 | * @name AminoAcid constants. 
59 | */ 60 | //@{ 61 | static const AminoAcid A; 62 | static const AminoAcid C; 63 | static const AminoAcid D; 64 | static const AminoAcid E; 65 | static const AminoAcid F; 66 | static const AminoAcid G; 67 | static const AminoAcid H; 68 | static const AminoAcid I; 69 | static const AminoAcid K; 70 | static const AminoAcid L; 71 | static const AminoAcid M; 72 | static const AminoAcid N; 73 | static const AminoAcid P; 74 | static const AminoAcid Q; 75 | static const AminoAcid R; 76 | static const AminoAcid S; 77 | static const AminoAcid T; 78 | static const AminoAcid V; 79 | static const AminoAcid W; 80 | static const AminoAcid Y; 81 | static const AminoAcid STP; 82 | static const AminoAcid GAP; 83 | 84 | /* less common amino acids: */ 85 | static const AminoAcid Z; 86 | static const AminoAcid U; 87 | static const AminoAcid B; 88 | static const AminoAcid X; 89 | static const AminoAcid J; 90 | //@} 91 | 92 | /** 93 | * Create an amino acid with value AminoAcid::X (any). 94 | */ 95 | AminoAcid(); 96 | 97 | /** 98 | * Create an amino acid by parsing a character. 99 | * Accepted are the characters from the FASTA file definition. 100 | * 101 | * \sa toChar() 102 | */ 103 | AminoAcid(char c); 104 | 105 | /** 106 | * Create an amino acid using the internal representation directly. 107 | * Only valid representations are accepted, see the AA_* constants. 108 | * Illegal representations are fenced off by an assert() statement. 109 | * 110 | * \sa intRep() 111 | */ 112 | static AminoAcid fromRep(int rep) { 113 | assert(rep >= 0 && rep <= AA_X); 114 | 115 | return AminoAcid(rep); 116 | } 117 | 118 | /** 119 | * Get the uppercase character representation for this amino acid. 120 | * 121 | * \sa AminoAcid(char) 122 | */ 123 | char toChar() const { 124 | return AA_CHAR[rep_]; 125 | } 126 | 127 | /** 128 | * Get the three letter abbreviation for this amino acid. 
129 | * 130 | * Note that AminoAcid::B and AminoAcid::Z are combinations of two 131 | * amino acids, and represented as "One/Two". 132 | * 133 | * \sa toChar() 134 | */ 135 | std::string tla() const; 136 | 137 | /** 138 | * Get the internal representation. 139 | * 140 | * \sa fromRep(int) 141 | */ 142 | int intRep() const { 143 | return rep_; 144 | } 145 | 146 | /** 147 | * Are two amino acids identical ? 148 | */ 149 | bool operator== (const AminoAcid other) const { 150 | return other.rep_ == rep_; 151 | } 152 | 153 | /** 154 | * Are two amino acids different ? 155 | */ 156 | bool operator!= (const AminoAcid other) const { 157 | return !(*this == other); 158 | } 159 | 160 | /** 161 | * So that you can use it as a key for STL containers. 162 | */ 163 | bool operator< (const AminoAcid other) const { return rep_ < other.rep_; } 164 | 165 | private: 166 | static const char AA_CHAR[]; 167 | static const char * const AA_TLA[]; 168 | 169 | explicit AminoAcid(int rep) 170 | : rep_(rep) { 171 | } 172 | 173 | short int rep_; 174 | }; 175 | 176 | /** 177 | * Write the one-letter representation of the amino acid to the stream. 178 | */ 179 | extern std::ostream& operator<< (std::ostream& o, const AminoAcid aa); 180 | 181 | }; 182 | 183 | #endif // AMINO_ACID_H_ 184 | -------------------------------------------------------------------------------- /src/libseq/Nucleotide.h: -------------------------------------------------------------------------------- 1 | // This may look like C code, but it's really -*- C++ -*- 2 | #ifndef NUCLEOTIDE_H_ 3 | #define NUCLEOTIDE_H_ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "ParseException.h" 12 | 13 | namespace seq { 14 | 15 | /** 16 | * A nucleotide, including support for ambiguity codes. 17 | * 18 | * The nucleotide is represented internally using an integer 19 | * representation. This may be helpful for e.g. indexing into a 20 | * table. 
Therefore, it is possible to both retrieve this internal 21 | * representation with intRep() and construct a Nucleotide from an 22 | * internal representation directly with fromRep(int). 23 | */ 24 | class Nucleotide { 25 | public: 26 | /** 27 | * @name Constants used in the internal representation. 28 | * \sa intRep() and fromRep(int). 29 | */ 30 | //@{ 31 | static const int NT_A = 0; 32 | static const int NT_C = 1; 33 | static const int NT_G = 2; 34 | static const int NT_T = 3; 35 | static const int NT_M = 4; 36 | static const int NT_R = 5; 37 | static const int NT_W = 6; 38 | static const int NT_S = 7; 39 | static const int NT_Y = 8; 40 | static const int NT_K = 9; 41 | static const int NT_V = 10; 42 | static const int NT_H = 11; 43 | static const int NT_D = 12; 44 | static const int NT_B = 13; 45 | static const int NT_N = 14; 46 | static const int NT_GAP = 15; 47 | //@} 48 | 49 | /** 50 | * @name Nucleotide constants. 51 | */ 52 | //@{ 53 | static const Nucleotide A; 54 | static const Nucleotide C; 55 | static const Nucleotide G; 56 | static const Nucleotide T; 57 | static const Nucleotide M; 58 | static const Nucleotide R; 59 | static const Nucleotide W; 60 | static const Nucleotide S; 61 | static const Nucleotide Y; 62 | static const Nucleotide K; 63 | static const Nucleotide V; 64 | static const Nucleotide H; 65 | static const Nucleotide D; 66 | static const Nucleotide B; 67 | static const Nucleotide N; 68 | static const Nucleotide GAP; 69 | //@} 70 | 71 | /** 72 | * Create a nucleotide with value Nucleotide::N (any). 73 | */ 74 | Nucleotide(); 75 | 76 | /** 77 | * Create a nucleotide by parsing a character. 78 | * Accepted are the characters from the FASTA file definition. 79 | * 80 | * \sa toChar() 81 | */ 82 | Nucleotide(char c); 83 | 84 | /** 85 | * Create a nucleotide using the internal representation directly. 86 | * Only valid representations are accepted, see the NT_* constants. 87 | * Illegal representations are fenced off by an assert() statement. 
88 | * 89 | * \sa intRep() 90 | */ 91 | static Nucleotide fromRep(int rep) { 92 | assert(rep >= 0 && rep <= NT_GAP); 93 | 94 | return Nucleotide(rep); 95 | } 96 | 97 | /** 98 | * Get the uppercase character representation for this nucleotide. 99 | * 100 | * \sa Nucleotide(char) 101 | */ 102 | char toChar() const { 103 | return NT_CHAR[rep_]; 104 | } 105 | 106 | /** 107 | * Get the internal representation. 108 | * 109 | * \sa fromRep(int) 110 | */ 111 | int intRep() const { 112 | return rep_; 113 | } 114 | 115 | /** 116 | * Are two nucleotides identical ? 117 | */ 118 | bool operator== (const Nucleotide& other) const { 119 | return other.rep_ == rep_; 120 | } 121 | 122 | /** 123 | * Are two nucleotides different ? 124 | */ 125 | bool operator!= (const Nucleotide& other) const { 126 | return !(*this == other); 127 | } 128 | 129 | /** 130 | * Is the nucleotide ambiguous ? Only A,C,G,T are considered non-ambiguous. 131 | * 132 | * \sa sampleAmbiguity() 133 | */ 134 | bool isAmbiguity() const { return rep_ > NT_T; } 135 | 136 | /** 137 | * Replace the (ambiguos) nucleotide with a random non-ambigiuos nucleotide 138 | * that is represented by the ambiguity symbol. 139 | * 140 | * \sa isAmbiguity() 141 | */ 142 | void sampleAmbiguity(); 143 | 144 | Nucleotide reverseComplement() const; 145 | 146 | /** 147 | * Get all non ambiguous nucleotides represented by this nucleotide. 148 | */ 149 | void nonAmbiguousNucleotides(std::vector& result) const; 150 | 151 | /** 152 | * Get the single nucleotide representing all given nucleotides. 153 | */ 154 | static Nucleotide singleNucleotide(std::set& nucleotides); 155 | 156 | /** 157 | * So that you can use it as a key for STL containers. 
158 | */ 159 | bool operator< (const Nucleotide other) const { return rep_ < other.rep_; } 160 | 161 | private: 162 | static const char NT_CHAR[]; 163 | 164 | Nucleotide(int rep) 165 | : rep_(rep) { 166 | } 167 | 168 | short int rep_; 169 | }; 170 | 171 | /** 172 | * Write the character representation of the nucleotide. 173 | */ 174 | extern std::ostream& operator<< (std::ostream& o, const Nucleotide nt); 175 | 176 | inline Nucleotide::Nucleotide(char c) 177 | { 178 | switch (toupper(c)) { 179 | case 'A': rep_ = NT_A; break; 180 | case 'C': rep_ = NT_C; break; 181 | case 'G': rep_ = NT_G; break; 182 | case 'T': case 'U': rep_ = NT_T; break; 183 | case 'M': rep_ = NT_M; break; 184 | case 'R': rep_ = NT_R; break; 185 | case 'W': rep_ = NT_W; break; 186 | case 'S': rep_ = NT_S; break; 187 | case 'Y': rep_ = NT_Y; break; 188 | case 'K': rep_ = NT_K; break; 189 | case 'V': rep_ = NT_V; break; 190 | case 'H': rep_ = NT_H; break; 191 | case 'D': rep_ = NT_D; break; 192 | case 'B': rep_ = NT_B; break; 193 | case 'N': rep_ = NT_N; break; 194 | case '-': rep_ = NT_GAP; break; 195 | default: 196 | throw ParseException 197 | (std::string(), 198 | std::string("Invalid nucleotide character: '") + c + "'", false); 199 | } 200 | } 201 | 202 | }; 203 | 204 | #endif // NUCLEOTIDE_H_ 205 | -------------------------------------------------------------------------------- /src/libseq/NeedlemanWunsh.cpp: -------------------------------------------------------------------------------- 1 | #include "NeedlemanWunsh.h" 2 | 3 | #include 4 | 5 | namespace seq { 6 | 7 | NeedlemanWunsh::NeedlemanWunsh(double gapOpenScore, 8 | double gapExtensionScore, 9 | double **ntWeightMatrix, 10 | double **aaWeightMatrix) 11 | { 12 | gapOpenScore_ = gapOpenScore; 13 | gapExtensionScore_ = gapExtensionScore; 14 | ntWeightMatrix_ = ntWeightMatrix; 15 | aaWeightMatrix_ = aaWeightMatrix; 16 | } 17 | 18 | /* 19 | * A straight-forward implementation of Neeldeman-Wunsh algorithm 20 | * for a pairwise global 
alignment, with the difference that a 21 | * gapOpenScore is not added at the beginning or end of the sequence 22 | * (like ClustalW does). 23 | */ 24 | template 25 | double NeedlemanWunsh::needlemanWunshAlign(std::vector& seq1, 26 | std::vector& seq2, 27 | double** weightMatrix) 28 | { 29 | /* 30 | * Remove gaps, and warn that we did. 31 | */ 32 | bool foundGaps = false; 33 | for (unsigned i = 0; i < seq1.size(); ++i) { 34 | if (seq1[i] == Symbol::GAP) { 35 | if (!foundGaps) { 36 | std::cerr << "Warning: NeedlemanWunsh: sequence contained gaps? " 37 | "Removed them." << std::endl; 38 | foundGaps = true; 39 | } 40 | seq1.erase(seq1.begin() + i); 41 | --i; 42 | } 43 | } 44 | 45 | for (unsigned i = 0; i < seq2.size(); ++i) { 46 | if (seq2[i] == Symbol::GAP) { 47 | if (!foundGaps) { 48 | std::cerr << "Warning: NeedlemanWunsh: sequence contained gaps? " 49 | "Removed them." << std::endl; 50 | foundGaps = true; 51 | } 52 | seq2.erase(seq2.begin() + i); 53 | --i; 54 | } 55 | } 56 | 57 | const int seq1Size = seq1.size(); 58 | const int seq2Size = seq2.size(); 59 | 60 | double **dnTable = new double* [seq1Size+1]; 61 | for (unsigned i = 0; i < seq1Size+1; ++i) 62 | dnTable[i] = new double[seq2Size+1]; 63 | int **gapsLengthTable = new int *[seq1Size+1]; 64 | for (unsigned i = 0; i < seq1Size+1; ++i) 65 | gapsLengthTable[i] = new int[seq2Size+1]; // >0: horiz, <0: vert 66 | 67 | double edgeGapExtensionScore = 0; 68 | 69 | /* 70 | * compute table 71 | */ 72 | dnTable[0][0] = 0; 73 | gapsLengthTable[0][0] = 0; 74 | for (unsigned i = 1; i < seq1Size+1; ++i) { 75 | dnTable[i][0] = dnTable[i-1][0] + edgeGapExtensionScore; 76 | gapsLengthTable[i][0] = gapsLengthTable[i-1][0] + 1; 77 | } 78 | for (unsigned j = 1; j < seq2Size+1; ++j) { 79 | dnTable[0][j] = dnTable[0][j-1] + edgeGapExtensionScore; 80 | gapsLengthTable[0][j] = gapsLengthTable[0][j-1] - 1; 81 | } 82 | 83 | for (unsigned i = 1; i < seq1Size+1; ++i) { 84 | for (unsigned j = 1; j < seq2Size+1; ++j) { 85 | 86 | double 
sextend 87 | = dnTable[i-1][j-1] 88 | + weightMatrix[seq1[i-1].intRep()][seq2[j-1].intRep()]; 89 | 90 | double ges = (j == seq2Size) ? edgeGapExtensionScore : gapExtensionScore_; 91 | 92 | double horizGapScore = ((gapsLengthTable[i-1][j] > 0) || (j == seq2Size) 93 | ? ges : gapOpenScore_ + ges); 94 | double sgaphoriz 95 | = dnTable[i-1][j] + horizGapScore; 96 | 97 | ges = (i == seq1Size) ? edgeGapExtensionScore : gapExtensionScore_; 98 | 99 | double vertGapScore = (gapsLengthTable[i][j-1] < 0 || (i == seq1Size) 100 | ? ges : gapOpenScore_ + ges); 101 | double sgapvert 102 | = dnTable[i][j-1] + vertGapScore; 103 | 104 | if ((sextend >= sgaphoriz) && (sextend >= sgapvert)) { 105 | dnTable[i][j] = sextend; 106 | gapsLengthTable[i][j] = 0; 107 | } else { 108 | if (sgaphoriz > sgapvert) { 109 | dnTable[i][j] = sgaphoriz; 110 | gapsLengthTable[i][j] = std::max(0, gapsLengthTable[i-1][j]) + 1; 111 | } else { 112 | dnTable[i][j] = sgapvert; 113 | gapsLengthTable[i][j] = std::min(0, gapsLengthTable[i][j-1]) - 1; 114 | } 115 | } 116 | } 117 | } 118 | 119 | /* 120 | * reconstruct best solution alignment. 
121 | */ 122 | int i = seq1Size+1, j = seq2Size+1; 123 | do { 124 | if (gapsLengthTable[i-1][j-1] == 0) { 125 | --i; --j; 126 | } else if (gapsLengthTable[i-1][j-1] > 0) { 127 | --i; 128 | seq2.insert(seq2.begin() + (j-1), Symbol::GAP); 129 | } else { 130 | --j; 131 | seq1.insert(seq1.begin() + (i-1), Symbol::GAP); 132 | } 133 | } while (i > 1 || j > 1); 134 | 135 | double score = dnTable[seq1Size][seq2Size]; 136 | 137 | for (unsigned i = 0; i < seq1Size+1; ++i) { 138 | delete[] dnTable[i]; 139 | delete[] gapsLengthTable[i]; 140 | } 141 | delete[] dnTable; 142 | delete[] gapsLengthTable; 143 | 144 | return score; 145 | } 146 | 147 | double NeedlemanWunsh::align(NTSequence& seq1, NTSequence& seq2) 148 | { 149 | return needlemanWunshAlign(seq1, seq2, ntWeightMatrix_); 150 | } 151 | 152 | double NeedlemanWunsh::align(AASequence& seq1, AASequence& seq2) 153 | { 154 | return needlemanWunshAlign(seq1, seq2, aaWeightMatrix_); 155 | } 156 | 157 | double NeedlemanWunsh::computeAlignScore(const NTSequence& seq1, 158 | const NTSequence& seq2) 159 | { 160 | double score = 0; 161 | int seq1GapLength = 0; 162 | int seq2GapLength = 0; 163 | 164 | bool seq1LeadingGap = true; 165 | bool seq2LeadingGap = true; 166 | 167 | double edgeGapExtensionScore = 0; 168 | 169 | for (unsigned i = 0; i < seq1.size(); ++i) { 170 | if (seq1[i] == Nucleotide::GAP) { 171 | ++seq1GapLength; 172 | } else { 173 | if (seq1GapLength) { 174 | if (seq1LeadingGap) 175 | score += seq1GapLength * edgeGapExtensionScore; 176 | else 177 | score += gapOpenScore_ + seq1GapLength * gapExtensionScore_; 178 | } 179 | seq1GapLength = 0; 180 | 181 | if (seq2[i] == Nucleotide::GAP) { 182 | ++seq2GapLength; 183 | } else { 184 | if (seq2GapLength) { 185 | if (seq2LeadingGap) 186 | score += seq2GapLength * edgeGapExtensionScore; 187 | else 188 | score += gapOpenScore_ + seq2GapLength * gapExtensionScore_; 189 | } 190 | seq2GapLength = 0; 191 | 192 | score += ntWeightMatrix_[seq1[i].intRep()][seq2[i].intRep()]; 193 | } 194 
| } 195 | } 196 | 197 | score += seq1GapLength * edgeGapExtensionScore; 198 | score += seq2GapLength * edgeGapExtensionScore; 199 | 200 | return score; 201 | } 202 | 203 | } 204 | -------------------------------------------------------------------------------- /src/libseq/NTSequence.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "NTSequence.h" 5 | #include "ParseException.h" 6 | 7 | namespace seq { 8 | 9 | NTSequence::NTSequence() 10 | : std::vector() 11 | { } 12 | 13 | NTSequence::NTSequence(unsigned size) 14 | : std::vector(size) 15 | { } 16 | 17 | NTSequence::NTSequence(const std::string name, const std::string description, 18 | const std::string aSeqString, 19 | bool sampleAmbiguities) 20 | : std::vector(aSeqString.length()), 21 | name_(name), 22 | description_(description) 23 | { 24 | for (unsigned i = 0; i < aSeqString.length(); ++i) { 25 | try { 26 | Nucleotide nt(aSeqString[i]); 27 | if (sampleAmbiguities) 28 | nt.sampleAmbiguity(); 29 | 30 | (*this)[i] = nt; 31 | } catch (ParseException& e) { 32 | throw ParseException(name, e.message(), e.recovered()); 33 | } 34 | } 35 | } 36 | 37 | NTSequence::NTSequence(const const_iterator first, 38 | const const_iterator last) 39 | : std::vector(first, last) 40 | { } 41 | 42 | void NTSequence::sampleAmbiguities() 43 | { 44 | for (unsigned i = 0; i < size(); ++i) { 45 | (*this)[i].sampleAmbiguity(); 46 | } 47 | } 48 | 49 | NTSequence NTSequence::reverseComplement() const 50 | { 51 | NTSequence result(size()); 52 | result.name_ = name_; 53 | result.description_ = description_; 54 | 55 | for (unsigned i = 0; i < size(); ++i) 56 | result[size() - i - 1] = (*this)[i].reverseComplement(); 57 | 58 | return result; 59 | } 60 | 61 | void NTSequence::nonAmbiguousSequences(std::vector& result) const 62 | { 63 | iterateNonAmbiguous(NTSequence(), result); 64 | } 65 | 66 | void NTSequence::iterateNonAmbiguous(const NTSequence& head, 67 | std::vector& 
result) const 68 | { 69 | /* 70 | * find the next ambigous codon (if any) 71 | */ 72 | NTSequence s = head; 73 | unsigned i = head.size(); 74 | for (; i < size(); ++i) 75 | if ((*this)[i].isAmbiguity()) 76 | break; 77 | else 78 | s.push_back((*this)[i]); 79 | 80 | if (i == size()) 81 | result.push_back(s); 82 | else { 83 | std::vector ambiguities; 84 | (*this)[i].nonAmbiguousNucleotides(ambiguities); 85 | for (unsigned i = 0; i < ambiguities.size(); ++i) { 86 | s.push_back(ambiguities[i]); 87 | iterateNonAmbiguous(s, result); 88 | s.pop_back(); 89 | } 90 | } 91 | } 92 | 93 | std::string NTSequence::asString() const 94 | { 95 | std::string result(size(), '-'); 96 | 97 | for (unsigned i = 0; i < size(); ++i) { 98 | result[i] = (*this)[i].toChar(); 99 | } 100 | 101 | return result; 102 | } 103 | 104 | /// \cond 105 | 106 | void readFastaEntry(std::istream& i, 107 | std::string& name, 108 | std::string& description, 109 | std::string& sequence) 110 | { 111 | char ch; 112 | char c[512]; 113 | 114 | i.getline(c, 511); 115 | if (i) { 116 | if (c[0] != '>') { 117 | throw ParseException(std::string(), 118 | std::string("FASTA file expected '>', got: '") 119 | + c[0] + "'", false); 120 | } 121 | 122 | std::string nameDesc = c + 1; 123 | std::string::size_type spacepos = nameDesc.find(" "); 124 | name = nameDesc.substr(0, spacepos); 125 | description = (spacepos == std::string::npos 126 | ? "" 127 | : nameDesc.substr(spacepos)); 128 | 129 | for (ch = i.get(); (ch != EOF) && (ch != '>'); ch = i.get()) { 130 | if ((ch != '\n') && (ch != '\r') && (ch != ' ')) { 131 | if (((ch >= 'a') && (ch <= 'z')) 132 | || ((ch >= 'A') && (ch <= 'Z')) 133 | || (ch == '-') || (ch == '*')) { 134 | sequence += ch; 135 | } else { 136 | char failedCh = ch; 137 | /* 138 | * Wind further to the next possible sequence. 
139 | */ 140 | for (ch = i.get(); (ch != EOF) && (ch != '>'); ch = i.get()) 141 | ; 142 | 143 | if (ch == '>') 144 | i.putback(ch); 145 | 146 | throw ParseException 147 | (name, std::string("Illegal character in FASTA: '") 148 | + (char)failedCh + "'", true); 149 | } 150 | } 151 | 152 | if (i.peek() == EOF) 153 | break; 154 | } 155 | 156 | if (ch == '>') 157 | i.putback(ch); 158 | } 159 | } 160 | 161 | void writeFastaEntry(std::ostream& o, 162 | const std::string& name, 163 | const std::string& description, 164 | const std::string& sequence) 165 | { 166 | o << ">" << name << " " << description << std::endl; 167 | if (sequence.size() == 0) 168 | o << std::endl; 169 | else { 170 | for (unsigned i = 0; i <= (sequence.size() - 1) / 60; ++i) { 171 | int s = i * 60; 172 | o << sequence.substr(s, 60) << std::endl; 173 | } 174 | } 175 | } 176 | 177 | void writeStockholm(std::ostream& o, const std::vector& sequences, int length, int labelsize, int seqsize, int pos) 178 | { 179 | if(labelsize < 1 && seqsize < 1){ 180 | for(std::vector::const_iterator i = sequences.begin(); 181 | i < sequences.end(); ++i){ 182 | labelsize = std::max(labelsize, (int)i->name().length()); 183 | seqsize = std::max(seqsize, (int)i->size()); 184 | } 185 | 186 | o << "# STOCKHOLM 1.0" << std::endl; 187 | } 188 | 189 | int epos = pos+length - (labelsize + 1); 190 | for(std::vector::const_iterator i = sequences.begin(); 191 | i < sequences.end(); ++i){ 192 | o << i->name(); 193 | for(int j = 0; j < labelsize - i->name().length() + 1; ++j) 194 | o << ' '; 195 | 196 | int n = std::min(epos, (int)i->size()); 197 | for(int spos=pos; spos= seqsize){ 204 | o << "//"; 205 | } 206 | else{ 207 | writeStockholm(o, sequences, length, labelsize, seqsize, epos); 208 | } 209 | } 210 | 211 | /// \endcond 212 | 213 | std::istream& operator>>(std::istream& i, NTSequence& sequence) 214 | { 215 | std::string name, description, seqString; 216 | 217 | readFastaEntry(i, name, description, seqString); 218 | sequence = 
NTSequence(name, description, seqString); 219 | 220 | return i; 221 | } 222 | 223 | std::ostream& operator<<(std::ostream& o, const NTSequence& sequence) 224 | { 225 | writeFastaEntry(o, sequence.name(), sequence.description(), 226 | sequence.asString()); 227 | return o; 228 | } 229 | 230 | }; 231 | -------------------------------------------------------------------------------- /src/libseq/Nucleotide.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "ParseException.h" 7 | #include "Nucleotide.h" 8 | 9 | namespace { 10 | 11 | #ifdef _WIN32 12 | 13 | double drand48() 14 | { 15 | return (double(rand()) / RAND_MAX); 16 | } 17 | 18 | #endif 19 | 20 | int sampleUniform(int one, int two) 21 | { 22 | double d = drand48(); 23 | 24 | return (d < 0.5 ? one : two); 25 | } 26 | 27 | int sampleUniform(int one, int two, int three) 28 | { 29 | double d = drand48() * 3.; 30 | 31 | return (d < 1. ? one : (d < 2. ? two : three)); 32 | } 33 | 34 | int sampleUniform(int one, int two, int three, int four) 35 | { 36 | double d = drand48() * 4.; 37 | 38 | return (d < 1. ? one : (d < 2. ? two : (d < 3. ? 
three : four))); 39 | } 40 | }; 41 | 42 | namespace seq { 43 | 44 | const char Nucleotide::NT_CHAR[] = {'A', 'C', 'G', 'T', 45 | 'M', 'R', 'W', 'S', 46 | 'Y', 'K', 'V', 'H', 47 | 'D', 'B', 'N', '-' }; 48 | 49 | const Nucleotide Nucleotide::A(Nucleotide::NT_A); 50 | const Nucleotide Nucleotide::C(Nucleotide::NT_C); 51 | const Nucleotide Nucleotide::G(Nucleotide::NT_G); 52 | const Nucleotide Nucleotide::T(Nucleotide::NT_T); 53 | const Nucleotide Nucleotide::M(Nucleotide::NT_M); 54 | const Nucleotide Nucleotide::R(Nucleotide::NT_R); 55 | const Nucleotide Nucleotide::W(Nucleotide::NT_W); 56 | const Nucleotide Nucleotide::S(Nucleotide::NT_S); 57 | const Nucleotide Nucleotide::Y(Nucleotide::NT_Y); 58 | const Nucleotide Nucleotide::K(Nucleotide::NT_K); 59 | const Nucleotide Nucleotide::V(Nucleotide::NT_V); 60 | const Nucleotide Nucleotide::H(Nucleotide::NT_H); 61 | const Nucleotide Nucleotide::D(Nucleotide::NT_D); 62 | const Nucleotide Nucleotide::B(Nucleotide::NT_B); 63 | const Nucleotide Nucleotide::N(Nucleotide::NT_N); 64 | const Nucleotide Nucleotide::GAP(Nucleotide::NT_GAP); 65 | 66 | Nucleotide::Nucleotide() 67 | : rep_(NT_N) 68 | { } 69 | 70 | void Nucleotide::sampleAmbiguity() 71 | { 72 | switch (rep_) { 73 | case NT_A: 74 | case NT_C: 75 | case NT_G: 76 | case NT_T: 77 | case NT_GAP: 78 | break; 79 | case NT_M: 80 | rep_ = sampleUniform(NT_A, NT_C); break; 81 | case NT_R: 82 | rep_ = sampleUniform(NT_A, NT_G); break; 83 | case NT_W: 84 | rep_ = sampleUniform(NT_A, NT_T); break; 85 | case NT_S: 86 | rep_ = sampleUniform(NT_C, NT_G); break; 87 | case NT_Y: 88 | rep_ = sampleUniform(NT_C, NT_T); break; 89 | case NT_K: 90 | rep_ = sampleUniform(NT_G, NT_T); break; 91 | case NT_V: 92 | rep_ = sampleUniform(NT_A, NT_C, NT_G); break; 93 | case NT_H: 94 | rep_ = sampleUniform(NT_A, NT_C, NT_T); break; 95 | case NT_D: 96 | rep_ = sampleUniform(NT_A, NT_G, NT_T); break; 97 | case NT_B: 98 | rep_ = sampleUniform(NT_C, NT_G, NT_T); break; 99 | case NT_N: 100 | rep_ = 
sampleUniform(NT_A, NT_C, NT_G, NT_T); break; 101 | default: 102 | std::cerr << rep_ << std::endl; 103 | assert(false); 104 | } 105 | } 106 | 107 | Nucleotide Nucleotide::reverseComplement() const 108 | { 109 | switch (rep_) { 110 | case NT_A: return NT_T; 111 | case NT_C: return NT_G; 112 | case NT_G: return NT_C; 113 | case NT_T: return NT_A; 114 | case NT_GAP: return NT_GAP; 115 | case NT_M: return /* AC -> TG */ NT_K; 116 | case NT_R: return /* AG -> TC */ NT_Y; 117 | case NT_W: return /* AT -> TA */ NT_W; 118 | case NT_S: return /* CG -> GC */ NT_S; 119 | case NT_Y: return /* CT -> GA */ NT_R; 120 | case NT_K: return /* GT -> CA */ NT_M; 121 | case NT_V: return /* ACG -> TGC */ NT_B; 122 | case NT_H: return /* ACT -> TGA */ NT_D; 123 | case NT_D: return /* AGT -> TCA */ NT_H; 124 | case NT_B: return /* CGT -> GCA */ NT_V; 125 | case NT_N: return NT_N; 126 | default: 127 | std::cerr << rep_ << std::endl; 128 | assert(false); 129 | } 130 | } 131 | 132 | Nucleotide Nucleotide::singleNucleotide(std::set& nucleotides) 133 | { 134 | std::set::iterator itgap = nucleotides.find(GAP); 135 | if(itgap != nucleotides.end()) 136 | nucleotides.erase(itgap); 137 | 138 | if (nucleotides.size() == 1) 139 | return *nucleotides.begin(); 140 | 141 | std::set all; 142 | for(std::set::iterator it = nucleotides.begin(); it != nucleotides.end(); ++it) { 143 | std::vector t; 144 | it->nonAmbiguousNucleotides(t); 145 | all.insert(t.begin(), t.end()); 146 | } 147 | bool nta = all.find(A) != all.end(); 148 | bool ntc = all.find(C) != all.end(); 149 | bool ntg = all.find(G) != all.end(); 150 | bool ntt = all.find(T) != all.end(); 151 | 152 | if (nta && ntc && ntg && ntt) 153 | return N; 154 | if (nta && ntc && ntg) 155 | return V; 156 | if (nta && ntc && ntt) 157 | return H; 158 | if (nta && ntg && ntt) 159 | return D; 160 | if (ntc && ntg && ntt) 161 | return B; 162 | if (nta && ntc) 163 | return M; 164 | if (ntg && ntt) 165 | return K; 166 | if (nta && ntt) 167 | return W; 168 | if (ntg 
&& ntc) 169 | return S; 170 | if (ntc && ntt) 171 | return Y; 172 | if (nta && ntg) 173 | return R; 174 | 175 | throw std::runtime_error 176 | ("Internal error in Nucleotide::singleNucleotide()"); 177 | } 178 | 179 | /** 180 | * Get all non ambiguous nucleotides represented by this nucleotide. 181 | */ 182 | void Nucleotide::nonAmbiguousNucleotides(std::vector& result) const 183 | { 184 | switch (rep_) { 185 | case NT_A: 186 | case NT_C: 187 | case NT_G: 188 | case NT_T: 189 | case NT_GAP: 190 | result.push_back(*this); 191 | break; 192 | case NT_M: 193 | result.push_back(A); 194 | result.push_back(C); 195 | break; 196 | case NT_R: 197 | result.push_back(A); 198 | result.push_back(G); 199 | break; 200 | case NT_W: 201 | result.push_back(A); 202 | result.push_back(T); 203 | break; 204 | case NT_S: 205 | result.push_back(C); 206 | result.push_back(G); 207 | break; 208 | case NT_Y: 209 | result.push_back(C); 210 | result.push_back(T); 211 | break; 212 | case NT_K: 213 | result.push_back(G); 214 | result.push_back(T); 215 | break; 216 | case NT_V: 217 | result.push_back(A); 218 | result.push_back(C); 219 | result.push_back(G); 220 | break; 221 | case NT_H: 222 | result.push_back(A); 223 | result.push_back(C); 224 | result.push_back(T); 225 | break; 226 | case NT_D: 227 | result.push_back(A); 228 | result.push_back(G); 229 | result.push_back(T); 230 | break; 231 | case NT_B: 232 | result.push_back(C); 233 | result.push_back(G); 234 | result.push_back(T); 235 | break; 236 | case NT_N: 237 | result.push_back(A); 238 | result.push_back(C); 239 | result.push_back(G); 240 | result.push_back(T); 241 | break; 242 | default: 243 | std::cerr << rep_ << std::endl; 244 | assert(false); 245 | } 246 | } 247 | 248 | 249 | std::ostream& operator<< (std::ostream& s, const Nucleotide nt) 250 | { 251 | return s << nt.toChar(); 252 | } 253 | 254 | }; 255 | -------------------------------------------------------------------------------- /src/mxml/README: 
-------------------------------------------------------------------------------- 1 | README - 2011-12-20 2 | ------------------- 3 | 4 | 5 | INTRODUCTION 6 | 7 | This README file describes the Mini-XML library version 2.7. 8 | 9 | Mini-XML is a small XML parsing library that you can use to read XML and 10 | XML-like data files in your application without requiring large non-standard 11 | libraries. Mini-XML only requires an ANSI C compatible compiler (GCC works, 12 | as do most vendors' ANSI C compilers) and a "make" program. 13 | 14 | Mini-XML provides the following functionality: 15 | 16 | - Reading of UTF-8 and UTF-16 and writing of UTF-8 encoded XML files and 17 | strings. 18 | - Data is stored in a linked-list tree structure, preserving the XML 19 | data hierarchy. 20 | - Supports arbitrary element names, attributes, and attribute values 21 | with no preset limits, just available memory. 22 | - Supports integer, real, opaque ("cdata"), and text data types in 23 | "leaf" nodes. 24 | - Functions for creating and managing trees of data. 25 | - "Find" and "walk" functions for easily locating and navigating trees 26 | of data. 27 | 28 | Mini-XML doesn't do validation or other types of processing on the data 29 | based upon schema files or other sources of definition information. 
30 | 31 | 32 | BUILDING Mini-XML 33 | 34 | Mini-XML comes with an autoconf-based configure script; just type the 35 | following command to get things going: 36 | 37 | ./configure 38 | 39 | The default install prefix is /usr/local, which can be overridden using the 40 | --prefix option: 41 | 42 | ./configure --prefix=/foo 43 | 44 | Other configure options can be found using the --help option: 45 | 46 | ./configure --help 47 | 48 | Once you have configured the software, type "make" to do the build and run 49 | the test program to verify that things are working, as follows: 50 | 51 | make 52 | 53 | If you are using Mini-XML under Microsoft Windows with Visual C++ 2008, use 54 | the included project files in the "vcnet" subdirectory to build the library 55 | instead. 56 | 57 | 58 | INSTALLING Mini-XML 59 | 60 | The "install" target will install Mini-XML in the lib and include 61 | directories: 62 | 63 | make install 64 | 65 | Once you have installed it, use the "-lmxml" option to link your application 66 | against it. 67 | 68 | 69 | DOCUMENTATION 70 | 71 | The documentation is available in the "doc" subdirectory in the files 72 | "mxml.html" (HTML) and "mxml.pdf" (PDF). You can also look at the 73 | "testmxml.c" and "mxmldoc.c" source files for examples of using Mini-XML. 74 | 75 | Mini-XML provides a single header file which you include: 76 | 77 | #include <mxml.h> 78 | 79 | Nodes are defined by the "mxml_node_t" structure; the "type" member defines 80 | the node type (element, integer, opaque, real, or text) which determines 81 | which value you want to look at in the "value" union. New nodes can be 82 | created using the "mxmlNewElement()", "mxmlNewInteger()", "mxmlNewOpaque()", 83 | "mxmlNewReal()", and "mxmlNewText()" functions. Only elements can have 84 | child nodes, and the top node must be an element, usually "?xml".
85 | 86 | You load an XML file using the "mxmlLoadFile()" function: 87 | 88 | FILE *fp; 89 | mxml_node_t *tree; 90 | 91 | fp = fopen("filename.xml", "r"); 92 | tree = mxmlLoadFile(NULL, fp, MXML_NO_CALLBACK); 93 | fclose(fp); 94 | 95 | Similarly, you save an XML file using the "mxmlSaveFile()" function: 96 | 97 | FILE *fp; 98 | mxml_node_t *tree; 99 | 100 | fp = fopen("filename.xml", "w"); 101 | mxmlSaveFile(tree, fp, MXML_NO_CALLBACK); 102 | fclose(fp); 103 | 104 | The "mxmlLoadString()", "mxmlSaveAllocString()", and "mxmlSaveString()" 105 | functions load XML node trees from and save XML node trees to strings: 106 | 107 | char buffer[8192]; 108 | char *ptr; 109 | mxml_node_t *tree; 110 | 111 | ... 112 | tree = mxmlLoadString(NULL, buffer, MXML_NO_CALLBACK); 113 | 114 | ... 115 | mxmlSaveString(tree, buffer, sizeof(buffer), MXML_NO_CALLBACK); 116 | 117 | ... 118 | ptr = mxmlSaveAllocString(tree, MXML_NO_CALLBACK); 119 | 120 | You can find a named element/node using the "mxmlFindElement()" function: 121 | 122 | mxml_node_t *node = mxmlFindElement(tree, tree, "name", "attr", 123 | "value", MXML_DESCEND); 124 | 125 | The "name", "attr", and "value" arguments can be passed as NULL to act as 126 | wildcards, e.g.: 127 | 128 | /* Find the first "a" element */ 129 | node = mxmlFindElement(tree, tree, "a", NULL, NULL, MXML_DESCEND); 130 | 131 | /* Find the first "a" element with "href" attribute */ 132 | node = mxmlFindElement(tree, tree, "a", "href", NULL, MXML_DESCEND); 133 | 134 | /* Find the first "a" element with "href" to a URL */ 135 | node = mxmlFindElement(tree, tree, "a", "href", 136 | "http://www.minixml.org/", 137 | MXML_DESCEND); 138 | 139 | /* Find the first element with a "src" attribute*/ 140 | node = mxmlFindElement(tree, tree, NULL, "src", NULL, MXML_DESCEND); 141 | 142 | /* Find the first element with a "src" = "foo.jpg" */ 143 | node = mxmlFindElement(tree, tree, NULL, "src", "foo.jpg", 144 | MXML_DESCEND); 145 | 146 | You can also iterate with the 
same function: 147 | 148 | mxml_node_t *node; 149 | 150 | for (node = mxmlFindElement(tree, tree, "name", NULL, NULL, 151 | MXML_DESCEND); 152 | node != NULL; 153 | node = mxmlFindElement(node, tree, "name", NULL, NULL, 154 | MXML_DESCEND)) 155 | { 156 | ... do something ... 157 | } 158 | 159 | The "mxmlFindPath()" function finds the (first) value node under a specific 160 | element using a "path": 161 | 162 | mxml_node_t *value = mxmlFindPath(tree, "path/to/*/foo/bar"); 163 | 164 | The "mxmlGetInteger()", "mxmlGetOpaque()", "mxmlGetReal()", and 165 | "mxmlGetText()" functions retrieve the value from a node: 166 | 167 | mxml_node_t *node; 168 | 169 | int intvalue = mxmlGetInteger(node); 170 | 171 | const char *opaquevalue = mxmlGetOpaque(node); 172 | 173 | double realvalue = mxmlGetReal(node); 174 | 175 | int whitespacevalue; 176 | const char *textvalue = mxmlGetText(node, &whitespacevalue); 177 | 178 | Finally, once you are done with the XML data, use the "mxmlDelete()" 179 | function to recursively free the memory that is used for a particular node 180 | or the entire tree: 181 | 182 | mxmlDelete(tree); 183 | 184 | 185 | GETTING HELP AND REPORTING PROBLEMS 186 | 187 | The Mini-XML web site provides access to a discussion forum and bug 188 | reporting page: 189 | 190 | http://www.minixml.org/ 191 | 192 | 193 | LEGAL STUFF 194 | 195 | The Mini-XML library is Copyright 2003-2011 by Michael Sweet. License terms 196 | are described in the file "COPYING". 
197 | -------------------------------------------------------------------------------- /references/CHIKV/CHIKV-NC004162-gp1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /src/mxml/mxml-search.c: -------------------------------------------------------------------------------- 1 | /* 2 | * "$Id: mxml-search.c 427 2011-01-03 02:03:29Z mike $" 3 | * 4 | * Search/navigation functions for Mini-XML, a small XML-like file 5 | * parsing library. 6 | * 7 | * Copyright 2003-2010 by Michael R Sweet. 8 | * 9 | * These coded instructions, statements, and computer programs are the 10 | * property of Michael R Sweet and are protected by Federal copyright 11 | * law. Distribution and use rights are outlined in the file "COPYING" 12 | * which should have been included with this file. If this file is 13 | * missing or damaged, see the license at: 14 | * 15 | * http://www.minixml.org/ 16 | * 17 | * Contents: 18 | * 19 | * mxmlFindElement() - Find the named element. 20 | * mxmlFindValue() - Find a value with the given path. 21 | * mxmlWalkNext() - Walk to the next logical node in the tree. 22 | * mxmlWalkPrev() - Walk to the previous logical node in the tree. 23 | */ 24 | 25 | /* 26 | * Include necessary headers... 27 | */ 28 | 29 | #include "config.h" 30 | #include "mxml.h" 31 | 32 | 33 | /* 34 | * 'mxmlFindElement()' - Find the named element. 35 | * 36 | * The search is constrained by the name, attribute name, and value; any 37 | * NULL names or values are treated as wildcards, so different kinds of 38 | * searches can be implemented by looking for all elements of a given name 39 | * or all elements with a specific attribute. 
The descend argument determines 40 | * whether the search descends into child nodes; normally you will use 41 | * MXML_DESCEND_FIRST for the initial search and MXML_NO_DESCEND to find 42 | * additional direct descendents of the node. The top node argument 43 | * constrains the search to a particular node's children. 44 | */ 45 | 46 | mxml_node_t * /* O - Element node or NULL */ 47 | mxmlFindElement(mxml_node_t *node, /* I - Current node */ 48 | mxml_node_t *top, /* I - Top node */ 49 | const char *name, /* I - Element name or NULL for any */ 50 | const char *attr, /* I - Attribute name, or NULL for none */ 51 | const char *value, /* I - Attribute value, or NULL for any */ 52 | int descend) /* I - Descend into tree - MXML_DESCEND, MXML_NO_DESCEND, or MXML_DESCEND_FIRST */ 53 | { 54 | const char *temp; /* Current attribute value */ 55 | 56 | 57 | /* 58 | * Range check input... 59 | */ 60 | 61 | if (!node || !top || (!attr && value)) 62 | return (NULL); 63 | 64 | /* 65 | * Start with the next node... 66 | */ 67 | 68 | node = mxmlWalkNext(node, top, descend); 69 | 70 | /* 71 | * Loop until we find a matching element... 72 | */ 73 | 74 | while (node != NULL) 75 | { 76 | /* 77 | * See if this node matches... 78 | */ 79 | 80 | if (node->type == MXML_ELEMENT && 81 | node->value.element.name && 82 | (!name || !strcmp(node->value.element.name, name))) 83 | { 84 | /* 85 | * See if we need to check for an attribute... 86 | */ 87 | 88 | if (!attr) 89 | return (node); /* No attribute search, return it... */ 90 | 91 | /* 92 | * Check for the attribute... 93 | */ 94 | 95 | if ((temp = mxmlElementGetAttr(node, attr)) != NULL) 96 | { 97 | /* 98 | * OK, we have the attribute, does it match? 99 | */ 100 | 101 | if (!value || !strcmp(value, temp)) 102 | return (node); /* Yes, return it... */ 103 | } 104 | } 105 | 106 | /* 107 | * No match, move on to the next node... 
108 | */ 109 | 110 | if (descend == MXML_DESCEND) 111 | node = mxmlWalkNext(node, top, MXML_DESCEND); 112 | else 113 | node = node->next; 114 | } 115 | 116 | return (NULL); 117 | } 118 | 119 | 120 | /* 121 | * 'mxmlFindPath()' - Find a node with the given path. 122 | * 123 | * The "path" is a slash-separated list of element names. The name "*" is 124 | * considered a wildcard for one or more levels of elements. For example, 125 | * "foo/one/two", "bar/two/one", "*\/one", and so forth. 126 | * 127 | * The first child node of the found node is returned if the given node has 128 | * children and the first child is a value node. 129 | * 130 | * @since Mini-XML 2.7@ 131 | */ 132 | 133 | mxml_node_t * /* O - Found node or NULL */ 134 | mxmlFindPath(mxml_node_t *top, /* I - Top node */ 135 | const char *path) /* I - Path to element */ 136 | { 137 | mxml_node_t *node; /* Current node */ 138 | char element[256]; /* Current element name */ 139 | const char *pathsep; /* Separator in path */ 140 | int descend; /* mxmlFindElement option */ 141 | 142 | 143 | /* 144 | * Range check input... 145 | */ 146 | 147 | if (!top || !path || !*path) 148 | return (NULL); 149 | 150 | /* 151 | * Search each element in the path... 152 | */ 153 | 154 | node = top; 155 | while (*path) 156 | { 157 | /* 158 | * Handle wildcards... 159 | */ 160 | 161 | if (!strncmp(path, "*/", 2)) 162 | { 163 | path += 2; 164 | descend = MXML_DESCEND; 165 | } 166 | else 167 | descend = MXML_DESCEND_FIRST; 168 | 169 | /* 170 | * Get the next element in the path... 171 | */ 172 | 173 | if ((pathsep = strchr(path, '/')) == NULL) 174 | pathsep = path + strlen(path); 175 | 176 | if (pathsep == path || (pathsep - path) >= sizeof(element)) 177 | return (NULL); 178 | 179 | memcpy(element, path, pathsep - path); 180 | element[pathsep - path] = '\0'; 181 | 182 | if (*pathsep) 183 | path = pathsep + 1; 184 | else 185 | path = pathsep; 186 | 187 | /* 188 | * Search for the element... 
189 | */ 190 | 191 | if ((node = mxmlFindElement(node, node, element, NULL, NULL, 192 | descend)) == NULL) 193 | return (NULL); 194 | } 195 | 196 | /* 197 | * If we get this far, return the node or its first child... 198 | */ 199 | 200 | if (node->child && node->child->type != MXML_ELEMENT) 201 | return (node->child); 202 | else 203 | return (node); 204 | } 205 | 206 | 207 | /* 208 | * 'mxmlWalkNext()' - Walk to the next logical node in the tree. 209 | * 210 | * The descend argument controls whether the first child is considered 211 | * to be the next node. The top node argument constrains the walk to 212 | * the node's children. 213 | */ 214 | 215 | mxml_node_t * /* O - Next node or NULL */ 216 | mxmlWalkNext(mxml_node_t *node, /* I - Current node */ 217 | mxml_node_t *top, /* I - Top node */ 218 | int descend) /* I - Descend into tree - MXML_DESCEND, MXML_NO_DESCEND, or MXML_DESCEND_FIRST */ 219 | { 220 | if (!node) 221 | return (NULL); 222 | else if (node->child && descend) 223 | return (node->child); 224 | else if (node == top) 225 | return (NULL); 226 | else if (node->next) 227 | return (node->next); 228 | else if (node->parent && node->parent != top) 229 | { 230 | node = node->parent; 231 | 232 | while (!node->next) 233 | if (node->parent == top || !node->parent) 234 | return (NULL); 235 | else 236 | node = node->parent; 237 | 238 | return (node->next); 239 | } 240 | else 241 | return (NULL); 242 | } 243 | 244 | 245 | /* 246 | * 'mxmlWalkPrev()' - Walk to the previous logical node in the tree. 247 | * 248 | * The descend argument controls whether the previous node's last child 249 | * is considered to be the previous node. The top node argument constrains 250 | * the walk to the node's children. 
251 | */ 252 | 253 | mxml_node_t * /* O - Previous node or NULL */ 254 | mxmlWalkPrev(mxml_node_t *node, /* I - Current node */ 255 | mxml_node_t *top, /* I - Top node */ 256 | int descend) /* I - Descend into tree - MXML_DESCEND, MXML_NO_DESCEND, or MXML_DESCEND_FIRST */ 257 | { 258 | if (!node || node == top) 259 | return (NULL); 260 | else if (node->prev) 261 | { 262 | if (node->prev->last_child && descend) 263 | { 264 | /* 265 | * Find the last child under the previous node... 266 | */ 267 | 268 | node = node->prev->last_child; 269 | 270 | while (node->last_child) 271 | node = node->last_child; 272 | 273 | return (node); 274 | } 275 | else 276 | return (node->prev); 277 | } 278 | else if (node->parent != top) 279 | return (node->parent); 280 | else 281 | return (NULL); 282 | } 283 | 284 | 285 | /* 286 | * End of "$Id: mxml-search.c 427 2011-01-03 02:03:29Z mike $". 287 | */ 288 | -------------------------------------------------------------------------------- /src/Virulign.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include "ReferenceSequence.h" 11 | #include "Alignment.h" 12 | #include "ResultsExporter.h" 13 | #include "CLIUtils.h" 14 | #include "Utils.h" 15 | 16 | ReferenceSequence loadRefSeq(const std::string& fn) { 17 | if (ends_with(fn, ".fasta")) { 18 | return loadRefSeqFromFile(fn.c_str()); 19 | } else if (ends_with(fn, ".xml")) { 20 | return ReferenceSequence::parseOrfReferenceFile(fn); 21 | } 22 | throw std::runtime_error("Unsupported reference sequence format"); 23 | } 24 | 25 | int main(int argc, char **argv) { 26 | unsigned int i; 27 | 28 | int obligatoryParams = 2; 29 | if(argc < obligatoryParams+1) { 30 | std::cerr << "Usage: virulign [reference.fasta orf-description.xml] sequences.fasta" << std::endl 31 | << "Optional parameters (first option will be the default):" << std::endl 32 | << " --exportKind 
[Mutations PairwiseAlignments GlobalAlignment PositionTable MutationTable]" << std::endl 33 | << " --exportAlphabet [AminoAcids Nucleotides]" << std::endl 34 | << " --exportWithInsertions [yes no]" << std::endl 35 | << " --exportReferenceSequence [no yes]" << std::endl 36 | << " --gapExtensionPenalty doubleValue=>3.3" << std::endl 37 | << " --gapOpenPenalty doubleValue=>10.0" << std::endl 38 | << " --maxFrameShifts intValue=>3" << std::endl 39 | << " --progress [no yes]" << std::endl 40 | << " --nt-debug directory" << std::endl 41 | << "Output: The alignment will be printed to standard out and any progress or error messages will be printed to the standard error. This output can be redirected to files, e.g.:" << std::endl 42 | << " virulign ref.xml sequence.fasta > alignment.mutations 2> alignment.err" << std::endl; 43 | exit(0); 44 | } 45 | 46 | int amountOfParameters = argc - obligatoryParams - 1; 47 | if (amountOfParameters%2 == 1) { 48 | std::cerr << "Please provide parameters as: --parameterName parameterValue" << std::endl; 49 | exit(0); 50 | } 51 | 52 | std::string refSeqFileName = argv[1]; 53 | if (!ends_with(refSeqFileName, ".fasta") && !ends_with(refSeqFileName, ".xml")) { 54 | std::cerr << 55 | "Unknown reference sequence: " 56 | "expected a FASTA file or an XML file that describes the ORF" << std::endl; 57 | exit(1); 58 | } 59 | ReferenceSequence refSeq = loadRefSeq(refSeqFileName); 60 | 61 | std::ifstream f_seqs(argv[2]); 62 | std::vector targets; 63 | 64 | try { 65 | while (f_seqs) { 66 | seq::NTSequence s; 67 | 68 | f_seqs >> s; 69 | 70 | if (f_seqs) { 71 | targets.push_back(s); 72 | } 73 | } 74 | } catch (seq::ParseException& e) { 75 | std::cerr << "Fatal error: " << e.message() << std::endl; 76 | exit(1); 77 | } 78 | 79 | ExportKind exportKind = Mutations; 80 | ExportAlphabet exportAlphabet = AminoAcids; 81 | bool exportWithInsertions = true; 82 | 83 | double gapExtensionPenalty = 3.3; 84 | double gapOpenPenalty = 10.0; 85 | int maxFrameShifts = 3; 
86 | 87 | bool progress = false; 88 | 89 | std::string ntDebugDir; 90 | 91 | char* parameterName; 92 | char* parameterValue; 93 | for(i = obligatoryParams+1; i < amountOfParameters+obligatoryParams; i=i+2) { 94 | parameterName = argv[i]; 95 | parameterValue = argv[i+1]; 96 | if(equalsString(parameterName,"--exportKind")) { 97 | if(equalsString(parameterValue, "Mutations")) { 98 | exportKind = Mutations; 99 | } else if(equalsString(parameterValue, "PairwiseAlignments")) { 100 | exportKind = PairwiseAlignments; 101 | } else if(equalsString(parameterValue, "GlobalAlignment")) { 102 | exportKind = GlobalAlignment; 103 | } else if(equalsString(parameterValue, "PositionTable")) { 104 | exportKind = PositionTable; 105 | } else if(equalsString(parameterValue, "MutationTable")) { 106 | exportKind = MutationTable; 107 | } else { 108 | std::cerr << "Unkown value " << parameterValue << " for parameter : " << parameterName << std::endl; 109 | exit(0); 110 | } 111 | } else if(equalsString(parameterName,"--exportAlphabet")) { 112 | if(equalsString(parameterValue, "AminoAcids")) { 113 | exportAlphabet = AminoAcids; 114 | } else if(equalsString(parameterValue, "Nucleotides")) { 115 | exportAlphabet = Nucleotides; 116 | } else { 117 | std::cerr << "Unkown value " << parameterValue << " for parameter : " << parameterName << std::endl; 118 | exit(0); 119 | } 120 | } else if(equalsString(parameterName,"--exportReferenceSequence")) { 121 | if (equalsString(parameterValue,"yes")) { 122 | seq::NTSequence refNtSeq = refSeq; 123 | targets.insert(targets.begin(), refNtSeq); 124 | } 125 | } else if(equalsString(parameterName,"--exportWithInsertions")) { 126 | if(equalsString(parameterValue,"yes")) { 127 | exportWithInsertions = true; 128 | } else if(equalsString(parameterValue,"no")) { 129 | exportWithInsertions = false; 130 | } else { 131 | std::cerr << "Unkown value " << parameterValue << " for parameter : " << parameterName << std::endl; 132 | exit(0); 133 | } 134 | } else 
if(equalsString(parameterName,"--gapExtensionPenalty")) { 135 | try { 136 | gapExtensionPenalty = lexical_cast(parameterValue); 137 | } catch (std::bad_cast& e) { 138 | std::cerr << "Unkown value " << parameterValue << " for parameter : " << parameterName << std::endl; 139 | exit(0); 140 | } 141 | } else if(equalsString(parameterName,"--gapOpenPenalty")) { 142 | try { 143 | gapOpenPenalty = lexical_cast(parameterValue); 144 | } catch (std::bad_cast& e) { 145 | std::cerr << "Unkown value " << parameterValue << " for parameter : " << parameterName << std::endl; 146 | exit(0); 147 | } 148 | } else if(equalsString(parameterName,"--maxFrameShifts")) { 149 | try { 150 | maxFrameShifts = lexical_cast(parameterValue); 151 | } catch (std::bad_cast& e) { 152 | std::cerr << "Unkown value " << parameterValue << " for parameter : " << parameterName << std::endl; 153 | exit(0); 154 | } 155 | } else if(equalsString(parameterName,"--progress")) { 156 | if(equalsString(parameterValue,"yes")) { 157 | progress = true; 158 | } else if(equalsString(parameterValue,"no")) { 159 | progress = false; 160 | } else { 161 | std::cerr << "Unkown value " << parameterValue << " for parameter : " << parameterName << std::endl; 162 | exit(0); 163 | } 164 | } else if(equalsString(parameterName,"--nt-debug")) { 165 | ntDebugDir = parameterValue; 166 | } else { 167 | std::cerr << "Unkown parameter name: " << parameterName << std::endl; 168 | exit(0); 169 | } 170 | } 171 | 172 | std::vector results; 173 | 174 | seq::NeedlemanWunsh algorithm(-gapOpenPenalty, -gapExtensionPenalty); 175 | 176 | if (!ntDebugDir.empty()) { 177 | seq::NTSequence r = refSeq; 178 | for (i = 0; i < targets.size(); ++i) { 179 | seq::NTSequence t = targets[i]; 180 | double ntScore = algorithm.align(r, t); 181 | if(ntScore > 200) { 182 | std::string dbg = ntDebugDir + std::string("/") + t.name() + ".fasta"; 183 | std::ofstream ofs(dbg.c_str()); 184 | ofs << r; 185 | ofs << t; 186 | } 187 | } 188 | } 189 | 190 | long int start = 
current_time_ms(); 191 | 192 | for (i = 0; i < targets.size(); ++i) { 193 | std::cerr << "Align target " << i 194 | << " (" << targets[i].name() << ")" << std::endl; 195 | results.push_back(Alignment::compute(refSeq, targets[i], &algorithm, maxFrameShifts)); 196 | if (progress) { 197 | long int end = current_time_ms(); 198 | long int elapsed = end - start; 199 | double time_per_seq = (double)elapsed / (i + 1); 200 | double estimated_time_left = time_per_seq * (targets.size() - (i + 1)); 201 | 202 | std::cerr << "Progress: " << (i + 1) << "/" << targets.size() << " sequences aligned (" << std::fixed << std::setprecision(2) << (i + 1) / (double)targets.size() * 100 << "%), Estimated time left " << format_time(estimated_time_left) << std::endl; 203 | } 204 | } 205 | 206 | ResultsExporter exporter(results, exportKind, exportAlphabet, exportWithInsertions); 207 | 208 | exporter.streamData(std::cout); 209 | } 210 | -------------------------------------------------------------------------------- /src/mxml/mxml-private.c: -------------------------------------------------------------------------------- 1 | /* 2 | * "$Id: mxml-private.c 422 2010-11-07 22:55:11Z mike $" 3 | * 4 | * Private functions for Mini-XML, a small XML-like file parsing library. 5 | * 6 | * Copyright 2003-2010 by Michael R Sweet. 7 | * 8 | * These coded instructions, statements, and computer programs are the 9 | * property of Michael R Sweet and are protected by Federal copyright 10 | * law. Distribution and use rights are outlined in the file "COPYING" 11 | * which should have been included with this file. If this file is 12 | * missing or damaged, see the license at: 13 | * 14 | * http://www.minixml.org/ 15 | * 16 | * Contents: 17 | * 18 | * mxml_error() - Display an error message. 19 | * mxml_integer_cb() - Default callback for integer values. 20 | * mxml_opaque_cb() - Default callback for opaque values. 21 | * mxml_real_cb() - Default callback for real number values. 
22 | * _mxml_global() - Get global data. 23 | */ 24 | 25 | /* 26 | * Include necessary headers... 27 | */ 28 | 29 | #include "mxml-private.h" 30 | 31 | 32 | /* 33 | * Some crazy people think that unloading a shared object is a good or safe 34 | * thing to do. Unfortunately, most objects are simply *not* safe to unload 35 | * and bad things *will* happen. 36 | * 37 | * The following mess of conditional code allows us to provide a destructor 38 | * function in Mini-XML for our thread-global storage so that it can possibly 39 | * be unloaded safely, although since there is no standard way to do so I 40 | * can't even provide any guarantees that you can do it safely on all platforms. 41 | * 42 | * This code currently supports AIX, HP-UX, Linux, Mac OS X, Solaris, and 43 | * Windows. It might work on the BSDs and IRIX, but I haven't tested that. 44 | */ 45 | 46 | #if defined(__sun) || defined(_AIX) 47 | # pragma fini(_mxml_fini) 48 | # define _MXML_FINI _mxml_fini 49 | #elif defined(__hpux) 50 | # pragma FINI _mxml_fini 51 | # define _MXML_FINI _mxml_fini 52 | #elif defined(__GNUC__) /* Linux and Mac OS X */ 53 | # define _MXML_FINI __attribute((destructor)) _mxml_fini 54 | #else 55 | # define _MXML_FINI _fini 56 | #endif /* __sun */ 57 | 58 | 59 | /* 60 | * 'mxml_error()' - Display an error message. 61 | */ 62 | 63 | void 64 | mxml_error(const char *format, /* I - Printf-style format string */ 65 | ...) /* I - Additional arguments as needed */ 66 | { 67 | va_list ap; /* Pointer to arguments */ 68 | char s[1024]; /* Message string */ 69 | _mxml_global_t *global = _mxml_global(); 70 | /* Global data */ 71 | 72 | 73 | /* 74 | * Range check input... 75 | */ 76 | 77 | if (!format) 78 | return; 79 | 80 | /* 81 | * Format the error message string... 82 | */ 83 | 84 | va_start(ap, format); 85 | 86 | vsnprintf(s, sizeof(s), format, ap); 87 | 88 | va_end(ap); 89 | 90 | /* 91 | * And then display the error message... 
92 | */ 93 | 94 | if (global->error_cb) 95 | (*global->error_cb)(s); 96 | else 97 | fprintf(stderr, "mxml: %s\n", s); 98 | } 99 | 100 | 101 | /* 102 | * 'mxml_ignore_cb()' - Default callback for ignored values. 103 | */ 104 | 105 | mxml_type_t /* O - Node type */ 106 | mxml_ignore_cb(mxml_node_t *node) /* I - Current node */ 107 | { 108 | (void)node; 109 | 110 | return (MXML_IGNORE); 111 | } 112 | 113 | 114 | /* 115 | * 'mxml_integer_cb()' - Default callback for integer values. 116 | */ 117 | 118 | mxml_type_t /* O - Node type */ 119 | mxml_integer_cb(mxml_node_t *node) /* I - Current node */ 120 | { 121 | (void)node; 122 | 123 | return (MXML_INTEGER); 124 | } 125 | 126 | 127 | /* 128 | * 'mxml_opaque_cb()' - Default callback for opaque values. 129 | */ 130 | 131 | mxml_type_t /* O - Node type */ 132 | mxml_opaque_cb(mxml_node_t *node) /* I - Current node */ 133 | { 134 | (void)node; 135 | 136 | return (MXML_OPAQUE); 137 | } 138 | 139 | 140 | /* 141 | * 'mxml_real_cb()' - Default callback for real number values. 142 | */ 143 | 144 | mxml_type_t /* O - Node type */ 145 | mxml_real_cb(mxml_node_t *node) /* I - Current node */ 146 | { 147 | (void)node; 148 | 149 | return (MXML_REAL); 150 | } 151 | 152 | 153 | #ifdef HAVE_PTHREAD_H /**** POSIX threading ****/ 154 | # include 155 | 156 | static pthread_key_t _mxml_key = -1; /* Thread local storage key */ 157 | static pthread_once_t _mxml_key_once = PTHREAD_ONCE_INIT; 158 | /* One-time initialization object */ 159 | static void _mxml_init(void); 160 | static void _mxml_destructor(void *g); 161 | 162 | 163 | /* 164 | * '_mxml_destructor()' - Free memory used for globals... 165 | */ 166 | 167 | static void 168 | _mxml_destructor(void *g) /* I - Global data */ 169 | { 170 | free(g); 171 | } 172 | 173 | 174 | /* 175 | * '_mxml_fini()' - Clean up when unloaded. 
176 | */ 177 | 178 | static void 179 | _MXML_FINI(void) 180 | { 181 | _mxml_global_t *global; /* Global data */ 182 | 183 | 184 | if (_mxml_key != -1) 185 | { 186 | if ((global = (_mxml_global_t *)pthread_getspecific(_mxml_key)) != NULL) 187 | _mxml_destructor(global); 188 | 189 | pthread_key_delete(_mxml_key); 190 | _mxml_key = -1; 191 | } 192 | } 193 | 194 | 195 | /* 196 | * '_mxml_global()' - Get global data. 197 | */ 198 | 199 | _mxml_global_t * /* O - Global data */ 200 | _mxml_global(void) 201 | { 202 | _mxml_global_t *global; /* Global data */ 203 | 204 | 205 | pthread_once(&_mxml_key_once, _mxml_init); 206 | 207 | if ((global = (_mxml_global_t *)pthread_getspecific(_mxml_key)) == NULL) 208 | { 209 | global = (_mxml_global_t *)calloc(1, sizeof(_mxml_global_t)); 210 | pthread_setspecific(_mxml_key, global); 211 | 212 | global->num_entity_cbs = 1; 213 | global->entity_cbs[0] = _mxml_entity_cb; 214 | global->wrap = 72; 215 | } 216 | 217 | return (global); 218 | } 219 | 220 | 221 | /* 222 | * '_mxml_init()' - Initialize global data... 223 | */ 224 | 225 | static void 226 | _mxml_init(void) 227 | { 228 | pthread_key_create(&_mxml_key, _mxml_destructor); 229 | } 230 | 231 | 232 | #elif defined(WIN32) && defined(MXML1_EXPORTS) /**** WIN32 threading ****/ 233 | # include 234 | 235 | static DWORD _mxml_tls_index; /* Index for global storage */ 236 | 237 | 238 | /* 239 | * 'DllMain()' - Main entry for library. 
240 | */ 241 | 242 | BOOL WINAPI /* O - Success/failure */ 243 | DllMain(HINSTANCE hinst, /* I - DLL module handle */ 244 | DWORD reason, /* I - Reason */ 245 | LPVOID reserved) /* I - Unused */ 246 | { 247 | _mxml_global_t *global; /* Global data */ 248 | 249 | 250 | (void)hinst; 251 | (void)reserved; 252 | 253 | switch (reason) 254 | { 255 | case DLL_PROCESS_ATTACH : /* Called on library initialization */ 256 | if ((_mxml_tls_index = TlsAlloc()) == TLS_OUT_OF_INDEXES) 257 | return (FALSE); 258 | break; 259 | 260 | case DLL_THREAD_DETACH : /* Called when a thread terminates */ 261 | if ((global = (_mxml_global_t *)TlsGetValue(_mxml_tls_index)) != NULL) 262 | free(global); 263 | break; 264 | 265 | case DLL_PROCESS_DETACH : /* Called when library is unloaded */ 266 | if ((global = (_mxml_global_t *)TlsGetValue(_mxml_tls_index)) != NULL) 267 | free(global); 268 | 269 | TlsFree(_mxml_tls_index); 270 | break; 271 | 272 | default: 273 | break; 274 | } 275 | 276 | return (TRUE); 277 | } 278 | 279 | 280 | /* 281 | * '_mxml_global()' - Get global data. 282 | */ 283 | 284 | _mxml_global_t * /* O - Global data */ 285 | _mxml_global(void) 286 | { 287 | _mxml_global_t *global; /* Global data */ 288 | 289 | 290 | if ((global = (_mxml_global_t *)TlsGetValue(_mxml_tls_index)) == NULL) 291 | { 292 | global = (_mxml_global_t *)calloc(1, sizeof(_mxml_global_t)); 293 | 294 | global->num_entity_cbs = 1; 295 | global->entity_cbs[0] = _mxml_entity_cb; 296 | global->wrap = 72; 297 | 298 | TlsSetValue(_mxml_tls_index, (LPVOID)global); 299 | } 300 | 301 | return (global); 302 | } 303 | 304 | 305 | #else /**** No threading ****/ 306 | /* 307 | * '_mxml_global()' - Get global data. 
308 | */ 309 | 310 | _mxml_global_t * /* O - Global data */ 311 | _mxml_global(void) 312 | { 313 | static _mxml_global_t global = /* Global data */ 314 | { 315 | NULL, /* error_cb */ 316 | 1, /* num_entity_cbs */ 317 | { _mxml_entity_cb }, /* entity_cbs */ 318 | 72, /* wrap */ 319 | NULL, /* custom_load_cb */ 320 | NULL /* custom_save_cb */ 321 | }; 322 | 323 | 324 | return (&global); 325 | } 326 | #endif /* HAVE_PTHREAD_H */ 327 | 328 | 329 | /* 330 | * End of "$Id: mxml-private.c 422 2010-11-07 22:55:11Z mike $". 331 | */ 332 | -------------------------------------------------------------------------------- /src/mxml/mxml-attr.c: -------------------------------------------------------------------------------- 1 | /* 2 | * "$Id: mxml-attr.c 408 2010-09-19 05:26:46Z mike $" 3 | * 4 | * Attribute support code for Mini-XML, a small XML-like file parsing library. 5 | * 6 | * Copyright 2003-2010 by Michael R Sweet. 7 | * 8 | * These coded instructions, statements, and computer programs are the 9 | * property of Michael R Sweet and are protected by Federal copyright 10 | * law. Distribution and use rights are outlined in the file "COPYING" 11 | * which should have been included with this file. If this file is 12 | * missing or damaged, see the license at: 13 | * 14 | * http://www.minixml.org/ 15 | * 16 | * Contents: 17 | * 18 | * mxmlElementDeleteAttr() - Delete an attribute. 19 | * mxmlElementGetAttr() - Get an attribute. 20 | * mxmlElementSetAttr() - Set an attribute. 21 | * mxmlElementSetAttrf() - Set an attribute with a formatted value. 22 | * mxml_set_attr() - Set or add an attribute name/value pair. 23 | */ 24 | 25 | /* 26 | * Include necessary headers... 27 | */ 28 | 29 | #include "config.h" 30 | #include "mxml.h" 31 | 32 | 33 | /* 34 | * Local functions... 35 | */ 36 | 37 | static int mxml_set_attr(mxml_node_t *node, const char *name, 38 | char *value); 39 | 40 | 41 | /* 42 | * 'mxmlElementDeleteAttr()' - Delete an attribute. 
43 | * 44 | * @since Mini-XML 2.4@ 45 | */ 46 | 47 | void 48 | mxmlElementDeleteAttr(mxml_node_t *node,/* I - Element */ 49 | const char *name)/* I - Attribute name */ 50 | { 51 | int i; /* Looping var */ 52 | mxml_attr_t *attr; /* Cirrent attribute */ 53 | 54 | 55 | #ifdef DEBUG 56 | fprintf(stderr, "mxmlElementDeleteAttr(node=%p, name=\"%s\")\n", 57 | node, name ? name : "(null)"); 58 | #endif /* DEBUG */ 59 | 60 | /* 61 | * Range check input... 62 | */ 63 | 64 | if (!node || node->type != MXML_ELEMENT || !name) 65 | return; 66 | 67 | /* 68 | * Look for the attribute... 69 | */ 70 | 71 | for (i = node->value.element.num_attrs, attr = node->value.element.attrs; 72 | i > 0; 73 | i --, attr ++) 74 | { 75 | #ifdef DEBUG 76 | printf(" %s=\"%s\"\n", attr->name, attr->value); 77 | #endif /* DEBUG */ 78 | 79 | if (!strcmp(attr->name, name)) 80 | { 81 | /* 82 | * Delete this attribute... 83 | */ 84 | 85 | free(attr->name); 86 | free(attr->value); 87 | 88 | i --; 89 | if (i > 0) 90 | memmove(attr, attr + 1, i * sizeof(mxml_attr_t)); 91 | 92 | node->value.element.num_attrs --; 93 | return; 94 | } 95 | } 96 | } 97 | 98 | 99 | /* 100 | * 'mxmlElementGetAttr()' - Get an attribute. 101 | * 102 | * This function returns NULL if the node is not an element or the 103 | * named attribute does not exist. 104 | */ 105 | 106 | const char * /* O - Attribute value or NULL */ 107 | mxmlElementGetAttr(mxml_node_t *node, /* I - Element node */ 108 | const char *name) /* I - Name of attribute */ 109 | { 110 | int i; /* Looping var */ 111 | mxml_attr_t *attr; /* Cirrent attribute */ 112 | 113 | 114 | #ifdef DEBUG 115 | fprintf(stderr, "mxmlElementGetAttr(node=%p, name=\"%s\")\n", 116 | node, name ? name : "(null)"); 117 | #endif /* DEBUG */ 118 | 119 | /* 120 | * Range check input... 121 | */ 122 | 123 | if (!node || node->type != MXML_ELEMENT || !name) 124 | return (NULL); 125 | 126 | /* 127 | * Look for the attribute... 
128 | */ 129 | 130 | for (i = node->value.element.num_attrs, attr = node->value.element.attrs; 131 | i > 0; 132 | i --, attr ++) 133 | { 134 | #ifdef DEBUG 135 | printf(" %s=\"%s\"\n", attr->name, attr->value); 136 | #endif /* DEBUG */ 137 | 138 | if (!strcmp(attr->name, name)) 139 | { 140 | #ifdef DEBUG 141 | printf(" Returning \"%s\"!\n", attr->value); 142 | #endif /* DEBUG */ 143 | return (attr->value); 144 | } 145 | } 146 | 147 | /* 148 | * Didn't find attribute, so return NULL... 149 | */ 150 | 151 | #ifdef DEBUG 152 | puts(" Returning NULL!\n"); 153 | #endif /* DEBUG */ 154 | 155 | return (NULL); 156 | } 157 | 158 | 159 | /* 160 | * 'mxmlElementSetAttr()' - Set an attribute. 161 | * 162 | * If the named attribute already exists, the value of the attribute 163 | * is replaced by the new string value. The string value is copied 164 | * into the element node. This function does nothing if the node is 165 | * not an element. 166 | */ 167 | 168 | void 169 | mxmlElementSetAttr(mxml_node_t *node, /* I - Element node */ 170 | const char *name, /* I - Name of attribute */ 171 | const char *value) /* I - Attribute value */ 172 | { 173 | char *valuec; /* Copy of value */ 174 | 175 | 176 | #ifdef DEBUG 177 | fprintf(stderr, "mxmlElementSetAttr(node=%p, name=\"%s\", value=\"%s\")\n", 178 | node, name ? name : "(null)", value ? value : "(null)"); 179 | #endif /* DEBUG */ 180 | 181 | /* 182 | * Range check input... 183 | */ 184 | 185 | if (!node || node->type != MXML_ELEMENT || !name) 186 | return; 187 | 188 | if (value) 189 | valuec = strdup(value); 190 | else 191 | valuec = NULL; 192 | 193 | if (mxml_set_attr(node, name, valuec)) 194 | free(valuec); 195 | } 196 | 197 | 198 | /* 199 | * 'mxmlElementSetAttrf()' - Set an attribute with a formatted value. 200 | * 201 | * If the named attribute already exists, the value of the attribute 202 | * is replaced by the new formatted string. The formatted string value is 203 | * copied into the element node. 
This function does nothing if the node 204 | * is not an element. 205 | * 206 | * @since Mini-XML 2.3@ 207 | */ 208 | 209 | void 210 | mxmlElementSetAttrf(mxml_node_t *node, /* I - Element node */ 211 | const char *name, /* I - Name of attribute */ 212 | const char *format,/* I - Printf-style attribute value */ 213 | ...) /* I - Additional arguments as needed */ 214 | { 215 | va_list ap; /* Argument pointer */ 216 | char *value; /* Value */ 217 | 218 | 219 | #ifdef DEBUG 220 | fprintf(stderr, 221 | "mxmlElementSetAttrf(node=%p, name=\"%s\", format=\"%s\", ...)\n", 222 | node, name ? name : "(null)", format ? format : "(null)"); 223 | #endif /* DEBUG */ 224 | 225 | /* 226 | * Range check input... 227 | */ 228 | 229 | if (!node || node->type != MXML_ELEMENT || !name || !format) 230 | return; 231 | 232 | /* 233 | * Format the value... 234 | */ 235 | 236 | va_start(ap, format); 237 | value = _mxml_vstrdupf(format, ap); 238 | va_end(ap); 239 | 240 | if (!value) 241 | mxml_error("Unable to allocate memory for attribute '%s' in element %s!", 242 | name, node->value.element.name); 243 | else if (mxml_set_attr(node, name, value)) 244 | free(value); 245 | } 246 | 247 | 248 | /* 249 | * 'mxml_set_attr()' - Set or add an attribute name/value pair. 250 | */ 251 | 252 | static int /* O - 0 on success, -1 on failure */ 253 | mxml_set_attr(mxml_node_t *node, /* I - Element node */ 254 | const char *name, /* I - Attribute name */ 255 | char *value) /* I - Attribute value */ 256 | { 257 | int i; /* Looping var */ 258 | mxml_attr_t *attr; /* New attribute */ 259 | 260 | 261 | /* 262 | * Look for the attribute... 263 | */ 264 | 265 | for (i = node->value.element.num_attrs, attr = node->value.element.attrs; 266 | i > 0; 267 | i --, attr ++) 268 | if (!strcmp(attr->name, name)) 269 | { 270 | /* 271 | * Free the old value as needed... 
272 | */ 273 | 274 | if (attr->value) 275 | free(attr->value); 276 | 277 | attr->value = value; 278 | 279 | return (0); 280 | } 281 | 282 | /* 283 | * Add a new attribute... 284 | */ 285 | 286 | if (node->value.element.num_attrs == 0) 287 | attr = malloc(sizeof(mxml_attr_t)); 288 | else 289 | attr = realloc(node->value.element.attrs, 290 | (node->value.element.num_attrs + 1) * sizeof(mxml_attr_t)); 291 | 292 | if (!attr) 293 | { 294 | mxml_error("Unable to allocate memory for attribute '%s' in element %s!", 295 | name, node->value.element.name); 296 | return (-1); 297 | } 298 | 299 | node->value.element.attrs = attr; 300 | attr += node->value.element.num_attrs; 301 | 302 | if ((attr->name = strdup(name)) == NULL) 303 | { 304 | mxml_error("Unable to allocate memory for attribute '%s' in element %s!", 305 | name, node->value.element.name); 306 | return (-1); 307 | } 308 | 309 | attr->value = value; 310 | 311 | node->value.element.num_attrs ++; 312 | 313 | return (0); 314 | } 315 | 316 | 317 | /* 318 | * End of "$Id: mxml-attr.c 408 2010-09-19 05:26:46Z mike $". 
319 | */ 320 | -------------------------------------------------------------------------------- /src/libseq/CodonAlign.cpp: -------------------------------------------------------------------------------- 1 | #include "CodonAlign.h" 2 | 3 | #include 4 | 5 | namespace seq { 6 | 7 | CodonAlign::CodonAlign(AlignmentAlgorithm* algorithm) 8 | { 9 | algorithm_ = algorithm; 10 | } 11 | 12 | double CodonAlign::alignLikeAA(NTSequence& seq1, 13 | NTSequence& seq2, 14 | int ORF, 15 | const AASequence& seqAA1, 16 | const AASequence& seqAA2) 17 | { 18 | NTSequence seq2ORFLead(seq2.begin(), seq2.begin() + ORF); 19 | seq2.erase(seq2.begin(), seq2.begin() + ORF); 20 | int aaLength = seq2.size() / 3; 21 | NTSequence seq2ORFEnd(seq2.begin() + aaLength*3, seq2.end()); 22 | seq2.erase(seq2.begin() + aaLength*3, seq2.end()); 23 | 24 | int firstNonGap = -1; 25 | int lastNonGap = -1; 26 | 27 | for (unsigned i = 0; i < seqAA1.size(); ++i) { 28 | if (seqAA1[i] == AminoAcid::GAP && noGapAt(seq1, i)) { 29 | if (i*3 < seq1.size()) 30 | seq1.insert(seq1.begin() + (i*3), 3, Nucleotide::GAP); 31 | else 32 | seq1.insert(seq1.end(), 3, Nucleotide::GAP); 33 | } 34 | 35 | if (seqAA2[i] == AminoAcid::GAP && noGapAt(seq2, i)) { 36 | if (i*3 < seq2.size()) 37 | seq2.insert(seq2.begin() + (i*3), 3, Nucleotide::GAP); 38 | else 39 | seq2.insert(seq2.end(), 3, Nucleotide::GAP); 40 | } else { 41 | if (firstNonGap == -1) 42 | firstNonGap = i*3; 43 | lastNonGap = i*3 + 3; 44 | } 45 | } 46 | 47 | for (int i = 0; i < (int)seq2ORFLead.size(); ++i) 48 | if ((firstNonGap - (int)seq2ORFLead.size() + i) >= 0) 49 | seq2[firstNonGap - (int)seq2ORFLead.size() + i] = seq2ORFLead[i]; 50 | 51 | for (unsigned i = 0; i < seq2ORFEnd.size(); ++i) 52 | if (lastNonGap + i < seq2.size()) 53 | seq2[lastNonGap + i] = seq2ORFEnd[i]; 54 | 55 | return algorithm_->computeAlignScore(seq1, seq2); 56 | } 57 | 58 | bool CodonAlign::noGapAt(const NTSequence& seq, unsigned int i) const 59 | { 60 | if((i * 3) == seq.size()) 61 | return true; 
62 | else 63 | return seq[i * 3] != Nucleotide::GAP 64 | && seq[(i * 3) + 1] != Nucleotide::GAP 65 | && seq[(i * 3) + 2] != Nucleotide::GAP; 66 | } 67 | 68 | bool CodonAlign::haveGaps(const NTSequence& seq, int from, int to) 69 | { 70 | for (unsigned i = std::max(from, 0); i < std::min((int)seq.size(), to); ++i) 71 | if (seq[i] == Nucleotide::GAP) 72 | return true; 73 | 74 | return false; 75 | } 76 | 77 | std::pair 78 | CodonAlign::align(NTSequence& ref, NTSequence& target, int maxFrameShifts) 79 | { 80 | /* 81 | * 1. translate the reference sequence 82 | * 2. for every open reading frame: 83 | * - translate the target sequence 84 | * - perform the alignment 85 | * 3. take the alignment with best score and align nucleotide 86 | * sequence like amino acid sequence 87 | * 4. compute nucleotide alignment score 88 | * 5. make nucleotide sequence alignment, compare score, if difference 89 | * too big then correct the frame shift and repeat. 90 | */ 91 | AASequence refAA = AASequence::translate(ref); 92 | 93 | NTSequence refNTAligned = ref; 94 | NTSequence targetNTAligned = target; 95 | double ntScore = algorithm_->align(refNTAligned, targetNTAligned); 96 | 97 | if(ntScore < 200) 98 | throw AlignmentError(ntScore,0,refNTAligned,targetNTAligned); 99 | 100 | int bestFrameShift = -1; 101 | double bestScore = -1E10; 102 | AASequence bestRefAA; 103 | AASequence bestTargetAA; 104 | 105 | for (unsigned i = 0; i < 3; ++i) { 106 | int last = i + ((target.size() - i) / 3) * 3; 107 | AASequence targetAA 108 | = AASequence::translate(target.begin() + i, target.begin() + last); 109 | 110 | AASequence refCopyAA = refAA; 111 | double score = algorithm_->align(refCopyAA, targetAA); 112 | 113 | if (score > bestScore) { 114 | bestFrameShift = i; 115 | bestScore = score; 116 | bestRefAA = refCopyAA; 117 | bestTargetAA = targetAA; 118 | } 119 | } 120 | 121 | NTSequence refCodonAligned = ref; 122 | NTSequence targetCodonAligned = target; 123 | 124 | double ntCodonScore = 
alignLikeAA(refCodonAligned, 125 | targetCodonAligned, 126 | bestFrameShift, 127 | bestRefAA, 128 | bestTargetAA); 129 | 130 | 131 | if (ntScore - ntCodonScore > 100) { 132 | /* 133 | * a possible frameshift 134 | */ 135 | if (maxFrameShifts) { 136 | /* 137 | * try to fix: walk through the nucleotide alignment, and find 138 | * an "isolated" gap that is not of size multiple of 3. 139 | */ 140 | const int BOUNDARY=10; 141 | int seq2pos = 0; 142 | int refGapStart = 0; 143 | int targetGapStart = 0; 144 | bool fixed = false; 145 | 146 | for (unsigned i = 0; i < refNTAligned.size(); ++i) { 147 | if (refNTAligned[i] == Nucleotide::GAP) { 148 | if (refGapStart == -1) 149 | refGapStart = i; 150 | } else { 151 | if (refGapStart > 0) { 152 | int refGapStop = i; 153 | 154 | if ((refGapStop - refGapStart) % 3) { 155 | /* 156 | * check it is isolated: no gaps in either sequence around 157 | * this gap 158 | */ 159 | if (haveGaps(refNTAligned, 160 | refGapStart - BOUNDARY, refGapStart) 161 | || haveGaps(refNTAligned, 162 | refGapStop, refGapStop + BOUNDARY) 163 | || haveGaps(targetNTAligned, 164 | refGapStart - BOUNDARY, refGapStart) 165 | || haveGaps(targetNTAligned, 166 | refGapStop, refGapStop + BOUNDARY)) { 167 | /* 168 | * not isolated: skip this gap. 169 | */ 170 | } else { 171 | /* 172 | * fix it ! 
173 | */ 174 | target.insert(target.begin() + seq2pos, 175 | 3 - (refGapStop - refGapStart) % 3, 176 | Nucleotide::N); 177 | fixed = true; 178 | break; 179 | } 180 | } 181 | } 182 | 183 | refGapStart = -1; 184 | } 185 | 186 | if (targetNTAligned[i] == Nucleotide::GAP) { 187 | if (targetGapStart == -1) 188 | targetGapStart = i; 189 | } else { 190 | if (targetGapStart > 0) { 191 | int targetGapStop = i; 192 | 193 | if ((targetGapStop - targetGapStart) % 3) { 194 | /* 195 | * check it is isolated: no gaps in either sequence around 196 | * this gap 197 | */ 198 | if (haveGaps(refNTAligned, 199 | targetGapStart - BOUNDARY, targetGapStart) 200 | || haveGaps(refNTAligned, targetGapStop, 201 | targetGapStop + BOUNDARY) 202 | || haveGaps(targetNTAligned, 203 | targetGapStart - BOUNDARY, targetGapStart) 204 | || haveGaps(targetNTAligned, 205 | targetGapStop, targetGapStop + BOUNDARY)) { 206 | /* 207 | * not isolated: skip this gap. 208 | */ 209 | } else { 210 | /* 211 | * fix it ! 212 | */ 213 | target.insert(target.begin() + seq2pos, 214 | (targetGapStop - targetGapStart) % 3, 215 | Nucleotide::N); 216 | fixed = true; 217 | break; 218 | } 219 | } 220 | } 221 | 222 | targetGapStart = -1; 223 | ++seq2pos; 224 | } 225 | } 226 | 227 | if (!fixed) 228 | throw FrameShiftError(ntScore, ntCodonScore, 229 | refNTAligned, targetNTAligned); 230 | else { 231 | std::pair result 232 | = align(ref, target, maxFrameShifts - 1); 233 | ++result.second; 234 | return result; 235 | } 236 | } else { 237 | throw FrameShiftError(ntScore, ntCodonScore, 238 | refNTAligned, targetNTAligned); 239 | } 240 | } else { 241 | ref = refCodonAligned; 242 | target = targetCodonAligned; 243 | /* 244 | std::cerr << "Scores: " << ntScore << " " << ntCodonScore << " " << bestScore << std::endl; 245 | std::cerr << refNTAligned.asString() << std::endl; 246 | std::cerr << targetNTAligned.asString() << std::endl; 247 | std::cerr << refCodonAligned.asString() << std::endl; 248 | std::cerr << 
targetCodonAligned.asString() << std::endl; 249 | std::cerr << bestRefAA.asString() << std::endl; 250 | std::cerr << bestTargetAA.asString() << std::endl; 251 | */ 252 | return std::make_pair(ntCodonScore, 0); 253 | } 254 | } 255 | 256 | AlignmentError::AlignmentError(double ntScore, double codonScore, 257 | const NTSequence& ntRef, 258 | const NTSequence& ntTarget, 259 | const std::string& message) 260 | :ntScore_(ntScore),codonScore_(codonScore), 261 | ntRef_(ntRef),ntTarget_(ntTarget), 262 | message_(message) 263 | { } 264 | 265 | AlignmentError::~AlignmentError() throw() 266 | { } 267 | 268 | 269 | FrameShiftError::FrameShiftError(double ntScore, double codonScore, 270 | const NTSequence& ntRef, 271 | const NTSequence& ntTarget) 272 | :AlignmentError(ntScore,codonScore,ntRef,ntTarget,std::string("Frameshift error")) 273 | { } 274 | 275 | FrameShiftError::~FrameShiftError() throw() 276 | { } 277 | 278 | }; 279 | 280 | -------------------------------------------------------------------------------- /src/Alignment.cpp: -------------------------------------------------------------------------------- 1 | #include "Utils.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "Alignment.h" 8 | 9 | #include 10 | 11 | Alignment Alignment::compute(const ReferenceSequence& ref, 12 | const seq::NTSequence& target, 13 | seq::AlignmentAlgorithm* algorithm, 14 | int maxFrameShifts) 15 | { 16 | seq::CodonAlign codonAlign(algorithm); 17 | Alignment result(ref, target); 18 | 19 | for (unsigned j = 0; j < result.target.size(); ++j) 20 | if (result.target[j] == seq::Nucleotide::GAP) { 21 | result.target.erase(result.target.begin() + j); 22 | --j; 23 | } 24 | 25 | try { 26 | if (result.target.size() > 6) { 27 | std::pair res 28 | = codonAlign.align(result.ref, result.target, maxFrameShifts); 29 | 30 | result.score = res.first; 31 | result.correctedFrameshifts = res.second; 32 | result.success = true; 33 | } else 34 | result.tooShort = true; 35 | } catch (seq::AlignmentError 
e) { 36 | result.failure = true; 37 | std::cerr << e.nucleotideAlignedTarget().name() << ": " << e.message() 38 | << " (scores nt: " << e.nucleotideAlignmentScore() << "; codon: " 39 | << e.codonAlignmentScore() << ")" << std::endl; 40 | } 41 | 42 | result.computeAlignedRanges(ref.size()/3); 43 | 44 | return result; 45 | } 46 | 47 | Alignment Alignment::given(const ReferenceSequence& ref, 48 | const seq::NTSequence& target) 49 | { 50 | if (ref.size() != target.size()) { 51 | std::cerr << ref.name() << ".length: " << ref.size() 52 | << ", " << target.name() << ".length: " << target.size() 53 | << std::endl; 54 | 55 | assert(ref.size() == target.size()); 56 | } 57 | 58 | Alignment result(ref, target); 59 | 60 | result.success = true; 61 | 62 | result.computeAlignedRanges(ref.size()/3); 63 | 64 | return result; 65 | } 66 | 67 | Alignment::Alignment(const ReferenceSequence& aref, 68 | const seq::NTSequence& atarget) 69 | : success(false), 70 | tooShort(false), 71 | failure(false), 72 | correctedFrameshifts(0), 73 | ref(aref), 74 | target(atarget) 75 | { } 76 | 77 | void Alignment::computeAlignedRanges(int referenceSequenceLength) 78 | { 79 | for (unsigned r = 0; r < ref.regions().size(); ++r) { 80 | ReferenceSequence::Region& region = ref.regions()[r]; 81 | 82 | int regionEnd = std::min(region.end(), referenceSequenceLength); 83 | 84 | if (success) { 85 | region.alignedBegin = alignedPos(region.begin()); 86 | region.alignedEnd = alignedPos(regionEnd); 87 | region.targetBegin = firstPos(region.begin(), regionEnd); 88 | region.targetEnd = lastPos(region.begin(), regionEnd); 89 | } else { 90 | region.alignedBegin = region.begin(); 91 | region.alignedEnd = regionEnd; 92 | region.targetBegin = ref.size(); 93 | region.targetEnd = -1; 94 | } 95 | } 96 | } 97 | 98 | int Alignment::alignedPos(int refPos) const 99 | { 100 | int j = -1; 101 | for (unsigned i = 0; i < ref.size(); i += 3) { 102 | if (ref[i] != seq::Nucleotide::GAP) 103 | ++j; 104 | 105 | if (j == refPos) 106 | 
return i/3; 107 | } 108 | if (j == refPos - 1) 109 | return ref.size() / 3; 110 | else { 111 | std::cerr << refPos << " " << ref.size() << " " << j << std::endl; 112 | assert(false); 113 | } 114 | } 115 | 116 | int Alignment::firstPos(int begin, int end) const 117 | { 118 | int refPos = -1; 119 | 120 | for (unsigned i = 0; i < ref.size(); i += 3) { 121 | if (ref[i] != seq::Nucleotide::GAP) 122 | ++refPos; 123 | 124 | if (refPos >= begin) { 125 | if (refPos >= end) 126 | return end; 127 | 128 | if (target[i] != seq::Nucleotide::GAP) 129 | return refPos; 130 | } 131 | } 132 | 133 | return end; 134 | } 135 | 136 | int Alignment::lastPos(int begin, int end) const 137 | { 138 | int refPos = -1; 139 | int lastPos = -1; 140 | 141 | for (unsigned i = 0; i < ref.size(); i += 3) { 142 | if (ref[i] != seq::Nucleotide::GAP) 143 | ++refPos; 144 | 145 | if (refPos >= begin) { 146 | if (refPos >= end) 147 | return lastPos; 148 | 149 | if (target[i+2] != seq::Nucleotide::GAP) 150 | lastPos = refPos; 151 | } 152 | } 153 | 154 | return lastPos; 155 | } 156 | 157 | std::pair 158 | Alignment::findAminoAcid(const ReferenceSequence::Region& region, 159 | int posInRegion, int insertion) const 160 | { 161 | bool withinTarget 162 | = ((region.targetBegin < region.targetEnd) 163 | && posInRegion >= region.targetBegin - region.begin() + 1 164 | && posInRegion <= region.targetEnd - region.begin() + 1); 165 | 166 | int pos = 0; 167 | int gap = 0; 168 | 169 | for (int i = region.alignedBegin; i < region.alignedEnd; ++i) { 170 | if (ref[i*3] != seq::Nucleotide::GAP) { 171 | ++pos; 172 | gap = 0; 173 | } else 174 | ++gap; 175 | 176 | if (pos == posInRegion 177 | && gap == insertion 178 | && (!withinTarget || (target[i*3] != seq::Nucleotide::GAP))) { 179 | return std::make_pair(withinTarget, i); 180 | } else if (pos > posInRegion) { 181 | return std::make_pair(withinTarget, -1); 182 | } 183 | } 184 | 185 | assert(false); 186 | return std::make_pair(false, 0); 187 | } 188 | 189 | std::string 
Alignment::mutations(const ReferenceSequence::Region& region) const 190 | { 191 | std::string result; 192 | int fp = region.targetBegin; 193 | int lp = region.targetEnd; 194 | 195 | if (fp >= lp) 196 | return result; 197 | 198 | int refPos = -1; 199 | 200 | for (unsigned i = 0; i < ref.size(); i += 3) { 201 | if (ref[i] != seq::Nucleotide::GAP) 202 | ++refPos; 203 | 204 | if (refPos >= fp) { 205 | if (refPos > lp) 206 | return result; 207 | 208 | seq::AminoAcid refAA = seq::Codon::translate(ref.begin() + i); 209 | std::set targetAAs 210 | = seq::Codon::translateAll(target.begin() + i); 211 | 212 | if (((targetAAs.size() > 1) 213 | || (*targetAAs.begin() != refAA)) 214 | && (*targetAAs.begin() != seq::AminoAcid::GAP)) { 215 | 216 | if (!result.empty()) 217 | result += ' '; 218 | 219 | result += refAA.toChar() 220 | + to_string(refPos - region.begin() + 1); 221 | 222 | for (std::set::const_iterator k = targetAAs.begin(); 223 | k != targetAAs.end(); ++k) 224 | result += k->toChar(); 225 | 226 | } 227 | } 228 | } 229 | 230 | return result; 231 | } 232 | 233 | std::string Alignment:: 234 | codonMutations(const ReferenceSequence::Region& region, 235 | int& start, 236 | int& end) const 237 | { 238 | std::string result; 239 | int fp = region.begin(); 240 | int lp = region.end() - 1; 241 | 242 | start = -1; 243 | end = -1; 244 | 245 | if (fp >= lp) 246 | return result; 247 | 248 | int refPos = -1; 249 | 250 | for (unsigned i = 0; i < ref.size(); i += 3) { 251 | if (ref[i] != seq::Nucleotide::GAP) 252 | ++refPos; 253 | 254 | int pos = refPos - region.begin() + 1; 255 | 256 | if (refPos >= fp) { 257 | if (refPos > lp) 258 | return result; 259 | 260 | if (target[i] == seq::Nucleotide::GAP && 261 | target[i + 1] == seq::Nucleotide::GAP && 262 | target[i + 2] == seq::Nucleotide::GAP && 263 | (refPos < region.targetBegin || refPos > region.targetEnd)) 264 | continue; 265 | 266 | if (refPos == region.targetEnd && 267 | ref[i] == seq::Nucleotide::GAP && 268 | ref[i + 1] == 
seq::Nucleotide::GAP && 269 | ref[i + 2] == seq::Nucleotide::GAP) 270 | continue; 271 | 272 | //skip incomplete begin codon 273 | if(refPos == region.targetBegin-1 && 274 | target[i] == seq::Nucleotide::GAP) 275 | continue; 276 | 277 | //skip incomplete end codon 278 | if(refPos == region.targetEnd+1 && 279 | target[i + 2] == seq::Nucleotide::GAP) 280 | continue; 281 | 282 | if (start == -1) 283 | start = pos; 284 | end = pos; 285 | 286 | bool mutation; 287 | mutation = ref[i] != target[i] || 288 | ref[i + 1] != target[i + 1] || 289 | ref[i + 2] != target[i + 2]; 290 | 291 | if(mutation) { 292 | if (!result.empty()) 293 | result += ' '; 294 | 295 | seq::AminoAcid refAA = seq::Codon::translate(ref.begin() + i); 296 | std::set targetAAs = seq::Codon::translateAll(target.begin() + i); 297 | 298 | result += refAA.toChar() 299 | + to_string(refPos - region.begin() + 1); 300 | 301 | for (std::set::const_iterator k = targetAAs.begin(); k != targetAAs.end(); ++k) 302 | result += k->toChar(); 303 | result += ';'; 304 | 305 | result += ref[i].toChar(); 306 | result += ref[i+1].toChar(); 307 | result += ref[i+2].toChar(); 308 | result += to_string(pos); 309 | 310 | result += target[i].toChar(); 311 | result += target[i + 1].toChar(); 312 | result += target[i + 2].toChar(); 313 | } 314 | } 315 | } 316 | 317 | return result; 318 | } 319 | 320 | 321 | -------------------------------------------------------------------------------- /references/HCV/HCV2-FN666429.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/mxml/mxml-set.c: -------------------------------------------------------------------------------- 1 | /* 2 | * "$Id: mxml-set.c 441 2011-12-09 23:49:00Z mike $" 3 | * 4 | * Node set functions for Mini-XML, a small XML-like file parsing library. 
5 | * 6 | * Copyright 2003-2011 by Michael R Sweet. 7 | * 8 | * These coded instructions, statements, and computer programs are the 9 | * property of Michael R Sweet and are protected by Federal copyright 10 | * law. Distribution and use rights are outlined in the file "COPYING" 11 | * which should have been included with this file. If this file is 12 | * missing or damaged, see the license at: 13 | * 14 | * http://www.minixml.org/ 15 | * 16 | * Contents: 17 | * 18 | * mxmlSetCDATA() - Set the element name of a CDATA node. 19 | * mxmlSetCustom() - Set the data and destructor of a custom data node. 20 | * mxmlSetElement() - Set the name of an element node. 21 | * mxmlSetInteger() - Set the value of an integer node. 22 | * mxmlSetOpaque() - Set the value of an opaque node. 23 | * mxmlSetReal() - Set the value of a real number node. 24 | * mxmlSetText() - Set the value of a text node. 25 | * mxmlSetTextf() - Set the value of a text node to a formatted string. 26 | * mxmlSetUserData() - Set the user data pointer for a node. 27 | */ 28 | 29 | /* 30 | * Include necessary headers... 31 | */ 32 | 33 | #include "config.h" 34 | #include "mxml.h" 35 | 36 | 37 | /* 38 | * 'mxmlSetCDATA()' - Set the element name of a CDATA node. 39 | * 40 | * The node is not changed if it (or its first child) is not a CDATA element node. 41 | * 42 | * @since Mini-XML 2.3@ 43 | */ 44 | 45 | int /* O - 0 on success, -1 on failure */ 46 | mxmlSetCDATA(mxml_node_t *node, /* I - Node to set */ 47 | const char *data) /* I - New data string */ 48 | { 49 | /* 50 | * Range check input... 
51 | */ 52 | 53 | if (node && node->type == MXML_ELEMENT && 54 | strncmp(node->value.element.name, "![CDATA[", 8) && 55 | node->child && node->child->type == MXML_ELEMENT && 56 | !strncmp(node->child->value.element.name, "![CDATA[", 8)) 57 | node = node->child; 58 | 59 | if (!node || node->type != MXML_ELEMENT || !data || 60 | strncmp(node->value.element.name, "![CDATA[", 8)) 61 | return (-1); 62 | 63 | /* 64 | * Free any old element value and set the new value... 65 | */ 66 | 67 | if (node->value.element.name) 68 | free(node->value.element.name); 69 | 70 | node->value.element.name = _mxml_strdupf("![CDATA[%s]]", data); 71 | 72 | return (0); 73 | } 74 | 75 | 76 | /* 77 | * 'mxmlSetCustom()' - Set the data and destructor of a custom data node. 78 | * 79 | * The node is not changed if it (or its first child) is not a custom node. 80 | * 81 | * @since Mini-XML 2.1@ 82 | */ 83 | 84 | int /* O - 0 on success, -1 on failure */ 85 | mxmlSetCustom( 86 | mxml_node_t *node, /* I - Node to set */ 87 | void *data, /* I - New data pointer */ 88 | mxml_custom_destroy_cb_t destroy) /* I - New destructor function */ 89 | { 90 | /* 91 | * Range check input... 92 | */ 93 | 94 | if (node && node->type == MXML_ELEMENT && 95 | node->child && node->child->type == MXML_CUSTOM) 96 | node = node->child; 97 | 98 | if (!node || node->type != MXML_CUSTOM) 99 | return (-1); 100 | 101 | /* 102 | * Free any old element value and set the new value... 103 | */ 104 | 105 | if (node->value.custom.data && node->value.custom.destroy) 106 | (*(node->value.custom.destroy))(node->value.custom.data); 107 | 108 | node->value.custom.data = data; 109 | node->value.custom.destroy = destroy; 110 | 111 | return (0); 112 | } 113 | 114 | 115 | /* 116 | * 'mxmlSetElement()' - Set the name of an element node. 117 | * 118 | * The node is not changed if it is not an element node. 
119 | */ 120 | 121 | int /* O - 0 on success, -1 on failure */ 122 | mxmlSetElement(mxml_node_t *node, /* I - Node to set */ 123 | const char *name) /* I - New name string */ 124 | { 125 | /* 126 | * Range check input... 127 | */ 128 | 129 | if (!node || node->type != MXML_ELEMENT || !name) 130 | return (-1); 131 | 132 | /* 133 | * Free any old element value and set the new value... 134 | */ 135 | 136 | if (node->value.element.name) 137 | free(node->value.element.name); 138 | 139 | node->value.element.name = strdup(name); 140 | 141 | return (0); 142 | } 143 | 144 | 145 | /* 146 | * 'mxmlSetInteger()' - Set the value of an integer node. 147 | * 148 | * The node is not changed if it (or its first child) is not an integer node. 149 | */ 150 | 151 | int /* O - 0 on success, -1 on failure */ 152 | mxmlSetInteger(mxml_node_t *node, /* I - Node to set */ 153 | int integer) /* I - Integer value */ 154 | { 155 | /* 156 | * Range check input... 157 | */ 158 | 159 | if (node && node->type == MXML_ELEMENT && 160 | node->child && node->child->type == MXML_INTEGER) 161 | node = node->child; 162 | 163 | if (!node || node->type != MXML_INTEGER) 164 | return (-1); 165 | 166 | /* 167 | * Set the new value and return... 168 | */ 169 | 170 | node->value.integer = integer; 171 | 172 | return (0); 173 | } 174 | 175 | 176 | /* 177 | * 'mxmlSetOpaque()' - Set the value of an opaque node. 178 | * 179 | * The node is not changed if it (or its first child) is not an opaque node. 180 | */ 181 | 182 | int /* O - 0 on success, -1 on failure */ 183 | mxmlSetOpaque(mxml_node_t *node, /* I - Node to set */ 184 | const char *opaque) /* I - Opaque string */ 185 | { 186 | /* 187 | * Range check input... 188 | */ 189 | 190 | if (node && node->type == MXML_ELEMENT && 191 | node->child && node->child->type == MXML_OPAQUE) 192 | node = node->child; 193 | 194 | if (!node || node->type != MXML_OPAQUE || !opaque) 195 | return (-1); 196 | 197 | /* 198 | * Free any old opaque value and set the new value... 
199 | */ 200 | 201 | if (node->value.opaque) 202 | free(node->value.opaque); 203 | 204 | node->value.opaque = strdup(opaque); 205 | 206 | return (0); 207 | } 208 | 209 | 210 | /* 211 | * 'mxmlSetReal()' - Set the value of a real number node. 212 | * 213 | * The node is not changed if it (or its first child) is not a real number node. 214 | */ 215 | 216 | int /* O - 0 on success, -1 on failure */ 217 | mxmlSetReal(mxml_node_t *node, /* I - Node to set */ 218 | double real) /* I - Real number value */ 219 | { 220 | /* 221 | * Range check input... 222 | */ 223 | 224 | if (node && node->type == MXML_ELEMENT && 225 | node->child && node->child->type == MXML_REAL) 226 | node = node->child; 227 | 228 | if (!node || node->type != MXML_REAL) 229 | return (-1); 230 | 231 | /* 232 | * Set the new value and return... 233 | */ 234 | 235 | node->value.real = real; 236 | 237 | return (0); 238 | } 239 | 240 | 241 | /* 242 | * 'mxmlSetText()' - Set the value of a text node. 243 | * 244 | * The node is not changed if it (or its first child) is not a text node. 245 | */ 246 | 247 | int /* O - 0 on success, -1 on failure */ 248 | mxmlSetText(mxml_node_t *node, /* I - Node to set */ 249 | int whitespace, /* I - 1 = leading whitespace, 0 = no whitespace */ 250 | const char *string) /* I - String */ 251 | { 252 | /* 253 | * Range check input... 254 | */ 255 | 256 | if (node && node->type == MXML_ELEMENT && 257 | node->child && node->child->type == MXML_TEXT) 258 | node = node->child; 259 | 260 | if (!node || node->type != MXML_TEXT || !string) 261 | return (-1); 262 | 263 | /* 264 | * Free any old string value and set the new value... 265 | */ 266 | 267 | if (node->value.text.string) 268 | free(node->value.text.string); 269 | 270 | node->value.text.whitespace = whitespace; 271 | node->value.text.string = strdup(string); 272 | 273 | return (0); 274 | } 275 | 276 | 277 | /* 278 | * 'mxmlSetTextf()' - Set the value of a text node to a formatted string. 
279 | * 280 | * The node is not changed if it (or its first child) is not a text node. 281 | */ 282 | 283 | int /* O - 0 on success, -1 on failure */ 284 | mxmlSetTextf(mxml_node_t *node, /* I - Node to set */ 285 | int whitespace, /* I - 1 = leading whitespace, 0 = no whitespace */ 286 | const char *format, /* I - Printf-style format string */ 287 | ...) /* I - Additional arguments as needed */ 288 | { 289 | va_list ap; /* Pointer to arguments */ 290 | 291 | 292 | /* 293 | * Range check input... 294 | */ 295 | 296 | if (node && node->type == MXML_ELEMENT && 297 | node->child && node->child->type == MXML_TEXT) 298 | node = node->child; 299 | 300 | if (!node || node->type != MXML_TEXT || !format) 301 | return (-1); 302 | 303 | /* 304 | * Free any old string value and set the new value... 305 | */ 306 | 307 | if (node->value.text.string) 308 | free(node->value.text.string); 309 | 310 | va_start(ap, format); 311 | 312 | node->value.text.whitespace = whitespace; 313 | node->value.text.string = _mxml_strdupf(format, ap); 314 | 315 | va_end(ap); 316 | 317 | return (0); 318 | } 319 | 320 | 321 | /* 322 | * 'mxmlSetUserData()' - Set the user data pointer for a node. 323 | * 324 | * @since Mini-XML 2.7@ 325 | */ 326 | 327 | int /* O - 0 on success, -1 on failure */ 328 | mxmlSetUserData(mxml_node_t *node, /* I - Node to set */ 329 | void *data) /* I - User data pointer */ 330 | { 331 | /* 332 | * Range check input... 333 | */ 334 | 335 | if (!node) 336 | return (-1); 337 | 338 | /* 339 | * Set the user data pointer and return... 340 | */ 341 | 342 | node->user_data = data; 343 | return (0); 344 | } 345 | 346 | 347 | /* 348 | * End of "$Id: mxml-set.c 441 2011-12-09 23:49:00Z mike $". 
349 | */ 350 | -------------------------------------------------------------------------------- /src/libseq/Codon.cpp: -------------------------------------------------------------------------------- 1 | #include "Codon.h" 2 | 3 | namespace seq { 4 | 5 | AminoAcid Codon::translate(const NTSequence::const_iterator triplet) 6 | { 7 | const AminoAcid codonTable[4][4][4] = { 8 | { { AminoAcid::K /* AAA */, 9 | AminoAcid::N /* AAC */, 10 | AminoAcid::K /* AAG */, 11 | AminoAcid::N /* AAT */ 12 | }, 13 | { AminoAcid::T /* ACA */, 14 | AminoAcid::T /* ACC */, 15 | AminoAcid::T /* ACG */, 16 | AminoAcid::T /* ACT */ 17 | }, 18 | { AminoAcid::R /* AGA */, 19 | AminoAcid::S /* AGC */, 20 | AminoAcid::R /* AGG */, 21 | AminoAcid::S /* AGT */ 22 | }, 23 | { AminoAcid::I /* ATA */, 24 | AminoAcid::I /* ATC */, 25 | AminoAcid::M /* ATG */, 26 | AminoAcid::I /* ATT */ 27 | } 28 | }, 29 | { { AminoAcid::Q /* CAA */, 30 | AminoAcid::H /* CAC */, 31 | AminoAcid::Q /* CAG */, 32 | AminoAcid::H /* CAT */ 33 | }, 34 | { AminoAcid::P /* CCA */, 35 | AminoAcid::P /* CCC */, 36 | AminoAcid::P /* CCG */, 37 | AminoAcid::P /* CCT */ 38 | }, 39 | { AminoAcid::R /* CGA */, 40 | AminoAcid::R /* CGC */, 41 | AminoAcid::R /* CGG */, 42 | AminoAcid::R /* CGT */ 43 | }, 44 | { AminoAcid::L /* CTA */, 45 | AminoAcid::L /* CTC */, 46 | AminoAcid::L /* CTG */, 47 | AminoAcid::L /* CTT */ 48 | } 49 | }, 50 | { { AminoAcid::E /* GAA */, 51 | AminoAcid::D /* GAC */, 52 | AminoAcid::E /* GAG */, 53 | AminoAcid::D /* GAT */ 54 | }, 55 | { AminoAcid::A /* GCA */, 56 | AminoAcid::A /* GCC */, 57 | AminoAcid::A /* GCG */, 58 | AminoAcid::A /* GCT */ 59 | }, 60 | { AminoAcid::G /* GGA */, 61 | AminoAcid::G /* GGC */, 62 | AminoAcid::G /* GGG */, 63 | AminoAcid::G /* GGT */ 64 | }, 65 | { AminoAcid::V /* GTA */, 66 | AminoAcid::V /* GTC */, 67 | AminoAcid::V /* GTG */, 68 | AminoAcid::V /* GTT */ 69 | } 70 | }, 71 | { { AminoAcid::STP /* TAA */, 72 | AminoAcid::Y /* TAC */, 73 | AminoAcid::STP /* TAG */, 74 | 
AminoAcid::Y /* TAT */ 75 | }, 76 | { AminoAcid::S /* TCA */, 77 | AminoAcid::S /* TCC */, 78 | AminoAcid::S /* TCG */, 79 | AminoAcid::S /* TCT */ 80 | }, 81 | { AminoAcid::STP /* TGA */, 82 | AminoAcid::C /* TGC */, 83 | AminoAcid::W /* TGG */, 84 | AminoAcid::C /* TGT */ 85 | }, 86 | { AminoAcid::L /* TTA */, 87 | AminoAcid::F /* TTC */, 88 | AminoAcid::L /* TTG */, 89 | AminoAcid::F /* TTT */ 90 | } 91 | } }; 92 | 93 | if (*triplet == Nucleotide::GAP 94 | && (*(triplet + 1) == Nucleotide::GAP) 95 | && (*(triplet + 2) == Nucleotide::GAP)) 96 | return AminoAcid::GAP; 97 | 98 | if (triplet->isAmbiguity() 99 | || (triplet + 1)->isAmbiguity() 100 | || (triplet + 2)->isAmbiguity()) 101 | return AminoAcid::X; 102 | 103 | return 104 | codonTable[triplet->intRep()] 105 | [(triplet + 1)->intRep()] 106 | [(triplet + 2)->intRep()]; 107 | } 108 | /* Translates the three nucleotides starting at triplet into a set of amino acids: the triplet is copied into an NTSequence, nonAmbiguousSequences() fills possibilities from it, and every entry of possibilities is passed to translate(). The caller must make triplet .. triplet + 3 a valid iterator range. NOTE(review): nonAmbiguousSequences() is defined outside this view; presumably it enumerates the concrete sequences behind ambiguity codes -- confirm. */ 109 | std::set 110 | Codon::translateAll(const NTSequence::const_iterator triplet) 111 | { 112 | std::set result; 113 | /* Copy the three nucleotides starting at triplet into their own sequence. */ 114 | NTSequence s(triplet, triplet + 3); 115 | 116 | std::vector possibilities; 117 | s.nonAmbiguousSequences(possibilities); 118 | /* Translate each entry from its first nucleotide; inserting into the set keeps each amino acid once. */ 119 | for (unsigned i = 0; i < possibilities.size(); ++i) 120 | result.insert(translate(possibilities[i].begin())); 121 | 122 | return result; 123 | } 124 | /* File-local helper: builds the three-nucleotide sequence c1 c2 c3 and inserts it into result. */ 125 | namespace { 126 | void addTriplet(std::set& result, 127 | Nucleotide c1, Nucleotide c2, Nucleotide c3) 128 | { 129 | NTSequence triplet; 130 | triplet.push_back(c1); 131 | triplet.push_back(c2); 132 | triplet.push_back(c3); 133 | 134 | result.insert(triplet); 135 | } 136 | 137 | } 138 | /* Returns the set of nucleotide triplets that the switch below lists for amino acid a, selected by a.intRep(). For AA_GAP, AA_Z, AA_U, AA_B, AA_X and any value without its own case, no triplet is added and the returned set is empty. */ 139 | std::set Codon::codonsFor(AminoAcid a) 140 | { 141 | std::set result; 142 | /* Each case adds, one addTriplet() call per codon, the codons listed for that amino acid. */ 143 | switch (a.intRep()) { 144 | case AminoAcid::AA_A: 145 | addTriplet(result, Nucleotide::G, Nucleotide::C, Nucleotide::T); 146 | addTriplet(result, Nucleotide::G, Nucleotide::C, Nucleotide::C); 147 | addTriplet(result, Nucleotide::G, Nucleotide::C, Nucleotide::A); 148 | addTriplet(result, Nucleotide::G, Nucleotide::C, Nucleotide::G); 149 | break; 150 | case
AminoAcid::AA_C: 151 | addTriplet(result, Nucleotide::T, Nucleotide::G, Nucleotide::T); 152 | addTriplet(result, Nucleotide::T, Nucleotide::G, Nucleotide::C); 153 | break; 154 | case AminoAcid::AA_D: 155 | addTriplet(result, Nucleotide::G, Nucleotide::A, Nucleotide::T); 156 | addTriplet(result, Nucleotide::G, Nucleotide::A, Nucleotide::C); 157 | break; 158 | case AminoAcid::AA_E: 159 | addTriplet(result, Nucleotide::G, Nucleotide::A, Nucleotide::A); 160 | addTriplet(result, Nucleotide::G, Nucleotide::A, Nucleotide::G); 161 | break; 162 | case AminoAcid::AA_F: 163 | addTriplet(result, Nucleotide::T, Nucleotide::T, Nucleotide::T); 164 | addTriplet(result, Nucleotide::T, Nucleotide::T, Nucleotide::C); 165 | break; 166 | case AminoAcid::AA_G: 167 | addTriplet(result, Nucleotide::G, Nucleotide::G, Nucleotide::T); 168 | addTriplet(result, Nucleotide::G, Nucleotide::G, Nucleotide::C); 169 | addTriplet(result, Nucleotide::G, Nucleotide::G, Nucleotide::A); 170 | addTriplet(result, Nucleotide::G, Nucleotide::G, Nucleotide::G); 171 | break; 172 | case AminoAcid::AA_H: 173 | addTriplet(result, Nucleotide::C, Nucleotide::A, Nucleotide::T); 174 | addTriplet(result, Nucleotide::C, Nucleotide::A, Nucleotide::C); 175 | break; 176 | case AminoAcid::AA_I: 177 | addTriplet(result, Nucleotide::A, Nucleotide::T, Nucleotide::T); 178 | addTriplet(result, Nucleotide::A, Nucleotide::T, Nucleotide::C); 179 | addTriplet(result, Nucleotide::A, Nucleotide::T, Nucleotide::A); 180 | break; 181 | case AminoAcid::AA_K: 182 | addTriplet(result, Nucleotide::A, Nucleotide::A, Nucleotide::A); 183 | addTriplet(result, Nucleotide::A, Nucleotide::A, Nucleotide::G); 184 | break; 185 | case AminoAcid::AA_L: 186 | addTriplet(result, Nucleotide::T, Nucleotide::T, Nucleotide::A); 187 | addTriplet(result, Nucleotide::T, Nucleotide::T, Nucleotide::G); 188 | addTriplet(result, Nucleotide::C, Nucleotide::T, Nucleotide::T); 189 | addTriplet(result, Nucleotide::C, Nucleotide::T, Nucleotide::C); 190 | 
addTriplet(result, Nucleotide::C, Nucleotide::T, Nucleotide::A); 191 | addTriplet(result, Nucleotide::C, Nucleotide::T, Nucleotide::G); 192 | break; 193 | case AminoAcid::AA_M: 194 | addTriplet(result, Nucleotide::A, Nucleotide::T, Nucleotide::G); 195 | break; 196 | case AminoAcid::AA_N: 197 | addTriplet(result, Nucleotide::A, Nucleotide::A, Nucleotide::T); 198 | addTriplet(result, Nucleotide::A, Nucleotide::A, Nucleotide::C); 199 | break; 200 | case AminoAcid::AA_P: 201 | addTriplet(result, Nucleotide::C, Nucleotide::C, Nucleotide::T); 202 | addTriplet(result, Nucleotide::C, Nucleotide::C, Nucleotide::C); 203 | addTriplet(result, Nucleotide::C, Nucleotide::C, Nucleotide::A); 204 | addTriplet(result, Nucleotide::C, Nucleotide::C, Nucleotide::G); 205 | break; 206 | case AminoAcid::AA_Q: 207 | addTriplet(result, Nucleotide::C, Nucleotide::A, Nucleotide::A); 208 | addTriplet(result, Nucleotide::C, Nucleotide::A, Nucleotide::G); 209 | break; 210 | case AminoAcid::AA_R: 211 | addTriplet(result, Nucleotide::C, Nucleotide::G, Nucleotide::T); 212 | addTriplet(result, Nucleotide::C, Nucleotide::G, Nucleotide::C); 213 | addTriplet(result, Nucleotide::C, Nucleotide::G, Nucleotide::A); 214 | addTriplet(result, Nucleotide::C, Nucleotide::G, Nucleotide::G); 215 | addTriplet(result, Nucleotide::A, Nucleotide::G, Nucleotide::A); 216 | addTriplet(result, Nucleotide::A, Nucleotide::G, Nucleotide::G); 217 | break; 218 | case AminoAcid::AA_S: 219 | addTriplet(result, Nucleotide::T, Nucleotide::C, Nucleotide::T); 220 | addTriplet(result, Nucleotide::T, Nucleotide::C, Nucleotide::C); 221 | addTriplet(result, Nucleotide::T, Nucleotide::C, Nucleotide::A); 222 | addTriplet(result, Nucleotide::T, Nucleotide::C, Nucleotide::G); 223 | addTriplet(result, Nucleotide::A, Nucleotide::G, Nucleotide::T); 224 | addTriplet(result, Nucleotide::A, Nucleotide::G, Nucleotide::C); 225 | break; 226 | case AminoAcid::AA_T: 227 | addTriplet(result, Nucleotide::A, Nucleotide::C, Nucleotide::T); 228 | 
addTriplet(result, Nucleotide::A, Nucleotide::C, Nucleotide::C); 229 | addTriplet(result, Nucleotide::A, Nucleotide::C, Nucleotide::A); 230 | addTriplet(result, Nucleotide::A, Nucleotide::C, Nucleotide::G); 231 | break; 232 | case AminoAcid::AA_V: 233 | addTriplet(result, Nucleotide::G, Nucleotide::T, Nucleotide::T); 234 | addTriplet(result, Nucleotide::G, Nucleotide::T, Nucleotide::C); 235 | addTriplet(result, Nucleotide::G, Nucleotide::T, Nucleotide::A); 236 | addTriplet(result, Nucleotide::G, Nucleotide::T, Nucleotide::G); 237 | break; 238 | case AminoAcid::AA_W: 239 | addTriplet(result, Nucleotide::T, Nucleotide::G, Nucleotide::G); 240 | break; 241 | case AminoAcid::AA_Y: 242 | addTriplet(result, Nucleotide::T, Nucleotide::A, Nucleotide::T); 243 | addTriplet(result, Nucleotide::T, Nucleotide::A, Nucleotide::C); 244 | break; 245 | case AminoAcid::AA_STP: 246 | addTriplet(result, Nucleotide::T, Nucleotide::A, Nucleotide::A); 247 | addTriplet(result, Nucleotide::T, Nucleotide::A, Nucleotide::G); 248 | addTriplet(result, Nucleotide::T, Nucleotide::G, Nucleotide::A); 249 | break; 250 | /* The remaining values add no triplet, so result is returned empty for them. */ case AminoAcid::AA_GAP: 251 | case AminoAcid::AA_Z: 252 | case AminoAcid::AA_U: 253 | case AminoAcid::AA_B: 254 | case AminoAcid::AA_X: 255 | default: 256 | break; 257 | } 258 | 259 | return result; 260 | } 261 | 262 | }; 263 | -------------------------------------------------------------------------------- /references/DENV/DENV4-NC002640.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | --------------------------------------------------------------------------------