├── .gitignore
├── CMakeLists.txt
├── src
    ├── mxml-utils
    │   ├── CMakeLists.txt
    │   ├── MXMLUtils.h
    │   └── MXMLUtils.C
    ├── mxml
    │   ├── CMakeLists.txt
    │   ├── ANNOUNCEMENT
    │   ├── mxml-private.h
    │   ├── config.h
    │   ├── README
    │   ├── mxml-search.c
    │   ├── mxml-private.c
    │   ├── mxml-attr.c
    │   └── mxml-set.c
    ├── libseq
    │   ├── CMakeLists.txt
    │   ├── Codon.h
    │   ├── ParseException.h
    │   ├── AlignmentAlgorithm.h
    │   ├── NeedlemanWunsh.h
    │   ├── CodingSequence.cpp
    │   ├── CodingSequence.h
    │   ├── AASequence.cpp
    │   ├── AminoAcid.cpp
    │   ├── CodonAlign.h
    │   ├── NTSequence.h
    │   ├── AASequence.h
    │   ├── AlignmentAlgorithm.cpp
    │   ├── AminoAcid.h
    │   ├── Nucleotide.h
    │   ├── NeedlemanWunsh.cpp
    │   ├── NTSequence.cpp
    │   ├── Nucleotide.cpp
    │   ├── CodonAlign.cpp
    │   └── Codon.cpp
    ├── CLIUtils.h
    ├── CMakeLists.txt
    ├── Utils.h
    ├── CLIUtils.cpp
    ├── ResultsExporter.h
    ├── ReferenceSequence.h
    ├── Utils.cpp
    ├── Alignment.h
    ├── ReferenceSequence.cpp
    ├── Virulign.cpp
    └── Alignment.cpp
├── references
    ├── HCV
    │   ├── README
    │   └── HCV2-FN666429.xml
    ├── SARS-CoV-2
    │   ├── ORF10.xml
    │   ├── ORF7b.xml
    │   ├── ORF6.xml
    │   ├── E.xml
    │   ├── ORF7a.xml
    │   ├── ORF8.xml
    │   ├── M.xml
    │   ├── ORF3a.xml
    │   ├── N.xml
    │   └── S.xml
    ├── HIV
    │   ├── HIV-HXB2-gag.xml
    │   ├── HIV-HXB2-env.xml
    │   └── HIV-HXB2-pol.xml
    ├── CHIKV
    │   ├── CHIKV-NC004162-gp2.xml
    │   └── CHIKV-NC004162-gp1.xml
    └── DENV
    │   └── DENV4-NC002640.xml
├── BUILD.txt
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | **/.DS_Store
2 | build
3 | install
4 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.5...3.29)  # must precede project() so policies are set first
2 | project(VIRULIGN LANGUAGES C CXX)
3 | 
4 | add_subdirectory(src)
5 | 


--------------------------------------------------------------------------------
/src/mxml-utils/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Helper library wrapping Mini-XML lookups for C++ callers.
 2 | # The parent directory is on the include path so "mxml/mxml.h" resolves.
 3 | include_directories(
 4 |   ${CMAKE_CURRENT_SOURCE_DIR}/..
 5 | )
 6 | 
 7 | add_library(mxml-utils MXMLUtils.C)
 8 | 
 9 | # Anything linking mxml-utils also needs the mxml library it calls into.
10 | target_link_libraries(mxml-utils mxml)
11 | 


--------------------------------------------------------------------------------
/src/mxml/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Mini-XML, vendored in-tree and built as the `mxml` library target.
 2 | # Source files are listed explicitly, one per line.
 3 | add_library(mxml
 4 |   mxml-attr.c
 5 |   mxml-entity.c
 6 |   mxml-file.c
 7 |   mxml-index.c
 8 |   mxml-node.c
 9 |   mxml-private.c
10 |   mxml-search.c
11 |   mxml-set.c
12 |   mxml-get.c
13 |   mxml-string.c
14 | )
15 | 


--------------------------------------------------------------------------------
/references/HCV/README:
--------------------------------------------------------------------------------
1 | To get FN666429:
2 | efetch -db sequences -id FN666429 -format gpc > FN666429.insd.xml
3 | manually remove the outer XML tag (Set)
4 | python2 ~/projects/virulign-tools/annotations/genbank_to_virulign.py ../FN666429.insd.xml "HCV2" 314 9398 > HCV2-FN666429.xml
5 | 


--------------------------------------------------------------------------------
/src/libseq/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # libseq: the sequence, codon and alignment classes, built as the `seq` library.
 2 | # No include paths are set here; whatever this directory inherits is used.
 3 | add_library(seq
 4 |   AASequence.cpp
 5 |   AlignmentAlgorithm.cpp
 6 |   AminoAcid.cpp
 7 |   CodingSequence.cpp
 8 |   Codon.cpp
 9 |   CodonAlign.cpp
10 |   NTSequence.cpp
11 |   NeedlemanWunsh.cpp
12 |   Nucleotide.cpp
13 | )
14 | 


--------------------------------------------------------------------------------
/src/CLIUtils.h:
--------------------------------------------------------------------------------
 1 | // This may look like C code, but it's really -*- C++ -*-
 2 | #ifndef CLI_UTILS_H_ 
 3 | #define CLI_UTILS_H_ 
 4 | 
 5 | #include <string>
 6 | // Loads the file's single FASTA sequence as a reference with one region "P"; throws std::runtime_error on failure.
 7 | ReferenceSequence loadRefSeqFromFile(const char* refSeqFileName); 
 8 | bool equalsS(char* str1, char* str2); // true when strcmp(str1, str2) == 0
 9 | bool equalsString(std::string str1, std::string str2); // true when str1.compare(str2) == 0
10 | 
11 | #endif // CLI_UTILS_H_ 
12 | 


--------------------------------------------------------------------------------
/references/SARS-CoV-2/ORF10.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!-- SARS-CoV-2 ORF10 from NC_045512.2 (29558-29674) -->
3 |   <orf name="orf10" description="orf10" referenceSequence="ATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAG" >
4 |          <protein abbreviation="orf10" startPosition="1" stopPosition="115" />
5 |   </orf>
6 | 


--------------------------------------------------------------------------------
/references/SARS-CoV-2/ORF7b.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!-- SARS-CoV-2 ORF7b from NC_045512.2 (27756-27887) -->
3 | <orf name="ORF7b" description="ORF7b" referenceSequence="ATGATTGAACTTTCATTAATTGACTTCTATTTGTGCTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAAGATCATAATGAAACTTGTCACGCCTAA" >
4 | <protein abbreviation="ORF7b" startPosition="1" stopPosition="131" />
5 | </orf>
6 | 


--------------------------------------------------------------------------------
/references/SARS-CoV-2/ORF6.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!-- SARS-CoV-2 ORF6 from NC_045512.2 (27202-27387) -->
3 |   <orf name="ORF6" description="ORF6" referenceSequence="ATGTTTCATCTCGTTGACTTTCAGGTTACTATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTTGGAATCTTGATTACATCATAAACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGATGAAGAGCAACCAATGGAGATTGATTAA" >
4 |          <protein abbreviation="ORF6" startPosition="1" stopPosition="185" />
5 |   </orf>
6 | 


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Directory-wide include paths are set before the subdirectories are added:
 2 | # libseq headers include <NTSequence.h> and rely on inheriting these paths.
 3 | include_directories(libseq mxml mxml-utils)
 4 | 
 5 | add_subdirectory(libseq)
 6 | add_subdirectory(mxml)
 7 | add_subdirectory(mxml-utils)
 8 | 
 9 | add_library(virulignlib
10 |   Alignment.cpp
11 |   CLIUtils.cpp
12 |   Utils.cpp
13 |   ReferenceSequence.cpp
14 |   ResultsExporter.cpp
15 | )
16 | 
17 | add_executable(virulign Virulign.cpp)
18 | target_link_libraries(virulign virulignlib seq mxml mxml-utils)
19 | install(TARGETS virulign DESTINATION bin)
20 | 


--------------------------------------------------------------------------------
/references/SARS-CoV-2/E.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!-- SARS-CoV-2 Envelope (E) from NC_045512.2 (26245-26472) -->
3 |   <orf name="E" description="envelope protein" referenceSequence="ATGTACTCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTATTCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGTGAGTCTTGTAAAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGATCTTCTGGTCTAA" >
4 |          <protein abbreviation="E" startPosition="1" stopPosition="227" />
5 |   </orf>
6 | 


--------------------------------------------------------------------------------
/src/mxml-utils/MXMLUtils.h:
--------------------------------------------------------------------------------
 1 | #ifndef MXML_UTILS
 2 | #define MXML_UTILS
 3 | 
 4 | #include <vector>
 5 | #include <string>
 6 | 
 7 | typedef struct mxml_node_s mxml_node_t;
 8 | // First <tagName> element found by descending from element (may be null); throws std::runtime_error if a further match follows it.
 9 | mxml_node_t *singleChildElement(mxml_node_t *element, 
10 | 				const std::string& tagName);
11 | // Copies attribute attributeName of element into result and returns true; returns false, leaving result untouched, when it is absent.
12 | bool attributeValue(mxml_node_t *element, const std::string& attributeName,
13 | 		    std::string& result);
14 | // All <tagName> matches: the first found by descending from element, then each further one reached without descending.
15 | std::vector<mxml_node_t *> 
16 | childElements(mxml_node_t *element, const std::string& tagName);
17 | 
18 | #endif //MXML_UTILS
19 | 


--------------------------------------------------------------------------------
/references/SARS-CoV-2/ORF7a.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!-- SARS-CoV-2 ORF7a from NC_045512.2 (27394-27759) -->
3 | <orf name="ORF7a" description="ORF7a" referenceSequence="ATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGAGCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACATACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAATTTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACTGTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTTATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGA" >
4 | <protein abbreviation="ORF7a" startPosition="1" stopPosition="365" />
5 | </orf>
6 | 


--------------------------------------------------------------------------------
/references/SARS-CoV-2/ORF8.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!-- SARS-CoV-2 ORF8 from NC_045512.2 (27894-28259) -->
3 |   <orf name="ORF8 " description="ORF8" referenceSequence="ATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTGTAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACCCGTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAATTGTGCGTGGATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCTGTTTACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAA" >
4 |          <protein abbreviation="ORF8" startPosition="1" stopPosition="365" />
5 |   </orf>
6 | 


--------------------------------------------------------------------------------
/src/mxml/ANNOUNCEMENT:
--------------------------------------------------------------------------------
 1 | Mini-XML 2.7 is now available for download from:
 2 | 
 3 |     http://www.minixml.org/software.php
 4 | 
 5 | Mini-XML 2.7 fixes some minor platform and XML issues. Changes include:
 6 | 
 7 | - Updated the source headers to reference the Mini-XML license and its
 8 |   exceptions to the LGPL2 (STR #108)
 9 | - The shared library did not include a destructor for the thread-
10 |   specific data key on UNIX-based operating systems (STR #103)
11 | - mxmlLoad* did not error out on XML with multiple root nodes (STR #101)
12 | - Fixed an issue with the _mxml_vstrdupf function (STR #107)
13 | - mxmlSave* no longer write all siblings of the passed node, just that
14 |   node and its children (STR #109)
15 | 


--------------------------------------------------------------------------------
/BUILD.txt:
--------------------------------------------------------------------------------
 1 | virulign: fast codon-correct alignment for virus pathogens
 2 | ------------------------------------------------------------
 3 | 
 4 | Requirements
 5 | ------------
 6 | * We use CMake (cmake.org) for the build process, and tested this on GNU/Linux, MacOS and Windows (Visual Studio C++ Express).
 7 | * C++ environment.
 8 | 
 9 | Build instructions
10 | ------------------
11 | Create a temporary build directory (for example within the project root).
12 | To install virulign to the operating system's default location, use:
13 | $ mkdir build
14 | $ cd build
15 | $ cmake ../ -DCMAKE_BUILD_TYPE=Release 
16 | $ make
17 | 
18 | To install virulign to a custom location, use the CMAKE_INSTALL_PREFIX variable:
19 | $ mkdir build
20 | $ cd build
21 | $ cmake ../ -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/soft/virulign/
22 | $ make
23 | 
24 | To install
25 | ----------
26 | $ make install
27 | 


--------------------------------------------------------------------------------
/src/Utils.h:
--------------------------------------------------------------------------------
 1 | // This may look like C code, but it's really -*- C++ -*-
 2 | #ifndef UTILS_H_ 
 3 | #define UTILS_H_ 
 4 | 
 5 | #include <string>
 6 | #include <sstream>
 7 | #include <algorithm>
 8 | #include <typeinfo>
 9 | 
10 | // Convert the whole of s to a T via stream extraction; throws std::bad_cast
11 | // when extraction fails or non-whitespace characters remain afterwards.
12 | template <typename T>
13 | T lexical_cast(const std::string& s)
14 | {
15 |     std::stringstream input(s);
16 |     T value;
17 | 
18 |     const bool extracted = !(input >> value).fail();
19 |     if (!extracted || !(input >> std::ws).eof())
20 |         throw std::bad_cast();
21 |     return value;
22 | }
23 | // Format t with its operator<< and return the resulting text.
24 | template <typename T>
25 | std::string to_string(const T& t)
26 | {
27 |   std::ostringstream buffer;
28 |   buffer << t;
29 |   return buffer.str();
30 | }
31 | // Copy of s with every character passed through toupper.
32 | std::string to_upper_copy(const std::string& s);
33 | // True when s ends with p; an empty p always matches.
34 | bool ends_with(const std::string& s, const std::string& p);
35 | // Millisecond timestamp from a platform clock (see Utils.cpp for the source used).
36 | long long current_time_ms();
37 | // Formats a millisecond duration as "[<h>h][<m>m]<s>s".
38 | std::string format_time(const long long& milliseconds);
39 | 
40 | #endif // UTILS_H_ 
41 | 


--------------------------------------------------------------------------------
/references/SARS-CoV-2/M.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!-- SARS-CoV-2 Membrane (M) from NC_045512.2 (26523-27191) -->
3 |   <orf name="M" description="membrane glycoprotein" referenceSequence="ATGGCAGATTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTCCTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATGCCAACAGGAATAGGTTTTTGTATATAATTAAGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTGCTTGCTGCTGTTTACAGAATAAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCTCAGCTACTTCATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTCTCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGTGATCCTTCGTGGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAAGAAATCACTGTTGCTACATCACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTGACTCAGGTTTTGCTGCATACAGTCGCTACAGGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAGCAGTGACAATATTGCTTTGCTTGTACAGTAA" >
4 |          <protein abbreviation="M" startPosition="1" stopPosition="668" />
5 |   </orf>
6 | 


--------------------------------------------------------------------------------
/src/libseq/Codon.h:
--------------------------------------------------------------------------------
 1 | // This may look like C code, but it's really -*- C++ -*-
 2 | #ifndef CODON_H_
 3 | #define CODON_H_
 4 | 
 5 | #include <string>
 6 | #include <set>
 7 | 
 8 | #include "NTSequence.h"
 9 | #include "AminoAcid.h"
10 | 
11 | namespace seq {
12 | 
13 | /**
14 |  * Utility class that defines the genetic code.
15 |  */
16 | class Codon
17 | {
18 | public:
19 |   /**
20 |    * Translate a nucleotide triplet (the three nucleotides starting at
21 |    * the indicated position in a NTSequence) into an AminoAcid.
22 |    *
23 |    * If the triplet is three gaps, then the result is AminoAcid::GAP.
24 |    * If the triplet contains ambiguity codes or gaps, then the result
25 |    * is AminoAcid::X. Otherwise, the result is the translated amino
26 |    * acid.
27 |    */
28 |   static AminoAcid translate(const NTSequence::const_iterator triplet);
29 |   // NOTE(review): presumably every amino acid an ambiguous triplet can encode -- confirm in Codon.cpp.
30 |   static std::set<AminoAcid>
31 |      translateAll(const NTSequence::const_iterator triplet);
32 |   // NOTE(review): presumably all codons translating to amino acid a -- confirm in Codon.cpp.
33 |   static std::set<NTSequence> codonsFor(AminoAcid a);
34 | };
35 | 
36 | };
37 | 
38 | #endif // CODON_H_
39 | 


--------------------------------------------------------------------------------
/references/SARS-CoV-2/ORF3a.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!-- SARS-CoV-2 ORF3a from NC_045512.2 (25393-26220) -->
3 | <orf name="ORF3a" description="ORF3a" referenceSequence="ATGGATTTGTTTATGAGAATCTTCACAATTGGAACTGTAACTTTGAAGCAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTCGCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTTCGGATGGCTTATTGTTGGCGTTGCACTTCTTGCTGTTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGCAACTAGCACTCTCCAAGGGTGTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTCGTTGCTGCTGGCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAATAATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTTCTTTGCTGGCATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTACTTCAGGTGATGGCACAACAAGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATGGGAATCTGGAGTAAAAGACTGTGTTGTATTACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCAACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGCCTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTTAATCCAGTAATGGAACCAATTTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAA" >
4 | <protein abbreviation="ORF3a" startPosition="1" stopPosition="827" />
5 | </orf>
6 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | **! See [here](https://github.com/rega-cev/virulign/tree/master/references/SARS-CoV-2) for xml files for alignment and annotation of SARS-CoV-2 genomes** 
 2 | 
 3 | 
 4 | 
 5 | ### Summary 
 6 | VIRULIGN is a tool for codon-correct pairwise alignments, with an augmented functionality to annotate the alignment according to the positions of the proteins. We present a [tutorial](https://github.com/rega-cev/virulign-tutorial) demonstrating VIRULIGN's functionality for three different pathogens (i.e., HIV-1, Dengue virus, Zika virus).
 7 | 
 8 | Please download releases for GNU/Linux, MacOS and Windows [here](https://github.com/rega-cev/virulign/releases). To install the download, just put the virulign binary in a directory that is in your PATH environment. To build VIRULIGN from source have a look at the [BUILD.txt file](BUILD.txt).
 9 | 
10 | 
11 | 
12 | ### Manuscript 
13 | For a detailed description of VIRULIGN and how to cite, please use: 
14 | 
15 | Libin PJK, Deforche K, Abecasis AB and Theys K.,  (2018),  VIRULIGN: fast codon-correct alignment and annotation of viral genomes,  Bioinformatics, bty851, https://doi.org/10.1093/bioinformatics/bty851
16 | 
17 | 


--------------------------------------------------------------------------------
/src/mxml/mxml-private.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * "$Id: mxml-private.h 408 2010-09-19 05:26:46Z mike $"
 3 |  *
 4 |  * Private definitions for Mini-XML, a small XML-like file parsing library.
 5 |  *
 6 |  * Copyright 2003-2010 by Michael R Sweet.
 7 |  *
 8 |  * These coded instructions, statements, and computer programs are the
 9 |  * property of Michael R Sweet and are protected by Federal copyright
10 |  * law.  Distribution and use rights are outlined in the file "COPYING"
11 |  * which should have been included with this file.  If this file is
12 |  * missing or damaged, see the license at:
13 |  *
14 |  *     http://www.minixml.org/
15 |  */
16 | 
17 | /*
18 |  * Include necessary headers...
19 |  */
20 | 
21 | #include "config.h"
22 | #include "mxml.h"
23 | 
24 | 
25 | /*
26 |  * Global, per-thread data...
27 |  */
28 | 
29 | typedef struct _mxml_global_s
30 | {
31 |   void	(*error_cb)(const char *);	/* Error callback; presumably receives the message text */
32 |   int	num_entity_cbs;	/* Presumably the number of entity_cbs slots in use -- confirm in mxml-entity.c */
33 |   int	(*entity_cbs[100])(const char *name);	/* Entity callbacks taking a name; fixed capacity of 100 */
34 |   int	wrap;	/* NOTE(review): likely the output wrap column -- confirm in mxml-file.c */
35 |   mxml_custom_load_cb_t	custom_load_cb;	/* Custom load callback */
36 |   mxml_custom_save_cb_t	custom_save_cb;	/* Custom save callback */
37 | } _mxml_global_t;
38 | 
39 | 
40 | /*
41 |  * Functions...
42 |  */
43 | 
44 | extern _mxml_global_t	*_mxml_global(void);	/* Returns a pointer to the _mxml_global_t state */
45 | extern int		_mxml_entity_cb(const char *name);	/* Same signature as the entity_cbs entries */
46 | 
47 | 
48 | /*
49 |  * End of "$Id: mxml-private.h 408 2010-09-19 05:26:46Z mike $".
50 |  */
51 | 


--------------------------------------------------------------------------------
/src/CLIUtils.cpp:
--------------------------------------------------------------------------------
 1 | // This may look like C code, but it's really -*- C++ -*-
 2 | 
 3 | #include <fstream> 
 4 | #include <string.h>
 5 | #include <stdexcept>
 6 | 
 7 | #include "ReferenceSequence.h" 
 8 | #include "CLIUtils.h" 
 9 | 
10 | // Load the single FASTA nucleotide sequence in refSeqFileName as a
11 | // ReferenceSequence carrying one region "P" over AA positions [0, size/3).
12 | // Throws std::runtime_error if the file cannot be opened, yields no
13 | // sequence, yields more than one sequence, or fails to parse.
14 | ReferenceSequence loadRefSeqFromFile(const char* refSeqFileName) {
15 |   std::ifstream f_ref(refSeqFileName);
16 |   if (!f_ref) {
17 |     throw std::runtime_error(std::string("Could not open ") + refSeqFileName);
18 |   }
19 | 
20 |   try {
21 |     seq::NTSequence refNt;
22 |     f_ref >> refNt;
23 |     if (!f_ref) {
24 |       throw std::runtime_error(std::string("RefSeq loading:: File ") + refSeqFileName + " does not contain a FASTA sequence ?");
25 |     }
26 |     // Held by value so nothing leaks on return or when an exception unwinds.
27 |     ReferenceSequence ref(refNt);
28 |     ref.addRegion(ReferenceSequence::Region(0, refNt.size()/3, "P"));
29 |     // A second successful extraction means the file holds several sequences.
30 |     f_ref >> refNt;
31 |     if (f_ref) {
32 |       throw std::runtime_error(std::string("RefSeq loading:: File ") + refSeqFileName + " contains multiple sequences ?");
33 |     }
34 |     return ref;
35 |   } catch (const seq::ParseException& e) {
36 |     throw std::runtime_error(std::string("RefSeq loading:: Fatal error: ") + e.message());
37 |   }
38 | }
39 | 
40 | bool equalsS(char* str1, char* str2) {
41 |   return !strcmp(str1, str2);  // strcmp yields 0 exactly when the C strings match
42 | }
43 | 
44 | bool equalsString(std::string str1, std::string str2){
45 |   return str1 == str2;  // same result as compare() == 0
46 | }
47 | 


--------------------------------------------------------------------------------
/src/ResultsExporter.h:
--------------------------------------------------------------------------------
 1 | // This may look like C code, but it's really -*- C++ -*-
 2 | #ifndef RESULTS_EXPORTER_H_
 3 | #define RESULTS_EXPORTER_H_
 4 | 
 5 | #include <iostream>
 6 | #include <vector>
 7 | 
 8 | class Alignment;
 9 | // Output format and alphabet selectors taken by the ResultsExporter constructor.
10 | enum ExportKind { Mutations, PairwiseAlignments, GlobalAlignment,
11 | 		  PositionTable, MutationTable };
12 | enum ExportAlphabet { Nucleotides, AminoAcids };
13 | // Writes alignment results to a stream; kind, alphabet and the insertion flag are fixed at construction.
14 | class ResultsExporter
15 | {
16 | public:
17 |   ResultsExporter(const std::vector<Alignment>& results, ExportKind kind,
18 | 		  ExportAlphabet alphabet, bool withInsertions = false);
19 |   // Accessors for the settings held in the const members below.
20 |   ExportKind     kind()     const { return kind_; }
21 |   ExportAlphabet alphabet() const { return alphabet_; }
22 | 
23 |   void streamData(std::ostream& stream);
24 | 	void streamConsensusSequence(std::ostream& stream);
25 | 
26 | private:
27 |   const std::vector<Alignment>& results_;  // a reference, not a copy: the referenced vector must outlive this object
28 |   const ExportKind      kind_;
29 |   const ExportAlphabet  alphabet_;
30 |   const bool            withInsertions_;
31 |   // NOTE(review): presumably one writer per ExportKind, selected by streamData -- confirm in ResultsExporter.cpp.
32 |   void streamMutationsCsv(std::ostream& stream);
33 |   void streamPairwiseAlignments(std::ostream& stream);
34 |   void streamPositionTable(std::ostream& stream);
35 |   void streamMutationTable(std::ostream& stream);
36 |   // NOTE(review): presumably fills globalRef/globalAlignment for streamGlobalAlignment -- confirm.
37 |   void computeGlobalAlignment(seq::NTSequence& globalRef,
38 | 			      std::vector<seq::NTSequence>& globalAlignment);
39 |   void streamGlobalAlignment(std::ostream& stream);
40 | };
41 | 
42 | #endif // RESULTS_EXPORTER_H_
43 | 


--------------------------------------------------------------------------------
/references/SARS-CoV-2/N.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!-- SARS-CoV-2 Nucleocapsid (N) from NC_045512.2 (28274-29533) -->
3 |   <orf name="N" description="nucleocapsid protein" referenceSequence="ATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCCTCAGATTCAACTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCCAAGGTTTACCCAATAATACTGCGTCTTGGTTCACCGCTCTCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTCCAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGGTGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCTGGACTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAAAAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAACATTGCCAAAAGGCTTCTACGCAGAAGGGAGCAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGTAGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCAATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGGTAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGGCAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCCAAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACAATTTGCCCCCAGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACGTGGTTGACCTACACAGGTGCCATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGCTGAATAAGCATATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGCTGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCCTGCTGCAGATTTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTCAACTCAGGCCTAA" >
4 |          <protein abbreviation="N" startPosition="1" stopPosition="1259" />
5 |   </orf>
6 | 


--------------------------------------------------------------------------------
/src/libseq/ParseException.h:
--------------------------------------------------------------------------------
 1 | // This may look like C code, but it's really -*- C++ -*-
 2 | #ifndef PARSE_EXCEPTION_H_
 3 | #define PARSE_EXCEPTION_H_
 4 | 
 5 | #include <string>
 6 | 
 7 | namespace seq {
 8 | 
 9 | /**
10 |  * Exception thrown when an error was encountered while parsing the
11 |  * string representation of a nucleotide, nucleotide sequence, amino
12 |  * acid, amino acid sequence, or a FASTA file.
13 |  *
14 |  * \sa Nucleotide::Nucleotide(char), AminoAcid::AminoAcid(char),
15 |  * NTSequence::NTSequence(const std::string, const std::string, const
16 |  * std::string, bool), AASequence::AASequence(const std::string, const
17 |  * std::string, const std::string), operator>> (std::istream&,
18 |  * NTSequence&), operator>> (std::istream&, AASequence&)
19 |  */
20 | class ParseException
21 | {
22 | public:
23 |   // Arguments are taken by const reference and copied into the members.
24 |   ParseException(const std::string& name,
25 | 		 const std::string& message, bool recovered)
26 |     : name_(name), message_(message), recovered_(recovered) { }
27 |   /**
28 |    * The sequence name.
29 |    */
30 |   std::string name() const { return name_; }
31 | 
32 |   /**
33 |    * The message describing the error.
34 |    */
35 |   std::string message() const { return message_; }
36 | 
37 |   /**
38 |    * Whether the parser attempted to recover and you could try parsing
39 |    * the next sequence.
40 |    */
41 |   bool recovered() const { return recovered_; }
42 | 
43 | private:
44 |   std::string name_;
45 |   std::string message_;
46 |   bool recovered_;
47 | };
48 | 
49 | };
50 | 
51 | #endif // PARSE_EXCEPTION_H_
52 | 


--------------------------------------------------------------------------------
/src/mxml-utils/MXMLUtils.C:
--------------------------------------------------------------------------------
 1 | #include "MXMLUtils.h"
 2 | #include "mxml/mxml.h"
 3 | 
 4 | #include <stdexcept>
 5 | #include <iostream>
 6 | #include <fstream>
 7 | 
 8 | mxml_node_t *singleChildElement(mxml_node_t *element, 
 9 | 				const std::string& tagName)
10 | {
11 |   // Locate the first <tagName> element, descending from element.
12 |   const char *tag = tagName.c_str();
13 |   mxml_node_t *match = mxmlFindElement(element, element, tag, 0, 0,
14 | 				       MXML_DESCEND);
15 |   if (!match)
16 |     return match;  // nothing found: hand back the null pointer
17 | 
18 |   // A further match found without descending means the tag is not unique.
19 |   if (mxmlFindElement(match, element, tag, 0, 0, MXML_NO_DESCEND)) {
20 |     throw std::runtime_error(std::string("Expected only one child <") 
21 | 			       + tagName
22 | 			       + "> in <" + element->value.element.name + ">");
23 |   }
24 | 
25 |   if (match->type != MXML_ELEMENT)
26 |     throw std::runtime_error("Expected an XML DOM element");
27 |   return match;
28 | }
29 | 
30 | bool attributeValue(mxml_node_t *element, const std::string& attributeName,
31 | 		    std::string& result)
32 | {
33 |   // A null pointer from mxmlElementGetAttr is treated as "attribute missing";
34 |   // result is only written on success.
35 |   const char *value = mxmlElementGetAttr(element, attributeName.c_str());
36 |   if (!value)
37 |     return false;
38 | 
39 |   result = value;
40 |   return true;
41 | }
42 | 
43 | std::vector<mxml_node_t *> 
44 | childElements(mxml_node_t *element, const std::string& tagName)
45 | {
46 |   // Collect the first <tagName> match found by descending from element,
47 |   // then each further match reached from it without descending.
48 |   const char *tag = tagName.c_str();
49 |   std::vector<mxml_node_t *> found;
50 |   for (mxml_node_t *node = mxmlFindElement(element, element, tag, 0, 0, MXML_DESCEND);
51 |        node;
52 |        node = mxmlFindElement(node, element, tag, 0, 0, MXML_NO_DESCEND))
53 |     found.push_back(node);
54 | 
55 |   return found;
56 | }
57 | 


--------------------------------------------------------------------------------
/src/ReferenceSequence.h:
--------------------------------------------------------------------------------
 1 | // This may look like C code, but it's really -*- C++ -*-
 2 | #ifndef REFERENCE_SEQUENCE_H_
 3 | #define REFERENCE_SEQUENCE_H_
 4 | 
 5 | #include <NTSequence.h>
 6 | 
 7 | #include <map>
 8 | #include <vector>
 9 | // A nucleotide sequence (seq::NTSequence) annotated with a list of regions.
10 | class ReferenceSequence : public seq::NTSequence
11 | {
12 | public:
13 |   class Region {
14 |   public:
15 |     // begin/end are AA positions; the aligned*/target* fields start at 0.
16 |     Region(int begin, int end, std::string prefix)
17 |       : alignedBegin(0), alignedEnd(0),
18 | 	targetBegin(0), targetEnd(0),
19 | 	begin_(begin), end_(end), prefix_(prefix)
20 |     { }
21 | 
22 |     int         begin()   const { return begin_; }   // AA position [0 -- N[
23 |     int         end()     const { return end_; }     // AA position [0 -- N[
24 |     std::string prefix()  const { return prefix_; }
25 |     
26 |     // aligned positions of begin, end
27 |     int         alignedBegin, alignedEnd; // AA position [0 -- N[
28 |     // reference position of first/last non-gap in target within region
29 |     int         targetBegin, targetEnd;   // AA position [0 -- N[
30 | 
31 |   private:
32 |     int                       begin_, end_;
33 |     std::string               prefix_;
34 | 
35 |     friend class ReferenceSequence;
36 |   };
37 | 
38 |   ReferenceSequence(const seq::NTSequence& seq);
39 |   
40 |   const std::vector<Region>& regions() const { return regions_; }
41 |   std::vector<Region>&       regions() { return regions_; }
42 |   void addRegion(Region r) { regions_.push_back(r); }
43 |   // NOTE(review): the parsers below presumably read the XML reference files -- confirm in ReferenceSequence.cpp.
44 |   static std::map<std::string, std::vector<ReferenceSequence> > 
45 |   parseProteinReferences(std::string genomesXmlFile);
46 |   static ReferenceSequence 
47 |   parseOrfReferenceFile(const std::string& fileName);
48 | 
49 | private:
50 |   std::vector<Region> regions_;
51 | };
52 | 
53 | #endif // REFERENCE_SEQUENCE_H_
54 | 


--------------------------------------------------------------------------------
/src/Utils.cpp:
--------------------------------------------------------------------------------
 1 | #include "Utils.h"
 2 | 
 3 | #include <string>
 4 | #include <algorithm>
 5 | #include <sstream>
 6 | 
 7 | #ifdef _WIN32
 8 | #include <windows.h>
 9 | #else
10 | #include <sys/time.h>
11 | #endif
12 | // Upper-cased copy of s. Each char is converted to unsigned char first,
13 | // because toupper is undefined for negative argument values.
14 | std::string to_upper_copy(const std::string& s)
15 | {
16 |   std::string copy = s;
17 |   for (std::string::size_type i = 0; i < copy.size(); ++i)
18 |     copy[i] = static_cast<char>(::toupper(static_cast<unsigned char>(copy[i])));
19 |   return copy; // every character has been mapped in place
20 | }
20 | // Report whether s ends with the suffix p. An empty p always matches, and a
21 | // p longer than s never does.
22 | bool ends_with(const std::string& s, const std::string& p)
23 | {
24 |   const std::string::size_type suffixLength = p.size();
25 |   if (suffixLength > s.size())
26 |     return false;
27 | 
28 |   // Compare p against the final suffixLength characters of s.
29 |   const std::string::size_type start = s.size() - suffixLength;
30 |   return s.compare(start, suffixLength, p) == 0;
31 | }
32 | // Millisecond timestamp for measuring elapsed time; the zero point differs per platform.
33 | long long current_time_ms()
34 | {
35 | #ifdef _WIN32
36 |   static LARGE_INTEGER s_frequency;
37 |   static BOOL s_use_qpc = QueryPerformanceFrequency(&s_frequency);
38 |   if (s_use_qpc) {
39 |     LARGE_INTEGER now;
40 |     QueryPerformanceCounter(&now);
41 |     return (1000LL * now.QuadPart) / s_frequency.QuadPart;
42 |   } else {
43 |     return GetTickCount();  // no high-resolution counter available
44 |   }
45 | #else
46 |   struct timeval  tv;
47 |   gettimeofday(&tv, NULL);
48 |   // Widen before multiplying so a 32-bit time_t cannot overflow.
49 |   return static_cast<long long>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
50 | #endif
51 | }
52 | 
53 | // Format a millisecond duration as "[<h>h][<m>m]<s>s"; hours are not wrapped at 24.
54 | std::string format_time(const long long& milliseconds)
55 | {
56 |   const long long totalSeconds = milliseconds / 1000;
57 |   const long long seconds = totalSeconds % 60;
58 |   const long long minutes = (totalSeconds / 60) % 60;
59 |   const long long hours   = totalSeconds / 3600;  // keeps whole days as extra hours
60 |   std::stringstream ss;
61 |   if (hours != 0)
62 |     ss << hours << "h";
63 |   if (minutes != 0 || hours != 0)
64 |     ss << minutes << "m";
65 |   ss << seconds << "s";
66 |   return ss.str();
67 | }
68 | 


--------------------------------------------------------------------------------
/src/libseq/AlignmentAlgorithm.h:
--------------------------------------------------------------------------------
 1 | // This may look like C code, but it's really -*- C++ -*-
 2 | #ifndef ALIGNMENT_ALGORITHM_H_
 3 | #define ALIGNMENT_ALGORITHM_H_
 4 | 
 5 | #include <NTSequence.h>
 6 | #include <AASequence.h>
 7 | 
 8 | /**
 9 |  * libseq namespace
10 |  */
11 | namespace seq {
12 | 
13 | class AlignmentAlgorithm {
14 |   public:
15 |     /**
16 |      * Pair-wise align two nucleotide sequences.
17 |      *
18 |      * The two sequences seq1 and seq2 are aligned in-place: gaps are inserted
19 |      * according to a global alignment, and they will have equal length.
20 |      */
21 |     virtual double align(NTSequence& seq1, NTSequence& seq2) = 0;
22 | 
23 |     /**
24 |      * Pair-wise align two amino acid sequences.
25 |      *
26 |      * The two sequences seq1 and seq2 are aligned in-place: gaps are inserted
27 |      * according to a global alignment, and they will have equal length.
28 |      */
29 |     virtual double align(AASequence& seq1, AASequence& seq2) = 0;
30 | 
31 |     virtual double computeAlignScore(const NTSequence& seq1, 
32 | 				     const NTSequence& seq2) = 0;
33 | 
34 |     /**
35 |      * Similarity weights matrix for nucleotides.
36 |      *
37 |      * Compares also IUB ambiuguity codes, and is the matrix used by BLAST.
38 |      *
39 |      * Taken from: ftp://ftp.ncbi.nih.gov/blast/matrices/NUC.4.4
40 |      */
41 |     static double** IUB();
42 | 
43 |     /**
44 |      * Similarity weights matrix for amino acids.
45 |      *
46 |      * This is from the famous BLOSUM series of weight matrices, the one
47 |      * that is the default use by ClustalX.
48 |      *
49 |      * From: ftp://ftp.ncbi.nih.gov/blast/matrices/BLOSUM30
50 |      */
51 |     static double** BLOSUM30();
52 |   };
53 | 
54 | }
55 | 
56 | #endif // ALIGNMENT_ALGORITHM_H_
57 | 


--------------------------------------------------------------------------------
/src/Alignment.h:
--------------------------------------------------------------------------------
 1 | // This may look like C code, but it's really -*- C++ -*-
 2 | #ifndef ALIGNMENT_H_
 3 | #define ALIGNMENT_H_
 4 | 
 5 | #include "ReferenceSequence.h"
 6 | #include <AlignmentAlgorithm.h>
 7 | 
 8 | class IsolateMutation;
 9 | 
/*! \brief An alignment of a target nucleotide sequence against a reference
 *         sequence, together with its outcome flags and score.
 *
 * Instances cannot be constructed directly (the constructor is private);
 * they are obtained from the static factories compute() and given().
 */
class Alignment
{
public:
  // Outcome flags and metrics of the alignment.
  // NOTE(review): the exact meaning of each flag is established in
  // Alignment.cpp, which is not visible here -- confirm before relying on it.
  bool   success;
  bool   tooShort;
  bool   failure;
  int    correctedFrameshifts;
  double score;

  // The reference and target sequences this alignment is about.
  ReferenceSequence ref;
  seq::NTSequence   target;

  /*! \brief Return a textual description of the mutations in the given
   *         region (format defined by the implementation).
   */
  std::string 
  mutations(const ReferenceSequence::Region& region) const;

  /*! \brief Return a textual description of the codon mutations in the
   *         given region.
   *
   * start and end are output parameters.
   * NOTE(review): presumably they receive the covered range within the
   * region -- confirm against the implementation.
   */
  std::string 
  codonMutations(const ReferenceSequence::Region& region,
		 int& start,
		 int& end) const;

  /*! \brief Collect mutations for the given region into the output vector
   *         \p mutations.
   */
  void isolateMutations(const ReferenceSequence::Region& regioni, std::vector<IsolateMutation>& mutations) const;

  /*! \brief Return the amino acid position of the given mutation, if there
   *         is information on that mutation in the alignment
   *
   * The first result (bool) indicates if the target sequence contains
   * the mutation.
   *
   * The second result is the amino acid position. If the mutation is
   * an insertion which is not contained in the sequence, this value is
   * -1.
   */
  std::pair<bool, int> findAminoAcid(const ReferenceSequence::Region& region,
				     int positionInRegion, int insertion)
    const;

  /*! \brief Create an alignment of \p target against \p ref using the given
   *         alignment algorithm.
   *
   * maxFrameShifts defaults to 5.
   * NOTE(review): presumably the maximum number of frameshifts that may be
   * corrected (see correctedFrameshifts) -- confirm.
   */
  static Alignment compute(const ReferenceSequence& ref,
			   const seq::NTSequence& target,
			   seq::AlignmentAlgorithm* algorithm,
			   int maxFrameShifts = 5);

  /*! \brief Create an alignment from a reference and target without running
   *         an alignment algorithm (none is passed).
   *
   * NOTE(review): presumably the two sequences are already aligned -- confirm.
   */
  static Alignment given(const ReferenceSequence& ref,
			 const seq::NTSequence& target);

  /*! \brief Revert the given mutation in this alignment. */
  void revert(const IsolateMutation& mutation);

private:
  // Private: use compute() or given() to obtain an Alignment.
  Alignment(const ReferenceSequence& aref,
	    const seq::NTSequence&   atarget);

  // Internal position-mapping helpers; semantics are defined in Alignment.cpp.
  void     computeAlignedRanges(int referenceSequenceLength);
  int      alignedPos(int refPos) const;
  int      firstPos(int begin, int end) const;
  int      lastPos(int begin, int end) const;
};
63 | 
64 | #endif // ALIGNMENT_H_
65 | 


--------------------------------------------------------------------------------
/references/HIV/HIV-HXB2-gag.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
 2 |     <!-- HIV gag ORF from HXB2 (NC_001802) -->
 3 |     <orf name="HIV-HXB2-gag" referenceSequence="atgggtgcgagagcgtcagtattaagcgggggagaattagatcgatgggaaaaaattcggttaaggccagggggaaagaaaaaatataaattaaaacatatagtatgggcaagcagggagctagaacgattcgcagttaatcctggcctgttagaaacatcagaaggctgtagacaaatactgggacagctacaaccatcccttcagacaggatcagaagaacttagatcattatataatacagtagcaaccctctattgtgtgcatcaaaggatagagataaaagacaccaaggaagctttagacaagatagaggaagagcaaaacaaaagtaagaaaaaagcacagcaagcagcagctgacacaggacacagcaatcaggtcagccaaaattaccctatagtgcagaacatccaggggcaaatggtacatcaggccatatcacctagaactttaaatgcatgggtaaaagtagtagaagagaaggctttcagcccagaagtgatacccatgttttcagcattatcagaaggagccaccccacaagatttaaacaccatgctaaacacagtggggggacatcaagcagccatgcaaatgttaaaagagaccatcaatgaggaagctgcagaatgggatagagtgcatccagtgcatgcagggcctattgcaccaggccagatgagagaaccaaggggaagtgacatagcaggaactactagtacccttcaggaacaaataggatggatgacaaataatccacctatcccagtaggagaaatttataaaagatggataatcctgggattaaataaaatagtaagaatgtatagccctaccagcattctggacataagacaaggaccaaaggaaccctttagagactatgtagaccggttctataaaactctaagagccgagcaagcttcacaggaggtaaaaaattggatgacagaaaccttgttggtccaaaatgcgaacccagattgtaagactattttaaaagcattgggaccagcggctacactagaagaaatgatgacagcatgtcagggagtaggaggacccggccataaggcaagagttttggctgaagcaatgagccaagtaacaaattcagctaccataatgatgcagagaggcaattttaggaaccaaagaaagattgttaagtgtttcaattgtggcaaagaagggcacacagccagaaattgcagggcccctaggaaaaagggctgttggaaatgtggaaaggaaggacaccaaatgaaagattgtactgagagacaggctaattttttagggaagatctggccttcctacaagggaaggccagggaattttcttcagagcagaccagagccaacagccccaccagaagagagcttcaggtctggggtagagacaacaactccccctcagaagcaggagccgatagacaaggaactgtatcctttaacttccctcaggtcactctttggcaacgacccctcgtcacaataa">
 4 |       <protein abbreviation="p17" startPosition="1" stopPosition="397"/>
 5 |       <protein abbreviation="p7" startPosition="1132" stopPosition="1297" />
 6 |       <protein abbreviation="p1" startPosition="1297" stopPosition="1345" />
 7 |       <protein abbreviation="p24" startPosition="397" stopPosition="1090" />
 8 |       <protein abbreviation="p2" startPosition="1090" stopPosition="1132" />
 9 |       <protein abbreviation="p6" startPosition="1345" stopPosition="1504" />
10 |   </orf>
11 | 


--------------------------------------------------------------------------------
/src/mxml/config.h:
--------------------------------------------------------------------------------
 1 | /* config.h.  Generated by configure.  */
 2 | /*
 3 |  * "$Id: config.h,v 1.2 2007/10/11 08:22:37 wimpie Exp $"
 4 |  *
 5 |  * Configuration file for Mini-XML, a small XML-like file parsing library.
 6 |  *
 7 |  * Copyright 2003-2007 by Michael Sweet.
 8 |  *
 9 |  * This program is free software; you can redistribute it and/or
10 |  * modify it under the terms of the GNU Library General Public
11 |  * License as published by the Free Software Foundation; either
12 |  * version 2, or (at your option) any later version.
13 |  *
14 |  * This program is distributed in the hope that it will be useful,
15 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 |  * GNU General Public License for more details.
18 |  */
19 | 
20 | /*
21 |  * Include necessary headers...
22 |  */
23 | 
24 | #include <stdio.h>
25 | #include <stdlib.h>
26 | #include <string.h>
27 | #include <stdarg.h>
28 | #include <ctype.h>
29 | 
30 | 
31 | /*
32 |  * Version number...
33 |  */
34 | 
35 | #define MXML_VERSION "Mini-XML v2.3"
36 | 
37 | 
38 | /*
39 |  * Do we have the snprintf() and vsnprintf() functions?
40 |  */
41 | 
#define HAVE_SNPRINTF 1
#define HAVE_VSNPRINTF 1
/*
 * Only Microsoft compilers older than Visual Studio 2015 (_MSC_VER < 1900)
 * lack snprintf() and need it mapped to _snprintf(). Newer MSVC releases
 * and other Windows toolchains provide a conforming snprintf(); mapping it
 * to _snprintf() there would lose the guaranteed NUL termination on
 * truncation.
 */
#if defined(_MSC_VER) && (_MSC_VER < 1900)
#define snprintf _snprintf
#endif
47 | 
48 | 
49 | /*
50 |  * Do we have the strXXX() functions?
51 |  */
52 | 
53 | #define HAVE_STRDUP 1
54 | 
55 | 
56 | /*
57 |  * Define prototypes for string functions as needed...
58 |  */
59 | 
/* Fallback strdup(): only declared (and strdup redirected to it) when
 * HAVE_STRDUP is not defined. */
#  ifndef HAVE_STRDUP
extern char	*_mxml_strdup(const char *);
#    define strdup _mxml_strdup
#  endif /* !HAVE_STRDUP */

/* Always declared, regardless of the HAVE_* settings.
 * NOTE(review): presumably printf-style formatting into a newly allocated
 * string -- confirm against their definitions. */
extern char	*_mxml_strdupf(const char *, ...);
extern char	*_mxml_vstrdupf(const char *, va_list);

/* Fallback snprintf(): only declared (and snprintf redirected to it) when
 * HAVE_SNPRINTF is not defined. */
#  ifndef HAVE_SNPRINTF
extern int	_mxml_snprintf(char *, size_t, const char *, ...);
#    define snprintf _mxml_snprintf
#  endif /* !HAVE_SNPRINTF */

/* Fallback vsnprintf(): only declared (and vsnprintf redirected to it) when
 * HAVE_VSNPRINTF is not defined. */
#  ifndef HAVE_VSNPRINTF
extern int	_mxml_vsnprintf(char *, size_t, const char *, va_list);
#    define vsnprintf _mxml_vsnprintf
#  endif /* !HAVE_VSNPRINTF */
77 | 
78 | /*
79 |  * End of "$Id: config.h,v 1.2 2007/10/11 08:22:37 wimpie Exp $".
80 |  */
81 | 


--------------------------------------------------------------------------------
/src/libseq/NeedlemanWunsh.h:
--------------------------------------------------------------------------------
 1 | // This may look like C code, but it's really -*- C++ -*-
 2 | #ifndef NEEDLEMAN_WUNSH_H_
 3 | #define NEEDLEMAN_WUNSH_H_
 4 | 
 5 | #include <AlignmentAlgorithm.h>
 6 | 
 7 | /**
 8 |  * libseq namespace
 9 |  */
10 | namespace seq {
11 | 
/**
 * AlignmentAlgorithm implementation based on a modified Needleman-Wunsch
 * global alignment (see the align() methods for the modifications).
 */
class NeedlemanWunsh : public AlignmentAlgorithm 
{
  public:
    /**
     * Create an aligner with the given gap scores and weight matrices.
     *
     * By default the gap open score is -10, the gap extension score is -3.3,
     * nucleotides are scored with AlignmentAlgorithm::IUB() and amino acids
     * with AlignmentAlgorithm::BLOSUM30().
     */
    NeedlemanWunsh(double gapOpenScore = -10,
		   double gapExtensionScore = -3.3,
		   double **ntWeightMatrix = 
		   AlignmentAlgorithm::IUB(),
		   double **aaWeightMatrix = 
		   AlignmentAlgorithm::BLOSUM30());
  /**
   * Pair-wise align two nucleotide sequences, using a modified
   * Needleman-Wunsch algorithm.
   *
   * The two sequences seq1 and seq2 are aligned in-place: gaps are inserted
   * according to a global alignment, and they will have equal length.
   *
   * The algorithm is Needleman-Wunsch, with two popular modifications:
   *  - there is a different cost for opening a gap or for extending a gap.
   *  - there is no gap open cost for a gap at the beginning or the end.
   */
  virtual double align(NTSequence& seq1, NTSequence& seq2);

  /**
   * Pair-wise align two amino acid sequences, using a modified
   * Needleman-Wunsch algorithm.
   *
   * The two sequences seq1 and seq2 are aligned in-place: gaps are inserted
   * according to a global alignment, and they will have equal length.
   *
   * The algorithm is Needleman-Wunsch, with two popular modifications:
   *  - there is a different cost for opening a gap or for extending a gap.
   *  - there is no gap open cost for a gap at the beginning or the end.
   */
  virtual double align(AASequence& seq1, AASequence& seq2);

  /**
   * Compute a score for two nucleotide sequences without modifying them.
   *
   * NOTE(review): presumably expects sequences that are already aligned --
   * confirm against the implementation.
   */
  virtual double computeAlignScore(const NTSequence& seq1, 
				   const NTSequence& seq2);

private:
  // Scoring parameters, as passed to the constructor.
  double gapOpenScore_;
  double gapExtensionScore_;
  double **ntWeightMatrix_;
  double **aaWeightMatrix_;

  /**
   * Alignment routine shared by both align() overloads: works on plain
   * vectors of symbols and takes the weight matrix to score them with.
   */
  template <typename Symbol>
  double needlemanWunshAlign(std::vector<Symbol>& seq1,
			     std::vector<Symbol>& seq2,
			     double** weigthMatrix);
};
61 | 
62 | }
63 | 
64 | #endif // NEEDLEMAN_WUNSH_H_
65 | 


--------------------------------------------------------------------------------
/src/libseq/CodingSequence.cpp:
--------------------------------------------------------------------------------
  1 | #include <ctype.h>
  2 | 
  3 | #include "Codon.h"
  4 | #include "CodingSequence.h"
  5 | 
  6 | namespace seq {
  7 | 
  8 | CodingSequence::CodingSequence()
  9 |   : ntSequence_(),
 10 |     aaSequence_(),
 11 |     dirty_(D_COMPLETE)
 12 | { }
 13 | 
 14 | CodingSequence::CodingSequence(const NTSequence& aNtSequence)
 15 |   : ntSequence_(aNtSequence),
 16 |     aaSequence_(aNtSequence.size() / 3),
 17 |     dirty_(D_COMPLETE)
 18 | { }
 19 | 
 20 | const AASequence& CodingSequence::aaSequence() const
 21 | {
 22 |   if (isDirty())
 23 |     updateAASequence();
 24 | 
 25 |   return aaSequence_;
 26 | }
 27 | 
 28 | void CodingSequence::changeNucleotide(int pos, Nucleotide value)
 29 | {
 30 |   // a small effort to avoid to avoid retranslation of the whole AA sequence.
 31 |   if (isDirty() && (dirty_ != D_COMPLETE))
 32 |     updateAASequence();
 33 | 
 34 |   ntSequence_[pos] = value;
 35 | 
 36 |   if (isDirty())
 37 |     dirty_ = D_COMPLETE;
 38 |   else
 39 |     dirty_ = pos;
 40 | }
 41 | 
 42 | int CodingSequence::whatIfMutation(int pos, Nucleotide value,
 43 | 				   AminoAcid& oldAA,
 44 | 				   AminoAcid& newAA) const
 45 | {
 46 |   if (isDirty())
 47 |     updateAASequence();  
 48 | 
 49 |   const int aaPos = pos / 3;
 50 |   const int codonPos = pos % 3;
 51 | 
 52 |   NTSequence newcodon(ntSequence_.begin() + aaPos * 3,
 53 | 		      ntSequence_.begin() + (aaPos * 3 + 3));
 54 |   newcodon[codonPos] = value;
 55 | 
 56 |   oldAA = aaSequence_[aaPos];
 57 |   newAA = Codon::translate(newcodon.begin());
 58 | 
 59 |   return aaPos;
 60 | }
 61 | 
 62 | bool CodingSequence::isSynonymousMutation(int pos, Nucleotide value) const
 63 | {
 64 |   AminoAcid oldAA, newAA;
 65 | 
 66 |   whatIfMutation(pos, value, oldAA, newAA);
 67 | 
 68 |   return (oldAA == newAA);
 69 | }
 70 | 
 71 | void CodingSequence::updateAASequence() const
 72 | {
 73 |   if (dirty_ == D_COMPLETE) {
 74 |     aaSequence_ = AASequence::translate(ntSequence_);
 75 |   } else {
 76 |     dirty_ /= 3;
 77 |     aaSequence_[dirty_]
 78 |       = Codon::translate(ntSequence_.begin() + (dirty_ * 3)); 
 79 |   }
 80 | 
 81 |   dirty_ = D_CLEAN;
 82 | }
 83 | 
 84 | void CodingSequence::allAASequences(std::vector<std::set<AminoAcid> >& result)
 85 |   const
 86 | {
 87 |   for (unsigned i = 0; i < ntSequence_.size(); i += 3) {
 88 |     result.push_back(Codon::translateAll(ntSequence_.begin() + i));
 89 |   }
 90 | };
 91 | 
 92 | extern void printAmbiguousAASequence(std::ostream& out,
 93 | 				     const CodingSequence& cs)
 94 | {
 95 |   std::vector<std::set<AminoAcid> > aas;
 96 |   cs.allAASequences(aas);
 97 | 
 98 |   for (unsigned i = 0; i < aas.size(); ++i) {
 99 |     if (aas[i].size() > 1)
100 |       out << "{";
101 |     for (std::set<AminoAcid>::const_iterator j = aas[i].begin();
102 | 	 j != aas[i].end(); ++j)
103 |       out << *j;
104 |     if (aas[i].size() > 1)
105 |       out << "}";
106 |   }
107 | }
108 | 
109 | }
110 | 


--------------------------------------------------------------------------------
/references/HIV/HIV-HXB2-env.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 |     <!-- HIV env ORF from HXB2 (NC_001802) -->
3 |     <orf name="HIV-HXB2-env" referenceSequence="atgagagtgaaggagaaatatcagcacttgtggagatgggggtggagatggggcaccatgctccttgggatgttgatgatctgtagtgctacagaaaaattgtgggtcacagtctattatggggtacctgtgtggaaggaagcaaccaccactctattttgtgcatcagatgctaaagcatatgatacagaggtacataatgtttgggccacacatgcctgtgtacccacagaccccaacccacaagaagtagtattggtaaatgtgacagaaaattttaacatgtggaaaaatgacatggtagaacagatgcatgaggatataatcagtttatgggatcaaagcctaaagccatgtgtaaaattaaccccactctgtgttagtttaaagtgcactgatttgaagaatgatactaataccaatagtagtagcgggagaatgataatggagaaaggagagataaaaaactgctctttcaatatcagcacaagcataagaggtaaggtgcagaaagaatatgcatttttttataaacttgatataataccaatagataatgatactaccagctataagttgacaagttgtaacacctcagtcattacacaggcctgtccaaaggtatcctttgagccaattcccatacattattgtgccccggctggttttgcgattctaaaatgtaataataagacgttcaatggaacaggaccatgtacaaatgtcagcacagtacaatgtacacatggaattaggccagtagtatcaactcaactgctgttaaatggcagtctagcagaagaagaggtagtaattagatctgtcaatttcacggacaatgctaaaaccataatagtacagctgaacacatctgtagaaattaattgtacaagacccaacaacaatacaagaaaaagaatccgtatccagagaggaccagggagagcatttgttacaataggaaaaataggaaatatgagacaagcacattgtaacattagtagagcaaaatggaataacactttaaaacagatagctagcaaattaagagaacaatttggaaataataaaacaataatctttaagcaatcctcaggaggggacccagaaattgtaacgcacagttttaattgtggaggggaatttttctactgtaattcaacacaactgtttaatagtacttggtttaatagtacttggagtactgaagggtcaaataacactgaaggaagtgacacaatcaccctcccatgcagaataaaacaaattataaacatgtggcagaaagtaggaaaagcaatgtatgcccctcccatcagtggacaaattagatgttcatcaaatattacagggctgctattaacaagagatggtggtaatagcaacaatgagtccgagatcttcagacctggaggaggagatatgagggacaattggagaagtgaattatataaatataaagtagtaaaaattgaaccattaggagtagcacccaccaaggcaaagagaagagtggtgcagagagaaaaaagagcagtgggaataggagctttgttccttgggttcttgggagcagcaggaagcactatgggcgcagcctcaatgacgctgacggtacaggccagacaattattgtctggtatagtgcagcagcagaacaatttgctgagggctattgaggcgcaacagcatctgttgcaactcacagtctggggcatcaagcagctccaggcaagaatcctggctgtggaaagatacctaaaggatcaacagctcctggggatttggggttgctctggaaaactcatttgcaccactgctgtgccttggaatgctagttggagtaataaatctctggaacagatttggaatcacacgacctggatggagtgggacagagaaattaacaattacacaagcttaatacactccttaattgaagaatcgc
aaaaccagcaagaaaagaatgaacaagaattattggaattagataaatgggcaagtttgtggaattggtttaacataacaaattggctgtggtatataaaattattcataatgatagtaggaggcttggtaggtttaagaatagtttttgctgtactttctatagtgaatagagttaggcagggatattcaccattatcgtttcagacccacctcccaaccccgaggggacccgacaggcccgaaggaatagaagaagaaggtggagagagagacagagacagatccattcgattagtgaacggatccttggcacttatctgggacgatctgcggagcctgtgcctcttcagctaccaccgcttgagagacttactcttgattgtaacgaggattgtggaacttctgggacgcagggggtgggaagccctcaaatattggtggaatctcctacagtattggagtcaggaactaaagaatagtgctgttagcttgctcaatgccacagccatagcagtagctgaggggacagatagggttatagaagtagtacaaggagcttgtagagctattcgccacatacctagaagaataagacagggcttggaaaggattttgctataa" >
4 |       <protein abbreviation="sig" startPosition="1" stopPosition="85" />
5 |       <protein abbreviation="gp120" startPosition="85" stopPosition="1534" />
6 |       <protein abbreviation="gp41" startPosition="1534" stopPosition="2572" />
7 |     </orf>
8 | 


--------------------------------------------------------------------------------
/src/libseq/CodingSequence.h:
--------------------------------------------------------------------------------
  1 | // This may look like C code, but it's really -*- C++ -*-
  2 | #ifndef CODING_SEQUENCE_H_
  3 | #define CODING_SEQUENCE_H_
  4 | 
  5 | #include <set>
  6 | #include <iostream>
  7 | 
  8 | #include "NTSequence.h"
  9 | #include "AASequence.h"
 10 | 
 11 | namespace seq {
 12 | 
 13 | /**
 14 |  * A coding sequence represents a nucleotide sequence that codes for
 15 |  * an amino acid sequence (an oligo- or polypeptide).
 16 |  *
 17 |  * It is useful when one wants to track the effect of changes in the
 18 |  * nucleotide sequence for the amino acid sequence, and to investigate
 19 |  * properties of nucleotide mutations.
 20 |  */
class CodingSequence
{
 public:
  /**
   * Construct a coding sequence with empty nucleotide sequence.
   */
  CodingSequence();

  /**
   * Construct a coding sequence based on the given nucleotide
   * sequence. The sequence must be translatable as per
   * AASequence::translate(const NTSequence&).
   */
  CodingSequence(const NTSequence& aNtSequence);

  /**
   * Get the nucleotide sequence.
   */
  const NTSequence& ntSequence() const { return ntSequence_; }

  /**
   * Get the amino acid sequence.
   *
   * If needed, the amino acid sequence is updated to reflect changes
   * in the nucleotide sequence.
   */
  const AASequence& aaSequence() const;

  /**
   * Change a nucleotide at a given position in the nucleotide sequence to
   * a new value.
   */
  void changeNucleotide(int pos, Nucleotide value);

  /**
   * Investigate the effect of a nucleotide mutation on the amino acid
   * sequence. This returns both the old (oldAA) and new amino acid (newAA)
   * encoded by the mutation, as well as the position (return value).
   *
   * The coding sequence itself is not changed.
   */
  int whatIfMutation(int pos, Nucleotide value,
		     AminoAcid& oldAA, AminoAcid& newAA) const;

  /**
   * Investigate whether a given nucleotide mutation is synonymous or
   * non-synonymous with respect to the amino acid sequence.
   */
  bool isSynonymousMutation(int pos, Nucleotide value) const;

  /**
   * Get the amino acid sequence possibilities, taking into account
   * all ambiguities. One set of amino acids per codon is appended to
   * result.
   */
  void allAASequences(std::vector<std::set<AminoAcid> >& result) const;

 protected:
  /**
   * Bring the cached amino acid sequence up to date with the nucleotide
   * sequence. Declared const because it only touches the mutable cache.
   */
  void updateAASequence() const;

 private:
  // The nucleotide sequence, and the amino acid sequence translated from it.
  // The latter is a cache that is updated lazily, hence mutable.
  NTSequence         ntSequence_;
  mutable AASequence aaSequence_;

  // True when the cached amino acid sequence needs (partial) retranslation.
  bool               isDirty() const { return dirty_ != D_CLEAN; }

  // State of the cache: D_CLEAN, D_COMPLETE, or the position of the single
  // nucleotide that changed since the last update.
  mutable int        dirty_;

  // dirty_ value: cache is up to date.
  static const int   D_CLEAN = -1;
  // dirty_ value: the whole amino acid sequence must be retranslated.
  static const int   D_COMPLETE = -2;
};
 89 | 
 90 |   /**
 91 |    * Write an amino acid sequence with all possible ambiguities
 92 |    * to the stream.
 93 |    *
 94 |    * The format is e.g. TW{LM}YS
 95 |    */
 96 | extern void printAmbiguousAASequence(std::ostream& out,
 97 | 				     const CodingSequence& cs);
 98 | 
 99 | };
100 | 
101 | #endif // CODING_SEQUENCE_H_
102 | 


--------------------------------------------------------------------------------
/src/libseq/AASequence.cpp:
--------------------------------------------------------------------------------
  1 | #include "AASequence.h"
  2 | #include "Codon.h"
  3 | 
  4 | namespace seq {
  5 | 
  6 | AASequence::AASequence()
  7 |   : std::vector<AminoAcid>()
  8 | { }
  9 | 
 10 | AASequence::AASequence(unsigned size)
 11 |   : std::vector<AminoAcid>(size)
 12 | { }
 13 | 
 14 | AASequence::AASequence(const const_iterator first,
 15 | 		       const const_iterator last)
 16 |   : std::vector<AminoAcid>(first, last)
 17 | { }
 18 | 
 19 | AASequence::AASequence(const std::string name,
 20 | 		       const std::string description,
 21 | 		       const std::string aSeqString)
 22 |   : std::vector<AminoAcid>(aSeqString.length()),
 23 |     name_(name),
 24 |     description_(description)
 25 | {
 26 |   for (unsigned i = 0; i < aSeqString.length(); ++i) {
 27 |     (*this)[i] = AminoAcid(aSeqString[i]);
 28 |   }
 29 | }
 30 | 
 31 | std::string AASequence::asString() const
 32 | {
 33 |   std::string result(size(), '-');
 34 | 
 35 |   for (unsigned i = 0; i < size(); ++i) {
 36 |     result[i] = (*this)[i].toChar();
 37 |   }
 38 | 
 39 |   return result;
 40 | }
 41 | 
 42 | inline bool contains(const std::set<AminoAcid>& possibilities, const AminoAcid& aa)
 43 | {
 44 |   return possibilities.find(aa) != possibilities.end();
 45 | }
 46 | 
 47 | AASequence AASequence::translate(const NTSequence::const_iterator begin,
 48 | 				 const NTSequence::const_iterator end)
 49 | {
 50 |   const int size = end - begin;
 51 |   assert(size % 3 == 0);
 52 | 
 53 |   AASequence result(size / 3);
 54 | 
 55 |   for (NTSequence::const_iterator i = begin; i < end; i += 3) {
 56 |     std::set<AminoAcid> possibilities = Codon::translateAll(i);
 57 |     if (possibilities.size() > 2) {
 58 |       result[(i - begin)/3] = AminoAcid::X;
 59 |     } else if (possibilities.size() == 2) {
 60 |       if (contains(possibilities,AminoAcid::D) && contains(possibilities,AminoAcid::N))
 61 |         result[(i - begin)/3] = AminoAcid::B;
 62 |       else if (contains(possibilities,AminoAcid::E) && contains(possibilities,AminoAcid::Q))
 63 |         result[(i - begin)/3] = AminoAcid::Z;
 64 |       else if (contains(possibilities,AminoAcid::L) && contains(possibilities,AminoAcid::I))
 65 |         result[(i - begin)/3] = AminoAcid::J;
 66 |       else
 67 |         result[(i - begin)/3] = AminoAcid::X;
 68 |     } else {
 69 |       result[(i - begin)/3] = *possibilities.begin();
 70 |     }
 71 |   }
 72 | 
 73 |   return result;
 74 | }
 75 | 
 76 | AASequence AASequence::translate(const NTSequence& ntSequence)
 77 | {
 78 |   return translate(ntSequence.begin(), ntSequence.end());
 79 | }
 80 | 
// FASTA helpers shared with the nucleotide sequence code; defined in
// NTSequence.cpp. readFastaEntry fills name, description and sequence from
// the stream; writeFastaEntry writes them to the stream.
extern void readFastaEntry(std::istream& i,
			   std::string& name,
			   std::string& description,
			   std::string& sequence);
extern void writeFastaEntry(std::ostream& o,
			    const std::string& name,
			    const std::string& description,
			    const std::string& sequence);
 90 | 
 91 | std::istream& operator>>(std::istream& i, AASequence& sequence)
 92 | {
 93 |   std::string name, description, seqString;
 94 | 
 95 |   readFastaEntry(i, name, description, seqString);
 96 |   sequence = AASequence(name, description, seqString);
 97 | 
 98 |   return i;
 99 | }
100 | 
101 | std::ostream& operator<<(std::ostream& o, const AASequence& sequence)
102 | {
103 |   writeFastaEntry(o, sequence.name(), sequence.description(),
104 | 		  sequence.asString());
105 | 
106 |   return o;
107 | }
108 | 
109 | };
110 | 


--------------------------------------------------------------------------------
/src/libseq/AminoAcid.cpp:
--------------------------------------------------------------------------------
 1 | #include "AminoAcid.h"
 2 | #include "ParseException.h"
 3 | 
 4 | namespace seq {
 5 | 
// One-letter code of every amino acid representation. The entries are in
// the same order as the entries of AA_TLA below.
// NOTE(review): presumably indexed by rep_ (like AA_TLA in tla()), so the
// order must match the AA_* constants declared in AminoAcid.h -- confirm.
const char AminoAcid::AA_CHAR[]
= { 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K',
    'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V',
    'W', 'Y', '*', '-', 'Z', 'U', 'B', 'X', 'J' };

// Three-letter abbreviation of every amino acid representation; indexed by
// rep_ in tla(). "STP" is the stop codon ('*'), "GAP" the gap ('-'); the
// ambiguity codes carry both alternatives (e.g. "Glu/Gln" for 'Z').
const char * const AminoAcid::AA_TLA[]
= { "Ala", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile", "Lys",
    "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr", "Val",
    "Trp", "Tyr", "STP", "GAP", "Glu/Gln", "Sec", "Asp/Asn", "Any",
    "Leu/Ile" };

// Predefined constant for every amino acid representation, each constructed
// from the corresponding AA_* value.
const AminoAcid AminoAcid::A(AminoAcid::AA_A);
const AminoAcid AminoAcid::C(AminoAcid::AA_C);
const AminoAcid AminoAcid::D(AminoAcid::AA_D);
const AminoAcid AminoAcid::E(AminoAcid::AA_E);
const AminoAcid AminoAcid::F(AminoAcid::AA_F);
const AminoAcid AminoAcid::G(AminoAcid::AA_G);
const AminoAcid AminoAcid::H(AminoAcid::AA_H);
const AminoAcid AminoAcid::I(AminoAcid::AA_I);
const AminoAcid AminoAcid::K(AminoAcid::AA_K);
const AminoAcid AminoAcid::L(AminoAcid::AA_L);
const AminoAcid AminoAcid::M(AminoAcid::AA_M);
const AminoAcid AminoAcid::N(AminoAcid::AA_N);
const AminoAcid AminoAcid::P(AminoAcid::AA_P);
const AminoAcid AminoAcid::Q(AminoAcid::AA_Q);
const AminoAcid AminoAcid::R(AminoAcid::AA_R);
const AminoAcid AminoAcid::S(AminoAcid::AA_S);
const AminoAcid AminoAcid::T(AminoAcid::AA_T);
const AminoAcid AminoAcid::V(AminoAcid::AA_V);
const AminoAcid AminoAcid::W(AminoAcid::AA_W);
const AminoAcid AminoAcid::Y(AminoAcid::AA_Y);
const AminoAcid AminoAcid::STP(AminoAcid::AA_STP);
const AminoAcid AminoAcid::GAP(AminoAcid::AA_GAP);
const AminoAcid AminoAcid::Z(AminoAcid::AA_Z);
const AminoAcid AminoAcid::U(AminoAcid::AA_U);
const AminoAcid AminoAcid::B(AminoAcid::AA_B);
const AminoAcid AminoAcid::X(AminoAcid::AA_X);
const AminoAcid AminoAcid::J(AminoAcid::AA_J);
44 | 
45 | AminoAcid::AminoAcid()
46 |   : rep_(AA_Z)
47 | { }
48 | 
49 | AminoAcid::AminoAcid(char c)
50 | {
51 |   switch (toupper(c)) {
52 |   case 'A': rep_ = AA_A; break;
53 |   case 'C': rep_ = AA_C; break;
54 |   case 'D': rep_ = AA_D; break;
55 |   case 'E': rep_ = AA_E; break;
56 |   case 'F': rep_ = AA_F; break;
57 |   case 'G': rep_ = AA_G; break;
58 |   case 'H': rep_ = AA_H; break;
59 |   case 'I': rep_ = AA_I; break;
60 |   case 'K': rep_ = AA_K; break;
61 |   case 'L': rep_ = AA_L; break;
62 |   case 'M': rep_ = AA_M; break;
63 |   case 'N': rep_ = AA_N; break;
64 |   case 'P': rep_ = AA_P; break;
65 |   case 'Q': rep_ = AA_Q; break;
66 |   case 'R': rep_ = AA_R; break;
67 |   case 'S': rep_ = AA_S; break;
68 |   case 'T': rep_ = AA_T; break;
69 |   case 'V': rep_ = AA_V; break;
70 |   case 'W': rep_ = AA_W; break;
71 |   case 'Y': rep_ = AA_Y; break;
72 |   case '*': rep_ = AA_STP; break;
73 |   case '-': rep_ = AA_GAP; break;
74 |   case 'Z': rep_ = AA_Z; break;
75 |   case 'U': rep_ = AA_U; break;
76 |   case 'B': rep_ = AA_B; break;
77 |   case 'X': rep_ = AA_X; break;
78 |   case 'J': rep_ = AA_J; break;
79 |   default:
80 |     throw ParseException
81 |       (std::string(),
82 |        std::string("Invalid amino acid character: '") + c + "'", false);
83 |   }
84 | }
85 | 
86 | std::string AminoAcid::tla() const
87 | {
88 |   return AA_TLA[rep_];
89 | }
90 | 
91 | std::ostream& operator<< (std::ostream& s, const AminoAcid aa)
92 | {
93 |   return s << aa.toChar();
94 | }
95 | 
96 | };
97 | 


--------------------------------------------------------------------------------
/references/HIV/HIV-HXB2-pol.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 |     <!-- HIV pol ORF from HXB2 (NC_001802) -->
3 |     <orf name="pol" description="polymerase" referenceSequence="ttttttagggaagatctggccttcctacaagggaaggccagggaattttcttcagagcagaccagagccaacagccccaccagaagagagcttcaggtctggggtagagacaacaactccccctcagaagcaggagccgatagacaaggaactgtatcctttaacttccctcaggtcactctttggcaacgacccctcgtcacaataaagataggggggcaactaaaggaagctctattagatacaggagcagatgatacagtattagaagaaatgagtttgccaggaagatggaaaccaaaaatgatagggggaattggaggttttatcaaagtaagacagtatgatcagatactcatagaaatctgtggacataaagctataggtacagtattagtaggacctacacctgtcaacataattggaagaaatctgttgactcagattggttgcactttaaattttcccattagccctattgagactgtaccagtaaaattaaagccaggaatggatggcccaaaagttaaacaatggccattgacagaagaaaaaataaaagcattagtagaaatttgtacagagatggaaaaggaagggaaaatttcaaaaattgggcctgaaaatccatacaatactccagtatttgccataaagaaaaaagacagtactaaatggagaaaattagtagatttcagagaacttaataagagaactcaagacttctgggaagttcaattaggaataccacatcccgcagggttaaaaaagaaaaaatcagtaacagtactggatgtgggtgatgcatatttttcagttcccttagatgaagacttcaggaagtatactgcatttaccatacctagtataaacaatgagacaccagggattagatatcagtacaatgtgcttccacagggatggaaaggatcaccagcaatattccaaagtagcatgacaaaaatcttagagccttttagaaaacaaaatccagacatagttatctatcaatacatggatgatttgtatgtaggatctgacttagaaatagggcagcatagaacaaaaatagaggagctgagacaacatctgttgaggtggggacttaccacaccagacaaaaaacatcagaaagaacctccattcctttggatgggttatgaactccatcctgataaatggacagtacagcctatagtgctgccagaaaaagacagctggactgtcaatgacatacagaagttagtggggaaattgaattgggcaagtcagatttacccagggattaaagtaaggcaattatgtaaactccttagaggaaccaaagcactaacagaagtaataccactaacagaagaagcagagctagaactggcagaaaacagagagattctaaaagaaccagtacatggagtgtattatgacccatcaaaagacttaatagcagaaatacagaagcaggggcaaggccaatggacatatcaaatttatcaagagccatttaaaaatctgaaaacaggaaaatatgcaagaatgaggggtgcccacactaatgatgtaaaacaattaacagaggcagtgcaaaaaataaccacagaaagcatagtaatatggggaaagactcctaaatttaaactgcccatacaaaaggaaacatgggaaacatggtggacagagtattggcaagccacctggattcctgagtgggagtttgttaatacccctcccttagtgaaattatggtaccagttagagaaagaacccatagtaggagcagaaaccttctatgtagatggggcagctaacagggagactaaattaggaaaagcaggatatgttactaatagaggaagacaaaaagttgtcaccctaactgacacaacaaatcagaagactgagttacaagcaatttatctagctttgcaggattcg
ggattagaagtaaacatagtaacagactcacaatatgcattaggaatcattcaagcacaaccagatcaaagtgaatcagagttagtcaatcaaataatagagcagttaataaaaaaggaaaaggtctatctggcatgggtaccagcacacaaaggaattggaggaaatgaacaagtagataaattagtcagtgctggaatcaggaaagtactatttttagatggaatagataaggcccaagatgaacatgagaaatatcacagtaattggagagcaatggctagtgattttaacctgccacctgtagtagcaaaagaaatagtagccagctgtgataaatgtcagctaaaaggagaagccatgcatggacaagtagactgtagtccaggaatatggcaactagattgtacacatttagaaggaaaagttatcctggtagcagttcatgtagccagtggatatatagaagcagaagttattccagcagaaacagggcaggaaacagcatattttcttttaaaattagcaggaagatggccagtaaaaacaatacatactgacaatggcagcaatttcaccggtgctacggttagggccgcctgttggtgggcgggaatcaagcaggaatttggaattccctacaatccccaaagtcaaggagtagtagaatctatgaataaagaattaaagaaaattataggacaggtaagagatcaggctgaacatcttaagacagcagtacaaatggcagtattcatccacaattttaaaagaaaaggggggattggggggtacagtgcaggggaaagaatagtagacataatagcaacagacatacaaactaaagaattacaaaaacaaattacaaaaattcaaaattttcgggtttattacagggacagcagaaatccactttggaaaggaccagcaaagctcctctggaaaggtgaaggggcagtagtaatacaagataatagtgacataaaagtagtgccaagaagaaaagcaaagatcattagggattatggaaaacagatggcaggtgatgattgtgtggcaagtagacaggatgaggattag" version="0">
4 |       <protein abbreviation="PR" startPosition="169" stopPosition="466" />
5 |       <protein abbreviation="RT" startPosition="466" stopPosition="1786" />
6 |       <protein abbreviation="p15" startPosition="1786" stopPosition="2146" />
7 |       <protein abbreviation="IN" startPosition="2146" stopPosition="3013" />
8 | </orf>
9 | 


--------------------------------------------------------------------------------
/src/ReferenceSequence.cpp:
--------------------------------------------------------------------------------
  1 | #include "ReferenceSequence.h"
  2 | 
  3 | #include "mxml-utils/MXMLUtils.h"
  4 | #include "mxml/mxml.h"
  5 | #include <cstdlib>
  6 | #include <fstream>
  7 | #include <stdexcept>
  8 | #include "Utils.h"
  9 | 
 10 | ReferenceSequence::ReferenceSequence(const seq::NTSequence& seq)
 11 |   // The NTSequence base is copy-constructed from seq; no further setup is
 12 |   // performed in the (empty) constructor body.
 13 |   : seq::NTSequence(seq)
 14 | {}
 15 | 
 16 | ReferenceSequence parseOrfReference(mxml_node_t* node)
 17 | {
 18 |   // Builds a ReferenceSequence from one ORF element: its "name" and
 19 |   // "referenceSequence" attributes give the sequence, and every <protein>
 20 |   // child adds one Region. Throws std::runtime_error when a protein has an
 21 |   // empty abbreviation or a missing / non-numeric start or stop position.
 22 |   std::string orf, refSeq;
 23 |   attributeValue(node, "name", orf);
 24 |   attributeValue(node, "referenceSequence", refSeq);
 25 |   std::vector<mxml_node_t *> proteins = childElements(node, "protein");
 26 |   ReferenceSequence seq(seq::NTSequence(orf, orf, refSeq));
 27 | 
 28 |   // Unsigned index type: size() is unsigned, so an int would mix signedness.
 29 |   for (std::vector<mxml_node_t *>::size_type k = 0; k < proteins.size(); k++) {
 30 |     std::string protein, start, end;
 31 |     attributeValue(proteins[k], "abbreviation", protein);
 32 |     attributeValue(proteins[k], "startPosition", start);
 33 |     attributeValue(proteins[k], "stopPosition", end);
 34 |     if (protein == "")
 35 |       throw std::runtime_error("protein abbreviation is invalid");
 36 | 
 37 |     // strtol, unlike atoi, reports whether any digits were consumed, so text
 38 |     // such as "abc" is rejected instead of silently becoming position 0.
 39 |     char *startTail = 0, *endTail = 0;
 40 |     const long startNt = strtol(start.c_str(), &startTail, 10);
 41 |     const long endNt = strtol(end.c_str(), &endTail, 10);
 42 |     if (startTail == start.c_str())
 43 |       throw std::runtime_error("protein start is invalid");
 44 |     if (endTail == end.c_str())
 45 |       throw std::runtime_error("protein end is invalid");
 46 |     // Subtract 1 and divide by 3 (presumably 1-based nucleotide -> codon index).
 47 |     int startPos = static_cast<int>((startNt - 1) / 3);
 48 |     int endPos = static_cast<int>((endNt - 1) / 3);
 49 |     ReferenceSequence::Region region(startPos, endPos, protein);
 50 |     seq.addRegion(region);
 51 |   }
 52 |   return seq;
 53 | }
 54 | 
 55 | ReferenceSequence ReferenceSequence::parseOrfReferenceFile(const std::string& fileName)
 56 | {
 57 |   // Parses the <orf> element of an XML file; throws std::runtime_error if the
 58 |   // file cannot be opened or loaded. The FILE and the XML tree are always freed.
 59 |   FILE *fp = fopen(fileName.c_str(), "r");
 60 |   if (!fp) throw std::runtime_error("Error parsing ORF reference file");
 61 |   mxml_node_t *top = mxmlNewElement(MXML_NO_PARENT, "top");
 62 |   mxml_node_t *first = mxmlLoadFile(top, fp, MXML_NO_CALLBACK);
 63 |   fclose(fp);  // loading is finished, so the handle is no longer needed
 64 |   try {
 65 |     if (!first) throw std::runtime_error("Error parsing ORF reference file");
 66 |     ReferenceSequence result = parseOrfReference(singleChildElement(top, "orf"));
 67 |     mxmlDelete(top);
 68 |     return result;
 69 |   } catch (...) { mxmlDelete(top); throw; }  // free the tree on failure too
 70 | }
 71 | 
 72 | std::map<std::string, std::vector<ReferenceSequence> >
 73 | ReferenceSequence::parseProteinReferences(std::string genomesXmlFile)
 74 | {
 75 |   // Maps the organismName of every <genome> under <genomes> to the references
 76 |   // parsed from its <openReadingFrame> children. A file that cannot be opened
 77 |   // or loaded yields an empty map; an empty organism name throws
 78 |   // std::runtime_error. The file handle and XML tree are released on all paths.
 79 |   std::map<std::string, std::vector<ReferenceSequence> > genomesMap;
 80 |   FILE *fp = fopen(genomesXmlFile.c_str(), "r");
 81 |   if (!fp) return genomesMap;
 82 |   mxml_node_t *top = mxmlNewElement(MXML_NO_PARENT, "top");
 83 |   mxml_node_t *first = mxmlLoadFile(top, fp, MXML_NO_CALLBACK);
 84 |   fclose(fp);  // loading is finished, so the handle is no longer needed
 85 |   try {
 86 |     if (first) {
 87 |       mxml_node_t *root = singleChildElement(top, "genomes");
 88 |       std::vector<mxml_node_t *> genomes = childElements(root, "genome");
 89 |       for (std::vector<mxml_node_t *>::size_type i = 0; i < genomes.size(); i++) {
 90 |         std::string organism;
 91 |         attributeValue(genomes[i], "organismName", organism);
 92 |         if (organism == "")
 93 |           throw std::runtime_error("organism name is invalid");
 94 |         std::vector<mxml_node_t *> orfs
 95 |           = childElements(genomes[i], "openReadingFrame");
 96 |         std::vector<ReferenceSequence> refs;
 97 |         for (std::vector<mxml_node_t *>::size_type j = 0; j < orfs.size(); j++)
 98 |           refs.push_back(parseOrfReference(orfs[j]));
 99 |         genomesMap[organism] = refs;
100 |       }
101 |     }
102 |   } catch (...) {
103 |     mxmlDelete(top);  // do not leak the XML tree when parsing throws
104 |     throw;
105 |   }
106 |   mxmlDelete(top);
107 |   return genomesMap;
108 | }
109 | 
110 | 


--------------------------------------------------------------------------------
/references/CHIKV/CHIKV-NC004162-gp2.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 |     <!-- CHIKV gp2 ORF from NC_004162 (7567-11313) -->
3 | 	<orf name="CHIKV_gp2" description="glycoprotein2" referenceSequence="atggagttcatcccaacccaaactttttacaacaggaggtaccagcctcgaccctggactccgcgccctactatccaagtcatcaggcccagaccgcgcccgcagaggcaagctgggcaacttgcccagctgatctcagcagttaataaactgacaatgcgcgcggtaccccaacagaagccacgcaagaatcggaagaataagaagcaaaagcaaaagcagcaggcgccacaaaacaacacaaaccaaaagaagcagccacctaaaaagaaaccagctcaaaagaaaaagaagccgggccgcagagagaggatgtgcatgaaaatcgaaaatgactgtattttcgaagtcaagcacgaaggtaaggtaacaggttacgcgtgcttggtgggggacaaagtaatgaaaccagcacacgtaaaggggaccatcgataacgcggacctggccaaattggcctttaagcggtcatctaagtacgaccttgaatgcgcgcagatacccgtgcacatgaagtccgacgcttcgaagttcacccatgagaaaccggaggggtactacaactggcaccacggagcagtacagtactcaggaggccggttcaccatccctacaggtgcgggcaaaccaggggacagcggtagaccgatcttcgacaacaagggacgcgtggtggccatagtcttaggaggagctaatgaaggagcccgtacagccctctcagtggtgacctggaataaagacattgtcactaaaatcacccctgagggagccgaagagtggagtcttgccatcccagttatgtgcctgttggcaaataccacgttcccctgctcccagcccccttgcataccctgctgctacgaaaaggaaccggaggaaaccctacgcatgcttgaggacaacgtcatgagacctgggtactatcagctgctacaagcatcattaacatgttctccccaccgccagcgacgcagcaccaaggacaacttcaatgtctataaagccacaagaccatacctagctcactgtcccgactgtggagaagggcactcgtgccatagtcccgtagcactagaacgcatcagaaatgaagcgacagacgggacgctgaaaatccaggtctccttgcaaattggaatagggacggatgatagccatgattggaccaagctgcgttacatggacaatcacataccagcagacgcagggagggccgggctatttgtaagaacatcagcaccatgcacgattactggaacaatgggacacttcatcctggcccgatgtccgaaaggagaaactctgacggtgggattcactgacagtaggaagattagtcactcatgtacgcacccatttcaccacgaccctcctgtgataggccgggaaaaattccattcccgaccgcagcacggtaaagagctaccttgcagcacgtacgtgcagagcaacgccgcaactgccgaggagatagaggtacacatgcccccagacacccctgatcgcacattgctgtcacaacagtccggcaacgtaaagatcacagtcaatagtcagacggtgcggtataagtgtaattgcggtggctcaaatgaaggactaataactacagataaagtgattaataactgcaaggttgatcaatgtcatgccgcggtcaccaatcacaaaaagtggcagtataactcccctctggtcccgcgtaacgctgaactcggggaccgaaaaggaaaaattcacatcccgtttccgctggcaaatgtaacatgcatggtgcctaaagcaaggaaccccaccgtgacgtacgggaaaaaccaagtcatcatgctactgtatcctgaccacccaacactcctgtcctaccggagtatgggagaagaaccaaactatcaagaagagtgggtgacgcacaagaaggaggtc
gtgctaaccgtgccgactgaagggctcgaggttacgtggggcaacaacgagccgtataagtattggccgcagttatctgcaaacggtacagcccacggccacccgcatgagataatcttgtactattatgagctgtaccctactatgactgtagtagttgtgtcagtggcctcgttcatactcctgtcgatggtgggtatggcagtggggatgtgcatgtgtgcacgacgcagatgcatcacaccatacgaactgacaccaggagctaccgtccctttcctgcttagcctaatatgctgcatcagaacagctaaagcggccacataccaagaggctgcggtatacctgtggaacgagcagcaacctttgttttggctacaagcccttattccgctggcagccctgattgtcctatgcaactgtctgagactcttaccatgctgttgtaaaacgttggcttttttagccgtaatgagcatcggtgcccacactgtgagcgcgtacgaacacgtaacagtgatcccgaacacggtgggagtaccgtataagactctagtcaacagaccgggctacagccccatggtactggagatggagctactgtcagtcactttggagccaacgctatcgcttgattacatcacgtgcgaatacaaaaccgtcatcccgtctccgtacgtgaaatgctgcggtacagcagagtgcaaggacaaaaacctacctgactacagctgtaaggtcttcaccggcgtctacccatttatgtggggcggcgcctactgcttctgcgacgctgaaaacacgcaattgagcgaagcacatgtggagaagtccgaatcatgcaaaacagaatttgcatcagcatacagggctcataccgcatccgcatcagctaagctccgcgtcctttaccaaggaaataacatcactgtaactgcctatgcaaacggcgaccatgccgtcacagttaaggacgccaaattcattgtggggccaatgtcttcagcctggacaccttttgacaacaaaatcgtggtgtacaaaggtgacgtttacaacatggactacccgccctttggcgcaggaagaccaggacaatttggcgatatccaaagtcgcacgcctgagagcaaagacgtctatgctaacacacaactggtactgcagagaccggctgcgggtacggtacacgtgccatactctcaggcaccatctggctttaagtattggttaaaagaacgaggggcgtcgctacagcacacagcaccatttggctgccaaatagcaacaaacccggtaagagcgatgaactgcgccgtagggaacatgcccatctccatcgacataccggatgcggccttcactagggtcgtcgacgcgccctctttaacggacatgtcatgcgaggtaccagcctgcacccattcctcagactttgggggcgtcgccattattaaatatgcagtcagcaagaaaggcaagtgtgcggtgcattcgatgaccaacgccgtcactatccgggaagctgagatagaagttgaagggaattctcagctgcaaatctctttctcgacggccttggccagcgccgaattccgcgtacaagtctgttctacacaagtacactgtgcagccgagtgccaccctccgaaggaccacatagtcaactacccggcgtcacataccaccctcggggtccaggacatttccgctacggcgatgtcatgggtgcagaagatcacgggaggtgtgggactggttgtcgctgttgcagcactgattctaatcgtggtgctatgcgtgtcgttcagcaggcactaa" >
4 |     </orf>
5 | 


--------------------------------------------------------------------------------
/src/libseq/CodonAlign.h:
--------------------------------------------------------------------------------
  1 | // This may look like C code, but it's really -*- C++ -*-
  2 | #ifndef CODON_ALIGN_H_
  3 | #define CODON_ALIGN_H_
  4 | 
  5 | #include <AlignmentAlgorithm.h>
  6 | 
  7 | /**
  8 |  * libseq namespace
  9 |  */
 10 | namespace seq {
 11 | /**
 12 |   * Thrown when alignment failed. Carries both alignment scores, the
 13 |   * nucleotide-aligned reference and target sequences, and a message. */
 14 | class AlignmentError : public std::exception
 15 | {
 16 | public:
 17 |   AlignmentError(double ntScore, double codonScore,
 18 | 		  const NTSequence& ntRef, const NTSequence& ntTarget,
 19 | 		  const std::string& message = std::string("Alignment error."));
 20 |   virtual ~AlignmentError() throw();
 21 | 
 22 |   /** %Nucleotide alignment score.
 23 |    */
 24 |   double nucleotideAlignmentScore() const { return ntScore_; }
 25 | 
 26 |   /** Codon-based alignment score.
 27 |    */
 28 |   double codonAlignmentScore() const { return codonScore_; }
 29 | 
 30 |   /** %Nucleotide aligned reference sequence
 31 |    */
 32 |   const NTSequence& nucleotideAlignedRef() const { return ntRef_; }
 33 | 
 34 |   /** %Nucleotide aligned target sequence
 35 |    */
 36 |   const NTSequence& nucleotideAlignedTarget() const { return ntTarget_; }
 37 | 
 38 |   /** Error message. This class declares no what() override, so the text
 39 |    *  is only available through this accessor. */
 40 |   const std::string& message() const{ return message_; }
 41 | 
 42 | private:
 43 |   std::string message_;
 44 |   double ntScore_, codonScore_;
 45 |   NTSequence ntRef_, ntTarget_;
 46 | };
 47 | 
 48 | /**
 49 |  * Error thrown by CodonAlign when apparent frame shifts cannot be corrected.
 50 |  * Takes the same scores and aligned sequences as AlignmentError, but no
 51 |  * message argument. Details in CodonAlign.
 52 |  */
 53 | class FrameShiftError : public AlignmentError
 54 | {
 55 | public:
 56 |   FrameShiftError(double ntScore, double codonScore,
 57 | 		  const NTSequence& ntRef, const NTSequence& ntTarget);
 58 |   ~FrameShiftError() throw();
 59 |   // Fixed description; it does not depend on the stored scores or sequences.
 60 |   const char *what() const throw() { return "Frameshift error"; }
 61 | };
 62 | 
 63 | 
 64 | class CodonAlign {
 65 | public:
 66 |   /**
 67 |    * Constructor; algorithm is presumably the pointer kept in algorithm_.
 68 |    */
 69 |   CodonAlign(AlignmentAlgorithm* algorithm);
 70 | 
 71 |  /**
 72 |  * Perform codon-based alignment of nucleotide sequences.
 73 |  *
 74 |  * Two nucleotide sequences are pair-wise aligned, but so that gaps are
 75 |  * at codon boundaries. Optionally, frameshifts may be detected and corrected.
 76 |  *
 77 |  * The reference sequence must be of length a multiple of 3, and is assumed
 78 |  * to represent an Open Reading Frame (ORF).
 79 |  *
 80 |  * The procedure translates the target sequence in the 3 ORFs,
 81 |  * and for each ORF performs an amino-acid alignment against the translated
 82 |  * reference sequence. The best alignment is used to create the nucleotide
 83 |  * alignment.
 84 |  *
 85 |  * Then, the score of the codon aligned nucleotide alignment is computed, and
 86 |  * compared with a direct nucleotide alignment of both nucleotide sequences.
 87 |  * The codon alignment is accepted only if the difference is smaller than 100.
 88 |  * Otherwise, if maxFrameShifts > 0, the frameshift is searched, corrected
 89 |  * by inserting 1 or 2 'N' symbols in the target sequence, and repeating the
 90 |  * codon alignment. This is repeated up to maxFrameShifts times.
 91 |  *
 92 |  * The result is a pair: first the nucleotide alignment score of the codon
 93 |  * alignment, second the number of frameshifts that have been corrected.
 94 |  * ref and target are non-const references (presumably updated in place).
 95 |  * @throws FrameShiftError when frameshifts could not be corrected, or
 96 |  *         the number of detected frameshifts exceeds maxFrameShifts.
 97 |  */
 98 |  std::pair<double, int>
 99 |  align(NTSequence& ref, NTSequence& target, int maxFrameShifts = 1);
100 | 
101 | private:
102 |   bool haveGaps(const NTSequence& seq, int from, int to);
103 |   double alignLikeAA(NTSequence& seq1, NTSequence& seq2, 
104 | 		     int ORF, 
105 | 		     const AASequence& seqAA1, const AASequence& seqAA2);
106 |   bool noGapAt(const NTSequence& seq, unsigned int i) const;
107 | 
108 |   AlignmentAlgorithm* algorithm_;
109 | };
110 | }
111 | 
112 | #endif // CODON_ALIGN_H_
113 | 


--------------------------------------------------------------------------------
/references/SARS-CoV-2/S.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!-- SARS-CoV-2 Spike (S) from NC_045512.2 (21563-25384) -->
3 | 	<orf name="S" description="spike glycoprotein" referenceSequence="ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATGTCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATAATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTGTTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATCATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTATGCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTCTAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGAGATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACTTTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACTTTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAACAAATGTGTCAATTTCAACTTCAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTCTGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGAGATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAACCAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTACTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTT
CAAACACGTGCAGGCTGTTTAATAGGGGCTGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACTCAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTGGTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTACCACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCAACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAATAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACCAATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAGCAAGAGGTCATTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATTGCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACCTTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGGACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTGGAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAAAATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCACAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATATCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAGTTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCTACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTATGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAAGAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTTTCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACAAATCATTACTACAGACAACACATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACCTGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTAGGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTGCCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCCATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGTATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACGACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACATAA" >
4 |          <protein abbreviation="S" startPosition="1" stopPosition="3821" />
5 | 	</orf>
6 | 


--------------------------------------------------------------------------------
/src/libseq/NTSequence.h:
--------------------------------------------------------------------------------
  1 | // This may look like C code, but it's really -*- C++ -*-
  2 | #ifndef NTSEQUENCE_H_
  3 | #define NTSEQUENCE_H_
  4 | 
  5 | #include <vector>
  6 | #include <string>
  7 | #include <iostream>
  8 | #include <set>
  9 | 
 10 | #include "ParseException.h"
 11 | #include "Nucleotide.h"
 12 | 
 13 | namespace seq {
 14 | 
 15 | /**
 16 |  * A nucleotide sequence.
 17 |  *
 18 |  * The sequence may have a name and a description.
 19 |  *
 20 |  * The sequence data is stored by publicly inheriting
 21 |  * std::vector<Nucleotide>, so you can use all
 22 |  * <a HREF="http://wwwasd.web.cern.ch/wwwasd/lhc++/RW/stdlibcr/vec_0251.htm">
 23 |  * std::vector</a> manipulations to access the nucleotide data.
 24 |  */
 25 | class NTSequence : public std::vector<Nucleotide>
 26 | {
 27 | public:
 28 |   /**
 29 |    * Create an empty nucleotide sequence with empty name
 30 |    * and empty description.
 31 |    */
 32 |   NTSequence();
 33 |  
 34 |   /**
 35 |    * Create a nucleotide sequence of length size, filled with
 36 |    * Nucleotide::N, with empty name and empty description.
 37 |    */
 38 |   NTSequence(unsigned size);
 39 | 
 40 |   /**
 41 |    * Create a nucleotide sequence with given name and description, and
 42 |    * with the given sequence string. Each character in the sequence
 43 |    * string will be interpreted as a Nucleotide using the
 44 |    * Nucleotide::Nucleotide(char) constructor.
 45 |    *
 46 |    * If sampleAmbiguities = true, then sampleAmbiguities() is
 47 |    * performed during construction.
 48 |    *
 49 |    * \sa sampleAmbiguities()
 50 |    */
 51 |   NTSequence(const std::string name,
 52 | 	     const std::string description,
 53 | 	     const std::string aSeqString,
 54 | 	     bool sampleAmbiguities = false);
 55 | 
 56 |   /**
 57 |    * Create a nucleotide sequence with empty name and empty
 58 |    * description, and copy the sequence data from the range [first, last[.
 59 |    */
 60 |   NTSequence(const const_iterator first,
 61 | 	     const const_iterator last);
 62 | 
 63 |   /**
 64 |    * Remove ambiguity nucleotide symbols by replacing them by sampling
 65 |    * a random non-ambiguous nucleotide that is represented by the
 66 |    * ambiguity symbol.
 67 |    *
 68 |    * \sa Nucleotide::sampleAmbiguity()
 69 |    */
 70 |   void sampleAmbiguities();
 71 |   // Returns a new sequence by value (const: this one is untouched); presumably its reverse complement.
 72 |   NTSequence reverseComplement() const;
 73 | 
 74 |   /**
 75 |    * Add all the possible non-ambiguous sequences possibly represented by
 76 |    * this sequence to result.
 77 |    */
 78 |   void nonAmbiguousSequences(std::vector<NTSequence>& result) const;
 79 | 
 80 |   /**
 81 |    * Represent the sequence data as a string.
 82 |    */
 83 |   std::string asString() const;
 84 | 
 85 |   /**
 86 |    * Get the name.
 87 |    */
 88 |   std::string name() const { return name_; }
 89 | 
 90 |   /**
 91 |    * Get the description.
 92 |    */
 93 |   std::string description() const { return description_; }
 94 | 
 95 |   /**
 96 |    * Set the name.
 97 |    */
 98 |   void setName(std::string name) { name_ = name; }
 99 | 
100 |   /**
101 |    * Set the description.
102 |    */
103 |   void setDescription(std::string description) { description_ = description; }
104 | 
105 | private:
106 |   std::string name_;
107 |   std::string description_;
108 |   // Private helper, presumably the recursion behind nonAmbiguousSequences() - confirm in NTSequence.cpp.
109 |   void iterateNonAmbiguous(const NTSequence& head,
110 | 			   std::vector<NTSequence>& result) const;
111 | };
112 | 
113 | /**
114 |  * Write a set of sequences to the stream o in Stockholm format.
115 |  * NOTE(review): length, labelsize, seqsize and pos are not documented here. */
116 | extern void writeStockholm(std::ostream& o,
117 |                            const std::vector<NTSequence>& sequences,
118 |                            int length=10000, int labelsize=0,
119 |                            int seqsize=0, int pos=0);
120 | 
121 | /**
122 |  * Read a nucleotide sequence in FASTA format from the given stream.
123 |  */
124 | extern std::istream& operator>>(std::istream& i, NTSequence& sequence);
125 | 
126 | /**
127 |  * Write a nucleotide sequence to the given stream in FASTA format.
128 |  */
129 | extern std::ostream& operator<<(std::ostream& o, const NTSequence& sequence);
130 | 
131 | };
132 | 
133 | #endif // NTSEQUENCE_H_
134 | 


--------------------------------------------------------------------------------
/src/libseq/AASequence.h:
--------------------------------------------------------------------------------
  1 | // This may look like C code, but it's really -*- C++ -*-
  2 | #ifndef AASEQUENCE_H_
  3 | #define AASEQUENCE_H_
  4 | 
  5 | #include <vector>
  6 | #include <string>
  7 | #include <iostream>
  8 | 
  9 | #include "NTSequence.h"
 10 | #include "AminoAcid.h"
 11 | 
 12 | namespace seq {
 13 | 
 14 | /**
 15 |  * An amino acid sequence.
 16 |  *
 17 |  * The sequence may have a name and a description.
 18 |  *
 19 |  * The sequence data is stored by publicly inheriting
 20 |  * std::vector<AminoAcid>, so you can use all
 21 |  * <a HREF="http://wwwasd.web.cern.ch/wwwasd/lhc++/RW/stdlibcr/vec_0251.htm">
 22 |  * std::vector</a> manipulations to access the amino acid data.
 23 |  */
 24 | class AASequence : public std::vector<AminoAcid>
 25 | {
 26 | public:
 27 |   /**
 28 |    * Create an empty amino acid sequence with empty name and empty
 29 |    * description.
 30 |    */
 31 |   AASequence();
 32 | 
 33 |   /**
 34 |    * Create an amino acid sequence of length size, filled with
 35 |    * AminoAcid::X, with empty name and empty description.
 36 |    */
 37 |   AASequence(unsigned size);
 38 | 
 39 |   /**
 40 |    * Create an amino acid sequence with given name and description, and
 41 |    * with the given sequence string. Each character in the sequence
 42 |    * string will be interpreted as an AminoAcid using the
 43 |    * AminoAcid::AminoAcid(char) constructor.
 44 |    */
 45 |   AASequence(const std::string name,
 46 | 	     const std::string description,
 47 | 	     const std::string aSeqString);
 48 | 
 49 |   /**
 50 |    * Create an amino acid sequence with empty name and empty
 51 |    * description, and copy the sequence data from the range [first, last[.
 52 |    */
 53 |   AASequence(const const_iterator first, const const_iterator last);
 54 | 
 55 |   /**
 56 |    * Represent the sequence data as a string.
 57 |    */
 58 |   std::string asString() const;
 59 | 
 60 |   /**
 61 |    * Get the name.
 62 |    */
 63 |   std::string name() const { return name_; }
 64 | 
 65 |   /**
 66 |    * Get the description.
 67 |    */
 68 |   std::string description() const { return description_; }
 69 | 
 70 |   /**
 71 |    * Set the name.
 72 |    */
 73 |   void setName(std::string name) { name_ = name; }
 74 | 
 75 |   /**
 76 |    * Set the description.
 77 |    */
 78 |   void setDescription(std::string description) { description_ = description; }
 79 | 
 80 |   /**
 81 |    * Translate a nucleotide sequence to an amino acid sequence. The
 82 |    * nucleotide sequence must have a length that is a multiple of
 83 |    * three.
 84 |    *
 85 |    * The resulting amino acid sequence will contain an amino acid for
 86 |    * every triplet of nucleotides in the nucleotide sequence.  The
 87 |    * amino acid sequence will have the same name and description as
 88 |    * the nucleotide sequence.
 89 |    *
 90 |    * \sa translate(const NTSequence::const_iterator, const NTSequence::const_iterator), Codon::translate(const NTSequence::const_iterator)
 91 |    */
 92 |   static AASequence translate(const NTSequence& ntSequence);
 93 | 
 94 |   /**
 95 |    * Translate a nucleotide sequence, defined by the range begin to
 96 |    * end, to an amino acid sequence. The nucleotide sequence must have a
 97 |    * length that is a multiple of three.
 98 |    *
 99 |    * The resulting amino acid sequence will contain an amino acid for
100 |    * every triplet of nucleotides in the nucleotide sequence, and will
101 |    * have an empty name and empty description.
102 |    *
103 |    * \sa translate(const NTSequence&), Codon::translate(const NTSequence::const_iterator)
104 |    */
105 |   static AASequence translate(const NTSequence::const_iterator begin,
106 | 			      const NTSequence::const_iterator end);
107 | private:
108 |   std::string name_;
109 |   std::string description_;
110 | };
111 | 
112 | /**
113 |  * Read an amino acid sequence in FASTA format from the given stream.
114 |  */
115 | extern std::istream& operator>>(std::istream& i, AASequence& sequence);
116 | 
117 | /**
118 |  * Write an amino acid sequence to the given stream in FASTA format.
119 |  */
120 | extern std::ostream& operator<<(std::ostream& o, const AASequence& sequence);
121 | 
122 | };
123 | 
124 | #endif // AASEQUENCE_H_
125 | 


--------------------------------------------------------------------------------
/src/libseq/AlignmentAlgorithm.cpp:
--------------------------------------------------------------------------------
 1 | #include "AlignmentAlgorithm.h"
 2 | 
 3 | namespace seq {
 4 | // Returns a 15x15 nucleotide scoring table (the IUB matrix, going by the name).
 5 | double** AlignmentAlgorithm::IUB()
 6 | {
 7 |   static double rowA[] = { 5,-4,-4,-4,1,1,1,-4,-4,-4,-1,-1,-1,-4,-2 };
 8 |   static double rowC[] = { -4,5,-4,-4,1,-4,-4,1,1,-4,-1,-1,-4,-1,-2 };
 9 |   static double rowG[] = { -4,-4,5,-4,-4,1,-4,1,-4,1,-1,-4,-1,-1,-2 };
10 |   static double rowT[] = { -4,-4,-4,5,-4,-4,1,-4,1,1,-4,-1,-1,-1,-2 };
11 |   static double rowM[] = { 1,1,-4,-4,-1,-2,-2,-2,-2,-4,-1,-1,-3,-3,-1 };
12 |   static double rowR[] = { 1,-4,1,-4,-2,-1,-2,-2,-4,-2,-1,-3,-1,-3,-1 };
13 |   static double rowW[] = { 1,-4,-4,1,-2,-2,-1,-4,-2,-2,-3,-1,-1,-3,-1 };
14 |   static double rowS[] = { -4,1,1,-4,-2,-2,-4,-1,-2,-2,-1,-3,-3,-1,-1 };
15 |   static double rowY[] = { -4,1,-4,1,-2,-4,-2,-2,-1,-2,-3,-1,-3,-1,-1 };
16 |   static double rowK[] = { -4,-4,1,1,-4,-2,-2,-2,-2,-1,-3,-3,-1,-1,-1 };
17 |   static double rowV[] = { -1,-1,-1,-4,-1,-1,-3,-1,-3,-3,-1,-2,-2,-2,-1 };
18 |   static double rowH[] = { -1,-1,-4,-1,-1,-3,-1,-3,-1,-3,-2,-1,-2,-2,-1 };
19 |   static double rowD[] = { -1,-4,-1,-1,-3,-1,-1,-3,-3,-1,-2,-2,-1,-2,-1 };
20 |   static double rowB[] = { -4,-1,-1,-1,-3,-3,-3,-1,-1,-1,-2,-2,-2,-1,-1 };
21 |   static double rowN[] = { -2,-2,-2,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 };
22 |   // Row order (and presumably column order): A C G T M R W S Y K V H D B N.
23 |   static double *iub[] = { rowA, rowC, rowG, rowT, rowM, rowR, rowW, rowS,
24 | 			   rowY, rowK, rowV, rowH, rowD, rowB, rowN };
25 |   // All storage is function-local static: the returned table outlives the call and is shared.
26 |   return iub;
27 | }
28 | // Returns a 26-row amino acid scoring table (BLOSUM30, going by the name) kept in static storage.
29 | double** AlignmentAlgorithm::BLOSUM30()
30 | {
31 |   static double rowA[] = { 4,-3,0,0,-2,0,-2,0,0,-1,1,0,-1,1,-1,1,1,1,-5,-4,-7,0,0,0,0,0 };
32 |   static double rowC[] = { -3,17,-3,1,-3,-4,-5,-2,-3,0,-2,-1,-3,-2,-2,-2,-2,-2,-2,-6,-7,0,0,0,-2,-2 };
33 |   static double rowD[] = { 0,-3,9,1,-5,-1,-2,-4,0,-1,-3,1,-1,-1,-1,0,-1,-2,-4,-1,-7,0,0,0,5,-1 };
34 |   static double rowE[] = { 0,1,1,6,-4,-2,0,-3,2,-1,-1,-1,1,2,-1,0,-2,-3,-1,-2,-7,0,5,0,0,-1 };
35 |   static double rowF[] = { -2,-3,-5,-4,10,-3,-3,0,-1,2,-2,-1,-4,-3,-1,-1,-2,1,1,3,-7,0,-4,0,-3,-1 };
36 |   static double rowG[] = { 0,-4,-1,-2,-3,8,-3,-1,-1,-2,-2,0,-1,-2,-2,0,-2,-3,1,-3,-7,0,-2,0,0,-1 };
37 |   static double rowH[] = { -2,-5,-2,0,-3,-3,14,-2,-2,-1,2,-1,1,0,-1,-1,-2,-3,-5,0,-7,0,0,0,-2,-1 };
38 |   static double rowI[] = { 0,-2,-4,-3,0,-1,-2,6,-2,2,1,0,-3,-2,-3,-1,0,4,-3,-1,-7,0,-3,0,-2,0 };
39 |   static double rowK[] = { 0,-3,0,2,-1,-1,-2,-2,4,-2,2,0,1,0,1,0,-1,-2,-2,-1,-7,0,1,0,0,0 };
40 |   static double rowL[] = { -1,0,-1,-1,2,-2,-1,2,-2,4,2,-2,-3,-2,-2,-2,0,1,-2,3,-7,0,-1,0,-1,0 };
41 |   static double rowM[] = { 1,-2,-3,-1,-2,-2,2,1,2,2,6,0,-4,-1,0,-2,0,0,-3,-1,-7,0,-1,0,-2,0 };
42 |   static double rowN[] = { 0,-1,1,-1,-1,0,-1,0,0,-2,0,8,-3,-1,-2,0,1,-2,-7,-4,-7,0,-1,0,4,0 };
43 |   static double rowP[] = { -1,-3,-1,1,-4,-1,1,-3,1,-3,-4,-3,11,0,-1,-1,0,-4,-3,-2,-7,0,0,0,-2,-1 };
44 |   static double rowQ[] = { 1,-2,-1,2,-3,-2,0,-2,0,-2,-1,-1,0,8,3,-1,0,-3,-1,-1,-7,0,4,0,-1,0 };
45 |   static double rowR[] = { -1,-2,-1,-1,-1,-2,-1,-3,1,-2,0,-2,-1,3,8,-1,-3,-1,0,0,-7,0,0,0,-2,-1 };
46 |   static double rowS[] = { 1,-2,0,0,-1,0,-1,-1,0,-2,-2,0,-1,-1,-1,4,2,-1,-3,-2,-7,0,-1,0,0,0 };
47 |   static double rowT[] = { 1,-2,-1,-2,-2,-2,-2,0,-1,0,0,1,0,0,-3,2,5,1,-5,-1,-7,0,-1,0,0,0 };
48 |   static double rowV[] = { 1,-2,-2,-3,1,-3,-3,4,-2,1,0,-2,-4,-3,-1,-1,1,5,-3,1,-7,0,-3,0,-2,0 };
49 |   static double rowW[] = { -5,-2,-4,-1,1,1,-5,-3,-2,-2,-3,-7,-3,-1,0,-3,-5,-3,20,5,-7,0,-1,0,-5,-2 };
50 |   static double rowY[] = { -4,-6,-1,-2,3,-3,0,-1,-1,3,-1,-4,-2,-1,0,-2,-1,1,5,9,-7,0,-2,0,-3,-1 };
51 |   static double rowSTP[] = { -7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,1,0,-7,0,-7,-7 };
52 |   static double rowGAP[] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
53 |   static double rowZ[] = { 0,0,0,5,-4,-2,0,-3,1,-1,-1,-1,0,4,0,-1,-1,-3,-1,-2,-7,0,4,0,0,0 };
54 |   static double rowU[] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
55 |   static double rowB[] = { 0,-2,5,0,-3,0,-2,-2,0,-1,-2,4,-2,-1,-2,0,0,-2,-5,-3,-7,0,0,0,5,-1 };
56 |   static double rowX[] = { 0,-2,-1,-1,-1,-1,-1,0,0,0,0,0,-1,0,-1,0,0,0,-2,-1,-7,0,0,0,-1,-1 };
57 |   // Row order below follows AminoAcid::AA_A .. AminoAcid::AA_X (internal representations 0..25).
58 |   static double *mat[] = { rowA, rowC, rowD, rowE, rowF, rowG, rowH, rowI,
59 | 			   rowK, rowL, rowM, rowN, rowP, rowQ, rowR, rowS,
60 | 			   rowT, rowV, rowW, rowY, rowSTP, rowGAP,
61 | 			   rowZ, rowU, rowB, rowX };
62 |   // NOTE(review): AminoAcid::AA_J (26) has no row here; indexing by J would be out of bounds - confirm callers never do.
63 |   return mat;
64 | }
65 | 
66 | };
67 | 


--------------------------------------------------------------------------------
/src/libseq/AminoAcid.h:
--------------------------------------------------------------------------------
  1 | // This may look like C code, but it's really -*- C++ -*-
  2 | #ifndef AMINO_ACID_H_
  3 | #define AMINO_ACID_H_
  4 | 
  5 | #include <assert.h>
  6 | #include <iostream>
  7 | 
  8 | #include "ParseException.h"
  9 | 
 10 | namespace seq {
 11 | 
 12 | /**
 13 |  * An amino acid.
 14 |  *
 15 |  * The amino acid is represented internally using an integer
 16 |  * representation. This may be helpful for e.g. indexing into an
 17 |  * array. Therefore, it is possible to both retrieve this internal
 18 |  * representation with intRep() and construct an AminoAcid from an
 19 |  * internal representation directly with fromRep(int).
 20 |  */
 21 | class AminoAcid {
 22 | public:
 23 |   /**
 24 |    * @name Constants used in the internal representation.
 25 |    * \sa intRep() and fromRep(int).
 26 |    */
 27 |   //@{
 28 |   static const int AA_A = 0;
 29 |   static const int AA_C = 1;
 30 |   static const int AA_D = 2;
 31 |   static const int AA_E = 3;
 32 |   static const int AA_F = 4;
 33 |   static const int AA_G = 5;
 34 |   static const int AA_H = 6;
 35 |   static const int AA_I = 7;
 36 |   static const int AA_K = 8;
 37 |   static const int AA_L = 9;
 38 |   static const int AA_M = 10;
 39 |   static const int AA_N = 11;
 40 |   static const int AA_P = 12;
 41 |   static const int AA_Q = 13;
 42 |   static const int AA_R = 14;
 43 |   static const int AA_S = 15;
 44 |   static const int AA_T = 16;
 45 |   static const int AA_V = 17;
 46 |   static const int AA_W = 18;
 47 |   static const int AA_Y = 19;
 48 |   static const int AA_STP = 20; // translation stop
 49 |   static const int AA_GAP = 21;
 50 |   static const int AA_Z = 22;   // glutamate (E) or glutamine (Q)
 51 |   static const int AA_U = 23;   // selenocysteine
 52 |   static const int AA_B = 24;   // asparatate (D) or asparagine (N)
 53 |   static const int AA_X = 25;   // any
 54 |   static const int AA_J = 26;   // leucine (L) or isoleucine (I)
 55 |   //@}
 56 | 
 57 |   /**
 58 |    * @name AminoAcid constants.
 59 |    */
 60 |   //@{
 61 |   static const AminoAcid A;
 62 |   static const AminoAcid C;
 63 |   static const AminoAcid D;
 64 |   static const AminoAcid E;
 65 |   static const AminoAcid F;
 66 |   static const AminoAcid G;
 67 |   static const AminoAcid H;
 68 |   static const AminoAcid I;
 69 |   static const AminoAcid K;
 70 |   static const AminoAcid L;
 71 |   static const AminoAcid M;
 72 |   static const AminoAcid N;
 73 |   static const AminoAcid P;
 74 |   static const AminoAcid Q;
 75 |   static const AminoAcid R;
 76 |   static const AminoAcid S;
 77 |   static const AminoAcid T;
 78 |   static const AminoAcid V;
 79 |   static const AminoAcid W;
 80 |   static const AminoAcid Y;
 81 |   static const AminoAcid STP;
 82 |   static const AminoAcid GAP;
 83 | 
 84 |   /* less common amino acids: */
 85 |   static const AminoAcid Z;
 86 |   static const AminoAcid U;
 87 |   static const AminoAcid B;
 88 |   static const AminoAcid X;
 89 |   static const AminoAcid J;
 90 |   //@}
 91 | 
 92 |   /**
 93 |    * Create an amino acid with value AminoAcid::X (any).
 94 |    */
 95 |   AminoAcid();
 96 | 
 97 |   /**
 98 |    * Create an amino acid by parsing a character.
 99 |    * Accepted are the characters from the FASTA file definition.
100 |    *
101 |    * \sa toChar()
102 |    */
103 |   AminoAcid(char c);
104 | 
105 |   /**
106 |    * Create an amino acid using the internal representation directly.
107 |    * Only valid representations are accepted, see the AA_* constants.
108 |    * Illegal representations are fenced off by an assert() statement.
109 |    *
110 |    * \sa intRep()
111 |    */
112 |   static AminoAcid fromRep(int rep) {
113 |     assert(rep >= 0 && rep <= AA_X);
114 | 
115 |     return AminoAcid(rep);
116 |   }
117 | 
118 |   /**
119 |    * Get the uppercase character representation for this amino acid.
120 |    *
121 |    * \sa AminoAcid(char)
122 |    */
123 |   char toChar() const {
124 |     return AA_CHAR[rep_];
125 |   }
126 | 
127 |   /**
128 |    * Get the three letter abbreviation for this amino acid.
129 |    *
130 |    * Note that AminoAcid::B and AminoAcid::Z are combinations of two
131 |    * amino acids, and represented as "One/Two".
132 |    *
133 |    * \sa toChar()
134 |    */
135 |   std::string tla() const;
136 | 
137 |   /**
138 |    * Get the internal representation.
139 |    *
140 |    * \sa fromRep(int)
141 |    */
142 |   int intRep() const {
143 |     return rep_;
144 |   }
145 | 
146 |   /**
147 |    * Are two amino acids identical ?
148 |    */
149 |   bool operator== (const AminoAcid other) const {
150 |     return other.rep_ == rep_;
151 |   }
152 | 
153 |   /**
154 |    * Are two amino acids different ?
155 |    */
156 |   bool operator!= (const AminoAcid other) const {
157 |     return !(*this == other);
158 |   }
159 | 
160 |   /**
161 |    * So that you can use it as a key for STL containers.
162 |    */
163 |   bool operator< (const AminoAcid other) const { return rep_ < other.rep_; }
164 | 
165 | private:
166 |   static const char AA_CHAR[];
167 |   static const char * const AA_TLA[];
168 | 
169 |   explicit AminoAcid(int rep)
170 |     : rep_(rep) {
171 |   }
172 | 
173 |   short int rep_;
174 | };
175 | 
176 | /**
177 |  * Write the one-letter representation of the amino acid to the stream.
178 |  */
179 | extern std::ostream& operator<< (std::ostream& o, const AminoAcid aa);
180 | 
181 | };
182 | 
183 | #endif // AMINO_ACID_H_
184 | 


--------------------------------------------------------------------------------
/src/libseq/Nucleotide.h:
--------------------------------------------------------------------------------
  1 | // This may look like C code, but it's really -*- C++ -*-
  2 | #ifndef NUCLEOTIDE_H_
  3 | #define NUCLEOTIDE_H_
  4 | 
  5 | #include <assert.h>
  6 | #include <ctype.h>
  7 | #include <iostream>
  8 | #include <vector>
  9 | #include <set>
 10 | 
 11 | #include "ParseException.h"
 12 | 
 13 | namespace seq {
 14 |   
 15 | /**
 16 |  * A nucleotide, including support for ambiguity codes.
 17 |  *
 18 |  * The nucleotide is represented internally using an integer
 19 |  * representation. This may be helpful for e.g. indexing into a
 20 |  * table. Therefore, it is possible to both retrieve this internal
 21 |  * representation with intRep() and construct a Nucleotide from an
 22 |  * internal representation directly with fromRep(int).
 23 |  */
 24 | class Nucleotide {
 25 | public:
 26 |   /**
 27 |    * @name Constants used in the internal representation.
 28 |    * \sa intRep() and fromRep(int).
 29 |    */
 30 |   //@{
 31 |   static const int NT_A = 0;
 32 |   static const int NT_C = 1;
 33 |   static const int NT_G = 2;
 34 |   static const int NT_T = 3;
 35 |   static const int NT_M = 4;
 36 |   static const int NT_R = 5;
 37 |   static const int NT_W = 6;
 38 |   static const int NT_S = 7;
 39 |   static const int NT_Y = 8;
 40 |   static const int NT_K = 9;
 41 |   static const int NT_V = 10;
 42 |   static const int NT_H = 11;
 43 |   static const int NT_D = 12;
 44 |   static const int NT_B = 13;
 45 |   static const int NT_N = 14;
 46 |   static const int NT_GAP = 15;
 47 |   //@}
 48 | 
 49 |   /**
 50 |    * @name Nucleotide constants.
 51 |    */
 52 |   //@{
 53 |   static const Nucleotide A;
 54 |   static const Nucleotide C;
 55 |   static const Nucleotide G;
 56 |   static const Nucleotide T;
 57 |   static const Nucleotide M;
 58 |   static const Nucleotide R;
 59 |   static const Nucleotide W;
 60 |   static const Nucleotide S;
 61 |   static const Nucleotide Y;
 62 |   static const Nucleotide K;
 63 |   static const Nucleotide V;
 64 |   static const Nucleotide H;
 65 |   static const Nucleotide D;
 66 |   static const Nucleotide B;
 67 |   static const Nucleotide N;
 68 |   static const Nucleotide GAP;
 69 |   //@}
 70 | 
 71 |   /**
 72 |    * Create a nucleotide with value Nucleotide::N (any).
 73 |    */
 74 |   Nucleotide();
 75 | 
 76 |   /**
 77 |    * Create a nucleotide by parsing a character.
 78 |    * Accepted are the characters from the FASTA file definition.
 79 |    *
 80 |    * \sa toChar()
 81 |    */
 82 |   Nucleotide(char c);
 83 | 
 84 |   /**
 85 |    * Create a nucleotide using the internal representation directly.
 86 |    * Only valid representations are accepted, see the NT_* constants.
 87 |    * Illegal representations are fenced off by an assert() statement.
 88 |    *
 89 |    * \sa intRep()
 90 |    */
 91 |   static Nucleotide fromRep(int rep) {
 92 |     assert(rep >= 0 && rep <= NT_GAP);
 93 | 
 94 |     return Nucleotide(rep);
 95 |   }
 96 | 
 97 |   /**
 98 |    * Get the uppercase character representation for this nucleotide.
 99 |    *
100 |    * \sa Nucleotide(char)
101 |    */
102 |   char toChar() const {
103 |     return NT_CHAR[rep_];
104 |   }
105 | 
106 |   /**
107 |    * Get the internal representation.
108 |    *
109 |    * \sa fromRep(int)
110 |    */
111 |   int intRep() const {
112 |     return rep_;
113 |   }
114 | 
115 |   /**
116 |    * Are two nucleotides identical ?
117 |    */
118 |   bool operator== (const Nucleotide& other) const {
119 |     return other.rep_ == rep_;
120 |   }
121 | 
122 |   /**
123 |    * Are two nucleotides different ?
124 |    */
125 |   bool operator!= (const Nucleotide& other) const {
126 |     return !(*this == other);
127 |   }
128 | 
129 |   /**
130 |    * Is the nucleotide ambiguous ? Only A,C,G,T are considered non-ambiguous.
131 |    *
132 |    * \sa sampleAmbiguity()
133 |    */
134 |   bool isAmbiguity() const { return rep_ > NT_T; }
135 | 
136 |   /**
137 |    * Replace the (ambiguos) nucleotide with a random non-ambigiuos nucleotide
138 |    * that is represented by the ambiguity symbol.
139 |    *
140 |    * \sa isAmbiguity()
141 |    */
142 |   void sampleAmbiguity();
143 | 
144 |   Nucleotide reverseComplement() const;
145 | 
146 |   /**
147 |    * Get all non ambiguous nucleotides represented by this nucleotide.
148 |    */ 
149 |   void nonAmbiguousNucleotides(std::vector<Nucleotide>& result) const;
150 | 	
151 |   /**
152 |    * Get the single nucleotide representing all given nucleotides.
153 |    */
154 |   static Nucleotide singleNucleotide(std::set<Nucleotide>& nucleotides);
155 | 
156 |   /**
157 |    * So that you can use it as a key for STL containers.
158 |    */
159 |   bool operator< (const Nucleotide other) const { return rep_ < other.rep_; }
160 | 
161 | private:
162 |   static const char NT_CHAR[];
163 | 
164 |   Nucleotide(int rep)
165 |     : rep_(rep) {
166 |   }
167 | 
168 |   short int rep_;
169 | };
170 | 
171 | /**
172 |  * Write the character representation of the nucleotide.
173 |  */
174 | extern std::ostream& operator<< (std::ostream& o, const Nucleotide nt);
175 | 
176 | inline Nucleotide::Nucleotide(char c)
177 | {
178 |   switch (toupper(c)) {
179 |   case 'A': rep_ = NT_A; break;
180 |   case 'C': rep_ = NT_C; break;
181 |   case 'G': rep_ = NT_G; break;
182 |   case 'T': case 'U': rep_ = NT_T; break;
183 |   case 'M': rep_ = NT_M; break;
184 |   case 'R': rep_ = NT_R; break;
185 |   case 'W': rep_ = NT_W; break;
186 |   case 'S': rep_ = NT_S; break;
187 |   case 'Y': rep_ = NT_Y; break;
188 |   case 'K': rep_ = NT_K; break;
189 |   case 'V': rep_ = NT_V; break;
190 |   case 'H': rep_ = NT_H; break;
191 |   case 'D': rep_ = NT_D; break;
192 |   case 'B': rep_ = NT_B; break;
193 |   case 'N': rep_ = NT_N; break;
194 |   case '-': rep_ = NT_GAP; break;
195 |   default:
196 |     throw ParseException
197 |       (std::string(),
198 |        std::string("Invalid nucleotide character: '") + c + "'", false);
199 |   }
200 | }
201 | 
202 | };
203 | 
204 | #endif // NUCLEOTIDE_H_
205 | 


--------------------------------------------------------------------------------
/src/libseq/NeedlemanWunsh.cpp:
--------------------------------------------------------------------------------
  1 | #include "NeedlemanWunsh.h"
  2 | 
  3 | #include <algorithm>
  4 | 
  5 | namespace seq {
  6 | 
  7 | NeedlemanWunsh::NeedlemanWunsh(double gapOpenScore,
  8 | 			       double gapExtensionScore,
  9 | 			       double **ntWeightMatrix,
 10 | 			       double **aaWeightMatrix)
 11 | {
 12 |   gapOpenScore_ = gapOpenScore;
 13 |   gapExtensionScore_ = gapExtensionScore;
 14 |   ntWeightMatrix_ = ntWeightMatrix;
 15 |   aaWeightMatrix_ = aaWeightMatrix;
 16 | }
 17 | 
 18 | /*
 19 |  * A straight-forward implementation of Neeldeman-Wunsh algorithm
 20 |  * for a pairwise global alignment, with the difference that a
 21 |  * gapOpenScore is not added at the beginning or end of the sequence
 22 |  * (like ClustalW does).
 23 |  */
 24 | template <typename Symbol>
 25 | double NeedlemanWunsh::needlemanWunshAlign(std::vector<Symbol>& seq1,
 26 | 					   std::vector<Symbol>& seq2,
 27 | 					   double** weightMatrix)
 28 | {
 29 |   /*
 30 |    * Remove gaps, and warn that we did.
 31 |    */
 32 |   bool foundGaps = false;
 33 |   for (unsigned i = 0; i < seq1.size(); ++i) {
 34 |     if (seq1[i] == Symbol::GAP) {
 35 |       if (!foundGaps) {
 36 | 	std::cerr << "Warning: NeedlemanWunsh: sequence contained gaps? "
 37 | 	             "Removed them." << std::endl;
 38 | 	foundGaps = true;
 39 |       }
 40 |       seq1.erase(seq1.begin() + i);
 41 |       --i;
 42 |     }
 43 |   }
 44 | 
 45 |   for (unsigned i = 0; i < seq2.size(); ++i) {
 46 |     if (seq2[i] == Symbol::GAP) {
 47 |       if (!foundGaps) {
 48 | 	std::cerr << "Warning: NeedlemanWunsh: sequence contained gaps? "
 49 | 	             "Removed them." << std::endl;
 50 | 	foundGaps = true;
 51 |       }
 52 |       seq2.erase(seq2.begin() + i);
 53 |       --i;
 54 |     }
 55 |   }
 56 | 
 57 |   const int seq1Size = seq1.size();
 58 |   const int seq2Size = seq2.size();
 59 | 
 60 |   double **dnTable = new double* [seq1Size+1];
 61 |   for (unsigned i = 0; i < seq1Size+1; ++i)
 62 |     dnTable[i] = new double[seq2Size+1];
 63 |   int    **gapsLengthTable = new int *[seq1Size+1];
 64 |   for (unsigned i = 0; i < seq1Size+1; ++i)
 65 |     gapsLengthTable[i] = new int[seq2Size+1]; // >0: horiz, <0: vert
 66 | 
 67 |   double edgeGapExtensionScore = 0;
 68 | 
 69 |   /*
 70 |    * compute table
 71 |    */
 72 |   dnTable[0][0] = 0;
 73 |   gapsLengthTable[0][0] = 0;
 74 |   for (unsigned i = 1; i < seq1Size+1; ++i) {
 75 |     dnTable[i][0] = dnTable[i-1][0] + edgeGapExtensionScore;
 76 |     gapsLengthTable[i][0] = gapsLengthTable[i-1][0] + 1;
 77 |   }
 78 |   for (unsigned j = 1; j < seq2Size+1; ++j) {
 79 |     dnTable[0][j] = dnTable[0][j-1] + edgeGapExtensionScore;
 80 |     gapsLengthTable[0][j] = gapsLengthTable[0][j-1] - 1;
 81 |   }
 82 | 
 83 |   for (unsigned i = 1; i < seq1Size+1; ++i) {
 84 |     for (unsigned j = 1; j < seq2Size+1; ++j) {
 85 | 
 86 |       double sextend
 87 | 	= dnTable[i-1][j-1]
 88 | 	+ weightMatrix[seq1[i-1].intRep()][seq2[j-1].intRep()];
 89 | 
 90 |       double ges = (j == seq2Size) ? edgeGapExtensionScore : gapExtensionScore_;
 91 | 
 92 |       double horizGapScore = ((gapsLengthTable[i-1][j] > 0) || (j == seq2Size)
 93 | 			      ? ges : gapOpenScore_ + ges);
 94 |       double sgaphoriz
 95 | 	= dnTable[i-1][j] + horizGapScore;
 96 | 
 97 |       ges = (i == seq1Size) ? edgeGapExtensionScore : gapExtensionScore_;
 98 | 
 99 |       double vertGapScore = (gapsLengthTable[i][j-1] < 0 || (i == seq1Size)
100 | 			     ? ges : gapOpenScore_ + ges);
101 |       double sgapvert
102 | 	= dnTable[i][j-1] + vertGapScore;
103 | 
104 |       if ((sextend >= sgaphoriz) && (sextend >= sgapvert)) {
105 | 	dnTable[i][j] = sextend;
106 | 	gapsLengthTable[i][j] = 0;
107 |       } else {
108 | 	if (sgaphoriz > sgapvert) {
109 | 	  dnTable[i][j] = sgaphoriz;
110 | 	  gapsLengthTable[i][j] = std::max(0, gapsLengthTable[i-1][j]) + 1;
111 | 	} else {
112 | 	  dnTable[i][j] = sgapvert;
113 | 	  gapsLengthTable[i][j] = std::min(0, gapsLengthTable[i][j-1]) - 1;
114 | 	}
115 |       }
116 |     }
117 |   }
118 | 
119 |   /*
120 |    * reconstruct best solution alignment.
121 |    */
122 |   int i = seq1Size+1, j = seq2Size+1;
123 |   do {
124 |     if (gapsLengthTable[i-1][j-1] == 0) {
125 |       --i; --j;
126 |     } else if (gapsLengthTable[i-1][j-1] > 0) {
127 |       --i;
128 |       seq2.insert(seq2.begin() + (j-1), Symbol::GAP);
129 |     } else {
130 |       --j;
131 |       seq1.insert(seq1.begin() + (i-1), Symbol::GAP);
132 |     }
133 |   } while (i > 1 || j > 1);
134 | 
135 |   double score = dnTable[seq1Size][seq2Size];
136 | 
137 |   for (unsigned i = 0; i < seq1Size+1; ++i) {
138 |     delete[] dnTable[i];
139 |     delete[] gapsLengthTable[i];
140 |   }
141 |   delete[] dnTable;
142 |   delete[] gapsLengthTable;
143 | 
144 |   return score;
145 | }
146 |   
147 | double NeedlemanWunsh::align(NTSequence& seq1, NTSequence& seq2)
148 | {
149 |   return needlemanWunshAlign(seq1, seq2, ntWeightMatrix_);
150 | }
151 | 
152 | double NeedlemanWunsh::align(AASequence& seq1, AASequence& seq2)
153 | {
154 |   return needlemanWunshAlign(seq1, seq2, aaWeightMatrix_);
155 | }
156 | 
157 | double NeedlemanWunsh::computeAlignScore(const NTSequence& seq1, 
158 | 					 const NTSequence& seq2)
159 | {
160 |   double score = 0;
161 |   int seq1GapLength = 0;
162 |   int seq2GapLength = 0;
163 | 
164 |   bool seq1LeadingGap = true;
165 |   bool seq2LeadingGap = true;
166 | 
167 |   double edgeGapExtensionScore = 0;
168 |   
169 |   for (unsigned i = 0; i < seq1.size(); ++i) {
170 |     if (seq1[i] == Nucleotide::GAP) {
171 |       ++seq1GapLength;
172 |     } else {
173 |       if (seq1GapLength) {
174 | 	    if (seq1LeadingGap)
175 | 	        score += seq1GapLength * edgeGapExtensionScore;
176 | 	    else
177 | 	    score += gapOpenScore_ + seq1GapLength * gapExtensionScore_;
178 |       }
179 |       seq1GapLength = 0;
180 | 
181 |       if (seq2[i] == Nucleotide::GAP) {
182 | 	++seq2GapLength;
183 |       } else {
184 | 	if (seq2GapLength) {
185 | 	  if (seq2LeadingGap)
186 | 	    score += seq2GapLength * edgeGapExtensionScore;
187 | 	  else
188 | 	    score += gapOpenScore_ + seq2GapLength * gapExtensionScore_;
189 |     }
190 | 	seq2GapLength = 0;
191 | 
192 | 	score += ntWeightMatrix_[seq1[i].intRep()][seq2[i].intRep()];
193 |       }
194 |     }
195 |   }
196 | 
197 |   score += seq1GapLength * edgeGapExtensionScore;
198 |   score += seq2GapLength * edgeGapExtensionScore;
199 | 
200 |   return score;
201 | }
202 | 
203 | }
204 | 


--------------------------------------------------------------------------------
/src/libseq/NTSequence.cpp:
--------------------------------------------------------------------------------
  1 | #include <fstream>
  2 | #include <algorithm>
  3 | 
  4 | #include "NTSequence.h"
  5 | #include "ParseException.h"
  6 | 
  7 | namespace seq {
  8 | 
  9 | NTSequence::NTSequence()
 10 |   : std::vector<Nucleotide>()
 11 | { }
 12 | 
 13 | NTSequence::NTSequence(unsigned size)
 14 |   : std::vector<Nucleotide>(size)
 15 | { }
 16 | 
 17 | NTSequence::NTSequence(const std::string name, const std::string description,
 18 | 		       const std::string aSeqString,
 19 | 		       bool sampleAmbiguities)
 20 |   : std::vector<Nucleotide>(aSeqString.length()),
 21 |     name_(name),
 22 |     description_(description)
 23 | {
 24 |   for (unsigned i = 0; i < aSeqString.length(); ++i) {
 25 |     try {
 26 |       Nucleotide nt(aSeqString[i]);
 27 |       if (sampleAmbiguities)
 28 | 	nt.sampleAmbiguity();
 29 | 
 30 |       (*this)[i] = nt;
 31 |     } catch (ParseException& e) {
 32 |       throw ParseException(name, e.message(), e.recovered());
 33 |     }
 34 |   }
 35 | }
 36 | 
 37 | NTSequence::NTSequence(const const_iterator first,
 38 | 		       const const_iterator last)
 39 |   : std::vector<Nucleotide>(first, last)
 40 | { }
 41 | 
 42 | void NTSequence::sampleAmbiguities()
 43 | {
 44 |   for (unsigned i = 0; i < size(); ++i) {
 45 |     (*this)[i].sampleAmbiguity();
 46 |   }
 47 | }
 48 | 
 49 | NTSequence NTSequence::reverseComplement() const
 50 | {
 51 |   NTSequence result(size());
 52 |   result.name_ = name_;
 53 |   result.description_ = description_;
 54 | 
 55 |   for (unsigned i = 0; i < size(); ++i)
 56 |     result[size() - i - 1] = (*this)[i].reverseComplement();
 57 | 
 58 |   return result;
 59 | }
 60 | 
 61 | void NTSequence::nonAmbiguousSequences(std::vector<NTSequence>& result) const
 62 | {
 63 |   iterateNonAmbiguous(NTSequence(), result);
 64 | }
 65 | 
 66 | void NTSequence::iterateNonAmbiguous(const NTSequence& head,
 67 | 				     std::vector<NTSequence>& result) const
 68 | {
 69 |   /*
 70 |    * find the next ambigous codon (if any)
 71 |    */
 72 |   NTSequence s = head;
 73 |   unsigned i = head.size();
 74 |   for (; i < size(); ++i)
 75 |     if ((*this)[i].isAmbiguity())
 76 |       break;
 77 |     else
 78 |       s.push_back((*this)[i]);
 79 | 
 80 |   if (i == size())
 81 |     result.push_back(s);
 82 |   else {
 83 |     std::vector<Nucleotide> ambiguities;
 84 |     (*this)[i].nonAmbiguousNucleotides(ambiguities);
 85 |     for (unsigned i = 0; i < ambiguities.size(); ++i) {
 86 |       s.push_back(ambiguities[i]);
 87 |       iterateNonAmbiguous(s, result);
 88 |       s.pop_back();
 89 |     }
 90 |   }
 91 | }
 92 | 
 93 | std::string NTSequence::asString() const
 94 | {
 95 |   std::string result(size(), '-');
 96 | 
 97 |   for (unsigned i = 0; i < size(); ++i) {
 98 |     result[i] = (*this)[i].toChar();
 99 |   }
100 | 
101 |   return result;
102 | }
103 | 
104 | /// \cond
105 | 
106 | void readFastaEntry(std::istream& i,
107 | 		    std::string& name,
108 | 		    std::string& description,
109 | 		    std::string& sequence)
110 | {
111 |     char ch;
112 |     char c[512];
113 | 
114 |     i.getline(c, 511);
115 |     if (i) {
116 |       if (c[0] != '>') {
117 | 	throw ParseException(std::string(),
118 | 			     std::string("FASTA file expected '>', got: '")
119 | 			     + c[0] + "'", false);
120 |       }
121 | 
122 |       std::string nameDesc = c + 1;
123 |       std::string::size_type spacepos = nameDesc.find(" ");
124 |       name = nameDesc.substr(0, spacepos);
125 |       description = (spacepos == std::string::npos
126 | 		     ? ""
127 | 		     : nameDesc.substr(spacepos));
128 | 
129 |       for (ch = i.get(); (ch != EOF) && (ch != '>'); ch = i.get()) {
130 | 	if ((ch != '\n') && (ch != '\r') && (ch != ' ')) {
131 | 	  if (((ch >= 'a') && (ch <= 'z'))
132 | 	      || ((ch >= 'A') && (ch <= 'Z'))
133 | 	      || (ch == '-') || (ch == '*')) {
134 | 	    sequence += ch;
135 | 	  } else {
136 | 	    char failedCh = ch;
137 | 	    /*
138 | 	     * Wind further to the next possible sequence.
139 | 	     */
140 | 	    for (ch = i.get(); (ch != EOF) && (ch != '>'); ch = i.get())
141 | 	      ;
142 | 
143 | 	    if (ch == '>')
144 | 	      i.putback(ch);
145 | 
146 | 	    throw ParseException
147 | 	      (name, std::string("Illegal character in FASTA: '")
148 | 	       + (char)failedCh + "'", true);
149 | 	  }
150 | 	}
151 | 
152 | 	if (i.peek() == EOF)
153 | 	  break;
154 |       }
155 | 
156 |       if (ch == '>')
157 | 	i.putback(ch);
158 |     }
159 | }
160 | 
/*
 * Write one FASTA entry: a ">name description" header line followed by
 * the sequence in lines of at most 60 characters. An empty sequence is
 * written as a single empty line.
 */
void writeFastaEntry(std::ostream& o,
		     const std::string& name,
		     const std::string& description,
		     const std::string& sequence)
{
  const std::string::size_type lineWidth = 60;

  o << ">" << name << " " << description << std::endl;

  if (sequence.empty()) {
    o << std::endl;
    return;
  }

  for (std::string::size_type offset = 0;
       offset < sequence.size();
       offset += lineWidth) {
    o << sequence.substr(offset, lineWidth) << std::endl;
  }
}
176 | 
177 | void writeStockholm(std::ostream& o, const std::vector<NTSequence>& sequences, int length, int labelsize, int seqsize, int pos)
178 | {
179 |   if(labelsize < 1 && seqsize < 1){
180 |     for(std::vector<NTSequence>::const_iterator i = sequences.begin();
181 |         i < sequences.end(); ++i){
182 |       labelsize = std::max(labelsize, (int)i->name().length());
183 |       seqsize = std::max(seqsize, (int)i->size());
184 |     }
185 | 
186 |     o << "# STOCKHOLM 1.0" << std::endl;
187 |   }
188 | 
189 |   int epos = pos+length - (labelsize + 1);
190 |   for(std::vector<NTSequence>::const_iterator i = sequences.begin();
191 |       i < sequences.end(); ++i){
192 |     o << i->name();
193 |     for(int j = 0; j < labelsize - i->name().length() + 1; ++j)
194 |       o << ' ';
195 |     
196 |     int n = std::min(epos, (int)i->size());
197 |     for(int spos=pos; spos<n; ++spos)
198 |       o << (*i)[spos];
199 |   
200 |     o << std::endl;
201 |   }
202 | 
203 |   if(epos >= seqsize){
204 |     o << "//";
205 |   }
206 |   else{
207 |     writeStockholm(o, sequences, length, labelsize, seqsize, epos);
208 |   }
209 | }
210 | 
211 | /// \endcond
212 | 
213 | std::istream& operator>>(std::istream& i, NTSequence& sequence)
214 | {
215 |   std::string name, description, seqString;
216 | 
217 |   readFastaEntry(i, name, description, seqString);
218 |   sequence = NTSequence(name, description, seqString);
219 | 
220 |   return i;
221 | }
222 | 
223 | std::ostream& operator<<(std::ostream& o, const NTSequence& sequence)
224 | {
225 |   writeFastaEntry(o, sequence.name(), sequence.description(),
226 | 		  sequence.asString());
227 |   return o;
228 | }
229 | 
230 | };
231 | 


--------------------------------------------------------------------------------
/src/libseq/Nucleotide.cpp:
--------------------------------------------------------------------------------
  1 | #include <ctype.h>
  2 | #include <stdlib.h>
  3 | #include <set>
  4 | #include <stdexcept>
  5 | 
  6 | #include "ParseException.h"
  7 | #include "Nucleotide.h"
  8 | 
  9 | namespace {
 10 | 
 11 | #ifdef _WIN32
 12 | 
 13 | double drand48()
 14 | {
 15 | 	return (double(rand()) / RAND_MAX);
 16 | }
 17 | 
 18 | #endif
 19 | 
 20 | int sampleUniform(int one, int two)
 21 | {
 22 |   double d = drand48();
 23 |   
 24 |   return (d < 0.5 ? one : two);
 25 | }
 26 | 
 27 | int sampleUniform(int one, int two, int three)
 28 | {
 29 |   double d = drand48() * 3.;
 30 | 
 31 |   return (d < 1. ? one : (d < 2. ? two : three));
 32 | }
 33 | 
 34 | int sampleUniform(int one, int two, int three, int four)
 35 | {
 36 |   double d = drand48() * 4.;
 37 | 
 38 |   return (d < 1. ? one : (d < 2. ? two : (d < 3. ? three : four)));
 39 | }
 40 | };
 41 | 
 42 | namespace seq {
 43 | 
 44 | const char Nucleotide::NT_CHAR[] = {'A', 'C', 'G', 'T',
 45 | 				    'M', 'R', 'W', 'S',
 46 | 				    'Y', 'K', 'V', 'H',
 47 | 				    'D', 'B', 'N', '-' };
 48 | 
 49 | const Nucleotide Nucleotide::A(Nucleotide::NT_A);
 50 | const Nucleotide Nucleotide::C(Nucleotide::NT_C);
 51 | const Nucleotide Nucleotide::G(Nucleotide::NT_G);
 52 | const Nucleotide Nucleotide::T(Nucleotide::NT_T);
 53 | const Nucleotide Nucleotide::M(Nucleotide::NT_M);
 54 | const Nucleotide Nucleotide::R(Nucleotide::NT_R);
 55 | const Nucleotide Nucleotide::W(Nucleotide::NT_W);
 56 | const Nucleotide Nucleotide::S(Nucleotide::NT_S);
 57 | const Nucleotide Nucleotide::Y(Nucleotide::NT_Y);
 58 | const Nucleotide Nucleotide::K(Nucleotide::NT_K);
 59 | const Nucleotide Nucleotide::V(Nucleotide::NT_V);
 60 | const Nucleotide Nucleotide::H(Nucleotide::NT_H);
 61 | const Nucleotide Nucleotide::D(Nucleotide::NT_D);
 62 | const Nucleotide Nucleotide::B(Nucleotide::NT_B);
 63 | const Nucleotide Nucleotide::N(Nucleotide::NT_N);
 64 | const Nucleotide Nucleotide::GAP(Nucleotide::NT_GAP);
 65 | 
 66 | Nucleotide::Nucleotide()
 67 |   : rep_(NT_N)
 68 | { }
 69 | 
 70 | void Nucleotide::sampleAmbiguity()
 71 | {
 72 |   switch (rep_) {
 73 |   case NT_A:
 74 |   case NT_C:
 75 |   case NT_G:
 76 |   case NT_T:
 77 |   case NT_GAP:
 78 |     break;
 79 |   case NT_M:
 80 |     rep_ = sampleUniform(NT_A, NT_C); break;
 81 |   case NT_R:
 82 |     rep_ = sampleUniform(NT_A, NT_G); break;  
 83 |   case NT_W:
 84 |     rep_ = sampleUniform(NT_A, NT_T); break;
 85 |   case NT_S:
 86 |     rep_ = sampleUniform(NT_C, NT_G); break;
 87 |   case NT_Y:
 88 |     rep_ = sampleUniform(NT_C, NT_T); break;
 89 |   case NT_K:
 90 |     rep_ = sampleUniform(NT_G, NT_T); break;
 91 |   case NT_V:
 92 |     rep_ = sampleUniform(NT_A, NT_C, NT_G); break;
 93 |   case NT_H:
 94 |     rep_ = sampleUniform(NT_A, NT_C, NT_T); break;
 95 |   case NT_D:
 96 |     rep_ = sampleUniform(NT_A, NT_G, NT_T); break;
 97 |   case NT_B:
 98 |     rep_ = sampleUniform(NT_C, NT_G, NT_T); break;
 99 |   case NT_N:
100 |     rep_ = sampleUniform(NT_A, NT_C, NT_G, NT_T); break;
101 |   default:
102 |     std::cerr << rep_ << std::endl;
103 |     assert(false);
104 |   }
105 | }
106 | 
107 | Nucleotide Nucleotide::reverseComplement() const
108 | {
109 |   switch (rep_) {
110 |   case NT_A: return NT_T;
111 |   case NT_C: return NT_G;
112 |   case NT_G: return NT_C;
113 |   case NT_T: return NT_A;
114 |   case NT_GAP: return NT_GAP;
115 |   case NT_M: return /* AC -> TG */ NT_K;
116 |   case NT_R: return /* AG -> TC */ NT_Y;
117 |   case NT_W: return /* AT -> TA */ NT_W;
118 |   case NT_S: return /* CG -> GC */ NT_S;
119 |   case NT_Y: return /* CT -> GA */ NT_R;
120 |   case NT_K: return /* GT -> CA */ NT_M;
121 |   case NT_V: return /* ACG -> TGC */ NT_B;
122 |   case NT_H: return /* ACT -> TGA */ NT_D;
123 |   case NT_D: return /* AGT -> TCA */ NT_H;
124 |   case NT_B: return /* CGT -> GCA */ NT_V;
125 |   case NT_N: return NT_N;
126 |   default:
127 |     std::cerr << rep_ << std::endl;
128 |     assert(false);
129 |   }
130 | }
131 | 
132 | Nucleotide Nucleotide::singleNucleotide(std::set<Nucleotide>& nucleotides)
133 | {
134 | 	std::set<Nucleotide>::iterator itgap = nucleotides.find(GAP);
135 | 	if(itgap != nucleotides.end())
136 | 		nucleotides.erase(itgap);
137 | 
138 | 	if (nucleotides.size() == 1)
139 | 		return *nucleotides.begin();
140 | 	
141 | 	std::set<Nucleotide> all;
142 | 	for(std::set<Nucleotide>::iterator it = nucleotides.begin(); it != nucleotides.end(); ++it) {
143 | 		std::vector<Nucleotide> t;
144 | 		it->nonAmbiguousNucleotides(t);
145 | 		all.insert(t.begin(), t.end());
146 | 	}
147 | 	bool nta = all.find(A) != all.end();
148 | 	bool ntc = all.find(C) != all.end();
149 | 	bool ntg = all.find(G) != all.end();
150 | 	bool ntt = all.find(T) != all.end();
151 | 
152 | 	if (nta && ntc && ntg && ntt)
153 | 		return N;
154 | 	if (nta && ntc && ntg)
155 | 		return V;
156 | 	if (nta && ntc && ntt)
157 | 		return H;
158 | 	if (nta && ntg && ntt)
159 | 		return D;
160 | 	if (ntc && ntg && ntt)
161 | 		return B;
162 |  	if (nta && ntc)
163 | 		return M;
164 | 	if (ntg && ntt)
165 | 		return K;
166 | 	if (nta && ntt)
167 | 		return W;
168 | 	if (ntg && ntc)
169 | 		return S;
170 | 	if (ntc && ntt)
171 | 		return Y;
172 | 	if (nta && ntg)
173 | 		return R;		
174 | 
175 | 	throw std::runtime_error
176 | 	  ("Internal error in Nucleotide::singleNucleotide()");
177 | }
178 | 
179 | /**
180 |  * Get all non ambiguous nucleotides represented by this nucleotide.
181 |  */ 
182 | void Nucleotide::nonAmbiguousNucleotides(std::vector<Nucleotide>& result) const
183 | {
184 |   switch (rep_) {
185 |   case NT_A:
186 |   case NT_C:
187 |   case NT_G:
188 |   case NT_T:
189 |   case NT_GAP:
190 |     result.push_back(*this);
191 |     break;
192 |   case NT_M:
193 |     result.push_back(A);
194 |     result.push_back(C);
195 |     break;
196 |   case NT_R:
197 |     result.push_back(A);
198 |     result.push_back(G);
199 |     break;
200 |   case NT_W:
201 |     result.push_back(A);
202 |     result.push_back(T);
203 |     break;
204 |   case NT_S:
205 |     result.push_back(C);
206 |     result.push_back(G);
207 |     break;
208 |   case NT_Y:
209 |     result.push_back(C);
210 |     result.push_back(T);
211 |     break;
212 |   case NT_K:
213 |     result.push_back(G);
214 |     result.push_back(T);
215 |     break;
216 |   case NT_V:
217 |     result.push_back(A);
218 |     result.push_back(C);
219 |     result.push_back(G);
220 |     break;
221 |   case NT_H:
222 |     result.push_back(A);
223 |     result.push_back(C);
224 |     result.push_back(T);
225 |     break;
226 |   case NT_D:
227 |     result.push_back(A);
228 |     result.push_back(G);
229 |     result.push_back(T);
230 |     break;
231 |   case NT_B:
232 |     result.push_back(C);
233 |     result.push_back(G);
234 |     result.push_back(T);
235 |     break;
236 |   case NT_N:
237 |     result.push_back(A);
238 |     result.push_back(C);
239 |     result.push_back(G);
240 |     result.push_back(T);
241 |     break;
242 |   default:
243 |     std::cerr << rep_ << std::endl;
244 |     assert(false);
245 |   }
246 | }
247 | 
248 | 
249 | std::ostream& operator<< (std::ostream& s, const Nucleotide nt)
250 | {
251 |   return s << nt.toChar();
252 | }
253 | 
254 | };
255 | 


--------------------------------------------------------------------------------
/src/mxml/README:
--------------------------------------------------------------------------------
  1 | README - 2011-12-20
  2 | -------------------
  3 | 
  4 | 
  5 | INTRODUCTION
  6 | 
  7 |     This README file describes the Mini-XML library version 2.7.
  8 | 
  9 |     Mini-XML is a small XML parsing library that you can use to read XML and
 10 |     XML-like data files in your application without requiring large non-standard
 11 |     libraries.  Mini-XML only requires an ANSI C compatible compiler (GCC works,
 12 |     as do most vendors' ANSI C compilers) and a "make" program.
 13 | 
 14 |     Mini-XML provides the following functionality:
 15 | 
 16 | 	- Reading of UTF-8 and UTF-16 and writing of UTF-8 encoded XML files and
 17 | 	  strings.
 18 | 	- Data is stored in a linked-list tree structure, preserving the XML
 19 | 	  data hierarchy.
 20 | 	- Supports arbitrary element names, attributes, and attribute values
 21 | 	  with no preset limits, just available memory.
 22 | 	- Supports integer, real, opaque ("cdata"), and text data types in
 23 | 	  "leaf" nodes.
 24 | 	- Functions for creating and managing trees of data.
 25 | 	- "Find" and "walk" functions for easily locating and navigating trees
 26 | 	  of data.
 27 | 
 28 |     Mini-XML doesn't do validation or other types of processing on the data
 29 |     based upon schema files or other sources of definition information.
 30 | 
 31 | 
 32 | BUILDING Mini-XML
 33 | 
 34 |     Mini-XML comes with an autoconf-based configure script; just type the
 35 |     following command to get things going:
 36 | 
 37 |         ./configure
 38 | 
 39 |     The default install prefix is /usr/local, which can be overridden using the
 40 |     --prefix option:
 41 | 
 42 |         ./configure --prefix=/foo
 43 | 
 44 |     Other configure options can be found using the --help option:
 45 | 
 46 |         ./configure --help
 47 | 
 48 |     Once you have configured the software, type "make" to do the build and run
 49 |     the test program to verify that things are working, as follows:
 50 | 
 51 |         make
 52 | 
 53 |     If you are using Mini-XML under Microsoft Windows with Visual C++ 2008, use
 54 |     the included project files in the "vcnet" subdirectory to build the library
 55 |     instead.
 56 | 
 57 | 
 58 | INSTALLING Mini-XML
 59 | 
 60 |     The "install" target will install Mini-XML in the lib and include
 61 |     directories:
 62 | 
 63 |         make install
 64 | 
 65 |     Once you have installed it, use the "-lmxml" option to link your application
 66 |     against it.
 67 | 
 68 | 
 69 | DOCUMENTATION
 70 | 
 71 |     The documentation is available in the "doc" subdirectory in the files
 72 |     "mxml.html" (HTML) and "mxml.pdf" (PDF). You can also look at the
 73 |     "testmxml.c" and "mxmldoc.c" source files for examples of using Mini-XML.
 74 | 
 75 |     Mini-XML provides a single header file which you include:
 76 | 
 77 |         #include <mxml.h>
 78 | 
 79 |     Nodes are defined by the "mxml_node_t" structure; the "type" member defines
 80 |     the node type (element, integer, opaque, real, or text) which determines
 81 |     which value you want to look at in the "value" union.  New nodes can be
 82 |     created using the "mxmlNewElement()", "mxmlNewInteger()", "mxmlNewOpaque()",
 83 |     "mxmlNewReal()", and "mxmlNewText()" functions.  Only elements can have
 84 |     child nodes, and the top node must be an element, usually "?xml".
 85 | 
 86 |     You load an XML file using the "mxmlLoadFile()" function:
 87 | 
 88 |         FILE *fp;
 89 |         mxml_node_t *tree;
 90 | 
 91 | 	fp = fopen("filename.xml", "r");
 92 | 	tree = mxmlLoadFile(NULL, fp, MXML_NO_CALLBACK);
 93 | 	fclose(fp);
 94 | 
 95 |     Similarly, you save an XML file using the "mxmlSaveFile()" function:
 96 | 
 97 |         FILE *fp;
 98 |         mxml_node_t *tree;
 99 | 
100 | 	fp = fopen("filename.xml", "w");
101 | 	mxmlSaveFile(tree, fp, MXML_NO_CALLBACK);
102 | 	fclose(fp);
103 | 
104 |     The "mxmlLoadString()", "mxmlSaveAllocString()", and "mxmlSaveString()"
105 |     functions load XML node trees from and save XML node trees to strings:
106 | 
107 |         char buffer[8192];
108 | 	char *ptr;
109 | 	mxml_node_t *tree;
110 | 
111 |         ...
112 | 	tree = mxmlLoadString(NULL, buffer, MXML_NO_CALLBACK);
113 | 
114 |         ...
115 |         mxmlSaveString(tree, buffer, sizeof(buffer), MXML_NO_CALLBACK);
116 | 
117 |         ...
118 | 	ptr = mxmlSaveAllocString(tree, MXML_NO_CALLBACK);
119 | 
120 |     You can find a named element/node using the "mxmlFindElement()" function:
121 | 
122 |         mxml_node_t *node = mxmlFindElement(tree, tree, "name", "attr",
123 | 	                                    "value", MXML_DESCEND);
124 | 
125 |     The "name", "attr", and "value" arguments can be passed as NULL to act as
126 |     wildcards, e.g.:
127 | 
128 |         /* Find the first "a" element */
129 |         node = mxmlFindElement(tree, tree, "a", NULL, NULL, MXML_DESCEND);
130 | 
131 |         /* Find the first "a" element with "href" attribute */
132 |         node = mxmlFindElement(tree, tree, "a", "href", NULL, MXML_DESCEND);
133 | 
134 |         /* Find the first "a" element with "href" to a URL */
135 |         node = mxmlFindElement(tree, tree, "a", "href",
136 | 	                       "http://www.minixml.org/",
137 | 			       MXML_DESCEND);
138 | 
139 |         /* Find the first element with a "src" attribute*/
140 |         node = mxmlFindElement(tree, tree, NULL, "src", NULL, MXML_DESCEND);
141 | 
142 |         /* Find the first element with a "src" = "foo.jpg" */
143 |         node = mxmlFindElement(tree, tree, NULL, "src", "foo.jpg",
144 | 	                       MXML_DESCEND);
145 | 
146 |     You can also iterate with the same function:
147 | 
148 |         mxml_node_t *node;
149 | 
150 | 	for (node = mxmlFindElement(tree, tree, "name", NULL, NULL,
151 | 	                            MXML_DESCEND);
152 | 	     node != NULL;
153 | 	     node = mxmlFindElement(node, tree, "name", NULL, NULL,
154 | 	                            MXML_DESCEND))
155 |         {
156 | 	  ... do something ...
157 | 	}
158 | 
159 |     The "mxmlFindPath()" function finds the (first) value node under a specific
160 |     element using a "path":
161 | 
162 |         mxml_node_t *value = mxmlFindPath(tree, "path/to/*/foo/bar");
163 | 
164 |     The "mxmlGetInteger()", "mxmlGetOpaque()", "mxmlGetReal()", and
165 |     "mxmlGetText()" functions retrieve the value from a node:
166 | 
167 |         mxml_node_t *node;
168 | 
169 |         int intvalue = mxmlGetInteger(node);
170 | 
171 |         const char *opaquevalue = mxmlGetOpaque(node);
172 | 
173 |         double realvalue = mxmlGetReal(node);
174 | 
175 |         int whitespacevalue;
176 |         const char *textvalue = mxmlGetText(node, &whitespacevalue);
177 | 
178 |     Finally, once you are done with the XML data, use the "mxmlDelete()"
179 |     function to recursively free the memory that is used for a particular node
180 |     or the entire tree:
181 | 
182 |         mxmlDelete(tree);
183 | 
184 | 
185 | GETTING HELP AND REPORTING PROBLEMS
186 | 
187 |     The Mini-XML web site provides access to a discussion forum and bug
188 |     reporting page:
189 | 
190 |         http://www.minixml.org/
191 | 
192 | 
193 | LEGAL STUFF
194 | 
195 |     The Mini-XML library is Copyright 2003-2011 by Michael Sweet.  License terms
196 |     are described in the file "COPYING".
197 | 


--------------------------------------------------------------------------------
/references/CHIKV/CHIKV-NC004162-gp1.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | 	<!-- CHIKV gp1 ORF from NC_004162 (77-7501) -->
3 | 	<orf name="CHIKV_gp1" description="glycoprotein1" referenceSequence="atggatcctgtgtacgtggacatagacgctgacagcgcctttttgaaggccctgcaacgtgcgtaccccatgtttgaggtggaacctaggcaggtcacaccgaatgaccatgctaatgctagagcgttctcgcatctagctataaaactaatagagcaggaaattgatcccgactcaaccatcctggatattggtagtgcgccagcaaggaggatgatgtcggacaggaagtaccactgcgtttgcccgatgcgcagtgcagaagatcccgagagactcgccaattatgcgagaaagctagcatctgccgcaggaaaagtcctggacagaaacatctctggaaagatcggggacttacaagcagtaatggccgtgccagacacggagacgccaacattctgcttacacacagatgtatcatgtagacagagagcagacgtcgcgatataccaagacgtctatgctgtacacgcacccacgtcgctataccaccaggcgattaaaggggtccgattggcgtactgggtagggtttgacacaaccccgttcatgtacaatgccatggcgggtgcctacccctcatactcgacaaattgggcagatgagcaggtactgaaggctaagaacataggattatgttcaacagacctgacggaaggtagacgaggcaaattgtctattatgagaggaaaaaagctagaaccgtgcgaccgtgtgctgttctcagtagggtcaacgctctacccggaaagccgtaagctacttaagagctggcacctaccatcggtgttccatttaaagggcaagctcagcttcacatgccgctgtgatacagtggtttcgtgcgaaggctacgtcgttaagagaataacgatgagcccaggcctttacggaaaaaccacagggtatgcggtaacccaccacgcagacggattcctgatgtgcaagaccaccgacacggttgacggcgaaagagtgtcattctcggtgtgcacgtacgtgccggcgaccatttgtgatcaaatgaccggcatccttgctacagaagtcacgccggaggatgcacagaagctgttggtggggctgaaccagagaatagtggttaacggcagaacgcaacggaatacgaacaccatgaaaaactatatgattcccgtggtcgcccaagccttcagtaagtgggcaaaggagtgccggaaagacatggaagatgaaaaactcctgggggtcagagaaagaacactgacctgctgctgtctatgggcatttaagaagcagaaaacacacacggtctacaagaggcctgatacccagtcaattcagaaggttcaggccgagtttgacagctttgtggtaccgagcctgtggtcgtccgggttgtcaatcccgttgaggactagaatcaaatggttgttaagcaaggtgccaaaaaccgacctgaccccatacagcggggacgcccaagaagcccgggacgcagaaaaagaagcagaggaagaacgagaagcagaactgactcttgaagccctaccaccccttcaggcagcacaggaagatgttcaggtcgaaatcgacgtggaacagcttgaggacagagcgggtgcaggaataatagagactccgagaggagctatcaaagttactgcccaaccaacagaccacgtcgtgggagagtacttggttctttccccgcagaccgtactacgtagccaaaagcttagcctgattcacgctttggcggagcaagtgaagacgtgcacgcacagcggacgagcagggaggtatgcggtcgaagcgtacgacggcagagtcctagtgccctcaggctacgcaatctcgcctgaagacttccagagcctaagcgaaagcgcaacgatggtgtacaacgaaagagagttcgtaaacagaaagctacaccatattgcg
atgcatggaccagccctgaacaccgacgaagagtcgtatgagctggtgagggcagagaggacagaacacgagtacgtctacgacgtggaccagagaagatgctgtaagaaggaagaagctgcaggactggtactggtgggcgacttgactaatccgccctaccacgaattcgcatatgaagggctaaaaatccgccctgcctgcccatacaaaattgcagtcataggagtcttcggagtaccaggatctggcaagtcagctattatcaagaacctagttaccaggcaagacctggtgactagcggaaagaaagaaaactgccaagaaatcaccaccgacgtgatgagacagagaggtctagagatatctgcacgtacggttgactcgctgctcttgaatggatgtaacagaccagtcgacgtgttgtacgtagacgaggcgtttgcgtgccactctggaacgttacttgcattgatcgccttggtgagaccaagacagaaagttgtactttgtggtgacccgaagcagtgcggcttcttcaatatgatgcagatgaaagtcaactataatcacaacatctgcacccaagtgtaccacaaaagtatctccaggcggtgtacactgcctgtgactgccattgtgtcatcgttgcattacgaaggcaaaatgcgcactacgaatgagtacaacaagccgattgtagtggacactacaggctcaacaaaacctgaccctggagatctcgtgttaacgtgcttcagaggatgggttaaacaactgcaaattgactatcgtggacacgaggtcatgacagcagccgcatcccaagggttaaccagaaaaggagtttacgcagttaggcaaaaagttaacgaaaacccgctttatgcatcaacgtcagagcacgtcaacgtactcctaacgcgtacggaaggtaaactggtatggaagacactctccggtgacccgtggataaagacgctgcagaacccaccgaaaggaaacttcaaagcaactattaaggagtgggaggtggagcatgcatcaataatggcgggcatctgcagtcaccaaatgacctttgatacattccaaaacaaagccaacgtttgttgggctaagagtttggtccctatcctcgaaacagcggggataaaactaaacgacaggcagtggtcccagataattcaagccttcaaagaagacaaagcatattcacccgaagtagccctgaatgaaatatgcacgcgcatgtatggggtggatctagacagcgggctattttctaaaccgttggtgtctgtgtattacgcggataaccactgggataataggcctggagggaagatgttcggattcaaccccgaggcagcatccattctagaaagaaagtatccatttacaaaagggaagtggaacatcaacaagcagatctgcgtgactaccaggaggatagaagacttcaaccctaccaccaacattataccggccaacaggagactaccacactcattagtggccgaacaccgcccagtaaaaggggaaagaatggaatggctggttaacaagataaacggccaccacgtgctcctggtcagtggctgtagccttgcactgcctactaagagagtcacttgggtagcgccactaggtgtccgcggagcggactatacatacaacctagagttgggtctgccagcaacgcttggtaggtatgacctagtggtcataaacatccacacaccttttcgcatacaccattatcaacagtgcgtagaccacgcaatgaaactgcaaatgctcgggggtgactcattgagactgctcaaaccgggtggctctctattgatcagagcatatggttacgcagatagaaccagtgaacgagtcatctgcgtattgggacgcaagtttagatcatctagagcgttgaaaccaccatgtgtcaccagcaacactgagatgttttttctattcagcaactttgacaatggcagaag
gaatttcacaactcatgtcatgaacaatcaactgaatgcagcctttgtaggacaggccacccgagcaggatgtgcaccgtcgtaccgggtaaaacgcatggatatcgcgaagaacgatgaagagtgcgtagtcaacgccgccaaccctcgcgggttaccaggtgacggtgtttgcaaggcagtatacaaaaaatggccggagtcctttaagaacagtgcaacaccagtgggaaccgcaaaaacagtcatgtgcggtacgtatccagtaatccacgccgttggaccaaacttctctaattattcggagtctgaaggggaccgagaattggcggctgcctatcgagaagtcgcaaaggaggtaactagactgggagtaaatagtgtagctatacctctcctctccacaggtgtatactcaggagggaaagacaggctgacccagtcactgaaccacctctttacagccatggactcgacggatgcagacgtggtcatctactgccgcgacaaagaatgggagaagaaaatatctgaggccatacagatgcggacccaagtggagctgctggatgagcacatctccatagactgcgatgttgttcgcgtgcaccctgacagcagcttggcaggcagaaaaggatacagcaccacggaaggcgcactgtactcatatctagaagggacccgttttcaccaaacggcagtggatatggcagagatatatactatgtggccaaagcaaacagaggccaacgagcaagtttgcctatatgccctgggggaaagtattgaatcgatcaggcagaaatgcccggtggatgatgcagatgcatcatctcccccgaaaactgtcccgtgcctctgccgttacgccatgacaccagaacgcgttacccgacttcgcatgaaccatgtcacaagcataattgtgtgttcttcgtttccccttccaaagtacaaaatagaaggagtgcaaaaagtcaaatgctccaaggtaatgctatttgaccacaacgtgccatcgcgcgtaagtccaagggaatacagaccttcccaggagtctgtacaggaagcgagtacgaccacgtcactgacgcatagccaattcgatctaagcgttgacggcaagatactgcccgtcccgtcagacctggatgctgacgccccagccctagaaccagcccttgacgacggggcgatacacacgttgccatctgcaaccggaaaccttgcggccgtgtctgactgggtaatgagcaccgtacctgtcgcgccgcccagaagaaggcgagggagaaacctgactgtgacatgcgacgagagagaagggaatataacacccatggctagcgtccgattctttagggcagagctgtgtccagtcgtacaagaaacagcggagacgcgtgacacagctatgtctcttcaggcaccgccgagtaccgccacggaactgagtcacccgccgatctccttcggtgcaccaagcgagacgttccccatcacatttggggacttcaacgaaggagaaatcgaaagcttgtcttctgagctactaactttcggagacttcctacccggagaagtggatgatttgacagatagcgactggtccacgtgctcagacacggacgacgagttacgactagacagggcaggtgggtatatattctcgtcggacactggtccaggtcatttacaacagaagtcagtacgccagtcagtgctgccggtgaacaccctggaggaagtccacgaggagaagtgttacccacctaagctggatgaagcaaaggagcaactactacttaagaaactccaggagagtgcatccatggccaacagaagcaggtatcagtcgcgcaaagtagaaaacatgaaagcaacaatcatccagagactaaagagaggctgtagattatacttaatgtcagagaccccaaaagtccctacctaccggaccacatatccggcgcctgtgtactcgcctccgattaacg
tccgactgtccaaccccgagtccgcagtggcagcatgcaatgagttcttggctagaaactatccaactgtttcatcataccaaatcaccgacgagtatgatgcatatctagacatggtggacgggtcggagagttgtctggaccgagcgacattcaatccgtcaaaacttaggagctacccaaaacagcacgcttaccacgcgccctccatcagaagcgctgtaccgtccccattccagaacacactacagaatgtactggcagcagccacgaaaagaaactgcaacgtcacacagatgagggaattacccactttggactcagcagtattcaacgtggagtgtttcaaaaaattcgcatgcaaccaagaatactgggaagaatttgctgccagccctatcaggataacaactgagaatttaacaacctatgttactaaactaaaggggccaaaagcagcagcgctatttgcaaaaacccataatctgctgccactgcaggaagtgccaatggataggttcacagtagacatgaaaagggatgtgaaggtgactcctggtacaaagcacacagaggaaagacctaaggtacaggttatacaggcggctgaacccttggcaacagcatacctatgtgggattcacagagagctggttaggaggctgaacgccgtcctcctacccaatgtacatacactatttgacatgtctgccgaggatttcgatgccatcatagccgcacactttaagccaggagacactgttttagaaacggacatagcctcctttgataagagccaagatgattcacttgcgcttactgctttaatgctgttagaggatttaggggtggatcactccctgttggacttgatagaggctgctttcggagagatttccagctgtcatctaccgacaggtacgcgcttcaagttcggcgccatgatgaaatctggtatgttcctaactctgttcgtcaacacactgctaaatatcaccatcgccagccgagtgctggaagatcgtctgacaaaatccgcgtgcgcagccttcatcggcgacgacaacataatacatggagtcgtctccgatgaattgatggcagccagatgcgccacttggatgaacatggaagtgaagatcatagatgcagttgtatcccagaaagccccttacttttgtggagggtttatactgcacgatatcgtgacaggaacagcttgcagagtggcagacccgctaaaaaggctatttaaactgggcaaaccgctagcggcaggtgacgaacaagatgaggatagaagacgagcgctggctgacgaagtggtcagatggcaacgaacagggctaattgatgagttggagaaagcggtatactctaggtatgaagtgcagggtatatcagttgtggtaatgtccatggccacctttgcaagctccagatccaacttcgagaagctcagaggacccgtcgtaactttgtacggcggtcctaaatag" version="0">
4 | </orf>


--------------------------------------------------------------------------------
/src/mxml/mxml-search.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * "$Id: mxml-search.c 427 2011-01-03 02:03:29Z mike $"
  3 |  *
  4 |  * Search/navigation functions for Mini-XML, a small XML-like file
  5 |  * parsing library.
  6 |  *
  7 |  * Copyright 2003-2010 by Michael R Sweet.
  8 |  *
  9 |  * These coded instructions, statements, and computer programs are the
 10 |  * property of Michael R Sweet and are protected by Federal copyright
 11 |  * law.  Distribution and use rights are outlined in the file "COPYING"
 12 |  * which should have been included with this file.  If this file is
 13 |  * missing or damaged, see the license at:
 14 |  *
 15 |  *     http://www.minixml.org/
 16 |  *
 17 |  * Contents:
 18 |  *
 19 |  *   mxmlFindElement() - Find the named element.
 20 |  *   mxmlFindPath()    - Find a node with the given path.
 21 |  *   mxmlWalkNext()    - Walk to the next logical node in the tree.
 22 |  *   mxmlWalkPrev()    - Walk to the previous logical node in the tree.
 23 |  */
 24 | 
 25 | /*
 26 |  * Include necessary headers...
 27 |  */
 28 | 
 29 | #include "config.h"
 30 | #include "mxml.h"
 31 | 
 32 | 
 33 | /*
 34 |  * 'mxmlFindElement()' - Find the named element.
 35 |  *
 36 |  * The search is constrained by the name, attribute name, and value; any
 37 |  * NULL names or values are treated as wildcards, so different kinds of
 38 |  * searches can be implemented by looking for all elements of a given name
 39 |  * or all elements with a specific attribute. The descend argument determines
 40 |  * whether the search descends into child nodes; normally you will use
 41 |  * MXML_DESCEND_FIRST for the initial search and MXML_NO_DESCEND to find
 42 |  * additional direct descendents of the node. The top node argument
 43 |  * constrains the search to a particular node's children.
 44 |  */
 45 | 
 46 | mxml_node_t *				/* O - Element node or NULL */
 47 | mxmlFindElement(mxml_node_t *node,	/* I - Current node */
 48 |                 mxml_node_t *top,	/* I - Top node */
 49 |                 const char  *name,	/* I - Element name or NULL for any */
 50 | 		const char  *attr,	/* I - Attribute name, or NULL for none */
 51 | 		const char  *value,	/* I - Attribute value, or NULL for any */
 52 | 		int         descend)	/* I - Descend into tree - MXML_DESCEND, MXML_NO_DESCEND, or MXML_DESCEND_FIRST */
 53 | {
 54 |   const char	*attrval;		/* Value of the requested attribute */
 55 | 
 56 |  /*
 57 |   * Reject missing nodes, and an attribute value that was given without
 58 |   * an attribute name to look it up under...
 59 |   */
 60 | 
 61 |   if (node == NULL || top == NULL)
 62 |     return (NULL);
 63 | 
 64 |   if (attr == NULL && value != NULL)
 65 |     return (NULL);
 66 | 
 67 |  /*
 68 |   * Visit the candidates one after the other, beginning with the node
 69 |   * that follows the current one.  Only MXML_DESCEND keeps walking with
 70 |   * mxmlWalkNext(); every other mode continues along the "next" links
 71 |   * after the first step...
 72 |   */
 73 | 
 74 |   for (node = mxmlWalkNext(node, top, descend);
 75 |        node != NULL;
 76 |        node = (descend == MXML_DESCEND) ?
 77 |                   mxmlWalkNext(node, top, MXML_DESCEND) : node->next)
 78 |   {
 79 |    /*
 80 |     * Skip anything that is not a named element...
 81 |     */
 82 | 
 83 |     if (node->type != MXML_ELEMENT || !node->value.element.name)
 84 |       continue;
 85 | 
 86 |    /*
 87 |     * Skip elements whose name differs from the requested one; a NULL
 88 |     * name accepts every element...
 89 |     */
 90 | 
 91 |     if (name && strcmp(node->value.element.name, name) != 0)
 92 |       continue;
 93 | 
 94 |    /*
 95 |     * Without an attribute constraint this element is the result...
 96 |     */
 97 | 
 98 |     if (!attr)
 99 |       return (node);
100 | 
101 |    /*
102 |     * Otherwise the attribute has to be present and, when a value was
103 |     * given, has to compare equal to it...
104 |     */
105 | 
106 |     attrval = mxmlElementGetAttr(node, attr);
107 | 
108 |     if (attrval != NULL && (value == NULL || strcmp(value, attrval) == 0))
109 |       return (node);
110 |   }
111 | 
112 |  /*
113 |   * Ran out of nodes without a match...
114 |   */
115 | 
116 |   return (NULL);
117 | }
118 | 
119 | 
120 | /*
121 |  * 'mxmlFindPath()' - Find a node with the given path.
122 |  *
123 |  * The "path" is a slash-separated list of element names. The name "*" is
124 |  * considered a wildcard for one or more levels of elements.  For example,
125 |  * "foo/one/two", "bar/two/one", "*\/one", and so forth.
126 |  *
127 |  * The first child node of the found node is returned if the given node has
128 |  * children and the first child is a value node.
129 |  * 
130 |  * @since Mini-XML 2.7@
131 |  */
132 | 
133 | mxml_node_t *				/* O - Found node or NULL */
134 | mxmlFindPath(mxml_node_t *top,		/* I - Top node */
135 | 	     const char  *path)		/* I - Path to element */
136 | {
137 |   mxml_node_t	*node;			/* Node matched so far */
138 |   char		element[256];		/* Copy of the current path component */
139 |   const char	*slash;			/* Next '/' in the path, if any */
140 |   size_t	len;			/* Length of the current component */
141 |   int		descend;		/* Search mode for mxmlFindElement */
142 | 
143 |  /*
144 |   * Nothing to find without a tree or with an empty path...
145 |   */
146 | 
147 |   if (!top || !path || !*path)
148 |     return (NULL);
149 | 
150 |  /*
151 |   * Consume the path one component at a time; each search starts from
152 |   * the node matched by the previous component...
153 |   */
154 | 
155 |   for (node = top; *path;)
156 |   {
157 |    /*
158 |     * A leading wildcard component is skipped and selects MXML_DESCEND
159 |     * for the component after it; otherwise MXML_DESCEND_FIRST is used...
160 |     */
161 | 
162 |     descend = MXML_DESCEND_FIRST;
163 |     if (path[0] == '*' && path[1] == '/')
164 |     {
165 |       descend = MXML_DESCEND;
166 |       path    += 2;
167 |     }
168 | 
169 |    /*
170 |     * Measure the component up to the next '/' or the end of the path;
171 |     * an empty component or one that does not fit the buffer fails...
172 |     */
173 | 
174 |     slash = strchr(path, '/');
175 |     len   = slash ? (size_t)(slash - path) : strlen(path);
176 | 
177 |     if (len == 0 || len >= sizeof(element))
178 |       return (NULL);
179 | 
180 |     memcpy(element, path, len);
181 |     element[len] = '\0';
182 |     path += len;
183 | 
184 |     if (*path)
185 |       path ++;				/* Step over the separator */
186 | 
187 |    /*
188 |     * Look the component up, starting from the current node...
189 |     */
190 | 
191 |     node = mxmlFindElement(node, node, element, NULL, NULL, descend);
192 |     if (node == NULL)
193 |       return (NULL);
194 |   }
195 | 
196 |  /*
197 |   * Prefer the first child when it is not an element node...
198 |   */
199 | 
200 |   if (node->child && node->child->type != MXML_ELEMENT)
201 |     return (node->child);
202 | 
203 |   return (node);
204 | }
205 | 
206 | 
207 | /*
208 |  * 'mxmlWalkNext()' - Walk to the next logical node in the tree.
209 |  *
210 |  * The descend argument controls whether the first child is considered
211 |  * to be the next node. The top node argument constrains the walk to
212 |  * the node's children.
213 |  */
214 | 
215 | mxml_node_t *				/* O - Next node or NULL */
216 | mxmlWalkNext(mxml_node_t *node,		/* I - Current node */
217 |              mxml_node_t *top,		/* I - Top node */
218 |              int         descend)	/* I - Descend into tree - MXML_DESCEND, MXML_NO_DESCEND, or MXML_DESCEND_FIRST */
219 | {
220 |   if (!node)
221 |     return (NULL);
222 | 
223 |   if (descend && node->child)
224 |     return (node->child);		/* Step down first when asked to */
225 | 
226 |   if (node == top)
227 |     return (NULL);			/* Never walk past the top node */
228 | 
229 |  /*
230 |   * Take the next sibling; when there is none, climb through the parents
231 |   * until one has a sibling, giving up at the top node or the root...
232 |   */
233 | 
234 |   while (!node->next)
235 |   {
236 |     if (!node->parent || node->parent == top)
237 |       return (NULL);
238 |     node = node->parent;
239 |   }
240 | 
241 |   return (node->next);
242 | }
243 | 
244 | 
245 | /*
246 |  * 'mxmlWalkPrev()' - Walk to the previous logical node in the tree.
247 |  *
248 |  * The descend argument controls whether the previous node's last child
249 |  * is considered to be the previous node. The top node argument constrains
250 |  * the walk to the node's children.
251 |  */
252 | 
253 | mxml_node_t *				/* O - Previous node or NULL */
254 | mxmlWalkPrev(mxml_node_t *node,		/* I - Current node */
255 |              mxml_node_t *top,		/* I - Top node */
256 |              int         descend)	/* I - Descend into tree - MXML_DESCEND, MXML_NO_DESCEND, or MXML_DESCEND_FIRST */
257 | {
258 |   if (!node || node == top)
259 |     return (NULL);
260 | 
261 |  /*
262 |   * Without a previous sibling the walk moves up to the parent, unless
263 |   * the parent is the top node...
264 |   */
265 | 
266 |   if (!node->prev)
267 |     return ((node->parent != top) ? node->parent : NULL);
268 | 
269 |  /*
270 |   * Otherwise step to the previous sibling and, when descending, follow
271 |   * the last_child links below it down to the node that has no children
272 |   * of its own...
273 |   */
274 | 
275 |   node = node->prev;
276 | 
277 |   if (descend)
278 |     while (node->last_child)
279 |       node = node->last_child;
280 | 
281 |   return (node);
282 | }
283 | 
284 | 
285 | /*
286 |  * End of "$Id: mxml-search.c 427 2011-01-03 02:03:29Z mike $".
287 |  */
288 | 


--------------------------------------------------------------------------------
/src/Virulign.cpp:
--------------------------------------------------------------------------------
  1 | #include <limits>
  2 | #include <fstream>
  3 | #include <iostream>
  4 | #include <vector>
  5 | #include <stdexcept>
  6 | #include <iomanip>
  7 | 
  8 | #include <NeedlemanWunsh.h>
  9 | 
 10 | #include "ReferenceSequence.h"
 11 | #include "Alignment.h"
 12 | #include "ResultsExporter.h"
 13 | #include "CLIUtils.h"
 14 | #include "Utils.h"
 15 | 
 16 | // Picks the loader from the file suffix (".fasta" or ".xml"); any other suffix throws.
 17 | ReferenceSequence loadRefSeq(const std::string& fn) {
 18 |   if (ends_with(fn, ".fasta"))
 19 |     return loadRefSeqFromFile(fn.c_str());
 20 |   if (!ends_with(fn, ".xml"))
 21 |     throw std::runtime_error("Unsupported reference sequence format");
 22 |   return ReferenceSequence::parseOrfReferenceFile(fn);
 23 | }
 24 | 
 25 | // Reports an unrecognised value for a command-line parameter on standard
 26 | // error and terminates with a failure status.
 27 | static void exitUnknownValue(const char* parameterName, const char* parameterValue) {
 28 |   std::cerr << "Unkown value " << parameterValue << " for parameter : " << parameterName << std::endl;
 29 |   exit(1);
 30 | }
 31 | 
 32 | // Command-line entry point.
 33 | //   argv[1]    reference sequence (".fasta") or ORF description (".xml")
 34 | //   argv[2]    file with the target sequences to align
 35 | //   argv[3..]  optional "--name value" pairs, see the usage text below
 36 | // Every target is aligned against the reference and the requested export is
 37 | // streamed to standard output; diagnostics go to standard error.
 38 | // Invalid arguments or unreadable input terminate with exit status 1;
 39 | // printing the usage text (too few arguments) exits with status 0.
 40 | int main(int argc, char **argv) {
 41 |   unsigned int i;
 42 | 
 43 |   int obligatoryParams = 2;
 44 |   if(argc < obligatoryParams+1) {
 45 |     std::cerr << "Usage: virulign [reference.fasta orf-description.xml] sequences.fasta" << std::endl
 46 |               << "Optional parameters (first option will be the default):" << std::endl
 47 |               << "  --exportKind [Mutations PairwiseAlignments GlobalAlignment PositionTable MutationTable]" << std::endl
 48 |               << "  --exportAlphabet [AminoAcids Nucleotides]" << std::endl
 49 |               << "  --exportWithInsertions [yes no]" << std::endl
 50 |               << "  --exportReferenceSequence [no yes]" << std::endl
 51 |               << "  --gapExtensionPenalty doubleValue=>3.3" << std::endl
 52 |               << "  --gapOpenPenalty doubleValue=>10.0" << std::endl
 53 |               << "  --maxFrameShifts intValue=>3" << std::endl
 54 |               << "  --progress [no yes]" << std::endl
 55 |               << "  --nt-debug directory" << std::endl
 56 |               << "Output: The alignment will be printed to standard out and any progress or error messages will be printed to the standard error. This output can be redirected to files, e.g.:" << std::endl
 57 |               << "   virulign ref.xml sequence.fasta > alignment.mutations 2> alignment.err" << std::endl;
 58 |     exit(0);
 59 |   }
 60 | 
 61 |   // Optional arguments must come as complete "--name value" pairs.
 62 |   int amountOfParameters = argc - obligatoryParams - 1;
 63 |   if (amountOfParameters%2 == 1) {
 64 |     std::cerr << "Please provide parameters as: --parameterName parameterValue" << std::endl;
 65 |     exit(1);
 66 |   }
 67 | 
 68 |   std::string refSeqFileName = argv[1];
 69 |   if (!ends_with(refSeqFileName, ".fasta") && !ends_with(refSeqFileName, ".xml")) {
 70 |     std::cerr <<
 71 |       "Unknown reference sequence: "
 72 |       "expected a FASTA file or an XML file that describes the ORF" << std::endl;
 73 |     exit(1);
 74 |   }
 75 |   ReferenceSequence refSeq = loadRefSeq(refSeqFileName);
 76 | 
 77 |   // Fail early when the target file cannot be opened instead of silently
 78 |   // producing an export without any sequences.
 79 |   std::ifstream f_seqs(argv[2]);
 80 |   if (!f_seqs) {
 81 |     std::cerr << "Fatal error: cannot open sequence file " << argv[2] << std::endl;
 82 |     exit(1);
 83 |   }
 84 | 
 85 |   std::vector<seq::NTSequence> targets;
 86 |   try {
 87 |     while (f_seqs) {
 88 |       seq::NTSequence s;
 89 | 
 90 |       f_seqs >> s;
 91 | 
 92 |       // Only keep s when the extraction left the stream in a good state.
 93 |       if (f_seqs) {
 94 |         targets.push_back(s);
 95 |       }
 96 |     }
 97 |   } catch (seq::ParseException& e) {
 98 |     std::cerr << "Fatal error: " << e.message() << std::endl;
 99 |     exit(1);
100 |   }
101 | 
102 |   ExportKind exportKind = Mutations;
103 |   ExportAlphabet exportAlphabet = AminoAcids;
104 |   bool exportWithInsertions = true;
105 | 
106 |   double gapExtensionPenalty = 3.3;
107 |   double gapOpenPenalty = 10.0;
108 |   int maxFrameShifts = 3;
109 | 
110 |   bool progress = false;
111 | 
112 |   std::string ntDebugDir;
113 | 
114 |   // Walk the "--name value" pairs; "arg + 1 < argc" guarantees a value exists.
115 |   char* parameterName;
116 |   char* parameterValue;
117 |   for(int arg = obligatoryParams+1; arg+1 < argc; arg += 2) {
118 |     parameterName = argv[arg];
119 |     parameterValue = argv[arg+1];
120 |     if(equalsString(parameterName,"--exportKind")) {
121 |       if(equalsString(parameterValue, "Mutations")) {
122 |         exportKind = Mutations;
123 |       } else if(equalsString(parameterValue, "PairwiseAlignments")) {
124 |         exportKind = PairwiseAlignments;
125 |       } else if(equalsString(parameterValue, "GlobalAlignment")) {
126 |         exportKind = GlobalAlignment;
127 |       } else if(equalsString(parameterValue, "PositionTable")) {
128 |         exportKind = PositionTable;
129 |       } else if(equalsString(parameterValue, "MutationTable")) {
130 |         exportKind = MutationTable;
131 |       } else {
132 |         exitUnknownValue(parameterName, parameterValue);
133 |       }
134 |     } else if(equalsString(parameterName,"--exportAlphabet")) {
135 |       if(equalsString(parameterValue, "AminoAcids")) {
136 |         exportAlphabet = AminoAcids;
137 |       } else if(equalsString(parameterValue, "Nucleotides")) {
138 |         exportAlphabet = Nucleotides;
139 |       } else {
140 |         exitUnknownValue(parameterName, parameterValue);
141 |       }
142 |     } else if(equalsString(parameterName,"--exportReferenceSequence")) {
143 |       if (equalsString(parameterValue,"yes")) {
144 |         // The reference is exported by aligning it like any other target.
145 |         seq::NTSequence refNtSeq = refSeq;
146 |         targets.insert(targets.begin(), refNtSeq);
147 |       } else if (!equalsString(parameterValue,"no")) {
148 |         exitUnknownValue(parameterName, parameterValue);
149 |       }
150 |     } else if(equalsString(parameterName,"--exportWithInsertions")) {
151 |       if(equalsString(parameterValue,"yes")) {
152 |         exportWithInsertions = true;
153 |       } else if(equalsString(parameterValue,"no")) {
154 |         exportWithInsertions = false;
155 |       } else {
156 |         exitUnknownValue(parameterName, parameterValue);
157 |       }
158 |     } else if(equalsString(parameterName,"--gapExtensionPenalty")) {
159 |       try {
160 |         gapExtensionPenalty = lexical_cast<double>(parameterValue);
161 |       } catch (std::bad_cast& e) {
162 |         exitUnknownValue(parameterName, parameterValue);
163 |       }
164 |     } else if(equalsString(parameterName,"--gapOpenPenalty")) {
165 |       try {
166 |         gapOpenPenalty = lexical_cast<double>(parameterValue);
167 |       } catch (std::bad_cast& e) {
168 |         exitUnknownValue(parameterName, parameterValue);
169 |       }
170 |     } else if(equalsString(parameterName,"--maxFrameShifts")) {
171 |       try {
172 |         maxFrameShifts = lexical_cast<int>(parameterValue);
173 |       } catch (std::bad_cast& e) {
174 |         exitUnknownValue(parameterName, parameterValue);
175 |       }
176 |     } else if(equalsString(parameterName,"--progress")) {
177 |       if(equalsString(parameterValue,"yes")) {
178 |         progress = true;
179 |       } else if(equalsString(parameterValue,"no")) {
180 |         progress = false;
181 |       } else {
182 |         exitUnknownValue(parameterName, parameterValue);
183 |       }
184 |     } else if(equalsString(parameterName,"--nt-debug")) {
185 |       ntDebugDir = parameterValue;
186 |     } else {
187 |       std::cerr << "Unkown parameter name: " << parameterName << std::endl;
188 |       exit(1);
189 |     }
190 |   }
191 | 
192 |   std::vector<Alignment> results;
193 | 
194 |   // Penalties are given as positive numbers and handed over negated.
195 |   seq::NeedlemanWunsh algorithm(-gapOpenPenalty, -gapExtensionPenalty);
196 | 
197 |   // With --nt-debug, align each target directly against the reference and
198 |   // write both to <dir>/<target name>.fasta when the score exceeds 200.
199 |   if (!ntDebugDir.empty()) {
200 |     seq::NTSequence r = refSeq;
201 |     for (i = 0; i < targets.size(); ++i) {
202 |       seq::NTSequence t = targets[i];
203 |       double ntScore = algorithm.align(r, t);
204 |       if(ntScore > 200) {
205 |         std::string dbg = ntDebugDir + std::string("/") + t.name() + ".fasta";
206 |         std::ofstream ofs(dbg.c_str());
207 |         ofs << r;
208 |         ofs << t;
209 |       }
210 |     }
211 |   }
212 | 
213 |   long int start = current_time_ms();
214 | 
215 |   for (i = 0; i < targets.size(); ++i) {
216 |     std::cerr << "Align target " << i
217 |             << " (" << targets[i].name() << ")" << std::endl;
218 |     results.push_back(Alignment::compute(refSeq, targets[i], &algorithm, maxFrameShifts));
219 |     if (progress) {
220 |       // Extrapolate the remaining time from the average time per sequence so far.
221 |       long int end = current_time_ms();
222 |       long int elapsed = end - start;
223 |       double time_per_seq = (double)elapsed / (i + 1);
224 |       double estimated_time_left = time_per_seq * (targets.size() - (i + 1));
225 | 
226 |       std::cerr << "Progress: " << (i + 1) << "/" << targets.size() << " sequences aligned (" << std::fixed << std::setprecision(2) << (i + 1) / (double)targets.size() * 100 << "%), Estimated time left " <<  format_time(estimated_time_left) << std::endl;
227 |     }
228 |   }
229 | 
230 |   ResultsExporter exporter(results, exportKind, exportAlphabet, exportWithInsertions);
231 | 
232 |   exporter.streamData(std::cout);
233 |   return 0;
234 | }
235 | 


--------------------------------------------------------------------------------
/src/mxml/mxml-private.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * "$Id: mxml-private.c 422 2010-11-07 22:55:11Z mike $"
  3 |  *
  4 |  * Private functions for Mini-XML, a small XML-like file parsing library.
  5 |  *
  6 |  * Copyright 2003-2010 by Michael R Sweet.
  7 |  *
  8 |  * These coded instructions, statements, and computer programs are the
  9 |  * property of Michael R Sweet and are protected by Federal copyright
 10 |  * law.  Distribution and use rights are outlined in the file "COPYING"
 11 |  * which should have been included with this file.  If this file is
 12 |  * missing or damaged, see the license at:
 13 |  *
 14 |  *     http://www.minixml.org/
 15 |  *
 16 |  * Contents:
 17 |  *
 *   mxml_error()      - Display an error message.
 *   mxml_ignore_cb()  - Default callback for ignored values.
 *   mxml_integer_cb() - Default callback for integer values.
 *   mxml_opaque_cb()  - Default callback for opaque values.
 *   mxml_real_cb()    - Default callback for real number values.
 *   _mxml_global()    - Get global data.
 23 |  */
 24 | 
 25 | /*
 26 |  * Include necessary headers...
 27 |  */
 28 | 
 29 | #include "mxml-private.h"
 30 | 
 31 | 
 32 | /*
 33 |  * Some crazy people think that unloading a shared object is a good or safe
 34 |  * thing to do.  Unfortunately, most objects are simply *not* safe to unload
 35 |  * and bad things *will* happen.
 36 |  *
 37 |  * The following mess of conditional code allows us to provide a destructor
 38 |  * function in Mini-XML for our thread-global storage so that it can possibly
 39 |  * be unloaded safely, although since there is no standard way to do so I
 40 |  * can't even provide any guarantees that you can do it safely on all platforms.
 41 |  *
 42 |  * This code currently supports AIX, HP-UX, Linux, Mac OS X, Solaris, and
 43 |  * Windows.  It might work on the BSDs and IRIX, but I haven't tested that.
 44 |  */
 45 | 
 46 | #if defined(__sun) || defined(_AIX)
 47 | #  pragma fini(_mxml_fini)
 48 | #  define _MXML_FINI _mxml_fini
 49 | #elif defined(__hpux)
 50 | #  pragma FINI _mxml_fini
 51 | #  define _MXML_FINI _mxml_fini
 52 | #elif defined(__GNUC__) /* Linux and Mac OS X */
 53 | #  define _MXML_FINI __attribute((destructor)) _mxml_fini
 54 | #else
 55 | #  define _MXML_FINI _fini
 56 | #endif /* __sun */
 57 | 
 58 | 
 59 | /*
 60 |  * 'mxml_error()' - Display an error message.
 61 |  */
 62 | 
 63 | void
 64 | mxml_error(const char *format,		/* I - Printf-style format string */
 65 |            ...)				/* I - Additional arguments as needed */
 66 | {
 67 |   va_list	ap;			/* Pointer to arguments */
 68 |   char		s[1024];		/* Message string */
 69 |   _mxml_global_t *global = _mxml_global();
 70 | 					/* Global data */
 71 | 
 72 | 
 73 |  /*
 74 |   * Range check input...
 75 |   */
 76 | 
 77 |   if (!format)
 78 |     return;
 79 | 
 80 |  /*
 81 |   * Format the error message string...
 82 |   */
 83 | 
 84 |   va_start(ap, format);
 85 | 
 86 |   vsnprintf(s, sizeof(s), format, ap);
 87 | 
 88 |   va_end(ap);
 89 | 
 90 |  /*
 91 |   * And then display the error message...
 92 |   */
 93 | 
 94 |   if (global->error_cb)
 95 |     (*global->error_cb)(s);
 96 |   else
 97 |     fprintf(stderr, "mxml: %s\n", s);
 98 | }
 99 | 
100 | 
101 | /*
102 |  * 'mxml_ignore_cb()' - Default callback for ignored values.
103 |  */
104 | 
105 | mxml_type_t				/* O - Node type */
106 | mxml_ignore_cb(mxml_node_t *node)	/* I - Current node */
107 | {
108 |   (void)node;
109 | 
110 |   return (MXML_IGNORE);
111 | }
112 | 
113 | 
114 | /*
115 |  * 'mxml_integer_cb()' - Default callback for integer values.
116 |  */
117 | 
118 | mxml_type_t				/* O - Node type */
119 | mxml_integer_cb(mxml_node_t *node)	/* I - Current node */
120 | {
121 |   (void)node;
122 | 
123 |   return (MXML_INTEGER);
124 | }
125 | 
126 | 
127 | /*
128 |  * 'mxml_opaque_cb()' - Default callback for opaque values.
129 |  */
130 | 
131 | mxml_type_t				/* O - Node type */
132 | mxml_opaque_cb(mxml_node_t *node)	/* I - Current node */
133 | {
134 |   (void)node;
135 | 
136 |   return (MXML_OPAQUE);
137 | }
138 | 
139 | 
140 | /*
141 |  * 'mxml_real_cb()' - Default callback for real number values.
142 |  */
143 | 
144 | mxml_type_t				/* O - Node type */
145 | mxml_real_cb(mxml_node_t *node)		/* I - Current node */
146 | {
147 |   (void)node;
148 | 
149 |   return (MXML_REAL);
150 | }
151 | 
152 | 
153 | #ifdef HAVE_PTHREAD_H			/**** POSIX threading ****/
154 | #  include <pthread.h>
155 | 
/*
 * File-local state for the POSIX threading build: the thread-specific key
 * under which each thread's _mxml_global_t is stored, and the once-control
 * that makes sure the key is created a single time.
 *
 * NOTE(review): -1 is used as the "no key" sentinel and is compared against
 * in _MXML_FINI; this presumably relies on pthread_key_t being an integer
 * type on the supported platforms - confirm before porting elsewhere.
 */
static pthread_key_t	_mxml_key = -1;	/* Thread local storage key */
static pthread_once_t	_mxml_key_once = PTHREAD_ONCE_INIT;
					/* One-time initialization object */
static void		_mxml_init(void);
					/* Creates _mxml_key (run via pthread_once) */
static void		_mxml_destructor(void *g);
					/* Frees one thread's global data */
161 | 
162 | 
/*
 * '_mxml_destructor()' - Release one thread's global data.
 *
 * Registered with pthread_key_create() in _mxml_init() and also called
 * directly from the library finalizer.
 */

static void
_mxml_destructor(void *g)		/* I - Global data to release */
{
  void	*block = g;			/* Memory obtained from calloc() */


  free(block);
}
172 | 
173 | 
174 | /*
175 |  * '_mxml_fini()' - Clean up when unloaded.
176 |  */
177 | 
178 | static void
179 | _MXML_FINI(void)
180 | {
181 |   _mxml_global_t	*global;	/* Global data */
182 | 
183 | 
184 |   if (_mxml_key != -1)
185 |   {
186 |     if ((global = (_mxml_global_t *)pthread_getspecific(_mxml_key)) != NULL)
187 |       _mxml_destructor(global);
188 | 
189 |     pthread_key_delete(_mxml_key);
190 |     _mxml_key = -1;
191 |   }
192 | }
193 | 
194 | 
195 | /*
196 |  * '_mxml_global()' - Get global data.
197 |  */
198 | 
199 | _mxml_global_t *			/* O - Global data */
200 | _mxml_global(void)
201 | {
202 |   _mxml_global_t	*global;	/* Global data */
203 | 
204 | 
205 |   pthread_once(&_mxml_key_once, _mxml_init);
206 | 
207 |   if ((global = (_mxml_global_t *)pthread_getspecific(_mxml_key)) == NULL)
208 |   {
209 |     global = (_mxml_global_t *)calloc(1, sizeof(_mxml_global_t));
210 |     pthread_setspecific(_mxml_key, global);
211 | 
212 |     global->num_entity_cbs = 1;
213 |     global->entity_cbs[0]  = _mxml_entity_cb;
214 |     global->wrap           = 72;
215 |   }
216 | 
217 |   return (global);
218 | }
219 | 
220 | 
221 | /*
222 |  * '_mxml_init()' - Initialize global data...
223 |  */
224 | 
225 | static void
226 | _mxml_init(void)
227 | {
228 |   pthread_key_create(&_mxml_key, _mxml_destructor);
229 | }
230 | 
231 | 
232 | #elif defined(WIN32) && defined(MXML1_EXPORTS) /**** WIN32 threading ****/
233 | #  include <windows.h>
234 | 
235 | static DWORD _mxml_tls_index;		/* Index for global storage */
236 | 
237 | 
238 | /*
239 |  * 'DllMain()' - Main entry for library.
240 |  */
241 |  
242 | BOOL WINAPI				/* O - Success/failure */
243 | DllMain(HINSTANCE hinst,		/* I - DLL module handle */
244 |         DWORD     reason,		/* I - Reason */
245 |         LPVOID    reserved)		/* I - Unused */
246 | {
247 |   _mxml_global_t	*global;	/* Global data */
248 | 
249 | 
250 |   (void)hinst;
251 |   (void)reserved;
252 | 
253 |   switch (reason) 
254 |   { 
255 |     case DLL_PROCESS_ATTACH :		/* Called on library initialization */
256 |         if ((_mxml_tls_index = TlsAlloc()) == TLS_OUT_OF_INDEXES) 
257 |           return (FALSE); 
258 |         break; 
259 | 
260 |     case DLL_THREAD_DETACH :		/* Called when a thread terminates */
261 |         if ((global = (_mxml_global_t *)TlsGetValue(_mxml_tls_index)) != NULL)
262 |           free(global);
263 |         break; 
264 | 
265 |     case DLL_PROCESS_DETACH :		/* Called when library is unloaded */
266 |         if ((global = (_mxml_global_t *)TlsGetValue(_mxml_tls_index)) != NULL)
267 |           free(global);
268 | 
269 |         TlsFree(_mxml_tls_index); 
270 |         break; 
271 | 
272 |     default: 
273 |         break; 
274 |   } 
275 | 
276 |   return (TRUE);
277 | }
278 | 
279 | 
280 | /*
281 |  * '_mxml_global()' - Get global data.
282 |  */
283 | 
284 | _mxml_global_t *			/* O - Global data */
285 | _mxml_global(void)
286 | {
287 |   _mxml_global_t	*global;	/* Global data */
288 | 
289 | 
290 |   if ((global = (_mxml_global_t *)TlsGetValue(_mxml_tls_index)) == NULL)
291 |   {
292 |     global = (_mxml_global_t *)calloc(1, sizeof(_mxml_global_t));
293 | 
294 |     global->num_entity_cbs = 1;
295 |     global->entity_cbs[0]  = _mxml_entity_cb;
296 |     global->wrap           = 72;
297 | 
298 |     TlsSetValue(_mxml_tls_index, (LPVOID)global); 
299 |   }
300 | 
301 |   return (global);
302 | }
303 | 
304 | 
305 | #else					/**** No threading ****/
306 | /*
307 |  * '_mxml_global()' - Get global data.
308 |  */
309 | 
310 | _mxml_global_t *			/* O - Global data */
311 | _mxml_global(void)
312 | {
313 |   static _mxml_global_t	global =	/* Global data */
314 |   {
315 |     NULL,				/* error_cb */
316 |     1,					/* num_entity_cbs */
317 |     { _mxml_entity_cb },		/* entity_cbs */
318 |     72,					/* wrap */
319 |     NULL,				/* custom_load_cb */
320 |     NULL				/* custom_save_cb */
321 |   };
322 | 
323 | 
324 |   return (&global);
325 | }
326 | #endif /* HAVE_PTHREAD_H */
327 | 
328 | 
329 | /*
330 |  * End of "$Id: mxml-private.c 422 2010-11-07 22:55:11Z mike $".
331 |  */
332 | 


--------------------------------------------------------------------------------
/src/mxml/mxml-attr.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * "$Id: mxml-attr.c 408 2010-09-19 05:26:46Z mike $"
  3 |  *
  4 |  * Attribute support code for Mini-XML, a small XML-like file parsing library.
  5 |  *
  6 |  * Copyright 2003-2010 by Michael R Sweet.
  7 |  *
  8 |  * These coded instructions, statements, and computer programs are the
  9 |  * property of Michael R Sweet and are protected by Federal copyright
 10 |  * law.  Distribution and use rights are outlined in the file "COPYING"
 11 |  * which should have been included with this file.  If this file is
 12 |  * missing or damaged, see the license at:
 13 |  *
 14 |  *     http://www.minixml.org/
 15 |  *
 16 |  * Contents:
 17 |  *
 18 |  *   mxmlElementDeleteAttr() - Delete an attribute.
 19 |  *   mxmlElementGetAttr()    - Get an attribute.
 20 |  *   mxmlElementSetAttr()    - Set an attribute.
 21 |  *   mxmlElementSetAttrf()   - Set an attribute with a formatted value.
 22 |  *   mxml_set_attr()         - Set or add an attribute name/value pair.
 23 |  */
 24 | 
 25 | /*
 26 |  * Include necessary headers...
 27 |  */
 28 | 
 29 | #include "config.h"
 30 | #include "mxml.h"
 31 | 
 32 | 
 33 | /*
 34 |  * Local functions...
 35 |  */
 36 | 
 37 | static int	mxml_set_attr(mxml_node_t *node, const char *name,
 38 | 		              char *value);
 39 | 
 40 | 
 41 | /*
 42 |  * 'mxmlElementDeleteAttr()' - Delete an attribute.
 43 |  *
 44 |  * @since Mini-XML 2.4@
 45 |  */
 46 | 
 47 | void
 48 | mxmlElementDeleteAttr(mxml_node_t *node,/* I - Element */
 49 |                       const char  *name)/* I - Attribute name */
 50 | {
 51 |   int		i;			/* Looping var */
 52 |   mxml_attr_t	*attr;			/* Cirrent attribute */
 53 | 
 54 | 
 55 | #ifdef DEBUG
 56 |   fprintf(stderr, "mxmlElementDeleteAttr(node=%p, name=\"%s\")\n",
 57 |           node, name ? name : "(null)");
 58 | #endif /* DEBUG */
 59 | 
 60 |  /*
 61 |   * Range check input...
 62 |   */
 63 | 
 64 |   if (!node || node->type != MXML_ELEMENT || !name)
 65 |     return;
 66 | 
 67 |  /*
 68 |   * Look for the attribute...
 69 |   */
 70 | 
 71 |   for (i = node->value.element.num_attrs, attr = node->value.element.attrs;
 72 |        i > 0;
 73 |        i --, attr ++)
 74 |   {
 75 | #ifdef DEBUG
 76 |     printf("    %s=\"%s\"\n", attr->name, attr->value);
 77 | #endif /* DEBUG */
 78 | 
 79 |     if (!strcmp(attr->name, name))
 80 |     {
 81 |      /*
 82 |       * Delete this attribute...
 83 |       */
 84 | 
 85 |       free(attr->name);
 86 |       free(attr->value);
 87 | 
 88 |       i --;
 89 |       if (i > 0)
 90 |         memmove(attr, attr + 1, i * sizeof(mxml_attr_t));
 91 | 
 92 |       node->value.element.num_attrs --;
 93 |       return;
 94 |     }
 95 |   }
 96 | }
 97 | 
 98 | 
 99 | /*
100 |  * 'mxmlElementGetAttr()' - Get an attribute.
101 |  *
102 |  * This function returns NULL if the node is not an element or the
103 |  * named attribute does not exist.
104 |  */
105 | 
106 | const char *				/* O - Attribute value or NULL */
107 | mxmlElementGetAttr(mxml_node_t *node,	/* I - Element node */
108 |                    const char  *name)	/* I - Name of attribute */
109 | {
110 |   int		i;			/* Looping var */
111 |   mxml_attr_t	*attr;			/* Cirrent attribute */
112 | 
113 | 
114 | #ifdef DEBUG
115 |   fprintf(stderr, "mxmlElementGetAttr(node=%p, name=\"%s\")\n",
116 |           node, name ? name : "(null)");
117 | #endif /* DEBUG */
118 | 
119 |  /*
120 |   * Range check input...
121 |   */
122 | 
123 |   if (!node || node->type != MXML_ELEMENT || !name)
124 |     return (NULL);
125 | 
126 |  /*
127 |   * Look for the attribute...
128 |   */
129 | 
130 |   for (i = node->value.element.num_attrs, attr = node->value.element.attrs;
131 |        i > 0;
132 |        i --, attr ++)
133 |   {
134 | #ifdef DEBUG
135 |     printf("    %s=\"%s\"\n", attr->name, attr->value);
136 | #endif /* DEBUG */
137 | 
138 |     if (!strcmp(attr->name, name))
139 |     {
140 | #ifdef DEBUG
141 |       printf("    Returning \"%s\"!\n", attr->value);
142 | #endif /* DEBUG */
143 |       return (attr->value);
144 |     }
145 |   }
146 | 
147 |  /*
148 |   * Didn't find attribute, so return NULL...
149 |   */
150 | 
151 | #ifdef DEBUG
152 |   puts("    Returning NULL!\n");
153 | #endif /* DEBUG */
154 | 
155 |   return (NULL);
156 | }
157 | 
158 | 
159 | /*
160 |  * 'mxmlElementSetAttr()' - Set an attribute.
161 |  *
162 |  * If the named attribute already exists, the value of the attribute
163 |  * is replaced by the new string value. The string value is copied
164 |  * into the element node. This function does nothing if the node is
165 |  * not an element.
166 |  */
167 | 
168 | void
169 | mxmlElementSetAttr(mxml_node_t *node,	/* I - Element node */
170 |                    const char  *name,	/* I - Name of attribute */
171 |                    const char  *value)	/* I - Attribute value */
172 | {
173 |   char	*valuec;			/* Copy of value */
174 | 
175 | 
176 | #ifdef DEBUG
177 |   fprintf(stderr, "mxmlElementSetAttr(node=%p, name=\"%s\", value=\"%s\")\n",
178 |           node, name ? name : "(null)", value ? value : "(null)");
179 | #endif /* DEBUG */
180 | 
181 |  /*
182 |   * Range check input...
183 |   */
184 | 
185 |   if (!node || node->type != MXML_ELEMENT || !name)
186 |     return;
187 | 
188 |   if (value)
189 |     valuec = strdup(value);
190 |   else
191 |     valuec = NULL;
192 | 
193 |   if (mxml_set_attr(node, name, valuec))
194 |     free(valuec);
195 | }
196 | 
197 | 
198 | /*
199 |  * 'mxmlElementSetAttrf()' - Set an attribute with a formatted value.
200 |  *
201 |  * If the named attribute already exists, the value of the attribute
202 |  * is replaced by the new formatted string. The formatted string value is
203 |  * copied into the element node. This function does nothing if the node
204 |  * is not an element.
205 |  *
206 |  * @since Mini-XML 2.3@
207 |  */
208 | 
209 | void
210 | mxmlElementSetAttrf(mxml_node_t *node,	/* I - Element node */
211 |                     const char  *name,	/* I - Name of attribute */
212 |                     const char  *format,/* I - Printf-style attribute value */
213 | 		    ...)		/* I - Additional arguments as needed */
214 | {
215 |   va_list	ap;			/* Argument pointer */
216 |   char		*value;			/* Value */
217 | 
218 | 
219 | #ifdef DEBUG
220 |   fprintf(stderr,
221 |           "mxmlElementSetAttrf(node=%p, name=\"%s\", format=\"%s\", ...)\n",
222 |           node, name ? name : "(null)", format ? format : "(null)");
223 | #endif /* DEBUG */
224 | 
225 |  /*
226 |   * Range check input...
227 |   */
228 | 
229 |   if (!node || node->type != MXML_ELEMENT || !name || !format)
230 |     return;
231 | 
232 |  /*
233 |   * Format the value...
234 |   */
235 | 
236 |   va_start(ap, format);
237 |   value = _mxml_vstrdupf(format, ap);
238 |   va_end(ap);
239 | 
240 |   if (!value)
241 |     mxml_error("Unable to allocate memory for attribute '%s' in element %s!",
242 |                name, node->value.element.name);
243 |   else if (mxml_set_attr(node, name, value))
244 |     free(value);
245 | }
246 | 
247 | 
248 | /*
249 |  * 'mxml_set_attr()' - Set or add an attribute name/value pair.
250 |  */
251 | 
252 | static int				/* O - 0 on success, -1 on failure */
253 | mxml_set_attr(mxml_node_t *node,	/* I - Element node */
254 |               const char  *name,	/* I - Attribute name */
255 |               char        *value)	/* I - Attribute value */
256 | {
257 |   int		i;			/* Looping var */
258 |   mxml_attr_t	*attr;			/* New attribute */
259 | 
260 | 
261 |  /*
262 |   * Look for the attribute...
263 |   */
264 | 
265 |   for (i = node->value.element.num_attrs, attr = node->value.element.attrs;
266 |        i > 0;
267 |        i --, attr ++)
268 |     if (!strcmp(attr->name, name))
269 |     {
270 |      /*
271 |       * Free the old value as needed...
272 |       */
273 | 
274 |       if (attr->value)
275 |         free(attr->value);
276 | 
277 |       attr->value = value;
278 | 
279 |       return (0);
280 |     }
281 | 
282 |  /*
283 |   * Add a new attribute...
284 |   */
285 | 
286 |   if (node->value.element.num_attrs == 0)
287 |     attr = malloc(sizeof(mxml_attr_t));
288 |   else
289 |     attr = realloc(node->value.element.attrs,
290 |                    (node->value.element.num_attrs + 1) * sizeof(mxml_attr_t));
291 | 
292 |   if (!attr)
293 |   {
294 |     mxml_error("Unable to allocate memory for attribute '%s' in element %s!",
295 |                name, node->value.element.name);
296 |     return (-1);
297 |   }
298 | 
299 |   node->value.element.attrs = attr;
300 |   attr += node->value.element.num_attrs;
301 | 
302 |   if ((attr->name = strdup(name)) == NULL)
303 |   {
304 |     mxml_error("Unable to allocate memory for attribute '%s' in element %s!",
305 |                name, node->value.element.name);
306 |     return (-1);
307 |   }
308 | 
309 |   attr->value = value;
310 | 
311 |   node->value.element.num_attrs ++;
312 | 
313 |   return (0);
314 | }
315 | 
316 | 
317 | /*
318 |  * End of "$Id: mxml-attr.c 408 2010-09-19 05:26:46Z mike $".
319 |  */
320 | 


--------------------------------------------------------------------------------
/src/libseq/CodonAlign.cpp:
--------------------------------------------------------------------------------
  1 | #include "CodonAlign.h"
  2 | 
  3 | #include <algorithm>
  4 | 
  5 | namespace seq {
  6 | 
  7 | CodonAlign::CodonAlign(AlignmentAlgorithm* algorithm)
  8 | { 
  9 |   algorithm_ = algorithm;
 10 | }
 11 | 
/*
 * Rewrites the nucleotide sequences seq1 and seq2 in place so that their
 * gaps follow the gaps of the aligned amino-acid sequences seqAA1 and
 * seqAA2 (three gap nucleotides per amino-acid gap), and returns
 * algorithm_->computeAlignScore(seq1, seq2) for the result.
 *
 * ORF is the number of nucleotides at the start of seq2 that are cut off
 * before the sequence is treated as whole codons; these, and the trailing
 * nucleotides that do not fill a whole codon, are saved and written back
 * over the positions next to the non-gap range once the gaps are in place.
 *
 * NOTE(review): seqAA2 is indexed over seqAA1.size(), so both amino-acid
 * sequences are assumed to have the same (aligned) length - confirm.
 */
double CodonAlign::alignLikeAA(NTSequence& seq1, 
			       NTSequence& seq2, 
			       int ORF, 
			       const AASequence& seqAA1,
			       const AASequence& seqAA2)
{
  // Split seq2 into: lead (first ORF nucleotides), whole codons (kept in
  // seq2), and the leftover tail shorter than one codon.
  NTSequence seq2ORFLead(seq2.begin(), seq2.begin() + ORF);
  seq2.erase(seq2.begin(), seq2.begin() + ORF);
  int aaLength = seq2.size() / 3;
  NTSequence seq2ORFEnd(seq2.begin() + aaLength*3, seq2.end());
  seq2.erase(seq2.begin() + aaLength*3, seq2.end());

  // Nucleotide offsets bounding the columns that took the "else" branch
  // below for seq2; -1 means none was seen yet.
  int firstNonGap = -1;
  int lastNonGap = -1;

  for (unsigned i = 0; i < seqAA1.size(); ++i) {
    // Amino-acid gap in seqAA1 that the nucleotide sequence does not show
    // yet: insert a gap codon at that position (or append it).
    if (seqAA1[i] == AminoAcid::GAP && noGapAt(seq1, i)) {
      if (i*3 < seq1.size())
	seq1.insert(seq1.begin() + (i*3), 3, Nucleotide::GAP);
      else
	seq1.insert(seq1.end(), 3, Nucleotide::GAP);
    }

    // Same for seq2; every other column extends the non-gap range.
    if (seqAA2[i] == AminoAcid::GAP && noGapAt(seq2, i)) {
      if (i*3 < seq2.size())
	seq2.insert(seq2.begin() + (i*3), 3, Nucleotide::GAP);
      else
	seq2.insert(seq2.end(), 3, Nucleotide::GAP);
    } else {
      if (firstNonGap == -1)
	firstNonGap = i*3;
      lastNonGap = i*3 + 3;
    }
  }

  // Write the saved lead nucleotides over the positions directly before
  // firstNonGap; those that would land before index 0 are dropped.
  for (int i = 0; i < (int)seq2ORFLead.size(); ++i)
    if ((firstNonGap - (int)seq2ORFLead.size() + i) >= 0)
      seq2[firstNonGap - (int)seq2ORFLead.size() + i] = seq2ORFLead[i];

  // Write the saved tail nucleotides starting at lastNonGap; those past the
  // end of seq2 are dropped. The sum is evaluated as unsigned, so a
  // lastNonGap of -1 fails the bounds test for i == 0.
  for (unsigned i = 0; i < seq2ORFEnd.size(); ++i)
    if (lastNonGap + i < seq2.size())
      seq2[lastNonGap + i] = seq2ORFEnd[i];

  return algorithm_->computeAlignScore(seq1, seq2);
}
 57 | 
 58 | bool CodonAlign::noGapAt(const NTSequence& seq, unsigned int i) const
 59 | {
 60 |   if((i * 3) == seq.size())
 61 |     return true;
 62 |   else
 63 |     return seq[i * 3] != Nucleotide::GAP
 64 |         && seq[(i * 3) + 1] != Nucleotide::GAP
 65 |         && seq[(i * 3) + 2] != Nucleotide::GAP;
 66 | }
 67 | 
 68 | bool CodonAlign::haveGaps(const NTSequence& seq, int from, int to)
 69 | {
 70 |   for (unsigned i = std::max(from, 0); i < std::min((int)seq.size(), to); ++i)
 71 |     if (seq[i] == Nucleotide::GAP)
 72 |       return true;
 73 | 
 74 |   return false;
 75 | }
 76 | 
/*
 * Codon-aware alignment of target against ref.
 *
 * On success ref and target are overwritten with their codon-aligned forms
 * and the result is (codon alignment score, number of frameshifts that were
 * corrected on the way).
 *
 * Throws AlignmentError when the plain nucleotide alignment scores below
 * 200. Throws FrameShiftError when the codon alignment scores more than 100
 * below the nucleotide alignment and either maxFrameShifts is 0 or no
 * correctable gap is found. When a gap is corrected, N nucleotides are
 * inserted into target and the function calls itself with
 * maxFrameShifts - 1.
 */
std::pair<double, int>
CodonAlign::align(NTSequence& ref, NTSequence& target, int maxFrameShifts)
{
  /*
   * 1. translate the reference sequence
   * 2. for every open reading frame:
   *   - translate the target sequence
   *   - perform the alignment
   * 3. take the alignment with best score and align nucleotide
   *    sequence like amino acid sequence
   * 4. compute nucleotide alignment score
   * 5. make nucleotide sequence alignment, compare score, if difference
   *    too big then correct the frame shift and repeat.
   */
  AASequence refAA = AASequence::translate(ref);

  // Plain nucleotide alignment on copies; its score is the baseline the
  // codon alignment is compared with further down.
  NTSequence refNTAligned = ref;
  NTSequence targetNTAligned = target;
  double ntScore = algorithm_->align(refNTAligned, targetNTAligned);

  // Give up early when even the nucleotide alignment scores too low.
  if(ntScore < 200)
    throw AlignmentError(ntScore,0,refNTAligned,targetNTAligned);

  int bestFrameShift = -1;
  double bestScore = -1E10;
  AASequence bestRefAA;
  AASequence bestTargetAA;

  // Try the three reading frames of the target and keep the amino-acid
  // alignment with the highest score.
  for (unsigned i = 0; i < 3; ++i) {
    // End of the last whole codon when reading starts at offset i.
    int last = i + ((target.size() - i) / 3) * 3;
    AASequence targetAA
      = AASequence::translate(target.begin() + i, target.begin() + last);

    AASequence refCopyAA = refAA;
    double score = algorithm_->align(refCopyAA, targetAA);

    if (score > bestScore) {
      bestFrameShift = i;
      bestScore = score;
      bestRefAA = refCopyAA;
      bestTargetAA = targetAA;
    }
  }

  // Gap the nucleotide sequences (copies) like the best amino-acid
  // alignment and score the result.
  NTSequence refCodonAligned = ref;
  NTSequence targetCodonAligned = target;

  double ntCodonScore = alignLikeAA(refCodonAligned,
				    targetCodonAligned,
				    bestFrameShift,
				    bestRefAA,
				    bestTargetAA);


  if (ntScore - ntCodonScore > 100) {
    /*
     * a possible frameshift
     */
    if (maxFrameShifts) {
      /*
       * try to fix: walk through the nucleotide alignment, and find
       * an "isolated" gap that is not of size multiple of 3.
       */
      const int BOUNDARY=10;
      // Index into the ungapped target matching the current column.
      int seq2pos = 0;
      // Start column of the gap run being tracked, -1 when outside a run.
      // NOTE(review): both start at 0 rather than -1, so a run that begins
      // at column 0 is never recorded (a start is only stored when the
      // value is -1, and a run is only examined when its start is > 0).
      // This looks like a deliberate way to ignore leading gaps - confirm.
      int refGapStart = 0;
      int targetGapStart = 0;
      bool fixed = false;

      for (unsigned i = 0; i < refNTAligned.size(); ++i) {
	if (refNTAligned[i] == Nucleotide::GAP) {
	  if (refGapStart == -1)
	    refGapStart = i;
	} else { 
	  if (refGapStart > 0) {
	    // A gap run in the reference just ended at column i.
	    int refGapStop = i;

	    if ((refGapStop - refGapStart) % 3) {
	      /*
	       * check it is isolated: no gaps in either sequence around
	       * this gap
	       */
	      if (haveGaps(refNTAligned,
			   refGapStart - BOUNDARY, refGapStart)
		  || haveGaps(refNTAligned,
			      refGapStop, refGapStop + BOUNDARY)
		  || haveGaps(targetNTAligned,
			      refGapStart - BOUNDARY, refGapStart)
		  || haveGaps(targetNTAligned,
			      refGapStop, refGapStop + BOUNDARY)) {
		/*
		 * not isolated: skip this gap.
		 */
	      } else {
		/*
		 * fix it !
		 */
		// Pad the target with N so that the run length plus the
		// padding is a multiple of 3.
		target.insert(target.begin() + seq2pos,
			      3 - (refGapStop - refGapStart) % 3,
			      Nucleotide::N);
		fixed = true;
		break;		
	      }
	    }
	  }

	  refGapStart = -1;
	}

	if (targetNTAligned[i] == Nucleotide::GAP) {
	  if (targetGapStart == -1)
	    targetGapStart = i;
	} else {
	  if (targetGapStart > 0) {
	    // A gap run in the target just ended at column i.
	    int targetGapStop = i;

	    if ((targetGapStop - targetGapStart) % 3) {
	      /*
	       * check it is isolated: no gaps in either sequence around
	       * this gap
	       */
	      if (haveGaps(refNTAligned,
			   targetGapStart - BOUNDARY, targetGapStart)
		  || haveGaps(refNTAligned, targetGapStop,
			      targetGapStop + BOUNDARY)
		  || haveGaps(targetNTAligned,
			      targetGapStart - BOUNDARY, targetGapStart)
		  || haveGaps(targetNTAligned,
			      targetGapStop, targetGapStop + BOUNDARY)) {
		/*
		 * not isolated: skip this gap.
		 */
	      } else {
		/*
		 * fix it !
		 */
		// Pad the target with as many N as the run length exceeds
		// a multiple of 3.
		target.insert(target.begin() + seq2pos,
			      (targetGapStop - targetGapStart) % 3,
			      Nucleotide::N);
		fixed = true;
		break;
	      }
	    }
	  }

	  targetGapStart = -1;
	  // Only non-gap target columns consume a target nucleotide.
	  ++seq2pos;
	}
      }

      if (!fixed)
	throw FrameShiftError(ntScore, ntCodonScore,
			      refNTAligned, targetNTAligned);
      else {
	// Start over with the padded target and one correction fewer
	// allowed; count this correction in the returned pair.
	std::pair<double, int> result
	  = align(ref, target, maxFrameShifts - 1);
	++result.second;
	return result;
      }
    } else {
      throw FrameShiftError(ntScore, ntCodonScore,
			    refNTAligned, targetNTAligned);
    }
  } else {
    // Scores are close enough: publish the codon-aligned sequences.
    ref = refCodonAligned;
    target = targetCodonAligned;
/*
    std::cerr << "Scores: " << ntScore << " " << ntCodonScore << " " << bestScore << std::endl;
    std::cerr << refNTAligned.asString() << std::endl;
    std::cerr << targetNTAligned.asString() << std::endl;
    std::cerr << refCodonAligned.asString() << std::endl;
    std::cerr << targetCodonAligned.asString() << std::endl;
    std::cerr << bestRefAA.asString() << std::endl;
    std::cerr << bestTargetAA.asString() << std::endl;
*/
    return std::make_pair(ntCodonScore, 0);
  }
}
255 | 
256 | AlignmentError::AlignmentError(double ntScore, double codonScore,
257 | 				 const NTSequence& ntRef,
258 | 				 const NTSequence& ntTarget,
259 | 				 const std::string& message)
260 |   :ntScore_(ntScore),codonScore_(codonScore),
261 |    ntRef_(ntRef),ntTarget_(ntTarget),
262 |    message_(message)
263 | { }
264 | 
265 | AlignmentError::~AlignmentError() throw()
266 | { }
267 | 
268 | 
269 | FrameShiftError::FrameShiftError(double ntScore, double codonScore,
270 | 				 const NTSequence& ntRef,
271 | 				 const NTSequence& ntTarget)
272 |   :AlignmentError(ntScore,codonScore,ntRef,ntTarget,std::string("Frameshift error"))
273 | { }
274 | 
275 | FrameShiftError::~FrameShiftError() throw()
276 | { }
277 | 
278 | };
279 | 
280 | 


--------------------------------------------------------------------------------
/src/Alignment.cpp:
--------------------------------------------------------------------------------
  1 | #include "Utils.h"
  2 | 
  3 | #include <Codon.h>
  4 | #include <CodonAlign.h>
  5 | #include <NeedlemanWunsh.h>
  6 | 
  7 | #include "Alignment.h"
  8 | 
  9 | #include <algorithm>
 10 | 
 11 | Alignment Alignment::compute(const ReferenceSequence& ref,
 12 | 			     const seq::NTSequence&   target,
 13 | 			     seq::AlignmentAlgorithm* algorithm,
 14 | 			     int maxFrameShifts)
 15 | {
 16 |   seq::CodonAlign codonAlign(algorithm);
 17 |   Alignment result(ref, target);
 18 | 
 19 |   for (unsigned j = 0; j < result.target.size(); ++j)
 20 |     if (result.target[j] == seq::Nucleotide::GAP) {
 21 |       result.target.erase(result.target.begin() + j);
 22 |       --j;
 23 |     }
 24 | 
 25 |   try {
 26 |     if (result.target.size() > 6) {
 27 |       std::pair<double, int> res
 28 | 	= codonAlign.align(result.ref, result.target, maxFrameShifts);
 29 | 
 30 |       result.score = res.first;
 31 |       result.correctedFrameshifts = res.second;
 32 |       result.success = true;
 33 |     } else
 34 |       result.tooShort = true;
 35 |   } catch (seq::AlignmentError e) {
 36 |     result.failure = true;
 37 |     std::cerr << e.nucleotideAlignedTarget().name() << ": " << e.message()
 38 |       << " (scores nt: " << e.nucleotideAlignmentScore() << "; codon: "
 39 |       << e.codonAlignmentScore() << ")" << std::endl;
 40 |   }
 41 | 
 42 |   result.computeAlignedRanges(ref.size()/3);
 43 | 
 44 |   return result;
 45 | }
 46 | 
 47 | Alignment Alignment::given(const ReferenceSequence& ref,
 48 | 			   const seq::NTSequence&   target)
 49 | {
 50 |   if (ref.size() != target.size()) {
 51 |     std::cerr << ref.name() << ".length: " << ref.size()
 52 | 	      << ", " << target.name() << ".length: " << target.size()
 53 | 	      << std::endl;
 54 | 
 55 |     assert(ref.size() == target.size());
 56 |   }
 57 | 
 58 |   Alignment result(ref, target);
 59 | 
 60 |   result.success = true;
 61 | 
 62 |   result.computeAlignedRanges(ref.size()/3);
 63 | 
 64 |   return result;
 65 | }
 66 | 
 67 | Alignment::Alignment(const ReferenceSequence& aref,
 68 | 		     const seq::NTSequence&   atarget)
 69 |   : success(false),
 70 |     tooShort(false),
 71 |     failure(false),
 72 |     correctedFrameshifts(0),
 73 |     ref(aref),
 74 |     target(atarget)    
 75 | { }
 76 | 
 77 | void Alignment::computeAlignedRanges(int referenceSequenceLength)
 78 | {
 79 |   for (unsigned r = 0; r < ref.regions().size(); ++r) {
 80 |     ReferenceSequence::Region& region = ref.regions()[r];
 81 | 
 82 |     int regionEnd = std::min(region.end(), referenceSequenceLength);
 83 | 
 84 |     if (success) {
 85 |       region.alignedBegin  = alignedPos(region.begin());
 86 |       region.alignedEnd    = alignedPos(regionEnd);
 87 |       region.targetBegin = firstPos(region.begin(), regionEnd);
 88 |       region.targetEnd   = lastPos(region.begin(), regionEnd);
 89 |     } else {
 90 |       region.alignedBegin = region.begin();
 91 |       region.alignedEnd   = regionEnd;
 92 |       region.targetBegin = ref.size();
 93 |       region.targetEnd   = -1;
 94 |     }
 95 |   }
 96 | }
 97 | 
 98 | int Alignment::alignedPos(int refPos) const
 99 | {
100 |   int j = -1;
101 |   for (unsigned i = 0; i < ref.size(); i += 3) {
102 |     if (ref[i] != seq::Nucleotide::GAP)
103 |       ++j;
104 | 
105 |     if (j == refPos)
106 |       return i/3;
107 |   }
108 |   if (j == refPos - 1)
109 |     return ref.size() / 3;
110 |   else {
111 |     std::cerr << refPos << " " << ref.size() << " " << j << std::endl;
112 |     assert(false);
113 |   }
114 | }
115 | 
116 | int Alignment::firstPos(int begin, int end) const
117 | {
118 |   int refPos = -1;
119 | 
120 |   for (unsigned i = 0; i < ref.size(); i += 3) {
121 |     if (ref[i] != seq::Nucleotide::GAP)
122 |       ++refPos;
123 | 
124 |     if (refPos >= begin) {
125 |       if (refPos >= end)
126 | 	return end;
127 | 
128 |       if (target[i] != seq::Nucleotide::GAP)
129 | 	return refPos;
130 |     }
131 |   }
132 | 
133 |   return end;
134 | }
135 | 
136 | int Alignment::lastPos(int begin, int end) const
137 | {
138 |   int refPos = -1;
139 |   int lastPos = -1;
140 | 
141 |   for (unsigned i = 0; i < ref.size(); i += 3) {
142 |     if (ref[i] != seq::Nucleotide::GAP)
143 |       ++refPos;
144 | 
145 |     if (refPos >= begin) {
146 |       if (refPos >= end)
147 | 	return lastPos;
148 | 
149 |       if (target[i+2] != seq::Nucleotide::GAP)
150 | 	lastPos = refPos;
151 |     }
152 |   }
153 | 
154 |   return lastPos;
155 | }
156 | 
157 | std::pair<bool, int>
158 | Alignment::findAminoAcid(const ReferenceSequence::Region& region,
159 | 			 int posInRegion, int insertion) const
160 | {
161 |   bool withinTarget
162 |     = ((region.targetBegin < region.targetEnd)
163 |        && posInRegion >= region.targetBegin - region.begin() + 1
164 |        && posInRegion <= region.targetEnd   - region.begin() + 1);
165 | 
166 |   int pos = 0;
167 |   int gap = 0;
168 | 
169 |   for (int i = region.alignedBegin; i < region.alignedEnd; ++i) {
170 |     if (ref[i*3] != seq::Nucleotide::GAP) {
171 |       ++pos;
172 |       gap = 0;
173 |     } else
174 |       ++gap;
175 | 
176 |     if (pos == posInRegion
177 | 	&& gap == insertion
178 | 	&& (!withinTarget || (target[i*3] != seq::Nucleotide::GAP))) {
179 |       return std::make_pair(withinTarget, i);
180 |     } else if (pos > posInRegion) {
181 |       return std::make_pair(withinTarget, -1);
182 |     }
183 |   }
184 | 
185 |   assert(false);
186 |   return std::make_pair(false, 0);
187 | }
188 | 
189 | std::string Alignment::mutations(const ReferenceSequence::Region& region) const
190 | {
191 |   std::string result;
192 |   int fp    = region.targetBegin;
193 |   int lp    = region.targetEnd;
194 | 
195 |   if (fp >= lp)
196 |     return result;
197 | 
198 |   int refPos = -1;
199 | 
200 |   for (unsigned i = 0; i < ref.size(); i += 3) {
201 |     if (ref[i] != seq::Nucleotide::GAP)
202 |       ++refPos;
203 | 
204 |     if (refPos >= fp) {
205 |       if (refPos > lp)
206 | 	return result;
207 | 
208 |       seq::AminoAcid refAA = seq::Codon::translate(ref.begin() + i);
209 |       std::set<seq::AminoAcid> targetAAs
210 | 	= seq::Codon::translateAll(target.begin() + i);
211 | 
212 |       if (((targetAAs.size() > 1)
213 | 	   || (*targetAAs.begin() != refAA))
214 | 	  && (*targetAAs.begin() != seq::AminoAcid::GAP)) {
215 | 
216 | 	if (!result.empty())
217 | 	  result += ' ';
218 | 
219 | 	result += refAA.toChar()
220 | 	  + to_string(refPos - region.begin() + 1);
221 | 
222 | 	for (std::set<seq::AminoAcid>::const_iterator k = targetAAs.begin();
223 | 	     k != targetAAs.end(); ++k)
224 | 	  result += k->toChar();
225 | 
226 |       }
227 |     }
228 |   }
229 | 
230 |   return result;
231 | }
232 | 
233 | std::string Alignment::
234 | codonMutations(const ReferenceSequence::Region& region,
235 | 	       int& start,
236 | 	       int& end) const
237 | {
238 |   std::string result;
239 |   int fp    = region.begin();
240 |   int lp    = region.end() - 1;
241 | 
242 |   start = -1;
243 |   end = -1;
244 | 
245 |   if (fp >= lp)
246 |     return result;
247 | 
248 |   int refPos = -1;
249 | 
250 |   for (unsigned i = 0; i < ref.size(); i += 3) {
251 |     if (ref[i] != seq::Nucleotide::GAP)
252 |       ++refPos;
253 | 
254 |     int pos = refPos - region.begin() + 1;
255 | 
256 |     if (refPos >= fp) {
257 |       if (refPos > lp)
258 | 	return result;
259 | 
260 |       if (target[i] == seq::Nucleotide::GAP &&
261 | 	  target[i + 1] == seq::Nucleotide::GAP &&
262 | 	  target[i + 2] == seq::Nucleotide::GAP &&
263 | 	  (refPos < region.targetBegin || refPos > region.targetEnd))
264 | 	continue;
265 | 
266 |       if (refPos == region.targetEnd && 
267 |           ref[i] == seq::Nucleotide::GAP && 
268 |           ref[i + 1] == seq::Nucleotide::GAP && 
269 |           ref[i + 2] == seq::Nucleotide::GAP)
270 |         continue;
271 | 
272 |       //skip incomplete begin codon
273 |       if(refPos == region.targetBegin-1 &&
274 |           target[i] == seq::Nucleotide::GAP)
275 |         continue;
276 | 
277 |       //skip incomplete end codon
278 |       if(refPos == region.targetEnd+1 &&
279 |           target[i + 2] == seq::Nucleotide::GAP)
280 |         continue;
281 | 
282 |       if (start == -1)
283 | 	start = pos;
284 |       end = pos;
285 | 
286 |       bool mutation;
287 |       mutation = ref[i] != target[i] ||
288 |                  ref[i + 1] != target[i + 1] ||
289 |                  ref[i + 2] != target[i + 2];
290 | 
291 |       if(mutation) {
292 | 	if (!result.empty())
293 | 	  result += ' ';
294 | 
295 |         seq::AminoAcid refAA = seq::Codon::translate(ref.begin() + i);
296 |         std::set<seq::AminoAcid> targetAAs = seq::Codon::translateAll(target.begin() + i);
297 | 
298 |         result += refAA.toChar()
299 |                + to_string(refPos - region.begin() + 1);
300 | 
301 |         for (std::set<seq::AminoAcid>::const_iterator k = targetAAs.begin(); k != targetAAs.end(); ++k)
302 |           result += k->toChar();
303 |         result += ';';
304 | 
305 | 	result += ref[i].toChar();
306 | 	result += ref[i+1].toChar(); 
307 | 	result += ref[i+2].toChar();
308 | 	result += to_string(pos);
309 | 
310 | 	result += target[i].toChar();
311 | 	result += target[i + 1].toChar();
312 | 	result += target[i + 2].toChar();
313 |       }
314 |     }
315 |   }
316 | 
317 |   return result;
318 | }
319 | 
320 | 
321 | 


--------------------------------------------------------------------------------
/references/HCV/HCV2-FN666429.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <orf name="HCV2" referenceSequence="atgagcacaaatcccaaacctcaaagaaaaaccaaaagaaacaccaaccgtcgcccacaggacgtcaagttcccgggcggcggtcagatcgttggcggagtatacttgttgccgcgcaggggcccccggttgggtgtgcgcgcgacgaggaagacttccgaacggtcccagccacgtggaaggcgccagcccatccctaaagatcggcgctccactggcaagtcctggggacgcccaggatatccttggcccctgtatgggaatgagggcctcgggtgggcagggtggctcctgtccccccggggttctcgcccttcgtggggccccactgacccccggcataggtcacggaatttgggtaaggtcatcgataccctcacgtgtggctttgccgacctcatggggtacatacccgtcgtaggcgccccagttggcggcgttgctagagccctcgcgcatggcgtgagggtcctggaggacgggatcaactatgcaacagggaatctacctggttgctctttttctatcttcttgctagctcttttgtcttgcatgtccgtgccagtgtctgccgtggaagtaaaaaacaccagccacacctacatggccaccaatgactgccctaacagcagcattgcgtggcagatggagaatgcagtgctgcacgtccctggttgtatcccctgtgagctcattggcaacgtgtcccactgttggataccgcttacacctaatattgccgtgaaggagagaggagcgctcactaaaggtttgcggactcatatcgatatgattgtgatgtccgccacgctctgttccgctctttacataggggatgtctgcggtgcagtcatgatagcggcacaagtggtcatcgtctcgccgcgacaccacatctttgttcaggactgcaactgctccgtgtacccaggccatctctcgggacatcgcatggcatgggacatgatgatgaactggtctccaacaaccgccatggtcgtggcttacctcatgcgcattcccgaggttgtcctggatatcatcaccggggcgcattggggcgtgatgttcggcctggcctacttctccatgcagggggcgtgggctaaggtcatcgtcatcctcctaatgatcgcaggagtggacgcgagcacgcgcacgacgggtgccgtcgcaggtctccagacgagcagatttaccagtcttttccagcccgggtccaaacaaaacatccaacttattaacaccaacggcagctggcacattaaccgcactgccttgaactgcaatgacagcataaaaactggattcatcgcggccttgttctatcaaagaagattcaactcttcaggatgtccccaacgcctgtcctcttgtcgtcgtattgaggactttaggataggatggggcaccctggagtacgagaagaacgtcaccaatgacgaggatatgaggccctattgctggcattatccacccaagccatgtggcatcgtccccgcaaagaccgtgtgtggcccagtctattgttttactcccagcccggtagtagtcggcacgaccgataaagccggcgcaccaacctacacctggggggagaacgagactgacgtgttcctgctaaacagcacccgacctccgcaagggggttggttcgggtgcacgtggatgaacgggactggatttaccaagacatgtggcgcgccaccttgccgcataaggccagacttcaactccagcaaggacctactgtgccctactgattgcttcaggaagcatcccgaggctacctataagaagtgtggggccgggccgtggttgacccctaaatgcctagtacactacccctacagactgtggcactacccatgtacagtcaatttcaccatctttaaaataagaatgtatgtgggtggggttgagcacaggctcgaagcggcgtgcaatttcacccgaggg
gaccgctgcaatttggaggacagggacagaagtcaagtgagccccttgttgcattctaccaccgagtgggccatcttgccctgtacctatgctgacatgccagccctgtcctctggccttctacacctccaccaaaacatcgtggatgtgcagtacctgtatggtctgtcaccggccgtcaccaagtacattgtcaggtgggagtgggtggttctcttgtttctgctcctcgcggacgccagggtttgcgcctgcctgtggatgctcatcctgctcggccaagccgaggctgccctagaaaagctagtcgttctgcacgcggccagcgccgctagctccaacggcatgctttgttttgtgatcttcttcatagcagcctggtatttcaagggccgagtggttcccctggctacgtactcatatcttggtttatggtccttcagtattctgattctgacactaccccaacaggcctacgccctagaaccaactgaacaggggcaaattggtatggtcttactggccattatatccatcctcacactcagcccgacttacaaagtcttccttagcctctgcttatggtggctgagttacctactggtcatggcggaggccatgatccaggaatgggtgccacccctgcaatctcggggtggccgtgatggcgtcatatgggccgcaaccatactttgccctggtgtagtgtttgatataaccaagtggctcctggcaatccttgggcctgcctacctactcaggaccgttttgacgcgcacgccgtactttgtcagggcacaagtcctgttgaggttgtgcgtcgcggcgaagcaccttgcgggggggaggtatgtccagatgctgttgctgacccttgggaggtggactggtacttacatttatgaccatctctccccaatgtcagattgggcttccaatggcctacgggaccttgctgttgccgtggaacccatcatcttcagcccaatggagaagaaagtcatcgtgtggggggcggagacagcggcgtgtggtgacatcctgcatggactacccgtttccgcccgtttggggcgagaggtcttactgggtcctgctgatgagtacacctccaaggggtggaaactcctcgcgcccatcaccgcttatgcccagcaaacacgaggcttgctgggcgctatagtggtgagtttgacgggccgcgataagaccgagcaggccggggaagttcaagttctgtccacggttacccaatccttcctcgggacatcgatatcaggggtcctctggacggtctttcacggcgctggaaacaagacgctagccggctcgcgagggccggtcacgcagatgtactccagcgccgagggggaccttgtaggatggcctagtccgcccggcactaggtccctagacccctgcacatgtggtgccgccgacctttacctcgtcacccggaacgctgatgtcatcccggctcggaggcgaggagaccggcggggggcattgctctctccgaggcccctttctactctgaaggggtcctcggggggaccagtgctctgccccagggggcacgctgtggggatcttccgagcagcagtatgctccaggggggttgcaaagtccatagacttcatacccgttgagtcactggatgtcgtcaccaggtcccccaacttttctgacaacagcacccctcccgctgtgccccagacctatcaggtggggtatctgcatgcccccacaggcagtgggaagagcactaaggtgcccgccgcctacgctgcccaagggtacaaggtactggtactgaacccctctgtggctgctactctagggtttggggcttacatgtccaaggcacacgggatcaaccctaacatcagaacgggagtcaggaccgtgacaaccggggaagccatcacatactctacatatgggaagttcctggccgatggaggttgcgcaggtggggcgtacgacgtcataatatgtga
tgagtgtcactccactgacgcgactaccatccttggcatcggaacagtccttgaccaagctgagaccgccggggccaggctgacagtcttagccaccgccacgcctcctgggtctatcacaactccccatcccaacatagaggaggttggcctcggccatgagggcgagatcccattctatgggaaggcaatccccctgtcccagatcaaggggggaaggcatttgatcttttgccactcgaagaagaagtgcgacgaggtcgcaaatgctctccggggcatgggcttgaacgcagtcgcctactacagggggctcgacgtctccgtgataccagcccagggagacgtggtggtggttgccaccgacgccctcatgacggggtttactggggacttcgactcggtgatcgactgcaacgtggcggtcacccagaccgtggacttcagtttagaccccaccttcactgtgactacacaaactgtccctcaggacgccgtctctcgcagccaacgtcgagggcggacgggtagaggtaggttgggcatatataggtatgtttcctccggtgagcgagcttccgggatgttcgacaccgtagtactctgtgagtgctatgatgctggagccgcttggtatgagctcacacccgcggagactaccgtcagacttcgggcgtatctcaacacgcctggactgcctgtgtgccaagaccatttggagttctgggaggcagttttcaccggcctcacgcacatagatgcccacttcctctcccagacaaagcaggcgggggaaaacttcccgtacctggtagcctaccaggctacggtttgtgcaagggccaaggccccccccccgtcctgggacgtcatgtggaaatgcctgatacgacttaaacccacactagtcggcccaacacctctgctatatcgcctgggctcggtcagtaatgaggtcaccctcacgcaccccgtcaccaaatacattgccacgtgcatgcaagctgaccttgagatcatgacaagcacgtgggttcttgcggggggggttctggccgccgttgctgcctattgcctagcgactggctgtgtatccatcattggtcgtgtgcacatcaatcagagggccataattgcccccgacaaggaggtcctgtacgaggcgttcgatgagatggaggaatgcgcctcaagggccgccttagtcgaggaagggcagcgaatagcagagatgctgaagtctaaaattcagggtctgttacaacaggccacaaagcaggcccaagacttgcaacccgcggttcaggccggctggcccaagttggaacaattctgggccaagcacatgtggaatttcatcagcggtatccagtacctagcaggattgtccacgctaccagggaatccagctgtggcctcgatgatggcgttcagcgctgccctaaccagcccactgtctaccagcactactatcctcttgaacatcatgggaggctggttggcctctcagatcgccccgcctgcgggggccactggtttcgttgtcagtggcctggtgggggccgccgtgggtagcataggcttaggcaagatactggtggatgttttggctggatacggcgctggcatttcgggggctctcgttgctttcaagatcatgtctggagagaaaccctccatggaagacgtcgtcaacctgctgcctgggatcctatcacccggtgctttggtggtgggggttatttgcgcggccatcctgcgtcgccacgtgggtcaaggggaaggtgcagtccagtggatgaacaggcttatagcctttgcttccaagggaaaccacgtcgccccgactcactacgtggcggagtctgatgcgtcgcagcgagtatctcaactgctcagctccttgaccataaccagcctcctcaggaggcttcataactggatcactgaggattgccccataccgtgcgctggctcctggcttagcgacgtgtgggact
gggtctgtaccattctgaccgattttaagaactggctatcctccaagctgcttccaaagatgccaggcctcccctttatctcctgccagaaggggtataggggggtgtgggccggcactggcatcatgaccacaaggtgcccatgcggcgctaacatctctggcaacgtccgcctgggcactatgaggataacggggcccaagacctgcatgaacacctggcaggggaccttccccatcaactgttacacagagggccagtgcgtgccaaagcccgcgcccagctacagaaccgccatctggcgggtggctgcagcggattacgtcgaggtgactcgacacggcagctactcctatgtgacagggttgacaaatgacaacctcaaagtcccatgccaactgccatcaccagagttcttctcctgggtggatggggtccaaatccacagattcgcacccaccccaaagccgttcattagggatgaggttacgttcagcgtgggcctcaactcctttgtagtcgggtctcagctcccttgtgagcctgagccggatacggaggtgttggcgtccatgctaacagacccgtcccacatcacggcggaggcggcggcgagacgcttggcacggggctcgcccccgtccgaggccagctcgtccgcgagccaattgtcagcgccatcgctgcgagctacctgtaccacccatggaaagaattatgacattgacatggtggatgccaacctcttcatggggggggacgtaactcggatagaatctgagtccaaagtgctcgttctagactcccttgatccctcgattgaggaggaggatgaacgcgagccttcaataccatcagaataccttctccccaggaagaaattcccacctgcactgccggtctgggcgcagcctagttacaaccctccgctcatagagagctggaagaaaccagattatgaaccaccgacggtggctgggtgcgctctcccccccccgactaaggcccccactcctccacctaggaggcgccgggccatagtcttgagccaggataatgtggggggggctctcatggacttggctcgtcggagctttggccatcctccccccagcagtgactccggccaccgcacaggagagagcaccaccgacagccccggagacataccgacgggtgagtccgttgactcggagacgggctccgtttcctccatgcccccccttgagggggagccgggggaccctgatctagagcctgagcaagtggagcgctcctcccccccaccgggggggggggcagctcccgactcggactctgggtcttggacctcgtgctccgatgaggacgactctgtcatttgttgctccatgtcatattcctggaccggggctctagtcaccccttgtggcccggaggaagaaaggttgccaattaacccgctaagcaattcgctactgcggtaccataacaaggtataccgcacgtcttcgcgatgtgcctctcagcgggccaaaaaggtcaccttcgacaggatgcaactacttgactcccactatgatgaagtcttaaaggacatcaagcaagctgcctccaaggttagtgcaaggctcctctctgttgaagaagcgtgtgcgctgacccctccccattccgcgagatctaaatacgggtttggggctaaggaggtacgcggcttgtccaggagggccgttaaccacatcaagtccgtgtgggaggacctcttggaagaccaacaatcaccaattcctacaaccatcatggccaaaaatgaggtgttttgcgtagatcctgcaaagggcgggaagaaggcagcgcgcctcatcgtgtaccctgaccttggtgttagggtttgcgagaaaatggccctctatgacattgcacagaagctgccccaagcagtaatgggagcttcatatgggttccagtactctcctgggcagcgggtagagttccttctgcgagcgtggaaggaaaag
aagaaccccatggggttctcttatgacactcgctgctttgactcaacagtcactgagagagacatcagaacagaggaatccatataccaggcttgctcgttacccgaggaggcccggactgccattcattcattaactgagagactctacgtaggcgggcccatgatgaacagcaagggtcaggcatgcggttacaggcgttgccgcgccagcggagtatttaccactagcatagggaacaccatgacatgctatatgaaagcccgggctgcttgtaaagcggcggggatcattgctcctaccatgctggtatgtggcgacgacctggtggtcatctcagaaagtcagggggctgaggaggacgagcggaatctgagagtcttcacggaggctatgaccagatactccgcccctcccggcgacccacccaaaccagaatatgacttggagctgataacatcatgctcctcaaacgtgtctgtggctttggacccgcggggtcgccgcagatactacctgaccagagaccctaccactccactcgccagggctgcctgggagacagtcagacactcccctgtcaattcatggctgggaaacatcatccaatacgccccaaccatatgggcgcgcatggtcctaatgacacacttcttctccgttctgtcggcccaggatggcctagaccaaaatctcaatttcgagatgtacggagcagtgtactcagtgaaccccctggacctaccagccataattgagaggatgcatgggctcgacgccttctcactgcacacatactctccccacgaactcaatcgggtggctgcagctctcagaaaacttggagcgcctccccttagagcgtggaagagtcgggcacgcatggtgagggcgtcactcatctcccagggcgggagagcggccatttgtggtcgttacctcttcaactgggcggtgaaaacgaagctcagactcactccattgccggaggcacgccgcctggacttgtccgggtggtttaccgtcggcgccggcgggggcgacatttatcacagcgtgtcgcgagcccgaccccgcattttactcctttgcctactcctactcagcgtaggggtaggcatctttt" >
 3 | 	<protein abbreviation="core protein" startPosition="1" stopPosition="574" />
 4 | 	<protein abbreviation="E1 protein" startPosition="574" stopPosition="1150" />
 5 | 	<protein abbreviation="E2 protein" startPosition="1150" stopPosition="2251" />
 6 | 	<protein abbreviation="p7 protein" startPosition="2251" stopPosition="2440" />
 7 | 	<protein abbreviation="NS2 protein" startPosition="2440" stopPosition="3091" />
 8 | 	<protein abbreviation="NS3 protein" startPosition="3091" stopPosition="4984" />
 9 | 	<protein abbreviation="NS4A protein" startPosition="4984" stopPosition="5146" />
10 | 	<protein abbreviation="NS4B protein" startPosition="5146" stopPosition="5929" />
11 | 	<protein abbreviation="NS5A protein" startPosition="5929" stopPosition="7327" />
12 | 	<protein abbreviation="NS5B protein" startPosition="7327" stopPosition="9086" />
13 | </orf>
14 | 


--------------------------------------------------------------------------------
/src/mxml/mxml-set.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * "$Id: mxml-set.c 441 2011-12-09 23:49:00Z mike $"
  3 |  *
  4 |  * Node set functions for Mini-XML, a small XML-like file parsing library.
  5 |  *
  6 |  * Copyright 2003-2011 by Michael R Sweet.
  7 |  *
  8 |  * These coded instructions, statements, and computer programs are the
  9 |  * property of Michael R Sweet and are protected by Federal copyright
 10 |  * law.  Distribution and use rights are outlined in the file "COPYING"
 11 |  * which should have been included with this file.  If this file is
 12 |  * missing or damaged, see the license at:
 13 |  *
 14 |  *     http://www.minixml.org/
 15 |  *
 16 |  * Contents:
 17 |  *
 18 |  *   mxmlSetCDATA()    - Set the element name of a CDATA node.
 19 |  *   mxmlSetCustom()   - Set the data and destructor of a custom data node.
 20 |  *   mxmlSetElement()  - Set the name of an element node.
 21 |  *   mxmlSetInteger()  - Set the value of an integer node.
 22 |  *   mxmlSetOpaque()   - Set the value of an opaque node.
 23 |  *   mxmlSetReal()     - Set the value of a real number node.
 24 |  *   mxmlSetText()     - Set the value of a text node.
 25 |  *   mxmlSetTextf()    - Set the value of a text node to a formatted string.
 26 |  *   mxmlSetUserData() - Set the user data pointer for a node.
 27 |  */
 28 | 
 29 | /*
 30 |  * Include necessary headers...
 31 |  */
 32 | 
 33 | #include "config.h"
 34 | #include "mxml.h"
 35 | 
 36 | 
 37 | /*
 38 |  * 'mxmlSetCDATA()' - Set the element name of a CDATA node.
 39 |  *
 40 |  * The node is not changed if it (or its first child) is not a CDATA element node.
 41 |  *
 42 |  * @since Mini-XML 2.3@
 43 |  */
 44 | 
 45 | int					/* O - 0 on success, -1 on failure */
 46 | mxmlSetCDATA(mxml_node_t *node,		/* I - Node to set */
 47 |              const char  *data)		/* I - New data string */
 48 | {
 49 |  /*
 50 |   * Range check input...
 51 |   */
 52 | 
 53 |   if (node && node->type == MXML_ELEMENT &&
 54 |       strncmp(node->value.element.name, "![CDATA[", 8) &&
 55 |       node->child && node->child->type == MXML_ELEMENT &&
 56 |       !strncmp(node->child->value.element.name, "![CDATA[", 8))
 57 |     node = node->child;
 58 | 
 59 |   if (!node || node->type != MXML_ELEMENT || !data ||
 60 |       strncmp(node->value.element.name, "![CDATA[", 8))
 61 |     return (-1);
 62 | 
 63 |  /*
 64 |   * Free any old element value and set the new value...
 65 |   */
 66 | 
 67 |   if (node->value.element.name)
 68 |     free(node->value.element.name);
 69 | 
 70 |   node->value.element.name = _mxml_strdupf("![CDATA[%s]]", data);
 71 | 
 72 |   return (0);
 73 | }
 74 | 
 75 | 
 76 | /*
 77 |  * 'mxmlSetCustom()' - Set the data and destructor of a custom data node.
 78 |  *
 79 |  * The node is not changed if it (or its first child) is not a custom node.
 80 |  *
 81 |  * @since Mini-XML 2.1@
 82 |  */
 83 | 
 84 | int					/* O - 0 on success, -1 on failure */
 85 | mxmlSetCustom(
 86 |     mxml_node_t              *node,	/* I - Node to set */
 87 |     void                     *data,	/* I - New data pointer */
 88 |     mxml_custom_destroy_cb_t destroy)	/* I - New destructor function */
 89 | {
 90 |  /*
 91 |   * Range check input...
 92 |   */
 93 | 
 94 |   if (node && node->type == MXML_ELEMENT &&
 95 |       node->child && node->child->type == MXML_CUSTOM)
 96 |     node = node->child;
 97 | 
 98 |   if (!node || node->type != MXML_CUSTOM)
 99 |     return (-1);
100 | 
101 |  /*
102 |   * Free any old element value and set the new value...
103 |   */
104 | 
105 |   if (node->value.custom.data && node->value.custom.destroy)
106 |     (*(node->value.custom.destroy))(node->value.custom.data);
107 | 
108 |   node->value.custom.data    = data;
109 |   node->value.custom.destroy = destroy;
110 | 
111 |   return (0);
112 | }
113 | 
114 | 
115 | /*
116 |  * 'mxmlSetElement()' - Set the name of an element node.
117 |  *
118 |  * The node is not changed if it is not an element node.
119 |  */
120 | 
121 | int					/* O - 0 on success, -1 on failure */
122 | mxmlSetElement(mxml_node_t *node,	/* I - Node to set */
123 |                const char  *name)	/* I - New name string */
124 | {
125 |  /*
126 |   * Range check input...
127 |   */
128 | 
129 |   if (!node || node->type != MXML_ELEMENT || !name)
130 |     return (-1);
131 | 
132 |  /*
133 |   * Free any old element value and set the new value...
134 |   */
135 | 
136 |   if (node->value.element.name)
137 |     free(node->value.element.name);
138 | 
139 |   node->value.element.name = strdup(name);
140 | 
141 |   return (0);
142 | }
143 | 
144 | 
145 | /*
146 |  * 'mxmlSetInteger()' - Set the value of an integer node.
147 |  *
148 |  * The node is not changed if it (or its first child) is not an integer node.
149 |  */
150 | 
151 | int					/* O - 0 on success, -1 on failure */
152 | mxmlSetInteger(mxml_node_t *node,	/* I - Node to set */
153 |                int         integer)	/* I - Integer value */
154 | {
155 |  /*
156 |   * Range check input...
157 |   */
158 | 
159 |   if (node && node->type == MXML_ELEMENT &&
160 |       node->child && node->child->type == MXML_INTEGER)
161 |     node = node->child;
162 | 
163 |   if (!node || node->type != MXML_INTEGER)
164 |     return (-1);
165 | 
166 |  /*
167 |   * Set the new value and return...
168 |   */
169 | 
170 |   node->value.integer = integer;
171 | 
172 |   return (0);
173 | }
174 | 
175 | 
176 | /*
177 |  * 'mxmlSetOpaque()' - Set the value of an opaque node.
178 |  *
179 |  * The node is not changed if it (or its first child) is not an opaque node.
180 |  */
181 | 
182 | int					/* O - 0 on success, -1 on failure */
183 | mxmlSetOpaque(mxml_node_t *node,	/* I - Node to set */
184 |               const char  *opaque)	/* I - Opaque string */
185 | {
186 |  /*
187 |   * Range check input...
188 |   */
189 | 
190 |   if (node && node->type == MXML_ELEMENT &&
191 |       node->child && node->child->type == MXML_OPAQUE)
192 |     node = node->child;
193 | 
194 |   if (!node || node->type != MXML_OPAQUE || !opaque)
195 |     return (-1);
196 | 
197 |  /*
198 |   * Free any old opaque value and set the new value...
199 |   */
200 | 
201 |   if (node->value.opaque)
202 |     free(node->value.opaque);
203 | 
204 |   node->value.opaque = strdup(opaque);
205 | 
206 |   return (0);
207 | }
208 | 
209 | 
210 | /*
211 |  * 'mxmlSetReal()' - Set the value of a real number node.
212 |  *
213 |  * The node is not changed if it (or its first child) is not a real number node.
214 |  */
215 | 
216 | int					/* O - 0 on success, -1 on failure */
217 | mxmlSetReal(mxml_node_t *node,		/* I - Node to set */
218 |             double      real)		/* I - Real number value */
219 | {
220 |  /*
221 |   * Range check input...
222 |   */
223 | 
224 |   if (node && node->type == MXML_ELEMENT &&
225 |       node->child && node->child->type == MXML_REAL)
226 |     node = node->child;
227 | 
228 |   if (!node || node->type != MXML_REAL)
229 |     return (-1);
230 | 
231 |  /*
232 |   * Set the new value and return...
233 |   */
234 | 
235 |   node->value.real = real;
236 | 
237 |   return (0);
238 | }
239 | 
240 | 
241 | /*
242 |  * 'mxmlSetText()' - Set the value of a text node.
243 |  *
244 |  * The node is not changed if it (or its first child) is not a text node.
245 |  */
246 | 
247 | int					/* O - 0 on success, -1 on failure */
248 | mxmlSetText(mxml_node_t *node,		/* I - Node to set */
249 |             int         whitespace,	/* I - 1 = leading whitespace, 0 = no whitespace */
250 | 	    const char  *string)	/* I - String */
251 | {
252 |  /*
253 |   * Range check input...
254 |   */
255 | 
256 |   if (node && node->type == MXML_ELEMENT &&
257 |       node->child && node->child->type == MXML_TEXT)
258 |     node = node->child;
259 | 
260 |   if (!node || node->type != MXML_TEXT || !string)
261 |     return (-1);
262 | 
263 |  /*
264 |   * Free any old string value and set the new value...
265 |   */
266 | 
267 |   if (node->value.text.string)
268 |     free(node->value.text.string);
269 | 
270 |   node->value.text.whitespace = whitespace;
271 |   node->value.text.string     = strdup(string);
272 | 
273 |   return (0);
274 | }
275 | 
276 | 
277 | /*
278 |  * 'mxmlSetTextf()' - Set the value of a text node to a formatted string.
279 |  *
280 |  * The node is not changed if it (or its first child) is not a text node.
281 |  */
282 | 
283 | int					/* O - 0 on success, -1 on failure */
284 | mxmlSetTextf(mxml_node_t *node,		/* I - Node to set */
285 |              int         whitespace,	/* I - 1 = leading whitespace, 0 = no whitespace */
286 |              const char  *format,	/* I - Printf-style format string */
287 | 	     ...)			/* I - Additional arguments as needed */
288 | {
289 |   va_list	ap;			/* Pointer to arguments */
290 | 
291 | 
292 |  /*
293 |   * Range check input...
294 |   */
295 | 
296 |   if (node && node->type == MXML_ELEMENT &&
297 |       node->child && node->child->type == MXML_TEXT)
298 |     node = node->child;
299 | 
300 |   if (!node || node->type != MXML_TEXT || !format)
301 |     return (-1);
302 | 
303 |  /*
304 |   * Free any old string value and set the new value...
305 |   */
306 | 
307 |   if (node->value.text.string)
308 |     free(node->value.text.string);
309 | 
310 |   va_start(ap, format);
311 | 
312 |   node->value.text.whitespace = whitespace;
313 |   node->value.text.string     = _mxml_strdupf(format, ap);
314 | 
315 |   va_end(ap);
316 | 
317 |   return (0);
318 | }
319 | 
320 | 
321 | /*
322 |  * 'mxmlSetUserData()' - Set the user data pointer for a node.
323 |  *
324 |  * @since Mini-XML 2.7@
325 |  */
326 | 
327 | int					/* O - 0 on success, -1 on failure */
328 | mxmlSetUserData(mxml_node_t *node,	/* I - Node to set */
329 |                 void        *data)	/* I - User data pointer */
330 | {
331 |  /*
332 |   * Range check input...
333 |   */
334 | 
335 |   if (!node)
336 |     return (-1);
337 | 
338 |  /*
339 |   * Set the user data pointer and return...
340 |   */
341 | 
342 |   node->user_data = data;
343 |   return (0);
344 | }
345 | 
346 | 
347 | /*
348 |  * End of "$Id: mxml-set.c 441 2011-12-09 23:49:00Z mike $".
349 |  */
350 | 


--------------------------------------------------------------------------------
/src/libseq/Codon.cpp:
--------------------------------------------------------------------------------
  1 | #include "Codon.h"
  2 | 
  3 | namespace seq {
  4 | 
  5 | AminoAcid Codon::translate(const NTSequence::const_iterator triplet)
  6 | {
  7 |   const AminoAcid codonTable[4][4][4] = {
  8 |   { { AminoAcid::K /* AAA */,
  9 |       AminoAcid::N /* AAC */,
 10 |       AminoAcid::K /* AAG */,
 11 |       AminoAcid::N /* AAT */
 12 |     },
 13 |     { AminoAcid::T /* ACA */,
 14 |       AminoAcid::T /* ACC */,
 15 |       AminoAcid::T /* ACG */,
 16 |       AminoAcid::T /* ACT */
 17 |     },
 18 |     { AminoAcid::R /* AGA */,
 19 |       AminoAcid::S /* AGC */,
 20 |       AminoAcid::R /* AGG */,
 21 |       AminoAcid::S /* AGT */
 22 |     },
 23 |     { AminoAcid::I /* ATA */,
 24 |       AminoAcid::I /* ATC */,
 25 |       AminoAcid::M /* ATG */,
 26 |       AminoAcid::I /* ATT */
 27 |     }
 28 |   },
 29 |   { { AminoAcid::Q /* CAA */,
 30 |       AminoAcid::H /* CAC */,
 31 |       AminoAcid::Q /* CAG */,
 32 |       AminoAcid::H /* CAT */
 33 |     },
 34 |     { AminoAcid::P /* CCA */,
 35 |       AminoAcid::P /* CCC */,
 36 |       AminoAcid::P /* CCG */,
 37 |       AminoAcid::P /* CCT */
 38 |     },
 39 |     { AminoAcid::R /* CGA */,
 40 |       AminoAcid::R /* CGC */,
 41 |       AminoAcid::R /* CGG */,
 42 |       AminoAcid::R /* CGT */
 43 |     },
 44 |     { AminoAcid::L /* CTA */,
 45 |       AminoAcid::L /* CTC */,
 46 |       AminoAcid::L /* CTG */,
 47 |       AminoAcid::L /* CTT */
 48 |     }
 49 |   },
 50 |   { { AminoAcid::E /* GAA */,
 51 |       AminoAcid::D /* GAC */,
 52 |       AminoAcid::E /* GAG */,
 53 |       AminoAcid::D /* GAT */
 54 |     },
 55 |     { AminoAcid::A /* GCA */,
 56 |       AminoAcid::A /* GCC */,
 57 |       AminoAcid::A /* GCG */,
 58 |       AminoAcid::A /* GCT */
 59 |     },
 60 |     { AminoAcid::G /* GGA */,
 61 |       AminoAcid::G /* GGC */,
 62 |       AminoAcid::G /* GGG */,
 63 |       AminoAcid::G /* GGT */
 64 |     },
 65 |     { AminoAcid::V /* GTA */,
 66 |       AminoAcid::V /* GTC */,
 67 |       AminoAcid::V /* GTG */,
 68 |       AminoAcid::V /* GTT */
 69 |     }
 70 |   },
 71 |   { { AminoAcid::STP /* TAA */,
 72 |       AminoAcid::Y /* TAC */,
 73 |       AminoAcid::STP /* TAG */,
 74 |       AminoAcid::Y /* TAT */
 75 |     },
 76 |     { AminoAcid::S /* TCA */,
 77 |       AminoAcid::S /* TCC */,
 78 |       AminoAcid::S /* TCG */,
 79 |       AminoAcid::S /* TCT */
 80 |     },
 81 |     { AminoAcid::STP /* TGA */,
 82 |       AminoAcid::C /* TGC */,
 83 |       AminoAcid::W /* TGG */,
 84 |       AminoAcid::C /* TGT */
 85 |     },
 86 |     { AminoAcid::L /* TTA */,
 87 |       AminoAcid::F /* TTC */,
 88 |       AminoAcid::L /* TTG */,
 89 |       AminoAcid::F /* TTT */
 90 |     }
 91 |   } };
 92 | 
 93 |   if (*triplet == Nucleotide::GAP
 94 |       && (*(triplet + 1) == Nucleotide::GAP)
 95 |       && (*(triplet + 2) == Nucleotide::GAP))
 96 |     return AminoAcid::GAP;
 97 | 
 98 |   if (triplet->isAmbiguity()
 99 |       || (triplet + 1)->isAmbiguity()
100 |       || (triplet + 2)->isAmbiguity())
101 |     return AminoAcid::X;
102 | 
103 |   return
104 |     codonTable[triplet->intRep()]
105 |               [(triplet + 1)->intRep()]
106 |               [(triplet + 2)->intRep()];
107 | }
108 | 
109 | std::set<AminoAcid>
110 | Codon::translateAll(const NTSequence::const_iterator triplet)
111 | {
112 |   std::set<AminoAcid> result;
113 | 
114 |   NTSequence s(triplet, triplet + 3);
115 | 
116 |   std::vector<NTSequence> possibilities;
117 |   s.nonAmbiguousSequences(possibilities);
118 | 
119 |   for (unsigned i = 0; i < possibilities.size(); ++i)
120 |     result.insert(translate(possibilities[i].begin()));
121 | 
122 |   return result;
123 | }
124 | 
125 | namespace {
126 |   void addTriplet(std::set<NTSequence>& result,
127 | 		  Nucleotide c1, Nucleotide c2, Nucleotide c3)
128 |   {
129 |     NTSequence triplet;
130 |     triplet.push_back(c1);
131 |     triplet.push_back(c2);
132 |     triplet.push_back(c3);
133 | 
134 |     result.insert(triplet);
135 |   }
136 | 
137 | }
138 | 
139 | namespace {
140 |   // Inserts the four codons c1-c2-T, c1-c2-C, c1-c2-A and c1-c2-G.
141 |   void addFourfold(std::set<NTSequence>& result,
142 |                    Nucleotide c1, Nucleotide c2)
143 |   {
144 |     addTriplet(result, c1, c2, Nucleotide::T);
145 |     addTriplet(result, c1, c2, Nucleotide::C);
146 |     addTriplet(result, c1, c2, Nucleotide::A);
147 |     addTriplet(result, c1, c2, Nucleotide::G);
148 |   }
149 | 
150 |   // Inserts the two codons c1-c2-T and c1-c2-C.
151 |   void addPyrimidineEnding(std::set<NTSequence>& result,
152 |                            Nucleotide c1, Nucleotide c2)
153 |   {
154 |     addTriplet(result, c1, c2, Nucleotide::T);
155 |     addTriplet(result, c1, c2, Nucleotide::C);
156 |   }
157 | 
158 |   // Inserts the two codons c1-c2-A and c1-c2-G.
159 |   void addPurineEnding(std::set<NTSequence>& result,
160 |                        Nucleotide c1, Nucleotide c2)
161 |   {
162 |     addTriplet(result, c1, c2, Nucleotide::A);
163 |     addTriplet(result, c1, c2, Nucleotide::G);
164 |   }
165 | }
166 | 
167 | // Returns the set of unambiguous codons listed for amino acid `a`.
168 | // The result is empty for AA_GAP, AA_Z, AA_U, AA_B, AA_X and for any
169 | // other value of a.intRep() that has no case below.
170 | std::set<NTSequence> Codon::codonsFor(AminoAcid a)
171 | {
172 |   std::set<NTSequence> codons;
173 | 
174 |   // Case comments give the third base as N = T/C/A/G, Y = T/C, R = A/G.
175 |   const Nucleotide ntA = Nucleotide::A;
176 |   const Nucleotide ntC = Nucleotide::C;
177 |   const Nucleotide ntG = Nucleotide::G;
178 |   const Nucleotide ntT = Nucleotide::T;
179 | 
180 |   switch (a.intRep()) {
181 |   case AminoAcid::AA_A:                       // GCN
182 |     addFourfold(codons, ntG, ntC);
183 |     break;
184 |   case AminoAcid::AA_C:                       // TGY
185 |     addPyrimidineEnding(codons, ntT, ntG);
186 |     break;
187 |   case AminoAcid::AA_D:                       // GAY
188 |     addPyrimidineEnding(codons, ntG, ntA);
189 |     break;
190 |   case AminoAcid::AA_E:                       // GAR
191 |     addPurineEnding(codons, ntG, ntA);
192 |     break;
193 |   case AminoAcid::AA_F:                       // TTY
194 |     addPyrimidineEnding(codons, ntT, ntT);
195 |     break;
196 |   case AminoAcid::AA_G:                       // GGN
197 |     addFourfold(codons, ntG, ntG);
198 |     break;
199 |   case AminoAcid::AA_H:                       // CAY
200 |     addPyrimidineEnding(codons, ntC, ntA);
201 |     break;
202 |   case AminoAcid::AA_I:                       // ATY, ATA
203 |     addPyrimidineEnding(codons, ntA, ntT);
204 |     addTriplet(codons, ntA, ntT, ntA);
205 |     break;
206 |   case AminoAcid::AA_K:                       // AAR
207 |     addPurineEnding(codons, ntA, ntA);
208 |     break;
209 |   case AminoAcid::AA_L:                       // TTR, CTN
210 |     addPurineEnding(codons, ntT, ntT);
211 |     addFourfold(codons, ntC, ntT);
212 |     break;
213 |   case AminoAcid::AA_M:                       // ATG
214 |     addTriplet(codons, ntA, ntT, ntG);
215 |     break;
216 |   case AminoAcid::AA_N:                       // AAY
217 |     addPyrimidineEnding(codons, ntA, ntA);
218 |     break;
219 |   case AminoAcid::AA_P:                       // CCN
220 |     addFourfold(codons, ntC, ntC);
221 |     break;
222 |   case AminoAcid::AA_Q:                       // CAR
223 |     addPurineEnding(codons, ntC, ntA);
224 |     break;
225 |   case AminoAcid::AA_R:                       // CGN, AGR
226 |     addFourfold(codons, ntC, ntG);
227 |     addPurineEnding(codons, ntA, ntG);
228 |     break;
229 |   case AminoAcid::AA_S:                       // TCN, AGY
230 |     addFourfold(codons, ntT, ntC);
231 |     addPyrimidineEnding(codons, ntA, ntG);
232 |     break;
233 |   case AminoAcid::AA_T:                       // ACN
234 |     addFourfold(codons, ntA, ntC);
235 |     break;
236 |   case AminoAcid::AA_V:                       // GTN
237 |     addFourfold(codons, ntG, ntT);
238 |     break;
239 |   case AminoAcid::AA_W:                       // TGG
240 |     addTriplet(codons, ntT, ntG, ntG);
241 |     break;
242 |   case AminoAcid::AA_Y:                       // TAY
243 |     addPyrimidineEnding(codons, ntT, ntA);
244 |     break;
245 |   case AminoAcid::AA_STP:                     // TAR, TGA
246 |     addPurineEnding(codons, ntT, ntA);
247 |     addTriplet(codons, ntT, ntG, ntA);
248 |     break;
249 |   case AminoAcid::AA_GAP:
250 |   case AminoAcid::AA_Z:
251 |   case AminoAcid::AA_U:
252 |   case AminoAcid::AA_B:
253 |   case AminoAcid::AA_X:
254 |   default:
255 |     // No codons are listed for these codes; the set stays empty.
256 |     break;
257 |   }
258 | 
259 |   return codons;
260 | }
261 | 
262 | };
263 | 


--------------------------------------------------------------------------------
/references/DENV/DENV4-NC002640.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <orf name="DENV4-NC002640" referenceSequence="atgaaccaacgaaaaaaggtggttagaccacctttcaatatgctgaaacgcgagagaaaccgcgtatcaacccctcaagggttggtgaagagattctcaaccggacttttttctgggaaaggacccttacggatggtgctagcattcatcacgtttttgcgagtcctttccatcccaccaacagcagggattctgaagagatggggacagttgaagaaaaataaggccatcaagatactgattggattcaggaaggagataggccgcatgctgaacatcttgaacgggagaaaaaggtcaacgataacattgctgtgcttgattcccaccgtaatggcgttttccctcagcacaagagatggcgaacccctcatgatagtggcaaaacatgaaagggggagacctctcttgtttaagacaacagaggggatcaacaaatgcactctcattgccatggacttgggtgaaatgtgtgaggacactgtcacgtataaatgccccctactggtcaataccgaacctgaagacattgattgctggtgcaacctcacgtctacctgggtcatgtatgggacatgcacccagagcggagaacggagacgagagaagcgctcagtagctttaacaccacattcaggaatgggattggaaacaagagctgagacatggatgtcatcggaaggggcttggaagcatgctcagagagtagagagctggatactcagaaacccaggattcgcgctcttggcaggatttatggcttatatgattgggcaaacaggaatccagcgaactgtcttctttgtcctaatgatgctggtcgccccatcctacggaatgcgatgcgtaggagtaggaaacagagactttgtggaaggagtctcaggtggagcatgggtcgacctggtgctagaacatggaggatgcgtcacaaccatggcccagggaaaaccaaccttggattttgaactgactaagacaacagccaaggaagtggctctgttaagaacctattgcattgaagcctcaatatcaaacataactacggcaacaagatgtccaacgcaaggagagccttatctgaaagaggaacaggaccaacagtacatttgccggagagatgtggtagacagagggtggggcaatggctgtggcttgtttggaaaaggaggagttgtgacatgtgcgaagttttcatgttcggggaagataacaggcaatttggtccaaattgagaaccttgaatacacagtggttgtaacagtccacaatggagacacccatgcagtaggaaatgacacatccaatcatggagttacagccatgataactcccaggtcaccatcggtggaagtcaaattgccggactatggagaactaacactcgattgtgaacccaggtctggaattgactttaatgagatgattctgatgaaaatgaaaaagaaaacatggctcgtgcataagcaatggtttttggatctgcctcttccatggacagcaggagcagacacatcagaggttcactggaattacaaagagagaatggtgacatttaaggttcctcatgccaagagacaggatgtgacagtgctgggatctcaggaaggagccatgcattctgccctcgctggagccacagaagtggactccggtgatggaaatcacatgtttgcaggacatcttaagtgcaaagtccgtatggagaaattgagaatcaagggaatgtcatacacgatgtgttcaggaaagttttcaattgacaaagagatggcagaaacacagcatgggacaacagtggtgaaagtcaagtatgaaggtgctggagctccgtgtaaagtccccatagagataagagatgtaaacaaggaaaaagtggttgggcgtatcatctcatccacccctttggctgagaataccaacagtgtaaccaacatagaattagaacc
cccctttggggacagctacatagtgataggtgttggaaacagcgcattaacactccattggttcaggaaagggagttccattggcaagatgtttgagtccacatacagaggtgcaaaacgaatggccattctaggtgaaacagcttgggattttggttccgttggtggactgttcacatcattgggaaaggctgtgcaccaggtttttggaagtgtgtatacaaccatgtttggaggagtctcatggatgattagaatcctaattgggttcttagtgttgtggattggcacgaactcgaggaacacttcaatggctatgacgtgcatagctgttggaggaatcactctgtttctgggcttcacagttcaagcagacatgggttgtgtggcgtcatggagtgggaaagaattgaagtgtggaagcggaatttttgtggttgacaacgtgcacacttggacagaacagtacaaatttcaaccagagtccccagcgagactagcgtctgcaatattaaatgcccacaaagatggggtctgtggaattagatcaaccacgaggctggaaaatgtcatgtggaagcaaataaccaacgagctaaactatgttctctgggaaggaggacatgacctcactgtagtggctggggatgtgaagggggtgttgaccaaaggcaagagagcactcacacccccagtgagtgatctgaaatattcatggaagacatggggaaaagcaaaaatcttcaccccagaagcaagaaatagcacatttttaatagacggaccagacacctctgaatgccccaatgaacgaagagcatggaactctcttgaggtggaagactatggatttggcatgttcacgaccaacatatggatgaaattccgagaaggaagttcagaagtgtgtgaccacaggttaatgtcagctgcaattaaagatcagaaagctgtgcatgctgacatgggttattggatagagagctcaaaaaaccagacctggcagatagagaaagcatctcttattgaagtgaaaacatgtctgtggcccaagacccacacactgtggagcaatggagtgctggaaagccagatgctcattccaaaatcatatgcgggccctttttcacagcacaattaccgccagggctatgccacgcaaaccgtgggcccatggcacttaggcaaattagagatagactttggagaatgccccggaacaacagtcacaattcaggaggattgtgaccatagaggcccatctttgaggaccaccactgcatctggaaaactagtcacgcaatggtgctgccgctcctgcacgatgcctcccttaaggttcttgggagaagatgggtgctggtatgggatggagattaggcccttgagtgaaaaagaagagaacatggtcaaatcacaggtgacggccggacagggcacatcagaaactttttctatgggtctgttgtgcctgaccttgtttgtggaagaatgcttgaggagaagagtcactaggaaacacatgatattagttgtggtgatcactctttgtgctatcatcctgggaggcctcacatggatggacttactacgagccctcatcatgttgggggacactatgtctggtagaataggaggacagatccacctagccatcatggcagtgttcaagatgtcaccaggatacgtgctgggtgtgtttttaaggaaactcacttcaagagagacagcactaatggtaataggaatggccatgacaacggtgctttcaattccacatgaccttatggaactcattgatggaatatcactgggactaattttgctaaaaatagtaacacagtttgacaacacccaagtgggaaccttagctctttccttgactttcataagatcaacaatgccattggtcatggcttggaggaccattatggctgtgttgtttgtggtcacactcattcctttgtgcaggacaagctgtcttcaaaaacagtctcatt
gggtagaaataacagcactcatcctaggagcccaagctctgccagtgtacctaatgactcttatgaaaggagcctcaagaagatcttggcctcttaacgagggcataatggctgtgggtttggttagtctcttaggaagcgctcttttaaagaatgatgtccctttagctggcccaatggtggcaggaggcttacttctggcggcttacgtgatgagtggtagctcagcagatctgtcactagagaaggccgccaacgtgcagtgggatgaaatggcagacataacaggctcaagcccaatcgtagaagtgaagcaggatgaagatggctctttctccatacgggacgtcgaggaaaccaatatgataacccttttggtgaaactggcactgataacagtgtcaggtctctaccccttggcaattccagtcacaatgaccttatggtacatgtggcaagtgaaaacacaaagatcaggagccctgtgggacgtcccctcacccgctgccactaaaaaagccgcactgtctgaaggagtgtacaggatcatgcaaagagggttattcgggaaaactcaggttggagtagggatacacatggaaggtgtatttcacacaatgtggcatgtaacaagaggatcagtgatctgccacgagactgggagattggagccatcttgggctgacgtcaggaatgacatgatatcatacggtgggggatggaggcttggagacaaatgggacaaagaagaagacgttcaggtcctcgccatagaaccaggaaaaaatcctaaacatgtccaaacgaaacctggccttttcaagaccctaactggagaaattggagcagtaacattagatttcaaacccggaacgtctggttctcccatcatcaacaggaaaggaaaagtcatcggactctatggaaatggagtagttaccaaatcaggtgattacgtcagtgccataacgcaagccgaaagaattggagagccagattatgaagtggatgaggacatttttcgaaagaaaagattaactataatggacttacaccccggagctggaaagacaaaaagaattcttccatcaatagtgagagaagccttaaaaaggaggctacgaactttgattttagctcccacgagagtggtggcggccgagatggaagaggccctacgtggactgccaatccgttatcagaccccagctgtgaaatcagaacacacaggaagagagattgtagacctcatgtgtcatgcaaccttcacaacaagacttttgtcatcaaccagggttccaaattacaaccttatagtgatggatgaagcacatttcaccgatccttctagtgtcgcggctagaggatacatctcgaccagggtggaaatgggagaggcagcagccatcttcatgaccgcaacccctcccggagcgacagatccctttccccagagcaacagcccaatagaagacatcgagagggaaattccggaaaggtcatggaacacagggttcgactggataacagactaccaagggaaaactgtgtggtttgttcccagcataaaagctggaaatgacattgcaaattgtttgagaaagtcgggaaagaaagttatccagttgagtaggaaaacctttgatacagagtatccaaaaacgaaactcacggactgggactttgtggtcactacagacatatctgaaatgggggccaattttagagccgggagagtgatagaccctagaagatgcctcaagccagttatcctaccagatgggccagagagagtcattttagcaggtcctattccagtgactccagcaagcgctgctcagagaagagggcgaataggaaggaacccagcacaagaagacgaccaatacgttttctccggagacccactaaaaaatgatgaagatcatgcccactggacagaagcaaagatgctgcttgacaatatctacaccccagaagggatcattccaacattgttt
ggtccggaaagggaaaaaacccaagccattgatggagagtttcgcctcagaggggaacaaaggaagacttttgtggaattaatgaggagaggagaccttccggtgtggctgagctataaggtagcttctgctggcatttcttacgaagatcgggaatggtgcttcacaggggaaagaaataaccaaattttagaagaaaacatggaggttgaaatttggactagagagggagaaaagaaaaagctaaggccaagatggttagatgcacgtgtatacgctgaccccatggctttgaaggatttcaaggagtttgccagtggaaggaagagtataactctcgacatcctaacagagattgccagtttgccaacttacctttcctctagggccaagctcgcccttgataacatagtcatgctccacacaacagaaagaggagggagggcctatcaacacgccctgaacgaacttccggagtcactggaaacactcatgcttgtagctttactaggtgctatgacagcaggcatcttcctgtttttcatgcaagggaaaggaatagggaaattgtcaatgggtttgataaccattgcggtggctagtggcttgctctgggtagcagaaattcaaccccagtggatagcggcctcaatcatactagagttttttctcatggtactgttgataccggaaccagaaaaacaaaggaccccacaagacaatcaattgatctacgtcatattgaccattctcaccatcattggtctaatagcagccaacgagatggggctgattgaaaaaacaaaaacggattttgggttttaccaggtaaaaacagaaaccaccatcctcgatgtggacttgagaccagcttcagcatggacgctctatgcagtagccaccacaattctgactcccatgctgagacacaccatagaaaacacgtcggccaacctatctctagcagccattgccaaccaggcagccgtcctaatggggcttggaaaaggatggccgctccacagaatggacctcggtgtgccgctgttagcaatgggatgctattctcaagtgaacccaacaaccttgacagcatccttagtcatgcttttagtccattatgcaataataggcccaggattgcaggcaaaagccacaagagaggcccagaaaaggacagctgctgggatcatgaaaaatcccacagtggacgggataacagtaatagatctagaaccaatatcctatgacccaaaatttgaaaagcaattagggcaggtcatgctactagtcttgtgtgctggacaactactcttgatgagaacaacatgggctttctgtgaagtcttgactttggccacaggaccaatcttgaccttgtgggagggcaacccgggaaggttttggaacacgaccatagccgtatccaccgccaacattttcaggggaagttacttggcgggagctggactggctttttcactcataaagaatgcacaaacccctaggaggggaactgggaccacaggagagacactgggagagaagtggaagagacagctaaactcattagacagaaaagagtttgaagagtataaaagaagtggaatactagaagtggacaggactgaagccaagtctgccctgaaagatgggtctaaaatcaagcatgcagtatcaagagggtccagtaagatcagatggattgttgagagagggatggtaaagccaaaagggaaagttgtagatcttggctgtgggagaggaggatggtcttattacatggcgacactcaagaacgtgactgaagtgaaagggtatacaaaaggaggtccaggacatgaagaaccgattcccatggctacttatggttggaatttggtcaaactccattcaggggttgacgtgttctacaaacccacagagcaagtggacaccctgctctgtgatattggggagtcatcttctaatccaacaatagaggaaggaagaacattaag
agttttgaagatggtggagccatggctctcttcaaaacctgaattctgcatcaaagtccttaacccctacatgccaacagtcatagaagagctggagaaactgcagagaaaacatggtgggaaccttgtcagatgcccgctgtccaggaactccacccatgagatgtattgggtgtcaggagcgtcgggaaacattgtgagctctgtgaacacaacatcaaagatgttgttgaacaggttcacaacaaggcataggaaacccacttatgagaaggacgtagatcttggggcaggaacgagaagtgtctccactgaaacagaaaaaccagacatgacaatcattgggagaaggcttcagcgattgcaagaagagcacaaagaaacctggcattatgatcaggaaaacccatacagaacctgggcgtatcatggaagctatgaagctccttcgacaggctctgcatcctccatggtgaacggggtggtaaaactgctaacaaaaccctgggatgtgattccaatggtgactcagttagccatgacagatacaaccccttttgggcaacaaagagtgttcaaagagaaggtggataccagaacaccacaaccaaaacccggtacacgaatggttatgaccacgacagccaattggctgtgggccctccttggaaagaagaaaaatcccagactgtgcacaagggaagagttcatctcaaaagttagatcaaacgcagccataggcgcagtctttcaggaagaacagggatggacatcagccagtgaagctgtgaatgacagccggttttgggaactggttgacaaagaaagggccctacaccaggaagggaaatgtgaatcgtgtgtctataacatgatgggaaaacgtgagaaaaagttaggagagtttggcagagccaagggaagccgagcaatctggtacatgtggctgggagcgcggtttctggaatttgaagccctgggttttttgaatgaagatcactggtttggcagagaaaattcatggagtggagtggaaggggaaggtctgcacagattgggatatatcctggaggagatagacaagaaggatggagacctaatgtatgctgatgacacagcaggctgggacacaagaatcactgaggatgaccttcaaaatgaggaactgatcacggaacagatggctccccaccacaagatcctagccaaagccattttcaaactaacctatcaaaacaaagtggtgaaagtcctcagacccacaccgcggggagcggtgatggatatcatatccaggaaagaccaaagaggtagtggacaagttggaacatatggtttgaacacattcaccaacatggaagttcaactcatccgccaaatggaagctgaaggagtcatcacacaagatgacatgcagaacccaaaagggttgaaagaaagagttgagaaatggctgaaagagtgtggtgtcgacaggttaaagaggatggcaatcagtggagacgattgcgtggtgaagcccctagatgagaggtttggcacttccctcctcttcttgaacgacatgggaaaggtgaggaaagacattccgcagtgggaaccatctaagggatggaaaaactggcaagaggttcctttttgctcccaccactttcacaagatctttatgaaggatggccgctcactagttgttccatgtagaaaccaggatgaactgatagggagagccagaatctcgcagggagctggatggagcttaagagaaacagcctgcctgggcaaagcttacgcccagatgtggtcgcttatgtacttccacagaagggatctgcgtttagcctccatggccatatgctcagcagttccaacggaatggtttccaacaagcagaacaacatggtcaatccacgctcatcaccagtggatgaccactgaagatatgctcaaagtgtggaacagagtgtggatagaagacaaccctaatatgactgacaagactc
cagtccattcgtgggaagatataccttacctagggaaaagagaggatttgtggtgtggatccctgattggactttcttccagagccacctgggcgaagaacattcatacggccataacccaggtcaggaacctgatcggaaaagaggaatacgtggattacatgccagtaatgaaaagatacagtgctccttcagagagtgaaggagttctgtaa" >
 3 | 	<protein abbreviation="C" startPosition="1" stopPosition="340" />
 4 | 	<protein abbreviation="M" startPosition="340" stopPosition="838" />
 5 | 	<protein abbreviation="E" startPosition="838" stopPosition="2323" />
 6 | 	<protein abbreviation="NS1" startPosition="2323" stopPosition="3379" />
 7 | 	<protein abbreviation="NS2A" startPosition="3379" stopPosition="4033" />
 8 | 	<protein abbreviation="NS2B" startPosition="4033" stopPosition="4423" />
 9 | 	<protein abbreviation="NS3" startPosition="4423" stopPosition="6277" />
10 | 	<protein abbreviation="NS4A" startPosition="6277" stopPosition="6658" />
11 | 	<protein abbreviation="2K" startPosition="6658" stopPosition="6727" />
12 | 	<protein abbreviation="NS4B" startPosition="6727" stopPosition="7462" />
13 | 	<protein abbreviation="NS5" startPosition="7462" stopPosition="10165" />
14 | </orf>
15 | 


--------------------------------------------------------------------------------