├── .gitignore
├── COPYING
├── Makefile.am
├── README.md
├── bootstrap.sh
├── configure.ac
├── man
    ├── Makefile.am
    ├── ccheck.1
    ├── ma.1
    └── mia.1
├── matrices
    ├── Makefile.am
    ├── ancient.submat.solexa.onepass.txt
    ├── ancient.submat.solexa.pe.txt
    └── ancient.submat.txt
├── misc
    ├── easy-consensus.hs
    └── mt311.fa
├── src
    ├── Makefile.am
    ├── ccheck.cc
    ├── fsdb.c
    ├── fsdb.h
    ├── io.c
    ├── io.h
    ├── kmer.c
    ├── kmer.h
    ├── map_align.c
    ├── map_align.h
    ├── map_alignment.c
    ├── map_alignment.h
    ├── map_assembler.c
    ├── mia.c
    ├── mia.h
    ├── mia_main.c
    ├── mt311.c
    ├── myers_align.c
    ├── myers_align.h
    ├── params.h
    ├── pssm.c
    ├── pssm.h
    └── types.h
└── test
    ├── mia_testsuite.c
    ├── tf.fna
    ├── tf2.fna
    └── tr1.fna


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled Object files
 2 | *.slo
 3 | *.lo
 4 | *.o
 5 | 
 6 | # Compiled Dynamic libraries
 7 | *.so
 8 | 
 9 | # Compiled Static libraries
10 | *.lai
11 | *.la
12 | *.a
13 | 


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
 1 | Artistic License 2.0
 2 | 
 3 | Copyright (c) 2000-2006, The Perl Foundation.
 4 | 
 5 | Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
 6 | Preamble
 7 | 
 8 | This license establishes the terms under which a given free software Package may be copied, modified, distributed, and/or redistributed. The intent is that the Copyright Holder maintains some artistic control over the development of that Package while still keeping the Package available as open source and free software.
 9 | 
10 | You are always permitted to make arrangements wholly outside of this license directly with the Copyright Holder of a given Package. If the terms of this license do not permit the full use that you propose to make of the Package, you should contact the Copyright Holder and seek a different licensing arrangement.
11 | Definitions
12 | 
13 | "Copyright Holder" means the individual(s) or organization(s) named in the copyright notice for the entire Package.
14 | 
15 | "Contributor" means any party that has contributed code or other material to the Package, in accordance with the Copyright Holder's procedures.
16 | 
17 | "You" and "your" means any person who would like to copy, distribute, or modify the Package.
18 | 
19 | "Package" means the collection of files distributed by the Copyright Holder, and derivatives of that collection and/or of those files. A given Package may consist of either the Standard Version, or a Modified Version.
20 | 
21 | "Distribute" means providing a copy of the Package or making it accessible to anyone else, or in the case of a company or organization, to others outside of your company or organization.
22 | 
23 | "Distributor Fee" means any fee that you charge for Distributing this Package or providing support for this Package to another party. It does not mean licensing fees.
24 | 
25 | "Standard Version" refers to the Package if it has not been modified, or has been modified only in ways explicitly requested by the Copyright Holder.
26 | 
27 | "Modified Version" means the Package, if it has been changed, and such changes were not explicitly requested by the Copyright Holder.
28 | 
29 | "Original License" means this Artistic License as Distributed with the Standard Version of the Package, in its current version or as it may be modified by The Perl Foundation in the future.
30 | 
31 | "Source" form means the source code, documentation source, and configuration files for the Package.
32 | 
33 | "Compiled" form means the compiled bytecode, object code, binary, or any other form resulting from mechanical transformation or translation of the Source form.
34 | Permission for Use and Modification Without Distribution
35 | 
36 | (1) You are permitted to use the Standard Version and create and use Modified Versions for any purpose without restriction, provided that you do not Distribute the Modified Version.
37 | Permissions for Redistribution of the Standard Version
38 | 
39 | (2) You may Distribute verbatim copies of the Source form of the Standard Version of this Package in any medium without restriction, either gratis or for a Distributor Fee, provided that you duplicate all of the original copyright notices and associated disclaimers. At your discretion, such verbatim copies may or may not include a Compiled form of the Package.
40 | 
41 | (3) You may apply any bug fixes, portability changes, and other modifications made available from the Copyright Holder. The resulting Package will still be considered the Standard Version, and as such will be subject to the Original License.
42 | Distribution of Modified Versions of the Package as Source
43 | 
44 | (4) You may Distribute your Modified Version as Source (either gratis or for a Distributor Fee, and with or without a Compiled form of the Modified Version) provided that you clearly document how it differs from the Standard Version, including, but not limited to, documenting any non-standard features, executables, or modules, and provided that you do at least ONE of the following:
45 | 
46 | (a) make the Modified Version available to the Copyright Holder of the Standard Version, under the Original License, so that the Copyright Holder may include your modifications in the Standard Version.
47 | (b) ensure that installation of your Modified Version does not prevent the user installing or running the Standard Version. In addition, the Modified Version must bear a name that is different from the name of the Standard Version.
48 | (c) allow anyone who receives a copy of the Modified Version to make the Source form of the Modified Version available to others under
49 | (i) the Original License or
50 | (ii) a license that permits the licensee to freely copy, modify and redistribute the Modified Version using the same licensing terms that apply to the copy that the licensee received, and requires that the Source form of the Modified Version, and of any works derived from it, be made freely available in that license fees are prohibited but Distributor Fees are allowed.
51 | Distribution of Compiled Forms of the Standard Version or Modified Versions without the Source
52 | 
53 | (5) You may Distribute Compiled forms of the Standard Version without the Source, provided that you include complete instructions on how to get the Source of the Standard Version. Such instructions must be valid at the time of your distribution. If these instructions, at any time while you are carrying out such distribution, become invalid, you must provide new instructions on demand or cease further distribution. If you provide valid instructions or cease distribution within thirty days after you become aware that the instructions are invalid, then you do not forfeit any of your rights under this license.
54 | 
55 | (6) You may Distribute a Modified Version in Compiled form without the Source, provided that you comply with Section 4 with respect to the Source of the Modified Version.
56 | Aggregating or Linking the Package
57 | 
58 | (7) You may aggregate the Package (either the Standard Version or Modified Version) with other packages and Distribute the resulting aggregation provided that you do not charge a licensing fee for the Package. Distributor Fees are permitted, and licensing fees for other components in the aggregation are permitted. The terms of this license apply to the use and Distribution of the Standard or Modified Versions as included in the aggregation.
59 | 
60 | (8) You are permitted to link Modified and Standard Versions with other works, to embed the Package in a larger work of your own, or to build stand-alone binary or bytecode versions of applications that include the Package, and Distribute the result without restriction, provided the result does not expose a direct interface to the Package.
61 | Items That are Not Considered Part of a Modified Version
62 | 
63 | (9) Works (including, but not limited to, modules and scripts) that merely extend or make use of the Package, do not, by themselves, cause the Package to be a Modified Version. In addition, such works are not considered parts of the Package itself, and are not subject to the terms of this license.
64 | General Provisions
65 | 
66 | (10) Any use, modification, and distribution of the Standard or Modified Versions is governed by this Artistic License. By using, modifying or distributing the Package, you accept this license. Do not use, modify, or distribute the Package, if you do not accept this license.
67 | 
68 | (11) If your Modified Version has been derived from a Modified Version made by someone other than you, you are nevertheless required to ensure that your Modified Version complies with the requirements of this license.
69 | 
70 | (12) This license does not grant you the right to use any trademark, service mark, tradename, or logo of the Copyright Holder.
71 | 
72 | (13) This license includes the non-exclusive, worldwide, free-of-charge patent license to make, have made, use, offer to sell, sell, import and otherwise transfer the Package with respect to any patent claims licensable by the Copyright Holder that are necessarily infringed by the Package. If you institute patent litigation (including a cross-claim or counterclaim) against any party alleging that the Package constitutes direct or contributory patent infringement, then this Artistic License to you shall terminate on the date that such litigation is filed.
73 | 
74 | (14) Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
75 | 


--------------------------------------------------------------------------------
/Makefile.am:
--------------------------------------------------------------------------------
1 | # Top-level Automake input.  Directories are visited in the order listed:
2 | # program sources, manual pages, then the substitution matrices.
3 | AUTOMAKE_OPTIONS = foreign
4 | 
5 | SUBDIRS = \
6 |     src \
7 |     man \
8 |     matrices
9 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | mapping-iterative-assembler
 2 | ===========================
 3 | 
 4 | The basic idea of this program is to align DNA sequencing fragments
 5 | (shotgun or targeted resequencing) to a reference, then call a
 6 | consensus.  Then the consensus is used as new reference and the process
 7 | is repeated until convergence.  Since it was originally designed to be
 8 | used on ancient DNA, it supports a position specific substitution
 9 | matrix, which improves both alignment and consensus calling on
10 | chemically damaged aDNA.
11 | 
12 | MIA has been used to assemble a number of Neandertal and early modern
13 | human mitochondria.   Occasionally it has been used on smallish nuclear
14 | regions, but it will probably not scale to a genome wide analysis.
15 | 
16 | 
17 | contamination-checker
18 | =====================
19 | 
20 | This program takes the output of MIA and tries to make sure an assembled
21 | mitochondrion is free from contamination.  It works by looking for
22 | positions in the called consensus where it differs from a panel of known
23 | human mitochondria, then classifies each read as either belonging to the
24 | sample or a putative contaminant.
25 | 
26 | 
27 | Installation Instructions
28 | =========================
29 | 
30 | If you get MIA from the Git repository, you will need the usual build
31 | environment (Gcc, Make, ...) and GNU Autotools (autoconf and automake).
32 | Installation should work like this:
33 | 
34 |     sh bootstrap.sh
35 |     ./configure
36 |     make
37 |     make install
38 | 


--------------------------------------------------------------------------------
/bootstrap.sh:
--------------------------------------------------------------------------------
1 | #! /bin/sh
2 | # Regenerate the build system.  Order matters: aclocal writes aclocal.m4
3 | # (Automake's macros) for the other tools, and autoheader writes the
4 | # config.h.in template that automake requires to be present.
5 | set -e
6 | aclocal -Wall
7 | autoheader -Wall
8 | automake -a -Wall
9 | autoconf -Wall


--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
 1 | #                                               -*- Autoconf -*-
 2 | # Process this file with autoconf to produce a configure script.
 3 | 
 4 | AC_PREREQ([2.60])
 5 | 
 6 | dnl NOTE(review): svnversion is defined here but never expanded in this file;
 7 | dnl it is kept only so that nothing relying on the definition breaks.
 8 | define([svnversion], esyscmd([sh -c "svnversion -cn"]))dnl
 9 | AC_INIT([MIA],[1.0],[green@eva.mpg.de])
10 | AM_INIT_AUTOMAKE
11 | 
12 | AC_CONFIG_SRCDIR([src/mia.c])
13 | dnl Plural form; the singular spelling of this macro is obsolete.
14 | AC_CONFIG_HEADERS([src/config.h])
15 | 
16 | # Checks for programs.
17 | AC_PROG_CC
18 | AC_PROG_CXX
19 | 
20 | # Checks for header files.
21 | AC_HEADER_STDC
22 | AC_CHECK_HEADERS([float.h limits.h stdlib.h string.h math.h])
23 | 
24 | # Checks for typedefs, structures, and compiler characteristics.
25 | AC_C_CONST
26 | AC_C_INLINE
27 | AC_C_RESTRICT
28 | AC_TYPE_SIZE_T
29 | AC_HEADER_STDBOOL
30 | 
31 | # Checks for library functions.
32 | AC_FUNC_MALLOC
33 | AC_CHECK_FUNCS([memset strstr])
34 | AC_CHECK_LIB([m],[pow])
35 | AC_CHECK_LIB([m],[log10])
36 | 
37 | AC_CONFIG_FILES([Makefile src/Makefile man/Makefile matrices/Makefile])
38 | AC_OUTPUT
39 | 


--------------------------------------------------------------------------------
/man/Makefile.am:
--------------------------------------------------------------------------------
1 | # Manual pages to install; the dist_ prefix also ships them in the tarball.
2 | dist_man_MANS = \
3 |     mia.1 \
4 |     ma.1 \
5 |     ccheck.1
6 | 


--------------------------------------------------------------------------------
/man/ccheck.1:
--------------------------------------------------------------------------------
  1 | .TH CCHECK "1" "October 2009" "ccheck" "User Commands"
  2 | .SH NAME
  3 | CCHECK \fB\-\-\fR Contamination Check for mia
  4 | .SH SYNOPSIS
  5 | ccheck [\-r \fIref.fa\fR] [\-a] [\-t] [\-s \fIM\-N\fR] [\-v] [\fIaln.maln\fR...]
  6 | .SH DESCRIPTION
  7 | \fBCcheck\fR looks at alignments in .maln assembly files as generated by
  8 | \fBMia\fR and tries to determine for each sequence whether it is likely
  9 | to be a known contaminant or genuine sequence.  Here \fIref.fa\fR is the
 10 | expected contaminant in FastA format, \fIaln.maln\fR is the output from
 11 | \fBMia\fR, typically the last iteration.
 12 | 
 13 | \fBCcheck\fR aligns the
 14 | assembled sequence and the contaminant, determining positions where no
 15 | credible contaminant would look like the sample, so called \fIstrongly
 16 | diagnostic positions\fR.  It also identifies positions where some
 17 | credible contaminants would look unlike the sample, these are \fIweakly
 18 | diagnostic positions\fR.  It then aligns each read from
 19 | the \fImaln\fR file to the contaminant in the same way \fBMia\fR would have
 20 | and checks for clearly aligned \fIdiagnostic positions\fR.  A \fIweakly
 21 | diagnostic position\fR can only be used once an actual contaminant was
 22 | seen, such positions are then called \fIeffectively diagnostic\fR.
 23 | 
 24 | Depending on whether bases at diagnostic positions are consistent with
 25 | genuine sequence and/or contamination, sequence reads are labelled
 26 | clean, contaminant, conflicting, nonsensical or uninformative.  A
 27 | contamination estimate with a 95% confidence interval is then printed,
 28 | once estimated from only \fIstrongly diagnostic positions\fR, once
 29 | estimated from \fIstrongly\fR and \fIeffectively diagnostic
 30 | positions\fR.
 31 | 
 32 | .SH OPTIONS
 33 | .TP
 34 | \fB\-r\fR, \fB--reference\fR \fIref.fa\fR
 35 | Defines the expected contaminant.  If your assembly is the mitochondrion
 36 | of a primate, \fIref.fa\fR would be the consensus of many human
 37 | mitochondria.  The contaminant is allowed to contain the \fIN\fR code for
 38 | unknown positions, but no other ambiguity codes or gaps.  If no
 39 | contaminant is given, a builtin consensus of 311 human mitochondria is
 40 | used.
 41 | .TP 
 42 | \fB\-a\fR, \fB--ancient\fR
 43 | Treat DNA as ancient, that is, assume it suffered from deamination.  This
 44 | option has no influence on the alignment algorithm (it always takes its
 45 | substitution matrices from the maln file) or the definition of
 46 | diagnostic positions (these are still simply differences between
 47 | contaminant and consensus).  It merely tells \fBccheck\fR
 48 | that a \fIT\fR or an \fIA\fR in a sequence are less informative because
 49 | they could have been formed from a \fIC\fR or \fIG\fR, respectively.  If
 50 | a diagnostic position is a transition, \fIT\fR and \fIA\fR are
 51 | consequently treated as consistent with both consensus and contaminant.
 52 | In contrast, \fIC\fR and \fIG\fR remain fully informative in any case.
 53 | .TP
 54 | \fB\-t\fR, \fB--transversions\fR
 55 | Restrict analysis to sites that show a transversion between contaminant
 56 | and assembly.  Using this option throws away valuable information and is
 57 | recommended if and only if you're going to write a paper about ancient
 58 | DNA and don't want to explain why you trust a \fIC\fR but don't trust a
 59 | \fIT\fR.
 60 | .TP
 61 | \fB\-s\fR, \fB--span\fR \fIM\-N\fR
 62 | Restrict analysis to the range from \fIM\fR to \fIN\fR on the
 63 | contaminant sequence.  Sticking with the mitochondrion example, you
 64 | could restrict the analysis to the hypervariable region.  Again, this
 65 | throws away information and could only ever be useful to placate
 66 | reviewers of your paper.
 67 | .TP
 68 | \fB\-n\fR, \fB--numpos\fR \fIN\fR
 69 | Require \fIN\fR diagnostic sites to be spanned by a single read for it
 70 | to be considered.  Defaults to 1.
 71 | .TP
 72 | \fB\-d\fR, \fB--maxd\fR \fID\fR
 73 | Allow up to \fID\fR differences when aligning assembly and contaminant.
 74 | This option is not normally needed, unless your contaminant differs
 75 | widely enough from the assembly that their alignment fails.  If so, you
 76 | could specify a bigger \fID\fR to allow aligning them anyway, though
 77 | \fBccheck\fR will likely fail even if the alignment doesn't.
 78 | .TP
 79 | \fB\-v\fR, \fB--verbose\fR
 80 | Increases the amount of debug output.  If repeated, produces even more
 81 | output.  You are not expected to understand any of the resulting
 82 | gibberish.
 83 | .TP
 84 | \fB\-f\fR, \fB--force\fR
 85 | Normally, \fBccheck\fR looks for a .maln file with a higher number than
 86 | given on the command line.  This behaviour is usually desired, but can
 87 | be overridden by specifying the \fB\-f\fR option.  
 88 | .TP  
 89 | \fB\-T\fR, \fB--table\fR
 90 | Change format of output to a table.  The table is much easier to parse
 91 | for downstream scripts, but harder to read with naked eyes.  It may be
 92 | way easier to use than the prose output if many input files are
 93 | processed in a single invocation. 
 94 | .TP
 95 | \fB\-F\fR, \fB--shoot\fR, \fB--foot\fR
 96 | Run on a modern sample (as in ''anatomically modern human'') despite
 97 | better knowledge.  Mnemonic: ''I want to shoot myself in the foot.''
 98 | 
 99 | .SH NOTES
100 | 
101 | Despite this documentation, which hints otherwise, you should probably
102 | not base a scientific paper on the output from this program.  Use at
103 | your own risk.
104 | 
105 | Users have been observed trying to trick \fBccheck\fR into comparing things it
106 | is not supposed to compare, e.g. by running it on .maln output files from
107 | the first \fBMia\fR iteration or by supplying variously mangled reference
108 | sequences.  Such endeavours tend to end in combinations of meaningless
109 | numbers and insulting error messages.  
110 | 
111 | Originally, \fBccheck\fR was intended to be used on the mitochondria of
112 | Neanderthals, where it works reasonably well.  That check, if applied to
113 | modern humans (or archaic, modern humans), has to fail by construction.
114 | As it turns out, the new test involving weakly diagnostic positions ends
115 | up being confused by sequencing error and therefore fails, too.  The
116 | use of \fBccheck\fR on modern humans is therefore useless.
117 | 
118 | If \fBccheck\fR reports a contamination level of more than a few
119 | percent, chances are that your assembly is completely worthless and the
120 | real contamination level is much higher.  If \fBccheck\fR finds hardly
121 | any diagnostic positions, you may have actually assembled the
122 | contaminant, which will look clean.  Finally, if \fBccheck\fR doesn't
123 | report contamination, your contaminant may simply look different from
124 | what you expected.  This program will not magically clean up your
125 | sample.
126 | 
127 | .SH "AUTHOR"
128 | Written by Ed Green, Michael Siebauer and Udo Stenzel.
129 | 
130 | .SH "REPORTING BUGS"
131 | Report bugs to <green@eva.mpg.de>.
132 | 
133 | .SH "COPYRIGHT"
134 | Copyright © 2008 Ed Green  Michael Siebauer  Udo Stenzel.
135 | License Perl Artistic License 2.0
136 | <http://www.perlfoundation.org/artistic_license_2_0>.  This is free
137 | software: you are free to change and redistribute it.  There is NO
138 | WARRANTY, to the extent permitted by law.
139 | 
140 | .SH "SEE ALSO"
141 | mia (1)
142 | 


--------------------------------------------------------------------------------
/man/ma.1:
--------------------------------------------------------------------------------
  1 | .TH MA "1" "September 2009" "ma" "User Commands"
  2 | .SH NAME
  3 | MA \fB\-\-\fR Mapping Assembler
  4 | .SH DESCRIPTION
  5 | MA reports information for a .maln assembly file as generated by \fBMIA\fR. It can also convert the .maln files into different alignment formats. 
  6 | .SH SYNOPSIS
  7 | ma [\fIOPTIONS\fR] \fB\-M\fR \fI<maln input file>\fR
  8 | .SH OPTIONS
  9 | .TP
 10 | \fB\-c\fR \fI<consensus code>\fR
 11 | How the assembly calls each base can be determined by the consensus code.
 12 | .RS
 13 | .TP 
 14 | \fB1\fR = 
 15 | highest, positive aggregate score base (if any)
 16 | .TP
 17 | \fB2\fR =
 18 | highest aggregate score base if it is 2400 higher than second highest
 19 | .RE
 20 | .TP
 21 | \fB\-f\fR \fI<output format>\fR
 22 | Select the \fIoutput format\fR as one of the following. See \fIFORMATS\fR for a full description of available formats.
 23 | .RS
 24 | .PD 0
 25 | .TP 
 26 | \fB1\fR => 
 27 | clustalw
 28 | .TP
 29 | \fB2\fR => 
 30 | line format
 31 | .TP
 32 | \fB3\fR => 
 33 | column format 
 34 | .TP
 35 | \fB4\fR =>
 36 | 13-column format 
 37 | .IP
 38 | of all assembly data for positions that \fIdiffer\fR between consensus and \fBCURRENT\fR reference sequence of this iteration. 
 39 | .TP
 40 | \fB41\fR => 
 41 | 13-column format, but for \fIALL\fR positions
 42 | .TP
 43 | \fB5\fR =>
 44 | fasta
 45 | .TP
 46 | \fB6\fR =>
 47 | region format
 48 | .IP
 49 | shows the reference sequence, the consensus sequence, and then all assembled fragments in a region specified by option \fB\-R\fR
 50 | .RS
 51 | .TP
 52 | \fB\-C\fR
 53 | Also color the format \fB6\fR output \-> don't pipe this output to file!
 54 | .RE
 55 | .TP
 56 | \fB61\fR => 
 57 | multi-fasta
 58 | .IP
 59 | same as above, but in multi\-fasta format for viewing in Bioedit, e.g. (also requires a region as specified by the option \fB\-R\fR)
 60 | .TP
 61 | \fB7\fR => ACE
 62 | .RE
 63 | .PD
 64 | .TP
 65 | \fB\-R\fR <\fIREGION_START\fR:\fIREGION_END\fR>
 66 | Output only for a certain region. Use together with a \fIregion format\fR (\fB-f 6\fR or \fB-f 61\fR)
 67 | .TP
 68 | \fB\-I\fR <\fIID\fR> 
 69 | \fIConsensus_ID\fR to assign to assembly sequence
 70 | 
 71 | .SH FORMATS
 72 | The following output formats can be chosen by using \fB-f\fR option. 
 73 | .RS
 74 | .TP 
 75 | \fB1\fR => 
 76 | clustalw
 77 | .TP
 78 | \fB2\fR => 
 79 | line format
 80 | .PD 0
 81 | .IP
 82 | \fBfirst line\fR is "Consensus, \fIreference_name\fR, coverage:"
 83 | .IP
 84 | \fBsecond line\fR is the entire, assembled, aligned consensus sequence
 85 | .IP
 86 | \fBthird line\fR is the entire aligned reference sequence to which the consensus is aligned
 87 | .IP
 88 | \fBfourth line\fR is the sequence coverage at each position in a space-separated list of integers
 89 | .PD
 90 | .TP
 91 | \fB3\fR => 
 92 | column format 
 93 | .IP 
 94 | one line per base, one column for consensus, reference, and coverage; includes header with summary info
 95 | .TP
 96 | \fB4\fR =>
 97 | 13-column format of all assembly data for positions that differ between consensus and \fBCURRENT\fR reference sequence of this iteration. 
 98 | 
 99 | .B
100 | Note that in the FINAL iteration reference and consensus are equal! So there won't be any output. 
101 | .IP
102 | Each row has the following columns: 
103 | .RS
104 | .PD 0
105 | .TP
106 | \fB(1)\fR
107 | position on reference; 0\-based coordinates 
108 | .TP
109 | \fB(2)\fR
110 | reference base
111 | .TP
112 | \fB(3)\fR
113 | consensus assembly base
114 | .TP
115 | \fB(4)\fR
116 | coverage
117 | .TP
118 | \fB(5)\fR
119 | A's
120 | .TP
121 | \fB(6)\fR
122 | C's
123 | .TP
124 | \fB(7)\fR
125 | G's
126 | .TP
127 | \fB(8)\fR
128 | T's
129 | .TP
130 | \fB(9)\fR
131 | gaps; 
132 | .IP
133 | columns \fB5\fR through \fB9\fR should add up to column \fB4\fR 
134 | .TP
135 | \fB(10)\fR
136 | aggregate score for \fIA\fR
137 | .TP
138 | \fB(11)\fR
139 | aggregate score for \fIC\fR
140 | .TP
141 | \fB(12)\fR
142 | aggregate score for \fIG\fR
143 | .TP
144 | \fB(13)\fR
145 | aggregate score for \fIT\fR
146 | .PD
147 | .RE
148 | .TP
149 | \fB41\fR => 
150 | 13-column format, but for \fIALL\fR positions
151 | .TP
152 | \fB5\fR =>
153 | fasta
154 | .TP
155 | \fB6\fR =>
156 | region format
157 | .IP
158 | shows the reference sequence, the consensus sequence, and then all assembled fragments in a region specified by option \fB\-R\fR
159 | .RS
160 | .TP
161 | \fB\-C\fR
162 | Also color the format \fB6\fR output \-> don't pipe this output to file!
163 | .RE
164 | .TP
165 | \fB61\fR => 
166 | multi-fasta
167 | .IP
168 | same as above, but in multi\-fasta format for viewing in Bioedit, e.g. (also requires a region as specified by the option \fB\-R\fR)
169 | .TP
170 | \fB7\fR => ACE
171 | .RE
172 | .PD
173 | 
174 | .SH "AUTHOR"
175 | Written by Ed Green and Michael Siebauer. 
176 | 
177 | .SH "REPORTING BUGS"
178 | Report bugs to <green@eva.mpg.de>.
179 | 
180 | .SH "COPYRIGHT"
181 | Copyright © 2008 Ed Green  Michael Siebauer.
182 | License Perl Artistic License 2.0 <http://www.perlfoundation.org/artistic_license_2_0>. 
183 | This is free software: you are free to change and redistribute it.  There is NO WARRANTY, to the extent permitted by law.
184 | 
185 | .SH "SEE ALSO"
186 | mia (1)
187 | 


--------------------------------------------------------------------------------
/man/mia.1:
--------------------------------------------------------------------------------
  1 | .TH MIA "1" "September 2009" "mia" "User Commands"
  2 | .SH NAME
  3 | MIA \fB\-\-\fR Mapping Iterative Assembler
  4 | .IP
  5 | A tool for creating short read assemblies.
  6 | .SH "SYNOPSIS"
  7 | mia [\fIOPTIONS\fR] \fB\-f\fR \fIfragment reads\fR \fB\-r\fR \fIreference sequence\fR [\fB\-s\fR \fIsubstitution matrix\fR]
  8 | .SH OPTIONS
  9 | .TP
 10 | \fB\-r\fR \fIreference sequence\fR
 11 | initial reference sequence in fasta format
 12 | .TP
 13 | \fB\-f\fR \fIfragment reads\fR
 14 | fasta or fastq file of fragments to align
 15 | .TP
 16 | \fB\-s\fR \fIsubstitution matrix\fR
 17 | substitution matrix file used for scoring (\fBdefault\fR: \fIflat matrix\fR).
 18 | Mia searches for the substitution matrix first in the current directory,
 19 | and if it is not found, in the default location.  If it still isn't
 20 | found, a list of available matrices is printed.  (Therefore, use e.g.
 21 | \fI\-s list\fR to get a list of installed matrices.)
 22 | .TP
 23 | \fB\-m\fR \fINAME\fR
 24 | use \fINAME\fR as root file name for maln output file(s) (\fBdefault\fR: \fIassembly.maln.iter\fR)
 25 | .SS "FILTER parameters:"
 26 | .PP
 27 | A set of filters that can be applied to the reads. 
 28 | .TP
 29 | \fB\-u\fR 
 30 | fasta database has repeat sequences, keep one based on alignment score
 31 | .TP
 32 | \fB\-U\fR 
 33 | fasta database has repeat sequences, keep one based on sum of q\-scores
 34 | .TP
 35 | \fB\-C\fR 
 36 | collapse sequences with same start, end, strand info into a single sequence
 37 | .TP
 38 | \fB\-A\fR 
 39 | use adapter presence and coordinate information to more aggressively remove repeat sequences \- suitable only for 454 sequences that have not
 40 | already been adapter trimmed
 41 | .TP
 42 | \fB\-T\fR 
 43 | fasta database has adapters, trim these (need option \fB\-a\fR)
 44 | .TP
 45 | \fB\-a\fR \fI<adapter sequence or code>\fR
 46 | If \fB\-T\fR is specified, mia will attempt to find and trim adapters on each sequence. The adapter sequence itself can be specified by a one letter code as argument to \fB\-a\fR. 
 47 | 
 48 |     \fBN\fR or \fBn\fR 	                 => Neandertal adapter
 49 |     any other single letter     => Standard GS FLX adapter
 50 |     sequence (less than 127 nt) => user-specified adapter
 51 | 
 52 | .TP
 53 | \fB\-k\fR \fILENGTH\fR
 54 | use kmer filter with kmers of this \fIlength\fR. The kmer filter requires that a sequence fragment have at least one kmer of the specified length in common with the reference sequence in order to align it. For 36nt Solexa data, a value of \fB12\fR works well.
 55 | .TP
 56 | \fB\-I\fR \fIFILE\fR
 57 | filename of list of sequence IDs to use, ignoring all others
 58 | .SS "ALIGNMENT parameters:"
 59 | .TP
 60 | \fB\-p\fR \fI<consensus calling code>\fR
 61 | The \fB\-p\fR option specifies how the new consensus assembly sequence is called at each iteration (\fBdefault\fR: \fB1\fR)
 62 | .PD 0
 63 | .RS
 64 | .TP
 65 | \fB1\fR => 
 66 | Any base whose aggregate score is \fBMIN_SC_DIFF_CONS\fR better than all others is the assembly base. If none is, then N is the assembly base.
 67 | .TP
 68 | \fB2\fR => 
 69 | The best scoring base whose aggregate score is better than \fBMIN_SCORE_CONS\fR is the assembly base. If none is, then N is the assembly base.
 70 | .PD
 71 | .RE  
 72 | .TP
 73 | \fB\-c\fR 
 74 | means reference/assembly is circular
 75 | .TP
 76 | \fB\-i\fR 
 77 | iterate assembly until convergence
 78 | .TP
 79 | \fB\-F\fR 
 80 | only output the FINAL assembly, not each iteration
 81 | .TP
 82 | \fB\-D\fR 
 83 | reference sequence is only distantly related. Low scoring reads will NOT be removed after each iteration
 84 | .TP
 85 | \fB\-h\fR 
 86 | give special discount for homopolymer gaps. Useful when using 454 sequencing data
 87 | .TP
 88 | \fB\-M\fR 
 89 | use lower\-case soft\-masking of kmers
 90 | .TP
 91 | \fB\-H\fR \fISCORE\fR
 92 | do not use dynamic score cutoff, instead use this hard \fIscore\fR cutoff
 93 | .TP
 94 | \fB\-S\fR \fISLOPE\fR
 95 | \fIslope\fR of length/score cutoff line
 96 | .TP
 97 | \fB\-N\fR \fIINTERCEPT\fR
 98 | \fIintercept\fR of length/score cutoff line
 99 | 
100 | .PP
101 | The procedure for removing bad\-scoring alignments from the assembly is:
102 | .IP
103 | \fBDefault\fR: fit a line to length versus score and remove reads that score more than \fBSCORE_CUTOFF_BUFFER\fR below the average score for their length.
104 | .IP
105 | If \fB\-H\fR is specified then this hard score cutoff is applied to all reads. This is preferable if all reads are the same length.
106 | .IP
107 | If \fB\-S\fR or \fB\-N\fR are specified, then these are used as the slope and intercept of a length/score line. Reads must score above this line to be included. If only one of \fB\-S\fR or \fB\-N\fR is specified then the default values are used for the other (\fBdefault S = 200.0; default N = 0.0\fR)
108 | 
109 | .SH FILES 
110 | .SS Substitution Matrices
111 | .PP 
112 | The substitution matrices are by default installed in the directory  
113 | .B $PREFIX/share/mia/matrices
114 | but you can also write your own. The matrices are:
115 | .IP
116 | \fIancient.submat.txt\fR is designed for ancient DNA and Roche \fB454\fR sequencing data.
117 | .IP
118 | \fIancient.submat.solexa.onepass.txt\fR takes an increased rate of G/T and C/A mismatches at the end of \fBIllumina\fR reads into account.
119 | .IP
120 | \fIancient.submat.solexa.pe.txt\fR like above but for \fBpaired end\fR reads.
121 | 
122 | .B
123 | If no matrix is supplied, a generic flat matrix with these parameters will be used:
124 | .IP
125 | MATCH=200, MISMATCH=\-600, N=\-100 for all positions
126 | 
127 | .SS Default Parameters
128 | .PP 
129 | All default parameters, like \fBSCORE_CUTOFF_BUFFER\fR or \fBMIN_SCORE_CONS\fR, can be changed by modifying the source file \fIparams.h\fR in the mia source directory and recompiling afterwards.
130 | 
131 | .SH DESCRIPTION
132 | .PP
133 | MIA assembles the \fIfragment reads\fR to a single consensus sequence using the \fIreference sequence\fR as guidance.
134 | 
135 | .SH "AUTHOR"
136 | Written by Ed Green and Michael Siebauer. 
137 | 
138 | .SH "REPORTING BUGS"
139 | Report bugs to <green@eva.mpg.de>.
140 | 
141 | 
142 | .SH "COPYRIGHT"
143 | Copyright © 2008 Ed Green and Michael Siebauer.
144 | License Perl Artistic License 2.0 <http://www.perlfoundation.org/artistic_license_2_0>. 
145 | This is free software: you are free to change and redistribute it.  There is NO WARRANTY, to the extent permitted by law.
146 | 
147 | 
148 | .SH "SEE ALSO"
149 | ma (1)
150 | 


--------------------------------------------------------------------------------
/matrices/Makefile.am:
--------------------------------------------------------------------------------
 1 | 
 2 | dist_matrices_DATA = \
 3 |     ancient.submat.solexa.onepass.txt \
 4 |     ancient.submat.solexa.pe.txt \
 5 |     ancient.submat.txt
 6 | 
 7 | matricesdir=$(prefix)/share/mia/matrices
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/matrices/ancient.submat.solexa.onepass.txt:
--------------------------------------------------------------------------------
  1 | # Matrix for position: 1
  2 | 200	-520	-500	-660	
  3 | -620	190	-580	0	
  4 | -510	-510	240	-610	
  5 | -660	-560	-600	120	
  6 | 
  7 | # Matrix for position: 2
  8 | 190	-550	-570	-670	
  9 | -580	210	-590	-50	
 10 | -560	-550	210	-650	
 11 | -650	-590	-620	140	
 12 | 
 13 | # Matrix for position: 3
 14 | 180	-610	-590	-670	
 15 | -610	210	-600	-100	
 16 | -600	-580	210	-630	
 17 | -650	-560	-600	170	
 18 | 
 19 | # Matrix for position: 4
 20 | 180	-620	-590	-660	
 21 | -620	200	-600	-160	
 22 | -600	-580	210	-620	
 23 | -650	-610	-610	170	
 24 | 
 25 | # Matrix for position: 5
 26 | 180	-620	-570	-660	
 27 | -610	210	-610	-210	
 28 | -570	-600	200	-620	
 29 | -650	-600	-630	180	
 30 | 
 31 | # Matrix for position: 6
 32 | 180	-600	-620	-640	
 33 | -610	210	-610	-220	
 34 | -600	-570	200	-640	
 35 | -650	-620	-630	180	
 36 | 
 37 | # Matrix for position: 7
 38 | 190	-600	-620	-650	
 39 | -630	200	-600	-290	
 40 | -530	-610	210	-620	
 41 | -640	-620	-630	180	
 42 | 
 43 | # Matrix for position: 8
 44 | 190	-620	-610	-650	
 45 | -630	200	-620	-320	
 46 | -560	-600	200	-630	
 47 | -640	-600	-630	190	
 48 | 
 49 | # Matrix for position: 9
 50 | 190	-620	-630	-650	
 51 | -630	200	-610	-380	
 52 | -510	-610	200	-620	
 53 | -630	-600	-620	180	
 54 | 
 55 | # Matrix for position: 10
 56 | 190	-600	-590	-630	
 57 | -590	200	-610	-360	
 58 | -480	-610	200	-620	
 59 | -640	-630	-630	180	
 60 | 
 61 | # Matrix for position: 11
 62 | 190	-630	-580	-650	
 63 | -620	200	-610	-390	
 64 | -510	-610	210	-630	
 65 | -650	-600	-610	180	
 66 | 
 67 | # Matrix for position: 12
 68 | 190	-630	-630	-640	
 69 | -630	200	-620	-400	
 70 | -440	-620	200	-630	
 71 | -640	-590	-610	190	
 72 | 
 73 | # Matrix for position: 13
 74 | 190	-630	-600	-630	
 75 | -620	200	-620	-420	
 76 | -500	-620	200	-590	
 77 | -640	-560	-590	190	
 78 | 
 79 | # Matrix for position: 14
 80 | 190	-610	-630	-640	
 81 | -610	200	-620	-420	
 82 | -490	-600	200	-630	
 83 | -640	-560	-630	190	
 84 | 
 85 | # Matrix for position: 15
 86 | 190	-600	-600	-610	
 87 | -600	200	-610	-450	
 88 | -520	-620	200	-610	
 89 | -630	-610	-610	190	
 90 | 
 91 | # Matrix for position: MIDDLE
 92 | 190	-810	-760	-830	
 93 | -800	200	-830	-460	
 94 | -590	-810	200	-760	
 95 | -860	-790	-770	190	
 96 | 
 97 | # Matrix for position: -15
 98 | 190	-600	-620	-620	
 99 | -620	200	-620	-470	
100 | -480	-620	200	-600	
101 | -640	-610	-630	180	
102 | 
103 | # Matrix for position: -14
104 | 190	-570	-600	-600	
105 | -620	200	-620	-460	
106 | -490	-580	200	-600	
107 | -620	-610	-580	190	
108 | 
109 | # Matrix for position: -13
110 | 190	-620	-610	-610	
111 | -620	200	-620	-490	
112 | -450	-620	200	-550	
113 | -640	-610	-590	190	
114 | 
115 | # Matrix for position: -12
116 | 190	-620	-610	-620	
117 | -620	200	-620	-480	
118 | -490	-610	190	-610	
119 | -640	-580	-590	190	
120 | 
121 | # Matrix for position: -11
122 | 190	-580	-600	-600	
123 | -600	200	-620	-490	
124 | -500	-600	200	-570	
125 | -640	-630	-620	180	
126 | 
127 | # Matrix for position: -10
128 | 190	-580	-590	-640	
129 | -610	200	-620	-470	
130 | -460	-590	190	-560	
131 | -640	-580	-610	190	
132 | 
133 | # Matrix for position: -9
134 | 190	-600	-610	-600	
135 | -630	200	-620	-480	
136 | -460	-610	190	-590	
137 | -630	-600	-630	190	
138 | 
139 | # Matrix for position: -8
140 | 190	-580	-590	-640	
141 | -630	200	-590	-470	
142 | -430	-600	200	-530	
143 | -640	-580	-600	190	
144 | 
145 | # Matrix for position: -7
146 | 190	-610	-580	-640	
147 | -630	200	-580	-500	
148 | -400	-620	200	-530	
149 | -640	-600	-600	190	
150 | 
151 | # Matrix for position: -6
152 | 190	-580	-590	-580	
153 | -600	210	-590	-570	
154 | -380	-610	200	-520	
155 | -630	-600	-580	180	
156 | 
157 | # Matrix for position: -5
158 | 190	-570	-600	-590	
159 | -600	200	-550	-520	
160 | -360	-610	200	-430	
161 | -650	-610	-590	180	
162 | 
163 | # Matrix for position: -4
164 | 180	-540	-480	-580	
165 | -610	200	-540	-470	
166 | -320	-570	200	-480	
167 | -630	-590	-570	180	
168 | 
169 | # Matrix for position: -3
170 | 180	-530	-450	-550	
171 | -590	200	-460	-510	
172 | -240	-610	200	-410	
173 | -650	-560	-500	180	
174 | 
175 | # Matrix for position: -2
176 | 170	-530	-480	-550	
177 | -600	200	-470	-510	
178 | -180	-530	220	-260	
179 | -630	-530	-530	170	
180 | 
181 | # Matrix for position: -1
182 | 140	-470	-480	-510	
183 | -620	200	-440	-420	
184 | -70	-320	250	-100	
185 | -670	-430	-480	160	
186 | 
187 | 
188 | 
189 | 


--------------------------------------------------------------------------------
/matrices/ancient.submat.solexa.pe.txt:
--------------------------------------------------------------------------------
  1 | # Matrix for position: 1
  2 | 210	-220	-230	-330	
  3 | -290	160	-270	0	
  4 | -240	-220	210	-320	
  5 | -280	-240	-250	130	
  6 | 
  7 | # Matrix for position: 2
  8 | 180	-250	-270	-330	
  9 | -290	190	-270	-10	
 10 | -280	-230	200	-310	
 11 | -300	-240	-280	150	
 12 | 
 13 | # Matrix for position: 3
 14 | 180	-270	-230	-340	
 15 | -290	190	-230	-120	
 16 | -240	-210	240	-270	
 17 | -330	-270	-260	140	
 18 | 
 19 | # Matrix for position: 4
 20 | 170	-250	-270	-340	
 21 | -280	210	-230	-90	
 22 | -270	-210	210	-300	
 23 | -330	-270	-280	150	
 24 | 
 25 | # Matrix for position: 5
 26 | 180	-250	-250	-330	
 27 | -250	220	-250	-120	
 28 | -280	-230	200	-310	
 29 | -320	-270	-300	160	
 30 | 
 31 | # Matrix for position: 6
 32 | 170	-270	-270	-310	
 33 | -270	210	-250	-120	
 34 | -290	-240	200	-290	
 35 | -320	-270	-280	170	
 36 | 
 37 | # Matrix for position: 7
 38 | 170	-270	-290	-320	
 39 | -270	210	-250	-130	
 40 | -270	-240	210	-290	
 41 | -300	-270	-280	170	
 42 | 
 43 | # Matrix for position: 8
 44 | 190	-280	-260	-320	
 45 | -270	190	-250	-220	
 46 | -260	-250	220	-290	
 47 | -320	-300	-270	160	
 48 | 
 49 | # Matrix for position: 9
 50 | 190	-260	-270	-310	
 51 | -280	200	-260	-220	
 52 | -250	-250	210	-290	
 53 | -310	-300	-260	160	
 54 | 
 55 | # Matrix for position: 10
 56 | 180	-290	-270	-310	
 57 | -290	190	-260	-180	
 58 | -270	-250	210	-280	
 59 | -310	-290	-260	170	
 60 | 
 61 | # Matrix for position: 11
 62 | 180	-290	-260	-310	
 63 | -290	190	-250	-270	
 64 | -260	-250	220	-280	
 65 | -300	-300	-280	160	
 66 | 
 67 | # Matrix for position: 12
 68 | 160	-300	-280	-310	
 69 | -280	200	-260	-190	
 70 | -300	-260	200	-260	
 71 | -300	-260	-260	190	
 72 | 
 73 | # Matrix for position: 13
 74 | 170	-290	-250	-310	
 75 | -290	200	-240	-210	
 76 | -280	-250	200	-270	
 77 | -310	-280	-270	180	
 78 | 
 79 | # Matrix for position: 14
 80 | 180	-290	-270	-300	
 81 | -280	180	-270	-220	
 82 | -270	-260	210	-270	
 83 | -300	-270	-260	180	
 84 | 
 85 | # Matrix for position: 15
 86 | 170	-260	-270	-300	
 87 | -280	200	-260	-210	
 88 | -290	-260	190	-280	
 89 | -300	-240	-270	180	
 90 | 
 91 | # Matrix for position: MIDDLE
 92 | 180	-610	-590	-680	
 93 | -600	200	-610	-340	
 94 | -410	-580	210	-590	
 95 | -660	-610	-610	180	
 96 | 
 97 | # Matrix for position: -15
 98 | 190	-260	-270	-300	
 99 | -260	210	-260	-250	
100 | -200	-260	190	-300	
101 | -300	-280	-300	170	
102 | 
103 | # Matrix for position: -14
104 | 180	-270	-270	-310	
105 | -290	200	-250	-250	
106 | -240	-250	210	-280	
107 | -310	-290	-270	170	
108 | 
109 | # Matrix for position: -13
110 | 180	-290	-270	-290	
111 | -290	190	-260	-260	
112 | -230	-260	210	-280	
113 | -290	-290	-280	170	
114 | 
115 | # Matrix for position: -12
116 | 190	-260	-290	-300	
117 | -260	210	-250	-220	
118 | -220	-260	180	-280	
119 | -300	-270	-300	170	
120 | 
121 | # Matrix for position: -11
122 | 170	-290	-280	-300	
123 | -290	190	-260	-220	
124 | -170	-270	200	-260	
125 | -300	-280	-270	180	
126 | 
127 | # Matrix for position: -10
128 | 150	-300	-290	-310	
129 | -310	200	-250	-220	
130 | -190	-250	210	-260	
131 | -300	-270	-250	190	
132 | 
133 | # Matrix for position: -9
134 | 150	-290	-270	-330	
135 | -300	220	-230	-220	
136 | -200	-240	210	-260	
137 | -330	-260	-260	190	
138 | 
139 | # Matrix for position: -8
140 | 160	-270	-290	-320	
141 | -280	220	-240	-190	
142 | -170	-230	190	-260	
143 | -320	-260	-280	170	
144 | 
145 | # Matrix for position: -7
146 | 170	-260	-250	-320	
147 | -300	190	-240	-260	
148 | -140	-240	220	-280	
149 | -320	-300	-270	170	
150 | 
151 | # Matrix for position: -6
152 | 160	-280	-260	-310	
153 | -290	210	-230	-270	
154 | -140	-240	210	-270	
155 | -330	-280	-270	170	
156 | 
157 | # Matrix for position: -5
158 | 150	-290	-270	-310	
159 | -300	190	-250	-260	
160 | -150	-260	210	-250	
161 | -310	-250	-240	190	
162 | 
163 | # Matrix for position: -4
164 | 150	-290	-250	-330	
165 | -300	210	-220	-280	
166 | -80	-240	210	-240	
167 | -340	-280	-250	170	
168 | 
169 | # Matrix for position: -3
170 | 140	-260	-260	-330	
171 | -270	250	-200	-250	
172 | -80	-220	190	-300	
173 | -350	-230	-260	170	
174 | 
175 | # Matrix for position: -2
176 | 140	-290	-230	-310	
177 | -330	200	-210	-260	
178 | -20	-240	210	-270	
179 | -340	-280	-220	190	
180 | 
181 | # Matrix for position: -1
182 | 100	-310	-150	-320	
183 | -350	210	-100	-270	
184 | 30	-200	230	-220	
185 | -370	-240	-120	190	
186 | 
187 | 
188 | 
189 | 


--------------------------------------------------------------------------------
/matrices/ancient.submat.txt:
--------------------------------------------------------------------------------
  1 | # Matrix for position: 1
  2 | 227	-612	-489	-630	
  3 | -683	188	-701	-21	
  4 | -588	-507	251	-679	
  5 | -823	-658	-799	111	
  6 | 
  7 | # Matrix for position: 2
  8 | 195	-748	-788	-725	
  9 | -888	217	-666	-52	
 10 | -729	-626	217	-736	
 11 | -833	-771	-711	148	
 12 | 
 13 | # Matrix for position: 3
 14 | 185	-716	-547	-846	
 15 | -898	217	-615	-101	
 16 | -803	-848	210	-819	
 17 | -929	-774	-672	169	
 18 | 
 19 | # Matrix for position: 4
 20 | 186	-653	-666	-711	
 21 | -800	213	-712	-151	
 22 | -797	-696	216	-912	
 23 | -930	-656	-801	172	
 24 | 
 25 | # Matrix for position: 5
 26 | 189	-890	-805	-829	
 27 | -900	212	-724	-199	
 28 | -705	-772	206	-753	
 29 | -763	-887	-645	182	
 30 | 
 31 | # Matrix for position: 6
 32 | 190	-792	-899	-830	
 33 | -741	212	-877	-266	
 34 | -641	-871	211	-650	
 35 | -925	-896	-703	181	
 36 | 
 37 | # Matrix for position: 7
 38 | 193	-737	-693	-828	
 39 | -802	206	-880	-263	
 40 | -734	-873	215	-748	
 41 | -923	-903	-701	180	
 42 | 
 43 | # Matrix for position: 8
 44 | 194	-738	-735	-766	
 45 | -802	204	-783	-282	
 46 | -694	-719	212	-647	
 47 | -919	-803	-900	182	
 48 | 
 49 | # Matrix for position: 9
 50 | 188	-736	-700	-829	
 51 | -799	213	-876	-305	
 52 | -620	-772	211	-906	
 53 | -925	-738	-744	183	
 54 | 
 55 | # Matrix for position: 10
 56 | 194	-898	-737	-918	
 57 | -744	203	-729	-329	
 58 | -539	-784	208	-672	
 59 | -816	-702	-900	187	
 60 | 
 61 | # Matrix for position: 11
 62 | 194	-892	-663	-767	
 63 | -796	210	-779	-336	
 64 | -615	-595	210	-751	
 65 | -922	-902	-647	181	
 66 | 
 67 | # Matrix for position: 12
 68 | 193	-901	-797	-917	
 69 | -747	202	-789	-344	
 70 | -598	-886	209	-669	
 71 | -914	-802	-697	190	
 72 | 
 73 | # Matrix for position: 13
 74 | 193	-895	-789	-828	
 75 | -900	208	-775	-336	
 76 | -521	-714	216	-706	
 77 | -926	-807	-801	178	
 78 | 
 79 | # Matrix for position: 14
 80 | 196	-896	-895	-918	
 81 | -700	204	-788	-357	
 82 | -616	-785	208	-707	
 83 | -816	-672	-903	186	
 84 | 
 85 | # Matrix for position: 15
 86 | 190	-801	-746	-917	
 87 | -804	206	-789	-349	
 88 | -624	-886	205	-902	
 89 | -814	-895	-899	193	
 90 | 
 91 | # Matrix for position: MIDDLE
 92 | 187	-1069	-869	-991	
 93 | -906	212	-960	-453	
 94 | -462	-967	210	-875	
 95 | -1009	-958	-1040	187	
 96 | 
 97 | # Matrix for position: -15
 98 | 188	-740	-900	-818	
 99 | -744	210	-882	-553	
100 | -357	-726	205	-904	
101 | -920	-898	-741	191	
102 | 
103 | # Matrix for position: -14
104 | 187	-741	-739	-763	
105 | -803	210	-878	-521	
106 | -332	-881	208	-903	
107 | -923	-900	-799	189	
108 | 
109 | # Matrix for position: -13
110 | 180	-897	-907	-828	
111 | -669	219	-671	-592	
112 | -370	-773	208	-803	
113 | -930	-732	-800	189	
114 | 
115 | # Matrix for position: -12
116 | 188	-901	-896	-821	
117 | -746	208	-720	-587	
118 | -344	-682	210	-802	
119 | -923	-703	-798	189	
120 | 
121 | # Matrix for position: -11
122 | 187	-662	-706	-822	
123 | -797	216	-778	-548	
124 | -378	-780	203	-908	
125 | -724	-735	-905	189	
126 | 
127 | # Matrix for position: -10
128 | 181	-904	-705	-820	
129 | -709	211	-778	-635	
130 | -315	-882	207	-798	
131 | -923	-793	-735	196	
132 | 
133 | # Matrix for position: -9
134 | 182	-800	-641	-769	
135 | -745	214	-671	-618	
136 | -336	-774	211	-801	
137 | -730	-698	-698	187	
138 | 
139 | # Matrix for position: -8
140 | 189	-896	-798	-758	
141 | -801	210	-781	-619	
142 | -283	-654	203	-807	
143 | -722	-800	-701	190	
144 | 
145 | # Matrix for position: -7
146 | 187	-894	-800	-915	
147 | -643	212	-782	-615	
148 | -254	-730	200	-808	
149 | -763	-663	-901	192	
150 | 
151 | # Matrix for position: -6
152 | 186	-793	-791	-921	
153 | -801	214	-872	-702	
154 | -213	-881	206	-751	
155 | -649	-743	-742	185	
156 | 
157 | # Matrix for position: -5
158 | 181	-736	-791	-922	
159 | -805	214	-768	-740	
160 | -183	-721	206	-807	
161 | -733	-741	-696	187	
162 | 
163 | # Matrix for position: -4
164 | 172	-802	-629	-771	
165 | -755	214	-624	-740	
166 | -153	-713	213	-667	
167 | -841	-582	-652	187	
168 | 
169 | # Matrix for position: -3
170 | 171	-701	-675	-769	
171 | -817	212	-752	-804	
172 | -102	-775	214	-669	
173 | -944	-704	-778	185	
174 | 
175 | # Matrix for position: -2
176 | 144	-807	-770	-839	
177 | -732	224	-615	-726	
178 | -48	-658	218	-890	
179 | -648	-653	-689	191	
180 | 
181 | # Matrix for position: -1
182 | 111	-670	-686	-724	
183 | -660	247	-428	-625	
184 | -18	-700	191	-648	
185 | -605	-480	-509	225	
186 | 
187 | 
188 | 


--------------------------------------------------------------------------------
/misc/easy-consensus.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns, NamedFieldPuns, RecordWildCards #-}
  2 | import Control.Monad
  3 | import Data.Char ( toUpper )
  4 | import Data.List ( transpose, sortBy, sort )
  5 | import System.Console.GetOpt
  6 | import System.Environment ( getArgs, getProgName )
  7 | import System.Exit
  8 | import System.IO
  9 | -- Run-time configuration: consensus threshold (a fraction), the function collapsing a symbol set into one character, and the output sink.
 10 | data Conf = Opts { percent :: Double, to_ambicode :: String -> Char, output :: String -> IO () }
 11 | -- Defaults: percent = 1, collapse with toAmbicode, write with putStr.
 12 | defaultOpts :: IO Conf
 13 | defaultOpts = return $ Opts 1 toAmbicode putStr
 14 | -- Command line option table; every handler takes the current Conf and returns the updated one in IO.
 15 | options :: [OptDescr (Conf -> IO Conf)]
 16 | options = [
 17 |     Option "p" ["percent"] (ReqArg (\a c -> readIO a >>= \p -> return $ c { percent = p / 100 }) "P")
 18 |                            "Set percentage needed for consensus to P",
 19 |     Option "n" ["only-n"]  (NoArg (\c -> return $ c { to_ambicode = toNucleotide }))
 20 |                            "Allow only nucleotides and 'N' in consensus",
 21 |     Option "i" ["iupac"]   (NoArg (\c -> return $ c { to_ambicode = toUpper . toAmbicode }))
 22 |                            "Allow all IUPAC ambiguity codes in consensus",
 23 |     Option "g" ["gaps"]    (NoArg (\c -> return $ c { to_ambicode = toAmbicode }))
 24 |                            "Allow IUPAC codes and small letters for optional gaps",
 25 |     Option "o" ["output"]  (ReqArg (\fn c -> return $ c { output = if fn == "-" then putStr else writeFile fn }) "FILE")
 26 |                            "Write output to FILE instead of stdout",
 27 |     Option "h?" ["help","usage"] (NoArg usage)
 28 |                            "Print this helpful message" ]
 29 |   where
 30 |     usage _ = do pn <- getProgName  -- prints the usage text to stderr and exits successfully
 31 |                  let blah = "Usage: " ++ pn ++ " [options...] [fasta-file]\n\
 32 |                             \Reads a multi-FastA file, computes a consensus where \
 33 |                             \a given fraction of the sequences agree, \
 34 |                             \and writes it out in FastA format."
 35 |                  hPutStrLn stderr $ usageInfo blah options
 36 |                  exitSuccess
 37 | -- Entry point: parse options, read all inputs ("-" or no file argument at all means stdin), call the consensus column by column and emit it as FastA.
 38 | main :: IO ()
 39 | main = do
 40 |     (opts', files, errors) <- getOpt Permute options `fmap` getArgs
 41 |     unless (null errors) $ mapM_ (hPutStrLn stderr) errors >> exitFailure
 42 |     Opts{..} <- foldl (>>=) defaultOpts opts'
 43 |     -- Pipeline (read bottom-up): parse inputs, keep sequences only, transpose to columns, call each column, drop '-' calls, format, write.
 44 |     output . writeFasta . filter (/= '-') .
 45 |         map (call_cons percent to_ambicode . map toUpper) .
 46 |         transpose . map snd . concat =<<
 47 |         mapM (\fn -> readMFasta `fmap` if fn == "-" then getContents else readFile fn) (if null files then ["-"] else files)
 48 | 
 49 | -- Parse multi-FastA text into (header line, concatenated sequence) pairs; '\r' is stripped and lines before the first '>' header are ignored.
 50 | readMFasta :: String -> [(String, String)]
 51 | readMFasta = go . lines . filter (/= '\r')
 52 |   where
 53 |     go ls = case dropWhile (not . isHeader) ls of
 54 |                 [] -> []
 55 |                 (h:ls1) -> let (b,ls2) = span (not.isHeader) ls1  -- b: all lines up to the next header
 56 |                            in (h, concat b) : go ls2
 57 | -- Format a sequence as a FastA record: a ">consensus" header followed by lines of at most 60 characters.
 58 | writeFasta :: String -> String
 59 | writeFasta s = unlines $ ">consensus" : split 60 s
 60 |   where
 61 |     split n [] = []
 62 |     split n s' = case splitAt n s' of (l,r) -> l : split n r  -- emit one chunk of n characters, recurse on the rest
 63 | -- True exactly for lines whose first character is '>'.
 64 | isHeader :: String -> Bool
 65 | isHeader ('>':_) = True
 66 | isHeader _ = False
 67 | 
 68 | -- Consensus call for one alignment column (s) into an ambiguity code:
 69 | -- - count A,C,G,T,-; sort by prevalence; a column containing none of these is called 'N'
 70 | -- - take the more common symbols until their cumulative count reaches the fraction p of the counted symbols
 71 | -- - hand the sorted symbol set to to_ambicode (toAmbicode uses small case if the - is included)
 72 | call_cons :: Double -> (String -> Char) -> String -> Char
 73 | call_cons p to_ambicode s
 74 |     | total == 0 = 'N'  -- nothing countable: cum is never empty, so test the total instead of `null cum`
 75 |     | otherwise = case span ((< ceiling (p * total)) . fst) cum of
 76 |                     (l,r:_) -> collapse (r:l)  -- r is the first symbol that reaches the threshold
 77 |                     (l,[ ]) -> collapse l
 78 |   where
 79 |     total = fromIntegral $ fst $ last cum  -- last running sum = number of counted symbols
 80 |
 81 |     cum = scanl1 (\(a,_) (b,n) -> (a+b,n)) $  -- running sums, each tagged with the symbol added last
 82 |           sortBy (\(a,_) (b,_) -> compare b a) $  -- most frequent symbol first
 83 |           zip [a,c,g,t,z] "ACGT-"
 84 |
 85 |     (a,c,g,t,z,_) = count 0 0 0 0 0 0 s
 86 |
 87 |     collapse = to_ambicode . sort . map snd
 88 | -- Map a sorted set of symbols drawn from "-ACGT" to its IUPAC ambiguity code; an empty set gives '-'.
 89 | toAmbicode :: String -> Char
 90 | toAmbicode ""     = '-'
 91 | toAmbicode "A"    = 'A'
 92 | toAmbicode "AC"   = 'M'
 93 | toAmbicode "ACG"  = 'V'
 94 | toAmbicode "ACGT" = 'N'
 95 | toAmbicode "ACT"  = 'H'
 96 | toAmbicode "AG"   = 'R'
 97 | toAmbicode "AGT"  = 'D'
 98 | toAmbicode "AT"   = 'W'
 99 | toAmbicode "C"    = 'C'
100 | toAmbicode "CG"   = 'S'
101 | toAmbicode "CGT"  = 'B'
102 | toAmbicode "CT"   = 'Y'
103 | toAmbicode "G"    = 'G'
104 | toAmbicode "GT"   = 'K'
105 | toAmbicode "T"    = 'T'
106 | -- Sets that also contain the gap symbol map to the lower-case code ('-' sorts before the letters).
107 | toAmbicode "-"     = '-'
108 | toAmbicode "-A"    = 'a'
109 | toAmbicode "-AC"   = 'm'
110 | toAmbicode "-ACG"  = 'v'
111 | toAmbicode "-ACGT" = 'n'
112 | toAmbicode "-ACT"  = 'h'
113 | toAmbicode "-AG"   = 'r'
114 | toAmbicode "-AGT"  = 'd'
115 | toAmbicode "-AT"   = 'w'
116 | toAmbicode "-C"    = 'c'
117 | toAmbicode "-CG"   = 's'
118 | toAmbicode "-CGT"  = 'b'
119 | toAmbicode "-CT"   = 'y'
120 | toAmbicode "-G"    = 'g'
121 | toAmbicode "-GT"   = 'k'
122 | toAmbicode "-T"    = 't'
123 | -- Any other input (unsorted, duplicated or foreign symbols) aborts with an error.
124 | toAmbicode x = error $ "huh?  " ++ show x
125 | -- Collapse a symbol set without ambiguity codes: a single A/C/G/T/- is kept, the empty set gives '-', everything else gives 'N'.
126 | toNucleotide :: String -> Char
127 | toNucleotide [c] | c `elem` "ACGT-" = c
128 | toNucleotide [ ] = '-'
129 | toNucleotide  _  = 'N'
130 | 
131 | -- Tally a column with strict accumulators: numbers of 'A', 'C', 'G', 'T', '-' and, as sixth component, of all characters seen.
132 | count :: Int -> Int -> Int -> Int -> Int -> Int
133 |       -> String -> (Int,Int,Int,Int,Int,Int)
134 | count !a !c !g !t !z !n s = case s of
135 |     [      ] -> (a,c,g,t,z,n)
136 |     ('A':s') -> count (a+1) c g t z (n+1)  s'
137 |     ('C':s') -> count a (c+1) g t z (n+1)  s'
138 |     ('G':s') -> count a c (g+1) t z (n+1)  s'
139 |     ('T':s') -> count a c g (t+1) z (n+1)  s'
140 |     ('-':s') -> count a c g t (z+1) (n+1)  s'
141 |     ( _ :s') -> count a c g t z     (n+1)  s'  -- any other character only increases the overall count
142 | 
143 | 


--------------------------------------------------------------------------------
/src/Makefile.am:
--------------------------------------------------------------------------------
 1 | AM_CFLAGS = -O2
 2 | AM_CPPFLAGS = -DDATA_PATH=\"$(pkgdatadir)\"
 3 | 
 4 | bin_PROGRAMS = mia ma ccheck
 5 | 
 6 | mia_SOURCES = mia.c mia.h params.h types.h pssm.c pssm.h fsdb.h fsdb.c kmer.c kmer.h mia_main.c map_align.c map_align.h io.h io.c map_alignment.h map_alignment.c
 7 | 
 8 | ma_SOURCES = params.h types.h map_alignment.h map_alignment.c map_assembler.c io.h io.c map_align.h map_align.c
 9 | 
10 | ccheck_SOURCES = ccheck.cc myers_align.c fsdb.c io.c kmer.c map_align.c map_alignment.c mia.c pssm.c mt311.c \
11 | 		 map_align.h params.h types.h io.h map_alignment.h config.h mia.h fsdb.h pssm.h kmer.h myers_align.h
12 | 


--------------------------------------------------------------------------------
/src/fsdb.c:
--------------------------------------------------------------------------------
  1 | #include "fsdb.h"
  2 | 
  3 | 
  4 | /* fs_comp
  5 |    Args: (1) pointer to first FragSeqP
  6 |          (2) pointer to second FragSeqP
  7 |    Returns: -1 if the first FragSeq sorts before the second one,
  8 |             1 if it sorts after it and
  9 | 	   0 if they tie on strand, as, ae and score
 10 |    qsort() comparator (used by sort_fsdb) defining a sort order for
 11 |    FragSeqP's; useful for then determining which FragSeqP's are unique.
 12 | */
 13 | int fs_comp ( const void* fs1_,
 14 | 	      const void* fs2_ ) {
 15 |   FragSeqP* fs1p = (FragSeqP*) fs1_;
 16 |   FragSeqP* fs2p = (FragSeqP*) fs2_;
 17 |   FragSeqP fs1 = *fs1p;
 18 |   FragSeqP fs2 = *fs2p;
 19 | 
 20 |   /* First sort criterion is strand: rc entries sort before non-rc ones */
 21 |   if ( fs1->rc && !(fs2->rc) ) {
 22 |     return -1;
 23 |   }
 24 |   if ( !(fs1->rc) && fs2->rc ) {
 25 |     return 1;
 26 |   }
 27 | 
 28 |   /* Forward strand guys (both have rc == 0 at this point) */
 29 |   if ( fs1->rc == 0 ) {
 30 |     /* Second sort criterion is where they start,
 31 |        lower coordinates come first */
 32 |     if ( fs1->as < fs2->as ) {
 33 |       return -1;
 34 |     }
 35 |     if ( fs1->as > fs2->as ) {
 36 |       return 1;
 37 |     }
 38 |     /* Third sort criterion is where they end,
 39 |        lower coordinates come later */
 40 |     if ( fs1->ae < fs2->ae ) {
 41 |       return 1;
 42 |     }
 43 |     if ( fs1->ae > fs2->ae ) {
 44 |       return -1;
 45 |     }
 46 |     /* Fourth sort criterion is the score,
 47 |        lower score comes later */
 48 |     if ( fs1->score < fs2->score ) {
 49 |       return 1;
 50 |     }
 51 |     if ( fs1->score > fs2->score ) {
 52 |       return -1;
 53 |     }
 54 | 
 55 |     /* If they match on all that, they are the same */
 56 |     return 0;
 57 |   }
 58 | 
 59 |   /* Reverse strand guys (both have rc set) */
 60 |   else {
 61 |     /* Sort by end (start of molecule), higher coordinates first */
 62 |     if ( fs1->ae < fs2->ae ) {
 63 |       return 1;
 64 |     }
 65 |     if ( fs1->ae > fs2->ae ) {
 66 |       return -1;
 67 |     }
 68 | 
 69 |     /* Sort by start (end of molecule), higher coordinates later */
 70 |     if ( fs1->as < fs2->as ) {
 71 |       return -1;
 72 |     }
 73 |     if ( fs1->as > fs2->as ) {
 74 |       return 1;
 75 |     }
 76 | 
 77 |     /* Sort by score, lower scores come later */
 78 |     if ( fs1->score < fs2->score ) {
 79 |       return 1;
 80 |     }
 81 |     if ( fs1->score > fs2->score ) {
 82 |       return -1;
 83 |     }
 84 | 
 85 |     /* If all that matches, they are sorted the same */
 86 |     return 0;
 87 |   }
 88 | }
 89 | 
 90 | /* fs_comp_qscore
 91 |    Args: (1) pointer to first FragSeqP
 92 |          (2) pointer to second FragSeqP
 93 |    Returns: -1 if the first FragSeq sorts before the second one,
 94 |             1 if it sorts after it and
 95 | 	   0 if they tie on strand, as, ae and qual_sum
 96 |    qsort() comparator (used by sort_fsdb_qscore); same order as fs_comp
 97 |    except that the final tie-break is qual_sum instead of score.
 98 | */
 99 | int fs_comp_qscore ( const void* fs1_,
100 | 		     const void* fs2_ ) {
101 |   FragSeqP* fs1p = (FragSeqP*) fs1_;
102 |   FragSeqP* fs2p = (FragSeqP*) fs2_;
103 |   FragSeqP fs1 = *fs1p;
104 |   FragSeqP fs2 = *fs2p;
105 | 
106 |   /* First sort criterion is strand: rc entries sort before non-rc ones */
107 |   if ( fs1->rc && !(fs2->rc) ) {
108 |     return -1;
109 |   }
110 |   if ( !(fs1->rc) && fs2->rc ) {
111 |     return 1;
112 |   }
113 | 
114 |   /* Forward strand guys (both have rc == 0 at this point) */
115 |   if ( fs1->rc == 0 ) {
116 |     /* Second sort criterion is where they start,
117 |        lower coordinates come first */
118 |     if ( fs1->as < fs2->as ) {
119 |       return -1;
120 |     }
121 |     if ( fs1->as > fs2->as ) {
122 |       return 1;
123 |     }
124 |     /* Third sort criterion is where they end,
125 |        lower coordinates come later */
126 |     if ( fs1->ae < fs2->ae ) {
127 |       return 1;
128 |     }
129 |     if ( fs1->ae > fs2->ae ) {
130 |       return -1;
131 |     }
132 |     /* Fourth sort criterion is the sum of quality scores
133 |        in fs->qual_sum, lower score comes later */
134 |     if ( fs1->qual_sum < fs2->qual_sum ) {
135 |       return 1;
136 |     }
137 |     if ( fs1->qual_sum > fs2->qual_sum ) {
138 |       return -1;
139 |     }
140 | 
141 |     /* If they match on all that, they are the same */
142 |     return 0;
143 |   }
144 | 
145 |   /* Reverse strand guys (both have rc set) */
146 |   else {
147 |     /* Sort by end (start of molecule), higher coordinates first */
148 |     if ( fs1->ae < fs2->ae ) {
149 |       return 1;
150 |     }
151 |     if ( fs1->ae > fs2->ae ) {
152 |       return -1;
153 |     }
154 | 
155 |     /* Sort by start (end of molecule), higher coordinates later */
156 |     if ( fs1->as < fs2->as ) {
157 |       return -1;
158 |     }
159 |     if ( fs1->as > fs2->as ) {
160 |       return 1;
161 |     }
162 |     /* Fourth sort criterion is the sum of quality scores
163 |        in fs->qual_sum, lower score comes later */
164 |     if ( fs1->qual_sum < fs2->qual_sum ) {
165 |       return 1;
166 |     }
167 |     if ( fs1->qual_sum > fs2->qual_sum ) {
168 |       return -1;
169 |     }
170 | 
171 |     /* If all that matches, they are sorted the same */
172 |     return 0;
173 |   }
174 | }
175 | 
176 | 
177 | 
178 | /* add_virgin_fs2fsdb
179 |    Args: (1) FragSeqP fs - pointer to a "virgin" FragSeq
180 |          (2) FSDB fsdb - database to add this FragSeq to
181 |    Returns: the result of add_fs2fsdb; 1 if success, 0 if failure
182 |    (not enough memory)
183 |    The argument FragSeqP points to a FragSeq for which the following
184 |    is true: id, desc, as, ae, score, front_asp, back_asp,
185 |    unique, and num_inputs are set to correct values.
186 |    If trimmed is true, then seq and qual are cut in place right after
187 |    trim_point and seq_len becomes trim_point + 1
188 |    If rc and strand_known are both set, then seq is reverse
189 |    complemented in place and qual is reversed in place
190 |    Once these operations are done, this "non-virgin" FragSeq
191 |    is handed to add_fs2fsdb, which is expected to copy it into the
192 |    next slot of fsdb (growing fsdb if necessary)
193 | */
194 | int add_virgin_fs2fsdb( FragSeqP fs, FSDB fsdb ) {
195 |   int i, len, half_len;
196 |   char tmp_b, tmp_q;
197 | 
198 |   /* Trim it? Terminate both strings right after trim_point */
199 |   if ( fs->trimmed ) {
200 |     fs->seq[fs->trim_point + 1] = '\0';
201 |     fs->qual[fs->trim_point + 1] = '\0';
202 |     fs->seq_len = fs->trim_point + 1;
203 |   }
204 | 
205 |   
206 |   /* revcom it if it's a revcom alignment and
207 |      we know the strand
208 |   */
209 |   if ( fs->rc &&
210 |        fs->strand_known ) {
211 |     len = fs->seq_len;
212 |     half_len = len / 2;
213 |     for ( i = 0; i < half_len; i++ ) {
214 |       tmp_b = fs->seq[i];
215 |       tmp_q = fs->qual[i];
216 |       fs->seq[i] = revcom_char(fs->seq[len-(i+1)]);
217 |       fs->seq[len-(i+1)] = revcom_char(tmp_b);
218 |       fs->qual[i] = fs->qual[len-(i+1)];
219 |       fs->qual[len-(i+1)] = tmp_q;
220 |     }
221 |     if ( len%2 == 1 ) {
222 |       /* Sequence length was odd, revcom the middle base;
223 | 	 No need to adjust the quality score
224 |        */
225 |       fs->seq[half_len] = revcom_char(fs->seq[half_len]);
226 |     }
227 |   }
228 | 
229 |   /* OK, now copy it over to fsdb */
230 |   return ( add_fs2fsdb( fs, fsdb ) );
231 | }
232 | 
233 | 
234 | 
235 | /* Sorts the fsdb->fss (num_fss entries) with fs_comp, i.e. on rc, as, ae, score
236 |    After sorting all rc (reverse strand) alignments are first
237 |    Forward ones are sorted by as, then ae; reverse ones by ae, then as;
238 |    among otherwise equal entries the highest scoring guys are first
239 | */
240 | void sort_fsdb( FSDB fsdb ) {
241 |   qsort( (void*)fsdb->fss, (size_t)fsdb->num_fss,
242 | 	 sizeof(FragSeqP), fs_comp );
243 | }
244 | 
245 | /* Sorts the fsdb->fss (num_fss entries) with fs_comp_qscore, i.e. on rc, as, ae, qual_sum
246 |    After sorting all rc (reverse strand) alignments are first
247 |    Forward ones are sorted by as, then ae; reverse ones by ae, then as;
248 |    among otherwise equal entries the highest qual_sum guys are first
249 | */
250 | void sort_fsdb_qscore( FSDB fsdb ) {
251 |   qsort( (void*)fsdb->fss, (size_t)fsdb->num_fss,
252 | 	 sizeof(FragSeqP), fs_comp_qscore );
253 | }
254 | 
255 | 
256 | /* find_fsdb_score_cut
257 |    Args: (1) FSDB fsdb - has valid data for seq_len, score, and
258 |              unique_best
259 |      (2) double* slope - pointer to slope to be calculated
260 |      (3) double* intercept - pointer to intercept to be calc.
261 |    Returns: void
262 |    Takes all the sequence lengths and scores in a FSDB database.
263 |    Calculates the best fit line through the data:
264 |    score = (slope * seq_len) + intercept
265 |    That is, is determines the dependency of average score on the
266 |    length of the sequence. This can then be used to determine
267 |    what is an inappropriately scoring (for its length) alignment.
268 | */
269 | void find_fsdb_score_cut( FSDB fsdb, double* slope, double* intercept ) {
270 |   /* Load up the lengths and scores of all unique guys that
271 |      had sensible scores as defined by FIRST_ROUND_SCORE_CUTOFF
272 |      This is necessary in case the distant reference option is
273 |      used in which case we may have some total crap sequences
274 |      and scores that will screw up the fit.  (This exact same loop is
275 |      repeated twice (three times in debug mode), the single pass
276 |      algorithm would be numerically unstable.)
277 |   */
278 |   double slope_bf = 0, intercept_bf = 0; 
279 |   //  double slope_max = 0, intercept_max = 0;
280 |   double slope_delta, max_slope_delta;
281 |   int max_sc_len[INIT_ALN_SEQ_LEN+1]; // place to put maximum 
282 |   // scores at each length
283 |   size_t j = 0, i ;
284 |   FILE* LVSLOG;
285 |   /* Initialization */
286 |   double xbar = 0, ybar = 0 ;
287 |   for ( i = 0; i < (INIT_ALN_SEQ_LEN+1); i++ ) {
288 |     max_sc_len[i] = 0;
289 |   }
290 | 
291 |   for ( i = 0; i < fsdb->num_fss; i++ ) {
292 |     if ( fsdb->fss[i]->unique_best &&
293 | 	(fsdb->fss[i]->score >= FIRST_ROUND_SCORE_CUTOFF) ) {
294 |       xbar += fsdb->fss[i]->seq_len;
295 |       ybar += fsdb->fss[i]->score;
296 |       j++;
297 |       /* Is this the best score for this length? */
298 |       if ( fsdb->fss[i]->score > max_sc_len[fsdb->fss[i]->seq_len] ) {
299 |         max_sc_len[fsdb->fss[i]->seq_len] = fsdb->fss[i]->score;
300 |       }
301 |     }
302 |   }
303 |   xbar /= j ;
304 |   ybar /= j ;
305 | 
306 |   double ssxy = 0, ssxx = 0 ;
307 |   for ( i = 0; i < fsdb->num_fss; i++ ) {
308 |     if ( fsdb->fss[i]->unique_best &&
309 | 	(fsdb->fss[i]->score >= FIRST_ROUND_SCORE_CUTOFF) ) {
310 |       ssxy += (fsdb->fss[i]->seq_len - xbar) * (fsdb->fss[i]->score - ybar) ;
311 |       ssxx += (fsdb->fss[i]->seq_len - xbar) * (fsdb->fss[i]->seq_len - xbar) ;
312 |     }
313 |   }
314 |   slope_bf     = ssxy / ssxx ;
315 |   intercept_bf = ybar - slope_bf * xbar ;
316 | 
317 | 
318 |   /* Now find the slope_max and intercept_max */
319 |   /*  xbar = 0;
320 |   ybar = 0;
321 |   j = 0;
322 |   for ( i = 0; i < (INIT_ALN_SEQ_LEN+1); i++ ) {
323 |     if ( max_sc_len[i] > 0 ) {
324 |       xbar += i;
325 |       ybar += max_sc_len[i];
326 |       j++;
327 |     }
328 |   }
329 |   xbar /= j;
330 |   ybar /= j;
331 |   
332 |   ssxy = 0;
333 |   ssxx = 0;
334 |   for ( i = 0; i < (INIT_ALN_SEQ_LEN+1); i++ ) {
335 |     if ( max_sc_len[i] > 0 ) {
336 |       ssxy += (i - xbar) * (max_sc_len[i] - ybar);
337 |       ssxx += (i - xbar) * (i - xbar);
338 |     }
339 |   }
340 |   slope_max = (ssxy / ssxx);
341 |   intercept_max = ybar - slope_max * xbar;
342 |   */
343 | 
344 |   max_slope_delta = 0;
345 |   for ( i = 0; i < fsdb->num_fss; i++ ) {
346 |     if ( fsdb->fss[i]->unique_best &&
347 | 	 (fsdb->fss[i]->score >= FIRST_ROUND_SCORE_CUTOFF) ) {
348 |       slope_delta = ( fsdb->fss[i]->score - 
349 | 		      ((slope_bf * fsdb->fss[i]->seq_len) + 
350 | 		       intercept_bf) ) 
351 | 	/ 
352 | 	fsdb->fss[i]->seq_len;
353 |       if ( slope_delta > max_slope_delta ) {
354 | 	max_slope_delta = slope_delta;
355 |       }
356 |     }
357 |   }
358 | 
359 |   *intercept = intercept_bf;
360 |   /* Make sure slope is sane (positive) */
361 |   if ( (slope_bf - max_slope_delta) > 0 ) {
362 |     *slope     = slope_bf - (max_slope_delta * 2.0);
363 |   }
364 |   else {
365 |     *slope = (double)(slope_bf * (SCORE_CUTOFF_BUFFER/100.0));
366 |   }
367 | 
368 |   if (DEBUG) {
369 |     LVSLOG = fileOpen( "LENvSCORE.dat", "w" );
370 |     fprintf( LVSLOG,
371 | 	"# Just calculated length-score best-fit line:\n" );
372 |     fprintf( LVSLOG,
373 | 	"# score = %0.4f + (length x %0.4f)\n",
374 | 	*intercept, *slope );
375 |     for ( i = 0; i < fsdb->num_fss; i++ ) {
376 |       if ( fsdb->fss[i]->unique_best &&
377 | 	  (fsdb->fss[i]->score >= FIRST_ROUND_SCORE_CUTOFF) ) {
378 | 	fprintf( LVSLOG, "%d\t%d\n", fsdb->fss[i]->seq_len, fsdb->fss[i]->score );
379 |       }
380 |     }
381 |     fclose( LVSLOG );
382 |   }
383 | }
384 | 
385 | /* write_fastq
386 |    Args: (1) char* fn
387 |          (2) FSDB fsdb
388 |    Returns: void
389 |    Writes a fastq database of sequences to the filename given of all
390 |    sequences and quality scores in the fsdb
391 | */
392 | void write_fastq( char* fn, FSDB fsdb ) {
393 |   FILE* f;
394 |   char rc, tr;
395 |   FragSeqP fs;
396 |   f = fileOpen( fn, "w" );
397 |   size_t i;
398 |   for ( i = 0; i < fsdb->num_fss; i++ ) {
399 |     fs = fsdb->fss[i];
400 |     if (fs->rc) {
401 |       rc = 'R';
402 |     }
403 |     else {
404 |       rc = 'F';
405 |     }
406 |     if ( fs->trimmed ) {
407 |       tr = 'T';
408 |     }
409 |     else {
410 |       tr = 'U';
411 |     }
412 |     fprintf( f, "@%s %c %c\n", fs->id, rc, tr );
413 |     fprintf( f, "%s\n", fs->seq );
414 |     fprintf( f, "+%s\n", fs->id );
415 |     fprintf( f, "%s\n", fs->qual );
416 |   }
417 |   fclose( f );
418 |   return;
419 | }
420 | 
421 | /* set_uniq_in_fsdb
422 |    Args: (1) FSDB fsdb - has fss field SORTED!
423 |          (2) int just_outer_coords - boolean; TRUE means just use
424 | 	     outer coordinate info (strand, start, end) to
425 | 	     decide about uniqueness; FALSE is a more complex
426 | 	     scheme. If a sequence is has the same start, but a
427 | 	     lower end point, it is not unique unless it is also
428 | 	     trimmed. This is to handle 454 data where occassionally
429 | 	     sequences "end" because of some filter, but not the
430 | 	     natural end of the molecule. Then, repeats can show up
431 | 	     in different lengths!
432 |          (3) unsigned short tolerance - allow this much tolerance for start and end coordinates. 
433 |  *           Due to oversequencing, PCR can result in redundant reads that differ only a few bases. 
434 |    Returns: void
435 |    Goes through each sequence and the sets the unique_best flag
436 |    to true for the first of each kind (same as, ae, and rc) and
437 |    sets unique_best to false for all others
438 | */
439 | 
440 | void set_uniq_in_fsdb( FSDB fsdb, const int just_outer_coords , const unsigned short tolerance) {
441 |   int i, curr_rc, curr_as, curr_ae;
442 |   FragSeqP fs;
443 |   fs = fsdb->fss[0];
444 |   /* initialize */
445 |   curr_rc = fs->rc;
446 |   curr_as = fs->as;
447 |   curr_ae = fs->ae;
448 |   fs->unique_best = 1;
449 |   for ( i = 1; i < fsdb->num_fss; i++ ) {
450 |     fs = fsdb->fss[i];
451 | 
452 |     /* If new guy is same as last guy, on strand, start, and end,
453 |        he's redundant (not unique) */
454 |     if ( (fs->rc == curr_rc) &&
455 | 	 (abs(fs->as - curr_as) <= tolerance) &&
456 | 	 (abs(fs->ae - curr_ae) <= tolerance)) {
457 |       fs->unique_best = 0;
458 |     }
459 | 
460 |     else {
461 |       if ( just_outer_coords ) {
462 | 	fs->unique_best = 1;
463 |       }
464 |       else { // can still be the weird thing where it ends before
465 | 	// the previous guy and is untrimmed
466 | 	/* forward strand */
467 | 	if ( fs->rc == 0 ) {
468 | 	  /* Can still be redundant if it's untrimmed */
469 | 	  if ( fs->as == curr_as ) {
470 | 	    if ( fs->trimmed ) {
471 | 	      /* strand and start match, end does not and it's trimmed:
472 | 		 therefore, it's a unique best */
473 | 	      fs->unique_best = 1;
474 | 	    }
475 | 	    else {
476 | 	      fs->unique_best = 0;
477 | 	    }
478 | 	  }
479 | 	  else {
480 | 	    fs->unique_best = 1;
481 | 	  }
482 | 	}
483 | 	
484 | 	/* reverse strand */
485 | 	else {
486 | 	  if ( fs->ae == curr_ae ) {
487 | 	    if ( fs->trimmed ) {
488 | 	      /* strand and end (beginning of rc molecule, dummy) match
489 | 		 but start (end, really) and it's trimmed, therefore
490 | 		 it's a unique best */
491 | 	      fs->unique_best = 1;
492 | 	    }
493 | 	    else {
494 | 	      fs->unique_best = 0;
495 | 	    }
496 | 	  }
497 | 	  
498 | 	  else {
499 | 	    fs->unique_best = 1;
500 | 	  }
501 | 	}
502 |       }
503 |       curr_rc = fs->rc;
504 |       curr_as = fs->as;
505 |       curr_ae = fs->ae;
506 |     }
507 |   }
508 | }
509 | 
510 | /* asp_len
511 |    Args: (1) AlnSeqP asp - pointer to an AlnSeq
512 |    Returns (1) int - total length of sequence 
513 |    This function finds the total length of the sequence in this
514 |    aligned sequence fragment. This is the sum of the sequence
515 |    in the asp->seq field and all of the inserted sequence (if any)
516 |    in the asp->ins array
517 | */
518 | int asp_len( AlnSeqP asp ) {
519 |   int i;
520 |   int aln_seq_len = 0;
521 |   int tot_seq_len = 0;
522 |   aln_seq_len = (asp->end - asp->start + 1);
523 |   tot_seq_len = (asp->end - asp->start + 1);
524 |   for( i = 0; i < aln_seq_len; i++ ) {
525 |     if ( asp->ins[i] != NULL ) {
526 |       tot_seq_len += strlen( asp->ins[i] );
527 |     }
528 |   }
529 |   return tot_seq_len;
530 | }
531 | 
532 | 
533 | 
534 | /* pop_smp_from_FSDB
535 |    Args: (1) FSDB fsdb with valid data
536 |          (2) Depth of PSSM matrices
537 |    Returns: void
538 |    Goes through all the AlnSeqs in the fsdb->fss array. Follows
539 |    the front_asp (and back_asp, if necessary) pointer to populate
540 |    the smp field of all AlnSeqs with the correct code for what
541 |    depth in the PSSM matrix to use for consensus calling */
542 | void pop_smp_from_FSDB( FSDB fsdb, int depth ) {
543 |   int i, aln_seq_pos, front_seq_len, back_seq_len,
544 |     distance_from_front,
545 |     distance_from_back,
546 |     aln_seq_len;
547 |   int act_seq_pos = 0;
548 |   AlnSeqP front_asp, back_asp;
549 | 
550 |   for ( i = 0; i < fsdb->num_fss; i++ ) {
551 |     front_asp = fsdb->fss[i]->front_asp;
552 |     back_asp  = fsdb->fss[i]->back_asp;
553 |     act_seq_pos = 0;
554 |     front_seq_len = asp_len( front_asp );
555 |     if ( back_asp != NULL ) {
556 |       back_seq_len  = asp_len( back_asp );
557 |     }
558 |     else {
559 |       back_seq_len = 0;
560 |     }
561 | 
562 |     /* First, fill in the front_asp->smp array */
563 |     aln_seq_len = front_asp->end - front_asp->start + 1;
564 |     for( aln_seq_pos = 0; aln_seq_pos < aln_seq_len; aln_seq_pos++ ) {
565 |       if ( front_asp->ins[aln_seq_pos] != NULL ) {
566 | 	act_seq_pos += strlen( front_asp->ins[aln_seq_pos] );
567 |       }
568 |       distance_from_front = act_seq_pos;
569 |       distance_from_back  = (front_seq_len + back_seq_len) -
570 | 	act_seq_pos - 1;
571 | 
572 |       if ( distance_from_front <= depth ) {
573 | 	front_asp->smp[aln_seq_pos] = ('A'+distance_from_front);
574 |       }
575 |       else {
576 | 	if ( distance_from_back < depth ) {
577 | 	  front_asp->smp[aln_seq_pos] = ('A'+(depth*2)-distance_from_back);
578 | 	}
579 | 	else {
580 | 	  front_asp->smp[aln_seq_pos] = ('A' + depth);
581 | 	}
582 |       }
583 | 
584 |       if ( front_asp->seq[aln_seq_pos] != '-' ) {
585 | 	act_seq_pos++;
586 |       }
587 |     }
588 |     front_asp->smp[aln_seq_pos] = '\0';
589 | 
590 |     /* Then, fill in the back_asp->smp array */
591 |     if ( back_asp != NULL ) {
592 |       aln_seq_len = back_asp->end - back_asp->start + 1;
593 |       for( aln_seq_pos = 0; aln_seq_pos < aln_seq_len; aln_seq_pos++ ) {
594 | 	if ( back_asp->ins[aln_seq_pos] != NULL ) {
595 | 	  act_seq_pos += strlen( back_asp->ins[aln_seq_pos] );
596 | 	}
597 | 	distance_from_front = (front_seq_len + act_seq_pos);
598 | 	distance_from_back = (front_seq_len + back_seq_len) -
599 | 	  act_seq_pos - 1;
600 | 	if ( distance_from_front <= depth ) {
601 | 	  back_asp->smp[aln_seq_pos] = ('A' + distance_from_front);
602 | 	}
603 | 	else {
604 | 	  if ( distance_from_back < depth ) {
605 | 	    back_asp->smp[aln_seq_pos] = ('A'+(depth*2)-distance_from_back);
606 | 	  }
607 | 	  else {
608 | 	    back_asp->smp[aln_seq_pos] = ('A' + depth);
609 | 	  }
610 | 	}
611 | 
612 | 	if ( back_asp->seq[aln_seq_pos] != '-' ) {
613 | 	  act_seq_pos++;
614 | 	}
615 |       }
616 |       back_asp->smp[aln_seq_pos] = '\0';
617 |     }
618 |   }
619 | }
620 | 
621 | 
622 | /* add_fs2fsdb
623 |    Args: (1) FragSeqP fs - pointer to a fully valid FragSeq
624 |          (2) FSdb fsdb - database to add this FragSeq to
625 |    Returns: 1 if success; 0 if failure (not enough memories)
626 |    Adds the FragSeq pointed to by fs to the fsdb database,
627 |    growing it if necessary */
628 | int add_fs2fsdb( FragSeqP fs, FSDB fsdb ) {
629 |   FragSeqP next_fs;
630 | 
631 |   /* First, check if fsdb need to grow */
632 |   if ( fsdb->num_fss == fsdb->size ) {
633 |     if ( grow_FSDB( fsdb ) == 0 ) {
634 |       return 0;
635 |     }
636 |   }
637 | 
638 |   /* Get a pointer to the next available FragSeq */
639 |   next_fs = fsdb->fss[fsdb->num_fss];
640 | 
641 |   /* Copy over the input fs into next_fs */
642 |   strcpy( next_fs->id, fs->id );
643 |   strcpy( next_fs->desc, fs->desc );
644 |   strcpy( next_fs->seq, fs->seq );
645 |   strcpy( next_fs->qual, fs->qual );
646 |   next_fs->qual_sum   = fs->qual_sum;
647 |   next_fs->trim_point = fs->trim_point;
648 |   next_fs->trimmed    = fs->trimmed;
649 |   next_fs->seq_len    = fs->seq_len;
650 |   next_fs->strand_known = fs->strand_known;
651 |   next_fs->rc         = fs->rc;
652 |   next_fs->as         = fs->as;
653 |   next_fs->ae         = fs->ae;
654 |   next_fs->score      = fs->score;
655 |   next_fs->front_asp  = fs->front_asp;
656 |   next_fs->back_asp   = fs->back_asp;
657 |   next_fs->unique_best = fs->unique_best;
658 |   next_fs->num_inputs  = fs->num_inputs;
659 |   next_fs->qss = NULL;
660 |   /* Bump up the num_fss */
661 |   fsdb->num_fss += 1;
662 |   return 1;
663 | }
664 | 
665 | /* grow_FSDB
666 |    Args: (1) FSDB (fsdb) to be made twice as big
667 |    Returns: 1 if success; 0 if failure (not enough memories)
668 |    Grows an FSDB by allocating another chunk of memory for
669 |    the FragSeqs as big as the one it already has. Note, it
670 |    *DOES NOT* throw away the one it already has. Then, the
671 |    fsdb->fss array is replaced by one twice as big. The
672 |    pointers to all the existing FragSeqs are copied over
673 |    and the new ones are set up. The size is reset, too.
674 |    The old fsdb->fss array is freed
675 | */
676 | int grow_FSDB( FSDB fsdb ) {
677 |   int i, j, new_size;
678 |   FragSeqP first_seq;
679 |   FragSeqP* new_fss;
680 | 
681 |   new_size = fsdb->size * 2;
682 | 
683 |   /* DEBUG INFO */
684 |   if ( DEBUG ) {
685 |     fprintf( stderr, "Growing fsdb from %d to %d\n",
686 | 	     (int)fsdb->size, new_size );
687 |   }
688 | 
689 |   /* Allocate another chunck of memories as big as the
690 |      one it has now, doubling its size */
691 |   first_seq = (FragSeqP)save_malloc(fsdb->size *
692 | 			       sizeof(FragSeq));
693 |   if ( first_seq == NULL ) {
694 |     return 0;
695 |   }
696 | 
697 |   /* Now, allocate the *new* array of pointers for fsdb->fss
698 |      But, assign this to new_fss for now because we need to
699 |      keep fsdb->fss so we can copy over the pointers it already
700 |      has!
701 |   */
702 |   new_fss = (FragSeqP*)save_malloc(new_size * sizeof(FragSeqP));
703 |   if ( new_fss == NULL ) {
704 |     return 0;
705 |   }
706 | 
707 |   /* Point the pointers to the pointees */
708 |   for( i = 0; i < fsdb->size; i++ ) {
709 |     new_fss[i] = fsdb->fss[i];
710 |   }
711 |   j = 0;
712 |   for( i = fsdb->size; i < new_size; i++ ) {
713 |     new_fss[i] = &first_seq[j++];
714 |   }
715 | 
716 |   /* Now, free the old fsdb->fss and slot in the new one */
717 |   free( fsdb->fss );
718 |   fsdb->fss  = new_fss;
719 |   fsdb->size = new_size;
720 |   return 1;
721 | }
722 | 
723 | /* init_FSDB
724 |    Arguments: void
725 |    Returns: FSDB (pointer to struct fragseqdb) / NULL if
726 |     not enough memories
727 |    Used for initializing a new database of FragSeqs. Allocates
728 |    enough memoery for INIT_NUM_ALN_SEQS of these
729 | */
730 | FSDB init_FSDB ( void ) {
731 |   int i;
732 |   FSDB fsdb;
733 |   FragSeqP first_seq;
734 | 
735 |   /* First, allocate the memories */
736 |   fsdb = (FSDB)save_malloc(sizeof(FragSeqDB));
737 |   if ( fsdb == NULL ) {
738 |     return NULL;
739 |   }
740 | 
741 |   first_seq = (FragSeqP)save_malloc(INIT_NUM_ALN_SEQS *
742 | 			       sizeof(FragSeq));
743 |   if ( first_seq == NULL ) {
744 |     return NULL;
745 |   }
746 | 
747 |   fsdb->fss = (FragSeqP*)save_malloc(INIT_NUM_ALN_SEQS *
748 | 				sizeof( FragSeqP ));
749 |   if ( fsdb->fss == NULL ) {
750 |     return NULL;
751 |   }
752 | 
753 |   for ( i = 0; i < INIT_NUM_ALN_SEQS; i++ ) {
754 |     fsdb->fss[i] = &first_seq[i];
755 |   }
756 | 
757 |   fsdb->size = INIT_NUM_ALN_SEQS;
758 |   fsdb->num_fss = 0;
759 | 
760 |   return fsdb;
761 | }
762 | 
763 | 


--------------------------------------------------------------------------------
/src/fsdb.h:
--------------------------------------------------------------------------------
  1 | /* 
  2 |  * File:   fsdb.h
  3 |  * Author: TCO
  4 |  *
  5 |  * Created on 3. Februar 2009, 13:08
  6 |  */
  7 | 
  8 | #ifndef _FSDB_H
  9 | #define	_FSDB_H
 10 | 
 11 | #include "types.h"
 12 | #include "params.h"
 13 | #include "stdio.h"
 14 | #include "stdlib.h"
 15 | #include "io.h"
 16 | 
 17 | #ifdef	__cplusplus
 18 | extern "C" {
 19 | #endif
 20 | 
 21 | 
 22 | 
 23 | /* fs_comp
 24 |    Args: (1) pointer to first FragSeqP
 25 |          (2) pointer to second FragSeqP
 26 |    Returns: 1 if the first FragSeq comes before the second one,
 27 |            -1 if it comes after and
 28 | 	   0 if they come at the same time
 29 |    This function defined a sort order for FragSeqP's. This order
 30 |    is useful for then determining which FragSeqP's are unique.
 31 | */
 32 |   int fs_comp ( const void* fs1_,
 33 | 		const void* fs2_ ) ;
 34 | 
 35 |   int fs_comp_qscore ( const void* fs1_,
 36 | 		       const void* fs2_ );
 37 | /* add_virgin_fs2fsdb
 38 |    Args: (1) FragSeqP fs - pointer to a "virgin" FragSeq
 39 |          (2) FSDB fsdb - database to add this FragSeq to
 40 |    Returns: 1 if success; 0 if failure (not enough memories)
 41 |    This function is only called from sg_align; the argument
 42 |    FragSeqP points to a FragSeq for which the following is
 43 |    true: id, desc, as, ae, score, front_asp, back_asp and
 44 |    unique are set to correct values.
 45 |    If trimmed is true, then this sequence is to be trimmed
 46 |    to the trim_point
 47 |    If rc is set, then this sequence is to be reverse
 48 |    complemented
 49 |    Once these operations are done, this "non-virgin" FragSeq
 50 |    is then copied into the next slot of fsdb, growing fsdb
 51 |    if necessary, and incrementing its fsdb->num_fss
 52 | */
 53 |   int add_virgin_fs2fsdb( FragSeqP fs, FSDB fsdb ) ;
 54 | 
 55 | /* Sorts the fsdb->fss on rc, as, ae, score
 56 |    After sorting all 1 strand alignments are first
 57 |    These are sorted by as, then ae, with the highest
 58 |    scoring guys first and then lower scoring guys
 59 | */
 60 |   void sort_fsdb( FSDB fsdb ) ;
 61 |   void sort_fsdb_qscore( FSDB fsdb );
 62 | 
 63 | 
 64 | /* find_fsdb_score_cut
 65 |    Args: (1) FSDB fsdb - has valid data for seq_len, score, and
 66 |              unique_best
 67 |      (2) double* slope - pointer to slope to be calculated
 68 |      (3) double* intercept - pointer to intercept to be calc.
 69 |    Returns: void
 70 |    Takes all the sequence lengths and scores in a FSDB database.
 71 |    Calculates the best fit line through the data:
 72 |    score = (slope * seq_len) + intercept
 73 |    That is, it determines the dependency of average score on the
 74 |    length of the sequence. This can then be used to determine
 75 |    what is an inappropriately scoring (for its length) alignment.
 76 | */
 77 |   void find_fsdb_score_cut( FSDB fsdb, double* slope, double* intercept ) ;
 78 |   
 79 | /* write_fastq
 80 |    Args: (1) char* fn
 81 |          (2) FSDB fsdb
 82 |    Returns: void
 83 |    Writes a fastq database of sequences to the filename given of all
 84 |    sequences and quality scores in the fsdb
 85 | */
 86 |   void write_fastq( char* fn, FSDB fsdb );
 87 | 
 88 | 
 89 | /* set_uniq_in_fsdb
 90 |    Args: (1) FSDB fsdb - has fss field SORTED!
 91 |          (2) int just_outer_coords - boolean; TRUE means just use
 92 | 	     outer coordinate info (strand, start, end) to
 93 | 	     decide about uniqueness; FALSE is a more complex
 94 | 	     scheme. If a sequence is has the same start, but a
 95 | 	     lower end point, it is not unique unless it is also
 96 | 	     trimmed. This is to handle 454 data where occassionally
 97 | 	     sequences "end" because of some filter, but not the
 98 | 	     natural end of the molecule. Then, repeats can show up
 99 | 	     in different lengths!
100 |          (3) unsigned short tolerance - allow this much tolerance for start and end coordinates. 
101 |              Due to oversequencing, PCR can result in redundant reads that differ only a few bases. 
102 |    Returns: void
103 |    Goes through each sequence and the sets the unique_best flag
104 |    to true for the first of each kind (same as, ae, and rc) and
105 |    sets unique_best to false for all others
106 | */
107 | 
108 |   void set_uniq_in_fsdb( FSDB fsdb, const int just_outer_coords, const unsigned short tolerance) ;
109 | 
110 | /* pop_smp_from_FSDB
111 |    Args: (1) FSDB fsdb with valid data
112 |          (2) Depth of PSSM matrices
113 |    Returns: void
114 |    Goes through all the AlnSeqs in the fsdb->fss array. Follows
115 |    the front_asp (and back_asp, if necessary) pointer to populate
116 |    the smp field of all AlnSeqs with the correct code for what
117 |    depth in the PSSM matrix to use for consensus calling */
118 | void pop_smp_from_FSDB( FSDB fsdb, int depth ) ;
119 | 
120 | /* add_fs2fsdb
121 |    Args: (1) FragSeqP fs - pointer to a fully valid FragSeq
122 |          (2) FSdb fsdb - database to add this FragSeq to
123 |    Returns: 1 if success; 0 if failure (not enough memories)
124 |    Adds the FragSeq pointed to by fs to the fsdb database,
125 |    growing it if necessary */
126 | int add_fs2fsdb( FragSeqP fs, FSDB fsdb ) ;
127 | 
128 | /* grow_FSDB
129 |    Args: (1) FSDB (fsdb) to be made twice as big
130 |    Returns: 1 if success; 0 if failure (not enough memories)
131 |    Grows an FSDB by allocating another chunk of memory for
132 |    the FragSeqs as big as the one it already has. Note, it
133 |    *DOES NOT* throw away the one it already has. Then, the
134 |    fsdb->fss array is replaced by one twice as big. The
135 |    pointers to all the existing FragSeqs are copied over
136 |    and the new ones are set up. The size is reset, too.
137 |    The old fsdb->fss array is freed
138 | */
139 | int grow_FSDB( FSDB fsdb ) ;
140 | 
141 | /* init_FSDB
142 |    Arguments: void
143 |    Returns: FSDB (pointer to struct fragseqdb) / NULL if
144 |     not enough memories
145 |    Used for initializing a new database of FragSeqs. Allocates
146 |    enough memory for INIT_NUM_ALN_SEQS of these
147 | */
148 | FSDB init_FSDB ( void );
149 | 
150 | /* asp_len
151 |    Args: (1) AlnSeqP asp - pointer to an AlnSeq
152 |    Returns (1) int - total length of sequence
153 |    This function finds the total length of the sequence in this
154 |    aligned sequence fragment. This is the sum of the sequence
155 |    in the asp->seq field and all of the inserted sequence (if any)
156 |    in the asp->ins array
157 | */
158 | int asp_len( AlnSeqP asp ) ;
159 | 
160 | 
161 | 
162 | 
163 | 
164 | #ifdef	__cplusplus
165 | }
166 | #endif
167 | 
168 | #endif	/* _FSDB_H */
169 | 
170 | 


--------------------------------------------------------------------------------
/src/io.h:
--------------------------------------------------------------------------------
  1 | /* 
  2 |  * File:   io.h
  3 |  * Author: Ed Green
  4 |  *         Michael Siebauer
  5 |  *
  6 |  * Created on 25. Januar 2009, 14:41
  7 |  */
  8 | 
  9 | #ifndef _IO_H
 10 | #define	_IO_H
 11 | 
 12 | #ifdef	__cplusplus
 13 | extern "C" {
 14 | #endif
 15 | 
 16 | #include "params.h"
 17 | #include "types.h"
 18 | #include <stdio.h>
 19 | #include <stdlib.h>
 20 | #include <string.h>
 21 | #include <time.h>
 22 | #include "map_align.h"
 23 | 
 24 | /* find_input_type
 25 |    Args: 1. FILE* pointer to file to be analyzed
 26 |    Returns: sequence code indicating what kind of sequence file
 27 |             this is:
 28 | 	    0 => fasta
 29 | 	    1 => fastq
 30 |    Resets the input FILE pointer to the beginning of the file
 31 | */
 32 |   int find_input_type( FILE * FF );
 33 | 
 34 | /* read_next_seq
 35 |    Args: 1. FILE* pointer to file being read
 36 |          2. FragSeqP pointer to FragSeq where the next sequence data will go
 37 | 	 3. int code indicating which parser to use
 38 |    Returns: TRUE if a sequence was read,
 39 |             FALSE if EOF
 40 | */
 41 | 
 42 |   int read_next_seq( FILE * FF, FragSeqP frag_seq, int seq_code );
 43 | 
 44 | /* read_fasta
 45 |    args 1. pointer to file to be read
 46 |         2. pointer to FragSeq to put the sequence
 47 |    returns: TRUE if sequence was read,
 48 |             FALSE if EOF or not fasta
 49 | */
 50 | int read_fasta ( FILE * fasta, FragSeqP frag_seq );
 51 | 
 52 | /* read_fastq
 53 |    Args 1. pointer to file to be read
 54 |         2. pointer to FragSeq to put the sequence into
 55 |    Returns: TRUE if a sequence was read,
 56 |             FALSE if EOF
 57 | */
 58 | 
 59 | int read_fastq ( FILE * fastq, FragSeqP frag_seq );
 60 | 
 61 | /* calc_qual_sum
 62 |    Args: 1. pointer to a string of quality scores for this sequence
 63 |    Returns: 1. int - the sum of quality scores for this sequence
 64 |    This assumes that quality scores are represented as the 
 65 |    ASCII code + 64
 66 | */
 67 |   inline int calc_qual_sum( const char* qual_str );
 68 | 
 69 | 
 70 | 
 71 | /* Read in the reference sequence from fasta file and make reverse complement, too
 72 |  * Return 1 success
 73 |  0 failure
 74 | */
 75 | int read_fasta_ref(RefSeqP ref, const char* fn);
 76 | void make_reverse_complement(RefSeqP ref);
 77 | 
 78 | 
 79 |   /* Reads in a set of scoring matrices for each of the PSSM_DEPTH
 80 |      positions at the beginning and end of the sequence that are
 81 |      to have special scoring matrices and the single 'MIDDLE' matrix
 82 |      for everything in the middle.
 83 |      Puts these matrices into a PSSM structure.
 84 |      Returns a pointer to this structure (PSSMP)
 85 |   */
 86 |   PSSMP read_pssm(const char* fn);
 87 |   
 88 |   /* Reads one pairwise alignment from an Udo Stenzel align
 89 |      output file of semi-global alignments against a common
 90 |      target sequence (usually chrM) into a PWAlnFrag.
 91 |      Args: FILE* advanced to next pairwise alignment
 92 |      PWAlnFragP to be populated
 93 |      Returns 1 if success;
 94 |      0 if EOF or failure
 95 |      -1 for failure
 96 |      */
 97 |   int read_align_aln(FILE* align_f, PWAlnFragP af);
 98 |   
 99 |   FILE * fileOpen(const char *name, char access_mode[]);
100 |   
101 |   IDsListP parse_ids(char* fn);
102 |   
103 |   // Prints this string colored
104 | void color_print(char* string);
105 | 
106 | 
107 |   void ace_output(MapAlignmentP maln);
108 | 
109 |   void line_print_cons(char* consensus, char* aln_ref, char* ref_id, int* cov);
110 | 
111 |   void clustalw_print_cons(char* cons, char* aln_ref, char* ref_id);
112 | 
113 | 
114 | void fasta_aln_print(char* seq, char* id);
115 | void fasta_print_cons(char* cons, char* id);
116 | 
117 | #ifdef	__cplusplus
118 | }
119 | #endif
120 | 
121 | #endif	/* _IO_H */
122 | 
123 | 


--------------------------------------------------------------------------------
/src/kmer.c:
--------------------------------------------------------------------------------
  1 | #include "kmer.h"
  2 | 
  3 | /* kmer2inx
  4 |    Args: (1) a pointer to a character string;
  5 |              the kmer to find the corresponding index of;
  6 | 	     might not be null-terminated
  7 | 	 (2) length of the kmer
  8 | 	 (3) pointer to size_t to put the index
  9 |    Returns: TRUE if the index was set, FALSE if it could not
 10 |             be set because of some non A,C,G,T character
 11 |    Uses the formula A=>00, C=>01, G=>11, T=>11 to make a
 12 |    bit string for the kmer. Any other character is not allowed
 13 |    and will cause an error
 14 |    The bit string is constructed by reading the kmer from left
 15 |    to right. This bit-string is then interpreted as a variable
 16 |    of type size_t and is appropriate as an array index
 17 | */
 18 | int kmer2inx( const char* kmer,
 19 | 	      const unsigned int kmer_len,
 20 | 	      size_t* inx ) {
 21 |   size_t l_inx = 0;
 22 |   int i = 0;
 23 |   char curr_char;
 24 | 
 25 |   while( i < kmer_len ) {
 26 |     l_inx = l_inx << 2;
 27 |     curr_char = toupper(kmer[i]); // Upper case it in case it is not
 28 |     switch( curr_char ) {
 29 |     case 'A' :
 30 |       l_inx += 0;
 31 |       break;
 32 |     case 'C' :
 33 |       l_inx += 1;
 34 |       break;
 35 |     case 'G' :
 36 |       l_inx += 2;
 37 |       break;
 38 |     case 'T' :
 39 |       l_inx += 3;
 40 |       break;
 41 |     default :
 42 |       return 0; // not valid!
 43 |     }
 44 |     i++;
 45 |   }
 46 |   *inx = l_inx;
 47 |   return 1; // valid!
 48 | }
 49 | 
 50 | 
 51 | /* add_kmer
 52 |    Args: (1) KPL* kmer array
 53 |          (2) index position - must be valid
 54 |          (3) position of this kmer to add
 55 |    Returns: void
 56 |    Takes a newly discovered kmer position and adds it to the
 57 |    array. The given index specifies what the kmer is, but
 58 |    for this operation, we really do not care. We simply add
 59 |    the position to the positions field of this kmer. If this
 60 |    kmer has never been seen before, then we have to
 61 |    initialize it, too.
 62 | */
 63 | void add_kmer( KPL* kpa, const size_t inx, const size_t i ) {
 64 | 
 65 |   /* Check if it is not already initialized */
 66 |   if ( kpa[inx] == NULL ) {
 67 |     /* Never seen this kmer before, so set it up */
 68 |     kpa[inx] = (KPL)save_malloc(sizeof(KmerPosList));
 69 |     kpa[inx]->num_pos = 0;
 70 |     kpa[inx]->sorted  = 0;
 71 |   }
 72 | 
 73 |   /* Check to make sure we're not over the maximum number
 74 |      of allowable positions for this kmer */
 75 |   if ( kpa[inx]->num_pos == MAX_KMER_POS ) {
 76 |     return;
 77 |   }
 78 | 
 79 |   /* No? then add it */
 80 |   kpa[inx]->positions[kpa[inx]->num_pos] = i;
 81 |   kpa[inx]->sorted = 0;
 82 |   kpa[inx]->num_pos++;
 83 | 
 84 |   return;
 85 | }
 86 | /* init_kpa
 87 |    Args: (1) length of kmers to use
 88 |    Returns: pointer to KPL; an array of pointers to KmerPosList
 89 | */
 90 | KPL* init_kpa( const int kmer_len ) {
 91 |   KPL* kpa;
 92 |   unsigned int size = 1;
 93 |   if ( kmer_len > MAX_KMER_LEN ) {
 94 |     fprintf( stderr, "Cannot use kmer length greater than %d\n",
 95 | 	     MAX_KMER_LEN );
 96 |     exit( 2 );
 97 |   }
 98 |   size = size << (2*kmer_len);
 99 |   kpa = (KPL*)calloc(size, sizeof(KPL));
100 |   if ( kpa == NULL ) {
101 |     fprintf( stderr,
102 | 	     "Not enough memories for kmers of length %d\n",
103 | 	     kmer_len );
104 |     exit( 1 );
105 |   }
106 |   return kpa;
107 | }
108 | 
109 | 
110 | void grow_kmers ( KmersP k ) {
111 |   int new_size, i, j;
112 |   char** new_kmers;
113 |   char* first_id;
114 |   new_size = k->size * 2;
115 |   new_kmers = (char**)save_malloc( new_size * sizeof(char*) );
116 |   first_id  = (char*)save_malloc(k->size * (k->kmer_len + 1) * sizeof(char));
117 | 
118 |   /* Point first half of new_kmers to already existing pointers */
119 |   for( i = 0; i < k->size; i++ ) {
120 |     new_kmers[i] = k->kmers[i];
121 |   }
122 |   j = 0;
123 |   /* Point the second half of new_kmers to new points */
124 |   for( i = k->size; i < new_size; i++ ) {
125 |     new_kmers[i] = &first_id[(k->kmer_len + 1) * j++];
126 |   }
127 | 
128 |   /* Free old kmers */
129 |   free( k->kmers );
130 |   k->kmers = new_kmers;
131 |   k->size = new_size;
132 | }
133 | 
/* all_upper
   Args: (1) Pointer to char array (seq)
         (2) int number of characters to check (kmer_len)
   Returns: int 1 => none of the first kmer_len characters in seq is
                     lower case (also when kmer_len <= 0)
                0 => at least one of the characters is lower case
   Only lower-case letters fail the test; digits, gaps and other
   non-letters count as "upper" here, as in the original check.
   Defined without `inline` so that an external definition always
   exists regardless of which inline semantics the compiler applies.
*/
int all_upper( const char* seq, const int kmer_len ) {
  int i;
  for( i = 0; i < kmer_len; i++ ) {
    /* <ctype.h> functions need an unsigned char value (or EOF) */
    if ( islower( (unsigned char)seq[i] ) ) {
      return 0;
    }
  }
  return 1;
}
149 | 
150 | 
151 | /* populate_kpa
152 |  */
153 | int populate_kpa( KPL* kpa, const char* seq,
154 | 		  const size_t seq_len,
155 | 		  const int kmer_len,
156 | 		  const int soft_mask ) {
157 |   size_t i, inx;
158 |   for( i = 0; i <= (seq_len - kmer_len); i++ ) {
159 |     /* Add this kmer if we're not check for softmasking or
160 |        if we are and it passes the test */
161 |     if ( !soft_mask || all_upper(&seq[i], kmer_len) ) {
162 |       if ( kmer2inx( &seq[i], kmer_len, &inx ) ) {
163 | 	add_kmer( kpa, inx, i );
164 |       }
165 |     }
166 |   }
167 |   return 1;
168 | }
169 | 
170 | 
171 | 
172 | /* pop_kmers
173 |    Args: (1) RefSeqP ref - reference sequence with forward and reverse sequence
174 |          (2) int kmer_filt_len - length of kmers
175 |    Initializes a Kmers struct and populates it with all the kmers in the
176 |    forward and reverse-complement sequence of the input RefSeq.
177 |    Returns: pointer to KmersP
178 | */
179 | KmersP pop_kmers( RefSeqP ref, int kmer_filt_len ) {
180 |   int i, pos;
181 |   KmersP k;
182 |   char* first_kmer;
183 |   char* curr_kmer;
184 |   /* Allocate memory for the kmers, guessing how much will be required */
185 |   k = (KmersP)save_malloc(sizeof( struct kmers ) );
186 |   k->num_kmers = 0;
187 |   k->kmer_len = kmer_filt_len;
188 |   k->size = ref->seq_len; // just a guess
189 |   k->kmers = (char**)save_malloc( k->size * sizeof( char* ) );
190 |   first_kmer = (char*)save_malloc( k->size * (kmer_filt_len+1) * sizeof(char) );
191 |   for( i = 0; i < k->size; i++ ) {
192 |     k->kmers[i] = &first_kmer[i * (kmer_filt_len+1)];
193 |   }
194 | 
195 |   curr_kmer = (char*)save_malloc( (kmer_filt_len + 1) * sizeof(char) );
196 |   curr_kmer[kmer_filt_len] = '\0'; // Null-terminate for now and forever
197 | 
198 |   /* Now, cruise through the forward and reverse complement and
199 |      load up the kmers */
200 |   for( pos = 0; pos <= ref->wrap_seq_len - kmer_filt_len; pos++ ) {
201 |     strncpy( curr_kmer, &ref->seq[pos], kmer_filt_len );
202 |     if ( bsearch( &curr_kmer, k->kmers, k->num_kmers, sizeof(char*), idCmp )
203 | 	 == NULL ) {
204 |       /* not found, add it */
205 |       k->num_kmers++;
206 |       if ( k->num_kmers >= k->size ) {
207 | 	grow_kmers( k );
208 |       }
209 |       strncpy( k->kmers[k->num_kmers - 1], curr_kmer, (kmer_filt_len+1) );
210 |       qsort( k->kmers, k->num_kmers, sizeof(char*), idCmp );
211 |     }
212 |   }
213 | 
214 |   for( pos = 0; pos <= ref->wrap_seq_len - kmer_filt_len; pos++ ) {
215 |     strncpy( curr_kmer, &ref->rcseq[pos], kmer_filt_len );
216 |     if ( bsearch( &curr_kmer, k->kmers, k->num_kmers, sizeof(char*), idCmp )
217 | 	 == NULL ) {
218 |       /* not found, add it */
219 |       k->num_kmers++;
220 |       if ( k->num_kmers >= k->size ) {
221 | 	grow_kmers( k );
222 |       }
223 |       strncpy( k->kmers[k->num_kmers], curr_kmer, (kmer_filt_len+1) );
224 |       qsort( k->kmers, k->num_kmers, sizeof(char*), idCmp );
225 |     }
226 |   }
227 | 
228 |   k->sorted = 1;
229 | 
230 |   free( curr_kmer );
231 |   return k;
232 | }
233 | 
234 | 
235 | /* Returns: TRUE (1) if we should align this sequence
236 |             FALSE (0) if we should NOT align this sequence because
237 |                       it shares no kmers with the reference
238 | */
239 | int new_kmer_filter( FragSeqP fs,
240 | 		     KPL* fkpa,
241 | 		     KPL* rkpa,
242 | 		     int kmer_len,
243 | 		     AlignmentP fwa,
244 | 		     AlignmentP rca ) {
245 |   size_t frag_len, frag_pos, inx, ref_len, ref_pos, i;
246 |   int mask_min, mask_max; // Sometimes these become negative
247 |   unsigned int num_f_kmers_found = 0;
248 |   unsigned int num_r_kmers_found = 0;
249 | 
250 |   /* Check for no kmer filtering */
251 |   if ( kmer_len < 0 ) {
252 |     memset( fwa->align_mask, 1, fwa->len1 );
253 |     memset( fwa->align_mask, 1, rca->len1 );
254 |     return 1;
255 |   }
256 | 
257 |   /* Reset the alignment masks */
258 |   memset( fwa->align_mask, 0, fwa->len1 );
259 |   memset( rca->align_mask, 0, rca->len1 );
260 | 
261 |   /* How long is this fragment? */
262 |   if ( fs->trimmed ) {
263 |     frag_len = (fs->trim_point + 1);
264 |   }
265 |   else {
266 |     frag_len = fs->seq_len;
267 |   }
268 | 
269 |   if ( frag_len < kmer_len ) {
270 |     return 0;
271 |   }
272 | 
273 |   /* Zip through all the kmers in this fragment sequence. If any
274 |      are present in the forward or reverse kpa's, then we pass
275 |      the filter, i.e., return 1 */
276 |   for( frag_pos = 0; frag_pos <= (frag_len - kmer_len); frag_pos++ ) {
277 |     if ( kmer2inx( &fs->seq[frag_pos], kmer_len, &inx ) ) {
278 |       if ( fkpa[inx] != NULL ) {
279 | 	ref_len = fwa->len1;
280 | 	/* There are some kmers here. Add them to the total
281 | 	   count and update the align_mask */
282 | 	num_f_kmers_found += fkpa[inx]->num_pos;
283 | 	if ( num_f_kmers_found >= KMER_SATURATE ) {
284 | 	  memset( fwa->align_mask, 1, fwa->len1 );
285 | 	}
286 | 
287 | 	for( i = 0; i < fkpa[inx]->num_pos; i++ ) {
288 | 	  /* Unmask the region surrounding this kmer */
289 | 	  ref_pos = fkpa[inx]->positions[i];
290 | 	  mask_min = ref_pos - frag_pos - ALIGN_MASK_BUFFER;
291 | 	  if ( mask_min < 0 ) {
292 | 	    mask_min = 0;
293 | 	  }
294 | 	  mask_max = ref_pos + (frag_len - frag_pos) + ALIGN_MASK_BUFFER;
295 | 	  if ( mask_max >= ref_len ) {
296 | 	    mask_max = (ref_len - 1);
297 | 	  }
298 | 	  memset( &fwa->align_mask[mask_min], 1, (mask_max-mask_min+1) );
299 | 	}
300 |       }
301 | 
302 |       if ( rkpa[inx] != NULL ) {
303 | 	ref_len = rca->len1;
304 | 	/* There are some kmers here. Add them to the total
305 | 	   count and update the align_mask */
306 | 	num_r_kmers_found += rkpa[inx]->num_pos;
307 | 	if ( num_r_kmers_found >= KMER_SATURATE ) {
308 | 	  memset( rca->align_mask, 1, rca->len1 );
309 | 	}
310 | 
311 | 	for( i = 0; i < rkpa[inx]->num_pos; i++ ) {
312 | 	  /* Unmask the region surrounding this kmer */
313 | 	  ref_pos = rkpa[inx]->positions[i];
314 | 	  mask_min = ref_pos - frag_pos - ALIGN_MASK_BUFFER;
315 | 	  if ( mask_min < 0 ) {
316 | 	    mask_min = 0;
317 | 	  }
318 | 
319 | 	  mask_max = ref_pos + frag_len - frag_pos - 1 + ALIGN_MASK_BUFFER;
320 | 	  if ( mask_max >= ref_len ) {
321 | 	    mask_max = (ref_len - 1);
322 | 	  }
323 | 	  memset( &rca->align_mask[mask_min], 1, (mask_max-mask_min+1) );
324 | 	}
325 |       }
326 |     }
327 |   }
328 | 
329 |   /* Return 0 if no kmers found; TRUE (not 0) if some kmers found */
330 |   return (num_f_kmers_found + num_r_kmers_found);
331 | }
332 | 
333 | int kmer_filter( int kmer_filt_len, FragSeqP fs, KmersP k ) {
334 |   int len, pos;
335 |   char* test_kmer;
336 | 
337 |   /* First, check if the user wants any kmer filtering
338 |      Special value -1 means none and it doesn't make sense
339 |      to filter for kmers <= 0 */
340 |   if ( kmer_filt_len < 0 ) {
341 |     return 1;
342 |   }
343 | 
344 |   test_kmer = (char*)save_malloc( (k->kmer_len + 1) * sizeof(char) );
345 |   test_kmer[k->kmer_len] = '\0';
346 | 
347 |   if ( fs->trimmed ) {
348 |     len = fs->trim_point;
349 |   }
350 |   else {
351 |     len = fs->seq_len - 1;
352 |   }
353 | 
354 |   for( pos = 0; pos <= len - k->kmer_len; pos++ ) {
355 |     strncpy( test_kmer, &fs->seq[pos], k->kmer_len );
356 |     if ( bsearch( &test_kmer, k->kmers, k->num_kmers, sizeof(char*), idCmp ) !=
357 | 	 NULL ) {
358 |       free( test_kmer );
359 |       return 1;
360 |     }
361 |   }
362 | 
363 |   free( test_kmer );
364 |   return 0;
365 | 
366 | }
367 | 
368 | 


--------------------------------------------------------------------------------
/src/kmer.h:
--------------------------------------------------------------------------------
 1 | /* 
 2 |  * File:   kmer.h
 3 |  * Author: TCO
 4 |  *
 5 |  * Created on 3. Februar 2009, 13:11
 6 |  */
 7 | 
 8 | #ifndef _KMER_H
 9 | #define	_KMER_H
10 | 
11 | #ifdef	__cplusplus
12 | extern "C" {
13 | #endif
14 | 
15 | #include "types.h"
16 | #include <stdlib.h>
17 | #include <stdio.h>
18 | #include "map_align.h"
19 | 
20 | 
21 | /* add_kmer
22 |    Args: (1) KPL* kmer array
23 |          (2) index position - must be valid
24 |          (3) position of this kmer to add
25 |    Returns: void
26 |    Takes a newly discovered kmer position and adds it to the
27 |    array. The given index specifies what the kmer is, but
28 |    for this operation, we really do not care. We simply add
29 |    the position to the positions field of this kmer. If this
30 |    kmer has never been seen before, then we have to
31 |    initialize it, too.
32 | */
33 | void add_kmer( KPL* kpa, const size_t inx, const size_t i ) ;
34 | 
35 | /* init_kpa
36 |    Args: (1) length of kmers to use
37 |    Returns: pointer to KPL; an array of pointers to KmerPosList
38 | */
39 | KPL* init_kpa( const int kmer_len ) ;
40 | 
41 | 
42 | void grow_kmers ( KmersP k ) ;
43 | 
44 | /* populate_kpa
45 |  */
46 | int populate_kpa( KPL* kpa, const char* seq,
47 | 		  const size_t seq_len,
48 | 		  const int kmer_len,
49 | 		  const int soft_mask ) ;
50 | 
51 | /* pop_kmers
52 |    Args: (1) RefSeqP ref - reference sequence with forward and reverse sequence
53 |          (2) int kmer_filt_len - length of kmers
54 |    Initializes a Kmers struct and populates it with all the kmers in the
55 |    forward and reverse-complement sequence of the input RefSeq.
56 |    Returns: pointer to KmersP
57 | */
58 | KmersP pop_kmers( RefSeqP ref, int kmer_filt_len ) ;
59 | 
60 | /* kmer2inx
61 |    Args: (1) a pointer to a character string;
62 |              the kmer to find the corresponding index of;
63 | 	     might not be null-terminated
64 | 	 (2) length of the kmer
65 | 	 (3) pointer to size_t to put the index
66 |    Returns: TRUE if the index was set, FALSE if it could not
67 |             be set because of some non A,C,G,T character
 68 |    Uses the formula A=>00, C=>01, G=>10, T=>11 to make a
69 |    bit string for the kmer. Any other character is not allowed
70 |    and will cause an error
71 |    The bit string is constructed by reading the kmer from left
72 |    to right. This bit-string is then interpreted as a variable
73 |    of type size_t and is appropriate as an array index
74 | */
75 | int kmer2inx( const char* kmer,
76 | 		     const unsigned int kmer_len,
77 | 		     size_t* inx ) ;
78 | 
79 | /* Returns: TRUE (1) if we should align this sequence
80 |             FALSE (0) if we should NOT align this sequence because
81 |                       it shares no kmers with the reference
82 | */
83 | int new_kmer_filter( FragSeqP fs,
84 | 		     KPL* fkpa,
85 | 		     KPL* rkpa,
86 | 		     int kmer_len,
87 | 		     AlignmentP fwa,
88 | 		     AlignmentP rca ) ;
89 | 
90 | int kmer_filter( int kmer_filt_len, FragSeqP fs, KmersP k ) ;
91 | 
92 | 
93 | #ifdef	__cplusplus
94 | }
95 | #endif
96 | 
97 | #endif	/* _KMER_H */
98 | 
99 | 


--------------------------------------------------------------------------------
/src/map_align.h:
--------------------------------------------------------------------------------
  1 | 
  2 | #ifndef INCLUDED_MAP_ALIGN_H
  3 | #define INCLUDED_MAP_ALIGN_H
  4 | 
  5 | #include <stdlib.h>
  6 | #include <math.h>
  7 | #include <ctype.h>
  8 | #include <getopt.h>
  9 | #include <time.h>
 10 | #include <limits.h>
 11 | #include <float.h>
 12 | #include <stdio.h>
 13 | #include "params.h"
 14 | #include "types.h"
 15 | #include <string.h>
 16 | #include "io.h"
 17 | #include "map_alignment.h"
 18 | 
 19 | 
 20 | /* Function Prototypes */
 21 | 
 22 | 
 23 | /* base2inx
 24 |  Args (1) char base - the base to find the index for
 25 |  Returns a short int of the index position into a substitution
 26 |  matrix for this base
 27 |  This function finds the corresponding index for a
 28 |  PSSMP->sm[DEPTH][][] (position-specific substitution matrix)
 29 |  for a given base. This index is used to look up the appropriate
 30 |  score. This function assumes that the rows and columns are in
 31 |  the order: A, C, G, T, N
 32 |  */
 33 | 
 34 | inline short int base2inx(const char base) ;
 35 | 
 36 | int idCmp(const void* id1_, const void* id2_) ;
 37 | 
 38 | /* Takes an AlnSeqP and a beginning and end coordinate of a region.
 39 |  All coordinates are 0-based
 40 |  Returns true if this AlnSeq overlaps the region at all, false
 41 |  if it does not */
 42 | inline int alnseq_ol_reg(AlnSeqP as, const int rs, const int re);
 43 | 
 44 | 
 45 | /* This IDsList */
 46 | IDsListP init_ids_list(void) ;
 47 | 
 48 | void add_id(char* new_id, IDsListP used_ids_list) ;
 49 | 
 50 | void grow_ids_list(IDsListP ids) ;
 51 | 
 52 | int allowed_alignment(int ids_rest, IDsListP rest_ids_list, int no_dups,
 53 | 		IDsListP used_ids_list, PWAlnFragP pwaln, double score_int,
 54 | 		double score_slo) ;
 55 | 
 56 | void show_single_pos(int ref_pos, char ref_base, char cons_base, BaseCountsP bcs) ;
 57 | 
 58 | void add_base(char b, BaseCountsP bcs, PSSMP psm, int pssm_code) ;
 59 | 
 60 | void reset_base_counts(BaseCountsP bc) ;
 61 | 
 62 | /* Takes a pointer to a BaseCounts bcs and the maln->cons_code
 63 |    (consensus code)
 64 |    The bcs must have valid data
 65 |    Returns the character of the consensus at this position as
 66 |    defined by the scheme to use and the data in bcs
 67 |    If the coverage is 0, returns N
 68 |    If there are >= PERC4GAP percent of reads with a gap, returns -
 69 |    Otherwise:
 70 |    If cons_code = 1, returns any base with score >= MIN_SCORE_CONS
 71 |                      or N if none
 72 |       cons_code = 2, returns any base with score >= MIN_SC_DIFF_CONS
 73 |                      better than the second highest scoring base or
 74 | 		     N if none
 75 | */
 76 | char find_consensus(BaseCountsP bcs, int cons_code) ;
 77 | 
 78 | int alnSeqCmp(const void* as1_, const void* as2_) ;
 79 | 
 80 | char revcom_char(const char base) ;
 81 | 
 82 | /* Takes a MapAlignmentP and a position where some of
 83 |  the aligned fragments have an insert relative to the
 84 |  reference. That is, maln->ref->gaps[position] > 0.
 85 |  Populates the char* ins_cons and int* cons_cov
 86 |  arrays with the consensus sequence and consensus
 87 |  coverage, respectively. These must be appropriately
 88 |  sized elsewhere. If out_format is the special value
 89 |  of 4, then we just show these differences now and
 90 |  do not return anything.
 91 |  */
 92 | void find_ins_cons(MapAlignmentP maln, int pos, char* ins_cons, int* cons_cov,
 93 | 		int out_format) ;
 94 | 
 95 | void revcom_PWAF(PWAlnFragP pwaln) ;
 96 | 
 97 | /* For a given region, defined by reg_start and reg_end, show
 98 |  the reference sequence, the consensus sequence,
 99 |  and the sequence of all the fragments that overlap this
100 |  region at all.
101 |  */
102 | void print_region(MapAlignmentP maln, int reg_start, int reg_end,
103 | 		int out_format, int in_color) ;
104 | 
105 | void col_print_cons(char* consensus, char* aln_ref, int* cov, int* ref_poss,
106 | 		MapAlignmentP maln) ;
107 | 
108 | /* Takes a pointer to a populated PWAlnFrag (pwaln) and
109 |  a pointer to a populated MapAlignent (maln)
110 |  Does:
111 |  1. Adds this aligned sequence, without gaps to maln->AlnSeqArray,
112 |  growing this array if necessary
113 |  2. Populates the gaps array of this newly aligned fragment to
114 |  indicate where its gaps are relative to the reference
115 |  3. Updates the gaps array of the reference sequence (maln->ref->gaps[])
116 |  and the gaps array of all aligned fragments to accommodate any
117 |  new gaps this new fragment may require
118 |  Returns: 1 (TRUE) if success
119 |  0 (FALSE) if failure
120 |  */
121 | int merge_pwaln_into_maln(PWAlnFragP pwaln, MapAlignmentP maln) ;
122 | 
123 | 
124 | /* Takes the description from an Udo align aligned sequence
125 |  and puts the start, end, strand, and score information in
126 |  the correct field of the PWAlnFragP
127 |  The desc (description) is a string like this, e.g.:
128 |  "- 4199-4261 score=5441"
129 |  */
130 | int ses_from_align_desc(PWAlnFragP pwaln, int* strand) ;
131 | 
132 | /* adapt_from_desc checks the frag_desc string of a PWAlnFrag,
133 |  given a pointer to one (PWAlnFragP) and sets the trimmed
134 |  field to true (1) if the phrase "adapter cut off" is there
135 |  Returns true if everything went fine, false otherwise
136 |  */
137 | int adapt_from_desc(PWAlnFragP af) ;
138 | 
139 | /* Grow the space for a sequence (an array of char)
140 |  to twice its current size
141 |  Copy its current contents into the new sequence
142 |  Free the now unused old memory
143 |  */
144 | char* grow_seq(char* seq, int size);
145 | 
146 | 
147 | 
148 | 
149 | 
150 | #endif
151 | 


--------------------------------------------------------------------------------
/src/map_alignment.c:
--------------------------------------------------------------------------------
  1 | #include "map_alignment.h"
  2 | 
  3 | /* Initialize a MapAlignment object and return a pointer to it */
  4 | MapAlignmentP init_map_alignment(void) {
  5 |     MapAlignmentP aln;
  6 |     AlnSeqP as;
  7 |     size_t i, j;
  8 | 
  9 |     // First, allocate the alignment
 10 |     aln = (MapAlignmentP) save_malloc(sizeof (MapAlignment));
 11 | 
 12 |     // Allocate memory for the RefSeq
 13 |     aln->ref = (RefSeqP) save_malloc(sizeof (RefSeq));
 14 |     if (aln->ref == NULL) {
 15 |         return NULL;
 16 |     }
 17 |     // Zero-out the RefSeq
 18 |     for (i = 0; i <= MAX_ID_LEN; i++) {
 19 |         aln->ref->id[i] = '\0';
 20 |     }
 21 |     for (i = 0; i <= MAX_DESC_LEN; i++) {
 22 |         aln->ref->desc[i] = '\0';
 23 |     }
 24 |     aln->ref->seq = NULL;
 25 |     aln->ref->rcseq = NULL;
 26 |     aln->ref->size = 0;
 27 |     aln->ref->gaps = NULL;
 28 |     aln->ref->circular = 0;
 29 |     aln->ref->wrap_seq_len = 0;
 30 | 
 31 |     // Now, allocate the array of pointers to the
 32 |     // aligned seqs
 33 |     aln->AlnSeqArray = (AlnSeqP*) save_malloc(INIT_NUM_ALN_SEQS *
 34 |             sizeof ( AlnSeqP));
 35 |     if (aln->AlnSeqArray == NULL) {
 36 |         return NULL;
 37 |     }
 38 | 
 39 |     // Now, point the pointers to the pointees
 40 |     for (i = 0; i < INIT_NUM_ALN_SEQS; i++) {
 41 |         aln->AlnSeqArray[i] = (AlnSeqP) save_malloc(sizeof(AlnSeq));
 42 |         /* Zero them out */
 43 |         as = aln->AlnSeqArray[i];
 44 |         for (j = 0; j <= MAX_ID_LEN; j++) {
 45 |             as->id[j] = '\0';
 46 |         }
 47 |         for (j = 0; j <= MAX_DESC_LEN; j++) {
 48 |             as->desc[j] = '\0';
 49 |         }
 50 |         for (j = 0; j <= INIT_ALN_SEQ_LEN; j++) {
 51 |             as->seq[j] = '\0';
 52 |         }
 53 |         /* Set all their char* ins to NULL */
 54 |         for (j = 0; j <= INIT_ALN_SEQ_LEN; j++) {
 55 |             as->ins[j] = NULL;
 56 |         }
 57 |         as->start = 0;
 58 |         as->end = 0;
 59 |         as->revcom = 0;
 60 |         as->trimmed = 0;
 61 |         as->dropped = 0;
 62 |         as->score = 0;
 63 |         as->segment = 'n';
 64 |     }
 65 | 
 66 |     aln->size = INIT_NUM_ALN_SEQS;
 67 |     aln->num_aln_seqs = 0;
 68 | 
 69 |     return aln;
 70 | }
 71 | 
 72 | /* free_map_alignment
 73 |  Takes a MapAlignmentP (maln)
 74 |  Frees the memory pointed to by its components
 75 |  Returns nothing
 76 |  */
 77 | void free_map_alignment(MapAlignmentP maln) {
 78 |     int i,j ;
 79 | 
 80 |     /* First, free the maln->ref components */
 81 |     free(maln->ref->seq);
 82 |     free(maln->ref->rcseq);
 83 |     free(maln->ref->gaps);
 84 |     free(maln->ref);
 85 | 
 86 |     /* Now, free the AlnSeqArray */
 87 |     for( i=0 ; i<maln->num_aln_seqs ; ++i )
 88 |     {
 89 |         int len = strlen(maln->AlnSeqArray[i]->seq) ;
 90 |         for( j=0 ; j < len ; ++j )
 91 |             free(maln->AlnSeqArray[i]->ins[j]) ;
 92 |     }
 93 | 
 94 |     for( i=0 ; i<maln->size ; ++i )
 95 |         free(maln->AlnSeqArray[i]);
 96 |     free(maln->AlnSeqArray);
 97 | 
 98 |     free(maln->fpsm);
 99 |     free(maln->rpsm);
100 |     /* Now, free the MapAlignment */
101 |     free(maln);
102 | 
103 |     /* That oughta do it */
104 |     return;
105 | }
106 | 
107 | void show_consensus(MapAlignmentP maln, int out_format) {
108 |     char* consensus;
109 |     char* aln_ref;
110 |     char* ins_cons;
111 |     int j, cons_pos, ref_pos, ref_gaps;
112 |     int* cov;
113 |     int* ins_cov;
114 |     int* ref_poss;
115 |     int len_consensus = get_consensus_length(maln);
116 |     AlnSeqP aln_seq;
117 |     BaseCountsP bcs;
118 |     PSSMP psm;
119 | 
120 |     bcs = (BaseCountsP) save_malloc(sizeof (BaseCounts));
121 |     reset_base_counts(bcs);
122 | 
123 |     consensus = (char*) save_malloc((len_consensus + 1) * sizeof (char));
124 |     aln_ref = (char*) save_malloc((len_consensus + 1) * sizeof (char));
125 |     cov = (int*) save_malloc((len_consensus + 1) * sizeof (int));
126 |     ref_poss = (int*) save_malloc((len_consensus + 1) * sizeof (int));
127 | 
128 |     ins_cons = (char*) save_malloc(MAX_INS_LEN * sizeof (char));
129 |     ins_cov = (int*) save_malloc(MAX_INS_LEN * sizeof (int));
130 | 
131 |     cons_pos = 0;
132 |     ref_pos = 0;
133 |     /* Go through each position of the reference sequence */
134 |     for (ref_pos = 0; ref_pos < maln->ref->seq_len; ref_pos++) {
135 |         /* How many gaps preceeded this position? */
136 |         ref_gaps = maln->ref->gaps[ref_pos];
137 | 
138 |         /* Add these gaps to the reference aligned string */
139 |         if ((ref_gaps > 0) && (ref_pos > 0)) {
140 |             find_ins_cons(maln, ref_pos, ins_cons, ins_cov, out_format);
141 |             for (j = 0; j < ref_gaps; j++) {
142 |                 aln_ref[cons_pos] = '-';
143 |                 consensus[cons_pos] = ins_cons[j];
144 |                 cov[cons_pos] = ins_cov[j];
145 |                 ref_poss[cons_pos] = ref_pos;
146 |                 cons_pos++;
147 |             }
148 |         }
149 |         /* Re-zero all the base counts */
150 |         reset_base_counts(bcs);
151 | 
152 |         /* Find all the aligned fragments that include this
153 |            position and make a consensus from it */
154 |         for (j = 0; j < maln->num_aln_seqs; j++) {
155 |             aln_seq = maln->AlnSeqArray[j];
156 |             /* Does this aligned fragment cover this position? */
157 |             if ((aln_seq->start <= ref_pos) && // checked
158 |                     (aln_seq->end >= ref_pos)) {
159 | 
160 |                 if (aln_seq->revcom) {
161 |                     psm = maln->rpsm;
162 |                 } else {
163 |                     psm = maln->fpsm;
164 |                 }
165 | 
166 |                 add_base(aln_seq->seq[ref_pos - aln_seq->start], bcs, psm,
167 |                         aln_seq->smp[ref_pos - aln_seq->start]);
168 |             }
169 |         }
170 |         consensus[cons_pos] = find_consensus(bcs, maln->cons_code);
171 |         aln_ref[cons_pos] = maln->ref->seq[ref_pos];
172 |         cov[cons_pos] = bcs->cov;
173 |         ref_poss[cons_pos] = ref_pos;
174 |         if ((out_format == 4) && !(aln_ref[cons_pos] == consensus[cons_pos])) {
175 |             show_single_pos(ref_pos, aln_ref[cons_pos], consensus[cons_pos],
176 |                     bcs);
177 |         }
178 |         if (out_format == 41) {
179 |             show_single_pos(ref_pos, aln_ref[cons_pos], consensus[cons_pos],
180 |                     bcs);
181 |         }
182 |         cons_pos++;
183 |     }
184 |     consensus[cons_pos] = '\0';
185 |     aln_ref[cons_pos] = '\0';
186 | 
187 |     /* Now, output the reference and consensus sequences and the
188 |        coverage in specified way */
189 |     switch (out_format) {
190 |         case 1:
191 |             clustalw_print_cons(consensus, aln_ref, maln->ref->id);
192 |             break;
193 |         case 2:
194 |             line_print_cons(consensus, aln_ref, maln->ref->id, cov);
195 |             break;
196 |         case 3:
197 |             /* Add starts and ends info */
198 |             print_assembly_summary(maln);
199 |             col_print_cons(consensus, aln_ref, cov, ref_poss, maln);
200 |             break;
201 |         case 4:
202 |             ; /* Do nothing, this one is checked along the way */
203 |             break;
204 |         case 41:
205 |             ; /* Do nothing, this one is checked along the way */
206 |             break;
207 |         case 5:
208 |             fasta_print_cons(consensus, maln->ref->id);
209 |             break;
210 |     }
211 | 
212 |     /* Free memory! */
213 |     free(bcs);
214 |     free(consensus);
215 |     free(aln_ref);
216 |     free(cov);
217 |     free(ref_poss);
218 |     free(ins_cons);
219 |     free(ins_cov);
220 | }
221 | 
222 | int get_consensus_length(MapAlignmentP maln) {
223 |     int i, num_gaps = 0;
224 |     for (i = 0; i < maln->ref->seq_len; i++)
225 |         num_gaps += maln->ref->gaps[i];
226 |     return maln->ref->seq_len + num_gaps;
227 | }
228 | 
229 | char* get_consensus(MapAlignmentP maln) {
230 |     int len_consensus = get_consensus_length(maln);
231 |     char* consensus = (char*) save_malloc((len_consensus + 1) * sizeof (char));
232 |     char* ins_cons = (char*) save_malloc(MAX_INS_LEN * sizeof (char));
233 |     int j, cons_pos, ref_pos, ref_gaps;    
234 |     int* ins_cov = (int*) save_malloc(MAX_INS_LEN * sizeof (int));    
235 |     AlnSeqP aln_seq;
236 |     BaseCountsP bcs;
237 |     PSSMP psm;
238 |     bcs = (BaseCountsP) save_malloc(sizeof (BaseCounts));
239 |     reset_base_counts(bcs);
240 | 
241 |     cons_pos = 0;
242 |     ref_pos = 0;
243 |     /* Go through each position of the reference sequence */
244 |     for (ref_pos = 0; ref_pos < maln->ref->seq_len; ref_pos++) {
245 |         /* How many gaps preceeded this position? */
246 |         ref_gaps = maln->ref->gaps[ref_pos];
247 | 
248 |         /* Add these gaps to the reference aligned string */
249 |         if ((ref_gaps > 0) && (ref_pos > 0)) {
250 |             find_ins_cons(maln, ref_pos, ins_cons, ins_cov, 5);
251 |             for (j = 0; j < ref_gaps; j++) {
252 |                 consensus[cons_pos] = ins_cons[j];
253 |                 cons_pos++;
254 |             }
255 |         }
256 |         /* Re-zero all the base counts */
257 |         reset_base_counts(bcs);
258 | 
259 |         /* Find all the aligned fragments that include this
260 |            position and make a consensus from it */
261 |         for (j = 0; j < maln->num_aln_seqs; j++) {
262 |             aln_seq = maln->AlnSeqArray[j];
263 |             /* Does this aligned fragment cover this position? */
264 |             if ((aln_seq->start <= ref_pos) && // checked
265 |                     (aln_seq->end >= ref_pos)) {
266 | 
267 |                 psm = aln_seq->revcom ? maln->rpsm : maln->fpsm;
268 | 
269 |                 add_base(aln_seq->seq[ref_pos - aln_seq->start], bcs, psm,
270 |                         aln_seq->smp[ref_pos - aln_seq->start]);
271 |             }
272 |         }
273 |         consensus[cons_pos] = find_consensus(bcs, maln->cons_code);
274 |         cons_pos++;
275 |     }
276 |     consensus[cons_pos] = '\0';
277 |     return consensus;
278 | }
279 | 
280 | /* Write out the data in a MapAlignment data structure
281 |  to a file
282 |  */
283 | int write_ma(char* fn, MapAlignmentP maln) {
284 |     int i, j, row;
285 |     //    char* at;
286 |     time_t t;
287 |     int aln_seq_len;
288 |     FILE* MAF;
289 |     AlnSeqP as;
290 |     PSSMP fpsm, rpsm;
291 | 
292 |     MAF = fileOpen(fn, "w");
293 | 
294 |     t = time(NULL);
295 |     //at = (char*) save_malloc(64 * sizeof (char));
296 | 
297 |     //at = asctime(localtime(&t));
298 |     /* First, write a nice header */
299 |     fprintf(MAF, "/* map_alignment [V%s] */ %s",PACKAGE_VERSION , 
300 | 	    asctime(localtime(&t)) );
301 | 
302 |     /* Write MapAlignment Info */
303 |     fprintf(MAF, "MALN_NAS %d\n", maln->num_aln_seqs);
304 |     fprintf(MAF, "MALN_SIZ %d\n", maln->size);
305 |     fprintf(MAF, "MALN_COC %d\n", maln->cons_code);
306 | 
307 |     /* Write the reference sequence and associated data */
308 |     fprintf(MAF, "__REFERENCE__\n");
309 |     fprintf(MAF, "ID %s\n", maln->ref->id);
310 |     fprintf(MAF, "DESC %s\n", maln->ref->desc);
311 |     fprintf(MAF, "LEN %d\n", maln->ref->seq_len);
312 |     fprintf(MAF, "SIZE %d\n", maln->ref->size);
313 |     fprintf(MAF, "SEQ ");
314 |     for (i = 0; i < maln->ref->seq_len; i++) {
315 |         fprintf(MAF, "%c", maln->ref->seq[i]);
316 |     }
317 |     fprintf(MAF, "\n");
318 | 
319 |     fprintf(MAF, "GAPS");
320 |     for (i = 0; i < maln->ref->seq_len; i++) {
321 |         fprintf(MAF, " %d", maln->ref->gaps[i]);
322 |     }
323 |     fprintf(MAF, "\n");
324 | 
325 |     /* Write the PSSMs */
326 |     fpsm = maln->fpsm;
327 |     rpsm = maln->rpsm;
328 |     fprintf(MAF, "__PSSM__\n");
329 |     fprintf(MAF, "DEPTH %d\n", fpsm->depth);
330 |     fprintf(MAF, "FPSM:\n");
331 |     for (i = 0; i <= (fpsm->depth * 2); i++) {
332 |       for (row = 0; row <= 4; row++) {
333 | 	fprintf(MAF, "%d %d %d %d %d\n", 
334 | 		fpsm->sm[i][row][0],
335 | 		fpsm->sm[i][row][1], 
336 | 		fpsm->sm[i][row][2],
337 | 		fpsm->sm[i][row][3], 
338 | 		fpsm->sm[i][row][4]);
339 |       }
340 |       fprintf(MAF, "\n");
341 |     }
342 |     
343 |     fprintf(MAF, "RPSM:\n");
344 |     for (i = 0; i <= (fpsm->depth * 2); i++) {
345 |       for (row = 0; row <= 4; row++) {
346 | 	fprintf(MAF, "%d %d %d %d %d\n", rpsm->sm[i][row][0],
347 | 		rpsm->sm[i][row][1], rpsm->sm[i][row][2],
348 | 		rpsm->sm[i][row][3], rpsm->sm[i][row][4]);
349 |       }
350 |       fprintf(MAF, "\n");
351 |     }
352 | 
353 |     /* Write all the aligned fragments */
354 |     fprintf(MAF, "__ALNSEQS__\n");
355 |     for (i = 0; i < maln->num_aln_seqs; i++) {
356 |       as = maln->AlnSeqArray[i];
357 |       aln_seq_len = strlen(as->seq);
358 |       fprintf(MAF, "ID %s\n", as->id);
359 |       fprintf(MAF, "DESC %s\n", as->desc);
360 |       fprintf(MAF, "SCORE %d\n", as->score);
361 |       fprintf(MAF, "NUM_INPUTS %d\n", as->num_inputs);
362 |       fprintf(MAF, "START %d\n", as->start);
363 |       fprintf(MAF, "END %d\n", as->end);
364 |       fprintf(MAF, "RC %d\n", !!as->revcom);
365 |       fprintf(MAF, "TR %d\n", !!as->trimmed);
366 |       fprintf(MAF, "DR %d\n", !!as->dropped);
367 |       fprintf(MAF, "SEG %c\n", as->segment);
368 |       fprintf(MAF, "SEQ %s\n", as->seq);
369 |       fprintf(MAF, "SMP %s\n", as->smp);
370 |       fprintf(MAF, "INS_POS");
371 |         for (j = 0; j < aln_seq_len; j++) {
372 |             if (as->ins[j] == NULL) {
373 |                 //	fprintf( MAF, " _" );
374 |             } else {
375 |                 fprintf(MAF, " %d %s", j, as->ins[j]);
376 |             }
377 |         }
378 |         fprintf(MAF, "\n");
379 |     }
380 |     fclose(MAF);
381 |     return 1;
382 | }
383 | 
384 | MapAlignmentP read_ma(const char* fn) {
385 |     MapAlignmentP maln;
386 |     AlnSeqP as;
387 |     FILE* MAF;
388 |     char* line;
389 |     char tmp_ins[MAX_INS_LEN] ;
390 | 
391 |     char c;
392 |     int tmp, i, as_num, ins_pos, depth, row, A, C, G, T, N;
393 | 
394 |     line = (char*) save_malloc((MAX_LINE_LEN + 1) * sizeof (char));
395 |     MAF = fileOpen(fn, "r");
396 | 
397 |     maln = init_map_alignment();
398 |     maln->fpsm = (PSSMP) save_malloc(sizeof (PSSM));
399 |     maln->rpsm = (PSSMP) save_malloc(sizeof (PSSM));
400 | 
401 |     /* Check header */
402 |     fgets(line, MAX_LINE_LEN, MAF);
403 |     if (strstr(line, "/* map_alignment") == NULL) {
404 |         fprintf(stderr, "%s does not look like a map_alignment input file\n",
405 |                 fn);
406 |         exit(1);
407 |     }
408 | 
409 |     /* Parse MALN_NAS */
410 |     fgets(line, MAX_LINE_LEN, MAF);
411 |     sscanf(line, "MALN_NAS %d", &maln->num_aln_seqs);
412 | 
413 |     /* Parse MALN_SIZ; grow the AlnSeqArray of the MapAlignment until
414 |      it's at least as big as before */
415 |     fgets(line, MAX_LINE_LEN, MAF);
416 |     sscanf(line, "MALN_SIZ %d", &tmp);
417 |     while (maln->size < tmp) {
418 |         grow_alns_map_alignment(maln);
419 |     }
420 | 
421 |     /* Parse MALN_NAS */
422 |     fgets(line, MAX_LINE_LEN, MAF);
423 |     sscanf(line, "MALN_COC %d", &maln->cons_code);
424 | 
425 |     /* Parse the reference sequence header */
426 |     fgets(line, MAX_LINE_LEN, MAF);
427 |     if (strstr(line, "__REFERENCE__") == NULL) {
428 |         fprintf(stderr, "Do not see reference sequence header in %s\n", fn);
429 |         exit(1);
430 |     }
431 | 
432 |     /* Parse the reference ID */
433 |     fgets(line, MAX_LINE_LEN, MAF);
434 |     sscanf(line, "ID %s", maln->ref->id);
435 | 
436 |     /* Parse the reference DESC */
437 |     fgets(line, MAX_LINE_LEN, MAF);
438 |     sscanf(line, "DESC %s", maln->ref->desc);
439 | 
440 |     /* Parse the reference LEN and make the maln->ref->gaps
441 |      point to an array of ints this size*/
442 |     fgets(line, MAX_LINE_LEN, MAF);
443 |     sscanf(line, "LEN %d", &maln->ref->seq_len);
444 |     maln->ref->gaps = (int*) save_malloc(maln->ref->seq_len * sizeof (int));
445 | 
446 |     /* Parse the reference SIZE and make the maln->ref->seq point
447 |      to a char array this size */
448 |     fgets(line, MAX_LINE_LEN, MAF);
449 |     sscanf(line, "SIZE %d", &maln->ref->size);
450 |     maln->ref->seq = (char*) save_malloc(maln->ref->size * sizeof (char));
451 | 
452 |     /* Parse the reference SEQ and put it into maln->ref->seq */
453 |     fgets(line, MAX_LINE_LEN, MAF);
454 |     sscanf(line, "SEQ %s", maln->ref->seq);
455 | 
456 |     /* Check to make sure the LEN info we got a few lines back is correct */
457 |     if (!(strlen(maln->ref->seq) == maln->ref->seq_len)) {
458 |         fprintf(stderr, "Reported length of reference sequence %d is not observed length %d\n",
459 |                 (int) maln->ref->seq_len, (int) strlen(maln->ref->seq));
460 |         exit(1);
461 |     }
462 | 
463 |     /* Parse the reference GAPS and put them into maln->ref->gaps */
464 |     fscanf(MAF, "GAPS"); // Go past the GAPS string and start getting %d
465 |     for (i = 0; i < maln->ref->seq_len; i++) {
466 |         fscanf(MAF, " %u", &maln->ref->gaps[i]);
467 |     }
468 | 
469 |     /* Yoink-a-doink on past the \n that we never passed from the GAPS line */
470 |     c = fgetc(MAF);
471 |     while ((c != '\n') && (c != EOF)) {
472 |         c = fgetc(MAF);
473 |     }
474 | 
475 |     /* Parse the line that announces we're starting the PSSM section */
476 |     fgets(line, MAX_LINE_LEN, MAF);
477 |     if (strstr(line, "__PSSM__") == NULL) {
478 |         fprintf(stderr, "Do not see __PSSM__ line in %s\n", fn);
479 |         exit(2);
480 |     }
481 | 
482 |     /* Get/set the PSSMP depths */
483 |     fgets(line, MAX_LINE_LEN, MAF);
484 |     sscanf(line, "DEPTH %d", &depth);
485 |     maln->fpsm->depth = depth;
486 |     maln->rpsm->depth = depth;
487 | 
488 |     /* Skip past the FPSM: line */
489 |     fgets(line, MAX_LINE_LEN, MAF);
490 |     if (strstr(line, "FPSM:") == NULL) {
491 |         fprintf(stderr, "Do not see the FPSM: in %s\n", fn);
492 |         exit(2);
493 |     }
494 | 
495 |     /* Now, get depth number of substitution matrices for the maln->fpsm->sm */
496 |     for (i = 0; i <= (depth * 2); i++) {
497 |         for (row = 0; row <= 4; row++) {
498 |             fgets(line, MAX_LINE_LEN, MAF);
499 |             sscanf(line, "%d %d %d %d %d", &A, &C, &G, &T, &N);
500 |             maln->fpsm->sm[i][row][0] = A;
501 |             maln->fpsm->sm[i][row][1] = C;
502 |             maln->fpsm->sm[i][row][2] = G;
503 |             maln->fpsm->sm[i][row][3] = T;
504 |             maln->fpsm->sm[i][row][4] = N;
505 |         }
506 |         /* Skip blank line separating matrices */
507 |         fgets(line, MAX_LINE_LEN, MAF);
508 |     }
509 | 
510 |     /* Skip past the RPSM: line */
511 |     fgets(line, MAX_LINE_LEN, MAF);
512 |     if (strstr(line, "RPSM:") == NULL) {
513 |         fprintf(stderr, "Do not see the RPSM: in %s\n", fn);
514 |         exit(2);
515 |     }
516 | 
517 |     /* Now, get depth number of substitution matrices for the maln->rpsm->sm */
518 |     for (i = 0; i <= (depth * 2); i++) {
519 |         for (row = 0; row <= 4; row++) {
520 |             fgets(line, MAX_LINE_LEN, MAF);
521 |             sscanf(line, "%d %d %d %d %d", &A, &C, &G, &T, &N);
522 |             maln->rpsm->sm[i][row][0] = A;
523 |             maln->rpsm->sm[i][row][1] = C;
524 |             maln->rpsm->sm[i][row][2] = G;
525 |             maln->rpsm->sm[i][row][3] = T;
526 |             maln->rpsm->sm[i][row][4] = N;
527 |         }
528 |         /* Skip blank line separating matrices */
529 |         fgets(line, MAX_LINE_LEN, MAF);
530 |     }
531 | 
532 |     /* Parse the line that announces we're starting the aligned fragments
533 |      section of output */
534 |     fgets(line, MAX_LINE_LEN, MAF);
535 |     if (strstr(line, "__ALNSEQS__") == NULL) {
536 |         fprintf(stderr, "Do not see __ALNSEQS__ line in %s\n", fn);
537 |         exit(1);
538 |     }
539 | 
540 |     /* Go through the parsing for as many aligned fragments as we're
541 |      expecting */
542 |     for (as_num = 0; as_num < maln->num_aln_seqs; as_num++) {
543 |         as = maln->AlnSeqArray[as_num];
544 | 
545 |         /* Get ID line */
546 |         fgets(line, MAX_LINE_LEN, MAF);
547 |         sscanf(line, "ID %s\n", as->id);
548 | 
549 |         /* Get DESC line */
550 |         fgets(line, MAX_LINE_LEN, MAF);
551 |         strcpy(as->desc, &line[5]);
552 |         as->desc[strlen(as->desc) - 1] = '\0'; // get rid of \n
553 | 
554 |         /* Get SCORE line */
555 |         fgets(line, MAX_LINE_LEN, MAF);
556 |         sscanf(line, "SCORE %d\n", &as->score);
557 | 
558 | 	/* Get NUM_INPUTS line, if there */
559 | 	fgets(line, MAX_LINE_LEN, MAF);
560 | 	if ( sscanf( line, "NUM_INPUTS %d\n", &as->num_inputs ) == 1 ) {
561 | 	  fgets(line, MAX_LINE_LEN, MAF);
562 | 	}
563 | 	else {
564 | 	  as->num_inputs = 1;
565 | 	}
566 | 
567 |         /* Get START line */
568 |         sscanf(line, "START %d\n", &as->start);
569 | 
570 |         /* Get END line */
571 |         fgets(line, MAX_LINE_LEN, MAF);
572 |         sscanf(line, "END %d\n", &as->end);
573 | 
574 |         /* Get RC line */
575 |         fgets(line, MAX_LINE_LEN, MAF);
576 |         sscanf(line, "RC %d\n", &tmp) ; as->revcom = tmp;
577 | 
578 |         /* Get TR line */
579 |         fgets(line, MAX_LINE_LEN, MAF);
580 |         sscanf(line, "TR %d\n", &tmp) ; as->trimmed = tmp;
581 | 
582 |         /* Get DR line */
583 |         fgets(line, MAX_LINE_LEN, MAF);
584 |         if( 1 == sscanf(line, "DR %d\n", &tmp) ) {
585 |             as->dropped = tmp;
586 |             fgets(line, MAX_LINE_LEN, MAF);
587 |         }
588 | 
589 |         /* Get SEG line */
590 |         sscanf(line, "SEG %c\n", &as->segment);
591 | 
592 |         /* Get SEQ line */
593 |         fgets(line, MAX_LINE_LEN, MAF);
594 |         sscanf(line, "SEQ %s\n", as->seq);
595 | 
596 |         /* Get SMP line */
597 |         fgets(line, MAX_LINE_LEN, MAF);
598 |         sscanf(line, "SMP %s\n", as->smp);
599 | 
600 |         /* Get INS line */
601 |         fscanf(MAF, "INS_POS");
602 |         while (fscanf(MAF, " %d %s", &ins_pos, tmp_ins) == 2) {
603 |             as->ins[ins_pos] = (char*) save_malloc(MAX_INS_LEN * sizeof (char));
604 |             strcpy(as->ins[ins_pos], tmp_ins);
605 |         }
606 | 
607 |     }
608 |     fclose(MAF);
609 |     free(line);
610 |     return maln;
611 | }
612 | 
613 | int count_aln_seqs(MapAlignmentP maln) {
614 |     int i;
615 |     int tot_aln_seqs = 0;
616 | 
617 |     /* Count each one that is segment a, n, or f */
618 |     for (i = 0; i < maln->num_aln_seqs; i++) {
619 |         if (maln->AlnSeqArray[i]->segment != 'b') {
620 |             tot_aln_seqs++;
621 |         }
622 |     }
623 |     return tot_aln_seqs;
624 | }
625 | 
626 | /* Sorts the AlnSeqArray by alnSeqCmp (its start and end
627 |  coordinates.
628 |  Note that after this operation, any FragSeqDB pointing
629 |  to this AlnSeqArray will be wrong! */
630 | void sort_aln_frags(MapAlignmentP maln) {
631 |     qsort((void*) maln->AlnSeqArray, (size_t) maln->num_aln_seqs,
632 |             sizeof (AlnSeqP), alnSeqCmp);
633 | }
634 | 
635 | void print_assembly_summary(MapAlignmentP maln) {
636 |     int i;
637 |     int total_frag_len = 0;
638 | 
639 |     for (i = 0; i < maln->num_aln_seqs; i++) {
640 |         total_frag_len += (maln->AlnSeqArray[i]->end
641 |                 - maln->AlnSeqArray[i]->start + 1);
642 |     }
643 | 
644 |     printf("# Map reference ID: %s\n", maln->ref->id);
645 |     printf("# Map reference length: %d\n", maln->ref->seq_len);
646 |     printf("# Number of fragments aligned to reference: %d\n",
647 |             count_aln_seqs(maln));
648 |     //	  maln->num_aln_seqs );
649 |     printf("# Total length of aligned fragments: %d\n", total_frag_len);
650 |     printf("# Average coverage: %0.3f\n", ((double) total_frag_len
651 |             / (double) maln->ref->seq_len));
652 | 
653 | }
654 | 
655 | 
656 | // Return the absolute number of gaps upstream of this position
657 | 
658 | int sum_of_gaps(MapAlignmentP maln, int pos) {
659 |     int i, gaps;
660 |     gaps = 0;
661 |     for (i = 0; (i < pos); i++)
662 |         gaps += maln->ref->gaps[i];
663 |     return gaps;
664 | }
665 | 
666 | /* Grow the space for a MapAlignment to twice its current
667 |  size. Actually, just grow the array of aligned sequences.
668 |  Copy the current aligned sequences into the new array
669 |  free the now unused memory.
670 |  Return 1 if success
671 |  0 if failure
672 |  */
673 | int grow_alns_map_alignment(MapAlignmentP aln) {
674 |     int i, j, k;
675 |     int new_size;
676 |     AlnSeqP as;
677 |     AlnSeqP* NewAlnSeqArray;
678 | 
679 |     new_size = (aln->size) * 2;
680 | 
681 |     // Now, allocate the new array of pointers to the aligned seqs
682 |     NewAlnSeqArray = (AlnSeqP*) save_malloc(new_size * sizeof (AlnSeqP));
683 |     if (NewAlnSeqArray == NULL) {
684 |         fprintf(stderr, "Out of memory, sucka!\n");
685 |         return 0;
686 |     }
687 | 
688 |     // Now, point the pointers to the pointees
689 |     // First, the old pointers/pointees
690 |     for (i = 0; i < aln->size; i++) {
691 |         NewAlnSeqArray[i] = aln->AlnSeqArray[i];
692 |     }
693 |     // Now, the new pointers/pointees
694 |     k = 0;
695 |     for (i = aln->size; i < new_size; i++) {
696 |         /* Just in case there's some cruffy leftovers in our
697 |          clean new memories */
698 |         NewAlnSeqArray[i] = (AlnSeqP) save_malloc(sizeof (AlnSeq));
699 |         /* Zero them out */
700 |         as = NewAlnSeqArray[i];
701 |         for (j = 0; j <= MAX_ID_LEN; j++) {
702 |             as->id[j] = '\0';
703 |         }
704 |         for (j = 0; j <= MAX_DESC_LEN; j++) {
705 |             as->desc[j] = '\0';
706 |         }
707 |         for (j = 0; j <= INIT_ALN_SEQ_LEN; j++) {
708 |             as->seq[j] = '\0';
709 |         }
710 |         /* Set all their char* ins to NULL */
711 |         for (j = 0; j <= INIT_ALN_SEQ_LEN; j++) {
712 |             as->ins[j] = NULL;
713 |         }
714 |         as->start = 0;
715 |         as->end = 0;
716 |         as->revcom = 0;
717 |         as->trimmed = 0;
718 |         as->dropped = 0;
719 |         as->score = 0;
720 |         as->segment = 'n';
721 |     }
722 | 
723 |     // Now, the old aln->AlnSeqArray can be freed like a bird
724 |     free(aln->AlnSeqArray);
725 | 
726 |     // And put in it's place the newer, bigger NewAlnSeqArray
727 |     aln->AlnSeqArray = NewAlnSeqArray;
728 |     aln->size = new_size;
729 |     return 1;
730 | }
731 | 


--------------------------------------------------------------------------------
/src/map_alignment.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * File:   map_alignment.h
 3 |  * Author: TCO
 4 |  *
 5 |  * Created on 26. Januar 2009, 12:16
 6 |  */
 7 | 
 8 | #ifndef _MAP_ALIGNMENT_H
 9 | #define	_MAP_ALIGNMENT_H
10 | 
11 | #include "types.h"
12 | #include "io.h"
13 | #include "config.h"
14 | 
15 | #ifdef	__cplusplus
16 | extern "C" {
17 | #endif
18 | 
19 | 
20 | 
21 |     /* Initialize a MapAlignment object and return a pointer to it */
22 |     MapAlignmentP init_map_alignment(void);
23 | 
24 | 
25 | 
26 |     /* free_map_alignment
27 |      Takes a MapAlignmentP (maln)
28 |      Frees the memory pointed to by its components
29 |      Returns nothing
30 |      */
31 |     void free_map_alignment(MapAlignmentP maln);
32 | 
33 | 
34 |     /* Write out the data in a MapAlignment data structure
35 |      to a file
36 |      */
37 |     int write_ma(char* fn, MapAlignmentP maln);
38 | 
39 |     MapAlignmentP read_ma(const char* fn);
40 | 
41 |     /* Grow the space for a MapAlignment to twice its current
42 |  size. Actually, just grow the array of aligned sequences.
43 |  Copy the current aligned sequences into the new array
44 |  free the now unused memory.
45 |  Return 1 if success
46 |  0 if failure
47 |      */
48 |     int grow_alns_map_alignment(MapAlignmentP aln);
49 | 
50 |     // Return the absolute number of gaps upstream of this position
51 |     int sum_of_gaps(MapAlignmentP maln, int pos);
52 | 
53 | 
54 |     MapAlignmentP init_map_alignment(void);
55 |     int count_aln_seqs(MapAlignmentP maln);
56 |     void sort_aln_frags(MapAlignmentP maln);
57 | 
58 | 
59 | 
60 | 
61 |     void show_consensus(MapAlignmentP maln, int out_format);
62 | 
63 | 
64 | 
65 | 
66 |     //some more convenient functions
67 |     int get_consensus_length(MapAlignmentP maln);
68 |     char* get_consensus(MapAlignmentP maln);
69 | 
70 | 
71 |     void print_assembly_summary(MapAlignmentP maln);
72 | 
73 |     /* Sorts the AlnSeqArray by alnSeqCmp (its start and end
74 |      coordinates.
75 |      Note that after this operation, any FragSeqDB pointing
76 |      to this AlnSeqArray will be wrong! */
77 |     void sort_aln_frags(MapAlignmentP maln);
78 | 
79 | 
80 | #ifdef	__cplusplus
81 | }
82 | #endif
83 | 
84 | #endif	/* _MAP_ALIGNMENT_H */
85 | 
86 | 


--------------------------------------------------------------------------------
/src/map_assembler.c:
--------------------------------------------------------------------------------
  1 | /* $Id: map_assembler.c 1051 2008-02-11 15:38:53Z green $ */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "map_align.h"
#include "map_alignment.h"
#include "io.h"
  7 | 
  8 | /*
  9 |     void * save_malloc(size_t size){
 10 |         void * tmp = malloc(size);
 11 |         if (tmp == NULL)
 12 |             fprintf( stderr, "Some memory allokation failed. Exiting");
 13 | 
 14 |         return tmp;
 15 |     }
 16 | */
 17 | 
 18 | void help( void ) {
 19 |   printf( "ma -M <maln input file>\n" );
 20 |   printf( "   -c <consensus code> \n" );
 21 |   printf( "   -f <output format>\n" );
 22 |   printf( "   -R <REGION_START:REGION_END>\n" );
 23 |   printf( "   -I <ID to assign to assembly sequence>\n" );
 24 |   printf( "ma reports information from a maln assembly file as generated by mia\n" );
 25 |   printf( "How the assembly calls each base can be determined by the\n" );
 26 |   printf( "consensus code. 1 = highest, positive aggregate score base (if any)\n" );
 27 |   printf( "                2 = highest aggregate score base if it is %d higher\n",
 28 | 	  MIN_SC_DIFF_CONS );
 29 |   printf( "                    than second highest\n" );
 30 |   printf( "The output format can be specified through -f as one of the following.\n" );
 31 |   printf( "More complete descriptions of these output formats is below,\n" );
 32 |   printf( "under FORMATS\n" );
 33 |   printf( "1 => clustalw\n" );
 34 |   printf( "2 => line format; one line each for consensus, reference\n" );
 35 |   printf( "     and coverage\n" );
 36 |   printf( "3 => column format; one line per base, one column for consensus,\n" );
 37 |   printf( "     reference, and coverage; includes header with summary info\n" );
 38 |   printf( "4 => columns description of all assembly data for positions that differ\n" );
 39 |   printf( "     between consensus and CURRENT reference sequence (see FORMATS, below)\n" );
 40 |   printf( "41 => same as above, but for ALL positions\n" );
 41 |   printf( "5 => fasta format output of assembled sequence only\n" );
 42 |   printf( "6 => show all fragments in a region specified by -R\n" );
 43 |   printf( " -C Color format 6 output -> don't pipe this output to file!\n" );
 44 |   printf( "7 => ACE\n\n" ); 
 45 |   printf( "\nFORMATS (option f):\n" );
 46 |   printf( "1 => clustalw\n" );
 47 |   printf( "2 => line format; first line is \"Consensus, chrM, coverage:\"\n" );
 48 |   printf( "      second line is the entire, assembled, aligned consensus sequence\n" );
 49 |   printf( "      third line is the entire aligned reference sequence to which the\n" );
 50 |   printf( "      consensus is aligned\n" );
 51 |   printf( "      fourth line is the sequence coverage at each position in a space-\n" );
 52 |   printf( "      separated list of integers\n" );
 53 |   printf( "3 => column format; header shows summary statistics; table has one row\n" );
 54 |   printf( "      per position; columns are described in the output\n" );
 55 |   printf( "4 => alternative column format with one row per base that differes between\n" );
 56 |   printf( "      the consensus assembly and the reference of this iteration. \n" );
 57 |   printf( "      Note that in the FINAL iteration reference and consensus are equal! \n" );
 58 |   printf( "      So there won't be any output. Each row has the following\n" );
 59 |   printf( "      columns: (1)position on reference; 0-based coordinates, (2) reference\n" );
 60 |   printf( "      base, (3)consensus assembly base, (4)coverage, (5)A's, (6)C's, (7)G's,\n" );
 61 |   printf( "      (8)T's, (9)gaps; columns 5 through 9 should add up to column 4\n" );
 62 |   printf( "      (10) aggregate score for A, (11) aggregate score for C\n" );
 63 |   printf( "      (12) aggregate score for G, (13) aggregate score for T\n" );
 64 |   printf( "41=> same as above, but for every position\n" );
 65 |   printf( "5 => fasta format using ID \"Consensus\" for the assembly\n" );
 66 |   printf( "6 => region; shows the reference sequence, the consensus sequence, and then\n" );
 67 |   printf( "      all assembled fragments in a region specified by option -R\n" );
 68 |   printf( "61=> same as above, but in multi-fasta format for viewing in Bioedit, e.g.\n" );
 69 |   printf( "     (also requires a region as specified by the option -R\n" );
 70 |   printf( "7 => ACE format\n" ); 
 71 | }
 72 | 
 73 | void parse_region( char* reg_str, int* reg_start, int* reg_end ) {
 74 |   int tmp;
 75 |   sscanf( reg_str, "%d:%d", reg_start, reg_end );
 76 |   /* Flip em around if l-user is so stupid that he puts the bigger one first */
 77 |   if ( *reg_start > *reg_end ) {
 78 |     tmp = *reg_start;
 79 |     *reg_start = *reg_end;
 80 |     *reg_end = *reg_start;
 81 |   }
 82 | }
 83 | 
 84 | int main( int argc, char* argv[] ) {
 85 |   char mafn[MAX_FN_LEN+1];
 86 |   char ma_in_fn[MAX_FN_LEN+1];
 87 |   char assign_id[MAX_ID_LEN+1];
 88 |   unsigned int any_arg;
 89 |   int id_assigned = 0; // Boolean, set to true if -I is given
 90 |   int cons_scheme;
 91 |   int out_ma   = 0;
 92 |   int in_ma    = 0;
 93 |   int no_dups  = 0; // allow duplicate ids by default - the user knows what he's doing
 94 |   int out_format = 1;
 95 |   int reg_start  = 90;
 96 |   int reg_end    = 109;
 97 |   int in_color   = 0;  // Output f6 format colored -> bad when you want to pipe it into a file
 98 |   MapAlignmentP maln;
 99 |   IDsListP rest_ids_list, // the IDs in the -i argument, if any, will go here
100 |     used_ids_list;        // the IDs seen thusfar; just for this list,
101 |                           // the segment character is tacked onto the end
102 |   int ich, cons_scheme_def, ids_rest;
103 |   double score_int, score_slo;
104 |   extern char* optarg;
105 |   cons_scheme_def = 1;
106 |   ids_rest = 0; // Boolean set to no => no IDs restriction set (yet)
107 |   /* Get input options */
108 |   any_arg = 0;
109 |   cons_scheme = cons_scheme_def;
110 |   score_int = -1.0; // Set the score intercept to -1 => not specified (yet)
111 |   score_slo = -1.0; // Set the score intercept to -1 => not specified (yet)
112 |   while( (ich=getopt( argc, argv, "I:c:i:f:R:s:m:M:Cb:s:d" )) != -1 ) {
113 |     switch(ich) {
114 |     case 'h' :
115 |       help();
116 |       exit( 0 );
117 |       any_arg = 1;
118 |       break;
119 |     case 'I' :
120 |       strcpy( assign_id, optarg );
121 |       id_assigned = 1;
122 |       break;
123 |     case 'c' :
124 |       cons_scheme = atoi( optarg );
125 |       any_arg = 1;
126 |       break;
127 |     case 'i' :
128 |       rest_ids_list = parse_ids( optarg );
129 |       ids_rest = 1;
130 |       any_arg = 1;
131 |       break;
132 |     case 'f' :
133 |       out_format = atoi( optarg );
134 |       any_arg = 1;
135 |       break;
136 |     case 'R' :
137 |       parse_region( optarg, &reg_start, &reg_end );
138 |       any_arg = 1;
139 |       break;
140 |     case 's' :
141 |       score_slo = atof( optarg );
142 |       any_arg = 1;
143 |       break;
144 |     case 'b' :
145 |       score_int = atof( optarg );
146 |       any_arg = 1;
147 |       break;
148 |     case 'C' : 
149 |       in_color = 1;
150 |       break;
151 |     case 'm' :
152 |       strcpy( mafn, optarg );
153 |       out_ma  = 1;
154 |       any_arg = 1;
155 |       break;
156 |     case 'M' :
157 |       strcpy( ma_in_fn, optarg );
158 |       in_ma   = 1;
159 |       any_arg = 1;
160 |       break;
161 |     case 'd' :
162 |       no_dups = 1;
163 |       used_ids_list = init_ids_list();
164 |       any_arg = 1;
165 |       break;
166 |     default :
167 |       help();
168 |       any_arg = 1;
169 |       exit( 0 );
170 |     }
171 |   }
172 |   if ( !any_arg || 
173 |        ( (score_slo == -1) && (score_int != -1) ) ||
174 |        ( (score_slo != -1) && (score_int == -1) ) ) {
175 |     help();
176 |     exit( 0 );
177 |   }
178 | 
179 |   /* Initialize maln, either from specified input file or 
180 |      brand new */
181 |   if ( in_ma ) {
182 |     maln = read_ma( ma_in_fn );
183 |   }
184 | 
185 |   else {
186 |     help();
187 |     exit( 0 );
188 |   }
189 | 
190 |   /* Set the maln->cons_code to something reasonable */
191 |   maln->cons_code = cons_scheme;
192 | 
193 |   /* Now input from all sources has been dealt with, we turn our 
194 |      attention to output...*/
195 |   sort_aln_frags( maln );
196 | 
197 |   /* If an ID to be assigned to the assembly was given, then assign it now */
198 |   if ( id_assigned ) {
199 |     strcpy( maln->ref->id, assign_id );
200 |   }
201 | 
202 |   if ( (out_format == 6) ||
203 |        (out_format == 61) ) {
204 |     print_region( maln, reg_start, reg_end, out_format, in_color );
205 |   }
206 |   else {
207 |     show_consensus( maln, out_format );
208 |   }
209 |   
210 |   if (out_format == 7){
211 | 	  ace_output(maln);
212 |   }
213 | 
214 |   /* Write MapAlignment output to a file */
215 |   if ( out_ma ) {
216 |     write_ma( mafn, maln );
217 |   }
218 | 
219 |   exit( 0 );
220 | }
221 | 
222 | 
223 | 


--------------------------------------------------------------------------------
/src/mia.h:
--------------------------------------------------------------------------------
#ifndef INCLUDED_mia_H
#define INCLUDED_mia_H

/* Declarations shared by the mia program: culling of map alignments,
   dynamic-programming alignment helpers, adapter trimming and FSDB
   maintenance. The functions are defined outside this header. */

#include "map_align.h"
#include "io.h"
#include "map_alignment.h"
#include "fsdb.h"
#include "pssm.h"
#include "kmer.h"
#include "assert.h"
#include "params.h"


/* Without a generated config.h, fall back to built-in package metadata */
#ifdef HAVE_CONFIG_H
#include "config.h"
#else
#define PACKAGE_BUGREPORT "green@eva.mpg.de"
#define PACKAGE_NAME "MIA"
#define PACKAGE_VERSION "1.0"
#endif







/* init_culled_map_alignment
   Args: MapAlignmentP src_maln - source maln to get size from
   Returns: pointer to a fresh MapAlignment for holding the
   culled results from the source maln. This culled guy
   gets the same ref and an AlnSeqArray big enough for the
   results, but no actual memory is malloced for new AlnSeq's
   Instead, we'll just point the source maln guys to this
   one if they are unique, which is determined elsewhere
*/
MapAlignmentP init_culled_map_alignment( MapAlignmentP src_maln ) ;

/* find_alignable_len
   Args: (1) FragSeqP fs - with valid info in as, ae and seq_len fields
         (2) RefSeqP ref - with valid info in the sequence
   Returns: int with the alignable sequence length of this sequence
   in this FragSeqP. That is defined as the length of this sequence minus
   any part that overlaps positions that are "N" in the RefSeq. This
   number is not allowed to be less than MIN_ALIGNABLE_LEN to avoid
   having sequence with very little or no alignable sequence.
*/
int find_alignable_len( FragSeqP fs, RefSeqP ref );

/* NOTE(review): declared inline here, but no definition is provided in
   this header - confirm that every translation unit calling it can see
   a definition. Presumably returns the best-scoring base of qss at
   position i; verify against the definition. */
inline char best_base_at_pos( QSSP qss, size_t i );

/* cull_maln_from_fsdb
   Args: (1) MapAlignmentP culled_maln - maln with enough room to put the
	     unique AlnSeq's
	 (2) FSDB fsdb - has valid data in front_asp, back_asp, and
	     unique_best fields
   Returns: void
   Goes through each FragSeq pointed to by fsdb->fss. For all guys that
   are unique_best and score >= SCORE_CUTOFF, copies front_asp and
   back_asp into culled_maln->AlnSeqArray.
   NOTE(review): the original description breaks off at this point, and
   the Hard_cut, SCORE_CUT_SET, s and n parameters are not described
   here - see the definition.
*/
void cull_maln_from_fsdb( MapAlignmentP culled_maln,
			  FSDB fsdb, int Hard_cut,
			  int SCORE_CUT_SET, double s, double n ) ;


/* consensus_assembly_string
   Takes a maln object
   Generates the consensus sequence string using the aligned data
   within the maln according to the maln->cons_code and puts it
   in char* cons
   Returns char* pointer to consensus string
*/
char* consensus_assembly_string ( MapAlignmentP maln ) ;


/* Takes a pointer to an Alignment that has valid values
   in its dynamic programming matrix and valid values
   for a->aer and a->aec (ending row and column).
   Tracks back to the beginning of the alignment and
   fills in a->abr and a->abc (presumably the beginning
   row and column - confirm against the definition)
   Returns nothing
*/
void find_align_begin( AlignmentP a ) ;



/* NOTE(review): undocumented in the original; judging by the name it
   upper-cases the sequence held in ref - confirm against the definition */
void make_ref_upper( RefSeqP ref ) ;

/* Takes a pointer to a RefSeq that has a valid
   sequence in it. Adds INIT_ALN_SEQ_LEN sequence
   from the beginning to the end so that any
   sequence fragment aligned to it will have a
   valid chance to align, despite the circularity
   of the sequence. */
void add_ref_wrap( RefSeqP ref );

/* init_dpm
   Args: (1) size1 - the number of rows (fragment sequence)
         (2) size2 - the number of columns (reference sequence)
   Returns: DPMP => pointer to a dynamic programming matrix
   with memory properly allocated
*/
DPMP init_dpm( int size1, int size2 ) ;

/* NOTE(review): undocumented in the original; presumably releases a
   matrix obtained from init_dpm - confirm */
void free_dpm( DPMP m ) ;


/* Takes a pointer to an Alignment
   that has valid sequence, length, submat, and sg data
   Does dynamic programming, filling in values in the
   a->m dynamic programming matrix
   Returns nothing */
void dyn_prog( AlignmentP a ) ;

/* size1 is length of fragment
   size2 is length of reference (wrapped if necessary) + INIT_ALN_SEQ_LEN
   rc is boolean to say if it's reverse complement
   hp_special is boolean to say if homopolymer special gap costs are to be used
*/
AlignmentP init_alignment( int size1, int size2,
			   int rc, int hp_special ) ;

/* NOTE(review): undocumented in the original; presumably releases an
   Alignment obtained from init_alignment - confirm */
void free_alignment( AlignmentP al ) ;

/* pop_s1c_in_a
   Args: (1) AlignmentP a - has a->seq1 and a->len1 set to
   valid values
   Returns: void
   Populates the a->s1c array with code for quick lookup
   in submat
*/
void pop_s1c_in_a ( AlignmentP a ) ;

/* hp_discount_penalty
   Args: (1) int gap_len - gap length
         (2) int hplen1 - homopolymer length in column
	     (usu. longer) sequence
         (3) int hplen2 - homopolymer length in row
	     (usu. shorter) sequence
   Returns: int
   Calculates the discounted gap penalty for a gap of the input
   size that would be necessary to gap out the remaining sequence
   in the homopolymers of size indicated.
   The undiscounted (normal) gap penalty =  GOP + (GEP * length)
*/
int hp_discount_penalty ( int gap_len, int hplen1, int hplen2 ) ;

/* pop_hpl_and_hps
   Args: (1) char* seq - sequence
         (2) int len - length of sequence
	 (3) int* hpl - array of ints to be filled with hp lengths
         (4) int* hps - array of ints to be filled with hp starts
   Returns: void
   Goes through the input seq and puts in hpl the length of the
   homopolymer at each corresponding position in the sequence and
   in hps the start position of the current homopolymer. For example,
   seq=ACCGTGGTAC, len=10
   hpl=1221122111
   hps=0113455789
*/
void pop_hpl_and_hps ( const char* seq, int len, int* hpl, int* hps ) ;

/* pop_s2c_in_a
   Args: (1) AlignmentP a - has a->seq2 and a->len2 set to
   valid values
   Returns: void
   Populates the a->s2c array with code for quick lookup
   in submat
*/
void pop_s2c_in_a ( AlignmentP a ) ;


/* Input is a pointer to a valid Alignment. The value
   in a->len1 must be valid.
   Searches the last row (a->len1 -1) along all columns
   to find the best score that aligns all of the a->seq2
   Sets a->aec and a->aer to the correct values and sets
   a->best_score to the best score.
   Returns the best score */
int max_sg_score ( AlignmentP a ) ;

/* trim_frag
   Args: (1) FragSeqP frag_seq pointer to a FragSeq
         (2) char* adapter pointer to a string of the adapter
	     that may need to be trimmed
	 (3) AlignmentP align pointer to an Alignment big
	     enough for aligning the adapter to this FragSeq
   Returns: void
   Does a semi/semi global alignment of the adapter to
   the FragSeq. If the score of this alignment is good enough
   as defined by TRIM_SCORE_CUT or if there is a perfect
   match of any number of bases >=1 at the end, then those
   are "trimmed" by setting the frag_seq->trimmed flag to
   true and the frag_seq->trim_point to the correct value
*/
void trim_frag (FragSeqP frag_seq, char* adapter,
		AlignmentP align) ;

/* Takes pointers to two PWAlnFrag's
   The first one (front_pwaln) is populated by an alignment that
   crosses the wrap_point.
   Moves all of the alignment that is behind the wrap point into
   the back_pwaln and copies over all the other info
   Sets the correct segment flag for front and back */
void split_pwaln (PWAlnFragP front_pwaln, PWAlnFragP back_pwaln,
		  int wrap_point ) ;

/* NOTE(review): undocumented in the original; the meaning of the int
   return value is not visible from this header - see the definition */
int populate_pwaln_to_begin( AlignmentP a, PWAlnFragP pwaln ) ;


/* NOTE(review): undocumented in the original; presumably aligns fs
   against the reference of maln using the forward (fw_a) and
   reverse-complement (rc_a) Alignments and reports through front_pwaln
   and back_pwaln - confirm against the definition, including the
   meaning of the int return value */
int sg_align ( MapAlignmentP maln, FragSeqP fs, FSDB fsdb,
	       AlignmentP fw_a, AlignmentP rc_a,
	       PWAlnFragP front_pwaln,
	       PWAlnFragP back_pwaln) ;


/* NOTE(review): both undocumented in the original; the trailing
   parameters of collapse_FSDB mirror those of cull_maln_from_fsdb -
   see the definitions for their meaning */
void clean_FSDB( FSDB fsdb ) ;
void collapse_FSDB( FSDB fsdb, int Hard_cut, 
		    int SCORE_CUT_SET, double s, double n ) ;

#endif
223 | 


--------------------------------------------------------------------------------
/src/mt311.c:
--------------------------------------------------------------------------------
  1 | /* internal MT311 reference for ccheck
  2 |  *
  3 |  * This is the 100% consensus of 311 mitochondria, with ambiguity codes
  4 |  * and small letters for optional gaps.  This makes the -r option truly
  5 |  * optional in the common case without having an additional file
  6 |  * lingering round. */
  7 | 
  8 | char mt311_sequence[] =
  9 | 	"GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTcCCATGCATTTGGHATTY"
 10 | 	"TYDYYTDGGGRGYRTGCACGCGATAGCATYGCRRRMCGCTGGARCCGGAGCACCYTATGT"
 11 | 	"CGCAGYAYYTGYCTTTGATTCCTRCCYCATYYYRTTATTTATCGCACCTACGTYCAATAT"
 12 | 	"YAYRGDHGAVCATAYHTMYYRAARYGTRYTRRYTARTYAATGCTTRTDDGACATRRYAAY"
 13 | 	"AACAATTRarYGYCTGCACAGCCRCTTTCCAtYACAGACAYCAYAAYaaAAAatRTTTYC"
 14 | 	"RCCAAAcccCCCYyyyyCCCCCRVTTYTGGCYACAGCACTTaARCRYATCTCTGCCAAAC"
 15 | 	"CCCRAAAACAAAGAACCCTRRCACCAGCCTARCCAGATTTCAAATTKTATCTTTWGGCGG"
 16 | 	"TATGYACTTTTAACAGTCACCCCYCAACTAACACATTAtTYTTYCYCTCYCRYYYCCAYA"
 17 | 	"CYACTAAYCYCATCAAYACARcCCyCRCCCATCCTRYCCARcacacacacrcacacaCAC"
 18 | 	"acaCGCTGCTAACCCYAYACCCCRARYCAACCAAACCCCAAAGACAccccccCccyCCrC"
 19 | 	"AGTTTATGTAGCTTAMCYcCCTYAAAGCAATACACTGAAAATGTTTAGACGGGCYCACAT"
 20 | 	"CACCCCATARACAAATAGGYTYGGTCCTRGCCTTTCTATTAGCYCYTAGTAAGATTACAC"
 21 | 	"ATGCAAGCATCCCCRYTCCAGTGAGTYYRCCCTCTAAATCAYCAYGAYCAAAAGGRAYAA"
 22 | 	"GCATCAAGCACGCARCAATGCAGCTCAAAACGCTTAGCYYAGCCACACCCCCACGGGARA"
 23 | 	"CAGCAGTGATWARCCTTTAGCAATAAACGAAAGTTYAACTAAGCTATACTAAYCCCAGGG"
 24 | 	"TTGGTCAATTTCGTGCCAGCCACCGCGGTCAYACGATTAACCCAAGYCAATAGAARCCGG"
 25 | 	"CGTAAAGRGTGTTTTAGATCAcccccCCCCBcCCCAATAAAGCTAAARCYCACCTGAGTT"
 26 | 	"GTAAAAAACTCCAGYTRACACAAAATARACTACGAAAGTGGCTTTAACATRTCTGAAYAC"
 27 | 	"ACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGCCCTAAACCYCAA"
 28 | 	"CAGTTAAAYCAACAAAACTGCTCGCCAGAACACTACGAGCCACAGCTTAAAACTCAAAGG"
 29 | 	"ACCTGGCGGTGCTTYATAYCCCYCTAGAGGAGCCTGTTCTGTAATCGATAAACCCCGATC"
 30 | 	"AACCTCACCACCYCTTGCTCAGCCTATATACCGCCATCTTCAGCAAACCCTGATGAAGGY"
 31 | 	"TACAAAGTAAGCGCAAGTACCCACGTAAAGACGTTAGGTCAAGGTGTAGCCCATGAGGTG"
 32 | 	"GCAAGAAATGGGCTACATTTTCTAYCCCAGAMAACTACGAYARCCCtTTATGAAACYTAA"
 33 | 	"GGGTCRAAGGYGGATTTAGCAGTAAACTRARARTAGAGTGCTTAGTTGAACARGGCCCTG"
 34 | 	"AAGCGCGTACACACCGCCCGTCACCYTCYTCAARTATAYTTCAAAGGACATTTARCTAAA"
 35 | 	"ACCCCTRCGMAYTTATATAGAGGAGRCAAGTCGTAACATGGTAAGTGTACTGGAAAGTGC"
 36 | 	"ACTTGGACRAACCAGAGTGTAGCTTAACAYAAAGCACCYAACTTACACTTAGGAGATTTC"
 37 | 	"AACTTAACTTGACCRCTCTGAGCTAAACCTAGCCCCAAACCCDCTCCACCYTAYTAYCAR"
 38 | 	"ACAACYTTaARCYAAACCATTTACYCARAYAAAGTATAGGCGATAGAAATTGAAACCTGG"
 39 | 	"CGCAATAGATATAGTACCGCAAGGGAAAGATGAAAAATTAYARCCAAGCATAATATAGCA"
 40 | 	"AGGACYAACCCCTRTACCTTCYGCATAATGAATTAACTAGAAAYAACTTTGCAAGGAGAR"
 41 | 	"CCAAAGCTAAGACCCCCGAAACCAGACGAGCTACCTAARAACAGCTAAAAGAGCACACCC"
 42 | 	"GTCTATGTAGCAAAATAGTGGGAAGATTTATAGGTAGAGGCGACAAACCTAYCGAGCCTG"
 43 | 	"GYGATAGCTGGTTGTCCAAGATAGAATCTTAGTTCARCTTTAAATTTRCCCRCAGAACCC"
 44 | 	"TCTAAATCCCCTTGYAAATTTAAYTGTTAGTCCAAAGAGGRACAGCTCTTTGGACACTAG"
 45 | 	"GAAAAARCCTTGTAGAGAGAGTaAAAAAATTTAACACCCATAGTAGGCCTAAAAGCAGCC"
 46 | 	"ACCAATTAAGAAAGCGTTCAAGCTCAACACYCACYACMTaAAAAAATCCCAAACATATVA"
 47 | 	"CTGAACTCCTCACACCMAATTGGACCAATCTATYACYCTATAGAAGARCTAATGTTAGTA"
 48 | 	"TRAGTAACRTGAAAACATTCTCCTCYGCATAAGCCTGCGTCAGATYAAAACRYTGAACTG"
 49 | 	"ACAATTAACAGCCYAATATCYACAATCaACYAACAAGYCATTATTACCCYCACTGTCAAC"
 50 | 	"CCAACACAGGCATGCTCATAAGGAAAGGTTAAAAAAAGTAAAAGGAACTCGGCAAAYCTT"
 51 | 	"ACCCCGCCTGTTTACCAAAAACATCACCTCTAGCATCACCAGTATTAGAGGCACCGCCTG"
 52 | 	"CCCAGTGACACATGTTTAACGGCCGCGGTACCCTRACCGTGCAAAGGTAGCATAATCACT"
 53 | 	"TGTTCCTTAAATAGGGACCTGTATGAATGGCTYCACGAGGGTTYAGCTGTCTCTTACTTT"
 54 | 	"TAACCAGTGAAATTGACCTGCCCGTGAAGAGGCGGGCATDACACAGCAAGACGAGAAGAC"
 55 | 	"CCTATGGAGCTTTAATTTATTAATGCAARCARTACCTAACARACCYACAGGTCCTAAACT"
 56 | 	"ACYAARCCTGCATTAAAAATTTCGGTTGGGGCGACCTCGGAGCAGAAYYMAACCTCCGAG"
 57 | 	"CARTACATGCTAAGACYTCACCAGTCAAAGCGAACTACYATACTCRATTGATCCAATAAC"
 58 | 	"TTGACCAACGGAACAAGTTACCCTAGGGATAACAGCGCAATCCTRTTCTAGAGTCCATAT"
 59 | 	"CAACAATAGGGTTTACGACCTCGATGTTGGATCAGGACATCCCRATGGTGCAGCCGCTAT"
 60 | 	"YAAAGGTTCGTTTGTTCAACGATTAAAGTCCTACGTGATCTGAGTTCAGACCGGAGYAAT"
 61 | 	"CCAGGTCGGTTTCTATCTAYTTCAAATTYCTCCCTGTACGAAAGGACAAGAGAAATAAGG"
 62 | 	"CCTACTTCACAtAAGCGCCTTCCCCCGTAAATGATATCATCTCAACTTAGYATWAYRYCY"
 63 | 	"ACACCYACCCARGARCAGGGTTTGTTAAGATGGCAGAGCCCGGTAATCGCATAAAACTTA"
 64 | 	"AAACTTTACAGYCAGAGGTTCAAYTCCTCTTCTTAACAAYAYAYCCATGRCCAACCTCCT"
 65 | 	"ACTCCTCATTGYACCCATTCTRATMGCAATGGCATTCCTAATGCTYACCGAACRRAAAAT"
 66 | 	"TCTAGSMYRTRYACAACTACGCAAAGGCCCCAACGTTGTAGGCCCCTACGGRCTACTACA"
 67 | 	"ACCYTTCGCTGACGCCATAAAACTCTTCACCAARGAGCCCCTAAARCCCGCCACATCTRC"
 68 | 	"YATCACYCTMTACATCACCGCCCCRACCYTRGCYCTCACYRTYGCHCTTCTACTATGAAC"
 69 | 	"CCCCCTCCCCATACCCAACCCCCTRGTYAACCTCAACCTAGGCCTCCTATTTATTCTAGC"
 70 | 	"CACCTCTAGCCTAGCCGYYTACTCRATCCTCTGATCAGGRTGAGCRTCAAACTCAAACTA"
 71 | 	"YGCCCTRATCGGYGCRCTRCGAGCAGTAGCCCARACAATCTCATATGAARTYACYCTAGC"
 72 | 	"CATCATTCTRCTATCAACATTACTAATAAGTGGCTCCTTTAACCTCTCCDCCCTTATCAC"
 73 | 	"ARCACAAGARYACCTCTGAYTACTCCTRCCATCATGRCCCYTRGCYATAATATGATTTAY"
 74 | 	"CTCCACACTAGCAGARACCAACCGAACCCCCTTCGACCTTGCCGAAGGRGARTCMGAACT"
 75 | 	"RGTCTCAGGCTTCAACATCGAATACGCCGCAGGCCCCTTCGCYYTATTYTTCATRGCCGA"
 76 | 	"RTACACAAACATYATTATAATAAACACCYTYACCACKAYAATCTTCCTAGGAAYAACRTA"
 77 | 	"TRAYGCACTCTCYCCTGAACTCTAYACAACATATTTTGTYACCAAGRCCCTACTTCTRAC"
 78 | 	"CTCCCTRTTCYTATGRRTTCGAACAGCATACCCCCGATTCCGCTACGACCARCTCATRCA"
 79 | 	"CCTCCWATGAAAAARCTTCCTRCCACTYACCCTAGCRTTACTTATATGAYAYGTYTCCRT"
 80 | 	"ACCYAYTACAATCTCCAGCATYCCCCCTCARACCTAAGAAATATGTCTGATAAAAGAGTT"
 81 | 	"ACTTTGATAGAGTAAATAATAGGAGYTTAAACCCCCTTATTTCTAGGAYYATGAGRATYG"
 82 | 	"AACCCATCCCTGAGAATCCAAAAYTCTCCGTGCCACCTATCRCACCCCATCCTAAAGTAA"
 83 | 	"GGTCAGCTAAATAAGCTATCGGGCCCATACCCCGAAAATGTTGGTTAHACCCTTCCCGTA"
 84 | 	"CTAATTAATCCCCTGGCCCAACCCRTCATCTACTCTACCRTYTTTRCAGGCACACTCATC"
 85 | 	"ACWGCGCTAAGCTCRCACTGATTTTTTACCTGAGYRGGCCTAGAAATAAACATRCTAGCY"
 86 | 	"TTTATTCCARTTCTAACCAAAAAAATAAACCCTCGTTCCACAGAAGCTGCCAYMAAGTAY"
 87 | 	"TTCCTCAYRCAARCAACCGCATCYATARTCCTYCTAATAGCYATCCTCYTCAAMAAYATA"
 88 | 	"YTCTYCGGRCAATGRRCCATAACCARYACTACCAAYCARTACTCATCATTAATAATCATA"
 89 | 	"RTRGCTATRGCAATAAAACTRGGAATAGCCCCCTTTCACTTCTGAGTCCCAGARGTYRCC"
 90 | 	"CARGGCRYCCCTCTRACATCCGGYCTGCTTCTTCTCACATGACAAAAACTAGCCCCYATC"
 91 | 	"TCAATYATRTACCAAATYTCYCCCTCACTARACGTAAGCCTYCTCCTCACTCTCTCAATC"
 92 | 	"YTATCCATCATAGYAGGYRGTTGAGGTGGRYTAAACCAAACCCARCTRCGCAAAATCYTA"
 93 | 	"GCATACTCCTCAATTACCCAYATAGGATGRATAATAGCARTTCTACCSTACAACCCYAAC"
 94 | 	"ATAACCATYCTTAAYTTAAYTATYTAYATYATCCTAACYACYACCGCATTCCTRCTACTC"
 95 | 	"AACTTAAACTCCAGCACCACRACCCTRCTACTATCTCGCACCTGAAACAARMTAACATGW"
 96 | 	"CTRACACCCTTAATYCCATYCACCCTCCTCTCCCTAGGAGGCCTRCCCCCRCTRACCGGC"
 97 | 	"TTTTTGCCYAAATGGRYCATTATCGAAGARTTCACAAARAACAATAGCCTCATYRYCCCC"
 98 | 	"ACCATCATAGCCACYATYACCCTMMTTAACCTYTACTTCTACCTRCGCCTAATYTACTCC"
 99 | 	"ACCTCAATYACAYTACTCCCCATRTCYAACAACGTAAAAATAAAATGACARTTYGAACAY"
100 | 	"ACAAARCCCAYCCCRYTCCTCCCCAYACTCATCRCCCTYACCACRCTACTCCTRCCYATC"
101 | 	"TCCCCYYTYATACTAATAATCTTATAGAAATTTAGGTTAAATACAGACCAAGAGCCTTCA"
102 | 	"AAGCCCTCAGYAAGTYRCAATACTTAATTTCTGYRACARCTAAGGACTGCAAAAYCYCAY"
103 | 	"TCTGCATCAACTGAACGCAAAYCAGCYACTTTAATTAAGCTAAGCCCTYRCTAGACCAAT"
104 | 	"GGGACTTARACCCACAAACACTTAGTTAACAGCTAAGCACCCTARTCAACTGGCTTCAAT"
105 | 	"CTACTTCTCCCGCCGCCGGGaAAAAAGGCGGGAGAAGCCCCGGCARRYTTGAAGCTGCTT"
106 | 	"CYTMGAATTTGCAATTCAATATGARAAYCACCTCRGARCYGGTAAAAAGRGGCCTARCCC"
107 | 	"CTGTCTTTAGATTTACAGTCCAATGCTTCACTCAGCCATTTTACCYccccCVccCCCCAC"
108 | 	"TGATGTTCGYCRACCGTTGACTATTCTCTACAAACCACAAAGACATTGGRACACTATACC"
109 | 	"TAYTATTCGGYGCATGAGCTGGRGTYYTAGGCACAGCYCTAAGCCTCCTTATTCGRGCCG"
110 | 	"ARCTRGGYCARCCAGGCAACCTTYTRGGYAACGACCACATCTACAACGTYATCGTYACAG"
111 | 	"CYCATGCATTYGTAATAATCTTYTTCATAGTAATACCCATCATAATCGGRGGCTTTGGCA"
112 | 	"ACTGACTARTYCCCCTAATAATYGGYGCCCCCGATATRGCRTTYCCCCGCATAAACAACA"
113 | 	"TAAGCTTCTGACTCTTACCYCCCTCYCTCCTACTCCTGCTYGCATCTRCTAYAGTRGARG"
114 | 	"CCGGAGCAGGAACAGGTTGRACAGTCTACCCTCCCTTAGCAGGGAACTACTCCCACCCTG"
115 | 	"GARCCTCCGTAGACCTAACCATCTTCTCCYTACACCTRGCAGGTRTCTCYTCYATCYTAG"
116 | 	"GGGCYATCAAYTTCATCACAACAATTATCAAYATAAAACCCCCYGCCATAACCCARTACC"
117 | 	"AAACRCCCCTCTTYGTCTGATCCGTCCTAATYACAGCAGTCCTACTTCTMCTATCTCTCC"
118 | 	"CAGTCCTAGCTGCTGGYATCACYATAYTACTAACAGACCGCAAYYTYAACACCACCTTCT"
119 | 	"TCGACCCCGCCGGRGGAGGAGACCCYATTCTATACCARCACCTATTCTGATTTTTCGGYC"
120 | 	"AYCCTGAAGTYTATATTCTTATCCTACCAGGYTTCGGAATAATCTCCCAYATYGTAACYT"
121 | 	"ACTACTCCGGAAAAAAAGAACCATTTGGATAYATAGGYATGGTCTGAGCTATRATATCAA"
122 | 	"TTGGCTTCCTRGGRTTTATCGTGTGAGCRCACCAYATRTTTACAGTAGGAATAGACGTAG"
123 | 	"ACACACGAGCATAYTTCACCTCCGCYACCATAATCATCGCTATCCCCACCGGCGTCAAAG"
124 | 	"TATTTAGCTGACTMGCYACRCTCCACGGAAGCAATATGAAATGRTCTGCTGCAGTRCTCT"
125 | 	"GAGCCCTRGGATTCATYTTTCTTTTCACCGTAGGTGGCYTRACTGGCATTGTATTAGCAA"
126 | 	"ACTCATCRCTAGACATCGTACTACACGAYACRTACTACGTTGTAGCYCACTTCCACTATG"
127 | 	"TCCTATCAATRGGRGCWGTATTTGCCATCATAGGRGGCTTYATTCACTGATTTCCCCTAT"
128 | 	"TCTCAGGCTACACCCTAGACCAAACCTACGCCAAAATCCATTTCRCYATYATRTTCRTCG"
129 | 	"GCGTAAATCTAACYTTCTTCCCACAACACTTYYTMGGCCTRTCCGGAATGCCCCGACGTT"
130 | 	"ACTCRGACTACCCCGATGCATACACCACATGAAAYRYCCTATCATCTGTRGGYTCATTCA"
131 | 	"TTTCTCTAACAGCAGTMATATTAATAATTTTCATGATYTGAGARGCCTTCGCTTCRAARC"
132 | 	"GAAAARTCCTAATAGTAGAAGAACCCTCCATAAACCTGGAGTGRCTRYATGGATGCCCMC"
133 | 	"CRCCCTACCACACATTCRAAGARCCCGTATACATAAAATCTARACAAAAAAGGAAGGAAT"
134 | 	"CGAAcCCCCCCAAAGYTGGTTTCAAGCCAACCYCATGRCCTCCATGACTTTTTCAAAAAG"
135 | 	"RTATTAGAAAAACCATTTCATAACTTTGTCAAAGTTAAATYATAGGCTRARTCCTATATA"
136 | 	"YCTTAATGGSACATGCARCRCAAGTAGGTCTACAAGACGCTACWTCCCCTATCATAGAAG"
137 | 	"AGCTYATYACCTTTCAYGAYCACRCCCTYATARTYATTTTYCTYATCTGCTTYCTARTCY"
138 | 	"TGTAYGCCCTYTTCCTAACACTCACAACAAAACTRACTAATACYAACATCTCAGACGCYC"
139 | 	"AGGARATRGARACCGTCTGAACTATCCTRCCCGCCATCATCCTARTCCTCATCGCCCTCC"
140 | 	"CATCCCTACGCATCCTTTACATAACAGAYGAGRTCAAYGAYCCYTCYCTTACCATCAAAT"
141 | 	"CAATTGGCCACCAATGGTACTGAACYTACGAGTACACCGACTACGGCGGACTRATCTTCA"
142 | 	"ACTCCTAYATACTTCCCCCATTRTTCCTAGAACCAGGCGACCTGCGACTCCTTGACGTYG"
143 | 	"ACAATCGAGTAGTRCTCCCRATYGAARCCCCCATTCGTATAATAATTACATCRCAAGAYG"
144 | 	"TYTTGCACTCATGARCTGTYCCCACAYTAGGCTTAAAAACAGATGCARTTCCMGGACGTC"
145 | 	"TAAACCAAACCACTTTYACYGCTACACGRCCRGGRGTRTACTACGGTCAATGCTCTGAAA"
146 | 	"TCTGYGGAGCRAACCACAGYTTCATRCCCATCGTCCTAGAATTAATTCCCCTAAAAATCT"
147 | 	"TTGAAATRGGRCCCGTATTYACCCTATARYAccccccccyyyyacccccYCTAGARCCCR"
148 | 	"CYGTAAAGCTAACTTAGCATTAACCTTTTAAGTTAAAGATTAAGAGARCCAACACCTCTT"
149 | 	"TACAGTGAAATGCCCCAACYAAATACYACCRTRTGRYCYACCATAATYAYCCCCATRYTC"
150 | 	"CTTACACTATTYCTCATCACCCARCTAAAAAYATTAAACACAARCTACCACYTACYYCCC"
151 | 	"TCACCAAARCCCATRAAAATAAAAAAYTATAACAAACCCTGAGAACCARAATGARCGAAA"
152 | 	"ATCTGTTCRCTYCATTCAYTGCCCCYRCARTCCTARRCYTRCCCGCCRCAGTACTGATCA"
153 | 	"TTCTATYTCCCCCTCTATTGAYCCCCACCTCCAAATRTCTCRTCARCAACCGACTAATYA"
154 | 	"CCACCCAACAATGACTAATYAAACTAAYCTCAAAACAAATRATARCYAYACAYAACACTA"
155 | 	"ARGGRCGAACCTGRTCYCTTATACTARTATCCYTAATCATTTTTATTRCCACAACTAACC"
156 | 	"TYCTMGGRCTCCTRCCYYACTCATTTACRCCAACCRCCCAAYTATCTATAAACCTAGCCR"
157 | 	"TRRCYVTCCCCTTATGAGCRGGYGCAGTRATYAYAGGCYTYCGCTCTAAGATTAAAAATG"
158 | 	"CCCTAGCCCACTTYYTRCCACAAGGCACACCYACACCCCTTATCCCYATACTAGTTATTA"
159 | 	"TYGARRCYATCAGCCTRYTCATTCAACCAATAGCCCTRGCCGTACGCCTARCCGCYAACA"
160 | 	"TTACTGCAGGCCACCTRCTCATGCAYCTAATTGGAARYRCCACCCTARCARTATCRACCA"
161 | 	"YTARCCTTCCCTCYACVYTTATCAYCYTYACARTYCTRAYTCTRCTRACTRTCCTAGAAR"
162 | 	"YCGYTGTCGCCTTARTCCARGCYTACGTTTTCACACTYCTAGTRARCCTCTACCTGCACG"
163 | 	"ACAAYACATAATGACCCACCAATCRCATGCCTATCAYATAGTAAARCCCAGYCCATGRCC"
164 | 	"YCTRACRGGRGCCCTYTCAGCCCTCCTAATGRCCTCCGGYCTRGCCATRTGATTYCACTY"
165 | 	"CCACTCCRYAACGCTCCTYATACTAGGCCTRCTAACCARYACACTAACYATATRCCAATG"
166 | 	"RTGRCGCGATGTRACACGAGAAAGCACATACCAAGGCCACCACACACCACCTGTCCAAAA"
167 | 	"ARGCCTTCGATAYGGGATARTCCTATTTATTRCCTCAGAARTTTTTTTCTTCGCAGGATT"
168 | 	"TTTCTGAGCCTTYTAYCACTCCAGCCTAGCCCCYACCCCYCAAYTAGGDGGRCACTGRCC"
169 | 	"YCCAACAGGCATCACCCCRCTAAAYCCCCTAGAARTCCCACTYCTAAACACATCCGTRTT"
170 | 	"RCTYGCATCAGGAGTRTCAATCACCTGAGCYCACCATAGYCTAATAGAAARCAACCGAAA"
171 | 	"CCAAAYAATTCAAGCACTGCTYATTACAATTTTACTGGGTCTCTATTTTACCCTMCTACA"
172 | 	"ARCCTCAGAGTACTTCGRRTCTCCCTTCACYATYTCCGAYGGCATCTACGGCTCAACATT"
173 | 	"TTTTGTARCCACAGGYTTCCAYGGAMTHCACGTCATTATTGGCTCAACTTTCCTCRCTAT"
174 | 	"CTGCTTCATYCGCCAACTAATATTTCACTTTACATCCAAACAYCACTTTGGCTTYGAAGC"
175 | 	"CGCCGCCTGATACTGRCATTTYGTRGAYGTGRTYTGACTAYTYCTGTATRTCTCCATCTA"
176 | 	"YTGATGAGGGTCTTACTCYTTTAGTATAAATAGTACCGTTAACTTCCAATTAACTAGYTT"
177 | 	"TGACARCATTCAAAAAAGAGTAATAAACTTCGCCTTAATTTTAATAAYCRAYACCCTCCT"
178 | 	"MGCCTTACTACTAATAATYATYACATTTTGACTACCACAACTCAAYRGCTACATRGARAA"
179 | 	"ATCCACCCCYTACGARTGCGGCTTCGACCCTATATMCCCCRCCCGYGTCCCTTTCTCYAT"
180 | 	"AAAATTCTTCTTAGTAGCTATYACCTTCYTATTATTYGAYCTAGAAATTGCCCTCCTTTT"
181 | 	"ACCCCTRCCATGAGCCCTACAAACAACTAACCTRCCRCTAATAGYTATRTCATCCCTCTT"
182 | 	"ATTAATCATCATCCTRGCCCTRAGTCTGGCCTAYGARTGACTACARAAAGGATTAGACTG"
183 | 	"RRCYGAATTGGTAYATAGTTTAAACAAWACRAATGATTTCGACTCATTAAATTATGATAA"
184 | 	"TCATATYTACCAAATGCCCCTCATTTACATAAATATYATAYTRGCATTYRCCATCTCACT"
185 | 	"TCTAGGAATACTAGTATATCGCTCACACCTCATRTCCTCCCTACTATGCCTAGAAGGAAT"
186 | 	"AATACTRTCRCTRTTCATTATAGCTACTCTCAYWACCCTCAACACCCACTCCCTCTTAGC"
187 | 	"YAAYATTGTRCCTATTGCCATRYTAGTYTTYGCYGCCTGCGAAGCAGCRGTRGGCCTAGC"
188 | 	"CCTRCTAGTCTCAATCTCCAACACATAYGGCCTAGAYTAYGTACATAACCTAAACYTACT"
189 | 	"YCAATGCTAAAACTAATCGTCCCAACAATYATAYTRYTACCACTRACRTGACTYTCCAAD"
190 | 	"AARCACATAATYTGAATCAACACAACCACCCACAGCCTAATTATTAGCATCATCCCYCTR"
191 | 	"CTATTTTTYAACCAAATYAACAACAACCTATTTAGCTRYTCCCYAACCTTYTCCTCSGAC"
192 | 	"CCYYTAACAACCCCCCTCCTAATACTAACYACCTGRCTCCTACCCCTCACAATCATGGCA"
193 | 	"AGCCARCGCCACTTATYYARYGARCCRCTATCACGAAAAAAACTCTACCTCTCYATACTA"
194 | 	"ATCTYCCTRCAAATCTCCTTARTTATAACATTCACRGCCACAGAACTAATCATATTTTAT"
195 | 	"ATCTTCTTCGAAACCACACTTATCCCCACCYTGRYKATCATCACCCGATGRGGCARCCAR"
196 | 	"YCAGAACGCCTGAACGCAGGYACATACYTCCTATTCTAYACCCTAGTAGGCTCCCTTCCC"
197 | 	"CTACTCATCGCACTRATYTAYACTCACAACACYCTAGGCTCACTAAACATYYTACTACTY"
198 | 	"ACYCTYACTGCCCAAGARCTATCAAACTCCTGAGCYAACAACYTAATATGAYTRGCTTAC"
199 | 	"ACAATRGCYTTTATAGTAAARATACCTCTTTACGGACTCYACTTATGRCTCCCTAAAGCC"
200 | 	"CATGTCGAAGCYCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTRAAACTRGGY"
201 | 	"GGCTAYGGTATAATACGCCTCACACTCATTCTCAACCCCCTGACAAAACACATAGCCTAY"
202 | 	"CCYTTCCTTGTACTATCSCTATGAGGCATRATTATAACAAGCTCCATCTGCCTRCGACAA"
203 | 	"ACAGACCTAAAATCRCTCATTGCATACTCTTCAATCAGCCACATRGCCCTCGTAGTRRCA"
204 | 	"GCCATTCTMATCCAAACYCCCTGAAGCTTCACCGGCGCAGTCATYCTCATAATCGCCCAC"
205 | 	"GGRCTYACATCCTCAYTACTATTCTGCCTAGCAAACTCAAACTACGAACGYACTCACAGT"
206 | 	"CRCATCATAATCCTCTCTCAAGGACTTCAARCTCTRCTCCCACTAATAGCTTTTTGATGA"
207 | 	"CTTCTAGCAAGCCTCGCYAACCTCGCCYTACCCCCCACTATTAAYCTRCTRGGRGARCTC"
208 | 	"TCYGTGCTAGTRRCCACRTTCTCCTGATCAARTATCACTCTYCTACTYAYRGGACTCAAC"
209 | 	"ATRCTARTCACARCCCTATACTCCCTCTACATATTYACCACAACACAATGRGGCTCACTC"
210 | 	"ACYCACCAYRTTAACAACATAAAACCCTCATTYACACGAGAAAAYACCCTCATRTTCATA"
211 | 	"CACCTAKCCCCCATYCTCCTCCTATCCCTYAACCCCGACATCATYACCGGRTTTTCCYMY"
212 | 	"TGTAAATATAGTTTAAYCAAAACATCAGATTGTGARTCYGACAACAGAGGCTYACRACCC"
213 | 	"CTTATTTACCGAGAAAGCTCACAAGAACTGCTAACTCRYRcCYCCATGYCTRACAACRTG"
214 | 	"GCTTTCTCAACTTTTAAAGGATAACAGCTATCCATTGGTCYTAGGCCCCAARAATTTTGG"
215 | 	"TGCARCTCCAAATAAAAGTAAYAACCATGYACACTACTaATARCCRCCCTRACCYTRRCT"
216 | 	"TCCCTAATTCCCCCCATCCTTRCCACCYTYRTTAACCCYAACAAAAAAARCTCATACCCC"
217 | 	"CATTATGTAAAATCCATTRTCGCATCCACCTTTATTATCAGYCTCTTCCCCACAACAATA"
218 | 	"TTCATRYGCCTRGAYCAAGAAGTYATTATCTCRARCTGACACTGRGCCACAACCCAAACA"
219 | 	"ACCCARCTCTCCCTAAGCYTCAAACTAGACTACTTCTCCATAATATTYATCCCTGTRGCR"
220 | 	"YTRTTCGTTACATGRTCMAYCATAGAATTCTCACTGTGATATATAAACTCAGAYCCRAAY"
221 | 	"ATTAATCAGTTCTTCAARTATCTACTCATHTTCCTAATYACCATRCTAATCTTAGTYACC"
222 | 	"GCBAACAACCTATTYCAACTGTTCATCGGCTGRGARGGCGTAGGAATTATATCCTTCYTR"
223 | 	"CTCATYAGTTGATGRYACGCCCGAGCRGATGCCAAYACAGCAGCCAYTCAAGCARTYCTA"
224 | 	"TACAACCGTATCGGCGATATYGGYTTYATCCTCGCCTTAGCATGATTTATCCTACACTCC"
225 | 	"AACTCATGAGACCCWCAACAARTARCCCTTCTRAACGCYAAYCCARGCCTCACCCCRYTA"
226 | 	"CTAGGCCTCCTCCTAGCAGCAGCRGGCAAATCAGCCCAATTAGGYCTCCACCCCTGACTC"
227 | 	"CCCTCAGCCATAGARGGCCCCACYCCAGTYTCRGCCCTACTCCACTCAAGCACTATAGTT"
228 | 	"GTAGCMGGRRTCTTCYTACTCATCCGCTTCCACCCCCTARCAGAAAAYAVCCCRCTAATC"
229 | 	"CAAACTCTAACACTATGCTTAGGCGCTATCACCACTCTRTTYGCAGCAGTCTGCGCYCTY"
230 | 	"ACACARAATGACATCAAAAAAATCGTAGCCTTCTCCACTTCAAGTCARCTAGGRCTCATA"
231 | 	"RTAGTYACAATYGGCATYAACCAACCACACCTRGCATTCCTGCACATCTGTACCCACGCC"
232 | 	"TTCTTCAAAGCCATACTATTTATRTGCTCCGGRTCCATCATCCACAACCTTAACAATGAR"
233 | 	"CAAGATATTCGAAAAATAGGAGGACTRCTCAAAACCATACCYCTCACTTCAAYCTCYCTC"
234 | 	"ACCATTGGCAGCCTRGCATTARCRGGAATRCCTTTCCTCACAGGYTTCTAYTCCAAAGAC"
235 | 	"CACATCATCGAAACCGCAAACATATCATAYACAAACGCCTGAGCCCTRTCTATTACTCTC"
236 | 	"ATCGCTACCTCCCTRACARGCGCCTAYAGCACTCGRATAATYCTTCTCACYCTAACAGGT"
237 | 	"CRACCYCGCTTCCCYRCCCTYACTRACATTAACGAAAATAACCCYRCCCTACTAAACCCC"
238 | 	"ATTAAACGCCTGRCRRCCGGAAGCCTRTTCGCAGGATTYCTCATTACYAACAACATYTYC"
239 | 	"CCYRCATCCCCCTTCCAAACAACARTCCCCCTCYACCTAAAACTCACRGCCCTCGCYGTC"
240 | 	"ACYYTCCTAGGRCTTCTAACAGCCCTAGACCTCAACTACCTAACCAAYAAACTTAAAATA"
241 | 	"AAATMCCCACYATRCACATTTTATTTCTCCAACATACTMGGATTCTACYCTWVCATCAYA"
242 | 	"CACCGCRCAATCCCCTATCTAGSCCTTCTYRYGAGCCAAAACCTRCCCCTACTCCTCCTA"
243 | 	"GACCWAACCTGRCTAGAAAARCTAYTRCCYAAAACAATYTCRCAGCACCAAATCTCCRCC"
244 | 	"TCCRTMATCACCTCDACCCAAAAAGGCATAATYAAACTYTAYTTCCTCTCTTTYYTCTTY"
245 | 	"CCRCTCATCCTRRCCCTACTCYTRATMACATARCCTRTTCCCCCGAGCAATYTCAATTAC"
246 | 	"AAYRYAYACACCAACAAACAATGTYCARCCAGTRACYACYACTAAYCAACGCCCATARTC"
247 | 	"ATACAAAGCCCCCGCACCAATAGGATCCTCCCGAATSAACCCTGACCCYTCTCCYTCATA"
248 | 	"AATTATTCAGCTYCCYACACTAYYAAAGTTTACCACAACCACYACCCCATCATACTCTTT"
249 | 	"CACCCACARCACCARYCCYACCTCCATCSMTAACCCCACTAAAACACTYACYAAGACCTC"
250 | 	"AACCYCTGACCCCCATGCCTCAGGATACTCCTCRATAGCYATCRCTGTAGTATAYCCAAA"
251 | 	"GACAACCAYCATYCCCCCTAAATAAAYTAAAAAAACTATTAAACCCATATAACCTCCCCC"
252 | 	"AAAATTCAGAATAATAACACACCCRACCACRCCRCWAACAAYCARTACTAARCCCCCATA"
253 | 	"AATAGGAGARGGCTTAGAAGAAAAYCCCACAAAYCCCATTACTAAACCCRCACTCAACAG"
254 | 	"AAACAAAGCATAYATCATYATTCTCGCACGGACTACARCCACGACCAATGATATGAAAAA"
255 | 	"CCATCGTTGTATTTYAACTACAAGAACACCAATGACCCCAAYACGCAAAAYYARCCCCCT"
256 | 	"AATAAAAYTAATTAACCRCTCAYTCATCGACCTCCCYACCCCATCCAACATCTCCGCATG"
257 | 	"RTGAAACTTCGGCTCACTCCTTGGCGYCTGCCTGRYCCTCCAAATCACCACAGGRCTAYT"
258 | 	"CCtAGCCATRCACTAYTCACCAGACGCYTCRRCCGCCTTYTCATCAATCGcYCACATCRC"
259 | 	"TCGAGACGTAAATTAYGGSTGAAYCATCCGCTACCTTCACGCCAAYGGCGCYTCAATATT"
260 | 	"YTTTATCTGCCTCTTCCTRCACATYGGRCGAGGYCTATATTACGGRTCATTYCTCTACTC"
261 | 	"ARAAACCYGAAACAYCGGCATTATCCTCCTRCTTRCARCYATAGCAACAGCCTTCATRGG"
262 | 	"YTATGTCCTCCCRTGAGGCCAAATATCATTCTGAGGRGCCACRGTAATTACAAACYTACT"
263 | 	"ATCCGCCAYYCCATACATYGGRVCAGACCTAGTYCAATGRRTCTGAGGRGGSTACTCAGT"
264 | 	"ARRCARTCCCACCCTCACACGATTCTTTACCTTTCACTTCATCTYRCCCTTCATTRTTRC"
265 | 	"ARCCCTARCAGCACTCCACCTCCTATTCTTRCACGAAACRGGRTCAAACAACCCCCTAGG"
266 | 	"ARYCACCTCCCATTCCGATAAAATCACCTTCCACCCYTACTACACAATCAAAGACRCCCT"
267 | 	"CGGCTTRSTTCTCTTYMTYCTCTCCTTAATRAYAYTAACACTAYTCTCACCWGACCTCCT"
268 | 	"ARRCGACCCAGAYAAYTAYAYCCYAGCYAACCCCYTAAAYACCCCTCCCCACATCAARCC"
269 | 	"CGAATGATATTTCCTATTCGCCTACACAATTCTCCGATCCGTCCCYAACAARCTAGGRGG"
270 | 	"CGTCCTYGCCYTAYTAYTAYCCATCCTCATYCTAGCAATAATCCCYRYYCTCCAYRTATC"
271 | 	"CAAACAACAAAGCATAAYATTTCGCCCACTAAGCCAATCACTTTAYTGRCTCCTARCCRC"
272 | 	"AGAYCTCCTCRTYCTAACYTGAATCGGAGGRCAACCAGTAARCTACCCYTTTAYYATYAT"
273 | 	"TGGACAARTARCATCCRTACTATACTTCRCAACAATCYTAATCCTAATACCAAYYRYCTC"
274 | 	"CCTAATTGAAAACAAAATACTCAAATGRVYCTGYCCTTGTAGTAYAAAYYARTACAYCAG"
275 | 	"TCTTGTAARCCRRARAYGAAAWCYtYYTTCCAAGGACAMATCAGAGAAAAAGYCYTTAAC"
276 | 	"TCYACCATTAGCACCCAAAGCTAAGATTCTAATTTAAACTAYTCTCTGTTCTTTCATGGG"
277 | 	"GARRCAGATTTGRGTRCCACCCAAGTATTGRCTYAYCCAYCWACAACCGCYATGTAYYYC"
278 | 	"GTACATTACTGCYAGHCAMCATGRATATYGYACVGTACYAYAAAYACTYRRYYACCTRYA"
279 | 	"GTACATRRDAMYYHVRYYYRBATCrAcmcmmcHYYYNYcCYYATGCTTACAAGCMRGYAC"
280 | 	"RRYAAYYRAMYYYCARCYRYYAYRCATYRRHYRYARYYCCAAMRYYVCaYYCYYNHCYAY"
281 | 	"YRGRATAYCAACARABYYAYBYVYYYYYRRYARYACATRGYRCATRDHRYCATYYRYCGT"
282 | 	"ACATRRYACATYVYAGTCAAAYYYYYYCYHGYCCCYAYGGRTGACCCCCCYCAGATAGGR"
283 | 	"RTCCCTTRRYCACCATCCTCCGTGAAATCAATATCCCGYACAAGAGTRMTACTCTCCTCG"
284 | 	"CTCCGGGCCCATRACACYTGGGGGTAGCTAAARTGAAYTGTATCCGRCATCTGGYTCCYA"
285 | 	"CTTCAGGGYCATAAARYCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATg" ;
286 | 
287 | const int mt311_sequence_size = sizeof( mt311_sequence ) ;
288 | 
289 | 


--------------------------------------------------------------------------------
/src/myers_align.c:
--------------------------------------------------------------------------------
  1 | #include "myers_align.h"
  2 | 
  3 | #include <limits.h>
  4 | #include <stdlib.h>
  5 | #include <string.h>
  6 | 
  7 | inline int match( char a, char b ) { return (char_to_bitmap(a) & char_to_bitmap(b)) != 0 ; }
  8 | 
  9 | // [*blech*, this looks and feels like FORTRAN.]
 10 | unsigned myers_diff( const char *seq_a, enum myers_align_mode mode, const char* seq_b, int maxd, char *bt_a, char *bt_b ) 
 11 | {
 12 | 	int len_a = strlen( seq_a ), len_b = strlen( seq_b ) ;
 13 | 	if( maxd > len_a + len_b ) maxd = len_a + len_b ;
 14 | 
 15 | 	// in vee[d][k], d runs from 0 to maxd; k runs from -d to +d
 16 | 	int **vee = calloc( maxd, sizeof(int*) ) ;
 17 | 
 18 | 	int d, dd, k, x, y, r = UINT_MAX ;
 19 | 	int *v_d_1 = 0, *v_d = 0 ; 															// "array slice" vee[.][d-1]
 20 | 	for( d = 0 ; d != maxd ; ++d, v_d_1 = v_d )									// D-paths in order of increasing D
 21 | 	{
 22 | 		v_d = d + (vee[d] = malloc( (2 * d + 1) * sizeof( int ) )) ; 		// "array slice" vee[.][d]
 23 | 
 24 | 		for( k = max(-d,-len_a) ; k <= min(d,len_b) ; ++k ) 					// diagonals
 25 | 		{
 26 | 			if( d == 0 )         x = 0 ;
 27 | 			else if(d==1&&k==0)  x =                       v_d_1[ k ]+1 ;
 28 | 			else if( k == -d   ) x =                                     v_d_1[ k+1 ] ;
 29 | 			else if( k ==  d   ) x =       v_d_1[ k-1 ]+1 ;									// argh, need to check for d first, b/c -d+2 could be equal to d
 30 | 			else if( k == -d+1 ) x = max(                  v_d_1[ k ]+1, v_d_1[ k+1 ] ) ;
 31 | 			else if( k ==  d-1 ) x = max(  v_d_1[ k-1 ]+1, v_d_1[ k ]+1 ) ;
 32 | 			else                 x = max3( v_d_1[ k-1 ]+1, v_d_1[ k ]+1, v_d_1[ k+1 ] ) ;
 33 | 
 34 | 			y = x-k ;
 35 | 			while( x < len_b && y < len_a && match( seq_b[x], seq_a[y] ) ) ++x, ++y ;
 36 | 			v_d[ k ] = x ;
 37 | 
 38 | 			if(
 39 | 					(mode == myers_align_is_prefix || y == len_a) &&
 40 | 					(mode == myers_align_has_prefix || x == len_b) )
 41 | 			{
 42 | 				char *out_a = bt_a + len_a + d +2 ;
 43 | 				char *out_b = bt_b + len_b + d +2 ;
 44 | 				*--out_a = 0 ;
 45 | 				*--out_a = 0 ;
 46 | 				for( dd = d ; dd != 0 ; )
 47 | 				{
 48 | 					if( k != -dd && k != dd && x == vee[ dd-1 ][ k + dd-1 ]+1 )
 49 | 					{
 50 | 						--dd ;
 51 | 						--x ;
 52 | 						--y ;
 53 | 						*--out_b = seq_b[x] ;
 54 | 						*--out_a = seq_a[y] ;
 55 | 					}
 56 | 					else if( k > -dd+1 && x == vee[ dd-1 ][ k-1 + dd-1 ]+1 )
 57 | 					{
 58 | 						--x ;
 59 | 						--k ;
 60 | 						--dd ;
 61 | 						*--out_b = seq_b[x] ;
 62 | 						*--out_a = '-' ;
 63 | 					}
 64 | 					else if( k < dd-1 && x == vee[ dd-1 ][ k+1 + dd-1 ] )
 65 | 					{
 66 | 						++k ;
 67 | 						--y ;
 68 | 						--dd ;
 69 | 						*--out_b = '-' ;
 70 | 						*--out_a = seq_a[y] ;
 71 | 					}
 72 | 					else // this better had been a match...
 73 | 					{
 74 | 						--x ;
 75 | 						--y ;
 76 | 						*--out_b = seq_b[x] ;
 77 | 						*--out_a = seq_a[y] ;
 78 | 					}
 79 | 				}
 80 | 				while( x > 0 )
 81 | 				{
 82 | 					--x ;
 83 | 					*--out_b = seq_b[x] ;
 84 | 					*--out_a = seq_a[x] ;
 85 | 				}
 86 | 				memmove( bt_a, out_a, bt_a + len_a + d + 2 - out_a ) ;
 87 | 				memmove( bt_b, out_b, bt_b + len_b + d + 2 - out_b ) ;
 88 | 				r = d ;
 89 | 				goto cleanup ;
 90 | 			}
 91 | 		}
 92 | 	}
 93 | 
 94 | cleanup:
 95 | 	for( dd = maxd ; dd != 0 ; --dd )
 96 | 		free( vee[dd-1] ) ;
 97 | 	free( vee ) ;
 98 | 	return r ;
 99 | }
100 | 
101 | 


--------------------------------------------------------------------------------
/src/myers_align.h:
--------------------------------------------------------------------------------
 1 | #ifndef INCLUDED_MYERS_ALIGN
 2 | #define INCLUDED_MYERS_ALIGN
 3 | 
 4 | enum myers_align_mode { 
 5 | 	myers_align_globally,
 6 | 	myers_align_is_prefix,
 7 | 	myers_align_has_prefix } ;
 8 | 
 9 | //! \brief aligns two sequences in O(nd) time
 10 | //! This alignment algorithm follows Eugene W. Myers: "An O(ND)
11 | //! Difference Algorithm and Its Variations".
12 | //! Both input sequences are ASCIIZ-encoded with IUPAC ambiguity codes.
13 | //! By definition, if ambiguity codes overlap, that's a match, else a
14 | //! mismatch.  Mismatches and gaps count a unit penalty.  If mode is
15 | //! myers_align_globally, both sequences must align completely.  If mode
16 | //! is myers_align_is_prefix, seq_a must align completely as prefix of
17 | //! seq_b.  If mode is myers_align_has_prefix, seq_b must align
18 | //! completely as prefix of seq_a.  
19 | //!
20 | //! Note that the calculation time is O(nd) where n is the length of the
21 | //! best alignment and d the number of differences in it, memory
22 | //! consumption is O(maxd^2).
23 | //!
24 | //! \param seq_a First input sequence.
25 | //! \param mode How to align (i.e. what gaps to count).
26 | //! \param seq_b Second input sequence.
27 | //! \param maxd Maximum penalty to consider.
28 | //! \param bt_a Space to backtrace seq_a into, must have room for
29 | //!             (strlen(seq_a)+maxd+1) characters.
30 | //! \param bt_b Space to backtrace seq_b into, must have room for
31 | //!             (strlen(seq_b)+maxd+1) characters.
 32 | //! \return The actual edit distance or UINT_MAX if no alignment with
 33 | //!         fewer than maxd differences exists.
34 | //!
35 | unsigned myers_diff( const char *seq_a, enum myers_align_mode mode, const char* seq_b, int maxd, char *bt_a, char *bt_b ) ;
36 | 
// The helpers below are `static inline` on purpose.  With a plain `inline`
// in a header, C99 emits no out-of-line definition at all (link error as soon
// as a call is not inlined), while GNU89 emits one per translation unit
// (duplicate symbols).  `static inline` is well defined in C and in C++.

//! \brief converts an IUPAC ambiguity code to a bitmap
//! Each base is represented by a bit (A=1, C=2, G=4, T or U=8), which makes
//! checking for matches easier.  Upper and lower case letters give the
//! same result; any character that is not an IUPAC code gives 0.
static inline int char_to_bitmap( char x ) 
{
    switch( x & ~32 )	// clearing bit 5 folds lower case onto upper case
    {
        case 'A': return 1 ;
        case 'C': return 2 ;
        case 'G': return 4 ;
        case 'T': return 8 ;
        case 'U': return 8 ;

        case 'S': return 6 ;
        case 'W': return 9 ;
        case 'R': return 5 ;
        case 'Y': return 10 ;
        case 'K': return 12 ;
        case 'M': return 3 ;

        case 'B': return 14 ;
        case 'D': return 13 ;
        case 'H': return 11 ;
        case 'V': return 7 ;

        case 'N': return 15 ;
        default: return 0 ;
    }
}

//! \brief tells whether two IUPAC codes have at least one base in common
//! Returns 1 if the bitmaps of x and y overlap, else 0.
static inline int compatible( char x, char y ) { return (char_to_bitmap(x) & char_to_bitmap(y)) != 0 ; }

//! \brief smaller of two ints
static inline int min( int a, int b ) { return a < b ? a : b ; }
//! \brief larger of two ints
static inline int max( int a, int b ) { return a < b ? b : a ; }
//! \brief largest of three ints
static inline int max3( int a, int b, int c ) { return a < b ? max( b, c ) : max( a, c ) ; }
72 | 
73 | #endif
74 | 


--------------------------------------------------------------------------------
/src/params.h:
--------------------------------------------------------------------------------
 1 | /* 
 2 |  * File:   params.h
 3 |  * Author:
 4 |  *
 5 |  * Created on 16. Januar 2009, 15:02
 6 |  */
 7 | 
 8 | #ifndef _PARAMS_H
 9 | #define	_PARAMS_H
10 | 
11 | #ifdef	__cplusplus
12 | extern "C" {
13 | #endif
14 | 
15 | #define DEBUG (0)
16 | #define CONS_SCHEME (1)
17 | #define MAX_ID_LEN (100)
18 | #define MAX_DESC_LEN (128)
19 | #define CLUSTALW_LINE_WIDTH (60)
20 | #define FASTA_LINE_WIDTH (60)
21 | #define MAX_LINE_LEN (1000000)
22 | #define PSSM_DEPTH (15)
23 | #define MAX_FN_LEN (1023)
24 | #define SCORE_CUTOFF_BUFFER (80) // just a guess for now
25 | #define FIRST_ROUND_SCORE_CUTOFF (2000) // reference alignment original cutoff
26 | #define GOP (1000) // Gap open penalty
27 | #define GEP (200) // Gap extension penalty
28 | #define FLAT_MATCH (200) // score in the flat matrix for a gap
29 | #define FLAT_MISMATCH (-600) // score in the flat matrix for a mismatch
30 | #define N_SCORE (-100)
31 | #define NR_SCORE (-10) // score for N in reference
32 | #define TRIM_SCORE_CUT (1000)
33 | #define MAX_ITER (30) // maximum number of assembly iterations to do
34 | #define REALIGN_BUFFER (50) // amount of sequence padding to add in realignment
35 | #define QUAL_ASCII_OFFSET (33) // ascii code of lowest quality score, i.e. 0
36 | #define DEF_S 200.0
37 | #define DEF_N 0.0
38 | #define MIN_ALIGNABLE_LEN (15) // when distant reference is used, minimum amount of
39 |   // alignable sequence when reducing the sequence length for bases that overlap
40 |   // N positions in the reference
41 | #define MIN_SCORE_CONS (-399) // minimum score to call consensus base, not N, under
42 |                               // cons_code 1
43 | #define MIN_SC_DIFF_CONS (2400) // minimum diff between best and 2nd best to call
44 |                                 // best base consensus under cons_code 2
45 | #define PERC4GAP 50 // minimum percent of reads at a position that have a
46 |                     // gap for the consensus to be a gap
47 | 
48 | /* INIT_NUM_IDS is the initial number of sequence IDs that
49 |    can be in the file that we're restricting analysis to
50 | */
51 | #define INIT_NUM_IDS (1048576)
52 | 
53 | /* MAX_INS_LEN is the size of the char array accomodating
54 |    sequence inserts in an aligned fragment relative to the
55 |    reference sequence. That is, it's the longest single
56 |    gaps size allowable in the reference sequence alignment
57 | */
58 | #define MAX_INS_LEN (512)
59 | 
60 | /* INIT_REF_SEQ_LEN is initial size of refe::rence sequence to
61 |    which all aligned fragments are mapped. It can grow when
62 |    read in, if necessary */
63 | #define INIT_REF_SEQ_LEN (32768)
64 | 
65 | /* INIT_ALN_SEQ_LEN is the initial and maxmial length of
66 |    aligned sequence fragments. It cannot grow, so make sure
67 |    this is big enough */
68 | #define INIT_ALN_SEQ_LEN (256)
69 | #define INIT_NUM_ALN_SEQS (16000)
70 | 
71 | 
72 | #define MAX_FN_LEN (1023)
73 | 
74 | 
75 | #define MAX_KMER_POS (128)
76 | #define MAX_KMER_LEN (14)
77 | #define KMER_SATURATE (128)
78 | #define ALIGN_MASK_BUFFER (10)
79 | 
80 | 
81 | 
82 | 
83 | #ifdef	__cplusplus
84 | }
85 | #endif
86 | 
87 | #endif	/* _PARAMS_H */
88 | 
89 | 


--------------------------------------------------------------------------------
/src/pssm.c:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "pssm.h"
  3 | 
  4 | /* sub_mat_score
  5 |    Args: s1i, s2i - matrix indices of the reference base and of the
  6 |                     fragment (ancient) base
  7 |          sm       - array of 5x5 position specific matrices
  8 |          row, len - position in, and length of, the fragment
  9 |    Returns: int - sm[depth][s1i][s2i]; depth is row near the fragment
 10 |             start, runs up to 2*PSSM_DEPTH at its end, else PSSM_DEPTH */
 11 | int sub_mat_score(const short int s1i,
 12 |         const short int s2i,
 13 |         int sm[][5][5],
 14 |         const int row,
 15 |         const int len) {
 16 |     int depth; /* which of the position specific matrices applies */
 17 | 
 18 |     if (row < PSSM_DEPTH) {
 19 |         depth = row;
 20 |     } else if (len - (row + 1) < PSSM_DEPTH) {
 21 |         depth = (PSSM_DEPTH * 2) - (len - (row + 1));
 22 |     } else {
 23 |         depth = PSSM_DEPTH;
 24 |     }
 25 |     return sm[depth][s1i][s2i];
 26 | }
 27 | 
 28 | /* find_sm_depth
 29 |    Args: (1) int row - the current row we're on for alignment, i.e.
 30 |                        the position in the fragment sequence
 31 |          (2) int len - the length of the fragment sequence we are
 32 |                        aligning
 33 |    Returns: int - the depth in the substitution matrix for this position
 34 |             in the fragment: row itself while row < PSSM_DEPTH, otherwise
 35 |             2*PSSM_DEPTH minus the number of positions after row when
 36 |             fewer than PSSM_DEPTH remain, otherwise PSSM_DEPTH
 37 |  */
 38 | int find_sm_depth(int row, int len) {
 39 |     int after; /* number of fragment positions following this row */
 40 | 
 41 |     if (row < PSSM_DEPTH)
 42 |         return row;
 43 | 
 44 |     after = len - (row + 1);
 45 |     return (after < PSSM_DEPTH) ? ((PSSM_DEPTH * 2) - after) : PSSM_DEPTH;
 46 | }
 47 | 
 48 | /* revcom_submat
 49 |    Takes a PSSMP (psm) pointer to a valid submat
 50 |    Makes a reverse complement of this submat: matrix d of the input
 51 |    becomes matrix (2*PSSM_DEPTH - d) of the output, and both base
 52 |    indices are complemented (0<->3, 1<->2, 4 stays 4)
 53 |    Returns a pointer to the newly allocated submat, or NULL if
 54 |    psm is NULL or the allocation fails
 55 |  */
 56 | PSSMP revcom_submat(PSSMP psm) {
 57 |     /* index of the complementary base; 4 maps to itself */
 58 |     static const int comp[5] = {3, 2, 1, 0, 4};
 59 |     int d, rcd, ref, frag;
 60 |     PSSMP rcpsm;
 61 | 
 62 |     if (psm == NULL) {
 63 |         return NULL;
 64 |     }
 65 |     rcpsm = (PSSMP) save_malloc(sizeof (PSSM));
 66 |     if (rcpsm == NULL) {
 67 |         return NULL; /* the result used to be dereferenced unchecked */
 68 |     }
 69 |     rcpsm->depth = PSSM_DEPTH;
 70 | 
 71 |     for (d = 0; d <= (PSSM_DEPTH * 2); d++) {
 72 |         rcd = (PSSM_DEPTH * 2) - d; /* start and end swap places */
 73 |         for (ref = 0; ref < 5; ref++) {
 74 |             for (frag = 0; frag < 5; frag++) {
 75 |                 rcpsm->sm[rcd][ref][frag] = psm->sm[d][comp[ref]][comp[frag]];
 76 |             }
 77 |         }
 78 |     }
 79 |     return rcpsm;
 80 | }
 94 | 
 95 | /* Builds the hardcoded flat substitution matrix: at every depth a
 96 |    match between the four regular bases scores FLAT_MATCH, a mismatch
 97 |    FLAT_MISMATCH, an N in the fragment N_SCORE and an N in the
 98 |    reference NR_SCORE (which wins when both are N).
 99 |    Returns a newly allocated PSSMP, or NULL if the allocation fails */
100 | PSSMP init_flatsubmat(void) {
101 |     PSSMP flatsubmat;
102 |     int cur_pos, base, other_base;
103 | 
104 |     flatsubmat = (PSSMP) save_malloc(sizeof (PSSM));
105 |     if (flatsubmat == NULL) {
106 |         return NULL; /* the result used to be dereferenced unchecked */
107 |     }
108 |     flatsubmat->depth = PSSM_DEPTH;
109 | 
110 |     for (cur_pos = 0; cur_pos <= (2 * PSSM_DEPTH); cur_pos++) {
111 |         for (base = 0; base <= 3; base++) {
112 |             for (other_base = 0; other_base <= 3; other_base++) {
113 |                 flatsubmat->sm[cur_pos][base][other_base] =
114 |                     (base == other_base) ? FLAT_MATCH : FLAT_MISMATCH;
115 |             }
116 |             // last column is for N
117 |             flatsubmat->sm[cur_pos][base][4] = N_SCORE;
118 |         }
119 |         /* Special score, NR_SCORE, for reference having an N */
120 |         for (other_base = 0; other_base <= 4; other_base++) {
121 |             flatsubmat->sm[cur_pos][4][other_base] = NR_SCORE;
122 |         }
123 |     }
124 | 
125 |     return flatsubmat;
126 | }
127 | 
128 | 
129 | 


--------------------------------------------------------------------------------
/src/pssm.h:
--------------------------------------------------------------------------------
 1 | /* 
 2 |  * File:   pssm.h
 3 |  * Author: TCO
 4 |  *
 5 |  * Created on 3. Februar 2009, 13:14
 6 |  */
 7 | 
 8 | #ifndef _PSSM_H
 9 | #define	_PSSM_H
10 | 
11 | #ifdef	__cplusplus
12 | extern "C" {
13 | #endif
14 | 
15 | #include "types.h"
16 | 
17 |     /* revcom_submat
18 |        Takes a PSSMP (sm) pointer to a valid submat
19 |        Makes a reverse complement of this submat
20 |        Returns a pointer to the new submat
21 |      */
22 |     PSSMP revcom_submat(PSSMP psm);
23 | 
24 |     /* Reads in this hardcoded flat substitution matrix */
25 |     PSSMP init_flatsubmat(void);
26 | 
27 |     /* s1i is the matrix index of the reference base
28 |        s2i is the matrix index of the fragment (ancient) base */
29 |     int sub_mat_score(const short int s1i,
30 |             const short int s2i,
31 |             int sm[][5][5],
32 |             const int row,
33 |             const int len);
34 | 
35 | 
36 |     /* find_sm_depth
37 |        Args: (1) int row - the current row we're on for alignment, i.e.
38 |                            the position in the fragment sequence
39 |              (2) int len - the length of the fragment sequence we are
40 |                            aligning
41 |        Returns: int - the depth in the substitution matrix for this position
42 |                 in the fragment
43 |      */
44 |     int find_sm_depth(int row, int len);
45 | 
46 | 
47 | #ifdef	__cplusplus
48 | }
49 | #endif
50 | 
51 | #endif	/* _PSSM_H */
52 | 
53 | 


--------------------------------------------------------------------------------
/src/types.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * File:   types.h
  3 |  * Author: 
  4 |  *
  5 |  * Created on 16. Januar 2009, 14:59
  6 |  */
  7 | 
  8 | #ifndef _TYPES_H
  9 | #define	_TYPES_H
 10 | 
 11 | #include "params.h"
 12 | #include <stdlib.h>
 13 | #include <ctype.h>
 14 | 
 15 | 
 16 | #define save_malloc malloc
 17 | 
 18 | #ifdef	__cplusplus
 19 | extern "C" {
 20 | #endif
 21 | 
 22 | 
 23 | 
 24 | 
 25 | /*
 26 |   Define PWAlnFrag as a struct pw_aln_frag (PWAlnFragP is a pointer to it).
 27 |   This is a structure for holding an alignment pair.
 28 |   One of the pair is an aligned fragment of the
 29 |   reference sequence. The other is a sequence fragment
 30 |   aligned to it.
 31 |   This is used as a temporary holding place for alignments
 32 |   until they're merged into the big MapAlignment.
 33 |   There is no functionality to grow this struct, so it should
 34 |   be as big as necessary from the beginning: both sequence
 35 |   buffers are fixed at (2*INIT_ALN_SEQ_LEN)+1 chars.
 36 | */
 37 | typedef struct pw_aln_frag {
 38 |   char ref_id[MAX_ID_LEN + 1];
 39 |   char ref_desc[MAX_DESC_LEN + 1];
 40 |   char frag_id[MAX_ID_LEN + 1];
 41 |   char frag_desc[MAX_DESC_LEN + 1];
 42 |   char ref_seq[(2*INIT_ALN_SEQ_LEN)+1];
 43 |   char frag_seq[(2*INIT_ALN_SEQ_LEN)+1];
 44 |   int start;
 45 |   int end;
 46 |   int revcom;
 47 |   int trimmed;
 48 |   int score;
 49 |   char segment; // f=front, a=all, b=back, n=not applicable
 50 |   int num_inputs; // for collapsed sequences, the number of input seqs
 51 |   int offset; // for segment='b', number of bases not shown that
 52 |               // were in the front fragment
 53 | } PWAlnFrag;
 54 | typedef struct pw_aln_frag* PWAlnFragP;
 55 | 
 56 | /*
 57 |    Define AlnSeq as a struct alnseq
 58 |    This is simply a structure for holding a string of aligned sequence.
 59 |    NOTE(review): the 1-bit flags are plain char bit-fields, so a set flag
 60 |    may read back as -1 where char is signed; test them for non-zero */
 61 | typedef struct alnseq {
 62 |   char id[MAX_ID_LEN + 1]; // the ID of the sequence
 63 |   char desc[MAX_DESC_LEN + 1]; // the description of the sequence
 64 |   char seq[ (2*INIT_ALN_SEQ_LEN) + 1];  // the sequence string
 65 |   char smp[ (2*INIT_ALN_SEQ_LEN) + 1];  // code for substitution matrix depth
 66 |   char* ins[ (2*INIT_ALN_SEQ_LEN) + 1]; // array of pointers to char
 67 |   // that will be filled with inserted sequence
 68 |   int start;  // where this sequence starts relative to the reference (0-indexed)
 69 |   int end;    // where this sequence ends relative to the reference (0-indexed)
 70 |   int score;  // the alignment score for this sequence
 71 |   int num_inputs; // the number of input seqs if this is a collapsed seq
 72 |   char segment; // f=front, a=all, b=back, n=not applicable
 73 |   char revcom  : 1; // indicates that this sequence has been reverse complemented
 74 |   char trimmed : 1; // indicates that this sequence has been trimmed
 75 |   char dropped : 1; // indicates that this sequence has been dropped from the consensus
 76 | } AlnSeq;
 77 | // pointer to struct alnseq
 78 | typedef struct alnseq* AlnSeqP;
 79 | 
 80 | /*
 81 |   Define RefSeq (struct refseq) and RefSeqP (pointer to it): the
 82 |   reference sequence against which all the fragments have been aligned.
 83 | */
 84 | typedef struct refseq {
 85 |   char id[MAX_ID_LEN + 1]; // the ID of the reference sequence
 86 |   char desc[MAX_DESC_LEN + 1]; // the description of the sequence
 87 |   char* seq;                   // the sequence as a string
 88 |   char* rcseq;                 // the reverse complement as a string
 89 |   int seq_len;                 // the length of the sequence
 90 |   int size;                    // size of char array for this aligned sequence
 91 |   int* gaps;               // array giving the size of the longest gap
 92 |                            // that has been introduced at each position
 93 |   int circular;            // Boolean to denote circular sequence
 94 |   int wrap_seq_len;        // length of sequence with extra wrapped bit;
 95 |   // seq_len remains the actual length of the sequence
 96 | } RefSeq;
 97 | // pointer to struct refseq
 98 | typedef struct refseq* RefSeqP;
 99 | 
100 | /* Define QSumSeq (struct qsumseq) and QSSP (pointer to it): one array of sums per base A, C, G, T; presumably per-position quality sums - confirm */
101 | typedef struct qsumseq {
102 |   unsigned int Aqualsum[INIT_ALN_SEQ_LEN+1];
103 |   unsigned int Cqualsum[INIT_ALN_SEQ_LEN+1];
104 |   unsigned int Gqualsum[INIT_ALN_SEQ_LEN+1];
105 |   unsigned int Tqualsum[INIT_ALN_SEQ_LEN+1];
106 | } QSumSeq;
107 | typedef struct qsumseq* QSSP;
108 | 
109 | /* Define FragSeq and FragSeqP to hold a simple sequence; seq and qual are fixed at INIT_ALN_SEQ_LEN+1 chars */
110 | typedef struct fragseq {
111 |   char id[MAX_ID_LEN + 1];
112 |   char desc[MAX_DESC_LEN + 1];
113 |   char seq[INIT_ALN_SEQ_LEN+1];
114 |   char qual[INIT_ALN_SEQ_LEN+1];
115 |   QSSP qss; // pointer to a QSumSeq struct that may be needed for collapsing
116 |   int qual_sum;
117 |   int trim_point; // 0-indexed position of last base before adapter
118 |   int trimmed; // Boolean, TRUE means sequence should be trimmed to trim_point
119 |   int seq_len;
120 |   int strand_known;  // Boolean, TRUE means the alignment strand of this
121 |   // sequence has been learned by virtue of a positive scoring alignment
122 |   int rc; // Boolean, TRUE means this is the reverse complement
123 |   int as; // 0-indexed start point of alignment on current ref
124 |   int ae; // 0-indexed end point of alignment on current ref
125 |   int score; // current score of alignment on reference
126 |   AlnSeqP front_asp; // pointer to where I can find the front AlnSeq
127 |   AlnSeqP back_asp;  // pointer to where I can find the back AlnSeq
128 |   //                   (if applicable, otherwise NULL)
129 |   int unique_best;   // boolean; TRUE means unique & best score
130 |   //                    for repeat filtering
131 |   int num_inputs; // number of sequences collapsed into this one
132 | } FragSeq;
133 | typedef struct fragseq* FragSeqP;
134 | 
135 | /* Define FragSeqDB (struct fragseqdb) and FSDB (pointer to it) to hold a database of FragSeqs */
136 | typedef struct fragseqdb {
137 |   FragSeqP* fss; // Pointer to array of FragSeqP (pointers to FragSeq)
138 |   int       trim_sort; // Use extra information about whether sequence is trimmed when
139 |                        // determining whether a sequence is unique
140 |   size_t    size; // Current size of array pointed to by fss
141 |   size_t    num_fss; // Current number of FragSeqs in fss
142 | } FragSeqDB;
143 | typedef struct fragseqdb* FSDB;
144 | 
145 | /* Define PSSM as an array of position specific substitution
146 |    matrices to be used at different points in an alignment.
147 |    The first PSSM_DEPTH matrices (0..PSSM_DEPTH-1) are for the
148 |    beginning of the alignment. The matrix at PSSM_DEPTH is for the
149 |    middle. The matrices at PSSM_DEPTH+1..2*PSSM_DEPTH are for the end
150 |    of the alignment. The first index selects the matrix.
151 |    The second value is for the reference base. The third value
152 |    is for the ancient base.
153 |    A=0, C=1, G=2, T=3, anything else=4
154 | */
155 | typedef struct pssm {
156 |   int sm[2*PSSM_DEPTH+1][5][5];
157 |   int depth;
158 | } PSSM;
159 | typedef struct pssm* PSSMP;
160 | 
161 | /* Define DPE to be an element of a dynamic programming
162 |    matrix. Each element remembers its best score and
163 |    keeps a code for where it came from (DPEP is a pointer to a DPE) */
164 | typedef struct dpe {
165 |   int score; // best score at this position
166 |   int trace; // code for previous aligned position that
167 |   // gave the best score; 0 => diagonal;
168 |   // pos. number => column number
169 |   // neg. number => row number
170 |   // If number == current row, this is the beginning
171 |   // of the alignment
172 | } DPE;
173 | typedef struct dpe* DPEP;
174 | 
175 | typedef struct dpm { // dynamic programming matrix; DPMP is a pointer to it
176 |   DPEP* mat; // array of DPEP (pointers to DPE); presumably one per row - confirm
177 |   int rows;  // presumably the number of rows in mat - confirm
178 |   int cols;  // presumably the number of columns in mat - confirm
179 | } Mat;
180 | typedef struct dpm* DPMP;
181 | 
182 | 
183 | typedef struct map_alignment {
184 |   RefSeqP ref;       // The reference sequence to which everything is mapped
185 |   PSSMP fpsm;        // The PSSMP set of + strand matrices for aligning and consensus
186 |   PSSMP rpsm;        // The PSSMP set of - strand matrices for aligning and consensus
187 |   int num_aln_seqs;  // Number of sequences in this alignment
188 |   int size;          // Length of AlnSeqArray
189 |   int cons_code;     // Code for scheme for determining the consensus base
190 |                      //    1 => only majority rule consensus
191 |                      //    2 => (unique) plurality rule consensus
192 |   int distant_ref;   // initial reference sequence is distantly related
193 |   AlnSeqP* AlnSeqArray; // array of AlnSeqP (pointers to AlnSeq)
194 | } MapAlignment;
195 | // pointer to struct map_alignment
196 | typedef struct map_alignment* MapAlignmentP;
197 | 
198 | typedef struct base_counts { // per-base tallies; BaseCountsP is a pointer to it
199 |   int As;     // presumably the number of A bases counted - confirm
200 |   int scoreA; // presumably the accumulated score for A - confirm
201 |   int Cs;
202 |   int scoreC;
203 |   int Gs;
204 |   int scoreG;
205 |   int Ts;
206 |   int scoreT;
207 |   int gaps;   // presumably the number of gaps counted - confirm
208 |   int cov;    // presumably the coverage - confirm
209 |   double frac_agree; // fraction of bases that agree with the most
210 |                      // common base
211 | } BaseCounts;
212 | typedef struct base_counts* BaseCountsP;
213 | 
214 | typedef struct alignment {
215 |   const char* seq1; // reference sequence
216 |   const char* seq2; // fragment sequence
217 |   short int* s1c; // array of submat lookup indices for s1 - must be
218 |             // dynamically allocated
219 |   short int s2c[INIT_ALN_SEQ_LEN]; // code for submat lookup for
220 |                              // sequence2, which cannot be longer
221 |   int len1;   // length of reference sequence
222 |   int len2;   // length of fragment sequence
223 |   unsigned char* align_mask; // 0 => alignment cannot be here;
224 |                              // 1 => alignment can go through here
225 | 
226 |   PSSMP submat;  // position substitution matrices
227 |   int gop;    // gap open penalty
228 |   int gep;    // gap extension penalty
229 |   int hp;     // Boolean, TRUE = special discount for homopolymer
230 |               // associated gaps
231 |   int* hpcl;  // array of lengths of hps for each seq1 position
232 |   int* hpcs;  // array of starts of hps for each seq1 position
233 |   int* hprl;  // array of lengths of hps for each seq2 position
234 |   int* hprs;  // array of starts of hps for each seq2 position
235 |   DPMP m;     // pointer to struct dpm, dynamic prog. matrix
236 |   int* best_gap_row; // array of current best row to gap to,
237 |   //                     useful during dynamic programming
238 |   int best_gap_col; // keeps column number of current best-
239 |   //                   scoring gap column
240 |   int sg5;    // Boolean, TRUE = do semiglobal alignment at 5' end of
241 |   //                             seq2 (pay penalty for unaligned)
242 |   //                      FALSE = local alignment at 5' end
243 |   int sg3;    // Boolean, TRUE = do semiglobal alignment at 3' end of
244 |   //                             seq2 (pay penalty for unaligned)
245 |   //                      FALSE = local alignment at 3' end
246 |   int rc;     // Boolean, TRUE => this is an alignment to the
247 |   // reverse complement, FALSE => forward strand alignment
248 |   int abc; // alignment beginning column
249 |   int abr; // alignment beginning row
250 |   int aec; // alignment ending column
251 |   int aer; // alignment ending row
252 |   int best_score; // score at m->mat[aer][aec], i.e., the best score
253 | } Alignment;
254 | typedef struct alignment* AlignmentP;
255 | 
256 | typedef struct ids_list { // list of ID strings; IDsListP is a pointer to it
257 |   int num_ids; // presumably the number of entries in ids - confirm
258 |   int sorted;  // presumably a Boolean, TRUE => ids are in order - confirm
259 |   int size;    // presumably the allocated length of ids - confirm
260 |   char** ids;  // array of pointers to char
261 | } IDsList;
262 | typedef struct ids_list* IDsListP;
263 | 
264 | typedef struct kmers { // list of kmer strings; KmersP is a pointer to it
265 |   int num_kmers; // presumably the number of entries in kmers - confirm
266 |   int kmer_len;  // presumably the length of each kmer - confirm
267 |   int sorted;    // presumably a Boolean, TRUE => kmers are in order - confirm
268 |   int size;      // presumably the allocated length of kmers - confirm
269 |   char** kmers;  // array of pointers to char
270 | } Kmers;
271 | typedef struct kmers* KmersP;
272 | 
273 | typedef struct kmer_pos_list { // KPL is a pointer to this struct
274 |   size_t num_pos; // current number of known positions for this kmer
275 |   int    sorted;  // boolean; TRUE => positions are in order
276 |   unsigned int positions[MAX_KMER_POS]; // list of known positions
277 |                                         // for this kmer; room for MAX_KMER_POS
278 | } KmerPosList;
279 | typedef struct kmer_pos_list* KPL;
280 | 
281 | 
282 | 
283 | 
284 | 
285 | #ifdef	__cplusplus
286 | }
287 | #endif
288 | 
289 | #endif	/* _TYPES_H */
290 | 
291 | 


--------------------------------------------------------------------------------
/test/mia_testsuite.c:
--------------------------------------------------------------------------------
 1 | /* 
 2 |  * File:   mia_testsuite.c
 3 |  * Author: michael_siebauer
 4 |  *
 5 |  * Created on April 2, 2009, 1:13 PM
 6 |  */
 7 | 
 8 | #include <stdio.h>
 9 | #include <stdlib.h>
10 | #include <CUnit/Basic.h>
11 | #include "../src/mia.h"
12 | #include "../src/map_align.h"
13 | #include "../src/map_alignment.h"
14 | #include "../src/io.h"
15 | #include "../src/fsdb.h"
16 | 
17 |  RefSeqP ref_seq;
18 |  FragSeqP frag_seq;
19 |  FSDB frag_db;
20 | 
21 |     /* Returns the larger of i1 and i2. */
22 |     int maxi(int i1, int i2)
23 |     {
24 |       return (i1 <= i2) ? i2 : i1;
25 |     }
26 | 
27 |     void test_maxi(void)
28 |     { // CUnit test case: checks maxi() on three argument pairs
29 |       CU_ASSERT(maxi(0,2) == 2);   // second argument is larger
30 |       CU_ASSERT(maxi(0,-2) == 0);  // first argument is larger, other is negative
31 |       CU_ASSERT(maxi(2,2) == 2);   // equal arguments
32 |     }
33 |     /* Suite setup: allocate the fixtures, read the reference from
34 |        tr1.fna and print the ID of every record in tf.fna.
35 |        Returns EXIT_SUCCESS, or EXIT_FAILURE on any error. */
36 |     int init_testsuite(void){
37 |         /* Size the buffers by the structs; sizeof(RefSeqP) and
38 |            sizeof(FragSeqP) are only the size of a pointer. */
39 |         ref_seq = (RefSeqP)calloc(1, sizeof(RefSeq));
40 |         frag_seq = (FragSeqP)calloc(1, sizeof(FragSeq));
41 |         frag_db = init_FSDB();
42 |         if (ref_seq == NULL || frag_seq == NULL)
43 |             return EXIT_FAILURE;
44 |         // read in our test reference sequence
45 |         if (read_fasta_ref(ref_seq, "tr1.fna") != 1)
46 |             return EXIT_FAILURE;
47 | 
48 |         FILE* frag_file = fileOpen("tf.fna", "r"); /* NOTE(review): never closed */
49 |         if (frag_file == NULL)
50 |             return EXIT_FAILURE;
51 |         while (read_fasta(frag_file, frag_seq)){
52 |             printf("%s\n", frag_seq->id);
53 |         }
54 |         return EXIT_SUCCESS;
55 |     }
56 | 
57 |     int cleanup_testsuite(void){ /* suite teardown: free the global fixtures */
58 |         free(ref_seq);
59 |         free(frag_seq); /* allocated in init_testsuite, previously never freed */
60 |         free(frag_db);
61 |         return EXIT_SUCCESS;
62 |     }
63 | /*
64 |  * Registers the suite and its test with CUnit, runs them in verbose
65 |  * mode and returns the CUnit error code from CU_get_error(). */
66 | int main(int argc, char** argv) {
67 |     // handle for testsuite
68 |     CU_pSuite test = NULL;
69 | 
70 |     if (CUE_SUCCESS != CU_initialize_registry())
71 |         return CU_get_error();
72 | 
73 |     // create the suite
74 |     test = CU_add_suite("Testsuite 1", init_testsuite, cleanup_testsuite);
75 |     if (NULL == test) {
76 |       CU_cleanup_registry();
77 |       return CU_get_error();
78 |     }
79 | 
80 |     // add tests to suite; a failed registration is reported like a failed suite
81 |     if (NULL == CU_add_test(test, "Basic Test", test_maxi)) {
82 |       CU_cleanup_registry();
83 |       return CU_get_error();
84 |     }
85 | 
86 |     // Now Run all tests
87 |     CU_basic_set_mode(CU_BRM_VERBOSE);
88 |     CU_basic_run_tests();
89 |     CU_cleanup_registry();
90 |     return CU_get_error();
91 | }
91 | 
92 | 


--------------------------------------------------------------------------------
/test/tf.fna:
--------------------------------------------------------------------------------
 1 | >tf15-masked
 2 | ACCATGCATTTGGTATTTTCGTCTGGGGG
 3 | >tf12-really-long
 4 | ACGAAAATACCAAATGCATGGTGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTC
 5 | ACGAAAATACCAAATGCATGGTGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTC
 6 | ACGAAAATACCAAATGCATGGTGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTC
 7 | ACGAAAATACCAAATGCATGGTGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTC
 8 | ACGAAAATACCAAATGCATGGTGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTC
 9 | ACGAAAATACCAAATGCATGGTGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTC
10 | ACGAAAATACCAAATGCATGGTGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTC
11 | ACGAAAATACCAAATGCATGGTGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTC
12 | ACGAAAATACCAAATGCATGGTGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTC
13 | ACGAAAATACCAAATGCATGGTGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTC
14 | ACGAAAATACCAAATGCATGGTGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTC
15 | ACGAAAATACCAAATGCATGGTGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTC
16 | >tf1
17 | CGAGACGCTGGAGCCGGAGCACCCTATGTCGATCACAGGTCTATCACCCTATTAACCACT
18 | CACGGGA
19 | >tf1_redundant
20 | CGAGACGCTGGAGCCGGAGCACCCTATGTCGATCACAGGTCTATCACCCTATTAACCACT
21 | CACGGGA
22 | >tf1_redundant_worse
23 | CGAGACGCTGGAGCCGGAGCACCTATGTCGATCACAGGTCTATCACCCTATTAACCACTC
24 | ACGGGA
25 | >tf2
26 | TTACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCACCATGCATTTGGTATT
27 | >tf3 
28 | TTAACCACTCACGGGAGCTCACCATGCATTTGGTATTT
29 | >tf4 
30 | ACCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGG
31 | AGCCGGAGCAC
32 | >tf5
33 | TATGTCGATCACAGGTCTATCACCCTATTAACCACTCACGGGA
34 | >tf6
35 | GGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGATCAC
36 | >tf7_redundant_worse
37 | CTCCAGCGTCTCGCAATGCTATCGCGTGCACGCCCCCAGACGAAAATACCAAATG
38 | >tf7
39 | CTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCAGACGAAAATACCAAATG
40 | >tf8
41 | GTCTGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGTCGGAGCACCCTAT
42 | >tf9
43 | AATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCACCATGCATTGGTATT
44 | >tf10
45 | GGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCTAT
46 | >tf10-wadapt
47 | GGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCTATGTCAGACACGCAA
48 | CAGGGGATAGGCAAGGCACACAGGGGATAGG
49 | >tf10-shortuntr
50 | GGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCT
51 | >tf10-shorttrimmed
52 | GGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACGTCAGACACGCAACAGG
53 | GGATAGGCAAGGCACACAGGGGATAGG
54 | >t-unrelated
55 | CTAGATGTCGTCGTAGATGTCGTCGTAGAAGGC
56 | >tf11
57 | ACGAAAATACCAAATGCATGGTGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCT
58 | >tf11-adapt
59 | ACGAAAATACCAAATGCATGGTGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTC
60 | AGACACGCAACAGGGGATAGGCAAGGCACACAGGGGATAGG
61 | >tf11-shorter-adapt
62 | ACGAAAATACCAAATGCATGGTGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCGTCA
63 | GACACGCAACAGGGGATAGGCAAGGCACACAGGGGATAGG
64 | >tf11-shorter
65 | ACGAAAATACCAAATGCATGGTGAGCTCCCGTGAGTGGTTAATAGGGTGATAGA
66 | >tf13-really-short
67 | ACG
68 | >tf14-long-and-wrappy
69 | ACGTGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTA
70 | TTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTA
71 | TGTCCGTG
72 | 
73 | 


--------------------------------------------------------------------------------
/test/tf2.fna:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mpieva/mapping-iterative-assembler/5a7fb5afad735da7b8297381648049985c599874/test/tf2.fna


--------------------------------------------------------------------------------
/test/tr1.fna:
--------------------------------------------------------------------------------
1 | >tr1
2 | GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCtccatgcatttggtatttt
3 | cgtctggggggtgtGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTC
4 | 


--------------------------------------------------------------------------------