├── .clang-format
├── .gitignore
├── .travis.yml
├── COPYING
├── INSTALL
├── Makefile.am
├── README.md
├── andi-manual.pdf
├── configure.ac
├── docs
    ├── Doxyfile
    ├── Makefile.am
    ├── andi.1.in
    └── manual
    │   ├── andi-manual.tex
    │   ├── andi_labels.pdf
    │   ├── references.bib
    │   └── version.tex.in
├── libs
    ├── Makefile.am
    ├── pfasta.c
    └── pfasta.h
├── m4
    └── ax_cxx_compile_stdcxx_11.m4
├── opt
    ├── Makefile.am
    ├── compat-stdlib.h
    ├── compat-string.h
    ├── reallocarray.c
    └── strchrnul.c
├── scripts
    ├── _andi
    ├── failed.zsh
    ├── maf2phy.awk
    └── vmatch.sh
├── src
    ├── Makefile.am
    ├── andi.c
    ├── dist_hack.h
    ├── esa.c
    ├── esa.h
    ├── global.h
    ├── io.c
    ├── io.h
    ├── model.c
    ├── model.h
    ├── process.c
    ├── process.h
    ├── sequence.c
    └── sequence.h
└── test
    ├── Makefile.am
    ├── low_homo.sh
    ├── nan.sh
    ├── test_esa.c
    ├── test_extra.sh
    ├── test_fasta.cxx
    ├── test_join.sh
    ├── test_process.c
    ├── test_random.sh
    └── test_seq.c


/.clang-format:
--------------------------------------------------------------------------------
 1 | BasedOnStyle: LLVM
 2 | IndentWidth: 4
 3 | TabWidth: 4
 4 | UseTab: Always
 5 | AllowShortIfStatementsOnASingleLine: true
 6 | AllowShortFunctionsOnASingleLine: false
 7 | IndentCaseLabels: true
 8 | AllowShortCaseLabelsOnASingleLine: true
 9 | BreakBeforeBraces: Attach
10 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Binary and automatically generated files
 2 | *.o
 3 | *.a
 4 | andi
 5 | andi_*
 6 | randomSeed.dat
 7 | seedms
 8 | testRMQ
 9 | src/config.h
10 | src/stamp-h1
11 | src/config.hin
12 | src/config.hin~
13 | 
14 | #docs
15 | docs/doxygen_sqlite3.db
16 | docs/html/*
17 | docs/latex/*
18 | docs/andi.1
19 | *.aux
20 | *.auxlock
21 | *.dep
22 | *.dpth
23 | *.toc
24 | *.out
25 | *.pdf
26 | *.backup
27 | *.bbl
28 | *.blg
29 | !andi-manual.pdf
30 | 
31 | *.in
32 | !docs/andi.1.in
33 | !docs/manual/version.tex.in
34 | docs/manual/version.tex
35 | *.log
36 | **/Makefile
37 | configure.scan
38 | config.status
39 | depcomp
40 | install-sh
41 | aclocal.m4
42 | **/.deps/
43 | autom4te.cache/
44 | README
45 | ChangeLog
46 | missing
47 | compile
48 | configure
49 | ar-lib
50 | src/.dirstamp
51 | 
52 | # test files
53 | *.fasta
54 | cachegrind*
55 | callgrind*
56 | test.trs
57 | test-driver
58 | test_esa
59 | test_seq
60 | test_fasta
61 | test_process
62 | *.trs
63 | 
64 | # Coverage
65 | *.gcda
66 | *.gcno
67 | *.gcov
68 | 
69 | 
70 | andi.sublime-*
71 | 
72 | # for legacy git only:
73 | libs/RMQ/.deps/
74 | libs/RMQ/Makefile
75 | Makefile
76 | 
77 | # Profiling:
78 | gmon.out
79 | profile
80 | 
81 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: cpp
 2 | compiler:
 3 |   - gcc
 4 |   - clang
 5 | arch:
 6 |   - amd64
 7 |   - ppc64le
 8 | sudo: false
 9 | addons:
10 |   apt:
11 |     sources:
12 |       - deadsnakes
13 |       - ubuntu-toolchain-r-test
14 |     packages:
15 |       - cmake
16 |       - libglib2.0-dev
17 |       - libgsl0-dev
18 | 
19 | install:
20 |   - export LIBDIVDIR="$HOME/libdivsufsort"
21 |   - pip install --user cpp-coveralls
22 |   - wget https://github.com/y-256/libdivsufsort/archive/master.tar.gz
23 |   - tar -xzvf master.tar.gz
24 |   - cd libdivsufsort-master && mkdir build && cd build
25 |   - cmake -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="$LIBDIVDIR" ..
26 |   - make && make install
27 | 
28 | script:
29 | - CONFIGURE_FLAGS=""
30 | - export LD_LIBRARY_PATH="$LIBDIVDIR:$LIBDIVDIR/lib"
31 | - export LIBRARY_PATH="$LIBDIVDIR:$LIBRARY_PATH"
32 | - cd $TRAVIS_BUILD_DIR
33 | - autoreconf -fvi -Im4
34 | - export MYFLAGS="-fprofile-arcs -ftest-coverage -I$LIBDIVDIR/include"
35 | - if [ "${CC}" = "clang" ]; then export CONFIGURE_FLAGS="--disable-openmp"; fi
36 | - ./configure $CONFIGURE_FLAGS --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
37 | - make
38 | - make check || (cat ./test-suite.log; false)  # dump the log on failure, but keep the step failing
39 | - export MYFLAGS="-I$LIBDIVDIR/include"
40 | - ./configure $CONFIGURE_FLAGS --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
41 | - make distcheck DISTCHECK_CONFIGURE_FLAGS="LDFLAGS=\"-L$LIBDIVDIR/lib\" CFLAGS=\"-I$LIBDIVDIR/include\" CXXFLAGS=\"-I$LIBDIVDIR/include\" $CONFIGURE_FLAGS"
42 | after_success:
43 | - if [ "$CXX" = "g++" ]; then coveralls --exclude libdivsufsort-master -E '^andi-.*' --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp'; fi
44 | 


--------------------------------------------------------------------------------
/INSTALL:
--------------------------------------------------------------------------------
  1 | Installation Instructions
  2 | *************************
  3 | 
  4 | Copyright (C) 1994-1996, 1999-2002, 2004-2011 Free Software Foundation,
  5 | Inc.
  6 | 
  7 |    Copying and distribution of this file, with or without modification,
  8 | are permitted in any medium without royalty provided the copyright
  9 | notice and this notice are preserved.  This file is offered as-is,
 10 | without warranty of any kind.
 11 | 
 12 | Basic Installation
 13 | ==================
 14 | 
 15 |    Briefly, the shell commands `./configure; make; make install' should
 16 | configure, build, and install this package.  The following
 17 | more-detailed instructions are generic; see the `README' file for
 18 | instructions specific to this package.  Some packages provide this
 19 | `INSTALL' file but do not implement all of the features documented
 20 | below.  The lack of an optional feature in a given package is not
 21 | necessarily a bug.  More recommendations for GNU packages can be found
 22 | in *note Makefile Conventions: (standards)Makefile Conventions.
 23 | 
 24 |    The `configure' shell script attempts to guess correct values for
 25 | various system-dependent variables used during compilation.  It uses
 26 | those values to create a `Makefile' in each directory of the package.
 27 | It may also create one or more `.h' files containing system-dependent
 28 | definitions.  Finally, it creates a shell script `config.status' that
 29 | you can run in the future to recreate the current configuration, and a
 30 | file `config.log' containing compiler output (useful mainly for
 31 | debugging `configure').
 32 | 
 33 |    It can also use an optional file (typically called `config.cache'
 34 | and enabled with `--cache-file=config.cache' or simply `-C') that saves
 35 | the results of its tests to speed up reconfiguring.  Caching is
 36 | disabled by default to prevent problems with accidental use of stale
 37 | cache files.
 38 | 
 39 |    If you need to do unusual things to compile the package, please try
 40 | to figure out how `configure' could check whether to do them, and mail
 41 | diffs or instructions to the address given in the `README' so they can
 42 | be considered for the next release.  If you are using the cache, and at
 43 | some point `config.cache' contains results you don't want to keep, you
 44 | may remove or edit it.
 45 | 
 46 |    The file `configure.ac' (or `configure.in') is used to create
 47 | `configure' by a program called `autoconf'.  You need `configure.ac' if
 48 | you want to change it or regenerate `configure' using a newer version
 49 | of `autoconf'.
 50 | 
 51 |    The simplest way to compile this package is:
 52 | 
 53 |   1. `cd' to the directory containing the package's source code and type
 54 |      `./configure' to configure the package for your system.
 55 | 
 56 |      Running `configure' might take a while.  While running, it prints
 57 |      some messages telling which features it is checking for.
 58 | 
 59 |   2. Type `make' to compile the package.
 60 | 
 61 |   3. Optionally, type `make check' to run any self-tests that come with
 62 |      the package, generally using the just-built uninstalled binaries.
 63 | 
 64 |   4. Type `make install' to install the programs and any data files and
 65 |      documentation.  When installing into a prefix owned by root, it is
 66 |      recommended that the package be configured and built as a regular
 67 |      user, and only the `make install' phase executed with root
 68 |      privileges.
 69 | 
 70 |   5. Optionally, type `make installcheck' to repeat any self-tests, but
 71 |      this time using the binaries in their final installed location.
 72 |      This target does not install anything.  Running this target as a
 73 |      regular user, particularly if the prior `make install' required
 74 |      root privileges, verifies that the installation completed
 75 |      correctly.
 76 | 
 77 |   6. You can remove the program binaries and object files from the
 78 |      source code directory by typing `make clean'.  To also remove the
 79 |      files that `configure' created (so you can compile the package for
 80 |      a different kind of computer), type `make distclean'.  There is
 81 |      also a `make maintainer-clean' target, but that is intended mainly
 82 |      for the package's developers.  If you use it, you may have to get
 83 |      all sorts of other programs in order to regenerate files that came
 84 |      with the distribution.
 85 | 
 86 |   7. Often, you can also type `make uninstall' to remove the installed
 87 |      files again.  In practice, not all packages have tested that
 88 |      uninstallation works correctly, even though it is required by the
 89 |      GNU Coding Standards.
 90 | 
 91 |   8. Some packages, particularly those that use Automake, provide `make
 92 |      distcheck', which can be used by developers to test that all other
 93 |      targets like `make install' and `make uninstall' work correctly.
 94 |      This target is generally not run by end users.
 95 | 
 96 | Compilers and Options
 97 | =====================
 98 | 
 99 |    Some systems require unusual options for compilation or linking that
100 | the `configure' script does not know about.  Run `./configure --help'
101 | for details on some of the pertinent environment variables.
102 | 
103 |    You can give `configure' initial values for configuration parameters
104 | by setting variables in the command line or in the environment.  Here
105 | is an example:
106 | 
107 |      ./configure CC=c99 CFLAGS=-g LIBS=-lposix
108 | 
109 |    *Note Defining Variables::, for more details.
110 | 
111 | Compiling For Multiple Architectures
112 | ====================================
113 | 
114 |    You can compile the package for more than one kind of computer at the
115 | same time, by placing the object files for each architecture in their
116 | own directory.  To do this, you can use GNU `make'.  `cd' to the
117 | directory where you want the object files and executables to go and run
118 | the `configure' script.  `configure' automatically checks for the
119 | source code in the directory that `configure' is in and in `..'.  This
120 | is known as a "VPATH" build.
121 | 
122 |    With a non-GNU `make', it is safer to compile the package for one
123 | architecture at a time in the source code directory.  After you have
124 | installed the package for one architecture, use `make distclean' before
125 | reconfiguring for another architecture.
126 | 
127 |    On MacOS X 10.5 and later systems, you can create libraries and
128 | executables that work on multiple system types--known as "fat" or
129 | "universal" binaries--by specifying multiple `-arch' options to the
130 | compiler but only a single `-arch' option to the preprocessor.  Like
131 | this:
132 | 
133 |      ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
134 |                  CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
135 |                  CPP="gcc -E" CXXCPP="g++ -E"
136 | 
137 |    This is not guaranteed to produce working output in all cases, you
138 | may have to build one architecture at a time and combine the results
139 | using the `lipo' tool if you have problems.
140 | 
141 | Installation Names
142 | ==================
143 | 
144 |    By default, `make install' installs the package's commands under
145 | `/usr/local/bin', include files under `/usr/local/include', etc.  You
146 | can specify an installation prefix other than `/usr/local' by giving
147 | `configure' the option `--prefix=PREFIX', where PREFIX must be an
148 | absolute file name.
149 | 
150 |    You can specify separate installation prefixes for
151 | architecture-specific files and architecture-independent files.  If you
152 | pass the option `--exec-prefix=PREFIX' to `configure', the package uses
153 | PREFIX as the prefix for installing programs and libraries.
154 | Documentation and other data files still use the regular prefix.
155 | 
156 |    In addition, if you use an unusual directory layout you can give
157 | options like `--bindir=DIR' to specify different values for particular
158 | kinds of files.  Run `configure --help' for a list of the directories
159 | you can set and what kinds of files go in them.  In general, the
160 | default for these options is expressed in terms of `${prefix}', so that
161 | specifying just `--prefix' will affect all of the other directory
162 | specifications that were not explicitly provided.
163 | 
164 |    The most portable way to affect installation locations is to pass the
165 | correct locations to `configure'; however, many packages provide one or
166 | both of the following shortcuts of passing variable assignments to the
167 | `make install' command line to change installation locations without
168 | having to reconfigure or recompile.
169 | 
170 |    The first method involves providing an override variable for each
171 | affected directory.  For example, `make install
172 | prefix=/alternate/directory' will choose an alternate location for all
173 | directory configuration variables that were expressed in terms of
174 | `${prefix}'.  Any directories that were specified during `configure',
175 | but not in terms of `${prefix}', must each be overridden at install
176 | time for the entire installation to be relocated.  The approach of
177 | makefile variable overrides for each directory variable is required by
178 | the GNU Coding Standards, and ideally causes no recompilation.
179 | However, some platforms have known limitations with the semantics of
180 | shared libraries that end up requiring recompilation when using this
181 | method, particularly noticeable in packages that use GNU Libtool.
182 | 
183 |    The second method involves providing the `DESTDIR' variable.  For
184 | example, `make install DESTDIR=/alternate/directory' will prepend
185 | `/alternate/directory' before all installation names.  The approach of
186 | `DESTDIR' overrides is not required by the GNU Coding Standards, and
187 | does not work on platforms that have drive letters.  On the other hand,
188 | it does better at avoiding recompilation issues, and works well even
189 | when some directory options were not specified in terms of `${prefix}'
190 | at `configure' time.
191 | 
192 | Optional Features
193 | =================
194 | 
195 |    If the package supports it, you can cause programs to be installed
196 | with an extra prefix or suffix on their names by giving `configure' the
197 | option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
198 | 
199 |    Some packages pay attention to `--enable-FEATURE' options to
200 | `configure', where FEATURE indicates an optional part of the package.
201 | They may also pay attention to `--with-PACKAGE' options, where PACKAGE
202 | is something like `gnu-as' or `x' (for the X Window System).  The
203 | `README' should mention any `--enable-' and `--with-' options that the
204 | package recognizes.
205 | 
206 |    For packages that use the X Window System, `configure' can usually
207 | find the X include and library files automatically, but if it doesn't,
208 | you can use the `configure' options `--x-includes=DIR' and
209 | `--x-libraries=DIR' to specify their locations.
210 | 
211 |    Some packages offer the ability to configure how verbose the
212 | execution of `make' will be.  For these packages, running `./configure
213 | --enable-silent-rules' sets the default to minimal output, which can be
214 | overridden with `make V=1'; while running `./configure
215 | --disable-silent-rules' sets the default to verbose, which can be
216 | overridden with `make V=0'.
217 | 
218 | Particular systems
219 | ==================
220 | 
221 |    On HP-UX, the default C compiler is not ANSI C compatible.  If GNU
222 | CC is not installed, it is recommended to use the following options in
223 | order to use an ANSI C compiler:
224 | 
225 |      ./configure CC="cc -Ae -D_XOPEN_SOURCE=500"
226 | 
227 | and if that doesn't work, install pre-built binaries of GCC for HP-UX.
228 | 
229 |    HP-UX `make' updates targets which have the same time stamps as
230 | their prerequisites, which makes it generally unusable when shipped
231 | generated files such as `configure' are involved.  Use GNU `make'
232 | instead.
233 | 
234 |    On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot
235 | parse its `<wchar.h>' header file.  The option `-nodtk' can be used as
236 | a workaround.  If GNU CC is not installed, it is therefore recommended
237 | to try
238 | 
239 |      ./configure CC="cc"
240 | 
241 | and if that doesn't work, try
242 | 
243 |      ./configure CC="cc -nodtk"
244 | 
245 |    On Solaris, don't put `/usr/ucb' early in your `PATH'.  This
246 | directory contains several dysfunctional programs; working variants of
247 | these programs are available in `/usr/bin'.  So, if you need `/usr/ucb'
248 | in your `PATH', put it _after_ `/usr/bin'.
249 | 
250 |    On Haiku, software installed for all users goes in `/boot/common',
251 | not `/usr/local'.  It is recommended to use the following options:
252 | 
253 |      ./configure --prefix=/boot/common
254 | 
255 | Specifying the System Type
256 | ==========================
257 | 
258 |    There may be some features `configure' cannot figure out
259 | automatically, but needs to determine by the type of machine the package
260 | will run on.  Usually, assuming the package is built to be run on the
261 | _same_ architectures, `configure' can figure that out, but if it prints
262 | a message saying it cannot guess the machine type, give it the
263 | `--build=TYPE' option.  TYPE can either be a short name for the system
264 | type, such as `sun4', or a canonical name which has the form:
265 | 
266 |      CPU-COMPANY-SYSTEM
267 | 
268 | where SYSTEM can have one of these forms:
269 | 
270 |      OS
271 |      KERNEL-OS
272 | 
273 |    See the file `config.sub' for the possible values of each field.  If
274 | `config.sub' isn't included in this package, then this package doesn't
275 | need to know the machine type.
276 | 
277 |    If you are _building_ compiler tools for cross-compiling, you should
278 | use the option `--target=TYPE' to select the type of system they will
279 | produce code for.
280 | 
281 |    If you want to _use_ a cross compiler, that generates code for a
282 | platform different from the build platform, you should specify the
283 | "host" platform (i.e., that on which the generated programs will
284 | eventually be run) with `--host=TYPE'.
285 | 
286 | Sharing Defaults
287 | ================
288 | 
289 |    If you want to set default values for `configure' scripts to share,
290 | you can create a site shell script called `config.site' that gives
291 | default values for variables like `CC', `cache_file', and `prefix'.
292 | `configure' looks for `PREFIX/share/config.site' if it exists, then
293 | `PREFIX/etc/config.site' if it exists.  Or, you can set the
294 | `CONFIG_SITE' environment variable to the location of the site script.
295 | A warning: not all `configure' scripts look for a site script.
296 | 
297 | Defining Variables
298 | ==================
299 | 
300 |    Variables not defined in a site shell script can be set in the
301 | environment passed to `configure'.  However, some packages may run
302 | configure again during the build, and the customized values of these
303 | variables may be lost.  In order to avoid this problem, you should set
304 | them in the `configure' command line, using `VAR=value'.  For example:
305 | 
306 |      ./configure CC=/usr/local2/bin/gcc
307 | 
308 | causes the specified `gcc' to be used as the C compiler (unless it is
309 | overridden in the site shell script).
310 | 
311 | Unfortunately, this technique does not work for `CONFIG_SHELL' due to
312 | an Autoconf bug.  Until the bug is fixed you can use this workaround:
313 | 
314 |      CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
315 | 
316 | `configure' Invocation
317 | ======================
318 | 
319 |    `configure' recognizes the following options to control how it
320 | operates.
321 | 
322 | `--help'
323 | `-h'
324 |      Print a summary of all of the options to `configure', and exit.
325 | 
326 | `--help=short'
327 | `--help=recursive'
328 |      Print a summary of the options unique to this package's
329 |      `configure', and exit.  The `short' variant lists options used
330 |      only in the top level, while the `recursive' variant lists options
331 |      also present in any nested packages.
332 | 
333 | `--version'
334 | `-V'
335 |      Print the version of Autoconf used to generate the `configure'
336 |      script, and exit.
337 | 
338 | `--cache-file=FILE'
339 |      Enable the cache: use and save the results of the tests in FILE,
340 |      traditionally `config.cache'.  FILE defaults to `/dev/null' to
341 |      disable caching.
342 | 
343 | `--config-cache'
344 | `-C'
345 |      Alias for `--cache-file=config.cache'.
346 | 
347 | `--quiet'
348 | `--silent'
349 | `-q'
350 |      Do not print messages saying which checks are being made.  To
351 |      suppress all normal output, redirect it to `/dev/null' (any error
352 |      messages will still be shown).
353 | 
354 | `--srcdir=DIR'
355 |      Look for the package's source code in directory DIR.  Usually
356 |      `configure' can determine that directory automatically.
357 | 
358 | `--prefix=DIR'
359 |      Use DIR as the installation prefix.  *note Installation Names::
360 |      for more details, including other options available for fine-tuning
361 |      the installation locations.
362 | 
363 | `--no-create'
364 | `-n'
365 |      Run the configure checks, but stop before creating any output
366 |      files.
367 | 
368 | `configure' also accepts some other, not widely useful, options.  Run
369 | `configure --help' for more details.
370 | 
371 | 


--------------------------------------------------------------------------------
/Makefile.am:
--------------------------------------------------------------------------------
 1 | ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4
 2 | AM_DISTCHECK_CONFIGURE_FLAGS="--enable-unit-tests"
 3 | 
 4 | .PHONY: all
 5 | 
 6 | SUBDIRS = . libs opt src docs
 7 | DIST_SUBDIRS = . libs opt src docs test
 8 | 
 9 | # Conditionally build the tests
10 | if BUILD_TESTS
11 | 
12 | SUBDIRS+= test
13 | 
14 | AM_TESTS_ENVIRONMENT= \
15 | 	RANDOM_SEED='@SEED@' ; export RANDOM_SEED ;
16 | 
17 | XFAIL_TESTS=
18 | TESTS = $(XFAIL_TESTS) test/nan.sh test/low_homo.sh test/test_esa test/test_seq test/test_extra.sh test/test_random.sh test/test_join.sh test/test_process
19 | 
20 | $(TESTS): src/andi
21 | 
22 | endif # BUILD_TESTS
23 | 
24 | 
25 | dist_noinst_DATA = ChangeLog README.md
26 | dist_pdf_DATA = andi-manual.pdf
27 | dist_noinst_SCRIPTS= scripts/maf2phy.awk scripts/vmatch.sh scripts/_andi
28 | 
29 | # Recreate the changelog when the version string changes. The placeholder is kept if this is not a git checkout or git is not installed.
30 | ChangeLog: configure.ac
31 | 	echo "Missing Git" > ChangeLog;
32 | 	if test -d $(srcdir)/.git && command -v git > /dev/null 2>&1; then \
33 | 		git log --stat --date=short --abbrev-commit | grep --invert-match '^ [[:alnum:].]' | git stripspace > ChangeLog; \
34 | 	fi
35 | 
36 | .PHONY: code-docs
37 | code-docs:
38 | 	cd docs && $(MAKE) code-docs;
39 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![Build Status](https://travis-ci.org/EvolBioInf/andi.svg?branch=master)](https://travis-ci.org/EvolBioInf/andi) [![Coverage Status](https://coveralls.io/repos/EvolBioInf/andi/badge.svg?branch=master)](https://coveralls.io/r/EvolBioInf/andi?branch=master)
 2 | 
 3 | # About
 4 | 
 5 | This is the `andi` program for estimating the evolutionary distance between closely related genomes. These distances can be used to rapidly infer phylogenies for big sets of genomes. Because `andi` does not compute full alignments, it is so efficient that it scales even up to thousands of bacterial genomes.
 6 | 
 7 | This readme covers all necessary instructions for the impatient to get `andi` up and running. For extensive instructions please consult the [manual](andi-manual.pdf).
 8 | 
 9 | 
10 | # Installation and Usage
11 | 
12 | Stable versions of `andi` are available via package managers. For manual installation see below.
13 | 
14 | For Debian and Ubuntu:
15 | 
16 |     sudo apt-get install andi
17 | 
18 | For macOS with Homebrew:
19 | 
20 |     brew tap brewsci/bio
21 |     brew install andi
22 | 
23 | For ArchLinux with aura:
24 | 
25 |     sudo aura -A andi
26 |     
27 | With a successful installation you can get the usage instructions via `--help` or the man page.
28 | 
29 |     $ andi --help
30 |     $ man andi
31 | 
32 | You can simply use `andi` with your genomes in `FASTA` format.
33 | 
34 |     $ andi S1.fasta S2.fasta
35 |     2
36 |     S1     0.0  0.1
37 |     s2     0.1  0.0
38 | 
39 | From this distance matrix the phylogeny can be inferred via neighbor-joining. Check the [manual](andi-manual.pdf) for a more thorough description.
40 | 
41 | 
42 | ## Manual installation
43 | 
44 | If your system does not support one of the above package managers you have to manually build the latest [stable release](https://github.com/EvolBioInf/andi/releases) from a tarball. See the [manual](andi-manual.pdf) for extensive building instructions.
45 | 
46 | This program has the following external dependencies: [libdivsufsort](https://github.com/y-256/libdivsufsort) and the [GSL](https://www.gnu.org/software/gsl/). Please make sure you have installed both before attempting a build. If you obtained the source straight from the git repository rather than as a tarball, you will also need the autotools.
47 | 
48 | Assuming you have installed all prerequisites, building is as easy as follows.
49 | 
50 |     $ autoreconf -fi -Im4  # optional when building from tarball
51 |     $ ./configure
52 |     $ make
53 |     $ make install
54 | 
55 | Extensive build instructions are located in `INSTALL`.
56 | 
57 | # Links and Additional Resources
58 | 
59 | The release of this software is accompanied by a paper from [Haubold et al.](http://bioinformatics.oxfordjournals.org/content/31/8/1169). It explains the used *anchor distance* strategy in great detail. The `maf2phy.awk` script used in the validation process is located under `scripts`. Simulations were done using our own [simK](http://guanine.evolbio.mpg.de/bioBox/) tool. For a demo visualising the internals of andi visit our [GitHub pages](http://evolbioinf.github.io/andi/).
60 | 
61 | ## Data Sets
62 | 
63 | 1. 29 E. coli and Shigella strains: [data](http://guanine.evolbio.mpg.de/andi/eco29.fasta.gz)
64 | 2. 109 E. coli ST131 strains ([paper](http://www.pnas.org/content/early/2014/03/28/1322678111.abstract)): 
65 | 	* [99 newly sequenced strains](https://github.com/BeatsonLab-MicrobialGenomics/ST131_99)
66 | 	* [10 previously published strains](http://guanine.evolbio.mpg.de/andi/st131_extra.tgz)
67 | 3. 3085 Streptococcus pneumoniae strains ([paper](http://www.nature.com/ng/journal/v46/n3/full/ng.2895.html)): ftp://ftp.sanger.ac.uk/pub/pathogens/Streptococcus/pneumoniae/Maela_assemblies.tgz
68 | 
69 | ## License
70 | 
71 | Copyright © 2014 - 2021 Fabian Klötzl  
72 | License GPLv3+: GNU GPL version 3 or later.
73 | 
74 | This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. The full license text is available at <http://gnu.org/licenses/gpl.html>.
75 | 
76 | Some files may be licensed differently.
77 | 
78 | ## Contact
79 | 
80 | In case of bugs or unexpected errors don't hesitate to send me a mail: kloetzl@evolbio.mpg.de
81 | 


--------------------------------------------------------------------------------
/andi-manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EvolBioInf/andi/390af15beb76badaf8f16864a885747aa60956c8/andi-manual.pdf


--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
  1 | AC_INIT([andi], [0.15-beta])
  2 | AM_INIT_AUTOMAKE([-Wall foreign])
  3 | 
  4 | AC_CONFIG_MACRO_DIR([m4])
  5 | 
  6 | AC_PROG_CC
  7 | AC_PROG_CXX
  8 | AC_PROG_MAKE_SET
  9 | AC_PROG_CPP
 10 | AC_PROG_RANLIB
 11 | m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
 12 | 
 13 | # Make sure the C++ programs are also compiled with OpenMP
 14 | AC_LANG(C++)
 15 | AC_OPENMP
 16 | 
 17 | # Execute all tests using C
 18 | AC_LANG(C)
 19 | AC_OPENMP
 20 | 
 21 | AC_CHECK_LIB([m],[cos])
 22 | AC_CHECK_LIB([gslcblas],[cblas_dgemm], [], [have_gsl=no])
 23 | AC_CHECK_LIB([gsl],[gsl_ran_binomial], [], [have_gsl=no])
 24 | 
 25 | AS_IF([test "x$have_gsl" = "xno"],[
 26 | 	AC_MSG_ERROR([Missing the Gnu Scientific Library.])
 27 | ])
 28 | 
 29 | # The libdivsufsort header contains some Microsoft extension making
 30 | # compilation fail on certain systems (i.e. OS X). Add the following
 31 | # flag so the build runs smoothly.
 32 | CPPFLAGS="$CPPFLAGS -fms-extensions"
 33 | AC_CHECK_HEADERS([divsufsort.h],[have_libdivsufsort=yes],[have_libdivsufsort=no])
 34 | AC_CHECK_LIB(divsufsort, divsufsort, [], [have_libdivsufsort=no])
 35 | 
 36 | AS_IF([test "x$have_libdivsufsort" = "xno"],[
 37 | 	AC_MSG_ERROR([Missing libdivsufsort.])
 38 | ])
 39 | 
 40 | 
 41 | # The unit tests require GLIB2. So by default do not build the test.
 42 | # If enabled, check for glib.
 43 | 
 44 | AC_ARG_ENABLE([unit-tests],
 45 | 	[AS_HELP_STRING([--enable-unit-tests],[build unit tests @<:@default: no@:>@])],
 46 | 	[try_unit_tests=${enableval}],[try_unit_tests=no]
 47 | 	)
 48 | 
 49 | AM_CONDITIONAL([BUILD_TESTS],[test "x${try_unit_tests}" = xyes])
 50 | 
 51 | # The user may set a seed for the unit tests, so that builds are reproducible.
 52 | # A value of 0 makes the tests random.
 53 | AC_ARG_WITH([seed],
 54 | 	[AS_HELP_STRING([--with-seed=INT],
 55 | 		[random seed for reproducible builds. @<:@default: 0@:>@])],
 56 | 	[SEED=$withval],
 57 | 	[SEED=0])
 58 | 
 59 | AC_SUBST([SEED])
 60 | 
 61 | AS_IF([test "x${try_unit_tests}" = xyes], [
 62 | 	have_glib=yes
 63 | 	PKG_CHECK_MODULES([GLIB], [glib-2.0], [], [have_glib=no])
 64 | 
 65 | 	if test "x${have_glib}" = xno; then
 66 | 		AC_MSG_ERROR([Missing Glib 2. Either install it or build without unit tests.])
 67 | 	fi
 68 | 
 69 | 	AX_CXX_COMPILE_STDCXX_11([],[mandatory])
 70 | ])
 71 | 
 72 | 
 73 | # Check for various headers including those used by libdivsufsort.
 74 | AC_CHECK_HEADERS([limits.h stdlib.h string.h unistd.h stdint.h inttypes.h err.h errno.h fcntl.h])
 75 | 
 76 | AC_C_INLINE
 77 | AC_TYPE_SIZE_T
 78 | AC_TYPE_SSIZE_T
 79 | AC_TYPE_INT32_T
 80 | AC_TYPE_UINT8_T
 81 | AC_HEADER_STDBOOL
 82 | 
 83 | # Until someone convinces me otherwise, I will deactivate the macros
 84 | # AC_FUNC_MALLOC and AC_FUNC_REALLOC. They only check if `malloc(0)` returns a
 85 | # non-null pointer. This breaks the build on systems using uClibc, including
 86 | # my laptop.
 87 | # As requesting zero bytes is not useful, and implementation-defined behaviour,
 88 | # it should be avoided in the first place. Thus I really don't need these checks.
 89 | 
 90 | AC_CHECK_FUNCS([floor pow sqrt strdup strerror])
 91 | AC_CHECK_FUNCS([strndup strcasecmp])
 92 | AC_CHECK_FUNCS([strchr strrchr strchrnul])
 93 | AC_CHECK_FUNCS([strtoul strtod])
 94 | AC_CHECK_FUNCS([reallocarray])
 95 | 
 96 | AM_CONDITIONAL([HAVE_REALLOCARRAY], [test "x$ac_cv_func_reallocarray" = xyes])
 97 | AM_CONDITIONAL([HAVE_STRCHRNUL], [test "x$ac_cv_func_strchrnul" = xyes])
 98 | 
 99 | AC_CONFIG_HEADERS([src/config.h:src/config.hin])
100 | 
101 | AC_CONFIG_FILES([
102 |  Makefile
103 |  docs/andi.1
104 |  docs/Makefile
105 |  libs/Makefile
106 |  opt/Makefile
107 |  src/Makefile
108 |  test/Makefile
109 | ])
110 | AC_OUTPUT
111 | 
112 | 


--------------------------------------------------------------------------------
/docs/Makefile.am:
--------------------------------------------------------------------------------
 1 | dist_man_MANS = andi.1
 2 | dist_noinst_DATA = Doxyfile
 3 | 
 4 | # I intentionally do not list any of the manual files here. I neither want them
 5 | # distributed nor installed. The reason is that building the manual requires
 6 | # LaTeX with a whole bunch of packages installed. Plus, so many things can go
 7 | # wrong when building that it is better to inspect the result. Thus, the manual
 8 | # has to be built by hand and copied to the right place for distribution.
 9 | 
10 | .PHONY: code-docs
11 | code-docs:
12 | 	doxygen
13 | # Substitute the package version for VERSION in the template (first match per line).
14 | manual/version.tex: manual/version.tex.in $(top_srcdir)/configure.ac
15 | 	sed "s/VERSION/$(VERSION)/" manual/version.tex.in > manual/version.tex
16 | # Never built automatically: this rule only prints an error and fails.
17 | manual/andi-manual.pdf: manual/andi-manual.tex manual/version.tex
18 | 	@echo "error: manual rebuild of the manual required (no pun intended)."
19 | 	@exit 1
20 | 
21 | # maintainer-clean-local:
22 | #	rm -f manual/*{aux,log,out,toc} manual/andi-manual.pdf
23 | 


--------------------------------------------------------------------------------
/docs/andi.1.in:
--------------------------------------------------------------------------------
 1 | .TH ANDI "1" "2020-01-09" "@VERSION@" "andi manual"
 2 | .SH NAME
 3 | andi \- estimates evolutionary distances
 4 | .SH SYNOPSIS
 5 | .B andi
 6 | [\fIOPTIONS...\fR] \fIFILES\fR...
 7 | .SH DESCRIPTION
 8 | \fBandi\fR estimates the evolutionary distance between closely related genomes. For this \fBandi\fR reads the input sequences from \fIFASTA\fR files and computes the pairwise anchor distance. The idea behind this is explained in a paper by Haubold et al. (2015).
 9 | .SH OUTPUT
10 | The output is a symmetrical distance matrix in \fIPHYLIP\fR format, with each entry representing divergence as a non-negative real number. A distance of zero means that two sequences are identical, whereas other values are estimates for the nucleotide substitution rate (Jukes-Cantor corrected). For technical reasons the comparison might fail, in which case no estimate can be computed and \fInan\fR is printed. This means that the input sequences were either too short (<200bp) or too diverse (K>0.5) for our method to work properly.
11 | .SH OPTIONS
12 | .TP
13 | \fB\-b\fR \fIINT\fR, \fB\-\-bootstrap\fR=\fIINT\fR
14 | Compute \fIINT\fR distance matrices, with \fIINT\fR\-1 of them bootstrapped from the first. See the paper Klötzl & Haubold (2016) for a detailed explanation.
15 | .TP
16 | \fB--file-of-filenames\fR=\fIFILE\fR
17 | Usually, \fBandi\fR is called with the filenames as commandline arguments. With this option the filenames may also be read from a file itself, with one name per line. Use a single dash (\fB'-'\fR) to read from stdin.
18 | .TP
19 | \fB\-j\fR, \fB\-\-join\fR
20 | Use this mode if each of your \fIFASTA\fR files represents one assembly with numerous contigs. \fBandi\fR will then treat all of the contained sequences per file as a single genome. In this mode at least one filename must be provided via command line arguments. For the output the filename is used to identify each sequence.
21 | .TP
22 | \fB\-l\fR, \fB\-\-low-memory\fR
23 | In multithreaded mode, \fBandi\fR requires memory linear in the number of threads. The low memory mode changes this to a constant demand independent of the number of threads used. Unfortunately, this comes at a significant runtime cost.
24 | .TP
25 | \fB\-m\fR \fIMODEL\fR, \fB\-\-model\fR=\fIMODEL\fR
26 | Set the nucleotide evolution model to one of 'Raw', 'JC', 'Kimura', or 'LogDet'. By default the Jukes-Cantor correction is used.
27 | .TP
28 | \fB\-p\fR \fIFLOAT\fR
29 | Significance of an anchor; default: 0.025.
30 | .TP
31 | \fB--progress\fR[=\fIWHEN\fR]
32 | Print a progress bar. \fIWHEN\fR can be 'auto' (default if omitted), 'always', or 'never'.
33 | .TP
34 | \fB\-t\fR \fIINT\fR, \fB\-\-threads\fR=\fIINT\fR
35 | The number of threads to be used; by default, all available processors are used.
36 | .br
37 | Multithreading is only available if \fBandi\fR was compiled with OpenMP support.
38 | .TP
39 | \fB\-\-truncate-names\fR
40 | By default \fBandi\fR outputs the full names of sequences, optionally padded with spaces, if they are shorter than ten characters. Names longer than ten characters may lead to problems with downstream tools. With this switch names will be truncated.
41 | .TP
42 | \fB\-v\fR, \fB\-\-verbose\fR
43 | Prints additional information, including the amount of found homology. Apply multiple times for extra verboseness.
44 | .TP
45 | \fB\-h\fR, \fB\-\-help\fR
46 | Prints the synopsis and an explanation of available options.
47 | .TP
48 | \fB\-\-version\fR
49 | Outputs version information and acknowledgments.
50 | .SH COPYRIGHT
51 | Copyright \(co 2014 - 2021 Fabian Klötzl
52 | License GPLv3+: GNU GPL version 3 or later.
53 | .br
54 | This is free software: you are free to change and redistribute it.
55 | There is NO WARRANTY, to the extent permitted by law.
56 | The full license text is available at <http://gnu.org/licenses/gpl.html>.
57 | .PP
58 | .SH ACKNOWLEDGMENTS
59 | 1) andi: Haubold, B., Klötzl, F. and Pfaffelhuber, P. (2015). andi: Fast and accurate estimation of evolutionary distances between closely related genomes, Bioinformatics 31.8.
60 | .br
61 | 2) Algorithms: Ohlebusch, E. (2013). Bioinformatics Algorithms. Sequence Analysis, Genome Rearrangements, and Phylogenetic Reconstruction. pp 118f.
62 | .br
63 | 3) SA construction: Mori, Y. (2005). libdivsufsort, unpublished.
64 | .br
65 | 4) Bootstrapping: Klötzl, F. and Haubold, B. (2016). Support Values for Genome Phylogenies, Life 6.1.
66 | .SH BUGS
67 | .SS Reporting Bugs
68 | Please report bugs to <kloetzl@evolbio.mpg.de> or at <https://github.com/EvolBioInf/andi>.
69 | 


--------------------------------------------------------------------------------
/docs/manual/andi-manual.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[a4paper,
  2 |   10pt,
  3 |   english,
  4 |   DIV=12,
  5 |   BCOR=8mm]{scrbook}
  6 | \usepackage[utf8x]{inputenc}
  7 | \usepackage{babel}
  8 | \usepackage{listings}
  9 | \usepackage{xcolor}
 10 | \usepackage{hyperref}
 11 | \usepackage{siunitx}
 12 | \usepackage[T1]{fontenc}
 13 | \usepackage{isodate}
 14 | \usepackage{graphicx}
 15 | \usepackage{amsthm}
 16 | \usepackage{acronym}
 17 | \usepackage{amssymb}
 18 | \usepackage{caption}
 19 | \usepackage{subcaption}
 20 | \usepackage{xspace}
 21 | \usepackage{microtype}
 22 | 
 23 | \bibliographystyle{alpha}
 24 | 
 25 | \DeclareSIUnit\byte{B}
 26 | \DeclareSIUnit\basepairs{bp}
 27 | \DeclareSIUnit\bit{bit}
 28 | 
 29 | \definecolor{oceangreen}{cmyk}{1,.0,.20,.78}
 30 | \addtokomafont{sectioning}{\rmfamily\color{oceangreen}}
 31 | 
 32 | \definecolor{bluekeywords}{rgb}{0.13,0.13,1}
 33 | \definecolor{greencomments}{rgb}{0,0.5,0}
 34 | \definecolor{turqusnumbers}{rgb}{0.17,0.57,0.69}
 35 | \definecolor{redstrings}{rgb}{0.5,0,0}
 36 | \definecolor{lightgray}{rgb}{0.9,0.9,0.9}
 37 | 
 38 | \usepackage{libertine}
 39 | \fontfamily{libertine}
 40 | \selectfont
 41 | %\usepackage[scaled]{berasans}
 42 | 
 43 | \newcommand{\thymine}{\textsc{m}\oldstylenums{2}\xspace}
 44 | \newcommand{\local}{\textsc{m}\oldstylenums{1}\xspace}
 45 | \newcommand{\algo}[1]{\textsc{{#1}}}
 46 | \newcommand{\andi}{\algo{andi}\xspace}
 47 | \newcommand{\word}[1]{\textsf{\small#1}}
 48 | \newcommand{\wchar}[1]{\textsf{\small#1}}
 49 | \newcommand{\eco}{\textsc{eco}\oldstylenums{29}\xspace}
 50 | \newcommand{\pneu}{\textsc{Pneu}\oldstylenums{3085}\xspace}
 51 | 
 52 | \include{version}
 53 | 
 54 | % Todos at the margin
 55 | \newcommand{\todo}[1]{
 56 |   \marginpar{\fbox{\begin{minipage}{0.9\marginparwidth}
 57 |   \scriptsize\sloppy\raggedright #1
 58 |   \end{minipage}}}
 59 | }
 60 | 
 61 | 
 62 | \newtheorem{definition}{Definition}
 63 | 
 64 | 
 65 | \lstset{backgroundcolor=\color{lightgray}}
 66 | 
 67 | \lstdefinestyle{shell}{
 68 | 	language=bash,
 69 | 	columns=flexible,
 70 |   xleftmargin=12pt,
 71 |   xrightmargin=12pt,
 72 |   breaklines=true,
 73 |   basicstyle=\small\ttfamily,
 74 |   morekeywords={make, tar, git, sudo, andi, time, man, head, cut, fneighbor,
 75 |    fretree, figtree, brew, aura, autoreconf, ls},
 76 |  % literate={~} {$\sim$}{1}
 77 | }
 78 | 
 79 | \lstset{style=shell}
 80 | 
 81 | \title{Documentation of \algo{andi}}
 82 | \subtitle{Rapid Estimation of Evolutionary Distances between Genomes\\ {\small\url{https://github.com/EvolBioInf/andi}}}
 83 | \author{Fabian Klötzl\\ \href{mailto:kloetzl@evolbio.mpg.de}{kloetzl@evolbio.mpg.de}}
 84 | \date{Version \version, \isodate\today \\
 85 | \vspace*{2cm}
 86 | \centering\includegraphics[width=0.8\textwidth]{andi_labels.pdf}}
 87 | 
 88 | \begin{document}
 89 | 
 90 | \maketitle
 91 | 
 92 | \section*{Abstract}
 93 | This is the documentation of the \andi program for estimating the evolutionary distance between closely related genomes. These distances can be used to rapidly infer phylogenies for big sets of genomes. Because \andi does not compute full alignments, it is so efficient that it scales well up to thousands of bacterial genomes.
 94 | 
 95 | This is scientific software. Please cite our paper \cite{andi} if you use \andi in your publication. Also refer to the paper for the internals of \andi. Additionally, there is a Master's thesis with even more in depth analysis of \andi \cite{kloetzl}.
 96 | 
 97 | \vspace*{1cm}
 98 | \section*{License}
 99 | This document is released under the Creative Commons Attribution Share-Alike license. This means you are free to copy and redistribute this document. You may even remix, tweak and build upon this document, as long as you credit me for the work I've done and release your document under the identical terms. The full legal code is available online: {\small\url{https://creativecommons.org/licenses/by-sa/4.0/legalcode}}.
100 | 
101 | \tableofcontents
102 | 
103 | \chapter{Installation} %%%%%
104 | 
105 | \section{Package Manager}
106 | 
107 | The easiest way to install \andi is via a package manager. This also handles all dependencies for you.
108 | 
109 | 
110 | \noindent Debian and Ubuntu:
111 | 
112 | \begin{lstlisting}
113 | ~ %  sudo apt-get install andi
114 | \end{lstlisting}
115 | 
116 | \noindent macOS with homebrew:
117 | 
118 | \begin{lstlisting}
119 | ~ %  brew tap brewsci/bio
120 | ~ %  brew install andi
121 | \end{lstlisting}
122 | 
123 | \noindent ArchLinux AUR package with aura:
124 | 
125 | \begin{lstlisting}
126 | ~ %  aura -A andi
127 | \end{lstlisting}
128 | 
129 | \andi is intended to be run in a \algo{Unix} commandline such as \lstinline$bash$ or \lstinline$zsh$. All examples in this document are also intended for that environment. You can verify that \andi was installed correctly by executing \lstinline$andi -h$. This should give you a list of all available options (see Section~\ref{sec:options}).
130 | 
131 | \section{Source Package} \label{sub:regular}
132 | 
133 | To build \andi from source, download the latest \href{https://github.com/EvolBioInf/andi/releases}{release} from GitHub. Please note, that \andi requires the \algo{Gnu Scientific Library} and \algo{libdivsufsort}\footnote{\url{https://github.com/y-256/libdivsufsort}} for optimal performance \cite{divsufsort}.
134 | 
135 | Once you have downloaded the package, unzip it and change into the newly created directory. 
136 | 
137 | \begin{lstlisting}
138 | ~ %  tar -xzvf andi-0.14.tar.gz
139 | ~ %  cd andi-0.14
140 | \end{lstlisting}
141 | 
142 | \noindent Now build and install \andi.
143 | 
144 | \begin{lstlisting}
145 | ~/andi-0.14 %  ./configure
146 | ~/andi-0.14 %  make
147 | ~/andi-0.14 %  sudo make install
148 | \end{lstlisting}
149 | 
150 | \noindent This installs \andi for all users on your system. If you do not have root privileges, you will find a working copy of \andi in the \lstinline$src$ subdirectory. For the rest of this documentation, it is assumed, that \andi is in your \textdollar\lstinline!PATH!.
151 | 
152 | Now \andi should be ready for use. Try invoking the help.
153 | 
154 | \begin{lstlisting}
155 | ~/andi-0.14 %  andi -h
156 | Usage: andi [OPTIONS...] FILES...
157 | 	FILES... can be any sequence of FASTA files.
158 | 	Use '-' as file name to read from stdin.
159 | Options:
160 |   -b, --bootstrap=INT  Print additional bootstrap matrices
161 |       --file-of-filenames=FILE  Read additional filenames from FILE; one per line
162 |   -j, --join           Treat all sequences from one file as a single genome
163 |   -l, --low-memory     Use less memory at the cost of speed
164 |   -m, --model=MODEL    Pick an evolutionary model of 'Raw', 'JC', 'Kimura', 'LogDet'; default: JC
165 |   -p FLOAT             Significance of an anchor; default: 0.025
166 |       --progress=WHEN  Print a progress bar 'always', 'never', or 'auto'; default: auto
167 |   -t, --threads=INT    Set the number of threads; by default, all processors are used
168 |       --truncate-names Truncate names to ten characters
169 |   -v, --verbose        Prints additional information
170 |   -h, --help           Display this help and exit
171 |       --version        Output version information and acknowledgments
172 | \end{lstlisting}
173 | 
174 | \noindent \andi also comes with a man page, which can be accessed via \lstinline$man andi$. % But once you are done with this documentation, you will require it scarcely.
175 | 
176 | \section{Installing from Git Repository}
177 | 
178 | To build \andi from the \algo{Git} repo, you will also need the \algo{autotools}. Refer to your OS documentation for installation instructions. Once done, execute the following steps.
179 | 
180 | \begin{lstlisting}
181 | ~ %  git clone git@github.com:EvolBioInf/andi.git
182 | ~ %  cd andi
183 | ~/andi %  autoreconf -fi -Im4
184 | \end{lstlisting}
185 | 
186 | \noindent Continue with the \algo{Gnu} trinity as described in Section~\ref{sub:regular}.
187 | 
188 | 
189 | \chapter{Usage} %%%%%
190 | 
191 | The input sequences for \andi should be in \algo{Fasta} format. Any number of files can be passed. Each file may contain more than one sequence.
192 | 
193 | \begin{lstlisting}
194 | ~ %  andi S1.fasta S2.fasta
195 | 2
196 | S1        0.0000 0.0979
197 | S2        0.0979 0.0000
198 | \end{lstlisting}
199 | 
200 | If no file argument is given, \andi reads the input from \algo{stdin}. This makes it convenient to use in \algo{Unix} pipelines.
201 | 
202 | \begin{lstlisting}
203 | ~ %  cat S1.fasta S2.fasta | andi
204 | 2
205 | S1        0.0000 0.0979
206 | S2        0.0979 0.0000
207 | \end{lstlisting}
208 | 
209 | The output of \andi is a matrix in \algo{Phylip} style: On the first line the number of compared sequences is given, \lstinline!2! in our example. Then the matrix is printed, where each line is preceded by the name of the $i$th sequence. Note that the matrix is symmetric and the main diagonal contains only zeros. The numbers themselves are evolutionary distances, estimated from substitution rates.
210 | 
211 | 
212 | \section{Input} \label{sec:join}
213 | 
214 | As mentioned before, \andi reads in \algo{Fasta} files. It recognizes only the four standard bases and is case insensitive (RegEx: \lstinline![acgtACGT]!). All other residue symbols are excluded from the analysis and \andi prints a warning, when this happens.
215 | 
216 | If instead of distinct sequences, a \algo{Fasta} file contains contigs belonging to a single taxon, \andi will treat them as a unit when switched into \algo{join} mode. This can be achieved by using the \lstinline!-j! or \lstinline!--join! command line switch.
217 | 
218 | \begin{lstlisting}
219 | ~ %  andi --join E_coli.fasta Shigella.fasta
220 | [Output]
221 | \end{lstlisting}
222 | 
223 | When the \algo{join} mode is active, the file names are used to label the individual sequences. Thus, in \algo{join} mode, each genome has to be in its own file, and furthermore, at least one filename has to be given via the command line.
224 | 
225 | If not enough file names are provided, \andi will try to read sequences from the standard input stream. This behaviour can be explicitly triggered by passing a single dash (\lstinline$-$) as a file name, which is useful in pipelines.
226 | 
227 | If \andi seems to take unusually long, or requires huge amounts of memory, then you might have forgotten the \algo{join} switch. Without it, \andi compares each contig instead of each genome, resulting in many more comparisons! Since version 0.12 \andi produces a progress meter on the standard error stream. \andi tries to be smart about when to show or hide the progress bar. You can manually change this behaviour using the \lstinline!--progress! option.
228 | 
229 | Starting with version 0.11 \andi supports an extra way of input. Instead of passing file names directly to \andi via the commandline arguments, the file names may also be read from a file itself. Using this new \lstinline$--file-of-filenames$ argument can work around limitations imposed by the shell.
230 | 
231 | The following three snippets have the same functionality.
232 | 
233 | \begin{lstlisting}
234 | ~ %  andi --join *.fasta
235 | [Output]
236 | \end{lstlisting}
237 | 
238 | \begin{lstlisting}
239 | ~ %  ls *.fasta > filenames.txt
240 | ~ %  andi --join --file-of-filenames filenames.txt
241 | [Output]
242 | \end{lstlisting}
243 | 
244 | \begin{lstlisting}
245 | ~ %  ls *.fasta | andi --join --file-of-filenames -
246 | [Output]
247 | \end{lstlisting}
248 | 
249 | \section{Output}
250 | 
251 | The output of \andi is written to \lstinline$stdout$. This makes it easy to use on the command line and within shell scripts. As seen before, the matrix, computed by \algo{andi}, is given in \algo{Phylip} format \cite{phylip}.
252 | 
253 | \begin{lstlisting}
254 | ~ %  cat S1.fasta S2.fasta | andi
255 | 2
256 | S1        0.0000 0.0979
257 | S2        0.0979 0.0000
258 | \end{lstlisting}
259 | 
260 | If the computation completed successfully, \andi exits with the status code 0. Otherwise, the value of \lstinline$errno$ is used as the exit code. \andi can also produce warnings and error messages for the user's convenience. These messages are printed to \lstinline$stderr$ and thus do not interfere with the normal output.
261 | 
262 | \section{Options} \label{sec:options}
263 | 
264 | \andi takes a small number of commandline options, of which even fewer are of interest on a day-to-day basis. If \lstinline$andi -h$ displays a \lstinline$-t$ option, then \andi was compiled with multi-threading support (implemented using \algo{OpenMP}). By default, \andi uses all available processors. However, to restrict the number of threads, use \lstinline$-t$.
265 | 
266 | \begin{lstlisting}
267 | ~ %  time andi ../test/1M.1.fasta -t 1
268 | 2
269 | S1        0.0000 0.0995
270 | S2        0.0995 0.0000
271 | ./andi ../test/1M.1.fasta  0,60s user 0,01s system 99% cpu 0,613 total
272 | ~ %  time andi ../test/1M.1.fasta -t 2
273 | 2
274 | S1        0.0000 0.0995
275 | S2        0.0995 0.0000
276 | ./andi ../test/1M.1.fasta -t 2  0,67s user 0,03s system 195% cpu 0,362 total
277 | \end{lstlisting}
278 | 
279 | In the above examples the runtime dropped from \SI{0.613}{\second} to \SI{0.362}{\second} using two threads. Giving \andi more threads than input genomes leads to no further speed improvement. The other important option is \lstinline$--join$ (see Section~\ref{sec:join}).
280 | 
281 | By default, the distances computed by \andi are \emph{Jukes-Cantor} corrected \cite{jukescantor}. Other evolutionary models are also implemented (Kimura \cite{kimura}, LogDet \cite{logdet}, raw). The \lstinline$--model$ parameter can be used to switch between them.
282 | 
283 | Since version 0.9.4 \andi includes a bootstrapping method. It can be activated via the \lstinline$--bootstrap$ or \lstinline$-b$ switch. This option takes a numeric argument representing the number of matrices to create. The output can then be piped into \algo{phylip}. For more information on computing support values from distance matrices see \cite{afra}.
284 | 
285 | \begin{lstlisting}
286 | ~ %  andi -b 2 ../test/1M.1.fasta
287 | 2
288 | S1        0.0000 0.1067
289 | S2        0.1067 0.0000
290 | 2
291 | S1        0.0000 0.1071
292 | S2        0.1071 0.0000
293 | \end{lstlisting}
294 | 
295 | The original \algo{phylip} only supports distance matrices with names no longer than ten characters. However, this sometimes leads to problems with long accession numbers. Starting with version 0.11 \andi prints the full name of a sequence, even if it is longer than ten characters. If your downstream tools have trouble with this, use \lstinline$--truncate-names$ to reimpose the limit.
296 | 
297 | Also new in version 0.11 is the \lstinline$--file-of-filenames$ option. See Section~\ref{sec:join} for details.
298 | 
299 | \section{Example: \algo{eco29}}
300 | 
301 | Here follows a real-world example of how to use \algo{andi}. It makes heavy use of the commandline and tools like \algo{Phylip}. If you prefer \algo{R}, check out this excellent blog post by Kathryn Holt.\footnote{\url{http://holtlab.net/2015/05/08/r-code-to-infer-tree-from-andi-output/}}
302 | 
303 | As a data set we use \algo{eco29}; 29 genomes of \textit{E. coli} and \textit{Shigella}. You can download the data from here: {\small{\url{http://guanine.evolbio.mpg.de/andi/eco29.fasta.gz}}}. The genomes have an average length of 4.9~million nucleotides amounting to a total of \SI{138}{\mega\byte}.
304 | 
305 | \algo{eco29} comes as a single \algo{fasta} file, where each sequence is a genome. To calculate their pairwise distances, enter
306 | 
307 | \begin{lstlisting}
308 | ~ % andi eco29.fasta > eco29.mat
309 | andi: The input sequences contained characters other than acgtACGT. These were automatically stripped to ensure correct results.
310 | \end{lstlisting}
311 | 
312 | \noindent The \algo{eco29} data set includes non-canonical nucleotides, such as \word{Y}, \word{N}, and \word{P}, which get stripped from the input sequences. The resulting matrix is stored in \lstinline$eco29.mat$; here is a small excerpt:
313 | 
314 | \begin{lstlisting}
315 | ~ % head -n 5 eco29.mat | cut -d ' ' -f 1-5
316 | 29
317 | gi|563845 0.0000e+00 1.8388e-02 1.8439e-02 2.6398e-02
318 | gi|342360 1.8388e-02 0.0000e+00 4.4029e-04 2.6166e-02
319 | gi|300439 1.8439e-02 4.4029e-04 0.0000e+00 2.6123e-02
320 | gi|261117 2.6398e-02 2.6166e-02 2.6123e-02 0.0000e+00
321 | \end{lstlisting}
322 | 
323 | \noindent From this we compute a tree via neighbor-joining using a \algo{Phylip} wrapper called \algo{Embassy}.\footnote{\url{http://emboss.sourceforge.net/embassy/\#PHYLIP}}
324 | 
325 | \begin{lstlisting}
326 | ~ % fneighbor -datafile eco29.mat -outfile eco29.phylipdump
327 | \end{lstlisting}
328 | \noindent To make this tree easier to read, we can midpoint-root it.
329 | \begin{lstlisting}
330 | ~ % fretree -spp 29 -intreefile eco29.treefile -outtreefile eco29.tree <<EOF
331 | M
332 | X
333 | Y
334 | R
335 | EOF
336 | \end{lstlisting}
337 | 
338 | \noindent The file \lstinline$eco29.tree$ now contains the tree in Newick format. This can be plotted using \algo{FigTree} \cite{figtree}
339 | 
340 | \begin{lstlisting}
341 | ~ % figtree eco29.tree &
342 | \end{lstlisting}
343 | 
344 | \noindent to yield
345 | 
346 | \begin{figure}[h]
347 |   \centering\includegraphics[width=0.8\textwidth]{andi_labels.pdf}
348 | \end{figure}
349 | 
350 | 
351 | \chapter{Warnings and Errors}
352 | 
353 | This chapter explains the warnings and errors \andi itself reports. Other errors may occur and are due to the failure of underlying functions (e.\,g.~\lstinline$read(3)$). All warning messages are printed to \lstinline$stderr$. Most errors are non-recoverable and will result in \andi exiting with a non-zero status.
354 | 
355 | \section{Sequence Related Messages}
356 | 
357 | \subsection*{Unexpected Character}
358 | 
359 | \andi is pretty pedantic about the formatting of \algo{FASTA} files. If you violate the syntax, \andi will print the file name, the line and the problematic character. These errors are non-recoverable, meaning no further sequences are read from the invalid file. The checks are implemented by the \href{https://github.com/kloetzl/pfasta}{\algo{pfasta}} library.
360 | 
361 | 
362 | \subsection*{Non acgtACGT Nucleotides Stripped}
363 | 
364 | Our models of genome evolution (JC, Kimura) only work on the four canonical types of nucleotides. All others are stripped from the sequences. This can be ignored in most cases.
365 | 
366 | \subsection*{Too Short Sequence}
367 | 
368 | \andi was designed for big data sets of whole genomes. On short sequences the distance estimates are inaccurate. Use a multiple sequence alignment instead.
369 | 
370 | \subsection*{Too Long Sequence}
371 | 
372 | \algo{libdivsufsort} limits the length of a sequence to 31 bits. That count includes the reverse complement. So the technical limit for a sequence analysis is $2^{30} = \num{1073741824}$ nucleotides. Unfortunately, that excludes (full) human and mouse genomes. Per-chromosome analysis works just fine.
373 | 
374 | \subsection*{Empty Sequence}
375 | 
376 | One of the given sequences contained either no nucleotides at all, or only non-canonical ones.
377 | 
378 | \subsection*{Less than two sequences given}
379 | 
380 | As \andi tries to compare sequences, at least two need to be supplied. Note that \andi may have regarded some of your given sequences as unusable.
381 | 
382 | \subsection*{Maximum Number of Sequences}
383 | 
384 | The maximum number of sequences \andi can possibly compare is huge (roughly $\num{457845052}$). I doubt anyone will ever reach that limit. Please send me a mail if you do.
385 | 
386 | \section{Technical Messages}
387 | 
388 | \subsection*{Out of Memory}
389 | 
390 | If \andi runs out of memory, it gives up. Either free memory, run \andi on a bigger machine, try the \lstinline$--low-memory$ mode or reduce the number of threads.
391 | 
392 | \subsection*{RNG allocation}
393 | 
394 | Some technical thing failed. If it keeps failing repeatedly, file a bug.
395 | 
396 | \subsection*{Bootstrapping failed}
397 | 
398 | This should not happen.
399 | 
400 | \subsection*{Failed index creation}
401 | 
402 | This should not happen, either.
403 | 
404 | \subsection*{Skipped and ignored Arguments}
405 | 
406 | Some command line parameters of \andi require arguments. If these are not of the expected type, a warning is given. See Section~\ref{sec:options} for their correct usage.
407 | 
408 | 
409 | \section{Output-related Warnings}
410 | 
411 | As the input sequences get more evolutionarily divergent, \andi finds fewer homologous anchors. With fewer anchors, fewer nucleotides are considered homologous between two sequences. If no anchors are found, comparison fails and \lstinline!nan! is printed instead. See our paper \cite{andi} and especially Figure~2 for details.
412 | 
413 | \subsection*{NaN}
414 | 
415 | No homologous sections were found. Your sequences are very divergent ($d>0.5$) or sprout a lot of indels that make comparison difficult.
416 | 
417 | \subsection*{Little Homology}
418 | 
419 | Very few anchors were found and thus only a tiny part of the sequences is considered homologous. Expect that the given distance is erroneous.
420 | 
421 | \subsection*{Too long name}
422 | 
423 | If you added the \lstinline$--truncate-names$ switch and an input name is longer than ten characters, you will receive this warning.
424 | 
425 | \chapter{DevOps} %%%%%
426 | 
427 | \andi is written in C/C++; mostly C99 with some parts in C++11. The sources are released on \algo{GitHub} as \emph{free software} under the \textsc{Gnu General Public License version~3} \cite{GPL}. Prebundled packages using \algo{autoconf} are also available, with the latest release being {\version} at the time of writing.
428 | 
429 | If you are interested in the internals of \algo{andi}, consult the paper \cite{andi} or my Master's thesis \cite{kloetzl}. Both explain the used approach in detail. The latter emphasizes the used algorithms, data structures and their efficient implementation.
430 | 
431 | \section{Dependencies}
432 | 
433 | Here is a complete list of dependencies required for developing \algo{andi}.
434 | 
435 | \begin{itemize}
436 |   \item A C and a C++11 compiler,
437 |   \item the \algo{autotools},
438 |   \item the \algo{Gnu Scientific Library},
439 |   \item \algo{Pdflatex} with various packages for the manual,
440 |   \item \algo{Git},
441 |   \item \algo{glib2} for the unit tests,
442 |   \item \algo{doxygen},
443 |   \item and \algo{libdivsufsort}.
444 | \end{itemize}
445 | 
446 | 
447 | \section{Code Documentation}
448 | 
449 | \emph{Every} function in \andi is documented using \algo{doxygen} style comments. To create the documentation run \lstinline$make code-docs$ in the main directory. You will then find the documentation under \lstinline$./docs$.
450 | 
451 | 
452 | \section{Unit Tests}
453 | 
454 | The unit tests are located in the \andi repository under the \lstinline$./test$ directory. Because they require \algo{glib2}, and a C++11 compiler, they are deactivated by default. To enable them, execute
455 | 
456 | \begin{lstlisting}
457 | ~/andi %   ./configure --enable-unit-tests
458 | \end{lstlisting}
459 | 
460 | \noindent during the installation process. You can then verify the build via 
461 | 
462 | \begin{lstlisting}
463 | ~/andi %   make check
464 | \end{lstlisting}
465 | 
466 | \noindent The unit tests are also checked each time a commit is sent to the repository. This is done via \algo{TravisCI}.\footnote{\url{https://travis-ci.org/EvolBioInf/andi}} Thus, a warning is produced, when the builds fail, or the unit tests did not run successfully. Currently, the unit tests cover more than 75\% of the code. This is computed via the \algo{Travis} builds and a service called \algo{Coveralls}.\footnote{\url{https://coveralls.io/r/EvolBioInf/andi}}
467 | 
468 | \section{Known Issues}
469 | 
470 | These minor issues are known. I intend to fix them, when I have time.
471 | 
472 | \begin{enumerate}
473 |   \item This code will not work under Windows. At two places Unix-only code is used: filepath-separators are assumed to be \lstinline$/$ and file-descriptors are used for I/O.
474 |   \item Unit tests for the bootstrapped matrices are missing.
475 |   \item Cached intervals are sometimes not “as deep as they could be”. If that got fixed \lstinline$get_match_cache$ could bail out on \lstinline$ij.lcp < CACHE_LENGTH$. However the \lstinline$esa_init_cache$ code is the most fragile part and should be handled with care.
476 | \end{enumerate}
477 | 
478 | 
479 | \section{Creating a Release}
480 | 
481 | A release should be a stable version of \andi with significant improvements over the last version. Dot-dot releases should be avoided.
482 | 
483 | %\subsection{Preparing a new Release}
484 | 
485 | Once \andi has matured, the new features are implemented, and all tests have been run, a new release can be created. First, increase the version number in \lstinline$configure.ac$. Commit that change in git, and tag this commit with \lstinline$vX.y$. Tags should be annotated and signed, if possible. This manual then needs to be rebuilt manually.
486 | 
487 | Ensure that \andi is ready for packaging with \algo{autoconf}.
488 | 
489 | \begin{lstlisting}
490 | ~ % make distcheck
491 | make  dist-gzip am__post_remove_distdir='@:'
492 | make[1]: Entering directory `/home/kloetzl/Projects/andi'
493 | if test -d "andi-0.9.1-beta"; then find "andi-0.9.1-beta" -type d ! -perm -200 -exec chmod u+w {} ';' && rm -rf "andi-0.9.1-beta" || { sleep 5 && rm -rf "andi-0.9.1-beta"; }; else :; fi
494 | test -d "andi-0.9.1-beta" || mkdir "andi-0.9.1-beta"
495 |  (cd src && make  top_distdir=../andi-0.9.1-beta distdir=../andi-0.9.1-beta/src \
496 |      am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)
497 | 
498 | ... Loads of output ...
499 | 
500 | =================================================
501 | andi-0.9.1-beta archives ready for distribution: 
502 | andi-0.9.1-beta.tar.gz
503 | =================================================
504 | \end{lstlisting}
505 | 
506 | If the command does not build successfully, no tarballs will be created. This may necessitate further study of \algo{autoconf} and \algo{automake}.
507 | 
508 | Also verify that the recent changes did not create a performance regression. This includes testing both ends of the scale: \eco and \pneu. Both should be reasonably close to previous releases.
509 | 
510 | Create another commit, where you set the version number to the next release (e.\,g., \lstinline$vX.z-beta$). This ensures that there is only one commit and build with that specific version.
511 | 
512 | \backmatter
513 | %\addcontentsline{toc}{chapter}{Bibliography}
514 | \bibliography{references}
515 | 
516 | \end{document}
517 | 


--------------------------------------------------------------------------------
/docs/manual/andi_labels.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EvolBioInf/andi/390af15beb76badaf8f16864a885747aa60956c8/docs/manual/andi_labels.pdf


--------------------------------------------------------------------------------
/docs/manual/references.bib:
--------------------------------------------------------------------------------
  1 | @misc{divsufsort,
  2 |   author="Yuta Mori",
  3 |   year="2005",
  4 |   title="Short description of improved two-stage suffix sorting algorithm",
  5 |   note="\url{http://homepage3.nifty.com/wpage/software/itssort.txt}"
  6 | }
  7 | 
  8 | @article{andi,
  9 |   author = {Haubold, Bernhard and Klötzl, Fabian and Pfaffelhuber, Peter}, 
 10 |   title = {andi: Fast and accurate estimation of evolutionary distances between closely related genomes},
 11 |   volume = {31},
 12 |   number = {8},
 13 |   pages = {1169-1175},
 14 |   year = {2015},
 15 |   doi = {10.1093/bioinformatics/btu815},
 16 |   URL = {http://bioinformatics.oxfordjournals.org/content/31/8/1169.abstract},
 17 |   eprint = {http://bioinformatics.oxfordjournals.org/content/31/8/1169.full.pdf+html},
 18 |   journal = {Bioinformatics}
 19 | }
 20 | 
 21 | @book{Felsenstein,
 22 |   author={Joseph Felsenstein},
 23 |   title={Inferring Phylogenies},
 24 |   year={2004},
 25 |   publisher={Sinauer Associates, Inc.}
 26 | }
 27 | 
 28 | @misc{GPL,
 29 |   author={{Free~Software~Foundation}},
 30 |   year={2007},
 31 |   title={{GNU} General Public License},
 32 |   note={\url{https://gnu.org/licenses/gpl.html}}
 33 | }
 34 | 
 35 | @misc{phylip,
 36 |   author={Felsenstein, J.},
 37 |   year={2005},
 38 |   title={PHYLIP (Phylogeny Inference Package)},
 39 |   version={version 3.6},
 40 |   howpublished={Distributed by the author},
 41 |   note={Department of Genome Sciences, University of Washington.}
 42 | }
 43 | 
 44 | @InProceedings{LLVM,
 45 |   Author  = {Chris Lattner and Vikram Adve},
 46 |   Title = {{LLVM}: A Compilation Framework for Lifelong Program 
 47 |   Analysis and Transformation},
 48 |   Booktitle = "Code Generation and Optimization",
 49 |   Month = {Mar},
 50 |   Year  = {2004},
 51 |   pages = {75--88},
 52 |   Publisher={International Symposium on Code Generation and Optimization}
 53 | }
 54 | 
 55 | @article{ms,
 56 |   author = {Hudson, Richard R.}, 
 57 |   title = {Generating samples under a Wright–Fisher neutral model of genetic variation},
 58 |   volume = {18}, 
 59 |   number = {2}, 
 60 |   pages = {337-338}, 
 61 |   year = {2002}, 
 62 |   doi = {10.1093/bioinformatics/18.2.337}, 
 63 |   URL = {http://bioinformatics.oxfordjournals.org/content/18/2/337.abstract}, 
 64 |   eprint = {http://bioinformatics.oxfordjournals.org/content/18/2/337.full.pdf+html}, 
 65 |   journal = {Bioinformatics} 
 66 | }
 67 | 
 68 | @article{valgrind,
 69 |   author = {Nethercote, Nicholas and Seward, Julian},
 70 |   title = {Valgrind: A Framework for Heavyweight Dynamic Binary Instrumentation},
 71 |   journal = {SIGPLAN Not.},
 72 |   issue_date = {June 2007},
 73 |   volume = {42},
 74 |   number = {6},
 75 |   month = jun,
 76 |   year = {2007},
 77 |   issn = {0362-1340},
 78 |   pages = {89--100},
 79 |   numpages = {12},
 80 |   url = {http://doi.acm.org/10.1145/1273442.1250746},
 81 |   doi = {10.1145/1273442.1250746},
 82 |   acmid = {1250746},
 83 |   publisher = {ACM},
 84 |   keywords = {Memcheck, Valgrind, dynamic binary analysis, dynamic binary instrumentation, shadow values}
 85 | } 
 86 | 
 87 | @misc{figtree,
 88 |   title="FigTree",
 89 |   author={Andrew Rambaut},
 90 |   year={accessed 2015},
 91 |   note={\url{http://tree.bio.ed.ac.uk/software/figtree/}}
 92 | }
 93 | 
 94 | 
 95 | @article{jukescantor,
 96 |   author={Jukes, T. H. and Cantor, C. R.},
 97 |   year={1969},
 98 |   title={Evolution of protein molecules},
 99 |   journal={Mammalian protein metabolism},
100 |   volume={3},
101 |   pages={21-132},
102 |   publisher={Academic Press}
103 | }
104 | 
105 | @mastersthesis{kloetzl,
106 |   author={Fabian Kl{\"o}tzl},
107 |   school={University of L{\"u}beck},
108 |   year={2015},
109 |   title={Efficient Estimation of Evolutionary Distances}
110 | }
111 | 
112 | @article{afra,
113 |   AUTHOR = {Klötzl, Fabian and Haubold, Bernhard},
114 |   TITLE = {Support Values for Genome Phylogenies},
115 |   JOURNAL = {Life},
116 |   VOLUME = {6},
117 |   YEAR = {2016},
118 |   NUMBER = {1},
119 |   PAGES = {11},
120 |   URL = {http://www.mdpi.com/2075-1729/6/1/11},
121 |   ISSN = {2075-1729},
122 |   DOI = {10.3390/life6010011}
123 | }
124 | 
125 | @article{logdet,
126 |   AUTHOR = {Lockhart, P.J. and M.A. Steel and M.D. Hendy and D. Penny},
127 |   TITLE = {Recovering Evolutionary Trees under a More Realistic Model of Sequence Evolution},
128 |   JOURNAL = {Molecular Biology and Evolution},
129 |   VOLUME = {11},
130 |   YEAR = {1994},
131 |   NUMBER = {4},
132 |   PAGES = {605-612},
133 |   DOI = {10.1093/oxfordjournals.molbev.a040136}
134 | }
135 | 
136 | @article{kimura,
137 |   AUTHOR = {Kimura, M.},
138 |   TITLE = {A Simple Method for Estimating Evolutionary Rate of Base Substitutions Through Comparative Studies of Nucleotide Sequences},
139 |   JOURNAL = {Journal of Molecular Evolution},
140 |   VOLUME = {16},
141 |   YEAR = {1980},
142 |   NUMBER = {2},
143 |   PAGES = {111-120},
144 |   DOI = {10.1007/BF01731581}
145 | }
146 | 
147 | 


--------------------------------------------------------------------------------
/docs/manual/version.tex.in:
--------------------------------------------------------------------------------
1 | 
2 | \newcommand{\version}{VERSION}
3 | 


--------------------------------------------------------------------------------
/libs/Makefile.am:
--------------------------------------------------------------------------------
# (C) 2015, Fabian Klötzl <fabian@kloetzl.info> ISC License

# Build the bundled pfasta FASTA parser as a static convenience library.
# The noinst_ prefix means the archive is built but never installed.
noinst_LIBRARIES= libpfasta.a
# Sources that make up the library.
libpfasta_a_SOURCES= pfasta.c pfasta.h
# Put the top-level opt/ directory on the include path for this target only.
# NOTE(review): presumably so the compat-*.h headers there can be found; confirm.
libpfasta_a_CPPFLAGS= -I$(top_srcdir)/opt
6 | 


--------------------------------------------------------------------------------
/libs/pfasta.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2015-2020, Fabian Klötzl <fabian-pfasta@kloetzl.info>
  3 |  *
  4 |  * Permission to use, copy, modify, and/or distribute this software for any
  5 |  * purpose with or without fee is hereby granted, provided that the above
  6 |  * copyright notice and this permission notice appear in all copies.
  7 |  *
  8 |  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9 |  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 10 |  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 11 |  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 12 |  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 13 |  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 14 |  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 15 |  *
 16 |  */
 17 | 
 18 | #include <assert.h>
 19 | #include <ctype.h>
 20 | #include <err.h>
 21 | #include <errno.h>
 22 | #include <inttypes.h>
 23 | #include <stdio.h>
 24 | #include <stdlib.h>
 25 | #include <string.h>
 26 | #include <unistd.h>
 27 | 
 28 | #include "pfasta.h"
 29 | 
 30 | #define VERSION "v15"
 31 | 
 32 | #ifdef __SSE2__
 33 | #include <emmintrin.h>
 34 | #endif
 35 | 
 36 | #if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
 37 | #include <threads.h>
 38 | #define PFASTA_THREADSAFE 1
 39 | #else
 40 | #define thread_local
 41 | #define PFASTA_THREADSAFE 0
 42 | #endif
 43 | 
 44 | int pfasta_threadsafe() {
 45 | 	return PFASTA_THREADSAFE ;
 46 | }
 47 | 
 48 | /** The following is the maximum length of an error string. It has to be
 49 |  * carefully chosen, so that all calls to PF_FAIL_STR succeed. For instance,
 50 |  * the line number can account for up to 20 characters.
 51 |  */
 52 | #define PF_ERROR_STRING_LENGTH 100
 53 | 
 54 | thread_local char errstr_buffer[PF_ERROR_STRING_LENGTH];
 55 | 
 56 | void *pfasta_reallocarray(void *ptr, size_t nmemb, size_t size);
 57 | 
 58 | #define BUFFER_SIZE 16384
 59 | 
 60 | #define LIKELY(X) __builtin_expect((intptr_t)(X), 1)
 61 | #define UNLIKELY(X) __builtin_expect((intptr_t)(X), 0)
 62 | 
/* Internal status codes returned by the parser helpers. NO_ERROR is 0, so a
 * plain truth test distinguishes "fine" from every other code.
 */
enum { NO_ERROR, E_EOF, E_ERROR, E_ERRNO, E_BUBBLE, E_STR, E_STR_CONST };

/* All PF_FAIL_* macros below must be expanded inside a function that declares
 * a local `int return_code` and provides a `cleanup:` label: each of them
 * stores a status in `return_code` and jumps to `cleanup`.
 */

/* Fail with the message belonging to the current `errno`. The text is written
 * into the shared `errstr_buffer`, which (PP)->errstr is then pointed at.
 * NOTE(review): the return value of strerror_r is discarded. The GNU-specific
 * variant of strerror_r may return a pointer to a static string without
 * filling the buffer; confirm which variant the build selects.
 */
#define PF_FAIL_ERRNO(PP)                                                      \
	do {                                                                       \
		(void)strerror_r(errno, errstr_buffer, PF_ERROR_STRING_LENGTH);        \
		(PP)->errstr = errstr_buffer;                                          \
		return_code = E_ERRNO;                                                 \
		goto cleanup;                                                          \
	} while (0)

/* If CHECK is non-zero, propagate it unchanged as this function's status.
 * PP is not used. CHECK is evaluated twice, so pass a plain variable.
 */
#define PF_FAIL_BUBBLE_CHECK(PP, CHECK)                                        \
	do {                                                                       \
		if (UNLIKELY(CHECK)) {                                                 \
			return_code = CHECK;                                               \
			goto cleanup;                                                      \
		}                                                                      \
	} while (0)

/* If the parser already carries an error string, bail out with E_BUBBLE and
 * leave that string untouched.
 */
#define PF_FAIL_BUBBLE(PP)                                                     \
	do {                                                                       \
		if (UNLIKELY((PP)->errstr)) {                                          \
			return_code = E_BUBBLE;                                            \
			goto cleanup;                                                      \
		}                                                                      \
	} while (0)

/* Fail with a fixed message. Only the pointer STR is stored, so STR must
 * outlive the parser's use of `errstr` (e.g. a string literal).
 */
#define PF_FAIL_STR_CONST(PP, STR)                                             \
	do {                                                                       \
		(PP)->errstr = (STR);                                                  \
		return_code = E_STR_CONST;                                             \
		goto cleanup;                                                          \
	} while (0)

/* Fail with a printf-style formatted message. The text is rendered into the
 * shared `errstr_buffer` and cut off at PF_ERROR_STRING_LENGTH bytes.
 */
#define PF_FAIL_STR(PP, ...)                                                   \
	do {                                                                       \
		(void)snprintf(errstr_buffer, PF_ERROR_STRING_LENGTH, __VA_ARGS__);    \
		(PP)->errstr = errstr_buffer;                                          \
		return_code = E_STR;                                                   \
		goto cleanup;                                                          \
	} while (0)
103 | 
104 | int pfasta_read_name(struct pfasta_parser *pp, struct pfasta_record *pr);
105 | int pfasta_read_comment(struct pfasta_parser *pp, struct pfasta_record *pr);
106 | int pfasta_read_sequence(struct pfasta_parser *pp, struct pfasta_record *pr);
107 | 
108 | static inline char *buffer_begin(struct pfasta_parser *pp);
109 | static inline char *buffer_end(struct pfasta_parser *pp);
110 | static inline int buffer_advance(struct pfasta_parser *pp, size_t steps);
111 | static inline int buffer_is_empty(const struct pfasta_parser *pp);
112 | static inline int buffer_is_eof(const struct pfasta_parser *pp);
113 | static inline int buffer_peek(struct pfasta_parser *pp);
114 | static inline int buffer_read(struct pfasta_parser *pp);
115 | 
/* A growable byte string used while assembling names, comments and
 * sequences. The bytes in `str` are not kept null-terminated while the string
 * grows; dynstr_move() adds the terminator when handing the data out.
 */
typedef struct dynstr {
	char *str;              // heap storage, NULL once freed or moved from
	size_t capacity, count; // bytes allocated / bytes currently in use
} dynstr;
120 | 
121 | static inline char *dynstr_move(dynstr *ds);
122 | static inline int dynstr_init(dynstr *ds, struct pfasta_parser *pp);
123 | static inline size_t dynstr_len(const dynstr *ds);
124 | static inline void dynstr_free(dynstr *ds);
125 | static inline int dynstr_append(dynstr *ds, const char *str, size_t length,
126 |                                 struct pfasta_parser *pp);
127 | 
/** @brief Locale-independent test for ASCII whitespace.
 *
 * Whitespace here means the blank character and the contiguous control range
 * from '\t' to '\r' (tab, newline, vertical tab, form feed, carriage return).
 *
 * @param c - The character to classify; any int, including EOF, is accepted.
 *
 * @returns 1 if `c` is ASCII whitespace, 0 otherwise.
 */
static inline int my_isspace(int c) {
	if (c == ' ') {
		return 1;
	}
	return '\t' <= c && c <= '\r';
}
132 | 
133 | const char *pfasta_version(void) { return VERSION; }
134 | 
135 | int buffer_init(struct pfasta_parser *pp) {
136 | 	int return_code = 0;
137 | 
138 | 	pp->buffer = malloc(BUFFER_SIZE);
139 | 	if (!pp->buffer) PF_FAIL_ERRNO(pp);
140 | 
141 | 	int check = buffer_read(pp);
142 | 	PF_FAIL_BUBBLE_CHECK(pp, check);
143 | 
144 | cleanup:
145 | 	return return_code;
146 | }
147 | 
148 | int buffer_read(struct pfasta_parser *pp) {
149 | 	int return_code = NO_ERROR;
150 | 	ssize_t count = read(pp->file_descriptor, pp->buffer, BUFFER_SIZE);
151 | 
152 | 	if (UNLIKELY(count < 0)) PF_FAIL_ERRNO(pp);
153 | 	if (UNLIKELY(count == 0)) { // EOF
154 | 		pp->fill_ptr = pp->buffer;
155 | 		pp->read_ptr = pp->buffer + 1;
156 | 		pp->errstr = "EOF (maybe error)"; // enable bubbling
157 | 		return E_EOF;
158 | 	}
159 | 
160 | 	pp->read_ptr = pp->buffer;
161 | 	pp->fill_ptr = pp->buffer + count;
162 | 
163 | cleanup:
164 | 	return return_code;
165 | }
166 | 
167 | int buffer_peek(struct pfasta_parser *pp) {
168 | 	return LIKELY(pp->read_ptr < pp->fill_ptr) ? *(unsigned char *)pp->read_ptr
169 | 	                                           : EOF;
170 | }
171 | 
172 | char *buffer_begin(struct pfasta_parser *pp) { return pp->read_ptr; }
173 | 
174 | char *buffer_end(struct pfasta_parser *pp) { return pp->fill_ptr; }
175 | 
176 | inline int buffer_advance(struct pfasta_parser *pp, size_t steps) {
177 | 	int return_code = 0;
178 | 
179 | 	pp->read_ptr += steps;
180 | 	if (UNLIKELY(pp->read_ptr >= pp->fill_ptr)) {
181 | 		assert(pp->read_ptr == pp->fill_ptr);
182 | 		int check = buffer_read(pp); // resets pointers
183 | 		PF_FAIL_BUBBLE_CHECK(pp, check);
184 | 	}
185 | 
186 | cleanup:
187 | 	return return_code;
188 | }
189 | 
190 | int buffer_is_empty(const struct pfasta_parser *pp) {
191 | 	return pp->read_ptr == pp->fill_ptr;
192 | }
193 | 
194 | int buffer_is_eof(const struct pfasta_parser *pp) {
195 | 	return pp->read_ptr > pp->fill_ptr;
196 | }
197 | 
/** @brief Finds the first ASCII whitespace character in a byte range.
 *
 * Whitespace is defined as in my_isspace(): '\t' through '\r', or ' '.
 * When SSE2 is available the range is scanned 16 bytes at a time; the
 * remaining tail (or the whole range without SSE2) is scanned bytewise.
 *
 * @param begin - First byte of the range.
 * @param end - One past the last byte of the range.
 *
 * @returns a pointer to the first whitespace byte, or `end` if there is none.
 */
char *find_first_space(const char *begin, const char *end) {
	size_t offset = 0;
	size_t length = end - begin;

#ifdef __SSE2__

	typedef __m128i vec_type;
	static const size_t vec_size = sizeof(vec_type);

	// Bounds are shifted by one because only strict comparisons are used below.
	const vec_type all_tab = _mm_set1_epi8('\t' - 1);
	const vec_type all_carriage = _mm_set1_epi8('\r' + 1);
	const vec_type all_space = _mm_set1_epi8(' ');

	size_t vec_offset = 0;
	size_t vec_length = (end - begin) / vec_size; // number of full chunks

	for (; vec_offset < vec_length; vec_offset++) {
		vec_type chunk;
		// memcpy instead of a direct load: `begin` need not be aligned.
		memcpy(&chunk, begin + vec_offset * vec_size, vec_size);

		// isspace: \t <= char <= \r || char == space
		// The comparisons are signed, so bytes with the high bit set are
		// negative and fail the lower-bound test.
		vec_type v1 = _mm_cmplt_epi8(all_tab, chunk);
		vec_type v2 = _mm_cmplt_epi8(chunk, all_carriage);
		vec_type v3 = _mm_cmpeq_epi8(chunk, all_space);

		// One bit per byte of the chunk; bit i is set iff byte i is whitespace.
		unsigned int vmask = (_mm_movemask_epi8(v1) & _mm_movemask_epi8(v2)) |
		                     _mm_movemask_epi8(v3);

		if (UNLIKELY(vmask)) {
			// The lowest set bit is the first whitespace byte in this chunk.
			offset += __builtin_ctz(vmask);
			offset += vec_offset * vec_size;
			return (char *)begin + offset;
		}
	}

	// No hit in any full chunk: continue bytewise behind the last one.
	offset += vec_offset * vec_size;
#endif

	for (; offset < length; offset++) {
		if (my_isspace(begin[offset])) break;
	}
	return (char *)begin + offset;
}
241 | 
/** @brief Finds the first byte in a range that is not ASCII whitespace.
 *
 * @param begin - First byte of the range.
 * @param end - One past the last byte of the range.
 *
 * @returns a pointer to the first non-whitespace byte, or `end` if the whole
 * range consists of whitespace.
 */
char *find_first_not_space(const char *begin, const char *end) {
	const char *cursor = begin;
	size_t remaining = end - begin;

	while (remaining > 0 && my_isspace(*cursor)) {
		cursor++;
		remaining--;
	}

	return (char *)cursor;
}
251 | 
/** @brief Counts the line feed characters in a byte range.
 *
 * @param begin - First byte of the range.
 * @param end - One past the last byte of the range.
 *
 * @returns the number of '\n' bytes in [begin, end).
 */
size_t count_newlines(const char *begin, const char *end) {
	size_t total = 0;
	size_t remaining = end - begin;

	for (const char *cursor = begin; remaining > 0; cursor++, remaining--) {
		total += (*cursor == '\n');
	}

	return total;
}
263 | 
264 | static int copy_word(struct pfasta_parser *pp, dynstr *target) {
265 | 	int return_code = 0;
266 | 
267 | 	int c;
268 | 	while (c = buffer_peek(pp), c != EOF && LIKELY(!my_isspace(c))) {
269 | 		char *end_of_word = find_first_space(buffer_begin(pp), buffer_end(pp));
270 | 		size_t word_length = end_of_word - buffer_begin(pp);
271 | 
272 | 		assert(word_length > 0);
273 | 
274 | 		int check = dynstr_append(target, buffer_begin(pp), word_length, pp);
275 | 		PF_FAIL_BUBBLE_CHECK(pp, check);
276 | 
277 | 		check = buffer_advance(pp, word_length);
278 | 		PF_FAIL_BUBBLE_CHECK(pp, check);
279 | 	}
280 | 
281 | cleanup:
282 | 	return return_code;
283 | }
284 | 
285 | static int skip_whitespace(struct pfasta_parser *pp) {
286 | 	int return_code = 0;
287 | 
288 | 	while (my_isspace(buffer_peek(pp))) {
289 | 		char *split = find_first_not_space(buffer_begin(pp), buffer_end(pp));
290 | 
291 | 		// advance may clear the buffer. So count first …
292 | 		size_t newlines = count_newlines(buffer_begin(pp), split);
293 | 		int check = buffer_advance(pp, split - buffer_begin(pp));
294 | 		PF_FAIL_BUBBLE_CHECK(pp, check);
295 | 
296 | 		// … and then increase the counter.
297 | 		pp->line_number += newlines;
298 | 	}
299 | 
300 | cleanup:
301 | 	return return_code;
302 | }
303 | 
304 | struct pfasta_parser pfasta_init(int file_descriptor) {
305 | 	int return_code = 0;
306 | 	struct pfasta_parser pp = {0};
307 | 	pp.line_number = 1;
308 | 
309 | 	pp.file_descriptor = file_descriptor;
310 | 	int check = buffer_init(&pp);
311 | 	if (check && check != E_EOF) PF_FAIL_BUBBLE_CHECK(&pp, check);
312 | 
313 | 	if (buffer_is_empty(&pp) || buffer_is_eof(&pp)) {
314 | 		PF_FAIL_STR(&pp, "File is empty.");
315 | 	}
316 | 
317 | 	if (buffer_peek(&pp) != '>') {
318 | 		PF_FAIL_STR(&pp, "File must start with '>'.");
319 | 	}
320 | 
321 | cleanup:
322 | 	// free buffer if necessary
323 | 	if (return_code) {
324 | 		pfasta_free(&pp);
325 | 	}
326 | 	pp.done = return_code || buffer_is_eof(&pp);
327 | 	return pp;
328 | }
329 | 
330 | struct pfasta_record pfasta_read(struct pfasta_parser *pp) {
331 | 	int return_code = 0;
332 | 	struct pfasta_record pr = {0};
333 | 
334 | 	int check = pfasta_read_name(pp, &pr);
335 | 	PF_FAIL_BUBBLE_CHECK(pp, check);
336 | 
337 | 	check = pfasta_read_comment(pp, &pr);
338 | 	PF_FAIL_BUBBLE_CHECK(pp, check);
339 | 
340 | 	check = pfasta_read_sequence(pp, &pr);
341 | 	PF_FAIL_BUBBLE_CHECK(pp, check);
342 | 
343 | cleanup:
344 | 	if (return_code) {
345 | 		pfasta_record_free(&pr);
346 | 		pfasta_free(pp);
347 | 	}
348 | 	pp->done = return_code || buffer_is_eof(pp);
349 | 	return pr;
350 | }
351 | 
352 | int pfasta_read_name(struct pfasta_parser *pp, struct pfasta_record *pr) {
353 | 	int return_code = 0;
354 | 
355 | 	dynstr name;
356 | 	dynstr_init(&name, pp);
357 | 	PF_FAIL_BUBBLE(pp);
358 | 
359 | 	assert(!buffer_is_empty(pp));
360 | 	if (buffer_peek(pp) != '>') {
361 | 		PF_FAIL_STR(pp, "Expected '>' but found '%c' on line %zu.",
362 | 		            buffer_peek(pp), pp->line_number);
363 | 	}
364 | 
365 | 	int check = buffer_advance(pp, 1); // skip >
366 | 	if (check == E_EOF)
367 | 		PF_FAIL_STR(pp, "Unexpected EOF in name on line %zu.", pp->line_number);
368 | 	PF_FAIL_BUBBLE(pp);
369 | 
370 | 	check = copy_word(pp, &name);
371 | 	if (check == E_EOF)
372 | 		PF_FAIL_STR(pp, "Unexpected EOF in name on line %zu.", pp->line_number);
373 | 	PF_FAIL_BUBBLE(pp);
374 | 
375 | 	if (dynstr_len(&name) == 0)
376 | 		PF_FAIL_STR(pp, "Empty name on line %zu.", pp->line_number);
377 | 
378 | 	pr->name_length = dynstr_len(&name);
379 | 	pr->name = dynstr_move(&name);
380 | 
381 | cleanup:
382 | 	if (return_code) {
383 | 		dynstr_free(&name);
384 | 	}
385 | 	return return_code;
386 | }
387 | 
/** @brief Reads the optional comment that follows the name on a header line.
 *
 * If the name is directly followed by a line feed there is no comment and
 * `pr->comment` is set to NULL. Otherwise the single whitespace byte after
 * the name is skipped and everything up to, but not including, the next line
 * feed becomes the comment. The line feed itself is left unread.
 *
 * @param pp - The parser to read from.
 * @param pr - The record receiving `comment` and `comment_length`.
 *
 * @returns 0 on success, a non-zero status otherwise; the parser's `errstr`
 * then explains the failure.
 */
int pfasta_read_comment(struct pfasta_parser *pp, struct pfasta_record *pr) {
	int return_code = 0;

	// Header line ends right after the name: no comment, nothing allocated.
	if (buffer_peek(pp) == '\n') {
		pr->comment_length = 0;
		pr->comment = NULL;
		return 0;
	}

	dynstr comment;
	dynstr_init(&comment, pp);
	PF_FAIL_BUBBLE(pp);

	assert(!buffer_is_empty(pp));

	int check = buffer_advance(pp, 1); // skip first whitespace
	if (check == E_EOF) goto label_eof;
	PF_FAIL_BUBBLE(pp);

	assert(!buffer_is_empty(pp));

	// get comment
	// Bytes are copied one at a time until the terminating line feed.
	while (buffer_peek(pp) != '\n') {
		check = dynstr_append(&comment, buffer_begin(pp), 1, pp);
		PF_FAIL_BUBBLE_CHECK(pp, check);

		check = buffer_advance(pp, 1);
		if (check == E_EOF) goto label_eof;
		PF_FAIL_BUBBLE_CHECK(pp, check);
	}

label_eof:
	// Reached both by falling out of the loop and by the EOF jumps above; a
	// header line that is not followed by a line feed is an error.
	if (buffer_is_eof(pp))
		PF_FAIL_STR(pp, "Unexpected EOF in comment on line %zu.",
		            pp->line_number);

	pr->comment_length = dynstr_len(&comment);
	pr->comment = dynstr_move(&comment);

cleanup:
	if (return_code) {
		dynstr_free(&comment);
	}
	return return_code;
}
433 | 
434 | int pfasta_read_sequence(struct pfasta_parser *pp, struct pfasta_record *pr) {
435 | 	int return_code = 0;
436 | 
437 | 	dynstr sequence;
438 | 	dynstr_init(&sequence, pp);
439 | 	PF_FAIL_BUBBLE(pp);
440 | 
441 | 	assert(!buffer_is_empty(pp));
442 | 	assert(!buffer_is_eof(pp));
443 | 	assert(buffer_peek(pp) == '\n');
444 | 
445 | 	int check = skip_whitespace(pp);
446 | 	if (check == E_EOF)
447 | 		PF_FAIL_STR(pp, "Empty sequence on line %zu.", pp->line_number);
448 | 	PF_FAIL_BUBBLE_CHECK(pp, check);
449 | 
450 | 	// Assume a line begins only with alpha, -, *, or more spaces
451 | 	char c;
452 | 	while (c = buffer_peek(pp), LIKELY(isalpha(c) || c == '-' || c == '*')) {
453 | 		int check = copy_word(pp, &sequence);
454 | 		if (UNLIKELY(check == E_EOF)) break;
455 | 		PF_FAIL_BUBBLE_CHECK(pp, check);
456 | 
457 | 		// optimize for more common case
458 | 		ptrdiff_t length = buffer_end(pp) - buffer_begin(pp);
459 | 		if (LIKELY(length >= 2 && buffer_begin(pp)[0] == '\n' &&
460 | 		           buffer_begin(pp)[1] > ' ')) {
461 | 			pp->read_ptr++; // nasty hack
462 | 			pp->line_number += 1;
463 | 		} else {
464 | 			check = skip_whitespace(pp);
465 | 			if (UNLIKELY(check == E_EOF)) break;
466 | 			PF_FAIL_BUBBLE_CHECK(pp, check);
467 | 		}
468 | 	}
469 | 
470 | 	if (dynstr_len(&sequence) == 0)
471 | 		PF_FAIL_STR(pp, "Empty sequence on line %zu.", pp->line_number);
472 | 
473 | 	pr->sequence_length = dynstr_len(&sequence);
474 | 	pr->sequence = dynstr_move(&sequence);
475 | 	pp->errstr = NULL; // reset error
476 | 
477 | cleanup:
478 | 	if (return_code) {
479 | 		dynstr_free(&sequence);
480 | 	}
481 | 	return return_code;
482 | }
483 | 
484 | void pfasta_record_free(struct pfasta_record *pr) {
485 | 	if (!pr) return;
486 | 	free(pr->name);
487 | 	free(pr->comment);
488 | 	free(pr->sequence);
489 | 	pr->name = pr->comment = pr->sequence = NULL;
490 | }
491 | 
492 | void pfasta_free(struct pfasta_parser *pp) {
493 | 	if (!pp) return;
494 | 	free(pp->buffer);
495 | 	pp->buffer = NULL;
496 | }
497 | 
498 | /** @brief Creates a new string that can grow dynamically.
499 |  *
500 |  * @param ds - A reference to the dynstr container.
501 |  *
502 |  * @returns 0 iff successful.
503 |  */
504 | static inline int dynstr_init(dynstr *ds, struct pfasta_parser *pp) {
505 | 	int return_code = 0;
506 | 
507 | 	*ds = (dynstr){NULL, 0, 0};
508 | 	ds->str = malloc(61);
509 | 	if (!ds->str) PF_FAIL_ERRNO(pp);
510 | 
511 | 	ds->str[0] = '\0';
512 | 	ds->capacity = 61;
513 | 	ds->count = 0;
514 | 
515 | cleanup:
516 | 	return return_code;
517 | }
518 | 
519 | /** @brief A append more than one character to a string.
520 |  *
521 |  * @param ds - A reference to the dynstr container.
522 |  * @param str - The new characters.
523 |  * @param length - number of new characters to append
524 |  *
525 |  * @returns 0 iff successful.
526 |  */
527 | static inline int dynstr_append(dynstr *ds, const char *str, size_t length,
528 |                                 struct pfasta_parser *pp) {
529 | 	int return_code = 0;
530 | 	size_t required = ds->count + length;
531 | 
532 | 	if (UNLIKELY(required >= ds->capacity)) {
533 | 		char *neu = pfasta_reallocarray(ds->str, required / 2, 3);
534 | 		if (UNLIKELY(!neu)) {
535 | 			dynstr_free(ds);
536 | 			PF_FAIL_ERRNO(pp);
537 | 		}
538 | 		ds->str = neu;
539 | 		ds->capacity = (required / 2) * 3;
540 | 	}
541 | 
542 | 	memcpy(ds->str + ds->count, str, length);
543 | 	ds->count = required;
544 | 
545 | cleanup:
546 | 	return return_code;
547 | }
548 | 
549 | /** @brief Frees a dynamic string. */
550 | static inline void dynstr_free(dynstr *ds) {
551 | 	if (!ds) return;
552 | 	free(ds->str);
553 | 	*ds = (dynstr){NULL, 0, 0};
554 | }
555 | 
556 | /** @brief Returns the string as a standard `char*`. The internal reference is
557 |  * then deleted. Hence the name *move* as in *move semantics*.
558 |  *
559 |  * @param ds - The dynamic string to move from.
560 |  *
561 |  * @returns a `char*` to a standard null-terminated string.
562 |  */
563 | static inline char *dynstr_move(dynstr *ds) {
564 | 	char *out = pfasta_reallocarray(ds->str, ds->count + 1, 1);
565 | 	if (!out) {
566 | 		out = ds->str;
567 | 	}
568 | 	out[ds->count] = '\0';
569 | 	*ds = (dynstr){NULL, 0, 0};
570 | 	return out;
571 | }
572 | 
573 | /** @brief Returns the current length of the dynamic string. */
574 | static inline size_t dynstr_len(const dynstr *ds) { return ds->count; }
575 | 
// Declared weak so that the address is NULL when the C library does not
// provide reallocarray() at link time.
__attribute__((weak)) void *reallocarray(void *ptr, size_t nmemb, size_t size);

/**
 * @brief Resizes an array of `nmemb` elements of `size` bytes each.
 *
 * Uses the C library's reallocarray() when it is available. Otherwise it
 * falls back to realloc(), after checking that `nmemb * size` does not
 * overflow, so the fallback is as safe as the real thing.
 *
 * @param ptr - The block to resize, or NULL to allocate a new one.
 * @param nmemb - Number of elements.
 * @param size - Size of one element in bytes.
 *
 * @returns the resized block, or NULL on failure. If the multiplication
 * would overflow, `errno` is set to ENOMEM and `ptr` is left untouched.
 */
void *pfasta_reallocarray(void *ptr, size_t nmemb, size_t size) {
	if (reallocarray != NULL) {
		return reallocarray(ptr, nmemb, size);
	}

	// Fallback: reject products that do not fit into a size_t instead of
	// silently allocating a block that is too small.
	if (size != 0 && nmemb > SIZE_MAX / size) {
		errno = ENOMEM;
		return NULL;
	}
	return realloc(ptr, nmemb * size);
}
588 | 


--------------------------------------------------------------------------------
/libs/pfasta.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2015-2020, Fabian Klötzl <fabian-pfasta@kloetzl.info>
 3 |  *
 4 |  * Permission to use, copy, modify, and/or distribute this software for any
 5 |  * purpose with or without fee is hereby granted, provided that the above
 6 |  * copyright notice and this permission notice appear in all copies.
 7 |  *
 8 |  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 9 |  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 |  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 |  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 |  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 |  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 |  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 |  *
16 |  */
17 | 
18 | #ifndef PFASTA_H
19 | #define PFASTA_H
20 | 
21 | #ifdef __cplusplus
22 | extern "C" {
23 | #endif
24 | 
25 | #include <stddef.h>
26 | 
/**
 * There is no magic to this structure. It's just a container of three strings.
 * Feel free to duplicate or move them. But don't forget to free the data after
 * usage!
 */
struct pfasta_record {
	// Null-terminated heap strings, released by pfasta_record_free().
	// `comment` is NULL when the header line carries no comment.
	char *name, *comment, *sequence;
	// Lengths of the strings above, not counting the terminating null byte.
	size_t name_length, comment_length, sequence_length;
};
36 | 
/**
 * This structure holds a number of members to represent the state of the FASTA
 * parser. Please make sure that it is properly initialized before usage.
 * Always free this structure when the parser is done.
 */
struct pfasta_parser {
	// Message describing the last error, or NULL if there was none. The
	// string is owned by the library and must not be freed.
	const char *errstr;
	// Non-zero once an error occurred or the end of the input was reached.
	int done;

	/*< private -- do not touch! >*/
	int file_descriptor;       // descriptor the data is read from
	char *buffer;              // heap-allocated read buffer
	char *read_ptr, *fill_ptr; // next unread byte / one past the last valid byte
	size_t line_number;        // current line in the input, starting at 1
};
52 | 
53 | /**
54 |  * This function initializes a `pfasta_parser` struct with a parser bound to a
55 |  * specific file descriptor. Iff an error occurred `errstr` is set to contain a
56 |  * suitable message. Otherwise you can read data from it as long as `done` isn't
57 |  * set. The parser should be freed after usage.
58 |  *
59 |  * Please note that the user is responsible for opening the file descriptor as
60 |  * readable and closing after usage.
61 |  */
62 | struct pfasta_parser pfasta_init(int file_descriptor);
63 | 
64 | /**
65 |  * Using a properly initialized parser, this function can read FASTA sequences.
66 |  * These are stored in the simple structure and returned. On error, the `errstr`
67 |  * property of the parser is set.
68 |  */
69 | struct pfasta_record pfasta_read(struct pfasta_parser *pp);
70 | 
71 | /**
72 |  * This function frees the resources held by a pfasta record.
73 |  */
74 | void pfasta_record_free(struct pfasta_record *pr);
75 | 
76 | /**
77 |  * This function frees the resources held by a pfasta parser.
78 |  */
79 | void pfasta_free(struct pfasta_parser *pp);
80 | 
81 | /**
82 |  * Get a string defining the version of the pfasta library.
83 |  */
84 | const char *pfasta_version(void);
85 | 
86 | /**
87 |  * Returns 0 iff pfasta is not threadsafe.
88 |  */
89 | int pfasta_threadsafe();
90 | 
91 | #ifdef __cplusplus
92 | }
93 | #endif
94 | 
95 | #endif /* PFASTA_H */
96 | 


--------------------------------------------------------------------------------
/m4/ax_cxx_compile_stdcxx_11.m4:
--------------------------------------------------------------------------------
  1 | # ============================================================================
  2 | #  http://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_11.html
  3 | # ============================================================================
  4 | #
  5 | # SYNOPSIS
  6 | #
  7 | #   AX_CXX_COMPILE_STDCXX_11([ext|noext],[mandatory|optional])
  8 | #
  9 | # DESCRIPTION
 10 | #
 11 | #   Check for baseline language coverage in the compiler for the C++11
 12 | #   standard; if necessary, add switches to CXXFLAGS to enable support.
 13 | #
 14 | #   The first argument, if specified, indicates whether you insist on an
 15 | #   extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g.
 16 | #   -std=c++11).  If neither is specified, you get whatever works, with
 17 | #   preference for an extended mode.
 18 | #
 19 | #   The second argument, if specified 'mandatory' or if left unspecified,
 20 | #   indicates that baseline C++11 support is required and that the macro
 21 | #   should error out if no mode with that support is found.  If specified
 22 | #   'optional', then configuration proceeds regardless, after defining
 23 | #   HAVE_CXX11 if and only if a supporting mode is found.
 24 | #
 25 | # LICENSE
 26 | #
 27 | #   Copyright (c) 2008 Benjamin Kosnik <bkoz@redhat.com>
 28 | #   Copyright (c) 2012 Zack Weinberg <zackw@panix.com>
 29 | #   Copyright (c) 2013 Roy Stogner <roystgnr@ices.utexas.edu>
 30 | #   Copyright (c) 2014 Alexey Sokolov <sokolov@google.com>
 31 | #   Copyright (c) 2014, 2015 Google Inc.
 32 | #
 33 | #   Copying and distribution of this file, with or without modification, are
 34 | #   permitted in any medium without royalty provided the copyright notice
 35 | #   and this notice are preserved. This file is offered as-is, without any
 36 | #   warranty.
 37 | 
 38 | #serial 7
 39 | 
 40 | m4_define([_AX_CXX_COMPILE_STDCXX_11_testbody], [[
 41 |   template <typename T>
 42 |     struct check
 43 |     {
 44 |       static_assert(sizeof(int) <= sizeof(T), "not big enough");
 45 |     };
 46 | 
 47 |     struct Base {
 48 |     virtual void f() {}
 49 |     };
 50 |     struct Child : public Base {
 51 |     virtual void f() override {}
 52 |     };
 53 | 
 54 |     typedef check<check<bool>> right_angle_brackets;
 55 | 
 56 |     int a;
 57 |     decltype(a) b;
 58 | 
 59 |     typedef check<int> check_type;
 60 |     check_type c;
 61 |     check_type&& cr = static_cast<check_type&&>(c);
 62 | 
 63 |     auto d = a;
 64 |     auto l = [](){};
 65 | 
 66 |     // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
 67 |     // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this
 68 |     namespace test_template_alias_sfinae {
 69 |         struct foo {};
 70 | 
 71 |         template<typename T>
 72 |         using member = typename T::member_type;
 73 | 
 74 |         template<typename T>
 75 |         void func(...) {}
 76 | 
 77 |         template<typename T>
 78 |         void func(member<T>*) {}
 79 | 
 80 |         void test() {
 81 |             func<foo>(0);
 82 |         }
 83 |     }
 84 | ]])
 85 | 
 86 | AC_DEFUN([AX_CXX_COMPILE_STDCXX_11], [dnl
 87 |   m4_if([$1], [], [],
 88 |         [$1], [ext], [],
 89 |         [$1], [noext], [],
 90 |         [m4_fatal([invalid argument `$1' to AX_CXX_COMPILE_STDCXX_11])])dnl
 91 |   m4_if([$2], [], [ax_cxx_compile_cxx11_required=true],
 92 |         [$2], [mandatory], [ax_cxx_compile_cxx11_required=true],
 93 |         [$2], [optional], [ax_cxx_compile_cxx11_required=false],
 94 |         [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX_11])])
 95 |   AC_LANG_PUSH([C++])dnl
 96 |   ac_success=no # becomes yes as soon as one mode compiles the test body
 97 |   AC_CACHE_CHECK(whether $CXX supports C++11 features by default,
 98 |   ax_cv_cxx_compile_cxx11,
 99 |   [AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
100 |     [ax_cv_cxx_compile_cxx11=yes],
101 |     [ax_cv_cxx_compile_cxx11=no])])
102 |   if test x$ax_cv_cxx_compile_cxx11 = xyes; then
103 |     ac_success=yes
104 |   fi
105 | 
106 |   m4_if([$1], [noext], [], [dnl
107 |   if test x$ac_success = xno; then
108 |     for switch in -std=gnu++11 -std=gnu++0x; do # extended modes are tried first
109 |       cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch])
110 |       AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch,
111 |                      $cachevar,
112 |         [ac_save_CXXFLAGS="$CXXFLAGS"
113 |          CXXFLAGS="$CXXFLAGS $switch"
114 |          AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
115 |           [eval $cachevar=yes],
116 |           [eval $cachevar=no])
117 |          CXXFLAGS="$ac_save_CXXFLAGS"])
118 |       if eval test x\$$cachevar = xyes; then
119 |         CXXFLAGS="$CXXFLAGS $switch"
120 |         ac_success=yes
121 |         break
122 |       fi
123 |     done
124 |   fi])
125 | 
126 |   m4_if([$1], [ext], [], [dnl
127 |   if test x$ac_success = xno; then
128 |     for switch in -std=c++11 -std=c++0x; do # strict conformance modes
129 |       cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch])
130 |       AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch,
131 |                      $cachevar,
132 |         [ac_save_CXXFLAGS="$CXXFLAGS"
133 |          CXXFLAGS="$CXXFLAGS $switch"
134 |          AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
135 |           [eval $cachevar=yes],
136 |           [eval $cachevar=no])
137 |          CXXFLAGS="$ac_save_CXXFLAGS"])
138 |       if eval test x\$$cachevar = xyes; then
139 |         CXXFLAGS="$CXXFLAGS $switch"
140 |         ac_success=yes
141 |         break
142 |       fi
143 |     done
144 |   fi])
145 |   AC_LANG_POP([C++])
146 |   if test x$ax_cxx_compile_cxx11_required = xtrue; then # mandatory: failure is fatal
147 |     if test x$ac_success = xno; then
148 |       AC_MSG_ERROR([*** A compiler with support for C++11 language features is required.])
149 |     fi
150 |   else
151 |     if test x$ac_success = xno; then
152 |       HAVE_CXX11=0
153 |       AC_MSG_NOTICE([No compiler with C++11 support was found])
154 |     else
155 |       HAVE_CXX11=1
156 |       AC_DEFINE(HAVE_CXX11,1,
157 |                 [define if the compiler supports basic C++11 syntax])
158 |     fi
159 | 
160 |     AC_SUBST(HAVE_CXX11)
161 |   fi
162 | ])
163 | 


--------------------------------------------------------------------------------
/opt/Makefile.am:
--------------------------------------------------------------------------------
 1 | noinst_LIBRARIES= libcompat.a
 2 | libcompat_a_SOURCES= compat-string.h compat-stdlib.h
 3 | 
 4 | if !HAVE_STRCHRNUL
 5 | libcompat_a_SOURCES+= strchrnul.c
 6 | endif
 7 | 
 8 | if !HAVE_REALLOCARRAY
 9 | libcompat_a_SOURCES+= reallocarray.c
10 | endif
11 | 


--------------------------------------------------------------------------------
/opt/compat-stdlib.h:
--------------------------------------------------------------------------------
1 | #include <stdlib.h>
2 | 
3 | void *reallocarray(void *optr, size_t nmemb, size_t size);
4 | 


--------------------------------------------------------------------------------
/opt/compat-string.h:
--------------------------------------------------------------------------------
1 | #ifndef HAVE_STRCHRNUL
2 | char *strchrnul(const char *s, int c);
3 | #endif
4 | 


--------------------------------------------------------------------------------
/opt/reallocarray.c:
--------------------------------------------------------------------------------
 1 | #include <errno.h>
 2 | #include <stdint.h>
 3 | #include "compat-stdlib.h"
 4 | 
 5 | /*
 6 |  * Copyright (c) 2008 Otto Moerbeek <otto@drijf.net>
 7 |  *
 8 |  * Permission to use, copy, modify, and distribute this software for any
 9 |  * purpose with or without fee is hereby granted, provided that the above
10 |  * copyright notice and this permission notice appear in all copies.
11 |  *
12 |  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
13 |  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
14 |  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
15 |  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
16 |  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
17 |  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
18 |  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19 |  */
20 | 
21 | /*
22 |  * sqrt(SIZE_MAX+1): as long as both factors stay below this bound, their
23 |  * product cannot exceed SIZE_MAX, so the costly division can be skipped.
24 |  */
25 | #define MUL_NO_OVERFLOW ((size_t)1 << (sizeof(size_t) * 4))
26 | 
27 | void *reallocarray(void *optr, size_t nmemb, size_t size) {
28 | 	int may_overflow = nmemb >= MUL_NO_OVERFLOW || size >= MUL_NO_OVERFLOW;
29 | 	if (may_overflow && nmemb > 0 && size > SIZE_MAX / nmemb) {
30 | 		errno = ENOMEM; /* nmemb * size does not fit into a size_t */
31 | 		return NULL;
32 | 	}
33 | 	return realloc(optr, nmemb * size);
34 | }
35 | 


--------------------------------------------------------------------------------
/opt/strchrnul.c:
--------------------------------------------------------------------------------
 1 | /* @brief Here follows a simple implementation of the GNU function `strchrnul`.
 2 |  * Please check the gnulib manual for details.
 3 |  */
 4 | #include <string.h>
 5 | 
 6 | char *strchrnul(const char *s, int c){
 7 | 	char *hit = strchr(s, c);
 8 | 	/* no match: fall back to the address of the terminating NUL byte */
 9 | 	if (hit == NULL) hit = strchr(s, '\0');
10 | 	return hit; }
11 | 


--------------------------------------------------------------------------------
/scripts/_andi:
--------------------------------------------------------------------------------
 1 | #compdef andi
 2 | 
 3 | # This file allows zsh to complete arguments for andi. As the syntax is
 4 | # totally non-obvious, I'll explain the basics here. For details see
 5 | #  http://zsh.sourceforge.net/Doc/Release/Completion-System.html
 6 | # Each line consists of three parts: (A){B}[C]
 7 | # The B part performs brace expansion as on the commandline. Thus each
 8 | # line with braces gets translated into multiple arguments! Also the
 9 | # B part lists the relevant argument for which we are trying to set
10 | # the completion rules. The A part simply states that B shall not be
11 | # completed if A is already present. i.e. Most flags only make sense once,
12 | # with the exception of -v. The string C is simply the message that is
13 | # displayed to the user.
14 | 
15 | local info="-h --help --version" # once one of these is present, the options below are no longer offered
16 | local ret=1 # stays 1 unless _arguments succeeds
17 | local -a args
18 | 
19 | args+=(
20 | 	"($info -b --bootstrap)"{-b+,--bootstrap=}'[Print additional bootstrap matrices]:int:'
21 | 	"($info)*--file-of-filenames=[Read additional filenames from file; one per line]:file:_files"
22 | 	"($info -j --join)"{-j,--join}'[Treat all sequences from one file as a single genome]'
23 | 	"($info -l --low-memory)"{-l,--low-memory}'[Use less memory at the cost of speed]'
24 | 	"($info -m --model)"{-m+,--model=}'[Pick an evolutionary model]:model:((
25 | 		Raw\:Uncorrected\ distances
26 | 		JC\:Jukes\-Cantor\ corrected
27 | 		Kimura\:Kimura\-two\-parameter
28 | 		LogDet\:Logarithmic\ determinant
29 | 	))'
30 | 	"($info)-p+[Significance of an anchor; default\: 0.025]:float:"
31 | 	"($info)--progress=[Show progress bar]:when:(always auto never)"
32 | 	"($info -t --threads)"{-t+,--threads=}'[The number of threads to be used; by default, all available processors are used]:num_threads:'
33 | 	"($info)--truncate-names[Print only the first ten characters of each name]"
34 | 	"($info)*"{-v,--verbose}'[Prints additional information]'
35 | 	'(- *)'{-h,--help}'[Display help and exit]'
36 | 	'(- *)--version[Output version information and acknowledgments]'
37 | 	'*:file:_files'
38 | )
39 | 
40 | _arguments -w -s -S $args[@] && ret=0
41 | 
42 | return ret
43 | 


--------------------------------------------------------------------------------
/scripts/failed.zsh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/zsh
 2 | 
 3 | # Compute the number of failing comparisons for different distances.
 4 | 
 5 | DISTS=(0.1 0.2 0.3 0.35 0.4 0.45 0.5 0.55 0.6 0.65 0.7)
 6 | 
 7 | LENGTH=100000
 8 | 
 9 | for dist in $DISTS; do
10 | 	: > est_$dist.dist # truncate only; a leading blank line would be counted as a zero sample by the sd step
11 | 	for (( i = 0; i < 1000; i++ )); do
12 | 		../test/test_fasta -l $LENGTH -d $dist > temp.fa
13 | 		../src/andi ./temp.fa > est.dist 2> /dev/null
14 | 		tail -n 1 est.dist >> est_$dist.dist # keep only the last line of andi's output
15 | 	done
16 | 	avg=$(cat est_$dist.dist | awk '"nan" !~ $2 {sum+=$2;c++}END{print sum/c}')
17 | 	sd=$(grep -v 'nan' est_$dist.dist | awk '{a[c++]=$2;aa+=$2}END{aa/=NR;for(c=0;c<NR;c++){t=a[c]-aa;sd+=t*t}print sqrt(sd/(NR-1))}')
18 | 	failed=$(cat est_$dist.dist | grep -c 'nan')
19 | 	echo $dist "\t" $avg"\t±" $sd "\t" $failed
20 | done
21 | 


--------------------------------------------------------------------------------
/scripts/maf2phy.awk:
--------------------------------------------------------------------------------
 1 | # maf2phy.awk
 2 | # Author: Bernhard Haubold, haubold@evolbio.mpg.de
 3 | # Contributors: Fabian Klötzl, kloetzl@evolbio.mpg.de
 4 | # Date: June 19, 2014
 5 | # Last Modified: February 5, 2015
 6 | BEGIN{
 7 |   if(!n){
 8 |     print "maf2phy.awk: Convert mutation annotation format (maf) as generated by the program mugsy to PHYLIP";
 9 |     print "Usage: awk -f maf2phy.awk -v n=<numberOfTaxa> file.maf > file.phy";
10 |     exit(-1);
11 |   }
12 |   numNames = 0;  # number of distinct sequence names collected so far
13 |   test = "mult=" n;  # only "a" blocks containing this text are used
14 | }{
15 |   if(/^a/){
16 |     if($0 ~ test)
17 |       open = 1;
18 |     else
19 |       open = 0;
20 |   }
21 |   if(open && /^s/){
22 |     if(!s[$2])  # first time this name shows up: remember its order
23 |       names[numNames++] = $2;
24 |     s[$2] = s[$2] $7;  # append this block's text to the running sequence
25 |   }
26 | }END{
27 |   # check equal length of sequences
28 |   len = -1;
29 |   for(i=0;i<numNames;i++){
30 |     name = names[i];
31 |     if(len > 0){
32 |       if(length(s[name]) != len){
33 | 	print "sequence length should be " len " but is in fact " length(s[name]);
34 | 	exit(-1);
35 |       }
36 |     }else
37 |       len = length(s[name]);
38 |   }
39 |   print numNames, len;
40 |   start = 1;
41 |   l = 60;  # columns per output block
42 |   for(i=0;i<numNames;i++){
43 |     name = names[i];
44 |     printf("%-10.10s",name);  # only the first block carries the padded names
45 |     print(" " substr(s[name],start,l));
46 |   }
47 |   printf("\n");
48 |   start += l;
49 |   while(start <= len){  # substr is 1-based: <= keeps a final one-column block
50 |     for(i=0;i<numNames;i++){
51 |       name = names[i];
52 |       print(substr(s[name],start,l));
53 |     }
54 |     printf("\n");
55 |     start += l;
56 |   }
57 | }
58 | 
59 | 	    


--------------------------------------------------------------------------------
/scripts/vmatch.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/bash -f
 2 | # Simulate the anchor distance using vmatch.
 3 | 
 4 | # These are all sequences from the ECO29 set.
 5 | SEQS="AE005174.fasta AE005674.fasta AE014073.fasta AE014075.fasta AP009048.fasta AP009240.fasta BA000007.fasta CP000034.fasta CP000036.fasta CP000038.fasta CP000243.fasta CP000247.fasta CP000266.fasta CP000468.fasta CP000800.fasta CP000802.fasta CP000946.fasta CP000948.fasta CP000970.fasta CP001063.fasta CP001396.fasta CP001846.fasta CU928160.fasta CU928161.fasta CU928162.fasta CU928163.fasta CU928164.fasta FM180568.fasta U00096.fasta"
 6 | 
 7 | # Loop over all sequences
 8 | for S in $SEQS; do
 9 | 	echo $S;
10 | 	Q=${SEQS/$S/}; # All sequences, except S
11 | 
12 | 	# Create the index for S
13 | 	./mkvtree -db "$S" -dna -allout -pl
14 | 
15 | 	# Recall, that an anchor is unique in S and of some minimum length. Hence
16 | 	# the parameters -mum cand and -l 16. The latter threshold was calculated
17 | 	# by andi.
18 | 
19 | 	# Match all other sequences including their reverse against S
20 | 	./vmatch -q $Q -mum cand -l 16 -d -p -nodist -noscore -noevalue -noidentity $S > /dev/null
21 | 
22 | 	# In theory we have to calculate the anchors distance here using the previously
23 | 	# computed matches. But since vmatch is already significantly slower than andi,
24 | 	# we skip this step.
25 | done
26 | 
27 | 


--------------------------------------------------------------------------------
/src/Makefile.am:
--------------------------------------------------------------------------------
 1 | bin_PROGRAMS = andi
 2 | 
 3 | andi_SOURCES = andi.c esa.c process.c sequence.c io.c global.h esa.h process.h sequence.h io.h dist_hack.h \
 4 | model.h model.c
 5 | andi_CPPFLAGS = $(OPENMP_CFLAGS) -I$(top_srcdir)/libs -I$(top_srcdir)/opt -std=gnu99
 6 | andi_CFLAGS = $(OPENMP_CFLAGS) -Wall -Wextra -Wno-missing-field-initializers
 7 | andi_LDADD = $(top_builddir)/libs/libpfasta.a $(top_builddir)/opt/libcompat.a
 8 | 
 9 | .PHONY: perf
10 | perf: CFLAGS+= -g -O3 -ggdb -fno-omit-frame-pointer
11 | perf: andi
12 | 


--------------------------------------------------------------------------------
/src/andi.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @file
  3 |  *
  4 |  * This is the main file. It contains functions to parse the commandline
  5 |  * arguments, read files etc.
  6 |  *
  7 |  * @brief The main file
  8 |  * @author Fabian Klötzl
  9 |  *
 10 |  * @section License
 11 |  *
 12 |  * This program is free software; you can redistribute it and/or
 13 |  * modify it under the terms of the GNU General Public License as
 14 |  * published by the Free Software Foundation; either version 3 of
 15 |  * the License, or (at your option) any later version.
 16 |  *
 17 |  * This program is distributed in the hope that it will be useful, but
 18 |  * WITHOUT ANY WARRANTY; without even the implied warranty of
 19 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 20 |  * General Public License for more details at
 21 |  * http://www.gnu.org/copyleft/gpl.html
 22 |  *
 23 |  */
 24 | 
 25 | #include "global.h"
 26 | #include "io.h"
 27 | #include "process.h"
 28 | #include "sequence.h"
 29 | #include <assert.h>
 30 | #include <errno.h>
 31 | #include <getopt.h>
 32 | #include <gsl/gsl_rng.h>
 33 | #include <limits.h>
 34 | #include <stdio.h>
 35 | #include <stdlib.h>
 36 | #include <string.h>
 37 | #include <time.h>
 38 | #include <unistd.h>
 39 | 
 40 | #ifdef _OPENMP
 41 | #include <omp.h>
 42 | #endif
 43 | 
 44 | /* Global variables */
 45 | int FLAGS = 0;
 46 | int THREADS = 1;
 47 | long unsigned int BOOTSTRAP = 0;
 48 | double ANCHOR_P_VALUE = 0.025;
 49 | gsl_rng *RNG = NULL;
 50 | int MODEL = M_JC;
 51 | 
 52 | void usage(int);
 53 | void version(void);
 54 | 
 55 | /**
 56 |  * @brief The main function.
 57 |  *
 58 |  * The main function reads and parses the commandline arguments. Depending on
 59 |  * the set flags it reads the input files and forwards the contained sequences
 60 |  * to processing. Also it verifies the input for correctness and issues warnings
 61 |  * and errors.
 62 |  */
 63 | int main(int argc, char *argv[]) {
 64 | 	struct option long_options[] = {
 65 | 		{"version", no_argument, NULL, 0},
 66 | 		{"truncate-names", no_argument, NULL, 0},
 67 | 		{"file-of-filenames", required_argument, NULL, 0},
 68 | 		{"progress", optional_argument, NULL, 0},
 69 | 		{"help", no_argument, NULL, 'h'},
 70 | 		{"verbose", no_argument, NULL, 'v'},
 71 | 		{"join", no_argument, NULL, 'j'},
 72 | 		{"low-memory", no_argument, NULL, 'l'},
 73 | 		{"threads", required_argument, NULL, 't'},
 74 | 		{"bootstrap", required_argument, NULL, 'b'},
 75 | 		{"model", required_argument, NULL, 'm'},
 76 | 		{0, 0, 0, 0}};
 77 | 
 78 | #ifdef _OPENMP
 79 | 	// Use all available processors by default.
 80 | 	THREADS = omp_get_num_procs();
 81 | #endif
 82 | 
 83 | 	enum { P_AUTO, P_NEVER, P_ALWAYS } progress = P_AUTO;
 84 | 
 85 | 	struct string_vector file_names;
 86 | 	string_vector_init(&file_names);
 87 | 
 88 | 	// parse arguments
 89 | 	while (1) {
 90 | 
 91 | 		int option_index = 0;
 92 | 		int c = getopt_long(argc, argv, "jvht:p:m:b:l", long_options,
 93 | 							&option_index);
 94 | 
 95 | 		if (c == -1) {
 96 | 			break;
 97 | 		}
 98 | 
 99 | 		switch (c) {
100 | 			case 0: { // long options without a short form; told apart by name
101 | 				const char *option_str = long_options[option_index].name;
102 | 				if (strcasecmp(option_str, "version") == 0) {
103 | 					version();
104 | 				}
105 | 				if (strcasecmp(option_str, "truncate-names") == 0) {
106 | 					FLAGS |= F_TRUNCATE_NAMES;
107 | 				}
108 | 				if (strcasecmp(option_str, "file-of-filenames") == 0) {
109 | 					read_into_string_vector(optarg, &file_names);
110 | 				}
111 | 				if (strcasecmp(option_str, "progress") == 0) {
112 | 					if (!optarg || strcasecmp(optarg, "always") == 0) {
113 | 						progress = P_ALWAYS;
114 | 					} else if (strcasecmp(optarg, "auto") == 0) {
115 | 						progress = P_AUTO;
116 | 					} else if (strcasecmp(optarg, "never") == 0) {
117 | 						progress = P_NEVER;
118 | 					} else {
119 | 						warnx("invalid argument to --progress '%s'. Expected "
120 | 							  "one of 'auto', 'always', or 'never'.",
121 | 							  optarg);
122 | 					}
123 | 				}
124 | 				break;
125 | 			}
126 | 			case 'h': usage(EXIT_SUCCESS); break;
127 | 			case 'v': // a repeated -v additionally sets F_EXTRA_VERBOSE
128 | 				FLAGS |= FLAGS & F_VERBOSE ? F_EXTRA_VERBOSE : F_VERBOSE;
129 | 				break;
130 | 			case 'p': {
131 | 				errno = 0;
132 | 				char *end;
133 | 				double prop = strtod(optarg, &end);
134 | 
135 | 				if (errno || end == optarg || *end != '\0') {
136 | 					soft_errx(
137 | 						"Expected a floating point number for -p argument, but "
138 | 						"'%s' was given. Skipping argument.",
139 | 						optarg);
140 | 					break;
141 | 				}
142 | 
143 | 				if (prop <= 0.0 || prop >= 1.0) {
144 | 					soft_errx("A probability should be a value between 0 and "
145 | 							  "1, exclusive; Ignoring -p %f argument.",
146 | 							  prop);
147 | 					break;
148 | 				}
149 | 
150 | 				ANCHOR_P_VALUE = prop;
151 | 				break;
152 | 			}
153 | 			case 'l': FLAGS |= F_LOW_MEMORY; break;
154 | 			case 'j': FLAGS |= F_JOIN; break;
155 | 			case 't': {
156 | #ifdef _OPENMP
157 | 				errno = 0;
158 | 				char *end;
159 | 				long unsigned int threads = strtoul(optarg, &end, 10);
160 | 				// zero is rejected too: THREADS must stay a positive team size
161 | 				if (errno || end == optarg || *end != '\0' || threads == 0) {
162 | 					warnx("Expected a positive number for -t argument, but "
163 | 						  "'%s' was given. Ignoring -t argument.",
164 | 						  optarg);
165 | 					break;
166 | 				}
167 | 
168 | 				if (threads > (long unsigned int)omp_get_num_procs()) {
169 | 					warnx(
170 | 						"The number of threads to be used, is greater than the "
171 | 						"number of available processors; Ignoring -t %lu "
172 | 						"argument.",
173 | 						threads);
174 | 					break;
175 | 				}
176 | 
177 | 				THREADS = threads;
178 | #else
179 | 				warnx(
180 | 					"This version of andi was built without OpenMP and thus "
181 | 					"does not support multi threading. Ignoring -t argument.");
182 | #endif
183 | 				break;
184 | 			}
185 | 			case 'b': {
186 | 				errno = 0;
187 | 				char *end;
188 | 				long unsigned int bootstrap = strtoul(optarg, &end, 10);
189 | 
190 | 				if (errno || end == optarg || *end != '\0' || bootstrap == 0) {
191 | 					soft_errx(
192 | 						"Expected a positive number for -b argument, but '%s' "
193 | 						"was given. Ignoring -b argument.",
194 | 						optarg);
195 | 					break;
196 | 				}
197 | 				// NOTE(review): presumably the main matrix counts as one of the -b matrices -- confirm
198 | 				BOOTSTRAP = bootstrap - 1;
199 | 				break;
200 | 			}
201 | 			case 'm': {
202 | 				if (strcasecmp(optarg, "RAW") == 0) {
203 | 					MODEL = M_RAW;
204 | 				} else if (strcasecmp(optarg, "JC") == 0) {
205 | 					MODEL = M_JC;
206 | 				} else if (strcasecmp(optarg, "KIMURA") == 0) {
207 | 					MODEL = M_KIMURA;
208 | 				} else if (strcasecmp(optarg, "LOGDET") == 0) {
209 | 					MODEL = M_LOGDET;
210 | 				} else {
211 | 					soft_errx("Ignoring argument for --model. Expected Raw, "
212 | 							  "JC, Kimura or LogDet");
213 | 				}
214 | 				break;
215 | 			}
216 | 			case '?': /* intentional fall-through */
217 | 			default: usage(EXIT_FAILURE); break;
218 | 		}
219 | 	}
220 | 
221 | 	argc -= optind;
222 | 	argv += optind;
223 | 
224 | 	// copy command line arguments into vector
225 | 	// std::copy, anyone?
226 | 	for (size_t i = 0; i < (unsigned int)argc; i++) {
227 | 		string_vector_push_back(&file_names, argv[i]);
228 | 	}
229 | 
230 | 	// at least one file name must be given
231 | 	if (FLAGS & F_JOIN && string_vector_size(&file_names) == 0) {
232 | 		errx(1, "In join mode at least one filename needs to be supplied.");
233 | 	}
234 | 
235 | 	size_t minfiles = FLAGS & F_JOIN ? 2 : 1;
236 | 	if (string_vector_size(&file_names) < minfiles) {
237 | 		// not enough files passed via arguments
238 | 		if (!isatty(STDIN_FILENO)) {
239 | 			// read from stdin in pipe
240 | 			string_vector_push_back(&file_names, "-");
241 | 		} else {
242 | 			// print a helpful message on './andi' without args
243 | 			usage(EXIT_FAILURE);
244 | 		}
245 | 	}
246 | 
247 | 	// parse fasta files
248 | 	dsa_t dsa;
249 | 	dsa_init(&dsa);
250 | 	for (size_t i = 0; i < string_vector_size(&file_names); i++) {
251 | 		char *file_name = string_vector_at(&file_names, i);
252 | 		if (FLAGS & F_JOIN) {
253 | 			read_fasta_join(file_name, &dsa);
254 | 		} else {
255 | 			read_fasta(file_name, &dsa);
256 | 		}
257 | 	}
258 | 
259 | 	string_vector_free(&file_names);
260 | 
261 | 	size_t n = dsa_size(&dsa);
262 | 
263 | 	if (n < 2) {
264 | 		errx(1,
265 | 			 "I am truly sorry, but with less than two sequences (%zu given) "
266 | 			 "there is nothing to compare.",
267 | 			 n);
268 | 	}
269 | 
270 | 	RNG = gsl_rng_alloc(gsl_rng_default);
271 | 	if (!RNG) {
272 | 		err(1, "RNG allocation failed.");
273 | 	}
274 | 
275 | 	// seed the random number generator with the current time
276 | 	// TODO: enable seeding for reproducibility
277 | 	gsl_rng_set(RNG, time(NULL));
278 | 
279 | 	// Warn about non ACGT residues.
280 | 	if (FLAGS & F_NON_ACGT) {
281 | 		warnx("The input sequences contained characters other than acgtACGT. "
282 | 			  "These were automatically stripped to ensure correct results.");
283 | 	}
284 | 
285 | 	// validate sequence correctness
286 | 	const seq_t *seq = dsa_data(&dsa);
287 | 	for (size_t i = 0; i < n; ++i, seq++) {
288 | 		if ((FLAGS & F_TRUNCATE_NAMES) && strlen(seq->name) > 10) {
289 | 			warnx("The sequence name '%s' is longer than ten characters. It "
290 | 				  "will be truncated in the output to '%.10s'.",
291 | 				  seq->name, seq->name);
292 | 		}
293 | 
294 | 		const size_t LENGTH_LIMIT = (INT_MAX - 1) / 2;
295 | 		if (seq->len > LENGTH_LIMIT) {
296 | 			errx(1, "The sequence %s is too long. The technical limit is %zu.",
297 | 				 seq->name, LENGTH_LIMIT);
298 | 		}
299 | 
300 | 		if (seq->len == 0) {
301 | 			errx(1, "The sequence %s is empty.", seq->name);
302 | 		}
303 | 
304 | 		if (seq->len < 1000) {
305 | 			FLAGS |= F_SHORT;
306 | 		}
307 | 	}
308 | 
309 | 	if (FLAGS & F_SHORT) {
310 | 		soft_errx(
311 | 			"One of the given input sequences is shorter than a thousand "
312 | 			"nucleotides. This may result in inaccurate distances. Try an "
313 | 			"alignment instead.");
314 | 	}
315 | 
316 | 	// determine whether to print a progress bar
317 | 	if (progress == P_AUTO) {
318 | 		progress = isatty(STDERR_FILENO) ? P_ALWAYS : P_NEVER;
319 | 	}
320 | 	if (progress == P_ALWAYS) {
321 | 		FLAGS |= F_PRINT_PROGRESS;
322 | 	}
323 | 
324 | 	// compute distance matrix
325 | 	calculate_distances(dsa_data(&dsa), n);
326 | 
327 | 	dsa_free(&dsa);
328 | 	gsl_rng_free(RNG);
329 | 
330 | 	return FLAGS & F_SOFT_ERROR ? EXIT_FAILURE : EXIT_SUCCESS;
331 | }
332 | 
333 | /**
334 |  * @brief Print the help text (stdout on success, else stderr) and exit.
335 |  */
336 | void usage(int status) {
337 | 	static const char help_text[] =
338 | 		"Usage: andi [OPTIONS...] FILES...\n"
339 | 		"\tFILES... can be any sequence of FASTA files.\n"
340 | 		"\tUse '-' as file name to read from stdin.\n"
341 | 		"Options:\n"
342 | 		"  -b, --bootstrap=INT  Print additional bootstrap matrices\n"
343 | 		"      --file-of-filenames=FILE  Read additional filenames from FILE; "
344 | 		"one per line\n"
345 | 		"  -j, --join           Treat all sequences from one file as a single "
346 | 		"genome\n"
347 | 		"  -l, --low-memory     Use less memory at the cost of speed\n"
348 | 		"  -m, --model=MODEL    Pick an evolutionary model of 'Raw', 'JC', "
349 | 		"'Kimura', 'LogDet'; default: JC\n"
350 | 		"  -p FLOAT             Significance of an anchor; default: 0.025\n"
351 | 		"      --progress=WHEN  Print a progress bar 'always', 'never', or "
352 | 		"'auto'; default: auto\n"
353 | #ifdef _OPENMP
354 | 		"  -t, --threads=INT    Set the number of threads; by default, all "
355 | 		"processors are used\n"
356 | #endif
357 | 		"      --truncate-names Truncate names to ten characters\n"
358 | 		"  -v, --verbose        Prints additional information\n"
359 | 		"  -h, --help           Display this help and exit\n"
360 | 		"      --version        Output version information and "
361 | 		"acknowledgments\n";
362 | 	FILE *out = status == EXIT_SUCCESS ? stdout : stderr;
363 | 	fputs(help_text, out);
364 | 	exit(status);
365 | }
366 | 
367 | /**
368 |  * @brief Writes the version text to stdout and then terminates the program
369 |  * successfully. The layout follows the [GNU Coding
370 |  * Standard](http://www.gnu.org/prep/standards/html_node/_002d_002dversion.html#g_t_002d_002dversion).
371 |  */
372 | void version(void) {
373 | 	static const char version_text[] =
374 | 		"andi " VERSION "\n"
375 | 		"Copyright (C) 2014 - 2020 Fabian Klötzl\n"
376 | 		"License GPLv3+: GNU GPL version 3 or later "
377 | 		"<http://gnu.org/licenses/gpl.html>\n"
378 | 		"This is free software: you are free to change and redistribute it.\n"
379 | 		"There is NO WARRANTY, to the extent permitted by law.\n\n"
380 | 		"Acknowledgments:\n"
381 | 		"1) Andi: Haubold, B. Klötzl, F. and Pfaffelhuber, P. (2015). "
382 | 		"Fast and accurate estimation of evolutionary distances between "
383 | 		"closely related genomes, Bioinformatics.\n"
384 | 		"2) Algorithms: Ohlebusch, E. (2013). Bioinformatics Algorithms. "
385 | 		"Sequence Analysis, Genome Rearrangements, and Phylogenetic "
386 | 		"Reconstruction. pp 118f.\n"
387 | 		"3) SA construction: Mori, Y. (2005). libdivsufsort, unpublished.\n"
388 | 		"4) Bootstrapping: Klötzl, F. and Haubold, B. (2016). Support Values "
389 | 		"for Genome Phylogenies, Life 6.1.\n";
390 | 	fputs(version_text, stdout);
391 | 	exit(EXIT_SUCCESS);
392 | }
393 | 


--------------------------------------------------------------------------------
/src/dist_hack.h:
--------------------------------------------------------------------------------
 1 | /** @file
 2 |  * @brief This file is a preprocessor hack for the two functions `distMatrix`
 3 |  * and `distMatrixLM`.
 4 |  */
 5 | // clang-format off
 6 | #ifdef FAST
 7 | #define NAME distMatrix
 8 | #define P_OUTER _Pragma("omp parallel for num_threads( THREADS) default(none) shared(progress_counter) firstprivate( stderr, M, sequences, n, print_progress)")
 9 | #define P_INNER
10 | #else
11 | #undef NAME
12 | #undef P_OUTER
13 | #undef P_INNER
14 | #define NAME distMatrixLM
15 | #define P_OUTER
16 | #define P_INNER _Pragma("omp parallel for num_threads( THREADS) default(none) shared(progress_counter) firstprivate( stderr, M, sequences, n, print_progress, i, E, subject)")
17 | #endif
18 | // clang-format on
19 | 
20 | /** @brief This function calls dist_andi for pairs of subjects and queries, and
21 |  * thereby fills the distance matrix.
22 |  *
 23 |  * This function is actually two functions. It is one template that gets
 24 |  * compiled into two functions via preprocessor hacks. The reason is DRY (Don't
 25 |  * Repeat Yourself).
 26 |  * The two functions only differ by their name and pragmas; i.e., they run in
 27 |  * different parallel modes.
28 |  * `distMatrix` is faster than `distMatrixLM` but needs more memory.
29 |  *
30 |  * @param sequences - The sequences to compare
31 |  * @param n - The number of sequences
32 |  * @param M - A matrix for additional output data
33 |  */
 34 | void NAME(struct model *M, const seq_t *sequences, size_t n) {
 35 | 	size_t i;
 36 | 
 37 | 	size_t progress_counter = 0; // finished comparisons; shared by all threads
 38 | 	int print_progress = FLAGS & F_PRINT_PROGRESS;
 39 | 
 40 | 	if (print_progress) {
 41 | 		fprintf(stderr, "Comparing %zu sequences: %5.1f%% (%zu/%zu)", n, 0.0,
 42 | 				(size_t)0, n * n - n); // n*n - n: the j == i pairs are not counted
 43 | 	}
 44 | 
 45 | 	// P_OUTER is the OpenMP pragma when FAST is defined, and empty otherwise
 46 | 	P_OUTER
 47 | 	for (i = 0; i < n; i++) {
 48 | 		seq_subject subject;
 49 | 		esa_s E;
 50 | 
 51 | 		if (seq_subject_init(&subject, &sequences[i]) ||
 52 | 			esa_init(&E, &subject)) { // a non-zero result from either call is fatal
 53 | 			errx(1, "Failed to create index for %s.", sequences[i].name);
 54 | 		}
 55 | 
 56 | 		// now compare every other sequence to i
 57 | 		size_t j;
 58 | 		// P_INNER is the OpenMP pragma in the non-FAST variant, and empty otherwise
 59 | 		P_INNER
 60 | 		for (j = 0; j < n; j++) {
 61 | 			if (j == i) { // NOTE(review): the 9s look like an arbitrary diagonal placeholder -- confirm they are never read
 62 | 				M(i, j) = (struct model){.seq_len = 9, .counts = {9}};
 63 | 				continue;
 64 | 			}
 65 | 
 66 | 			size_t ql = sequences[j].len;
 67 | 
 68 | 			M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject.threshold);
 69 | 
 70 | #pragma omp atomic update
 71 | 			progress_counter++;
 72 | 		}
 73 | 
 74 | 		if (print_progress) {
 75 | 			size_t local_progress_counter;
 76 | 			size_t num_comparisons = n * n - n;
 77 | 
 78 | #pragma omp atomic read
 79 | 			local_progress_counter = progress_counter;
 80 | 
 81 | 			double progress =
 82 | 				100.0 * (double)local_progress_counter / num_comparisons;
 83 | 
 84 | #pragma omp critical
 85 | 			fprintf(stderr, "\rComparing %zu sequences: %5.1f%% (%zu/%zu)", n,
 86 | 					progress, local_progress_counter, num_comparisons); // "\r" rewrites the same terminal line
 87 | 		}
 88 | 
 89 | 		esa_free(&E);
 90 | 		seq_subject_free(&subject);
 91 | 	}
 92 | 
 93 | 	if (print_progress) {
 94 | 		fprintf(stderr, ", done.\n");
 95 | 	}
 96 | }
97 | 


--------------------------------------------------------------------------------
/src/esa.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @file
  3 |  * @brief ESA functions
  4 |  *
  5 |  * This file contains various functions that operate on an enhanced suffix
  6 |  * array. The basic algorithms originate from the book of Ohlebusch
  7 |  * "Bioinformatics Algorithms" (2013). Most of these were heavily modified
  8 |  * for improved performance. One example is the lcp-cache.
  9 |  *
 10 |  * The ESA structure defined in esa.h contains a `cache` field. This cache is
 11 |  * used to quickly look up lcp-intervals. Consider the queries "AAGT" and
 12 |  * "AACG". In both cases the interval for "AA" has to be looked up in the
 13 |  * ESA. If we simply store the interval for "AA" in the cache, once and use it
 14 |  * for each query we are significantly faster (up to 7 times).
 15 |  */
 16 | #include "esa.h"
 17 | #include "global.h"
 18 | #include <assert.h>
 19 | #include <stdlib.h>
 20 | #include <string.h>
 21 | 
 22 | static void esa_init_cache_dfs(esa_s *, char *str, size_t pos, lcp_inter_t in);
 23 | static void esa_init_cache_fill(esa_s *, char *str, size_t pos, lcp_inter_t in);
 24 | 
 25 | static lcp_inter_t get_interval(const esa_s *, lcp_inter_t ij, char a);
 26 | lcp_inter_t get_match(const esa_s *, const char *query, size_t qlen);
 27 | static lcp_inter_t get_match_from(const esa_s *, const char *query, size_t qlen,
 28 | 								  saidx_t k, lcp_inter_t ij);
 29 | 
 30 | static int esa_init_SA(esa_s *);
 31 | static int esa_init_LCP(esa_s *);
 32 | static int esa_init_CLD(esa_s *);
 33 | 
 34 | /** @brief The prefix length up to which LCP-intervals are cached. */
 35 | const size_t CACHE_LENGTH = 10;
 36 | 
 37 | /** @brief Translate a two bit code back into its nucleotide.
 38 |  *
 39 |  * Only the two lowest bits of `code` are looked at.
 40 |  */
 41 | char code2char(ssize_t code) {
 42 | 	static const char nucleotides[4] = {'A', 'C', 'G', 'T'};
 43 | 	const ssize_t two_bits = code & 0x3;
 44 | 
 45 | 	return nucleotides[two_bits];
 46 | }
 47 | 
 48 | /** @brief Translate a nucleotide into its two bit code.
 49 |  *
 50 |  * Anything but an upper case A, C, G or T yields -1.
 51 |  */
 52 | ssize_t char2code(const char c) {
 53 | 	if (c == 'A') return 0;
 54 | 	if (c == 'C') return 1;
 55 | 	if (c == 'G') return 2;
 56 | 	if (c == 'T') return 3;
 57 | 	return -1;
 58 | }
 59 | 
 60 | #define R(CLD, i) ((CLD)[(i)]) // entry i of the child array
 61 | #define L(CLD, i) ((CLD)[(i)-1]) // entry i-1; R and L share one array
 62 | 
 63 | /** @brief Builds the LCP-interval cache.
 64 |  *
 65 |  * Walking the virtual suffix tree spanned by SA, LCP and CLD is rather slow.
 66 |  * Therefore the LCP-interval of every prefix of length ::CACHE_LENGTH is
 67 |  * stored in a table. This function is the entry point of the routine that
 68 |  * fills this table.
 69 |  *
 70 |  * @param self - The ESA.
 71 |  * @returns 0 iff successful
 72 |  */
 73 | int esa_init_cache(esa_s *self) {
 74 | 	// one slot per string of CACHE_LENGTH nucleotides, i.e. 4^CACHE_LENGTH
 75 | 	const size_t slots = 1 << (2 * CACHE_LENGTH);
 76 | 	self->cache = malloc(slots * sizeof(*self->cache));
 77 | 	CHECK_MALLOC(self->cache);
 78 | 
 79 | 	// scratch buffer for the prefix under construction
 80 | 	char prefix[CACHE_LENGTH + 1];
 81 | 	prefix[CACHE_LENGTH] = '\0';
 82 | 
 83 | 	// start the search at the root interval covering the whole suffix array
 84 | 	saidx_t mid = L(self->CLD, self->len);
 85 | 	lcp_inter_t root = {.i = 0, .j = self->len - 1, .m = mid, .l = self->LCP[mid]};
 86 | 	esa_init_cache_dfs(self, prefix, 0, root);
 87 | 	return 0;
 88 | }
 89 | 
 90 | /** @brief Fills the cache — one char at a time.
 91 |  *
 92 |  * This function is a depth first search on the virtual suffix tree and fills
 93 |  * the cache. Or rather it calls itself until some value to cache is
 94 |  * calculated. This function is a recursive version of get_interval but with
 95 |  * more edge cases.
 96 |  *
 97 |  * @param C - The ESA.
 98 |  * @param str - The current prefix.
 99 |  * @param pos - The length of the prefix.
100 |  * @param in - The LCP-interval of prefix[0..pos-1].
101 |  */
102 | void esa_init_cache_dfs(esa_s *C, char *str, size_t pos, const lcp_inter_t in) {
103 | 	// we are not yet done, but the current strings do not exist in the subject.
104 | 	if (pos < CACHE_LENGTH && in.i == -1 && in.j == -1) {
105 | 		esa_init_cache_fill(C, str, pos, in);
106 | 		return;
107 | 	}
108 | 
109 | 	// we are past the caching length
110 | 	if (pos >= CACHE_LENGTH) {
111 | 		esa_init_cache_fill(C, str, pos, in);
112 | 		return;
113 | 	}
114 | 
115 | 	lcp_inter_t ij;
116 | 
117 | 	// iterate over all nucleotides
118 | 	for (int code = 0; code < 4; ++code) {
119 | 		str[pos] = code2char(code);
120 | 		ij = get_interval(C, in, str[pos]);
121 | 
122 | 		// fail early
123 | 		if (ij.i == -1 && ij.j == -1) {
124 | 			// if the current extension cannot be found, fill with previous one
125 | 			esa_init_cache_fill(C, str, pos + 1, in);
126 | 			continue;
127 | 		}
128 | 
129 | 		// singleton
130 | 		if (ij.i == ij.j) {
131 | 			// fix length
132 | 			ij.l = pos + 1;
133 | 			esa_init_cache_fill(C, str, pos + 1, ij);
134 | 			continue;
135 | 		}
136 | 
137 | 		if (ij.l <= (ssize_t)(pos + 1)) {
138 | 			// Continue one level deeper
139 | 			// This is the usual case
140 | 			esa_init_cache_dfs(C, str, pos + 1, ij);
141 | 			continue;
142 | 		}
143 | 
144 | 		// The LCP-interval is deeper than expected
145 | 		// Check if it still fits into the cache
146 | 		if ((size_t)ij.l >= CACHE_LENGTH) {
147 | 			// If the lcp-interval exceeds the cache depth, stop here and fill
148 | 			esa_init_cache_fill(C, str, pos + 1, in);
149 | 			continue;
150 | 		}
151 | 
152 | 		/* At this point the prefix `str` of length `pos` has been found.
153 | 		 * However, the call to `get_interval` above found an interval with
154 | 		 * an LCP value bigger than `pos`. This means that not all elongations
155 | 		 * (more precise: just one) of `str` appear in the subject. Thus fill
156 | 		 * all values with the matched result so far and continue only with
157 | 		 * the one special substring.
158 | 		 */
159 | 		esa_init_cache_fill(C, str, pos + 1, in);
160 | 
161 | 		char non_acgt = 0;
162 | 
163 | 		// fast forward
164 | 		size_t k = pos + 1;
165 | 		for (; k < (size_t)ij.l; k++) {
166 | 			// In some very edgy edge cases the lcp-interval `ij`
167 | 			// contains a `;` or another non-acgt character. Since we
168 | 			// cannot cache those, break.
169 | 			char c = C->S[C->SA[ij.i] + k];
170 | 			if (char2code(c) < 0) {
171 | 				non_acgt = 1;
172 | 				break;
173 | 			}
174 | 
175 | 			str[k] = c;
176 | 		}
177 | 
178 | 		// We are skipping intervals here. Maybe for each of them we should also
179 | 		// fill the cache. However, I haven't yet figured out how to do that
180 | 		// properly and whether it is worth it.
181 | 
182 | 		if (non_acgt) {
183 | 			esa_init_cache_fill(C, str, k, ij);
184 | 		} else {
185 | 			esa_init_cache_dfs(C, str, k, ij);
186 | 		}
187 | 	}
188 | }
189 | 
190 | /** @brief Stores one value for every cache entry below a prefix.
191 |  *
192 |  * All strings of length ::CACHE_LENGTH starting with the given prefix get
193 |  * the same LCP-interval assigned.
194 |  *
195 |  * @param C - The ESA.
196 |  * @param str - The current prefix.
197 |  * @param pos - The length of the prefix.
198 |  * @param in - The LCP-interval to store.
199 |  */
200 | void esa_init_cache_fill(esa_s *C, char *str, size_t pos, lcp_inter_t in) {
201 | 	if (pos >= CACHE_LENGTH) {
202 | 		// the prefix is complete: pack it, two bits per character
203 | 		ssize_t slot = 0;
204 | 		for (const char *ptr = str; ptr < str + CACHE_LENGTH; ptr++) {
205 | 			slot = (slot << 2) | char2code(*ptr);
206 | 		}
207 | 		C->cache[slot] = in;
208 | 		return;
209 | 	}
210 | 	// otherwise enumerate all four elongations of the prefix
211 | 	for (int code = 0; code < 4; ++code) {
212 | 		str[pos] = code2char(code);
213 | 		esa_init_cache_fill(C, str, pos + 1, in);
214 | 	}
215 | }
216 | 
217 | /**
218 |  * @brief Initializes the FVC (first variant character) array.
219 |  *
220 |  * The FVC is of my own invention and simply defined as
221 |  * `FVC[i] = S[SA[i]+LCP[i]]`. This expression is constantly used in
222 |  * get_interval. By precomputing the result, we have less memory
223 |  * accesses, less cache misses, and thus improved runtimes of up to 15%
224 |  * faster matching. This comes at a negligible cost of increased memory.
225 |  *
226 |  * @param self - The ESA
227 |  * @returns 0 iff successful
228 |  */
229 | int esa_init_FVC(esa_s *self) {
230 | 	size_t len = self->len;
231 | 
232 | 	char *FVC = self->FVC = malloc(len);
233 | 	CHECK_MALLOC(FVC);
234 | 
235 | 	const char *S = self->S;
236 | 	const saidx_t *SA = self->SA;
237 | 	const saidx_t *LCP = self->LCP;
238 | 
239 | 	// LCP[0] is -1, thus S[SA[0] + LCP[0]] may lie in front of S. Skip it.
240 | 	FVC[0] = '\0';
241 | 	for (size_t i = 1; i < len; i++) {
242 | 		FVC[i] = S[SA[i] + LCP[i]];
243 | 	}
244 | 	return 0;
245 | }
246 | 
247 | /** @brief Initializes an ESA.
248 |  *
249 |  * Builds all arrays of the ESA for the provided sequence. If one of the steps
250 |  * fails, everything allocated so far is released again and the ESA is reset,
251 |  * so calling esa_free() on it afterwards is harmless.
252 |  *
253 |  * @param C - The ESA to initialize.
254 |  * @param S - The sequence
255 |  * @returns 0 iff successful
256 |  */
257 | int esa_init(esa_s *C, const seq_subject *S) {
258 | 	if (!C || !S || !S->RS) {
259 | 		return 1;
260 | 	}
261 | 
262 | 	*C = (esa_s){.S = S->RS, .len = S->RSlen};
263 | 
264 | 	// Each array is derived from the previous ones, so the order matters.
265 | 	// The chain stops at the first step reporting a failure.
266 | 	int result = esa_init_SA(C);
267 | 	if (!result) result = esa_init_LCP(C);
268 | 	if (!result) result = esa_init_CLD(C);
269 | 	if (!result) result = esa_init_FVC(C);
270 | 	if (!result) result = esa_init_cache(C);
271 | 
272 | 	if (result) {
273 | 		// Do not leak the arrays of the steps that succeeded.
274 | 		esa_free(C);
275 | 	}
276 | 	return result;
277 | }
278 | 
279 | /** @brief Free the private data of an ESA. Passing NULL is a no-op. */
280 | void esa_free(esa_s *self) {
281 | 	if (!self) return;
282 | 	free(self->SA);
283 | 	free(self->LCP);
284 | 	free(self->CLD);
285 | 	free(self->cache);
286 | 	free(self->FVC);
287 | 	*self = (esa_s){0}; // reset, so that a second call is harmless
288 | }
288 | 
289 | /**
290 |  * Builds the suffix array of C->S by means of libdivsufsort.
291 |  * @param C The enhanced suffix array to use. Reads C->S, fills C->SA.
292 |  * @returns 0 iff successful
293 |  */
294 | int esa_init_SA(esa_s *C) {
295 | 	// without a string there is nothing to sort
296 | 	if (C == NULL || C->S == NULL) return 1;
297 | 
298 | 	saidx_t *suffixes = malloc(C->len * sizeof(*suffixes));
299 | 	CHECK_MALLOC(suffixes);
300 | 	C->SA = suffixes;
301 | 
302 | 	const unsigned char *text = (const unsigned char *)C->S;
303 | 	return divsufsort(text, suffixes, C->len);
304 | }
305 | 
306 | /** @brief Initializes the CLD (child) array from the LCP array.
307 |  *
308 |  * See Ohlebusch.
309 |  * @param C - The ESA. Reads C->LCP, fills C->CLD.
310 |  * @returns 0 iff successful
311 |  */
312 | int esa_init_CLD(esa_s *C) {
313 | 	if (!C || !C->LCP) {
314 | 		return 1;
315 | 	}
316 | 	saidx_t *CLD = C->CLD = malloc((C->len + 1) * sizeof(*CLD));
317 | 	CHECK_MALLOC(CLD);
318 | 
319 | 	const saidx_t *LCP = C->LCP;
320 | 
321 | 	typedef struct pair_s {
322 | 		saidx_t idx, lcp;
323 | 	} pair_t;
324 | 
325 | 	pair_t *stack = malloc((C->len + 1) * sizeof(*stack));
326 | 	CHECK_MALLOC(stack);
327 | 	pair_t *top = stack; // points at the topmost filled element
328 | 	pair_t last;
329 | 
330 | 	R(CLD, 0) = C->len;
331 | 
332 | 	top->idx = 0;
333 | 	top->lcp = -1;
334 | 
335 | 	// iterate over all elements, including the sentinel at index len
336 | 	for (size_t k = 1; k < (size_t)(C->len + 1); k++) {
337 | 		while (LCP[k] < top->lcp) {
338 | 			// top->lcp is a leaf
339 | 			last = *top--;
340 | 
341 | 			// link all elements of same lcp value in a chain
342 | 			while (top->lcp == last.lcp) {
343 | 				R(CLD, top->idx) = last.idx;
344 | 				last = *top--;
345 | 			}
346 | 
347 | 			// store the l-index of last
348 | 			if (LCP[k] < top->lcp) {
349 | 				R(CLD, top->idx) = last.idx;
350 | 			} else {
351 | 				L(CLD, k) = last.idx;
352 | 			}
353 | 		}
354 | 
355 | 		// continue one level deeper
356 | 		top++;
357 | 		top->idx = k;
358 | 		top->lcp = LCP[k];
359 | 	}
360 | 
361 | 	free(stack);
362 | 	return 0;
363 | }
364 | 
365 | /**
366 |  * This function computes the LCP array, given the suffix array. Thereto it uses
367 |  * a special `phi` array, which makes it slightly faster than the original
368 |  * linear-time algorithm by Kasai et al.
369 |  *
370 |  * @param C The enhanced suffix array to compute the LCP from.
371 |  * @returns 0 iff successful
372 |  */
373 | int esa_init_LCP(esa_s *C) {
374 | 	// Trivial safety checks. C has to be tested before it is dereferenced.
375 | 	if (!C || !C->S || !C->SA || C->len == 0) {
376 | 		return 1;
377 | 	}
378 | 
379 | 	const char *S = C->S;
380 | 	const saidx_t *SA = C->SA;
381 | 	saidx_t len = C->len;
382 | 
383 | 	// Allocate new memory
384 | 	// The LCP array is one element longer than S.
385 | 	saidx_t *LCP = C->LCP = malloc((len + 1) * sizeof(*LCP));
386 | 	CHECK_MALLOC(LCP);
387 | 
388 | 	LCP[0] = -1;
389 | 	LCP[len] = -1;
390 | 
391 | 	// Allocate temporary arrays. PLCP reuses the memory of PHI.
392 | 	saidx_t *PHI = malloc(len * sizeof(*PHI));
393 | 	saidx_t *PLCP = PHI;
394 | 	CHECK_MALLOC(PHI);
395 | 
396 | 	PHI[SA[0]] = -1;
397 | 	saidx_t k;
398 | 	ssize_t i;
399 | 
400 | 	for (i = 1; i < len; i++) {
401 | 		PHI[SA[i]] = SA[i - 1];
402 | 	}
403 | 
404 | 	ssize_t l = 0;
405 | 	for (i = 0; i < len; i++) {
406 | 		k = PHI[i];
407 | 		if (k != -1) {
408 | 			while (S[k + l] == S[i + l]) {
409 | 				l++;
410 | 			}
411 | 			PLCP[i] = l;
412 | 			l--;
413 | 			if (l < 0) l = 0;
414 | 		} else {
415 | 			PLCP[i] = -1;
416 | 		}
417 | 	}
418 | 
419 | 	// unpermutate the LCP array
420 | 	for (i = 1; i < len; i++) {
421 | 		LCP[i] = PLCP[SA[i]];
422 | 	}
423 | 
424 | 	free(PHI);
425 | 	return 0;
426 | }
427 | 
428 | /** @brief For the lcp-interval of string `w` compute the interval for `wa`
429 |  *
430 |  * Say, we already know the LCP-interval ij for a string `w`. Now we want to
431 |  * check if `wa` may also be found in the ESA and thus in the subject. So we
432 |  * look for the sub interval of `ij` in which all strings feature an `a` as
433 |  * the next character. If such a sub interval is found, its boundaries are
434 |  * returned. Otherwise both boundaries of the result are set to -1.
435 |  *
436 |  * @param self - The ESA.
437 |  * @param ij - The lcp-interval for `w`.
438 |  * @param a - The next character.
439 |  * @returns The lcp-interval one level deeper.
440 |  */
441 | static lcp_inter_t get_interval(const esa_s *self, lcp_inter_t ij, char a) {
442 | 	saidx_t i = ij.i;
443 | 	saidx_t j = ij.j;
444 | 
445 | 	const saidx_t *SA = self->SA;
446 | 	const saidx_t *LCP = self->LCP;
447 | 	const char *S = self->S;
448 | 	const saidx_t *CLD = self->CLD;
449 | 	const char *FVC = self->FVC;
450 | 	// singleton; NOTE(review): an empty interval (-1) would read SA[-1] here
451 | 	if (i == j) {
452 | 		if (S[SA[i] + ij.l] != a) {
453 | 			ij.i = ij.j = -1;
454 | 		}
455 | 		return ij;
456 | 	}
457 | 
458 | 	int m = ij.m;
459 | 	int l = ij.l;
460 | 
461 | 	char c = S[SA[i] + l];
462 | 	goto SoSueMe;
463 | 
464 | 	do {
465 | 		c = FVC[i];
466 | 
467 | 	SoSueMe:
468 | 		if (c == a) {
469 | 			/* found: the child interval starts at i and ends at m - 1 */
470 | 
471 | 			if (i != m - 1) {
472 | 				// found interval contains >1 element
473 | 				saidx_t n = L(CLD, m);
474 | 
475 | 				ij = (lcp_inter_t){.i = i, .j = m - 1, .m = n, .l = LCP[n]};
476 | 			} else {
477 | 				// empty or singleton
478 | 				// doing L(CLD, m) is not valid in this case!
479 | 				ij = (lcp_inter_t){.i = i, .j = i, .m = -1, .l = LCP[i]};
480 | 			}
481 | 
482 | 			return ij;
483 | 		}
484 | 
485 | 		if (c > a) {
486 | 			break;
487 | 		}
488 | 
489 | 		i = m;
490 | 
491 | 		if (i == j) {
492 | 			break; // singleton interval, or `a` not found
493 | 		}
494 | 
495 | 		m = R(CLD, m);
496 | 	} while (/*m != "bottom" && */ LCP[m] == l);
497 | 
498 | 	// final sanity check: the last child has not been compared to `a` yet
499 | 	if (i != ij.i ? FVC[i] == a : S[SA[i] + l] == a) {
500 | 		ij.i = i;
501 | 		ij.j = j;
502 | 		/* Also return the length of the LCP interval including `a` and
503 | 		 * possibly even more characters. Note: l + 1 <= LCP[m] */
504 | 		ij.l = LCP[m];
505 | 		ij.m = m;
506 | 	} else {
507 | 		ij.i = ij.j = -1;
508 | 	}
509 | 
510 | 	return ij;
511 | }
512 | 
513 | /** @brief Compute the longest match of a query with the subject.
514 |  *
515 |  * The *longest match* is the core concept of `andi`. It is simply defined as
516 |  * the longest prefix of a query Q appearing anywhere in the subject S. Talking
517 |  * about genetic sequences, a match is a homologous region, likely followed by a
518 |  * SNP.
519 |  *
520 |  * This function returns the interval for where the longest match of the query
521 |  * can be found in the ESA. Thereto it expects a starting interval for the
522 |  * search.
523 |  *
524 |  * @param C - The enhanced suffix array for the subject.
525 |  * @param query - The query sequence.
526 |  * @param qlen - The length of the query. Should correspond to `strlen(query)`.
527 |  * @param k - The starting index into the query.
528 |  * @param ij - The LCP interval for the string `query[0..k]`.
529 |  * @returns The LCP interval for the longest prefix.
530 |  */
531 | lcp_inter_t get_match_from(const esa_s *C, const char *query, size_t qlen,
532 | 						   saidx_t k, lcp_inter_t ij) {
533 | 
534 | 	if (ij.i == -1 && ij.j == -1) {
535 | 		return ij;
536 | 	}
537 | 
538 | 	// fail early on singleton intervals.
539 | 	if (ij.i == ij.j) {
540 | 
541 | 		// try to extend the match along the single remaining suffix.
542 | 		saidx_t p = C->SA[ij.i];
543 | 		size_t k = ij.l; // shadows the parameter k
544 | 		const char *S = (const char *)C->S;
545 | 
546 | 		for (; k < qlen && S[p + k]; k++) {
547 | 			if (S[p + k] != query[k]) {
548 | 				ij.l = k;
549 | 				return ij;
550 | 			}
551 | 		}
552 | 
553 | 		ij.l = k;
554 | 		return ij;
555 | 	}
556 | 
557 | 	saidx_t l, i, j;
558 | 
559 | 	lcp_inter_t res = ij;
560 | 
561 | 	const saidx_t *SA = C->SA;
562 | 	const char *S = C->S;
563 | 
564 | 	// Loop over the query until a mismatch is found
565 | 	do {
566 | 		// Get the subinterval for the next character.
567 | 		ij = get_interval(C, ij, query[k]);
568 | 		i = ij.i;
569 | 		j = ij.j;
570 | 
571 | 		// If our match cannot be extended further, return.
572 | 		if (i == -1 && j == -1) {
573 | 			res.l = k;
574 | 			return res;
575 | 		}
576 | 
577 | 		res.i = ij.i;
578 | 		res.j = ij.j;
579 | 
580 | 		l = qlen;
581 | 		if (i < j && ij.l < l) {
582 | 			/* Instead of making another look up we can use the LCP interval
583 | 			 * calculated in get_interval */
584 | 			l = ij.l;
585 | 		}
586 | 
587 | 		// By definition, the kth letter of the query was matched.
588 | 		k++;
589 | 
590 | 		// Extend the match
591 | 		for (int p = SA[i]; k < l; k++) {
592 | 			if (S[p + k] != query[k]) {
593 | 				res.l = k;
594 | 				return res;
595 | 			}
596 | 		}
597 | 	} while (k < (ssize_t)qlen);
598 | 
599 | 	res.l = qlen;
600 | 	return res;
601 | }
602 | 
603 | /** @brief Get a match.
604 |  *
605 |  * Finds the longest prefix of the query that occurs somewhere in the subject
606 |  * indexed by C. The search starts at the root of the ESA and does not use
607 |  * the cache, and thus is slow.
608 |  *
609 |  * @param C - The ESA.
610 |  * @param query - The query string.
611 |  * @param qlen - The length of the query.
612 |  * @returns the lcp interval of the match.
613 |  */
614 | lcp_inter_t get_match(const esa_s *C, const char *query, size_t qlen) {
615 | 	int usable = C && query && C->len && C->SA && C->LCP && C->S && C->CLD;
616 | 	if (!usable) {
617 | 		lcp_inter_t error = {.l = -1, .i = -1, .j = -1, .m = -1};
618 | 		return error;
619 | 	}
620 | 	// the root interval spans the whole suffix array
621 | 	saidx_t mid = L(C->CLD, C->len);
622 | 	lcp_inter_t root = {.i = 0, .j = C->len - 1, .m = mid, .l = C->LCP[mid]};
623 | 	return get_match_from(C, query, qlen, 0, root);
624 | }
625 | 
626 | /** @brief Compute the LCP interval of a query, using the cache.
627 |  *
628 |  * The interval of the first ::CACHE_LENGTH characters of the query is looked
629 |  * up in the cache and the search continues from there. Hence this is faster
630 |  * than the naive `get_match`. If the cache cannot provide a proper value, we
631 |  * fall back to the standard search.
632 |  *
633 |  * @param C - The enhanced suffix array for the subject.
634 |  * @param query - The query sequence.
635 |  * @param qlen - The length of the query. Should correspond to `strlen(query)`.
636 |  * @returns The LCP interval for the longest prefix.
637 |  */
638 | lcp_inter_t get_match_cached(const esa_s *C, const char *query, size_t qlen) {
639 | 	if (qlen <= CACHE_LENGTH) return get_match(C, query, qlen);
640 | 
641 | 	// pack the first characters into a cache index, two bits each
642 | 	ssize_t slot = 0;
643 | 	size_t pos = 0;
644 | 	while (pos < CACHE_LENGTH && slot >= 0) {
645 | 		slot = (slot << 2) | char2code(query[pos]);
646 | 		pos++;
647 | 	}
648 | 
649 | 	// a non-ACGT character makes the index negative
650 | 	if (slot < 0) return get_match(C, query, qlen);
651 | 
652 | 	lcp_inter_t cached = C->cache[slot];
653 | 	int is_empty = cached.i == -1 && cached.j == -1;
654 | 	if (is_empty) return get_match(C, query, qlen);
655 | 	return get_match_from(C, query, qlen, cached.l, cached);
656 | }
657 | 


--------------------------------------------------------------------------------
/src/esa.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @file
 3 |  * @brief This header contains the declarations for functions in esa.c.
 4 |  *
 5 |  */
 6 | #ifndef _ESA_H_
 7 | #define _ESA_H_
 8 | 
 9 | #include "config.h"
10 | #include "sequence.h"
11 | #include <divsufsort.h>
12 | #include <sys/types.h>
13 | 
14 | /**
15 |  * @brief Represents LCP-Intervals.
16 |  *
17 |  * This struct is used to represent LCP-intervals. The member `i` should
18 |  * coincide with the lower bound whereas `j` is the upper bound. Both bounds
19 |  * are inclusive. So if `i == j` the interval contains exactly one element,
20 |  * namely `i`. To represent an empty interval please use `i == j == -1`.
21 |  * Other variants, such as `i == j == -2` can be used to indicate an error.
22 |  * The common prefix length is denoted by l and should always be non-negative.
23 |  * Variables of this type are often called `ij`.
24 |  */
25 | typedef struct {
26 | 	/** @brief The common prefix length */
27 | 	saidx_t l;
28 | 	/** @brief lower bound */
29 | 	saidx_t i;
30 | 	/** @brief upper bound */
31 | 	saidx_t j;
32 | 	/** @brief Split index read by get_interval; set to -1 for singletons. */
33 | 	saidx_t m;
34 | } lcp_inter_t;
35 | 
36 | /**
37 |  * @brief The ESA type.
38 |  *
39 |  * This structure holds arrays and objects associated with an enhanced
40 |  * suffix array (ESA).
41 |  */
42 | typedef struct esa_s {
43 | 	/** The base string from which the ESA was generated. Not owned. */
44 | 	const char *S;
45 | 	/** The actual suffix array with indexes into S. */
46 | 	saidx_t *SA;
47 | 	/** The LCP holds the number of letters up to which a suffix `S[SA[i]]`
48 | 		equals `S[SA[i-1]]`. Hence the name longest common prefix. For `i = 0`
49 | 		and `i = len` the LCP value is -1. */
50 | 	saidx_t *LCP;
51 | 	/** The length of the string S. */
52 | 	saidx_t len;
53 | 	/** A cache of lcp-intervals, indexed by a two bit encoded prefix. */
54 | 	lcp_inter_t *cache;
55 | 	/** The FVC array holds the character after the LCP. */
56 | 	char *FVC;
57 | 	/** This is the child array. It holds `len + 1` entries. */
58 | 	saidx_t *CLD;
59 | } esa_s;
60 | 
61 | lcp_inter_t get_match_cached(const esa_s *, const char *query, size_t qlen);
62 | lcp_inter_t get_match(const esa_s *, const char *query, size_t qlen);
63 | int esa_init(esa_s *, const seq_subject *S);
64 | void esa_free(esa_s *);
65 | 
66 | #ifdef DEBUG
67 | 
68 | char code2char(ssize_t code);
69 | 
70 | #endif // DEBUG
71 | 
72 | #endif
73 | 


--------------------------------------------------------------------------------
/src/global.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @file
  3 |  * @brief Global Definitions
  4 |  *
  5 |  * This file contains the declaration of global variables and
  6 |  * their related values. The actual definition is located in andi.c
  7 |  */
  8 | #ifndef _GLOBAL_H_
  9 | #define _GLOBAL_H_
 10 | #include <gsl/gsl_rng.h>
 11 | 
 12 | #include "config.h"
 13 | #include <err.h>
 14 | 
 15 | /**
 16 |  * The *global* variable ::FLAGS is used to set different options
 17 |  * for the execution of the program. Use `FLAGS & F_NAME` to check
 18 |  * if `F_NAME` was set.
 19 |  */
 20 | extern int FLAGS;
 21 | 
 22 | /**
 23 |  * The *global* variable ::THREADS contains the number of threads the program
 24 |  * should use.
 25 |  */
 26 | extern int THREADS;
 27 | 
 28 | /**
 29 |  * The ::ANCHOR_P_VALUE represents the probability that an anchor will be found,
 30 |  * if the two sequences are unrelated. I.e. it is the p-value for H_0: random
 31 |  * sequences. Its value can be set using the `-p` switch.
 32 |  */
 33 | extern double ANCHOR_P_VALUE;
 34 | 
 35 | /**
 36 |  * The number of matrices that should be bootstrapped.
 37 |  */
 38 | extern long unsigned int BOOTSTRAP;
 39 | 
 40 | /**
 41 |  * A global random number generator. Has to be seedable.
 42 |  */
 43 | extern gsl_rng *RNG;
 44 | 
 45 | /**
 46 |  * The evolutionary model.
 47 |  */
 48 | extern int MODEL;
 49 | 
 50 | enum { M_RAW, M_JC, M_KIMURA, M_LOGDET };
 51 | 
 52 | /**
 53 |  * The flags that can be stored in ::FLAGS. Every flag occupies a bit
 54 |  * of its own, so that several of them can be combined.
 55 |  */
 56 | enum {
 57 | 	F_NONE = 0,
 58 | 	F_TRUNCATE_NAMES = 1 << 0,
 59 | 	F_VERBOSE = 1 << 1,
 60 | 	F_EXTRA_VERBOSE = 1 << 2,
 61 | 	F_NON_ACGT = 1 << 3,
 62 | 	F_JOIN = 1 << 4,
 63 | 	F_LOW_MEMORY = 1 << 5,
 64 | 	F_SHORT = 1 << 6,
 65 | 	F_PRINT_PROGRESS = 1 << 7,
 66 | 	F_SOFT_ERROR = 1 << 8
 67 | };
 68 | 
 69 | /**
 70 |  * @brief This macro is used to unify the checks for the return value of malloc.
 71 |  * The program is terminated via err() if the pointer is NULL.
 72 |  * @param PTR - The pointer getting checked; may be an arbitrary expression.
 73 |  */
 74 | #define CHECK_MALLOC(PTR)                                                      \
 75 | 	do {                                                                       \
 76 | 		if ((PTR) == NULL) {                                                   \
 77 | 			err(errno, "Out of memory");                                       \
 78 | 		}                                                                      \
 79 | 	} while (0)
 80 | 
 81 | /**
 82 |  * @brief This macro sets ::F_SOFT_ERROR in ::FLAGS and prints a warning via
 83 |  * warn(). Presumably the flag makes the program exit with a failure later.
 84 |  */
 85 | #define soft_err(...)                                                          \
 86 | 	do {                                                                       \
 87 | 		FLAGS |= F_SOFT_ERROR;                                                 \
 88 | 		warn(__VA_ARGS__);                                                     \
 89 | 	} while (0)
 90 | 
 91 | /**
 92 |  * @brief This macro sets ::F_SOFT_ERROR in ::FLAGS and prints a warning via
 93 |  * warnx(); i.e. it is the counterpart of soft_err that uses warnx().
 94 |  */
 95 | #define soft_errx(...)                                                         \
 96 | 	do {                                                                       \
 97 | 		FLAGS |= F_SOFT_ERROR;                                                 \
 98 | 		warnx(__VA_ARGS__);                                                    \
 99 | 	} while (0)
100 | 
101 | #endif
102 | 


--------------------------------------------------------------------------------
/src/io.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @file
  3 |  * @brief This file contains the definitions for various io methods.
  4 |  */
  5 | #define _GNU_SOURCE
  6 | #include <fcntl.h>
  7 | #include <limits.h>
  8 | #include <math.h>
  9 | #include <stdio.h>
 10 | #include <string.h>
 11 | #include <unistd.h>
 12 | 
 13 | #include <compat-stdlib.h>
 14 | #include <compat-string.h>
 15 | #include <pfasta.h>
 16 | 
 17 | #include "global.h"
 18 | #include "io.h"
 19 | 
 20 | /**
 21 |  * @brief Look up a single string.
 22 |  * @param sv - The base vector.
 23 |  * @param index - The position of the wanted element; it is not range checked.
 24 |  * @returns the string at position `index`.
 25 |  */
 26 | char *string_vector_at(struct string_vector *sv, size_t index) {
 27 | 	return *(sv->data + index);
 28 | }
 29 | 
 30 | /**
 31 |  * @brief Expose the raw array of strings.
 32 |  * @param sv - The base vector.
 33 |  * @returns the underlying buffer; no copy is made.
 34 |  */
 35 | char **string_vector_data(struct string_vector *sv) {
 36 | 	return (*sv).data;
 37 | }
 38 | 
 39 | /**
 40 |  * @brief Release every stored string and the buffer holding them.
 41 |  * @param sv - The base vector.
 42 |  */
 43 | void string_vector_free(struct string_vector *sv) {
 44 | 	size_t pos = 0;
 45 | 	while (pos < sv->size) {
 46 | 		free(sv->data[pos]);
 47 | 		pos++;
 48 | 	}
 49 | 	free(sv->data);
 50 | }
 50 | 
 51 | /**
 52 |  * @brief Set up an empty vector with room for a few strings.
 53 |  * @param sv - The base vector.
 54 |  */
 55 | void string_vector_init(struct string_vector *sv) {
 56 | 	const size_t initial_capacity = 4;
 57 | 	char **buffer = malloc(sizeof(*buffer) * initial_capacity);
 58 | 	CHECK_MALLOC(buffer);
 59 | 	sv->data = buffer;
 60 | 	sv->size = 0;
 61 | 	sv->capacity = initial_capacity;
 62 | }
 62 | 
 63 | /**
 64 |  * @brief Adds a copy to the end of the vector.
 65 |  * @param sv - The base vector.
 66 |  * @param str - The new string to add.
 67 |  */
 68 | void string_vector_push_back(struct string_vector *sv, const char *str) {
 69 | 	char *copy = strdup(str);
 70 | 	CHECK_MALLOC(copy); // strdup returns NULL if it runs out of memory
 71 | 	string_vector_emplace_back(sv, copy);
 72 | }
 71 | 
 72 | /**
 73 |  * @brief Append a string to the vector without copying it.
 74 |  * @param sv - The base vector.
 75 |  * @param str - The string to emplace.
 76 |  */
 77 | void string_vector_emplace_back(struct string_vector *sv, char *str) {
 78 | 	if (sv->size >= sv->capacity) {
 79 | 		// out of room: grow the buffer to one and a half times its size
 80 | 		size_t half = sv->capacity / 2;
 81 | 		char **grown = reallocarray(sv->data, half, 3 * sizeof(*grown));
 82 | 		CHECK_MALLOC(grown);
 83 | 		sv->data = grown;
 84 | 		sv->capacity = half * 3;
 85 | 	}
 86 | 	sv->data[sv->size++] = str;
 87 | }
 88 | 
 89 | /**
 90 |  * @brief Tell how many strings are stored.
 91 |  * @param sv - The base vector.
 92 |  * @returns the size of the vector.
 93 |  */
 94 | size_t string_vector_size(const struct string_vector *sv) {
 95 | 	return (*sv).size;
 96 | }
 97 | 
 98 | /**
 99 |  * @brief Read a *fof* and add its contents into a vector.
100 |  *
101 |  * Every non-empty line is added as one entry. Failures to open, read or
102 |  * close the file are reported via soft_err.
103 |  * @param file_name - The file of file names aka. fof. "-" reads from stdin.
104 |  * @param sv - The vector to add file names to.
105 |  */
106 | void read_into_string_vector(const char *file_name, struct string_vector *sv) {
107 | 	FILE *file = strcmp(file_name, "-") ? fopen(file_name, "r") : stdin;
108 | 	if (!file) {
109 | 		soft_err("%s", file_name);
110 | 		return;
111 | 	}
112 | 
113 | 	while (1) {
114 | 		char *str = NULL;
115 | 		size_t buffer_size = 0;
116 | 		ssize_t check = getline(&str, &buffer_size, file);
117 | 
118 | 		if (check == -1) {
119 | 			// EOF is set only *after* getline tried to read past it. Without
120 | 			// EOF, the -1 signals a read error.
121 | 			if (feof(file) == 0) {
122 | 				soft_err("%s", file_name);
123 | 			}
124 | 
125 | 			// getline may have allocated a buffer even though it failed
126 | 			free(str);
127 | 			break;
128 | 		}
129 | 
130 | 		char *nl = strchr(str, '\n');
131 | 		if (nl) {
132 | 			*nl = '\0'; // remove newline character
133 | 		}
134 | 
135 | 		// ignore empty lines
136 | 		if (strlen(str) == 0) {
137 | 			free(str);
138 | 			continue;
139 | 		}
140 | 
141 | 		// the line buffer is handed over to the vector, so it is not freed
142 | 		string_vector_emplace_back(sv, str);
143 | 	}
144 | 
145 | 	int check = fclose(file);
146 | 	if (check != 0) {
147 | 		soft_err("%s", file_name);
148 | 	}
149 | }
145 | 
146 | /**
147 |  * @brief Joins all sequences from a file into a single long sequence.
148 |  *
149 |  * Apart from reading all sequences from a file, this function also
150 |  * merges them into one long sequence.
151 |  *
152 |  * "I didn't learn joined up handwriting for nothing, you know."
153 |  * ~ Gilderoy Lockhart
154 |  *
155 |  * @param file_name - The name of the file to be used for reading. The name is
156 |  *  also used to infer the sequence name.
157 |  * @param dsa - (output parameter) An array that holds found sequences.
158 |  */
159 | void read_fasta_join(const char *file_name, dsa_t *dsa) {
160 | 	if (!file_name || !dsa) return;
161 | 
162 | 	dsa_t single;
163 | 	dsa_init(&single);
164 | 	read_fasta(file_name, &single);
165 | 
166 | 	if (dsa_size(&single) == 0) {
167 | 		return;
168 | 	}
169 | 
170 | 	seq_t joined = dsa_join(&single);
171 | 
172 | 	/* In join mode we try to be clever about the sequence name. Given the file
173 | 	 * path we extract just the file name. ie. path/file.ext -> file
174 | 	 * This obviously fails on Windows.
175 | 	 */
176 | 
177 | 	const char *left = strrchr(file_name, '/'); // find the last path separator
178 | 	left = (left == NULL) ? file_name : left + 1;
179 | 	// left is the position one of to the right of the path separator
180 | 
181 | 	const char *dot = strchrnul(left, '.'); // find the extension
182 | 
183 | 	// copy only the file name, not its path or extension
184 | 	joined.name = strndup(left, dot - left);
185 | 	CHECK_MALLOC(joined.name);
186 | 
187 | 	dsa_push(dsa, joined);
188 | 	dsa_free(&single);
189 | }
190 | 
191 | /**
192 |  * @brief This function reads sequences from a file.
193 |  * @param file_name - The file to read.
194 |  * @param dsa - (output parameter) An array that holds found sequences.
195 |  */
196 | void read_fasta(const char *file_name, dsa_t *dsa) {
197 | 	if (!file_name || !dsa) return;
198 | 
199 | 	int file_descriptor =
200 | 		strcmp(file_name, "-") ? open(file_name, O_RDONLY) : STDIN_FILENO;
201 | 
202 | 	if (file_descriptor < 0) {
203 | 		soft_err("%s", file_name);
204 | 		return;
205 | 	}
206 | 
207 | 	struct pfasta_parser pp = pfasta_init(file_descriptor);
208 | 	if (pp.errstr) {
209 | 		soft_errx("%s: %s", file_name, pp.errstr);
210 | 		goto fail;
211 | 	}
212 | 
213 | 	seq_t top = {};
214 | 	while (!pp.done) {
215 | 		struct pfasta_record pr = pfasta_read(&pp);
216 | 		if (pp.errstr) {
217 | 			soft_errx("%s: %s", file_name, pp.errstr);
218 | 			goto fail;
219 | 		}
220 | 
221 | 		int check = seq_init(&top, pr.sequence, pr.name);
222 | 
223 | 		// skip broken sequences
224 | 		if (check != 0) continue;
225 | 
226 | 		dsa_push(dsa, top);
227 | 		pfasta_record_free(&pr);
228 | 	}
229 | 
230 | fail:
231 | 	pfasta_free(&pp);
232 | 	close(file_descriptor);
233 | }
234 | 
235 | /**
236 |  * @brief Prints the distance matrix.
237 |  *
238 |  * This function pretty prints the distance matrix. For small distances
239 |  * scientific notation is used.
240 |  *
241 |  * @param D - The distance matrix
242 |  * @param sequences - An array of pointers to the sequences.
243 |  * @param n - The number of sequences.
244 |  * @param warnings - Print warnings? Set to 0 for bootstrapped matrices.
245 |  */
246 | void print_distances(const struct model *D, const seq_t *sequences, size_t n,
247 | 					 int warnings) {
248 | 	size_t i, j;
249 | 	int use_scientific = 0;
250 | 
251 | 	double *DD = malloc(n * n * sizeof(*DD));
252 | 	CHECK_MALLOC(DD);
253 | 
254 | #define DD(X, Y) (DD[(X)*n + (Y)])
255 | 
256 | 	typedef double(estimate_fn)(const model *);
257 | 	estimate_fn *estimate;
258 | 
259 | 	switch (MODEL) {
260 | 		case M_RAW: estimate = &estimate_RAW; break;
261 | 		default:
262 | 		/* intentional fall-through. This is just here to silence any
263 | 		 * compiler warnings. The real default is set in andi.c.*/
264 | 		case M_JC: estimate = &estimate_JC; break;
265 | 		case M_KIMURA: estimate = &estimate_KIMURA; break;
266 | 		case M_LOGDET: estimate = &estimate_LOGDET; break;
267 | 	}
268 | 
269 | 	for (i = 0; i < n; i++) {
270 | 		for (j = 0; j < n; j++) {
271 | 			model datum = D(i, j);
272 | 
273 | 			if (!(FLAGS & F_EXTRA_VERBOSE)) {
274 | 				datum = model_average(&D(i, j), &D(j, i));
275 | 			}
276 | 
277 | 			double dist = DD(i, j) = i == j ? 0.0 : estimate(&datum);
278 | 
279 | 			if (dist > 0 && dist < 0.001) {
280 | 				use_scientific = 1;
281 | 			}
282 | 
283 | 			if (isnan(dist) && warnings) {
284 | 				const char str[] = {
285 | 					"For the two sequences '%s' and '%s' the distance "
286 | 					"computation failed and is reported as nan. "
287 | 					"Please refer to the documentation for further details."};
288 | 				soft_errx(str, sequences[i].name, sequences[j].name);
289 | 			}
290 | 
291 | 			if (!isnan(dist) && i < j && warnings) {
292 | 				double coverage1 = model_coverage(&D(i, j));
293 | 				double coverage2 = model_coverage(&D(j, i));
294 | 
295 | 				if (coverage1 < 0.2 || coverage2 < 0.2) {
296 | 					const char str[] = {
297 | 						"For the two sequences '%s' and '%s' very little "
298 | 						"homology was found (%f and %f, respectively)."};
299 | 					soft_errx(str, sequences[i].name, sequences[j].name,
300 | 							  coverage1, coverage2);
301 | 				}
302 | 			}
303 | 		}
304 | 	}
305 | 
306 | 	printf("%zu\n", n);
307 | 	for (i = 0; i < n; i++) {
308 | 		// Print ten characters of the name. Pad with spaces, if
309 | 		// necessary. Truncate to exactly ten characters if requested by user.
310 | 		printf(FLAGS & F_TRUNCATE_NAMES ? "%-10.10s" : "%-10s",
311 | 			   sequences[i].name);
312 | 
313 | 		for (j = 0; j < n; j++) {
314 | 			// use scientific notation for small numbers
315 | 			printf(use_scientific ? " %1.4e" : " %1.4f", DD(i, j));
316 | 		}
317 | 		printf("\n");
318 | 	}
319 | 
320 | 	free(DD);
321 | }
322 | 
323 | /**
324 |  * @brief Prints the coverage matrix.
325 |  * @param D - The distance matrix
326 |  * @param n - The number of sequences.
327 |  */
328 | void print_coverages(const struct model *D, size_t n) {
329 | 	size_t i, j;
330 | 	printf("\nCoverage:\n");
331 | 	for (i = 0; i < n; i++) {
332 | 		for (j = 0; j < n; j++) {
333 | 			printf("%1.4e ", model_coverage(&D(i, j)));
334 | 		}
335 | 		printf("\n");
336 | 	}
337 | }
338 | 


--------------------------------------------------------------------------------
/src/io.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @file
 3 |  * @brief This header contains function declarations for io procedures.
 4 |  */
 5 | #ifndef _IO_H_
 6 | #define _IO_H_
 7 | 
 8 | #include "model.h"
 9 | #include "sequence.h"
10 | #include <err.h>
11 | #include <errno.h>
12 | #include <stdio.h>
13 | 
/**
 * This is a neat hack for dealing with matrices.
 *
 * Both macros index a flat array as a two-dimensional, row-major matrix:
 * element (X, Y) lives at offset X * n + Y. Note that the expansion refers to
 * the identifiers `D` (or `M`) and `n` of the scope in which the macro is
 * used. Thus, they only work where variables with exactly these names are
 * visible.
 */
#define D(X, Y) (D[(X)*n + (Y)])
#define M(X, Y) (M[(X)*n + (Y)])
19 | 
20 | void read_fasta(const char *, dsa_t *dsa);
21 | void read_fasta_join(const char *, dsa_t *dsa);
22 | 
23 | void print_distances(const struct model *, const seq_t *, size_t, int);
24 | void print_coverages(const struct model *, size_t);
25 | 
/**
 * @brief A dynamically growing structure for file_names.
 *
 * The strings are stored as plain pointers. string_vector_emplace_back()
 * enlarges `data` once `size` has reached `capacity`.
 */
struct string_vector {
	/** The array of stored strings. The first `size` entries are in use. */
	char **data;
	/** `capacity` is the number of slots available in `data`; `size` is the
	 * number of slots currently filled. */
	size_t capacity, size;
};
33 | 
34 | char *string_vector_at(struct string_vector *, size_t);
35 | char **string_vector_data(struct string_vector *);
36 | void string_vector_free(struct string_vector *);
37 | void string_vector_init(struct string_vector *);
38 | void string_vector_push_back(struct string_vector *, const char *);
39 | void string_vector_emplace_back(struct string_vector *, char *);
40 | size_t string_vector_size(const struct string_vector *);
41 | 
42 | void read_into_string_vector(const char *, struct string_vector *);
43 | 
44 | #endif // _IO_H_
45 | 


--------------------------------------------------------------------------------
/src/model.c:
--------------------------------------------------------------------------------
  1 | /** @file
  2 |  * @brief This file contains all functions for the mutation matrix and the
  3 |  * estimation of evolutionary distances thereof.
  4 |  */
  5 | 
  6 | #include "model.h"
  7 | #include "global.h"
  8 | #include <gsl/gsl_randist.h>
  9 | #include <inttypes.h>
 10 | #include <math.h>
 11 | #include <stdio.h>
 12 | 
 13 | /**
 14 |  * @brief Sum some mutation count specified by `summands`. Intended to be used
 15 |  * through the `model_sum` macro.
 16 |  *
 17 |  * @param MM - The mutation matrix.
 18 |  * @param summands - The mutations to add.
 19 |  * @returns The sum of mutations.
 20 |  */
 21 | static size_t model_sum_types(const model *MM, const int summands[]) {
 22 | 	size_t total = 0;
 23 | 	for (int i = 0; summands[i] != MUTCOUNTS; ++i) {
 24 | 		total += MM->counts[summands[i]];
 25 | 	}
 26 | 	return total;
 27 | }
 28 | 
 29 | #define model_sum(MM, ...)                                                     \
 30 | 	model_sum_types((MM), (int[]){__VA_ARGS__, MUTCOUNTS})
 31 | 
 32 | /**
 33 |  * @brief Average two mutation matrices.
 34 |  *
 35 |  * @param MM - One matrix
 36 |  * @param NN - Second matrix
 37 |  * @returns The average (sum) of two mutation matrices.
 38 |  */
 39 | model model_average(const model *MM, const model *NN) {
 40 | 	model ret = *MM;
 41 | 	for (int i = 0; i != MUTCOUNTS; ++i) {
 42 | 		ret.counts[i] += NN->counts[i];
 43 | 	}
 44 | 	ret.seq_len += NN->seq_len;
 45 | 	return ret;
 46 | }
 47 | 
 48 | /**
 49 |  * @brief Compute the total number of nucleotides in the pairwise alignment.
 50 |  *
 51 |  * @param MM - The mutation matrix.
 52 |  * @returns The length of the alignment.
 53 |  */
 54 | size_t model_total(const model *MM) {
 55 | 	size_t total = 0;
 56 | 	for (size_t i = 0; i < MUTCOUNTS; ++i) {
 57 | 		total += MM->counts[i];
 58 | 	}
 59 | 	return total;
 60 | }
 61 | 
 62 | /**
 63 |  * @brief Compute the coverage of an alignment.
 64 |  *
 65 |  * @param MM - The mutation matrix.
 66 |  * @returns The relative coverage
 67 |  */
 68 | double model_coverage(const model *MM) {
 69 | 	size_t covered = model_total(MM);
 70 | 	size_t actual = MM->seq_len;
 71 | 
 72 | 	return (double)covered / (double)actual;
 73 | }
 74 | 
 75 | /**
 76 |  * @brief Estimate the uncorrected distance of a pairwise alignment.
 77 |  *
 78 |  * @param MM - The mutation matrix.
 79 |  * @returns The uncorrected substitution rate.
 80 |  */
 81 | double estimate_RAW(const model *MM) {
 82 | 	size_t nucl = model_total(MM);
 83 | 	size_t SNPs = model_sum(MM, AtoC, AtoG, AtoT, CtoA, CtoG, CtoT, GtoA, GtoC,
 84 | 							GtoT, TtoA, TtoC, TtoG);
 85 | 
 86 | 	// Insignificant results. All abort the fail train.
 87 | 	if (nucl <= 3) {
 88 | 		return NAN;
 89 | 	}
 90 | 
 91 | 	return (double)SNPs / (double)nucl;
 92 | }
 93 | 
 94 | /**
 95 |  * @brief Compute the Jukes-Cantor distance.
 96 |  *
 97 |  * @param MM - The mutation matrix.
 98 |  * @returns The corrected JC distance.
 99 |  */
100 | double estimate_JC(const model *MM) {
101 | 	double dist = estimate_RAW(MM);
102 | 	dist = -0.75 * log(1.0 - (4.0 / 3.0) * dist); // jukes cantor
103 | 
104 | 	// fix negative zero
105 | 	return dist <= 0.0 ? 0.0 : dist;
106 | }
107 | 
108 | /** @brief computes the evolutionary distance using K80.
109 |  *
110 |  * @param MM - The mutation matrix.
111 |  * @returns The corrected Kimura distance.
112 |  */
113 | double estimate_KIMURA(const model *MM) {
114 | 	size_t nucl = model_total(MM);
115 | 	size_t transitions = model_sum(MM, AtoG, GtoA, CtoT, TtoC);
116 | 	size_t transversions =
117 | 		model_sum(MM, AtoC, CtoA, AtoT, TtoA, GtoC, CtoG, GtoT, TtoG);
118 | 
119 | 	double P = (double)transitions / (double)nucl;
120 | 	double Q = (double)transversions / (double)nucl;
121 | 
122 | 	double tmp = 1.0 - 2.0 * P - Q;
123 | 	double dist = -0.25 * log((1.0 - 2.0 * Q) * tmp * tmp);
124 | 
125 | 	// fix negative zero
126 | 	return dist <= 0.0 ? 0.0 : dist;
127 | }
128 | 
/** @brief computes the evolutionary distance using LogDet.
 *
 * The LogDet distance between sequence X and sequence Y
 * is given as
 *
 * -(1 / K) * (log(det(Fxy)) - 0.5 * log(det(Fxx * Fyy)))
 *
 * Where K is the number of character states, Fxy is the site-pattern
 * frequency matrix, and diagonal matrices Fxx and Fyy give the
 * frequencies of the different character states in sequences X and Y.
 *
 * Each i,j-th entry in Fxy is the proportion of homologous sites
 * where sequences X and Y have character states i and j, respectively.
 *
 * For our purposes, X is the Subject (From) sequence and Y is the
 * Query (To) sequence and matrix Fxy looks like
 *
 *  To   A  C  G  T
 * From
 *  A  (            )
 *  C  (            )
 *  G  (            )
 *  T  (            )
 *
 * Here K is 4, hence the factor of -0.25 below.
 *
 * @param MM - The mutation matrix.
 * @returns The LogDet distance. Results less than or equal to zero are
 * returned as 0.0. A NaN is returned as is, because the final comparison is
 * false for NaN.
 */
double estimate_LOGDET(const model *MM) {

	// Turn the absolute counts into relative frequencies, i.e. the entries
	// of Fxy.
	double nucl = (double)model_total(MM);
	double P[MUTCOUNTS];
	for (int i = 0; i < MUTCOUNTS; i++) {
		P[i] = MM->counts[i] / nucl;
	}

	// The determinant of a diagonal matrix is the product of its entries.
	// Thus, log(det(Fxx * Fyy)) is the sum of the logarithms of all row and
	// column frequencies.
	double logDetFxxFyy =
		// log determinant of diagonal matrix of row sums
		log(model_sum(MM, AtoA, AtoC, AtoG, AtoT) / nucl) +
		log(model_sum(MM, CtoA, CtoC, CtoG, CtoT) / nucl) +
		log(model_sum(MM, GtoA, GtoC, GtoG, GtoT) / nucl) +
		log(model_sum(MM, TtoA, TtoC, TtoG, TtoT) / nucl) +
		// log determinant of diagonal matrix of column sums
		log(model_sum(MM, AtoA, CtoA, GtoA, TtoA) / nucl) +
		log(model_sum(MM, AtoC, CtoC, GtoC, TtoC) / nucl) +
		log(model_sum(MM, AtoG, CtoG, GtoG, TtoG) / nucl) +
		log(model_sum(MM, AtoT, CtoT, GtoT, TtoT) / nucl);

	// determinant of the site-pattern frequency matrix
	// This is the cofactor expansion along the first row (the "A to" cells).
	// Each group of three lines is one first-row entry multiplied by its 3x3
	// minor, which in turn is expanded along the "C to" row.
	double detFxy =
		P[AtoA] * P[CtoC] * (P[GtoG] * P[TtoT] - P[TtoG] * P[GtoT]) -
		P[AtoA] * P[CtoG] * (P[GtoC] * P[TtoT] - P[TtoC] * P[GtoT]) +
		P[AtoA] * P[CtoT] * (P[GtoC] * P[TtoG] - P[TtoC] * P[GtoG]) -

		P[AtoC] * P[CtoA] * (P[GtoG] * P[TtoT] - P[TtoG] * P[GtoT]) +
		P[AtoC] * P[CtoG] * (P[GtoA] * P[TtoT] - P[TtoA] * P[GtoT]) -
		P[AtoC] * P[CtoT] * (P[GtoA] * P[TtoG] - P[TtoA] * P[GtoG]) +

		P[AtoG] * P[CtoA] * (P[GtoC] * P[TtoT] - P[TtoC] * P[GtoT]) -
		P[AtoG] * P[CtoC] * (P[GtoA] * P[TtoT] - P[TtoA] * P[GtoT]) +
		P[AtoG] * P[CtoT] * (P[GtoA] * P[TtoC] - P[TtoA] * P[GtoC]) -

		P[AtoT] * P[CtoA] * (P[GtoC] * P[TtoG] - P[TtoC] * P[GtoG]) +
		P[AtoT] * P[CtoC] * (P[GtoA] * P[TtoG] - P[TtoA] * P[GtoG]) -
		P[AtoT] * P[CtoG] * (P[GtoA] * P[TtoC] - P[TtoA] * P[GtoC]);

	// The formula from above with K = 4. A negative determinant makes the
	// logarithm, and thereby the distance, NaN.
	double dist = -0.25 * (log(detFxy) - 0.5 * logDetFxxFyy);

	// fix negative zero
	return dist <= 0.0 ? 0.0 : dist;
}
199 | 
200 | /** @brief Bootstrap a mutation matrix.
201 |  *
202 |  * The classical bootstrapping process, as described by Felsenstein, resamples
203 |  * all nucleotides of a MSA. As andi only computes a pairwise alignment, this
204 |  * process boils down to a simple multinomial distribution. We just have to
205 |  * resample the elements of the mutation matrix. See Klötzl & Haubold (2016)
206 |  * for details. http://www.mdpi.com/2075-1729/6/1/11/htm
207 |  *
208 |  * @param datum - The original mutation matrix.
209 |  * @returns A bootstrapped mutation matrix.
210 |  */
211 | model model_bootstrap(model datum) {
212 | 	size_t nucl = model_total(&datum);
213 | 	double p[MUTCOUNTS];
214 | 	for (size_t i = 0; i < MUTCOUNTS; ++i) {
215 | 		p[i] = datum.counts[i] / (double)nucl;
216 | 	}
217 | 
218 | 	gsl_ran_multinomial(RNG, MUTCOUNTS, nucl, p, datum.counts);
219 | 
220 | 	return datum;
221 | }
222 | 
223 | /**
224 |  * @brief Given an anchor, classify nucleotides.
225 |  *
226 |  * For anchors we already know that the nucleotides of the subject and the query
227 |  * are equal. Thus only one sequence has to be analysed. Most models don't
228 |  * actually care about the individual nucleotides as long as they are equal in
229 |  * the two sequences. For these models, we just assume equal distribution.
230 |  *
231 |  * @param MM - The mutation matrix
232 |  * @param S - The subject
233 |  * @param len - The anchor length
234 |  */
235 | void model_count_equal(model *MM, const char *S, size_t len) {
236 | 	if (MODEL == M_RAW || MODEL == M_JC || MODEL == M_KIMURA) {
237 | 		size_t fourth = len / 4;
238 | 		MM->counts[AtoA] += fourth;
239 | 		MM->counts[CtoC] += fourth;
240 | 		MM->counts[GtoG] += fourth;
241 | 		MM->counts[TtoT] += fourth + (len & 3);
242 | 		return;
243 | 	}
244 | 
245 | 	// Fall-back algorithm for future models. Note, as this works on a
246 | 	// per-character basis it is slow.
247 | 
248 | 	size_t local_counts[4] = {0};
249 | 
250 | 	for (; len--;) {
251 | 		char s = *S++;
252 | 
253 | 		// ';!#' are all smaller than 'A'
254 | 		if (s < 'A') {
255 | 			// Technically, s can only be ';' at this point.
256 | 			continue;
257 | 		}
258 | 
259 | 		// The four canonical nucleotides can be uniquely identified by the bits
260 | 		// 0x6: A -> 0, C → 1, G → 3, T → 2. Thus the order below is changed.
261 | 		local_counts[(s >> 1) & 3]++;
262 | 	}
263 | 
264 | 	MM->counts[AtoA] += local_counts[0];
265 | 	MM->counts[CtoC] += local_counts[1];
266 | 	MM->counts[GtoG] += local_counts[3];
267 | 	MM->counts[TtoT] += local_counts[2];
268 | }
269 | 
/** @brief Convert a nucleotide to a 2bit representation.
 *
 * We want to map characters:
 *  A → 0
 *  C → 1
 *  G → 2
 *  T → 3
 * Bits 1 and 2 of the character codes are unique among the four
 * nucleotides: A → 0, C → 1, T → 2, G → 3. A small table turns this order
 * into the mapping above. Only these two bits are looked at, so the result is
 * the same for lower case letters.
 *
 * @param c - input nucleotide
 * @returns 2bit representation.
 */
char nucl2bit(unsigned char c) {
	// indexed by the bits 1 and 2 of the character
	static const char code[4] = {0, 1, 3, 2};

	return code[(c >> 1) & 3];
}
289 | 
290 | /**
291 |  * @brief Count the substitutions and add them to the mutation matrix.
292 |  *
293 |  * @param MM - The mutation matrix.
294 |  * @param S - The subject
295 |  * @param Q - The query
296 |  * @param len - The length of the alignment
297 |  */
298 | void model_count(model *MM, const char *S, const char *Q, size_t len) {
299 | 	size_t local_counts[MUTCOUNTS] = {0};
300 | 
301 | 	for (size_t i = 0; i < len; S++, Q++, i++) {
302 | 		char s = *S;
303 | 		char q = *Q;
304 | 
305 | 		// Skip special characters.
306 | 		if (s < 'A' || q < 'A') {
307 | 			continue;
308 | 		}
309 | 
310 | 		// Pick the correct two bits representing s and q.
311 | 		unsigned char foo = nucl2bit(s);
312 | 		unsigned char bar = nucl2bit(q);
313 | 
314 | 		/*
315 | 		 * Finally, we want to map the indices to the correct mutation. This is
316 | 		 * done by utilising the mutation types in model.h.
317 | 		 */
318 | 		unsigned int index = (foo << 2) + bar;
319 | 
320 | 		local_counts[index]++;
321 | 	}
322 | 
323 | 	for (int i = 0; i != MUTCOUNTS; ++i) {
324 | 		MM->counts[i] += local_counts[i];
325 | 	}
326 | }
327 | 


--------------------------------------------------------------------------------
/src/model.h:
--------------------------------------------------------------------------------
 1 | /** @file
 2 |  * @brief This header contains all structures and prototypes for creating a
 * mutation matrix and estimating distances through an evolutionary model
 4 |  * thereof.
 5 |  */
 6 | #pragma once
 7 | 
 8 | #include <stdlib.h>
 9 | 
/**
 * This enum contains all possible mutations. The total number
 * of different possible mutations is MUTCOUNTS.
 *
 * The order matters: with A = 0, C = 1, G = 2 and T = 3 the value of
 * `XtoY` is 4 * X + Y. model_count() relies on this to compute the index of
 * a mutation directly from two nucleotides.
 */
enum {
	AtoA,
	AtoC,
	AtoG,
	AtoT,
	CtoA,
	CtoC,
	CtoG,
	CtoT,
	GtoA,
	GtoC,
	GtoG,
	GtoT,
	TtoA,
	TtoC,
	TtoG,
	TtoT,
	// Not a mutation, but the number of mutation types. Also used as the end
	// marker of the lists passed to model_sum_types().
	MUTCOUNTS
};
33 | 
/** @brief The mutation matrix.
 *
 * We need to keep track of the different types of mutations between two
 * sequences. For this the following matrix is filled.
 *
 *  To   A  C  G  T
 * From
 *  A  (            )
 *  C  (            )
 *  G  (            )
 *  T  (            )
 *
 * The cells are absolute counts. Together with seq_len (the query length),
 * we can deduce the substitution rate and coverage.
 *
 * As libdivsufsort is 32 bit the sequence length is limited to (INT_MAX-1)/2.
 * We can thus use the same limit for the counts.
 */
typedef struct model {
	/** The absolute counts of mutation types. Indexed by the mutation enum
	 * above, e.g. `counts[AtoC]`. */
	unsigned int counts[MUTCOUNTS];
	/** The query length. Note that model_average() adds up the lengths of
	 * both of its inputs. */
	unsigned int seq_len;
} model;
58 | 
59 | void model_count_equal(model *, const char *, size_t);
60 | void model_count(model *, const char *, const char *, size_t);
61 | model model_average(const model *, const model *);
62 | double model_coverage(const model *);
63 | double estimate_RAW(const model *);
64 | double estimate_JC(const model *);
65 | double estimate_KIMURA(const model *);
66 | double estimate_LOGDET(const model *);
67 | model model_bootstrap(model);
68 | 


--------------------------------------------------------------------------------
/src/process.c:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | /**
  4 |  * @file
  5 |  * @brief This file contains various distance methods.
  6 |  */
  7 | #include "process.h"
  8 | #include "esa.h"
  9 | #include "global.h"
 10 | #include "io.h"
 11 | #include "model.h"
 12 | #include "sequence.h"
 13 | #include <math.h>
 14 | #include <stdint.h>
 15 | #include <stdio.h>
 16 | #include <stdlib.h>
 17 | 
 18 | #ifdef _OPENMP
 19 | #include <omp.h>
 20 | #endif
 21 | 
 22 | int calculate_bootstrap(const struct model *M, const seq_t *sequences,
 23 | 						size_t n);
 24 | 
 25 | typedef _Bool bool;
 26 | #define false 0
 27 | #define true !false
 28 | 
/**
 * @brief This structure captures properties of an anchor.
 *
 * An anchor is an exact match between the subject and the query. dist_anchor()
 * uses `pos_S` as an offset into the subject string `C->S` and `pos_Q` as an
 * offset into the query.
 */
struct anchor {
	/** The position on the subject. */
	size_t pos_S;
	/** The position on the query. */
	size_t pos_Q;
	/** The length of the exact match. */
	size_t length;
};
 40 | 
/**
 * @brief This is a structure of assorted variables needed for anchor finding.
 *
 * It bundles the arguments of dist_anchor(), so that they can be passed to
 * lucky_anchor() and anchor() as one.
 */
struct context {
	/** The enhanced suffix array of the subject. */
	const esa_s *C;
	/** The query string. */
	const char *query;
	/** The length of the query string. */
	size_t query_length;
	/** The minimal length for an anchor. */
	size_t threshold;
};
 50 | 
 51 | /**
 52 |  * @brief Compute the length of the longest common prefix of two strings.
 53 |  *
 54 |  * @param S - One string.
 55 |  * @param Q - Another string.
 56 |  * @param remaining - The length of one of the strings.
 57 |  * @returns the length of the lcp.
 58 |  */
 59 | static inline size_t lcp(const char *S, const char *Q, size_t remaining) {
 60 | 	size_t length = 0;
 61 | 	while (length < remaining && S[length] == Q[length]) {
 62 | 		length++;
 63 | 	}
 64 | 	return length;
 65 | }
 66 | 
 67 | /**
 68 |  * @brief Check whether the last anchor can be extended by a lucky anchor.
 69 |  *
 70 |  * Anchors are defined to be unique and of a minimum length. The uniqueness
 71 |  * requires us to search throw the suffix array for a second appearance of the
 72 |  * anchor. However, if a left anchor is already unique, we could be sloppy and
 73 |  * drop the uniqueness criterion for the second anchor. This way we can skip the
 74 |  * lookup and just compare characters directly. However, for a lucky anchor the
 75 |  * match still has to be longer than the threshold.
 76 |  *
 77 |  * @param ctx - Matching context of various variables.
 78 |  * @param last_match - The last anchor.
 79 |  * @param this_match - Input/Output variable for the current match.
 80 |  * @returns true iff the current match is a lucky anchor.
 81 |  */
 82 | static inline bool lucky_anchor(const struct context *ctx,
 83 | 								const struct anchor *last_match,
 84 | 								struct anchor *this_match) {
 85 | 
 86 | 	size_t advance = this_match->pos_Q - last_match->pos_Q;
 87 | 	size_t gap = this_match->pos_Q - last_match->pos_Q - last_match->length;
 88 | 
 89 | 	size_t try_pos_S = last_match->pos_S + advance;
 90 | 	if (try_pos_S >= (size_t)ctx->C->len || gap > ctx->threshold) {
 91 | 		return false;
 92 | 	}
 93 | 
 94 | 	this_match->pos_S = try_pos_S;
 95 | 	this_match->length =
 96 | 		lcp(ctx->query + this_match->pos_Q, ctx->C->S + try_pos_S,
 97 | 			ctx->query_length - this_match->pos_Q);
 98 | 
 99 | 	return this_match->length >= ctx->threshold;
100 | }
101 | 
102 | /**
103 |  * @brief Check for a new anchor.
104 |  *
105 |  * Given the current context and starting position check if the new match is an
106 |  * anchor. The latter requires uniqueness and a certain minimum length.
107 |  *
108 |  * @param ctx - Matching context of various variables.
109 |  * @param last_match - (unused)
110 |  * @param this_match - Input/Output variable for the current match.
111 |  * @returns true iff an anchor was found.
112 |  */
113 | static inline bool anchor(const struct context *ctx,
114 | 						  const struct anchor *last_match,
115 | 						  struct anchor *this_match) {
116 | 
117 | 	lcp_inter_t inter = get_match_cached(ctx->C, ctx->query + this_match->pos_Q,
118 | 										 ctx->query_length - this_match->pos_Q);
119 | 
120 | 	this_match->pos_S = ctx->C->SA[inter.i];
121 | 	this_match->length = inter.l <= 0 ? 0 : inter.l;
122 | 	return inter.i == inter.j && this_match->length >= ctx->threshold;
123 | }
124 | 
125 | /**
126 |  * @brief Divergence estimation using the anchor technique.
127 |  *
128 |  * The dist_anchor() function estimates the divergence between two
129 |  * DNA sequences. The subject is given as an ESA, whereas the query
130 |  * is a simple string. This function then looks for *anchors* -- long
131 |  * substrings that exist in both sequences. Then it manually checks for
132 |  * mutations between those anchors.
133 |  *
134 |  * @param C - The enhanced suffix array of the subject.
135 |  * @param query - The actual query string.
136 |  * @param query_length - The length of the query string. Needed for speed
137 |  * reasons.
138 |  * @param threshold - Minimal length for an anchor.
139 |  * @returns A matrix with estimates of base substitutions.
140 |  */
141 | model dist_anchor(const esa_s *C, const char *query, size_t query_length,
142 | 				  size_t threshold) {
143 | 	struct model ret = {.seq_len = query_length, .counts = {0}};
144 | 
145 | 	struct anchor this_match = {0};
146 | 	struct anchor last_match = {0};
147 | 	bool last_was_right_anchor = false;
148 | 	size_t border = C->len / 2;
149 | 
150 | 	struct context ctx = {C, query, query_length, threshold};
151 | 
152 | 	// Iterate over the complete query.
153 | 	while (this_match.pos_Q < query_length) {
154 | 
155 | 		// Check for lucky anchors and fall back to normal strategy.
156 | 		if (lucky_anchor(&ctx, &last_match, &this_match) ||
157 | 			anchor(&ctx, &last_match, &this_match)) {
158 | 			// We have reached a new anchor.
159 | 
160 | 			size_t end_S = last_match.pos_S + last_match.length;
161 | 			size_t end_Q = last_match.pos_Q + last_match.length;
162 | 			// Check if this can be a right anchor to the last one.
163 | 			if (this_match.pos_S > end_S &&
164 | 				this_match.pos_Q - end_Q == this_match.pos_S - end_S &&
165 | 				(this_match.pos_S < border) == (last_match.pos_S < border)) {
166 | 
167 | 				// classify nucleotides in the left qanchor
168 | 				model_count_equal(&ret, query + last_match.pos_Q,
169 | 								  last_match.length);
170 | 
171 | 				// Count the SNPs in between.
172 | 				model_count(&ret, C->S + end_S, query + end_Q,
173 | 							this_match.pos_Q - end_Q);
174 | 				last_was_right_anchor = true;
175 | 			} else {
176 | 				if (last_was_right_anchor) {
177 | 					// If the last was a right anchor, but with the current one,
178 | 					// we cannot extend, then add its length.
179 | 					model_count_equal(&ret, query + last_match.pos_Q,
180 | 									  last_match.length);
181 | 				} else if (last_match.length >= threshold * 2) {
182 | 					// The last anchor wasn't neither a left or right anchor.
183 | 					// But, it was as long as an anchor pair. So still count it.
184 | 					model_count_equal(&ret, query + last_match.pos_Q,
185 | 									  last_match.length);
186 | 				}
187 | 
188 | 				last_was_right_anchor = false;
189 | 			}
190 | 
191 | 			// Cache values for later
192 | 			last_match = this_match;
193 | 		}
194 | 
195 | 		// Advance
196 | 		this_match.pos_Q += this_match.length + 1;
197 | 	}
198 | 
199 | 	// Very special case: The sequences are identical
200 | 	if (last_match.length >= query_length) {
201 | 		model_count_equal(&ret, query, query_length);
202 | 		return ret;
203 | 	}
204 | 
205 | 	// We might miss a few nucleotides if the last anchor was also a right
206 | 	// anchor. The logic is the same as a few lines above.
207 | 	if (last_was_right_anchor) {
208 | 		model_count_equal(&ret, query + last_match.pos_Q, last_match.length);
209 | 	} else if (last_match.length >= threshold * 2) {
210 | 		model_count_equal(&ret, query + last_match.pos_Q, last_match.length);
211 | 	}
212 | 
213 | 	return ret;
214 | }
215 | 
216 | /*
217 |  * Include distMatrix and distMatrixLM.
218 |  */
219 | #define FAST
220 | #include "dist_hack.h"
221 | 
222 | #undef FAST
223 | #include "dist_hack.h"
224 | 
225 | /**
226 |  * @brief Calculates and prints the distance matrix
227 |  * @param sequences - An array of pointers to the sequences.
228 |  * @param n - The number of sequences.
229 |  */
230 | void calculate_distances(seq_t *sequences, size_t n) {
231 | 	struct model *M = NULL;
232 | 
233 | 	// The maximum number of sequences is near 457'845'052.
234 | 	size_t intermediate = SIZE_MAX / sizeof(*M) / n;
235 | 	if (intermediate < n) {
236 | 		size_t root = (size_t)sqrt(SIZE_MAX / sizeof(*M));
237 | 		err(1, "Comparison is limited to %zu sequences (%zu given).", root, n);
238 | 	}
239 | 
240 | 	M = malloc(n * n * sizeof(*M));
241 | 	if (!M) {
242 | 		err(errno, "Could not allocate enough memory for the comparison "
243 | 				   "matrix. Try using --join or --low-memory.");
244 | 	}
245 | 
246 | 	// compute the distances
247 | 	if (FLAGS & F_LOW_MEMORY) {
248 | 		distMatrixLM(M, sequences, n);
249 | 	} else {
250 | 		distMatrix(M, sequences, n);
251 | 	}
252 | 
253 | 	// print the results
254 | 	print_distances(M, sequences, n, 1);
255 | 
256 | 	// print additional information.
257 | 	if (FLAGS & F_VERBOSE) {
258 | 		print_coverages(M, n);
259 | 	}
260 | 
261 | 	// create new bootstrapped distance matrices
262 | 	if (BOOTSTRAP) {
263 | 		int res = calculate_bootstrap(M, sequences, n);
264 | 		if (res) {
265 | 			soft_errx("Bootstrapping failed.");
266 | 		}
267 | 	}
268 | 
269 | 	free(M);
270 | }
271 | 
272 | /** Yet another hack: row-major access to the bootstrap matrix `B`. */
273 | #define B(X, Y) (B[(X)*n + (Y)])
274 | 
275 | /** @brief Computes a bootstrap from _pairwise_ alignments.
276 |  *
277 |  * Doing bootstrapping for alignments with only two sequences is easy. It boils
278 |  * down to a simple multinomial process over the substitution matrix.
279 |  *
280 |  * @param M - the initial distance matrix
281 |  * @param sequences - a list of the sequences, containing their lengths
282 |  * @param n - the number of sequences
283 |  *
284 |  * The number of bootstrapped distance matrices to print is read from the
285 |  * global `BOOTSTRAP` variable, which is left unchanged.
286 |  *
287 |  * @returns 0 iff successful.
288 |  */
289 | int calculate_bootstrap(const struct model *M, const seq_t *sequences,
290 | 						size_t n) {
291 | 	if (!M || !sequences || !n) {
292 | 		return 1;
293 | 	}
294 | 
295 | 	// B is the new bootstrap matrix
296 | 	struct model *B = malloc(n * n * sizeof(*B));
297 | 	CHECK_MALLOC(B);
298 | 
299 | 	// Count with a local copy; `BOOTSTRAP--` would wrap the global around.
300 | 	for (unsigned long round = BOOTSTRAP; round > 0; round--) {
301 | 		for (size_t i = 0; i < n; i++) {
302 | 			for (size_t j = i; j < n; j++) {
303 | 				if (i == j) {
304 | 					B(i, j) = (struct model){.seq_len = 1.0, .counts = {1.0}};
305 | 					continue;
306 | 				}
307 | 
308 | 				// Bootstrapping should only be used with averaged distances.
309 | 				model datum = model_average(&M(i, j), &M(j, i));
310 | 				datum = model_bootstrap(datum);
311 | 
312 | 				B(j, i) = B(i, j) = datum;
313 | 			}
314 | 		}
315 | 
316 | 		print_distances(B, sequences, n, 0);
317 | 	}
318 | 
319 | 	free(B);
320 | 	return 0;
321 | }
322 | 


--------------------------------------------------------------------------------
/src/process.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @file
 3 |  * @brief This file contains the declarations of functions in process.c
 4 |  *
 5 |  */
 6 | #ifndef _PROCESS_H_
 7 | #define _PROCESS_H_
 8 | 
 9 | #include "sequence.h"
10 | 
11 | void calculate_distances(seq_t *sequences, size_t n);
12 | 
13 | #endif
14 | 


--------------------------------------------------------------------------------
/src/sequence.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @file
  3 |  * @brief Sequence utilities
  4 |  *
  5 |  * This file contains utility functions for working with DNA sequences.
  6 |  */
  7 | #include <ctype.h>
  8 | #include <limits.h>
  9 | #include <math.h>
 10 | #include <stdlib.h>
 11 | #include <string.h>
 12 | 
 13 | #include "global.h"
 14 | #include "sequence.h"
 15 | #include <compat-stdlib.h>
 16 | 
 17 | void normalize(seq_t *S);
 18 | double shustring_cum_prob(size_t x, double g, size_t l);
 19 | size_t min_anchor_length(double p, double g, size_t l);
 20 | 
 21 | /** Prepare an empty dynamic array for sequences. Returns 0 on success. */
 22 | int dsa_init(dsa_t *A) {
 23 | 	// Four slots is the smallest start from which growing by 1.5 in integer
 24 | 	// arithmetic does not get stuck at three slots.
 25 | 	A->data = malloc(4 * sizeof(seq_t));
 26 | 	CHECK_MALLOC(A->data);
 27 | 
 28 | 	A->size = 0;
 29 | 	A->capacity = 4;
 30 | 	return 0;
 31 | }
 32 | 
 33 | /** Add a sequence to an array, growing the storage by about 1.5 if full. */
 34 | void dsa_push(dsa_t *A, seq_t S) {
 35 | 	if (A->size >= A->capacity) {
 36 | 		// `capacity / 2 * 3` gets stuck for capacities below 4 (say, a zeroed
 37 | 		// or freed array); start those over with 2 * 3 = 6 slots.
 38 | 		size_t half = A->capacity < 4 ? 2 : A->capacity / 2;
 39 | 		seq_t *ptr = reallocarray(A->data, half, sizeof(seq_t) * 3);
 40 | 		CHECK_MALLOC(ptr);
 41 | 		A->capacity = half * 3;
 42 | 		A->data = ptr;
 43 | 	}
 44 | 
 45 | 	A->data[A->size++] = S;
 46 | }
 47 | 
 48 | /** Releases all stored sequences and the array itself; `A` ends up zeroed. */
 49 | void dsa_free(dsa_t *A) {
 50 | 	// The order in which the sequences are released does not matter.
 51 | 	for (size_t left = A->size; left > 0; left--) {
 52 | 		seq_free(&A->data[left - 1]);
 53 | 	}
 54 | 
 55 | 	free(A->data);
 56 | 	*A = (dsa_t){};
 57 | }
 58 | 
 59 | /** Returns the number of sequences currently stored in the array `A`. */
 60 | size_t dsa_size(const dsa_t *A) {
 61 | 	return A->size;
 62 | }
 63 | 
 64 | /** Get the raw C array, i.e. the internal storage of `A` (not a copy). */
 65 | seq_t *dsa_data(dsa_t *A) {
 66 | 	return A->data;
 67 | }
 68 | 
 69 | /**
 70 |  * @brief Convert an array of multiple sequences into a single sequence.
 71 |  *
 72 |  * All sequences of the array are concatenated into one long sequence, with a
 73 |  * `!` character between neighbours. An array holding just one sequence has
 74 |  * that sequence moved out instead. The caller has to free the initial array.
 75 |  *
 76 |  * @returns A new sequence representing the union of the array.
 77 |  */
 78 | seq_t dsa_join(dsa_t *A) {
 79 | 	seq_t joined = {};
 80 | 	const size_t count = A->size;
 81 | 	seq_t *const seqs = A->data;
 82 | 
 83 | 	// An empty array yields an empty sequence.
 84 | 	if (count == 0) {
 85 | 		return joined;
 86 | 	}
 87 | 
 88 | 	if (count == 1) {
 89 | 		// A lone sequence is _moved_ out of the array rather than copied.
 90 | 		joined = seqs[0];
 91 | 		seqs[0] = (seq_t){};
 92 | 		return joined;
 93 | 	}
 94 | 
 95 | 	// Every sequence needs room for itself plus one extra byte: a `!`
 96 | 	// separator or, for the last one, the terminating null byte.
 97 | 	size_t total = 0;
 98 | 	for (size_t i = 0; i < count; i++) {
 99 | 		total += seqs[i].len + 1;
100 | 	}
101 | 
102 | 	// A single malloc for the whole new sequence
103 | 	char *buffer = malloc(total);
104 | 	CHECK_MALLOC(buffer);
105 | 
106 | 	// Copy the sequences over, putting a `!` in front of all but the first.
107 | 	char *out = buffer;
108 | 	for (size_t i = 0; i < count; i++) {
109 | 		if (i != 0) {
110 | 			*out++ = '!';
111 | 		}
112 | 
113 | 		memcpy(out, seqs[i].S, seqs[i].len);
114 | 		out += seqs[i].len;
115 | 	}
116 | 
117 | 	// Don't forget the null byte.
118 | 	*out = '\0';
119 | 
120 | 	// Only `S` and `len` are set; the joined sequence carries no name.
121 | 	joined.S = buffer;
122 | 	joined.len = total - 1; // the null byte is not part of the length
123 | 
124 | 	return joined;
125 | }
126 | 
127 | /**
128 |  * @brief Releases the string and the name of a sequence and zeroes it.
129 |  * @param S - The sequence to free.
130 |  */
131 | void seq_free(seq_t *S) {
132 | 	free(S->name);
133 | 	free(S->S);
134 | 	*S = (seq_t){};
135 | }
136 | 
137 | /**
138 |  * @brief Compute the reverse complement.
139 |  * @param str The master string.
140 |  * @param len The length of the master string; zero yields an empty string.
141 |  * @return The reverse complement. The caller has to free it!
142 |  */
143 | char *revcomp(const char *str, size_t len) {
144 | 	if (!str) return NULL;
145 | 	char *rev = malloc(len + 1);
146 | 	CHECK_MALLOC(rev);
147 | 
148 | 	char *r = rev;
149 | 	const char *s = str + len; // one past the last character
150 | 	rev[len] = '\0';
151 | 
152 | 	// Walk backwards; unlike `do ... while (--len)` this copes with len == 0.
153 | 	while (s != str) {
154 | 		char c = *--s;
155 | 		char d;
156 | 		if (c < 'A') {
157 | 			d = ';'; // rosebud
158 | 		} else {
159 | 			// A <-> T: flip bits 0, 2 and 4 (21); C <-> G: flip bit 2 (4).
160 | 			d = c ^ (c & 2 ? 4 : 21);
161 | 		}
162 | 		*r++ = d;
163 | 	}
164 | 
165 | 	return rev;
166 | }
167 | 
168 | /**
169 |  * @brief This function concatenates the reverse complement and a given master
170 |  * string. A `#` sign is used as a separator.
171 |  * @param s The master string.
172 |  * @param len Its length.
173 |  * @return The newly allocated string: reverse complement, `#`, then `s`.
174 |  */
175 | char *catcomp(char *s, size_t len) {
176 | 	if (!s) return NULL;
177 | 
178 | 	// Final layout: `len` bytes of reverse complement, one `#`, `len` bytes
179 | 	// of forward strand and the null byte, i.e. `2 * len + 2` bytes in total.
180 | 	char *joined = realloc(revcomp(s, len), 2 * len + 2);
181 | 	CHECK_MALLOC(joined);
182 | 
183 | 	// The separator overwrites the null byte written by `revcomp`.
184 | 	joined[len] = '#';
185 | 
186 | 	// `len + 1` also copies the terminator of `s`.
187 | 	memcpy(&joined[len + 1], s, len + 1);
188 | 	return joined;
189 | }
190 | 
191 | /**
192 |  * @brief Calculates the GC content of a sequence.
193 |  *
194 |  * Counts the G and C characters up to the null byte of `S->S` and divides by
195 |  * `S->len`. NOTE(review): a length of zero makes this 0/0.
196 |  */
197 | double calc_gc(const seq_t *S) {
198 | 	size_t hits = 0;
199 | 
200 | 	// Only the upper-case letters are counted.
201 | 	for (size_t i = 0; S->S[i] != '\0'; i++) {
202 | 		const char c = S->S[i];
203 | 		hits += (c == 'G' || c == 'C');
204 | 	}
205 | 
206 | 	return (double)hits / S->len;
207 | }
208 | 
209 | /** @brief Prepares a sequence to be used as the subject in a comparison. */
210 | int seq_subject_init(seq_subject *S, const seq_t *base) {
211 | 	S->gc = calc_gc(base);
212 | 	S->RS = catcomp(base->S, base->len);
213 | 	if (S->RS == NULL) {
214 | 		return 1;
215 | 	}
216 | 	S->RSlen = 2 * base->len + 1; // reverse strand, `#`, forward strand
217 | 	S->threshold = min_anchor_length(ANCHOR_P_VALUE, S->gc, S->RSlen);
218 | 	return 0;
219 | }
220 | 
221 | /** @brief Releases the combined strands of a subject and resets `RS`, `RSlen`
222 |  * and `gc`; `threshold` keeps its value. */
223 | void seq_subject_free(seq_subject *S) {
224 | 	free(S->RS);
225 | 	S->gc = 0.0;
226 | 	S->RSlen = 0;
227 | 	S->RS = NULL;
228 | }
229 | 
230 | /** @brief Initializes a sequence from copies of the given strings.
231 |  *
232 |  * @returns 0 iff successful; 1 if any argument is a null pointer.
233 |  */
234 | int seq_init(seq_t *S, const char *seq, const char *name) {
235 | 	if (S == NULL || seq == NULL || name == NULL) {
236 | 		return 1;
237 | 	}
238 | 
239 | 	// `S` gets private copies of both strings.
240 | 	S->S = strdup(seq);
241 | 	S->name = strdup(name);
242 | 	S->len = 0;
243 | 	CHECK_MALLOC(S->S);
244 | 	CHECK_MALLOC(S->name);
245 | 
246 | 	// Filter the characters first and measure afterwards: `normalize` may
247 | 	// have stripped some of them.
248 | 	normalize(S);
249 | 	S->len = strlen(S->S);
250 | 	return 0;
251 | }
252 | 
253 | /**
254 |  * @brief Restricts a sequence characters set to ACGT.
255 |  *
256 |  * The string is filtered in place: `A`, `C`, `G`, `T` and `!` are kept, `acgt`
257 |  * is converted to upper case and everything else is dropped. If anything was
258 |  * dropped, `F_NON_ACGT` is set in the global `FLAGS`.
259 |  */
260 | void normalize(seq_t *S) {
261 | 	char *write = S->S;
262 | 	int dropped = 0;
263 | 
264 | 	// `*read` is never the null byte here, so `strchr` cannot match the
265 | 	// terminator of the character sets.
266 | 	for (const char *read = S->S; *read; read++) {
267 | 		if (strchr("ACGT!", *read)) {
268 | 			*write++ = *read;
269 | 		} else if (strchr("acgt", *read)) {
270 | 			*write++ = toupper((unsigned char)*read);
271 | 		} else {
272 | 			dropped = 1;
273 | 		}
274 | 	}
275 | 	*write = '\0';
276 | 
277 | 	// `S->len` is not touched here.
278 | 	if (dropped) {
279 | #pragma omp atomic
280 | 		FLAGS |= F_NON_ACGT;
281 | 	}
282 | }
283 | 
284 | /**
285 |  * @brief Calculates the minimum anchor length.
286 |  *
287 |  * Given some parameters calculate the minimum length for anchors according
288 |  * to the distribution from Haubold et al. (2009).
289 |  *
290 |  * @param p - The probability with which an anchor will be created under a
291 |  * random model.
292 |  * @param g - The relative amount of GC in the subject.
293 |  * @param l - The length of the subject (includes revcomp).
294 |  * @returns The first length, counting up from 1, whose cumulative shustring
295 |  * probability is no longer below `1 - p`. */
296 | size_t min_anchor_length(double p, double g, size_t l) {
297 | 	size_t x;
298 | 
299 | 	// A linear search, starting at the shortest possible length.
300 | 	for (x = 1; shustring_cum_prob(x, g / 2, l) < 1 - p; x++) {
301 | 		// the loop header does all the work
302 | 	}
303 | 	return x;
304 | }
305 | 
306 | /**
307 |  * @brief Calculates the binomial coefficient of n and k.
308 |  *
309 |  * We could (and probably should) use gsl_sf_lnchoose(xx,kk) for this.
310 |  *
311 |  * @param n - The n part of the binomial coefficient.
312 |  * @param k - The k part; values larger than `n` yield 0.
313 |  * @returns (n choose k), or 0 if `n` is zero. There is no overflow check.
314 |  */
315 | size_t binomial_coefficient(size_t n, size_t k) {
316 | 	if (n <= 0 || k > n) {
317 | 		return 0;
318 | 	}
319 | 
320 | 	if (k == 0 || k == n) {
321 | 		return 1;
322 | 	}
323 | 
324 | 	if (k > n - k) {
325 | 		k = n - k; // (n choose k) == (n choose n - k); take the shorter loop
326 | 	}
327 | 
328 | 	size_t res = 1;
329 | 
330 | 	for (size_t i = 1; i <= k; i++) {
331 | 		res *= n - k + i; // multiply first, ...
332 | 		res /= i;         // ... so that this division leaves no remainder
333 | 	}
334 | 
335 | 	return res;
336 | }
337 | 
338 | /**
339 |  * @brief Given `x` this function calculates the probability of a shustring
340 |  * with a length less than or equal to `x` under a random model. That is, it
341 |  * is the cumulative probability.
342 |  *
343 |  * Let X be the longest shortest unique substring (shustring) at any position.
344 |  * Then this function computes P{X <= x} with respect to the given parameter
345 |  * set. See Haubold et al. (2009). Note that `x` includes the final mismatch.
346 |  * Thus, `x` is `match length + 1`.
347 |  *
348 |  * @param x - The maximum length of a shustring.
349 |  * @param p - Half of the relative amount of GC in the DNA.
350 |  * @param l - The length of the subject.
351 |  * @returns The cumulative probability, capped at 1.0.
352 |  */
353 | double shustring_cum_prob(size_t x, double p, size_t l) {
354 | 	double xx = (double)x;
355 | 	double ll = (double)l;
356 | 	size_t k;
357 | 
358 | 	double s = 0.0;
359 | 	// One summand per `k` from 0 to `x`, weighted by (x choose k).
360 | 	for (k = 0; k <= x; k++) {
361 | 		double kk = (double)k;
362 | 		double t = pow(p, kk) * pow(0.5 - p, xx - kk);
363 | 
364 | 		s += pow(2, xx) * (t * pow(1 - t, ll)) *
365 | 			 (double)binomial_coefficient(x, k);
366 | 		if (s >= 1.0) { // a probability cannot exceed one; stop early
367 | 			s = 1.0;
368 | 			break;
369 | 		}
370 | 	}
371 | 
372 | 	return s;
373 | }
374 | 


--------------------------------------------------------------------------------
/src/sequence.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @file
 3 |  * @brief Functions and structures for DNA sequences
 4 |  *
 5 |  */
 6 | #ifndef _SEQUENCE_H_
 7 | #define _SEQUENCE_H_
 8 | 
 9 | #include <err.h>
10 | #include <errno.h>
11 | #include <stdlib.h>
12 | 
13 | /**
14 |  * @brief A structure for sequences.
15 |  *
16 |  * This structure is used to represent a DNA sequence of some kind.
17 |  */
18 | typedef struct seq_s {
19 | 	/** The DNA's forward strand as a null-terminated string. */
20 | 	char *S;
21 | 	/** The length of the forward strand, excluding the null byte. */
22 | 	size_t len;
23 | 	/** A name for this sequence. */
24 | 	char *name;
25 | } seq_t;
26 | 
27 | /**
28 |  * @brief This structure enhances the usual sequence with its reverse
29 |  * complement.
30 |  */
31 | typedef struct seq_subject {
32 | 	/** This member contains first the reverse strand, then a `#` and
33 | 		finally the forward strand. */
34 | 	char *RS;
35 | 	/** Corresponds to strlen(RS) */
36 | 	size_t RSlen;
37 | 	/**
38 | 	 * @brief GC-Content
39 | 	 *
40 | 	 * The relative amount of G or C in the DNA.
41 | 	 */
42 | 	double gc;
43 | 	/** The minimum length for an anchor, set by seq_subject_init(). */
44 | 	size_t threshold;
45 | } seq_subject;
46 | 
47 | void seq_free(seq_t *S);
48 | int seq_subject_init(seq_subject *S, const seq_t *);
49 | void seq_subject_free(seq_subject *S);
50 | int seq_init(seq_t *S, const char *seq, const char *name);
51 | 
52 | /**
53 |  * @brief A dynamically growing structure for sequences.
54 |  */
55 | typedef struct dsa_s {
56 | 	seq_t *data; /**< The storage; its first `size` slots are in use. */
57 | 	size_t capacity, size; /**< Number of allocated and of used slots. */
58 | } dsa_t;
59 | 
60 | int dsa_init(dsa_t *A);
61 | void dsa_push(dsa_t *A, seq_t S);
62 | void dsa_free(dsa_t *A);
63 | size_t dsa_size(const dsa_t *A);
64 | seq_t *dsa_data(dsa_t *A);
65 | 
66 | seq_t dsa_join(dsa_t *dsa);
67 | 
68 | #endif
69 | 


--------------------------------------------------------------------------------
/test/Makefile.am:
--------------------------------------------------------------------------------
 1 | check_PROGRAMS = test_esa test_seq test_fasta test_process
 2 | dist_noinst_DATA = test_extra.sh test_random.sh test_join.sh nan.sh low_homo.sh
 3 | # test_seq: GLib based tests, built from test_seq.c and sequence.c.
 4 | test_seq_SOURCES = test_seq.c $(top_srcdir)/src/sequence.c
 5 | test_seq_CPPFLAGS = -I$(top_srcdir)/src -I$(top_srcdir)/opt -DDEBUG -std=gnu99
 6 | test_seq_CFLAGS = -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers
 7 | test_seq_LDADD = $(GLIB_LIBS) $(top_builddir)/opt/libcompat.a
 8 | # test_process: additionally compiles esa.c, io.c, model.c and process.c and links libpfasta.
 9 | test_process_SOURCES = test_process.c $(top_srcdir)/src/esa.c $(top_srcdir)/src/io.c $(top_srcdir)/src/model.c $(top_srcdir)/src/process.c $(top_srcdir)/src/sequence.c $(top_srcdir)/src/global.h
10 | test_process_CPPFLAGS = $(OPENMP_CFLAGS) -I$(top_srcdir)/src -I$(top_srcdir)/opt -I$(top_srcdir)/libs -DDEBUG -std=gnu99
11 | test_process_CFLAGS = $(OPENMP_CFLAGS) -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers
12 | test_process_LDADD = $(GLIB_LIBS) $(top_builddir)/opt/libcompat.a $(top_builddir)/libs/libpfasta.a
13 | # test_esa: GLib based tests, built from test_esa.c, esa.c and sequence.c.
14 | test_esa_SOURCES = test_esa.c $(top_srcdir)/src/esa.c $(top_srcdir)/src/sequence.c $(top_srcdir)/src/esa.h
15 | test_esa_CPPFLAGS = $(OPENMP_CFLAGS) -I$(top_srcdir)/libs -I$(top_srcdir)/opt -I$(top_srcdir)/src -DDEBUG -std=gnu99
16 | test_esa_CFLAGS = $(OPENMP_CFLAGS) -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers
17 | test_esa_LDADD = $(GLIB_LIBS) $(top_builddir)/opt/libcompat.a
18 | # test_fasta: the C++ sequence generator that the shell tests call.
19 | test_fasta_SOURCES = test_fasta.cxx
20 | # A plain `make` in this directory builds the check programs, too.
21 | .PHONY: all
22 | all: $(check_PROGRAMS)
23 | 


--------------------------------------------------------------------------------
/test/low_homo.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -f
 2 | # Checks that andi mentions 'homology' for two genomes that barely share any.
 3 | SEED=${RANDOM_SEED:-0}
 4 | SEED2=0
 5 | SEED3=0
 6 | if test $SEED -ne 0; then
 7 | 	SEED=$((SEED + 1))
 8 | 	SEED2=$((SEED + 2))
 9 | 	SEED3=$((SEED + 3))
10 | fi
11 | # Each call prints the records S0 and S1: two long files and one short file.
12 | ./test/test_fasta -s $SEED -l 100000 > a_low.fa
13 | ./test/test_fasta -s $SEED2 -l 100000 > b_low.fa
14 | ./test/test_fasta -s $SEED3 -l 100 > both_low.fa
15 | # S0_low.fa: the S0 records of both_low.fa and a_low.fa; S1_low.fa: the S1 records of both_low.fa and b_low.fa.
16 | cat both_low.fa a_low.fa | awk -v RS='>' '{if($1 == "S0")print ">"$0 > "S0_low.fa"}'
17 | cat both_low.fa b_low.fa | awk -v RS='>' '{if($1 == "S1")print ">"$0 > "S1_low.fa"}'
18 | 
19 | # this is expected to trigger the low homology warning
20 | ./src/andi -j S0_low.fa S1_low.fa 2>&1 | grep 'homology'
21 | EXIT_VAL=$?
22 | # grep fails if the warning is missing; print the sequence headers in that case.
23 | if [[ EXIT_VAL -ge 1 ]]; then
24 | 	echo "Triggering low homology failed" >&2
25 | 	grep '^>' a_low.fa b_low.fa both_low.fa
26 | fi
27 | # Clean up and pass on the exit status of grep.
28 | rm -f a_low.fa b_low.fa both_low.fa S0_low.fa S1_low.fa
29 | exit $EXIT_VAL
30 | 


--------------------------------------------------------------------------------
/test/nan.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -f
 2 | # Checks that andi prints 'nan' when comparing two independently generated files.
 3 | SEED=${RANDOM_SEED:-0}
 4 | SEED2=0
 5 | if test $SEED -ne 0; then
 6 | 	SEED=$((SEED + 1))
 7 | 	SEED2=$((SEED + 2))
 8 | fi
 9 | 
10 | # Two files generated from different seeds (or from two random ones, for seed 0).
11 | ./test/test_fasta -s $SEED -l 10000 > a_nan.fa
12 | ./test/test_fasta -s $SEED2 -l 10000 > b_nan.fa
13 | 
14 | # this is expected to trigger the nan warning
15 | ./src/andi -j a_nan.fa b_nan.fa 2>&1 | grep 'nan'
16 | EXIT_VAL=$?
17 | 
18 | # grep fails if no 'nan' shows up; print the sequence headers in that case.
19 | if [[ EXIT_VAL -ge 1 ]]; then
20 | 	echo "Triggering nan failed" >&2
21 | 	grep '^>' a_nan.fa b_nan.fa
22 | fi
23 | # Clean up and pass on the exit status of grep.
24 | rm -f a_nan.fa b_nan.fa
25 | exit $EXIT_VAL
26 | 


--------------------------------------------------------------------------------
/test/test_esa.c:
--------------------------------------------------------------------------------
  1 | #include "esa.h"
  2 | #include <glib.h>
  3 | #include "global.h"
  4 | #include <stdio.h>
  5 | #include <string.h>
  6 | 
  7 | int FLAGS = F_NONE;
  8 | int THREADS = 1;
  9 | double ANCHOR_P_VALUE = 0.025;
 10 | 
 11 | extern const int CACHE_LENGTH;
 12 | 
 13 | /**
 14 |  * Maps the lowest three bits of `code` to a character: 0-3 are `ACGT`, 4-6
 15 |  * are `!`, `;` and `#`; the remaining value 7 gives the null byte.
 16 |  */
 17 | char code3char( ssize_t code){
 18 | 	// The terminator of the literal sits at index 7, so every masked code
 19 | 	// is a valid index.
 20 | 	static const char letters[] = "ACGT!;#";
 21 | 	const ssize_t masked = code & 0x7;
 22 | 
 23 | 	return letters[masked];
 24 | }
 25 | /** The state shared by the setup, test and teardown functions. */
 26 | typedef struct {
 27 | 	esa_s *C; // the ESA under test, built by `esa_init` from `subject`
 28 | 	seq_t *S; // the plain test sequence
 29 | 	seq_subject subject; // `S` as prepared by `seq_subject_init`
 30 | } esa_fixture;
 31 | /** Asserts that both intervals agree in their `i`, `j` and `l` members. */
 32 | void assert_equal_lcp( const lcp_inter_t *a, const lcp_inter_t *b){
 33 | 	g_assert_cmpint( a->i, ==, b->i);
 34 | 	g_assert_cmpint( a->j, ==, b->j);
 35 | 	g_assert_cmpint( a->l, ==, b->l);
 36 | }
 37 | /** Checks a cached lookup against the uncached one, and the match itself. */
 38 | void assert_equal_cache_nocache( const esa_s *C, const char *str, size_t qlen){
 39 | 	lcp_inter_t a = get_match_cached(C, str, qlen);
 40 | 	lcp_inter_t b = get_match(C, str, qlen);
 41 | 	assert_equal_lcp( &a, &b);
 42 | 	g_assert(strncmp(str, C->S + C->SA[a.i], a.l) == 0); // the first `a.l` characters agree
 43 | 	g_assert( str[a.l] != C->S[ a.l + C->SA[a.i]] || str[a.l] == '\0'); // and the next one differs, or the query ends
 44 | }
 45 | /** Builds the fixture from a single sequence without any `!` separator. */
 46 | void setup( esa_fixture *ef, gconstpointer test_data){
 47 | 	ef->C = malloc( sizeof(esa_s));
 48 | 	ef->S = malloc( sizeof(seq_t));
 49 | 
 50 | 	g_assert( ef->C != NULL);
 51 | 	g_assert( ef->S != NULL);
 52 | 
 53 | 	const char *seq = {
 54 | 		"TACGAGCACTGGTGGAATTGATGTC"
 55 | 		"CAGTCTTATATGGCGCACCAGGCTG"
 56 | 		"ATAGTAGTAGCAGTTTGCTTATCTC"
 57 | 		"ATCGCGTGTTTCCGGATGACAGAGA"
 58 | 		"TACGTGCACTGGTGGGATTGATGTC"
 59 | 		"TAGTATTATATGGCGCACCAGGATG"
 60 | 		"ATAGTAGTAGCAGTTTGCTTATCCC"
 61 | 		"ATCGCGTGTTTGCGGATGACCGAGA"
 62 | 	};
 63 | 	// sequence -> subject (both strands) -> ESA
 64 | 	g_assert( seq_init( ef->S, seq, "S0" ) == 0);
 65 | 	seq_subject_init( &ef->subject, ef->S);
 66 | 	g_assert( ef->subject.RS != NULL);
 67 | 	int check = esa_init( ef->C, &ef->subject);
 68 | 	g_assert( check == 0);
 69 | }
 70 | /** Like `setup`, but with a `!` separator in the middle of the sequence. */
 71 | void setup2( esa_fixture *ef, gconstpointer test_data){
 72 | 	ef->C = malloc( sizeof(esa_s));
 73 | 	ef->S = malloc( sizeof(seq_t));
 74 | 
 75 | 	g_assert( ef->C != NULL);
 76 | 	g_assert( ef->S != NULL);
 77 | 
 78 | 	const char *seq = {
 79 | 		"TACGAGCACTGGTGGAATTGATGTC"
 80 | 		"CAGTCTTATATGGCGCACCAGGCTG"
 81 | 		"ATAGTAGTAGCAGTTTGCTTATCTC"
 82 | 		"ATCGCGTGTTTCCGGATGACAGAGA"
 83 | 		"!"
 84 | 		"TACGTGCACTGGTGGGATTGATGTC"
 85 | 		"TAGTATTATATGGCGCACCAGGATG"
 86 | 		"ATAGTAGTAGCAGTTTGCTTATCCC"
 87 | 		"ATCGCGTGTTTGCGGATGACCGAGA"
 88 | 	};
 89 | 	// sequence -> subject (both strands) -> ESA
 90 | 	g_assert( seq_init( ef->S, seq, "S0" ) == 0);
 91 | 	seq_subject_init( &ef->subject, ef->S);
 92 | 	g_assert( ef->subject.RS != NULL);
 93 | 	int check = esa_init( ef->C, &ef->subject);
 94 | 	g_assert( check == 0);
 95 | }
 96 | /** Releases everything that `setup` or `setup2` allocated. */
 97 | void teardown( esa_fixture *ef, gconstpointer test_data){
 98 | 	esa_free(ef->C);
 99 | 	free(ef->C);
100 | 	seq_free(ef->S);
101 | 	free(ef->S);
102 | 	seq_subject_free(&ef->subject);
103 | }
104 | 
105 | extern int count;
106 | /** Compares cached and uncached lookups for a few hand-picked queries. */
107 | void basic( esa_fixture *ef, gconstpointer test_data){
108 | 	esa_s *C = ef->C;
109 | 	g_assert( C->SA);
110 | 
111 | 	lcp_inter_t a = get_match_cached(C, "AAGACTGG", 8);
112 | 	lcp_inter_t b = get_match(C, "AAGACTGG", 8);
113 | 	assert_equal_lcp( &a, &b);
114 | 	g_assert(strncmp("AAGACTGG",C->S + C->SA[a.i], 8) == 0); // expects a match over the full query
115 | 
116 | 	a = get_match_cached(C, "AATTAAAA", 8);
117 | 	b = get_match(C, "AATTAAAA", 8);
118 | 	assert_equal_lcp( &a, &b);
119 | 	g_assert(strncmp("AATTAAAA",C->S + C->SA[a.i], a.l) == 0);
120 | 
121 | 	a = get_match_cached(C, "ACCGAGAA", 8);
122 | 	b = get_match(C, "ACCGAGAA", 8);
123 | 	assert_equal_lcp( &a, &b);
124 | 	g_assert(strncmp("ACCGAGAA",C->S + C->SA[a.i], a.l) == 0);
125 | 
126 | 	a = get_match_cached(C, "AAAAAAAAAAAA", 12);
127 | 	b = get_match(C, "AAAAAAAAAAAA", 12);
128 | 	assert_equal_lcp( &a, &b);
129 | 	g_assert(strncmp("AAAAAAAAAAAA",C->S + C->SA[a.i], a.l) == 0);
130 | 
131 | 	//g_assert_cmpint(count, >=, 1 << (2*8));
132 | }
133 | /** Cached and uncached lookups have to agree for queries of various lengths. */
134 | void normq_cached( esa_fixture *ef, gconstpointer test_data){
135 | 	esa_s *C = ef->C;
136 | 	g_assert( C->SA);
137 | 	lcp_inter_t a, b;
138 | 
139 | 	a = get_match_cached(C, "A", 1);
140 | 	b = get_match(C, "A", 1);
141 | 	assert_equal_lcp( &a, &b);
142 | 
143 | 	a = get_match_cached(C, "C", 1);
144 | 	b = get_match(C, "C", 1);
145 | 	assert_equal_lcp( &a, &b);
146 | 
147 | 	a = get_match_cached(C, "CT", 2);
148 | 	b = get_match(C, "CT", 2);
149 | 	assert_equal_lcp( &a, &b);
150 | 
151 | 	a = get_match_cached(C, "AAGACTGG", 8);
152 | 	b = get_match(C, "AAGACTGG", 8);
153 | 	assert_equal_lcp( &a, &b);
154 | 	
155 | 	a = get_match_cached(C, "AATTAAAA", 8);
156 | 	b = get_match(C, "AATTAAAA", 8);
157 | 	assert_equal_lcp( &a, &b);
158 | 
159 | 	a = get_match_cached(C, "ACCGAGAA", 8);
160 | 	b = get_match(C, "ACCGAGAA", 8);
161 | 	assert_equal_lcp( &a, &b);
162 | 
163 | 	a = get_match_cached(C, "AAAAAAAAAAAA", 12);
164 | 	b = get_match(C, "AAAAAAAAAAAA", 12);
165 | 	assert_equal_lcp( &a, &b);
166 | 	// NOTE(review): the next query has 13 characters, yet 12 is passed as its length.
167 | 	a = get_match_cached(C, "!AAAAAAAAAAAA", 12);
168 | 	b = get_match(C, "!AAAAAAAAAAAA", 12);
169 | 	assert_equal_lcp( &a, &b);
170 | }
171 | 
172 | size_t MAX_DEPTH = 11;
173 | 
174 | void prefix_dfs( esa_s *C, char *str, size_t depth);
175 | /** Runs the cache check for every ACGT string of length `MAX_DEPTH`. */
176 | void prefix( esa_fixture *ef, gconstpointer test_data){
177 | 	esa_s *C = ef->C;
178 | 	char str[MAX_DEPTH+1];
179 | 	str[MAX_DEPTH] = '\0';
180 | 	prefix_dfs( C, str, 0);
181 | }
182 | /** Fills `str` from position `depth` on with all combinations of codes 0 to 3. */
183 | void prefix_dfs( esa_s *C, char *str, size_t depth){
184 | 	if( depth < MAX_DEPTH){
185 | 		for( int code = 0; code < 4; ++code){
186 | 			str[depth] = code2char(code);
187 | 			prefix_dfs( C, str, depth + 1);
188 | 		}
189 | 	} else {
190 | 		assert_equal_cache_nocache(C, str, depth); // the string is complete: check it
191 | 	}
192 | }
193 | /** Registers every test with one of the two fixtures and runs them. */
194 | int main(int argc, char *argv[])
195 | {
196 | 	g_test_init( &argc, &argv, NULL);
197 | 	g_test_add("/esa/basic", esa_fixture, NULL, setup, basic, teardown);
198 | 	g_test_add("/esa/sample cache", esa_fixture, NULL, setup, normq_cached, teardown);
199 | 	g_test_add("/esa/sample cache 2", esa_fixture, NULL, setup2, normq_cached, teardown);
200 | 	g_test_add("/esa/full cache", esa_fixture, NULL, setup, prefix, teardown);
201 | 	g_test_add("/esa/full cache 2", esa_fixture, NULL, setup2, prefix, teardown);
202 | 
203 | 	
204 | 	return g_test_run();
205 | }
206 | 
207 | 


--------------------------------------------------------------------------------
/test/test_extra.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh -f
 2 | # Smoke tests for input with several sequences, --low-memory and --file-of-filenames.
 3 | # Test if andi exists, and can be executed
 4 | ./src/andi --version > /dev/null || exit 1
 5 | # test_fasta replaces a seed of 0 by a random one; otherwise derive three seeds.
 6 | SEED=${RANDOM_SEED:-0}
 7 | SEED2=0
 8 | SEED3=0
 9 | if test $SEED -ne 0; then
10 |         SEED=$((SEED + 1))
11 |         SEED2=$((SEED + 2))
12 |         SEED3=$((SEED + 3))
13 | fi
14 | 
15 | # Test andi for more than just two sequences at a time
16 | ./test/test_fasta -s $SEED -l 100000 -d 0.01 -d 0.01 -d 0.01 -d 0.01 | ./src/andi > /dev/null || exit 1
17 | 
18 | # Test low-memory mode: its output has to equal that of the default mode
19 | ./test/test_fasta -s $SEED2 -l 10000 > test_extra.fasta
20 | ./src/andi test_extra.fasta > extra.out
21 | ./src/andi test_extra.fasta --low-memory > extra_low_memory.out
22 | diff extra.out extra_low_memory.out || exit 1
23 | 
24 | # Test file of filenames, read from a file and from standard input
25 | ./test/test_fasta -s $SEED3 -l 10000 > test_extra.fasta
26 | echo "$PWD/test_extra.fasta" > fof.txt
27 | ./src/andi test_extra.fasta > extra.out
28 | ./src/andi --file-of-filenames fof.txt > fof.out
29 | cat fof.txt | ./src/andi --file-of-filenames - > fof2.out
30 | diff extra.out fof.out || exit 1
31 | diff extra.out fof2.out || exit 1
32 | 
33 | # NOTE(review): a failing check exits early and leaves these files behind.
34 | rm -f test_extra.fasta extra.out extra_low_memory.out fof.out fof2.out fof.txt
35 | 
36 | 


--------------------------------------------------------------------------------
/test/test_fasta.cxx:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This program can create genome sequences with a specific distance.
  3 |  */
  4 | 
  5 | #include <iostream>
  6 | #include <random>
  7 | #include <functional>
  8 | #include <string>
  9 | #include <getopt.h>
 10 | 
 11 | using namespace std;
 12 | 
 13 | void usage();
 14 | void print_seq( unsigned, unsigned, int, int, double);
 15 | // Parses the options and prints one FASTA record per requested divergence.
 16 | int main(int argc, char *argv[]){
 17 | 	// defaults; the seed stays random unless `-s` gives a non-zero value
 18 | 	random_device rd{};
 19 | 	auto seed = rd();
 20 | 	int length = 1000;
 21 | 	int line_length = 70;
 22 | 	int raw = 0;
 23 | 	// one entry per sequence; the first sequence (S0) always has divergence 0
 24 | 	auto seqs = vector<double>{0};
 25 | 
 26 | 	int check;
 27 | 	while((check = getopt(argc, argv, "s:l:L:d:r")) != -1){
 28 | 		switch(check) {
 29 | 			case 's':
 30 | 				{
 31 | 					seed = static_cast<unsigned int>(stol(optarg));
 32 | 					if( seed == 0){
 33 | 						seed = rd();
 34 | 					}
 35 | 					break;
 36 | 				}
 37 | 			case 'l': length = stoi(optarg); break;
 38 | 			case 'L': line_length = stoi(optarg); break;
 39 | 			case 'd': seqs.push_back(stod(optarg)); break;
 40 | 			case 'r': raw = 1; break;
 41 | 			case '?':
 42 | 			default: usage(); return 1;
 43 | 		}
 44 | 	}
 45 | 	// without any `-d`, add a second sequence at a distance of 0.1
 46 | 	if( seqs.size() < 2){
 47 | 		seqs.push_back(0.1);
 48 | 	}
 49 | 	// unless `-r` was given, convert each distance d to p = 3/4 * (1 - exp(-4/3 * d))
 50 | 	if( !raw){
 51 | 		for(auto& dist : seqs) {
 52 | 			auto d = dist;
 53 | 			auto p = 0.75 - 0.75 * exp(-(4.0/3.0) * d);
 54 | 			dist = p;
 55 | 		}
 56 | 	}
 57 | 	// all sequences share the base seed; the mutation seed goes up by one each
 58 | 	auto base_seed = seed;
 59 | 
 60 | 	for( auto i=0u; i< seqs.size(); i++){
 61 | 		cout << ">S" << i << " (base_seed: " << base_seed << ")" << endl;
 62 | 		print_seq( base_seed, seed++, length, line_length, seqs[i]);
 63 | 	}
 64 | 
 65 | 	return 0;
 66 | }
 67 | 
 68 | 
 69 | static auto ACGT = "ACGT";
 70 | static auto NO_A = "CGT";
 71 | static auto NO_C = "AGT";
 72 | static auto NO_G = "ACT";
 73 | static auto NO_T = "ACG";
 74 | // Prints `length` random nucleotides, about `length * divergence` of them mutated, in lines of `line_length`.
 75 | void print_seq( unsigned base_seed, unsigned mut_seed, int length, int line_length, double divergence){
 76 | 	// A line length below one would never advance the loop below; use main's default.
 77 | 	if( line_length < 1) line_length = 70;
 78 | 	auto line = string{}; // replaces a variable length array, which is not standard C++
 79 | 
 80 | 	auto base_rand = default_random_engine{base_seed};
 81 | 	auto base_dist = uniform_int_distribution<int>{0,3};
 82 | 	auto base_acgt = [&]{return ACGT[base_dist(base_rand)];};
 83 | 
 84 | 	auto mut_rand = default_random_engine{mut_seed};
 85 | 	auto mut_dist = uniform_real_distribution<double>{0,1};
 86 | 	auto mut = bind( mut_dist, mut_rand); // bind stores its own copy of the engine
 87 | 	auto mut_acgt = uniform_int_distribution<int>{0,2};
 88 | 	auto mutate = [&](char c){
 89 | 		int idx = mut_acgt(mut_rand);
 90 | 		switch(c){
 91 | 			case 'A': return NO_A[idx];
 92 | 			case 'C': return NO_C[idx];
 93 | 			case 'G': return NO_G[idx];
 94 | 			case 'T': return NO_T[idx];
 95 | 			default: return 'X';
 96 | 		}
 97 | 	};
 98 | 
 99 | 	double nucleotides = (double)length;
100 | 	double mutations = nucleotides * divergence;
101 | 
102 | 	for(int i= length, j; i > 0; i -= j){
103 | 		j = min(line_length, i);
104 | 		line.clear();
105 | 		for(auto k=0; k<j; k++){
106 | 			char c = base_acgt();
107 | 
108 | 			if( mut() < mutations / nucleotides ){
109 | 				c = mutate(c);
110 | 				mutations--;
111 | 			}
112 | 
113 | 			line.push_back(c);
114 | 			nucleotides--;
115 | 		}
116 | 
117 | 		cout << line << endl;
118 | 	}
119 | }
120 | // Prints a synopsis to stderr. NOTE(review): `-r` takes no argument, despite "[-r raw]".
121 | void usage(){
122 | 	const static char *str = {
123 | 		"usage: test_fasta [-l length] [-d dist...] [-L line length] [-s seed] [-r raw]\n"
124 | 	};
125 | 	cerr << str;
126 | }
127 | 


--------------------------------------------------------------------------------
/test/test_join.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/sh -f
  2 | # Checks that andi -j still finds the requested distance for genomes split into contigs.
  3 | ./src/andi --help > /dev/null || exit 1
  4 | 
  5 | SEED=${RANDOM_SEED:-0}
  6 | SEED2=0
  7 | SEED3=0
  8 | if test $SEED -ne 0; then
  9 |         SEED=$((SEED + 1))
 10 |         SEED2=$((SEED + 2))
 11 |         SEED3=$((SEED + 3))
 12 | fi
 13 | 
 14 | # Simple join test
 15 | ./test/test_fasta -s $SEED -l 1000 -L 1000 -d 0.1 > p1_join.fasta
 16 | ./test/test_fasta -s $SEED2 -l 1000 -L 1000 -d 0.1 > p2_join.fasta
 17 | ./test/test_fasta -s $SEED3 -l 10000 -L 10000 -d 0.1 > p3_join.fasta
 18 | # Each file holds two records of two lines: S0 comes first, S1 last.
 19 | head -qn 2 p1_join.fasta p2_join.fasta p3_join.fasta > S0_join.fasta
 20 | tail -qn 2 p1_join.fasta p2_join.fasta p3_join.fasta > S1_join.fasta
 21 | 
 22 | rm p1_join.fasta p2_join.fasta p3_join.fasta;
 23 | 
 24 | # RES is 1 iff the second field of andi's last output line is within 0.03 of 0.1.
 25 | RES=$(./src/andi -m RAW -t 1 -j S0_join.fasta S1_join.fasta |
 26 | 	tail -n 1 |
 27 | 	awk '{print ($2 - 0.1)}' |
 28 | 	awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.03}'
 29 | 	)
 30 | # Compare as strings: an empty RES (andi printed nothing) has to fail as well.
 31 | if test "$RES" != 1; then
 32 | 	echo "The last test computed a distance deviating more than three percent from its intended value."
 33 | 	echo "See S0_join.fasta and S1_join.fasta for the used sequences."
 34 | 	exit 1;
 35 | fi
 36 | 
 37 | SEED=${RANDOM_SEED:-0}
 38 | SEED2=0
 39 | if test $SEED -ne 0; then
 40 |         SEED=$((SEED + 5))
 41 |         SEED2=$((SEED + 6))
 42 | fi
 43 | 
 44 | #unbalanced number of contigs
 45 | ./test/test_fasta -s $SEED -l 1000 -L 1000 -d 0.1 > p2_join.fasta
 46 | ./test/test_fasta -s $SEED2 -l 10000 -L 10000 -d 0.1 > p3_join.fasta
 47 | 
 48 | head -qn 2 p3_join.fasta > S0_join.fasta
 49 | tail -qn 2 p2_join.fasta p3_join.fasta > S1_join.fasta
 50 | 
 51 | rm p2_join.fasta p3_join.fasta;
 52 | 
 53 | 
 54 | RES=$(./src/andi -m RAW -t1 -j S0_join.fasta S1_join.fasta |
 55 |         tail -n 1 |
 56 |         awk '{print ($2 - 0.1)}' |
 57 |         awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.03}'
 58 |         )
 59 | # Compare as strings: an empty RES (andi printed nothing) has to fail as well.
 60 | if test "$RES" != 1; then
 61 |         echo "The last test computed a distance deviating more than three percent from its intended value."
 62 |         echo "See S0_join.fasta and S1_join.fasta for the used sequences."
 63 |         exit 1;
 64 | fi
 65 | 
 66 | SEED=${RANDOM_SEED:-0}
 67 | SEED2=0
 68 | SEED3=0
 69 | if test $SEED -ne 0; then
 70 |         SEED=$((SEED + 11))
 71 |         SEED2=$((SEED + 12))
 72 |         SEED3=$((SEED + 13))
 73 | fi
 74 | 
 75 | #unbalanced number of contigs 2
 76 | ./test/test_fasta -s $SEED -l 1000 -L 1000 -d 0.1 > p1_join.fasta
 77 | ./test/test_fasta -s $SEED2 -l 1000 -L 1000 -d 0.1 > p2_join.fasta
 78 | ./test/test_fasta -s $SEED3 -l 10000 -L 10000 -d 0.1 > p3_join.fasta
 79 | 
 80 | head -qn 2 p1_join.fasta p3_join.fasta > S0_join.fasta
 81 | tail -qn 2 p1_join.fasta p2_join.fasta p3_join.fasta > S1_join.fasta
 82 | 
 83 | rm p1_join.fasta p2_join.fasta p3_join.fasta;
 84 | 
 85 | 
 86 | RES=$(./src/andi -mRAW -t 1 -j S0_join.fasta S1_join.fasta |
 87 |         tail -n 1 |
 88 |         awk '{print ($2 - 0.1)}' |
 89 |         awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.03}'
 90 |         )
 91 | # Compare as strings: an empty RES (andi printed nothing) has to fail as well.
 92 | if test "$RES" != 1; then
 93 |         echo "The last test computed a distance deviating more than three percent from its intended value."
 94 |         echo "See S0_join.fasta and S1_join.fasta for the used sequences."
 95 |         exit 1;
 96 | fi
 97 | 
 98 | 
 99 | rm S0_join.fasta S1_join.fasta
100 | 


--------------------------------------------------------------------------------
/test/test_process.c:
--------------------------------------------------------------------------------
 1 | #include "global.h"
 2 | #include "process.h"
 3 | #include <glib.h>
 4 | #include <math.h>
 5 | 
 6 | int FLAGS = 0; /* NOTE(review): these globals are presumably the definitions behind extern declarations in global.h - confirm */
 7 | int THREADS = 1;
 8 | long unsigned int BOOTSTRAP = 0;
 9 | double ANCHOR_P_VALUE = 0.025; /* same value as p_value in test_shustring_cum_prob */
10 | gsl_rng *RNG = NULL;
11 | int MODEL = M_JC;
12 | 
13 | double shustring_cum_prob(size_t x, double g, size_t l); /* prototypes written out by hand; presumably absent from process.h - TODO confirm */
14 | size_t min_anchor_length(double p, double g, size_t l);
15 | 
16 | /* Asserts that the cumulative probability reaches 1 - p_value at the length returned by min_anchor_length(), but not one position earlier. */
17 | void test_shustring_cum_prob() {
18 | 	const double p_value = 0.025, gc = 0.5;
19 | 	const int len = 100000;
20 | 
21 | 	/* min_anchor_length() receives gc itself, shustring_cum_prob() is queried with gc / 2. */
22 | 	const size_t threshold = min_anchor_length(p_value, gc, len);
23 | 	g_assert_cmpfloat(1 - p_value, <, shustring_cum_prob(threshold + 1, gc / 2, len));
24 | 	g_assert_cmpfloat(1 - p_value, <=, shustring_cum_prob(threshold, gc / 2, len));
25 | 	g_assert_cmpfloat(1 - p_value, >, shustring_cum_prob(threshold - 1, gc / 2, len));
26 | }
27 | 
28 | /* Registers the single process test with GLib's test runner and returns the runner's result. */
29 | int main(int argc, char *argv[]) {
30 | 	g_test_init(&argc, &argv, NULL);
31 | 	g_test_add_func("/process/shustring_cum_prob", test_shustring_cum_prob);
32 | 	return g_test_run();
33 | }
34 | 


--------------------------------------------------------------------------------
/test/test_random.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh -f
 2 | 
 3 | # This script tests the accuracy of andi with random inputs. For that
 4 | # it uses the small program test_fasta to generate pairs of sequences
 5 | # with a given distance. By default, test_fasta creates a new set of
 6 | # sequences each time it is called. Thus, this test has a small, but
 7 | # non-zero probability of failing. That is a problem for Debian's
 8 | # reproducible builds effort. So this script acts as a wrapper around
 9 | # this issue.
10 | #
11 | # Simply calling this script via
12 | #     % ./test/test_random.sh
13 | # checks a new test-case every time. But with the right parameter
14 | #     % RANDOM_SEED=1729 ./test/test_random.sh
15 | # one specific set of sequences is validated.
16 | 
17 | ./src/andi --help > /dev/null || exit 1
18 | 
19 | LENGTH=100000
20 | 
21 | # If RANDOM_SEED is set, use its value. Otherwise 0 is handed to
22 | # test_fasta, signalling that a new set of sequences shall be generated.
23 | SEED=${RANDOM_SEED:-0}
24 | 
25 | for dist in 0.0 0.001 0.01 0.02 0.05 0.1 0.2 0.3
26 | do
27 | 	for n in $(seq 10)
28 | 	do
29 | 		if test "$SEED" -ne 0; then
30 | 			SEED=$((SEED + 1))
31 | 		fi
32 | 
33 | 		res=$(./test/test_fasta -s "$SEED" -l $LENGTH -d $dist |
34 | 			tee ./test/test_random.fasta |
35 | 			./src/andi -t 1 |
36 | 			tail -n 1 |
37 | 			awk -v dist=$dist 'function abs(x){return ((x < 0.0) ? -x : x)} {print (abs($2-dist) <= 0.055 && abs($2-dist) <= 0.055 * dist)}')
38 | 		# Compare as strings: an empty result (andi produced no output) has to fail as well.
39 | 		if test "$res" != "1"; then
40 | 			echo "The last test computed a distance deviating more than five percent from its intended value."
41 | 			echo "See test_random.fasta for the used sequences."
42 | 			echo "./test/test_fasta -s $SEED -l $LENGTH -d $dist"
43 | 			head -n 1 ./test/test_random.fasta
44 | 			exit 1;
45 | 		fi
46 | 	done
47 | 
48 | 	# raw
49 | 	for n in $(seq 10)
50 | 	do
51 | 		if test "$SEED" -ne 0; then
52 | 			SEED=$((SEED + 1))
53 | 		fi
54 | 
55 | 		res=$(./test/test_fasta -r -s "$SEED" -l $LENGTH -d $dist |
56 | 			tee ./test/test_random.fasta |
57 | 			./src/andi -m RAW -t 1 |
58 | 			tail -n 1 |
59 | 			awk -v dist=$dist 'function abs(x){return ((x < 0.0) ? -x : x)} {print (abs($2-dist) <= 0.055 && abs($2-dist) <= 0.055 * dist)}')
60 | 		# Compare as strings: an empty result (andi produced no output) has to fail as well.
61 | 		if test "$res" != "1"; then
62 | 			echo "The last test computed a distance deviating more than five percent from its intended value."
63 | 			echo "See test_random.fasta for the used sequences."
64 | 			echo "./test/test_fasta -r -s $SEED -l $LENGTH -d $dist"
65 | 			head -n 1 ./test/test_random.fasta
66 | 			exit 1;
67 | 		fi
68 | 	done
69 | done
70 | 
71 | rm ./test/test_random.fasta
72 | 


--------------------------------------------------------------------------------
/test/test_seq.c:
--------------------------------------------------------------------------------
 1 | #include <glib.h>
 2 | #include "global.h"
 3 | #include <stdio.h>
 4 | #include <string.h>
 5 | #include "sequence.h"
 6 | 
 7 | double ANCHOR_P_VALUE = 0.025; /* NOTE(review): not read in this file; presumably satisfies an extern reference from the linked sources - confirm */
 8 | 
 9 | int FLAGS = F_NONE; /* starts without any flag; test_seq_nonacgt expects F_NON_ACGT to show up in it */
10 | 
11 | /* A plain ACGT input must come back unchanged, together with its name and length. */
12 | void test_seq_basic() {
13 | 	seq_t S;
14 | 	seq_init(&S, "ACGT", "name");
15 | 
16 | 	/* Residues, name and length as handed to seq_init(). */
17 | 	g_assert_cmpstr(S.S, ==, "ACGT");
18 | 	g_assert_cmpstr(S.name, ==, "name");
19 | 	g_assert_cmpuint(S.len, ==, 4);
20 | 
21 | 	seq_free(&S);
22 | }
23 | 
24 | /* Builds a subject from an 8-residue sequence and checks its RS string, RS length and GC content. */
25 | void test_seq_full() {
26 | 	seq_subject subject;
27 | 	seq_t S;
28 | 
29 | 	seq_init(&S, "ACGTTGCA", "name");
30 | 	int check = seq_subject_init(&subject, &S);
31 | 	g_assert_cmpint(check, ==, 0);
32 | 
33 | 	/* Expected RS: the reverse complement "TGCAACGT", a '#', then the input itself. */
34 | 	g_assert_cmpstr(subject.RS, ==, "TGCAACGT#ACGTTGCA");
35 | 	g_assert_cmpuint(subject.RSlen, ==, 8*2+1);
36 | 	g_assert(subject.gc == 0.5);
37 | 
38 | 	seq_subject_free(&subject);
39 | 	seq_free(&S);
40 | }
41 | 
42 | /* Inputs containing characters other than ACGT: checks the stored sequence, the subject and the F_NON_ACGT flag. */
43 | void test_seq_nonacgt() {
44 | 	seq_subject subject;
45 | 	seq_t S;
46 | 
47 | 	/* Case 1: the digits and 'N's are expected to be gone afterwards. */
48 | 	seq_init(&S, "11ACGTNN7682394689NNTGCA11", "name");
49 | 	seq_subject_init(&subject, &S);
50 | 	g_assert_cmpstr(S.S, ==, "ACGTTGCA");
51 | 	g_assert_cmpuint(S.len, ==, 8);
52 | 	g_assert(FLAGS & F_NON_ACGT);
53 | 
54 | 	g_assert_cmpstr(subject.RS, ==, "TGCAACGT#ACGTTGCA");
55 | 	g_assert_cmpuint(subject.RSlen, ==, 8*2+1);
56 | 	g_assert(subject.gc == 0.5);
57 | 
58 | 	seq_subject_free(&subject);
59 | 	seq_free(&S);
60 | 	FLAGS = F_NONE; /* reset the global so that case 2 has to raise the flag itself */
61 | 
62 | 	/* Case 2: '!' is expected to stay (showing up as ';' in the first half of RS); '@', '_', '0' and blanks are not. */
63 | 	seq_init(&S, "@ACGT_!0TGCA        ", "name");
64 | 	seq_subject_init(&subject, &S);
65 | 	g_assert_cmpstr(S.S, ==, "ACGT!TGCA");
66 | 	g_assert_cmpuint(S.len, ==, 9);
67 | 	g_assert(FLAGS & F_NON_ACGT);
68 | 
69 | 	g_assert_cmpstr(subject.RS, ==, "TGCA;ACGT#ACGT!TGCA");
70 | 	g_assert_cmpuint(subject.RSlen, ==, 9*2+1);
71 | 
72 | 	seq_subject_free(&subject);
73 | 	seq_free(&S);
74 | 
75 | 	/* Leave the global flags cleared for whatever runs next. */
76 | 	FLAGS = F_NONE;
77 | }
78 | 
79 | /* Registers the three sequence tests with GLib's test runner and returns the runner's result. */
80 | int main(int argc, char *argv[]) {
81 | 	g_test_init(&argc, &argv, NULL);
82 | 
83 | 	g_test_add_func("/seq/basic", test_seq_basic);
84 | 	g_test_add_func("/seq/full", test_seq_full);
85 | 	g_test_add_func("/seq/non acgt", test_seq_nonacgt);
86 | 	return g_test_run();
87 | }
88 | 
89 | 


--------------------------------------------------------------------------------