├── .clang-format ├── .gitignore ├── .travis.yml ├── COPYING ├── INSTALL ├── Makefile.am ├── README.md ├── andi-manual.pdf ├── configure.ac ├── docs ├── Doxyfile ├── Makefile.am ├── andi.1.in └── manual │ ├── andi-manual.tex │ ├── andi_labels.pdf │ ├── references.bib │ └── version.tex.in ├── libs ├── Makefile.am ├── pfasta.c └── pfasta.h ├── m4 └── ax_cxx_compile_stdcxx_11.m4 ├── opt ├── Makefile.am ├── compat-stdlib.h ├── compat-string.h ├── reallocarray.c └── strchrnul.c ├── scripts ├── _andi ├── failed.zsh ├── maf2phy.awk └── vmatch.sh ├── src ├── Makefile.am ├── andi.c ├── dist_hack.h ├── esa.c ├── esa.h ├── global.h ├── io.c ├── io.h ├── model.c ├── model.h ├── process.c ├── process.h ├── sequence.c └── sequence.h └── test ├── Makefile.am ├── low_homo.sh ├── nan.sh ├── test_esa.c ├── test_extra.sh ├── test_fasta.cxx ├── test_join.sh ├── test_process.c ├── test_random.sh └── test_seq.c /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | IndentWidth: 4 3 | TabWidth: 4 4 | UseTab: Always 5 | AllowShortIfStatementsOnASingleLine: true 6 | AllowShortFunctionsOnASingleLine: false 7 | IndentCaseLabels: true 8 | AllowShortCaseLabelsOnASingleLine: true 9 | BreakBeforeBraces: Attach 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binary and automatically generated files 2 | *.o 3 | *.a 4 | andi 5 | andi_* 6 | randomSeed.dat 7 | seedms 8 | testRMQ 9 | src/config.h 10 | src/stamp-h1 11 | src/config.hin 12 | src/config.hin~ 13 | 14 | #docs 15 | docs/doxygen_sqlite3.db 16 | docs/html/* 17 | docs/latex/* 18 | docs/andi.1 19 | *.aux 20 | *.auxlock 21 | *.dep 22 | *.dpth 23 | *.toc 24 | *.out 25 | *.pdf 26 | *.backup 27 | *.bbl 28 | *.blg 29 | !andi-manual.pdf 30 | 31 | *.in 32 | !docs/andi.1.in 33 | !docs/manual/version.tex.in 34 | docs/manual/version.tex 35 | *.log 36 | 
**/Makefile 37 | configure.scan 38 | config.status 39 | depcomp 40 | install-sh 41 | aclocal.m4 42 | **/.deps/ 43 | autom4te.cache/ 44 | README 45 | ChangeLog 46 | missing 47 | compile 48 | configure 49 | ar-lib 50 | src/.dirstamp 51 | 52 | # test files 53 | *.fasta 54 | cachegrind* 55 | callgrind* 56 | test.trs 57 | test-driver 58 | test_esa 59 | test_seq 60 | test_fasta 61 | test_process 62 | *.trs 63 | 64 | # Coverage 65 | *.gcda 66 | *.gcno 67 | *.gcov 68 | 69 | 70 | andi.sublime-* 71 | 72 | # for legacy git only: 73 | libs/RMQ/.deps/ 74 | libs/RMQ/Makefile 75 | Makefile 76 | 77 | # Profiling: 78 | gmon.out 79 | profile 80 | 81 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | compiler: 3 | - gcc 4 | - clang 5 | arch: 6 | - amd64 7 | - ppc64le 8 | sudo: false 9 | addons: 10 | apt: 11 | sources: 12 | - deadsnakes 13 | - ubuntu-toolchain-r-test 14 | packages: 15 | - cmake 16 | - libglib2.0-dev 17 | - libgsl0-dev 18 | 19 | install: 20 | - export LIBDIVDIR="$HOME/libdivsufsort" 21 | - pip install --user cpp-coveralls 22 | - wget https://github.com/y-256/libdivsufsort/archive/master.tar.gz 23 | - tar -xzvf master.tar.gz 24 | - cd libdivsufsort-master && mkdir build && cd build 25 | - cmake -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="$LIBDIVDIR" .. 
26 | - make && make install 27 | 28 | script: 29 | - CONFIGURE_FLAGS="" 30 | - export LD_LIBRARY_PATH="$LIBDIVDIR:$LIBDIVDIR/lib" 31 | - export LIBRARY_PATH="$LIBDIVDIR:$LIBRARY_PATH" 32 | - cd $TRAVIS_BUILD_DIR 33 | - autoreconf -fvi -Im4 34 | - export MYFLAGS="-fprofile-arcs -ftest-coverage -I$LIBDIVDIR/include" 35 | - if [ "${CC}" = "clang" ]; then export CONFIGURE_FLAGS="--disable-openmp"; fi 36 | - ./configure $CONFIGURE_FLAGS --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS" 37 | - make 38 | - make check || cat ./test-suite.log || exit 1 39 | - export MYFLAGS="-I$LIBDIVDIR/include" 40 | - ./configure $CONFIGURE_FLAGS --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS" 41 | - make distcheck DISTCHECK_CONFIGURE_FLAGS="LDFLAGS=\"-L$LIBDIVDIR/lib\" CFLAGS=\"-I$LIBDIVDIR/include\" CXXFLAGS=\"-I$LIBDIVDIR/include\" $CONFIGURE_FLAGS" 42 | after_success: 43 | - if [ "$CXX" = "g++" ]; then coveralls --exclude libdivsufsort-master -E '^andi-.*' --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp'; fi 44 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | Installation Instructions 2 | ************************* 3 | 4 | Copyright (C) 1994-1996, 1999-2002, 2004-2011 Free Software Foundation, 5 | Inc. 6 | 7 | Copying and distribution of this file, with or without modification, 8 | are permitted in any medium without royalty provided the copyright 9 | notice and this notice are preserved. This file is offered as-is, 10 | without warranty of any kind. 11 | 12 | Basic Installation 13 | ================== 14 | 15 | Briefly, the shell commands `./configure; make; make install' should 16 | configure, build, and install this package. The following 17 | more-detailed instructions are generic; see the `README' file for 18 | instructions specific to this package. 
Some packages provide this 19 | `INSTALL' file but do not implement all of the features documented 20 | below. The lack of an optional feature in a given package is not 21 | necessarily a bug. More recommendations for GNU packages can be found 22 | in *note Makefile Conventions: (standards)Makefile Conventions. 23 | 24 | The `configure' shell script attempts to guess correct values for 25 | various system-dependent variables used during compilation. It uses 26 | those values to create a `Makefile' in each directory of the package. 27 | It may also create one or more `.h' files containing system-dependent 28 | definitions. Finally, it creates a shell script `config.status' that 29 | you can run in the future to recreate the current configuration, and a 30 | file `config.log' containing compiler output (useful mainly for 31 | debugging `configure'). 32 | 33 | It can also use an optional file (typically called `config.cache' 34 | and enabled with `--cache-file=config.cache' or simply `-C') that saves 35 | the results of its tests to speed up reconfiguring. Caching is 36 | disabled by default to prevent problems with accidental use of stale 37 | cache files. 38 | 39 | If you need to do unusual things to compile the package, please try 40 | to figure out how `configure' could check whether to do them, and mail 41 | diffs or instructions to the address given in the `README' so they can 42 | be considered for the next release. If you are using the cache, and at 43 | some point `config.cache' contains results you don't want to keep, you 44 | may remove or edit it. 45 | 46 | The file `configure.ac' (or `configure.in') is used to create 47 | `configure' by a program called `autoconf'. You need `configure.ac' if 48 | you want to change it or regenerate `configure' using a newer version 49 | of `autoconf'. 50 | 51 | The simplest way to compile this package is: 52 | 53 | 1. 
`cd' to the directory containing the package's source code and type 54 | `./configure' to configure the package for your system. 55 | 56 | Running `configure' might take a while. While running, it prints 57 | some messages telling which features it is checking for. 58 | 59 | 2. Type `make' to compile the package. 60 | 61 | 3. Optionally, type `make check' to run any self-tests that come with 62 | the package, generally using the just-built uninstalled binaries. 63 | 64 | 4. Type `make install' to install the programs and any data files and 65 | documentation. When installing into a prefix owned by root, it is 66 | recommended that the package be configured and built as a regular 67 | user, and only the `make install' phase executed with root 68 | privileges. 69 | 70 | 5. Optionally, type `make installcheck' to repeat any self-tests, but 71 | this time using the binaries in their final installed location. 72 | This target does not install anything. Running this target as a 73 | regular user, particularly if the prior `make install' required 74 | root privileges, verifies that the installation completed 75 | correctly. 76 | 77 | 6. You can remove the program binaries and object files from the 78 | source code directory by typing `make clean'. To also remove the 79 | files that `configure' created (so you can compile the package for 80 | a different kind of computer), type `make distclean'. There is 81 | also a `make maintainer-clean' target, but that is intended mainly 82 | for the package's developers. If you use it, you may have to get 83 | all sorts of other programs in order to regenerate files that came 84 | with the distribution. 85 | 86 | 7. Often, you can also type `make uninstall' to remove the installed 87 | files again. In practice, not all packages have tested that 88 | uninstallation works correctly, even though it is required by the 89 | GNU Coding Standards. 90 | 91 | 8. 
Some packages, particularly those that use Automake, provide `make 92 | distcheck', which can be used by developers to test that all other 93 | targets like `make install' and `make uninstall' work correctly. 94 | This target is generally not run by end users. 95 | 96 | Compilers and Options 97 | ===================== 98 | 99 | Some systems require unusual options for compilation or linking that 100 | the `configure' script does not know about. Run `./configure --help' 101 | for details on some of the pertinent environment variables. 102 | 103 | You can give `configure' initial values for configuration parameters 104 | by setting variables in the command line or in the environment. Here 105 | is an example: 106 | 107 | ./configure CC=c99 CFLAGS=-g LIBS=-lposix 108 | 109 | *Note Defining Variables::, for more details. 110 | 111 | Compiling For Multiple Architectures 112 | ==================================== 113 | 114 | You can compile the package for more than one kind of computer at the 115 | same time, by placing the object files for each architecture in their 116 | own directory. To do this, you can use GNU `make'. `cd' to the 117 | directory where you want the object files and executables to go and run 118 | the `configure' script. `configure' automatically checks for the 119 | source code in the directory that `configure' is in and in `..'. This 120 | is known as a "VPATH" build. 121 | 122 | With a non-GNU `make', it is safer to compile the package for one 123 | architecture at a time in the source code directory. After you have 124 | installed the package for one architecture, use `make distclean' before 125 | reconfiguring for another architecture. 126 | 127 | On MacOS X 10.5 and later systems, you can create libraries and 128 | executables that work on multiple system types--known as "fat" or 129 | "universal" binaries--by specifying multiple `-arch' options to the 130 | compiler but only a single `-arch' option to the preprocessor.
Like 131 | this: 132 | 133 | ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ 134 | CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ 135 | CPP="gcc -E" CXXCPP="g++ -E" 136 | 137 | This is not guaranteed to produce working output in all cases, you 138 | may have to build one architecture at a time and combine the results 139 | using the `lipo' tool if you have problems. 140 | 141 | Installation Names 142 | ================== 143 | 144 | By default, `make install' installs the package's commands under 145 | `/usr/local/bin', include files under `/usr/local/include', etc. You 146 | can specify an installation prefix other than `/usr/local' by giving 147 | `configure' the option `--prefix=PREFIX', where PREFIX must be an 148 | absolute file name. 149 | 150 | You can specify separate installation prefixes for 151 | architecture-specific files and architecture-independent files. If you 152 | pass the option `--exec-prefix=PREFIX' to `configure', the package uses 153 | PREFIX as the prefix for installing programs and libraries. 154 | Documentation and other data files still use the regular prefix. 155 | 156 | In addition, if you use an unusual directory layout you can give 157 | options like `--bindir=DIR' to specify different values for particular 158 | kinds of files. Run `configure --help' for a list of the directories 159 | you can set and what kinds of files go in them. In general, the 160 | default for these options is expressed in terms of `${prefix}', so that 161 | specifying just `--prefix' will affect all of the other directory 162 | specifications that were not explicitly provided. 163 | 164 | The most portable way to affect installation locations is to pass the 165 | correct locations to `configure'; however, many packages provide one or 166 | both of the following shortcuts of passing variable assignments to the 167 | `make install' command line to change installation locations without 168 | having to reconfigure or recompile. 
169 | 170 | The first method involves providing an override variable for each 171 | affected directory. For example, `make install 172 | prefix=/alternate/directory' will choose an alternate location for all 173 | directory configuration variables that were expressed in terms of 174 | `${prefix}'. Any directories that were specified during `configure', 175 | but not in terms of `${prefix}', must each be overridden at install 176 | time for the entire installation to be relocated. The approach of 177 | makefile variable overrides for each directory variable is required by 178 | the GNU Coding Standards, and ideally causes no recompilation. 179 | However, some platforms have known limitations with the semantics of 180 | shared libraries that end up requiring recompilation when using this 181 | method, particularly noticeable in packages that use GNU Libtool. 182 | 183 | The second method involves providing the `DESTDIR' variable. For 184 | example, `make install DESTDIR=/alternate/directory' will prepend 185 | `/alternate/directory' before all installation names. The approach of 186 | `DESTDIR' overrides is not required by the GNU Coding Standards, and 187 | does not work on platforms that have drive letters. On the other hand, 188 | it does better at avoiding recompilation issues, and works well even 189 | when some directory options were not specified in terms of `${prefix}' 190 | at `configure' time. 191 | 192 | Optional Features 193 | ================= 194 | 195 | If the package supports it, you can cause programs to be installed 196 | with an extra prefix or suffix on their names by giving `configure' the 197 | option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. 198 | 199 | Some packages pay attention to `--enable-FEATURE' options to 200 | `configure', where FEATURE indicates an optional part of the package. 201 | They may also pay attention to `--with-PACKAGE' options, where PACKAGE 202 | is something like `gnu-as' or `x' (for the X Window System). 
The 203 | `README' should mention any `--enable-' and `--with-' options that the 204 | package recognizes. 205 | 206 | For packages that use the X Window System, `configure' can usually 207 | find the X include and library files automatically, but if it doesn't, 208 | you can use the `configure' options `--x-includes=DIR' and 209 | `--x-libraries=DIR' to specify their locations. 210 | 211 | Some packages offer the ability to configure how verbose the 212 | execution of `make' will be. For these packages, running `./configure 213 | --enable-silent-rules' sets the default to minimal output, which can be 214 | overridden with `make V=1'; while running `./configure 215 | --disable-silent-rules' sets the default to verbose, which can be 216 | overridden with `make V=0'. 217 | 218 | Particular systems 219 | ================== 220 | 221 | On HP-UX, the default C compiler is not ANSI C compatible. If GNU 222 | CC is not installed, it is recommended to use the following options in 223 | order to use an ANSI C compiler: 224 | 225 | ./configure CC="cc -Ae -D_XOPEN_SOURCE=500" 226 | 227 | and if that doesn't work, install pre-built binaries of GCC for HP-UX. 228 | 229 | HP-UX `make' updates targets which have the same time stamps as 230 | their prerequisites, which makes it generally unusable when shipped 231 | generated files such as `configure' are involved. Use GNU `make' 232 | instead. 233 | 234 | On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot 235 | parse its `<wchar.h>' header file. The option `-nodtk' can be used as 236 | a workaround. If GNU CC is not installed, it is therefore recommended 237 | to try 238 | 239 | ./configure CC="cc" 240 | 241 | and if that doesn't work, try 242 | 243 | ./configure CC="cc -nodtk" 244 | 245 | On Solaris, don't put `/usr/ucb' early in your `PATH'. This 246 | directory contains several dysfunctional programs; working variants of 247 | these programs are available in `/usr/bin'.
So, if you need `/usr/ucb' 248 | in your `PATH', put it _after_ `/usr/bin'. 249 | 250 | On Haiku, software installed for all users goes in `/boot/common', 251 | not `/usr/local'. It is recommended to use the following options: 252 | 253 | ./configure --prefix=/boot/common 254 | 255 | Specifying the System Type 256 | ========================== 257 | 258 | There may be some features `configure' cannot figure out 259 | automatically, but needs to determine by the type of machine the package 260 | will run on. Usually, assuming the package is built to be run on the 261 | _same_ architectures, `configure' can figure that out, but if it prints 262 | a message saying it cannot guess the machine type, give it the 263 | `--build=TYPE' option. TYPE can either be a short name for the system 264 | type, such as `sun4', or a canonical name which has the form: 265 | 266 | CPU-COMPANY-SYSTEM 267 | 268 | where SYSTEM can have one of these forms: 269 | 270 | OS 271 | KERNEL-OS 272 | 273 | See the file `config.sub' for the possible values of each field. If 274 | `config.sub' isn't included in this package, then this package doesn't 275 | need to know the machine type. 276 | 277 | If you are _building_ compiler tools for cross-compiling, you should 278 | use the option `--target=TYPE' to select the type of system they will 279 | produce code for. 280 | 281 | If you want to _use_ a cross compiler, that generates code for a 282 | platform different from the build platform, you should specify the 283 | "host" platform (i.e., that on which the generated programs will 284 | eventually be run) with `--host=TYPE'. 285 | 286 | Sharing Defaults 287 | ================ 288 | 289 | If you want to set default values for `configure' scripts to share, 290 | you can create a site shell script called `config.site' that gives 291 | default values for variables like `CC', `cache_file', and `prefix'. 
292 | `configure' looks for `PREFIX/share/config.site' if it exists, then 293 | `PREFIX/etc/config.site' if it exists. Or, you can set the 294 | `CONFIG_SITE' environment variable to the location of the site script. 295 | A warning: not all `configure' scripts look for a site script. 296 | 297 | Defining Variables 298 | ================== 299 | 300 | Variables not defined in a site shell script can be set in the 301 | environment passed to `configure'. However, some packages may run 302 | configure again during the build, and the customized values of these 303 | variables may be lost. In order to avoid this problem, you should set 304 | them in the `configure' command line, using `VAR=value'. For example: 305 | 306 | ./configure CC=/usr/local2/bin/gcc 307 | 308 | causes the specified `gcc' to be used as the C compiler (unless it is 309 | overridden in the site shell script). 310 | 311 | Unfortunately, this technique does not work for `CONFIG_SHELL' due to 312 | an Autoconf bug. Until the bug is fixed you can use this workaround: 313 | 314 | CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash 315 | 316 | `configure' Invocation 317 | ====================== 318 | 319 | `configure' recognizes the following options to control how it 320 | operates. 321 | 322 | `--help' 323 | `-h' 324 | Print a summary of all of the options to `configure', and exit. 325 | 326 | `--help=short' 327 | `--help=recursive' 328 | Print a summary of the options unique to this package's 329 | `configure', and exit. The `short' variant lists options used 330 | only in the top level, while the `recursive' variant lists options 331 | also present in any nested packages. 332 | 333 | `--version' 334 | `-V' 335 | Print the version of Autoconf used to generate the `configure' 336 | script, and exit. 337 | 338 | `--cache-file=FILE' 339 | Enable the cache: use and save the results of the tests in FILE, 340 | traditionally `config.cache'. 
FILE defaults to `/dev/null' to 341 | disable caching. 342 | 343 | `--config-cache' 344 | `-C' 345 | Alias for `--cache-file=config.cache'. 346 | 347 | `--quiet' 348 | `--silent' 349 | `-q' 350 | Do not print messages saying which checks are being made. To 351 | suppress all normal output, redirect it to `/dev/null' (any error 352 | messages will still be shown). 353 | 354 | `--srcdir=DIR' 355 | Look for the package's source code in directory DIR. Usually 356 | `configure' can determine that directory automatically. 357 | 358 | `--prefix=DIR' 359 | Use DIR as the installation prefix. *note Installation Names:: 360 | for more details, including other options available for fine-tuning 361 | the installation locations. 362 | 363 | `--no-create' 364 | `-n' 365 | Run the configure checks, but stop before creating any output 366 | files. 367 | 368 | `configure' also accepts some other, not widely useful, options. Run 369 | `configure --help' for more details. 370 | 371 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4 2 | AM_DISTCHECK_CONFIGURE_FLAGS="--enable-unit-tests" 3 | 4 | .PHONY: all 5 | 6 | SUBDIRS = . libs opt src docs 7 | DIST_SUBDIRS = . 
libs opt src docs test 8 | 9 | # Conditionally build the tests 10 | if BUILD_TESTS 11 | 12 | SUBDIRS+= test 13 | 14 | AM_TESTS_ENVIRONMENT= \ 15 | RANDOM_SEED='@SEED@' ; export RANDOM_SEED ; 16 | 17 | XFAIL_TESTS= 18 | TESTS = $(XFAIL_TESTS) test/nan.sh test/low_homo.sh test/test_esa test/test_seq test/test_extra.sh test/test_random.sh test/test_join.sh test/test_process 19 | 20 | $(TESTS): src/andi 21 | 22 | endif # BUILD_TESTS 23 | 24 | 25 | dist_noinst_DATA = ChangeLog README.md 26 | dist_pdf_DATA = andi-manual.pdf 27 | dist_noinst_SCRIPTS= scripts/maf2phy.awk scripts/vmatch.sh scripts/_andi 28 | 29 | # Recreate the changelog, when the version string changes. 30 | ChangeLog: configure.ac 31 | echo "Missing Git" > ChangeLog; 32 | if test -d $(srcdir)/.git; then \ 33 | which git && git log --stat --date=short --abbrev-commit | grep --invert-match '^ [[:alnum:].]' | git stripspace > ChangeLog; \ 34 | fi 35 | 36 | .PHONY: code-docs 37 | code-docs: 38 | cd docs && $(MAKE) code-docs; 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/EvolBioInf/andi.svg?branch=master)](https://travis-ci.org/EvolBioInf/andi) [![Coverage Status](https://coveralls.io/repos/EvolBioInf/andi/badge.svg?branch=master)](https://coveralls.io/r/EvolBioInf/andi?branch=master) 2 | 3 | # About 4 | 5 | This is the `andi` program for estimating the evolutionary distance between closely related genomes. These distances can be used to rapidly infer phylogenies for big sets of genomes. Because `andi` does not compute full alignments, it is so efficient that it scales even up to thousands of bacterial genomes. 6 | 7 | This readme covers all necessary instructions for the impatient to get `andi` up and running. For extensive instructions please consult the [manual](andi-manual.pdf). 
8 | 9 | 10 | # Installation and Usage 11 | 12 | Stable versions of `andi` are available via package managers. For manual installation see below. 13 | 14 | For Debian and Ubuntu: 15 | 16 | sudo apt-get install andi 17 | 18 | For macOS with Homebrew: 19 | 20 | brew tap brewsci/bio 21 | brew install andi 22 | 23 | For ArchLinux with aura: 24 | 25 | sudo aura -A andi 26 | 27 | With a successful installation you can get the usage instructions via `--help` or the man page. 28 | 29 | $ andi --help 30 | $ man andi 31 | 32 | You can simply use `andi` with your genomes in `FASTA` format. 33 | 34 | $ andi S1.fasta S2.fasta 35 | 2 36 | S1 0.0 0.1 37 | s2 0.1 0.0 38 | 39 | From this distance matrix the phylogeny can be inferred via neighbor-joining. Check the [manual](andi-manual.pdf) for a more thorough description. 40 | 41 | 42 | ## Manual installation 43 | 44 | If your system does not support one of the above package managers you have to manually build the latest [stable release](https://github.com/EvolBioInf/andi/releases) from a tarball. See the [manual](andi-manual.pdf) for extensive building instructions. 45 | 46 | This program has the following external dependencies: [libdivsufsort](https://github.com/y-256/libdivsufsort) and the [GSL](https://www.gnu.org/software/gsl/). Please make sure you installed both before attempting a build. If you did get the source, not as a tarball, but straight from the git repository, you will also need the autotools. 47 | 48 | Assuming you have installed all prerequisites, building is as easy as follows. 49 | 50 | $ autoreconf -fi -Im4 # optional when building from tarball 51 | $ ./configure 52 | $ make 53 | $ make install 54 | 55 | Extensive build instructions are located in `INSTALL`. 56 | 57 | # Links and Additional Resources 58 | 59 | The release of this software is accompanied by a paper from [Haubold et al.](http://bioinformatics.oxfordjournals.org/content/31/8/1169). It explains the used *anchor distance* strategy in great detail.
The `maf2phy.awk` script used in the validation process is located under `scripts`. Simulations were done using our own [simK](http://guanine.evolbio.mpg.de/bioBox/) tool. For a demo visualising the internals of andi visit our [GitHub pages](http://evolbioinf.github.io/andi/). 60 | 61 | ## Data Sets 62 | 63 | 1. 29 E. coli and Shigella strains: [data](http://guanine.evolbio.mpg.de/andi/eco29.fasta.gz) 64 | 2. 109 E. coli ST131 strains ([paper](http://www.pnas.org/content/early/2014/03/28/1322678111.abstract)): 65 | * [99 newly sequenced strains](https://github.com/BeatsonLab-MicrobialGenomics/ST131_99) 66 | * [10 previously published strains](http://guanine.evolbio.mpg.de/andi/st131_extra.tgz) 67 | 3. 3085 Streptococcus pneumoniae strains ([paper](http://www.nature.com/ng/journal/v46/n3/full/ng.2895.html)): ftp://ftp.sanger.ac.uk/pub/pathogens/Streptococcus/pneumoniae/Maela_assemblies.tgz 68 | 69 | ## License 70 | 71 | Copyright © 2014 - 2021 Fabian Klötzl 72 | License GPLv3+: GNU GPL version 3 or later. 73 | 74 | This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. The full license text is available at <https://www.gnu.org/licenses/gpl.html>. 75 | 76 | Some files may be licensed differently.
77 | 78 | ## Contact 79 | 80 | In case of bugs or unexpected errors don't hesitate to send me a mail: kloetzl@evolbio.mpg.de 81 | -------------------------------------------------------------------------------- /andi-manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolBioInf/andi/390af15beb76badaf8f16864a885747aa60956c8/andi-manual.pdf -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | AC_INIT([andi], [0.15-beta]) 2 | AM_INIT_AUTOMAKE([-Wall foreign]) 3 | 4 | AC_CONFIG_MACRO_DIR([m4]) 5 | 6 | AC_PROG_CC 7 | AC_PROG_CXX 8 | AC_PROG_MAKE_SET 9 | AC_PROG_CPP 10 | AC_PROG_RANLIB 11 | m4_ifdef([AM_PROG_AR], [AM_PROG_AR]) 12 | 13 | # Make sure, also the C++ programs are compiled with OpenMP 14 | AC_LANG(C++) 15 | AC_OPENMP 16 | 17 | # Execute all tests using C 18 | AC_LANG(C) 19 | AC_OPENMP 20 | 21 | AC_CHECK_LIB([m],[cos]) 22 | AC_CHECK_LIB([gslcblas],[cblas_dgemm], [], [have_gsl=no]) 23 | AC_CHECK_LIB([gsl],[gsl_ran_binomial], [], [have_gsl=no]) 24 | 25 | AS_IF([test "x$have_gsl" = "xno"],[ 26 | AC_MSG_ERROR([Missing the Gnu Scientific Library.]) 27 | ]) 28 | 29 | # The libdivsufsort header contains some Microsoft extension making 30 | # compilation fail on certain systems (i.e. OS X). Add the following 31 | # flag so the build runs smoothly. 32 | CPPFLAGS="$CPPFLAGS -fms-extensions" 33 | AC_CHECK_HEADERS([divsufsort.h],[have_libdivsufsort=yes],[have_libdivsufsort=no]) 34 | AC_CHECK_LIB(divsufsort, divsufsort, [], [have_libdivsufsort=no]) 35 | 36 | AS_IF([test "x$have_libdivsufsort" = "xno"],[ 37 | AC_MSG_ERROR([Missing libdivsufsort.]) 38 | ]) 39 | 40 | 41 | # The unit tests require GLIB2. So by default do not build the test. 42 | # If enabled, check for glib. 
43 | 44 | AC_ARG_ENABLE([unit-tests], 45 | [AS_HELP_STRING([--enable-unit-tests],[build unit tests @<:@default: no@:>@])], 46 | [try_unit_tests=${enableval}],[try_unit_tests=no] 47 | ) 48 | 49 | AM_CONDITIONAL([BUILD_TESTS],[test "x${try_unit_tests}" = xyes]) 50 | 51 | # The user may set a seed for the unit tests, so that builds are reproducible. 52 | # A value of 0 makes the tests random. 53 | AC_ARG_WITH([seed], 54 | [AS_HELP_STRING([--with-seed=INT], 55 | [random seed for reproducible builds. @<:@default: 0@:>@])], 56 | [SEED=$withval], 57 | [SEED=0]) 58 | 59 | AC_SUBST([SEED]) 60 | 61 | AS_IF([test "x${try_unit_tests}" = xyes], [ 62 | have_glib=yes 63 | PKG_CHECK_MODULES([GLIB], [glib-2.0], [], [have_glib=no]) 64 | 65 | if test "x${have_glib}" = xno; then 66 | AC_MSG_ERROR([Missing Glib 2. Either install it or build without unit tests.]) 67 | fi 68 | 69 | AX_CXX_COMPILE_STDCXX_11([],[mandatory]) 70 | ]) 71 | 72 | 73 | # Check for various headers including those used by libdivsufsort. 74 | AC_CHECK_HEADERS([limits.h stdlib.h string.h unistd.h stdint.h inttypes.h err.h errno.h fcntl.h]) 75 | 76 | AC_C_INLINE 77 | AC_TYPE_SIZE_T 78 | AC_TYPE_SSIZE_T 79 | AC_TYPE_INT32_T 80 | AC_TYPE_UINT8_T 81 | AC_HEADER_STDBOOL 82 | 83 | # Until someone convinces me otherwise, I will deactivate the macros 84 | # AC_FUNC_MALLOC and AC_FUNC_REALLOC. They only check if `malloc(0)` returns a 85 | # non-null pointer. This breaks the build on systems using uClibc, including 86 | # my laptop. 87 | # As requesting zero bytes is not useful, and implementation-defined behaviour, 88 | # it should be avoided in the first place. Thus I really don't need these checks.
89 | 90 | AC_CHECK_FUNCS([floor pow sqrt strdup strerror]) 91 | AC_CHECK_FUNCS([strndup strcasecmp]) 92 | AC_CHECK_FUNCS([strchr strrchr strchrnul]) 93 | AC_CHECK_FUNCS([strtoul strtod]) 94 | AC_CHECK_FUNCS([reallocarray]) 95 | 96 | AM_CONDITIONAL([HAVE_REALLOCARRAY], [test "x$ac_cv_func_reallocarray" = xyes]) 97 | AM_CONDITIONAL([HAVE_STRCHRNUL], [test "x$ac_cv_func_strchrnul" = xyes]) 98 | 99 | AC_CONFIG_HEADERS([src/config.h:src/config.hin]) 100 | 101 | AC_CONFIG_FILES([ 102 | Makefile 103 | docs/andi.1 104 | docs/Makefile 105 | libs/Makefile 106 | opt/Makefile 107 | src/Makefile 108 | test/Makefile 109 | ]) 110 | AC_OUTPUT 111 | 112 | -------------------------------------------------------------------------------- /docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_man_MANS = andi.1 2 | dist_noinst_DATA = Doxyfile 3 | 4 | # I intentionally do not list any of the manual files here. I neither want them 5 | # distributed nor installed. The reason is that building the manual requires 6 | # LaTeX with a whole bunch of packages installed. Plus, so many things can go 7 | # wrong, when building, so it's better to inspect the result. Thus, the manual 8 | # has to be built by hand and copied to the right place for distribution. 9 | 10 | .PHONY: code-docs 11 | code-docs: 12 | doxygen 13 | 14 | manual/version.tex: manual/version.tex.in $(top_srcdir)/configure.ac 15 | sed "s/VERSION/$(VERSION)/" manual/version.tex.in > manual/version.tex 16 | 17 | manual/andi-manual.pdf: manual/andi-manual.tex manual/version.tex 18 | @echo "error: manual rebuild of the manual required (no pun intended)."
19 | @exit 1 20 | 21 | # maintainer-clean-local: 22 | # rm -f manual/*{aux,log,out,toc} manual/andi-manual.pdf 23 | -------------------------------------------------------------------------------- /docs/andi.1.in: -------------------------------------------------------------------------------- 1 | .TH ANDI "1" "2020-01-09" "@VERSION@" "andi manual" 2 | .SH NAME 3 | andi \- estimates evolutionary distances 4 | .SH SYNOPSIS 5 | .B andi 6 | [\fIOPTIONS...\fR] \fIFILES\fR... 7 | .SH DESCRIPTION 8 | \fBandi\fR estimates the evolutionary distance between closely related genomes. For this \fBandi\fR reads the input sequences from \fIFASTA\fR files and computes the pairwise anchor distance. The idea behind this is explained in a paper by Haubold et al. (2015). 9 | .SH OUTPUT 10 | The output is a symmetrical distance matrix in \fIPHYLIP\fR format, with each entry representing divergence with a positive real number. A distance of zero means that two sequences are identical, whereas other values are estimates for the nucleotide substitution rate (Jukes-Cantor corrected). For technical reasons the comparison might fail and no estimate can be computed. In such cases \fInan\fR is printed. This either means that the input sequences were too short (<200bp) or too diverse (K>0.5) for our method to work properly. 11 | .SH OPTIONS 12 | .TP 13 | \fB\-b\fR \fIINT\fR, \fB\-\-bootstrap\fR=\fIINT\fR 14 | Compute multiple distance matrices, with \fIn-1\fR bootstrapped from the first. See the paper Klötzl & Haubold (2016) for a detailed explanation. 15 | .TP 16 | \fB--file-of-filenames\fR=\fIFILE\fR 17 | Usually, \fBandi\fR is called with the filenames as commandline arguments. With this option the filenames may also be read from a file itself, with one name per line. Use a single dash (\fB'-'\fR) to read from stdin. 18 | .TP 19 | \fB\-j\fR, \fB\-\-join\fR 20 | Use this mode if each of your \fIFASTA\fR files represents one assembly with numerous contigs. 
\fBandi\fR will then treat all of the contained sequences per file as a single genome. In this mode at least one filename must be provided via command line arguments. For the output the filename is used to identify each sequence. 21 | .TP 22 | \fB\-l\fR, \fB\-\-low-memory\fR 23 | In multithreaded mode, \fBandi\fR requires memory linear to the amount of threads. The low memory mode changes this to a constant demand independent from the used number of threads. Unfortunately, this comes at a significant runtime cost. 24 | .TP 25 | \fB\-m\fR \fIMODEL\fR, \fB\-\-model\fR=\fIMODEL\fR 26 | Set the nucleotide evolution model to one of 'Raw', 'JC', 'Kimura', or 'LogDet'. By default the Jukes-Cantor correction is used. 27 | .TP 28 | \fB\-p\fR \fIFLOAT\fR 29 | Significance of an anchor; default: 0.025. 30 | .TP 31 | \fB--progress\fR[=\fIWHEN\fR] 32 | Print a progress bar. \fIWHEN\fR can be 'auto' (default if omitted), 'always', or 'never'. 33 | .TP 34 | \fB\-t\fR \fIINT\fR, \fB\-\-threads\fR=\fIINT\fR 35 | The number of threads to be used; by default, all available processors are used. 36 | .br 37 | Multithreading is only available if \fBandi\fR was compiled with OpenMP support. 38 | .TP 39 | \fB\-\-truncate-names\fR 40 | By default \fBandi\fR outputs the full names of sequences, optionally padded with spaces, if they are shorter than ten characters. Names longer than ten characters may lead to problems with downstream tools. With this switch names will be truncated. 41 | .TP 42 | \fB\-v\fR, \fB\-\-verbose\fR 43 | Prints additional information, including the amount of found homology. Apply multiple times for extra verboseness. 44 | .TP 45 | \fB\-h\fR, \fB\-\-help\fR 46 | Prints the synopsis and an explanation of available options. 47 | .TP 48 | \fB\-\-version\fR 49 | Outputs version information and acknowledgments. 50 | .SH COPYRIGHT 51 | Copyright \(co 2014 - 2021 Fabian Klötzl 52 | License GPLv3+: GNU GPL version 3 or later. 
53 | .br 54 | This is free software: you are free to change and redistribute it. 55 | There is NO WARRANTY, to the extent permitted by law. 56 | The full license text is available at https://gnu.org/licenses/gpl.html. 57 | .PP 58 | .SH ACKNOWLEDGMENTS 59 | 1) andi: Haubold, B., Klötzl, F. and Pfaffelhuber, P. (2015). andi: Fast and accurate estimation of evolutionary distances between closely related genomes, Bioinformatics 31.8. 60 | .br 61 | 2) Algorithms: Ohlebusch, E. (2013). Bioinformatics Algorithms. Sequence Analysis, Genome Rearrangements, and Phylogenetic Reconstruction. pp 118f. 62 | .br 63 | 3) SA construction: Mori, Y. (2005). libdivsufsort, unpublished. 64 | .br 65 | 4) Bootstrapping: Klötzl, F. and Haubold, B. (2016). Support Values for Genome Phylogenies, Life 6.1. 66 | .SH BUGS 67 | .SS Reporting Bugs 68 | Please report bugs to kloetzl@evolbio.mpg.de or at https://github.com/EvolBioInf/andi. 69 | -------------------------------------------------------------------------------- /docs/manual/andi-manual.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper, 2 | 10pt, 3 | english, 4 | DIV=12, 5 | BCOR=8mm]{scrbook} 6 | \usepackage[utf8x]{inputenc} 7 | \usepackage{babel} 8 | \usepackage{listings} 9 | \usepackage{xcolor} 10 | \usepackage{hyperref} 11 | \usepackage{siunitx} 12 | \usepackage[T1]{fontenc} 13 | \usepackage{isodate} 14 | \usepackage{graphicx} 15 | \usepackage{amsthm} 16 | \usepackage{acronym} 17 | \usepackage{amssymb} 18 | \usepackage{caption} 19 | \usepackage{subcaption} 20 | \usepackage{xspace} 21 | \usepackage{microtype} 22 | 23 | \bibliographystyle{alpha} 24 | 25 | \DeclareSIUnit\byte{B} 26 | \DeclareSIUnit\basepairs{bp} 27 | \DeclareSIUnit\bit{bit} 28 | 29 | \definecolor{oceangreen}{cmyk}{1,.0,.20,.78} 30 | \addtokomafont{sectioning}{\rmfamily\color{oceangreen}} 31 | 32 | \definecolor{bluekeywords}{rgb}{0.13,0.13,1} 33 | \definecolor{greencomments}{rgb}{0,0.5,0} 34 | \definecolor{turqusnumbers}{rgb}{0.17,0.57,0.69} 35 | \definecolor{redstrings}{rgb}{0.5,0,0} 36 | 
\definecolor{lightgray}{rgb}{0.9,0.9,0.9} 37 | 38 | \usepackage{libertine} 39 | \fontfamily{libertine} 40 | \selectfont 41 | %\usepackage[scaled]{berasans} 42 | 43 | \newcommand{\thymine}{\textsc{m}\oldstylenums{2}\xspace} 44 | \newcommand{\local}{\textsc{m}\oldstylenums{1}\xspace} 45 | \newcommand{\algo}[1]{\textsc{{#1}}} 46 | \newcommand{\andi}{\algo{andi}\xspace} 47 | \newcommand{\word}[1]{\textsf{\small#1}} 48 | \newcommand{\wchar}[1]{\textsf{\small#1}} 49 | \newcommand{\eco}{\textsc{eco}\oldstylenums{29}\xspace} 50 | \newcommand{\pneu}{\textsc{Pneu}\oldstylenums{3085}\xspace} 51 | 52 | \include{version} 53 | 54 | % Todos at the margin 55 | \newcommand{\todo}[1]{ 56 | \marginpar{\fbox{\begin{minipage}{0.9\marginparwidth} 57 | \scriptsize\sloppy\raggedright #1 58 | \end{minipage}}} 59 | } 60 | 61 | 62 | \newtheorem{definition}{Definition} 63 | 64 | 65 | \lstset{backgroundcolor=\color{lightgray}} 66 | 67 | \lstdefinestyle{shell}{ 68 | language=bash, 69 | columns=flexible, 70 | xleftmargin=12pt, 71 | xrightmargin=12pt, 72 | breaklines=true, 73 | basicstyle=\small\ttfamily, 74 | morekeywords={make, tar, git, sudo, andi, time, man, head, cut, fneighbor, 75 | fretree, figtree, brew, aura, autoreconf, ls}, 76 | % literate={~} {$\sim$}{1} 77 | } 78 | 79 | \lstset{style=shell} 80 | 81 | \title{Documentation of \algo{andi}} 82 | \subtitle{Rapid Estimation of Evolutionary Distances between Genomes\\ {\small\url{https://github.com/EvolBioInf/andi}}} 83 | \author{Fabian Klötzl\\ \href{mailto:kloetzl@evolbio.mpg.de}{kloetzl@evolbio.mpg.de}} 84 | \date{Version \version, \isodate\today \\ 85 | \vspace*{2cm} 86 | \centering\includegraphics[width=0.8\textwidth]{andi_labels.pdf}} 87 | 88 | \begin{document} 89 | 90 | \maketitle 91 | 92 | \section*{Abstract} 93 | This is the documentation of the \andi program for estimating the evolutionary distance between closely related genomes. These distances can be used to rapidly infer phylogenies for big sets of genomes. 
Because \andi does not compute full alignments, it is so efficient that it scales well up to thousands of bacterial genomes. 94 | 95 | This is scientific software. Please cite our paper \cite{andi} if you use \andi in your publication. Also refer to the paper for the internals of \andi. Additionally, there is a Master's thesis with an even more in-depth analysis of \andi \cite{kloetzl}. 96 | 97 | \vspace*{1cm} 98 | \section*{License} 99 | This document is released under the Creative Commons Attribution Share-Alike license. This means you are free to copy and redistribute this document. You may even remix, tweak and build upon this document, as long as you credit me for the work I've done and release your document under the identical terms. The full legal code is available online: {\small\url{https://creativecommons.org/licenses/by-sa/4.0/legalcode}}. 100 | 101 | \tableofcontents 102 | 103 | \chapter{Installation} %%%%% 104 | 105 | \section{Package Manager} 106 | 107 | The easiest way to install \andi is via a package manager. This also handles all dependencies for you. 108 | 109 | 110 | \noindent Debian and Ubuntu: 111 | 112 | \begin{lstlisting} 113 | ~ % sudo apt-get install andi 114 | \end{lstlisting} 115 | 116 | \noindent macOS with homebrew: 117 | 118 | \begin{lstlisting} 119 | ~ % brew tap brewsci/bio 120 | ~ % brew install andi 121 | \end{lstlisting} 122 | 123 | \noindent ArchLinux AUR package with aura: 124 | 125 | \begin{lstlisting} 126 | ~ % aura -A andi 127 | \end{lstlisting} 128 | 129 | \andi is intended to be run in a \algo{Unix} commandline such as \lstinline$bash$ or \lstinline$zsh$. All examples in this document are also intended for that environment. You can verify that \andi was installed correctly by executing \lstinline$andi -h$. This should give you a list of all available options (see Section~\ref{sec:options}). 
130 | 131 | \section{Source Package} \label{sub:regular} 132 | 133 | To build \andi from source, download the latest \href{https://github.com/EvolBioInf/andi/releases}{release} from GitHub. Please note, that \andi requires the \algo{Gnu Scientific Library} and \algo{libdivsufsort}\footnote{\url{https://github.com/y-256/libdivsufsort}} for optimal performance \cite{divsufsort}. 134 | 135 | Once you have downloaded the package, unzip it and change into the newly created directory. 136 | 137 | \begin{lstlisting} 138 | ~ % tar -xzvf andi-0.14.tar.gz 139 | ~ % cd andi-0.14 140 | \end{lstlisting} 141 | 142 | \noindent Now build and install \andi. 143 | 144 | \begin{lstlisting} 145 | ~/andi-0.14 % ./configure 146 | ~/andi-0.14 % make 147 | ~/andi-0.14 % sudo make install 148 | \end{lstlisting} 149 | 150 | \noindent This installs \andi for all users on your system. If you do not have root privileges, you will find a working copy of \andi in the \lstinline$src$ subdirectory. For the rest of this documentation, it is assumed, that \andi is in your \textdollar\lstinline!PATH!. 151 | 152 | Now \andi should be ready for use. Try invoking the help. 153 | 154 | \begin{lstlisting} 155 | ~/andi-0.14 % ~/andi 156 | Usage: andi [OPTIONS...] FILES... 157 | FILES... can be any sequence of FASTA files. 158 | Use '-' as file name to read from stdin. 
159 | Options: 160 | -b, --bootstrap=INT Print additional bootstrap matrices 161 | --file-of-filenames=FILE Read additional filenames from FILE; one per line 162 | -j, --join Treat all sequences from one file as a single genome 163 | -l, --low-memory Use less memory at the cost of speed 164 | -m, --model=MODEL Pick an evolutionary model of 'Raw', 'JC', 'Kimura', 'LogDet'; default: JC 165 | -p FLOAT Significance of an anchor; default: 0.025 166 | --progress=WHEN Print a progress bar 'always', 'never', or 'auto'; default: auto 167 | -t, --threads=INT Set the number of threads; by default, all processors are used 168 | --truncate-names Truncate names to ten characters 169 | -v, --verbose Prints additional information 170 | -h, --help Display this help and exit 171 | --version Output version information and acknowledgments 172 | \end{lstlisting} 173 | 174 | \noindent \andi also comes with a man page, which can be accessed via \lstinline$man andi$. % But once you are done with this documentation, you will require it scarcely. 175 | 176 | \section{Installing from Git Repository} 177 | 178 | To build \andi from the \algo{Git} repo, you will also need the \algo{autotools}. Refer to your OS documentation for installation instructions. Once done, execute the following steps. 179 | 180 | \begin{lstlisting} 181 | ~ % git clone git@github.com:EvolBioInf/andi.git 182 | ~ % cd andi 183 | ~/andi % autoreconf -fi -Im4 184 | \end{lstlisting} 185 | 186 | \noindent Continue with the \algo{Gnu} trinity as described in Section~\ref{sub:regular}. 187 | 188 | 189 | \chapter{Usage} %%%%% 190 | 191 | The input sequences for \andi should be in \algo{Fasta} format. Any number of files can be passed. Each file may contain more than one sequence. 192 | 193 | \begin{lstlisting} 194 | ~ % andi S1.fasta S2.fasta 195 | 2 196 | S1 0.0000 0.0979 197 | S2 0.0979 0.0000 198 | \end{lstlisting} 199 | 200 | If no file argument is given, \andi reads the input from \algo{stdin}. 
This makes it convenient to use in \algo{Unix} pipelines. 201 | 202 | \begin{lstlisting} 203 | ~ % cat S1.fasta S2.fasta | andi 204 | 2 205 | S1 0.0000 0.0979 206 | S2 0.0979 0.0000 207 | \end{lstlisting} 208 | 209 | The output of \andi is a matrix in \algo{Phylip} style: On the first line the number of compared sequences is given, \lstinline!2! in our example. Then the matrix is printed, where each line is preceded by the name of the $i$th sequence. Note that the matrix is symmetric and the main diagonal contains only zeros. The numbers themselves are evolutionary distances, estimated from substitution rates. 210 | 211 | 212 | \section{Input} \label{sec:join} 213 | 214 | As mentioned before, \andi reads in \algo{Fasta} files. It recognizes only the four standard bases and is case insensitive (RegEx: \lstinline![acgtACGT]!). All other residue symbols are excluded from the analysis and \andi prints a warning, when this happens. 215 | 216 | If instead of distinct sequences, a \algo{Fasta} file contains contigs belonging to a single taxon, \andi will treat them as a unit when switched into \algo{join} mode. This can be achieved by using the \lstinline!-j! or \lstinline!--join! command line switch. 217 | 218 | \begin{lstlisting} 219 | ~ % andi --join E_coli.fasta Shigella.fasta 220 | [Output] 221 | \end{lstlisting} 222 | 223 | When the \algo{join} mode is active, the file names are used to label the individual sequences. Thus, in \algo{join} mode, each genome has to be in its own file, and furthermore, at least one filename has to be given via the command line. 224 | 225 | If not enough file names are provided, \andi will try to read sequences from the standard input stream. This behaviour can be explicitly triggered by passing a single dash (\lstinline$-$) as a file name, which is useful in pipelines. 226 | 227 | If \andi seems to take unusually long, or requires huge amounts of memory, then you might have forgotten the \algo{join} switch. 
This makes \andi compare each contig instead of each genome, resulting in many more comparisons! Since version 0.12 \andi produces a progress meter on the standard error stream. \andi tries to be smart about when to show or hide the progress bar. You can manually change this behaviour using the \lstinline!--progress! option. 228 | 229 | Starting with version 0.11 \andi supports an additional way of providing input. Instead of passing file names directly to \andi via the commandline arguments, the file names may also be read from a file itself. Using this new \lstinline$--file-of-filenames$ argument can work around limitations imposed by the shell. 230 | 231 | The following three snippets have the same functionality. 232 | 233 | \begin{lstlisting} 234 | ~ % andi --join *.fasta 235 | [Output] 236 | \end{lstlisting} 237 | 238 | \begin{lstlisting} 239 | ~ % ls *.fasta > filenames.txt 240 | ~ % andi --join --file-of-filenames filenames.txt 241 | [Output] 242 | \end{lstlisting} 243 | 244 | \begin{lstlisting} 245 | ~ % ls *.fasta | andi --join --file-of-filenames - 246 | [Output] 247 | \end{lstlisting} 248 | 249 | \section{Output} 250 | 251 | The output of \andi is written to \lstinline$stdout$. This makes it easy to use on the command line and within shell scripts. As seen before, the matrix computed by \algo{andi} is given in \algo{Phylip} format \cite{phylip}. 252 | 253 | \begin{lstlisting} 254 | ~ % cat S1.fasta S2.fasta | andi 255 | 2 256 | S1 0.0000 0.0979 257 | S2 0.0979 0.0000 258 | \end{lstlisting} 259 | 260 | If the computation completed successfully, \andi exits with the status code 0. Otherwise, the value of \lstinline$errno$ is used as the exit code. \andi can also produce warnings and error messages for the user's convenience. These messages are printed to \lstinline$stderr$ and thus do not interfere with the normal output. 
261 | 262 | \section{Options} \label{sec:options} 263 | 264 | \andi takes a small number of commandline options, of which even fewer are of interest on a day-to-day basis. If \lstinline$andi -h$ displays a \lstinline$-t$ option, then \andi was compiled with multi-threading support (implemented using \algo{OpenMP}). By default, \andi uses all available processors. However, to restrict the number of threads, use \lstinline$-t$. 265 | 266 | \begin{lstlisting} 267 | ~ % time andi ../test/1M.1.fasta -t 1 268 | 2 269 | S1 0.0000 0.0995 270 | S2 0.0995 0.0000 271 | ./andi ../test/1M.1.fasta 0,60s user 0,01s system 99% cpu 0,613 total 272 | ~ % time andi ../test/1M.1.fasta -t 2 273 | 2 274 | S1 0.0000 0.0995 275 | S2 0.0995 0.0000 276 | ./andi ../test/1M.1.fasta -t 2 0,67s user 0,03s system 195% cpu 0,362 total 277 | \end{lstlisting} 278 | 279 | In the above examples the runtime dropped from \SI{0.613}{\second}, to \SI{0.362}{\second} using two threads. Giving \andi more threads than input genomes leads to no further speed improvement. \, The other important option is \lstinline$--join$ (see Section~\ref{sec:join}). 280 | 281 | By default, the distances computed by \andi are \emph{Jukes-Cantor} corrected \cite{jukescantor}. Other evolutionary models are also implemented (Kimura \cite{kimura}, LogDet \cite{logdet}, raw). The \lstinline$--model$ parameter can be used to switch between them. 282 | 283 | Since version 0.9.4 \andi includes a bootstrapping method. It can be activated via the \lstinline$--bootstrap$ or \lstinline$-b$ switch. This option takes a numeric argument representing the number of matrices to create. The output can then be piped into \algo{phylip}. For more information on computing support values from distance matrices see \cite{afra}. 
284 | 285 | \begin{lstlisting} 286 | ~ % andi -b 2 ../test/1M.1.fasta 287 | 2 288 | S1 0.0000 0.1067 289 | S2 0.1067 0.0000 290 | 2 291 | S1 0.0000 0.1071 292 | S2 0.1071 0.0000 293 | \end{lstlisting} 294 | 295 | The original \algo{phylip} only supports distance matrices with names no longer than ten characters. However, this sometimes leads to problems with long accession numbers. Starting with version 0.11 \andi prints the full name of a sequence, even if it is longer than ten characters. If your downstream tools have trouble with this, use \lstinline$--truncate-names$ to reimpose the limit. 296 | 297 | Also new in version 0.11 is the \lstinline$--file-of-filenames$ option. See Section~\ref{sec:join} for details. 298 | 299 | \section{Example: \algo{eco29}} 300 | 301 | Here follows a real-world example of how to use \algo{andi}. It makes heavy use of the commandline and tools like \algo{Phylip}. If you prefer \algo{R}, check out this excellent blog post by Kathryn Holt.\footnote{\url{http://holtlab.net/2015/05/08/r-code-to-infer-tree-from-andi-output/}} 302 | 303 | As a data set we use \algo{eco29}; 29 genomes of \textit{E. coli} and \textit{Shigella}. You can download the data from here: {\small{\url{http://guanine.evolbio.mpg.de/andi/eco29.fasta.gz}}}. The genomes have an average length of 4.9~million nucleotides amounting to a total of \SI{138}{\mega\byte}. 304 | 305 | \algo{eco29} comes as a single \algo{fasta} file, where each sequence is a genome. To calculate their pairwise distances, enter 306 | 307 | \begin{lstlisting} 308 | ~ % andi eco29.fasta > eco29.mat 309 | andi: The input sequences contained characters other than acgtACGT. These were automatically stripped to ensure correct results. 310 | \end{lstlisting} 311 | 312 | \noindent The \algo{eco29} data set includes non-canonical nucleotides, such as \word{Y}, \word{N}, and \word{P}, which get stripped from the input sequences. 
The resulting matrix is stored in \lstinline$eco29.mat$; Here is a small excerpt: 313 | 314 | \begin{lstlisting} 315 | ~ % head -n 5 eco29.mat | cut -d ' ' -f 1-5 316 | 29 317 | gi|563845 0.0000e+00 1.8388e-02 1.8439e-02 2.6398e-02 318 | gi|342360 1.8388e-02 0.0000e+00 4.4029e-04 2.6166e-02 319 | gi|300439 1.8439e-02 4.4029e-04 0.0000e+00 2.6123e-02 320 | gi|261117 2.6398e-02 2.6166e-02 2.6123e-02 0.0000e+00 321 | \end{lstlisting} 322 | 323 | \noindent From this we compute a tree via neighbor-joining using a \algo{Phylip} wrapper called \algo{Embassy}.\footnote{\url{http://emboss.sourceforge.net/embassy/\#PHYLIP}} 324 | 325 | \begin{lstlisting} 326 | ~ % fneighbor -datafile eco29.mat -outfile eco29.phylipdump 327 | \end{lstlisting} 328 | \noindent To make this tree easier to read, we can midpoint-root it. 329 | \begin{lstlisting} 330 | ~ % fretree -spp 29 -intreefile eco29.treefile -outtreefile eco29.tree <0.5$) or sprout a lot of indels that make comparison difficult. 416 | 417 | \subsection*{Little Homology} 418 | 419 | Very few anchors were found and thus only a tiny part of the sequences is considered homologous. Expect that the given distance is erroneous. 420 | 421 | \subsection*{Too long name} 422 | 423 | If you added the \lstinline$--truncate-names$ switch and an input name is longer than ten characters, you will receive this warning. 424 | 425 | \chapter{DevOps} %%%%% 426 | 427 | \andi is written in C/C++; mostly C99 with some parts in C++11. The sources are released on \algo{GitHub} as \emph{free software} under the \textsc{Gnu General Public License version~3} \cite{GPL}. Prebundled packages using \algo{autoconf} are also available, with the latest release being {\version} at the time of writing. 428 | 429 | If you are interested in the internals of \algo{andi}, consult the paper \cite{andi} or my Master's thesis \cite{kloetzl}. Both explain the used approach in detail. 
The latter emphasizes the used algorithms, data structures and their efficient implementation. 430 | 431 | \section{Dependencies} 432 | 433 | Here is a complete list of dependencies required for developing \algo{andi}. 434 | 435 | \begin{itemize} 436 | \item A C and a C++11 compiler, 437 | \item the \algo{autotools}, 438 | \item the \algo{Gnu Scientific Library}, 439 | \item \algo{Pdflatex} with various packages for the manual, 440 | \item \algo{Git}, 441 | \item \algo{glib2} for the unit tests, 442 | \item \algo{doxygen}, 443 | \item and \algo{libdivsufsort}. 444 | \end{itemize} 445 | 446 | 447 | \section{Code Documentation} 448 | 449 | \emph{Every} function in \andi is documented using \algo{doxygen} style comments. To create the documentation run \lstinline$make code-docs$ in the main directory. You will then find the documentation under \lstinline$./docs$. 450 | 451 | 452 | \section{Unit Tests} 453 | 454 | The unit tests are located in the \andi repository under the \lstinline$./test$ directory. Because they require \algo{glib2}, and a C++11 compiler, they are deactivated by default. To enable them, execute 455 | 456 | \begin{lstlisting} 457 | ~/andi % ./configure --enable-unit-tests 458 | \end{lstlisting} 459 | 460 | \noindent during the installation process. You can then verify the build via 461 | 462 | \begin{lstlisting} 463 | ~/andi % make check 464 | \end{lstlisting} 465 | 466 | \noindent The unit tests are also checked each time a commit is sent to the repository. This is done via \algo{TravisCI}.\footnote{\url{https://travis-ci.org/EvolBioInf/andi}} Thus, a warning is produced, when the builds fail, or the unit tests did not run successfully. Currently, the unit tests cover more than 75\% of the code. This is computed via the \algo{Travis} builds and a service called \algo{Coveralls}.\footnote{\url{https://coveralls.io/r/EvolBioInf/andi}} 467 | 468 | \section{Known Issues} 469 | 470 | These minor issues are known. I intend to fix them, when I have time. 
471 | 472 | \begin{enumerate} 473 | \item This code will not work under Windows. At two places Unix-only code is used: filepath-separators are assumed to be \lstinline$/$ and file-descriptors are used for I/O. 474 | \item Unit tests for the bootstrapped matrices are missing. 475 | \item Cached intervals are sometimes not “as deep as they could be”. If that got fixed \lstinline$get_match_cache$ could bail out on \lstinline$ij.lcp < CACHE_LENGTH$. However the \lstinline$esa_init_cache$ code is the most fragile part and should be handled with care. 476 | \end{enumerate} 477 | 478 | 479 | \section{Creating a Release} 480 | 481 | A release should be a stable version of \andi with significant improvements over the last version. dotdot releases should be avoided. 482 | 483 | %\subsection{Preparing a new Release} 484 | 485 | Once \andi is matured, the new features implemented, and all tests were run, a new release can be created. First, increase the version number in \lstinline$configure.ac$. Commit that change in git, and tag this commit with \lstinline$vX.y$. Tags should be annotated and signed, if possible. This manual then needs manual rebuilding. 486 | 487 | Ensure that \andi is ready for packaging with \algo{autoconf}. 488 | 489 | \begin{lstlisting} 490 | ~ % make distcheck 491 | make dist-gzip am__post_remove_distdir='@:' 492 | make[1]: Entering directory `/home/kloetzl/Projects/andi' 493 | if test -d "andi-0.9.1-beta"; then find "andi-0.9.1-beta" -type d ! -perm -200 -exec chmod u+w {} ';' && rm -rf "andi-0.9.1-beta" || { sleep 5 && rm -rf "andi-0.9.1-beta"; }; else :; fi 494 | test -d "andi-0.9.1-beta" || mkdir "andi-0.9.1-beta" 495 | (cd src && make top_distdir=../andi-0.9.1-beta distdir=../andi-0.9.1-beta/src \ 496 | am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir) 497 | 498 | ... Loads of output ... 
499 | 500 | ================================================= 501 | andi-0.9.1-beta archives ready for distribution: 502 | andi-0.9.1-beta.tar.gz 503 | ================================================= 504 | \end{lstlisting} 505 | 506 | If the command does not build successfully, no tarballs will be created. This may necessitate further study of \algo{autoconf} and \algo{automake}. 507 | 508 | Also verify that the recent changes did not create a performance regression. This includes testing both ends of the scale: \eco and \pneu. Both should be reasonably close to previous releases. 509 | 510 | Create another commit, where you set the version number to the next release (e.\,g., \lstinline$vX.z-beta$). This ensures that there is only one commit and build with that specific version. 511 | 512 | \backmatter 513 | %\addcontentsline{toc}{chapter}{Bibliography} 514 | \bibliography{references} 515 | 516 | \end{document} 517 | -------------------------------------------------------------------------------- /docs/manual/andi_labels.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EvolBioInf/andi/390af15beb76badaf8f16864a885747aa60956c8/docs/manual/andi_labels.pdf -------------------------------------------------------------------------------- /docs/manual/references.bib: -------------------------------------------------------------------------------- 1 | @misc{divsufsort, 2 | author="Yuta Mori", 3 | year="2005", 4 | title="Short description of improved two-stage suffix sorting algorithm", 5 | note="\url{http://homepage3.nifty.com/wpage/software/itssort.txt}" 6 | } 7 | 8 | @article{andi, 9 | author = {Haubold, Bernhard and Klötzl, Fabian and Pfaffelhuber, Peter}, 10 | title = {andi: Fast and accurate estimation of evolutionary distances between closely related genomes}, 11 | volume = {31}, 12 | number = {8}, 13 | pages = {1169-1175}, 14 | year = {2015}, 15 | doi = {10.1093/bioinformatics/btu815}, 16 | URL 
= {http://bioinformatics.oxfordjournals.org/content/31/8/1169.abstract}, 17 | eprint = {http://bioinformatics.oxfordjournals.org/content/31/8/1169.full.pdf+html}, 18 | journal = {Bioinformatics} 19 | } 20 | 21 | @book{Felsenstein, 22 | author={Joseph Felsenstein}, 23 | title={Inferring Phylogenies}, 24 | year={2004}, 25 | publisher={Sinauer Associates, Inc.} 26 | } 27 | 28 | @misc{GPL, 29 | author={{Free~Software~Foundation}}, 30 | year={2007}, 31 | title={Gnu General Public License}, 32 | note={\url{https://gnu.org/licenses/gpl.html}} 33 | } 34 | 35 | @misc{phylip, 36 | author={Felsenstein, J.}, 37 | year={2005}, 38 | title={PHYLIP (Phylogeny Inference Package)}, 39 | version={version 3.6}, 40 | howpublished={Distributed by the author}, 41 | note={Department of Genome Sciences, University of Washington.} 42 | } 43 | 44 | @InProceedings{LLVM, 45 | Author = {Chris Lattner and Vikram Adve}, 46 | Title = {{LLVM}: A Compilation Framework for Lifelong Program 47 | Analysis and Transformation}, 48 | Booktitle = "Code Generation and Optimization", 49 | Month = {Mar}, 50 | Year = {2004}, 51 | pages = {75--88}, 52 | Publisher={International Symposium on Code Generation and Optimization} 53 | } 54 | 55 | @article{ms, 56 | author = {Hudson, Richard R.}, 57 | title = {Generating samples under a Wright–Fisher neutral model of genetic variation}, 58 | volume = {18}, 59 | number = {2}, 60 | pages = {337-338}, 61 | year = {2002}, 62 | doi = {10.1093/bioinformatics/18.2.337}, 63 | URL = {http://bioinformatics.oxfordjournals.org/content/18/2/337.abstract}, 64 | eprint = {http://bioinformatics.oxfordjournals.org/content/18/2/337.full.pdf+html}, 65 | journal = {Bioinformatics} 66 | } 67 | 68 | @article{valgrind, 69 | author = {Nethercote, Nicholas and Seward, Julian}, 70 | title = {Valgrind: A Framework for Heavyweight Dynamic Binary Instrumentation}, 71 | journal = {SIGPLAN Not.}, 72 | issue_date = {June 2007}, 73 | volume = {42}, 74 | number = {6}, 75 | month = jun, 76 | year = 
{2007}, 77 | issn = {0362-1340}, 78 | pages = {89--100}, 79 | numpages = {12}, 80 | url = {http://doi.acm.org/10.1145/1273442.1250746}, 81 | doi = {10.1145/1273442.1250746}, 82 | acmid = {1250746}, 83 | publisher = {ACM}, 84 | keywords = {Memcheck, Valgrind, dynamic binary analysis, dynamic binary instrumentation, shadow values} 85 | } 86 | 87 | @misc{figtree, 88 | title="FigTree", 89 | author={Andrew Rambaut}, 90 | year={accessed 2015}, 91 | note={\url{http://tree.bio.ed.ac.uk/software/figtree/}} 92 | } 93 | 94 | 95 | @article{jukescantor, 96 | author={Jukes, T. H. and Cantor, C. R.}, 97 | year={1969}, 98 | title={Evolution of protein molecules}, 99 | journal={Mammalian protein metabolism}, 100 | volume={3}, 101 | pages={21-132}, 102 | publisher={Academic Press} 103 | } 104 | 105 | @mastersthesis{kloetzl, 106 | author={Fabian Kl{\"o}tzl}, 107 | school={University of L\"ubeck}, 108 | year={2015}, 109 | title={Efficient Estimation of Evolutionary Distances} 110 | } 111 | 112 | @article{afra, 113 | AUTHOR = {Klötzl, Fabian and Haubold, Bernhard}, 114 | TITLE = {Support Values for Genome Phylogenies}, 115 | JOURNAL = {Life}, 116 | VOLUME = {6}, 117 | YEAR = {2016}, 118 | NUMBER = {1}, 119 | PAGES = {11}, 120 | URL = {http://www.mdpi.com/2075-1729/6/1/11}, 121 | ISSN = {2075-1729}, 122 | DOI = {10.3390/life6010011} 123 | } 124 | 125 | @article{logdet, 126 | AUTHOR = {Lockhart, P.J. and M.A. Steel and M.D. Hendy and D. 
Penny}, 127 | TITLE = {Recovering Evolutionary Trees under a More Realistic Model of Sequence Evolution}, 128 | JOURNAL = {Molecular Biology and Evolution}, 129 | VOLUME = {11}, 130 | YEAR = {1994}, 131 | NUMBER = {4}, 132 | PAGES = {605-612}, 133 | DOI = {10.1093/oxfordjournals.molbev.a040136} 134 | } 135 | 136 | @article{kimura, 137 | AUTHOR = {Kimura, M.}, 138 | TITLE = {A Simple Method for Estimating Evolutionary Rate of Base Substitutions Through Comparative Studies of Nucleotide Sequences}, 139 | JOURNAL = {Journal of Molecular Evolution}, 140 | VOLUME = {16}, 141 | YEAR = {1980}, 142 | NUMBER = {2}, 143 | PAGES = {111-120}, 144 | DOI = {10.1007/BF01731581} 145 | } 146 | 147 | -------------------------------------------------------------------------------- /docs/manual/version.tex.in: -------------------------------------------------------------------------------- 1 | 2 | \newcommand{\version}{VERSION} 3 | -------------------------------------------------------------------------------- /libs/Makefile.am: -------------------------------------------------------------------------------- 1 | # (C) 2015, Fabian Klötzl ISC License 2 | 3 | noinst_LIBRARIES= libpfasta.a 4 | libpfasta_a_SOURCES= pfasta.c pfasta.h 5 | libpfasta_a_CPPFLAGS= -I$(top_srcdir)/opt 6 | -------------------------------------------------------------------------------- /libs/pfasta.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015-2020, Fabian Klötzl 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted, provided that the above 6 | * copyright notice and this permission notice appear in all copies. 7 | * 8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 | * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 | * 16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #include "pfasta.h" 29 | 30 | #define VERSION "v15" 31 | 32 | #ifdef __SSE2__ 33 | #include 34 | #endif 35 | 36 | #if __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) 37 | #include 38 | #define PFASTA_THREADSAFE 1 39 | #else 40 | #define thread_local 41 | #define PFASTA_THREADSAFE 0 42 | #endif 43 | 44 | int pfasta_threadsafe() { 45 | return PFASTA_THREADSAFE ; 46 | } 47 | 48 | /** The following is the maximum length of an error string. It has to be 49 | * carefully chosen, so that all calls to PF_FAIL_STR succeed. For instance, 50 | * the line number can account for up to 20 characters. 
51 | */ 52 | #define PF_ERROR_STRING_LENGTH 100 53 | 54 | thread_local char errstr_buffer[PF_ERROR_STRING_LENGTH]; 55 | 56 | void *pfasta_reallocarray(void *ptr, size_t nmemb, size_t size); 57 | 58 | #define BUFFER_SIZE 16384 59 | 60 | #define LIKELY(X) __builtin_expect((intptr_t)(X), 1) 61 | #define UNLIKELY(X) __builtin_expect((intptr_t)(X), 0) 62 | 63 | enum { NO_ERROR, E_EOF, E_ERROR, E_ERRNO, E_BUBBLE, E_STR, E_STR_CONST }; 64 | 65 | #define PF_FAIL_ERRNO(PP) \ 66 | do { \ 67 | (void)strerror_r(errno, errstr_buffer, PF_ERROR_STRING_LENGTH); \ 68 | (PP)->errstr = errstr_buffer; \ 69 | return_code = E_ERRNO; \ 70 | goto cleanup; \ 71 | } while (0) 72 | 73 | #define PF_FAIL_BUBBLE_CHECK(PP, CHECK) \ 74 | do { \ 75 | if (UNLIKELY(CHECK)) { \ 76 | return_code = CHECK; \ 77 | goto cleanup; \ 78 | } \ 79 | } while (0) 80 | 81 | #define PF_FAIL_BUBBLE(PP) \ 82 | do { \ 83 | if (UNLIKELY((PP)->errstr)) { \ 84 | return_code = E_BUBBLE; \ 85 | goto cleanup; \ 86 | } \ 87 | } while (0) 88 | 89 | #define PF_FAIL_STR_CONST(PP, STR) \ 90 | do { \ 91 | (PP)->errstr = (STR); \ 92 | return_code = E_STR_CONST; \ 93 | goto cleanup; \ 94 | } while (0) 95 | 96 | #define PF_FAIL_STR(PP, ...) 
\ 97 | do { \ 98 | (void)snprintf(errstr_buffer, PF_ERROR_STRING_LENGTH, __VA_ARGS__); \ 99 | (PP)->errstr = errstr_buffer; \ 100 | return_code = E_STR; \ 101 | goto cleanup; \ 102 | } while (0) 103 | 104 | int pfasta_read_name(struct pfasta_parser *pp, struct pfasta_record *pr); 105 | int pfasta_read_comment(struct pfasta_parser *pp, struct pfasta_record *pr); 106 | int pfasta_read_sequence(struct pfasta_parser *pp, struct pfasta_record *pr); 107 | 108 | static inline char *buffer_begin(struct pfasta_parser *pp); 109 | static inline char *buffer_end(struct pfasta_parser *pp); 110 | static inline int buffer_advance(struct pfasta_parser *pp, size_t steps); 111 | static inline int buffer_is_empty(const struct pfasta_parser *pp); 112 | static inline int buffer_is_eof(const struct pfasta_parser *pp); 113 | static inline int buffer_peek(struct pfasta_parser *pp); 114 | static inline int buffer_read(struct pfasta_parser *pp); 115 | 116 | typedef struct dynstr { 117 | char *str; 118 | size_t capacity, count; 119 | } dynstr; 120 | 121 | static inline char *dynstr_move(dynstr *ds); 122 | static inline int dynstr_init(dynstr *ds, struct pfasta_parser *pp); 123 | static inline size_t dynstr_len(const dynstr *ds); 124 | static inline void dynstr_free(dynstr *ds); 125 | static inline int dynstr_append(dynstr *ds, const char *str, size_t length, 126 | struct pfasta_parser *pp); 127 | 128 | static inline int my_isspace(int c) { 129 | // ascii whitespace 130 | return (c >= '\t' && c <= '\r') || (c == ' '); 131 | } 132 | 133 | const char *pfasta_version(void) { return VERSION; } 134 | 135 | int buffer_init(struct pfasta_parser *pp) { 136 | int return_code = 0; 137 | 138 | pp->buffer = malloc(BUFFER_SIZE); 139 | if (!pp->buffer) PF_FAIL_ERRNO(pp); 140 | 141 | int check = buffer_read(pp); 142 | PF_FAIL_BUBBLE_CHECK(pp, check); 143 | 144 | cleanup: 145 | return return_code; 146 | } 147 | 148 | int buffer_read(struct pfasta_parser *pp) { 149 | int return_code = NO_ERROR; 150 | ssize_t 
count = read(pp->file_descriptor, pp->buffer, BUFFER_SIZE); 151 | 152 | if (UNLIKELY(count < 0)) PF_FAIL_ERRNO(pp); 153 | if (UNLIKELY(count == 0)) { // EOF 154 | pp->fill_ptr = pp->buffer; 155 | pp->read_ptr = pp->buffer + 1; 156 | pp->errstr = "EOF (maybe error)"; // enable bubbling 157 | return E_EOF; 158 | } 159 | 160 | pp->read_ptr = pp->buffer; 161 | pp->fill_ptr = pp->buffer + count; 162 | 163 | cleanup: 164 | return return_code; 165 | } 166 | 167 | int buffer_peek(struct pfasta_parser *pp) { 168 | return LIKELY(pp->read_ptr < pp->fill_ptr) ? *(unsigned char *)pp->read_ptr 169 | : EOF; 170 | } 171 | 172 | char *buffer_begin(struct pfasta_parser *pp) { return pp->read_ptr; } 173 | 174 | char *buffer_end(struct pfasta_parser *pp) { return pp->fill_ptr; } 175 | 176 | inline int buffer_advance(struct pfasta_parser *pp, size_t steps) { 177 | int return_code = 0; 178 | 179 | pp->read_ptr += steps; 180 | if (UNLIKELY(pp->read_ptr >= pp->fill_ptr)) { 181 | assert(pp->read_ptr == pp->fill_ptr); 182 | int check = buffer_read(pp); // resets pointers 183 | PF_FAIL_BUBBLE_CHECK(pp, check); 184 | } 185 | 186 | cleanup: 187 | return return_code; 188 | } 189 | 190 | int buffer_is_empty(const struct pfasta_parser *pp) { 191 | return pp->read_ptr == pp->fill_ptr; 192 | } 193 | 194 | int buffer_is_eof(const struct pfasta_parser *pp) { 195 | return pp->read_ptr > pp->fill_ptr; 196 | } 197 | 198 | char *find_first_space(const char *begin, const char *end) { 199 | size_t offset = 0; 200 | size_t length = end - begin; 201 | 202 | #ifdef __SSE2__ 203 | 204 | typedef __m128i vec_type; 205 | static const size_t vec_size = sizeof(vec_type); 206 | 207 | const vec_type all_tab = _mm_set1_epi8('\t' - 1); 208 | const vec_type all_carriage = _mm_set1_epi8('\r' + 1); 209 | const vec_type all_space = _mm_set1_epi8(' '); 210 | 211 | size_t vec_offset = 0; 212 | size_t vec_length = (end - begin) / vec_size; 213 | 214 | for (; vec_offset < vec_length; vec_offset++) { 215 | vec_type chunk; 216 | 
memcpy(&chunk, begin + vec_offset * vec_size, vec_size); 217 | 218 | // isspace: \t <= char <= \r || char == space 219 | vec_type v1 = _mm_cmplt_epi8(all_tab, chunk); 220 | vec_type v2 = _mm_cmplt_epi8(chunk, all_carriage); 221 | vec_type v3 = _mm_cmpeq_epi8(chunk, all_space); 222 | 223 | unsigned int vmask = (_mm_movemask_epi8(v1) & _mm_movemask_epi8(v2)) | 224 | _mm_movemask_epi8(v3); 225 | 226 | if (UNLIKELY(vmask)) { 227 | offset += __builtin_ctz(vmask); 228 | offset += vec_offset * vec_size; 229 | return (char *)begin + offset; 230 | } 231 | } 232 | 233 | offset += vec_offset * vec_size; 234 | #endif 235 | 236 | for (; offset < length; offset++) { 237 | if (my_isspace(begin[offset])) break; 238 | } 239 | return (char *)begin + offset; 240 | } 241 | 242 | char *find_first_not_space(const char *begin, const char *end) { 243 | size_t offset = 0; 244 | size_t length = end - begin; 245 | 246 | for (; offset < length; offset++) { 247 | if (!my_isspace(begin[offset])) break; 248 | } 249 | return (char *)begin + offset; 250 | } 251 | 252 | size_t count_newlines(const char *begin, const char *end) { 253 | size_t offset = 0; 254 | size_t length = end - begin; 255 | size_t newlines = 0; 256 | 257 | for (; offset < length; offset++) { 258 | if (begin[offset] == '\n') newlines++; 259 | } 260 | 261 | return newlines; 262 | } 263 | 264 | static int copy_word(struct pfasta_parser *pp, dynstr *target) { 265 | int return_code = 0; 266 | 267 | int c; 268 | while (c = buffer_peek(pp), c != EOF && LIKELY(!my_isspace(c))) { 269 | char *end_of_word = find_first_space(buffer_begin(pp), buffer_end(pp)); 270 | size_t word_length = end_of_word - buffer_begin(pp); 271 | 272 | assert(word_length > 0); 273 | 274 | int check = dynstr_append(target, buffer_begin(pp), word_length, pp); 275 | PF_FAIL_BUBBLE_CHECK(pp, check); 276 | 277 | check = buffer_advance(pp, word_length); 278 | PF_FAIL_BUBBLE_CHECK(pp, check); 279 | } 280 | 281 | cleanup: 282 | return return_code; 283 | } 284 | 285 | 
static int skip_whitespace(struct pfasta_parser *pp) { 286 | int return_code = 0; 287 | 288 | while (my_isspace(buffer_peek(pp))) { 289 | char *split = find_first_not_space(buffer_begin(pp), buffer_end(pp)); 290 | 291 | // advance may clear the buffer. So count first … 292 | size_t newlines = count_newlines(buffer_begin(pp), split); 293 | int check = buffer_advance(pp, split - buffer_begin(pp)); 294 | PF_FAIL_BUBBLE_CHECK(pp, check); 295 | 296 | // … and then increase the counter. 297 | pp->line_number += newlines; 298 | } 299 | 300 | cleanup: 301 | return return_code; 302 | } 303 | 304 | struct pfasta_parser pfasta_init(int file_descriptor) { 305 | int return_code = 0; 306 | struct pfasta_parser pp = {0}; 307 | pp.line_number = 1; 308 | 309 | pp.file_descriptor = file_descriptor; 310 | int check = buffer_init(&pp); 311 | if (check && check != E_EOF) PF_FAIL_BUBBLE_CHECK(&pp, check); 312 | 313 | if (buffer_is_empty(&pp) || buffer_is_eof(&pp)) { 314 | PF_FAIL_STR(&pp, "File is empty."); 315 | } 316 | 317 | if (buffer_peek(&pp) != '>') { 318 | PF_FAIL_STR(&pp, "File must start with '>'."); 319 | } 320 | 321 | cleanup: 322 | // free buffer if necessary 323 | if (return_code) { 324 | pfasta_free(&pp); 325 | } 326 | pp.done = return_code || buffer_is_eof(&pp); 327 | return pp; 328 | } 329 | 330 | struct pfasta_record pfasta_read(struct pfasta_parser *pp) { 331 | int return_code = 0; 332 | struct pfasta_record pr = {0}; 333 | 334 | int check = pfasta_read_name(pp, &pr); 335 | PF_FAIL_BUBBLE_CHECK(pp, check); 336 | 337 | check = pfasta_read_comment(pp, &pr); 338 | PF_FAIL_BUBBLE_CHECK(pp, check); 339 | 340 | check = pfasta_read_sequence(pp, &pr); 341 | PF_FAIL_BUBBLE_CHECK(pp, check); 342 | 343 | cleanup: 344 | if (return_code) { 345 | pfasta_record_free(&pr); 346 | pfasta_free(pp); 347 | } 348 | pp->done = return_code || buffer_is_eof(pp); 349 | return pr; 350 | } 351 | 352 | int pfasta_read_name(struct pfasta_parser *pp, struct pfasta_record *pr) { 353 | int return_code 
= 0; 354 | 355 | dynstr name; 356 | dynstr_init(&name, pp); 357 | PF_FAIL_BUBBLE(pp); 358 | 359 | assert(!buffer_is_empty(pp)); 360 | if (buffer_peek(pp) != '>') { 361 | PF_FAIL_STR(pp, "Expected '>' but found '%c' on line %zu.", 362 | buffer_peek(pp), pp->line_number); 363 | } 364 | 365 | int check = buffer_advance(pp, 1); // skip > 366 | if (check == E_EOF) 367 | PF_FAIL_STR(pp, "Unexpected EOF in name on line %zu.", pp->line_number); 368 | PF_FAIL_BUBBLE(pp); 369 | 370 | check = copy_word(pp, &name); 371 | if (check == E_EOF) 372 | PF_FAIL_STR(pp, "Unexpected EOF in name on line %zu.", pp->line_number); 373 | PF_FAIL_BUBBLE(pp); 374 | 375 | if (dynstr_len(&name) == 0) 376 | PF_FAIL_STR(pp, "Empty name on line %zu.", pp->line_number); 377 | 378 | pr->name_length = dynstr_len(&name); 379 | pr->name = dynstr_move(&name); 380 | 381 | cleanup: 382 | if (return_code) { 383 | dynstr_free(&name); 384 | } 385 | return return_code; 386 | } 387 | 388 | int pfasta_read_comment(struct pfasta_parser *pp, struct pfasta_record *pr) { 389 | int return_code = 0; 390 | 391 | if (buffer_peek(pp) == '\n') { 392 | pr->comment_length = 0; 393 | pr->comment = NULL; 394 | return 0; 395 | } 396 | 397 | dynstr comment; 398 | dynstr_init(&comment, pp); 399 | PF_FAIL_BUBBLE(pp); 400 | 401 | assert(!buffer_is_empty(pp)); 402 | 403 | int check = buffer_advance(pp, 1); // skip first whitespace 404 | if (check == E_EOF) goto label_eof; 405 | PF_FAIL_BUBBLE(pp); 406 | 407 | assert(!buffer_is_empty(pp)); 408 | 409 | // get comment 410 | while (buffer_peek(pp) != '\n') { 411 | check = dynstr_append(&comment, buffer_begin(pp), 1, pp); 412 | PF_FAIL_BUBBLE_CHECK(pp, check); 413 | 414 | check = buffer_advance(pp, 1); 415 | if (check == E_EOF) goto label_eof; 416 | PF_FAIL_BUBBLE_CHECK(pp, check); 417 | } 418 | 419 | label_eof: 420 | if (buffer_is_eof(pp)) 421 | PF_FAIL_STR(pp, "Unexpected EOF in comment on line %zu.", 422 | pp->line_number); 423 | 424 | pr->comment_length = dynstr_len(&comment); 425 
| pr->comment = dynstr_move(&comment); 426 | 427 | cleanup: 428 | if (return_code) { 429 | dynstr_free(&comment); 430 | } 431 | return return_code; 432 | } 433 | 434 | int pfasta_read_sequence(struct pfasta_parser *pp, struct pfasta_record *pr) { 435 | int return_code = 0; 436 | 437 | dynstr sequence; 438 | dynstr_init(&sequence, pp); 439 | PF_FAIL_BUBBLE(pp); 440 | 441 | assert(!buffer_is_empty(pp)); 442 | assert(!buffer_is_eof(pp)); 443 | assert(buffer_peek(pp) == '\n'); 444 | 445 | int check = skip_whitespace(pp); 446 | if (check == E_EOF) 447 | PF_FAIL_STR(pp, "Empty sequence on line %zu.", pp->line_number); 448 | PF_FAIL_BUBBLE_CHECK(pp, check); 449 | 450 | // Assume a line begins only with alpha, -, *, or more spaces 451 | char c; 452 | while (c = buffer_peek(pp), LIKELY(isalpha(c) || c == '-' || c == '*')) { 453 | int check = copy_word(pp, &sequence); 454 | if (UNLIKELY(check == E_EOF)) break; 455 | PF_FAIL_BUBBLE_CHECK(pp, check); 456 | 457 | // optimize for more common case 458 | ptrdiff_t length = buffer_end(pp) - buffer_begin(pp); 459 | if (LIKELY(length >= 2 && buffer_begin(pp)[0] == '\n' && 460 | buffer_begin(pp)[1] > ' ')) { 461 | pp->read_ptr++; // nasty hack 462 | pp->line_number += 1; 463 | } else { 464 | check = skip_whitespace(pp); 465 | if (UNLIKELY(check == E_EOF)) break; 466 | PF_FAIL_BUBBLE_CHECK(pp, check); 467 | } 468 | } 469 | 470 | if (dynstr_len(&sequence) == 0) 471 | PF_FAIL_STR(pp, "Empty sequence on line %zu.", pp->line_number); 472 | 473 | pr->sequence_length = dynstr_len(&sequence); 474 | pr->sequence = dynstr_move(&sequence); 475 | pp->errstr = NULL; // reset error 476 | 477 | cleanup: 478 | if (return_code) { 479 | dynstr_free(&sequence); 480 | } 481 | return return_code; 482 | } 483 | 484 | void pfasta_record_free(struct pfasta_record *pr) { 485 | if (!pr) return; 486 | free(pr->name); 487 | free(pr->comment); 488 | free(pr->sequence); 489 | pr->name = pr->comment = pr->sequence = NULL; 490 | } 491 | 492 | void pfasta_free(struct 
pfasta_parser *pp) { 493 | if (!pp) return; 494 | free(pp->buffer); 495 | pp->buffer = NULL; 496 | } 497 | 498 | /** @brief Creates a new string that can grow dynamically. 499 | * 500 | * @param ds - A reference to the dynstr container. 501 | * @param pp - The parser whose `errstr` is set if the allocation fails. 502 | * @returns 0 iff successful. 503 | */ 504 | static inline int dynstr_init(dynstr *ds, struct pfasta_parser *pp) { 505 | int return_code = 0; 506 | 507 | *ds = (dynstr){NULL, 0, 0}; 508 | ds->str = malloc(61); 509 | if (!ds->str) PF_FAIL_ERRNO(pp); 510 | 511 | ds->str[0] = '\0'; 512 | ds->capacity = 61; 513 | ds->count = 0; 514 | 515 | cleanup: 516 | return return_code; 517 | } 518 | 519 | /** @brief Appends one or more characters to a string. 520 | * 521 | * @param ds - A reference to the dynstr container. 522 | * @param str - The new characters. 523 | * @param length - number of new characters to append 524 | * @param pp - The parser whose `errstr` is set if the allocation fails. 525 | * @returns 0 iff successful; on allocation failure `ds` is freed. 526 | */ 527 | static inline int dynstr_append(dynstr *ds, const char *str, size_t length, 528 | struct pfasta_parser *pp) { 529 | int return_code = 0; 530 | size_t required = ds->count + length; 531 | 532 | if (UNLIKELY(required >= ds->capacity)) { 533 | char *neu = pfasta_reallocarray(ds->str, required / 2, 3); 534 | if (UNLIKELY(!neu)) { 535 | dynstr_free(ds); 536 | PF_FAIL_ERRNO(pp); 537 | } 538 | ds->str = neu; 539 | ds->capacity = (required / 2) * 3; 540 | } 541 | 542 | memcpy(ds->str + ds->count, str, length); 543 | ds->count = required; 544 | 545 | cleanup: 546 | return return_code; 547 | } 548 | 549 | /** @brief Frees a dynamic string. */ 550 | static inline void dynstr_free(dynstr *ds) { 551 | if (!ds) return; 552 | free(ds->str); 553 | *ds = (dynstr){NULL, 0, 0}; 554 | } 555 | 556 | /** @brief Returns the string as a standard `char*`. The internal reference is 557 | * then deleted. Hence the name *move* as in *move semantics*. 558 | * 559 | * @param ds - The dynamic string to move from. 560 | * 561 | * @returns a `char*` to a standard null-terminated string. 
562 | */ 563 | static inline char *dynstr_move(dynstr *ds) { 564 | char *out = pfasta_reallocarray(ds->str, ds->count + 1, 1); 565 | if (!out) { 566 | out = ds->str; 567 | } 568 | out[ds->count] = '\0'; 569 | *ds = (dynstr){NULL, 0, 0}; 570 | return out; 571 | } 572 | 573 | /** @brief Returns the current length of the dynamic string. */ 574 | static inline size_t dynstr_len(const dynstr *ds) { return ds->count; } 575 | 576 | __attribute__((weak)) void *reallocarray(void *ptr, size_t nmemb, size_t size); 577 | 578 | /** 579 | * @brief Resizes an array of `nmemb` elements of `size` bytes each. 580 | * 581 | * Uses reallocarray where the stdlib provides it. Otherwise falls back to 582 | * realloc, after checking that `nmemb * size` does not overflow. 583 | * 584 | * @returns the resized memory, or NULL on failure; errno is set to ENOMEM if 585 | * the fallback rejects an overflowing size. 586 | */ 587 | void *pfasta_reallocarray(void *ptr, size_t nmemb, size_t size) { 588 | if (reallocarray != NULL) return reallocarray(ptr, nmemb, size); 589 | if (size != 0 && nmemb > (size_t)-1 / size) { 590 | errno = ENOMEM; // nmemb * size would wrap around 591 | return NULL; 592 | } 593 | return realloc(ptr, nmemb * size); 594 | } 595 | -------------------------------------------------------------------------------- /libs/pfasta.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2015-2020, Fabian Klötzl 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted, provided that the above 6 | * copyright notice and this permission notice appear in all copies. 7 | * 8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
15 | * 16 | */ 17 | 18 | #ifndef PFASTA_H 19 | #define PFASTA_H 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | #include 26 | 27 | /** 28 | * There is no magic to this structure. It's just a container of three strings. 29 | * Feel free to duplicate or move them. But don't forget to free the data after 30 | * usage! 31 | */ 32 | struct pfasta_record { 33 | char *name, *comment, *sequence; 34 | size_t name_length, comment_length, sequence_length; 35 | }; 36 | 37 | /** 38 | * This structure holds a number of members to represent the state of the FASTA 39 | * parser. Please make sure that it is properly initialized before usage. 40 | * Always free this structure when the parser is done. 41 | */ 42 | struct pfasta_parser { 43 | const char *errstr; 44 | int done; 45 | 46 | /*< private -- do not touch! >*/ 47 | int file_descriptor; 48 | char *buffer; 49 | char *read_ptr, *fill_ptr; 50 | size_t line_number; 51 | }; 52 | 53 | /** 54 | * This function initializes a `pfasta_parser` struct with a parser bound to a 55 | * specific file descriptor. Iff an error occurred `errstr` is set to contain a 56 | * suitable message. Otherwise you can read data from it as long as `done` isn't 57 | * set. The parser should be freed after usage. 58 | * 59 | * Please note that the user is responsible for opening the file descriptor as 60 | * readable and closing it after usage. 61 | */ 62 | struct pfasta_parser pfasta_init(int file_descriptor); 63 | 64 | /** 65 | * Using a properly initialized parser, this function can read FASTA sequences. 66 | * These are stored in the simple structure and returned. On error, the `errstr` 67 | * property of the parser is set. 68 | */ 69 | struct pfasta_record pfasta_read(struct pfasta_parser *pp); 70 | 71 | /** 72 | * This function frees the resources held by a pfasta record. 73 | */ 74 | void pfasta_record_free(struct pfasta_record *pr); 75 | 76 | /** 77 | * This function frees the resources held by a pfasta parser. 
78 | */ 79 | void pfasta_free(struct pfasta_parser *pp); 80 | 81 | /** 82 | * Get a string defining the version of the pfasta library. 83 | */ 84 | const char *pfasta_version(void); 85 | 86 | /** 87 | * Returns 0 iff pfasta is not threadsafe. 88 | */ 89 | int pfasta_threadsafe(); 90 | 91 | #ifdef __cplusplus 92 | } 93 | #endif 94 | 95 | #endif /* PFASTA_H */ 96 | -------------------------------------------------------------------------------- /m4/ax_cxx_compile_stdcxx_11.m4: -------------------------------------------------------------------------------- 1 | # ============================================================================ 2 | # http://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_11.html 3 | # ============================================================================ 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_CXX_COMPILE_STDCXX_11([ext|noext],[mandatory|optional]) 8 | # 9 | # DESCRIPTION 10 | # 11 | # Check for baseline language coverage in the compiler for the C++11 12 | # standard; if necessary, add switches to CXXFLAGS to enable support. 13 | # 14 | # The first argument, if specified, indicates whether you insist on an 15 | # extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g. 16 | # -std=c++11). If neither is specified, you get whatever works, with 17 | # preference for an extended mode. 18 | # 19 | # The second argument, if specified 'mandatory' or if left unspecified, 20 | # indicates that baseline C++11 support is required and that the macro 21 | # should error out if no mode with that support is found. If specified 22 | # 'optional', then configuration proceeds regardless, after defining 23 | # HAVE_CXX11 if and only if a supporting mode is found. 24 | # 25 | # LICENSE 26 | # 27 | # Copyright (c) 2008 Benjamin Kosnik 28 | # Copyright (c) 2012 Zack Weinberg 29 | # Copyright (c) 2013 Roy Stogner 30 | # Copyright (c) 2014 Alexey Sokolov 31 | # Copyright (c) 2014, 2015 Google Inc. 
32 | # 33 | # Copying and distribution of this file, with or without modification, are 34 | # permitted in any medium without royalty provided the copyright notice 35 | # and this notice are preserved. This file is offered as-is, without any 36 | # warranty. 37 | 38 | #serial 7 39 | 40 | m4_define([_AX_CXX_COMPILE_STDCXX_11_testbody], [[ 41 | template 42 | struct check 43 | { 44 | static_assert(sizeof(int) <= sizeof(T), "not big enough"); 45 | }; 46 | 47 | struct Base { 48 | virtual void f() {} 49 | }; 50 | struct Child : public Base { 51 | virtual void f() override {} 52 | }; 53 | 54 | typedef check> right_angle_brackets; 55 | 56 | int a; 57 | decltype(a) b; 58 | 59 | typedef check check_type; 60 | check_type c; 61 | check_type&& cr = static_cast(c); 62 | 63 | auto d = a; 64 | auto l = [](){}; 65 | 66 | // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae 67 | // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this 68 | namespace test_template_alias_sfinae { 69 | struct foo {}; 70 | 71 | template 72 | using member = typename T::member_type; 73 | 74 | template 75 | void func(...) 
{} 76 | 77 | template 78 | void func(member*) {} 79 | 80 | void test() { 81 | func(0); 82 | } 83 | } 84 | ]]) 85 | 86 | AC_DEFUN([AX_CXX_COMPILE_STDCXX_11], [dnl 87 | m4_if([$1], [], [], 88 | [$1], [ext], [], 89 | [$1], [noext], [], 90 | [m4_fatal([invalid argument `$1' to AX_CXX_COMPILE_STDCXX_11])])dnl 91 | m4_if([$2], [], [ax_cxx_compile_cxx11_required=true], 92 | [$2], [mandatory], [ax_cxx_compile_cxx11_required=true], 93 | [$2], [optional], [ax_cxx_compile_cxx11_required=false], 94 | [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX_11])]) 95 | AC_LANG_PUSH([C++])dnl 96 | ac_success=no 97 | AC_CACHE_CHECK(whether $CXX supports C++11 features by default, 98 | ax_cv_cxx_compile_cxx11, 99 | [AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])], 100 | [ax_cv_cxx_compile_cxx11=yes], 101 | [ax_cv_cxx_compile_cxx11=no])]) 102 | if test x$ax_cv_cxx_compile_cxx11 = xyes; then 103 | ac_success=yes 104 | fi 105 | 106 | m4_if([$1], [noext], [], [dnl 107 | if test x$ac_success = xno; then 108 | for switch in -std=gnu++11 -std=gnu++0x; do 109 | cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch]) 110 | AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch, 111 | $cachevar, 112 | [ac_save_CXXFLAGS="$CXXFLAGS" 113 | CXXFLAGS="$CXXFLAGS $switch" 114 | AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])], 115 | [eval $cachevar=yes], 116 | [eval $cachevar=no]) 117 | CXXFLAGS="$ac_save_CXXFLAGS"]) 118 | if eval test x\$$cachevar = xyes; then 119 | CXXFLAGS="$CXXFLAGS $switch" 120 | ac_success=yes 121 | break 122 | fi 123 | done 124 | fi]) 125 | 126 | m4_if([$1], [ext], [], [dnl 127 | if test x$ac_success = xno; then 128 | for switch in -std=c++11 -std=c++0x; do 129 | cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch]) 130 | AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch, 131 | $cachevar, 132 | [ac_save_CXXFLAGS="$CXXFLAGS" 133 | CXXFLAGS="$CXXFLAGS $switch" 134 | 
AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])], 135 | [eval $cachevar=yes], 136 | [eval $cachevar=no]) 137 | CXXFLAGS="$ac_save_CXXFLAGS"]) 138 | if eval test x\$$cachevar = xyes; then 139 | CXXFLAGS="$CXXFLAGS $switch" 140 | ac_success=yes 141 | break 142 | fi 143 | done 144 | fi]) 145 | AC_LANG_POP([C++]) 146 | if test x$ax_cxx_compile_cxx11_required = xtrue; then 147 | if test x$ac_success = xno; then 148 | AC_MSG_ERROR([*** A compiler with support for C++11 language features is required.]) 149 | fi 150 | else 151 | if test x$ac_success = xno; then 152 | HAVE_CXX11=0 153 | AC_MSG_NOTICE([No compiler with C++11 support was found]) 154 | else 155 | HAVE_CXX11=1 156 | AC_DEFINE(HAVE_CXX11,1, 157 | [define if the compiler supports basic C++11 syntax]) 158 | fi 159 | 160 | AC_SUBST(HAVE_CXX11) 161 | fi 162 | ]) 163 | -------------------------------------------------------------------------------- /opt/Makefile.am: -------------------------------------------------------------------------------- 1 | noinst_LIBRARIES= libcompat.a 2 | libcompat_a_SOURCES= compat-string.h compat-stdlib.h 3 | 4 | if !HAVE_STRCHRNUL 5 | libcompat_a_SOURCES+= strchrnul.c 6 | endif 7 | 8 | if !HAVE_REALLOCARRAY 9 | libcompat_a_SOURCES+= reallocarray.c 10 | endif 11 | -------------------------------------------------------------------------------- /opt/compat-stdlib.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void *reallocarray(void *optr, size_t nmemb, size_t size); 4 | -------------------------------------------------------------------------------- /opt/compat-string.h: -------------------------------------------------------------------------------- 1 | #ifndef HAVE_STRCHRNUL 2 | char *strchrnul(const char *s, int c); 3 | #endif 4 | -------------------------------------------------------------------------------- /opt/reallocarray.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "compat-stdlib.h" 4 | 5 | /* 6 | * Copyright (c) 2008 Otto Moerbeek 7 | * 8 | * Permission to use, copy, modify, and distribute this software for any 9 | * purpose with or without fee is hereby granted, provided that the above 10 | * copyright notice and this permission notice appear in all copies. 11 | * 12 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 13 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 14 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 15 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 16 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 17 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 18 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 19 | */ 20 | 21 | /* 22 | * This is sqrt(SIZE_MAX+1), as s1*s2 <= SIZE_MAX 23 | * if both s1 < MUL_NO_OVERFLOW and s2 < MUL_NO_OVERFLOW 24 | */ 25 | #define MUL_NO_OVERFLOW ((size_t)1 << (sizeof(size_t) * 4)) 26 | 27 | void *reallocarray(void *optr, size_t nmemb, size_t size) { 28 | if ((nmemb >= MUL_NO_OVERFLOW || size >= MUL_NO_OVERFLOW) && nmemb > 0 && 29 | SIZE_MAX / nmemb < size) { 30 | errno = ENOMEM; 31 | return NULL; 32 | } 33 | return realloc(optr, size * nmemb); 34 | } 35 | -------------------------------------------------------------------------------- /opt/strchrnul.c: -------------------------------------------------------------------------------- 1 | /* @brief Here follows a simple implementation of the GNU function `strchrnul`. 2 | * Please check the gnulib manual for details. 3 | */ 4 | #include 5 | 6 | char *strchrnul(const char *s, int c){ 7 | char *p = strchr(s,c); 8 | 9 | return p != NULL ? 
p : strchr(s, '\0'); 10 | } 11 | -------------------------------------------------------------------------------- /scripts/_andi: -------------------------------------------------------------------------------- 1 | #compdef andi 2 | 3 | # This file allows zsh to complete arguments for andi. As the syntax is 4 | # totally non-obvious, I'll explain the basics here. For details see 5 | # http://zsh.sourceforge.net/Doc/Release/Completion-System.html 6 | # Each line consists of three parts: (A){B}[C] 7 | # The B part performs brace expansion as on the commandline. Thus each 8 | # line with braces gets translated into multiple arguments! Also the 9 | # B part lists the relevant argument for which we are trying to set 10 | # the completion rules. The A part simply states that B shall not be 11 | # completed if A is already present. i.e. Most flags only make sense once, 12 | # with the exception of -v. The string C is simply the message that is 13 | # displayed to the user. 14 | 15 | local info="-h --help --version" 16 | local ret=1 17 | local -a args 18 | 19 | args+=( 20 | "($info -b --bootstrap)"{-b+,--bootstrap=}'[Print additional bootstrap matrices]:int:' 21 | "($info)*--file-of-filenames=[Read additional filenames from file; one per line]:file:_files" 22 | "($info -j --join)"{-j,--join}'[Treat all sequences from one file as a single genome]' 23 | "($info -l --low-memory)"{-l,--low-memory}'[Use less memory at the cost of speed]' 24 | "($info -m --model)"{-m+,--model=}'[Pick an evolutionary model]:model:(( 25 | Raw\:Uncorrected\ distances 26 | JC\:Jukes\-Cantor\ corrected 27 | Kimura\:Kimura\-two\-parameter 28 | LogDet\:Logarithmic\ determinant 29 | ))' 30 | "($info)-p+[Significance of an anchor; default\: 0.025]:float:" 31 | "($info)--progress=[Show progress bar]:when:(always auto never)" 32 | "($info -t --threads)"{-t+,--threads=}'[The number of threads to be used; by default, all available processors are used]:num_threads:' 33 | "($info)--truncate-names[Print only 
the first ten characters of each name]" 34 | "($info)*"{-v,--verbose}'[Prints additional information]' 35 | '(- *)'{-h,--help}'[Display help and exit]' 36 | '(- *)--version[Output version information and acknowledgments]' 37 | '*:file:_files' 38 | ) 39 | 40 | _arguments -w -s -S $args[@] && ret=0 41 | 42 | return ret 43 | -------------------------------------------------------------------------------- /scripts/failed.zsh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/zsh 2 | 3 | # Compute the number of failing comparisons for different distances. 4 | 5 | DISTS=(0.1 0.2 0.3 0.35 0.4 0.45 0.5 0.55 0.6 0.65 0.7) 6 | 7 | LENGTH=100000 8 | 9 | for dist in $DISTS; do 10 | echo "" > est_$dist.dist 11 | for (( i = 0; i < 1000; i++ )); do 12 | ../test/test_fasta -l $LENGTH -d $dist > temp.fa 13 | ../src/andi ./temp.fa > est.dist 2> /dev/null 14 | tail -n 1 est.dist >> est_$dist.dist 15 | done 16 | avg=$(cat est_$dist.dist | awk '"nan" !~ $2 {sum+=$2;c++}END{print sum/c}') 17 | sd=$(grep -v 'nan' est_$dist.dist | awk '{a[c++]=$2;aa+=$2}END{aa/=NR;for(c=0;c file.maf > file.phy"; 10 | exit(-1); 11 | } 12 | numName = 0; 13 | test = "mult=" n; 14 | }{ 15 | if(/^a/){ 16 | if($0 ~ test) 17 | open = 1; 18 | else 19 | open = 0; 20 | } 21 | if(open && /^s/){ 22 | if(!s[$2]) 23 | names[numNames++] = $2; 24 | s[$2] = s[$2] $7; 25 | } 26 | }END{ 27 | # check equal length of sequences 28 | len = -1; 29 | for(i=0;i 0){ 32 | if(length(s[name]) != len){ 33 | print "sequence length should be " len " but is in fact " length(s[name]); 34 | exit(-1); 35 | } 36 | }else 37 | len = length(s[name]); 38 | } 39 | print numNames, len; 40 | start = 1; 41 | l = 60; 42 | for(i=0;i /dev/null 21 | 22 | # In theory we have to calculate the anchors distance here using the previously 23 | # computed matches. But since vmatch is already significantly slower than andi, 24 | # we skip this step. 
25 | done 26 | 27 | -------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = andi 2 | 3 | andi_SOURCES = andi.c esa.c process.c sequence.c io.c global.h esa.h process.h sequence.h io.h dist_hack.h \ 4 | model.h model.c 5 | andi_CPPFLAGS = $(OPENMP_CFLAGS) -I$(top_srcdir)/libs -I$(top_srcdir)/opt -std=gnu99 6 | andi_CFLAGS = $(OPENMP_CFLAGS) -Wall -Wextra -Wno-missing-field-initializers 7 | andi_LDADD = $(top_builddir)/libs/libpfasta.a $(top_builddir)/opt/libcompat.a 8 | 9 | .PHONY: perf 10 | perf: CFLAGS+= -g -O3 -ggdb -fno-omit-frame-pointer 11 | perf: andi 12 | -------------------------------------------------------------------------------- /src/andi.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * 4 | * This is the main file. It contains functions to parse the commandline 5 | * arguments, read files etc. 6 | * 7 | * @brief The main file 8 | * @author Fabian Klötzl 9 | * 10 | * @section License 11 | * 12 | * This program is free software; you can redistribute it and/or 13 | * modify it under the terms of the GNU General Public License as 14 | * published by the Free Software Foundation; either version 3 of 15 | * the License, or (at your option) any later version. 16 | * 17 | * This program is distributed in the hope that it will be useful, but 18 | * WITHOUT ANY WARRANTY; without even the implied warranty of 19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU 20 | * General Public License for more details at 21 | * http://www.gnu.org/copyleft/gpl.html 22 | * 23 | */ 24 | 25 | #include "global.h" 26 | #include "io.h" 27 | #include "process.h" 28 | #include "sequence.h" 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #ifdef _OPENMP 41 | #include 42 | #endif 43 | 44 | /* Global variables */ 45 | int FLAGS = 0; 46 | int THREADS = 1; 47 | long unsigned int BOOTSTRAP = 0; 48 | double ANCHOR_P_VALUE = 0.025; 49 | gsl_rng *RNG = NULL; 50 | int MODEL = M_JC; 51 | 52 | void usage(int); 53 | void version(void); 54 | 55 | /** 56 | * @brief The main function. 57 | * 58 | * The main function reads and parses the commandline arguments. Depending on 59 | * the set flags it reads the input files and forwards the contained sequences 60 | * to processing. Also it verifies the input for correctness and issues warnings 61 | * and errors. 62 | */ 63 | int main(int argc, char *argv[]) { 64 | struct option long_options[] = { 65 | {"version", no_argument, NULL, 0}, 66 | {"truncate-names", no_argument, NULL, 0}, 67 | {"file-of-filenames", required_argument, NULL, 0}, 68 | {"progress", optional_argument, NULL, 0}, 69 | {"help", no_argument, NULL, 'h'}, 70 | {"verbose", no_argument, NULL, 'v'}, 71 | {"join", no_argument, NULL, 'j'}, 72 | {"low-memory", no_argument, NULL, 'l'}, 73 | {"threads", required_argument, NULL, 't'}, 74 | {"bootstrap", required_argument, NULL, 'b'}, 75 | {"model", required_argument, NULL, 'm'}, 76 | {0, 0, 0, 0}}; 77 | 78 | #ifdef _OPENMP 79 | // Use all available processors by default. 
80 | THREADS = omp_get_num_procs(); 81 | #endif 82 | 83 | enum { P_AUTO, P_NEVER, P_ALWAYS } progress = P_AUTO; 84 | 85 | struct string_vector file_names; 86 | string_vector_init(&file_names); 87 | 88 | // parse arguments 89 | while (1) { 90 | 91 | int option_index = 0; 92 | int c = getopt_long(argc, argv, "jvht:p:m:b:l", long_options, 93 | &option_index); 94 | 95 | if (c == -1) { 96 | break; 97 | } 98 | 99 | switch (c) { 100 | case 0: { 101 | const char *option_str = long_options[option_index].name; 102 | if (strcasecmp(option_str, "version") == 0) { 103 | version(); 104 | } 105 | if (strcasecmp(option_str, "truncate-names") == 0) { 106 | FLAGS |= F_TRUNCATE_NAMES; 107 | } 108 | if (strcasecmp(option_str, "file-of-filenames") == 0) { 109 | read_into_string_vector(optarg, &file_names); 110 | } 111 | if (strcasecmp(option_str, "progress") == 0) { 112 | if (!optarg || strcasecmp(optarg, "always") == 0) { 113 | progress = P_ALWAYS; 114 | } else if (strcasecmp(optarg, "auto") == 0) { 115 | progress = P_AUTO; 116 | } else if (strcasecmp(optarg, "never") == 0) { 117 | progress = P_NEVER; 118 | } else { 119 | warnx("invalid argument to --progress '%s'. Expected " 120 | "one of 'auto', 'always', or 'never'.", 121 | optarg); 122 | } 123 | } 124 | break; 125 | } 126 | case 'h': usage(EXIT_SUCCESS); break; 127 | case 'v': 128 | FLAGS |= FLAGS & F_VERBOSE ? F_EXTRA_VERBOSE : F_VERBOSE; 129 | break; 130 | case 'p': { 131 | errno = 0; 132 | char *end; 133 | double prop = strtod(optarg, &end); 134 | 135 | if (errno || end == optarg || *end != '\0') { 136 | soft_errx( 137 | "Expected a floating point number for -p argument, but " 138 | "'%s' was given. 
Skipping argument.", 139 | optarg); 140 | break; 141 | } 142 | 143 | if (prop <= 0.0 || prop >= 1.0) { 144 | soft_errx("A probability should be a value between 0 and " 145 | "1, exclusive; Ignoring -p %f argument.", 146 | prop); 147 | break; 148 | } 149 | 150 | ANCHOR_P_VALUE = prop; 151 | break; 152 | } 153 | case 'l': FLAGS |= F_LOW_MEMORY; break; 154 | case 'j': FLAGS |= F_JOIN; break; 155 | case 't': { 156 | #ifdef _OPENMP 157 | errno = 0; 158 | char *end; 159 | long unsigned int threads = strtoul(optarg, &end, 10); 160 | 161 | if (errno || end == optarg || *end != '\0') { 162 | warnx("Expected a number for -t argument, but '%s' was " 163 | "given. Ignoring -t argument.", 164 | optarg); 165 | break; 166 | } 167 | 168 | if (threads > (long unsigned int)omp_get_num_procs()) { 169 | warnx( 170 | "The number of threads to be used, is greater than the " 171 | "number of available processors; Ignoring -t %lu " 172 | "argument.", 173 | threads); 174 | break; 175 | } 176 | 177 | THREADS = threads; 178 | #else 179 | warnx( 180 | "This version of andi was built without OpenMP and thus " 181 | "does not support multi threading. Ignoring -t argument."); 182 | #endif 183 | break; 184 | } 185 | case 'b': { 186 | errno = 0; 187 | char *end; 188 | long unsigned int bootstrap = strtoul(optarg, &end, 10); 189 | 190 | if (errno || end == optarg || *end != '\0' || bootstrap == 0) { 191 | soft_errx( 192 | "Expected a positive number for -b argument, but '%s' " 193 | "was given. Ignoring -b argument.", 194 | optarg); 195 | break; 196 | } 197 | 198 | BOOTSTRAP = bootstrap - 1; 199 | break; 200 | } 201 | case 'm': { 202 | if (strcasecmp(optarg, "RAW") == 0) { 203 | MODEL = M_RAW; 204 | } else if (strcasecmp(optarg, "JC") == 0) { 205 | MODEL = M_JC; 206 | } else if (strcasecmp(optarg, "KIMURA") == 0) { 207 | MODEL = M_KIMURA; 208 | } else if (strcasecmp(optarg, "LOGDET") == 0) { 209 | MODEL = M_LOGDET; 210 | } else { 211 | soft_errx("Ignoring argument for --model. 
Expected Raw, " 212 | "JC, Kimura or LogDet"); 213 | } 214 | break; 215 | } 216 | case '?': /* intentional fall-through */ 217 | default: usage(EXIT_FAILURE); break; 218 | } 219 | } 220 | 221 | argc -= optind; 222 | argv += optind; 223 | 224 | // copy command line arguments into vector 225 | // std::copy, anyone? 226 | for (size_t i = 0; i < (unsigned int)argc; i++) { 227 | string_vector_push_back(&file_names, argv[i]); 228 | } 229 | 230 | // at least one file name must be given 231 | if (FLAGS & F_JOIN && string_vector_size(&file_names) == 0) { 232 | errx(1, "In join mode at least one filename needs to be supplied."); 233 | } 234 | 235 | size_t minfiles = FLAGS & F_JOIN ? 2 : 1; 236 | if (string_vector_size(&file_names) < minfiles) { 237 | // not enough files passed via arguments 238 | if (!isatty(STDIN_FILENO)) { 239 | // read from stdin in pipe 240 | string_vector_push_back(&file_names, "-"); 241 | } else { 242 | // print a helpful message on './andi' without args 243 | usage(EXIT_FAILURE); 244 | } 245 | } 246 | 247 | // parse fasta files 248 | dsa_t dsa; 249 | dsa_init(&dsa); 250 | for (size_t i = 0; i < string_vector_size(&file_names); i++) { 251 | char *file_name = string_vector_at(&file_names, i); 252 | if (FLAGS & F_JOIN) { 253 | read_fasta_join(file_name, &dsa); 254 | } else { 255 | read_fasta(file_name, &dsa); 256 | } 257 | } 258 | 259 | string_vector_free(&file_names); 260 | 261 | size_t n = dsa_size(&dsa); 262 | 263 | if (n < 2) { 264 | errx(1, 265 | "I am truly sorry, but with less than two sequences (%zu given) " 266 | "there is nothing to compare.", 267 | n); 268 | } 269 | 270 | RNG = gsl_rng_alloc(gsl_rng_default); 271 | if (!RNG) { 272 | err(1, "RNG allocation failed."); 273 | } 274 | 275 | // seed the random number generator with the current time 276 | // TODO: enable seeding for reproducibility 277 | gsl_rng_set(RNG, time(NULL)); 278 | 279 | // Warn about non ACGT residues. 
280 | if (FLAGS & F_NON_ACGT) { 281 | warnx("The input sequences contained characters other than acgtACGT. " 282 | "These were automatically stripped to ensure correct results."); 283 | } 284 | 285 | // validate sequence correctness 286 | const seq_t *seq = dsa_data(&dsa); 287 | for (size_t i = 0; i < n; ++i, seq++) { 288 | if ((FLAGS & F_TRUNCATE_NAMES) && strlen(seq->name) > 10) { 289 | warnx("The sequence name '%s' is longer than ten characters. It " 290 | "will be truncated in the output to '%.10s'.", 291 | seq->name, seq->name); 292 | } 293 | 294 | const size_t LENGTH_LIMIT = (INT_MAX - 1) / 2; 295 | if (seq->len > LENGTH_LIMIT) { 296 | errx(1, "The sequence %s is too long. The technical limit is %zu.", 297 | seq->name, LENGTH_LIMIT); 298 | } 299 | 300 | if (seq->len == 0) { 301 | errx(1, "The sequence %s is empty.", seq->name); 302 | } 303 | 304 | if (seq->len < 1000) { 305 | FLAGS |= F_SHORT; 306 | } 307 | } 308 | 309 | if (FLAGS & F_SHORT) { 310 | soft_errx( 311 | "One of the given input sequences is shorter than a thousand " 312 | "nucleotides. This may result in inaccurate distances. Try an " 313 | "alignment instead."); 314 | } 315 | 316 | // determine whether to print a progress bar 317 | if (progress == P_AUTO) { 318 | progress = isatty(STDERR_FILENO) ? P_ALWAYS : P_NEVER; 319 | } 320 | if (progress == P_ALWAYS) { 321 | FLAGS |= F_PRINT_PROGRESS; 322 | } 323 | 324 | // compute distance matrix 325 | calculate_distances(dsa_data(&dsa), n); 326 | 327 | dsa_free(&dsa); 328 | gsl_rng_free(RNG); 329 | 330 | return FLAGS & F_SOFT_ERROR ? EXIT_FAILURE : EXIT_SUCCESS; 331 | } 332 | 333 | /** 334 | * @brief Prints the usage and then exits. 335 | */ 336 | void usage(int status) { 337 | const char str[] = { 338 | "Usage: andi [OPTIONS...] FILES...\n" 339 | "\tFILES... 
can be any sequence of FASTA files.\n" 340 | "\tUse '-' as file name to read from stdin.\n" 341 | "Options:\n" 342 | " -b, --bootstrap=INT Print additional bootstrap matrices\n" 343 | " --file-of-filenames=FILE Read additional filenames from FILE; " 344 | "one per line\n" 345 | " -j, --join Treat all sequences from one file as a single " 346 | "genome\n" 347 | " -l, --low-memory Use less memory at the cost of speed\n" 348 | " -m, --model=MODEL Pick an evolutionary model of 'Raw', 'JC', " 349 | "'Kimura', 'LogDet'; default: JC\n" 350 | " -p FLOAT Significance of an anchor; default: 0.025\n" 351 | " --progress=WHEN Print a progress bar 'always', 'never', or " 352 | "'auto'; default: auto\n" 353 | #ifdef _OPENMP 354 | " -t, --threads=INT Set the number of threads; by default, all " 355 | "processors are used\n" 356 | #endif 357 | " --truncate-names Truncate names to ten characters\n" 358 | " -v, --verbose Prints additional information\n" 359 | " -h, --help Display this help and exit\n" 360 | " --version Output version information and " 361 | "acknowledgments\n"}; 362 | 363 | fprintf(status == EXIT_SUCCESS ? stdout : stderr, "%s", str); 364 | exit(status); 365 | } 366 | 367 | /** 368 | * @brief This function just prints the version string and then aborts 369 | * the program. It conforms to the [GNU Coding 370 | * Standard](http://www.gnu.org/prep/standards/html_node/_002d_002dversion.html#g_t_002d_002dversion). 371 | */ 372 | void version(void) { 373 | const char str[] = { 374 | "andi " VERSION "\n" 375 | "Copyright (C) 2014 - 2020 Fabian Klötzl\n" 376 | "License GPLv3+: GNU GPL version 3 or later " 377 | "\n" 378 | "This is free software: you are free to change and redistribute it.\n" 379 | "There is NO WARRANTY, to the extent permitted by law.\n\n" 380 | "Acknowledgments:\n" 381 | "1) Andi: Haubold, B. Klötzl, F. and Pfaffelhuber, P. (2015). 
" 382 | "Fast and accurate estimation of evolutionary distances between " 383 | "closely related genomes, Bioinformatics.\n" 384 | "2) Algorithms: Ohlebusch, E. (2013). Bioinformatics Algorithms. " 385 | "Sequence Analysis, Genome Rearrangements, and Phylogenetic " 386 | "Reconstruction. pp 118f.\n" 387 | "3) SA construction: Mori, Y. (2005). libdivsufsort, unpublished.\n" 388 | "4) Bootstrapping: Klötzl, F. and Haubold, B. (2016). Support Values " 389 | "for Genome Phylogenies, Life 6.1.\n"}; 390 | printf("%s", str); 391 | exit(EXIT_SUCCESS); 392 | } 393 | -------------------------------------------------------------------------------- /src/dist_hack.h: -------------------------------------------------------------------------------- 1 | /** @file 2 | * @brief This file is a preprocessor hack for the two functions `distMatrix` 3 | * and `distMatrixLM`. 4 | */ 5 | // clang-format off 6 | #ifdef FAST 7 | #define NAME distMatrix 8 | #define P_OUTER _Pragma("omp parallel for num_threads( THREADS) default(none) shared(progress_counter) firstprivate( stderr, M, sequences, n, print_progress)") 9 | #define P_INNER 10 | #else 11 | #undef NAME 12 | #undef P_OUTER 13 | #undef P_INNER 14 | #define NAME distMatrixLM 15 | #define P_OUTER 16 | #define P_INNER _Pragma("omp parallel for num_threads( THREADS) default(none) shared(progress_counter) firstprivate( stderr, M, sequences, n, print_progress, i, E, subject)") 17 | #endif 18 | // clang-format on 19 | 20 | /** @brief This function calls dist_andi for pairs of subjects and queries, and 21 | * thereby fills the distance matrix. 22 | * 23 | * This function is actually two functions. It is one template that gets 24 | * compiled into two functions via preprocessor hacks. The reason is DRY (Do not 25 | * Repeat Yourselves). 26 | * The two functions only differ by their name and pragmas; i.e. They run in 27 | * different parallel modes. 28 | * `distMatrix` is faster than `distMatrixLM` but needs more memory. 
29 | * 30 | * @param sequences - The sequences to compare 31 | * @param n - The number of sequences 32 | * @param M - A matrix for additional output data 33 | */ 34 | void NAME(struct model *M, const seq_t *sequences, size_t n) { 35 | size_t i; 36 | 37 | size_t progress_counter = 0; 38 | int print_progress = FLAGS & F_PRINT_PROGRESS; 39 | 40 | if (print_progress) { 41 | fprintf(stderr, "Comparing %zu sequences: %5.1f%% (%zu/%zu)", n, 0.0, 42 | (size_t)0, n * n - n); 43 | } 44 | 45 | //#pragma 46 | P_OUTER 47 | for (i = 0; i < n; i++) { 48 | seq_subject subject; 49 | esa_s E; 50 | 51 | if (seq_subject_init(&subject, &sequences[i]) || 52 | esa_init(&E, &subject)) { 53 | errx(1, "Failed to create index for %s.", sequences[i].name); 54 | } 55 | 56 | // now compare every other sequence to i 57 | size_t j; 58 | 59 | P_INNER 60 | for (j = 0; j < n; j++) { 61 | if (j == i) { 62 | M(i, j) = (struct model){.seq_len = 9, .counts = {9}}; 63 | continue; 64 | } 65 | 66 | size_t ql = sequences[j].len; 67 | 68 | M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject.threshold); 69 | 70 | #pragma omp atomic update 71 | progress_counter++; 72 | } 73 | 74 | if (print_progress) { 75 | size_t local_progress_counter; 76 | size_t num_comparisons = n * n - n; 77 | 78 | #pragma omp atomic read 79 | local_progress_counter = progress_counter; 80 | 81 | double progress = 82 | 100.0 * (double)local_progress_counter / num_comparisons; 83 | 84 | #pragma omp critical 85 | fprintf(stderr, "\rComparing %zu sequences: %5.1f%% (%zu/%zu)", n, 86 | progress, local_progress_counter, num_comparisons); 87 | } 88 | 89 | esa_free(&E); 90 | seq_subject_free(&subject); 91 | } 92 | 93 | if (print_progress) { 94 | fprintf(stderr, ", done.\n"); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/esa.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * @brief ESA functions 4 | * 5 | * This file contains 
/** @brief Map a code to the character.
 *
 * Only the two lowest bits of `code` are looked at.
 *
 * @param code - The two bit code of a nucleotide.
 * @returns 'A', 'C', 'G' or 'T' for the codes 0, 1, 2 and 3.
 */
char code2char(ssize_t code) {
	static const char nucleotides[4] = {'A', 'C', 'G', 'T'};
	return nucleotides[code & 0x3];
}
*/ 49 | ssize_t char2code(const char c) { 50 | ssize_t result = -1; 51 | switch (c) { 52 | case 'A': result = 0; break; 53 | case 'C': result = 1; break; 54 | case 'G': result = 2; break; 55 | case 'T': result = 3; break; 56 | } 57 | return result; 58 | } 59 | 60 | #define R(CLD, i) ((CLD)[(i)]) 61 | #define L(CLD, i) ((CLD)[(i)-1]) 62 | 63 | /** @brief Fills the LCP-Interval cache. 64 | * 65 | * Traversing the virtual suffix tree, created by SA, LCP and CLD is rather 66 | * slow. Hence we create a cache, holding the LCP-interval for a prefix of a 67 | * certain length ::CACHE_LENGTH. This function it the entry point for the 68 | * cache filling routine. 69 | * 70 | * @param self - The ESA. 71 | * @returns 0 iff successful 72 | */ 73 | int esa_init_cache(esa_s *self) { 74 | lcp_inter_t *cache = malloc((1 << (2 * CACHE_LENGTH)) * sizeof(*cache)); 75 | CHECK_MALLOC(cache); 76 | 77 | self->cache = cache; 78 | 79 | char str[CACHE_LENGTH + 1]; 80 | str[CACHE_LENGTH] = '\0'; 81 | 82 | saidx_t m = L(self->CLD, self->len); 83 | lcp_inter_t ij = {.i = 0, .j = self->len - 1, .m = m, .l = self->LCP[m]}; 84 | 85 | esa_init_cache_dfs(self, str, 0, ij); 86 | 87 | return 0; 88 | } 89 | 90 | /** @brief Fills the cache — one char at a time. 91 | * 92 | * This function is a depth first search on the virtual suffix tree and fills 93 | * the cache. Or rather it calls it self until some value to cache is 94 | * calculated. This function is a recursive version of get_interval but with 95 | * more edge cases. 96 | * 97 | * @param C - The ESA. 98 | * @param str - The current prefix. 99 | * @param pos - The length of the prefix. 100 | * @param in - The LCP-interval of prefix[0..pos-1]. 101 | */ 102 | void esa_init_cache_dfs(esa_s *C, char *str, size_t pos, const lcp_inter_t in) { 103 | // we are not yet done, but the current strings do not exist in the subject. 
104 | if (pos < CACHE_LENGTH && in.i == -1 && in.j == -1) { 105 | esa_init_cache_fill(C, str, pos, in); 106 | return; 107 | } 108 | 109 | // we are past the caching length 110 | if (pos >= CACHE_LENGTH) { 111 | esa_init_cache_fill(C, str, pos, in); 112 | return; 113 | } 114 | 115 | lcp_inter_t ij; 116 | 117 | // iterate over all nucleotides 118 | for (int code = 0; code < 4; ++code) { 119 | str[pos] = code2char(code); 120 | ij = get_interval(C, in, str[pos]); 121 | 122 | // fail early 123 | if (ij.i == -1 && ij.j == -1) { 124 | // if the current extension cannot be found, will with previous one 125 | esa_init_cache_fill(C, str, pos + 1, in); 126 | continue; 127 | } 128 | 129 | // singleton 130 | if (ij.i == ij.j) { 131 | // fix length 132 | ij.l = pos + 1; 133 | esa_init_cache_fill(C, str, pos + 1, ij); 134 | continue; 135 | } 136 | 137 | if (ij.l <= (ssize_t)(pos + 1)) { 138 | // Continue one level deeper 139 | // This is the usual case 140 | esa_init_cache_dfs(C, str, pos + 1, ij); 141 | continue; 142 | } 143 | 144 | // The LCP-interval is deeper than expected 145 | // Check if it still fits into the cache 146 | if ((size_t)ij.l >= CACHE_LENGTH) { 147 | // If the lcp-interval exceeds the cache depth, stop here and fill 148 | esa_init_cache_fill(C, str, pos + 1, in); 149 | continue; 150 | } 151 | 152 | /* At this point the prefix `str` of length `pos` has been found. 153 | * However, the call to `getInterval` above found an interval with 154 | * an LCP value bigger than `pos`. This means that not all elongations 155 | * (more precise: just one) of `str` appear in the subject. Thus fill 156 | * all values with the matched result to far and continue only with 157 | * the one special substring. 
158 | */ 159 | esa_init_cache_fill(C, str, pos + 1, in); 160 | 161 | char non_acgt = 0; 162 | 163 | // fast forward 164 | size_t k = pos + 1; 165 | for (; k < (size_t)ij.l; k++) { 166 | // In some very edgy edge cases the lcp-interval `ij` 167 | // contains a `;` or another non-acgt character. Since we 168 | // cannot cache those, break. 169 | char c = C->S[C->SA[ij.i] + k]; 170 | if (char2code(c) < 0) { 171 | non_acgt = 1; 172 | break; 173 | } 174 | 175 | str[k] = c; 176 | } 177 | 178 | // We are skipping intervals here. Maybe for each of them we should also 179 | // fill the cache. However, I haven't yet figured out how to do that 180 | // properly and whether it is worth it. 181 | 182 | if (non_acgt) { 183 | esa_init_cache_fill(C, str, k, ij); 184 | } else { 185 | esa_init_cache_dfs(C, str, k, ij); 186 | } 187 | } 188 | } 189 | 190 | /** @brief Fills the cache with a given value. 191 | * 192 | * Given a prefix and a value this function fills the cache beyond this point 193 | * the value. 194 | * 195 | * @param C - The ESA. 196 | * @param str - The current prefix. 197 | * @param pos - The length of the prefix. 198 | * @param in - The LCP-interval of prefix[0..pos-1]. 199 | */ 200 | void esa_init_cache_fill(esa_s *C, char *str, size_t pos, lcp_inter_t in) { 201 | if (pos < CACHE_LENGTH) { 202 | for (int code = 0; code < 4; ++code) { 203 | str[pos] = code2char(code); 204 | esa_init_cache_fill(C, str, pos + 1, in); 205 | } 206 | } else { 207 | ssize_t code = 0; 208 | for (size_t i = 0; i < CACHE_LENGTH; ++i) { 209 | code <<= 2; 210 | code |= char2code(str[i]); 211 | } 212 | 213 | C->cache[code] = in; 214 | } 215 | } 216 | 217 | /** 218 | * @brief Initializes the FVC (first variant character) array. 219 | * 220 | * The FVC is of my own invention and simply defined as 221 | * `FVC[i] = S[SA[i]+LCP[i]]`. This expression is constantly used in 222 | * get_interval. 
By precomputing the result, we have less memory 223 | * accesses, less cache misses, and thus improved runtimes of up to 15% 224 | * faster matching. This comes at a negligible cost of increased memory. 225 | * 226 | * @param self - The ESA 227 | * @returns 0 iff successful 228 | */ 229 | int esa_init_FVC(esa_s *self) { 230 | size_t len = self->len; 231 | 232 | char *FVC = self->FVC = malloc(len); 233 | CHECK_MALLOC(FVC); 234 | 235 | const char *S = self->S; 236 | const int *SA = self->SA; 237 | const int *LCP = self->LCP; 238 | 239 | FVC[0] = '\0'; 240 | for (size_t i = len; i; i--, FVC++, SA++, LCP++) { 241 | *FVC = S[*SA + *LCP]; 242 | } 243 | 244 | return 0; 245 | } 246 | 247 | /** @brief Initializes an ESA. 248 | * 249 | * This function initializes an ESA with respect to the provided sequence. 250 | * @param C - The ESA to initialize. 251 | * @param S - The sequence 252 | * @returns 0 iff successful 253 | */ 254 | int esa_init(esa_s *C, const seq_subject *S) { 255 | if (!C || !S || !S->RS) return 1; 256 | 257 | *C = (esa_s){.S = S->RS, .len = S->RSlen}; 258 | 259 | int result; 260 | 261 | result = esa_init_SA(C); 262 | if (result) return result; 263 | 264 | result = esa_init_LCP(C); 265 | if (result) return result; 266 | 267 | result = esa_init_CLD(C); 268 | if (result) return result; 269 | 270 | result = esa_init_FVC(C); 271 | if (result) return result; 272 | 273 | result = esa_init_cache(C); 274 | if (result) return result; 275 | 276 | return 0; 277 | } 278 | 279 | /** @brief Free the private data of an ESA. */ 280 | void esa_free(esa_s *self) { 281 | free(self->SA); 282 | free(self->LCP); 283 | free(self->CLD); 284 | free(self->cache); 285 | free(self->FVC); 286 | *self = (esa_s){}; 287 | } 288 | 289 | /** 290 | * Computes the SA given a string S. To do so it uses libdivsufsort. 291 | * @param C The enhanced suffix array to use. Reads C->S, fills C->SA. 
292 | * @returns 0 iff successful 293 | */ 294 | int esa_init_SA(esa_s *C) { 295 | // assert c.S 296 | if (!C || !C->S) { 297 | return 1; 298 | } 299 | 300 | C->SA = malloc(C->len * sizeof(*C->SA)); 301 | CHECK_MALLOC(C->SA); 302 | 303 | return divsufsort((const unsigned char *)C->S, C->SA, C->len); 304 | } 305 | 306 | /** @brief Initializes the CLD (child) array. 307 | * 308 | * See Ohlebusch. 309 | * 310 | * @param C - The ESA 311 | */ 312 | int esa_init_CLD(esa_s *C) { 313 | if (!C || !C->LCP) { 314 | return 1; 315 | } 316 | saidx_t *CLD = C->CLD = malloc((C->len + 1) * sizeof(*CLD)); 317 | CHECK_MALLOC(CLD); 318 | 319 | const saidx_t *LCP = C->LCP; 320 | 321 | typedef struct pair_s { 322 | saidx_t idx, lcp; 323 | } pair_t; 324 | 325 | pair_t *stack = malloc((C->len + 1) * sizeof(*stack)); 326 | CHECK_MALLOC(stack); 327 | pair_t *top = stack; // points at the topmost filled element 328 | pair_t last; 329 | 330 | R(CLD, 0) = C->len; 331 | 332 | top->idx = 0; 333 | top->lcp = -1; 334 | 335 | // iterate over all elements 336 | for (size_t k = 1; k < (size_t)(C->len + 1); k++) { 337 | while (LCP[k] < top->lcp) { 338 | // top->lcp is a leaf 339 | last = *top--; 340 | 341 | // link all elements of same lcp value in a chain 342 | while (top->lcp == last.lcp) { 343 | R(CLD, top->idx) = last.idx; 344 | last = *top--; 345 | } 346 | 347 | // store the l-index of last 348 | if (LCP[k] < top->lcp) { 349 | R(CLD, top->idx) = last.idx; 350 | } else { 351 | L(CLD, k) = last.idx; 352 | } 353 | } 354 | 355 | // continue one level deeper 356 | top++; 357 | top->idx = k; 358 | top->lcp = LCP[k]; 359 | } 360 | 361 | free(stack); 362 | return 0; 363 | } 364 | 365 | /** 366 | * This function computed the LCP array, given the suffix array. Thereto it uses 367 | * a special `phi` array, which makes it slightly faster than the original 368 | * linear-time algorithm by Kasai et al. 369 | * 370 | * @param C The enhanced suffix array to compute the LCP from. 
371 | * @returns 0 iff successful 372 | */ 373 | int esa_init_LCP(esa_s *C) { 374 | const char *S = C->S; 375 | const saidx_t *SA = C->SA; 376 | saidx_t len = C->len; 377 | 378 | // Trivial safety checks 379 | if (!C || !S || !SA || len == 0) { 380 | return 1; 381 | } 382 | 383 | // Allocate new memory 384 | // The LCP array is one element longer than S. 385 | saidx_t *LCP = C->LCP = malloc((len + 1) * sizeof(*LCP)); 386 | CHECK_MALLOC(LCP); 387 | 388 | LCP[0] = -1; 389 | LCP[len] = -1; 390 | 391 | // Allocate temporary arrays 392 | saidx_t *PHI = malloc(len * sizeof(*PHI)); 393 | saidx_t *PLCP = PHI; 394 | CHECK_MALLOC(PHI); 395 | 396 | PHI[SA[0]] = -1; 397 | saidx_t k; 398 | ssize_t i; 399 | 400 | for (i = 1; i < len; i++) { 401 | PHI[SA[i]] = SA[i - 1]; 402 | } 403 | 404 | ssize_t l = 0; 405 | for (i = 0; i < len; i++) { 406 | k = PHI[i]; 407 | if (k != -1) { 408 | while (S[k + l] == S[i + l]) { 409 | l++; 410 | } 411 | PLCP[i] = l; 412 | l--; 413 | if (l < 0) l = 0; 414 | } else { 415 | PLCP[i] = -1; 416 | } 417 | } 418 | 419 | // unpermutate the LCP array 420 | for (i = 1; i < len; i++) { 421 | LCP[i] = PLCP[SA[i]]; 422 | } 423 | 424 | free(PHI); 425 | return 0; 426 | } 427 | 428 | /** @brief For the lcp-interval of string `w` compute the interval for `wa` 429 | * 430 | * Say, we already know the LCP-interval ij for a string `w`. Now we want to 431 | * check if `wa` may also be found in the ESA and thus in the subject. So we 432 | * look for the sub interval of `ij` in which all strings feature an `a` as 433 | * the next character. If such a sub interval is found, its boundaries are 434 | * returned. 435 | * 436 | * @param self - The ESA. 437 | * @param ij - The lcp-interval for `w`. 438 | * @param a - The next character. 439 | * @returns The lcp-interval one level deeper. 
440 | */ 441 | static lcp_inter_t get_interval(const esa_s *self, lcp_inter_t ij, char a) { 442 | saidx_t i = ij.i; 443 | saidx_t j = ij.j; 444 | 445 | const saidx_t *SA = self->SA; 446 | const saidx_t *LCP = self->LCP; 447 | const char *S = self->S; 448 | const saidx_t *CLD = self->CLD; 449 | const char *FVC = self->FVC; 450 | // check for singleton or empty interval 451 | if (i == j) { 452 | if (S[SA[i] + ij.l] != a) { 453 | ij.i = ij.j = -1; 454 | } 455 | return ij; 456 | } 457 | 458 | int m = ij.m; 459 | int l = ij.l; 460 | 461 | char c = S[SA[i] + l]; 462 | goto SoSueMe; 463 | 464 | do { 465 | c = FVC[i]; 466 | 467 | SoSueMe: 468 | if (c == a) { 469 | /* found ! */ 470 | 471 | if (i != m - 1) { 472 | // found interval contains >1 element 473 | saidx_t n = L(CLD, m); 474 | 475 | ij = (lcp_inter_t){.i = i, .j = m - 1, .m = n, .l = LCP[n]}; 476 | } else { 477 | // empty or singleton 478 | // doing L(CLD, m) is not valid in this case! 479 | ij = (lcp_inter_t){.i = i, .j = i, .m = -1, .l = LCP[i]}; 480 | } 481 | 482 | return ij; 483 | } 484 | 485 | if (c > a) { 486 | break; 487 | } 488 | 489 | i = m; 490 | 491 | if (i == j) { 492 | break; // singleton interval, or `a` not found 493 | } 494 | 495 | m = R(CLD, m); 496 | } while (/*m != "bottom" && */ LCP[m] == l); 497 | 498 | // final sanity check 499 | if (i != ij.i ? FVC[i] == a : S[SA[i] + l] == a) { 500 | ij.i = i; 501 | ij.j = j; 502 | /* Also return the length of the LCP interval including `a` and 503 | * possibly even more characters. Note: l + 1 <= LCP[m] */ 504 | ij.l = LCP[m]; 505 | ij.m = m; 506 | } else { 507 | ij.i = ij.j = -1; 508 | } 509 | 510 | return ij; 511 | } 512 | 513 | /** @brief Compute the longest match of a query with the subject. 514 | * 515 | * The *longest match* is the core concept of `andi`. Its simply defined as the 516 | * longest prefix of a query Q appearing anywhere in the subject S. 
Talking 517 | * about genetic sequences, a match is a homologous region, likely followed by a 518 | * SNP. 519 | * 520 | * This function returns the interval for where the longest match of the query 521 | * can be found in the ESA. Thereto it expects a starting interval for the 522 | * search. 523 | * 524 | * @param C - The enhanced suffix array for the subject. 525 | * @param query - The query sequence. 526 | * @param qlen - The length of the query. Should correspond to `strlen(query)`. 527 | * @param k - The starting index into the query. 528 | * @param ij - The LCP interval for the string `query[0..k]`. 529 | * @returns The LCP interval for the longest prefix. 530 | */ 531 | lcp_inter_t get_match_from(const esa_s *C, const char *query, size_t qlen, 532 | saidx_t k, lcp_inter_t ij) { 533 | 534 | if (ij.i == -1 && ij.j == -1) { 535 | return ij; 536 | } 537 | 538 | // fail early on singleton intervals. 539 | if (ij.i == ij.j) { 540 | 541 | // try to extend the match. See line 513 below. 542 | saidx_t p = C->SA[ij.i]; 543 | size_t k = ij.l; 544 | const char *S = (const char *)C->S; 545 | 546 | for (; k < qlen && S[p + k]; k++) { 547 | if (S[p + k] != query[k]) { 548 | ij.l = k; 549 | return ij; 550 | } 551 | } 552 | 553 | ij.l = k; 554 | return ij; 555 | } 556 | 557 | saidx_t l, i, j; 558 | 559 | lcp_inter_t res = ij; 560 | 561 | const saidx_t *SA = C->SA; 562 | const char *S = C->S; 563 | 564 | // Loop over the query until a mismatch is found 565 | do { 566 | // Get the subinterval for the next character. 567 | ij = get_interval(C, ij, query[k]); 568 | i = ij.i; 569 | j = ij.j; 570 | 571 | // If our match cannot be extended further, return. 
572 | if (i == -1 && j == -1) { 573 | res.l = k; 574 | return res; 575 | } 576 | 577 | res.i = ij.i; 578 | res.j = ij.j; 579 | 580 | l = qlen; 581 | if (i < j && ij.l < l) { 582 | /* Instead of making another look up we can use the LCP interval 583 | * calculated in get_interval */ 584 | l = ij.l; 585 | } 586 | 587 | // By definition, the kth letter of the query was matched. 588 | k++; 589 | 590 | // Extend the match 591 | for (int p = SA[i]; k < l; k++) { 592 | if (S[p + k] != query[k]) { 593 | res.l = k; 594 | return res; 595 | } 596 | } 597 | } while (k < (ssize_t)qlen); 598 | 599 | res.l = qlen; 600 | return res; 601 | } 602 | 603 | /** @brief Get a match. 604 | * 605 | * Given an ESA and a string Q find the longest prefix of Q that matches 606 | * somewhere in C. This search is done entirely via jumping around in the ESA, 607 | * and thus is slow. 608 | * 609 | * @param C - The ESA. 610 | * @param query - The query string — duh. 611 | * @param qlen - The length of the query. 612 | * @returns the lcp interval of the match. 613 | */ 614 | lcp_inter_t get_match(const esa_s *C, const char *query, size_t qlen) { 615 | // sanity checks 616 | if (!C || !query || !C->len || !C->SA || !C->LCP || !C->S || !C->CLD) { 617 | return (lcp_inter_t){-1, -1, -1, -1}; 618 | } 619 | 620 | saidx_t m = L(C->CLD, C->len); 621 | lcp_inter_t ij = {.i = 0, .j = C->len - 1, .m = m, .l = C->LCP[m]}; 622 | 623 | return get_match_from(C, query, qlen, 0, ij); 624 | } 625 | 626 | /** @brief Compute the LCP interval of a query. For a certain prefix length of 627 | * the query its LCP interval is retrieved from a cache. Hence this is faster 628 | * than the naive `get_match`. If the cache fails to provide a proper value, we 629 | * fall back to the standard search. 630 | * 631 | * @param C - The enhanced suffix array for the subject. 632 | * @param query - The query sequence. 633 | * @param qlen - The length of the query. Should correspond to `strlen(query)`. 
634 | * @returns The LCP interval for the longest prefix. 635 | */ 636 | lcp_inter_t get_match_cached(const esa_s *C, const char *query, size_t qlen) { 637 | if (qlen <= CACHE_LENGTH) return get_match(C, query, qlen); 638 | 639 | ssize_t offset = 0; 640 | for (size_t i = 0; i < CACHE_LENGTH && offset >= 0; i++) { 641 | offset <<= 2; 642 | offset |= char2code(query[i]); 643 | } 644 | 645 | if (offset < 0) { 646 | return get_match(C, query, qlen); 647 | } 648 | 649 | lcp_inter_t ij = C->cache[offset]; 650 | 651 | if (ij.i == -1 && ij.j == -1) { 652 | return get_match(C, query, qlen); 653 | } 654 | 655 | return get_match_from(C, query, qlen, ij.l, ij); 656 | } 657 | -------------------------------------------------------------------------------- /src/esa.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * @brief This header contains the declarations for functions in esa.c. 4 | * 5 | */ 6 | #ifndef _ESA_H_ 7 | #define _ESA_H_ 8 | 9 | #include "config.h" 10 | #include "sequence.h" 11 | #include 12 | #include 13 | 14 | /** 15 | * @brief Represents LCP-Intervals. 16 | * 17 | * This struct is used to represent LCP-intervals. The member `i` should 18 | * coincide with the lower bound whereas `j` is the upper bound. Both bounds 19 | * are inclusive. So if `i == j` the interval contains exactly one element, 20 | * namely `i`. To represent an empty interval please use `i == j == -1`. 21 | * Other variants, such as `i == j == -2` can be used to indicate an error. 22 | * The common prefix length is denoted by l and should always be non-negative. 23 | * Variables of this type are often called `ij`. 24 | */ 25 | typedef struct { 26 | /** @brief The common prefix length */ 27 | saidx_t l; 28 | /** @brief lower bound */ 29 | saidx_t i; 30 | /** @brief upper bound */ 31 | saidx_t j; 32 | /** The new middle. */ 33 | saidx_t m; 34 | } lcp_inter_t; 35 | 36 | /** 37 | * @brief The ESA type. 
38 | * 39 | * This structure holds arrays and objects associated with an enhanced 40 | * suffix array (ESA). 41 | */ 42 | typedef struct esa_s { 43 | /** The base string from which the ESA was generated. */ 44 | const char *S; 45 | /** The actual suffix array with indexes into S. */ 46 | saidx_t *SA; 47 | /** The LCP holds the number of letters up to which a suffix `S[SA[i]]` 48 | equals `S[SA[i-1]]`. Hence the name longest common prefix. For `i = 0` 49 | and `i = len` the LCP value is -1. */ 50 | saidx_t *LCP; 51 | /** The length of the string S. */ 52 | saidx_t len; 53 | /** A cache for lcp-intervals */ 54 | lcp_inter_t *cache; 55 | /** The FVC array holds the character after the LCP. */ 56 | char *FVC; 57 | /** This is the child array. */ 58 | saidx_t *CLD; 59 | } esa_s; 60 | 61 | lcp_inter_t get_match_cached(const esa_s *, const char *query, size_t qlen); 62 | lcp_inter_t get_match(const esa_s *, const char *query, size_t qlen); 63 | int esa_init(esa_s *, const seq_subject *S); 64 | void esa_free(esa_s *); 65 | 66 | #ifdef DEBUG 67 | 68 | char code2char(ssize_t code); 69 | 70 | #endif // DEBUG 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /src/global.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * @brief Global Definitions 4 | * 5 | * This file contains the declaration of global variables and 6 | * their related values. The actual definition is located in andi.c 7 | */ 8 | #ifndef _GLOBAL_H_ 9 | #define _GLOBAL_H_ 10 | #include 11 | 12 | #include "config.h" 13 | #include 14 | 15 | /** 16 | * The *global* variable ::FLAGS is used to set different options 17 | * for the execution of the program. Use `FLAGS & F_NAME` to check 18 | * if `F_NAME` was set. 19 | */ 20 | extern int FLAGS; 21 | 22 | /** 23 | * The *global* variable ::THREADS contains the number of threads the program 24 | * should use. 
25 | */ 26 | extern int THREADS; 27 | 28 | /** 29 | * The ::ANCHOR_P_VALUE represents the probability that an anchor will be found, 30 | * if the two sequences are unrelated. I.e. it is the p-value for H_0: random 31 | * sequences. Its value can be set using the `-p` switch. 32 | */ 33 | extern double ANCHOR_P_VALUE; 34 | 35 | /** 36 | * The number of matrices that should be bootstrapped. 37 | */ 38 | extern long unsigned int BOOTSTRAP; 39 | 40 | /** 41 | * A global random number generator. Has to be seedable. 42 | */ 43 | extern gsl_rng *RNG; 44 | 45 | /** 46 | * The evolutionary model. 47 | */ 48 | extern int MODEL; 49 | 50 | enum { M_RAW, M_JC, M_KIMURA, M_LOGDET }; 51 | 52 | /** 53 | * This enum contains the available flags. Please note that all 54 | * available options are a power of 2. 55 | */ 56 | enum { 57 | F_NONE = 0, 58 | F_TRUNCATE_NAMES = 1, 59 | F_VERBOSE = 2, 60 | F_EXTRA_VERBOSE = 4, 61 | F_NON_ACGT = 8, 62 | F_JOIN = 16, 63 | F_LOW_MEMORY = 32, 64 | F_SHORT = 64, 65 | F_PRINT_PROGRESS = 128, 66 | F_SOFT_ERROR = 256 67 | }; 68 | 69 | /** 70 | * @brief This macro is used to unify the checks for the return value of malloc. 71 | * 72 | * @param PTR - The pointer getting checked. 73 | */ 74 | #define CHECK_MALLOC(PTR) \ 75 | do { \ 76 | if (PTR == NULL) { \ 77 | err(errno, "Out of memory"); \ 78 | } \ 79 | } while (0) 80 | 81 | /** 82 | * @brief This macro is used to print a warning and make the program exit with 83 | * an failure exit code, later. 84 | */ 85 | #define soft_err(...) \ 86 | do { \ 87 | FLAGS |= F_SOFT_ERROR; \ 88 | warn(__VA_ARGS__); \ 89 | } while (0) 90 | 91 | /** 92 | * @brief This macro is used to print a warning and make the program exit with 93 | * an failure exit code, later. 94 | */ 95 | #define soft_errx(...) 
\ 96 | do { \ 97 | FLAGS |= F_SOFT_ERROR; \ 98 | warnx(__VA_ARGS__); \ 99 | } while (0) 100 | 101 | #endif 102 | -------------------------------------------------------------------------------- /src/io.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * @brief This file contains the definitions for various io methods. 4 | */ 5 | #define _GNU_SOURCE 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | #include "global.h" 18 | #include "io.h" 19 | 20 | /** 21 | * @brief Access an element. 22 | * @param sv - The base vector. 23 | * @param index - The element index to access. 24 | * @returns the string at position `index`. 25 | */ 26 | char *string_vector_at(struct string_vector *sv, size_t index) { 27 | return sv->data[index]; 28 | } 29 | 30 | /** 31 | * @brief Access the underlying buffer. 32 | * @param sv - The base vector. 33 | * @returns the underlying buffer. 34 | */ 35 | char **string_vector_data(struct string_vector *sv) { 36 | return sv->data; 37 | } 38 | 39 | /** 40 | * @brief Free all data. 41 | * @param sv - The base vector. 42 | */ 43 | void string_vector_free(struct string_vector *sv) { 44 | size_t i = 0; 45 | for (; i < sv->size; i++) { 46 | free(sv->data[i]); 47 | } 48 | free(sv->data); 49 | } 50 | 51 | /** 52 | * @brief Initialise the vector. 53 | * @param sv - The base vector. 54 | */ 55 | void string_vector_init(struct string_vector *sv) { 56 | sv->data = malloc(sizeof(*sv->data) * 4); 57 | CHECK_MALLOC(sv->data); 58 | 59 | sv->capacity = 4; 60 | sv->size = 0; 61 | } 62 | 63 | /** 64 | * @brief Adds a copy to the end of the vector. 65 | * @param sv - The base vector. 66 | * @param str - The new string to add. 
67 | */ 68 | void string_vector_push_back(struct string_vector *sv, const char *str) { 69 | string_vector_emplace_back(sv, strdup(str)); 70 | } 71 | 72 | /** 73 | * @brief Add a file name to the end of the vector, directly. 74 | * @param sv - The base vector. 75 | * @param str - The string to emplace. 76 | */ 77 | void string_vector_emplace_back(struct string_vector *sv, char *str) { 78 | if (sv->size < sv->capacity) { 79 | sv->data[sv->size++] = str; 80 | } else { 81 | char **ptr = reallocarray(sv->data, sv->capacity / 2, 3 * sizeof(*ptr)); 82 | CHECK_MALLOC(ptr); 83 | sv->data = ptr; 84 | sv->capacity = (sv->capacity / 2) * 3; 85 | sv->data[sv->size++] = str; 86 | } 87 | } 88 | 89 | /** 90 | * @brief Return the number of elements. 91 | * @param sv - The base vector. 92 | * @returns the size of the vector. 93 | */ 94 | size_t string_vector_size(const struct string_vector *sv) { 95 | return sv->size; 96 | } 97 | 98 | /** 99 | * @brief Read a *fof* and add its contents into a vector. 100 | * @param file_name - The file of file names aka. fof. 101 | * @param sv - The vector to add file names to. 102 | */ 103 | void read_into_string_vector(const char *file_name, struct string_vector *sv) { 104 | FILE *file = strcmp(file_name, "-") ? fopen(file_name, "r") : stdin; 105 | if (!file) { 106 | soft_err("%s", file_name); 107 | return; 108 | } 109 | 110 | while (1) { 111 | char *str = NULL; 112 | size_t buffer_size = 0; 113 | ssize_t check = getline(&str, &buffer_size, file); 114 | 115 | // EOF is set only *after* getline tried to read past it. 
116 | if (check == -1 && feof(file) != 0) { 117 | free(str); 118 | break; // EOF 119 | } 120 | 121 | if (check == -1) { 122 | soft_err("%s", file_name); 123 | break; 124 | } 125 | 126 | char *nl = strchr(str, '\n'); 127 | if (nl) { 128 | *nl = '\0'; // remove newline character 129 | } 130 | 131 | // ignore empty lines 132 | if (strlen(str) == 0) { 133 | free(str); 134 | continue; 135 | } 136 | 137 | string_vector_emplace_back(sv, str); 138 | } 139 | 140 | int check = fclose(file); 141 | if (check != 0) { 142 | soft_err("%s", file_name); 143 | } 144 | } 145 | 146 | /** 147 | * @brief Joins all sequences from a file into a single long sequence. 148 | * 149 | * Apart from reading all sequences from a file, this function also 150 | * merges them into one long sequence. 151 | * 152 | * "I didn't learn joined up handwriting for nothing, you know." 153 | * ~ Gilderoy Lockhart 154 | * 155 | * @param file_name - The name of the file to be used for reading. The name is 156 | * also used to infer the sequence name. 157 | * @param dsa - (output parameter) An array that holds found sequences. 158 | */ 159 | void read_fasta_join(const char *file_name, dsa_t *dsa) { 160 | if (!file_name || !dsa) return; 161 | 162 | dsa_t single; 163 | dsa_init(&single); 164 | read_fasta(file_name, &single); 165 | 166 | if (dsa_size(&single) == 0) { 167 | return; 168 | } 169 | 170 | seq_t joined = dsa_join(&single); 171 | 172 | /* In join mode we try to be clever about the sequence name. Given the file 173 | * path we extract just the file name. ie. path/file.ext -> file 174 | * This obviously fails on Windows. 175 | */ 176 | 177 | const char *left = strrchr(file_name, '/'); // find the last path separator 178 | left = (left == NULL) ? 
file_name : left + 1; 179 | // left is the position one of to the right of the path separator 180 | 181 | const char *dot = strchrnul(left, '.'); // find the extension 182 | 183 | // copy only the file name, not its path or extension 184 | joined.name = strndup(left, dot - left); 185 | CHECK_MALLOC(joined.name); 186 | 187 | dsa_push(dsa, joined); 188 | dsa_free(&single); 189 | } 190 | 191 | /** 192 | * @brief This function reads sequences from a file. 193 | * @param file_name - The file to read. 194 | * @param dsa - (output parameter) An array that holds found sequences. 195 | */ 196 | void read_fasta(const char *file_name, dsa_t *dsa) { 197 | if (!file_name || !dsa) return; 198 | 199 | int file_descriptor = 200 | strcmp(file_name, "-") ? open(file_name, O_RDONLY) : STDIN_FILENO; 201 | 202 | if (file_descriptor < 0) { 203 | soft_err("%s", file_name); 204 | return; 205 | } 206 | 207 | struct pfasta_parser pp = pfasta_init(file_descriptor); 208 | if (pp.errstr) { 209 | soft_errx("%s: %s", file_name, pp.errstr); 210 | goto fail; 211 | } 212 | 213 | seq_t top = {}; 214 | while (!pp.done) { 215 | struct pfasta_record pr = pfasta_read(&pp); 216 | if (pp.errstr) { 217 | soft_errx("%s: %s", file_name, pp.errstr); 218 | goto fail; 219 | } 220 | 221 | int check = seq_init(&top, pr.sequence, pr.name); 222 | 223 | // skip broken sequences 224 | if (check != 0) continue; 225 | 226 | dsa_push(dsa, top); 227 | pfasta_record_free(&pr); 228 | } 229 | 230 | fail: 231 | pfasta_free(&pp); 232 | close(file_descriptor); 233 | } 234 | 235 | /** 236 | * @brief Prints the distance matrix. 237 | * 238 | * This function pretty prints the distance matrix. For small distances 239 | * scientific notation is used. 240 | * 241 | * @param D - The distance matrix 242 | * @param sequences - An array of pointers to the sequences. 243 | * @param n - The number of sequences. 244 | * @param warnings - Print warnings? Set to 0 for bootstrapped matrices. 
245 | */ 246 | void print_distances(const struct model *D, const seq_t *sequences, size_t n, 247 | int warnings) { 248 | size_t i, j; 249 | int use_scientific = 0; 250 | 251 | double *DD = malloc(n * n * sizeof(*DD)); 252 | CHECK_MALLOC(DD); 253 | 254 | #define DD(X, Y) (DD[(X)*n + (Y)]) 255 | 256 | typedef double(estimate_fn)(const model *); 257 | estimate_fn *estimate; 258 | 259 | switch (MODEL) { 260 | case M_RAW: estimate = &estimate_RAW; break; 261 | default: 262 | /* intentional fall-through. This is just here to silence any 263 | * compiler warnings. The real default is set in andi.c.*/ 264 | case M_JC: estimate = &estimate_JC; break; 265 | case M_KIMURA: estimate = &estimate_KIMURA; break; 266 | case M_LOGDET: estimate = &estimate_LOGDET; break; 267 | } 268 | 269 | for (i = 0; i < n; i++) { 270 | for (j = 0; j < n; j++) { 271 | model datum = D(i, j); 272 | 273 | if (!(FLAGS & F_EXTRA_VERBOSE)) { 274 | datum = model_average(&D(i, j), &D(j, i)); 275 | } 276 | 277 | double dist = DD(i, j) = i == j ? 0.0 : estimate(&datum); 278 | 279 | if (dist > 0 && dist < 0.001) { 280 | use_scientific = 1; 281 | } 282 | 283 | if (isnan(dist) && warnings) { 284 | const char str[] = { 285 | "For the two sequences '%s' and '%s' the distance " 286 | "computation failed and is reported as nan. 
" 287 | "Please refer to the documentation for further details."}; 288 | soft_errx(str, sequences[i].name, sequences[j].name); 289 | } 290 | 291 | if (!isnan(dist) && i < j && warnings) { 292 | double coverage1 = model_coverage(&D(i, j)); 293 | double coverage2 = model_coverage(&D(j, i)); 294 | 295 | if (coverage1 < 0.2 || coverage2 < 0.2) { 296 | const char str[] = { 297 | "For the two sequences '%s' and '%s' very little " 298 | "homology was found (%f and %f, respectively)."}; 299 | soft_errx(str, sequences[i].name, sequences[j].name, 300 | coverage1, coverage2); 301 | } 302 | } 303 | } 304 | } 305 | 306 | printf("%zu\n", n); 307 | for (i = 0; i < n; i++) { 308 | // Print ten characters of the name. Pad with spaces, if 309 | // necessary. Truncate to exactly ten characters if requested by user. 310 | printf(FLAGS & F_TRUNCATE_NAMES ? "%-10.10s" : "%-10s", 311 | sequences[i].name); 312 | 313 | for (j = 0; j < n; j++) { 314 | // use scientific notation for small numbers 315 | printf(use_scientific ? " %1.4e" : " %1.4f", DD(i, j)); 316 | } 317 | printf("\n"); 318 | } 319 | 320 | free(DD); 321 | } 322 | 323 | /** 324 | * @brief Prints the coverage matrix. 325 | * @param D - The distance matrix 326 | * @param n - The number of sequences. 327 | */ 328 | void print_coverages(const struct model *D, size_t n) { 329 | size_t i, j; 330 | printf("\nCoverage:\n"); 331 | for (i = 0; i < n; i++) { 332 | for (j = 0; j < n; j++) { 333 | printf("%1.4e ", model_coverage(&D(i, j))); 334 | } 335 | printf("\n"); 336 | } 337 | } 338 | -------------------------------------------------------------------------------- /src/io.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * @brief This header contains function declarations for io procedures. 
4 | */ 5 | #ifndef _IO_H_ 6 | #define _IO_H_ 7 | 8 | #include "model.h" 9 | #include "sequence.h" 10 | #include 11 | #include 12 | #include 13 | 14 | /** 15 | * This is a neat hack for dealing with matrices. 16 | */ 17 | #define D(X, Y) (D[(X)*n + (Y)]) 18 | #define M(X, Y) (M[(X)*n + (Y)]) 19 | 20 | void read_fasta(const char *, dsa_t *dsa); 21 | void read_fasta_join(const char *, dsa_t *dsa); 22 | 23 | void print_distances(const struct model *, const seq_t *, size_t, int); 24 | void print_coverages(const struct model *, size_t); 25 | 26 | /** 27 | * @brief A dynamically growing structure for file_names. 28 | */ 29 | struct string_vector { 30 | char **data; 31 | size_t capacity, size; 32 | }; 33 | 34 | char *string_vector_at(struct string_vector *, size_t); 35 | char **string_vector_data(struct string_vector *); 36 | void string_vector_free(struct string_vector *); 37 | void string_vector_init(struct string_vector *); 38 | void string_vector_push_back(struct string_vector *, const char *); 39 | void string_vector_emplace_back(struct string_vector *, char *); 40 | size_t string_vector_size(const struct string_vector *); 41 | 42 | void read_into_string_vector(const char *, struct string_vector *); 43 | 44 | #endif // _IO_H_ 45 | -------------------------------------------------------------------------------- /src/model.c: -------------------------------------------------------------------------------- 1 | /** @file 2 | * @brief This file contains all functions for the mutation matrix and the 3 | * estimation of evolutionary distances thereof. 4 | */ 5 | 6 | #include "model.h" 7 | #include "global.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | /** 14 | * @brief Sum some mutation count specified by `summands`. Intended to be used 15 | * through the `model_sum` macro. 16 | * 17 | * @param MM - The mutation matrix. 18 | * @param summands - The mutations to add. 19 | * @returns The sum of mutations. 
20 | */ 21 | static size_t model_sum_types(const model *MM, const int summands[]) { 22 | size_t total = 0; 23 | for (int i = 0; summands[i] != MUTCOUNTS; ++i) { 24 | total += MM->counts[summands[i]]; 25 | } 26 | return total; 27 | } 28 | 29 | #define model_sum(MM, ...) \ 30 | model_sum_types((MM), (int[]){__VA_ARGS__, MUTCOUNTS}) 31 | 32 | /** 33 | * @brief Average two mutation matrices. 34 | * 35 | * @param MM - One matrix 36 | * @param NN - Second matrix 37 | * @returns The average (sum) of two mutation matrices. 38 | */ 39 | model model_average(const model *MM, const model *NN) { 40 | model ret = *MM; 41 | for (int i = 0; i != MUTCOUNTS; ++i) { 42 | ret.counts[i] += NN->counts[i]; 43 | } 44 | ret.seq_len += NN->seq_len; 45 | return ret; 46 | } 47 | 48 | /** 49 | * @brief Compute the total number of nucleotides in the pairwise alignment. 50 | * 51 | * @param MM - The mutation matrix. 52 | * @returns The length of the alignment. 53 | */ 54 | size_t model_total(const model *MM) { 55 | size_t total = 0; 56 | for (size_t i = 0; i < MUTCOUNTS; ++i) { 57 | total += MM->counts[i]; 58 | } 59 | return total; 60 | } 61 | 62 | /** 63 | * @brief Compute the coverage of an alignment. 64 | * 65 | * @param MM - The mutation matrix. 66 | * @returns The relative coverage 67 | */ 68 | double model_coverage(const model *MM) { 69 | size_t covered = model_total(MM); 70 | size_t actual = MM->seq_len; 71 | 72 | return (double)covered / (double)actual; 73 | } 74 | 75 | /** 76 | * @brief Estimate the uncorrected distance of a pairwise alignment. 77 | * 78 | * @param MM - The mutation matrix. 79 | * @returns The uncorrected substitution rate. 80 | */ 81 | double estimate_RAW(const model *MM) { 82 | size_t nucl = model_total(MM); 83 | size_t SNPs = model_sum(MM, AtoC, AtoG, AtoT, CtoA, CtoG, CtoT, GtoA, GtoC, 84 | GtoT, TtoA, TtoC, TtoG); 85 | 86 | // Insignificant results. All abort the fail train. 
87 | if (nucl <= 3) { 88 | return NAN; 89 | } 90 | 91 | return (double)SNPs / (double)nucl; 92 | } 93 | 94 | /** 95 | * @brief Compute the Jukes-Cantor distance. 96 | * 97 | * @param MM - The mutation matrix. 98 | * @returns The corrected JC distance. 99 | */ 100 | double estimate_JC(const model *MM) { 101 | double dist = estimate_RAW(MM); 102 | dist = -0.75 * log(1.0 - (4.0 / 3.0) * dist); // jukes cantor 103 | 104 | // fix negative zero 105 | return dist <= 0.0 ? 0.0 : dist; 106 | } 107 | 108 | /** @brief computes the evolutionary distance using K80. 109 | * 110 | * @param MM - The mutation matrix. 111 | * @returns The corrected Kimura distance. 112 | */ 113 | double estimate_KIMURA(const model *MM) { 114 | size_t nucl = model_total(MM); 115 | size_t transitions = model_sum(MM, AtoG, GtoA, CtoT, TtoC); 116 | size_t transversions = 117 | model_sum(MM, AtoC, CtoA, AtoT, TtoA, GtoC, CtoG, GtoT, TtoG); 118 | 119 | double P = (double)transitions / (double)nucl; 120 | double Q = (double)transversions / (double)nucl; 121 | 122 | double tmp = 1.0 - 2.0 * P - Q; 123 | double dist = -0.25 * log((1.0 - 2.0 * Q) * tmp * tmp); 124 | 125 | // fix negative zero 126 | return dist <= 0.0 ? 0.0 : dist; 127 | } 128 | 129 | /** @brief computes the evolutionary distance using LogDet. 130 | * 131 | * The LogDet distance between sequence X and and sequence Y 132 | * is given as 133 | * 134 | * -(1 / K) * (log(det(Fxy)) - 0.5 * log(det(Fxx * Fyy))) 135 | * 136 | * Where K is the number of character states, Fxy is the site-pattern 137 | * frequency matrix, and diagonal matrices Fxx and Fyy give the 138 | * frequencies of the different character states in sequences X and Y. 139 | * 140 | * Each i,j-th entry in Fxy is the proportion of homologous sites 141 | * where sequences X and Y have character states i and j, respectively. 
142 | * 143 | * For our purposes, X is the Subject (From) sequence and Y is the 144 | * Query (To) sequence and matrix Fxy looks like 145 | * 146 | * To A C G T 147 | * From 148 | * A ( ) 149 | * C ( ) 150 | * G ( ) 151 | * T ( ) 152 | * 153 | * @param MM - The mutation matrix. 154 | * @returns The LogDet distance. 155 | */ 156 | double estimate_LOGDET(const model *MM) { 157 | 158 | double nucl = (double)model_total(MM); 159 | double P[MUTCOUNTS]; 160 | for (int i = 0; i < MUTCOUNTS; i++) { 161 | P[i] = MM->counts[i] / nucl; 162 | } 163 | 164 | double logDetFxxFyy = 165 | // log determinant of diagonal matrix of row sums 166 | log(model_sum(MM, AtoA, AtoC, AtoG, AtoT) / nucl) + 167 | log(model_sum(MM, CtoA, CtoC, CtoG, CtoT) / nucl) + 168 | log(model_sum(MM, GtoA, GtoC, GtoG, GtoT) / nucl) + 169 | log(model_sum(MM, TtoA, TtoC, TtoG, TtoT) / nucl) + 170 | // log determinant of diagonal matrix of column sums 171 | log(model_sum(MM, AtoA, CtoA, GtoA, TtoA) / nucl) + 172 | log(model_sum(MM, AtoC, CtoC, GtoC, TtoC) / nucl) + 173 | log(model_sum(MM, AtoG, CtoG, GtoG, TtoG) / nucl) + 174 | log(model_sum(MM, AtoT, CtoT, GtoT, TtoT) / nucl); 175 | 176 | // determinant of the site-pattern frequency matrix 177 | double detFxy = 178 | P[AtoA] * P[CtoC] * (P[GtoG] * P[TtoT] - P[TtoG] * P[GtoT]) - 179 | P[AtoA] * P[CtoG] * (P[GtoC] * P[TtoT] - P[TtoC] * P[GtoT]) + 180 | P[AtoA] * P[CtoT] * (P[GtoC] * P[TtoG] - P[TtoC] * P[GtoG]) - 181 | 182 | P[AtoC] * P[CtoA] * (P[GtoG] * P[TtoT] - P[TtoG] * P[GtoT]) + 183 | P[AtoC] * P[CtoG] * (P[GtoA] * P[TtoT] - P[TtoA] * P[GtoT]) - 184 | P[AtoC] * P[CtoT] * (P[GtoA] * P[TtoG] - P[TtoA] * P[GtoG]) + 185 | 186 | P[AtoG] * P[CtoA] * (P[GtoC] * P[TtoT] - P[TtoC] * P[GtoT]) - 187 | P[AtoG] * P[CtoC] * (P[GtoA] * P[TtoT] - P[TtoA] * P[GtoT]) + 188 | P[AtoG] * P[CtoT] * (P[GtoA] * P[TtoC] - P[TtoA] * P[GtoC]) - 189 | 190 | P[AtoT] * P[CtoA] * (P[GtoC] * P[TtoG] - P[TtoC] * P[GtoG]) + 191 | P[AtoT] * P[CtoC] * (P[GtoA] * P[TtoG] - P[TtoA] * 
P[GtoG]) - 192 | P[AtoT] * P[CtoG] * (P[GtoA] * P[TtoC] - P[TtoA] * P[GtoC]); 193 | 194 | double dist = -0.25 * (log(detFxy) - 0.5 * logDetFxxFyy); 195 | 196 | // fix negative zero 197 | return dist <= 0.0 ? 0.0 : dist; 198 | } 199 | 200 | /** @brief Bootstrap a mutation matrix. 201 | * 202 | * The classical bootstrapping process, as described by Felsenstein, resamples 203 | * all nucleotides of a MSA. As andi only computes a pairwise alignment, this 204 | * process boils down to a simple multinomial distribution. We just have to 205 | * resample the elements of the mutation matrix. See Klötzl & Haubold (2016) 206 | * for details. http://www.mdpi.com/2075-1729/6/1/11/htm 207 | * 208 | * @param datum - The original mutation matrix. 209 | * @returns A bootstrapped mutation matrix. 210 | */ 211 | model model_bootstrap(model datum) { 212 | size_t nucl = model_total(&datum); 213 | double p[MUTCOUNTS]; 214 | for (size_t i = 0; i < MUTCOUNTS; ++i) { 215 | p[i] = datum.counts[i] / (double)nucl; 216 | } 217 | 218 | gsl_ran_multinomial(RNG, MUTCOUNTS, nucl, p, datum.counts); 219 | 220 | return datum; 221 | } 222 | 223 | /** 224 | * @brief Given an anchor, classify nucleotides. 225 | * 226 | * For anchors we already know that the nucleotides of the subject and the query 227 | * are equal. Thus only one sequence has to be analysed. Most models don't 228 | * actually care about the individual nucleotides as long as they are equal in 229 | * the two sequences. For these models, we just assume equal distribution. 
230 | * 231 | * @param MM - The mutation matrix 232 | * @param S - The subject 233 | * @param len - The anchor length 234 | */ 235 | void model_count_equal(model *MM, const char *S, size_t len) { 236 | if (MODEL == M_RAW || MODEL == M_JC || MODEL == M_KIMURA) { 237 | size_t fourth = len / 4; 238 | MM->counts[AtoA] += fourth; 239 | MM->counts[CtoC] += fourth; 240 | MM->counts[GtoG] += fourth; 241 | MM->counts[TtoT] += fourth + (len & 3); 242 | return; 243 | } 244 | 245 | // Fall-back algorithm for future models. Note, as this works on a 246 | // per-character basis it is slow. 247 | 248 | size_t local_counts[4] = {0}; 249 | 250 | for (; len--;) { 251 | char s = *S++; 252 | 253 | // ';!#' are all smaller than 'A' 254 | if (s < 'A') { 255 | // Technically, s can only be ';' at this point. 256 | continue; 257 | } 258 | 259 | // The four canonical nucleotides can be uniquely identified by the bits 260 | // 0x6: A -> 0, C → 1, G → 3, T → 2. Thus the order below is changed. 261 | local_counts[(s >> 1) & 3]++; 262 | } 263 | 264 | MM->counts[AtoA] += local_counts[0]; 265 | MM->counts[CtoC] += local_counts[1]; 266 | MM->counts[GtoG] += local_counts[3]; 267 | MM->counts[TtoT] += local_counts[2]; 268 | } 269 | 270 | /** @brief Convert a nucleotide to a 2bit representation. 271 | * 272 | * We want to map characters: 273 | * A → 0 274 | * C → 1 275 | * G → 2 276 | * T → 3 277 | * The trick used below is that the three lower bits of the 278 | * characters are unique. Thus, they can be used to compute the mapping 279 | * above. The mapping itself is done via tricky bitwise operations. 280 | * 281 | * @param c - input nucleotide 282 | * @returns 2bit representation. 283 | */ 284 | char nucl2bit(unsigned char c) { 285 | c &= 6; 286 | c ^= c >> 1; 287 | return c >> 1; 288 | } 289 | 290 | /** 291 | * @brief Count the substitutions and add them to the mutation matrix. 292 | * 293 | * @param MM - The mutation matrix. 
294 | * @param S - The subject 295 | * @param Q - The query 296 | * @param len - The length of the alignment 297 | */ 298 | void model_count(model *MM, const char *S, const char *Q, size_t len) { 299 | size_t local_counts[MUTCOUNTS] = {0}; 300 | 301 | for (size_t i = 0; i < len; S++, Q++, i++) { 302 | char s = *S; 303 | char q = *Q; 304 | 305 | // Skip special characters. 306 | if (s < 'A' || q < 'A') { 307 | continue; 308 | } 309 | 310 | // Pick the correct two bits representing s and q. 311 | unsigned char foo = nucl2bit(s); 312 | unsigned char bar = nucl2bit(q); 313 | 314 | /* 315 | * Finally, we want to map the indices to the correct mutation. This is 316 | * done by utilising the mutation types in model.h. 317 | */ 318 | unsigned int index = (foo << 2) + bar; 319 | 320 | local_counts[index]++; 321 | } 322 | 323 | for (int i = 0; i != MUTCOUNTS; ++i) { 324 | MM->counts[i] += local_counts[i]; 325 | } 326 | } 327 | -------------------------------------------------------------------------------- /src/model.h: -------------------------------------------------------------------------------- 1 | /** @file 2 | * @brief This header contains all structures and prototypes for creating a 3 | * mutation matrix and estimating distances trough an evolutionary model 4 | * thereof. 5 | */ 6 | #pragma once 7 | 8 | #include 9 | 10 | /** 11 | * This enum contains all possible mutations. The total number 12 | * of different possible mutations is MUTCOUNTS. 13 | */ 14 | enum { 15 | AtoA, 16 | AtoC, 17 | AtoG, 18 | AtoT, 19 | CtoA, 20 | CtoC, 21 | CtoG, 22 | CtoT, 23 | GtoA, 24 | GtoC, 25 | GtoG, 26 | GtoT, 27 | TtoA, 28 | TtoC, 29 | TtoG, 30 | TtoT, 31 | MUTCOUNTS 32 | }; 33 | 34 | /** @brief The mutation matrix. 35 | * 36 | * We need to keep track of the different types of mutations between two 37 | * sequences. For this the following matrix is filled. 
38 | * 39 | * To A C G T 40 | * From 41 | * A ( ) 42 | * C ( ) 43 | * G ( ) 44 | * T ( ) 45 | * 46 | * The cells are absolute counts. Together with seq_len (the query length), 47 | * we can deduce the substitution rate and coverage. 48 | * 49 | * As libdivsufsort is 32 bit the sequence length is limited to (INT_MAX-1)/2. 50 | * We can thus use the same limit for the counts. 51 | */ 52 | typedef struct model { 53 | /** The absolute counts of mutation types. */ 54 | unsigned int counts[MUTCOUNTS]; 55 | /** The query length. */ 56 | unsigned int seq_len; 57 | } model; 58 | 59 | void model_count_equal(model *, const char *, size_t); 60 | void model_count(model *, const char *, const char *, size_t); 61 | model model_average(const model *, const model *); 62 | double model_coverage(const model *); 63 | double estimate_RAW(const model *); 64 | double estimate_JC(const model *); 65 | double estimate_KIMURA(const model *); 66 | double estimate_LOGDET(const model *); 67 | model model_bootstrap(model); 68 | -------------------------------------------------------------------------------- /src/process.c: -------------------------------------------------------------------------------- 1 | 2 | 3 | /** 4 | * @file 5 | * @brief This file contains various distance methods. 6 | */ 7 | #include "process.h" 8 | #include "esa.h" 9 | #include "global.h" 10 | #include "io.h" 11 | #include "model.h" 12 | #include "sequence.h" 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #ifdef _OPENMP 19 | #include 20 | #endif 21 | 22 | int calculate_bootstrap(const struct model *M, const seq_t *sequences, 23 | size_t n); 24 | 25 | typedef _Bool bool; 26 | #define false 0 27 | #define true !false 28 | 29 | /** 30 | * @brief This structure captures properties of an anchor. 31 | */ 32 | struct anchor { 33 | /** The position on the subject. */ 34 | size_t pos_S; 35 | /** The position on the query. */ 36 | size_t pos_Q; 37 | /** The length of the exact match. 
*/ 38 | size_t length; 39 | }; 40 | 41 | /** 42 | * @brief This is a structure of assorted variables needed for anchor finding. 43 | */ 44 | struct context { 45 | const esa_s *C; 46 | const char *query; 47 | size_t query_length; 48 | size_t threshold; 49 | }; 50 | 51 | /** 52 | * @brief Compute the length of the longest common prefix of two strings. 53 | * 54 | * @param S - One string. 55 | * @param Q - Another string. 56 | * @param remaining - The length of one of the strings. 57 | * @returns the length of the lcp. 58 | */ 59 | static inline size_t lcp(const char *S, const char *Q, size_t remaining) { 60 | size_t length = 0; 61 | while (length < remaining && S[length] == Q[length]) { 62 | length++; 63 | } 64 | return length; 65 | } 66 | 67 | /** 68 | * @brief Check whether the last anchor can be extended by a lucky anchor. 69 | * 70 | * Anchors are defined to be unique and of a minimum length. The uniqueness 71 | * requires us to search through the suffix array for a second appearance of the 72 | * anchor. However, if a left anchor is already unique, we could be sloppy and 73 | * drop the uniqueness criterion for the second anchor. This way we can skip the 74 | * lookup and just compare characters directly. However, for a lucky anchor the 75 | * match still has to be at least as long as the threshold. 76 | * 77 | * @param ctx - Matching context of various variables. 78 | * @param last_match - The last anchor. 79 | * @param this_match - Input/Output variable for the current match. 80 | * @returns true iff the current match is a lucky anchor.
81 | */ 82 | static inline bool lucky_anchor(const struct context *ctx, 83 | const struct anchor *last_match, 84 | struct anchor *this_match) { 85 | 86 | size_t advance = this_match->pos_Q - last_match->pos_Q; 87 | size_t gap = this_match->pos_Q - last_match->pos_Q - last_match->length; 88 | 89 | size_t try_pos_S = last_match->pos_S + advance; 90 | if (try_pos_S >= (size_t)ctx->C->len || gap > ctx->threshold) { 91 | return false; 92 | } 93 | 94 | this_match->pos_S = try_pos_S; 95 | this_match->length = 96 | lcp(ctx->query + this_match->pos_Q, ctx->C->S + try_pos_S, 97 | ctx->query_length - this_match->pos_Q); 98 | 99 | return this_match->length >= ctx->threshold; 100 | } 101 | 102 | /** 103 | * @brief Check for a new anchor. 104 | * 105 | * Given the current context and starting position check if the new match is an 106 | * anchor. The latter requires uniqueness and a certain minimum length. 107 | * 108 | * @param ctx - Matching context of various variables. 109 | * @param last_match - (unused) 110 | * @param this_match - Input/Output variable for the current match. 111 | * @returns true iff an anchor was found. 112 | */ 113 | static inline bool anchor(const struct context *ctx, 114 | const struct anchor *last_match, 115 | struct anchor *this_match) { 116 | 117 | lcp_inter_t inter = get_match_cached(ctx->C, ctx->query + this_match->pos_Q, 118 | ctx->query_length - this_match->pos_Q); 119 | 120 | this_match->pos_S = ctx->C->SA[inter.i]; 121 | this_match->length = inter.l <= 0 ? 0 : inter.l; 122 | return inter.i == inter.j && this_match->length >= ctx->threshold; 123 | } 124 | 125 | /** 126 | * @brief Divergence estimation using the anchor technique. 127 | * 128 | * The dist_anchor() function estimates the divergence between two 129 | * DNA sequences. The subject is given as an ESA, whereas the query 130 | * is a simple string. This function then looks for *anchors* -- long 131 | * substrings that exist in both sequences. 
Then it manually checks for 132 | * mutations between those anchors. 133 | * 134 | * @param C - The enhanced suffix array of the subject. 135 | * @param query - The actual query string. 136 | * @param query_length - The length of the query string. Needed for speed 137 | * reasons. 138 | * @param threshold - Minimal length for an anchor. 139 | * @returns A matrix with estimates of base substitutions. 140 | */ 141 | model dist_anchor(const esa_s *C, const char *query, size_t query_length, 142 | size_t threshold) { 143 | struct model ret = {.seq_len = query_length, .counts = {0}}; 144 | 145 | struct anchor this_match = {0}; 146 | struct anchor last_match = {0}; 147 | bool last_was_right_anchor = false; 148 | size_t border = C->len / 2; 149 | 150 | struct context ctx = {C, query, query_length, threshold}; 151 | 152 | // Iterate over the complete query. 153 | while (this_match.pos_Q < query_length) { 154 | 155 | // Check for lucky anchors and fall back to normal strategy. 156 | if (lucky_anchor(&ctx, &last_match, &this_match) || 157 | anchor(&ctx, &last_match, &this_match)) { 158 | // We have reached a new anchor. 159 | 160 | size_t end_S = last_match.pos_S + last_match.length; 161 | size_t end_Q = last_match.pos_Q + last_match.length; 162 | // Check if this can be a right anchor to the last one. 163 | if (this_match.pos_S > end_S && 164 | this_match.pos_Q - end_Q == this_match.pos_S - end_S && 165 | (this_match.pos_S < border) == (last_match.pos_S < border)) { 166 | 167 | // classify nucleotides in the left qanchor 168 | model_count_equal(&ret, query + last_match.pos_Q, 169 | last_match.length); 170 | 171 | // Count the SNPs in between. 172 | model_count(&ret, C->S + end_S, query + end_Q, 173 | this_match.pos_Q - end_Q); 174 | last_was_right_anchor = true; 175 | } else { 176 | if (last_was_right_anchor) { 177 | // If the last was a right anchor, but with the current one, 178 | // we cannot extend, then add its length. 
179 | model_count_equal(&ret, query + last_match.pos_Q, 180 | last_match.length); 181 | } else if (last_match.length >= threshold * 2) { 182 | // The last anchor was neither a left nor a right anchor. 183 | // But, it was as long as an anchor pair. So still count it. 184 | model_count_equal(&ret, query + last_match.pos_Q, 185 | last_match.length); 186 | } 187 | 188 | last_was_right_anchor = false; 189 | } 190 | 191 | // Cache values for later 192 | last_match = this_match; 193 | } 194 | 195 | // Advance 196 | this_match.pos_Q += this_match.length + 1; 197 | } 198 | 199 | // Very special case: The sequences are identical 200 | if (last_match.length >= query_length) { 201 | model_count_equal(&ret, query, query_length); 202 | return ret; 203 | } 204 | 205 | // We might miss a few nucleotides if the last anchor was also a right 206 | // anchor. The logic is the same as a few lines above. 207 | if (last_was_right_anchor) { 208 | model_count_equal(&ret, query + last_match.pos_Q, last_match.length); 209 | } else if (last_match.length >= threshold * 2) { 210 | model_count_equal(&ret, query + last_match.pos_Q, last_match.length); 211 | } 212 | 213 | return ret; 214 | } 215 | 216 | /* 217 | * Include distMatrix and distMatrixLM. 218 | */ 219 | #define FAST 220 | #include "dist_hack.h" 221 | 222 | #undef FAST 223 | #include "dist_hack.h" 224 | 225 | /** 226 | * @brief Calculates and prints the distance matrix 227 | * @param sequences - An array of pointers to the sequences. 228 | * @param n - The number of sequences. 229 | */ 230 | void calculate_distances(seq_t *sequences, size_t n) { 231 | struct model *M = NULL; 232 | 233 | // The maximum number of sequences is near 457'845'052.
234 | size_t intermediate = SIZE_MAX / sizeof(*M) / n; 235 | if (intermediate < n) { 236 | size_t root = (size_t)sqrt(SIZE_MAX / sizeof(*M)); 237 | err(1, "Comparison is limited to %zu sequences (%zu given).", root, n); 238 | } 239 | 240 | M = malloc(n * n * sizeof(*M)); 241 | if (!M) { 242 | err(errno, "Could not allocate enough memory for the comparison " 243 | "matrix. Try using --join or --low-memory."); 244 | } 245 | 246 | // compute the distances 247 | if (FLAGS & F_LOW_MEMORY) { 248 | distMatrixLM(M, sequences, n); 249 | } else { 250 | distMatrix(M, sequences, n); 251 | } 252 | 253 | // print the results 254 | print_distances(M, sequences, n, 1); 255 | 256 | // print additional information. 257 | if (FLAGS & F_VERBOSE) { 258 | print_coverages(M, n); 259 | } 260 | 261 | // create new bootstrapped distance matrices 262 | if (BOOTSTRAP) { 263 | int res = calculate_bootstrap(M, sequences, n); 264 | if (res) { 265 | soft_errx("Bootstrapping failed."); 266 | } 267 | } 268 | 269 | free(M); 270 | } 271 | 272 | /** Yet another hack. */ 273 | #define B(X, Y) (B[(X)*n + (Y)]) 274 | 275 | /** @brief Computes a bootstrap from _pairwise_ alignments. 276 | * 277 | * Doing bootstrapping for alignments with only two sequences is easy. It boils 278 | * down to a simple multi-nomial process over the substitution matrix. 279 | * 280 | * @param M - the initial distance matrix 281 | * @param sequences - a list of the sequences, containing their lengths 282 | * @param n - the number of sequences 283 | * 284 | * The number of bootstrapped distance matrices to print is implicitly 285 | * passed via the global `BOOTSTRAP` variable. 286 | * 287 | * @returns 0 iff successful. 
288 | */ 289 | int calculate_bootstrap(const struct model *M, const seq_t *sequences, 290 | size_t n) { 291 | if (!M || !sequences || !n) { 292 | return 1; 293 | } 294 | 295 | // B is the new bootstrap matrix 296 | struct model *B = malloc(n * n * sizeof(*B)); 297 | CHECK_MALLOC(B); 298 | 299 | // Compute a number of new distance matrices 300 | while (BOOTSTRAP--) { 301 | for (size_t i = 0; i < n; i++) { 302 | for (size_t j = i; j < n; j++) { 303 | if (i == j) { 304 | B(i, j) = (struct model){.seq_len = 1.0, .counts = {1.0}}; 305 | continue; 306 | } 307 | 308 | // Bootstrapping should only be used with averaged distances. 309 | model datum = model_average(&M(i, j), &M(j, i)); 310 | datum = model_bootstrap(datum); 311 | 312 | B(j, i) = B(i, j) = datum; 313 | } 314 | } 315 | 316 | print_distances(B, sequences, n, 0); 317 | } 318 | 319 | free(B); 320 | return 0; 321 | } 322 | -------------------------------------------------------------------------------- /src/process.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * @brief This file contains the declarations of functions in process.c 4 | * 5 | */ 6 | #ifndef _PROCESS_H_ 7 | #define _PROCESS_H_ 8 | 9 | #include "sequence.h" 10 | 11 | void calculate_distances(seq_t *sequences, size_t n); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/sequence.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * @brief Sequence utilities 4 | * 5 | * This file contains utility functions for working with DNA sequences. 
6 | */ 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "global.h" 14 | #include "sequence.h" 15 | #include 16 | 17 | void normalize(seq_t *S); 18 | double shustring_cum_prob(size_t x, double g, size_t l); 19 | size_t min_anchor_length(double p, double g, size_t l); 20 | 21 | /** Create a new dynamic array for sequences. */ 22 | int dsa_init(dsa_t *A) { 23 | // allocate at least 4 slots so the growth by 1.5 below doesn't get stuck 24 | // at 3 slots. 25 | A->data = malloc(sizeof(*A->data) * 4); 26 | CHECK_MALLOC(A->data); 27 | 28 | A->capacity = 4; 29 | A->size = 0; 30 | return 0; 31 | } 32 | 33 | /** Add a sequence to an array. */ 34 | void dsa_push(dsa_t *A, seq_t S) { 35 | if (A->size < A->capacity) { 36 | A->data[A->size++] = S; 37 | } else { 38 | // use the near-optimal growth factor of 1.5 39 | seq_t *ptr = reallocarray(A->data, A->capacity / 2, sizeof(seq_t) * 3); 40 | CHECK_MALLOC(ptr); 41 | 42 | A->capacity = (A->capacity / 2) * 3; 43 | A->data = ptr; 44 | A->data[A->size++] = S; 45 | } 46 | } 47 | 48 | /** Frees the array and all sequences stored within. */ 49 | void dsa_free(dsa_t *A) { 50 | size_t i; 51 | for (i = 0; i < A->size; i++) { 52 | seq_free(&A->data[i]); 53 | } 54 | 55 | free(A->data); 56 | *A = (dsa_t){}; 57 | } 58 | 59 | /** Returns the number of sequences stored within an array. */ 60 | size_t dsa_size(const dsa_t *A) { 61 | return A->size; 62 | } 63 | 64 | /** Get the raw C array. */ 65 | seq_t *dsa_data(dsa_t *A) { 66 | return A->data; 67 | } 68 | 69 | /** 70 | * @brief Convert an array of multiple sequences into a single sequence. 71 | * 72 | * This function joins all sequences contained in an array into one 73 | * long sequence. The sequences are separated by a `!` character. The 74 | * caller has to free the initial array. 75 | * 76 | * @returns A new sequence representing the union of the array.
77 | */ 78 | seq_t dsa_join(dsa_t *A) { 79 | seq_t joined = {}; 80 | 81 | if (A->size == 0) { 82 | return joined; 83 | } 84 | 85 | if (A->size == 1) { 86 | /* If we are to join just one sequence, _move_ its contents. */ 87 | joined = A->data[0]; 88 | A->data[0] = (seq_t){}; 89 | return joined; 90 | } 91 | 92 | seq_t *data = A->data; 93 | seq_t *it = data; 94 | 95 | // Compute the total length 96 | size_t total = 0, i; 97 | for (i = 0; i < A->size; i++, it++) { 98 | total += it->len + 1; 99 | } 100 | 101 | // A single malloc for the whole new sequence 102 | char *ptr = malloc(total); 103 | CHECK_MALLOC(ptr); 104 | char *next = ptr; 105 | 106 | // Copy all old sequences and add a `!` in between 107 | 108 | it = data; 109 | memcpy(next, it->S, it->len); 110 | next += it->len; 111 | 112 | for (i = 1, it++; i < A->size; i++, it++) { 113 | *next++ = '!'; 114 | memcpy(next, it->S, it->len); 115 | next += it->len; 116 | } 117 | 118 | // Don't forget the null byte. 119 | *next = '\0'; 120 | 121 | joined.S = ptr; 122 | joined.len = total - 1; // subtract the null byte 123 | 124 | return joined; 125 | } 126 | 127 | /** 128 | * @brief Frees the memory of a given sequence. 129 | * @param S - The sequence to free. 130 | */ 131 | void seq_free(seq_t *S) { 132 | free(S->S); 133 | free(S->name); 134 | *S = (seq_t){}; 135 | } 136 | 137 | /** 138 | * @brief Compute the reverse complement. 139 | * @param str The master string. 140 | * @param len The length of the master string 141 | * @return The reverse complement. The caller has to free it! 142 | */ 143 | char *revcomp(const char *str, size_t len) { 144 | if (!str) return NULL; 145 | char *rev = malloc(len + 1); 146 | CHECK_MALLOC(rev); 147 | 148 | char *r = rev; 149 | const char *s = &str[len - 1]; 150 | rev[len] = '\0'; 151 | 152 | do { 153 | char c = *s--; 154 | char d; 155 | 156 | if (c < 'A') { 157 | d = ';'; // rosebud 158 | } else { 159 | d = c ^= c & 2 ? 
4 : 21; 160 | } 161 | 162 | *r++ = d; 163 | } while (--len); 164 | 165 | return rev; 166 | } 167 | 168 | /** 169 | * @brief This function concatenates the reverse complement to a given master 170 | * string. A `#` sign is used as a separator. 171 | * @param s The master string. 172 | * @param len Its length. 173 | * @return The newly concatenated string. 174 | */ 175 | char *catcomp(char *s, size_t len) { 176 | if (!s) return NULL; 177 | 178 | char *rev = revcomp(s, len); 179 | 180 | char *temp = realloc(rev, 2 * len + 2); 181 | CHECK_MALLOC(temp); 182 | 183 | rev = temp; 184 | rev[len] = '#'; 185 | 186 | memcpy(rev + len + 1, s, len + 1); 187 | 188 | return rev; 189 | } 190 | 191 | /** 192 | * @brief Calculates the GC content of a sequence. 193 | * 194 | * This function computes the relative amount of G and C in the total sequence. 195 | */ 196 | double calc_gc(const seq_t *S) { 197 | size_t GC = 0; 198 | const char *p = S->S; 199 | 200 | for (; *p; p++) { 201 | if (*p == 'G' || *p == 'C') { 202 | GC++; 203 | } 204 | } 205 | 206 | return (double)GC / S->len; 207 | } 208 | 209 | /** @brief Prepares a sequence to be used as the subject in a comparison. */ 210 | int seq_subject_init(seq_subject *S, const seq_t *base) { 211 | S->gc = calc_gc(base); 212 | S->RS = catcomp(base->S, base->len); 213 | if (!S->RS) return 1; 214 | S->RSlen = 2 * base->len + 1; 215 | 216 | S->threshold = min_anchor_length(ANCHOR_P_VALUE, S->gc, S->RSlen); 217 | 218 | return 0; 219 | } 220 | 221 | /** @brief Frees the memory that is not needed when a sequence is only used as a query. 222 | */ 223 | void seq_subject_free(seq_subject *S) { 224 | free(S->RS); 225 | S->RS = NULL; 226 | S->RSlen = 0; 227 | S->gc = 0.0; 228 | } 229 | 230 | /** @brief Initializes a sequence 231 | * 232 | * @returns 0 iff successful.
233 | */ 234 | int seq_init(seq_t *S, const char *seq, const char *name) { 235 | if (!S || !seq || !name) { 236 | return 1; 237 | } 238 | 239 | *S = (seq_t){.S = strdup(seq), .name = strdup(name)}; 240 | 241 | CHECK_MALLOC(S->S); 242 | CHECK_MALLOC(S->name); 243 | 244 | normalize(S); 245 | 246 | // recalculate the length because `normalize` might have stripped some 247 | // characters. 248 | S->len = strlen(S->S); 249 | 250 | return 0; 251 | } 252 | 253 | /** 254 | * @brief Restricts a sequence's character set to ACGT. 255 | * 256 | * This function strips a sequence of non ACGT characters and converts acgt to 257 | * the upper case equivalent. A flag is set if a non-canonical character was 258 | * encountered. 259 | */ 260 | void normalize(seq_t *S) { 261 | char *p, *q; 262 | char local_non_acgt = 0; 263 | for (p = q = S->S; *p; p++) { 264 | switch (*p) { 265 | case 'A': 266 | case 'C': 267 | case 'G': 268 | case 'T': 269 | case '!': *q++ = *p; break; 270 | case 'a': 271 | case 'c': 272 | case 'g': 273 | case 't': *q++ = toupper((unsigned char)*p); break; 274 | default: local_non_acgt = 1; break; 275 | } 276 | } 277 | *q = '\0'; 278 | if (local_non_acgt) { 279 | #pragma omp atomic 280 | FLAGS |= F_NON_ACGT; 281 | } 282 | } 283 | 284 | /** 285 | * @brief Calculates the minimum anchor length. 286 | * 287 | * Given some parameters calculate the minimum length for anchors according 288 | * to the distribution from Haubold et al. (2009). 289 | * 290 | * @param p - The probability with which an anchor will be created under a 291 | * random model. 292 | * @param g - The relative amount of GC in the subject. 293 | * @param l - The length of the subject (includes revcomp). 294 | * @returns The minimum length of an anchor.
295 | */ 296 | size_t min_anchor_length(double p, double g, size_t l) { 297 | size_t x = 1; 298 | 299 | while (shustring_cum_prob(x, g / 2, l) < 1 - p) { 300 | x++; 301 | } 302 | 303 | return x; 304 | } 305 | 306 | /** 307 | * @brief Calculates the binomial coefficient of n and k. 308 | * 309 | * We could (and probably should) use gsl_sf_lnchoose(xx,kk) for this. 310 | * 311 | * @param n - The n part of the binomial coefficient. 312 | * @param k - analogue. 313 | * @returns (n choose k) 314 | */ 315 | size_t binomial_coefficient(size_t n, size_t k) { 316 | if (n <= 0 || k > n) { 317 | return 0; 318 | } 319 | 320 | if (k == 0 || k == n) { 321 | return 1; 322 | } 323 | 324 | if (k > n - k) { 325 | k = n - k; 326 | } 327 | 328 | size_t res = 1; 329 | 330 | for (size_t i = 1; i <= k; i++) { 331 | res *= n - k + i; 332 | res /= i; 333 | } 334 | 335 | return res; 336 | } 337 | 338 | /** 339 | * @brief Given `x` this function calculates the probability of a shustring 340 | * with a length less or equal to `x` under a random model. This means, it is 341 | * the cumulative probability. 342 | * 343 | * Let X be the longest shortest unique substring (shustring) at any position. 344 | * Then this function computes P{X <= x} with respect to the given parameter 345 | * set. See Haubold et al. (2009). Note that `x` includes the final mismatch. 346 | * Thus, `x` is `match length + 1`. 347 | * 348 | * @param x - The maximum length of a shustring. 349 | * @param p - The half of the relative amount of GC in the DNA. 350 | * @param l - The length of the subject. 351 | * @returns The probability of a certain shustring length. 
352 | */ 353 | double shustring_cum_prob(size_t x, double p, size_t l) { 354 | double xx = (double)x; 355 | double ll = (double)l; 356 | size_t k; 357 | 358 | double s = 0.0; 359 | 360 | for (k = 0; k <= x; k++) { 361 | double kk = (double)k; 362 | double t = pow(p, kk) * pow(0.5 - p, xx - kk); 363 | 364 | s += pow(2, xx) * (t * pow(1 - t, ll)) * 365 | (double)binomial_coefficient(x, k); 366 | if (s >= 1.0) { 367 | s = 1.0; 368 | break; 369 | } 370 | } 371 | 372 | return s; 373 | } 374 | -------------------------------------------------------------------------------- /src/sequence.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * @brief Functions and structures for DNA sequences 4 | * 5 | */ 6 | #ifndef _SEQUENCE_H_ 7 | #define _SEQUENCE_H_ 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | /** 14 | * @brief A structure for sequences. 15 | * 16 | * This structure is used to represent a DNA sequence of some kind. 17 | */ 18 | typedef struct seq_s { 19 | /** This is the DNAs forward strand as a string. */ 20 | char *S; 21 | /** The length of the forward strand. */ 22 | size_t len; 23 | /** A name for this sequence */ 24 | char *name; 25 | } seq_t; 26 | 27 | /** 28 | * @brief This structure enhances the usual sequence with its reverse 29 | * complement. 30 | */ 31 | typedef struct seq_subject { 32 | /** This member contains first the reverse strand and then the 33 | forward strand. */ 34 | char *RS; 35 | /** Corresponds to strlen(RS) */ 36 | size_t RSlen; 37 | /** 38 | * @brief GC-Content 39 | * 40 | * The relative amount of G or C in the DNA. 41 | */ 42 | double gc; 43 | /** The minimum length for an anchor. 
*/ 44 | size_t threshold; 45 | } seq_subject; 46 | 47 | void seq_free(seq_t *S); 48 | int seq_subject_init(seq_subject *S, const seq_t *); 49 | void seq_subject_free(seq_subject *S); 50 | int seq_init(seq_t *S, const char *seq, const char *name); 51 | 52 | /** 53 | * @brief A dynamically growing structure for sequences. 54 | */ 55 | typedef struct dsa_s { 56 | seq_t *data; 57 | size_t capacity, size; 58 | } dsa_t; 59 | 60 | int dsa_init(dsa_t *A); 61 | void dsa_push(dsa_t *A, seq_t S); 62 | void dsa_free(dsa_t *A); 63 | size_t dsa_size(const dsa_t *A); 64 | seq_t *dsa_data(dsa_t *A); 65 | 66 | seq_t dsa_join(dsa_t *dsa); 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /test/Makefile.am: -------------------------------------------------------------------------------- 1 | check_PROGRAMS = test_esa test_seq test_fasta test_process 2 | dist_noinst_DATA = test_extra.sh test_random.sh test_join.sh nan.sh low_homo.sh 3 | 4 | test_seq_SOURCES = test_seq.c $(top_srcdir)/src/sequence.c 5 | test_seq_CPPFLAGS = -I$(top_srcdir)/src -I$(top_srcdir)/opt -DDEBUG -std=gnu99 6 | test_seq_CFLAGS = -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers 7 | test_seq_LDADD = $(GLIB_LIBS) $(top_builddir)/opt/libcompat.a 8 | 9 | test_process_SOURCES = test_process.c $(top_srcdir)/src/esa.c $(top_srcdir)/src/io.c $(top_srcdir)/src/model.c $(top_srcdir)/src/process.c $(top_srcdir)/src/sequence.c $(top_srcdir)/src/global.h 10 | test_process_CPPFLAGS = $(OPENMP_CFLAGS) -I$(top_srcdir)/src -I$(top_srcdir)/opt -I$(top_srcdir)/libs -DDEBUG -std=gnu99 11 | test_process_CFLAGS = $(OPENMP_CFLAGS) -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers 12 | test_process_LDADD = $(GLIB_LIBS) $(top_builddir)/opt/libcompat.a $(top_builddir)/libs/libpfasta.a 13 | 14 | test_esa_SOURCES = test_esa.c $(top_srcdir)/src/esa.c $(top_srcdir)/src/sequence.c $(top_srcdir)/src/esa.h 15 | test_esa_CPPFLAGS = $(OPENMP_CFLAGS) -I$(top_srcdir)/libs 
-I$(top_srcdir)/opt -I$(top_srcdir)/src -DDEBUG -std=gnu99 16 | test_esa_CFLAGS = $(OPENMP_CFLAGS) -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers 17 | test_esa_LDADD = $(GLIB_LIBS) $(top_builddir)/opt/libcompat.a 18 | 19 | test_fasta_SOURCES = test_fasta.cxx 20 | 21 | .PHONY: all 22 | all: $(check_PROGRAMS) 23 | -------------------------------------------------------------------------------- /test/low_homo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -f 2 | 3 | SEED=${RANDOM_SEED:-0} 4 | SEED2=0 5 | SEED3=0 6 | if test $SEED -ne 0; then 7 | SEED=$((SEED + 1)) 8 | SEED2=$((SEED + 2)) 9 | SEED3=$((SEED + 3)) 10 | fi 11 | 12 | ./test/test_fasta -s $SEED -l 100000 > a_low.fa 13 | ./test/test_fasta -s $SEED2 -l 100000 > b_low.fa 14 | ./test/test_fasta -s $SEED3 -l 100 > both_low.fa 15 | 16 | cat both_low.fa a_low.fa | awk -v RS='>' '{if($1 == "S0")print ">"$0 > "S0_low.fa"}' 17 | cat both_low.fa b_low.fa | awk -v RS='>' '{if($1 == "S1")print ">"$0 > "S1_low.fa"}' 18 | 19 | # this is expected to trigger the low homology warning 20 | ./src/andi -j S0_low.fa S1_low.fa 2>&1 | grep 'homology' 21 | EXIT_VAL=$? 22 | 23 | if [[ EXIT_VAL -ge 1 ]]; then 24 | echo "Triggering low homology failed" >&2 25 | grep '^>' a_low.fa b_low.fa both_low.fa 26 | fi 27 | 28 | rm -f a_low.fa b_low.fa both_low.fa S0_low.fa S1_low.fa 29 | exit $EXIT_VAL 30 | -------------------------------------------------------------------------------- /test/nan.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -f 2 | 3 | SEED=${RANDOM_SEED:-0} 4 | SEED2=0 5 | if test $SEED -ne 0; then 6 | SEED=$((SEED + 1)) 7 | SEED2=$((SEED + 2)) 8 | fi 9 | 10 | 11 | ./test/test_fasta -s $SEED -l 10000 > a_nan.fa 12 | ./test/test_fasta -s $SEED2 -l 10000 > b_nan.fa 13 | 14 | # this is expected to trigger the nan warning 15 | ./src/andi -j a_nan.fa b_nan.fa 2>&1 | grep 'nan' 16 | EXIT_VAL=$? 
17 | 18 | 19 | if [[ EXIT_VAL -ge 1 ]]; then 20 | echo "Triggering nan failed" >&2 21 | grep '^>' a_nan.fa b_nan.fa 22 | fi 23 | 24 | rm -f a_nan.fa b_nan.fa 25 | exit $EXIT_VAL 26 | -------------------------------------------------------------------------------- /test/test_esa.c: -------------------------------------------------------------------------------- 1 | #include "esa.h" 2 | #include 3 | #include "global.h" 4 | #include 5 | #include 6 | 7 | int FLAGS = F_NONE; 8 | int THREADS = 1; 9 | double ANCHOR_P_VALUE = 0.025; 10 | 11 | extern const int CACHE_LENGTH; 12 | 13 | char code3char( ssize_t code){ 14 | switch( code & 0x7){ 15 | case 0: return 'A'; 16 | case 1: return 'C'; 17 | case 2: return 'G'; 18 | case 3: return 'T'; 19 | case 4: return '!'; 20 | case 5: return ';'; 21 | case 6: return '#'; 22 | } 23 | return '\0'; 24 | } 25 | 26 | typedef struct { 27 | esa_s *C; 28 | seq_t *S; 29 | seq_subject subject; 30 | } esa_fixture; 31 | 32 | void assert_equal_lcp( const lcp_inter_t *a, const lcp_inter_t *b){ 33 | g_assert_cmpint( a->i, ==, b->i); 34 | g_assert_cmpint( a->j, ==, b->j); 35 | g_assert_cmpint( a->l, ==, b->l); 36 | } 37 | 38 | void assert_equal_cache_nocache( const esa_s *C, const char *str, size_t qlen){ 39 | lcp_inter_t a = get_match_cached(C, str, qlen); 40 | lcp_inter_t b = get_match(C, str, qlen); 41 | assert_equal_lcp( &a, &b); 42 | g_assert(strncmp(str, C->S + C->SA[a.i], a.l) == 0); 43 | g_assert( str[a.l] != C->S[ a.l + C->SA[a.i]] || str[a.l] == '\0'); 44 | } 45 | 46 | void setup( esa_fixture *ef, gconstpointer test_data){ 47 | ef->C = malloc( sizeof(esa_s)); 48 | ef->S = malloc( sizeof(seq_t)); 49 | 50 | g_assert( ef->C != NULL); 51 | g_assert( ef->S != NULL); 52 | 53 | const char *seq = { 54 | "TACGAGCACTGGTGGAATTGATGTC" 55 | "CAGTCTTATATGGCGCACCAGGCTG" 56 | "ATAGTAGTAGCAGTTTGCTTATCTC" 57 | "ATCGCGTGTTTCCGGATGACAGAGA" 58 | "TACGTGCACTGGTGGGATTGATGTC" 59 | "TAGTATTATATGGCGCACCAGGATG" 60 | "ATAGTAGTAGCAGTTTGCTTATCCC" 61 | 
"ATCGCGTGTTTGCGGATGACCGAGA" 62 | }; 63 | 64 | g_assert( seq_init( ef->S, seq, "S0" ) == 0); 65 | seq_subject_init( &ef->subject, ef->S); 66 | g_assert( ef->subject.RS != NULL); 67 | int check = esa_init( ef->C, &ef->subject); 68 | g_assert( check == 0); 69 | } 70 | 71 | void setup2( esa_fixture *ef, gconstpointer test_data){ 72 | ef->C = malloc( sizeof(esa_s)); 73 | ef->S = malloc( sizeof(seq_t)); 74 | 75 | g_assert( ef->C != NULL); 76 | g_assert( ef->S != NULL); 77 | 78 | const char *seq = { 79 | "TACGAGCACTGGTGGAATTGATGTC" 80 | "CAGTCTTATATGGCGCACCAGGCTG" 81 | "ATAGTAGTAGCAGTTTGCTTATCTC" 82 | "ATCGCGTGTTTCCGGATGACAGAGA" 83 | "!" 84 | "TACGTGCACTGGTGGGATTGATGTC" 85 | "TAGTATTATATGGCGCACCAGGATG" 86 | "ATAGTAGTAGCAGTTTGCTTATCCC" 87 | "ATCGCGTGTTTGCGGATGACCGAGA" 88 | }; 89 | 90 | g_assert( seq_init( ef->S, seq, "S0" ) == 0); 91 | seq_subject_init( &ef->subject, ef->S); 92 | g_assert( ef->subject.RS != NULL); 93 | int check = esa_init( ef->C, &ef->subject); 94 | g_assert( check == 0); 95 | } 96 | 97 | void teardown( esa_fixture *ef, gconstpointer test_data){ 98 | esa_free(ef->C); 99 | free(ef->C); 100 | seq_free(ef->S); 101 | free(ef->S); 102 | seq_subject_free(&ef->subject); 103 | } 104 | 105 | extern int count; 106 | 107 | void basic( esa_fixture *ef, gconstpointer test_data){ 108 | esa_s *C = ef->C; 109 | g_assert( C->SA); 110 | 111 | lcp_inter_t a = get_match_cached(C, "AAGACTGG", 8); 112 | lcp_inter_t b = get_match(C, "AAGACTGG", 8); 113 | assert_equal_lcp( &a, &b); 114 | g_assert(strncmp("AAGACTGG",C->S + C->SA[a.i], 8) == 0); 115 | 116 | a = get_match_cached(C, "AATTAAAA", 8); 117 | b = get_match(C, "AATTAAAA", 8); 118 | assert_equal_lcp( &a, &b); 119 | g_assert(strncmp("AATTAAAA",C->S + C->SA[a.i], a.l) == 0); 120 | 121 | a = get_match_cached(C, "ACCGAGAA", 8); 122 | b = get_match(C, "ACCGAGAA", 8); 123 | assert_equal_lcp( &a, &b); 124 | g_assert(strncmp("ACCGAGAA",C->S + C->SA[a.i], a.l) == 0); 125 | 126 | a = get_match_cached(C, "AAAAAAAAAAAA", 12); 127 | b 
= get_match(C, "AAAAAAAAAAAA", 12); 128 | assert_equal_lcp( &a, &b); 129 | g_assert(strncmp("AAAAAAAAAAAA",C->S + C->SA[a.i], a.l) == 0); 130 | 131 | //g_assert_cmpint(count, >=, 1 << (2*8)); 132 | } 133 | 134 | void normq_cached( esa_fixture *ef, gconstpointer test_data){ 135 | esa_s *C = ef->C; 136 | g_assert( C->SA); 137 | lcp_inter_t a, b; 138 | 139 | a = get_match_cached(C, "A", 1); 140 | b = get_match(C, "A", 1); 141 | assert_equal_lcp( &a, &b); 142 | 143 | a = get_match_cached(C, "C", 1); 144 | b = get_match(C, "C", 1); 145 | assert_equal_lcp( &a, &b); 146 | 147 | a = get_match_cached(C, "CT", 2); 148 | b = get_match(C, "CT", 2); 149 | assert_equal_lcp( &a, &b); 150 | 151 | a = get_match_cached(C, "AAGACTGG", 8); 152 | b = get_match(C, "AAGACTGG", 8); 153 | assert_equal_lcp( &a, &b); 154 | 155 | a = get_match_cached(C, "AATTAAAA", 8); 156 | b = get_match(C, "AATTAAAA", 8); 157 | assert_equal_lcp( &a, &b); 158 | 159 | a = get_match_cached(C, "ACCGAGAA", 8); 160 | b = get_match(C, "ACCGAGAA", 8); 161 | assert_equal_lcp( &a, &b); 162 | 163 | a = get_match_cached(C, "AAAAAAAAAAAA", 12); 164 | b = get_match(C, "AAAAAAAAAAAA", 12); 165 | assert_equal_lcp( &a, &b); 166 | 167 | a = get_match_cached(C, "!AAAAAAAAAAAA", 12); 168 | b = get_match(C, "!AAAAAAAAAAAA", 12); 169 | assert_equal_lcp( &a, &b); 170 | } 171 | 172 | size_t MAX_DEPTH = 11; 173 | 174 | void prefix_dfs( esa_s *C, char *str, size_t depth); 175 | 176 | void prefix( esa_fixture *ef, gconstpointer test_data){ 177 | esa_s *C = ef->C; 178 | char str[MAX_DEPTH+1]; 179 | str[MAX_DEPTH] = '\0'; 180 | prefix_dfs( C, str, 0); 181 | } 182 | 183 | void prefix_dfs( esa_s *C, char *str, size_t depth){ 184 | if( depth < MAX_DEPTH){ 185 | for( int code = 0; code < 4; ++code){ 186 | str[depth] = code2char(code); 187 | prefix_dfs( C, str, depth + 1); 188 | } 189 | } else { 190 | assert_equal_cache_nocache(C, str, depth); 191 | } 192 | } 193 | 194 | int main(int argc, char *argv[]) 195 | { 196 | g_test_init( &argc, 
&argv, NULL); 197 | g_test_add("/esa/basic", esa_fixture, NULL, setup, basic, teardown); 198 | g_test_add("/esa/sample cache", esa_fixture, NULL, setup, normq_cached, teardown); 199 | g_test_add("/esa/sample cache 2", esa_fixture, NULL, setup2, normq_cached, teardown); 200 | g_test_add("/esa/full cache", esa_fixture, NULL, setup, prefix, teardown); 201 | g_test_add("/esa/full cache 2", esa_fixture, NULL, setup2, prefix, teardown); 202 | 203 | 204 | return g_test_run(); 205 | } 206 | 207 | -------------------------------------------------------------------------------- /test/test_extra.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -f 2 | 3 | # Test if andi exists, and can be executed 4 | ./src/andi --version > /dev/null || exit 1 5 | 6 | SEED=${RANDOM_SEED:-0} 7 | SEED2=0 8 | SEED3=0 9 | if test $SEED -ne 0; then 10 | SEED=$((SEED + 1)) 11 | SEED2=$((SEED + 2)) 12 | SEED3=$((SEED + 3)) 13 | fi 14 | 15 | # Test andi for more than just two sequences at a time 16 | ./test/test_fasta -s $SEED -l 100000 -d 0.01 -d 0.01 -d 0.01 -d 0.01 | ./src/andi > /dev/null || exit 1 17 | 18 | # Test low-memory mode 19 | ./test/test_fasta -s $SEED2 -l 10000 > test_extra.fasta 20 | ./src/andi test_extra.fasta > extra.out 21 | ./src/andi test_extra.fasta --low-memory > extra_low_memory.out 22 | diff extra.out extra_low_memory.out || exit 1 23 | 24 | # Test file of filenames 25 | ./test/test_fasta -s $SEED3 -l 10000 > test_extra.fasta 26 | echo "$PWD/test_extra.fasta" > fof.txt 27 | ./src/andi test_extra.fasta > extra.out 28 | ./src/andi --file-of-filenames fof.txt > fof.out 29 | cat fof.txt | ./src/andi --file-of-filenames - > fof2.out 30 | diff extra.out fof.out || exit 1 31 | diff extra.out fof2.out || exit 1 32 | 33 | 34 | rm -f test_extra.fasta extra.out extra_low_memory.out fof.out fof2.out fof.txt 35 | 36 | -------------------------------------------------------------------------------- /test/test_fasta.cxx: 
using namespace std;

void usage();
void print_seq(unsigned, unsigned, int, int, double);

/**
 * Parses the command line and prints one FASTA record per sequence.
 *
 * Options:
 *   -s SEED  seed for the generators; 0 requests a fresh random seed
 *   -l LEN   length handed to print_seq (default 1000)
 *   -L LEN   line length handed to print_seq (default 70), must be >= 1
 *   -d DIST  adds a sequence with the given distance; may be repeated
 *   -r       raw mode: distances are used as given
 *
 * The list of distances starts with a single 0. If no -d was given, one
 * sequence with distance 0.1 is appended. Unless -r is set, each distance
 * d is converted with the Jukes-Cantor formula p = 3/4 - 3/4 * exp(-4/3 d)
 * before being passed to print_seq.
 *
 * NOTE(review): the template arguments below were lost in the copy this
 * was reviewed from; vector<double> and the seed's own type are
 * reconstructed from how the values are used -- confirm against the
 * repository.
 *
 * @returns 0 on success, 1 if the arguments could not be used.
 */
int main(int argc, char *argv[]) {
	random_device rd{};
	auto seed = rd();
	int length = 1000;
	int line_length = 70;
	int raw = 0;

	auto seqs = vector<double>{0};

	int check;
	while ((check = getopt(argc, argv, "s:l:L:d:r")) != -1) {
		try {
			switch (check) {
				case 's':
					seed = static_cast<decltype(seed)>(stol(optarg));
					if (seed == 0) {
						seed = rd();
					}
					break;
				case 'l': length = stoi(optarg); break;
				case 'L': line_length = stoi(optarg); break;
				case 'd': seqs.push_back(stod(optarg)); break;
				case 'r': raw = 1; break;
				case '?':
				default: usage(); return 1;
			}
		} catch (...) {
			// stol/stoi/stod throw on malformed or out-of-range numbers;
			// report that like any other bad argument instead of aborting.
			usage();
			return 1;
		}
	}

	// print_seq sizes a buffer with line_length + 1 and advances by
	// min(line_length, remaining) characters per line, so anything below
	// one would never finish.
	if (line_length < 1) {
		usage();
		return 1;
	}

	if (seqs.size() < 2) {
		seqs.push_back(0.1);
	}

	if (!raw) {
		for (auto &dist : seqs) {
			dist = 0.75 - 0.75 * exp(-(4.0 / 3.0) * dist);
		}
	}

	// All sequences share base_seed; the second seed differs per sequence.
	auto base_seed = seed;

	for (auto i = 0u; i < seqs.size(); i++) {
		cout << ">S" << i << " (base_seed: " << base_seed << ")" << endl;
		print_seq(base_seed, seed++, length, line_length, seqs[i]);
	}

	return 0;
}
ACGT[base_dist(base_rand)];}; 82 | 83 | auto mut_rand = default_random_engine{mut_seed}; 84 | auto mut_dist = uniform_real_distribution{0,1}; 85 | auto mut = bind( mut_dist, mut_rand); 86 | auto mut_acgt = uniform_int_distribution{0,2}; 87 | auto mutate = [&](char c){ 88 | int idx = mut_acgt(mut_rand); 89 | switch(c){ 90 | case 'A': return NO_A[idx]; 91 | case 'C': return NO_C[idx]; 92 | case 'G': return NO_G[idx]; 93 | case 'T': return NO_T[idx]; 94 | default: return 'X'; 95 | } 96 | }; 97 | 98 | double nucleotides = (double)length; 99 | double mutations = nucleotides * divergence; 100 | 101 | for(int i= length, j; i > 0; i -= j){ 102 | j = min(line_length, i); 103 | 104 | for(auto k=0; k /dev/null || exit 1 4 | 5 | SEED=${RANDOM_SEED:-0} 6 | SEED2=0 7 | SEED3=0 8 | if test $SEED -ne 0; then 9 | SEED=$((SEED + 1)) 10 | SEED2=$((SEED + 2)) 11 | SEED3=$((SEED + 3)) 12 | fi 13 | 14 | # Simple join test 15 | ./test/test_fasta -s $SEED -l 1000 -L 1000 -d 0.1 > p1_join.fasta 16 | ./test/test_fasta -s $SEED2 -l 1000 -L 1000 -d 0.1 > p2_join.fasta 17 | ./test/test_fasta -s $SEED3 -l 10000 -L 10000 -d 0.1 > p3_join.fasta 18 | 19 | head -qn 2 p1_join.fasta p2_join.fasta p3_join.fasta > S0_join.fasta 20 | tail -qn 2 p1_join.fasta p2_join.fasta p3_join.fasta > S1_join.fasta 21 | 22 | rm p1_join.fasta p2_join.fasta p3_join.fasta; 23 | 24 | 25 | RES=$(./src/andi -m RAW -t 1 -j S0_join.fasta S1_join.fasta | 26 | tail -n 1 | 27 | awk '{print ($2 - 0.1)}' | 28 | awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.03}' 29 | ) 30 | 31 | if test $RES -ne 1; then 32 | echo "The last test computed a distance deviating more than three percent from its intended value." 33 | echo "See S0_join.fasta and S1_join.fasta for the used sequences." 
34 | exit 1; 35 | fi 36 | 37 | SEED=${RANDOM_SEED:-0} 38 | SEED2=0 39 | if test $SEED -ne 0; then 40 | SEED=$((SEED + 5)) 41 | SEED2=$((SEED + 6)) 42 | fi 43 | 44 | #unbalanced number of contigs 45 | ./test/test_fasta -s $SEED -l 1000 -L 1000 -d 0.1 > p2_join.fasta 46 | ./test/test_fasta -s $SEED2 -l 10000 -L 10000 -d 0.1 > p3_join.fasta 47 | 48 | head -qn 2 p3_join.fasta > S0_join.fasta 49 | tail -qn 2 p2_join.fasta p3_join.fasta > S1_join.fasta 50 | 51 | rm p2_join.fasta p3_join.fasta; 52 | 53 | 54 | RES=$(./src/andi -m RAW -t1 -j S0_join.fasta S1_join.fasta | 55 | tail -n 1 | 56 | awk '{print ($2 - 0.1)}' | 57 | awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.03}' 58 | ) 59 | 60 | if test $RES -ne 1; then 61 | echo "The last test computed a distance deviating more than three percent from its intended value." 62 | echo "See S0_join.fasta and S1_join.fasta for the used sequences." 63 | exit 1; 64 | fi 65 | 66 | SEED=${RANDOM_SEED:-0} 67 | SEED2=0 68 | SEED3=0 69 | if test $SEED -ne 0; then 70 | SEED=$((SEED + 11)) 71 | SEED2=$((SEED + 12)) 72 | SEED3=$((SEED + 13)) 73 | fi 74 | 75 | #unbalanced number of contigs 2 76 | ./test/test_fasta -s $SEED -l 1000 -L 1000 -d 0.1 > p1_join.fasta 77 | ./test/test_fasta -s $SEED2 -l 1000 -L 1000 -d 0.1 > p2_join.fasta 78 | ./test/test_fasta -s $SEED3 -l 10000 -L 10000 -d 0.1 > p3_join.fasta 79 | 80 | head -qn 2 p1_join.fasta p3_join.fasta > S0_join.fasta 81 | tail -qn 2 p1_join.fasta p2_join.fasta p3_join.fasta > S1_join.fasta 82 | 83 | rm p1_join.fasta p2_join.fasta p3_join.fasta; 84 | 85 | 86 | RES=$(./src/andi -mRAW -t 1 -j S0_join.fasta S1_join.fasta | 87 | tail -n 1 | 88 | awk '{print ($2 - 0.1)}' | 89 | awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.03}' 90 | ) 91 | 92 | if test $RES -ne 1; then 93 | echo "The last test computed a distance deviating more than three percent from its intended value." 94 | echo "See S0_join.fasta and S1_join.fasta for the used sequences." 
95 | exit 1; 96 | fi 97 | 98 | 99 | rm S0_join.fasta S1_join.fasta 100 | -------------------------------------------------------------------------------- /test/test_process.c: -------------------------------------------------------------------------------- 1 | #include "global.h" 2 | #include "process.h" 3 | #include 4 | #include 5 | 6 | int FLAGS = 0; 7 | int THREADS = 1; 8 | long unsigned int BOOTSTRAP = 0; 9 | double ANCHOR_P_VALUE = 0.025; 10 | gsl_rng *RNG = NULL; 11 | int MODEL = M_JC; 12 | 13 | double shustring_cum_prob(size_t x, double g, size_t l); 14 | size_t min_anchor_length(double p, double g, size_t l); 15 | 16 | void test_shustring_cum_prob() { 17 | int len = 100000; 18 | double gc = 0.5; 19 | double p_value = 0.025; 20 | 21 | size_t threshold = min_anchor_length(p_value, gc, len); 22 | 23 | g_assert_cmpfloat(1 - p_value, <, shustring_cum_prob(threshold + 1, gc / 2, len)); 24 | g_assert_cmpfloat(1 - p_value, <=, shustring_cum_prob(threshold, gc / 2, len)); 25 | g_assert_cmpfloat(1 - p_value, >, shustring_cum_prob(threshold - 1, gc / 2, len)); 26 | } 27 | 28 | int main(int argc, char *argv[]) { 29 | g_test_init(&argc, &argv, NULL); 30 | g_test_add_func("/process/shustring_cum_prob", test_shustring_cum_prob); 31 | 32 | return g_test_run(); 33 | } 34 | -------------------------------------------------------------------------------- /test/test_random.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -f 2 | 3 | # This scripts test the accuracy of andi with random inputs. For that 4 | # it uses the small program test_random to generate pairs of sequences 5 | # with a given distance. By default, test_random creates a new set of 6 | # sequences each time it is called. Thus, this test has a small, but 7 | # non-zero probability of failing. That is a problem with Debian's 8 | # reproducible builds effort. So this script acts as a wrapper around 9 | # this issue. 
#!/bin/sh -f

# This script tests the accuracy of andi with random inputs. For that
# it uses the small program test_fasta to generate pairs of sequences
# with a given distance. By default, test_fasta creates a new set of
# sequences each time it is called. Thus, this test has a small, but
# non-zero probability of failing. That is a problem with Debian's
# reproducible builds effort. So this script acts as a wrapper around
# this issue.
#
# Simply calling this script via
#     % ./test/test_random.sh
# checks a new test-case every time. But with the right parameter
#     % RANDOM_SEED=1729 ./test/test_random.sh
# one specific set of sequences is validated.

./src/andi --help > /dev/null || exit 1

LENGTH=100000
FASTA=./test/test_random.fasta

# If RANDOM_SEED is set, use its value. Otherwise 0 is used to signal
# to test_fasta that a new set of sequences shall be generated.
SEED=${RANDOM_SEED:-0}

# check_distance DIST MODE
#
# Generates one pair of sequences with distance DIST, lets andi estimate
# the distance and exits the script with status 1 unless the estimate is
# within 0.055 of DIST both absolutely and relative to DIST.
# With MODE "raw", test_fasta is called with -r and andi with -m RAW;
# any other MODE uses the defaults of both programs.
# A non-zero SEED is advanced by one before each case.
check_distance() {
	dist=$1
	if test "$2" = raw; then
		gen_flags="-r"
		andi_flags="-m RAW"
	else
		gen_flags=""
		andi_flags=""
	fi

	if test "$SEED" -ne 0; then
		SEED=$((SEED + 1))
	fi

	# $gen_flags and $andi_flags stay unquoted on purpose: they expand to
	# zero or more separate words.
	res=$(./test/test_fasta $gen_flags -s "$SEED" -l "$LENGTH" -d "$dist" |
		tee "$FASTA" |
		./src/andi $andi_flags -t 1 |
		tail -n 1 |
		awk -v dist="$dist" '{print $2, dist}' |
		awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) <= 0.055 && abs($1-$2) <= 0.055 * $2}')

	# Compare as strings. If andi printed nothing, res is empty; a numeric
	# "test -ne" is then an error rather than "true", and the failure
	# would go unnoticed.
	if test "$res" != 1; then
		echo "The last test computed a distance deviating more than five percent from its intended value."
		echo "See $FASTA for the used sequences."
		echo "./test/test_fasta${gen_flags:+ $gen_flags} -s $SEED -l $LENGTH -d $dist"
		head -n 1 "$FASTA"
		exit 1
	fi
}

for dist in 0.0 0.001 0.01 0.02 0.05 0.1 0.2 0.3
do
	# ten default cases first, then ten raw ones
	for mode in default raw
	do
		for n in $(seq 10)
		do
			check_distance "$dist" "$mode"
		done
	done
done

rm "$FASTA"
64 | echo "./test/test_fasta -r -s $SEED -l $LENGTH -d $dist" 65 | head -n 1 ./test/test_random.fasta 66 | exit 1; 67 | fi 68 | done 69 | done 70 | 71 | rm ./test/test_random.fasta 72 | -------------------------------------------------------------------------------- /test/test_seq.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "global.h" 3 | #include 4 | #include 5 | #include "sequence.h" 6 | 7 | double ANCHOR_P_VALUE = 0.025; 8 | 9 | int FLAGS = F_NONE; 10 | 11 | void test_seq_basic(){ 12 | 13 | seq_t S; 14 | 15 | seq_init( &S, "ACGT", "name"); 16 | 17 | g_assert_cmpstr(S.S, ==, "ACGT"); 18 | g_assert_cmpstr(S.name, ==, "name"); 19 | g_assert_cmpuint(S.len, ==, 4); 20 | 21 | seq_free( &S); 22 | } 23 | 24 | void test_seq_full(){ 25 | 26 | seq_t S; 27 | seq_subject subject; 28 | 29 | seq_init( &S, "ACGTTGCA", "name"); 30 | int check = seq_subject_init( &subject, &S); 31 | 32 | g_assert_cmpint(check, ==, 0); 33 | 34 | g_assert_cmpstr(subject.RS, ==, "TGCAACGT#ACGTTGCA"); 35 | g_assert_cmpuint(subject.RSlen, ==, 8*2+1); 36 | g_assert( subject.gc == 0.5); 37 | 38 | seq_subject_free( &subject); 39 | seq_free( &S); 40 | } 41 | 42 | void test_seq_nonacgt(){ 43 | seq_t S; 44 | seq_subject subject; 45 | 46 | seq_init( &S, "11ACGTNN7682394689NNTGCA11", "name"); 47 | seq_subject_init( &subject, &S); 48 | 49 | g_assert_cmpstr(S.S, ==, "ACGTTGCA"); 50 | g_assert_cmpuint(S.len, ==, 8 ); 51 | g_assert( FLAGS & F_NON_ACGT); 52 | 53 | g_assert_cmpstr(subject.RS, ==, "TGCAACGT#ACGTTGCA"); 54 | g_assert_cmpuint(subject.RSlen, ==, 8*2+1); 55 | g_assert( subject.gc == 0.5); 56 | 57 | seq_subject_free( &subject); 58 | seq_free( &S); 59 | 60 | FLAGS = F_NONE; 61 | 62 | seq_init( &S, "@ACGT_!0TGCA ", "name"); 63 | seq_subject_init( &subject, &S); 64 | 65 | g_assert_cmpstr(S.S, ==, "ACGT!TGCA"); 66 | g_assert_cmpuint(S.len, ==, 9 ); 67 | g_assert( FLAGS & F_NON_ACGT); 68 | 69 | g_assert_cmpstr(subject.RS, ==, 
"TGCA;ACGT#ACGT!TGCA"); 70 | g_assert_cmpuint(subject.RSlen, ==, 9*2+1); 71 | 72 | seq_subject_free( &subject); 73 | seq_free( &S); 74 | 75 | FLAGS = F_NONE; 76 | 77 | } 78 | 79 | int main(int argc, char *argv[]) 80 | { 81 | g_test_init( &argc, &argv, NULL); 82 | g_test_add_func("/seq/basic", test_seq_basic); 83 | g_test_add_func("/seq/full", test_seq_full); 84 | g_test_add_func("/seq/non acgt", test_seq_nonacgt); 85 | 86 | return g_test_run(); 87 | } 88 | 89 | --------------------------------------------------------------------------------