├── README ├── CHANGES ├── debian ├── compat ├── source │ └── format ├── libpandaseq-url0.install ├── pandaseq.install ├── libpandaseq7.install ├── pandaseq-dev.install ├── rules ├── copyright └── control ├── deps-url.in ├── testing ├── .indent.pro ├── setup.vala ├── Makefile ├── README.md └── reg-test.vala ├── autogen.sh ├── buffer.list ├── panda_api.c ├── pandabug ├── pc.in ├── pc-url.in ├── m4 ├── ax_check_cflags.m4 ├── ag_check_uname_syscall.m4 └── legacy_pkg.m4 ├── plugin_empty.c ├── pandaseq-checkid.1 ├── .gitignore ├── nt.h ├── lib.rc ├── plugin_completely_miss_the_point.c ├── prob.h ├── .indent.pro ├── plugin_min_phred.c ├── buffer.h ├── algo.h ├── pandaxs.in ├── plugin_before.c ├── plugin_after.c ├── plugin_min_overlapbits.c ├── pandaseq-diff.1 ├── module.h ├── vapi-url.in ├── pandaseq-hang.1 ├── plugin_overlap_stat.c ├── main.c ├── plugin_filter.c ├── check_parser.c ├── pandaseq-plugin.h ├── pandaseq-linebuf.h ├── main-diff.c ├── pandaseq.spec.in ├── main-hang.c ├── .travis.yml ├── plugin_other_primer.c ├── assembler.h ├── misc.c ├── pandaseq-nt.h ├── bzstream.c ├── plugin_validtag.c ├── buffer.c ├── pandaseq-set.h ├── pandaseq-url.h ├── pandaxs.1 ├── linebuf.c ├── algo_ea_util.c ├── algo_stitch.c ├── misc.h ├── plugin_pear_test.c ├── plugin_sample.c ├── main-parse.c ├── args_array.c ├── algo_flash.c ├── pandaseq-iter.h ├── algo_rdp_mle.c ├── hang.c ├── mktable.c ├── algo_pear.c ├── idset.c ├── pandaseq-writer.h ├── algo_example.c ├── pandaseq-seqid.h ├── configure.ac ├── pandaseq-module.h ├── algo.c ├── offset.c ├── algo_uparse.c ├── pandaseq-log.h ├── algo_simple_bayes.c ├── pandaseq-tablebuilder.h ├── pandaseq-mux.h ├── iter.c ├── nt.c ├── fileio.c ├── tablebuilder.c └── args_fastq.c /README: -------------------------------------------------------------------------------- 1 | README.md -------------------------------------------------------------------------------- /CHANGES: 
-------------------------------------------------------------------------------- 1 | debian/changelog -------------------------------------------------------------------------------- /debian/compat: -------------------------------------------------------------------------------- 1 | 7 2 | -------------------------------------------------------------------------------- /deps-url.in: -------------------------------------------------------------------------------- 1 | pandaseq-2 2 | -------------------------------------------------------------------------------- /testing/.indent.pro: -------------------------------------------------------------------------------- 1 | ../.indent.pro -------------------------------------------------------------------------------- /debian/source/format: -------------------------------------------------------------------------------- 1 | 3.0 (native) 2 | -------------------------------------------------------------------------------- /debian/libpandaseq-url0.install: -------------------------------------------------------------------------------- 1 | usr/lib/libpandaseq-url.so.* 2 | -------------------------------------------------------------------------------- /autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | test -d m4 || mkdir m4 4 | 5 | autoreconf -i 6 | -------------------------------------------------------------------------------- /debian/pandaseq.install: -------------------------------------------------------------------------------- 1 | usr/bin/pandaseq* 2 | usr/share/man/man1/pandaseq*.1 3 | -------------------------------------------------------------------------------- /buffer.list: -------------------------------------------------------------------------------- 1 | BUFFER(static, char, BUFFER_SIZE) 2 | BUFFER(seqid, char, BUFFER_SIZE) 3 | -------------------------------------------------------------------------------- /debian/libpandaseq7.install: 
/*
 * Per-module API version marker.
 *
 * pandaxs compiles this file into every module with
 * -DPANDASEQ_MODULE=<module name> and links with
 * -export-symbols-regex "^<module name>_LTX_", so the variable defined here
 * is presumably named <module name>_LTX_api once PANDACONCAT has pasted the
 * two tokens together (PANDACONCAT and PANDA_API are defined elsewhere --
 * TODO confirm against the plugin header).
 */
int PANDACONCAT(
	PANDASEQ_MODULE,
	_LTX_api) = PANDA_API;
-------------------------------------------------------------------------------- 1 | prefix=@prefix@ 2 | exec_prefix=@exec_prefix@ 3 | libdir=@libdir@ 4 | bindir=@bindir@ 5 | includedir=@includedir@/@LIB_NAME@ 6 | datarootdir=@datarootdir@ 7 | datadir=@datadir@ 8 | vapidir=@datadir@/vala/vapi 9 | 10 | Name: PANDAseq-URL 11 | Description: Assemble forward and reverse reads from Illumina FASTQ data (URL reader) 12 | Version: @VERSION@ 13 | Requires: 14 | Libs: -L${libdir} -lpandaseq-url 15 | Cflags: -I${includedir} 16 | -------------------------------------------------------------------------------- /m4/ax_check_cflags.m4: -------------------------------------------------------------------------------- 1 | # AX_CHECK_CFLAGS(ADDITIONAL-CFLAGS, ACTION-IF-FOUND, ACTION-IF-NOT-FOUND) 2 | # 3 | # checks whether the $(CC) compiler accepts the ADDITIONAL-CFLAGS 4 | # if so, they are added to the CXXFLAGS 5 | AC_DEFUN([AX_CHECK_CFLAGS], 6 | [ 7 | AC_MSG_CHECKING([whether compiler accepts "$1"]) 8 | cat > conftest.c << EOF 9 | int main(){ 10 | return 0; 11 | } 12 | EOF 13 | if $CC $CPPFLAGS $CFLAGS -o conftest.o conftest.c [$1] > /dev/null 2>&1 14 | then 15 | AC_MSG_RESULT([yes]) 16 | CFLAGS="${CFLAGS} [$1]" 17 | [$2] 18 | else 19 | AC_MSG_RESULT([no]) 20 | [$3] 21 | fi 22 | ])dnl AX_CHECK_CFLAGS 23 | -------------------------------------------------------------------------------- /plugin_empty.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | HELP("Drops empty (zero-length) output sequences.", "empty"); 4 | VER_INFO("1.0"); 5 | 6 | static bool check_func( 7 | PandaLogProxy logger, 8 | const panda_result_seq *sequence, 9 | void* data) { 10 | (void) logger; 11 | (void) data; 12 | 13 | return sequence->sequence_length > 0; 14 | } 15 | 16 | OPEN { 17 | *precheck = NULL; 18 | *check = (PandaCheck) check_func; 19 | *destroy = NULL; 20 | *user_data = NULL; 21 | 22 | if (args != NULL && *args != '\0') { 23 | 
panda_log_proxy_write_f(logger, "No arguments allowed to empty filter."); 24 | return false; 25 | } 26 | return true; 27 | } 28 | -------------------------------------------------------------------------------- /pandaseq-checkid.1: -------------------------------------------------------------------------------- 1 | .\" Authors: Andre Masella 2 | .TH pandaseq-checkid 1 "November 2012" "2.4" "USER COMMANDS" 3 | .SH NAME 4 | pandaseq-checkid \- Illumina tag checker 5 | .SH SYNOPSIS 6 | \fBpandaseq-checkid\fR "\fIsequenceid\fR" ... 7 | .SH DESCRIPTION 8 | PANDASEQ is rather picky about the names of the paired-end Illumina reads. This is for your protection. It attempts to validate that the have correct tags and flow cell information. Naturally, this causes a number of problems, especially as some sequencing centres manipulate the tags. This program shows the information as PANDAseq gathers it out of the FASTQ header. 9 | 10 | If the output shows \fBhastag\fR to be false, then the \fB-B\fR flag should be passed to 11 | .BR pandaseq (1). 12 | 13 | .SH SEE ALSO 14 | .BR pandaseq (1). 15 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | # -*- makefile -*- 3 | # Sample debian/rules that uses debhelper. 4 | # 5 | # This file was originally written by Joey Hess and Craig Small. 6 | # As a special exception, when this file is copied by dh-make into a 7 | # dh-make output file, you may use that output file without restriction. 8 | # This special exception was added by Craig Small in version 0.37 of dh-make. 9 | # 10 | # Modified to make a template file for a multi-binary package with separated 11 | # build-arch and build-indep targets by Bill Allombert 2001 12 | 13 | # Uncomment this to turn on verbose mode. 14 | #export DH_VERBOSE=1 15 | 16 | # This has to be exported to make some magic below work. 
17 | export DH_OPTIONS 18 | 19 | 20 | %: 21 | dh $@ 22 | 23 | .PHONY: override_dh_strip 24 | override_dh_strip: 25 | dh_strip --dbg-package=pandaseq-dbg 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.a 2 | *.deps 3 | *.exe 4 | *.la 5 | *.lo 6 | *.log 7 | *.o 8 | *.pc 9 | *.rpm 10 | *.tar.gz 11 | *.trs 12 | *~ 13 | .deps 14 | .libs 15 | /*.vapi 16 | /Makefile 17 | Makefile.in 18 | PANDAseq-*.pkg 19 | aclocal.m4 20 | ar-lib 21 | autom4te.cache 22 | build-macos-pkg 23 | build-stamp 24 | check_parser 25 | compile 26 | config.* 27 | configure 28 | debian/*debhelper* 29 | debian/*substvars 30 | debian/files 31 | debian/libpandaseq*/ 32 | debian/pandaseq-dbg 33 | debian/pandaseq-dev 34 | debian/tmp/ 35 | depcomp 36 | install-sh 37 | libtool 38 | ltmain.sh 39 | m4/libtool.m4 40 | m4/lt*.m4 41 | missing 42 | mktable 43 | mock 44 | pandaseq 45 | pandaseq-checkid 46 | pandaseq-diff 47 | pandaseq-hang 48 | pandaseq.spec 49 | pandaxs 50 | pkgroot 51 | stamp-h1 52 | table.c 53 | table.h 54 | test-driver 55 | testing/reg-test 56 | testing/reg-test.c 57 | testing/setup.c 58 | mktable.dSYM 59 | -------------------------------------------------------------------------------- /nt.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 
8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | #ifndef NT_H 19 | # define NT_H 20 | extern char iupac_forward[32]; 21 | extern char iupac_reverse[32]; 22 | #endif 23 | -------------------------------------------------------------------------------- /testing/setup.vala: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2013 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
/**
 * Build the assembler configuration exercised by the regression tester.
 *
 * The testing Makefile compiles the C generated from this file twice, renaming
 * this function to create_assembler_control and create_assembler_experiment,
 * so whatever set-up is done here is applied to both libraries being compared.
 *
 * @param logger log proxy handed to the new assembler
 * @return the `assemble` member of a freshly constructed Panda.Assembler
 */
public Panda.Assemble create_assembler (Panda.LogProxy logger) {
	/* First constructor argument is left null; only the logger is supplied. */
	var assembler = new Panda.Assembler (null, logger);
	return assembler.assemble;
}
15 | This package is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | GNU General Public License for more details. 19 | . 20 | You should have received a copy of the GNU General Public License 21 | along with this program. If not, see . 22 | . 23 | On Debian systems, the complete text of the GNU General 24 | Public License version 3 can be found in "/usr/share/common-licenses/GPL-3". 25 | -------------------------------------------------------------------------------- /plugin_completely_miss_the_point.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | HELP("Filter out sequences that have mismatches in the overlap region.", "completely_miss_the_point:mismatches"); 6 | 7 | VER_INFO("1.0"); 8 | 9 | static bool check_func( 10 | PandaLogProxy logger, 11 | const panda_result_seq *sequence, 12 | void *user_data) { 13 | (void) logger; 14 | 15 | return sequence->overlap_mismatches <= (size_t) *(int *) user_data; 16 | } 17 | 18 | OPEN { 19 | int mismatches; 20 | 21 | (void) precheck; 22 | 23 | if (args == NULL || *args == '\0') { 24 | panda_log_proxy_write_str(logger, "Please supply the maximum allowed mismatches.\n"); 25 | return false; 26 | } 27 | errno = 0; 28 | mismatches = strtol(args, NULL, 10); 29 | if (errno != 0 || mismatches < 0 || (size_t) mismatches > PANDA_MAX_LEN) { 30 | panda_log_proxy_write_str(logger, "Bad maximum allowed mismatches.\n"); 31 | return false; 32 | } 33 | *check = check_func; 34 | *user_data = PANDA_STRUCT_DUP(&mismatches); 35 | *destroy = free; 36 | return true; 37 | } 38 | -------------------------------------------------------------------------------- /prob.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between 
#ifndef PROB_H
# define PROB_H

/*
 * Evaluates 10^(-score / 10) as a double. Expands to a call to pow(), which
 * this header does not declare itself; the including file must provide it.
 */
# define PROBABILITY(score) (pow(10.0, (-(double)(score)) / 10.0))
/* Upper bound applied by PHREDCLAMP. */
# define PHREDMAX 46
/*
 * Clamp x to the range 0..PHREDMAX. The argument can be evaluated up to three
 * times, so it must not have side effects.
 */
# define PHREDCLAMP(x) ((x) > PHREDMAX ? PHREDMAX : ((x) < 0 ? 0 : (x)))

#endif
| -T panda_qual 35 | -T panda_result 36 | -T panda_result_seq 37 | -T panda_seq_identifier 38 | -T PandaSet 39 | -T PandaSetup 40 | -T PandaTagging 41 | -T PandaTBld 42 | -T PandaTweakAssembler 43 | -T panda_tweak_assembler 44 | -T PandaTweakGeneral 45 | -T panda_tweak_general 46 | -T PandaWriter 47 | -T size_t 48 | -------------------------------------------------------------------------------- /plugin_min_phred.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | HELP("Ensure the minimum score of all the output bases is above a certain PHRED value.", "min_phred:value"); 5 | 6 | VER_INFO("1.0"); 7 | 8 | static bool check_func( 9 | PandaLogProxy logger, 10 | const panda_result_seq *sequence, 11 | void *user_data) { 12 | size_t it; 13 | 14 | (void) logger; 15 | 16 | for (it = 0; it < sequence->sequence_length; it++) { 17 | if (panda_result_phred(&sequence->sequence[it]) < *(int *) user_data) { 18 | return false; 19 | } 20 | } 21 | return true; 22 | } 23 | 24 | OPEN { 25 | long int value; 26 | char *endptr; 27 | 28 | (void) precheck; 29 | 30 | if (args == NULL || *args == '\0') { 31 | panda_log_proxy_write_str(logger, "Need a number for a PHRED score.\n"); 32 | return false; 33 | } 34 | 35 | value = strtol(args, &endptr, 10); 36 | if ((endptr != NULL && *endptr != '\0') || value < 0 || value > 127) { 37 | panda_log_proxy_write_str(logger, "PHRED score must be a number between 0 and 127.\n"); 38 | return false; 39 | } 40 | *check = check_func; 41 | *user_data = PANDA_STRUCT_DUP(&value); 42 | *destroy = free; 43 | return true; 44 | } 45 | -------------------------------------------------------------------------------- /m4/ag_check_uname_syscall.m4: -------------------------------------------------------------------------------- 1 | dnl @synopsis AG_CHECK_UNAME_SYSCALL 2 | dnl 3 | dnl Check that the POSIX compliant uname(2) call works properly. 
4 | dnl 5 | dnl @category C 6 | dnl @author Bruce Korb 7 | dnl @version 2001-12-01 8 | dnl @license GPLWithACException 9 | 10 | dnl DO NOT EDIT THIS FILE (ag_check_uname_syscall.m4) 11 | dnl 12 | dnl It has been AutoGen-ed Saturday December 1, 2001 at 09:21:28 PM PST 13 | dnl From the definitions bkorb.def 14 | dnl and the template file conftest.tpl 15 | dnl See: http://autogen.sf.net for a description of the AutoGen project 16 | 17 | AC_DEFUN([AG_CHECK_UNAME_SYSCALL],[ 18 | AC_MSG_CHECKING([whether uname(2) is POSIX]) 19 | AC_CACHE_VAL([ag_cv_uname_syscall],[ 20 | AC_TRY_RUN([#include 21 | int main() { struct utsname unm; 22 | return uname( &unm ); }],[ag_cv_uname_syscall=yes],[ag_cv_uname_syscall=no],[ag_cv_uname_syscall=no] 23 | ) # end of TRY_RUN]) # end of CACHE_VAL 24 | 25 | AC_MSG_RESULT([$ag_cv_uname_syscall]) 26 | if test x$ag_cv_uname_syscall = xyes 27 | then 28 | AC_DEFINE(HAVE_UNAME_SYSCALL, 1, 29 | [Define this if uname(2) is POSIX]) 30 | fi 31 | ]) # end of AC_DEFUN of AG_CHECK_UNAME_SYSCALL 32 | -------------------------------------------------------------------------------- /buffer.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
#ifndef BUFFER_H
# define BUFFER_H

# include "pandaseq.h"
/* Length passed for every entry currently listed in buffer.list. */
# define BUFFER_SIZE 1024

/*
 * X-macro: each BUFFER(name, type, length) entry in buffer.list becomes a
 * prototype for an accessor taking no arguments and returning `type *`.
 * With the current list (static, seqid) these are presumably static_buffer()
 * and seqid_buffer(), assuming PANDACONCAT pastes its arguments -- TODO
 * confirm. The `length` argument is not used by this expansion.
 */
# define BUFFER(name, type, length) type *PANDACONCAT(name, _buffer)(void);
# include "buffer.list"
# undef BUFFER

/*
 * printf-style formatting into `buffer`. No size parameter is passed;
 * presumably the buffer is expected to hold BUFFER_SIZE characters -- verify
 * against the implementation.
 */
void bufferprintf(
	char *buffer,
	char *fmt,
	...);

#endif
/*
 * Common header of an algorithm instance.
 */
struct panda_algorithm {
	/* Class descriptor for this instance (struct panda_algorithm_class is declared elsewhere). */
	const struct panda_algorithm_class *clazz;
	/* Reference count; presumably guarded by `mutex` when threads are enabled -- TODO confirm. */
	volatile size_t refcnt;
# ifdef HAVE_PTHREAD
	/* Only present in builds configured with pthread support. */
	pthread_mutex_t mutex;
# endif
	/* NOTE(review): looks like a marker for where algorithm-specific data begins -- confirm against algo.c. */
	void *end;
};
#!@SHELL@
# pandaxs: compile and link a PANDAseq module from C sources with libtool.
#
# Usage: pandaxs module.c [supplementary.c ...] [-lextralib ...]
#
# Arguments starting with "-" are passed through to both the compile and the
# link step. Every other argument is a C source file; the first one names the
# module (its basename without ".c"). The module is installed automatically
# when the module directory exists and is writable; otherwise the install
# command is printed.

# Start empty so that values inherited from the environment cannot leak into
# the flag list, the source list or the module name.
PT=""
SRC=""
MOD=""

for each
do
	# POSIX pattern match; "[[ ... == -* ]]" only works when @SHELL@ is bash-like.
	case "${each}" in
	-*)
		PT="${PT} ${each}"
		;;
	*)
		if [ "x${SRC}" = x ]
		then
			SRC="${each}"
			MOD=$(basename "${each}" .c)
		else
			SRC="${SRC} ${each}"
		fi
		;;
	esac
done

if [ "x${SRC}" = x ]
then
	echo "Error: No C file provided."
	echo "Usage: $0 module.c supplementary.c -lextralib ..."
	exit 1
fi

# prefix and exec_prefix are referenced by the substituted directory variables below.
prefix=@prefix@
exec_prefix=@exec_prefix@
LO_FILES=""
# panda_api.c is always compiled in so the module carries its API version symbol.
for each in @includedir@/@LIB_NAME@/panda_api.c $SRC
do
	LO="${MOD}-$(basename "${each}.lo")"
	@MODULE_LIBTOOL@ --tag=CC --mode=compile @CC@ -DPANDASEQ_MODULE=$MOD @DEFS@ -I@includedir@/@LIB_NAME@ @MODULE_CFLAGS@ -c -o "$LO" "$each" $PT || exit 1
	LO_FILES="$LO_FILES $LO"
done
@MODULE_LIBTOOL@ --tag=CC --mode=link @CC@ -module @MODULE_CFLAGS@ -export-dynamic -export-symbols-regex "^${MOD}_LTX_" @MODULE_LDFLAGS@ -rpath @libdir@/@PACKAGE@@LIB_MAJOR@ -o "${MOD}.la" $LO_FILES $PT || exit 1
# Two separate tests instead of the obsolescent "-a" operator.
if [ -d "@libdir@/@PACKAGE@@LIB_MAJOR@" ] && [ -w "@libdir@/@PACKAGE@@LIB_MAJOR@" ]
then
	@MODULE_LIBTOOL@ --mode=install @INSTALL@ "${MOD}.la" "@libdir@/@PACKAGE@@LIB_MAJOR@/${MOD}.la"
else
	echo To install: sudo @MODULE_LIBTOOL@ --mode=install @INSTALL@ "${MOD}.la" "@libdir@/@PACKAGE@@LIB_MAJOR@/${MOD}.la"
fi
{ 41 | panda_log_proxy_write_str(logger, "ERR\tBEFORE\tNO ID\n"); 42 | return false; 43 | } 44 | 45 | if (panda_seqid_parse(&data.marker_id, args[0] == '@' ? (args + 1) : args, PANDA_TAG_OPTIONAL) == 0) { 46 | panda_log_proxy_write_f(logger, "ERR\tBEFORE\tBAD\t%s\n", args); 47 | return false; 48 | } else { 49 | data.state = false; 50 | *precheck = precheck_func; 51 | *destroy = free; 52 | *user_data = PANDA_STRUCT_DUP(&data); 53 | return true; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /plugin_after.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | HELP("Include only sequences in the one after the provided sequence", "after:sequenceid"); 5 | 6 | VER_INFO("1.0"); 7 | 8 | struct data { 9 | panda_seq_identifier marker_id; 10 | bool state; 11 | }; 12 | 13 | static bool precheck_func( 14 | PandaLogProxy logger, 15 | const panda_seq_identifier *id, 16 | const panda_qual *forward, 17 | size_t forward_length, 18 | const panda_qual *reverse, 19 | size_t reverse_length, 20 | void *user_data) { 21 | 22 | struct data *data = (struct data *) user_data; 23 | 24 | (void) logger; 25 | (void) forward; 26 | (void) forward_length; 27 | (void) reverse; 28 | (void) reverse_length; 29 | 30 | if (panda_seqid_equal(&data->marker_id, id)) { 31 | data->state = true; 32 | } 33 | return data->state; 34 | } 35 | 36 | OPEN { 37 | struct data data; 38 | 39 | (void) check; 40 | 41 | if (args == NULL) { 42 | panda_log_proxy_write_str(logger, "ERR\tAFTER\tNO ID\n"); 43 | return false; 44 | } 45 | 46 | if (panda_seqid_parse(&data.marker_id, args[0] == '@' ? 
(args + 1) : args, PANDA_TAG_OPTIONAL) == 0) { 47 | panda_log_proxy_write_f(logger, "ERR\tAFTER\tBAD\t%s\n", args); 48 | return false; 49 | } else { 50 | data.state = false; 51 | *precheck = precheck_func; 52 | *destroy = free; 53 | *user_data = PANDA_STRUCT_DUP(&data); 54 | 55 | return true; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /plugin_min_overlapbits.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include"table.h" 8 | 9 | #ifndef M_LN2 10 | # define M_LN2 0.69314718055994530942 11 | #endif 12 | 13 | HELP("Check the number of bits saved (Cole 2013).", "min_overlapbits:15"); 14 | 15 | VER_INFO("1.0"); 16 | 17 | static bool check_func( 18 | PandaLogProxy logger, 19 | const panda_result_seq *sequence, 20 | void *user_data) { 21 | (void) logger; 22 | return *(double *) user_data <= sequence->estimated_overlap_probability; 23 | } 24 | 25 | OPEN { 26 | double bits_saved = 15 * M_LN2; // change from bits to nats 27 | char *remainder = NULL; 28 | double orig_value; 29 | 30 | (void) precheck; 31 | 32 | if (args != NULL) { 33 | errno = 0; 34 | orig_value = strtod(args, &remainder); 35 | bits_saved = orig_value * M_LN2; // change from bits to nats 36 | 37 | if (errno != 0) { 38 | panda_log_proxy_write_str(logger, "bits_saved"); 39 | return false; 40 | } else if (*remainder != '\0') { 41 | panda_log_proxy_write_f(logger, "bits_saved: trailing garbage: %s\n", remainder); 42 | return false; 43 | } 44 | if (bits_saved < 0) { 45 | panda_log_proxy_write_f(logger, "Value %f out of range for bits saved cut-off.", orig_value); 46 | return false; 47 | } 48 | } 49 | *check = check_func; 50 | *user_data = PANDA_STRUCT_DUP(&bits_saved); 51 | *destroy = free; 52 | return true; 53 | } 54 | -------------------------------------------------------------------------------- /pandaseq-diff.1: 
-------------------------------------------------------------------------------- 1 | '\" e 2 | .\" Authors: Andre Masella 3 | .EQ 4 | delim $$ 5 | .EN 6 | .EQ 7 | .EN 8 | .TH pandaseq-diff 1 "February 2014" "2.7" "USER COMMANDS" 9 | .SH NAME 10 | pandaseq-diff \- PAired-eND Assembler for DNA sequences Comparison 11 | .SH SYNOPSIS 12 | .B pandaseq-diff 13 | common 14 | .B -- 15 | control 16 | .B -- 17 | experimental 18 | .SH DESCRIPTION 19 | PANDASEQ has many knobs to adjust to try to produce the ideal sequence output. To facilitate selecting parameters, PANDASEQ-DIFF allows running experiments to see how different parameters affect assembly. 20 | .SH OPTIONS 21 | The options are largely the same as 22 | .BR pandaseq (1) 23 | except that no output is generated, so all options for writing data are not present. Options are given in three groups: options that will be used for both control and experiment, options only for the control, and options only for the experiment. All options related to input data must be in the common options pool. 24 | 25 | Normal logging is suppressed, so the option should first be tested with 26 | .BR pandaseq (1). 27 | 28 | .TP 29 | \-v 30 | When individual output base qualities are different, show the bases that are different and the differing quality scores. 31 | .SH EXAMPLES 32 | 33 | .B pandaseq-diff -f s_7_1.fastq.bz2 -r s_7_2.fastq.bz2 -- -t 0.9 -- -A rdp_mle 34 | 35 | This will assemble data from lane 7, comparing the original PANDAseq algorithm, with the output quality above 0.9, against the RDP MLE, using the default quality cut-off. 36 | .SH SEE ALSO 37 | .BR pandaseq (1). 38 | -------------------------------------------------------------------------------- /testing/README.md: -------------------------------------------------------------------------------- 1 | This is an experimental tool for determining if changes to PANDAseq affect the output. You will need Vala to compile it. 2 | 3 | 1. Install PANDAseq. 4 | 2. 
Install Vala (`sudo apt-get install valac` or `sudo yum install vala vala-tools`). 5 | 3. Modify PANDAseq and compile it, but do not install. 6 | 4. Compile the tester using `make`. 7 | 8 | If PANDAseq has been installed somewhere that pkg-config and Vala are not looking, set the `PREFIX` in the Makefile. 9 | 10 | 5. Run regression test on a sample dataset: 11 | 12 | ./reg-test -f mcbath_1.fastq.bz2 -r mcbath_2.fastq.bz2 13 | 14 | Each read pair will be assembled by both the existing and new assemblers, and the results compared. 15 | 16 | The recommended data set is the sample dataset provided at [McBath dataset](http://neufeldserver.uwaterloo.ca/~apmasell/pandaseq_sampledata.tar) or [a small subset](http://neufeldserver.uwaterloo.ca/~apmasell/pandaseq_sampledata_small.tar) that was used in the original publication. You can use the `-W` option to download and use these sequences automatically. 17 | 18 | This code makes use of strange `objcopy` behaviour and so requires that methods are not called on the assembler and that no ABI changes have occurred. It also is probably very non-portable, but does work on Linux. 19 | 20 | By default, the test suite will only compare the most basic setup of an assembler with no options. To compare other conditions, edit `setup.vala` to prepare the assembler in the desired way. The same configuration will be used for both the new and old library. 21 | -------------------------------------------------------------------------------- /module.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version.
8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | #ifndef PLUGIN_H 19 | # define PLUGIN_H 20 | # include "pandaseq.h" 21 | 22 | extern bool module_checkseq( 23 | PandaAssembler assembler, 24 | panda_result_seq *sequence); 25 | extern bool module_precheckseq( 26 | PandaAssembler assembler, 27 | panda_seq_identifier *id, 28 | const panda_qual *forward, 29 | size_t forward_length, 30 | const panda_qual *reverse, 31 | size_t reverse_length); 32 | extern void module_help( 33 | PandaAssembler assembler); 34 | extern void module_version( 35 | PandaAssembler assembler); 36 | extern bool module_init( 37 | PandaAssembler assembler); 38 | extern void module_cleanup( 39 | PandaAssembler assembler); 40 | extern void module_destroy( 41 | PandaAssembler assembler); 42 | 43 | extern void module_show_all( 44 | void); 45 | #endif 46 | -------------------------------------------------------------------------------- /vapi-url.in: -------------------------------------------------------------------------------- 1 | /* vim: set filetype=vala: */ 2 | /** 3 | * PANDAseq @VERSION@ Illumina assembler: URL Opener 4 | * 5 | * Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 6 | */ 7 | namespace Panda { 8 | /** 9 | * Decompress BZipped data. 10 | * @param source the underlying stream to decompress. 11 | * @param verbosity the BZip logging level. 12 | */ 13 | [CCode (cname = "panda_bz_decompress", cheader_filename = "pandaseq-url.h")] 14 | public BufferRead? bz_decompress (owned BufferRead? source, int verbosity = 0); 15 | 16 | /** 17 | * Open a URL and read the sequence. 
18 | * @param url the URL, as understood by cURL. 19 | */ 20 | [CCode (cname = "panda_open_url", cheader_filename = "pandaseq-url.h")] 21 | public BufferRead? open_url (string url, LogProxy logger); 22 | 23 | namespace cURL { 24 | /** 25 | * Increment the reference count on the cURL library. 26 | * 27 | * Since cURL needs to be initialised, PANDAseq will do this automatically when a URL is opened and automatically call the matching clean up when all readers have been disposed. 28 | * 29 | * If the program wishes to use cURL, it should call this method to increment the reference count on PANDAseq's internal counter, such that it will not clean up the cURL library while in use. 30 | * 31 | * @return whether the library was successfully initialised. 32 | */ 33 | [CCode (cname = "panda_curl_ref", cheader_filename = "pandaseq-url.h")] 34 | public bool @ref (); 35 | 36 | /** 37 | * Decrement the reference count on the cURL library. 38 | */ 39 | [CCode (cname = "panda_curl_unref", cheader_filename = "pandaseq-url.h")] 40 | public void unref (); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /pandaseq-hang.1: -------------------------------------------------------------------------------- 1 | .\" Authors: Andre Masella 2 | .TH pandaseq-hang 1 "May 2013" "1.0" "USER COMMANDS" 3 | .SH NAME 4 | pandaseq-hang \- PAired-eND Assembler for DNA sequences for files which overhang 5 | .SH SYNOPSIS 6 | .B pandaseq-hang 7 | [ 8 | .B \-P 9 | .I forwardtrim 10 | ] [ 11 | .B \-Q 12 | .I reversetrim 13 | ] [ 14 | .B \-s 15 | ] ... 16 | .SH DESCRIPTION 17 | PANDASEQ assembles paired-end Illumina reads into sequences, trying to correct for errors and uncalled bases. The regular PANDAseq software cannot cope with long overhangs where the reverse read extends past the first base of the forward read, and vice versa. This version can clip reads after a sequence has been recognised so that assembly works. 
For more information, see 18 | .BR pandaseq (1). 19 | .SH OPTIONS 20 | All parameters not listed here are identical to their 21 | .BR pandaseq (1) 22 | versions. 23 | .TP 24 | \-P forwardtrim 25 | Search for the provided nucleotide sequence in the forward read and discard all sequence including and after this sequence if it is found in the forward read. This sequence should be in the reverse primer from the original PCR. 26 | .TP 27 | \-Q reversetrim 28 | Search for the provided nucleotide sequence in the reverse read and discard all sequence including and after this sequence if it is found in the reverse read. This sequence should be in the forward primer from the original PCR. 29 | .TP 30 | \-s 31 | If the reads do not contain the trim sequences provided, attempt assembly anyway. If this is not provided, any reads not containing the trim sequences are discarded. This can be useful if the sequence length is just on the border between overhanging and normal. 32 | .SH SEE ALSO 33 | .BR pandaseq (1), 34 | .BR pandaxs (1). 35 | -------------------------------------------------------------------------------- /plugin_overlap_stat.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | HELP("Produce statistics on the overlaps examined.
Somewhat interesting to see the efficiency of the k-mer table.", "overlap_stat"); 5 | 6 | VER_INFO("1.0"); 7 | 8 | struct data { 9 | PandaWriter writer; 10 | size_t counts[]; 11 | }; 12 | 13 | static bool check_func( 14 | PandaLogProxy logger, 15 | const panda_result_seq *sequence, 16 | void *user_data) { 17 | 18 | struct data *data = (struct data *) user_data; 19 | 20 | (void) logger; 21 | 22 | if (sequence->overlaps_examined > 0) { 23 | data->counts[sequence->overlaps_examined - 1]++; 24 | } 25 | return true; 26 | } 27 | 28 | static void cleanup( 29 | struct data *data) { 30 | size_t it; 31 | size_t max; 32 | 33 | for (max = PANDA_MAX_LEN - 1; data->counts[max] == 0; max--) ; 34 | 35 | panda_writer_append(data->writer, "STAT\tEXAMINED"); 36 | for (it = 0; it <= max; it++) { 37 | panda_writer_append(data->writer, " %d", data->counts[it]); 38 | } 39 | panda_writer_append_c(data->writer, '\n'); 40 | panda_writer_commit(data->writer); 41 | panda_writer_unref(data->writer); 42 | free(data); 43 | } 44 | 45 | OPEN { 46 | struct data *data; 47 | size_t it; 48 | 49 | if (args != NULL && args[0] != '\0') { 50 | panda_log_proxy_write_str(logger, "ERR\tOVERLAPSTAT\n"); 51 | return false; 52 | } 53 | data = malloc(sizeof(struct data) + sizeof(size_t) * PANDA_MAX_LEN); 54 | for (it = 0; it < PANDA_MAX_LEN; it++) { 55 | data->counts[it] = 0; 56 | } 57 | data->writer = panda_writer_ref(panda_log_proxy_get_writer(logger)); 58 | *precheck = NULL; 59 | *check = check_func; 60 | *destroy = (PandaDestroy) cleanup; 61 | *user_data = data; 62 | return true; 63 | } 64 | -------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 
2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | #include "config.h" 19 | #include "pandaseq.h" 20 | #include "misc.h" 21 | 22 | int main( 23 | int argc, 24 | char **argv) { 25 | MANAGED_STACK(PandaOutputSeq, output); 26 | PandaAssembler assembler; 27 | PandaArgsFastq data = panda_args_fastq_new(); 28 | PandaMux mux; 29 | bool result; 30 | int threads; 31 | 32 | if (!panda_parse_args(argv, argc, panda_stdargs, panda_stdargs_length, panda_args_fastq_args, panda_args_fastq_args_length, (PandaTweakGeneral) panda_args_fastq_tweak, (PandaOpener) panda_args_fastq_opener, (PandaSetup) panda_args_fastq_setup, data, &assembler, &mux, &threads, &output, &output_data, &output_destroy)) { 33 | panda_args_fastq_free(data); 34 | return 1; 35 | } 36 | result = panda_run_pool(threads, assembler, mux, output, output_data, output_destroy); 37 | panda_args_fastq_free(data); 38 | return result ? 
0 : 1; 39 | } 40 | -------------------------------------------------------------------------------- /plugin_filter.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | HELP("Filters sequences based on the contents of a file of ids, one sequence ID per line.", "filter:file"); 5 | 6 | VER_INFO("1.0"); 7 | 8 | static bool precheck_func( 9 | PandaLogProxy logger, 10 | const panda_seq_identifier *id, 11 | const panda_qual *forward, 12 | size_t forward_length, 13 | const panda_qual *reverse, 14 | size_t reverse_length, 15 | void *user_data) { 16 | 17 | (void) logger; 18 | (void) forward; 19 | (void) forward_length; 20 | (void) reverse; 21 | (void) reverse_length; 22 | return panda_idset_contains((PandaSet) user_data, id); 23 | } 24 | 25 | OPEN { 26 | char buffer[1024]; 27 | bool close = false; 28 | FILE *file; 29 | PandaSet set; 30 | 31 | (void) check; 32 | 33 | if (args == NULL || *args == '\0') { 34 | file = stdin; 35 | } else { 36 | file = fopen(args, "r"); 37 | if (file == NULL) { 38 | panda_log_proxy_perror(logger, args); 39 | return false; 40 | } 41 | close = true; 42 | } 43 | set = panda_idset_new(); 44 | while (fgets(buffer, sizeof(buffer), file) != NULL) { 45 | int it; 46 | for (it = 0; buffer[it] != '\n'; it++) ; 47 | buffer[it] = '\0'; 48 | 49 | if (!panda_idset_add_str(set, buffer[0] == '@' ? 
(buffer + 1) : buffer, PANDA_TAG_OPTIONAL, NULL, NULL)) { 50 | panda_log_proxy_write_f(logger, "ERR\tFILTER\tBAD\t%s\n", buffer); 51 | if (close) 52 | fclose(file); 53 | return false; 54 | } 55 | } 56 | if (ferror(file)) { 57 | panda_log_proxy_perror(logger, args); 58 | if (close) 59 | fclose(file); 60 | return false; 61 | } 62 | if (close) 63 | fclose(file); 64 | *precheck = precheck_func; 65 | *user_data = set; 66 | *destroy = (PandaDestroy) panda_idset_unref; 67 | return true; 68 | } 69 | -------------------------------------------------------------------------------- /check_parser.c: -------------------------------------------------------------------------------- 1 | #define _POSIX_C_SOURCE 2 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "config.h" 9 | #include "pandaseq.h" 10 | 11 | typedef struct { 12 | const char *str; 13 | int dir; 14 | PandaIdFmt format; 15 | panda_seq_identifier id; 16 | } test_case; 17 | 18 | bool check( 19 | const test_case * test) { 20 | PandaIdFmt detected_format; 21 | panda_seq_identifier id; 22 | panda_seqid_clear(&id); 23 | int dir = panda_seqid_parse_fail(&id, test->str, PANDA_TAG_OPTIONAL, &detected_format, NULL); 24 | return panda_seqid_equal(&id, &test->id) && dir == test->dir && detected_format == test->format; 25 | } 26 | 27 | const test_case checks[] = { 28 | {"M01271:10:000000000-A3WGH:1:1101:18786:6175 1:N:0:1", 1, PANDA_IDFMT_CASAVA_1_7, {"M01271", "10", "000000000-A3WGH", 1, 1101, 18786, 6175, "1"}}, 29 | {"ILLUMINA-BE9C3F:29:FC:3:1:2462:1120 1:N:0:GCTATA", 1, PANDA_IDFMT_CASAVA_1_7, {"ILLUMINA-BE9C3F", "29", "FC", 3, 1, 2462, 1120, "GCTATA"}}, 30 | {"M00958:47:000000000-A3GH7:1:1101:15028:1512 2:N:0:3", 2, PANDA_IDFMT_CASAVA_1_7, {"M00958", "47", "000000000-A3GH7", 1, 1101, 15028, 1512, "3"}}, 31 | {"1468:1:1:12675:1118#ATCACGA/1", 1, PANDA_IDFMT_CASAVA_1_4, {"1468", "", "", 1, 1, 12675, 1118, "ATCACGA"}}, 32 | {"1468:1:1:12675:1118#ATCACGA/2", 2, PANDA_IDFMT_CASAVA_1_4, 
{"1468", "", "", 1, 1, 12675, 1118, "ATCACGA"}}, 33 | {"MISEQ03:18:000000000-A1REG:1:1101:14774:1712#GATAGTGCCAC/1", 1, PANDA_IDFMT_CASAVA_CONVERTED, {"MISEQ03", "18", "000000000-A1REG", 1, 1101, 14774, 1712, "GATAGTGCCAC"}} 34 | }; 35 | 36 | int main( 37 | ) { 38 | int exit_code = 0; 39 | for (size_t it = 0; it < sizeof(checks) / sizeof(*checks); it++) { 40 | if (!check(checks + it)) { 41 | fprintf(stderr, "FAILED: %s\n", checks[it].str); 42 | exit_code = 1; 43 | } 44 | } 45 | return exit_code; 46 | } 47 | -------------------------------------------------------------------------------- /pandaseq-plugin.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
/**
 * Copy an object into freshly allocated memory.
 * @item: the object to copy.
 * @size: the number of bytes to copy.
 *
 * Helper behind PANDA_STRUCT_DUP; use the macro rather than calling this.
 *
 * Returns: the copy, to be released with free(), or NULL if the allocation
 * failed.
 */
static inline void *panda_struct_dup_impl(
	const void *item,
	size_t size) {
	void *copy = malloc(size);

	/* Only copy into a real buffer; memcpy to NULL is undefined. */
	return copy == NULL ? NULL : memcpy(copy, item, size);
}

/**
 * Duplicate the object that item points to onto the heap.
 *
 * Evaluates to a void pointer to the copy (release it with free()), or NULL
 * if memory could not be allocated.
 */
#        define PANDA_STRUCT_DUP(item) panda_struct_dup_impl((item), sizeof(*(item)))
Illumina FASTQ data 26 | . 27 | This package allows reading of files on remote servers. 28 | 29 | Package: pandaseq-dev 30 | Architecture: any 31 | Depends: ${misc:Depends}, libpandaseq7 (= ${binary:Version}), libpandaseq-url0 (= ${binary:Version}), libtool 32 | Description: Pair-end read assembler -- Development tools 33 | PANDA assembles forward and reverse reads from Illumina FASTQ data 34 | . 35 | This package contains development tools for creating PANDAseq 36 | validation modules. If you are only assembling sequences, this is 37 | not necessary. 38 | 39 | Package: pandaseq-dbg 40 | Architecture: any 41 | Section: debug 42 | Priority: extra 43 | Depends: 44 | pandaseq (= ${binary:Version}), 45 | ${misc:Depends} 46 | Description: Pair-end read assembler -- Debugging symbols 47 | PANDA assembles forward and reverse reads from Illumina FASTQ data 48 | . 49 | This package contains debugging symbols. If you are only assembling 50 | sequences, this is not necessary. 51 | 52 | -------------------------------------------------------------------------------- /pandaseq-linebuf.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2013 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | 17 | */ 18 | 19 | #ifndef _PANDASEQ_LINEBUF_H 20 | # define _PANDASEQ_LINEBUF_H 21 | # ifdef __cplusplus 22 | # define EXTERN_C_BEGIN extern "C" { 23 | # define EXTERN_C_END } 24 | # else 25 | # define EXTERN_C_BEGIN 26 | # define EXTERN_C_END 27 | # endif 28 | # include 29 | EXTERN_C_BEGIN 30 | /* === Constructors === */ 31 | /** 32 | * Create a new line reader from a buffer reading source. 33 | * @read: (closure read_data) (scope notified): the function to do reading. 34 | */ 35 | PandaLineBuf panda_linebuf_new( 36 | PandaBufferRead read, 37 | void *read_data, 38 | PandaDestroy read_destroy); 39 | 40 | /* === Methods === */ 41 | /** 42 | * Destroy the line buffer. 43 | */ 44 | void panda_linebuf_free( 45 | PandaLineBuf linebuf); 46 | /** 47 | * Read the next line. 48 | * Returns: (transfer none) (allow-none): the next line in the file. This is only valid until the next call. 49 | */ 50 | const char *panda_linebuf_next( 51 | PandaLineBuf linebuf); 52 | EXTERN_C_END 53 | #endif 54 | -------------------------------------------------------------------------------- /main-diff.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2014 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | 17 | */ 18 | #include "config.h" 19 | #include "pandaseq.h" 20 | #include "misc.h" 21 | 22 | int main( 23 | int argc, 24 | char **argv) { 25 | MANAGED_STACK(PandaNextSeq, 26 | next); 27 | PandaAssembler control_assembler; 28 | PandaAssembler experimental_assembler; 29 | PandaArgsFastq data = panda_args_fastq_new(); 30 | bool result; 31 | bool suppress_quality_diffs; 32 | 33 | if (!panda_diff_parse_args(argv, argc, panda_stdargs, panda_stdargs_length, panda_args_fastq_args, panda_args_fastq_args_length, (PandaTweakGeneral) panda_args_fastq_tweak, (PandaOpener) panda_args_fastq_opener, (PandaSetup) panda_args_fastq_setup, data, &control_assembler, &experimental_assembler, &next, &next_data, &next_destroy, &suppress_quality_diffs)) { 34 | panda_args_fastq_free(data); 35 | DESTROY_STACK(next); 36 | return 1; 37 | } 38 | result = panda_diff(next, next_data, (PandaAssemble) panda_assembler_assemble, control_assembler, (PandaAssemble) panda_assembler_assemble, experimental_assembler, suppress_quality_diffs); 39 | DESTROY_STACK(next); 40 | panda_assembler_unref(control_assembler); 41 | panda_assembler_unref(experimental_assembler); 42 | panda_args_fastq_free(data); 43 | return result ? 
0 : 1; 44 | } 45 | -------------------------------------------------------------------------------- /pandaseq.spec.in: -------------------------------------------------------------------------------- 1 | Name: pandaseq 2 | Version: @PACKAGE_VERSION@ 3 | Release: 1%{?dist} 4 | Summary: Pair-end read assembly 5 | Group: Applications/Engineering 6 | 7 | License: GPLv3+ 8 | Source: https://github.com/neufeld/pandaseq/archive/v%{version}.tar.gz 9 | URL: http://github.com/neufeld/pandaseq 10 | 11 | BuildRequires: zlib-devel 12 | BuildRequires: bzip2-devel 13 | BuildRequires: libtool-ltdl-devel 14 | BuildRequires: autoconf 15 | BuildRequires: automake 16 | BuildRequires: libtool 17 | 18 | %description 19 | PANDA assembles forward and reverse reads from Illumina FASTQ data 20 | 21 | %package devel 22 | Summary: Pair-end read assembly -- Development tools 23 | Requires: libtool 24 | Requires: pandaseq-lib 25 | 26 | %description devel 27 | PANDA assembles forward and reverse reads from Illumina FASTQ data 28 | This package contains development tools for creating PANDAseq 29 | validation modules. If you are only assembling sequences, this is 30 | not necessary. 31 | 32 | %package lib 33 | Summary: Pair-end read assembly -- Libraries 34 | 35 | %description lib 36 | PANDA assembles forward and reverse reads from Illumina FASTQ data 37 | This package contains shared libraries. 
%clean
# Remove the staged install tree. This must be the RPM build-root macro;
# the shell-style ${buildroot} is an unset variable, so nothing was removed.
rm -rf %{buildroot}
16 | 17 | */ 18 | #include 19 | #include "config.h" 20 | #include "pandaseq.h" 21 | #include "misc.h" 22 | 23 | int main( 24 | int argc, 25 | char **argv) { 26 | MANAGED_STACK(PandaOutputSeq, output); 27 | PandaAssembler assembler; 28 | PandaArgsHang data = panda_args_hang_new(panda_args_fastq_new(), (PandaDestroy) panda_args_fastq_free, (PandaTweakGeneral) panda_args_fastq_tweak, (PandaOpener) panda_args_fastq_opener, (PandaSetup) panda_args_fastq_setup); 29 | const panda_tweak_general **general_args; 30 | size_t general_args_length; 31 | PandaMux mux; 32 | bool result; 33 | int threads; 34 | 35 | general_args = panda_args_hang_args(panda_args_fastq_args, panda_args_fastq_args_length, &general_args_length); 36 | 37 | if (!panda_parse_args(argv, argc, panda_stdargs, panda_stdargs_length, general_args, general_args_length, (PandaTweakGeneral) panda_args_hang_tweak, (PandaOpener) panda_args_hang_opener, (PandaSetup) panda_args_hang_setup, data, &assembler, &mux, &threads, &output, &output_data, &output_destroy)) { 38 | free(general_args); 39 | panda_args_hang_free(data); 40 | return 1; 41 | } 42 | free(general_args); 43 | result = panda_run_pool(threads, assembler, mux, output, output_data, output_destroy); 44 | panda_args_hang_free(data); 45 | return result ? 
0 : 1; 46 | } 47 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | addons: 3 | apt: 4 | packages: 5 | - groff 6 | env: 7 | - OPTIONS= GET_WIN= 8 | - OPTIONS="--host=i586-mingw32msvc --build=$(uname -m)-pc-linux-gnu --enable-shared 9 | --with-libcurl=./win" PKG_CONFIG_PATH=./win/lib/pkgconfig LDFLAGS="-L./win/lib" 10 | CPPFLAGS="-I./win/include" GET_WIN=true 11 | - OPTIONS=--disable-threads GET_WIN= 12 | - OPTIONS="--disable-threads --host=i586-mingw32msvc --build=$(uname -m)-pc-linux-gnu 13 | --enable-shared --with-libcurl=./win" PKG_CONFIG_PATH=./win/lib/pkgconfig LDFLAGS="-L./win/lib" 14 | CPPFLAGS="-I./win/include" GET_WIN=true 15 | matrix: 16 | include: 17 | - env: OPTIONS= GET_WIN= 18 | compiler: clang 19 | - env: OPTIONS=--disable-threads GET_WIN= 20 | compiler: clang 21 | before_script: 22 | - test "x$GET_WIN" = x || wget -O- https://storage.googleapis.com/pandaseq/pandaseq-windeps.tar.bz2 23 | | tar xj 24 | - "./autogen.sh" 25 | - test "x$GET_WIN" = x || export CC=i586-mingw32msvc-gcc 26 | script: "./configure $OPTIONS && make all && test x$GET_WIN = xtrue || make check" 27 | before_deploy: 28 | - mkdir www 29 | - man -Thtml ./pandaseq.1 > www/pandaseq.html 30 | notifications: 31 | on_success: never 32 | on_failure: change 33 | deploy: 34 | provider: gcs 35 | access_key_id: GOOGFGAJFDXIBCUHHUBRG67I 36 | secret_access_key: 37 | secure: 
HvMEAOFb3pc1LmFi8tmjBJVpuD2oINLBAQsx93SdvPqJZllpSQAxwPyzZdSyThk0eClUHdNkTX4notXjjnoPaxKUjnzVtSeet/WhwrDVLhB/cGc7rxTZC7dO/iaNgzW2GmBHCPwb2CfzJ2bmLgTIWQrUDDCYPkGNvVBZb8ATR/Cbc61QuLspjKA6vqQYQJG51y7CdGqjPeTwCTWxc/KlYGzIoodEwBfs9uLaVk/FUkj81lN89SG848ubEYxjzOUABE2pWI1NonImbhbSC5pdSaJBnQhuRinxyfDIC4FfLgvhbb0OYc58FtYJQE/SYLLw7tQb9AAgJmHxsn3WGFaYvrlAPL1DSe19jJoBRouKG9+as9h/r0+ndyVfkV3JEgcojxJb6GzxZdZgIoHIuXmnXnibgoyg/KkhGQjbMNKYoJ8Pl8BEeEHsqP3xvnT6MNRThVgOkxs+5t52kJ8ovHV7KVM4wuL4DhfClfSQXLXu+D+AqvwLWH4PxRHyVWCLJs4q/C6EFB9ivf5ze6wRPbke9haN6jtJ3hZCdzB95la5Ap3MFdR5HHYNWGBJ/OilYFHDVhBz/KEazN+qsw0vjtcX1WZXRJs0aXW9P6q8XgIzgKoXDvYAhbdbYZIe1utdv06VsctWhSNjdkWxVse948D/moI5vUEiFeVd8nVXSKEWe7Y= 38 | bucket: pandaseq 39 | skip_cleanup: true 40 | local_dir: www 41 | acl: public_read 42 | on: 43 | branch: master 44 | repo: neufeld/pandaseq 45 | -------------------------------------------------------------------------------- /m4/legacy_pkg.m4: -------------------------------------------------------------------------------- 1 | # pkg.m4 - Macros to make old packages look like pkg-config. -*- Autoconf -*- 2 | # 3 | # Copyright © 2014 Andre Masella . 4 | # Copyright © 2004 Scott James Remnant . 5 | # 6 | # This program is free software; you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation; either version 2 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program; if not, write to the Free Software 18 | # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
19 | # 20 | # As a special exception to the GNU General Public License, if you 21 | # distribute this file as part of a program that contains a 22 | # configuration script generated by Autoconf, you may include it under 23 | # the same distribution terms that you use for the rest of that program. 24 | 25 | # LEGACY_CHECK_MODULES(VARIABLE-PREFIX, HEADER, LIBRARY, FUNCTION, [OTHER-LIBS], 26 | # [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) 27 | # -------------------------------------------------------------- 28 | AC_DEFUN([LEGACY_CHECK_MODULES], 29 | [AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1])dnl 30 | AC_ARG_VAR([$1][_LIBS], [linker flags for $1])dnl 31 | 32 | pkg_original_libs="$LIBS" 33 | AC_CHECK_HEADERS([$2], [pkg_failed=no], [pkg_failed=yes]) 34 | if test $pkg_failed = no; then 35 | AC_CHECK_LIB([$3], [$4], [], [pkg_failed=yes], [$5]) 36 | fi 37 | 38 | m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS 39 | and $1[]_LIBS.]) 40 | 41 | if test $pkg_failed = yes; then 42 | pkg_failed=yes # make sure this block isn't empty 43 | $7 44 | else 45 | $1[]_CFLAGS="" 46 | $1[]_LIBS="$LIBS" 47 | LIBS="$pkg_original_libs" 48 | $6 49 | fi[]dnl 50 | ])# LEGACY_CHECK_MODULES 51 | -------------------------------------------------------------------------------- /plugin_other_primer.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | HELP("Remove reads with another primer. 
Use f for forward, r for reverse.", "other_primer:[fr]:NNNNN"); 5 | 6 | VER_INFO("1.0"); 7 | 8 | struct data { 9 | size_t primer_length; 10 | bool forward; 11 | panda_nt primer[1]; 12 | }; 13 | 14 | static bool precheck_func( 15 | PandaLogProxy logger, 16 | const panda_seq_identifier *id, 17 | const panda_qual *forward, 18 | size_t forward_length, 19 | const panda_qual *reverse, 20 | size_t reverse_length, 21 | struct data *data) { 22 | 23 | (void) logger; 24 | (void) id; 25 | 26 | return panda_compute_offset_qual(log(0.9), 0.01, !data->forward, data->forward ? forward : reverse, data->forward ? forward_length : reverse_length, data->primer, data->primer_length) == 0; 27 | } 28 | 29 | OPEN { 30 | struct data *data; 31 | bool forward; 32 | size_t it; 33 | 34 | (void) check; 35 | 36 | if (args == NULL || *args == '\0') { 37 | return false; 38 | } 39 | if (*args == 'f' || *args == 'p') { 40 | forward = true; 41 | } else if (*args == 'r' || *args == 'q') { 42 | forward = false; 43 | } else { 44 | panda_log_proxy_write_f(logger, "ERR\tOTHER_PRIMER\tINIT\tExpected f or r, but got %c.\n", (int) *args); 45 | return false; 46 | } 47 | args++; 48 | if (*args != ':') { 49 | panda_log_proxy_write_f(logger, "ERR\tOTHER_PRIMER\tINIT\tExpected :, but got %c.\n", (int) *args); 50 | return false; 51 | } 52 | args++; 53 | if (*args == '\0') { 54 | panda_log_proxy_write_f(logger, "ERR\tOTHER_PRIMER\tINIT\tPrimer cannot be empty.\n"); 55 | return false; 56 | } 57 | 58 | data = malloc(sizeof(struct data) + sizeof(panda_nt) * strlen(args)); 59 | data->forward = forward; 60 | data->primer_length = strlen(args); 61 | for (it = 0; it < data->primer_length; it++) { 62 | if ((data->primer[it] = (forward ? 
panda_nt_from_ascii : panda_nt_from_ascii_complement) (args[it])) == '\0') { 63 | panda_log_proxy_write_f(logger, "ERR\tOTHER_PRIMER\tBADNT\t%c\n", (int) args[it]); 64 | free(data); 65 | return false; 66 | } 67 | } 68 | 69 | *precheck = (PandaPreCheck) precheck_func; 70 | *user_data = data; 71 | *destroy = (PandaDestroy) free; 72 | return true; 73 | } 74 | -------------------------------------------------------------------------------- /assembler.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | 17 | */ 18 | 19 | #ifndef ASM_H 20 | # define ASM_H 21 | # include "config.h" 22 | # include "pandaseq.h" 23 | # include "misc.h" 24 | # ifdef HAVE_PTHREAD 25 | # include 26 | # endif 27 | 28 | struct panda_assembler { 29 | volatile size_t refcnt; 30 | 31 | MANAGED_MEMBER( 32 | PandaNextSeq, 33 | next); 34 | MANAGED_MEMBER( 35 | PandaFailAlign, 36 | noalgn); 37 | PandaLogProxy logger; 38 | 39 | size_t *rejected; 40 | PandaModule *modules; 41 | size_t modules_length; 42 | size_t modules_size; 43 | 44 | double threshold; 45 | size_t minoverlap; 46 | size_t maxoverlap; 47 | 48 | seqindex *kmerseen; 49 | size_t num_kmers; 50 | PandaAlgorithm algo; 51 | 52 | panda_result_seq result; 53 | 54 | size_t forward_primer_length; 55 | size_t reverse_primer_length; 56 | size_t forward_trim; 57 | size_t reverse_trim; 58 | 59 | long nofpcount; 60 | long norpcount; 61 | long okcount; 62 | long lowqcount; 63 | long degencount; 64 | long noalgncount; 65 | long badreadcount; 66 | long slowcount; 67 | long count; 68 | bool post_primers; 69 | # ifdef HAVE_PTHREAD 70 | pthread_mutex_t mutex; 71 | # endif 72 | panda_nt forward_primer[MAX_LEN]; 73 | panda_nt reverse_primer[MAX_LEN]; 74 | panda_result result_seq[2 * MAX_LEN]; 75 | long overlapcount[2 * MAX_LEN]; 76 | size_t longest_overlap; 77 | char name[MAX_LEN]; 78 | double primer_penalty; 79 | }; 80 | 81 | #endif 82 | -------------------------------------------------------------------------------- /misc.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 
8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | #include "config.h" 19 | #include 20 | #if HAVE_SYS_PARAM_H 21 | # include 22 | #endif 23 | #if HAVE_SYS_SYSCTL_H 24 | # include 25 | #endif 26 | #include "pandaseq.h" 27 | 28 | char const *panda_version( 29 | void) { 30 | return PACKAGE_STRING; 31 | } 32 | 33 | int panda_api_version( 34 | void) { 35 | return PANDA_API; 36 | } 37 | 38 | size_t panda_max_len( 39 | void) { 40 | return MAX_LEN; 41 | } 42 | 43 | #ifdef _WIN32 44 | # include 45 | int panda_get_default_worker_threads( 46 | void) { 47 | SYSTEM_INFO sysinfo; 48 | GetSystemInfo(&sysinfo); 49 | 50 | return (sysinfo.dwNumberOfProcessors < 1) ? 1 : sysinfo.dwNumberOfProcessors; 51 | } 52 | #elif defined(_SC_NPROCESSORS_ONLN) 53 | int panda_get_default_worker_threads( 54 | void) { 55 | int num_cpus = sysconf(_SC_NPROCESSORS_ONLN); 56 | return (num_cpus < 1) ? 1 : num_cpus; 57 | } 58 | #elif defined(HW_AVAILCPU) 59 | # include 60 | # include 61 | # include 62 | /* sysctl variant: query HW_AVAILCPU, fall back to HW_NCPU, and finally to a single worker. */ int panda_get_default_worker_threads( 63 | void) { 64 | int num_cpus = 0; /* stays below 1 when a sysctl call fails, which triggers the fallbacks */ 65 | int mib[4]; 66 | size_t len = sizeof(num_cpus); 67 | mib[0] = CTL_HW; 68 | mib[1] = HW_AVAILCPU; 69 | sysctl(mib, 2, &num_cpus, &len, NULL, 0); 70 | 71 | if (num_cpus < 1) { 72 | mib[1] = HW_NCPU; len = sizeof(num_cpus); /* len is in/out, so reset it before the second query */ 73 | sysctl(mib, 2, &num_cpus, &len, NULL, 0); 74 | } 75 | return (num_cpus < 1) ?
1 : num_cpus; 76 | } 77 | #else 78 | int panda_get_default_worker_threads( 79 | void) { 80 | return 1; 81 | } 82 | #endif 83 | -------------------------------------------------------------------------------- /pandaseq-nt.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | 19 | #ifndef _PANDASEQ_NT_H 20 | # define _PANDASEQ_NT_H 21 | # ifdef __cplusplus 22 | # define EXTERN_C_BEGIN extern "C" { 23 | # define EXTERN_C_END } 24 | # else 25 | # define EXTERN_C_BEGIN 26 | # define EXTERN_C_END 27 | # endif 28 | # include 29 | # include 30 | # include 31 | EXTERN_C_BEGIN 32 | /** 33 | * Nothing (invalid nucleotide) 34 | */ 35 | # define PANDA_NT_Z ((panda_nt)0) 36 | /** 37 | * Adenine 38 | */ 39 | # define PANDA_NT_A ((panda_nt)1) 40 | /** 41 | * Cytosine 42 | */ 43 | # define PANDA_NT_C ((panda_nt)2) 44 | /** 45 | * Guanine 46 | */ 47 | # define PANDA_NT_G ((panda_nt)4) 48 | /** 49 | * Thymine 50 | */ 51 | # define PANDA_NT_T ((panda_nt)8) 52 | /** 53 | * Is nucleotide degenerate?
54 | */ 55 | # define PANDA_NT_IS_DEGN(v) (((((unsigned int)(v)) * 0x200040008001ULL & 0x111111111111111ULL) % 0xf) != 1) 56 | /** 57 | * Is nucleotide all possible values? 58 | */ 59 | # define PANDA_NT_IS_N(n) ((n) == (panda_nt)0x0F) 60 | /** 61 | * Get the nucleotide code for an ASCII character in IUPAC 62 | */ 63 | panda_nt panda_nt_from_ascii( 64 | char c); 65 | /** 66 | * Get the complement nucleotide code for an ASCII character in IUPAC 67 | */ 68 | panda_nt panda_nt_from_ascii_complement( 69 | char c); 70 | /** 71 | * Get the complementary nucleotide. 72 | */ 73 | panda_nt panda_nt_complement( 74 | panda_nt nt); 75 | /** 76 | * Convert a nucleotide to an IUPAC representation 77 | */ 78 | char panda_nt_to_ascii( 79 | panda_nt val); 80 | EXTERN_C_END 81 | #endif 82 | -------------------------------------------------------------------------------- /bzstream.c: -------------------------------------------------------------------------------- 1 | #include "config.h" 2 | #include 3 | #include 4 | #include "pandaseq.h" 5 | #include "misc.h" 6 | 7 | struct bz_stream_data { 8 | bz_stream strm; 9 | size_t buffer_length; 10 | MANAGED_MEMBER( 11 | PandaBufferRead, 12 | source); 13 | char buffer[]; 14 | }; 15 | 16 | static bool read_stream( 17 | char *buffer, 18 | size_t buffer_length, 19 | size_t *read, 20 | struct bz_stream_data *data) { 21 | 22 | int ret; 23 | while (true) { 24 | if (data->strm.avail_in == 0) { 25 | size_t avail_in; 26 | data->strm.next_in = data->buffer; 27 | if (!data->source(data->buffer, data->buffer_length, &avail_in, data->source_data)) { 28 | *read = 0; 29 | return false; 30 | } 31 | if (avail_in == 0) { 32 | *read = 0; 33 | return true; 34 | } 35 | data->strm.avail_in = avail_in; 36 | } 37 | data->strm.next_out = buffer; 38 | data->strm.avail_out = buffer_length; 39 | ret = BZ2_bzDecompress(&data->strm); 40 | 41 | if (ret == BZ_OK || ret == BZ_STREAM_END) { 42 | *read = buffer_length - data->strm.avail_out; 43 | if (*read > 0) { 44 | return 
true; 45 | } 46 | } else { 47 | *read = 0; 48 | return false; 49 | } 50 | } 51 | } 52 | 53 | static void destroy_stream( 54 | struct bz_stream_data *data) { 55 | BZ2_bzDecompressEnd(&data->strm); 56 | DESTROY_MEMBER(data, source); 57 | free(data); 58 | } 59 | 60 | #define BUFFER_SIZE 1024 61 | PandaBufferRead panda_bz_decompress( 62 | PandaBufferRead source, 63 | void *source_data, 64 | PandaDestroy source_destroy, 65 | int verbosity, 66 | void **user_data, 67 | PandaDestroy *destroy) { 68 | 69 | struct bz_stream_data *data; 70 | 71 | if (source == NULL) { 72 | *user_data = NULL; 73 | *destroy = NULL; 74 | return NULL; 75 | } 76 | 77 | data = malloc(sizeof(struct bz_stream_data) + BUFFER_SIZE); 78 | data->buffer_length = BUFFER_SIZE; 79 | 80 | data->strm.bzalloc = NULL; 81 | data->strm.bzfree = NULL; 82 | data->strm.opaque = NULL; 83 | if (BZ2_bzDecompressInit(&data->strm, verbosity, 0) == BZ_OK) { 84 | data->strm.avail_in = 0; 85 | 86 | data->source = source; 87 | data->source_data = source_data; 88 | data->source_destroy = source_destroy; 89 | 90 | *user_data = data; 91 | *destroy = (PandaDestroy) destroy_stream; 92 | return (PandaBufferRead) read_stream; 93 | } else { 94 | free(data); 95 | source_destroy(source_data); 96 | *user_data = NULL; 97 | *destroy = NULL; 98 | return NULL; 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /plugin_validtag.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | HELP("Filter out any sequences without a valid index tag.", "validtag:TAG1:TAG2:TAG3"); 6 | VER_INFO("1.0"); 7 | 8 | struct data { 9 | char **tags; 10 | char *tag_data; 11 | int numtags; 12 | int taglen; 13 | }; 14 | 15 | static bool precheck_func( 16 | PandaLogProxy logger, 17 | const panda_seq_identifier *id, 18 | const panda_qual *forward, 19 | size_t forward_length, 20 | const panda_qual *reverse, 21 | size_t reverse_length, 22 | void 
*user_data) { 23 | struct data *data = (struct data *) user_data; 24 | 25 | int it; 26 | const char *tag = id->tag; 27 | (void) logger; 28 | (void) forward; 29 | (void) forward_length; 30 | (void) reverse; 31 | (void) reverse_length; 32 | if (tag == NULL) 33 | return false; 34 | 35 | for (it = 0; it < data->numtags; it++) { 36 | if (strncmp(tag, data->tags[it], data->taglen) == 0) { 37 | return true; 38 | } 39 | } 40 | return false; 41 | } 42 | 43 | static void destroy_func( 44 | struct data *data) { 45 | free(data->tags); 46 | free(data->tag_data); 47 | free(data); 48 | } 49 | 50 | /* Parse "TAG1:TAG2:..." (all tags the same length) into a private copy split at the colons, then register the precheck. */ OPEN { 51 | struct data data; 52 | const char *it = args; 53 | char *wit; 54 | char **currtag; 55 | 56 | (void) check; 57 | data.numtags = 1; 58 | data.taglen = 0; 59 | 60 | if (args == NULL) { 61 | panda_log_proxy_write_f(logger, "ERR\tVALTAG\tNOTAGS\n"); 62 | return false; 63 | } 64 | while (*it != '\0' && *it != ':') { 65 | data.taglen++; 66 | it++; 67 | } 68 | if (data.taglen == 0) { 69 | panda_log_proxy_write_f(logger, "ERR\tVALTAG\tNOTAGS\n"); 70 | return false; 71 | } 72 | 73 | if (*it != '\0') { 74 | it++; 75 | while (*it != '\0') { 76 | int currtaglen = 0; 77 | while (*it != ':' && *it != '\0') { 78 | it++; 79 | currtaglen++; 80 | } 81 | data.numtags++; 82 | if (*it == ':') 83 | it++; 84 | if (currtaglen != data.taglen) { 85 | panda_log_proxy_write_f(logger, "ERR\tVALTAG\tBADTLEN\t%d != %d %s\n", currtaglen, data.taglen, it - currtaglen); 86 | return false; 87 | } 88 | } 89 | } 90 | 91 | data.tags = malloc(sizeof(char *) * data.numtags); 92 | data.tag_data = malloc(strlen(args) + 1); if (data.tags == NULL || data.tag_data == NULL) { /* out of memory: release whichever half succeeded */ free(data.tags); free(data.tag_data); return false; } 93 | memcpy(data.tag_data, args, strlen(args) + 1); 94 | currtag = data.tags; 95 | wit = data.tag_data; 96 | *currtag++ = wit; 97 | while (*wit != '\0') { 98 | if (*wit == ':') { 99 | *wit = '\0'; 100 | if (wit[1] != '\0' && currtag < data.tags + data.numtags) *currtag++ = wit + 1; /* a trailing ':' was not counted in numtags, so it must not store a pointer or step past the terminator */ 101 | } 102 | wit++; 103 | } 104 | 105 | *precheck = precheck_func; 106 | *user_data = PANDA_STRUCT_DUP(&data); 107 | *destroy = (PandaDestroy) destroy_func; 108 | return true;
109 | } 110 | -------------------------------------------------------------------------------- /buffer.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | #include "config.h" 19 | #include 20 | #include 21 | #include 22 | #ifdef HAVE_PTHREAD 23 | # include 24 | #endif 25 | #include "buffer.h" 26 | 27 | PandaDebug panda_debug_flags = PANDA_DEBUG_DEFAULT; 28 | 29 | #if HAVE_PTHREAD 30 | 31 | # define BUFFER(name, type, size) pthread_key_t PANDACONCAT(name, _key); 32 | # include "buffer.list" 33 | # undef BUFFER 34 | 35 | __attribute__ ((constructor)) 36 | static void lib_init( 37 | void) { 38 | # define BUFFER(name, type, size) pthread_key_create(&PANDACONCAT(name, _key), free); 39 | # include "buffer.list" 40 | # undef BUFFER 41 | } 42 | 43 | __attribute__ ((destructor)) 44 | void lib_destroy( 45 | void) { 46 | # define BUFFER(name, type, size) free(pthread_getspecific(PANDACONCAT(name, _key))); pthread_key_delete(PANDACONCAT(name, _key)); 47 | # include "buffer.list" 48 | # undef BUFFER 49 | } 50 | 51 | static void *get_buffer( 52 | pthread_key_t key, 53 | size_t size) { 54 | void *buffer; 55 | if ((buffer = pthread_getspecific(key)) == NULL) {
56 | buffer = malloc(size); 57 | pthread_setspecific(key, buffer); 58 | } 59 | return buffer; 60 | } 61 | 62 | # define BUFFER(name, type, size) type *PANDACONCAT(name, _buffer)(void) { return get_buffer(PANDACONCAT(name, _key), sizeof(type) * size); } 63 | # include "buffer.list" 64 | # undef BUFFER 65 | #else 66 | # define BUFFER(name, type, size) static type PANDACONCAT(name, buffer)[size]; type *PANDACONCAT(name, _buffer)(void) { return PANDACONCAT(name, buffer); } 67 | # include "buffer.list" 68 | # undef BUFFER 69 | #endif 70 | 71 | void bufferprintf( 72 | char *buffer, 73 | char *fmt, 74 | ...) { 75 | va_list va; 76 | va_start(va, fmt); 77 | (void) vsnprintf(buffer, BUFFER_SIZE, fmt, va); 78 | va_end(va); 79 | } 80 | -------------------------------------------------------------------------------- /pandaseq-set.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | 17 | */ 18 | 19 | #ifndef _PANDASEQ_SET_H 20 | # define _PANDASEQ_SET_H 21 | # ifdef __cplusplus 22 | # define EXTERN_C_BEGIN extern "C" { 23 | # define EXTERN_C_END } 24 | # else 25 | # define EXTERN_C_BEGIN 26 | # define EXTERN_C_END 27 | # endif 28 | # include 29 | # include 30 | # include 31 | EXTERN_C_BEGIN 32 | /* === Constructor === */ 33 | /** 34 | * Create a new, empty set. 35 | */ 36 | PandaSet panda_idset_new( 37 | void); 38 | 39 | /* === Methods === */ 40 | 41 | /** 42 | * Add a sequence identifier to a set. 43 | */ 44 | void panda_idset_add( 45 | PandaSet set, 46 | const panda_seq_identifier *id); 47 | 48 | /** 49 | * Parse a sequence identifier and add it to the set. 50 | * @id: the text id to parse 51 | * @detected_format: (out): What pipeline produced this header 52 | * @end_ptr: (out) (transfer none): The point in the input where parsing stopped. If parsing was successful, this will be the end of the string. 53 | * Returns: true on success 54 | * @see panda_seqid_parse_fail 55 | */ 56 | bool panda_idset_add_str( 57 | PandaSet set, 58 | const char *id, 59 | PandaTagging policy, 60 | PandaIdFmt *detected_format, 61 | const char **end_ptr); 62 | 63 | /** 64 | * Check if a sequence identifier has been added to the set. 65 | */ 66 | bool panda_idset_contains( 67 | PandaSet set, 68 | const panda_seq_identifier *id); 69 | 70 | /** 71 | * Increase the reference count on a set. 72 | * 73 | * This is thread-safe. 74 | */ 75 | PandaSet panda_idset_ref( 76 | PandaSet set); 77 | 78 | /** 79 | * Decrease the reference count on a set. 80 | * 81 | * This is thread-safe. 82 | * @set: (transfer full): the set to be released. 
83 | */ 84 | void panda_idset_unref( 85 | PandaSet set); 86 | EXTERN_C_END 87 | #endif 88 | -------------------------------------------------------------------------------- /pandaseq-url.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2013 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | #ifndef _PANDASEQ_URL_H 19 | # define _PANDASEQ_URL_H 20 | # include 21 | # ifdef __cplusplus 22 | # define EXTERN_C_BEGIN extern "C" { 23 | # define EXTERN_C_END } 24 | # else 25 | # define EXTERN_C_BEGIN 26 | # define EXTERN_C_END 27 | # endif 28 | EXTERN_C_BEGIN 29 | /** 30 | * Decompress BZipped data. 31 | * @source: (closure source_data) (scope notified) (allow-none): the underlying stream to decompress. 32 | * @verbosity: the BZip logging level. 33 | * Returns:(closure data) (scope notified) (allow-none): the read function. 34 | */ 35 | PandaBufferRead panda_bz_decompress( 36 | PandaBufferRead source, 37 | void *source_data, 38 | PandaDestroy source_destroy, 39 | int verbosity, 40 | void **user_data, 41 | PandaDestroy *destroy); 42 | 43 | /** 44 | * Open a URL and read the sequence. 45 | * @url: the URL, as understood by cURL. 46 | * Returns:(closure data) (scope notified) (allow-none): the read function. 
47 | */ 48 | PandaBufferRead panda_open_url( 49 | const char *url, 50 | PandaLogProxy logger, 51 | void **data, 52 | PandaDestroy *destroy); 53 | 54 | /** 55 | * Increment the reference count on the cURL library. 56 | * 57 | * Since cURL needs to be initialised, PANDAseq will do this automatically when a URL is opened and automatically call the matching clean up when all readers have been disposed. 58 | * 59 | * If the program wishes to use cURL, it should call this method to increment the reference count on PANDAseq's internal counter, such that it will not clean up the cURL library while in use. 60 | * 61 | * Returns: whether the library was successfully initialised. 62 | */ 63 | bool panda_curl_ref( 64 | void); 65 | 66 | /** 67 | * Decrement the reference count on the cURL library. 68 | */ 69 | void panda_curl_unref( 70 | void); 71 | 72 | EXTERN_C_END 73 | #endif 74 | -------------------------------------------------------------------------------- /pandaxs.1: -------------------------------------------------------------------------------- 1 | .\" Authors: Andre Masella 2 | .TH pandaxs 1 "June 2011" "2.0" "USER COMMANDS" 3 | .SH NAME 4 | pandaxs \- PAired-eND Assembler for DNA sequences plugin compiler 5 | .SH SYNOPSIS 6 | .B pandaxs 7 | .I source.c 8 | .I FLAGS 9 | .SH DESCRIPTION 10 | PANDASEQ assembles paired-end Illumina reads into sequences, trying to correct for errors and uncalled bases. Sequences can be validated by arbitrary user modules. 11 | .B pandaxs 12 | facilitates the compilation of such modules using GNU 13 | .BR libtool(1) 14 | and the installed compiler. 15 | .SH OPTIONS 16 | .TP 17 | source.c 18 | A C program source, with the structure defined below, to be compiled. 19 | .TP 20 | FLAGS 21 | If the module requires libraries or other C files, they may be included. In fact, any typical CFLAGS or LDFLAGS may be included.
22 | .SH UNDERSTANDING VALIDATION 23 | Essentially, PANDAseq will assemble a sequence and then send the output to each validation module the user has specified with the \fB-C\fR option. Each module can then analyse the reconstruction and elect to keep or discard the sequence. Modules are given two options: \fBCHECK\fR and \fBPRECHECK\fR, which are done after and before sequence reconstruction, respectively. If a sequence can be discarded using only the information from the FASTQ file (i.e., the header and the reads), then a discard from a \fBPRECHECK\fR will be much faster as PANDAseq need not spend the time assembling the sequence. 24 | 25 | Validation modules are not permitted to modify the sequence. 26 | .SH STRUCTURE 27 | Writing a module requires familiarity with the C programming language. Various macros are available to assist in creating correct output. Generally: 28 | 29 | .IP 1. 30 | Copy the \fBsample.c\fR included in the \fBshare/doc/pandaseq\fR directory. 31 | .IP 2. 32 | Modify it to do your desired validation. You may include any needed libraries. 33 | .IP 3. 34 | Use \fBpandaxs\fR to compile and install your module, linking against any needed libraries. 35 | 36 | .P 37 | In multi-threaded situations, \fIno\fR guarantees are made about multiple entries into your functions; the module must be re-entrant or do its own locking. Memory management is not handled, though \fBCLEANUP\fR will be called if \fBINIT\fR has been called. It is possible that \fBINIT\fR will be called again. 38 | 39 | Sequences will not necessarily be presented in order in multi-threaded conditions and the sequences of multiple files can be intermingled. In this way, both \fBafter\fR and \fBbefore\fR are broken. 40 | 41 | Modules are checked for ABI compatibility. A module will fail to load if the PANDAseq ABI has changed since it was compiled. 42 | 43 | There is no standard for the format of arguments to modules, but spaces are best avoided.
44 | 45 | .SH SEE ALSO 46 | .BR libtool (1), 47 | .BR pandaseq (1). 48 | -------------------------------------------------------------------------------- /linebuf.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | 17 | */ 18 | #include "config.h" 19 | #include 20 | #include 21 | #include "pandaseq.h" 22 | #include "misc.h" 23 | 24 | struct panda_linebuf { 25 | char data[10 * MAX_LEN]; 26 | size_t data_length; 27 | size_t offset; 28 | MANAGED_MEMBER( 29 | PandaBufferRead, 30 | read); 31 | }; 32 | 33 | PandaLineBuf panda_linebuf_new( 34 | PandaBufferRead read, 35 | void *read_data, 36 | PandaDestroy read_destroy) { 37 | PandaLineBuf buffer; 38 | if (read == NULL) 39 | return NULL; 40 | buffer = malloc(sizeof(struct panda_linebuf)); 41 | buffer->data_length = 0; 42 | buffer->offset = 0; 43 | buffer->read = read; 44 | buffer->read_data = read_data; 45 | buffer->read_destroy = read_destroy; 46 | return buffer; 47 | } 48 | 49 | void panda_linebuf_free( 50 | PandaLineBuf linebuf) { 51 | if (linebuf == NULL) 52 | return; 53 | DESTROY_MEMBER(linebuf, read); 54 | free(linebuf); 55 | } 56 | 57 | const char *panda_linebuf_next( 58 | PandaLineBuf linebuf) { 59 | char *end; 60 | if (linebuf->offset > 0) { 61 | memmove(linebuf->data, linebuf->data + linebuf->offset, linebuf->data_length - linebuf->offset); 62 | linebuf->data_length -= linebuf->offset; 63 | linebuf->offset = 0; 64 | } 65 | 66 | while ((end = memchr(linebuf->data, '\n', linebuf->data_length)) == NULL && linebuf->data_length < sizeof(linebuf->data)) { 67 | size_t new_bytes = 0; 68 | if (!linebuf->read(linebuf->data + linebuf->data_length, sizeof(linebuf->data) - linebuf->data_length, &new_bytes, linebuf->read_data)) { 69 | return NULL; 70 | } 71 | if (new_bytes == 0) { 72 | end = linebuf->data + linebuf->data_length + 1; 73 | break; 74 | } 75 | linebuf->data_length += new_bytes; 76 | } 77 | if (end == NULL || linebuf->data_length == 0 || *end == '\0') { 78 | return NULL; 79 | } 80 | 81 | /* White out any carriage returns if we get DOS-formatted files. 
*/ 82 | if (end != linebuf->data && end[-1] == '\r') { 83 | end[-1] = '\0'; 84 | } 85 | 86 | *end = '\0'; 87 | linebuf->offset = end - linebuf->data + 1; 88 | return linebuf->data; 89 | } 90 | -------------------------------------------------------------------------------- /algo_ea_util.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2013 Andre Masella 3 | 4 | Based on the work of Expression Analysis / Erik Aronesty: 5 | https://code.google.com/p/ea-utils/wiki/FastqJoin 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with this program. If not, see . 
19 | 20 | */ 21 | #include "config.h" 22 | #include 23 | #include 24 | #include 25 | #include "pandaseq.h" 26 | #include "prob.h" 27 | #include "table.h" 28 | 29 | static double overlap_probability( 30 | void *data, 31 | const panda_qual *forward, 32 | size_t forward_length, 33 | const panda_qual *reverse, 34 | size_t reverse_length, 35 | size_t overlap) { 36 | size_t mismatches = 0; 37 | size_t i; 38 | size_t real_overlap = 0; 39 | 40 | (void) data; 41 | 42 | for (i = 0; i < overlap; i++) { 43 | int findex = forward_length + i - overlap; 44 | int rindex = reverse_length - i - 1; 45 | if (findex < 0 || rindex < 0 || (size_t) findex >= forward_length || (size_t) rindex >= reverse_length) 46 | continue; 47 | panda_nt f = forward[findex].nt; 48 | panda_nt r = reverse[rindex].nt; 49 | if (PANDA_NT_IS_N(f) || PANDA_NT_IS_N(r) || (f & r) == 0) { 50 | mismatches++; 51 | } 52 | real_overlap++; 53 | } 54 | 55 | return log((((double) mismatches) * mismatches + 1) / real_overlap); 56 | } 57 | 58 | static double match_probability( 59 | void *data, 60 | bool match, 61 | char a, 62 | char b) { 63 | int score = (a > b) ? 
PHREDCLAMP(a) : PHREDCLAMP(b); 64 | (void) data; 65 | (void) match; 66 | return qual_score[score]; 67 | } 68 | 69 | static PandaAlgorithm from_string( 70 | const char *argument) { 71 | 72 | if (argument != NULL) { 73 | fprintf(stderr, "No arguments allowed: %s\n", argument); 74 | return NULL; 75 | } 76 | return panda_algorithm_ea_util_new(); 77 | } 78 | 79 | const struct panda_algorithm_class panda_algorithm_ea_util_class = { 80 | .data_size = 0, 81 | .name = "ea_util", 82 | .create = from_string, 83 | .data_destroy = NULL, 84 | .overlap_probability = (PandaComputeOverlap) overlap_probability, 85 | .match_probability = (PandaComputeMatch) match_probability, 86 | .prob_unpaired = qual_nn_simple_bayesian, 87 | }; 88 | 89 | PandaAlgorithm panda_algorithm_ea_util_new( 90 | void) { 91 | return panda_algorithm_new(&panda_algorithm_ea_util_class); 92 | } 93 | -------------------------------------------------------------------------------- /algo_stitch.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2013 Andre Masella 3 | 4 | [stitch](https://github.com/audy/stitch) 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
17 | 18 | */ 19 | #include "config.h" 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "pandaseq.h" 25 | #include "prob.h" 26 | #include "table.h" 27 | 
/*
 * Score a candidate overlap in the style of stitch.
 *
 * Every in-range position of the overlap is compared: a pair of compatible
 * bases (f & r != 0) adds one to the score, an incompatible pair subtracts
 * one, and a position where either base is N contributes nothing.
 *
 * @data: unused.
 * @forward: (array length=forward_length): the forward read.
 * @reverse: (array length=reverse_length): the reverse read.
 * @overlap: the number of positions to compare.
 * Returns: log(score / (forward_length + reverse_length)), or -INFINITY when
 * the score is not positive (mismatches are at least as common as matches).
 */
static double overlap_probability(
	void *data,
	const panda_qual *forward,
	size_t forward_length,
	const panda_qual *reverse,
	size_t reverse_length,
	size_t overlap) {
	/* Signed on purpose: mismatches subtract, so the running total can go
	 * below zero. An unsigned accumulator would wrap around and turn the
	 * worst overlaps into enormous positive scores. */
	long score = 0;
	size_t i;

	(void) data;

	for (i = 0; i < overlap; i++) {
		int findex = forward_length + i - overlap;
		int rindex = reverse_length - i - 1;
		if (findex < 0 || rindex < 0 || (size_t) findex >= forward_length || (size_t) rindex >= reverse_length)
			continue;
		panda_nt f = forward[findex].nt;
		panda_nt r = reverse[rindex].nt;
		if (PANDA_NT_IS_N(f) || PANDA_NT_IS_N(r)) {
			/* Unknown bases neither support nor contradict the overlap. */
			continue;
		}
		score += ((f & r) != 0) ? 1 : -1;
	}

	/* log() is undefined for negative arguments and has a pole at zero;
	 * report "no support at all" explicitly instead. */
	if (score <= 0)
		return -INFINITY;

	return log(score / (double) (forward_length + reverse_length));
}
58 | 59 | static double match_probability( 60 | void *data, 61 | bool match, 62 | char a, 63 | char b) { 64 | (void) data; 65 | /* Stitch doesn't reconstruct quality scores, so we just do what simple Bayes does. */ 66 | return (match ? 
qual_match_simple_bayesian : qual_mismatch_simple_bayesian)[PHREDCLAMP(a)][PHREDCLAMP(b)]; 67 | } 68 | 69 | static PandaAlgorithm from_string( 70 | const char *argument) { 71 | 72 | if (argument != NULL) { 73 | fprintf(stderr, "No arguments allowed: %s\n", argument); 74 | return NULL; 75 | } 76 | return panda_algorithm_stitch_new(); 77 | } 78 | 79 | const struct panda_algorithm_class panda_algorithm_stitch_class = { 80 | .data_size = 0, 81 | .name = "stitch", 82 | .create = from_string, 83 | .data_destroy = NULL, 84 | .overlap_probability = (PandaComputeOverlap) overlap_probability, 85 | .match_probability = (PandaComputeMatch) match_probability, 86 | .prob_unpaired = qual_nn_simple_bayesian, 87 | }; 88 | 89 | PandaAlgorithm panda_algorithm_stitch_new( 90 | void) { 91 | return panda_algorithm_new(&panda_algorithm_stitch_class); 92 | } 93 | -------------------------------------------------------------------------------- /misc.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | 17 | */ 18 | #ifndef MISC_H 19 | # define MISC_H 20 | 21 | # include 22 | # include 23 | # include "pandaseq.h" 24 | 25 | # define DESTROY_MEMBER(self, name) if ((self)->name ## _destroy != NULL && (self)->name != NULL) { (self)->name ## _destroy((self)->name ## _data); } (self)->name = NULL; (self)->name ## _data = NULL; (self)->name ## _destroy = NULL 26 | # define DESTROY_STACK(name) if (name ## _destroy != NULL && name != NULL) { name ## _destroy(name ## _data); } name = NULL; name ## _data = NULL; name ## _destroy = NULL 27 | # define MANAGED_MEMBER(type, name) type name; void * name ## _data; PandaDestroy name ## _destroy 28 | # define MANAGED_STACK(type, name) type name = NULL; void * name ## _data = NULL; PandaDestroy name ## _destroy = NULL 29 | # define MAYBE(x) if (x != NULL) *x 30 | # define free0(val) if ((val) != NULL) free(val); (val) = NULL 31 | 32 | typedef unsigned short seqindex; 33 | # define KMER_LEN ((size_t) 8) 34 | # define KMERSEEN_SIZE(num_kmers) (sizeof(seqindex) * (num_kmers) * (1 << (2 * KMER_LEN))) 35 | 36 | typedef struct { 37 | size_t kmer; 38 | ptrdiff_t posn; 39 | ptrdiff_t bad; 40 | } kmer_it; 41 | # define _FOREACH_KMER(iterator,sequence,suffix,start,badstart,check,step,badreset) for ((iterator).posn = (start), (iterator).kmer = 0, (iterator).bad = badstart; (iterator).posn check; (iterator).posn step) if ((iterator).kmer = (((iterator).kmer << 2) | ((sequence)[(iterator).posn]suffix == PANDA_NT_T ? 3 : (sequence)[(iterator).posn]suffix == PANDA_NT_G ? 2 : (sequence)[(iterator).posn]suffix == PANDA_NT_C ? 
1 : 0)) & ((1 << (2 * badreset)) - 1), PANDA_NT_IS_N((sequence)[(iterator).posn]suffix)) { (iterator).bad = badreset; } else if ((iterator).bad > 0) { (iterator).bad--; } else 42 | # define FOREACH_KMER(iterator,sequence,suffix) _FOREACH_KMER(iterator, sequence, suffix, 0, KMER_LEN, < (ptrdiff_t)sequence ## _length, ++, KMER_LEN) 43 | # define FOREACH_KMER_REVERSE(iterator,sequence,suffix) _FOREACH_KMER(iterator, sequence, suffix, sequence ## _length - 1, KMER_LEN, >= 0, --, KMER_LEN) 44 | # define KMER(kmerit) ((kmerit).kmer) 45 | # define KMER_POSITION(kmerit) ((kmerit).posn) 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /plugin_pear_test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | struct data { 9 | double alpha; 10 | double beta; 11 | double cutoff; 12 | }; 13 | 14 | HELP("Use the statistical test from PEAR (Zhang 2013)", "pear_test:alpha=1.0,beta=-1.0,cutoff=0.01"); 15 | 16 | VER_INFO("1.0"); 17 | 18 | static bool check_func( 19 | PandaLogProxy logger, 20 | const panda_result_seq *sequence, 21 | void *user_data) { 22 | 23 | struct data *data = (struct data *) user_data; 24 | double product = 1; 25 | size_t i; 26 | double oes = data->alpha * (sequence->overlap - sequence->overlap_mismatches) + data->beta * sequence->overlap_mismatches; 27 | 28 | (void) logger; 29 | 30 | for (i = sequence->overlap; i < sequence->forward_length && i < sequence->reverse_length; i++) { 31 | double sum = 0; 32 | size_t k; 33 | size_t l_i = ceil((oes - data->beta * i) / (data->alpha - data->beta)) - 1; 34 | for (k = 0; k < l_i; k++) { 35 | double i_choose_k = lgamma(i + 1) - lgamma(k + 1) - lgamma(i - k + 1); 36 | sum += exp(i_choose_k + k * log(0.25) + (i - k) * log(0.75)); 37 | } 38 | product *= sum; 39 | } 40 | return data->cutoff > 1 - product * product; 41 | } 42 | 43 | struct { 44 | const char 
*name; 45 | size_t holder; 46 | } const token[] = { 47 | {.name = "alpha",.holder = offsetof(struct data, alpha)}, 48 | {.name = "beta",.holder = offsetof(struct data, beta)}, 49 | {.name = "cutoff",.holder = offsetof(struct data, cutoff)}, 50 | {NULL, 0} 51 | }; 52 | 53 | bool parse_argument( 54 | PandaLogProxy logger, 55 | const char *value, 56 | const char *arg_name, 57 | double *output) { 58 | char *remainder = NULL; 59 | 60 | (void) logger; 61 | 62 | errno = 0; 63 | *output = strtod(value, &remainder); 64 | if (errno != 0) { 65 | panda_log_proxy_perror(logger, arg_name); 66 | return false; 67 | } else if (*remainder != '\0') { 68 | panda_log_proxy_write_f(logger, "%s: trailing garbage: %s\n", arg_name, remainder); 69 | return false; 70 | } 71 | return true; 72 | } 73 | 74 | static bool key_processor( 75 | const char *key, 76 | const char *value, 77 | void *data) { 78 | size_t it; 79 | for (it = 0; token[it].name != NULL; it++) { 80 | if (strcmp(key, token[it].name) == 0) { 81 | return parse_argument((PandaLogProxy) data, value, token[it].name, (double *) ((char *) data + token[it].holder)); 82 | } 83 | } 84 | panda_log_proxy_write_f((PandaLogProxy) data, "Unknown setting: /%s/\n", key); 85 | return false; 86 | } 87 | 88 | OPEN { 89 | struct data data; 90 | 91 | (void) precheck; 92 | data.alpha = 1; 93 | data.beta = -1; 94 | data.cutoff = 0.01; 95 | 96 | if (!panda_parse_key_values(args, key_processor, logger)) 97 | return false; 98 | if (data.cutoff < 0 || data.cutoff > 1) { 99 | panda_log_proxy_write_f(logger, "Value %f out of range for p-value cut-off.", data.cutoff); 100 | return false; 101 | } 102 | *check = check_func; 103 | *user_data = PANDA_STRUCT_DUP(&data); 104 | *destroy = free; 105 | return true; 106 | } 107 | -------------------------------------------------------------------------------- /plugin_sample.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | /* Adding a validation plugin: 4 | * 5 | 
* To create a validation plugin, copy this file. 6 | * In your module, you can provide 3 functions, as shown below. 7 | * 8 | * When you are ready to compile, use ``pandaxs yourmodule.c'' to compile. 9 | * You can also link against other libraries 10 | * (e.g., ``pandaxs yourmodule.c -lglib -I/usr/include/glib-2.0'') 11 | * If you do this as root, the module will be installed into /usr/lib/pandaseq 12 | * or a similar location. 13 | */ 14 | 15 | /* 16 | * Provide a description and usage information 17 | */ 18 | 19 | HELP("This is a sample module that does nothing", "sample:args"); 20 | 21 | /* 22 | * Provide version information 23 | */ 24 | VER_INFO("1.0"); 25 | 26 | /* 27 | * If data is needed, store it in a structure. 28 | */ 29 | 30 | struct data { 31 | int some_value; 32 | }; 33 | 34 | /* Given a sequence, determine if the sequence is valid. 35 | * @logger: View pandaseq-log.h for more information about PandaLogProxy. 36 | * @sequence: View pandaseq-common.h for more information about panda_result_seq. 37 | * Returns: true if the sequence should be kept. 38 | * At least one of this function or precheck is required. 39 | */ 40 | static bool check_func( 41 | PandaLogProxy logger, 42 | const panda_result_seq *sequence, 43 | struct data *data) { 44 | panda_log_proxy_write_str(logger, "INFO\tSAMPLE\tCHECK\n"); 45 | return true; 46 | } 47 | 48 | /* Given the forward and reverse reads, determine if the sequence is worth assembling. 49 | * @logger: View pandaseq-log.h for more information about PandaLogProxy. 50 | * @id: View pandaseq-common.h for more information about panda_seqid. 51 | * @forward: (array length=forward_length): The forward read, of type panda_qual. 52 | * @reverse: (array length=reverse_length): The reverse read, of type panda_qual. 53 | * Returns: true if the reads should be assembled. 54 | * At least one of this function or check is required. 
55 | */ 56 | static bool precheck_func( 57 | PandaLogProxy logger, 58 | const panda_seq_identifier *id, 59 | const panda_qual *forward, 60 | size_t forward_length, 61 | const panda_qual *reverse, 62 | size_t reverse_length, 63 | struct data *data) { 64 | panda_log_proxy_write_f(logger, "INFO\tSAMPLE\tPRECHECK\n"); 65 | return true; 66 | } 67 | 68 | /* Called once upon completion to perform any needed cleanup. 69 | * This function is optional. 70 | */ 71 | static void destroy_func( 72 | struct data *data) { 73 | free(data); 74 | } 75 | 76 | /* Called once to initialise the module upon loading. Arguments can be provided 77 | * to the module upon loading. 78 | * (e.g., "-C /usr/lib/pandaseq/mynewmodule.so:foo=bar", then args = "foo=bar") 79 | * @logger: View pandaseq-log.h for more information about PandaLogProxy. 80 | * @args: the argument string provided. 81 | * Returns: false if there is a failure to initialise. 82 | */ 83 | OPEN { 84 | struct data data; 85 | panda_log_proxy_write_f(logger, "INFO\tSAMPLE\tINIT\t%s\n", args); 86 | 87 | *check = (PandaCheck) check_func; 88 | *precheck = (PandaPreCheck) precheck_func; 89 | *user_data = PANDA_STRUCT_DUP(&data); 90 | *destroy = (PandaDestroy) destroy_func; 91 | return true; 92 | } 93 | -------------------------------------------------------------------------------- /main-parse.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 
8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | #define _POSIX_C_SOURCE 2 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include "config.h" 26 | #include "pandaseq.h" 27 | 
/*
 * Report whether each command line argument is a sequence header that
 * PANDAseq can parse.
 *
 * For every argument (a leading '@' is skipped) the header is parsed with
 * panda_seqid_parse_fail. On failure the argument is echoed with a caret
 * under the position derived from the parser's end pointer, followed by BAD.
 * On success the identifier is printed followed by GOOD, the direction and
 * whether a tag is present. The individual fields are printed in both cases.
 *
 * Options: -h prints usage, -v prints the version. Both, and a missing
 * argument, exit with status 1; otherwise the exit status is 0.
 */
int main(
	int argc,
	char **argv) {
	int c;
	bool help = false;
	panda_seq_identifier id;
	int it;
	const char *optlist = "hv";
	bool version = false;

	while ((c = getopt(argc, argv, optlist)) != -1) {
		switch (c) {
		case 'h':
			help = true;
			break;
		case 'v':
			version = true;
			break;
		case '?':
			if (strchr(optlist, optopt) != NULL) {
				fprintf(stderr, "Option -%c requires an argument.\n", optopt);
			} else if (isprint((unsigned char) optopt)) {
				/* <ctype.h> functions require a value representable as
				 * unsigned char (or EOF). */
				fprintf(stderr, "Unknown option `-%c'.\n", optopt);
			} else {
				fprintf(stderr, "Unknown option character `\\x%x'.\n", (unsigned int) optopt);
			}
			return 1;
		default:
			abort();
		}
	}

	if (version) {
		fprintf(stderr, "%s <%s>\n", PACKAGE_STRING, PACKAGE_BUGREPORT);
		return 1;
	}
	if (argc < 2 || help) {
		fprintf(stderr, "%s <%s>\nUsage: %s \"seq header\" ...\nCheck if the sequence header is recognised by PANDAseq.\n", PACKAGE_STRING, PACKAGE_BUGREPORT, argv[0]);
		return 1;
	}

	for (it = 1; it < argc; it++) {
		const char *endptr;
		int dir;
		PandaIdFmt detected_format;
		panda_seqid_clear(&id);
		dir = panda_seqid_parse_fail(&id, argv[it] + (argv[it][0] == '@' ? 1 : 0), PANDA_TAG_OPTIONAL, &detected_format, &endptr);
		if (dir == 0) {
			int count;
			printf("%s\n", argv[it]);
			/* Pad with spaces so the caret lands under the header text. */
			for (count = endptr - argv[it] - 2; count > 0; count--) {
				putchar(' ');
			}
			printf("^\n\tBAD\n");
		} else {
			panda_seqid_print(&id, stdout);
			printf("\n\tGOOD\n\tdirection = %s\n\thastag = %s\n", panda_idfmt_has_direction(detected_format) ? (dir == 1 ? "forward" : "reverse") : "unknown", id.tag[0] == '\0' ? "no" : "yes");
		}
		printf("\tinstrument = \"%s\"\n\trun = \"%s\"\n\tflowcell = \"%s\"\n\tlane = %d\n\ttile = %d\n\tx = %d\n\ty = %d\n\ttag = \"%s\"\n\tgenerator = %s\n", id.instrument, id.run, id.flowcell, id.lane, id.tile, id.x, id.y, id.tag, panda_idfmt_str(detected_format));
	}
	return 0;
}
90 | -------------------------------------------------------------------------------- /args_array.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2013 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | 17 | */ 18 | #include "config.h" 19 | #include 20 | #include "pandaseq.h" 21 | 22 | int panda_tweak_general_compare( 23 | const panda_tweak_general *a, 24 | const panda_tweak_general *b) { 25 | return a->flag - b->flag; 26 | } 27 | 28 | int panda_tweak_assembler_compare( 29 | const panda_tweak_assembler *a, 30 | const panda_tweak_assembler *b) { 31 | return a->flag - b->flag; 32 | } 33 | 34 | int panda_tweak_general_compare_p( 35 | const panda_tweak_general *const *a, 36 | const panda_tweak_general *const *b) { 37 | return panda_tweak_general_compare(*a, *b); 38 | } 39 | 40 | int panda_tweak_assembler_compare_p( 41 | const panda_tweak_assembler *const *a, 42 | const panda_tweak_assembler *const *b) { 43 | return panda_tweak_assembler_compare(*a, *b); 44 | } 45 | 46 | void panda_tweak_general_sort( 47 | const panda_tweak_general **array, 48 | size_t length) { 49 | qsort(array, length, sizeof(panda_tweak_general *), (int (*)(const void *, const void *)) panda_tweak_general_compare_p); 50 | } 51 | 52 | void panda_tweak_assembler_sort( 53 | const panda_tweak_assembler **array, 54 | size_t length) { 55 | qsort(array, length, sizeof(panda_tweak_assembler *), (int (*)(const void *, const void *)) panda_tweak_assembler_compare_p); 56 | } 57 | 
/*
 * Grow a heap-allocated array of pointers and copy new entries onto its end.
 * @array: (inout): the array to extend; may point to NULL, in which case
 * *length is reset to zero and a new array is allocated.
 * @length: (inout): the number of entries in *array; updated on success.
 * @additions: (array length=additions_length): the entries to append.
 *
 * If the array cannot be grown (allocation failure, or the new size does not
 * fit in size_t), *array and *length are left exactly as they were, so the
 * caller still owns a valid array holding the previous entries.
 */
static void append_array(
	const void ***array,
	size_t *length,
	const void **additions,
	size_t additions_length) {
	const void **grown;
	size_t it;

	if (*array == NULL)
		*length = 0;

	/* Nothing to add; this also keeps a zero-byte request away from
	 * realloc(), whose result for size 0 is implementation-defined. */
	if (additions_length == 0)
		return;

	/* Refuse requests whose byte count would wrap around. */
	if (additions_length > ((size_t) -1) / sizeof(void *) - *length)
		return;

	/* Keep the old pointer until the new block is known to exist: assigning
	 * realloc()'s result straight to *array would leak the old block and
	 * leave NULL behind on failure. */
	grown = realloc(*array, (*length + additions_length) * sizeof(void *));
	if (grown == NULL)
		return;
	*array = grown;

	for (it = 0; it < additions_length; it++) {
		grown[*length + it] = additions[it];
	}
	*length += additions_length;
}
75 | 76 | void panda_tweak_general_append( 77 | const panda_tweak_general ***array, 78 | size_t *length, 79 | const panda_tweak_general *const *additions, 80 | size_t additions_length) { 81 | append_array((const void ***) array, length, (const void **) additions, additions_length); 82 | 
panda_tweak_general_sort(*array, *length); 83 | } 84 | 85 | void panda_tweak_assembler_append( 86 | const panda_tweak_assembler ***array, 87 | size_t *length, 88 | const panda_tweak_assembler *const *additions, 89 | size_t additions_length) { 90 | append_array((const void ***) array, length, (const void **) additions, additions_length); 91 | panda_tweak_assembler_sort(*array, *length); 92 | } 93 | -------------------------------------------------------------------------------- /algo_flash.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2013 Andre Masella 3 | 4 | Based on work by Tanja Magoc and Eric Biggers: 5 | http://ccb.jhu.edu/software/FLASH/ 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with this program. If not, see . 
19 | 20 | */ 21 | #include "config.h" 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include "pandaseq.h" 27 | #include "prob.h" 28 | #include "table.h" 29 | 
/*
 * Score a candidate overlap by its mismatch fraction, in the style of FLASH.
 *
 * A position counts as a mismatch when either base is N or when the two
 * bases are incompatible (f & r == 0). Positions whose index falls outside
 * either read are ignored.
 *
 * @data: unused.
 * @forward: (array length=forward_length): the forward read.
 * @reverse: (array length=reverse_length): the reverse read.
 * @overlap: the number of positions to compare.
 * Returns: log(mismatches / compared positions), which is log(0) when there
 * are no mismatches, or -2 when no position could be compared.
 */
static double overlap_probability(
	void *data,
	const panda_qual *forward,
	size_t forward_length,
	const panda_qual *reverse,
	size_t reverse_length,
	size_t overlap) {
	size_t mismatches = 0;
	size_t real_overlap = 0;
	size_t i;

	(void) data;

	for (i = 0; i < overlap; i++) {
		int findex = forward_length + i - overlap;
		int rindex = reverse_length - i - 1;
		if (findex < 0 || rindex < 0 || (size_t) findex >= forward_length || (size_t) rindex >= reverse_length)
			continue;
		panda_nt f = forward[findex].nt;
		panda_nt r = reverse[rindex].nt;
		if (PANDA_NT_IS_N(f) || PANDA_NT_IS_N(r) || (f & r) == 0) {
			mismatches++;
		}
		real_overlap++;
	}

	if (real_overlap == 0)
		return -2;

	/* Divide in floating point: both counters are integers, so an integer
	 * division could only ever produce 0 or 1 here. */
	return log((double) mismatches / (double) real_overlap);
}
61 | 62 | static double match_probability( 63 | void *data, 64 | bool match, 65 | char a, 66 | char b) { 67 | int score; 68 | (void) data; 69 | if (match) { 70 | score = (a > b) ? 
PHREDCLAMP(a) : PHREDCLAMP(b); 71 | } else { 72 | score = PHREDCLAMP(a) - PHREDCLAMP(b); 73 | if (score < 0) 74 | score = -score; 75 | if (score < 2) { 76 | score = 2; 77 | } 78 | } 79 | return qual_score[score]; 80 | } 81 | 82 | static PandaAlgorithm from_string( 83 | const char *argument) { 84 | if (argument != NULL) { 85 | fprintf(stderr, "No arguments allowed: %s\n", argument); 86 | return NULL; 87 | } 88 | return panda_algorithm_flash_new(); 89 | } 90 | 91 | const struct panda_algorithm_class panda_algorithm_flash_class = { 92 | .data_size = 0, 93 | .name = "flash", 94 | .create = from_string, 95 | .data_destroy = NULL, 96 | .overlap_probability = (PandaComputeOverlap) overlap_probability, 97 | .match_probability = (PandaComputeMatch) match_probability, 98 | .prob_unpaired = qual_nn_simple_bayesian, 99 | }; 100 | 101 | PandaAlgorithm panda_algorithm_flash_new( 102 | void) { 103 | return panda_algorithm_new(&panda_algorithm_flash_class); 104 | } 105 | -------------------------------------------------------------------------------- /pandaseq-iter.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | 17 | */ 18 | 19 | #ifndef _PANDASEQ_ITER_H 20 | # define _PANDASEQ_ITER_H 21 | # ifdef __cplusplus 22 | # define EXTERN_C_BEGIN extern "C" { 23 | # define EXTERN_C_END } 24 | # else 25 | # define EXTERN_C_BEGIN 26 | # define EXTERN_C_END 27 | # endif 28 | # include 29 | EXTERN_C_BEGIN 30 | /* === Constructors === */ 31 | /** 32 | * Create an iterator over a sequence of nucleotides. 33 | * @seq: (array length=seq_length) (scope container): the sequence to iterate over, and its length. This sequence must not be freed during the life of the iterator. 34 | * @reverse: true to iterate from the end of the sequence rather than the beginning 35 | * @k: the length of the output words. This must range between 1 and 4 * sizeof(size_t). Any other values will be converted to the standard k-mer length of 8. 36 | */ 37 | PandaIter panda_iterate_nt( 38 | panda_nt *seq, 39 | size_t seq_length, 40 | bool reverse, 41 | int k); 42 | 43 | /** 44 | * Iterate over quality-annotated sequence. 45 | * @see panda_iterate_nt 46 | */ 47 | PandaIter panda_iterate_qual( 48 | panda_qual *seq, 49 | size_t seq_length, 50 | bool reverse, 51 | int k); 52 | 53 | /** 54 | * Iterate over probability-annotated sequence. 55 | * @see panda_iterate_nt 56 | */ 57 | PandaIter panda_iterate_result( 58 | panda_result *seq, 59 | size_t seq_length, 60 | bool reverse, 61 | int k); 62 | 63 | /* === Methods === */ 64 | 65 | /** 66 | * Copy an iterator to a new one, preserving its current state. 67 | */ 68 | PandaIter panda_iter_dup( 69 | PandaIter iter); 70 | 71 | /** 72 | * Destroy an iterator. 73 | */ 74 | void panda_iter_free( 75 | PandaIter iter); 76 | 77 | /** 78 | * Advance to the next position in the sequence. 79 | * Returns: (allow-none) (transfer none): if null, there are no more k-mers in the sequence 80 | */ 81 | const panda_kmer *panda_iter_next( 82 | PandaIter iter); 83 | 84 | /** 85 | * Set an iterator back to the beginning of the sequence. 
86 | */ 87 | void panda_iter_reset( 88 | PandaIter iter); 89 | 90 | /* === Getters and Setters === */ 91 | 92 | /** 93 | * Get the number of useful bits in the output. 94 | * 95 | * This is the maximum value of panda_kmer.kmer for this iterator. 96 | */ 97 | size_t panda_iter_bits( 98 | PandaIter iter); 99 | 100 | /** 101 | * Get the k-mer length for the iterator. 102 | */ 103 | int panda_iter_k( 104 | PandaIter iter); 105 | 106 | EXTERN_C_END 107 | #endif 108 | -------------------------------------------------------------------------------- /testing/reg-test.vala: -------------------------------------------------------------------------------- 1 | /* PANDAseq-Diff -- Check differences between two versions of PANDAseq. 2 | Copyright (C) 2013 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | string? forward_file = null; 19 | string? 
reverse_file = null; 20 | bool web = false; 21 | bool suppress_quality_diffs = false; 22 | const string URL = "http://neufeldserver.uwaterloo.ca/~apmasell/mcbath-small_%d.fastq.bz2"; 23 | 24 | const OptionEntry[] options = { 25 | { "forward", 'f', 0, OptionArg.FILENAME, ref forward_file, "Forward read FASTQ file.", "forward.fastq.bz2" }, 26 | { "reverse", 'r', 0, OptionArg.FILENAME, ref reverse_file, "Reverse read FASTQ file.", "reverse.fastq.bz2" }, 27 | { "suppress-quality", 'Q', 0, OptionArg.NONE, ref suppress_quality_diffs, "Ignore differences in quality scores of output bases.", null }, 28 | { "web", 'W', 0, OptionArg.NONE, ref web, "Get files from the web.", null }, 29 | { null } 30 | }; 31 | 32 | public extern Panda.Assemble? create_assembler_control (Panda.LogProxy logger); 33 | public extern Panda.Assemble? create_assembler_experiment (Panda.LogProxy logger); 34 | 35 | public int main (string[] args) { 36 | try { 37 | var opt_context = new OptionContext ("- PANDAseq Diff"); 38 | opt_context.set_help_enabled (true); 39 | opt_context.add_main_entries (options, null); 40 | if (!opt_context.parse (ref args)) { 41 | stdout.printf ("Problem parsing arguments.\n"); 42 | return 1; 43 | } 44 | } catch (OptionError e) { 45 | stdout.printf ("%s\n", e.message); 46 | stdout.printf ("Run '%s --help' to see a full list of available command line options.\n", args[0]); 47 | return 1; 48 | } 49 | var logger = new Panda.LogProxy (new Panda.Writer.null ()); 50 | Panda.NextSeq reader; 51 | if (web) { 52 | var forward = Panda.bz_decompress (Panda.open_url (URL.printf (1), logger)); 53 | var reverse = Panda.bz_decompress (Panda.open_url (URL.printf (2), logger)); 54 | if (forward == null || reverse == null) { 55 | return 1; 56 | } 57 | reader = Panda.create_fastq_reader ((owned) forward, (owned) reverse, logger); 58 | } else { 59 | if (forward_file == null) { 60 | stdout.printf ("You must supply a forward read file.\n"); 61 | return 1; 62 | } 63 | if (reverse_file == null) { 64 
| stdout.printf ("You must supply a reverse read file.\n"); 65 | return 1; 66 | } 67 | 68 | reader = Panda.open_fastq (forward_file, reverse_file, logger); 69 | } 70 | if (reader == null) { 71 | stdout.printf ("Could not open input sequences.\n"); 72 | return 1; 73 | } 74 | 75 | var control = create_assembler_control (logger); 76 | var experimental = create_assembler_experiment (logger); 77 | if (control == null || experimental == null) { 78 | return 1; 79 | } 80 | 81 | return Panda.diff (reader, control, experimental, suppress_quality_diffs) ? 2 : 0; 82 | } 83 | -------------------------------------------------------------------------------- /algo_rdp_mle.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2013 Andre Masella 3 | Copyright (C) 2013 Qiong Wang 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 17 | 18 | */ 19 | #include "config.h" 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "pandaseq.h" 25 | #include "algo.h" 26 | #include "prob.h" 27 | #include "table.h" 28 | 29 | static double match_probability( 30 | void *data, 31 | bool match, 32 | char a, 33 | char b) { 34 | (void) data; 35 | if (match) { 36 | char max = (a >= b) ? 
a : b; 37 | return qual_score[PHREDCLAMP(max)]; 38 | } else { 39 | return qual_mismatch_assembled_rdp_mle[PHREDCLAMP(a)][PHREDCLAMP(b)]; 40 | } 41 | } 42 | 
/*
 * Score a candidate overlap for the RDP maximum-likelihood method.
 *
 * For every in-range position the clamped PHRED scores of the two bases
 * select an entry from qual_match_simple_bayesian (compatible bases) or
 * qual_mismatch_rdp_mle (incompatible bases); the entry, less
 * qual_nn_simple_bayesian, is added to the running total.
 *
 * @data: unused.
 * @forward: (array length=forward_length): the forward read.
 * @reverse: (array length=reverse_length): the reverse read.
 * @overlap: the number of positions to compare.
 * Returns: the sum over all compared positions; 0 if none could be compared.
 */
static double overlap_probability(
	void *data,
	const panda_qual *forward,
	size_t forward_length,
	const panda_qual *reverse,
	size_t reverse_length,
	size_t overlap) {

	double probability = 0;
	size_t i;

	(void) data;
	for (i = 0; i < overlap; i++) {
		int findex = forward_length + i - overlap;
		int rindex = reverse_length - i - 1;
		/* An index equal to the length is already one past the end, so the
		 * upper bounds must be exclusive. */
		if (findex < 0 || rindex < 0 || (size_t) findex >= forward_length || (size_t) rindex >= reverse_length)
			continue;
		panda_nt f = forward[findex].nt;
		panda_nt r = reverse[rindex].nt;
		char fqual = PHREDCLAMP(forward[findex].qual);
		char rqual = PHREDCLAMP(reverse[rindex].qual);
		bool ismatch = ((f & r) != 0);

		/* When two bases match, the assumption that the forward and reverse
		 * bases are from independent observations doesn't work with the MiSeq
		 * mock community data we tested. Instead, the higher score of the two
		 * raw base q scores is close to the predicted error rate. */
		if (ismatch) {
			probability += qual_match_simple_bayesian[(int) fqual][(int) rqual] - qual_nn_simple_bayesian;
		} else {
			probability += qual_mismatch_rdp_mle[(int) fqual][(int) rqual] - qual_nn_simple_bayesian;
		}
	}
	return probability;
}
75 | 76 | static PandaAlgorithm from_string( 77 | const char *argument) { 78 | 79 | if (argument == NULL || strlen(argument) == 0) 80 | return panda_algorithm_rdp_mle_new(); 81 | return NULL; 82 | } 83 | 84 | const struct panda_algorithm_class panda_algorithm_rdp_mle_class = { 85 | .data_size = 0, 86 | .name = "rdp_mle", 87 | .create = from_string, 88 | .data_destroy = NULL, 89 | .overlap_probability = (PandaComputeOverlap) overlap_probability, 90 | .match_probability = (PandaComputeMatch) match_probability, 91 | .prob_unpaired = qual_nn_simple_bayesian, 92 | }; 93 | 94 | PandaAlgorithm panda_algorithm_rdp_mle_new( 95 | void) { 96 | PandaAlgorithm algo = panda_algorithm_new(&panda_algorithm_rdp_mle_class); 97 | return algo; 98 | } 99 | -------------------------------------------------------------------------------- /hang.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2013 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 
13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | #include 19 | #include "config.h" 20 | #ifdef HAVE_PTHREAD 21 | # include 22 | #endif 23 | #include "pandaseq.h" 24 | #include "misc.h" 25 | 26 | struct hang_data { 27 | MANAGED_MEMBER( 28 | PandaNextSeq, 29 | next); 30 | PandaLogProxy logger; 31 | panda_nt forward[MAX_LEN]; 32 | size_t forward_length; 33 | panda_nt reverse[MAX_LEN]; 34 | size_t reverse_length; 35 | bool skip; 36 | double threshold; 37 | }; 38 | 39 | bool hang_next( 40 | panda_seq_identifier *id, 41 | const panda_qual **forward, 42 | size_t *forward_length, 43 | const panda_qual **reverse, 44 | size_t *reverse_length, 45 | void *user_data) { 46 | struct hang_data *data = (struct hang_data *) user_data; 47 | while (data->next(id, forward, forward_length, reverse, reverse_length, data->next_data)) { 48 | size_t offset; 49 | if (data->forward_length > 0) { 50 | offset = panda_compute_offset_qual(data->threshold, 0, true, *forward, *forward_length, data->forward, data->forward_length); 51 | if (offset == 0) { 52 | panda_log_proxy_write(data->logger, PANDA_CODE_NO_FORWARD_PRIMER, NULL, id, "OVERHANGING REJECT"); 53 | if (!data->skip) 54 | continue; 55 | } else { 56 | *forward_length -= offset - 1; 57 | } 58 | } 59 | if (data->reverse_length > 0) { 60 | offset = panda_compute_offset_qual(data->threshold, 0, true, *reverse, *reverse_length, data->reverse, data->reverse_length); 61 | if (offset == 0) { 62 | panda_log_proxy_write(data->logger, PANDA_CODE_NO_REVERSE_PRIMER, NULL, id, "OVERHANGING REJECT"); 63 | if (!data->skip) 64 | continue; 65 | } else { 66 | *reverse_length -= offset - 1; 67 | } 68 | } 69 | return true; 70 | } 71 | return false; 72 | } 73 | 74 | void hang_free( 75 | void *user_data) { 76 | struct hang_data *hang_data = (struct hang_data *) user_data; 77 | DESTROY_MEMBER(hang_data, next); 78 | panda_log_proxy_unref(hang_data->logger); 79 | 
free(hang_data); 80 | } 81 | 82 | PandaNextSeq panda_trim_overhangs( 83 | PandaNextSeq inner, 84 | void *inner_data, 85 | PandaDestroy inner_destroy, 86 | PandaLogProxy logger, 87 | panda_nt *forward, 88 | size_t forward_length, 89 | panda_nt *reverse, 90 | size_t reverse_length, 91 | bool skip, 92 | double threshold, 93 | void **next_data, 94 | PandaDestroy *next_destroy) { 95 | struct hang_data *data = malloc(sizeof(struct hang_data)); 96 | size_t it; 97 | 98 | data->next = inner; 99 | data->next_data = inner_data; 100 | data->next_destroy = inner_destroy; 101 | data->skip = skip; 102 | data->threshold = threshold; 103 | for (it = 0; it < forward_length; it++) 104 | data->forward[forward_length - it - 1] = forward[it]; 105 | for (it = 0; it < reverse_length; it++) 106 | data->reverse[reverse_length - it - 1] = reverse[it]; 107 | data->forward_length = forward_length; 108 | data->reverse_length = reverse_length; 109 | data->logger = panda_log_proxy_ref(logger); 110 | 111 | *next_data = data; 112 | *next_destroy = hang_free; 113 | return hang_next; 114 | } 115 | -------------------------------------------------------------------------------- /mktable.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 
/*
 * Probability that the assembled base is correct when the two reads disagree
 * (RDP maximum-likelihood model).
 *
 * p, q: error probabilities of the two observed bases.
 * data: unused.
 *
 * Computes 1 - (min(p,q) - p*q/3) / (p + q - 4*p*q/3). The result is clamped
 * to DBL_MIN whenever it is not strictly positive. That covers an exact zero
 * (p == q == 1), a tiny negative produced by rounding, and the NaN from 0/0
 * when p == q == 0.
 */
double mismatch_rdp_assembled(
	double p,
	double q,
	void *data) {
	double lower = (p <= q) ? p : q;
	double product = p * q;
	double value;

	(void) data;

	value = 1 - (lower - product / 3.0) / (p + q - 4.0 / 3.0 * product);
	/* `value > 0` is false for NaN as well as for zero and negatives. */
	return (value > 0) ? value : DBL_MIN;
}
/*
 * Probability that the chosen base is correct when the two reads disagree
 * (UPARSE model).
 *
 * p, q: error probabilities of the two observed bases.
 * data: unused.
 *
 * With a uniform prior over the four bases, keeping the base with the lower
 * error probability x (the other being y) has posterior error
 *     x * (1 - y/3) / (x + y - 4*x*y/3)
 * and the value returned is one minus that. The previous numerator,
 * (p + q/3), exceeds the denominator for large p and q, which produced the
 * negative values noted in the old comment.
 *
 * The result is still clamped to DBL_MIN when it is not strictly positive
 * (p == q == 1, or NaN from 0/0 when p == q == 0).
 */
static double mismatch_uparse(
	double p,
	double q,
	void *data) {
	double kept = (p <= q) ? p : q;
	double dropped = (p <= q) ? q : p;
	double value;

	(void) data;

	value = 1 - kept * (1 - dropped / 3) / (kept + dropped - 4 * kept * dropped / 3);
	/* `value > 0` is false for NaN as well as for zero and negatives. */
	return (value > 0) ? value : DBL_MIN;
}
2 | Copyright (C) 2011-2013 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | #include "config.h" 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "pandaseq.h" 24 | #include "algo.h" 25 | #include "prob.h" 26 | #include "table.h" 27 | 28 | struct pear { 29 | double random_base; 30 | }; 31 | 32 | static double overlap_probability( 33 | struct pear *data, 34 | const panda_qual *forward, 35 | size_t forward_length, 36 | const panda_qual *reverse, 37 | size_t reverse_length, 38 | size_t overlap) { 39 | double probability = 0; 40 | size_t i; 41 | 42 | for (i = 0; i < overlap; i++) { 43 | int findex = forward_length + i - overlap; 44 | int rindex = reverse_length - i - 1; 45 | if (findex < 0 || rindex < 0 || (size_t) findex >= forward_length || (size_t) rindex >= reverse_length) 46 | continue; 47 | panda_nt f = forward[findex].nt; 48 | panda_nt r = reverse[rindex].nt; 49 | if (PANDA_NT_IS_N(f) || PANDA_NT_IS_N(r)) { 50 | probability -= data->random_base; 51 | } else if ((f & r) != 0) { 52 | probability += qual_match_pear[PHREDCLAMP(forward[findex].qual)][PHREDCLAMP(forward[rindex].qual)]; 53 | } else { 54 | probability += qual_mismatch_pear[PHREDCLAMP(forward[findex].qual)][PHREDCLAMP(forward[rindex].qual)]; 55 | } 56 | } 57 | 58 | return probability; 59 | } 60 | 61 | static double match_probability( 62 | struct pear *data, 63 | bool match, 64 
| char a, 65 | char b) { 66 | (void) data; 67 | return (match ? qual_match_pear : qual_mismatch_pear)[PHREDCLAMP(a)][PHREDCLAMP(b)]; 68 | } 69 | 70 | static PandaAlgorithm from_string( 71 | const char *argument) { 72 | PandaAlgorithm algo; 73 | double random_base; 74 | char *end; 75 | 76 | if (argument == NULL) 77 | return panda_algorithm_pear_new(); 78 | errno = 0; 79 | random_base = strtod(argument, &end); 80 | if (errno == ERANGE || *end != '\0') { 81 | fprintf(stderr, "Cannot parse value: %s\n", argument); 82 | return NULL; 83 | } 84 | if (random_base < 0 || random_base > 1) { 85 | fprintf(stderr, "Random base %f is not a probability.\n", random_base); 86 | return NULL; 87 | } 88 | algo = panda_algorithm_pear_new(); 89 | panda_algorithm_pear_set_random_base_log_p(algo, log(random_base)); 90 | return algo; 91 | } 92 | 93 | const struct panda_algorithm_class panda_algorithm_pear_class = { 94 | .data_size = sizeof(struct pear), 95 | .name = "pear", 96 | .create = from_string, 97 | .data_destroy = NULL, 98 | .overlap_probability = (PandaComputeOverlap) overlap_probability, 99 | .match_probability = (PandaComputeMatch) match_probability, 100 | .prob_unpaired = qual_nn_simple_bayesian, 101 | }; 102 | 103 | PandaAlgorithm panda_algorithm_pear_new( 104 | void) { 105 | PandaAlgorithm algo = panda_algorithm_new(&panda_algorithm_pear_class); 106 | panda_algorithm_pear_set_random_base_log_p(algo, log(0.25)); 107 | return algo; 108 | } 109 | 110 | void panda_algorithm_pear_set_random_base_log_p( 111 | PandaAlgorithm algorithm, 112 | double log_p) { 113 | if (panda_algorithm_is_a(algorithm, &panda_algorithm_pear_class)) { 114 | ((struct pear *) panda_algorithm_data(algorithm))->random_base = log_p; 115 | } 116 | } 117 | 118 | double panda_algorithm_pear_get_random_base_log_p( 119 | PandaAlgorithm algorithm) { 120 | if (panda_algorithm_is_a(algorithm, &panda_algorithm_pear_class)) { 121 | return ((struct pear *) panda_algorithm_data(algorithm))->random_base; 122 | } else { 
123 | return 1; 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /idset.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | #include "config.h" 19 | #include "pandaseq.h" 20 | #include 21 | #ifdef HAVE_PTHREAD 22 | # include 23 | 24 | /* All idsets share a single mutex to control reference counts */ 25 | static pthread_mutex_t ref_lock = PTHREAD_MUTEX_INITIALIZER; 26 | #endif 27 | 28 | typedef struct node *node_p; 29 | struct node { 30 | panda_seq_identifier id; 31 | node_p left; 32 | node_p right; 33 | }; 34 | 35 | struct panda_idset { 36 | volatile size_t refcnt; 37 | node_p root; 38 | 39 | }; 40 | 41 | PandaSet panda_idset_new( 42 | void) { 43 | PandaSet set = malloc(sizeof(struct panda_idset)); 44 | if (set == NULL) 45 | return NULL; 46 | set->refcnt = 1; 47 | set->root = NULL; 48 | 49 | return set; 50 | } 51 | 52 | PandaSet panda_idset_ref( 53 | PandaSet set) { 54 | #ifdef HAVE_PTHREAD 55 | pthread_mutex_lock(&ref_lock); 56 | #endif 57 | set->refcnt++; 58 | #ifdef HAVE_PTHREAD 59 | pthread_mutex_unlock(&ref_lock); 60 | #endif 61 | return set; 62 | } 63 | 64 | static void node_free( 65 | 
node_p node) { 66 | if (node != NULL) { 67 | node_free(node->left); 68 | node_free(node->right); 69 | free(node); 70 | } 71 | } 72 | 73 | void panda_idset_unref( 74 | PandaSet set) { 75 | size_t count; 76 | if (set == NULL) 77 | return; 78 | #ifdef HAVE_PTHREAD 79 | pthread_mutex_lock(&ref_lock); 80 | #endif 81 | count = --(set->refcnt); 82 | #ifdef HAVE_PTHREAD 83 | pthread_mutex_unlock(&ref_lock); 84 | #endif 85 | if (count == 0) { 86 | node_free(set->root); 87 | free(set); 88 | } 89 | } 90 | 91 | static void insert( 92 | node_p * node, 93 | const panda_seq_identifier *id) { 94 | if (*node == NULL) { 95 | (*node) = malloc(sizeof(struct node)); 96 | if (*node == NULL) 97 | return; 98 | (*node)->left = NULL; 99 | (*node)->right = NULL; 100 | panda_seqid_clear(&(*node)->id); 101 | (*node)->id = *id; 102 | } else { 103 | int comparison = panda_seqid_compare(id, &(*node)->id); 104 | if (comparison < 0) { 105 | insert(&(*node)->left, id); 106 | if ((*node)->left->id.x < (*node)->id.x) { 107 | node_p temp = (*node)->left; 108 | (*node)->left = temp->right; 109 | temp->right = *node; 110 | *node = temp; 111 | } 112 | } else if (comparison > 0) { 113 | insert(&(*node)->right, id); 114 | if ((*node)->right->id.x < (*node)->id.x) { 115 | node_p temp = (*node)->right; 116 | (*node)->right = temp->left; 117 | temp->left = *node; 118 | *node = temp; 119 | } 120 | } 121 | } 122 | } 123 | 124 | void panda_idset_add( 125 | PandaSet set, 126 | const panda_seq_identifier *id) { 127 | insert(&set->root, id); 128 | } 129 | 130 | bool panda_idset_add_str( 131 | PandaSet set, 132 | const char *id, 133 | PandaTagging policy, 134 | PandaIdFmt *detected_format, 135 | const char **end_ptr) { 136 | panda_seq_identifier seq_id; 137 | if (panda_seqid_parse_fail(&seq_id, id, policy, detected_format, end_ptr) == 0) { 138 | return false; 139 | } else { 140 | panda_idset_add(set, &seq_id); 141 | return true; 142 | } 143 | } 144 | 145 | bool panda_idset_contains( 146 | PandaSet set, 147 | const 
panda_seq_identifier *id) { 148 | node_p curr = set->root; 149 | while (curr != NULL) { 150 | int comp = panda_seqid_compare(id, &curr->id); 151 | if (comp == 0) 152 | return true; 153 | if (comp < 0) 154 | curr = curr->left; 155 | else 156 | curr = curr->right; 157 | } 158 | return false; 159 | } 160 | -------------------------------------------------------------------------------- /pandaseq-writer.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | 19 | #ifndef _PANDASEQ_WRITER_H 20 | # define _PANDASEQ_WRITER_H 21 | # ifdef __cplusplus 22 | # define EXTERN_C_BEGIN extern "C" { 23 | # define EXTERN_C_END } 24 | # else 25 | # define EXTERN_C_BEGIN 26 | # define EXTERN_C_END 27 | # endif 28 | # include 29 | # include 30 | # include 31 | EXTERN_C_BEGIN 32 | /* === Constructor === */ 33 | /** 34 | * Create a new writer, backed by some target. 35 | */ 36 | PandaWriter panda_writer_new( 37 | PandaBufferWrite write, 38 | void *write_data, 39 | PandaDestroy write_destroy); 40 | /** 41 | * Create a new writer, backed by an open file. 42 | * @file: (transfer full): the open file. 
/**
 * Write a printf-like formatted string to the output, taking the arguments
 * as a va_list.
 *
 * This is the va_list counterpart of panda_writer_append.
 * @format: the printf-style format string.
 * @va: the arguments consumed by the format string.
 */
void panda_writer_append_v(
	PandaWriter writer,
	const char *format,
	va_list va);
/**
 * Decrease the reference count on a writer.
 *
 * This is thread-safe.
 * @writer: (transfer full): the writer to be released.
 */
void panda_writer_unref(
	PandaWriter writer);
Make a copy of this file with the name of your algorithm. 25 | * 2. Edit `Makefile.am` and include the new file in the `libpandaseq_la_SOURCES` list. 26 | * 3. Edit `algo.c` and include your algorithm in the `panda_algorithm_register` list. 27 | * 4. Edit `pandaseq-algorithm.h` and add a new section for your algorithm and the weird 3-line definition stanza. 28 | * 5. Fill in this file, renaming "example" to the name of your algorithm. 29 | * 6. Compile and test. 30 | * 7. Edit `pandaseq.1` and include documentation about the parameters. 31 | * 8. (Optional) create a Vala class in `vapi.in`. 32 | * 33 | * Have a look at the existing algorithms to get an idea of how to write these. 34 | */ 35 | 36 | struct example { 37 | /* Create all the parameters your algorithm needs. */ 38 | }; 39 | 40 | static double overlap_probability( 41 | struct example *data, 42 | const panda_qual *forward, 43 | size_t forward_length, 44 | const panda_qual *reverse, 45 | size_t reverse_length, 46 | size_t overlap) { 47 | 48 | /* Compute the probability of this overlap being correct, as a log probability and return the value. The overlap region may be longer than either read, so be sure to handle those cases. */ 49 | } 50 | 51 | static double match_probability( 52 | struct example *data, 53 | bool match, 54 | char a, 55 | char b) { 56 | /* Compute the log probability that two bases, of scores `a` and `b` are either matched or mismatched based on `match`. If a calculation can be transformed into a lookup table, it can be precomputed in `mktable.c`. */ 57 | } 58 | 59 | static PandaAlgorithm from_string( 60 | const char *argument) { 61 | PandaAlgorithm algo; 62 | 63 | /* Parse the possibly null command line argument and return a new algorithm. */ 64 | } 65 | 66 | /* This is the class definition. Just give it a name. 
*/ 67 | const struct panda_algorithm_class panda_algorithm_simple_bayes_class = { 68 | .data_size = sizeof(struct example), 69 | .name = "example", 70 | .create = from_string, 71 | .data_destroy = NULL, 72 | .overlap_probability = (PandaComputeOverlap) overlap_probability, 73 | .match_probability = (PandaComputeMatch) match_probability, 74 | .prob_unpaired = qual_nn_simple_bayesian, 75 | }; 76 | 77 | /* The constructor for your algorithm. It needs to call the super constructor with your class, then you may initialise your variables. Also, include the definition in pandaseq-algorithm.h. */ 78 | PandaAlgorithm panda_algorithm_example_new( 79 | void) { 80 | PandaAlgorithm algo = panda_algorithm_new(&panda_algorithm_example_class); 81 | /* Set default parameters here. Use the setters defined below. */ 82 | return algo; 83 | } 84 | 85 | /* Create getters and setters for all the parameters of the algorithm. Create a pair for each parameter needed. Also, include the definition in pandaseq-algorithm.h. */ 86 | double panda_algorithm_example_get_parameter( 87 | PandaAlgorithm algorithm) { 88 | if (panda_algorithm_is_a(algorithm, &panda_algorithm_example_bayes_class)) { 89 | return ((struct example *) panda_algorithm_data(algorithm))->parameter; 90 | } else { 91 | return -1; 92 | } 93 | } 94 | 95 | void panda_algorithm_example_set_parameterrror_estimation( 96 | PandaAlgorithm algorithm, 97 | double parameter) { 98 | if (panda_algorithm_is_a(algorithm, &panda_algorithm_example_class)) { 99 | struct example *data = panda_algorithm_data(algorithm); 100 | data->parameter = parameter; 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /pandaseq-seqid.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 
/**
 * Copy a sequence identifier.
 * @src: The structure to read.
 * @dest: (out caller-allocates): The structure to write.
 */
void panda_seqid_copy(
	const panda_seq_identifier *src,
	panda_seq_identifier *dest);
66 | * Returns: The function returns the direction of the sequence (1 for forward, 2 or 3 for reverse) or 0 if an error occurs. Sequences from the Short Read Archive are always 1. 67 | */ 68 | int panda_seqid_parse( 69 | panda_seq_identifier *id, 70 | const char *input, 71 | PandaTagging policy); 72 | 73 | /** 74 | * Parse the Illumina header 75 | * 76 | * @id: (out caller-allocates): The structure to fill with the parse result. 77 | * @detected_format: (out): The pipeline that produced this header. 78 | * @end_ptr: (out) (transfer none): The point in the input where parsing stopped. If parsing was successful, this will be the end of the string. 79 | * Returns: The function returns the direction of the sequence (1 for forward, 2 or 3 for reverse) or 0 if an error occurs. Sequences from the Short Read Archive are always 1. 80 | * @see panda_seqid_parse 81 | */ 82 | int panda_seqid_parse_fail( 83 | panda_seq_identifier *id, 84 | const char *input, 85 | PandaTagging policy, 86 | PandaIdFmt *detected_format, 87 | const char **end_ptr); 88 | 89 | /* === Methods === */ 90 | 91 | /** 92 | * Order two Illumina headers 93 | */ 94 | int panda_seqid_compare( 95 | const panda_seq_identifier *one, 96 | const panda_seq_identifier *two); 97 | 98 | /** 99 | * Compare two Illumina headers 100 | */ 101 | bool panda_seqid_equal( 102 | const panda_seq_identifier *one, 103 | const panda_seq_identifier *two); 104 | 105 | /** 106 | * Write an Illumina header for a sequence identifier to a file 107 | */ 108 | void panda_seqid_print( 109 | const panda_seq_identifier *id, 110 | FILE *file); 111 | 112 | /** 113 | * Create an Illumina header for a sequence identifier 114 | * @id: (allow-none): The identifer to be formatted 115 | * Returns: (transfer none): Subsequent calls will obliterate the previously returned string. 
116 | */ 117 | const char *panda_seqid_str( 118 | const panda_seq_identifier *id); 119 | 120 | /** 121 | * Write the Illumina header to a printf-like function 122 | * @xprintf: (closure x): The callback to accept the input. 123 | */ 124 | void panda_seqid_xprint( 125 | const panda_seq_identifier *id, 126 | PandaPrintf xprintf, 127 | void *x); 128 | EXTERN_C_END 129 | #endif 130 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | AC_INIT([pandaseq], [2.11], [andre@masella.name]) 2 | AC_PREREQ([2.60]) 3 | AM_INIT_AUTOMAKE([-Wall foreign]) 4 | AC_CONFIG_MACRO_DIR([m4]) 5 | AC_CONFIG_HEADERS(config.h) 6 | AC_ARG_ENABLE(debug, AC_HELP_STRING([--enable-debug], [build with debugging output [default=no].]), [DEBUG="$enableval"], [DEBUG="no"]) 7 | AC_ARG_ENABLE(threads, AC_HELP_STRING([--disable-threads], [disable thread support (default is autodetect)])) 8 | AC_ARG_WITH(max-len, AC_HELP_STRING([--with-max-len=LEN], [sets the maximum read length]), [MAX_LEN="$withval"], [MAX_LEN="450"]) 9 | 10 | if ! 
test "$MAX_LEN" -eq "$MAX_LEN" 2> /dev/null 11 | then 12 | AC_MSG_ERROR([Bad read length $MAX_LEN]) 13 | fi 14 | AC_DEFINE_UNQUOTED([MAX_LEN], $MAX_LEN, [Maximum read length]) 15 | 16 | m4_pattern_allow([AM_PROG_AR]) 17 | AM_PROG_AR 18 | AM_PROG_CC_C_O 19 | AC_PROG_LIBTOOL 20 | LT_INIT 21 | LT_LIB_M 22 | LT_PROG_RC 23 | AX_CHECK_CFLAGS([-pedantic]) 24 | AC_HEADER_STDC 25 | 26 | m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])], ) 27 | 28 | if test "x$enable_debug" = xyes; then 29 | AC_DEFINE(DEBUG, 1, [Enable debugging code]) 30 | fi 31 | 32 | AC_CANONICAL_BUILD 33 | AC_CANONICAL_HOST 34 | 35 | AC_MSG_CHECKING([(non-cross) C compiler for building tools]) 36 | if test "$cross_compiling" = "yes"; then 37 | CC_FOR_BUILD="${CC_FOR_BUILD-gcc}" 38 | else 39 | CC_FOR_BUILD="${CC_FOR_BUILD-$CC}" 40 | fi 41 | AC_MSG_RESULT([$CC_FOR_BUILD]) 42 | AC_ARG_VAR(CC_FOR_BUILD,[C compiler for building tools]) 43 | 44 | AC_MSG_CHECKING([for module flags]) 45 | is_windows=false 46 | MODULE_CFLAGS="$CFLAGS" 47 | MODULE_LDFLAGS="$LDFLAGS -no-undefined -avoid-version -shared" 48 | MODULE_LIBTOOL=libtool 49 | case "$host" in 50 | *-mingw*) 51 | MODULE_LDFLAGS="$MODULE_LDFLAGS -Wl,-static" 52 | is_windows=true 53 | ;; 54 | *darwin*|*Darwin*) 55 | MODULE_LDFLAGS="$MODULE_LDFLAGS -flat_namespace" 56 | MODULE_LIBTOOL=glibtool 57 | ;; 58 | esac 59 | AC_MSG_RESULT([$MODULE_LDFLAGS]) 60 | AC_SUBST(MODULE_LIBTOOL) 61 | AC_SUBST(MODULE_CFLAGS) 62 | AC_SUBST(MODULE_LDFLAGS) 63 | 64 | if test "$enable_threads" != no; then 65 | ACX_PTHREAD 66 | else 67 | acx_pthread_ok=no 68 | fi 69 | AM_CONDITIONAL([PTHREAD], [test x$acx_pthread_ok = xyes]) 70 | 71 | AG_CHECK_UNAME_SYSCALL 72 | AC_CHECK_HEADERS_ONCE([sys/param.h]) 73 | AC_CHECK_HEADERS([sys/sysctl.h], [], [], 74 | [[#if HAVE_SYS_PARAM_H 75 | # include 76 | #endif 77 | ]]) 78 | 79 | PKG_CHECK_MODULES(Z, [ zlib ]) 80 | PKG_CHECK_MODULES(CURL, [ libcurl ], [have_curl=true], [have_curl=false]) 81 | AM_CONDITIONAL([LIBCURL], [test x$have_curl = 
xtrue])
LEGACY_CHECK_MODULES(BZ, [bzlib.h], [bz2], [BZ2_bzDecompressInit], [], [], [AC_MSG_ERROR([*** bzip2 required, install bzip2 library])])
LEGACY_CHECK_MODULES(LTDL, [ltdl.h], [ltdl], [lt_dlinit], [], [], [AC_MSG_ERROR([*** ltdl required, install libtool library.])])

AM_CONDITIONAL([IS_WINDOWS], [test x$is_windows = xtrue])

# The library name carries only the major package version: everything from the
# first dot of PACKAGE_VERSION onwards is stripped.
LIB_NAME=pandaseq-$(echo $PACKAGE_VERSION | sed -e 's/\..*$//g')
AC_SUBST(LIB_NAME)

VERSION_MAJOR=$(echo $PACKAGE_VERSION | cut -f 1 -d . )
VERSION_MINOR=$(echo $PACKAGE_VERSION | cut -f 2 -d . )
AC_DEFINE_UNQUOTED(VERSION_MAJOR, ${VERSION_MAJOR}, [Package major version number.])
AC_DEFINE_UNQUOTED(VERSION_MINOR, ${VERSION_MINOR}, [Package minor version number.])

# http://www.gnu.org/software/libtool/manual/html_node/Updating-version-info.html#Updating-version-info
LIB_MAJOR=7
LIB_MINOR=0
LIB_VER=${LIB_MAJOR}:${LIB_MINOR}:0
LIB_URL_VER=0:0:0
AC_DEFINE_UNQUOTED(LIB_MAJOR, ${LIB_MAJOR}, [Library major version number.])
AC_DEFINE_UNQUOTED(LIB_MINOR, ${LIB_MINOR}, [Library minor version number.])
AC_SUBST(LIB_MAJOR)
AC_SUBST(LIB_VER)
AC_SUBST(LIB_URL_VER)

AC_CONFIG_FILES([Makefile])
AC_CONFIG_FILES([pandaseq.spec])
AC_CONFIG_FILES(${LIB_NAME}.pc:pc.in, [], [LIB_NAME=$LIB_NAME])
AC_CONFIG_FILES(${LIB_NAME}.vapi:vapi.in, [], [LIB_NAME=$LIB_NAME])
AC_CONFIG_FILES(${LIB_NAME}-url.deps:deps-url.in, [], [LIB_NAME=$LIB_NAME])
AC_CONFIG_FILES(${LIB_NAME}-url.pc:pc-url.in, [], [LIB_NAME=$LIB_NAME])
AC_CONFIG_FILES(${LIB_NAME}-url.vapi:vapi-url.in, [], [LIB_NAME=$LIB_NAME])
AC_CONFIG_FILES([pandaxs:pandaxs.in], [chmod +x pandaxs])
AC_CONFIG_FILES([build-macos-pkg:build-macos-pkg.in], [chmod +x build-macos-pkg])
AC_OUTPUT
-------------------------------------------------------------------------------- /pandaseq-module.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | 19 | #ifndef _PANDASEQ_MODULE_H 20 | # define _PANDASEQ_MODULE_H 21 | # ifdef __cplusplus 22 | # define EXTERN_C_BEGIN extern "C" { 23 | # define EXTERN_C_END } 24 | # else 25 | # define EXTERN_C_BEGIN 26 | # define EXTERN_C_END 27 | # endif 28 | # include 29 | EXTERN_C_BEGIN 30 | /** 31 | * The current module API version of the running library 32 | */ 33 | int panda_api_version( 34 | void); 35 | 36 | /* === Constructors === */ 37 | 38 | /** 39 | * Create a module given sequence checking parameters. 40 | * 41 | * @name: the name of the module, for user interaction 42 | * @check: (closure user_data): the function to be run after assembly 43 | * @precheck: (closure user_data): a function to be run before assembly 44 | * @user_data: (transfer full): the context data for the functions. The user is responsible for managing the memory associated with user_data, but the cleanup function will always be called to do so. 
45 | * @cleanup: (closure user_data): a function to be called when this module is garbage collected 46 | */ 47 | PandaModule panda_module_new( 48 | const char *name, 49 | PandaCheck check, 50 | PandaPreCheck precheck, 51 | void *user_data, 52 | PandaDestroy cleanup); 53 | 54 | /** 55 | * Load a module from a string containing the module name and arguments. 56 | * 57 | * @path: the name or path to a module separated by LT_PATHSEP_CHAR and any arguments to the initialisation function of that module 58 | */ 59 | PandaModule panda_module_load( 60 | PandaLogProxy logger, 61 | const char *path); 62 | 63 | /* === Methods === */ 64 | 65 | /** 66 | * Increase the reference count on a module. 67 | */ 68 | PandaModule panda_module_ref( 69 | PandaModule module); 70 | 71 | /** 72 | * Decrease the reference count on a module. 73 | * @module: (transfer full): the module to release. 74 | */ 75 | void panda_module_unref( 76 | PandaModule module); 77 | 78 | /* === Getters and Setters === */ 79 | 80 | /** 81 | * Get the API version of a module. 82 | * 83 | * This is only appropriate for loaded modules. Modules constructed by panda_module_new will always return PANDA_API. 84 | */ 85 | int panda_module_get_api( 86 | PandaModule module); 87 | 88 | /** 89 | * Get the arguments passed on loading of a module. 90 | * 91 | * This is only appropriate for loaded modules. 92 | * Returns: (transfer none) (allow-none): The arguments. 93 | */ 94 | const char *panda_module_get_args( 95 | PandaModule module); 96 | 97 | /** 98 | * Get the description of a module. 99 | * 100 | * This is only appropriate for loaded modules. 101 | * Returns: (transfer none) (allow-none): The description help text. 102 | */ 103 | const char *panda_module_get_description( 104 | PandaModule module); 105 | 106 | /** 107 | * Get the name of a module. 
108 | * 109 | * Returns: (transfer none): the module's name 110 | */ 111 | const char *panda_module_get_name( 112 | PandaModule module); 113 | 114 | /** 115 | * Get the usage information (i.e., help text) of a module. 116 | * 117 | * This is only appropriate for loaded modules. 118 | * Returns: (transfer none) (allow-none): The usage help text. 119 | */ 120 | const char *panda_module_get_usage( 121 | PandaModule module); 122 | 123 | /** 124 | * Get the version of a module. 125 | * 126 | * This is only appropriate for loaded modules. 127 | * Returns: (transfer none) (allow-none): The version string. 128 | */ 129 | const char *panda_module_get_version( 130 | PandaModule module); 131 | EXTERN_C_END 132 | #endif 133 | -------------------------------------------------------------------------------- /algo.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2013 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 
16 | 17 | */ 18 | #include "config.h" 19 | #include 20 | #include 21 | #ifdef HAVE_PTHREAD 22 | # include 23 | #endif 24 | #include "pandaseq.h" 25 | #include "algo.h" 26 | 27 | double panda_algorithm_quality_compare( 28 | PandaAlgorithm algorithm, 29 | const panda_qual *a, 30 | const panda_qual *b) { 31 | return algorithm->clazz->match_probability(&algorithm->end, (a->nt & b->nt) != '\0', a->qual, b->qual); 32 | } 33 | 34 | void *panda_algorithm_data( 35 | PandaAlgorithm algo) { 36 | return &algo->end; 37 | } 38 | 39 | PandaAlgorithmClass panda_algorithm_class( 40 | PandaAlgorithm algo) { 41 | return algo->clazz; 42 | } 43 | 44 | bool panda_algorithm_is_a( 45 | PandaAlgorithm algo, 46 | PandaAlgorithmClass clazz) { 47 | return algo != NULL && algo->clazz == clazz; 48 | } 49 | 50 | PandaAlgorithm panda_algorithm_ref( 51 | PandaAlgorithm algo) { 52 | #ifdef HAVE_PTHREAD 53 | pthread_mutex_lock(&algo->mutex); 54 | #endif 55 | algo->refcnt++; 56 | #ifdef HAVE_PTHREAD 57 | pthread_mutex_unlock(&algo->mutex); 58 | #endif 59 | return algo; 60 | } 61 | 62 | void panda_algorithm_unref( 63 | PandaAlgorithm algo) { 64 | size_t count; 65 | if (algo == NULL) 66 | return; 67 | #ifdef HAVE_PTHREAD 68 | pthread_mutex_lock(&algo->mutex); 69 | #endif 70 | count = --(algo->refcnt); 71 | #ifdef HAVE_PTHREAD 72 | pthread_mutex_unlock(&algo->mutex); 73 | #endif 74 | if (count == 0) { 75 | #ifdef HAVE_PTHREAD 76 | pthread_mutex_destroy(&algo->mutex); 77 | #endif 78 | if (algo->clazz->data_destroy != NULL) { 79 | algo->clazz->data_destroy(&algo->end); 80 | } 81 | free(algo); 82 | } 83 | } 84 | 85 | PandaAlgorithm panda_algorithm_new( 86 | PandaAlgorithmClass clazz) { 87 | PandaAlgorithm instance = malloc(sizeof(struct panda_algorithm) + clazz->data_size); 88 | #ifdef HAVE_PTHREAD 89 | pthread_mutex_init(&instance->mutex, NULL); 90 | #endif 91 | instance->refcnt = 1; 92 | instance->clazz = clazz; 93 | return instance; 94 | } 95 | 96 | PandaAlgorithmClass *panda_algorithms = NULL; 97 | 
size_t panda_algorithms_length = 0; 98 | static size_t algorithms_size = 10; 99 | 100 | static int algorithm_compare( 101 | const void *a, 102 | const void *b) { 103 | return strcmp(((*(PandaAlgorithmClass *) a))->name, (*((PandaAlgorithmClass *) b))->name); 104 | } 105 | 106 | void panda_algorithm_register( 107 | PandaAlgorithmClass clazz) { 108 | size_t it; 109 | for (it = 0; it < panda_algorithms_length; it++) { 110 | if (panda_algorithms[it] == clazz) { 111 | return; 112 | } 113 | } 114 | if (panda_algorithms_length == algorithms_size) { 115 | algorithms_size *= 2; 116 | panda_algorithms = realloc(panda_algorithms, algorithms_size * sizeof(PandaAlgorithmClass)); 117 | } 118 | panda_algorithms[panda_algorithms_length++] = clazz; 119 | qsort(panda_algorithms, panda_algorithms_length, sizeof(PandaAlgorithmClass), algorithm_compare); 120 | } 121 | 122 | __attribute__ ((constructor)) 123 | static void lib_init( 124 | void) { 125 | panda_algorithms = calloc(sizeof(PandaAlgorithmClass), algorithms_size); 126 | panda_algorithm_register(&panda_algorithm_ea_util_class); 127 | panda_algorithm_register(&panda_algorithm_flash_class); 128 | panda_algorithm_register(&panda_algorithm_pear_class); 129 | panda_algorithm_register(&panda_algorithm_rdp_mle_class); 130 | panda_algorithm_register(&panda_algorithm_simple_bayes_class); 131 | panda_algorithm_register(&panda_algorithm_stitch_class); 132 | panda_algorithm_register(&panda_algorithm_uparse_class); 133 | } 134 | 135 | __attribute__ ((destructor)) 136 | static void lib_destroy( 137 | void) { 138 | free(panda_algorithms); 139 | } 140 | -------------------------------------------------------------------------------- /offset.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 
2 | Copyright (C) 2011-2013 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | #include "config.h" 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "pandaseq.h" 25 | #include "prob.h" 26 | #include "table.h" 27 | 28 | #ifndef M_LN2 29 | # define M_LN2 0.69314718055994530942 30 | #endif 31 | 32 | #define CIRC(index, len) (((index) + (len)) % (len)) 33 | 34 | /* Compute 1-exp(p) See */ 35 | double panda_log1mexp( 36 | double p) { 37 | return (p > M_LN2) ? log1p(-exp(-p)) : log(-expm1(-p)); 38 | } 39 | 40 | typedef void ( 41 | *base_score) ( 42 | const void *data, 43 | panda_nt *base, 44 | double *prob, 45 | double *notprob); 46 | 47 | static size_t computeoffset( 48 | double threshold, 49 | double penalty, 50 | bool reverse, 51 | const unsigned char *seq, 52 | size_t seq_length, 53 | size_t size, 54 | base_score score, 55 | const panda_nt *primer, 56 | size_t primerlen) { 57 | /* Circular buffer of probabilities of primer alignment indexed by the offset. 
*/ 58 | double probabilities[primerlen]; 59 | double bestpr = exp(primerlen * threshold); 60 | size_t bestindex = 0; 61 | size_t index; 62 | if (primerlen > seq_length) { 63 | return 0; 64 | } 65 | 66 | for (index = 0; index < primerlen; index++) { 67 | probabilities[index] = -INFINITY; 68 | } 69 | 70 | for (index = 0; index < seq_length; index++) { 71 | ptrdiff_t x; 72 | double last_pr = exp(probabilities[CIRC(index, primerlen)] / (index + 1)) - index * penalty; 73 | /* The last bucket in the buffer holds the probability of a complete alignment. If it so better than we have seen previously, store it. */ 74 | if (last_pr > bestpr) { 75 | bestpr = last_pr; 76 | bestindex = index + 1; 77 | } 78 | probabilities[CIRC(index, primerlen)] = 0; 79 | for (x = (ptrdiff_t) (primerlen > index ? index : primerlen - 1); x >= 0; x--) { 80 | if (!PANDA_NT_IS_N(primer[x])) { 81 | panda_nt nt; 82 | double p; 83 | double notp; 84 | score(&seq[size * (reverse ? (seq_length - index - 1) : index)], &nt, &p, ¬p); 85 | probabilities[CIRC(index - x, primerlen)] += ((nt & primer[x]) != 0) ? 
p : notp; 86 | } 87 | } 88 | } 89 | return bestindex; 90 | } 91 | 92 | void qual_base_score( 93 | const void *data, 94 | panda_nt *base, 95 | double *prob, 96 | double *notprob) { 97 | int phred = PHREDCLAMP(((panda_qual *) data)->qual); 98 | *base = ((panda_qual *) data)->nt; 99 | *prob = qual_score[phred]; 100 | *notprob = qual_score_err[phred]; 101 | } 102 | 103 | size_t panda_compute_offset_qual( 104 | double threshold, 105 | double penalty, 106 | bool reverse, 107 | const panda_qual *haystack, 108 | size_t haystack_length, 109 | const panda_nt *needle, 110 | size_t needle_length) { 111 | return computeoffset(threshold, penalty, reverse, (const unsigned char *) haystack, haystack_length, sizeof(panda_qual), qual_base_score, needle, needle_length); 112 | } 113 | 114 | void result_base_score( 115 | const void *data, 116 | panda_nt *base, 117 | double *prob, 118 | double *notprob) { 119 | *base = ((panda_result *) data)->nt; 120 | *prob = ((panda_result *) data)->p; 121 | *notprob = panda_log1mexp(*prob); 122 | } 123 | 124 | size_t panda_compute_offset_result( 125 | double threshold, 126 | double penalty, 127 | bool reverse, 128 | const panda_result *haystack, 129 | size_t haystack_length, 130 | const panda_nt *needle, 131 | size_t needle_length) { 132 | return computeoffset(threshold, penalty, reverse, (const unsigned char *) haystack, haystack_length, sizeof(panda_result), result_base_score, needle, needle_length); 133 | } 134 | -------------------------------------------------------------------------------- /algo_uparse.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 
2 | Copyright (C) 2011-2013 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | #include "config.h" 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "pandaseq.h" 24 | #include "prob.h" 25 | #include "table.h" 26 | 27 | struct uparse { 28 | double q; 29 | double pmatch; 30 | double pmismatch; 31 | }; 32 | 33 | static double overlap_probability( 34 | struct uparse *data, 35 | const panda_qual *forward, 36 | size_t forward_length, 37 | const panda_qual *reverse, 38 | size_t reverse_length, 39 | size_t overlap) { 40 | size_t matches = 0; 41 | size_t mismatches = 0; 42 | size_t unknowns = 0; 43 | size_t i; 44 | 45 | for (i = 0; i < overlap; i++) { 46 | int findex = forward_length + i - overlap; 47 | int rindex = reverse_length - i - 1; 48 | if (findex < 0 || rindex < 0 || (size_t) findex >= forward_length || (size_t) rindex >= reverse_length) 49 | continue; 50 | panda_nt f = forward[findex].nt; 51 | panda_nt r = reverse[rindex].nt; 52 | if (PANDA_NT_IS_N(f) || PANDA_NT_IS_N(r)) { 53 | unknowns++; 54 | } else if ((f & r) != 0) { 55 | matches++; 56 | } else { 57 | mismatches++; 58 | } 59 | } 60 | 61 | if (overlap >= forward_length && overlap >= reverse_length) { 62 | return (qual_nn_simple_bayesian * unknowns + matches * data->pmatch + mismatches * data->pmismatch); 63 | } else { 64 | return (qual_nn_simple_bayesian * (forward_length + 
reverse_length - 2 * overlap + unknowns) + matches * data->pmatch + mismatches * data->pmismatch); 65 | } 66 | } 67 | 68 | static double match_probability( 69 | struct uparse *data, 70 | bool match, 71 | char a, 72 | char b) { 73 | (void) data; 74 | return (match ? qual_match_uparse : qual_mismatch_uparse)[PHREDCLAMP(a)][PHREDCLAMP(b)]; 75 | } 76 | 77 | static PandaAlgorithm from_string( 78 | const char *argument) { 79 | PandaAlgorithm algo; 80 | double err_estimation; 81 | char *end; 82 | 83 | if (argument == NULL) 84 | return panda_algorithm_uparse_new(); 85 | errno = 0; 86 | err_estimation = strtod(argument, &end); 87 | if (errno == ERANGE || *end != '\0') { 88 | fprintf(stderr, "Cannot parse value: %s\n", argument); 89 | return NULL; 90 | } 91 | if (err_estimation < 0 || err_estimation > 1) { 92 | fprintf(stderr, "Error estimation %f is not a probability.\n", err_estimation); 93 | return NULL; 94 | } 95 | algo = panda_algorithm_uparse_new(); 96 | panda_algorithm_uparse_set_error_estimation(algo, err_estimation); 97 | return algo; 98 | } 99 | 100 | const struct panda_algorithm_class panda_algorithm_uparse_class = { 101 | .data_size = sizeof(struct uparse), 102 | .name = "uparse", 103 | .create = from_string, 104 | .data_destroy = NULL, 105 | .overlap_probability = (PandaComputeOverlap) overlap_probability, 106 | .match_probability = (PandaComputeMatch) match_probability, 107 | .prob_unpaired = qual_nn_simple_bayesian, 108 | }; 109 | 110 | PandaAlgorithm panda_algorithm_uparse_new( 111 | void) { 112 | PandaAlgorithm algo = panda_algorithm_new(&panda_algorithm_uparse_class); 113 | panda_algorithm_uparse_set_error_estimation(algo, 0.36); 114 | return algo; 115 | } 116 | 117 | double panda_algorithm_uparse_get_error_estimation( 118 | PandaAlgorithm algorithm) { 119 | if (panda_algorithm_is_a(algorithm, &panda_algorithm_uparse_class)) { 120 | return ((struct uparse *) panda_algorithm_data(algorithm))->q; 121 | } else { 122 | return -1; 123 | } 124 | } 125 | 126 | 
void panda_algorithm_uparse_set_error_estimation( 127 | PandaAlgorithm algorithm, 128 | double q) { 129 | if (q > 0 && q < 1 && panda_algorithm_is_a(algorithm, &panda_algorithm_uparse_class)) { 130 | struct uparse *data = panda_algorithm_data(algorithm); 131 | data->q = q; 132 | data->pmatch = log(1 - q * q * (1 - 2 * q + 4 * q * q / 3)); 133 | data->pmismatch = log(1 - 4 * q / 3 / (2 * q - 4 * q * q / 3)); 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /pandaseq-log.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | 19 | #ifndef _PANDASEQ_LOG_H 20 | # define _PANDASEQ_LOG_H 21 | # ifdef __cplusplus 22 | # define EXTERN_C_BEGIN extern "C" { 23 | # define EXTERN_C_END } 24 | # else 25 | # define EXTERN_C_BEGIN 26 | # define EXTERN_C_END 27 | # endif 28 | # include 29 | # include 30 | # include 31 | EXTERN_C_BEGIN 32 | /* === Constructors === */ 33 | /** 34 | * Write the log to a transactional writer. 35 | */ 36 | PandaLogProxy panda_log_proxy_new( 37 | PandaWriter writer); 38 | 39 | /** 40 | * Create a new proxy to standard error. 
41 | */ 42 | PandaLogProxy panda_log_proxy_new_stderr( 43 | ); 44 | 45 | /** 46 | * Write the log to an open file. 47 | */ 48 | PandaLogProxy panda_log_proxy_new_file( 49 | FILE *file); 50 | 51 | /** 52 | * Open a file for writing error messages. 53 | * @filename: The file to write. 54 | * @bzip: Write BZipped text rather than plain text. 55 | * Returns: (allow-none): A logger proxy. 56 | */ 57 | PandaLogProxy panda_log_proxy_open_file( 58 | const char *filename, 59 | bool bzip); 60 | 61 | /* === Getters and Setters === */ 62 | 63 | PandaWriter panda_log_proxy_get_writer( 64 | PandaLogProxy proxy); 65 | 66 | /* === Methods === */ 67 | 68 | /** 69 | * Increase the reference count on a proxy. 70 | */ 71 | PandaLogProxy panda_log_proxy_ref( 72 | PandaLogProxy proxy); 73 | 74 | /** 75 | * Decrease the reference count on a proxy. 76 | * @proxy: (transfer full): the proxy to release. 77 | */ 78 | void panda_log_proxy_unref( 79 | PandaLogProxy proxy); 80 | 81 | /** 82 | * Writes an error message to the log with the same semantics as the POSIX perror function. 83 | */ 84 | void panda_log_proxy_perror( 85 | PandaLogProxy proxy, 86 | const char *prefix); 87 | 88 | /** 89 | * Print a message to the log. 90 | * 91 | * This method is thread-safe. 92 | */ 93 | void panda_log_proxy_write( 94 | PandaLogProxy proxy, 95 | PandaCode code, 96 | PandaAssembler assembler, 97 | panda_seq_identifier *id, 98 | const char *message); 99 | 100 | /** 101 | * Print the overlap histogram of an assembler to the log. 102 | * 103 | * This method is thread-safe. 104 | */ 105 | void panda_log_proxy_write_overlap( 106 | PandaLogProxy proxy, 107 | PandaAssembler assembler); 108 | 109 | /** 110 | * Put a printf-like message in the log. 111 | */ 112 | void panda_log_proxy_write_f( 113 | PandaLogProxy proxy, 114 | const char *format, 115 | ...); 116 | 117 | /** 118 | * Print a string to the log. 119 | * 120 | * This method is thread-safe. 
121 | */ 122 | void panda_log_proxy_write_str( 123 | PandaLogProxy proxy, 124 | const char *str); 125 | 126 | /** 127 | * Print a double with a STAT header to the log. 128 | * 129 | * This method is thread-safe. 130 | */ 131 | void panda_log_proxy_stat_double( 132 | PandaLogProxy proxy, 133 | PandaAssembler assembler, 134 | const char *name, 135 | double value); 136 | 137 | /** 138 | * Print a long with a STAT header to the log. 139 | * 140 | * This method is thread-safe. 141 | */ 142 | void panda_log_proxy_stat_long( 143 | PandaLogProxy proxy, 144 | PandaAssembler assembler, 145 | const char *name, 146 | long value); 147 | 148 | /** 149 | * Print a size_t with a STAT header to the log. 150 | * 151 | * This method is thread-safe. 152 | */ 153 | void panda_log_proxy_stat_size_t( 154 | PandaLogProxy proxy, 155 | PandaAssembler assembler, 156 | const char *name, 157 | size_t value); 158 | 159 | /** 160 | * Print a string with a STAT header to the log. 161 | * 162 | * This method is thread-safe. 163 | */ 164 | void panda_log_proxy_stat_str( 165 | PandaLogProxy proxy, 166 | PandaAssembler assembler, 167 | const char *name, 168 | const char *value); 169 | 170 | EXTERN_C_END 171 | #endif 172 | -------------------------------------------------------------------------------- /algo_simple_bayes.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2013 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | #include "config.h" 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "pandaseq.h" 24 | #include "prob.h" 25 | #include "table.h" 26 | 27 | struct simple_bayes { 28 | double q; 29 | double pmatch; 30 | double pmismatch; 31 | }; 32 | 33 | static double overlap_probability( 34 | struct simple_bayes *data, 35 | const panda_qual *forward, 36 | size_t forward_length, 37 | const panda_qual *reverse, 38 | size_t reverse_length, 39 | size_t overlap) { 40 | size_t matches = 0; 41 | size_t mismatches = 0; 42 | size_t unknowns = 0; 43 | size_t i; 44 | 45 | for (i = 0; i < overlap; i++) { 46 | int findex = forward_length + i - overlap; 47 | int rindex = reverse_length - i - 1; 48 | if (findex < 0 || rindex < 0 || (size_t) findex >= forward_length || (size_t) rindex >= reverse_length) 49 | continue; 50 | panda_nt f = forward[findex].nt; 51 | panda_nt r = reverse[rindex].nt; 52 | if (PANDA_NT_IS_N(f) || PANDA_NT_IS_N(r)) { 53 | unknowns++; 54 | } else if ((f & r) != 0) { 55 | matches++; 56 | } else { 57 | mismatches++; 58 | } 59 | } 60 | 61 | if (overlap >= forward_length && overlap >= reverse_length) { 62 | return (qual_nn_simple_bayesian * unknowns + matches * data->pmatch + mismatches * data->pmismatch); 63 | } else { 64 | return (qual_nn_simple_bayesian * (forward_length + reverse_length - 2 * overlap + unknowns) + matches * data->pmatch + mismatches * data->pmismatch); 65 | } 66 | } 67 | 68 | static double match_probability( 69 | struct simple_bayes *data, 70 | bool match, 71 | char a, 72 | char b) { 73 | (void) data; 74 | return (match ? 
qual_match_simple_bayesian : qual_mismatch_simple_bayesian)[PHREDCLAMP(a)][PHREDCLAMP(b)]; 75 | } 76 | 77 | static PandaAlgorithm from_string( 78 | const char *argument) { 79 | PandaAlgorithm algo; 80 | double err_estimation; 81 | char *end; 82 | 83 | if (argument == NULL) 84 | return panda_algorithm_simple_bayes_new(); 85 | errno = 0; 86 | err_estimation = strtod(argument, &end); 87 | if (errno == ERANGE || *end != '\0') { 88 | fprintf(stderr, "Cannot parse value: %s\n", argument); 89 | return NULL; 90 | } 91 | if (err_estimation < 0 || err_estimation > 1) { 92 | fprintf(stderr, "Error estimation %f is not a probability.\n", err_estimation); 93 | return NULL; 94 | } 95 | algo = panda_algorithm_simple_bayes_new(); 96 | panda_algorithm_simple_bayes_set_error_estimation(algo, err_estimation); 97 | return algo; 98 | } 99 | 100 | const struct panda_algorithm_class panda_algorithm_simple_bayes_class = { 101 | .data_size = sizeof(struct simple_bayes), 102 | .name = "simple_bayesian", 103 | .create = from_string, 104 | .data_destroy = NULL, 105 | .overlap_probability = (PandaComputeOverlap) overlap_probability, 106 | .match_probability = (PandaComputeMatch) match_probability, 107 | .prob_unpaired = qual_nn_simple_bayesian, 108 | }; 109 | 110 | PandaAlgorithm panda_algorithm_simple_bayes_new( 111 | void) { 112 | PandaAlgorithm algo = panda_algorithm_new(&panda_algorithm_simple_bayes_class); 113 | panda_algorithm_simple_bayes_set_error_estimation(algo, 0.36); 114 | return algo; 115 | } 116 | 117 | double panda_algorithm_simple_bayes_get_error_estimation( 118 | PandaAlgorithm algorithm) { 119 | if (panda_algorithm_is_a(algorithm, &panda_algorithm_simple_bayes_class)) { 120 | return ((struct simple_bayes *) panda_algorithm_data(algorithm))->q; 121 | } else { 122 | return -1; 123 | } 124 | } 125 | 126 | void panda_algorithm_simple_bayes_set_error_estimation( 127 | PandaAlgorithm algorithm, 128 | double q) { 129 | if (q > 0 && q < 1 && panda_algorithm_is_a(algorithm, 
&panda_algorithm_simple_bayes_class)) { 130 | struct simple_bayes *data = panda_algorithm_data(algorithm); 131 | data->q = q; 132 | data->pmatch = log(0.25 * (1 - 2 * q + q * q)); 133 | data->pmismatch = log((3 * q - 2 * q * q) / 18.0); 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /pandaseq-tablebuilder.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | 19 | #ifndef _PANDASEQ_TBLD_H 20 | # define _PANDASEQ_TBLD_H 21 | # ifdef __cplusplus 22 | # define EXTERN_C_BEGIN extern "C" { 23 | # define EXTERN_C_END } 24 | # else 25 | # define EXTERN_C_BEGIN 26 | # define EXTERN_C_END 27 | # endif 28 | # include 29 | # include 30 | # include 31 | EXTERN_C_BEGIN typedef struct panda_tbld *PandaTBld; 32 | /** 33 | * A formula to compute over a range [0, max) of type size_t. 34 | * @context: (closure): the provided user context. 35 | */ 36 | typedef double ( 37 | *PandaArrayFormula) ( 38 | size_t i, 39 | void *context); 40 | /** 41 | * A formula to compute over a range P_i : i in [0, PHRED_MAX], where P_i is a PHRED score converted to a probability. 
42 | * @context: (closure): the provided user context. 43 | */ 44 | typedef double ( 45 | *PandaArrayProbFormula) ( 46 | double p_i, 47 | void *context); 48 | /** 49 | * A formula to compute over a matrix of size_t pairs. 50 | * @context: (closure): the provided user context. 51 | * @see PandaArrayFormula 52 | */ 53 | typedef double ( 54 | *PandaMatrixFormula) ( 55 | size_t x, 56 | size_t y, 57 | void *context); 58 | /** 59 | * A formula to compute over a matrix of probability pairs. 60 | * @context: (closure): the provided user context. 61 | * @see PandaArrayProbFormula 62 | */ 63 | typedef double ( 64 | *PandaMatrixProbFormula) ( 65 | double p_x, 66 | double p_y, 67 | void *context); 68 | 69 | /* === Constructors === */ 70 | /** 71 | * Create a new table written to a file. 72 | * 73 | * The file name and header symbols will be inferred from the provided name. 74 | * Returns: (allow-none): the builder, unless an error occurred. 75 | */ 76 | PandaTBld panda_tbld_open( 77 | const char *base_name); 78 | 79 | /* === Methods === */ 80 | void panda_tbld_free( 81 | PandaTBld t_bld); 82 | /** 83 | * Write an array filling it with the provided formula. 84 | * @name: the C symbol for the formula. 85 | * @formula: (closure formula_context): the formula to compute. 86 | * @max: the array length of the output. 87 | */ 88 | void panda_tbld_array( 89 | PandaTBld t_bld, 90 | const char *name, 91 | PandaArrayFormula formula, 92 | void *formula_context, 93 | size_t max); 94 | /** 95 | * Write an array filling it with the provided formula, computed over PHRED probabilities. 96 | * @name: the C symbol for the formula. 97 | * @formula: (closure formula_context): the formula to compute. 98 | * @log_output: write the logarithm of the output, rather than the output. 99 | */ 100 | void panda_tbld_array_prob( 101 | PandaTBld t_bld, 102 | const char *name, 103 | PandaArrayProbFormula formula, 104 | void *formula_context, 105 | bool log_output); 106 | /** 107 | * Write `#define` constant. 
108 | * @name: the C symbol for the constant. 109 | * @value: the value to write 110 | */ 111 | void panda_tbld_constant( 112 | PandaTBld t_bld, 113 | const char *name, 114 | double value); 115 | /** 116 | * Write an array of arrays filling it with the provided formula. 117 | * @name: the C symbol for the formula. 118 | * @formula: (closure formula_context): the formula to compute. 119 | * @x_max: the outer array length of the output. 120 | * @y_max: the inner array length of the output. 121 | */ 122 | void panda_tbld_matrix( 123 | PandaTBld t_bld, 124 | const char *name, 125 | PandaMatrixFormula formula, 126 | void *formula_context, 127 | size_t x_max, 128 | size_t y_max); 129 | /** 130 | * Write an array of arrays filling it with the provided formula, computed over PHRED probabilities. 131 | * @name: the C symbol for the formula. 132 | * @formula: (closure formula_context): the formula to compute. 133 | * @log_output: write the logarithm of the output, rather than the output. 134 | */ 135 | void panda_tbld_matrix_prob( 136 | PandaTBld t_bld, 137 | const char *name, 138 | PandaMatrixProbFormula formula, 139 | void *formula_context, 140 | bool log_output); 141 | EXTERN_C_END 142 | #endif 143 | -------------------------------------------------------------------------------- /pandaseq-mux.h: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | */ 18 | 19 | #ifndef _PANDASEQ_MUX_H 20 | # define _PANDASEQ_MUX_H 21 | # ifdef __cplusplus 22 | # define EXTERN_C_BEGIN extern "C" { 23 | # define EXTERN_C_END } 24 | # else 25 | # define EXTERN_C_BEGIN 26 | # define EXTERN_C_END 27 | # endif 28 | # include 29 | EXTERN_C_BEGIN 30 | /* === Constructors === */ 31 | /** 32 | * Create a new multiplexed data source from a sequence callback. 33 | * 34 | * The interface will guarantee that only one call will be made at a time to the data source or the logger. However, the interface makes no guarantees in which thread the call will be made. Furthermore, the logger may be called multiple times by different assembly processes (i.e., the logging messages from different sequences may be interleaved). 35 | * @next: (closure next_data) (scope notified): the next sequence handler 36 | * @logger: the logger callback 37 | */ 38 | PandaMux panda_mux_new( 39 | PandaNextSeq next, 40 | void *next_data, 41 | PandaDestroy next_destroy, 42 | PandaLogProxy logger); 43 | 44 | /** 45 | * Create a new multiplexed reader for the two given FASTQ streams. 46 | * @see panda_create_fastq_reader 47 | */ 48 | PandaMux panda_mux_new_fastq_reader( 49 | PandaBufferRead forward, 50 | void *forward_data, 51 | PandaDestroy forward_destroy, 52 | PandaBufferRead reverse, 53 | void *reverse_data, 54 | PandaDestroy reverse_destroy, 55 | PandaLogProxy logger, 56 | unsigned char qualmin, 57 | PandaTagging policy); 58 | 59 | /** 60 | * Open a pair of FASTQ files for multi-threaded assembly. 
61 | * @see panda_assembler_open_fastq 62 | */ 63 | PandaMux panda_mux_open_fastq( 64 | const char *forward, 65 | const char *reverse, 66 | PandaLogProxy logger, 67 | unsigned char qualmin, 68 | PandaTagging policy); 69 | 70 | /* === Methods === */ 71 | 72 | /** 73 | * Create a new assembler using the multiplexer as its sequence source. 74 | * 75 | * The new assembler will draw sequences from the original source in a thread-safe way. Each assembler is not thread-safe. This means that, to use the interface correctly, one creates a sequence source, wraps it in a multiplexer, then creates an assembler for every thread. Each assembler should be accessed in only one thread. It may be advisable to create a single assembler and set its configuration, then copy the settings to subsequently created assemblers. 76 | * @see panda_assembler_copy_configuration 77 | */ 78 | PandaAssembler panda_mux_create_assembler( 79 | PandaMux mux); 80 | 81 | /** 82 | * Create a new assembler using the multiplexer as its sequence source with a custom k-mer table size. 83 | * @see panda_mux_create_assembler 84 | * @see panda_assembler_new_kmer 85 | */ 86 | PandaAssembler panda_mux_create_assembler_kmer( 87 | PandaMux mux, 88 | size_t num_kmers); 89 | 90 | /** 91 | * Increase the reference count on a multiplexer. 92 | */ 93 | PandaMux panda_mux_ref( 94 | PandaMux mux); 95 | 96 | /** 97 | * Attach a callback for every sequence that fails to have an overlap. 98 | * 99 | * This will be called when a sequence fails to have an overlap computed. This does not include sequences that are missing primers or sequences that are assembled and discarded by modules. 100 | * 101 | * Synchronisation is the responsibility of the callee. This function must be re-entrant. 
102 | * 103 | * @handler: (closure handler_data) (scope notified): the callback for a failed pair 104 | */ 105 | 106 | void panda_mux_set_fail_alignment( 107 | PandaMux mux, 108 | PandaFailAlign handler, 109 | void *handler_data, 110 | PandaDestroy handler_destroy); 111 | 112 | /** 113 | * Decrease the reference count on a multiplexer. 114 | * @mux: (transfer full): the mux to be released. 115 | */ 116 | void panda_mux_unref( 117 | PandaMux mux); 118 | 119 | /** 120 | * Get the number of assemblers created so far. 121 | */ 122 | size_t panda_mux_get_child_count( 123 | PandaMux mux); 124 | 125 | /** 126 | * The logging proxy used by this mux 127 | * Returns: (transfer none): the proxy 128 | */ 129 | PandaLogProxy panda_mux_get_loggger( 130 | PandaMux mux); 131 | 132 | EXTERN_C_END 133 | #endif 134 | -------------------------------------------------------------------------------- /iter.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | 17 | */ 18 | #include 19 | #include 20 | #include "config.h" 21 | #include "pandaseq.h" 22 | #include "misc.h" 23 | 24 | enum iter_type { 25 | ITER_QUAL, 26 | ITER_NT, 27 | ITER_RESULT 28 | }; 29 | 30 | struct panda_iter { 31 | enum iter_type type; 32 | bool reverse; 33 | int k; 34 | kmer_it it; 35 | 36 | panda_qual *qual; 37 | size_t qual_length; 38 | 39 | panda_nt *nt; 40 | size_t nt_length; 41 | 42 | panda_result *result; 43 | size_t result_length; 44 | 45 | panda_kmer output; 46 | }; 47 | 48 | void panda_iter_free( 49 | PandaIter iter) { 50 | free(iter); 51 | } 52 | 53 | PandaIter panda_iter_dup( 54 | PandaIter iter) { 55 | PandaIter new_iter = malloc(sizeof(struct panda_iter)); 56 | memcpy(new_iter, iter, sizeof(struct panda_iter)); 57 | return new_iter; 58 | } 59 | 60 | void panda_iter_reset( 61 | PandaIter iter) { 62 | if (iter->reverse) { 63 | switch (iter->type) { 64 | case ITER_QUAL: 65 | iter->it.posn = iter->qual_length; 66 | break; 67 | case ITER_NT: 68 | iter->it.posn = iter->nt_length; 69 | break; 70 | case ITER_RESULT: 71 | iter->it.posn = iter->result_length; 72 | break; 73 | } 74 | } else { 75 | iter->it.posn = -1; 76 | } 77 | iter->it.bad = iter->k; 78 | } 79 | 80 | int panda_iter_k( 81 | PandaIter iter) { 82 | return iter->k; 83 | } 84 | 85 | size_t panda_iter_bits( 86 | PandaIter iter) { 87 | return iter->k * 2; 88 | } 89 | 90 | #define RETURN_KMER return iter->output.kmer = iter->it.kmer, iter->output.posn = iter->it.posn, &iter->output 91 | const panda_kmer *panda_iter_next( 92 | PandaIter iter) { 93 | switch (iter->type) { 94 | case ITER_QUAL: 95 | if (iter->reverse) { 96 | iter->it.posn--; 97 | _FOREACH_KMER(iter->it, iter->qual,.nt, iter->it.posn, iter->it.bad, >=0, --, iter->k) { 98 | RETURN_KMER; 99 | } 100 | } else { 101 | iter->it.posn++; 102 | _FOREACH_KMER(iter->it, iter->qual,.nt, iter->it.posn, iter->it.bad, <(ptrdiff_t) iter->qual_length, ++, iter->k) { 103 | RETURN_KMER; 104 | } 105 | } 106 | return NULL; 107 | case ITER_NT: 
108 | if (iter->reverse) { 109 | iter->it.posn--; 110 | _FOREACH_KMER(iter->it, iter->nt,, iter->it.posn, iter->it.bad, >=0, --, iter->k) { 111 | RETURN_KMER; 112 | } 113 | } else { 114 | iter->it.posn++; 115 | _FOREACH_KMER(iter->it, iter->nt,, iter->it.posn, iter->it.bad, <(ptrdiff_t) iter->nt_length, ++, iter->k) { 116 | RETURN_KMER; 117 | } 118 | } 119 | return NULL; 120 | case ITER_RESULT: 121 | if (iter->reverse) { 122 | iter->it.posn--; 123 | _FOREACH_KMER(iter->it, iter->result,.nt, iter->it.posn, iter->it.bad, >=0, --, iter->k) { 124 | RETURN_KMER; 125 | } 126 | } else { 127 | iter->it.posn++; 128 | _FOREACH_KMER(iter->it, iter->result,.nt, iter->it.posn, iter->it.bad, <(ptrdiff_t) iter->result_length, ++, iter->k) { 129 | RETURN_KMER; 130 | } 131 | } 132 | return NULL; 133 | } 134 | return NULL; 135 | } 136 | 137 | static PandaIter iter_new( 138 | enum iter_type type, 139 | bool reverse, 140 | int k) { 141 | PandaIter iter = malloc(sizeof(struct panda_iter)); 142 | iter->type = type; 143 | iter->reverse = reverse; 144 | if (k < 1) { 145 | iter->k = KMER_LEN; 146 | } else { 147 | iter->k = ((size_t) k < sizeof(size_t) * 4) ? 
(size_t) k : (sizeof(size_t) * 4); 148 | } 149 | return iter; 150 | } 151 | 152 | PandaIter panda_iterate_qual( 153 | panda_qual *seq, 154 | size_t seq_length, 155 | bool reverse, 156 | int k) { 157 | PandaIter iter = iter_new(ITER_QUAL, reverse, k); 158 | iter->qual = seq; 159 | iter->qual_length = seq_length; 160 | panda_iter_reset(iter); 161 | return iter; 162 | } 163 | 164 | PandaIter panda_iterate_nt( 165 | panda_nt *seq, 166 | size_t seq_length, 167 | bool reverse, 168 | int k) { 169 | PandaIter iter = iter_new(ITER_NT, reverse, k); 170 | iter->nt = seq; 171 | iter->nt_length = seq_length; 172 | panda_iter_reset(iter); 173 | return iter; 174 | } 175 | 176 | PandaIter panda_iterate_result( 177 | panda_result *seq, 178 | size_t seq_length, 179 | bool reverse, 180 | int k) { 181 | PandaIter iter = iter_new(ITER_RESULT, reverse, k); 182 | iter->result = seq; 183 | iter->result_length = seq_length; 184 | panda_iter_reset(iter); 185 | return iter; 186 | } 187 | -------------------------------------------------------------------------------- /nt.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | 17 | */ 18 | #include "config.h" 19 | #include 20 | #include "pandaseq.h" 21 | #include "nt.h" 22 | #include "prob.h" 23 | #include "table.h" 24 | 25 | static char ntchar[16] = { 'N', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N' }; 26 | 27 | static panda_nt complementary_nt[] = { 28 | /*N*/ 0, 29 | /*A-T */ PANDA_NT_T, 30 | /*C-G */ PANDA_NT_G, 31 | /*M-K */ PANDA_NT_G | PANDA_NT_T, 32 | /*G-C */ PANDA_NT_C, 33 | /*R-Y */ PANDA_NT_C | PANDA_NT_T, 34 | /*S-S */ PANDA_NT_C | PANDA_NT_G, 35 | /*V*/ PANDA_NT_A | PANDA_NT_C | PANDA_NT_G, 36 | /*T-A */ PANDA_NT_A, 37 | /*W-W */ PANDA_NT_A | PANDA_NT_T, 38 | /*Y-R */ PANDA_NT_A | PANDA_NT_G, 39 | /*H*/ PANDA_NT_A | PANDA_NT_C | PANDA_NT_G, 40 | /*K-M */ PANDA_NT_A | PANDA_NT_C, 41 | /*D*/ PANDA_NT_A | PANDA_NT_C | PANDA_NT_T, 42 | /*B*/ PANDA_NT_A | PANDA_NT_C | PANDA_NT_G, 43 | /*N-N */ PANDA_NT_A | PANDA_NT_C | PANDA_NT_G | PANDA_NT_T 44 | }; 45 | 46 | panda_nt iupac_forward[32] = { 47 | /*@ */ PANDA_NT_Z, 48 | /*A*/ PANDA_NT_A, 49 | /*B*/ PANDA_NT_C | PANDA_NT_G | PANDA_NT_T, 50 | /*C*/ PANDA_NT_C, 51 | /*D*/ PANDA_NT_A | PANDA_NT_G | PANDA_NT_T, 52 | /*E*/ PANDA_NT_Z, 53 | /*F*/ PANDA_NT_Z, 54 | /*G*/ PANDA_NT_G, 55 | /*H*/ PANDA_NT_A | PANDA_NT_C | PANDA_NT_T, 56 | /*I*/ PANDA_NT_Z, 57 | /*J*/ PANDA_NT_Z, 58 | /*K*/ PANDA_NT_G | PANDA_NT_T, 59 | /*L*/ PANDA_NT_Z, 60 | /*M*/ PANDA_NT_A | PANDA_NT_C, 61 | /*N*/ PANDA_NT_A | PANDA_NT_C | PANDA_NT_G | PANDA_NT_T, 62 | /*O*/ PANDA_NT_Z, 63 | /*P*/ PANDA_NT_Z, 64 | /*Q*/ PANDA_NT_Z, 65 | /*R*/ PANDA_NT_A | PANDA_NT_G, 66 | /*S*/ PANDA_NT_C | PANDA_NT_G, 67 | /*T*/ PANDA_NT_T, 68 | /*U*/ PANDA_NT_T, 69 | /*V*/ PANDA_NT_A | PANDA_NT_C | PANDA_NT_G, 70 | /*W*/ PANDA_NT_A | PANDA_NT_T, 71 | /*X*/ PANDA_NT_A | PANDA_NT_C | PANDA_NT_G | PANDA_NT_T, 72 | /*Y*/ PANDA_NT_C | PANDA_NT_T, 73 | /*Z*/ PANDA_NT_Z, 74 | /*[ */ PANDA_NT_Z, 75 | /*\ */ PANDA_NT_Z, 76 | /*] */ PANDA_NT_Z, 77 | /*^ */ PANDA_NT_Z, 78 | /*_*/ PANDA_NT_Z 79 | }; 80 | 81 | 
panda_nt iupac_reverse[32] = { 82 | /* @ */ PANDA_NT_Z, 83 | /*A*/ PANDA_NT_T, 84 | /*B*/ PANDA_NT_G | PANDA_NT_C | PANDA_NT_A, 85 | /*C*/ PANDA_NT_G, 86 | /*D*/ PANDA_NT_T | PANDA_NT_C | PANDA_NT_A, 87 | /*E*/ PANDA_NT_Z, 88 | /*F*/ PANDA_NT_Z, 89 | /*G*/ PANDA_NT_C, 90 | /*H*/ PANDA_NT_T | PANDA_NT_G | PANDA_NT_A, 91 | /*I*/ PANDA_NT_Z, 92 | /*J*/ PANDA_NT_Z, 93 | /*K*/ PANDA_NT_C | PANDA_NT_A, 94 | /*L*/ PANDA_NT_Z, 95 | /*M*/ PANDA_NT_T | PANDA_NT_G, 96 | /*N*/ PANDA_NT_A | PANDA_NT_C | PANDA_NT_G | PANDA_NT_T, 97 | /*O*/ PANDA_NT_Z, 98 | /*P*/ PANDA_NT_Z, 99 | /*Q*/ PANDA_NT_Z, 100 | /*R*/ PANDA_NT_T | PANDA_NT_C, 101 | /*S*/ PANDA_NT_G | PANDA_NT_C, 102 | /*T*/ PANDA_NT_A, 103 | /*U*/ PANDA_NT_A, 104 | /*V*/ PANDA_NT_T | PANDA_NT_G | PANDA_NT_C, 105 | /*W*/ PANDA_NT_T | PANDA_NT_A, 106 | /*X*/ PANDA_NT_A | PANDA_NT_C | PANDA_NT_G | PANDA_NT_T, 107 | /*Y*/ PANDA_NT_G | PANDA_NT_A, 108 | /*Z*/ PANDA_NT_Z, 109 | /*[ */ PANDA_NT_Z, 110 | /*\ */ PANDA_NT_Z, 111 | /*] */ PANDA_NT_Z, 112 | /*^ */ PANDA_NT_Z, 113 | /*_*/ PANDA_NT_Z 114 | }; 115 | 116 | double panda_quality_probability( 117 | const panda_qual *q) { 118 | return exp(panda_quality_log_probability(q)); 119 | } 120 | 121 | double panda_quality_log_probability( 122 | const panda_qual *q) { 123 | return qual_score[PHREDCLAMP(q->qual)]; 124 | } 125 | 126 | char panda_result_phred( 127 | const panda_result *r) { 128 | 129 | char lower = 0; 130 | char upper = PHREDMAX; 131 | 132 | if (r->p <= qual_score[0]) 133 | return 1; 134 | 135 | while (lower < upper) { 136 | char mid = lower + (upper - lower) / 2; 137 | if (qual_score[(int) mid] == r->p) { 138 | return mid; 139 | } 140 | if (mid == lower) { 141 | return lower; 142 | } else if (qual_score[(int) mid] > r->p) { 143 | upper = mid; 144 | } else if (qual_score[(int) mid] < r->p) { 145 | lower = mid + 1; 146 | } 147 | } 148 | 149 | return lower; 150 | } 151 | 152 | panda_nt panda_nt_from_ascii( 153 | char c) { 154 | return iupac_forward[(int) c & 0x1F]; 155 | } 
156 | 157 | panda_nt panda_nt_from_ascii_complement( 158 | char c) { 159 | return iupac_reverse[(int) c & 0x1F]; 160 | } 161 | 162 | panda_nt panda_nt_complement( 163 | panda_nt nt) { 164 | return complementary_nt[nt & 0xF]; 165 | } 166 | 167 | char panda_nt_to_ascii( 168 | panda_nt val) { 169 | if (val < PANDA_NT_Z || val > (panda_nt) 15) { 170 | return 'N'; 171 | } 172 | return ntchar[(int) (val)]; 173 | } 174 | -------------------------------------------------------------------------------- /fileio.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2012 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | 17 | */ 18 | #include "config.h" 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #if HAVE_PTHREAD 25 | # include 26 | #endif 27 | #include "pandaseq.h" 28 | #include "misc.h" 29 | #ifdef HAVE_PTHREAD 30 | # include"pandaseq-mux.h" 31 | #endif 32 | 33 | static bool buff_read_gz( 34 | char *buf, 35 | size_t buf_len, 36 | size_t *read, 37 | void *data) { 38 | gzFile file = (gzFile) data; 39 | int code; 40 | code = gzread(file, buf, buf_len); 41 | if (code < 1) { 42 | *read = 0; 43 | return gzeof(file); 44 | } 45 | *read = code; 46 | return true; 47 | } 48 | 49 | static bool buff_read_bz2( 50 | char *buf, 51 | size_t buf_len, 52 | size_t *read, 53 | void *data) { 54 | BZFILE *file = (BZFILE *) data; 55 | int bzerror; 56 | *read = BZ2_bzRead(&bzerror, file, buf, buf_len); 57 | return bzerror == BZ_OK || bzerror == BZ_STREAM_END; 58 | } 59 | 60 | PandaBufferRead panda_open_buffer( 61 | const char *file_name, 62 | PandaLogProxy logger, 63 | void **user_data, 64 | PandaDestroy *destroy) { 65 | char buffer[2]; 66 | int fd; 67 | *user_data = NULL; 68 | *destroy = NULL; 69 | 70 | fd = open(file_name, O_RDONLY); 71 | if (fd < 0 || read(fd, &buffer, 2) != 2 || lseek(fd, 0, SEEK_SET) != 0) { 72 | panda_log_proxy_write(logger, PANDA_CODE_NO_FILE, NULL, NULL, file_name); 73 | return NULL; 74 | } 75 | if (buffer[0] == 'B' && buffer[1] == 'Z') { 76 | BZFILE *bz_file; 77 | bz_file = BZ2_bzdopen(fd, "r"); 78 | if (bz_file == NULL) { 79 | panda_log_proxy_write(logger, PANDA_CODE_NO_FILE, NULL, NULL, file_name); 80 | close(fd); 81 | return NULL; 82 | } 83 | *user_data = bz_file; 84 | *destroy = BZ2_bzclose; 85 | return buff_read_bz2; 86 | } else { 87 | gzFile gz_file; 88 | gz_file = gzdopen(fd, "r"); 89 | if (gz_file == NULL) { 90 | panda_log_proxy_write(logger, PANDA_CODE_NO_FILE, NULL, NULL, file_name); 91 | close(fd); 92 | return NULL; 93 | } 94 | *user_data = gz_file; 95 | *destroy = (PandaDestroy) gzclose; 96 | return buff_read_gz; 97 | } 98 | } 99 | 
100 | PandaNextSeq panda_open_fastq( 101 | const char *forward, 102 | const char *reverse, 103 | PandaLogProxy logger, 104 | unsigned char qualmin, 105 | PandaTagging policy, 106 | const char *index, 107 | void **user_data, 108 | PandaDestroy *destroy) { 109 | MANAGED_STACK(PandaBufferRead, 110 | forward_file); 111 | MANAGED_STACK(PandaBufferRead, 112 | reverse_file); 113 | MANAGED_STACK(PandaBufferRead, 114 | index_file); 115 | 116 | *user_data = NULL; 117 | *destroy = NULL; 118 | 119 | forward_file = panda_open_buffer(forward, logger, &forward_file_data, &forward_file_destroy); 120 | if (forward_file == NULL) { 121 | return NULL; 122 | } 123 | 124 | reverse_file = panda_open_buffer(reverse, logger, &reverse_file_data, &reverse_file_destroy); 125 | if (reverse_file == NULL) { 126 | DESTROY_STACK(forward_file); 127 | return NULL; 128 | } 129 | index_file = index == NULL ? NULL : panda_open_buffer(index, logger, &index_file_data, &index_file_destroy); 130 | if (index != NULL && index_file == NULL) { 131 | DESTROY_STACK(forward_file); 132 | DESTROY_STACK(reverse_file); 133 | return NULL; 134 | } 135 | 136 | return panda_create_fastq_reader(forward_file, forward_file_data, forward_file_destroy, reverse_file, reverse_file_data, reverse_file_destroy, logger, qualmin, policy, index_file, index_file_data, index_file_destroy, user_data, destroy); 137 | } 138 | 139 | PandaAssembler panda_assembler_open_fastq( 140 | const char *forward, 141 | const char *reverse, 142 | PandaLogProxy logger, 143 | unsigned char qualmin, 144 | PandaTagging policy) { 145 | PandaNextSeq next; 146 | void *next_data; 147 | PandaDestroy next_destroy; 148 | if ((next = panda_open_fastq(forward, reverse, logger, qualmin, policy, NULL, &next_data, &next_destroy)) == NULL) { 149 | return NULL; 150 | } 151 | 152 | return panda_assembler_new(next, next_data, next_destroy, logger); 153 | } 154 | 155 | #ifdef HAVE_PTHREAD 156 | PandaMux panda_mux_open_fastq( 157 | const char *forward, 158 | const char 
*reverse, 159 | PandaLogProxy logger, 160 | unsigned char qualmin, 161 | PandaTagging policy) { 162 | PandaNextSeq next; 163 | void *next_data; 164 | PandaDestroy next_destroy; 165 | if ((next = panda_open_fastq(forward, reverse, logger, qualmin, policy, NULL, &next_data, &next_destroy)) == NULL) { 166 | 167 | return NULL; 168 | } 169 | return panda_mux_new(next, next_data, next_destroy, logger); 170 | } 171 | #endif 172 | -------------------------------------------------------------------------------- /tablebuilder.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2013 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | 17 | */ 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "pandaseq-tablebuilder.h" 25 | #include "prob.h" 26 | 27 | struct panda_tbld { 28 | FILE *header; 29 | FILE *source; 30 | }; 31 | 32 | PandaTBld panda_tbld_open( 33 | const char *base_name) { 34 | char buffer[1024]; 35 | PandaTBld t_bld; 36 | FILE *header; 37 | FILE *source; 38 | size_t it; 39 | if (strlen(base_name) > 1000) 40 | return NULL; 41 | 42 | snprintf(buffer, 1024, "%s.c", base_name); 43 | source = fopen(buffer, "w"); 44 | if (source == NULL) { 45 | perror(buffer); 46 | return NULL; 47 | } 48 | snprintf(buffer, 1024, "%s.h", base_name); 49 | header = fopen(buffer, "w"); 50 | if (header == NULL) { 51 | perror(buffer); 52 | return NULL; 53 | } 54 | t_bld = malloc(sizeof(struct panda_tbld)); 55 | t_bld->source = source; 56 | t_bld->header = header; 57 | for (it = 0; it < strlen(base_name); it++) { 58 | buffer[it] = isalnum(base_name[it]) ? toupper(base_name[it]) : '_'; 59 | } 60 | buffer[it] = '\0'; 61 | fprintf(header, "#ifndef _%s_H\n#define _%s_H\n", buffer, buffer); 62 | return t_bld; 63 | } 64 | 65 | void panda_tbld_free( 66 | PandaTBld t_bld) { 67 | fprintf(t_bld->header, "#endif\n"); 68 | (void) fclose(t_bld->source); 69 | (void) fclose(t_bld->header); 70 | free(t_bld); 71 | } 72 | 73 | void panda_tbld_array( 74 | PandaTBld t_bld, 75 | const char *name, 76 | PandaArrayFormula formula, 77 | void *formula_context, 78 | size_t max) { 79 | size_t i; 80 | fprintf(t_bld->header, "extern const double %s[%zd];\n", name, max); 81 | fprintf(t_bld->source, "const double %s[%zd] = {\n", name, max); 82 | for (i = 0; i < max; i++) { 83 | if (i > 0) { 84 | fprintf(t_bld->source, ","); 85 | } 86 | fprintf(t_bld->source, " %g", formula(i, formula_context)); 87 | } 88 | fprintf(t_bld->source, "};\n"); 89 | } 90 | 91 | struct array_prob { 92 | PandaArrayProbFormula formula; 93 | void *formula_context; 94 | bool log_output; 95 | }; 96 | 97 | static double 
array_prob_formula( 98 | size_t x, 99 | void *_data) { 100 | struct array_prob *data = (struct array_prob *) _data; 101 | double result = data->formula(PROBABILITY(x), data->formula_context); 102 | return data->log_output ? log(result) : result; 103 | } 104 | 105 | void panda_tbld_array_prob( 106 | PandaTBld t_bld, 107 | const char *name, 108 | PandaArrayProbFormula formula, 109 | void *formula_context, 110 | bool log_output) { 111 | struct array_prob context; 112 | 113 | context.formula = formula; 114 | context.formula_context = formula_context; 115 | context.log_output = log_output; 116 | 117 | panda_tbld_array(t_bld, name, array_prob_formula, &context, PHREDMAX + 1); 118 | } 119 | 120 | void panda_tbld_constant( 121 | PandaTBld t_bld, 122 | const char *name, 123 | double value) { 124 | fprintf(t_bld->header, "#define %s %g\n", name, value); 125 | } 126 | 127 | void panda_tbld_matrix( 128 | PandaTBld t_bld, 129 | const char *name, 130 | PandaMatrixFormula formula, 131 | void *formula_context, 132 | size_t x_max, 133 | size_t y_max) { 134 | size_t i, j; 135 | fprintf(t_bld->header, "extern const double %s[][%zd];\n", name, y_max); 136 | fprintf(t_bld->source, "const double %s[][%zd] = {\n", name, y_max); 137 | for (i = 0; i < x_max; i++) { 138 | if (i > 0) { 139 | fprintf(t_bld->source, ", \n"); 140 | } 141 | fprintf(t_bld->source, "\t{"); 142 | for (j = 0; j < y_max; j++) { 143 | if (j > 0) { 144 | fprintf(t_bld->source, ","); 145 | } 146 | 147 | fprintf(t_bld->source, " %g", formula(i, j, formula_context)); 148 | 149 | } 150 | fprintf(t_bld->source, "}"); 151 | } 152 | fprintf(t_bld->source, "};\n"); 153 | } 154 | 155 | struct matrix_prob { 156 | PandaMatrixProbFormula formula; 157 | void *formula_context; 158 | bool log_output; 159 | }; 160 | 161 | static double matrix_prob_formula( 162 | size_t x, 163 | size_t y, 164 | void *_data) { 165 | struct matrix_prob *data = (struct matrix_prob *) _data; 166 | double result = data->formula(PROBABILITY(x), 
PROBABILITY(y), data->formula_context); 167 | return data->log_output ? log(result) : result; 168 | } 169 | 170 | void panda_tbld_matrix_prob( 171 | PandaTBld t_bld, 172 | const char *name, 173 | PandaMatrixProbFormula formula, 174 | void *formula_context, 175 | bool log_output) { 176 | struct matrix_prob context; 177 | 178 | context.formula = formula; 179 | context.formula_context = formula_context; 180 | context.log_output = log_output; 181 | 182 | panda_tbld_matrix(t_bld, name, matrix_prob_formula, &context, PHREDMAX + 1, PHREDMAX + 1); 183 | } 184 | -------------------------------------------------------------------------------- /args_fastq.c: -------------------------------------------------------------------------------- 1 | /* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers. 2 | Copyright (C) 2011-2013 Andre Masella 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | 17 | */ 18 | #define _POSIX_C_SOURCE 2 19 | #include "config.h" 20 | #include 21 | #include "pandaseq.h" 22 | 23 | struct panda_args_fastq { 24 | bool fastq; 25 | const char *forward_filename; 26 | bool no_algn_qual; 27 | PandaWriter no_algn_writer; 28 | PandaTagging policy; 29 | int qualmin; 30 | const char *reverse_filename; 31 | const char *index_filename; 32 | }; 33 | 34 | PandaArgsFastq panda_args_fastq_new( 35 | ) { 36 | PandaArgsFastq data = malloc(sizeof(struct panda_args_fastq)); 37 | data->forward_filename = NULL; 38 | data->no_algn_qual = false; 39 | data->no_algn_writer = NULL; 40 | data->policy = PANDA_TAG_PRESENT; 41 | data->qualmin = 33; 42 | data->reverse_filename = NULL; 43 | data->index_filename = NULL; 44 | 45 | return data; 46 | } 47 | 48 | void panda_args_fastq_free( 49 | PandaArgsFastq data) { 50 | panda_writer_unref(data->no_algn_writer); 51 | free(data); 52 | } 53 | 54 | bool panda_args_fastq_tweak( 55 | PandaArgsFastq data, 56 | char flag, 57 | const char *argument) { 58 | switch (flag) { 59 | case '6': 60 | data->qualmin = 64; 61 | return true; 62 | case 'B': 63 | data->policy = PANDA_TAG_OPTIONAL; 64 | return true; 65 | case 'f': 66 | data->forward_filename = argument; 67 | return true; 68 | case 'i': 69 | data->index_filename = argument; 70 | return true; 71 | case 'j': 72 | fprintf(stderr, "-j option is no longer necessary. 
Compression is auto-detected.\n"); 73 | return true; 74 | case 'r': 75 | data->reverse_filename = argument; 76 | return true; 77 | case 'u': 78 | case 'U': 79 | data->no_algn_qual = flag == 'U'; 80 | panda_writer_unref(data->no_algn_writer); 81 | data->no_algn_writer = panda_writer_open_file(argument, false); 82 | if (data->no_algn_writer == NULL) 83 | perror(argument); 84 | return (data->no_algn_writer != NULL); 85 | default: 86 | return false; 87 | } 88 | } 89 | 90 | static const panda_tweak_general fastq_phred = { '6', true, NULL, "Use PHRED+64 (CASAVA 1.3-1.7) instead of PHRED+33 (CASAVA 1.8+).", false }; 91 | static const panda_tweak_general fastq_barcoded = { 'B', true, NULL, "Allow unbarcoded sequences (try this for BADID errors).", false }; 92 | static const panda_tweak_general fastq_unalign_qual = { 'U', true, "unaligned.txt", "File to write unalignable read pairs with quality scores.", false }; 93 | static const panda_tweak_general fastq_forward = { 'f', false, "forward.fastq", "Input FASTQ file containing forward reads.", false }; 94 | static const panda_tweak_general fastq_index = { 'i', false, "index.fastq", "Input FASTQ file containing separate barcode/index reads.", false }; 95 | static const panda_tweak_general fastq_bzip = { 'j', true, NULL, "Input files are bzipped. 
(Deprecated.)", true }; 96 | static const panda_tweak_general fastq_reverse = { 'r', false, "reverse.fastq", "Input FASTQ file containing reverse reads.", false }; 97 | static const panda_tweak_general fastq_unalign = { 'u', true, "unaligned.txt", "File to write unalignable read pairs.", false }; 98 | 99 | const panda_tweak_general *const panda_args_fastq_args[] = { 100 | &fastq_phred, 101 | &fastq_barcoded, 102 | &fastq_unalign_qual, 103 | &fastq_forward, 104 | &fastq_index, 105 | &fastq_bzip, 106 | &fastq_reverse, 107 | &fastq_unalign 108 | }; 109 | 110 | const size_t panda_args_fastq_args_length = sizeof(panda_args_fastq_args) / sizeof(panda_tweak_general *); 111 | 112 | PandaNextSeq panda_args_fastq_opener( 113 | PandaArgsFastq data, 114 | PandaLogProxy logger, 115 | PandaFailAlign *fail, 116 | void **fail_data, 117 | PandaDestroy *fail_destroy, 118 | void **next_data, 119 | PandaDestroy *next_destroy) { 120 | 121 | if (data->forward_filename == NULL || data->reverse_filename == NULL) { 122 | panda_log_proxy_write_f(logger, "You must supply both forward and reverse reads.\n"); 123 | return NULL; 124 | } 125 | 126 | if (data->no_algn_writer != NULL) { 127 | *fail = (PandaFailAlign) (data->no_algn_qual ? panda_output_fail_qual : panda_output_fail); 128 | *fail_data = data->no_algn_writer; 129 | *fail_destroy = (PandaDestroy) panda_writer_unref; 130 | data->no_algn_writer = NULL; 131 | } else { 132 | *fail = NULL; 133 | *fail_data = NULL; 134 | *fail_destroy = NULL; 135 | } 136 | return panda_open_fastq(data->forward_filename, data->reverse_filename, logger, data->qualmin, data->policy, data->index_filename, next_data, next_destroy); 137 | } 138 | 139 | bool panda_args_fastq_setup( 140 | PandaArgsFastq data, 141 | PandaAssembler assembler) { 142 | /* This doesn't do anything, but it might in future and it's not worth changing the API. 
*/ 143 | (void) data; 144 | (void) assembler; 145 | return true; 146 | } 147 | --------------------------------------------------------------------------------