├── INSTALL ├── README.md ├── bin ├── mkbwt ├── mkfmi └── seq2fun ├── database ├── ko_fullname.txt ├── org_species.txt ├── pathway_hierarchy ├── pathway_ko.txt └── pathway_ko_stats.txt ├── src ├── adaptertrimmer.cpp ├── adaptertrimmer.h ├── basecorrector.cpp ├── basecorrector.h ├── bwt │ ├── Makefile │ ├── bwt.c │ ├── bwt.h │ ├── common.h │ ├── compactfmi.c │ ├── compactfmi.h │ ├── fmi.h │ ├── fmicommon.h │ ├── mkbwt.c │ ├── mkbwt_vars.h │ ├── mkfmi.c │ ├── mkfmi_vars.h │ ├── multikeyqsort.c │ ├── multikeyqsort.h │ ├── readFasta.c │ ├── readFasta.h │ ├── sequence.c │ ├── sequence.h │ ├── suffixArray.c │ └── suffixArray.h ├── bwtfmiDB.cpp ├── bwtfmiDB.h ├── cmdline.h ├── common.h ├── duplicate.cpp ├── duplicate.h ├── evaluator.cpp ├── evaluator.h ├── fastareader.cpp ├── fastareader.h ├── fastqreader.cpp ├── fastqreader.h ├── filter.cpp ├── filter.h ├── filterresult.cpp ├── filterresult.h ├── fragment.cpp ├── fragment.h ├── htmlreporter.cpp ├── htmlreporter.h ├── htmlreporterall.cpp ├── htmlreporterall.h ├── include │ └── ncbi-blast+ │ │ ├── algo │ │ └── blast │ │ │ ├── composition_adjustment │ │ │ ├── composition_constants.h │ │ │ ├── matrix_frequency_data.c │ │ │ └── matrix_frequency_data.h │ │ │ └── core │ │ │ ├── blast_def.h │ │ │ ├── blast_dynarray.c │ │ │ ├── blast_dynarray.h │ │ │ ├── blast_encoding.c │ │ │ ├── blast_encoding.h │ │ │ ├── blast_export.h │ │ │ ├── blast_filter.c │ │ │ ├── blast_filter.h │ │ │ ├── blast_hits.h │ │ │ ├── blast_hits_priv.h │ │ │ ├── blast_hspfilter.h │ │ │ ├── blast_message.c │ │ │ ├── blast_message.h │ │ │ ├── blast_options.c │ │ │ ├── blast_options.h │ │ │ ├── blast_parameters.h │ │ │ ├── blast_posit.c │ │ │ ├── blast_posit.h │ │ │ ├── blast_program.c │ │ │ ├── blast_program.h │ │ │ ├── blast_psi.c │ │ │ ├── blast_psi.h │ │ │ ├── blast_psi_priv.c │ │ │ ├── blast_psi_priv.h │ │ │ ├── blast_query_info.c │ │ │ ├── blast_query_info.h │ │ │ ├── blast_rps.h │ │ │ ├── blast_seg.c │ │ │ ├── blast_seg.h │ │ │ ├── blast_seqsrc.h │ │ │ ├── 
blast_stat.c │ │ │ ├── blast_stat.h │ │ │ ├── blast_toolkit.h │ │ │ ├── blast_util.c │ │ │ ├── blast_util.h │ │ │ ├── gapinfo.h │ │ │ ├── hspfilter_besthit.h │ │ │ ├── hspfilter_collector.c │ │ │ ├── hspfilter_collector.h │ │ │ ├── lookup_wrap.h │ │ │ ├── matrix_freq_ratios.c │ │ │ ├── matrix_freq_ratios.h │ │ │ ├── ncbi_erf.c │ │ │ ├── ncbi_math.c │ │ │ ├── ncbi_math.h │ │ │ ├── ncbi_std.c │ │ │ ├── ncbi_std.h │ │ │ ├── pattern.c │ │ │ ├── pattern.h │ │ │ └── pattern_priv.h │ │ ├── common │ │ ├── ncbi_skew_guard.h │ │ └── ncbiconf_impl.h │ │ ├── connect │ │ ├── connect_export.h │ │ ├── ncbi_core.h │ │ └── ncbi_types.h │ │ ├── corelib │ │ ├── ncbitype.h │ │ └── ncbitype.h.dmnd │ │ ├── ncbiconf.h │ │ ├── ncbiconf_unix.h │ │ └── util │ │ └── tables │ │ ├── raw_scoremat.c │ │ ├── raw_scoremat.h │ │ ├── sm_blosum45.c │ │ ├── sm_blosum50.c │ │ ├── sm_blosum62.c │ │ ├── sm_blosum80.c │ │ ├── sm_blosum90.c │ │ ├── sm_identity.c │ │ ├── sm_pam250.c │ │ ├── sm_pam30.c │ │ ├── sm_pam70.c │ │ └── tables_export.h ├── jsonreporter.cpp ├── jsonreporter.h ├── knownadapters.h ├── makefile ├── nucleotidetree.cpp ├── nucleotidetree.h ├── options.cpp ├── options.h ├── overlapanalysis.cpp ├── overlapanalysis.h ├── peprocessor.cpp ├── peprocessor.h ├── polyx.cpp ├── polyx.h ├── processor.cpp ├── processor.h ├── read.cpp ├── read.h ├── seprocessor.cpp ├── seprocessor.h ├── seq2fun.cpp ├── seqtract ├── seqtract.cpp ├── seqtractpeprocessor.cpp ├── seqtractpeprocessor.h ├── sequence.cpp ├── sequence.h ├── stats.cpp ├── stats.h ├── threadconfig.cpp ├── threadconfig.h ├── threadsconfig2.cpp ├── threadsconfig2.h ├── transsearcher.cpp ├── transsearcher.hpp ├── umiprocessor.cpp ├── umiprocessor.h ├── unittest.cpp ├── unittest.h ├── util.h ├── writer.cpp ├── writer.h ├── writerthread.cpp ├── writerthread.h └── zlib │ ├── crc32.h │ ├── deflate.h │ ├── gzguts.h │ ├── inffast.h │ ├── inffixed.h │ ├── inflate.h │ ├── inftrees.h │ ├── trees.h │ ├── zconf.h │ ├── zlib.h │ └── zutil.h └── testdata ├── 
D1.CE2-S4-LT_R1.fastq.gz ├── D1.CE2-S4-LT_R2.fastq.gz ├── D2.CE2-H2-LT_R1.fastq.gz ├── D2.CE2-H2-LT_R2.fastq.gz ├── example_annotation.txt └── sample.txt /bin/mkbwt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xia-lab/Seq2Fun/32a89d0ee8c47764e54d1f259d3831e8ca0fd2f5/bin/mkbwt -------------------------------------------------------------------------------- /bin/mkfmi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xia-lab/Seq2Fun/32a89d0ee8c47764e54d1f259d3831e8ca0fd2f5/bin/mkfmi -------------------------------------------------------------------------------- /bin/seq2fun: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xia-lab/Seq2Fun/32a89d0ee8c47764e54d1f259d3831e8ca0fd2f5/bin/seq2fun -------------------------------------------------------------------------------- /src/adaptertrimmer.h: -------------------------------------------------------------------------------- 1 | #ifndef ADAPTER_TRIMMER_H 2 | #define ADAPTER_TRIMMER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "overlapanalysis.h" 8 | #include "filterresult.h" 9 | #include "options.h" 10 | 11 | using namespace std; 12 | 13 | class AdapterTrimmer{ 14 | public: 15 | AdapterTrimmer(); 16 | ~AdapterTrimmer(); 17 | 18 | static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit); 19 | static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov, int frontTrimmed1 = 0, int frontTrimmed2 = 0); 20 | static bool trimBySequence(Read* r1, FilterResult* fr, string& adapter, bool isR2 = false, int matchReq = 4); 21 | static bool trimByMultiSequences(Read* r1, FilterResult* fr, vector& adapterList, bool isR2 = false, bool incTrimmedCounter = true); 22 | 23 | static bool trimPolyA(Read* r1, FilterResult* fr, 
bool isR2 = false, bool incTrimmedCounter = true); 24 | 25 | static bool test(); 26 | 27 | 28 | }; 29 | 30 | 31 | #endif -------------------------------------------------------------------------------- /src/basecorrector.cpp: -------------------------------------------------------------------------------- 1 | #include "basecorrector.h" 2 | #include "util.h" 3 | 4 | BaseCorrector::BaseCorrector(){ 5 | } 6 | 7 | 8 | BaseCorrector::~BaseCorrector(){ 9 | } 10 | 11 | int BaseCorrector::correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit) { 12 | OverlapResult ov = OverlapAnalysis::analyze(r1, r2, diffLimit, overlapRequire, diffPercentLimit); 13 | return correctByOverlapAnalysis(r1, r2, fr, ov); 14 | } 15 | 16 | int BaseCorrector::correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov) { 17 | // we only correct overlap 18 | if(ov.diff == 0 || !ov.overlapped) 19 | return 0; 20 | 21 | int ol = ov.overlap_len; 22 | int start1 = max(0, ov.offset); 23 | int start2 = r2->length() - max(0, -ov.offset) - 1; 24 | 25 | const char* seq1 = r1->mSeq.mStr.c_str(); 26 | const char* seq2 = r2->mSeq.mStr.c_str(); 27 | const char* qual1 = r1->mQuality.c_str(); 28 | const char* qual2 = r2->mQuality.c_str(); 29 | 30 | const char GOOD_QUAL = num2qual(30); 31 | const char BAD_QUAL = num2qual(14); 32 | 33 | int corrected = 0; 34 | int uncorrected = 0; 35 | bool r1Corrected = false; 36 | bool r2Corrected = false; 37 | for(int i=0; i= GOOD_QUAL && qual2[p2] <= BAD_QUAL) { 43 | // use R1 44 | r2->mSeq.mStr[p2] = complement(seq1[p1]); 45 | r2->mQuality[p2] = qual1[p1]; 46 | corrected++; 47 | r2Corrected = true; 48 | if(fr) { 49 | fr->addCorrection(seq2[p2], complement(seq1[p1])); 50 | } 51 | } else if(qual2[p2] >= GOOD_QUAL && qual1[p1] <= BAD_QUAL) { 52 | // use R2 53 | r1->mSeq.mStr[p1] = complement(seq2[p2]); 54 | r1->mQuality[p1] = qual2[p2]; 55 | corrected++; 56 | r1Corrected = true; 57 | if(fr) { 58 
| fr->addCorrection(seq1[p1], complement(seq2[p2])); 59 | } 60 | } else { 61 | uncorrected++; 62 | } 63 | } 64 | } 65 | 66 | // should never happen 67 | if(uncorrected + corrected != ov.diff) { 68 | static bool warned = false; 69 | if(!warned){ 70 | cerr << "WARNING: the algorithm is wrong! uncorrected + corrected != ov.diff" << endl; 71 | warned = true; 72 | } 73 | } 74 | 75 | if(corrected > 0 && fr) { 76 | if(r1Corrected && r2Corrected) 77 | fr->incCorrectedReads(2); 78 | else 79 | fr->incCorrectedReads(1); 80 | } 81 | 82 | return corrected; 83 | } 84 | 85 | bool BaseCorrector::test() { 86 | Read r1("@name", 87 | "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCACGGGG", 88 | "+", 89 | "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEE"); 90 | Read r2("@name", 91 | "AAAAAAAAAACCCCGGGGAAAATTTTAAAATTGGGGGGGGGGTGGGGGGGGGGGGG", 92 | "+", 93 | "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEE"); 94 | 95 | correctByOverlapAnalysis(&r1, &r2, NULL, 5, 30, 0.2); 96 | 97 | if(r1.mSeq.mStr != "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCCCGGGG") 98 | return false; 99 | if(r2.mSeq.mStr != "AAAAAAAAAACCCCGGGGAAAATTTTAAAATTGGGGGGGGGGGGGGGGGGGGGGGG") 100 | return false; 101 | if(r1.mQuality != "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE") 102 | return false; 103 | if(r2.mQuality != "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE") 104 | return false; 105 | 106 | return true; 107 | } -------------------------------------------------------------------------------- /src/basecorrector.h: -------------------------------------------------------------------------------- 1 | #ifndef BASE_CORRECTOR_H 2 | #define BASE_CORRECTOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "overlapanalysis.h" 8 | #include "filterresult.h" 9 | #include "options.h" 10 | 11 | using namespace std; 12 | 13 | class BaseCorrector{ 14 | public: 15 | BaseCorrector(); 16 | ~BaseCorrector(); 17 | 18 | static int correctByOverlapAnalysis(Read* r1, Read* r2, 
FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit); 19 | static int correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov); 20 | static bool test(); 21 | }; 22 | 23 | 24 | #endif -------------------------------------------------------------------------------- /src/bwt/Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | #CFLAGS = -g 3 | CFLAGS = -O3 -g -Wno-unused-result 4 | LDLIBS = -lpthread -lm 5 | 6 | ifeq ($(uname -s), "Darwin") 7 | LD_LIBS_STATIC = -Wl,-all_load -lpthread -Wl,-noall_load -lm 8 | else 9 | LD_LIBS_STATIC = -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -lm 10 | endif 11 | 12 | all: mkbwt mkfmi Makefile 13 | 14 | mkbwt: mkbwt.o readFasta.o suffixArray.o multikeyqsort.o sequence.o 15 | 16 | mkfmi: mkfmi.o bwt.o suffixArray.o compactfmi.o 17 | 18 | mkbwt.o: mkbwt_vars.h mkbwt.c common.h multikeyqsort.h sequence.h 19 | 20 | mkfmi.o: mkfmi_vars.h mkfmi.c fmi.h common.h 21 | 22 | sequence.o: sequence.h common.h 23 | 24 | readFasta.o: readFasta.c readFasta.h sequence.h common.h 25 | 26 | compactfmi.o: compactfmi.c compactfmi.h common.h fmicommon.h 27 | 28 | suffixArray.o: suffixArray.c suffixArray.h common.h sequence.h 29 | 30 | bwt.o: bwt.c bwt.h fmi.h common.h 31 | 32 | multikeyqsort.o: multikeyqsort.c multikeyqsort.h 33 | 34 | clean: 35 | rm -f mkfmi mkbwt 36 | 37 | static: LDFLAGS = -static 38 | static: LDLIBS = $(LD_LIBS_STATIC) 39 | static: all 40 | 41 | debug: all 42 | 43 | .PHONY: clean static debug 44 | -------------------------------------------------------------------------------- /src/bwt/bwt.h: -------------------------------------------------------------------------------- 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh, 2 | * Kaiju is licensed under the GPLv3, see the file LICENSE. 
*/ 3 | #ifndef BWT_h 4 | #define BWT_h 5 | 6 | #include "common.h" 7 | #include "fmi.h" 8 | #include "suffixArray.h" 9 | 10 | typedef struct { 11 | IndexType len; // Length of bwt (not counting initial zeros) 12 | int nseq; 13 | uchar *bwt; 14 | 15 | // Alphabet 16 | int alen; 17 | char *alphabet; 18 | 19 | FMI *f; 20 | suffixArray *s; 21 | 22 | } BWT; 23 | 24 | 25 | typedef struct _SI_ { 26 | IndexType start; // Start of suffix interval 27 | int len; // Interval length 28 | int qi; // Position in query 29 | int ql; // Length in query (if relevant) 30 | int count; // Used to count matches below current 31 | int score; 32 | struct _SI_ *next; 33 | struct _SI_ *samelen; 34 | } SI; 35 | 36 | 37 | 38 | /* FUNCTION PROTOTYPES BEGIN ( by funcprototypes.pl ) */ 39 | void write_BWT_header(BWT *b, FILE *bwtfile); 40 | BWT *read_BWT(FILE *bwtfile); 41 | BWT *readIndexes(FILE *fp); 42 | void get_suffix(FMI *fmi, suffixArray *s, IndexType i, int *iseq, IndexType *pos); 43 | uchar *retrieve_seq(int snum, BWT *b); 44 | IndexType InitialSI(FMI *f, uchar ct, IndexType *si); 45 | IndexType UpdateSI(FMI *f, uchar ct, IndexType *si, IndexType *newsi); 46 | void recursive_free_SI(SI *si); 47 | SI *maxMatches(FMI *f, char *str, int len, int L, int max_matches); 48 | SI *maxMatches_withStart(FMI *f, char *str, int len, int L, int max_matches, IndexType si0, IndexType si1, int offset); 49 | SI *greedyExact(FMI *f, char *str, int len, int L, int jump); 50 | /* FUNCTION PROTOTYPES END */ 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /src/bwt/common.h: -------------------------------------------------------------------------------- 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh, 2 | * Kaiju is licensed under the GPLv3, see the file LICENSE. 
*/ 3 | #ifndef COMMON_h 4 | #define COMMON_h 5 | 6 | #include 7 | 8 | typedef unsigned char uchar; 9 | typedef unsigned short int ushort; 10 | typedef unsigned int uint; 11 | typedef long int IndexType; 12 | 13 | static void ERROR(char *text, int errornum) { 14 | fprintf(stderr,"%s\n",text); 15 | exit(errornum); 16 | } 17 | 18 | static void ERRORs(char *format, char *text, int errornum) { 19 | fprintf(stderr,"%s\n",text); 20 | exit(errornum); 21 | } 22 | 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /src/bwt/compactfmi.h: -------------------------------------------------------------------------------- 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh, 2 | * Kaiju is licensed under the GPLv3, see the file LICENSE. */ 3 | #ifndef COMPACTFMI_h 4 | #define COMPACTFMI_h 5 | 6 | #include "common.h" 7 | 8 | /* Simple FM index with a checkpoint for every 256 letters */ 9 | typedef struct { 10 | int alen; // Length of alphabet 11 | IndexType bwtlen; // Total length of BWT 12 | uchar *bwt; // BWT string 13 | int N1; // Total number of entries in index 1 (bwtlen>>ex1 +1); 14 | int N2; // Total number of entries in index 2 (bwtlen>>ex2 +1); 15 | IndexType **index1; // FM index1 (one array per letter) 16 | ushort **index2; // Counts relative to index1 checkpoints (assuming 16 bit int) 17 | int *startLcode; // start numbers for byte encoding of letter and number 18 | } FMI; 19 | 20 | 21 | 22 | 23 | /* FUNCTION PROTOTYPES BEGIN ( by funcprototypes.pl ) */ 24 | FMI *alloc_FMI(uchar *bwt, IndexType bwtlen, int alen); 25 | FMI *read_fmi(FILE *fp); 26 | void write_fmi(const FMI *f, FILE *fp); 27 | IndexType FMindex(FMI *f, uchar ct, IndexType k); 28 | IndexType FMindexCurrent(FMI *f, uchar *c, IndexType k); 29 | void FMindexAll(FMI *f, IndexType k, IndexType *fmia); 30 | void FMIrecode(FMI *fmi); 31 | FMI *makeIndex(uchar *bwt, long bwtlen, int alen); 32 | FMI *makeIndex_OLD(uchar *bwt, long 
bwtlen, int alen); 33 | /* FUNCTION PROTOTYPES END */ 34 | 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /src/bwt/fmi.h: -------------------------------------------------------------------------------- 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh, 2 | * Kaiju is licensed under the GPLv3, see the file LICENSE. */ 3 | #ifndef COMPACTFMI_h 4 | #define COMPACTFMI_h 5 | 6 | #include "common.h" 7 | 8 | /* Simple FM index with a checkpoint for every 256 letters */ 9 | typedef struct { 10 | int alen; // Length of alphabet 11 | IndexType bwtlen; // Total length of BWT 12 | uchar *bwt; // BWT string 13 | int N1; // Total number of entries in index 1 (bwtlen>>ex1 +1); 14 | int N2; // Total number of entries in index 2 (bwtlen>>ex2 +1); 15 | IndexType **index1; // FM index1 (one array per letter) 16 | ushort **index2; // Counts relative to index1 checkpoints (assuming 16 bit int) 17 | int *startLcode; // start numbers for byte encoding of letter and number 18 | } FMI; 19 | 20 | 21 | 22 | 23 | /* FUNCTION PROTOTYPES BEGIN ( by funcprototypes.pl ) */ 24 | FMI *alloc_FMI(uchar *bwt, IndexType bwtlen, int alen); 25 | FMI *read_fmi(FILE *fp); 26 | void write_fmi(const FMI *f, FILE *fp); 27 | IndexType FMindex(FMI *f, uchar ct, IndexType k); 28 | IndexType FMindexCurrent(FMI *f, uchar *c, IndexType k); 29 | void FMindexAll(FMI *f, IndexType k, IndexType *fmia); 30 | void FMIrecode(FMI *fmi); 31 | FMI *makeIndex(uchar *bwt, long bwtlen, int alen); 32 | FMI *makeIndex_OLD(uchar *bwt, long bwtlen, int alen); 33 | /* FUNCTION PROTOTYPES END */ 34 | 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /src/bwt/mkfmi.c: -------------------------------------------------------------------------------- 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh, 2 | * Kaiju is licensed under the GPLv3, see the file 
LICENSE. */ 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "common.h" 9 | #include "fmi.h" 10 | #include "bwt.h" 11 | #include "suffixArray.h" 12 | #include "mkfmi_vars.h" 13 | 14 | void error(char *format, char *arg) { 15 | fprintf(stderr,"ERROR: "); 16 | fprintf(stderr,format,arg); 17 | exit(1); 18 | } 19 | 20 | 21 | int main (int argc, char **argv) { 22 | int l; 23 | FILE *fp=NULL; 24 | BWT *b; 25 | char *filename; 26 | 27 | /* Parsing options and arguments */ 28 | OPT_read_cmdline(opt_struct, argc, argv); 29 | if (help) { OPT_help(opt_struct); exit(0); } 30 | OPT_print_vars(stderr, opt_struct, "# ", 0); 31 | 32 | if (!filenm) { 33 | fprintf(stderr,"You have to specify name for index files (first argument)\n"); 34 | exit(5); 35 | } 36 | 37 | l=strlen(filenm); 38 | filename = (char *)malloc((l+10)*sizeof(char)); 39 | strcpy(filename,filenm); 40 | 41 | /* Read BWT */ 42 | strcpy(filename+l,".bwt"); 43 | fp = fopen(filename,"r"); 44 | if (!fp) error("File %s containing BWT could not be opened for reading\n",filename); 45 | fprintf(stderr,"Reading BWT from file %s ... ",filename); 46 | b = read_BWT(fp); 47 | fclose(fp); 48 | fprintf(stderr,"DONE\n"); 49 | fprintf(stderr,"BWT of length %ld has been read with %d sequencs, alphabet=%s\n", 50 | b->len, b->nseq, b->alphabet); 51 | 52 | /* Read SA */ 53 | strcpy(filename+l,".sa"); 54 | fp = fopen(filename,"r"); 55 | if (!fp) error("File %s containing SA could not be opened for reading\n",filename); 56 | fprintf(stderr,"Reading suffix array from file %s ... ",filename); 57 | b->s = read_suffixArray_header(fp); 58 | /* If the whole SA is saved, don't read it! 
*/ 59 | if (b->s->chpt_exp > 0) read_suffixArray_body(b->s,fp); 60 | fclose(fp); 61 | fprintf(stderr,"DONE\n"); 62 | 63 | /* Concatenate stuff in fmi file */ 64 | strcpy(filename+l,".fmi"); 65 | fp = fopen(filename,"w"); 66 | if (!fp) error("File %s for FMI could not be opened for reading\n",filename); 67 | fprintf(stderr,"Writing BWT header and SA to file %s ... ",filename); 68 | write_BWT_header(b, fp); 69 | write_suffixArray(b->s,fp); 70 | fprintf(stderr,"DONE\n"); 71 | 72 | fprintf(stderr,"Constructing FM index\n"); 73 | b->f = makeIndex(b->bwt, b->len, b->alen); 74 | fprintf(stderr,"\nDONE\n"); 75 | 76 | fprintf(stderr,"Writing FM index to file ... "); 77 | write_fmi(b->f,fp); 78 | fclose(fp); 79 | fprintf(stderr,"DONE\n"); 80 | 81 | if (removecmd) { 82 | int cl = strlen(removecmd); 83 | char *command = malloc(cl+2*l+20); 84 | sprintf(command,"%s %s.sa %s.bwt",removecmd,filenm,filenm); 85 | fprintf(stderr,"Removing files with this command: %s\n",command); 86 | system(command); 87 | free(command); 88 | } 89 | else { 90 | strcpy(filename+l,".bwt"); 91 | fprintf(stderr,"\n !! You can now delete files %s and ",filename); 92 | strcpy(filename+l,".sa"); 93 | fprintf(stderr,"%s !!\n\n",filename); 94 | free(filename); 95 | } 96 | 97 | } 98 | -------------------------------------------------------------------------------- /src/bwt/multikeyqsort.c: -------------------------------------------------------------------------------- 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh, 2 | * Kaiju is licensed under the GPLv3, see the file LICENSE. */ 3 | /* 4 | Copied from 5 | http://www.drdobbs.com/database/sorting-strings-with-three-way-radix-qui/184410724 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #ifndef min 13 | #define min(a, b) ((a)<=(b) ? 
(a) : (b)) 14 | #endif 15 | 16 | 17 | static inline void swap(char *a[], int i, int j) 18 | { char *t = a[i]; 19 | a[i] = a[j]; 20 | a[j] = t; 21 | } 22 | static inline void vecswap(char *a[], int i, int j, int n) 23 | { while (n-- > 0) 24 | swap(a, i++, j++); 25 | } 26 | 27 | 28 | #define ch(i) a[i][depth] 29 | 30 | 31 | /* Faster version */ 32 | 33 | /* Std strcmp use unsigned comparison */ 34 | static inline int my_strcmp(const char *s1, const char *s2) { 35 | while (*s1 && *s2 && *s1==*s2) { ++s1; ++s2; } 36 | return (int)( *s1 - *s2 ); 37 | } 38 | 39 | 40 | static inline int med3func(char *a[], int ia, int ib, int ic, int depth) 41 | { int va, vb, vc; 42 | if ((va=ch(ia)) == (vb=ch(ib))) 43 | return ia; 44 | if ((vc=ch(ic)) == va || vc == vb) 45 | return ic; 46 | return va < vb ? 47 | (vb < vc ? ib : (va < vc ? ic : ia ) ) 48 | : (vb > vc ? ib : (va < vc ? ia : ic ) ); 49 | } 50 | 51 | 52 | void inssort(char *a[], int n, int depth) 53 | { int i, j; 54 | for (i = 1; i < n; i++) 55 | for (j = i; j > 0; j--) { 56 | if (my_strcmp(a[j-1]+depth, a[j]+depth) <= 0) 57 | break; 58 | swap(a, j, j-1); 59 | } 60 | } 61 | 62 | 63 | void ssort2(char *a[], int n, int depth) 64 | { int le, lt, gt, ge, r, v; 65 | int pl, pm, pn, d; 66 | 67 | if (n <= 10) { 68 | inssort(a, n, depth); 69 | return; 70 | } 71 | 72 | pl = 0; 73 | pm = n/2; 74 | pn = n-1; 75 | if (n > 50) { 76 | d = n/8; 77 | pl = med3func(a, pl, pl+d, pl+2*d,depth); 78 | pm = med3func(a, pm-d, pm, pm+d,depth); 79 | pn = med3func(a, pn-2*d, pn-d, pn,depth); 80 | } 81 | pm = med3func(a, pl, pm, pn,depth); 82 | swap(a, 0, pm); 83 | v = ch(0); 84 | for (le = 1; le < n && ch(le) == v; le++) 85 | ; 86 | if (le == n) { 87 | if (v != 0) ssort2(a, n, depth+1); 88 | return; 89 | } 90 | lt = le; 91 | gt = ge = n-1; 92 | for (;;) { 93 | for ( ; lt <= gt && ch(lt) <= v; lt++) 94 | if (ch(lt) == v) swap(a, le++, lt); 95 | for ( ; lt <= gt && ch(gt) >= v; gt--) { 96 | if (ch(gt) == v) swap(a, gt, ge--); 97 | } 98 | if (lt > gt) 
99 | break; 100 | swap(a, lt++, gt--); 101 | } 102 | r = min(le, lt-le); 103 | vecswap(a, 0, lt-r, r); 104 | r = min(ge-gt, n-ge-1); 105 | vecswap(a, lt, n-r, r); 106 | ssort2(a, lt-le, depth); 107 | if (v != 0) 108 | ssort2(a + lt-le, le + n-ge-1, depth+1); 109 | ssort2(a + n-(ge-gt), ge-gt, depth); 110 | } 111 | 112 | 113 | //void ssort2main(char *a[], int n) 114 | void multikeyqsort(char *a[], int n) 115 | { ssort2(a, n, 0); } 116 | -------------------------------------------------------------------------------- /src/bwt/multikeyqsort.h: -------------------------------------------------------------------------------- 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh, 2 | * Kaiju is licensed under the GPLv3, see the file LICENSE. */ 3 | 4 | void multikeyqsort(char **a, int n); 5 | 6 | -------------------------------------------------------------------------------- /src/bwt/readFasta.h: -------------------------------------------------------------------------------- 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh, 2 | * Kaiju is licensed under the GPLv3, see the file LICENSE. */ 3 | #ifndef READFASTA_h 4 | #define READFASTA_h 5 | 6 | #include "common.h" 7 | 8 | /* FUNCTION PROTOTYPES BEGIN ( by funcprototypes.pl ) */ 9 | SEQstruct *revcompSEQstruct(SEQstruct *ss, char *s, char *translate); 10 | SEQstruct *readFasta(FILE *fp, long length, char *transtab, char *complement, char term, int padding); 11 | char *translation_table(char *alphabet, char *translation, char dummy, int casesens); 12 | /* FUNCTION PROTOTYPES END */ 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /src/bwt/sequence.c: -------------------------------------------------------------------------------- 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh, 2 | * Kaiju is licensed under the GPLv3, see the file LICENSE. 
*/ 3 | #include 4 | #include 5 | #include 6 | #include "sequence.h" 7 | 8 | 9 | SEQstruct *alloc_SEQstruct() { 10 | SEQstruct *ss=(SEQstruct *)malloc(sizeof(SEQstruct)); 11 | ss->len = 0; 12 | ss->rc = 0; 13 | ss->pos = 0; 14 | ss->id = NULL; 15 | ss->descr = NULL; 16 | ss->start = NULL; 17 | ss->id_filepos = 0; 18 | ss->seq_filepos = 0; 19 | ss->sort_order = 0; 20 | ss->next=NULL; 21 | return ss; 22 | } 23 | 24 | void free_SEQstruct(SEQstruct *ss) { 25 | if (ss) { 26 | if (ss->id) { 27 | free(ss->id); 28 | ss->id = NULL; 29 | } 30 | if(ss->descr) { 31 | free(ss->descr); 32 | ss->descr = NULL; 33 | } // added; 34 | if(ss->start) { 35 | free(ss->start);//added; 36 | ss->start = NULL; 37 | } 38 | free(ss); 39 | ss = NULL; 40 | } 41 | } 42 | 43 | /* Frees the list starting at base; a NULL base is a no-op. Assumes that base->start points to the whole sequence */ 44 | void recursive_free_SEQstruct(SEQstruct *base) { 45 | SEQstruct *ss, *next; 46 | if (!base) return; /* empty list: nothing to free */ 47 | free(base->start); /* free(NULL) is a no-op */ 48 | base->start = NULL; 49 | 50 | ss=base; 51 | next=ss->next; 52 | while (ss) { 53 | free_SEQstruct(ss); 54 | ss=next; 55 | if (next) next=ss->next; 56 | } 57 | } 58 | 59 | 60 | /* Makes a translation table from an alphabet to a translation, so 61 | table[alphabet[i]] = translation[i] 62 | 63 | If translation==NULL, the letter alphabet[i] is translated to i. 64 | 65 | Letters not in alphabet are translated to dummy. 66 | If dummy==0, dummy is set to the translation of the last char in alphabet 67 | (assumed to be a "wildcard" character) 68 | 69 | Translation for non-characters is -1. 70 | 71 | casesens !=0, means case sensitive, otherwise case insensitive. 72 | 73 | Returns an array (char *table) of length 128 74 | 75 | The length of the translation table (which may contain zeros) 76 | HAS to be as long (or longer) than alphabet. 
77 | 78 | If 0 is not in the alphabet, it is translated to 0 79 | */ 80 | static char *translation_table(char *alphabet, char *translation, char dummy, int casesens) { 81 | int i, l, freetrans=0; 82 | char *table = (char*)malloc(128*sizeof(char)); 83 | 84 | l = strlen(alphabet); 85 | 86 | if (translation==0) { 87 | translation = (char *)malloc(l*sizeof(char)); 88 | for (i=0; ia = strdup(a); 140 | astruct->len = strlen(a); 141 | astruct->caseSens = caseSens; 142 | astruct->trans = translation_table(a, NULL, astruct->len-1, astruct->caseSens); 143 | if (revcomp) astruct->comp = dnaComplement(a); 144 | else astruct->comp = NULL; 145 | return astruct; 146 | } 147 | 148 | 149 | void free_AlphabetStruct(AlphabetStruct *astruct) { 150 | if (astruct) { 151 | if (astruct->a) free(astruct->a); 152 | if (astruct->trans) free(astruct->trans); 153 | if (astruct->comp) free(astruct->comp); 154 | free(astruct); 155 | } 156 | } 157 | 158 | 159 | 160 | /* 161 | translate a sequence (s) to numbers 162 | */ 163 | void translate2numbers(uchar *s, const IndexType slen, AlphabetStruct *astruct) { 164 | IndexType k; 165 | for (k=0;ktrans[s[k]]; 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /src/bwt/sequence.h: -------------------------------------------------------------------------------- 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh, 2 | * Kaiju is licensed under the GPLv3, see the file LICENSE. */ 3 | #ifndef SEQUENCE_h 4 | #define SEQUENCE_h 5 | 6 | #include "../bwt/common.h" 7 | 8 | typedef struct __SEQstruct__ { 9 | char *id; 10 | char *descr; // Description (stuff following id). Allocated with id. 11 | char rc; // Reverse complement. 
0=forward 1=complement 12 | long len; 13 | long pos; // Index of sequence in long allocation 14 | char *start; 15 | long id_filepos; 16 | long seq_filepos; 17 | int sort_order; 18 | struct __SEQstruct__ *next; 19 | } SEQstruct; 20 | 21 | typedef struct { 22 | int len; // Alphabet length 23 | int caseSens; 24 | char *a; // Alphabet sequence (0 terminated, first char is terminator, last may be wildcard) 25 | char *trans; // Translate char c to int i: trans[c]=i 26 | char *comp; // DNA complement comp[a]=t, etc. 27 | } AlphabetStruct; 28 | 29 | 30 | 31 | static inline int letter2number(char c, AlphabetStruct *a) { return a->trans[(int)c];} 32 | static inline char number2letter(int i, AlphabetStruct *a) { return a->a[i];} 33 | 34 | 35 | /* FUNCTION PROTOTYPES BEGIN ( by funcprototypes.pl ) */ 36 | SEQstruct *alloc_SEQstruct(); 37 | void free_SEQstruct(SEQstruct *ss); 38 | void recursive_free_SEQstruct(SEQstruct *base); 39 | AlphabetStruct *alloc_AlphabetStruct(char *a, int caseSens, int revcomp); 40 | void free_AlphabetStruct(AlphabetStruct *astruct); 41 | void translate2numbers(uchar *s, const IndexType slen, AlphabetStruct *astruct); 42 | /* FUNCTION PROTOTYPES END */ 43 | 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/bwt/suffixArray.h: -------------------------------------------------------------------------------- 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh, 2 | * Kaiju is licensed under the GPLv3, see the file LICENSE. 
*/ 3 | #ifndef SUFFIXARRAY_h 4 | #define SUFFIXARRAY_h 5 | 6 | #include "common.h" 7 | #include "fmi.h" 8 | #include "sequence.h" 9 | 10 | typedef struct { 11 | IndexType len; // Length of (actual) SA ( = bwtlen) 12 | 13 | // Suffix array checkpoints 14 | IndexType ncheck; // Number of checkpoints 15 | uchar *sa; // Actual array holding SA checkpoints 16 | int chpt_exp; // Exponent of checkpoint distance 17 | int nbytes; // Number of bytes used per entry 18 | int sbits; // Number of bits used for encoding sequence number 19 | int pbits; // Number of bits used to encode position 20 | long mask; // Mask for lowest pbits bits 21 | long check; // Used to check if we are at a checkpoint 22 | 23 | // Sequence information 24 | int nseq; // Number of sequences 25 | char **ids; // IDs (in order of forward sorted seqs) 26 | int *seqTermOrder; // Order of sequence termination 27 | IndexType *seqlengths; // lengths of sequences 28 | IndexType maxlength; // Maximum length of sequences 29 | int hash_step; // distance in hash table 30 | SEQstruct **hash; 31 | char *seqstart; // Start of sequence 32 | 33 | } suffixArray; 34 | 35 | 36 | /* Decode long from n bytes */ 37 | static inline long uchar2long(uchar *c, int n) { 38 | long val=*c++; 39 | while ( --n >0 ) val = (val<<8) + *c++; 40 | return val; 41 | } 42 | 43 | /* 44 | For SA entry k, return seq no. (in *nseq) and position within (*pos) 45 | Entry consists of nbytes bytes starting at position sa+k*nbytes. 
46 | */ 47 | static inline void suffixArray_decode_number(int *nseq, long *pos, long k, suffixArray *s) { 48 | long val = uchar2long( (s->sa + k * s->nbytes), s->nbytes); 49 | *nseq = (int)(val>>s->pbits); 50 | *pos = val & s->mask; 51 | } 52 | 53 | 54 | 55 | 56 | 57 | /* FUNCTION PROTOTYPES BEGIN ( by funcprototypes.pl ) */ 58 | void suffixArray_make_hash(SEQstruct *base, suffixArray *s, int Hstep); 59 | suffixArray *init_suffixArray(SEQstruct *ss, int chpt_exp); 60 | void write_suffixArray_checkpoints(char **sa, IndexType start, IndexType length, 61 | suffixArray *s, FILE *sa_file); 62 | void write_suffixArray_header(suffixArray *s, FILE *fp); 63 | suffixArray *read_suffixArray_header(FILE *fp); 64 | void read_suffixArray_body(suffixArray *s, FILE *fp); 65 | void write_suffixArray(suffixArray *s, FILE *fp); 66 | /* FUNCTION PROTOTYPES END */ 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /src/bwtfmiDB.cpp: -------------------------------------------------------------------------------- 1 | #include "bwtfmiDB.h" 2 | 3 | BwtFmiDB::BwtFmiDB(Options * & opt) { 4 | mOptions = opt; 5 | init(); 6 | } 7 | 8 | BwtFmiDB::~BwtFmiDB() { 9 | free_BWT(); 10 | // free_AlphabetStruct() checks for NULL before touching members and releases a, trans and comp 11 | free_AlphabetStruct(tastruct); 12 | tastruct = NULL; 13 | if (mOptions->transSearch.SEG) { 14 | SegParametersFree(tblast_seg_params); 15 | } 16 | } 17 | 18 | 19 | void BwtFmiDB::free_BWT() { 20 | if (tbwt == NULL) 21 | return; // Check if bwt is NULL 22 | if (tbwt->f != NULL) { 23 | free_FMI(tbwt->f); 24 | tbwt->f = NULL; 25 | } 26 | if (tbwt->s != NULL) { 27 | free_suffixArray(tbwt->s); 28 | tbwt->s = NULL; 29 | } 30 | // Free dynamically allocated members 31 | if (tbwt->bwt != NULL) { 32 | free(tbwt->bwt); 33 | tbwt->bwt = NULL; 34 | } 35 | if (tbwt->alphabet != NULL) { 36 | free(tbwt->alphabet); 37 | tbwt->alphabet = NULL; 38 | } 39 | // Finally, free the BWT structure itself 40 | free(tbwt); 41 | tbwt = NULL; 
} 42 | 43 | void BwtFmiDB::free_FMI(FMI*& fmi) { 44 | if (fmi == NULL) return; // Check if fmi is NULL 45 | 46 | if (fmi->index1 != NULL) { 47 | for (int i = 0; i < fmi->N1; i++) { 48 | if (fmi->index1[i] != NULL) { 49 | free(fmi->index1[i]); 50 | fmi->index1[i] = NULL; 51 | } 52 | } 53 | free(fmi->index1); 54 | fmi->index1 = NULL; 55 | } 56 | 57 | if (fmi->index2 != NULL) { 58 | for (int i = 0; i < fmi->N2; i++) { 59 | if (fmi->index2[i] != NULL) { 60 | free(fmi->index2[i]); 61 | fmi->index2[i] = NULL; 62 | } 63 | } 64 | free(fmi->index2); 65 | fmi->index2 = NULL; 66 | } 67 | 68 | if (fmi->startLcode != NULL) { 69 | free(fmi->startLcode); 70 | fmi->startLcode = NULL; 71 | } 72 | 73 | // Free dynamically allocated members 74 | if (fmi->bwt != NULL) { 75 | free(fmi->bwt); 76 | fmi->bwt = NULL; 77 | } 78 | // Finally, free the FMI structure itself 79 | free(fmi); 80 | } 81 | 82 | void BwtFmiDB::free_suffixArray(suffixArray*& sa) { 83 | if (sa == NULL) return; // Check if sa is NULL 84 | // Free dynamically allocated members 85 | if (sa->sa != NULL) { 86 | free(sa->sa); 87 | sa->sa = NULL; 88 | } 89 | if (sa->seqTermOrder != NULL) { 90 | free(sa->seqTermOrder); 91 | sa->seqTermOrder = NULL; 92 | } 93 | if (sa->seqlengths != NULL) { 94 | free(sa->seqlengths); 95 | sa->seqlengths = NULL; 96 | } 97 | if (sa->hash != NULL) { 98 | for (int i = 0; i < sa->nseq; ++i) { 99 | SEQstruct *cur = sa->hash[i]; 100 | recursive_free_SEQstruct(cur); 101 | } 102 | free(sa->hash); 103 | sa->hash = NULL; 104 | } 105 | if (sa->ids != NULL) { 106 | for (int i = 0; i < sa->nseq; ++i) { 107 | free(sa->ids[i]); 108 | } 109 | free(sa->ids); 110 | sa->ids = NULL; 111 | } 112 | //if (sa->seqstart != NULL) { 113 | // free(sa->seqstart); 114 | // sa->seqstart = NULL; 115 | // } 116 | // Finally, free the suffixArray structure itself 117 | free(sa); 118 | sa = NULL; 119 | } 120 | 121 | void BwtFmiDB::init() { 122 | if (!mOptions->transSearch.tfmi.empty()) { 123 | if (mOptions->verbose) { 124 | 
std::string msg = "Reading protein (trans search) BWT FMI index from file " + mOptions->transSearch.tfmi; 125 | mOptions->longlog ? loginfolong(msg) : loginfo(msg); 126 | } 127 | 128 | FILE * tfile = fopen(mOptions->transSearch.tfmi.c_str(), "r"); 129 | tbwt = readIndexes(tfile); 130 | Transsearch = true; 131 | fclose(tfile); 132 | tfmi = tbwt->f; 133 | //if (mOptions->verbose) { 134 | std::stringstream msgs; 135 | msgs << "Protein (trans search) BWT of length " << tbwt->len << " has been read with " << tbwt->nseq << " sequences, alphabet = " << tbwt->alphabet; 136 | mOptions->longlog ? loginfolong(msgs.str()) : loginfo(msgs.str()); 137 | //} 138 | 139 | tdb_length = (double) (tbwt->len - tbwt->nseq); 140 | if (mOptions->verbose) { 141 | std::string msg = "Protein (trans search) double length is " + to_string(tdb_length); 142 | mOptions->longlog ? loginfolong(msg) : loginfo(msg); 143 | } 144 | 145 | tastruct = alloc_AlphabetStruct(tbwt->alphabet, 0, 0); 146 | 147 | //need to be conformed. 148 | if (mOptions->transSearch.SEG) { 149 | tblast_seg_params = SegParametersNewAa(); //need to be conformed; 150 | tblast_seg_params->overlaps = TRUE; 151 | } 152 | } 153 | 154 | if (mOptions->verbose) { 155 | mOptions->longlog ? 
loginfolong("finish BwtFmiDB initiation") : loginfo("finish BwtFmiDB initiation"); 156 | } 157 | } 158 | 159 | -------------------------------------------------------------------------------- /src/bwtfmiDB.h: -------------------------------------------------------------------------------- 1 | #ifndef BWTFMIDB_H 2 | #define BWTFMIDB_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "util.h" 11 | #include "options.h" 12 | 13 | #include "include/ncbi-blast+/algo/blast/core/blast_seg.h" 14 | #include "include/ncbi-blast+/algo/blast/core/blast_filter.h" 15 | #include "include/ncbi-blast+/algo/blast/core/blast_encoding.h" 16 | 17 | extern "C" { 18 | #include "bwt/fmi.h" 19 | #include "bwt/bwt.h" 20 | #include "bwt/sequence.h" 21 | } 22 | using namespace std; 23 | 24 | class BwtFmiDB { 25 | public: 26 | BwtFmiDB(Options * & opt); 27 | ~BwtFmiDB(); 28 | void free_BWT(); 29 | void free_FMI(FMI*& fmi); 30 | void free_suffixArray(suffixArray*& sa); 31 | //for trans search 32 | BWT * tbwt; 33 | FMI * tfmi; 34 | AlphabetStruct * tastruct; 35 | SegParameters * tblast_seg_params; 36 | double tdb_length; 37 | bool Transsearch; 38 | 39 | private: 40 | void init(); 41 | 42 | private: 43 | Options * mOptions; 44 | }; 45 | 46 | #endif /* BWTFMIDB_H */ 47 | 48 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H 2 | #define COMMON_H 3 | 4 | #include 5 | 6 | #define SEQ2FUNR_VER "2.0.6" 7 | 8 | #define _DEBUG false 9 | 10 | typedef long int64; 11 | typedef unsigned long uint64; 12 | 13 | typedef int int32; 14 | typedef unsigned int uint32; 15 | 16 | typedef short int16; 17 | typedef unsigned short uint16; 18 | 19 | typedef char int8; 20 | typedef unsigned char uint8; 21 | 22 | const char ATCG_BASES[] = {'A', 'T', 'C', 'G'}; 23 | 24 | #pragma pack(2) 25 | 26 | 27 | #pragma pack() 28 | 29 | // the limit of 
the queue to store the packs 30 | // error may happen if it generates more packs than this number 31 | static const int PACK_NUM_LIMIT = 10000000; 32 | 33 | // how many reads one pack has 34 | static const int PACK_SIZE = 1000;//1000000 35 | 36 | // if one pack is produced, but not consumed, it will be kept in the memory 37 | // this number limit the number of in memory packs 38 | // if the number of in memory packs is full, the producer thread should sleep 39 | static const int PACK_IN_MEM_LIMIT = 80;//500000 40 | 41 | // if read number is more than this, warn it 42 | static const int WARN_STANDALONE_READ_LIMIT = 500000;//5000000 43 | 44 | // different filtering results, bigger number means worse 45 | // if r1 and r2 are both failed, then the bigger one of the two results will be recorded 46 | // we reserve some gaps for future types to be added 47 | static const int PASS_FILTER = 0; 48 | static const int FAIL_POLY_X = 4; 49 | static const int FAIL_OVERLAP = 8; 50 | static const int FAIL_N_BASE = 12; 51 | static const int FAIL_LENGTH = 16; 52 | static const int FAIL_TOO_LONG = 17; 53 | static const int FAIL_QUALITY = 20; 54 | static const int FAIL_COMPLEXITY = 24; 55 | 56 | // how many types in total we support 57 | static const int FILTER_RESULT_TYPES = 32; 58 | 59 | const static char* FAILED_TYPES[FILTER_RESULT_TYPES] = { 60 | "passed", "", "", "", 61 | "failed_polyx_filter", "", "", "", 62 | "failed_bad_overlap", "", "", "", 63 | "failed_too_many_n_bases", "", "", "", 64 | "failed_too_short", "failed_too_long", "", "", 65 | "failed_quality_filter", "", "", "", 66 | "failed_low_complexity", "", "", "", 67 | "", "", "", "" 68 | }; 69 | 70 | #endif /* COMMON_H */ 71 | -------------------------------------------------------------------------------- /src/duplicate.cpp: -------------------------------------------------------------------------------- 1 | #include "duplicate.h" 2 | #include "overlapanalysis.h" 3 | #include 4 | #include 5 | 6 | 
Duplicate::Duplicate(Options* & opt) { 7 | mOptions = opt; 8 | mKeyLenInBase = mOptions->duplicate.keylen; 9 | mKeyLenInBit = 1<<(2*mKeyLenInBase); 10 | mDups = new uint64[mKeyLenInBit]; 11 | memset(mDups, 0, sizeof(uint64)*mKeyLenInBit); 12 | mCounts = new uint16[mKeyLenInBit]; 13 | memset(mCounts, 0, sizeof(uint16)*mKeyLenInBit); 14 | mGC = new uint8[mKeyLenInBit]; 15 | memset(mGC, 0, sizeof(uint8)*mKeyLenInBit); 16 | } 17 | 18 | Duplicate::~Duplicate(){ 19 | delete[] mDups; 20 | delete[] mCounts; 21 | } 22 | 23 | uint64 Duplicate::seq2int(const char* data, int start, int keylen, bool& valid) { 24 | uint64 ret = 0; 25 | for(int i=0; i kmer32) { 59 | mDups[key] = kmer32; 60 | mCounts[key] = 1; 61 | mGC[key] = gc; 62 | } 63 | } 64 | } 65 | 66 | void Duplicate::statRead(Read* r) { 67 | if(r->length() < 32) 68 | return; 69 | 70 | int start1 = 0; 71 | int start2 = max(0, r->length() - 32 - 5); 72 | 73 | const char* data = r->mSeq.mStr.c_str(); 74 | bool valid = true; 75 | 76 | uint64 ret = seq2int(data, start1, mKeyLenInBase, valid); 77 | uint32 key = (uint32)ret; 78 | if(!valid) 79 | return; 80 | 81 | uint64 kmer32 = seq2int(data, start2, 32, valid); 82 | if(!valid) 83 | return; 84 | 85 | int gc = 0; 86 | 87 | // not calculated 88 | if(mCounts[key] == 0) { 89 | for(int i=0; ilength(); i++) { 90 | if(data[i] == 'C' || data[i] == 'T') 91 | gc++; 92 | } 93 | } 94 | 95 | gc = round(255.0 * (double) gc / (double) r->length()); 96 | 97 | addRecord(key, kmer32, (uint8)gc); 98 | } 99 | 100 | void Duplicate::statPair(Read* r1, Read* r2) { 101 | if(r1->length() < 32 || r2->length() < 32) 102 | return; 103 | 104 | const char* data1 = r1->mSeq.mStr.c_str(); 105 | const char* data2 = r2->mSeq.mStr.c_str(); 106 | bool valid = true; 107 | 108 | uint64 ret = seq2int(data1, 0, mKeyLenInBase, valid); 109 | uint32 key = (uint32)ret; 110 | if(!valid) 111 | return; 112 | 113 | uint64 kmer32 = seq2int(data2, 0, 32, valid); 114 | if(!valid) 115 | return; 116 | 117 | int gc = 0; 118 | 119 | 
// not calculated 120 | if(mCounts[key] == 0) { 121 | for(int i=0; ilength(); i++) { 122 | if(data1[i] == 'G' || data1[i] == 'C') 123 | gc++; 124 | } 125 | for(int i=0; ilength(); i++) { 126 | if(data2[i] == 'G' || data2[i] == 'C') 127 | gc++; 128 | } 129 | } 130 | 131 | gc = round(255.0 * (double) gc / (double)( r1->length() + r2->length())); 132 | 133 | addRecord(key, kmer32, gc); 134 | } 135 | 136 | double Duplicate::statAll(int* hist, double* meanGC, int histSize) { 137 | long totalNum = 0; 138 | long dupNum = 0; 139 | int* gcStatNum = new int[histSize]; 140 | memset(gcStatNum, 0, sizeof(int)*histSize); 141 | for(int key=0; key 0) { 146 | totalNum += count; 147 | dupNum += count - 1; 148 | 149 | if(count >= histSize){ 150 | hist[histSize-1]++; 151 | meanGC[histSize-1] += gc; 152 | gcStatNum[histSize-1]++; 153 | } 154 | else{ 155 | hist[count]++; 156 | meanGC[count] += gc; 157 | gcStatNum[count]++; 158 | } 159 | } 160 | } 161 | 162 | for(int i=0; i 0) { 164 | meanGC[i] = meanGC[i] / 255.0 / gcStatNum[i]; 165 | } 166 | } 167 | 168 | delete[] gcStatNum; 169 | 170 | if(totalNum == 0) 171 | return 0.0; 172 | else 173 | return (double)dupNum / (double)totalNum; 174 | } -------------------------------------------------------------------------------- /src/duplicate.h: -------------------------------------------------------------------------------- 1 | #ifndef DUPLICATE_H 2 | #define DUPLICATE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "read.h" 8 | #include "options.h" 9 | #include "common.h" 10 | 11 | using namespace std; 12 | 13 | class Duplicate{ 14 | public: 15 | Duplicate(Options* & opt); 16 | ~Duplicate(); 17 | 18 | void statRead(Read* r1); 19 | void statPair(Read* r1, Read* r2); 20 | uint64 seq2int(const char* data, int start, int keylen, bool& valid); 21 | void addRecord(uint32 key, uint64 kmer32, uint8 gc); 22 | 23 | // make histogram and get duplication rate 24 | double statAll(int* hist, double* meanGC, int histSize); 25 | 26 | private: 27 | 
Options* mOptions; 28 | int mKeyLenInBase; 29 | int mKeyLenInBit; 30 | uint64* mDups; 31 | uint16* mCounts; 32 | uint8* mGC; 33 | 34 | }; 35 | 36 | #endif -------------------------------------------------------------------------------- /src/evaluator.h: -------------------------------------------------------------------------------- 1 | #ifndef EVALUATOR_H 2 | #define EVALUATOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "options.h" 8 | #include "util.h" 9 | #include "read.h" 10 | 11 | using namespace std; 12 | 13 | class Evaluator{ 14 | public: 15 | Evaluator(Options* & opt); 16 | ~Evaluator(); 17 | // evaluate how many reads are stored in the input file 18 | void evaluateReadNum(long& readNum); 19 | string evalAdapterAndReadNumDepreciated(long& readNum); 20 | string evalAdapterAndReadNum(long& readNum, bool isR2); 21 | bool isTwoColorSystem(); 22 | void evaluateSeqLen(); 23 | void evaluateOverRepSeqs(); 24 | void computeOverRepSeq(string filename, map& hotseqs, int seqLen); 25 | int computeSeqLen(string filename); 26 | 27 | static bool test(); 28 | static string matchKnownAdapter(string seq); 29 | private: 30 | Options* mOptions; 31 | string int2seq(unsigned int val, int seqlen); 32 | int seq2int(string& seq, int pos, int seqlen, int lastVal = -1); 33 | string getAdapterWithSeed(int seed, Read** loadedReads, long records, int keylen); 34 | }; 35 | 36 | 37 | #endif -------------------------------------------------------------------------------- /src/fastareader.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "fastareader.h" 3 | #include "util.h" 4 | #include 5 | 6 | FastaReader::FastaReader(string faFile, bool forceUpperCase) 7 | { 8 | // Set locale and disable stdio synchronization to improve iostream performance 9 | // http://www.drdobbs.com/the-standard-librarian-iostreams-and-std/184401305 10 | // http://stackoverflow.com/questions/5166263/how-to-get-iostream-to-perform-better 11 | 
setlocale(LC_ALL,"C"); 12 | ios_base::sync_with_stdio(false); 13 | 14 | mFastaFile = faFile; 15 | mForceUpperCase = forceUpperCase; 16 | if (is_directory(mFastaFile)) { 17 | string error_msg = "There is a problem with the provided fasta file: \'"; 18 | error_msg.append(mFastaFile); 19 | error_msg.append("\' is a directory NOT a file...\n"); 20 | throw invalid_argument(error_msg); 21 | } 22 | mFastaFileStream.open( mFastaFile.c_str(),ios::in); 23 | // verify that the file can be read 24 | if (!mFastaFileStream.is_open()) { 25 | string msg = "There is a problem with the provided fasta file: could NOT read "; 26 | msg.append(mFastaFile.c_str()); 27 | msg.append("...\n"); 28 | throw invalid_argument(msg); 29 | } 30 | 31 | char c; 32 | // seek to first contig 33 | while (mFastaFileStream.get(c) && c != '>') { 34 | if (mFastaFileStream.eof()) { 35 | break; 36 | } 37 | } 38 | } 39 | 40 | FastaReader::~FastaReader() 41 | { 42 | if (mFastaFileStream.is_open()) { 43 | mFastaFileStream.close(); 44 | } 45 | } 46 | 47 | void FastaReader::readNext() 48 | { 49 | mCurrentID = ""; 50 | mCurrentDescription = ""; 51 | mCurrentSequence = ""; 52 | bool foundHeader = false; 53 | 54 | char c; 55 | stringstream ssSeq; 56 | stringstream ssHeader; 57 | while(true){ 58 | mFastaFileStream.get(c); 59 | if(c == '>' || mFastaFileStream.eof()) 60 | break; 61 | else { 62 | if (foundHeader){ 63 | if(mForceUpperCase && c>='a' && c<='z') { 64 | c -= ('a' - 'A'); 65 | } 66 | ssSeq << c; 67 | } 68 | else 69 | ssHeader << c; 70 | } 71 | 72 | string line = ""; 73 | getline(mFastaFileStream,line,'\n'); 74 | 75 | 76 | if(foundHeader == false) { 77 | ssHeader << line; 78 | foundHeader = true; 79 | } 80 | else { 81 | str_keep_valid_sequence(line, mForceUpperCase); 82 | ssSeq << line; 83 | } 84 | } 85 | mCurrentSequence = ssSeq.str(); 86 | string header = ssHeader.str(); 87 | 88 | mCurrentID = header; 89 | } 90 | 91 | bool FastaReader::hasNext() { 92 | return !mFastaFileStream.eof(); 93 | } 94 | 95 | void 
FastaReader::readAll() { 96 | while(!mFastaFileStream.eof()){ 97 | readNext(); 98 | mAllContigs[mCurrentID] = mCurrentSequence; 99 | } 100 | } 101 | 102 | bool FastaReader::test(){ 103 | FastaReader reader("testdata/tinyref.fa"); 104 | reader.readAll(); 105 | 106 | string contig1 = "GATCACAGGTCTATCACCCTATTAATTGGTATTTTCGTCTGGGGGGTGTGGAGCCGGAGCACCCTATGTCGCAGT"; 107 | string contig2 = "GTCTGCACAGCCGCTTTCCACACAGAACCCCCCCCTCCCCCCGCTTCTGGCAAACCCCAAAAACAAAGAACCCTA"; 108 | 109 | if(reader.mAllContigs.count("contig1") == 0 || reader.mAllContigs.count("contig2") == 0 ) 110 | return false; 111 | 112 | if(reader.mAllContigs["contig1"] != contig1 || reader.mAllContigs["contig2"] != contig2 ) 113 | return false; 114 | 115 | return true; 116 | 117 | } 118 | 119 | 120 | 121 | -------------------------------------------------------------------------------- /src/fastareader.h: -------------------------------------------------------------------------------- 1 | #ifndef FASTA_READER_H 2 | #define FASTA_READER_H 3 | 4 | // includes 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | using namespace std; 16 | 17 | class FastaReader 18 | { 19 | public: 20 | FastaReader(string fastaFile, bool forceUpperCase = true); 21 | ~FastaReader(); 22 | bool hasNext(); 23 | void readNext(); 24 | void readAll(); 25 | 26 | inline string currentID() 27 | { 28 | return mCurrentID; 29 | } 30 | 31 | inline string currentDescription() 32 | { 33 | return mCurrentDescription; 34 | } 35 | 36 | inline string currentSequence() 37 | { 38 | return mCurrentSequence; 39 | } 40 | 41 | inline map& contigs() { 42 | return mAllContigs; 43 | } 44 | 45 | static bool test(); 46 | 47 | 48 | public: 49 | string mCurrentSequence; 50 | string mCurrentID ; 51 | string mCurrentDescription; 52 | map mAllContigs; 53 | 54 | private: 55 | bool readLine(); 56 | bool endOfLine(char c); 57 | void setFastaSequenceIdDescription(); 58 | 59 | private: 60 | string 
mFastaFile; 61 | ifstream mFastaFileStream; 62 | bool mForceUpperCase; 63 | }; 64 | 65 | 66 | #endif 67 | 68 | -------------------------------------------------------------------------------- /src/fastqreader.h: -------------------------------------------------------------------------------- 1 | #ifndef FASTQ_READER_H 2 | #define FASTQ_READER_H 3 | 4 | #include 5 | #include 6 | #include "read.h" 7 | #ifdef DYNAMIC_ZLIB 8 | #include 9 | #else 10 | #include "zlib/zlib.h" 11 | #endif 12 | #include "common.h" 13 | #include 14 | #include 15 | 16 | class FastqReader{ 17 | public: 18 | FastqReader(string filename, bool hasQuality = true, bool phred64=false, size_t fastqBufferSize=1<<20); 19 | ~FastqReader(); 20 | bool isZipped(); 21 | 22 | void getBytes(size_t& bytesRead, size_t& bytesTotal); 23 | 24 | //this function is not thread-safe 25 | //do not call read() of a same FastqReader object from different threads concurrently 26 | Read* read(); 27 | bool eof(); 28 | bool hasNoLineBreakAtEnd(); 29 | 30 | public: 31 | static bool isZipFastq(string filename); 32 | static bool isFastq(string filename); 33 | static bool test(); 34 | 35 | private: 36 | void init(); 37 | void close(); 38 | string getLine(); 39 | void clearLineBreaks(char* line); 40 | void readToBuf(); 41 | 42 | private: 43 | string mFilename; 44 | gzFile mZipFile; 45 | FILE* mFile; 46 | bool mZipped; 47 | bool mHasQuality; 48 | bool mPhred64; 49 | char* mBuf; 50 | int mBufDataLen; 51 | int mBufUsedLen; 52 | bool mStdinMode; 53 | bool mHasNoLineBreakAtEnd; 54 | size_t mFastqBufSize; 55 | }; 56 | 57 | class FastqReaderPair{ 58 | public: 59 | FastqReaderPair(FastqReader* left, FastqReader* right); 60 | FastqReaderPair(string leftName, string rightName, bool hasQuality = true, bool phred64 = false, bool interleaved = false, size_t fastqBufferSize = 1<<20); 61 | ~FastqReaderPair(); 62 | ReadPair* read(); 63 | public: 64 | FastqReader* mLeft; 65 | FastqReader* mRight; 66 | bool mInterleaved; 67 | }; 68 | 69 | #endif 
70 | -------------------------------------------------------------------------------- /src/filter.h: -------------------------------------------------------------------------------- 1 | #ifndef FILTER_H 2 | #define FILTER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "options.h" 9 | #include "read.h" 10 | 11 | using namespace std; 12 | 13 | class Filter{ 14 | public: 15 | Filter(Options* & opt); 16 | ~Filter(); 17 | int passFilter(Read* r); 18 | bool passLowComplexityFilter(Read* r); 19 | Read* trimAndCut(Read* r, int front, int tail, int& frontTrimmed); 20 | bool filterByIndex(Read* r); 21 | bool filterByIndex(Read* r1, Read* r2); 22 | static bool test(); 23 | 24 | private: 25 | bool match(vector& list, string target, int threshold); 26 | 27 | private: 28 | Options* mOptions; 29 | }; 30 | 31 | 32 | #endif -------------------------------------------------------------------------------- /src/filterresult.h: -------------------------------------------------------------------------------- 1 | #ifndef FILTER_RESULT_H 2 | #define FILTER_RESULT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "common.h" 10 | #include "options.h" 11 | #include 12 | #include 13 | 14 | struct classcomp { 15 | bool operator() (const string& lhs, const string& rhs) const { 16 | if (lhs.length() < rhs.length()) 17 | return true; 18 | else if(lhs.length() == rhs.length()) { 19 | return lhs < rhs; 20 | } else 21 | return false; 22 | } 23 | }; 24 | 25 | using namespace std; 26 | 27 | class FilterResult{ 28 | public: 29 | FilterResult(Options* & opt, bool paired = false); 30 | ~FilterResult(); 31 | inline long* getFilterReadStats() {return mFilterReadStats;} 32 | void addFilterResult(int result, int readNum=1); 33 | static FilterResult* merge(vector& list); 34 | void print(); 35 | // for single end 36 | void addAdapterTrimmed(string adapter, bool isR2 = false, bool incTrimmedCounter = true); 37 | // for paired end 38 | void 
addAdapterTrimmed(string adapter1, string adapter2); 39 | void addPolyXTrimmed(int base, int length); 40 | long getTotalPolyXTrimmedReads(); 41 | long getTotalPolyXTrimmedBases(); 42 | // a part of JSON report 43 | void reportJson(ofstream& ofs, string padding); 44 | // a part of JSON report for adapters 45 | void reportAdapterJson(ofstream& ofs, string padding); 46 | // a part of JSON report for polyX trim 47 | void reportPolyXTrimJson(ofstream& ofs, string padding); 48 | // a part of HTML report 49 | void reportHtml(ofstream& ofs, long totalReads, long totalBases); 50 | // a part of HTML report for adapters 51 | void reportAdapterHtml(ofstream& ofs, long totalBases); 52 | void outputAdaptersJson(ofstream& ofs, map& adapterCounts); 53 | void outputAdaptersHtml(ofstream& ofs, map& adapterCounts, long totalBases); 54 | // deal with base correction results 55 | long* getCorrectionMatrix() {return mCorrectionMatrix;} 56 | long getTotalCorrectedBases(); 57 | void addCorrection(char from, char to); 58 | long getCorrectionNum(char from, char to); 59 | void incCorrectedReads(int count); 60 | void addMergedPairs(int pairs); 61 | 62 | 63 | public: 64 | Options* mOptions; 65 | bool mPaired; 66 | long mCorrectedReads; 67 | long mMergedPairs; 68 | private: 69 | long mFilterReadStats[FILTER_RESULT_TYPES]; 70 | long mTrimmedAdapterRead; 71 | long mTrimmedAdapterBases; 72 | long mTrimmedPolyXReads[4] = {0}; 73 | long mTrimmedPolyXBases[4] = {0}; 74 | map mAdapter1; 75 | map mAdapter2; 76 | long* mCorrectionMatrix; 77 | }; 78 | 79 | #endif -------------------------------------------------------------------------------- /src/fragment.cpp: -------------------------------------------------------------------------------- 1 | #include "fragment.h" 2 | 3 | Fragment::Fragment(const std::string & s) { 4 | seq = s; 5 | } 6 | 7 | Fragment::Fragment(const std::string & s, bool b) { 8 | seq = s; 9 | SEGchecked = true; 10 | } 11 | 12 | Fragment:: Fragment(const std::string & s, unsigned int n, 
unsigned int p, int d){ 13 | seq = s; 14 | num_mm = n; 15 | diff = d; 16 | pos_lastmm = p; 17 | } 18 | 19 | Fragment::Fragment(const std::string & s, unsigned int n, unsigned int p, int d, IndexType arg_si0, IndexType arg_si1, int len){ 20 | seq = s; 21 | num_mm = n; 22 | diff = d; 23 | pos_lastmm = p; 24 | si0 = arg_si0; 25 | si1 = arg_si1; 26 | matchlen = len; 27 | SEGchecked = true; 28 | } // fragments with substitutions have been checked before 29 | 30 | Fragment::Fragment(const std::string & s, unsigned int n, unsigned int p, int d, SI * si){ 31 | seq = s; 32 | num_mm = n; 33 | diff = d; 34 | pos_lastmm = p; 35 | si0 = si->start; 36 | si1 = si->start + (IndexType) si->len; 37 | matchlen = si->ql; 38 | } 39 | 40 | Fragment::Fragment(const std::string & s, unsigned int n, unsigned int p, SI * si){ 41 | seq = s; 42 | num_mm = n; 43 | pos_lastmm = p; 44 | si0 = si->start; 45 | si1 = si->start + (IndexType) si->len; 46 | matchlen = si->ql; 47 | } 48 | 49 | Fragment::Fragment(const std::string & s, unsigned int n, unsigned int p) { 50 | seq = s; 51 | num_mm = n; 52 | pos_lastmm = p; 53 | } -------------------------------------------------------------------------------- /src/fragment.h: -------------------------------------------------------------------------------- 1 | #ifndef FRAGMENT_H 2 | #define FRAGMENT_H 3 | 4 | #include 5 | 6 | extern "C" { 7 | #include "bwt/bwt.h" 8 | } 9 | 10 | using namespace std; 11 | 12 | class Fragment { 13 | public: 14 | std::string seq; 15 | unsigned int num_mm = 0; 16 | int diff = 0; 17 | unsigned int pos_lastmm = 0; 18 | IndexType si0, si1, arg_si0, arg_si1; 19 | int matchlen; 20 | bool SEGchecked = false; 21 | 22 | Fragment(const std::string & s); 23 | 24 | Fragment(const std::string & s, bool b); 25 | 26 | Fragment(const std::string & s, unsigned int n, unsigned int p, int d); 27 | 28 | Fragment(const std::string & s, unsigned int n, unsigned int p, int d, IndexType arg_si0, IndexType arg_si1, int len); 29 | 30 | Fragment(const 
std::string & s, unsigned int n, unsigned int p, int d, SI * si); 31 | 32 | Fragment(const std::string & s, unsigned int n, unsigned int p, SI * si); 33 | 34 | Fragment(const std::string & s, unsigned int n, unsigned int p); 35 | }; 36 | 37 | #endif /* FRAGMENT_H */ 38 | 39 | -------------------------------------------------------------------------------- /src/htmlreporter.h: -------------------------------------------------------------------------------- 1 | #ifndef HTML_REPORTER_H 2 | #define HTML_REPORTER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "options.h" 14 | #include "stats.h" 15 | #include "filterresult.h" 16 | #include "common.h" 17 | #include "util.h" 18 | 19 | using namespace std; 20 | 21 | class HtmlReporter{ 22 | public: 23 | HtmlReporter(Options* & opt); 24 | ~HtmlReporter(); 25 | void setDupHist(int* dupHist, double* dupMeanGC, double dupRate); 26 | void setInsertHist(atomic_long* insertHist, int insertSizePeak); 27 | void report(FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2 = NULL, Stats* postStats2 = NULL); 28 | static void outputRow(ofstream& ofs, string key, long value); 29 | static void outputRow(ofstream& ofs, string key, string value); 30 | static void outputLongRow(ofstream& ofs, string key, string value); 31 | static string formatNumber(long number); 32 | static string getPercents(long numerator, long denominator); 33 | private: 34 | const string getCurrentSystemTime(); 35 | void printHeader(ofstream& ofs); 36 | void printCSS(ofstream& ofs); 37 | void printJS(ofstream& ofs); 38 | void printFooter(ofstream& ofs); 39 | void reportDuplication(ofstream& ofs); 40 | void reportInsertSize(ofstream& ofs, int isizeLimit); 41 | void printSummary(ofstream& ofs, FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2, Stats* postStats2); 42 | 43 | void printAnnotationResults(ofstream & ofs); 44 | void 
reportRarefaction(ofstream& ofs); 45 | void reportKOBarPlot(ofstream& ofs); 46 | void reportRarefactionId(ofstream& ofs); 47 | void reportBarPlotId(ofstream& ofs); 48 | void reportPathway(ofstream& ofs); 49 | void reportSpecies(ofstream& ofs); 50 | 51 | private: 52 | Options* mOptions; 53 | int* mDupHist; 54 | double* mDupMeanGC; 55 | double mDupRate; 56 | atomic_long* mInsertHist; 57 | int mInsertSizePeak; 58 | }; 59 | 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /src/htmlreporterall.h: -------------------------------------------------------------------------------- 1 | #ifndef HTMLREPORTERALL_H 2 | #define HTMLREPORTERALL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "util.h" 16 | #include "common.h" 17 | #include "options.h" 18 | 19 | 20 | using namespace std; 21 | 22 | class HtmlReporterAll { 23 | public: 24 | HtmlReporterAll(Options* & opt); 25 | ~HtmlReporterAll(); 26 | void report(); 27 | static void outputRow(ofstream& ofs, string key, long value); 28 | static void outputRow(ofstream& ofs, string key, string value); 29 | static void outputRow(ofstream& ofs, std::vector & samplesVec); 30 | static void outputSummaryTable(ofstream& ofs, std::vector & samplesVec); 31 | 32 | private: 33 | const string getCurrentSystemTime(); 34 | void printHeader(ofstream& ofs); 35 | void printCSS(ofstream& ofs); 36 | void printJS(ofstream& ofs); 37 | void printFooter(ofstream& ofs); 38 | 39 | void printAnnotationResults(ofstream & ofs); 40 | void reportRarefactionKO(ofstream& ofs); 41 | void reportRarefactionKO3D(ofstream& ofs); 42 | void reportKOBarPlot(ofstream& ofs); 43 | void reportRarefactionS2f(ofstream& ofs); 44 | void reportRarefactionS2f3D(ofstream& ofs); 45 | void reportS2fBarPlot(ofstream& ofs); 46 | void reportPathwayBarPlot(ofstream& ofs); 47 | void reportOrgBarPlot(ofstream& ofs); 48 | void 
reportReadsQualityPlot3D(ofstream& ofs); 49 | void reportAllTables(); 50 | static string list2string(std::vector & x_vec, int top); 51 | static string list2string(std::vector & x_vec, int top); 52 | static string list2string(std::vector & x_vec, int top); 53 | static string list2string(std::vector & x_vec, int top); 54 | static string list2string2(std::vector & x_vec, int top); 55 | std::vector smNmVec; 56 | std::vector > koFreqVec; 57 | std::vector > > idFreqVec; 58 | std::vector > pathwayFreqVec; 59 | std::vector > orgFreqVec; 60 | 61 | private: 62 | Options * mOptions; 63 | 64 | }; 65 | 66 | #endif /* HTMLREPORTERALL_H */ 67 | 68 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/algo/blast/composition_adjustment/composition_constants.h: -------------------------------------------------------------------------------- 1 | /* $Id: composition_constants.h 187049 2010-03-26 14:52:29Z satskyse $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the 8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 12 | * Government have not placed any restriction on its use or reproduction. 13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S. 
18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 23 | * 24 | * ===========================================================================*/ 25 | /** 26 | * @file composition_constants.h 27 | * Constants used in compositional score matrix adjustment 28 | * 29 | * @author E. Michael Gertz, Alejandro Schaffer, Yi-Kuo Yu 30 | */ 31 | 32 | 33 | #ifndef __COMPOSITION_CONSTANTS__ 34 | #define __COMPOSITION_CONSTANTS__ 35 | 36 | #include 37 | 38 | #ifdef __cplusplus 39 | extern "C" { 40 | #endif 41 | 42 | /** Minimum score in a matrix */ 43 | #define COMPO_SCORE_MIN INT2_MIN 44 | 45 | /** Number of standard amino acids */ 46 | #define COMPO_NUM_TRUE_AA 20 47 | 48 | /** The largest alphabet supported by this code (the code supports 26 49 | or 28 character amino acid alphabets). Used to specify the size of 50 | structures that are statically allocated. */ 51 | #define COMPO_LARGEST_ALPHABET 28 52 | 53 | /* NOTE: Please keep these comments in sync with argument descriptions in 54 | * CCompositionBasedStatsArgs::SetArgumentDescriptions() 55 | */ 56 | 57 | /** An collection of constants that specify all permissible 58 | * modes of composition adjustment */ 59 | typedef enum ECompoAdjustModes { 60 | /** Don't use composition based statistics */ 61 | eNoCompositionBasedStats = 0, 62 | /** Composition-based statistics as in NAR 29:2994-3005, 2001 */ 63 | eCompositionBasedStats = 1, 64 | /** Composition-based score adjustment as in Bioinformatics 21:902-911, 65 | * 2005, conditioned on sequence properties. Cannot be applied to PSSMs. */ 66 | eCompositionMatrixAdjust = 2, 67 | /** Composition-based score adjustment as in Bioinformatics 21:902-911, 68 | * 2005, unconditionally. Cannot be applied to PSSMs. 
*/ 69 | eCompoForceFullMatrixAdjust = 3, 70 | eNumCompoAdjustModes 71 | } ECompoAdjustModes; 72 | 73 | 74 | /** A collection of constants that specify all rules that may 75 | * be used to generate a compositionally adjusted matrix. */ 76 | typedef enum EMatrixAdjustRule { 77 | eDontAdjustMatrix = (-1), 78 | eCompoScaleOldMatrix = 0, 79 | eUnconstrainedRelEntropy = 1, 80 | eRelEntropyOldMatrixNewContext = 2, 81 | eRelEntropyOldMatrixOldContext = 3, 82 | eUserSpecifiedRelEntropy = 4 83 | } EMatrixAdjustRule; 84 | 85 | 86 | #ifdef __cplusplus 87 | } 88 | #endif 89 | 90 | #endif 91 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/algo/blast/composition_adjustment/matrix_frequency_data.h: -------------------------------------------------------------------------------- 1 | /* $Id: matrix_frequency_data.h 103491 2007-05-04 17:18:18Z kazimird $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the 8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 12 | * Government have not placed any restriction on its use or reproduction. 13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S.
18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 23 | * 24 | * ===========================================================================*/ 25 | /** 26 | * @file matrix_frequency_data.h 27 | * Definitions used to get joint probabilities for a scoring matrix 28 | * 29 | * @author Alejandro Schaffer, E. Michael Gertz 30 | */ 31 | #ifndef __MATRIX_FREQUENCY_DATA__ 32 | #define __MATRIX_FREQUENCY_DATA__ 33 | 34 | #include 35 | 36 | #ifdef __cplusplus 37 | extern "C" { 38 | #endif 39 | 40 | /** 41 | * Get joint probabilities for the named matrix. 42 | * 43 | * @param probs the joint probabilities [out] 44 | * @param row_sums sum of the values in each row of probs [out] 45 | * @param col_sums sum of the values in each column of probs [out] 46 | * @param matrixName the name of the matrix sought [in] 47 | * @returns 0 if successful; -1 if the named matrix is not known. 48 | */ 49 | NCBI_XBLAST_EXPORT 50 | int Blast_GetJointProbsForMatrix(double ** probs, double row_sums[], 51 | double col_sums[], const char *matrixName); 52 | 53 | 54 | /** Retrieve the background letter probabilities implicitly used in constructing the score matrix matrix_name. */ 55 | NCBI_XBLAST_EXPORT 56 | const double * Blast_GetMatrixBackgroundFreq(const char *matrix_name); 57 | 58 | 59 | /** Return true if frequency data is available for the given 60 | * matrix name.
*/ 61 | NCBI_XBLAST_EXPORT 62 | int Blast_FrequencyDataIsAvailable(const char *matrix_name); 63 | 64 | #ifdef __cplusplus 65 | } 66 | #endif 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/algo/blast/core/blast_encoding.c: -------------------------------------------------------------------------------- 1 | /* $Id: blast_encoding.c 118195 2008-01-24 21:22:19Z camacho $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the 8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 12 | * Government have not placed any restriction on its use or reproduction. 13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S. 18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 23 | * 24 | * =========================================================================== 25 | * 26 | * Author: Christiam Camacho 27 | * 28 | */ 29 | 30 | /** @file blast_encoding.c 31 | * Definitions of static arrays defined in blast_encoding.h. 
32 | * @sa blast_encoding.h 33 | */ 34 | 35 | #ifndef SKIP_DOXYGEN_PROCESSING 36 | static char const rcsid[] = 37 | "$Id: blast_encoding.c 118195 2008-01-24 21:22:19Z camacho $"; 38 | #endif /* SKIP_DOXYGEN_PROCESSING */ 39 | 40 | #include 41 | 42 | const Uint1 NCBI4NA_TO_BLASTNA[BLASTNA_SIZE] = { 43 | 15, /* Gap, 0 */ 44 | 0, /* A, 1 */ 45 | 1, /* C, 2 */ 46 | 6, /* M, 3 */ 47 | 2, /* G, 4 */ 48 | 4, /* R, 5 */ 49 | 9, /* S, 6 */ 50 | 13, /* V, 7 */ 51 | 3, /* T, 8 */ 52 | 8, /* W, 9 */ 53 | 5, /* Y, 10 */ 54 | 12, /* H, 11 */ 55 | 7, /* K, 12 */ 56 | 11, /* D, 13 */ 57 | 10, /* B, 14 */ 58 | 14 /* N, 15 */ 59 | }; 60 | 61 | const Uint1 BLASTNA_TO_NCBI4NA[BLASTNA_SIZE] = { 62 | 1, /* A, 0 */ 63 | 2, /* C, 1 */ 64 | 4, /* G, 2 */ 65 | 8, /* T, 3 */ 66 | 5, /* R, 4 */ 67 | 10, /* Y, 5 */ 68 | 3, /* M, 6 */ 69 | 12, /* K, 7 */ 70 | 9, /* W, 8 */ 71 | 6, /* S, 9 */ 72 | 14, /* B, 10 */ 73 | 13, /* D, 11 */ 74 | 11, /* H, 12 */ 75 | 7, /* V, 13 */ 76 | 15, /* N, 14 */ 77 | 0 /* Gap, 15 */ 78 | }; 79 | 80 | const char BLASTNA_TO_IUPACNA[BLASTNA_SIZE] = { 81 | 'A', 'C', 'G', 'T', 'R', 'Y', 'M', 'K', 82 | 'W', 'S', 'B', 'D', 'H', 'V', 'N', '-' 83 | }; 84 | 85 | const char NCBI4NA_TO_IUPACNA[BLASTNA_SIZE] = { 86 | '-', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 87 | 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N' 88 | }; 89 | 90 | const Uint1 IUPACNA_TO_BLASTNA[128]={ 91 | 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, 92 | 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, 93 | 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, 94 | 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, 95 | 15, 0,10, 1,11,15,15, 2,12,15,15, 7,15, 6,14,15, 96 | 15,15, 4, 9, 3,15,13, 8,15, 5,15,15,15,15,15,15, 97 | 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, 98 | 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15}; 99 | 100 | const Uint1 IUPACNA_TO_NCBI4NA[128]={ 101 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 102 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 103 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 104 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 105 | 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0, 106 | 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0, 107 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 108 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 109 | 110 | const Uint1 AMINOACID_TO_NCBISTDAA[128] = { 111 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 112 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 113 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,25, 0, 0, 0, 0, 0, 114 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 115 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,27,10,11,12,13,26, 116 | 14,15,16,17,18,24,19,20,21,22,23, 0, 0, 0, 0, 0, 117 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 118 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 119 | 120 | const char NCBISTDAA_TO_AMINOACID[BLASTAA_SIZE] = { 121 | '-','A','B','C','D','E','F','G','H','I','K','L','M', 122 | 'N','P','Q','R','S','T','V','W','X','Y','Z','U','*', 123 | 'O', 'J'}; 124 | 125 | const Uint1 kProtSentinel = NULLB; 126 | const Uint1 kNuclSentinel = 0xF; 127 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/algo/blast/core/blast_encoding.h: -------------------------------------------------------------------------------- 1 | /* $Id: blast_encoding.h 118195 2008-01-24 21:22:19Z camacho $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the 8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 12 | * Government have not placed any restriction on its use or reproduction. 
13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S. 18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 23 | * 24 | * =========================================================================== 25 | * 26 | * Author: Christiam Camacho 27 | * 28 | */ 29 | 30 | /** @file blast_encoding.h 31 | * Declarations of static arrays used to define some NCBI encodings to be used 32 | * in a toolkit independent manner by the BLAST engine. 33 | */ 34 | 35 | #ifndef ALGO_BLAST_CORE___BLAST_ENCODING__H 36 | #define ALGO_BLAST_CORE___BLAST_ENCODING__H 37 | 38 | #include 39 | #include 40 | 41 | /** @addtogroup AlgoBlast 42 | * 43 | * @{ 44 | */ 45 | 46 | #ifdef __cplusplus 47 | extern "C" { 48 | #endif 49 | 50 | /** Different types of sequence encodings for sequence retrieval from the 51 | * BLAST database 52 | */ 53 | typedef enum { 54 | eBlastEncodingProtein = 0, /**< NCBIstdaa */ 55 | eBlastEncodingNucleotide = 1, /**< Special encoding for preliminary 56 | stage of BLAST: permutation of NCBI4na. 57 | A.k.a.: BLASTNA encoding 58 | */ 59 | eBlastEncodingNcbi4na = 2, /**< NCBI4na */ 60 | eBlastEncodingNcbi2na = 3, /**< NCBI2na */ 61 | eBlastEncodingError = 255 /**< Error value for encoding */ 62 | } EBlastEncoding; 63 | 64 | /* Nucleotide encodings */ 65 | 66 | /** Translates between ncbi4na and blastna. The first four elements 67 | * of this array match ncbi2na. */ 68 | NCBI_XBLAST_EXPORT extern const Uint1 NCBI4NA_TO_BLASTNA[]; 69 | 70 | /** Translates between blastna and ncbi4na. 
*/ 71 | NCBI_XBLAST_EXPORT extern const Uint1 BLASTNA_TO_NCBI4NA[]; 72 | 73 | /** Translates between iupacna and blastna. */ 74 | NCBI_XBLAST_EXPORT extern const Uint1 IUPACNA_TO_BLASTNA[]; 75 | 76 | /** Translates between iupacna and ncbi4na. */ 77 | NCBI_XBLAST_EXPORT extern const Uint1 IUPACNA_TO_NCBI4NA[]; 78 | 79 | /** Translates between ncbieaa and ncbistdaa. */ 80 | NCBI_XBLAST_EXPORT extern const Uint1 AMINOACID_TO_NCBISTDAA[]; 81 | 82 | /** Translates between ncbieaa and ncbistdaa. */ 83 | NCBI_XBLAST_EXPORT extern const char NCBISTDAA_TO_AMINOACID[]; 84 | 85 | /** Translates between blastna and iupacna. */ 86 | NCBI_XBLAST_EXPORT extern const char BLASTNA_TO_IUPACNA[]; 87 | 88 | /** Translates between ncbi4na and iupacna. */ 89 | NCBI_XBLAST_EXPORT extern const char NCBI4NA_TO_IUPACNA[]; 90 | 91 | #define BLAST2NA_SIZE 4 /**< Size of compressed nucleic acid alphabet */ 92 | #define BLASTNA_SIZE 16 /**< Size of nucleic acid alphabet */ 93 | #define BLASTAA_SIZE 28 /**< Size of aminoacid alphabet */ 94 | 95 | 96 | #define BLASTNA_SEQ_CODE 99 /**< Identifies the blastna alphabet, for use in 97 | blast only. 
*/ 98 | #define BLASTAA_SEQ_CODE 11 /**< == Seq_code_ncbistdaa */ 99 | #define NCBI4NA_SEQ_CODE 4 /**< == Seq_code_ncbi4na */ 100 | 101 | /** Sentinel byte for protein sequences */ 102 | NCBI_XBLAST_EXPORT extern const Uint1 kProtSentinel; 103 | /** Sentinel nibble for nucleotide sequences */ 104 | NCBI_XBLAST_EXPORT extern const Uint1 kNuclSentinel; 105 | 106 | #ifdef __cplusplus 107 | } 108 | #endif 109 | 110 | /* @} */ 111 | 112 | #endif /* ALGO_BLAST_CORE___BLAST_ENCODING__H */ 113 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/algo/blast/core/blast_export.h: -------------------------------------------------------------------------------- 1 | #ifndef BLAST_EXPORT__H 2 | #define BLAST_EXPORT__H 3 | 4 | /* $Id: blast_export.h 166398 2009-07-22 15:51:55Z ucko $ 5 | * =========================================================================== 6 | * 7 | * PUBLIC DOMAIN NOTICE 8 | * National Center for Biotechnology Information 9 | * 10 | * This software/database is a "United States Government Work" under the 11 | * terms of the United States Copyright Act. It was written as part of 12 | * the author's official duties as a United States Government employee and 13 | * thus cannot be copyrighted. This software/database is freely available 14 | * to the public for use. The National Library of Medicine and the U.S. 15 | * Government have not placed any restriction on its use or reproduction. 16 | * 17 | * Although all reasonable efforts have been taken to ensure the accuracy 18 | * and reliability of the software and data, the NLM and the U.S. 19 | * Government do not and cannot warrant the performance or results that 20 | * may be obtained by using this software or data. The NLM and the U.S. 21 | * Government disclaim all warranties, express or implied, including 22 | * warranties of performance, merchantability or fitness for any particular 23 | * purpose. 
24 | * 25 | * Please cite the author in any work or product based on this material. 26 | * 27 | * =========================================================================== 28 | * 29 | * Author: Viatcheslav Gorelenkov 30 | * 31 | */ 32 | 33 | /** @file blast_export.h 34 | * Defines to provide correct exporting from BLAST DLL in Windows. 35 | * These are necessary to compile DLLs with Visual C++ - exports must be 36 | * explicitly labeled as such. 37 | */ 38 | 39 | 40 | 41 | 42 | #if defined(WIN32) && defined(NCBI_DLL_BUILD) 43 | 44 | #ifndef _MSC_VER 45 | # error "This toolkit is not buildable with a compiler other than MSVC." 46 | #endif 47 | 48 | 49 | #ifdef NCBI_XALGO_EXPORTS 50 | # define NCBI_XBLAST_EXPORT __declspec(dllexport) 51 | #else 52 | # define NCBI_XBLAST_EXPORT __declspec(dllimport) 53 | #endif 54 | 55 | #elif defined(HAVE_ATTRIBUTE_VISIBILITY_DEFAULT) 56 | 57 | # define NCBI_XBLAST_EXPORT __attribute__((visibility("default"))) 58 | 59 | #else 60 | 61 | /** 62 | * NULL operations for other cases 63 | */ 64 | 65 | # define NCBI_XBLAST_EXPORT 66 | 67 | 68 | #endif 69 | 70 | #endif /* BLAST_EXPORT__H */ 71 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/algo/blast/core/blast_hits_priv.h: -------------------------------------------------------------------------------- 1 | /* $Id: blast_hits_priv.h 103491 2007-05-04 17:18:18Z kazimird $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the 8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 
12 | * Government have not placed any restriction on its use or reproduction. 13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S. 18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 23 | * 24 | * =========================================================================== 25 | * 26 | * Author: Christiam Camacho 27 | * 28 | */ 29 | 30 | /** @file blast_hits_priv.h 31 | * Utilities for dealing with BLAST HSPs in the core of BLAST. 32 | */ 33 | 34 | #ifndef ALGO_BLAST_CORE___BLAST_HITS_PRIV__H 35 | #define ALGO_BLAST_CORE___BLAST_HITS_PRIV__H 36 | 37 | #include 38 | #include 39 | 40 | #ifdef __cplusplus 41 | extern "C" { 42 | #endif 43 | 44 | /** Check the gapped alignments for an overlap of two different alignments. 45 | * A sufficient overlap is when two alignments have the same start values 46 | * or have the same final values. 47 | * @param hsp_array Pointer to an array of BlastHSP structures [in] 48 | * @param hsp_count The size of the hsp_array [in] 49 | * @return The number of valid alignments remaining. 50 | */ 51 | Int4 52 | Blast_CheckHSPsForCommonEndpoints(BlastHSP* *hsp_array, Int4 hsp_count); 53 | 54 | /** Comparison callback function for sorting HSPs, first by score in descending 55 | * order, then by location. Among alignments with equal score, an HSP will 56 | * precede any other HSPs that are completely contained within its endpoints.
57 | * 58 | * H2 is contained in H1 if and only if 59 | * H1.query.offset <= H2.query.offset <= H2.query.end <= H1.query.end 60 | * H1.sbjct.offset <= H2.sbjct.offset <= H2.sbjct.end <= H1.sbjct.end 61 | */ 62 | int 63 | ScoreCompareHSPs(const void* h1, const void* h2); 64 | 65 | /** TRUE if c is between a and b; f between d and e. Determines if the 66 | * coordinates are already in an HSP that has been evaluated. 67 | */ 68 | #define CONTAINED_IN_HSP(a,b,c,d,e,f) \ 69 | (((a <= c && b >= c) && (d <= f && e >= f)) ? TRUE : FALSE) 70 | 71 | #ifdef __cplusplus 72 | } 73 | #endif 74 | 75 | #endif /* !ALGO_BLAST_CORE__BLAST_HITS_PRIV__H */ 76 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/algo/blast/core/blast_program.c: -------------------------------------------------------------------------------- 1 | #ifndef SKIP_DOXYGEN_PROCESSING 2 | static char const rcsid[] = 3 | "$Id: blast_program.c 97157 2007-01-19 14:27:24Z madden $"; 4 | #endif /* SKIP_DOXYGEN_PROCESSING */ 5 | /* =========================================================================== 6 | * 7 | * PUBLIC DOMAIN NOTICE 8 | * National Center for Biotechnology Information 9 | * 10 | * This software/database is a "United States Government Work" under the 11 | * terms of the United States Copyright Act. It was written as part of 12 | * the author's official duties as a United States Government employee and 13 | * thus cannot be copyrighted. This software/database is freely available 14 | * to the public for use. The National Library of Medicine and the U.S. 15 | * Government have not placed any restriction on its use or reproduction. 16 | * 17 | * Although all reasonable efforts have been taken to ensure the accuracy 18 | * and reliability of the software and data, the NLM and the U.S. 19 | * Government do not and cannot warrant the performance or results that 20 | * may be obtained by using this software or data. The NLM and the U.S. 
21 | * Government disclaim all warranties, express or implied, including 22 | * warranties of performance, merchantability or fitness for any particular 23 | * purpose. 24 | * 25 | * Please cite the author in any work or product based on this material. 26 | * 27 | * =========================================================================== 28 | * 29 | * Author: Christiam Camacho / Ilya Dondoshansky 30 | * 31 | */ 32 | 33 | /** @file blast_program.c 34 | * Implementation auxiliary functions to determine traits of the various BLAST 35 | * programs supported by core BLAST 36 | */ 37 | 38 | #include 39 | 40 | /** Convert an arbitrary integer to true/false */ 41 | #define SAFE_CAST_INT_TO_BOOLEAN(p) (((p) != 0) ? TRUE : FALSE) 42 | 43 | /* Classify query sequence */ 44 | Boolean Blast_QueryIsProtein(EBlastProgramType p) 45 | { return SAFE_CAST_INT_TO_BOOLEAN(p & PROTEIN_QUERY_MASK); } 46 | 47 | Boolean Blast_QueryIsNucleotide(EBlastProgramType p) 48 | { return SAFE_CAST_INT_TO_BOOLEAN(p & NUCLEOTIDE_QUERY_MASK); } 49 | 50 | Boolean Blast_QueryIsPssm(EBlastProgramType p) 51 | { return SAFE_CAST_INT_TO_BOOLEAN(p & PSSM_QUERY_MASK); } 52 | 53 | /* Classify subject sequence */ 54 | Boolean Blast_SubjectIsProtein(EBlastProgramType p) 55 | { return SAFE_CAST_INT_TO_BOOLEAN(p & PROTEIN_SUBJECT_MASK); } 56 | 57 | Boolean Blast_SubjectIsNucleotide(EBlastProgramType p) 58 | { return SAFE_CAST_INT_TO_BOOLEAN(p & NUCLEOTIDE_SUBJECT_MASK); } 59 | 60 | Boolean Blast_SubjectIsPssm(EBlastProgramType p) 61 | { return SAFE_CAST_INT_TO_BOOLEAN(p & PSSM_SUBJECT_MASK); } 62 | 63 | /* Handle translated searches */ 64 | Boolean Blast_QueryIsTranslated(EBlastProgramType p) 65 | { return SAFE_CAST_INT_TO_BOOLEAN(p & TRANSLATED_QUERY_MASK); } 66 | 67 | Boolean Blast_SubjectIsTranslated(EBlastProgramType p) 68 | { return SAFE_CAST_INT_TO_BOOLEAN(p & TRANSLATED_SUBJECT_MASK); } 69 | 70 | /* Handle special programs */ 71 | Boolean Blast_ProgramIsPsiBlast(EBlastProgramType p) 72 | { return 
SAFE_CAST_INT_TO_BOOLEAN(p & PSSM_QUERY_MASK); } 73 | 74 | Boolean Blast_ProgramIsPhiBlast(EBlastProgramType p) 75 | { return SAFE_CAST_INT_TO_BOOLEAN(p & PATTERN_QUERY_MASK); } 76 | 77 | Boolean Blast_ProgramIsRpsBlast(EBlastProgramType p) 78 | { return SAFE_CAST_INT_TO_BOOLEAN(p & PSSM_SUBJECT_MASK); } 79 | 80 | Boolean Blast_ProgramIsValid(EBlastProgramType p) 81 | { 82 | switch (p) { 83 | case eBlastTypeBlastp: 84 | case eBlastTypeBlastn: 85 | case eBlastTypeBlastx: 86 | case eBlastTypeTblastn: 87 | case eBlastTypeTblastx: 88 | case eBlastTypePsiBlast: 89 | case eBlastTypePsiTblastn: 90 | case eBlastTypeRpsBlast: 91 | case eBlastTypeRpsTblastn: 92 | case eBlastTypePhiBlastp: 93 | case eBlastTypePhiBlastn: 94 | return TRUE; 95 | break; 96 | default: 97 | return FALSE; 98 | break; 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/algo/blast/core/blast_seg.h: -------------------------------------------------------------------------------- 1 | /* $Id: blast_seg.h 114718 2007-11-28 15:52:56Z ivanov $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the 8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 12 | * Government have not placed any restriction on its use or reproduction. 13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 
16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S. 18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 23 | * 24 | * =========================================================================== 25 | * 26 | * Author: Ilya Dondoshansky 27 | * 28 | */ 29 | 30 | /** @file blast_seg.h 31 | * SEG filtering functions. @todo FIXME: should this be combined with 32 | * blast_filter/dust? Needs doxygen documentation and comments 33 | */ 34 | 35 | #ifndef __BLAST_SEG__ 36 | #define __BLAST_SEG__ 37 | 38 | #include 39 | #include 40 | 41 | 42 | #ifdef __cplusplus 43 | extern "C" { 44 | #endif 45 | 46 | /** Structure to hold parameters for seg search. 47 | */ 48 | typedef struct SegParameters 49 | { 50 | Int4 window; /**< initial window size to trigger further work. */ 51 | double locut; 52 | double hicut; 53 | Int4 period; 54 | Int4 hilenmin; 55 | Boolean overlaps; /* merge overlapping pieces if TRUE. */ 56 | Int4 maxtrim; 57 | Int4 maxbogus; 58 | } SegParameters; 59 | 60 | /** Allocates a SegParameters struct for proteins and fills it with default values. 61 | * @return pointer to SegParameters 62 | */ 63 | NCBI_XBLAST_EXPORT 64 | SegParameters* SegParametersNewAa (void); 65 | 66 | /** Free SegParameters structure 67 | * @param sparamsp object to be freed [in] 68 | */ 69 | NCBI_XBLAST_EXPORT 70 | void SegParametersFree(SegParameters* sparamsp); 71 | 72 | /** Runs seg on a protein sequence in ncbistdaa.
73 | * @param sequence the protein residues in ncbistdaa [in] 74 | * @param length number of residues [in] 75 | * @param offset amount to shift over resulting locations 76 | * (if full sequence not passed in) [in] 77 | * @param sparamsp the seg parameters created with SegParametersNewAa [in] 78 | * @param seg_locs resulting locations for filtering [out] 79 | * @return zero on success 80 | */ 81 | NCBI_XBLAST_EXPORT 82 | Int2 SeqBufferSeg (Uint1* sequence, Int4 length, Int4 offset, 83 | SegParameters* sparamsp, BlastSeqLoc** seg_locs); 84 | 85 | #ifdef __cplusplus 86 | } 87 | #endif 88 | #endif /* !__BLAST_SEG__ */ 89 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/algo/blast/core/blast_toolkit.h: -------------------------------------------------------------------------------- 1 | #ifndef _BLAST_TOOLKIT__H 2 | #define _BLAST_TOOLKIT__H 3 | 4 | /** @file blast_toolkit.h 5 | * Choose C++ basic defines 6 | */ 7 | 8 | #include 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/algo/blast/core/hspfilter_collector.h: -------------------------------------------------------------------------------- 1 | /* $Id: hspfilter_collector.h 161402 2009-05-27 17:35:47Z camacho $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the 8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 12 | * Government have not placed any restriction on its use or reproduction.
13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S. 18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 23 | * 24 | * =========================================================================== 25 | * 26 | * Author: Ning Ma 27 | * 28 | */ 29 | 30 | /** @file hspfilter_collector.h 31 | * Implementation of a number of BlastHSPWriters to save hits from 32 | * a BLAST search, and subsequently return them in sorted order. 33 | */ 34 | 35 | #ifndef ALGO_BLAST_CORE__HSPFILTER_COLLECTOR__H 36 | #define ALGO_BLAST_CORE__HSPFILTER_COLLECTOR__H 37 | 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | #ifdef __cplusplus 46 | extern "C" { 47 | #endif 48 | 49 | /** Keeps prelim_hitlist_size and HitSavingOptions together. */ 50 | typedef struct BlastHSPCollectorParams { 51 | EBlastProgramType program;/**< program type */ 52 | Int4 prelim_hitlist_size; /**< number of hits saved during preliminary 53 | part of search. */ 54 | Int4 hsp_num_max; /**< number of HSPs to save per db sequence.*/ 55 | } BlastHSPCollectorParams; 56 | 57 | /** Sets up parameter set for use by collector. 58 | * @param hit_options field hitlist_size and hsp_num_max needed, a pointer to 59 | * this structure will be stored on resulting structure.[in] 60 | * @param compositionBasedStats value of the compositionBasedStats field 61 | * of the extension options. [in] 62 | * @param gapped_calculation value of gapped_calculation from the scoring options.
[in] 63 | * @return the pointer to the allocated parameter 64 | */ 65 | NCBI_XBLAST_EXPORT 66 | BlastHSPCollectorParams* 67 | BlastHSPCollectorParamsNew(const BlastHitSavingOptions* hit_options, 68 | Int4 compositionBasedStats, 69 | Boolean gapped_calculation); 70 | 71 | /** Deallocates the BlastHSPCollectorParams structure passed in 72 | * @param opts structure to deallocate [in] 73 | * @return NULL 74 | */ 75 | NCBI_XBLAST_EXPORT 76 | BlastHSPCollectorParams* 77 | BlastHSPCollectorParamsFree(BlastHSPCollectorParams* opts); 78 | 79 | /** WriterInfo to create a default writer: the collecter 80 | * @param params The collector parameters. 81 | * @return pointer to WriterInfo 82 | */ 83 | NCBI_XBLAST_EXPORT 84 | BlastHSPWriterInfo* 85 | BlastHSPCollectorInfoNew(BlastHSPCollectorParams* params); 86 | 87 | #ifdef __cplusplus 88 | } 89 | #endif 90 | 91 | #endif /* !ALGO_BLAST_CORE__HSPFILTER_COLLECTOR__H */ 92 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/algo/blast/core/lookup_wrap.h: -------------------------------------------------------------------------------- 1 | /* $Id: lookup_wrap.h 369355 2012-07-18 17:07:15Z morgulis $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the 8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 12 | * Government have not placed any restriction on its use or reproduction. 13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 
16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S. 18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 23 | * 24 | * =========================================================================== 25 | * 26 | * Author: Ilya Dondoshansky 27 | * 28 | */ 29 | 30 | /** @file lookup_wrap.h 31 | * Wrapper for all lookup tables used in BLAST 32 | */ 33 | 34 | #ifndef ALGO_BLAST_CORE__LOOKUP_WRAP__H 35 | #define ALGO_BLAST_CORE__LOOKUP_WRAP__H 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | 43 | #ifdef __cplusplus 44 | extern "C" { 45 | #endif 46 | 47 | /** Wrapper structure for different types of BLAST lookup tables */ 48 | typedef struct LookupTableWrap { 49 | ELookupTableType lut_type; /**< What kind of a lookup table it is? */ 50 | void* lut; /**< Pointer to the actual lookup table structure */ 51 | void* read_indexed_db; /**< function used to retrieve hits 52 | from an indexed database */ 53 | void* check_index_oid; /**< function used to check if seeds 54 | for a given oid are present */ 55 | void * end_search_indication; /**< function used to report that 56 | a thread is done iterating over 57 | the database in preliminary 58 | search */ 59 | void* lookup_callback; /**< function used to look up an 60 | index->q_off pair */ 61 | } LookupTableWrap; 62 | 63 | /** Function pointer type to check the presence of index->q_off pair */ 64 | typedef Boolean (*T_Lookup_Callback)(const LookupTableWrap *, Int4, Int4); 65 | 66 | /** Create the lookup table for all query words. 67 | * @param query The query sequence [in] 68 | * @param lookup_options What kind of lookup table to build? 
[in] 69 | * @param query_options options for query setup [in] 70 | * @param lookup_segments Locations on query to be used for lookup table 71 | * construction [in] 72 | * @param sbp Scoring block containing matrix [in] 73 | * @param lookup_wrap_ptr The initialized lookup table [out] 74 | * @param rps_info Structure containing RPS blast setup information [in] 75 | * @param error_msg message with warning or errors [in|out] 76 | */ 77 | NCBI_XBLAST_EXPORT 78 | Int2 LookupTableWrapInit(BLAST_SequenceBlk* query, 79 | const LookupTableOptions* lookup_options, 80 | const QuerySetUpOptions* query_options, 81 | BlastSeqLoc* lookup_segments, BlastScoreBlk* sbp, 82 | LookupTableWrap** lookup_wrap_ptr, const BlastRPSInfo *rps_info, 83 | Blast_Message* *error_msg); 84 | 85 | /** Deallocate memory for the lookup table */ 86 | NCBI_XBLAST_EXPORT 87 | LookupTableWrap* LookupTableWrapFree(LookupTableWrap* lookup); 88 | 89 | /** Default size of offset arrays filled in a single ScanSubject call. */ 90 | #define OFFSET_ARRAY_SIZE 4096 91 | 92 | /** Determine the size of the offsets arrays to be filled by 93 | * the ScanSubject function. 
94 | */ 95 | NCBI_XBLAST_EXPORT 96 | Int4 GetOffsetArraySize(LookupTableWrap* lookup); 97 | 98 | #ifdef __cplusplus 99 | } 100 | #endif 101 | #endif /* !ALGO_BLAST_CORE__LOOKUP_WRAP__H */ 102 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/algo/blast/core/matrix_freq_ratios.h: -------------------------------------------------------------------------------- 1 | #ifndef ALGO_BLAST_CORE___MATRIX_FREQ_RATIOS__H 2 | #define ALGO_BLAST_CORE___MATRIX_FREQ_RATIOS__H 3 | 4 | /* $Id: matrix_freq_ratios.h 439650 2014-07-02 13:40:24Z madden $ 5 | * =========================================================================== 6 | * 7 | * PUBLIC DOMAIN NOTICE 8 | * National Center for Biotechnology Information 9 | * 10 | * This software/database is a "United States Government Work" under the 11 | * terms of the United States Copyright Act. It was written as part of 12 | * the author's official duties as a United States Government employee and 13 | * thus cannot be copyrighted. This software/database is freely available 14 | * to the public for use. The National Library of Medicine and the U.S. 15 | * Government have not placed any restriction on its use or reproduction. 16 | * 17 | * Although all reasonable efforts have been taken to ensure the accuracy 18 | * and reliability of the software and data, the NLM and the U.S. 19 | * Government do not and cannot warrant the performance or results that 20 | * may be obtained by using this software or data. The NLM and the U.S. 21 | * Government disclaim all warranties, express or implied, including 22 | * warranties of performance, merchantability or fitness for any particular 23 | * purpose. 24 | * 25 | * Please cite the author in any work or product based on this material. 
26 | * 27 | * =========================================================================== 28 | * 29 | * Author: Christiam Camacho 30 | * 31 | */ 32 | 33 | /** @file matrix_freq_ratios.h 34 | * Interface to retrieve the frequency ratios for various scoring matrices. 35 | * 36 | * See explanation in p 2996 of Nucleic Acids Research, 2001, Vol 29, No 14. 37 | */ 38 | 39 | #include 40 | 41 | #ifdef __cplusplus 42 | extern "C" { 43 | #endif 44 | 45 | /** Stores the frequency ratios along with their bit scale factor */ 46 | typedef struct SFreqRatios { 47 | 48 | /** The actual frequency ratios */ 49 | double** data; 50 | 51 | /** Used to multiply the values in the above matrix to obtain scores in bit 52 | * units */ 53 | int bit_scale_factor; 54 | 55 | } SFreqRatios; 56 | 57 | /** Retrieve the matrix's frequency ratios. 58 | * @param matrix_name Available options include: 59 | * BLOSUM62 60 | * BLOSUM62_20 61 | * BLOSUM62_20A 62 | * BLOSUM62_20B 63 | * BLOSUM45 64 | * BLOSUM80 65 | * BLOSUM50 66 | * BLOSUM90 67 | * PAM30 68 | * PAM70 69 | * PAM250 70 | * @return NULL on error 71 | */ 72 | NCBI_XBLAST_EXPORT SFreqRatios* 73 | _PSIMatrixFrequencyRatiosNew(const char* matrix_name); 74 | 75 | /** Deallocate the frequency ratios structure. @param freq_ratios structure to deallocate [in] */ 76 | NCBI_XBLAST_EXPORT SFreqRatios* 77 | _PSIMatrixFrequencyRatiosFree(SFreqRatios* freq_ratios); 78 | 79 | #ifdef __cplusplus 80 | } 81 | #endif 82 | 83 | #endif /* !ALGO_BLAST_CORE___MATRIX_FREQ_RATIOS__H */ 84 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/algo/blast/core/pattern_priv.h: -------------------------------------------------------------------------------- 1 | /* $Id: pattern_priv.h 103491 2007-05-04 17:18:18Z kazimird $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the
8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 12 | * Government have not placed any restriction on its use or reproduction. 13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S. 18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 23 | * 24 | * =========================================================================== 25 | * 26 | * Author: Ilya Dondoshansky 27 | * 28 | */ 29 | 30 | /** @file pattern_priv.h 31 | * Auxiliary functions for finding pattern matches in sequence (PHI-BLAST), that 32 | * are used in multiple source files. 33 | */ 34 | 35 | #ifndef ALGO_BLAST_CORE__PATTERN_PRIV_H 36 | #define ALGO_BLAST_CORE__PATTERN_PRIV_H 37 | 38 | #include 39 | #include 40 | 41 | #ifdef __cplusplus 42 | extern "C" { 43 | #endif 44 | 45 | /** Routine to find hits of pattern to sequence when sequence is proteins 46 | * @param hitArray An array of matches to pass back [out] 47 | * @param seq The input sequence [in] 48 | * @param len1 Length of the input sequence. [in] 49 | * @param pattern_blk Carries variables that keep track of search 50 | * parameters. [in] 51 | * @return the number of matches found. 
52 | */ 53 | Int4 54 | _PHIBlastFindHitsShort(Int4 *hitArray, const Uint1* seq, Int4 len1, 55 | const SPHIPatternSearchBlk *pattern_blk); 56 | 57 | /** Shift each word in the array left by 1 bit and add bit b. 58 | * If the new value is bigger than an overflow threshold, then subtract the 59 | * overflow threshold. 60 | * @param a Array of integers, representing words in a pattern [in] [out] 61 | * @param b bit to add [in] 62 | * @param numWords Number of words to process [in] 63 | */ 64 | void 65 | _PHIPatternWordsLeftShift(Int4 *a, Uint1 b, Int4 numWords); 66 | 67 | /** Do a word-by-word bit-wise or of two integer arrays and put the result back 68 | * in the first array. 69 | * @param a First array [in] [out] 70 | * @param b Second array [in] 71 | * @param numWords Number of words in a and b [in] 72 | */ 73 | void 74 | _PHIPatternWordsBitwiseOr(Int4 *a, Int4 *b, Int4 numWords); 75 | 76 | /** Do a word-by-word bit-wise and of two integer arrays and put the result in 77 | * a new array. 78 | * @param result Result of the operation [out] 79 | * @param a First array [in] 80 | * @param b Second array [in] 81 | * @param numWords Size of the two input arrays [in] 82 | * @return 1 if there are any non-zero words, otherwise 0. 83 | */ 84 | Int4 85 | _PHIPatternWordsBitwiseAnd(Int4 *result, Int4 *a, Int4 *b, Int4 numWords); 86 | 87 | /** Masks all bits corresponding to the amino acid alphabet, i.e. the first 26 88 | * bits of an integer number. 89 | */ 90 | extern const int kMaskAaAlphabetBits; 91 | 92 | /** Looks for 1 bits in the same position of s and mask 93 | * Let R be the rightmost position where s and mask both have a 1. 94 | * Let L < R be the rightmost position where mask has a 1, if any, 95 | * or -1 otherwise.
96 | * @param s Number to check bits in [in] 97 | * @param mask Mask to apply [in] 98 | * @param rightOne The rightmost position where s and mask both have a 1 [out] 99 | * @param rightMaskOnly The rightmost position < rightOne, where mask has a 1, 100 | * if any, or -1 otherwise [out] 101 | */ 102 | void 103 | _PHIGetRightOneBits(Int4 s, Int4 mask, Int4* rightOne, Int4* rightMaskOnly); 104 | 105 | #ifdef __cplusplus 106 | } 107 | #endif 108 | 109 | #endif /* !ALGO_BLAST_CORE__PATTERN_PRIV_H */ 110 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/common/ncbi_skew_guard.h: -------------------------------------------------------------------------------- 1 | /* $Id: ncbi_skew_guard.h 346326 2011-12-06 15:28:48Z ucko $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the 8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 12 | * Government have not placed any restriction on its use or reproduction. 13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S. 18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 
23 | * 24 | * =========================================================================== 25 | * 26 | * Author: Aaron Ucko, NCBI 27 | * 28 | */ 29 | 30 | /** @file ncbi_skew_guard.h 31 | * Implementation header to catch build setups that mix incompatible 32 | * C and C++ Toolkit installations. 33 | * 34 | * Available as on the C++ side and 35 | * on the C side, in each case customized upon 36 | * installation to identify itself appropriately. 37 | */ 38 | 39 | /* In-house C++ Toolkit installations define NCBI_INSTALLED_CXX_VER to 40 | * the corresponding date stamp (also available as NCBI_DEVELOPMENT_VER 41 | * or NCBI_PRODUCTION_VER). */ 42 | /* #undef NCBI_INSTALLED_CXX_VER */ 43 | 44 | /* Accompanying copies of the C Toolkit define NCBI_EXPECTED_CXX_VER 45 | * accordingly. */ 46 | /* #undef NCBI_EXPECTED_CXX_VER */ 47 | 48 | #if defined(_NCBILCL_) && defined(FORWARDING_NCBICONF_H) \ 49 | && !defined(NCBI_ALLOW_MISMATCHED_VERSIONS) 50 | 51 | /* The last change to shared headers before this guard came along occurred 52 | * on Nov. 30, 2011. */ 53 | #define NCBI_MIN_CXX_VER 20111130 54 | 55 | # if defined(NCBI_INSTALLED_CXX_VER) 56 | 57 | # if !defined(NCBI_EXPECTED_CXX_VER) \ 58 | || NCBI_INSTALLED_CXX_VER != NCBI_EXPECTED_CXX_VER 59 | # error Please use the C Toolkit installation accompanying your C++ Toolkit tree. 60 | # endif 61 | 62 | # else 63 | 64 | # include 65 | # if NCBI_DEVELOPMENT_VER < NCBI_MIN_CXX_VER 66 | # error Please use a fresher C++ Toolkit version for C Toolkit compatibility. 67 | # elif defined(NCBI_EXPECTED_CXX_VER) 68 | # if (defined(NCBI_PRODUCTION_VER) ? NCBI_PRODUCTION_VER \ 69 | : NCBI_DEVELOPMENT_VER) \ 70 | != NCBI_EXPECTED_CXX_VER 71 | # error Please use matching C and C++ Toolkit versions. 
72 | # endif 73 | # endif 74 | 75 | # endif 76 | 77 | #endif 78 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/common/ncbiconf_impl.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON___NCBICONF_IMPL__H 2 | #define COMMON___NCBICONF_IMPL__H 3 | 4 | /* $Id: ncbiconf_impl.h 457074 2015-01-20 16:19:10Z ucko $ 5 | * =========================================================================== 6 | * 7 | * PUBLIC DOMAIN NOTICE 8 | * National Center for Biotechnology Information 9 | * 10 | * This software/database is a "United States Government Work" under the 11 | * terms of the United States Copyright Act. It was written as part of 12 | * the author's official duties as a United States Government employee and 13 | * thus cannot be copyrighted. This software/database is freely available 14 | * to the public for use. The National Library of Medicine and the U.S. 15 | * Government have not placed any restriction on its use or reproduction. 16 | * 17 | * Although all reasonable efforts have been taken to ensure the accuracy 18 | * and reliability of the software and data, the NLM and the U.S. 19 | * Government do not and cannot warrant the performance or results that 20 | * may be obtained by using this software or data. The NLM and the U.S. 21 | * Government disclaim all warranties, express or implied, including 22 | * warranties of performance, merchantability or fitness for any particular 23 | * purpose. 24 | * 25 | * Please cite the author in any work or product based on this material. 26 | * 27 | * =========================================================================== 28 | * 29 | * Author: Anton Lavrentiev 30 | * 31 | * 32 | */ 33 | 34 | /** 35 | * @file ncbiconf_impl.h 36 | * 37 | * Configuration macros. 38 | */ 39 | 40 | #ifndef FORWARDING_NCBICONF_H 41 | # error "The header can be used from only." 
42 | #endif /*!FORWARDING_NCBICONF_H*/ 43 | 44 | 45 | /** @addtogroup Portability 46 | * 47 | * @{ 48 | */ 49 | 50 | 51 | /* Threads configuration 52 | */ 53 | 54 | #undef NCBI_NO_THREADS 55 | #undef NCBI_THREADS 56 | #undef NCBI_POSIX_THREADS 57 | #undef NCBI_WIN32_THREADS 58 | 59 | #if defined(_MT) && !defined(NCBI_WITHOUT_MT) 60 | # if defined(NCBI_OS_MSWIN) 61 | # define NCBI_WIN32_THREADS 62 | # elif defined(NCBI_OS_UNIX) 63 | # define NCBI_POSIX_THREADS 64 | # else 65 | # define NCBI_NO_THREADS 66 | # endif 67 | #else 68 | # define NCBI_NO_THREADS 69 | #endif 70 | 71 | #if !defined(NCBI_NO_THREADS) 72 | # define NCBI_THREADS 73 | #endif 74 | 75 | /* Sync Windows/Cygwin preprocessor conditionals governing wide 76 | * character usage. */ 77 | 78 | #if defined(UNICODE) && !defined(_UNICODE) 79 | # define _UNICODE 1 80 | #elif defined(_UNICODE) && !defined(UNICODE) 81 | # define UNICODE 1 82 | #endif 83 | 84 | /* New/nonstandard keywords 85 | */ 86 | 87 | #if defined(__cplusplus) && defined(NCBI_RESTRICT_CXX) 88 | # define NCBI_RESTRICT NCBI_RESTRICT_CXX 89 | #elif !defined(__cplusplus) && defined(NCBI_RESTRICT_C) 90 | # define NCBI_RESTRICT NCBI_RESTRICT_C 91 | #elif __STDC_VERSION__ >= 199901 /* C99 specifies restrict */ 92 | # define NCBI_RESTRICT restrict 93 | #else 94 | # define NCBI_RESTRICT 95 | #endif 96 | 97 | #ifndef NCBI_FORCEINLINE 98 | # ifdef __cplusplus 99 | # define NCBI_FORCEINLINE inline 100 | # else 101 | # define NCBI_FORCEINLINE 102 | # endif 103 | #endif 104 | 105 | #ifndef NCBI_NORETURN 106 | # ifdef __GNUC__ 107 | # define NCBI_NORETURN __attribute__((__noreturn__)) 108 | # else 109 | # define NCBI_NORETURN 110 | # endif 111 | #endif 112 | 113 | /* Definition of packed enum type, to save some memory */ 114 | /* enum EMyEnum NCBI_PACKED_ENUM_TYPE(Type) { ... 
} NCBI_PACKED_ENUM_END(); */ 115 | #ifndef NCBI_PACKED_ENUM_TYPE 116 | # define NCBI_PACKED_ENUM_TYPE(type) 117 | #endif 118 | #ifndef NCBI_PACKED_ENUM_END 119 | # ifdef NCBI_PACKED 120 | # define NCBI_PACKED_ENUM_END() NCBI_PACKED 121 | # else 122 | # define NCBI_PACKED_ENUM_END() 123 | # endif 124 | #endif 125 | 126 | #ifndef NCBI_WARN_UNUSED_RESULT 127 | # define NCBI_WARN_UNUSED_RESULT 128 | #endif 129 | 130 | #ifdef __cplusplus 131 | # if __cplusplus >= 201103L || defined(__GXX_EXPERIMENTAL_CXX0X__) \ 132 | || defined(__GXX_EXPERIMENTAL_CPP0X__) 133 | # define NCBI_HAVE_CXX11 1 134 | # endif 135 | # if defined(NCBI_HAVE_CXX11) \ 136 | || (defined(NCBI_COMPILER_MSVC) && _MSC_VER >= 1600) 137 | # define HAVE_IS_SORTED 1 138 | # define HAVE_NULLPTR 1 139 | # endif 140 | # if defined(NCBI_HAVE_CXX11) /* or recent MSVC too? */ 141 | # if !defined(NCBI_COMPILER_ICC) || NCBI_COMPILER_VERSION >= 1400 142 | /* Exclude ICC 13.x and below, which don't support using "enum class" 143 | * in conjunction with switch. */ 144 | # define HAVE_ENUM_CLASS 1 145 | # endif 146 | # endif 147 | #endif 148 | 149 | #include 150 | 151 | /* @} */ 152 | 153 | #endif /* COMMON___NCBICONF_IMPL__H */ 154 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/connect/connect_export.h: -------------------------------------------------------------------------------- 1 | #ifndef CONNECT___CONNECT_EXPORT__H 2 | #define CONNECT___CONNECT_EXPORT__H 3 | 4 | /* $Id: connect_export.h 166398 2009-07-22 15:51:55Z ucko $ 5 | * =========================================================================== 6 | * 7 | * PUBLIC DOMAIN NOTICE 8 | * National Center for Biotechnology Information 9 | * 10 | * This software/database is a "United States Government Work" under the 11 | * terms of the United States Copyright Act. It was written as part of 12 | * the author's official duties as a United States Government employee and 13 | * thus cannot be copyrighted. 
This software/database is freely available 14 | * to the public for use. The National Library of Medicine and the U.S. 15 | * Government have not placed any restriction on its use or reproduction. 16 | * 17 | * Although all reasonable efforts have been taken to ensure the accuracy 18 | * and reliability of the software and data, the NLM and the U.S. 19 | * Government do not and cannot warrant the performance or results that 20 | * may be obtained by using this software or data. The NLM and the U.S. 21 | * Government disclaim all warranties, express or implied, including 22 | * warranties of performance, merchantability or fitness for any particular 23 | * purpose. 24 | * 25 | * Please cite the author in any work or product based on this material. 26 | * 27 | * =========================================================================== 28 | * 29 | * Author: Mike DiCuccio 30 | * 31 | * File Description: 32 | * Defines to provide correct exporting from CONNECT DLL in Windows. 33 | * These are necessary to compile DLLs with Visual C++ - exports must be 34 | * explicitly labeled as such. 35 | */ 36 | 37 | 38 | /** @addtogroup WinDLL 39 | * 40 | * @{ 41 | */ 42 | 43 | 44 | #if defined(WIN32) && defined(NCBI_DLL_BUILD) 45 | 46 | #ifndef _MSC_VER 47 | # error "This toolkit is not buildable with a compiler other than MSVC." 
48 | #endif 49 | 50 | 51 | /* 52 | * Dumping ground for Windows-specific stuff 53 | */ 54 | #pragma warning (disable : 4786 4251 4275) 55 | 56 | 57 | #ifdef NCBI_CORE_EXPORTS 58 | # define NCBI_XCONNECT_EXPORTS 59 | #endif 60 | 61 | 62 | #ifdef NCBI_XCONNECT_EXPORTS 63 | # define NCBI_XCONNECT_EXPORT __declspec(dllexport) 64 | #else 65 | # define NCBI_XCONNECT_EXPORT __declspec(dllimport) 66 | #endif 67 | 68 | 69 | #elif defined(__GNUC__) && __GNUC__ >= 4 70 | 71 | # define NCBI_XCONNECT_EXPORT __attribute__((visibility("default"))) 72 | 73 | #else 74 | 75 | /* 76 | * NULL operations for other cases 77 | */ 78 | 79 | # define NCBI_XCONNECT_EXPORT 80 | 81 | 82 | #endif 83 | 84 | 85 | /* @} */ 86 | 87 | #endif /* CONNECT___CONNECT_EXPORT__H */ 88 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/ncbiconf.h: -------------------------------------------------------------------------------- 1 | #ifndef FORWARDING_NCBICONF_H 2 | #define FORWARDING_NCBICONF_H 3 | 4 | /* $Id: ncbiconf.h 485953 2015-11-30 17:53:34Z blastadm $ 5 | * =========================================================================== 6 | * 7 | * PUBLIC DOMAIN NOTICE 8 | * National Center for Biotechnology Information 9 | * 10 | * This software/database is a "United States Government Work" under the 11 | * terms of the United States Copyright Act. It was written as part of 12 | * the author's official duties as a United States Government employee and 13 | * thus cannot be copyrighted. This software/database is freely available 14 | * to the public for use. The National Library of Medicine and the U.S. 15 | * Government have not placed any restriction on its use or reproduction. 16 | * 17 | * Although all reasonable efforts have been taken to ensure the accuracy 18 | * and reliability of the software and data, the NLM and the U.S. 
19 | * Government do not and cannot warrant the performance or results that 20 | * may be obtained by using this software or data. The NLM and the U.S. 21 | * Government disclaim all warranties, express or implied, including 22 | * warranties of performance, merchantability or fitness for any particular 23 | * purpose. 24 | * 25 | * Please cite the author in any work or product based on this material. 26 | * 27 | * =========================================================================== 28 | * 29 | * Authors: Denis Vakatov, Aaron Ucko 30 | * 31 | */ 32 | 33 | /** @file ncbiconf.h 34 | ** Front end for a platform-specific configuration summary. 35 | **/ 36 | 37 | #ifdef _MSC_VER 38 | # include 39 | #elif defined(NCBI_XCODE_BUILD) 40 | # include 41 | #else 42 | #include 43 | #endif 44 | 45 | #ifdef NCBI_UNIVERSAL_BUILD 46 | /* sort out the remaining details */ 47 | # include 48 | #endif 49 | 50 | #include 51 | 52 | #endif /* FORWARDING_NCBICONF_H */ 53 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/util/tables/sm_blosum45.c: -------------------------------------------------------------------------------- 1 | /* $Id: sm_blosum45.c 90506 2006-09-25 19:30:59Z madden $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the 8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 12 | * Government have not placed any restriction on its use or reproduction. 
13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S. 18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 23 | * 24 | * =========================================================================== 25 | * 26 | * Author: Aaron Ucko, Mike Gertz 27 | * 28 | * File Description: 29 | * Protein alignment score matrices; shared between the two toolkits. 30 | * 31 | * =========================================================================== 32 | */ 33 | 34 | #include 35 | 36 | /** Entries for the BLOSUM45 matrix at a scale of ln(2)/3.0. */ 37 | 38 | static const TNCBIScore s_Blosum45PSM[25 * 25] = { 39 | /* A, R, N, D, C, Q, E, G, H, I, L, K, M, 40 | F, P, S, T, W, Y, V, B, J, Z, X, * */ 41 | /*A*/ 5, -2, -1, -2, -1, -1, -1, 0, -2, -1, -1, -1, -1, 42 | -2, -1, 1, 0, -2, -2, 0, -1, -1, -1, -1, -5, 43 | /*R*/ -2, 7, 0, -1, -3, 1, 0, -2, 0, -3, -2, 3, -1, 44 | -2, -2, -1, -1, -2, -1, -2, -1, -3, 1, -1, -5, 45 | /*N*/ -1, 0, 6, 2, -2, 0, 0, 0, 1, -2, -3, 0, -2, 46 | -2, -2, 1, 0, -4, -2, -3, 5, -3, 0, -1, -5, 47 | /*D*/ -2, -1, 2, 7, -3, 0, 2, -1, 0, -4, -3, 0, -3, 48 | -4, -1, 0, -1, -4, -2, -3, 6, -3, 1, -1, -5, 49 | /*C*/ -1, -3, -2, -3, 12, -3, -3, -3, -3, -3, -2, -3, -2, 50 | -2, -4, -1, -1, -5, -3, -1, -2, -2, -3, -1, -5, 51 | /*Q*/ -1, 1, 0, 0, -3, 6, 2, -2, 1, -2, -2, 1, 0, 52 | -4, -1, 0, -1, -2, -1, -3, 0, -2, 4, -1, -5, 53 | /*E*/ -1, 0, 0, 2, -3, 2, 6, -2, 0, -3, -2, 1, -2, 54 | -3, 0, 0, -1, -3, -2, -3, 1, -3, 5, -1, -5, 55 | /*G*/ 0, -2, 0, -1, -3, -2, -2, 7, -2, -4, -3, -2, -2, 56 | -3, -2, 0, -2, -2, -3, -3, -1, -4, 
-2, -1, -5, 57 | /*H*/ -2, 0, 1, 0, -3, 1, 0, -2, 10, -3, -2, -1, 0, 58 | -2, -2, -1, -2, -3, 2, -3, 0, -2, 0, -1, -5, 59 | /*I*/ -1, -3, -2, -4, -3, -2, -3, -4, -3, 5, 2, -3, 2, 60 | 0, -2, -2, -1, -2, 0, 3, -3, 4, -3, -1, -5, 61 | /*L*/ -1, -2, -3, -3, -2, -2, -2, -3, -2, 2, 5, -3, 2, 62 | 1, -3, -3, -1, -2, 0, 1, -3, 4, -2, -1, -5, 63 | /*K*/ -1, 3, 0, 0, -3, 1, 1, -2, -1, -3, -3, 5, -1, 64 | -3, -1, -1, -1, -2, -1, -2, 0, -3, 1, -1, -5, 65 | /*M*/ -1, -1, -2, -3, -2, 0, -2, -2, 0, 2, 2, -1, 6, 66 | 0, -2, -2, -1, -2, 0, 1, -2, 2, -1, -1, -5, 67 | /*F*/ -2, -2, -2, -4, -2, -4, -3, -3, -2, 0, 1, -3, 0, 68 | 8, -3, -2, -1, 1, 3, 0, -3, 1, -3, -1, -5, 69 | /*P*/ -1, -2, -2, -1, -4, -1, 0, -2, -2, -2, -3, -1, -2, 70 | -3, 9, -1, -1, -3, -3, -3, -2, -3, -1, -1, -5, 71 | /*S*/ 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -3, -1, -2, 72 | -2, -1, 4, 2, -4, -2, -1, 0, -2, 0, -1, -5, 73 | /*T*/ 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, 74 | -1, -1, 2, 5, -3, -1, 0, 0, -1, -1, -1, -5, 75 | /*W*/ -2, -2, -4, -4, -5, -2, -3, -2, -3, -2, -2, -2, -2, 76 | 1, -3, -4, -3, 15, 3, -3, -4, -2, -2, -1, -5, 77 | /*Y*/ -2, -1, -2, -2, -3, -1, -2, -3, 2, 0, 0, -1, 0, 78 | 3, -3, -2, -1, 3, 8, -1, -2, 0, -2, -1, -5, 79 | /*V*/ 0, -2, -3, -3, -1, -3, -3, -3, -3, 3, 1, -2, 1, 80 | 0, -3, -1, 0, -3, -1, 5, -3, 2, -3, -1, -5, 81 | /*B*/ -1, -1, 5, 6, -2, 0, 1, -1, 0, -3, -3, 0, -2, 82 | -3, -2, 0, 0, -4, -2, -3, 5, -3, 1, -1, -5, 83 | /*J*/ -1, -3, -3, -3, -2, -2, -3, -4, -2, 4, 4, -3, 2, 84 | 1, -3, -2, -1, -2, 0, 2, -3, 4, -2, -1, -5, 85 | /*Z*/ -1, 1, 0, 1, -3, 4, 5, -2, 0, -3, -2, 1, -1, 86 | -3, -1, 0, -1, -2, -2, -3, 1, -2, 5, -1, -5, 87 | /*X*/ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 88 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -5, 89 | /***/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 90 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 1 91 | }; 92 | const SNCBIPackedScoreMatrix NCBISM_Blosum45 = { 93 | "ARNDCQEGHILKMFPSTWYVBJZX*", 94 | s_Blosum45PSM, 95 | -5 96 | 
}; 97 | 98 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/util/tables/sm_blosum50.c: -------------------------------------------------------------------------------- 1 | /* $Id: sm_blosum50.c 90507 2006-09-25 19:31:51Z madden $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the 8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 12 | * Government have not placed any restriction on its use or reproduction. 13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S. 18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 23 | * 24 | * =========================================================================== 25 | * 26 | * Author: Aaron Ucko, Mike Gertz 27 | * 28 | * File Description: 29 | * Protein alignment score matrices; shared between the two toolkits. 30 | * 31 | * =========================================================================== 32 | */ 33 | 34 | #include 35 | 36 | /** Entries for the BLOSUM50 matrix at a scale of ln(2)/3.0. 
*/ 37 | 38 | static const TNCBIScore s_Blosum50PSM[25 * 25] = { 39 | /* A, R, N, D, C, Q, E, G, H, I, L, K, M, 40 | F, P, S, T, W, Y, V, B, J, Z, X, * */ 41 | /*A*/ 5, -2, -1, -2, -1, -1, -1, 0, -2, -1, -2, -1, -1, 42 | -3, -1, 1, 0, -3, -2, 0, -2, -2, -1, -1, -5, 43 | /*R*/ -2, 7, -1, -2, -4, 1, 0, -3, 0, -4, -3, 3, -2, 44 | -3, -3, -1, -1, -3, -1, -3, -1, -3, 0, -1, -5, 45 | /*N*/ -1, -1, 7, 2, -2, 0, 0, 0, 1, -3, -4, 0, -2, 46 | -4, -2, 1, 0, -4, -2, -3, 5, -4, 0, -1, -5, 47 | /*D*/ -2, -2, 2, 8, -4, 0, 2, -1, -1, -4, -4, -1, -4, 48 | -5, -1, 0, -1, -5, -3, -4, 6, -4, 1, -1, -5, 49 | /*C*/ -1, -4, -2, -4, 13, -3, -3, -3, -3, -2, -2, -3, -2, 50 | -2, -4, -1, -1, -5, -3, -1, -3, -2, -3, -1, -5, 51 | /*Q*/ -1, 1, 0, 0, -3, 7, 2, -2, 1, -3, -2, 2, 0, 52 | -4, -1, 0, -1, -1, -1, -3, 0, -3, 4, -1, -5, 53 | /*E*/ -1, 0, 0, 2, -3, 2, 6, -3, 0, -4, -3, 1, -2, 54 | -3, -1, -1, -1, -3, -2, -3, 1, -3, 5, -1, -5, 55 | /*G*/ 0, -3, 0, -1, -3, -2, -3, 8, -2, -4, -4, -2, -3, 56 | -4, -2, 0, -2, -3, -3, -4, -1, -4, -2, -1, -5, 57 | /*H*/ -2, 0, 1, -1, -3, 1, 0, -2, 10, -4, -3, 0, -1, 58 | -1, -2, -1, -2, -3, 2, -4, 0, -3, 0, -1, -5, 59 | /*I*/ -1, -4, -3, -4, -2, -3, -4, -4, -4, 5, 2, -3, 2, 60 | 0, -3, -3, -1, -3, -1, 4, -4, 4, -3, -1, -5, 61 | /*L*/ -2, -3, -4, -4, -2, -2, -3, -4, -3, 2, 5, -3, 3, 62 | 1, -4, -3, -1, -2, -1, 1, -4, 4, -3, -1, -5, 63 | /*K*/ -1, 3, 0, -1, -3, 2, 1, -2, 0, -3, -3, 6, -2, 64 | -4, -1, 0, -1, -3, -2, -3, 0, -3, 1, -1, -5, 65 | /*M*/ -1, -2, -2, -4, -2, 0, -2, -3, -1, 2, 3, -2, 7, 66 | 0, -3, -2, -1, -1, 0, 1, -3, 2, -1, -1, -5, 67 | /*F*/ -3, -3, -4, -5, -2, -4, -3, -4, -1, 0, 1, -4, 0, 68 | 8, -4, -3, -2, 1, 4, -1, -4, 1, -4, -1, -5, 69 | /*P*/ -1, -3, -2, -1, -4, -1, -1, -2, -2, -3, -4, -1, -3, 70 | -4, 10, -1, -1, -4, -3, -3, -2, -3, -1, -1, -5, 71 | /*S*/ 1, -1, 1, 0, -1, 0, -1, 0, -1, -3, -3, 0, -2, 72 | -3, -1, 5, 2, -4, -2, -2, 0, -3, 0, -1, -5, 73 | /*T*/ 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, 74 | -2, -1, 2, 5, -3, -2, 0, 0, 
-1, -1, -1, -5, 75 | /*W*/ -3, -3, -4, -5, -5, -1, -3, -3, -3, -3, -2, -3, -1, 76 | 1, -4, -4, -3, 15, 2, -3, -5, -2, -2, -1, -5, 77 | /*Y*/ -2, -1, -2, -3, -3, -1, -2, -3, 2, -1, -1, -2, 0, 78 | 4, -3, -2, -2, 2, 8, -1, -3, -1, -2, -1, -5, 79 | /*V*/ 0, -3, -3, -4, -1, -3, -3, -4, -4, 4, 1, -3, 1, 80 | -1, -3, -2, 0, -3, -1, 5, -3, 2, -3, -1, -5, 81 | /*B*/ -2, -1, 5, 6, -3, 0, 1, -1, 0, -4, -4, 0, -3, 82 | -4, -2, 0, 0, -5, -3, -3, 6, -4, 1, -1, -5, 83 | /*J*/ -2, -3, -4, -4, -2, -3, -3, -4, -3, 4, 4, -3, 2, 84 | 1, -3, -3, -1, -2, -1, 2, -4, 4, -3, -1, -5, 85 | /*Z*/ -1, 0, 0, 1, -3, 4, 5, -2, 0, -3, -3, 1, -1, 86 | -4, -1, 0, -1, -2, -2, -3, 1, -3, 5, -1, -5, 87 | /*X*/ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 88 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -5, 89 | /***/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 90 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 1 91 | }; 92 | const SNCBIPackedScoreMatrix NCBISM_Blosum50 = { 93 | "ARNDCQEGHILKMFPSTWYVBJZX*", 94 | s_Blosum50PSM, 95 | -5 96 | }; 97 | 98 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/util/tables/sm_blosum62.c: -------------------------------------------------------------------------------- 1 | /* $Id: sm_blosum62.c 90506 2006-09-25 19:30:59Z madden $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the 8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 12 | * Government have not placed any restriction on its use or reproduction. 
13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S. 18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 23 | * 24 | * =========================================================================== 25 | * 26 | * Author: Aaron Ucko, Mike Gertz 27 | * 28 | * File Description: 29 | * Protein alignment score matrices; shared between the two toolkits. 30 | * 31 | * =========================================================================== 32 | */ 33 | 34 | #include 35 | 36 | /** Entries for the BLOSUM62 matrix at a scale of ln(2)/2.0. */ 37 | 38 | static const TNCBIScore s_Blosum62PSM[25 * 25] = { 39 | /* A, R, N, D, C, Q, E, G, H, I, L, K, M, 40 | F, P, S, T, W, Y, V, B, J, Z, X, * */ 41 | /*A*/ 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, 42 | -2, -1, 1, 0, -3, -2, 0, -2, -1, -1, -1, -4, 43 | /*R*/ -1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, 44 | -3, -2, -1, -1, -3, -2, -3, -1, -2, 0, -1, -4, 45 | /*N*/ -2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, 46 | -3, -2, 1, 0, -4, -2, -3, 4, -3, 0, -1, -4, 47 | /*D*/ -2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, 48 | -3, -1, 0, -1, -4, -3, -3, 4, -3, 1, -1, -4, 49 | /*C*/ 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, 50 | -2, -3, -1, -1, -2, -2, -1, -3, -1, -3, -1, -4, 51 | /*Q*/ -1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, 52 | -3, -1, 0, -1, -2, -1, -2, 0, -2, 4, -1, -4, 53 | /*E*/ -1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, 54 | -3, -1, 0, -1, -3, -2, -2, 1, -3, 4, -1, -4, 55 | /*G*/ 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, 56 | -3, -2, 0, -2, -2, -3, -3, -1, -4, 
-2, -1, -4, 57 | /*H*/ -2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, 58 | -1, -2, -1, -2, -2, 2, -3, 0, -3, 0, -1, -4, 59 | /*I*/ -1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 60 | 0, -3, -2, -1, -3, -1, 3, -3, 3, -3, -1, -4, 61 | /*L*/ -1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 62 | 0, -3, -2, -1, -2, -1, 1, -4, 3, -3, -1, -4, 63 | /*K*/ -1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, 64 | -3, -1, 0, -1, -3, -2, -2, 0, -3, 1, -1, -4, 65 | /*M*/ -1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 66 | 0, -2, -1, -1, -1, -1, 1, -3, 2, -1, -1, -4, 67 | /*F*/ -2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 68 | 6, -4, -2, -2, 1, 3, -1, -3, 0, -3, -1, -4, 69 | /*P*/ -1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, 70 | -4, 7, -1, -1, -4, -3, -2, -2, -3, -1, -1, -4, 71 | /*S*/ 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, 72 | -2, -1, 4, 1, -3, -2, -2, 0, -2, 0, -1, -4, 73 | /*T*/ 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, 74 | -2, -1, 1, 5, -2, -2, 0, -1, -1, -1, -1, -4, 75 | /*W*/ -3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 76 | 1, -4, -3, -2, 11, 2, -3, -4, -2, -2, -1, -4, 77 | /*Y*/ -2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 78 | 3, -3, -2, -2, 2, 7, -1, -3, -1, -2, -1, -4, 79 | /*V*/ 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, 80 | -1, -2, -2, 0, -3, -1, 4, -3, 2, -2, -1, -4, 81 | /*B*/ -2, -1, 4, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, 82 | -3, -2, 0, -1, -4, -3, -3, 4, -3, 0, -1, -4, 83 | /*J*/ -1, -2, -3, -3, -1, -2, -3, -4, -3, 3, 3, -3, 2, 84 | 0, -3, -2, -1, -2, -1, 2, -3, 3, -3, -1, -4, 85 | /*Z*/ -1, 0, 0, 1, -3, 4, 4, -2, 0, -3, -3, 1, -1, 86 | -3, -1, 0, -1, -2, -2, -2, 0, -3, 4, -1, -4, 87 | /*X*/ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 88 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -4, 89 | /***/ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 90 | -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1 91 | }; 92 | const SNCBIPackedScoreMatrix NCBISM_Blosum62 = { 93 | "ARNDCQEGHILKMFPSTWYVBJZX*", 94 | 
s_Blosum62PSM, 95 | -4 96 | }; 97 | 98 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/util/tables/sm_identity.c: -------------------------------------------------------------------------------- 1 | /* $Id: sm_identity.c 458581 2015-02-06 15:18:12Z boratyng $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the 8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 12 | * Government have not placed any restriction on its use or reproduction. 13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S. 18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 23 | * 24 | * =========================================================================== 25 | * 26 | * Author: Greg Boratyn 27 | * 28 | * File Description: 29 | * Protein alignment score matrices; shared between the two toolkits. 30 | * 31 | * =========================================================================== 32 | */ 33 | 34 | #include 35 | 36 | /** Entries for the IDENTITY matrix. 
*/ 37 | 38 | static const TNCBIScore s_IdentityPSM[25 * 25] = { 39 | /* A, R, N, D, C, Q, E, G, H, I, L, K, M, 40 | F, P, S, T, W, Y, V, B, J, Z, X, * */ 41 | /*A*/ 9, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 42 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 43 | /*R*/ -5, 9, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 44 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 45 | /*N*/ -5, -5, 9, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 46 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 47 | /*D*/ -5, -5, -5, 9, -5, -5, -5, -5, -5, -5, -5, -5, -5, 48 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 49 | /*C*/ -5, -5, -5, -5, 9, -5, -5, -5, -5, -5, -5, -5, -5, 50 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 51 | /*Q*/ -5, -5, -5, -5, -5, 9, -5, -5, -5, -5, -5, -5, -5, 52 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 53 | /*E*/ -5, -5, -5, -5, -5, -5, 9, -5, -5, -5, -5, -5, -5, 54 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 55 | /*G*/ -5, -5, -5, -5, -5, -5, -5, 9, -5, -5, -5, -5, -5, 56 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 57 | /*H*/ -5, -5, -5, -5, -5, -5, -5, -5, 9, -5, -5, -5, -5, 58 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 59 | /*I*/ -5, -5, -5, -5, -5, -5, -5, -5, -5, 9, -5, -5, -5, 60 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 61 | /*L*/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 9, -5, -5, 62 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 63 | /*K*/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 9, -5, 64 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 65 | /*M*/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 9, 66 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 67 | /*F*/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 68 | 9, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 69 | /*P*/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 70 | -5, 9, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 71 | /*S*/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 72 | -5, -5, 9, -5, -5, -5, -5, -5, -5, -5, -5, -5, 73 | 
/*T*/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 74 | -5, -5, -5, 9, -5, -5, -5, -5, -5, -5, -5, -5, 75 | /*W*/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 76 | -5, -5, -5, -5, 9, -5, -5, -5, -5, -5, -5, -5, 77 | /*Y*/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 78 | -5, -5, -5, -5, -5, 9, -5, -5, -5, -5, -5, -5, 79 | /*V*/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 80 | -5, -5, -5, -5, -5, -5, 9, -5, -5, -5, -5, -5, 81 | /*B*/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 82 | -5, -5, -5, -5, -5, -5, -5, 9, -5, -5, -5, -5, 83 | /*J*/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 84 | -5, -5, -5, -5, -5, -5, -5, -5, 9, -5, -5, -5, 85 | /*Z*/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 86 | -5, -5, -5, -5, -5, -5, -5, -5, -5, 9, -5, -5, 87 | /*X*/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 88 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 89 | /***/ -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 90 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 9 91 | }; 92 | const SNCBIPackedScoreMatrix NCBISM_Identity = { 93 | "ARNDCQEGHILKMFPSTWYVBJZX*", 94 | s_IdentityPSM, 95 | -5 96 | }; 97 | 98 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/util/tables/sm_pam250.c: -------------------------------------------------------------------------------- 1 | /* $Id: sm_pam250.c 90506 2006-09-25 19:30:59Z madden $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the 8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 
12 | * Government have not placed any restriction on its use or reproduction. 13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S. 18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 23 | * 24 | * =========================================================================== 25 | * 26 | * Author: Aaron Ucko, Mike Gertz 27 | * 28 | * File Description: 29 | * Protein alignment score matrices; shared between the two toolkits. 30 | * 31 | * =========================================================================== 32 | */ 33 | 34 | #include 35 | 36 | /** Entries for the PAM250 matrix at a scale of ln(2)/3.0. 
*/ 37 | 38 | static const TNCBIScore s_Pam250PSM[25 * 25] = { 39 | /* A, R, N, D, C, Q, E, G, H, I, L, K, M, 40 | F, P, S, T, W, Y, V, B, J, Z, X, * */ 41 | /*A*/ 2, -2, 0, 0, -2, 0, 0, 1, -1, -1, -2, -1, -1, 42 | -3, 1, 1, 1, -6, -3, 0, 0, -1, 0, -1, -8, 43 | /*R*/ -2, 6, 0, -1, -4, 1, -1, -3, 2, -2, -3, 3, 0, 44 | -4, 0, 0, -1, 2, -4, -2, -1, -3, 0, -1, -8, 45 | /*N*/ 0, 0, 2, 2, -4, 1, 1, 0, 2, -2, -3, 1, -2, 46 | -3, 0, 1, 0, -4, -2, -2, 2, -3, 1, -1, -8, 47 | /*D*/ 0, -1, 2, 4, -5, 2, 3, 1, 1, -2, -4, 0, -3, 48 | -6, -1, 0, 0, -7, -4, -2, 3, -3, 3, -1, -8, 49 | /*C*/ -2, -4, -4, -5, 12, -5, -5, -3, -3, -2, -6, -5, -5, 50 | -4, -3, 0, -2, -8, 0, -2, -4, -5, -5, -1, -8, 51 | /*Q*/ 0, 1, 1, 2, -5, 4, 2, -1, 3, -2, -2, 1, -1, 52 | -5, 0, -1, -1, -5, -4, -2, 1, -2, 3, -1, -8, 53 | /*E*/ 0, -1, 1, 3, -5, 2, 4, 0, 1, -2, -3, 0, -2, 54 | -5, -1, 0, 0, -7, -4, -2, 3, -3, 3, -1, -8, 55 | /*G*/ 1, -3, 0, 1, -3, -1, 0, 5, -2, -3, -4, -2, -3, 56 | -5, 0, 1, 0, -7, -5, -1, 0, -4, 0, -1, -8, 57 | /*H*/ -1, 2, 2, 1, -3, 3, 1, -2, 6, -2, -2, 0, -2, 58 | -2, 0, -1, -1, -3, 0, -2, 1, -2, 2, -1, -8, 59 | /*I*/ -1, -2, -2, -2, -2, -2, -2, -3, -2, 5, 2, -2, 2, 60 | 1, -2, -1, 0, -5, -1, 4, -2, 3, -2, -1, -8, 61 | /*L*/ -2, -3, -3, -4, -6, -2, -3, -4, -2, 2, 6, -3, 4, 62 | 2, -3, -3, -2, -2, -1, 2, -3, 5, -3, -1, -8, 63 | /*K*/ -1, 3, 1, 0, -5, 1, 0, -2, 0, -2, -3, 5, 0, 64 | -5, -1, 0, 0, -3, -4, -2, 1, -3, 0, -1, -8, 65 | /*M*/ -1, 0, -2, -3, -5, -1, -2, -3, -2, 2, 4, 0, 6, 66 | 0, -2, -2, -1, -4, -2, 2, -2, 3, -2, -1, -8, 67 | /*F*/ -3, -4, -3, -6, -4, -5, -5, -5, -2, 1, 2, -5, 0, 68 | 9, -5, -3, -3, 0, 7, -1, -4, 2, -5, -1, -8, 69 | /*P*/ 1, 0, 0, -1, -3, 0, -1, 0, 0, -2, -3, -1, -2, 70 | -5, 6, 1, 0, -6, -5, -1, -1, -2, 0, -1, -8, 71 | /*S*/ 1, 0, 1, 0, 0, -1, 0, 1, -1, -1, -3, 0, -2, 72 | -3, 1, 2, 1, -2, -3, -1, 0, -2, 0, -1, -8, 73 | /*T*/ 1, -1, 0, 0, -2, -1, 0, 0, -1, 0, -2, 0, -1, 74 | -3, 0, 1, 3, -5, -3, 0, 0, -1, -1, -1, -8, 75 | /*W*/ -6, 2, -4, -7, -8, -5, -7, -7, 
-3, -5, -2, -3, -4, 76 | 0, -6, -2, -5, 17, 0, -6, -5, -3, -6, -1, -8, 77 | /*Y*/ -3, -4, -2, -4, 0, -4, -4, -5, 0, -1, -1, -4, -2, 78 | 7, -5, -3, -3, 0, 10, -2, -3, -1, -4, -1, -8, 79 | /*V*/ 0, -2, -2, -2, -2, -2, -2, -1, -2, 4, 2, -2, 2, 80 | -1, -1, -1, 0, -6, -2, 4, -2, 2, -2, -1, -8, 81 | /*B*/ 0, -1, 2, 3, -4, 1, 3, 0, 1, -2, -3, 1, -2, 82 | -4, -1, 0, 0, -5, -3, -2, 3, -3, 2, -1, -8, 83 | /*J*/ -1, -3, -3, -3, -5, -2, -3, -4, -2, 3, 5, -3, 3, 84 | 2, -2, -2, -1, -3, -1, 2, -3, 5, -2, -1, -8, 85 | /*Z*/ 0, 0, 1, 3, -5, 3, 3, 0, 2, -2, -3, 0, -2, 86 | -5, 0, 0, -1, -6, -4, -2, 2, -2, 3, -1, -8, 87 | /*X*/ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 88 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -8, 89 | /***/ -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, 90 | -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, 1 91 | }; 92 | const SNCBIPackedScoreMatrix NCBISM_Pam250 = { 93 | "ARNDCQEGHILKMFPSTWYVBJZX*", 94 | s_Pam250PSM, 95 | -8 96 | }; 97 | 98 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/util/tables/sm_pam30.c: -------------------------------------------------------------------------------- 1 | /* $Id: sm_pam30.c 90506 2006-09-25 19:30:59Z madden $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the 8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 12 | * Government have not placed any restriction on its use or reproduction. 
13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S. 18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 23 | * 24 | * =========================================================================== 25 | * 26 | * Author: Aaron Ucko, Mike Gertz 27 | * 28 | * File Description: 29 | * Protein alignment score matrices; shared between the two toolkits. 30 | * 31 | * =========================================================================== 32 | */ 33 | 34 | #include 35 | 36 | /** Entries for the PAM30 matrix at a scale of ln(2)/2.0. */ 37 | 38 | static const TNCBIScore s_Pam30PSM[25 * 25] = { 39 | /* A, R, N, D, C, Q, E, G, H, I, L, K, M, 40 | F, P, S, T, W, Y, V, B, J, Z, X, * */ 41 | /*A*/ 6, -7, -4, -3, -6, -4, -2, -2, -7, -5, -6, -7, -5, 42 | -8, -2, 0, -1,-13, -8, -2, -3, -6, -3, -1,-17, 43 | /*R*/ -7, 8, -6,-10, -8, -2, -9, -9, -2, -5, -8, 0, -4, 44 | -9, -4, -3, -6, -2,-10, -8, -7, -7, -4, -1,-17, 45 | /*N*/ -4, -6, 8, 2,-11, -3, -2, -3, 0, -5, -7, -1, -9, 46 | -9, -6, 0, -2, -8, -4, -8, 6, -6, -3, -1,-17, 47 | /*D*/ -3,-10, 2, 8,-14, -2, 2, -3, -4, -7,-12, -4,-11, 48 | -15, -8, -4, -5,-15,-11, -8, 6,-10, 1, -1,-17, 49 | /*C*/ -6, -8,-11,-14, 10,-14,-14, -9, -7, -6,-15,-14,-13, 50 | -13, -8, -3, -8,-15, -4, -6,-12, -9,-14, -1,-17, 51 | /*Q*/ -4, -2, -3, -2,-14, 8, 1, -7, 1, -8, -5, -3, -4, 52 | -13, -3, -5, -5,-13,-12, -7, -3, -5, 6, -1,-17, 53 | /*E*/ -2, -9, -2, 2,-14, 1, 8, -4, -5, -5, -9, -4, -7, 54 | -14, -5, -4, -6,-17, -8, -6, 1, -7, 6, -1,-17, 55 | /*G*/ -2, -9, -3, -3, -9, -7, -4, 6, -9,-11,-10, -7, -8, 56 | 
-9, -6, -2, -6,-15,-14, -5, -3,-10, -5, -1,-17, 57 | /*H*/ -7, -2, 0, -4, -7, 1, -5, -9, 9, -9, -6, -6,-10, 58 | -6, -4, -6, -7, -7, -3, -6, -1, -7, -1, -1,-17, 59 | /*I*/ -5, -5, -5, -7, -6, -8, -5,-11, -9, 8, -1, -6, -1, 60 | -2, -8, -7, -2,-14, -6, 2, -6, 5, -6, -1,-17, 61 | /*L*/ -6, -8, -7,-12,-15, -5, -9,-10, -6, -1, 7, -8, 1, 62 | -3, -7, -8, -7, -6, -7, -2, -9, 6, -7, -1,-17, 63 | /*K*/ -7, 0, -1, -4,-14, -3, -4, -7, -6, -6, -8, 7, -2, 64 | -14, -6, -4, -3,-12, -9, -9, -2, -7, -4, -1,-17, 65 | /*M*/ -5, -4, -9,-11,-13, -4, -7, -8,-10, -1, 1, -2, 11, 66 | -4, -8, -5, -4,-13,-11, -1,-10, 0, -5, -1,-17, 67 | /*F*/ -8, -9, -9,-15,-13,-13,-14, -9, -6, -2, -3,-14, -4, 68 | 9,-10, -6, -9, -4, 2, -8,-10, -2,-13, -1,-17, 69 | /*P*/ -2, -4, -6, -8, -8, -3, -5, -6, -4, -8, -7, -6, -8, 70 | -10, 8, -2, -4,-14,-13, -6, -7, -7, -4, -1,-17, 71 | /*S*/ 0, -3, 0, -4, -3, -5, -4, -2, -6, -7, -8, -4, -5, 72 | -6, -2, 6, 0, -5, -7, -6, -1, -8, -5, -1,-17, 73 | /*T*/ -1, -6, -2, -5, -8, -5, -6, -6, -7, -2, -7, -3, -4, 74 | -9, -4, 0, 7,-13, -6, -3, -3, -5, -6, -1,-17, 75 | /*W*/ -13, -2, -8,-15,-15,-13,-17,-15, -7,-14, -6,-12,-13, 76 | -4,-14, -5,-13, 13, -5,-15,-10, -7,-14, -1,-17, 77 | /*Y*/ -8,-10, -4,-11, -4,-12, -8,-14, -3, -6, -7, -9,-11, 78 | 2,-13, -7, -6, -5, 10, -7, -6, -7, -9, -1,-17, 79 | /*V*/ -2, -8, -8, -8, -6, -7, -6, -5, -6, 2, -2, -9, -1, 80 | -8, -6, -6, -3,-15, -7, 7, -8, 0, -6, -1,-17, 81 | /*B*/ -3, -7, 6, 6,-12, -3, 1, -3, -1, -6, -9, -2,-10, 82 | -10, -7, -1, -3,-10, -6, -8, 6, -8, 0, -1,-17, 83 | /*J*/ -6, -7, -6,-10, -9, -5, -7,-10, -7, 5, 6, -7, 0, 84 | -2, -7, -8, -5, -7, -7, 0, -8, 6, -6, -1,-17, 85 | /*Z*/ -3, -4, -3, 1,-14, 6, 6, -5, -1, -6, -7, -4, -5, 86 | -13, -4, -5, -6,-14, -9, -6, 0, -6, 6, -1,-17, 87 | /*X*/ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 88 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,-17, 89 | /***/ -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, 90 | -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, 1 91 | }; 92 | 
const SNCBIPackedScoreMatrix NCBISM_Pam30 = { 93 | "ARNDCQEGHILKMFPSTWYVBJZX*", 94 | s_Pam30PSM, 95 | -17 96 | }; 97 | 98 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/util/tables/sm_pam70.c: -------------------------------------------------------------------------------- 1 | /* $Id: sm_pam70.c 90506 2006-09-25 19:30:59Z madden $ 2 | * =========================================================================== 3 | * 4 | * PUBLIC DOMAIN NOTICE 5 | * National Center for Biotechnology Information 6 | * 7 | * This software/database is a "United States Government Work" under the 8 | * terms of the United States Copyright Act. It was written as part of 9 | * the author's official duties as a United States Government employee and 10 | * thus cannot be copyrighted. This software/database is freely available 11 | * to the public for use. The National Library of Medicine and the U.S. 12 | * Government have not placed any restriction on its use or reproduction. 13 | * 14 | * Although all reasonable efforts have been taken to ensure the accuracy 15 | * and reliability of the software and data, the NLM and the U.S. 16 | * Government do not and cannot warrant the performance or results that 17 | * may be obtained by using this software or data. The NLM and the U.S. 18 | * Government disclaim all warranties, express or implied, including 19 | * warranties of performance, merchantability or fitness for any particular 20 | * purpose. 21 | * 22 | * Please cite the author in any work or product based on this material. 23 | * 24 | * =========================================================================== 25 | * 26 | * Author: Aaron Ucko, Mike Gertz 27 | * 28 | * File Description: 29 | * Protein alignment score matrices; shared between the two toolkits. 
30 | * 31 | * =========================================================================== 32 | */ 33 | 34 | #include 35 | 36 | /** Entries for the PAM70 matrix at a scale of ln(2)/2.0. */ 37 | 38 | static const TNCBIScore s_Pam70PSM[25 * 25] = { 39 | /* A, R, N, D, C, Q, E, G, H, I, L, K, M, 40 | F, P, S, T, W, Y, V, B, J, Z, X, * */ 41 | /*A*/ 5, -4, -2, -1, -4, -2, -1, 0, -4, -2, -4, -4, -3, 42 | -6, 0, 1, 1, -9, -5, -1, -1, -3, -1, -1,-11, 43 | /*R*/ -4, 8, -3, -6, -5, 0, -5, -6, 0, -3, -6, 2, -2, 44 | -7, -2, -1, -4, 0, -7, -5, -4, -5, -2, -1,-11, 45 | /*N*/ -2, -3, 6, 3, -7, -1, 0, -1, 1, -3, -5, 0, -5, 46 | -6, -3, 1, 0, -6, -3, -5, 5, -4, -1, -1,-11, 47 | /*D*/ -1, -6, 3, 6, -9, 0, 3, -1, -1, -5, -8, -2, -7, 48 | -10, -4, -1, -2,-10, -7, -5, 5, -7, 2, -1,-11, 49 | /*C*/ -4, -5, -7, -9, 9, -9, -9, -6, -5, -4,-10, -9, -9, 50 | -8, -5, -1, -5,-11, -2, -4, -8, -7, -9, -1,-11, 51 | /*Q*/ -2, 0, -1, 0, -9, 7, 2, -4, 2, -5, -3, -1, -2, 52 | -9, -1, -3, -3, -8, -8, -4, -1, -3, 5, -1,-11, 53 | /*E*/ -1, -5, 0, 3, -9, 2, 6, -2, -2, -4, -6, -2, -4, 54 | -9, -3, -2, -3,-11, -6, -4, 2, -5, 5, -1,-11, 55 | /*G*/ 0, -6, -1, -1, -6, -4, -2, 6, -6, -6, -7, -5, -6, 56 | -7, -3, 0, -3,-10, -9, -3, -1, -7, -3, -1,-11, 57 | /*H*/ -4, 0, 1, -1, -5, 2, -2, -6, 8, -6, -4, -3, -6, 58 | -4, -2, -3, -4, -5, -1, -4, 0, -4, 1, -1,-11, 59 | /*I*/ -2, -3, -3, -5, -4, -5, -4, -6, -6, 7, 1, -4, 1, 60 | 0, -5, -4, -1, -9, -4, 3, -4, 4, -4, -1,-11, 61 | /*L*/ -4, -6, -5, -8,-10, -3, -6, -7, -4, 1, 6, -5, 2, 62 | -1, -5, -6, -4, -4, -4, 0, -6, 5, -4, -1,-11, 63 | /*K*/ -4, 2, 0, -2, -9, -1, -2, -5, -3, -4, -5, 6, 0, 64 | -9, -4, -2, -1, -7, -7, -6, -1, -5, -2, -1,-11, 65 | /*M*/ -3, -2, -5, -7, -9, -2, -4, -6, -6, 1, 2, 0, 10, 66 | -2, -5, -3, -2, -8, -7, 0, -6, 2, -3, -1,-11, 67 | /*F*/ -6, -7, -6,-10, -8, -9, -9, -7, -4, 0, -1, -9, -2, 68 | 8, -7, -4, -6, -2, 4, -5, -7, -1, -9, -1,-11, 69 | /*P*/ 0, -2, -3, -4, -5, -1, -3, -3, -2, -5, -5, -4, -5, 70 | -7, 7, 0, -2, -9, -9, -3, -4, -5, -2, 
-1,-11, 71 | /*S*/ 1, -1, 1, -1, -1, -3, -2, 0, -3, -4, -6, -2, -3, 72 | -4, 0, 5, 2, -3, -5, -3, 0, -5, -2, -1,-11, 73 | /*T*/ 1, -4, 0, -2, -5, -3, -3, -3, -4, -1, -4, -1, -2, 74 | -6, -2, 2, 6, -8, -4, -1, -1, -3, -3, -1,-11, 75 | /*W*/ -9, 0, -6,-10,-11, -8,-11,-10, -5, -9, -4, -7, -8, 76 | -2, -9, -3, -8, 13, -3,-10, -7, -5,-10, -1,-11, 77 | /*Y*/ -5, -7, -3, -7, -2, -8, -6, -9, -1, -4, -4, -7, -7, 78 | 4, -9, -5, -4, -3, 9, -5, -4, -4, -7, -1,-11, 79 | /*V*/ -1, -5, -5, -5, -4, -4, -4, -3, -4, 3, 0, -6, 0, 80 | -5, -3, -3, -1,-10, -5, 6, -5, 1, -4, -1,-11, 81 | /*B*/ -1, -4, 5, 5, -8, -1, 2, -1, 0, -4, -6, -1, -6, 82 | -7, -4, 0, -1, -7, -4, -5, 5, -5, 1, -1,-11, 83 | /*J*/ -3, -5, -4, -7, -7, -3, -5, -7, -4, 4, 5, -5, 2, 84 | -1, -5, -5, -3, -5, -4, 1, -5, 5, -4, -1,-11, 85 | /*Z*/ -1, -2, -1, 2, -9, 5, 5, -3, 1, -4, -4, -2, -3, 86 | -9, -2, -2, -3,-10, -7, -4, 1, -4, 5, -1,-11, 87 | /*X*/ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 88 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,-11, 89 | /***/ -11,-11,-11,-11,-11,-11,-11,-11,-11,-11,-11,-11,-11, 90 | -11,-11,-11,-11,-11,-11,-11,-11,-11,-11,-11, 1 91 | }; 92 | const SNCBIPackedScoreMatrix NCBISM_Pam70 = { 93 | "ARNDCQEGHILKMFPSTWYVBJZX*", 94 | s_Pam70PSM, 95 | -11 96 | }; 97 | 98 | -------------------------------------------------------------------------------- /src/include/ncbi-blast+/util/tables/tables_export.h: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_TABLES___TABLES_EXPORT__H 2 | #define UTIL_TABLES___TABLES_EXPORT__H 3 | 4 | /* $Id: tables_export.h 166398 2009-07-22 15:51:55Z ucko $ 5 | * =========================================================================== 6 | * 7 | * PUBLIC DOMAIN NOTICE 8 | * National Center for Biotechnology Information 9 | * 10 | * This software/database is a "United States Government Work" under the 11 | * terms of the United States Copyright Act. 
It was written as part of 12 | * the author's official duties as a United States Government employee and 13 | * thus cannot be copyrighted. This software/database is freely available 14 | * to the public for use. The National Library of Medicine and the U.S. 15 | * Government have not placed any restriction on its use or reproduction. 16 | * 17 | * Although all reasonable efforts have been taken to ensure the accuracy 18 | * and reliability of the software and data, the NLM and the U.S. 19 | * Government do not and cannot warrant the performance or results that 20 | * may be obtained by using this software or data. The NLM and the U.S. 21 | * Government disclaim all warranties, express or implied, including 22 | * warranties of performance, merchantability or fitness for any particular 23 | * purpose. 24 | * 25 | * Please cite the author in any work or product based on this material. 26 | * 27 | * =========================================================================== 28 | * 29 | * Authors: Anatoliy Kuznetsov, Mike DiCuccio, Aaron Ucko 30 | * 31 | * File Description: 32 | * Defines to provide correct exporting from TABLES DLL in Windows. 33 | * These are necessary to compile DLLs with Visual C++ - exports must be 34 | * explicitly labeled as such. 35 | */ 36 | 37 | 38 | /** @addtogroup WinDLL 39 | * 40 | * @{ 41 | */ 42 | 43 | 44 | #if defined(WIN32) && defined(NCBI_DLL_BUILD) 45 | 46 | #ifndef _MSC_VER 47 | # error "This toolkit is not buildable with a compiler other than MSVC." 
48 | #endif 49 | 50 | 51 | /* 52 | * Dumping ground for Windows-specific stuff 53 | */ 54 | #pragma warning (disable : 4786 4251 4275) 55 | 56 | 57 | #ifdef NCBI_CORE_EXPORTS 58 | # define NCBI_TABLES_EXPORTS 59 | #endif 60 | 61 | 62 | #ifdef NCBI_TABLES_EXPORTS 63 | # define NCBI_TABLES_EXPORT __declspec(dllexport) 64 | #else 65 | # define NCBI_TABLES_EXPORT __declspec(dllimport) 66 | #endif /* NCBI_TABLES_EXPORTS */ 67 | 68 | 69 | 70 | #elif defined(__GNUC__) && __GNUC__ >= 4 71 | 72 | # define NCBI_TABLES_EXPORT __attribute__((visibility("default"))) 73 | 74 | #else 75 | 76 | /* 77 | * NULL operations for other cases 78 | */ 79 | 80 | # define NCBI_TABLES_EXPORT 81 | 82 | 83 | #endif 84 | 85 | 86 | /* @} */ 87 | 88 | #endif /* UTIL_TABLES___TABLES_EXPORT__H */ 89 | -------------------------------------------------------------------------------- /src/jsonreporter.h: -------------------------------------------------------------------------------- 1 | #ifndef JSON_REPORTER_H 2 | #define JSON_REPORTER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "options.h" 11 | #include "stats.h" 12 | #include "filterresult.h" 13 | #include "common.h" 14 | #include "util.h" 15 | 16 | using namespace std; 17 | 18 | class JsonReporter{ 19 | public: 20 | JsonReporter(Options* & opt); 21 | ~JsonReporter(); 22 | 23 | void setDupHist(int* dupHist, double* dupMeanGC, double dupRate); 24 | void setInsertHist(atomic_long* insertHist, int insertSizePeak); 25 | void report(FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2 = NULL, Stats* postStats2 = NULL); 26 | 27 | private: 28 | Options* mOptions; 29 | int* mDupHist; 30 | double* mDupMeanGC; 31 | double mDupRate; 32 | atomic_long* mInsertHist; 33 | int mInsertSizePeak; 34 | }; 35 | 36 | 37 | #endif -------------------------------------------------------------------------------- /src/nucleotidetree.cpp: 
-------------------------------------------------------------------------------- 1 | #include "nucleotidetree.h" 2 | #include 3 | 4 | NucleotideNode::NucleotideNode(){ 5 | count = 0; 6 | base = 'N'; 7 | memset(children, 0, sizeof(NucleotideNode*)*8); 8 | } 9 | NucleotideNode::~NucleotideNode(){ 10 | for(int i=0; i<8; i++) { 11 | if(children[i]) 12 | delete children[i]; 13 | } 14 | } 15 | void NucleotideNode::dfs() { 16 | //cerr << base; 17 | //cerr << count; 18 | printf("%c", base); 19 | printf("%d", count); 20 | bool hasChild = false; 21 | for(int i=0; i<8; i++) { 22 | if(children[i]){ 23 | children[i]->dfs(); 24 | hasChild = true; 25 | } 26 | } 27 | if(!hasChild) { 28 | printf("\n"); 29 | } 30 | } 31 | 32 | NucleotideTree::NucleotideTree(Options* & opt){ 33 | mOptions = opt; 34 | mRoot = new NucleotideNode(); 35 | } 36 | 37 | 38 | NucleotideTree::~NucleotideTree(){ 39 | delete mRoot; 40 | } 41 | 42 | void NucleotideTree::addSeq(string seq) { 43 | NucleotideNode* curNode = mRoot; 44 | for(int i=0; ichildren[base] == NULL) { 49 | curNode->children[base] = new NucleotideNode(); 50 | curNode->children[base]->base = seq[i]; 51 | } 52 | curNode->children[base]->count++; 53 | curNode = curNode->children[base]; 54 | } 55 | } 56 | 57 | string NucleotideTree::getDominantPath(bool& reachedLeaf) { 58 | stringstream ss; 59 | const double RATIO_THRESHOLD = 0.95; 60 | const int NUM_THRESHOLD = 50; 61 | NucleotideNode* curNode = mRoot; 62 | while(true) { 63 | int total = 0; 64 | for(int i=0; i<8; i++) { 65 | if(curNode->children[i] != NULL) 66 | total += curNode->children[i]->count; 67 | } 68 | if(total < NUM_THRESHOLD) 69 | break; 70 | bool hasDominant = false; 71 | for(int i=0; i<8; i++) { 72 | if(curNode->children[i] == NULL) 73 | continue; 74 | if(curNode->children[i]->count / (double)total >= RATIO_THRESHOLD) { 75 | hasDominant = true; 76 | ss << curNode->children[i]->base; 77 | curNode = curNode->children[i]; 78 | break; 79 | } 80 | } 81 | if(!hasDominant) { 82 | 
reachedLeaf = false; 83 | break; 84 | } 85 | } 86 | return ss.str(); 87 | 88 | } 89 | 90 | bool NucleotideTree::test() { 91 | // NucleotideTree tree(NULL); 92 | // for(int i=0; i<100; i++) { 93 | // tree.addSeq("AAAATTTT"); 94 | // tree.addSeq("AAAATTTTGGGG"); 95 | // tree.addSeq("AAAATTTTGGGGCCCC"); 96 | // tree.addSeq("AAAATTTTGGGGCCAA"); 97 | // } 98 | // tree.addSeq("AAAATTTTGGGACCCC"); 99 | // 100 | // bool reachedLeaf = true; 101 | // string path = tree.getDominantPath(reachedLeaf); 102 | // printf("%s\n", path.c_str()); 103 | // return path == "AAAATTTTGGGGCC"; 104 | } -------------------------------------------------------------------------------- /src/nucleotidetree.h: -------------------------------------------------------------------------------- 1 | #ifndef NUCLEICTREE_H 2 | #define NUCLEICTREE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "options.h" 9 | 10 | using namespace std; 11 | 12 | // (A,T,C,G,N) & 0X07 = (1,4,7,6,3) 13 | class NucleotideNode{ 14 | public: 15 | NucleotideNode(); 16 | ~NucleotideNode(); 17 | void dfs(); 18 | 19 | public: 20 | int count; 21 | char base; 22 | NucleotideNode* children[8]; 23 | }; 24 | 25 | class NucleotideTree{ 26 | public: 27 | NucleotideTree(Options* & opt); 28 | ~NucleotideTree(); 29 | void addSeq(string seq); 30 | string getDominantPath(bool& reachedLeaf); 31 | 32 | static bool test(); 33 | 34 | private: 35 | Options* mOptions; 36 | NucleotideNode* mRoot; 37 | }; 38 | 39 | 40 | #endif -------------------------------------------------------------------------------- /src/overlapanalysis.h: -------------------------------------------------------------------------------- 1 | #ifndef OVERLAP_ANALYSIS_H 2 | #define OVERLAP_ANALYSIS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "common.h" 10 | #include "options.h" 11 | #include "read.h" 12 | 13 | using namespace std; 14 | 15 | class OverlapResult { 16 | public: 17 | bool overlapped; 18 | int offset; 19 | 
int overlap_len; 20 | int diff; 21 | }; 22 | 23 | class OverlapAnalysis{ 24 | public: 25 | OverlapAnalysis(); 26 | ~OverlapAnalysis(); 27 | 28 | static OverlapResult analyze(Sequence& r1, Sequence& r2, int diffLimit, int overlapRequire, double diffPercentLimit); 29 | static OverlapResult analyze(Read* r1, Read* r2, int diffLimit, int overlapRequire, double diffPercentLimit); 30 | static Read* merge(Read* r1, Read* r2, OverlapResult ov); 31 | 32 | public: 33 | static bool test(); 34 | 35 | }; 36 | 37 | #endif -------------------------------------------------------------------------------- /src/peprocessor.h: -------------------------------------------------------------------------------- 1 | #ifndef PE_PROCESSOR_H 2 | #define PE_PROCESSOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "fastqreader.h" 27 | #include "util.h" 28 | #include "adaptertrimmer.h" 29 | #include "basecorrector.h" 30 | #include "jsonreporter.h" 31 | #include "htmlreporter.h" 32 | #include "polyx.h" 33 | #include "options.h" 34 | #include "threadconfig.h" 35 | #include "filter.h" 36 | #include "umiprocessor.h" 37 | #include "overlapanalysis.h" 38 | #include "writerthread.h" 39 | #include "duplicate.h" 40 | #include "read.h" 41 | #include "bwtfmiDB.h" 42 | 43 | using namespace std; 44 | 45 | struct ReadPairPack { 46 | ReadPair** data; 47 | int count; 48 | }; 49 | 50 | typedef struct ReadPairPack ReadPairPack; 51 | 52 | struct ReadPairRepository { 53 | ReadPairPack** packBuffer; 54 | atomic_long readPos; 55 | atomic_long writePos; 56 | }; 57 | 58 | typedef struct ReadPairRepository ReadPairRepository; 59 | 60 | class PairEndProcessor{ 61 | public: 62 | PairEndProcessor(Options* & opt, BwtFmiDB * & tbwtfmiDB); 63 | ~PairEndProcessor(); 
64 | bool process(); 65 | 66 | private: 67 | bool processPairEnd(ReadPairPack* pack, ThreadConfig* config); 68 | bool processRead(Read* r, ReadPair* originalRead, bool reversed); 69 | void initPackRepository(); 70 | void destroyPackRepository(); 71 | void producePack(ReadPairPack* pack); 72 | void consumePack(ThreadConfig* config); 73 | void producerTask(); 74 | void consumerTask(ThreadConfig* config); 75 | void initConfig(ThreadConfig* config); 76 | void initOutput(); 77 | void closeOutput(); 78 | void statInsertSize(Read* r1, Read* r2, OverlapResult& ov, int frontTrimmed1 = 0, int frontTrimmed2 = 0); 79 | int getPeakInsertSize(); 80 | void writeTask(WriterThread* config); 81 | void prepareResults(); 82 | 83 | private: 84 | ReadPairRepository mRepo; 85 | atomic_bool mProduceFinished; 86 | atomic_int mFinishedThreads; 87 | std::mutex mOutputMtx; 88 | std::mutex mInputMtx; 89 | std::mutex logMtx; 90 | Options* mOptions; 91 | Filter* mFilter; 92 | gzFile mZipFile1; 93 | gzFile mZipFile2; 94 | ofstream* mOutStream1; 95 | ofstream* mOutStream2; 96 | UmiProcessor* mUmiProcessor; 97 | atomic_long* mInsertSizeHist; 98 | WriterThread* mLeftWriter; 99 | WriterThread* mRightWriter; 100 | WriterThread* mUnpairedLeftWriter; 101 | WriterThread* mUnpairedRightWriter; 102 | WriterThread* mMergedWriter; 103 | WriterThread* mFailedWriter; 104 | WriterThread* mReadsKOWriter; 105 | Duplicate* mDuplicate; 106 | BwtFmiDB *tbwtfmiDB; 107 | std::string fileoutname; 108 | }; 109 | 110 | 111 | #endif 112 | -------------------------------------------------------------------------------- /src/polyx.cpp: -------------------------------------------------------------------------------- 1 | #include "polyx.h" 2 | #include "common.h" 3 | 4 | PolyX::PolyX(){ 5 | } 6 | 7 | 8 | PolyX::~PolyX(){ 9 | } 10 | 11 | void PolyX::trimPolyG(Read* r1, Read* r2, FilterResult* fr, int compareReq) { 12 | trimPolyG(r1, fr, compareReq); 13 | trimPolyG(r2, fr, compareReq); 14 | } 15 | 16 | void 
PolyX::trimPolyG(Read* r, FilterResult* fr, int compareReq) { 17 | const int allowOneMismatchForEach = 8; 18 | const int maxMismatch = 5; 19 | 20 | const char* data = r->mSeq.mStr.c_str(); 21 | 22 | int rlen = r->length(); 23 | 24 | int mismatch = 0; 25 | int i = 0; 26 | int firstGPos = rlen - 1; 27 | for(i=0; i< rlen; i++) { 28 | if(data[rlen - i - 1] != 'G') { 29 | mismatch++; 30 | } else { 31 | firstGPos = rlen - i -1; 32 | } 33 | 34 | int allowedMismatch = (i+1)/allowOneMismatchForEach; 35 | if(mismatch > maxMismatch || (mismatch>allowedMismatch && i>= compareReq-1) ) 36 | break; 37 | } 38 | 39 | if(i >= compareReq) { 40 | r->resize(firstGPos); 41 | } 42 | } 43 | 44 | void PolyX::trimPolyX(Read* r1, Read* r2, FilterResult* fr, int compareReq) { 45 | trimPolyX(r1, fr, compareReq); 46 | trimPolyX(r2, fr, compareReq); 47 | } 48 | 49 | void PolyX::trimPolyX(Read* r, FilterResult* fr, int compareReq) { 50 | const int allowOneMismatchForEach = 8; 51 | const int maxMismatch = 5; 52 | 53 | const char* data = r->mSeq.mStr.c_str(); 54 | 55 | int rlen = r->length(); 56 | 57 | 58 | int atcgNumbers[4] = {0, 0, 0, 0}; 59 | int pos = 0; 60 | for(pos=0; pos= allowOneMismatchForEach || pos+1 >= compareReq-1)) { 93 | break; 94 | } 95 | } 96 | 97 | // has polyX 98 | if(pos+1 >= compareReq) { 99 | // find the poly 100 | int poly; 101 | int maxCount = -1; 102 | for(int b=0; b<4; b++) { 103 | if(atcgNumbers[b] > maxCount){ 104 | maxCount = atcgNumbers[b]; 105 | poly = b; 106 | } 107 | } 108 | char polyBase = ATCG_BASES[poly]; 109 | while(data[rlen - pos - 1] != polyBase && pos>=0) 110 | pos--; 111 | 112 | r->resize(rlen - pos - 1); 113 | if(fr) 114 | fr->addPolyXTrimmed(poly, pos + 1); 115 | } 116 | } 117 | 118 | bool PolyX::test() { 119 | 120 | // Read r("@name", 121 | // "ATTTTAAAAAAAAAATAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAT", 122 | // "+", 123 | // "///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////E"); 124 | // 125 | // FilterResult fr(NULL, false); 126 | // 
PolyX::trimPolyX(&r, &fr, 10); 127 | // r.print(); 128 | // 129 | // return r.mSeq.mStr == "ATTTT" && fr.getTotalPolyXTrimmedReads() == 1 && fr.getTotalPolyXTrimmedBases() == 51; 130 | } -------------------------------------------------------------------------------- /src/polyx.h: -------------------------------------------------------------------------------- 1 | #ifndef POLY_X_H 2 | #define POLY_X_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "overlapanalysis.h" 8 | #include "filterresult.h" 9 | #include "options.h" 10 | 11 | using namespace std; 12 | 13 | class PolyX{ 14 | public: 15 | PolyX(); 16 | ~PolyX(); 17 | 18 | static void trimPolyG(Read* r1, Read* r2, FilterResult* fr, int compareReq); 19 | static void trimPolyG(Read* r1, FilterResult* fr, int compareReq); 20 | static void trimPolyX(Read* r1, Read* r2, FilterResult* fr, int compareReq); 21 | static void trimPolyX(Read* r1, FilterResult* fr, int compareReq); 22 | static bool test(); 23 | 24 | 25 | }; 26 | 27 | 28 | #endif -------------------------------------------------------------------------------- /src/processor.cpp: -------------------------------------------------------------------------------- 1 | #include "processor.h" 2 | #include "peprocessor.h" 3 | #include "seprocessor.h" 4 | 5 | Processor::Processor(Options* & opt){ 6 | mOptions = opt; 7 | } 8 | 9 | Processor::~Processor(){ 10 | } 11 | 12 | bool Processor::process(BwtFmiDB * & tbwtfmiDB) { 13 | if(mOptions->isPaired()) { 14 | PairEndProcessor p(mOptions, tbwtfmiDB); 15 | p.process(); 16 | } else { 17 | SingleEndProcessor p(mOptions, tbwtfmiDB); 18 | p.process(); 19 | } 20 | 21 | return true; 22 | } -------------------------------------------------------------------------------- /src/processor.h: -------------------------------------------------------------------------------- 1 | #ifndef PROCESSOR_H 2 | #define PROCESSOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "options.h" 8 | #include "bwtfmiDB.h" 9 | 10 | using 
namespace std; 11 | 12 | class Processor{ 13 | public: 14 | Processor(Options* & opt); 15 | ~Processor(); 16 | bool process(BwtFmiDB * & tbwtfmiDB); 17 | 18 | private: 19 | Options* mOptions; 20 | }; 21 | 22 | 23 | #endif -------------------------------------------------------------------------------- /src/read.h: -------------------------------------------------------------------------------- 1 | #ifndef READ_H 2 | #define READ_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "sequence.h" 10 | #include 11 | #include "util.h" 12 | 13 | using namespace std; 14 | 15 | class Read{ 16 | public: 17 | Read(string name, string seq, string strand, string quality, bool phred64 = false); 18 | Read(string name, Sequence seq, string strand, string quality, bool phred64 = false); 19 | Read(string name, string seq, string strand); 20 | Read(string name, Sequence seq, string strand); 21 | Read(Read &r); 22 | void print(); 23 | void printFile(ofstream& file); 24 | Read* reverseComplement(); 25 | string firstIndex(); 26 | string lastIndex(); 27 | // default is Q20 28 | int lowQualCount(int qual = 20); 29 | int length(); 30 | string toString(); 31 | string toFastaR1(); 32 | string toFastaR2(); 33 | string toStringWithTag(string tag); 34 | string toStringWithTag(uint32* tag); 35 | string toStringWithTagRm(); 36 | void resize(int len); 37 | void convertPhred64To33(); 38 | void trimFront(int len); 39 | bool fixMGI(); 40 | 41 | public: 42 | static bool test(); 43 | 44 | private: 45 | 46 | 47 | public: 48 | string mName; 49 | Sequence mSeq; 50 | string mStrand; 51 | string mQuality; 52 | bool mHasQuality; 53 | }; 54 | 55 | class ReadPair{ 56 | public: 57 | ReadPair(Read* left, Read* right); 58 | ~ReadPair(); 59 | 60 | // merge a pair, without consideration of seq error caused false INDEL 61 | Read* fastMerge(); 62 | public: 63 | Read* mLeft; 64 | Read* mRight; 65 | 66 | public: 67 | static bool test(); 68 | }; 69 | 70 | class ReadItem{ 71 | public: 72 | 
std::string name1; 73 | std::string name2; 74 | std::string sequence1; 75 | std::string quality1; 76 | std::string sequence2; 77 | std::string quality2; 78 | bool paired = false; 79 | ReadItem(const std::string &, const std::string &); 80 | ReadItem(const std::string &, const std::string &, const std::string &); 81 | // ReadItem(const std::string & n1, const std::string & s1, const std::string & q1); 82 | // ReadItem(const std::string & n1, const std::string & s1, const std::string & q1, const std::string & n2, const std::string & s2, const std::string & q2); 83 | // std::string toStringR1(); 84 | std::string toStringWithTagR1(std::string & tag); 85 | // std::string toStringR2(); 86 | std::string toStringWithTagR2(std::string & tag); 87 | 88 | // std::string toStringRQ1(); 89 | // std::string toStringWithTagRQ1(std::string & tag); 90 | // std::string toStringRQ2(); 91 | // std::string toStringWithTagRQ2(std::string & tag); 92 | // 93 | }; 94 | 95 | #endif -------------------------------------------------------------------------------- /src/seprocessor.h: -------------------------------------------------------------------------------- 1 | #ifndef SE_PROCESSOR_H 2 | #define SE_PROCESSOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include "options.h" 26 | #include "threadconfig.h" 27 | #include "filter.h" 28 | #include "umiprocessor.h" 29 | #include "writerthread.h" 30 | #include "duplicate.h" 31 | #include "fastqreader.h" 32 | #include "util.h" 33 | #include "jsonreporter.h" 34 | #include "htmlreporter.h" 35 | #include "adaptertrimmer.h" 36 | #include "polyx.h" 37 | #include "read.h" 38 | #include "common.h" 39 | #include "bwtfmiDB.h" 40 | 41 | 42 | using namespace std; 43 | 44 | struct ReadPack { 45 | Read** data; 46 | int 
count; 47 | }; 48 | 49 | typedef struct ReadPack ReadPack; 50 | 51 | struct ReadRepository { 52 | ReadPack** packBuffer; 53 | atomic_long readPos; 54 | atomic_long writePos; 55 | }; 56 | 57 | typedef struct ReadRepository ReadRepository; 58 | 59 | class SingleEndProcessor{ 60 | public: 61 | SingleEndProcessor(Options* & opt, BwtFmiDB * tbwtfmiDB); 62 | ~SingleEndProcessor(); 63 | bool process(); 64 | 65 | private: 66 | bool processSingleEnd(ReadPack* pack, ThreadConfig* config); 67 | void initPackRepository(); 68 | void destroyPackRepository(); 69 | void producePack(ReadPack* pack); 70 | void consumePack(ThreadConfig* config); 71 | void producerTask(); 72 | void consumerTask(ThreadConfig* config); 73 | void initConfig(ThreadConfig* config); 74 | void initOutput(); 75 | void closeOutput(); 76 | void writeTask(WriterThread* config); 77 | void prepareResults(); 78 | private: 79 | Options* mOptions; 80 | ReadRepository mRepo; 81 | atomic_bool mProduceFinished; 82 | atomic_int mFinishedThreads; 83 | std::mutex mInputMtx; 84 | std::mutex mOutputMtx; 85 | std::mutex logMtx; 86 | Filter* mFilter; 87 | gzFile mZipFile; 88 | ofstream* mOutStream; 89 | UmiProcessor* mUmiProcessor; 90 | WriterThread* mLeftWriter; 91 | WriterThread* mFailedWriter; 92 | Duplicate* mDuplicate; 93 | WriterThread* mReadsKOWriter; 94 | BwtFmiDB *tbwtfmiDB; 95 | std::string fileoutname; 96 | 97 | }; 98 | 99 | 100 | #endif 101 | -------------------------------------------------------------------------------- /src/seqtract: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xia-lab/Seq2Fun/32a89d0ee8c47764e54d1f259d3831e8ca0fd2f5/src/seqtract -------------------------------------------------------------------------------- /src/seqtractpeprocessor.h: -------------------------------------------------------------------------------- 1 | #ifndef SEQTRACTPEPROCESSOR_H 2 | #define SEQTRACTPEPROCESSOR_H 3 | 4 | #include 5 | #include 6 | #include 7 
| #include 8 | #include 9 | #include 10 | #include 11 | #include "options.h" 12 | #include "fastqreader.h" 13 | #include "threadsconfig2.h" 14 | #include "read.h" 15 | #include "util.h" 16 | 17 | 18 | using namespace std; 19 | 20 | struct ReadPack{ 21 | Read** data; 22 | int count; 23 | }; 24 | typedef struct ReadPack ReadPack; 25 | 26 | struct ReadRepository{ 27 | ReadPack** packBuffer; 28 | size_t readPos; 29 | size_t writePos; 30 | size_t readCounter; 31 | std::mutex mtx; 32 | std::mutex readCounterMtx; 33 | std::condition_variable repoNotFull; 34 | std::condition_variable repoNotEmpty; 35 | }; 36 | typedef struct ReadPairRepository ReadPairRepository; 37 | 38 | 39 | class SeqTractPeProcessor { 40 | public: 41 | SeqTractPeProcessor(Options * opt); 42 | ~SeqTractPeProcessor(); 43 | bool process(); 44 | 45 | private: 46 | bool processReads(ReadPack* pack); 47 | void initPackRepository(); 48 | void destroyPackRepository(); 49 | void producePack(ReadPack* pack); 50 | void consumePack(); 51 | void producerTask(); 52 | void consumerTask(); 53 | void writeTask(ThreadsConfig2* config); 54 | 55 | private: 56 | Options* mOptions; 57 | ReadRepository mRepo; 58 | bool mProduceFinished; 59 | ThreadsConfig2** mConfigs; 60 | int mSampleSize; 61 | std::unordered_set featureUSet; 62 | }; 63 | 64 | #endif /* SEQTRACTPEPROCESSOR_H */ 65 | 66 | -------------------------------------------------------------------------------- /src/sequence.cpp: -------------------------------------------------------------------------------- 1 | #include "sequence.h" 2 | 3 | Sequence::Sequence(){ 4 | } 5 | 6 | Sequence::Sequence(string seq){ 7 | mStr = seq; 8 | } 9 | 10 | void Sequence::print(){ 11 | std::cerr << mStr; 12 | } 13 | 14 | int Sequence::length(){ 15 | return mStr.length(); 16 | } 17 | 18 | Sequence Sequence::reverseComplement(){ 19 | string str(mStr.length(), 0); 20 | for(int c=0;c 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | 11 | class Sequence{ 12 | public: 
13 | Sequence(); 14 | Sequence(string seq); 15 | void print(); 16 | int length(); 17 | Sequence reverseComplement(); 18 | 19 | Sequence operator~(); 20 | 21 | static bool test(); 22 | 23 | public: 24 | string mStr; 25 | }; 26 | 27 | #endif -------------------------------------------------------------------------------- /src/stats.h: -------------------------------------------------------------------------------- 1 | #ifndef STATS_H 2 | #define STATS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "read.h" 10 | #include "options.h" 11 | 12 | using namespace std; 13 | 14 | class Stats{ 15 | public: 16 | // this @guessedCycles parameter should be calculated using the first several records 17 | Stats(Options* & opt, bool isRead2 = false, int guessedCycles = 0, int bufferMargin = 1024); 18 | ~Stats(); 19 | int getCycles(); 20 | long getReads(); 21 | long getBases(); 22 | long getQ20(); 23 | long getQ30(); 24 | long getGCNumber(); 25 | // by default the qualified qual score is Q20 ('5') 26 | void statRead(Read* r); 27 | 28 | static Stats* merge(vector& list); 29 | void print(); 30 | void summarize(bool forced = false); 31 | // a port of JSON report 32 | void reportJson(ofstream& ofs, string padding); 33 | // a port of HTML report 34 | void reportHtml(ofstream& ofs, string filteringType, string readName); 35 | void reportHtmlQuality(ofstream& ofs, string filteringType, string readName); 36 | void reportHtmlContents(ofstream& ofs, string filteringType, string readName); 37 | void reportHtmlKMER(ofstream& ofs, string filteringType, string readName); 38 | void reportHtmlORA(ofstream& ofs, string filteringType, string readName); 39 | bool isLongRead(); 40 | void initOverRepSeq(); 41 | int getMeanLength(); 42 | 43 | public: 44 | static string list2string(double* list, int size); 45 | static string list2string(double* list, int size, long* coords); 46 | static string list2string(long* list, int size); 47 | static string 
list2string(std::vector & x_vec, int top); 48 | static string list2string(std::vector & y_vec, int top); 49 | static string list2string(std::vector & x_vec, int top); 50 | //static string list2string(std::vector & y_vec, int top); 51 | static int base2val(char base); 52 | 53 | private: 54 | void extendBuffer(int newBufLen); 55 | string makeKmerTD(int i, int j); 56 | string kmer3(int val); 57 | string kmer2(int val); 58 | void deleteOverRepSeqDist(); 59 | bool overRepPassed(string& seq, long count); 60 | 61 | private: 62 | Options* mOptions; 63 | bool mIsRead2; 64 | long mReads; 65 | int mEvaluatedSeqLen; 66 | /* 67 | why we use 8 here? 68 | map A/T/C/G/N to 0~7 by their ASCII % 8: 69 | 'A' % 8 = 1 70 | 'T' % 8 = 4 71 | 'C' % 8 = 3 72 | 'G' % 8 = 7 73 | 'N' % 8 = 6 74 | */ 75 | long *mCycleQ30Bases[8]; 76 | long *mCycleQ20Bases[8]; 77 | long *mCycleBaseContents[8]; 78 | long *mCycleBaseQual[8]; 79 | long *mCycleTotalBase; 80 | long *mCycleTotalQual; 81 | long *mKmer; 82 | 83 | map mQualityCurves; 84 | map mContentCurves; 85 | map mOverRepSeq; 86 | map mOverRepSeqDist; 87 | 88 | 89 | int mCycles; 90 | int mBufLen; 91 | long mBases; 92 | long mQ20Bases[8]; 93 | long mQ30Bases[8]; 94 | long mBaseContents[8]; 95 | long mQ20Total; 96 | long mQ30Total; 97 | bool summarized; 98 | long mKmerMax; 99 | long mKmerMin; 100 | int mKmerBufLen; 101 | long mLengthSum; 102 | }; 103 | 104 | #endif -------------------------------------------------------------------------------- /src/threadconfig.cpp: -------------------------------------------------------------------------------- 1 | #include "threadconfig.h" 2 | #include "util.h" 3 | 4 | ThreadConfig::ThreadConfig(Options* & opt, BwtFmiDB* & tbwtfmiDB, int threadId, bool paired){ 5 | mOptions = opt; 6 | mThreadId = threadId; 7 | mWorkingSplit = threadId; 8 | mCurrentSplitReads = 0; 9 | mPreStats1 = new Stats(mOptions, false); 10 | mPostStats1 = new Stats(mOptions, false); 11 | if(paired){ 12 | mPreStats2 = new Stats(mOptions, true); 
13 | mPostStats2 = new Stats(mOptions, true); 14 | } else { 15 | mPreStats2 = NULL; 16 | mPostStats2 = NULL; 17 | } 18 | mWriter1 = NULL; 19 | mWriter2 = NULL; 20 | 21 | mFilterResult = new FilterResult(opt, paired); 22 | mCanBeStopped = false; 23 | //mBwtfmiDB = tbwtfmiDB; 24 | mTransSearcher = new TransSearcher(mOptions, tbwtfmiDB); 25 | } 26 | 27 | ThreadConfig::~ThreadConfig() { 28 | cleanup(); 29 | if(mTransSearcher != NULL){ 30 | delete mTransSearcher; 31 | mTransSearcher = NULL; 32 | } 33 | } 34 | 35 | void ThreadConfig::cleanup() { 36 | if(mOptions->split.enabled && mOptions->split.byFileNumber) 37 | writeEmptyFilesForSplitting(); 38 | deleteWriter(); 39 | } 40 | 41 | void ThreadConfig::deleteWriter() { 42 | if(mWriter1 != NULL) { 43 | delete mWriter1; 44 | mWriter1 = NULL; 45 | } 46 | if(mWriter2 != NULL) { 47 | delete mWriter2; 48 | mWriter2 = NULL; 49 | } 50 | } 51 | 52 | void ThreadConfig::initWriter(string filename1) { 53 | deleteWriter(); 54 | mWriter1 = new Writer(filename1, mOptions->compression); 55 | } 56 | 57 | void ThreadConfig::initWriter(string filename1, string filename2) { 58 | deleteWriter(); 59 | mWriter1 = new Writer(filename1, mOptions->compression); 60 | mWriter2 = new Writer(filename2, mOptions->compression); 61 | } 62 | 63 | void ThreadConfig::initWriter(ofstream* stream) { 64 | deleteWriter(); 65 | mWriter1 = new Writer(stream); 66 | } 67 | 68 | void ThreadConfig::initWriter(ofstream* stream1, ofstream* stream2) { 69 | deleteWriter(); 70 | mWriter1 = new Writer(stream1); 71 | mWriter2 = new Writer(stream2); 72 | } 73 | 74 | void ThreadConfig::initWriter(gzFile gzfile) { 75 | deleteWriter(); 76 | mWriter1 = new Writer(gzfile); 77 | } 78 | 79 | void ThreadConfig::initWriter(gzFile gzfile1, gzFile gzfile2) { 80 | deleteWriter(); 81 | mWriter1 = new Writer(gzfile1); 82 | mWriter2 = new Writer(gzfile2); 83 | } 84 | 85 | void ThreadConfig::addFilterResult(int result, int readNum) { 86 | mFilterResult->addFilterResult(result, readNum); 87 | 
} 88 | 89 | void ThreadConfig::addMergedPairs(int pairs) { 90 | mFilterResult->addMergedPairs(pairs); 91 | } 92 | 93 | void ThreadConfig::initWriterForSplit() { 94 | if(mOptions->out1.empty()) 95 | return ; 96 | 97 | // use 1-based naming 98 | string num = to_string(mWorkingSplit + 1); 99 | // padding for digits like 0001 100 | if(mOptions->split.digits > 0){ 101 | while(num.size() < mOptions->split.digits) 102 | num = "0" + num; 103 | } 104 | 105 | string filename1 = joinpath(dirname(mOptions->out1), num + "." + basename(mOptions->out1)); 106 | if(!mOptions->isPaired()) { 107 | initWriter(filename1); 108 | } else { 109 | string filename2 = joinpath(dirname(mOptions->out2), num + "." + basename(mOptions->out2)); 110 | initWriter(filename1, filename2); 111 | } 112 | } 113 | 114 | void ThreadConfig::markProcessed(long readNum) { 115 | mCurrentSplitReads += readNum; 116 | if(!mOptions->split.enabled) 117 | return ; 118 | // if splitting is enabled, check whether current file is full 119 | if(mCurrentSplitReads >= mOptions->split.size) { 120 | // if it's splitting by file number, totally we cannot exceed split.number 121 | // if it's splitting by file lines, then we don't need to check 122 | if(mOptions->split.byFileLines || mWorkingSplit + mOptions->thread < mOptions->split.number ){ 123 | mWorkingSplit += mOptions->thread; 124 | initWriterForSplit(); 125 | mCurrentSplitReads = 0; 126 | } else { 127 | // this thread can be stoped now since all its tasks are done 128 | // only a part of threads have to deal with the remaining reads 129 | if(mOptions->split.number % mOptions->thread >0 130 | && mThreadId >= mOptions->split.number % mOptions->thread) 131 | mCanBeStopped = true; 132 | } 133 | } 134 | } 135 | 136 | // if a task of writting N files is assigned to this thread, but the input file doesn't have so many reads to input 137 | // write some empty files so it will not break following pipelines 138 | void ThreadConfig::writeEmptyFilesForSplitting() { 139 | 
while(mWorkingSplit + mOptions->thread < mOptions->split.number) { 140 | mWorkingSplit += mOptions->thread; 141 | initWriterForSplit(); 142 | mCurrentSplitReads = 0; 143 | } 144 | } 145 | 146 | bool ThreadConfig::canBeStopped() { 147 | return mCanBeStopped; 148 | } 149 | -------------------------------------------------------------------------------- /src/threadconfig.h: -------------------------------------------------------------------------------- 1 | #ifndef THREAD_CONFIG_H 2 | #define THREAD_CONFIG_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "stats.h" 9 | #include "writer.h" 10 | #include "options.h" 11 | #include "filterresult.h" 12 | #include "transsearcher.hpp" 13 | #include "bwtfmiDB.h" 14 | 15 | using namespace std; 16 | 17 | class ThreadConfig{ 18 | public: 19 | ThreadConfig(Options* & opt, BwtFmiDB* & tbwtfmiDB, int threadId, bool paired = false); 20 | ~ThreadConfig(); 21 | inline Stats* getPreStats1() {return mPreStats1;} 22 | inline Stats* getPostStats1() {return mPostStats1;} 23 | inline Stats* getPreStats2() {return mPreStats2;} 24 | inline Stats* getPostStats2() {return mPostStats2;} 25 | inline Writer* getWriter1() {return mWriter1;} 26 | inline Writer* getWriter2() {return mWriter2;} 27 | inline FilterResult* getFilterResult() {return mFilterResult;} 28 | inline TransSearcher* getTransSearcher(){return mTransSearcher;} 29 | 30 | void initWriter(string filename1); 31 | void initWriter(string filename1, string filename2); 32 | void initWriter(ofstream* stream); 33 | void initWriter(ofstream* stream1, ofstream* stream2); 34 | void initWriter(gzFile gzfile); 35 | void initWriter(gzFile gzfile1, gzFile gzfile2); 36 | 37 | void addFilterResult(int result, int readNum); 38 | void addMergedPairs(int pairs); 39 | 40 | int getThreadId() {return mThreadId;} 41 | // for splitting output 42 | // increase mCurrentSplitReads by readNum, and check it with options->split.size; 43 | void markProcessed(long readNum); 44 | void 
initWriterForSplit(); 45 | bool canBeStopped(); 46 | void cleanup(); 47 | 48 | private: 49 | void deleteWriter(); 50 | void writeEmptyFilesForSplitting(); 51 | 52 | private: 53 | Stats* mPreStats1; 54 | Stats* mPostStats1; 55 | Stats* mPreStats2; 56 | Stats* mPostStats2; 57 | Writer* mWriter1; 58 | Writer* mWriter2; 59 | Options* mOptions; 60 | FilterResult* mFilterResult; 61 | 62 | // for splitting output 63 | int mThreadId; 64 | int mWorkingSplit; 65 | long mCurrentSplitReads; 66 | bool mCanBeStopped; 67 | 68 | TransSearcher* mTransSearcher; 69 | //BwtFmiDB* mBwtfmiDB; 70 | }; 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /src/threadsconfig2.cpp: -------------------------------------------------------------------------------- 1 | #include "threadsconfig2.h" 2 | #include "util.h" 3 | #include 4 | #include 5 | // Builds the per-thread writer state: a zeroed ring buffer of PACK_NUM_LIMIT slots plus an output file chosen by thread id (a target-gene file when threadId indexes targetGenesSubVec, otherwise the "undetermined" file). 6 | ThreadsConfig2::ThreadsConfig2(Options* & opt, int threadId) { 7 | mOptions = opt; 8 | mThreadId = threadId; 9 | mWriter1 = NULL; 10 | mInputCounter = 0; 11 | mOutputCounter = 0; 12 | mInputCompleted = false; 13 | mRingBuffer = new char*[PACK_NUM_LIMIT]; 14 | memset(mRingBuffer, 0, sizeof(char*) * PACK_NUM_LIMIT); 15 | mRingBufferSizes = new size_t[PACK_NUM_LIMIT]; 16 | memset(mRingBufferSizes, 0, sizeof(size_t) * PACK_NUM_LIMIT); // element type is size_t, not char* 17 | string fullpath = ""; 18 | if(mThreadId < mOptions->mSeqExtractions.targetGenesSubVec.size()){ 19 | auto filename = mOptions->mSeqExtractions.targetGenesSubVec[mThreadId] + mOptions->mSeqExtractions.suffix; 20 | fullpath = joinpath(mOptions->mSeqExtractions.outputDir, filename); 21 | } else { 22 | fullpath = mOptions->mSeqExtractions.undeterminedFileNameOut; 23 | } 24 | initWriter(fullpath); 25 | } 26 | // Releases the writer and the two ring-buffer arrays; only the arrays are freed here, not any pack still stored in a slot. 27 | ThreadsConfig2::~ThreadsConfig2() { 28 | cleanup(); 29 | if (mRingBuffer) { 30 | delete[] mRingBuffer; // allocated with new[] 31 | mRingBuffer = NULL; 32 | } 33 | if (mRingBufferSizes) { 34 | delete[] mRingBufferSizes; // allocated with new[] 35 | mRingBufferSizes = NULL; 36 | } 37 | } 38 | 39 | void 
ThreadsConfig2::initWriter(string filename1) { 40 | deleteWriter(); 41 | mWriter1 = new Writer(filename1, mOptions->compression); 42 | } 43 | 44 | void ThreadsConfig2::initWriter(ofstream* stream) { 45 | deleteWriter(); 46 | mWriter1 = new Writer(stream); 47 | } 48 | 49 | void ThreadsConfig2::initWriter(gzFile gzfile) { 50 | deleteWriter(); 51 | mWriter1 = new Writer(gzfile); 52 | } 53 | 54 | void ThreadsConfig2::deleteWriter() { 55 | if(mWriter1 != NULL){ 56 | delete mWriter1; 57 | mWriter1 = NULL; 58 | } 59 | } 60 | 61 | void ThreadsConfig2::cleanup(){ 62 | deleteWriter(); 63 | } 64 | 65 | bool ThreadsConfig2::isCompleted(){ 66 | return mInputCompleted && (mOutputCounter == mInputCounter); 67 | } 68 | 69 | void ThreadsConfig2::setInputCompleted(){ 70 | mInputCompleted = true; 71 | } 72 | 73 | void ThreadsConfig2::input(char* data, size_t size){ 74 | long target = mInputCounter % PACK_NUM_LIMIT; 75 | mRingBuffer[target] = data; 76 | mRingBufferSizes[target] = size; 77 | mInputCounter++; 78 | } 79 | 80 | void ThreadsConfig2::output(){ 81 | if(mOutputCounter >= mInputCounter){ 82 | usleep(100); 83 | } 84 | while(mOutputCounter < mInputCounter){ 85 | long target = mOutputCounter % PACK_NUM_LIMIT; 86 | mWriter1->write(mRingBuffer[target], mRingBufferSizes[target]); 87 | delete mRingBuffer[target]; 88 | mRingBuffer[target] = NULL; 89 | mOutputCounter++; 90 | } 91 | } -------------------------------------------------------------------------------- /src/threadsconfig2.h: -------------------------------------------------------------------------------- 1 | #ifndef THREADSCONFIG2_H 2 | #define THREADSCONFIG2_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "writer.h" 10 | #include "options.h" 11 | 12 | using namespace std; 13 | 14 | class ThreadsConfig2 { 15 | public: 16 | ThreadsConfig2(Options* & opt, int threadId); 17 | ~ThreadsConfig2(); 18 | 19 | void initWriter(string filename1); 20 | void initWriter(ofstream* stream); 21 | void 
initWriter(gzFile gzfile); 22 | 23 | int getThreadId(){return mThreadId;}; 24 | void cleanup(); 25 | 26 | bool isCompleted(); 27 | void output(); 28 | void input(char* data, size_t size); 29 | void setInputCompleted(); 30 | 31 | private: 32 | void deleteWriter(); 33 | 34 | private: 35 | Writer* mWriter1; 36 | Options* mOptions; 37 | 38 | int mThreadId; 39 | bool mInputCompleted; 40 | atomic_long mInputCounter; 41 | atomic_long mOutputCounter; 42 | char** mRingBuffer; 43 | size_t* mRingBufferSizes; 44 | }; 45 | 46 | #endif /* THREADSCONFIG2_H */ 47 | 48 | -------------------------------------------------------------------------------- /src/transsearcher.hpp: -------------------------------------------------------------------------------- 1 | #ifndef TRANSSEARCHER_HPP 2 | #define TRANSSEARCHER_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "util.h" 28 | #include "algo/blast/core/blast_seg.h" 29 | #include "algo/blast/core/blast_filter.h" 30 | #include "algo/blast/core/blast_encoding.h" 31 | #include "read.h" 32 | #include "options.h" 33 | #include "fragment.h" 34 | #include "options.h" 35 | #include "bwtfmiDB.h" 36 | #include "common.h" 37 | 38 | extern "C" { 39 | #include "bwt/bwt.h" 40 | } 41 | 42 | const double LN_2 = 0.6931471805; 43 | const double LAMBDA = 0.3176; 44 | const double LN_K = -2.009915479; 45 | 46 | class TransSearcher { 47 | protected: 48 | uint8_t codon_to_int(const char* codon); 49 | uint8_t revcomp_codon_to_int(const char* codon); 50 | 51 | uint8_t nuc2int[256]; 52 | uint8_t compnuc2int[256]; 53 | char codon2aa[256]; 54 | uint8_t aa2int[256]; 55 | 56 | std::map> blosum_subst; 57 | int8_t blosum62diag[20]; 58 | int8_t b62[20][20]; 59 | 60 | std::string translations[6]; 
61 | std::multimap> fragments; 62 | std::vector best_matches_SI; 63 | std::vector longest_matches_SI; 64 | std::vector best_matches; 65 | std::vector longest_fragments; 66 | 67 | unsigned int best_match_score = 0; 68 | double query_len; 69 | uint32_t read_count = 0; 70 | uint32 uniq_mapped_reads = 0; 71 | uint32 multi_mapped_reads = 0; 72 | 73 | void clearFragments(); 74 | unsigned int calcScore(const std::string &); 75 | unsigned int calcScore(const std::string &, int); 76 | unsigned int calcScore(const std::string &, size_t, size_t, int); 77 | void addAllMismatchVariantsAtPosSI(const Fragment *, unsigned int, size_t, SI *); // used in Greedy mode 78 | Fragment * getNextFragment(unsigned int); 79 | void eval_match_scores(SI *si, Fragment *); 80 | void getAllFragmentsBits(const std::string & line); 81 | void getLongestFragmentsBits(const std::string & line); 82 | void flush_output(); 83 | void preProcess(); 84 | void doProcess(); 85 | uint32 * postProcess(); 86 | 87 | protected: 88 | void classify_length(); 89 | void classify_greedyblosum(); 90 | void ids_from_SI(SI *); 91 | void ids_from_SI_recursive(SI *); 92 | std::set match_ids; 93 | std::set matched_genids; 94 | std::map tmpIdFreqMap; 95 | std::map idFreqSubMap; 96 | Options * mOptions; 97 | BwtFmiDB * tbwtfmiDB; 98 | 99 | public: 100 | TransSearcher(Options * & opt, BwtFmiDB * & mBwtfmiDB); 101 | void transSearch(Read * item, uint32* & orthId); 102 | void transSearch(Read * item1, Read * item2, uint32* & orthId); 103 | inline std::map getIdFreqSubMap(){return idFreqSubMap;}; 104 | static std::map merge(std::vector> & list); 105 | }; 106 | 107 | 108 | #endif /* TRANSSEARCHER_HPP */ 109 | -------------------------------------------------------------------------------- /src/umiprocessor.cpp: -------------------------------------------------------------------------------- 1 | #include "umiprocessor.h" 2 | 3 | UmiProcessor::UmiProcessor(Options* & opt){ 4 | mOptions = opt; 5 | } 6 | 7 | 8 | 
UmiProcessor::~UmiProcessor(){ 9 | } 10 | 11 | void UmiProcessor::process(Read* r1, Read* r2) { 12 | if(!mOptions->umi.enabled) 13 | return; 14 | 15 | string umi; 16 | if(mOptions->umi.location == UMI_LOC_INDEX1) 17 | umi = r1->firstIndex(); 18 | else if(mOptions->umi.location == UMI_LOC_INDEX2 && r2) 19 | umi = r2->lastIndex(); 20 | else if(mOptions->umi.location == UMI_LOC_READ1){ 21 | umi = r1->mSeq.mStr.substr(0, min(r1->length(), mOptions->umi.length)); 22 | r1->trimFront(umi.length() + mOptions->umi.skip); 23 | } 24 | else if(mOptions->umi.location == UMI_LOC_READ2 && r2){ 25 | umi = r2->mSeq.mStr.substr(0, min(r2->length(), mOptions->umi.length)); 26 | r2->trimFront(umi.length() + mOptions->umi.skip); 27 | } 28 | else if(mOptions->umi.location == UMI_LOC_PER_INDEX){ 29 | string umiMerged = r1->firstIndex(); 30 | if(r2) { 31 | umiMerged = umiMerged + "_" + r2->lastIndex(); 32 | } 33 | 34 | addUmiToName(r1, umiMerged); 35 | if(r2) { 36 | addUmiToName(r2, umiMerged); 37 | } 38 | } 39 | else if(mOptions->umi.location == UMI_LOC_PER_READ){ 40 | string umi1 = r1->mSeq.mStr.substr(0, min(r1->length(), mOptions->umi.length)); 41 | string umiMerged = umi1; 42 | r1->trimFront(umi1.length() + mOptions->umi.skip); 43 | if(r2){ 44 | string umi2 = r2->mSeq.mStr.substr(0, min(r2->length(), mOptions->umi.length)); 45 | umiMerged = umiMerged + "_" + umi2; 46 | r2->trimFront(umi2.length() + mOptions->umi.skip); 47 | } 48 | 49 | addUmiToName(r1, umiMerged); 50 | if(r2){ 51 | addUmiToName(r2, umiMerged); 52 | } 53 | } 54 | 55 | if(mOptions->umi.location != UMI_LOC_PER_INDEX && mOptions->umi.location != UMI_LOC_PER_READ) { 56 | if(r1 && !umi.empty()) 57 | addUmiToName(r1, umi); 58 | if(r2 && !umi.empty()) 59 | addUmiToName(r2, umi); 60 | } 61 | } 62 | 63 | void UmiProcessor::addUmiToName(Read* r, string umi){ 64 | string tag; 65 | if(mOptions->umi.prefix.empty()) 66 | tag = ":" + umi; 67 | else 68 | tag = ":" + mOptions->umi.prefix + "_" + umi; 69 | int spacePos = -1; 70 | 
for(int i=0; imName.length(); i++) { 71 | if(r->mName[i] == ' ') { 72 | spacePos = i; 73 | break; 74 | } 75 | } 76 | if(spacePos == -1) { 77 | r->mName = r->mName + tag; 78 | } else { 79 | r->mName = r->mName.substr(0, spacePos) + tag + r->mName.substr(spacePos, r->mName.length() - spacePos); 80 | } 81 | 82 | } 83 | 84 | 85 | bool UmiProcessor::test() { 86 | return true; 87 | } -------------------------------------------------------------------------------- /src/umiprocessor.h: -------------------------------------------------------------------------------- 1 | #ifndef UMI_PROCESSOR_H 2 | #define UMI_PROCESSOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "options.h" 8 | #include "read.h" 9 | 10 | using namespace std; 11 | 12 | class UmiProcessor{ 13 | public: 14 | UmiProcessor(Options* & opt); 15 | ~UmiProcessor(); 16 | void process(Read* r1, Read* r2 = NULL); 17 | void addUmiToName(Read* r, string umi); 18 | static bool test(); 19 | 20 | private: 21 | Options* mOptions; 22 | }; 23 | 24 | 25 | #endif -------------------------------------------------------------------------------- /src/unittest.cpp: -------------------------------------------------------------------------------- 1 | #include "unittest.h" 2 | #include "sequence.h" 3 | #include "fastqreader.h" 4 | #include "read.h" 5 | #include "overlapanalysis.h" 6 | #include "filter.h" 7 | #include "adaptertrimmer.h" 8 | #include "basecorrector.h" 9 | #include "polyx.h" 10 | #include "nucleotidetree.h" 11 | #include "evaluator.h" 12 | #include 13 | 14 | UnitTest::UnitTest(){ 15 | 16 | } 17 | 18 | void UnitTest::run(){ 19 | bool passed = true; 20 | passed &= report(Sequence::test(), "Sequence::test"); 21 | passed &= report(Read::test(), "Read::test"); 22 | passed &= report(OverlapAnalysis::test(), "OverlapAnalysis::test"); 23 | passed &= report(Filter::test(), "Filter::test"); 24 | passed &= report(AdapterTrimmer::test(), "AdapterTrimmer::test"); 25 | passed &= report(BaseCorrector::test(), 
"BaseCorrector::test"); 26 | passed &= report(PolyX::test(), "PolyX::test"); 27 | passed &= report(NucleotideTree::test(), "NucleotideTree::test"); 28 | passed &= report(Evaluator::test(), "Evaluator::test"); 29 | printf("\n==========================\n"); 30 | printf("%s\n\n", passed?"ALL PASSED":"FAILED"); 31 | } 32 | 33 | bool UnitTest::report(bool result, string message) { 34 | printf("%s:%s\n\n", message.c_str(), result?" PASSED":" FAILED"); 35 | return result; 36 | } -------------------------------------------------------------------------------- /src/unittest.h: -------------------------------------------------------------------------------- 1 | #ifndef UNIT_TEST_H 2 | #define UNIT_TEST_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | class UnitTest{ 11 | public: 12 | UnitTest(); 13 | void run(); 14 | bool report(bool result, string message); 15 | }; 16 | 17 | #endif -------------------------------------------------------------------------------- /src/writer.cpp: -------------------------------------------------------------------------------- 1 | #include "writer.h" 2 | #include "util.h" 3 | #include "fastqreader.h" 4 | #include 5 | 6 | Writer::Writer(string filename, int compression) { 7 | mCompression = compression; 8 | mFilename = filename; 9 | mZipFile = NULL; 10 | mZipped = false; 11 | haveToClose = true; 12 | init(); 13 | } 14 | 15 | Writer::Writer(ofstream* stream) { 16 | mZipFile = NULL; 17 | mZipped = false; 18 | mOutStream = stream; 19 | haveToClose = false; 20 | } 21 | 22 | Writer::Writer(gzFile gzfile) { 23 | mOutStream = NULL; 24 | mZipFile = gzfile; 25 | mZipped = true; 26 | haveToClose = false; 27 | } 28 | 29 | Writer::~Writer() { 30 | if (haveToClose) { 31 | close(); 32 | } 33 | } 34 | 35 | string Writer::filename() { 36 | return mFilename; 37 | } 38 | 39 | void Writer::init() { 40 | if (ends_with(mFilename, ".gz")) { 41 | mZipFile = gzopen(mFilename.c_str(), "w"); 42 | gzsetparams(mZipFile, mCompression, 
Z_DEFAULT_STRATEGY); 43 | gzbuffer(mZipFile, 1024 * 1024); 44 | mZipped = true; 45 | } else { 46 | mOutStream = new ofstream(); 47 | mOutStream->open(mFilename.c_str(), ifstream::out); 48 | mZipped = false; 49 | } 50 | } 51 | 52 | bool Writer::writeLine(string& linestr) { 53 | const char* line = linestr.c_str(); 54 | size_t size = linestr.length(); 55 | size_t written; 56 | bool status; 57 | if (mZipped) { 58 | written = gzwrite(mZipFile, line, size); 59 | gzputc(mZipFile, '\n'); 60 | status = size == written; 61 | } else { 62 | mOutStream->write(line, size); 63 | mOutStream->put('\n'); 64 | status = !mOutStream->fail(); 65 | } 66 | 67 | return status; 68 | } 69 | 70 | bool Writer::writeString(string& str) { 71 | const char* strdata = str.c_str(); 72 | size_t size = str.length(); 73 | size_t written; 74 | bool status; 75 | if (mZipped) { 76 | written = gzwrite(mZipFile, strdata, size); 77 | status = size == written; 78 | } else { 79 | mOutStream->write(strdata, size); 80 | status = !mOutStream->fail(); 81 | } 82 | 83 | return status; 84 | } 85 | 86 | bool Writer::write(char* strdata, size_t size) { 87 | size_t written; 88 | bool status; 89 | 90 | if (mZipped) { 91 | written = gzwrite(mZipFile, strdata, size); 92 | status = size == written; 93 | } else { 94 | mOutStream->write(strdata, size); 95 | status = !mOutStream->fail(); 96 | } 97 | return status; 98 | } 99 | 100 | void Writer::close() { 101 | if (mZipped) { 102 | if (mZipFile) { 103 | gzflush(mZipFile, Z_FINISH); 104 | gzclose(mZipFile); 105 | mZipFile = NULL; 106 | } 107 | } else if (mOutStream) { 108 | if (mOutStream->is_open()) { 109 | mOutStream->flush(); 110 | //TODO: following two lines will cause crash 111 | //mOutStream->close(); 112 | //delete mOutStream; 113 | mOutStream = NULL; 114 | } 115 | } 116 | } 117 | 118 | bool Writer::isZipped() { 119 | return mZipped; 120 | } -------------------------------------------------------------------------------- /src/writer.h: 
-------------------------------------------------------------------------------- 1 | #ifndef _WRITER_H 2 | #define _WRITER_H 3 | 4 | #include 5 | #include 6 | #ifdef DYNAMIC_ZLIB 7 | #include 8 | #else 9 | #include "zlib/zlib.h" 10 | #endif 11 | #include "common.h" 12 | #include 13 | #include 14 | 15 | using namespace std; 16 | 17 | class Writer{ 18 | public: 19 | Writer(string filename, int compression = 3); 20 | Writer(ofstream* stream); 21 | Writer(gzFile gzfile); 22 | ~Writer(); 23 | bool isZipped(); 24 | bool writeString(string& s); 25 | bool writeLine(string& linestr); 26 | bool write(char* strdata, size_t size); 27 | string filename(); 28 | 29 | public: 30 | static bool test(); 31 | 32 | private: 33 | void init(); 34 | void close(); 35 | 36 | private: 37 | string mFilename; 38 | gzFile mZipFile; 39 | ofstream* mOutStream; 40 | bool mZipped; 41 | int mCompression; 42 | bool haveToClose; 43 | }; 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/writerthread.cpp: -------------------------------------------------------------------------------- 1 | #include "writerthread.h" 2 | #include "util.h" 3 | #include 4 | #include 5 | 6 | WriterThread::WriterThread(Options* & opt, string filename) { 7 | mOptions = opt; 8 | mWriter1 = NULL; 9 | mInputCounter = 0; 10 | mOutputCounter = 0; 11 | mInputCompleted = false; 12 | mFilename = filename; 13 | mRingBuffer = new char*[PACK_NUM_LIMIT]; 14 | memset(mRingBuffer, 0, sizeof (char*) * PACK_NUM_LIMIT); 15 | mRingBufferSizes = new size_t[PACK_NUM_LIMIT]; 16 | memset(mRingBufferSizes, 0, sizeof (size_t) * PACK_NUM_LIMIT); 17 | initWriter(filename); 18 | } 19 | 20 | WriterThread::~WriterThread() { 21 | cleanup(); 22 | delete[] mRingBuffer; 23 | delete[] mRingBufferSizes; 24 | } 25 | 26 | bool WriterThread::isCompleted() { 27 | return mInputCompleted && (mOutputCounter == mInputCounter); 28 | } 29 | 30 | bool WriterThread::setInputCompleted() { 31 | mInputCompleted = true; 
32 | return true; 33 | } 34 | 35 | void WriterThread::output() { 36 | if (mOutputCounter >= mInputCounter) { 37 | usleep(100); 38 | } 39 | while (mOutputCounter < mInputCounter) { 40 | mWriter1->write(mRingBuffer[mOutputCounter], mRingBufferSizes[mOutputCounter]); 41 | delete[] mRingBuffer[mOutputCounter]; 42 | mRingBuffer[mOutputCounter] = NULL; 43 | mOutputCounter++; 44 | } 45 | } 46 | 47 | void WriterThread::input(char* data, size_t size) { 48 | mRingBuffer[mInputCounter] = data; 49 | mRingBufferSizes[mInputCounter] = size; 50 | mInputCounter++; 51 | } 52 | 53 | void WriterThread::cleanup() { 54 | deleteWriter(); 55 | } 56 | 57 | void WriterThread::deleteWriter() { 58 | if (mWriter1 != NULL) { 59 | delete mWriter1; 60 | mWriter1 = NULL; 61 | } 62 | } 63 | 64 | void WriterThread::initWriter(string filename1) { 65 | deleteWriter(); 66 | mWriter1 = new Writer(filename1, mOptions->compression); 67 | } 68 | 69 | void WriterThread::initWriter(ofstream* stream) { 70 | deleteWriter(); 71 | mWriter1 = new Writer(stream); 72 | } 73 | 74 | void WriterThread::initWriter(gzFile gzfile) { 75 | deleteWriter(); 76 | mWriter1 = new Writer(gzfile); 77 | } 78 | 79 | long WriterThread::bufferLength() { 80 | return mInputCounter - mOutputCounter; 81 | } 82 | -------------------------------------------------------------------------------- /src/writerthread.h: -------------------------------------------------------------------------------- 1 | #ifndef WRITER_THREAD_H 2 | #define WRITER_THREAD_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "writer.h" 9 | #include "options.h" 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | class WriterThread{ 16 | public: 17 | WriterThread(Options* & opt, string filename); 18 | ~WriterThread(); 19 | 20 | void initWriter(string filename1); 21 | void initWriter(ofstream* stream); 22 | void initWriter(gzFile gzfile); 23 | 24 | void cleanup(); 25 | 26 | bool isCompleted(); 27 | void output(); 28 | void 
input(char* data, size_t size); 29 | bool setInputCompleted(); 30 | 31 | long bufferLength(); 32 | string getFilename() {return mFilename;} 33 | 34 | private: 35 | void deleteWriter(); 36 | 37 | private: 38 | Writer* mWriter1; 39 | Options* mOptions; 40 | string mFilename; 41 | 42 | // for spliting output 43 | bool mInputCompleted; 44 | atomic_long mInputCounter; 45 | atomic_long mOutputCounter; 46 | char** mRingBuffer; 47 | size_t* mRingBufferSizes; 48 | 49 | mutex mtx; 50 | 51 | }; 52 | 53 | #endif -------------------------------------------------------------------------------- /src/zlib/inffast.h: -------------------------------------------------------------------------------- 1 | /* inffast.h -- header to use inffast.c 2 | * Copyright (C) 1995-2003, 2010 Mark Adler 3 | * For conditions of distribution and use, see copyright notice in zlib.h 4 | */ 5 | 6 | /* WARNING: this file should *not* be used by applications. It is 7 | part of the implementation of the compression library and is 8 | subject to change. Applications should only use zlib.h. 9 | */ 10 | 11 | void ZLIB_INTERNAL inflate_fast OF((z_streamp strm, unsigned start)); 12 | -------------------------------------------------------------------------------- /src/zlib/inftrees.h: -------------------------------------------------------------------------------- 1 | /* inftrees.h -- header to use inftrees.c 2 | * Copyright (C) 1995-2005, 2010 Mark Adler 3 | * For conditions of distribution and use, see copyright notice in zlib.h 4 | */ 5 | 6 | /* WARNING: this file should *not* be used by applications. It is 7 | part of the implementation of the compression library and is 8 | subject to change. Applications should only use zlib.h. 9 | */ 10 | 11 | /* Structure for decoding tables. Each entry provides either the 12 | information needed to do the operation requested by the code that 13 | indexed that table entry, or it provides a pointer to another 14 | table that indexes more bits of the code. 
op indicates whether 15 | the entry is a pointer to another table, a literal, a length or 16 | distance, an end-of-block, or an invalid code. For a table 17 | pointer, the low four bits of op is the number of index bits of 18 | that table. For a length or distance, the low four bits of op 19 | is the number of extra bits to get after the code. bits is 20 | the number of bits in this code or part of the code to drop off 21 | of the bit buffer. val is the actual byte to output in the case 22 | of a literal, the base length or distance, or the offset from 23 | the current table to the next table. Each entry is four bytes. */ 24 | typedef struct { 25 | unsigned char op; /* operation, extra bits, table bits */ 26 | unsigned char bits; /* bits in this part of the code */ 27 | unsigned short val; /* offset in table or code value */ 28 | } code; 29 | 30 | /* op values as set by inflate_table(): 31 | 00000000 - literal 32 | 0000tttt - table link, tttt != 0 is the number of table index bits 33 | 0001eeee - length or distance, eeee is the number of extra bits 34 | 01100000 - end of block 35 | 01000000 - invalid code 36 | */ 37 | 38 | /* Maximum size of the dynamic table. The maximum number of code structures is 39 | 1444, which is the sum of 852 for literal/length codes and 592 for distance 40 | codes. These values were found by exhaustive searches using the program 41 | examples/enough.c found in the zlib distribution. The arguments to that 42 | program are the number of symbols, the initial root table size, and the 43 | maximum bit length of a code. "enough 286 9 15" for literal/length codes 44 | returns 852, and "enough 30 6 15" for distance codes returns 592. 45 | The initial root table size (9 or 6) is found in the fifth argument of the 46 | inflate_table() calls in inflate.c and infback.c. If the root table size is 47 | changed, then these maximum sizes would need to be recalculated and 48 | updated. 
*/ 49 | #define ENOUGH_LENS 852 50 | #define ENOUGH_DISTS 592 51 | #define ENOUGH (ENOUGH_LENS+ENOUGH_DISTS) /* 852 + 592 = 1444, the total quoted in the sizing comment above */ 52 | 53 | /* Type of code to build for inflate_table() */ 54 | typedef enum { 55 | CODES, 56 | LENS, 57 | DISTS 58 | } codetype; 59 | 60 | /* Builds a decoding table of `code` entries for the requested codetype. Per the sizing comment above, the fifth argument (bits) is where callers pass the initial root table size (9 for literal/length codes, 6 for distance codes). The remaining parameters are defined by inftrees.c, which is not part of this header. */ int ZLIB_INTERNAL inflate_table OF((codetype type, unsigned short FAR *lens, 61 | unsigned codes, code FAR * FAR *table, 62 | unsigned FAR *bits, unsigned short FAR *work)); 63 | -------------------------------------------------------------------------------- /testdata/D1.CE2-S4-LT_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xia-lab/Seq2Fun/32a89d0ee8c47764e54d1f259d3831e8ca0fd2f5/testdata/D1.CE2-S4-LT_R1.fastq.gz -------------------------------------------------------------------------------- /testdata/D1.CE2-S4-LT_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xia-lab/Seq2Fun/32a89d0ee8c47764e54d1f259d3831e8ca0fd2f5/testdata/D1.CE2-S4-LT_R2.fastq.gz -------------------------------------------------------------------------------- /testdata/D2.CE2-H2-LT_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xia-lab/Seq2Fun/32a89d0ee8c47764e54d1f259d3831e8ca0fd2f5/testdata/D2.CE2-H2-LT_R1.fastq.gz -------------------------------------------------------------------------------- /testdata/D2.CE2-H2-LT_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xia-lab/Seq2Fun/32a89d0ee8c47764e54d1f259d3831e8ca0fd2f5/testdata/D2.CE2-H2-LT_R2.fastq.gz -------------------------------------------------------------------------------- /testdata/sample.txt: -------------------------------------------------------------------------------- 1 | D1.CE2-S4-LT D1.CE2-S4-LT_R1.fastq.gz D1.CE2-S4-LT_R2.fastq.gz control 2 | D2.CE2-H2-LT 
D2.CE2-H2-LT_R1.fastq.gz D2.CE2-H2-LT_R2.fastq.gz high 3 | --------------------------------------------------------------------------------