├── INSTALL
├── README.md
├── bin
    ├── mkbwt
    ├── mkfmi
    └── seq2fun
├── database
    ├── ko_fullname.txt
    ├── org_species.txt
    ├── pathway_hierarchy
    ├── pathway_ko.txt
    └── pathway_ko_stats.txt
├── src
    ├── adaptertrimmer.cpp
    ├── adaptertrimmer.h
    ├── basecorrector.cpp
    ├── basecorrector.h
    ├── bwt
    │   ├── Makefile
    │   ├── bwt.c
    │   ├── bwt.h
    │   ├── common.h
    │   ├── compactfmi.c
    │   ├── compactfmi.h
    │   ├── fmi.h
    │   ├── fmicommon.h
    │   ├── mkbwt.c
    │   ├── mkbwt_vars.h
    │   ├── mkfmi.c
    │   ├── mkfmi_vars.h
    │   ├── multikeyqsort.c
    │   ├── multikeyqsort.h
    │   ├── readFasta.c
    │   ├── readFasta.h
    │   ├── sequence.c
    │   ├── sequence.h
    │   ├── suffixArray.c
    │   └── suffixArray.h
    ├── bwtfmiDB.cpp
    ├── bwtfmiDB.h
    ├── cmdline.h
    ├── common.h
    ├── duplicate.cpp
    ├── duplicate.h
    ├── evaluator.cpp
    ├── evaluator.h
    ├── fastareader.cpp
    ├── fastareader.h
    ├── fastqreader.cpp
    ├── fastqreader.h
    ├── filter.cpp
    ├── filter.h
    ├── filterresult.cpp
    ├── filterresult.h
    ├── fragment.cpp
    ├── fragment.h
    ├── htmlreporter.cpp
    ├── htmlreporter.h
    ├── htmlreporterall.cpp
    ├── htmlreporterall.h
    ├── include
    │   └── ncbi-blast+
    │   │   ├── algo
    │   │       └── blast
    │   │       │   ├── composition_adjustment
    │   │       │       ├── composition_constants.h
    │   │       │       ├── matrix_frequency_data.c
    │   │       │       └── matrix_frequency_data.h
    │   │       │   └── core
    │   │       │       ├── blast_def.h
    │   │       │       ├── blast_dynarray.c
    │   │       │       ├── blast_dynarray.h
    │   │       │       ├── blast_encoding.c
    │   │       │       ├── blast_encoding.h
    │   │       │       ├── blast_export.h
    │   │       │       ├── blast_filter.c
    │   │       │       ├── blast_filter.h
    │   │       │       ├── blast_hits.h
    │   │       │       ├── blast_hits_priv.h
    │   │       │       ├── blast_hspfilter.h
    │   │       │       ├── blast_message.c
    │   │       │       ├── blast_message.h
    │   │       │       ├── blast_options.c
    │   │       │       ├── blast_options.h
    │   │       │       ├── blast_parameters.h
    │   │       │       ├── blast_posit.c
    │   │       │       ├── blast_posit.h
    │   │       │       ├── blast_program.c
    │   │       │       ├── blast_program.h
    │   │       │       ├── blast_psi.c
    │   │       │       ├── blast_psi.h
    │   │       │       ├── blast_psi_priv.c
    │   │       │       ├── blast_psi_priv.h
    │   │       │       ├── blast_query_info.c
    │   │       │       ├── blast_query_info.h
    │   │       │       ├── blast_rps.h
    │   │       │       ├── blast_seg.c
    │   │       │       ├── blast_seg.h
    │   │       │       ├── blast_seqsrc.h
    │   │       │       ├── blast_stat.c
    │   │       │       ├── blast_stat.h
    │   │       │       ├── blast_toolkit.h
    │   │       │       ├── blast_util.c
    │   │       │       ├── blast_util.h
    │   │       │       ├── gapinfo.h
    │   │       │       ├── hspfilter_besthit.h
    │   │       │       ├── hspfilter_collector.c
    │   │       │       ├── hspfilter_collector.h
    │   │       │       ├── lookup_wrap.h
    │   │       │       ├── matrix_freq_ratios.c
    │   │       │       ├── matrix_freq_ratios.h
    │   │       │       ├── ncbi_erf.c
    │   │       │       ├── ncbi_math.c
    │   │       │       ├── ncbi_math.h
    │   │       │       ├── ncbi_std.c
    │   │       │       ├── ncbi_std.h
    │   │       │       ├── pattern.c
    │   │       │       ├── pattern.h
    │   │       │       └── pattern_priv.h
    │   │   ├── common
    │   │       ├── ncbi_skew_guard.h
    │   │       └── ncbiconf_impl.h
    │   │   ├── connect
    │   │       ├── connect_export.h
    │   │       ├── ncbi_core.h
    │   │       └── ncbi_types.h
    │   │   ├── corelib
    │   │       ├── ncbitype.h
    │   │       └── ncbitype.h.dmnd
    │   │   ├── ncbiconf.h
    │   │   ├── ncbiconf_unix.h
    │   │   └── util
    │   │       └── tables
    │   │           ├── raw_scoremat.c
    │   │           ├── raw_scoremat.h
    │   │           ├── sm_blosum45.c
    │   │           ├── sm_blosum50.c
    │   │           ├── sm_blosum62.c
    │   │           ├── sm_blosum80.c
    │   │           ├── sm_blosum90.c
    │   │           ├── sm_identity.c
    │   │           ├── sm_pam250.c
    │   │           ├── sm_pam30.c
    │   │           ├── sm_pam70.c
    │   │           └── tables_export.h
    ├── jsonreporter.cpp
    ├── jsonreporter.h
    ├── knownadapters.h
    ├── makefile
    ├── nucleotidetree.cpp
    ├── nucleotidetree.h
    ├── options.cpp
    ├── options.h
    ├── overlapanalysis.cpp
    ├── overlapanalysis.h
    ├── peprocessor.cpp
    ├── peprocessor.h
    ├── polyx.cpp
    ├── polyx.h
    ├── processor.cpp
    ├── processor.h
    ├── read.cpp
    ├── read.h
    ├── seprocessor.cpp
    ├── seprocessor.h
    ├── seq2fun.cpp
    ├── seqtract
    ├── seqtract.cpp
    ├── seqtractpeprocessor.cpp
    ├── seqtractpeprocessor.h
    ├── sequence.cpp
    ├── sequence.h
    ├── stats.cpp
    ├── stats.h
    ├── threadconfig.cpp
    ├── threadconfig.h
    ├── threadsconfig2.cpp
    ├── threadsconfig2.h
    ├── transsearcher.cpp
    ├── transsearcher.hpp
    ├── umiprocessor.cpp
    ├── umiprocessor.h
    ├── unittest.cpp
    ├── unittest.h
    ├── util.h
    ├── writer.cpp
    ├── writer.h
    ├── writerthread.cpp
    ├── writerthread.h
    └── zlib
    │   ├── crc32.h
    │   ├── deflate.h
    │   ├── gzguts.h
    │   ├── inffast.h
    │   ├── inffixed.h
    │   ├── inflate.h
    │   ├── inftrees.h
    │   ├── trees.h
    │   ├── zconf.h
    │   ├── zlib.h
    │   └── zutil.h
└── testdata
    ├── D1.CE2-S4-LT_R1.fastq.gz
    ├── D1.CE2-S4-LT_R2.fastq.gz
    ├── D2.CE2-H2-LT_R1.fastq.gz
    ├── D2.CE2-H2-LT_R2.fastq.gz
    ├── example_annotation.txt
    └── sample.txt


/bin/mkbwt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xia-lab/Seq2Fun/32a89d0ee8c47764e54d1f259d3831e8ca0fd2f5/bin/mkbwt


--------------------------------------------------------------------------------
/bin/mkfmi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xia-lab/Seq2Fun/32a89d0ee8c47764e54d1f259d3831e8ca0fd2f5/bin/mkfmi


--------------------------------------------------------------------------------
/bin/seq2fun:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xia-lab/Seq2Fun/32a89d0ee8c47764e54d1f259d3831e8ca0fd2f5/bin/seq2fun


--------------------------------------------------------------------------------
/src/adaptertrimmer.h:
--------------------------------------------------------------------------------
 1 | #ifndef ADAPTER_TRIMMER_H
 2 | #define ADAPTER_TRIMMER_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include "overlapanalysis.h"
 8 | #include "filterresult.h"
 9 | #include "options.h"
10 | 
11 | using namespace std;
12 | 
// Adapter / poly-A trimming entry points for Read objects.
// Every operation is a static member function, so no instance is needed to
// call them; only the declarations are visible in this header.
// NOTE(review): vector<string> is used below but <vector> is not included
// directly here — presumably it arrives through one of the project headers.
class AdapterTrimmer{
public:
    AdapterTrimmer();
    ~AdapterTrimmer();

    // Paired-read trimming driven by overlap thresholds (diffLimit,
    // overlapRequire, diffPercentLimit).
    // NOTE(review): the bool result presumably reports whether anything was
    // trimmed — confirm in adaptertrimmer.cpp.
    static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit);
    // Same operation, but taking an already computed OverlapResult; the two
    // frontTrimmed arguments default to 0.
    static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov, int frontTrimmed1 = 0, int frontTrimmed2 = 0);
    // Single-read trimming against one adapter string; matchReq defaults to 4.
    static bool trimBySequence(Read* r1, FilterResult* fr, string& adapter, bool isR2 = false, int matchReq = 4);
    // Single-read trimming against a list of adapter strings.
    static bool trimByMultiSequences(Read* r1, FilterResult* fr, vector<string>& adapterList, bool isR2 = false, bool incTrimmedCounter = true);
    
    // Poly-A trimming of a single read.
    static bool trimPolyA(Read* r1, FilterResult* fr, bool isR2 = false, bool incTrimmedCounter = true);
    
    // Self-test hook.
    static bool test();


};
29 | 
30 | 
31 | #endif


--------------------------------------------------------------------------------
/src/basecorrector.cpp:
--------------------------------------------------------------------------------
  1 | #include "basecorrector.h"
  2 | #include "util.h"
  3 | 
  4 | BaseCorrector::BaseCorrector(){
  5 | }
  6 | 
  7 | 
  8 | BaseCorrector::~BaseCorrector(){
  9 | }
 10 | 
 11 | int BaseCorrector::correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit) {
 12 |     OverlapResult ov = OverlapAnalysis::analyze(r1, r2, diffLimit, overlapRequire, diffPercentLimit);
 13 |     return correctByOverlapAnalysis(r1, r2, fr, ov);
 14 | }
 15 | 
 16 | int BaseCorrector::correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov) {
 17 |     // we only correct overlap
 18 |     if(ov.diff == 0 || !ov.overlapped)
 19 |         return 0;
 20 | 
 21 |     int ol = ov.overlap_len;
 22 |     int start1 = max(0, ov.offset);
 23 |     int start2 = r2->length() -  max(0, -ov.offset) - 1;
 24 | 
 25 |     const char* seq1 = r1->mSeq.mStr.c_str();
 26 |     const char* seq2 = r2->mSeq.mStr.c_str();
 27 |     const char* qual1 = r1->mQuality.c_str();
 28 |     const char* qual2 = r2->mQuality.c_str();
 29 | 
 30 |     const char GOOD_QUAL = num2qual(30);
 31 |     const char BAD_QUAL = num2qual(14);
 32 | 
 33 |     int corrected = 0;
 34 |     int uncorrected = 0;
 35 |     bool r1Corrected = false;
 36 |     bool r2Corrected = false;
 37 |     for(int i=0; i<ol; i++) {
 38 |         int p1 = start1 + i;
 39 |         int p2 = start2 - i;
 40 | 
 41 |         if(seq1[p1] != complement(seq2[p2])) {
 42 |             if(qual1[p1] >= GOOD_QUAL && qual2[p2] <= BAD_QUAL) {
 43 |                 // use R1
 44 |                 r2->mSeq.mStr[p2] = complement(seq1[p1]);
 45 |                 r2->mQuality[p2] = qual1[p1];
 46 |                 corrected++;
 47 |                 r2Corrected = true;
 48 |                 if(fr) {
 49 |                     fr->addCorrection(seq2[p2], complement(seq1[p1]));
 50 |                 }
 51 |             } else if(qual2[p2] >= GOOD_QUAL && qual1[p1] <= BAD_QUAL) {
 52 |                 // use R2
 53 |                 r1->mSeq.mStr[p1] = complement(seq2[p2]);
 54 |                 r1->mQuality[p1] = qual2[p2];
 55 |                 corrected++;
 56 |                 r1Corrected = true;
 57 |                 if(fr) {
 58 |                     fr->addCorrection(seq1[p1], complement(seq2[p2]));
 59 |                 }
 60 |             } else {
 61 |                 uncorrected++;
 62 |             }
 63 |         }
 64 |     }
 65 | 
 66 |     // should never happen
 67 |     if(uncorrected + corrected != ov.diff) {
 68 |         static bool warned = false;
 69 |         if(!warned){
 70 |             cerr << "WARNING: the algorithm is wrong! uncorrected + corrected != ov.diff" << endl;
 71 |             warned = true;
 72 |         }
 73 |     }
 74 | 
 75 |     if(corrected > 0 && fr) {
 76 |         if(r1Corrected && r2Corrected)
 77 |             fr->incCorrectedReads(2);
 78 |         else
 79 |             fr->incCorrectedReads(1);
 80 |     }
 81 | 
 82 |     return corrected;
 83 | }
 84 | 
 85 | bool BaseCorrector::test() {
 86 |     Read r1("@name",
 87 |         "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCACGGGG",
 88 |         "+",
 89 |         "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEE");
 90 |     Read r2("@name",
 91 |         "AAAAAAAAAACCCCGGGGAAAATTTTAAAATTGGGGGGGGGGTGGGGGGGGGGGGG",
 92 |         "+",
 93 |         "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEE");
 94 | 
 95 |     correctByOverlapAnalysis(&r1, &r2, NULL, 5, 30, 0.2);
 96 | 
 97 |     if(r1.mSeq.mStr != "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCCCGGGG")
 98 |         return false;
 99 |     if(r2.mSeq.mStr != "AAAAAAAAAACCCCGGGGAAAATTTTAAAATTGGGGGGGGGGGGGGGGGGGGGGGG")
100 |         return false;
101 |     if(r1.mQuality != "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE")
102 |         return false;
103 |     if(r2.mQuality != "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE")
104 |         return false;
105 | 
106 |     return true;
107 | }


--------------------------------------------------------------------------------
/src/basecorrector.h:
--------------------------------------------------------------------------------
 1 | #ifndef BASE_CORRECTOR_H
 2 | #define BASE_CORRECTOR_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include "overlapanalysis.h"
 8 | #include "filterresult.h"
 9 | #include "options.h"
10 | 
11 | using namespace std;
12 | 
// Overlap-based base correction for read pairs.
// All operations are static member functions; no instance is required.
class BaseCorrector{
public:
    BaseCorrector();
    ~BaseCorrector();

    // Correction driven by overlap thresholds (diffLimit, overlapRequire,
    // diffPercentLimit). Returns an int.
    // NOTE(review): presumably the number of corrected bases — confirm in
    // basecorrector.cpp.
    static int correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit);
    // Same operation, taking an already computed OverlapResult.
    static int correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov);
    // Self-test hook.
    static bool test();
};
22 | 
23 | 
24 | #endif


--------------------------------------------------------------------------------
/src/bwt/Makefile:
--------------------------------------------------------------------------------
 1 | CC = gcc
 2 | #CFLAGS  = -g
 3 | CFLAGS = -O3 -g -Wno-unused-result
 4 | LDLIBS = -lpthread -lm
 5 | 
 6 | ifeq ($(uname -s), "Darwin")
 7 | LD_LIBS_STATIC = -Wl,-all_load -lpthread -Wl,-noall_load -lm
 8 | else
 9 | LD_LIBS_STATIC = -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -lm
10 | endif
11 | 
12 | all: mkbwt mkfmi Makefile
13 | 
14 | mkbwt: mkbwt.o readFasta.o suffixArray.o multikeyqsort.o sequence.o
15 | 
16 | mkfmi: mkfmi.o bwt.o suffixArray.o compactfmi.o
17 | 
18 | mkbwt.o: mkbwt_vars.h mkbwt.c common.h multikeyqsort.h sequence.h
19 | 
20 | mkfmi.o: mkfmi_vars.h mkfmi.c fmi.h common.h
21 | 
22 | sequence.o: sequence.h common.h
23 | 
24 | readFasta.o: readFasta.c readFasta.h sequence.h common.h
25 | 
26 | compactfmi.o: compactfmi.c compactfmi.h common.h fmicommon.h
27 | 
28 | suffixArray.o: suffixArray.c suffixArray.h common.h sequence.h
29 | 
30 | bwt.o: bwt.c bwt.h fmi.h common.h
31 | 
32 | multikeyqsort.o: multikeyqsort.c multikeyqsort.h
33 | 
34 | clean:
35 | 	rm -f mkfmi mkbwt
36 | 
37 | static: LDFLAGS = -static
38 | static: LDLIBS = $(LD_LIBS_STATIC)
39 | static: all
40 | 
41 | debug: all
42 | 
43 | .PHONY: clean static debug
44 | 


--------------------------------------------------------------------------------
/src/bwt/bwt.h:
--------------------------------------------------------------------------------
 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh,
 2 |  * Kaiju is licensed under the GPLv3, see the file LICENSE. */
 3 | #ifndef BWT_h
 4 | #define BWT_h
 5 | 
 6 | #include "common.h"
 7 | #include "fmi.h"
 8 | #include "suffixArray.h"
 9 | 
/* One BWT together with its alphabet and the index structures built on it. */
typedef struct {
  IndexType len;      // Length of bwt (not counting initial zeros)
  int nseq;           // Reported by mkfmi as the number of sequences read
  uchar *bwt;         // BWT data; handed to makeIndex() by mkfmi

  // Alphabet
  int alen;           // Alphabet length; handed to makeIndex() by mkfmi
  char *alphabet;     // Printed by mkfmi with %s, i.e. used as a C string there

  FMI *f;             // FM index; mkfmi stores the result of makeIndex() here
  suffixArray *s;     // Suffix array; mkfmi stores read_suffixArray_header() here

} BWT;
23 | 
24 | 
/* Suffix-interval record; the two self-referential pointers allow records to
   be chained (recursive_free_SI() takes one of these as its root). */
typedef struct _SI_ {
  IndexType start;  // Start of suffix interval
  int len;          // Interval length
  int qi;           // Position in query
  int ql;           // Length in query (if relevant)
  int count;        // Used to count matches below current
  int score;        // NOTE(review): meaning not visible in this header
  struct _SI_ *next;     // NOTE(review): presumably the next record in a list — confirm in bwt.c
  struct _SI_ *samelen;  // NOTE(review): presumably links records of equal length — confirm in bwt.c
} SI;
35 | 
36 | 
37 | 
38 | /* FUNCTION PROTOTYPES BEGIN  ( by funcprototypes.pl ) */
39 | void write_BWT_header(BWT *b, FILE *bwtfile);
40 | BWT *read_BWT(FILE *bwtfile);
41 | BWT *readIndexes(FILE *fp);
42 | void get_suffix(FMI *fmi, suffixArray *s, IndexType i, int *iseq, IndexType *pos);
43 | uchar *retrieve_seq(int snum, BWT *b);
44 | IndexType InitialSI(FMI *f, uchar ct, IndexType *si);
45 | IndexType UpdateSI(FMI *f, uchar ct, IndexType *si, IndexType *newsi);
46 | void recursive_free_SI(SI *si);
47 | SI *maxMatches(FMI *f, char *str, int len, int L, int max_matches);
48 | SI *maxMatches_withStart(FMI *f, char *str, int len, int L, int max_matches, IndexType si0, IndexType si1, int offset);
49 | SI *greedyExact(FMI *f, char *str, int len, int L, int jump);
50 | /* FUNCTION PROTOTYPES END */
51 | 
52 | #endif
53 | 


--------------------------------------------------------------------------------
/src/bwt/common.h:
--------------------------------------------------------------------------------
 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh,
 2 |  * Kaiju is licensed under the GPLv3, see the file LICENSE. */
 3 | #ifndef COMMON_h
 4 | #define COMMON_h
 5 | 
#include <stdio.h>
#include <stdlib.h>
 7 | 
 8 | typedef unsigned char uchar;
 9 | typedef unsigned short int ushort;
10 | typedef unsigned int uint;
11 | typedef long int IndexType;
12 | 
/* Print `text` followed by a newline on stderr, then terminate the process
   with exit status `errornum`. Does not return. */
static void ERROR(char *text, int errornum)
{
  (void)fprintf(stderr, "%s\n", text);
  exit(errornum);
}
17 | 
/* Print a formatted error message on stderr and terminate the process.

   format   : printf-style format containing a single %s conversion for
              `text`. It was previously ignored, so only `text` was printed.
              NULL falls back to printing `text` alone.
   text     : argument substituted into `format`.
   errornum : exit status.

   A newline is appended unless `format` already ends with one, so the
   message is always terminated as before. Does not return. */
static void ERRORs(char *format, char *text, int errornum) {
  if (format == NULL || *format == '\0') {
    fprintf(stderr,"%s\n",text);
  }
  else {
    const char *p = format;
    char last = 0;
    while (*p) last = *p++;          /* remember the final character */
    fprintf(stderr,format,text);
    if (last != '\n') fputc('\n',stderr);
  }
  exit(errornum);
}
22 | 
23 | 
24 | #endif
25 | 


--------------------------------------------------------------------------------
/src/bwt/compactfmi.h:
--------------------------------------------------------------------------------
 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh,
 2 |  * Kaiju is licensed under the GPLv3, see the file LICENSE. */
 3 | #ifndef COMPACTFMI_h
 4 | #define COMPACTFMI_h
 5 | 
 6 | #include "common.h"
 7 | 
 8 | /* Simple FM index with a checkpoint for every 256 letters */
/* FM index over a BWT string: the BWT itself plus two levels of count tables
   (index1, and index2 holding counts relative to index1). */
typedef struct {
  int alen;           // Length of alphabet
  IndexType bwtlen;   // Total length of BWT
  uchar *bwt;         // BWT string
  int N1;             // Total number of entries in index 1 (bwtlen>>ex1 +1);
  int N2;             // Total number of entries in index 2 (bwtlen>>ex2 +1);
  IndexType **index1; // FM index1 (one array per letter)
  ushort **index2;    // Counts relative to index1 checkpoints (assuming 16 bit int)
  int *startLcode;    // start numbers for byte encoding of letter and number
} FMI;
19 | 
20 | 
21 | 
22 | 
23 | /* FUNCTION PROTOTYPES BEGIN  ( by funcprototypes.pl ) */
24 | FMI *alloc_FMI(uchar *bwt, IndexType bwtlen, int alen);
25 | FMI *read_fmi(FILE *fp);
26 | void write_fmi(const FMI *f, FILE *fp);
27 | IndexType FMindex(FMI *f, uchar ct, IndexType k);
28 | IndexType FMindexCurrent(FMI *f, uchar *c, IndexType k);
29 | void FMindexAll(FMI *f, IndexType k, IndexType *fmia);
30 | void FMIrecode(FMI *fmi);
31 | FMI *makeIndex(uchar *bwt, long bwtlen, int alen);
32 | FMI *makeIndex_OLD(uchar *bwt, long bwtlen, int alen);
33 | /* FUNCTION PROTOTYPES END */
34 | 
35 | 
36 | #endif
37 | 


--------------------------------------------------------------------------------
/src/bwt/fmi.h:
--------------------------------------------------------------------------------
 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh,
 2 |  * Kaiju is licensed under the GPLv3, see the file LICENSE. */
 3 | #ifndef COMPACTFMI_h
 4 | #define COMPACTFMI_h
 5 | 
 6 | #include "common.h"
 7 | 
 8 | /* Simple FM index with a checkpoint for every 256 letters */
/* FM index over a BWT string: the BWT itself plus two levels of count tables
   (index1, and index2 holding counts relative to index1).
   NOTE(review): this header is guarded by COMPACTFMI_h, the same macro that
   compactfmi.h uses, so whichever of the two is included first hides the
   other. Harmless while both declare the same things; keep them in sync. */
typedef struct {
  int alen;           // Length of alphabet
  IndexType bwtlen;   // Total length of BWT
  uchar *bwt;         // BWT string
  int N1;             // Total number of entries in index 1 (bwtlen>>ex1 +1);
  int N2;             // Total number of entries in index 2 (bwtlen>>ex2 +1);
  IndexType **index1; // FM index1 (one array per letter)
  ushort **index2;    // Counts relative to index1 checkpoints (assuming 16 bit int)
  int *startLcode;    // start numbers for byte encoding of letter and number
} FMI;
19 | 
20 | 
21 | 
22 | 
23 | /* FUNCTION PROTOTYPES BEGIN  ( by funcprototypes.pl ) */
24 | FMI *alloc_FMI(uchar *bwt, IndexType bwtlen, int alen);
25 | FMI *read_fmi(FILE *fp);
26 | void write_fmi(const FMI *f, FILE *fp);
27 | IndexType FMindex(FMI *f, uchar ct, IndexType k);
28 | IndexType FMindexCurrent(FMI *f, uchar *c, IndexType k);
29 | void FMindexAll(FMI *f, IndexType k, IndexType *fmia);
30 | void FMIrecode(FMI *fmi);
31 | FMI *makeIndex(uchar *bwt, long bwtlen, int alen);
32 | FMI *makeIndex_OLD(uchar *bwt, long bwtlen, int alen);
33 | /* FUNCTION PROTOTYPES END */
34 | 
35 | 
36 | #endif
37 | 


--------------------------------------------------------------------------------
/src/bwt/mkfmi.c:
--------------------------------------------------------------------------------
 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh,
 2 |  * Kaiju is licensed under the GPLv3, see the file LICENSE. */
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <string.h>
 6 | #include <ctype.h>
 7 | 
 8 | #include "common.h"
 9 | #include "fmi.h"
10 | #include "bwt.h"
11 | #include "suffixArray.h"
12 | #include "mkfmi_vars.h"
13 | 
/* Print "ERROR: " followed by `format` expanded with the single string
   argument `arg` on stderr, then exit with status 1. Does not return. */
void error(char *format, char *arg)
{
  fputs("ERROR: ", stderr);
  fprintf(stderr, format, arg);
  exit(1);
}
19 | 
20 | 
21 | int main (int argc, char **argv) {
22 |   int l;
23 |   FILE *fp=NULL;
24 |   BWT *b;
25 |   char *filename;
26 | 
27 |   /* Parsing options and arguments */
28 |   OPT_read_cmdline(opt_struct, argc, argv);
29 |   if (help) { OPT_help(opt_struct); exit(0); }
30 |   OPT_print_vars(stderr, opt_struct, "# ", 0);
31 | 
32 |   if (!filenm) {
33 |     fprintf(stderr,"You have to specify name for index files (first argument)\n");
34 |     exit(5);
35 |   }
36 | 
37 |   l=strlen(filenm);
38 |   filename = (char *)malloc((l+10)*sizeof(char));
39 |   strcpy(filename,filenm);
40 | 
41 |   /* Read BWT */
42 |   strcpy(filename+l,".bwt");
43 |   fp = fopen(filename,"r");
44 |   if (!fp) error("File %s containing BWT could not be opened for reading\n",filename);
45 |   fprintf(stderr,"Reading BWT from file %s ... ",filename);
46 |   b = read_BWT(fp);
47 |   fclose(fp);
48 |   fprintf(stderr,"DONE\n");
49 |   fprintf(stderr,"BWT of length %ld has been read with %d sequencs, alphabet=%s\n",
50 | 	  b->len, b->nseq, b->alphabet); 
51 | 
52 |   /* Read SA */
53 |   strcpy(filename+l,".sa");
54 |   fp = fopen(filename,"r");
55 |   if (!fp) error("File %s containing SA could not be opened for reading\n",filename);
56 |   fprintf(stderr,"Reading suffix array from file %s ... ",filename);
57 |   b->s = read_suffixArray_header(fp);
58 |   /* If the whole SA is saved, don't read it! */
59 |   if (b->s->chpt_exp > 0) read_suffixArray_body(b->s,fp);
60 |   fclose(fp);
61 |   fprintf(stderr,"DONE\n");
62 | 
63 |   /* Concatenate stuff in fmi file */
64 |   strcpy(filename+l,".fmi");
65 |   fp = fopen(filename,"w");
66 |   if (!fp) error("File %s for FMI could not be opened for reading\n",filename);
67 |   fprintf(stderr,"Writing BWT header and SA to file  %s ... ",filename);
68 |   write_BWT_header(b, fp);
69 |   write_suffixArray(b->s,fp);
70 |   fprintf(stderr,"DONE\n");
71 | 
72 |   fprintf(stderr,"Constructing FM index\n");
73 |   b->f = makeIndex(b->bwt, b->len, b->alen);
74 |   fprintf(stderr,"\nDONE\n");
75 | 
76 |   fprintf(stderr,"Writing FM index to file ... ");
77 |   write_fmi(b->f,fp);
78 |   fclose(fp);
79 |   fprintf(stderr,"DONE\n");
80 | 
81 |   if (removecmd) {
82 |     int cl = strlen(removecmd);
83 |     char *command = malloc(cl+2*l+20);
84 |     sprintf(command,"%s %s.sa %s.bwt",removecmd,filenm,filenm);
85 |     fprintf(stderr,"Removing files with this command: %s\n",command);
86 |     system(command);
87 |     free(command);
88 |   }
89 |   else {
90 |     strcpy(filename+l,".bwt");
91 |     fprintf(stderr,"\n  !!  You can now delete files %s and ",filename);
92 |     strcpy(filename+l,".sa");
93 |     fprintf(stderr,"%s  !!\n\n",filename);
94 |     free(filename);
95 |   }
96 | 
97 | }
98 | 


--------------------------------------------------------------------------------
/src/bwt/multikeyqsort.c:
--------------------------------------------------------------------------------
  1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh,
  2 |  * Kaiju is licensed under the GPLv3, see the file LICENSE. */
  3 | /*
  4 |   Copied from 
  5 |   http://www.drdobbs.com/database/sorting-strings-with-three-way-radix-qui/184410724
  6 | */
  7 | 
  8 | #include <stdlib.h>
  9 | #include <stdio.h>
 10 | #include <string.h>
 11 | 
 12 | #ifndef min 
 13 | #define min(a, b) ((a)<=(b) ? (a) : (b)) 
 14 | #endif
 15 | 
 16 | 
 17 | static inline void swap(char *a[], int i, int j) 
 18 | {     char *t = a[i];
 19 |       a[i] = a[j];
 20 |       a[j] = t; 
 21 | }
 22 | static inline void vecswap(char *a[], int i, int j, int n) 
 23 | {     while (n-- > 0)
 24 |          swap(a, i++, j++); 
 25 | }
 26 | 
 27 | 
 28 | #define ch(i) a[i][depth] 
 29 | 
 30 | 
 31 | /* Faster version */
 32 | 
 33 | /* Std strcmp use unsigned comparison */
 34 | static inline int my_strcmp(const char *s1, const char *s2) {
 35 |   while (*s1 && *s2 && *s1==*s2) { ++s1; ++s2; }
 36 |   return (int)( *s1 - *s2 );
 37 | }
 38 | 
 39 | 
 40 | static inline int med3func(char *a[], int ia, int ib, int ic, int depth) 
 41 | {   int va, vb, vc;
 42 |     if ((va=ch(ia)) == (vb=ch(ib)))
 43 |          return ia;
 44 |     if ((vc=ch(ic)) == va || vc == vb)
 45 |          return ic;
 46 |     return va < vb ?
 47 |           (vb < vc ? ib : (va < vc ? ic : ia ) )
 48 |         : (vb > vc ? ib : (va < vc ? ia : ic ) ); 
 49 | } 
 50 | 
 51 | 
 52 | void inssort(char *a[], int n, int depth) 
 53 | {   int i, j;
 54 |     for (i = 1; i < n; i++)
 55 |       for (j = i; j > 0; j--) {
 56 |          if (my_strcmp(a[j-1]+depth, a[j]+depth) <= 0)
 57 |              break;
 58 |          swap(a, j, j-1);
 59 |       } 
 60 | }  
 61 | 
 62 | 
/* Recursive three-way string sort of the n strings in `a`, keyed on the
   character at offset `depth` (all strings are expected to agree on the
   characters before `depth`).

   - n <= 10: delegated to inssort().
   - Otherwise a pivot character v is chosen with med3func() (for n > 50 from
     three medians of three), the array is split into <v, ==v and >v groups,
     and each group is sorted recursively; the ==v group moves on to depth+1
     unless v is the terminating 0. */
void ssort2(char *a[], int n, int depth) 
{    int le, lt, gt, ge, r, v;
     int pl, pm, pn, d;

     if (n <= 10) {
        inssort(a, n, depth);
        return;
     }

     /* Pivot candidates: first, middle and last element */
     pl = 0;
     pm = n/2;
     pn = n-1;
     if (n > 50) {
        /* Large input: replace each candidate by the median of three samples */
        d = n/8;
        pl = med3func(a, pl, pl+d, pl+2*d,depth);
        pm = med3func(a, pm-d, pm, pm+d,depth);
        pn = med3func(a, pn-2*d, pn-d, pn,depth);
     }
     pm = med3func(a, pl, pm, pn,depth);
     /* Move the pivot string to the front; v is its character at depth */
     swap(a, 0, pm);
     v = ch(0);
     /* Skip the leading run of strings that already carry the pivot char */
     for (le = 1; le < n && ch(le) == v; le++)
       ;  
     if (le == n) {
         /* Every string has v at this depth: go one character deeper,
            unless v is the terminator (then all strings end here) */
         if (v != 0) ssort2(a, n, depth+1);
         return;
     }
     /* Partition: strings equal to v are parked at both ends,
        [0,le) on the left and (ge,n-1] on the right */
     lt = le;
     gt = ge = n-1;
     for (;;) {
         for ( ; lt <= gt && ch(lt) <= v; lt++)
             if (ch(lt) == v) swap(a, le++, lt);
         for ( ; lt <= gt && ch(gt) >= v; gt--) {
	   if (ch(gt) == v) swap(a, gt, ge--);
	 }
         if (lt > gt)
             break;
         swap(a, lt++, gt--);
     }
     /* Bring the parked equal-to-v strings from both ends into the middle */
     r = min(le, lt-le);
     vecswap(a, 0, lt-r, r);
     r = min(ge-gt, n-ge-1);
     vecswap(a, lt, n-r, r);
     /* Less-than group, same depth */
     ssort2(a, lt-le, depth);
     /* Equal group, next character (skipped when v is the terminator) */
     if (v != 0)
       ssort2(a + lt-le, le + n-ge-1, depth+1);
     /* Greater-than group, same depth */
     ssort2(a + n-(ge-gt), ge-gt, depth); 
}
111 | 
112 | 
113 | //void ssort2main(char *a[], int n) 
/* Public entry point: sort the n strings in `a`, starting the comparison at
   the first character. */
void multikeyqsort(char *a[], int n)
{
    const int first_char = 0;

    ssort2(a, n, first_char);
}
116 | 


--------------------------------------------------------------------------------
/src/bwt/multikeyqsort.h:
--------------------------------------------------------------------------------
1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh,
2 |  * Kaiju is licensed under the GPLv3, see the file LICENSE. */
3 | 
4 | void multikeyqsort(char **a, int n);
5 | 
6 | 


--------------------------------------------------------------------------------
/src/bwt/readFasta.h:
--------------------------------------------------------------------------------
 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh,
 2 |  * Kaiju is licensed under the GPLv3, see the file LICENSE. */
 3 | #ifndef READFASTA_h
 4 | #define READFASTA_h
 5 | 
 6 | #include "common.h"
 7 | 
 8 | /* FUNCTION PROTOTYPES BEGIN  ( by funcprototypes.pl ) */
 9 | SEQstruct *revcompSEQstruct(SEQstruct *ss, char *s, char *translate);
10 | SEQstruct *readFasta(FILE *fp, long length, char *transtab, char *complement, char term, int padding);
11 | char *translation_table(char *alphabet, char *translation, char dummy, int casesens);
12 | /* FUNCTION PROTOTYPES END */
13 | 
14 | #endif
15 | 


--------------------------------------------------------------------------------
/src/bwt/sequence.c:
--------------------------------------------------------------------------------
  1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh,
  2 |  * Kaiju is licensed under the GPLv3, see the file LICENSE. */
  3 | #include <stdlib.h>
  4 | #include <ctype.h>
  5 | #include <string.h>
  6 | #include "sequence.h"
  7 | 
  8 | 
  9 | SEQstruct *alloc_SEQstruct() {
 10 |   SEQstruct *ss=(SEQstruct *)malloc(sizeof(SEQstruct));
 11 |   ss->len = 0;
 12 |   ss->rc = 0;
 13 |   ss->pos = 0;
 14 |   ss->id = NULL;
 15 |   ss->descr = NULL;
 16 |   ss->start = NULL;
 17 |   ss->id_filepos = 0;
 18 |   ss->seq_filepos = 0;
 19 |   ss->sort_order = 0;
 20 |   ss->next=NULL;
 21 |   return ss;
 22 | }
 23 | 
 24 | void free_SEQstruct(SEQstruct *ss) {
 25 |   if (ss) {
 26 |     if (ss->id) {
 27 |       free(ss->id);
 28 |       ss->id = NULL;
 29 |     }
 30 |     if(ss->descr) {
 31 |       free(ss->descr);
 32 |       ss->descr = NULL; 
 33 |     }            // added;
 34 |     if(ss->start) {
 35 |       free(ss->start);//added;
 36 |       ss->start = NULL;
 37 |     }
 38 |     free(ss);
 39 |     ss = NULL;
 40 |   }
 41 | }
 42 | 
 43 | /* Assumes that base->start points to the whole sequence */
 44 | void recursive_free_SEQstruct(SEQstruct *base) {
 45 |   SEQstruct *ss, *next;
 46 |   if (base->start){
 47 |       free(base->start);
 48 |       base->start = NULL;
 49 |   }
 50 |   ss=base;
 51 |   next=ss->next;
 52 |   while (ss) {
 53 |     free_SEQstruct(ss);
 54 |     ss=next;
 55 |     if (next) next=ss->next;
 56 |   }
 57 | }
 58 | 
 59 | 
 60 | /* Makes a translation table from an alphabet to a translation, so
 61 |         table[alphabet[i]] = translation[i]
 62 | 
 63 |    If translation==NULL, the letter alphabet[i] is translated to i.
 64 | 
 65 |    Letters not in alphabet are translated to dummy.
 66 |    If dummy==0, dummy is set to the translation of the last char in alphabet
 67 |    (assumed to be a "wildcard" character)
 68 | 
 69 |    Translation for non-characters is -1.
 70 | 
 71 |    casesens !=0, means case sensitive, otherwise case insentitive.
 72 | 
 73 |    Returns an array (char *table) of length 128
 74 | 
 75 |    The length of the translation table (which may contain zeros)
 76 |    HAS to be as long (or longer) than alphabet.
 77 | 
 78 |    If 0 is not in the alphabet, it is translated to 0
 79 | */
 80 | static char *translation_table(char *alphabet, char *translation, char dummy, int casesens) {
 81 |   int i, l, freetrans=0;
 82 |   char *table = (char*)malloc(128*sizeof(char));
 83 | 
 84 |   l = strlen(alphabet);
 85 | 
 86 |   if (translation==0) {
 87 |     translation = (char *)malloc(l*sizeof(char));
 88 |     for (i=0; i<l; ++i) translation[i]=i;
 89 |     freetrans=1;
 90 |   }
 91 |   if (dummy==0) dummy = translation[l-1];
 92 | 
 93 |   table[0]=0;
 94 |   for (i=1; i<128; ++i) { table[i]=-1; if (isalpha((char)i)) table[i]=dummy; }
 95 | 
 96 |   if (!casesens) {
 97 |     for (i=0; i<l; ++i) {
 98 |       table[toupper(alphabet[i])]=translation[i];
 99 |       table[tolower(alphabet[i])]=translation[i];
100 |     }
101 |   }
102 |   else {
103 |     for (i=0; i<l; ++i) table[alphabet[i]] = translation[i];
104 |   }
105 | 
106 |   if (freetrans) free(translation);
107 | 
108 |   return table;
109 | }
110 | 
111 | 
112 | 
/* Returns a newly malloc'ed, 0-terminated copy of alphabet in which each of
   a/c/g/t (either case) is replaced by its complementary base in the same
   case.  Every other character is copied through unchanged. */
static char *dnaComplement(char *alphabet) {
  static const char bases[]   = "aAtTcCgG";
  static const char partner[] = "tTaAgGcC";
  const int n = strlen(alphabet);
  char *out = (char *)malloc((n+1)*sizeof(char));
  int k = 0;

  while (k < n) {
    /* alphabet[k] is never 0 here, so strchr cannot match the terminator. */
    const char *hit = strchr(bases, alphabet[k]);
    out[k] = hit ? partner[hit - bases] : alphabet[k];
    ++k;
  }
  out[k] = 0;
  return out;
}
133 | 
134 | 
135 | 
136 | 
137 | AlphabetStruct *alloc_AlphabetStruct(char *a, int caseSens, int revcomp) {
138 |   AlphabetStruct *astruct = (AlphabetStruct *)malloc(sizeof(AlphabetStruct));
139 |   astruct->a = strdup(a);
140 |   astruct->len = strlen(a);
141 |   astruct->caseSens = caseSens;
142 |   astruct->trans = translation_table(a, NULL, astruct->len-1, astruct->caseSens);
143 |   if (revcomp) astruct->comp = dnaComplement(a);
144 |   else astruct->comp = NULL;
145 |   return astruct;
146 | }
147 | 
148 | 
149 | void free_AlphabetStruct(AlphabetStruct *astruct) {
150 |   if (astruct) {
151 |     if (astruct->a) free(astruct->a);
152 |     if (astruct->trans) free(astruct->trans);
153 |     if (astruct->comp) free(astruct->comp);
154 |     free(astruct);
155 |   }
156 | }
157 | 
158 | 
159 | 
160 | /*
161 |   translate a sequence (s) to numbers
162 |  */
163 | void translate2numbers(uchar *s, const IndexType slen, AlphabetStruct *astruct) {
164 |   IndexType k;
165 |   for (k=0;k<slen;++k){
166 |       s[k]=astruct->trans[s[k]];
167 |   }
168 | }
169 | 


--------------------------------------------------------------------------------
/src/bwt/sequence.h:
--------------------------------------------------------------------------------
 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh,
 2 |  * Kaiju is licensed under the GPLv3, see the file LICENSE. */
 3 | #ifndef SEQUENCE_h
 4 | #define SEQUENCE_h
 5 | 
 6 | #include "../bwt/common.h"
 7 | 
/* One node of a singly linked list of sequence records.
   Nodes are released with free_SEQstruct(); a whole list is released with
   recursive_free_SEQstruct(). */
typedef struct __SEQstruct__ {
  char *id;
  char *descr;     // Description (stuff following id). Allocated with id.
  char rc;         // Reverse complement. 0=forward 1=complement
  long len;
  long pos;        // Index of sequence in long allocation
  char *start;     // On the list head: the whole sequence buffer (see recursive_free_SEQstruct).
                   // On other nodes: presumably a pointer into that buffer -- TODO confirm
  long id_filepos;
  long seq_filepos;
  int sort_order;
  struct __SEQstruct__ *next;   // Next node; NULL terminates the list
} SEQstruct;
20 | 
/* Description of a sequence alphabet.
   Created by alloc_AlphabetStruct() and released by free_AlphabetStruct(),
   which frees a, trans and comp. */
typedef struct {
  int len;        // Alphabet length
  int caseSens;
  char *a;        // Alphabet sequence (0 terminated, first char is terminator, last may be wildcard)
  char *trans;    // Translate char c to int i: trans[c]=i
                  // (table built by translation_table(); it has 128 entries)
  char *comp;     // DNA complement comp[a]=t, etc.
                  // NULL when alloc_AlphabetStruct() was called with revcomp==0
} AlphabetStruct;
28 | 
29 | 
30 | 
31 | static inline int letter2number(char c, AlphabetStruct *a) { return a->trans[(int)c];}
32 | static inline char number2letter(int i, AlphabetStruct *a) { return a->a[i];}
33 | 
34 | 
35 | /* FUNCTION PROTOTYPES BEGIN  ( by funcprototypes.pl ) */
36 | SEQstruct *alloc_SEQstruct();
37 | void free_SEQstruct(SEQstruct *ss);
38 | void recursive_free_SEQstruct(SEQstruct *base);
39 | AlphabetStruct *alloc_AlphabetStruct(char *a, int caseSens, int revcomp);
40 | void free_AlphabetStruct(AlphabetStruct *astruct);
41 | void translate2numbers(uchar *s, const IndexType slen, AlphabetStruct *astruct);
42 | /* FUNCTION PROTOTYPES END */
43 | 
44 | 
45 | #endif
46 | 


--------------------------------------------------------------------------------
/src/bwt/suffixArray.h:
--------------------------------------------------------------------------------
 1 | /* This file is part of Kaiju, Copyright 2015,2016 Peter Menzel and Anders Krogh,
 2 |  * Kaiju is licensed under the GPLv3, see the file LICENSE. */
 3 | #ifndef SUFFIXARRAY_h
 4 | #define SUFFIXARRAY_h
 5 | 
 6 | #include "common.h"
 7 | #include "fmi.h"
 8 | #include "sequence.h"
 9 | 
/* Suffix-array checkpoints plus per-sequence bookkeeping.
   Each checkpoint entry occupies nbytes bytes in sa; see
   suffixArray_decode_number() for how an entry is split into a sequence
   number (upper bits) and a position (lowest pbits bits). */
typedef struct {
  IndexType len;      // Length of (actual) SA ( = bwtlen)

  // Suffix array checkpoints
  IndexType ncheck;   // Number of checkpoints
  uchar *sa;          // Actual array holding SA checkpoints
  int chpt_exp;       // Exponent of checkpoint distance
  int nbytes;         // Number of bytes used per entry
  int sbits;          // Number of bits used for encoding sequence number
  int pbits;          // Number of bits used to encode position
  long mask;          // Mask for lowest pbits bits
  long check;         // Used to check if we are at a checkpoint

  // Sequence information
  int nseq;              // Number of sequences
  char **ids;            // IDs (in order of forward sorted seqs)
  int *seqTermOrder;     // Order of sequence termination
  IndexType *seqlengths; // lengths of sequences
  IndexType maxlength;   // Maximum length of sequences
  int hash_step;         // distance in hash table
  SEQstruct **hash;      // NOTE(review): number of entries is not visible in this header -- confirm against suffixArray_make_hash()
  char *seqstart;   // Start of sequence

} suffixArray;
34 | 
35 | 
36 | /* Decode long from n bytes */
37 | static inline long uchar2long(uchar *c, int n) {
38 |   long val=*c++;
39 |   while ( --n >0 ) val = (val<<8) + *c++;
40 |   return val;
41 | }
42 | 
43 | /*
44 |   For SA entry k, return seq no. (in *nseq) and position within (*pos)
45 |   Entry consists of nbytes bytes starting at position sa+k*nbytes.
46 | */
47 | static inline void suffixArray_decode_number(int *nseq, long *pos, long k, suffixArray *s) {
48 |   long val = uchar2long( (s->sa + k * s->nbytes), s->nbytes);
49 |   *nseq = (int)(val>>s->pbits);
50 |   *pos = val & s->mask;
51 | }
52 | 
53 | 
54 | 
55 | 
56 | 
57 | /* FUNCTION PROTOTYPES BEGIN  ( by funcprototypes.pl ) */
58 | void suffixArray_make_hash(SEQstruct *base, suffixArray *s, int Hstep);
59 | suffixArray *init_suffixArray(SEQstruct *ss, int chpt_exp);
60 | void write_suffixArray_checkpoints(char **sa, IndexType start, IndexType length,
61 | 				   suffixArray *s, FILE *sa_file);
62 | void write_suffixArray_header(suffixArray *s, FILE *fp);
63 | suffixArray *read_suffixArray_header(FILE *fp);
64 | void read_suffixArray_body(suffixArray *s, FILE *fp);
65 | void write_suffixArray(suffixArray *s, FILE *fp);
66 | /* FUNCTION PROTOTYPES END */
67 | 
68 | #endif
69 | 


--------------------------------------------------------------------------------
/src/bwtfmiDB.cpp:
--------------------------------------------------------------------------------
  1 | #include "bwtfmiDB.h"
  2 | 
  3 | BwtFmiDB::BwtFmiDB(Options * & opt) {
  4 |     mOptions = opt;
  5 |     init();
  6 | }
  7 | 
  8 | BwtFmiDB::~BwtFmiDB() {
  9 |     free_BWT();
 10 |     if (tastruct->trans) free(tastruct->trans);
 11 |     if (tastruct->a) free(tastruct->a);
 12 |     if (tastruct) free(tastruct);
 13 |     if (mOptions->transSearch.SEG) {
 14 |         SegParametersFree(tblast_seg_params);
 15 |     }
 16 | }
 17 | 
 18 | 
 19 | void BwtFmiDB::free_BWT() {
 20 |     if (tbwt == NULL)
 21 |         return; // Check if bwt is NULL
 22 |     if (tbwt->f != NULL) {
 23 |         free_FMI(tbwt->f);
 24 |         tbwt->f = NULL;
 25 |     }
 26 |     if (tbwt->s != NULL) {
 27 |         free_suffixArray(tbwt->s);
 28 |         tbwt->s = NULL;
 29 |     }
 30 |     // Free dynamically allocated members
 31 |     if (tbwt->bwt != NULL) {
 32 |         free(tbwt->bwt);
 33 |         tbwt->bwt = NULL;
 34 |     }
 35 |     if (tbwt->alphabet != NULL) {
 36 |         free(tbwt->alphabet);
 37 |         tbwt->alphabet = NULL;
 38 |     }
 39 |     // Finally, free the BWT structure itself
 40 |     free(tbwt);
 41 | }
 42 | 
 43 | void BwtFmiDB::free_FMI(FMI*& fmi) {
 44 |     if (fmi == NULL) return; // Check if fmi is NULL
 45 | 
 46 |     if (fmi->index1 != NULL) {
 47 |         for (int i = 0; i < fmi->N1; i++) {
 48 |             if (fmi->index1[i] != NULL) {
 49 |                 free(fmi->index1[i]);
 50 |                 fmi->index1[i] = NULL;
 51 |             }
 52 |         }
 53 |         free(fmi->index1);
 54 |         fmi->index1 = NULL;
 55 |     }
 56 | 
 57 |     if (fmi->index2 != NULL) {
 58 |         for (int i = 0; i < fmi->N2; i++) {
 59 |             if (fmi->index2[i] != NULL) {
 60 |                 free(fmi->index2[i]);
 61 |                 fmi->index2[i] = NULL;
 62 |             }
 63 |         }
 64 |         free(fmi->index2);
 65 |         fmi->index2 = NULL;
 66 |     }
 67 | 
 68 |     if (fmi->startLcode != NULL) {
 69 |         free(fmi->startLcode);
 70 |         fmi->startLcode = NULL;
 71 |     }
 72 |     
 73 |         // Free dynamically allocated members
 74 |     if (fmi->bwt != NULL) {
 75 |         free(fmi->bwt);
 76 |         fmi->bwt = NULL;
 77 |     }
 78 |     // Finally, free the FMI structure itself
 79 |     free(fmi);
 80 | }
 81 | 
 82 | void BwtFmiDB::free_suffixArray(suffixArray*& sa) {
 83 |     if (sa == NULL) return; // Check if sa is NULL
 84 |     // Free dynamically allocated members
 85 |     if (sa->sa != NULL) {
 86 |         free(sa->sa);
 87 |         sa->sa = NULL;
 88 |     }
 89 |     if (sa->seqTermOrder != NULL) {
 90 |         free(sa->seqTermOrder);
 91 |         sa->seqTermOrder = NULL;
 92 |     }
 93 |     if (sa->seqlengths != NULL) {
 94 |         free(sa->seqlengths);
 95 |         sa->seqlengths = NULL;
 96 |     }
 97 |     if (sa->hash != NULL) {
 98 |         for (int i = 0; i < sa->nseq; ++i) {
 99 |             SEQstruct *cur = sa->hash[i];
100 |             recursive_free_SEQstruct(cur);
101 |         }
102 |         free(sa->hash);
103 |         sa->hash = NULL;
104 |     }
105 |     if (sa->ids != NULL) {
106 |         for (int i = 0; i < sa->nseq; ++i) {
107 |             free(sa->ids[i]);
108 |         }
109 |         free(sa->ids);
110 |         sa->ids = NULL;
111 |     }
112 |     //if (sa->seqstart != NULL) {
113 |       //  free(sa->seqstart);
114 |        // sa->seqstart = NULL;
115 |    // }
116 |     // Finally, free the suffixArray structure itself
117 |     free(sa);
118 |     sa = NULL;
119 | }
120 | 
121 | void BwtFmiDB::init() {
122 |     if (!mOptions->transSearch.tfmi.empty()) {
123 |         if (mOptions->verbose) {
124 |             std::string msg = "Reading protein (trans search) BWT FMI index from file " + mOptions->transSearch.tfmi;
125 |             mOptions->longlog ? loginfolong(msg) : loginfo(msg);
126 |         }
127 | 
128 |         FILE * tfile = fopen(mOptions->transSearch.tfmi.c_str(), "r");
129 |         tbwt = readIndexes(tfile);
130 |         Transsearch = true;
131 |         fclose(tfile);
132 |         tfmi = tbwt->f;
133 |         //if (mOptions->verbose) {
134 |             std::stringstream msgs;
135 |             msgs << "Protein (trans search) BWT of length " << tbwt->len << " has been read with " << tbwt->nseq << " sequences, alphabet = " << tbwt->alphabet;
136 |             mOptions->longlog ? loginfolong(msgs.str()) : loginfo(msgs.str());
137 |         //}
138 | 
139 |         tdb_length = (double) (tbwt->len - tbwt->nseq);
140 |         if (mOptions->verbose) {
141 |         std::string msg = "Protein (trans search) double length is " + to_string(tdb_length);
142 |             mOptions->longlog ? loginfolong(msg) : loginfo(msg);
143 |         }
144 | 
145 |         tastruct = alloc_AlphabetStruct(tbwt->alphabet, 0, 0);
146 | 
147 |         //need to be conformed.
148 |         if (mOptions->transSearch.SEG) {
149 |             tblast_seg_params = SegParametersNewAa(); //need to be conformed;
150 |             tblast_seg_params->overlaps = TRUE;
151 |         }
152 |     }
153 | 
154 |     if (mOptions->verbose) {
155 |         mOptions->longlog ? loginfolong("finish BwtFmiDB initiation") : loginfo("finish BwtFmiDB initiation");
156 |     }
157 | }
158 | 
159 | 


--------------------------------------------------------------------------------
/src/bwtfmiDB.h:
--------------------------------------------------------------------------------
 1 | #ifndef BWTFMIDB_H
 2 | #define BWTFMIDB_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <string>
 6 | #include <fstream>
 7 | #include <iostream>
 8 | #include <sstream>
 9 | 
10 | #include "util.h"
11 | #include "options.h"
12 | 
13 | #include "include/ncbi-blast+/algo/blast/core/blast_seg.h"
14 | #include "include/ncbi-blast+/algo/blast/core/blast_filter.h"
15 | #include "include/ncbi-blast+/algo/blast/core/blast_encoding.h"
16 | 
17 | extern "C" {
18 | #include "bwt/fmi.h"
19 | #include "bwt/bwt.h"
20 | #include "bwt/sequence.h"
21 | }
22 | using namespace std;
23 | 
// Owns the protein BWT/FM index used for translated search.
// The constructor calls init(), which reads the index file named in the
// options; the destructor releases it through free_BWT().
// NOTE(review): the class holds raw owning pointers and declares no copy
// constructor or assignment operator -- copying an instance would free the
// same buffers twice.
class BwtFmiDB {
public:
    BwtFmiDB(Options * & opt);
    ~BwtFmiDB();
    // Releases tbwt and everything reachable from it.
    void free_BWT();
    // Releases one FMI structure.
    void free_FMI(FMI*& fmi);
    // Releases one suffixArray structure and sets the pointer to NULL.
    void free_suffixArray(suffixArray*& sa);
    //for trans search
    BWT * tbwt;                         // index returned by readIndexes()
    FMI * tfmi;                         // alias of tbwt->f, not separately owned
    AlphabetStruct * tastruct;          // built from tbwt->alphabet
    SegParameters * tblast_seg_params;  // only allocated when transSearch.SEG is set
    double tdb_length;                  // tbwt->len - tbwt->nseq
    bool Transsearch;                   // set to true once an index has been read
    
private:
    void init();
    
private:
    Options * mOptions;                 // not owned
};
45 | 
46 | #endif /* BWTFMIDB_H */
47 | 
48 | 


--------------------------------------------------------------------------------
/src/common.h:
--------------------------------------------------------------------------------
 1 | #ifndef COMMON_H
 2 | #define COMMON_H
 3 | 
 4 | #include <string>
 5 | 
 6 | #define SEQ2FUNR_VER "2.0.6"
 7 | 
 8 | #define _DEBUG false
 9 | 
// Short integer aliases used throughout the project.
// NOTE(review): int64/uint64 alias long/unsigned long, whose width depends
// on the platform's data model -- confirm they are 64 bits on every target.
typedef long int64;
typedef unsigned long uint64;

typedef int int32;
typedef unsigned int uint32;

typedef short int16;
typedef unsigned short uint16;

// int8 aliases plain char, whose signedness is implementation-defined.
typedef char int8;
typedef unsigned char uint8;

// The four nucleotide letters; not 0-terminated, so not usable as a C string.
const char ATCG_BASES[] = {'A', 'T', 'C', 'G'};
23 | 
24 | #pragma pack(2) 
25 | 
26 | 
27 | #pragma pack() 
28 | 
// the limit of the queue to store the packs
// error may happen if it generates more packs than this number
static const int PACK_NUM_LIMIT  = 10000000; 

// how many reads one pack has
static const int PACK_SIZE = 1000;//1000000

// if one pack is produced, but not consumed, it will be kept in the memory
// this number limit the number of in memory packs
// if the number of in memory packs is full, the producer thread should sleep
static const int PACK_IN_MEM_LIMIT = 80;//500000

// if read number is more than this, warn it
static const int WARN_STANDALONE_READ_LIMIT = 500000;//5000000

// different filtering results, bigger number means worse
// if r1 and r2 are both failed, then the bigger one of the two results will be recorded
// we reserve some gaps for future types to be added
// each value below is also the index of its label in FAILED_TYPES
static const int PASS_FILTER = 0;
static const int FAIL_POLY_X = 4;
static const int FAIL_OVERLAP = 8;
static const int FAIL_N_BASE = 12;
static const int FAIL_LENGTH = 16;
static const int FAIL_TOO_LONG = 17;
static const int FAIL_QUALITY = 20;
static const int FAIL_COMPLEXITY = 24;

// how many types in total we support
static const int FILTER_RESULT_TYPES = 32;

// label for each filter result code, four codes per row;
// codes that are not assigned yet have an empty label
const static char* FAILED_TYPES[FILTER_RESULT_TYPES] = {
	"passed", "", "", "",
	"failed_polyx_filter", "", "", "",
	"failed_bad_overlap", "", "", "",
	"failed_too_many_n_bases", "", "", "",
	"failed_too_short", "failed_too_long", "", "",
	"failed_quality_filter", "", "", "",
	"failed_low_complexity", "", "", "",
	"", "", "", ""
};
69 | 
70 | #endif /* COMMON_H */
71 | 


--------------------------------------------------------------------------------
/src/duplicate.cpp:
--------------------------------------------------------------------------------
  1 | #include "duplicate.h"
  2 | #include "overlapanalysis.h"
  3 | #include <memory.h>
  4 | #include <math.h>
  5 | 
  6 | Duplicate::Duplicate(Options* & opt) {
  7 |     mOptions = opt;
  8 |     mKeyLenInBase = mOptions->duplicate.keylen;
  9 |     mKeyLenInBit = 1<<(2*mKeyLenInBase);
 10 |     mDups = new uint64[mKeyLenInBit];
 11 |     memset(mDups, 0, sizeof(uint64)*mKeyLenInBit);
 12 |     mCounts = new uint16[mKeyLenInBit];
 13 |     memset(mCounts, 0, sizeof(uint16)*mKeyLenInBit);
 14 |     mGC = new uint8[mKeyLenInBit];
 15 |     memset(mGC, 0, sizeof(uint8)*mKeyLenInBit);
 16 | }
 17 | 
 18 | Duplicate::~Duplicate(){
 19 |     delete[] mDups;
 20 |     delete[] mCounts;
 21 | }
 22 | 
 23 | uint64 Duplicate::seq2int(const char* data, int start, int keylen, bool& valid) {
 24 |     uint64 ret = 0;
 25 |     for(int i=0; i<keylen; i++) {
 26 |         switch(data[start + i]) {
 27 |             case 'A':
 28 |                 ret += 0;
 29 |                 break;
 30 |             case 'T':
 31 |                 ret += 1;
 32 |                 break;
 33 |             case 'C':
 34 |                 ret += 2;
 35 |                 break;
 36 |             case 'G':
 37 |                 ret += 3;
 38 |                 break;
 39 |             default:
 40 |                 valid = false;
 41 |                 return 0;
 42 |         }
 43 |         // if it's not the last one, shift it by 2 bits
 44 |         if(i != keylen-1)
 45 |             ret <<= 2;
 46 |     }
 47 |     return ret;
 48 | }
 49 | 
 50 | void Duplicate::addRecord(uint32 key, uint64 kmer32, uint8 gc) {
 51 |     if(mCounts[key] == 0) {
 52 |         mCounts[key] = 1;
 53 |         mDups[key] = kmer32;
 54 |         mGC[key] = gc;
 55 |     } else {
 56 |         if(mDups[key] == kmer32)
 57 |             mCounts[key]++;
 58 |         else if(mDups[key] > kmer32) {
 59 |             mDups[key] = kmer32;
 60 |             mCounts[key] = 1;
 61 |             mGC[key] = gc;
 62 |         }
 63 |     }
 64 | }
 65 | 
 66 | void Duplicate::statRead(Read* r) {
 67 |     if(r->length() < 32)
 68 |         return;
 69 | 
 70 |     int start1 = 0;
 71 |     int start2 = max(0, r->length() - 32 - 5);
 72 | 
 73 |     const char* data = r->mSeq.mStr.c_str();
 74 |     bool valid = true;
 75 | 
 76 |     uint64 ret = seq2int(data, start1, mKeyLenInBase, valid);
 77 |     uint32 key = (uint32)ret;
 78 |     if(!valid)
 79 |         return;
 80 | 
 81 |     uint64 kmer32 = seq2int(data, start2, 32, valid);
 82 |     if(!valid)
 83 |         return;
 84 | 
 85 |     int gc = 0;
 86 | 
 87 |     // not calculated
 88 |     if(mCounts[key] == 0) {
 89 |         for(int i=0; i<r->length(); i++) {
 90 |             if(data[i] == 'C' || data[i] == 'T')
 91 |                 gc++;
 92 |         }
 93 |     }
 94 | 
 95 |     gc = round(255.0 * (double) gc / (double) r->length());
 96 | 
 97 |     addRecord(key, kmer32, (uint8)gc);
 98 | }
 99 | 
100 | void Duplicate::statPair(Read* r1, Read* r2) {
101 |     if(r1->length() < 32 || r2->length() < 32)
102 |         return;
103 | 
104 |     const char* data1 = r1->mSeq.mStr.c_str();
105 |     const char* data2 = r2->mSeq.mStr.c_str();
106 |     bool valid = true;
107 | 
108 |     uint64 ret = seq2int(data1, 0, mKeyLenInBase, valid);
109 |     uint32 key = (uint32)ret;
110 |     if(!valid)
111 |         return;
112 | 
113 |     uint64 kmer32 = seq2int(data2, 0, 32, valid);
114 |     if(!valid)
115 |         return;
116 | 
117 |     int gc = 0;
118 | 
119 |     // not calculated
120 |     if(mCounts[key] == 0) {
121 |         for(int i=0; i<r1->length(); i++) {
122 |             if(data1[i] == 'G' || data1[i] == 'C')
123 |                 gc++;
124 |         }
125 |         for(int i=0; i<r2->length(); i++) {
126 |             if(data2[i] == 'G' || data2[i] == 'C')
127 |                 gc++;
128 |         }
129 |     }
130 | 
131 |     gc = round(255.0 * (double) gc / (double)( r1->length() + r2->length()));
132 | 
133 |     addRecord(key, kmer32, gc);
134 | }
135 | 
136 | double Duplicate::statAll(int* hist, double* meanGC, int histSize) {
137 |     long totalNum = 0;
138 |     long dupNum = 0;
139 |     int* gcStatNum = new int[histSize];
140 |     memset(gcStatNum, 0, sizeof(int)*histSize);
141 |     for(int key=0; key<mKeyLenInBit; key++) {
142 |         int count = mCounts[key];
143 |         double gc = mGC[key];
144 | 
145 |         if(count > 0) {
146 |             totalNum += count;
147 |             dupNum += count - 1;
148 | 
149 |             if(count >= histSize){
150 |                 hist[histSize-1]++;
151 |                 meanGC[histSize-1] += gc;
152 |                 gcStatNum[histSize-1]++;
153 |             }
154 |             else{
155 |                 hist[count]++;
156 |                 meanGC[count] += gc;
157 |                 gcStatNum[count]++;
158 |             }
159 |         }
160 |     }
161 | 
162 |     for(int i=0; i<histSize; i++) {
163 |         if(gcStatNum[i] > 0) {
164 |             meanGC[i] = meanGC[i] / 255.0 / gcStatNum[i];
165 |         }
166 |     }
167 | 
168 |     delete[] gcStatNum;
169 | 
170 |     if(totalNum == 0)
171 |         return 0.0;
172 |     else
173 |         return (double)dupNum / (double)totalNum;
174 | }


--------------------------------------------------------------------------------
/src/duplicate.h:
--------------------------------------------------------------------------------
 1 | #ifndef DUPLICATE_H
 2 | #define DUPLICATE_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include "read.h"
 8 | #include "options.h"
 9 | #include "common.h"
10 | 
11 | using namespace std;
12 | 
// Estimates the read duplication rate with three fixed-size tables indexed
// by a key taken from the start of each read.
// NOTE(review): the class owns raw arrays and declares no copy constructor
// or assignment operator -- copying an instance would delete them twice.
class Duplicate{
public:
    Duplicate(Options* & opt);
    ~Duplicate();

    // record one single-end read
    void statRead(Read* r1);
    // record one read pair
    void statPair(Read* r1, Read* r2);
    // pack keylen bases into an integer, two bits per base
    uint64 seq2int(const char* data, int start, int keylen, bool& valid);
    // update the slot selected by key with one observation
    void addRecord(uint32 key, uint64 kmer32, uint8 gc);

    // make histogram and get duplication rate
    double statAll(int* hist, double* meanGC, int histSize);

private:
    Options* mOptions;   // not owned
    int mKeyLenInBase;   // number of bases that form a key
    int mKeyLenInBit;    // number of slots: 1 << (2 * mKeyLenInBase)
    uint64* mDups;       // per slot: the stored 32-base fingerprint
    uint16* mCounts;     // per slot: how often that fingerprint was seen
    uint8* mGC;          // per slot: GC fraction scaled to 0..255
    
};
35 | 
36 | #endif


--------------------------------------------------------------------------------
/src/evaluator.h:
--------------------------------------------------------------------------------
 1 | #ifndef EVALUATOR_H
 2 | #define EVALUATOR_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include "options.h"
 8 | #include "util.h"
 9 | #include "read.h"
10 | 
11 | using namespace std;
12 | 
// Declares the input-evaluation helpers; the definitions live in
// evaluator.cpp.
// NOTE(review): map<> is used below although this header does not include
// <map> itself -- it relies on one of the headers above to provide it.
class Evaluator{
public:
    Evaluator(Options* & opt);
    ~Evaluator();
    // evaluate how many reads are stored in the input file
    void evaluateReadNum(long& readNum);
    string evalAdapterAndReadNumDepreciated(long& readNum);
    string evalAdapterAndReadNum(long& readNum, bool isR2);
    bool isTwoColorSystem();
    void evaluateSeqLen();
    void evaluateOverRepSeqs();
    void computeOverRepSeq(string filename, map<string, long>& hotseqs, int seqLen);
    int computeSeqLen(string filename);

    static bool test();
    static string matchKnownAdapter(string seq);
private:
    Options* mOptions;
    string int2seq(unsigned int val, int seqlen);
    int seq2int(string& seq, int pos, int seqlen, int lastVal = -1);
    string getAdapterWithSeed(int seed, Read** loadedReads, long records, int keylen);
};
35 | 
36 | 
37 | #endif


--------------------------------------------------------------------------------
/src/fastareader.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "fastareader.h"
  3 | #include "util.h"
  4 | #include <sstream>
  5 | 
  6 | FastaReader::FastaReader(string faFile, bool forceUpperCase)
  7 | {
  8 |     // Set locale and disable stdio synchronization to improve iostream performance
  9 |     // http://www.drdobbs.com/the-standard-librarian-iostreams-and-std/184401305
 10 |     // http://stackoverflow.com/questions/5166263/how-to-get-iostream-to-perform-better
 11 |     setlocale(LC_ALL,"C");
 12 |     ios_base::sync_with_stdio(false);
 13 | 
 14 |     mFastaFile = faFile;
 15 |     mForceUpperCase = forceUpperCase;
 16 |     if (is_directory(mFastaFile)) {
 17 |         string error_msg = "There is a problem with the provided fasta file: \'";
 18 |         error_msg.append(mFastaFile);
 19 |         error_msg.append("\' is a directory NOT a file...\n");
 20 |         throw invalid_argument(error_msg);
 21 |     }
 22 |     mFastaFileStream.open( mFastaFile.c_str(),ios::in);
 23 |     // verify that the file can be read
 24 |     if (!mFastaFileStream.is_open()) {
 25 |         string msg = "There is a problem with the provided fasta file: could NOT read ";
 26 |         msg.append(mFastaFile.c_str());
 27 |         msg.append("...\n");
 28 |         throw invalid_argument(msg);
 29 |     }
 30 | 
 31 |     char c;
 32 |     // seek to first contig
 33 |     while (mFastaFileStream.get(c) && c != '>') {
 34 |         if (mFastaFileStream.eof()) {
 35 |             break;
 36 |         }
 37 |     }
 38 | }
 39 | 
 40 | FastaReader::~FastaReader()
 41 | {
 42 |     if (mFastaFileStream.is_open()) {
 43 |         mFastaFileStream.close();
 44 |     }
 45 | }
 46 | 
 47 | void FastaReader::readNext()
 48 | {
 49 |     mCurrentID = "";
 50 |     mCurrentDescription = "";
 51 |     mCurrentSequence = "";
 52 |     bool foundHeader = false;
 53 |     
 54 |     char c;
 55 |     stringstream ssSeq;
 56 |     stringstream ssHeader;
 57 |     while(true){
 58 |         mFastaFileStream.get(c);
 59 |         if(c == '>' || mFastaFileStream.eof())
 60 |             break;
 61 |         else {
 62 |             if (foundHeader){
 63 |                 if(mForceUpperCase && c>='a' && c<='z') {
 64 |                     c -= ('a' - 'A');
 65 |                 }
 66 |                 ssSeq << c;
 67 |             }
 68 |             else
 69 |                 ssHeader << c;
 70 |         }
 71 | 
 72 |         string line = "";
 73 |         getline(mFastaFileStream,line,'\n');
 74 | 
 75 | 
 76 |         if(foundHeader == false) {
 77 |             ssHeader << line;
 78 |             foundHeader = true;
 79 |         }
 80 |         else {
 81 |             str_keep_valid_sequence(line, mForceUpperCase);
 82 |             ssSeq << line;
 83 |         }
 84 |     }
 85 |     mCurrentSequence = ssSeq.str();
 86 |     string header = ssHeader.str();
 87 | 
 88 |     mCurrentID = header;
 89 | }
 90 | 
 91 | bool FastaReader::hasNext() {
 92 |     return !mFastaFileStream.eof();
 93 | }
 94 | 
 95 | void FastaReader::readAll() {
 96 |     while(!mFastaFileStream.eof()){
 97 |         readNext();
 98 |         mAllContigs[mCurrentID] = mCurrentSequence;
 99 |     }
100 | }
101 | 
102 | bool FastaReader::test(){
103 |     FastaReader reader("testdata/tinyref.fa");
104 |     reader.readAll();
105 | 
106 |     string contig1 = "GATCACAGGTCTATCACCCTATTAATTGGTATTTTCGTCTGGGGGGTGTGGAGCCGGAGCACCCTATGTCGCAGT";
107 |     string contig2 = "GTCTGCACAGCCGCTTTCCACACAGAACCCCCCCCTCCCCCCGCTTCTGGCAAACCCCAAAAACAAAGAACCCTA";
108 | 
109 |     if(reader.mAllContigs.count("contig1") == 0 || reader.mAllContigs.count("contig2") == 0 )
110 |         return false;
111 | 
112 |     if(reader.mAllContigs["contig1"] != contig1 || reader.mAllContigs["contig2"] != contig2 )
113 |         return false;
114 | 
115 |     return true;
116 | 
117 | }
118 | 
119 | 
120 | 
121 | 


--------------------------------------------------------------------------------
/src/fastareader.h:
--------------------------------------------------------------------------------
 1 | #ifndef FASTA_READER_H
 2 | #define FASTA_READER_H
 3 | 
 4 | // includes
 5 | #include <cctype>
 6 | #include <clocale>
 7 | #include <cstdlib>
 8 | #include <fstream>
 9 | #include <iostream>
10 | #include <stdexcept>
11 | #include <string>
12 | #include <map>
13 | 
14 | 
15 | using namespace std;
16 | 
// Sequential reader for FASTA files.
// Construct it with a path, then either call readNext() repeatedly while
// hasNext() is true, or call readAll() to load every record into
// mAllContigs.
class FastaReader
{
public:
    // Throws invalid_argument if fastaFile is a directory or cannot be opened.
    FastaReader(string fastaFile, bool forceUpperCase = true);
    ~FastaReader();
    // True until the underlying stream reports end of file.
    bool hasNext();
    // Reads one record into mCurrentID / mCurrentSequence.
    void readNext();
    // Reads all remaining records into mAllContigs.
    void readAll();

    inline string currentID()
    {
        return mCurrentID;
    }

    inline string currentDescription()
    {
        return mCurrentDescription;
    }

    inline string currentSequence()
    {
        return mCurrentSequence;
    }

    inline map<string, string>& contigs() {
        return mAllContigs;
    }

    static bool test();


public:
    string mCurrentSequence;
    string mCurrentID ;           // the whole header line of the current record
    string mCurrentDescription;   // readNext() only ever resets this to empty
    map<string, string> mAllContigs;   // ID -> sequence, filled by readAll()

private:
    // NOTE(review): no definitions of these three appear in the
    // fastareader.cpp shown alongside this header -- confirm they are needed.
    bool readLine();
    bool endOfLine(char c);
    void setFastaSequenceIdDescription();

private:
    string mFastaFile;
    ifstream mFastaFileStream;
    bool mForceUpperCase;
};
64 | 
65 | 
66 | #endif
67 | 
68 | 


--------------------------------------------------------------------------------
/src/fastqreader.h:
--------------------------------------------------------------------------------
 1 | #ifndef FASTQ_READER_H
 2 | #define FASTQ_READER_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include "read.h"
 7 | #ifdef DYNAMIC_ZLIB
 8 |   #include <zlib.h>
 9 | #else
10 |   #include "zlib/zlib.h"
11 | #endif
12 | #include "common.h"
13 | #include <iostream>
14 | #include <fstream>
15 | 
// Declares the FASTQ reader; the definitions live in fastqreader.cpp.
// It holds both a gzFile and a FILE* handle plus an mZipped flag.
// NOTE(review): presumably only one of the two handles is used per file,
// depending on mZipped -- confirm in fastqreader.cpp.
class FastqReader{
public:
	FastqReader(string filename, bool hasQuality = true, bool phred64=false, size_t fastqBufferSize=1<<20);
	~FastqReader();
	bool isZipped();

	void getBytes(size_t& bytesRead, size_t& bytesTotal);

	//this function is not thread-safe
	//do not call read() of a same FastqReader object from different threads concurrently
	Read* read();
	bool eof();
	bool hasNoLineBreakAtEnd();

public:
	static bool isZipFastq(string filename);
	static bool isFastq(string filename);
	static bool test();

private:
	void init();
	void close();
	string getLine();
	void clearLineBreaks(char* line);
	void readToBuf();

private:
	string mFilename;
	gzFile mZipFile;
	FILE* mFile;
	bool mZipped;
	bool mHasQuality;
	bool mPhred64;
	char* mBuf;
	int mBufDataLen;
	int mBufUsedLen;
	bool mStdinMode;
	bool mHasNoLineBreakAtEnd;
        size_t mFastqBufSize;
};
56 | 
// Declares a reader for paired FASTQ input, built either from two existing
// FastqReader objects or from two file names; definitions live in
// fastqreader.cpp.
// NOTE(review): whether the destructor deletes mLeft/mRight is not visible
// here -- confirm ownership before sharing the readers.
class FastqReaderPair{
public:
	FastqReaderPair(FastqReader* left, FastqReader* right);
	FastqReaderPair(string leftName, string rightName, bool hasQuality = true, bool phred64 = false, bool interleaved = false, size_t fastqBufferSize = 1<<20);
	~FastqReaderPair();
	ReadPair* read();
public:
	FastqReader* mLeft;
	FastqReader* mRight;
	bool mInterleaved;
};
68 | 
69 | #endif
70 | 


--------------------------------------------------------------------------------
/src/filter.h:
--------------------------------------------------------------------------------
 1 | #ifndef FILTER_H
 2 | #define FILTER_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include <vector>
 8 | #include "options.h"
 9 | #include "read.h"
10 | 
11 | using namespace std;
12 | 
13 | class Filter{ // Read-level filtering and trimming helpers configured through an Options object.
14 | public:
15 |     Filter(Options* & opt); // the Options pointer is taken by reference; presumably kept in mOptions -- confirm in filter.cpp
16 |     ~Filter();
17 |     int passFilter(Read* r); // returns an int status for r; the status codes are not defined in this header
18 |     bool passLowComplexityFilter(Read* r);
19 |     Read* trimAndCut(Read* r, int front, int tail, int& frontTrimmed); // frontTrimmed is an output parameter
20 |     bool filterByIndex(Read* r); // single-read overload
21 |     bool filterByIndex(Read* r1, Read* r2); // two-read overload
22 |     static bool test(); // built-in self test
23 | 
24 | private:
25 |     bool match(vector<string>& list, string target, int threshold);
26 | 
27 | private:
28 |     Options* mOptions; // raw pointer; lifetime and ownership are not visible here
29 | };
30 | 
31 | 
32 | #endif


--------------------------------------------------------------------------------
/src/filterresult.h:
--------------------------------------------------------------------------------
 1 | #ifndef FILTER_RESULT_H
 2 | #define FILTER_RESULT_H
 3 | 
 4 | #include <iostream>
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <string>
 8 | #include <vector>
 9 | #include "common.h"
10 | #include "options.h"
11 | #include <fstream>
12 | #include <map>
13 | 
14 | // Comparator: shorter strings sort first; strings of equal length compare lexicographically.
15 | // std:: is spelled out because the file's using-directive appears only after this struct.
16 | struct classcomp {
17 |     bool operator() (const std::string& lhs, const std::string& rhs) const {
18 |         if (lhs.length() != rhs.length()) {
19 |             return lhs.length() < rhs.length();
20 |         }
21 |         return lhs < rhs;
22 |     }
23 | };
24 | 
25 | using namespace std;
26 | 
27 | class FilterResult{ // Accumulates filtering, adapter-trimming, polyX-trimming and base-correction counters and reports them as JSON/HTML.
28 | public:
29 |     FilterResult(Options* & opt, bool paired = false);
30 |     ~FilterResult();
31 |     inline long* getFilterReadStats() {return mFilterReadStats;} // exposes the internal array of FILTER_RESULT_TYPES counters
32 |     void addFilterResult(int result, int readNum=1);
33 |     static FilterResult* merge(vector<FilterResult*>& list); // NOTE(review): returns a raw pointer; who frees it is not visible here
34 |     void print();
35 |     // for single end
36 |     void addAdapterTrimmed(string adapter, bool isR2 = false, bool incTrimmedCounter = true);
37 |     // for paired end
38 |     void addAdapterTrimmed(string adapter1, string adapter2);
39 |     void addPolyXTrimmed(int base, int length);
40 |     long getTotalPolyXTrimmedReads();
41 |     long getTotalPolyXTrimmedBases();
42 |     // a part of JSON report
43 |     void reportJson(ofstream& ofs, string padding);
44 |     // a part of JSON report for adapters
45 |     void reportAdapterJson(ofstream& ofs, string padding);
46 |     // a part of JSON report for polyX trim
47 |     void reportPolyXTrimJson(ofstream& ofs, string padding);
48 |     // a part of HTML report
49 |     void reportHtml(ofstream& ofs, long totalReads, long totalBases);
50 |     // a part of HTML report for adapters
51 |     void reportAdapterHtml(ofstream& ofs, long totalBases);
52 |     void outputAdaptersJson(ofstream& ofs, map<string, long, classcomp>& adapterCounts);
53 |     void outputAdaptersHtml(ofstream& ofs, map<string, long, classcomp>& adapterCounts, long totalBases);
54 |     // deal with base correction results
55 |     long* getCorrectionMatrix() {return mCorrectionMatrix;} // exposes the raw matrix pointer
56 |     long getTotalCorrectedBases();
57 |     void addCorrection(char from, char to);
58 |     long getCorrectionNum(char from, char to);
59 |     void incCorrectedReads(int count);
60 |     void addMergedPairs(int pairs);
61 | 
62 | 
63 | public:
64 |     Options* mOptions;
65 |     bool mPaired;
66 |     long mCorrectedReads;
67 |     long mMergedPairs;
68 | private:
69 |     long mFilterReadStats[FILTER_RESULT_TYPES]; // no in-class initialiser; presumably zeroed in the constructor -- confirm
70 |     long mTrimmedAdapterRead;
71 |     long mTrimmedAdapterBases;
72 |     long mTrimmedPolyXReads[4] = {0}; // four slots, zero-initialised; presumably indexed by the base argument of addPolyXTrimmed -- confirm
73 |     long mTrimmedPolyXBases[4] = {0};
74 |     map<string, long, classcomp> mAdapter1; // adapter sequence -> count, ordered by classcomp
75 |     map<string, long, classcomp> mAdapter2;
76 |     long* mCorrectionMatrix; // raw pointer; allocation and size are not visible in this header
77 | };
78 | 
79 | #endif


--------------------------------------------------------------------------------
/src/fragment.cpp:
--------------------------------------------------------------------------------
 1 | #include "fragment.h"
 2 | 
 3 | Fragment::Fragment(const std::string & s)
 4 |     : seq(s) // only the sequence is supplied; members with in-class initialisers keep those values
 5 | {}
 6 | 
 7 | Fragment::Fragment(const std::string & s, bool b) { // s: fragment sequence; b: value stored in SEGchecked
 8 |  seq = s;
 9 |  SEGchecked = b; // honour the caller's flag; it used to be ignored and the member forced to true
10 | }
11 | 
12 | Fragment::Fragment(const std::string & s, unsigned int n, unsigned int p, int d)
13 |     : seq(s),        // fragment sequence
14 |       num_mm(n),     // n is stored as num_mm
15 |       diff(d),       // d is stored as diff
16 |       pos_lastmm(p)  // p is stored as pos_lastmm
17 | {}
18 | 
19 | Fragment::Fragment(const std::string & s, unsigned int n, unsigned int p, int d, IndexType arg_si0, IndexType arg_si1, int len)
20 |     : seq(s),
21 |       num_mm(n),
22 |       diff(d),
23 |       pos_lastmm(p),
24 |       si0(arg_si0),      // the parameters shadow the same-named members, which are not assigned here
25 |       si1(arg_si1),
26 |       matchlen(len),
27 |       SEGchecked(true)   // fragments with substitutions have been checked before
28 | {}
29 | 
30 | Fragment::Fragment(const std::string & s, unsigned int n, unsigned int p, int d, SI * si)
31 |     : seq(s),
32 |       num_mm(n),
33 |       diff(d),
34 |       pos_lastmm(p),
35 |       si0(si->start),                          // lower bound comes straight from the SI record
36 |       si1(si->start + (IndexType) si->len),    // upper bound is start plus len
37 |       matchlen(si->ql)                         // si must be non-null: it is dereferenced unconditionally
38 | {}
39 | 
40 | Fragment::Fragment(const std::string & s, unsigned int n, unsigned int p, SI * si)
41 |     : seq(s),
42 |       num_mm(n),
43 |       pos_lastmm(p),
44 |       si0(si->start),
45 |       si1(si->start + (IndexType) si->len),
46 |       matchlen(si->ql)
47 | {} // diff is not a parameter of this overload and keeps its in-class initialiser
48 | 
49 | Fragment::Fragment(const std::string & s, unsigned int n, unsigned int p)
50 |     : seq(s),        // fragment sequence
51 |       num_mm(n),     // n is stored as num_mm
52 |       pos_lastmm(p)  // p is stored as pos_lastmm
53 | {}


--------------------------------------------------------------------------------
/src/fragment.h:
--------------------------------------------------------------------------------
 1 | #ifndef FRAGMENT_H
 2 | #define FRAGMENT_H
 3 | 
 4 | #include <string>
 5 | 
 6 | extern "C" {
 7 | #include "bwt/bwt.h"
 8 | }
 9 | 
10 | using namespace std;
11 | 
12 | class Fragment { // A sequence fragment plus the bookkeeping fields filled in by its constructors.
13 | public:
14 |     std::string seq; // fragment sequence
15 |     unsigned int num_mm = 0; // presumably the number of mismatches -- confirm against callers
16 |     int diff = 0;
17 |     unsigned int pos_lastmm = 0; // presumably the position of the last mismatch -- confirm against callers
18 |     IndexType si0 = 0, si1 = 0, arg_si0 = 0, arg_si1 = 0; // zero-initialised: several constructors never assign these
19 |     int matchlen = 0; // zero-initialised for the same reason
20 |     bool SEGchecked = false;
21 | 
22 |     Fragment(const std::string & s); // sequence only
23 | 
24 |     Fragment(const std::string & s, bool b); // sequence plus a bool flag (see fragment.cpp for how b is used)
25 | 
26 |     Fragment(const std::string & s, unsigned int n, unsigned int p, int d); // n -> num_mm, p -> pos_lastmm, d -> diff
27 | 
28 |     Fragment(const std::string & s, unsigned int n, unsigned int p, int d, IndexType arg_si0, IndexType arg_si1, int len); // the parameters arg_si0/arg_si1 shadow the members of the same name
29 | 
30 |     Fragment(const std::string & s, unsigned int n, unsigned int p, int d, SI * si); // interval and match length are read from *si
31 | 
32 |     Fragment(const std::string & s, unsigned int n, unsigned int p, SI * si); // as above, without diff
33 | 
34 |     Fragment(const std::string & s, unsigned int n, unsigned int p); // n -> num_mm, p -> pos_lastmm
35 | };
36 | 
37 | #endif /* FRAGMENT_H */
38 | 
39 | 


--------------------------------------------------------------------------------
/src/htmlreporter.h:
--------------------------------------------------------------------------------
 1 | #ifndef HTML_REPORTER_H
 2 | #define HTML_REPORTER_H
 3 | 
 4 | #include <iostream>
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <string>
 8 | #include <fstream>
 9 | #include <time.h>
10 | #include <atomic>
11 | #include <iomanip>
12 | 
13 | #include "options.h"
14 | #include "stats.h"
15 | #include "filterresult.h"
16 | #include "common.h"
17 | #include "util.h"
18 | 
19 | using namespace std;
20 | 
21 | class HtmlReporter{ // Writes the per-sample HTML report from a FilterResult and pre/post-filter Stats objects.
22 | public:
23 |     HtmlReporter(Options* & opt);
24 |     ~HtmlReporter();
25 |     void setDupHist(int* dupHist, double* dupMeanGC, double dupRate); // presumably stored in mDupHist/mDupMeanGC/mDupRate -- confirm
26 |     void setInsertHist(atomic_long* insertHist, int insertSizePeak); // presumably stored in mInsertHist/mInsertSizePeak -- confirm
27 |     void report(FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2 = NULL, Stats* postStats2 = NULL); // the second pair of Stats is optional and defaults to NULL
28 |     static void outputRow(ofstream& ofs, string key, long value);
29 |     static void outputRow(ofstream& ofs, string key, string value);
30 |     static void outputLongRow(ofstream& ofs, string key, string value);
31 |     static string formatNumber(long number);
32 |     static string getPercents(long numerator, long denominator);
33 | private:
34 |     const string getCurrentSystemTime();
35 |     void printHeader(ofstream& ofs);
36 |     void printCSS(ofstream& ofs);
37 |     void printJS(ofstream& ofs);
38 |     void printFooter(ofstream& ofs);
39 |     void reportDuplication(ofstream& ofs);
40 |     void reportInsertSize(ofstream& ofs, int isizeLimit);
41 |     void printSummary(ofstream& ofs, FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2, Stats* postStats2);
42 |     
43 |     void printAnnotationResults(ofstream & ofs); // the methods from here to reportSpecies are the annotation-related report sections
44 |     void reportRarefaction(ofstream& ofs);
45 |     void reportKOBarPlot(ofstream& ofs);
46 |     void reportRarefactionId(ofstream& ofs);
47 |     void reportBarPlotId(ofstream& ofs);
48 |     void reportPathway(ofstream& ofs);
49 |     void reportSpecies(ofstream& ofs);
50 |     
51 | private:
52 |     Options* mOptions;
53 |     int* mDupHist; // raw pointer; ownership is not visible in this header
54 |     double* mDupMeanGC; // raw pointer; ownership is not visible in this header
55 |     double mDupRate;
56 |     atomic_long* mInsertHist; // raw pointer to atomic counters; ownership is not visible in this header
57 |     int mInsertSizePeak;
58 | };
59 | 
60 | 
61 | #endif
62 | 


--------------------------------------------------------------------------------
/src/htmlreporterall.h:
--------------------------------------------------------------------------------
 1 | #ifndef HTMLREPORTERALL_H
 2 | #define HTMLREPORTERALL_H
 3 | 
 4 | #include <iostream>
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <string>
 8 | #include <fstream>
 9 | #include <chrono>
10 | #include <memory.h>
11 | #include <valarray>
12 | #include <time.h>
13 | #include <iomanip>
14 | 
15 | #include "util.h"
16 | #include "common.h"
17 | #include "options.h"
18 | 
19 | 
20 | using namespace std;
21 | 
22 | class HtmlReporterAll { // Writes an HTML report from an Options object alone; several helpers operate on vectors of Sample.
23 | public:
24 |     HtmlReporterAll(Options* & opt);
25 |     ~HtmlReporterAll();
26 |     void report(); // takes no arguments, unlike HtmlReporter::report
27 |     static void outputRow(ofstream& ofs, string key, long value);
28 |     static void outputRow(ofstream& ofs, string key, string value);
29 |     static void outputRow(ofstream& ofs, std::vector<Sample> & samplesVec);
30 |     static void outputSummaryTable(ofstream& ofs, std::vector<Sample> & samplesVec);
31 |     
32 | private:
33 |     const string getCurrentSystemTime();
34 |     void printHeader(ofstream& ofs);
35 |     void printCSS(ofstream& ofs);
36 |     void printJS(ofstream& ofs);
37 |     void printFooter(ofstream& ofs);
38 |     
39 |     void printAnnotationResults(ofstream & ofs);
40 |     void reportRarefactionKO(ofstream& ofs);
41 |     void reportRarefactionKO3D(ofstream& ofs);
42 |     void reportKOBarPlot(ofstream& ofs);
43 |     void reportRarefactionS2f(ofstream& ofs);
44 |     void reportRarefactionS2f3D(ofstream& ofs);
45 |     void reportS2fBarPlot(ofstream& ofs);
46 |     void reportPathwayBarPlot(ofstream& ofs);
47 |     void reportOrgBarPlot(ofstream& ofs);
48 |     void reportReadsQualityPlot3D(ofstream& ofs);
49 |     void reportAllTables(); // takes no stream argument, unlike the other report* methods
50 |     static string list2string(std::vector<long> & x_vec, int top); // overloads for long, int, double and string elements
51 |     static string list2string(std::vector<int> & x_vec, int top);
52 |     static string list2string(std::vector<double> & x_vec, int top);
53 |     static string list2string(std::vector<string> & x_vec, int top);
54 |     static string list2string2(std::vector<string> & x_vec, int top); // NOTE(review): how this differs from list2string is not visible in this header
55 |     std::vector<std::string> smNmVec; // presumably sample names -- confirm
56 |     std::vector<std::vector<std::string> > koFreqVec;
57 |     std::vector<std::pair<const uint32*, std::vector<uint32> > > idFreqVec; // the pair's first member is a non-owning-looking raw pointer; lifetime is not visible here
58 |     std::vector<std::vector<std::string> > pathwayFreqVec;
59 |     std::vector<std::vector<std::string> > orgFreqVec;
60 |     
61 | private:
62 |     Options * mOptions;
63 | 
64 | };
65 | 
66 | #endif /* HTMLREPORTERALL_H */
67 | 
68 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/algo/blast/composition_adjustment/composition_constants.h:
--------------------------------------------------------------------------------
 1 | /* $Id: composition_constants.h 187049 2010-03-26 14:52:29Z satskyse $
 2 |  * ===========================================================================
 3 |  *
 4 |  *                            PUBLIC DOMAIN NOTICE
 5 |  *               National Center for Biotechnology Information
 6 |  *
 7 |  *  This software/database is a "United States Government Work" under the
 8 |  *  terms of the United States Copyright Act.  It was written as part of
 9 |  *  the author's official duties as a United States Government employee and
10 |  *  thus cannot be copyrighted.  This software/database is freely available
11 |  *  to the public for use. The National Library of Medicine and the U.S.
12 |  *  Government have not placed any restriction on its use or reproduction.
13 |  *
14 |  *  Although all reasonable efforts have been taken to ensure the accuracy
15 |  *  and reliability of the software and data, the NLM and the U.S.
16 |  *  Government do not and cannot warrant the performance or results that
17 |  *  may be obtained by using this software or data. The NLM and the U.S.
18 |  *  Government disclaim all warranties, express or implied, including
19 |  *  warranties of performance, merchantability or fitness for any particular
20 |  *  purpose.
21 |  *
22 |  *  Please cite the author in any work or product based on this material.
23 |  *
24 |  * ===========================================================================*/
25 | /**
26 |  * @file composition_constants.h
27 |  * Constants used in compositional score matrix adjustment
28 |  *
29 |  * @author E. Michael Gertz, Alejandro Schaffer, Yi-Kuo Yu
30 |  */
31 | 
32 | 
33 | #ifndef __COMPOSITION_CONSTANTS__
34 | #define __COMPOSITION_CONSTANTS__
35 | 
36 | #include <algo/blast/core/ncbi_std.h>
37 | 
38 | #ifdef __cplusplus
39 | extern "C" {
40 | #endif
41 | 
42 | /** Minimum score in a matrix */
43 | #define COMPO_SCORE_MIN INT2_MIN
44 | 
45 | /** Number of standard amino acids */
46 | #define COMPO_NUM_TRUE_AA 20
47 | 
48 | /** The largest alphabet supported by this code (the code supports 26
49 |     or 28 character amino acid alphabets). Used to specify the size of
50 |     structures that are statically allocated. */
51 | #define COMPO_LARGEST_ALPHABET 28
52 | 
53 | /* NOTE: Please keep these comments in sync with argument descriptions in
54 |  * CCompositionBasedStatsArgs::SetArgumentDescriptions()
55 |  */
56 | 
57 | /** A collection of constants that specify all permissible
58 |  * modes of composition adjustment */
59 | typedef enum ECompoAdjustModes {
60 |     /** Don't use composition based statistics */
61 |     eNoCompositionBasedStats       = 0, 
62 |     /** Composition-based statistics as in NAR 29:2994-3005, 2001 */
63 |     eCompositionBasedStats         = 1, 
64 |     /** Composition-based score adjustment as in Bioinformatics 21:902-911,
65 |      * 2005, conditioned on sequence properties. Cannot be applied to PSSMs. */
66 |     eCompositionMatrixAdjust       = 2, 
67 |     /** Composition-based score adjustment as in Bioinformatics 21:902-911,
68 |      * 2005, unconditionally. Cannot be applied to PSSMs. */
69 |     eCompoForceFullMatrixAdjust    = 3,
70 |     eNumCompoAdjustModes /**< Count of the modes listed above (evaluates to 4) */
71 | } ECompoAdjustModes;
72 | 
73 | 
74 | /** A collection of constants that specify all rules that may
75 |  *  be used to generate a compositionally adjusted matrix.  */
76 | typedef enum EMatrixAdjustRule {
77 |     eDontAdjustMatrix              = (-1),
78 |     eCompoScaleOldMatrix           = 0,
79 |     eUnconstrainedRelEntropy       = 1,
80 |     eRelEntropyOldMatrixNewContext = 2,
81 |     eRelEntropyOldMatrixOldContext = 3,
82 |     eUserSpecifiedRelEntropy       = 4
83 | } EMatrixAdjustRule;
84 | 
85 | 
86 | #ifdef __cplusplus
87 | }
88 | #endif
89 | 
90 | #endif
91 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/algo/blast/composition_adjustment/matrix_frequency_data.h:
--------------------------------------------------------------------------------
 1 | /* $Id: matrix_frequency_data.h 103491 2007-05-04 17:18:18Z kazimird $
 2 |  * ===========================================================================
 3 |  *
 4 |  *                            PUBLIC DOMAIN NOTICE
 5 |  *               National Center for Biotechnology Information
 6 |  *
 7 |  *  This software/database is a "United States Government Work" under the
 8 |  *  terms of the United States Copyright Act.  It was written as part of
 9 |  *  the author's official duties as a United States Government employee and
10 |  *  thus cannot be copyrighted.  This software/database is freely available
11 |  *  to the public for use. The National Library of Medicine and the U.S.
12 |  *  Government have not placed any restriction on its use or reproduction.
13 |  *
14 |  *  Although all reasonable efforts have been taken to ensure the accuracy
15 |  *  and reliability of the software and data, the NLM and the U.S.
16 |  *  Government do not and cannot warrant the performance or results that
17 |  *  may be obtained by using this software or data. The NLM and the U.S.
18 |  *  Government disclaim all warranties, express or implied, including
19 |  *  warranties of performance, merchantability or fitness for any particular
20 |  *  purpose.
21 |  *
22 |  *  Please cite the author in any work or product based on this material.
23 |  *
24 |  * ===========================================================================*/
25 | /**
26 |  * @file matrix_frequency_data.h
27 |  * Definitions used to get joint probabilities for a scoring matrix
28 |  *
29 |  * @author Alejandro Schaffer, E. Michael Gertz
30 |  */
31 | #ifndef __MATRIX_FREQUENCY_DATA__
32 | #define __MATRIX_FREQUENCY_DATA__
33 | 
34 | #include <algo/blast/core/blast_export.h>
35 | 
36 | #ifdef __cplusplus
37 | extern "C" {
38 | #endif
39 | 
40 | /**
41 |  * Get joint probabilities for the named matrix.
42 |  *
43 |  * @param probs        the joint probabilities [out]
44 |  * @param row_sums     sum of the values in each row of probs [out]
45 |  * @param col_sums     sum of the values in each column of probs [out]
46 |  * @param matrixName   the name of the matrix sought [in]
47 |  * @returns 0 if successful; -1 if the named matrix is not known.
48 |  */
49 | NCBI_XBLAST_EXPORT
50 | int Blast_GetJointProbsForMatrix(double ** probs, double row_sums[],
51 |                                  double col_sums[], const char *matrixName);
52 | 
53 | 
54 | /** Retrieve the background letter probabilities implicitly used in constructing the score matrix matrix_name. */
55 | NCBI_XBLAST_EXPORT
56 | const double * Blast_GetMatrixBackgroundFreq(const char *matrix_name);
57 | 
58 | 
59 | /** Return true if frequency data is available for the given
60 |  * matrix name. */
61 | NCBI_XBLAST_EXPORT
62 | int Blast_FrequencyDataIsAvailable(const char *matrix_name);
63 | 
64 | #ifdef __cplusplus
65 | }
66 | #endif
67 | 
68 | #endif
69 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/algo/blast/core/blast_encoding.c:
--------------------------------------------------------------------------------
  1 | /* $Id: blast_encoding.c 118195 2008-01-24 21:22:19Z camacho $
  2 |  * ===========================================================================
  3 |  *
  4 |  *                            PUBLIC DOMAIN NOTICE
  5 |  *               National Center for Biotechnology Information
  6 |  *
  7 |  *  This software/database is a "United States Government Work" under the
  8 |  *  terms of the United States Copyright Act.  It was written as part of
  9 |  *  the author's official duties as a United States Government employee and
 10 |  *  thus cannot be copyrighted.  This software/database is freely available
 11 |  *  to the public for use. The National Library of Medicine and the U.S.
 12 |  *  Government have not placed any restriction on its use or reproduction.
 13 |  *
 14 |  *  Although all reasonable efforts have been taken to ensure the accuracy
 15 |  *  and reliability of the software and data, the NLM and the U.S.
 16 |  *  Government do not and cannot warrant the performance or results that
 17 |  *  may be obtained by using this software or data. The NLM and the U.S.
 18 |  *  Government disclaim all warranties, express or implied, including
 19 |  *  warranties of performance, merchantability or fitness for any particular
 20 |  *  purpose.
 21 |  *
 22 |  *  Please cite the author in any work or product based on this material.
 23 |  *
 24 |  * ===========================================================================
 25 |  *
 26 |  * Author:  Christiam Camacho
 27 |  *
 28 |  */
 29 | 
 30 | /** @file blast_encoding.c
 31 |  * Definitions of static arrays defined in blast_encoding.h.
 32 |  * @sa blast_encoding.h
 33 |  */
 34 | 
 35 | #ifndef SKIP_DOXYGEN_PROCESSING
 36 | static char const rcsid[] =
 37 |     "$Id: blast_encoding.c 118195 2008-01-24 21:22:19Z camacho $";
 38 | #endif /* SKIP_DOXYGEN_PROCESSING */
 39 | 
 40 | #include <algo/blast/core/blast_encoding.h>
 41 | 
 42 | const Uint1 NCBI4NA_TO_BLASTNA[BLASTNA_SIZE] = { /* index: ncbi4na code; value: blastna code; each entry comment gives "symbol, index" */
 43 |     15,     /* Gap, 0 */
 44 |      0,     /* A,   1 */
 45 |      1,     /* C,   2 */
 46 |      6,     /* M,   3 */
 47 |      2,     /* G,   4 */
 48 |      4,     /* R,   5 */
 49 |      9,     /* S,   6 */
 50 |     13,     /* V,   7 */
 51 |      3,     /* T,   8 */
 52 |      8,     /* W,   9 */
 53 |      5,     /* Y,  10 */
 54 |     12,     /* H,  11 */
 55 |      7,     /* K,  12 */
 56 |     11,     /* D,  13 */
 57 |     10,     /* B,  14 */
 58 |     14      /* N,  15 */
 59 | };
 60 | 
 61 | const Uint1 BLASTNA_TO_NCBI4NA[BLASTNA_SIZE] = { /* index: blastna code; value: ncbi4na code (inverse of NCBI4NA_TO_BLASTNA) */
 62 |      1,     /* A, 0 */
 63 |      2,     /* C, 1 */
 64 |      4,     /* G, 2 */
 65 |      8,     /* T, 3 */
 66 |      5,     /* R, 4 */
 67 |     10,     /* Y, 5 */
 68 |      3,     /* M, 6 */
 69 |     12,     /* K, 7 */
 70 |      9,     /* W, 8 */
 71 |      6,     /* S, 9 */
 72 |     14,     /* B, 10 */
 73 |     13,     /* D, 11 */
 74 |     11,     /* H, 12 */
 75 |      7,     /* V, 13 */
 76 |     15,     /* N, 14 */
 77 |      0      /* Gap, 15 */
 78 | };
 79 | 
 80 | const char BLASTNA_TO_IUPACNA[BLASTNA_SIZE] = { /* index: blastna code; value: IUPAC letter, '-' for the gap code 15 */
 81 |     'A', 'C', 'G', 'T', 'R', 'Y', 'M', 'K', 
 82 |     'W', 'S', 'B', 'D', 'H', 'V', 'N', '-'
 83 | };
 84 | 
 85 | const char NCBI4NA_TO_IUPACNA[BLASTNA_SIZE] = { /* index: ncbi4na code; value: IUPAC letter, '-' for the gap code 0 */
 86 |     '-', 'A', 'C', 'M', 'G', 'R', 'S', 'V',
 87 |     'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'
 88 | };
 89 | 
 90 | const Uint1 IUPACNA_TO_BLASTNA[128]={ /* index: 7-bit ASCII code, 16 entries per row; only uppercase IUPAC letters are mapped, every other index (lowercase included) holds 15, the blastna gap code */
 91 | 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
 92 | 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
 93 | 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
 94 | 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
 95 | 15, 0,10, 1,11,15,15, 2,12,15,15, 7,15, 6,14,15,
 96 | 15,15, 4, 9, 3,15,13, 8,15, 5,15,15,15,15,15,15,
 97 | 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
 98 | 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15};
 99 | 
100 | const Uint1 IUPACNA_TO_NCBI4NA[128]={ /* index: 7-bit ASCII code, 16 entries per row; only uppercase IUPAC letters are mapped, every other index (lowercase included) holds 0, the ncbi4na gap code */
101 |  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102 |  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103 |  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104 |  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105 |  0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0,
106 |  0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0,
107 |  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
108 |  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
109 | 
110 | const Uint1 AMINOACID_TO_NCBISTDAA[128] = { /* index: 7-bit ASCII code, 16 entries per row; uppercase letters and '*' (index 42 -> 25) are mapped, every other index (lowercase included) holds 0 */
111 |  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
112 |  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
113 |  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,25, 0, 0, 0, 0, 0,
114 |  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115 |  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,27,10,11,12,13,26,
116 | 14,15,16,17,18,24,19,20,21,22,23, 0, 0, 0, 0, 0,
117 |  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118 |  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
119 | 
120 | const char NCBISTDAA_TO_AMINOACID[BLASTAA_SIZE] = { /* index: ncbistdaa code; value: amino-acid letter (inverse of AMINOACID_TO_NCBISTDAA); index 0 is '-' */
121 | '-','A','B','C','D','E','F','G','H','I','K','L','M',
122 | 'N','P','Q','R','S','T','V','W','X','Y','Z','U','*',
123 | 'O', 'J'};
124 | 
125 | const Uint1 kProtSentinel = NULLB; /* sentinel byte for protein sequences; NULLB is defined outside this file */
126 | const Uint1 kNuclSentinel = 0xF; /* sentinel nibble for nucleotide sequences (0xF = 15) */
127 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/algo/blast/core/blast_encoding.h:
--------------------------------------------------------------------------------
  1 | /*  $Id: blast_encoding.h 118195 2008-01-24 21:22:19Z camacho $
  2 |  * ===========================================================================
  3 |  *
  4 |  *                            PUBLIC DOMAIN NOTICE
  5 |  *               National Center for Biotechnology Information
  6 |  *
  7 |  *  This software/database is a "United States Government Work" under the
  8 |  *  terms of the United States Copyright Act.  It was written as part of
  9 |  *  the author's official duties as a United States Government employee and
 10 |  *  thus cannot be copyrighted.  This software/database is freely available
 11 |  *  to the public for use. The National Library of Medicine and the U.S.
 12 |  *  Government have not placed any restriction on its use or reproduction.
 13 |  *
 14 |  *  Although all reasonable efforts have been taken to ensure the accuracy
 15 |  *  and reliability of the software and data, the NLM and the U.S.
 16 |  *  Government do not and cannot warrant the performance or results that
 17 |  *  may be obtained by using this software or data. The NLM and the U.S.
 18 |  *  Government disclaim all warranties, express or implied, including
 19 |  *  warranties of performance, merchantability or fitness for any particular
 20 |  *  purpose.
 21 |  *
 22 |  *  Please cite the author in any work or product based on this material.
 23 |  *
 24 |  * ===========================================================================
 25 |  *
 26 |  * Author:  Christiam Camacho
 27 |  *
 28 |  */
 29 | 
 30 | /** @file blast_encoding.h
 31 |  *  Declarations of static arrays used to define some NCBI encodings to be used
 32 |  *  in a toolkit independent manner by the BLAST engine.
 33 |  */
 34 | 
 35 | #ifndef ALGO_BLAST_CORE___BLAST_ENCODING__H
 36 | #define ALGO_BLAST_CORE___BLAST_ENCODING__H
 37 | 
 38 | #include <algo/blast/core/ncbi_std.h>
 39 | #include <algo/blast/core/blast_export.h>
 40 | 
 41 | /** @addtogroup AlgoBlast
 42 |  *
 43 |  * @{
 44 |  */
 45 | 
 46 | #ifdef __cplusplus
 47 | extern "C" {
 48 | #endif
 49 | 
 50 | /** Different types of sequence encodings for sequence retrieval from the 
 51 |  * BLAST database 
 52 |  */
 53 | typedef enum { 
 54 |     eBlastEncodingProtein       = 0, /**< NCBIstdaa */
 55 |     eBlastEncodingNucleotide    = 1, /**< Special encoding for preliminary 
 56 |                                        stage of BLAST: permutation of NCBI4na.
 57 |                                        A.k.a.: BLASTNA encoding
 58 |                                       */
 59 |     eBlastEncodingNcbi4na       = 2, /**< NCBI4na */
 60 |     eBlastEncodingNcbi2na       = 3, /**< NCBI2na */
 61 |     eBlastEncodingError         = 255 /**< Error value for encoding */
 62 | } EBlastEncoding;
 63 | 
 64 | /* Nucleotide encodings */
 65 | 
 66 | /** Translates between ncbi4na and blastna. The first four elements
 67 |  *	of this array match ncbi2na. */
 68 | NCBI_XBLAST_EXPORT extern const Uint1 NCBI4NA_TO_BLASTNA[];
 69 | 
 70 | /** Translates between blastna and ncbi4na. */
 71 | NCBI_XBLAST_EXPORT extern const Uint1 BLASTNA_TO_NCBI4NA[];
 72 | 
 73 | /** Translates between iupacna and blastna. */
 74 | NCBI_XBLAST_EXPORT extern const Uint1 IUPACNA_TO_BLASTNA[];
 75 | 
 76 | /** Translates between iupacna and ncbi4na. */
 77 | NCBI_XBLAST_EXPORT extern const Uint1 IUPACNA_TO_NCBI4NA[];
 78 | 
 79 | /** Translates between ncbieaa and ncbistdaa. */
 80 | NCBI_XBLAST_EXPORT extern const Uint1 AMINOACID_TO_NCBISTDAA[];
 81 | 
 82 | /** Translates between ncbistdaa and ncbieaa. */
 83 | NCBI_XBLAST_EXPORT extern const char NCBISTDAA_TO_AMINOACID[];
 84 | 
 85 | /** Translates between blastna and iupacna. */
 86 | NCBI_XBLAST_EXPORT extern const char BLASTNA_TO_IUPACNA[];
 87 | 
 88 | /** Translates between ncbi4na and iupacna. */
 89 | NCBI_XBLAST_EXPORT extern const char NCBI4NA_TO_IUPACNA[];
 90 | 
 91 | #define BLAST2NA_SIZE 4     /**< Size of compressed nucleic acid alphabet */
 92 | #define BLASTNA_SIZE 16     /**< Size of nucleic acid alphabet */
 93 | #define BLASTAA_SIZE 28     /**< Size of aminoacid alphabet */
 94 | 
 95 | 
 96 | #define BLASTNA_SEQ_CODE 99 /**< Identifies the blastna alphabet, for use in 
 97 |                                 blast only. */
 98 | #define BLASTAA_SEQ_CODE 11 /**< == Seq_code_ncbistdaa */
 99 | #define NCBI4NA_SEQ_CODE 4  /**< == Seq_code_ncbi4na */	
100 | 
101 | /** Sentinel byte for protein sequences */
102 | NCBI_XBLAST_EXPORT extern const Uint1 kProtSentinel;
103 | /** Sentinel nibble for nucleotide sequences */
104 | NCBI_XBLAST_EXPORT extern const Uint1 kNuclSentinel;
105 | 
106 | #ifdef __cplusplus
107 | }
108 | #endif
109 | 
110 | /* @} */
111 | 
112 | #endif  /* ALGO_BLAST_CORE___BLAST_ENCODING__H */
113 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/algo/blast/core/blast_export.h:
--------------------------------------------------------------------------------
 1 | #ifndef BLAST_EXPORT__H
 2 | #define BLAST_EXPORT__H
 3 | 
 4 | /*  $Id: blast_export.h 166398 2009-07-22 15:51:55Z ucko $
 5 |  * ===========================================================================
 6 |  *
 7 |  *                            PUBLIC DOMAIN NOTICE
 8 |  *               National Center for Biotechnology Information
 9 |  *
10 |  *  This software/database is a "United States Government Work" under the
11 |  *  terms of the United States Copyright Act.  It was written as part of
12 |  *  the author's official duties as a United States Government employee and
13 |  *  thus cannot be copyrighted.  This software/database is freely available
14 |  *  to the public for use. The National Library of Medicine and the U.S.
15 |  *  Government have not placed any restriction on its use or reproduction.
16 |  *
17 |  *  Although all reasonable efforts have been taken to ensure the accuracy
18 |  *  and reliability of the software and data, the NLM and the U.S.
19 |  *  Government do not and cannot warrant the performance or results that
20 |  *  may be obtained by using this software or data. The NLM and the U.S.
21 |  *  Government disclaim all warranties, express or implied, including
22 |  *  warranties of performance, merchantability or fitness for any particular
23 |  *  purpose.
24 |  *
25 |  *  Please cite the author in any work or product based on this material.
26 |  *
27 |  * ===========================================================================
28 |  *
29 |  * Author:  Viatcheslav Gorelenkov
30 |  *
31 |  */
32 | 
33 | /** @file blast_export.h
34 |  * Defines to provide correct exporting from BLAST DLL in Windows.
35 |  * These are necessary to compile DLLs with Visual C++ - exports must be
36 |  * explicitly labeled as such.
37 |  */
38 | 
39 | 
40 | 
41 | 
42 | #if defined(WIN32)  &&  defined(NCBI_DLL_BUILD)
43 | 
44 | #ifndef _MSC_VER
45 | #  error "This toolkit is not buildable with a compiler other than MSVC."
46 | #endif
47 | 
48 | 
49 | #ifdef NCBI_XALGO_EXPORTS
50 | #  define NCBI_XBLAST_EXPORT        __declspec(dllexport)
51 | #else
52 | #  define NCBI_XBLAST_EXPORT        __declspec(dllimport)
53 | #endif
54 | 
55 | #elif defined(HAVE_ATTRIBUTE_VISIBILITY_DEFAULT)
56 | 
57 | #  define NCBI_XBLAST_EXPORT        __attribute__((visibility("default")))
58 | 
59 | #else
60 | 
61 | /**
62 |  * NULL operations for other cases
63 |  */
64 | 
65 | #  define NCBI_XBLAST_EXPORT
66 | 
67 | 
68 | #endif
69 | 
70 | #endif  /*  BLAST_EXPORT__H  */
71 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/algo/blast/core/blast_hits_priv.h:
--------------------------------------------------------------------------------
 1 | /*  $Id: blast_hits_priv.h 103491 2007-05-04 17:18:18Z kazimird $
 2 |  * ===========================================================================
 3 |  *
 4 |  *                            PUBLIC DOMAIN NOTICE
 5 |  *               National Center for Biotechnology Information
 6 |  *
 7 |  *  This software/database is a "United States Government Work" under the
 8 |  *  terms of the United States Copyright Act.  It was written as part of
 9 |  *  the author's official duties as a United States Government employee and
10 |  *  thus cannot be copyrighted.  This software/database is freely available
11 |  *  to the public for use. The National Library of Medicine and the U.S.
12 |  *  Government have not placed any restriction on its use or reproduction.
13 |  *
14 |  *  Although all reasonable efforts have been taken to ensure the accuracy
15 |  *  and reliability of the software and data, the NLM and the U.S.
16 |  *  Government do not and cannot warrant the performance or results that
17 |  *  may be obtained by using this software or data. The NLM and the U.S.
18 |  *  Government disclaim all warranties, express or implied, including
19 |  *  warranties of performance, merchantability or fitness for any particular
20 |  *  purpose.
21 |  *
22 |  *  Please cite the author in any work or product based on this material.
23 |  *
24 |  * ===========================================================================
25 |  *
26 |  * Author:  Christiam Camacho
27 |  *
28 |  */
29 | 
30 | /** @file blast_hits_priv.h
31 |  *  Utilities for dealing with BLAST HSPs in the core of BLAST.
32 |  */
33 | 
34 | #ifndef ALGO_BLAST_CORE___BLAST_HITS_PRIV__H
35 | #define ALGO_BLAST_CORE___BLAST_HITS_PRIV__H
36 | 
37 | #include <algo/blast/core/ncbi_std.h>
38 | #include <algo/blast/core/blast_hits.h>
39 | 
40 | #ifdef __cplusplus
41 | extern "C" {
42 | #endif
43 | 
44 | /** Check the gapped alignments for an overlap of two different alignments.
45 |  * A sufficient overlap is when two alignments have the same start values
46 |  * or have the same final values. 
47 |  * @param hsp_array Pointer to an array of BlastHSP structures [in]
48 |  * @param hsp_count The size of the hsp_array [in]
49 |  * @return The number of valid alignments remaining. 
50 | */
51 | Int4
52 | Blast_CheckHSPsForCommonEndpoints(BlastHSP* *hsp_array, Int4 hsp_count);
53 | 
54 | /** Comparison callback function for sorting HSPs, first by score in descending
55 |  * order, then by location. Among alignments with equal score, an HSP will 
56 |  * precede any other HSPs that are completely contained within its endpoints.
57 |  *
58 |  * H2 is contained in H1 if and only if                                         
59 |  * H1.query.offset <= H2.query.offset <= H2.query.end <= H1.query.end 
60 |  * H1.sbjct.offset <= H2.sbjct.offset <= H2.sbjct.end <= H1.sbjct.end
61 |  */
62 | int
63 | ScoreCompareHSPs(const void* h1, const void* h2);
64 | 
65 | /** TRUE if a <= c <= b and d <= f <= e (all bounds inclusive), i.e. if the
66 |  * coordinates are already in an HSP that has been evaluated. Every argument is
67 |  * parenthesized in the expansion; c and f may each be evaluated twice. */
68 | #define CONTAINED_IN_HSP(a,b,c,d,e,f) \
69 |     (((((a) <= (c)) && ((b) >= (c))) && (((d) <= (f)) && ((e) >= (f)))) ? TRUE : FALSE)
70 | 
71 | #ifdef __cplusplus
72 | }
73 | #endif
74 | 
75 | #endif /* !ALGO_BLAST_CORE___BLAST_HITS_PRIV__H */
76 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/algo/blast/core/blast_program.c:
--------------------------------------------------------------------------------
  1 | #ifndef SKIP_DOXYGEN_PROCESSING
  2 | static char const rcsid[] =
  3 |     "$Id: blast_program.c 97157 2007-01-19 14:27:24Z madden $";
  4 | #endif /* SKIP_DOXYGEN_PROCESSING */
  5 | /* ===========================================================================
  6 |  *
  7 |  *                            PUBLIC DOMAIN NOTICE
  8 |  *               National Center for Biotechnology Information
  9 |  *
 10 |  *  This software/database is a "United States Government Work" under the
 11 |  *  terms of the United States Copyright Act.  It was written as part of
 12 |  *  the author's official duties as a United States Government employee and
 13 |  *  thus cannot be copyrighted.  This software/database is freely available
 14 |  *  to the public for use. The National Library of Medicine and the U.S.
 15 |  *  Government have not placed any restriction on its use or reproduction.
 16 |  *
 17 |  *  Although all reasonable efforts have been taken to ensure the accuracy
 18 |  *  and reliability of the software and data, the NLM and the U.S.
 19 |  *  Government do not and cannot warrant the performance or results that
 20 |  *  may be obtained by using this software or data. The NLM and the U.S.
 21 |  *  Government disclaim all warranties, express or implied, including
 22 |  *  warranties of performance, merchantability or fitness for any particular
 23 |  *  purpose.
 24 |  *
 25 |  *  Please cite the author in any work or product based on this material.
 26 |  *
 27 |  * ===========================================================================
 28 |  *
 29 |  * Author:  Christiam Camacho / Ilya Dondoshansky
 30 |  *
 31 |  */
 32 | 
 33 | /** @file blast_program.c
 34 |  * Implementation of auxiliary functions to determine traits of the various BLAST
 35 |  * programs supported by core BLAST
 36 |  */
 37 |     
 38 | #include <algo/blast/core/blast_program.h>
 39 | 
 40 | /** Map an integer expression onto the Boolean constants: zero -> FALSE, non-zero -> TRUE */
 41 | #define SAFE_CAST_INT_TO_BOOLEAN(p) (((p) == 0) ? FALSE : TRUE)
 42 | 
 43 | /* Classify query sequence */
 44 | Boolean Blast_QueryIsProtein(EBlastProgramType p) /* TRUE iff p overlaps PROTEIN_QUERY_MASK */
 45 | { return ((p & PROTEIN_QUERY_MASK) != 0) ? TRUE : FALSE; }
 46 | 
 47 | Boolean Blast_QueryIsNucleotide(EBlastProgramType p) /* TRUE iff p overlaps NUCLEOTIDE_QUERY_MASK */
 48 | { return ((p & NUCLEOTIDE_QUERY_MASK) != 0) ? TRUE : FALSE; }
 49 | 
 50 | Boolean Blast_QueryIsPssm(EBlastProgramType p) /* TRUE iff p overlaps PSSM_QUERY_MASK */
 51 | { return ((p & PSSM_QUERY_MASK) != 0) ? TRUE : FALSE; }
 52 | 
 53 | /* Classify subject sequence */
 54 | Boolean Blast_SubjectIsProtein(EBlastProgramType p) /* TRUE iff p overlaps PROTEIN_SUBJECT_MASK */
 55 | { return ((p & PROTEIN_SUBJECT_MASK) != 0) ? TRUE : FALSE; }
 56 | 
 57 | Boolean Blast_SubjectIsNucleotide(EBlastProgramType p) /* TRUE iff p overlaps NUCLEOTIDE_SUBJECT_MASK */
 58 | { return ((p & NUCLEOTIDE_SUBJECT_MASK) != 0) ? TRUE : FALSE; }
 59 | 
 60 | Boolean Blast_SubjectIsPssm(EBlastProgramType p) /* TRUE iff p overlaps PSSM_SUBJECT_MASK */
 61 | { return ((p & PSSM_SUBJECT_MASK) != 0) ? TRUE : FALSE; }
 62 | 
 63 | /* Handle translated searches */
 64 | Boolean Blast_QueryIsTranslated(EBlastProgramType p) /* TRUE iff p overlaps TRANSLATED_QUERY_MASK */
 65 | { return ((p & TRANSLATED_QUERY_MASK) != 0) ? TRUE : FALSE; }
 66 | 
 67 | Boolean Blast_SubjectIsTranslated(EBlastProgramType p) /* TRUE iff p overlaps TRANSLATED_SUBJECT_MASK */
 68 | { return ((p & TRANSLATED_SUBJECT_MASK) != 0) ? TRUE : FALSE; }
 69 | 
 70 | /* Handle special programs */
 71 | Boolean Blast_ProgramIsPsiBlast(EBlastProgramType p) /* same bit test as Blast_QueryIsPssm: PSSM_QUERY_MASK */
 72 | { return ((p & PSSM_QUERY_MASK) != 0) ? TRUE : FALSE; }
 73 | 
 74 | Boolean Blast_ProgramIsPhiBlast(EBlastProgramType p) /* TRUE iff p overlaps PATTERN_QUERY_MASK */
 75 | { return ((p & PATTERN_QUERY_MASK) != 0) ? TRUE : FALSE; }
 76 | 
 77 | Boolean Blast_ProgramIsRpsBlast(EBlastProgramType p) /* same bit test as Blast_SubjectIsPssm: PSSM_SUBJECT_MASK */
 78 | { return ((p & PSSM_SUBJECT_MASK) != 0) ? TRUE : FALSE; }
 79 | 
 80 | Boolean Blast_ProgramIsValid(EBlastProgramType p)
 81 | {
 82 |     Boolean recognized = FALSE;  /* stays FALSE unless p is listed below */
 83 |     switch (p) {
 84 |     case eBlastTypeBlastp:
 85 |     case eBlastTypeBlastn:
 86 |     case eBlastTypeBlastx:
 87 |     case eBlastTypeTblastn:
 88 |     case eBlastTypeTblastx:
 89 |     case eBlastTypePsiBlast:
 90 |     case eBlastTypePsiTblastn:
 91 |     case eBlastTypeRpsBlast:
 92 |     case eBlastTypeRpsTblastn:
 93 |     case eBlastTypePhiBlastp:
 94 |     case eBlastTypePhiBlastn:
 95 |         recognized = TRUE;
 96 |         break;
 97 |     default: break;              /* every other value is rejected */
 98 |     }
 99 |     return recognized;
100 | }
101 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/algo/blast/core/blast_seg.h:
--------------------------------------------------------------------------------
 1 | /* $Id: blast_seg.h 114718 2007-11-28 15:52:56Z ivanov $
 2 |  * ===========================================================================
 3 |  *
 4 |  *                            PUBLIC DOMAIN NOTICE
 5 |  *               National Center for Biotechnology Information
 6 |  *
 7 |  *  This software/database is a "United States Government Work" under the
 8 |  *  terms of the United States Copyright Act.  It was written as part of
 9 |  *  the author's official duties as a United States Government employee and
10 |  *  thus cannot be copyrighted.  This software/database is freely available
11 |  *  to the public for use. The National Library of Medicine and the U.S.
12 |  *  Government have not placed any restriction on its use or reproduction.
13 |  *
14 |  *  Although all reasonable efforts have been taken to ensure the accuracy
15 |  *  and reliability of the software and data, the NLM and the U.S.
16 |  *  Government do not and cannot warrant the performance or results that
17 |  *  may be obtained by using this software or data. The NLM and the U.S.
18 |  *  Government disclaim all warranties, express or implied, including
19 |  *  warranties of performance, merchantability or fitness for any particular
20 |  *  purpose.
21 |  *
22 |  *  Please cite the author in any work or product based on this material.
23 |  *
24 |  * ===========================================================================
25 |  *
26 |  * Author:  Ilya Dondoshansky
27 |  *
28 |  */
29 | 
30 | /** @file blast_seg.h
31 |  * SEG filtering functions. @todo FIXME: should this be combined with
32 |  * blast_filter/dust? Needs doxygen documentation and comments
33 |  */
34 | 
35 | #ifndef __BLAST_SEG__
36 | #define __BLAST_SEG__
37 | 
38 | #include <algo/blast/core/ncbi_std.h>
39 | #include <algo/blast/core/blast_def.h>
40 | 
41 | 
42 | #ifdef __cplusplus
43 | extern "C" {
44 | #endif
45 | 
46 | /** Structure to hold parameters for seg search.
47 |  *  Allocated by SegParametersNewAa and freed with SegParametersFree. */
48 | typedef struct SegParameters
49 |   {
50 |    Int4 window;         /**< initial window size to trigger further work. */
51 |    double locut;        /**< NOTE(review): presumably the low complexity cutoff - confirm. */
52 |    double hicut;        /**< NOTE(review): presumably the high complexity cutoff - confirm. */
53 |    Int4 period;         /**< NOTE(review): meaning is not documented in this header - confirm. */
54 |    Int4 hilenmin;       /**< NOTE(review): presumably a minimum segment length - confirm. */
55 |    Boolean overlaps;	/**< merge overlapping pieces if TRUE. */
56 |    Int4 maxtrim;        /**< NOTE(review): presumably a limit on trimming - confirm. */
57 |    Int4 maxbogus;       /**< NOTE(review): meaning is not documented in this header - confirm. */
58 |   } SegParameters;
59 | 
60 | /** Allocates a SegParameters struct for proteins and fills it with default values.
61 |  * @return pointer to SegParameters
62 |  */
63 | NCBI_XBLAST_EXPORT
64 | SegParameters* SegParametersNewAa (void);
65 | 
66 | /** Free SegParameters structure
67 |  * @param sparamsp object to be freed [in]
68 |  */
69 | NCBI_XBLAST_EXPORT
70 | void SegParametersFree(SegParameters* sparamsp);
71 | 
72 | /** Runs seg on a protein sequence in ncbistdaa.
73 |  * @param sequence the protein residues in ncbistdaa [in]
74 |  * @param length number of residues [in]
75 |  * @param offset amount to shift over resulting locations 
76 |  *    (if full sequence not passed in) [in]
77 |  * @param sparamsp the seg parameters created with SegParametersNewAa [in]
78 |  * @param seg_locs resulting locations for filtering [out]
79 |  * @return zero on success
80 |  */
81 | NCBI_XBLAST_EXPORT
82 | Int2 SeqBufferSeg (Uint1* sequence, Int4 length, Int4 offset,
83 |                    SegParameters* sparamsp, BlastSeqLoc** seg_locs);
84 | 
85 | #ifdef __cplusplus
86 | }
87 | #endif
88 | #endif /* !__BLAST_SEG__ */
89 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/algo/blast/core/blast_toolkit.h:
--------------------------------------------------------------------------------
 1 | #ifndef _BLAST_TOOLKIT__H
 2 | #define _BLAST_TOOLKIT__H
 3 | 
 4 | /** @file blast_toolkit.h
 5 |  * Choose C++ basic defines
 6 |  */
 7 | 
 8 | #include <corelib/ncbitype.h>
 9 | 
10 | #endif
11 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/algo/blast/core/hspfilter_collector.h:
--------------------------------------------------------------------------------
 1 | /*  $Id: hspfilter_collector.h 161402 2009-05-27 17:35:47Z camacho $
 2 |  * ===========================================================================
 3 |  *
 4 |  *                            PUBLIC DOMAIN NOTICE
 5 |  *               National Center for Biotechnology Information
 6 |  *
 7 |  *  This software/database is a "United States Government Work" under the
 8 |  *  terms of the United States Copyright Act.  It was written as part of
 9 |  *  the author's official duties as a United States Government employee and
10 |  *  thus cannot be copyrighted.  This software/database is freely available
11 |  *  to the public for use. The National Library of Medicine and the U.S.
12 |  *  Government have not placed any restriction on its use or reproduction.
13 |  *
14 |  *  Although all reasonable efforts have been taken to ensure the accuracy
15 |  *  and reliability of the software and data, the NLM and the U.S.
16 |  *  Government do not and cannot warrant the performance or results that
17 |  *  may be obtained by using this software or data. The NLM and the U.S.
18 |  *  Government disclaim all warranties, express or implied, including
19 |  *  warranties of performance, merchantability or fitness for any particular
20 |  *  purpose.
21 |  *
22 |  *  Please cite the author in any work or product based on this material.
23 |  *
24 |  * ===========================================================================
25 |  *
26 |  * Author:  Ning Ma
27 |  *
28 |  */
29 | 
30 | /** @file hspfilter_collector.h
31 |  * Implementation of a number of BlastHSPWriters to save hits from
32 |  * a BLAST search, and subsequently return them in sorted order.
33 |  */
34 | 
35 | #ifndef ALGO_BLAST_CORE__HSPFILTER_COLLECTOR__H
36 | #define ALGO_BLAST_CORE__HSPFILTER_COLLECTOR__H
37 | 
38 | #include <algo/blast/core/ncbi_std.h>
39 | #include <algo/blast/core/blast_program.h>
40 | #include <algo/blast/core/blast_options.h>
41 | #include <algo/blast/core/blast_hspfilter.h>
42 | #include <algo/blast/core/blast_hits.h>
43 | #include <connect/ncbi_core.h>
44 | 
45 | #ifdef __cplusplus
46 | extern "C" {
47 | #endif
48 | 
49 | /** Collector parameters: program type, prelim_hitlist_size and hsp_num_max. */
50 | typedef struct BlastHSPCollectorParams {
51 |    EBlastProgramType program;/**< program type */
52 |    Int4 prelim_hitlist_size; /**< number of hits saved during preliminary
53 |                                   part of search. */
54 |    Int4 hsp_num_max;         /**< number of HSPs to save per db sequence.*/
55 | } BlastHSPCollectorParams;
56 | 
57 | /** Sets up parameter set for use by collector.
58 |  * @param hit_options field hitlist_size and hsp_num_max needed, a pointer to 
59 |  *      this structure will be stored on resulting structure.[in]
60 |  * @param compositionBasedStats compositionBasedStats field of the
61 |  *      extension options. [in]
62 |  * @param gapped_calculation gapped_calculation field of the scoring options. [in]
63 |  * @return the pointer to the allocated parameter
64 |  */
65 | NCBI_XBLAST_EXPORT
66 | BlastHSPCollectorParams*
67 | BlastHSPCollectorParamsNew(const BlastHitSavingOptions* hit_options,
68 |                            Int4 compositionBasedStats,
69 |                            Boolean gapped_calculation);
70 | 
71 | /** Deallocates the BlastHSPCollectorParams structure passed in
72 |  * @param opts structure to deallocate [in]
73 |  * @return NULL
74 |  */
75 | NCBI_XBLAST_EXPORT
76 | BlastHSPCollectorParams*
77 | BlastHSPCollectorParamsFree(BlastHSPCollectorParams* opts);
78 | 
79 | /** WriterInfo to create a default writer: the collector
80 |  * @param params The collector parameters.
81 |  * @return pointer to WriterInfo
82 |  */
83 | NCBI_XBLAST_EXPORT
84 | BlastHSPWriterInfo* 
85 | BlastHSPCollectorInfoNew(BlastHSPCollectorParams* params);
86 | 
87 | #ifdef __cplusplus
88 | }
89 | #endif
90 | 
91 | #endif /* !ALGO_BLAST_CORE__HSPFILTER_COLLECTOR__H */
92 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/algo/blast/core/lookup_wrap.h:
--------------------------------------------------------------------------------
  1 | /* $Id: lookup_wrap.h 369355 2012-07-18 17:07:15Z morgulis $
  2 |  * ===========================================================================
  3 |  *
  4 |  *                            PUBLIC DOMAIN NOTICE
  5 |  *               National Center for Biotechnology Information
  6 |  *
  7 |  *  This software/database is a "United States Government Work" under the
  8 |  *  terms of the United States Copyright Act.  It was written as part of
  9 |  *  the author's official duties as a United States Government employee and
 10 |  *  thus cannot be copyrighted.  This software/database is freely available
 11 |  *  to the public for use. The National Library of Medicine and the U.S.
 12 |  *  Government have not placed any restriction on its use or reproduction.
 13 |  *
 14 |  *  Although all reasonable efforts have been taken to ensure the accuracy
 15 |  *  and reliability of the software and data, the NLM and the U.S.
 16 |  *  Government do not and cannot warrant the performance or results that
 17 |  *  may be obtained by using this software or data. The NLM and the U.S.
 18 |  *  Government disclaim all warranties, express or implied, including
 19 |  *  warranties of performance, merchantability or fitness for any particular
 20 |  *  purpose.
 21 |  *
 22 |  *  Please cite the author in any work or product based on this material.
 23 |  *
 24 |  * ===========================================================================
 25 |  *
 26 |  * Author: Ilya Dondoshansky
 27 |  *
 28 |  */
 29 | 
 30 | /** @file lookup_wrap.h
 31 |  * Wrapper for all lookup tables used in BLAST
 32 |  */
 33 | 
 34 | #ifndef ALGO_BLAST_CORE__LOOKUP_WRAP__H
 35 | #define ALGO_BLAST_CORE__LOOKUP_WRAP__H
 36 | 
 37 | #include <algo/blast/core/ncbi_std.h>
 38 | #include <algo/blast/core/blast_def.h>
 39 | #include <algo/blast/core/blast_options.h>
 40 | #include <algo/blast/core/blast_rps.h>
 41 | #include <algo/blast/core/blast_stat.h>
 42 | 
 43 | #ifdef __cplusplus
 44 | extern "C" {
 45 | #endif
 46 | 
 47 | /** Wrapper for the different BLAST lookup tables; lut and the callbacks are untyped (void*). */
 48 | typedef struct LookupTableWrap {
 49 |    ELookupTableType lut_type; /**< What kind of a lookup table it is? */
 50 |    void* lut; /**< Pointer to the actual lookup table structure */
 51 |    void* read_indexed_db; /**< function used to retrieve hits
 52 |                               from an indexed database */
 53 |    void* check_index_oid; /**< function used to check if seeds
 54 |                                for a given oid are present */
 55 |    void * end_search_indication; /**< function used to report that
 56 |                                       a thread is done iterating over
 57 |                                       the database in preliminary
 58 |                                       search */
 59 |    void* lookup_callback;    /**< function used to look up an
 60 |                                   index->q_off pair; presumably a T_Lookup_Callback - TODO confirm */
 61 | } LookupTableWrap;
 62 | 
 63 | /** Function pointer type to check the presence of index->q_off pair */
 64 | typedef Boolean (*T_Lookup_Callback)(const LookupTableWrap *, Int4, Int4);
 65 | 
 66 | /** Create the lookup table for all query words.
 67 |  * @param query The query sequence [in]
 68 |  * @param lookup_options What kind of lookup table to build? [in]
 69 |  * @param query_options options for query setup [in]
 70 |  * @param lookup_segments Locations on query to be used for lookup table
 71 |  *                        construction [in]
 72 |  * @param sbp Scoring block containing matrix [in]
 73 |  * @param lookup_wrap_ptr The initialized lookup table [out]
 74 |  * @param rps_info Structure containing RPS blast setup information [in]
 75 |  * @param error_msg message with warning or errors [in|out]
 76 |  */
 77 | NCBI_XBLAST_EXPORT
 78 | Int2 LookupTableWrapInit(BLAST_SequenceBlk* query, 
 79 |         const LookupTableOptions* lookup_options,	
 80 |         const QuerySetUpOptions* query_options,
 81 |         BlastSeqLoc* lookup_segments, BlastScoreBlk* sbp, 
 82 |         LookupTableWrap** lookup_wrap_ptr, const BlastRPSInfo *rps_info,
 83 |         Blast_Message* *error_msg);
 84 | 
 85 | /** Deallocate memory for the lookup table */
 86 | NCBI_XBLAST_EXPORT
 87 | LookupTableWrap* LookupTableWrapFree(LookupTableWrap* lookup);
 88 | 
 89 | /** Default size of offset arrays filled in a single ScanSubject call. */
 90 | #define OFFSET_ARRAY_SIZE 4096
 91 | 
 92 | /** Determine the size of the offsets arrays to be filled by
 93 |  * the ScanSubject function.
 94 |  */
 95 | NCBI_XBLAST_EXPORT
 96 | Int4 GetOffsetArraySize(LookupTableWrap* lookup);
 97 | 
 98 | #ifdef __cplusplus
 99 | }
100 | #endif
101 | #endif /* !ALGO_BLAST_CORE__LOOKUP_WRAP__H */
102 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/algo/blast/core/matrix_freq_ratios.h:
--------------------------------------------------------------------------------
 1 | #ifndef ALGO_BLAST_CORE___MATRIX_FREQ_RATIOS__H
 2 | #define ALGO_BLAST_CORE___MATRIX_FREQ_RATIOS__H
 3 | 
 4 | /*  $Id: matrix_freq_ratios.h 439650 2014-07-02 13:40:24Z madden $
 5 |  * ===========================================================================
 6 |  *
 7 |  *                            PUBLIC DOMAIN NOTICE
 8 |  *               National Center for Biotechnology Information
 9 |  *
10 |  *  This software/database is a "United States Government Work" under the
11 |  *  terms of the United States Copyright Act.  It was written as part of
12 |  *  the author's official duties as a United States Government employee and
13 |  *  thus cannot be copyrighted.  This software/database is freely available
14 |  *  to the public for use. The National Library of Medicine and the U.S.
15 |  *  Government have not placed any restriction on its use or reproduction.
16 |  *
17 |  *  Although all reasonable efforts have been taken to ensure the accuracy
18 |  *  and reliability of the software and data, the NLM and the U.S.
19 |  *  Government do not and cannot warrant the performance or results that
20 |  *  may be obtained by using this software or data. The NLM and the U.S.
21 |  *  Government disclaim all warranties, express or implied, including
22 |  *  warranties of performance, merchantability or fitness for any particular
23 |  *  purpose.
24 |  *
25 |  *  Please cite the author in any work or product based on this material.
26 |  *
27 |  * ===========================================================================
28 |  *
29 |  * Author:  Christiam Camacho
30 |  *
31 |  */
32 | 
33 | /** @file matrix_freq_ratios.h
34 |  *  Interface to retrieve the frequency ratios for various scoring matrices.
35 |  *
36 |  *  See explanation in p 2996 of Nucleic Acids Research, 2001, Vol 29, No 14.
37 |  */
38 | 
39 | #include <algo/blast/core/blast_encoding.h>
40 | 
41 | #ifdef __cplusplus
42 | extern "C" {
43 | #endif
44 | 
45 | /** Stores the frequency ratios along with their bit scale factor; returned by _PSIMatrixFrequencyRatiosNew, released by _PSIMatrixFrequencyRatiosFree */
46 | typedef struct SFreqRatios {
47 | 
48 |     /** The actual frequency ratios */
49 |     double**   data;
50 | 
51 |     /** Used to multiply the values in the above matrix to obtain scores in bit
52 |      * units */
53 |     int        bit_scale_factor;
54 | 
55 | } SFreqRatios;
56 | 
57 | /** Retrieve the matrix's frequency ratios.
58 |  * @param matrix_name Available options include:
59 |  *          BLOSUM62
60 |  *          BLOSUM62_20
61 |  *          BLOSUM62_20A
62 |  *          BLOSUM62_20B
63 |  *          BLOSUM45
64 |  *          BLOSUM80
65 |  *          BLOSUM50
66 |  *          BLOSUM90
67 |  *          PAM30
68 |  *          PAM70
69 |  *          PAM250
70 |  * @return NULL on error
71 |  */
72 | NCBI_XBLAST_EXPORT SFreqRatios*
73 | _PSIMatrixFrequencyRatiosNew(const char* matrix_name);
74 | 
75 | /** Deallocate the frequency ratios structure */
76 | NCBI_XBLAST_EXPORT SFreqRatios*
77 | _PSIMatrixFrequencyRatiosFree(SFreqRatios* freq_ratios);
78 | 
79 | #ifdef __cplusplus
80 | }
81 | #endif
82 | 
83 | #endif /* !ALGO_BLAST_CORE___MATRIX_FREQ_RATIOS__H */
84 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/algo/blast/core/pattern_priv.h:
--------------------------------------------------------------------------------
  1 | /* $Id: pattern_priv.h 103491 2007-05-04 17:18:18Z kazimird $
  2 |  * ===========================================================================
  3 |  *
  4 |  *                            PUBLIC DOMAIN NOTICE
  5 |  *               National Center for Biotechnology Information
  6 |  *
  7 |  *  This software/database is a "United States Government Work" under the
  8 |  *  terms of the United States Copyright Act.  It was written as part of
  9 |  *  the author's official duties as a United States Government employee and
 10 |  *  thus cannot be copyrighted.  This software/database is freely available
 11 |  *  to the public for use. The National Library of Medicine and the U.S.
 12 |  *  Government have not placed any restriction on its use or reproduction.
 13 |  *
 14 |  *  Although all reasonable efforts have been taken to ensure the accuracy
 15 |  *  and reliability of the software and data, the NLM and the U.S.
 16 |  *  Government do not and cannot warrant the performance or results that
 17 |  *  may be obtained by using this software or data. The NLM and the U.S.
 18 |  *  Government disclaim all warranties, express or implied, including
 19 |  *  warranties of performance, merchantability or fitness for any particular
 20 |  *  purpose.
 21 |  *
 22 |  *  Please cite the author in any work or product based on this material.
 23 |  *
 24 |  * ===========================================================================
 25 |  *
 26 |  * Author: Ilya Dondoshansky
 27 |  *
 28 |  */
 29 | 
 30 | /** @file pattern_priv.h
 31 |  * Auxiliary functions for finding pattern matches in sequence (PHI-BLAST), that
 32 |  * are used in multiple source files.
 33 |  */
 34 | 
 35 | #ifndef ALGO_BLAST_CORE__PATTERN_PRIV_H
 36 | #define ALGO_BLAST_CORE__PATTERN_PRIV_H
 37 | 
 38 | #include <algo/blast/core/ncbi_std.h>
 39 | #include <algo/blast/core/pattern.h>
 40 | 
 41 | #ifdef __cplusplus
 42 | extern "C" {
 43 | #endif
 44 | 
 45 | /** Routine to find hits of pattern to sequence when the sequence is a protein
 46 |  * @param hitArray An array of matches to pass back [out]
 47 |  * @param seq The input sequence [in]
 48 |  * @param len1 Length of the input sequence. [in]
 49 |  * @param pattern_blk Carries variables that keep track of search 
 50 |  *                      parameters. [in]
 51 |  * @return the number of matches found.
 52 |  */
 53 | Int4 
 54 | _PHIBlastFindHitsShort(Int4 *hitArray, const Uint1* seq, Int4 len1, 
 55 |                        const SPHIPatternSearchBlk *pattern_blk);
 56 | 
 57 | /** Shift each word in the array left by 1 bit and add bit b.
 58 |  * If the new value is bigger than an overflow threshold, then subtract the
 59 |  * overflow threshold.
 60 |  * @param a Array of integers, representing words in a pattern [in] [out]
 61 |  * @param b bit to add [in]
 62 |  * @param numWords Number of words to process [in]
 63 |  */
 64 | void 
 65 | _PHIPatternWordsLeftShift(Int4 *a, Uint1 b, Int4 numWords);
 66 | 
 67 | /** Do a word-by-word bit-wise or of two integer arrays and put the result back
 68 |  * in the first array.
 69 |  * @param a First array [in] [out]
 70 |  * @param b Second array [in]
 71 |  * @param numWords Number of words in a and b [in]
 72 |  */
 73 | void 
 74 | _PHIPatternWordsBitwiseOr(Int4 *a, Int4 *b, Int4 numWords);
 75 | 
 76 | /** Do a word-by-word bit-wise and of two integer arrays and put the result in
 77 |  * a new array.
 78 |  * @param result Result of the operation [out]
 79 |  * @param a First array [in]
 80 |  * @param b Second array [in]
 81 |  * @param numWords Size of the two input arrays [in]
 82 |  * @return 1 if there are any non-zero words, otherwise 0. 
 83 |  */
 84 | Int4
 85 | _PHIPatternWordsBitwiseAnd(Int4 *result, Int4 *a, Int4 *b, Int4 numWords);
 86 | 
 87 | /** Masks all bits corresponding to the aminoacid alphabet, i.e. the first 26
 88 |  * bits of an integer number.
 89 |  */
 90 | extern const int kMaskAaAlphabetBits;
 91 | 
 92 | /** Looks for 1 bits in the same position of s and mask
 93 |  * Let R be the rightmost position where s and mask both have a 1.
 94 |  * Let L < R be the rightmost position where mask has a 1, if any, 
 95 |  * or -1 otherwise.
 96 |  * @param s Number to check bits in [in]
 97 |  * @param mask Mask to apply [in]
 98 |  * @param rightOne The rightmost position where s and mask both have a 1 [out]
 99 |  * @param rightMaskOnly The rightmost position < rightOne, where mask has a 1,
100 |  *                       if any, or -1 otherwise [out]
101 |  */
102 | void
103 | _PHIGetRightOneBits(Int4 s, Int4 mask, Int4* rightOne, Int4* rightMaskOnly);
104 | 
105 | #ifdef __cplusplus
106 | }
107 | #endif
108 | 
109 | #endif /* !ALGO_BLAST_CORE__PATTERN_PRIV_H */
110 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/common/ncbi_skew_guard.h:
--------------------------------------------------------------------------------
 1 | /*  $Id: ncbi_skew_guard.h 346326 2011-12-06 15:28:48Z ucko $
 2 |  * ===========================================================================
 3 |  *
 4 |  *                            PUBLIC DOMAIN NOTICE
 5 |  *               National Center for Biotechnology Information
 6 |  *
 7 |  *  This software/database is a "United States Government Work" under the
 8 |  *  terms of the United States Copyright Act.  It was written as part of
 9 |  *  the author's official duties as a United States Government employee and
10 |  *  thus cannot be copyrighted.  This software/database is freely available
11 |  *  to the public for use. The National Library of Medicine and the U.S.
12 |  *  Government have not placed any restriction on its use or reproduction.
13 |  *
14 |  *  Although all reasonable efforts have been taken to ensure the accuracy
15 |  *  and reliability of the software and data, the NLM and the U.S.
16 |  *  Government do not and cannot warrant the performance or results that
17 |  *  may be obtained by using this software or data. The NLM and the U.S.
18 |  *  Government disclaim all warranties, express or implied, including
19 |  *  warranties of performance, merchantability or fitness for any particular
20 |  *  purpose.
21 |  *
22 |  *  Please cite the author in any work or product based on this material.
23 |  *
24 |  * ===========================================================================
25 |  *
26 |  * Author:  Aaron Ucko, NCBI
27 |  *
28 |  */
29 | 
30 | /** @file ncbi_skew_guard.h
31 |   * Implementation header to catch build setups that mix incompatible
32 |   * C and C++ Toolkit installations.
33 |   * 
34 |   * Available as <common/ncbi_skew_guard.h> on the C++ side and
35 |   * <ncbi_skew_guard.h> on the C side, in each case customized upon
36 |   * installation to identify itself appropriately.
37 |   */
38 | 
39 | /* In-house C++ Toolkit installations define NCBI_INSTALLED_CXX_VER to
40 |  * the corresponding date stamp (also available as NCBI_DEVELOPMENT_VER
41 |  * or NCBI_PRODUCTION_VER). */
42 | /* #undef NCBI_INSTALLED_CXX_VER */
43 | 
44 | /* Accompanying copies of the C Toolkit define NCBI_EXPECTED_CXX_VER
45 |  * accordingly. */
46 | /* #undef NCBI_EXPECTED_CXX_VER */
47 | 
48 | #if defined(_NCBILCL_)  &&  defined(FORWARDING_NCBICONF_H) \
49 |     &&  !defined(NCBI_ALLOW_MISMATCHED_VERSIONS)
50 | /* Guard is active only when both _NCBILCL_ and FORWARDING_NCBICONF_H are defined and the NCBI_ALLOW_MISMATCHED_VERSIONS opt-out is not. */
51 | /* The last change to shared headers before this guard came along occurred
52 |  * on Nov. 30, 2011. */
53 | #define NCBI_MIN_CXX_VER 20111130
54 | 
55 | #  if defined(NCBI_INSTALLED_CXX_VER)
56 | /* Case 1: the C++ tree carries an installation date stamp; the C Toolkit must expect exactly that stamp. */
57 | #    if !defined(NCBI_EXPECTED_CXX_VER) \
58 |         ||  NCBI_INSTALLED_CXX_VER != NCBI_EXPECTED_CXX_VER
59 | #      error Please use the C Toolkit installation accompanying your C++ Toolkit tree.
60 | #    endif
61 | 
62 | #  else /* !NCBI_INSTALLED_CXX_VER */
63 | /* Case 2: no installation stamp; compare against NCBI_DEVELOPMENT_VER / NCBI_PRODUCTION_VER (presumably supplied by the header included next). */
64 | #    include <common/ncbi_source_ver.h>
65 | #    if NCBI_DEVELOPMENT_VER < NCBI_MIN_CXX_VER /* also true if NCBI_DEVELOPMENT_VER is undefined, since it then evaluates as 0 */
66 | #      error Please use a fresher C++ Toolkit version for C Toolkit compatibility.
67 | #    elif defined(NCBI_EXPECTED_CXX_VER) /* expected stamp must equal the production stamp if defined, else the development one */
68 | #      if (defined(NCBI_PRODUCTION_VER) ? NCBI_PRODUCTION_VER \
69 |            : NCBI_DEVELOPMENT_VER) \
70 |           != NCBI_EXPECTED_CXX_VER
71 | #        error Please use matching C and C++ Toolkit versions.
72 | #      endif
73 | #    endif
74 | 
75 | #  endif /* NCBI_INSTALLED_CXX_VER */
76 | 
77 | #endif /* _NCBILCL_ && FORWARDING_NCBICONF_H && !NCBI_ALLOW_MISMATCHED_VERSIONS */
78 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/common/ncbiconf_impl.h:
--------------------------------------------------------------------------------
  1 | #ifndef COMMON___NCBICONF_IMPL__H
  2 | #define COMMON___NCBICONF_IMPL__H
  3 | 
  4 | /* $Id: ncbiconf_impl.h 457074 2015-01-20 16:19:10Z ucko $
  5 |  * ===========================================================================
  6 |  *
  7 |  *                            PUBLIC DOMAIN NOTICE
  8 |  *               National Center for Biotechnology Information
  9 |  *
 10 |  *  This software/database is a "United States Government Work" under the
 11 |  *  terms of the United States Copyright Act.  It was written as part of
 12 |  *  the author's official duties as a United States Government employee and
 13 |  *  thus cannot be copyrighted.  This software/database is freely available
 14 |  *  to the public for use. The National Library of Medicine and the U.S.
 15 |  *  Government have not placed any restriction on its use or reproduction.
 16 |  *
 17 |  *  Although all reasonable efforts have been taken to ensure the accuracy
 18 |  *  and reliability of the software and data, the NLM and the U.S.
 19 |  *  Government do not and cannot warrant the performance or results that
 20 |  *  may be obtained by using this software or data. The NLM and the U.S.
 21 |  *  Government disclaim all warranties, express or implied, including
 22 |  *  warranties of performance, merchantability or fitness for any particular
 23 |  *  purpose.
 24 |  *
 25 |  *  Please cite the author in any work or product based on this material.
 26 |  *
 27 |  * ===========================================================================
 28 |  *
 29 |  *  Author:  Anton Lavrentiev
 30 |  *
 31 |  *
 32 |  */
 33 | 
 34 | /**
 35 |  * @file ncbiconf_impl.h
 36 |  *
 37 |  * Configuration macros.
 38 |  */
 39 | 
 40 | #ifndef FORWARDING_NCBICONF_H /* reject direct inclusion: this macro is the include guard that <ncbiconf.h> defines before pulling in this header */
 41 | #  error "The header can be used from <ncbiconf.h> only."
 42 | #endif /*!FORWARDING_NCBICONF_H*/
 43 | 
 44 | 
 45 | /** @addtogroup Portability
 46 |  *
 47 |  * @{
 48 |  */
 49 | 
 50 | 
 51 | /* Threads configuration: discard any earlier settings, then define exactly one of
 52 |  * NCBI_WIN32_THREADS, NCBI_POSIX_THREADS or NCBI_NO_THREADS. */
 53 | 
 54 | #undef NCBI_NO_THREADS
 55 | #undef NCBI_THREADS
 56 | #undef NCBI_POSIX_THREADS
 57 | #undef NCBI_WIN32_THREADS
 58 | /* A threading model is chosen only when _MT is defined without the NCBI_WITHOUT_MT opt-out, and only on a recognized OS. */
 59 | #if defined(_MT)  &&  !defined(NCBI_WITHOUT_MT)
 60 | #  if defined(NCBI_OS_MSWIN)
 61 | #    define NCBI_WIN32_THREADS
 62 | #  elif defined(NCBI_OS_UNIX)
 63 | #    define NCBI_POSIX_THREADS
 64 | #  else /* _MT requested, but neither NCBI_OS_MSWIN nor NCBI_OS_UNIX is defined */
 65 | #    define NCBI_NO_THREADS
 66 | #  endif
 67 | #else /* !_MT || NCBI_WITHOUT_MT */
 68 | #  define NCBI_NO_THREADS
 69 | #endif
 70 | /* NCBI_THREADS is the model-independent flag: defined whenever NCBI_NO_THREADS is not. */
 71 | #if !defined(NCBI_NO_THREADS)
 72 | #  define NCBI_THREADS
 73 | #endif
 74 | 
 75 | /* Sync Windows/Cygwin preprocessor conditionals governing wide
 76 |  * character usage. */
 77 | /* If only one of UNICODE / _UNICODE is defined, define the other as 1; if both or neither are defined, nothing changes. */
 78 | #if defined(UNICODE)  &&  !defined(_UNICODE)
 79 | #  define _UNICODE 1
 80 | #elif defined(_UNICODE)  &&  !defined(UNICODE)
 81 | #  define UNICODE 1
 82 | #endif
 83 | 
 84 | /* New/nonstandard keywords
 85 |  */
 86 | /* NCBI_RESTRICT: use the language-specific override (NCBI_RESTRICT_CXX / NCBI_RESTRICT_C) if one is defined, else C99 "restrict" for C, else nothing. */
 87 | #if defined(__cplusplus)  &&  defined(NCBI_RESTRICT_CXX)
 88 | #  define NCBI_RESTRICT NCBI_RESTRICT_CXX
 89 | #elif !defined(__cplusplus)  &&  defined(NCBI_RESTRICT_C)
 90 | #  define NCBI_RESTRICT NCBI_RESTRICT_C
 91 | #elif !defined(__cplusplus)  &&  defined(__STDC_VERSION__)  &&  __STDC_VERSION__ >= 199901L /* C99 specifies restrict; it is not a C++ keyword, so never select it for C++ */
 92 | #  define NCBI_RESTRICT restrict
 93 | #else /* C++ without an override, or pre-C99 C: the qualifier is simply dropped */
 94 | #  define NCBI_RESTRICT
 95 | #endif
 96 | 
 97 | #ifndef NCBI_FORCEINLINE /* fallback only: any definition already in effect is kept */
 98 | #  ifdef __cplusplus
 99 | #    define NCBI_FORCEINLINE inline
100 | #  else /* C: expands to nothing */
101 | #    define NCBI_FORCEINLINE
102 | #  endif
103 | #endif /* !NCBI_FORCEINLINE */
104 | 
105 | #ifndef NCBI_NORETURN /* fallback only: any definition already in effect is kept */
106 | #  ifdef __GNUC__
107 | #    define NCBI_NORETURN __attribute__((__noreturn__))
108 | #  else /* compilers that do not define __GNUC__: expands to nothing */
109 | #    define NCBI_NORETURN
110 | #  endif
111 | #endif /* !NCBI_NORETURN */
112 | 
113 | /* Definition of packed enum type, to save some memory.  Intended usage: */
114 | /* enum EMyEnum NCBI_PACKED_ENUM_TYPE(Type) { ... } NCBI_PACKED_ENUM_END(); */
115 | #ifndef NCBI_PACKED_ENUM_TYPE /* fallback: the type argument is discarded and the macro expands to nothing */
116 | #  define NCBI_PACKED_ENUM_TYPE(type)
117 | #endif
118 | #ifndef NCBI_PACKED_ENUM_END /* fallback: append NCBI_PACKED when that macro is defined, otherwise nothing */
119 | #  ifdef NCBI_PACKED
120 | #    define NCBI_PACKED_ENUM_END() NCBI_PACKED
121 | #  else
122 | #    define NCBI_PACKED_ENUM_END()
123 | #  endif
124 | #endif /* !NCBI_PACKED_ENUM_END */
125 | 
126 | #ifndef NCBI_WARN_UNUSED_RESULT /* fallback: expands to nothing unless a definition is already in effect */
127 | #  define NCBI_WARN_UNUSED_RESULT
128 | #endif /* !NCBI_WARN_UNUSED_RESULT */
129 | 
130 | #ifdef __cplusplus /* C++-only feature flags; nothing in this block is defined for C */
131 | #  if __cplusplus >= 201103L || defined(__GXX_EXPERIMENTAL_CXX0X__) \
132 |       || defined(__GXX_EXPERIMENTAL_CPP0X__)
133 | #    define NCBI_HAVE_CXX11 1
134 | #  endif /* C++11 detected via __cplusplus or either __GXX_EXPERIMENTAL_* macro */
135 | #  if defined(NCBI_HAVE_CXX11) \
136 |       ||  (defined(NCBI_COMPILER_MSVC)  &&  _MSC_VER >= 1600)
137 | #    define HAVE_IS_SORTED 1
138 | #    define HAVE_NULLPTR 1
139 | #  endif /* C++11, or MSVC with _MSC_VER >= 1600 */
140 | #  if defined(NCBI_HAVE_CXX11) /* or recent MSVC too? */
141 | #    if !defined(NCBI_COMPILER_ICC)  ||  NCBI_COMPILER_VERSION >= 1400
142 |        /* Exclude ICC 13.x and below, which don't support using "enum class"
143 |         * in conjunction with switch. */
144 | #      define HAVE_ENUM_CLASS 1
145 | #    endif
146 | #  endif /* NCBI_HAVE_CXX11 */
147 | #endif /* __cplusplus */
148 | 
149 | #include <common/ncbi_skew_guard.h>
150 | 
151 | /* @} */
152 | 
153 | #endif  /* COMMON___NCBICONF_IMPL__H */
154 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/connect/connect_export.h:
--------------------------------------------------------------------------------
 1 | #ifndef CONNECT___CONNECT_EXPORT__H
 2 | #define CONNECT___CONNECT_EXPORT__H
 3 | 
 4 | /* $Id: connect_export.h 166398 2009-07-22 15:51:55Z ucko $
 5 |  * ===========================================================================
 6 |  *
 7 |  *                            PUBLIC DOMAIN NOTICE
 8 |  *               National Center for Biotechnology Information
 9 |  *
10 |  *  This software/database is a "United States Government Work" under the
11 |  *  terms of the United States Copyright Act.  It was written as part of
12 |  *  the author's official duties as a United States Government employee and
13 |  *  thus cannot be copyrighted.  This software/database is freely available
14 |  *  to the public for use. The National Library of Medicine and the U.S.
15 |  *  Government have not placed any restriction on its use or reproduction.
16 |  *
17 |  *  Although all reasonable efforts have been taken to ensure the accuracy
18 |  *  and reliability of the software and data, the NLM and the U.S.
19 |  *  Government do not and cannot warrant the performance or results that
20 |  *  may be obtained by using this software or data. The NLM and the U.S.
21 |  *  Government disclaim all warranties, express or implied, including
22 |  *  warranties of performance, merchantability or fitness for any particular
23 |  *  purpose.
24 |  *
25 |  *  Please cite the author in any work or product based on this material.
26 |  *
27 |  * ===========================================================================
28 |  *
29 |  * Author:  Mike DiCuccio
30 |  *
31 |  * File Description:
32 |  *    Defines to provide correct exporting from CONNECT DLL in Windows.
33 |  *    These are necessary to compile DLLs with Visual C++ - exports must be
34 |  *    explicitly labeled as such.
35 |  */
36 | 
37 | 
38 | /** @addtogroup WinDLL
39 |  *
40 |  * @{
41 |  */
42 | 
43 | 
44 | #if defined(WIN32)  &&  defined(NCBI_DLL_BUILD)
45 | /* Branch 1: Windows DLL build.  Only MSVC is accepted here. */
46 | #ifndef _MSC_VER
47 | #  error "This toolkit is not buildable with a compiler other than MSVC."
48 | #endif
49 | 
50 | /* NOTE(review): the pragma below silences three MSVC warnings by number; their exact meaning is not stated here -- check the MSVC documentation. */
51 | /*
52 |  * Dumping ground for Windows-specific stuff
53 |  */
54 | #pragma warning (disable : 4786 4251 4275)
55 | 
56 | /* Building the core DLL (NCBI_CORE_EXPORTS) implies exporting the CONNECT symbols too. */
57 | #ifdef NCBI_CORE_EXPORTS
58 | #  define NCBI_XCONNECT_EXPORTS
59 | #endif
60 | 
61 | /* Export when NCBI_XCONNECT_EXPORTS is defined, import otherwise. */
62 | #ifdef NCBI_XCONNECT_EXPORTS
63 | #  define NCBI_XCONNECT_EXPORT      __declspec(dllexport)
64 | #else
65 | #  define NCBI_XCONNECT_EXPORT      __declspec(dllimport)
66 | #endif
67 | 
68 | 
69 | #elif defined(__GNUC__)  &&  __GNUC__ >= 4
70 | /* Branch 2: GCC-compatible compilers, version 4 or later: mark the symbol with default visibility. */
71 | #  define NCBI_XCONNECT_EXPORT      __attribute__((visibility("default")))
72 | 
73 | #else
74 | 
75 | /*
76 |  * NULL operations for other cases
77 |  */
78 | /* Branch 3: everything else, including Windows builds without NCBI_DLL_BUILD on non-GCC compilers: the macro expands to nothing. */
79 | #  define NCBI_XCONNECT_EXPORT
80 | 
81 | 
82 | #endif /* WIN32 && NCBI_DLL_BUILD / __GNUC__ >= 4 / other */
83 | 
84 | 
85 | /* @} */
86 | 
87 | #endif  /*  CONNECT___CONNECT_EXPORT__H  */
88 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/ncbiconf.h:
--------------------------------------------------------------------------------
 1 | #ifndef FORWARDING_NCBICONF_H
 2 | #define FORWARDING_NCBICONF_H
 3 | 
 4 | /*  $Id: ncbiconf.h 485953 2015-11-30 17:53:34Z blastadm $
 5 |  * ===========================================================================
 6 |  *
 7 |  *                            PUBLIC DOMAIN NOTICE
 8 |  *               National Center for Biotechnology Information
 9 |  *
10 |  *  This software/database is a "United States Government Work" under the
11 |  *  terms of the United States Copyright Act.  It was written as part of
12 |  *  the author's official duties as a United States Government employee and
13 |  *  thus cannot be copyrighted.  This software/database is freely available
14 |  *  to the public for use. The National Library of Medicine and the U.S.
15 |  *  Government have not placed any restriction on its use or reproduction.
16 |  *
17 |  *  Although all reasonable efforts have been taken to ensure the accuracy
18 |  *  and reliability of the software and data, the NLM and the U.S.
19 |  *  Government do not and cannot warrant the performance or results that
20 |  *  may be obtained by using this software or data. The NLM and the U.S.
21 |  *  Government disclaim all warranties, express or implied, including
22 |  *  warranties of performance, merchantability or fitness for any particular
23 |  *  purpose.
24 |  *
25 |  *  Please cite the author in any work or product based on this material.
26 |  *
27 |  * ===========================================================================
28 |  *
29 |  * Authors: Denis Vakatov, Aaron Ucko
30 |  *
31 |  */
32 | 
33 | /** @file ncbiconf.h
34 |  ** Front end for a platform-specific configuration summary.
35 |  **/
36 | 
37 | #ifdef _MSC_VER /* pick exactly one platform configuration header: MSVC, Xcode, or the Unix fallback */
38 | #  include <common/config/ncbiconf_msvc.h>
39 | #elif defined(NCBI_XCODE_BUILD)
40 | #  include <common/config/ncbiconf_xcode.h>
41 | #else /* neither MSVC nor an Xcode build; note this header is looked up without the common/config/ prefix */
42 | #include <ncbiconf_unix.h>
43 | #endif
44 | 
45 | #ifdef NCBI_UNIVERSAL_BUILD
46 | /* sort out the remaining details */
47 | #  include <common/config/ncbiconf_universal.h>
48 | #endif
49 | /* Included last and only from here: ncbiconf_impl.h errors out unless FORWARDING_NCBICONF_H is defined, and it tests macros such as NCBI_OS_UNIX (presumably set by the platform header chosen above). */
50 | #include <common/ncbiconf_impl.h>
51 | 
52 | #endif  /* FORWARDING_NCBICONF_H */
53 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/util/tables/sm_blosum45.c:
--------------------------------------------------------------------------------
 1 | /*  $Id: sm_blosum45.c 90506 2006-09-25 19:30:59Z madden $
 2 | * ===========================================================================
 3 | *
 4 | *                            PUBLIC DOMAIN NOTICE
 5 | *               National Center for Biotechnology Information
 6 | *
 7 | *  This software/database is a "United States Government Work" under the
 8 | *  terms of the United States Copyright Act.  It was written as part of
 9 | *  the author's official duties as a United States Government employee and
10 | *  thus cannot be copyrighted.  This software/database is freely available
11 | *  to the public for use. The National Library of Medicine and the U.S.
12 | *  Government have not placed any restriction on its use or reproduction.
13 | *
14 | *  Although all reasonable efforts have been taken to ensure the accuracy
15 | *  and reliability of the software and data, the NLM and the U.S.
16 | *  Government do not and cannot warrant the performance or results that
17 | *  may be obtained by using this software or data. The NLM and the U.S.
18 | *  Government disclaim all warranties, express or implied, including
19 | *  warranties of performance, merchantability or fitness for any particular
20 | *  purpose.
21 | *
22 | *  Please cite the author in any work or product based on this material.
23 | *
24 | * ===========================================================================
25 | *
26 | * Author:  Aaron Ucko, Mike Gertz
27 | *
28 | * File Description:
29 | *   Protein alignment score matrices; shared between the two toolkits.
30 | *
31 | * ===========================================================================
32 | */
33 | 
34 | #include <util/tables/raw_scoremat.h>
35 | 
36 | /** Entries for the BLOSUM45 matrix at a scale of ln(2)/3.0.
37 |  *  Flat 25 x 25 table: one row per labelled residue (13 + 12 scores over two source lines), columns in the order listed in the first comment below. */
38 | static const TNCBIScore s_Blosum45PSM[25 * 25] = {
39 |     /*       A,  R,  N,  D,  C,  Q,  E,  G,  H,  I,  L,  K,  M,
40 |              F,  P,  S,  T,  W,  Y,  V,  B,  J,  Z,  X,  *        */ 
41 |     /*A*/    5, -2, -1, -2, -1, -1, -1,  0, -2, -1, -1, -1, -1,
42 |             -2, -1,  1,  0, -2, -2,  0, -1, -1, -1, -1, -5,
43 |     /*R*/   -2,  7,  0, -1, -3,  1,  0, -2,  0, -3, -2,  3, -1,
44 |             -2, -2, -1, -1, -2, -1, -2, -1, -3,  1, -1, -5,
45 |     /*N*/   -1,  0,  6,  2, -2,  0,  0,  0,  1, -2, -3,  0, -2,
46 |             -2, -2,  1,  0, -4, -2, -3,  5, -3,  0, -1, -5,
47 |     /*D*/   -2, -1,  2,  7, -3,  0,  2, -1,  0, -4, -3,  0, -3,
48 |             -4, -1,  0, -1, -4, -2, -3,  6, -3,  1, -1, -5,
49 |     /*C*/   -1, -3, -2, -3, 12, -3, -3, -3, -3, -3, -2, -3, -2,
50 |             -2, -4, -1, -1, -5, -3, -1, -2, -2, -3, -1, -5,
51 |     /*Q*/   -1,  1,  0,  0, -3,  6,  2, -2,  1, -2, -2,  1,  0,
52 |             -4, -1,  0, -1, -2, -1, -3,  0, -2,  4, -1, -5,
53 |     /*E*/   -1,  0,  0,  2, -3,  2,  6, -2,  0, -3, -2,  1, -2,
54 |             -3,  0,  0, -1, -3, -2, -3,  1, -3,  5, -1, -5,
55 |     /*G*/    0, -2,  0, -1, -3, -2, -2,  7, -2, -4, -3, -2, -2,
56 |             -3, -2,  0, -2, -2, -3, -3, -1, -4, -2, -1, -5,
57 |     /*H*/   -2,  0,  1,  0, -3,  1,  0, -2, 10, -3, -2, -1,  0,
58 |             -2, -2, -1, -2, -3,  2, -3,  0, -2,  0, -1, -5,
59 |     /*I*/   -1, -3, -2, -4, -3, -2, -3, -4, -3,  5,  2, -3,  2,
60 |              0, -2, -2, -1, -2,  0,  3, -3,  4, -3, -1, -5,
61 |     /*L*/   -1, -2, -3, -3, -2, -2, -2, -3, -2,  2,  5, -3,  2,
62 |              1, -3, -3, -1, -2,  0,  1, -3,  4, -2, -1, -5,
63 |     /*K*/   -1,  3,  0,  0, -3,  1,  1, -2, -1, -3, -3,  5, -1,
64 |             -3, -1, -1, -1, -2, -1, -2,  0, -3,  1, -1, -5,
65 |     /*M*/   -1, -1, -2, -3, -2,  0, -2, -2,  0,  2,  2, -1,  6,
66 |              0, -2, -2, -1, -2,  0,  1, -2,  2, -1, -1, -5,
67 |     /*F*/   -2, -2, -2, -4, -2, -4, -3, -3, -2,  0,  1, -3,  0,
68 |              8, -3, -2, -1,  1,  3,  0, -3,  1, -3, -1, -5,
69 |     /*P*/   -1, -2, -2, -1, -4, -1,  0, -2, -2, -2, -3, -1, -2,
70 |             -3,  9, -1, -1, -3, -3, -3, -2, -3, -1, -1, -5,
71 |     /*S*/    1, -1,  1,  0, -1,  0,  0,  0, -1, -2, -3, -1, -2,
72 |             -2, -1,  4,  2, -4, -2, -1,  0, -2,  0, -1, -5,
73 |     /*T*/    0, -1,  0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1,
74 |             -1, -1,  2,  5, -3, -1,  0,  0, -1, -1, -1, -5,
75 |     /*W*/   -2, -2, -4, -4, -5, -2, -3, -2, -3, -2, -2, -2, -2,
76 |              1, -3, -4, -3, 15,  3, -3, -4, -2, -2, -1, -5,
77 |     /*Y*/   -2, -1, -2, -2, -3, -1, -2, -3,  2,  0,  0, -1,  0,
78 |              3, -3, -2, -1,  3,  8, -1, -2,  0, -2, -1, -5,
79 |     /*V*/    0, -2, -3, -3, -1, -3, -3, -3, -3,  3,  1, -2,  1,
80 |              0, -3, -1,  0, -3, -1,  5, -3,  2, -3, -1, -5,
81 |     /*B*/   -1, -1,  5,  6, -2,  0,  1, -1,  0, -3, -3,  0, -2,
82 |             -3, -2,  0,  0, -4, -2, -3,  5, -3,  1, -1, -5,
83 |     /*J*/   -1, -3, -3, -3, -2, -2, -3, -4, -2,  4,  4, -3,  2,
84 |              1, -3, -2, -1, -2,  0,  2, -3,  4, -2, -1, -5,
85 |     /*Z*/   -1,  1,  0,  1, -3,  4,  5, -2,  0, -3, -2,  1, -1,
86 |             -3, -1,  0, -1, -2, -2, -3,  1, -2,  5, -1, -5,
87 |     /*X*/   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
88 |             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -5,
89 |     /***/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
90 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,  1
91 | };
92 | const SNCBIPackedScoreMatrix NCBISM_Blosum45 = { /* externally visible BLOSUM45 descriptor; field roles below are inferred -- confirm against raw_scoremat.h */
93 |     "ARNDCQEGHILKMFPSTWYVBJZX*",  /* the 25 residue symbols, in the same order as the rows/columns of s_Blosum45PSM */
94 |     s_Blosum45PSM,                /* the flat 25 x 25 score table */
95 |     -5                            /* equals the score each other residue gets against '*' in the table; presumably the default for unlisted symbols -- TODO confirm */
96 | };
97 | 
98 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/util/tables/sm_blosum50.c:
--------------------------------------------------------------------------------
 1 | /*  $Id: sm_blosum50.c 90507 2006-09-25 19:31:51Z madden $
 2 | * ===========================================================================
 3 | *
 4 | *                            PUBLIC DOMAIN NOTICE
 5 | *               National Center for Biotechnology Information
 6 | *
 7 | *  This software/database is a "United States Government Work" under the
 8 | *  terms of the United States Copyright Act.  It was written as part of
 9 | *  the author's official duties as a United States Government employee and
10 | *  thus cannot be copyrighted.  This software/database is freely available
11 | *  to the public for use. The National Library of Medicine and the U.S.
12 | *  Government have not placed any restriction on its use or reproduction.
13 | *
14 | *  Although all reasonable efforts have been taken to ensure the accuracy
15 | *  and reliability of the software and data, the NLM and the U.S.
16 | *  Government do not and cannot warrant the performance or results that
17 | *  may be obtained by using this software or data. The NLM and the U.S.
18 | *  Government disclaim all warranties, express or implied, including
19 | *  warranties of performance, merchantability or fitness for any particular
20 | *  purpose.
21 | *
22 | *  Please cite the author in any work or product based on this material.
23 | *
24 | * ===========================================================================
25 | *
26 | * Author:  Aaron Ucko, Mike Gertz
27 | *
28 | * File Description:
29 | *   Protein alignment score matrices; shared between the two toolkits.
30 | *
31 | * ===========================================================================
32 | */
33 | 
34 | #include <util/tables/raw_scoremat.h>
35 | 
36 | /** Entries for the BLOSUM50 matrix at a scale of ln(2)/3.0.
37 |  *  Flat 25 x 25 table: one row per labelled residue (13 + 12 scores over two source lines), columns in the order listed in the first comment below. */
38 | static const TNCBIScore s_Blosum50PSM[25 * 25] = {
39 |     /*       A,  R,  N,  D,  C,  Q,  E,  G,  H,  I,  L,  K,  M,
40 |              F,  P,  S,  T,  W,  Y,  V,  B,  J,  Z,  X,  *        */ 
41 |     /*A*/    5, -2, -1, -2, -1, -1, -1,  0, -2, -1, -2, -1, -1,
42 |             -3, -1,  1,  0, -3, -2,  0, -2, -2, -1, -1, -5,
43 |     /*R*/   -2,  7, -1, -2, -4,  1,  0, -3,  0, -4, -3,  3, -2,
44 |             -3, -3, -1, -1, -3, -1, -3, -1, -3,  0, -1, -5,
45 |     /*N*/   -1, -1,  7,  2, -2,  0,  0,  0,  1, -3, -4,  0, -2,
46 |             -4, -2,  1,  0, -4, -2, -3,  5, -4,  0, -1, -5,
47 |     /*D*/   -2, -2,  2,  8, -4,  0,  2, -1, -1, -4, -4, -1, -4,
48 |             -5, -1,  0, -1, -5, -3, -4,  6, -4,  1, -1, -5,
49 |     /*C*/   -1, -4, -2, -4, 13, -3, -3, -3, -3, -2, -2, -3, -2,
50 |             -2, -4, -1, -1, -5, -3, -1, -3, -2, -3, -1, -5,
51 |     /*Q*/   -1,  1,  0,  0, -3,  7,  2, -2,  1, -3, -2,  2,  0,
52 |             -4, -1,  0, -1, -1, -1, -3,  0, -3,  4, -1, -5,
53 |     /*E*/   -1,  0,  0,  2, -3,  2,  6, -3,  0, -4, -3,  1, -2,
54 |             -3, -1, -1, -1, -3, -2, -3,  1, -3,  5, -1, -5,
55 |     /*G*/    0, -3,  0, -1, -3, -2, -3,  8, -2, -4, -4, -2, -3,
56 |             -4, -2,  0, -2, -3, -3, -4, -1, -4, -2, -1, -5,
57 |     /*H*/   -2,  0,  1, -1, -3,  1,  0, -2, 10, -4, -3,  0, -1,
58 |             -1, -2, -1, -2, -3,  2, -4,  0, -3,  0, -1, -5,
59 |     /*I*/   -1, -4, -3, -4, -2, -3, -4, -4, -4,  5,  2, -3,  2,
60 |              0, -3, -3, -1, -3, -1,  4, -4,  4, -3, -1, -5,
61 |     /*L*/   -2, -3, -4, -4, -2, -2, -3, -4, -3,  2,  5, -3,  3,
62 |              1, -4, -3, -1, -2, -1,  1, -4,  4, -3, -1, -5,
63 |     /*K*/   -1,  3,  0, -1, -3,  2,  1, -2,  0, -3, -3,  6, -2,
64 |             -4, -1,  0, -1, -3, -2, -3,  0, -3,  1, -1, -5,
65 |     /*M*/   -1, -2, -2, -4, -2,  0, -2, -3, -1,  2,  3, -2,  7,
66 |              0, -3, -2, -1, -1,  0,  1, -3,  2, -1, -1, -5,
67 |     /*F*/   -3, -3, -4, -5, -2, -4, -3, -4, -1,  0,  1, -4,  0,
68 |              8, -4, -3, -2,  1,  4, -1, -4,  1, -4, -1, -5,
69 |     /*P*/   -1, -3, -2, -1, -4, -1, -1, -2, -2, -3, -4, -1, -3,
70 |             -4, 10, -1, -1, -4, -3, -3, -2, -3, -1, -1, -5,
71 |     /*S*/    1, -1,  1,  0, -1,  0, -1,  0, -1, -3, -3,  0, -2,
72 |             -3, -1,  5,  2, -4, -2, -2,  0, -3,  0, -1, -5,
73 |     /*T*/    0, -1,  0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1,
74 |             -2, -1,  2,  5, -3, -2,  0,  0, -1, -1, -1, -5,
75 |     /*W*/   -3, -3, -4, -5, -5, -1, -3, -3, -3, -3, -2, -3, -1,
76 |              1, -4, -4, -3, 15,  2, -3, -5, -2, -2, -1, -5,
77 |     /*Y*/   -2, -1, -2, -3, -3, -1, -2, -3,  2, -1, -1, -2,  0,
78 |              4, -3, -2, -2,  2,  8, -1, -3, -1, -2, -1, -5,
79 |     /*V*/    0, -3, -3, -4, -1, -3, -3, -4, -4,  4,  1, -3,  1,
80 |             -1, -3, -2,  0, -3, -1,  5, -3,  2, -3, -1, -5,
81 |     /*B*/   -2, -1,  5,  6, -3,  0,  1, -1,  0, -4, -4,  0, -3,
82 |             -4, -2,  0,  0, -5, -3, -3,  6, -4,  1, -1, -5,
83 |     /*J*/   -2, -3, -4, -4, -2, -3, -3, -4, -3,  4,  4, -3,  2,
84 |              1, -3, -3, -1, -2, -1,  2, -4,  4, -3, -1, -5,
85 |     /*Z*/   -1,  0,  0,  1, -3,  4,  5, -2,  0, -3, -3,  1, -1,
86 |             -4, -1,  0, -1, -2, -2, -3,  1, -3,  5, -1, -5,
87 |     /*X*/   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
88 |             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -5,
89 |     /***/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
90 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,  1
91 | };
92 | const SNCBIPackedScoreMatrix NCBISM_Blosum50 = { /* externally visible BLOSUM50 descriptor; field roles below are inferred -- confirm against raw_scoremat.h */
93 |     "ARNDCQEGHILKMFPSTWYVBJZX*",  /* the 25 residue symbols, in the same order as the rows/columns of s_Blosum50PSM */
94 |     s_Blosum50PSM,                /* the flat 25 x 25 score table */
95 |     -5                            /* equals the score each other residue gets against '*' in the table; presumably the default for unlisted symbols -- TODO confirm */
96 | };
97 | 
98 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/util/tables/sm_blosum62.c:
--------------------------------------------------------------------------------
 1 | /*  $Id: sm_blosum62.c 90506 2006-09-25 19:30:59Z madden $
 2 | * ===========================================================================
 3 | *
 4 | *                            PUBLIC DOMAIN NOTICE
 5 | *               National Center for Biotechnology Information
 6 | *
 7 | *  This software/database is a "United States Government Work" under the
 8 | *  terms of the United States Copyright Act.  It was written as part of
 9 | *  the author's official duties as a United States Government employee and
10 | *  thus cannot be copyrighted.  This software/database is freely available
11 | *  to the public for use. The National Library of Medicine and the U.S.
12 | *  Government have not placed any restriction on its use or reproduction.
13 | *
14 | *  Although all reasonable efforts have been taken to ensure the accuracy
15 | *  and reliability of the software and data, the NLM and the U.S.
16 | *  Government do not and cannot warrant the performance or results that
17 | *  may be obtained by using this software or data. The NLM and the U.S.
18 | *  Government disclaim all warranties, express or implied, including
19 | *  warranties of performance, merchantability or fitness for any particular
20 | *  purpose.
21 | *
22 | *  Please cite the author in any work or product based on this material.
23 | *
24 | * ===========================================================================
25 | *
26 | * Author:  Aaron Ucko, Mike Gertz
27 | *
28 | * File Description:
29 | *   Protein alignment score matrices; shared between the two toolkits.
30 | *
31 | * ===========================================================================
32 | */
33 | 
34 | #include <util/tables/raw_scoremat.h>
35 | 
36 | /** Entries for the BLOSUM62 matrix at a scale of ln(2)/2.0.
37 |  *  Flat 25 x 25 table: one row per labelled residue (13 + 12 scores over two source lines), columns in the order listed in the first comment below. */
38 | static const TNCBIScore s_Blosum62PSM[25 * 25] = {
39 |     /*       A,  R,  N,  D,  C,  Q,  E,  G,  H,  I,  L,  K,  M,
40 |              F,  P,  S,  T,  W,  Y,  V,  B,  J,  Z,  X,  *        */ 
41 |     /*A*/    4, -1, -2, -2,  0, -1, -1,  0, -2, -1, -1, -1, -1,
42 |             -2, -1,  1,  0, -3, -2,  0, -2, -1, -1, -1, -4,
43 |     /*R*/   -1,  5,  0, -2, -3,  1,  0, -2,  0, -3, -2,  2, -1,
44 |             -3, -2, -1, -1, -3, -2, -3, -1, -2,  0, -1, -4,
45 |     /*N*/   -2,  0,  6,  1, -3,  0,  0,  0,  1, -3, -3,  0, -2,
46 |             -3, -2,  1,  0, -4, -2, -3,  4, -3,  0, -1, -4,
47 |     /*D*/   -2, -2,  1,  6, -3,  0,  2, -1, -1, -3, -4, -1, -3,
48 |             -3, -1,  0, -1, -4, -3, -3,  4, -3,  1, -1, -4,
49 |     /*C*/    0, -3, -3, -3,  9, -3, -4, -3, -3, -1, -1, -3, -1,
50 |             -2, -3, -1, -1, -2, -2, -1, -3, -1, -3, -1, -4,
51 |     /*Q*/   -1,  1,  0,  0, -3,  5,  2, -2,  0, -3, -2,  1,  0,
52 |             -3, -1,  0, -1, -2, -1, -2,  0, -2,  4, -1, -4,
53 |     /*E*/   -1,  0,  0,  2, -4,  2,  5, -2,  0, -3, -3,  1, -2,
54 |             -3, -1,  0, -1, -3, -2, -2,  1, -3,  4, -1, -4,
55 |     /*G*/    0, -2,  0, -1, -3, -2, -2,  6, -2, -4, -4, -2, -3,
56 |             -3, -2,  0, -2, -2, -3, -3, -1, -4, -2, -1, -4,
57 |     /*H*/   -2,  0,  1, -1, -3,  0,  0, -2,  8, -3, -3, -1, -2,
58 |             -1, -2, -1, -2, -2,  2, -3,  0, -3,  0, -1, -4,
59 |     /*I*/   -1, -3, -3, -3, -1, -3, -3, -4, -3,  4,  2, -3,  1,
60 |              0, -3, -2, -1, -3, -1,  3, -3,  3, -3, -1, -4,
61 |     /*L*/   -1, -2, -3, -4, -1, -2, -3, -4, -3,  2,  4, -2,  2,
62 |              0, -3, -2, -1, -2, -1,  1, -4,  3, -3, -1, -4,
63 |     /*K*/   -1,  2,  0, -1, -3,  1,  1, -2, -1, -3, -2,  5, -1,
64 |             -3, -1,  0, -1, -3, -2, -2,  0, -3,  1, -1, -4,
65 |     /*M*/   -1, -1, -2, -3, -1,  0, -2, -3, -2,  1,  2, -1,  5,
66 |              0, -2, -1, -1, -1, -1,  1, -3,  2, -1, -1, -4,
67 |     /*F*/   -2, -3, -3, -3, -2, -3, -3, -3, -1,  0,  0, -3,  0,
68 |              6, -4, -2, -2,  1,  3, -1, -3,  0, -3, -1, -4,
69 |     /*P*/   -1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2,
70 |             -4,  7, -1, -1, -4, -3, -2, -2, -3, -1, -1, -4,
71 |     /*S*/    1, -1,  1,  0, -1,  0,  0,  0, -1, -2, -2,  0, -1,
72 |             -2, -1,  4,  1, -3, -2, -2,  0, -2,  0, -1, -4,
73 |     /*T*/    0, -1,  0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1,
74 |             -2, -1,  1,  5, -2, -2,  0, -1, -1, -1, -1, -4,
75 |     /*W*/   -3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1,
76 |              1, -4, -3, -2, 11,  2, -3, -4, -2, -2, -1, -4,
77 |     /*Y*/   -2, -2, -2, -3, -2, -1, -2, -3,  2, -1, -1, -2, -1,
78 |              3, -3, -2, -2,  2,  7, -1, -3, -1, -2, -1, -4,
79 |     /*V*/    0, -3, -3, -3, -1, -2, -2, -3, -3,  3,  1, -2,  1,
80 |             -1, -2, -2,  0, -3, -1,  4, -3,  2, -2, -1, -4,
81 |     /*B*/   -2, -1,  4,  4, -3,  0,  1, -1,  0, -3, -4,  0, -3,
82 |             -3, -2,  0, -1, -4, -3, -3,  4, -3,  0, -1, -4,
83 |     /*J*/   -1, -2, -3, -3, -1, -2, -3, -4, -3,  3,  3, -3,  2,
84 |              0, -3, -2, -1, -2, -1,  2, -3,  3, -3, -1, -4,
85 |     /*Z*/   -1,  0,  0,  1, -3,  4,  4, -2,  0, -3, -3,  1, -1,
86 |             -3, -1,  0, -1, -2, -2, -2,  0, -3,  4, -1, -4,
87 |     /*X*/   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
88 |             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -4,
89 |     /***/   -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
90 |             -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,  1
91 | };
92 | const SNCBIPackedScoreMatrix NCBISM_Blosum62 = { /* externally visible BLOSUM62 descriptor; field roles below are inferred -- confirm against raw_scoremat.h */
93 |     "ARNDCQEGHILKMFPSTWYVBJZX*",  /* the 25 residue symbols, in the same order as the rows/columns of s_Blosum62PSM */
94 |     s_Blosum62PSM,                /* the flat 25 x 25 score table */
95 |     -4                            /* equals the score each other residue gets against '*' in the table; presumably the default for unlisted symbols -- TODO confirm */
96 | };
97 | 
98 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/util/tables/sm_identity.c:
--------------------------------------------------------------------------------
 1 | /*  $Id: sm_identity.c 458581 2015-02-06 15:18:12Z boratyng $
 2 | * ===========================================================================
 3 | *
 4 | *                            PUBLIC DOMAIN NOTICE
 5 | *               National Center for Biotechnology Information
 6 | *
 7 | *  This software/database is a "United States Government Work" under the
 8 | *  terms of the United States Copyright Act.  It was written as part of
 9 | *  the author's official duties as a United States Government employee and
10 | *  thus cannot be copyrighted.  This software/database is freely available
11 | *  to the public for use. The National Library of Medicine and the U.S.
12 | *  Government have not placed any restriction on its use or reproduction.
13 | *
14 | *  Although all reasonable efforts have been taken to ensure the accuracy
15 | *  and reliability of the software and data, the NLM and the U.S.
16 | *  Government do not and cannot warrant the performance or results that
17 | *  may be obtained by using this software or data. The NLM and the U.S.
18 | *  Government disclaim all warranties, express or implied, including
19 | *  warranties of performance, merchantability or fitness for any particular
20 | *  purpose.
21 | *
22 | *  Please cite the author in any work or product based on this material.
23 | *
24 | * ===========================================================================
25 | *
26 | * Author:  Greg Boratyn
27 | *
28 | * File Description:
29 | *   Protein alignment score matrices; shared between the two toolkits.
30 | *
31 | * ===========================================================================
32 | */
33 | 
34 | #include <util/tables/raw_scoremat.h>
35 | 
36 | /** Entries for the IDENTITY matrix: 9 on the diagonal (except X/X, which is -5), -5 everywhere else. */
37 | /* Row-major 25 x 25 table; every row is written as two source lines of 13 and 12 entries. */
38 | static const TNCBIScore s_IdentityPSM[25 * 25] = {
39 |     /*       A,  R,  N,  D,  C,  Q,  E,  G,  H,  I,  L,  K,  M,
40 |              F,  P,  S,  T,  W,  Y,  V,  B,  J,  Z,  X,  *        */ 
41 |     /*A*/    9, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
42 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
43 |     /*R*/   -5,  9, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
44 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
45 |     /*N*/   -5, -5,  9, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
46 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
47 |     /*D*/   -5, -5, -5,  9, -5, -5, -5, -5, -5, -5, -5, -5, -5,
48 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
49 |     /*C*/   -5, -5, -5, -5,  9, -5, -5, -5, -5, -5, -5, -5, -5,
50 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
51 |     /*Q*/   -5, -5, -5, -5, -5,  9, -5, -5, -5, -5, -5, -5, -5,
52 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
53 |     /*E*/   -5, -5, -5, -5, -5, -5,  9, -5, -5, -5, -5, -5, -5,
54 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
55 |     /*G*/   -5, -5, -5, -5, -5, -5, -5,  9, -5, -5, -5, -5, -5,
56 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
57 |     /*H*/   -5, -5, -5, -5, -5, -5, -5, -5,  9, -5, -5, -5, -5,
58 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
59 |     /*I*/   -5, -5, -5, -5, -5, -5, -5, -5, -5,  9, -5, -5, -5,
60 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
61 |     /*L*/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,  9, -5, -5,
62 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
63 |     /*K*/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,  9, -5,
64 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
65 |     /*M*/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,  9,
66 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
67 |     /*F*/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
68 |              9, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
69 |     /*P*/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
70 |             -5,  9, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
71 |     /*S*/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
72 |             -5, -5,  9, -5, -5, -5, -5, -5, -5, -5, -5, -5,
73 |     /*T*/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
74 |             -5, -5, -5,  9, -5, -5, -5, -5, -5, -5, -5, -5,
75 |     /*W*/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
76 |             -5, -5, -5, -5,  9, -5, -5, -5, -5, -5, -5, -5,
77 |     /*Y*/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
78 |             -5, -5, -5, -5, -5,  9, -5, -5, -5, -5, -5, -5,
79 |     /*V*/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
80 |             -5, -5, -5, -5, -5, -5,  9, -5, -5, -5, -5, -5,
81 |     /*B*/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
82 |             -5, -5, -5, -5, -5, -5, -5,  9, -5, -5, -5, -5,
83 |     /*J*/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
84 |             -5, -5, -5, -5, -5, -5, -5, -5,  9, -5, -5, -5,
85 |     /*Z*/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
86 |             -5, -5, -5, -5, -5, -5, -5, -5, -5,  9, -5, -5,
87 |     /*X*/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
88 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
89 |     /***/   -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
90 |             -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,  9
91 | };
92 | const SNCBIPackedScoreMatrix NCBISM_Identity = {
93 |     "ARNDCQEGHILKMFPSTWYVBJZX*",  /* symbol order of the rows and columns above */
94 |     s_IdentityPSM,
95 |     -5  /* NOTE(review): presumably the score for symbols missing from the list -- confirm in raw_scoremat.h */
96 | };
97 | 
98 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/util/tables/sm_pam250.c:
--------------------------------------------------------------------------------
 1 | /*  $Id: sm_pam250.c 90506 2006-09-25 19:30:59Z madden $
 2 | * ===========================================================================
 3 | *
 4 | *                            PUBLIC DOMAIN NOTICE
 5 | *               National Center for Biotechnology Information
 6 | *
 7 | *  This software/database is a "United States Government Work" under the
 8 | *  terms of the United States Copyright Act.  It was written as part of
 9 | *  the author's official duties as a United States Government employee and
10 | *  thus cannot be copyrighted.  This software/database is freely available
11 | *  to the public for use. The National Library of Medicine and the U.S.
12 | *  Government have not placed any restriction on its use or reproduction.
13 | *
14 | *  Although all reasonable efforts have been taken to ensure the accuracy
15 | *  and reliability of the software and data, the NLM and the U.S.
16 | *  Government do not and cannot warrant the performance or results that
17 | *  may be obtained by using this software or data. The NLM and the U.S.
18 | *  Government disclaim all warranties, express or implied, including
19 | *  warranties of performance, merchantability or fitness for any particular
20 | *  purpose.
21 | *
22 | *  Please cite the author in any work or product based on this material.
23 | *
24 | * ===========================================================================
25 | *
26 | * Author:  Aaron Ucko, Mike Gertz
27 | *
28 | * File Description:
29 | *   Protein alignment score matrices; shared between the two toolkits.
30 | *
31 | * ===========================================================================
32 | */
33 | 
34 | #include <util/tables/raw_scoremat.h>
35 | 
36 | /** Entries for the PAM250 matrix at a scale of ln(2)/3.0. */
37 | /* Row-major 25 x 25 table, two source lines per row; X scores -1 against everything but '*', and '*' scores -8 against every other symbol, 1 against itself. */
38 | static const TNCBIScore s_Pam250PSM[25 * 25] = {
39 |     /*       A,  R,  N,  D,  C,  Q,  E,  G,  H,  I,  L,  K,  M,
40 |              F,  P,  S,  T,  W,  Y,  V,  B,  J,  Z,  X,  *        */ 
41 |     /*A*/    2, -2,  0,  0, -2,  0,  0,  1, -1, -1, -2, -1, -1,
42 |             -3,  1,  1,  1, -6, -3,  0,  0, -1,  0, -1, -8,
43 |     /*R*/   -2,  6,  0, -1, -4,  1, -1, -3,  2, -2, -3,  3,  0,
44 |             -4,  0,  0, -1,  2, -4, -2, -1, -3,  0, -1, -8,
45 |     /*N*/    0,  0,  2,  2, -4,  1,  1,  0,  2, -2, -3,  1, -2,
46 |             -3,  0,  1,  0, -4, -2, -2,  2, -3,  1, -1, -8,
47 |     /*D*/    0, -1,  2,  4, -5,  2,  3,  1,  1, -2, -4,  0, -3,
48 |             -6, -1,  0,  0, -7, -4, -2,  3, -3,  3, -1, -8,
49 |     /*C*/   -2, -4, -4, -5, 12, -5, -5, -3, -3, -2, -6, -5, -5,
50 |             -4, -3,  0, -2, -8,  0, -2, -4, -5, -5, -1, -8,
51 |     /*Q*/    0,  1,  1,  2, -5,  4,  2, -1,  3, -2, -2,  1, -1,
52 |             -5,  0, -1, -1, -5, -4, -2,  1, -2,  3, -1, -8,
53 |     /*E*/    0, -1,  1,  3, -5,  2,  4,  0,  1, -2, -3,  0, -2,
54 |             -5, -1,  0,  0, -7, -4, -2,  3, -3,  3, -1, -8,
55 |     /*G*/    1, -3,  0,  1, -3, -1,  0,  5, -2, -3, -4, -2, -3,
56 |             -5,  0,  1,  0, -7, -5, -1,  0, -4,  0, -1, -8,
57 |     /*H*/   -1,  2,  2,  1, -3,  3,  1, -2,  6, -2, -2,  0, -2,
58 |             -2,  0, -1, -1, -3,  0, -2,  1, -2,  2, -1, -8,
59 |     /*I*/   -1, -2, -2, -2, -2, -2, -2, -3, -2,  5,  2, -2,  2,
60 |              1, -2, -1,  0, -5, -1,  4, -2,  3, -2, -1, -8,
61 |     /*L*/   -2, -3, -3, -4, -6, -2, -3, -4, -2,  2,  6, -3,  4,
62 |              2, -3, -3, -2, -2, -1,  2, -3,  5, -3, -1, -8,
63 |     /*K*/   -1,  3,  1,  0, -5,  1,  0, -2,  0, -2, -3,  5,  0,
64 |             -5, -1,  0,  0, -3, -4, -2,  1, -3,  0, -1, -8,
65 |     /*M*/   -1,  0, -2, -3, -5, -1, -2, -3, -2,  2,  4,  0,  6,
66 |              0, -2, -2, -1, -4, -2,  2, -2,  3, -2, -1, -8,
67 |     /*F*/   -3, -4, -3, -6, -4, -5, -5, -5, -2,  1,  2, -5,  0,
68 |              9, -5, -3, -3,  0,  7, -1, -4,  2, -5, -1, -8,
69 |     /*P*/    1,  0,  0, -1, -3,  0, -1,  0,  0, -2, -3, -1, -2,
70 |             -5,  6,  1,  0, -6, -5, -1, -1, -2,  0, -1, -8,
71 |     /*S*/    1,  0,  1,  0,  0, -1,  0,  1, -1, -1, -3,  0, -2,
72 |             -3,  1,  2,  1, -2, -3, -1,  0, -2,  0, -1, -8,
73 |     /*T*/    1, -1,  0,  0, -2, -1,  0,  0, -1,  0, -2,  0, -1,
74 |             -3,  0,  1,  3, -5, -3,  0,  0, -1, -1, -1, -8,
75 |     /*W*/   -6,  2, -4, -7, -8, -5, -7, -7, -3, -5, -2, -3, -4,
76 |              0, -6, -2, -5, 17,  0, -6, -5, -3, -6, -1, -8,
77 |     /*Y*/   -3, -4, -2, -4,  0, -4, -4, -5,  0, -1, -1, -4, -2,
78 |              7, -5, -3, -3,  0, 10, -2, -3, -1, -4, -1, -8,
79 |     /*V*/    0, -2, -2, -2, -2, -2, -2, -1, -2,  4,  2, -2,  2,
80 |             -1, -1, -1,  0, -6, -2,  4, -2,  2, -2, -1, -8,
81 |     /*B*/    0, -1,  2,  3, -4,  1,  3,  0,  1, -2, -3,  1, -2,
82 |             -4, -1,  0,  0, -5, -3, -2,  3, -3,  2, -1, -8,
83 |     /*J*/   -1, -3, -3, -3, -5, -2, -3, -4, -2,  3,  5, -3,  3,
84 |              2, -2, -2, -1, -3, -1,  2, -3,  5, -2, -1, -8,
85 |     /*Z*/    0,  0,  1,  3, -5,  3,  3,  0,  2, -2, -3,  0, -2,
86 |             -5,  0,  0, -1, -6, -4, -2,  2, -2,  3, -1, -8,
87 |     /*X*/   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
88 |             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -8,
89 |     /***/   -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8,
90 |             -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8,  1
91 | };
92 | const SNCBIPackedScoreMatrix NCBISM_Pam250 = {
93 |     "ARNDCQEGHILKMFPSTWYVBJZX*",  /* symbol order of the rows and columns above */
94 |     s_Pam250PSM,
95 |     -8  /* NOTE(review): presumably the score for symbols missing from the list -- confirm in raw_scoremat.h */
96 | };
97 | 
98 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/util/tables/sm_pam30.c:
--------------------------------------------------------------------------------
 1 | /*  $Id: sm_pam30.c 90506 2006-09-25 19:30:59Z madden $
 2 | * ===========================================================================
 3 | *
 4 | *                            PUBLIC DOMAIN NOTICE
 5 | *               National Center for Biotechnology Information
 6 | *
 7 | *  This software/database is a "United States Government Work" under the
 8 | *  terms of the United States Copyright Act.  It was written as part of
 9 | *  the author's official duties as a United States Government employee and
10 | *  thus cannot be copyrighted.  This software/database is freely available
11 | *  to the public for use. The National Library of Medicine and the U.S.
12 | *  Government have not placed any restriction on its use or reproduction.
13 | *
14 | *  Although all reasonable efforts have been taken to ensure the accuracy
15 | *  and reliability of the software and data, the NLM and the U.S.
16 | *  Government do not and cannot warrant the performance or results that
17 | *  may be obtained by using this software or data. The NLM and the U.S.
18 | *  Government disclaim all warranties, express or implied, including
19 | *  warranties of performance, merchantability or fitness for any particular
20 | *  purpose.
21 | *
22 | *  Please cite the author in any work or product based on this material.
23 | *
24 | * ===========================================================================
25 | *
26 | * Author:  Aaron Ucko, Mike Gertz
27 | *
28 | * File Description:
29 | *   Protein alignment score matrices; shared between the two toolkits.
30 | *
31 | * ===========================================================================
32 | */
33 | 
34 | #include <util/tables/raw_scoremat.h>
35 | 
36 | /** Entries for the PAM30 matrix at a scale of ln(2)/2.0. */
37 | /* Row-major 25 x 25 table, two source lines per row; X scores -1 against everything but '*', and '*' scores -17 against every other symbol, 1 against itself. */
38 | static const TNCBIScore s_Pam30PSM[25 * 25] = {
39 |     /*       A,  R,  N,  D,  C,  Q,  E,  G,  H,  I,  L,  K,  M,
40 |              F,  P,  S,  T,  W,  Y,  V,  B,  J,  Z,  X,  *        */ 
41 |     /*A*/    6, -7, -4, -3, -6, -4, -2, -2, -7, -5, -6, -7, -5,
42 |             -8, -2,  0, -1,-13, -8, -2, -3, -6, -3, -1,-17,
43 |     /*R*/   -7,  8, -6,-10, -8, -2, -9, -9, -2, -5, -8,  0, -4,
44 |             -9, -4, -3, -6, -2,-10, -8, -7, -7, -4, -1,-17,
45 |     /*N*/   -4, -6,  8,  2,-11, -3, -2, -3,  0, -5, -7, -1, -9,
46 |             -9, -6,  0, -2, -8, -4, -8,  6, -6, -3, -1,-17,
47 |     /*D*/   -3,-10,  2,  8,-14, -2,  2, -3, -4, -7,-12, -4,-11,
48 |            -15, -8, -4, -5,-15,-11, -8,  6,-10,  1, -1,-17,
49 |     /*C*/   -6, -8,-11,-14, 10,-14,-14, -9, -7, -6,-15,-14,-13,
50 |            -13, -8, -3, -8,-15, -4, -6,-12, -9,-14, -1,-17,
51 |     /*Q*/   -4, -2, -3, -2,-14,  8,  1, -7,  1, -8, -5, -3, -4,
52 |            -13, -3, -5, -5,-13,-12, -7, -3, -5,  6, -1,-17,
53 |     /*E*/   -2, -9, -2,  2,-14,  1,  8, -4, -5, -5, -9, -4, -7,
54 |            -14, -5, -4, -6,-17, -8, -6,  1, -7,  6, -1,-17,
55 |     /*G*/   -2, -9, -3, -3, -9, -7, -4,  6, -9,-11,-10, -7, -8,
56 |             -9, -6, -2, -6,-15,-14, -5, -3,-10, -5, -1,-17,
57 |     /*H*/   -7, -2,  0, -4, -7,  1, -5, -9,  9, -9, -6, -6,-10,
58 |             -6, -4, -6, -7, -7, -3, -6, -1, -7, -1, -1,-17,
59 |     /*I*/   -5, -5, -5, -7, -6, -8, -5,-11, -9,  8, -1, -6, -1,
60 |             -2, -8, -7, -2,-14, -6,  2, -6,  5, -6, -1,-17,
61 |     /*L*/   -6, -8, -7,-12,-15, -5, -9,-10, -6, -1,  7, -8,  1,
62 |             -3, -7, -8, -7, -6, -7, -2, -9,  6, -7, -1,-17,
63 |     /*K*/   -7,  0, -1, -4,-14, -3, -4, -7, -6, -6, -8,  7, -2,
64 |            -14, -6, -4, -3,-12, -9, -9, -2, -7, -4, -1,-17,
65 |     /*M*/   -5, -4, -9,-11,-13, -4, -7, -8,-10, -1,  1, -2, 11,
66 |             -4, -8, -5, -4,-13,-11, -1,-10,  0, -5, -1,-17,
67 |     /*F*/   -8, -9, -9,-15,-13,-13,-14, -9, -6, -2, -3,-14, -4,
68 |              9,-10, -6, -9, -4,  2, -8,-10, -2,-13, -1,-17,
69 |     /*P*/   -2, -4, -6, -8, -8, -3, -5, -6, -4, -8, -7, -6, -8,
70 |            -10,  8, -2, -4,-14,-13, -6, -7, -7, -4, -1,-17,
71 |     /*S*/    0, -3,  0, -4, -3, -5, -4, -2, -6, -7, -8, -4, -5,
72 |             -6, -2,  6,  0, -5, -7, -6, -1, -8, -5, -1,-17,
73 |     /*T*/   -1, -6, -2, -5, -8, -5, -6, -6, -7, -2, -7, -3, -4,
74 |             -9, -4,  0,  7,-13, -6, -3, -3, -5, -6, -1,-17,
75 |     /*W*/  -13, -2, -8,-15,-15,-13,-17,-15, -7,-14, -6,-12,-13,
76 |             -4,-14, -5,-13, 13, -5,-15,-10, -7,-14, -1,-17,
77 |     /*Y*/   -8,-10, -4,-11, -4,-12, -8,-14, -3, -6, -7, -9,-11,
78 |              2,-13, -7, -6, -5, 10, -7, -6, -7, -9, -1,-17,
79 |     /*V*/   -2, -8, -8, -8, -6, -7, -6, -5, -6,  2, -2, -9, -1,
80 |             -8, -6, -6, -3,-15, -7,  7, -8,  0, -6, -1,-17,
81 |     /*B*/   -3, -7,  6,  6,-12, -3,  1, -3, -1, -6, -9, -2,-10,
82 |            -10, -7, -1, -3,-10, -6, -8,  6, -8,  0, -1,-17,
83 |     /*J*/   -6, -7, -6,-10, -9, -5, -7,-10, -7,  5,  6, -7,  0,
84 |             -2, -7, -8, -5, -7, -7,  0, -8,  6, -6, -1,-17,
85 |     /*Z*/   -3, -4, -3,  1,-14,  6,  6, -5, -1, -6, -7, -4, -5,
86 |            -13, -4, -5, -6,-14, -9, -6,  0, -6,  6, -1,-17,
87 |     /*X*/   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
88 |             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,-17,
89 |     /***/  -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,
90 |            -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,  1
91 | };
92 | const SNCBIPackedScoreMatrix NCBISM_Pam30 = {
93 |     "ARNDCQEGHILKMFPSTWYVBJZX*",  /* symbol order of the rows and columns above */
94 |     s_Pam30PSM,
95 |     -17  /* NOTE(review): presumably the score for symbols missing from the list -- confirm in raw_scoremat.h */
96 | };
97 | 
98 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/util/tables/sm_pam70.c:
--------------------------------------------------------------------------------
 1 | /*  $Id: sm_pam70.c 90506 2006-09-25 19:30:59Z madden $
 2 | * ===========================================================================
 3 | *
 4 | *                            PUBLIC DOMAIN NOTICE
 5 | *               National Center for Biotechnology Information
 6 | *
 7 | *  This software/database is a "United States Government Work" under the
 8 | *  terms of the United States Copyright Act.  It was written as part of
 9 | *  the author's official duties as a United States Government employee and
10 | *  thus cannot be copyrighted.  This software/database is freely available
11 | *  to the public for use. The National Library of Medicine and the U.S.
12 | *  Government have not placed any restriction on its use or reproduction.
13 | *
14 | *  Although all reasonable efforts have been taken to ensure the accuracy
15 | *  and reliability of the software and data, the NLM and the U.S.
16 | *  Government do not and cannot warrant the performance or results that
17 | *  may be obtained by using this software or data. The NLM and the U.S.
18 | *  Government disclaim all warranties, express or implied, including
19 | *  warranties of performance, merchantability or fitness for any particular
20 | *  purpose.
21 | *
22 | *  Please cite the author in any work or product based on this material.
23 | *
24 | * ===========================================================================
25 | *
26 | * Author:  Aaron Ucko, Mike Gertz
27 | *
28 | * File Description:
29 | *   Protein alignment score matrices; shared between the two toolkits.
30 | *
31 | * ===========================================================================
32 | */
33 | 
34 | #include <util/tables/raw_scoremat.h>
35 | 
36 | /** Entries for the PAM70 matrix at a scale of ln(2)/2.0. */
37 | /* Row-major 25 x 25 table, two source lines per row; X scores -1 against everything but '*', and '*' scores -11 against every other symbol, 1 against itself. */
38 | static const TNCBIScore s_Pam70PSM[25 * 25] = {
39 |     /*       A,  R,  N,  D,  C,  Q,  E,  G,  H,  I,  L,  K,  M,
40 |              F,  P,  S,  T,  W,  Y,  V,  B,  J,  Z,  X,  *        */ 
41 |     /*A*/    5, -4, -2, -1, -4, -2, -1,  0, -4, -2, -4, -4, -3,
42 |             -6,  0,  1,  1, -9, -5, -1, -1, -3, -1, -1,-11,
43 |     /*R*/   -4,  8, -3, -6, -5,  0, -5, -6,  0, -3, -6,  2, -2,
44 |             -7, -2, -1, -4,  0, -7, -5, -4, -5, -2, -1,-11,
45 |     /*N*/   -2, -3,  6,  3, -7, -1,  0, -1,  1, -3, -5,  0, -5,
46 |             -6, -3,  1,  0, -6, -3, -5,  5, -4, -1, -1,-11,
47 |     /*D*/   -1, -6,  3,  6, -9,  0,  3, -1, -1, -5, -8, -2, -7,
48 |            -10, -4, -1, -2,-10, -7, -5,  5, -7,  2, -1,-11,
49 |     /*C*/   -4, -5, -7, -9,  9, -9, -9, -6, -5, -4,-10, -9, -9,
50 |             -8, -5, -1, -5,-11, -2, -4, -8, -7, -9, -1,-11,
51 |     /*Q*/   -2,  0, -1,  0, -9,  7,  2, -4,  2, -5, -3, -1, -2,
52 |             -9, -1, -3, -3, -8, -8, -4, -1, -3,  5, -1,-11,
53 |     /*E*/   -1, -5,  0,  3, -9,  2,  6, -2, -2, -4, -6, -2, -4,
54 |             -9, -3, -2, -3,-11, -6, -4,  2, -5,  5, -1,-11,
55 |     /*G*/    0, -6, -1, -1, -6, -4, -2,  6, -6, -6, -7, -5, -6,
56 |             -7, -3,  0, -3,-10, -9, -3, -1, -7, -3, -1,-11,
57 |     /*H*/   -4,  0,  1, -1, -5,  2, -2, -6,  8, -6, -4, -3, -6,
58 |             -4, -2, -3, -4, -5, -1, -4,  0, -4,  1, -1,-11,
59 |     /*I*/   -2, -3, -3, -5, -4, -5, -4, -6, -6,  7,  1, -4,  1,
60 |              0, -5, -4, -1, -9, -4,  3, -4,  4, -4, -1,-11,
61 |     /*L*/   -4, -6, -5, -8,-10, -3, -6, -7, -4,  1,  6, -5,  2,
62 |             -1, -5, -6, -4, -4, -4,  0, -6,  5, -4, -1,-11,
63 |     /*K*/   -4,  2,  0, -2, -9, -1, -2, -5, -3, -4, -5,  6,  0,
64 |             -9, -4, -2, -1, -7, -7, -6, -1, -5, -2, -1,-11,
65 |     /*M*/   -3, -2, -5, -7, -9, -2, -4, -6, -6,  1,  2,  0, 10,
66 |             -2, -5, -3, -2, -8, -7,  0, -6,  2, -3, -1,-11,
67 |     /*F*/   -6, -7, -6,-10, -8, -9, -9, -7, -4,  0, -1, -9, -2,
68 |              8, -7, -4, -6, -2,  4, -5, -7, -1, -9, -1,-11,
69 |     /*P*/    0, -2, -3, -4, -5, -1, -3, -3, -2, -5, -5, -4, -5,
70 |             -7,  7,  0, -2, -9, -9, -3, -4, -5, -2, -1,-11,
71 |     /*S*/    1, -1,  1, -1, -1, -3, -2,  0, -3, -4, -6, -2, -3,
72 |             -4,  0,  5,  2, -3, -5, -3,  0, -5, -2, -1,-11,
73 |     /*T*/    1, -4,  0, -2, -5, -3, -3, -3, -4, -1, -4, -1, -2,
74 |             -6, -2,  2,  6, -8, -4, -1, -1, -3, -3, -1,-11,
75 |     /*W*/   -9,  0, -6,-10,-11, -8,-11,-10, -5, -9, -4, -7, -8,
76 |             -2, -9, -3, -8, 13, -3,-10, -7, -5,-10, -1,-11,
77 |     /*Y*/   -5, -7, -3, -7, -2, -8, -6, -9, -1, -4, -4, -7, -7,
78 |              4, -9, -5, -4, -3,  9, -5, -4, -4, -7, -1,-11,
79 |     /*V*/   -1, -5, -5, -5, -4, -4, -4, -3, -4,  3,  0, -6,  0,
80 |             -5, -3, -3, -1,-10, -5,  6, -5,  1, -4, -1,-11,
81 |     /*B*/   -1, -4,  5,  5, -8, -1,  2, -1,  0, -4, -6, -1, -6,
82 |             -7, -4,  0, -1, -7, -4, -5,  5, -5,  1, -1,-11,
83 |     /*J*/   -3, -5, -4, -7, -7, -3, -5, -7, -4,  4,  5, -5,  2,
84 |             -1, -5, -5, -3, -5, -4,  1, -5,  5, -4, -1,-11,
85 |     /*Z*/   -1, -2, -1,  2, -9,  5,  5, -3,  1, -4, -4, -2, -3,
86 |             -9, -2, -2, -3,-10, -7, -4,  1, -4,  5, -1,-11,
87 |     /*X*/   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
88 |             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,-11,
89 |     /***/  -11,-11,-11,-11,-11,-11,-11,-11,-11,-11,-11,-11,-11,
90 |            -11,-11,-11,-11,-11,-11,-11,-11,-11,-11,-11,  1
91 | };
92 | const SNCBIPackedScoreMatrix NCBISM_Pam70 = {
93 |     "ARNDCQEGHILKMFPSTWYVBJZX*",  /* symbol order of the rows and columns above */
94 |     s_Pam70PSM,
95 |     -11  /* NOTE(review): presumably the score for symbols missing from the list -- confirm in raw_scoremat.h */
96 | };
97 | 
98 | 


--------------------------------------------------------------------------------
/src/include/ncbi-blast+/util/tables/tables_export.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTIL_TABLES___TABLES_EXPORT__H
 2 | #define UTIL_TABLES___TABLES_EXPORT__H
 3 | 
 4 | /*  $Id: tables_export.h 166398 2009-07-22 15:51:55Z ucko $
 5 |  * ===========================================================================
 6 |  *
 7 |  *                            PUBLIC DOMAIN NOTICE
 8 |  *               National Center for Biotechnology Information
 9 |  *
10 |  *  This software/database is a "United States Government Work" under the
11 |  *  terms of the United States Copyright Act.  It was written as part of
12 |  *  the author's official duties as a United States Government employee and
13 |  *  thus cannot be copyrighted.  This software/database is freely available
14 |  *  to the public for use. The National Library of Medicine and the U.S.
15 |  *  Government have not placed any restriction on its use or reproduction.
16 |  *
17 |  *  Although all reasonable efforts have been taken to ensure the accuracy
18 |  *  and reliability of the software and data, the NLM and the U.S.
19 |  *  Government do not and cannot warrant the performance or results that
20 |  *  may be obtained by using this software or data. The NLM and the U.S.
21 |  *  Government disclaim all warranties, express or implied, including
22 |  *  warranties of performance, merchantability or fitness for any particular
23 |  *  purpose.
24 |  *
25 |  *  Please cite the author in any work or product based on this material.
26 |  *
27 |  * ===========================================================================
28 |  *
29 |  * Authors:  Anatoliy Kuznetsov, Mike DiCuccio, Aaron Ucko
30 |  *
31 |  * File Description:
32 |  *    Defines to provide correct exporting from TABLES DLL in Windows.
33 |  *    These are necessary to compile DLLs with Visual C++ - exports must be
34 |  *    explicitly labeled as such.
35 |  */
36 | 
37 | 
38 | /** @addtogroup WinDLL
39 |  *
40 |  * @{
41 |  */
42 | 
43 | 
44 | #if defined(WIN32)  &&  defined(NCBI_DLL_BUILD)
45 | /* Windows DLL build: anything other than MSVC is rejected by the #error below. */
46 | #ifndef _MSC_VER
47 | #  error "This toolkit is not buildable with a compiler other than MSVC."
48 | #endif
49 | /*
50 |  * Windows-specific settings.
51 |  * The pragma below switches off MSVC warnings C4786 (over-long debug names),
52 |  * C4251 and C4275 (DLL-interface complaints); there is no push/pop around it.
53 |  */
54 | #pragma warning (disable : 4786 4251 4275)
55 | 
56 | /* Defining NCBI_CORE_EXPORTS also turns on NCBI_TABLES_EXPORTS. */
57 | #ifdef NCBI_CORE_EXPORTS
58 | #  define NCBI_TABLES_EXPORTS
59 | #endif
60 | 
61 | /* With NCBI_TABLES_EXPORTS defined the marked symbols are exported, otherwise they are imported. */
62 | #ifdef NCBI_TABLES_EXPORTS
63 | #  define NCBI_TABLES_EXPORT      __declspec(dllexport)
64 | #else
65 | #  define NCBI_TABLES_EXPORT      __declspec(dllimport)
66 | #endif /* NCBI_TABLES_EXPORTS */
67 | 
68 | 
69 | 
70 | #elif defined(__GNUC__)  &&  __GNUC__ >= 4
71 | /* Compilers reporting __GNUC__ 4 or newer: marked symbols get default visibility. */
72 | #  define NCBI_TABLES_EXPORT      __attribute__((visibility("default")))
73 | 
74 | #else
75 | 
76 | /*
77 |  * Every other toolchain: the macro expands to nothing
78 |  */
79 | 
80 | #  define NCBI_TABLES_EXPORT
81 | 
82 | 
83 | #endif
84 | 
85 | 
86 | /** @} */
87 | 
88 | #endif  /*  UTIL_TABLES___TABLES_EXPORT__H  */
89 | 


--------------------------------------------------------------------------------
/src/jsonreporter.h:
--------------------------------------------------------------------------------
 1 | #ifndef JSON_REPORTER_H
 2 | #define JSON_REPORTER_H
 3 | 
 4 | #include <iostream>
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <string>
 8 | #include <atomic>
 9 | #include <fstream>
10 | #include "options.h"
11 | #include "stats.h"
12 | #include "filterresult.h"
13 | #include "common.h"
14 | #include "util.h"
15 | 
16 | using namespace std;
17 | 
18 | class JsonReporter{   // reporter interface; the method bodies are not part of this header
19 | public:
20 |     JsonReporter(Options* & opt);   // the options pointer is taken by reference
21 |     ~JsonReporter();
22 |     // NOTE(review): the two setters presumably fill the matching m* members below -- confirm in the .cpp
23 |     void setDupHist(int* dupHist, double* dupMeanGC, double dupRate);
24 |     void setInsertHist(atomic_long* insertHist, int insertSizePeak);
25 |     void report(FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2 = NULL, Stats* postStats2 = NULL);
26 |     // report(): the second pre/post Stats pair is optional and defaults to NULL
27 | private:
28 |     Options* mOptions;
29 |     int* mDupHist;              // raw pointer; ownership cannot be determined from this header
30 |     double* mDupMeanGC;         // raw pointer; ownership cannot be determined from this header
31 |     double mDupRate;
32 |     atomic_long* mInsertHist;   // raw pointer; ownership cannot be determined from this header
33 |     int mInsertSizePeak;
34 | };
35 | 
36 | 
37 | #endif


--------------------------------------------------------------------------------
/src/nucleotidetree.cpp:
--------------------------------------------------------------------------------
  1 | #include "nucleotidetree.h"
  2 | #include <sstream>
  3 | 
  4 | // A fresh node has no hits, the placeholder base 'N' and eight empty child slots.
  5 | NucleotideNode::NucleotideNode()
  6 |     : count(0), base('N'),
  7 |       children() {   // value-initialisation nulls every slot
  8 | }
  9 | // Frees the whole subtree below this node.
 10 | NucleotideNode::~NucleotideNode(){
 11 |     NucleotideNode** const stop = children + 8;
 12 |     for(NucleotideNode** slot = children; slot != stop; ++slot)
 13 |         delete *slot;   // deleting a null pointer is a no-op, so empty slots need no test
 14 | }
 15 | // Pre-order dump to stdout: every node prints its base followed by its count,
 16 | // and a node without children ends the current line.
 17 | void NucleotideNode::dfs() {
 18 |     printf("%c", base);
 19 |     printf("%d", count);
 20 |     int visited = 0;
 21 |     for(int slot = 0; slot < 8; ++slot) {
 22 |         NucleotideNode* child = children[slot];
 23 |         if(child == NULL)
 24 |             continue;
 25 |         child->dfs();
 26 |         ++visited;
 27 |     }
 28 |     if(visited == 0)
 29 |         printf("\n");
 30 | }
 31 | 
 32 | // Keeps the caller's options pointer and starts with an empty root node.
 33 | NucleotideTree::NucleotideTree(Options* & opt)
 34 |     : mOptions(opt), mRoot(new NucleotideNode()) {
 35 | }
 36 | 
 37 | // Deleting the root tears down the whole tree: each node deletes its children.
 38 | NucleotideTree::~NucleotideTree(){
 39 |     delete mRoot;
 40 | }
 41 | 
 42 | void NucleotideTree::addSeq(string seq) {   // counts one sequence along its path from the root
 43 |     NucleotideNode* cursor = mRoot;
 44 |     for(string::const_iterator it = seq.begin(); it != seq.end(); ++it) {
 45 |         if(*it == 'N')   // nothing at or after the first 'N' is recorded
 46 |             break;
 47 |         NucleotideNode*& slot = cursor->children[*it & 0x07];   // low three bits choose the slot
 48 |         if(slot == NULL) {
 49 |             slot = new NucleotideNode();
 50 |             slot->base = *it;   // remember the character that created the node
 51 |         }
 52 |         slot->count++;
 53 |         cursor = slot;
 54 |     }
 55 | }
 56 | 
 57 | // Follows, from the root, the child holding at least 95% of the hits among
 58 | // its siblings and returns the bases collected on the way. The walk stops
 59 | // silently once the children of a node total fewer than 50 hits.
 60 | // reachedLeaf is only ever cleared (when no child dominates), never set to
 61 | // true, so the caller has to initialise it before the call.
 62 | string NucleotideTree::getDominantPath(bool& reachedLeaf) {
 63 |     const double RATIO_THRESHOLD = 0.95;
 64 |     const int NUM_THRESHOLD = 50;
 65 |     string path;
 66 |     for(NucleotideNode* node = mRoot; ; ) {
 67 |         int total = 0;
 68 |         for(int slot = 0; slot < 8; ++slot)
 69 |             total += node->children[slot] ? node->children[slot]->count : 0;
 70 |         if(total < NUM_THRESHOLD)
 71 |             return path;
 72 | 
 73 |         NucleotideNode* winner = NULL;
 74 |         for(int slot = 0; slot < 8 && winner == NULL; ++slot) {
 75 |             NucleotideNode* child = node->children[slot];
 76 |             if(child != NULL && child->count / (double)total >= RATIO_THRESHOLD)
 77 |                 winner = child;
 78 |         }
 79 |         if(winner == NULL) {
 80 |             reachedLeaf = false;
 81 |             return path;
 82 |         }
 83 | 
 84 |         // step down into the dominant child and record its base
 85 |         path += winner->base;
 86 |         node = winner;
 87 |     }
 88 | }
 89 | 
 90 | bool NucleotideTree::test() {   // self-check of getDominantPath(); every path through it returns a value
 91 |     Options* noOptions = NULL;   // the constructor takes Options*&, which cannot bind to a literal NULL
 92 |     NucleotideTree tree(noOptions);
 93 |     for(int i=0; i<100; i++) {
 94 |         tree.addSeq("AAAATTTT");
 95 |         tree.addSeq("AAAATTTTGGGG");
 96 |         tree.addSeq("AAAATTTTGGGGCCCC");
 97 |         tree.addSeq("AAAATTTTGGGGCCAA");
 98 |     }
 99 |     tree.addSeq("AAAATTTTGGGACCCC");   // a lone outlier must not unseat the dominant G
100 |     bool reachedLeaf = true;
101 |     string path = tree.getDominantPath(reachedLeaf);
102 |     printf("%s\n", path.c_str());
103 |     return path == "AAAATTTTGGGGCC";   // the walk stops where the CC and AA continuations tie at 100 hits each
104 | }


--------------------------------------------------------------------------------
/src/nucleotidetree.h:
--------------------------------------------------------------------------------
 1 | #ifndef NUCLEICTREE_H
 2 | #define NUCLEICTREE_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include <memory.h>
 8 | #include "options.h"
 9 | 
10 | using namespace std;
11 | 
12 | // ASCII code & 0x07 for (A,T,C,G,N) = (1,4,3,7,6): five distinct slots out of the eight in children[]
13 | class NucleotideNode{   // one node of the tree built by NucleotideTree
14 | public:
15 |     NucleotideNode();
16 |     ~NucleotideNode();   // deletes every non-null child
17 |     void dfs();          // prints base and count of this node and of everything below it
18 | 
19 | public:
20 |     int count;                     // incremented each time an added sequence passes through this node
21 |     char base;                     // character that created the node; 'N' until it is assigned
22 |     NucleotideNode* children[8];   // indexed by (character & 0x07); unused slots stay NULL
23 | };
24 | 
25 | class NucleotideTree{   // tree of NucleotideNode objects hanging off one root
26 | public:
27 |     NucleotideTree(Options* & opt);
28 |     ~NucleotideTree();
29 |     void addSeq(string seq);                     // records one sequence; recording stops at the first 'N'
30 |     string getDominantPath(bool& reachedLeaf);   // reachedLeaf is only ever cleared, never set: pass it in as true
31 | 
32 |     static bool test();   // static self-test hook
33 | 
34 | private:
35 |     Options* mOptions;       // stored as passed in; not dereferenced by the code in nucleotidetree.cpp
36 |     NucleotideNode* mRoot;   // created in the constructor, deleted in the destructor
37 | };
38 | 
39 | 
40 | #endif


--------------------------------------------------------------------------------
/src/overlapanalysis.h:
--------------------------------------------------------------------------------
 1 | #ifndef OVERLAP_ANALYSIS_H
 2 | #define OVERLAP_ANALYSIS_H
 3 | 
 4 | #include <iostream>
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <string>
 8 | #include <vector>
 9 | #include "common.h"
10 | #include "options.h"
11 | #include "read.h"
12 | 
13 | using namespace std;
14 | 
15 | class OverlapResult {   // plain value holder; no constructor is declared, so the fields start uninitialised
16 | public:
17 |     bool overlapped;
18 |     int offset;        // NOTE(review): meaning and sign convention are defined by OverlapAnalysis::analyze -- confirm there
19 |     int overlap_len;
20 |     int diff;          // NOTE(review): presumably the number of mismatches found in the overlap -- confirm
21 | };
22 | 
23 | class OverlapAnalysis{   // every operation declared here is static; the definitions are not part of this header
24 | public:
25 |     OverlapAnalysis();
26 |     ~OverlapAnalysis();
27 |     // analyze() comes in two forms with the same limits: one takes Sequence references, the other Read pointers
28 |     static OverlapResult analyze(Sequence&  r1, Sequence&  r2, int diffLimit, int overlapRequire, double diffPercentLimit);
29 |     static OverlapResult analyze(Read* r1, Read* r2, int diffLimit, int overlapRequire, double diffPercentLimit);
30 |     static Read* merge(Read* r1, Read* r2, OverlapResult ov);
31 |     // merge(): NOTE(review): returns a Read pointer; whether the caller owns it cannot be determined from this header
32 | public:
33 |     static bool test();
34 | 
35 | };
36 | 
37 | #endif


--------------------------------------------------------------------------------
/src/peprocessor.h:
--------------------------------------------------------------------------------
  1 | #ifndef PE_PROCESSOR_H
  2 | #define PE_PROCESSOR_H
  3 | 
  4 | #include <algorithm>
  5 | #include <stdio.h>
  6 | #include <stdlib.h>
  7 | #include <string>
  8 | #include <cstdlib>
  9 | #include <condition_variable>
 10 | #include <mutex>
 11 | #include <thread>
 12 | #include <sstream>
 13 | #include <string>
 14 | #include <fstream>
 15 | #include <iostream>
 16 | #include <unistd.h>
 17 | #include <functional>
 18 | #include <memory.h>
 19 | #include <unordered_map>
 20 | #include <map>
 21 | #include <future>
 22 | #include <deque>
 23 | #include <time.h>
 24 | #include <iomanip>
 25 | 
 26 | #include "fastqreader.h"
 27 | #include "util.h"
 28 | #include "adaptertrimmer.h"
 29 | #include "basecorrector.h"
 30 | #include "jsonreporter.h"
 31 | #include "htmlreporter.h"
 32 | #include "polyx.h"
 33 | #include "options.h"
 34 | #include "threadconfig.h"
 35 | #include "filter.h"
 36 | #include "umiprocessor.h"
 37 | #include "overlapanalysis.h"
 38 | #include "writerthread.h"
 39 | #include "duplicate.h"
 40 | #include "read.h"
 41 | #include "bwtfmiDB.h"
 42 | 
 43 | using namespace std;
 44 | 
 45 | struct ReadPairPack {   // one batch of read pairs
 46 |     ReadPair** data;    // array of ReadPair pointers; NOTE(review): presumably `count` entries long -- confirm in the .cpp
 47 |     int count;
 48 | };
 49 | // The typedef below is redundant in C++ (the struct name already names the type) and harmless.
 50 | typedef struct ReadPairPack ReadPairPack;
 51 | 
 52 | struct ReadPairRepository {   // buffer of pack pointers plus two atomic positions
 53 |     ReadPairPack** packBuffer;
 54 |     atomic_long readPos;    // NOTE(review): presumably the next slot a consumer takes -- confirm in the .cpp
 55 |     atomic_long writePos;   // NOTE(review): presumably the next slot the producer fills -- confirm in the .cpp
 56 | };
 57 | // The typedef below is redundant in C++ (the struct name already names the type) and harmless.
 58 | typedef struct ReadPairRepository ReadPairRepository;
 59 | 
 60 | class PairEndProcessor{   // declaration only; every method is defined outside this header
 61 | public:
 62 |     PairEndProcessor(Options* & opt, BwtFmiDB * & tbwtfmiDB);   // note: the second parameter has the same name as the member declared below
 63 |     ~PairEndProcessor();
 64 |     bool process();   // the only public operation besides construction and destruction
 65 |     // Internal steps; their names suggest a producer/consumer pipeline -- NOTE(review): confirm in the .cpp
 66 | private:
 67 |     bool processPairEnd(ReadPairPack* pack, ThreadConfig* config);
 68 |     bool processRead(Read* r, ReadPair* originalRead, bool reversed);
 69 |     void initPackRepository();
 70 |     void destroyPackRepository();
 71 |     void producePack(ReadPairPack* pack);
 72 |     void consumePack(ThreadConfig* config);
 73 |     void producerTask();
 74 |     void consumerTask(ThreadConfig* config);
 75 |     void initConfig(ThreadConfig* config);
 76 |     void initOutput();
 77 |     void closeOutput();
 78 |     void statInsertSize(Read* r1, Read* r2, OverlapResult& ov, int frontTrimmed1 = 0, int frontTrimmed2 = 0);
 79 |     int getPeakInsertSize();
 80 |     void writeTask(WriterThread* config);
 81 |     void prepareResults();
 82 |     // State: atomics and mutexes first, then raw pointers whose ownership cannot be determined from this header
 83 | private:
 84 |     ReadPairRepository mRepo;
 85 |     atomic_bool mProduceFinished;
 86 |     atomic_int mFinishedThreads;
 87 |     std::mutex mOutputMtx;
 88 |     std::mutex mInputMtx;
 89 |     std::mutex logMtx;
 90 |     Options* mOptions;
 91 |     Filter* mFilter;
 92 |     gzFile mZipFile1;
 93 |     gzFile mZipFile2;
 94 |     ofstream* mOutStream1;
 95 |     ofstream* mOutStream2;
 96 |     UmiProcessor* mUmiProcessor;
 97 |     atomic_long* mInsertSizeHist;
 98 |     WriterThread* mLeftWriter;
 99 |     WriterThread* mRightWriter;
100 |     WriterThread* mUnpairedLeftWriter;
101 |     WriterThread* mUnpairedRightWriter;
102 |     WriterThread* mMergedWriter;
103 |     WriterThread* mFailedWriter;
104 |     WriterThread* mReadsKOWriter;
105 |     Duplicate* mDuplicate;
106 |     BwtFmiDB *tbwtfmiDB;   // same name as the constructor parameter, which hides it inside the constructor
107 |     std::string fileoutname;
108 | };
109 | 
110 | 
111 | #endif
112 | 


--------------------------------------------------------------------------------
/src/polyx.cpp:
--------------------------------------------------------------------------------
  1 | #include "polyx.h"
  2 | #include "common.h"
  3 | 
// Default constructor: nothing to initialise.
PolyX::PolyX(){
}


// Destructor: nothing to release.
PolyX::~PolyX(){
}
 10 | 
 11 | void PolyX::trimPolyG(Read* r1, Read* r2, FilterResult* fr, int compareReq) {
 12 |     trimPolyG(r1, fr, compareReq);
 13 |     trimPolyG(r2, fr, compareReq);
 14 | }
 15 | 
 16 | void PolyX::trimPolyG(Read* r, FilterResult* fr, int compareReq) {
 17 |     const int allowOneMismatchForEach = 8;
 18 |     const int maxMismatch = 5;
 19 | 
 20 |     const char* data = r->mSeq.mStr.c_str();
 21 | 
 22 |     int rlen = r->length();
 23 | 
 24 |     int mismatch = 0;
 25 |     int i = 0;
 26 |     int firstGPos = rlen - 1;
 27 |     for(i=0; i< rlen; i++) {
 28 |         if(data[rlen - i - 1] != 'G') {
 29 |             mismatch++;
 30 |         } else {
 31 |             firstGPos = rlen - i -1;
 32 |         }
 33 | 
 34 |         int allowedMismatch = (i+1)/allowOneMismatchForEach;
 35 |         if(mismatch > maxMismatch || (mismatch>allowedMismatch && i>= compareReq-1) )
 36 |             break;
 37 |     }
 38 | 
 39 |     if(i >= compareReq) {
 40 |         r->resize(firstGPos);
 41 |     }
 42 | }
 43 | 
 44 | void PolyX::trimPolyX(Read* r1, Read* r2, FilterResult* fr, int compareReq) {
 45 |     trimPolyX(r1, fr, compareReq);
 46 |     trimPolyX(r2, fr, compareReq);
 47 | }
 48 | 
 49 | void PolyX::trimPolyX(Read* r, FilterResult* fr, int compareReq) {
 50 |     const int allowOneMismatchForEach = 8;
 51 |     const int maxMismatch = 5;
 52 | 
 53 |     const char* data = r->mSeq.mStr.c_str();
 54 | 
 55 |     int rlen = r->length();
 56 | 
 57 | 
 58 |     int atcgNumbers[4] = {0, 0, 0, 0};
 59 |     int pos = 0;
 60 |     for(pos=0; pos<rlen; pos++) {
 61 |         switch(data[rlen - pos - 1]) {
 62 |             case 'A':
 63 |                 atcgNumbers[0]++;
 64 |                 break;
 65 |             case 'T':
 66 |                 atcgNumbers[1]++;
 67 |                 break;
 68 |             case 'C':
 69 |                 atcgNumbers[2]++;
 70 |                 break;
 71 |             case 'G':
 72 |                 atcgNumbers[3]++;
 73 |                 break;
 74 |             case 'N':
 75 |                 atcgNumbers[0]++;
 76 |                 atcgNumbers[1]++;
 77 |                 atcgNumbers[2]++;
 78 |                 atcgNumbers[3]++;
 79 |                 break;
 80 |             default:
 81 |                 break;
 82 |         }
 83 | 
 84 |         int cmp = (pos+1);
 85 |         int allowedMismatch = min(maxMismatch, cmp/allowOneMismatchForEach);
 86 | 
 87 |         bool needToBreak = true;
 88 |         for(int b=0; b<4; b++) {
 89 |             if(cmp - atcgNumbers[b] <= allowedMismatch)
 90 |                 needToBreak = false;
 91 |         }
 92 |         if(needToBreak && (pos >= allowOneMismatchForEach || pos+1 >= compareReq-1)) {
 93 |             break;
 94 |         }
 95 |     }
 96 | 
 97 |     // has polyX
 98 |     if(pos+1 >= compareReq) {
 99 |         // find the poly
100 |         int poly;
101 |         int maxCount = -1;
102 |         for(int b=0; b<4; b++) {
103 |             if(atcgNumbers[b] > maxCount){
104 |                 maxCount = atcgNumbers[b];
105 |                 poly = b;
106 |             }
107 |         }
108 |         char polyBase = ATCG_BASES[poly];
109 |         while(data[rlen - pos - 1] != polyBase && pos>=0)
110 |             pos--;
111 | 
112 |         r->resize(rlen - pos - 1);
113 |         if(fr)
114 |           fr->addPolyXTrimmed(poly, pos + 1);
115 |     }
116 | }
117 | 
118 | bool PolyX::test() {
119 | 
120 | //    Read r("@name",
121 | //        "ATTTTAAAAAAAAAATAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAT",
122 | //        "+",
123 | //        "///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////E");
124 | //
125 | //    FilterResult fr(NULL, false);
126 | //    PolyX::trimPolyX(&r, &fr, 10);
127 | //    r.print();
128 | //
129 | //    return r.mSeq.mStr == "ATTTT" && fr.getTotalPolyXTrimmedReads() == 1 && fr.getTotalPolyXTrimmedBases() == 51;
130 | }


--------------------------------------------------------------------------------
/src/polyx.h:
--------------------------------------------------------------------------------
 1 | #ifndef POLY_X_H
 2 | #define POLY_X_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include "overlapanalysis.h"
 8 | #include "filterresult.h"
 9 | #include "options.h"
10 | 
11 | using namespace std;
12 | 
// Collection of static helpers for trimming polyG / polyX tails from reads.
class PolyX{
public:
    PolyX();
    ~PolyX();

    // Paired overloads take two reads; single overloads take one.
    // `compareReq` is the length threshold passed through to the trimmer.
    static void trimPolyG(Read* r1, Read* r2, FilterResult* fr, int compareReq);
    static void trimPolyG(Read* r1, FilterResult* fr, int compareReq);
    static void trimPolyX(Read* r1, Read* r2, FilterResult* fr, int compareReq);
    static void trimPolyX(Read* r1, FilterResult* fr, int compareReq);
    // Self-test hook.
    static bool test();


};
26 | 
27 | 
28 | #endif


--------------------------------------------------------------------------------
/src/processor.cpp:
--------------------------------------------------------------------------------
 1 | #include "processor.h"
 2 | #include "peprocessor.h"
 3 | #include "seprocessor.h"
 4 | 
 5 | Processor::Processor(Options* & opt){
 6 |     mOptions = opt;
 7 | }
 8 | 
 9 | Processor::~Processor(){
10 | }
11 | 
12 | bool Processor::process(BwtFmiDB * & tbwtfmiDB) {
13 |     if(mOptions->isPaired()) {
14 |         PairEndProcessor p(mOptions, tbwtfmiDB);
15 |         p.process();
16 |     } else {
17 |         SingleEndProcessor p(mOptions, tbwtfmiDB);
18 |         p.process();
19 |     }
20 | 
21 |     return true;
22 | }


--------------------------------------------------------------------------------
/src/processor.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROCESSOR_H
 2 | #define PROCESSOR_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include "options.h"
 8 | #include "bwtfmiDB.h"
 9 | 
10 | using namespace std;
11 | 
// Front-end that picks the single-end or paired-end processor based on the
// stored options.
class Processor{
public:
    Processor(Options* & opt);
    ~Processor();
    // Runs the selected processor against the given BwtFmiDB handle.
    bool process(BwtFmiDB * & tbwtfmiDB);

private:
    Options* mOptions;
};
21 | 
22 | 
23 | #endif


--------------------------------------------------------------------------------
/src/read.h:
--------------------------------------------------------------------------------
 1 | #ifndef READ_H
 2 | #define READ_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include <iostream>
 8 | #include <fstream>
 9 | #include "sequence.h"
10 | #include <vector>
11 | #include "util.h"
12 | 
13 | using namespace std;
14 | 
// One sequencing read: name, sequence, strand line and (optionally) quality
// string. All data members are public.
class Read{
public:
    // Constructors with a quality string; `phred64` defaults to false.
    Read(string name, string seq, string strand, string quality, bool phred64 = false);
    Read(string name, Sequence seq, string strand, string quality, bool phred64 = false);
    // Constructors without a quality string.
    Read(string name, string seq, string strand);
    Read(string name, Sequence seq, string strand);
    // Copy constructor (takes a non-const reference).
    Read(Read &r);
    void print();
    void printFile(ofstream& file);
    Read* reverseComplement();
    string firstIndex();
    string lastIndex();
    // default is Q20
    int lowQualCount(int qual = 20);
    int length();
    string toString();
    string toFastaR1();
    string toFastaR2();
    string toStringWithTag(string tag);
    string toStringWithTag(uint32* tag);
    string toStringWithTagRm();
    void resize(int len);
    void convertPhred64To33();
    void trimFront(int len);
    bool fixMGI();

public:
    static bool test();

private:


public:
	string mName;
	Sequence mSeq;
	string mStrand;
	string mQuality;
	bool mHasQuality;
};
54 | 
// A pair of reads (left/right mates) held by pointer.
// NOTE(review): ownership of mLeft/mRight is decided by the destructor, which
// is not visible here -- check read.cpp before deleting them elsewhere.
class ReadPair{
public:
    ReadPair(Read* left, Read* right);
    ~ReadPair();

    // merge a pair, without consideration of seq error caused false INDEL
    Read* fastMerge();
public:
    Read* mLeft;
    Read* mRight;

public:
    static bool test();
};
69 | 
// Plain string record for one read or one read pair: names, sequences and
// qualities for mate 1 and mate 2, plus a `paired` flag that defaults to false.
class ReadItem{
public:
    std::string name1;
    std::string name2;
    std::string sequence1;
    std::string quality1;
    std::string sequence2;
    std::string quality2;
    bool paired = false;
    // Two- and three-string constructors; the parameter meaning is defined in
    // the implementation file.
    ReadItem(const std::string &, const std::string &);
    ReadItem(const std::string &, const std::string &, const std::string &);
//    ReadItem(const std::string & n1, const std::string & s1, const std::string & q1);
//    ReadItem(const std::string & n1, const std::string & s1, const std::string & q1, const std::string & n2, const std::string & s2, const std::string & q2);
//    std::string toStringR1();
    std::string toStringWithTagR1(std::string & tag);
//    std::string toStringR2();
    std::string toStringWithTagR2(std::string & tag);

//    std::string toStringRQ1();
//    std::string toStringWithTagRQ1(std::string & tag);
//    std::string toStringRQ2();
//    std::string toStringWithTagRQ2(std::string & tag);
//    
};
94 | 
95 | #endif


--------------------------------------------------------------------------------
/src/seprocessor.h:
--------------------------------------------------------------------------------
  1 | #ifndef SE_PROCESSOR_H
  2 | #define SE_PROCESSOR_H
  3 | 
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | #include <string>
  7 | #include <string>
  8 | #include <fstream>
  9 | #include <iostream>
 10 | #include <unistd.h>
 11 | #include <functional>
 12 | #include <thread>
 13 | #include <memory.h>
 14 | #include <cstdlib>
 15 | #include <condition_variable>
 16 | #include <mutex>
 17 | #include <sstream>
 18 | #include <unordered_map>
 19 | #include <map>
 20 | #include <future>
 21 | #include <deque>
 22 | #include <time.h>
 23 | #include <iomanip>
 24 | 
 25 | #include "options.h"
 26 | #include "threadconfig.h"
 27 | #include "filter.h"
 28 | #include "umiprocessor.h"
 29 | #include "writerthread.h"
 30 | #include "duplicate.h"
 31 | #include "fastqreader.h"
 32 | #include "util.h"
 33 | #include "jsonreporter.h"
 34 | #include "htmlreporter.h"
 35 | #include "adaptertrimmer.h"
 36 | #include "polyx.h"
 37 | #include "read.h"
 38 | #include "common.h"
 39 | #include "bwtfmiDB.h"
 40 | 
 41 | 
 42 | using namespace std;
 43 | 
// A batch of reads: `data` is a pointer to Read pointers and `count` is an
// integer stored alongside it (presumably the number of entries in `data` --
// confirm against the code that fills the pack).
struct ReadPack {
    Read** data;
    int count;
};

// C-style alias so the struct can be named without the `struct` keyword.
typedef struct ReadPack ReadPack;
 50 | 
// Holds a buffer of ReadPack pointers together with two atomic counters.
// NOTE(review): readPos/writePos look like consumer/producer cursors into
// packBuffer -- confirm against seprocessor.cpp.
struct ReadRepository {
    ReadPack** packBuffer;
    atomic_long readPos;
    atomic_long writePos;
};

// C-style alias so the struct can be named without the `struct` keyword.
typedef struct ReadRepository ReadRepository;
 58 | 
// Processor for single-end input. It is constructed from the run options and
// a BwtFmiDB pointer; process() is the only public operation. Everything else
// is declared here and implemented elsewhere.
class SingleEndProcessor{
public:
    SingleEndProcessor(Options* & opt, BwtFmiDB * tbwtfmiDB);
    ~SingleEndProcessor();
    // Public entry point; returns a bool status.
    bool process();

private:
    bool processSingleEnd(ReadPack* pack, ThreadConfig* config);
    // Lifetime of mRepo.
    void initPackRepository();
    void destroyPackRepository();
    // Producer/consumer hand-off of packs.
    void producePack(ReadPack* pack);
    void consumePack(ThreadConfig* config);
    void producerTask();
    void consumerTask(ThreadConfig* config);
    void initConfig(ThreadConfig* config);
    // Output set-up and tear-down.
    void initOutput();
    void closeOutput();
    void writeTask(WriterThread* config);
    void prepareResults();
private:
    Options* mOptions;
    ReadRepository mRepo;
    atomic_bool mProduceFinished;
    atomic_int mFinishedThreads;
    std::mutex mInputMtx;
    std::mutex mOutputMtx;
    std::mutex logMtx;
    Filter* mFilter;
    gzFile mZipFile;
    ofstream* mOutStream;
    UmiProcessor* mUmiProcessor;
    WriterThread* mLeftWriter;
    WriterThread* mFailedWriter;
    Duplicate* mDuplicate;
    WriterThread* mReadsKOWriter;
    BwtFmiDB *tbwtfmiDB;
    std::string fileoutname;
    
};
 98 | 
 99 | 
100 | #endif
101 | 


--------------------------------------------------------------------------------
/src/seqtract:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xia-lab/Seq2Fun/32a89d0ee8c47764e54d1f259d3831e8ca0fd2f5/src/seqtract


--------------------------------------------------------------------------------
/src/seqtractpeprocessor.h:
--------------------------------------------------------------------------------
 1 | #ifndef SEQTRACTPEPROCESSOR_H
 2 | #define SEQTRACTPEPROCESSOR_H
 3 | 
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <cstdlib>
#include <condition_variable>
#include <mutex>
#include <thread>
#include <unordered_set>
#include "options.h"
#include "fastqreader.h"
#include "threadsconfig2.h"
#include "read.h"
#include "util.h"
16 | 
17 | 
18 | using namespace std;
19 | 
// A batch of reads: `data` is a pointer to Read pointers and `count` is an
// integer stored alongside it (presumably the number of entries in `data` --
// confirm against the code that fills the pack).
struct ReadPack{
    Read** data;
    int count;
};
// C-style alias so the struct can be named without the `struct` keyword.
typedef struct ReadPack ReadPack;
25 | 
26 | struct ReadRepository{
27 |     ReadPack** packBuffer;
28 |     size_t readPos;
29 |     size_t writePos;
30 |     size_t readCounter;
31 |     std::mutex mtx;
32 |     std::mutex readCounterMtx;
33 |     std::condition_variable repoNotFull;
34 |     std::condition_variable repoNotEmpty;
35 | };
36 | typedef struct ReadPairRepository ReadPairRepository;
37 | 
38 | 
// Processor used by the sequence-extraction tool. It is constructed from the
// run options; process() is the only public operation.
class SeqTractPeProcessor {
public:
    SeqTractPeProcessor(Options * opt);
    ~SeqTractPeProcessor();
    // Public entry point; returns a bool status.
    bool process();
    
private:
    bool processReads(ReadPack* pack);
    // Lifetime of mRepo.
    void initPackRepository();
    void destroyPackRepository();
    // Producer/consumer hand-off of packs.
    void producePack(ReadPack* pack);
    void consumePack();
    void producerTask();
    void consumerTask();
    void writeTask(ThreadsConfig2* config);
    
private:
    Options* mOptions;
    ReadRepository mRepo;
    bool mProduceFinished;
    // Array of per-output ThreadsConfig2 pointers.
    ThreadsConfig2** mConfigs;
    int mSampleSize;
    std::unordered_set<std::string> featureUSet;
};
63 | 
64 | #endif /* SEQTRACTPEPROCESSOR_H */
65 | 
66 | 


--------------------------------------------------------------------------------
/src/sequence.cpp:
--------------------------------------------------------------------------------
 1 | #include "sequence.h"
 2 | 
 3 | Sequence::Sequence(){
 4 | }
 5 | 
 6 | Sequence::Sequence(string seq){
 7 |     mStr = seq;
 8 | }
 9 | 
// Writes the raw sequence string to stderr (no trailing newline).
void Sequence::print(){
    std::cerr << mStr;
}

// Returns the number of characters in the sequence, narrowed to int.
int Sequence::length(){
    return mStr.length();
}
17 | 
18 | Sequence Sequence::reverseComplement(){
19 |     string str(mStr.length(), 0);
20 |     for(int c=0;c<mStr.length();c++){
21 |         char base = mStr[c];
22 |         switch(base){
23 |             case 'A':
24 |             case 'a':
25 |                 str[mStr.length()-c-1] = 'T';
26 |                 break;
27 |             case 'T':
28 |             case 't':
29 |                 str[mStr.length()-c-1] = 'A';
30 |                 break;
31 |             case 'C':
32 |             case 'c':
33 |                 str[mStr.length()-c-1] = 'G';
34 |                 break;
35 |             case 'G':
36 |             case 'g':
37 |                 str[mStr.length()-c-1] = 'C';
38 |                 break;
39 |             default:
40 |                 str[mStr.length()-c-1] = 'N';
41 |         }
42 |     }
43 |     return Sequence(str);
44 | }
45 | 
// Operator shorthand for reverseComplement().
Sequence Sequence::operator~(){
    return reverseComplement();
}
49 | 
50 | bool Sequence::test(){
51 |     Sequence s("AAAATTTTCCCCGGGG");
52 |     Sequence rc = ~s;
53 |     if (s.mStr != "AAAATTTTCCCCGGGG" ){
54 |         cerr << "Failed in reverseComplement() expect AAAATTTTCCCCGGGG, but get "<< s.mStr;
55 |         return false;
56 |     }
57 |     if (rc.mStr != "CCCCGGGGAAAATTTT" ){
58 |         cerr << "Failed in reverseComplement() expect CCCCGGGGAAAATTTT, but get "<< rc.mStr;
59 |         return false;
60 |     }
61 |     return true;
62 | }


--------------------------------------------------------------------------------
/src/sequence.h:
--------------------------------------------------------------------------------
 1 | #ifndef SEQUENCE_H
 2 | #define SEQUENCE_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include <iostream>
 8 | 
 9 | using namespace std;
10 | 
// Thin wrapper around a std::string holding a nucleotide sequence.
class Sequence{
public:
    Sequence();
    Sequence(string seq);
    // Prints the sequence.
    void print();
    int length();
    // Returns the reverse complement as a new Sequence.
    Sequence reverseComplement();

    // Operator form of reverseComplement().
    Sequence operator~();

    // Self-test hook.
    static bool test();

public:
    // The raw sequence text; public and accessed directly by callers.
    string mStr;
};
26 | 
27 | #endif


--------------------------------------------------------------------------------
/src/stats.h:
--------------------------------------------------------------------------------
  1 | #ifndef STATS_H
  2 | #define STATS_H
  3 | 
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | #include <string>
  7 | #include <vector>
  8 | #include <map>
  9 | #include "read.h"
 10 | #include "options.h"
 11 | 
 12 | using namespace std;
 13 | 
// Accumulates per-read statistics (read/base counts, Q20/Q30 counts,
// per-cycle arrays, k-mer counts, over-represented sequences) and renders
// them into JSON / HTML reports.
class Stats{
public:
    // this @guessedCycles parameter should be calculated using the first several records
    Stats(Options* & opt, bool isRead2 = false, int guessedCycles = 0, int bufferMargin = 1024);
    ~Stats();
    int getCycles();
    long getReads();
    long getBases();
    long getQ20();
    long getQ30();
    long getGCNumber();
    // by default the qualified qual score is Q20 ('5')
    void statRead(Read* r);

    // Combines a list of Stats objects into one and returns it by pointer.
    static Stats* merge(vector<Stats*>& list);
    void print();
    void summarize(bool forced = false);
    // a port of JSON report
    void reportJson(ofstream& ofs, string padding);
    // a port of HTML report
    void reportHtml(ofstream& ofs, string filteringType, string readName);
    void reportHtmlQuality(ofstream& ofs, string filteringType, string readName);
    void reportHtmlContents(ofstream& ofs, string filteringType, string readName);
    void reportHtmlKMER(ofstream& ofs, string filteringType, string readName);
    void reportHtmlORA(ofstream& ofs, string filteringType, string readName);
    bool isLongRead();
    void initOverRepSeq();
    int getMeanLength();

public:
    // Helpers that turn arrays / vectors into strings for the reports.
    static string list2string(double* list, int size);
    static string list2string(double* list, int size, long* coords);
    static string list2string(long* list, int size);
    static string list2string(std::vector<std::string> & x_vec, int top);
    static string list2string(std::vector<double> & y_vec, int top);
    static string list2string(std::vector<int> & x_vec, int top);
    //static string list2string(std::vector<int> & y_vec, int top);
    static int base2val(char base);

private:
    void extendBuffer(int newBufLen);
    string makeKmerTD(int i, int j);
    string kmer3(int val);
    string kmer2(int val);
    void deleteOverRepSeqDist();
    bool overRepPassed(string& seq, long count);

private:
    Options* mOptions;
    bool mIsRead2;
    long mReads;
    int mEvaluatedSeqLen;
    /* 
    why we use 8 here?
    map A/T/C/G/N to 0~7 by their ASCII % 8:
    'A' % 8 = 1
    'T' % 8 = 4
    'C' % 8 = 3
    'G' % 8 = 7
    'N' % 8 = 6
    */
    long *mCycleQ30Bases[8];
    long *mCycleQ20Bases[8];
    long *mCycleBaseContents[8];
    long *mCycleBaseQual[8];
    long *mCycleTotalBase;
    long *mCycleTotalQual;
    long *mKmer;

    // Curves and over-represented-sequence tables, keyed by string.
    map<string, double*> mQualityCurves;
    map<string, double*> mContentCurves;
    map<string, long> mOverRepSeq;
    map<string, long*> mOverRepSeqDist;


    int mCycles;
    int mBufLen;
    long mBases;
    // Indexed with the same ASCII % 8 scheme described above.
    long mQ20Bases[8];
    long mQ30Bases[8];
    long mBaseContents[8];
    long mQ20Total;
    long mQ30Total;
    bool summarized;
    long mKmerMax;
    long mKmerMin;
    int mKmerBufLen;
    long mLengthSum;
};
103 | 
104 | #endif


--------------------------------------------------------------------------------
/src/threadconfig.cpp:
--------------------------------------------------------------------------------
  1 | #include "threadconfig.h"
  2 | #include "util.h"
  3 | 
  4 | ThreadConfig::ThreadConfig(Options* & opt, BwtFmiDB* & tbwtfmiDB, int threadId, bool paired){
  5 |     mOptions = opt;
  6 |     mThreadId = threadId;
  7 |     mWorkingSplit = threadId;
  8 |     mCurrentSplitReads = 0;
  9 |     mPreStats1 = new Stats(mOptions, false);
 10 |     mPostStats1 = new Stats(mOptions, false);
 11 |     if(paired){
 12 |         mPreStats2 = new Stats(mOptions, true);
 13 |         mPostStats2 = new Stats(mOptions, true);
 14 |     } else {
 15 |         mPreStats2 = NULL;
 16 |         mPostStats2 = NULL;
 17 |     }
 18 |     mWriter1 = NULL;
 19 |     mWriter2 = NULL;
 20 | 
 21 |     mFilterResult = new FilterResult(opt, paired);
 22 |     mCanBeStopped = false;
 23 |     //mBwtfmiDB = tbwtfmiDB;
 24 |     mTransSearcher = new TransSearcher(mOptions, tbwtfmiDB);
 25 | }
 26 | 
 27 | ThreadConfig::~ThreadConfig() {
 28 |     cleanup();
 29 |     if(mTransSearcher != NULL){
 30 |         delete mTransSearcher;
 31 |         mTransSearcher = NULL;
 32 |     }
 33 | }
 34 | 
 35 | void ThreadConfig::cleanup() {
 36 |     if(mOptions->split.enabled && mOptions->split.byFileNumber)
 37 |         writeEmptyFilesForSplitting();
 38 |     deleteWriter();
 39 | }
 40 | 
 41 | void ThreadConfig::deleteWriter() {
 42 |     if(mWriter1 != NULL) {
 43 |         delete mWriter1;
 44 |         mWriter1 = NULL;
 45 |     }
 46 |     if(mWriter2 != NULL) {
 47 |         delete mWriter2;
 48 |         mWriter2 = NULL;
 49 |     }
 50 | }
 51 | 
// initWriter overloads: each one first drops any existing writers and then
// creates new Writer objects from the given target(s).

// Single output, by file name (uses the configured compression setting).
void ThreadConfig::initWriter(string filename1) {
    deleteWriter();
    mWriter1 = new Writer(filename1, mOptions->compression);
}

// Two outputs, by file name.
void ThreadConfig::initWriter(string filename1, string filename2) {
    deleteWriter();
    mWriter1 = new Writer(filename1, mOptions->compression);
    mWriter2 = new Writer(filename2, mOptions->compression);
}

// Single output on an existing ofstream.
void ThreadConfig::initWriter(ofstream* stream) {
    deleteWriter();
    mWriter1 = new Writer(stream);
}

// Two outputs on existing ofstreams.
void ThreadConfig::initWriter(ofstream* stream1, ofstream* stream2) {
    deleteWriter();
    mWriter1 = new Writer(stream1);
    mWriter2 = new Writer(stream2);
}

// Single output on an existing gzFile handle.
void ThreadConfig::initWriter(gzFile gzfile) {
    deleteWriter();
    mWriter1 = new Writer(gzfile);
}

// Two outputs on existing gzFile handles.
void ThreadConfig::initWriter(gzFile gzfile1, gzFile gzfile2) {
    deleteWriter();
    mWriter1 = new Writer(gzfile1);
    mWriter2 = new Writer(gzfile2);
}
 84 | 
// Forwards to this thread's FilterResult.
void ThreadConfig::addFilterResult(int result, int readNum) {
    mFilterResult->addFilterResult(result, readNum);
}

// Forwards to this thread's FilterResult.
void ThreadConfig::addMergedPairs(int pairs) {
    mFilterResult->addMergedPairs(pairs);
}
 92 | 
 93 | void ThreadConfig::initWriterForSplit() {
 94 |     if(mOptions->out1.empty())
 95 |         return ;
 96 | 
 97 |     // use 1-based naming
 98 |     string num = to_string(mWorkingSplit + 1);
 99 |     // padding for digits like 0001
100 |     if(mOptions->split.digits > 0){
101 |         while(num.size() < mOptions->split.digits)
102 |             num = "0" + num;
103 |     }
104 | 
105 |     string filename1 = joinpath(dirname(mOptions->out1), num + "." + basename(mOptions->out1));
106 |     if(!mOptions->isPaired()) {
107 |         initWriter(filename1);
108 |     } else {
109 |         string filename2 = joinpath(dirname(mOptions->out2), num + "." + basename(mOptions->out2));
110 |         initWriter(filename1, filename2);
111 |     }
112 | }
113 | 
114 | void ThreadConfig::markProcessed(long readNum) {
115 |     mCurrentSplitReads += readNum;
116 |     if(!mOptions->split.enabled)
117 |         return ;
118 |     // if splitting is enabled, check whether current file is full
119 |     if(mCurrentSplitReads >= mOptions->split.size) {
120 |         // if it's splitting by file number, totally we cannot exceed split.number
121 |         // if it's splitting by file lines, then we don't need to check
122 |         if(mOptions->split.byFileLines || mWorkingSplit + mOptions->thread < mOptions->split.number ){
123 |             mWorkingSplit += mOptions->thread;
124 |             initWriterForSplit();
125 |             mCurrentSplitReads = 0;
126 |         } else {
127 |             // this thread can be stoped now since all its tasks are done
128 |             // only a part of threads have to deal with the remaining reads
129 |             if(mOptions->split.number % mOptions->thread >0 
130 |                 && mThreadId >= mOptions->split.number % mOptions->thread)
131 |                 mCanBeStopped = true;
132 |         }
133 |     }
134 | }
135 | 
136 | // if a task of writting N files is assigned to this thread, but the input file doesn't have so many reads to input
137 | // write some empty files so it will not break following pipelines
138 | void ThreadConfig::writeEmptyFilesForSplitting() {
139 |     while(mWorkingSplit + mOptions->thread < mOptions->split.number) {
140 |         mWorkingSplit += mOptions->thread;
141 |             initWriterForSplit();
142 |             mCurrentSplitReads = 0;
143 |     }
144 | }
145 | 
// Returns the flag set by markProcessed() once this thread has no further
// split to write.
bool ThreadConfig::canBeStopped() {
    return mCanBeStopped;
}
149 | 


--------------------------------------------------------------------------------
/src/threadconfig.h:
--------------------------------------------------------------------------------
 1 | #ifndef THREAD_CONFIG_H
 2 | #define THREAD_CONFIG_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include <vector>
 8 | #include "stats.h"
 9 | #include "writer.h"
10 | #include "options.h"
11 | #include "filterresult.h"
12 | #include "transsearcher.hpp"
13 | #include "bwtfmiDB.h"
14 | 
15 | using namespace std;
16 | 
// Per-thread state: statistics objects, output writers, filter results,
// split-output bookkeeping and a TransSearcher instance.
class ThreadConfig{
public:
    // `paired` defaults to false.
    ThreadConfig(Options* & opt, BwtFmiDB* & tbwtfmiDB, int threadId, bool paired = false);
    ~ThreadConfig();
    // Plain accessors; they return the stored pointers without transferring
    // ownership.
    inline Stats* getPreStats1() {return mPreStats1;}
    inline Stats* getPostStats1() {return mPostStats1;}
    inline Stats* getPreStats2() {return mPreStats2;}
    inline Stats* getPostStats2() {return mPostStats2;}
    inline Writer* getWriter1() {return mWriter1;}
    inline Writer* getWriter2() {return mWriter2;}
    inline FilterResult* getFilterResult() {return mFilterResult;}
    inline TransSearcher* getTransSearcher(){return mTransSearcher;}

    // Writer set-up from file names, ofstreams or gzFile handles
    // (one or two outputs).
    void initWriter(string filename1);
    void initWriter(string filename1, string filename2);
    void initWriter(ofstream* stream);
    void initWriter(ofstream* stream1, ofstream* stream2);
    void initWriter(gzFile gzfile);
    void initWriter(gzFile gzfile1, gzFile gzfile2);

    void addFilterResult(int result, int readNum);
    void addMergedPairs(int pairs);

    int getThreadId() {return mThreadId;}
    // for splitting output
    // increase mCurrentSplitReads by readNum, and check it with options->split.size;
    void markProcessed(long readNum);
    void initWriterForSplit();
    bool canBeStopped();
    void cleanup();

private:
    void deleteWriter();
    void writeEmptyFilesForSplitting();

private:
    Stats* mPreStats1;
    Stats* mPostStats1;
    Stats* mPreStats2;
    Stats* mPostStats2;
    Writer* mWriter1;
    Writer* mWriter2;
    Options* mOptions;
    FilterResult* mFilterResult;

    // for spliting output
    int mThreadId;
    int mWorkingSplit;
    long mCurrentSplitReads;
    bool mCanBeStopped;
    
    TransSearcher* mTransSearcher;
    //BwtFmiDB* mBwtfmiDB;
};
71 | 
72 | #endif
73 | 


--------------------------------------------------------------------------------
/src/threadsconfig2.cpp:
--------------------------------------------------------------------------------
 1 | #include "threadsconfig2.h"
 2 | #include "util.h"
 3 | #include <memory.h>
 4 | #include <unistd.h>
 5 | 
 6 | ThreadsConfig2::ThreadsConfig2(Options* & opt, int threadId) {
 7 |     mOptions = opt;
 8 |     mThreadId = threadId;
 9 |     mWriter1 = NULL;
10 |     mInputCounter = 0;
11 |     mOutputCounter = 0;
12 |     mInputCompleted = false;
13 |     mRingBuffer = new char*[PACK_NUM_LIMIT];
14 |     memset(mRingBuffer, 0, sizeof(char*) * PACK_NUM_LIMIT);
15 |     mRingBufferSizes = new size_t[PACK_NUM_LIMIT];
16 |     memset(mRingBufferSizes, 0, sizeof(char*) * PACK_NUM_LIMIT);
17 |     string fullpath = "";
18 |     if(mThreadId < mOptions->mSeqExtractions.targetGenesSubVec.size()){
19 |         auto filename = mOptions->mSeqExtractions.targetGenesSubVec[mThreadId] + mOptions->mSeqExtractions.suffix;
20 |         fullpath = joinpath(mOptions->mSeqExtractions.outputDir, filename);
21 |     } else {
22 |         fullpath = mOptions->mSeqExtractions.undeterminedFileNameOut;
23 |     }
24 |     initWriter(fullpath);
25 | }
26 | 
27 | ThreadsConfig2::~ThreadsConfig2() {
28 |     cleanup();
29 |     if (mRingBuffer) {
30 |         delete mRingBuffer;
31 |         mRingBuffer = NULL;
32 |     }
33 |     if (mRingBufferSizes) {
34 |         delete mRingBufferSizes;
35 |         mRingBufferSizes = NULL;
36 |     }
37 | }
38 | 
// initWriter overloads: each one first drops any existing writer and then
// creates a new Writer from the given target.

// By file name (uses the configured compression setting).
void ThreadsConfig2::initWriter(string filename1) {
    deleteWriter();
    mWriter1 = new Writer(filename1, mOptions->compression);
}

// On an existing ofstream.
void ThreadsConfig2::initWriter(ofstream* stream) {
    deleteWriter();
    mWriter1 = new Writer(stream);
}

// On an existing gzFile handle.
void ThreadsConfig2::initWriter(gzFile gzfile) {
    deleteWriter();
    mWriter1 = new Writer(gzfile);
}
53 | 
54 | void ThreadsConfig2::deleteWriter() {
55 |     if(mWriter1 != NULL){
56 |         delete mWriter1;
57 |         mWriter1 = NULL;
58 |     }
59 | }
60 | 
61 | void ThreadsConfig2::cleanup(){
62 |     deleteWriter();
63 | }
64 | 
65 | bool ThreadsConfig2::isCompleted(){
66 |     return mInputCompleted && (mOutputCounter == mInputCounter);
67 | }
68 | 
69 | void ThreadsConfig2::setInputCompleted(){
70 |     mInputCompleted = true;
71 | }
72 | 
73 | void ThreadsConfig2::input(char* data, size_t size){
74 |     long target = mInputCounter % PACK_NUM_LIMIT;
75 |     mRingBuffer[target] = data;
76 |     mRingBufferSizes[target] = size;
77 |     mInputCounter++;
78 | }
79 | 
80 | void ThreadsConfig2::output(){
81 |     if(mOutputCounter >= mInputCounter){
82 |         usleep(100);
83 |     }
84 |     while(mOutputCounter < mInputCounter){
85 |         long target = mOutputCounter % PACK_NUM_LIMIT;
86 |         mWriter1->write(mRingBuffer[target], mRingBufferSizes[target]);
87 |         delete mRingBuffer[target];
88 |         mRingBuffer[target] = NULL;
89 |         mOutputCounter++;
90 |     }
91 | }


--------------------------------------------------------------------------------
/src/threadsconfig2.h:
--------------------------------------------------------------------------------
 1 | #ifndef THREADSCONFIG2_H
 2 | #define THREADSCONFIG2_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include <vector>
 8 | #include <atomic>
 9 | #include "writer.h"
10 | #include "options.h"
11 | 
12 | using namespace std;
13 | 
14 | class ThreadsConfig2 {
15 | public:
16 |     ThreadsConfig2(Options* & opt, int threadId);
17 |     ~ThreadsConfig2();
18 |     
19 |     void initWriter(string filename1);
20 |     void initWriter(ofstream* stream);
21 |     void initWriter(gzFile gzfile);
22 |     
23 |     int getThreadId(){return mThreadId;};
24 |     void cleanup();
25 |     
26 |     bool isCompleted();
27 |     void output();
28 |     void input(char* data, size_t size);
29 |     void setInputCompleted();
30 |     
31 | private:
32 |     void deleteWriter();
33 |     
34 | private:
35 |     Writer* mWriter1;
36 |     Options* mOptions;
37 |     
38 |     int mThreadId;
39 |     bool mInputCompleted;
40 |     atomic_long mInputCounter;
41 |     atomic_long mOutputCounter;
42 |     char** mRingBuffer;
43 |     size_t* mRingBufferSizes;
44 | };
45 | 
46 | #endif /* THREADSCONFIG2_H */
47 | 
48 | 


--------------------------------------------------------------------------------
/src/transsearcher.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef TRANSSEARCHER_HPP
  2 | #define TRANSSEARCHER_HPP
  3 | 
  4 | #include <stdint.h>
  5 | #include <assert.h>
  6 | #include <unordered_map>
  7 | #include <unordered_set>
  8 | #include <set>
  9 | #include <list>
 10 | #include <cmath>
 11 | #include <algorithm>
 12 | #include <mutex>
 13 | #include <iostream>
 14 | #include <sstream>
 15 | #include <vector>
 16 | #include <iterator>
 17 | #include <string>
 18 | #include <cstring>
 19 | #include <climits>
 20 | #include <map>
 21 | #include <utility>
 22 | #include <functional>
 23 | #include <locale>
 24 | #include <stdio.h>
 25 | #include <cmath>
 26 | 
 27 | #include "util.h"
 28 | #include "algo/blast/core/blast_seg.h"
 29 | #include "algo/blast/core/blast_filter.h"
 30 | #include "algo/blast/core/blast_encoding.h"
 31 | #include "read.h"
 32 | #include "options.h"
 33 | #include "fragment.h"
 34 | #include "options.h"
 35 | #include "bwtfmiDB.h"
 36 | #include "common.h"
 37 | 
 38 | extern "C" {
 39 | #include "bwt/bwt.h"
 40 | }
 41 | 
 42 | const double LN_2 = 0.6931471805;
 43 | const double LAMBDA = 0.3176;
 44 | const double LN_K = -2.009915479;
 45 | 
 46 | class TransSearcher {
 47 | protected:
 48 |     uint8_t codon_to_int(const char* codon);
 49 |     uint8_t revcomp_codon_to_int(const char* codon);
 50 | 
 51 |     uint8_t nuc2int[256];
 52 |     uint8_t compnuc2int[256];
 53 |     char codon2aa[256];
 54 |     uint8_t aa2int[256];
 55 | 
 56 |     std::map<char, std::vector<char>> blosum_subst;
 57 |     int8_t blosum62diag[20];
 58 |     int8_t b62[20][20];
 59 | 
 60 |     std::string translations[6];
 61 |     std::multimap<unsigned int, Fragment *, std::greater<unsigned int>> fragments;
 62 |     std::vector<SI *> best_matches_SI;
 63 |     std::vector<SI *> longest_matches_SI;
 64 |     std::vector<std::string> best_matches;
 65 |     std::vector<std::string> longest_fragments;
 66 |     
 67 |     unsigned int best_match_score = 0;
 68 |     double query_len;
 69 |     uint32_t read_count = 0;
 70 |     uint32 uniq_mapped_reads = 0;
 71 |     uint32 multi_mapped_reads = 0;
 72 | 
 73 |     void clearFragments();
 74 |     unsigned int calcScore(const std::string &);
 75 |     unsigned int calcScore(const std::string &, int);
 76 |     unsigned int calcScore(const std::string &, size_t, size_t, int);
 77 |     void addAllMismatchVariantsAtPosSI(const Fragment *, unsigned int, size_t, SI *); // used in Greedy mode
 78 |     Fragment * getNextFragment(unsigned int);
 79 |     void eval_match_scores(SI *si, Fragment *);
 80 |     void getAllFragmentsBits(const std::string & line);
 81 |     void getLongestFragmentsBits(const std::string & line);
 82 |     void flush_output();
 83 |     void preProcess();
 84 |     void doProcess();
 85 |     uint32 * postProcess();
 86 | 
 87 | protected:
 88 |     void classify_length();
 89 |     void classify_greedyblosum();
 90 |     void ids_from_SI(SI *);
 91 |     void ids_from_SI_recursive(SI *);
 92 |     std::set<char *> match_ids;
 93 |     std::set<const uint32 *> matched_genids;
 94 |     std::map<const uint32 *, uint32> tmpIdFreqMap;
 95 |     std::map<const uint32 *, uint32> idFreqSubMap;
 96 |     Options * mOptions;   
 97 |     BwtFmiDB * tbwtfmiDB;
 98 |     
 99 | public:
100 |     TransSearcher(Options * & opt, BwtFmiDB * & mBwtfmiDB);
101 |     void transSearch(Read * item, uint32* & orthId);
102 |     void transSearch(Read * item1, Read * item2, uint32* & orthId);
103 |     inline std::map<const uint32 *, uint32> getIdFreqSubMap(){return idFreqSubMap;};
104 |     static std::map<const uint32 *, uint32> merge(std::vector<std::map<const uint32*, uint32>> & list);
105 | };
106 | 
107 | 
108 | #endif /* TRANSSEARCHER_HPP */
109 | 


--------------------------------------------------------------------------------
/src/umiprocessor.cpp:
--------------------------------------------------------------------------------
 1 | #include "umiprocessor.h"
 2 | 
 3 | UmiProcessor::UmiProcessor(Options* & opt){
 4 |     mOptions = opt;
 5 | }
 6 | 
 7 | 
 8 | UmiProcessor::~UmiProcessor(){
 9 | }
10 | 
11 | void UmiProcessor::process(Read* r1, Read* r2) {
12 |     if(!mOptions->umi.enabled)
13 |         return;
14 | 
15 |     string umi;
16 |     if(mOptions->umi.location == UMI_LOC_INDEX1)
17 |         umi = r1->firstIndex();
18 |     else if(mOptions->umi.location == UMI_LOC_INDEX2 && r2)
19 |         umi = r2->lastIndex();
20 |     else if(mOptions->umi.location == UMI_LOC_READ1){
21 |         umi = r1->mSeq.mStr.substr(0, min(r1->length(), mOptions->umi.length));
22 |         r1->trimFront(umi.length() + mOptions->umi.skip);
23 |     }
24 |     else if(mOptions->umi.location == UMI_LOC_READ2 && r2){
25 |         umi = r2->mSeq.mStr.substr(0, min(r2->length(), mOptions->umi.length));
26 |         r2->trimFront(umi.length() + mOptions->umi.skip);
27 |     }
28 |     else if(mOptions->umi.location == UMI_LOC_PER_INDEX){
29 |         string umiMerged = r1->firstIndex();
30 |         if(r2) {
31 |             umiMerged = umiMerged + "_" + r2->lastIndex();
32 |         }
33 | 
34 |         addUmiToName(r1, umiMerged);
35 |         if(r2) {
36 |             addUmiToName(r2, umiMerged);
37 |         }
38 |     }
39 |     else if(mOptions->umi.location == UMI_LOC_PER_READ){
40 |         string umi1 = r1->mSeq.mStr.substr(0, min(r1->length(), mOptions->umi.length));
41 |         string umiMerged = umi1;
42 |         r1->trimFront(umi1.length() + mOptions->umi.skip);
43 |         if(r2){
44 |             string umi2 = r2->mSeq.mStr.substr(0, min(r2->length(), mOptions->umi.length));
45 |             umiMerged = umiMerged + "_" + umi2;
46 |             r2->trimFront(umi2.length() + mOptions->umi.skip);
47 |         }
48 | 
49 |         addUmiToName(r1, umiMerged);
50 |         if(r2){
51 |             addUmiToName(r2, umiMerged);
52 |         }
53 |     }
54 | 
55 |     if(mOptions->umi.location != UMI_LOC_PER_INDEX && mOptions->umi.location != UMI_LOC_PER_READ) {
56 |         if(r1 && !umi.empty()) 
57 |             addUmiToName(r1, umi);
58 |         if(r2 && !umi.empty())
59 |             addUmiToName(r2, umi);
60 |     }
61 | }
62 | 
63 | void UmiProcessor::addUmiToName(Read* r, string umi){
64 |     string tag;
65 |     if(mOptions->umi.prefix.empty())
66 |         tag = ":" + umi;
67 |     else
68 |         tag = ":" + mOptions->umi.prefix + "_" + umi;
69 |     int spacePos = -1;
70 |     for(int i=0; i<r->mName.length(); i++) {
71 |         if(r->mName[i] == ' ') {
72 |             spacePos = i;
73 |             break;
74 |         }
75 |     }
76 |     if(spacePos == -1) {
77 |         r->mName = r->mName + tag;
78 |     } else {
79 |         r->mName = r->mName.substr(0, spacePos) + tag + r->mName.substr(spacePos, r->mName.length() - spacePos);
80 |     }
81 | 
82 | }
83 | 
84 | 
85 | bool UmiProcessor::test() {
86 |     return true;
87 | }


--------------------------------------------------------------------------------
/src/umiprocessor.h:
--------------------------------------------------------------------------------
 1 | #ifndef UMI_PROCESSOR_H
 2 | #define UMI_PROCESSOR_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include "options.h"
 8 | #include "read.h"
 9 | 
10 | using namespace std;
11 | 
12 | class UmiProcessor{
13 | public:
14 |     UmiProcessor(Options* & opt);
15 |     ~UmiProcessor();
16 |     void process(Read* r1, Read* r2 = NULL);
17 |     void addUmiToName(Read* r, string umi);
18 |     static bool test();
19 | 
20 | private:
21 |     Options* mOptions;
22 | };
23 | 
24 | 
25 | #endif


--------------------------------------------------------------------------------
/src/unittest.cpp:
--------------------------------------------------------------------------------
 1 | #include "unittest.h"
 2 | #include "sequence.h"
 3 | #include "fastqreader.h"
 4 | #include "read.h"
 5 | #include "overlapanalysis.h"
 6 | #include "filter.h"
 7 | #include "adaptertrimmer.h"
 8 | #include "basecorrector.h"
 9 | #include "polyx.h"
10 | #include "nucleotidetree.h"
11 | #include "evaluator.h"
12 | #include <time.h>
13 | 
14 | UnitTest::UnitTest(){
15 | 
16 | }
17 | 
18 | void UnitTest::run(){
19 |     bool passed = true;
20 |     passed &= report(Sequence::test(), "Sequence::test");
21 |     passed &= report(Read::test(), "Read::test");
22 |     passed &= report(OverlapAnalysis::test(), "OverlapAnalysis::test");
23 |     passed &= report(Filter::test(), "Filter::test");
24 |     passed &= report(AdapterTrimmer::test(), "AdapterTrimmer::test");
25 |     passed &= report(BaseCorrector::test(), "BaseCorrector::test");
26 |     passed &= report(PolyX::test(), "PolyX::test");
27 |     passed &= report(NucleotideTree::test(), "NucleotideTree::test");
28 |     passed &= report(Evaluator::test(), "Evaluator::test");
29 |     printf("\n==========================\n");
30 |     printf("%s\n\n", passed?"ALL PASSED":"FAILED");
31 | }
32 | 
33 | bool UnitTest::report(bool result, string message) {
34 |     printf("%s:%s\n\n", message.c_str(), result?" PASSED":" FAILED");
35 |     return result;
36 | }


--------------------------------------------------------------------------------
/src/unittest.h:
--------------------------------------------------------------------------------
 1 | #ifndef UNIT_TEST_H
 2 | #define UNIT_TEST_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | 
 8 | using namespace std;
 9 | 
10 | class UnitTest{
11 | public:
12 |     UnitTest();
13 |     void run();
14 |     bool report(bool result, string message);
15 | };
16 | 
17 | #endif


--------------------------------------------------------------------------------
/src/writer.cpp:
--------------------------------------------------------------------------------
  1 | #include "writer.h"
  2 | #include "util.h"
  3 | #include "fastqreader.h"
  4 | #include <string.h>
  5 | 
  6 | Writer::Writer(string filename, int compression) {
  7 |     mCompression = compression;
  8 |     mFilename = filename;
  9 |     mZipFile = NULL;
 10 |     mZipped = false;
 11 |     haveToClose = true;
 12 |     init();
 13 | }
 14 | 
 15 | Writer::Writer(ofstream* stream) {
 16 |     mZipFile = NULL;
 17 |     mZipped = false;
 18 |     mOutStream = stream;
 19 |     haveToClose = false;
 20 | }
 21 | 
 22 | Writer::Writer(gzFile gzfile) {
 23 |     mOutStream = NULL;
 24 |     mZipFile = gzfile;
 25 |     mZipped = true;
 26 |     haveToClose = false;
 27 | }
 28 | 
 29 | Writer::~Writer() {
 30 |     if (haveToClose) {
 31 |         close();
 32 |     }
 33 | }
 34 | 
 35 | string Writer::filename() {
 36 |     return mFilename;
 37 | }
 38 | 
 39 | void Writer::init() {
 40 |     if (ends_with(mFilename, ".gz")) {
 41 |         mZipFile = gzopen(mFilename.c_str(), "w");
 42 |         gzsetparams(mZipFile, mCompression, Z_DEFAULT_STRATEGY);
 43 |         gzbuffer(mZipFile, 1024 * 1024);
 44 |         mZipped = true;
 45 |     } else {
 46 |         mOutStream = new ofstream();
 47 |         mOutStream->open(mFilename.c_str(), ifstream::out);
 48 |         mZipped = false;
 49 |     }
 50 | }
 51 | 
 52 | bool Writer::writeLine(string& linestr) {
 53 |     const char* line = linestr.c_str();
 54 |     size_t size = linestr.length();
 55 |     size_t written;
 56 |     bool status;
 57 |     if (mZipped) {
 58 |         written = gzwrite(mZipFile, line, size);
 59 |         gzputc(mZipFile, '\n');
 60 |         status = size == written;
 61 |     } else {
 62 |         mOutStream->write(line, size);
 63 |         mOutStream->put('\n');
 64 |         status = !mOutStream->fail();
 65 |     }
 66 | 
 67 |     return status;
 68 | }
 69 | 
 70 | bool Writer::writeString(string& str) {
 71 |     const char* strdata = str.c_str();
 72 |     size_t size = str.length();
 73 |     size_t written;
 74 |     bool status;
 75 |     if (mZipped) {
 76 |         written = gzwrite(mZipFile, strdata, size);
 77 |         status = size == written;
 78 |     } else {
 79 |         mOutStream->write(strdata, size);
 80 |         status = !mOutStream->fail();
 81 |     }
 82 | 
 83 |     return status;
 84 | }
 85 | 
 86 | bool Writer::write(char* strdata, size_t size) {
 87 |     size_t written;
 88 |     bool status;
 89 | 
 90 |     if (mZipped) {
 91 |         written = gzwrite(mZipFile, strdata, size);
 92 |         status = size == written;
 93 |     } else {
 94 |         mOutStream->write(strdata, size);
 95 |         status = !mOutStream->fail();
 96 |     }
 97 |     return status;
 98 | }
 99 | 
100 | void Writer::close() {
101 |     if (mZipped) {
102 |         if (mZipFile) {
103 |             gzflush(mZipFile, Z_FINISH);
104 |             gzclose(mZipFile);
105 |             mZipFile = NULL;
106 |         }
107 |     } else if (mOutStream) {
108 |         if (mOutStream->is_open()) {
109 |             mOutStream->flush();
110 |             //TODO: following two lines will cause crash
111 |             //mOutStream->close();
112 |             //delete mOutStream;
113 |             mOutStream = NULL;
114 |         }
115 |     }
116 | }
117 | 
118 | bool Writer::isZipped() {
119 |     return mZipped;
120 | }


--------------------------------------------------------------------------------
/src/writer.h:
--------------------------------------------------------------------------------
 1 | #ifndef _WRITER_H
 2 | #define _WRITER_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #ifdef DYNAMIC_ZLIB
 7 |   #include <zlib.h>
 8 | #else
 9 |   #include "zlib/zlib.h"
10 | #endif
11 | #include "common.h"
12 | #include <iostream>
13 | #include <fstream>
14 | 
15 | using namespace std;
16 | 
17 | class Writer{
18 | public:
19 | 	Writer(string filename, int compression = 3);
20 | 	Writer(ofstream* stream);
21 | 	Writer(gzFile gzfile);
22 | 	~Writer();
23 | 	bool isZipped();
24 | 	bool writeString(string& s);
25 | 	bool writeLine(string& linestr);
26 | 	bool write(char* strdata, size_t size);
27 | 	string filename();
28 | 
29 | public:
30 | 	static bool test();
31 | 
32 | private:
33 | 	void init();
34 | 	void close();
35 | 
36 | private:
37 | 	string mFilename;
38 | 	gzFile mZipFile;
39 | 	ofstream* mOutStream;
40 | 	bool mZipped;
41 | 	int mCompression;
42 | 	bool haveToClose;
43 | };
44 | 
45 | #endif
46 | 


--------------------------------------------------------------------------------
/src/writerthread.cpp:
--------------------------------------------------------------------------------
 1 | #include "writerthread.h"
 2 | #include "util.h"
 3 | #include <memory.h>
 4 | #include <unistd.h>
 5 | 
 6 | WriterThread::WriterThread(Options* & opt, string filename) {
 7 |     mOptions = opt;
 8 |     mWriter1 = NULL;
 9 |     mInputCounter = 0;
10 |     mOutputCounter = 0;
11 |     mInputCompleted = false;
12 |     mFilename = filename;
13 |     mRingBuffer = new char*[PACK_NUM_LIMIT];
14 |     memset(mRingBuffer, 0, sizeof (char*) * PACK_NUM_LIMIT);
15 |     mRingBufferSizes = new size_t[PACK_NUM_LIMIT];
16 |     memset(mRingBufferSizes, 0, sizeof (size_t) * PACK_NUM_LIMIT);
17 |     initWriter(filename);
18 | }
19 | 
20 | WriterThread::~WriterThread() {
21 |     cleanup();
22 |     delete[] mRingBuffer;
23 |     delete[] mRingBufferSizes;
24 | }
25 | 
26 | bool WriterThread::isCompleted() {
27 |     return mInputCompleted && (mOutputCounter == mInputCounter);
28 | }
29 | 
30 | bool WriterThread::setInputCompleted() {
31 |     mInputCompleted = true;
32 |     return true;
33 | }
34 | 
35 | void WriterThread::output() {
36 |     if (mOutputCounter >= mInputCounter) {
37 |         usleep(100);
38 |     }
39 |     while (mOutputCounter < mInputCounter) {
40 |         mWriter1->write(mRingBuffer[mOutputCounter], mRingBufferSizes[mOutputCounter]);
41 |         delete[] mRingBuffer[mOutputCounter];
42 |         mRingBuffer[mOutputCounter] = NULL;
43 |         mOutputCounter++;
44 |     }
45 | }
46 | 
47 | void WriterThread::input(char* data, size_t size) {
48 |     mRingBuffer[mInputCounter] = data;
49 |     mRingBufferSizes[mInputCounter] = size;
50 |     mInputCounter++;
51 | }
52 | 
53 | void WriterThread::cleanup() {
54 |     deleteWriter();
55 | }
56 | 
57 | void WriterThread::deleteWriter() {
58 |     if (mWriter1 != NULL) {
59 |         delete mWriter1;
60 |         mWriter1 = NULL;
61 |     }
62 | }
63 | 
64 | void WriterThread::initWriter(string filename1) {
65 |     deleteWriter();
66 |     mWriter1 = new Writer(filename1, mOptions->compression);
67 | }
68 | 
69 | void WriterThread::initWriter(ofstream* stream) {
70 |     deleteWriter();
71 |     mWriter1 = new Writer(stream);
72 | }
73 | 
74 | void WriterThread::initWriter(gzFile gzfile) {
75 |     deleteWriter();
76 |     mWriter1 = new Writer(gzfile);
77 | }
78 | 
79 | long WriterThread::bufferLength() {
80 |     return mInputCounter - mOutputCounter;
81 | }
82 | 


--------------------------------------------------------------------------------
/src/writerthread.h:
--------------------------------------------------------------------------------
 1 | #ifndef WRITER_THREAD_H
 2 | #define WRITER_THREAD_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include <vector>
 8 | #include "writer.h"
 9 | #include "options.h"
10 | #include <atomic>
11 | #include <mutex>
12 | 
13 | using namespace std;
14 | 
15 | class WriterThread{
16 | public:
17 |     WriterThread(Options* & opt, string filename);
18 |     ~WriterThread();
19 | 
20 |     void initWriter(string filename1);
21 |     void initWriter(ofstream* stream);
22 |     void initWriter(gzFile gzfile);
23 | 
24 |     void cleanup();
25 | 
26 |     bool isCompleted();
27 |     void output();
28 |     void input(char* data, size_t size);
29 |     bool setInputCompleted();
30 | 
31 |     long bufferLength();
32 |     string getFilename() {return mFilename;}
33 | 
34 | private:
35 |     void deleteWriter();
36 | 
37 | private:
38 |     Writer* mWriter1;
39 |     Options* mOptions;
40 |     string mFilename;
41 | 
42 |     // for spliting output
43 |     bool mInputCompleted;
44 |     atomic_long mInputCounter;
45 |     atomic_long mOutputCounter;
46 |     char** mRingBuffer;
47 |     size_t* mRingBufferSizes;
48 | 
49 |     mutex mtx;
50 | 
51 | };
52 | 
53 | #endif


--------------------------------------------------------------------------------
/src/zlib/inffast.h:
--------------------------------------------------------------------------------
 1 | /* inffast.h -- header to use inffast.c
 2 |  * Copyright (C) 1995-2003, 2010 Mark Adler
 3 |  * For conditions of distribution and use, see copyright notice in zlib.h
 4 |  */
 5 | 
 6 | /* WARNING: this file should *not* be used by applications. It is
 7 |    part of the implementation of the compression library and is
 8 |    subject to change. Applications should only use zlib.h.
 9 |  */
10 | 
11 | void ZLIB_INTERNAL inflate_fast OF((z_streamp strm, unsigned start));
12 | 


--------------------------------------------------------------------------------
/src/zlib/inftrees.h:
--------------------------------------------------------------------------------
 1 | /* inftrees.h -- header to use inftrees.c
 2 |  * Copyright (C) 1995-2005, 2010 Mark Adler
 3 |  * For conditions of distribution and use, see copyright notice in zlib.h
 4 |  */
 5 | 
 6 | /* WARNING: this file should *not* be used by applications. It is
 7 |    part of the implementation of the compression library and is
 8 |    subject to change. Applications should only use zlib.h.
 9 |  */
10 | 
11 | /* Structure for decoding tables.  Each entry provides either the
12 |    information needed to do the operation requested by the code that
13 |    indexed that table entry, or it provides a pointer to another
14 |    table that indexes more bits of the code.  op indicates whether
15 |    the entry is a pointer to another table, a literal, a length or
16 |    distance, an end-of-block, or an invalid code.  For a table
17 |    pointer, the low four bits of op is the number of index bits of
18 |    that table.  For a length or distance, the low four bits of op
19 |    is the number of extra bits to get after the code.  bits is
20 |    the number of bits in this code or part of the code to drop off
21 |    of the bit buffer.  val is the actual byte to output in the case
22 |    of a literal, the base length or distance, or the offset from
23 |    the current table to the next table.  Each entry is four bytes. */
24 | typedef struct {
25 |     unsigned char op;           /* operation, extra bits, table bits */
26 |     unsigned char bits;         /* bits in this part of the code */
27 |     unsigned short val;         /* offset in table or code value */
28 | } code;
29 | 
30 | /* op values as set by inflate_table():
31 |     00000000 - literal
32 |     0000tttt - table link, tttt != 0 is the number of table index bits
33 |     0001eeee - length or distance, eeee is the number of extra bits
34 |     01100000 - end of block
35 |     01000000 - invalid code
36 |  */
37 | 
38 | /* Maximum size of the dynamic table.  The maximum number of code structures is
39 |    1444, which is the sum of 852 for literal/length codes and 592 for distance
40 |    codes.  These values were found by exhaustive searches using the program
41 |    examples/enough.c found in the zlib distribution.  The arguments to that
42 |    program are the number of symbols, the initial root table size, and the
43 |    maximum bit length of a code.  "enough 286 9 15" for literal/length codes
44 |    returns 852, and "enough 30 6 15" for distance codes returns 592.
45 |    The initial root table size (9 or 6) is found in the fifth argument of the
46 |    inflate_table() calls in inflate.c and infback.c.  If the root table size is
47 |    changed, then these maximum sizes would need to be recalculated and
48 |    updated. */
49 | #define ENOUGH_LENS 852
50 | #define ENOUGH_DISTS 592
51 | #define ENOUGH (ENOUGH_LENS+ENOUGH_DISTS)
52 | 
53 | /* Type of code to build for inflate_table() */
54 | typedef enum {
55 |     CODES,
56 |     LENS,
57 |     DISTS
58 | } codetype;
59 | 
60 | int ZLIB_INTERNAL inflate_table OF((codetype type, unsigned short FAR *lens,
61 |                              unsigned codes, code FAR * FAR *table,
62 |                              unsigned FAR *bits, unsigned short FAR *work));
63 | 


--------------------------------------------------------------------------------
/testdata/D1.CE2-S4-LT_R1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xia-lab/Seq2Fun/32a89d0ee8c47764e54d1f259d3831e8ca0fd2f5/testdata/D1.CE2-S4-LT_R1.fastq.gz


--------------------------------------------------------------------------------
/testdata/D1.CE2-S4-LT_R2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xia-lab/Seq2Fun/32a89d0ee8c47764e54d1f259d3831e8ca0fd2f5/testdata/D1.CE2-S4-LT_R2.fastq.gz


--------------------------------------------------------------------------------
/testdata/D2.CE2-H2-LT_R1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xia-lab/Seq2Fun/32a89d0ee8c47764e54d1f259d3831e8ca0fd2f5/testdata/D2.CE2-H2-LT_R1.fastq.gz


--------------------------------------------------------------------------------
/testdata/D2.CE2-H2-LT_R2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xia-lab/Seq2Fun/32a89d0ee8c47764e54d1f259d3831e8ca0fd2f5/testdata/D2.CE2-H2-LT_R2.fastq.gz


--------------------------------------------------------------------------------
/testdata/sample.txt:
--------------------------------------------------------------------------------
1 | D1.CE2-S4-LT	D1.CE2-S4-LT_R1.fastq.gz	D1.CE2-S4-LT_R2.fastq.gz	control
2 | D2.CE2-H2-LT	D2.CE2-H2-LT_R1.fastq.gz	D2.CE2-H2-LT_R2.fastq.gz	high
3 | 


--------------------------------------------------------------------------------