├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── data
    ├── SARS-CoV-2.genomes.fa
    └── SARS-CoV-2.kmer.fa
└── src
    ├── adaptertrimmer.cpp
    ├── adaptertrimmer.h
    ├── basecorrector.cpp
    ├── basecorrector.h
    ├── cmdline.h
    ├── common.h
    ├── coverage.js
    ├── duplicate.cpp
    ├── duplicate.h
    ├── editdistance.cpp
    ├── editdistance.h
    ├── evaluator.cpp
    ├── evaluator.h
    ├── fastareader.cpp
    ├── fastareader.h
    ├── fastqreader.cpp
    ├── fastqreader.h
    ├── filter.cpp
    ├── filter.h
    ├── filterresult.cpp
    ├── filterresult.h
    ├── genomes.cpp
    ├── genomes.h
    ├── htmlreporter.cpp
    ├── htmlreporter.h
    ├── jsonreporter.cpp
    ├── jsonreporter.h
    ├── kmer.cpp
    ├── kmer.h
    ├── kmercollection.cpp
    ├── kmercollection.h
    ├── knownadapters.h
    ├── main.cpp
    ├── nucleotidetree.cpp
    ├── nucleotidetree.h
    ├── options.cpp
    ├── options.h
    ├── overlapanalysis.cpp
    ├── overlapanalysis.h
    ├── peprocessor.cpp
    ├── peprocessor.h
    ├── polyx.cpp
    ├── polyx.h
    ├── processor.cpp
    ├── processor.h
    ├── read.cpp
    ├── read.h
    ├── seprocessor.cpp
    ├── seprocessor.h
    ├── sequence.cpp
    ├── sequence.h
    ├── stats.cpp
    ├── stats.h
    ├── threadconfig.cpp
    ├── threadconfig.h
    ├── umiprocessor.cpp
    ├── umiprocessor.h
    ├── unittest.cpp
    ├── unittest.h
    ├── util.h
    ├── virusdetector.cpp
    ├── virusdetector.h
    ├── writer.cpp
    ├── writer.h
    ├── writerthread.cpp
    ├── writerthread.h
    └── zlib
        ├── crc32.h
        ├── deflate.h
        ├── gzguts.h
        ├── inffast.h
        ├── inffixed.h
        ├── inflate.h
        ├── inftrees.h
        ├── trees.h
        ├── zconf.h
        ├── zlib.h
        └── zutil.h


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Compiled Object files
 5 | *.slo
 6 | *.lo
 7 | *.o
 8 | *.obj
 9 | 
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 | 
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 | 
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 | 
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 | 
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 OpenGene - Open Source Genetics Toolbox
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | DIR_INC := ./inc
 2 | DIR_SRC := ./src
 3 | DIR_OBJ := ./obj
 4 | 
 5 | PREFIX ?= /usr/local
 6 | BINDIR ?= $(PREFIX)/bin
 7 | INCLUDE_DIRS ?=
 8 | LIBRARY_DIRS ?=
 9 | 
10 | SRC := $(wildcard ${DIR_SRC}/*.cpp)
11 | OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${SRC}))
12 | 
13 | TARGET := fastv
14 | 
15 | BIN_TARGET := ${TARGET}
16 | 
17 | CXX ?= g++
18 | CXXFLAGS := -std=c++11 -g -O3 -I${DIR_INC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) ${CXXFLAGS}
19 | LIBS := -lz -lpthread
20 | LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) $(LD_FLAGS)
21 | 
22 | 
23 | ${BIN_TARGET}:${OBJ}
24 | 	$(CXX) $(OBJ) -o $@ $(LD_FLAGS)
25 | 
26 | ${DIR_OBJ}/%.o:${DIR_SRC}/%.cpp make_obj_dir
27 | 	$(CXX) -c $< -o $@ $(CXXFLAGS)
28 | 
29 | .PHONY:clean
30 | clean:
31 | 	@if test -d $(DIR_OBJ) ; \
32 | 	then \
33 | 		find $(DIR_OBJ) -name *.o -delete; \
34 | 	fi
35 | 	@if test -e $(TARGET) ; \
36 | 	then \
37 | 		rm $(TARGET) ; \
38 | 	fi
39 | 
40 | make_obj_dir:
41 | 	@if test ! -d $(DIR_OBJ) ; \
42 | 	then \
43 | 		mkdir $(DIR_OBJ) ; \
44 | 	fi
45 | 
46 | install:
47 | 	install $(TARGET) $(BINDIR)/$(TARGET)
48 | 	@echo "Installed."
49 | 


--------------------------------------------------------------------------------
/src/adaptertrimmer.cpp:
--------------------------------------------------------------------------------
  1 | #include "adaptertrimmer.h"
  2 | 
  3 | AdapterTrimmer::AdapterTrimmer(){
  4 | }
  5 | 
  6 | 
  7 | AdapterTrimmer::~AdapterTrimmer(){
  8 | }
  9 | 
 10 | bool AdapterTrimmer::trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit) {
 11 |     OverlapResult ov = OverlapAnalysis::analyze(r1, r2, diffLimit, overlapRequire, diffPercentLimit);
 12 |     return trimByOverlapAnalysis(r1, r2, fr, ov);
 13 | }
 14 | 
 15 | bool AdapterTrimmer::trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov, int frontTrimmed1, int frontTrimmed2) {
 16 |     int ol = ov.overlap_len;
 17 |     if(ov.overlapped && ov.offset < 0) {
 18 | 
 19 |         //5'      ......frontTrimmed1......|------------------------------------------|----- 3'
 20 |         //3' -----|-------------------------------------------|......frontTrimmed2.....      5'
 21 | 
 22 |         int len1 = min(r1->length(), ol + frontTrimmed2);
 23 |         int len2 = min(r2->length(), ol + frontTrimmed1);
 24 |         string adapter1 = r1->mSeq.mStr.substr(len1, r1->length() - len1);
 25 |         string adapter2 = r2->mSeq.mStr.substr(len2, r2->length() - len2);
 26 | 
 27 |         if(_DEBUG) {
 28 |             cerr << adapter1 << endl;
 29 |             cerr << adapter2 << endl;
 30 |             cerr << "frontTrimmed2: " << frontTrimmed1 << endl;
 31 |             cerr << "frontTrimmed2: " << frontTrimmed2 << endl;
 32 |             cerr << "overlap:" << ov.offset << "," << ov.overlap_len << ", " << ov.diff << endl;
 33 |             r1->print();
 34 |             r2->reverseComplement()->print();
 35 |             cerr <<endl;
 36 |         }
 37 |         r1->mSeq.mStr = r1->mSeq.mStr.substr(0, len1);
 38 |         r1->mQuality = r1->mQuality.substr(0, len1);
 39 |         r2->mSeq.mStr = r2->mSeq.mStr.substr(0, len2);
 40 |         r2->mQuality = r2->mQuality.substr(0, len2);
 41 | 
 42 |         fr->addAdapterTrimmed(adapter1, adapter2);
 43 |         return true;
 44 |     }
 45 |     return false;
 46 | }
 47 | 
 48 | bool AdapterTrimmer::trimByMultiSequences(Read* r, FilterResult* fr, vector<string>& adapterList, bool isR2, bool incTrimmedCounter) {
 49 |     int matchReq = 4;
 50 |     if(adapterList.size() > 16)
 51 |         matchReq = 5;
 52 |     if(adapterList.size() > 256)
 53 |         matchReq = 6;
 54 |     bool trimmed = false;
 55 | 
 56 |     string originalSeq = r->mSeq.mStr;
 57 |     for(int i=0; i<adapterList.size(); i++) {
 58 |         trimmed |= trimBySequence(r, NULL, adapterList[i], isR2, matchReq);
 59 |     }
 60 | 
 61 |     if(trimmed) {
 62 |         string adapter = originalSeq.substr(r->length(), originalSeq.length() - r->length());
 63 |         if(fr)
 64 |             fr->addAdapterTrimmed(adapter, isR2, incTrimmedCounter);
 65 |         else
 66 |             cerr << adapter << endl;
 67 |     }
 68 | 
 69 |     return trimmed;
 70 | }
 71 | 
 72 | bool AdapterTrimmer::trimBySequence(Read* r, FilterResult* fr, string& adapterseq, bool isR2, int matchReq) {
 73 |     const int allowOneMismatchForEach = 8;
 74 | 
 75 |     int rlen = r->length();
 76 |     int alen = adapterseq.length();
 77 | 
 78 |     const char* adata = adapterseq.c_str();
 79 |     const char* rdata = r->mSeq.mStr.c_str();
 80 | 
 81 |     if(alen < matchReq)
 82 |         return false;
 83 | 
 84 |     int pos=0;
 85 |     bool found = false;
 86 |     int start = 0;
 87 |     if(alen >= 16)
 88 |         start = -4;
 89 |     else if(alen >= 12)
 90 |         start = -3;
 91 |     else if(alen >= 8)
 92 |         start = -2;
 93 |     // we start from negative numbers since the Illumina adapter dimer usually have the first A skipped as A-tailing
 94 |     for(pos = start; pos<rlen-matchReq; pos++) {
 95 |         int cmplen = min(rlen - pos, alen);
 96 |         int allowedMismatch = cmplen/allowOneMismatchForEach;
 97 |         int mismatch = 0;
 98 |         bool matched = true;
 99 |         for(int i=max(0, -pos); i<cmplen; i++) {
100 |             if( adata[i] != rdata[i+pos] ){
101 |                 mismatch++;
102 |                 if(mismatch > allowedMismatch) {
103 |                     matched = false;
104 |                     break;
105 |                 }
106 |             }
107 |         }
108 |         if(matched) {
109 |             found = true;
110 |             break;
111 |         }
112 | 
113 |     }
114 | 
115 |     if(found) {
116 |         if(pos < 0) {
117 |             string adapter = adapterseq.substr(0, alen+pos);
118 |             r->mSeq.mStr.resize(0);
119 |             r->mQuality.resize(0);
120 |             if(fr) {
121 |                 fr->addAdapterTrimmed(adapter, isR2);
122 |             }
123 | 
124 |         } else {
125 |             string adapter = r->mSeq.mStr.substr(pos, rlen-pos);
126 |             r->mSeq.mStr = r->mSeq.mStr.substr(0, pos);
127 |             r->mQuality = r->mQuality.substr(0, pos);
128 |             if(fr) {
129 |                 fr->addAdapterTrimmed(adapter, isR2);
130 |             }
131 |         }
132 |         return true;
133 |     }
134 | 
135 |     return false;
136 | }
137 | 
138 | bool AdapterTrimmer::test() {
139 |     Read r("@name",
140 |         "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCCCGGGG",
141 |         "+",
142 |         "///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////E");
143 |     string adapter = "TTTTCCACGGGGATACTACTG";
144 |     bool trimmed = AdapterTrimmer::trimBySequence(&r, NULL, adapter);
145 |     if (r.mSeq.mStr != "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAA")
146 |         return false;
147 | 
148 |     Read read("@name",
149 |         "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCCCGGGGAAATTTCCCGGGAAATTTCCCGGGATCGATCGATCGATCGAATTCC",
150 |         "+",
151 |         "///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE");
152 |     vector<string> adapterList;
153 |     adapterList.push_back("GCTAGCTAGCTAGCTA");
154 |     adapterList.push_back("AAATTTCCCGGGAAATTTCCCGGG");
155 |     adapterList.push_back("ATCGATCGATCGATCG");
156 |     adapterList.push_back("AATTCCGGAATTCCGG");
157 |     trimmed = AdapterTrimmer::trimByMultiSequences(&read, NULL, adapterList);
158 |     if (read.mSeq.mStr != "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCCCGGGG") {
159 |         cerr << read.mSeq.mStr << endl;
160 |         return false;
161 |     }
162 | 
163 |     return true;
164 | }


--------------------------------------------------------------------------------
/src/adaptertrimmer.h:
--------------------------------------------------------------------------------
 1 | #ifndef ADAPTER_TRIMMER_H
 2 | #define ADAPTER_TRIMMER_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include "overlapanalysis.h"
 8 | #include "filterresult.h"
 9 | #include "options.h"
10 | 
11 | using namespace std;
12 | 
13 | class AdapterTrimmer{
14 | public:
15 |     AdapterTrimmer();
16 |     ~AdapterTrimmer();
17 | 
18 |     static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit);
19 |     static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov, int frontTrimmed1 = 0, int frontTrimmed2 = 0);
20 |     static bool trimBySequence(Read* r1, FilterResult* fr, string& adapter, bool isR2 = false, int matchReq = 4);
21 |     static bool trimByMultiSequences(Read* r1, FilterResult* fr, vector<string>& adapterList, bool isR2 = false, bool incTrimmedCounter = true);
22 |     static bool test();
23 | 
24 | 
25 | };
26 | 
27 | 
28 | #endif


--------------------------------------------------------------------------------
/src/basecorrector.cpp:
--------------------------------------------------------------------------------
  1 | #include "basecorrector.h"
  2 | #include "util.h"
  3 | 
  4 | BaseCorrector::BaseCorrector(){
  5 | }
  6 | 
  7 | 
  8 | BaseCorrector::~BaseCorrector(){
  9 | }
 10 | 
 11 | int BaseCorrector::correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit) {
 12 |     OverlapResult ov = OverlapAnalysis::analyze(r1, r2, diffLimit, overlapRequire, diffPercentLimit);
 13 |     return correctByOverlapAnalysis(r1, r2, fr, ov);
 14 | }
 15 | 
 16 | int BaseCorrector::correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov) {
 17 |     // we only correct overlap
 18 |     if(ov.diff == 0 || !ov.overlapped)
 19 |         return 0;
 20 | 
 21 |     int ol = ov.overlap_len;
 22 |     int start1 = max(0, ov.offset);
 23 |     int start2 = r2->length() -  max(0, -ov.offset) - 1;
 24 | 
 25 |     const char* seq1 = r1->mSeq.mStr.c_str();
 26 |     const char* seq2 = r2->mSeq.mStr.c_str();
 27 |     const char* qual1 = r1->mQuality.c_str();
 28 |     const char* qual2 = r2->mQuality.c_str();
 29 | 
 30 |     const char GOOD_QUAL = num2qual(30);
 31 |     const char BAD_QUAL = num2qual(14);
 32 | 
 33 |     int corrected = 0;
 34 |     int uncorrected = 0;
 35 |     bool r1Corrected = false;
 36 |     bool r2Corrected = false;
 37 |     for(int i=0; i<ol; i++) {
 38 |         int p1 = start1 + i;
 39 |         int p2 = start2 - i;
 40 | 
 41 |         if(seq1[p1] != complement(seq2[p2])) {
 42 |             if(qual1[p1] >= GOOD_QUAL && qual2[p2] <= BAD_QUAL) {
 43 |                 // use R1
 44 |                 r2->mSeq.mStr[p2] = complement(seq1[p1]);
 45 |                 r2->mQuality[p2] = qual1[p1];
 46 |                 corrected++;
 47 |                 r2Corrected = true;
 48 |                 if(fr) {
 49 |                     fr->addCorrection(seq2[p2], complement(seq1[p1]));
 50 |                 }
 51 |             } else if(qual2[p2] >= GOOD_QUAL && qual1[p1] <= BAD_QUAL) {
 52 |                 // use R2
 53 |                 r1->mSeq.mStr[p1] = complement(seq2[p2]);
 54 |                 r1->mQuality[p1] = qual2[p2];
 55 |                 corrected++;
 56 |                 r1Corrected = true;
 57 |                 if(fr) {
 58 |                     fr->addCorrection(seq1[p1], complement(seq2[p2]));
 59 |                 }
 60 |             } else {
 61 |                 uncorrected++;
 62 |             }
 63 |         }
 64 |     }
 65 | 
 66 |     // should never happen
 67 |     if(uncorrected + corrected != ov.diff) {
 68 |         static bool warned = false;
 69 |         if(!warned){
 70 |             cerr << "WARNING: the algorithm is wrong! uncorrected + corrected != ov.diff" << endl;
 71 |             warned = true;
 72 |         }
 73 |     }
 74 | 
 75 |     if(corrected > 0 && fr) {
 76 |         if(r1Corrected && r2Corrected)
 77 |             fr->incCorrectedReads(2);
 78 |         else
 79 |             fr->incCorrectedReads(1);
 80 |     }
 81 | 
 82 |     return corrected;
 83 | }
 84 | 
 85 | bool BaseCorrector::test() {
 86 |     Read r1("@name",
 87 |         "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCACGGGG",
 88 |         "+",
 89 |         "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEE");
 90 |     Read r2("@name",
 91 |         "AAAAAAAAAACCCCGGGGAAAATTTTAAAATTGGGGGGGGGGTGGGGGGGGGGGGG",
 92 |         "+",
 93 |         "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEE");
 94 | 
 95 |     correctByOverlapAnalysis(&r1, &r2, NULL, 5, 30, 0.2);
 96 | 
 97 |     if(r1.mSeq.mStr != "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCCCGGGG")
 98 |         return false;
 99 |     if(r2.mSeq.mStr != "AAAAAAAAAACCCCGGGGAAAATTTTAAAATTGGGGGGGGGGGGGGGGGGGGGGGG")
100 |         return false;
101 |     if(r1.mQuality != "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE")
102 |         return false;
103 |     if(r2.mQuality != "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE")
104 |         return false;
105 | 
106 |     return true;
107 | }


--------------------------------------------------------------------------------
/src/basecorrector.h:
--------------------------------------------------------------------------------
 1 | #ifndef BASE_CORRECTOR_H
 2 | #define BASE_CORRECTOR_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include "overlapanalysis.h"
 8 | #include "filterresult.h"
 9 | #include "options.h"
10 | 
11 | using namespace std;
12 | 
13 | class BaseCorrector{
14 | public:
15 |     BaseCorrector();
16 |     ~BaseCorrector();
17 | 
18 |     static int correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit);
19 |     static int correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov);
20 |     static bool test();
21 | };
22 | 
23 | 
24 | #endif


--------------------------------------------------------------------------------
/src/common.h:
--------------------------------------------------------------------------------
 1 | #ifndef COMMON_H
 2 | #define COMMON_H
 3 | 
 4 | #define FASTV_VER "0.10.0"
 5 | 
 6 | #define _DEBUG false
 7 | 
 8 | typedef long int64;
 9 | typedef unsigned long uint64;
10 | 
11 | typedef int int32;
12 | typedef unsigned int uint32;
13 | 
14 | typedef short int16;
15 | typedef unsigned short uint16;
16 | 
17 | typedef char int8;
18 | typedef unsigned char uint8;
19 | 
20 | const char ATCG_BASES[] = {'A', 'T', 'C', 'G'};
21 | 
22 | #pragma pack(2) 
23 | 
24 | 
25 | #pragma pack() 
26 | 
// Upper bound of the queue that stores the packs;
// errors may happen if more packs than this are generated.
static constexpr int PACK_NUM_LIMIT  = 10000000;

// Number of reads in one pack.
static constexpr int PACK_SIZE = 1000;

// A pack that has been produced but not yet consumed stays in memory.
// This caps the number of such packs; when the cap is reached the producer
// thread should sleep.
static constexpr int PACK_IN_MEM_LIMIT = 500;

// Warn when the read number exceeds this value.
static constexpr int WARN_STANDALONE_READ_LIMIT = 10000;

// Filtering results; a bigger number means a worse result.
// If r1 and r2 both fail, the bigger of the two results is recorded.
// Gaps are reserved so further types can be added later.
static constexpr int PASS_FILTER = 0;
static constexpr int FAIL_POLY_X = 4;
static constexpr int FAIL_OVERLAP = 8;
static constexpr int FAIL_N_BASE = 12;
static constexpr int FAIL_LENGTH = 16;
static constexpr int FAIL_TOO_LONG = 17;
static constexpr int FAIL_QUALITY = 20;
static constexpr int FAIL_COMPLEXITY = 24;

// Total number of result slots supported.
static constexpr int FILTER_RESULT_TYPES = 32;

// Display name for every result slot, indexed by the constants above;
// unused slots hold an empty string.
static const char* FAILED_TYPES[FILTER_RESULT_TYPES] = {
	/*  0 -  3 */ "passed", "", "", "",
	/*  4 -  7 */ "failed_polyx_filter", "", "", "",
	/*  8 - 11 */ "failed_bad_overlap", "", "", "",
	/* 12 - 15 */ "failed_too_many_n_bases", "", "", "",
	/* 16 - 19 */ "failed_too_short", "failed_too_long", "", "",
	/* 20 - 23 */ "failed_quality_filter", "", "", "",
	/* 24 - 27 */ "failed_low_complexity", "", "", "",
	/* 28 - 31 */ "", "", "", ""
};
67 | 
68 | 
69 | #endif /* COMMON_H */
70 | 


--------------------------------------------------------------------------------
/src/coverage.js:
--------------------------------------------------------------------------------
  1 | var maxSize = 0;
  2 | var maxCoverage = 0.0;
  3 | var mapMargin = 30;
  4 | var mismatchRatioThreshold = 0.05;
  5 | var sorting_by_coverage_rate = 1;
  6 | 
  7 | function switch_sort() {
  8 |     if(sorting_by_coverage_rate == 1) {
  9 |         sorting_by_coverage_rate = 0;
 10 |         genome_coverage_data.sort(sortByBases);
 11 |         drawCoverages('genome_coverage', genome_coverage_data, genome_sizes, stats_bin);
 12 |         document.getElementById("sort_by_div").innerHTML = "Order by: <a href='javascript:switch_sort();'>Coverage rate</a> | <font color='#FF6600'>Bases on target</font>";
 13 |     } else {
 14 |         sorting_by_coverage_rate = 1;
 15 |         genome_coverage_data.sort(sortByCoverageRate);
 16 |         drawCoverages('genome_coverage', genome_coverage_data, genome_sizes, stats_bin);
 17 |         document.getElementById("sort_by_div").innerHTML = "Order by: <font color='#FF6600'>Coverage rate</font> | <a href='javascript:switch_sort();'>Bases on target</a>";
 18 |     }
 19 | }
 20 | 
 21 | function sortByCoverageRate(a, b) {
 22 |     var r = b["coverage_rate"] -  a["coverage_rate"];
 23 |     if(r == 0)
 24 |         return b["bases"] -  a["bases"];
 25 |     else
 26 |         return r;
 27 | }
 28 | 
 29 | function sortByBases(a, b) {
 30 |     var r= b["bases"] -  a["bases"];
 31 |     if(r == 0)
 32 |         return b["coverage_rate"] -  a["coverage_rate"];
 33 |     else
 34 |         return r;
 35 | }
 36 | 
 37 | function drawCoverages(divid, data, sizes, bin) {
 38 |     mapcontainer = document.getElementById(divid);
 39 |     var hasData = false;
 40 |     for(d in data) {
 41 |         hasData = true;
 42 |         if(sizes[d] > maxSize)
 43 |             maxSize = sizes[d];
 44 | 
 45 |         for(c in data[d]["coverage"]) {
 46 |             if(data[d]["coverage"][c] > maxCoverage)
 47 |                 maxCoverage = data[d]["coverage"][c];
 48 |         }
 49 |     }
 50 |     if(hasData) {
 51 |         mapcontainer.style.display = 'block';
 52 |     } else {
 53 |         mapcontainer.style.display = 'none';
 54 |     }
 55 |     var childs = mapcontainer.childNodes;
 56 |     for(var i = 0; i < childs.length; i++) {
 57 |       mapcontainer.removeChild(childs[i]);
 58 |     }
 59 |     var colorTableHTML = "<div id='colortable' style='text-align:center;padding:5px;font-size:14px;font-family:Arial;'>";
 60 |     colorTableHTML += "<center><table style='border:0px;'> <tr>  <td style='width:200px; border:0px;text-align:right;'> Mismatch ratio = 0 </td>";
 61 |     var count = 0;
 62 |     var tds = 30;
 63 |     while (count < tds) {
 64 |         var mr = mismatchRatioThreshold * count/tds ;
 65 |         count++;
 66 |         var c = getColor(mr);
 67 |         colorTableHTML += "<td style='background:" + c + "; width=10px;' title = '" + mr + "'></td>  "; 
 68 |     }
 69 |     colorTableHTML += "<td style='width:200px;border:0px;'> Mismatch ratio = " + mismatchRatioThreshold + " </td>";
 70 |     colorTableHTML += "</tr></table></center></div>";
 71 |     mapcontainer.innerHTML = colorTableHTML;
 72 | 
 73 |     for(d in data) {
 74 |         var genome = data[d];
 75 |         cvs = document.createElement("canvas");
 76 |         cvs.id = 'coverage_' + d.toString();
 77 |         cvs.width=mapcontainer.offsetWidth - 10;
 78 |         cvs.height=60;
 79 |         cvs.style.padding='2px 0px 2px 0px';
 80 |         cvs.onmousemove = onCanvasMove;
 81 |         cvs.onmouseover = onCanvasIn;
 82 |         cvs.onmouseout = onCanvasOut;
 83 |         mapcontainer.appendChild(cvs);
 84 | 
 85 |         drawGenome(genome, cvs.id, sizes[d], bin);
 86 | 
 87 |         var namediv = document.createElement("div"); 
 88 |         namediv.innerHTML = "<div style='text-align:center;padding:2px;font-size:10px;color:#339967;'> " + data[d]['name'] + " (" + data[d]['coverage_rate'] + "% covered, " + data[d]['reads'] + " reads, " + data[d]['bases'] + " bases)</div>" ;; 
 89 |         mapcontainer.appendChild(namediv); 
 90 |     }
 91 | }
 92 | 
 93 | function onCanvasMove(e) {
 94 |     var cvs = e.target;
 95 |     var id = parseInt(cvs.id.substring(9));
 96 |     console.log(cvs.id.substring(9));
 97 |     console.log(id);
 98 |     if(!genome_coverage_data[id])
 99 |         return;
100 | 
101 |     genome = genome_coverage_data[id];
102 |     var mapw = cvs.width;
103 |     var maph = cvs.height;
104 |     var x = e.clientX - cvs.offsetLeft;
105 |     var pos = (x - mapMargin) * maxSize / (mapw - mapMargin*2);
106 |     pos = Math.round(pos / stats_bin);
107 | 
108 |     console.log(pos);
109 | 
110 |     if(!genome["coverage"][pos])
111 |         return;
112 | 
113 |     var start = pos * stats_bin;
114 |     var end = (pos+1) * stats_bin - 1;
115 |     var html = start + "-" + end + "<br>";
116 |     html += "mean coverage: " + genome["coverage"][pos] + "<br>";
117 |     html += "mean mismatch ratio: " + genome["mismatch_ratios"][pos] + "<br>";
118 | 
119 |     var tips = document.getElementById('maptips');
120 |     tips.style.position="absolute";
121 |     tips.style.left = e.clientX + 5 + tips.parentNode.scrollLeft;
122 |     tips.style.top = e.clientY + 5 + tips.parentNode.scrollTop + document.body.scrollTop;
123 |     tips.innerHTML = html;
124 | 
125 |     console.log(html);
126 | }
127 | 
// Mouse-over handler of a coverage canvas: makes the 'maptips' element visible.
function onCanvasIn() {
    document.getElementById('maptips').style.display = 'block';
}
132 | 
// Mouse-out handler of a coverage canvas: hides the 'maptips' element.
function onCanvasOut() {
    document.getElementById('maptips').style.display = 'none';
}
137 | 
// Draws one genome's coverage histogram onto the canvas with id `canvasid`.
//
//   genome    record providing "coverage" and "mismatch_ratios" per bin
//   canvasid  id of the target canvas (must already be in the document)
//   size      size of this genome, used to clamp the x-axis labels
//   bin       bin width of the coverage data
//
// The x axis is scaled by the global maxSize and bar heights by the global
// maxCoverage, so all genomes share one scale. Each bar is coloured by the
// bin's mismatch ratio via getColor().
function drawGenome(genome, canvasid, size, bin) {
    var cvs = document.getElementById(canvasid);
    var mapw = cvs.width;
    var maph = cvs.height;
    var ctx = cvs.getContext("2d");
    var texth = 15;   // strip at the bottom reserved for the axis labels

    var coverage = genome['coverage'];
    var mismatch_ratios = genome['mismatch_ratios'];

    // geometry shared by every bar
    var plotW = mapw - mapMargin*2;
    var barW = plotW * bin / maxSize;
    var baseline = maph - texth;

    for(var pos in coverage) {
        var barH = (maph-texth) * (coverage[pos]/maxCoverage);
        var centerX = mapMargin + (pos-0.5) * plotW * bin/maxSize;
        ctx.fillStyle = getColor(mismatch_ratios[pos]);
        // a negative height makes the bar grow upwards from the baseline
        ctx.fillRect(centerX - 1, baseline, barW, -barH);
    }

    // eleven x-axis labels at 0%, 10%, ... 100% of maxSize; labels beyond this
    // genome's size are replaced by a single label at `size`
    var tailPainted = false;
    for(var step = 0; step <= 10; step++) {
        var tick = Math.round( maxSize * step/10 );
        if(tick > size) {
            if(tailPainted)
                continue;
            tailPainted = true;
            tick = size;
        }
        var tickX = Math.round(mapMargin + plotW * tick/maxSize);
        ctx.font = "10px Arial";
        ctx.fillStyle = "#999999";
        ctx.fillText(tick.toString(), tickX, maph);
    }

    // y-axis scale: the coverage value that corresponds to a full-height bar
    ctx.font = "10px Arial";
    ctx.fillStyle = "#AAAAAA";
    ctx.fillText(maxCoverage.toString() + "", 10, 10);
}
188 | 
// Maps a mismatch ratio to a CSS rgb() colour: grey for ratios above
// mismatchRatioThreshold, otherwise a ramp from grey (ratio 0) towards red
// (ratio equal to the threshold).
function getColor(mr) {
    if(!(mr > mismatchRatioThreshold)) {
        var shift = 120 * (mr / mismatchRatioThreshold);
        var red = 128 + shift;
        var other = 128 - shift;
        return "rgb(" + red + "," + other + "," + other + ")";
    }
    return "rgb(128, 128, 128)";
}


--------------------------------------------------------------------------------
/src/duplicate.cpp:
--------------------------------------------------------------------------------
  1 | #include "duplicate.h"
  2 | #include "overlapanalysis.h"
  3 | #include <memory.h>
  4 | #include <math.h>
  5 | 
  6 | Duplicate::Duplicate(Options* opt) {
  7 |     mOptions = opt;
  8 |     mKeyLenInBase = mOptions->duplicate.keylen;
  9 |     mKeyLenInBit = 1<<(2*mKeyLenInBase);
 10 |     mDups = new uint64[mKeyLenInBit];
 11 |     memset(mDups, 0, sizeof(uint64)*mKeyLenInBit);
 12 |     mCounts = new uint16[mKeyLenInBit];
 13 |     memset(mCounts, 0, sizeof(uint16)*mKeyLenInBit);
 14 |     mGC = new uint8[mKeyLenInBit];
 15 |     memset(mGC, 0, sizeof(uint8)*mKeyLenInBit);
 16 | }
 17 | 
 18 | Duplicate::~Duplicate(){
 19 |     delete[] mDups;
 20 |     delete[] mCounts;
 21 | }
 22 | 
 23 | uint64 Duplicate::seq2int(const char* data, int start, int keylen, bool& valid) {
 24 |     uint64 ret = 0;
 25 |     for(int i=0; i<keylen; i++) {
 26 |         switch(data[start + i]) {
 27 |             case 'A':
 28 |                 ret += 0;
 29 |                 break;
 30 |             case 'T':
 31 |                 ret += 1;
 32 |                 break;
 33 |             case 'C':
 34 |                 ret += 2;
 35 |                 break;
 36 |             case 'G':
 37 |                 ret += 3;
 38 |                 break;
 39 |             default:
 40 |                 valid = false;
 41 |                 return 0;
 42 |         }
 43 |         // if it's not the last one, shift it by 2 bits
 44 |         if(i != keylen-1)
 45 |             ret <<= 2;
 46 |     }
 47 |     return ret;
 48 | }
 49 | 
 50 | void Duplicate::addRecord(uint32 key, uint64 kmer32, uint8 gc) {
 51 |     if(mCounts[key] == 0) {
 52 |         mCounts[key] = 1;
 53 |         mDups[key] = kmer32;
 54 |         mGC[key] = gc;
 55 |     } else {
 56 |         if(mDups[key] == kmer32)
 57 |             mCounts[key]++;
 58 |         else if(mDups[key] > kmer32) {
 59 |             mDups[key] = kmer32;
 60 |             mCounts[key] = 1;
 61 |             mGC[key] = gc;
 62 |         }
 63 |     }
 64 | }
 65 | 
 66 | void Duplicate::statRead(Read* r) {
 67 |     if(r->length() < 32)
 68 |         return;
 69 | 
 70 |     int start1 = 0;
 71 |     int start2 = max(0, r->length() - 32 - 5);
 72 | 
 73 |     const char* data = r->mSeq.mStr.c_str();
 74 |     bool valid = true;
 75 | 
 76 |     uint64 ret = seq2int(data, start1, mKeyLenInBase, valid);
 77 |     uint32 key = (uint32)ret;
 78 |     if(!valid)
 79 |         return;
 80 | 
 81 |     uint64 kmer32 = seq2int(data, start2, 32, valid);
 82 |     if(!valid)
 83 |         return;
 84 | 
 85 |     int gc = 0;
 86 | 
 87 |     // not calculated
 88 |     if(mCounts[key] == 0) {
 89 |         for(int i=0; i<r->length(); i++) {
 90 |             if(data[i] == 'C' || data[i] == 'T')
 91 |                 gc++;
 92 |         }
 93 |     }
 94 | 
 95 |     gc = round(255.0 * (double) gc / (double) r->length());
 96 | 
 97 |     addRecord(key, kmer32, (uint8)gc);
 98 | }
 99 | 
100 | void Duplicate::statPair(Read* r1, Read* r2) {
101 |     if(r1->length() < 32 || r2->length() < 32)
102 |         return;
103 | 
104 |     const char* data1 = r1->mSeq.mStr.c_str();
105 |     const char* data2 = r2->mSeq.mStr.c_str();
106 |     bool valid = true;
107 | 
108 |     uint64 ret = seq2int(data1, 0, mKeyLenInBase, valid);
109 |     uint32 key = (uint32)ret;
110 |     if(!valid)
111 |         return;
112 | 
113 |     uint64 kmer32 = seq2int(data2, 0, 32, valid);
114 |     if(!valid)
115 |         return;
116 | 
117 |     int gc = 0;
118 | 
119 |     // not calculated
120 |     if(mCounts[key] == 0) {
121 |         for(int i=0; i<r1->length(); i++) {
122 |             if(data1[i] == 'G' || data1[i] == 'C')
123 |                 gc++;
124 |         }
125 |         for(int i=0; i<r2->length(); i++) {
126 |             if(data2[i] == 'G' || data2[i] == 'C')
127 |                 gc++;
128 |         }
129 |     }
130 | 
131 |     gc = round(255.0 * (double) gc / (double)( r1->length() + r2->length()));
132 | 
133 |     addRecord(key, kmer32, gc);
134 | }
135 | 
136 | double Duplicate::statAll(int* hist, double* meanGC, int histSize) {
137 |     long totalNum = 0;
138 |     long dupNum = 0;
139 |     int* gcStatNum = new int[histSize];
140 |     memset(gcStatNum, 0, sizeof(int)*histSize);
141 |     for(int key=0; key<mKeyLenInBit; key++) {
142 |         int count = mCounts[key];
143 |         double gc = mGC[key];
144 | 
145 |         if(count > 0) {
146 |             totalNum += count;
147 |             dupNum += count - 1;
148 | 
149 |             if(count >= histSize){
150 |                 hist[histSize-1]++;
151 |                 meanGC[histSize-1] += gc;
152 |                 gcStatNum[histSize-1]++;
153 |             }
154 |             else{
155 |                 hist[count]++;
156 |                 meanGC[count] += gc;
157 |                 gcStatNum[count]++;
158 |             }
159 |         }
160 |     }
161 | 
162 |     for(int i=0; i<histSize; i++) {
163 |         if(gcStatNum[i] > 0) {
164 |             meanGC[i] = meanGC[i] / 255.0 / gcStatNum[i];
165 |         }
166 |     }
167 | 
168 |     delete[] gcStatNum;
169 | 
170 |     if(totalNum == 0)
171 |         return 0.0;
172 |     else
173 |         return (double)dupNum / (double)totalNum;
174 | }


--------------------------------------------------------------------------------
/src/duplicate.h:
--------------------------------------------------------------------------------
 1 | #ifndef DUPLICATE_H
 2 | #define DUPLICATE_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include "read.h"
 8 | #include "options.h"
 9 | #include "common.h"
10 | 
11 | using namespace std;
12 | 
// Estimates the duplication level of the input by hashing a prefix of each
// read into a slot and tracking one representative 32-mer per slot.
class Duplicate{
public:
    Duplicate(Options* opt);
    ~Duplicate();

    // account for a single-end read
    void statRead(Read* r1);
    // account for a read pair
    void statPair(Read* r1, Read* r2);
    // pack keylen bases of data, starting at start, into an integer;
    // valid reports whether the conversion succeeded
    uint64 seq2int(const char* data, int start, int keylen, bool& valid);
    // store or count one (key, kmer32) observation together with its GC value
    void addRecord(uint32 key, uint64 kmer32, uint8 gc);

    // make histogram and get duplication rate
    double statAll(int* hist, double* meanGC, int histSize);

private:
    Options* mOptions;
    // number of leading bases used to build the slot key
    int mKeyLenInBase;
    // upper bound used when iterating over all slots in statAll()
    int mKeyLenInBit;
    // per slot: representative 32-mer
    uint64* mDups;
    // per slot: how often the representative was seen (0 = empty slot)
    uint16* mCounts;
    // per slot: GC value of the representative, scaled to 0..255
    uint8* mGC;
    
};
35 | 
36 | #endif


--------------------------------------------------------------------------------
/src/editdistance.cpp:
--------------------------------------------------------------------------------
  1 | // -------
  2 | // License
  3 | // -------
  4 | //
  5 | // It is released under the MIT license.
  6 | //
  7 | //     Copyright (c) 2013 Hiroyuki Tanaka
  8 | //
  9 | //     Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 10 | //
 11 | //     The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 12 | //
 13 | //     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 14 | 
 15 | 
 16 | #include <stdint.h>
 17 | #include <stdio.h>
 18 | #include <cstdlib>
 19 | #include <cstring>
 20 | #include <string>
 21 | #include <map>
 22 | #include <vector>
 23 | #include <iostream>
 24 | #include <bitset>
 25 | #include <time.h>
 26 | 
 27 | #include "editdistance.h"
 28 | 
 29 | using namespace std;
 30 | 
// Bit-parallel edit distance core working on blocks of 64-bit words.
// NOTE(review): this looks like the Myers/Hyyro bit-vector algorithm extended
// to multiple words - confirm against the original source of this file.
//
//   cmap    maps a character to its per-word match bit masks (indexable by word)
//   vec     the text that is scanned character by character
//   vecsize number of characters in vec
//   tmax    index of the last 64-bit word of the pattern
//   tlen    number of pattern positions used in that last word
//
// Returns the running score D after the whole text has been consumed; D starts
// at tmax * 64 + tlen and is adjusted by +1 / -1 per text character.
template<typename T, typename TVALUE>
unsigned int edit_distance_bpv(T &cmap, char const *vec, size_t const &vecsize, unsigned int const &tmax, unsigned int const &tlen) {
    int D = tmax * 64 + tlen;
    TVALUE D0, HP, HN, VP, VN;
    uint64_t top = (1L << (tlen - 1));  // applied to the last word of the vector
    uint64_t lmb = (1L << 63);          // highest bit of a word, used to carry into the next word

    // clear the vertical delta vectors ...
    for(size_t i = 0; i <= tmax; ++i) {
        VP[i] = 0;
        VN[i] = 0;
    }
    // ... then set every pattern position in VP: full words first, tlen bits in the last word
    for(size_t i = 0; i < tmax; ++i) VP[i] = ~0;
    for(size_t i = 0; i < tlen; ++i) VP[tmax] |= (1L << i);
    for(size_t i = 0; i < vecsize; ++i) {
        // match masks for the current text character (operator[] on cmap)
        TVALUE &PM = cmap[vec[i]];
        for(int r = 0; r <= tmax; ++r) {
            uint64_t X = PM[r];
            // carry the top HN bit of the previous word into bit 0 of this word
            if(r > 0 && (HN[r - 1] & lmb)) X |= 1L;
            D0[r] = (((X & VP[r]) + VP[r]) ^ VP[r]) | X | VN[r];
            HP[r] = VN[r] | ~(D0[r] | VP[r]);
            HN[r] = D0[r] & VP[r];
            X = (HP[r] << 1L);
            // bit 0 is set for the first word, or carried from the previous word's HP
            if(r == 0 || HP[r - 1] & lmb) X |= 1L;
            VP[r] = (HN[r] << 1L) | ~(D0[r] | X);
            if(r > 0 && (HN[r - 1] & lmb)) VP[r] |= 1L;
            VN[r] = D0[r] & X;
        }
        // the bit at the last pattern position decides how the score moves
        if(HP[tmax] & top) ++D;
        else if(HN[tmax] & top) --D;
    }
    return D;
}
 63 | 
 64 | 
 65 | /// c.f. http://handasse.blogspot.com/2009/04/c_29.html
 66 | template<typename T>
 67 | unsigned int edit_distance_dp(T const *str1, size_t const size1, T const *str2, size_t const size2) {
 68 |     vector< vector<uint32_t> > d(size1 + 1, vector<uint32_t>(size2 + 1));
 69 |     for (int i = 0; i < size1 + 1; i++) d[i][0] = i;
 70 |     for (int i = 0; i < size2 + 1; i++) d[0][i] = i;
 71 |     for (int i = 1; i < size1 + 1; i++) {
 72 |         for (int j = 1; j < size2 + 1; j++) {
 73 |             d[i][j] = min(min(d[i-1][j], d[i][j-1]) + 1, d[i-1][j-1] + (str1[i-1] == str2[j-1] ? 0 : 1));
 74 |         }
 75 |     }
 76 |     return d[size1][size2];
 77 | }
 78 | 
 79 | template <size_t N>
 80 | struct varr {
 81 |     uint64_t arr_[N];
 82 |     uint64_t & operator[](size_t const &i) {
 83 |         return arr_[i];
 84 |     }
 85 | };
 86 | 
 87 | 
 88 | template<size_t N>
 89 | unsigned int edit_distance_map_(char const *a, size_t const asize, char const *b, size_t const bsize) {
 90 |     typedef map<char, varr<N> > cmap_v;
 91 |     cmap_v cmap;
 92 |     unsigned int tmax = (asize - 1) >> 6;
 93 |     unsigned int tlen = asize - tmax * 64;
 94 |     for(size_t i = 0; i < tmax; ++i) {
 95 |         for(size_t j = 0; j < 64; ++j) cmap[a[i * 64 + j]][i] |= (1L << j);
 96 |     }
 97 |     for(size_t i = 0; i < tlen; ++i) cmap[a[tmax * 64 + i]][tmax] |= (1L << i);
 98 |     return edit_distance_bpv<cmap_v, typename cmap_v::mapped_type>(cmap, b, bsize, tmax, tlen);
 99 | }
100 | 
101 | unsigned int edit_distance(const char *a, const unsigned int asize, const char *b, const unsigned int bsize) {
102 |     if(asize == 0) return bsize;
103 |     else if(bsize == 0) return asize;
104 |     char const *ap, *bp;
105 |     unsigned int const *asizep, *bsizep;
106 |     if(asize < bsize) ap = b, bp = a, asizep = &bsize, bsizep = &asize;
107 |     else ap = a, bp = b, asizep = &asize, bsizep = &bsize;
108 |     size_t vsize = ((*asizep - 1) >> 6) + 1;
109 |     if(vsize > 10) {
110 |         char const *_ = ap;
111 |         unsigned int const *__ = asizep;
112 |         ap = bp, bp = _, asizep = bsizep, bsizep = __;
113 |         vsize = ((*asizep - 1) >> 6) + 1;
114 |     }
115 | 
116 |     if(vsize == 1) return edit_distance_map_<1>(ap, *asizep, bp, *bsizep);
117 |     else if(vsize == 2) return edit_distance_map_<2>(ap, *asizep, bp, *bsizep);
118 |     else if(vsize == 3) return edit_distance_map_<3>(ap, *asizep, bp, *bsizep);
119 |     else if(vsize == 4) return edit_distance_map_<4>(ap, *asizep, bp, *bsizep);
120 |     else if(vsize == 5) return edit_distance_map_<5>(ap, *asizep, bp, *bsizep);
121 |     else if(vsize == 6) return edit_distance_map_<6>(ap, *asizep, bp, *bsizep);
122 |     else if(vsize == 7) return edit_distance_map_<7>(ap, *asizep, bp, *bsizep);
123 |     else if(vsize == 8) return edit_distance_map_<8>(ap, *asizep, bp, *bsizep);
124 |     else if(vsize == 9) return edit_distance_map_<9>(ap, *asizep, bp, *bsizep);
125 |     else if(vsize == 10) return edit_distance_map_<10>(ap, *asizep, bp, *bsizep);
126 |     return edit_distance_dp<char>(ap, *asizep, bp, *bsizep); 
127 | }
128 | 
129 | unsigned int edit_distance(string a, string b) {
130 |     return edit_distance(a.c_str(), a.length(), b.c_str(), b.length());
131 | }
132 | 
// Number of mismatching positions within the common prefix length of a and b.
// Positions beyond the shorter input are not counted.
unsigned int hamming_distance(const char *a, const unsigned int asize, const char *b, const unsigned int bsize) {
    const unsigned int overlap = (bsize < asize) ? bsize : asize;
    unsigned int mismatches = 0;
    for(unsigned int pos = 0; pos < overlap; ++pos) {
        if(a[pos] != b[pos])
            ++mismatches;
    }
    return mismatches;
}
141 | 
142 | bool editdistance_test(){
143 |     const char* str1[3] = {
144 |         "CCTATCAGGGAGCTGTGGGCCAGCCAGGAGGCAGCACATGCCCAATCCCAGGCCCCTCCCGTTGTAAGTTCCCGTTCTACCCGACAGGGACCTGCTGACAAAAGACAGGGCTGGAGAGCCAGCCTGAAGGCCCTGGGACCCTTCTATCCAC",
145 |         "ACTTATGTTTTTAAATGAGGATTATTGATAGTACTCTTGGTTTTTATACCATTCAGATCACTGAATTTATAAAGTACCCATCTAGTACTTCAAAAAGTAAAGTGTTCTGCCAGATCTTAGGTATAGAGGACCCTAACACAGTAAGATCGGA",
146 |         "TAGGGGTATGAGTAGAGCTGAGCTGGGGGAAAAGAGGGAAATTCCCAGGGGTGGAGGAAGAGTCAAGTCCCCCTCTACACCTAGAGGATGAACTTAAGGAAGGAGTGAAGGTCATATGTGTTGTTCCTGAGGAAAAGGCCGCTGTAGAAAA",
147 |         };
148 |     const char* str2[3] = {
149 |         "CCTATCAGGGAGCTGTGGGCCAGCCAGGAGGCAGCACATGCCCAATCCCAGGCCCCTCCCGTTGTAAGTTCCCGTTCTACCCGACAGGGACCTGCTGACAAAAGACAGGGCTGGAGAGCCAGCCTGAAGGCCCTGGGACCCTTCTATCCAC",
150 |         "ACTTATGTTTTTAAATGAGGATTATTGATAGTACTCTTGGTTTTTATACCATTCAGATCACTGAATTTATAAAGTACCCATCTAGTACTTGAAAAAGTAAAGTGTTCTGCCAGATCTTAGGTATAGAGGACCCTAACACAGTAAGATCGGA",
151 |         "CCTGGGCCTGGCCCTTGTCTAAAACTGACTCTTTTGAGGGTGATTTTGGATGTTCTTAGTAGAGTCTCTCACCTGTACTTTCCTTGCCTAAGGTGCTGTCTTCTCTTGCAGGTTGCCTACACGTTCCTCACATGCCCTAAGAACCATGGGA",
152 |         };
153 |     int result[3] = {
154 |         0,
155 |         1,
156 |         90,
157 |         };
158 | 
159 |     for(int i=0;i<3;i++){
160 |         int ret  = 0;
161 |         clock_t t1 = clock();
162 |         for (int p=0;p<100000;p++){
163 |             ret = edit_distance(str1[i], strlen(str1[i]), str2[i], strlen(str2[i]));
164 |         }
165 |         clock_t t2 = clock();
166 |         printf("test 100000 edit_distance, takes %lu ms\n", (t2-t1)/1000);
167 |         if(ret != result[i]){
168 |             printf("Fail: (edit_distance), expect %d, but got %d: \n%s\n%s\n", result[i], ret, str1[i], str2[i]);
169 |             return false;
170 |         }
171 |     }
172 |     return true;
173 | }
174 | 


--------------------------------------------------------------------------------
/src/editdistance.h:
--------------------------------------------------------------------------------
 1 | #ifndef ___EDITDISTANCE__H__
 2 | #define ___EDITDISTANCE__H__
 3 | 
 4 | #include <stdint.h>
 5 | #include <string>
 6 | 
 7 | // struct PatternMap {
 8 | //     uint64_t p_[256][4];
 9 | //     unsigned int tmax_;
10 | //     unsigned int tlen_;
11 | // };
12 | 
13 | using namespace std;
14 | 
// Edit distance between the first asize characters of a and the first bsize
// characters of b.
unsigned int edit_distance(const char *a, const unsigned int asize, const char *b, const unsigned int bsize);
// void create_patternmap(struct PatternMap *pm, const int64_t *a, const unsigned int size);
// unsigned int edit_distance_by_patternmap(struct PatternMap *mp, const int64_t *b, const unsigned int size);

// Edit distance between two strings.
unsigned int edit_distance(string a, string b);

// Number of mismatching positions within the first min(asize, bsize) characters.
unsigned int hamming_distance(const char *a, const unsigned int asize, const char *b, const unsigned int bsize);

// Self-test for edit_distance(); returns true when all cases pass.
bool editdistance_test();
24 | 
25 | #endif
26 | 


--------------------------------------------------------------------------------
/src/evaluator.h:
--------------------------------------------------------------------------------
 1 | #ifndef EVALUATOR_H
 2 | #define EVALUATOR_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include "options.h"
 8 | #include "util.h"
 9 | #include "read.h"
10 | 
11 | using namespace std;
12 | 
// Inspects the input described by Options before processing starts.
// NOTE(review): only the declarations are visible here; the per-method notes
// below are inferred from names and signatures - confirm against evaluator.cpp.
class Evaluator{
public:
    Evaluator(Options* opt);
    ~Evaluator();
    // evaluate how many reads are stored in the input file
    void evaluateReadNum(long& readNum);
    // presumably the older adapter evaluation kept for reference ("Depreciated" in the name)
    string evalAdapterAndReadNumDepreciated(long& readNum);
    // presumably returns the detected adapter and reports the read count via readNum;
    // isR2 selects which input of a pair is evaluated - TODO confirm
    string evalAdapterAndReadNum(long& readNum, bool isR2);
    bool isTwoColorSystem();
    void evaluateSeqLen();
    int computeSeqLen(string filename);

    static bool test();
    static string matchKnownAdapter(string seq);
private:
    Options* mOptions;
    string int2seq(unsigned int val, int seqlen);
    int seq2int(string& seq, int pos, int seqlen, int lastVal = -1);
    string getAdapterWithSeed(int seed, Read** loadedReads, long records, int keylen);
};
33 | 
34 | 
35 | #endif


--------------------------------------------------------------------------------
/src/fastareader.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "fastareader.h"
  3 | #include "util.h"
  4 | #include <sstream>
  5 | #include <string.h>
  6 | 
  7 | FastaReader::FastaReader(string faFile, bool forceUpperCase)
  8 | {
  9 |     // Set locale and disable stdio synchronization to improve iostream performance
 10 |     // http://www.drdobbs.com/the-standard-librarian-iostreams-and-std/184401305
 11 |     // http://stackoverflow.com/questions/5166263/how-to-get-iostream-to-perform-better
 12 |     setlocale(LC_ALL,"C");
 13 |     ios_base::sync_with_stdio(false);
 14 | 
 15 |     mFilename = faFile;
 16 |     mForceUpperCase = forceUpperCase;
 17 | 
 18 |     if (ends_with(mFilename, ".fasta.gz") || ends_with(mFilename, ".fa.gz") || ends_with(mFilename, ".fna.gz")){
 19 |         mZipFile = gzopen(mFilename.c_str(), "r");
 20 |         mZipped = true;
 21 |     }
 22 |     else if(ends_with(mFilename, ".fasta") || ends_with(mFilename, ".fa") || ends_with(mFilename, ".fna")){
 23 |         mFile.open(mFilename.c_str(), ifstream::in);
 24 |         mZipped = false;
 25 |     } else {
 26 |         error_exit("FASTA file should have a name (*.fasta, *.fa or *.fna) or (*.fasta.gz, *.fa.gz or *.fna.gz). Not a FASTA file: " + mFilename);
 27 |     }
 28 | 
 29 |     char c;
 30 |     // seek to first contig
 31 |     while (getChar(c) && c != '>') {
 32 |         if (eof()) {
 33 |             break;
 34 |         }
 35 |     }
 36 | }
 37 | 
 38 | FastaReader::~FastaReader()
 39 | {
 40 |     if (mZipped){
 41 |         if (mZipFile){
 42 |             gzclose(mZipFile);
 43 |             mZipFile = NULL;
 44 |         }
 45 |     }
 46 |     else {
 47 |         if (mFile.is_open()){
 48 |             mFile.close();
 49 |         }
 50 |     }
 51 | }
 52 | 
 53 | bool FastaReader::getLine(char* line, int maxLine){
 54 |     bool status = true;
 55 |     if(mZipped)
 56 |         status = gzgets(mZipFile, line, maxLine);
 57 |     else {
 58 |         mFile.getline(line, maxLine, '\n');
 59 |         status = !mFile.fail();
 60 |     }
 61 | 
 62 |     // trim \n, \r or \r\n in the tail
 63 |     int readed = strlen(line);
 64 |     if(readed >=2 ){
 65 |         if(line[readed-1] == '\n' || line[readed-1] == '\r'){
 66 |             line[readed-1] = '\0';
 67 |             if(line[readed-2] == '\r')
 68 |                 line[readed-2] = '\0';
 69 |         }
 70 |     }
 71 | 
 72 |     return status;
 73 | }
 74 | 
 75 | bool FastaReader::eof() {
 76 |     if (mZipped) {
 77 |         return gzeof(mZipFile);
 78 |     } else {
 79 |         return mFile.eof();
 80 |     }
 81 | }
 82 | 
 83 | bool FastaReader::getChar(char& c) {
 84 |     bool status = true;
 85 |     if (mZipped) {
 86 |         c = (char)gzgetc(mZipFile);
 87 |         if(c == -1)
 88 |             status = false;
 89 |     } else {
 90 |         mFile.get(c);
 91 |         status = !mFile.fail();
 92 |     }
 93 |     return status;
 94 | }
 95 | 
// Reads one FASTA record, starting just after its '>' marker, into
// mCurrentID (the whole header line) and mCurrentSequence (all sequence lines
// concatenated). Stops when the next '>' or the end of the input is reached.
// mCurrentDescription is reset to an empty string and not filled in here.
void FastaReader::readNext()
{
    // capacity of the line buffer used for gzip input
    const int maxLine = 1024;
    char linebuf[maxLine];

    mCurrentID = "";
    mCurrentDescription = "";
    mCurrentSequence = "";
    bool foundHeader = false;
    
    char c;
    stringstream ssSeq;
    stringstream ssHeader;
    while(true){
        // first character of the next line; the return value is not checked,
        // the loop relies on eof() below to detect the end of the input
        getChar(c);
        // skip blank line
        if(c == '\n' && !eof())
            continue;
        // next record starts, or nothing is left to read
        if(c == '>' || eof())
            break;
        else {
            // the first character belongs to the line, so keep it
            if (foundHeader){
                if(mForceUpperCase && c>='a' && c<='z') {
                    c -= ('a' - 'A');
                }
                ssSeq << c;
            }
            else
                ssHeader << c;
        }
        // remainder of the current line
        string line;
        if(mZipped) {
            getLine(linebuf, maxLine);
            line = string(linebuf);
        } else {
            getline(mFile,line,'\n');
        }

        // fix \r\n issue
        if(line.length()>0)  {
            if(line[line.length()-1] == '\r') {
                line = line.substr(0, line.length()-1);
            }
        }

        // the first line of a record is its header, every later line is sequence
        if(foundHeader == false) {
            ssHeader << line;
            foundHeader = true;
        }
        else {
            str_keep_valid_sequence(line, mForceUpperCase);
            ssSeq << line;
        }
    }
    mCurrentSequence = ssSeq.str();
    string header = ssHeader.str();

    mCurrentID = header;
}
155 | 
156 | bool FastaReader::hasNext() {
157 |     return !eof();
158 | }
159 | 
160 | void FastaReader::readAll() {
161 |     while(!eof()){
162 |         readNext();
163 |         mAllContigs[mCurrentID] = mCurrentSequence;
164 |     }
165 | }
166 | 
167 | bool FastaReader::test(){
168 |     FastaReader reader("testdata/tinyref.fa");
169 |     reader.readAll();
170 | 
171 |     string contig1 = "GATCACAGGTCTATCACCCTATTAATTGGTATTTTCGTCTGGGGGGTGTGGAGCCGGAGCACCCTATGTCGCAGT";
172 |     string contig2 = "GTCTGCACAGCCGCTTTCCACACAGAACCCCCCCCTCCCCCCGCTTCTGGCAAACCCCAAAAACAAAGAACCCTA";
173 | 
174 |     if(reader.mAllContigs.count("contig1") == 0 || reader.mAllContigs.count("contig2") == 0 )
175 |         return false;
176 | 
177 |     if(reader.mAllContigs["contig1"] != contig1 || reader.mAllContigs["contig2"] != contig2 )
178 |         return false;
179 | 
180 |     return true;
181 | 
182 | }
183 | 
184 | 
185 | 
186 | 


--------------------------------------------------------------------------------
/src/fastareader.h:
--------------------------------------------------------------------------------
 1 | #ifndef FASTA_READER_H
 2 | #define FASTA_READER_H
 3 | 
 4 | // includes
 5 | #include <cctype>
 6 | #include <clocale>
 7 | #include <cstdlib>
 8 | #include <fstream>
 9 | #include <iostream>
10 | #include <stdexcept>
11 | #include <string>
12 | #include <map>
13 | #include "zlib/zlib.h"
14 | 
15 | using namespace std;
16 | 
// Reader for plain or gzip-compressed FASTA files. Records can be consumed one
// at a time with hasNext()/readNext() or all at once with readAll().
class FastaReader
{
public:
    // forceUpperCase: convert lower-case bases to upper case while reading
    FastaReader(string fastaFile, bool forceUpperCase = true);
    ~FastaReader();
    // true while the end of the input has not been reached
    bool hasNext();
    // read the next record into mCurrentID / mCurrentSequence
    void readNext();
    // read all remaining records into mAllContigs
    void readAll();

    inline string currentID()
    {
        return mCurrentID;
    }

    inline string currentDescription()
    {
        return mCurrentDescription;
    }

    inline string currentSequence()
    {
        return mCurrentSequence;
    }

    // ID -> sequence for everything loaded by readAll()
    inline map<string, string>& contigs() {
        return mAllContigs;
    }

    static bool test();


public:
    // data of the record read most recently
    string mCurrentSequence;
    string mCurrentID ;
    string mCurrentDescription;
    // ID -> sequence, filled by readAll()
    map<string, string> mAllContigs;

private:
    bool readLine();
    bool endOfLine(char c);
    void setFastaSequenceIdDescription();
    // read the rest of the current line into line (at most maxLine bytes)
    bool getLine(char* line, int maxLine);
    // read one character; returns false when nothing could be read
    bool getChar(char& c);
    bool eof();

private:
    string mFilename;
    bool mForceUpperCase;
    // handle used when the input is gzip-compressed
    gzFile mZipFile;
    // stream used when the input is plain text
    ifstream mFile;
    // selects between mZipFile and mFile
    bool mZipped;
};
69 | 
70 | 
71 | #endif
72 | 
73 | 


--------------------------------------------------------------------------------
/src/fastqreader.cpp:
--------------------------------------------------------------------------------
  1 | #include "fastqreader.h"
  2 | #include "util.h"
  3 | #include <string.h>
  4 | 
  5 | #define FQ_BUF_SIZE (1<<20)
  6 | 
  7 | FastqReader::FastqReader(string filename, bool hasQuality, bool phred64){
  8 | 	mFilename = filename;
  9 | 	mZipFile = NULL;
 10 | 	mZipped = false;
 11 | 	mFile = NULL;
 12 | 	mStdinMode = false;
 13 | 	mPhred64 = phred64;
 14 | 	mHasQuality = hasQuality;
 15 | 	mBuf = new char[FQ_BUF_SIZE];
 16 | 	mBufDataLen = 0;
 17 | 	mBufUsedLen = 0;
 18 | 	mHasNoLineBreakAtEnd = false;
 19 | 	init();
 20 | }
 21 | 
 22 | FastqReader::~FastqReader(){
 23 | 	close();
 24 | 	delete mBuf;
 25 | }
 26 | 
 27 | bool FastqReader::hasNoLineBreakAtEnd() {
 28 | 	return mHasNoLineBreakAtEnd;
 29 | }
 30 | 
 31 | void FastqReader::readToBuf() {
 32 | 	if(mZipped) {
 33 | 		mBufDataLen = gzread(mZipFile, mBuf, FQ_BUF_SIZE);
 34 | 		if(mBufDataLen == -1) {
 35 | 			cerr << "Error to read gzip file" << endl;
 36 | 		}
 37 | 	} else {
 38 | 		mBufDataLen = fread(mBuf, 1, FQ_BUF_SIZE, mFile);
 39 | 	}
 40 | 	mBufUsedLen = 0;
 41 | 
 42 | 	if(mBufDataLen < FQ_BUF_SIZE) {
 43 | 		if(mBuf[mBufDataLen-1] != '\n')
 44 | 			mHasNoLineBreakAtEnd = true;
 45 | 	}
 46 | }
 47 | 
 48 | void FastqReader::init(){
 49 | 	if (ends_with(mFilename, ".gz")){
 50 | 		mZipFile = gzopen(mFilename.c_str(), "r");
 51 | 		mZipped = true;
 52 | 		gzrewind(mZipFile);
 53 | 	}
 54 | 	else {
 55 | 		if(mFilename == "/dev/stdin") {
 56 | 			mFile = stdin;
 57 | 		}
 58 | 		else
 59 | 			mFile = fopen(mFilename.c_str(), "rb");
 60 | 		if(mFile == NULL) {
 61 | 			error_exit("Failed to open file: " + mFilename);
 62 | 		}
 63 | 		mZipped = false;
 64 | 	}
 65 | 	readToBuf();
 66 | }
 67 | 
 68 | void FastqReader::getBytes(size_t& bytesRead, size_t& bytesTotal) {
 69 | 	if(mZipped) {
 70 | 		bytesRead = gzoffset(mZipFile);
 71 | 	} else {
 72 | 		bytesRead = ftell(mFile);//mFile.tellg();
 73 | 	}
 74 | 
 75 | 	// use another ifstream to not affect current reader
 76 | 	ifstream is(mFilename);
 77 | 	is.seekg (0, is.end);
 78 | 	bytesTotal = is.tellg();
 79 | }
 80 | 
 81 | void FastqReader::clearLineBreaks(char* line) {
 82 | 
 83 | 	// trim \n, \r or \r\n in the tail
 84 | 	int readed = strlen(line);
 85 | 	if(readed >=2 ){
 86 | 		if(line[readed-1] == '\n' || line[readed-1] == '\r'){
 87 | 			line[readed-1] = '\0';
 88 | 			if(line[readed-2] == '\r')
 89 | 				line[readed-2] = '\0';
 90 | 		}
 91 | 	}
 92 | }
 93 | 
 94 | string FastqReader::getLine(){
 95 | 	static int c=0;
 96 | 	c++;
 97 | 	int copied = 0;
 98 | 
 99 | 	int start = mBufUsedLen;
100 | 	int end = start;
101 | 
102 | 	while(end < mBufDataLen) {
103 | 		if(mBuf[end] != '\r' && mBuf[end] != '\n')
104 | 			end++;
105 | 		else
106 | 			break;
107 | 	}
108 | 
109 | 	// this line well contained in this buf, or this is the last buf
110 | 	if(end < mBufDataLen || mBufDataLen < FQ_BUF_SIZE) {
111 | 		int len = end - start;
112 | 		string line(mBuf+start, len);
113 | 
114 | 		// skip \n or \r
115 | 		end++;
116 | 		// handle \r\n
117 | 		if(end < mBufDataLen-1 && mBuf[end-1]=='\r' && mBuf[end] == '\n')
118 | 			end++;
119 | 
120 | 		mBufUsedLen = end;
121 | 
122 | 		return line;
123 | 	}
124 | 
125 | 	// this line is not contained in this buf, we need to read new buf
126 | 	string str(mBuf+start, mBufDataLen - start);
127 | 
128 | 	while(true) {
129 | 		readToBuf();
130 | 		start = 0;
131 | 		end = 0;
132 | 		while(end < mBufDataLen) {
133 | 			if(mBuf[end] != '\r' && mBuf[end] != '\n')
134 | 				end++;
135 | 			else
136 | 				break;
137 | 		}
138 | 		// this line well contained in this buf, we need to read new buf
139 | 		if(end < mBufDataLen || mBufDataLen < FQ_BUF_SIZE) {
140 | 			int len = end - start;
141 | 			str.append(mBuf+start, len);
142 | 
143 | 			// skip \n or \r
144 | 			end++;
145 | 			// handle \r\n
146 | 			if(end < mBufDataLen-1 && mBuf[end] == '\n')
147 | 				end++;
148 | 
149 | 			mBufUsedLen = end;
150 | 			return str;
151 | 		}
152 | 		// even this new buf is not enough, although impossible
153 | 		str.append(mBuf+start, mBufDataLen);
154 | 	}
155 | 
156 | 	return string();
157 | }
158 | 
159 | bool FastqReader::eof() {
160 | 	if (mZipped) {
161 | 		return gzeof(mZipFile);
162 | 	} else {
163 | 		return feof(mFile);//mFile.eof();
164 | 	}
165 | }
166 | 
167 | Read* FastqReader::read(){
168 | 	if (mZipped){
169 | 		if (mZipFile == NULL)
170 | 			return NULL;
171 | 	}
172 | 
173 | 	if(mBufUsedLen >= mBufDataLen && eof()) {
174 | 		return NULL;
175 | 	}
176 | 
177 | 	string name = getLine();
178 | 	// name should start with @
179 | 	while((name.empty() && !(mBufUsedLen >= mBufDataLen && eof())) || (!name.empty() && name[0]!='@')){
180 | 		name = getLine();
181 | 	}
182 | 
183 | 	if(name.empty())
184 | 		return NULL;
185 | 
186 | 	string sequence = getLine();
187 | 	string strand = getLine();
188 | 
189 | 	// WAR for FQ with no quality
190 | 	if (!mHasQuality){
191 | 		string quality = string(sequence.length(), 'K');
192 | 		return new Read(name, sequence, strand, quality, mPhred64);
193 | 	}
194 | 	else {
195 | 		string quality = getLine();
196 | 		if(quality.length() != sequence.length()) {
197 | 			cerr << "ERROR: sequence and quality have different length:" << endl;
198 | 			cerr << name << endl;
199 | 			cerr << sequence << endl;
200 | 			cerr << strand << endl;
201 | 			cerr << quality << endl;
202 | 			return NULL;
203 | 		}
204 | 		return new Read(name, sequence, strand, quality, mPhred64);
205 | 	}
206 | 
207 | 	return NULL;
208 | }
209 | 
210 | void FastqReader::close(){
211 | 	if (mZipped){
212 | 		if (mZipFile){
213 | 			gzclose(mZipFile);
214 | 			mZipFile = NULL;
215 | 		}
216 | 	}
217 | 	else {
218 | 		if (mFile){
219 | 			fclose(mFile);//mFile.close();
220 | 			mFile = NULL;
221 | 		}
222 | 	}
223 | }
224 | 
225 | bool FastqReader::isZipFastq(string filename) {
226 | 	if (ends_with(filename, ".fastq.gz"))
227 | 		return true;
228 | 	else if (ends_with(filename, ".fq.gz"))
229 | 		return true;
230 | 	else if (ends_with(filename, ".fasta.gz"))
231 | 		return true;
232 | 	else if (ends_with(filename, ".fa.gz"))
233 | 		return true;
234 | 	else
235 | 		return false;
236 | }
237 | 
238 | bool FastqReader::isFastq(string filename) {
239 | 	if (ends_with(filename, ".fastq"))
240 | 		return true;
241 | 	else if (ends_with(filename, ".fq"))
242 | 		return true;
243 | 	else if (ends_with(filename, ".fasta"))
244 | 		return true;
245 | 	else if (ends_with(filename, ".fa"))
246 | 		return true;
247 | 	else
248 | 		return false;
249 | }
250 | 
251 | bool FastqReader::isZipped(){
252 | 	return mZipped;
253 | }
254 | 
255 | bool FastqReader::test(){
256 | 	FastqReader reader1("testdata/R1.fq");
257 | 	FastqReader reader2("testdata/R1.fq.gz");
258 | 	Read* r1 = NULL;
259 | 	Read* r2 = NULL;
260 | 	while(true){
261 | 		r1=reader1.read();
262 | 		r2=reader2.read();
263 | 		if(r1 == NULL || r2 == NULL)
264 | 			break;
265 | 		if(r1->mSeq.mStr != r2->mSeq.mStr){
266 | 			return false;
267 | 		}
268 | 		delete r1;
269 | 		delete r2;
270 | 	}
271 | 	return true;
272 | }
273 | 
274 | FastqReaderPair::FastqReaderPair(FastqReader* left, FastqReader* right){
275 | 	mLeft = left;
276 | 	mRight = right;
277 | }
278 | 
279 | FastqReaderPair::FastqReaderPair(string leftName, string rightName, bool hasQuality, bool phred64, bool interleaved){
280 | 	mInterleaved = interleaved;
281 | 	mLeft = new FastqReader(leftName, hasQuality, phred64);
282 | 	if(mInterleaved)
283 | 		mRight = NULL;
284 | 	else
285 | 		mRight = new FastqReader(rightName, hasQuality, phred64);
286 | }
287 | 
288 | FastqReaderPair::~FastqReaderPair(){
289 | 	if(mLeft){
290 | 		delete mLeft;
291 | 		mLeft = NULL;
292 | 	}
293 | 	if(mRight){
294 | 		delete mRight;
295 | 		mRight = NULL;
296 | 	}
297 | }
298 | 
299 | ReadPair* FastqReaderPair::read(){
300 | 	Read* l = mLeft->read();
301 | 	Read* r = NULL;
302 | 	if(mInterleaved)
303 | 		r = mLeft->read();
304 | 	else
305 | 		r = mRight->read();
306 | 	if(!l || !r){
307 | 		return NULL;
308 | 	} else {
309 | 		return new ReadPair(l, r);
310 | 	}
311 | }
312 | 


--------------------------------------------------------------------------------
/src/fastqreader.h:
--------------------------------------------------------------------------------
 1 | #ifndef FASTQ_READER_H
 2 | #define FASTQ_READER_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include "read.h"
 7 | #ifdef DYNAMIC_ZLIB
 8 |   #include <zlib.h>
 9 | #else
10 |   #include "zlib/zlib.h"
11 | #endif
12 | #include "common.h"
13 | #include <iostream>
14 | #include <fstream>
15 | 
// Buffered reader for plain or gzip-compressed FASTQ input.
class FastqReader{
public:
	// hasQuality: false for input without quality lines
	// phred64:    passed on to every Read that is created
	FastqReader(string filename, bool hasQuality = true, bool phred64=false);
	~FastqReader();
	bool isZipped();

	// bytesRead: current offset in the input; bytesTotal: size of the input file
	void getBytes(size_t& bytesRead, size_t& bytesTotal);

	//this function is not thread-safe
	//do not call read() of a same FastqReader object from different threads concurrently
	Read* read();
	bool eof();
	// true when the last buffer of the input did not end with '\n'
	bool hasNoLineBreakAtEnd();

public:
	// file-name checks based on the extension only
	static bool isZipFastq(string filename);
	static bool isFastq(string filename);
	static bool test();

private:
	void init();
	void close();
	// next line of the input without its line break
	string getLine();
	void clearLineBreaks(char* line);
	// refill mBuf from the input
	void readToBuf();

private:
	string mFilename;
	// handle used for gzip input
	gzFile mZipFile;
	// handle used for plain input (or stdin)
	FILE* mFile;
	// selects between mZipFile and mFile
	bool mZipped;
	bool mHasQuality;
	bool mPhred64;
	// read buffer, number of valid bytes in it, and number of bytes already consumed
	char* mBuf;
	int mBufDataLen;
	int mBufUsedLen;
	bool mStdinMode;
	bool mHasNoLineBreakAtEnd;

};
56 | 
// Reads paired records, either from two readers or from one interleaved input.
class FastqReaderPair{
public:
	// takes over two existing readers (both are deleted by the destructor)
	FastqReaderPair(FastqReader* left, FastqReader* right);
	// opens the readers by name; with interleaved == true only leftName is opened
	FastqReaderPair(string leftName, string rightName, bool hasQuality = true, bool phred64 = false, bool interleaved = false);
	~FastqReaderPair();
	// next pair, or NULL when either mate could not be read
	ReadPair* read();
public:
	FastqReader* mLeft;
	// NULL for interleaved input opened by name
	FastqReader* mRight;
	// true: both mates are taken from mLeft
	bool mInterleaved;
};
68 | 
69 | #endif
70 | 


--------------------------------------------------------------------------------
/src/filter.cpp:
--------------------------------------------------------------------------------
  1 | #include "processor.h"
  2 | #include "peprocessor.h"
  3 | #include "seprocessor.h"
  4 | #include "overlapanalysis.h"
  5 | 
  6 | Filter::Filter(Options* opt){
  7 |     mOptions = opt;
  8 | }
  9 | 
 10 | 
 11 | Filter::~Filter(){
 12 | }
 13 | 
 14 | int Filter::passFilter(Read* r) {
 15 |     if(r == NULL || r->length()==0) {
 16 |         return FAIL_LENGTH;
 17 |     }
 18 | 
 19 |     int rlen = r->length();
 20 |     int lowQualNum = 0;
 21 |     int nBaseNum = 0;
 22 |     int totalQual = 0;
 23 | 
 24 |     // need to recalculate lowQualNum and nBaseNum if the corresponding filters are enabled
 25 |     if(mOptions->qualfilter.enabled || mOptions->lengthFilter.enabled) {
 26 |         const char* seqstr = r->mSeq.mStr.c_str();
 27 |         const char* qualstr = r->mQuality.c_str();
 28 | 
 29 |         for(int i=0; i<rlen; i++) {
 30 |             char base = seqstr[i];
 31 |             char qual = qualstr[i];
 32 | 
 33 |             totalQual += qual - 33;
 34 | 
 35 |             if(qual < mOptions->qualfilter.qualifiedQual)
 36 |                 lowQualNum ++;
 37 | 
 38 |             if(base == 'N')
 39 |                 nBaseNum++;
 40 |         }
 41 |     }
 42 | 
 43 |     if(mOptions->qualfilter.enabled) {
 44 |         if(lowQualNum > (mOptions->qualfilter.unqualifiedPercentLimit * rlen / 100.0) )
 45 |             return FAIL_QUALITY;
 46 |         else if(mOptions->qualfilter.avgQualReq > 0 && (totalQual / rlen)<mOptions->qualfilter.avgQualReq)
 47 |             return FAIL_QUALITY;
 48 |         else if(nBaseNum > mOptions->qualfilter.nBaseLimit )
 49 |             return FAIL_N_BASE;
 50 |     }
 51 | 
 52 |     if(mOptions->lengthFilter.enabled) {
 53 |         if(rlen < mOptions->lengthFilter.requiredLength)
 54 |             return FAIL_LENGTH;
 55 |         if(mOptions->lengthFilter.maxLength > 0 && rlen > mOptions->lengthFilter.maxLength)
 56 |             return FAIL_TOO_LONG;
 57 |     }
 58 | 
 59 |     if(mOptions->complexityFilter.enabled) {
 60 |         if(!passLowComplexityFilter(r))
 61 |             return FAIL_COMPLEXITY;
 62 |     }
 63 | 
 64 |     return PASS_FILTER;
 65 | }
 66 | 
 67 | bool Filter::passLowComplexityFilter(Read* r) {
 68 |     int diff = 0;
 69 |     int length = r->length();
 70 |     if(length <= 1)
 71 |         return false;
 72 |     const char* data = r->mSeq.mStr.c_str();
 73 |     for(int i=0; i<length-1; i++) {
 74 |         if(data[i] != data[i+1])
 75 |             diff++;
 76 |     }
 77 |     if( (double)diff/(double)(length-1) >= mOptions->complexityFilter.threshold )
 78 |         return true;
 79 |     else
 80 |         return false;
 81 | }
 82 | 
// Trim a fixed number of bases from both ends of a read and, when enabled,
// additionally cut low-quality ends using sliding windows.
//   r            read to modify in place
//   front, tail  number of bases to drop from the 5' / 3' end
//   frontTrimmed out: number of leading bases that were finally removed
// Returns r (modified in place), or NULL when the read becomes too short to
// keep. The read is not freed here when NULL is returned.
Read* Filter::trimAndCut(Read* r, int front, int tail, int& frontTrimmed) {
    frontTrimmed = 0;
    // return the same read for speed if no change needed
    if(front == 0 && tail == 0 && !mOptions->qualityCut.enabledFront && !mOptions->qualityCut.enabledTail && !mOptions->qualityCut.enabledRight)
        return r;


    // length that remains after the fixed trimming
    int rlen = r->length() - front - tail ; 
    if (rlen < 0)
        return NULL;

    // no quality cutting requested: plain trimming only
    if(front == 0 && !mOptions->qualityCut.enabledFront && !mOptions->qualityCut.enabledTail && !mOptions->qualityCut.enabledRight){
        // only the tail is trimmed, so truncating is enough
        r->resize(rlen);
        return r;
    } else if(!mOptions->qualityCut.enabledFront && !mOptions->qualityCut.enabledTail && !mOptions->qualityCut.enabledRight){
        r->mSeq.mStr = r->mSeq.mStr.substr(front, rlen);
        r->mQuality = r->mQuality.substr(front, rlen);
        frontTrimmed  = front;
        return r;
    }

    // need quality cutting

    int l = r->length();
    const char* qualstr = r->mQuality.c_str();
    const char* seq = r->mSeq.mStr.c_str();
    // quality cutting forward
    if(mOptions->qualityCut.enabledFront) {
        int w = mOptions->qualityCut.windowSizeFront;
        int s = front;
        // the remaining read must be longer than one window
        if(l - front - tail - w <= 0)
            return NULL;

        int totalQual = 0;

        // preparing rolling
        // (sum of the first w-1 qualities; the loop below adds the w-th)
        for(int i=0; i<w-1; i++)
            totalQual += qualstr[s+i];

        // slide the window to the right until its mean quality is good enough
        for(s=front; s+w<l-tail; s++) {
            totalQual += qualstr[s+w-1];
            // rolling
            if(s > front) {
                totalQual -= qualstr[s-1];
            }
            // add 33 for phred33 transforming
            if((double)totalQual / (double)w >= 33 + mOptions->qualityCut.qualityFront)
                break;
        }

        // the trimming in front is forwarded and rlen is recalculated
        if(s >0 )
            s = s+w-1;
        // also skip N bases at the new start
        while(s<l && seq[s] == 'N')
            s++;
        front = s;
        rlen = l - front - tail;
    }

    // quality cutting in right mode
    if(mOptions->qualityCut.enabledRight) {
        int w = mOptions->qualityCut.windowSizeRight;
        int s = front;
        if(l - front - tail - w <= 0)
            return NULL;

        int totalQual = 0;

        // preparing rolling
        for(int i=0; i<w-1; i++)
            totalQual += qualstr[s+i];

        bool foundLowQualWindow = false;

        // slide from the front and stop at the first window whose mean
        // quality is below the threshold
        for(s=front; s+w<l-tail; s++) {
            totalQual += qualstr[s+w-1];
            // rolling
            if(s > front) {
                totalQual -= qualstr[s-1];
            }
            // add 33 for phred33 transforming
            if((double)totalQual / (double)w < 33 + mOptions->qualityCut.qualityRight) {
                foundLowQualWindow = true;
                break;
            }
        }

        if(foundLowQualWindow ) {
            // keep the good bases in the window
            while(s<l-1 && qualstr[s]>=33 + mOptions->qualityCut.qualityRight)
                s++;
            rlen = s - front;
        }
    }

    // quality cutting backward
    // (skipped when right-mode cutting is enabled)
    if(!mOptions->qualityCut.enabledRight && mOptions->qualityCut.enabledTail) {
        int w = mOptions->qualityCut.windowSizeTail;
        if(l - front - tail - w <= 0)
            return NULL;

        int totalQual = 0;
        int t = l - tail - 1;

        // preparing rolling
        for(int i=0; i<w-1; i++)
            totalQual += qualstr[t-i];

        // slide the window to the left until its mean quality is good enough
        for(t=l-tail-1; t-w>=front; t--) {
            totalQual += qualstr[t-w+1];
            // rolling
            if(t < l-tail-1) {
                totalQual -= qualstr[t+1];
            }
            // add 33 for phred33 transforming
            if((double)totalQual / (double)w >= 33 + mOptions->qualityCut.qualityTail)
                break;
        }

        if(t < l-1)
            t = t-w+1;
        // also drop N bases at the new end
        while(t>=0 && seq[t] == 'N')
            t--;
        rlen = t - front + 1;
    }

    // nothing usable is left
    if(rlen <= 0 || front >= l-1)
        return NULL;

    r->mSeq.mStr = r->mSeq.mStr.substr(front, rlen);
    r->mQuality = r->mQuality.substr(front, rlen);

    frontTrimmed = front;

    return r;
}
219 | 
220 | bool Filter::match(vector<string>& list, string target, int threshold) {
221 |     for(int i=0; i<list.size(); i++) {
222 |         int diff = 0;
223 |         int len1 = list[i].length();
224 |         int len2 = target.length();
225 |         for(int s=0; s<len1 && s<len2; s++) {
226 |             if(list[i][s] != target[s]) {
227 |                 diff++;
228 |                 if(diff>threshold)
229 |                     break;
230 |             }
231 |         }
232 |         if(diff <= threshold)
233 |             return true;
234 |     }
235 |     return false;
236 | }
237 | 
238 | bool Filter::test() {
239 |     Read r("@name",
240 |         "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTT",
241 |         "+",
242 |         "/////CCCCCCCCCCCC////CCCCCCCCCCCCCC////E");
243 |     Options opt;
244 |     opt.qualityCut.enabledFront = true;
245 |     opt.qualityCut.enabledTail = true;
246 |     opt.qualityCut.windowSizeFront = 4;
247 |     opt.qualityCut.qualityFront = 20;
248 |     opt.qualityCut.windowSizeTail = 4;
249 |     opt.qualityCut.qualityTail = 20;
250 |     Filter filter(&opt);
251 |     int frontTrimmed = 0;
252 |     Read* ret = filter.trimAndCut(&r, 0, 1, frontTrimmed);
253 |     ret->print();
254 |     
255 |     return ret->mSeq.mStr == "CCCCCCCCCCCCCCCCCCCCCCCCCCCC"
256 |         && ret->mQuality == "CCCCCCCCCCC////CCCCCCCCCCCCC";
257 | }


--------------------------------------------------------------------------------
/src/filter.h:
--------------------------------------------------------------------------------
 1 | #ifndef FILTER_H
 2 | #define FILTER_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include <vector>
 8 | #include "options.h"
 9 | #include "read.h"
10 | 
11 | using namespace std;
12 | 
// Read-level filtering and trimming driven by the settings in Options.
class Filter{
public:
    // opt is stored as a pointer and must outlive the Filter
    Filter(Options* opt);
    ~Filter();
    // returns PASS_FILTER or one of the FAIL_* codes
    int passFilter(Read* r);
    // true when the read is complex enough to keep
    bool passLowComplexityFilter(Read* r);
    // trims r in place; returns r, or NULL when the read is too short to keep.
    // frontTrimmed receives the number of bases removed from the front
    Read* trimAndCut(Read* r, int front, int tail, int& frontTrimmed);
    // built-in self test
    static bool test();

private:
    // true when some entry of list is within threshold mismatches of target
    bool match(vector<string>& list, string target, int threshold);

private:
    Options* mOptions;
};
28 | 
29 | 
30 | #endif


--------------------------------------------------------------------------------
/src/filterresult.h:
--------------------------------------------------------------------------------
 1 | #ifndef FILTER_RESULT_H
 2 | #define FILTER_RESULT_H
 3 | 
 4 | #include <iostream>
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <string>
 8 | #include <vector>
 9 | #include "common.h"
10 | #include "options.h"
11 | #include <fstream>
12 | #include <map>
13 | 
// Strict weak ordering for strings: shorter strings come first, strings of
// equal length are ordered lexicographically.
struct classcomp {
    bool operator() (const string& lhs, const string& rhs) const {
        if (lhs.length() != rhs.length())
            return lhs.length() < rhs.length();
        return lhs < rhs;
    }
};
24 | 
25 | using namespace std;
26 | 
// Collects the outcome counters of filtering (per result type, adapter
// trimming, polyX trimming, base correction, merged pairs) and writes them
// into the JSON and HTML reports.
class FilterResult{
public:
    FilterResult(Options* opt, bool paired = false);
    ~FilterResult();
    // direct access to the counter array (FILTER_RESULT_TYPES entries)
    inline long* getFilterReadStats() {return mFilterReadStats;}
    void addFilterResult(int result, int readNum=1);
    // combines several results into one; presumably returns a newly
    // allocated object owned by the caller -- TODO confirm
    static FilterResult* merge(vector<FilterResult*>& list);
    void print();
    // for single end
    void addAdapterTrimmed(string adapter, bool isR2 = false, bool incTrimmedCounter = true);
    // for paired end
    void addAdapterTrimmed(string adapter1, string adapter2);
    void addPolyXTrimmed(int base, int length);
    long getTotalPolyXTrimmedReads();
    long getTotalPolyXTrimmedBases();
    // a part of JSON report
    void reportJson(ofstream& ofs, string padding);
    // a part of JSON report for adapters
    void reportAdapterJson(ofstream& ofs, string padding);
    // a part of JSON report for polyX trim
    void reportPolyXTrimJson(ofstream& ofs, string padding);
    // a part of HTML report
    void reportHtml(ofstream& ofs, long totalReads, long totalBases);
    // a part of HTML report for adapters
    void reportAdapterHtml(ofstream& ofs, long totalBases);
    void outputAdaptersJson(ofstream& ofs, map<string, long, classcomp>& adapterCounts);
    void outputAdaptersHtml(ofstream& ofs, map<string, long, classcomp>& adapterCounts, long totalBases);
    // deal with base correction results
    long* getCorrectionMatrix() {return mCorrectionMatrix;}
    long getTotalCorrectedBases();
    void addCorrection(char from, char to);
    long getCorrectionNum(char from, char to);
    void incCorrectedReads(int count);
    void addMergedPairs(int pairs);


public:
    Options* mOptions;
    bool mPaired;
    long mCorrectedReads;
    long mMergedPairs;
private:
    long mFilterReadStats[FILTER_RESULT_TYPES];
    long mTrimmedAdapterRead;
    long mTrimmedAdapterBases;
    // one slot per base type, zero-initialized
    long mTrimmedPolyXReads[4] = {0};
    long mTrimmedPolyXBases[4] = {0};
    // adapter sequence -> count, ordered by length first (see classcomp)
    map<string, long, classcomp> mAdapter1;
    map<string, long, classcomp> mAdapter2;
    long* mCorrectionMatrix;
};
78 | 
79 | #endif


--------------------------------------------------------------------------------
/src/genomes.h:
--------------------------------------------------------------------------------
 1 | #ifndef GENOMES_H
 2 | #define GENOMES_H
 3 | 
 4 | // includes
 5 | #include <iostream>
 6 | #include <stdio.h>
 7 | #include <stdlib.h>
 8 | #include <string>
 9 | #include <fstream>
10 | #include "common.h"
11 | #include "fastareader.h"
12 | #include <vector>
13 | #include <list>
14 | #include <set>
15 | #include <unordered_map>
16 | #include "options.h"
17 | 
18 | using namespace std;
19 | 
20 | class MapResult{
21 | 
22 | public:
23 |     MapResult(){
24 |         mapped = false;
25 |         start = 0;
26 |         len = 0;
27 |         ed = 0x7FFFFFFF; // initialized with a very large ED
28 |     }
29 | public:
30 |     bool mapped;
31 |     uint32 start;
32 |     uint32 len;
33 |     uint32 ed; // edit distance
34 | };
35 | 
// Holds a set of reference genomes loaded from a FASTA file together with a
// k-mer lookup table, and accumulates per-genome coverage / edit-distance
// statistics for reporting.
class Genomes
{
public:
    Genomes(string fastaFile, Options* opt);
    ~Genomes();

    void cover(int id, uint32 pos, uint32 len, uint32 ed, float frac);
    bool hasKey(uint64 key);
    // presumably returns true when seq could be aligned to one of the
    // genomes -- TODO confirm against genomes.cpp
    bool align(string& seq);
    void report();
    void reportJSON(ofstream& ofs);
    void reportHtml(ofstream& ofs);

    // pack / unpack a genome id and a position into a single uint32
    static uint32 packIdPos(uint32 id, uint32 position);
    static void unpackIdPos(uint32 data,uint32& id, uint32& pos);

private:
    void init();
    void buildKmerTable();
    void addKmer(uint64 key, uint32 id, uint32 pos);
    void initLowComplexityKeys();
    MapResult mapToGenome(string& seq, uint32 seqPos, string& genome, uint32 genomePos);
    void initBloomFilter();
    // data series for the plots in the reports
    string getPlotX(int id);
    string getCoverageY(int id);
    string getEditDistanceY(int id);
    void initBinSize();
    double getCoverageRate(int id);

private:
    int mGenomeNum;
    FastaReader* mFastaReader;
    vector<string> mSequences;
    vector<string> mNames;
    vector<vector<float>> mCoverage;
    vector<vector<float>> mEditDistance;
    vector<long> mTotalEditDistance;
    vector<long> mReads;
    vector<long> mBases;
    // uint32 = 8 bits genome id + 24 bits positions
    unordered_map<uint64, list<uint32>> mKmerTable; 
    set<uint64> mLowComplexityKeys;
    Options* mOptions;
    long mHitCount;
    long mMissedCount;
    char* mBloomFilterArray;
};
83 | 
84 | 
85 | #endif


--------------------------------------------------------------------------------
/src/htmlreporter.h:
--------------------------------------------------------------------------------
 1 | #ifndef HTML_REPORTER_H
 2 | #define HTML_REPORTER_H
 3 | 
 4 | #include <iostream>
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <string>
 8 | #include "options.h"
 9 | #include "stats.h"
10 | #include "filterresult.h"
11 | #include <fstream>
12 | #include "virusdetector.h"
13 | 
14 | using namespace std;
15 | 
// Writes the HTML report. Duplication and insert-size data are supplied
// through the set*() methods before report() is called.
class HtmlReporter{
public:
    HtmlReporter(Options* opt);
    ~HtmlReporter();
    // the arrays are passed as pointers; whether they are copied is decided by
    // the implementation (not visible here)
    void setDupHist(int* dupHist, double* dupMeanGC, double dupRate);
    void setInsertHist(long* insertHist, int insertSizePeak);
    // preStats2 / postStats2 default to NULL
    void report(VirusDetector* vd,FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2 = NULL, Stats* postStats2 = NULL);

    // static formatting helpers
    static void outputRow(ofstream& ofs, string key, long value);
    static void outputRow(ofstream& ofs, string key, string value);
    static string formatNumber(long number);
    static string getPercents(long numerator, long denominator);
private:
    const string getCurrentSystemTime();
    void printHeader(ofstream& ofs);
    void printCSS(ofstream& ofs);
    void printJS(ofstream& ofs);
    void printFooter(ofstream& ofs);
    void reportDuplication(ofstream& ofs);
    void reportInsertSize(ofstream& ofs, int isizeLimit);
    void printSummary(ofstream& ofs, FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2, Stats* postStats2);
    void printDetectionResult(ofstream& ofs, Kmer* kmer);
    void printGenomeCoverage(ofstream& ofs, Genomes* g);
    void reportKmerHits(ofstream& ofs, Kmer* kmer);
    void reportKmerCollection(ofstream& ofs, KmerCollection* kc);
    
    
private:
    Options* mOptions;
    // values handed in through setDupHist()
    int* mDupHist;
    double* mDupMeanGC;
    double mDupRate;
    // values handed in through setInsertHist()
    long* mInsertHist;
    int mInsertSizePeak;
};
51 | 
52 | 
53 | #endif


--------------------------------------------------------------------------------
/src/jsonreporter.cpp:
--------------------------------------------------------------------------------
  1 | #include "jsonreporter.h"
  2 | 
  3 | JsonReporter::JsonReporter(Options* opt){
  4 |     mOptions = opt;
  5 |     mDupHist = NULL;
  6 |     mDupRate = 0;
  7 | }
  8 | 
  9 | JsonReporter::~JsonReporter(){
 10 | }
 11 | 
 12 | void JsonReporter::setDupHist(int* dupHist, double* dupMeanGC, double dupRate) {
 13 |     mDupHist = dupHist;
 14 |     mDupMeanGC = dupMeanGC;
 15 |     mDupRate = dupRate;
 16 | }
 17 | 
 18 | void JsonReporter::setInsertHist(long* insertHist, int insertSizePeak) {
 19 |     mInsertHist = insertHist;
 20 |     mInsertSizePeak = insertSizePeak;
 21 | }
 22 | 
 23 | extern string command;
// Write the complete JSON report to mOptions->jsonFile.
//   vd          source of the k-mer, genome and k-mer collection results
//   result      filtering result; may be NULL, its sections are then skipped
//   preStats1 / postStats1   read1 statistics before / after filtering
//   preStats2 / postStats2   read2 statistics, NULL for single-end data
// vd, preStats1 and postStats1 are dereferenced unconditionally and must not
// be NULL. preStats2 / postStats2 are dereferenced whenever
// mOptions->isPaired() is true.
// The sections are emitted in a fixed order and the final "command" entry
// carries no trailing comma, so every section before it has to end with one.
void JsonReporter::report(VirusDetector* vd, FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2, Stats* postStats2) {
    ofstream ofs;
    ofs.open(mOptions->jsonFile, ifstream::out);
    ofs << "{" << endl;

    // totals over read1 and, when present, read2
    long pre_total_reads = preStats1->getReads();
    if(preStats2)
        pre_total_reads += preStats2->getReads();

    long pre_total_bases = preStats1->getBases();
    if(preStats2)
        pre_total_bases += preStats2->getBases();

    long pre_q20_bases = preStats1->getQ20();
    if(preStats2)
        pre_q20_bases += preStats2->getQ20();

    long pre_q30_bases = preStats1->getQ30();
    if(preStats2)
        pre_q30_bases += preStats2->getQ30();

    long pre_total_gc = preStats1->getGCNumber();
    if(preStats2)
        pre_total_gc += preStats2->getGCNumber();

    long post_total_reads = postStats1->getReads();
    if(postStats2)
        post_total_reads += postStats2->getReads();

    long post_total_bases = postStats1->getBases();
    if(postStats2)
        post_total_bases += postStats2->getBases();

    long post_q20_bases = postStats1->getQ20();
    if(postStats2)
        post_q20_bases += postStats2->getQ20();

    long post_q30_bases = postStats1->getQ30();
    if(postStats2)
        post_q30_bases += postStats2->getQ30();

    long post_total_gc = postStats1->getGCNumber();
    if(postStats2)
        post_total_gc += postStats2->getGCNumber();

    // KMER detection
    Kmer* kmer = vd->getKmer();
    if(kmer) {
        // positive when the mean hit count reaches the configured threshold
        string detectionResult;
        if(kmer->getMeanHit() >= mOptions->positiveThreshold)
            detectionResult = "POSITIVE";
        else
            detectionResult = "NEGATIVE";

        ofs << "\t" << "\"kmer_detection_result\": {" << endl;
        ofs << "\t\t" << "\"result\": \"" << detectionResult << "\"," << endl;
        ofs << "\t\t" << "\"mean_coverage\": " << kmer->getMeanHit() << "," << endl;
        ofs << "\t\t" << "\"positive_thread\": " << mOptions->positiveThreshold << "," << endl;

        // unique kmer hits
        ofs << "\t\t" << "\"kmer_hits\": {" << endl;
            kmer->reportJSON(ofs);
        ofs << "\t\t" << "}" << endl;


        ofs << "\t" << "}," << endl;
    }

    // genome coverage
    Genomes* genome = vd->getGenomes();
    if(genome) {
        genome->reportJSON(ofs);
    }

    // k-mer collection
    KmerCollection* kc = vd->getKmerCollection();
    if(kc) {
        kc->reportJSON(ofs);
    }

    // summary
    ofs << "\t" << "\"summary\": {" << endl;

    // rates are reported as 0.0 when there are no bases, avoiding a division by zero
    ofs << "\t\t" << "\"before_filtering\": {" << endl;
    ofs << "\t\t\t" << "\"total_reads\":" << pre_total_reads << "," << endl; 
    ofs << "\t\t\t" << "\"total_bases\":" << pre_total_bases << "," << endl; 
    ofs << "\t\t\t" << "\"q20_bases\":" << pre_q20_bases << "," << endl; 
    ofs << "\t\t\t" << "\"q30_bases\":" << pre_q30_bases << "," << endl; 
    ofs << "\t\t\t" << "\"q20_rate\":" << (pre_total_bases == 0?0.0:(double)pre_q20_bases / (double)pre_total_bases) << "," << endl; 
    ofs << "\t\t\t" << "\"q30_rate\":" << (pre_total_bases == 0?0.0:(double)pre_q30_bases / (double)pre_total_bases) << "," << endl; 
    ofs << "\t\t\t" << "\"read1_mean_length\":" << preStats1->getMeanLength() << "," << endl;
    if(mOptions->isPaired())
        ofs << "\t\t\t" << "\"read2_mean_length\":" << preStats2->getMeanLength() << "," << endl;
    ofs << "\t\t\t" << "\"gc_content\":" << (pre_total_bases == 0?0.0:(double)pre_total_gc / (double)pre_total_bases)  << endl; 
    ofs << "\t\t" << "}," << endl;

    ofs << "\t\t" << "\"after_filtering\": {" << endl;
    ofs << "\t\t\t" << "\"total_reads\":" << post_total_reads << "," << endl; 
    ofs << "\t\t\t" << "\"total_bases\":" << post_total_bases << "," << endl; 
    ofs << "\t\t\t" << "\"q20_bases\":" << post_q20_bases << "," << endl; 
    ofs << "\t\t\t" << "\"q30_bases\":" << post_q30_bases << "," << endl; 
    ofs << "\t\t\t" << "\"q20_rate\":" << (post_total_bases == 0?0.0:(double)post_q20_bases / (double)post_total_bases) << "," << endl; 
    ofs << "\t\t\t" << "\"q30_rate\":" << (post_total_bases == 0?0.0:(double)post_q30_bases / (double)post_total_bases) << "," << endl; 
    ofs << "\t\t\t" << "\"read1_mean_length\":" << postStats1->getMeanLength() << "," << endl;
    if(mOptions->isPaired())
        ofs << "\t\t\t" << "\"read2_mean_length\":" << postStats2->getMeanLength() << "," << endl;
    ofs << "\t\t\t" << "\"gc_content\":" << (post_total_bases == 0?0.0:(double)post_total_gc / (double)post_total_bases)  << endl; 
    ofs << "\t\t" << "}";

    ofs << endl;

    ofs << "\t" << "}," << endl;

    if(result) {
        ofs << "\t" << "\"filtering_result\": " ;
        result -> reportJson(ofs, "\t");
    }

    // duplication: histogram index 0 is not reported
    if(mOptions->duplicate.enabled) {
        ofs << "\t" << "\"duplication\": {" << endl;
        ofs << "\t\t\"rate\": " << mDupRate << "," << endl;
        ofs << "\t\t\"histogram\": [";
        for(int d=1; d<mOptions->duplicate.histSize; d++) {
            ofs << mDupHist[d];
            if(d!=mOptions->duplicate.histSize-1)
                ofs << ",";
        }
        ofs << "]," << endl;
        ofs << "\t\t\"mean_gc\": [";
        for(int d=1; d<mOptions->duplicate.histSize; d++) {
            ofs << mDupMeanGC[d];
            if(d!=mOptions->duplicate.histSize-1)
                ofs << ",";
        }
        ofs << "]" << endl;
        ofs << "\t" << "}";
        ofs << "," << endl;
    }

    // insert size: the slot at index insertSizeMax is reported as "unknown",
    // the histogram itself covers indexes 0 .. insertSizeMax-1
    if(mOptions->isPaired()) {
        ofs << "\t" << "\"insert_size\": {" << endl;
        ofs << "\t\t\"peak\": " << mInsertSizePeak << "," << endl;
        ofs << "\t\t\"unknown\": " << mInsertHist[mOptions->insertSizeMax] << "," << endl;
        ofs << "\t\t\"histogram\": [";
        for(int d=0; d<mOptions->insertSizeMax; d++) {
            ofs << mInsertHist[d];
            if(d!=mOptions->insertSizeMax-1)
                ofs << ",";
        }
        ofs << "]" << endl;
        ofs << "\t" << "}";
        ofs << "," << endl;
    }

    if(result && mOptions->adapterCuttingEnabled()) {
        ofs << "\t" << "\"adapter_cutting\": " ;
        result -> reportAdapterJson(ofs, "\t");
    }

    if(result && mOptions->polyXTrimmingEnabled()) {
        ofs << "\t" << "\"polyx_trimming\": " ;
        result -> reportPolyXTrimJson(ofs, "\t");
    }

    // per-read statistics
    if(preStats1) {
        ofs << "\t" << "\"read1_before_filtering\": " ;
        preStats1 -> reportJson(ofs, "\t");
    }

    if(preStats2) {
        ofs << "\t" << "\"read2_before_filtering\": " ;
        preStats2 -> reportJson(ofs, "\t");
    }

    if(postStats1) {
        string name = "read1_after_filtering";
        ofs << "\t" << "\"" << name << "\": " ;
        postStats1 -> reportJson(ofs, "\t");
    }

    if(postStats2) {
        ofs << "\t" << "\"read2_after_filtering\": " ;
        postStats2 -> reportJson(ofs, "\t");
    }

    // the command line is written verbatim, without JSON escaping
    ofs << "\t\"command\": " << "\"" << command << "\"" << endl;

    ofs << "}";
}


--------------------------------------------------------------------------------
/src/jsonreporter.h:
--------------------------------------------------------------------------------
 1 | #ifndef JSON_REPORTER_H
 2 | #define JSON_REPORTER_H
 3 | 
 4 | #include <iostream>
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <string>
 8 | #include "options.h"
 9 | #include "stats.h"
10 | #include "filterresult.h"
11 | #include <fstream>
12 | #include "virusdetector.h"
13 | 
14 | using namespace std;
15 | 
// Writes the JSON report. Duplication and insert-size data are supplied
// through the set*() methods before report() is called.
class JsonReporter{
public:
    JsonReporter(Options* opt);
    ~JsonReporter();

    // both setters store the given pointers; the arrays are not copied
    void setDupHist(int* dupHist, double* dupMeanGC, double dupRate);
    void setInsertHist(long* insertHist, int insertSizePeak);
    // writes the report to mOptions->jsonFile; preStats2 / postStats2 default to NULL
    void report(VirusDetector* vd, FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2 = NULL, Stats* postStats2 = NULL);

private:
    Options* mOptions;
    // values handed in through setDupHist()
    int* mDupHist;
    double* mDupMeanGC;
    double mDupRate;
    // values handed in through setInsertHist()
    long* mInsertHist;
    int mInsertSizePeak;
};
33 | 
34 | 
35 | #endif


--------------------------------------------------------------------------------
/src/kmer.cpp:
--------------------------------------------------------------------------------
  1 | #include "kmer.h"
  2 | #include "util.h"
  3 | #include <sstream>
  4 | #include <string.h>
  5 | 
  6 | Kmer::Kmer(string filename, Options* opt)
  7 | {
  8 |     mFastaReader = NULL;
  9 |     mOptions = opt;
 10 |     init(filename);
 11 |     resultMade = false;
 12 | }
 13 | 
 14 | Kmer::~Kmer()
 15 | {
 16 |     if(mFastaReader) {
 17 |         delete mFastaReader;
 18 |         mFastaReader = NULL;
 19 |     }
 20 | }
 21 | 
 22 | void Kmer::init(string filename)
 23 | {
 24 |     mFastaReader = new FastaReader(filename);
 25 |     mFastaReader->readAll();
 26 | 
 27 |     map<string, string> kmers = mFastaReader->contigs();
 28 |     map<string, string>::iterator iter;
 29 | 
 30 |     bool initialized = false;
 31 |     for(iter = kmers.begin(); iter != kmers.end() ; iter++) {
 32 |         string seq = iter->second;
 33 | 
 34 |         if(!initialized) {
 35 |             initialized = true;
 36 |             if(mOptions->kmerKeyLen == 0)
 37 |                 mOptions->kmerKeyLen = seq.length();
 38 |         }
 39 |         if(seq.length() != mOptions->kmerKeyLen) {
 40 |             cerr << "KMER length must be " << mOptions->kmerKeyLen << ", skipped " << seq << endl;
 41 |             continue;
 42 |         }
 43 |         bool valid = true;
 44 |         uint64 kmer64 = seq2uint64(seq, 0, seq.length(), valid);
 45 |         if(valid) {
 46 |             mKmerHits[kmer64] = 0;
 47 |             mNames[kmer64] = iter->first;
 48 |             mSequences[kmer64] = iter->second;
 49 |         } else {
 50 |             cerr << iter->first << ": " << seq << " skipped" << endl;
 51 |         }
 52 |     }
 53 | 
 54 |     if(mKmerHits.size() == 0) {
 55 |         error_exit("No unique KMER specified!");
 56 |     }
 57 | }
 58 | 
 59 | void Kmer::makeResults() {
 60 |     mResults.clear();
 61 |     unordered_map<uint64, uint32>::iterator iter;
 62 |     for(iter = mKmerHits.begin(); iter != mKmerHits.end(); iter++) {
 63 |         uint64 kmer64 = iter->first;
 64 |         string title = mNames[kmer64] + "_" + mSequences[kmer64];
 65 |         mResults[title] =  iter->second;
 66 |     }
 67 |     resultMade = true;
 68 | }
 69 | 
 70 | void Kmer::report() {
 71 |     if(!resultMade)
 72 |         makeResults();
 73 | 
 74 |     map<string, uint32>::iterator iter;
 75 |     for(iter = mResults.begin(); iter != mResults.end(); iter++) {
 76 |         cerr << iter->first << ": " << iter->second << endl;
 77 |     }
 78 | 
 79 |     double meanHit = getMeanHit();
 80 |     cerr << endl;
 81 |     cerr << "Mean depth: " << meanHit << endl<<endl;
 82 |     if(meanHit >= mOptions->positiveThreshold)
 83 |         cerr << "Result: POSITIVE";
 84 |     else
 85 |         cerr << "Result: NEGATIVE";
 86 |     cerr << " (" << "threshold: " <<  mOptions->positiveThreshold << ")" << endl;
 87 | }
 88 | 
 89 | double Kmer::getMeanHit() {
 90 |     if(mKmerHits.size() == 0)
 91 |         return 0.0;
 92 | 
 93 |     double total = 0;
 94 |     unordered_map<uint64, uint32>::iterator iter;
 95 |     for(iter = mKmerHits.begin(); iter != mKmerHits.end(); iter++) {
 96 |         total += iter->second;
 97 |     }
 98 |     return total / (double) mKmerHits.size();
 99 | }
100 | 
101 | bool Kmer::add(uint64 kmer64) {
102 |     unordered_map<uint64, uint32>::iterator iter = mKmerHits.find(kmer64);
103 |     if(iter != mKmerHits.end()) {
104 |         iter->second++;
105 |         return true;
106 |     }
107 |     return false;
108 | }
109 | 
110 | 
111 | string Kmer::getPlotX() {
112 |     if(!resultMade)
113 |         makeResults();
114 | 
115 |     stringstream ss;
116 |     map<string, uint32>::iterator iter;
117 |     int first = true;
118 |     for(iter = mResults.begin(); iter != mResults.end(); iter++) {
119 |         if(first) {
120 |             first = false;
121 |         } else 
122 |             ss << ",";
123 | 
124 |         ss << "\"" << iter->first << "\"";
125 |     }
126 |     return ss.str();
127 | }
128 | 
129 | string Kmer::getPlotY() {
130 |     if(!resultMade)
131 |         makeResults();
132 | 
133 |     stringstream ss;
134 |     map<string, uint32>::iterator iter;
135 |     int first = true;
136 |     for(iter = mResults.begin(); iter != mResults.end(); iter++) {
137 |         if(first) {
138 |             first = false;
139 |         } else 
140 |             ss << ",";
141 | 
142 |         ss << iter->second;
143 |     }
144 |     return ss.str();
145 | }
146 | 
147 | int Kmer::getKmerCount() {
148 |     return mKmerHits.size();
149 | }
150 | 
151 | void Kmer::reportJSON(ofstream& ofs) {
152 |     if(!resultMade)
153 |         makeResults();
154 | 
155 |     map<string, uint32>::iterator iter;
156 |     int first = true;
157 |     for(iter = mResults.begin(); iter != mResults.end(); iter++) {
158 |         if(first) {
159 |             first = false;
160 |         } else 
161 |             ofs << "," << endl;
162 | 
163 |         ofs << "\t\t\t\"" << iter->first << "\"";
164 |         ofs << ":" << iter->second;
165 |     }
166 |     ofs << endl;
167 | }
168 | 
169 | uint64 Kmer::seq2uint64(string& seq, uint32 pos, uint32 len, bool& valid) {
170 |     uint64 key = 0;
171 |     for(uint32 i=0; i<len; i++) {
172 |         key = (key << 2);
173 |         switch(seq[pos +i]) {
174 |             case 'A':
175 |                 key += 0;
176 |                 break;
177 |             case 'T':
178 |                 key += 1;
179 |                 break;
180 |             case 'C':
181 |                 key += 2;
182 |                 break;
183 |             case 'G':
184 |                 key += 3;
185 |                 break;
186 |             case 'N':
187 |             default:
188 |                 valid = false;
189 |                 return 0;
190 |         }
191 |     }
192 |     valid = true;
193 |     return key;
194 | }


--------------------------------------------------------------------------------
/src/kmer.h:
--------------------------------------------------------------------------------
 1 | #ifndef KMER_H
 2 | #define KMER_H
 3 | 
 4 | // includes
 5 | #include "common.h"
 6 | #include <vector>
 7 | #include <unordered_map>
 8 | #include <map>
 9 | #include "fastareader.h"
10 | #include "options.h"
11 | #include <fstream>
12 | 
13 | using namespace std;
14 | 
// Keeps a hit counter for each registered k-mer (keyed by its packed uint64
// form) and renders the counters as console text, plot axes or JSON.
class Kmer
{
public:
    // NOTE(review): constructor and init() are defined in kmer.cpp; presumably they
    // load the k-mers from the FASTA file `filename` — confirm there.
    Kmer(string filename, Options* opt);
    ~Kmer();
    void init(string filename);
    // Increments the counter of kmer64 if it is registered; returns whether it was.
    bool add(uint64 kmer64);
    // Prints the per-entry results, the mean depth and a POSITIVE/NEGATIVE verdict
    // (mean depth vs. Options::positiveThreshold) to stderr.
    void report();
    // Mean of all hit counters, 0.0 when no k-mer is registered.
    double getMeanHit();
    // Comma-separated, double-quoted keys of mResults.
    string getPlotX();
    // Comma-separated values of mResults, same order as getPlotX().
    string getPlotY();
    // Number of registered k-mers.
    int getKmerCount();
    // Writes mResults as `"name":value` lines to ofs.
    void reportJSON(ofstream& ofs);

    // Packs seq[pos, pos+len) into a key with two bits per base (A=0,T=1,C=2,G=3);
    // `valid` is set to false (and 0 returned) if a non-ATCG character is met.
    static uint64 seq2uint64(string& seq, uint32 pos, uint32 len, bool& valid);

private:
    // Fills mResults; invoked lazily by the reporting functions when resultMade is false.
    void makeResults();

private:
    // packed k-mer -> number of hits counted by add()
    unordered_map<uint64, uint32> mKmerHits;
    FastaReader* mFastaReader;
    // presumably packed k-mer -> FASTA record name / sequence — verify in kmer.cpp
    unordered_map<uint64, string> mNames;
    unordered_map<uint64, string> mSequences;
    // result label -> value, iterated in key order by the report/plot functions
    map<string, uint32> mResults;
    Options* mOptions;
    // true once makeResults() has run
    bool resultMade;
};
43 | 
44 | 
45 | #endif


--------------------------------------------------------------------------------
/src/kmercollection.h:
--------------------------------------------------------------------------------
 1 | #ifndef ALLKMER_H
 2 | #define ALLKMER_H
 3 | 
 4 | // includes
 5 | #include "common.h"
 6 | #include <vector>
 7 | #include <unordered_map>
 8 | #include <map>
 9 | #include "fastareader.h"
10 | #include "options.h"
11 | #include "zlib/zlib.h"
12 | #include "common.h"
13 | #include <iostream>
14 | #include <fstream>
15 | #include <mutex>
16 | 
17 | #define  MTX_COUNT 100
18 | #define COLLISION_FLAG 0xFFFFFFFF
19 | 
20 | using namespace std;
21 | 
// Plain record holding summary statistics for one named entry of a k-mer collection.
// NOTE(review): field meanings below are inferred from their names; the code that
// fills them is in kmercollection.cpp — confirm there.
class KCResult {
public:
    string mName;
    uint64  mHit;          // presumably total hits
    int mMedianHit;        // presumably median hits per k-mer
    double mMeanHit;       // presumably mean hits per k-mer
    double mCoverage;      // presumably fraction of k-mers hit
    int mKmerCount;
    int mUniqueReads;
};
32 | 
// Plain record for one k-mer of a collection.
// NOTE(review): presumably mKey64 is the packed k-mer, mID the owning entry and
// mHit its hit counter — confirm in kmercollection.cpp.
class KCHit {
public:
    uint64 mKey64;
    uint32 mID;
    uint32 mHit;
};
39 | 
// Holds a collection of k-mers read from a (plain or gz-compressed) file and
// accumulates hit statistics that can be reported as text, JSON or HTML.
// NOTE(review): only the declaration is visible here; behavioural notes below are
// inferred from names and must be confirmed against kmercollection.cpp.
class KmerCollection
{
public:
    KmerCollection(string filename, Options* opt);
    ~KmerCollection();
    void init();
    void report();
    void reportJSON(ofstream& ofs);
    void reportHTML(ofstream& ofs);
    // Registers one observation of kmer64; the meaning of the returned uint32 is
    // defined in the implementation (presumably an entry id — confirm).
    uint32 add(uint64 kmer64);
    void addGenomeRead(uint32 genomeID);

    // Pack / unpack an (id, count) pair into / from a single uint32
    // (presumably using mIdBits / mIdMask — confirm).
    uint32 packIdCount(uint32 id, uint32 count);
    void unpackIdCount(uint32 data,uint32& id, uint32& count);
    void stat();

private:
    bool getLine(char* line, int maxLine);
    uint64 makeHash(uint64 key);
    bool eof();
    void makeBitAndMask();
    bool isHighConfidence(KCResult kcr);
private:
    Options* mOptions;
    // per-entry vectors, presumably all indexed by the same entry id
    vector<string> mNames;
    vector<uint64> mHits;
    vector<int> mMedianHits;
    vector<double> mMeanHits;
    vector<double> mCoverage;
    vector<int> mKmerCounts;
    vector<int> mGenomeReads;
    vector<KCResult> mResults;
    int mNumber;
    uint32 mUniqueHashNum;
    // raw arrays; ownership/lifetime is handled in the implementation file
    uint32* mHashKCH;
    KCHit* mKCHits;
    string mFilename;
    // input handles: mZipFile for gz input, mFile otherwise (selected by mZipped — confirm)
    gzFile mZipFile;
    ifstream mFile;
    bool mZipped;
    int mIdBits;
    uint32 mIdMask;
    uint32 mCountMax;
    bool mStatDone;
    uint32 mUniqueNumber;
};
86 | 
87 | 
88 | #endif


--------------------------------------------------------------------------------
/src/nucleotidetree.cpp:
--------------------------------------------------------------------------------
  1 | #include "nucleotidetree.h"
  2 | #include <sstream>
  3 | 
  4 | NucleotideNode::NucleotideNode(){
  5 |     count = 0;
  6 |     base = 'N';
  7 |     memset(children, 0, sizeof(NucleotideNode*)*8);
  8 | }
  9 | NucleotideNode::~NucleotideNode(){
 10 |     for(int i=0; i<8; i++) {
 11 |         if(children[i])
 12 |             delete children[i];
 13 |     }
 14 | }
 15 | void NucleotideNode::dfs() {
 16 |     //cerr << base;
 17 |     //cerr << count;
 18 |     printf("%c", base);
 19 |     printf("%d", count);
 20 |     bool hasChild = false;
 21 |     for(int i=0; i<8; i++) {
 22 |         if(children[i]){
 23 |             children[i]->dfs();
 24 |             hasChild = true;
 25 |         }
 26 |     }
 27 |     if(!hasChild) {
 28 |         printf("\n");
 29 |     }
 30 | }
 31 | 
 32 | NucleotideTree::NucleotideTree(Options* opt){
 33 |     mOptions = opt;
 34 |     mRoot = new NucleotideNode();
 35 | }
 36 | 
 37 | 
// Frees the root; NucleotideNode's destructor deletes its children, so this
// releases the whole tree.
NucleotideTree::~NucleotideTree(){
    delete mRoot;
}
 41 | 
 42 | void NucleotideTree::addSeq(string seq) {
 43 |     NucleotideNode* curNode = mRoot;
 44 |     for(int i=0; i<seq.length(); i++) {
 45 |         if(seq[i] == 'N')
 46 |             break;
 47 |         char base = seq[i] & 0x07;
 48 |         if(curNode->children[base] == NULL) {
 49 |             curNode->children[base] = new NucleotideNode();
 50 |             curNode->children[base]->base = seq[i];
 51 |         }
 52 |         curNode->children[base]->count++;
 53 |         curNode = curNode->children[base];
 54 |     }
 55 | }
 56 | 
 57 | string NucleotideTree::getDominantPath(bool& reachedLeaf) {
 58 |     stringstream ss;
 59 |     const double RATIO_THRESHOLD = 0.95;
 60 |     const int NUM_THRESHOLD = 50;
 61 |     NucleotideNode* curNode = mRoot;
 62 |     while(true) {
 63 |         int total = 0;
 64 |         for(int i=0; i<8; i++) {
 65 |             if(curNode->children[i] != NULL)
 66 |                 total += curNode->children[i]->count;
 67 |         }
 68 |         if(total < NUM_THRESHOLD)
 69 |             break;
 70 |         bool hasDominant = false;
 71 |         for(int i=0; i<8; i++) {
 72 |             if(curNode->children[i] == NULL)
 73 |                 continue;
 74 |             if(curNode->children[i]->count / (double)total >= RATIO_THRESHOLD) {
 75 |                 hasDominant = true;
 76 |                 ss << curNode->children[i]->base;
 77 |                 curNode = curNode->children[i];
 78 |                 break;
 79 |             }
 80 |         }
 81 |         if(!hasDominant) {
 82 |             reachedLeaf = false;
 83 |             break;
 84 |         }
 85 |     }
 86 |     return ss.str();
 87 | 
 88 | }
 89 | 
 90 | bool NucleotideTree::test() {
 91 |     NucleotideTree tree(NULL);
 92 |     for(int i=0; i<100; i++) {
 93 |         tree.addSeq("AAAATTTT");
 94 |         tree.addSeq("AAAATTTTGGGG");
 95 |         tree.addSeq("AAAATTTTGGGGCCCC");
 96 |         tree.addSeq("AAAATTTTGGGGCCAA");
 97 |     }
 98 |     tree.addSeq("AAAATTTTGGGACCCC");
 99 | 
100 |     bool reachedLeaf = true;
101 |     string path = tree.getDominantPath(reachedLeaf);
102 |     printf("%s\n", path.c_str());
103 |     return path == "AAAATTTTGGGGCC";
104 | }


--------------------------------------------------------------------------------
/src/nucleotidetree.h:
--------------------------------------------------------------------------------
 1 | #ifndef NUCLEICTREE_H
 2 | #define NUCLEICTREE_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include <memory.h>
 8 | #include "options.h"
 9 | 
10 | using namespace std;
11 | 
// (A,T,C,G,N) & 0x07 = (1,4,3,7,6)
// One node of a NucleotideTree. The destructor deletes all children, so a node
// owns its whole subtree.
class NucleotideNode{
public:
    NucleotideNode();
    ~NucleotideNode();
    // Prints this node and its subtree depth-first (debug helper).
    void dfs();

public:
    // number of inserted sequences that passed through this node
    int count;
    // character this node stands for ('N' until assigned by the tree)
    char base;
    // child slots, indexed by (character & 0x07); unused slots are NULL
    NucleotideNode* children[8];
};
24 | 
// Prefix tree over nucleotide sequences with per-node occurrence counts.
class NucleotideTree{
public:
    NucleotideTree(Options* opt);
    ~NucleotideTree();
    // Inserts seq base by base (stopping at the first 'N'), bumping node counts.
    void addSeq(string seq);
    // Returns the bases along the path on which, at every step, one child holds
    // the dominant share of counts. reachedLeaf is set to false when the walk
    // stops because no child is dominant; it is otherwise left untouched.
    string getDominantPath(bool& reachedLeaf);

    // Built-in self-test; returns true on success.
    static bool test();

private:
    // not owned
    Options* mOptions;
    // owned root node, freed in the destructor
    NucleotideNode* mRoot;
};
38 | 
39 | 
40 | #endif


--------------------------------------------------------------------------------
/src/options.h:
--------------------------------------------------------------------------------
  1 | #ifndef OPTIONS_H
  2 | #define OPTIONS_H
  3 | 
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | #include <string>
  7 | #include <vector>
  8 | #include <map>
  9 | 
 10 | using namespace std;
 11 | 
 12 | #define UMI_LOC_NONE 0
 13 | #define UMI_LOC_INDEX1 1
 14 | #define UMI_LOC_INDEX2 2
 15 | #define UMI_LOC_READ1 3
 16 | #define UMI_LOC_READ2 4
 17 | #define UMI_LOC_PER_INDEX 5
 18 | #define UMI_LOC_PER_READ 6
 19 | 
 20 | class DuplicationOptions {
 21 | public:
 22 |     DuplicationOptions() {
 23 |         enabled = true;
 24 |         keylen = 12;
 25 |         histSize = 32;
 26 |     }
 27 | public:
 28 |     bool enabled;
 29 |     int keylen;
 30 |     int histSize;
 31 | };
 32 | 
 33 | 
 34 | class LowComplexityFilterOptions {
 35 | public:
 36 |     LowComplexityFilterOptions() {
 37 |         enabled = false;
 38 |         threshold = 0.3;
 39 |     }
 40 | public:
 41 |     bool enabled;
 42 |     double threshold;
 43 | };
 44 | 
 45 | class PolyGTrimmerOptions {
 46 | public:
 47 |     PolyGTrimmerOptions() {
 48 |         enabled = false;
 49 |         minLen = 10;
 50 |     }
 51 | public:
 52 |     bool enabled;
 53 |     int minLen;
 54 | };
 55 | 
 56 | class PolyXTrimmerOptions {
 57 | public:
 58 |     PolyXTrimmerOptions() {
 59 |         enabled = false;
 60 |         minLen = 10;
 61 |     }
 62 | public:
 63 |     bool enabled;
 64 |     int minLen;
 65 | };
 66 | 
 67 | class UMIOptions {
 68 | public:
 69 |     UMIOptions() {
 70 |         enabled = false;
 71 |         location = UMI_LOC_NONE;
 72 |         length = 0;
 73 |         skip = 0;
 74 |     }
 75 | public:
 76 |     bool enabled;
 77 |     int location;
 78 |     int length;
 79 |     int skip;
 80 |     string prefix;
 81 |     string separator;
 82 | };
 83 | 
 84 | class CorrectionOptions {
 85 | public:
 86 |     CorrectionOptions() {
 87 |         enabled = false;
 88 |     }
 89 | public:
 90 |     bool enabled;
 91 | };
 92 | 
 93 | class QualityCutOptions {
 94 | public:
 95 |     QualityCutOptions() {
 96 |         enabledFront = false;
 97 |         enabledTail = false;
 98 |         enabledRight = false;
 99 |         windowSizeShared = 4;
100 |         qualityShared = 20;
101 |         windowSizeFront = windowSizeShared;
102 |         qualityFront = qualityShared;
103 |         windowSizeTail = windowSizeShared;
104 |         qualityTail = qualityShared;
105 |         windowSizeRight = windowSizeShared;
106 |         qualityRight = qualityShared;
107 |     }
108 | public:
109 |     // enable 5' cutting by quality
110 |     bool enabledFront;
111 |     // enable 3' cutting by quality
112 |     bool enabledTail;
113 |     // enable agressive cutting mode
114 |     bool enabledRight;
115 |     // the sliding window size
116 |     int windowSizeShared;
117 |     // the mean quality requirement
118 |     int qualityShared;
119 |     // the sliding window size for cutting by quality in 5'
120 |     int windowSizeFront;
121 |     // the mean quality requirement for cutting by quality in 5'
122 |     int qualityFront;
123 |     // the sliding window size for cutting by quality in 3'
124 |     int windowSizeTail;
125 |     // the mean quality requirement for cutting by quality in 3'
126 |     int qualityTail;
127 |     // the sliding window size for cutting by quality in aggressive mode
128 |     int windowSizeRight;
129 |     // the mean quality requirement for cutting by quality in aggressive mode
130 |     int qualityRight;
131 | };
132 | 
// Settings for adapter trimming. Defaults: trimming enabled, no adapter
// sequence supplied for either read or via FASTA, PE adapter detection off.
class AdapterOptions {
public:
    AdapterOptions() {
        enabled = true;
        hasSeqR1 = false;
        hasSeqR2 = false;
        // previously left uninitialised, so reading it before assignment was
        // undefined behaviour
        hasFasta = false;
        detectAdapterForPE = false;
    }
public:
    // adapter trimming enabled
    bool enabled;
    // adapter sequence for read1 / read2
    string sequence;
    string sequenceR2;
    // adapters found by auto-detection for read1 / read2
    string detectedAdapter1;
    string detectedAdapter2;
    // adapter sequences loaded from fastaFile
    vector<string> seqsInFasta;
    string fastaFile;
    // whether the corresponding adapter source has been provided
    bool hasSeqR1;
    bool hasSeqR2;
    bool hasFasta;
    // auto-detect adapters for paired-end data
    bool detectAdapterForPE;
};
154 | 
// Fixed-position trimming settings; every value defaults to 0.
class TrimmingOptions {
public:
    TrimmingOptions()
        : front1(0), tail1(0), front2(0), tail2(0), maxLen1(0), maxLen2(0) {
    }
public:
    // number of leading cycles trimmed from read1
    int front1;
    // number of trailing cycles trimmed from read1
    int tail1;
    // number of leading cycles trimmed from read2
    int front2;
    // number of trailing cycles trimmed from read2
    int tail2;
    // max length of read1
    int maxLen1;
    // max length of read2
    int maxLen2;
};
179 | 
// Settings for per-read quality filtering. Defaults: enabled, qualified
// quality '0', at most 40% unqualified bases, at most 5 N bases, no average
// quality requirement.
class QualityFilteringOptions {
public:
    QualityFilteringOptions() {
        enabled = true;
        // '0' = Q15
        qualifiedQual = '0';
        unqualifiedPercentLimit = 40;
        nBaseLimit = 5;
        // previously left uninitialised; 0 means no read can fall below it
        avgQualReq = 0;
    }
public:
    // quality filter enabled
    bool enabled;
    // a base whose quality character is below qualifiedQual counts as a low quality base
    char qualifiedQual;
    // if the percentage of low quality bases exceeds unqualifiedPercentLimit, the read is discarded
    int unqualifiedPercentLimit;
    // if the number of N bases exceeds nBaseLimit, the read is discarded
    int nBaseLimit;
    // if the average quality score is below avgQualReq, the read is discarded
    int avgQualReq;
};
201 | 
// Settings for read length filtering. Defaults: disabled, minimum length 15,
// no maximum (0).
class ReadLengthFilteringOptions {
public:
    ReadLengthFilteringOptions()
        : enabled(false), requiredLength(15), maxLength(0) {
    }
public:
    // length filter enabled
    bool enabled;
    // reads shorter than requiredLength are discarded
    int requiredLength;
    // upper length limit, 0 for no limitation
    int maxLength;
};
217 | 
// Aggregates every run-time setting of the program: input/output paths,
// per-feature option groups and the k-mer / genome detection thresholds.
// Member functions are defined in options.cpp.
class Options{
public:
    Options();
    void init();
    bool isPaired();
    bool validate();
    bool adapterCuttingEnabled();
    bool polyXTrimmingEnabled();
    string getAdapter1();
    string getAdapter2();
    bool shallDetectAdapter(bool isR2 = false);
    void loadFastaAdapters();

public:
    // file name of read1 input
    string in1;
    // file name of read2 input
    string in2;
    // file name of read1 output
    string out1;
    // file name of read2 output
    string out2;
    // genome FASTA file
    string genomeFile;
    // kmer FASTA file
    string kmerFile;
    // kmer collection file
    string kmerCollectionFile;
    // json file
    string jsonFile;
    // html file
    string htmlFile;
    // html report title
    string reportTitle;
    // compression level
    int compression;
    // the input file is using phred64 quality scoring
    bool phred64;
    // do not rewrite existing files
    bool dontOverwrite;
    // read STDIN
    bool inputFromSTDIN;
    // write STDOUT
    bool outputToSTDOUT;
    // the input R1 file is interleaved
    bool interleavedInput;
    // only process first N reads
    int readsToProcess;
    // worker thread number
    int thread;
    // trimming options
    TrimmingOptions trim;
    // quality filtering options
    QualityFilteringOptions qualfilter;
    // length filtering options
    ReadLengthFilteringOptions lengthFilter;
    // adapter options
    AdapterOptions adapter;
    // options for quality cutting
    QualityCutOptions qualityCut;
    // options for base correction
    CorrectionOptions correction;
    // options for UMI
    UMIOptions umi;
    // 3' end polyG trimming, default for Illumina NextSeq/NovaSeq
    PolyGTrimmerOptions polyGTrim;
    // 3' end polyX trimming
    PolyXTrimmerOptions polyXTrim;
    // NOTE(review): presumably the sequence lengths of read1 / read2 — confirm where they are set
    int seqLen1;
    int seqLen2;
    // low complexity filtering
    LowComplexityFilterOptions complexityFilter;
    // options for duplication profiling
    DuplicationOptions duplicate;
    // maximum insert size (presumably the upper bound of the insert size statistics — confirm)
    int insertSizeMax;
    // overlap analysis threshold
    int overlapRequire;
    int overlapDiffLimit;
    int overlapDiffPercentLimit;
    // output debug information
    bool verbose;
    // the length of KMER, default is 25
    int kmerKeyLen;
    // the threshold of positive result
    double positiveThreshold;
    // the threshold of depth for a region considered as covered
    double depthThreshold;
    // if ed(read, genome) <= edThreshold, then think it as a match
    int edThreshold;
    // the bin size for stat coverage and edit distance
    int statsBinSize;
    // read with length >= longReadThreshold will be considered as long reads
    int longReadThreshold;
    // long reads will be split to reads with length <= segmentLength
    int segmentLength;
    //  coverage threshold to be reported in kmer collection results
    double kcCoverageThreshold;
    // coverage for high-confidence KCR
    double kcCoverageHighConfidence;
    // median hit for high-confidence KCR
    double kcMedianHitHighConfidence;

};
322 | 
323 | #endif


--------------------------------------------------------------------------------
/src/overlapanalysis.cpp:
--------------------------------------------------------------------------------
  1 | #include "overlapanalysis.h"
  2 | 
// Nothing to set up: the class declares no data members.
OverlapAnalysis::OverlapAnalysis(){
}
  5 | 
  6 | 
// Nothing to release.
OverlapAnalysis::~OverlapAnalysis(){
}
  9 | 
// Convenience overload: runs the Sequence-based analyze() on the sequences of
// the two reads, forwarding the limits unchanged.
OverlapResult OverlapAnalysis::analyze(Read* r1, Read* r2, int overlapDiffLimit, int overlapRequire, double diffPercentLimit) {
    return analyze(r1->mSeq, r2->mSeq, overlapDiffLimit, overlapRequire, diffPercentLimit);
}
 13 | 
 14 | // ported from the python code of AfterQC
// Searches for an overlap between r1 and ~r2 (presumably the reverse complement
// of r2, see Sequence::operator~ — confirm) by sliding one against the other.
//
// diffLimit        : absolute cap on mismatches inside the overlap
// overlapRequire   : offsets leaving fewer than this many overlapping bases are not tried
// diffPercentLimit : cap on mismatches as a fraction of the overlap length;
//                    the effective limit is the smaller of the two caps
//
// Returns the first acceptable alignment (overlapped = true, with its offset,
// overlap length and mismatch count); offset is >= 0 for the forward pass and
// <= 0 for the reverse pass. If none is found, overlapped is false and the
// other fields are 0.
OverlapResult OverlapAnalysis::analyze(Sequence& r1, Sequence& r2, int diffLimit, int overlapRequire, double diffPercentLimit) {
    Sequence rcr2 = ~r2;
    int len1 = r1.length();
    int len2 = rcr2.length();
    // use the pointer directly for speed
    const char* str1 = r1.mStr.c_str();
    const char* str2 = rcr2.mStr.c_str();

    // the mismatch limit is only enforced strictly within the first
    // complete_compare_require compared positions (see the checks below)
    int complete_compare_require = 50;

    int overlap_len = 0;
    int offset = 0;
    int diff = 0;

    // forward
    // a match of less than overlapRequire is considered as unconfident
    while (offset < len1-overlapRequire) {
        // the overlap length of r1 & r2 when r2 is move right for offset
        overlap_len = min(len1 - offset, len2);
        int overlapDiffLimit = min(diffLimit, (int)(overlap_len * diffPercentLimit));

        diff = 0;
        int i = 0;
        for (i=0; i<overlap_len; i++) {
            if (str1[offset + i] != str2[i]){
                diff += 1;
                // give up on this offset early only while still inside the strict window
                if (diff > overlapDiffLimit && i < complete_compare_require)
                    break;
            }
        }
        
        // accept when within the limit, or when the limit was exceeded only
        // after more than complete_compare_require positions had been compared
        if (diff <= overlapDiffLimit || (diff > overlapDiffLimit && i>complete_compare_require)){
            OverlapResult ov;
            ov.overlapped = true;
            ov.offset = offset;
            ov.overlap_len = overlap_len;
            ov.diff = diff;
            return ov;
        }

        offset += 1;
    }


    // reverse
    // in this case, the adapter is sequenced since TEMPLATE_LEN < SEQ_LEN
    // check if distance can get smaller if offset goes negative
    // this only happens when insert DNA is shorter than sequencing read length, and some adapter/primer is sequenced but not trimmed cleanly
    // we go reversely
    offset = 0;
    while (offset > -(len2-overlapRequire)){
        // the overlap length of r1 & r2 when r2 is move right for offset
        overlap_len = min(len1,  len2- abs(offset));
        int overlapDiffLimit = min(diffLimit, (int)(overlap_len * diffPercentLimit));

        diff = 0;
        int i = 0;
        for (i=0; i<overlap_len; i++) {
            // offset is <= 0 here, so -offset is the start position inside str2
            if (str1[i] != str2[-offset + i]){
                diff += 1;
                if (diff > overlapDiffLimit && i < complete_compare_require)
                    break;
            }
        }
        
        // same acceptance rule as in the forward pass
        if (diff <= overlapDiffLimit || (diff > overlapDiffLimit && i>complete_compare_require)){
            OverlapResult ov;
            ov.overlapped = true;
            ov.offset = offset;
            ov.overlap_len = overlap_len;
            ov.diff = diff;
            return ov;
        }

        offset -= 1;
    }

    // no acceptable alignment in either direction
    OverlapResult ov;
    ov.overlapped = false;
    ov.offset = ov.overlap_len = ov.diff = 0;
    return ov;
}
 97 | 
 98 | Read* OverlapAnalysis::merge(Read* r1, Read* r2, OverlapResult ov) {
 99 |     int ol = ov.overlap_len;
100 |     if(!ov.overlapped)
101 |         return NULL;
102 | 
103 |     int len1 = ol + max(0, ov.offset);
104 |     int len2 = 0; 
105 |     if(ov.offset > 0)
106 |         len2 = r2->length() - ol;
107 | 
108 |     Read* rr2 = r2->reverseComplement();
109 |     string mergedSeq = r1->mSeq.mStr.substr(0, len1);
110 |     if(ov.offset > 0) {
111 |         mergedSeq += rr2->mSeq.mStr.substr(ol, len2);
112 |     }
113 | 
114 |     string mergedQual = r1->mQuality.substr(0, len1);
115 |     if(ov.offset > 0) {
116 |         mergedQual += rr2->mQuality.substr(ol, len2);
117 |     }
118 | 
119 |     delete rr2;
120 | 
121 |     string name = r1->mName + " merged_" + to_string(len1) + "_" + to_string(len2);
122 |     Read* mergedRead = new Read(name, mergedSeq, r1->mStrand, mergedQual);
123 | 
124 |     return mergedRead;
125 | }
126 | 
127 | bool OverlapAnalysis::test(){
128 |     //Sequence r1("CAGCGCCTACGGGCCCCTTTTTCTGCGCGACCGCGTGGCTGTGGGCGCGGATGCCTTTGAGCGCGGTGACTTCTCACTGCGTATCGAGCCGCTGGAGGTCTCCC");
129 |     //Sequence r2("ACCTCCAGCGGCTCGATACGCAGTGAGAAGTCACCGCGCTCAAAGGCATCCGCGCCCACAGCCACGCGGTCGCGCAGAAAAAGGGGCCCGTAGGCGCGGCTCCC");
130 | 
131 |     Sequence r1("CAGCGCCTACGGGCCCCTTTTTCTGCGCGACCGCGTGGCTGTGGGCGCGGATGCCTTTGAGCGCGGTGACTTCTCACTGCGTATCGAGC");
132 |     Sequence r2("ACCTCCAGCGGCTCGATACGCAGTGAGAAGTCACCGCGCTCAAAGGCATCCGCGCCCACAGCCACGCGGTCGCGCAGAAAAAGGGGTCC");
133 |     string qual1("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF");
134 |     string qual2("#########################################################################################");
135 |     
136 |     OverlapResult ov = OverlapAnalysis::analyze(r1, r2, 2, 30, 0.2);
137 | 
138 |     Read read1("name1", r1, "+", qual1);
139 |     Read read2("name2", r2, "+", qual2);
140 | 
141 |     Read* mergedRead = OverlapAnalysis::merge(&read1, &read2, ov);
142 |     mergedRead->print();
143 | 
144 |     return ov.overlapped && ov.offset == 10 && ov.overlap_len == 79 && ov.diff == 1;
145 | }


--------------------------------------------------------------------------------
/src/overlapanalysis.h:
--------------------------------------------------------------------------------
 1 | #ifndef OVERLAP_ANALYSIS_H
 2 | #define OVERLAP_ANALYSIS_H
 3 | 
 4 | #include <iostream>
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <string>
 8 | #include <vector>
 9 | #include "common.h"
10 | #include "options.h"
11 | #include "read.h"
12 | 
13 | using namespace std;
14 | 
// Outcome of OverlapAnalysis::analyze().
class OverlapResult {
public:
    // true when an acceptable overlap was found; the other fields are 0 otherwise
    bool overlapped;
    // shift at which the overlap was found: >= 0 from the forward pass, <= 0 from the reverse pass
    int offset;
    // number of overlapping bases at that shift
    int overlap_len;
    // number of mismatching bases counted inside the overlap
    int diff;
};
22 | 
// Stateless helper for detecting and merging the overlap of a read pair; all
// operations are static.
class OverlapAnalysis{
public:
    OverlapAnalysis();
    ~OverlapAnalysis();

    // Looks for an overlap between r1 and ~r2. The mismatch limit used is
    // min(diffLimit, overlap length * diffPercentLimit); shifts leaving fewer
    // than overlapRequire overlapping bases are not tried.
    static OverlapResult analyze(Sequence&  r1, Sequence&  r2, int diffLimit, int overlapRequire, double diffPercentLimit);
    // Same as above, applied to the sequences of two reads.
    static OverlapResult analyze(Read* r1, Read* r2, int diffLimit, int overlapRequire, double diffPercentLimit);
    // Builds a new merged Read from an overlapping pair (caller owns the
    // result); returns NULL when ov.overlapped is false.
    static Read* merge(Read* r1, Read* r2, OverlapResult ov);

public:
    // Built-in self-test; returns true on success.
    static bool test();

};
36 | 
37 | #endif


--------------------------------------------------------------------------------
/src/peprocessor.h:
--------------------------------------------------------------------------------
 1 | #ifndef PE_PROCESSOR_H
 2 | #define PE_PROCESSOR_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include "read.h"
 8 | #include <cstdlib>
 9 | #include <condition_variable>
10 | #include <mutex>
11 | #include <thread>
12 | #include "options.h"
13 | #include "threadconfig.h"
14 | #include "filter.h"
15 | #include "umiprocessor.h"
16 | #include "overlapanalysis.h"
17 | #include "writerthread.h"
18 | #include "duplicate.h"
19 | #include "virusdetector.h"
20 | 
21 | 
22 | using namespace std;
23 | 
// A batch of read pairs handed around as one unit.
struct ReadPairPack {
    // array of `count` ReadPair pointers (ownership is handled by the processor — confirm in peprocessor.cpp)
    ReadPair** data;
    int count;
};
28 | 
29 | typedef struct ReadPairPack ReadPairPack;
30 | 
// Buffer of packs shared between the producing and consuming side.
// NOTE(review): readPos / writePos are atomics, presumably the consumer and
// producer cursors into packBuffer — confirm in peprocessor.cpp.
struct ReadPairRepository {
    ReadPairPack** packBuffer;
    atomic_long readPos;
    atomic_long writePos;
    //std::mutex mtx;
    //std::mutex readCounterMtx;
    //std::condition_variable repoNotFull;
    //std::condition_variable repoNotEmpty;
};
40 | 
41 | typedef struct ReadPairRepository ReadPairRepository;
42 | 
// Drives the processing of paired-end input. Only the declaration is visible
// here; see peprocessor.cpp for the behaviour of each step.
// NOTE(review): the producer/consumer/writer task names suggest a threaded
// pipeline around mRepo — confirm in the implementation.
class PairEndProcessor{
public:
    PairEndProcessor(Options* opt);
    ~PairEndProcessor();
    // Entry point; the meaning of the returned bool is defined in peprocessor.cpp.
    bool process();

private:
    bool processPairEnd(ReadPairPack* pack, ThreadConfig* config);
    bool processRead(Read* r, ReadPair* originalRead, bool reversed);
    void initPackRepository();
    void destroyPackRepository();
    void producePack(ReadPairPack* pack);
    void consumePack(ThreadConfig* config);
    void producerTask();
    void consumerTask(ThreadConfig* config);
    void initConfig(ThreadConfig* config);
    void initOutput();
    void closeOutput();
    void statInsertSize(Read* r1, Read* r2, OverlapResult& ov, int frontTrimmed1 = 0, int frontTrimmed2 = 0);
    int getPeakInsertSize();
    void writeTask(WriterThread* config);

private:
    // pack buffer shared between producePack() and consumePack()
    ReadPairRepository mRepo;
    atomic_bool mProduceFinished;
    atomic_int mFinishedThreads;
    std::mutex mOutputMtx;
    std::mutex mInputMtx;
    Options* mOptions;
    Filter* mFilter;
    gzFile mZipFile1;
    gzFile mZipFile2;
    ofstream* mOutStream1;
    ofstream* mOutStream2;
    UmiProcessor* mUmiProcessor;
    // presumably a histogram indexed by insert size — confirm
    long* mInsertSizeHist;
    // writers for the read1 / read2 outputs (presumably) — confirm
    WriterThread* mLeftWriter;
    WriterThread* mRightWriter;
    Duplicate* mDuplicate;
    VirusDetector* mVirusDetector;
};
84 | 
85 | 
86 | #endif


--------------------------------------------------------------------------------
/src/polyx.cpp:
--------------------------------------------------------------------------------
  1 | #include "polyx.h"
  2 | #include "common.h"
  3 | 
// Nothing to set up.
PolyX::PolyX(){
}
  6 | 
  7 | 
// Nothing to release.
PolyX::~PolyX(){
}
 10 | 
// Paired-end convenience: applies the single-read polyG trimming to r1 and then r2.
void PolyX::trimPolyG(Read* r1, Read* r2, FilterResult* fr, int compareReq) {
    trimPolyG(r1, fr, compareReq);
    trimPolyG(r2, fr, compareReq);
}
 15 | 
 16 | void PolyX::trimPolyG(Read* r, FilterResult* fr, int compareReq) {
 17 |     const int allowOneMismatchForEach = 8;
 18 |     const int maxMismatch = 5;
 19 | 
 20 |     const char* data = r->mSeq.mStr.c_str();
 21 | 
 22 |     int rlen = r->length();
 23 | 
 24 |     int mismatch = 0;
 25 |     int i = 0;
 26 |     int firstGPos = rlen - 1;
 27 |     for(i=0; i< rlen; i++) {
 28 |         if(data[rlen - i - 1] != 'G') {
 29 |             mismatch++;
 30 |         } else {
 31 |             firstGPos = rlen - i -1;
 32 |         }
 33 | 
 34 |         int allowedMismatch = (i+1)/allowOneMismatchForEach;
 35 |         if(mismatch > maxMismatch || (mismatch>allowedMismatch && i>= compareReq-1) )
 36 |             break;
 37 |     }
 38 | 
 39 |     if(i >= compareReq) {
 40 |         r->resize(firstGPos);
 41 |     }
 42 | }
 43 | 
// Paired-end convenience: applies the single-read polyX trimming to r1 and then r2.
void PolyX::trimPolyX(Read* r1, Read* r2, FilterResult* fr, int compareReq) {
    trimPolyX(r1, fr, compareReq);
    trimPolyX(r2, fr, compareReq);
}
 48 | 
 49 | void PolyX::trimPolyX(Read* r, FilterResult* fr, int compareReq) {
 50 |     const int allowOneMismatchForEach = 8;
 51 |     const int maxMismatch = 5;
 52 | 
 53 |     const char* data = r->mSeq.mStr.c_str();
 54 | 
 55 |     int rlen = r->length();
 56 | 
 57 | 
 58 |     int atcgNumbers[4] = {0, 0, 0, 0};
 59 |     int pos = 0;
 60 |     for(pos=0; pos<rlen; pos++) {
 61 |         switch(data[rlen - pos - 1]) {
 62 |             case 'A':
 63 |                 atcgNumbers[0]++;
 64 |                 break;
 65 |             case 'T':
 66 |                 atcgNumbers[1]++;
 67 |                 break;
 68 |             case 'C':
 69 |                 atcgNumbers[2]++;
 70 |                 break;
 71 |             case 'G':
 72 |                 atcgNumbers[3]++;
 73 |                 break;
 74 |             case 'N':
 75 |                 atcgNumbers[0]++;
 76 |                 atcgNumbers[1]++;
 77 |                 atcgNumbers[2]++;
 78 |                 atcgNumbers[3]++;
 79 |                 break;
 80 |             default:
 81 |                 break;
 82 |         }
 83 | 
 84 |         int cmp = (pos+1);
 85 |         int allowedMismatch = min(maxMismatch, cmp/allowOneMismatchForEach);
 86 | 
 87 |         bool needToBreak = true;
 88 |         for(int b=0; b<4; b++) {
 89 |             if(cmp - atcgNumbers[b] <= allowedMismatch)
 90 |                 needToBreak = false;
 91 |         }
 92 |         if(needToBreak && (pos >= allowOneMismatchForEach || pos+1 >= compareReq-1)) {
 93 |             break;
 94 |         }
 95 |     }
 96 | 
 97 |     // has polyX
 98 |     if(pos+1 >= compareReq) {
 99 |         // find the poly
100 |         int poly;
101 |         int maxCount = -1;
102 |         for(int b=0; b<4; b++) {
103 |             if(atcgNumbers[b] > maxCount){
104 |                 maxCount = atcgNumbers[b];
105 |                 poly = b;
106 |             }
107 |         }
108 |         char polyBase = ATCG_BASES[poly];
109 |         while(data[rlen - pos - 1] != polyBase && pos>=0)
110 |             pos--;
111 | 
112 |         r->resize(rlen - pos - 1);
113 |         if(fr)
114 |           fr->addPolyXTrimmed(poly, pos + 1);
115 |     }
116 | }
117 | 
118 | bool PolyX::test() {
119 | 
120 |     Read r("@name",
121 |         "ATTTTAAAAAAAAAATAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAT",
122 |         "+",
123 |         "///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////E");
124 | 
125 |     FilterResult fr(NULL, false);
126 |     PolyX::trimPolyX(&r, &fr, 10);
127 |     r.print();
128 | 
129 |     return r.mSeq.mStr == "ATTTT" && fr.getTotalPolyXTrimmedReads() == 1 && fr.getTotalPolyXTrimmedBases() == 51;
130 | }


--------------------------------------------------------------------------------
/src/polyx.h:
--------------------------------------------------------------------------------
 1 | #ifndef POLY_X_H
 2 | #define POLY_X_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include "overlapanalysis.h"
 8 | #include "filterresult.h"
 9 | #include "options.h"
10 | 
11 | using namespace std;
12 | 
// Collection of static helpers that trim polyG / polyX runs from reads.
// Only declarations live here; the definitions are in polyx.cpp.
class PolyX{
public:
    PolyX();
    ~PolyX();

    // polyG trimming for a read pair (r1, r2) and for a single read.
    static void trimPolyG(Read* r1, Read* r2, FilterResult* fr, int compareReq);
    static void trimPolyG(Read* r1, FilterResult* fr, int compareReq);
    // polyX trimming for a read pair (r1, r2) and for a single read.
    static void trimPolyX(Read* r1, Read* r2, FilterResult* fr, int compareReq);
    static void trimPolyX(Read* r1, FilterResult* fr, int compareReq);
    // built-in self test; returns true on success
    static bool test();


};
26 | 
27 | 
28 | #endif


--------------------------------------------------------------------------------
/src/processor.cpp:
--------------------------------------------------------------------------------
 1 | #include "processor.h"
 2 | #include "peprocessor.h"
 3 | #include "seprocessor.h"
 4 | 
 5 | Processor::Processor(Options* opt){
 6 |     mOptions = opt;
 7 | }
 8 | 
 9 | 
// Nothing to release: the destructor body is empty, so mOptions is left untouched.
Processor::~Processor(){
}
12 | 
13 | bool Processor::process() {
14 |     if(mOptions->isPaired()) {
15 |         PairEndProcessor p(mOptions);
16 |         p.process();
17 |     } else {
18 |         SingleEndProcessor p(mOptions);
19 |         p.process();
20 |     }
21 | 
22 |     return true;
23 | }


--------------------------------------------------------------------------------
/src/processor.h:
--------------------------------------------------------------------------------
 1 | #ifndef PROCESSOR_H
 2 | #define PROCESSOR_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include "options.h"
 8 | 
 9 | using namespace std;
10 | 
// Entry point that runs the processing pipeline selected by the options.
class Processor{
public:
    // opt is stored as a raw pointer and used by process()
    Processor(Options* opt);
    ~Processor();
    // runs the pipeline; see processor.cpp for the dispatch logic
    bool process();

private:
    Options* mOptions;
};
20 | 
21 | 
22 | #endif


--------------------------------------------------------------------------------
/src/read.cpp:
--------------------------------------------------------------------------------
  1 | #include "read.h"
  2 | #include <sstream>
  3 | #include "util.h"
  4 | 
  5 | Read::Read(string name, string seq, string strand, string quality, bool phred64){
  6 | 	mName = name;
  7 | 	mSeq = Sequence(seq);
  8 | 	mStrand = strand;
  9 | 	mQuality = quality;
 10 | 	mHasQuality = true;
 11 | 	if(phred64)
 12 | 		convertPhred64To33();
 13 | }
 14 | 
 15 | Read::Read(string name, string seq, string strand){
 16 | 	mName = name;
 17 | 	mSeq = Sequence(seq);
 18 | 	mStrand = strand;
 19 | 	mHasQuality = false;
 20 | }
 21 | 
 22 | Read::Read(string name, Sequence seq, string strand, string quality, bool phred64){
 23 | 	mName = name;
 24 | 	mSeq = seq;
 25 | 	mStrand = strand;
 26 | 	mQuality = quality;
 27 | 	mHasQuality = true;
 28 | 	if(phred64)
 29 | 		convertPhred64To33();
 30 | }
 31 | 
 32 | Read::Read(string name, Sequence seq, string strand){
 33 | 	mName = name;
 34 | 	mSeq = seq;
 35 | 	mStrand = strand;
 36 | 	mHasQuality = false;
 37 | }
 38 | 
 39 | void Read::convertPhred64To33(){
 40 | 	for(int i=0; i<mQuality.length(); i++) {
 41 | 		mQuality[i] = max(33, mQuality[i] - (64-33));
 42 | 	}
 43 | }
 44 | 
 45 | Read::Read(Read &r) {
 46 | 	mName = r.mName;
 47 | 	mSeq = r.mSeq;
 48 | 	mStrand = r.mStrand;
 49 | 	mQuality = r.mQuality;
 50 | 	mHasQuality = r.mHasQuality;
 51 | }
 52 | 
 53 | void Read::print(){
 54 | 	std::cerr << mName << endl;
 55 | 	std::cerr << mSeq.mStr << endl;
 56 | 	std::cerr << mStrand << endl;
 57 | 	if(mHasQuality)
 58 | 		std::cerr << mQuality << endl;
 59 | }
 60 | 
 61 | void Read::printFile(ofstream& file){
 62 | 	file << mName << endl;
 63 | 	file << mSeq.mStr << endl;
 64 | 	file << mStrand << endl;
 65 | 	if(mHasQuality)
 66 | 		file << mQuality << endl;
 67 | }
 68 | 
 69 | Read* Read::reverseComplement(){
 70 | 	Sequence seq = ~mSeq;
 71 | 	string qual;
 72 | 	qual.assign(mQuality.rbegin(), mQuality.rend());
 73 | 	string strand = (mStrand=="+") ? "-" : "+";
 74 | 	return new Read(mName, seq, strand, qual);
 75 | }
 76 | 
 77 | void Read::resize(int len) {
 78 | 	if(len > length() || len<0)
 79 | 		return ;
 80 | 	mSeq.mStr.resize(len);
 81 | 	mQuality.resize(len);
 82 | }
 83 |    
 84 | void Read::trimFront(int len){
 85 | 	len = min(length()-1, len);
 86 | 	mSeq.mStr = mSeq.mStr.substr(len, mSeq.mStr.length() - len);
 87 | 	mQuality = mQuality.substr(len, mQuality.length() - len);
 88 | }
 89 | 
 90 | string Read::lastIndex(){
 91 | 	int len = mName.length();
 92 | 	if(len<5)
 93 | 		return "";
 94 | 	for(int i=len-3;i>=0;i--){
 95 | 		if(mName[i]==':' || mName[i]=='+'){
 96 | 			return mName.substr(i+1, len-i);
 97 | 		}
 98 | 	}
 99 | 	return "";
100 | }
101 | 
102 | string Read::firstIndex(){
103 | 	int len = mName.length();
104 | 	int end = len;
105 | 	if(len<5)
106 | 		return "";
107 | 	for(int i=len-3;i>=0;i--){
108 | 		if(mName[i]=='+')
109 | 			end = i-1;
110 | 		if(mName[i]==':'){
111 | 			return mName.substr(i+1, end-i);
112 | 		}
113 | 	}
114 | 	return "";
115 | }
116 | 
117 | int Read::lowQualCount(int qual){
118 | 	int count = 0;
119 | 	for(int q=0;q<mQuality.size();q++){
120 | 		if(mQuality[q] < qual + 33)
121 | 			count++;
122 | 	}
123 | 	return count;
124 | }
125 | 
// Number of bases in the read, as reported by the underlying Sequence.
int Read::length(){
	return mSeq.length();
}
129 | 
130 | string Read::toString() {
131 | 	return mName + "\n" + mSeq.mStr + "\n" + mStrand + "\n" + mQuality + "\n";
132 | }
133 | 
134 | string Read::toStringWithTag(string tag) {
135 | 	return mName + " " + tag + "\n" + mSeq.mStr + "\n" + mStrand + "\n" + mQuality + "\n";
136 | }
137 | 
138 | bool Read::fixMGI() {
139 | 	int len = mName.length();
140 | 	if(mName[len-1]=='1' || mName[len-1]=='2') {
141 | 		if(mName[len-2] == '/') {
142 | 			mName = mName.substr(0, len-2) + " " + mName.substr(len-2, 2);
143 | 			return true;
144 | 		}
145 | 	}
146 | 	return false;
147 | }
148 | 
149 | vector<Read*> Read::split(int segment) {
150 | 	vector<Read*> ret;
151 | 	int splitted = 0;
152 | 	string name = mName;
153 | 	string strand = mStrand;
154 | 	while(splitted < length()) {
155 | 		int len = min(segment, length() - splitted);
156 | 		string seq = mSeq.mStr.substr(splitted, len);
157 | 		string quality = mQuality.substr(splitted, len);
158 | 		Read* r = new Read(name, seq, strand, quality);
159 | 		ret.push_back(r);
160 | 		splitted += len;
161 | 	}
162 | 	return ret;
163 | }
164 | 
165 | bool Read::test(){
166 | 	Read r("@NS500713:64:HFKJJBGXY:1:11101:20469:1097 1:N:0:TATAGCCT+GGTCCCGA",
167 | 		"CTCTTGGACTCTAACACTGTTTTTTCTTATGAAAACACAGGAGTGATGACTAGTTGAGTGCATTCTTATGAGACTCATAGTCATTCTATGATGTAGTTTTCCTTAGGAGGACATTTTTTACATGAAATTATTAACCTAAATAGAGTTGATC",
168 | 		"+",
169 | 		"AAAAA6EEEEEEEEEEEEEEEEE#EEEEEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE<EEEEAEEEEEEEEEEEEEEEAEEE/EEEEEEEEEEAAEAEAAEEEAEEAA");
170 | 	string idx = r.lastIndex();
171 | 	return idx == "GGTCCCGA";
172 | }
173 | 
174 | ReadPair::ReadPair(Read* left, Read* right){
175 | 	mLeft = left;
176 | 	mRight = right;
177 | }
178 | 
179 | ReadPair::~ReadPair(){
180 | 	if(mLeft){
181 | 		delete mLeft;
182 | 		mLeft = NULL;
183 | 	}
184 | 	if(mRight){
185 | 		delete mRight;
186 | 		mRight = NULL;
187 | 	}
188 | }
189 | 
// Tries to merge the pair into one read by overlapping the tail of the left
// read with the head of the reverse-complemented right read.
//
// Candidate overlap lengths are tried from MIN_OVERLAP upwards and the first
// acceptable one is used. On success a newly allocated merged read is
// returned (caller owns it); otherwise NULL. No insertion/deletion handling
// is attempted.
Read* ReadPair::fastMerge(){
	Read* rcRight = mRight->reverseComplement();
	int len1 = mLeft->length();
	int len2 = rcRight->length();
	// use the pointer directly for speed
	const char* str1 = mLeft->mSeq.mStr.c_str();
	const char* str2 = rcRight->mSeq.mStr.c_str();
	const char* qual1 = mLeft->mQuality.c_str();
	const char* qual2 = rcRight->mQuality.c_str();

	// we require at least 30 bp overlapping to merge a pair
	const int MIN_OVERLAP = 30;
	bool overlapped = false;
	int olen = MIN_OVERLAP;
	int diff = 0;
	// the diff count for 1 high qual + 1 low qual
	int lowQualDiff = 0;

	// if either read is shorter than MIN_OVERLAP this loop never runs
	while(olen <= min(len1, len2)){
		diff = 0;
		lowQualDiff = 0;
		bool ok = true;
		// the last olen bases of the left read are compared with the first
		// olen bases of the reverse-complemented right read
		int offset = len1 - olen;
		for(int i=0;i<olen;i++){
			if(str1[offset+i] != str2[i]){
				diff++;
				// one is >= Q30 and the other is <= Q15
				if((qual1[offset+i]>='?' && qual2[i]<='0') || (qual1[offset+i]<='0' && qual2[i]>='?')){
					lowQualDiff++;
				}
				// we disallow high quality diff, and only allow up to 3 low qual diff
				// NOTE(review): with ">= 3" the third low-quality mismatch already
				// rejects the candidate, so at most 2 are tolerated -- confirm intent.
				if(diff>lowQualDiff || lowQualDiff>=3){
					ok = false;
					break;
				}
			}
		}
		if(ok){
			overlapped = true;
			break;
		}
		olen++;
	}

	if(overlapped){
		int offset = len1 - olen;
		stringstream ss;
		ss << mLeft->mName << " merged offset:" << offset << " overlap:" << olen << " diff:" << diff;
		string mergedName = ss.str();
		// non-overlapping prefix of the left read followed by the whole
		// reverse-complemented right read
		string mergedSeq = mLeft->mSeq.mStr.substr(0, offset) + rcRight->mSeq.mStr;
		string mergedQual = mLeft->mQuality.substr(0, offset) + rcRight->mQuality;
		// quality adjustment and correction for low qual diff
		for(int i=0;i<olen;i++){
			if(str1[offset+i] != str2[i]){
				// keep the left base only when it is the high-quality side;
				// in every other case the right base is used
				if(qual1[offset+i]>='?' && qual2[i]<='0'){
					mergedSeq[offset+i] = str1[offset+i];
					mergedQual[offset+i] = qual1[offset+i];
				} else {
					mergedSeq[offset+i] = str2[i];
					mergedQual[offset+i] = qual2[i];
				}
			} else {
				// add the quality of the pair to make a high qual
				// NOTE(review): the sum is stored in a char without a cap, so two
				// very high quality values could exceed the char range -- confirm.
				mergedQual[offset+i] =  qual1[offset+i] + qual2[i] - 33;
			}
		}
		delete rcRight;
		return new Read(mergedName, mergedSeq, "+", mergedQual);
	}

	delete rcRight;
	return NULL;
}
263 | 
264 | bool ReadPair::test(){
265 | 	Read* left = new Read("@NS500713:64:HFKJJBGXY:1:11101:20469:1097 1:N:0:TATAGCCT+GGTCCCGA",
266 | 		"TTTTTTCTCTTGGACTCTAACACTGTTTTTTCTTATGAAAACACAGGAGTGATGACTAGTTGAGTGCATTCTTATGAGACTCATAGTCATTCTATGATGTAG",
267 | 		"+",
268 | 		"AAAAA6EEEEEEEEEEEEEEEEE#EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE");
269 | 	Read* right = new Read("@NS500713:64:HFKJJBGXY:1:11101:20469:1097 1:N:0:TATAGCCT+GGTCCCGA",
270 | 		"AAAAAACTACACCATAGAATGACTATGAGTCTCATAAGAATGCACTCAACTAGTCATCACTCCTGTGTTTTCATAAGAAAAAACAGTGTTAGAGTCCAAGAG",
271 | 		"+",
272 | 		"AAAAA6EEEEE/EEEEEEEEEEE#EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE");
273 | 
274 | 	ReadPair pair(left, right);
275 | 	Read* merged = pair.fastMerge();
276 | 	if(merged == NULL)
277 | 		return false;
278 | 
279 | 	if(merged->mSeq.mStr != "TTTTTTCTCTTGGACTCTAACACTGTTTTTTCTTATGAAAACACAGGAGTGATGACTAGTTGAGTGCATTCTTATGAGACTCATAGTCATTCTATGATGTAGTTTTTT")
280 | 		return false;
281 | 
282 | 	return true;
283 | }
284 | 


--------------------------------------------------------------------------------
/src/read.h:
--------------------------------------------------------------------------------
 1 | #ifndef READ_H
 2 | #define READ_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include <iostream>
 8 | #include <fstream>
 9 | #include "sequence.h"
10 | #include <vector>
11 | 
12 | using namespace std;
13 | 
// One sequencing record: name line, sequence, strand line and (optionally)
// a quality string. All data members are public.
class Read{
public:
	// Constructors with a quality string set mHasQuality to true; the ones
	// without it set mHasQuality to false. phred64 requests conversion of the
	// quality string to phred33.
	Read(string name, string seq, string strand, string quality, bool phred64=false);
    Read(string name, Sequence seq, string strand, string quality, bool phred64=false);
	Read(string name, string seq, string strand);
    Read(string name, Sequence seq, string strand);
    Read(Read &r);
	// print() writes to stderr, printFile() to the given stream
	void print();
    void printFile(ofstream& file);
    // returns a newly allocated read; the caller owns it
    Read* reverseComplement();
    // index parsing helpers working on the name line
    string firstIndex();
    string lastIndex();
    // default is Q20
    int lowQualCount(int qual=20);
    int length();
    // four-line text form of the record; the tag is appended to the name line
    string toString();
    string toStringWithTag(string tag);
    // truncate to len bases
    void resize(int len);
    void convertPhred64To33();
    // drop bases from the front of the read
    void trimFront(int len);
    // rewrites a trailing "/1" or "/2" in the name; returns true if changed
    bool fixMGI();
    // cut into pieces of at most `segment` bases; the caller owns the results
    vector<Read*> split(int segment);

public:
    static bool test();

private:


public:
	string mName;
	Sequence mSeq;
	string mStrand;
	string mQuality;
	bool mHasQuality;
};
50 | 
// A pair of mates. The destructor (read.cpp) deletes both reads, so the
// pointers handed to the constructor are owned by the pair.
class ReadPair{
public:
    ReadPair(Read* left, Read* right);
    ~ReadPair();

    // merge a pair, without consideration of seq error caused false INDEL
    // returns a newly allocated read, or NULL when no overlap is accepted
    Read* fastMerge();
public:
    Read* mLeft;
    Read* mRight;

public:
    static bool test();
};
65 | 
66 | #endif


--------------------------------------------------------------------------------
/src/seprocessor.h:
--------------------------------------------------------------------------------
 1 | #ifndef SE_PROCESSOR_H
 2 | #define SE_PROCESSOR_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include "read.h"
 8 | #include <cstdlib>
 9 | #include <condition_variable>
10 | #include <mutex>
11 | #include <thread>
12 | #include "options.h"
13 | #include "threadconfig.h"
14 | #include "filter.h"
15 | #include "umiprocessor.h"
16 | #include "writerthread.h"
17 | #include "duplicate.h"
18 | #include "virusdetector.h"
19 | 
20 | using namespace std;
21 | 
// A batch of reads: an array of Read pointers plus a count.
// NOTE(review): presumably `count` is the number of valid entries in `data`;
// ownership of the array is not visible here -- confirm in seprocessor.cpp.
struct ReadPack {
    Read** data;
    int count;
};

typedef struct ReadPack ReadPack;
28 | 
// Buffer of ReadPack pointers with atomic read and write positions.
// NOTE(review): the commented-out mutex / condition_variable members suggest
// an earlier lock-based design; how the atomic positions are used is defined
// in seprocessor.cpp -- confirm there.
struct ReadRepository {
    ReadPack** packBuffer;
    atomic_long readPos;
    atomic_long writePos;
    //std::mutex mtx;
    //std::mutex readCounterMtx;
    //std::condition_variable repoNotFull;
    //std::condition_variable repoNotEmpty;
};

typedef struct ReadRepository ReadRepository;
40 | 
// Pipeline driver for single-end input. Only the interface is declared here;
// see seprocessor.cpp for the behaviour of each step.
class SingleEndProcessor{
public:
    SingleEndProcessor(Options* opt);
    ~SingleEndProcessor();
    // public entry point of the pipeline
    bool process();

private:
    // NOTE(review): the names below suggest a producer/consumer design with a
    // separate writer task -- confirm against the implementation.
    bool processSingleEnd(ReadPack* pack, ThreadConfig* config);
    void initPackRepository();
    void destroyPackRepository();
    void producePack(ReadPack* pack);
    void consumePack(ThreadConfig* config);
    void producerTask();
    void consumerTask(ThreadConfig* config);
    void initConfig(ThreadConfig* config);
    void initOutput();
    void closeOutput();
    void writeTask(WriterThread* config);

private:
    Options* mOptions;
    ReadRepository mRepo;
    atomic_bool mProduceFinished;
    atomic_int mFinishedThreads;
    std::mutex mInputMtx;
    std::mutex mOutputMtx;
    Filter* mFilter;
    gzFile mZipFile;
    ofstream* mOutStream;
    UmiProcessor* mUmiProcessor;
    WriterThread* mLeftWriter;
    Duplicate* mDuplicate;
    VirusDetector* mVirusDetector;
};
75 | 
76 | 
77 | #endif


--------------------------------------------------------------------------------
/src/sequence.cpp:
--------------------------------------------------------------------------------
 1 | #include "sequence.h"
 2 | 
// Default constructor: mStr is left default-constructed (empty).
Sequence::Sequence(){
}
 5 | 
 6 | Sequence::Sequence(string seq){
 7 |     mStr = seq;
 8 | }
 9 | 
// Writes the sequence text to stderr without a trailing newline.
void Sequence::print(){
    std::cerr << mStr;
}
13 | 
// Length of the sequence text, narrowed to int.
int Sequence::length(){
    return mStr.length();
}
17 | 
18 | Sequence Sequence::reverseComplement(){
19 |     string str(mStr.length(), 0);
20 |     for(int c=0;c<mStr.length();c++){
21 |         char base = mStr[c];
22 |         switch(base){
23 |             case 'A':
24 |             case 'a':
25 |                 str[mStr.length()-c-1] = 'T';
26 |                 break;
27 |             case 'T':
28 |             case 't':
29 |                 str[mStr.length()-c-1] = 'A';
30 |                 break;
31 |             case 'C':
32 |             case 'c':
33 |                 str[mStr.length()-c-1] = 'G';
34 |                 break;
35 |             case 'G':
36 |             case 'g':
37 |                 str[mStr.length()-c-1] = 'C';
38 |                 break;
39 |             default:
40 |                 str[mStr.length()-c-1] = 'N';
41 |         }
42 |     }
43 |     return Sequence(str);
44 | }
45 | 
// Shorthand for reverseComplement().
Sequence Sequence::operator~(){
    return reverseComplement();
}
49 | 
50 | bool Sequence::test(){
51 |     Sequence s("AAAATTTTCCCCGGGG");
52 |     Sequence rc = ~s;
53 |     if (s.mStr != "AAAATTTTCCCCGGGG" ){
54 |         cerr << "Failed in reverseComplement() expect AAAATTTTCCCCGGGG, but get "<< s.mStr;
55 |         return false;
56 |     }
57 |     if (rc.mStr != "CCCCGGGGAAAATTTT" ){
58 |         cerr << "Failed in reverseComplement() expect CCCCGGGGAAAATTTT, but get "<< rc.mStr;
59 |         return false;
60 |     }
61 |     return true;
62 | }


--------------------------------------------------------------------------------
/src/sequence.h:
--------------------------------------------------------------------------------
 1 | #ifndef SEQUENCE_H
 2 | #define SEQUENCE_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include <iostream>
 8 | 
 9 | using namespace std;
10 | 
// Thin wrapper around a string of bases with reverse-complement support.
class Sequence{
public:
    Sequence();
    Sequence(string seq);
    // writes the sequence text to stderr
    void print();
    int length();
    // returns a new Sequence; this object is not modified
    Sequence reverseComplement();

    // operator form of reverseComplement()
    Sequence operator~();

    static bool test();

public:
    // the raw sequence text
    string mStr;
};
26 | 
27 | #endif


--------------------------------------------------------------------------------
/src/stats.h:
--------------------------------------------------------------------------------
 1 | #ifndef STATS_H
 2 | #define STATS_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include <vector>
 8 | #include <map>
 9 | #include "read.h"
10 | #include "options.h"
11 | 
12 | using namespace std;
13 | 
// Accumulates per-read statistics and reports them as JSON / HTML.
// Only the interface is declared here; see stats.cpp for the definitions.
class Stats{
public:
    // this @guessedCycles parameter should be calculated using the first several records
    Stats(Options* opt, bool isRead2 = false, int guessedCycles = 0, int bufferMargin = 1024);
    ~Stats();
    int getCycles();
    long getReads();
    long getBases();
    long getQ20();
    long getQ30();
    long getGCNumber();
    // by default the qualified qual score is Q20 ('5')
    void statRead(Read* r);

    // combines a list of Stats objects into one
    // NOTE(review): ownership of the returned object is not visible here -- confirm.
    static Stats* merge(vector<Stats*>& list);
    void print();
    void summarize(bool forced = false);
    // a part of JSON report
    void reportJson(ofstream& ofs, string padding);
    // a part of HTML report
    void reportHtml(ofstream& ofs, string filteringType, string readName);
    void reportHtmlQuality(ofstream& ofs, string filteringType, string readName);
    void reportHtmlContents(ofstream& ofs, string filteringType, string readName);
    bool isLongRead();
    int getMeanLength();

public:
    // helpers that render numeric arrays as text and map a base to a number
    static string list2string(double* list, int size);
    static string list2string(double* list, int size, long* coords);
    static string list2string(long* list, int size);
    static int base2val(char base);

private:
    void extendBuffer(int newBufLen);

private:
    Options* mOptions;
    bool mIsRead2;
    long mReads;
    int mEvaluatedSeqLen;
    /* 
    why we use 8 here?
    map A/T/C/G/N to 0~7 by their ASCII % 8:
    'A' % 8 = 1
    'T' % 8 = 4
    'C' % 8 = 3
    'G' % 8 = 7
    'N' % 8 = 6
    */
    long *mCycleQ30Bases[8];
    long *mCycleQ20Bases[8];
    long *mCycleBaseContents[8];
    long *mCycleBaseQual[8];
    long *mCycleTotalBase;
    long *mCycleTotalQual;
    long *mKmer;

    map<string, double*> mQualityCurves;
    map<string, double*> mContentCurves;


    int mCycles;
    int mBufLen;
    long mBases;
    // indexed with the same ASCII % 8 scheme as the per-cycle arrays above
    long mQ20Bases[8];
    long mQ30Bases[8];
    long mBaseContents[8];
    long mQ20Total;
    long mQ30Total;
    bool summarized;
    long mKmerMax;
    long mKmerMin;
    int mKmerBufLen;
    long mLengthSum;
};
89 | 
90 | #endif


--------------------------------------------------------------------------------
/src/threadconfig.cpp:
--------------------------------------------------------------------------------
 1 | #include "threadconfig.h"
 2 | #include "util.h"
 3 | 
 4 | ThreadConfig::ThreadConfig(Options* opt, int threadId, bool paired){
 5 |     mOptions = opt;
 6 |     mThreadId = threadId;
 7 |     mPreStats1 = new Stats(mOptions, false);
 8 |     mPostStats1 = new Stats(mOptions, false);
 9 |     if(paired){
10 |         mPreStats2 = new Stats(mOptions, true);
11 |         mPostStats2 = new Stats(mOptions, true);
12 |     }
13 |     else {
14 |         mPreStats2 = NULL;
15 |         mPostStats2 = NULL;
16 |     }
17 |     mWriter1 = NULL;
18 |     mWriter2 = NULL;
19 | 
20 |     mFilterResult = new FilterResult(opt, paired);
21 |     mCanBeStopped = false;
22 | }
23 | 
// Runs cleanup(), which releases the writers.
// NOTE(review): the Stats and FilterResult objects allocated in the
// constructor are not deleted here -- confirm they are released elsewhere.
ThreadConfig::~ThreadConfig() {
    cleanup();
}
27 | 
// Releases the writers owned by this config (currently its only action).
void ThreadConfig::cleanup() {
    deleteWriter();
}
31 | 
32 | void ThreadConfig::deleteWriter() {
33 |     if(mWriter1 != NULL) {
34 |         delete mWriter1;
35 |         mWriter1 = NULL;
36 |     }
37 |     if(mWriter2 != NULL) {
38 |         delete mWriter2;
39 |         mWriter2 = NULL;
40 |     }
41 | }
42 | 
// initWriter overloads: each one first deletes any existing writers and then
// creates new Writer objects for the given target(s). The single-target
// overloads set only mWriter1 and leave mWriter2 as NULL.

// Write to a file by name, using the compression level from the options.
void ThreadConfig::initWriter(string filename1) {
    deleteWriter();
    mWriter1 = new Writer(filename1, mOptions->compression);
}

// Write to two files by name (one per writer).
void ThreadConfig::initWriter(string filename1, string filename2) {
    deleteWriter();
    mWriter1 = new Writer(filename1, mOptions->compression);
    mWriter2 = new Writer(filename2, mOptions->compression);
}

// Write to an already opened output stream.
void ThreadConfig::initWriter(ofstream* stream) {
    deleteWriter();
    mWriter1 = new Writer(stream);
}

// Write to two already opened output streams.
void ThreadConfig::initWriter(ofstream* stream1, ofstream* stream2) {
    deleteWriter();
    mWriter1 = new Writer(stream1);
    mWriter2 = new Writer(stream2);
}

// Write to an already opened gz file handle.
void ThreadConfig::initWriter(gzFile gzfile) {
    deleteWriter();
    mWriter1 = new Writer(gzfile);
}

// Write to two already opened gz file handles.
void ThreadConfig::initWriter(gzFile gzfile1, gzFile gzfile2) {
    deleteWriter();
    mWriter1 = new Writer(gzfile1);
    mWriter2 = new Writer(gzfile2);
}
75 | 
// Forwards a filtering outcome to this thread's FilterResult.
void ThreadConfig::addFilterResult(int result, int readNum) {
    mFilterResult->addFilterResult(result, readNum);
}
79 | 
// Forwards a merged-pair count to this thread's FilterResult.
void ThreadConfig::addMergedPairs(int pairs) {
    mFilterResult->addMergedPairs(pairs);
}
83 | 
// Intentionally empty: readNum is ignored and no state is updated.
void ThreadConfig::markProcessed(long readNum) {
}
86 | 
// Returns the mCanBeStopped flag (initialised to false in the constructor).
bool ThreadConfig::canBeStopped() {
    return mCanBeStopped;
}


--------------------------------------------------------------------------------
/src/threadconfig.h:
--------------------------------------------------------------------------------
 1 | #ifndef THREAD_CONFIG_H
 2 | #define THREAD_CONFIG_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include <vector>
 8 | #include "stats.h"
 9 | #include "writer.h"
10 | #include "options.h"
11 | #include "filterresult.h"
12 | 
13 | using namespace std;
14 | 
// Per-thread bundle of statistics collectors, output writers and the
// filtering result accumulator.
class ThreadConfig{
public:
    ThreadConfig(Options* opt, int threadId, bool paired = false);
    ~ThreadConfig();
    // plain accessors; the *2 variants return the read-2 members
    inline Stats* getPreStats1() {return mPreStats1;}
    inline Stats* getPostStats1() {return mPostStats1;}
    inline Stats* getPreStats2() {return mPreStats2;}
    inline Stats* getPostStats2() {return mPostStats2;}
    inline Writer* getWriter1() {return mWriter1;}
    inline Writer* getWriter2() {return mWriter2;}
    inline FilterResult* getFilterResult() {return mFilterResult;}

    // (re)create the writer(s) for a file name, an output stream or a gz handle
    void initWriter(string filename1);
    void initWriter(string filename1, string filename2);
    void initWriter(ofstream* stream);
    void initWriter(ofstream* stream1, ofstream* stream2);
    void initWriter(gzFile gzfile);
    void initWriter(gzFile gzfile1, gzFile gzfile2);

    void addFilterResult(int result, int readNum);
    void addMergedPairs(int pairs);

    int getThreadId() {return mThreadId;}
    // for splitting output
    // increase mCurrentSplitReads by readNum, and check it with options->split.size;
    // NOTE(review): no mCurrentSplitReads member is declared in this class, so
    // the comment above looks stale -- confirm against threadconfig.cpp.
    void markProcessed(long readNum);
    bool canBeStopped();
    void cleanup();

private:
    void deleteWriter();

private:
    Stats* mPreStats1;
    Stats* mPostStats1;
    Stats* mPreStats2;
    Stats* mPostStats2;
    Writer* mWriter1;
    Writer* mWriter2;
    Options* mOptions;
    FilterResult* mFilterResult;

    int mThreadId;
    bool mCanBeStopped;
};
60 | 
61 | #endif


--------------------------------------------------------------------------------
/src/umiprocessor.cpp:
--------------------------------------------------------------------------------
 1 | #include "umiprocessor.h"
 2 | 
 3 | UmiProcessor::UmiProcessor(Options* opt){
 4 |     mOptions = opt;
 5 | }
 6 | 
 7 | 
// Nothing to release: the destructor body is empty, so mOptions is left untouched.
UmiProcessor::~UmiProcessor(){
}
10 | 
// Extracts the UMI according to mOptions->umi.location and appends it to the
// read name(s). Does nothing when UMI processing is disabled.
//
//   r1  first read; dereferenced by most branches, so it must not be NULL
//   r2  optional second read (may be NULL for single-end data)
//
// For the read-based locations the UMI is taken from the start of the read
// and the read is then trimmed by the UMI length plus mOptions->umi.skip.
void UmiProcessor::process(Read* r1, Read* r2) {
    if(!mOptions->umi.enabled)
        return;

    string umi;
    if(mOptions->umi.location == UMI_LOC_INDEX1)
        umi = r1->firstIndex();
    else if(mOptions->umi.location == UMI_LOC_INDEX2 && r2)
        umi = r2->lastIndex();
    else if(mOptions->umi.location == UMI_LOC_READ1){
        // UMI is the leading umi.length bases of read 1 (or the whole read if shorter)
        umi = r1->mSeq.mStr.substr(0, min(r1->length(), mOptions->umi.length));
        r1->trimFront(umi.length() + mOptions->umi.skip);
    }
    else if(mOptions->umi.location == UMI_LOC_READ2 && r2){
        umi = r2->mSeq.mStr.substr(0, min(r2->length(), mOptions->umi.length));
        r2->trimFront(umi.length() + mOptions->umi.skip);
    }
    else if(mOptions->umi.location == UMI_LOC_PER_INDEX){
        // combine both indexes as "<index1>_<index2>"; names are tagged right here
        string umiMerged = r1->firstIndex();
        if(r2) {
            umiMerged = umiMerged + "_" + r2->lastIndex();
        }

        addUmiToName(r1, umiMerged);
        if(r2) {
            addUmiToName(r2, umiMerged);
        }
    }
    else if(mOptions->umi.location == UMI_LOC_PER_READ){
        // combine the leading bases of both reads as "<umi1>_<umi2>"
        string umi1 = r1->mSeq.mStr.substr(0, min(r1->length(), mOptions->umi.length));
        string umiMerged = umi1;
        r1->trimFront(umi1.length() + mOptions->umi.skip);
        if(r2){
            string umi2 = r2->mSeq.mStr.substr(0, min(r2->length(), mOptions->umi.length));
            umiMerged = umiMerged + "_" + umi2;
            r2->trimFront(umi2.length() + mOptions->umi.skip);
        }

        addUmiToName(r1, umiMerged);
        if(r2){
            addUmiToName(r2, umiMerged);
        }
    }

    // the single-UMI locations tag the names here; the two PER_* locations
    // have already done so above. An empty UMI is never added.
    if(mOptions->umi.location != UMI_LOC_PER_INDEX && mOptions->umi.location != UMI_LOC_PER_READ) {
        if(r1 && !umi.empty()) 
            addUmiToName(r1, umi);
        if(r2 && !umi.empty())
            addUmiToName(r2, umi);
    }
}
62 | 
63 | void UmiProcessor::addUmiToName(Read* r, string umi){
64 |     string tag;
65 |     if(mOptions->umi.prefix.empty())
66 |         tag = ":" + umi;
67 |     else
68 |         tag = ":" + mOptions->umi.prefix + "_" + umi;
69 |     int spacePos = -1;
70 |     for(int i=0; i<r->mName.length(); i++) {
71 |         if(r->mName[i] == ' ') {
72 |             spacePos = i;
73 |             break;
74 |         }
75 |     }
76 |     if(spacePos == -1) {
77 |         r->mName = r->mName + tag;
78 |     } else {
79 |         r->mName = r->mName.substr(0, spacePos) + tag + r->mName.substr(spacePos, r->mName.length() - spacePos);
80 |     }
81 | 
82 | }
83 | 
84 | 
// Placeholder self-test: performs no checks and always reports success.
bool UmiProcessor::test() {
    return true;
}


--------------------------------------------------------------------------------
/src/umiprocessor.h:
--------------------------------------------------------------------------------
 1 | #ifndef UMI_PROCESSOR_H
 2 | #define UMI_PROCESSOR_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include "options.h"
 8 | #include "read.h"
 9 | 
10 | using namespace std;
11 | 
// Moves UMI information into read names according to the UMI options.
class UmiProcessor{
public:
    UmiProcessor(Options* opt);
    ~UmiProcessor();
    // r2 is optional and defaults to NULL
    void process(Read* r1, Read* r2 = NULL);
    // adds the UMI tag to the name of r
    void addUmiToName(Read* r, string umi);
    static bool test();

private:
    Options* mOptions;
};
23 | 
24 | 
25 | #endif


--------------------------------------------------------------------------------
/src/unittest.cpp:
--------------------------------------------------------------------------------
 1 | #include "unittest.h"
 2 | #include "sequence.h"
 3 | #include "fastqreader.h"
 4 | #include "read.h"
 5 | #include "overlapanalysis.h"
 6 | #include "filter.h"
 7 | #include "adaptertrimmer.h"
 8 | #include "basecorrector.h"
 9 | #include "polyx.h"
10 | #include "nucleotidetree.h"
11 | #include "evaluator.h"
12 | #include <time.h>
13 | 
// Nothing to initialise.
UnitTest::UnitTest(){

}
17 | 
18 | void UnitTest::run(){
19 |     bool passed = true;
20 |     passed &= report(Sequence::test(), "Sequence::test");
21 |     passed &= report(Read::test(), "Read::test");
22 |     passed &= report(OverlapAnalysis::test(), "OverlapAnalysis::test");
23 |     passed &= report(Filter::test(), "Filter::test");
24 |     passed &= report(AdapterTrimmer::test(), "AdapterTrimmer::test");
25 |     passed &= report(BaseCorrector::test(), "BaseCorrector::test");
26 |     passed &= report(PolyX::test(), "PolyX::test");
27 |     passed &= report(NucleotideTree::test(), "NucleotideTree::test");
28 |     passed &= report(Evaluator::test(), "Evaluator::test");
29 |     printf("\n==========================\n");
30 |     printf("%s\n\n", passed?"ALL PASSED":"FAILED");
31 | }
32 | 
33 | bool UnitTest::report(bool result, string message) {
34 |     printf("%s:%s\n\n", message.c_str(), result?" PASSED":" FAILED");
35 |     return result;
36 | }


--------------------------------------------------------------------------------
/src/unittest.h:
--------------------------------------------------------------------------------
 1 | #ifndef UNIT_TEST_H
 2 | #define UNIT_TEST_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | 
 8 | using namespace std;
 9 | 
// Minimal driver for the per-module self-tests.
class UnitTest {
public:
    UnitTest();

    // Executes all registered self-tests and prints a summary.
    void run();

    // Prints the verdict for one test labelled `message`; returns `result`.
    bool report(bool result, string message);
};
16 | 
17 | #endif


--------------------------------------------------------------------------------
/src/util.h:
--------------------------------------------------------------------------------
  1 | #ifndef UTIL_H
  2 | #define UTIL_H
  3 | 
  4 | #include <stdlib.h>
  5 | #include <string>
  6 | #include <iostream>
  7 | #include <vector>
  8 | #include <sys/stat.h>
  9 | #include <algorithm>
 10 | #include <time.h>
 11 | #include <mutex>
 12 | 
 13 | using namespace std;
 14 | 
 15 | inline char complement(char base) {
 16 |     switch(base){
 17 |         case 'A':
 18 |         case 'a':
 19 |             return 'T';
 20 |         case 'T':
 21 |         case 't':
 22 |             return 'A';
 23 |         case 'C':
 24 |         case 'c':
 25 |             return 'G';
 26 |         case 'G':
 27 |         case 'g':
 28 |             return 'C';
 29 |         default:
 30 |             return 'N';
 31 |     }
 32 | }
 33 | 
 34 | inline bool starts_with( string const & value,  string const & starting)
 35 | {
 36 |     if (starting.size() > value.size()) return false;
 37 |     return  equal(starting.begin(), starting.end(), value.begin());
 38 | }
 39 | 
 40 | inline bool ends_with( string const & value,  string const & ending)
 41 | {
 42 | 	if (ending.size() > value.size()) return false;
 43 | 	return  equal(ending.rbegin(), ending.rend(), value.rbegin());
 44 | }
 45 | 
 46 | inline string trim(const string& str)
 47 | {
 48 |     string::size_type pos = str.find_first_not_of(' ');
 49 |     if (pos == string::npos)
 50 |     {
 51 |         return string("");
 52 |     }
 53 |     string::size_type pos2 = str.find_last_not_of(' ');
 54 |     if (pos2 != string::npos)
 55 |     {
 56 |         return str.substr(pos, pos2 - pos + 1);
 57 |     }
 58 |     return str.substr(pos);
 59 | }
 60 | 
 61 | inline int split(const string& str, vector<string>& ret_, string sep = ",")
 62 | {
 63 |     if (str.empty())
 64 |     {
 65 |         return 0;
 66 |     }
 67 | 
 68 |     string tmp;
 69 |     string::size_type pos_begin = str.find_first_not_of(sep);
 70 |     string::size_type comma_pos = 0;
 71 | 
 72 |     while (pos_begin != string::npos)
 73 |     {
 74 |         comma_pos = str.find(sep, pos_begin);
 75 |         if (comma_pos != string::npos)
 76 |         {
 77 |             tmp = str.substr(pos_begin, comma_pos - pos_begin);
 78 |             pos_begin = comma_pos + sep.length();
 79 |         }
 80 |         else
 81 |         {
 82 |             tmp = str.substr(pos_begin);
 83 |             pos_begin = comma_pos;
 84 |         }
 85 | 
 86 |         ret_.push_back(tmp);
 87 |         tmp.clear();
 88 |     }
 89 |     return 0;
 90 | }
 91 | 
 92 | inline string replace(const string& str, const string& src, const string& dest)
 93 | {
 94 |     string ret;
 95 | 
 96 |     string::size_type pos_begin = 0;
 97 |     string::size_type pos       = str.find(src);
 98 |     while (pos != string::npos)
 99 |     {
100 |         ret.append(str.data() + pos_begin, pos - pos_begin);
101 |         ret += dest;
102 |         pos_begin = pos + 1;
103 |         pos       = str.find(src, pos_begin);
104 |     }
105 |     if (pos_begin < str.length())
106 |     {
107 |         ret.append(str.begin() + pos_begin, str.end());
108 |     }
109 |     return ret;
110 | }
111 | 
// Returns `str` with its characters in reverse order (no complementing).
inline string reverse(const string& str) {
    return string(str.rbegin(), str.rend());
}
119 | 
// Returns the part of `filename` after the last '/'.
// Without any '/' the input is returned as is; a path ending in '/' gives "".
inline string basename(const string& filename){
    const string::size_type slash = filename.find_last_of('/');
    if (slash == string::npos)
        return filename;
    // When the slash is the final character this yields the empty string,
    // which signals a bad filename to the caller.
    return filename.substr(slash + 1);
}
129 | 
// Returns the directory part of `filename`, including the trailing '/'.
// A name without any '/' is taken to live in the current directory ("./").
inline string dirname(const string& filename){
    const string::size_type slash = filename.find_last_of('/');
    return slash == string::npos ? string("./") : filename.substr(0, slash + 1);
}
137 | 
// Joins a directory and a file name with exactly one '/' between them.
// An empty `dirname` returns `basename` unchanged; the previous code indexed
// dirname[length()-1] in that case, which is out of bounds.
inline string joinpath(const string& dirname, const string& basename){
    if (dirname.empty())
        return basename;
    if (dirname[dirname.length() - 1] == '/')
        return dirname + basename;
    return dirname + "/" + basename;
}
145 | 
// Reports whether `s` names an existing filesystem entry (file or directory).
// An empty path never exists; otherwise the answer is whether stat() succeeds.
inline bool file_exists(const  string& s)
{
    if (s.empty())
        return false;

    struct stat info;
    return stat(s.c_str(), &info) == 0;
}
159 | 
160 | 
// Reports whether `path` names an existing directory.
//
// Returns false when stat() fails (missing path, no permission, ...). The
// previous code ignored that failure and then read an uninitialised struct.
// The file type is compared after masking with S_IFMT, because testing the
// S_IFDIR bit alone also matches other types that share it (block devices).
inline bool is_directory(const  string& path)
{
    // Visual Studio spells these with a leading underscore:
    // http://msdn.microsoft.com/en-us/library/14h5k7ff.aspx
#ifdef _MSC_VER
#ifndef S_IFDIR
#define S_IFDIR _S_IFDIR
#endif
#ifndef S_IFMT
#define S_IFMT _S_IFMT
#endif
#endif
    struct stat status;
    if (stat( path.c_str(), &status ) != 0)
        return false;
    return (status.st_mode & S_IFMT) == S_IFDIR;
}
178 | 
179 | inline void check_file_valid(const  string& s) {
180 |     if(!file_exists(s)){
181 |         cerr << "ERROR: file '" << s << "' doesn't exist, quit now" << endl;
182 |         exit(-1);
183 |     }
184 |     if(is_directory(s)){
185 |         cerr << "ERROR: '" << s << "' is a folder, not a file, quit now" << endl;
186 |         exit(-1);
187 |     }
188 | }
189 | 
190 | inline void check_file_writable(const  string& s) {
191 |     string dir = dirname(s);
192 |     if(!file_exists(dir)) {
193 |         cerr << "ERROR: '" << dir << " doesn't exist. Create this folder and run this command again." << endl;
194 |         exit(-1);
195 |     }
196 |     if(is_directory(s)){
197 |         cerr << "ERROR: '" << s << "' is not a writable file, quit now" << endl;
198 |         exit(-1);
199 |     }
200 | }
201 | 
// Returns a copy of `s` containing only its alphabetic characters, in order.
// Each byte is converted to unsigned char before the isalpha() test: passing
// a negative char (any byte >= 0x80 where char is signed) is undefined.
inline  string str_keep_alpha(const  string& s)
{
    string kept;
    kept.reserve(s.size());
    for (string::size_type i = 0; i < s.size(); ++i) {
        const unsigned char ch = static_cast<unsigned char>(s[i]);
        if (isalpha(ch)) {
            kept += s[i];
        }
    }
    return kept;
}
213 | 
214 | 
// Compacts `s` in place so that only sequence characters remain: letters,
// '-' and '*'. With `forceUpperCase` set, ASCII lower-case letters are
// converted to upper case while being kept.
// The isalpha() argument is cast to unsigned char, since a negative char
// value is undefined for the <cctype> classifiers.
inline void str_keep_valid_sequence(  string& s, bool forceUpperCase = false)
{
    const char caseGap = 'a' - 'A';
    size_t kept = 0;   // write position; never ahead of the read position
    for (size_t i = 0; i < s.size(); ++i) {
        char c = s[i];
        if (forceUpperCase && c >= 'a' && c <= 'z') {
            c -= caseGap;
        }
        if (isalpha(static_cast<unsigned char>(c)) || c == '-' || c == '*') {
            s[kept] = c;
            ++kept;
        }
    }

    s.resize(kept);
}
233 | 
// Searches `pattern` in `str` starting at index `start` and returns the index
// just past the end of the first match, or -1 when there is no match.
// The search result stays in the string's own size type and is compared with
// npos explicitly, rather than being narrowed to int first.
inline int find_with_right_pos(const string& str, const string& pattern, int start=0) {
    const string::size_type pos = str.find(pattern, start);
    if (pos == string::npos)
        return -1;
    return static_cast<int>(pos + pattern.length());
}
241 | 
// Converts `s` to upper case in place.
// Bytes are passed to toupper() as unsigned char; a negative char value
// (bytes >= 0x80 where char is signed) would be undefined behaviour.
inline void str2upper(string& s){
    transform(s.begin(), s.end(), s.begin(),
              [](unsigned char c) { return static_cast<char>(toupper(c)); });
}
245 | 
// Converts `s` to lower case in place.
// Bytes are passed to tolower() as unsigned char; a negative char value
// (bytes >= 0x80 where char is signed) would be undefined behaviour.
inline void str2lower(string& s){
    transform(s.begin(), s.end(), s.begin(),
              [](unsigned char c) { return static_cast<char>(tolower(c)); });
}
249 | 
// Encodes a numeric quality score as a character by adding the offset 33.
// Scores are clamped to [0, 127 - 33] first so the result fits in 7 bits.
inline char num2qual(int num) {
    const int offset = 33;
    const int maxScore = 127 - offset;

    int clamped = num;
    if (clamped > maxScore) {
        clamped = maxScore;
    } else if (clamped < 0) {
        clamped = 0;
    }
    return static_cast<char>(clamped + offset);
}
259 | 
// Prints "ERROR: <msg>" to stderr and terminates the process with exit(-1).
// Never returns.
inline void error_exit(const string& msg) {
    const int failureStatus = -1;
    cerr << "ERROR: " << msg << endl;
    exit(failureStatus);
}
264 | 
// Serialises log output across threads; defined in one translation unit.
extern mutex logmtx;

// Writes "[h:m:s] <s>" to stderr using the local wall-clock time.
// The mutex is held through a lock_guard so it is released on every exit
// path, including an exception thrown by the stream. Holding it also covers
// localtime(), which returns a pointer to shared static storage.
inline void loginfo(const string s){
    lock_guard<mutex> guard(logmtx);
    time_t tt = time(NULL);
    tm* t= localtime(&tt);
    cerr<<"["<<t->tm_hour<<":"<<t->tm_min<<":"<<t->tm_sec<<"] "<<s<<endl;
}
273 | 
274 | #endif /* UTIL_H */
275 | 


--------------------------------------------------------------------------------
/src/virusdetector.cpp:
--------------------------------------------------------------------------------
  1 | #include "virusdetector.h"
  2 | 
  3 | VirusDetector::VirusDetector(Options* opt){
  4 |     mOptions = opt;
  5 | 
  6 |     mKmer = NULL;
  7 |     if(!mOptions->kmerFile.empty())
  8 |         mKmer = new Kmer(mOptions->kmerFile, opt);
  9 | 
 10 |     mKmerCollection = NULL;
 11 |     if(!mOptions->kmerCollectionFile.empty())
 12 |         mKmerCollection = new KmerCollection(mOptions->kmerCollectionFile, opt);
 13 | 
 14 |     // no KMER file, the kmerKeyLen is not intialized
 15 |     if(mOptions->kmerKeyLen == 0)
 16 |         mOptions->kmerKeyLen = 25;
 17 |     mGenomes = NULL;
 18 |     if(!mOptions->genomeFile.empty())
 19 |         mGenomes = new Genomes(mOptions->genomeFile, opt);
 20 |     mHits = 0;
 21 | }
 22 | 
 23 | VirusDetector::~VirusDetector(){
 24 |     if(mKmer) {
 25 |         delete mKmer;
 26 |         mKmer = NULL;
 27 |     }
 28 |     if(mKmerCollection) {
 29 |         delete mKmerCollection;
 30 |         mKmerCollection = NULL;
 31 |     }
 32 |     if(mGenomes) {
 33 |         delete mGenomes;
 34 |         mGenomes = NULL;
 35 |     }
 36 | }
 37 | 
 38 | void VirusDetector::report() {
 39 |     if(mKmer) {
 40 |         cerr << "Coverage for target unique KMER file:"<<endl;
 41 |         mKmer->report();
 42 |     }
 43 |     if(mKmerCollection) {
 44 |         cerr << endl << "Detection result for provided KMER collection:"<<endl;
 45 |         mKmerCollection->report();
 46 |     }
 47 |     if(mGenomes) {
 48 |         //mGenomes->report();
 49 |     }
 50 | }
 51 | 
 52 | bool VirusDetector::detect(Read* r) {
 53 |     if(r->length() >= mOptions->longReadThreshold) {
 54 |         // long reads, split it
 55 |         vector<Read*> reads = r->split(mOptions->segmentLength);
 56 |         bool detected = false;
 57 |         for(int i=0; i<reads.size(); i++) {
 58 |             // recursive
 59 |             detected |= detect(reads[i]);
 60 |             delete reads[i];
 61 |             reads[i] = NULL;
 62 |         }
 63 |         return detected;
 64 |     }
 65 |     string& seq = r->mSeq.mStr;
 66 |     Sequence rSequence = ~(r->mSeq);
 67 |     string& rseq = rSequence.mStr;
 68 | 
 69 |     return scan(seq) | scan(rseq);
 70 | }
 71 | 
 72 | bool VirusDetector::scan(string& seq) {
 73 |     int hitCount = 0;
 74 | 
 75 |     int keylen = mOptions->kmerKeyLen;
 76 |     int blankBits = 64 - 2*keylen;
 77 | 
 78 |     bool onlyHitOneGenome = true;
 79 |     uint32 lastGenomeID = 0;
 80 | 
 81 |     if(seq.length() < keylen)
 82 |         return false;
 83 | 
 84 |     bool valid = true;
 85 |     bool needAlignment = false;
 86 | 
 87 |     uint32 start = 0;
 88 |     uint64 key = Kmer::seq2uint64(seq, start, keylen-1, valid);
 89 |     while(valid == false) {
 90 |         start++;
 91 |         key = Kmer::seq2uint64(seq, start, keylen-1, valid);
 92 |         // reach the tail
 93 |         if(start >= seq.length() - keylen)
 94 |             return false;
 95 |     }
 96 |     for(uint32 pos = start; pos < seq.length() - keylen; pos++) {
 97 |         key = (key << 2);
 98 |         switch(seq[pos + keylen-1]) {
 99 |             case 'A':
100 |                 key += 0;
101 |                 break;
102 |             case 'T':
103 |                 key += 1;
104 |                 break;
105 |             case 'C':
106 |                 key += 2;
107 |                 break;
108 |             case 'G':
109 |                 key += 3;
110 |                 break;
111 |             case 'N':
112 |             default:
113 |                 // we have to skip the segments covering this N
114 |                 if(pos >= seq.length() - keylen)
115 |                     continue;
116 |                 pos++;
117 |                 key = Kmer::seq2uint64(seq, pos, keylen-1, valid);
118 |                 bool outterBreak = false;
119 |                 while(valid == false) {
120 |                     pos++;
121 |                     key = Kmer::seq2uint64(seq, pos, keylen-1, valid);
122 |                     // reach the tail
123 |                     if(pos >= seq.length() - keylen) {
124 |                         outterBreak = true;
125 |                         break;
126 |                     }
127 |                 }
128 |                 if(outterBreak)
129 |                     break;
130 | 
131 |                 continue;
132 |         }
133 |         key = (key << blankBits) >> blankBits;
134 | 
135 |         // add to genome stats
136 |         if(!needAlignment && mGenomes && mGenomes->hasKey(key)) {
137 |             needAlignment = true;
138 |             if(!mKmer)
139 |                 break;
140 |         }
141 | 
142 |         // add to Kmer stas
143 |         if(mKmer) {
144 |             bool hit = mKmer->add(key);
145 |             if(hit)
146 |                 hitCount++;
147 |         }
148 | 
149 |         if(mKmerCollection) {
150 |             uint32 gid = mKmerCollection->add(key);
151 |             if(gid > 0) {
152 |                 if(lastGenomeID!=0 && gid!=lastGenomeID)
153 |                     onlyHitOneGenome = false;
154 |                 lastGenomeID = gid;
155 |             }
156 |         }
157 |     }
158 | 
159 |     if(mKmerCollection && onlyHitOneGenome && lastGenomeID>0)
160 |         mKmerCollection->addGenomeRead(lastGenomeID);
161 | 
162 |     bool wellMapped = false;
163 |     if(needAlignment && mGenomes)
164 |         wellMapped = mGenomes->align(seq);
165 | 
166 |     return hitCount>0 || wellMapped;
167 | }


--------------------------------------------------------------------------------
/src/virusdetector.h:
--------------------------------------------------------------------------------
 1 | #ifndef VIRUSDETECTOR_H
 2 | #define VIRUSDETECTOR_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include <vector>
 8 | #include "options.h"
 9 | #include "read.h"
10 | #include "kmer.h"
11 | #include "genomes.h"
12 | #include "kmercollection.h"
13 | 
14 | using namespace std;
15 | 
16 | class VirusDetector{
17 | public:
18 |     VirusDetector(Options* opt);
19 |     ~VirusDetector();
20 |     bool detect(Read* r);
21 |     bool scan(string& seq);
22 |     void report();
23 | 
24 |     Kmer* getKmer() {return mKmer;}
25 |     Genomes* getGenomes() {return mGenomes;}
26 |     KmerCollection* getKmerCollection() {return mKmerCollection;}
27 | 
28 | 
29 | private:
30 |     Options* mOptions;
31 |     Genomes* mGenomes;
32 |     Kmer* mKmer;
33 |     KmerCollection* mKmerCollection;
34 |     uint64 mHits;
35 | };
36 | 
37 | 
38 | #endif


--------------------------------------------------------------------------------
/src/writer.cpp:
--------------------------------------------------------------------------------
  1 | #include "writer.h"
  2 | #include "util.h"
  3 | #include "fastqreader.h"
  4 | #include <string.h>
  5 | 
  6 | Writer::Writer(string filename, int compression){
  7 | 	mCompression = compression;
  8 | 	mFilename = filename;
  9 | 	mZipFile = NULL;
 10 | 	mZipped = false;
 11 | 	haveToClose = true;
 12 | 	init();
 13 | }
 14 | 
 15 | Writer::Writer(ofstream* stream) {
 16 | 	mZipFile = NULL;
 17 | 	mZipped = false;
 18 | 	mOutStream = stream;
 19 | 	haveToClose = false;
 20 | }
 21 | 
 22 | Writer::Writer(gzFile gzfile) {
 23 | 	mOutStream = NULL;
 24 | 	mZipFile = gzfile;
 25 | 	mZipped = true;
 26 | 	haveToClose = false;
 27 | }
 28 | 
 29 | Writer::~Writer(){
 30 | 	if(haveToClose) {
 31 | 		close();
 32 | 	}
 33 | }
 34 | 
 35 | string Writer::filename(){
 36 | 	return mFilename;
 37 | }
 38 | 
 39 | void Writer::init(){
 40 | 	if (ends_with(mFilename, ".gz")){
 41 | 		mZipFile = gzopen(mFilename.c_str(), "w");
 42 |         gzsetparams(mZipFile, mCompression, Z_DEFAULT_STRATEGY);
 43 |         gzbuffer(mZipFile, 1024*1024);
 44 | 		mZipped = true;
 45 | 	}
 46 | 	else {
 47 | 		mOutStream = new ofstream();
 48 | 		mOutStream->open(mFilename.c_str(), ifstream::out);
 49 | 		mZipped = false;
 50 | 	}
 51 | }
 52 | 
 53 | bool Writer::writeLine(string& linestr){
 54 | 	const char* line = linestr.c_str();
 55 | 	size_t size = linestr.length();
 56 | 	size_t written;
 57 | 	bool status;
 58 | 	if(mZipped){
 59 | 		written = gzwrite(mZipFile, line, size);
 60 | 		gzputc(mZipFile, '\n');
 61 | 		status = size == written;
 62 | 	}
 63 | 	else{
 64 | 		mOutStream->write(line, size);
 65 | 		mOutStream->put('\n');
 66 | 		status = !mOutStream->fail();
 67 | 	}
 68 | 
 69 | 	return status;
 70 | }
 71 | 
 72 | bool Writer::writeString(string& str){
 73 | 	const char* strdata = str.c_str();
 74 | 	size_t size = str.length();
 75 | 	size_t written;
 76 | 	bool status;
 77 | 	if(mZipped){
 78 | 		written = gzwrite(mZipFile, strdata, size);
 79 | 		status = size == written;
 80 | 	}
 81 | 	else{
 82 | 		mOutStream->write(strdata, size);
 83 | 		status = !mOutStream->fail();
 84 | 	}
 85 | 
 86 | 	return status;
 87 | }
 88 | 
 89 | bool Writer::write(char* strdata, size_t size) {
 90 | 	size_t written;
 91 | 	bool status;
 92 | 	
 93 | 	if(mZipped){
 94 | 		written = gzwrite(mZipFile, strdata, size);
 95 | 		status = size == written;
 96 | 	}
 97 | 	else{
 98 | 		mOutStream->write(strdata, size);
 99 | 		status = !mOutStream->fail();
100 | 	}
101 | 	return status;
102 | }
103 | 
104 | void Writer::close(){
105 | 	if (mZipped){
106 | 		if (mZipFile){
107 | 			gzflush(mZipFile, Z_FINISH);
108 | 			gzclose(mZipFile);
109 | 			mZipFile = NULL;
110 | 		}
111 | 	}
112 | 	else if(mOutStream) {
113 | 		if (mOutStream->is_open()){
114 | 			mOutStream->flush();
115 | 			//TODO: following two lines will cause crash
116 | 			//mOutStream->close();
117 | 			//delete mOutStream;
118 | 			mOutStream = NULL;
119 | 		}
120 | 	}
121 | }
122 | 
123 | bool Writer::isZipped(){
124 | 	return mZipped;
125 | }


--------------------------------------------------------------------------------
/src/writer.h:
--------------------------------------------------------------------------------
 1 | #ifndef _WRITER_H
 2 | #define _WRITER_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #ifdef DYNAMIC_ZLIB
 7 |   #include <zlib.h>
 8 | #else
 9 |   #include "zlib/zlib.h"
10 | #endif
11 | #include "common.h"
12 | #include <iostream>
13 | #include <fstream>
14 | 
15 | using namespace std;
16 | 
17 | class Writer{
18 | public:
19 | 	Writer(string filename, int compression = 3);
20 | 	Writer(ofstream* stream);
21 | 	Writer(gzFile gzfile);
22 | 	~Writer();
23 | 	bool isZipped();
24 | 	bool writeString(string& s);
25 | 	bool writeLine(string& linestr);
26 | 	bool write(char* strdata, size_t size);
27 | 	string filename();
28 | 
29 | public:
30 | 	static bool test();
31 | 
32 | private:
33 | 	void init();
34 | 	void close();
35 | 
36 | private:
37 | 	string mFilename;
38 | 	gzFile mZipFile;
39 | 	ofstream* mOutStream;
40 | 	bool mZipped;
41 | 	int mCompression;
42 | 	bool haveToClose;
43 | };
44 | 
45 | #endif
46 | 


--------------------------------------------------------------------------------
/src/writerthread.cpp:
--------------------------------------------------------------------------------
 1 | #include "writerthread.h"
 2 | #include "util.h"
 3 | #include <memory.h>
 4 | #include <unistd.h>
 5 | 
 6 | WriterThread::WriterThread(Options* opt, string filename){
 7 |     mOptions = opt;
 8 | 
 9 |     mWriter1 = NULL;
10 | 
11 |     mInputCounter = 0;
12 |     mOutputCounter = 0;
13 |     mInputCompleted = false;
14 |     mFilename = filename;
15 | 
16 |     mRingBuffer = new char*[PACK_NUM_LIMIT];
17 |     memset(mRingBuffer, 0, sizeof(char*) * PACK_NUM_LIMIT);
18 |     mRingBufferSizes = new size_t[PACK_NUM_LIMIT];
19 |     memset(mRingBufferSizes, 0, sizeof(size_t) * PACK_NUM_LIMIT);
20 |     initWriter(filename);
21 | }
22 | 
23 | WriterThread::~WriterThread() {
24 |     cleanup();
25 |     delete mRingBuffer;
26 | }
27 | 
28 | bool WriterThread::isCompleted() 
29 | {
30 |     return mInputCompleted && (mOutputCounter == mInputCounter);
31 | }
32 | 
33 | bool WriterThread::setInputCompleted() {
34 |     mInputCompleted = true;
35 |     return true;
36 | }
37 | 
38 | void WriterThread::output(){
39 |     if(mOutputCounter >= mInputCounter) {
40 |         usleep(100);
41 |     }
42 |     while( mOutputCounter < mInputCounter) 
43 |     {
44 |         mWriter1->write(mRingBuffer[mOutputCounter], mRingBufferSizes[mOutputCounter]);
45 |         delete mRingBuffer[mOutputCounter];
46 |         mRingBuffer[mOutputCounter] = NULL;
47 |         mOutputCounter++;
48 |     }
49 | }
50 | 
51 | void  WriterThread::input(char* data, size_t size){
52 |     mRingBuffer[mInputCounter] = data;
53 |     mRingBufferSizes[mInputCounter] = size;
54 |     mInputCounter++;
55 | }
56 | 
57 | void WriterThread::cleanup() {
58 |     deleteWriter();
59 | }
60 | 
61 | void WriterThread::deleteWriter() {
62 |     if(mWriter1 != NULL) {
63 |         delete mWriter1;
64 |         mWriter1 = NULL;
65 |     }
66 | }
67 | 
68 | void WriterThread::initWriter(string filename1) {
69 |     deleteWriter();
70 |     mWriter1 = new Writer(filename1, mOptions->compression);
71 | }
72 | 
73 | void WriterThread::initWriter(ofstream* stream) {
74 |     deleteWriter();
75 |     mWriter1 = new Writer(stream);
76 | }
77 | 
78 | void WriterThread::initWriter(gzFile gzfile) {
79 |     deleteWriter();
80 |     mWriter1 = new Writer(gzfile);
81 | }
82 | 
83 | long WriterThread::bufferLength(){
84 |     return mInputCounter - mOutputCounter;
85 | }
86 | 


--------------------------------------------------------------------------------
/src/writerthread.h:
--------------------------------------------------------------------------------
 1 | #ifndef WRITER_THREAD_H
 2 | #define WRITER_THREAD_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string>
 7 | #include <vector>
 8 | #include "writer.h"
 9 | #include "options.h"
10 | #include <atomic>
11 | #include <mutex>
12 | 
13 | using namespace std;
14 | 
15 | class WriterThread{
16 | public:
17 |     WriterThread(Options* opt, string filename);
18 |     ~WriterThread();
19 | 
20 |     void initWriter(string filename1);
21 |     void initWriter(ofstream* stream);
22 |     void initWriter(gzFile gzfile);
23 | 
24 |     void cleanup();
25 | 
26 |     bool isCompleted();
27 |     void output();
28 |     void input(char* data, size_t size);
29 |     bool setInputCompleted();
30 | 
31 |     long bufferLength();
32 |     string getFilename() {return mFilename;}
33 | 
34 | private:
35 |     void deleteWriter();
36 | 
37 | private:
38 |     Writer* mWriter1;
39 |     Options* mOptions;
40 |     string mFilename;
41 | 
42 |     // for spliting output
43 |     bool mInputCompleted;
44 |     atomic_long mInputCounter;
45 |     atomic_long mOutputCounter;
46 |     char** mRingBuffer;
47 |     size_t* mRingBufferSizes;
48 | 
49 |     mutex mtx;
50 | 
51 | };
52 | 
53 | #endif


--------------------------------------------------------------------------------
/src/zlib/deflate.h:
--------------------------------------------------------------------------------
  1 | /* deflate.h -- internal compression state
  2 |  * Copyright (C) 1995-2012 Jean-loup Gailly
  3 |  * For conditions of distribution and use, see copyright notice in zlib.h
  4 |  */
  5 | 
  6 | /* WARNING: this file should *not* be used by applications. It is
  7 |    part of the implementation of the compression library and is
  8 |    subject to change. Applications should only use zlib.h.
  9 |  */
 10 | 
 11 | /* @(#) $Id$ */
 12 | 
 13 | #ifndef DEFLATE_H
 14 | #define DEFLATE_H
 15 | 
 16 | #include "zutil.h"
 17 | 
 18 | /* define NO_GZIP when compiling if you want to disable gzip header and
 19 |    trailer creation by deflate().  NO_GZIP would be used to avoid linking in
 20 |    the crc code when it is not needed.  For shared libraries, gzip encoding
 21 |    should be left enabled. */
 22 | #ifndef NO_GZIP
 23 | #  define GZIP
 24 | #endif
 25 | 
 26 | /* ===========================================================================
 27 |  * Internal compression state.
 28 |  */
 29 | 
 30 | #define LENGTH_CODES 29
 31 | /* number of length codes, not counting the special END_BLOCK code */
 32 | 
 33 | #define LITERALS  256
 34 | /* number of literal bytes 0..255 */
 35 | 
 36 | #define L_CODES (LITERALS+1+LENGTH_CODES)
 37 | /* number of Literal or Length codes, including the END_BLOCK code */
 38 | 
 39 | #define D_CODES   30
 40 | /* number of distance codes */
 41 | 
 42 | #define BL_CODES  19
 43 | /* number of codes used to transfer the bit lengths */
 44 | 
 45 | #define HEAP_SIZE (2*L_CODES+1)
 46 | /* maximum heap size */
 47 | 
 48 | #define MAX_BITS 15
 49 | /* All codes must not exceed MAX_BITS bits */
 50 | 
 51 | #define Buf_size 16
 52 | /* size of bit buffer in bi_buf */
 53 | 
 54 | #define INIT_STATE    42
 55 | #define EXTRA_STATE   69
 56 | #define NAME_STATE    73
 57 | #define COMMENT_STATE 91
 58 | #define HCRC_STATE   103
 59 | #define BUSY_STATE   113
 60 | #define FINISH_STATE 666
 61 | /* Stream status */
 62 | 
 63 | 
 64 | /* Data structure describing a single value and its code string. */
 65 | typedef struct ct_data_s {
 66 |     union {
 67 |         ush  freq;       /* frequency count */
 68 |         ush  code;       /* bit string */
 69 |     } fc;
 70 |     union {
 71 |         ush  dad;        /* father node in Huffman tree */
 72 |         ush  len;        /* length of bit string */
 73 |     } dl;
 74 | } FAR ct_data;
 75 | 
 76 | #define Freq fc.freq
 77 | #define Code fc.code
 78 | #define Dad  dl.dad
 79 | #define Len  dl.len
 80 | 
 81 | typedef struct static_tree_desc_s  static_tree_desc;
 82 | 
 83 | typedef struct tree_desc_s {
 84 |     ct_data *dyn_tree;           /* the dynamic tree */
 85 |     int     max_code;            /* largest code with non zero frequency */
 86 |     static_tree_desc *stat_desc; /* the corresponding static tree */
 87 | } FAR tree_desc;
 88 | 
 89 | typedef ush Pos;
 90 | typedef Pos FAR Posf;
 91 | typedef unsigned IPos;
 92 | 
 93 | /* A Pos is an index in the character window. We use short instead of int to
 94 |  * save space in the various tables. IPos is used only for parameter passing.
 95 |  */
 96 | 
 97 | typedef struct internal_state {
 98 |     z_streamp strm;      /* pointer back to this zlib stream */
 99 |     int   status;        /* as the name implies */
100 |     Bytef *pending_buf;  /* output still pending */
101 |     ulg   pending_buf_size; /* size of pending_buf */
102 |     Bytef *pending_out;  /* next pending byte to output to the stream */
103 |     uInt   pending;      /* nb of bytes in the pending buffer */
104 |     int   wrap;          /* bit 0 true for zlib, bit 1 true for gzip */
105 |     gz_headerp  gzhead;  /* gzip header information to write */
106 |     uInt   gzindex;      /* where in extra, name, or comment */
107 |     Byte  method;        /* can only be DEFLATED */
108 |     int   last_flush;    /* value of flush param for previous deflate call */
109 | 
110 |                 /* used by deflate.c: */
111 | 
112 |     uInt  w_size;        /* LZ77 window size (32K by default) */
113 |     uInt  w_bits;        /* log2(w_size)  (8..16) */
114 |     uInt  w_mask;        /* w_size - 1 */
115 | 
116 |     Bytef *window;
117 |     /* Sliding window. Input bytes are read into the second half of the window,
118 |      * and move to the first half later to keep a dictionary of at least wSize
119 |      * bytes. With this organization, matches are limited to a distance of
120 |      * wSize-MAX_MATCH bytes, but this ensures that IO is always
121 |      * performed with a length multiple of the block size. Also, it limits
122 |      * the window size to 64K, which is quite useful on MSDOS.
123 |      * To do: use the user input buffer as sliding window.
124 |      */
125 | 
126 |     ulg window_size;
127 |     /* Actual size of window: 2*wSize, except when the user input buffer
128 |      * is directly used as sliding window.
129 |      */
130 | 
131 |     Posf *prev;
132 |     /* Link to older string with same hash index. To limit the size of this
133 |      * array to 64K, this link is maintained only for the last 32K strings.
134 |      * An index in this array is thus a window index modulo 32K.
135 |      */
136 | 
137 |     Posf *head; /* Heads of the hash chains or NIL. */
138 | 
139 |     uInt  ins_h;          /* hash index of string to be inserted */
140 |     uInt  hash_size;      /* number of elements in hash table */
141 |     uInt  hash_bits;      /* log2(hash_size) */
142 |     uInt  hash_mask;      /* hash_size-1 */
143 | 
144 |     uInt  hash_shift;
145 |     /* Number of bits by which ins_h must be shifted at each input
146 |      * step. It must be such that after MIN_MATCH steps, the oldest
147 |      * byte no longer takes part in the hash key, that is:
148 |      *   hash_shift * MIN_MATCH >= hash_bits
149 |      */
150 | 
151 |     long block_start;
152 |     /* Window position at the beginning of the current output block. Gets
153 |      * negative when the window is moved backwards.
154 |      */
155 | 
156 |     uInt match_length;           /* length of best match */
157 |     IPos prev_match;             /* previous match */
158 |     int match_available;         /* set if previous match exists */
159 |     uInt strstart;               /* start of string to insert */
160 |     uInt match_start;            /* start of matching string */
161 |     uInt lookahead;              /* number of valid bytes ahead in window */
162 | 
163 |     uInt prev_length;
164 |     /* Length of the best match at previous step. Matches not greater than this
165 |      * are discarded. This is used in the lazy match evaluation.
166 |      */
167 | 
168 |     uInt max_chain_length;
169 |     /* To speed up deflation, hash chains are never searched beyond this
170 |      * length.  A higher limit improves compression ratio but degrades the
171 |      * speed.
172 |      */
173 | 
174 |     uInt max_lazy_match;
175 |     /* Attempt to find a better match only when the current match is strictly
176 |      * smaller than this value. This mechanism is used only for compression
177 |      * levels >= 4.
178 |      */
179 | #   define max_insert_length  max_lazy_match
180 |     /* Insert new strings in the hash table only if the match length is not
181 |      * greater than this length. This saves time but degrades compression.
182 |      * max_insert_length is used only for compression levels <= 3.
183 |      */
184 | 
185 |     int level;    /* compression level (1..9) */
186 |     int strategy; /* favor or force Huffman coding*/
187 | 
188 |     uInt good_match;
189 |     /* Use a faster search when the previous match is longer than this */
190 | 
191 |     int nice_match; /* Stop searching when current match exceeds this */
192 | 
193 |                 /* used by trees.c: */
194 |     /* Didn't use ct_data typedef below to suppress compiler warning */
195 |     struct ct_data_s dyn_ltree[HEAP_SIZE];   /* literal and length tree */
196 |     struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */
197 |     struct ct_data_s bl_tree[2*BL_CODES+1];  /* Huffman tree for bit lengths */
198 | 
199 |     struct tree_desc_s l_desc;               /* desc. for literal tree */
200 |     struct tree_desc_s d_desc;               /* desc. for distance tree */
201 |     struct tree_desc_s bl_desc;              /* desc. for bit length tree */
202 | 
203 |     ush bl_count[MAX_BITS+1];
204 |     /* number of codes at each bit length for an optimal tree */
205 | 
206 |     int heap[2*L_CODES+1];      /* heap used to build the Huffman trees */
207 |     int heap_len;               /* number of elements in the heap */
208 |     int heap_max;               /* element of largest frequency */
209 |     /* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
210 |      * The same heap array is used to build all trees.
211 |      */
212 | 
213 |     uch depth[2*L_CODES+1];
214 |     /* Depth of each subtree used as tie breaker for trees of equal frequency
215 |      */
216 | 
217 |     uchf *l_buf;          /* buffer for literals or lengths */
218 | 
219 |     uInt  lit_bufsize;
220 |     /* Size of match buffer for literals/lengths.  There are 4 reasons for
221 |      * limiting lit_bufsize to 64K:
222 |      *   - frequencies can be kept in 16 bit counters
223 |      *   - if compression is not successful for the first block, all input
224 |      *     data is still in the window so we can still emit a stored block even
225 |      *     when input comes from standard input.  (This can also be done for
226 |      *     all blocks if lit_bufsize is not greater than 32K.)
227 |      *   - if compression is not successful for a file smaller than 64K, we can
228 |      *     even emit a stored file instead of a stored block (saving 5 bytes).
229 |      *     This is applicable only for zip (not gzip or zlib).
230 |      *   - creating new Huffman trees less frequently may not provide fast
231 |      *     adaptation to changes in the input data statistics. (Take for
232 |      *     example a binary file with poorly compressible code followed by
233 |      *     a highly compressible string table.) Smaller buffer sizes give
234 |      *     fast adaptation but have of course the overhead of transmitting
235 |      *     trees more frequently.
236 |      *   - I can't count above 4
237 |      */
238 | 
239 |     uInt last_lit;      /* running index in l_buf */
240 | 
241 |     ushf *d_buf;
242 |     /* Buffer for distances. To simplify the code, d_buf and l_buf have
243 |      * the same number of elements. To use different lengths, an extra flag
244 |      * array would be necessary.
245 |      */
246 | 
247 |     ulg opt_len;        /* bit length of current block with optimal trees */
248 |     ulg static_len;     /* bit length of current block with static trees */
249 |     uInt matches;       /* number of string matches in current block */
250 |     uInt insert;        /* bytes at end of window left to insert */
251 | 
252 | #ifdef DEBUG
253 |     ulg compressed_len; /* total bit length of compressed file mod 2^32 */
254 |     ulg bits_sent;      /* bit length of compressed data sent mod 2^32 */
255 | #endif
256 | 
257 |     ush bi_buf;
258 |     /* Output buffer. bits are inserted starting at the bottom (least
259 |      * significant bits).
260 |      */
261 |     int bi_valid;
262 |     /* Number of valid bits in bi_buf.  All bits above the last valid bit
263 |      * are always zero.
264 |      */
265 | 
266 |     ulg high_water;
267 |     /* High water mark offset in window for initialized bytes -- bytes above
268 |      * this are set to zero in order to avoid memory check warnings when
269 |      * longest match routines access bytes past the input.  This is then
270 |      * updated to the new high water mark.
271 |      */
272 | 
273 | } FAR deflate_state;
274 | 
275 | /* Output a byte on the stream: store c at s->pending_buf[s->pending] and
276 |  * advance s->pending. IN assertion: there is enough room in pending_buf.
277 |  */
278 | #define put_byte(s, c) {s->pending_buf[s->pending++] = (c);}
279 | 
280 | 
281 | #define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
282 | /* Minimum amount of lookahead, except at the end of the input file.
283 |  * See deflate.c for comments about the MIN_MATCH+1.
284 |  */
285 | 
286 | #define MAX_DIST(s)  ((s)->w_size-MIN_LOOKAHEAD)
287 | /* In order to simplify the code, particularly on 16 bit machines, match
288 |  * distances are limited to MAX_DIST instead of WSIZE.
289 |  */
290 | 
291 | #define WIN_INIT MAX_MATCH
292 | /* Number of bytes after end of data in window to initialize in order to avoid
293 |    memory checker errors from longest match routines */
294 | 
295 |         /* in trees.c */
296 | void ZLIB_INTERNAL _tr_init OF((deflate_state *s));
297 | int ZLIB_INTERNAL _tr_tally OF((deflate_state *s, unsigned dist, unsigned lc));
298 | void ZLIB_INTERNAL _tr_flush_block OF((deflate_state *s, charf *buf,
299 |                         ulg stored_len, int last));
300 | void ZLIB_INTERNAL _tr_flush_bits OF((deflate_state *s));
301 | void ZLIB_INTERNAL _tr_align OF((deflate_state *s));
302 | void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf,
303 |                         ulg stored_len, int last));
304 | 
305 | #define d_code(dist) \
306 |    ((dist) < 256 ? _dist_code[dist] : _dist_code[256+((dist)>>7)])
307 | /* Mapping from a distance to a distance code. dist is the distance - 1 and
308 |  * must not have side effects (the macro evaluates it twice). For dist >= 256
309 |  * the index is at least 258, so _dist_code[256] and [257] are never used.
310 |  */
311 | 
312 | #ifndef DEBUG
313 | /* Inline versions of _tr_tally for speed (used when DEBUG is not defined): */
314 | 
315 | #if defined(GEN_TREES_H) || !defined(STDC)
316 |   extern uch ZLIB_INTERNAL _length_code[];
317 |   extern uch ZLIB_INTERNAL _dist_code[];
318 | #else
319 |   extern const uch ZLIB_INTERNAL _length_code[];
320 |   extern const uch ZLIB_INTERNAL _dist_code[];
321 | #endif
322 | 
323 | # define _tr_tally_lit(s, c, flush) \
324 |   { uch cc = (c); \
325 |     s->d_buf[s->last_lit] = 0;        /* literals are stored with distance 0 */ \
326 |     s->l_buf[s->last_lit++] = cc; \
327 |     s->dyn_ltree[cc].Freq++;          /* count this literal */ \
328 |     flush = (s->last_lit == s->lit_bufsize-1); /* true when one slot is left */ \
329 |    }
330 | # define _tr_tally_dist(s, distance, length, flush) \
331 |   { uch len = (length); \
332 |     ush dist = (distance); \
333 |     s->d_buf[s->last_lit] = dist; \
334 |     s->l_buf[s->last_lit++] = len; \
335 |     dist--;                           /* d_code() takes the distance - 1 */ \
336 |     s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++; /* length code slot */ \
337 |     s->dyn_dtree[d_code(dist)].Freq++; \
338 |     flush = (s->last_lit == s->lit_bufsize-1); /* true when one slot is left */ \
339 |   }
340 | #else /* DEBUG: call the out-of-line _tr_tally() instead */
341 | # define _tr_tally_lit(s, c, flush) flush = _tr_tally(s, 0, c)
342 | # define _tr_tally_dist(s, distance, length, flush) \
343 |               flush = _tr_tally(s, distance, length)
344 | #endif
345 | 
346 | #endif /* DEFLATE_H */
347 | 


--------------------------------------------------------------------------------
/src/zlib/gzguts.h:
--------------------------------------------------------------------------------
  1 | /* gzguts.h -- zlib internal header definitions for gz* operations
  2 |  * Copyright (C) 2004, 2005, 2010, 2011, 2012, 2013 Mark Adler
  3 |  * For conditions of distribution and use, see copyright notice in zlib.h
  4 |  */
  5 | 
  6 | #ifdef _LARGEFILE64_SOURCE
  7 | #  ifndef _LARGEFILE_SOURCE
  8 | #    define _LARGEFILE_SOURCE 1
  9 | #  endif
 10 | #  ifdef _FILE_OFFSET_BITS
 11 | #    undef _FILE_OFFSET_BITS
 12 | #  endif
 13 | #endif
 14 | 
 15 | #ifdef HAVE_HIDDEN
 16 | #  define ZLIB_INTERNAL __attribute__((visibility ("hidden")))
 17 | #else
 18 | #  define ZLIB_INTERNAL
 19 | #endif
 20 | 
 21 | #include <stdio.h>
 22 | #include "zlib.h"
 23 | #ifdef STDC
 24 | #  include <string.h>
 25 | #  include <stdlib.h>
 26 | #  include <limits.h>
 27 | #endif
 28 | #include <fcntl.h>
 29 | 
 30 | #ifdef _WIN32
 31 | #  include <stddef.h>
 32 | #endif
 33 | 
 34 | #if defined(__TURBOC__) || defined(_MSC_VER) || defined(_WIN32)
 35 | #  include <io.h>
 36 | #endif
 37 | 
 38 | #ifdef WINAPI_FAMILY
 39 | #  define open _open
 40 | #  define read _read
 41 | #  define write _write
 42 | #  define close _close
 43 | #endif
 44 | 
 45 | #ifdef NO_DEFLATE       /* for compatibility with old definition */
 46 | #  define NO_GZCOMPRESS
 47 | #endif
 48 | 
 49 | #if defined(STDC99) || (defined(__TURBOC__) && __TURBOC__ >= 0x550)
 50 | #  ifndef HAVE_VSNPRINTF
 51 | #    define HAVE_VSNPRINTF
 52 | #  endif
 53 | #endif
 54 | 
 55 | #if defined(__CYGWIN__)
 56 | #  ifndef HAVE_VSNPRINTF
 57 | #    define HAVE_VSNPRINTF
 58 | #  endif
 59 | #endif
 60 | 
 61 | #if defined(MSDOS) && defined(__BORLANDC__) && (__BORLANDC__ > 0x410)
 62 | #  ifndef HAVE_VSNPRINTF   /* treat vsnprintf() as available on this compiler */
 63 | #    define HAVE_VSNPRINTF
 64 | #  endif
 65 | #endif /* version test uses __BORLANDC__; plain BORLANDC is 0 in #if */
 66 | 
 67 | #ifndef HAVE_VSNPRINTF
 68 | #  ifdef MSDOS
 69 | /* vsnprintf may exist on some MS-DOS compilers (DJGPP?),
 70 |    but for now we just assume it doesn't. */
 71 | #    define NO_vsnprintf
 72 | #  endif
 73 | #  ifdef __TURBOC__
 74 | #    define NO_vsnprintf
 75 | #  endif
 76 | #  ifdef WIN32
 77 | /* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */
 78 | #    if !defined(vsnprintf) && !defined(NO_vsnprintf)
 79 | #      if !defined(_MSC_VER) || ( defined(_MSC_VER) && _MSC_VER < 1500 )
 80 | #         define vsnprintf _vsnprintf
 81 | #      endif
 82 | #    endif
 83 | #  endif
 84 | #  ifdef __SASC
 85 | #    define NO_vsnprintf
 86 | #  endif
 87 | #  ifdef VMS
 88 | #    define NO_vsnprintf
 89 | #  endif
 90 | #  ifdef __OS400__
 91 | #    define NO_vsnprintf
 92 | #  endif
 93 | #  ifdef __MVS__
 94 | #    define NO_vsnprintf
 95 | #  endif
 96 | #endif
 97 | 
 98 | /* unlike snprintf (which is required in C99, yet still not supported by
 99 |    Microsoft more than a decade later!), _snprintf does not guarantee null
100 |    termination of the result -- however this is only used in gzlib.c where
101 |    the result is assured to fit in the space provided */
102 | #ifdef _MSC_VER
103 | #  define snprintf _snprintf
104 | #endif
105 | 
106 | #ifndef local
107 | #  define local static
108 | #endif
109 | /* compile with -Dlocal if your debugger can't find static symbols */
110 | 
111 | /* gz* functions always use library allocation functions */
112 | #ifndef STDC
113 |   extern voidp  malloc OF((uInt size));
114 |   extern void   free   OF((voidpf ptr));
115 | #endif
116 | 
117 | /* get errno and strerror definition */
118 | #if defined UNDER_CE
119 | #  include <windows.h>
120 | #  define zstrerror() gz_strwinerror((DWORD)GetLastError())
121 | #else
122 | #  ifndef NO_STRERROR
123 | #    include <errno.h>
124 | #    define zstrerror() strerror(errno)
125 | #  else
126 | #    define zstrerror() "stdio error (consult errno)"
127 | #  endif
128 | #endif
129 | 
130 | /* provide prototypes for these when building zlib without LFS */
131 | #if !defined(_LARGEFILE64_SOURCE) || _LFS64_LARGEFILE-0 == 0
132 |     ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *));
133 |     ZEXTERN z_off64_t ZEXPORT gzseek64 OF((gzFile, z_off64_t, int));
134 |     ZEXTERN z_off64_t ZEXPORT gztell64 OF((gzFile));
135 |     ZEXTERN z_off64_t ZEXPORT gzoffset64 OF((gzFile));
136 | #endif
137 | 
138 | /* default memLevel */
139 | #if MAX_MEM_LEVEL >= 8
140 | #  define DEF_MEM_LEVEL 8
141 | #else
142 | #  define DEF_MEM_LEVEL  MAX_MEM_LEVEL
143 | #endif
144 | 
145 | /* default i/o buffer size -- double this for output when reading (this and
146 |    twice this must be able to fit in an unsigned type) */
147 | #define GZBUFSIZE 8192
148 | 
149 | /* gzip modes, also provide a little integrity check on the passed structure */
150 | #define GZ_NONE 0
151 | #define GZ_READ 7247
152 | #define GZ_WRITE 31153
153 | #define GZ_APPEND 1     /* mode set to GZ_WRITE after the file is opened */
154 | 
155 | /* values for gz_state how */
156 | #define LOOK 0      /* look for a gzip header */
157 | #define COPY 1      /* copy input directly */
158 | #define GZIP 2      /* decompress a gzip stream */
159 | 
160 | /* internal gzip file state data structure used by the gz* operations */
161 | typedef struct {
162 |         /* exposed contents for gzgetc() macro */
163 |     struct gzFile_s x;      /* "x" for exposed */
164 |                             /* x.have: number of bytes available at x.next */
165 |                             /* x.next: next output data to deliver or write */
166 |                             /* x.pos: current position in uncompressed data */
167 |         /* used for both reading and writing */
168 |     int mode;               /* one of the GZ_* gzip modes defined above */
169 |     int fd;                 /* file descriptor */
170 |     char *path;             /* path or fd for error messages */
171 |     unsigned size;          /* buffer size, zero if not allocated yet */
172 |     unsigned want;          /* requested buffer size, default is GZBUFSIZE */
173 |     unsigned char *in;      /* input buffer */
174 |     unsigned char *out;     /* output buffer (double-sized when reading) */
175 |     int direct;             /* 0 if processing gzip, 1 if transparent */
176 |         /* just for reading */
177 |     int how;                /* LOOK: get header, COPY: copy, GZIP: decompress */
178 |     z_off64_t start;        /* where the gzip data started, for rewinding */
179 |     int eof;                /* true if end of input file reached */
180 |     int past;               /* true if read requested past end */
181 |         /* just for writing */
182 |     int level;              /* compression level */
183 |     int strategy;           /* compression strategy */
184 |         /* seek request */
185 |     z_off64_t skip;         /* amount to skip (already rewound if backwards) */
186 |     int seek;               /* true if seek request pending */
187 |         /* error information */
188 |     int err;                /* error code */
189 |     char *msg;              /* error message */
190 |         /* zlib inflate or deflate stream */
191 |     z_stream strm;          /* stream structure in-place (not a pointer) */
192 | } gz_state;
193 | typedef gz_state FAR *gz_statep;    /* pointer form used by gz_error() below */
194 | 
195 | /* shared functions */
196 | void ZLIB_INTERNAL gz_error OF((gz_statep, int, const char *));
197 | #if defined UNDER_CE
198 | char ZLIB_INTERNAL *gz_strwinerror OF((DWORD error));
199 | #endif
200 | 
201 | /* GT_OFF(x), where x is an unsigned value, is true if x > maximum z_off64_t
202 |    value -- needed when comparing unsigned to z_off64_t, which is signed
203 |    (possible z_off64_t types off_t, off64_t, and long are all signed) */
204 | #ifdef INT_MAX
205 | #  define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > INT_MAX)
206 | #else
207 | unsigned ZLIB_INTERNAL gz_intmax OF((void));
208 | #  define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > gz_intmax())
209 | #endif
210 | 


--------------------------------------------------------------------------------
/src/zlib/inffast.h:
--------------------------------------------------------------------------------
 1 | /* inffast.h -- header to use inffast.c
 2 |  * Copyright (C) 1995-2003, 2010 Mark Adler
 3 |  * For conditions of distribution and use, see copyright notice in zlib.h
 4 |  */
 5 | 
 6 | /* WARNING: this file should *not* be used by applications. It is
 7 |    part of the implementation of the compression library and is
 8 |    subject to change. Applications should only use zlib.h.
 9 |  */
10 | 
11 | void ZLIB_INTERNAL inflate_fast OF((z_streamp strm, unsigned start));
12 | 


--------------------------------------------------------------------------------
/src/zlib/inffixed.h:
--------------------------------------------------------------------------------
 1 |     /* inffixed.h -- table for decoding fixed codes
 2 |      * Generated automatically by makefixed().
 3 |      */
 4 | 
 5 |     /* WARNING: this file should *not* be used by applications.
 6 |        It is part of the implementation of this library and is
 7 |        subject to change. Applications should only use zlib.h.
 8 |      */
 9 | 
10 |     static const code lenfix[512] = {
11 |         {96,7,0},{0,8,80},{0,8,16},{20,8,115},{18,7,31},{0,8,112},{0,8,48},
12 |         {0,9,192},{16,7,10},{0,8,96},{0,8,32},{0,9,160},{0,8,0},{0,8,128},
13 |         {0,8,64},{0,9,224},{16,7,6},{0,8,88},{0,8,24},{0,9,144},{19,7,59},
14 |         {0,8,120},{0,8,56},{0,9,208},{17,7,17},{0,8,104},{0,8,40},{0,9,176},
15 |         {0,8,8},{0,8,136},{0,8,72},{0,9,240},{16,7,4},{0,8,84},{0,8,20},
16 |         {21,8,227},{19,7,43},{0,8,116},{0,8,52},{0,9,200},{17,7,13},{0,8,100},
17 |         {0,8,36},{0,9,168},{0,8,4},{0,8,132},{0,8,68},{0,9,232},{16,7,8},
18 |         {0,8,92},{0,8,28},{0,9,152},{20,7,83},{0,8,124},{0,8,60},{0,9,216},
19 |         {18,7,23},{0,8,108},{0,8,44},{0,9,184},{0,8,12},{0,8,140},{0,8,76},
20 |         {0,9,248},{16,7,3},{0,8,82},{0,8,18},{21,8,163},{19,7,35},{0,8,114},
21 |         {0,8,50},{0,9,196},{17,7,11},{0,8,98},{0,8,34},{0,9,164},{0,8,2},
22 |         {0,8,130},{0,8,66},{0,9,228},{16,7,7},{0,8,90},{0,8,26},{0,9,148},
23 |         {20,7,67},{0,8,122},{0,8,58},{0,9,212},{18,7,19},{0,8,106},{0,8,42},
24 |         {0,9,180},{0,8,10},{0,8,138},{0,8,74},{0,9,244},{16,7,5},{0,8,86},
25 |         {0,8,22},{64,8,0},{19,7,51},{0,8,118},{0,8,54},{0,9,204},{17,7,15},
26 |         {0,8,102},{0,8,38},{0,9,172},{0,8,6},{0,8,134},{0,8,70},{0,9,236},
27 |         {16,7,9},{0,8,94},{0,8,30},{0,9,156},{20,7,99},{0,8,126},{0,8,62},
28 |         {0,9,220},{18,7,27},{0,8,110},{0,8,46},{0,9,188},{0,8,14},{0,8,142},
29 |         {0,8,78},{0,9,252},{96,7,0},{0,8,81},{0,8,17},{21,8,131},{18,7,31},
30 |         {0,8,113},{0,8,49},{0,9,194},{16,7,10},{0,8,97},{0,8,33},{0,9,162},
31 |         {0,8,1},{0,8,129},{0,8,65},{0,9,226},{16,7,6},{0,8,89},{0,8,25},
32 |         {0,9,146},{19,7,59},{0,8,121},{0,8,57},{0,9,210},{17,7,17},{0,8,105},
33 |         {0,8,41},{0,9,178},{0,8,9},{0,8,137},{0,8,73},{0,9,242},{16,7,4},
34 |         {0,8,85},{0,8,21},{16,8,258},{19,7,43},{0,8,117},{0,8,53},{0,9,202},
35 |         {17,7,13},{0,8,101},{0,8,37},{0,9,170},{0,8,5},{0,8,133},{0,8,69},
36 |         {0,9,234},{16,7,8},{0,8,93},{0,8,29},{0,9,154},{20,7,83},{0,8,125},
37 |         {0,8,61},{0,9,218},{18,7,23},{0,8,109},{0,8,45},{0,9,186},{0,8,13},
38 |         {0,8,141},{0,8,77},{0,9,250},{16,7,3},{0,8,83},{0,8,19},{21,8,195},
39 |         {19,7,35},{0,8,115},{0,8,51},{0,9,198},{17,7,11},{0,8,99},{0,8,35},
40 |         {0,9,166},{0,8,3},{0,8,131},{0,8,67},{0,9,230},{16,7,7},{0,8,91},
41 |         {0,8,27},{0,9,150},{20,7,67},{0,8,123},{0,8,59},{0,9,214},{18,7,19},
42 |         {0,8,107},{0,8,43},{0,9,182},{0,8,11},{0,8,139},{0,8,75},{0,9,246},
43 |         {16,7,5},{0,8,87},{0,8,23},{64,8,0},{19,7,51},{0,8,119},{0,8,55},
44 |         {0,9,206},{17,7,15},{0,8,103},{0,8,39},{0,9,174},{0,8,7},{0,8,135},
45 |         {0,8,71},{0,9,238},{16,7,9},{0,8,95},{0,8,31},{0,9,158},{20,7,99},
46 |         {0,8,127},{0,8,63},{0,9,222},{18,7,27},{0,8,111},{0,8,47},{0,9,190},
47 |         {0,8,15},{0,8,143},{0,8,79},{0,9,254},{96,7,0},{0,8,80},{0,8,16},
48 |         {20,8,115},{18,7,31},{0,8,112},{0,8,48},{0,9,193},{16,7,10},{0,8,96},
49 |         {0,8,32},{0,9,161},{0,8,0},{0,8,128},{0,8,64},{0,9,225},{16,7,6},
50 |         {0,8,88},{0,8,24},{0,9,145},{19,7,59},{0,8,120},{0,8,56},{0,9,209},
51 |         {17,7,17},{0,8,104},{0,8,40},{0,9,177},{0,8,8},{0,8,136},{0,8,72},
52 |         {0,9,241},{16,7,4},{0,8,84},{0,8,20},{21,8,227},{19,7,43},{0,8,116},
53 |         {0,8,52},{0,9,201},{17,7,13},{0,8,100},{0,8,36},{0,9,169},{0,8,4},
54 |         {0,8,132},{0,8,68},{0,9,233},{16,7,8},{0,8,92},{0,8,28},{0,9,153},
55 |         {20,7,83},{0,8,124},{0,8,60},{0,9,217},{18,7,23},{0,8,108},{0,8,44},
56 |         {0,9,185},{0,8,12},{0,8,140},{0,8,76},{0,9,249},{16,7,3},{0,8,82},
57 |         {0,8,18},{21,8,163},{19,7,35},{0,8,114},{0,8,50},{0,9,197},{17,7,11},
58 |         {0,8,98},{0,8,34},{0,9,165},{0,8,2},{0,8,130},{0,8,66},{0,9,229},
59 |         {16,7,7},{0,8,90},{0,8,26},{0,9,149},{20,7,67},{0,8,122},{0,8,58},
60 |         {0,9,213},{18,7,19},{0,8,106},{0,8,42},{0,9,181},{0,8,10},{0,8,138},
61 |         {0,8,74},{0,9,245},{16,7,5},{0,8,86},{0,8,22},{64,8,0},{19,7,51},
62 |         {0,8,118},{0,8,54},{0,9,205},{17,7,15},{0,8,102},{0,8,38},{0,9,173},
63 |         {0,8,6},{0,8,134},{0,8,70},{0,9,237},{16,7,9},{0,8,94},{0,8,30},
64 |         {0,9,157},{20,7,99},{0,8,126},{0,8,62},{0,9,221},{18,7,27},{0,8,110},
65 |         {0,8,46},{0,9,189},{0,8,14},{0,8,142},{0,8,78},{0,9,253},{96,7,0},
66 |         {0,8,81},{0,8,17},{21,8,131},{18,7,31},{0,8,113},{0,8,49},{0,9,195},
67 |         {16,7,10},{0,8,97},{0,8,33},{0,9,163},{0,8,1},{0,8,129},{0,8,65},
68 |         {0,9,227},{16,7,6},{0,8,89},{0,8,25},{0,9,147},{19,7,59},{0,8,121},
69 |         {0,8,57},{0,9,211},{17,7,17},{0,8,105},{0,8,41},{0,9,179},{0,8,9},
70 |         {0,8,137},{0,8,73},{0,9,243},{16,7,4},{0,8,85},{0,8,21},{16,8,258},
71 |         {19,7,43},{0,8,117},{0,8,53},{0,9,203},{17,7,13},{0,8,101},{0,8,37},
72 |         {0,9,171},{0,8,5},{0,8,133},{0,8,69},{0,9,235},{16,7,8},{0,8,93},
73 |         {0,8,29},{0,9,155},{20,7,83},{0,8,125},{0,8,61},{0,9,219},{18,7,23},
74 |         {0,8,109},{0,8,45},{0,9,187},{0,8,13},{0,8,141},{0,8,77},{0,9,251},
75 |         {16,7,3},{0,8,83},{0,8,19},{21,8,195},{19,7,35},{0,8,115},{0,8,51},
76 |         {0,9,199},{17,7,11},{0,8,99},{0,8,35},{0,9,167},{0,8,3},{0,8,131},
77 |         {0,8,67},{0,9,231},{16,7,7},{0,8,91},{0,8,27},{0,9,151},{20,7,67},
78 |         {0,8,123},{0,8,59},{0,9,215},{18,7,19},{0,8,107},{0,8,43},{0,9,183},
79 |         {0,8,11},{0,8,139},{0,8,75},{0,9,247},{16,7,5},{0,8,87},{0,8,23},
80 |         {64,8,0},{19,7,51},{0,8,119},{0,8,55},{0,9,207},{17,7,15},{0,8,103},
81 |         {0,8,39},{0,9,175},{0,8,7},{0,8,135},{0,8,71},{0,9,239},{16,7,9},
82 |         {0,8,95},{0,8,31},{0,9,159},{20,7,99},{0,8,127},{0,8,63},{0,9,223},
83 |         {18,7,27},{0,8,111},{0,8,47},{0,9,191},{0,8,15},{0,8,143},{0,8,79},
84 |         {0,9,255}
85 |     };
86 | 
87 |     static const code distfix[32] = {
88 |         {16,5,1},{23,5,257},{19,5,17},{27,5,4097},{17,5,5},{25,5,1025},
89 |         {21,5,65},{29,5,16385},{16,5,3},{24,5,513},{20,5,33},{28,5,8193},
90 |         {18,5,9},{26,5,2049},{22,5,129},{64,5,0},{16,5,2},{23,5,385},
91 |         {19,5,25},{27,5,6145},{17,5,7},{25,5,1537},{21,5,97},{29,5,24577},
92 |         {16,5,4},{24,5,769},{20,5,49},{28,5,12289},{18,5,13},{26,5,3073},
93 |         {22,5,193},{64,5,0}
94 |     };
95 | 


--------------------------------------------------------------------------------
/src/zlib/inflate.h:
--------------------------------------------------------------------------------
  1 | /* inflate.h -- internal inflate state definition
  2 |  * Copyright (C) 1995-2009 Mark Adler
  3 |  * For conditions of distribution and use, see copyright notice in zlib.h
  4 |  */
  5 | 
  6 | /* WARNING: this file should *not* be used by applications. It is
  7 |    part of the implementation of the compression library and is
  8 |    subject to change. Applications should only use zlib.h.
  9 |  */
 10 | 
 11 | /* define NO_GZIP when compiling if you want to disable gzip header and
 12 |    trailer decoding by inflate().  NO_GZIP would be used to avoid linking in
 13 |    the crc code when it is not needed.  For shared libraries, gzip decoding
 14 |    should be left enabled. */
 15 | #ifndef NO_GZIP
 16 | #  define GUNZIP
 17 | #endif
 18 | 
 19 | /* Possible inflate modes between inflate() calls */
 20 | typedef enum {
 21 |     HEAD,       /* i: waiting for magic header */
 22 |     FLAGS,      /* i: waiting for method and flags (gzip) */
 23 |     TIME,       /* i: waiting for modification time (gzip) */
 24 |     OS,         /* i: waiting for extra flags and operating system (gzip) */
 25 |     EXLEN,      /* i: waiting for extra length (gzip) */
 26 |     EXTRA,      /* i: waiting for extra bytes (gzip) */
 27 |     NAME,       /* i: waiting for end of file name (gzip) */
 28 |     COMMENT,    /* i: waiting for end of comment (gzip) */
 29 |     HCRC,       /* i: waiting for header crc (gzip) */
 30 |     DICTID,     /* i: waiting for dictionary check value */
 31 |     DICT,       /* waiting for inflateSetDictionary() call */
 32 |         TYPE,       /* i: waiting for type bits, including last-flag bit */
 33 |         TYPEDO,     /* i: same, but skip check to exit inflate on new block */
 34 |         STORED,     /* i: waiting for stored size (length and complement) */
 35 |         COPY_,      /* i/o: same as COPY below, but only first time in */
 36 |         COPY,       /* i/o: waiting for input or output to copy stored block */
 37 |         TABLE,      /* i: waiting for dynamic block table lengths */
 38 |         LENLENS,    /* i: waiting for code length code lengths */
 39 |         CODELENS,   /* i: waiting for length/lit and distance code lengths */
 40 |             LEN_,       /* i: same as LEN below, but only first time in */
 41 |             LEN,        /* i: waiting for length/lit/eob code */
 42 |             LENEXT,     /* i: waiting for length extra bits */
 43 |             DIST,       /* i: waiting for distance code */
 44 |             DISTEXT,    /* i: waiting for distance extra bits */
 45 |             MATCH,      /* o: waiting for output space to copy string */
 46 |             LIT,        /* o: waiting for output space to write literal */
 47 |     CHECK,      /* i: waiting for 32-bit check value */
 48 |     LENGTH,     /* i: waiting for 32-bit length (gzip) */
 49 |     DONE,       /* finished check, done -- remain here until reset */
 50 |     BAD,        /* got a data error -- remain here until reset */
 51 |     MEM,        /* got an inflate() memory error -- remain here until reset */
 52 |     SYNC        /* looking for synchronization bytes to restart inflate() */
 53 | } inflate_mode;
 54 | 
 55 | /*
 56 |     State transitions between above modes -
 57 | 
 58 |     (most modes can go to BAD or MEM on error -- not shown for clarity)
 59 | 
 60 |     Process header:
 61 |         HEAD -> (gzip) or (zlib) or (raw)
 62 |         (gzip) -> FLAGS -> TIME -> OS -> EXLEN -> EXTRA -> NAME -> COMMENT ->
 63 |                   HCRC -> TYPE
 64 |         (zlib) -> DICTID or TYPE
 65 |         DICTID -> DICT -> TYPE
 66 |         (raw) -> TYPEDO
 67 |     Read deflate blocks:
 68 |             TYPE -> TYPEDO -> STORED or TABLE or LEN_ or CHECK
 69 |             STORED -> COPY_ -> COPY -> TYPE
 70 |             TABLE -> LENLENS -> CODELENS -> LEN_
 71 |             LEN_ -> LEN
 72 |     Read deflate codes in fixed or dynamic block:
 73 |                 LEN -> LENEXT or LIT or TYPE
 74 |                 LENEXT -> DIST -> DISTEXT -> MATCH -> LEN
 75 |                 LIT -> LEN
 76 |     Process trailer:
 77 |         CHECK -> LENGTH -> DONE
 78 |  */
 79 | 
 80 | /* state kept between inflate() calls; about 7K bytes excluding the window */
 81 | struct inflate_state {
 82 |     inflate_mode mode;          /* current inflate mode */
 83 |     int last;                   /* true if processing last block */
 84 |     int wrap;                   /* bit 0 true for zlib, bit 1 true for gzip */
 85 |     int havedict;               /* true if dictionary provided */
 86 |     int flags;                  /* gzip header method and flags (0 if zlib) */
 87 |     unsigned dmax;              /* zlib header max distance (INFLATE_STRICT) */
 88 |     unsigned long check;        /* protected copy of check value */
 89 |     unsigned long total;        /* protected copy of output count */
 90 |     gz_headerp head;            /* where to save gzip header information */
 91 |         /* sliding window */
 92 |     unsigned wbits;             /* log base 2 of requested window size */
 93 |     unsigned wsize;             /* window size or zero if not using window */
 94 |     unsigned whave;             /* valid bytes in the window */
 95 |     unsigned wnext;             /* window write index */
 96 |     unsigned char FAR *window;  /* allocated sliding window, if needed */
 97 |         /* bit accumulator */
 98 |     unsigned long hold;         /* input bit accumulator */
 99 |     unsigned bits;              /* number of bits in hold */
100 |         /* for string and stored block copying */
101 |     unsigned length;            /* literal or length of data to copy */
102 |     unsigned offset;            /* distance back to copy string from */
103 |         /* for table and code decoding */
104 |     unsigned extra;             /* extra bits needed */
105 |         /* fixed and dynamic code tables */
106 |     code const FAR *lencode;    /* starting table for length/literal codes */
107 |     code const FAR *distcode;   /* starting table for distance codes */
108 |     unsigned lenbits;           /* index bits for lencode */
109 |     unsigned distbits;          /* index bits for distcode */
110 |         /* dynamic table building */
111 |     unsigned ncode;             /* number of code length code lengths */
112 |     unsigned nlen;              /* number of length code lengths */
113 |     unsigned ndist;             /* number of distance code lengths */
114 |     unsigned have;              /* number of code lengths in lens[] */
115 |     code FAR *next;             /* next available space in codes[] */
116 |     unsigned short lens[320];   /* temporary storage for code lengths */
117 |     unsigned short work[288];   /* work area for code table building */
118 |     code codes[ENOUGH];         /* space for code tables */
119 |     int sane;                   /* if false, allow invalid distance too far */
120 |     int back;                   /* bits back of last unprocessed length/lit */
121 |     unsigned was;               /* initial length of match */
122 | };
123 | 


--------------------------------------------------------------------------------
/src/zlib/inftrees.h:
--------------------------------------------------------------------------------
 1 | /* inftrees.h -- header to use inftrees.c
 2 |  * Copyright (C) 1995-2005, 2010 Mark Adler
 3 |  * For conditions of distribution and use, see copyright notice in zlib.h
 4 |  */
 5 | 
 6 | /* WARNING: this file should *not* be used by applications. It is
 7 |    part of the implementation of the compression library and is
 8 |    subject to change. Applications should only use zlib.h.
 9 |  */
10 | 
11 | /* Structure for decoding tables.  Each entry provides either the
12 |    information needed to do the operation requested by the code that
13 |    indexed that table entry, or it provides a pointer to another
14 |    table that indexes more bits of the code.  op indicates whether
15 |    the entry is a pointer to another table, a literal, a length or
16 |    distance, an end-of-block, or an invalid code.  For a table
17 |    pointer, the low four bits of op is the number of index bits of
18 |    that table.  For a length or distance, the low four bits of op
19 |    is the number of extra bits to get after the code.  bits is
20 |    the number of bits in this code or part of the code to drop off
21 |    of the bit buffer.  val is the actual byte to output in the case
22 |    of a literal, the base length or distance, or the offset from
23 |    the current table to the next table.  Each entry is four bytes. */
24 | typedef struct {
25 |     unsigned char op;           /* operation, extra bits, table bits */
26 |     unsigned char bits;         /* bits in this part of the code */
27 |     unsigned short val;         /* offset in table or code value */
28 | } code;                         /* op encodings are listed in the comment that follows */
29 | 
30 | /* op values as set by inflate_table():
31 |     00000000 - literal
32 |     0000tttt - table link, tttt != 0 is the number of table index bits
33 |     0001eeee - length or distance, eeee is the number of extra bits
34 |     01100000 - end of block
35 |     01000000 - invalid code
36 |  */
37 | 
38 | /* Maximum size of the dynamic table.  The maximum number of code structures is
39 |    1444, which is the sum of 852 for literal/length codes and 592 for distance
40 |    codes.  These values were found by exhaustive searches using the program
41 |    examples/enough.c found in the zlib distribution.  The arguments to that
42 |    program are the number of symbols, the initial root table size, and the
43 |    maximum bit length of a code.  "enough 286 9 15" for literal/length codes
44 |    returns 852, and "enough 30 6 15" for distance codes returns 592.
45 |    The initial root table size (9 or 6) is found in the fifth argument of the
46 |    inflate_table() calls in inflate.c and infback.c.  If the root table size is
47 |    changed, then these maximum sizes would need to be recalculated and
48 |    updated. */
49 | #define ENOUGH_LENS 852     /* literal/length codes: result of "enough 286 9 15" */
50 | #define ENOUGH_DISTS 592    /* distance codes: result of "enough 30 6 15" */
51 | #define ENOUGH (ENOUGH_LENS+ENOUGH_DISTS)   /* 1444 code structures in total */
52 | 
53 | /* Type of code to build for inflate_table() */
54 | typedef enum {
55 |     CODES,      /* presumably the code-length codes -- confirm in inftrees.c */
56 |     LENS,       /* presumably literal/length codes (cf. ENOUGH_LENS) -- confirm */
57 |     DISTS       /* presumably distance codes (cf. ENOUGH_DISTS) -- confirm */
58 | } codetype;
59 | 
60 | int ZLIB_INTERNAL inflate_table OF((codetype type, unsigned short FAR *lens,
61 |                              unsigned codes, code FAR * FAR *table,
62 |                              unsigned FAR *bits, unsigned short FAR *work)); /* type picks the kind of code to build; OF, FAR and ZLIB_INTERNAL are macros defined outside this header */
63 | 


--------------------------------------------------------------------------------
/src/zlib/trees.h:
--------------------------------------------------------------------------------
  1 | /* header created automatically with -DGEN_TREES_H */
  2 | 
  3 | local const ct_data static_ltree[L_CODES+2] = {
  4 | {{ 12},{  8}}, {{140},{  8}}, {{ 76},{  8}}, {{204},{  8}}, {{ 44},{  8}},
  5 | {{172},{  8}}, {{108},{  8}}, {{236},{  8}}, {{ 28},{  8}}, {{156},{  8}},
  6 | {{ 92},{  8}}, {{220},{  8}}, {{ 60},{  8}}, {{188},{  8}}, {{124},{  8}},
  7 | {{252},{  8}}, {{  2},{  8}}, {{130},{  8}}, {{ 66},{  8}}, {{194},{  8}},
  8 | {{ 34},{  8}}, {{162},{  8}}, {{ 98},{  8}}, {{226},{  8}}, {{ 18},{  8}},
  9 | {{146},{  8}}, {{ 82},{  8}}, {{210},{  8}}, {{ 50},{  8}}, {{178},{  8}},
 10 | {{114},{  8}}, {{242},{  8}}, {{ 10},{  8}}, {{138},{  8}}, {{ 74},{  8}},
 11 | {{202},{  8}}, {{ 42},{  8}}, {{170},{  8}}, {{106},{  8}}, {{234},{  8}},
 12 | {{ 26},{  8}}, {{154},{  8}}, {{ 90},{  8}}, {{218},{  8}}, {{ 58},{  8}},
 13 | {{186},{  8}}, {{122},{  8}}, {{250},{  8}}, {{  6},{  8}}, {{134},{  8}},
 14 | {{ 70},{  8}}, {{198},{  8}}, {{ 38},{  8}}, {{166},{  8}}, {{102},{  8}},
 15 | {{230},{  8}}, {{ 22},{  8}}, {{150},{  8}}, {{ 86},{  8}}, {{214},{  8}},
 16 | {{ 54},{  8}}, {{182},{  8}}, {{118},{  8}}, {{246},{  8}}, {{ 14},{  8}},
 17 | {{142},{  8}}, {{ 78},{  8}}, {{206},{  8}}, {{ 46},{  8}}, {{174},{  8}},
 18 | {{110},{  8}}, {{238},{  8}}, {{ 30},{  8}}, {{158},{  8}}, {{ 94},{  8}},
 19 | {{222},{  8}}, {{ 62},{  8}}, {{190},{  8}}, {{126},{  8}}, {{254},{  8}},
 20 | {{  1},{  8}}, {{129},{  8}}, {{ 65},{  8}}, {{193},{  8}}, {{ 33},{  8}},
 21 | {{161},{  8}}, {{ 97},{  8}}, {{225},{  8}}, {{ 17},{  8}}, {{145},{  8}},
 22 | {{ 81},{  8}}, {{209},{  8}}, {{ 49},{  8}}, {{177},{  8}}, {{113},{  8}},
 23 | {{241},{  8}}, {{  9},{  8}}, {{137},{  8}}, {{ 73},{  8}}, {{201},{  8}},
 24 | {{ 41},{  8}}, {{169},{  8}}, {{105},{  8}}, {{233},{  8}}, {{ 25},{  8}},
 25 | {{153},{  8}}, {{ 89},{  8}}, {{217},{  8}}, {{ 57},{  8}}, {{185},{  8}},
 26 | {{121},{  8}}, {{249},{  8}}, {{  5},{  8}}, {{133},{  8}}, {{ 69},{  8}},
 27 | {{197},{  8}}, {{ 37},{  8}}, {{165},{  8}}, {{101},{  8}}, {{229},{  8}},
 28 | {{ 21},{  8}}, {{149},{  8}}, {{ 85},{  8}}, {{213},{  8}}, {{ 53},{  8}},
 29 | {{181},{  8}}, {{117},{  8}}, {{245},{  8}}, {{ 13},{  8}}, {{141},{  8}},
 30 | {{ 77},{  8}}, {{205},{  8}}, {{ 45},{  8}}, {{173},{  8}}, {{109},{  8}},
 31 | {{237},{  8}}, {{ 29},{  8}}, {{157},{  8}}, {{ 93},{  8}}, {{221},{  8}},
 32 | {{ 61},{  8}}, {{189},{  8}}, {{125},{  8}}, {{253},{  8}}, {{ 19},{  9}},
 33 | {{275},{  9}}, {{147},{  9}}, {{403},{  9}}, {{ 83},{  9}}, {{339},{  9}},
 34 | {{211},{  9}}, {{467},{  9}}, {{ 51},{  9}}, {{307},{  9}}, {{179},{  9}},
 35 | {{435},{  9}}, {{115},{  9}}, {{371},{  9}}, {{243},{  9}}, {{499},{  9}},
 36 | {{ 11},{  9}}, {{267},{  9}}, {{139},{  9}}, {{395},{  9}}, {{ 75},{  9}},
 37 | {{331},{  9}}, {{203},{  9}}, {{459},{  9}}, {{ 43},{  9}}, {{299},{  9}},
 38 | {{171},{  9}}, {{427},{  9}}, {{107},{  9}}, {{363},{  9}}, {{235},{  9}},
 39 | {{491},{  9}}, {{ 27},{  9}}, {{283},{  9}}, {{155},{  9}}, {{411},{  9}},
 40 | {{ 91},{  9}}, {{347},{  9}}, {{219},{  9}}, {{475},{  9}}, {{ 59},{  9}},
 41 | {{315},{  9}}, {{187},{  9}}, {{443},{  9}}, {{123},{  9}}, {{379},{  9}},
 42 | {{251},{  9}}, {{507},{  9}}, {{  7},{  9}}, {{263},{  9}}, {{135},{  9}},
 43 | {{391},{  9}}, {{ 71},{  9}}, {{327},{  9}}, {{199},{  9}}, {{455},{  9}},
 44 | {{ 39},{  9}}, {{295},{  9}}, {{167},{  9}}, {{423},{  9}}, {{103},{  9}},
 45 | {{359},{  9}}, {{231},{  9}}, {{487},{  9}}, {{ 23},{  9}}, {{279},{  9}},
 46 | {{151},{  9}}, {{407},{  9}}, {{ 87},{  9}}, {{343},{  9}}, {{215},{  9}},
 47 | {{471},{  9}}, {{ 55},{  9}}, {{311},{  9}}, {{183},{  9}}, {{439},{  9}},
 48 | {{119},{  9}}, {{375},{  9}}, {{247},{  9}}, {{503},{  9}}, {{ 15},{  9}},
 49 | {{271},{  9}}, {{143},{  9}}, {{399},{  9}}, {{ 79},{  9}}, {{335},{  9}},
 50 | {{207},{  9}}, {{463},{  9}}, {{ 47},{  9}}, {{303},{  9}}, {{175},{  9}},
 51 | {{431},{  9}}, {{111},{  9}}, {{367},{  9}}, {{239},{  9}}, {{495},{  9}},
 52 | {{ 31},{  9}}, {{287},{  9}}, {{159},{  9}}, {{415},{  9}}, {{ 95},{  9}},
 53 | {{351},{  9}}, {{223},{  9}}, {{479},{  9}}, {{ 63},{  9}}, {{319},{  9}},
 54 | {{191},{  9}}, {{447},{  9}}, {{127},{  9}}, {{383},{  9}}, {{255},{  9}},
 55 | {{511},{  9}}, {{  0},{  7}}, {{ 64},{  7}}, {{ 32},{  7}}, {{ 96},{  7}},
 56 | {{ 16},{  7}}, {{ 80},{  7}}, {{ 48},{  7}}, {{112},{  7}}, {{  8},{  7}},
 57 | {{ 72},{  7}}, {{ 40},{  7}}, {{104},{  7}}, {{ 24},{  7}}, {{ 88},{  7}},
 58 | {{ 56},{  7}}, {{120},{  7}}, {{  4},{  7}}, {{ 68},{  7}}, {{ 36},{  7}},
 59 | {{100},{  7}}, {{ 20},{  7}}, {{ 84},{  7}}, {{ 52},{  7}}, {{116},{  7}},
 60 | {{  3},{  8}}, {{131},{  8}}, {{ 67},{  8}}, {{195},{  8}}, {{ 35},{  8}},
 61 | {{163},{  8}}, {{ 99},{  8}}, {{227},{  8}}
 62 | };  /* generated data: second field of every pair is 7, 8 or 9 -- presumably the code's bit length; confirm against ct_data */
 63 | 
 64 | local const ct_data static_dtree[D_CODES] = {
 65 | {{ 0},{ 5}}, {{16},{ 5}}, {{ 8},{ 5}}, {{24},{ 5}}, {{ 4},{ 5}},
 66 | {{20},{ 5}}, {{12},{ 5}}, {{28},{ 5}}, {{ 2},{ 5}}, {{18},{ 5}},
 67 | {{10},{ 5}}, {{26},{ 5}}, {{ 6},{ 5}}, {{22},{ 5}}, {{14},{ 5}},
 68 | {{30},{ 5}}, {{ 1},{ 5}}, {{17},{ 5}}, {{ 9},{ 5}}, {{25},{ 5}},
 69 | {{ 5},{ 5}}, {{21},{ 5}}, {{13},{ 5}}, {{29},{ 5}}, {{ 3},{ 5}},
 70 | {{19},{ 5}}, {{11},{ 5}}, {{27},{ 5}}, {{ 7},{ 5}}, {{23},{ 5}}
 71 | };  /* generated data: second field of every pair is 5 -- presumably a fixed 5-bit code length; confirm against ct_data */
 72 | 
 73 | const uch ZLIB_INTERNAL _dist_code[DIST_CODE_LEN] = {
 74 |  0,  1,  2,  3,  4,  4,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  8,  8,  8,  8,
 75 |  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10,
 76 | 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
 77 | 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
 78 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13,
 79 | 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
 80 | 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
 81 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
 82 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
 83 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15,
 84 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
 85 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
 86 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  0,  0, 16, 17,
 87 | 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
 88 | 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
 89 | 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
 90 | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
 91 | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
 92 | 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
 93 | 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
 94 | 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
 95 | 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
 96 | 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
 97 | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
 98 | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
 99 | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29
100 | };  /* generated data: every entry lies in 0..29 -- presumably a distance-to-distance-code lookup; confirm in trees.c */
101 | 
102 | const uch ZLIB_INTERNAL _length_code[MAX_MATCH-MIN_MATCH+1]= {
103 |  0,  1,  2,  3,  4,  5,  6,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 12, 12,
104 | 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
105 | 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
106 | 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
107 | 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
108 | 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
109 | 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
110 | 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
111 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
112 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26,
113 | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
114 | 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
115 | 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28
116 | };  /* generated data: entries are nondecreasing from 0 to 28 -- presumably indexed by (match length - MIN_MATCH); confirm in trees.c */
117 | 
118 | local const int base_length[LENGTH_CODES] = {
119 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56,
120 | 64, 80, 96, 112, 128, 160, 192, 224, 0
121 | };  /* generated data: presumably the first (normalized) match length of each length code; note the final entry is 0 -- confirm in trees.c */
122 | 
123 | local const int base_dist[D_CODES] = {
124 |     0,     1,     2,     3,     4,     6,     8,    12,    16,    24,
125 |    32,    48,    64,    96,   128,   192,   256,   384,   512,   768,
126 |  1024,  1536,  2048,  3072,  4096,  6144,  8192, 12288, 16384, 24576
127 | };  /* generated data: strictly increasing; presumably the first (normalized) distance of each distance code -- confirm in trees.c */
128 | 
129 | 


--------------------------------------------------------------------------------
/src/zlib/zutil.h:
--------------------------------------------------------------------------------
  1 | /* zutil.h -- internal interface and configuration of the compression library
  2 |  * Copyright (C) 1995-2013 Jean-loup Gailly.
  3 |  * For conditions of distribution and use, see copyright notice in zlib.h
  4 |  */
  5 | 
  6 | /* WARNING: this file should *not* be used by applications. It is
  7 |    part of the implementation of the compression library and is
  8 |    subject to change. Applications should only use zlib.h.
  9 |  */
 10 | 
 11 | /* @(#) $Id$ */
 12 | 
 13 | #ifndef ZUTIL_H
 14 | #define ZUTIL_H
 15 | 
 16 | #ifdef HAVE_HIDDEN
 17 | #  define ZLIB_INTERNAL __attribute__((visibility ("hidden")))
 18 | #else
 19 | #  define ZLIB_INTERNAL
 20 | #endif
 21 | 
 22 | #include "zlib.h"
 23 | 
 24 | #if defined(STDC) && !defined(Z_SOLO)
 25 | #  if !(defined(_WIN32_WCE) && defined(_MSC_VER))
 26 | #    include <stddef.h>
 27 | #  endif
 28 | #  include <string.h>
 29 | #  include <stdlib.h>
 30 | #endif
 31 | 
 32 | #ifdef Z_SOLO
 33 |    typedef long ptrdiff_t;  /* guess -- will be caught if guess is wrong */
 34 | #endif
 35 | 
 36 | #ifndef local
 37 | #  define local static
 38 | #endif
 39 | /* compile with -Dlocal if your debugger can't find static symbols */
 40 | 
 41 | typedef unsigned char  uch;
 42 | typedef uch FAR uchf;
 43 | typedef unsigned short ush;
 44 | typedef ush FAR ushf;
 45 | typedef unsigned long  ulg;
 46 | 
 47 | extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
 48 | /* (size given to avoid silly warnings with Visual C++) */
 49 | 
 50 | #define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)]
 51 | 
 52 | #define ERR_RETURN(strm,err) \
 53 |   return (strm->msg = ERR_MSG(err), (err))
 54 | /* To be used only when the state is known to be valid.  Expands to a complete return statement that sets strm->msg first; strm is used unparenthesized and err is evaluated twice. */
 55 | 
 56 |         /* common constants */
 57 | 
 58 | #ifndef DEF_WBITS
 59 | #  define DEF_WBITS MAX_WBITS
 60 | #endif
 61 | /* default windowBits for decompression. MAX_WBITS is for compression only */
 62 | 
 63 | #if MAX_MEM_LEVEL >= 8
 64 | #  define DEF_MEM_LEVEL 8
 65 | #else
 66 | #  define DEF_MEM_LEVEL  MAX_MEM_LEVEL
 67 | #endif
 68 | /* default memLevel */
 69 | 
 70 | #define STORED_BLOCK 0
 71 | #define STATIC_TREES 1
 72 | #define DYN_TREES    2
 73 | /* The three kinds of block type */
 74 | 
 75 | #define MIN_MATCH  3
 76 | #define MAX_MATCH  258
 77 | /* The minimum and maximum match lengths */
 78 | 
 79 | #define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */
 80 | 
 81 |         /* target dependencies */
 82 | 
 83 | #if defined(MSDOS) || (defined(WINDOWS) && !defined(WIN32))
 84 | #  define OS_CODE  0x00
 85 | #  ifndef Z_SOLO
 86 | #    if defined(__TURBOC__) || defined(__BORLANDC__)
 87 | #      if (__STDC__ == 1) && (defined(__LARGE__) || defined(__COMPACT__))
 88 |          /* Allow compilation with ANSI keywords only enabled */
 89 |          void _Cdecl farfree( void *block );
 90 |          void *_Cdecl farmalloc( unsigned long nbytes );
 91 | #      else
 92 | #        include <alloc.h>
 93 | #      endif
 94 | #    else /* MSC or DJGPP */
 95 | #      include <malloc.h>
 96 | #    endif
 97 | #  endif
 98 | #endif
 99 | 
100 | #ifdef AMIGA
101 | #  define OS_CODE  0x01
102 | #endif
103 | 
104 | #if defined(VAXC) || defined(VMS)
105 | #  define OS_CODE  0x02
106 | #  define F_OPEN(name, mode) \
107 |      fopen((name), (mode), "mbc=60", "ctx=stm", "rfm=fix", "mrs=512")
108 | #endif
109 | 
110 | #if defined(ATARI) || defined(atarist)
111 | #  define OS_CODE  0x05
112 | #endif
113 | 
114 | #ifdef OS2
115 | #  define OS_CODE  0x06
116 | #  if defined(M_I86) && !defined(Z_SOLO)
117 | #    include <malloc.h>
118 | #  endif
119 | #endif
120 | 
121 | #if defined(MACOS) || defined(TARGET_OS_MAC)
122 | #  define OS_CODE  0x07
123 | #  ifndef Z_SOLO
124 | #    if defined(__MWERKS__) && __dest_os != __be_os && __dest_os != __win32_os
125 | #      include <unix.h> /* for fdopen */
126 | #    else
127 | #      ifndef fdopen
128 | #        define fdopen(fd,mode) NULL /* No fdopen() */
129 | #      endif
130 | #    endif
131 | #  endif
132 | #endif
133 | 
134 | #ifdef TOPS20
135 | #  define OS_CODE  0x0a
136 | #endif
137 | 
138 | #ifdef WIN32
139 | #  ifndef __CYGWIN__  /* Cygwin is Unix, not Win32 */
140 | #    define OS_CODE  0x0b
141 | #  endif
142 | #endif
143 | 
144 | #ifdef __50SERIES /* Prime/PRIMOS */
145 | #  define OS_CODE  0x0f
146 | #endif
147 | 
148 | #if defined(_BEOS_) || defined(RISCOS)
149 | #  define fdopen(fd,mode) NULL /* No fdopen() */
150 | #endif
151 | 
152 | #if (defined(_MSC_VER) && (_MSC_VER > 600)) && !defined __INTERIX
153 | #  if defined(_WIN32_WCE)
154 | #    define fdopen(fd,mode) NULL /* No fdopen() */
155 | #    ifndef _PTRDIFF_T_DEFINED
156 |        typedef int ptrdiff_t;
157 | #      define _PTRDIFF_T_DEFINED
158 | #    endif
159 | #  else
160 | #    define fdopen(fd,type)  _fdopen(fd,type)
161 | #  endif
162 | #endif
163 | 
164 | #if defined(__BORLANDC__) && !defined(MSDOS)
165 |   #pragma warn -8004
166 |   #pragma warn -8008
167 |   #pragma warn -8066
168 | #endif
169 | 
170 | /* provide prototypes for these when building zlib without LFS */
171 | #if !defined(_WIN32) && \
172 |     (!defined(_LARGEFILE64_SOURCE) || _LFS64_LARGEFILE-0 == 0)
173 |     ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off_t));
174 |     ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off_t));
175 | #endif
176 | 
177 |         /* common defaults */
178 | 
179 | #ifndef OS_CODE
180 | #  define OS_CODE  0x03  /* assume Unix */
181 | #endif
182 | 
183 | #ifndef F_OPEN
184 | #  define F_OPEN(name, mode) fopen((name), (mode))
185 | #endif
186 | 
187 |          /* functions */
188 | 
189 | #if defined(pyr) || defined(Z_SOLO)
190 | #  define NO_MEMCPY
191 | #endif
192 | #if defined(SMALL_MEDIUM) && !defined(_MSC_VER) && !defined(__SC__)
193 |  /* Use our own functions for small and medium model with MSC <= 5.0.
194 |   * You may have to use the same strategy for Borland C (untested).
195 |   * The __SC__ check is for Symantec.
196 |   */
197 | #  define NO_MEMCPY
198 | #endif
199 | #if defined(STDC) && !defined(HAVE_MEMCPY) && !defined(NO_MEMCPY)
200 | #  define HAVE_MEMCPY
201 | #endif
202 | #ifdef HAVE_MEMCPY
203 | #  ifdef SMALL_MEDIUM /* MSDOS small or medium model */
204 | #    define zmemcpy _fmemcpy
205 | #    define zmemcmp _fmemcmp
206 | #    define zmemzero(dest, len) _fmemset(dest, 0, len)
207 | #  else
208 | #    define zmemcpy memcpy
209 | #    define zmemcmp memcmp
210 | #    define zmemzero(dest, len) memset(dest, 0, len)
211 | #  endif
212 | #else
213 |    void ZLIB_INTERNAL zmemcpy OF((Bytef* dest, const Bytef* source, uInt len));
214 |    int ZLIB_INTERNAL zmemcmp OF((const Bytef* s1, const Bytef* s2, uInt len));
215 |    void ZLIB_INTERNAL zmemzero OF((Bytef* dest, uInt len));
216 | #endif
217 | 
218 | /* Diagnostic functions: real only when DEBUG is defined; otherwise every macro below expands to nothing */
219 | #ifdef DEBUG
220 | #  include <stdio.h>
221 |    extern int ZLIB_INTERNAL z_verbose;
222 |    extern void ZLIB_INTERNAL z_error OF((char *m));
223 | #  define Assert(cond,msg) {if(!(cond)) z_error(msg);}
224 | #  define Trace(x) {if (z_verbose>=0) fprintf x ;}      /* x is a complete parenthesized fprintf argument list */
225 | #  define Tracev(x) {if (z_verbose>0) fprintf x ;}
226 | #  define Tracevv(x) {if (z_verbose>1) fprintf x ;}
227 | #  define Tracec(c,x) {if (z_verbose>0 && (c)) fprintf x ;}
228 | #  define Tracecv(c,x) {if (z_verbose>1 && (c)) fprintf x ;}
229 | #else
230 | #  define Assert(cond,msg)
231 | #  define Trace(x)
232 | #  define Tracev(x)
233 | #  define Tracevv(x)
234 | #  define Tracec(c,x)
235 | #  define Tracecv(c,x)
236 | #endif
237 | 
238 | #ifndef Z_SOLO
239 |    voidpf ZLIB_INTERNAL zcalloc OF((voidpf opaque, unsigned items,
240 |                                     unsigned size));
241 |    void ZLIB_INTERNAL zcfree  OF((voidpf opaque, voidpf ptr));
242 | #endif
243 | 
244 | #define ZALLOC(strm, items, size) \
245 |            (*((strm)->zalloc))((strm)->opaque, (items), (size))  /* calls the stream's zalloc hook, passing its opaque pointer */
246 | #define ZFREE(strm, addr)  (*((strm)->zfree))((strm)->opaque, (voidpf)(addr))  /* calls the stream's zfree hook */
247 | #define TRY_FREE(s, p) {if (p) ZFREE(s, p);}  /* braced statement, not an expression; p is evaluated twice */
248 | 
249 | /* Reverse the bytes in a 32-bit value.  q is evaluated four times, so pass an expression without side effects. */
250 | #define ZSWAP32(q) ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \
251 |                     (((q) & 0xff00) << 8) + (((q) & 0xff) << 24))
252 | 
253 | #endif /* ZUTIL_H */
254 | 


--------------------------------------------------------------------------------