├── src ├── unittest.h ├── processor.h ├── zlib │ ├── inffast.h │ ├── inftrees.h │ ├── inffixed.h │ ├── inflate.h │ ├── gzguts.h │ ├── zutil.h │ └── trees.h ├── processor.cpp ├── matcher.h ├── umiprocessor.h ├── sequence.h ├── basecorrector.h ├── polyx.h ├── filter.h ├── nucleotidetree.h ├── readpool.h ├── jsonreporter.h ├── adaptertrimmer.h ├── duplicate.h ├── overlapanalysis.h ├── writerthread.h ├── evaluator.h ├── unittest.cpp ├── fastareader.h ├── seprocessor.h ├── read.h ├── htmlreporter.h ├── common.h ├── readpool.cpp ├── writer.h ├── peprocessor.h ├── writerthread.cpp ├── threadconfig.h ├── sequence.cpp ├── umiprocessor.cpp ├── filterresult.h ├── nucleotidetree.cpp ├── stats.h ├── fastqreader.h ├── fastareader.cpp ├── polyx.cpp ├── basecorrector.cpp ├── matcher.cpp ├── writer.cpp ├── duplicate.cpp ├── threadconfig.cpp ├── singleproducersingleconsumerlist.h ├── jsonreporter.cpp ├── adaptertrimmer.cpp ├── util.h ├── overlapanalysis.cpp ├── filter.cpp ├── read.cpp ├── options.h └── fastqreader.cpp ├── .gitignore ├── LICENSE ├── Makefile ├── .github └── workflows │ └── ci.yml └── testdata ├── R1.fq └── R2.fq /src/unittest.h: -------------------------------------------------------------------------------- 1 | #ifndef UNIT_TEST_H 2 | #define UNIT_TEST_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | /* Runner for the project's built-in self tests. */ class UnitTest{ 11 | public: 12 | UnitTest(); 13 | /* Calls each module's static test(), reports every outcome through report(), then prints an ALL PASSED / FAILED summary. */ void run(); 14 | /* Prints message followed by PASSED or FAILED and returns result unchanged. */ bool report(bool result, string message); 15 | }; 16 | 17 | #endif -------------------------------------------------------------------------------- /src/processor.h: -------------------------------------------------------------------------------- 1 | #ifndef PROCESSOR_H 2 | #define PROCESSOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "options.h" 8 | 9 | using namespace std; 10 | 11 | /* Front door of the pipeline: picks the paired-end or the single-end processor. */ class Processor{ 12 | public: 13 | /* Stores opt as a raw pointer. */ Processor(Options* opt); 14 | ~Processor(); 15 | /* Builds a PairEndProcessor when mOptions->isPaired() is true, otherwise a SingleEndProcessor, and runs its process(). */ bool process(); 16 | 17 | private: 18 | /* not deleted by this class: the destructor body is empty */ Options* mOptions; 19 | }; 20 | 21 | 22 | #endif 
-------------------------------------------------------------------------------- /src/zlib/inffast.h: -------------------------------------------------------------------------------- 1 | /* inffast.h -- header to use inffast.c 2 | * Copyright (C) 1995-2003, 2010 Mark Adler 3 | * For conditions of distribution and use, see copyright notice in zlib.h 4 | */ 5 | 6 | /* WARNING: this file should *not* be used by applications. It is 7 | part of the implementation of the compression library and is 8 | subject to change. Applications should only use zlib.h. 9 | */ 10 | 11 | void ZLIB_INTERNAL inflate_fast OF((z_streamp strm, unsigned start)); 12 | -------------------------------------------------------------------------------- /src/processor.cpp: -------------------------------------------------------------------------------- 1 | #include "processor.h" 2 | #include "peprocessor.h" 3 | #include "seprocessor.h" 4 | 5 | /* Stores the options pointer as given; it is neither copied nor deleted by this class. */ 6 | Processor::Processor(Options* opt){ 7 | mOptions = opt; 8 | } 9 | 10 | 11 | Processor::~Processor(){ 12 | } 13 | 14 | /* Runs the paired-end or the single-end pipeline, chosen by mOptions->isPaired(), 15 | and returns that pipeline's own result rather than an unconditional true, 16 | so a failure reported by the pipeline is no longer hidden from the caller. */ 17 | bool Processor::process() { 18 | if(mOptions->isPaired()) { 19 | PairEndProcessor p(mOptions); 20 | return p.process(); 21 | } 22 | 23 | SingleEndProcessor p(mOptions); 24 | return p.process(); 25 | } -------------------------------------------------------------------------------- /src/matcher.h: -------------------------------------------------------------------------------- 1 | #ifndef MATCHER_H 2 | #define MATCHER_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | class Matcher{ 11 | public: 12 | Matcher(); 13 | ~Matcher(); 14 | 15 | static bool matchWithOneInsertion(const char* insData, const char* normalData, int cmplen, int diffLimit); 16 | static int diffWithOneInsertion(const char* insData, const char* normalData, int cmplen, int diffLimit); 17 | 18 | 19 | }; 20 | 21 | 22 | #endif -------------------------------------------------------------------------------- /src/umiprocessor.h: 
-------------------------------------------------------------------------------- 1 | #ifndef UMI_PROCESSOR_H 2 | #define UMI_PROCESSOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "options.h" 8 | #include "read.h" 9 | 10 | using namespace std; 11 | 12 | class UmiProcessor{ 13 | public: 14 | UmiProcessor(Options* opt); 15 | ~UmiProcessor(); 16 | void process(Read* r1, Read* r2 = NULL); 17 | void addUmiToName(Read* r, string umi); 18 | static bool test(); 19 | 20 | private: 21 | Options* mOptions; 22 | }; 23 | 24 | 25 | #endif -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | fastp 3 | 4 | # Prerequisites 5 | *.d 6 | 7 | # Compiled Object files 8 | *.slo 9 | *.lo 10 | *.o 11 | *.obj 12 | 13 | # Precompiled Headers 14 | *.gch 15 | *.pch 16 | 17 | # Compiled Dynamic libraries 18 | *.so 19 | *.dylib 20 | *.dll 21 | 22 | # Fortran module files 23 | *.mod 24 | *.smod 25 | 26 | # Compiled Static libraries 27 | *.lai 28 | *.la 29 | *.a 30 | *.lib 31 | 32 | # Executables 33 | *.exe 34 | *.out 35 | *.app 36 | 37 | # Editor Config 38 | .vscode 39 | 40 | # Test Output 41 | *.json 42 | *.html -------------------------------------------------------------------------------- /src/sequence.h: -------------------------------------------------------------------------------- 1 | #ifndef SEQUENCE_H 2 | #define SEQUENCE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | 11 | class Sequence{ 12 | public: 13 | Sequence(); 14 | Sequence(string* seq); 15 | ~Sequence(); 16 | void print(); 17 | int length(); 18 | Sequence reverseComplement(); 19 | 20 | Sequence operator~(); 21 | 22 | static bool test(); 23 | static string reverseComplement(string* origin); 24 | 25 | public: 26 | string* mStr; 27 | }; 28 | 29 | #endif -------------------------------------------------------------------------------- 
/src/basecorrector.h: -------------------------------------------------------------------------------- 1 | #ifndef BASE_CORRECTOR_H 2 | #define BASE_CORRECTOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "overlapanalysis.h" 8 | #include "filterresult.h" 9 | #include "options.h" 10 | 11 | using namespace std; 12 | 13 | class BaseCorrector{ 14 | public: 15 | BaseCorrector(); 16 | ~BaseCorrector(); 17 | 18 | static int correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit); 19 | static int correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov); 20 | static bool test(); 21 | }; 22 | 23 | 24 | #endif -------------------------------------------------------------------------------- /src/polyx.h: -------------------------------------------------------------------------------- 1 | #ifndef POLY_X_H 2 | #define POLY_X_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "overlapanalysis.h" 8 | #include "filterresult.h" 9 | #include "options.h" 10 | 11 | using namespace std; 12 | 13 | class PolyX{ 14 | public: 15 | PolyX(); 16 | ~PolyX(); 17 | 18 | static void trimPolyG(Read* r1, Read* r2, FilterResult* fr, int compareReq); 19 | static void trimPolyG(Read* r1, FilterResult* fr, int compareReq); 20 | static void trimPolyX(Read* r1, Read* r2, FilterResult* fr, int compareReq); 21 | static void trimPolyX(Read* r1, FilterResult* fr, int compareReq); 22 | static bool test(); 23 | 24 | 25 | }; 26 | 27 | 28 | #endif -------------------------------------------------------------------------------- /src/filter.h: -------------------------------------------------------------------------------- 1 | #ifndef FILTER_H 2 | #define FILTER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "options.h" 9 | #include "read.h" 10 | 11 | using namespace std; 12 | 13 | class Filter{ 14 | public: 15 | Filter(Options* opt); 16 | ~Filter(); 17 | int passFilter(Read* r); 18 
| bool passLowComplexityFilter(Read* r); 19 | Read* trimAndCut(Read* r, int front, int tail, int& frontTrimmed); 20 | bool filterByIndex(Read* r); 21 | bool filterByIndex(Read* r1, Read* r2); 22 | static bool test(); 23 | 24 | private: 25 | bool match(vector& list, string target, int threshold); 26 | 27 | private: 28 | Options* mOptions; 29 | }; 30 | 31 | 32 | #endif -------------------------------------------------------------------------------- /src/nucleotidetree.h: -------------------------------------------------------------------------------- 1 | #ifndef NUCLEICTREE_H 2 | #define NUCLEICTREE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "options.h" 9 | 10 | using namespace std; 11 | 12 | // (A,T,C,G,N) & 0X07 = (1,4,7,6,3) 13 | class NucleotideNode{ 14 | public: 15 | NucleotideNode(); 16 | ~NucleotideNode(); 17 | void dfs(); 18 | 19 | public: 20 | int count; 21 | char base; 22 | NucleotideNode* children[8]; 23 | }; 24 | 25 | class NucleotideTree{ 26 | public: 27 | NucleotideTree(Options* opt); 28 | ~NucleotideTree(); 29 | void addSeq(string seq); 30 | string getDominantPath(bool& reachedLeaf); 31 | 32 | static bool test(); 33 | 34 | private: 35 | Options* mOptions; 36 | NucleotideNode* mRoot; 37 | }; 38 | 39 | 40 | #endif -------------------------------------------------------------------------------- /src/readpool.h: -------------------------------------------------------------------------------- 1 | // in-memory pooled Reads to reduce new/delete operations 2 | // for each thread, one SISC list is used 3 | 4 | #ifndef READ_POOL_H 5 | #define READ_POOL_H 6 | 7 | #include 8 | #include 9 | #include 10 | #include "read.h" 11 | #include "options.h" 12 | #include "singleproducersingleconsumerlist.h" 13 | 14 | using namespace std; 15 | 16 | class ReadPool{ 17 | public: 18 | ReadPool(Options* opt); 19 | ~ReadPool(); 20 | bool input(int tid, Read* r); 21 | Read* getOne(); 22 | void initBufferLists(); 23 | void cleanup(); 24 | size_t size(); 
25 | 26 | private: 27 | void updateFullStatus(); 28 | 29 | private: 30 | Options* mOptions; 31 | SingleProducerSingleConsumerList** mBufferLists; 32 | size_t mProduced; 33 | size_t mConsumed; 34 | unsigned long mLimit; 35 | bool mIsFull; 36 | }; 37 | 38 | #endif -------------------------------------------------------------------------------- /src/jsonreporter.h: -------------------------------------------------------------------------------- 1 | #ifndef JSON_REPORTER_H 2 | #define JSON_REPORTER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "options.h" 9 | #include "stats.h" 10 | #include "filterresult.h" 11 | #include 12 | #include 13 | #include "common.h" 14 | #include "util.h" 15 | 16 | using namespace std; 17 | 18 | class JsonReporter{ 19 | public: 20 | JsonReporter(Options* opt); 21 | ~JsonReporter(); 22 | 23 | void setDup(double dupRate); 24 | void setInsertHist(atomic_long* insertHist, int insertSizePeak); 25 | void report(FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2 = NULL, Stats* postStats2 = NULL); 26 | 27 | private: 28 | Options* mOptions; 29 | int* mDupHist; 30 | double* mDupMeanGC; 31 | double mDupRate; 32 | atomic_long* mInsertHist; 33 | int mInsertSizePeak; 34 | }; 35 | 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/adaptertrimmer.h: -------------------------------------------------------------------------------- 1 | #ifndef ADAPTER_TRIMMER_H 2 | #define ADAPTER_TRIMMER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "overlapanalysis.h" 8 | #include "filterresult.h" 9 | #include "options.h" 10 | 11 | using namespace std; 12 | 13 | class AdapterTrimmer{ 14 | public: 15 | AdapterTrimmer(); 16 | ~AdapterTrimmer(); 17 | 18 | static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit); 19 | static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, 
OverlapResult ov, int frontTrimmed1 = 0, int frontTrimmed2 = 0); 20 | static bool trimBySequence(Read* r1, FilterResult* fr, string& adapter, bool isR2 = false, int matchReq = 4); 21 | static bool trimByMultiSequences(Read* r1, FilterResult* fr, vector& adapterList, bool isR2 = false, bool incTrimmedCounter = true); 22 | static bool test(); 23 | 24 | }; 25 | 26 | 27 | #endif -------------------------------------------------------------------------------- /src/duplicate.h: -------------------------------------------------------------------------------- 1 | #ifndef DUPLICATE_H 2 | #define DUPLICATE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "read.h" 8 | #include "options.h" 9 | #include "common.h" 10 | #include 11 | 12 | using namespace std; 13 | 14 | class Duplicate{ 15 | public: 16 | Duplicate(Options* opt); 17 | ~Duplicate(); 18 | 19 | bool checkRead(Read* r1); 20 | bool checkPair(Read* r1, Read* r2); 21 | void seq2intvector(const char* data, int len, uint64* output, int posOffset = 0); 22 | 23 | double getDupRate(); 24 | 25 | private: 26 | void initPrimeArrays(); 27 | bool applyBloomFilter(uint64* positions); 28 | 29 | private: 30 | Options* mOptions; 31 | uint64 mBufLenInBits; 32 | uint64 mBufLenInBytes; 33 | uint32 mBufNum; 34 | atomic_uchar* mDupBuf; 35 | uint64* mPrimeArrays; 36 | atomic_ulong mTotalReads; 37 | atomic_ulong mDupReads; 38 | uint64 mOffsetMask; 39 | 40 | }; 41 | 42 | #endif -------------------------------------------------------------------------------- /src/overlapanalysis.h: -------------------------------------------------------------------------------- 1 | #ifndef OVERLAP_ANALYSIS_H 2 | #define OVERLAP_ANALYSIS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "common.h" 10 | #include "options.h" 11 | #include "read.h" 12 | 13 | using namespace std; 14 | 15 | class OverlapResult { 16 | public: 17 | bool overlapped; 18 | int offset; 19 | int overlap_len; 20 | int diff; 21 | bool hasGap; 22 | 
}; 23 | 24 | class OverlapAnalysis{ 25 | public: 26 | OverlapAnalysis(); 27 | ~OverlapAnalysis(); 28 | 29 | static OverlapResult analyze(string* r1, string* r2, int diffLimit, int overlapRequire, double diffPercentLimit, bool allowGap = false); 30 | static OverlapResult analyze(Read* r1, Read* r2, int diffLimit, int overlapRequire, double diffPercentLimit, bool allowGap = false); 31 | static Read* merge(Read* r1, Read* r2, OverlapResult ov); 32 | 33 | public: 34 | static bool test(); 35 | 36 | }; 37 | 38 | #endif -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 OpenGene - Open Source Genetics Toolbox 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/writerthread.h: -------------------------------------------------------------------------------- 1 | #ifndef WRITER_THREAD_H 2 | #define WRITER_THREAD_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "writer.h" 9 | #include "options.h" 10 | #include 11 | #include 12 | #include "singleproducersingleconsumerlist.h" 13 | 14 | using namespace std; 15 | 16 | class WriterThread{ 17 | public: 18 | WriterThread(Options* opt, string filename, bool isSTDOUT = false); 19 | ~WriterThread(); 20 | 21 | void initWriter(string filename1, bool isSTDOUT = false); 22 | void initBufferLists(); 23 | 24 | void cleanup(); 25 | 26 | bool isCompleted(); 27 | void output(); 28 | void input(int tid, string* data); 29 | bool setInputCompleted(); 30 | 31 | long bufferLength() {return mBufferLength;}; 32 | string getFilename() {return mFilename;} 33 | 34 | private: 35 | void deleteWriter(); 36 | 37 | private: 38 | Writer* mWriter1; 39 | Options* mOptions; 40 | string mFilename; 41 | 42 | // for spliting output 43 | bool mInputCompleted; 44 | atomic_long mBufferLength; 45 | SingleProducerSingleConsumerList** mBufferLists; 46 | int mWorkingBufferList; 47 | }; 48 | 49 | #endif -------------------------------------------------------------------------------- /src/evaluator.h: -------------------------------------------------------------------------------- 1 | #ifndef EVALUATOR_H 2 | #define EVALUATOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "options.h" 8 | #include "util.h" 9 | #include "read.h" 10 | 11 | using namespace std; 12 | 13 | class Evaluator{ 14 | public: 15 | Evaluator(Options* opt); 16 | ~Evaluator(); 17 | // evaluate how many reads are stored in the input file 18 | void evaluateReadNum(long& readNum); 19 | string evalAdapterAndReadNum(long& readNum, bool isR2); 20 | bool isTwoColorSystem(); 21 | void evaluateSeqLen(); 22 | void evaluateOverRepSeqs(); 23 | 
void computeOverRepSeq(string filename, map& hotseqs, int seqLen); 24 | int computeSeqLen(string filename); 25 | 26 | static bool test(); 27 | static string matchKnownAdapter(string seq); 28 | private: 29 | Options* mOptions; 30 | string int2seq(unsigned int val, int seqlen); 31 | int seq2int(string* seq, int pos, int seqlen, int lastVal = -1); 32 | int seq2int(string& seq, int pos, int seqlen, int lastVal = -1); 33 | string getAdapterWithSeed(int seed, Read** loadedReads, long records, int keylen); 34 | string checkKnownAdapters(Read** reads, long num); 35 | }; 36 | 37 | 38 | #endif -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | DIR_INC := ./inc 2 | DIR_SRC := ./src 3 | DIR_OBJ := ./obj 4 | 5 | PREFIX ?= /usr/local 6 | BINDIR ?= $(PREFIX)/bin 7 | INCLUDE_DIRS ?= 8 | LIBRARY_DIRS ?= 9 | 10 | SRC := $(wildcard ${DIR_SRC}/*.cpp) 11 | OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${SRC})) 12 | 13 | TARGET := fastp 14 | 15 | BIN_TARGET := ${TARGET} 16 | 17 | CXX ?= g++ 18 | CXXFLAGS := -std=c++11 -pthread -g -O3 -MD -MP -I${DIR_INC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) ${CXXFLAGS} 19 | LIBS := -lisal -ldeflate -lpthread 20 | STATIC_FLAGS := -static -Wl,--no-as-needed -pthread 21 | LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) $(LD_FLAGS) 22 | STATIC_LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(STATIC_FLAGS) $(LIBS) $(STATIC_LD_FLAGS) 23 | 24 | 25 | ${BIN_TARGET}:${OBJ} 26 | $(CXX) $(OBJ) -o $@ $(LD_FLAGS) 27 | 28 | static:${OBJ} 29 | $(CXX) $(OBJ) -o ${BIN_TARGET} $(STATIC_LD_FLAGS) 30 | 31 | ${DIR_OBJ}/%.o:${DIR_SRC}/%.cpp 32 | @mkdir -p $(@D) 33 | $(CXX) -c $< -o $@ $(CXXFLAGS) 34 | 35 | .PHONY:clean 36 | .PHONY:static 37 | clean: 38 | @rm -rf $(DIR_OBJ) 39 | @rm -f $(TARGET) 40 | 41 | install: 42 | install $(TARGET) $(BINDIR)/$(TARGET) 43 | @echo "Installed." 
44 | 45 | -include $(OBJ:.o=.d) 46 | -------------------------------------------------------------------------------- /src/unittest.cpp: -------------------------------------------------------------------------------- 1 | #include "unittest.h" 2 | #include "sequence.h" 3 | #include "fastqreader.h" 4 | #include "read.h" 5 | #include "overlapanalysis.h" 6 | #include "filter.h" 7 | #include "adaptertrimmer.h" 8 | #include "basecorrector.h" 9 | #include "polyx.h" 10 | #include "nucleotidetree.h" 11 | #include "evaluator.h" 12 | #include 13 | 14 | UnitTest::UnitTest(){ 15 | 16 | } 17 | 18 | void UnitTest::run(){ 19 | bool passed = true; 20 | passed &= report(Sequence::test(), "Sequence::test"); 21 | passed &= report(Read::test(), "Read::test"); 22 | passed &= report(FastqReader::test(), "FastqReader::test"); 23 | passed &= report(OverlapAnalysis::test(), "OverlapAnalysis::test"); 24 | passed &= report(Filter::test(), "Filter::test"); 25 | passed &= report(AdapterTrimmer::test(), "AdapterTrimmer::test"); 26 | passed &= report(BaseCorrector::test(), "BaseCorrector::test"); 27 | passed &= report(PolyX::test(), "PolyX::test"); 28 | passed &= report(NucleotideTree::test(), "NucleotideTree::test"); 29 | passed &= report(Evaluator::test(), "Evaluator::test"); 30 | printf("\n==========================\n"); 31 | printf("%s\n\n", passed?"ALL PASSED":"FAILED"); 32 | } 33 | 34 | bool UnitTest::report(bool result, string message) { 35 | printf("%s:%s\n\n", message.c_str(), result?" 
PASSED":" FAILED"); 36 | return result; 37 | } -------------------------------------------------------------------------------- /src/fastareader.h: -------------------------------------------------------------------------------- 1 | #ifndef FASTA_READER_H 2 | #define FASTA_READER_H 3 | 4 | // includes 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | using namespace std; 15 | 16 | class FastaReader 17 | { 18 | public: 19 | FastaReader(string fastaFile, bool forceUpperCase = true); 20 | ~FastaReader(); 21 | bool hasNext(); 22 | void readNext(); 23 | void readAll(); 24 | 25 | inline string currentID() 26 | { 27 | return mCurrentID; 28 | } 29 | 30 | inline string currentDescription() 31 | { 32 | return mCurrentDescription; 33 | } 34 | 35 | inline string currentSequence() 36 | { 37 | return mCurrentSequence; 38 | } 39 | 40 | inline map& contigs() { 41 | return mAllContigs; 42 | } 43 | 44 | static bool test(); 45 | 46 | 47 | public: 48 | string mCurrentSequence; 49 | string mCurrentID ; 50 | string mCurrentDescription; 51 | map mAllContigs; 52 | 53 | private: 54 | bool readLine(); 55 | bool endOfLine(char c); 56 | void setFastaSequenceIdDescription(); 57 | 58 | private: 59 | string mFastaFile; 60 | ifstream mFastaFileStream; 61 | bool mForceUpperCase; 62 | }; 63 | 64 | 65 | #endif 66 | 67 | -------------------------------------------------------------------------------- /src/seprocessor.h: -------------------------------------------------------------------------------- 1 | #ifndef SE_PROCESSOR_H 2 | #define SE_PROCESSOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "read.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "options.h" 13 | #include "threadconfig.h" 14 | #include "filter.h" 15 | #include "umiprocessor.h" 16 | #include "writerthread.h" 17 | #include "duplicate.h" 18 | #include "singleproducersingleconsumerlist.h" 19 | #include "readpool.h" 20 | 21 | using 
namespace std; 22 | 23 | typedef struct ReadRepository ReadRepository; 24 | 25 | class SingleEndProcessor{ 26 | public: 27 | SingleEndProcessor(Options* opt); 28 | ~SingleEndProcessor(); 29 | bool process(); 30 | 31 | private: 32 | bool processSingleEnd(ReadPack* pack, ThreadConfig* config); 33 | void readerTask(); 34 | void processorTask(ThreadConfig* config); 35 | void initConfig(ThreadConfig* config); 36 | void initOutput(); 37 | void closeOutput(); 38 | void writerTask(WriterThread* config); 39 | void recycleToPool(int tid, Read* r); 40 | 41 | private: 42 | Options* mOptions; 43 | atomic_bool mReaderFinished; 44 | atomic_int mFinishedThreads; 45 | Filter* mFilter; 46 | UmiProcessor* mUmiProcessor; 47 | WriterThread* mLeftWriter; 48 | WriterThread* mFailedWriter; 49 | Duplicate* mDuplicate; 50 | SingleProducerSingleConsumerList** mInputLists; 51 | size_t mPackReadCounter; 52 | atomic_long mPackProcessedCounter; 53 | ReadPool* mReadPool; 54 | }; 55 | 56 | 57 | #endif -------------------------------------------------------------------------------- /src/read.h: -------------------------------------------------------------------------------- 1 | #ifndef READ_H 2 | #define READ_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "sequence.h" 10 | #include 11 | 12 | using namespace std; 13 | 14 | class Read{ 15 | public: 16 | Read(string* name, string* seq, string* strand, string* quality, bool phred64=false); 17 | Read(const char* name, const char* seq, const char* strand, const char* quality, bool phred64=false); 18 | ~Read(); 19 | void print(); 20 | void printFile(ofstream& file); 21 | Read* reverseComplement(); 22 | string firstIndex(); 23 | string lastIndex(); 24 | // default is Q20 25 | int lowQualCount(int qual=20); 26 | int length(); 27 | string toString(); 28 | string toStringWithTag(string tag); 29 | void appendToString(string* target); 30 | void appendToStringWithTag(string* target, string tag); 31 | void resize(int len); 32 
| void convertPhred64To33(); 33 | void trimFront(int len); 34 | bool fixMGI(); 35 | 36 | public: 37 | static bool test(); 38 | 39 | private: 40 | 41 | 42 | public: 43 | string* mName; 44 | string* mSeq; 45 | string* mStrand; 46 | string* mQuality; 47 | }; 48 | 49 | class ReadPair{ 50 | public: 51 | ReadPair(Read* left, Read* right); 52 | ~ReadPair(); 53 | 54 | // merge a pair, without consideration of seq error caused false INDEL 55 | Read* fastMerge(); 56 | public: 57 | Read* mLeft; 58 | Read* mRight; 59 | 60 | public: 61 | static bool test(); 62 | }; 63 | 64 | struct ReadPack { 65 | Read** data; 66 | int count; 67 | }; 68 | 69 | typedef struct ReadPack ReadPack; 70 | 71 | #endif -------------------------------------------------------------------------------- /src/htmlreporter.h: -------------------------------------------------------------------------------- 1 | #ifndef HTML_REPORTER_H 2 | #define HTML_REPORTER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "options.h" 9 | #include "stats.h" 10 | #include "filterresult.h" 11 | #include 12 | #include 13 | #include "common.h" 14 | #include "util.h" 15 | 16 | using namespace std; 17 | 18 | class HtmlReporter{ 19 | public: 20 | HtmlReporter(Options* opt); 21 | ~HtmlReporter(); 22 | void setDup(double dupRate); 23 | void setInsertHist(atomic_long* insertHist, int insertSizePeak); 24 | void report(FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2 = NULL, Stats* postStats2 = NULL); 25 | 26 | static void outputRow(ofstream& ofs, string key, long value); 27 | static void outputRow(ofstream& ofs, string key, string value); 28 | static string formatNumber(long number); 29 | static string getPercents(long numerator, long denominator); 30 | private: 31 | const string getCurrentSystemTime(); 32 | void printHeader(ofstream& ofs); 33 | void printCSS(ofstream& ofs); 34 | void printJS(ofstream& ofs); 35 | void printFooter(ofstream& ofs); 36 | void reportQualHistogram(ofstream& ofs, 
string caption, Stats* stats1, Stats* stats2 = NULL); 37 | void reportDuplication(ofstream& ofs); 38 | void reportInsertSize(ofstream& ofs, int isizeLimit); 39 | void printSummary(ofstream& ofs, FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2, Stats* postStats2); 40 | 41 | private: 42 | Options* mOptions; 43 | int* mDupHist; 44 | double* mDupMeanGC; 45 | double mDupRate; 46 | atomic_long* mInsertHist; 47 | int mInsertSizePeak; 48 | }; 49 | 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H 2 | #define COMMON_H 3 | 4 | #define FASTP_VER "1.0.1" 5 | 6 | #define _DEBUG false 7 | 8 | #ifndef _WIN32 9 | typedef long int64; 10 | typedef unsigned long uint64; 11 | #else 12 | typedef long long int64; 13 | typedef unsigned long long uint64; 14 | #endif 15 | 16 | typedef int int32; 17 | typedef unsigned int uint32; 18 | 19 | typedef short int16; 20 | typedef unsigned short uint16; 21 | 22 | typedef char int8; 23 | typedef unsigned char uint8; 24 | 25 | const char ATCG_BASES[] = {'A', 'T', 'C', 'G'}; 26 | 27 | #pragma pack(2) 28 | 29 | 30 | #pragma pack() 31 | 32 | 33 | // how many reads one pack has 34 | static const int PACK_SIZE = 256; 35 | 36 | // if one pack is produced, but not consumed, it will be kept in the memory 37 | // this number limit the number of in memory packs 38 | // if the number of in memory packs is full, the producer thread should sleep 39 | static const int PACK_IN_MEM_LIMIT = 128; 40 | 41 | 42 | // different filtering results, bigger number means worse 43 | // if r1 and r2 are both failed, then the bigger one of the two results will be recorded 44 | // we reserve some gaps for future types to be added 45 | static const int PASS_FILTER = 0; 46 | static const int FAIL_POLY_X = 4; 47 | static const int FAIL_OVERLAP = 8; 48 | static const int FAIL_N_BASE = 12; 
49 | static const int FAIL_LENGTH = 16; 50 | static const int FAIL_TOO_LONG = 17; 51 | static const int FAIL_QUALITY = 20; 52 | static const int FAIL_COMPLEXITY = 24; 53 | 54 | // how many types in total we support 55 | static const int FILTER_RESULT_TYPES = 32; 56 | 57 | const static char* FAILED_TYPES[FILTER_RESULT_TYPES] = { 58 | "passed", "", "", "", 59 | "failed_polyx_filter", "", "", "", 60 | "failed_bad_overlap", "", "", "", 61 | "failed_too_many_n_bases", "", "", "", 62 | "failed_too_short", "failed_too_long", "", "", 63 | "failed_quality_filter", "", "", "", 64 | "failed_low_complexity", "", "", "", 65 | "", "", "", "" 66 | }; 67 | 68 | #endif /* COMMON_H */ 69 | -------------------------------------------------------------------------------- /src/readpool.cpp: -------------------------------------------------------------------------------- 1 | #include "readpool.h" 2 | #include "util.h" 3 | #include 4 | #include 5 | #include "common.h" 6 | 7 | /* Creates the per-thread buffer lists and sets the capacity to PACK_SIZE * PACK_IN_MEM_LIMIT 8 | reads, doubled when mOptions->interleavedInput is set. */ 9 | ReadPool::ReadPool(Options* opt){ 10 | mOptions = opt; 11 | initBufferLists(); 12 | mLimit = PACK_SIZE * PACK_IN_MEM_LIMIT; 13 | if(mOptions->interleavedInput) 14 | mLimit *= 2; 15 | mIsFull = false; 16 | mProduced = 0; 17 | mConsumed = 0; 18 | } 19 | 20 | ReadPool::~ReadPool() { 21 | cleanup(); 22 | } 23 | 24 | /* Hands a Read to the buffer list of thread tid. Returns false, leaving the Read 25 | untouched, while the pool is flagged full. */ 26 | bool ReadPool::input(int tid, Read* data) { 27 | if(mIsFull) 28 | return false; 29 | 30 | mBufferLists[tid]->produce(data); 31 | mProduced++; 32 | /* the full flag is refreshed only once every 256 produced reads */ 33 | if((mProduced & 0xFF) == 0) 34 | updateFullStatus(); 35 | return true; 36 | } 37 | 38 | /* Deletes every pooled Read and every buffer list. Idempotent, because the 39 | destructor calls it again after any explicit cleanup(). */ 40 | void ReadPool::cleanup() { 41 | if(mBufferLists == NULL) 42 | return; 43 | for(int t=0; t < mOptions->thread; t++) { 44 | while(mBufferLists[t]->canBeConsumed()) { 45 | Read* r = mBufferLists[t]->consume(); 46 | mConsumed++; 47 | delete r; 48 | } 49 | delete mBufferLists[t]; 50 | } 51 | delete[] mBufferLists; 52 | mBufferLists = NULL; 53 | } 54 | 55 | /* Allocates mOptions->thread buffer lists, one per thread index. 56 | NOTE(review): the list type is written without a template argument; presumably it 57 | should be Read* - confirm against singleproducersingleconsumerlist.h. */ 58 | void ReadPool::initBufferLists() { 59 | mBufferLists = new SingleProducerSingleConsumerList*[mOptions->thread]; 60 | for(int t=0; t < mOptions->thread; t++) { 61 | mBufferLists[t] = new SingleProducerSingleConsumerList(); 62 | } 63 | }
64 | 65 | /* Recomputes the full flag: set while more than mLimit reads are pooled. */ 66 | void ReadPool::updateFullStatus() { 67 | mIsFull = size() > mLimit; 68 | } 69 | 70 | /* Number of reads currently held, summed over all per-thread lists. */ 71 | size_t ReadPool::size() { 72 | size_t total = 0; 73 | for(int t=0; t < mOptions->thread; t++) { 74 | total += mBufferLists[t]->size(); 75 | } 76 | return total; 77 | } 78 | 79 | /* Takes one Read from the first list that can be consumed; returns NULL when none can. */ 80 | Read* ReadPool::getOne() { 81 | for(int t=0; t < mOptions->thread; t++) { 82 | if(mBufferLists[t]->canBeConsumed()) { 83 | Read* r = mBufferLists[t]->consume(); 84 | mConsumed++; 85 | if((mConsumed & 0xFF) == 0) 86 | updateFullStatus(); 87 | return r; 88 | } 89 | } 90 | return NULL; 91 | } -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: fastp ci 2 | on: 3 | pull_request: 4 | branches: 5 | - master 6 | jobs: 7 | build: 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | os: 12 | - ubuntu-24.04 13 | - macos-12 14 | runs-on: ${{ matrix.os }} 15 | steps: 16 | - name: checkout scm 17 | uses: actions/checkout@v3 18 | 19 | - name: Get number of CPU cores 20 | uses: SimenB/github-actions-cpu-cores@v1 21 | id: cpu-cores 22 | 23 | - name: install build dependencies (Ubuntu) 24 | run: sudo apt update && sudo apt install -y build-essential nasm 25 | if: runner.os == 'Linux' 26 | 27 | - name: install build dependencies (MacOS) 28 | run: brew install automake autoconf coreutils nasm 29 | if: runner.os == 'macOS' 30 | 31 | - name: get deflate 32 | uses: actions/checkout@v3 33 | with: 34 | repository: ebiggers/libdeflate 35 | path: src/libs/deflate 36 | ref: v1.22 37 | 38 | - name: build deflate 39 | run: | 40 | cd src/libs/deflate 41 | cmake -B build 42 | cmake --build build -j ${{ steps.cpu-cores.outputs.count }} 43 | sudo cmake --install build 44 | cd - 45 | 46 | - name: get isa-l 47 | uses: actions/checkout@v3 48 | with: 49 | repository: intel/isa-l 50 | path: src/libs/isa-l 51 | ref: v2.31.0 52 | 53 | - name: build isa-l 54 | run: | 55 | cd src/libs/isa-l 56 | ./autogen.sh 
57 | ./configure --prefix=/usr/local 58 | make -j ${{ steps.cpu-cores.outputs.count }} 59 | sudo make install 60 | cd - 61 | 62 | - name: make fastp (MacOS) 63 | run: bash -c 'make -j $(nproc)' 64 | if: runner.os == 'macOS' 65 | 66 | - name: make fastp static (Ubuntu) 67 | run: bash -c 'make -j $(nproc) static' 68 | if: runner.os == 'Linux' 69 | 70 | - name: test 71 | run: chmod a+x ./fastp && ./fastp --version 72 | -------------------------------------------------------------------------------- /src/writer.h: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2021 Shifu Chen 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE.
23 | */ 24 | 25 | #ifndef _WRITER_H 26 | #define _WRITER_H 27 | 28 | #include 29 | #include 30 | #include "common.h" 31 | #include 32 | #include 33 | #include "libdeflate.h" 34 | #include "options.h" 35 | #include 36 | 37 | using namespace std; 38 | 39 | class Writer{ 40 | public: 41 | Writer(Options* opt, string filename, int compression, bool isSTDOUT = false); 42 | ~Writer(); 43 | bool isZipped(); 44 | bool writeString(const string& str); 45 | bool writeString(string* str); 46 | bool write(const char* strdata, size_t size); 47 | void flush(); 48 | string filename(); 49 | 50 | public: 51 | static bool test(); 52 | 53 | private: 54 | void init(); 55 | void close(); 56 | bool writeInternal(const char* strdata, size_t size); 57 | 58 | private: 59 | string mFilename; 60 | libdeflate_compressor* mCompressor; 61 | //ofstream* mOutStream; 62 | FILE* mFP; 63 | bool mZipped; 64 | int mCompression; 65 | bool haveToClose; 66 | char* mBuffer; 67 | size_t mBufDataLen; 68 | size_t mBufSize; 69 | Options* mOptions; 70 | bool mSTDOUT; 71 | }; 72 | 73 | #endif -------------------------------------------------------------------------------- /src/peprocessor.h: -------------------------------------------------------------------------------- 1 | #ifndef PE_PROCESSOR_H 2 | #define PE_PROCESSOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "read.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "options.h" 13 | #include "threadconfig.h" 14 | #include "filter.h" 15 | #include "umiprocessor.h" 16 | #include "overlapanalysis.h" 17 | #include "writerthread.h" 18 | #include "duplicate.h" 19 | #include "readpool.h" 20 | 21 | 22 | using namespace std; 23 | 24 | typedef struct ReadPairRepository ReadPairRepository; 25 | 26 | class PairEndProcessor{ 27 | public: 28 | PairEndProcessor(Options* opt); 29 | ~PairEndProcessor(); 30 | bool process(); 31 | 32 | private: 33 | bool processPairEnd(ReadPack* leftPack, ReadPack* rightPack, ThreadConfig* config); 34 | void 
readerTask(bool isLeft); 35 | void interleavedReaderTask(); 36 | void processorTask(ThreadConfig* config); 37 | void initConfig(ThreadConfig* config); 38 | void initOutput(); 39 | void closeOutput(); 40 | void statInsertSize(Read* r1, Read* r2, OverlapResult& ov, int frontTrimmed1 = 0, int frontTrimmed2 = 0); 41 | int getPeakInsertSize(); 42 | void writerTask(WriterThread* config); 43 | void recycleToPool1(int tid, Read* r); 44 | void recycleToPool2(int tid, Read* r); 45 | 46 | private: 47 | atomic_bool mLeftReaderFinished; 48 | atomic_bool mRightReaderFinished; 49 | atomic_int mFinishedThreads; 50 | Options* mOptions; 51 | Filter* mFilter; 52 | UmiProcessor* mUmiProcessor; 53 | atomic_long* mInsertSizeHist; 54 | WriterThread* mLeftWriter; 55 | WriterThread* mRightWriter; 56 | WriterThread* mUnpairedLeftWriter; 57 | WriterThread* mUnpairedRightWriter; 58 | WriterThread* mMergedWriter; 59 | WriterThread* mFailedWriter; 60 | WriterThread* mOverlappedWriter; 61 | Duplicate* mDuplicate; 62 | SingleProducerSingleConsumerList** mLeftInputLists; 63 | SingleProducerSingleConsumerList** mRightInputLists; 64 | size_t mLeftPackReadCounter; 65 | size_t mRightPackReadCounter; 66 | atomic_long mPackProcessedCounter; 67 | ReadPool* mLeftReadPool; 68 | ReadPool* mRightReadPool; 69 | atomic_bool shouldStopReading; 70 | }; 71 | 72 | 73 | #endif -------------------------------------------------------------------------------- /src/writerthread.cpp: -------------------------------------------------------------------------------- 1 | #include "writerthread.h" 2 | #include "util.h" 3 | #include 4 | #include 5 | 6 | WriterThread::WriterThread(Options* opt, string filename, bool isSTDOUT){ 7 | mOptions = opt; 8 | 9 | mWriter1 = NULL; 10 | 11 | mInputCompleted = false; 12 | mFilename = filename; 13 | 14 | initWriter(filename, isSTDOUT); 15 | initBufferLists(); 16 | mWorkingBufferList = 0; // 0 ~ mOptions->thread-1 17 | mBufferLength = 0; 18 | } 19 | 20 | WriterThread::~WriterThread() { 21 | 
cleanup(); 22 | } 23 | 24 | bool WriterThread::isCompleted() 25 | { 26 | return mInputCompleted && (mBufferLength==0); 27 | } 28 | 29 | bool WriterThread::setInputCompleted() { 30 | mInputCompleted = true; 31 | for(int t=0; tthread; t++) { 32 | mBufferLists[t]->setProducerFinished(); 33 | } 34 | return true; 35 | } 36 | 37 | void WriterThread::output(){ 38 | SingleProducerSingleConsumerList* list = mBufferLists[mWorkingBufferList]; 39 | if(!list->canBeConsumed()) { 40 | usleep(100); 41 | } else { 42 | string* str = list->consume(); 43 | mWriter1->write(str->data(), str->length()); 44 | delete str; 45 | mBufferLength--; 46 | mWorkingBufferList = (mWorkingBufferList+1)%mOptions->thread; 47 | } 48 | } 49 | 50 | 51 | void WriterThread::input(int tid, string* data) { 52 | mBufferLists[tid]->produce(data); 53 | mBufferLength++; 54 | } 55 | 56 | void WriterThread::cleanup() { 57 | deleteWriter(); 58 | for(int t=0; tthread; t++) { 59 | delete mBufferLists[t]; 60 | } 61 | delete[] mBufferLists; 62 | mBufferLists = NULL; 63 | } 64 | 65 | void WriterThread::deleteWriter() { 66 | if(mWriter1 != NULL) { 67 | delete mWriter1; 68 | mWriter1 = NULL; 69 | } 70 | } 71 | 72 | void WriterThread::initWriter(string filename1, bool isSTDOUT) { 73 | deleteWriter(); 74 | mWriter1 = new Writer(mOptions, filename1, mOptions->compression, isSTDOUT); 75 | } 76 | 77 | void WriterThread::initBufferLists() { 78 | mBufferLists = new SingleProducerSingleConsumerList*[mOptions->thread]; 79 | for(int t=0; tthread; t++) { 80 | mBufferLists[t] = new SingleProducerSingleConsumerList(); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/threadconfig.h: -------------------------------------------------------------------------------- 1 | #ifndef THREAD_CONFIG_H 2 | #define THREAD_CONFIG_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "stats.h" 9 | #include "writer.h" 10 | #include "options.h" 11 | #include "filterresult.h" 12 | 
#include "singleproducersingleconsumerlist.h" 13 | 14 | using namespace std; 15 | 16 | class ThreadConfig{ 17 | public: 18 | ThreadConfig(Options* opt, int threadId, bool paired = false); 19 | ~ThreadConfig(); 20 | inline Stats* getPreStats1() {return mPreStats1;} 21 | inline Stats* getPostStats1() {return mPostStats1;} 22 | inline Stats* getPreStats2() {return mPreStats2;} 23 | inline Stats* getPostStats2() {return mPostStats2;} 24 | inline Writer* getWriter1() {return mWriter1;} 25 | inline Writer* getWriter2() {return mWriter2;} 26 | inline FilterResult* getFilterResult() {return mFilterResult;} 27 | 28 | void initWriter(string filename1); 29 | void initWriter(string filename1, string filename2); 30 | 31 | void addFilterResult(int result, int readNum); 32 | void addMergedPairs(int pairs); 33 | 34 | int getThreadId() {return mThreadId;} 35 | // for splitting output 36 | // increase mCurrentSplitReads by readNum, and check it with options->split.size; 37 | void markProcessed(long readNum); 38 | void initWriterForSplit(); 39 | bool canBeStopped(); 40 | void cleanup(); 41 | 42 | // input list 43 | void setInputList(SingleProducerSingleConsumerList* list); 44 | void setInputListPair(SingleProducerSingleConsumerList* left, SingleProducerSingleConsumerList* right); 45 | SingleProducerSingleConsumerList* getLeftInput(){return mLeftInputList;} 46 | SingleProducerSingleConsumerList* getRightInput(){return mRightInputList;} 47 | 48 | private: 49 | void deleteWriter(); 50 | void writeEmptyFilesForSplitting(); 51 | 52 | private: 53 | Stats* mPreStats1; 54 | Stats* mPostStats1; 55 | Stats* mPreStats2; 56 | Stats* mPostStats2; 57 | Writer* mWriter1; 58 | Writer* mWriter2; 59 | Options* mOptions; 60 | FilterResult* mFilterResult; 61 | SingleProducerSingleConsumerList* mLeftInputList; 62 | SingleProducerSingleConsumerList* mRightInputList; 63 | 64 | // for spliting output 65 | int mThreadId; 66 | int mWorkingSplit; 67 | long mCurrentSplitReads; 68 | bool mCanBeStopped; 69 | }; 
70 | 71 | #endif -------------------------------------------------------------------------------- /src/sequence.cpp: -------------------------------------------------------------------------------- 1 | #include "sequence.h" 2 | 3 | Sequence::Sequence(){ 4 | } 5 | 6 | Sequence::Sequence(string* seq){ 7 | mStr = seq; 8 | } 9 | 10 | Sequence::~Sequence(){ 11 | if(mStr) 12 | delete mStr; 13 | } 14 | 15 | void Sequence::print(){ 16 | std::cerr << *mStr; 17 | } 18 | 19 | int Sequence::length(){ 20 | return mStr->length(); 21 | } 22 | 23 | string Sequence::reverseComplement(string* origin) { 24 | string str(origin->length(), 0); 25 | int len = origin->length(); 26 | for(int c=0;clength();c++){ 27 | char base = (*origin)[c]; 28 | switch(base){ 29 | case 'A': 30 | case 'a': 31 | str[len-c-1] = 'T'; 32 | break; 33 | case 'T': 34 | case 't': 35 | str[len-c-1] = 'A'; 36 | break; 37 | case 'C': 38 | case 'c': 39 | str[len-c-1] = 'G'; 40 | break; 41 | case 'G': 42 | case 'g': 43 | str[len-c-1] = 'C'; 44 | break; 45 | default: 46 | str[len-c-1] = 'N'; 47 | } 48 | } 49 | return str; 50 | } 51 | 52 | Sequence Sequence::reverseComplement(){ 53 | string* str = new string(mStr->length(), 0); 54 | int len = mStr->length(); 55 | for(int c=0;clength();c++){ 56 | char base = (*mStr)[c]; 57 | switch(base){ 58 | case 'A': 59 | case 'a': 60 | (*str)[len-c-1] = 'T'; 61 | break; 62 | case 'T': 63 | case 't': 64 | (*str)[len-c-1] = 'A'; 65 | break; 66 | case 'C': 67 | case 'c': 68 | (*str)[len-c-1] = 'G'; 69 | break; 70 | case 'G': 71 | case 'g': 72 | (*str)[len-c-1] = 'C'; 73 | break; 74 | default: 75 | (*str)[len-c-1] = 'N'; 76 | } 77 | } 78 | return Sequence(str); 79 | } 80 | 81 | Sequence Sequence::operator~(){ 82 | return reverseComplement(); 83 | } 84 | 85 | bool Sequence::test(){ 86 | Sequence s(new string("AAAATTTTCCCCGGGG")); 87 | Sequence rc = ~s; 88 | if (*(s.mStr) != "AAAATTTTCCCCGGGG" ){ 89 | cerr << "Failed in reverseComplement() expect AAAATTTTCCCCGGGG, but get "<< *(s.mStr); 90 
| return false; 91 | } 92 | if (*(rc.mStr) != "CCCCGGGGAAAATTTT" ){ 93 | cerr << "Failed in reverseComplement() expect CCCCGGGGAAAATTTT, but get "<< *(rc.mStr); 94 | return false; 95 | } 96 | return true; 97 | } -------------------------------------------------------------------------------- /src/umiprocessor.cpp: -------------------------------------------------------------------------------- 1 | #include "umiprocessor.h" 2 | 3 | UmiProcessor::UmiProcessor(Options* opt){ 4 | mOptions = opt; 5 | } 6 | 7 | 8 | UmiProcessor::~UmiProcessor(){ 9 | } 10 | 11 | void UmiProcessor::process(Read* r1, Read* r2) { 12 | if(!mOptions->umi.enabled) 13 | return; 14 | 15 | string umi; 16 | if(mOptions->umi.location == UMI_LOC_INDEX1) 17 | umi = r1->firstIndex(); 18 | else if(mOptions->umi.location == UMI_LOC_INDEX2 && r2) 19 | umi = r2->lastIndex(); 20 | else if(mOptions->umi.location == UMI_LOC_READ1){ 21 | umi = r1->mSeq->substr(0, min(r1->length(), mOptions->umi.length)); 22 | r1->trimFront(umi.length() + mOptions->umi.skip); 23 | } 24 | else if(mOptions->umi.location == UMI_LOC_READ2 && r2){ 25 | umi = r2->mSeq->substr(0, min(r2->length(), mOptions->umi.length)); 26 | r2->trimFront(umi.length() + mOptions->umi.skip); 27 | } 28 | else if(mOptions->umi.location == UMI_LOC_PER_INDEX){ 29 | string umiMerged = r1->firstIndex(); 30 | if(r2) { 31 | umiMerged = umiMerged + "_" + r2->lastIndex(); 32 | } 33 | 34 | addUmiToName(r1, umiMerged); 35 | if(r2) { 36 | addUmiToName(r2, umiMerged); 37 | } 38 | } 39 | else if(mOptions->umi.location == UMI_LOC_PER_READ){ 40 | string umi1 = r1->mSeq->substr(0, min(r1->length(), mOptions->umi.length)); 41 | string umiMerged = umi1; 42 | r1->trimFront(umi1.length() + mOptions->umi.skip); 43 | if(r2){ 44 | string umi2 = r2->mSeq->substr(0, min(r2->length(), mOptions->umi.length)); 45 | umiMerged = umiMerged + "_" + umi2; 46 | r2->trimFront(umi2.length() + mOptions->umi.skip); 47 | } 48 | 49 | addUmiToName(r1, umiMerged); 50 | if(r2){ 51 | 
/**
 * Ordering functor for strings: shorter strings sort first, and strings of
 * equal length are ordered lexicographically.
 *
 * The parameter type is spelled std::string so the definition does not
 * depend on a using-directive being in effect before this point.
 */
struct classcomp {
    bool operator() (const std::string& lhs, const std::string& rhs) const {
        if (lhs.length() != rhs.length())
            return lhs.length() < rhs.length();
        return lhs < rhs;
    }
};
for paired end 38 | void addAdapterTrimmed(string adapter1, string adapter2); 39 | void addPolyXTrimmed(int base, int length); 40 | long getTotalPolyXTrimmedReads(); 41 | long getTotalPolyXTrimmedBases(); 42 | // a part of JSON report 43 | void reportJson(ofstream& ofs, string padding); 44 | // a part of JSON report for adapters 45 | void reportAdapterJson(ofstream& ofs, string padding); 46 | // a part of JSON report for polyX trim 47 | void reportPolyXTrimJson(ofstream& ofs, string padding); 48 | // a part of HTML report 49 | void reportHtml(ofstream& ofs, long totalReads, long totalBases); 50 | // a part of HTML report for adapters 51 | void reportAdapterHtml(ofstream& ofs, long totalBases); 52 | void outputAdaptersJson(ofstream& ofs, map& adapterCounts); 53 | int outputAdaptersHtml(ofstream& ofs, map& adapterCounts, long totalBases, int limitCount = 0); 54 | int getAdapterReportCount(map& adapterCounts); 55 | // deal with base correction results 56 | long* getCorrectionMatrix() {return mCorrectionMatrix;} 57 | long getTotalCorrectedBases(); 58 | void addCorrection(char from, char to); 59 | long getCorrectionNum(char from, char to); 60 | void incCorrectedReads(int count); 61 | void addMergedPairs(int pairs); 62 | bool isLowComplexity(string& adapter); 63 | 64 | 65 | public: 66 | Options* mOptions; 67 | bool mPaired; 68 | long mCorrectedReads; 69 | long mMergedPairs; 70 | private: 71 | long mFilterReadStats[FILTER_RESULT_TYPES]; 72 | long mTrimmedAdapterRead; 73 | long mTrimmedAdapterBases; 74 | long mTrimmedPolyXReads[4] = {0}; 75 | long mTrimmedPolyXBases[4] = {0}; 76 | map mAdapter1; 77 | map mAdapter2; 78 | long* mCorrectionMatrix; 79 | }; 80 | 81 | #endif -------------------------------------------------------------------------------- /testdata/R1.fq: -------------------------------------------------------------------------------- 1 | @AS500713:64:HFKJJBGXY:1:11101:1675:1101 1:A:0:TATAGCCT+GACCCCCA 2 | 3 | + 4 | 5 | @AS500713:64:HFKJJBGXY:1:11101:17113:1101 
1:A:0:TATAGCCT+GTTTCTTA 6 | TACAAAATGCACATCGCTGAAAGGGGTAAAGGAGAGAAATCGCTTTATAAAACCTTGAAAAGGAATATTCAAATATAAGCTGGGAAGGTATAAAAAACTCTGTACATCACAAGTAAACAAATGGAACCTGCAAAATATTAAACAAAGGATT 7 | + 8 | AAAAAEEEEE6EEAAAEEEEE6EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEAEEEEEEEEEEEEEEEEECFE####EEEE6EE 3 | 4 | NucleotideNode::NucleotideNode(){ 5 | count = 0; 6 | base = 'N'; 7 | memset(children, 0, sizeof(NucleotideNode*)*8); 8 | } 9 | NucleotideNode::~NucleotideNode(){ 10 | for(int i=0; i<8; i++) { 11 | if(children[i]) 12 | delete children[i]; 13 | } 14 | } 15 | void NucleotideNode::dfs() { 16 | //cerr << base; 17 | //cerr << count; 18 | printf("%c", base); 19 | printf("%d", count); 20 | bool hasChild = false; 21 | for(int i=0; i<8; i++) { 22 | if(children[i]){ 23 | children[i]->dfs(); 24 | hasChild = true; 25 | } 26 | } 27 | if(!hasChild) { 28 | printf("\n"); 29 | } 30 | } 31 | 32 | NucleotideTree::NucleotideTree(Options* opt){ 33 | mOptions = opt; 34 | mRoot = new NucleotideNode(); 35 | } 36 | 37 | 38 | NucleotideTree::~NucleotideTree(){ 39 | delete mRoot; 40 | } 41 | 42 | void NucleotideTree::addSeq(string seq) { 43 | NucleotideNode* curNode = mRoot; 44 | for(int i=0; ichildren[base] == NULL) { 49 | curNode->children[base] = new NucleotideNode(); 50 | curNode->children[base]->base = seq[i]; 51 | } 52 | curNode->children[base]->count++; 53 | curNode = curNode->children[base]; 54 | } 55 | } 56 | 57 | string NucleotideTree::getDominantPath(bool& reachedLeaf) { 58 | stringstream ss; 59 | const double RATIO_THRESHOLD = 0.95; 60 | const int NUM_THRESHOLD = 50; 61 | NucleotideNode* curNode = mRoot; 62 | while(true) { 63 | int total = 0; 64 | for(int i=0; i<8; i++) { 65 | if(curNode->children[i] != NULL) 66 | total += curNode->children[i]->count; 67 | } 68 | if(total < NUM_THRESHOLD) 69 | break; 70 | bool hasDominant = false; 71 | for(int i=0; i<8; i++) { 72 | if(curNode->children[i] == NULL) 73 | continue; 74 | if(curNode->children[i]->count / (double)total >= RATIO_THRESHOLD) { 75 
| hasDominant = true; 76 | ss << curNode->children[i]->base; 77 | curNode = curNode->children[i]; 78 | break; 79 | } 80 | } 81 | if(!hasDominant) { 82 | reachedLeaf = false; 83 | break; 84 | } 85 | } 86 | return ss.str(); 87 | 88 | } 89 | 90 | bool NucleotideTree::test() { 91 | NucleotideTree tree(NULL); 92 | for(int i=0; i<100; i++) { 93 | tree.addSeq("AAAATTTT"); 94 | tree.addSeq("AAAATTTTGGGG"); 95 | tree.addSeq("AAAATTTTGGGGCCCC"); 96 | tree.addSeq("AAAATTTTGGGGCCAA"); 97 | } 98 | tree.addSeq("AAAATTTTGGGACCCC"); 99 | 100 | bool reachedLeaf = true; 101 | string path = tree.getDominantPath(reachedLeaf); 102 | printf("%s\n", path.c_str()); 103 | return path == "AAAATTTTGGGGCC"; 104 | } -------------------------------------------------------------------------------- /src/stats.h: -------------------------------------------------------------------------------- 1 | #ifndef STATS_H 2 | #define STATS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "read.h" 10 | #include "options.h" 11 | 12 | using namespace std; 13 | 14 | class Stats{ 15 | public: 16 | // this @guessedCycles parameter should be calculated using the first several records 17 | Stats(Options* opt, bool isRead2 = false, int guessedCycles = 0, int bufferMargin = 1024); 18 | ~Stats(); 19 | int getCycles(); 20 | long getReads(); 21 | long getBases(); 22 | long getQ20(); 23 | long getQ30(); 24 | long getQ40(); 25 | long getGCNumber(); 26 | long* getQualHist(); 27 | // by default the qualified qual score is Q20 ('5') 28 | void statRead(Read* r); 29 | 30 | static Stats* merge(vector& list); 31 | void print(); 32 | void summarize(bool forced = false); 33 | // a port of JSON report 34 | void reportJson(ofstream& ofs, string padding); 35 | // a port of HTML report 36 | void reportHtml(ofstream& ofs, string filteringType, string readName); 37 | void reportHtmlQuality(ofstream& ofs, string filteringType, string readName); 38 | void reportHtmlContents(ofstream& ofs, string 
filteringType, string readName); 39 | void reportHtmlKMER(ofstream& ofs, string filteringType, string readName); 40 | void reportHtmlORA(ofstream& ofs, string filteringType, string readName); 41 | bool isLongRead(); 42 | void initOverRepSeq(); 43 | int getMeanLength(); 44 | 45 | public: 46 | static string list2string(double* list, int size); 47 | static string list2string(double* list, int size, long* coords); 48 | static string list2string(long* list, int size); 49 | static int base2val(char base); 50 | 51 | private: 52 | void extendBuffer(int newBufLen); 53 | string makeKmerTD(int i, int j); 54 | string kmer3(int val); 55 | string kmer2(int val); 56 | void deleteOverRepSeqDist(); 57 | bool overRepPassed(string& seq, long count); 58 | 59 | private: 60 | Options* mOptions; 61 | bool mIsRead2; 62 | long mReads; 63 | int mEvaluatedSeqLen; 64 | /* 65 | why we use 8 here? 66 | map A/T/C/G/N to 0~7 by their ASCII % 8: 67 | 'A' % 8 = 1 68 | 'T' % 8 = 4 69 | 'C' % 8 = 3 70 | 'G' % 8 = 7 71 | 'N' % 8 = 6 72 | */ 73 | long *mCycleQ30Bases[8]; 74 | long *mCycleQ20Bases[8]; 75 | long *mCycleBaseContents[8]; 76 | long *mCycleBaseQual[8]; 77 | long *mCycleTotalBase; 78 | long *mCycleTotalQual; 79 | long *mKmer; 80 | long mBaseQualHistogram[128]; 81 | 82 | map mQualityCurves; 83 | map mContentCurves; 84 | map mOverRepSeq; 85 | map mOverRepSeqDist; 86 | 87 | 88 | int mCycles; 89 | int mBufLen; 90 | long mBases; 91 | long mQ20Bases[8]; 92 | long mQ30Bases[8]; 93 | long mBaseContents[8]; 94 | long mQ20Total; 95 | long mQ30Total; 96 | long mQ40Total; 97 | bool summarized; 98 | long mKmerMax; 99 | long mKmerMin; 100 | int mKmerBufLen; 101 | long mLengthSum; 102 | }; 103 | 104 | #endif -------------------------------------------------------------------------------- /src/fastqreader.h: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2021 Shifu Chen 5 | 6 | Permission is hereby granted, free of charge, to any 
person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | */ 24 | 25 | #ifndef FASTQ_READER_H 26 | #define FASTQ_READER_H 27 | 28 | #include 29 | #include 30 | #include "read.h" 31 | #include "common.h" 32 | #include 33 | #include 34 | #include "igzip_lib.h" 35 | #include "readpool.h" 36 | 37 | class FastqReader{ 38 | public: 39 | FastqReader(string filename, bool hasQuality = true, bool phred64=false); 40 | ~FastqReader(); 41 | bool isZipped(); 42 | 43 | void getBytes(size_t& bytesRead, size_t& bytesTotal); 44 | 45 | //this function is not thread-safe 46 | //do not call read() of a same FastqReader object from different threads concurrently 47 | Read* read(); 48 | bool eof(); 49 | bool hasNoLineBreakAtEnd(); 50 | void setReadPool(ReadPool* rp); 51 | 52 | public: 53 | static bool isZipFastq(string filename); 54 | static bool isFastq(string filename); 55 | static bool test(); 56 | 57 | private: 58 | void init(); 59 | void close(); 60 | void getLine(string* line); 61 | void clearLineBreaks(char* line); 62 | void readToBuf(); 63 | void readToBufIgzip(); 64 | bool bufferFinished(); 65 | 66 | private: 67 | string mFilename; 68 | struct isal_gzip_header mGzipHeader; 69 | struct inflate_state mGzipState; 70 | unsigned char *mGzipInputBuffer; 71 | unsigned char *mGzipOutputBuffer; 72 | size_t mGzipInputBufferSize; 73 | size_t mGzipOutputBufferSize; 74 | size_t mGzipInputUsedBytes; 75 | FILE* mFile; 76 | bool mZipped; 77 | char* mFastqBuf; 78 | int mBufDataLen; 79 | int mBufUsedLen; 80 | bool mStdinMode; 81 | bool mHasNoLineBreakAtEnd; 82 | long mCounter; 83 | bool mHasQuality; 84 | bool mPhred64; 85 | ReadPool* mReadPool; 86 | 87 | }; 88 | 89 | class FastqReaderPair{ 90 | public: 91 | FastqReaderPair(FastqReader* left, FastqReader* right); 92 | FastqReaderPair(string leftName, string rightName, bool hasQuality = true, bool phred64 = false, bool interleaved = false); 93 | ~FastqReaderPair(); 94 | ReadPair* read(); 95 | public: 96 | FastqReader* mLeft; 97 | FastqReader* mRight; 98 | bool mInterleaved; 99 | }; 100 | 101 | 
#endif 102 | -------------------------------------------------------------------------------- /testdata/R2.fq: -------------------------------------------------------------------------------- 1 | @AS500713:64:HFKJJBGXY:1:11101:1675:1101 2:A:0:TATAGCCT+GACCCCCA 2 | TAGGGCAAAACAAATAAGACAAAAAAAAAAAATGAATGAAAATGCAATTTTATTTACCACTTTGATGCTCAAAATGGCACTGCCAGGAAGCTGCCTGGGTTTAAAAATTTCCCGACCTCCTGAAATGTCTGGGGACCAGGAAGGTGGGCTC 3 | + 4 | AAAAA/EEEA/EEEE#AEA<####E######EE/A#E/##EEEE/E< 5 | 6 | FastaReader::FastaReader(string faFile, bool forceUpperCase) 7 | { 8 | // Set locale and disable stdio synchronization to improve iostream performance 9 | // http://www.drdobbs.com/the-standard-librarian-iostreams-and-std/184401305 10 | // http://stackoverflow.com/questions/5166263/how-to-get-iostream-to-perform-better 11 | setlocale(LC_ALL,"C"); 12 | ios_base::sync_with_stdio(false); 13 | 14 | mFastaFile = faFile; 15 | mForceUpperCase = forceUpperCase; 16 | if (is_directory(mFastaFile)) { 17 | string error_msg = "There is a problem with the provided fasta file: \'"; 18 | error_msg.append(mFastaFile); 19 | error_msg.append("\' is a directory NOT a file...\n"); 20 | throw invalid_argument(error_msg); 21 | } 22 | mFastaFileStream.open( mFastaFile.c_str(),ios::in); 23 | // verify that the file can be read 24 | if (!mFastaFileStream.is_open()) { 25 | string msg = "There is a problem with the provided fasta file: could NOT read "; 26 | msg.append(mFastaFile.c_str()); 27 | msg.append("...\n"); 28 | throw invalid_argument(msg); 29 | } 30 | 31 | char c; 32 | // seek to first contig 33 | while (mFastaFileStream.get(c) && c != '>') { 34 | if (mFastaFileStream.eof()) { 35 | break; 36 | } 37 | } 38 | } 39 | 40 | FastaReader::~FastaReader() 41 | { 42 | if (mFastaFileStream.is_open()) { 43 | mFastaFileStream.close(); 44 | } 45 | } 46 | 47 | void FastaReader::readNext() 48 | { 49 | mCurrentID = ""; 50 | mCurrentDescription = ""; 51 | mCurrentSequence = ""; 52 | bool foundHeader = false; 53 | 54 | char c; 
55 | stringstream ssSeq; 56 | stringstream ssHeader; 57 | while(true){ 58 | mFastaFileStream.get(c); 59 | if(c == '>' || mFastaFileStream.eof()) 60 | break; 61 | else { 62 | if (foundHeader){ 63 | if(mForceUpperCase && c>='a' && c<='z') { 64 | c -= ('a' - 'A'); 65 | } 66 | ssSeq << c; 67 | } 68 | else 69 | ssHeader << c; 70 | } 71 | 72 | string line = ""; 73 | getline(mFastaFileStream,line,'\n'); 74 | 75 | 76 | if(foundHeader == false) { 77 | ssHeader << line; 78 | foundHeader = true; 79 | } 80 | else { 81 | str_keep_valid_sequence(line, mForceUpperCase); 82 | ssSeq << line; 83 | } 84 | } 85 | mCurrentSequence = ssSeq.str(); 86 | string header = ssHeader.str(); 87 | 88 | mCurrentID = header; 89 | } 90 | 91 | bool FastaReader::hasNext() { 92 | return !mFastaFileStream.eof(); 93 | } 94 | 95 | void FastaReader::readAll() { 96 | while(!mFastaFileStream.eof()){ 97 | readNext(); 98 | mAllContigs[mCurrentID] = mCurrentSequence; 99 | } 100 | } 101 | 102 | bool FastaReader::test(){ 103 | FastaReader reader("testdata/tinyref.fa"); 104 | reader.readAll(); 105 | 106 | string contig1 = "GATCACAGGTCTATCACCCTATTAATTGGTATTTTCGTCTGGGGGGTGTGGAGCCGGAGCACCCTATGTCGCAGT"; 107 | string contig2 = "GTCTGCACAGCCGCTTTCCACACAGAACCCCCCCCTCCCCCCGCTTCTGGCAAACCCCAAAAACAAAGAACCCTA"; 108 | 109 | if(reader.mAllContigs.count("contig1") == 0 || reader.mAllContigs.count("contig2") == 0 ) 110 | return false; 111 | 112 | if(reader.mAllContigs["contig1"] != contig1 || reader.mAllContigs["contig2"] != contig2 ) 113 | return false; 114 | 115 | return true; 116 | 117 | } 118 | 119 | 120 | 121 | -------------------------------------------------------------------------------- /src/polyx.cpp: -------------------------------------------------------------------------------- 1 | #include "polyx.h" 2 | #include "common.h" 3 | 4 | PolyX::PolyX(){ 5 | } 6 | 7 | 8 | PolyX::~PolyX(){ 9 | } 10 | 11 | void PolyX::trimPolyG(Read* r1, Read* r2, FilterResult* fr, int compareReq) { 12 | trimPolyG(r1, fr, compareReq); 13 | 
trimPolyG(r2, fr, compareReq); 14 | } 15 | 16 | void PolyX::trimPolyG(Read* r, FilterResult* fr, int compareReq) { 17 | const int allowOneMismatchForEach = 8; 18 | const int maxMismatch = 5; 19 | 20 | const char* data = r->mSeq->c_str(); 21 | 22 | int rlen = r->length(); 23 | 24 | int mismatch = 0; 25 | int i = 0; 26 | int firstGPos = rlen - 1; 27 | for(i=0; i< rlen; i++) { 28 | if(data[rlen - i - 1] != 'G') { 29 | mismatch++; 30 | } else { 31 | firstGPos = rlen - i -1; 32 | } 33 | 34 | int allowedMismatch = (i+1)/allowOneMismatchForEach; 35 | if(mismatch > maxMismatch || (mismatch>allowedMismatch && i>= compareReq-1) ) 36 | break; 37 | } 38 | 39 | if(i >= compareReq) { 40 | r->resize(firstGPos); 41 | } 42 | } 43 | 44 | void PolyX::trimPolyX(Read* r1, Read* r2, FilterResult* fr, int compareReq) { 45 | trimPolyX(r1, fr, compareReq); 46 | trimPolyX(r2, fr, compareReq); 47 | } 48 | 49 | void PolyX::trimPolyX(Read* r, FilterResult* fr, int compareReq) { 50 | const int allowOneMismatchForEach = 8; 51 | const int maxMismatch = 5; 52 | 53 | const char* data = r->mSeq->c_str(); 54 | 55 | int rlen = r->length(); 56 | 57 | 58 | int atcgNumbers[4] = {0, 0, 0, 0}; 59 | int pos = 0; 60 | for(pos=0; pos= allowOneMismatchForEach || pos+1 >= compareReq-1)) { 93 | break; 94 | } 95 | } 96 | 97 | // has polyX 98 | if(pos+1 >= compareReq) { 99 | // find the poly 100 | int poly; 101 | int maxCount = -1; 102 | for(int b=0; b<4; b++) { 103 | if(atcgNumbers[b] > maxCount){ 104 | maxCount = atcgNumbers[b]; 105 | poly = b; 106 | } 107 | } 108 | char polyBase = ATCG_BASES[poly]; 109 | while(data[rlen - pos - 1] != polyBase && pos>=0) 110 | pos--; 111 | 112 | r->resize(rlen - pos - 1); 113 | if(fr) 114 | fr->addPolyXTrimmed(poly, pos + 1); 115 | } 116 | } 117 | 118 | bool PolyX::test() { 119 | 120 | Read r("@name", 121 | "ATTTTAAAAAAAAAATAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAT", 122 | "+", 123 | "///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////E"); 124 | 125 | FilterResult fr(NULL, 
false); 126 | PolyX::trimPolyX(&r, &fr, 10); 127 | r.print(); 128 | 129 | return *r.mSeq == "ATTTT" && fr.getTotalPolyXTrimmedReads() == 1 && fr.getTotalPolyXTrimmedBases() == 51; 130 | } -------------------------------------------------------------------------------- /src/basecorrector.cpp: -------------------------------------------------------------------------------- 1 | #include "basecorrector.h" 2 | #include "util.h" 3 | 4 | BaseCorrector::BaseCorrector(){ 5 | } 6 | 7 | 8 | BaseCorrector::~BaseCorrector(){ 9 | } 10 | 11 | int BaseCorrector::correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit) { 12 | OverlapResult ov = OverlapAnalysis::analyze(r1, r2, diffLimit, overlapRequire, diffPercentLimit); 13 | return correctByOverlapAnalysis(r1, r2, fr, ov); 14 | } 15 | 16 | int BaseCorrector::correctByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov) { 17 | // we only correct overlap 18 | if(ov.diff == 0 || !ov.overlapped) 19 | return 0; 20 | 21 | int ol = ov.overlap_len; 22 | int start1 = max(0, ov.offset); 23 | int start2 = r2->length() - max(0, -ov.offset) - 1; 24 | 25 | const char* seq1 = r1->mSeq->c_str(); 26 | const char* seq2 = r2->mSeq->c_str(); 27 | const char* qual1 = r1->mQuality->c_str(); 28 | const char* qual2 = r2->mQuality->c_str(); 29 | 30 | const char GOOD_QUAL = num2qual(30); 31 | const char BAD_QUAL = num2qual(14); 32 | 33 | int corrected = 0; 34 | int uncorrected = 0; 35 | bool r1Corrected = false; 36 | bool r2Corrected = false; 37 | for(int i=0; i= GOOD_QUAL && qual2[p2] <= BAD_QUAL) { 43 | // use R1 44 | (*r2->mSeq)[p2] = complement(seq1[p1]); 45 | (*r2->mQuality)[p2] = qual1[p1]; 46 | corrected++; 47 | r2Corrected = true; 48 | if(fr) { 49 | fr->addCorrection(seq2[p2], complement(seq1[p1])); 50 | } 51 | } else if(qual2[p2] >= GOOD_QUAL && qual1[p1] <= BAD_QUAL) { 52 | // use R2 53 | (*r1->mSeq)[p1] = complement(seq2[p2]); 54 | (*r1->mQuality)[p1] = 
qual2[p2]; 55 | corrected++; 56 | r1Corrected = true; 57 | if(fr) { 58 | fr->addCorrection(seq1[p1], complement(seq2[p2])); 59 | } 60 | } else { 61 | uncorrected++; 62 | } 63 | } 64 | } 65 | 66 | // should never happen 67 | if(uncorrected + corrected != ov.diff) { 68 | static bool warned = false; 69 | if(!warned){ 70 | cerr << "WARNING: the algorithm is wrong! uncorrected + corrected != ov.diff" << endl; 71 | warned = true; 72 | } 73 | } 74 | 75 | if(corrected > 0 && fr) { 76 | if(r1Corrected && r2Corrected) 77 | fr->incCorrectedReads(2); 78 | else 79 | fr->incCorrectedReads(1); 80 | } 81 | 82 | return corrected; 83 | } 84 | 85 | bool BaseCorrector::test() { 86 | Read r1(new string("@name"), 87 | new string("TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCACGGGG"), 88 | new string("+"), 89 | new string("EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEE")); 90 | Read r2(new string("@name"), 91 | new string("AAAAAAAAAACCCCGGGGAAAATTTTAAAATTGGGGGGGGGGTGGGGGGGGGGGGG"), 92 | new string("+"), 93 | new string("EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEE")); 94 | 95 | correctByOverlapAnalysis(&r1, &r2, NULL, 5, 30, 0.2); 96 | 97 | if(*r1.mSeq != "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCCCGGGG") 98 | return false; 99 | if(*r2.mSeq != "AAAAAAAAAACCCCGGGGAAAATTTTAAAATTGGGGGGGGGGGGGGGGGGGGGGGG") 100 | return false; 101 | if(*r1.mQuality != "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE") 102 | return false; 103 | if(*r2.mQuality != "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE") 104 | return false; 105 | 106 | return true; 107 | } -------------------------------------------------------------------------------- /src/matcher.cpp: -------------------------------------------------------------------------------- 1 | #include "matcher.h" 2 | 3 | Matcher::Matcher(){ 4 | } 5 | 6 | 7 | Matcher::~Matcher(){ 8 | } 9 | 10 | bool Matcher::matchWithOneInsertion(const char* insData, const char* normalData, int cmplen, int diffLimit) { 11 
| // accumulated mismatches from left/right 12 | int accMismatchFromLeft[cmplen]; 13 | int accMismatchFromRight[cmplen]; 14 | 15 | // accMismatchFromLeft[0]: head vs. head 16 | // accMismatchFromRight[cmplen-1]: tail vs. tail 17 | accMismatchFromLeft[0] = insData[0] == normalData[0] ? 0 : 1; 18 | accMismatchFromRight[cmplen-1] = insData[cmplen] == normalData[cmplen-1] ? 0 : 1; 19 | for(int i=1; idiffLimit) 26 | break; 27 | } 28 | for(int i=cmplen - 2; i>=0; i--) { 29 | if(insData[i+1] != normalData[i]) 30 | accMismatchFromRight[i] = accMismatchFromRight[i+1]+1; 31 | else 32 | accMismatchFromRight[i] = accMismatchFromRight[i+1]; 33 | if(accMismatchFromRight[i] + accMismatchFromLeft[0]> diffLimit) { 34 | for(int p=0; p diffLimit) 47 | return false; 48 | int diff = accMismatchFromLeft[i-1] + accMismatchFromRight[i]; 49 | if(diff <= diffLimit) 50 | return true; 51 | } 52 | 53 | return false; 54 | } 55 | 56 | int Matcher::diffWithOneInsertion(const char* insData, const char* normalData, int cmplen, int diffLimit) { 57 | // accumulated mismatches from left/right 58 | int accMismatchFromLeft[cmplen]; 59 | int accMismatchFromRight[cmplen]; 60 | 61 | // accMismatchFromLeft[0]: head vs. head 62 | // accMismatchFromRight[cmplen-1]: tail vs. tail 63 | accMismatchFromLeft[0] = insData[0] == normalData[0] ? 0 : 1; 64 | accMismatchFromRight[cmplen-1] = insData[cmplen] == normalData[cmplen-1] ? 
0 : 1; 65 | for(int i=1; idiffLimit) 72 | break; 73 | } 74 | for(int i=cmplen - 2; i>=0; i--) { 75 | if(insData[i+1] != normalData[i]) 76 | accMismatchFromRight[i] = accMismatchFromRight[i+1]+1; 77 | else 78 | accMismatchFromRight[i] = accMismatchFromRight[i+1]; 79 | if(accMismatchFromRight[i] + accMismatchFromLeft[0]> diffLimit) { 80 | for(int p=0; p diffLimit) 94 | return -1; // -1 means higher than diffLimit 95 | int diff = accMismatchFromLeft[i-1] + accMismatchFromRight[i]; 96 | if(diff <= minDiff) 97 | minDiff = diff; 98 | } 99 | 100 | return minDiff; 101 | } -------------------------------------------------------------------------------- /src/writer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2021 Shifu Chen 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | */ 24 | 25 | #include "writer.h" 26 | #include "util.h" 27 | #include 28 | 29 | Writer::Writer(Options* opt, string filename, int compression, bool isSTDOUT){ 30 | mCompression = compression; 31 | mFilename = filename; 32 | mCompressor = NULL; 33 | mZipped = false; 34 | haveToClose = true; 35 | mBuffer = NULL; 36 | mBufDataLen = 0; 37 | mOptions = opt; 38 | mBufSize = mOptions->writerBufferSize; 39 | mSTDOUT = isSTDOUT; 40 | init(); 41 | } 42 | 43 | Writer::~Writer(){ 44 | flush(); 45 | if(haveToClose) { 46 | close(); 47 | } 48 | } 49 | 50 | void Writer::flush() { 51 | if(mBufDataLen > 0) { 52 | writeInternal(mBuffer, mBufDataLen); 53 | mBufDataLen = 0; 54 | } 55 | } 56 | 57 | string Writer::filename(){ 58 | return mFilename; 59 | } 60 | 61 | void Writer::init(){ 62 | mBuffer = (char*) malloc(mBufSize); 63 | if(mBuffer == NULL) { 64 | error_exit("Failed to allocate write buffer with size: " + to_string(mBufSize)); 65 | } 66 | if(mSTDOUT) { 67 | mFP = stdout; 68 | return ; 69 | } 70 | if (ends_with(mFilename, ".gz")){ 71 | mCompressor = libdeflate_alloc_compressor(mCompression); 72 | if(mCompressor == NULL) { 73 | error_exit("Failed to alloc libdeflate_alloc_compressor, please check the libdeflate library."); 74 | } 75 | mZipped = true; 76 | mFP = fopen(mFilename.c_str(), "wb"); 77 | if(mFP == NULL) { 78 | error_exit("Failed to write: " + mFilename); 79 | } 80 | } else { 81 | mFP = fopen(mFilename.c_str(), "wb"); 82 | if(mFP == NULL) { 83 | error_exit("Failed to write: " + mFilename); 84 | } 85 | //mOutStream = new ofstream(); 86 | //mOutStream->open(mFilename.c_str(), ifstream::out); 87 | } 88 | } 89 | 90 | bool Writer::writeString(const string& str) { 91 | return write(str.data(), str.length()); 92 | } 93 | 94 | bool Writer::writeString(string* str) { 95 | return write(str->data(), str->length()); 96 | } 97 | 98 | bool Writer::write(const char* strdata, size_t size) { 99 | if(size + mBufDataLen > mBufSize) 100 | flush(); 101 | if(size > mBufSize) 102 | return 
writeInternal(strdata, size); 103 | else { 104 | memcpy(mBuffer + mBufDataLen, strdata, size); 105 | mBufDataLen += size; 106 | } 107 | return true; 108 | } 109 | 110 | bool Writer::writeInternal(const char* strdata, size_t size) { 111 | size_t written; 112 | bool status; 113 | 114 | if(mZipped){ 115 | size_t bound = libdeflate_gzip_compress_bound(mCompressor, size); 116 | void* out = malloc(bound); 117 | size_t outsize = libdeflate_gzip_compress(mCompressor, strdata, size, out, bound); 118 | if(outsize == 0) 119 | status = false; 120 | else { 121 | size_t ret = fwrite(out, 1, outsize, mFP ); 122 | status = ret>0; 123 | //mOutStream->write((char*)out, outsize); 124 | //status = !mOutStream->fail(); 125 | } 126 | free(out); 127 | } 128 | else{ 129 | size_t ret = fwrite(strdata, 1, size, mFP ); 130 | status = ret>0; 131 | } 132 | return status; 133 | } 134 | 135 | void Writer::close(){ 136 | if (mZipped){ 137 | if (mCompressor){ 138 | libdeflate_free_compressor(mCompressor); 139 | mCompressor = NULL; 140 | } 141 | } 142 | if(mBuffer) { 143 | free(mBuffer); 144 | mBuffer = NULL; 145 | } 146 | if(mFP && !mSTDOUT) { 147 | fclose(mFP); 148 | mFP = NULL; 149 | } 150 | } 151 | 152 | bool Writer::isZipped(){ 153 | return mZipped; 154 | } -------------------------------------------------------------------------------- /src/duplicate.cpp: -------------------------------------------------------------------------------- 1 | #include "duplicate.h" 2 | #include "overlapanalysis.h" 3 | #include 4 | #include 5 | #include "util.h" 6 | 7 | const int PRIME_ARRAY_LEN = 1<<9; 8 | 9 | Duplicate::Duplicate(Options* opt) { 10 | mOptions = opt; 11 | 12 | // 1G mem required 13 | mBufLenInBytes = 1L <<29; 14 | mBufNum = 2; 15 | 16 | // the memory usage increases as the accuracy level increases 17 | // level 1: 1G 18 | // level 2: 2G 19 | // level 3: 4G 20 | // level 4: 8G 21 | // level 5: 16G 22 | // level 6: 24G 23 | switch(mOptions->duplicate.accuracyLevel) { 24 | case 1: 25 | break; 26 | 
case 2: 27 | mBufLenInBytes *= 2; 28 | break; 29 | case 3: 30 | mBufLenInBytes *= 2; 31 | mBufNum *= 2; 32 | break; 33 | case 4: 34 | mBufLenInBytes *= 4; 35 | mBufNum *= 2; 36 | break; 37 | case 5: 38 | mBufLenInBytes *= 8; 39 | mBufNum *= 2; 40 | break; 41 | case 6: 42 | mBufLenInBytes *= 8; 43 | mBufNum *= 3; 44 | break; 45 | default: 46 | break; 47 | } 48 | 49 | mOffsetMask = PRIME_ARRAY_LEN * mBufNum - 1; 50 | 51 | mBufLenInBits = mBufLenInBytes << 3; 52 | mDupBuf = new atomic_uchar[mBufLenInBytes * mBufNum]; 53 | if(!mDupBuf) { 54 | error_exit("Out of memory, failed to allocate " + to_string(mBufLenInBytes * mBufNum) + " bytes buffer for duplication analysis, please reduce the dup_accuracy_level and try again."); 55 | } 56 | memset(mDupBuf, 0, sizeof(atomic_uchar) * mBufLenInBytes * mBufNum); 57 | 58 | mPrimeArrays = new uint64[mBufNum * PRIME_ARRAY_LEN]; 59 | memset(mPrimeArrays, 0, sizeof(uint64) * mBufNum * PRIME_ARRAY_LEN); 60 | initPrimeArrays(); 61 | 62 | mTotalReads = 0; 63 | mDupReads = 0; 64 | } 65 | 66 | void Duplicate::initPrimeArrays() { 67 | uint64 number = 10000; 68 | uint64 count = 0; 69 | while(count < mBufNum * PRIME_ARRAY_LEN) { 70 | number++; 71 | bool isPrime = true; 72 | for(uint64 i=2; i<=sqrt(number); i++) { 73 | if(number%i == 0) { 74 | isPrime = false; 75 | break; 76 | } 77 | } 78 | if(isPrime) { 79 | mPrimeArrays[count] = number; 80 | count++; 81 | number += 10000; 82 | } 83 | } 84 | } 85 | 86 | Duplicate::~Duplicate(){ 87 | delete[] mDupBuf; 88 | delete[] mPrimeArrays; 89 | } 90 | 91 | void Duplicate::seq2intvector(const char* data, int len, uint64* output, int posOffset) { 92 | for(int p=0; plength(); 125 | seq2intvector(r->mSeq->c_str(), len, positions); 126 | bool isDup = applyBloomFilter(positions); 127 | delete[] positions; 128 | 129 | mTotalReads++; 130 | if(isDup) 131 | mDupReads++; 132 | 133 | return isDup; 134 | } 135 | 136 | bool Duplicate::checkPair(Read* r1, Read* r2) { 137 | uint64* positions = new uint64[mBufNum]; 138 
| 139 | // init 140 | for(int i=0; imSeq->c_str(), r1->length(), positions); 143 | seq2intvector(r2->mSeq->c_str(), r2->length(), positions, r1->length()); 144 | bool isDup = applyBloomFilter(positions); 145 | delete[] positions; 146 | 147 | mTotalReads++; 148 | if(isDup) 149 | mDupReads++; 150 | 151 | return isDup; 152 | } 153 | 154 | bool Duplicate::applyBloomFilter(uint64* positions) { // sets one bit per buffer for this read and returns true only if every one of those bits was already set 155 | bool isDup = true; 156 | for(int i=0; i> 3; 159 | uint32 bitOffset = pos & 0x07; 160 | uint8 byte = (0x01) << bitOffset; 161 | 162 | //isDup = isDup && (mDupBuf[i * mBufLenInBytes + bytePos] & byte); 163 | uint8 ret = atomic_fetch_or(mDupBuf + i * mBufLenInBytes + bytePos, byte); 164 | isDup = isDup && ((ret & byte) != 0); // accumulate across buffers: one previously-unset bit proves the read is new; the fetch_or above still runs for every buffer 165 | } 166 | return isDup; 167 | } 168 | 169 | double Duplicate::getDupRate() { 170 | if(mTotalReads == 0) 171 | return 0.0; 172 | return (double)mDupReads/(double)mTotalReads; 173 | } -------------------------------------------------------------------------------- /src/threadconfig.cpp: -------------------------------------------------------------------------------- 1 | #include "threadconfig.h" 2 | #include "util.h" 3 | 4 | ThreadConfig::ThreadConfig(Options* opt, int threadId, bool paired){ 5 | mOptions = opt; 6 | mThreadId = threadId; 7 | mWorkingSplit = threadId; 8 | mCurrentSplitReads = 0; 9 | mPreStats1 = new Stats(mOptions, false); 10 | mPostStats1 = new Stats(mOptions, false); 11 | if(paired){ 12 | mPreStats2 = new Stats(mOptions, true); 13 | mPostStats2 = new Stats(mOptions, true); 14 | } 15 | else { 16 | mPreStats2 = NULL; 17 | mPostStats2 = NULL; 18 | } 19 | mWriter1 = NULL; 20 | mWriter2 = NULL; 21 | 22 | mFilterResult = new FilterResult(opt, paired); 23 | mCanBeStopped = false; 24 | mLeftInputList = NULL; 25 | mRightInputList = NULL; 26 | } 27 | 28 | ThreadConfig::~ThreadConfig() { 29 | cleanup(); 30 | } 31 | 32 | void ThreadConfig::cleanup() { 33 | if(mOptions->split.enabled && mOptions->split.byFileNumber) 34 | writeEmptyFilesForSplitting(); 35 | 
deleteWriter(); 36 | if(mLeftInputList) { 37 | delete mLeftInputList; 38 | mLeftInputList = NULL; 39 | } 40 | if(mRightInputList) { 41 | delete mRightInputList; 42 | mRightInputList = NULL; 43 | } 44 | if(mPreStats1) { 45 | delete mPreStats1; 46 | mPreStats1 = NULL; 47 | } 48 | if(mPostStats1) { 49 | delete mPostStats1; 50 | mPostStats1 = NULL; 51 | } 52 | if(mPreStats2) { 53 | delete mPreStats2; 54 | mPreStats2 = NULL; 55 | } 56 | if(mPostStats2) { 57 | delete mPostStats2; 58 | mPostStats2 = NULL; 59 | } 60 | if(mFilterResult) { 61 | delete mFilterResult; 62 | mFilterResult = NULL; 63 | } 64 | } 65 | 66 | 67 | void ThreadConfig::setInputList(SingleProducerSingleConsumerList* list) { 68 | mLeftInputList = list; 69 | } 70 | 71 | void ThreadConfig::setInputListPair(SingleProducerSingleConsumerList* left, SingleProducerSingleConsumerList* right) { 72 | mLeftInputList = left; 73 | mRightInputList = right; 74 | } 75 | 76 | void ThreadConfig::deleteWriter() { 77 | if(mWriter1 != NULL) { 78 | delete mWriter1; 79 | mWriter1 = NULL; 80 | } 81 | if(mWriter2 != NULL) { 82 | delete mWriter2; 83 | mWriter2 = NULL; 84 | } 85 | } 86 | 87 | void ThreadConfig::initWriter(string filename1) { 88 | deleteWriter(); 89 | mWriter1 = new Writer(mOptions, filename1, mOptions->compression); 90 | } 91 | 92 | void ThreadConfig::initWriter(string filename1, string filename2) { 93 | deleteWriter(); 94 | mWriter1 = new Writer(mOptions, filename1, mOptions->compression); 95 | mWriter2 = new Writer(mOptions, filename2, mOptions->compression); 96 | } 97 | 98 | void ThreadConfig::addFilterResult(int result, int readNum) { 99 | mFilterResult->addFilterResult(result, readNum); 100 | } 101 | 102 | void ThreadConfig::addMergedPairs(int pairs) { 103 | mFilterResult->addMergedPairs(pairs); 104 | } 105 | 106 | void ThreadConfig::initWriterForSplit() { 107 | if(mOptions->out1.empty()) 108 | return ; 109 | 110 | // use 1-based naming 111 | string num = to_string(mWorkingSplit + 1); 112 | // padding for 
digits like 0001 113 | if(mOptions->split.digits > 0){ 114 | while(num.size() < mOptions->split.digits) 115 | num = "0" + num; 116 | } 117 | 118 | string filename1 = joinpath(dirname(mOptions->out1), num + "." + basename(mOptions->out1)); 119 | if(!mOptions->isPaired()) { 120 | initWriter(filename1); 121 | } else { 122 | string filename2 = joinpath(dirname(mOptions->out2), num + "." + basename(mOptions->out2)); 123 | initWriter(filename1, filename2); 124 | } 125 | } 126 | 127 | void ThreadConfig::markProcessed(long readNum) { 128 | mCurrentSplitReads += readNum; 129 | if(!mOptions->split.enabled) 130 | return ; 131 | // if splitting is enabled, check whether current file is full 132 | if(mCurrentSplitReads >= mOptions->split.size) { 133 | // if it's splitting by file number, totally we cannot exceed split.number 134 | // if it's splitting by file lines, then we don't need to check 135 | if(mOptions->split.byFileLines || mWorkingSplit + mOptions->thread < mOptions->split.number ){ 136 | mWorkingSplit += mOptions->thread; 137 | initWriterForSplit(); 138 | mCurrentSplitReads = 0; 139 | } else { 140 | // this thread can be stopped now since all its tasks are done 141 | // only a part of threads have to deal with the remaining reads 142 | if(mOptions->split.number % mOptions->thread >0 143 | && mThreadId >= mOptions->split.number % mOptions->thread) 144 | mCanBeStopped = true; 145 | } 146 | } 147 | } 148 | 149 | // if a task of writing N files is assigned to this thread, but the input file doesn't have so many reads to input 150 | // write some empty files so it will not break following pipelines 151 | void ThreadConfig::writeEmptyFilesForSplitting() { 152 | while(mWorkingSplit + mOptions->thread < mOptions->split.number) { 153 | mWorkingSplit += mOptions->thread; 154 | initWriterForSplit(); 155 | mCurrentSplitReads = 0; 156 | } 157 | } 158 | 159 | bool ThreadConfig::canBeStopped() { 160 | return mCanBeStopped; 161 | } 
-------------------------------------------------------------------------------- /src/singleproducersingleconsumerlist.h: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2021 Shifu Chen 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | */ 24 | 25 | // An ultra-fast lock-free linked list for single-producer, single-consumer threading 26 | // Memory usage overhead: 3M bytes per list, if you want to save memory, please use smaller block and smaller ring buffer 27 | // The type T is usually a pointer, a built-in type (such as int, long), or a class that supports assignment T a = b; 28 | 29 | /* WARNING: only supports up to 1G unconsumed elements in list, 30 | which means: produced - consumed must < 1G, 31 | this is usually much more than enough, 32 | but if you want to support even more unconsumed elements, 33 | please modify the value of blocksRingBufferSize as you want. 34 | */ 35 | 36 | #ifndef SINGLE_PRODUCER_SINGLE_CONSUMER_LIST_H 37 | #define SINGLE_PRODUCER_SINGLE_CONSUMER_LIST_H 38 | 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | template 46 | struct LockFreeListItem { 47 | public: 48 | inline LockFreeListItem(T val) { 49 | value = val; 50 | nextItemReady = false; 51 | nextItem = NULL; 52 | } 53 | inline LockFreeListItem() { 54 | nextItem = NULL; 55 | nextItemReady = false; 56 | } 57 | T value; 58 | LockFreeListItem* nextItem; 59 | std::atomic_bool nextItemReady; 60 | }; 61 | 62 | template 63 | class SingleProducerSingleConsumerList { 64 | public: 65 | inline SingleProducerSingleConsumerList() { 66 | head = NULL; 67 | tail = NULL; 68 | producerFinished = false; 69 | consumerFinished = false; 70 | produced = 0; 71 | consumed = 0; 72 | recycled = 0; 73 | blocksRingBufferSize = 0x1L << 18; 74 | blocksRingBufferSizeMask = blocksRingBufferSize - 1; 75 | blocksNum = 0; 76 | // 2M memory 77 | blocks = new LockFreeListItem*[blocksRingBufferSize]; 78 | memset(blocks, 0, sizeof(LockFreeListItem*) * blocksRingBufferSize); 79 | } 80 | inline ~SingleProducerSingleConsumerList() { 81 | while(recycled < blocksNum) { 82 | delete[] blocks[recycled & blocksRingBufferSizeMask]; 83 | blocks[recycled & blocksRingBufferSizeMask] = NULL; 84 | recycled++; 85 | } 86 | delete[] blocks; 
87 | blocks = NULL; 88 | } 89 | inline size_t size() { 90 | return produced - consumed; 91 | } 92 | inline bool canBeConsumed() { 93 | if(head == NULL) 94 | return false; 95 | return head->nextItemReady || producerFinished; 96 | } 97 | inline void produce(T val) { 98 | LockFreeListItem* item = makeItem(val); 99 | if(head==NULL) { 100 | head = item; 101 | tail = item; 102 | } else { 103 | tail->nextItem = item; 104 | tail->nextItemReady = true; 105 | tail = item; 106 | } 107 | produced++; 108 | } 109 | inline T consume() { 110 | assert(head != NULL); 111 | T val = head->value; 112 | head = head->nextItem; 113 | consumed++; 114 | if((consumed & 0xFFF) == 0) 115 | recycle(); 116 | return val; 117 | } 118 | inline bool isProducerFinished() { 119 | return producerFinished; 120 | } 121 | inline bool isConsumerFinished() { 122 | return consumerFinished; 123 | } 124 | inline void setProducerFinished() { 125 | producerFinished = true; 126 | } 127 | inline void setConsumerFinished() { 128 | consumerFinished = true; 129 | } 130 | private: 131 | // blockized list 132 | inline LockFreeListItem* makeItem(T val) { 133 | unsigned long blk = produced >> 12; 134 | unsigned long idx = produced & 0xFFF; 135 | size_t size = 0x01<<12; 136 | if(blocksNum <= blk) { 137 | LockFreeListItem* buffer = new LockFreeListItem[size]; 138 | memset(buffer, 0, sizeof(LockFreeListItem) * size); 139 | blocks[blocksNum & blocksRingBufferSizeMask] = buffer; 140 | blocksNum++; 141 | } 142 | LockFreeListItem* item = blocks[blk & blocksRingBufferSizeMask]+idx; 143 | item->value = val; 144 | return item; 145 | } 146 | 147 | inline void recycle() { 148 | unsigned long blk = consumed >> 12; 149 | while((recycled+1) < blk) { 150 | delete[] blocks[recycled & blocksRingBufferSizeMask]; 151 | blocks[recycled & blocksRingBufferSizeMask] = NULL; 152 | recycled++; 153 | } 154 | } 155 | 156 | private: 157 | LockFreeListItem* head; 158 | LockFreeListItem* tail; 159 | LockFreeListItem** blocks; 160 | std::atomic_bool 
producerFinished; 161 | std::atomic_bool consumerFinished; 162 | unsigned long produced; 163 | unsigned long consumed; 164 | unsigned long recycled; 165 | unsigned long blocksRingBufferSize; 166 | unsigned long blocksRingBufferSizeMask; 167 | unsigned long blocksNum; 168 | }; 169 | 170 | #endif 171 | -------------------------------------------------------------------------------- /src/zlib/inffixed.h: -------------------------------------------------------------------------------- 1 | /* inffixed.h -- table for decoding fixed codes 2 | * Generated automatically by makefixed(). 3 | */ 4 | 5 | /* WARNING: this file should *not* be used by applications. 6 | It is part of the implementation of this library and is 7 | subject to change. Applications should only use zlib.h. 8 | */ 9 | 10 | static const code lenfix[512] = { 11 | {96,7,0},{0,8,80},{0,8,16},{20,8,115},{18,7,31},{0,8,112},{0,8,48}, 12 | {0,9,192},{16,7,10},{0,8,96},{0,8,32},{0,9,160},{0,8,0},{0,8,128}, 13 | {0,8,64},{0,9,224},{16,7,6},{0,8,88},{0,8,24},{0,9,144},{19,7,59}, 14 | {0,8,120},{0,8,56},{0,9,208},{17,7,17},{0,8,104},{0,8,40},{0,9,176}, 15 | {0,8,8},{0,8,136},{0,8,72},{0,9,240},{16,7,4},{0,8,84},{0,8,20}, 16 | {21,8,227},{19,7,43},{0,8,116},{0,8,52},{0,9,200},{17,7,13},{0,8,100}, 17 | {0,8,36},{0,9,168},{0,8,4},{0,8,132},{0,8,68},{0,9,232},{16,7,8}, 18 | {0,8,92},{0,8,28},{0,9,152},{20,7,83},{0,8,124},{0,8,60},{0,9,216}, 19 | {18,7,23},{0,8,108},{0,8,44},{0,9,184},{0,8,12},{0,8,140},{0,8,76}, 20 | {0,9,248},{16,7,3},{0,8,82},{0,8,18},{21,8,163},{19,7,35},{0,8,114}, 21 | {0,8,50},{0,9,196},{17,7,11},{0,8,98},{0,8,34},{0,9,164},{0,8,2}, 22 | {0,8,130},{0,8,66},{0,9,228},{16,7,7},{0,8,90},{0,8,26},{0,9,148}, 23 | {20,7,67},{0,8,122},{0,8,58},{0,9,212},{18,7,19},{0,8,106},{0,8,42}, 24 | {0,9,180},{0,8,10},{0,8,138},{0,8,74},{0,9,244},{16,7,5},{0,8,86}, 25 | {0,8,22},{64,8,0},{19,7,51},{0,8,118},{0,8,54},{0,9,204},{17,7,15}, 26 | {0,8,102},{0,8,38},{0,9,172},{0,8,6},{0,8,134},{0,8,70},{0,9,236}, 27 | 
{16,7,9},{0,8,94},{0,8,30},{0,9,156},{20,7,99},{0,8,126},{0,8,62}, 28 | {0,9,220},{18,7,27},{0,8,110},{0,8,46},{0,9,188},{0,8,14},{0,8,142}, 29 | {0,8,78},{0,9,252},{96,7,0},{0,8,81},{0,8,17},{21,8,131},{18,7,31}, 30 | {0,8,113},{0,8,49},{0,9,194},{16,7,10},{0,8,97},{0,8,33},{0,9,162}, 31 | {0,8,1},{0,8,129},{0,8,65},{0,9,226},{16,7,6},{0,8,89},{0,8,25}, 32 | {0,9,146},{19,7,59},{0,8,121},{0,8,57},{0,9,210},{17,7,17},{0,8,105}, 33 | {0,8,41},{0,9,178},{0,8,9},{0,8,137},{0,8,73},{0,9,242},{16,7,4}, 34 | {0,8,85},{0,8,21},{16,8,258},{19,7,43},{0,8,117},{0,8,53},{0,9,202}, 35 | {17,7,13},{0,8,101},{0,8,37},{0,9,170},{0,8,5},{0,8,133},{0,8,69}, 36 | {0,9,234},{16,7,8},{0,8,93},{0,8,29},{0,9,154},{20,7,83},{0,8,125}, 37 | {0,8,61},{0,9,218},{18,7,23},{0,8,109},{0,8,45},{0,9,186},{0,8,13}, 38 | {0,8,141},{0,8,77},{0,9,250},{16,7,3},{0,8,83},{0,8,19},{21,8,195}, 39 | {19,7,35},{0,8,115},{0,8,51},{0,9,198},{17,7,11},{0,8,99},{0,8,35}, 40 | {0,9,166},{0,8,3},{0,8,131},{0,8,67},{0,9,230},{16,7,7},{0,8,91}, 41 | {0,8,27},{0,9,150},{20,7,67},{0,8,123},{0,8,59},{0,9,214},{18,7,19}, 42 | {0,8,107},{0,8,43},{0,9,182},{0,8,11},{0,8,139},{0,8,75},{0,9,246}, 43 | {16,7,5},{0,8,87},{0,8,23},{64,8,0},{19,7,51},{0,8,119},{0,8,55}, 44 | {0,9,206},{17,7,15},{0,8,103},{0,8,39},{0,9,174},{0,8,7},{0,8,135}, 45 | {0,8,71},{0,9,238},{16,7,9},{0,8,95},{0,8,31},{0,9,158},{20,7,99}, 46 | {0,8,127},{0,8,63},{0,9,222},{18,7,27},{0,8,111},{0,8,47},{0,9,190}, 47 | {0,8,15},{0,8,143},{0,8,79},{0,9,254},{96,7,0},{0,8,80},{0,8,16}, 48 | {20,8,115},{18,7,31},{0,8,112},{0,8,48},{0,9,193},{16,7,10},{0,8,96}, 49 | {0,8,32},{0,9,161},{0,8,0},{0,8,128},{0,8,64},{0,9,225},{16,7,6}, 50 | {0,8,88},{0,8,24},{0,9,145},{19,7,59},{0,8,120},{0,8,56},{0,9,209}, 51 | {17,7,17},{0,8,104},{0,8,40},{0,9,177},{0,8,8},{0,8,136},{0,8,72}, 52 | {0,9,241},{16,7,4},{0,8,84},{0,8,20},{21,8,227},{19,7,43},{0,8,116}, 53 | {0,8,52},{0,9,201},{17,7,13},{0,8,100},{0,8,36},{0,9,169},{0,8,4}, 54 | 
{0,8,132},{0,8,68},{0,9,233},{16,7,8},{0,8,92},{0,8,28},{0,9,153}, 55 | {20,7,83},{0,8,124},{0,8,60},{0,9,217},{18,7,23},{0,8,108},{0,8,44}, 56 | {0,9,185},{0,8,12},{0,8,140},{0,8,76},{0,9,249},{16,7,3},{0,8,82}, 57 | {0,8,18},{21,8,163},{19,7,35},{0,8,114},{0,8,50},{0,9,197},{17,7,11}, 58 | {0,8,98},{0,8,34},{0,9,165},{0,8,2},{0,8,130},{0,8,66},{0,9,229}, 59 | {16,7,7},{0,8,90},{0,8,26},{0,9,149},{20,7,67},{0,8,122},{0,8,58}, 60 | {0,9,213},{18,7,19},{0,8,106},{0,8,42},{0,9,181},{0,8,10},{0,8,138}, 61 | {0,8,74},{0,9,245},{16,7,5},{0,8,86},{0,8,22},{64,8,0},{19,7,51}, 62 | {0,8,118},{0,8,54},{0,9,205},{17,7,15},{0,8,102},{0,8,38},{0,9,173}, 63 | {0,8,6},{0,8,134},{0,8,70},{0,9,237},{16,7,9},{0,8,94},{0,8,30}, 64 | {0,9,157},{20,7,99},{0,8,126},{0,8,62},{0,9,221},{18,7,27},{0,8,110}, 65 | {0,8,46},{0,9,189},{0,8,14},{0,8,142},{0,8,78},{0,9,253},{96,7,0}, 66 | {0,8,81},{0,8,17},{21,8,131},{18,7,31},{0,8,113},{0,8,49},{0,9,195}, 67 | {16,7,10},{0,8,97},{0,8,33},{0,9,163},{0,8,1},{0,8,129},{0,8,65}, 68 | {0,9,227},{16,7,6},{0,8,89},{0,8,25},{0,9,147},{19,7,59},{0,8,121}, 69 | {0,8,57},{0,9,211},{17,7,17},{0,8,105},{0,8,41},{0,9,179},{0,8,9}, 70 | {0,8,137},{0,8,73},{0,9,243},{16,7,4},{0,8,85},{0,8,21},{16,8,258}, 71 | {19,7,43},{0,8,117},{0,8,53},{0,9,203},{17,7,13},{0,8,101},{0,8,37}, 72 | {0,9,171},{0,8,5},{0,8,133},{0,8,69},{0,9,235},{16,7,8},{0,8,93}, 73 | {0,8,29},{0,9,155},{20,7,83},{0,8,125},{0,8,61},{0,9,219},{18,7,23}, 74 | {0,8,109},{0,8,45},{0,9,187},{0,8,13},{0,8,141},{0,8,77},{0,9,251}, 75 | {16,7,3},{0,8,83},{0,8,19},{21,8,195},{19,7,35},{0,8,115},{0,8,51}, 76 | {0,9,199},{17,7,11},{0,8,99},{0,8,35},{0,9,167},{0,8,3},{0,8,131}, 77 | {0,8,67},{0,9,231},{16,7,7},{0,8,91},{0,8,27},{0,9,151},{20,7,67}, 78 | {0,8,123},{0,8,59},{0,9,215},{18,7,19},{0,8,107},{0,8,43},{0,9,183}, 79 | {0,8,11},{0,8,139},{0,8,75},{0,9,247},{16,7,5},{0,8,87},{0,8,23}, 80 | {64,8,0},{19,7,51},{0,8,119},{0,8,55},{0,9,207},{17,7,15},{0,8,103}, 81 | 
{0,8,39},{0,9,175},{0,8,7},{0,8,135},{0,8,71},{0,9,239},{16,7,9}, 82 | {0,8,95},{0,8,31},{0,9,159},{20,7,99},{0,8,127},{0,8,63},{0,9,223}, 83 | {18,7,27},{0,8,111},{0,8,47},{0,9,191},{0,8,15},{0,8,143},{0,8,79}, 84 | {0,9,255} 85 | }; 86 | 87 | static const code distfix[32] = { 88 | {16,5,1},{23,5,257},{19,5,17},{27,5,4097},{17,5,5},{25,5,1025}, 89 | {21,5,65},{29,5,16385},{16,5,3},{24,5,513},{20,5,33},{28,5,8193}, 90 | {18,5,9},{26,5,2049},{22,5,129},{64,5,0},{16,5,2},{23,5,385}, 91 | {19,5,25},{27,5,6145},{17,5,7},{25,5,1537},{21,5,97},{29,5,24577}, 92 | {16,5,4},{24,5,769},{20,5,49},{28,5,12289},{18,5,13},{26,5,3073}, 93 | {22,5,193},{64,5,0} 94 | }; 95 | -------------------------------------------------------------------------------- /src/zlib/inflate.h: -------------------------------------------------------------------------------- 1 | /* inflate.h -- internal inflate state definition 2 | * Copyright (C) 1995-2009 Mark Adler 3 | * For conditions of distribution and use, see copyright notice in zlib.h 4 | */ 5 | 6 | /* WARNING: this file should *not* be used by applications. It is 7 | part of the implementation of the compression library and is 8 | subject to change. Applications should only use zlib.h. 9 | */ 10 | 11 | /* define NO_GZIP when compiling if you want to disable gzip header and 12 | trailer decoding by inflate(). NO_GZIP would be used to avoid linking in 13 | the crc code when it is not needed. For shared libraries, gzip decoding 14 | should be left enabled. 
*/ 15 | #ifndef NO_GZIP 16 | # define GUNZIP 17 | #endif 18 | 19 | /* Possible inflate modes between inflate() calls */ 20 | typedef enum { 21 | HEAD, /* i: waiting for magic header */ 22 | FLAGS, /* i: waiting for method and flags (gzip) */ 23 | TIME, /* i: waiting for modification time (gzip) */ 24 | OS, /* i: waiting for extra flags and operating system (gzip) */ 25 | EXLEN, /* i: waiting for extra length (gzip) */ 26 | EXTRA, /* i: waiting for extra bytes (gzip) */ 27 | NAME, /* i: waiting for end of file name (gzip) */ 28 | COMMENT, /* i: waiting for end of comment (gzip) */ 29 | HCRC, /* i: waiting for header crc (gzip) */ 30 | DICTID, /* i: waiting for dictionary check value */ 31 | DICT, /* waiting for inflateSetDictionary() call */ 32 | TYPE, /* i: waiting for type bits, including last-flag bit */ 33 | TYPEDO, /* i: same, but skip check to exit inflate on new block */ 34 | STORED, /* i: waiting for stored size (length and complement) */ 35 | COPY_, /* i/o: same as COPY below, but only first time in */ 36 | COPY, /* i/o: waiting for input or output to copy stored block */ 37 | TABLE, /* i: waiting for dynamic block table lengths */ 38 | LENLENS, /* i: waiting for code length code lengths */ 39 | CODELENS, /* i: waiting for length/lit and distance code lengths */ 40 | LEN_, /* i: same as LEN below, but only first time in */ 41 | LEN, /* i: waiting for length/lit/eob code */ 42 | LENEXT, /* i: waiting for length extra bits */ 43 | DIST, /* i: waiting for distance code */ 44 | DISTEXT, /* i: waiting for distance extra bits */ 45 | MATCH, /* o: waiting for output space to copy string */ 46 | LIT, /* o: waiting for output space to write literal */ 47 | CHECK, /* i: waiting for 32-bit check value */ 48 | LENGTH, /* i: waiting for 32-bit length (gzip) */ 49 | DONE, /* finished check, done -- remain here until reset */ 50 | BAD, /* got a data error -- remain here until reset */ 51 | MEM, /* got an inflate() memory error -- remain here until reset */ 52 | SYNC /* 
looking for synchronization bytes to restart inflate() */ 53 | } inflate_mode; 54 | 55 | /* 56 | State transitions between above modes - 57 | 58 | (most modes can go to BAD or MEM on error -- not shown for clarity) 59 | 60 | Process header: 61 | HEAD -> (gzip) or (zlib) or (raw) 62 | (gzip) -> FLAGS -> TIME -> OS -> EXLEN -> EXTRA -> NAME -> COMMENT -> 63 | HCRC -> TYPE 64 | (zlib) -> DICTID or TYPE 65 | DICTID -> DICT -> TYPE 66 | (raw) -> TYPEDO 67 | Read deflate blocks: 68 | TYPE -> TYPEDO -> STORED or TABLE or LEN_ or CHECK 69 | STORED -> COPY_ -> COPY -> TYPE 70 | TABLE -> LENLENS -> CODELENS -> LEN_ 71 | LEN_ -> LEN 72 | Read deflate codes in fixed or dynamic block: 73 | LEN -> LENEXT or LIT or TYPE 74 | LENEXT -> DIST -> DISTEXT -> MATCH -> LEN 75 | LIT -> LEN 76 | Process trailer: 77 | CHECK -> LENGTH -> DONE 78 | */ 79 | 80 | /* state maintained between inflate() calls. Approximately 10K bytes. */ 81 | struct inflate_state { 82 | inflate_mode mode; /* current inflate mode */ 83 | int last; /* true if processing last block */ 84 | int wrap; /* bit 0 true for zlib, bit 1 true for gzip */ 85 | int havedict; /* true if dictionary provided */ 86 | int flags; /* gzip header method and flags (0 if zlib) */ 87 | unsigned dmax; /* zlib header max distance (INFLATE_STRICT) */ 88 | unsigned long check; /* protected copy of check value */ 89 | unsigned long total; /* protected copy of output count */ 90 | gz_headerp head; /* where to save gzip header information */ 91 | /* sliding window */ 92 | unsigned wbits; /* log base 2 of requested window size */ 93 | unsigned wsize; /* window size or zero if not using window */ 94 | unsigned whave; /* valid bytes in the window */ 95 | unsigned wnext; /* window write index */ 96 | unsigned char FAR *window; /* allocated sliding window, if needed */ 97 | /* bit accumulator */ 98 | unsigned long hold; /* input bit accumulator */ 99 | unsigned bits; /* number of bits in "in" */ 100 | /* for string and stored block copying */ 101 | 
unsigned length; /* literal or length of data to copy */ 102 | unsigned offset; /* distance back to copy string from */ 103 | /* for table and code decoding */ 104 | unsigned extra; /* extra bits needed */ 105 | /* fixed and dynamic code tables */ 106 | code const FAR *lencode; /* starting table for length/literal codes */ 107 | code const FAR *distcode; /* starting table for distance codes */ 108 | unsigned lenbits; /* index bits for lencode */ 109 | unsigned distbits; /* index bits for distcode */ 110 | /* dynamic table building */ 111 | unsigned ncode; /* number of code length code lengths */ 112 | unsigned nlen; /* number of length code lengths */ 113 | unsigned ndist; /* number of distance code lengths */ 114 | unsigned have; /* number of code lengths in lens[] */ 115 | code FAR *next; /* next available space in codes[] */ 116 | unsigned short lens[320]; /* temporary storage for code lengths */ 117 | unsigned short work[288]; /* work area for code table building */ 118 | code codes[ENOUGH]; /* space for code tables */ 119 | int sane; /* if false, allow invalid distance too far */ 120 | int back; /* bits back of last unprocessed length/lit */ 121 | unsigned was; /* initial length of match */ 122 | }; 123 | -------------------------------------------------------------------------------- /src/jsonreporter.cpp: -------------------------------------------------------------------------------- 1 | #include "jsonreporter.h" 2 | 3 | JsonReporter::JsonReporter(Options* opt){ 4 | mOptions = opt; 5 | mDupHist = NULL; 6 | mDupRate = 0; 7 | } 8 | 9 | JsonReporter::~JsonReporter(){ 10 | } 11 | 12 | void JsonReporter::setDup(double dupRate) { 13 | mDupRate = dupRate; 14 | } 15 | 16 | void JsonReporter::setInsertHist(atomic_long* insertHist, int insertSizePeak) { 17 | mInsertHist = insertHist; 18 | mInsertSizePeak = insertSizePeak; 19 | } 20 | 21 | extern string command; 22 | void JsonReporter::report(FilterResult* result, Stats* preStats1, Stats* postStats1, Stats* preStats2, 
Stats* postStats2) { 23 | ofstream ofs; 24 | ofs.open(mOptions->jsonFile, ifstream::out); 25 | ofs << "{" << endl; 26 | 27 | // sequencing info 28 | string sequencingInfo = mOptions->isPaired()?"paired end":"single end"; 29 | if(mOptions->isPaired()) { 30 | sequencingInfo += " (" + to_string(preStats1->getCycles()) + " cycles + " + to_string(preStats2->getCycles()) + " cycles)"; 31 | } else { 32 | sequencingInfo += " (" + to_string(preStats1->getCycles()) + " cycles)"; 33 | } 34 | 35 | long pre_total_reads = preStats1->getReads(); 36 | if(preStats2) 37 | pre_total_reads += preStats2->getReads(); 38 | 39 | long pre_total_bases = preStats1->getBases(); 40 | if(preStats2) 41 | pre_total_bases += preStats2->getBases(); 42 | 43 | long pre_q20_bases = preStats1->getQ20(); 44 | if(preStats2) 45 | pre_q20_bases += preStats2->getQ20(); 46 | 47 | long pre_q30_bases = preStats1->getQ30(); 48 | if(preStats2) 49 | pre_q30_bases += preStats2->getQ30(); 50 | 51 | long pre_total_gc = preStats1->getGCNumber(); 52 | if(preStats2) 53 | pre_total_gc += preStats2->getGCNumber(); 54 | 55 | long post_total_reads = postStats1->getReads(); 56 | if(postStats2) 57 | post_total_reads += postStats2->getReads(); 58 | 59 | long post_total_bases = postStats1->getBases(); 60 | if(postStats2) 61 | post_total_bases += postStats2->getBases(); 62 | 63 | long post_q20_bases = postStats1->getQ20(); 64 | if(postStats2) 65 | post_q20_bases += postStats2->getQ20(); 66 | 67 | long post_q30_bases = postStats1->getQ30(); 68 | if(postStats2) 69 | post_q30_bases += postStats2->getQ30(); 70 | 71 | long post_total_gc = postStats1->getGCNumber(); 72 | if(postStats2) 73 | post_total_gc += postStats2->getGCNumber(); 74 | 75 | // summary 76 | ofs << "\t" << "\"summary\": {" << endl; 77 | ofs << "\t\t" << "\"fastp_version\": \""<< FASTP_VER << "\"," << endl; 78 | ofs << "\t\t" << "\"sequencing\": \""<< sequencingInfo << "\"," << endl; 79 | ofs << "\t\t" << "\"before_filtering\": {" << endl; 80 | ofs << "\t\t\t" << 
"\"total_reads\":" << pre_total_reads << "," << endl; 81 | ofs << "\t\t\t" << "\"total_bases\":" << pre_total_bases << "," << endl; 82 | ofs << "\t\t\t" << "\"q20_bases\":" << pre_q20_bases << "," << endl; 83 | ofs << "\t\t\t" << "\"q30_bases\":" << pre_q30_bases << "," << endl; 84 | ofs << "\t\t\t" << "\"q20_rate\":" << (pre_total_bases == 0?0.0:(double)pre_q20_bases / (double)pre_total_bases) << "," << endl; 85 | ofs << "\t\t\t" << "\"q30_rate\":" << (pre_total_bases == 0?0.0:(double)pre_q30_bases / (double)pre_total_bases) << "," << endl; 86 | ofs << "\t\t\t" << "\"read1_mean_length\":" << preStats1->getMeanLength() << "," << endl; 87 | if(mOptions->isPaired()) 88 | ofs << "\t\t\t" << "\"read2_mean_length\":" << preStats2->getMeanLength() << "," << endl; 89 | ofs << "\t\t\t" << "\"gc_content\":" << (pre_total_bases == 0?0.0:(double)pre_total_gc / (double)pre_total_bases) << endl; 90 | ofs << "\t\t" << "}," << endl; 91 | 92 | ofs << "\t\t" << "\"after_filtering\": {" << endl; 93 | ofs << "\t\t\t" << "\"total_reads\":" << post_total_reads << "," << endl; 94 | ofs << "\t\t\t" << "\"total_bases\":" << post_total_bases << "," << endl; 95 | ofs << "\t\t\t" << "\"q20_bases\":" << post_q20_bases << "," << endl; 96 | ofs << "\t\t\t" << "\"q30_bases\":" << post_q30_bases << "," << endl; 97 | ofs << "\t\t\t" << "\"q20_rate\":" << (post_total_bases == 0?0.0:(double)post_q20_bases / (double)post_total_bases) << "," << endl; 98 | ofs << "\t\t\t" << "\"q30_rate\":" << (post_total_bases == 0?0.0:(double)post_q30_bases / (double)post_total_bases) << "," << endl; 99 | ofs << "\t\t\t" << "\"read1_mean_length\":" << postStats1->getMeanLength() << "," << endl; 100 | if(mOptions->isPaired() && !mOptions->merge.enabled) 101 | ofs << "\t\t\t" << "\"read2_mean_length\":" << postStats2->getMeanLength() << "," << endl; 102 | ofs << "\t\t\t" << "\"gc_content\":" << (post_total_bases == 0?0.0:(double)post_total_gc / (double)post_total_bases) << endl; 103 | ofs << "\t\t" << "}"; 104 | 105 | 
ofs << endl; 106 | 107 | ofs << "\t" << "}," << endl; 108 | 109 | if(result) { 110 | ofs << "\t" << "\"filtering_result\": " ; 111 | result -> reportJson(ofs, "\t"); 112 | } 113 | 114 | if(mOptions->duplicate.enabled) { 115 | ofs << "\t" << "\"duplication\": {" << endl; 116 | ofs << "\t\t\"rate\": " << mDupRate << endl; 117 | ofs << "\t" << "}"; 118 | ofs << "," << endl; 119 | } 120 | 121 | if(mOptions->isPaired()) { 122 | ofs << "\t" << "\"insert_size\": {" << endl; 123 | ofs << "\t\t\"peak\": " << mInsertSizePeak << "," << endl; 124 | ofs << "\t\t\"unknown\": " << mInsertHist[mOptions->insertSizeMax] << "," << endl; 125 | ofs << "\t\t\"histogram\": ["; 126 | for(int d=0; dinsertSizeMax; d++) { 127 | ofs << mInsertHist[d]; 128 | if(d!=mOptions->insertSizeMax-1) 129 | ofs << ","; 130 | } 131 | ofs << "]" << endl; 132 | ofs << "\t" << "}"; 133 | ofs << "," << endl; 134 | } 135 | 136 | if(result && mOptions->adapterCuttingEnabled()) { 137 | ofs << "\t" << "\"adapter_cutting\": " ; 138 | result -> reportAdapterJson(ofs, "\t"); 139 | } 140 | 141 | if(result && mOptions->polyXTrimmingEnabled()) { 142 | ofs << "\t" << "\"polyx_trimming\": " ; 143 | result -> reportPolyXTrimJson(ofs, "\t"); 144 | } 145 | 146 | if(preStats1) { 147 | ofs << "\t" << "\"read1_before_filtering\": " ; 148 | preStats1 -> reportJson(ofs, "\t"); 149 | } 150 | 151 | if(preStats2) { 152 | ofs << "\t" << "\"read2_before_filtering\": " ; 153 | preStats2 -> reportJson(ofs, "\t"); 154 | } 155 | 156 | if(postStats1) { 157 | string name = "read1_after_filtering"; 158 | if(mOptions->merge.enabled) 159 | name = "merged_and_filtered"; 160 | ofs << "\t" << "\"" << name << "\": " ; 161 | postStats1 -> reportJson(ofs, "\t"); 162 | } 163 | 164 | if(postStats2 && !mOptions->merge.enabled) { 165 | ofs << "\t" << "\"read2_after_filtering\": " ; 166 | postStats2 -> reportJson(ofs, "\t"); 167 | } 168 | 169 | ofs << "\t\"command\": " << "\"" << command << "\"" << endl; 170 | 171 | ofs << "}"; 172 | } 
-------------------------------------------------------------------------------- /src/zlib/gzguts.h: -------------------------------------------------------------------------------- 1 | /* gzguts.h -- zlib internal header definitions for gz* operations 2 | * Copyright (C) 2004, 2005, 2010, 2011, 2012, 2013 Mark Adler 3 | * For conditions of distribution and use, see copyright notice in zlib.h 4 | */ 5 | 6 | #ifdef _LARGEFILE64_SOURCE 7 | # ifndef _LARGEFILE_SOURCE 8 | # define _LARGEFILE_SOURCE 1 9 | # endif 10 | # ifdef _FILE_OFFSET_BITS 11 | # undef _FILE_OFFSET_BITS 12 | # endif 13 | #endif 14 | 15 | #ifdef HAVE_HIDDEN 16 | # define ZLIB_INTERNAL __attribute__((visibility ("hidden"))) 17 | #else 18 | # define ZLIB_INTERNAL 19 | #endif 20 | 21 | #include 22 | #include "zlib.h" 23 | #ifdef STDC 24 | # include 25 | # include 26 | # include 27 | #endif 28 | #include 29 | 30 | #ifdef _WIN32 31 | # include 32 | #endif 33 | 34 | #if defined(__TURBOC__) || defined(_MSC_VER) || defined(_WIN32) 35 | # include 36 | #endif 37 | 38 | #ifdef WINAPI_FAMILY 39 | # define open _open 40 | # define read _read 41 | # define write _write 42 | # define close _close 43 | #endif 44 | 45 | #ifdef NO_DEFLATE /* for compatibility with old definition */ 46 | # define NO_GZCOMPRESS 47 | #endif 48 | 49 | #if defined(STDC99) || (defined(__TURBOC__) && __TURBOC__ >= 0x550) 50 | # ifndef HAVE_VSNPRINTF 51 | # define HAVE_VSNPRINTF 52 | # endif 53 | #endif 54 | 55 | #if defined(__CYGWIN__) 56 | # ifndef HAVE_VSNPRINTF 57 | # define HAVE_VSNPRINTF 58 | # endif 59 | #endif 60 | 61 | #if defined(MSDOS) && defined(__BORLANDC__) && (BORLANDC > 0x410) 62 | # ifndef HAVE_VSNPRINTF 63 | # define HAVE_VSNPRINTF 64 | # endif 65 | #endif 66 | 67 | #ifndef HAVE_VSNPRINTF 68 | # ifdef MSDOS 69 | /* vsnprintf may exist on some MS-DOS compilers (DJGPP?), 70 | but for now we just assume it doesn't. 
*/ 71 | # define NO_vsnprintf 72 | # endif 73 | # ifdef __TURBOC__ 74 | # define NO_vsnprintf 75 | # endif 76 | # ifdef WIN32 77 | /* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */ 78 | # if !defined(vsnprintf) && !defined(NO_vsnprintf) 79 | # if !defined(_MSC_VER) || ( defined(_MSC_VER) && _MSC_VER < 1500 ) 80 | # define vsnprintf _vsnprintf 81 | # endif 82 | # endif 83 | # endif 84 | # ifdef __SASC 85 | # define NO_vsnprintf 86 | # endif 87 | # ifdef VMS 88 | # define NO_vsnprintf 89 | # endif 90 | # ifdef __OS400__ 91 | # define NO_vsnprintf 92 | # endif 93 | # ifdef __MVS__ 94 | # define NO_vsnprintf 95 | # endif 96 | #endif 97 | 98 | /* unlike snprintf (which is required in C99, yet still not supported by 99 | Microsoft more than a decade later!), _snprintf does not guarantee null 100 | termination of the result -- however this is only used in gzlib.c where 101 | the result is assured to fit in the space provided */ 102 | #ifdef _MSC_VER 103 | # define snprintf _snprintf 104 | #endif 105 | 106 | #ifndef local 107 | # define local static 108 | #endif 109 | /* compile with -Dlocal if your debugger can't find static symbols */ 110 | 111 | /* gz* functions always use library allocation functions */ 112 | #ifndef STDC 113 | extern voidp malloc OF((uInt size)); 114 | extern void free OF((voidpf ptr)); 115 | #endif 116 | 117 | /* get errno and strerror definition */ 118 | #if defined UNDER_CE 119 | # include 120 | # define zstrerror() gz_strwinerror((DWORD)GetLastError()) 121 | #else 122 | # ifndef NO_STRERROR 123 | # include 124 | # define zstrerror() strerror(errno) 125 | # else 126 | # define zstrerror() "stdio error (consult errno)" 127 | # endif 128 | #endif 129 | 130 | /* provide prototypes for these when building zlib without LFS */ 131 | #if !defined(_LARGEFILE64_SOURCE) || _LFS64_LARGEFILE-0 == 0 132 | ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *)); 133 | ZEXTERN z_off64_t ZEXPORT gzseek64 OF((gzFile, z_off64_t, int)); 
134 | ZEXTERN z_off64_t ZEXPORT gztell64 OF((gzFile)); 135 | ZEXTERN z_off64_t ZEXPORT gzoffset64 OF((gzFile)); 136 | #endif 137 | 138 | /* default memLevel */ 139 | #if MAX_MEM_LEVEL >= 8 140 | # define DEF_MEM_LEVEL 8 141 | #else 142 | # define DEF_MEM_LEVEL MAX_MEM_LEVEL 143 | #endif 144 | 145 | /* default i/o buffer size -- double this for output when reading (this and 146 | twice this must be able to fit in an unsigned type) */ 147 | #define GZBUFSIZE 8192 148 | 149 | /* gzip modes, also provide a little integrity check on the passed structure */ 150 | #define GZ_NONE 0 151 | #define GZ_READ 7247 152 | #define GZ_WRITE 31153 153 | #define GZ_APPEND 1 /* mode set to GZ_WRITE after the file is opened */ 154 | 155 | /* values for gz_state how */ 156 | #define LOOK 0 /* look for a gzip header */ 157 | #define COPY 1 /* copy input directly */ 158 | #define GZIP 2 /* decompress a gzip stream */ 159 | 160 | /* internal gzip file state data structure */ 161 | typedef struct { 162 | /* exposed contents for gzgetc() macro */ 163 | struct gzFile_s x; /* "x" for exposed */ 164 | /* x.have: number of bytes available at x.next */ 165 | /* x.next: next output data to deliver or write */ 166 | /* x.pos: current position in uncompressed data */ 167 | /* used for both reading and writing */ 168 | int mode; /* see gzip modes above */ 169 | int fd; /* file descriptor */ 170 | char *path; /* path or fd for error messages */ 171 | unsigned size; /* buffer size, zero if not allocated yet */ 172 | unsigned want; /* requested buffer size, default is GZBUFSIZE */ 173 | unsigned char *in; /* input buffer */ 174 | unsigned char *out; /* output buffer (double-sized when reading) */ 175 | int direct; /* 0 if processing gzip, 1 if transparent */ 176 | /* just for reading */ 177 | int how; /* 0: get header, 1: copy, 2: decompress */ 178 | z_off64_t start; /* where the gzip data started, for rewinding */ 179 | int eof; /* true if end of input file reached */ 180 | int past; /* true if read 
requested past end */ 181 | /* just for writing */ 182 | int level; /* compression level */ 183 | int strategy; /* compression strategy */ 184 | /* seek request */ 185 | z_off64_t skip; /* amount to skip (already rewound if backwards) */ 186 | int seek; /* true if seek request pending */ 187 | /* error information */ 188 | int err; /* error code */ 189 | char *msg; /* error message */ 190 | /* zlib inflate or deflate stream */ 191 | z_stream strm; /* stream structure in-place (not a pointer) */ 192 | } gz_state; 193 | typedef gz_state FAR *gz_statep; 194 | 195 | /* shared functions */ 196 | void ZLIB_INTERNAL gz_error OF((gz_statep, int, const char *)); 197 | #if defined UNDER_CE 198 | char ZLIB_INTERNAL *gz_strwinerror OF((DWORD error)); 199 | #endif 200 | 201 | /* GT_OFF(x), where x is an unsigned value, is true if x > maximum z_off64_t 202 | value -- needed when comparing unsigned to z_off64_t, which is signed 203 | (possible z_off64_t types off_t, off64_t, and long are all signed) */ 204 | #ifdef INT_MAX 205 | # define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > INT_MAX) 206 | #else 207 | unsigned ZLIB_INTERNAL gz_intmax OF((void)); 208 | # define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > gz_intmax()) 209 | #endif 210 | -------------------------------------------------------------------------------- /src/adaptertrimmer.cpp: -------------------------------------------------------------------------------- 1 | #include "adaptertrimmer.h" 2 | #include "matcher.h" 3 | 4 | AdapterTrimmer::AdapterTrimmer(){ 5 | } 6 | 7 | 8 | AdapterTrimmer::~AdapterTrimmer(){ 9 | } 10 | 11 | bool AdapterTrimmer::trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit) { 12 | OverlapResult ov = OverlapAnalysis::analyze(r1, r2, diffLimit, overlapRequire, diffPercentLimit); 13 | return trimByOverlapAnalysis(r1, r2, fr, ov); 14 | } 15 | 16 | bool AdapterTrimmer::trimByOverlapAnalysis(Read* r1, Read* r2, 
FilterResult* fr, OverlapResult ov, int frontTrimmed1, int frontTrimmed2) { 17 | int ol = ov.overlap_len; 18 | if(ov.overlapped && ov.offset < 0) { 19 | 20 | //5' ......frontTrimmed1......|------------------------------------------|----- 3' 21 | //3' -----|-------------------------------------------|......frontTrimmed2..... 5' 22 | 23 | int len1 = min(r1->length(), ol + frontTrimmed2); 24 | int len2 = min(r2->length(), ol + frontTrimmed1); 25 | string adapter1 = r1->mSeq->substr(len1, r1->length() - len1); 26 | string adapter2 = r2->mSeq->substr(len2, r2->length() - len2); 27 | 28 | if(_DEBUG) { 29 | cerr << adapter1 << endl; 30 | cerr << adapter2 << endl; 31 | cerr << "frontTrimmed2: " << frontTrimmed1 << endl; 32 | cerr << "frontTrimmed2: " << frontTrimmed2 << endl; 33 | cerr << "overlap:" << ov.offset << "," << ov.overlap_len << ", " << ov.diff << endl; 34 | r1->print(); 35 | r2->reverseComplement()->print(); 36 | cerr <resize(len1); 39 | r2->resize(len2); 40 | 41 | fr->addAdapterTrimmed(adapter1, adapter2); 42 | return true; 43 | } 44 | return false; 45 | } 46 | 47 | bool AdapterTrimmer::trimByMultiSequences(Read* r, FilterResult* fr, vector& adapterList, bool isR2, bool incTrimmedCounter) { 48 | int matchReq = 4; 49 | if(adapterList.size() > 16) 50 | matchReq = 5; 51 | if(adapterList.size() > 256) 52 | matchReq = 6; 53 | bool trimmed = false; 54 | 55 | string* originalSeq = r->mSeq; 56 | for(int i=0; isubstr(r->length(), originalSeq->length() - r->length()); 62 | if(fr) 63 | fr->addAdapterTrimmed(adapter, isR2, incTrimmedCounter); 64 | else 65 | cerr << adapter << endl; 66 | } 67 | 68 | return trimmed; 69 | } 70 | 71 | bool AdapterTrimmer::trimBySequence(Read* r, FilterResult* fr, string& adapterseq, bool isR2, int matchReq) { 72 | const int allowOneMismatchForEach = 8; 73 | 74 | int rlen = r->length(); 75 | int alen = adapterseq.length(); 76 | 77 | const char* adata = adapterseq.c_str(); 78 | const char* rdata = r->mSeq->c_str(); 79 | 80 | if(alen < matchReq) 
81 | return false; 82 | 83 | int pos=0; 84 | bool found = false; 85 | int start = 0; 86 | if(alen >= 16) 87 | start = -4; 88 | else if(alen >= 12) 89 | start = -3; 90 | else if(alen >= 8) 91 | start = -2; 92 | // we start from negative numbers since the Illumina adapter dimer usually have the first A skipped as A-tailing 93 | // try exact match with hamming distance (no insertion of deletion) 94 | for(pos = start; pos allowedMismatch) { 103 | matched = false; 104 | break; 105 | } 106 | } 107 | } 108 | if(matched) { 109 | found = true; 110 | break; 111 | } 112 | 113 | } 114 | 115 | // if failed to exact match, we try one gap 116 | // to lower computational cost, we only allow one gap, and it's much enough for short reads 117 | // we try insertion in the sequence 118 | bool hasInsertion = false; 119 | if(!found) { 120 | for(pos = 0; posmSeq->resize(0); 154 | r->mQuality->resize(0); 155 | if(fr) { 156 | fr->addAdapterTrimmed(adapter, isR2); 157 | } 158 | 159 | } else { 160 | string adapter = r->mSeq->substr(pos, rlen-pos); 161 | r->resize(pos); 162 | if(fr) { 163 | fr->addAdapterTrimmed(adapter, isR2); 164 | } 165 | } 166 | return true; 167 | } 168 | 169 | return false; 170 | } 171 | 172 | bool AdapterTrimmer::test() { 173 | Read r("@name", 174 | "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCCCGGGG", 175 | "+", 176 | "///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////E"); 177 | string adapter = "TTTTCCACGGGGATACTACTG"; 178 | bool trimmed = AdapterTrimmer::trimBySequence(&r, NULL, adapter); 179 | if (*r.mSeq != "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAA") 180 | return false; 181 | 182 | Read read("@name", 183 | "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCCCGGGGAAATTTCCCGGGAAATTTCCCGGGATCGATCGATCGATCGAATTCC", 184 | "+", 185 | "///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE"); 186 | vector adapterList; 187 | adapterList.push_back("GCTAGCTAGCTAGCTA"); 188 | 
adapterList.push_back("AAATTTCCCGGGAAATTTCCCGGG"); 189 | adapterList.push_back("ATCGATCGATCGATCG"); 190 | adapterList.push_back("AATTCCGGAATTCCGG"); 191 | trimmed = AdapterTrimmer::trimByMultiSequences(&read, NULL, adapterList); 192 | if (*read.mSeq != "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAATTTTCCCCGGGG") { 193 | cerr << read.mSeq << endl; 194 | return false; 195 | } 196 | 197 | return true; 198 | } -------------------------------------------------------------------------------- /src/zlib/zutil.h: -------------------------------------------------------------------------------- 1 | /* zutil.h -- internal interface and configuration of the compression library 2 | * Copyright (C) 1995-2013 Jean-loup Gailly. 3 | * For conditions of distribution and use, see copyright notice in zlib.h 4 | */ 5 | 6 | /* WARNING: this file should *not* be used by applications. It is 7 | part of the implementation of the compression library and is 8 | subject to change. Applications should only use zlib.h. 
9 | */ 10 | 11 | /* @(#) $Id$ */ 12 | 13 | #ifndef ZUTIL_H 14 | #define ZUTIL_H 15 | 16 | #ifdef HAVE_HIDDEN 17 | # define ZLIB_INTERNAL __attribute__((visibility ("hidden"))) 18 | #else 19 | # define ZLIB_INTERNAL 20 | #endif 21 | 22 | #include "zlib.h" 23 | 24 | #if defined(STDC) && !defined(Z_SOLO) 25 | # if !(defined(_WIN32_WCE) && defined(_MSC_VER)) 26 | # include 27 | # endif 28 | # include 29 | # include 30 | #endif 31 | 32 | #ifdef Z_SOLO 33 | typedef long ptrdiff_t; /* guess -- will be caught if guess is wrong */ 34 | #endif 35 | 36 | #ifndef local 37 | # define local static 38 | #endif 39 | /* compile with -Dlocal if your debugger can't find static symbols */ 40 | 41 | typedef unsigned char uch; 42 | typedef uch FAR uchf; 43 | typedef unsigned short ush; 44 | typedef ush FAR ushf; 45 | typedef unsigned long ulg; 46 | 47 | extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */ 48 | /* (size given to avoid silly warnings with Visual C++) */ 49 | 50 | #define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)] 51 | 52 | #define ERR_RETURN(strm,err) \ 53 | return (strm->msg = ERR_MSG(err), (err)) 54 | /* To be used only when the state is known to be valid */ 55 | 56 | /* common constants */ 57 | 58 | #ifndef DEF_WBITS 59 | # define DEF_WBITS MAX_WBITS 60 | #endif 61 | /* default windowBits for decompression. 
MAX_WBITS is for compression only */ 62 | 63 | #if MAX_MEM_LEVEL >= 8 64 | # define DEF_MEM_LEVEL 8 65 | #else 66 | # define DEF_MEM_LEVEL MAX_MEM_LEVEL 67 | #endif 68 | /* default memLevel */ 69 | 70 | #define STORED_BLOCK 0 71 | #define STATIC_TREES 1 72 | #define DYN_TREES 2 73 | /* The three kinds of block type */ 74 | 75 | #define MIN_MATCH 3 76 | #define MAX_MATCH 258 77 | /* The minimum and maximum match lengths */ 78 | 79 | #define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */ 80 | 81 | /* target dependencies */ 82 | 83 | #if defined(MSDOS) || (defined(WINDOWS) && !defined(WIN32)) 84 | # define OS_CODE 0x00 85 | # ifndef Z_SOLO 86 | # if defined(__TURBOC__) || defined(__BORLANDC__) 87 | # if (__STDC__ == 1) && (defined(__LARGE__) || defined(__COMPACT__)) 88 | /* Allow compilation with ANSI keywords only enabled */ 89 | void _Cdecl farfree( void *block ); 90 | void *_Cdecl farmalloc( unsigned long nbytes ); 91 | # else 92 | # include 93 | # endif 94 | # else /* MSC or DJGPP */ 95 | # include 96 | # endif 97 | # endif 98 | #endif 99 | 100 | #ifdef AMIGA 101 | # define OS_CODE 0x01 102 | #endif 103 | 104 | #if defined(VAXC) || defined(VMS) 105 | # define OS_CODE 0x02 106 | # define F_OPEN(name, mode) \ 107 | fopen((name), (mode), "mbc=60", "ctx=stm", "rfm=fix", "mrs=512") 108 | #endif 109 | 110 | #if defined(ATARI) || defined(atarist) 111 | # define OS_CODE 0x05 112 | #endif 113 | 114 | #ifdef OS2 115 | # define OS_CODE 0x06 116 | # if defined(M_I86) && !defined(Z_SOLO) 117 | # include 118 | # endif 119 | #endif 120 | 121 | #if defined(MACOS) || defined(TARGET_OS_MAC) 122 | # define OS_CODE 0x07 123 | # ifndef Z_SOLO 124 | # if defined(__MWERKS__) && __dest_os != __be_os && __dest_os != __win32_os 125 | # include /* for fdopen */ 126 | # else 127 | # ifndef fdopen 128 | # define fdopen(fd,mode) NULL /* No fdopen() */ 129 | # endif 130 | # endif 131 | # endif 132 | #endif 133 | 134 | #ifdef TOPS20 135 | # define OS_CODE 0x0a 136 | #endif 137 | 
138 | #ifdef WIN32 139 | # ifndef __CYGWIN__ /* Cygwin is Unix, not Win32 */ 140 | # define OS_CODE 0x0b 141 | # endif 142 | #endif 143 | 144 | #ifdef __50SERIES /* Prime/PRIMOS */ 145 | # define OS_CODE 0x0f 146 | #endif 147 | 148 | #if defined(_BEOS_) || defined(RISCOS) 149 | # define fdopen(fd,mode) NULL /* No fdopen() */ 150 | #endif 151 | 152 | #if (defined(_MSC_VER) && (_MSC_VER > 600)) && !defined __INTERIX 153 | # if defined(_WIN32_WCE) 154 | # define fdopen(fd,mode) NULL /* No fdopen() */ 155 | # ifndef _PTRDIFF_T_DEFINED 156 | typedef int ptrdiff_t; 157 | # define _PTRDIFF_T_DEFINED 158 | # endif 159 | # else 160 | # define fdopen(fd,type) _fdopen(fd,type) 161 | # endif 162 | #endif 163 | 164 | #if defined(__BORLANDC__) && !defined(MSDOS) 165 | #pragma warn -8004 166 | #pragma warn -8008 167 | #pragma warn -8066 168 | #endif 169 | 170 | /* provide prototypes for these when building zlib without LFS */ 171 | #if !defined(_WIN32) && \ 172 | (!defined(_LARGEFILE64_SOURCE) || _LFS64_LARGEFILE-0 == 0) 173 | ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off_t)); 174 | ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off_t)); 175 | #endif 176 | 177 | /* common defaults */ 178 | 179 | #ifndef OS_CODE 180 | # define OS_CODE 0x03 /* assume Unix */ 181 | #endif 182 | 183 | #ifndef F_OPEN 184 | # define F_OPEN(name, mode) fopen((name), (mode)) 185 | #endif 186 | 187 | /* functions */ 188 | 189 | #if defined(pyr) || defined(Z_SOLO) 190 | # define NO_MEMCPY 191 | #endif 192 | #if defined(SMALL_MEDIUM) && !defined(_MSC_VER) && !defined(__SC__) 193 | /* Use our own functions for small and medium model with MSC <= 5.0. 194 | * You may have to use the same strategy for Borland C (untested). 195 | * The __SC__ check is for Symantec. 
196 | */ 197 | # define NO_MEMCPY 198 | #endif 199 | #if defined(STDC) && !defined(HAVE_MEMCPY) && !defined(NO_MEMCPY) 200 | # define HAVE_MEMCPY 201 | #endif 202 | #ifdef HAVE_MEMCPY 203 | # ifdef SMALL_MEDIUM /* MSDOS small or medium model */ 204 | # define zmemcpy _fmemcpy 205 | # define zmemcmp _fmemcmp 206 | # define zmemzero(dest, len) _fmemset(dest, 0, len) 207 | # else 208 | # define zmemcpy memcpy 209 | # define zmemcmp memcmp 210 | # define zmemzero(dest, len) memset(dest, 0, len) 211 | # endif 212 | #else 213 | void ZLIB_INTERNAL zmemcpy OF((Bytef* dest, const Bytef* source, uInt len)); 214 | int ZLIB_INTERNAL zmemcmp OF((const Bytef* s1, const Bytef* s2, uInt len)); 215 | void ZLIB_INTERNAL zmemzero OF((Bytef* dest, uInt len)); 216 | #endif 217 | 218 | /* Diagnostic functions */ 219 | #ifdef DEBUG 220 | # include 221 | extern int ZLIB_INTERNAL z_verbose; 222 | extern void ZLIB_INTERNAL z_error OF((char *m)); 223 | # define Assert(cond,msg) {if(!(cond)) z_error(msg);} 224 | # define Trace(x) {if (z_verbose>=0) fprintf x ;} 225 | # define Tracev(x) {if (z_verbose>0) fprintf x ;} 226 | # define Tracevv(x) {if (z_verbose>1) fprintf x ;} 227 | # define Tracec(c,x) {if (z_verbose>0 && (c)) fprintf x ;} 228 | # define Tracecv(c,x) {if (z_verbose>1 && (c)) fprintf x ;} 229 | #else 230 | # define Assert(cond,msg) 231 | # define Trace(x) 232 | # define Tracev(x) 233 | # define Tracevv(x) 234 | # define Tracec(c,x) 235 | # define Tracecv(c,x) 236 | #endif 237 | 238 | #ifndef Z_SOLO 239 | voidpf ZLIB_INTERNAL zcalloc OF((voidpf opaque, unsigned items, 240 | unsigned size)); 241 | void ZLIB_INTERNAL zcfree OF((voidpf opaque, voidpf ptr)); 242 | #endif 243 | 244 | #define ZALLOC(strm, items, size) \ 245 | (*((strm)->zalloc))((strm)->opaque, (items), (size)) 246 | #define ZFREE(strm, addr) (*((strm)->zfree))((strm)->opaque, (voidpf)(addr)) 247 | #define TRY_FREE(s, p) {if (p) ZFREE(s, p);} 248 | 249 | /* Reverse the bytes in a 32-bit value */ 250 | #define ZSWAP32(q) 
// Strip leading and trailing space characters from str and return the result
// as a new string. Only ' ' is treated as padding; tabs and newlines are kept.
// Returns an empty string when str is empty or consists solely of spaces.
inline std::string trim(const std::string& str)
{
    const std::string::size_type first = str.find_first_not_of(' ');
    if (first == std::string::npos)
    {
        return std::string();
    }
    // At least one non-space character exists at this point, so the reverse
    // search always succeeds and no fallback branch is needed.
    const std::string::size_type last = str.find_last_not_of(' ');
    return str.substr(first, last - first + 1);
}
// Return a copy of str in which every non-overlapping occurrence of src,
// scanning left to right, is replaced by dest.
//   str  - text to search
//   src  - substring to look for; an empty src matches nothing and str is
//          returned unchanged
//   dest - replacement text (may be empty, which deletes the matches)
inline std::string replace(const std::string& str, const std::string& src, const std::string& dest)
{
    if (src.empty())
    {
        return str;
    }

    std::string ret;
    std::string::size_type pos_begin = 0;
    std::string::size_type pos = str.find(src);
    while (pos != std::string::npos)
    {
        // copy the untouched text that precedes this match
        ret.append(str, pos_begin, pos - pos_begin);
        ret += dest;
        // resume after the WHOLE match; stepping by one character would leave
        // the tail of a multi-character src in the output
        pos_begin = pos + src.length();
        pos = str.find(src, pos_begin);
    }
    // copy whatever follows the last match (possibly nothing)
    ret.append(str, pos_begin, std::string::npos);
    return ret;
}
// Remove invalid sequence characters from a string, in place.
// A character is kept when it is alphabetic, '-' or '*'; everything else is
// dropped and the string is shrunk accordingly.
//   s              - sequence to clean (modified in place)
//   forceUpperCase - when true, ASCII 'a'..'z' are converted to upper case
//                    before being stored
inline void str_keep_valid_sequence(std::string& s, bool forceUpperCase = false)
{
    const char case_gap = 'a' - 'A';
    std::string::size_type kept = 0;
    for (std::string::size_type i = 0; i < s.size(); ++i) {
        char c = s[i];
        if (forceUpperCase && c >= 'a' && c <= 'z') {
            c = static_cast<char>(c - case_gap);
        }
        // isalpha() requires a value representable as unsigned char; passing
        // a negative char (any byte >= 0x80 where char is signed) is undefined
        if (std::isalpha(static_cast<unsigned char>(c)) || c == '-' || c == '*') {
            s[kept] = c;
            ++kept;
        }
    }

    s.resize(kept);
}
extern std::mutex logmtx;

// Write one timestamped line, "[HH:MM:SS] <message> ", to stderr.
// The whole operation runs while holding logmtx, so lines from concurrent
// callers are not interleaved and the static buffer returned by localtime()
// is not shared between two callers of this function.
//   s - message text
inline void loginfo(const std::string s){
    // scoped guard: the mutex is released on every exit path
    std::lock_guard<std::mutex> guard(logmtx);
    time_t tt = time(NULL);
    tm* t = localtime(&tt);
    if(t == NULL) {
        // localtime() can fail; still emit the message, without a timestamp
        fprintf(stderr, "%s \n", s.c_str());
        return;
    }
    fprintf(stderr, "[%02d:%02d:%02d] %s \n", t->tm_hour, t->tm_min, t->tm_sec, s.c_str());
}
(int)(overlap_len * diffPercentLimit)); 36 | 37 | diff = 0; 38 | int i = 0; 39 | for (i=0; i overlapDiffLimit && i < complete_compare_require) 43 | break; 44 | } 45 | } 46 | 47 | if (diff <= overlapDiffLimit || (diff > overlapDiffLimit && i>complete_compare_require)){ 48 | OverlapResult ov; 49 | ov.overlapped = true; 50 | ov.offset = offset; 51 | ov.overlap_len = overlap_len; 52 | ov.diff = diff; 53 | ov.hasGap = false; 54 | return ov; 55 | } 56 | 57 | offset += 1; 58 | } 59 | 60 | 61 | // reverse with no gap 62 | // in this case, the adapter is sequenced since TEMPLATE_LEN < SEQ_LEN 63 | // check if distance can get smaller if offset goes negative 64 | // this only happens when insert DNA is shorter than sequencing read length, and some adapter/primer is sequenced but not trimmed cleanly 65 | // we go reversely 66 | offset = 0; 67 | while (offset > -(len2-overlapRequire)){ 68 | // the overlap length of r1 & r2 when r2 is move right for offset 69 | overlap_len = min(len1, len2- abs(offset)); 70 | int overlapDiffLimit = min(diffLimit, (int)(overlap_len * diffPercentLimit)); 71 | 72 | diff = 0; 73 | int i = 0; 74 | for (i=0; i overlapDiffLimit && i < complete_compare_require) 78 | break; 79 | } 80 | } 81 | 82 | if (diff <= overlapDiffLimit || (diff > overlapDiffLimit && i>complete_compare_require)){ 83 | OverlapResult ov; 84 | ov.overlapped = true; 85 | ov.offset = offset; 86 | ov.overlap_len = overlap_len; 87 | ov.diff = diff; 88 | ov.hasGap = false; 89 | return ov; 90 | } 91 | 92 | offset -= 1; 93 | } 94 | 95 | if(allowGap) { 96 | // forward with one gap 97 | offset = 0; 98 | while (offset < len1-overlapRequire) { 99 | // the overlap length of r1 & r2 when r2 is move right for offset 100 | overlap_len = min(len1 - offset, len2); 101 | int overlapDiffLimit = min(diffLimit, (int)(overlap_len * diffPercentLimit)); 102 | 103 | int diff = Matcher::diffWithOneInsertion(str1 + offset, str2, overlap_len-1, overlapDiffLimit); 104 | if(diff <0 || diff > overlapDiffLimit) 105 
| diff = Matcher::diffWithOneInsertion(str2, str1 + offset, overlap_len-1, overlapDiffLimit); 106 | 107 | if (diff <= overlapDiffLimit && diff >=0){ 108 | OverlapResult ov; 109 | ov.overlapped = true; 110 | ov.offset = offset; 111 | ov.overlap_len = overlap_len; 112 | ov.diff = diff; 113 | ov.hasGap = true; 114 | return ov; 115 | } 116 | 117 | offset += 1; 118 | } 119 | 120 | // reverse with one gap 121 | offset = 0; 122 | while (offset > -(len2-overlapRequire)){ 123 | // the overlap length of r1 & r2 when r2 is move right for offset 124 | overlap_len = min(len1, len2- abs(offset)); 125 | int overlapDiffLimit = min(diffLimit, (int)(overlap_len * diffPercentLimit)); 126 | 127 | int diff = Matcher::diffWithOneInsertion(str1, str2-offset, overlap_len-1, overlapDiffLimit); 128 | if(diff <0 || diff > overlapDiffLimit) 129 | diff = Matcher::diffWithOneInsertion(str2-offset, str1, overlap_len-1, overlapDiffLimit); 130 | 131 | if (diff <= overlapDiffLimit && diff >=0){ 132 | OverlapResult ov; 133 | ov.overlapped = true; 134 | ov.offset = offset; 135 | ov.overlap_len = overlap_len; 136 | ov.diff = diff; 137 | ov.hasGap = true; 138 | return ov; 139 | } 140 | 141 | offset -= 1; 142 | } 143 | } 144 | 145 | OverlapResult ov; 146 | ov.overlapped = false; 147 | ov.offset = ov.overlap_len = ov.diff = 0; 148 | ov.hasGap = false; 149 | return ov; 150 | } 151 | 152 | Read* OverlapAnalysis::merge(Read* r1, Read* r2, OverlapResult ov) { 153 | int ol = ov.overlap_len; 154 | if(!ov.overlapped) 155 | return NULL; 156 | 157 | int len1 = ol + max(0, ov.offset); 158 | int len2 = 0; 159 | if(ov.offset > 0) 160 | len2 = r2->length() - ol; 161 | 162 | Read* rr2 = r2->reverseComplement(); 163 | string mergedSeq = r1->mSeq->substr(0, len1); 164 | if(ov.offset > 0) { 165 | mergedSeq += rr2->mSeq->substr(ol, len2); 166 | } 167 | 168 | string mergedQual = r1->mQuality->substr(0, len1); 169 | if(ov.offset > 0) { 170 | mergedQual += rr2->mQuality->substr(ol, len2); 171 | } 172 | 173 | delete rr2; 174 
| 175 | string name = *(r1->mName) + " merged_" + to_string(len1) + "_" + to_string(len2); 176 | string strand = *(r1->mStrand); 177 | if (strand != "+") { 178 | strand = strand + " merged_" + to_string(len1) + "_" + to_string(len2); 179 | } 180 | Read* mergedRead = new Read(new string(name), new string(mergedSeq), new string(strand), new string(mergedQual)); 181 | 182 | return mergedRead; 183 | } 184 | 185 | bool OverlapAnalysis::test(){ 186 | //Sequence r1("CAGCGCCTACGGGCCCCTTTTTCTGCGCGACCGCGTGGCTGTGGGCGCGGATGCCTTTGAGCGCGGTGACTTCTCACTGCGTATCGAGCCGCTGGAGGTCTCCC"); 187 | //Sequence r2("ACCTCCAGCGGCTCGATACGCAGTGAGAAGTCACCGCGCTCAAAGGCATCCGCGCCCACAGCCACGCGGTCGCGCAGAAAAAGGGGCCCGTAGGCGCGGCTCCC"); 188 | 189 | string* r1 = new string("CAGCGCCTACGGGCCCCTTTTTCTGCGCGACCGCGTGGCTGTGGGCGCGGATGCCTTTGAGCGCGGTGACTTCTCACTGCGTATCGAGC"); 190 | string* r2 = new string("ACCTCCAGCGGCTCGATACGCAGTGAGAAGTCACCGCGCTCAAAGGCATCCGCGCCCACAGCCACGCGGTCGCGCAGAAAAAGGGGTCC"); 191 | string* qual1 = new string("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"); 192 | string* qual2 = new string("#########################################################################################"); 193 | 194 | OverlapResult ov = OverlapAnalysis::analyze(r1, r2, 2, 30, 0.2); 195 | 196 | Read read1(new string("name1"), r1, new string("+"), qual1); 197 | Read read2(new string("name2"), r2, new string("+"), qual2); 198 | 199 | Read* mergedRead = OverlapAnalysis::merge(&read1, &read2, ov); 200 | mergedRead->print(); 201 | 202 | return ov.overlapped && ov.offset == 10 && ov.overlap_len == 79 && ov.diff == 1; 203 | } 204 | -------------------------------------------------------------------------------- /src/zlib/trees.h: -------------------------------------------------------------------------------- 1 | /* header created automatically with -DGEN_TREES_H */ 2 | 3 | local const ct_data static_ltree[L_CODES+2] = { 4 | {{ 12},{ 8}}, {{140},{ 8}}, {{ 76},{ 8}}, {{204},{ 8}}, {{ 44},{ 
8}}, 5 | {{172},{ 8}}, {{108},{ 8}}, {{236},{ 8}}, {{ 28},{ 8}}, {{156},{ 8}}, 6 | {{ 92},{ 8}}, {{220},{ 8}}, {{ 60},{ 8}}, {{188},{ 8}}, {{124},{ 8}}, 7 | {{252},{ 8}}, {{ 2},{ 8}}, {{130},{ 8}}, {{ 66},{ 8}}, {{194},{ 8}}, 8 | {{ 34},{ 8}}, {{162},{ 8}}, {{ 98},{ 8}}, {{226},{ 8}}, {{ 18},{ 8}}, 9 | {{146},{ 8}}, {{ 82},{ 8}}, {{210},{ 8}}, {{ 50},{ 8}}, {{178},{ 8}}, 10 | {{114},{ 8}}, {{242},{ 8}}, {{ 10},{ 8}}, {{138},{ 8}}, {{ 74},{ 8}}, 11 | {{202},{ 8}}, {{ 42},{ 8}}, {{170},{ 8}}, {{106},{ 8}}, {{234},{ 8}}, 12 | {{ 26},{ 8}}, {{154},{ 8}}, {{ 90},{ 8}}, {{218},{ 8}}, {{ 58},{ 8}}, 13 | {{186},{ 8}}, {{122},{ 8}}, {{250},{ 8}}, {{ 6},{ 8}}, {{134},{ 8}}, 14 | {{ 70},{ 8}}, {{198},{ 8}}, {{ 38},{ 8}}, {{166},{ 8}}, {{102},{ 8}}, 15 | {{230},{ 8}}, {{ 22},{ 8}}, {{150},{ 8}}, {{ 86},{ 8}}, {{214},{ 8}}, 16 | {{ 54},{ 8}}, {{182},{ 8}}, {{118},{ 8}}, {{246},{ 8}}, {{ 14},{ 8}}, 17 | {{142},{ 8}}, {{ 78},{ 8}}, {{206},{ 8}}, {{ 46},{ 8}}, {{174},{ 8}}, 18 | {{110},{ 8}}, {{238},{ 8}}, {{ 30},{ 8}}, {{158},{ 8}}, {{ 94},{ 8}}, 19 | {{222},{ 8}}, {{ 62},{ 8}}, {{190},{ 8}}, {{126},{ 8}}, {{254},{ 8}}, 20 | {{ 1},{ 8}}, {{129},{ 8}}, {{ 65},{ 8}}, {{193},{ 8}}, {{ 33},{ 8}}, 21 | {{161},{ 8}}, {{ 97},{ 8}}, {{225},{ 8}}, {{ 17},{ 8}}, {{145},{ 8}}, 22 | {{ 81},{ 8}}, {{209},{ 8}}, {{ 49},{ 8}}, {{177},{ 8}}, {{113},{ 8}}, 23 | {{241},{ 8}}, {{ 9},{ 8}}, {{137},{ 8}}, {{ 73},{ 8}}, {{201},{ 8}}, 24 | {{ 41},{ 8}}, {{169},{ 8}}, {{105},{ 8}}, {{233},{ 8}}, {{ 25},{ 8}}, 25 | {{153},{ 8}}, {{ 89},{ 8}}, {{217},{ 8}}, {{ 57},{ 8}}, {{185},{ 8}}, 26 | {{121},{ 8}}, {{249},{ 8}}, {{ 5},{ 8}}, {{133},{ 8}}, {{ 69},{ 8}}, 27 | {{197},{ 8}}, {{ 37},{ 8}}, {{165},{ 8}}, {{101},{ 8}}, {{229},{ 8}}, 28 | {{ 21},{ 8}}, {{149},{ 8}}, {{ 85},{ 8}}, {{213},{ 8}}, {{ 53},{ 8}}, 29 | {{181},{ 8}}, {{117},{ 8}}, {{245},{ 8}}, {{ 13},{ 8}}, {{141},{ 8}}, 30 | {{ 77},{ 8}}, {{205},{ 8}}, {{ 45},{ 8}}, {{173},{ 8}}, {{109},{ 8}}, 31 | {{237},{ 8}}, {{ 29},{ 8}}, {{157},{ 8}}, {{ 
93},{ 8}}, {{221},{ 8}}, 32 | {{ 61},{ 8}}, {{189},{ 8}}, {{125},{ 8}}, {{253},{ 8}}, {{ 19},{ 9}}, 33 | {{275},{ 9}}, {{147},{ 9}}, {{403},{ 9}}, {{ 83},{ 9}}, {{339},{ 9}}, 34 | {{211},{ 9}}, {{467},{ 9}}, {{ 51},{ 9}}, {{307},{ 9}}, {{179},{ 9}}, 35 | {{435},{ 9}}, {{115},{ 9}}, {{371},{ 9}}, {{243},{ 9}}, {{499},{ 9}}, 36 | {{ 11},{ 9}}, {{267},{ 9}}, {{139},{ 9}}, {{395},{ 9}}, {{ 75},{ 9}}, 37 | {{331},{ 9}}, {{203},{ 9}}, {{459},{ 9}}, {{ 43},{ 9}}, {{299},{ 9}}, 38 | {{171},{ 9}}, {{427},{ 9}}, {{107},{ 9}}, {{363},{ 9}}, {{235},{ 9}}, 39 | {{491},{ 9}}, {{ 27},{ 9}}, {{283},{ 9}}, {{155},{ 9}}, {{411},{ 9}}, 40 | {{ 91},{ 9}}, {{347},{ 9}}, {{219},{ 9}}, {{475},{ 9}}, {{ 59},{ 9}}, 41 | {{315},{ 9}}, {{187},{ 9}}, {{443},{ 9}}, {{123},{ 9}}, {{379},{ 9}}, 42 | {{251},{ 9}}, {{507},{ 9}}, {{ 7},{ 9}}, {{263},{ 9}}, {{135},{ 9}}, 43 | {{391},{ 9}}, {{ 71},{ 9}}, {{327},{ 9}}, {{199},{ 9}}, {{455},{ 9}}, 44 | {{ 39},{ 9}}, {{295},{ 9}}, {{167},{ 9}}, {{423},{ 9}}, {{103},{ 9}}, 45 | {{359},{ 9}}, {{231},{ 9}}, {{487},{ 9}}, {{ 23},{ 9}}, {{279},{ 9}}, 46 | {{151},{ 9}}, {{407},{ 9}}, {{ 87},{ 9}}, {{343},{ 9}}, {{215},{ 9}}, 47 | {{471},{ 9}}, {{ 55},{ 9}}, {{311},{ 9}}, {{183},{ 9}}, {{439},{ 9}}, 48 | {{119},{ 9}}, {{375},{ 9}}, {{247},{ 9}}, {{503},{ 9}}, {{ 15},{ 9}}, 49 | {{271},{ 9}}, {{143},{ 9}}, {{399},{ 9}}, {{ 79},{ 9}}, {{335},{ 9}}, 50 | {{207},{ 9}}, {{463},{ 9}}, {{ 47},{ 9}}, {{303},{ 9}}, {{175},{ 9}}, 51 | {{431},{ 9}}, {{111},{ 9}}, {{367},{ 9}}, {{239},{ 9}}, {{495},{ 9}}, 52 | {{ 31},{ 9}}, {{287},{ 9}}, {{159},{ 9}}, {{415},{ 9}}, {{ 95},{ 9}}, 53 | {{351},{ 9}}, {{223},{ 9}}, {{479},{ 9}}, {{ 63},{ 9}}, {{319},{ 9}}, 54 | {{191},{ 9}}, {{447},{ 9}}, {{127},{ 9}}, {{383},{ 9}}, {{255},{ 9}}, 55 | {{511},{ 9}}, {{ 0},{ 7}}, {{ 64},{ 7}}, {{ 32},{ 7}}, {{ 96},{ 7}}, 56 | {{ 16},{ 7}}, {{ 80},{ 7}}, {{ 48},{ 7}}, {{112},{ 7}}, {{ 8},{ 7}}, 57 | {{ 72},{ 7}}, {{ 40},{ 7}}, {{104},{ 7}}, {{ 24},{ 7}}, {{ 88},{ 7}}, 58 | {{ 56},{ 7}}, {{120},{ 
7}}, {{ 4},{ 7}}, {{ 68},{ 7}}, {{ 36},{ 7}}, 59 | {{100},{ 7}}, {{ 20},{ 7}}, {{ 84},{ 7}}, {{ 52},{ 7}}, {{116},{ 7}}, 60 | {{ 3},{ 8}}, {{131},{ 8}}, {{ 67},{ 8}}, {{195},{ 8}}, {{ 35},{ 8}}, 61 | {{163},{ 8}}, {{ 99},{ 8}}, {{227},{ 8}} 62 | }; 63 | 64 | local const ct_data static_dtree[D_CODES] = { 65 | {{ 0},{ 5}}, {{16},{ 5}}, {{ 8},{ 5}}, {{24},{ 5}}, {{ 4},{ 5}}, 66 | {{20},{ 5}}, {{12},{ 5}}, {{28},{ 5}}, {{ 2},{ 5}}, {{18},{ 5}}, 67 | {{10},{ 5}}, {{26},{ 5}}, {{ 6},{ 5}}, {{22},{ 5}}, {{14},{ 5}}, 68 | {{30},{ 5}}, {{ 1},{ 5}}, {{17},{ 5}}, {{ 9},{ 5}}, {{25},{ 5}}, 69 | {{ 5},{ 5}}, {{21},{ 5}}, {{13},{ 5}}, {{29},{ 5}}, {{ 3},{ 5}}, 70 | {{19},{ 5}}, {{11},{ 5}}, {{27},{ 5}}, {{ 7},{ 5}}, {{23},{ 5}} 71 | }; 72 | 73 | const uch ZLIB_INTERNAL _dist_code[DIST_CODE_LEN] = { 74 | 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 75 | 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 76 | 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 77 | 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 78 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 79 | 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 80 | 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 81 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 82 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 83 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 84 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 85 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 86 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 0, 0, 16, 17, 87 | 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 88 | 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 89 | 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 90 | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 91 | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 92 | 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 93 | 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 94 | 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 95 | 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 96 | 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 97 | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 98 | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 99 | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29 100 | }; 101 | 102 | const uch ZLIB_INTERNAL _length_code[MAX_MATCH-MIN_MATCH+1]= { 103 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, 12, 104 | 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 105 | 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 106 | 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 107 | 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 108 | 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 109 | 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 110 | 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 111 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 112 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 113 | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 114 | 26, 26, 26, 26, 27, 27, 27, 27, 
27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 115 | 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28 116 | }; 117 | 118 | local const int base_length[LENGTH_CODES] = { 119 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 120 | 64, 80, 96, 112, 128, 160, 192, 224, 0 121 | }; 122 | 123 | local const int base_dist[D_CODES] = { 124 | 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 125 | 32, 48, 64, 96, 128, 192, 256, 384, 512, 768, 126 | 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576 127 | }; 128 | 129 | -------------------------------------------------------------------------------- /src/filter.cpp: -------------------------------------------------------------------------------- 1 | #include "processor.h" 2 | #include "peprocessor.h" 3 | #include "seprocessor.h" 4 | #include "overlapanalysis.h" 5 | 6 | Filter::Filter(Options* opt){ 7 | mOptions = opt; 8 | } 9 | 10 | 11 | Filter::~Filter(){ 12 | } 13 | 14 | int Filter::passFilter(Read* r) { 15 | if(r == NULL || r->length()==0) { 16 | return FAIL_LENGTH; 17 | } 18 | 19 | int rlen = r->length(); 20 | int lowQualNum = 0; 21 | int nBaseNum = 0; 22 | int totalQual = 0; 23 | 24 | // need to recalculate lowQualNum and nBaseNum if the corresponding filters are enabled 25 | if(mOptions->qualfilter.enabled || mOptions->lengthFilter.enabled) { 26 | const char* seqstr = r->mSeq->c_str(); 27 | const char* qualstr = r->mQuality->c_str(); 28 | 29 | for(int i=0; iqualfilter.qualifiedQual) 36 | lowQualNum ++; 37 | 38 | if(base == 'N') 39 | nBaseNum++; 40 | } 41 | } 42 | 43 | if(mOptions->qualfilter.enabled) { 44 | if(lowQualNum > (mOptions->qualfilter.unqualifiedPercentLimit * rlen / 100.0) ) 45 | return FAIL_QUALITY; 46 | else if(mOptions->qualfilter.avgQualReq > 0 && (totalQual / rlen)qualfilter.avgQualReq) 47 | return FAIL_QUALITY; 48 | else if(nBaseNum > mOptions->qualfilter.nBaseLimit ) 49 | return FAIL_N_BASE; 50 | } 51 | 52 | if(mOptions->lengthFilter.enabled) { 53 | if(rlen < 
mOptions->lengthFilter.requiredLength) 54 | return FAIL_LENGTH; 55 | if(mOptions->lengthFilter.maxLength > 0 && rlen > mOptions->lengthFilter.maxLength) 56 | return FAIL_TOO_LONG; 57 | } 58 | 59 | if(mOptions->complexityFilter.enabled) { 60 | if(!passLowComplexityFilter(r)) 61 | return FAIL_COMPLEXITY; 62 | } 63 | 64 | return PASS_FILTER; 65 | } 66 | 67 | bool Filter::passLowComplexityFilter(Read* r) { 68 | int diff = 0; 69 | int length = r->length(); 70 | if(length <= 1) 71 | return false; 72 | const char* data = r->mSeq->c_str(); 73 | for(int i=0; i= mOptions->complexityFilter.threshold ) 78 | return true; 79 | else 80 | return false; 81 | } 82 | 83 | Read* Filter::trimAndCut(Read* r, int front, int tail, int& frontTrimmed) { 84 | frontTrimmed = 0; 85 | // return the same read for speed if no change needed 86 | if(front == 0 && tail == 0 && !mOptions->qualityCut.enabledFront && !mOptions->qualityCut.enabledTail && !mOptions->qualityCut.enabledRight) 87 | return r; 88 | 89 | 90 | int rlen = r->length() - front - tail ; 91 | if (rlen < 0) 92 | return NULL; 93 | 94 | if(front == 0 && !mOptions->qualityCut.enabledFront && !mOptions->qualityCut.enabledTail && !mOptions->qualityCut.enabledRight){ 95 | r->resize(rlen); 96 | return r; 97 | } else if(!mOptions->qualityCut.enabledFront && !mOptions->qualityCut.enabledTail && !mOptions->qualityCut.enabledRight){ 98 | r->mSeq->erase(0,front); 99 | r->mSeq->resize(rlen); 100 | r->mQuality->erase(0,front); 101 | r->mQuality->resize(rlen); 102 | frontTrimmed = front; 103 | return r; 104 | } 105 | 106 | // need quality cutting 107 | 108 | int l = r->length(); 109 | const char* qualstr = r->mQuality->c_str(); 110 | const char* seq = r->mSeq->c_str(); 111 | // quality cutting forward 112 | if(mOptions->qualityCut.enabledFront) { 113 | int w = mOptions->qualityCut.windowSizeFront; 114 | int s = front; 115 | if(l - front - tail - w <= 0) 116 | return NULL; 117 | 118 | int totalQual = 0; 119 | 120 | // preparing rolling 121 | for(int 
i=0; i front) { 128 | totalQual -= qualstr[s-1]; 129 | } 130 | // add 33 for phred33 transforming 131 | if((double)totalQual / (double)w >= 33 + mOptions->qualityCut.qualityFront) 132 | break; 133 | } 134 | 135 | // the trimming in front is forwarded and rlen is recalculated 136 | if(s >0 ) 137 | s = s+w-1; 138 | while(squalityCut.enabledRight) { 146 | int w = mOptions->qualityCut.windowSizeRight; 147 | int s = front; 148 | if(l - front - tail - w <= 0) 149 | return NULL; 150 | 151 | int totalQual = 0; 152 | 153 | // preparing rolling 154 | for(int i=0; i front) { 163 | totalQual -= qualstr[s-1]; 164 | } 165 | // add 33 for phred33 transforming 166 | if((double)totalQual / (double)w < 33 + mOptions->qualityCut.qualityRight) { 167 | foundLowQualWindow = true; 168 | break; 169 | } 170 | } 171 | 172 | if(foundLowQualWindow ) { 173 | // keep the good bases in the window 174 | while(s=33 + mOptions->qualityCut.qualityRight) 175 | s++; 176 | rlen = s - front; 177 | } 178 | } 179 | 180 | // quality cutting backward 181 | if(!mOptions->qualityCut.enabledRight && mOptions->qualityCut.enabledTail) { 182 | int w = mOptions->qualityCut.windowSizeTail; 183 | if(l - front - tail - w <= 0) 184 | return NULL; 185 | 186 | int totalQual = 0; 187 | int t = l - tail - 1; 188 | 189 | // preparing rolling 190 | for(int i=0; i=front; t--) { 194 | totalQual += qualstr[t-w+1]; 195 | // rolling 196 | if(t < l-tail-1) { 197 | totalQual -= qualstr[t+1]; 198 | } 199 | // add 33 for phred33 transforming 200 | if((double)totalQual / (double)w >= 33 + mOptions->qualityCut.qualityTail) 201 | break; 202 | } 203 | 204 | if(t < l-1) 205 | t = t-w+1; 206 | while(t>=0 && seq[t] == 'N') 207 | t--; 208 | rlen = t - front + 1; 209 | } 210 | 211 | if(rlen <= 0 || front >= l-1) 212 | return NULL; 213 | 214 | r->mSeq->erase(0, front); 215 | r->mSeq->resize(rlen); 216 | r->mQuality->erase(0, front); 217 | r->mQuality->resize(rlen); 218 | 219 | frontTrimmed = front; 220 | 221 | return r; 222 | } 223 | 224 | 
bool Filter::filterByIndex(Read* r) { 225 | if(mOptions->indexFilter.enabled) { 226 | if( match(mOptions->indexFilter.blacklist1, r->firstIndex(), mOptions->indexFilter.threshold) ) 227 | return true; 228 | } 229 | return false; 230 | } 231 | 232 | bool Filter::filterByIndex(Read* r1, Read* r2) { 233 | if(mOptions->indexFilter.enabled) { 234 | if( match(mOptions->indexFilter.blacklist1, r1->firstIndex(), mOptions->indexFilter.threshold) ) 235 | return true; 236 | if( match(mOptions->indexFilter.blacklist2, r2->lastIndex(), mOptions->indexFilter.threshold) ) 237 | return true; 238 | } 239 | return false; 240 | } 241 | 242 | bool Filter::match(vector& list, string target, int threshold) { 243 | for(int i=0; ithreshold) 251 | break; 252 | } 253 | } 254 | if(diff <= threshold) 255 | return true; 256 | } 257 | return false; 258 | } 259 | 260 | bool Filter::test() { 261 | Read r("@name", 262 | "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTT", 263 | "+", 264 | "/////CCCCCCCCCCCC////CCCCCCCCCCCCCC////E"); 265 | Options opt; 266 | opt.qualityCut.enabledFront = true; 267 | opt.qualityCut.enabledTail = true; 268 | opt.qualityCut.windowSizeFront = 4; 269 | opt.qualityCut.qualityFront = 20; 270 | opt.qualityCut.windowSizeTail = 4; 271 | opt.qualityCut.qualityTail = 20; 272 | Filter filter(&opt); 273 | int frontTrimmed = 0; 274 | Read* ret = filter.trimAndCut(&r, 0, 1, frontTrimmed); 275 | ret->print(); 276 | 277 | return *ret->mSeq == "CCCCCCCCCCCCCCCCCCCCCCCCCCCC" 278 | && *ret->mQuality == "CCCCCCCCCCC////CCCCCCCCCCCCC"; 279 | } -------------------------------------------------------------------------------- /src/read.cpp: -------------------------------------------------------------------------------- 1 | #include "read.h" 2 | #include 3 | #include 4 | #include "util.h" 5 | 6 | Read::Read(string* name, string* seq, string* strand, string* quality, bool phred64){ 7 | mName = name; 8 | mSeq = seq; 9 | mStrand = strand; 10 | mQuality = quality; 11 | if(phred64) 12 | 
convertPhred64To33(); 13 | } 14 | 15 | Read::Read(const char* name, const char* seq, const char* strand, const char* quality, bool phred64) { 16 | mName = new string(name); 17 | mSeq = new string(seq); 18 | mStrand = new string(strand); 19 | mQuality = new string(quality); 20 | if(phred64) 21 | convertPhred64To33(); 22 | } 23 | 24 | Read::~Read(){ 25 | if(mName) 26 | delete mName; 27 | if(mStrand) 28 | delete mStrand; 29 | if(mQuality) 30 | delete mQuality; 31 | if(mSeq) 32 | delete mSeq; 33 | } 34 | 35 | void Read::convertPhred64To33(){ 36 | for(int i=0; ilength(); i++) { 37 | (*mQuality)[i] = max(33, (*mQuality)[i] - (64-33)); 38 | } 39 | } 40 | 41 | void Read::print(){ 42 | std::cerr << *mName << endl; 43 | std::cerr << *(mSeq) << endl; 44 | std::cerr << *mStrand << endl; 45 | std::cerr << *mQuality << endl; 46 | } 47 | 48 | void Read::printFile(ofstream& file){ 49 | file << *mName << endl; 50 | file << *mSeq << endl; 51 | file << *mStrand << endl; 52 | file << *mQuality << endl; 53 | } 54 | 55 | Read* Read::reverseComplement(){ 56 | string seq = Sequence::reverseComplement(mSeq); 57 | string qual; 58 | qual.assign(mQuality->rbegin(), mQuality->rend()); 59 | return new Read(mName->c_str(), seq.c_str(), "+", qual.c_str()); 60 | } 61 | 62 | void Read::resize(int len) { 63 | if(len > length() || len<0) 64 | return ; 65 | mSeq->resize(len); 66 | mQuality->resize(len); 67 | } 68 | 69 | void Read::trimFront(int len){ 70 | len = min(length()-1, len); 71 | mSeq->erase(0, len); 72 | mQuality->erase(0, len); 73 | } 74 | 75 | string Read::lastIndex(){ 76 | int len = mName->length(); 77 | if(len<5) 78 | return ""; 79 | for(int i=len-3;i>=0;i--){ 80 | if((*mName)[i]==':' || (*mName)[i]=='+'){ 81 | return mName->substr(i+1, len-i); 82 | } 83 | } 84 | return ""; 85 | } 86 | 87 | string Read::firstIndex(){ 88 | int len = mName->length(); 89 | int end = len; 90 | if(len<5) 91 | return ""; 92 | for(int i=len-3;i>=0;i--){ 93 | if((*mName)[i]=='+') 94 | end = i-1; 95 | 
if((*mName)[i]==':'){ 96 | return mName->substr(i+1, end-i); 97 | } 98 | } 99 | return ""; 100 | } 101 | 102 | int Read::lowQualCount(int qual){ 103 | int count = 0; 104 | for(int q=0;qsize();q++){ 105 | if((*mQuality)[q] < qual + 33) 106 | count++; 107 | } 108 | return count; 109 | } 110 | 111 | int Read::length(){ 112 | return mSeq->length(); 113 | } 114 | 115 | string Read::toString() { 116 | return *mName + "\n" + *mSeq + "\n" + *mStrand + "\n" + *mQuality + "\n"; 117 | } 118 | 119 | void Read::appendToString(string* target) { 120 | size_t size = mName->length() + mSeq->length() + mStrand->length() + mQuality->length() + 4; 121 | char* str = new char[size + 1]; 122 | size_t total = 0; 123 | memcpy(str + total, mName->data(), mName->length()); 124 | total += mName->length(); 125 | str[total] = '\n'; 126 | total++; 127 | memcpy(str + total, mSeq->data(), mSeq->length()); 128 | total += mSeq->length(); 129 | str[total] = '\n'; 130 | total++; 131 | memcpy(str + total, mStrand->data(), mStrand->length()); 132 | total += mStrand->length(); 133 | str[total] = '\n'; 134 | total++; 135 | memcpy(str + total, mQuality->data(), mQuality->length()); 136 | total += mQuality->length(); 137 | str[total] = '\n'; 138 | total++; 139 | str[total] = '\0'; 140 | 141 | target->append(str, size); 142 | delete[] str; 143 | } 144 | 145 | void Read::appendToStringWithTag(string* target, string tag) { 146 | size_t size = mName->length() + 1 + tag.length() + mSeq->length() + mStrand->length() + mQuality->length() + 4; 147 | char* str = new char[size + 1]; 148 | size_t total = 0; 149 | memcpy(str + total, mName->data(), mName->length()); 150 | total += mName->length(); 151 | str[total] = ' '; 152 | total++; 153 | memcpy(str + total, tag.data(), tag.length()); 154 | total += tag.length(); 155 | str[total] = '\n'; 156 | total++; 157 | memcpy(str + total, mSeq->data(), mSeq->length()); 158 | total += mSeq->length(); 159 | str[total] = '\n'; 160 | total++; 161 | memcpy(str + total, 
mStrand->data(), mStrand->length()); 162 | total += mStrand->length(); 163 | str[total] = '\n'; 164 | total++; 165 | memcpy(str + total, mQuality->data(), mQuality->length()); 166 | total += mQuality->length(); 167 | str[total] = '\n'; 168 | total++; 169 | str[total] = '\0'; 170 | 171 | target->append(str, size); 172 | delete[] str; 173 | } 174 | 175 | string Read::toStringWithTag(string tag) { 176 | return *mName + " " + tag + "\n" + *mSeq + "\n" + *mStrand + "\n" + *mQuality + "\n"; 177 | } 178 | 179 | bool Read::fixMGI() { 180 | int len = mName->length(); 181 | if((*mName)[len-1]=='1' || (*mName)[len-1]=='2') { 182 | if((*mName)[len-2] == '/') { 183 | string* newName = new string(mName->substr(0, len-2) + " " + mName->substr(len-2, 2)); 184 | delete mName; 185 | mName = newName; 186 | return true; 187 | } 188 | } 189 | return false; 190 | } 191 | 192 | bool Read::test(){ 193 | Read r(new string("@NS500713:64:HFKJJBGXY:1:11101:20469:1097 1:N:0:TATAGCCT+GGTCCCGA"), 194 | new string("CTCTTGGACTCTAACACTGTTTTTTCTTATGAAAACACAGGAGTGATGACTAGTTGAGTGCATTCTTATGAGACTCATAGTCATTCTATGATGTAGTTTTCCTTAGGAGGACATTTTTTACATGAAATTATTAACCTAAATAGAGTTGATC"), 195 | new string("+"), 196 | new string("AAAAA6EEEEEEEEEEEEEEEEE#EEEEEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEreverseComplement(); 219 | int len1 = mLeft->length(); 220 | int len2 = rcRight->length(); 221 | // use the pointer directly for speed 222 | const char* str1 = mLeft->mSeq->c_str(); 223 | const char* str2 = rcRight->mSeq->c_str(); 224 | const char* qual1 = mLeft->mQuality->c_str(); 225 | const char* qual2 = rcRight->mQuality->c_str(); 226 | 227 | // we require at least 30 bp overlapping to merge a pair 228 | const int MIN_OVERLAP = 30; 229 | bool overlapped = false; 230 | int olen = MIN_OVERLAP; 231 | int diff = 0; 232 | // the diff count for 1 high qual + 1 low qual 233 | int lowQualDiff = 0; 234 | 235 | while(olen <= min(len1, len2)){ 236 | diff = 0; 237 | lowQualDiff = 0; 238 | bool ok = 
true; 239 | int offset = len1 - olen; 240 | for(int i=0;i= Q30 and the other is <= Q15 244 | if((qual1[offset+i]>='?' && qual2[i]<='0') || (qual1[offset+i]<='0' && qual2[i]>='?')){ 245 | lowQualDiff++; 246 | } 247 | // we disallow high quality diff, and only allow up to 3 low qual diff 248 | if(diff>lowQualDiff || lowQualDiff>=3){ 249 | ok = false; 250 | break; 251 | } 252 | } 253 | } 254 | if(ok){ 255 | overlapped = true; 256 | break; 257 | } 258 | olen++; 259 | } 260 | 261 | if(overlapped){ 262 | int offset = len1 - olen; 263 | stringstream ss; 264 | ss << mLeft->mName << " merged offset:" << offset << " overlap:" << olen << " diff:" << diff; 265 | string mergedName = ss.str(); 266 | string mergedSeq = mLeft->mSeq->substr(0, offset) + *(rcRight->mSeq); 267 | string mergedQual = mLeft->mQuality->substr(0, offset) + *(rcRight->mQuality); 268 | // quality adjuction and correction for low qual diff 269 | for(int i=0;i='?' && qual2[i]<='0'){ 272 | mergedSeq[offset+i] = str1[offset+i]; 273 | mergedQual[offset+i] = qual1[offset+i]; 274 | } else { 275 | mergedSeq[offset+i] = str2[i]; 276 | mergedQual[offset+i] = qual2[i]; 277 | } 278 | } else { 279 | // add the quality of the pair to make a high qual 280 | mergedQual[offset+i] = qual1[offset+i] + qual2[i] - 33; 281 | } 282 | } 283 | delete rcRight; 284 | return new Read(new string(mergedName), new string(mergedSeq), new string("+"), new string(mergedQual)); 285 | } 286 | 287 | delete rcRight; 288 | return NULL; 289 | } 290 | 291 | bool ReadPair::test(){ 292 | Read* left = new Read(new string("@NS500713:64:HFKJJBGXY:1:11101:20469:1097 1:N:0:TATAGCCT+GGTCCCGA"), 293 | new string("TTTTTTCTCTTGGACTCTAACACTGTTTTTTCTTATGAAAACACAGGAGTGATGACTAGTTGAGTGCATTCTTATGAGACTCATAGTCATTCTATGATGTAG"), 294 | new string("+"), 295 | new string("AAAAA6EEEEEEEEEEEEEEEEE#EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE")); 296 | Read* right = new Read(new string("@NS500713:64:HFKJJBGXY:1:11101:20469:1097 
// Settings for read merging. Both switches start out false; `out` starts empty.
class MergeOptions {
public:
    MergeOptions()
        : enabled(false)
        , includeUnmerged(false) {
    }
public:
    bool enabled;
    bool includeUnmerged;
    std::string out;
};

// Settings for duplication handling.
// Defaults: enabled, histSize 32, dedup off, accuracyLevel 1.
class DuplicationOptions {
public:
    DuplicationOptions()
        : enabled(true)
        , histSize(32)
        , dedup(false)
        , accuracyLevel(1) {
    }
public:
    bool enabled;
    int histSize;
    bool dedup;
    int accuracyLevel;
};
// Settings for overrepresented-sequence analysis.
// Defaults: disabled, sampling value 20.
class OverrepresentedSequenceAnasysOptions {
public:
    OverrepresentedSequenceAnasysOptions()
        : enabled(false)
        , sampling(20) {
    }
public:
    bool enabled;
    int sampling;
};

// Settings for polyG trimming. Defaults: disabled, minLen 10.
class PolyGTrimmerOptions {
public:
    PolyGTrimmerOptions()
        : enabled(false)
        , minLen(10) {
    }
public:
    bool enabled;
    int minLen;
};

// Settings for polyX trimming. Defaults: disabled, minLen 10.
class PolyXTrimmerOptions {
public:
    PolyXTrimmerOptions()
        : enabled(false)
        , minLen(10) {
    }
public:
    bool enabled;
    int minLen;
};
size for cutting by quality in 5' 159 | int windowSizeFront; 160 | // the mean quality requirement for cutting by quality in 5' 161 | int qualityFront; 162 | // the sliding window size for cutting by quality in 3' 163 | int windowSizeTail; 164 | // the mean quality requirement for cutting by quality in 3' 165 | int qualityTail; 166 | // the sliding window size for cutting by quality in aggressive mode 167 | int windowSizeRight; 168 | // the mean quality requirement for cutting by quality in aggressive mode 169 | int qualityRight; 170 | }; 171 | 172 | class SplitOptions { 173 | public: 174 | SplitOptions() { 175 | enabled = false; 176 | needEvaluation = false; 177 | number = 0; 178 | size = 0; 179 | digits = 4; 180 | byFileNumber = false; 181 | byFileLines = false; 182 | } 183 | public: 184 | bool enabled; 185 | // number of files 186 | int number; 187 | // lines of each file 188 | long size; 189 | // digits number of file name prefix, for example 0001 means 4 digits 190 | int digits; 191 | // need evaluation? 
// Fixed-position trimming and maximum-length settings for read1 / read2.
// Every field defaults to 0.
class TrimmingOptions {
public:
    TrimmingOptions() {
        front1 = 0;
        tail1 = 0;
        front2 = 0;
        tail2 = 0;
        maxLen1 = 0;
        maxLen2 = 0;
    }
public:
    // trimming first cycles for read1
    int front1;
    // trimming last cycles for read1
    int tail1;
    // trimming first cycles for read2
    int front2;
    // trimming last cycles for read2
    int tail2;
    // max length of read1
    int maxLen1;
    // max length of read2
    int maxLen2;
};

// Settings for per-read quality filtering.
// Defaults: enabled, qualifiedQual '0' (Q15 in phred33), unqualifiedPercentLimit
// 40, nBaseLimit 5, avgQualReq 0.
class QualityFilteringOptions {
public:
    QualityFilteringOptions() {
        enabled = true;
        // '0' = Q15
        qualifiedQual = '0';
        unqualifiedPercentLimit = 40;
        nBaseLimit = 5;
        // was left uninitialized by this constructor, so a default-constructed
        // object carried an indeterminate value.
        // NOTE(review): 0 is presumably read as "no average-quality
        // requirement" -- confirm against the filter code.
        avgQualReq = 0;
    }
public:
    // quality filter enabled
    bool enabled;
    // quality character below which a base counts as unqualified (low quality)
    char qualifiedQual;
    // limit on the percentage of unqualified bases in a read; presumably the
    // read is discarded above it -- verify against the filter code
    int unqualifiedPercentLimit;
    // if n_base_number > nBaseLimit, then discard this read
    int nBaseLimit;
    // if average qual score < avgQualReq, then discard this read
    int avgQualReq;
};
enabled = false; 272 | requiredLength = 15; 273 | maxLength = 0; 274 | } 275 | public: 276 | // length filter enabled 277 | bool enabled; 278 | // if read_length < requiredLength, then this read is discard 279 | int requiredLength; 280 | // length limit, 0 for no limitation 281 | int maxLength; 282 | }; 283 | 284 | class Options{ 285 | public: 286 | Options(); 287 | void init(); 288 | bool isPaired(); 289 | bool validate(); 290 | bool adapterCuttingEnabled(); 291 | bool polyXTrimmingEnabled(); 292 | string getAdapter1(); 293 | string getAdapter2(); 294 | void initIndexFiltering(string blacklistFile1, string blacklistFile2, int threshold = 0); 295 | vector makeListFromFileByLine(string filename); 296 | bool shallDetectAdapter(bool isR2 = false); 297 | void loadFastaAdapters(); 298 | 299 | public: 300 | // file name of read1 input 301 | string in1; 302 | // file name of read2 input 303 | string in2; 304 | // file name of read1 output 305 | string out1; 306 | // file name of read2 output 307 | string out2; 308 | // file name of unpaired read1 output 309 | string unpaired1; 310 | // file name of unpaired read2 output 311 | string unpaired2; 312 | // file name of failed reads output 313 | string failedOut; 314 | // json file 315 | string overlappedOut; 316 | // json file 317 | string jsonFile; 318 | // html file 319 | string htmlFile; 320 | // html report title 321 | string reportTitle; 322 | // compression level 323 | int compression; 324 | // the input file is using phred64 quality scoring 325 | bool phred64; 326 | // do not rewrite existing files 327 | bool dontOverwrite; 328 | // read STDIN 329 | bool inputFromSTDIN; 330 | // write STDOUT 331 | bool outputToSTDOUT; 332 | // the input R1 file is interleaved 333 | bool interleavedInput; 334 | // only process first N reads 335 | int readsToProcess; 336 | // fix the MGI ID tailing issue 337 | bool fixMGI; 338 | // worker thread number 339 | int thread; 340 | // trimming options 341 | TrimmingOptions trim; 342 | // 
quality filtering options 343 | QualityFilteringOptions qualfilter; 344 | // length filtering options 345 | ReadLengthFilteringOptions lengthFilter; 346 | // adapter options 347 | AdapterOptions adapter; 348 | // multiple file splitting options 349 | SplitOptions split; 350 | // options for quality cutting 351 | QualityCutOptions qualityCut; 352 | // options for base correction 353 | CorrectionOptions correction; 354 | // options for UMI 355 | UMIOptions umi; 356 | // 3' end polyG trimming, default for Illumina NextSeq/NovaSeq 357 | PolyGTrimmerOptions polyGTrim; 358 | // 3' end polyX trimming 359 | PolyXTrimmerOptions polyXTrim; 360 | // for overrepresentation analysis 361 | OverrepresentedSequenceAnasysOptions overRepAnalysis; 362 | map overRepSeqs1; 363 | map overRepSeqs2; 364 | int seqLen1; 365 | int seqLen2; 366 | // low complexity filtering 367 | LowComplexityFilterOptions complexityFilter; 368 | // black lists for filtering by index 369 | IndexFilterOptions indexFilter; 370 | // options for duplication profiling 371 | DuplicationOptions duplicate; 372 | // max value of insert size 373 | int insertSizeMax; 374 | // overlap analysis threshold 375 | int overlapRequire; 376 | int overlapDiffLimit; 377 | int overlapDiffPercentLimit; 378 | // output debug information 379 | bool verbose; 380 | // merge options 381 | MergeOptions merge; 382 | // the buffer size for writer 383 | size_t writerBufferSize; 384 | 385 | }; 386 | 387 | #endif 388 | -------------------------------------------------------------------------------- /src/fastqreader.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2021 Shifu Chen 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | */ 24 | 25 | #include "fastqreader.h" 26 | #include "util.h" 27 | #include 28 | #include 29 | 30 | #define FQ_BUF_SIZE (1<<23) 31 | #define IGZIP_IN_BUF_SIZE (1<<22) 32 | #define GZIP_HEADER_BYTES_REQ (1<<16) 33 | 34 | FastqReader::FastqReader(string filename, bool hasQuality, bool phred64){ 35 | mFilename = filename; 36 | mZipped = false; 37 | mFile = NULL; 38 | mStdinMode = false; 39 | mFastqBuf = new char[FQ_BUF_SIZE]; 40 | mBufDataLen = 0; 41 | mBufUsedLen = 0; 42 | mHasNoLineBreakAtEnd = false; 43 | mGzipInputBufferSize = IGZIP_IN_BUF_SIZE; 44 | mGzipInputBuffer = new unsigned char[mGzipInputBufferSize]; 45 | mGzipOutputBufferSize = FQ_BUF_SIZE; 46 | mGzipOutputBuffer = (unsigned char*)mFastqBuf; 47 | mCounter = 0; 48 | mPhred64 = phred64; 49 | mHasQuality = hasQuality; 50 | mHasNoLineBreakAtEnd = false; 51 | mGzipInputUsedBytes = 0; 52 | mReadPool = NULL; 53 | init(); 54 | } 55 | 56 | FastqReader::~FastqReader(){ 57 | close(); 58 | delete[] mFastqBuf; 59 | delete[] mGzipInputBuffer; 60 | } 61 | 62 | bool FastqReader::hasNoLineBreakAtEnd() { 63 | return mHasNoLineBreakAtEnd; 64 | } 65 | 66 | void FastqReader::setReadPool(ReadPool* rp) { 67 | mReadPool = 
// True when no more input bytes can arrive: the file is at EOF and, for gzip
// input, the compressed input buffer has also been fully consumed.
bool FastqReader::bufferFinished() {
    if(mZipped) {
        return eof() && mGzipState.avail_in == 0;
    } else {
        return eof();
    }
}

// Refill the text buffer from a gzip file.
// Inflates into mGzipOutputBuffer and stores the number of produced bytes in
// mBufDataLen. If the current gzip member has finished and more compressed
// bytes exist, the inflate state is reset and the next member's header is read
// so that the following call can continue. Exits through error_exit() on a
// decompression error, an invalid header, or a stream that ends before its
// final block.
void FastqReader::readToBufIgzip(){
    mBufDataLen = 0;
    // keep inflating until some output is produced or the input is exhausted
    while(mBufDataLen == 0) {
        if(eof() && mGzipState.avail_in==0)
            return;
        // compressed input buffer is empty: read the next chunk from the file
        if (mGzipState.avail_in == 0) {
            mGzipState.next_in = mGzipInputBuffer;
            mGzipState.avail_in = fread(mGzipState.next_in, 1, mGzipInputBufferSize, mFile);
            mGzipInputUsedBytes += mGzipState.avail_in;
        }
        // always inflate into the start of the output buffer
        mGzipState.next_out = mGzipOutputBuffer;
        mGzipState.avail_out = mGzipOutputBufferSize;

        int ret = isal_inflate(&mGzipState);
        if (ret != ISAL_DECOMP_OK) {
            error_exit("igzip: encountered while decompressing file: " + mFilename);
        }
        // bytes produced by this inflate call
        mBufDataLen = mGzipState.next_out - mGzipOutputBuffer;
        if(eof() || mGzipState.avail_in>0)
            break;
    }
    // this block is finished
    if(mGzipState.block_state == ISAL_BLOCK_FINISH) {
        // a new block begins
        if(!eof() || mGzipState.avail_in > 0) {
            if (mGzipState.avail_in == 0) {
                // nothing buffered: reset, then load a fresh chunk
                isal_inflate_reset(&mGzipState);
                mGzipState.next_in = mGzipInputBuffer;
                mGzipState.avail_in = fread(mGzipState.next_in, 1, mGzipInputBufferSize, mFile);
                mGzipInputUsedBytes += mGzipState.avail_in;
            } else if (mGzipState.avail_in >= GZIP_HEADER_BYTES_REQ){
                // enough bytes are already buffered for the header: reset the
                // state but keep the current input position and count
                unsigned char* old_next_in = mGzipState.next_in;
                size_t old_avail_in = mGzipState.avail_in;
                isal_inflate_reset(&mGzipState);
                mGzipState.avail_in = old_avail_in;
                mGzipState.next_in = old_next_in;
            } else {
                // too few bytes left: move them to the front of the input
                // buffer and top the buffer up from the file before resetting
                size_t old_avail_in = mGzipState.avail_in;
                memmove(mGzipInputBuffer, mGzipState.next_in, mGzipState.avail_in);
                size_t added = 0;
                if(!eof()) {
                    added = fread(mGzipInputBuffer + mGzipState.avail_in, 1, mGzipInputBufferSize - mGzipState.avail_in, mFile);
                    mGzipInputUsedBytes += added;
                }
                isal_inflate_reset(&mGzipState);
                mGzipState.next_in = mGzipInputBuffer;
                mGzipState.avail_in = old_avail_in + added;
            }
            // parse the header of the member that starts here
            int ret = isal_read_gzip_header(&mGzipState, &mGzipHeader);
            if (ret != ISAL_DECOMP_OK) {
                error_exit("igzip: invalid gzip header found");
            }
        }
    }

    if(eof() && mGzipState.avail_in == 0) {
        // all data was processed - fail if not at logical end of zip file (truncated?)
        if (mGzipState.block_state != ISAL_BLOCK_FINISH || !mGzipState.bfinal) {
            error_exit("igzip: unexpected eof");
        }
    }
}

// Refill mFastqBuf with the next chunk of text (inflated for gzip input, raw
// fread otherwise) and rewind the consumed-bytes cursor. When this turns out
// to be the last chunk and it does not end in '\n', mHasNoLineBreakAtEnd is
// set.
void FastqReader::readToBuf() {
    mBufDataLen = 0;
    if(mZipped) {
        readToBufIgzip();
    } else {
        if(!eof())
            mBufDataLen = fread(mFastqBuf, 1, FQ_BUF_SIZE, mFile);
    }
    // nothing of the new chunk has been consumed yet
    mBufUsedLen = 0;

    if(bufferFinished() && mBufDataLen>0) {
        if(mFastqBuf[mBufDataLen-1] != '\n')
            mHasNoLineBreakAtEnd = true;
    }
}
| } 189 | 190 | void FastqReader::getBytes(size_t& bytesRead, size_t& bytesTotal) { 191 | if(mZipped) { 192 | bytesRead = mGzipInputUsedBytes - mGzipState.avail_in; 193 | } else { 194 | bytesRead = ftell(mFile);//mFile.tellg(); 195 | } 196 | // use another ifstream to not affect current reader 197 | ifstream is(mFilename); 198 | is.seekg (0, is.end); 199 | bytesTotal = is.tellg(); 200 | } 201 | 202 | void FastqReader::clearLineBreaks(char* line) { 203 | 204 | // trim \n, \r or \r\n in the tail 205 | int readed = strlen(line); 206 | if(readed >=2 ){ 207 | if(line[readed-1] == '\n' || line[readed-1] == '\r'){ 208 | line[readed-1] = '\0'; 209 | if(line[readed-2] == '\r') 210 | line[readed-2] = '\0'; 211 | } 212 | } 213 | } 214 | 215 | bool FastqReader::eof() { 216 | return feof(mFile);//mFile.eof(); 217 | } 218 | 219 | void FastqReader::getLine(string* line){ 220 | int copied = 0; 221 | 222 | int start = mBufUsedLen; 223 | int end = start; 224 | 225 | while(end < mBufDataLen) { 226 | if(mFastqBuf[end] != '\r' && mFastqBuf[end] != '\n') 227 | end++; 228 | else 229 | break; 230 | } 231 | 232 | // this line well contained in this buf, or this is the last buf 233 | if(end < mBufDataLen || bufferFinished()) { 234 | int len = end - start; 235 | line->assign(mFastqBuf+start, len); 236 | 237 | // skip \n or \r 238 | end++; 239 | // handle \r\n 240 | if(end < mBufDataLen-1 && mFastqBuf[end-1]=='\r' && mFastqBuf[end] == '\n') 241 | end++; 242 | 243 | mBufUsedLen = end; 244 | 245 | return ; 246 | } 247 | 248 | // this line is not contained in this buf, we need to read new buf 249 | line->assign(mFastqBuf+start, mBufDataLen - start); 250 | 251 | while(true) { 252 | readToBuf(); 253 | start = 0; 254 | end = 0; 255 | // handle the case that \r or \n in the start of buf 256 | if(line->empty()) { 257 | while(start < mBufDataLen && (mFastqBuf[start] == '\r' || mFastqBuf[start] == '\n')) 258 | start++; 259 | end = start; 260 | } 261 | while(end < mBufDataLen) { 262 | if(mFastqBuf[end] 
!= '\r' && mFastqBuf[end] != '\n') 263 | end++; 264 | else 265 | break; 266 | } 267 | // this line well contained in this buf 268 | if(end < mBufDataLen || bufferFinished()) { 269 | int len = end - start; 270 | line->append(mFastqBuf+start, len); 271 | 272 | // skip \n or \r 273 | end++; 274 | // handle \r\n 275 | if(end < mBufDataLen-1 && mFastqBuf[end] == '\n') 276 | end++; 277 | 278 | mBufUsedLen = end; 279 | return; 280 | } 281 | // even this new buf is not enough, although impossible 282 | line->append(mFastqBuf+start, mBufDataLen); 283 | } 284 | 285 | return; 286 | } 287 | 288 | Read* FastqReader::read(){ 289 | if(mBufUsedLen >= mBufDataLen && bufferFinished()) { 290 | return NULL; 291 | } 292 | 293 | string* name; 294 | string* sequence; 295 | string* strand; 296 | string* quality; 297 | 298 | Read* readInPool = NULL; 299 | if(mReadPool) 300 | readInPool = mReadPool->getOne(); 301 | 302 | if(readInPool) { 303 | name = readInPool->mName; 304 | sequence = readInPool->mSeq; 305 | strand = readInPool->mStrand; 306 | quality = readInPool->mQuality; 307 | } else { 308 | name = new string(); 309 | sequence = new string(); 310 | strand = new string(); 311 | quality = new string(); 312 | } 313 | 314 | getLine(name); 315 | // name should start with @ 316 | while((name->empty() && !(mBufUsedLen >= mBufDataLen && bufferFinished())) || (!name->empty() && (*name)[0]!='@')){ 317 | getLine(name); 318 | } 319 | if(name->empty()) 320 | return NULL; 321 | 322 | getLine(sequence); 323 | getLine(strand); 324 | getLine(quality); 325 | 326 | if (strand->empty() || (*strand)[0]!='+') { 327 | cerr << *name << endl; 328 | cerr << "Expected '+', got " << *strand << endl; 329 | cerr << "Your FASTQ may be invalid, please check the tail of your FASTQ file" << endl; 330 | return NULL; 331 | } 332 | 333 | if(quality->length() != sequence->length()) { 334 | cerr << "ERROR: sequence and quality have different length:" << endl; 335 | cerr << *name << endl; 336 | cerr << *sequence << endl; 337 
| cerr << *strand << endl; 338 | cerr << *quality << endl; 339 | cerr << "Your FASTQ may be invalid, please check the tail of your FASTQ file" << endl; 340 | return NULL; 341 | } 342 | 343 | if(readInPool) 344 | return readInPool; 345 | else 346 | return new Read(name, sequence, strand, quality, mPhred64); 347 | } 348 | 349 | void FastqReader::close(){ 350 | if (mFile){ 351 | fclose(mFile); 352 | mFile = NULL; 353 | } 354 | } 355 | 356 | bool FastqReader::isZipFastq(string filename) { 357 | if (ends_with(filename, ".fastq.gz")) 358 | return true; 359 | else if (ends_with(filename, ".fq.gz")) 360 | return true; 361 | else if (ends_with(filename, ".fasta.gz")) 362 | return true; 363 | else if (ends_with(filename, ".fa.gz")) 364 | return true; 365 | else 366 | return false; 367 | } 368 | 369 | bool FastqReader::isFastq(string filename) { 370 | if (ends_with(filename, ".fastq")) 371 | return true; 372 | else if (ends_with(filename, ".fq")) 373 | return true; 374 | else if (ends_with(filename, ".fasta")) 375 | return true; 376 | else if (ends_with(filename, ".fa")) 377 | return true; 378 | else 379 | return false; 380 | } 381 | 382 | bool FastqReader::isZipped(){ 383 | return mZipped; 384 | } 385 | 386 | bool FastqReader::test(){ 387 | FastqReader reader1("testdata/R1.fq"); 388 | FastqReader reader2("testdata/R1.fq"); 389 | Read* r1 = NULL; 390 | Read* r2 = NULL; 391 | int i=0; 392 | while(true){ 393 | i++; 394 | r1=reader1.read(); 395 | r2=reader2.read(); 396 | if(r1 == NULL || r2==NULL) 397 | break; 398 | r1->print(); 399 | r2->print(); 400 | delete r1; 401 | delete r2; 402 | } 403 | return true; 404 | } 405 | 406 | FastqReaderPair::FastqReaderPair(FastqReader* left, FastqReader* right){ 407 | mLeft = left; 408 | mRight = right; 409 | } 410 | 411 | FastqReaderPair::FastqReaderPair(string leftName, string rightName, bool hasQuality, bool phred64, bool interleaved){ 412 | mInterleaved = interleaved; 413 | mLeft = new FastqReader(leftName, hasQuality, phred64); 414 | 
if(mInterleaved) 415 | mRight = NULL; 416 | else 417 | mRight = new FastqReader(rightName, hasQuality, phred64); 418 | } 419 | 420 | FastqReaderPair::~FastqReaderPair(){ 421 | if(mLeft){ 422 | delete mLeft; 423 | mLeft = NULL; 424 | } 425 | if(mRight){ 426 | delete mRight; 427 | mRight = NULL; 428 | } 429 | } 430 | 431 | ReadPair* FastqReaderPair::read(){ 432 | Read* l = mLeft->read(); 433 | Read* r = NULL; 434 | if(mInterleaved) 435 | r = mLeft->read(); 436 | else 437 | r = mRight->read(); 438 | if(!l || !r){ 439 | return NULL; 440 | } else { 441 | return new ReadPair(l, r); 442 | } 443 | } 444 | --------------------------------------------------------------------------------