├── .gitignore ├── alignment_util.h ├── Makefile ├── bit_array.cpp ├── csa_bwt.h ├── rle_bwt.h ├── file_iterators.h ├── example ├── README.md └── run_example.sh ├── LICENSE ├── string_util.h ├── base_bwt.h ├── string_util.cpp ├── bit_array.h ├── base_bwt.cpp ├── alignment_util.cpp ├── file_iterators.cpp ├── README.md ├── csa_bwt.cpp ├── rle_bwt.cpp ├── converter └── converter_main.cpp ├── CTPL ├── ctpl_stl.h └── LICENSE └── main.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | fmlrc 3 | fmlrc-convert 4 | -------------------------------------------------------------------------------- /alignment_util.h: -------------------------------------------------------------------------------- 1 | #ifndef ALIGNMENT_UTIL_H 2 | #define ALIGNMENT_UTIL_H 3 | 4 | //C headers 5 | #include 6 | 7 | //C++ headers 8 | #include 9 | 10 | using namespace std; 11 | 12 | uint64_t editDistance(const vector &s1, const vector &s2); 13 | pair editDistance_minimize(const vector &s1, const vector &s2); 14 | 15 | #endif -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC=g++ 2 | CFLAGS=-O3 -std=c++11 -Wall -pthread 3 | LDFLAGS=-O3 -std=c++11 -Wall -pthread 4 | SOURCES=alignment_util.cpp base_bwt.cpp bit_array.cpp csa_bwt.cpp file_iterators.cpp main.cpp rle_bwt.cpp string_util.cpp 5 | OBJECTS=$(SOURCES:.cpp=.o) 6 | EXECUTABLE=fmlrc 7 | 8 | CON_SOURCES=converter/converter_main.cpp 9 | CON_OBJECTS=$(CON_SOURCES:.cpp=.o) 10 | CON_EXEC=fmlrc-convert 11 | 12 | all: $(EXECUTABLE) $(CON_EXEC) 13 | 14 | $(EXECUTABLE): $(OBJECTS) 15 | $(CC) $(LDFLAGS) $(OBJECTS) -o $@ 16 | 17 | $(CON_EXEC): $(CON_OBJECTS) 18 | $(CC) $(LDFLAGS) $(CON_OBJECTS) -o $@ 19 | 20 | .cpp.o: 21 | $(CC) -c $(CFLAGS) $< -o $@ -------------------------------------------------------------------------------- /bit_array.cpp: -------------------------------------------------------------------------------- 1 | 2 | //C headers 3 | #include 4 | 5 | //custom headers 6 | #include "bit_array.h" 7 | 8 | using namespace std; 9 | 10 | BitArray::BitArray(uint64_t baLen) { 11 | this->numValues = ceil(baLen/64.0); 12 | this->ba = vector(this->numValues, 0); 13 | } 14 | 15 | void BitArray::createIndex(uint64_t initialRank) { 16 | this->index = vector(this->numValues+1, 0); 17 | 18 | uint64_t offset = initialRank; 19 | for(uint64_t x = 0; x < this->numValues; x++) { 20 | this->index[x] = offset; 21 | offset += rank64(this->ba[x]); 22 | } 23 | this->index[this->numValues] = offset; 24 | } -------------------------------------------------------------------------------- /csa_bwt.h: -------------------------------------------------------------------------------- 1 | #ifndef CSA_BWT_H 2 | #define CSA_BWT_H 3 | 4 | //C headers 5 | #include 6 | 7 | //C++ headers 8 | 9 | //custom headers 10 | #include "bit_array.h" 11 | #include "base_bwt.h" 12 | 13 | using namespace std; 14 | 15 | class CSA_BWT : public BaseBWT{ 16 | private: 17 | //constructFMIndex() 18 | vector csa; 19 | 20 | //these functions build all auxiliary structures required for the FM-index lookups 21 | void constructFMIndex(bool storeDN); 22 | 23 | public: 24 | //constructor and destructor 25 | CSA_BWT(string inFN, bool storeD=true); 26 | ~CSA_BWT(); 27 | 28 | //query sub-routines 29 | bwtRange constrainRange(uint8_t sym, bwtRange inRange); 30 | }; 31 | 32 | #endif -------------------------------------------------------------------------------- /rle_bwt.h: -------------------------------------------------------------------------------- 1 | #ifndef RLE_BWT_H 2 | #define RLE_BWT_H 3 | 4 | //C headers 5 | #include 6 | 7 | //C++ headers 8 | #include 9 | #include 10 | 11 | //Custom headers 12 | #include "base_bwt.h" 13 | 14 | using namespace std; 15 | 16 | class RLE_BWT : public BaseBWT { 17 | private: 18 | //loaded from disk 19 | uint8_t bitPower; 20 | uint64_t binSize; 21 | 22 | //constructFMIndex() - IMPORTANT: THIS IS TRANSPOSED COMPARED TO PYTHON IMPL 23 | //aka row = symbol; column = index 24 | uint64_t** fmIndex; 25 | vector refFM; 26 | uint64_t offsetSum; 27 | 28 | //these functions build all auxiliary structures required for the FM-index lookups 29 | void constructFMIndex(); 30 | 31 | public: 32 | //constructor and destructor 33 | RLE_BWT(string inFN, uint8_t bitPower=8); 34 | ~RLE_BWT(); 35 | 36 | //query sub-routines 37 | bwtRange constrainRange(uint8_t sym, bwtRange inRange); 38 | }; 39 | 40 | #endif -------------------------------------------------------------------------------- /file_iterators.h: -------------------------------------------------------------------------------- 1 | #ifndef FILE_ITERATORS_H 2 | #define FILE_ITERATORS_H 3 | 4 | //C headers 5 | #include 6 | 7 | //C++ headers 8 | #include 9 | #include 10 | 11 | using namespace std; 12 | 13 | struct LongReadFA { 14 | string label; 15 | string seq; 16 | }; 17 | 18 | class FastaIterator { 19 | private: 20 | bool isMoreData; 21 | bool is_fq; 22 | string fastFN; 23 | string nextLine; 24 | ifstream ifp; 25 | public: 26 | //constructor 27 | FastaIterator(string fastFN); 28 | 29 | //funcs that matter 30 | inline bool isMore() { return this->isMoreData;}; 31 | struct LongReadFA getNextRead(); 32 | }; 33 | 34 | class FastaWriter { 35 | private: 36 | string fastaFN; 37 | int symsPerLine; 38 | ofstream ofp; 39 | public: 40 | //constructor 41 | FastaWriter(string fastaFN, int symsPerLine=50); 42 | 43 | //destructor 44 | ~FastaWriter(); 45 | 46 | //funcs that matter 47 | bool writeRead(LongReadFA r); 48 | }; 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | ## Run _E. coli_ Example 3 | ### Commands 4 | 1. Go to a directory to store fmlrc and clone this repository: 5 | ```bash 6 | git clone https://github.com/holtjma/fmlrc.git 7 | ``` 8 | 2. Change into the fmlrc example directory: 9 | ```bash 10 | cd fmlrc/example 11 | ``` 12 | 3. Run the examples script. This will automatically install ropebwt2 to a local path, download short- and long-read _E. coli_ data, build the multi-string BWT using ropebwt2 and fmlrc-convert, and lastly run use fmlrc to correct the first 400 reads in the long-read data: 13 | ```bash 14 | ./run_example.sh 15 | ``` 16 | 17 | ### Outputs 18 | 1. `./ecoli_comp_msbwt.npy` - this contains the run-length encoded multi-string BWT using the same encoding as the [msbwt](https://github.com/holtjma/msbwt) python package. Note: if you wish to use this with msbwt, it will need to be placed in it's own directory and renamed to `comp_msbwt.npy` in that directory. For more information, please refer to the msbwt wiki pages. 19 | 2. `./corrected_final.fa` - this contains the first 400 long reads after correction using fmlrc. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (C) 2016 James Holt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | 23 | 24 | -------------------------------------------------------------------------------- /string_util.h: -------------------------------------------------------------------------------- 1 | #ifndef STRING_UTIL_H 2 | #define STRING_UTIL_H 3 | 4 | //C headers 5 | #include 6 | 7 | //C++ headers 8 | #include 9 | #include 10 | 11 | using namespace std; 12 | 13 | namespace string_util { 14 | extern vector STRING_TO_INT; 15 | extern vector INT_TO_STRING; 16 | extern vector REV_COMP_I; 17 | 18 | //ALWAYS CALL THIS ONCE 19 | void initializeStringUtil(); 20 | 21 | //utilities 22 | inline vector reverseComplement_i(vector seq) { 23 | uint64_t seqLen = seq.size(); 24 | vector ret = vector(seqLen); 25 | for(uint64_t x = 0; x < seqLen; x++) { 26 | ret[x] = REV_COMP_I[seq[seqLen-x-1]]; 27 | } 28 | return ret; 29 | }; 30 | 31 | inline vector stoi(string seq) { 32 | uint64_t seqLen = seq.size(); 33 | vector ret(seqLen); 34 | for(uint64_t x = 0; x < seqLen; x++) { 35 | ret[x] = STRING_TO_INT[seq[x]]; 36 | } 37 | return ret; 38 | }; 39 | 40 | inline string itos(vector seq_i) { 41 | uint64_t seqLen = seq_i.size(); 42 | string ret(seqLen, ' '); 43 | for(uint64_t x = 0; x < seqLen; x++) { 44 | ret[x] = INT_TO_STRING[seq_i[x]]; 45 | } 46 | return ret; 47 | }; 48 | } 49 | 50 | #endif -------------------------------------------------------------------------------- /base_bwt.h: -------------------------------------------------------------------------------- 1 | #ifndef BASE_BWT_H 2 | #define BASE_BWT_H 3 | 4 | //C headers 5 | #include 6 | 7 | //C++ headers 8 | #include 9 | #include 10 | 11 | using namespace std; 12 | 13 | enum { 14 | VC_LEN = 6,//$ A C G N T 15 | 16 | LETTER_BITS = 3, //defined 17 | NUMBER_BITS = 5, //8-letterBits 18 | NUM_POWER = 32, //2**numberBits 19 | MASK = 7, //255 >> numberBits 20 | 21 | //These used to be pre-defined, but are set up as user options now 22 | //BIT_POWER = 8, //defined 23 | //BIN_SIZE = 256 //2**self.bitPower 24 | }; 25 | 26 | struct bwtRange { 27 | uint64_t l; 28 | uint64_t h; 29 | }; 30 | 31 | class BaseBWT { 32 | protected: 33 | //loaded from disk 34 | string bwtFN; 35 | vector bwt; 36 | 37 | //constructTotalCounts() 38 | vector totalCounts; 39 | 40 | //constructIndexing() 41 | vector startIndex; 42 | vector endIndex; 43 | uint64_t totalSize; 44 | 45 | //these functions build all auxiliary structures required for the FM-index lookups 46 | void constructTotalCounts(); 47 | void constructIndexing(); 48 | 49 | public: 50 | //constructor and destructor 51 | BaseBWT(); 52 | ~BaseBWT(); 53 | 54 | //basic query functions 55 | uint64_t countKmer(uint8_t * kmer, uint64_t kmerSize); 56 | 57 | //multi-query functions 58 | vector countPileup_i(vector seq, uint64_t kmerSize); 59 | 60 | //query sub-routines 61 | virtual bwtRange constrainRange(uint8_t sym, bwtRange inRange) = 0; 62 | }; 63 | 64 | #endif -------------------------------------------------------------------------------- /string_util.cpp: -------------------------------------------------------------------------------- 1 | 2 | //C headers 3 | #include 4 | 5 | //C++ headers 6 | #include 7 | 8 | //Custom headers 9 | #include "string_util.h" 10 | 11 | namespace string_util { 12 | //this is externed in the header file, so declare it here 13 | vector STRING_TO_INT; 14 | vector INT_TO_STRING; 15 | vector REV_COMP_I; 16 | 17 | void initializeStringUtil() { 18 | //set everything initially to 'N' 19 | STRING_TO_INT = vector(256, 4); 20 | 21 | //now set the specific values 22 | STRING_TO_INT['$'] = 0; 23 | STRING_TO_INT['A'] = 1; 24 | STRING_TO_INT['C'] = 2; 25 | STRING_TO_INT['G'] = 3; 26 | STRING_TO_INT['N'] = 4; 27 | STRING_TO_INT['T'] = 5; 28 | 29 | //set the lower-case ones also 30 | STRING_TO_INT['a'] = 1; 31 | STRING_TO_INT['c'] = 2; 32 | STRING_TO_INT['g'] = 3; 33 | STRING_TO_INT['n'] = 4; 34 | STRING_TO_INT['t'] = 5; 35 | 36 | //do the reverse array also 37 | INT_TO_STRING = vector(6); 38 | INT_TO_STRING[0] = '$'; 39 | INT_TO_STRING[1] = 'A'; 40 | INT_TO_STRING[2] = 'C'; 41 | INT_TO_STRING[3] = 'G'; 42 | INT_TO_STRING[4] = 'N'; 43 | INT_TO_STRING[5] = 'T'; 44 | 45 | //initialize the reverse-complement arrays 46 | REV_COMP_I = vector(6); 47 | REV_COMP_I[0] = 0;//$$ 48 | REV_COMP_I[1] = 5;//AT 49 | REV_COMP_I[2] = 3;//CG 50 | REV_COMP_I[3] = 2;//GC 51 | REV_COMP_I[4] = 4;//NN 52 | REV_COMP_I[5] = 1;//TA 53 | } 54 | } -------------------------------------------------------------------------------- /bit_array.h: -------------------------------------------------------------------------------- 1 | #ifndef BIT_ARRAY_H 2 | #define BIT_ARRAY_H 3 | 4 | //C headers 5 | #include 6 | 7 | //C++ headers 8 | #include 9 | 10 | using namespace std; 11 | 12 | inline void setBit64(vector &bitArray, uint64_t index) { 13 | bitArray[index >> 6] |= ((uint64_t)0x1 << (index & 0x3F)); 14 | } 15 | 16 | //returns the number of set bits in value 17 | inline uint64_t rank64(uint64_t value) { 18 | uint64_t ret = value-((value & (uint64_t)0xAAAAAAAAAAAAAAAA) >> 1); 19 | ret = (ret & (uint64_t)0x3333333333333333) + ((ret >> 2) & (uint64_t)0x3333333333333333); 20 | ret = (ret + (ret >> 4)) & (uint64_t)0x0F0F0F0F0F0F0F0F; 21 | return (ret * (uint64_t)0x0101010101010101) >> 56; 22 | } 23 | 24 | class BitArray { 25 | private: 26 | uint64_t numValues; 27 | vector ba; 28 | vector index; 29 | public: 30 | //constructor 31 | BitArray(uint64_t baLen); 32 | 33 | //use this to fill in the array with values 34 | inline void setBit(uint64_t ind) { 35 | setBit64(this->ba, ind); 36 | } 37 | 38 | //once filled in, use this to build the offsets 39 | void createIndex(uint64_t initialRank=0); 40 | 41 | //currently returns the number of set bits up to and including "pos" 42 | inline uint64_t rank(uint64_t pos) { 43 | //up to and including "pos" 44 | //return this->index[pos >> 6] + rank64(this->ba[pos >> 6] << (~pos & 0x3F)); 45 | //up to but NOT including "pos" 46 | return this->index[pos >> 6] + rank64((this->ba[pos >> 6] << (~pos & 0x3F)) << 1); 47 | } 48 | 49 | //TODO: do we need this for our purposes? I don't think so right now 50 | //inline select(uint64_t rank); 51 | }; 52 | 53 | #endif -------------------------------------------------------------------------------- /example/run_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Download this repository via "git clone https://github.com/holtjma/fmlrc.git" 3 | #cd to "fmlrc/examples" and run "./run_example.sh" 4 | 5 | #download and build ropebwt2 6 | if [ ! -f ./ropebwt2/ropebwt2 ]; then 7 | git clone https://github.com/lh3/ropebwt2.git 8 | cd ropebwt2; make; cd .. 9 | fi 10 | 11 | #download and build fmlrc 12 | if [ ! -f ../fmlrc ]; then 13 | cd ..; make; cd example 14 | fi 15 | 16 | DATADIR="example1" 17 | if [ ! -d ${DATADIR} ]; then 18 | mkdir ${DATADIR} 19 | fi 20 | 21 | #download short-read ecoli 22 | if [ ! -f ${DATADIR}/ERR022075_1.fastq.gz ]; then 23 | curl -o ${DATADIR}/ERR022075_1.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR022/ERR022075/ERR022075_1.fastq.gz 24 | fi 25 | if [ ! -f ${DATADIR}/ERR022075_2.fastq.gz ]; then 26 | curl -o ${DATADIR}/ERR022075_2.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR022/ERR022075/ERR022075_2.fastq.gz 27 | fi 28 | 29 | #download long-read ecoli and convert to fasta format 30 | if [ ! -f ${DATADIR}/PacBioCLR/PacBio_10kb_CLR.fasta ]; then 31 | curl -o ${DATADIR}/Ecoli_MG1655_pacBioToCA.tgz http://files.pacb.com/datasets/secondary-analysis/e-coli-k12-de-novo/1.3.0/Ecoli_MG1655_pacBioToCA.tgz 32 | tar -xvzf ${DATADIR}/Ecoli_MG1655_pacBioToCA.tgz -C ${DATADIR} 33 | awk 'NR%4==1||NR%4==2' ${DATADIR}/PacBioCLR/PacBio_10kb_CLR.fastq | tr "@" ">" > ${DATADIR}/PacBioCLR/PacBio_10kb_CLR.fasta 34 | fi 35 | 36 | #build the bwt 37 | if [ ! -f ${DATADIR}/ecoli_comp_msbwt.npy ]; then 38 | mkdir temp 39 | gunzip -c ${DATADIR}/ERR022075_?.fastq.gz | awk "NR % 4 == 2" | sort -T ./temp | tr NT TN | ./ropebwt2/ropebwt2 -LR | tr NT TN | ../fmlrc-convert ${DATADIR}/ecoli_comp_msbwt.npy 40 | fi 41 | 42 | #run fmlrc 43 | NUM_PROCS=4 44 | ../fmlrc -p $NUM_PROCS -e 400 ${DATADIR}/ecoli_comp_msbwt.npy ${DATADIR}/PacBioCLR/PacBio_10kb_CLR.fasta ${DATADIR}/corrected_final.fa -------------------------------------------------------------------------------- /base_bwt.cpp: -------------------------------------------------------------------------------- 1 | 2 | //C headers 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | //C++ headers 9 | #include 10 | #include 11 | #include 12 | 13 | //Custom headers 14 | #include "string_util.h" 15 | #include "rle_bwt.h" 16 | 17 | using namespace std; 18 | 19 | BaseBWT::BaseBWT() { 20 | 21 | } 22 | 23 | void BaseBWT::constructTotalCounts() { 24 | //the setup 25 | this->totalCounts = vector(VC_LEN, 0); 26 | uint8_t prevChar = 255; 27 | uint8_t currentChar; 28 | uint64_t powerMultiple = 1; 29 | uint64_t bwtSize = this->bwt.size(); 30 | uint64_t currentCount; 31 | 32 | //go through each run and add the symbol counts 33 | for(uint64_t x = 0; x < bwtSize; x++) { 34 | currentChar = this->bwt[x] & MASK; 35 | if(currentChar == prevChar) { 36 | powerMultiple *= NUM_POWER; 37 | } 38 | else { 39 | powerMultiple = 1; 40 | } 41 | prevChar = currentChar; 42 | currentCount = (this->bwt[x] >> LETTER_BITS)* powerMultiple; 43 | this->totalCounts[currentChar] += currentCount; 44 | } 45 | } 46 | 47 | void BaseBWT::constructIndexing() { 48 | this->startIndex = vector(VC_LEN, 0); 49 | this->endIndex = vector(VC_LEN, 0); 50 | 51 | uint64_t pos = 0; 52 | for(uint64_t x = 0; x < VC_LEN; x++) { 53 | this->startIndex[x] = pos; 54 | pos += this->totalCounts[x]; 55 | this->endIndex[x] = pos; 56 | } 57 | this->totalSize = pos; 58 | } 59 | 60 | BaseBWT::~BaseBWT() { 61 | 62 | } 63 | 64 | 65 | uint64_t BaseBWT::countKmer(uint8_t * kmer, uint64_t kmerSize) { 66 | bwtRange ret; 67 | ret.l = 0; 68 | ret.h = this->totalSize; 69 | 70 | for(int64_t x = kmerSize-1; x >= 0 && ret.l != ret.h; x--) { 71 | ret = this->constrainRange(kmer[x], ret); 72 | } 73 | 74 | return ret.h-ret.l; 75 | } 76 | 77 | vector BaseBWT::countPileup_i(vector seq, uint64_t kmerSize) { 78 | uint64_t seqLen = seq.size(); 79 | if(seqLen < kmerSize) { 80 | return vector(0); 81 | } 82 | 83 | uint64_t numCounts = seqLen-kmerSize+1; 84 | vector ret = vector(numCounts); 85 | 86 | vector revComp = string_util::reverseComplement_i(seq); 87 | 88 | for(uint64_t x = 0; x < numCounts; x++) { 89 | ret[x] = this->countKmer(&seq[x], kmerSize)+this->countKmer(&revComp[seqLen-kmerSize-x], kmerSize); 90 | } 91 | return ret; 92 | } 93 | -------------------------------------------------------------------------------- /alignment_util.cpp: -------------------------------------------------------------------------------- 1 | 2 | //C headers 3 | #include 4 | 5 | //C++ headers 6 | #include 7 | 8 | //my custom headers 9 | #include "alignment_util.h" 10 | 11 | using namespace std; 12 | 13 | //This code is essentially an adaptation of the algorithms on wikipedia: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#C.2B.2B 14 | 15 | uint64_t editDistance(const vector &s1, const vector &s2) 16 | { 17 | //modified version of https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#C.2B.2B 18 | const std::size_t len1 = s1.size(), len2 = s2.size(); 19 | std::vector col(len2+1), prevCol(len2+1); 20 | 21 | for (unsigned int i = 0; i < prevCol.size(); i++) 22 | prevCol[i] = i; 23 | for (unsigned int i = 0; i < len1; i++) { 24 | col[0] = i+1; 25 | for (unsigned int j = 0; j < len2; j++) 26 | // note that std::min({arg1, arg2, arg3}) works only in C++11, 27 | // for C++98 use std::min(std::min(arg1, arg2), arg3) 28 | //col[j+1] = std::min({ prevCol[1 + j] + 1, col[j] + 1, prevCol[j] + (s1[i]==s2[j] ? 0 : 1) }); 29 | col[j+1] = std::min(std::min(prevCol[1+j]+1, col[j]+1), prevCol[j]+(s1[i]==s2[j] ? 0 : 1)); 30 | col.swap(prevCol); 31 | } 32 | return prevCol[len2]; 33 | } 34 | 35 | pair editDistance_minimize(const vector &s1, const vector &s2) 36 | { 37 | //modified version of https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#C.2B.2B 38 | const std::size_t len1 = s1.size(), len2 = s2.size(); 39 | std::vector col(len2+1), prevCol(len2+1); 40 | 41 | for (unsigned int i = 0; i < prevCol.size(); i++) 42 | prevCol[i] = i; 43 | for (unsigned int i = 0; i < len1; i++) { 44 | col[0] = i+1; 45 | for (unsigned int j = 0; j < len2; j++) 46 | // note that std::min({arg1, arg2, arg3}) works only in C++11, 47 | // for C++98 use std::min(std::min(arg1, arg2), arg3) 48 | //col[j+1] = std::min({ prevCol[1 + j] + 1, col[j] + 1, prevCol[j] + (s1[i]==s2[j] ? 0 : 1) }); 49 | col[j+1] = std::min(std::min(prevCol[1+j]+1, col[j]+1), prevCol[j]+(s1[i]==s2[j] ? 0 : 1)); 50 | col.swap(prevCol); 51 | } 52 | 53 | //get the last smallest value 54 | uint64_t argMin = len2; 55 | for(int64_t x = len2-1; x >= 0; x--) { 56 | if(prevCol[x] < prevCol[argMin]) argMin = x; 57 | } 58 | 59 | //pick the smallest score 60 | pair ret;// = scoreArray[oLen][mLen][choice]; 61 | ret.first = prevCol[argMin]; 62 | ret.second = argMin; 63 | return ret; 64 | } -------------------------------------------------------------------------------- /file_iterators.cpp: -------------------------------------------------------------------------------- 1 | 2 | //C headers 3 | #include 4 | 5 | //C++ headers 6 | #include 7 | #include 8 | #include 9 | 10 | //my headers 11 | #include "file_iterators.h" 12 | 13 | using namespace std; 14 | 15 | FastaIterator::FastaIterator(string fastFN) { 16 | //open the file 17 | this->fastFN = fastFN; 18 | this->ifp.open(this->fastFN); 19 | this->is_fq = fastFN.at(fastFN.length() - 1) == 'q'; 20 | 21 | //read the first line so we're in the correct state 22 | this->isMoreData = (bool)getline(this->ifp, this->nextLine); 23 | } 24 | 25 | struct LongReadFA FastaIterator::getNextRead() { 26 | struct LongReadFA ret; 27 | ret.label = this->nextLine; 28 | 29 | if(this->is_fq) { 30 | // reads only sequence data from a FASTQ file, ignoring quality strings 31 | ret.label.replace(0,1,">"); // change fastq to fasta format label line 32 | 33 | // sequence 34 | getline(this->ifp, ret.seq); 35 | // qual header '+' 36 | getline(this->ifp, this->nextLine); 37 | // quality string 38 | getline(this->ifp, this->nextLine); 39 | // next label (if any) 40 | this->isMoreData = (bool)getline(this->ifp, this->nextLine); 41 | 42 | return ret; 43 | } 44 | 45 | vector seqFrags = vector(); 46 | uint64_t seqLen = 0; 47 | 48 | while(getline(this->ifp, this->nextLine)) { 49 | if(this->nextLine[0] == '>') { 50 | //put the string together and return 51 | ret.seq.resize(seqLen); 52 | uint64_t currPos = 0; 53 | for(uint64_t x = 0; x < seqFrags.size(); x++) { 54 | ret.seq.replace(currPos, seqFrags[x].size(), seqFrags[x]); 55 | currPos += seqFrags[x].size(); 56 | } 57 | return ret; 58 | } 59 | else { 60 | //push back a fragments 61 | seqFrags.push_back(this->nextLine); 62 | seqLen += this->nextLine.size(); 63 | } 64 | } 65 | 66 | //we hit the last line 67 | this->isMoreData = false; 68 | 69 | //put the string together and return 70 | ret.seq.resize(seqLen); 71 | uint64_t currPos = 0; 72 | for(uint64_t x = 0; x < seqFrags.size(); x++) { 73 | ret.seq.replace(currPos, seqFrags[x].size(), seqFrags[x]); 74 | currPos += seqFrags[x].size(); 75 | } 76 | return ret; 77 | } 78 | 79 | FastaWriter::FastaWriter(string fastaFN, int symsPerLine) { 80 | this->fastaFN = fastaFN; 81 | this->symsPerLine = symsPerLine; 82 | 83 | this->ofp.open(this->fastaFN); 84 | } 85 | 86 | FastaWriter::~FastaWriter() { 87 | this->ofp.close(); 88 | } 89 | 90 | bool FastaWriter::writeRead(LongReadFA r) { 91 | this->ofp << r.label << "\n"; 92 | for(uint64_t x = 0; x < r.seq.size(); x += this->symsPerLine) { 93 | this->ofp << r.seq.substr(x, this->symsPerLine) << "\n"; 94 | } 95 | return true; 96 | } 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FMLRC 2 | ## Update - Version 2 3 | FMLRC has been succeeded by `fmlrc2` ([https://github.com/HudsonAlpha/rust-fmlrc](https://github.com/HudsonAlpha/rust-fmlrc)). 4 | Preliminary results show `fmlrc2` has near identical results, but runs in <50% of the time. 5 | It is also implemented in Rust, leveraging the cargo ecosystem. 6 | Assuming the results are satisfactory for your use case, we recommend switching to `fmlrc2` for the reduced run-time and to receive future updates to the algorithm. 7 | 8 | ## Introduction 9 | FMLRC, or FM-index Long Read Corrector, is a tool for performing hybrid correction of long read sequencing using the BWT and FM-index of short-read sequencing data. 10 | Given a BWT of the short-read sequencing data, FMLRC will build an FM-index and use that as an implicit de Bruijn graph. 11 | Each long read is then corrected independently by identifying low frequency k-mers in the long read and replacing them with the closest matching high frequency k-mers in the implicit de Bruijn graph. 12 | In contrast to other de Bruijn graph based implementations, FMLRC is not restricted to a particular k-mer size and instead uses a two pass method with both a short "k-mer" and a longer "K-mer". 13 | This allows FMLRC to correct through low complexity regions that are computational difficult for short k-mers. 14 | 15 | Included in this package are two implementations of the FM-index component of FMLRC. 16 | The default implementation is requires less CPU time but uses a higher sampled FM-index that requires more memory. 17 | The second implementation is more similar to a traditional sampled FM-index that requires less memory, but at the cost of longer computation times. 18 | Both implementation handle parallelization by distributing the reads across all available threads. 19 | 20 | ## Quick-start 21 | A full example is available in the `example` subfolder. Please refer to the [README](https://github.com/holtjma/fmlrc/tree/master/example) for directions. 22 | 23 | ## Installation and Setup 24 | First, download the latest version of FMLRC and unzip it. Then simply make the program and run it with the "-h" option to verify it installed. 25 | 26 | cd fmlrc 27 | make 28 | ./fmlrc -h 29 | 30 | ## Building the short-read BWT 31 | Prior to running FMLRC, a BWT of the short-read sequencing data needs to be constructed. 32 | Currently, the implementation expects it to be in the Run-Length Encoded (RLE) format of the [*msbwt*](https://github.com/holtjma/msbwt) python package. 33 | We recommend building the BWT using [*ropebwt2*](https://github.com/lh3/ropebwt2) by following the instructions on [Converting to the fmlrc RLE-BWT format](https://github.com/holtjma/fmlrc/wiki/Converting-to-the-fmlrc-RLE-BWT-format). 34 | Alternatively, the *msbwt* package can directly build these BWTs ([Constructing the BWT wiki](https://github.com/holtjma/msbwt/wiki/Constructing-the-MSBWT)), but it may be slower and less memory efficient. 35 | 36 | ## Running FMLRC 37 | Once a short-read BWT is constructed, the execution of FMLRC is relatively simple: 38 | 39 | ./fmlrc [options] 40 | 41 | Here is a partial list of the more useful options of FMLRC: 42 | 43 | * -k - sets the length for the short k-mer pass (default: 21) 44 | * -K - sets the length for the long K-mer pass (default: 59) 45 | * -p - sets the number of threads allowed for correction (default: 1) 46 | 47 | ## Reference 48 | 49 | [Wang, Jeremy R. and Holt, James and McMillan, Leonard and Jones, Corbin D. FMLRC: Hybrid long read error correction using an FM-index. BMC Bioinformatics, 2018. 19 (1) 50.](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2051-3) 50 | -------------------------------------------------------------------------------- /csa_bwt.cpp: -------------------------------------------------------------------------------- 1 | 2 | //C headers 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | //C++ headers 9 | #include 10 | #include 11 | #include 12 | 13 | //Custom headers 14 | #include "string_util.h" 15 | #include "csa_bwt.h" 16 | 17 | using namespace std; 18 | 19 | CSA_BWT::CSA_BWT(string inFN, bool storeD) { 20 | this->bwtFN = inFN; 21 | 22 | //get the bwt file size 23 | struct stat bwt_st; 24 | stat(this->bwtFN.c_str(), &bwt_st); 25 | 26 | //get ready to read the bwt 27 | ifstream bwtIn(this->bwtFN, ios::in | ios::binary); 28 | this->bwt = vector(1000); 29 | 30 | //read the numpy header: http://docs.scipy.org/doc/numpy-1.10.1/neps/npy-format.html 31 | bwtIn.read((char*)&this->bwt[0], 16); 32 | uint16_t headerLen = this->bwt[8]+256*(int)this->bwt[9]; 33 | uint16_t skipBytes = 10+headerLen; 34 | if(skipBytes % 16 != 0) { 35 | skipBytes = ((skipBytes / 16)+1)*16; 36 | } 37 | bwtIn.read((char*)&this->bwt[0], skipBytes - 16); 38 | 39 | //finally allocate and load the bwt 40 | uint64_t bwtDiskSize = bwt_st.st_size - skipBytes; 41 | this->bwt = vector(bwtDiskSize); 42 | bwtIn.read((char*)&this->bwt[0], bwtDiskSize); 43 | bwtIn.close(); 44 | printf("loaded bwt with %lu compressed values\n", this->bwt.size()); 45 | 46 | //first get the total symbol counts 47 | this->constructTotalCounts(); 48 | 49 | //build some auxiliary indices 50 | this->constructIndexing(); 51 | 52 | //now build the FM-index 53 | this->constructFMIndex(storeD); 54 | 55 | //now delete the original BWT 56 | this->bwt = vector(0); 57 | } 58 | 59 | void CSA_BWT::constructFMIndex(bool storeD) { 60 | //figure out the number of entries and pre-allocate 61 | 62 | this->csa = vector(VC_LEN); 63 | for(int x = 0; x < VC_LEN; x++) { 64 | if(x != 0 || storeD) this->csa[x] = new BitArray(this->totalSize); 65 | else this->csa[x] = new BitArray(1); //this is just a dummy to get deleted later 66 | } 67 | 68 | uint8_t prevChar = 0; 69 | uint64_t totalCharCount = 0; 70 | uint64_t powerMultiple = 1; 71 | //uint64_t binEnd = 0; 72 | //uint64_t binID = 0; 73 | uint64_t bwtIndex = 0; 74 | //uint64_t prevStart = 0; 75 | uint8_t currentChar; 76 | 77 | //vector setCount = vector(VC_LEN, 0); 78 | 79 | //go through each run in the BWT and set FM-indices as we go 80 | uint64_t numBytes = this->bwt.size(); 81 | for(uint64_t x = 0; x < numBytes; x++) { 82 | //printf("%d %d\n", x, numBytes); 83 | currentChar = this->bwt[x] & MASK; 84 | if(currentChar == prevChar) { 85 | totalCharCount += (this->bwt[x] >> LETTER_BITS) * powerMultiple; 86 | powerMultiple *= NUM_POWER; 87 | } 88 | else { 89 | //first save the current FM-index entry 90 | if(prevChar != 0 || storeD) { 91 | for(uint64_t y = bwtIndex; y < bwtIndex+totalCharCount; y++) { 92 | this->csa[prevChar]->setBit(y); 93 | //setCount[prevChar] += 1; 94 | } 95 | } 96 | 97 | //now add the previous 98 | bwtIndex += totalCharCount; 99 | prevChar = currentChar; 100 | totalCharCount = this->bwt[x] >> LETTER_BITS; 101 | powerMultiple = NUM_POWER; 102 | } 103 | } 104 | 105 | if(prevChar != 0 || storeD) { 106 | for(uint64_t y = bwtIndex; y < bwtIndex+totalCharCount; y++) { 107 | this->csa[prevChar]->setBit(y); 108 | } 109 | } 110 | 111 | for(uint64_t x = 0; x < VC_LEN; x++) { 112 | if(x != 0 || storeD) this->csa[x]->createIndex(this->startIndex[x]); 113 | } 114 | } 115 | 116 | CSA_BWT::~CSA_BWT() { 117 | for(int x = 0; x < VC_LEN; x++) { 118 | delete this->csa[x]; 119 | } 120 | } 121 | 122 | bwtRange CSA_BWT::constrainRange(uint8_t sym, bwtRange inRange) { 123 | //first find the low value 124 | bwtRange ret; 125 | ret.l = this->csa[sym]->rank(inRange.l); 126 | ret.h = this->csa[sym]->rank(inRange.h); 127 | return ret; 128 | }; -------------------------------------------------------------------------------- /rle_bwt.cpp: -------------------------------------------------------------------------------- 1 | 2 | //C headers 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | //C++ headers 9 | #include 10 | #include 11 | #include 12 | 13 | //Custom headers 14 | #include "string_util.h" 15 | #include "rle_bwt.h" 16 | 17 | using namespace std; 18 | 19 | RLE_BWT::RLE_BWT(string inFN, uint8_t bitPower) { 20 | this->bwtFN = inFN; 21 | this->bitPower = bitPower; 22 | this->binSize = pow(2, this->bitPower); 23 | 24 | //get the bwt file size 25 | struct stat bwt_st; 26 | stat(this->bwtFN.c_str(), &bwt_st); 27 | 28 | //get ready to read the bwt 29 | ifstream bwtIn(this->bwtFN, ios::in | ios::binary); 30 | this->bwt = vector(1000); 31 | 32 | //read the numpy header: http://docs.scipy.org/doc/numpy-1.10.1/neps/npy-format.html 33 | bwtIn.read((char*)&this->bwt[0], 16); 34 | uint16_t headerLen = this->bwt[8]+256*(int)this->bwt[9]; 35 | uint16_t skipBytes = 10+headerLen; 36 | if(skipBytes % 16 != 0) { 37 | skipBytes = ((skipBytes / 16)+1)*16; 38 | } 39 | bwtIn.read((char*)&this->bwt[0], skipBytes-16); 40 | 41 | //finally allocate and load the bwt 42 | uint64_t bwtDiskSize = bwt_st.st_size - skipBytes; 43 | this->bwt = vector(bwtDiskSize); 44 | bwtIn.read((char*)&this->bwt[0], bwtDiskSize); 45 | bwtIn.close(); 46 | printf("loaded bwt with %lu compressed values\n", this->bwt.size()); 47 | 48 | //first get the total symbol counts 49 | this->constructTotalCounts(); 50 | 51 | //build some auxiliary indices 52 | this->constructIndexing(); 53 | 54 | //now build the FM-index 55 | this->constructFMIndex(); 56 | } 57 | 58 | void RLE_BWT::constructFMIndex() { 59 | //figure out the number of entries and pre-allocate 60 | //uint64_t samplingSize = (uint64_t)ceil(((float)this->totalSize+1)/this->binSize); 61 | uint64_t samplingSize = (uint64_t)ceil(((float)this->totalSize+1)/this->binSize)+1; 62 | this->fmIndex = new uint64_t*[VC_LEN]; 63 | for(int x = 0; x < VC_LEN; x++) { 64 | this->fmIndex[x] = new uint64_t[samplingSize]; 65 | } 66 | this->refFM = vector(samplingSize, 0); 67 | 68 | uint8_t prevChar = 0; 69 | uint64_t totalCharCount = 0; 70 | uint64_t powerMultiple = 1; 71 | uint64_t binEnd = 0; 72 | uint64_t binID = 0; 73 | uint64_t bwtIndex = 0; 74 | uint64_t prevStart = 0; 75 | uint8_t currentChar; 76 | 77 | vector countsSoFar = vector(VC_LEN); 78 | for(int x = 0; x < VC_LEN; x++) { 79 | countsSoFar[x] = this->startIndex[x]; 80 | } 81 | 82 | //go through each run in the BWT and set FM-indices as we go 83 | uint64_t numBytes = this->bwt.size(); 84 | for(uint64_t x = 0; x < numBytes; x++) { 85 | currentChar = this->bwt[x] & MASK; 86 | if(currentChar == prevChar) { 87 | totalCharCount += (this->bwt[x] >> LETTER_BITS) * powerMultiple; 88 | powerMultiple *= NUM_POWER; 89 | } 90 | else { 91 | //first save the current FM-index entry 92 | while(bwtIndex+totalCharCount >= binEnd) { 93 | this->refFM[binID] = prevStart; 94 | for(int y = 0; y < VC_LEN; y++) { 95 | this->fmIndex[y][binID] = countsSoFar[y]; 96 | } 97 | binEnd += this->binSize; 98 | binID++; 99 | } 100 | 101 | //now add the previous 102 | countsSoFar[prevChar] += totalCharCount; 103 | bwtIndex += totalCharCount; 104 | 105 | prevChar = currentChar; 106 | prevStart = x; 107 | totalCharCount = this->bwt[x] >> LETTER_BITS; 108 | powerMultiple = NUM_POWER; 109 | } 110 | } 111 | 112 | while(bwtIndex+totalCharCount >= binEnd) { 113 | this->refFM[binID] = prevStart; 114 | for(int y = 0; y < VC_LEN; y++) { 115 | this->fmIndex[y][binID] = countsSoFar[y]; 116 | } 117 | binEnd += this->binSize; 118 | binID++; 119 | } 120 | 121 | //set the last entry 122 | countsSoFar[prevChar] += totalCharCount;//forces countSoFar to hold the very end FM-index entry 123 | this->refFM[samplingSize-1] = numBytes; //need to point to the index at the end 124 | for(int y = 0; y < VC_LEN; y++) { 125 | this->fmIndex[y][samplingSize-1] = countsSoFar[y]; 126 | } 127 | 128 | //calculate the total offsetSum 129 | this->offsetSum = 0; 130 | for(int x = 0; x < VC_LEN; x++) { 131 | this->offsetSum += this->fmIndex[x][0]; 132 | } 133 | } 134 | 135 | RLE_BWT::~RLE_BWT() { 136 | for(int x = 0; x < VC_LEN; x++) { 137 | delete this->fmIndex[x]; 138 | } 139 | delete this->fmIndex; 140 | } 141 | 142 | bwtRange RLE_BWT::constrainRange(uint8_t sym, bwtRange inRange) { 143 | //first find the low value 144 | uint64_t binID = inRange.l >> this->bitPower; 145 | uint64_t compressedIndex = this->refFM[binID]; 146 | uint64_t bwtIndex = 0; 147 | for(uint64_t x = 0; x < VC_LEN; x++) { 148 | bwtIndex += this->fmIndex[x][binID]; 149 | } 150 | bwtIndex -= this->offsetSum; 151 | 152 | bwtRange ret; 153 | ret.l = this->fmIndex[sym][binID]; 154 | 155 | /* 156 | Dear future Matt, 157 | You have already tried using shifts (<<, >>) instead of multiplication, no effect (if anything it was worse). 158 | Sincerely, 159 | Pass Matt 160 | */ 161 | uint8_t prevChar = 255; 162 | uint8_t currentChar; 163 | uint64_t prevCount = 0; 164 | uint64_t powerMultiple = 1; 165 | 166 | while(bwtIndex+prevCount < inRange.l) { 167 | currentChar = this->bwt[compressedIndex] & MASK; 168 | if(currentChar == prevChar) { 169 | prevCount += (this->bwt[compressedIndex] >> LETTER_BITS) * powerMultiple; 170 | powerMultiple *= NUM_POWER; 171 | } 172 | else { 173 | if(prevChar == sym) ret.l += prevCount; 174 | 175 | bwtIndex += prevCount; 176 | prevCount = this->bwt[compressedIndex] >> LETTER_BITS; 177 | prevChar = currentChar; 178 | powerMultiple = NUM_POWER; 179 | } 180 | compressedIndex++; 181 | } 182 | 183 | uint64_t tempC = ret.l; 184 | if(prevChar == sym) ret.l += inRange.l - bwtIndex; 185 | 186 | //now find the high value 187 | uint64_t binID_h = inRange.h >> this->bitPower; 188 | if(binID == binID_h) ret.h = tempC; 189 | else { 190 | compressedIndex = this->refFM[binID_h]; 191 | bwtIndex = 0; 192 | for(uint64_t x = 0; x < VC_LEN; x++) { 193 | bwtIndex += this->fmIndex[x][binID_h]; 194 | } 195 | bwtIndex -= this->offsetSum; 196 | 197 | ret.h = this->fmIndex[sym][binID_h]; 198 | 199 | prevChar = 255; 200 | prevCount = 0; 201 | powerMultiple = 1; 202 | } 203 | 204 | while(bwtIndex+prevCount < inRange.h) { 205 | currentChar = this->bwt[compressedIndex] & MASK; 206 | if(currentChar == prevChar) { 207 | prevCount += (this->bwt[compressedIndex] >> LETTER_BITS) * powerMultiple; 208 | powerMultiple *= NUM_POWER; 209 | } 210 | else { 211 | if(prevChar == sym) ret.h += prevCount; 212 | 213 | bwtIndex += prevCount; 214 | prevCount = this->bwt[compressedIndex] >> LETTER_BITS; 215 | prevChar = currentChar; 216 | powerMultiple = NUM_POWER; 217 | } 218 | compressedIndex++; 219 | } 220 | 221 | if(prevChar == sym) ret.h += inRange.h - bwtIndex; 222 | return ret; 223 | } 224 | -------------------------------------------------------------------------------- /converter/converter_main.cpp: -------------------------------------------------------------------------------- 1 | 2 | //C headers 3 | #include 4 | #include 5 | #include 6 | 7 | //C++ headers 8 | #include 9 | #include 10 | #include 11 | 12 | struct Parameters { 13 | bool FORCE_OVERWRITE; 14 | bool USE_STDIN; 15 | std::string filename; 16 | }; 17 | 18 | const std::string VERSION = "1.0.0"; 19 | 20 | int runConverter(Parameters myParams, char * outFN) { 21 | FILE * inputStream; 22 | if(myParams.USE_STDIN) { 23 | inputStream = stdin; 24 | printf("[fmlrc-convert] Reading from stdin\n"); 25 | } 26 | else { 27 | inputStream = fopen(myParams.filename.c_str(), "r"); 28 | printf("[fmlrc-convert] Reading from \"%s\"\n", myParams.filename.c_str()); 29 | } 30 | 31 | FILE * outputStream = fopen(outFN, "w+"); 32 | 33 | //TODO: lots of header related items 34 | unsigned long BUFFER_SIZE = 1024; 35 | unsigned char buffer[BUFFER_SIZE]; 36 | 37 | //most of the files I've seen are 80 and '\x46', I'm increasing it just in case 38 | unsigned long headerSize = 96; 39 | std::string headerHex = "\x56"; 40 | unsigned long x; 41 | for(x=0; x < headerSize-1; x++) { 42 | buffer[x] = 32; //hex value 20 = ' ' 43 | } 44 | buffer[headerSize-1] = 10; //hex value 0a = '\n' 45 | fwrite(buffer, 1, headerSize, outputStream); 46 | 47 | //set up the translation, default is 255 48 | unsigned char translator[256]; 49 | std::fill_n(translator, 256, 255); 50 | 51 | std::string validSymbols = "$ACGNT"; 52 | x = 0; 53 | for(char &c: validSymbols) { 54 | translator[(unsigned char)c] = x; 55 | x++; 56 | } 57 | unsigned long symCount[6] = {0}; 58 | 59 | //first read 60 | unsigned long readBytes = fread(buffer, 1, BUFFER_SIZE, inputStream); 61 | 62 | unsigned char currSym = buffer[0]; 63 | unsigned long currCount = 0; 64 | unsigned char writeByte; 65 | unsigned long bytesWritten = 0; 66 | 67 | //core loop 68 | while(readBytes > 0) { 69 | for(x = 0; x < readBytes; x++) { 70 | if(currSym == buffer[x]) { 71 | currCount++; 72 | } 73 | else { 74 | //check for invalid character; if it's the new line symbol, we will ignore it 75 | if(translator[currSym] == 255) { 76 | if(currSym != 10){ 77 | printf("[fmlrc-convert] ERROR - unexpected symbol in input: char: \"%c\", hex: \"%x\"\n", currSym, currSym); 78 | return 1; 79 | } 80 | } 81 | else { 82 | //we are at the end of the run so handle it 83 | symCount[translator[currSym]] += currCount; 84 | while(currCount > 0) { 85 | writeByte = translator[currSym] | ((currCount & 0x1F) << 3); 86 | fwrite(&writeByte, 1, 1, outputStream); 87 | currCount = currCount >> 5; 88 | bytesWritten += 1; 89 | } 90 | 91 | //get the next symbol since it's valid and start a new run 92 | currSym = buffer[x]; 93 | currCount = 1; 94 | } 95 | } 96 | } 97 | 98 | //get the next batch to parse through 99 | readBytes = fread(buffer, 1, BUFFER_SIZE, inputStream); 100 | } 101 | 102 | //handle the last run 103 | if(translator[currSym] == 255){ 104 | if(currSym != 10){ 105 | printf("[fmlrc-convert] ERROR - unexpected symbol in input: char: \"%c\", hex: \"%x\"\n", currSym, currSym); 106 | return 1; 107 | } 108 | } 109 | else { 110 | //we are at the end of the last run so handle it 111 | symCount[translator[currSym]] += currCount; 112 | while(currCount > 0) { 113 | writeByte = translator[currSym] | ((currCount & 0x1F) << 3); 114 | fwrite(&writeByte, 1, 1, outputStream); 115 | currCount = currCount >> 5; 116 | bytesWritten += 1; 117 | } 118 | 119 | //clear these 120 | currSym = 0; 121 | currCount = 0; 122 | } 123 | 124 | //we have finished the compression part, close the input file 125 | fclose(inputStream); 126 | 127 | //now that we know the total length, fill in the bytes for our header 128 | //have to do some special things due to the \x00 characters; might be a better way to do this 129 | std::string initialWrite = "\x93NUMPY\x01"; 130 | initialWrite.push_back('\0'); 131 | initialWrite.push_back(headerHex.c_str()[0]); 132 | initialWrite.push_back('\0'); 133 | initialWrite += "{\'descr\': \'|u1\', \'fortran_order\': False, \'shape\': ("; 134 | initialWrite += std::to_string(bytesWritten); 135 | initialWrite += ",), }"; 136 | 137 | fseek(outputStream, 0, SEEK_SET); 138 | fwrite(initialWrite.c_str(), 1, initialWrite.length(), outputStream); 139 | //printf("init write len: %lu\n", initialWrite.length()); 140 | //finally close it all out 141 | fclose(outputStream); 142 | 143 | printf("[fmlrc-convert] symbol counts ($, A, C, G, N, T) = (%ld, %ld, %ld, %ld, %ld, %ld)\n", symCount[0], symCount[1], symCount[2], symCount[3], symCount[4], symCount[5]); 144 | printf("[fmlrc-convert] RLE-BWT byte length: %ld\n", bytesWritten); 145 | printf("[fmlrc-convert] RLE-BWT conversion complete.\n"); 146 | return 0; 147 | } 148 | 149 | int main(int argc, char* argv[]) { 150 | 151 | ////////////////////////////////////////////////////////// 152 | //DEFAULT PARAMETERS 153 | Parameters myParams; 154 | myParams.FORCE_OVERWRITE = false; //if true, this will silently overwrite the output file if it exists 155 | myParams.USE_STDIN = true; //if true, we will read all input from stdin 156 | myParams.filename = ""; //if the previous parameter is false, this is the filename we are reading from 157 | ////////////////////////////////////////////////////////// 158 | 159 | char opt; 160 | bool helpRequest = false; 161 | while((opt = getopt(argc, argv, "hvfi:")) != -1) { 162 | if(opt == 'h') helpRequest = true; 163 | else if(opt == 'v') { 164 | printf("[fmlrc-convert] version %s\n", VERSION.c_str()); 165 | return 0; 166 | } 167 | else if(opt == 'i') { 168 | myParams.USE_STDIN = false; 169 | myParams.filename = optarg; 170 | }else if (opt == 'f') myParams.FORCE_OVERWRITE = true; 171 | else printf("[fmlrc-convert] UNHANDLED OPTION: %d %c %s\n", optind, opt, optarg); 172 | } 173 | 174 | if(argc-optind < 1 || helpRequest) { 175 | printf("Usage: fmlrc-convert [options] \n"); 176 | printf("Options: -h print help menu\n"); 177 | printf(" -v print version number and exit\n"); 178 | printf(" -f force overwrite of existing file (default: false)\n"); 179 | printf(" -i STR the plain text BWT file to be converted into msbwt format (default: stdin)\n"); 180 | return 0; 181 | } 182 | 183 | //Input error checking 184 | char * bwtFN = argv[optind]; 185 | struct stat buffer; 186 | if(!myParams.FORCE_OVERWRITE && stat(bwtFN, &buffer) == 0) { 187 | printf("[fmlrc-convert] ERROR: output file already exists, use -f to force overwrite\n"); 188 | return 1; 189 | } 190 | 191 | if(!myParams.USE_STDIN && stat(myParams.filename.c_str(), &buffer) != 0) { 192 | printf("[fmlrc-convert] ERROR: input filename does not exist\n"); 193 | return 1; 194 | } 195 | 196 | //lets do the things 197 | return runConverter(myParams, bwtFN); 198 | } -------------------------------------------------------------------------------- /CTPL/ctpl_stl.h: -------------------------------------------------------------------------------- 1 | /********************************************************* 2 | * 3 | * Copyright (C) 2014 by Vitaliy Vitsentiy 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | *********************************************************/ 18 | 19 | 20 | #ifndef __ctpl_stl_thread_pool_H__ 21 | #define __ctpl_stl_thread_pool_H__ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | 34 | 35 | // thread pool to run user's functors with signature 36 | // ret func(int id, other_params) 37 | // where id is the index of the thread that runs the functor 38 | // ret is some return type 39 | 40 | 41 | namespace ctpl { 42 | 43 | namespace detail { 44 | template 45 | class Queue { 46 | public: 47 | bool push(T const & value) { 48 | std::unique_lock lock(this->mutex); 49 | this->q.push(value); 50 | return true; 51 | } 52 | // deletes the retrieved element, do not use for non integral types 53 | bool pop(T & v) { 54 | std::unique_lock lock(this->mutex); 55 | if (this->q.empty()) 56 | return false; 57 | v = this->q.front(); 58 | this->q.pop(); 59 | return true; 60 | } 61 | bool empty() { 62 | std::unique_lock lock(this->mutex); 63 | return this->q.empty(); 64 | } 65 | private: 66 | std::queue q; 67 | std::mutex mutex; 68 | }; 69 | } 70 | 71 | class thread_pool { 72 | 73 | public: 74 | 75 | thread_pool() { this->init(); } 76 | thread_pool(int nThreads) { this->init(); this->resize(nThreads); } 77 | 78 | // the destructor waits for all the functions in the queue to be finished 79 | ~thread_pool() { 80 | this->stop(true); 81 | } 82 | 83 | // get the number of running threads in the pool 84 | int size() { return static_cast(this->threads.size()); } 85 | 86 | // number of idle threads 87 | int n_idle() { return this->nWaiting; } 88 | std::thread & get_thread(int i) { return *this->threads[i]; } 89 | 90 | // change the number of threads in the pool 91 | // should be called from one thread, otherwise be careful to not interleave, also with this->stop() 92 | // nThreads must be >= 0 93 | void resize(int nThreads) { 94 | if (!this->isStop && !this->isDone) { 95 | int oldNThreads = static_cast(this->threads.size()); 96 | if (oldNThreads <= nThreads) { // if the number of threads is increased 97 | this->threads.resize(nThreads); 98 | this->flags.resize(nThreads); 99 | 100 | for (int i = oldNThreads; i < nThreads; ++i) { 101 | this->flags[i] = std::make_shared>(false); 102 | this->set_thread(i); 103 | } 104 | } 105 | else { // the number of threads is decreased 106 | for (int i = oldNThreads - 1; i >= nThreads; --i) { 107 | *this->flags[i] = true; // this thread will finish 108 | this->threads[i]->detach(); 109 | } 110 | { 111 | // stop the detached threads that were waiting 112 | std::unique_lock lock(this->mutex); 113 | this->cv.notify_all(); 114 | } 115 | this->threads.resize(nThreads); // safe to delete because the threads are detached 116 | this->flags.resize(nThreads); // safe to delete because the threads have copies of shared_ptr of the flags, not originals 117 | } 118 | } 119 | } 120 | 121 | // empty the queue 122 | void clear_queue() { 123 | std::function * _f; 124 | while (this->q.pop(_f)) 125 | delete _f; // empty the queue 126 | } 127 | 128 | // pops a functional wrapper to the original function 129 | std::function pop() { 130 | std::function * _f = nullptr; 131 | this->q.pop(_f); 132 | std::unique_ptr> func(_f); // at return, delete the function even if an exception occurred 133 | std::function f; 134 | if (_f) 135 | f = *_f; 136 | return f; 137 | } 138 | 139 | // wait for all computing threads to finish and stop all threads 140 | // may be called asynchronously to not pause the calling thread while waiting 141 | // if isWait == true, all the functions in the queue are run, otherwise the queue is cleared without running the functions 142 | void stop(bool isWait = false) { 143 | if (!isWait) { 144 | if (this->isStop) 145 | return; 146 | this->isStop = true; 147 | for (int i = 0, n = this->size(); i < n; ++i) { 148 | *this->flags[i] = true; // command the threads to stop 149 | } 150 | this->clear_queue(); // empty the queue 151 | } 152 | else { 153 | if (this->isDone || this->isStop) 154 | return; 155 | this->isDone = true; // give the waiting threads a command to finish 156 | } 157 | { 158 | std::unique_lock lock(this->mutex); 159 | this->cv.notify_all(); // stop all waiting threads 160 | } 161 | for (int i = 0; i < static_cast(this->threads.size()); ++i) { // wait for the computing threads to finish 162 | if (this->threads[i]->joinable()) 163 | this->threads[i]->join(); 164 | } 165 | // if there were no threads in the pool but some functors in the queue, the functors are not deleted by the threads 166 | // therefore delete them here 167 | this->clear_queue(); 168 | this->threads.clear(); 169 | this->flags.clear(); 170 | } 171 | 172 | template 173 | auto push(F && f, Rest&&... rest) ->std::future { 174 | auto pck = std::make_shared>( 175 | std::bind(std::forward(f), std::placeholders::_1, std::forward(rest)...) 176 | ); 177 | auto _f = new std::function([pck](int id) { 178 | (*pck)(id); 179 | }); 180 | this->q.push(_f); 181 | std::unique_lock lock(this->mutex); 182 | this->cv.notify_one(); 183 | return pck->get_future(); 184 | } 185 | 186 | // run the user's function that excepts argument int - id of the running thread. returned value is templatized 187 | // operator returns std::future, where the user can get the result and rethrow the catched exceptins 188 | template 189 | auto push(F && f) ->std::future { 190 | auto pck = std::make_shared>(std::forward(f)); 191 | auto _f = new std::function([pck](int id) { 192 | (*pck)(id); 193 | }); 194 | this->q.push(_f); 195 | std::unique_lock lock(this->mutex); 196 | this->cv.notify_one(); 197 | return pck->get_future(); 198 | } 199 | 200 | 201 | private: 202 | 203 | // deleted 204 | thread_pool(const thread_pool &);// = delete; 205 | thread_pool(thread_pool &&);// = delete; 206 | thread_pool & operator=(const thread_pool &);// = delete; 207 | thread_pool & operator=(thread_pool &&);// = delete; 208 | 209 | void set_thread(int i) { 210 | std::shared_ptr> flag(this->flags[i]); // a copy of the shared ptr to the flag 211 | auto f = [this, i, flag/* a copy of the shared ptr to the flag */]() { 212 | std::atomic & _flag = *flag; 213 | std::function * _f; 214 | bool isPop = this->q.pop(_f); 215 | while (true) { 216 | while (isPop) { // if there is anything in the queue 217 | std::unique_ptr> func(_f); // at return, delete the function even if an exception occurred 218 | (*_f)(i); 219 | if (_flag) 220 | return; // the thread is wanted to stop, return even if the queue is not empty yet 221 | else 222 | isPop = this->q.pop(_f); 223 | } 224 | // the queue is empty here, wait for the next command 225 | std::unique_lock lock(this->mutex); 226 | ++this->nWaiting; 227 | this->cv.wait(lock, [this, &_f, &isPop, &_flag](){ isPop = this->q.pop(_f); return isPop || this->isDone || _flag; }); 228 | --this->nWaiting; 229 | if (!isPop) 230 | return; // if the queue is empty and this->isDone == true or *flag then return 231 | } 232 | }; 233 | this->threads[i].reset(new std::thread(f)); // compiler may not support std::make_unique() 234 | } 235 | 236 | void init() { this->nWaiting = 0; this->isStop = false; this->isDone = false; } 237 | 238 | std::vector> threads; 239 | std::vector>> flags; 240 | detail::Queue *> q; 241 | std::atomic isDone; 242 | std::atomic isStop; 243 | std::atomic nWaiting; // how many threads are waiting 244 | 245 | std::mutex mutex; 246 | std::condition_variable cv; 247 | }; 248 | 249 | } 250 | 251 | #endif // __ctpl_stl_thread_pool_H__ 252 | -------------------------------------------------------------------------------- /CTPL/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | 179 | Copyright (C) 2014 by Vitaliy Vitsentiy 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | 2 | //C headers 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | //C++ headers 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | //external custom headers 16 | #include "CTPL/ctpl_stl.h" 17 | 18 | //my custom headers 19 | #include "alignment_util.h" 20 | #include "base_bwt.h" 21 | #include "csa_bwt.h" 22 | #include "file_iterators.h" 23 | #include "rle_bwt.h" 24 | #include "string_util.h" 25 | 26 | struct Parameters { 27 | bool USE_FM_INDEX; 28 | uint64_t k; 29 | uint64_t K; 30 | uint64_t MIN_COUNT; 31 | uint64_t MAX_BRANCH_ATTEMPT_LENGTH; 32 | uint64_t BRANCH_LIMIT_FACTOR; 33 | double BRANCH_BUFFER_FACTOR; 34 | double TAIL_BUFFER_FACTOR; 35 | double FRAC; 36 | //uint64_t MAX_TRIES; 37 | uint8_t FM_BIT_POWER; 38 | bool VERBOSE; 39 | }; 40 | 41 | struct CorrectionResults { 42 | string label; 43 | string originalSeq; 44 | string correctedSeq; 45 | double avgBefore; 46 | double avgAfter; 47 | }; 48 | 49 | //valid char info 50 | enum { 51 | VALID_CHARS_LEN = 4 52 | }; 53 | const vector VALID_CHARS = {1, 2, 3, 5}; 54 | 55 | const string VERSION = "1.0.0"; 56 | 57 | uint64_t calculateMedian(vector inArray, uint64_t minValue) { 58 | /* 59 | Calculates the median of the array ignoring all values < minValue 60 | Note: this impl doesn't average the median if there are an even number of values, just picks the lower one 61 | @param inArray - the vector to calculate the median of 62 | @param minValue - any values less than this will be ignored in the calculation 63 | */ 64 | 65 | uint64_t arrayLen = inArray.size(); 66 | vector arrayCopy = vector(arrayLen, 0); 67 | uint64_t l = 0; 68 | 69 | for(uint64_t x = 0; x < arrayLen; x++) { 70 | if(inArray[x] >= minValue) { 71 | arrayCopy[l] = inArray[x]; 72 | l++; 73 | } 74 | } 75 | 76 | if(l == 0) { 77 | return 0; 78 | } 79 | else { 80 | nth_element(arrayCopy.begin(), arrayCopy.begin()+(l-1)/2, arrayCopy.begin()+l); 81 | return arrayCopy[(l-1)/2]; 82 | } 83 | } 84 | 85 | vector > multiBridge(BaseBWT * rle_p, vector seedKmer, vector targetKmer, uint64_t tMin, uint64_t branchLim, uint64_t maxBranchLen) { 86 | /* 87 | printf("multibridge "); 88 | for(int x = 0; x < seedKmer.size(); x++) printf("%d", seedKmer[x]); 89 | printf(" "); 90 | for(int x = 0; x < targetKmer.size(); x++) printf("%d", targetKmer[x]); 91 | printf("\n"); 92 | */ 93 | 94 | //printf("tMin = %d\n", tMin); 95 | 96 | vector > ret = vector >(0); 97 | uint64_t kmerLen = seedKmer.size(); 98 | 99 | vector counts = vector(4); 100 | 101 | uint64_t numBranched = 0; 102 | 103 | // cdef str currBridge 104 | vector currBridge; 105 | // cdef list currBridgeList 106 | uint64_t currBridgeLen = 0; 107 | vector currKmer = vector(kmerLen, 4); 108 | vector revKmer = vector(kmerLen, 4); 109 | 110 | // cdef list possBridges = [(seedKmer, kmerLen)] 111 | vector > possBridges = vector >(); 112 | possBridges.push_back(vector(seedKmer)); 113 | 114 | // cdef unsigned char * currBridge_view 115 | // cdef unsigned char * currKmer_view = currKmer 116 | // cdef unsigned char * revKmer_view = revKmer 117 | // cdef unsigned char * targetKmer_view = targetKmer 118 | // #print currKmer_view[0], ord('A') 119 | 120 | // cdef unsigned long i, x 121 | // cdef str c 122 | 123 | uint64_t maxPos; 124 | 125 | //while we have things to explore, and we haven't explored too many, and we don't have a ridiculous number of possibilities 126 | while(possBridges.size() > 0 && numBranched < branchLim) { 127 | currBridge = possBridges.back(); 128 | possBridges.pop_back(); 129 | currBridgeLen = currBridge.size(); 130 | numBranched++; 131 | 132 | for(unsigned int x = 0; x < kmerLen; x++) { 133 | currKmer[x] = currBridge[currBridgeLen-kmerLen+x]; 134 | revKmer[kmerLen-x-1] = string_util::REV_COMP_I[currKmer[x]]; 135 | } 136 | 137 | //try to extend the bridge 138 | while(currBridgeLen < maxBranchLen) { 139 | //shift the current k-mer over one in preparation for the last base toggle 140 | for(unsigned int x = 0; x < kmerLen-1; x++) { 141 | currKmer[x] = currKmer[x+1]; 142 | revKmer[kmerLen-x-1] = revKmer[kmerLen-x-2]; 143 | } 144 | 145 | maxPos = 0; 146 | 147 | //count and pick the highest 148 | //printf("counts "); 149 | for(int x = 0; x < VALID_CHARS_LEN; x++) { 150 | currKmer[kmerLen-1] = VALID_CHARS[x]; 151 | revKmer[0] = string_util::REV_COMP_I[VALID_CHARS[x]]; 152 | counts[x] = rle_p->countKmer(&currKmer[0], kmerLen)+rle_p->countKmer(&revKmer[0], kmerLen); 153 | //printf("%d ", counts[x]); 154 | if(counts[x] > counts[maxPos]) maxPos = x; 155 | } 156 | //printf("\n"); 157 | 158 | //make sure the highest is high enough for us to consider it 159 | if(counts[maxPos] >= tMin) { 160 | currBridge.push_back(4); 161 | 162 | if(possBridges.size() < branchLim) { 163 | for(unsigned int x = 0; x < VALID_CHARS_LEN; x++) { 164 | if(x != maxPos && counts[x] >= tMin) { 165 | //add the ones we aren't exploring right now if they're high enough 166 | currBridge[currBridgeLen] = VALID_CHARS[x]; 167 | possBridges.push_back(vector(currBridge.begin(), currBridge.end())); 168 | } 169 | } 170 | } 171 | else { 172 | //printf("exit A\n"); 173 | return vector >(); 174 | } 175 | 176 | //now really add the symbol 177 | currBridge[currBridgeLen] = VALID_CHARS[maxPos]; 178 | currBridgeLen++; 179 | 180 | currKmer[kmerLen-1] = VALID_CHARS[maxPos]; 181 | revKmer[0] = string_util::REV_COMP_I[VALID_CHARS[maxPos]]; 182 | } 183 | else { 184 | //our BEST doesn't pass the threshold on this path, stop following 185 | //print currBridge, counts, tMin 186 | break; 187 | } 188 | 189 | if(equal(targetKmer.begin(), targetKmer.end(), currKmer.begin())) { 190 | ret.push_back(currBridge); 191 | if(ret.size() >= branchLim) { 192 | //printf("exit B\n"); 193 | return vector >(); 194 | } 195 | } 196 | } 197 | } 198 | 199 | if(numBranched < branchLim) { 200 | return ret; 201 | } 202 | else { 203 | //printf("exit C\n"); 204 | return vector >(); 205 | } 206 | } 207 | 208 | vector > shortAssemble(BaseBWT * rle_p, vector seedKmer, uint64_t tMin, uint64_t branchLim, uint64_t maxBranchLen) { 209 | vector > ret = vector >(0); 210 | uint64_t kmerLen = seedKmer.size(); 211 | 212 | vector counts = vector(4); 213 | 214 | uint64_t numBranched = 0; 215 | 216 | vector currBridge; 217 | uint64_t currBridgeLen = 0; 218 | vector currKmer = vector(kmerLen, 4); 219 | vector revKmer = vector(kmerLen, 4); 220 | 221 | vector > possBridges = vector >(); 222 | possBridges.push_back(vector(seedKmer)); 223 | 224 | uint64_t maxPos; 225 | 226 | //while we have things to explore, and we haven't explored too many, and we don't have a ridiculous number of possibilities 227 | while(possBridges.size() > 0 && numBranched < branchLim) { 228 | currBridge = possBridges.back(); 229 | possBridges.pop_back(); 230 | currBridgeLen = currBridge.size(); 231 | numBranched++; 232 | 233 | for(unsigned int x = 0; x < kmerLen; x++) { 234 | currKmer[x] = currBridge[currBridgeLen-kmerLen+x]; 235 | revKmer[kmerLen-x-1] = string_util::REV_COMP_I[currKmer[x]]; 236 | } 237 | 238 | //try to extend the bridge 239 | while(currBridgeLen < maxBranchLen) { 240 | //shift the current k-mer over one in preparation for the last base toggle 241 | for(unsigned int x = 0; x < kmerLen-1; x++) { 242 | currKmer[x] = currKmer[x+1]; 243 | revKmer[kmerLen-x-1] = revKmer[kmerLen-x-2]; 244 | } 245 | 246 | maxPos = 0; 247 | 248 | //count and pick the highest 249 | //printf("counts "); 250 | for(int x = 0; x < VALID_CHARS_LEN; x++) { 251 | currKmer[kmerLen-1] = VALID_CHARS[x]; 252 | revKmer[0] = string_util::REV_COMP_I[VALID_CHARS[x]]; 253 | counts[x] = rle_p->countKmer(&currKmer[0], kmerLen)+rle_p->countKmer(&revKmer[0], kmerLen); 254 | //printf("%d ", counts[x]); 255 | if(counts[x] > counts[maxPos]) maxPos = x; 256 | } 257 | //printf("\n"); 258 | 259 | //make sure the highest is high enough for us to consider it 260 | if(counts[maxPos] >= tMin) { 261 | currBridge.push_back(4); 262 | 263 | if(possBridges.size() < branchLim) { 264 | for(unsigned int x = 0; x < VALID_CHARS_LEN; x++) { 265 | if(x != maxPos && counts[x] >= tMin) { 266 | //add the ones we aren't exploring right now if they're high enough 267 | currBridge[currBridgeLen] = VALID_CHARS[x]; 268 | possBridges.push_back(vector(currBridge.begin(), currBridge.end())); 269 | } 270 | } 271 | } 272 | else { 273 | //printf("exit A\n"); 274 | return vector >(); 275 | } 276 | 277 | //now really add the symbol 278 | currBridge[currBridgeLen] = VALID_CHARS[maxPos]; 279 | currBridgeLen++; 280 | 281 | currKmer[kmerLen-1] = VALID_CHARS[maxPos]; 282 | revKmer[0] = string_util::REV_COMP_I[VALID_CHARS[maxPos]]; 283 | } 284 | else { 285 | //our BEST doesn't pass the threshold on this path, stop following 286 | //print currBridge, counts, tMin 287 | break; 288 | } 289 | } 290 | 291 | if(currBridgeLen == maxBranchLen) { 292 | ret.push_back(currBridge); 293 | if(ret.size() >= branchLim) { 294 | return vector >(); 295 | } 296 | } 297 | } 298 | 299 | //make sure we didn't go overboard 300 | if(numBranched < branchLim) { 301 | return ret; 302 | } 303 | else { 304 | return vector >(); 305 | } 306 | } 307 | 308 | //this structure is used to store any corrections we find 309 | struct Correction { 310 | uint64_t start; 311 | uint64_t end; 312 | vector seq; 313 | }; 314 | 315 | vector correctionPass(BaseBWT * rle_p, vector seq_i, Parameters myParams, uint64_t kmerSize) { 316 | /* 317 | This function performs a single pass of the correction algorithm 318 | @param rle_p - a pointer to the BWT that represents our DBG 319 | @param seq_i - the read sequence we are correcting in vector form 320 | @param myParams - parameters that are modified by the user to effect these methods 321 | @param kmerSize - the 'k' in k-mer 322 | @return - a vector representation of the string 323 | */ 324 | 325 | //this is the only parameter that is dynamic right now 326 | //uint64_t BRANCH_LIMIT = 2*kmerSize; 327 | //I think this is too high, the benefit is relatively low for a large time increase 328 | //uint64_t BRANCH_LIMIT = 10*kmerSize; 329 | //TODO: make this a user-parameter in the long run, for now we test for a default 330 | //uint64_t BRANCH_LIMIT = 4*kmerSize; 331 | uint64_t BRANCH_LIMIT = myParams.BRANCH_LIMIT_FACTOR*kmerSize; 332 | 333 | vector pu = rle_p->countPileup_i(seq_i, kmerSize); 334 | 335 | double nzMed = calculateMedian(pu, myParams.MIN_COUNT); 336 | if(nzMed < myParams.MIN_COUNT) { 337 | //basically if our median is super low, we have no chance of fixing it 338 | return seq_i; 339 | } 340 | 341 | //try to dynamically set the threshold, but make sure its at least MIN_COUNT 342 | uint64_t thresh = (uint64_t)(myParams.FRAC * nzMed); 343 | if(thresh < myParams.MIN_COUNT) thresh = myParams.MIN_COUNT; 344 | 345 | //prep for the actual corrections now 346 | int64_t prevFound = -1; 347 | vector seedKmer = vector(kmerSize); 348 | vector targetKmer = vector(kmerSize); 349 | uint64_t maxBranchLength; 350 | vector > bridgePoints; 351 | vector > bridgePoints_ed = vector >(); 352 | 353 | vector correctionsList = vector(0); 354 | Correction newCorr; 355 | 356 | uint64_t x = 0; 357 | uint64_t puSize = pu.size(); 358 | 359 | while(x < puSize) { 360 | if(pu[x] < thresh) { 361 | prevFound = x-1; 362 | 363 | //find the next index that is above the threshold 364 | while(x < puSize && pu[x] < thresh) { 365 | x++; 366 | } 367 | 368 | if(prevFound == -1 && x < puSize) { 369 | //handle the head case 370 | maxBranchLength = (uint64_t)(myParams.TAIL_BUFFER_FACTOR*(x+kmerSize)); 371 | if(maxBranchLength <= myParams.MAX_BRANCH_ATTEMPT_LENGTH) { 372 | //get the first found k-mer and reverse complement it 373 | seedKmer.assign(seq_i.begin()+x, seq_i.begin()+x+kmerSize); 374 | seedKmer = string_util::reverseComplement_i(seedKmer); 375 | 376 | //now assemble out from it 377 | bridgePoints = shortAssemble(rle_p, seedKmer, thresh, BRANCH_LIMIT, maxBranchLength); 378 | 379 | //remember to rev comp this also 380 | vector orig = string_util::reverseComplement_i(vector(seq_i.begin(), seq_i.begin()+x+kmerSize)); 381 | 382 | vector > edScores = vector >(bridgePoints.size()); 383 | uint64_t minScore = 0xFFFFFFFFFFFFFFFF; 384 | for(uint64_t y = 0; y < bridgePoints.size(); y++) { 385 | edScores[y] = editDistance_minimize(orig, bridgePoints[y]); 386 | if(edScores[y].first < minScore) minScore = edScores[y].first; 387 | } 388 | 389 | bridgePoints_ed.clear(); 390 | for(uint64_t y = 0; y < bridgePoints.size(); y++) { 391 | if(edScores[y].first == minScore) bridgePoints_ed.push_back(vector(bridgePoints[y].begin(), bridgePoints[y].begin()+edScores[y].second)); 392 | } 393 | 394 | if(bridgePoints_ed.size() == 0) { 395 | //do nothing, we didn't find anything good 396 | } 397 | else if(bridgePoints_ed.size() == 1) 398 | { 399 | //one bridge with smallest edit distance 400 | newCorr.start = 0; 401 | newCorr.end = x+kmerSize; 402 | newCorr.seq = string_util::reverseComplement_i(bridgePoints_ed[0]); 403 | correctionsList.push_back(newCorr); 404 | } 405 | else { 406 | //multiple with same edit distance, look at overall counts 407 | uint64_t maxCount = 0; 408 | uint64_t maxID = 0; 409 | vector edPU; 410 | uint64_t summation; 411 | for(uint64_t y = 0; y < bridgePoints_ed.size(); y++) { 412 | edPU = rle_p->countPileup_i(bridgePoints_ed[y], kmerSize); 413 | summation = 0; 414 | summation = accumulate(edPU.begin(), edPU.end(), summation); 415 | if(summation > maxCount) { 416 | maxCount = summation; 417 | maxID = y; 418 | } 419 | } 420 | 421 | //now save it 422 | newCorr.start = 0; 423 | newCorr.end = x+kmerSize; 424 | newCorr.seq = string_util::reverseComplement_i(bridgePoints_ed[maxID]); 425 | correctionsList.push_back(newCorr); 426 | } 427 | } 428 | } 429 | else if(prevFound >= 0 && x < puSize) { 430 | //handle a bridging case 431 | /* 432 | for(uint64_t y = 0; y < kmerSize; y++) { 433 | seedKmer[y] = seq_i[prevFound+y]; 434 | targetKmer[y] = seq_i[x+y]; 435 | } 436 | */ 437 | seedKmer.assign(seq_i.begin()+prevFound, seq_i.begin()+prevFound+kmerSize); 438 | targetKmer.assign(seq_i.begin()+x, seq_i.begin()+x+kmerSize); 439 | maxBranchLength = (uint64_t)(myParams.BRANCH_BUFFER_FACTOR*(x-prevFound+kmerSize)); 440 | 441 | //printf("testing %d %d\n", maxBranchLength, myParams.MAX_BRANCH_ATTEMPT_LENGTH); 442 | 443 | if(maxBranchLength < myParams.MAX_BRANCH_ATTEMPT_LENGTH) { 444 | //try forward first 445 | bridgePoints = multiBridge(rle_p, seedKmer, targetKmer, thresh, BRANCH_LIMIT, maxBranchLength); 446 | 447 | //try reverse complement if we failed 448 | if(bridgePoints.size() == 0) { 449 | bridgePoints = multiBridge(rle_p, string_util::reverseComplement_i(targetKmer), string_util::reverseComplement_i(seedKmer), thresh, BRANCH_LIMIT, maxBranchLength); 450 | 451 | //make sure to fix the results here 452 | for(unsigned int y = 0; y < bridgePoints.size(); y++) { 453 | bridgePoints[y] = string_util::reverseComplement_i(bridgePoints[y]); 454 | } 455 | } 456 | 457 | //printf("bp size: %d\n", bridgePoints.size()); 458 | if(bridgePoints.size() == 0) { 459 | //no bridges found 460 | //calculate a midpoint 461 | uint64_t midPoint = (uint64_t)((prevFound+x+kmerSize)/2.0); 462 | maxBranchLength = (uint64_t)(myParams.TAIL_BUFFER_FACTOR*(midPoint-prevFound)); 463 | 464 | if(maxBranchLength < myParams.MAX_BRANCH_ATTEMPT_LENGTH) { 465 | //try to extend from the left to the middle 466 | bridgePoints = shortAssemble(rle_p, seedKmer, thresh, BRANCH_LIMIT, maxBranchLength); 467 | vector orig = vector(seq_i.begin()+prevFound, seq_i.begin()+midPoint); 468 | 469 | //calculate the minimized edit distances 470 | vector > edScores = vector >(bridgePoints.size()); 471 | uint64_t minScore = 0xFFFFFFFFFFFFFFFF; 472 | for(uint64_t y = 0; y < bridgePoints.size(); y++) { 473 | edScores[y] = editDistance_minimize(orig, bridgePoints[y]); 474 | if(edScores[y].first < minScore) minScore = edScores[y].first; 475 | } 476 | 477 | //clip the strings by the minimized length 478 | bridgePoints_ed.clear(); 479 | for(uint64_t y = 0; y < bridgePoints.size(); y++) { 480 | if(edScores[y].first == minScore) bridgePoints_ed.push_back(vector(bridgePoints[y].begin(), bridgePoints[y].begin()+edScores[y].second)); 481 | } 482 | 483 | //if(bridgePoints_ed.size() > 0 && minScore > (midPoint-prevFound)*.4) printf("big ED: %lu %lu\n", minScore, midPoint-prevFound); 484 | 485 | //TODO: make .4 a constant 486 | if(bridgePoints_ed.size() == 0 || minScore > (midPoint-prevFound)*.4) { 487 | //do nothing, we didn't find anything good 488 | } 489 | else if(bridgePoints_ed.size() == 1) 490 | { 491 | //one bridge with smallest edit distance 492 | newCorr.start = prevFound; 493 | newCorr.end = midPoint; 494 | newCorr.seq = bridgePoints_ed[0]; 495 | correctionsList.push_back(newCorr); 496 | //printf("left to mid found\n"); 497 | } 498 | else { 499 | //multiple with same edit distance, look at overall counts 500 | uint64_t maxCount = 0; 501 | uint64_t maxID = 0; 502 | vector edPU; 503 | uint64_t summation; 504 | for(uint64_t y = 0; y < bridgePoints_ed.size(); y++) { 505 | edPU = rle_p->countPileup_i(bridgePoints_ed[y], kmerSize); 506 | summation = 0; 507 | summation = accumulate(edPU.begin(), edPU.end(), summation); 508 | if(summation > maxCount) { 509 | maxCount = summation; 510 | maxID = y; 511 | } 512 | } 513 | 514 | //now save it 515 | newCorr.start = prevFound; 516 | newCorr.end = midPoint; 517 | newCorr.seq = bridgePoints_ed[maxID]; 518 | correctionsList.push_back(newCorr); 519 | //printf("left to mid found\n"); 520 | } 521 | 522 | //try to extend from the right to the middle 523 | vector revTarget = string_util::reverseComplement_i(targetKmer); 524 | 525 | //now assemble out from it 526 | bridgePoints = shortAssemble(rle_p, revTarget, thresh, BRANCH_LIMIT, maxBranchLength); 527 | 528 | //remember to rev comp this also 529 | orig = string_util::reverseComplement_i(vector(seq_i.begin()+midPoint, seq_i.begin()+x+kmerSize)); 530 | 531 | edScores = vector >(bridgePoints.size()); 532 | minScore = 0xFFFFFFFFFFFFFFFF; 533 | for(uint64_t y = 0; y < bridgePoints.size(); y++) { 534 | edScores[y] = editDistance_minimize(orig, bridgePoints[y]); 535 | if(edScores[y].first < minScore) minScore = edScores[y].first; 536 | } 537 | 538 | bridgePoints_ed.clear(); 539 | for(uint64_t y = 0; y < bridgePoints.size(); y++) { 540 | if(edScores[y].first == minScore) bridgePoints_ed.push_back(vector(bridgePoints[y].begin(), bridgePoints[y].begin()+edScores[y].second)); 541 | } 542 | 543 | //TODO: make .4 a constant 544 | if(bridgePoints_ed.size() == 0 || minScore > (midPoint-prevFound)*.4) { 545 | //do nothing, we didn't find anything good 546 | } 547 | else if(bridgePoints_ed.size() == 1) 548 | { 549 | //one bridge with smallest edit distance 550 | newCorr.start = midPoint; 551 | newCorr.end = x+kmerSize; 552 | newCorr.seq = string_util::reverseComplement_i(bridgePoints_ed[0]); 553 | correctionsList.push_back(newCorr); 554 | //printf("right to mid found\n"); 555 | } 556 | else { 557 | //multiple with same edit distance, look at overall counts 558 | uint64_t maxCount = 0; 559 | uint64_t maxID = 0; 560 | vector edPU; 561 | uint64_t summation; 562 | for(uint64_t y = 0; y < bridgePoints_ed.size(); y++) { 563 | edPU = rle_p->countPileup_i(bridgePoints_ed[y], kmerSize); 564 | summation = 0; 565 | summation = accumulate(edPU.begin(), edPU.end(), summation); 566 | if(summation > maxCount) { 567 | maxCount = summation; 568 | maxID = y; 569 | } 570 | } 571 | 572 | //now save it 573 | newCorr.start = midPoint; 574 | newCorr.end = x+kmerSize; 575 | newCorr.seq = string_util::reverseComplement_i(bridgePoints_ed[maxID]); 576 | correctionsList.push_back(newCorr); 577 | //printf("right to mid found\n"); 578 | } 579 | } 580 | } 581 | else if(bridgePoints.size() == 1) { 582 | //one bridge found, add it on 583 | newCorr.start = prevFound; 584 | newCorr.end = x+kmerSize; 585 | newCorr.seq = bridgePoints[0]; 586 | correctionsList.push_back(newCorr); 587 | } 588 | else { 589 | //multiple bridges found, pick the best one by edit distance 590 | vector orig = vector(seq_i.begin()+prevFound, seq_i.begin()+x+kmerSize); 591 | 592 | vector edScores = vector(bridgePoints.size()); 593 | uint64_t minScore = 0xFFFFFFFFFFFFFFFF; 594 | for(uint64_t y = 0; y < bridgePoints.size(); y++) { 595 | edScores[y] = editDistance(orig, bridgePoints[y]); 596 | if(edScores[y] < minScore) minScore = edScores[y]; 597 | } 598 | 599 | bridgePoints_ed.clear(); 600 | for(uint64_t y = 0; y < bridgePoints.size(); y++) { 601 | if(edScores[y] == minScore) bridgePoints_ed.push_back(bridgePoints[y]); 602 | } 603 | if(bridgePoints_ed.size() == 1) 604 | { 605 | //one bridge with smallest edit distance 606 | newCorr.start = prevFound; 607 | newCorr.end = x+kmerSize; 608 | newCorr.seq = bridgePoints_ed[0]; 609 | correctionsList.push_back(newCorr); 610 | } 611 | else { 612 | //multiple with same edit distance, look at overall counts 613 | uint64_t maxCount = 0; 614 | uint64_t maxID = 0; 615 | vector edPU; 616 | uint64_t summation; 617 | for(uint64_t y = 0; y < bridgePoints_ed.size(); y++) { 618 | edPU = rle_p->countPileup_i(bridgePoints_ed[y], kmerSize); 619 | summation = 0; 620 | summation = accumulate(edPU.begin(), edPU.end(), summation); 621 | if(summation > maxCount) { 622 | maxCount = summation; 623 | maxID = y; 624 | } 625 | } 626 | 627 | //now save it 628 | newCorr.start = prevFound; 629 | newCorr.end = x+kmerSize; 630 | newCorr.seq = bridgePoints_ed[maxID]; 631 | correctionsList.push_back(newCorr); 632 | } 633 | } 634 | 635 | //if we found a bridge, no need to keep trying 636 | //if(bridgePoints.size() > 0) break; 637 | } 638 | } 639 | } 640 | else { 641 | //the counts were okay, no correction needed here 642 | x++; 643 | } 644 | } 645 | 646 | x = puSize; 647 | 648 | //use the tail factor for the buffer 649 | maxBranchLength = (uint64_t)(myParams.TAIL_BUFFER_FACTOR*(x-prevFound+kmerSize)); 650 | if(maxBranchLength <= myParams.MAX_BRANCH_ATTEMPT_LENGTH && pu[puSize-1] < thresh && prevFound >= 0) { 651 | //copy the seed k-mer 652 | seedKmer.assign(seq_i.begin()+prevFound, seq_i.begin()+prevFound+kmerSize); 653 | bridgePoints = shortAssemble(rle_p, seedKmer, thresh, BRANCH_LIMIT, maxBranchLength); 654 | 655 | vector orig = vector(seq_i.begin()+prevFound, seq_i.end()); 656 | 657 | //calculate the minimized edit distances 658 | vector > edScores = vector >(bridgePoints.size()); 659 | uint64_t minScore = 0xFFFFFFFFFFFFFFFF; 660 | for(uint64_t y = 0; y < bridgePoints.size(); y++) { 661 | edScores[y] = editDistance_minimize(orig, bridgePoints[y]); 662 | if(edScores[y].first < minScore) minScore = edScores[y].first; 663 | } 664 | 665 | //clip the strings by the minimized length 666 | bridgePoints_ed.clear(); 667 | for(uint64_t y = 0; y < bridgePoints.size(); y++) { 668 | if(edScores[y].first == minScore) bridgePoints_ed.push_back(vector(bridgePoints[y].begin(), bridgePoints[y].begin()+edScores[y].second)); 669 | } 670 | 671 | if(bridgePoints_ed.size() == 0) { 672 | //do nothing, we didn't find anything good 673 | } 674 | else if(bridgePoints_ed.size() == 1) 675 | { 676 | //one bridge with smallest edit distance 677 | newCorr.start = prevFound; 678 | newCorr.end = seq_i.size(); 679 | newCorr.seq = bridgePoints_ed[0]; 680 | correctionsList.push_back(newCorr); 681 | } 682 | else { 683 | //multiple with same edit distance, look at overall counts 684 | uint64_t maxCount = 0; 685 | uint64_t maxID = 0; 686 | vector edPU; 687 | uint64_t summation; 688 | for(uint64_t y = 0; y < bridgePoints_ed.size(); y++) { 689 | edPU = rle_p->countPileup_i(bridgePoints_ed[y], kmerSize); 690 | summation = 0; 691 | summation = accumulate(edPU.begin(), edPU.end(), summation); 692 | if(summation > maxCount) { 693 | maxCount = summation; 694 | maxID = y; 695 | } 696 | } 697 | 698 | //now save it 699 | newCorr.start = prevFound; 700 | newCorr.end = seq_i.size(); 701 | newCorr.seq = bridgePoints_ed[maxID]; 702 | correctionsList.push_back(newCorr); 703 | } 704 | } 705 | 706 | //go through and insert the corrections in reverse 707 | vector ret = vector(seq_i.begin(), seq_i.end()); 708 | Correction c; 709 | for(int64_t x = correctionsList.size()-1; x >=0; x--) { 710 | //get the modification 711 | c = correctionsList[x]; 712 | 713 | //delete what was in the range before 714 | ret.erase(ret.begin()+c.start, ret.begin()+c.end); 715 | 716 | //insert the new values 717 | ret.insert(ret.begin()+c.start, c.seq.begin(), c.seq.end()); 718 | } 719 | 720 | return ret; 721 | } 722 | 723 | CorrectionResults correctRead_job(int id, BaseBWT * rle_p, LongReadFA inputRead, Parameters myParams) { 724 | //prep the return value 725 | CorrectionResults ret; 726 | ret.label = inputRead.label; 727 | ret.originalSeq = inputRead.seq; 728 | 729 | //1 - translate string to vector 730 | vector seq_i = string_util::stoi(inputRead.seq); 731 | 732 | //2 - correct with small k 733 | vector corrected_k = correctionPass(rle_p, seq_i, myParams, myParams.k); 734 | /* 735 | while(seq_i.size() != corrected_k.size() || !equal(seq_i.begin(), seq_i.end(), corrected_k.begin())) { 736 | //printf("looping here\n"); 737 | seq_i = vector(corrected_k); 738 | corrected_k = correctionPass(rle_p, seq_i, myParams, myParams.k); 739 | } 740 | */ 741 | 742 | if(myParams.k == myParams.K) { 743 | //3a - k = K, skip second pass 744 | //4 - translate vector to string 745 | ret.correctedSeq = string_util::itos(corrected_k); 746 | 747 | if(myParams.VERBOSE) { 748 | //seq_i = string_util::stoi(inputRead.seq); 749 | vector c1 = rle_p->countPileup_i(seq_i, myParams.k); 750 | vector c2 = rle_p->countPileup_i(corrected_k, myParams.k); 751 | ret.avgBefore = accumulate(c1.begin(), c1.end(), 0.0)/c1.size(); 752 | ret.avgAfter = accumulate(c2.begin(), c2.end(), 0.0)/c2.size(); 753 | } 754 | else { 755 | ret.avgBefore = 0; 756 | ret.avgAfter = 0; 757 | } 758 | } 759 | else { 760 | //3b - correct with big K 761 | vector corrected_K = correctionPass(rle_p, corrected_k, myParams, myParams.K); 762 | 763 | //4 - translate vector to string 764 | ret.correctedSeq = string_util::itos(corrected_K); 765 | /* 766 | while(seq_i.size() != corrected_K.size() || !equal(seq_i.begin(), seq_i.end(), corrected_K.begin())) { 767 | //printf("looping here 2\n"); 768 | seq_i = vector(corrected_K); 769 | corrected_K = correctionPass(rle_p, seq_i, myParams, myParams.K); 770 | } 771 | */ 772 | 773 | if(myParams.VERBOSE) { 774 | //seq_i = string_util::stoi(inputRead.seq); 775 | vector c1 = rle_p->countPileup_i(seq_i, myParams.k); 776 | vector c2 = rle_p->countPileup_i(corrected_K, myParams.k); 777 | ret.avgBefore = accumulate(c1.begin(), c1.end(), 0.0)/c1.size(); 778 | ret.avgAfter = accumulate(c2.begin(), c2.end(), 0.0)/c2.size(); 779 | } 780 | else { 781 | ret.avgBefore = 0; 782 | ret.avgAfter = 0; 783 | } 784 | } 785 | 786 | //for(int x = 0; x < c2.size(); x++) printf("%lu ", c2[x]); 787 | //printf("\n"); 788 | 789 | //5 - return result 790 | return ret; 791 | } 792 | 793 | int main(int argc, char* argv[]) { 794 | 795 | ////////////////////////////////////////////////////////// 796 | //DEFAULT PARAMETERS 797 | Parameters myParams; 798 | myParams.USE_FM_INDEX = false; //if enabled, we will use the RLE_BWT, else CSA_BWT 799 | myParams.k = 21; //small k-mer 800 | myParams.K = 59; //big K-mer 801 | myParams.MIN_COUNT = 5; //threshold for counting, overrides FRAC* 802 | myParams.FRAC = 0.1; //the factor applied to the median to determine a dynamic threshold 803 | myParams.MAX_BRANCH_ATTEMPT_LENGTH = 10000; //maximum length of a gap that we will try to cross; longer can mean more CPU usage 804 | //TODO: make user configurable 805 | myParams.BRANCH_LIMIT_FACTOR = 4; //this*kmerSize = the number of branches that the assembly allows before giving up 806 | myParams.BRANCH_BUFFER_FACTOR = 1.3; //the factor applied to any bridge gap to allow for insertions 807 | myParams.TAIL_BUFFER_FACTOR = 1.05; //the factor applied to any head/tail gap to allow for insertions 808 | //myParams.MAX_TRIES = 1; //DEPRECATED 809 | myParams.FM_BIT_POWER = 8; //the in-memory FM-index samples at 2^FM_BIT_POWER; smaller = faster access but more memory 810 | myParams.VERBOSE = false; //if true, information about each read is dumped in the output 811 | 812 | uint64_t poolSize = 10000; //the number of jobs waiting to be processed at any given point in time; smaller may lead to lower process utilization but also less memory 813 | int numThreads = 1; //the number of concurrent correction threads 814 | uint64_t beginID = 0; //0-indexed id of the first read to process (default: beginning) 815 | uint64_t endID = 0xFFFFFFFFFFFFFFFF; //0-indexed id of the last read to process (default: all reads) 816 | ////////////////////////////////////////////////////////// 817 | 818 | char opt; 819 | bool helpRequest = false; 820 | while((opt = getopt(argc, argv, "hvk:K:p:b:e:m:f:B:iF:V")) != -1) { 821 | if(opt == 'h') helpRequest = true; 822 | else if(opt == 'v') { 823 | printf("fmlrc version %s\n", VERSION.c_str()); 824 | return 0; 825 | } 826 | else if(opt == 'k') myParams.k = atoi(optarg); 827 | else if(opt == 'K') myParams.K = atoi(optarg); 828 | else if(opt == 'p') numThreads = atoi(optarg); 829 | else if(opt == 'b') beginID = atoi(optarg); 830 | else if(opt == 'e') endID = atoi(optarg); 831 | else if(opt == 'm') myParams.MIN_COUNT = atoi(optarg); 832 | else if(opt == 'f') myParams.FRAC = atof(optarg); 833 | else if(opt == 'B') myParams.BRANCH_LIMIT_FACTOR = atoi(optarg); 834 | //MAX_BRANCH_ATTEMPT_LENGTH 835 | //BRANCH_BUFFER_FACTOR 836 | //TAIL_BUFFER_FACTOR 837 | else if(opt == 'i') myParams.USE_FM_INDEX = true; 838 | else if(opt == 'F') myParams.FM_BIT_POWER = atoi(optarg); 839 | else if(opt == 'V') myParams.VERBOSE = true; 840 | else printf("UNHANDLED OPTION: %d %c %s\n", optind, opt, optarg); 841 | } 842 | 843 | if(argc-optind < 3 || helpRequest) { 844 | printf("Usage: fmlrc [options] \n"); 845 | printf("Options: -h print help menu\n"); 846 | printf(" -v print version number and exit\n"); 847 | printf(" -k INT small k-mer size (default: 21)\n"); 848 | printf(" -K INT large K-mer size (default: 59), set K=k for single pass\n"); 849 | printf(" -p INT number of correction threads\n"); 850 | printf(" -b INT index of read to start with (default: 0)\n"); 851 | printf(" -e INT index of read to end with (default: end of file)\n"); 852 | printf(" -m INT absolute minimum count to consider a path (default: 5)\n"); 853 | printf(" -f FLOAT dynamic minimum fraction of median to consider a path (default: .10)\n"); 854 | printf(" -B INT set branch limit to * (default: 4)\n"); 855 | printf(" -i build a sampled FM-index instead of bit arrays\n"); 856 | printf(" -F INT FM-index is sampled every 2** values (default: 8); requires -i\n"); 857 | printf(" -V verbose output\n"); 858 | return 0; 859 | } 860 | 861 | if(beginID > endID) { 862 | printf("ERROR: parameter -b must be less than or equal to parameter -e\n"); 863 | return 1; 864 | } 865 | 866 | if(myParams.FRAC < 0 || myParams.FRAC > 1) { 867 | printf("ERROR: parameter -f must be within the range [0, 1]\n"); 868 | return 1; 869 | } 870 | 871 | char * bwtFN = argv[optind]; 872 | struct stat buffer; 873 | if(stat(bwtFN, &buffer) != 0) { 874 | printf("ERROR: BWT file does not exist\n"); 875 | return 1; 876 | } 877 | 878 | char * longReadFN = argv[optind+1]; 879 | if(stat(longReadFN, &buffer) != 0) { 880 | printf("ERROR: Fasta/q file does not exist\n"); 881 | return 1; 882 | } 883 | 884 | if(strncmp(longReadFN + strlen(longReadFN) - 6, ".fasta", 6) != 0 && strncmp(longReadFN + strlen(longReadFN) - 3, ".fa", 3) != 0 && strncmp(longReadFN + strlen(longReadFN) - 6, ".fastq", 6) != 0 && strncmp(longReadFN + strlen(longReadFN) - 3, ".fq", 3) != 0) { 885 | printf("ERROR: input long reads must be in FASTA or FASTQ format - file must end in '.fasta', '.fa', '.fastq', or '.fq'\n"); 886 | return 1; 887 | } 888 | 889 | //we need to always call this once 890 | string_util::initializeStringUtil(); 891 | 892 | //load the BWT into memory 893 | BaseBWT * rle;// = new CSA_BWT(bwtFN, myParams.FM_BIT_POWER); 894 | if(myParams.USE_FM_INDEX) rle = new RLE_BWT(bwtFN, myParams.FM_BIT_POWER); 895 | else rle = new CSA_BWT(bwtFN, false); //THE false MEANS WE PROMISE NOT TO QUERY '$' 896 | 897 | //open the fasta/q file for reading 898 | FastaIterator fi(longReadFN); 899 | 900 | //open the output fasta file for writing 901 | char * correctedReadFN = argv[optind+2]; 902 | FastaWriter fw(correctedReadFN); 903 | 904 | //now we need to set up our pool and stuff 905 | ctpl::thread_pool myPool(numThreads); 906 | 907 | //skip however many reads we were told to skip 908 | if(beginID > 0) printf("Skipping %llu reads...\n", beginID); 909 | uint64_t skippedReadCount = 0; 910 | while(skippedReadCount < beginID && fi.isMore()) { 911 | fi.getNextRead(); 912 | skippedReadCount++; 913 | } 914 | 915 | uint64_t jobsToProcess = endID - beginID; 916 | uint64_t jobsLoaded = 0; 917 | uint64_t jobsCompleted = 0; 918 | 919 | //preload the first jobs 920 | vector > results(poolSize); 921 | LongReadFA inputRead; 922 | for(uint64_t x = 0; x < poolSize && fi.isMore() && jobsLoaded < jobsToProcess; x++) { 923 | inputRead = fi.getNextRead(); 924 | results[x] = myPool.push(correctRead_job, rle, inputRead, myParams); 925 | jobsLoaded++; 926 | } 927 | 928 | //now load the jobs as they empty out 929 | uint64_t currJobSlot = 0; 930 | CorrectionResults currResults; 931 | LongReadFA outRead; 932 | while(fi.isMore() && jobsLoaded < jobsToProcess) { 933 | //get the results 934 | currResults = results[currJobSlot].get(); 935 | outRead.label = currResults.label; 936 | outRead.seq = currResults.correctedSeq; 937 | fw.writeRead(outRead); 938 | if(myParams.VERBOSE) printf("%llu: avg change %lf -> %lf\n", beginID+jobsCompleted, currResults.avgBefore, currResults.avgAfter); 939 | jobsCompleted++; 940 | 941 | //load the next job in 942 | inputRead = fi.getNextRead(); 943 | results[currJobSlot] = myPool.push(correctRead_job, rle, inputRead, myParams); 944 | jobsLoaded++; 945 | 946 | //increment the slot we are looking at, looping around if necessary 947 | currJobSlot++; 948 | if(currJobSlot == poolSize){ 949 | currJobSlot = 0; 950 | if(!myParams.VERBOSE) printf("Processed %llu reads\n", jobsCompleted); 951 | } 952 | } 953 | 954 | //now we just wait on the remaining jobs to finish 955 | while(jobsCompleted < jobsLoaded) { 956 | //get the results 957 | currResults = results[currJobSlot].get(); 958 | outRead.label = currResults.label; 959 | outRead.seq = currResults.correctedSeq; 960 | fw.writeRead(outRead); 961 | if(myParams.VERBOSE) printf("%llu: avg change %lf -> %lf\n", beginID+jobsCompleted, currResults.avgBefore, currResults.avgAfter); 962 | jobsCompleted++; 963 | 964 | //increment the slot we are looking at, looping around if necessary 965 | currJobSlot++; 966 | if(currJobSlot == poolSize){ 967 | currJobSlot = 0; 968 | if(!myParams.VERBOSE) printf("Processed %llu reads\n", jobsCompleted); 969 | } 970 | } 971 | 972 | //this is the only thing to clean up 973 | delete rle; 974 | printf("Finished processing reads [%llu, %llu)\n", beginID, beginID+jobsCompleted); 975 | 976 | return 0; 977 | } 978 | --------------------------------------------------------------------------------