├── .github └── workflows │ └── ccpp.yml ├── .gitmodules ├── AffineOneGapAlign.h ├── Alchemy2.cpp ├── Alignment.h ├── AlignmentBlock.h ├── BasicEndpoint.h ├── Chain.h ├── ChainRefine.h ├── ClusterRefine.h ├── Clustering.h ├── CompareLists.h ├── DivideSubByCol1.h ├── DivideSubByCol2.h ├── DivideSubByRow1.h ├── DivideSubByRow2.h ├── Fragment.h ├── Fragment_Info.h ├── Genome.h ├── GlobalChain.h ├── IndelRefine.h ├── Info.h ├── Input.h ├── LICENSE.txt ├── LinearExtend.h ├── LocalRefineAlignment.h ├── LogLookUpTable.h ├── MMIndex.h ├── Makefile ├── MapRead.h ├── Map_highacc.h ├── Map_lowacc.h ├── Mapping_ultility.h ├── MinCount.h ├── NaiveDP.h ├── Options.h ├── Path.h ├── Point.h ├── PrioritySearchTree.h ├── QueryTime.cpp ├── README.md ├── Read.h ├── RefineBreakpoint.h ├── SeqUtils.h ├── Sorting.h ├── SparseDP.h ├── SparseDP_Forward.h ├── SplitClusters.h ├── SubProblem.h ├── SubRountine.h ├── TestAffineOneGapAlign.cpp ├── TestGlobalChain.cpp ├── Timing.h ├── TupleOps.h ├── Types.h ├── call_assembly_SVs ├── ParseAlignment.py ├── README.md ├── SamToVCF.py ├── callassemblysv.json ├── callassemblysv.snakefile ├── combinehapSV.json ├── combinehapSV.snakefile ├── hg19_centromere.bed ├── hg19_centromere.wide.bed ├── mergeSV.py ├── parsealignment_maternal.sh ├── parsealignment_paternal.sh └── run_callsv.sh ├── image ├── logo_small.png ├── lra_logo.png ├── runtime.png ├── truvari_plot.pdf └── truvari_plot.png ├── logo ├── logo_small.png └── lra_logo.png ├── lra.cpp ├── meson.build ├── overload.h └── subprojects ├── htslib.wrap └── zlib.wrap /.github/workflows/ccpp.yml: -------------------------------------------------------------------------------- 1 | name: C/C++ CI 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v1 12 | - name: Git Submodules 13 | run: git submodule update --init --recursive 14 | - name: meson_init 15 | run: pip install meson ninja 16 | - name: meson_setup 17 | run: 18 | meson setup 
builddir 19 | - name: meson_compile 20 | run: meson compile -C builddir 21 | 22 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChaissonLab/LRA/6221610ddef555af76985a91160a9e336e3d9035/.gitmodules -------------------------------------------------------------------------------- /AlignmentBlock.h: -------------------------------------------------------------------------------- 1 | #ifndef ALIGNMENT_BLOCK_H_ 2 | #define ALIGNMENT_BLOCK_H_ 3 | 4 | #include 5 | #include 6 | #include "Types.h" 7 | 8 | using namespace std; 9 | class Block { 10 | public: 11 | // 12 | // An alignment is a collection of blocks. The qPos and tPos in a block 13 | // is relative to the beginning of the alignment rather than the 14 | // target or query. 15 | // 16 | 17 | GenomePos qPos, tPos, length; 18 | Block() {qPos=tPos=length=0;} 19 | Block(GenomePos q, GenomePos t, GenomePos l) : qPos(q), tPos(t), length(l) {} 20 | Block& Assign(Block &rhs) { 21 | qPos = rhs.qPos; 22 | tPos = rhs.tPos; 23 | length = rhs.length; 24 | return *this; 25 | } 26 | 27 | GenomePos QEnd() { 28 | return qPos + length; 29 | } 30 | 31 | GenomePos TEnd() { 32 | return tPos + length; 33 | } 34 | 35 | void Clear() { 36 | qPos = tPos = length = 0; 37 | } 38 | }; 39 | 40 | 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /BasicEndpoint.h: -------------------------------------------------------------------------------- 1 | #ifndef BASIC_ENDPOINT_H_ 2 | #define BASIC_ENDPOINT_H_ 3 | 4 | 5 | 6 | 7 | /* 8 | An endpoint is one of the ends of a fragment, where 9 | a fragment is an exact match between two genomes. 10 | So, a fragment is a 2D object that has a position and length, 11 | and an endpoint is 1D, where it just has a position. 
12 | A fragment may be associated with a score that is the score 13 | of the fragment in a maximum scoring chain. When finding a 14 | maximum scoring chain using priority search trees, one must 15 | be able to set the score of a fragment when indexing solely 16 | a point. 17 | */ 18 | 19 | 20 | class Coordinate { 21 | private: 22 | UInt x; 23 | UInt y; 24 | public: 25 | UInt GetX() const { return x;} 26 | UInt GetY() const { return y;} 27 | UInt SetX(UInt _x) { return (x = _x);} 28 | UInt SetY(UInt _y) { return (y = _y);} 29 | int operator<(const Coordinate &rhs) const { 30 | if (x == rhs.GetX()) return y < rhs.GetY(); 31 | else return x < rhs.GetX(); 32 | } 33 | 34 | int operator<=(const Coordinate &rhs) const { 35 | return (*this.x < rhs.x) or (x == rhs.x && y <= rhs.y); 36 | } 37 | 38 | int Equals(const Coordinate &rhs) const { 39 | return (x == rhs.GetX() and y == rhs.GetY()); 40 | } 41 | 42 | // 43 | // Synonym for Equals. 44 | // 45 | int operator==(const Coordinate &rhs) const { 46 | return this->Equals(rhs); 47 | } 48 | 49 | Coordinate &operator=(const Coordinate &rhs) { 50 | this->x = rhs.x; 51 | this->y = rhs.y; 52 | return *this; 53 | } 54 | }; 55 | 56 | 57 | template 58 | class BasicEndpoint { 59 | T_ScoredFragment *fragmentPtr; 60 | 61 | public: 62 | enum WhichEnd {Start, End}; 63 | // typedef Coordinate KeyType; 64 | typedef UInt KeyType; 65 | class LessThan { 66 | public: 67 | int operator()(const BasicEndpoint &lhs, const BasicEndpoint &rhs) const { 68 | return lhs.p <= rhs.p; 69 | } 70 | }; 71 | 72 | public:// private: 73 | Coordinate p; 74 | WhichEnd side; 75 | 76 | WhichEnd GetSide() { return side; } 77 | 78 | void FragmentPtrToStart(T_ScoredFragment *fragment) { 79 | p.SetX(fragment->GetX()); 80 | p.SetY(fragment->GetY()); 81 | side = Start; 82 | fragmentPtr = fragment; 83 | } 84 | 85 | void FragmentPtrToEnd(T_ScoredFragment *fragment) { 86 | p.SetX(fragment->GetX() + fragment->GetXLength()); 87 | p.SetY(fragment->GetY() + 
fragment->GetYLength()); 88 | side = End; 89 | fragmentPtr = fragment; 90 | } 91 | 92 | int GetScore() { 93 | return fragmentPtr->GetScore(); 94 | } 95 | int SetScore(int score) { 96 | return (fragmentPtr->SetScore(score)); 97 | } 98 | 99 | T_ScoredFragment* SetScoredReference(T_ScoredFragment *_fragmentPtr) { 100 | return (fragmentPtr = _fragmentPtr); 101 | } 102 | int operator<(const BasicEndpoint &rhs) const { 103 | return p < rhs.p; 104 | } 105 | KeyType GetKey() { 106 | return p.GetY(); 107 | } 108 | T_ScoredFragment* GetFragmentPtr() { 109 | return fragmentPtr; 110 | } 111 | void SetChainPrev(T_ScoredFragment *prevChainFragment) { 112 | fragmentPtr->SetChainPrev(prevChainFragment); 113 | } 114 | }; 115 | 116 | 117 | 118 | #endif 119 | -------------------------------------------------------------------------------- /CompareLists.h: -------------------------------------------------------------------------------- 1 | #ifndef COMPARE_LISTS_H_ 2 | #define COMPARE_LISTS_H_ 3 | #include 4 | #include "Options.h" 5 | #include "Types.h" 6 | 7 | // Compare minimizers from reference and reads without frequency 8 | template 9 | void CompareLists(typename vector::iterator qBegin, typename vector::iterator qEnd, typename vector::iterator tBegin, typename vector::iterator tEnd, 10 | vector> &result, const Options &opts, bool Global, int64_t maxDiagNum = 0, int64_t minDiagNum = 0, bool canonical=true) { 11 | // 12 | // If canonical == True, for_mask = 0111...11 --> minimizer & for_mask = 0minimizer. 13 | // Else, for_mask = 111...11 --> minimizer & for_mask = minimizer 14 | // For LocalTuple, for_mask = 111...11 always. 
15 | // 16 | Tup for_mask = tup::for_mask_s; 17 | if (!canonical and Global) { 18 | for_mask = ~(for_mask & 0); 19 | } 20 | // cerr << "canonical: " << canonical << " for_mask: " << for_mask << endl; 21 | int nAdded=0; 22 | int origSize=result.size(); 23 | long qs = 0, qe = qEnd - qBegin - 1; 24 | long ts = 0, te = tEnd - tBegin; 25 | typename vector::iterator slb; 26 | 27 | if (qBegin == qEnd or tBegin == tEnd) { 28 | result.clear(); 29 | return; 30 | } 31 | long maxFreq; 32 | if ( Global ) { maxFreq = opts.globalMaxFreq;} else { maxFreq = opts.localMaxFreq;} 33 | 34 | #ifdef _TESTING_ 35 | vector isect; 36 | cout << "comparing " << qEnd-qBegin << " and " << te-ts << " lists" << endl; 37 | std::set_intersection(qBegin, qEnd, tBegin, tEnd, back_inserter(isect)); 38 | cout << "Matched " << isect.size() << " slowly." << endl; 39 | #endif 40 | 41 | long nMatch=0; 42 | long iter=0; 43 | do { 44 | tup startGap, endGap; 45 | ++iter; 46 | // ts is at the beginning of the matches in t. Anything before that in qBegin can't match, sl skip past that 47 | while (qs <= qe and (qBegin[qs].t & for_mask) < (tBegin[ts].t & for_mask)) { 48 | qs++; 49 | } 50 | // no possible matches, exit. 
51 | if (qs >= qe) { 52 | return; 53 | } 54 | // startGap.t is how far below the tuple in t is the tuple in q 55 | if (qs < qe) { 56 | startGap.t = (qBegin[qs].t & for_mask) - (tBegin[ts].t & for_mask); 57 | } 58 | if (qs == qe) { 59 | endGap = startGap; 60 | } 61 | else { 62 | // Move past any entries guaranteed to not be in target 63 | while (qe > qs and te > ts and (qBegin[qe].t & for_mask) > (tBegin[te-1].t & for_mask)) { 64 | qe--; 65 | } 66 | // End gap is how much less the last element in qBegin is than TBegin 67 | endGap.t = (tBegin[te-1].t & for_mask) - (qBegin[qe].t & for_mask); 68 | } 69 | if (startGap.t == 0 or startGap > endGap) { 70 | // 71 | // Find entry in t that could match qs 72 | // 73 | typename vector::iterator lb; 74 | long tsOrig=ts; 75 | long qsOrig=qs; 76 | lb = lower_bound(tBegin + ts, tBegin + te, qBegin[qs]); 77 | ts = lb - tBegin; 78 | if ((tBegin[ts].t & for_mask) == (qBegin[qs].t & for_mask)) { 79 | GenomePos tsStart = ts; 80 | GenomePos tsi = ts; 81 | while (tsi != te and (qBegin[qs].t & for_mask) == (tBegin[tsi].t & for_mask)) { tsi++; } 82 | GenomePos qsStart = qs; 83 | while (qs < qe and (qBegin[qs+1].t & for_mask) == (qBegin[qs].t & for_mask)) { qs++; } 84 | for (GenomePos ti = tsStart; ti != tsi; ti++) { 85 | if (qs - qsStart < maxFreq) { 86 | for (GenomePos qi = qsStart; qi <= qs; qi++) { 87 | if (maxDiagNum != 0 and minDiagNum != 0) { 88 | int64_t Diag = (int64_t) tBegin[ti].pos - (int64_t) qBegin[qi].pos; 89 | if (Diag <= maxDiagNum and Diag >= minDiagNum) { 90 | result.push_back(pair(qBegin[qi], tBegin[ti])); 91 | } 92 | } 93 | else { 94 | result.push_back(pair(qBegin[qi], tBegin[ti])); 95 | } 96 | } 97 | } 98 | } 99 | } 100 | 101 | while (ts < te and tBegin[ts].t == tBegin[tsOrig].t) { ts++; } 102 | while (qs < qe and qBegin[qs].t == qBegin[qsOrig].t) { qs++;} 103 | } 104 | else { 105 | // 106 | // End gap is greater than start gap, search from the other direction 107 | // 108 | typename vector::iterator ub; 109 | 
assert(te > ts); 110 | if (tBegin + te != tEnd and (tBegin[te-1].t & for_mask) == (qBegin[qe].t & for_mask)) { 111 | // pass 112 | } 113 | else { 114 | ub = upper_bound(tBegin+ts, tBegin+te, qBegin[qe]); 115 | // *ub is > qBegin[qe] 116 | te = ub - tBegin; 117 | } 118 | GenomePos teStart=te, tei=te; 119 | while (tei > ts and (tBegin[tei-1].t & for_mask) == (qBegin[qe].t & for_mask)) { 120 | tei--; 121 | } 122 | if (tei < teStart and teStart > 0) { 123 | GenomePos qeStart=qe; 124 | while (qe > qs and (qBegin[qe].t & for_mask) == (qBegin[qe-1].t & for_mask)) { qe--;} 125 | for (GenomePos ti = tei; ti < teStart; ti++) { 126 | if (qeStart - qe < maxFreq) { 127 | for (GenomePos qi = qe; qi <= qeStart; qi++) { 128 | if (maxDiagNum != 0 and minDiagNum != 0) { 129 | int64_t Diag = (int64_t) tBegin[ti].pos - (int64_t) qBegin[qi].pos; 130 | if (Diag <= maxDiagNum and Diag >= minDiagNum) { 131 | result.push_back(pair(qBegin[qi], tBegin[ti])); 132 | } 133 | } 134 | else { 135 | result.push_back(pair(qBegin[qi], tBegin[ti])); 136 | } 137 | } 138 | } 139 | } 140 | } 141 | te=tei; 142 | } 143 | } while (qs < qe and ts < te); 144 | nAdded=result.size() - origSize; 145 | 146 | } 147 | 148 | template 149 | void CompareLists(vector &query, vector &target, vector > &result, const Options &opts, bool Global) { 150 | CompareLists(query.begin(), query.end(), target.begin(), target.end(), result, opts, Global); 151 | } 152 | 153 | #endif 154 | -------------------------------------------------------------------------------- /DivideSubByCol2.h: -------------------------------------------------------------------------------- 1 | #ifndef DIVIDE_SUB_BY_COL2_H_ 2 | #define DIVIDE_SUB_BY_COL2_H_ 3 | 4 | 5 | #include 6 | #include 7 | #include //std::floor 8 | #include 9 | #include 10 | #include 11 | 12 | #include "SubProblem.h" 13 | #include "Fragment_Info.h" 14 | #include "Info.h" 15 | #include "overload.h" 16 | #include "Types.h" 17 | #include "Point.h" 18 | 19 | using std::cerr; 20 | using 
std::cout; 21 | using std::endl; 22 | using std::iota; 23 | 24 | 25 | //ScanPoints_col find Di and Ei array for non-leaf cases 26 | // Note: H1[j].inv == 0 and backdiag 27 | void 28 | ScanPoints_Col2(std::vector & V, std::vector & H1, std::vector & H2, std::vector & Bi, unsigned int & s, unsigned int & e, bool & DE, unsigned int & n) { 29 | // elements in set are unique and follow an increasing order 30 | std::set ForwardIndex; 31 | for (unsigned int i = s; i < e; ++i) { 32 | unsigned int count = 0; 33 | for (unsigned int j = V[i].pstart; j < V[i].pend; ++j) { 34 | if (H1[H2[j]].ind == DE and H1[H2[j]].inv == 0) { 35 | long int l = (long int)(H1[H2[j]].se.second) + (long int)(H1[H2[j]].se.first); 36 | ForwardIndex.insert(l); 37 | ++count; 38 | } 39 | } 40 | 41 | if (count != 0) { 42 | if (DE == 1) V[i].SS_B2.push_back(n); 43 | else V[i].SS_A2.push_back(n); 44 | } 45 | } 46 | // elements in D/E array are in the ascending order 47 | for (std::set::iterator it = ForwardIndex.begin(); it != ForwardIndex.end(); ++it) { 48 | Bi.push_back(*it); 49 | } 50 | } 51 | 52 | 53 | //ScanPoints_col find Di and Ei array for leaf cases 54 | // Note: H1[j].inv == 0 and backdiag 55 | void 56 | ScanPoints_Col2(std::vector & V, std::vector & H1, std::vector & H2, std::vector & Bi, std::vector & Ci, 57 | unsigned int & s, unsigned int & e, unsigned int & n) { 58 | 59 | std::set ForwardIndex1; 60 | std::set ForwardIndex2; 61 | 62 | for (unsigned int i = s; i < e; ++i) { 63 | unsigned int count1 = 0; 64 | unsigned int count2 = 0; 65 | for (unsigned int j = V[i].pstart; j < V[i].pend; ++j) { 66 | if (H1[H2[j]].ind == 1 and H1[H2[j]].inv == 0) { 67 | long int l = (long int)(H1[H2[j]].se.second) + (long int)(H1[H2[j]].se.first); 68 | ForwardIndex1.insert(l); 69 | ++count1; 70 | } 71 | else if (H1[H2[j]].ind == 0 and H1[H2[j]].inv == 0) { 72 | long int r = (long int)(H1[H2[j]].se.second) + (long int)(H1[H2[j]].se.first); 73 | ForwardIndex2.insert(r); 74 | ++count2; 75 | } 76 | } 77 | 78 | if 
(count1 != 0 and count2 != 0) { 79 | V[i].SS_B2.push_back(n); 80 | V[i].SS_A2.push_back(n); 81 | } 82 | } 83 | 84 | for (std::set::iterator it = ForwardIndex1.begin(); it != ForwardIndex1.end(); ++it) { // elements in D array are in the ascending order 85 | Bi.push_back(*it); 86 | } 87 | for (std::set::iterator it = ForwardIndex2.begin(); it != ForwardIndex2.end(); ++it) { // elements in D array are in the ascending order 88 | Ci.push_back(*it); 89 | } 90 | } 91 | 92 | 93 | void 94 | Decide_Eb_Db_C2 (std::vector & Di, std::vector & Ei, std::vector & Db, std::vector & Eb, std::vector & E) { 95 | 96 | for (unsigned int s = 0; s < Di.size(); ++s) { 97 | // find the index *t that Ei[*t] is the first element which is >= Di[s] 98 | std::vector::iterator t = Lower_Bound::iterator,long int>(E.begin(), E.end(), Di[s], Ei); 99 | if (t == E.end()) { 100 | break; 101 | } 102 | else{ 103 | Db[s] = *t; 104 | Eb[*t] = s; 105 | } 106 | } 107 | 108 | unsigned int cur = -1; 109 | for (unsigned int s = 0; s < Eb.size(); ++s) { 110 | if (Eb[s] == -1 and cur == -1) { 111 | continue; 112 | } 113 | else if (Eb[s] != -1) { 114 | cur = Eb[s]; 115 | } 116 | else { 117 | Eb[s] = cur; 118 | } 119 | } 120 | } 121 | 122 | 123 | void 124 | DivideSubProbByCol2 (std::vector & H1, std::vector & H2, std::vector & V, unsigned int start, unsigned int end, 125 | unsigned int & n, StackOfSubProblems & Sub, int & eeC) { // [start, end) is a half open interval 126 | 127 | if (end == start + 1) { // subproblem A is empty, while B contains only one row. This is a leaf case. 
128 | 129 | Subproblem ss = Subproblem(n); 130 | Sub.Push_Back(eeC, ss); // ss is a subproblem which Di and Ei coming from one row 131 | ++eeC; 132 | ScanPoints_Col2(V, H1, H2, Sub[eeC - 1].Ei, Sub[eeC - 1].Di, start, end, n); 133 | 134 | if (!Sub[eeC - 1].Ei.empty() and !Sub[eeC - 1].Di.empty()) { 135 | 136 | // initialize Sub[eeC- 1] 137 | unsigned int l = Sub[eeC - 1].Di.size(); 138 | unsigned int h = Sub[eeC - 1].Ei.size(); 139 | 140 | Sub[eeC - 1].E.assign(h, 0); 141 | std::iota(Sub[eeC - 1].E.begin(), Sub[eeC - 1].E.end(), 0); 142 | Sub[eeC - 1].Eb.assign(h, -1); 143 | Sub[eeC - 1].Db.assign(l, -1); 144 | Decide_Eb_Db_C2(Sub[eeC - 1].Di, Sub[eeC - 1].Ei, Sub[eeC - 1].Db, Sub[eeC - 1].Eb, Sub[eeC - 1].E); 145 | 146 | // initialize other attributes of this subproblem 147 | Sub[eeC - 1].Dv.assign(l, 0); 148 | Sub[eeC - 1].Dp.assign(l, 0); 149 | Sub[eeC - 1].D.assign(l, 0); 150 | std::iota(Sub[eeC - 1].D.begin(), Sub[eeC - 1].D.end(), 0); 151 | 152 | Sub[eeC - 1].Ev.assign(h, 0); 153 | Sub[eeC - 1].Ep.assign(h, 0); 154 | std::pair dummy_pair = std::make_pair(-1, h+1); 155 | Sub[eeC - 1].S_1.push(dummy_pair); 156 | 157 | } 158 | else { 159 | Sub.pop_back(); // delete subproblem ss 160 | // Sub.ClearSingle(eeC); 161 | --eeC; 162 | --n; 163 | } 164 | } 165 | else{ 166 | 167 | Subproblem s = Subproblem(n); 168 | Sub.Push_Back(eeC, s); 169 | ++eeC; 170 | 171 | // scan the points to determine Di 172 | unsigned int med = std::floor((start + end)/2); 173 | bool DE = 1; // DE == 1 means scan points to determin Ei (find for start points); 174 | ScanPoints_Col2(V, H1, H2, Sub[eeC-1].Ei, start, med, DE, n); 175 | 176 | DE = 0; // scan the points to determine Di 177 | ScanPoints_Col2(V, H1, H2, Sub[eeC-1].Di, med, end, DE, n); 178 | 179 | if (Sub[eeC-1].Ei.empty() and Sub[eeC-1].Di.empty()) { // Di is empty and Ei is empty 180 | Sub.pop_back(); // delete subproblem ss 181 | // Sub.ClearSingle(eeC); 182 | --eeC; 183 | --n; 184 | } 185 | else if (Sub[eeC-1].Ei.empty() and 
!Sub[eeC-1].Di.empty()) { // Di is non-empty and Ei is empty 186 | ++n; 187 | DivideSubProbByCol2(H1, H2, V, std::floor((start + end)/2), end, n, Sub, eeC); 188 | } 189 | else if (!Sub[eeC-1].Ei.empty() and Sub[eeC-1].Di.empty()) { 190 | ++n; 191 | DivideSubProbByCol2(H1, H2, V, start, std::floor((start + end)/2), n, Sub, eeC); 192 | } 193 | else { // non-leaf case 194 | 195 | // initialize Sub[eeC-1].Eb and Sub[eeC-1].Db 196 | unsigned int l = Sub[eeC-1].Di.size(); 197 | unsigned int h = Sub[eeC-1].Ei.size(); 198 | 199 | //std::vector p(h, -1); 200 | //std::vector z(l, -1); 201 | //std::vector t(h, 0); 202 | 203 | Sub[eeC-1].E.assign(h, 0); 204 | std::iota(Sub[eeC-1].E.begin(), Sub[eeC-1].E.end(), 0); 205 | Sub[eeC-1].Eb.assign(h, -1); 206 | Sub[eeC-1].Db.assign(l, -1); 207 | Decide_Eb_Db_C2(Sub[eeC-1].Di, Sub[eeC-1].Ei, Sub[eeC-1].Db, Sub[eeC-1].Eb, Sub[eeC-1].E); 208 | 209 | // initialize other attributes of this subproblem 210 | //std::vector v(l, 0); 211 | //std::vector w(l, 0); 212 | Sub[eeC-1].Dv.assign(l, 0); 213 | Sub[eeC-1].Dp.assign(l, 0); 214 | Sub[eeC-1].D.assign(l, 0); 215 | std::iota(Sub[eeC-1].D.begin(), Sub[eeC-1].D.end(), 0); 216 | 217 | //std::vector q(h,0); 218 | Sub[eeC-1].Ev.assign(h, 0); 219 | Sub[eeC-1].Ep.assign(h, 0); 220 | std::pair dummy_pair = std::make_pair(-1, h+1); 221 | Sub[eeC-1].S_1.push(dummy_pair); 222 | ++n; 223 | DivideSubProbByCol2(H1, H2, V, std::floor((start + end)/2), end, n, Sub, eeC); 224 | ++n; 225 | DivideSubProbByCol2(H1, H2, V, start, std::floor((start + end)/2), n, Sub, eeC); 226 | } 227 | } 228 | } 229 | 230 | #endif -------------------------------------------------------------------------------- /DivideSubByRow2.h: -------------------------------------------------------------------------------- 1 | #ifndef DIVIDE_SUB_BY_ROW2_H_ 2 | #define DIVIDE_SUB_BY_ROW2_H_ 3 | 4 | 5 | #include 6 | #include // std::pair 7 | #include //std::floor std::iota 8 | #include 9 | #include 10 | #include 11 | 12 | 13 | #include 
"SubProblem.h" 14 | #include "Fragment_Info.h" 15 | #include "Info.h" 16 | #include "overload.h" 17 | #include "Types.h" 18 | #include "Point.h" 19 | 20 | 21 | using std::cerr; 22 | using std::cout; 23 | using std::endl; 24 | using std::iota; 25 | 26 | 27 | // This function finds Di and Ei array for non-leaf case 28 | // Note this function didn't count the number of points which have forward diagonal <= the current forward diagonal 29 | // Note: H1[j].inv == 0 30 | void 31 | ScanPoints_Row2 (std::vector & V, std::vector & H1, std::vector & Bi, unsigned int & s, unsigned int & e, bool & DE, unsigned int & n) { 32 | // elements in set are unique and follow an increasing order 33 | std::set ForwardIndex; 34 | for (unsigned int i = s; i < e; ++i) { 35 | 36 | unsigned int count = 0; 37 | for (unsigned int j = V[i].pstart; j < V[i].pend; ++j) { 38 | if (H1[j].ind == DE and H1[j].inv == 0) { // H1[j].ind == DE == 1 means finding start points 39 | long int l = (long int)(H1[j].se.second) + (long int)(H1[j].se.first); // back diagonal 40 | ForwardIndex.insert(l); 41 | ++count; 42 | } 43 | } 44 | 45 | if (count != 0) { 46 | if (DE == 1) V[i].SS_B2.push_back(n); 47 | else V[i].SS_A2.push_back(n); 48 | } 49 | 50 | } 51 | // elements in D/E array are in the decreasing order 52 | for (std::set::reverse_iterator it = ForwardIndex.rbegin(); it != ForwardIndex.rend(); ++it) { 53 | Bi.push_back(*it); 54 | } 55 | } 56 | 57 | 58 | 59 | // This function finds Di and Ei array for leaf-case 60 | // Note this function didn't count the number of points which have forward diagonal <= the current forward diagonal 61 | // Note: H1[j].inv == 0 62 | void 63 | ScanPoints_Row2 (std::vector & V, std::vector & H1, std::vector & Bi, std::vector & Ci, 64 | unsigned int & s, unsigned int & e, unsigned int & n) { 65 | 66 | std::set ForwardIndex1; // ForwardIndex1 is for Ei array 67 | std::set ForwardIndex2; // ForwardIndex2 is for Di array 68 | for (unsigned int i = s; i < e; ++i) { 69 | 70 | unsigned 
int count1 = 0; 71 | unsigned int count2 = 0; 72 | for (unsigned int j = V[i].pstart; j < V[i].pend; ++j) { 73 | if (H1[j].ind == 1 and H1[j].inv == 0) { // H1[j].ind == 1 means finding start points 74 | long int l = (long int)(H1[j].se.second) + (long int)(H1[j].se.first); // back diagonal 75 | ForwardIndex1.insert(l); 76 | ++count1; 77 | } 78 | else if (H1[j].ind == 0 and H1[j].inv == 0) { // H1[j].ind == 0 means finding end points 79 | long int r = (long int)(H1[j].se.second) + (long int)(H1[j].se.first); 80 | ForwardIndex2.insert(r); 81 | ++count2; 82 | } 83 | } 84 | 85 | if (count1 != 0 and count2 != 0) { 86 | V[i].SS_B2.push_back(n); 87 | V[i].SS_A2.push_back(n); 88 | } 89 | 90 | } 91 | 92 | for (std::set::reverse_iterator it = ForwardIndex1.rbegin(); it != ForwardIndex1.rend(); ++it) { // elements in D array are in the decreasing order 93 | Bi.push_back(*it); 94 | } 95 | for (std::set::reverse_iterator it = ForwardIndex2.rbegin(); it != ForwardIndex2.rend(); ++it) { // elements in E array are in the decreasing order 96 | Ci.push_back(*it); 97 | } 98 | } 99 | 100 | // This function will decide Eb and Db array 101 | void 102 | Decide_Eb_Db_R2 (std::vector & Di, std::vector & Ei, std::vector & Db, std::vector & Eb, std::vector & E) { 103 | // find the index *t that Ei[*t] is the first element which is < Di[s] 104 | // Note: here we compare from the right 105 | // 106 | for (unsigned int s = 0; s < Di.size(); ++s) { 107 | std::vector::reverse_iterator t = Lower_Bound::reverse_iterator,long int>(E.rbegin(), E.rend(), Di[s], Ei); 108 | if (t == E.rbegin()) { 109 | break; 110 | } 111 | else{ 112 | //std::prev(t); 113 | --t; // move to right by one step 114 | Db[s] = *t; 115 | Eb[*t] = s; 116 | } 117 | } 118 | 119 | unsigned int cur = -1; 120 | for (unsigned int s = 0; s < Eb.size(); ++s) { 121 | if (Eb[s] == -1 and cur == -1) { 122 | continue; 123 | } 124 | else if (Eb[s] != -1) { 125 | cur = Eb[s]; 126 | } 127 | else { 128 | Eb[s] = cur; 129 | } 130 | } 131 | } 
132 | 133 | 134 | // This function Divide SubProblems by row for s2 and e2 135 | void 136 | DivideSubProbByRow2 (std::vector & H1, std::vector & V, unsigned int start, unsigned int end, 137 | unsigned int & n, StackOfSubProblems & Sub, int & eeR) { // [start, end) is a half open interval 138 | 139 | if (end == start + 1) { // subproblem A is empty, while B contains only one row. This is a leaf case. 140 | 141 | Subproblem ss = Subproblem(n); 142 | Sub.Push_Back(eeR, ss); // ss is a subproblem which Di and Ei coming from one row 143 | ++eeR; 144 | 145 | // scan the points to determine Ei and Di 146 | ScanPoints_Row2(V, H1, Sub[eeR - 1].Ei, Sub[eeR - 1].Di, start, end, n); 147 | 148 | if (!Sub[eeR - 1].Ei.empty() and !Sub[eeR - 1].Di.empty()) { 149 | 150 | // initialize Sub[eeR - 1] 151 | unsigned int l = Sub[eeR - 1].Di.size(); 152 | unsigned int h = Sub[eeR - 1].Ei.size(); 153 | 154 | Sub[eeR - 1].E.assign(h, 0); 155 | std::iota(Sub[eeR - 1].E.begin(), Sub[eeR - 1].E.end(), 0); 156 | Sub[eeR - 1].Eb.assign(h, -1); 157 | Sub[eeR - 1].Db.assign(l, -1); 158 | Decide_Eb_Db_R2(Sub[eeR - 1].Di, Sub[eeR - 1].Ei, Sub[eeR - 1].Db, Sub[eeR - 1].Eb, Sub[eeR - 1].E); 159 | 160 | // initialize other attributes of this subproblem 161 | Sub[eeR - 1].Dv.assign(l, 0); 162 | Sub[eeR- 1].Dp.assign(l, 0); 163 | Sub[eeR - 1].D.assign(l, 0); 164 | std::iota(Sub[eeR - 1].D.begin(), Sub[eeR - 1].D.end(), 0); 165 | 166 | Sub[eeR - 1].Ev.assign(h, 0); 167 | Sub[eeR - 1].Ep.assign(h, 0); 168 | std::pair dummy_pair = std::make_pair(-1, h+1); 169 | Sub[eeR - 1].S_1.push(dummy_pair); 170 | 171 | } 172 | else { 173 | Sub.pop_back(); // delete subproblem ss 174 | // Sub.ClearSingle(eeR); 175 | --eeR; 176 | --n; 177 | } 178 | } 179 | else{ 180 | 181 | Subproblem s = Subproblem(n); 182 | Sub.Push_Back(eeR, s); 183 | ++eeR; 184 | // scan the points to determine Di 185 | unsigned int med = std::floor((start + end)/2); 186 | bool DE = 0; // DE == 0 means scan points to determin Di (find for end 
points); 187 | //cerr << "scan points to determin Di in ["<< start << ", " << med << ")" << endl; 188 | ScanPoints_Row2(V, H1, Sub[eeR -1].Di, start, med, DE, n); 189 | // scan the points to determine Ei 190 | //cerr << "scan points to determine Ei in ["<< med << ", " << end << ")" << endl; 191 | DE = 1; 192 | ScanPoints_Row2(V, H1, Sub[eeR -1].Ei, med, end, DE, n); 193 | 194 | 195 | if (Sub[eeR -1].Ei.empty() and Sub[eeR -1].Di.empty()) { // Di is empty and Ei is empty 196 | Sub.pop_back(); 197 | // Sub.ClearSingle(eeR); 198 | --eeR; 199 | --n; 200 | } 201 | else if (Sub[eeR -1].Ei.empty() and !Sub[eeR -1].Di.empty()) { // Di is non-empty and Ei is empty 202 | //cerr << "Di is non-empty and Ei is empty: " << n << "\n"; 203 | ++n; 204 | //cerr <<"start: " << start+ 1 << ", med: " << std::floor((start + 1 + end + 1)/2) << ", n: " << n << "\n"; 205 | DivideSubProbByRow2(H1, V, start, std::floor((start + end)/2), n, Sub, eeR); 206 | } 207 | else if (!Sub[eeR -1].Ei.empty() and Sub[eeR -1].Di.empty()) { // Di is empty and Ei is non-empty 208 | //cerr << "Di is empty and Ei is non-empty: " << n << "\n"; 209 | ++n; 210 | //cerr <<"med: " << std::floor((start + 1 + end + 1)/2) << ", end: " << end + 1 << ", n: " << n << "\n"; 211 | DivideSubProbByRow2(H1, V, std::floor((start + end)/2), end, n, Sub, eeR); 212 | } 213 | else { 214 | 215 | // This is an non-leaf case 216 | // initialize Sub[eeR -1].Eb and Sub[eeR -1].Db 217 | unsigned int l = Sub[eeR -1].Di.size(); 218 | unsigned int h = Sub[eeR -1].Ei.size(); 219 | 220 | //std::vector p(h, -1); 221 | //std::vector z(l, -1); 222 | //std::vector t(h, 0); 223 | Sub[eeR -1].E.assign(h, 0); 224 | std::iota(Sub[eeR -1].E.begin(), Sub[eeR -1].E.end(), 0); 225 | Sub[eeR -1].Eb.assign(h, -1); 226 | Sub[eeR -1].Db.assign(l, -1); 227 | Decide_Eb_Db_R2(Sub[eeR -1].Di, Sub[eeR -1].Ei, Sub[eeR -1].Db, Sub[eeR -1].Eb, Sub[eeR -1].E); 228 | 229 | // initialize other attributes of this subproblem 230 | //std::vector v(l, 0); 231 | 
// A scored rectangle with corners (xl, yl) and (xh, yh).
//
//   score - the value supplied at construction, returned by GetScore()
//   prev  - starts at -1 and is never changed inside this class
//   index - the identifier supplied at construction
class Fragment {
 public:
  int xl, yl, xh, yh;
  int score;
  int prev;
  int index;

  // Read-only accessor; const so it can be called on const fragments.
  int GetScore() const {
    return score;
  }

  // Initialisers are listed in declaration order (score, prev, index) so every
  // member, including prev, is set in the initialiser list.
  Fragment(int _xl, int _yl, int _xh, int _yh, int _s, int _idx)
      : xl(_xl), yl(_yl), xh(_xh), yh(_yh), score(_s), prev(-1), index(_idx) {}
};
// the index in the Ev of the previous subproblem 17 | bool prev; //if prev == TRUE then the previous subproblem is row subproblem. Else it's col subproblem 18 | bool inv; // if inv == TRUE, then the previous subproblem is dividing (s1, e1). Else it's dividing (s2, e2) 19 | bool orient; // if orient = 0 means reverse orientated anchor 20 | vector SS_A_R1; 21 | vector SS_B_R1; 22 | unsigned int counter_A_R1; 23 | unsigned int counter_B_R1; 24 | vector SS_A_C1; 25 | vector SS_B_C1; 26 | unsigned int counter_A_C1; 27 | unsigned int counter_B_C1; 28 | vector SS_A_R2; 29 | vector SS_B_R2; 30 | unsigned int counter_A_R2; 31 | unsigned int counter_B_R2; 32 | vector SS_A_C2; 33 | vector SS_B_C2; 34 | unsigned int counter_A_C2; 35 | unsigned int counter_B_C2; 36 | Fragment_Info(); 37 | ~Fragment_Info() {}; // deconstructor 38 | friend ostream & operator<<(ostream & os, const Fragment_Info & M); 39 | }; 40 | 41 | 42 | Fragment_Info::Fragment_Info () { 43 | prev_sub = -1; 44 | prev_ind = -1; 45 | prev = 1; 46 | inv = 1; 47 | orient = 1; 48 | } 49 | 50 | 51 | ostream & operator<<(ostream & os, const Fragment_Info & M) { 52 | os << "val: " << M.val << ", prev_sub: " << M.prev_sub << ", prev_ind: " << M.prev_ind << ", prev: " << M.prev << "\n"; 53 | os << "SS_A_R1: " << M.SS_A_R1 << "\n"; 54 | os << "SS_B_R1: " << M.SS_B_R1 << "\n"; 55 | os << "counter_A_R1: " << M.counter_A_R1 << "\n"; 56 | os << "counter_B_R1: " << M.counter_B_R1 << "\n"; 57 | os << "SS_A_C1: " << M.SS_A_C1 << "\n"; 58 | os << "SS_B_C1: " << M.SS_B_C1 << "\n"; 59 | os << "counter_A_C1: " << M.counter_A_C1 << "\n"; 60 | os << "counter_B_C1: " << M.counter_B_C1 << "\n"; 61 | return os; 62 | } 63 | 64 | 65 | class Fragment_valueOrder { 66 | public: 67 | vector fragments_value; 68 | vector index; 69 | 70 | Fragment_valueOrder(const vector *c) { 71 | vector fv(c->size(), 0); 72 | for (unsigned int i = 0; i < c->size(); i++) { 73 | fv[i] = (*c)[i].val; 74 | } 75 | fragments_value = fv; 76 | 
assert(fragments_value.size() == c->size()); 77 | index.resize(c->size()); 78 | for (unsigned int i = 0;i < index.size(); i++) { index[i] = i;} 79 | Sort(); 80 | } 81 | 82 | int operator()(const int i, const int j) { 83 | assert(i < fragments_value.size()); 84 | assert(j < fragments_value.size()); 85 | return fragments_value[i] > fragments_value[j]; 86 | } 87 | 88 | void Sort() { 89 | sort(index.begin(), index.end(), *this); 90 | } 91 | 92 | float & operator[](int i) { 93 | return fragments_value[index[i]]; 94 | } 95 | 96 | int size() { 97 | return index.size(); 98 | } 99 | }; 100 | 101 | 102 | #endif -------------------------------------------------------------------------------- /Genome.h: -------------------------------------------------------------------------------- 1 | #ifndef GENOME_H_ 2 | #define GENOME_H_ 3 | #include "htslib/kseq.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "Types.h" 10 | #include 11 | using namespace std; 12 | 13 | class Header { 14 | public: 15 | vector names; 16 | vector pos; 17 | Header() { 18 | pos.push_back(0); 19 | } 20 | int Find(uint64_t query) { 21 | 22 | if (pos.size() > 0 and query == pos[0]) { 23 | return 0; 24 | } 25 | else { 26 | vector::iterator it = lower_bound(pos.begin(), pos.end(), query); 27 | int i = it - pos.begin(); 28 | assert(i > 0); 29 | if (query == *it) return i; 30 | return i - 1; 31 | } 32 | } 33 | uint64_t GetChromPos(uint64_t query) { 34 | int i = Find(query); 35 | return query-pos[i]; 36 | } 37 | 38 | uint64_t GetOffset(uint64_t query) { 39 | int i = Find(query); 40 | return pos[i]; 41 | } 42 | 43 | uint64_t GetNextOffset(uint64_t query) { 44 | int i = Find(query); 45 | assert(i+1 < pos.size()); 46 | return pos[i+1]; 47 | } 48 | 49 | // int GetChromStart(uint64_t query) { 50 | // vector::iterator it = lower_bound(pos.begin(), pos.end(), query); 51 | // return query - *it; 52 | // } 53 | 54 | void Add(const char* name, uint64_t p) { 55 | names.push_back(string(name)); 56 
| pos.push_back(p); 57 | } 58 | 59 | void Write(ofstream &out) { 60 | int idxLen = names.size(); 61 | out.write((char*) &idxLen, sizeof(int)); 62 | for(int i=0; i < names.size();i++) { 63 | int nameLen=names[i].size(); 64 | out.write((char*) &nameLen, sizeof(int)); 65 | out.write((char*) names[i].c_str(), names[i].size()); 66 | } 67 | out.write((char*) &pos[0], sizeof(int64_t)*pos.size()); 68 | } 69 | 70 | void Read(ifstream &in) { 71 | int idxLen; 72 | in.read((char*) &idxLen, sizeof(int)); 73 | names.resize(idxLen); 74 | pos.resize(idxLen+1); 75 | for(int i=0; i < names.size(); i++) { 76 | int nameLen; 77 | in.read((char*) &nameLen, sizeof(int)); 78 | char *name = new char[nameLen+1]; 79 | name[nameLen] = '\0'; 80 | in.read((char*) name, nameLen); 81 | names[i] = name; 82 | } 83 | in.read((char*) &pos[0],sizeof(int64_t)*pos.size()); 84 | } 85 | void WriteSAMHeader(ostream &out) { 86 | for (int i=0; i < names.size(); i++) { 87 | out << "@SQ\tSN:"< seqs; 95 | vector lengths; 96 | map nameMap; 97 | uint64_t GetSize() { 98 | if (header.pos.size() == 0) { 99 | return 0; 100 | } 101 | else { 102 | return header.pos[header.pos.size()-1]; 103 | } 104 | } 105 | 106 | Header header; 107 | char *GlobalIndexToSeq(long index) { 108 | int chrom=header.Find(index); 109 | uint64_t chromPos=index-header.pos[chrom]; 110 | assert(chrom < seqs.size()); 111 | assert(chromPos < lengths[chrom]); 112 | return &seqs[chrom][chromPos]; 113 | } 114 | 115 | void Read(string &genome) { 116 | ifstream testGenome(genome.c_str()); 117 | if (testGenome.good() == false or testGenome.eof()) { 118 | cerr << "Cannot open target " << genome << endl; 119 | exit(1); 120 | } 121 | 122 | gzFile f = gzopen(genome.c_str(), "r"); 123 | kseq_t *ks = kseq_init(f); 124 | uint64_t offset=0; 125 | int i=0; 126 | while (kseq_read(ks) >= 0) { // each kseq_read() call reads one query sequence 127 | char *seq = new char[ks->seq.l]; 128 | for (int j=0;jseq.l;j++) { seq[j] = toupper(ks->seq.s[j]);} 129 | // memcpy(seq, 
//
// Expands every fragment into two endpoints.  On return `endpoints` holds
// exactly 2 * fragments.size() entries: entry 2*i is the Start endpoint
// (xl, yl) of fragment i and entry 2*i+1 is its End endpoint (xh, yh).  Both
// record i in their `fragment` field.  Any previous contents of `endpoints`
// beyond that size are dropped; fields not listed here are left untouched.
//
template <typename T_Fragment, typename T_Endpoint>
void FragmentSetToEndpoints(std::vector<T_Fragment> &fragments, std::vector<T_Endpoint> &endpoints) {

  endpoints.resize(fragments.size()*2);

  const int numFragments = static_cast<int>(fragments.size());
  for (int i = 0; i < numFragments; i++) {
    T_Endpoint &startPoint = endpoints[2*i];
    startPoint.x = fragments[i].xl;
    startPoint.y = fragments[i].yl;
    startPoint.side = T_Endpoint::Start;
    startPoint.fragment = i;

    T_Endpoint &endPoint = endpoints[2*i + 1];
    endPoint.x = fragments[i].xh;
    endPoint.y = fragments[i].yh;
    endPoint.side = T_Endpoint::End;
    endPoint.fragment = i;
  }
}
112 | // 113 | std::sort(endpoints.begin(), endpoints.end(), typename T_Endpoint::LessThan()); 114 | 115 | PrioritySearchTree pst; 116 | 117 | pst.CreateTree(endpoints); 118 | // pst.Print(); 119 | unsigned int p; 120 | unsigned int maxScoringEndpoint = 0; 121 | bool maxScoringEndpointFound = false; 122 | for (p = 0; p < endpoints.size(); p++) { 123 | int x = endpoints[p].x; 124 | int y = endpoints[p].y; 125 | if (endpoints[p].GetSide() == T_Endpoint::Start) { 126 | int maxPointIndex=0; 127 | if (pst.FindIndexOfMaxPoint(endpoints, endpoints[p].y, maxPointIndex)) { 128 | assert(endpoints[maxPointIndex].fragment != endpoints[p].fragment); 129 | int fPrev = endpoints[maxPointIndex].fragment; 130 | fragments[endpoints[p].fragment].prev = fPrev; 131 | int score = fragments[endpoints[maxPointIndex].fragment].score + fragments[endpoints[p].fragment].score; 132 | /* 133 | cerr << "Score at " << endpoints[p].x << "\t" << endpoints[p].y << "\t" << fragments[fPrev].xl << "\t" 134 | << fragments[fPrev].yl << "\t" 135 | << fragments[fPrev].xh << "\t" 136 | << fragments[fPrev].yh << "\t" 137 | << fragments[fPrev].score << "\tscore:\t" << score << endl; 138 | */ 139 | fragments[endpoints[p].fragment].score = score; 140 | // pst.Print(); 141 | } 142 | else { 143 | fragments[endpoints[p].fragment].prev = -1; 144 | } 145 | } else { 146 | assert(endpoints[p].GetSide() == T_Endpoint::End); 147 | // 148 | // The score of the fragment should be already set. So simply activate 149 | // it here (make the point be visible in a search). 
150 | // 151 | endpoints[p].score = fragments[endpoints[p].fragment].score; 152 | pst.Activate(endpoints, p); 153 | if (maxScoringEndpointFound == false or 154 | fragments[endpoints[maxScoringEndpoint].fragment].score < fragments[endpoints[p].fragment].score) { 155 | maxScoringEndpoint = p; 156 | maxScoringEndpointFound = true; 157 | } 158 | } 159 | } 160 | 161 | // 162 | // Now compute the chain of optimum fragments 163 | // 164 | T_Fragment *optFragmentPtr; 165 | if (maxScoringEndpointFound == false) { 166 | // 167 | // Null case, no endpoints have been processed. 168 | // 169 | return 0; 170 | } 171 | 172 | int prev = endpoints[maxScoringEndpoint].fragment; 173 | unsigned int numIter = 0; 174 | while (prev != -1 ) { 175 | optFragmentChainIndices.push_back(prev); 176 | 177 | prev = fragments[prev].prev; 178 | // 179 | // Do a sanity check to make sure this loop is finite -- the optimal 180 | // fragment chain should never contain more fragments than what are 181 | // input. 182 | // 183 | assert(numIter < fragments.size()); 184 | ++numIter; 185 | } 186 | reverse(optFragmentChainIndices.begin(), optFragmentChainIndices.end()); 187 | return optFragmentChainIndices.size(); 188 | 189 | } 190 | 191 | #endif 192 | -------------------------------------------------------------------------------- /Info.h: -------------------------------------------------------------------------------- 1 | #ifndef INFO_H_ 2 | #define INFO_H_ 3 | 4 | #include 5 | #include 6 | 7 | using std::vector; 8 | 9 | typedef std::pair Pair; 10 | 11 | 12 | class info 13 | { 14 | public: 15 | unsigned int pstart; 16 | unsigned int pend; 17 | unsigned int rc_num; // rc_num means the row/col number 18 | unsigned int num; // num means the subproblem Sub[num] which the current row/col belongs to 19 | vector SS_A1; // SS_A1 stores the subproblem number which end points (e1) on the current row are in Di 20 | vector SS_B1; // SS_B1 stores the subproblem number which start points (s1) the current row are in Ei 
21 | vector SS_A2; // SS_A2 stores the subproblem number which end points (e2) on the current row are in Di 22 | vector SS_B2; // SS_B2 stores the subproblem number which start points (s2) the current row are in Ei 23 | info(unsigned int s, unsigned int e, unsigned int n) : pstart(s), pend(e), rc_num(n), num(0) {} // constructor 24 | ~info() {}; 25 | friend std::ostream & operator<<(std::ostream & os, const info & t); // overload of operator << 26 | }; 27 | 28 | std::ostream & operator<<(std::ostream & os, const info & M) { 29 | os << "{pstart: " << M.pstart << ", pend: " << M.pend << ", rc_num: " << M.rc_num << ", num: " << M.num << endl; 30 | os << "SS_A1: " << M.SS_A1 << endl; 31 | os << "SS_B1: " << M.SS_B1 << "} "<< endl; 32 | 33 | return os; 34 | } 35 | 36 | #endif -------------------------------------------------------------------------------- /Input.h: -------------------------------------------------------------------------------- 1 | #ifndef INPUT_H_ 2 | #define INPUT_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include "Read.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "Options.h" 20 | 21 | KSEQ_INIT(gzFile, gzread) 22 | 23 | class Input { 24 | public: 25 | enum InputType { FASTA, FASTQ, HTS }; 26 | int inputType; 27 | istream *strmPtr; 28 | ifstream strm; 29 | gzFile fp; 30 | htsFile *htsfp; 31 | kseq_t *ks; 32 | bam_hdr_t *samHeader; 33 | pthread_mutex_t semaphore; 34 | bool doInit; 35 | int curFile; 36 | int basesRead; 37 | long totalRead; 38 | bool done; 39 | int nReads; 40 | int flagRemove; 41 | string format; 42 | clock_t timestamp; 43 | 44 | vector allReads; 45 | Input() { 46 | flagRemove=0; 47 | doInit = true; 48 | curFile = 0; 49 | basesRead = 0; 50 | totalRead = 0; 51 | done=false; 52 | ks=NULL; 53 | htsfp=NULL; 54 | fp=NULL; 55 | } 56 | bool StreamIsFasta(istream &s) { 57 | if (s.eof() or s.good() == false) { 58 | return 
false; 59 | } 60 | if (s.peek() == '>') { 61 | return true; 62 | } 63 | return false; 64 | } 65 | 66 | bool StreamIsFastq(istream &s) { 67 | if (s.eof() or s.good() == false) { 68 | return false; 69 | } 70 | vector lines(2); 71 | string line; 72 | if (s.peek() != '@') { return false;} 73 | getline(s,lines[0]); 74 | getline(s,lines[1]); 75 | bool res=false; 76 | if (s.peek() == '+') { 77 | res=true; 78 | } 79 | for (int j=2; j > 0; j--) { 80 | s.putback('\n'); 81 | for (int i=lines[j-1].size(); i > 0; i--) { 82 | s.putback(lines[j-1][i-1]); 83 | } 84 | } 85 | return res; 86 | } 87 | 88 | bool Initialize(string &filename) { 89 | nReads=0; 90 | 91 | doInit = false; 92 | 93 | /* 94 | // When HTSLIB 1.13 is released, this may be used to initialize input 95 | htsfp = hts_open(filename.c_str(),"r"); 96 | const htsFormat *fmt = hts_get_format(htsfp); 97 | format=hts_format_file_extension(fmt); 98 | 99 | if (format == "fq" or format == "fa") { 100 | inputType=0; 101 | return true; 102 | } 103 | else if (format == "sam" or format == "bam" or format == "cram") { 104 | samHeader = sam_hdr_read(htsfp); 105 | return true; 106 | } 107 | else { 108 | cerr << "Cannot determine type of input " << endl; 109 | exit(1); 110 | } 111 | */ 112 | 113 | if (filename == "-" or filename == "stdin" or filename == "/dev/stdin") { 114 | strmPtr = &cin; 115 | } 116 | else { 117 | strm.open(filename.c_str()); 118 | strmPtr=&strm; 119 | } 120 | 121 | if (StreamIsFasta(*strmPtr)) { 122 | inputType=FASTA; 123 | return true; 124 | } 125 | else if (StreamIsFastq(*strmPtr) ) { 126 | inputType=FASTQ; 127 | return true; 128 | } 129 | else { 130 | 131 | // 132 | // possibly sam 133 | // 134 | if (filename == "-" or filename == "stdin" or filename == "/dev/stdin") { 135 | cout << "Streaming of sam/bam/cram input is not supported. 
You can convert to fasta/fastq, e.g.:" << endl 136 | << "samtools fasta input.bam | lra align ref.fa -" << endl; 137 | exit(1); 138 | } 139 | if (htsfp != NULL) { 140 | hts_close(htsfp); 141 | bam_hdr_destroy(samHeader); 142 | 143 | } 144 | 145 | 146 | htsfp = hts_open(filename.c_str(),"r"); 147 | const htsFormat *fmt = hts_get_format(htsfp); 148 | if (fmt == NULL or (fmt->format != sam and fmt->format != bam)) { 149 | cout << "Cannot determine format of input reads." << endl; 150 | exit(1); 151 | } 152 | 153 | samHeader = sam_hdr_read(htsfp); 154 | inputType=HTS; 155 | return true; 156 | } 157 | return false; 158 | } 159 | 160 | 161 | bool Initialize(vector &_allReads) { 162 | // 163 | // Check to see if the input is fasta 164 | // 165 | allReads = _allReads; 166 | if (allReads.size() == 0) { 167 | exit(0); 168 | } 169 | 170 | 171 | pthread_mutex_init(&semaphore, NULL); 172 | 173 | 174 | if (Initialize(allReads[curFile]) == false) { 175 | return 0; 176 | } 177 | timestamp = clock(); 178 | doInit = false; 179 | return true; 180 | } 181 | 182 | bool GetNext(Read &read, Options &opt, bool overrideSemaphore=false, bool top=true) { 183 | read.Clear(); 184 | bool readOne=false; 185 | if (overrideSemaphore == false and top == true) { 186 | pthread_mutex_lock(&semaphore); 187 | } 188 | string name; 189 | string seq; 190 | string qual; 191 | //---------------------------------------------- 192 | if (inputType == FASTA and strmPtr->eof()) // Any more FASTA files? 193 | { 194 | strm.close(); // at eof so close before checking if another file is in input list 195 | ++curFile; 196 | 197 | if (curFile >= allReads.size()) // any more input file in list 198 | { // no more read files? 199 | return 0; 200 | } 201 | if (Initialize(allReads[curFile]) == false) // does next input file initialise? 
202 | { 203 | return 0; 204 | } 205 | } 206 | //---------------------------------------------- 207 | if (inputType == FASTA or inputType == FASTQ) { 208 | if (strmPtr->eof()) { 209 | return 0; 210 | } 211 | if (inputType == FASTA) { 212 | string header; 213 | char c; 214 | getline(*strmPtr, header); 215 | stringstream nameStrm(header); 216 | nameStrm >> c >> read.name; 217 | c=strmPtr->peek(); 218 | 219 | while (c != EOF and c != '>') { 220 | string line; 221 | getline(*strmPtr, line); 222 | int i=0,j=0; 223 | for (i=0; i < line.size(); i++) { if (line[i] != ' ') { line[j] = toupper(line[i]); j++;} } 224 | line.resize(j); 225 | seq+=line; 226 | c=strmPtr->peek(); 227 | } 228 | if (c == EOF) { 229 | strmPtr->get(); 230 | } 231 | } 232 | 233 | else if (inputType == FASTQ) { 234 | string header; 235 | string sep; 236 | char c; 237 | getline(*strmPtr, header); 238 | getline(*strmPtr, seq); 239 | getline(*strmPtr, sep); 240 | getline(*strmPtr, qual); 241 | 242 | if (header.size() ==0 or seq.size() == 0 or sep.size() == 0 or qual.size() == 0) { 243 | // ------------------------------------------- 244 | strm.close(); 245 | ++curFile; 246 | if (curFile >= allReads.size()) // Exit if no more input files. 247 | { // no more read files? 248 | readOne = false; 249 | return 0; 250 | } 251 | if (Initialize(allReads[curFile]) == false) 252 | { 253 | readOne = false; 254 | return 0; 255 | } 256 | // opened next input file - is it fastq? 
(set in Initialize()) 257 | if (inputType == FASTQ) 258 | { 259 | getline(*strmPtr, header); 260 | getline(*strmPtr, seq); 261 | getline(*strmPtr, sep); 262 | getline(*strmPtr, qual); 263 | } 264 | } 265 | if (header.size() == 0 or seq.size() == 0 or sep.size() == 0 or qual.size() == 0) 266 | { 267 | readOne = false; 268 | return 0; 269 | } 270 | // ------------------------------------------- 271 | else { 272 | stringstream nameStrm(header); 273 | nameStrm >> c >> read.name; 274 | int i,j; 275 | for (i=0,j=0; i < seq.size(); i++) { if (seq[i] != ' ') { seq[j] = toupper(seq[i]); j++;} } 276 | seq.resize(j); 277 | 278 | for (i=0,j=0; i < qual.size(); i++) { if (qual[i] != ' ') { qual[j] = qual[i]; j++;} } 279 | qual.resize(j); 280 | } 281 | } 282 | read.seq = new char[seq.size()+1]; 283 | memcpy(read.seq, seq.c_str(), seq.size()); 284 | read.length=seq.size(); 285 | read.seq[read.length] = '\0'; 286 | if (qual.size() > 0) { 287 | assert(qual.size() == seq.size()); 288 | read.qual = new char[qual.size()+1]; 289 | memcpy(read.qual, qual.c_str(), qual.size()); 290 | read.qual[read.length] = '\0'; 291 | } 292 | read.passthrough=NULL; 293 | readOne=true; 294 | nReads++; 295 | } 296 | else if (inputType == HTS) { 297 | int res; 298 | bam1_t *b = bam_init1(); 299 | res= sam_read1(htsfp, samHeader, b); 300 | 301 | while (res >= 0 and readOne == false) { 302 | if (res >= 0) { 303 | if ((b->core.flag & flagRemove) == 0) { 304 | // // get auxilary tags 305 | // if (opt.passthroughtag and bam_get_aux(b)) { 306 | // unsigned char *pq = bam_get_aux(b); 307 | // int pq_len = strlen((char*)pq); 308 | // read.passthrough = new unsigned char[pq_len + 1]; 309 | // for (int p=0; pcore.l_qseq; 316 | read.seq = new char[read.length]; 317 | read.name = string(bam_get_qname(b)); 318 | read.flags = b->core.flag; 319 | uint8_t *q = bam_get_seq(b); 320 | for (int i=0; i < read.length; i++) {read.seq[i]=seq_nt16_str[bam_seqi(q,i)]; } 321 | char* qual=(char*) bam_get_qual(b); 322 | if (qual[0] 
== char(0xff)) { 323 | read.qual = new char[2]; 324 | read.qual[1] = '\0'; 325 | read.qual[0] = '*'; 326 | } 327 | else { 328 | read.qual=new char[read.length+1]; 329 | for (int q=0; q < read.length; q++) { 330 | read.qual[q] = qual[q]+33; 331 | } 332 | read.qual[read.length]='\0'; 333 | } 334 | 335 | // 336 | // Eventually this will store the passthrough data 337 | // 338 | readOne=true; 339 | if (opt.passthroughtag) { 340 | int ksLen; 341 | kstring_t fullKs; 342 | int fullLen; 343 | fullKs = { 0, 0, NULL }; 344 | fullLen = sam_format1(samHeader, b, &fullKs); 345 | int t=0; 346 | int numTab=0; 347 | while (t < fullKs.l and numTab < 11) 348 | { 349 | if (fullKs.s[t] == '\t') 350 | { 351 | numTab++; 352 | } 353 | t+=1; 354 | } 355 | if (t < fullKs.l) 356 | { 357 | int lenPassthrough=fullKs.l-t; 358 | if (lenPassthrough > 0) { 359 | read.passthrough=new char[lenPassthrough+1]; 360 | read.passthrough[lenPassthrough]='\0'; 361 | memcpy(read.passthrough, fullKs.s + t, lenPassthrough); 362 | } 363 | else 364 | { 365 | read.passthrough=NULL; 366 | } 367 | } 368 | free(fullKs.s); 369 | } 370 | nReads++; 371 | bam_destroy1(b); 372 | b=NULL; 373 | //bam1_t *b = bam_init1(); 374 | } 375 | else { 376 | bam_destroy1(b); 377 | b = bam_init1(); 378 | res= sam_read1(htsfp, samHeader, b); 379 | } 380 | } 381 | } 382 | if (res < 0) { 383 | if (b != NULL) { 384 | bam_destroy1(b); 385 | readOne = false; 386 | } 387 | } 388 | } 389 | 390 | if (readOne == false and top == true ) { 391 | ++curFile; 392 | doInit=true; 393 | readOne=GetNext(read, opt, overrideSemaphore, false); 394 | } 395 | basesRead += read.length; 396 | totalRead += read.length; 397 | 398 | 399 | if (overrideSemaphore== false and top == true) { 400 | pthread_mutex_unlock(&semaphore); 401 | } 402 | return readOne; 403 | } 404 | 405 | bool BufferedRead(vector &reads, int maxBufferSize, Options &opt) { 406 | int totalSize=0; 407 | 408 | pthread_mutex_lock(&semaphore); 409 | Read read; 410 | 411 | while(totalSize < 
maxBufferSize and GetNext(read, opt, true, true)) { 412 | reads.resize(reads.size()+1); 413 | reads[reads.size()-1]=read; 414 | totalSize += read.length; 415 | read.Clear(); 416 | } 417 | 418 | pthread_mutex_unlock(&semaphore); 419 | 420 | return reads.size(); 421 | } 422 | 423 | 424 | }; 425 | 426 | 427 | #endif 428 | 429 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | USC-RL v1.0 3 | The Software is made available for academic or non-commercial purposes only. The license is for 4 | a copy of the program for an unlimited term. Individuals requesting a license for commercial use 5 | must pay for a commercial license. 6 | USC Stevens Institute for Innovation 7 | University of Southern California 8 | 1150 S. Olive Street, Suite 2300 9 | Los Angeles, CA 90115, USA 10 | ATTN: Accounting 11 | DISCLAIMER. USC MAKES NO EXPRESS OR IMPLIED WARRANTIES, EITHER IN FACT OR BY 12 | OPERATION OF LAW, BY STATUTE OR OTHERWISE, AND USC SPECIFICALLY AND EXPRESSLY 13 | DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS FOR A 14 | PARTICULAR PURPOSE, VALIDITY OF THE SOFTWARE OR ANY OTHER INTELLECTUAL PROPERTY 15 | RIGHTS OR NON-INFRINGEMENT OF THE INTELLECTUAL PROPERTY OR OTHER RIGHTS OF ANY 16 | THIRD PARTY. SOFTWARE IS MADE AVAILABLE AS-IS. 17 | LIMITATION OF LIABILITY. 
//
// Appends the natural logarithm of 1, 6, 11, ..., 10001 to LookUpTable, so
// that the k-th appended entry is log(1 + 5*k).  Existing entries are kept.
//
// Declared inline because the definition lives in a header; without it,
// including the header from two translation units yields duplicate symbols.
//
inline void
CreateLookUpTable(std::vector<float> & LookUpTable){
	const int kFirst = 1;
	const int kLast = 10001;
	const int kStep = 5;

	// One allocation up front for the (kLast - kFirst) / kStep + 1 new entries.
	LookUpTable.reserve(LookUpTable.size() + (kLast - kFirst) / kStep + 1);
	for (int i = kFirst; i <= kLast; i = i + kStep) {
		LookUpTable.push_back(std::log(static_cast<float>(i)));
	}
}
//
// Print summary statistics about a minimizer set to stderr.
//
//   minimizers  the sampled minimizers; only the tuple value `.t` is read.
//   mmfreqs     mmfreqs[n] is the multiplicity recorded for minimizers[n];
//               it must hold at least minimizers.size() entries.
//
// Reported values: number of sampled minimizers, number of distinct tuple
// values, the fraction of minimizers whose multiplicity is exactly 1, and the
// mean multiplicity. For an empty input both ratios are reported as 0 rather
// than dividing by zero.
//
template <typename Tup>
void CalculateMinimizerStats(std::vector<Tup> &minimizers, std::vector<uint32_t> &mmfreqs) {
	typedef decltype(Tup::t) TupleValue;

	const size_t total = minimizers.size();
	// 64-bit accumulators: summed multiplicities can exceed the range of int
	// on genome-scale indexes.
	uint64_t uniqueCount = 0;
	uint64_t frequencySum = 0;
	std::unordered_map<TupleValue, int> distinctTuples;

	for (size_t n = 0; n < total; n++) {
		// operator[] inserts the key on first sight; one lookup is enough.
		distinctTuples[minimizers[n].t] = 0;
		if (mmfreqs[n] == 1) {
			uniqueCount++;
		}
		frequencySum += mmfreqs[n];
	}

	float uniqueFraction = 0;
	float averageFrequency = 0;
	if (total > 0) {
		uniqueFraction = (float) uniqueCount / total;
		averageFrequency = (float) frequencySum / total;
	}
	std::cerr << "sample minimizers: " << total << " distinct minimizers: " << distinctTuples.size() << " unique minimizers: " << uniqueFraction
			  << " average minimizer frequency: " << averageFrequency << std::endl;
}
//
// Compact `minimizers` in place, keeping only the entries whose flag in
// `remove` is unset, and append the frequency of every kept entry to
// `mmfreqs` (in the same order).
//
//   minimizers  filtered in place; relative order of survivors is preserved.
//   mmfreqs     receives Freq[n] for each surviving minimizers[n]; existing
//               content is left untouched.
//   Freq        per-minimizer multiplicity, parallel to `minimizers`.
//   remove      per-minimizer flag, parallel to `minimizers`; non-zero = drop.
//
template <typename Tup>
void RemoveFrequent(std::vector<Tup> &minimizers, std::vector<uint32_t> &mmfreqs, std::vector<uint32_t> &Freq, std::vector<bool> &remove) {
	int kept = 0;
	for (int src = 0; src < minimizers.size(); src++) {
		if (remove[src] != 0) {
			continue;
		}
		mmfreqs.push_back(Freq[src]);
		minimizers[kept] = minimizers[src];
		++kept;
	}
	minimizers.resize(kept);
}
sizeof(uint64_t)*seqOffsets.size()); 146 | fout.write((char*)&tupleBoundaries[0], sizeof(uint64_t)*tupleBoundaries.size()); 147 | uint64_t nMin = minimizers.size(); 148 | fout.write((char*)&nMin, sizeof(uint64_t)); 149 | fout.write((char*)&minimizers[0], sizeof(LocalTuple)*minimizers.size()); 150 | fout.close(); 151 | } 152 | 153 | int Read(string filename) { 154 | ifstream fin(filename.c_str(), ios::in|ios::binary); 155 | if (fin.good() == false or fin.eof() == true) { 156 | return 0; 157 | } 158 | fin.read((char*)&k, sizeof(int)); 159 | fin.read((char*)&w, sizeof(int)); 160 | fin.read((char*)&localIndexWindow, sizeof(int)); 161 | int nRegions; 162 | fin.read((char*)&nRegions, sizeof(int)); 163 | seqOffsets.resize(nRegions); 164 | fin.read((char*)&seqOffsets[0], sizeof(uint64_t)*nRegions); 165 | tupleBoundaries.resize(nRegions); 166 | fin.read((char*)&tupleBoundaries[0], sizeof(uint64_t)*nRegions); 167 | uint64_t nMin; 168 | fin.read((char*) &nMin, sizeof(uint64_t)); 169 | minimizers.resize(nMin); 170 | fin.read((char*)&minimizers[0], sizeof(LocalTuple)*nMin); 171 | fin.close(); 172 | return 1; 173 | } 174 | 175 | int LookupIndex(uint64_t querySeqPos) { 176 | if (seqOffsets.size() == 0) { 177 | return 0; 178 | } 179 | assert(querySeqPos <= seqOffsets[seqOffsets.size()-1]); 180 | vector::iterator it; 181 | it = lower_bound(seqOffsets.begin(), seqOffsets.end(), querySeqPos); 182 | // while(it != seqOffsets.end() and *it == querySeqPos) { ++it;} 183 | int index = it - seqOffsets.begin(); 184 | if (*it != querySeqPos) { 185 | return index - 1; 186 | } 187 | else { 188 | return index; 189 | } 190 | } 191 | 192 | void MinimizerBounds(uint64_t querySeqPos, uint64_t &lb, uint64_t &ub) { 193 | assert(querySeqPos < minimizers.size()); 194 | int index = this->LookupIndex(querySeqPos); 195 | assert(index < tupleBoundaries.size()); 196 | lb = tupleBoundaries[index]; 197 | ub = tupleBoundaries[index+1]; 198 | } 199 | 200 | void IndexSeq(char* seq, int seqLen) { 201 | int gi = 
0; 202 | int nIndex = seqLen / localIndexWindow; 203 | 204 | if (seqLen % localIndexWindow != 0) { 205 | nIndex +=1; 206 | } 207 | GenomePos seqPos=0; 208 | 209 | vector locMinimizers; 210 | GenomePos netSize=0; 211 | for (int i = 0; i < nIndex; i++) { 212 | locMinimizers.clear(); 213 | StoreMinimizers_noncanonical(&seq[seqPos], min((GenomePos)seqLen, (GenomePos) (seqPos+localIndexWindow)) - seqPos, 214 | k, w, locMinimizers, false); 215 | //RemoveFrequent(locMinimizers, maxFreq) 216 | // Sort minimzers by tuple value. 217 | // 218 | sort(locMinimizers.begin(), locMinimizers.end()); 219 | // 220 | // Remove frequenct tuples 221 | // 222 | RemoveFrequent(locMinimizers, maxFreq); 223 | 224 | // 225 | // Update local sequence pos (index in chrom). 226 | // 227 | seqPos+=(GenomePos)min((int)localIndexWindow, (int) (seqLen - seqPos)); 228 | 229 | // 230 | // Add boundaries representing the end of the current interval. 231 | // 232 | seqOffsets.push_back(offset+seqPos); 233 | 234 | // 235 | // Add minimizers and store where they end. 
236 | // 237 | minimizers.insert(minimizers.end(), locMinimizers.begin(), locMinimizers.end()); 238 | tupleBoundaries.push_back(minimizers.size()); 239 | netSize+=minimizers.size(); 240 | } 241 | // 242 | // Update offset for recently added sequence 243 | // 244 | offset+=seqLen; 245 | } 246 | 247 | void IndexFile(string &genome) { 248 | gzFile f = gzopen(genome.c_str(), "r"); 249 | kseq_t *ks = kseq_init(f); 250 | while (kseq_read(ks) >= 0) { 251 | // cerr << "Storing for "<< ks->name.s << endl; 252 | IndexSeq(ks->seq.s, ks->seq.l); 253 | } 254 | } 255 | 256 | }; 257 | 258 | void CountSort(const vector & Freq, const int & RANGE, const vector & Remove, vector & Sortindex){ 259 | // Create a count vector to store counts of each frequency 260 | vector count(RANGE + 1, 0); 261 | 262 | // Store counts of each frequency in v 263 | for (uint32_t i = 0; i < Freq.size(); i++) { 264 | if (Remove[i] == 0) { 265 | ++count[Freq[i]]; 266 | } 267 | } 268 | 269 | // Change count[i] so that count[i] now contains actual 270 | // position of each frequency 271 | for (int i = 1; i <= RANGE; i++) { 272 | count[i] += count[i-1]; 273 | } 274 | 275 | // Build the output sorted vector 276 | for (uint32_t i = 0; i < Freq.size() ; i++) { 277 | if (Remove[i] == 0) { 278 | assert (Freq[i] <= RANGE); 279 | Sortindex[count[Freq[i]] - 1] = i; 280 | --count[Freq[i]]; 281 | } 282 | } 283 | } 284 | 285 | 286 | void StoreIndex(string &genome, vector &minimizers, Header &header, Options &opts) { 287 | if (opts.localK > 10) { 288 | cerr << "ERROR, local k must be at most 10." 
<< endl; 289 | exit(1); 290 | } 291 | ifstream testGenome(genome.c_str()); 292 | if (testGenome.good() == false or testGenome.eof()) { 293 | cerr << "Cannot open target " << genome << endl; 294 | exit(1); 295 | } 296 | gzFile f = gzopen(genome.c_str(), "r"); 297 | 298 | kseq_t *ks = kseq_init(f); 299 | GenomePos offset=0; 300 | 301 | while (kseq_read(ks) >= 0) { // each kseq_read() call reads one query sequence 302 | int prevMinCount = minimizers.size(); 303 | StoreMinimizers(ks->seq.s, ks->seq.l, opts.globalK, opts.globalW, minimizers, true); 304 | 305 | for (GenomePos i = prevMinCount; i < minimizers.size(); i++) { 306 | minimizers[i].pos+=offset; 307 | } 308 | offset += ks->seq.l; 309 | header.Add(ks->name.s, offset); 310 | } 311 | kseq_destroy(ks); 312 | gzclose(f); 313 | cerr << "Sorting " << minimizers.size() << " minimizers" << endl; 314 | sort(minimizers.begin(), minimizers.end()); 315 | cerr << "done Sorting" << endl; 316 | 317 | // 318 | // Get the frequency for minimizers; Store the frequency in Freq; 319 | // 320 | // int rz = 1; 321 | // if (header.pos.back()/1000000000 > 1) {rz = header.pos.back()/1000000000;} 322 | // int RANGE = opts.globalMaxFreq * rz; 323 | vector Remove (minimizers.size(), 0); 324 | vector Freq(minimizers.size(), 0); 325 | 326 | uint32_t n = 0; uint32_t ne = 0; 327 | uint32_t unremoved = 0; 328 | uint32_t removed = 0; 329 | // Tuple for_mask = 1; 330 | // for_mask = ~(for_mask << 63); // for_mask = 0111..11; 331 | Tuple for_mask = GenomeTuple::for_mask_s; 332 | while (n < minimizers.size()) { 333 | ne = n + 1; 334 | while (ne < minimizers.size() and (minimizers[ne].t & for_mask) == (minimizers[n].t & for_mask)) {ne++;} 335 | if (ne - n > opts.globalMaxFreq) { // opts.minimizerFreq*rz is the rough threshold 336 | for (uint32_t i = n; i < ne; i++) { 337 | Freq[i] = ne - n; 338 | Remove[i] = 1; 339 | } 340 | removed += ne-n; 341 | assert(removed + unremoved <= Remove.size()); 342 | } 343 | else { 344 | for (uint32_t i = n; i < ne; 
i++) { 345 | Freq[i] = ne - n; 346 | } 347 | unremoved += ne-n; 348 | assert(removed + unremoved <= Remove.size()); 349 | } 350 | n = ne; 351 | } 352 | assert(removed + unremoved == Remove.size()); 353 | cerr << unremoved << " minimizers with multiplicity smaller than " << opts.globalMaxFreq << endl; 354 | // 355 | // Sort unremoved minimizers by frequency 356 | // Use count sort 357 | // 358 | uint32_t sz = header.pos.back()/opts.globalWinsize; 359 | if (header.pos.back()/opts.globalWinsize % opts.globalWinsize > 0) sz += 1; 360 | vector Sortindex(unremoved, 0); 361 | CountSort(Freq, opts.globalMaxFreq, Remove, Sortindex); 362 | 363 | vector winCount(sz, opts.NumOfminimizersPerWindow); // 50 is a parameter that can be changed 364 | for (uint32_t s = 0; s < Sortindex.size(); s++) { 365 | uint32_t id = minimizers[Sortindex[s]].pos/opts.globalWinsize; 366 | if (winCount[id] > 0) { 367 | winCount[id] -= 1; 368 | } 369 | // if (winCount[id] > 0 and minimizers[Sortindex[s]].pos < id*opts.globalWinsize + 5) { // force the minimizer to fall into the first 10bp of the window 370 | // winCount[id] -= 1; 371 | // } 372 | else { 373 | 374 | Remove[Sortindex[s]] = 1; 375 | } 376 | } 377 | 378 | if (opts.dotPlot) { 379 | ofstream outNameStrm("minimizers.txt"); 380 | for (int m=0; m < minimizers.size(); m++) { 381 | if (Remove[m] == 0) { 382 | outNameStrm << minimizers[m].t << "\t" 383 | << minimizers[m].pos << "\t" 384 | << minimizers[m].pos + opts.globalK << "\t" 385 | << Freq[m] << "\t" 386 | << Remove[m] << endl; 387 | } 388 | } 389 | outNameStrm.close(); 390 | } 391 | // 392 | // Remove too frequent minimizers; 393 | // 394 | vector mmfreqs; 395 | RemoveFrequent (minimizers, mmfreqs, Freq, Remove); 396 | if (opts.CalculateMinimizerStats) { 397 | CalculateMinimizerStats(minimizers, mmfreqs); 398 | } 399 | cerr << "There are " << minimizers.size() << " minimizers left" << endl; 400 | } 401 | 402 | int ReadIndex(string fn, vector &index, Header &h, Options &opts) { 403 | 
ifstream fin(fn.c_str(), ios::in|ios::binary); 404 | if (fin.good() == false or fin.eof()) { 405 | return 0; 406 | } 407 | int64_t len; 408 | fin.read((char*) &len, sizeof(int64_t)); 409 | fin.read((char*) &opts.globalK, sizeof(int)); 410 | h.Read(fin); 411 | index.resize(len); 412 | fin.read((char*) &index[0], sizeof(GenomeTuple)*len); 413 | return len; 414 | } 415 | 416 | void WriteIndex(string fn, vector &index, Header &h, Options &opts) { 417 | ofstream fout(fn.c_str(), ios::out|ios::binary); 418 | int64_t minLength = index.size(); 419 | fout.write((char*) &minLength, sizeof(int64_t)); // write the length of index 420 | fout.write((char*) &opts.globalK, sizeof(int)); // write the kmer length 421 | h.Write(fout); // write info about genome 422 | fout.write((char*) &index[0], sizeof(GenomeTuple)*index.size()); // write minimizers 423 | fout.close(); 424 | } 425 | 426 | #endif 427 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROG= lra 2 | PROG_EXTRA= alchemy2 qti 3 | LIBS= -lz -lpthread -lhts 4 | PROF=/home/cmb-16/mjc/shared/lib/ 5 | DEBUG?="" 6 | OPT?="" 7 | STATIC= 8 | CXX=g++ -std=c++14 9 | CFLAGS=-g 10 | asan?="" 11 | tsan?="" 12 | 13 | ifneq ($(DEBUG), "") 14 | CFLAGS=-g 15 | else 16 | CFLAGS=-O2 -DNDEBUG 17 | endif 18 | 19 | ifneq ($(asan), "") 20 | CFLAGS+=-fsanitize=address 21 | LIBS+=-fsanitize=address 22 | endif 23 | 24 | ifneq ($(tsan), "") 25 | CFLAGS+=-fsanitize=thread 26 | LIBS+=-fsanitize=thread 27 | endif 28 | 29 | ifneq ($(OPT), "") 30 | #STATIC=-L $(PROF) -lprofiler 31 | endif 32 | 33 | HEADERS=MinCount.h \ 34 | SeqUtils.h \ 35 | CompareLists.h \ 36 | TupleOps.h \ 37 | Sorting.h \ 38 | MMIndex.h \ 39 | Options.h\ 40 | Clustering.h \ 41 | Genome.h \ 42 | Alignment.h \ 43 | Read.h \ 44 | MapRead.h \ 45 | Input.h \ 46 | Fragment.h\ 47 | BasicEndpoint.h\ 48 | PrioritySearchTree.h\ 49 | AffineOneGapAlign.h \ 50 | 
GlobalChain.h \ 51 | Read.h \ 52 | SparseDP.h \ 53 | Timing.h \ 54 | IndelRefine.h \ 55 | Mapping_ultility.h \ 56 | SparseDP_Forward.h \ 57 | DivideSubByCol1.h \ 58 | DivideSubByCol2.h \ 59 | DivideSubByRow1.h \ 60 | DivideSubByRow2.h \ 61 | AlignmentBlock.h \ 62 | ChainRefine.h \ 63 | Chain.h \ 64 | ClusterRefine.h \ 65 | Fragment_Info.h \ 66 | LinearExtend.h \ 67 | LocalRefineAlignment.h \ 68 | LogLookUpTable.h \ 69 | Map_highacc.h \ 70 | Map_lowacc.h \ 71 | SubRountine.h \ 72 | Types.h \ 73 | SubProblem.h \ 74 | SplitClusters.h \ 75 | RefineBreakpoint.h 76 | 77 | all:$(PROG) 78 | 79 | # tag: TestAffineOneGapAlign.cpp AffineOneGapAlign.h 80 | # $(CXX) -g TestAffineOneGapAlign.cpp -o tag 81 | # # -D _MAT_PRINT_ 82 | 83 | # tgc: TestGlobalChain.cpp GlobalChain.h Fragment.h BasicEndpoint.h PrioritySearchTree.h 84 | # $(CXX) -g TestGlobalChain.cpp -o tgc 85 | 86 | # tir: TestIndelRefine.cpp IndelRefine.h 87 | # $(CXX) -g TestIndelRefine.cpp -I $(CONDA_PREFIX)/include -L $(CONDA_PREFIX)/lib -lhts -o tir -lbz2 -lz 88 | 89 | lra: lra.o 90 | $(CXX) $(CFLAGS) $(STATIC) $^ -I -L/usr/lib64 -L $(CONDA_PREFIX)/lib $(LIBS) -o $@ -Wl,-rpath-link=$(CONDA_PREFIX)/lib 91 | 92 | alchemy2: Alchemy2.o 93 | $(CXX) $(CFLAGS) $(STATIC) $^ -L $(CONDA_PREFIX)/lib $(LIBS) -o $@ -Wl,-rpath,$(CONDA_PREFIX)/lib 94 | 95 | qti: QueryTime.o 96 | $(CXX) $(CFLAGS) $(STATIC) $^ -L $(CONDA_PREFIX)/lib $(LIBS) -o $@ 97 | 98 | lra.o: lra.cpp $(HEADERS) 99 | $(CXX) $(CFLAGS) -c -I $(CONDA_PREFIX)/include lra.cpp 100 | 101 | Alchemy2.o: Alchemy2.cpp Genome.h 102 | $(CXX) $(CFLAGS) -c -I $(CONDA_PREFIX)/include Alchemy2.cpp 103 | 104 | QueryTime.o: QueryTime.cpp $(HEADERS) $(CONDA_PREFIX)/lib/libhts.a 105 | $(CXX) $(CFLAGS) -c -I $(CONDA_PREFIX)/include QueryTime.cpp 106 | clean: 107 | rm -f $(PROG) $(PROG_EXTRA) *.o 108 | -------------------------------------------------------------------------------- /MapRead.h: -------------------------------------------------------------------------------- 1 | 
#ifndef MAP_READ_H_ 2 | #define MAP_READ_H_ 3 | #include 4 | #include "MMIndex.h" 5 | #include "Genome.h" 6 | #include "Read.h" 7 | #include "Options.h" 8 | #include "CompareLists.h" 9 | #include "Sorting.h" 10 | #include "TupleOps.h" 11 | #include "Clustering.h" 12 | #include "AffineOneGapAlign.h" 13 | #include "TupleOps.h" 14 | #include "SparseDP.h" 15 | #include "SparseDP_Forward.h" 16 | #include "Chain.h" 17 | #include "overload.h" 18 | #include "LinearExtend.h" 19 | #include "SplitClusters.h" 20 | #include "Timing.h" 21 | #include "ClusterRefine.h" 22 | #include "IndelRefine.h" 23 | #include "LocalRefineAlignment.h" 24 | #include "Map_lowacc.h" 25 | #include "Map_highacc.h" 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include // std::log 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | using namespace std; 38 | 39 | 40 | class SortClusterBySize { 41 | public: 42 | bool operator()(const Cluster &a, const Cluster &b) { 43 | return a.matches.size() > b.matches.size(); 44 | } 45 | }; 46 | 47 | class SortAlignmentsByMatches { 48 | public: 49 | bool operator()(const SegAlignmentGroup a, const SegAlignmentGroup b) const { 50 | return a.nm > b.nm; 51 | } 52 | }; 53 | 54 | void RankClustersByScore(vector &clusters) { 55 | sort(clusters.begin(), clusters.end(), SortClusterBySize()); 56 | } 57 | 58 | int SetStrand(Read &read, Genome &genome, const Options &opts, GenomePairs &matches) { 59 | int nSame=0; 60 | int nDifferent=0; 61 | for (int m=0; m< matches.size(); m++) { 62 | int chromIndex = genome.header.Find(matches[m].second.pos); 63 | char *chrom=genome.seqs[chromIndex]; 64 | int chromPos = matches[m].second.pos - genome.header.pos[chromIndex]; 65 | GenomeTuple readTup, genomeTup; 66 | StoreTuple(read.seq, matches[m].first.pos, opts.globalK, readTup); 67 | StoreTuple(chrom, chromPos, opts.globalK, genomeTup); 68 | if (readTup.t == genomeTup.t) { 69 | nSame++; 70 | } 71 | else { 72 | nDifferent++; 73 | } 74 | } 75 | if (nSame > 
nDifferent) { 76 | return 0; 77 | } 78 | else { 79 | return 1; 80 | } 81 | } 82 | 83 | template 84 | void SwapReadCoordinates(vector &matches, GenomePos readLength, GenomePos kmer){ 85 | 86 | for (int i=0; i < matches.size(); i++) { 87 | matches[i].first.pos = readLength - (matches[i].first.pos+ kmer); 88 | } 89 | } 90 | 91 | // void ReverseClusterStrand(Read &read, Genome &genome, const Options &opts, vector &clusters) { 92 | // for (int c = 0; c < clusters.size(); c++) { 93 | // SwapStrand(read, opts, clusters[c].matches); 94 | // clusters[c].strand = 1; 95 | // } 96 | // } 97 | 98 | // void SetClusterStrand(Read &read, Genome &genome, const Options &opts, 99 | // vector &clusters) { 100 | // for (int c = 0; c < clusters.size(); c++) { 101 | // clusters[c].strand = SetStrand(read, genome, opts, clusters[c].matches); 102 | // if (clusters[c].strand == 1) { 103 | // SwapStrand(read, opts, clusters[c].matches); 104 | // } 105 | // } 106 | // } 107 | 108 | 109 | void 110 | SeparateMatchesByStrand(Read &read, Genome &genome, int k, vector &allMatches, vector &forMatches, 111 | vector &revMatches, string &baseName) { 112 | // 113 | // A value of 0 implies forward strand match. 114 | // 115 | vector strand(allMatches.size(), 0); 116 | int nForward=0; 117 | for (int i=0; i < allMatches.size(); i++) { 118 | int readPos = allMatches[i].first.pos; 119 | uint64_t refPos = allMatches[i].second.pos; 120 | char *genomePtr = genome.GlobalIndexToSeq(refPos); 121 | // 122 | // Read and genome are identical, the match is in the forward strand 123 | if (strncmp(&read.seq[readPos], genomePtr, k) == 0) { 124 | nForward++; 125 | } 126 | else { 127 | // 128 | // The k-mers are not identical, but a match was stored between 129 | // readPos and *genomePtr, therefore the match must be reverse. 130 | // 131 | strand[i] = true; 132 | } 133 | } 134 | // 135 | // Populate two lists, one for forward matches one for reverse. 
136 | // 137 | forMatches.resize(nForward); 138 | revMatches.resize(allMatches.size() - nForward); 139 | int i = 0,r = 0,f = 0; 140 | for (i = 0,r = 0,f = 0; i < allMatches.size(); i++) { 141 | if (strand[i] == 0) { 142 | forMatches[f] = allMatches[i]; 143 | f++; 144 | } 145 | else { 146 | revMatches[r] = allMatches[i]; 147 | r++; 148 | } 149 | } 150 | } 151 | 152 | 153 | int 154 | MapRead(const vector & LookUpTable, Read &read, Genome &genome, vector &genomemm, LocalIndex &glIndex, const Options &opts, 155 | ostream *output, ostream *svsigstrm, Timing &timing, IndelRefineBuffers &indelRefineBuffers, pthread_mutex_t *semaphore=NULL) { 156 | read.unaligned = 0; 157 | string baseName = read.name; 158 | for (int i=0; i < baseName.size(); i++) { 159 | if (baseName[i] == '/') baseName[i] = '_'; 160 | if (baseName[i] == '|') baseName[i] = '_'; 161 | } 162 | vector readmm; // readmm stores minimizers 163 | vector > allMatches, forMatches, revMatches; 164 | timing.Start(); 165 | // 166 | // Add pointers to seq that make code more readable. 167 | // 168 | char *readRC; 169 | CreateRC(read.seq, read.length, readRC); 170 | char *strands[2] = { read.seq, readRC }; 171 | 172 | if (opts.storeAll) { 173 | Options allOpts = opts; 174 | allOpts.globalW=1; 175 | StoreMinimizers(read.seq, read.length, allOpts.globalK, allOpts.globalW, readmm, true); 176 | // StoreMinimizers_noncanonical(read.seq, read.length, allOpts.globalK, allOpts.globalW, readmm, true); 177 | } 178 | else { 179 | // Options partOpts = opts; 180 | // partOpts.globalW = opts.globalW - 5; 181 | StoreMinimizers(read.seq, read.length, opts.globalK, opts.globalW, readmm, true); 182 | // StoreMinimizers_noncanonical(read.seq, read.length, opts.globalK, opts.globalW, readmm, true); 183 | } 184 | timing.Tick("Store minimizers"); 185 | sort(readmm.begin(), readmm.end()); //sort kmers in readmm(minimizers) 186 | timing.Tick("Sort minimizers"); 187 | // 188 | // Add matches between the read and the genome. 
189 | // 190 | CompareLists(readmm, genomemm, allMatches, opts, true); 191 | timing.Tick("CompareLists"); 192 | 193 | if (opts.dotPlot and opts.readname == read.name ) { 194 | ofstream clust("all-matches.dots"); 195 | for (int m = 0; m < allMatches.size(); m++) { 196 | clust << allMatches[m].first.pos << "\t" << allMatches[m].second.pos 197 | << "\t" << allMatches[m].first.pos + opts.globalK << "\t" 198 | << allMatches[m].second.pos+ opts.globalK << endl; 199 | } 200 | clust.close(); 201 | } 202 | 203 | SeparateMatchesByStrand(read, genome, opts.globalK, allMatches, forMatches, revMatches, baseName); 204 | allMatches.clear(); 205 | if (forMatches.size() == 0 and revMatches.size() == 0) { 206 | read.unaligned = 1; 207 | output_unaligned(read, opts, *output); 208 | return 0; 209 | } 210 | if (opts.debug and opts.dotPlot and opts.readname == read.name ) { 211 | ofstream fclust("for-matches_original.dots"); 212 | for (int m = 0; m < forMatches.size(); m++) { 213 | fclust << forMatches[m].first.pos << "\t" << forMatches[m].second.pos << "\t" << opts.globalK + forMatches[m].first.pos << "\t" 214 | << forMatches[m].second.pos + opts.globalK << "\t" << m << endl; 215 | } 216 | fclust.close(); 217 | ofstream rclust("rev-matches_original.dots"); 218 | for (int m=0; m < revMatches.size(); m++) { 219 | rclust << revMatches[m].first.pos << "\t" << revMatches[m].second.pos + opts.globalK << "\t" << opts.globalK + revMatches[m].first.pos << "\t" 220 | << revMatches[m].second.pos << "\t" << m << endl; 221 | } 222 | rclust.close(); 223 | ofstream rclustdiag("rev-matches_original.diag.dots"); 224 | for (int m=0; m < revMatches.size(); m++) { 225 | rclustdiag << revMatches[m].first.pos - revMatches[m].second.pos << "\t" << opts.globalK + revMatches[m].first.pos << "\t" 226 | << revMatches[m].second.pos << "\t" << m << endl; 227 | } 228 | rclustdiag.close(); 229 | 230 | 231 | } 232 | 233 | int rt = 0; 234 | if (opts.bypassClustering) { 235 | rt = MapRead_lowacc(forMatches, revMatches, 
LookUpTable, read, genome, genomemm, glIndex, opts, output, svsigstrm, 236 | timing, indelRefineBuffers, strands, readRC, semaphore); 237 | } 238 | else { 239 | rt = MapRead_highacc(forMatches, revMatches, LookUpTable, read, genome, genomemm, glIndex, opts, output, svsigstrm, 240 | timing, indelRefineBuffers, strands, readRC, semaphore); 241 | } 242 | 243 | delete[] readRC; 244 | return rt; 245 | 246 | // /* 247 | // if (semaphore != NULL ) { 248 | // pthread_mutex_unlock(semaphore); 249 | // } 250 | // */ 251 | // // 252 | // // Done with one read. Clean memory. 253 | // // 254 | // delete[] readRC; 255 | // for (int a = 0; a < alignments.size(); a++) { 256 | // for (int s = 0; s < alignments[a].SegAlignment.size(); s++) { 257 | // delete alignments[a].SegAlignment[s]; 258 | // } 259 | // } 260 | // //read.Clear(); 261 | // if (alignments.size() > 0) return 1; 262 | // return 0; 263 | } 264 | 265 | #endif 266 | -------------------------------------------------------------------------------- /MinCount.h: -------------------------------------------------------------------------------- 1 | #ifndef MIN_COUNT_H_ 2 | #define MIN_COUNT_H_ 3 | #include "TupleOps.h" 4 | #include "SeqUtils.h" 5 | #include "htslib/kseq.h" 6 | 7 | template 8 | void StoreMinimizers(char *seq, GenomePos seqLen, int k, int w, vector &minimizers, bool Global, bool canonical = true) { 9 | // 10 | // Initialize first. 
11 | // 12 | if (seqLen < k) { 13 | return; 14 | } 15 | TupPos cur, curRC, minTuple, can; 16 | GenomePos minPos; 17 | int windowSpan=w+k-1; 18 | GenomePos p = 0; 19 | TupPos m; 20 | // 21 | // Skip N's as a start 22 | InitMask(m, k); 23 | int nextValidWindowEnd=0; 24 | int nextValidWindowStart=0; 25 | bool valid=false; 26 | if (seqLen < windowSpan) return; 27 | while (nextValidWindowStart < seqLen - windowSpan and !valid) { 28 | valid=true; 29 | for (int n=nextValidWindowStart; valid and n < nextValidWindowStart+windowSpan; n++ ) { 30 | if (seqLen < n) return; 31 | if (seqMapN[seq[n]] > 3) { 32 | nextValidWindowStart = n+1; 33 | valid=false; 34 | } 35 | } 36 | } 37 | // all n 38 | if (valid == false) { 39 | return; 40 | } 41 | nextValidWindowEnd = nextValidWindowStart + windowSpan; 42 | 43 | StoreTuple(seq,p,k,cur); 44 | TupleRC(cur, curRC, k); 45 | // 46 | // Initialize the first minimzer. 47 | // Store canonical information in the rightest bit of can.t; The last bit = 1 ==> reverse strand 48 | // If canonical == True, for_mask = 0111...11 --> minimizer & for_mask = 0minimizer; rev_mask = 1000...00 --> minimizer | rev_mask = 1minimizer 49 | // Else for_mask = 111...11 --> minimizer & for_mask = minimizer, rev_mask = 000...00 --> minimizer | rev_mask = minimizer 50 | // 51 | Tup for_mask = TupPos::for_mask_s; 52 | Tup rev_mask = TupPos::rev_mask_s; 53 | Tup mask = 0; 54 | if (!canonical and Global) { 55 | rev_mask = (rev_mask & mask); // 0000...00 64 bits 56 | for_mask = ~rev_mask; // 111...11 64 bits 57 | } 58 | 59 | if (canonical) { 60 | if ((cur.t & for_mask) < (curRC.t & for_mask)) can.t = (cur.t & for_mask); //can.t = min(cur.t, curRC.t); 61 | else can.t = (curRC.t | rev_mask); 62 | } 63 | else { can.t = cur.t; } 64 | minPos = 0; 65 | TupPos activeMinimizer, curMinimizer; 66 | activeMinimizer.t = can.t; 67 | activeMinimizer.pos = 0; 68 | vector curTuples(w); 69 | curTuples[0] = activeMinimizer; 70 | 71 | // 72 | // Find the active minimizer in this window 73 | 
// 74 | int nMinimizers=1; 75 | int nSkipped=0; 76 | 77 | for (p = 1; p < w && p < seqLen-k+1 ; p++) { 78 | 79 | ShiftOne(seq, p+k-1, m, cur); 80 | ShiftOneRC(seq, p+k-1, k, curRC); 81 | /* 82 | Tuple test, testrc; 83 | StoreTuple(seq->seq.s, p, k, test); 84 | TupleRC(test, testrc, k); 85 | assert(test == cur); 86 | assert(testrc == curRC); 87 | */ 88 | curMinimizer.pos = p; 89 | if ((cur.t & for_mask) < (curRC.t & for_mask)) curMinimizer.t = (cur.t & for_mask); 90 | else curMinimizer.t = (curRC.t | rev_mask); 91 | if (curMinimizer.t < activeMinimizer.t) { 92 | activeMinimizer.t = curMinimizer.t; 93 | activeMinimizer.pos = p; 94 | } 95 | curTuples[p%w] = curMinimizer; 96 | } 97 | // 98 | // Only store the first minimizer if the first window starts at the beginning of the sequence. 99 | // 100 | if (nextValidWindowEnd == windowSpan ) { 101 | minimizers.push_back(activeMinimizer); 102 | } 103 | // Now scan the chromosome 104 | minTuple.t=m.t; 105 | for (p = w; p < seqLen-k+1; p++) { 106 | // If the next valid window ends at the next nucleotide, check to see if 107 | // it is a valid window (no N's). If so, bump by one. 108 | // Otherwise, search for the next valid window end. 
109 | if ( nextValidWindowEnd == p+k-1) { 110 | if ( seqMapN[seq[p+k-1]] <= 3 ) { 111 | nextValidWindowEnd++; 112 | } 113 | else { 114 | nextValidWindowStart = p+k; 115 | valid=false; 116 | if (seqLen < windowSpan) return; 117 | while (nextValidWindowStart < seqLen - windowSpan and not valid) { 118 | valid=true; 119 | for (int n=nextValidWindowStart; valid and n < nextValidWindowStart+windowSpan; n++ ) { 120 | if (seqMapN[seq[n]] > 3) { 121 | nextValidWindowStart = n+1; 122 | valid=false; 123 | } 124 | } 125 | } 126 | // all n 127 | if (valid == false) { 128 | return; 129 | } 130 | nextValidWindowEnd = nextValidWindowStart + windowSpan; 131 | } 132 | } 133 | 134 | ShiftOne(seq, p+k-1, m, cur); 135 | ShiftOneRC(seq, p+k-1, k, curRC); 136 | #ifdef _TESTING_ 137 | TupPos test, testrc; 138 | StoreTuple(seq, p, k, test); 139 | TupleRC(test, testrc, k); 140 | 141 | assert(test.t == cur.t); 142 | assert(testrc.t == curRC.t); 143 | #endif 144 | if ((cur.t & for_mask) < (curRC.t & for_mask)) curMinimizer.t = (cur.t & for_mask); 145 | else curMinimizer.t = (curRC.t | rev_mask); 146 | curMinimizer.pos = p; 147 | curTuples[p%w] = curMinimizer; 148 | if (p - w >= activeMinimizer.pos) { 149 | activeMinimizer = curTuples[0]; 150 | for (int j =1; j < w; j++) { 151 | if ((curTuples[j].t & for_mask) < (activeMinimizer.t & for_mask)) { 152 | activeMinimizer = curTuples[j]; 153 | } 154 | } 155 | if (nextValidWindowEnd == p+k) { 156 | minimizers.push_back(activeMinimizer); 157 | nMinimizers+=1; 158 | } 159 | else { 160 | nSkipped++; 161 | } 162 | } 163 | else { 164 | if ((curMinimizer.t & for_mask) < (activeMinimizer.t & for_mask)) { //TODO(Jingwen) 165 | activeMinimizer = curMinimizer; 166 | if (nextValidWindowEnd == p+k) { 167 | minimizers.push_back(activeMinimizer); 168 | nMinimizers++; 169 | } 170 | else { 171 | nSkipped++; 172 | } 173 | } 174 | } 175 | if (p + 1 % 10000 == 0) { 176 | cerr << p +1 << "\t" << nMinimizers << "\t" << nSkipped << endl; 177 | } 178 | } 179 | } 180 | 181 
| template 182 | void StoreMinimizers_noncanonical(char *seq, GenomePos seqLen, int k, int w, vector &minimizers, bool Global) { 183 | // 184 | // Initialize first. 185 | // 186 | if (seqLen < k) { 187 | return; 188 | } 189 | TupPos cur, curRC, minTuple, can; 190 | GenomePos minPos; 191 | 192 | GenomePos p = 0; 193 | TupPos m; 194 | InitMask(m, k); 195 | int nextValidWindowEnd=0; 196 | int nextValidWindowStart=0; 197 | int windowSpan=w+k-1; 198 | bool valid=false; 199 | if (seqLen < windowSpan) return; 200 | while (nextValidWindowStart < seqLen - windowSpan and !valid) { 201 | valid=true; 202 | for (int n=nextValidWindowStart; valid and n < nextValidWindowStart+windowSpan; n++ ) { 203 | if (seqLen < n) return; 204 | if (seqMapN[seq[n]] > 3) { 205 | nextValidWindowStart = n+1; 206 | valid=false; 207 | } 208 | } 209 | } 210 | // all n 211 | if (valid == false) { 212 | return; 213 | } 214 | nextValidWindowEnd = nextValidWindowStart + windowSpan; 215 | 216 | 217 | StoreTuple(seq, p, k, cur); 218 | // TupleRC(cur, curRC, k); 219 | // 220 | // Initialize the first minimzer. 
221 | // Store canonical information in the rightest bit of can.t; The last bit = 1 ==> reverse strand 222 | // If canonical == True, for_mask = 0111...11 --> minimizer & for_mask = 0minimizer; rev_mask = 1000...00 --> minimizer | rev_mask = 1minimizer 223 | // Else for_mask = 111...11 --> minimizer & for_mask = minimizer, rev_mask = 000...00 --> minimizer | rev_mask = minimizer 224 | // 225 | Tup for_mask = TupPos::for_mask_s; 226 | Tup rev_mask = TupPos::rev_mask_s; 227 | Tup mask = 0; 228 | if (Global) { 229 | rev_mask = (rev_mask & mask); // 0000...00 64 bits 230 | for_mask = ~rev_mask; // 111...11 64 bits 231 | } 232 | // cerr << "Global: " << Global << endl; 233 | // cerr << "for_mask: " << for_mask << endl; 234 | // cerr << "rev_mask: " << rev_mask << endl; 235 | 236 | can.t = cur.t; 237 | minPos = 0; 238 | TupPos activeMinimizer, curMinimizer; 239 | 240 | 241 | activeMinimizer.t = can.t; 242 | activeMinimizer.pos = 0; 243 | vector curTuples(w); 244 | curTuples[0] = activeMinimizer; 245 | 246 | // 247 | // Find the active minimizer in this window 248 | // 249 | int nMinimizers=1; 250 | 251 | for (p = 1; p < w && p < seqLen-k+1 ; p++) { 252 | ShiftOne(seq, p+k-1, m, cur); 253 | // ShiftOneRC(seq, p+k-1, k, curRC); 254 | /* 255 | Tuple test, testrc; 256 | StoreTuple(seq->seq.s, p, k, test); 257 | TupleRC(test, testrc, k); 258 | assert(test == cur); 259 | assert(testrc == curRC); 260 | */ 261 | curMinimizer.pos = p; 262 | curMinimizer.t = (cur.t & for_mask); 263 | // if ((cur.t & for_mask) < (curRC.t & for_mask)) curMinimizer.t = (cur.t & for_mask); 264 | // else curMinimizer.t = (curRC.t | rev_mask); 265 | if (curMinimizer.t < activeMinimizer.t) { 266 | activeMinimizer.t = curMinimizer.t; 267 | activeMinimizer.pos = p; 268 | } 269 | curTuples[p%w] = curMinimizer; 270 | } 271 | if (nextValidWindowEnd == windowSpan ) { 272 | minimizers.push_back(activeMinimizer); 273 | } 274 | // Now scan the chromosome 275 | minTuple.t=m.t; 276 | for (p = w; p < seqLen-k+1; 
p++) { 277 | // Check if past current active minimzier 278 | ShiftOne(seq, p+k-1, m, cur); 279 | // ShiftOneRC(seq, p+k-1, k, curRC); 280 | 281 | curMinimizer.t = (cur.t & for_mask); 282 | // if ((cur.t & for_mask) < (curRC.t & for_mask)) curMinimizer.t = (cur.t & for_mask); 283 | // else curMinimizer.t = (curRC.t | rev_mask); 284 | 285 | if ( nextValidWindowEnd == p+k-1) { 286 | if ( seqMapN[seq[p+k-1]] <= 3 ) { 287 | nextValidWindowEnd++; 288 | } 289 | else { 290 | nextValidWindowStart = p+k; 291 | valid=false; 292 | if (seqLen < windowSpan) return; 293 | while (nextValidWindowStart < seqLen - windowSpan and not valid) { 294 | valid=true; 295 | for (int n=nextValidWindowStart; valid and n < nextValidWindowStart+windowSpan; n++ ) { 296 | if (seqLen < n) return; 297 | if (seqMapN[seq[n]] > 3) { 298 | nextValidWindowStart = n+1; 299 | valid=false; 300 | } 301 | } 302 | } 303 | // all n 304 | if (valid == false) { 305 | return; 306 | } 307 | nextValidWindowEnd = nextValidWindowStart + windowSpan; 308 | } 309 | } 310 | 311 | curMinimizer.pos = p; 312 | curTuples[p%w] = curMinimizer; 313 | if (p - w >= activeMinimizer.pos) { 314 | activeMinimizer = curTuples[0]; 315 | for (int j =1; j < w; j++) { 316 | if ((curTuples[j].t & for_mask) < (activeMinimizer.t & for_mask)) { 317 | activeMinimizer = curTuples[j]; 318 | } 319 | } 320 | if (nextValidWindowEnd == p+k) { 321 | minimizers.push_back(activeMinimizer); 322 | nMinimizers+=1; 323 | } 324 | } 325 | else { 326 | if ((curMinimizer.t & for_mask) < (activeMinimizer.t & for_mask)) { //TODO(Jingwen) 327 | activeMinimizer = curMinimizer; 328 | if (nextValidWindowEnd == p+k) { 329 | minimizers.push_back(activeMinimizer); 330 | nMinimizers++; 331 | } 332 | } 333 | } 334 | if (p + 1 % 10000 == 0) { 335 | cerr << p +1 << endl; 336 | } 337 | } 338 | } 339 | #endif 340 | -------------------------------------------------------------------------------- /NaiveDP.h: 
// Gap cost function: log
//
// Cost of joining two anchors, as a function of how far they drift off a
// common diagonal. Arguments are (end_x, end_y, start_x, start_y), where the
// x coordinate is the read and the y coordinate is the genome.
// Returns floor(8 * ln(|d| + 1) + 2) with d = (j - i) - (j_prime - i_prime).
//
// Declared inline because this definition lives in a header. std::abs is
// qualified so the 64-bit overload is always selected, independent of any
// using-directive in the including file.
inline int64_t
GapCost (unsigned int i, unsigned int j, unsigned int i_prime, unsigned int j_prime) {
	const int64_t endDiagonal = static_cast<int64_t>(j) - static_cast<int64_t>(i);
	const int64_t startDiagonal = static_cast<int64_t>(j_prime) - static_cast<int64_t>(i_prime);
	const int64_t drift = std::abs(endDiagonal - startDiagonal);
	const double cost = floor(8*log(static_cast<double>(drift + 1)) + 2);
	return static_cast<int64_t>(cost);
}

//-------------debug
//
// Absolute diagonal drift |(j - i) - (j_prime - i_prime)| between two anchors,
// using the same argument order as GapCost.
inline int64_t
Gaplength (unsigned int i, unsigned int j, unsigned int i_prime, unsigned int j_prime) {
	const int64_t endDiagonal = static_cast<int64_t>(j) - static_cast<int64_t>(i);
	const int64_t startDiagonal = static_cast<int64_t>(j_prime) - static_cast<int64_t>(i_prime);
	return std::abs(endDiagonal - startDiagonal);
}
int i_prime, unsigned int j_prime) { // end_x, end_y, start_x, start_y (x cordinate is read, y cordinate is genome) 62 | // some function about j-i - (j_prime - i_prime) !!!!!!!!!!!!!!!!!!! 63 | int64_t ii = (int64_t) i; 64 | int64_t jj = (int64_t) j; 65 | int64_t ii_prime = (int64_t) i_prime; 66 | int64_t jj_prime = (int64_t) j_prime; 67 | int64_t t = (jj - ii) - (jj_prime - ii_prime); 68 | 69 | float gap_score = 2; // gap openning penalty 70 | for (unsigned y = 0; y < t; y++) { 71 | gap_score = gap_score + max(1.00, 2.00 - 0.15*y); 72 | } 73 | int64_t b = (int64_t)gap_score; 74 | return b; 75 | } 76 | 77 | */ 78 | 79 | // Debug code 80 | void 81 | SaveOriginalSeed (Cluster &rCr, FILE* fh, int k) { 82 | if (fh == NULL) { 83 | perror("Eorror opening file: "); 84 | } 85 | else { 86 | for (vector::iterator it = rCr.matches.begin(); it != rCr.matches.end(); ++it) { 87 | fprintf(fh, "%u %u %u %u\n", (*it).first.pos, (*it).second.pos, (*it).first.pos + k - 1, (*it).second.pos + k - 1); 88 | } 89 | } 90 | } 91 | 92 | 93 | void 94 | SavetupChain(vector &rCr, FILE* fh, int k) { 95 | if (fh == NULL) { 96 | perror("Eorror opening file: "); 97 | } 98 | else { 99 | for (vector::iterator it = rCr.begin(); it != rCr.end(); ++it) { 100 | fprintf(fh, "%u %u %u %u\n", (*it).first.pos, (*it).second.pos, (*it).first.pos + k - 1, (*it).second.pos + k - 1); 101 | } 102 | } 103 | } 104 | 105 | 106 | // Debug code 107 | void 108 | SaveseedSet (IndSeedSet &seedSet, FILE* fd) { 109 | if (fd == NULL) { 110 | perror("Error opening file: "); 111 | } 112 | else { 113 | for (TIterator tt = begin(seedSet, seqan::Standard()); tt != end(seedSet, seqan::Standard()); ++tt) { 114 | fprintf(fd, "%lu %lu %lu %lu\n", beginPositionH(*tt), beginPositionV(*tt), endPositionH(*tt), endPositionV(*tt)); // (read_start, genome_start,read_end, genome_end) 115 | } 116 | } 117 | } 118 | 119 | 120 | 121 | // Debug code 122 | void 123 | SaveSparse (seqan::String &chain, FILE* fi) { 124 | if (fi == NULL) { 125 | 
perror("Error opening file: "); 126 | } 127 | else { 128 | for (unsigned i = 0; i < length(chain); ++i) { 129 | fprintf(fi, "%lu %lu %lu %lu\n", beginPositionH(chain[i]), beginPositionV(chain[i]), endPositionH(chain[i]), endPositionV(chain[i])); 130 | } 131 | } 132 | } 133 | 134 | 135 | 136 | template 137 | void NaiveDP (TSeedSet &seedSet, seqan::String &chain) { 138 | seqan::String seeds; 139 | seqan::resize(seeds, seqan::length(seedSet)); 140 | std::copy(seedSet._seeds.begin(), seedSet._seeds.end(), seqan::begin(seeds, seqan::Standard())); 141 | // 142 | //std::cout << "length(seeds): " << length(seeds) << std::endl; 143 | //--------------------------------------------------------------------------------------------- 144 | // Step 1: generate the sorted list of interval points 145 | // -------------------------------------------------------------------------------------------- 146 | 147 | typedef seqan::Triple TIntervalPoint; 148 | typedef seqan::String TIntervalPoints; 149 | typedef typename seqan::Iterator::Type TIntervalPointsIterator; 150 | TIntervalPoints intervalPoints; //intervalPoints contains all the start/end points of seeds 151 | vector qualityOfChainEndingIn(seqan::length(seeds)); 152 | vector predecessor(seqan::length(seeds)); 153 | 154 | 155 | for (unsigned i = 0; i < seqan::length(seeds); ++i) { 156 | qualityOfChainEndingIn[i] = seqan::seedSize(seeds[i]); 157 | predecessor[i] = std::numeric_limits::max(); 158 | seqan::appendValue(intervalPoints, TIntervalPoint(beginPositionH(seeds[i]), true, i)); 159 | seqan::appendValue(intervalPoints, TIntervalPoint(endPositionH(seeds[i]), false, i)); 160 | } 161 | std::sort(seqan::begin(intervalPoints, seqan::Standard()), seqan::end(intervalPoints, seqan::Standard())); // end goes before start if their positions are the same 162 | 163 | //debug code 164 | //cout << "length(seeds): " << length(seeds) << endl; 165 | //cout << "length(intervalPoints): " << length(intervalPoints) << endl; 166 | /* 167 | for 
(TIntervalPointsIterator it = seqan::begin(intervalPoints, seqan::Standard()); it != seqan::end(intervalPoints, seqan::Standard()); ++it) { 168 | if (it->i2 == true) { 169 | cout << "*it-true: " << *it << endl; 170 | } 171 | else { 172 | cout << "*it--false: " << *it << endl; 173 | } 174 | } 175 | 176 | 177 | */ 178 | 179 | //cout << "step 1 is finished " << endl; 180 | // --------------------------------------------------------------------------------------- 181 | // Step 2: bulid the chain 182 | // ---------------------------------------------------------------------------------------- 183 | // build a list of "intermediate solutions" 184 | // Each solution is represented by the triple (value of best chain so far, endPosition in dim(Genome), last seed of the chain) 185 | 186 | typedef seqan::Triple TIntermediateSolution; 187 | typedef std::multiset TIntermediateSolutions; // Elements in multiset are in ascending order 188 | typedef typename TIntermediateSolutions::iterator TIntermediateSolutionsIterator; 189 | 190 | // For all interval points..... 191 | TIntermediateSolutions intermediateSolutions; 192 | // vector intermediateSolutions; //(seqan::length(seeds)); 193 | 194 | for (TIntervalPointsIterator it_k = seqan::begin(intervalPoints); it_k != seqan::end(intervalPoints); ++it_k) { 195 | TSeed const & seed_k = seeds[it_k->i3]; 196 | 197 | 198 | 199 | if (it_k->i2) { // It's a begin point 200 | // Find the seed j so that seed j's Genome.cordinate is less or equal to the beginPositionV of seed_k 201 | 202 | /* 203 | TIntermediateSolution referenceSolution(beginPositionV(seed_k), std::numeric_limits::max(), std::numeric_limits::max()); 204 | TIntermediateSolutionsIterator it_q = intermediateSolutions.upper_bound(referenceSolution); // the beginPositionV of it_q is larger than the beginPositionV 205 | 206 | // STl gives us upper_bound which returns a pointer to the first one that compares greater than the reference one. 
207 | // Special case: If intermediateSolutions is empty or there is no chain that ends before seed_k begins 208 | if (intermediateSolutions.empty() || it_q == intermediateSolutions.begin()) { 209 | continue; 210 | } 211 | */ 212 | 213 | if (intermediateSolutions.empty()) { 214 | continue; 215 | } 216 | 217 | 218 | 219 | unsigned j = 0; 220 | // cerr << "intermediateSolutions.size(): " << intermediateSolutions.size() << endl; 221 | 222 | 223 | int64_t quality = qualityOfChainEndingIn[it_k->i3]; // quality stores the current maximum 224 | for (TIntermediateSolutionsIterator it_j = intermediateSolutions.begin(); it_j != intermediateSolutions.end(); ++it_j) { // it_j->i1 <= beginPositionV(seed_k) 225 | //cout << "endPositionH(seeds[it_j->i3]: " << endPositionH(seeds[it_j->i3]) << " " << endPositionV(seeds[it_j->i3]) << " " 226 | // << beginPositionH(seed_k) << " " << beginPositionV(seed_k) << endl; 227 | 228 | 229 | if (beginPositionV(seed_k) >= it_j->i2 && quality <= qualityOfChainEndingIn[it_k->i3] + it_j->i1 - 230 | GapCost(endPositionH(seeds[it_j->i3]), endPositionV(seeds[it_j->i3]), beginPositionH(seed_k), beginPositionV(seed_k))) { // Jingwen: Is it "<=" or "<" 231 | 232 | quality = qualityOfChainEndingIn[it_k->i3] + it_j->i1 - 233 | GapCost(endPositionH(seeds[it_j->i3]), endPositionV(seeds[it_j->i3]), beginPositionH(seed_k), beginPositionV(seed_k)); 234 | 235 | predecessor[it_k->i3] = it_j->i3; 236 | 237 | } 238 | } 239 | qualityOfChainEndingIn[it_k->i3] = quality; 240 | 241 | 242 | } 243 | else { // It's an end point 244 | TIntermediateSolution intermediate_k(qualityOfChainEndingIn[it_k->i3], endPositionV(seeds[it_k->i3]), it_k->i3); 245 | intermediateSolutions.insert(intermediate_k); 246 | } 247 | } 248 | 249 | //cout << "step 2 is finished " << endl; 250 | // ------------------------------------------------------------------------------------------- 251 | // Step 3: Write out the resulting chain 252 | // 
// Bit flags combined into Options::refineLevel.
const unsigned int REF_LOC=1;
const unsigned int REF_DYN=2;
const unsigned int REF_DP=4;

//
// Plain bag of run-time parameters. The default constructor assigns every
// numeric, boolean and enum member; the std::string members are either
// assigned explicitly or left empty by their own default constructor.
//
class Options {
public:
	enum AlignType { ont, clr, ccs, contig};
	int globalK;
	int localK;
	int globalW;
	int localW;
	int globalMaxFreq;
	int localMaxFreq;
	int maxDiag;
	int cleanMaxDiag;
	int minClusterSize;
	int minClusterLength;
	int window;
	bool dotPlot;
	bool mergeClusters;
	bool mergeGapped;
	int minDiagCluster;
	int minRefinedClusterSize;
	bool viewPairwise;
	bool hardClip;
	std::string printFormat;
	//int bestn;
	bool storeAll;
	int nproc;
	std::string outfile;
	std::string outsvfile;
	int maxCandidates;
	int refineLevel;
	bool doBandedAlignment;
	int maxGap;
	int maxGapBtwnAnchors;
	int localIndexWindow;
	bool NaiveDP;
	bool SparseDP;
	bool LookUpTable;
	int readStart;
	int readStride;
	bool seqan;
	int localMatch;
	int localMismatch;
	int localIndel;
	int localBand;
	int refineBand;
	int MergeSplit;
	int flagRemove;
	int maxRemovePairedIndelsLength; // if an anchor's length is larger than this parameter,
	                                 // then even if it has paired indels before and after it, we do not delete this anchor.
	int maxRemoveSpuriousAnchorsDist;
	int minRemoveSpuriousAnchorsNum;
	int minRemoveSpuriousAnchorsLength;
	int NumAln;
	int BtnSubClusterswindow;
	int binLength;
	int minBinNum;
	bool HighlyAccurate;
	int splitdist;
	float firstcoefficient;
	float secondcoefficient;
	//int minimizerFreq;
	int NumOfminimizersPerWindow;
	bool Printsvsig;
	int svsigLen;
	float alnthres;
	std::string timing;
	int PrintNumAln;
	AlignType readType;
	bool storeTiming;
	int sseBand;
	int globalWinsize;
	int minUniqueStretchNum;
	int minUniqueStretchDist;
	float slope;
	float rate_FirstSDPValue;
	float rate_value;
	long maxDrift;
	int minTightCluster;
	bool RefineBySDP;
	int refineSpaceDiag;
	float initial_anchorbonus;
	float anchorstoosparse;
	bool passthroughtag;
	bool CheckTrueIntervalInFineCluster;
	bool CalculateMinimizerStats;
	bool skipBandedRefine;
	std::string readname;
	int merge_dist;
	bool SkipLocalMinimizer;
	bool SkipClusering;
	bool RemovePairedIndels;
	bool RemoveSpuriousAnchors;
	float second_anchorbonus;
	int RoughClustermaxGap;
	int SecondCleanMinDiagCluster;
	int SecondCleanMaxDiag;
	bool debug;
	int anchorPerlength;
	int punish_anchorfreq;
	int cleanClustersize;
	bool bypassClustering;
	bool refineEnd;
	int refineSpaceDist;
	bool ExtractDiagonalFromClean;
	bool trimrefine;
	bool limitrefine;
	int freeGap;
	int maxP;
	float gapopen;
	float gapextend;
	float gaproot;
	int gapCeiling1;
	int gapCeiling2;
	bool showmm;
	bool printMD;
	bool refineBreakpoint;

	// Assign the built-in defaults.
	Options() {
		showmm=true;
		printMD=false;
		refineBreakpoint=false;
		freeGap=10;
		maxP=2000;
		skipBandedRefine=false;
		storeTiming=false;
		readType=Options::ont;
		localMatch=4;
		localMismatch=-3;
		localIndel=-4;
		localBand=15;
		refineBand=7;
		sseBand=30;
		readStart=0;
		readStride=1;
		dotPlot=false;
		globalK=17;
		globalW=10;
		localK=7;
		localW=5;
		//bestn=1;
		globalMaxFreq=50;
		localMaxFreq=30;
		maxDiag=500; // We want maxDiag to be a small number (used to be 500) //// For CCS, need to be smaller!!! //// lots of unmapped reads due to 500;
		cleanMaxDiag=100;
		minDiagCluster=10; // used to be 20
		                   // This parameter is used in CleanOffDiagonal function; It's better not to set it to a single value.
		                   // This parameter is used in another CleanOFFDiagonal function
		                   // This parameter can be deleted here

		minClusterSize=2;
		minClusterLength=50; // For CCS, need to be larger!(200)
		minRefinedClusterSize=40;
		window=100;
		mergeGapped=false;
		viewPairwise=false;
		hardClip=false;
		printFormat="p";
		storeAll=false;
		nproc=1;
		outfile="";
		outsvfile="";
		maxCandidates=10;
		doBandedAlignment=true;
		refineLevel= REF_LOC | REF_DYN | REF_DP;
		maxGap=5000; // 5000
		maxGapBtwnAnchors=1000; // no larger than 2000 // used to be 1500 // 1000
		mergeClusters=true;
		NaiveDP=false;
		seqan=false;
		SparseDP=true;
		LookUpTable=true;
		MergeSplit=1; // int member; the original assigned `true`
		flagRemove=0;
		maxRemovePairedIndelsLength=500; // used to be 50
		//maxRemoveSpuriousAnchorsDist=200;
		//minRemoveSpuriousAnchorsNum=15;
		maxRemoveSpuriousAnchorsDist=500;
		minRemoveSpuriousAnchorsNum=10;
		// This member used to be left uninitialized because its assignment was
		// commented out together with the two above. 100 is the only value the
		// file ever assigned to it.
		// NOTE(review): confirm 100 is still the intended default.
		minRemoveSpuriousAnchorsLength=100;
		NumAln = 3;
		PrintNumAln = 1;
		BtnSubClusterswindow = 800;
		binLength = 20000;
		minBinNum = 3;
		HighlyAccurate = false;
		splitdist = 50000;
		firstcoefficient = 18;
		secondcoefficient = 12;
		//minimizerFreq = 50;
		NumOfminimizersPerWindow = 5;
		Printsvsig=false;
		svsigLen = 25;
		alnthres = 0.7f;
		timing="";
		localIndexWindow=256;
		globalWinsize = 16;
		minUniqueStretchNum = 1;
		minUniqueStretchDist = 50;
		slope=1;
		rate_FirstSDPValue=0.2;
		rate_value=0.8;
		maxDrift=400;
		minTightCluster=10;
		RefineBySDP=true;
		refineSpaceDiag=5;
		initial_anchorbonus=1.0;
		anchorstoosparse=0.01;
		passthroughtag=false;
		CheckTrueIntervalInFineCluster=false;
		CalculateMinimizerStats=false;
		merge_dist=100;
		SkipLocalMinimizer=false;
		SkipClusering=false;
		RemovePairedIndels=true;
		RemoveSpuriousAnchors=true;
		second_anchorbonus=2;
		RoughClustermaxGap=1000;
		SecondCleanMinDiagCluster=40;
		SecondCleanMaxDiag=10;
		debug=false;
		anchorPerlength=10;
		punish_anchorfreq=10;
		cleanClustersize=100;
		bypassClustering=false;
		refineEnd=false;
		refineSpaceDist=10000;
		ExtractDiagonalFromClean=false;
		trimrefine=false;
		limitrefine=true;
		gapopen=2;
		gapextend=10;
		gaproot=2.0;
		gapCeiling1=1500;
		gapCeiling2=3000;
	}
};
// Traceback arrows. Enumerator order, and therefore each numeric value, is
// unchanged; the explicit values on group leaders only make that visible.
enum Arrow {
	// Basic moves.
	Diagonal = 0,
	Up,
	Left,
	// Affine insertion.
	AffineInsUp = 3,
	AffineInsOpen,
	AffineInsClose,
	// Affine deletion.
	AffineDelLeft = 6,
	AffineDelOpen,
	AffineDelClose,
	// Affine HP insertion.
	AffineHPInsUp = 9,
	AffineHPInsOpen,
	AffineHPInsClose,
	NoArrow = 12,
	DiagonalXYZ = 13,
	// imply diagonal yz/xz/xy
	InsertX = 14,
	InsertY,
	InsertZ,
	// imply insertion of Z/X/Y,
	DiagonalXY = 17,
	DiagonalYZ,
	DiagonalXZ,
	//
	// These are used to denote an affine gap has been closed
	// from a different matrix. This is used in OneGap alignment.
	//
	AffineLongDelLeft = 20,
	AffineLongDelClose,
	AffineLongIns,
	AffineLongInsClose,
	Star = 24
};

// Labels for the alignment matrices.
enum MatrixLabel {
	Match = 0,
	AffineHPIns,
	AffineIns,
	AffineDel,
	AffineHPDel
};
affine gap has been closed 14 | // from a different matrix. This is used in OneGap alignment. 15 | // 16 | AffineLongDelLeft, AffineLongDelClose, 17 | AffineLongIns, AffineLongInsClose, 18 | Star 19 | }; 20 | 21 | enum MatrixLabel {Match, AffineHPIns, AffineIns, AffineDel, AffineHPDel}; 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /Point.h: -------------------------------------------------------------------------------- 1 | #ifndef POINT_H_ 2 | #define POINT_H_ 3 | 4 | #include "Types.h" 5 | 6 | class Point 7 | { 8 | public: 9 | 10 | Pair se; //store the coordinates of a point (q, t) 11 | bool orient; // if orient = 0 means reverse orientated anchor 12 | bool ind; // ind = 1 means this is a start; ind = 0 means this is an end 13 | bool inv; // inv = 1 means this is forward directiion; inv = 0 means this is a backward direction 14 | unsigned int frag_num; // store the index of the fragment that contains this point 15 | int clusterNum; // store the index of the Cluster which the current point comes from; 16 | // int matchstartNum; 17 | Point(unsigned int & frag_num1); 18 | Point() {orient = 0; ind = 0; inv = 0;}; 19 | ~Point() {}; 20 | 21 | friend std::ostream & operator<<(std::ostream & os, const Point & t); // overload of operator << 22 | }; 23 | 24 | 25 | Point::Point(unsigned int & frag_num1) { 26 | frag_num = frag_num1; 27 | orient = 0; ind = 0; inv = 0; 28 | } 29 | 30 | std::ostream & operator<<(std::ostream & os, const Point & M) { 31 | os << "Point: { Pair:" << M.se << "; ind: " << M.ind << "; frag_num: " << M.frag_num << "\n"; 32 | return os; 33 | } 34 | 35 | 36 | #endif -------------------------------------------------------------------------------- /PrioritySearchTree.h: -------------------------------------------------------------------------------- 1 | #ifndef PRIORITY_SEARCH_TREE_H_ 2 | #define PRIORITY_SEARCH_TREE_H_ 3 | #include 4 | 5 | /* 6 | * Define a priority search tree on a point that 
//
// One vertex of the array-backed priority search tree. A default-constructed
// vertex is a non-leaf with both child indices 0, both keys 0, and -1 in
// pointIndex, maxScoreNode, fx and fy.
//
template <typename T_Point>
class PSTVertex {
public:
	unsigned int leftChildIndex;
	unsigned int rightChildIndex;
	unsigned int isALeaf;
	typename T_Point::KeyType medianKey;
	typename T_Point::KeyType maxKey;
	int pointIndex;
	int maxScoreNode;
	int fx,fy;

	PSTVertex()
		: leftChildIndex(0),
		  rightChildIndex(0),
		  isALeaf(0),
		  pointIndex(-1),
		  maxScoreNode(-1),
		  fx(-1),
		  fy(-1) {
		// The keys are assigned rather than constructed from 0, exactly as
		// before, so KeyType only needs to be assignable from an int.
		maxKey = 0;
		medianKey = 0;
	}
};
there are only 70 | // two points below. 71 | // 72 | 73 | int medianIndex = GetMedianIndex(start, end); 74 | int curVertexIndex = iterativeIndex; 75 | (*treePtr)[curVertexIndex].medianKey = points[medianIndex].GetKey(); 76 | 77 | if (end == start) { 78 | // No children for this node, done. 79 | (*treePtr)[curVertexIndex].pointIndex = start; 80 | (*treePtr)[curVertexIndex].fx=points[medianIndex].x; 81 | (*treePtr)[curVertexIndex].fy=points[medianIndex].y; 82 | return (*treePtr)[curVertexIndex].medianKey; 83 | } 84 | // 85 | // Check to see if the current 86 | // node is a leaf node. No recursion on this node. 87 | // 88 | 89 | if (end - start == 1) { 90 | (*treePtr)[curVertexIndex].isALeaf = 1; 91 | (*treePtr)[curVertexIndex].medianKey = points[start].GetKey(); 92 | (*treePtr)[curVertexIndex].pointIndex = start; 93 | (*treePtr)[curVertexIndex].fx=points[medianIndex].x; 94 | (*treePtr)[curVertexIndex].fy=points[medianIndex].y; 95 | 96 | // 97 | // Return the key of this vertex. The parent 98 | // will know what to do with it. If this is 99 | // a left child, the parent will use the key to 100 | // distinguish what is on the left side of the branches. 101 | // If it is the right side of a (*treePtr), it is ignored. 102 | // 103 | return (*treePtr)[curVertexIndex].medianKey; 104 | } 105 | else { 106 | // 107 | // This vertex contains at least two children, so it is not 108 | // a leaf. Recurse assigning leaves. 109 | // 110 | 111 | (*treePtr)[curVertexIndex].isALeaf = 0; 112 | (*treePtr)[curVertexIndex].leftChildIndex = ++iterativeIndex; 113 | typename T_Point::KeyType leftTreeKey, rightTreeKey; 114 | leftTreeKey = CreateTree(points, start, medianIndex, iterativeIndex); 115 | 116 | // 117 | // The leftTreeKey separates the branches BELOW this vertex. 
118 | // 119 | (*treePtr)[curVertexIndex].medianKey = leftTreeKey; 120 | 121 | (*treePtr)[curVertexIndex].rightChildIndex = ++iterativeIndex; 122 | rightTreeKey = CreateTree(points, medianIndex, end, iterativeIndex); 123 | // 124 | // The rightTreeKey will separate the parent's left tree from the right. 125 | // 126 | (*treePtr)[curVertexIndex].maxKey = rightTreeKey; 127 | return rightTreeKey; 128 | } 129 | } 130 | 131 | int FindIndexOfMaxPoint(int curVertexIndex, 132 | vector &points, typename T_Point::KeyType maxKey, int &maxPointValue, int &maxPointIndex) { 133 | 134 | // 135 | // Attempt to find the leaf vertex beneath this vertex that has the largest 136 | // score, with a key less than max key. 137 | // 138 | // On return: 139 | // Return 1 if a value is assigned to maxPointValue, 0 otherwise. 140 | // If a value is assigned to maxPointValue, this sets: 141 | // maxPointValue is the score of the maximum point. 142 | // maxPointIndex the index of the point in 'points' that has the maximum score. 143 | // 144 | 145 | // 146 | // The vertex at curVertexIndex has a max score node beneath it, if it has been 147 | // initialized. If the maxScoreNode has a key less than the current maxKey, then we 148 | // know the maximum value is contained beneath this vertex, AND that its key 149 | // is within the range in the rage maximum query. 150 | // That means that there is no need to continue the search below here. 
151 | // 152 | if ((*treePtr)[curVertexIndex].maxScoreNode == -1) { 153 | return 0; 154 | } 155 | if (points[(*treePtr)[curVertexIndex].maxScoreNode].GetKey() < maxKey) { 156 | if (points[(*treePtr)[curVertexIndex].maxScoreNode].score > maxPointValue) { 157 | maxPointValue = points[(*treePtr)[curVertexIndex].maxScoreNode].score; 158 | maxPointIndex = (*treePtr)[curVertexIndex].maxScoreNode; 159 | return 1; 160 | } 161 | else { 162 | return 0; 163 | } 164 | } 165 | // 166 | // Otherwise, the maximum scoring node beneath this node has a key greater than 167 | // the max key. That means that the search must continue for the maximum value 168 | // node with a key less than 'maxKey'. The search has two cases: 169 | // 170 | // First, if the median key of this node is greater than the maxKey, all keys 171 | // on the right side of the tree are greater than maxKey, so do not search there. 172 | // 173 | // If the median key of this node si less than maxKey, there may be a node on the left 174 | // or right child of the current node with a maximum key. Search both to the 175 | // left and right. 176 | // 177 | else { 178 | if (!(*treePtr)[curVertexIndex].isALeaf) { 179 | if (maxKey <= (*treePtr)[curVertexIndex].medianKey) { 180 | return FindIndexOfMaxPoint((*treePtr)[curVertexIndex].leftChildIndex, points, maxKey, maxPointValue, maxPointIndex); 181 | } 182 | else { 183 | int foundValueLeft, foundValueRight; 184 | foundValueLeft = FindIndexOfMaxPoint((*treePtr)[curVertexIndex].leftChildIndex, points, maxKey, maxPointValue, maxPointIndex); 185 | foundValueRight = FindIndexOfMaxPoint((*treePtr)[curVertexIndex].rightChildIndex, points, maxKey, maxPointValue, maxPointIndex); 186 | return (foundValueLeft or foundValueRight); 187 | } 188 | } 189 | else { 190 | // 191 | // The current node is a leaf node, but due to the condition from before, its key 192 | // is greater than or equal to the max key, therefore its score cannot 193 | // be used for the maximum score. 
194 | // Returning 0 here signifies that this search-branch did not turn up any candidates for 195 | // the maximum scoring node. 196 | return 0; 197 | } 198 | } 199 | } 200 | 201 | public: 202 | PrioritySearchTree() { 203 | treePtr = NULL; 204 | } 205 | 206 | void CreateTree(vector &points, vector > *bufTreePtr=NULL) { 207 | /* 208 | * Precondition: points is sorted according to key. 209 | */ 210 | 211 | // 212 | // The tree is a binary tree containing all the points. The 213 | // perfectly balanced tree is of maximum size points.size()-1, so 214 | // go ahead and preallocate that now. 215 | // 216 | if (bufTreePtr != NULL) { 217 | treePtr = bufTreePtr; 218 | } 219 | else { 220 | treePtr = &tree; 221 | } 222 | treePtr->resize((points.size() * 2) - 1); 223 | unsigned int curVertexIndex = 0; 224 | CreateTree(points, 0, points.size(), curVertexIndex); 225 | } 226 | 227 | 228 | // 229 | // Implement the tree as an array of interior nodes. 230 | // Since there is already space allocated for the 231 | 232 | 233 | int FindPoint(typename T_Point::KeyType pointKey, int curVertexIndex, int &pointVertexIndex) { 234 | if ((*treePtr)[curVertexIndex].isALeaf) { 235 | pointVertexIndex = curVertexIndex; 236 | return (*treePtr)[curVertexIndex].medianKey == pointKey; 237 | } 238 | else { 239 | if (pointKey <= (*treePtr)[curVertexIndex].medianKey) { 240 | return FindPoint(pointKey, (*treePtr)[curVertexIndex].leftChildIndex, pointVertexIndex); 241 | } 242 | else { 243 | return FindPoint(pointKey, (*treePtr)[curVertexIndex].rightChildIndex, pointVertexIndex); 244 | } 245 | } 246 | } 247 | 248 | void Activate(vector &points, int pointIndex) { 249 | int pointScore = points[pointIndex].score; 250 | // Now, update the pMax scores in the tree 251 | 252 | int curVertexIndex = 0; 253 | typename T_Point::KeyType pointKey = points[pointIndex].GetKey(); 254 | unsigned int itIndex = 0; 255 | while (pointIndex != -1 and 256 | (*treePtr)[curVertexIndex].isALeaf == 0) { 257 | assert(itIndex < 
(*treePtr).size()); 258 | if ((*treePtr)[curVertexIndex].maxScoreNode == -1 or 259 | points[(*treePtr)[curVertexIndex].maxScoreNode].score <= pointScore) { 260 | int tmpPMaxIndex = (*treePtr)[curVertexIndex].maxScoreNode; 261 | (*treePtr)[curVertexIndex].maxScoreNode = pointIndex; 262 | pointIndex = tmpPMaxIndex; 263 | } 264 | 265 | if (pointKey <= (*treePtr)[curVertexIndex].medianKey) { 266 | curVertexIndex = (*treePtr)[curVertexIndex].leftChildIndex; 267 | } 268 | else { 269 | curVertexIndex = (*treePtr)[curVertexIndex].rightChildIndex; 270 | } 271 | 272 | // Keep track of the number of times this loop is executed... an 273 | // infinite loop will bomb. 274 | ++itIndex; 275 | } 276 | } 277 | 278 | int FindIndexOfMaxPoint(vector &points, typename T_Point::KeyType maxPointKey, int &maxPointIndex) { 279 | // start at the root 280 | int curVertexIndex = 0; 281 | if ((*treePtr)[curVertexIndex].maxScoreNode == -1) { 282 | // 283 | // This case can only be hit if none of the points have been 284 | // activated. 285 | // 286 | return 0; 287 | } 288 | int maxPointValue = -1; 289 | return FindIndexOfMaxPoint(0, points, maxPointKey, maxPointValue, maxPointIndex); 290 | } 291 | }; 292 | 293 | #endif 294 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ![](image/logo_small.png) 3 | 4 | 5 | ## Getting Started 6 | 7 | - [Getting Started](#started) 8 | ```sh 9 | # index reference genome 10 | lra index -CCS/CLR/ONT/CONTIG ref.fa 11 | # Map sequence to reference 12 | lra align -CCS/CLR/ONT/CONTIG ref.fa read.fa -t 16 -p s > output.sam 13 | ``` 14 | ## Table of Contents 15 | 16 | - [Introduction](#intro) 17 | - [Users' Guide](#uguide) 18 | - [Installation](#install) 19 | - [Index reference](#index) 20 | - [Align reads/contigs to reference](#align) 21 | - [Output format](#output) 22 | 23 | ## Introduction 24 | 25 |

26 |

27 | 28 |
29 | lra is a sequence alignment program that aligns long reads from single-molecule sequencing (SMS) instruments, or megabase-scale contigs from SMS assemblies. lra implements seed chaining sparse dynamic programming with a concave gap function for read and assembly alignment, which is also extended to allow for inversion cases. The lra alignment approach increases sensitivity and specificity for SV discovery, particularly for variants above 1kb and when discovering variation from ONT reads, while having a runtime that is comparable (1.05-3.76×) to current methods. When applied to calling variation from *de novo* assembly contigs, there is a 3.2% increase in Truvari F1 score compared to minimap2+htsbox. 30 | 31 | 32 | ## Users' Guide 33 | 34 | ### Installation 35 | 36 | Install lra with bioconda: `conda install -c bioconda lra` 37 | 38 | Install lra from github or from a release: the dependencies are [zlib][zlib] and [htslib][htslib]. 39 | Users can install `zlib` and `htslib` through conda and build lra in a conda environment. 40 | 1. `conda activate env`; 41 | 2. Install the dependencies: `conda install -c bioconda htslib` and `conda install -c anaconda zlib`; 42 | 3. Get the latest released source code from github `wget https://github.com/ChaissonLab/lra/archive/VX.XX.tar.gz && tar -xvf VX.XX.tar.gz && cd lra-X.XX/ && make`. 43 | Or get the source code directly from the master branch `git clone --recursive https://github.com/ChaissonLab/lra.git -b master && cd lra && make`. You are all set for the installation! 44 | 45 | 46 | 47 | ### Index reference 48 | lra needs to first build two-tiered minimizer indexes (global and local) for the reference before mapping. Both can be built at once using the command: 49 | ``` 50 | lra index -CCS/CLR/ONT/CONTIG ref.fa 51 | ``` 52 | lra has different parameter settings for the index when aligning reads from different sequencing instruments (CCS/CLR/ONT/CONTIG). You can also customize the parameters. For details, see `lra index --help`. 
lra takes a few minutes to index the human reference genome. 53 | 54 | Alternatively the global and local indexes may be built separately: 55 | ``` 56 | lra global -CCS/CLR/ONT/CONTIG ref.fa 57 | lra local -CCS/CLR/ONT/CONTIG ref.fa 58 | ``` 59 | 60 | ### Align reads/contigs to reference 61 | lra takes reads in fasta, fastq or bam format in the mapping step. The output format can be SAM, PAF, BED or pairwise alignment. For details, see `lra align --help`. The number of threads can be specified with `-t`. lra uses the same base algorithm for mapping all datatypes, with different parameter settings. It is recommended to choose among `CCS/CLR/ONT/CONTIG` based on the accuracy and average length of the input reads. 62 | 63 | ``` 64 | lra align -CCS/CLR/ONT/CONTIG ref.fa read.fa -t 16 -p s > output.sam 65 | lra align -CCS/CLR/ONT/CONTIG ref.fa read.fa -t 16 -p p > output.paf 66 | lra align -CCS/CLR/ONT/CONTIG ref.fa read.fa -t 16 -p b > output.bed 67 | ``` 68 | 69 | If you have read.fa.gz, you may pipe the decompressed reads to lra. 70 | 71 | ``` 72 | zcat read.fa.gz | lra align -CCS ref.fa /dev/stdin -t 16 -p s > output.sam 73 | ``` 74 | 75 | ### Output format 76 | 77 | lra uses a set of customized tags in SAM and PAF output. 78 | 79 | |Tag|Type |Description | 80 | |:--|:----:|:-------------------------------------------------------------------| 81 | |NM |i |Number of mismatches + insertions + deletions in the alignment. | 82 | |NX |i |Number of mismatches in the alignment. | 83 | |ND |i |Number of bases of deletions in the alignment. | 84 | |TD |i |Number of deletions in the alignment. | 85 | |NI |i |Number of bases of insertions in the alignment. | 86 | |TI |i |Number of insertions in the alignment. | 87 | |NV |f |The alignment score. | 88 | |TP |A |Type of alignment: P/primary, S/secondary, I/inversion. | 89 | |RT |i |Runtime. | 90 | |CG |z |CIGAR string. 
| 91 | |AO |i |This number shows the order of the aligned segment when a read is split.| 92 | 93 | 94 | [zlib]: http://zlib.net/ 95 | [htslib]: https://github.com/samtools/htslib/ 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /Read.h: -------------------------------------------------------------------------------- 1 | #ifndef READ_H_ 2 | #define READ_H_ 3 | 4 | #include 5 | using namespace std; 6 | class Read { 7 | public: 8 | char *seq; 9 | char *qual; 10 | int length; 11 | 12 | char *passthrough; 13 | string name; 14 | int flags; 15 | bool unaligned; 16 | void Clear() { 17 | if (seq != NULL) { 18 | delete[] seq; 19 | seq=NULL; 20 | } 21 | if (qual != NULL) { 22 | delete[] qual; 23 | qual=NULL; 24 | } 25 | if (passthrough != NULL) { 26 | delete[] passthrough; 27 | passthrough=NULL; 28 | } 29 | length=0; 30 | name=""; 31 | } 32 | Read() { 33 | seq=NULL; 34 | length=0; 35 | qual=NULL; 36 | name=""; 37 | passthrough=NULL; 38 | flags=0; 39 | unaligned=0; 40 | } 41 | Read(char* _sq, int _len, string _name, char*_qual=NULL) { 42 | seq=_sq; 43 | length=_len; 44 | qual=_qual; 45 | passthrough=NULL; 46 | name=_name; 47 | unaligned=0; 48 | } 49 | Read& operator=(const Read& rhs) { 50 | length=rhs.length; 51 | seq = NULL; 52 | qual= NULL; 53 | unaligned=0; 54 | if (rhs.length > 0) { 55 | if (rhs.seq != NULL) { 56 | seq = new char[length]; 57 | memcpy(seq, rhs.seq, length); 58 | } 59 | if (rhs.qual != NULL) { 60 | int qualLen=strlen(rhs.qual); 61 | qual = new char[qualLen+1]; 62 | memcpy(qual, rhs.qual, qualLen); 63 | qual[qualLen] = '\0'; 64 | } 65 | } 66 | if (rhs.passthrough != NULL) { 67 | passthrough=new char[strlen(rhs.passthrough)]; 68 | memcpy(passthrough, rhs.passthrough, strlen(rhs.passthrough)); 69 | } 70 | 71 | name=rhs.name; 72 | return *this; 73 | } 74 | }; 75 | 76 | #endif 77 | 
-------------------------------------------------------------------------------- /RefineBreakpoint.h: -------------------------------------------------------------------------------- 1 | #ifndef _REFINE_Breakpoint_H_ 2 | #define _REFINE_Breakpoint_H_ 3 | #include 4 | #include "AlignmentBlock.h" 5 | 6 | void PrependBlocks(vector &src, vector &dest) { 7 | if (src.size() == 0) { return;} 8 | if (dest.size() == 0) { dest = src; return;} 9 | 10 | int last=src.size()-1; 11 | // If the last match is a gapless extension, that can mess up some cigar parsing. 12 | if (src[last].tPos + src[last].length == dest[0].tPos and 13 | src[last].qPos + src[last].length == dest[0].qPos) { 14 | dest[0].tPos -= src[last].length; 15 | dest[0].qPos -= src[last].length; 16 | dest[0].length += src[last].length; 17 | src.resize(last); 18 | } 19 | 20 | dest.resize(dest.size()+ src.size()); 21 | for (int i=dest.size(); i > src.size(); i--) { 22 | dest[i-1] = dest[i-src.size()-1]; 23 | } 24 | for (int i=0; i < src.size(); i++) { 25 | dest[i] = src[i]; 26 | } 27 | } 28 | 29 | void AppendBlocks(vector &src, vector &dest) { 30 | if (src.size() == 0) { return;} 31 | if (dest.size() == 0) { dest = src; return;} 32 | 33 | int last=dest.size()-1; 34 | // If the last match is a gapless extension, that can mess up some cigar parsing. 
35 | int srcStart=0; 36 | if (dest[last].tPos + dest[last].length == src[0].tPos and 37 | dest[last].qPos + dest[last].length == src[0].qPos) { 38 | dest[last].length += src[0].length; 39 | srcStart=1; 40 | } 41 | int destEnd=dest.size(); 42 | dest.resize(dest.size() + src.size() - srcStart); 43 | for (int i=srcStart; i < src.size(); i++) { 44 | dest[destEnd] = src[i]; 45 | destEnd++; 46 | } 47 | } 48 | 49 | 50 | void PathToBlocks(vector &path, vector &blocks) { 51 | int i=0; 52 | int left=1,down=2,diag=3; 53 | int q=0, t=0; 54 | while (i < path.size() and path[i] != diag and (path[i] == left or path[i] == down)) { 55 | if (path[i] == left) { q++;} 56 | if (path[i] == down) { t++;} 57 | i++; 58 | } 59 | 60 | while (i < path.size()) { 61 | int ml=0; 62 | int qs=q; 63 | int ts=t; 64 | while (i < path.size() and path[i] == diag) { 65 | ml++; 66 | q++; 67 | t++; 68 | i++; 69 | } 70 | 71 | while (i < path.size() and (path[i] == left or path[i] == down)) { 72 | if (path[i] == left) { q++;} 73 | if (path[i] == down) { t++;} 74 | i++; 75 | } 76 | int match=min(q-qs, t-ts); 77 | if (match > 0) { 78 | blocks.push_back(Block(qs,ts,match)); 79 | } 80 | } 81 | } 82 | 83 | void PrintMat(vector &mat, int r) { 84 | for (int i=0; i < mat.size(); i++) { 85 | cout << "\t" << mat[i]; 86 | if ((i+1)%r == 0) { cout << endl;} 87 | } 88 | } 89 | 90 | void TraceBack(vector &path, int q, int t, int r, vector &tb) { 91 | q++; 92 | t++; 93 | int i=t*r+q; 94 | int left=1; 95 | int down=2; 96 | int diag=3; 97 | while (q >0 or t > 0) { 98 | if (path[i] == diag) { 99 | q--; 100 | t--; 101 | tb.push_back(diag); 102 | 103 | } 104 | if (path[i] == left) { 105 | q--; 106 | tb.push_back(left); 107 | } 108 | if (path[i] == down) { 109 | t--; 110 | tb.push_back(down); 111 | } 112 | i=(t)*r+(q); 113 | } 114 | reverse(tb.begin(), tb.end()); 115 | } 116 | 117 | void StoreQScoreVect(vector &score, vector &path, int q, int t, int r, vector &qv, vector &index) { 118 | qv.resize(r-1); 119 | index.resize(r-1); 
120 | fill(qv.begin(), qv.end(), 0); 121 | int i=(t+1)*r+q+1; 122 | int left=1; 123 | int down=2; 124 | int diag=3; 125 | // index into matrix is up by 1 126 | q++; 127 | t++; 128 | while (i > 0) { 129 | if (path[i] == diag or path[i] == left) { 130 | assert(q > 0); 131 | qv[q-1] = score[i]; 132 | index[q-1] = i; 133 | } 134 | if (path[i] == diag) { 135 | q--; 136 | t--; 137 | } 138 | if (path[i] == left) { 139 | q--; 140 | } 141 | if (path[i] == down) { 142 | t--; 143 | } 144 | i=(t)*r+(q); 145 | } 146 | } 147 | 148 | 149 | 150 | void RSdp(string &q, string &t, vector &path, vector &score, 151 | int mat, int mis, int indel) { 152 | 153 | path.resize((q.size()+1)*(t.size()+1), -1); 154 | score.resize((q.size()+1)*(t.size()+1),0); 155 | int qs=q.size(); 156 | int ts=t.size(); 157 | int left=1; 158 | int down=2; 159 | int diag=3; 160 | int row=qs+1; 161 | for (int i=1; i < qs+1;i++) { 162 | path[i] = left; 163 | score[i] = score[i-1] + indel; 164 | } 165 | for (int i=1; i < ts+1; i++) { 166 | path[row*i] = down; 167 | score[i*row] = score[(i-1)*row] + indel; 168 | } 169 | for (int i=0; i < ts; i++) { 170 | for (int j=0; j < qs; j++) { 171 | int diagScore = score[i*row+j]; 172 | if (q[j] == t[i]) { 173 | diagScore += mat; 174 | } 175 | else { 176 | diagScore += mis; 177 | } 178 | int leftScore = score[(i+1)*row + j] + indel; 179 | int downScore = score[i*row + (j+1)] + indel; 180 | int maxScore =max(diagScore, max(leftScore, downScore)); 181 | score[(i+1)*row+(j+1)] = maxScore; 182 | if (maxScore == diagScore) { 183 | path[(i+1)*row+(j+1)] = diag; 184 | } 185 | else if (maxScore == leftScore) { 186 | path[(i+1)*row+(j+1)] = left; 187 | } 188 | else { 189 | path[(i+1)*row+(j+1)] = down; 190 | } 191 | } 192 | } 193 | // 194 | // No need for trace back here. 
195 | // 196 | } 197 | 198 | int FindMax(vector &score, int row, int &q, int &t) { 199 | if (score.size() == 0) { 200 | q=t=0; 201 | return 0; 202 | } 203 | vector::iterator itr = max_element(score.begin(), score.end()); 204 | int index=itr-score.begin(); 205 | t=index/row-1; 206 | q=index%row-1; 207 | return score[index]; 208 | } 209 | 210 | void RefineBreakpoint(Read &read, 211 | Genome &genome, 212 | Alignment &leftAln, 213 | Alignment &rightAln, 214 | const Options &opts) { 215 | int lqs=0, lqe=0, lts=0, lte=0; 216 | int rqs=0, rqe=0, rts=0, rte=0; 217 | 218 | lqs=leftAln.GetQStart(); 219 | lqe=leftAln.GetQEnd(); 220 | lts=leftAln.GetTStart(); 221 | lte=leftAln.GetTEnd(); 222 | int flqs=0, flqe=0, rlqs=0, rlqe=0; 223 | if (leftAln.strand == 0) { 224 | flqs=lqs; 225 | flqe=lqe; 226 | } 227 | else { 228 | flqs = read.length - lqe; 229 | flqe = read.length - lqs; 230 | } 231 | 232 | rqs=rightAln.GetQStart(); 233 | rqe=rightAln.GetQEnd(); 234 | rts=rightAln.GetTStart(); 235 | rte=rightAln.GetTEnd(); 236 | int frqs=0, frqe=0; 237 | if (rightAln.strand == 0) { 238 | frqs=rqs; 239 | frqe=rqe; 240 | } 241 | else { 242 | frqs = read.length - rqe; 243 | frqe = read.length - rqs; 244 | } 245 | /* 246 | cerr << "LEFT\t" << (int) leftAln.strand << "\t" << leftAln.blocks.size() << "\t" << flqs << "\t" << flqe << "\t" 247 | << "RIGHT\t" << (int) rightAln.strand << "\t" << rightAln.blocks.size() << "\t" << frqs << "\t" << frqe << endl; 248 | */ 249 | int MAX_GAP=500; 250 | 251 | if (frqs > flqe and frqs - flqe < MAX_GAP) { 252 | // 253 | // The two endpoints are close enough to attempt to refine the breakpoint. 254 | // 255 | // For now just do standard dp, speed up later if need be. 256 | int span = frqs - flqe; 257 | // 258 | // Determine spans that are refined for left. 
259 | // 260 | string lqString, ltString; 261 | char *tChrom=genome.seqs[leftAln.chromIndex]; 262 | bool lPrefixExtend=false; 263 | int lqExtStart, lqExtEnd; 264 | int ltExtStart, ltExtEnd; 265 | 266 | if (leftAln.strand == 0) { 267 | // 268 | // Missed segment is at right side of left aln 269 | // 270 | lqExtStart = lqe; 271 | lqExtEnd = lqe+span; 272 | lqString = string(leftAln.read + lqe, span); 273 | 274 | // 275 | // Left align is forward strand. Refining will go from the end of alignment on 276 | // 277 | ltExtStart = leftAln.GetTEnd(); 278 | int tSpan = min(genome.lengths[leftAln.chromIndex]-leftAln.GetTEnd(), span); 279 | ltExtEnd = ltExtStart+tSpan; 280 | ltString = string(tChrom+ltExtStart, tSpan); 281 | } 282 | else { 283 | assert(lqs-span > 0); 284 | // missed segment is at left side of rev strand 285 | lqExtStart = lqs-span; 286 | lqExtEnd = lqs; 287 | lqString = string(leftAln.read + lqExtStart, span); 288 | 289 | // 290 | // Left align is reverse strand. The gap goes forward in the read, which means it exends back in target 291 | ltExtEnd = leftAln.GetTStart(); 292 | ltExtStart = max(0,ltExtEnd-span); 293 | ltString = string(tChrom+ltExtStart, ltExtEnd-ltExtStart); 294 | lPrefixExtend=true; 295 | reverse(lqString.begin(), lqString.end()); 296 | reverse(ltString.begin(), ltString.end()); 297 | } 298 | 299 | vector lPath, lScore, rPath, rScore; 300 | int mat=2; 301 | int mis=-2; 302 | int gap=-4; 303 | /* 304 | cerr << "query" << endl; 305 | cerr << lqString << endl; 306 | cerr << "target" << endl; 307 | cerr << ltString << endl; 308 | */ 309 | RSdp(lqString, ltString, lPath, lScore, mat,mis,gap); 310 | 311 | // 312 | // Right-hand side logic is the reverse. 
313 | // 314 | string rqString, rtString; 315 | string rqStringCopy, rtStringCopy; 316 | char *rtChrom=genome.seqs[rightAln.chromIndex]; 317 | bool rPrefixExtend=false; 318 | int rqExtStart, rqExtEnd; 319 | int rtExtStart, rtExtEnd; 320 | if (rightAln.strand == 0) { 321 | // 322 | 323 | rqExtStart = rqs - span; 324 | rqExtEnd = rqs; 325 | rqString=string(rightAln.read + rqExtStart, span); 326 | 327 | int rtSpan = min(rightAln.GetTStart(), span); 328 | rtExtStart = rightAln.GetTStart() - rtSpan; 329 | rtExtEnd = rightAln.GetTStart(); 330 | rtString = string(rtChrom+rtExtStart, rtSpan); 331 | reverse(rqString.begin(), rqString.end()); 332 | reverse(rtString.begin(), rtString.end()); 333 | rPrefixExtend=true; 334 | } 335 | else { 336 | rqExtStart = rightAln.GetQEnd(); 337 | rqExtEnd = rqExtStart+span; 338 | assert(rqExtStart+span <= read.length); 339 | rqString = string(rightAln.read + rqExtStart, span); 340 | 341 | rtExtStart = rightAln.GetTEnd(); 342 | int tSpan = span; 343 | if (rtExtStart+span >= genome.lengths[rightAln.chromIndex]) { 344 | tSpan = genome.lengths[rightAln.chromIndex] - rtExtStart; 345 | } 346 | rtExtEnd = rtExtStart+tSpan; 347 | rtString = string(rtChrom+rtExtStart, tSpan); 348 | } 349 | /* 350 | cerr << "rquery" << endl; 351 | cerr << rqString << endl; 352 | cerr << "rtarget" << endl; 353 | cerr << rtString << endl; 354 | */ 355 | 356 | RSdp(rqString, rtString, rPath, rScore, mat,mis,gap); 357 | 358 | 359 | int mls, mlq, mlt, mrs, mrq, mrt; 360 | mls=FindMax(lScore, span+1, mlq, mlt); 361 | mrs=FindMax(rScore, span+1, mrq, mrt); 362 | // cerr << "left " << mlq << "\t" << mlt << "\t" << mls << "\tright\t" << mrq << "\t" << mrt << "\t" << mrs << endl; 363 | // Now to merge the two results. 364 | // 365 | // Case 1, the local alignments do not overlap 366 | int maxLIndex=0; 367 | int maxRIndex=0; 368 | 369 | if (mlq < span - mrq ) { 370 | // cerr << "Alignments do not overlap " << endl; 371 | } 372 | else { 373 | // cerr << "Alignments do overlap. 
Optimize" << endl; 374 | vector lqScores, rqScores, lqIndex, rqIndex; 375 | StoreQScoreVect(lScore, lPath, mlq, mlt, span+1, lqScores, lqIndex); 376 | StoreQScoreVect(rScore, rPath, mrq, mrt, span+1, rqScores, rqIndex); 377 | assert(lqScores.size() == rqScores.size()); 378 | int maxScore=0; 379 | for (int i=0; i < lqScores.size(); i++ ) { 380 | if (lqScores[i] + rqScores[lqScores.size()-i-1] > maxScore) { 381 | maxScore = lqScores[i] + rqScores[lqScores.size()-i-1]; 382 | maxLIndex=i; 383 | maxRIndex=lqScores.size()-i-1; 384 | } 385 | } 386 | int maxLI=lqIndex[maxLIndex]; 387 | int maxRI=rqIndex[maxRIndex]; 388 | mlq=maxLIndex; 389 | mlt=maxLI/(span+1)-1; 390 | mrq=maxRIndex; 391 | mrt=maxRI/(span+1)-1; 392 | 393 | } 394 | vector ltb, rtb; 395 | TraceBack(lPath, mlq, mlt, span+1, ltb); 396 | TraceBack(rPath, mrq, mrt, span+1, rtb); 397 | /* 398 | for (int i=0;i lBlocks, rBlocks; 407 | int lqBlockStart=-1, ltBlockStart=-1; 408 | if (lPrefixExtend == true) { 409 | reverse(ltb.begin(), ltb.end()); 410 | reverse(lqString.begin(), lqString.end()); 411 | reverse(ltString.begin(), ltString.end()); 412 | int qs=lqExtEnd-1; 413 | int ts=ltExtEnd-1; 414 | for (int i=0; i < ltb.size(); i++) { 415 | if (ltb[i] == 3) { qs--; ts--;} 416 | if (ltb[i] == 2) { ts--;} 417 | if (ltb[i] == 1) { qs--;} 418 | } 419 | lqBlockStart=leftAln.GetQStart()-mlq-1; 420 | ltBlockStart=leftAln.GetTStart()-mlt-1; 421 | // mlq=lqString.size() - mlq - 1; 422 | // mlt=ltString.size() - mlt - 1; 423 | } 424 | else { 425 | lqBlockStart=leftAln.GetQEnd(); 426 | ltBlockStart=leftAln.GetTEnd(); 427 | } 428 | 429 | PathToBlocks(ltb, lBlocks); 430 | for (int i=0; i < lBlocks.size(); i++) { lBlocks[i].qPos+=lqBlockStart; lBlocks[i].tPos+=ltBlockStart;} 431 | 432 | if (lPrefixExtend) { 433 | PrependBlocks(lBlocks, leftAln.blocks); 434 | } 435 | else { 436 | AppendBlocks(lBlocks, leftAln.blocks); 437 | } 438 | 439 | 440 | int rqBlockStart=-1, rtBlockStart=-1; 441 | if (rPrefixExtend == true) { 442 | 
reverse(rtb.begin(), rtb.end()); 443 | rqBlockStart=rightAln.GetQStart()-mrq-1; 444 | rtBlockStart=rightAln.GetTStart()-mrt-1; 445 | // mrq=rqString.size() - mrq - 1; 446 | // mrt=rtString.size() - mrt - 1; 447 | } 448 | else { 449 | rqBlockStart = rightAln.GetQEnd(); 450 | rtBlockStart = rightAln.GetTEnd(); 451 | } 452 | PathToBlocks(rtb, rBlocks); 453 | for (int i=0; i < rBlocks.size(); i++) { rBlocks[i].qPos+=rqBlockStart; rBlocks[i].tPos+=rtBlockStart;} 454 | 455 | 456 | if (rPrefixExtend) { 457 | PrependBlocks(rBlocks, rightAln.blocks); 458 | } 459 | else { 460 | AppendBlocks(rBlocks, rightAln.blocks); 461 | } 462 | } 463 | 464 | } 465 | 466 | 467 | 468 | 469 | #endif 470 | -------------------------------------------------------------------------------- /SeqUtils.h: -------------------------------------------------------------------------------- 1 | #ifndef SEQ_UTILS_H_ 2 | #define SEQ_UTILS_H_ 3 | 4 | // 5 | // Map from ascii to 2 bit representation. 6 | // 7 | static int seqMap[] = { 8 | 0,1,2,3,0,1,2,3, 9 | 0,0,0,0,0,0,0,0, 10 | 0,0,0,0,0,0,0,0, 11 | 0,0,0,0,0,0,0,0, 12 | 0,0,0,0,0,0,0,0, 13 | 0,0,0,0,0,0,0,0, 14 | 0,0,0,0,0,0,0,0, 15 | 0,0,0,0,0,0,0,0, 16 | 0,0,0,1,0,0,0,2, 17 | 0,0,0,0,0,0,0,0, 18 | 0,0,0,0,3,0,0,0, 19 | 0,0,0,0,0,0,0,0, 20 | 0,0,0,1,0,0,0,2, 21 | 0,0,0,0,0,0,0,0, 22 | 0,0,0,0,3,0,0,0, 23 | 0,0,0,0,0,0,0,0, 24 | 0,0,0,0,0,0,0,0, 25 | 0,0,0,0,0,0,0,0, 26 | 0,0,0,0,0,0,0,0, 27 | 0,0,0,0,0,0,0,0, 28 | 0,0,0,0,0,0,0,0, 29 | 0,0,0,0,0,0,0,0, 30 | 0,0,0,0,0,0,0,0, 31 | 0,0,0,0,0,0,0,0, 32 | 0,0,0,0,0,0,0,0, 33 | 0,0,0,0,0,0,0,0, 34 | 0,0,0,0,0,0,0,0, 35 | 0,0,0,0,0,0,0,0, 36 | 0,0,0,0,0,0,0,0, 37 | 0,0,0,0,0,0,0,0, 38 | 0,0,0,0,0,0,0,0, 39 | 0,0,0,0,0,0,0,0 40 | }; 41 | 42 | static int seqMapN[] = { 43 | 0,1,2,3,0,1,2,3, 44 | 4,4,4,4,4,4,4,4, 45 | 4,4,4,4,4,4,4,4, 46 | 4,4,4,4,4,4,4,4, 47 | 4,4,4,4,4,4,4,4, 48 | 4,4,4,4,4,4,4,4, 49 | 4,4,4,4,4,4,4,4, 50 | 4,4,4,4,4,4,4,4, 51 | 4,0,4,1,4,4,4,2, 52 | 4,4,4,4,4,4,4,4, 53 | 4,4,4,4,3,4,4,4, 54 | 
4,4,4,4,4,4,4,4, 55 | 4,0,4,1,4,4,4,2, 56 | 4,4,4,4,4,4,4,4, 57 | 4,4,4,4,3,4,4,4, 58 | 4,4,4,4,4,4,4,4, 59 | 4,4,4,4,4,4,4,4, 60 | 4,4,4,4,4,4,4,4, 61 | 4,4,4,4,4,4,4,4, 62 | 4,4,4,4,4,4,4,4, 63 | 4,4,4,4,4,4,4,4, 64 | 4,4,4,4,4,4,4,4, 65 | 4,4,4,4,4,4,4,4, 66 | 4,4,4,4,4,4,4,4, 67 | 4,4,4,4,4,4,4,4, 68 | 4,4,4,4,4,4,4,4, 69 | 4,4,4,4,4,4,4,4, 70 | 4,4,4,4,4,4,4,4, 71 | 4,4,4,4,4,4,4,4, 72 | 4,4,4,4,4,4,4,4, 73 | 4,4,4,4,4,4,4,4, 74 | 4,4,4,4,4,4,4,4 75 | }; 76 | 77 | static int revComp[] = { 78 | 3,2,1,0,0,0,0,0, 79 | 0,0,0,0,0,0,0,0, 80 | 0,0,0,0,0,0,0,0, 81 | 0,0,0,0,0,0,0,0, 82 | 0,0,0,0,0,0,0,0, 83 | 0,0,0,0,0,0,0,0, 84 | 0,0,0,0,0,0,0,0, 85 | 0,0,0,0,0,0,0,0, 86 | 0,3,0,2,0,0,0,1, 87 | 0,0,0,0,0,0,0,0, 88 | 0,0,0,0,0,0,0,0, 89 | 0,0,0,0,0,0,0,0, 90 | 0,3,0,2,0,0,0,1, 91 | 0,0,0,0,0,0,0,0, 92 | 0,0,0,0,0,0,0,0, 93 | 0,0,0,0,0,0,0,0, 94 | 0,0,0,0,0,0,0,0, 95 | 0,0,0,0,0,0,0,0, 96 | 0,0,0,0,0,0,0,0, 97 | 0,0,0,0,0,0,0,0, 98 | 0,0,0,0,0,0,0,0, 99 | 0,0,0,0,0,0,0,0, 100 | 0,0,0,0,0,0,0,0, 101 | 0,0,0,0,0,0,0,0, 102 | 0,0,0,0,0,0,0,0, 103 | 0,0,0,0,0,0,0,0, 104 | 0,0,0,0,0,0,0,0, 105 | 0,0,0,0,0,0,0,0, 106 | 0,0,0,0,0,0,0,0, 107 | 0,0,0,0,0,0,0,0, 108 | 0,0,0,0,0,0,0,0, 109 | 0,0,0,0,0,0,0,0 110 | }; 111 | 112 | static unsigned char RevCompNuc[] = { 113 | 'N','N','N','N','N','N','N','N', 114 | 'N','N','N','N','N','N','N','N', 115 | 'N','N','N','N','N','N','N','N', 116 | 'N','N','N','N','N','N','N','N', 117 | 'N','N','N','N','N','N','N','N', 118 | 'N','N','N','N','N','N','N','N', 119 | 'N','N','N','N','N','N','N','N', 120 | 'N','N','N','N','N','N','N','N', 121 | 'N','T','N','G','N','N','N','C', 122 | 'N','N','N','N','N','N','N','N', 123 | 'N','N','N','N','A','N','N','N', 124 | 'N','N','N','N','N','N','N','N', 125 | 'N','t','N','g','N','N','N','c', 126 | 'N','N','N','N','N','N','n','N', 127 | 'N','N','N','N','a','N','N','N', 128 | 'N','N','N','N','N','N','N','N', 129 | 'N','N','N','N','N','N','N','N', 130 | 'N','N','N','N','N','N','N','N', 131 | 
'N','N','N','N','N','N','N','N', 132 | 'N','N','N','N','N','N','N','N', 133 | 'N','N','N','N','N','N','N','N', 134 | 'N','N','N','N','N','N','N','N', 135 | 'N','N','N','N','N','N','N','N', 136 | 'N','N','N','N','N','N','N','N', 137 | 'N','N','N','N','N','N','N','N', 138 | 'N','N','N','N','N','N','N','N', 139 | 'N','N','N','N','N','N','N','N', 140 | 'N','N','N','N','N','N','N','N', 141 | 'N','N','N','N','N','N','N','N', 142 | 'N','N','N','N','N','N','N','N', 143 | 'N','N','N','N','N','N','N','N', 144 | 'N','N','N','N','N','N','N','N' 145 | }; 146 | 147 | 148 | 149 | const char *binMap = "ACGT"; 150 | 151 | void CreateRC(char* seq, long l, char *& dest) { 152 | dest = new char[l]; 153 | 154 | for (long i = 0; i < l; i++) { 155 | dest[l-i-1] = RevCompNuc[seq[i]]; 156 | } 157 | 158 | } 159 | 160 | #endif 161 | -------------------------------------------------------------------------------- /Sorting.h: -------------------------------------------------------------------------------- 1 | #ifndef SORTING_H_ 2 | #define SORTING_H_ 3 | 4 | #include 5 | #include "Types.h" 6 | #include 7 | #include 8 | #include 9 | 10 | using std::sort; 11 | using std::pair; 12 | using std::tuple; 13 | 14 | template 15 | class DiagonalIndexSort { 16 | public: 17 | typename vector >::iterator tuples; 18 | long operator()(const int &a, const int &b) { 19 | typename vector >::iterator ap=tuples+a; 20 | typename vector >::iterator bp=tuples+b; 21 | long aDiag = (long)ap->first.pos - (long)ap->second.pos, 22 | bDiag= (long)bp->first.pos - (long)bp->second.pos; 23 | 24 | if (aDiag != bDiag) { 25 | return aDiag < bDiag; 26 | } 27 | else { 28 | return ap->first.pos < bp->first.pos; 29 | } 30 | } 31 | 32 | }; 33 | 34 | template 35 | class DiagonalSortOp { 36 | public: 37 | long operator()(const pair &a, const pair &b) { 38 | long aDiag = (long)a.first.pos - (long)a.second.pos, 39 | bDiag= (long)b.first.pos - (long)b.second.pos; 40 | if (aDiag != bDiag) { 41 | return aDiag < bDiag; 42 | } 43 | else { 44 
| return a.first.pos < b.first.pos; 45 | } 46 | } 47 | }; 48 | 49 | template 50 | void DiagonalSort(typename vector >::iterator begin, typename vector >::iterator end, int minRange=0) { 51 | if (minRange == 0 or end-begin < minRange) { 52 | sort(begin, end, DiagonalSortOp()); 53 | } 54 | else { 55 | DiagonalIndexSort sorter; 56 | sorter.tuples=begin; 57 | vector index(end-begin); 58 | std::iota(index.begin(), index.end(), 0); 59 | sort(index.begin(), index.end(), sorter); 60 | GenomePos pos; 61 | vector > temp(end-begin); 62 | copy(begin,end, temp.begin()); 63 | for (int i=0; i < index.size(); i++) { 64 | temp[i]=*(begin+index[i]); 65 | } 66 | copy(temp.begin(), temp.end(), begin); 67 | } 68 | } 69 | 70 | template 71 | void DiagonalSort(vector > &vals, int minRange=0) { 72 | DiagonalSort(vals.begin(), vals.end(), minRange); 73 | } 74 | 75 | template 76 | class AntiDiagonalSortOp { 77 | public: 78 | AntiDiagonalSortOp() {} 79 | GenomePos operator()(const pair &a, const pair &b) { 80 | GenomePos aDiag = a.first.pos + a.second.pos, 81 | bDiag= b.first.pos + b.second.pos; 82 | 83 | if (aDiag != bDiag) { 84 | return aDiag < bDiag; 85 | } 86 | else { 87 | return a.first.pos < b.first.pos; 88 | } 89 | } 90 | }; 91 | 92 | template 93 | class AntiDiagonalIndexSort { 94 | public: 95 | typename vector >::iterator tuples; 96 | GenomePos operator()(const int &a, const int &b) { 97 | typename vector >::iterator ap=tuples+a; 98 | typename vector >::iterator bp=tuples+b; 99 | GenomePos aDiag = ap->first.pos + ap->second.pos; 100 | GenomePos bDiag = bp->first.pos + bp->second.pos; 101 | 102 | if (aDiag != bDiag) { 103 | return aDiag < bDiag; 104 | } 105 | else { 106 | return ap->first.pos < bp->first.pos; 107 | } 108 | } 109 | 110 | }; 111 | 112 | template 113 | void AntiDiagonalSort(typename vector >::iterator begin, 114 | typename vector >::iterator end, int sortByIndex=0) { 115 | if (sortByIndex == 0 or end-begin < sortByIndex) { 116 | sort(begin, end, AntiDiagonalSortOp()); 117 
| } 118 | else { 119 | 120 | AntiDiagonalIndexSort sorter; 121 | sorter.tuples=begin; 122 | // sorter.length=genomeLength; 123 | vector index(end-begin); 124 | std::iota(index.begin(), index.end(), 0); 125 | sort(index.begin(), index.end(), sorter); 126 | GenomePos pos; 127 | vector > temp(end-begin); 128 | copy(begin,end, temp.begin()); 129 | 130 | for (int i=0; i < index.size(); i++) { 131 | temp[i]=*(begin+index[i]); 132 | } 133 | copy(temp.begin(), temp.end(), begin); 134 | } 135 | 136 | } 137 | 138 | template 139 | void AntiDiagonalSort(vector > &vals, int sortByIndex=0) { 140 | AntiDiagonalSort(vals.begin(), vals.end(), sortByIndex); 141 | } 142 | 143 | template 144 | class CartesianSortOp { 145 | public: 146 | int operator()(const pair &a, const pair &b) { 147 | if (a.first.pos != b.first.pos) { 148 | return a.first.pos < b.first.pos; 149 | } 150 | else { 151 | return a.second.pos < b.second.pos; 152 | } 153 | } 154 | }; 155 | 156 | template 157 | void CartesianSort(vector > &vals, int s, int e) { 158 | sort(vals.begin() + s, vals.begin() + e, CartesianSortOp()); 159 | } 160 | 161 | template 162 | void CartesianSort(typename vector >::iterator begin, typename vector >::iterator end) { 163 | sort(begin, end, CartesianSortOp()); 164 | } 165 | 166 | template 167 | void CartesianSort(vector > &vals) { 168 | CartesianSort(vals.begin(), vals.end()); 169 | } 170 | 171 | template 172 | int CartesianLowerBound(typename vector >::iterator begin, 173 | typename vector >::iterator end, int64_t query) { 174 | pair queryTup; 175 | queryTup.first.pos = query; 176 | queryTup.second.pos = 0; 177 | return lower_bound(begin, end, queryTup, CartesianSortOp()) - begin; 178 | } 179 | 180 | template 181 | class CartesianTargetSortOp { 182 | public: 183 | int operator()(const pair &a, const pair &b) { 184 | if (a.second.pos != b.second.pos) { 185 | return a.second.pos < b.second.pos; 186 | } 187 | else { 188 | return a.first.pos < b.first.pos; 189 | } 190 | } 191 | }; 192 | 193 | 
194 | template 195 | void CartesianTargetSort(vector > &vals) { 196 | CartesianTargetSort(vals.begin(), vals.end()); 197 | } 198 | 199 | template 200 | void CartesianTargetSort(typename vector >::iterator begin, typename vector >::iterator end) { 201 | sort(begin, end, CartesianTargetSortOp()); 202 | } 203 | 204 | 205 | template 206 | void CartesianTargetSort(vector> &matches, int s, int e) { 207 | sort(matches.begin() + s, matches.begin() + e, CartesianTargetSortOp()); 208 | } 209 | 210 | 211 | template 212 | int CartesianTargetLowerBound(typename vector >::iterator begin, typename vector >::iterator end, int64_t query) { 213 | pair queryTup; 214 | queryTup.second.pos = query; queryTup.first.pos = 0; 215 | return lower_bound(begin, end, queryTup, CartesianTargetSortOp()) - begin; 216 | } 217 | 218 | template 219 | int CartesianTargetUpperBound(typename vector >::iterator begin, typename vector >::iterator end, int64_t query) { 220 | pair queryTup; 221 | queryTup.second.pos = query; queryTup.first.pos = 0; 222 | return upper_bound(begin, end, queryTup, CartesianTargetSortOp()) - begin; 223 | } 224 | 225 | 226 | template 227 | class SortByRowOp { 228 | public: 229 | int operator()(const T & a, const T & b) { 230 | if (a.se.first != b.se.first) { 231 | return a.se.first < b.se.first; 232 | } 233 | else if (a.se.second != b.se.second){ 234 | return a.se.second < b.se.second; 235 | } 236 | else { 237 | return a.ind < b.ind; 238 | } 239 | } 240 | }; 241 | 242 | 243 | template 244 | class SortByColOp { 245 | public: 246 | 247 | SortByColOp(std::vector & H);// constructor 248 | std::vector * Hp; 249 | 250 | int operator()(const T2 & a, const T2 & b) { 251 | if ((*Hp)[a].se.second != (*Hp)[b].se.second) { 252 | return (*Hp)[a].se.second < (*Hp)[b].se.second; 253 | } 254 | else if ((*Hp)[a].se.first != (*Hp)[b].se.first){ 255 | return (*Hp)[a].se.first < (*Hp)[b].se.first; 256 | } 257 | else { 258 | return (*Hp)[a].ind < (*Hp)[b].ind; 259 | } 260 | } 261 | }; 262 | 263 | // 
constructor 264 | template 265 | SortByColOp::SortByColOp(std::vector & H) { 266 | Hp = & H; 267 | } 268 | 269 | 270 | template 271 | class SortByBackDiagOp 272 | { 273 | public: 274 | SortByBackDiagOp(std::vector & H); // constructor && initialization list 275 | 276 | std::vector * Hp; 277 | 278 | int operator()(const T2 & a, const T2 & b) { 279 | long int aBackDiag = (*Hp)[a].se.first + (*Hp)[a].se.second; 280 | long int bBackDiag = (*Hp)[b].se.first + (*Hp)[b].se.second; 281 | if (aBackDiag != bBackDiag) { 282 | return aBackDiag < bBackDiag; 283 | } 284 | else if ((*Hp)[a].se.first != (*Hp)[b].se.first){ 285 | return (*Hp)[a].se.first < (*Hp)[b].se.first; 286 | } 287 | else { 288 | return (*Hp)[a].ind < (*Hp)[b].ind; 289 | } 290 | } 291 | }; 292 | 293 | 294 | // Constructor 295 | template 296 | SortByBackDiagOp::SortByBackDiagOp(std::vector & H) { 297 | Hp = & H; 298 | } 299 | 300 | 301 | // This Lower_bound function return the index of the element 302 | // the first element in the range [first,last) which is greater than or equal to val. 303 | template 304 | T1 Lower_Bound (T1 first, T1 last, long int val, std::vector & E_1) { 305 | 306 | T1 it; 307 | unsigned int count, step; 308 | count = std::distance(first, last); 309 | while (count > 0) { 310 | it = first; step = count/2; std::advance(it, step); 311 | if ( E_1[*it] < val) { 312 | first = ++it; 313 | count -= step + 1; 314 | } 315 | else count = step; 316 | } 317 | return first; 318 | } 319 | 320 | 321 | #endif 322 | -------------------------------------------------------------------------------- /SplitClusters.h: -------------------------------------------------------------------------------- 1 | #ifndef SPLIT_CLUSTERS_H_ 2 | #define SPLIT_CLUSTERS_H_ 3 | 4 | #include //std::cout 5 | #include 6 | #include // std::labs, std::EXIT FAILURE, std::EXIT SUCCESS 7 | #include