├── AlignGraph ├── AlignGraph └── AlignGraph.cpp ├── Eval-AlignGraph ├── .nfs00000000000025d800016c95 ├── Eval-AlignGraph └── Eval-AlignGraph.cpp └── README.md /AlignGraph/AlignGraph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baoe/AlignGraph/1fada4c173f7e2111eeaf6da79e28d7fc32e6df2/AlignGraph/AlignGraph -------------------------------------------------------------------------------- /AlignGraph/AlignGraph.cpp: -------------------------------------------------------------------------------- 1 | //********************************************************************************** 2 | //* Title: AlignGraph: algorithm for secondary de novo genome assembly guided by closely related references 3 | //* Platform: 64-Bit Linux 4 | //* Author: Ergude Bao 5 | //* Affliation: Department of Computer Science & Engineering 6 | //* University of California, Riverside 7 | //* Date: 03/24/2011 8 | //* Copy Right: Artistic License 2.0 9 | //********************************************************************************** 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | using namespace std; 23 | 24 | #define TEST 25 | #define OPTIMIZATION 26 | 27 | #define MAX 99999 28 | #ifdef OPTIMIZATION 29 | #define INIT_CONTIG_THRESHOLD 0.5//0 30 | #else 31 | #define INIT_CONTIG_THRESHOLD 0.4 32 | #endif 33 | #define CONTIG_THRESHOLD 0.5//0.1 34 | #define THRESHOLD 0.6//0.2 35 | #define SD 0//5 36 | #define SI 0//5 37 | #define BATCH 1000000// BATCH = 2n 38 | #define COVERAGE 10 39 | #define EP 5// 5, 20, 100, max 40 | #define LARGE_CHUNK 1000000//20000 41 | #define SMALL_CHUNK 20000 42 | #define MIN_THRESHOLD 0.1 43 | 44 | typedef struct structSegment 45 | { 46 | unsigned int sourceStart; 47 | unsigned int targetStart; 48 | unsigned int size; 49 | } Segment; 50 | 51 | typedef struct contiMerStruct 52 | { 53 | 
char nucleotide; 54 | unsigned int contigID; 55 | unsigned int contigOffset; 56 | //unsigned int previousID; 57 | //unsigned int previousOffset; 58 | //unsigned int previousItem; 59 | unsigned int nextID; 60 | unsigned int nextOffset; 61 | unsigned int nextItem; 62 | } ContiMer; 63 | 64 | typedef struct nextStruct 65 | { 66 | unsigned int nextID; 67 | unsigned int nextOffset; 68 | unsigned int nextItem; 69 | } Next; 70 | 71 | typedef struct previousStruct 72 | { 73 | unsigned int previousID; 74 | unsigned int previousOffset; 75 | unsigned int previousItem; 76 | } Previous; 77 | 78 | typedef struct kMerStruct 79 | { 80 | unsigned int traversed; 81 | vector s; 82 | unsigned int contigID;// omitted, since it is rare that two k-mers at the same genome position with the same contig offset have different contig IDs 83 | unsigned int contigOffset; 84 | unsigned int contigID0;// omitted with the same reason with above 85 | unsigned int contigOffset0; 86 | unsigned int chromosomeID0; 87 | unsigned int chromosomeOffset0; 88 | vector next; 89 | // vector previous; 90 | // unsigned int nextID; 91 | // unsigned int nextOffset; 92 | // unsigned int nextItem; 93 | // unsigned int previousID; 94 | // unsigned int previousOffset; 95 | // unsigned int previousItem; 96 | int coverage; 97 | int A, C, G, T, N; 98 | } KMer; 99 | 100 | typedef struct baseStruct 101 | { 102 | vector kMer; 103 | char nucleotide; 104 | vector contiMer; 105 | } Base; 106 | 107 | typedef struct positionStruct 108 | { 109 | unsigned int chromosomeID; 110 | unsigned int chromosomeOffset; 111 | } Position; 112 | 113 | typedef struct seqStruct 114 | { 115 | int adjusted; 116 | vector nucleotides; 117 | vector > positionSets; 118 | vector frSets; 119 | int outputted; 120 | int ID; 121 | } Seq; 122 | 123 | typedef struct contigStruct 124 | { 125 | int extended; 126 | /// unsigned int contigID; 127 | /// unsigned int contigID0; 128 | /// unsigned int contigOffset; 129 | /// unsigned int contigOffset0; 130 | unsigned 
int startID; 131 | unsigned int startOffset; 132 | unsigned int endID; 133 | unsigned int endOffset; 134 | unsigned int startID0; 135 | unsigned int startOffset0; 136 | unsigned int endID0; 137 | unsigned int endOffset0; 138 | vector nucleotides; 139 | } Contig; 140 | 141 | typedef struct contigPositionStruct 142 | { 143 | int targetID; 144 | unsigned int sourceStart; 145 | unsigned int sourceEnd; 146 | unsigned int targetStart; 147 | unsigned int targetEnd; 148 | int fr; 149 | } ContigPosition; 150 | 151 | typedef struct contigBaseStruct 152 | { 153 | char base; 154 | int coverage; 155 | } ContigBase; 156 | 157 | typedef struct insertStruct 158 | { 159 | int distanceLow; 160 | int distanceHigh; 161 | int numChromosomes; 162 | int fastMap; 163 | int iterativeMap; 164 | } Insert; 165 | 166 | vector > genome; 167 | 168 | vector genomeIds; 169 | 170 | vector contigs; 171 | 172 | vector contigIds; 173 | 174 | string itoa(int number) 175 | { 176 | stringstream ss;//create a stringstream 177 | ss << number;//add number to the stream 178 | return ss.str();//return a string with the contents of the stream 179 | } 180 | 181 | void parseBOWTIE(string buf, unsigned int & targetID, unsigned int & targetStart, unsigned int & targetEnd, unsigned int & targetGap, unsigned int & sourceID, unsigned int & sourceStart, unsigned int & sourceEnd, unsigned int & sourceGap, unsigned int & sourceSize, vector & seg, unsigned int & fr) 182 | { 183 | int item = 0, i, j, j0 = 0, j1 = 0, j2 = 0, j3 = 0, j4 = 0, k, insertion = 0, deletion = 0, total = 0, start = 0, end = 0, tag = 1, IDTag = 0; 184 | char sourceIDBuf[100] = {'\0'}, targetIDBuf[100] = {'\0'}, sourceStartBuf[100] = {'\0'}, targetStartBuf[100] = {'\0'}, CIGARBuf[100] = {'\0'}, frBuf[100] = {'\0'}, realTargetIDBuf[100] = {'\0'}; 185 | Segment s; 186 | 187 | for(i = 0; i < buf.size(); i ++) 188 | { 189 | if(buf[i] == ' ') 190 | { 191 | item ++; 192 | continue; 193 | } 194 | 195 | if(buf[i] == '\0') break; 196 | 197 | if(item == 0) 
198 | sourceIDBuf[j3 ++] = buf[i]; 199 | if(item == 1) 200 | frBuf[j4 ++] = buf[i]; 201 | if(item == 2) 202 | { 203 | if(buf[i] == '*') 204 | { 205 | sourceID = atoi(sourceIDBuf); 206 | fr = ((atoi(frBuf) & 0x00000010) == 0x00000010) ? 1 : 0; 207 | targetID = targetStart = targetEnd = targetGap = sourceStart = sourceEnd = sourceGap = sourceSize = -1; 208 | return; 209 | } 210 | if(buf[i] == '.') IDTag = 1; 211 | if(IDTag == 0) targetIDBuf[j0 ++] = buf[i]; 212 | } 213 | if(item == 3) 214 | targetStartBuf[j1 ++] = buf[i]; 215 | if(item == 5) 216 | if(buf[i] == '0' || buf[i] == '1' || buf[i] == '2' || buf[i] == '3' || buf[i] == '4' || buf[i] == '5' || buf[i] == '6' || buf[i] == '7' || buf[i] == '8' || buf[i] == '9') 217 | { 218 | CIGARBuf[j2 ++] = buf[i]; 219 | } 220 | else if(buf[i] == 'I') 221 | { 222 | insertion = insertion + atoi(CIGARBuf); 223 | total = total + atoi(CIGARBuf); 224 | for(k = 0; k < j2; k ++) 225 | CIGARBuf[k] = '\0'; 226 | j2 = 0; 227 | } 228 | else if(buf[i] == 'D') 229 | { 230 | deletion = deletion + atoi(CIGARBuf); 231 | // total = total + atoi(CIGARBuf); 232 | for(k = 0; k < j2; k ++) 233 | CIGARBuf[k] = '\0'; 234 | j2 = 0; 235 | } 236 | else if(buf[i] == 'M') 237 | { 238 | s.sourceStart = total; 239 | s.targetStart = atoi(targetStartBuf) + total + deletion - start - insertion - 1;// offset is 1-based 240 | s.size = atoi(CIGARBuf); 241 | seg.push_back(s); 242 | total = total + atoi(CIGARBuf); 243 | for(k = 0; k < j2; k ++) 244 | CIGARBuf[k] = '\0'; 245 | j2 = tag = 0; 246 | } 247 | else if(buf[i] == 'S' && tag) 248 | { 249 | start = atoi(CIGARBuf); 250 | total = total + atoi(CIGARBuf); 251 | for(k = 0; k < j2; k ++) 252 | CIGARBuf[k] = '\0'; 253 | j2 = tag = 0; 254 | } 255 | else if(buf[i] == 'S') 256 | { 257 | end = atoi(CIGARBuf); 258 | total = total + atoi(CIGARBuf); 259 | for(k = 0; k < j2; k ++) 260 | CIGARBuf[k] = '\0'; 261 | j2 = 0; 262 | } 263 | else 264 | { 265 | if(buf[i] != '*') 266 | { 267 | cout << "unknown character: " << buf[i] 
<< endl; 268 | exit(-1); 269 | } 270 | } 271 | } 272 | sourceID = atoi(sourceIDBuf); 273 | sourceStart = start; 274 | sourceEnd = total - end; 275 | sourceGap = insertion; 276 | sourceSize = total; 277 | if(IDTag == 0) 278 | targetID = targetIDBuf[0] == '*' ? -1 : 0;// atoi(targetIDBuf); 279 | else 280 | targetID = targetIDBuf[0] == '*' ? -1 : atoi(targetIDBuf); 281 | targetStart = atoi(targetStartBuf) - 1;// offset is 1-based 282 | targetEnd = targetStart + total + deletion - insertion;// added: - insertion 283 | targetGap = deletion; 284 | fr = ((atoi(frBuf) & 0x00000010) == 0x00000010) ? 1 : 0; 285 | } 286 | 287 | void loadGenome(vector > & genome, int chromosomeID) 288 | { 289 | vector chromosome; 290 | string buf; 291 | Base b; 292 | int gp, cp, i; 293 | ifstream g; 294 | string s; 295 | 296 | s = "tmp/_genome." + itoa(chromosomeID) + ".fa"; 297 | g.open(s.c_str()); 298 | if(g.is_open()) 299 | { 300 | while(g.good()) 301 | { 302 | getline(g, buf); 303 | if(buf[0] == 0) break; 304 | 305 | if(buf[0] == '>') 306 | genome.push_back(chromosome); 307 | else 308 | for(i = 0; i < buf.size(); i ++) 309 | { 310 | b.nucleotide = buf[i]; 311 | genome[genome.size() - 1].push_back(b); 312 | } 313 | } 314 | } 315 | else 316 | { 317 | cout << "CANNOT OPEN FILE!" 
<< endl; 318 | exit(-1); 319 | } 320 | } 321 | 322 | void loadSeq(ifstream & in, vector & seqs) 323 | { 324 | string buf; 325 | int i, sp, ssp; 326 | Seq s; 327 | 328 | s.adjusted = 0; 329 | s.outputted = 0; 330 | 331 | if(in.is_open()) 332 | { 333 | while(in.good()) 334 | { 335 | getline(in, buf); 336 | if(buf[0] == 0) 337 | break; 338 | 339 | if(buf[0] == '>') 340 | { 341 | for(i = 0; i < buf.size(); i ++) 342 | if(buf[i] == '.') 343 | break; 344 | s.ID = atoi(buf.substr(i + 1, buf.size()).c_str()); 345 | seqs.push_back(s); 346 | } 347 | else 348 | { 349 | for(i = 0; i < buf.size(); i ++) 350 | seqs[seqs.size() - 1].nucleotides.push_back(buf[i]); 351 | } 352 | } 353 | } 354 | else 355 | { 356 | cout << "CANNOT OPEN FILE!" << endl; 357 | exit(-1); 358 | } 359 | } 360 | 361 | int loadSeq(ifstream & in, vector & seqs, int & aliStartID, int & seqStartID) 362 | { 363 | string buf; 364 | int i, sp, ssp, p; 365 | Seq s; 366 | 367 | aliStartID = seqStartID + 1; 368 | s.adjusted = 0; 369 | p = 0; 370 | if(in.is_open()) 371 | { 372 | while(in.good()) 373 | { 374 | getline(in, buf); 375 | if(buf[0] == 0) 376 | break; 377 | 378 | if(buf[0] == '>') 379 | { 380 | seqs.push_back(s); 381 | } 382 | else 383 | { 384 | for(i = 0; i < buf.size(); i ++) 385 | seqs[seqs.size() - 1].nucleotides.push_back(buf[i]); 386 | 387 | if(p == 1) 388 | { 389 | seqStartID ++; 390 | if((seqStartID + 1) % BATCH == 0) return 0; 391 | p = 0; 392 | } 393 | else 394 | p = 1; 395 | } 396 | } 397 | return 1; 398 | } 399 | else 400 | { 401 | cout << "CANNOT OPEN FILE!" 
<< endl; 402 | exit(-1); 403 | } 404 | } 405 | 406 | int parseBLAT(string buf, unsigned int & targetID, unsigned int & targetStart, unsigned int & targetEnd, unsigned int & targetGap, unsigned int & sourceID, unsigned int & sourceStart, unsigned int & sourceEnd, unsigned int & sourceGap, unsigned int & sourceSize, vector & seg, unsigned int & fr, unsigned int & targetSize) 407 | { 408 | char targetIDBuf[100] = {'\0'}, targetStartBuf[100] = {'\0'}, targetEndBuf[100] = {'\0'}, targetGapBuf[100] = {'\0'}, sourceIDBuf[100] = {'\0'}, sourceStartBuf[100] = {'\0'}, sourceEndBuf[100] = {'\0'}, sourceGapBuf[100] = {'\0'}, sourceSizeBuf[100] = {'\0'}, blockBuf[100] = {'\0'}, targetSizeBuf[100] = {'\0'}, realSourceIDBuf[100] = {'\0'}, realBuf[100] = {'\0'}; 409 | int item = 0, i, j0 = 0, j1 = 0, j2 = 0, j3 = 0, j4 = 0, j5 = 0, j6 = 0, j7 = 0, j8 = 0, j9 = 0, j, k, sp, j10 = 0, j11 = 0, j12 = 0; 410 | Segment s; 411 | fr = -1; 412 | 413 | seg.clear(); 414 | 415 | for(i = 0; i < buf.size(); i ++) 416 | { 417 | if(buf[i] == ' ') 418 | { 419 | item ++; 420 | sp = 0; 421 | continue; 422 | } 423 | 424 | if(buf[i] == '\0') break; 425 | 426 | if(item == 13) 427 | targetIDBuf[j0 ++] = buf[i]; 428 | if(item == 15) 429 | targetStartBuf[j1 ++] = buf[i]; 430 | if(item == 16) 431 | targetEndBuf[j2 ++] = buf[i]; 432 | if(item == 7) 433 | targetGapBuf[j3 ++] = buf[i]; 434 | if(item == 9) 435 | sourceIDBuf[j4 ++] = buf[i]; 436 | if(item == 11) 437 | sourceStartBuf[j5 ++] = buf[i]; 438 | if(item == 12) 439 | sourceEndBuf[j6 ++] = buf[i]; 440 | if(item == 5) 441 | sourceGapBuf[j7 ++] = buf[i]; 442 | if(item == 10) 443 | sourceSizeBuf[j8 ++] = buf[i]; 444 | if(item == 18) 445 | { 446 | if(buf[i] == ',') 447 | { 448 | s.sourceStart = s.targetStart = -1; 449 | s.size = atoi(blockBuf); 450 | seg.push_back(s); 451 | 452 | for(k = 0; k < j9; k ++) 453 | blockBuf[k] = '\0'; 454 | j9 = 0; 455 | } 456 | else 457 | blockBuf[j9 ++] = buf[i]; 458 | } 459 | if(item == 19) 460 | { 461 | if(buf[i] == ',') 462 
| { 463 | seg[sp ++].sourceStart = atoi(blockBuf); 464 | 465 | for(k = 0; k < j9; k ++) 466 | blockBuf[k] = '\0'; 467 | j9 = 0; 468 | } 469 | else 470 | blockBuf[j9 ++] = buf[i]; 471 | } 472 | if(item == 20) 473 | { 474 | if(buf[i] == '\0') 475 | break; 476 | if(buf[i] == ',') 477 | { 478 | seg[sp ++].targetStart = atoi(blockBuf); 479 | 480 | for(k = 0; k < j9; k ++) 481 | blockBuf[k] = '\0'; 482 | j9 = 0; 483 | } 484 | else 485 | blockBuf[j9 ++] = buf[i]; 486 | } 487 | if(item == 8 && fr == -1) 488 | fr = buf[i] == '+' ? 0 : 1; 489 | if(item == 14) 490 | targetSizeBuf[j10 ++] = buf[i]; 491 | } 492 | targetID = atoi(targetIDBuf); 493 | targetStart = atoi(targetStartBuf); 494 | targetEnd = atoi(targetEndBuf); 495 | targetGap = atoi(targetGapBuf); 496 | // sourceID = atoi(sourceIDBuf); 497 | sourceStart = atoi(sourceStartBuf); 498 | sourceEnd = atoi(sourceEndBuf); 499 | sourceGap = atoi(sourceGapBuf); 500 | // sourceSize = atoi(sourceSizeBuf); 501 | targetSize = atoi(targetSizeBuf); 502 | 503 | for(i = 0; i < 100; i ++) 504 | if(sourceIDBuf[i] == '.') 505 | break; 506 | if(i < 100) 507 | { 508 | for(j = 0; j < i; j ++) 509 | realSourceIDBuf[j11 ++] = sourceIDBuf[j]; 510 | for(j = i + 1; j < 100; j ++) 511 | realBuf[j12 ++] = sourceIDBuf[j]; 512 | sourceID = atoi(realSourceIDBuf); 513 | sourceSize = atoi(sourceSizeBuf) ; 514 | return atoi(realBuf); 515 | } 516 | else 517 | { 518 | sourceID = atoi(sourceIDBuf); 519 | sourceSize = atoi(sourceSizeBuf); 520 | return sourceSize; 521 | } 522 | } 523 | 524 | void parseDelta(string buf, int & a, int & b, int & c, int & d, int & realA, int & realB) 525 | { 526 | char aBuf[100] = {'\0'}, bBuf[100] = {'\0'}, cBuf[100] = {'\0'}, dBuf[100] = {'\0'}, realABuf[100] = {'\0'}, realBBuf[100] = {'\0'}; 527 | int item = 0, j0 = 0, j1 = 0, j2 = 0, j3 = 0, rj0 = 0, rj1 = 0, i, aTag, bTag; 528 | 529 | for(i = 0; i < buf.size(); i ++) 530 | { 531 | if(buf[i] == ' ') 532 | { 533 | item ++; 534 | continue; 535 | } 536 | 537 | if(buf[i] == 
'\0') break; 538 | 539 | if(item == 0 && buf[i] != '>') 540 | aBuf[j0 ++] = buf[i]; 541 | if(item == 1) 542 | bBuf[j1 ++] = buf[i]; 543 | if(item == 2) 544 | cBuf[j2 ++] = buf[i]; 545 | if(item == 3) 546 | dBuf[j3 ++] = buf[i]; 547 | } 548 | for(i = 0, aTag = 0; i < 100; i ++) 549 | { 550 | if(aBuf[i] == '.') 551 | { 552 | aTag = 1; 553 | continue; 554 | } 555 | 556 | if(aBuf[i] == '\0') break; 557 | 558 | if(aTag == 1) 559 | { 560 | realABuf[rj0 ++] = aBuf[i]; 561 | aBuf[i] = '\0'; 562 | } 563 | } 564 | for(i = 0, bTag = 0; i < 100; i ++) 565 | { 566 | if(bBuf[i] == '.') 567 | { 568 | bTag = 1; 569 | continue; 570 | } 571 | 572 | if(bBuf[i] == '\0') break; 573 | 574 | if(bTag == 1) 575 | { 576 | realBBuf[rj0 ++] = bBuf[i]; 577 | bBuf[i] = '\0'; 578 | } 579 | } 580 | a = atoi(aBuf); 581 | realA = aTag == 1 ? atoi(realABuf) : -1; 582 | b = atoi(bBuf); 583 | realB = bTag == 1 ? atoi(realBBuf) : -1; 584 | c = atoi(cBuf); 585 | d = atoi(dBuf); 586 | } 587 | 588 | void delta2psl(string st0) 589 | { 590 | ifstream in; 591 | ofstream out; 592 | string st1, buf, align; 593 | int i, j, sp, b, targetID, sourceID, targetSize, sourceSize, targetStart, targetEnd, sourceStart, sourceEnd, sourceGap, targetGap, realSourceID, realTargetID, dummyID; 594 | char fr; 595 | vector seg; 596 | Segment s; 597 | 598 | in.open(st0.c_str()); 599 | st1 = st0.substr(0, st0.find('.', st0.find('.', 0) + 1)).append(".psl"); 600 | out.open(st1.c_str()); 601 | 602 | if(in.is_open()) 603 | { 604 | getline(in, buf); 605 | getline(in, buf); 606 | while(in.good()) 607 | { 608 | align.clear(); seg.clear(); sourceGap = targetGap = 0; 609 | getline(in, buf); 610 | if(buf[0] == '\0') break; 611 | 612 | if(buf[0] == '>') 613 | { 614 | parseDelta(buf, targetID, sourceID, targetSize, sourceSize, realTargetID, realSourceID); 615 | getline(in, buf); 616 | } 617 | parseDelta(buf, targetStart, targetEnd, sourceStart, sourceEnd, dummyID, dummyID); 618 | if(sourceStart < sourceEnd) 619 | fr = '+'; 620 | else 621 | { 
622 | fr = '-'; 623 | b = sourceStart; 624 | sourceStart = sourceEnd; 625 | sourceEnd = b; 626 | } 627 | getline(in, buf); 628 | 629 | //generate alignment string 630 | while(buf[0] != '0') 631 | { 632 | b = atoi(buf.c_str()); 633 | for(i = 1; i < abs(b); i ++) 634 | align.push_back('M'); 635 | if(b > 0) 636 | { 637 | align.push_back('I'); 638 | targetGap ++; 639 | } 640 | else 641 | { 642 | align.push_back('D'); 643 | sourceGap ++; 644 | } 645 | getline(in, buf); 646 | } 647 | for(i = align.size(); i < (sourceEnd - sourceStart + 1) + targetGap; i ++) 648 | align.push_back('M'); 649 | 650 | //calculate seg 651 | for(i = 0, j = 0; i < align.size(); i ++) 652 | { 653 | if(align[i] != 'I') 654 | j ++; 655 | if(i == 0 && align[i] == 'M') 656 | { 657 | s.sourceStart = sourceStart - 1 + j; 658 | s.targetStart = s.size = -1; 659 | seg.push_back(s); 660 | } 661 | if(i - 1 >= 0 && (align[i - 1] == 'I' || align[i - 1] == 'D') && align[i] == 'M') 662 | { 663 | s.sourceStart = sourceStart - 1 + j; 664 | s.targetStart = s.size = -1; 665 | seg.push_back(s); 666 | } 667 | } 668 | for(i = 0, j = 0, sp = 0; i < align.size(); i ++) 669 | { 670 | if(align[i] != 'D') 671 | j ++; 672 | if(i == 0 && align[i] == 'M') 673 | { 674 | seg[sp ++].targetStart = targetStart - 1 + j; 675 | } 676 | if(i - 1 >= 0 && (align[i - 1] == 'I' || align[i - 1] == 'D') && align[i] == 'M') 677 | { 678 | seg[sp ++].targetStart = targetStart - 1 + j; 679 | } 680 | } 681 | for(i = 0, j = 0, sp = 0; i < align.size(); i ++) 682 | { 683 | if(align[i] == 'M') 684 | j ++; 685 | if(i + 1 < align.size() && align[i] == 'M' && (align[i + 1] == 'I' || align[i + 1] == 'D')) 686 | { 687 | seg[sp ++].size = j; 688 | j = 0; 689 | } 690 | if(i == align.size() - 1 && align[i] == 'M') 691 | seg[sp].size = j; 692 | } 693 | 694 | //write out one psl line 695 | sourceStart --; 696 | targetStart --; 697 | for(i = 0; i < seg.size(); i ++) 698 | { 699 | seg[i].sourceStart --; 700 | seg[i].targetStart --; 701 | } 702 | out << "NA NA 
NA NA NA " << sourceGap << " NA " << targetGap << " " << fr << " "; 703 | if(realSourceID != -1) 704 | out << sourceID << "." << realSourceID << " "; 705 | else 706 | out << sourceID << " "; 707 | out << sourceSize << " " << sourceStart << " " << sourceEnd << " "; 708 | if(realTargetID != -1) 709 | out << targetID << "." << realTargetID << " "; 710 | else 711 | out << targetID << " "; 712 | out << targetSize << " " << targetStart << " " << targetEnd << " NA "; 713 | for(i = 0; i < seg.size(); i ++) 714 | out << seg[i].size << ","; 715 | out << " "; 716 | for(i = 0; i < seg.size(); i ++) 717 | out << seg[i].sourceStart << ","; 718 | out << " "; 719 | for(i = 0; i < seg.size(); i ++) 720 | out << seg[i].targetStart << ","; 721 | out << endl; 722 | } 723 | } 724 | else 725 | { 726 | cout << "CANNOT OPEN FILE!" << endl; 727 | exit(-1); 728 | } 729 | } 730 | 731 | int keepPositions(vector & contigs, unsigned int sourceID, vector & segs, double threshold) 732 | //This is only used to complete the combination of BLAT's local alignments 733 | { 734 | int pp, match; 735 | 736 | if(sourceID == -1) return 1; 737 | if(contigs[sourceID].positionSets.size() == 0) return 1; 738 | match = 0; 739 | for(pp = 0; pp < contigs[sourceID].positionSets[contigs[sourceID].positionSets.size() - 1].size(); pp ++) 740 | { 741 | if(contigs[sourceID].positionSets[contigs[sourceID].positionSets.size() - 1][pp].chromosomeID != -1) 742 | match ++; 743 | } 744 | if((double) match / contigs[sourceID].positionSets[contigs[sourceID].positionSets.size() - 1].size() >= threshold) 745 | return 1; 746 | else 747 | return 0; 748 | } 749 | 750 | void complement(vector & contigs, unsigned int sourceID) 751 | { 752 | unsigned int np; 753 | 754 | for(np = 0; np < contigs[sourceID].nucleotides.size(); np ++) 755 | if(contigs[sourceID].nucleotides[np] == 'A') contigs[sourceID].nucleotides[np] = 'T'; 756 | else if(contigs[sourceID].nucleotides[np] == 'C') contigs[sourceID].nucleotides[np] = 'G'; 757 | else 
if(contigs[sourceID].nucleotides[np] == 'G') contigs[sourceID].nucleotides[np] = 'C'; 758 | else if(contigs[sourceID].nucleotides[np] == 'T') contigs[sourceID].nucleotides[np] = 'A'; 759 | //do nothing for 'N' 760 | } 761 | 762 | static unsigned int sourceIDBak = -1; 763 | void updateContig(vector & contigs, unsigned int sourceID, unsigned int targetID, vector & segs, unsigned int fr, double threshold) 764 | { 765 | int sp, ssp, np; 766 | vector positions; 767 | Position p; 768 | 769 | if(targetID == -1) return; 770 | p.chromosomeID = p.chromosomeOffset = -1; 771 | 772 | if(sourceID != sourceIDBak) 773 | { 774 | if(keepPositions(contigs, sourceIDBak, segs, threshold) == 0) 775 | { 776 | contigs[sourceIDBak].positionSets.erase(contigs[sourceIDBak].positionSets.end() - 1); 777 | contigs[sourceIDBak].frSets.erase(contigs[sourceIDBak].frSets.end() - 1); 778 | } 779 | contigs[sourceID].positionSets.push_back(positions); 780 | contigs[sourceID].frSets.push_back(fr); 781 | for(np = 0; np < contigs[sourceID].nucleotides.size(); np ++) 782 | contigs[sourceID].positionSets[contigs[sourceID].positionSets.size() - 1].push_back(p); 783 | //contigs[sourceID].positionSets[contigs[sourceID].positionSets.size() - 1].resize(contigs[sourceID].nucleotides.size(), p); 784 | sourceIDBak = sourceID; 785 | } 786 | else 787 | { 788 | for(sp = 0; sp < segs.size(); sp ++) 789 | for(ssp = segs[sp].sourceStart; ssp < segs[sp].sourceStart + segs[sp].size; ssp ++) 790 | { 791 | if(contigs[sourceID].positionSets[contigs[sourceID].positionSets.size() - 1][ssp].chromosomeID != -1) 792 | { 793 | if(keepPositions(contigs, sourceID, segs, threshold) == 0) 794 | { 795 | contigs[sourceID].positionSets.erase(contigs[sourceID].positionSets.end() - 1); 796 | contigs[sourceID].frSets.erase(contigs[sourceID].frSets.end() - 1); 797 | } 798 | contigs[sourceID].positionSets.push_back(positions); 799 | contigs[sourceID].frSets.push_back(fr); 800 | for(np = 0; np < contigs[sourceID].nucleotides.size(); np ++) 801 
| contigs[sourceID].positionSets[contigs[sourceID].positionSets.size() - 1].push_back(p); 802 | //contigs[sourceID].positionSets[contigs[sourceID].positionSets.size() - 1].resize(contigs[sourceID].nucleotides.size(), p); 803 | goto cont; 804 | } 805 | } 806 | } 807 | 808 | cont: 809 | for(sp = 0; sp < segs.size(); sp ++) 810 | for(ssp = 0; ssp < segs[sp].size; ssp ++) 811 | { 812 | contigs[sourceID].positionSets[contigs[sourceID].positionSets.size() - 1][segs[sp].sourceStart + ssp].chromosomeID = targetID; 813 | contigs[sourceID].positionSets[contigs[sourceID].positionSets.size() - 1][segs[sp].sourceStart + ssp].chromosomeOffset = segs[sp].targetStart + ssp; 814 | } 815 | } 816 | 817 | void loadContiAli(ifstream & ca, vector & contigs, int chromosomeID) 818 | { 819 | string buf; 820 | unsigned int targetID, targetStart, targetEnd, targetGap, sourceID, sourceStart, sourceEnd, sourceGap, sourceSize, fr, targetSize; 821 | int i, cp, pp, ppp, seqID, realSourceID; 822 | vector segs; 823 | 824 | sourceID = -1; 825 | if(ca.is_open()) 826 | { 827 | while(ca.good()) 828 | { 829 | getline(ca, buf); 830 | if(buf[0] == 0) 831 | { 832 | if(keepPositions(contigs, sourceID, segs, CONTIG_THRESHOLD) == 0) 833 | { 834 | contigs[sourceID].positionSets.erase(contigs[sourceID].positionSets.end() - 1);// have to keep or discard the last alignment 835 | } 836 | break; 837 | } 838 | 839 | realSourceID = parseBLAT(buf, targetID, targetStart, targetEnd, targetGap, sourceID, sourceStart, sourceEnd, sourceGap, sourceSize, segs, fr, targetSize); 840 | 841 | if((double) (sourceEnd - sourceStart - sourceGap) / sourceSize >= INIT_CONTIG_THRESHOLD && (double) (targetEnd - targetStart - targetGap) / (targetEnd - targetStart) >= INIT_CONTIG_THRESHOLD && sourceSize > 200) 842 | { 843 | updateContig(contigs, sourceID, targetID, segs, fr, CONTIG_THRESHOLD); 844 | } 845 | } 846 | } 847 | else 848 | { 849 | cout << "CANNOT OPEN FILE!" 
<< endl; 850 | exit(-1); 851 | } 852 | } 853 | 854 | void reverseComplement(vector & seq) 855 | { 856 | int np; 857 | 858 | reverse(seq.begin(), seq.end()); 859 | for(np = 0; np < seq.size(); np ++) 860 | if(seq[np] == 'A') seq[np] = 'T'; 861 | else if(seq[np] == 'C') seq[np] = 'G'; 862 | else if(seq[np] == 'G') seq[np] = 'C'; 863 | else if(seq[np] == 'T') seq[np] = 'A'; 864 | //do nothing for 'N' 865 | } 866 | 867 | void printInitialContigs(ofstream & out, Seq & seq, int seqID) 868 | { 869 | int np; 870 | 871 | if(seq.outputted == 0) 872 | { 873 | out << ">" << seqID << endl; 874 | for(np = 0; np < seq.nucleotides.size(); np ++) 875 | { 876 | out << seq.nucleotides[np]; 877 | if((np + 1) % 60 ==0 || np == seq.nucleotides.size() - 1) 878 | out << endl; 879 | } 880 | seq.outputted = 1; 881 | } 882 | } 883 | 884 | void updateGenomeWithContig(vector & seqs, vector > & genome, int chrID) 885 | { 886 | int sp, pp, ppp, npp, gp, tag, seqID, size, np; 887 | unsigned int chromosomeID, chromosomeOffset, nextID, nextOffset, nextItem, cpp, i, previousIDBak, previousOffsetBak, previousItemBak; 888 | Base b; 889 | ContiMer p, q; 890 | KMer k; 891 | char nucleotide; 892 | 893 | previousIDBak = previousOffsetBak = previousItemBak = -1; 894 | for(sp = 0; sp < seqs.size(); sp ++) 895 | { 896 | pp = 0; 897 | cont: 898 | size = seqs[sp].positionSets.size() == 0 ? 
0 : 1; 899 | for(; pp < seqs[sp].positionSets.size() ; pp ++) 900 | { 901 | //for(spp = 0; spp < sp; spp ++) // it is too time-consuming to iterate different seqs with different positions for one seq, so function compatible enables merging kmers from with different contig ids 902 | for(ppp = 0; ppp < pp; ppp ++) 903 | if(abs((int)(seqs[sp].positionSets[pp][0].chromosomeOffset - seqs[sp].positionSets[ppp][0].chromosomeOffset)) < seqs[sp].nucleotides.size()) 904 | { 905 | pp ++; 906 | goto cont; 907 | } 908 | for(ppp = 0; ppp < seqs[sp].positionSets[pp].size() - 1; ppp ++) 909 | { 910 | if(seqs[sp].positionSets[pp][ppp].chromosomeID != -1) 911 | { 912 | chromosomeID = seqs[sp].positionSets[pp][ppp].chromosomeID; 913 | chromosomeOffset = seqs[sp].positionSets[pp][ppp].chromosomeOffset; 914 | if(genome[chromosomeID][chromosomeOffset].contiMer.size() >= 2) 915 | { 916 | pp ++; 917 | goto cont; 918 | } 919 | } 920 | } 921 | tag = 0; 922 | if(seqs[sp].frSets[pp] == 1) 923 | { 924 | reverseComplement(seqs[sp].nucleotides); 925 | tag = 1; 926 | } 927 | //has to always adjust nucleotides according to fr 928 | seqs[sp].outputted = 1; 929 | // printInitialContigs(out, seqs[sp], sp); 930 | for(ppp = 0; ppp < seqs[sp].positionSets[pp].size() - 1; ppp ++) 931 | { 932 | if(seqs[sp].positionSets[pp][ppp].chromosomeID != -1) 933 | { 934 | chromosomeID = seqs[sp].positionSets[pp][ppp].chromosomeID; 935 | chromosomeOffset = seqs[sp].positionSets[pp][ppp].chromosomeOffset; 936 | nextID = seqs[sp].positionSets[pp][ppp + 1].chromosomeID; 937 | nextOffset = seqs[sp].positionSets[pp][ppp + 1].chromosomeOffset; 938 | // nextItem = genome[nextID][nextOffset].nextPosition.size(); 939 | nucleotide = seqs[sp].nucleotides[ppp]; 940 | if(nextID == -1)// insertion to genome 941 | { 942 | for(npp = ppp + 2; npp < seqs[sp].positionSets[pp].size(); npp ++) 943 | { 944 | if(seqs[sp].positionSets[pp][npp].chromosomeID != -1) 945 | { 946 | nextID = seqs[sp].positionSets[pp][npp].chromosomeID; 947 | 
nextOffset = seqs[sp].positionSets[pp][npp].chromosomeOffset; 948 | if(seqs[sp].positionSets[pp][npp].chromosomeID == seqs[sp].positionSets[pp][ppp].chromosomeID && npp - ppp < SI)// small insertion 949 | { 950 | // cout << "SI" << endl; 951 | // for(cpp = chromosomeOffset; cpp < nextOffset; cpp ++) 952 | { 953 | //p.previousID = previousIDBak; 954 | //p.previousOffset = previousOffsetBak; 955 | //p.previousItem = previousItemBak; 956 | p.nextID = nextID; 957 | p.nextOffset = nextOffset; 958 | p.nextItem = genome[p.nextID][p.nextOffset].contiMer.size(); 959 | p.contigID = sp; 960 | p.contigOffset = ppp; 961 | p.nucleotide = nucleotide; 962 | genome[chromosomeID][chromosomeOffset].contiMer.push_back(p); 963 | 964 | previousIDBak = chromosomeID; 965 | previousOffsetBak = chromosomeOffset; 966 | previousItemBak = genome[chromosomeID][chromosomeOffset].contiMer.size() - 1; 967 | //k.contigOffset = k.contigOffset0 = k.chromosomeID0 = k.chromosomeOffset0 = -1; 968 | //genome[chromosomeID][chromosomeOffset].kMer.push_back(k); 969 | //genome[chromosomeID][cpp].nextID = chromosomeID; 970 | //genome[chromosomeID][cpp].nextOffset = cpp + 1; 971 | } 972 | // cout << "IS" << endl; 973 | } 974 | else// large insertion 975 | { 976 | // cout << "LI" << endl; 977 | //p.previousID = previousIDBak; 978 | //p.previousOffset = previousOffsetBak; 979 | //p.previousItem = previousItemBak; 980 | p.nextID = chromosomeID; 981 | p.nextOffset = genome[chromosomeID].size(); 982 | p.nextItem = 0; 983 | p.contigID = sp; 984 | p.contigOffset = ppp; 985 | p.nucleotide = nucleotide; 986 | genome[chromosomeID][chromosomeOffset].contiMer.push_back(p); 987 | previousIDBak = chromosomeID; 988 | previousOffsetBak = chromosomeOffset; 989 | previousItemBak = genome[chromosomeID][chromosomeOffset].contiMer.size() - 1; 990 | //k.contigOffset = k.contigOffset0 = k.chromosomeID0 = k.chromosomeOffset0 = -1; 991 | //genome[chromosomeID][chromosomeOffset].kMer.push_back(k); 992 | 
//genome[chromosomeID][chromosomeOffset].nextID = chromosomeID; 993 | //genome[chromosomeID][chromosomeOffset].nextOffset = genome[chromosomeID].size(); 994 | for(i = 0; i < npp - ppp - 2; i ++) 995 | { 996 | b.contiMer.clear(); 997 | b.nucleotide = seqs[sp].nucleotides[ppp + 1 + i]; 998 | //p.previousID = previousIDBak; 999 | //p.previousOffset = previousOffsetBak; 1000 | //p.previousItem = previousItemBak; 1001 | p.nextID = chromosomeID; 1002 | p.nextOffset = genome[chromosomeID].size() + 1; 1003 | p.nextItem = 0; 1004 | p.contigID = sp; 1005 | p.contigOffset = ppp + 1 + i; 1006 | p.nucleotide = b.nucleotide; 1007 | b.contiMer.push_back(p); 1008 | 1009 | previousIDBak = chromosomeID; 1010 | previousOffsetBak = genome[chromosomeID].size(); 1011 | previousItemBak = 0; 1012 | //k.contigOffset = k.contigOffset0 = k.chromosomeID0 = k.chromosomeOffset0 = -1; 1013 | //b.kMer.push_back(k); 1014 | //b.nextID = chromosomeID; 1015 | //b.nextOffset = genome[chromosomeID].size(); 1016 | genome[chromosomeID].push_back(b); 1017 | } 1018 | b.contiMer.clear(); 1019 | b.nucleotide = seqs[sp].nucleotides[npp - 1]; 1020 | //p.previousID = previousIDBak; 1021 | //p.previousOffset = previousOffsetBak; 1022 | //p.previousItem = previousItemBak; 1023 | p.nextID = nextID; //contigs[cp].positionSets[pp][npp].chromosomeID; 1024 | p.nextOffset = nextOffset; //contigs[cp].positionSets[pp][npp].chromosomeOffset; 1025 | p.nextItem = genome[p.nextID][p.nextOffset].contiMer.size(); 1026 | p.contigID = sp; 1027 | p.contigOffset = npp - 1; 1028 | p.nucleotide = b.nucleotide; 1029 | b.contiMer.push_back(p); 1030 | 1031 | previousIDBak = chromosomeID; 1032 | previousOffsetBak = genome[chromosomeID].size(); 1033 | previousItemBak = 0; 1034 | //k.contigOffset = k.contigOffset0 = k.chromosomeID0 = k.chromosomeOffset0 = -1; 1035 | //b.kMer.push_back(k); 1036 | genome[chromosomeID].push_back(b); 1037 | //genome[chromosomeID][genome[chromosomeID].size() - 1].nextID = nextID; 1038 | 
//genome[chromosomeID][genome[chromosomeID].size() - 1].nextOffset = nextOffset; 1039 | // cout << "IL" << endl; 1040 | } 1041 | ppp = npp - 1; 1042 | break; 1043 | } 1044 | } 1045 | } 1046 | else if(nextID == chromosomeID && nextOffset != chromosomeOffset + 1)// deletion from genome 1047 | { 1048 | if(nextOffset - chromosomeOffset < SD)// small deletion 1049 | { 1050 | // cout << "SD" << endl; 1051 | for(cpp = chromosomeOffset; cpp < nextOffset; cpp ++) 1052 | { 1053 | //p.previousID = previousIDBak; 1054 | //p.previousOffset = previousOffsetBak; 1055 | //p.previousItem = previousItemBak; 1056 | p.nextID = chromosomeID; 1057 | p.nextOffset = cpp + 1; 1058 | p.nextItem = genome[p.nextID][p.nextOffset].contiMer.size(); 1059 | p.contigID = sp; 1060 | p.contigOffset = ppp; 1061 | if(cpp == chromosomeOffset) p.nucleotide = nucleotide; 1062 | else p.nucleotide = genome[chromosomeID][cpp].nucleotide; 1063 | genome[chromosomeID][cpp].contiMer.push_back(p); 1064 | 1065 | previousIDBak = chromosomeID; 1066 | previousOffsetBak = cpp; 1067 | previousItemBak = genome[chromosomeID][cpp].contiMer.size() - 1; 1068 | //k.contigOffset = k.contigOffset0 = k.chromosomeID0 = k.chromosomeOffset0 = -1; 1069 | //genome[chromosomeID][cpp].kMer.push_back(k); 1070 | //genome[chromosomeID][cpp].nextID = chromosomeID; 1071 | //genome[chromosomeID][cpp].nextOffset = cpp + 1; 1072 | } 1073 | // cout << "DS" << endl; 1074 | } 1075 | else// large deletion 1076 | { 1077 | // cout << "LD" << endl; 1078 | //p.previousID = previousIDBak; 1079 | //p.previousOffset = previousOffsetBak; 1080 | //p.previousItem = previousItemBak; 1081 | p.nextID = nextID; 1082 | p.nextOffset = nextOffset; 1083 | p.nextItem = genome[p.nextID][p.nextOffset].contiMer.size(); 1084 | p.contigID = sp; 1085 | p.contigOffset = ppp; 1086 | p.nucleotide = nucleotide; 1087 | genome[chromosomeID][chromosomeOffset].contiMer.push_back(p); 1088 | 1089 | previousIDBak = chromosomeID; 1090 | previousOffsetBak = chromosomeOffset; 1091 | 
previousItemBak = genome[chromosomeID][chromosomeOffset].contiMer.size() - 1; 1092 | //k.contigOffset = k.contigOffset0 = k.chromosomeID0 = k.chromosomeOffset0 = -1; 1093 | //genome[chromosomeID][chromosomeOffset].kMer.push_back(k); 1094 | //genome[chromosomeID][chromosomeOffset].nextID = nextID; 1095 | //genome[chromosomeID][chromosomeOffset].nextOffset = nextOffset; 1096 | // cout << "DL" << endl; 1097 | } 1098 | } 1099 | else// ordinary case 1100 | { 1101 | //p.previousID = previousIDBak; 1102 | //p.previousOffset = previousOffsetBak; 1103 | //p.previousItem = previousItemBak; 1104 | p.nextID = nextID; 1105 | p.nextOffset = nextOffset; 1106 | p.nextItem = genome[p.nextID][p.nextOffset].contiMer.size(); 1107 | p.contigID = sp; 1108 | p.contigOffset = ppp; 1109 | p.nucleotide = nucleotide; 1110 | genome[chromosomeID][chromosomeOffset].contiMer.push_back(p); 1111 | previousIDBak = chromosomeID; 1112 | previousOffsetBak = chromosomeOffset; 1113 | previousItemBak = genome[chromosomeID][chromosomeOffset].contiMer.size() - 1; 1114 | //k.contigOffset = k.contigOffset0 = k.chromosomeID0 = k.chromosomeOffset0 = -1; 1115 | //genome[chromosomeID][chromosomeOffset].kMer.push_back(k); 1116 | //genome[chromosomeID][chromosomeOffset].nextID = nextID; 1117 | //genome[chromosomeID][chromosomeOffset].nextOffset = nextOffset; 1118 | } 1119 | } 1120 | } 1121 | if(nextID != -1) 1122 | { 1123 | //p.previousID = previousIDBak; 1124 | //p.previousOffset = previousOffsetBak; 1125 | //p.previousItem = previousItemBak; 1126 | p.nextID = p.nextOffset = p.nextItem = -1; 1127 | p.contigID = sp; 1128 | p.contigOffset = ppp; 1129 | p.nucleotide = genome[nextID][nextOffset].nucleotide; 1130 | genome[nextID][nextOffset].contiMer.push_back(p); 1131 | 1132 | //k.contigOffset = k.contigOffset0 = k.chromosomeID0 = k.chromosomeOffset0 = -1; 1133 | //genome[nextID][nextOffset].kMer.push_back(k); 1134 | } 1135 | else// chromosomeID != -1 and nextID == -1 1136 | { 1137 | //p.previousID = previousIDBak; 
1138 | //p.previousOffset = previousOffsetBak; 1139 | //p.previousItem = previousItemBak; 1140 | p.nextID = p.nextOffset = p.nextItem = -1; 1141 | p.contigID = sp; 1142 | p.contigOffset = ppp; 1143 | p.nucleotide = genome[chromosomeID][chromosomeOffset].nucleotide; 1144 | genome[chromosomeID][chromosomeOffset].contiMer.push_back(p); 1145 | 1146 | //k.contigOffset = k.contigOffset0 = k.chromosomeID0 = k.chromosomeOffset0 = -1; 1147 | //genome[chromosomeID][chromosomeOffset].kMer.push_back(k); 1148 | } 1149 | if(tag == 1) reverseComplement(seqs[sp].nucleotides); 1150 | } 1151 | 1152 | // cout << "*************************" << endl; 1153 | // for(gp = 0; gp < genome.size(); gp ++) 1154 | // { 1155 | // for(cpp = 0; cpp < genome[gp].size(); cpp ++) 1156 | // { 1157 | // cout << "["; 1158 | // for(pp = 0; pp < genome[gp][cpp].contiMer.size(); pp ++) 1159 | // cout << "<" << genome[gp][cpp].contiMer[pp].contigOffset << "| " << genome[gp][cpp].contiMer[pp].previousID << ", " << genome[gp][cpp].contiMer[pp].previousOffset << ", " << genome[gp][cpp].contiMer[pp].previousItem << "| " << genome[gp][cpp].contiMer[pp].nextID << ", " << genome[gp][cpp].contiMer[pp].nextOffset << ", " << genome[gp][cpp].contiMer[pp].nextItem << ">"; 1160 | // cout << "]"; 1161 | // } 1162 | // cout << endl; 1163 | // } 1164 | // cout << "-------------------------" << endl; 1165 | // for(gp = 0; gp < genome.size(); gp ++) 1166 | // { 1167 | // for(cpp = 0; cpp < genome[gp].size(); cpp ++) 1168 | // { 1169 | // cout << "["; 1170 | // for(pp = 0; pp < genome[gp][cpp].kMer.size(); pp ++) 1171 | // cout << "<" << genome[gp][cpp].kMer[pp].contigOffset << ", " << genome[gp][cpp].kMer[pp].contigOffset0 << ", " << genome[gp][cpp].kMer[pp].chromosomeID0 << ", " << genome[gp][cpp].kMer[pp].chromosomeOffset0 << "| " << genome[gp][cpp].kMer[pp].previous[0].previousID << ", " << genome[gp][cpp].kMer[pp].previous[0].previousOffset << ", " << genome[gp][cpp].kMer[pp].previous[0].previousItem << "| " << 
genome[gp][cpp].kMer[pp].next[0].nextID << ", " << genome[gp][cpp].kMer[pp].next[0].nextOffset << ", " << genome[gp][cpp].kMer[pp].next[0].nextItem << ">"; 1172 | // cout << "]"; 1173 | // } 1174 | // } 1175 | // cout << endl; 1176 | // cout << "*************************" << endl; 1177 | } 1178 | 1179 | ofstream out; 1180 | string s = "tmp/_initial_contigs." + itoa(chrID) + ".fa"; 1181 | out.open(s.c_str()); 1182 | vector > contigs; 1183 | vector contig; 1184 | vector outputted; 1185 | vector maxOutputted; 1186 | int cp, frag; 1187 | 1188 | int IDBak = -1; 1189 | for(sp = 0; sp < seqs.size(); sp ++) 1190 | { 1191 | if(seqs[sp].ID != IDBak) 1192 | { 1193 | maxOutputted.push_back(0); 1194 | outputted.push_back(0); 1195 | contigs.push_back(contig); 1196 | IDBak = seqs[sp].ID; 1197 | } 1198 | maxOutputted[maxOutputted.size() - 1] ++; 1199 | outputted[outputted.size() - 1] = outputted[outputted.size() - 1] + seqs[sp].outputted; 1200 | for(np = 0; np < seqs[sp].nucleotides.size(); np ++) 1201 | contigs[contigs.size() - 1].push_back(seqs[sp].nucleotides[np]); 1202 | } 1203 | 1204 | for(cp = 0; cp < contigs.size(); cp ++) 1205 | { 1206 | if((double) outputted[cp] / (double) maxOutputted[cp] >= CONTIG_THRESHOLD) 1207 | { 1208 | out << ">" << cp << endl; 1209 | for(np = 0; np < contigs[cp].size(); np ++) 1210 | { 1211 | out << contigs[cp][np]; 1212 | if((np + 1) % 60 == 0 || np == contigs[cp].size() - 1) 1213 | out << endl; 1214 | } 1215 | } 1216 | } 1217 | } 1218 | 1219 | void loadContigAlignment(vector > & genome, int chromosomeID) 1220 | { 1221 | vector contigs; 1222 | ifstream c, ca; 1223 | string s; 1224 | 1225 | c.open("tmp/_contigs.fa"); 1226 | s = "tmp/_contigs_genome." 
+ itoa(chromosomeID) + ".psl"; 1227 | ca.open(s.c_str()); 1228 | loadSeq(c, contigs); 1229 | loadContiAli(ca, contigs, chromosomeID); 1230 | updateGenomeWithContig(contigs, genome, chromosomeID); 1231 | } 1232 | 1233 | int loadReadAli(ifstream & ra, vector & reads, int & aliStartID, int & seqStartID) 1234 | { 1235 | string buf; 1236 | unsigned int targetID1, targetStart1, targetEnd1, targetGap1, sourceID1, sourceStart1, sourceEnd1, sourceGap1, sourceSize1, fr1, targetID2, targetStart2, targetEnd2, targetGap2, sourceID2, sourceStart2, sourceEnd2, sourceGap2, sourceSize2, fr2; 1237 | vector segs1, segs2; 1238 | int gp, cpp, pp; 1239 | int count = 0; 1240 | 1241 | sourceIDBak = -1; 1242 | if(ra.is_open()) 1243 | { 1244 | while(ra.good()) 1245 | { 1246 | getline(ra, buf); 1247 | if(buf[0] == 0) break; 1248 | if(buf[0] == '@') continue; 1249 | parseBOWTIE(buf, targetID1, targetStart1, targetEnd1, targetGap1, sourceID1, sourceStart1, sourceEnd1, sourceGap1, sourceSize1, segs1, fr1); 1250 | getline(ra, buf); 1251 | if(buf[0] == 0) 1252 | { 1253 | cout << "BROKEN BOWTIE FILE" << endl; 1254 | exit(-1); 1255 | } 1256 | parseBOWTIE(buf, targetID2, targetStart2, targetEnd2, targetGap2, sourceID2, sourceStart2, sourceEnd2, sourceGap2, sourceSize2, segs2, fr2); 1257 | 1258 | if(sourceID1 < aliStartID) continue; 1259 | if(sourceID1 > seqStartID) return 0; 1260 | 1261 | if(targetID1 != -1 && targetID2 != -1 && (double) (sourceEnd1 - sourceStart1 - sourceGap1) / /*(sourceEnd1 - sourceStart1)*/ sourceSize1 >= THRESHOLD && (double) (targetEnd1 - targetStart1 - targetGap1) / (targetEnd1 - targetStart1) >= THRESHOLD && (double) (sourceEnd2 - sourceStart2 - sourceGap2) / /*(sourceEnd2 - sourceStart2)*/ sourceSize2 >= THRESHOLD && (double) (targetEnd2 - targetStart2 - targetGap2) / (targetEnd2 - targetStart2) >= THRESHOLD) 1262 | { 1263 | updateContig(reads, (sourceID1 - aliStartID) * 2, targetID1, segs1, fr1, THRESHOLD); 1264 | updateContig(reads, (sourceID2 - aliStartID) * 2 + 1, 
targetID2, segs2, fr2, THRESHOLD); 1265 | count ++; 1266 | } 1267 | segs1.clear(); 1268 | segs2.clear(); 1269 | } 1270 | return 1; 1271 | } 1272 | else 1273 | { 1274 | cout << "CANNOT OPEN FILE!" << endl; 1275 | exit(-1); 1276 | } 1277 | } 1278 | 1279 | int getDiff(vector s1, vector s2) 1280 | { 1281 | int diff = 0, i; 1282 | 1283 | if(s1.size() == 0 || s2.size() == 0) return -1; 1284 | 1285 | for(i = 0; i < s1.size(); i ++) 1286 | if(s1[i] != s2[i]) 1287 | diff ++; 1288 | return diff; 1289 | } 1290 | 1291 | //allowed pair distance < distanceHigh - distanceLow so that PE de Bruijn graph's branch solving function works 1292 | //distanceHigh - distanceLow > 50 1293 | int compatible(KMer & k1, KMer & k2, int insertVariation, int mrl) 1294 | { 1295 | if( 1296 | (k1.contigID == -1 || k2.contigID == -1 || (k1.contigID != -1 && k1.contigID == k2.contigID && abs((int)(k1.contigOffset - k2.contigOffset)) <= 5 * EP) 1297 | #ifdef OPTIMIZATION 1298 | || (k1.contigID != -1 && k2.contigID != -1 && k1.contigID != k2.contigID) 1299 | #endif 1300 | ) && 1301 | // it is hard to judge if two overlapping contigs should be joined or are just repetitive. Here I find it performs better to join them. 
1302 | (k1.contigID0 == -1 || k2.contigID0 == -1 || (k1.contigID0 != -1 && k1.contigID0 == k2.contigID0 && abs((int)(k1.contigOffset0 - k2.contigOffset0)) <= 2 * insertVariation + 5 * EP) 1303 | #ifdef OPTIMIZATION 1304 | || (k1.contigID0 != -1 && k2.contigID0 != -1 && k1.contigID0 != k2.contigID0) 1305 | #endif 1306 | ) && 1307 | (k1.chromosomeID0 == -1 || k2.chromosomeID0 == -1 || (k1.chromosomeID0 != -1 && k1.chromosomeID0 == k2.chromosomeID0 && abs((int)(k1.chromosomeOffset0 - k2.chromosomeOffset0)) <= 2 * insertVariation + 5 * EP)) 1308 | ) 1309 | return 1; 1310 | else 1311 | return 0; 1312 | } 1313 | 1314 | int nextCompatible(Next next1, Next next2) 1315 | { 1316 | if(next1.nextID == -1 || next2.nextID == -1) 1317 | { 1318 | cout << "KMER ERROR" << endl; 1319 | exit(-1); 1320 | } 1321 | 1322 | if(next1.nextID == next2.nextID && next1.nextOffset == next2.nextOffset && next1.nextItem == next2.nextItem) 1323 | return 1; 1324 | return 0; 1325 | } 1326 | 1327 | int previousCompatible(Previous previous1, Previous previous2) 1328 | { 1329 | if(previous1.previousID == -1 || previous2.previousID == -1) 1330 | { 1331 | cout << "KMER ERROR" << endl; 1332 | exit(-1); 1333 | } 1334 | 1335 | if(previous1.previousID == previous2.previousID && previous1.previousOffset == previous2.previousOffset && previous1.previousItem == previous2.previousItem) 1336 | return 1; 1337 | return 0; 1338 | } 1339 | 1340 | void updateKBases(KMer k, KMer & k0) 1341 | { 1342 | if(k.s.size() > 0) 1343 | switch(k.s[0]) 1344 | { 1345 | case 'A': k0.A ++; break; 1346 | case 'C': k0.C ++; break; 1347 | case 'G': k0.G ++; break; 1348 | case 'T': k0.T ++; break; 1349 | default: k0.N ++; 1350 | } 1351 | } 1352 | 1353 | void updateKMer(vector > & genome, unsigned int chromosomeID, unsigned int chromosomeOffset, unsigned int nextID, unsigned int nextOffset, unsigned int chromosomeID0, unsigned int chromosomeOffset0, unsigned int nextID0, unsigned int nextOffset0, vector s, vector nextS, int 
insertVariation, int mrl) 1354 | { 1355 | int ip, ipp, ip0, np, pp, cip, nip, size; 1356 | KMer k1, k2; 1357 | ContiMer p; 1358 | vector nextItem, chromosomeItem; 1359 | Next next; 1360 | Previous previous; 1361 | 1362 | k1.traversed = 0; 1363 | k1.s = s; 1364 | k1.chromosomeID0 = chromosomeID0; 1365 | k1.chromosomeOffset0 = chromosomeOffset0; 1366 | k1.A = k1.C = k1.G = k1.T = k1.N = 0; 1367 | k1.coverage = 1; 1368 | 1369 | if(genome[chromosomeID][chromosomeOffset].contiMer.size() == 0 && (chromosomeID0 == -1 || genome[chromosomeID0][chromosomeOffset0].contiMer.size() == 0)) 1370 | { 1371 | k1.contigID = -1; 1372 | k1.contigID0 = -1; 1373 | k1.contigOffset = -1; 1374 | k1.contigOffset0 = -1; 1375 | for(ipp = 0; ipp < genome[chromosomeID][chromosomeOffset].kMer.size(); ipp ++) 1376 | if(compatible(k1, genome[chromosomeID][chromosomeOffset].kMer[ipp], insertVariation, mrl)) 1377 | break; 1378 | if(ipp == genome[chromosomeID][chromosomeOffset].kMer.size()) 1379 | { 1380 | // cout << "insert to " << chromosomeID << ", " << chromosomeOffset << ", " << ipp << endl; 1381 | genome[chromosomeID][chromosomeOffset].kMer.push_back(k1); 1382 | updateKBases(k1, genome[chromosomeID][chromosomeOffset].kMer[ipp]); 1383 | } 1384 | else 1385 | { 1386 | genome[chromosomeID][chromosomeOffset].kMer[ipp].coverage ++; 1387 | // cout << "update " << chromosomeID << ", " << chromosomeOffset << ", " << ipp << endl; 1388 | updateKBases(k1, genome[chromosomeID][chromosomeOffset].kMer[ipp]); 1389 | } 1390 | chromosomeItem.push_back(ipp); 1391 | goto cont; 1392 | } 1393 | 1394 | if(genome[chromosomeID][chromosomeOffset].contiMer.size() != 0 && (chromosomeID0 == -1 || genome[chromosomeID0][chromosomeOffset0].contiMer.size() == 0)) 1395 | { 1396 | k1.contigID0 = -1; 1397 | k1.contigOffset0 = -1; 1398 | for(ip = 0; ip < genome[chromosomeID][chromosomeOffset].contiMer.size(); ip ++) 1399 | { 1400 | k1.contigID = genome[chromosomeID][chromosomeOffset].contiMer[ip].contigID; 1401 | k1.contigOffset = 
genome[chromosomeID][chromosomeOffset].contiMer[ip].contigOffset; 1402 | for(ipp = 0; ipp < genome[chromosomeID][chromosomeOffset].kMer.size(); ipp ++) 1403 | if(compatible(k1, genome[chromosomeID][chromosomeOffset].kMer[ipp], insertVariation, mrl)) 1404 | break; 1405 | if(ipp == genome[chromosomeID][chromosomeOffset].kMer.size()) 1406 | { 1407 | // cout << "insert to " << chromosomeID << ", " << chromosomeOffset << ", " << ipp << endl; 1408 | genome[chromosomeID][chromosomeOffset].kMer.push_back(k1); 1409 | updateKBases(k1, genome[chromosomeID][chromosomeOffset].kMer[ipp]); 1410 | } 1411 | else 1412 | { 1413 | genome[chromosomeID][chromosomeOffset].kMer[ipp].coverage ++; 1414 | // cout << "update " << chromosomeID << ", " << chromosomeOffset << ", " << ipp << endl; 1415 | updateKBases(k1, genome[chromosomeID][chromosomeOffset].kMer[ipp]); 1416 | } 1417 | chromosomeItem.push_back(ipp); 1418 | } 1419 | goto cont; 1420 | } 1421 | 1422 | if(genome[chromosomeID][chromosomeOffset].contiMer.size() == 0 && (chromosomeID0 != -1 && genome[chromosomeID0][chromosomeOffset0].contiMer.size() != 0)) 1423 | { 1424 | k1.contigID = -1; 1425 | k1.contigOffset = -1; 1426 | for(ip0 = 0; ip0 < genome[chromosomeID0][chromosomeOffset0].contiMer.size(); ip0 ++) 1427 | { 1428 | k1.contigID0 = genome[chromosomeID0][chromosomeOffset0].contiMer[ip0].contigID; 1429 | k1.contigOffset0 = genome[chromosomeID0][chromosomeOffset0].contiMer[ip0].contigOffset; 1430 | for(ipp = 0; ipp < genome[chromosomeID][chromosomeOffset].kMer.size(); ipp ++) 1431 | if(compatible(k1, genome[chromosomeID][chromosomeOffset].kMer[ipp], insertVariation, mrl)) 1432 | break; 1433 | if(ipp == genome[chromosomeID][chromosomeOffset].kMer.size()) 1434 | { 1435 | // cout << "insert to " << chromosomeID << ", " << chromosomeOffset << ", " << ipp << endl; 1436 | genome[chromosomeID][chromosomeOffset].kMer.push_back(k1); 1437 | updateKBases(k1, genome[chromosomeID][chromosomeOffset].kMer[ipp]); 1438 | } 1439 | else 1440 | { 1441 
| genome[chromosomeID][chromosomeOffset].kMer[ipp].coverage ++; 1442 | // cout << "update " << chromosomeID << ", " << chromosomeOffset << ", " << ipp << endl; 1443 | updateKBases(k1, genome[chromosomeID][chromosomeOffset].kMer[ipp]); 1444 | } 1445 | chromosomeItem.push_back(ipp); 1446 | } 1447 | goto cont; 1448 | } 1449 | 1450 | if(genome[chromosomeID][chromosomeOffset].contiMer.size() != 0 && (chromosomeID0 != -1 && genome[chromosomeID0][chromosomeOffset0].contiMer.size() != 0)) 1451 | { 1452 | for(ip = 0; ip < genome[chromosomeID][chromosomeOffset].contiMer.size(); ip ++) 1453 | for(ip0 = 0; ip0 < genome[chromosomeID0][chromosomeOffset0].contiMer.size(); ip0 ++) 1454 | { 1455 | k1.contigID = genome[chromosomeID][chromosomeOffset].contiMer[ip].contigID; 1456 | k1.contigID0 = genome[chromosomeID0][chromosomeOffset0].contiMer[ip0].contigID; 1457 | k1.contigOffset = genome[chromosomeID][chromosomeOffset].contiMer[ip].contigOffset; 1458 | k1.contigOffset0 = genome[chromosomeID0][chromosomeOffset0].contiMer[ip0].contigOffset; 1459 | for(ipp = 0; ipp < genome[chromosomeID][chromosomeOffset].kMer.size(); ipp ++) 1460 | if(compatible(k1, genome[chromosomeID][chromosomeOffset].kMer[ipp], insertVariation, mrl)) 1461 | break; 1462 | if(ipp == genome[chromosomeID][chromosomeOffset].kMer.size()) 1463 | { 1464 | // cout << "insert to " << chromosomeID << ", " << chromosomeOffset << ", " << ipp << endl; 1465 | genome[chromosomeID][chromosomeOffset].kMer.push_back(k1); 1466 | updateKBases(k1, genome[chromosomeID][chromosomeOffset].kMer[ipp]); 1467 | } 1468 | else 1469 | { 1470 | genome[chromosomeID][chromosomeOffset].kMer[ipp].coverage ++; 1471 | // cout << "update " << chromosomeID << ", " << chromosomeOffset << ", " << ipp << endl; 1472 | updateKBases(k1, genome[chromosomeID][chromosomeOffset].kMer[ipp]); 1473 | } 1474 | chromosomeItem.push_back(ipp); 1475 | } 1476 | goto cont; 1477 | } 1478 | 1479 | cont: 1480 | k2.traversed = 0; 1481 | k2.s = nextS; 1482 | k2.chromosomeID0 = 
nextID0; 1483 | k2.chromosomeOffset0 = nextOffset0; 1484 | k2.coverage = 0; 1485 | k2.A = k2.C = k2.G = k2.T = k2.N = 0; 1486 | 1487 | if(genome[nextID][nextOffset].contiMer.size() == 0 && (nextID0 == -1 || genome[nextID0][nextOffset0].contiMer.size() == 0)) 1488 | { 1489 | k2.contigID = -1; 1490 | k2.contigID0 = -1; 1491 | k2.contigOffset = -1; 1492 | k2.contigOffset0 = -1; 1493 | for(ipp = 0; ipp < genome[nextID][nextOffset].kMer.size(); ipp ++) 1494 | if(compatible(k2, genome[nextID][nextOffset].kMer[ipp], insertVariation, mrl)) 1495 | break; 1496 | if(ipp == genome[nextID][nextOffset].kMer.size()) 1497 | { 1498 | // cout << "insert to " << nextID << ", " << nextOffset << ", " << ipp << endl; 1499 | genome[nextID][nextOffset].kMer.push_back(k2); 1500 | } 1501 | else 1502 | { 1503 | // genome[chromosomeID][chromosomeOffset].kMer[ipp].coverage ++; 1504 | // cout << "update " << nextID << ", " << nextOffset << ", " << ipp << endl; 1505 | } 1506 | nextItem.push_back(ipp); 1507 | goto conti; 1508 | } 1509 | 1510 | if(genome[nextID][nextOffset].contiMer.size() != 0 && (nextID0 == -1 || genome[nextID0][nextOffset0].contiMer.size() == 0)) 1511 | { 1512 | k2.contigID0 = -1; 1513 | k2.contigOffset0 = -1; 1514 | for(ip = 0; ip < genome[nextID][nextOffset].contiMer.size(); ip ++) 1515 | { 1516 | k2.contigID = genome[nextID][nextOffset].contiMer[ip].contigID; 1517 | k2.contigOffset = genome[nextID][nextOffset].contiMer[ip].contigOffset; 1518 | for(ipp = 0; ipp < genome[nextID][nextOffset].kMer.size(); ipp ++) 1519 | if(compatible(k2, genome[nextID][nextOffset].kMer[ipp], insertVariation, mrl)) 1520 | break; 1521 | if(ipp == genome[nextID][nextOffset].kMer.size()) 1522 | { 1523 | // cout << "insert to " << nextID << ", " << nextOffset << ", " << ipp << endl; 1524 | genome[nextID][nextOffset].kMer.push_back(k2); 1525 | } 1526 | else 1527 | { 1528 | // genome[chromosomeID][chromosomeOffset].kMer[ipp].coverage ++; 1529 | // cout << "update " << nextID << ", " << nextOffset << ", 
" << ipp << endl; 1530 | } 1531 | nextItem.push_back(ipp); 1532 | } 1533 | goto conti; 1534 | } 1535 | 1536 | if(genome[nextID][nextOffset].contiMer.size() == 0 && (nextID0 != -1 && genome[nextID0][nextOffset0].contiMer.size() != 0)) 1537 | { 1538 | k2.contigID = -1; 1539 | k2.contigOffset = -1; 1540 | for(ip0 = 0; ip0 < genome[nextID0][nextOffset0].contiMer.size(); ip0 ++) 1541 | { 1542 | k2.contigID0 = genome[nextID0][nextOffset0].contiMer[ip0].contigID; 1543 | k2.contigOffset0 = genome[nextID0][nextOffset0].contiMer[ip0].contigOffset; 1544 | for(ipp = 0; ipp < genome[nextID][nextOffset].kMer.size(); ipp ++) 1545 | if(compatible(k2, genome[nextID][nextOffset].kMer[ipp], insertVariation, mrl)) 1546 | break; 1547 | if(ipp == genome[nextID][nextOffset].kMer.size()) 1548 | { 1549 | // cout << "insert to " << nextID << ", " << nextOffset << ", " << ipp << endl; 1550 | genome[nextID][nextOffset].kMer.push_back(k2); 1551 | } 1552 | else 1553 | { 1554 | // genome[chromosomeID][chromosomeOffset].kMer[ipp].coverage ++; 1555 | // cout << "update " << nextID << ", " << nextOffset << ", " << ipp << endl; 1556 | } 1557 | nextItem.push_back(ipp); 1558 | } 1559 | goto conti; 1560 | } 1561 | 1562 | if(genome[nextID][nextOffset].contiMer.size() != 0 && (nextID0 != -1 && genome[nextID0][nextOffset0].contiMer.size() != 0)) 1563 | { 1564 | for(ip = 0; ip < genome[nextID][nextOffset].contiMer.size(); ip ++) 1565 | for(ip0 = 0; ip0 < genome[nextID0][nextOffset0].contiMer.size(); ip0 ++) 1566 | { 1567 | k2.contigID = genome[nextID][nextOffset].contiMer[ip].contigID; 1568 | k2.contigID0 = genome[nextID0][nextOffset0].contiMer[ip0].contigID; 1569 | k2.contigOffset = genome[nextID][nextOffset].contiMer[ip].contigOffset; 1570 | k2.contigOffset0 = genome[nextID0][nextOffset0].contiMer[ip0].contigOffset; 1571 | for(ipp = 0; ipp < genome[nextID][nextOffset].kMer.size(); ipp ++) 1572 | if(compatible(k2, genome[nextID][nextOffset].kMer[ipp], insertVariation, mrl)) 1573 | break; 1574 | if(ipp == 
genome[nextID][nextOffset].kMer.size()) 1575 | { 1576 | // cout << "insert to " << nextID << ", " << nextOffset << ", " << ipp << endl; 1577 | genome[nextID][nextOffset].kMer.push_back(k2); 1578 | } 1579 | else 1580 | { 1581 | // genome[chromosomeID][chromosomeOffset].kMer[ipp].coverage ++; 1582 | // cout << "update " << nextID << ", " << nextOffset << ", " << ipp << endl; 1583 | } 1584 | nextItem.push_back(ipp); 1585 | } 1586 | goto conti; 1587 | } 1588 | //it is important to consider the case that one read geneartes two or more k-mers 1589 | conti: 1590 | for(cip = 0; cip < chromosomeItem.size(); cip ++) 1591 | for(nip = 0; nip < nextItem.size(); nip ++) 1592 | { 1593 | next.nextID = nextID; 1594 | next.nextOffset = nextOffset; 1595 | next.nextItem = nextItem[nip]; 1596 | for(np = 0; np < genome[chromosomeID][chromosomeOffset].kMer[chromosomeItem[cip]].next.size(); np ++) 1597 | if(nextCompatible(next, genome[chromosomeID][chromosomeOffset].kMer[chromosomeItem[cip]].next[np])) 1598 | break; 1599 | //decide connectivity between m new k1's and n new k2's 1600 | if(np == genome[chromosomeID][chromosomeOffset].kMer[chromosomeItem[cip]].next.size() && 1601 | 1602 | (genome[nextID][nextOffset].kMer[nextItem[nip]].contigID == -1 || 1603 | genome[chromosomeID][chromosomeOffset].kMer[chromosomeItem[cip]].contigID == -1 || 1604 | (genome[nextID][nextOffset].kMer[nextItem[nip]].contigID != -1 && genome[nextID][nextOffset].kMer[nextItem[nip]].contigID == genome[chromosomeID][chromosomeOffset].kMer[chromosomeItem[cip]].contigID && abs((int)(genome[nextID][nextOffset].kMer[nextItem[nip]].contigOffset - genome[chromosomeID][chromosomeOffset].kMer[chromosomeItem[cip]].contigOffset)) <= 5 * EP) 1605 | #ifdef OPTIMIZATION 1606 | || genome[nextID][nextOffset].kMer[nextItem[nip]].contigID != -1 && genome[chromosomeID][chromosomeOffset].kMer[chromosomeItem[cip]].contigID != -1 && genome[nextID][nextOffset].kMer[nextItem[nip]].contigID != 
genome[chromosomeID][chromosomeOffset].kMer[chromosomeItem[cip]].contigID 1607 | #endif 1608 | ) && 1609 | (genome[nextID][nextOffset].kMer[nextItem[nip]].contigID0 == -1 || 1610 | genome[chromosomeID][chromosomeOffset].kMer[chromosomeItem[cip]].contigID0 == -1 || 1611 | (genome[nextID][nextOffset].kMer[nextItem[nip]].contigID0 != -1 && genome[nextID][nextOffset].kMer[nextItem[nip]].contigID0 == genome[chromosomeID][chromosomeOffset].kMer[chromosomeItem[cip]].contigID0 && abs((int)(genome[nextID][nextOffset].kMer[nextItem[nip]].contigOffset0 - genome[chromosomeID][chromosomeOffset].kMer[chromosomeItem[cip]].contigOffset0)) <= 2 * insertVariation + 5 * EP) 1612 | #ifdef OPTIMIZATION 1613 | || genome[nextID][nextOffset].kMer[nextItem[nip]].contigID0 != -1 && genome[chromosomeID][chromosomeOffset].kMer[chromosomeItem[cip]].contigID0 != -1 && genome[nextID][nextOffset].kMer[nextItem[nip]].contigID0 != genome[chromosomeID][chromosomeOffset].kMer[chromosomeItem[cip]].contigID0 1614 | #endif 1615 | )) 1616 | 1617 | { 1618 | // cout << "connect [" << chromosomeID << ", " << chromosomeOffset << ", " << cip << ", " << np << "] to [" << nextID << ", " << nextOffset << ", " << nip << "]" << endl; 1619 | genome[chromosomeID][chromosomeOffset].kMer[chromosomeItem[cip]].next.push_back(next); 1620 | } 1621 | // else 1622 | // cout << "keep [" << chromosomeID << ", " << chromosomeOffset << ", " << cip << ", " << np << "] to [" << nextID << ", " << nextOffset << ", " << nip << "]" << endl; 1623 | } 1624 | } 1625 | 1626 | void switchSeqs(vector & p1, vector & p2, vector & n1, vector & n2) 1627 | { 1628 | vector tp; 1629 | vector tn; 1630 | 1631 | tp = p1; p1 = p2; p2 = tp; 1632 | tn = n1; n1 = n2; n2 = tn; 1633 | } 1634 | 1635 | void updateGenomeWithRead(vector & seqs, vector > & genome, int length, int insertVariation, int mrl) 1636 | { 1637 | int sp, pp, ppp, npp, gp, sp0, ip, ipp, np, tag0, tag1, tag2;// sp0 is for the other read 1638 | unsigned int chromosomeID, chromosomeOffset, 
nextID, nextOffset, nextItem, cpp, i, chromosomeID0, chromosomeOffset0, nextID0, nextOffset0, size; 1639 | Base b; 1640 | ContiMer p; 1641 | KMer k; 1642 | vector s, nextS; 1643 | 1644 | for(sp = 0, sp0 = 1; sp < seqs.size(); sp = sp + 2, sp0 = sp0 + 2) 1645 | { 1646 | pp = 0; 1647 | cont: 1648 | for(; pp < seqs[sp].positionSets.size(); pp ++) 1649 | { 1650 | for(ppp = 0; ppp < pp; ppp ++) 1651 | if(abs((int)(seqs[sp].positionSets[pp][0].chromosomeOffset - seqs[sp].positionSets[ppp][0].chromosomeOffset)) < seqs[sp].nucleotides.size()) 1652 | { 1653 | pp ++; 1654 | goto cont; 1655 | } 1656 | tag0 = tag1 = tag2 = 0; 1657 | if(seqs[sp].frSets[pp] == 1 && seqs[sp0].frSets[pp] == 0) 1658 | { 1659 | reverseComplement(seqs[sp].nucleotides); 1660 | tag0 = 1; 1661 | } 1662 | else if(seqs[sp0].frSets[pp] == 1 && seqs[sp].frSets[pp] == 0) 1663 | { 1664 | reverseComplement(seqs[sp0].nucleotides); 1665 | tag1 = 1; 1666 | } 1667 | else 1668 | { 1669 | cout << "BOWTIE ALIGNMENT ERROR" << endl; 1670 | exit(-1); 1671 | } 1672 | for(ppp = 0; ppp < seqs[sp].positionSets[pp].size() - length; ppp ++) 1673 | if(seqs[sp].positionSets[pp][ppp].chromosomeID != -1 && seqs[sp0].positionSets[pp][ppp].chromosomeID != -1 && 1674 | seqs[sp].positionSets[pp][ppp].chromosomeOffset > seqs[sp0].positionSets[pp][ppp].chromosomeOffset) 1675 | { 1676 | switchSeqs(seqs[sp].positionSets[pp], seqs[sp0].positionSets[pp], seqs[sp].nucleotides, seqs[sp0].nucleotides); 1677 | tag2 = 1; 1678 | break; 1679 | } 1680 | //has to always adjust nucleotides according to fr to accomodate reverse complements 1681 | for(ppp = 0; ppp < seqs[sp].positionSets[pp].size() - length; ppp ++) 1682 | { 1683 | if(seqs[sp].positionSets[pp][ppp].chromosomeID != -1) 1684 | { 1685 | chromosomeID = seqs[sp].positionSets[pp][ppp].chromosomeID; 1686 | chromosomeOffset = seqs[sp].positionSets[pp][ppp].chromosomeOffset; 1687 | chromosomeID0 = seqs[sp0].positionSets[pp][ppp].chromosomeID; 1688 | chromosomeOffset0 = 
seqs[sp0].positionSets[pp][ppp].chromosomeOffset; 1689 | 1690 | nextID = seqs[sp].positionSets[pp][ppp + 1].chromosomeID; 1691 | nextOffset = seqs[sp].positionSets[pp][ppp + 1].chromosomeOffset; 1692 | nextID0 = seqs[sp0].positionSets[pp][ppp + 1].chromosomeID; 1693 | nextOffset0 = seqs[sp0].positionSets[pp][ppp + 1].chromosomeOffset; 1694 | // nextItem = genome[nextID][nextOffset].nextPosition.size(); 1695 | if(nextID == -1)// insertion to genome 1696 | { 1697 | for(npp = ppp + 2; npp < seqs[sp].positionSets[pp].size(); npp ++) 1698 | { 1699 | if(seqs[sp].positionSets[pp][npp].chromosomeID != -1) 1700 | { 1701 | nextID = seqs[sp].positionSets[pp][npp].chromosomeID; 1702 | nextOffset = seqs[sp].positionSets[pp][npp].chromosomeOffset; 1703 | nextID0 = seqs[sp0].positionSets[pp][npp].chromosomeID; 1704 | nextOffset0 = seqs[sp0].positionSets[pp][npp].chromosomeOffset; 1705 | if(seqs[sp].positionSets[pp][npp].chromosomeID == seqs[sp].positionSets[pp][ppp].chromosomeID && npp - ppp < MAX)// small insertion 1706 | { 1707 | if(nextOffset == chromosomeOffset + 1) 1708 | // for(cpp = chromosomeOffset; cpp < nextOffset; cpp ++) 1709 | { 1710 | // p.nextID = nextID; 1711 | // p.nextOffset = nextOffset; 1712 | // p.nextItem = genome[p.nextID][p.nextOffset].nextPosition.size(); 1713 | // p.contigOffset = ppp; 1714 | // genome[chromosomeID][chromosomeOffset].nextPosition.push_back(p); 1715 | // cout << "SI" << endl; 1716 | if(s.size() > 0) s.clear(); 1717 | for(np = ppp; np < ppp + length; np ++) 1718 | s.push_back(seqs[sp].nucleotides[np]); 1719 | if(nextS.size() > 0) nextS.clear(); 1720 | //for(np = ppp + 1; np < ppp + length + 1; np ++) 1721 | size = npp + length < seqs[sp].nucleotides.size() ? 
npp + length : seqs[sp].nucleotides.size(); 1722 | for(np = npp; np < size; np ++) 1723 | nextS.push_back(seqs[sp].nucleotides[np]); 1724 | updateKMer(genome, chromosomeID, chromosomeOffset, nextID, nextOffset, chromosomeID0, chromosomeOffset0, nextID0, nextOffset0, s, nextS, insertVariation, mrl); 1725 | //genome[chromosomeID][cpp].nextID = chromosomeID; 1726 | //genome[chromosomeID][cpp].nextOffset = cpp + 1; 1727 | } 1728 | 1729 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 1730 | else 1731 | { 1732 | if(s.size() > 0) s.clear(); 1733 | for(np = ppp; np < ppp + length; np ++) 1734 | s.push_back(seqs[sp].nucleotides[np]); 1735 | if(nextS.size() > 0) nextS.clear(); 1736 | updateKMer(genome, chromosomeID, chromosomeOffset, chromosomeID, chromosomeOffset + 1, chromosomeID0, chromosomeOffset0, -1, -1, s, nextS, insertVariation, mrl); 1737 | for(cpp = chromosomeOffset + 1; cpp < nextOffset - 1; cpp ++) 1738 | { 1739 | if(s.size() > 0) s.clear(); 1740 | if(nextS.size() > 0) nextS.clear(); 1741 | updateKMer(genome, chromosomeID, cpp, chromosomeID, cpp + 1, -1, -1, -1, -1, s, nextS, insertVariation, mrl); 1742 | } 1743 | if(s.size() > 0) s.clear(); 1744 | if(nextS.size() > 0) nextS.clear(); 1745 | //for(np = ppp + 1; np < ppp + length + 1; np ++) 1746 | size = npp + length < seqs[sp].nucleotides.size() ? 
npp + length : seqs[sp].nucleotides.size(); 1747 | for(np = npp; np < size; np ++) 1748 | nextS.push_back(seqs[sp].nucleotides[np]); 1749 | updateKMer(genome, chromosomeID, cpp, chromosomeID, cpp + 1, -1, -1, nextID0, nextOffset0, s, nextS, insertVariation, mrl); 1750 | 1751 | } 1752 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 1753 | } 1754 | else// large insertion 1755 | //large insertion is ignored for simplicity 1756 | { 1757 | // cout << "LI" << endl; 1758 | p.nextID = chromosomeID; 1759 | p.nextOffset = genome[chromosomeID].size(); 1760 | p.nextItem = 0; 1761 | genome[chromosomeID][chromosomeOffset].contiMer.push_back(p); 1762 | //genome[chromosomeID][chromosomeOffset].nextID = chromosomeID; 1763 | //genome[chromosomeID][chromosomeOffset].nextOffset = genome[chromosomeID].size(); 1764 | for(i = 0; i < npp - ppp - 2; i ++) 1765 | { 1766 | b.contiMer.clear(); 1767 | b.nucleotide = seqs[sp].nucleotides[ppp + 1 + i]; 1768 | p.nextID = chromosomeID; 1769 | p.nextOffset = genome[chromosomeID].size() + 1; 1770 | p.nextItem = 0; 1771 | b.contiMer.push_back(p); 1772 | //b.nextID = chromosomeID; 1773 | //b.nextOffset = genome[chromosomeID].size(); 1774 | genome[chromosomeID].push_back(b); 1775 | } 1776 | b.contiMer.clear(); 1777 | b.nucleotide = seqs[sp].nucleotides[npp]; 1778 | p.nextID = nextID; //contigs[cp].positionSets[pp][npp].chromosomeID; 1779 | p.nextOffset = nextOffset; //contigs[cp].positionSets[pp][npp].chromosomeOffset; 1780 | p.nextItem = genome[p.nextID][p.nextOffset].contiMer.size(); 1781 | b.contiMer.push_back(p); 1782 | genome[chromosomeID].push_back(b); 1783 | //genome[chromosomeID][genome[chromosomeID].size() - 1].nextID = nextID; 1784 | //genome[chromosomeID][genome[chromosomeID].size() - 1].nextOffset = nextOffset; 1785 | } 1786 | ppp = npp - 1; 1787 | break; 1788 | } 1789 | } 1790 | } 1791 | else if(nextID == 
chromosomeID && nextOffset != chromosomeOffset + 1)// deletion from genome 1792 | { 1793 | if(nextOffset - chromosomeOffset < SD)// small deletion 1794 | { 1795 | // cout << "SD" << endl; 1796 | if(s.size() > 0) s.clear(); 1797 | for(np = ppp; np < ppp + length; np ++) 1798 | s.push_back(seqs[sp].nucleotides[np]); 1799 | if(nextS.size() > 0) nextS.clear(); 1800 | // for(np = ppp + 1; np < ppp + length + 1; np ++) 1801 | // nextS.push_back(seqs[sp].nucleotides[np]); 1802 | updateKMer(genome, chromosomeID, chromosomeOffset, chromosomeID, chromosomeOffset + 1, chromosomeID0, chromosomeOffset0, -1, -1, s, nextS, insertVariation, mrl); 1803 | for(cpp = chromosomeOffset + 1; cpp < nextOffset - 1; cpp ++) 1804 | { 1805 | // p.nextID = chromosomeID; 1806 | // p.nextOffset = cpp + 1; 1807 | // p.nextItem = genome[p.nextID][p.nextOffset].nextPosition.size(); 1808 | // genome[chromosomeID][cpp].nextPosition.push_back(p); 1809 | if(s.size() > 0) s.clear(); 1810 | if(nextS.size() > 0) nextS.clear(); 1811 | updateKMer(genome, chromosomeID, cpp, chromosomeID, cpp + 1, -1, -1, -1, -1, s, nextS, insertVariation, mrl); 1812 | //genome[chromosomeID][cpp].nextID = chromosomeID; 1813 | //genome[chromosomeID][cpp].nextOffset = cpp + 1; 1814 | } 1815 | if(s.size() > 0) s.clear(); 1816 | if(nextS.size() > 0) nextS.clear(); 1817 | for(np = ppp + 1; np < ppp + length + 1; np ++) 1818 | nextS.push_back(seqs[sp].nucleotides[np]); 1819 | updateKMer(genome, chromosomeID, cpp, chromosomeID, cpp + 1, -1, -1, nextID0, nextOffset0, s, nextS, insertVariation, mrl); 1820 | //The empty s and nextS make it possible to generate single nucleotide contig (the min contig size is not k-mer size) 1821 | } 1822 | else// large deletion 1823 | { 1824 | // cout << "LD" << endl; 1825 | // p.nextID = nextID; 1826 | // p.nextOffset = nextOffset; 1827 | // p.nextItem = genome[p.nextID][p.nextOffset].nextPosition.size(); 1828 | // genome[chromosomeID][chromosomeOffset].nextPosition.push_back(p); 1829 | if(s.size() > 
0) s.clear(); 1830 | for(np = ppp; np < ppp + length; np ++) 1831 | s.push_back(seqs[sp].nucleotides[np]); 1832 | if(nextS.size() > 0) nextS.clear(); 1833 | for(np = ppp + 1; np < ppp + length + 1; np ++) 1834 | nextS.push_back(seqs[sp].nucleotides[np]); 1835 | updateKMer(genome, chromosomeID, chromosomeOffset, nextID, nextOffset, chromosomeID0, chromosomeOffset0, nextID0, nextOffset0, s, nextS, insertVariation, mrl); 1836 | //genome[chromosomeID][chromosomeOffset].nextID = nextID; 1837 | //genome[chromosomeID][chromosomeOffset].nextOffset = nextOffset; 1838 | 1839 | } 1840 | } 1841 | else// ordinary case 1842 | { 1843 | // cout << "OD" << endl; 1844 | // p.nextID = nextID; 1845 | // p.nextOffset = nextOffset; 1846 | // p.nextItem = genome[p.nextID][p.nextOffset].nextPosition.size(); 1847 | // genome[chromosomeID][chromosomeOffset].nextPosition.push_back(p); 1848 | if(s.size() > 0) s.clear(); 1849 | for(np = ppp; np < ppp + length; np ++) 1850 | s.push_back(seqs[sp].nucleotides[np]); 1851 | if(nextS.size() > 0) nextS.clear(); 1852 | for(np = ppp + 1; np < ppp + length + 1; np ++) 1853 | nextS.push_back(seqs[sp].nucleotides[np]); 1854 | updateKMer(genome, chromosomeID, chromosomeOffset, nextID, nextOffset, chromosomeID0, chromosomeOffset0, nextID0, nextOffset0, s, nextS, insertVariation, mrl); 1855 | //genome[chromosomeID][chromosomeOffset].nextID = nextID; 1856 | //genome[chromosomeID][chromosomeOffset].nextOffset = nextOffset; 1857 | } 1858 | } 1859 | } 1860 | if(tag2 == 1) switchSeqs(seqs[sp].positionSets[pp], seqs[sp0].positionSets[pp], seqs[sp].nucleotides, seqs[sp0].nucleotides); 1861 | if(tag0 == 1 && tag1 == 0) reverseComplement(seqs[sp].nucleotides); 1862 | else if(tag1 == 1 && tag0 == 0) reverseComplement(seqs[sp0].nucleotides); 1863 | else 1864 | { 1865 | cout << "UNKNOWN ERROR" << endl; 1866 | exit(-1); 1867 | } 1868 | } 1869 | } 1870 | } 1871 | 1872 | void loadReadAlignment(vector > & genome, int length, int insertVariation, int chromosomeID, int mrl) 
1873 | { 1874 | vector reads; 1875 | ifstream r, ra; 1876 | int seqStartID, aliStartID, finish, rp; 1877 | string s; 1878 | // int counter = 0; 1879 | 1880 | r.open("tmp/_reads.fa"); 1881 | s = "tmp/_reads_genome." + itoa(chromosomeID) + ".bowtie"; 1882 | ra.open(s.c_str()); 1883 | 1884 | seqStartID = aliStartID = -1; 1885 | cont: 1886 | finish = loadSeq(r, reads, aliStartID, seqStartID); 1887 | loadReadAli(ra, reads, aliStartID, seqStartID); 1888 | updateGenomeWithRead(reads, genome, length, insertVariation, mrl); 1889 | if(finish == 0)// && counter < 2) 1890 | { 1891 | // cout << counter ++ << endl; 1892 | reads.clear(); 1893 | goto cont; 1894 | } 1895 | } 1896 | 1897 | int contain(unsigned int startID1, unsigned int startOffset1, unsigned int endID1, unsigned int endOffset1, unsigned int startID2, unsigned int startOffset2, unsigned int endID2, unsigned int endOffset2) 1898 | { 1899 | if(startID1 == startID2 && endID1 == endID2 && startOffset1 <= startOffset2 && endOffset1 >= endOffset2) 1900 | return 1; 1901 | return 0; 1902 | } 1903 | 1904 | void filterLowCoverage(vector > & genome, int coverage) 1905 | { 1906 | unsigned int gp, cp, ip, cpp, pp; 1907 | 1908 | for(gp = 0; gp < genome.size(); gp ++) 1909 | for(cp = 0; cp < genome[gp].size(); cp ++) 1910 | for(ip = 0; ip < genome[gp][cp].kMer.size(); ip ++) 1911 | { 1912 | if(genome[gp][cp].kMer[ip].contigID == -1) 1913 | { 1914 | if(genome[gp][cp].kMer[ip].coverage < coverage) 1915 | genome[gp][cp].kMer[ip].traversed = 1; 1916 | } 1917 | } 1918 | } 1919 | 1920 | void countBranches(vector > & genome) 1921 | { 1922 | unsigned int gp, cp, ip, nBranches = 0; 1923 | 1924 | for(gp = 0; gp < genome.size(); gp ++) 1925 | for(cp = 0; cp < genome[gp].size(); cp ++) 1926 | for(ip = 0; ip < genome[gp][cp].kMer.size(); ip ++) 1927 | if(genome[gp][cp].kMer[ip].traversed == 0 && genome[gp][cp].kMer[ip].next.size() > 1) 1928 | nBranches ++; 1929 | 1930 | cout << "ALERT: " << nBranches << " branches in de Bruijn graph" << endl; 
1931 | } 1932 | 1933 | void checkContiMers(vector > & genome) 1934 | { 1935 | unsigned int gp, cp, ip, nBranches = 0; 1936 | 1937 | for(gp = 0; gp < genome.size(); gp ++) 1938 | for(cp = 0; cp < genome[gp].size(); cp ++) 1939 | for(ip = 0; ip < genome[gp][cp].contiMer.size(); ip ++) 1940 | if(genome[gp][cp].contiMer[ip].contigID != -1 && genome[gp][cp].contiMer[ip].contigOffset == -1) 1941 | cout << "ERROR@ " << gp << ", " << cp << ", " << ip << endl; 1942 | } 1943 | 1944 | char max(int a, int c, int g, int t, int n) 1945 | { 1946 | if(a == 0 && c == 0 && g == 0 && t == 0 && n == 0) return 'X'; 1947 | if(a >= c && a >= g && a >= t && a >= n) return 'A'; 1948 | if(c >= a && c >= g && c >= t && c >= n) return 'C'; 1949 | if(g >= a && g >= c && g >= t && g >= n) return 'G'; 1950 | if(t >= a && t >= c && t >= g && t >= n) return 'T'; 1951 | if(n >= a && n >= c && n >= g && n >= t) return 'N'; 1952 | } 1953 | 1954 | void extdContigs1(vector > & genome, int coverage, int k, int chromosomeID) 1955 | { 1956 | unsigned int gp, cp, ip, gpp, cpp, ipp, gppBak, cppBak, ippBak, seqID, i, kMerTag, numItem, nextItem, sp, gpBak, cpBak, ipBak, np, gpp0, cpp0, ngp, ncp, nip, ippp, item, next, count, nCount, startIDBak = -1, startOffsetBak = -1, endIDBak = -1, endOffsetBak = -1; 1957 | Contig contig; 1958 | vector sBak; 1959 | vector contigsBuf; 1960 | char nucleotide; 1961 | 1962 | //****************************************************************************************************************************************************************************** 1963 | ofstream out; 1964 | string s = "tmp/_pre_extended_contigs." 
+ itoa(chromosomeID) + ".fa"; 1965 | out.open(s.c_str()); 1966 | //****************************************************************************************************************************************************************************** 1967 | 1968 | filterLowCoverage(genome, coverage); 1969 | // countBranches(genome); 1970 | 1971 | seqID = 0; 1972 | for(gp = 0; gp < genome.size(); gp ++) 1973 | for(cp = 0; cp < genome[gp].size();) 1974 | { 1975 | // if(genome[gp][cp].kMer.size() != 0) 1976 | for(ip = 0; ip < genome[gp][cp].kMer.size(); ip ++) 1977 | { 1978 | if(genome[gp][cp].kMer[ip].traversed == 0) 1979 | { 1980 | gpp = gp; 1981 | cpp = cp; 1982 | ipp = ip; 1983 | kMerTag = 1; 1984 | contigs.push_back(contig); 1985 | contigs[contigs.size() - 1].startID = gp; 1986 | contigs[contigs.size() - 1].startOffset = cp; 1987 | contigs[contigs.size() - 1].startID0 = genome[gp][cp].kMer[ip].chromosomeID0; 1988 | contigs[contigs.size() - 1].startOffset0 = genome[gp][cp].kMer[ip].chromosomeOffset0; 1989 | contigs[contigs.size() - 1].extended = 0; 1990 | 1991 | while(kMerTag == 1 && genome[gpp][cpp].kMer[ipp].traversed == 0 || kMerTag == 0) 1992 | { 1993 | if(kMerTag == 0) 1994 | contigs[contigs.size() - 1].nucleotides.push_back(genome[gpp][cpp].contiMer[ipp].nucleotide); 1995 | else 1996 | { 1997 | nucleotide = max(genome[gpp][cpp].kMer[ipp].A, genome[gpp][cpp].kMer[ipp].C, genome[gpp][cpp].kMer[ipp].G, genome[gpp][cpp].kMer[ipp].T, genome[gpp][cpp].kMer[ipp].N); 1998 | if(nucleotide != 'X') 1999 | contigs[contigs.size() - 1].nucleotides.push_back(nucleotide); 2000 | else 2001 | contigs[contigs.size() - 1].nucleotides.push_back(genome[gpp][cpp].nucleotide); 2002 | } 2003 | 2004 | if(kMerTag == 1 && genome[gpp][cpp].kMer[ipp].contigOffset != -1 || kMerTag == 0) 2005 | //incorporate novel ones 2006 | contigs[contigs.size() - 1].extended = 1; 2007 | 2008 | if(kMerTag == 1) 2009 | { 2010 | gpp0 = genome[gpp][cpp].kMer[ipp].chromosomeID0; 2011 | cpp0 = 
genome[gpp][cpp].kMer[ipp].chromosomeOffset0; 2012 | // cout << "k-mer " << cpp << ": " << genome[gpp][cpp].nucleotide << " | "; 2013 | genome[gpp][cpp].kMer[ipp].traversed = 1; 2014 | gpBak = gpp; 2015 | cpBak = cpp; 2016 | ipBak = ipp; 2017 | sBak = genome[gpp][cpp].kMer[ipp].s; 2018 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2019 | 2020 | nCount = 0; 2021 | next = -1; 2022 | for(np = 0; np < genome[gpp][cpp].kMer[ipp].next.size(); np ++) 2023 | { 2024 | ngp = genome[gpp][cpp].kMer[ipp].next[np].nextID; 2025 | ncp = genome[gpp][cpp].kMer[ipp].next[np].nextOffset; 2026 | nip = genome[gpp][cpp].kMer[ipp].next[np].nextItem; 2027 | if(ngp != -1 && genome[ngp][ncp].kMer[nip].traversed == 0) 2028 | { 2029 | next = np; 2030 | nCount ++; 2031 | } 2032 | } 2033 | if(nCount == 1) 2034 | 2035 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2036 | // if(genome[gpp][cpp].kMer[ipp].next.size() == 1 && genome[gpp][cpp].kMer[ipp].next[0].nextID != -1)// need improvement considering filteration 2037 | { 2038 | gppBak = genome[gpp][cpp].kMer[ipp].next[next].nextID; 2039 | cppBak = genome[gpp][cpp].kMer[ipp].next[next].nextOffset; 2040 | ippBak = genome[gpp][cpp].kMer[ipp].next[next].nextItem; 2041 | gpp = gppBak; 2042 | cpp = cppBak; 2043 | ipp = ippBak; 2044 | kMerTag = 1; 2045 | // cout << "keep k-mer" << " | "; 2046 | } 2047 | else if(genome[gpp][cpp].contiMer.size() == 1 && genome[gpp][cpp].contiMer[0].nextID != -1) 2048 | { 2049 | gppBak = genome[gpp][cpp].contiMer[0].nextID; 2050 | cppBak = genome[gpp][cpp].contiMer[0].nextOffset; 2051 | ippBak = genome[gpp][cpp].contiMer[0].nextItem; 2052 | gpp = gppBak; 2053 | cpp = cppBak; 2054 | ipp = ippBak; 2055 | kMerTag = 0; 2056 | // cout << "switch from k-mer to 
conti-mer" << " | "; 2057 | } 2058 | else 2059 | kMerTag = -1; 2060 | } 2061 | else// if(kMerTag == 0) 2062 | { 2063 | // cout << "conti-mer " << cpp << ": " << genome[gpp][cpp].nucleotide << " | "; 2064 | if(genome[gpp][cpp].contiMer[ipp].nextID != -1) 2065 | { 2066 | gppBak = genome[gpp][cpp].contiMer[ipp].nextID; 2067 | cppBak = genome[gpp][cpp].contiMer[ipp].nextOffset; 2068 | ippBak = genome[gpp][cpp].contiMer[ipp].nextItem; 2069 | gpp = gppBak; 2070 | cpp = cppBak; 2071 | ipp = ippBak; 2072 | kMerTag = 0; 2073 | //maybe not necessary to get back to k-mer? 2074 | // for(ippBak = 0, numItem = 0; ippBak < genome[gpp][cpp].kMer.size(); ippBak ++) 2075 | // if(genome[gpp][cpp].kMer[ippBak].traversed == 0) 2076 | // { 2077 | // numItem ++; 2078 | // nextItem = ippBak; 2079 | // } 2080 | // if(numItem == 1) 2081 | // { 2082 | // cout << "switch from conti-mer to k-mer (1)" << " | "; 2083 | // ipp = nextItem; 2084 | // kMerTag = 1; 2085 | // } 2086 | // else 2087 | // cout << "keep conti-mer" << " | "; 2088 | } 2089 | else 2090 | { 2091 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2092 | 2093 | count = nCount = 0; 2094 | item = next = -1; 2095 | for(ippp = 0; ippp < genome[gpp][cpp].kMer.size(); ippp ++) 2096 | if(genome[gpp][cpp].kMer[ippp].traversed == 0) 2097 | { 2098 | count ++; 2099 | item = ippp; 2100 | } 2101 | if(count == 1) 2102 | { 2103 | for(np = 0; np < genome[gpp][cpp].kMer[item].next.size(); np ++) 2104 | { 2105 | ngp = genome[gpp][cpp].kMer[item].next[np].nextID; 2106 | ncp = genome[gpp][cpp].kMer[item].next[np].nextOffset; 2107 | nip = genome[gpp][cpp].kMer[item].next[np].nextItem; 2108 | if(ngp != -1 && genome[ngp][ncp].kMer[nip].traversed == 0) 2109 | { 2110 | nCount ++; 2111 | next = np; 2112 | } 2113 | } 2114 | } 2115 | if(nCount == 1) 2116 | 2117 | 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2118 | // if(genome[gpp][cpp].kMer.size() == 1 && genome[gpp][cpp].kMer[0].next.size() == 1 && genome[gpp][cpp].kMer[0].next[0].nextID != -1) 2119 | { 2120 | gppBak = genome[gpp][cpp].kMer[item].next[next].nextID; 2121 | cppBak = genome[gpp][cpp].kMer[item].next[next].nextOffset; 2122 | ippBak = genome[gpp][cpp].kMer[item].next[next].nextItem; 2123 | gpp = gppBak; 2124 | cpp = cppBak; 2125 | ipp = ippBak; 2126 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2127 | if(genome[gpp][cpp].kMer[ipp].traversed == 0) 2128 | kMerTag = 1; 2129 | else 2130 | kMerTag = -2; 2131 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2132 | //kMerTag = 1; 2133 | // cout << "switch from conti-mer to k-mer (2)" << " | "; 2134 | } 2135 | else 2136 | kMerTag = -2; 2137 | } 2138 | } 2139 | } 2140 | 2141 | //xpp and xppBak are always the same 2142 | if(kMerTag == 1)// && genome[gpp][cpp].kMer[ipp].traversed == 1 2143 | { 2144 | contigs[contigs.size() - 1].endID = gppBak; 2145 | contigs[contigs.size() - 1].endOffset = cppBak; 2146 | } 2147 | else//kMerTag == -1 || kMerTag == -2 2148 | { 2149 | contigs[contigs.size() - 1].endID = gpp; 2150 | contigs[contigs.size() - 1].endOffset = cpp; 2151 | } 2152 | 2153 | if(kMerTag == 1 || kMerTag == -1) 2154 | { 2155 | contigs[contigs.size() - 1].endID0 = genome[gpp][cpp].kMer[ipp].chromosomeID0; 2156 | contigs[contigs.size() - 1].endOffset0 = genome[gpp][cpp].kMer[ipp].chromosomeOffset0; 2157 | } 2158 | else// kMerTag == -2 2159 | { 2160 | contigs[contigs.size() - 1].endID0 = -1; 2161 | contigs[contigs.size() - 1].endOffset0 = 
-1; 2162 | } 2163 | 2164 | if(kMerTag == -1 || kMerTag == 1) 2165 | { 2166 | //append the last |s| - 1 bases only if getting out of while from k-mer rather than conti-mer 2167 | for(sp = 1; sp < sBak.size(); sp ++) 2168 | contigs[contigs.size() - 1].nucleotides.push_back(sBak[sp]); 2169 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2170 | contigs[contigs.size() - 1].endOffset = contigs[contigs.size() - 1].endOffset + sBak.size() - 1; 2171 | contigs[contigs.size() - 1].endOffset0 = contigs[contigs.size() - 1].endOffset0 + sBak.size() - 1; 2172 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2173 | } 2174 | 2175 | //****************************************************************************************************************************************************************************** 2176 | if(/*contigs[contigs.size() - 1].extended == 1 && */ contain(startIDBak, startOffsetBak, endIDBak, endOffsetBak, contigs[contigs.size() - 1].startID, contigs[contigs.size() - 1].startOffset, contigs[contigs.size() - 1].endID, contigs[contigs.size() - 1].endOffset) == 0) 2177 | { 2178 | out << ">" << seqID ++ << ", " /*<< contigs[contigs.size() - 1].contigID << ", " << contigs[contigs.size() - 1].contigOffset << ", " << contigs[contigs.size() - 1].contigID0 << ", " << contigs[contigs.size() - 1].contigOffset0 << ", "*/ << contigs[contigs.size() - 1].extended << ", " << contigs[contigs.size() - 1].startID << ", " << contigs[contigs.size() - 1].startOffset << ", " << contigs[contigs.size() - 1].endID << ", " << contigs[contigs.size() - 1].endOffset << ", " << contigs[contigs.size() - 1].startID0 << ", " << contigs[contigs.size() - 1].startOffset0 << ", " << contigs[contigs.size() - 1].endID0 << ", " << 
contigs[contigs.size() - 1].endOffset0 << " " << endl; 2179 | for(i = 0; i < contigs[contigs.size() - 1].nucleotides.size(); i ++) 2180 | { 2181 | out << contigs[contigs.size() - 1].nucleotides[i]; 2182 | if((i + 1) % 60 == 0 || i == contigs[contigs.size() - 1].nucleotides.size() - 1) 2183 | out << endl; 2184 | } 2185 | startIDBak = contigs[contigs.size() - 1].startID; 2186 | startOffsetBak = contigs[contigs.size() - 1].startOffset; 2187 | endIDBak = contigs[contigs.size() - 1].endID; 2188 | endOffsetBak = contigs[contigs.size() - 1].endOffset; 2189 | } 2190 | contigs.clear(); 2191 | //****************************************************************************************************************************************************************************** 2192 | } 2193 | } 2194 | if(endOffsetBak - startOffsetBak > 100000) 2195 | { 2196 | if(gp == endIDBak && cp + 1000 < endOffsetBak) 2197 | cp = cp + 1000; 2198 | else 2199 | cp ++; 2200 | } 2201 | else 2202 | cp ++; 2203 | } 2204 | }

// Parses a header line of the form
//   ">seqID, extended, startID, startOffset, endID, endOffset, startID0, startOffset0, endID0, endOffset0 "
// into the nine output parameters.
// The line is split on single spaces. Item 0 (the ">seqID," token) is ignored,
// items 1..9 are converted to numbers (a trailing comma is ignored by the
// conversion), and scanning stops at the first character of item 10.
// A character belonging to an item beyond 10 prints "UNKNOWN ERROR" and exits.
// Fields are collected in strings, so a field of any length is safe. Unsigned
// fields are read with strtoul, so the value 4294967295 that the writer emits
// for "-1" converts without leaving the range of the conversion function.
void parse(string buf, int & extended, unsigned int & startID, unsigned int & startOffset, unsigned int & endID, unsigned int & endOffset, unsigned int & startID0, unsigned int & startOffset0, unsigned int & endID0, unsigned int & endOffset0)
{
    string fields[10];// fields[1..9] hold the text of items 1..9; fields[0] stays unused
    int item = 0;

    for(string::size_type i = 0; i < buf.size(); i ++)
    {
        if(buf[i] == ' ')
        {
            item ++;
            continue;
        }

        if(item == 0)
            continue;
        if(item == 10)
            break;
        if(item > 10)
        {
            cout << "UNKNOWN ERROR" << endl;
            exit(-1);
        }
        fields[item].push_back(buf[i]);
    }

    extended = atoi(fields[1].c_str());
    startID = (unsigned int) strtoul(fields[2].c_str(), NULL, 10);
    startOffset = (unsigned int) strtoul(fields[3].c_str(), NULL, 10);
    endID = (unsigned int) strtoul(fields[4].c_str(), NULL, 10);
    endOffset = (unsigned int) strtoul(fields[5].c_str(), NULL, 10);
    startID0 = (unsigned int) strtoul(fields[6].c_str(), NULL, 10);
    startOffset0 = (unsigned int) strtoul(fields[7].c_str(), NULL, 10);
    endID0 = (unsigned int) strtoul(fields[8].c_str(), NULL, 10);
    endOffset0 = (unsigned int) strtoul(fields[9].c_str(), NULL, 10);
}

void loadContigs(int chromosomeID) 2259 | { 2260 | int cp, cpp, i; 2261 | ifstream in; 2262 | string buf; 2263 | Contig contig; 2264 | 2265 | string s = "tmp/_pre_extended_contigs." 
+ itoa(chromosomeID) + ".fa"; 2266 | in.open(s.c_str()); 2267 | contig.extended = /*contig.contigID = contig.contigOffset = contig.contigID0 = contig.contigOffset0 =*/ contig.startID = contig.startOffset = contig.endID = contig.endOffset = contig.startID0 = contig.startOffset0 = contig.endID0 = contig.endOffset0 = -1; 2268 | 2269 | if(in.is_open()) 2270 | { 2271 | while(in.good()) 2272 | { 2273 | getline(in, buf); 2274 | if(buf[0] == 0) 2275 | break; 2276 | 2277 | if(buf[0] == '>') 2278 | { 2279 | contigs.push_back(contig); 2280 | parse(buf, contigs[contigs.size() - 1].extended, /*contigs[contigs.size() - 1].contigID, contigs[contigs.size() - 1].contigOffset, contigs[contigs.size() - 1].contigID0, contigs[contigs.size() - 1].contigOffset0,*/ contigs[contigs.size() - 1].startID, contigs[contigs.size() - 1].startOffset, contigs[contigs.size() - 1].endID, contigs[contigs.size() - 1].endOffset, contigs[contigs.size() - 1].startID0, contigs[contigs.size() - 1].startOffset0, contigs[contigs.size() - 1].endID0, contigs[contigs.size() - 1].endOffset0); 2281 | } 2282 | else 2283 | { 2284 | for(i = 0; i < buf.size(); i ++) 2285 | contigs[contigs.size() - 1].nucleotides.push_back(buf[i]); 2286 | } 2287 | } 2288 | } 2289 | else 2290 | { 2291 | cout << "CANNOT OPEN FILE!" 
<< endl; 2292 | exit(-1); 2293 | } 2294 | } 2295 | 2296 | void extdContigs2(int chromosomeID) 2297 | { 2298 | int cp, cpp, cppBak, np; 2299 | vector contigsBuf; 2300 | 2301 | loadContigs(chromosomeID); 2302 | 2303 | for(cp = 0; cp < contigs.size(); cp ++) 2304 | { 2305 | if(contigs[cp].extended == 1) 2306 | { 2307 | for(cpp = cp + 1; cpp < contigs.size(); cpp ++) 2308 | //if(contigs[cpp].extended == 1) 2309 | { 2310 | if(contain(contigs[cp].startID, contigs[cp].startOffset, contigs[cp].endID, contigs[cp].endOffset, 2311 | contigs[cpp].startID, contigs[cpp].startOffset, contigs[cpp].endID, contigs[cpp].endOffset)) 2312 | { 2313 | // cout << "duplication removed" << endl; 2314 | contigs[cpp].extended = 2; 2315 | } 2316 | else if(contigs[cp].endID != contigs[cpp].startID || contigs[cp].endOffset < contigs[cpp].startOffset) 2317 | break; 2318 | } 2319 | } 2320 | } 2321 | 2322 | for(cp = contigs.size() - 1; cp != -1; cp --) 2323 | { 2324 | if(contigs[cp].extended == 1) 2325 | { 2326 | for(cpp = cp - 1; cpp != -1; cpp --) 2327 | //if(contigs[cpp].extended == 1) 2328 | { 2329 | if(contain(contigs[cp].startID, contigs[cp].startOffset, contigs[cp].endID, contigs[cp].endOffset, 2330 | contigs[cpp].startID, contigs[cpp].startOffset, contigs[cpp].endID, contigs[cpp].endOffset)) 2331 | { 2332 | // cout << "duplication removed" << endl; 2333 | contigs[cpp].extended = 2; 2334 | } 2335 | else if(contigs[cpp].endID != contigs[cp].startID || contigs[cpp].endOffset < contigs[cp].startOffset) 2336 | break; 2337 | } 2338 | } 2339 | } 2340 | //remove duplication (0: no contig; 1: extended with contig; 2: filtered) 2341 | 2342 | for(cp = 0; cp < contigs.size(); cp ++) 2343 | { 2344 | cont: 2345 | if(contigs[cp].extended == 1) 2346 | { 2347 | cppBak = -1; 2348 | contigsBuf.clear(); 2349 | for(cpp = cp + 1; cpp < contigs.size(); cpp ++) 2350 | if(contigs[cpp].extended != 2) 2351 | { 2352 | if(//contigs[cp].endID == contigs[cpp].startID && 2353 | //contigs[cp].endOffset < 
contigs[cpp].startOffset && 2354 | contigs[cp].endOffset >= contigs[cpp].startOffset) 2355 | { 2356 | // cout << "potential extension: " << contigs[cp].startOffset << " to " << contigs[cpp].startOffset << endl; 2357 | contigsBuf.push_back(contigs[cpp]); 2358 | cppBak = cpp; 2359 | } 2360 | else if(//contigs[cp].endID != contigs[cpp].startID || 2361 | contigs[cp].endOffset < contigs[cpp].startOffset) 2362 | break; 2363 | } 2364 | if(contigsBuf.size() == 1) 2365 | { 2366 | // cout << "real extension: " << contigs[cp].startOffset << " to " << contigs[cppBak].startOffset << endl; 2367 | contigs[cppBak].extended = 2; 2368 | for(np = //contigs[cp].endOffset < contigs[cpp].startOffset ? 0 : 2369 | contigs[cp].endOffset - contigsBuf[0].startOffset + 1; np < contigsBuf[0].nucleotides.size(); np ++) 2370 | contigs[cp].nucleotides.push_back(contigsBuf[0].nucleotides[np]); 2371 | contigs[cp].endID = contigsBuf[0].endID; 2372 | contigs[cp].endOffset = contigsBuf[0].endOffset; 2373 | contigs[cp].endID0 = contigsBuf[0].endID0; 2374 | contigs[cp].endOffset0 = contigsBuf[0].endOffset0; 2375 | goto cont; 2376 | } 2377 | } 2378 | } 2379 | //join contigs 2380 | } 2381 | 2382 | void extendContigs(vector > & genome, int coverage, int k, int chromosomeID) 2383 | { 2384 | extdContigs1(genome, coverage, k, chromosomeID); 2385 | extdContigs2(chromosomeID); 2386 | } 2387 | 2388 | int overlap(unsigned int x1, unsigned int y1, unsigned int x2, unsigned int y2) 2389 | { 2390 | if(x1 <= x2 && x2 <= y1 && y1 <= y2 && (int)y1 - (int)x2 > 0 || x2 <= x1 && x1 <= y2 && y2 <= y1 && (int)y2 - (int)x1 > 0 || x1 <= x2 && x2 <= y2 && y2 <= y1 && (int)y2 - (int)x2 > 0 || x2 <= x1 && x1 <= y1 && y1 <= y2 && (int)y1 - (int)x1 > 0) 2391 | return 1; 2392 | else 2393 | return 0; 2394 | } 2395 | 2396 | void scaffoldContigs(vector > & genome, int chromosomeID) 2397 | { 2398 | unsigned int seqID, cp, cp0, cpp, i, sp, spp; 2399 | vector > scaffolds; 2400 | vector scaffold; 2401 | int cont, covered; 2402 | 2403 | 
seqID = 0; 2404 | for(cp = 0; cp < contigs.size(); cp ++) 2405 | { 2406 | if(contigs[cp].startID != -1 && contigs[cp].extended == 1) 2407 | { 2408 | scaffolds.push_back(scaffold); 2409 | for(cpp = 0; cpp < contigs[cp].nucleotides.size(); cpp ++) 2410 | scaffolds[scaffolds.size() - 1].push_back(contigs[cp].nucleotides[cpp]); 2411 | contigs[cp].startID = -1; 2412 | } 2413 | else 2414 | continue; 2415 | 2416 | cont = 1; 2417 | while(contigs[cp].startID0 == contigs[cp].endID0 && cont) 2418 | { 2419 | cont = 0; 2420 | for(cp0 = cp + 1; cp0 < contigs.size(); cp0 ++) 2421 | { 2422 | if(cp0 != cp && contigs[cp].endID0 == contigs[cp0].startID && contigs[cp0].startID == contigs[cp0].endID && overlap(contigs[cp].startOffset0, contigs[cp].endOffset0, contigs[cp0].startOffset, contigs[cp0].endOffset) && contigs[cp0].extended == 1) 2423 | { 2424 | if(contigs[cp0].startOffset > contigs[cp].endOffset) 2425 | { 2426 | covered = 0; 2427 | for(i = 0; i < contigs[cp0].startOffset - contigs[cp].endOffset - 1; i ++) 2428 | if(genome[0][contigs[cp].endOffset + i + 1].kMer.size() > 0 || genome[0][contigs[cp].endOffset + i + 1].contiMer.size() > 0) 2429 | covered ++; 2430 | if(contigs[cp0].startOffset - contigs[cp].endOffset - 1 != 0 && (double) covered / (contigs[cp0].startOffset - contigs[cp].endOffset - 1) >= 0.5 || contigs[cp0].startOffset - contigs[cp].endOffset - 1 == 0) 2431 | for(i = 0; i < contigs[cp0].startOffset - contigs[cp].endOffset - 1; i ++) 2432 | scaffolds[scaffolds.size() - 1].push_back(genome[0][contigs[cp].endOffset + i + 1].nucleotide); 2433 | // scaffolds[scaffolds.size() - 1].push_back('N'); 2434 | else 2435 | goto conti; 2436 | } 2437 | for(cpp = 0; cpp < contigs[cp0].nucleotides.size(); cpp ++) 2438 | scaffolds[scaffolds.size() - 1].push_back(contigs[cp0].nucleotides[cpp]); 2439 | contigs[cp0].startID = -1; 2440 | cp = cp0; 2441 | cont = 1; 2442 | break; 2443 | } 2444 | conti:; 2445 | } 2446 | } 2447 | 2448 | } 2449 | 2450 | //output extended contigs 2451 | 
ofstream out; 2452 | string s = "tmp/_extended_contigs." + itoa(chromosomeID) + ".fa"; 2453 | out.open(s.c_str()); 2454 | for(sp = 0; sp < scaffolds.size(); sp ++) 2455 | { 2456 | out << ">" << seqID ++ << endl; 2457 | for(spp = 0; spp < scaffolds[sp].size(); spp ++) 2458 | { 2459 | out << scaffolds[sp][spp]; 2460 | if((spp + 1) % 60 == 0 || spp == scaffolds[sp].size() - 1) 2461 | out << endl; 2462 | } 2463 | } 2464 | } 2465 | 2466 | int readLog() 2467 | { 2468 | ifstream in; 2469 | string buf; 2470 | 2471 | in.open("tmp/log.txt"); 2472 | if(in.is_open()) 2473 | { 2474 | getline(in, buf); 2475 | return(atoi(buf.c_str())); 2476 | } 2477 | else 2478 | { 2479 | cout << "CANNOT OPEN FILE!" << endl; 2480 | exit(-1); 2481 | } 2482 | 2483 | } 2484 | 2485 | void ref1() 2486 | { 2487 | int numChromosomes, i, j, k, seqID; 2488 | string s; 2489 | vector > initContigs, extdContigs; 2490 | vector contig; 2491 | string buf; 2492 | unsigned int sourceID, targetID, targetStart, targetEnd, targetGap, sourceStart, sourceEnd, sourceGap, sourceSize, targetSize, fr; 2493 | vector initTags, extdTags; 2494 | ifstream in, ex, ps; 2495 | vector seg; 2496 | ofstream e, ini; 2497 | vector initNums; 2498 | 2499 | numChromosomes = readLog(); 2500 | 2501 | ///////////////////////////////////////////////////////////////////for easy alignment///////////////////////////////////////////////////////////////////////////////////// 2502 | 2503 | ifstream tmpIn; 2504 | ofstream tmpOut; 2505 | vector > tmpContigs; 2506 | for(i = 0; i < numChromosomes; i ++) 2507 | { 2508 | tmpIn.clear(); 2509 | tmpOut.clear(); 2510 | tmpContigs.clear(); 2511 | seqID = 0; 2512 | 2513 | s = "tmp/_initial_contigs." + itoa(i) + ".fa"; 2514 | tmpIn.open(s.c_str()); 2515 | s = "tmp/_short_initial_contigs." 
+ itoa(i) + ".fa"; 2516 | tmpOut.open(s.c_str()); 2517 | if(tmpIn.is_open()) 2518 | { 2519 | while(tmpIn.good()) 2520 | { 2521 | getline(tmpIn, buf); 2522 | if(buf[0] == 0) break; 2523 | 2524 | if(buf[0] == '>') 2525 | tmpContigs.push_back(contig); 2526 | else 2527 | for(j = 0; j < buf.size(); j ++) 2528 | tmpContigs[tmpContigs.size() - 1].push_back(buf[j]); 2529 | } 2530 | } 2531 | else 2532 | { 2533 | cout << "CANNOT OPEN FILE!" << endl; 2534 | exit(-1); 2535 | } 2536 | 2537 | for(i = 0; i < tmpContigs.size(); i ++) 2538 | { 2539 | tmpOut << ">" << i << endl; 2540 | if(tmpContigs[i].size() > SMALL_CHUNK) 2541 | { 2542 | for(j = 0; j < SMALL_CHUNK; j ++) 2543 | { 2544 | tmpOut << tmpContigs[i][j]; 2545 | if((j + 1) % 60 == 0 || j == SMALL_CHUNK - 1) 2546 | tmpOut << endl; 2547 | } 2548 | } 2549 | else 2550 | { 2551 | for(j = 0; j < tmpContigs[i].size(); j ++) 2552 | { 2553 | tmpOut << tmpContigs[i][j]; 2554 | if((j + 1) % 60 == 0 || j == tmpContigs[i].size() - 1) 2555 | tmpOut << endl; 2556 | } 2557 | } 2558 | } 2559 | 2560 | tmpIn.close(); 2561 | tmpOut.close(); 2562 | } 2563 | 2564 | ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2565 | for(i = 0; i < numChromosomes; i ++) 2566 | { 2567 | s = "pblat tmp/_extended_contigs." + itoa(i) + ".fa tmp/_short_initial_contigs." + itoa(i) + ".fa -noHead tmp/_short_initial_contigs_extended_contigs." + itoa(i) + ".psl -threads=8 > blat_doc.txt 2> blat_doc.txt"; 2568 | if(system(s.c_str()) != 0) 2569 | { 2570 | s = "blat tmp/_extended_contigs." + itoa(i) + ".fa tmp/_short_initial_contigs." + itoa(i) + ".fa -noHead tmp/_short_initial_contigs_extended_contigs." + itoa(i) + ".psl > blat_doc.txt 2> blat_doc.txt"; 2571 | if(system(s.c_str()) != 0) {cout << "BLAT CALL FAILED!" 
<< endl; exit(-1);} 2572 | } 2573 | } 2574 | 2575 | seqID = 0; 2576 | for(i = 0; i < numChromosomes; i ++) 2577 | { 2578 | s = "tmp/_extended_contigs." + itoa(i) + ".fa"; 2579 | ex.clear(); 2580 | ex.open(s.c_str()); 2581 | extdContigs.clear(); 2582 | extdTags.clear(); 2583 | if(ex.is_open()) 2584 | { 2585 | while(ex.good()) 2586 | { 2587 | getline(ex, buf); 2588 | if(buf[0] == 0) break; 2589 | 2590 | if(buf[0] == '>') 2591 | { 2592 | extdContigs.push_back(contig); 2593 | extdTags.push_back(0); 2594 | } 2595 | else 2596 | for(j = 0; j < buf.size(); j ++) 2597 | extdContigs[extdContigs.size() - 1].push_back(buf[j]); 2598 | } 2599 | } 2600 | else 2601 | { 2602 | cout << "CANNOT OPEN FILE!" << endl; 2603 | return; 2604 | } 2605 | 2606 | s = "tmp/_initial_contigs." + itoa(i) + ".fa"; 2607 | in.clear(); 2608 | in.open(s.c_str()); 2609 | initContigs.clear(); 2610 | initTags.clear(); 2611 | if(in.is_open()) 2612 | { 2613 | while(in.good()) 2614 | { 2615 | getline(in, buf); 2616 | if(buf[0] == 0) break; 2617 | 2618 | if(buf[0] == '>') 2619 | { 2620 | initContigs.push_back(contig); 2621 | initNums.push_back(atoi(buf.substr(1, buf.size()).c_str())); 2622 | initTags.push_back(0); 2623 | } 2624 | else 2625 | for(j = 0; j < buf.size(); j ++) 2626 | initContigs[initContigs.size() - 1].push_back(buf[j]); 2627 | } 2628 | } 2629 | else 2630 | { 2631 | cout << "CANNOT OPEN FILE!" << endl; 2632 | return; 2633 | } 2634 | 2635 | s = "tmp/_short_initial_contigs_extended_contigs." 
+ itoa(i) + ".psl"; 2636 | ps.clear(); 2637 | ps.open(s.c_str()); 2638 | if(ps.is_open()) 2639 | { 2640 | while(ps.good()) 2641 | { 2642 | getline(ps, buf); 2643 | if(buf[0] == 0) break; 2644 | 2645 | parseBLAT(buf, targetID, targetStart, targetEnd, targetGap, sourceID, sourceStart, sourceEnd, sourceGap, sourceSize, seg, fr, targetSize); 2646 | if((double)(sourceEnd - sourceStart - sourceGap) / sourceSize >= 0.95 && (double)(targetEnd - targetStart - targetGap) / (double)(targetEnd - targetStart) >= 0.95 && targetSize > sourceSize + 100) 2647 | { 2648 | initTags[sourceID] = 1; 2649 | extdTags[targetID] = 1; 2650 | } 2651 | } 2652 | } 2653 | else 2654 | { 2655 | cout << "CANNOT OPEN FILE!" << endl; 2656 | return; 2657 | } 2658 | 2659 | s = "tmp/_post_extended_contigs." + itoa(i) + ".fa"; 2660 | e.clear(); 2661 | e.open(s.c_str()); 2662 | if(e.is_open()) 2663 | { 2664 | for(j = 0; j < extdTags.size(); j ++) 2665 | if(extdTags[j] == 1) 2666 | { 2667 | e << ">" << seqID ++ << endl; 2668 | for(k = 0; k < extdContigs[j].size(); k ++) 2669 | { 2670 | e << extdContigs[j][k]; 2671 | if(k == extdContigs[j].size() - 1 || (k + 1) % 60 == 0) 2672 | e << endl; 2673 | } 2674 | } 2675 | } 2676 | else 2677 | { 2678 | cout << "CANNOT OPEN FILE!" << endl; 2679 | return; 2680 | } 2681 | 2682 | s = "tmp/_post_initial_contigs." + itoa(i) + ".fa"; 2683 | ini.clear(); 2684 | ini.open(s.c_str()); 2685 | if(ini.is_open()) 2686 | { 2687 | for(j = 0; j < initTags.size(); j ++) 2688 | if(initTags[j] == 1) 2689 | { 2690 | ini << ">" << initNums[j] << endl; 2691 | for(k = 0; k < initContigs[j].size(); k ++) 2692 | { 2693 | ini << initContigs[j][k]; 2694 | if(k == initContigs[j].size() - 1 || (k + 1) % 60 == 0) 2695 | ini << endl; 2696 | } 2697 | } 2698 | } 2699 | else 2700 | { 2701 | cout << "CANNOT OPEN FILE!" 
<< endl; 2702 | return; 2703 | } 2704 | 2705 | ex.close(); 2706 | ps.close(); 2707 | e.close(); 2708 | ini.close(); 2709 | } 2710 | } 2711 | 2712 | void ref2(ofstream & e, ofstream & r) 2713 | { 2714 | int numChromosomes, i, j, k, seqID; 2715 | string s; 2716 | vector > initContigs, extdContigs; 2717 | vector contig; 2718 | string buf; 2719 | unsigned int sourceID, targetID, targetStart, targetEnd, targetGap, sourceStart, sourceEnd, sourceGap, sourceSize, targetSize, fr; 2720 | vector initTags, extdTags; 2721 | ifstream in, ex, ps; 2722 | vector seg; 2723 | 2724 | numChromosomes = readLog(); 2725 | for(i = 0; i < numChromosomes; i ++) 2726 | { 2727 | s = "pblat tmp/_post_extended_contigs." + itoa(i) + ".fa tmp/_post_initial_contigs." + itoa(i) + ".fa -noHead tmp/_initial_contigs_extended_contigs." + itoa(i) + ".psl -threads=8 > blat_doc.txt 2> blat_doc.txt"; 2728 | if(system(s.c_str()) != 0) 2729 | { 2730 | s = "blat tmp/_post_extended_contigs." + itoa(i) + ".fa tmp/_post_initial_contigs." + itoa(i) + ".fa -noHead tmp/_initial_contigs_extended_contigs." + itoa(i) + ".psl > blat_doc.txt 2> blat_doc.txt"; 2731 | if(system(s.c_str()) != 0) {cout << "BLAT CALL FAILED!" << endl; exit(-1);} 2732 | } 2733 | } 2734 | 2735 | in.open("tmp/_contigs.fa"); 2736 | if(in.is_open()) 2737 | { 2738 | while(in.good()) 2739 | { 2740 | getline(in, buf); 2741 | if(buf[0] == 0) break; 2742 | 2743 | if(buf[0] == '>') 2744 | { 2745 | initContigs.push_back(contig); 2746 | initTags.push_back(0); 2747 | } 2748 | else 2749 | for(i = 0; i < buf.size(); i++) 2750 | initContigs[initContigs.size() - 1].push_back(buf[i]); 2751 | } 2752 | } 2753 | else 2754 | { 2755 | cout << "CANNOT OPEN FILE!" << endl; 2756 | return; 2757 | } 2758 | 2759 | seqID = 0; 2760 | for(i = 0; i < numChromosomes; i ++) 2761 | { 2762 | s = "tmp/_post_extended_contigs." 
+ itoa(i) + ".fa"; 2763 | ex.clear(); 2764 | ex.open(s.c_str()); 2765 | extdContigs.clear(); 2766 | extdTags.clear(); 2767 | if(ex.is_open()) 2768 | { 2769 | while(ex.good()) 2770 | { 2771 | getline(ex, buf); 2772 | if(buf[0] == 0) break; 2773 | 2774 | if(buf[0] == '>') 2775 | { 2776 | extdContigs.push_back(contig); 2777 | extdTags.push_back(0); 2778 | } 2779 | else 2780 | for(j = 0; j < buf.size(); j ++) 2781 | extdContigs[extdContigs.size() - 1].push_back(buf[j]); 2782 | } 2783 | } 2784 | else 2785 | { 2786 | cout << "CANNOT OPEN FILE!" << endl; 2787 | return; 2788 | } 2789 | 2790 | s = "tmp/_post_initial_contigs_post_extended_contigs." + itoa(i) + ".psl"; 2791 | ps.clear(); 2792 | ps.open(s.c_str()); 2793 | if(ps.is_open()) 2794 | { 2795 | while(ps.good()) 2796 | { 2797 | getline(ps, buf); 2798 | if(buf[0] == 0) break; 2799 | 2800 | parseBLAT(buf, targetID, targetStart, targetEnd, targetGap, sourceID, sourceStart, sourceEnd, sourceGap, sourceSize, seg, fr, targetSize); 2801 | if((double)(sourceEnd - sourceStart - sourceGap) / sourceSize >= 0.95 && (double)(targetEnd - targetStart - targetGap) / (double)(targetEnd - targetStart) >= 0.95 && targetSize > sourceSize + 100) 2802 | { 2803 | initTags[sourceID] = 1; 2804 | extdTags[targetID] = 1; 2805 | } 2806 | } 2807 | } 2808 | else 2809 | { 2810 | cout << "CANNOT OPEN FILE!" << endl; 2811 | return; 2812 | } 2813 | 2814 | if(e.is_open()) 2815 | { 2816 | for(j = 0; j < extdTags.size(); j ++) 2817 | if(extdTags[j] == 1) 2818 | { 2819 | e << ">" << i << ": " << seqID ++ << endl; 2820 | for(k = 0; k < extdContigs[j].size(); k ++) 2821 | { 2822 | e << extdContigs[j][k]; 2823 | if(k == extdContigs[j].size() - 1 || (k + 1) % 60 == 0) 2824 | e << endl; 2825 | } 2826 | } 2827 | } 2828 | else 2829 | { 2830 | cout << "CANNOT OPEN FILE!" 
<< endl; 2831 | return; 2832 | } 2833 | ex.close(); 2834 | ps.close(); 2835 | } 2836 | 2837 | if(r.is_open()) 2838 | { 2839 | for(i = 0; i < initTags.size(); i ++) 2840 | if(initTags[i] == 0) 2841 | { 2842 | r << ">" << i << endl; 2843 | for(j = 0; j < initContigs[i].size(); j ++) 2844 | { 2845 | r << initContigs[i][j]; 2846 | if(j == initContigs[i].size() - 1 || (j + 1) % 60 == 0) 2847 | r << endl; 2848 | } 2849 | } 2850 | } 2851 | else 2852 | { 2853 | cout << "CANNOT OPEN FILE!" << endl; 2854 | return; 2855 | } 2856 | } 2857 | 2858 | void refinement(ofstream & e, ofstream & r) 2859 | { 2860 | ref1(); 2861 | ref2(e, r); 2862 | } 2863 | 2864 | void refinement(ofstream & e, ofstream & r, int fastMap, int uniqueExtension, int numChromosomes) 2865 | { 2866 | int i, j, k, seqID; 2867 | string s, s0; 2868 | vector > initContigs, extdContigs; 2869 | vector contig; 2870 | string buf; 2871 | unsigned int sourceID, targetID, targetStart, targetEnd, targetGap, sourceStart, sourceEnd, sourceGap, sourceSize, targetSize, fr, realSourceSize; 2872 | vector initTags, extdTags; 2873 | ifstream in, ex, ps, cf; 2874 | vector seg; 2875 | vector initNums; 2876 | int ID, IDBak = -1, targetIDBak; 2877 | vector > extdInitMap; 2878 | vector eim; 2879 | 2880 | #ifdef TEST 2881 | ofstream ini, ext; 2882 | ini.open("in.fa"); 2883 | ext.open("ex.fa"); 2884 | #endif 2885 | 2886 | ///////////////////////////////////////////////////////////////////for easy alignment///////////////////////////////////////////////////////////////////////////////////// 2887 | 2888 | ifstream tmpIn; 2889 | ofstream tmpOut; 2890 | vector > tmpContigs; 2891 | for(i = 0; i < numChromosomes; i ++) 2892 | { 2893 | tmpIn.clear(); 2894 | tmpOut.clear(); 2895 | tmpContigs.clear(); 2896 | initNums.clear(); 2897 | seqID = 0; 2898 | 2899 | s = "tmp/_initial_contigs." + itoa(i) + ".fa"; 2900 | tmpIn.open(s.c_str()); 2901 | s = "tmp/_short_initial_contigs." 
+ itoa(i) + ".fa"; 2902 | tmpOut.open(s.c_str()); 2903 | if(tmpIn.is_open()) 2904 | { 2905 | while(tmpIn.good()) 2906 | { 2907 | getline(tmpIn, buf); 2908 | if(buf[0] == 0) break; 2909 | 2910 | if(buf[0] == '>') 2911 | { 2912 | tmpContigs.push_back(contig); 2913 | initNums.push_back(atoi(buf.substr(1, buf.size()).c_str())); 2914 | } 2915 | else 2916 | for(j = 0; j < buf.size(); j ++) 2917 | tmpContigs[tmpContigs.size() - 1].push_back(buf[j]); 2918 | } 2919 | } 2920 | else 2921 | { 2922 | cout << "CANNOT OPEN FILE!" << endl; 2923 | exit(-1); 2924 | } 2925 | 2926 | for(j = 0; j < tmpContigs.size(); j ++) 2927 | { 2928 | 2929 | if(tmpContigs[j].size() > SMALL_CHUNK) 2930 | { 2931 | tmpOut << ">" << initNums[j] << "." << tmpContigs[j].size() << endl;// preserve id and size; parse for short contigs in a different way 2932 | for(k = 0; k < SMALL_CHUNK; k ++) 2933 | { 2934 | tmpOut << tmpContigs[j][k]; 2935 | if((k + 1) % 60 == 0 || k == SMALL_CHUNK - 1) 2936 | tmpOut << endl; 2937 | } 2938 | } 2939 | else 2940 | { 2941 | tmpOut << ">" << initNums[j] << endl; 2942 | for(k = 0; k < tmpContigs[j].size(); k ++) 2943 | { 2944 | tmpOut << tmpContigs[j][k]; 2945 | if((k + 1) % 60 == 0 || k == tmpContigs[j].size() - 1) 2946 | tmpOut << endl; 2947 | } 2948 | } 2949 | } 2950 | 2951 | tmpIn.close(); 2952 | tmpOut.close(); 2953 | } 2954 | 2955 | ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2956 | 2957 | if(fastMap == 1) 2958 | { 2959 | //NUCMER 2960 | for(i = 0; i < numChromosomes; i ++) 2961 | { 2962 | s = "nucmer tmp/_extended_contigs." + itoa(i) + ".fa tmp/_short_initial_contigs." + itoa(i) + ".fa -p tmp/_short_initial_contigs_extended_contigs." + itoa(i) + " > nucmer_doc.txt 2> nucmer_doc.txt"; 2963 | if(system(s.c_str()) != 0) 2964 | { 2965 | s = "touch tmp/_short_initial_contigs_extended_contigs." 
+ itoa(i) + ".delta"; 2966 | system(s.c_str()); 2967 | } 2968 | s = "tmp/_short_initial_contigs_extended_contigs." + itoa(i) + ".delta"; 2969 | delta2psl(s); 2970 | } 2971 | } 2972 | else 2973 | { 2974 | for(i = 0; i < numChromosomes; i ++) 2975 | { 2976 | s = "pblat tmp/_extended_contigs." + itoa(i) + ".fa tmp/_short_initial_contigs." + itoa(i) + ".fa -noHead tmp/_short_initial_contigs_extended_contigs." + itoa(i) + ".psl -fastMap -threads=8 > blat_doc.txt 2> blat_doc.txt"; 2977 | if(system(s.c_str()) != 0) 2978 | { 2979 | s = "blat tmp/_extended_contigs." + itoa(i) + ".fa tmp/_short_initial_contigs." + itoa(i) + ".fa -noHead tmp/_short_initial_contigs_extended_contigs." + itoa(i) + ".psl -fastMap > blat_doc.txt 2> blat_doc.txt"; 2980 | if(system(s.c_str()) != 0) {cout << "BLAT CALL FAILED!" << endl; exit(-1);} 2981 | } 2982 | } 2983 | } 2984 | 2985 | in.open("tmp/_contigs.fa"); 2986 | if(in.is_open()) 2987 | { 2988 | while(in.good()) 2989 | { 2990 | getline(in, buf); 2991 | if(buf[0] == 0) break; 2992 | 2993 | if(buf[0] == '>') 2994 | { 2995 | for(i = 0; i < buf.size(); i ++) 2996 | if(buf[i] == '.') break; 2997 | ID = atoi(buf.substr(i + 1, buf.size()).c_str()); 2998 | if(ID != IDBak) 2999 | { 3000 | initContigs.push_back(contig); 3001 | initTags.push_back(0); 3002 | IDBak = ID; 3003 | } 3004 | } 3005 | else 3006 | for(i = 0; i < buf.size(); i++) 3007 | initContigs[initContigs.size() - 1].push_back(buf[i]); 3008 | } 3009 | } 3010 | else 3011 | { 3012 | cout << "CANNOT OPEN FILE!" << endl; 3013 | return; 3014 | } 3015 | 3016 | int line; 3017 | seqID = 0; 3018 | for(i = 0; i < numChromosomes; i ++) 3019 | { 3020 | s = "tmp/_extended_contigs." 
+ itoa(i) + ".fa"; 3021 | ex.open(s.c_str()); 3022 | extdContigs.clear(); 3023 | extdTags.clear(); 3024 | if(ex.is_open()) 3025 | { 3026 | while(ex.good()) 3027 | { 3028 | getline(ex, buf); 3029 | if(buf[0] == 0) break; 3030 | 3031 | if(buf[0] == '>') 3032 | { 3033 | extdContigs.push_back(contig); 3034 | extdTags.push_back(0); 3035 | extdInitMap.push_back(eim); 3036 | } 3037 | else 3038 | for(j = 0; j < buf.size(); j ++) 3039 | extdContigs[extdContigs.size() - 1].push_back(buf[j]); 3040 | } 3041 | } 3042 | else 3043 | { 3044 | cout << "CANNOT OPEN FILE!" << endl; 3045 | return; 3046 | } 3047 | 3048 | targetIDBak = -1; 3049 | s = "tmp/_short_initial_contigs_extended_contigs." + itoa(i) + ".psl"; 3050 | ps.open(s.c_str()); 3051 | if(ps.is_open()) 3052 | { 3053 | while(ps.good()) 3054 | { 3055 | getline(ps, buf); 3056 | if(buf[0] == 0) break; 3057 | 3058 | realSourceSize = parseBLAT(buf, targetID, targetStart, targetEnd, targetGap, sourceID, sourceStart, sourceEnd, sourceGap, sourceSize, seg, fr, targetSize); 3059 | if((double)(sourceEnd - sourceStart - sourceGap) / sourceSize >= 0.8 && (double)(targetEnd - targetStart - targetGap) / (double)(targetEnd - targetStart) >= 0.8 && targetSize > realSourceSize + 100 && realSourceSize > targetSize / 100) 3060 | { 3061 | if(uniqueExtension == 1) 3062 | { 3063 | if(initTags[sourceID] > 0 && targetIDBak != -1) 3064 | { 3065 | if(extdTags[targetIDBak] < extdTags[targetID]) 3066 | { 3067 | extdTags[targetIDBak] = 0; 3068 | extdInitMap[targetIDBak].pop_back(); 3069 | extdTags[targetID] = targetSize; 3070 | initTags[sourceID] = 1; 3071 | extdInitMap[targetID].push_back(sourceID); 3072 | } 3073 | } 3074 | else 3075 | { 3076 | extdTags[targetID] = targetSize; 3077 | initTags[sourceID] = 1; 3078 | extdInitMap[targetID].push_back(sourceID); 3079 | } 3080 | targetIDBak = targetID; 3081 | } 3082 | else 3083 | { 3084 | extdTags[targetID] = 1; 3085 | initTags[sourceID] = 1; 3086 | extdInitMap[targetID].push_back(sourceID); 3087 | } 3088 | 
} 3089 | } 3090 | } 3091 | else 3092 | { 3093 | cout << "CANNOT OPEN FILE!" << endl; 3094 | return; 3095 | } 3096 | 3097 | if(e.is_open()) 3098 | { 3099 | for(j = 0; j < extdTags.size(); j ++) 3100 | if(extdTags[j] > 0) 3101 | { 3102 | e << ">" << "AlignGraph" << seqID << " @ " << genomeIds[i] << " : "; 3103 | for(k = 0; k < extdInitMap[j].size(); k ++) 3104 | e << contigIds[extdInitMap[j][k]] << " ; "; 3105 | e << endl; 3106 | #ifdef TEST 3107 | ext << ">" << i << ": " << seqID << endl; 3108 | #endif 3109 | seqID ++; 3110 | for(k = 0; k < extdContigs[j].size(); k ++) 3111 | { 3112 | e << extdContigs[j][k]; 3113 | #ifdef TEST 3114 | ext << extdContigs[j][k]; 3115 | #endif 3116 | if(k == extdContigs[j].size() - 1 || (k + 1) % 60 == 0) 3117 | { 3118 | e << endl; 3119 | #ifdef TEST 3120 | ext << endl; 3121 | #endif 3122 | } 3123 | } 3124 | } 3125 | } 3126 | else 3127 | { 3128 | cout << "CANNOT OPEN FILE!" << endl; 3129 | return; 3130 | } 3131 | ex.close(); 3132 | ps.close(); 3133 | } 3134 | 3135 | if(r.is_open()) 3136 | { 3137 | for(i = 0; i < initTags.size(); i ++) 3138 | if(initTags[i] == 0) 3139 | { 3140 | r << ">" << contigIds[i] << endl; 3141 | for(j = 0; j < initContigs[i].size(); j ++) 3142 | { 3143 | r << initContigs[i][j]; 3144 | if(j == initContigs[i].size() - 1 || (j + 1) % 60 == 0) 3145 | r << endl; 3146 | } 3147 | } 3148 | 3149 | cf.open("tmp/_chaff.fa"); 3150 | if(cf.is_open()) 3151 | { 3152 | while(cf.good()) 3153 | { 3154 | getline(cf, buf); 3155 | if(buf[0] == 0) break; 3156 | 3157 | for(i = 0; i < buf.size(); i ++) 3158 | r << buf[i]; 3159 | r << endl; 3160 | } 3161 | } 3162 | else 3163 | { 3164 | cout << "CANNOT OPEN FILE!" << endl; 3165 | return; 3166 | } 3167 | } 3168 | else 3169 | { 3170 | cout << "CANNOT OPEN FILE!" 
<< endl; 3171 | return; 3172 | } 3173 | 3174 | #ifdef TEST 3175 | if(ini.is_open()) 3176 | { 3177 | for(i = 0; i < initTags.size(); i ++) 3178 | if(initTags[i] == 1) 3179 | { 3180 | ini << ">" << i << endl; 3181 | for(j = 0; j < initContigs[i].size(); j ++) 3182 | { 3183 | ini << initContigs[i][j]; 3184 | if(j == initContigs[i].size() - 1 || (j + 1) % 60 == 0) 3185 | ini << endl; 3186 | } 3187 | } 3188 | } 3189 | else 3190 | { 3191 | cout << "CANNOT OPEN FILE!" << endl; 3192 | return; 3193 | } 3194 | #endif 3195 | } 3196 | 3197 | int maxReadLength(ifstream & r) 3198 | { 3199 | int max = 0, len = 0; 3200 | string buf; 3201 | 3202 | if(r.is_open()) 3203 | { 3204 | while(r.good()) 3205 | { 3206 | getline(r, buf); 3207 | if(buf[0] == 0) break; 3208 | if(buf[0] == '>') 3209 | { 3210 | if(len > max) 3211 | max = len; 3212 | len = 0; 3213 | continue; 3214 | } 3215 | len = len + buf.size(); 3216 | } 3217 | if(len > max) max = len; 3218 | } 3219 | else 3220 | { 3221 | cout << "CANNOT OPEN FILE!" << endl; 3222 | exit(-1); 3223 | } 3224 | 3225 | return max; 3226 | } 3227 | 3228 | void formalizeInput(ifstream & in, string file) 3229 | { 3230 | string buf; 3231 | int i, cp, cpp, total, sp; 3232 | unsigned long seqID = 0, count = 0, realSeqID = 0; 3233 | vector > contigs; 3234 | vector contig; 3235 | ofstream out, addOut; 3236 | string id; 3237 | vector cIds; 3238 | 3239 | in.clear(); 3240 | in.seekg(0); 3241 | out.open(file.c_str()); 3242 | if(file == "tmp/_contigs.fa") addOut.open("tmp/_chaff.fa"); 3243 | 3244 | if(file == "tmp/_contigs.fa" || file == "tmp/_extended_contigs.fa" || file == "tmp/_remaining_contigs.fa") 3245 | { 3246 | if(in.is_open()) 3247 | { 3248 | while(in.good()) 3249 | { 3250 | getline(in, buf); 3251 | if(buf[0] == 0) 3252 | break; 3253 | if(buf[0] == '>') 3254 | { 3255 | contigs.push_back(contig); 3256 | cIds.push_back(buf.substr(1, buf.size())); 3257 | } 3258 | else 3259 | for(i = 0; i < buf.size(); i ++) 3260 | contigs[contigs.size() - 
1].push_back(buf[i]); 3261 | } 3262 | 3263 | for(cp = 0; cp < contigs.size(); cp ++) 3264 | { 3265 | if(contigs[cp].size() > 200) 3266 | { 3267 | if(contigs[cp].size() < LARGE_CHUNK) 3268 | { 3269 | out << ">" << seqID ++ << "." << realSeqID << endl; 3270 | for(cpp = 0; cpp < contigs[cp].size(); cpp ++) 3271 | { 3272 | out << contigs[cp][cpp]; 3273 | if((cpp + 1) % 60 == 0 || cpp == contigs[cp].size() - 1) 3274 | out << endl; 3275 | } 3276 | } 3277 | else 3278 | { 3279 | out << ">" << seqID ++ << "." << realSeqID << endl; 3280 | for(cpp = 0, total = 0; cpp < contigs[cp].size(); cpp ++) 3281 | { 3282 | out << contigs[cp][cpp]; 3283 | if((cpp + 1) % LARGE_CHUNK == 0 && cpp < contigs[cp].size() - 1 - 60) 3284 | { 3285 | total = total + LARGE_CHUNK; 3286 | out << endl; 3287 | out << ">" << seqID ++ << "." << realSeqID << endl; 3288 | continue; 3289 | } 3290 | if((cpp + 1 - total) % 60 == 0 || cpp == contigs[cp].size() - 1) 3291 | out << endl; 3292 | } 3293 | } 3294 | realSeqID ++; 3295 | } 3296 | else 3297 | { 3298 | addOut << ">" << cIds[cp] << endl; 3299 | for(cpp = 0; cpp < contigs[cp].size(); cpp ++) 3300 | { 3301 | addOut << contigs[cp][cpp]; 3302 | if((cpp + 1) % 60 == 0 || cpp == contigs[cp].size() - 1) 3303 | addOut << endl; 3304 | } 3305 | cIds[cp] = "\0"; 3306 | } 3307 | } 3308 | 3309 | contigIds.clear(); 3310 | for(sp = 0; sp < cIds.size(); sp ++) 3311 | if(cIds[sp] != "\0") 3312 | contigIds.push_back(cIds[sp]); 3313 | } 3314 | else 3315 | { 3316 | cout << "CANNOT OPEN FILE!" << endl; 3317 | exit(-1); 3318 | } 3319 | } 3320 | else 3321 | { 3322 | if(in.is_open()) 3323 | { 3324 | while(in.good()) 3325 | { 3326 | getline(in, buf); 3327 | if(buf[0] == 0) 3328 | break; 3329 | if(buf[0] == '>') 3330 | out << ">" << seqID ++ << endl; 3331 | else 3332 | { 3333 | for(i = 0; i < buf.size(); i ++) 3334 | out << buf[i]; 3335 | out << endl; 3336 | } 3337 | } 3338 | } 3339 | else 3340 | { 3341 | cout << "CANNOT OPEN FILE!" 
<< endl; 3342 | exit(-1); 3343 | } 3344 | } 3345 | } 3346 | 3347 | int formalizeGenome(ifstream & in, int p) 3348 | { 3349 | string buf; 3350 | int i, chromosomeID, gp, cp, q; 3351 | ofstream out, out0; 3352 | string s; 3353 | vector > genome; 3354 | vector g; 3355 | vector id; 3356 | 3357 | if(in.is_open()) 3358 | { 3359 | while(in.good()) 3360 | { 3361 | getline(in, buf); 3362 | if(buf[0] == 0) 3363 | break; 3364 | if(buf[0] == '>') 3365 | { 3366 | genome.push_back(g); 3367 | genomeIds.push_back(buf.substr(1, buf.size())); 3368 | } 3369 | else 3370 | { 3371 | for(i = 0; i < buf.size(); i ++) 3372 | genome[genome.size() - 1].push_back(buf[i]); 3373 | } 3374 | } 3375 | } 3376 | else 3377 | { 3378 | cout << "CANNOT OPEN FILE!" << endl; 3379 | exit(-1); 3380 | } 3381 | 3382 | chromosomeID = 0; 3383 | out0.open("tmp/_genome.fa"); 3384 | for(gp = 0; gp < genome.size(); gp ++) 3385 | { 3386 | s = "tmp/_genome." + itoa(chromosomeID) + ".fa"; 3387 | out.open(s.c_str()); 3388 | out << ">0" << endl; 3389 | out0 << ">" << chromosomeID << endl; 3390 | q = 1; 3391 | for(cp = 0; cp < genome[gp].size(); cp ++) 3392 | { 3393 | out << genome[gp][cp]; 3394 | out0 << genome[gp][cp]; 3395 | if((cp + 1) % 60 == 0 || cp == genome[gp].size() - 1 || ((cp + 1) % (genome[gp].size() / p) == 0 && q < p)) 3396 | { 3397 | out << endl; 3398 | out0 << endl; 3399 | } 3400 | if(cp != genome[gp].size() - 1 && ((cp + 1) % (genome[gp].size() / p) == 0 && q < p)) 3401 | { 3402 | out.close(); 3403 | chromosomeID ++; 3404 | q ++; 3405 | s = "tmp/_genome." 
+ itoa(chromosomeID) + ".fa"; 3406 | out.open(s.c_str()); 3407 | out << ">0" << endl; 3408 | out0 << ">" << chromosomeID << endl; 3409 | } 3410 | } 3411 | out.close(); 3412 | chromosomeID ++; 3413 | } 3414 | genome.clear(); 3415 | out.close(); 3416 | out0.close(); 3417 | return chromosomeID; 3418 | } 3419 | 3420 | int formalizeInput(ifstream & in1, ifstream & in2, string file, string file1, string file2) 3421 | { 3422 | string buf1, buf2; 3423 | int i, size; 3424 | unsigned long seqID = 0; 3425 | ofstream out, out1, out2; 3426 | vector read1, read2; 3427 | 3428 | in1.clear(); 3429 | in2.clear(); 3430 | in1.seekg(0); 3431 | in2.seekg(0); 3432 | out.open(file.c_str()); 3433 | out1.open(file1.c_str()); 3434 | out2.open(file2.c_str()); 3435 | 3436 | if(in1.is_open() && in2.is_open()) 3437 | { 3438 | while(in1.good() && in2.good()) 3439 | { 3440 | getline(in1, buf1); 3441 | getline(in2, buf2); 3442 | if(buf1[0] == 0 && buf2[0] == 0) 3443 | break; 3444 | else if(buf1[0] == 0 && buf2[0] != 0 || buf1[0] != 0 && buf2[0] == 0) 3445 | { 3446 | cout << "INCONSISTENT PE FILES!" << endl; 3447 | exit(-1); 3448 | } 3449 | 3450 | if(buf1[0] == '>' && buf2[0] == '>') 3451 | { 3452 | if(read1.size() != 0 && read2.size() != 0) 3453 | { 3454 | size = read1.size() < read2.size() ? 
read1.size() : read2.size(); 3455 | out << ">" << seqID << endl; 3456 | for(i = 0; i < size; i ++) 3457 | out << read1[i]; 3458 | out << endl; 3459 | out1 << ">" << seqID << endl; 3460 | for(i = 0; i < size; i ++) 3461 | out1 << read1[i]; 3462 | out1 << endl; 3463 | 3464 | out << ">" << seqID << endl; 3465 | for(i = 0; i < size; i ++) 3466 | out << read2[i]; 3467 | out << endl; 3468 | out2 << ">" << seqID ++ << endl; 3469 | for(i = 0; i < size; i ++) 3470 | out2 << read2[i]; 3471 | out2 << endl; 3472 | } 3473 | 3474 | read1.clear(); 3475 | read2.clear(); 3476 | } 3477 | else if(buf1[0] != '>' && buf2[0] != '>') 3478 | { 3479 | for(i = 0; i < buf1.size(); i ++) 3480 | read1.push_back(buf1[i]); 3481 | for(i = 0; i < buf2.size(); i ++) 3482 | read2.push_back(buf2[i]); 3483 | } 3484 | else 3485 | { 3486 | cout << "INCONSISTENT PE FILES!" << endl; 3487 | exit(-1); 3488 | } 3489 | } 3490 | if(read1.size() != 0 && read2.size() != 0) 3491 | { 3492 | size = read1.size() < read2.size() ? read1.size() : read2.size(); 3493 | out << ">" << seqID << endl; 3494 | for(i = 0; i < size; i ++) 3495 | out << read1[i]; 3496 | out << endl; 3497 | out1 << ">" << seqID << endl; 3498 | for(i = 0; i < size; i ++) 3499 | out1 << read1[i]; 3500 | out1 << endl; 3501 | 3502 | out << ">" << seqID << endl; 3503 | for(i = 0; i < size; i ++) 3504 | out << read2[i]; 3505 | out << endl; 3506 | out2 << ">" << seqID ++ << endl; 3507 | for(i = 0; i < size; i ++) 3508 | out2 << read2[i]; 3509 | out2 << endl; 3510 | } 3511 | } 3512 | else 3513 | { 3514 | cout << "CANNOT OPEN FILE!" 
<< endl; 3515 | exit(-1); 3516 | } 3517 | return seqID; 3518 | } 3519 | 3520 | int parseBT(string buf) 3521 | { 3522 | int i, item = 0, j = 0; 3523 | char targetIDBuf[10] = {'\0'}; 3524 | 3525 | for(i = 0; i < buf.size(); i ++) 3526 | { 3527 | if(buf[i] == ' ') 3528 | { 3529 | item ++; 3530 | continue; 3531 | } 3532 | 3533 | if(item == 2) 3534 | { 3535 | if(buf[i] == '*') return -1; 3536 | 3537 | targetIDBuf[j ++] = buf[i]; 3538 | } 3539 | 3540 | if(item == 3) break; 3541 | } 3542 | return atoi(targetIDBuf); 3543 | } 3544 | 3545 | void distributeAlignments(int numChromosomes) 3546 | { 3547 | ifstream in; 3548 | ofstream out; 3549 | string buf, s; 3550 | int chromosomeID; 3551 | 3552 | in.open("tmp/_reads_genome.bowtie"); 3553 | for(chromosomeID = 0; chromosomeID < numChromosomes; chromosomeID ++) 3554 | { 3555 | s = "tmp/_reads_genome." + itoa(chromosomeID) + ".bowtie"; 3556 | out.open(s.c_str()); 3557 | if(in.is_open()) 3558 | { 3559 | while(in.good()) 3560 | { 3561 | getline(in, buf); 3562 | if(buf[0] == '@') continue; 3563 | if(buf[0] == 0) break; 3564 | 3565 | if(parseBT(buf) == chromosomeID) 3566 | out << buf << endl; 3567 | } 3568 | } 3569 | else 3570 | { 3571 | cout << "CANNOT OPEN FILE!" << endl; 3572 | exit(-1); 3573 | } 3574 | in.clear(); 3575 | in.seekg(0); 3576 | out.close(); 3577 | } 3578 | in.close(); 3579 | } 3580 | 3581 | void * task0(void * arg) 3582 | { 3583 | Insert ins; 3584 | string command; 3585 | stringstream distanceLowStr, distanceHighStr; 3586 | int chromosomeID; 3587 | 3588 | ins = *(Insert *) arg; 3589 | distanceLowStr << ins.distanceLow; 3590 | distanceHighStr << ins.distanceHigh; 3591 | ins.numChromosomes; 3592 | ins.fastMap; 3593 | ins.iterativeMap; 3594 | 3595 | if(ins.iterativeMap == 1) 3596 | { 3597 | for(chromosomeID = 0; chromosomeID < ins.numChromosomes; chromosomeID ++) 3598 | { 3599 | command = "bowtie2-build -f tmp/_genome." + itoa(chromosomeID) + ".fa tmp/_genome." 
+ itoa(chromosomeID) + " > bowtie_doc.txt 2> bowtie_doc.txt"; 3600 | system(command.c_str()); 3601 | command = "bowtie2 -f --no-mixed -k 5 -p 8 --local --mp 3,1 --rdg 2,1 --rfg 2,1 --score-min G,5,2 -I " + distanceLowStr.str() + " -X " + distanceHighStr.str() + " --no-discordant -x tmp/_genome." + itoa(chromosomeID) + " -1 tmp/_reads_1.fa -2 tmp/_reads_2.fa --reorder > tmp/_reads_genome." + itoa(chromosomeID) + ".bowtie 2> bowtie_doc.txt"; 3602 | system(command.c_str()); 3603 | } 3604 | } 3605 | else 3606 | { 3607 | command = "bowtie2-build -f tmp/_genome.fa tmp/_genome > bowtie_doc.txt 2> bowtie_doc.txt"; 3608 | system(command.c_str()); 3609 | command = "bowtie2 -f --no-mixed -k 5 -p 8 --local --mp 3,1 --rdg 2,1 --rfg 2,1 --score-min G,5,2 -I " + distanceLowStr.str() + " -X " + distanceHighStr.str() + " --no-discordant -x tmp/_genome -1 tmp/_reads_1.fa -2 tmp/_reads_2.fa --reorder > tmp/_reads_genome.bowtie 2> bowtie_doc.txt"; 3610 | system(command.c_str()); 3611 | distributeAlignments(ins.numChromosomes); 3612 | } 3613 | } 3614 | 3615 | void * task1(void * arg) 3616 | { 3617 | Insert ins; 3618 | int chromosomeID; 3619 | string command, s; 3620 | stringstream distanceLowStr, distanceHighStr; 3621 | 3622 | ins = *(Insert *) arg; 3623 | distanceLowStr << ins.distanceLow; 3624 | distanceHighStr << ins.distanceHigh; 3625 | ins.numChromosomes; 3626 | ins.fastMap; 3627 | ins.iterativeMap; 3628 | 3629 | if(ins.fastMap == 1) 3630 | { 3631 | //NUCMER 3632 | for(chromosomeID = 0; chromosomeID < ins.numChromosomes; chromosomeID ++) 3633 | { 3634 | command = "nucmer tmp/_genome." + itoa(chromosomeID) + ".fa tmp/_contigs.fa -p tmp/_contigs_genome." + itoa(chromosomeID) + " > nucmer_doc.txt 2> nucmer_doc.txt"; 3635 | if(system(command.c_str()) != 0) 3636 | { 3637 | command = "touch tmp/_contigs_genome." + itoa(chromosomeID) + ".delta"; 3638 | system(command.c_str()); 3639 | } 3640 | command = "tmp/_contigs_genome." 
+ itoa(chromosomeID) + ".delta"; 3641 | delta2psl(command); 3642 | } 3643 | } 3644 | else 3645 | { 3646 | for(chromosomeID = 0; chromosomeID < ins.numChromosomes; chromosomeID ++) 3647 | { 3648 | command = "pblat tmp/_genome." + itoa(chromosomeID) + ".fa tmp/_contigs.fa -noHead tmp/_contigs_genome." + itoa(chromosomeID) + ".psl -fastMap -threads=8 > blat_doc.txt 2> blat_doc.txt"; 3649 | if(system(command.c_str()) != 0) 3650 | { 3651 | command = "blat tmp/_genome." + itoa(chromosomeID) + ".fa tmp/_contigs.fa -noHead tmp/_contigs_genome." + itoa(chromosomeID) + ".psl -fastMap > blat_doc.txt 2> blat_doc.txt"; 3652 | if(system(command.c_str()) != 0) {cout << "BLAT CALL FAILED!" << endl; exit(-1);} 3653 | } 3654 | } 3655 | } 3656 | } 3657 | 3658 | void nonParallelMap(int distanceLow, int distanceHigh, int numChromosomes, int fastMap, int iterativeMap) 3659 | { 3660 | string command, s; 3661 | stringstream distanceLowStr, distanceHighStr; 3662 | int chromosomeID; 3663 | 3664 | distanceLowStr << distanceLow; 3665 | distanceHighStr << distanceHigh; 3666 | 3667 | if(iterativeMap == 1) 3668 | { 3669 | for(chromosomeID = 0; chromosomeID < numChromosomes; chromosomeID ++) 3670 | { 3671 | command = "bowtie2-build -f tmp/_genome." + itoa(chromosomeID) + ".fa tmp/_genome." + itoa(chromosomeID) + " > bowtie_doc.txt 2> bowtie_doc.txt"; 3672 | system(command.c_str()); 3673 | command = "bowtie2 -f --no-mixed -k 5 -p 8 --local --mp 3,1 --rdg 2,1 --rfg 2,1 --score-min G,5,2 -I " + distanceLowStr.str() + " -X " + distanceHighStr.str() + " --no-discordant -x tmp/_genome." + itoa(chromosomeID) + " -1 tmp/_reads_1.fa -2 tmp/_reads_2.fa --reorder > tmp/_reads_genome." + itoa(chromosomeID) + ".bowtie 2>> bowtie_doc.txt";// --very-sensitive-local 3674 | // command = "bowtie2 -f --no-mixed -k 5 -p 8 --end-to-end --mp 3,1 --rdg 2,1 --rfg 2,1 --score-min L,-0.24,-0.24 -I " + distanceLowStr.str() + " -X " + distanceHighStr.str() + " --no-discordant -x tmp/_genome." 
+ itoa(chromosomeID) + " -1 tmp/_reads_1.fa -2 tmp/_reads_2.fa --reorder > tmp/_reads_genome." + itoa(chromosomeID) + ".bowtie 2> bowtie_doc.txt"; 3675 | system(command.c_str()); 3676 | } 3677 | } 3678 | else 3679 | { 3680 | command = "bowtie2-build -f tmp/_genome.fa tmp/_genome > bowtie_doc.txt 2> bowtie_doc.txt"; 3681 | system(command.c_str()); 3682 | command = "bowtie2 -f --no-mixed -k 5 -p 8 --local --mp 3,1 --rdg 2,1 --rfg 2,1 --score-min G,5,2 -I " + distanceLowStr.str() + " -X " + distanceHighStr.str() + " --no-discordant -x tmp/_genome -1 tmp/_reads_1.fa -2 tmp/_reads_2.fa --reorder > tmp/_reads_genome.bowtie 2> bowtie_doc.txt"; 3683 | system(command.c_str()); 3684 | distributeAlignments(numChromosomes); 3685 | } 3686 | 3687 | if(fastMap == 1) 3688 | { 3689 | //NUCMER 3690 | command = "nucmer tmp/_genome." + itoa(chromosomeID) + ".fa tmp/_contigs.fa -p tmp/_contigs_genome." + itoa(chromosomeID) + " > nucmer_doc.txt 2> nucmer_doc.txt"; 3691 | if(system(command.c_str()) != 0) 3692 | { 3693 | command = "touch tmp/_contigs_genome." + itoa(chromosomeID) + ".delta"; 3694 | system(command.c_str()); 3695 | } 3696 | command = "tmp/_contigs_genome." + itoa(chromosomeID) + ".delta"; 3697 | delta2psl(command); 3698 | } 3699 | else 3700 | { 3701 | for(chromosomeID = 0; chromosomeID < numChromosomes; chromosomeID ++) 3702 | { 3703 | // command = "lastdb -c -uMAM8 tmp/_genome." + itoa(chromosomeID) + ".db tmp/_genome." + itoa(chromosomeID) + ".fa > last_doc.txt"; 3704 | // system(command.c_str()); 3705 | // command = "lastal -e34 -m100 tmp/_genome." + itoa(chromosomeID) + ".db tmp/_contigs.fa | last-split > tmp/_contigs_genome." + itoa(chromosomeID) + ".maf 2>> last_doc.txt"; 3706 | // system(command.c_str()); 3707 | // command = "maf-convert.py psl tmp/_contigs_genome." + itoa(chromosomeID) + ".maf > tmp/_contigs_genome." + itoa(chromosomeID) + ".psl 2>> last_doc.txt"; 3708 | // system(command.c_str()); 3709 | 3710 | command = "pblat tmp/_genome." 
+ itoa(chromosomeID) + ".fa tmp/_contigs.fa -noHead tmp/_contigs_genome." + itoa(chromosomeID) + ".psl -fastMap -threads=8 > blat_doc.txt 2> blat_doc.txt";// -minIdentity=50 -q=dnax -t=dnax 3711 | if(system(command.c_str()) != 0) 3712 | { 3713 | command = "blat tmp/_genome." + itoa(chromosomeID) + ".fa tmp/_contigs.fa -noHead tmp/_contigs_genome." + itoa(chromosomeID) + ".psl -fastMap > blat_doc.txt 2> blat_doc.txt"; 3714 | if(system(command.c_str()) != 0) {cout << "BLAT CALL FAILED!" << endl; exit(-1);} 3715 | } 3716 | } 3717 | } 3718 | } 3719 | 3720 | void parallelMap(int distanceLow, int distanceHigh, int numChromosomes, int fastMap, int iterativeMap) 3721 | { 3722 | pthread_t t0, t1; 3723 | Insert ins; 3724 | 3725 | ins.distanceLow = distanceLow; 3726 | ins.distanceHigh = distanceHigh; 3727 | ins.numChromosomes = numChromosomes; 3728 | ins.fastMap = fastMap; 3729 | ins.iterativeMap = iterativeMap; 3730 | if(pthread_create(&t0, NULL, task0, &ins) != 0) {nonParallelMap(distanceLow, distanceHigh, numChromosomes, fastMap, iterativeMap); return;} 3731 | if(pthread_create(&t1, NULL, task1, &ins) != 0) {nonParallelMap(distanceLow, distanceHigh, numChromosomes, fastMap, iterativeMap); return;} 3732 | 3733 | if(pthread_join(t0, NULL) != 0) {perror("Thread join faied"); exit(EXIT_FAILURE);} 3734 | if(pthread_join(t1, NULL) != 0) {perror("Thread join faied"); exit(EXIT_FAILURE);} 3735 | } 3736 | 3737 | void writeLog(int n) 3738 | { 3739 | ofstream out; 3740 | 3741 | out.open("tmp/log.txt"); 3742 | if(out.is_open()) 3743 | out << n << endl; 3744 | else 3745 | { 3746 | cout << "CANNOT OPEN FILE!" 
<< endl; 3747 | exit(-1); 3748 | } 3749 | } 3750 | 3751 | void checkRatio(int numChromosomes) 3752 | { 3753 | int i, rp, chromosomeID, aligned; 3754 | unsigned targetID1, targetStart1, targetEnd1, targetGap1, sourceID1, sourceStart1, sourceEnd1, sourceGap1, sourceSize1, fr1, targetID2, targetStart2, targetEnd2, targetGap2, sourceID2, sourceStart2, sourceEnd2, sourceGap2, sourceSize2, fr2; 3755 | vector segs1, segs2; 3756 | ifstream r, ra; 3757 | string s, buf; 3758 | vector reads; 3759 | double ratio; 3760 | 3761 | r.open("tmp/_reads_1.fa"); 3762 | if(r.is_open()) 3763 | { 3764 | while(r.good()) 3765 | { 3766 | getline(r, buf); 3767 | if(buf[0] == '>') 3768 | reads.push_back(0); 3769 | } 3770 | } 3771 | else 3772 | { 3773 | cout << "CANNOT OPEN FILE!" << endl; 3774 | exit(-1); 3775 | } 3776 | 3777 | for(chromosomeID = 0; chromosomeID < numChromosomes; chromosomeID ++) 3778 | { 3779 | s = "tmp/_reads_genome." + itoa(chromosomeID) + ".bowtie"; 3780 | ra.clear(); 3781 | ra.seekg(0); 3782 | ra.open(s.c_str()); 3783 | if(ra.is_open()) 3784 | { 3785 | while(ra.good()) 3786 | { 3787 | getline(ra, buf); 3788 | if(buf[0] == 0) break; 3789 | if(buf[0] == '@') continue; 3790 | parseBOWTIE(buf, targetID1, targetStart1, targetEnd1, targetGap1, sourceID1, sourceStart1, sourceEnd1, sourceGap1, sourceSize1, segs1, fr1); 3791 | 3792 | getline(ra, buf); 3793 | if(buf[0] == 0) 3794 | { 3795 | cout << "BROKEN BOWTIE FILE" << endl; 3796 | exit(-1); 3797 | } 3798 | parseBOWTIE(buf, targetID2, targetStart2, targetEnd2, targetGap2, sourceID2, sourceStart2, sourceEnd2, sourceGap2, sourceSize2, segs2, fr2); 3799 | if(targetID1 != -1 && targetID2 != -1 && (double) (sourceEnd1 - sourceStart1 - sourceGap1) / /*(sourceEnd1 - sourceStart1)*/ sourceSize1 >= THRESHOLD && (double) (targetEnd1 - targetStart1 - targetGap1) / (targetEnd1 - targetStart1) >= THRESHOLD && (double) (sourceEnd2 - sourceStart2 - sourceGap2) / /*(sourceEnd2 - sourceStart2)*/ sourceSize2 >= THRESHOLD && (double) (targetEnd2 - 
targetStart2 - targetGap2) / (targetEnd2 - targetStart2) >= THRESHOLD) 3800 | reads[sourceID1] = 1; 3801 | } 3802 | } 3803 | else 3804 | { 3805 | cout << "CANNOT OPEN FILE!" << endl; 3806 | exit(-1); 3807 | } 3808 | } 3809 | 3810 | for(rp = 0, aligned = 0; rp < reads.size(); rp ++) 3811 | if(reads[rp] == 1) 3812 | aligned ++; 3813 | ratio = aligned == 0 ? 0 : (double) aligned / reads.size(); 3814 | cout << " - " << ratio * 100 << "% reads aligned "; 3815 | if(ratio < 0.25) 3816 | cout << "(warning: ratio below 25%; hard to guarantee good results)" << endl; 3817 | else 3818 | cout << endl; 3819 | } 3820 | 3821 | void makeAlignment(int distanceLow, int distanceHigh, string id, int fastMap) 3822 | { 3823 | string command, s; 3824 | 3825 | command = "bowtie2-build -f tmp/_" + id + "_contigs.fa tmp/_" + id + "_contigs > bowtie_doc.txt 2> bowtie_doc.txt"; 3826 | system(command.c_str()); 3827 | command = "bowtie2 -f --no-mixed -k 1 -p 8 -I " + itoa(distanceLow) + " -X " + itoa(distanceHigh) + " --no-discordant -x tmp/_" + id + "_contigs -1 tmp/_reads_1.fa -2 tmp/_reads_2.fa --reorder > tmp/_reads_" + id + "_contigs.bowtie 2> bowtie_doc.txt"; 3828 | 3829 | system(command.c_str()); 3830 | if(fastMap == 1) 3831 | { 3832 | //NUCMER 3833 | command = "nucmer tmp/_genome.fa tmp/_" + id + "_contigs.fa -p tmp/_" + id + "_contigs_genome.delta > nucmer_doc.txt 2> nucmer_doc.txt"; 3834 | if(system(command.c_str()) != 0) 3835 | { 3836 | command = "touch tmp/_" + id + "_contigs_genome.delta"; 3837 | system(command.c_str()); 3838 | } 3839 | command = "tmp/_" + id + "_contigs_genome.delta"; 3840 | delta2psl(command); 3841 | } 3842 | else 3843 | { 3844 | command = "pblat tmp/_genome.fa tmp/_" + id + "_contigs.fa -noHead tmp/_" + id + "_contigs_genome.psl -fastMap -threads=8 > blat_doc.txt 2> blat_doc.txt"; 3845 | if(system(command.c_str()) != 0) 3846 | { 3847 | command = "blat tmp/_genome.fa tmp/_" + id + "_contigs.fa -noHead tmp/_" + id + "_contigs_genome.psl -fastMap > blat_doc.txt 2> 
blat_doc.txt"; 3848 | if(system(command.c_str()) != 0) {cout << "BLAT CALL FAILED!" << endl; exit(-1);} 3849 | } 3850 | } 3851 | } 3852 | 3853 | vector > loadPreContigs(string id) 3854 | { 3855 | ifstream c; 3856 | string s, buf; 3857 | vector > contigs; 3858 | vector ic; 3859 | ContigBase cb; 3860 | int i; 3861 | 3862 | s = "tmp/_" + id + "_contigs.fa"; 3863 | c.open(s.c_str()); 3864 | if(c.is_open()) 3865 | { 3866 | while(c.good()) 3867 | { 3868 | getline(c, buf); 3869 | if(buf[0] == 0) break; 3870 | 3871 | if(buf[0] == '>') 3872 | contigs.push_back(ic); 3873 | else 3874 | { 3875 | for(i = 0; i < buf.size(); i ++) 3876 | { 3877 | cb.base = buf[i]; 3878 | cb.coverage = 0; 3879 | contigs[contigs.size() - 1].push_back(cb); 3880 | } 3881 | } 3882 | } 3883 | } 3884 | else 3885 | { 3886 | cout << "CANNOT OPEN FILE!" << endl; 3887 | exit(-1); 3888 | } 3889 | 3890 | return contigs; 3891 | } 3892 | 3893 | vector > loadContigs(string id, vector > preContigs) 3894 | { 3895 | ifstream c; 3896 | string buf, s; 3897 | int i, j, realIDBak = -1, seqID = -1; 3898 | char realIDBuf[10]; 3899 | vector > contigs; 3900 | vector ic; 3901 | ContigBase cb; 3902 | 3903 | s = "tmp/_" + id + "_contigs.fa"; 3904 | c.open(s.c_str()); 3905 | if(c.is_open()) 3906 | { 3907 | while(c.good()) 3908 | { 3909 | getline(c, buf); 3910 | if(buf[0] == 0) break; 3911 | 3912 | if(buf[0] == '>') 3913 | { 3914 | for(i = 1; i < buf.size() && buf[i] != '.'; i ++); 3915 | i ++; 3916 | for(j = 0; i < buf.size(); i ++, j ++) 3917 | realIDBuf[j] = buf[i]; 3918 | for(; j < 10; j ++) realIDBuf[j] = '\0'; 3919 | if(atoi(realIDBuf) > realIDBak) 3920 | { 3921 | contigs.push_back(ic); 3922 | realIDBak = atoi(realIDBuf); 3923 | } 3924 | 3925 | seqID ++; 3926 | for(i = 0; i < preContigs[seqID].size(); i ++) 3927 | contigs[contigs.size() - 1].push_back(preContigs[seqID][i]); 3928 | } 3929 | } 3930 | } 3931 | else 3932 | { 3933 | cout << "CANNOT OPEN FILE!" 
<< endl; 3934 | exit(-1); 3935 | } 3936 | 3937 | return contigs; 3938 | } 3939 | 3940 | void loadReadAlignment(vector > & preContigs, string id) 3941 | { 3942 | ifstream ra; 3943 | string buf, s; 3944 | unsigned int targetID1, targetStart1, targetEnd1, targetGap1, sourceID1, sourceStart1, sourceEnd1, sourceGap1, sourceSize1, fr1, targetID2, targetStart2, targetEnd2, targetGap2, sourceID2, sourceStart2, sourceEnd2, sourceGap2, sourceSize2, fr2; 3945 | int bp; 3946 | vector segs1, segs2; 3947 | int line = 0; 3948 | 3949 | s = "tmp/_reads_" + id + "_contigs.bowtie"; 3950 | ra.open(s.c_str()); 3951 | if(ra.is_open()) 3952 | { 3953 | while(ra.good()) 3954 | { 3955 | getline(ra, buf); 3956 | if(buf[0] == 0) break; 3957 | if(buf[0] == '@') continue; 3958 | 3959 | parseBOWTIE(buf, targetID1, targetStart1, targetEnd1, targetGap1, sourceID1, sourceStart1, sourceEnd1, sourceGap1, sourceSize1, segs1, fr1); 3960 | getline(ra, buf); 3961 | if(buf[0] == 0) 3962 | { 3963 | cout << "BROKEN BOWTIE FILE!" << endl; 3964 | exit(-1); 3965 | } 3966 | parseBOWTIE(buf, targetID2, targetStart2, targetEnd2, targetGap2, sourceID2, sourceStart2, sourceEnd2, sourceGap2, sourceSize2, segs2, fr2); 3967 | 3968 | if(targetID1 != -1 && targetID2 != -1) 3969 | { 3970 | for(bp = targetStart1; bp < targetEnd1; bp ++) 3971 | preContigs[targetID1][bp].coverage ++; 3972 | for(bp = targetStart2; bp < targetEnd2; bp ++) 3973 | preContigs[targetID2][bp].coverage ++; 3974 | } 3975 | segs1.clear(); 3976 | segs2.clear(); 3977 | } 3978 | } 3979 | else 3980 | { 3981 | cout << "CANNOT OPEN FILE!" 
<< endl; 3982 | exit(-1); 3983 | } 3984 | } 3985 | 3986 | int conflict(unsigned int x1, unsigned int y1, unsigned int x2, unsigned int y2) 3987 | { 3988 | if(x1 <= x2 && x2 <= y1 && y1 <= y2 && (int)y1 - (int)x2 >= 100 || x2 <= x1 && x1 <= y2 && y2 <= y1 && (int)y2 - (int)x1 >= 100 || x1 <= x2 && x2 <= y2 && y2 <= y1 && (int)y2 - (int)x2 >= 100 || x2 <= x1 && x1 <= y1 && y1 <= y2 && (int)y1 - (int)x1 >= 100 || 3989 | x1 <= x2 && y2 <= y1 || x2 <= x1 && y1 <= y2) 3990 | return 1; 3991 | else 3992 | return 0; 3993 | } 3994 | 3995 | int close(unsigned int y1, unsigned int x2, unsigned int threshold) 3996 | { 3997 | if(abs((int)x2 - (int)y1) < threshold) 3998 | return 1; 3999 | else 4000 | return 0; 4001 | } 4002 | 4003 | vector > loadContigAlignment(vector > contigs, string id) 4004 | { 4005 | ifstream ca; 4006 | vector > positions; 4007 | vector position; 4008 | ContigPosition p, p0; 4009 | string buf; 4010 | unsigned int targetID, sourceID, targetStart, targetEnd, targetGap, sourceStart, sourceEnd, sourceGap, sourceSize, targetSize, fr, sourceIDBak; 4011 | int keep, sp, pp, ppp, min, mp, start, end, cp, bp, realSourceID, realSourceIDBak; 4012 | string s; 4013 | vector segs; 4014 | 4015 | for(sp = 0; sp < contigs.size(); sp ++) 4016 | positions.push_back(position); 4017 | p0.targetID = p0.sourceStart = p0.sourceEnd = p0.targetStart = p0.targetEnd = p0.fr = -1; 4018 | 4019 | s = "tmp/_" + id + "_contigs_genome.psl"; 4020 | ca.open(s.c_str()); 4021 | realSourceIDBak = -1; 4022 | if(ca.is_open()) 4023 | { 4024 | while(ca.good()) 4025 | { 4026 | getline(ca, buf); 4027 | if(buf[0] == 0) break; 4028 | 4029 | realSourceID = parseBLAT(buf, targetID, targetStart, targetEnd, targetGap, sourceID, sourceStart, sourceEnd, sourceGap, sourceSize, segs, fr, targetSize); 4030 | if(realSourceID > realSourceIDBak) 4031 | { 4032 | realSourceIDBak = realSourceID; 4033 | sourceIDBak = sourceID; 4034 | } 4035 | sourceStart = (sourceID - sourceIDBak) * LARGE_CHUNK + sourceStart; 4036 | 
sourceEnd = (sourceID - sourceIDBak) * LARGE_CHUNK + sourceEnd; 4037 | 4038 | if(sourceEnd - sourceStart >= 100 && (double)(sourceEnd - sourceStart - sourceGap) / (sourceEnd - sourceStart) >= MIN_THRESHOLD && (double)(targetEnd - targetStart - targetGap) / (double)(targetEnd - targetStart) >= MIN_THRESHOLD) 4039 | { 4040 | keep = 1; 4041 | for(pp = 0; pp < positions[realSourceID].size(); pp ++) 4042 | if(positions[realSourceID][pp].targetID != -1 && targetID == positions[realSourceID][pp].targetID && conflict(sourceStart, sourceEnd, positions[realSourceID][pp].sourceStart, positions[realSourceID][pp].sourceEnd)) 4043 | { 4044 | if(sourceEnd - sourceStart < positions[realSourceID][pp].sourceEnd - positions[realSourceID][pp].sourceStart) 4045 | keep = 0; 4046 | else 4047 | positions[realSourceID][pp] = p0; 4048 | } 4049 | if(keep) 4050 | { 4051 | p.sourceStart = sourceStart; 4052 | p.sourceEnd = sourceEnd; 4053 | p.targetStart = targetStart; 4054 | p.targetEnd = targetEnd; 4055 | p.targetID = targetID; 4056 | p.fr = fr; 4057 | positions[realSourceID].push_back(p); 4058 | } 4059 | } 4060 | } 4061 | } 4062 | else 4063 | { 4064 | cout << "CANNOT OPEN FILE!" 
<< endl; 4065 | exit(-1); 4066 | } 4067 | 4068 | for(sp = 0; sp < positions.size(); sp ++) 4069 | for(pp = 0; pp < positions[sp].size(); pp ++) 4070 | { 4071 | for(ppp = 0; ppp < positions[sp].size(); ppp ++) 4072 | { 4073 | if(ppp != pp && positions[sp][pp].targetID != -1 && positions[sp][ppp].targetID != -1 && positions[sp][pp].targetID == positions[sp][ppp].targetID && close(positions[sp][pp].sourceEnd, positions[sp][ppp].sourceStart, abs((int)positions[sp][pp].sourceEnd - (int)positions[sp][pp].sourceStart) / 10) && close(positions[sp][pp].targetEnd, positions[sp][ppp].targetStart, abs((int)positions[sp][pp].targetEnd - (int)positions[sp][pp].targetStart) / 10) && positions[sp][pp].fr == positions[sp][ppp].fr) 4074 | { 4075 | positions[sp][pp].sourceEnd = positions[sp][ppp].sourceEnd; 4076 | positions[sp][pp].targetEnd = positions[sp][ppp].targetEnd; 4077 | positions[sp][ppp] = p0; 4078 | ppp = 0; 4079 | } 4080 | } 4081 | } 4082 | 4083 | for(sp = 0; sp < positions.size(); sp ++) 4084 | for(pp = 0; pp < positions[sp].size(); pp ++) 4085 | for(ppp = pp + 1; ppp < positions[sp].size(); ppp ++) 4086 | if(positions[sp][pp].targetID != -1 && positions[sp][ppp].targetID != -1 && conflict(positions[sp][pp].sourceStart, positions[sp][pp].sourceEnd, positions[sp][ppp].sourceStart, positions[sp][ppp].sourceEnd)) 4087 | if(positions[sp][pp].sourceEnd - positions[sp][pp].sourceStart > positions[sp][ppp].sourceEnd - positions[sp][ppp].sourceStart) 4088 | positions[sp][ppp] = p0; 4089 | else 4090 | positions[sp][pp] = p0; 4091 | //remove duplicated alignments to different chrs 4092 | 4093 | for(sp = 0, cp = 0; sp < positions.size(); sp ++, cp ++) 4094 | for(pp = 0; pp < positions[sp].size(); pp ++) 4095 | for(ppp = pp + 1; ppp < positions[sp].size(); ppp ++) 4096 | if(positions[sp][pp].targetID != -1 && positions[sp][ppp].targetID != -1 && overlap(positions[sp][pp].sourceStart, positions[sp][pp].sourceEnd, positions[sp][ppp].sourceStart, positions[sp][ppp].sourceEnd))// 
overlap 4097 | { 4098 | min = MAX; mp = -1; 4099 | if(positions[sp][pp].sourceStart <= positions[sp][ppp].sourceStart) 4100 | { 4101 | start = positions[sp][ppp].sourceStart; 4102 | end = positions[cp][pp].sourceEnd - 1; 4103 | } 4104 | else 4105 | { 4106 | start = positions[sp][pp].sourceStart; 4107 | end = positions[cp][ppp].sourceEnd - 1; 4108 | } 4109 | 4110 | for(bp = start; bp <= end; bp ++) 4111 | if(contigs[cp][bp].coverage < min) 4112 | { 4113 | min = contigs[cp][bp].coverage; 4114 | mp = bp; 4115 | } 4116 | 4117 | if(positions[sp][pp].sourceStart <= positions[sp][ppp].sourceStart) 4118 | { 4119 | positions[sp][pp].sourceEnd = mp; 4120 | positions[cp][ppp].sourceStart = mp + 1;// leave alone targetStart and targetEnd 4121 | } 4122 | else 4123 | { 4124 | positions[sp][ppp].sourceEnd = mp; 4125 | positions[cp][pp].sourceStart = mp + 1; 4126 | } 4127 | } 4128 | else if(positions[sp][pp].targetID != -1 && positions[sp][ppp].targetID != -1 && positions[sp][pp].sourceEnd == positions[sp][ppp].sourceStart)// adjacent 4129 | { 4130 | if(contigs[cp][positions[sp][pp].sourceEnd - 1].coverage < contigs[cp][positions[sp][ppp].sourceStart].coverage) 4131 | positions[sp][pp].sourceEnd --; 4132 | else 4133 | positions[sp][ppp].sourceStart ++; 4134 | } 4135 | else if(positions[sp][pp].targetID != -1 && positions[sp][ppp].targetID != -1 && positions[sp][ppp].sourceEnd == positions[sp][pp].sourceStart) 4136 | { 4137 | if(contigs[cp][positions[sp][ppp].sourceEnd - 1].coverage < contigs[cp][positions[sp][pp].sourceStart].coverage) 4138 | positions[sp][ppp].sourceEnd --; 4139 | else 4140 | positions[sp][pp].sourceStart ++; 4141 | } 4142 | //keep distance between two consecutive local alignments 4143 | 4144 | return positions; 4145 | } 4146 | 4147 | void removeMasb(string file, vector > & contigs, vector > positions, string id, int coverage) 4148 | { 4149 | int sp, cp, pp, bp, bpp, start, end, total, part, p, i; 4150 | ifstream cf; 4151 | ofstream out; 4152 | string s, buf; 
4153 | vector > splitContigs; 4154 | vector sc; 4155 | 4156 | for(sp = 0, cp = 0; sp < positions.size(); sp ++, cp ++) 4157 | { 4158 | for(pp = 0; pp < positions[sp].size(); pp ++) 4159 | if(positions[sp][pp].targetID != -1 && (double)(positions[sp][pp].sourceEnd - positions[sp][pp].sourceStart) / contigs[sp].size() >= 0.8) 4160 | { 4161 | for(bp = 0; bp < contigs[cp].size(); bp ++) 4162 | contigs[cp][bp].coverage = -1;// safe 4163 | goto cont; 4164 | } 4165 | 4166 | for(pp = 0; pp < positions[sp].size(); pp ++) 4167 | { 4168 | if(positions[sp][pp].targetID != -1) 4169 | for(bp = positions[sp][pp].sourceStart; bp < positions[sp][pp].sourceEnd; bp ++) 4170 | contigs[cp][bp].coverage = -1;// safe 4171 | } 4172 | for(bp = 0; bp < contigs[cp].size(); bp ++) 4173 | { 4174 | if(contigs[cp][bp].coverage != -1) 4175 | { 4176 | if(bp != 0 && bp != contigs[cp].size() - 1 && contigs[cp][bp - 1].coverage == -1 && contigs[cp][bp + 1].coverage == -1) 4177 | { 4178 | if(contigs[cp][bp].coverage < coverage) 4179 | contigs[cp][bp].coverage = -2;// removed 4180 | else 4181 | contigs[cp][bp].coverage = -1; 4182 | continue; 4183 | } 4184 | // special case 4185 | 4186 | if(bp == 0 || contigs[cp][bp - 1].coverage == -1) 4187 | { 4188 | start = bp; 4189 | total = contigs[cp][bp].coverage; 4190 | } 4191 | else if(bp == contigs[cp].size() - 1 || contigs[cp][bp + 1].coverage == -1) 4192 | { 4193 | end = bp; 4194 | total = total + contigs[cp][bp].coverage; 4195 | 4196 | if(total / (end - start + 1) < coverage) 4197 | { 4198 | for(bpp = start; bpp <= end; bpp ++) 4199 | contigs[cp][bpp].coverage = -2;// removed 4200 | } 4201 | else 4202 | { 4203 | for(bpp = start; bpp <= end; bpp ++) 4204 | contigs[cp][bpp].coverage = -1;// safe 4205 | } 4206 | } 4207 | else 4208 | total = total + contigs[cp][bp].coverage; 4209 | } 4210 | } 4211 | // for(bp = 0; bp < contigs[cp].size(); bp ++) 4212 | // { 4213 | // if(contigs[cp][bp].coverage == -1) break; 4214 | // contigs[cp][bp].coverage = -1; 4215 | // 
}// do not remove head 4216 | // for(bp = contigs[cp].size() - 1; bp >= 0; bp --) 4217 | // { 4218 | // if(contigs[cp][bp].coverage == -1) break; 4219 | // contigs[cp][bp].coverage = -1; 4220 | // }// do not remove tail 4221 | cont:; 4222 | } 4223 | 4224 | s = "corrected_" + file; 4225 | out.open(s.c_str()); 4226 | if(out.is_open()) 4227 | { 4228 | for(cp = 0; cp < contigs.size(); cp ++) 4229 | { 4230 | splitContigs.clear(); 4231 | for(bp = 0; bp < contigs[cp].size(); bp ++) 4232 | { 4233 | if(splitContigs.size() == 0 && contigs[cp][bp].coverage == -1 || contigs[cp][bp - 1].coverage == -2 && contigs[cp][bp].coverage == -1) 4234 | splitContigs.push_back(sc); 4235 | if(contigs[cp][bp].coverage == -1) 4236 | splitContigs[splitContigs.size() - 1].push_back(contigs[cp][bp].base); 4237 | if(bp == contigs[cp].size() - 1 || contigs[cp][bp].coverage == -1 && contigs[cp][bp + 1].coverage == -2) 4238 | if(splitContigs.size() > 0 && splitContigs[splitContigs.size() - 1].size() <= 200) 4239 | splitContigs.pop_back(); 4240 | } 4241 | 4242 | for(sp = 0; sp < splitContigs.size(); sp ++) 4243 | { 4244 | if(splitContigs.size() == 1) 4245 | out << ">" << contigIds[cp] << endl; 4246 | else 4247 | out << ">" << contigIds[cp] << " : part" << sp << endl; 4248 | for(bp = 0; bp < splitContigs[sp].size(); bp ++) 4249 | { 4250 | out << splitContigs[sp][bp]; 4251 | if((bp + 1) % 60 == 0 || bp == splitContigs[sp].size() - 1) 4252 | out << endl; 4253 | } 4254 | } 4255 | } 4256 | } 4257 | else 4258 | { 4259 | cout << "CANNOT OPEN FILE!" 
<< endl; 4260 | exit(-1); 4261 | } 4262 | 4263 | if(id == "remaining") 4264 | { 4265 | cf.open("tmp/_chaff.fa"); 4266 | if(cf.is_open()) 4267 | { 4268 | while(cf.good()) 4269 | { 4270 | getline(cf, buf); 4271 | if(buf[0] == 0) break; 4272 | 4273 | for(i = 0; i < buf.size(); i ++) 4274 | out << buf[i]; 4275 | out << endl; 4276 | } 4277 | } 4278 | } 4279 | } 4280 | 4281 | void removeMisassembly(string file, int distanceLow, int distanceHigh, string id, int coverage, int fastMap) 4282 | { 4283 | ifstream c; 4284 | string s; 4285 | vector > preContigs, contigs; 4286 | vector > positions; 4287 | 4288 | c.open(file.c_str()); 4289 | s = "tmp/_" + id + "_contigs.fa"; 4290 | formalizeInput(c, s.c_str()); 4291 | makeAlignment(distanceLow, distanceHigh, id, fastMap); 4292 | preContigs = loadPreContigs(id); 4293 | loadReadAlignment(preContigs, id); 4294 | contigs = loadContigs(id, preContigs); 4295 | positions = loadContigAlignment(contigs, id); 4296 | removeMasb(file, contigs, positions, id, coverage); 4297 | } 4298 | 4299 | void setCommand(ofstream & wcmd, string cmd) 4300 | { 4301 | wcmd << cmd << endl; 4302 | } 4303 | 4304 | void print() 4305 | { 4306 | cout << "AlignGraph --read1 reads_1.fa --read2 reads_2.fa --contig contigs.fa --genome genome.fa --distanceLow distanceLow --distanceHigh distancehigh --extendedContig extendedContigs.fa --remainingContig remainingContigs.fa [--kMer k --insertVariation insertVariation --covereage coverage --part p --ratioCheck --iterativeMap --misassemblyRemoval --resume]" << endl; 4307 | cout << "Inputs:" << endl; 4308 | cout << "--read1 is the the first pair of PE DNA reads in fasta format" << endl; 4309 | cout << "--read2 is the the second pair of PE DNA reads in fasta format" << endl; 4310 | cout << "--contig is the initial contigs in fasta format" << endl; 4311 | cout << "--genome is the reference genome in fasta format" << endl; 4312 | cout << "--distanceLow is the lower bound of alignment distance between the first and second pairs 
of PE DNA reads (recommended: max{insert length - 1000, single read length})" << endl; 4313 | cout << "--distanceHigh is the upper bound of alignment distance between the first and second pairs of PE DNA reads (recommended: insert length + 1000)" << endl; 4314 | cout << "Outputs:" << endl; 4315 | cout << "--extendedContig is the extended contig file in fasta format" << endl; 4316 | cout << "--remainingContig is the not extended initial contig file in fasta format" << endl; 4317 | cout << "Options:" << endl; 4318 | cout << "--kMer is the k-mer size (default: 5)" << endl; 4319 | cout << "--insertVariation is the small variation of insert length (default: 50)" << endl; 4320 | cout << "--coverage is the minimum coverage to keep a path in de Bruijn graph (default: 20)" << endl; 4321 | cout << "--part is the number of parts a chromosome is divided into when it is loaded to reduce memory requirement (default: 1)" << endl; 4322 | cout << "--fastMap calls NUCMER to make fast but less sensitive and accurate contig alignment instead of BLAT (default: none)" << endl; 4323 | cout << "--ratioCheck checks read alignment ratio to the reference beforehand and warns if the ratio is too low; may take a little more time (default: none)" << endl; 4324 | cout << "--iterativeMap aligns reads to one chromosome and then another rather than directly to the genome, which increases sensitivity while loses precision (default: none)" << endl; 4325 | cout << "--misassemblyRemoval detects and then breaks at or removes misassembed regions (default: none)" << endl; 4326 | cout << "--resume resumes the previous unfinished running from several checkpoints (default: none)" << endl; 4327 | } 4328 | 4329 | void getParameters(ifstream & rcmd, ifstream & r1, ifstream & r2, ifstream & c, ifstream & g, ofstream & e, ofstream & r, int & distanceLow, int & distanceHigh, int & k, int & insertVariation, int & coverage, int & part, int & tagRead1, int & tagRead2, int & tagContig, int & tagGenome, int & 
tagDistanceLow, int & tagDistanceHigh, int & tagExtendedContig, int & tagRemainingContig, int & tagKMer, int & tagInsertVariation, int & tagCoverage, int & tagPart, int & tagFastMap, int & tagRatioCheck, int & tagUniqueExtension, int & tagIterativeMap, int & tagMisassemblyRemoval, int & tagResume, string & ext, string & rmn) 4330 | { 4331 | int i = -1, count = 0; 4332 | string buf, bufCheck1; 4333 | stringstream bufCheck2; 4334 | 4335 | if(rcmd.is_open()) 4336 | { 4337 | while(rcmd.good()) 4338 | { 4339 | getline(rcmd, buf); 4340 | if(buf[0] == 0) break; 4341 | 4342 | count ++; 4343 | } 4344 | } 4345 | else 4346 | { 4347 | cout << "CANNOT OPEN FILE!" << endl; 4348 | exit(-1); 4349 | } 4350 | 4351 | rcmd.clear(); 4352 | rcmd.seekg(0); 4353 | 4354 | if(rcmd.is_open()) 4355 | { 4356 | while(rcmd.good()) 4357 | { 4358 | getline(rcmd, buf); 4359 | if(buf[0] == 0) break; 4360 | 4361 | i ++; 4362 | if(buf == "--read1") 4363 | { 4364 | if(tagRead1 == 1 || i == count - 1) 4365 | { 4366 | print(); 4367 | exit(-1); 4368 | } 4369 | getline(rcmd, buf); 4370 | r1.open(buf.c_str()); 4371 | if(!r1.is_open()) 4372 | { 4373 | cout << "CANNOT OPEN FILE!" << endl; 4374 | print(); 4375 | exit(-1); 4376 | } 4377 | tagRead1 = 1; 4378 | } 4379 | else if(buf == "--read2") 4380 | { 4381 | if(tagRead2 == 1 || i == count - 1) 4382 | { 4383 | print(); 4384 | exit(-1); 4385 | } 4386 | getline(rcmd, buf); 4387 | r2.open(buf.c_str()); 4388 | if(!r2.is_open()) 4389 | { 4390 | cout << "CANNOT OPEN FILE!" << endl; 4391 | print(); 4392 | exit(-1); 4393 | } 4394 | tagRead2 = 1; 4395 | } 4396 | else if(buf == "--contig") 4397 | { 4398 | if(tagContig == 1 || i == count - 1) 4399 | { 4400 | print(); 4401 | exit(-1); 4402 | } 4403 | getline(rcmd, buf); 4404 | c.open(buf.c_str()); 4405 | if(!c.is_open()) 4406 | { 4407 | cout << "CANNOT OPEN FILE!" 
<< endl; 4408 | print(); 4409 | exit(-1); 4410 | } 4411 | tagContig = 1; 4412 | } 4413 | else if(buf == "--genome") 4414 | { 4415 | if(tagGenome == 1 || i == count - 1) 4416 | { 4417 | print(); 4418 | exit(-1); 4419 | } 4420 | getline(rcmd, buf); 4421 | g.open(buf.c_str()); 4422 | if(!g.is_open()) 4423 | { 4424 | cout << "CANNOT OPEN FILE!" << endl; 4425 | print(); 4426 | exit(-1); 4427 | } 4428 | tagGenome = 1; 4429 | } 4430 | else if(buf == "--distanceLow") 4431 | { 4432 | if(tagDistanceLow == 1 || i == count - 1) 4433 | { 4434 | print(); 4435 | exit(-1); 4436 | } 4437 | getline(rcmd, buf); 4438 | distanceLow = atoi(buf.c_str()); 4439 | bufCheck2.str(""); 4440 | bufCheck2 << distanceLow; 4441 | bufCheck1 = bufCheck2.str(); 4442 | if(bufCheck1 != buf) 4443 | { 4444 | print(); 4445 | exit(-1); 4446 | } 4447 | tagDistanceLow = 1; 4448 | } 4449 | else if(buf == "--distanceHigh") 4450 | { 4451 | if(tagDistanceHigh == 1 || i == count - 1) 4452 | { 4453 | print(); 4454 | exit(-1); 4455 | } 4456 | getline(rcmd, buf); 4457 | distanceHigh = atoi(buf.c_str()); 4458 | bufCheck2.str(""); 4459 | bufCheck2 << distanceHigh; 4460 | bufCheck1 = bufCheck2.str(); 4461 | if(bufCheck1 != buf) 4462 | { 4463 | print(); 4464 | exit(-1); 4465 | } 4466 | tagDistanceHigh = 1; 4467 | } 4468 | else if(buf == "--extendedContig") 4469 | { 4470 | if(tagExtendedContig == 1 || i == count - 1) 4471 | { 4472 | print(); 4473 | exit(-1); 4474 | } 4475 | getline(rcmd, buf); 4476 | e.open(buf.c_str()); 4477 | ext = buf; 4478 | if(!e.is_open()) 4479 | { 4480 | cout << "CANNOT OPEN FILE!" << endl; 4481 | print(); 4482 | exit(-1); 4483 | } 4484 | tagExtendedContig = 1; 4485 | } 4486 | else if(buf == "--remainingContig") 4487 | { 4488 | if(tagRemainingContig == 1 || i == count - 1) 4489 | { 4490 | print(); 4491 | exit(-1); 4492 | } 4493 | getline(rcmd, buf); 4494 | r.open(buf.c_str()); 4495 | rmn = buf; 4496 | if(!r.is_open()) 4497 | { 4498 | cout << "CANNOT OPEN FILE!" 
<< endl; 4499 | print(); 4500 | exit(-1); 4501 | } 4502 | tagRemainingContig = 1; 4503 | } 4504 | else if(buf == "--kMer") 4505 | { 4506 | if(tagKMer == 1 || i == count - 1) 4507 | { 4508 | print(); 4509 | exit(-1); 4510 | } 4511 | getline(rcmd, buf); 4512 | k = atoi(buf.c_str()); 4513 | bufCheck2.str(""); 4514 | bufCheck2 << k; 4515 | bufCheck1 = bufCheck2.str(); 4516 | if(bufCheck1 != buf) 4517 | { 4518 | print(); 4519 | exit(-1); 4520 | } 4521 | tagKMer = 1; 4522 | } 4523 | else if(buf == "--insertVariation") 4524 | { 4525 | if(tagInsertVariation == 1 || i == count - 1) 4526 | { 4527 | print(); 4528 | exit(-1); 4529 | } 4530 | getline(rcmd, buf); 4531 | insertVariation = atoi(buf.c_str()); 4532 | bufCheck2.str(""); 4533 | bufCheck2 << insertVariation; 4534 | bufCheck1 = bufCheck2.str(); 4535 | if(bufCheck1 != buf) 4536 | { 4537 | print(); 4538 | exit(-1); 4539 | } 4540 | tagInsertVariation = 1; 4541 | } 4542 | else if(buf == "--coverage") 4543 | { 4544 | if(tagCoverage == 1 || i == count - 1) 4545 | { 4546 | print(); 4547 | exit(-1); 4548 | } 4549 | getline(rcmd, buf); 4550 | coverage = atoi(buf.c_str()); 4551 | bufCheck2.str(""); 4552 | bufCheck2 << coverage; 4553 | bufCheck1 = bufCheck2.str(); 4554 | if(bufCheck1 != buf) 4555 | { 4556 | print(); 4557 | exit(-1); 4558 | } 4559 | tagCoverage = 1; 4560 | } 4561 | else if(buf == "--part") 4562 | { 4563 | if(tagPart == 1 || i == count - 1) 4564 | { 4565 | print(); 4566 | exit(-1); 4567 | } 4568 | getline(rcmd, buf); 4569 | part = atoi(buf.c_str()); 4570 | bufCheck2.str(""); 4571 | bufCheck2 << part; 4572 | bufCheck1 = bufCheck2.str(); 4573 | if(bufCheck1 != buf) 4574 | { 4575 | print(); 4576 | exit(-1); 4577 | } 4578 | tagPart = 1; 4579 | } 4580 | else if(buf == "--fastMap") 4581 | { 4582 | if(tagFastMap == 1) 4583 | { 4584 | print(); 4585 | exit(-1); 4586 | } 4587 | tagFastMap = 1; 4588 | } 4589 | else if(buf == "--ratioCheck") 4590 | { 4591 | if(tagRatioCheck == 1) 4592 | { 4593 | print(); 4594 | exit(-1); 4595 | 
} 4596 | tagRatioCheck = 1; 4597 | } 4598 | else if(buf == "--uniqueExtension") 4599 | { 4600 | if(tagUniqueExtension == 1) 4601 | { 4602 | print(); 4603 | exit(-1); 4604 | } 4605 | tagUniqueExtension = 1; 4606 | } 4607 | else if(buf == "--iterativeMap") 4608 | { 4609 | if(tagIterativeMap == 1) 4610 | { 4611 | print(); 4612 | exit(-1); 4613 | } 4614 | tagIterativeMap = 1; 4615 | } 4616 | else if(buf == "--misassemblyRemoval") 4617 | { 4618 | if(tagMisassemblyRemoval == 1) 4619 | { 4620 | print(); 4621 | exit(-1); 4622 | } 4623 | tagMisassemblyRemoval = 1; 4624 | } 4625 | else if(buf == "--resume") 4626 | { 4627 | if(tagResume == 1 || count != 1) 4628 | { 4629 | print(); 4630 | exit(-1); 4631 | } 4632 | tagResume = 1; 4633 | } 4634 | else 4635 | { 4636 | print(); 4637 | exit(-1); 4638 | } 4639 | } 4640 | } 4641 | else 4642 | { 4643 | cout << "CANNOT OPEN CHECKPOINT FILE!" << endl; 4644 | exit(-1); 4645 | } 4646 | } 4647 | 4648 | void setCheckpoint(ofstream & wcp, string s) 4649 | { 4650 | wcp << s << endl; 4651 | } 4652 | 4653 | void getCheckpoint(ifstream & rcp, int & cp) 4654 | { 4655 | int i; 4656 | string s; 4657 | 4658 | cp = -1; 4659 | if(rcp.is_open()) 4660 | { 4661 | while(rcp.good()) 4662 | { 4663 | getline(rcp, s); 4664 | if(s[0] == 0) break; 4665 | 4666 | cp = atoi(s.c_str()); 4667 | } 4668 | } 4669 | else 4670 | { 4671 | cout << "CANNOT OPEN FILE!" << endl; 4672 | exit(-1); 4673 | } 4674 | 4675 | if(cp == -1) 4676 | { 4677 | cout << "NOT REACHED CHECKPOINT. PLEASE RERUN!" << endl; 4678 | exit(-1); 4679 | } 4680 | } 4681 | 4682 | void testAligners(int tagFastMap) 4683 | { 4684 | if(system("bowtie2 -h > bowtie_doc.txt 2> bowtie_doc.txt") != 0) {cout << "BOWTIE2 CALL FAILED!" << endl; exit(-1);} 4685 | if(tagFastMap == 0) 4686 | { 4687 | // if(system("pblat > blat_doc.txt 2> blat_doc.txt") != 0 && system("blat -h > blat_doc.txt 2> blat_doc.txt") != 0) {cout << "BLAT CALL FAILED!" 
<< endl; exit(-1);} 4688 | // BLAT check is left for later when it is running 4689 | } 4690 | else 4691 | { 4692 | if(system("nucmer -h > nucmer_doc.txt 2> nucmer_doc.txt") != 0) {cout << "NUCMER CALL FAILED!" << endl; exit(-1);} 4693 | } 4694 | } 4695 | 4696 | int main(int argc, char * argv[]) 4697 | { 4698 | ifstream r1, r2, c, g, rcmd, rcp; 4699 | ofstream e, r, wcmd, wcp; 4700 | string ext, rmn; 4701 | int i, tagRead1 = 0, tagRead2 = 0, tagContig = 0, tagGenome = 0, tagExtendedContig = 0, tagKMer = 0, tagDistanceLow = 0, tagDistanceHigh = 0, tagNoAlignment = 1, k = 5, distanceLow = 0, distanceHigh = MAX, chromosomeID, numChromosomes, coverage = 20, tagCoverage = 0, mrl, mrl1, mrl2, tagInsertVariation = 0, insertVariation = 50, tagRemainingContig = 0, part = 1, tagPart = 0, tagFastMap = 0, numReads, tagRatioCheck = 0, tagUniqueExtension = 0, tagIterativeMap = 0, tagMisassemblyRemoval = 0, tagResume = 0, cp = 0; 4702 | time_t start, end, startAlign, endAlign; 4703 | 4704 | cout << "AlignGraph: algorithm for secondary de novo genome assembly guided by closely related references" << endl; 4705 | cout << "By Ergude Bao, CS Department, UC-Riverside. All Rights Reserved" << endl << endl; 4706 | 4707 | start = time(NULL); 4708 | wcmd.open("command.txt"); 4709 | if(wcmd.is_open()) 4710 | { 4711 | for(i = 1; i < argc; i ++) 4712 | setCommand(wcmd, argv[i]); 4713 | } 4714 | else 4715 | { 4716 | cout << "CANNOT OPEN FILE!" 
<< endl; 4717 | return 0; 4718 | } 4719 | wcmd.close(); 4720 | rcmd.open("command.txt"); 4721 | getParameters(rcmd, r1, r2, c, g, e, r, distanceLow, distanceHigh, k, insertVariation, coverage, part, tagRead1, tagRead2, tagContig, tagGenome, tagDistanceLow, tagDistanceHigh, tagExtendedContig, tagRemainingContig, tagKMer, tagInsertVariation, tagCoverage, tagPart, tagFastMap, tagRatioCheck, tagUniqueExtension, tagIterativeMap, tagMisassemblyRemoval, tagResume, ext, rmn); 4722 | rcmd.close(); 4723 | 4724 | if(tagResume == 0) 4725 | { 4726 | if(tagRead1 == 0 || tagRead2 == 0 || tagContig == 0 || tagGenome == 0 || tagExtendedContig == 0 || tagRemainingContig == 0 || k <= 0 || tagDistanceLow == 0 || tagDistanceHigh == 0 || distanceLow > distanceHigh || distanceLow < 0 || insertVariation < 0 || part < 1 || part > 10 || k > maxReadLength(r1) || k > maxReadLength(r2)) 4727 | { 4728 | print(); 4729 | return 0; 4730 | } 4731 | testAligners(tagFastMap); 4732 | 4733 | system("test -d \"tmp\"; t=$?; if [ $t -eq 1 ]; then mkdir tmp; fi"); 4734 | wcmd.open("tmp/_command.txt"); 4735 | for(i = 1; i < argc; i ++) 4736 | setCommand(wcmd, argv[i]); 4737 | wcp.open("tmp/_checkpoint.txt"); 4738 | 4739 | formalizeInput(r1, r2, "tmp/_reads.fa", "tmp/_reads_1.fa", "tmp/_reads_2.fa"); 4740 | formalizeInput(c, "tmp/_contigs.fa"); 4741 | numChromosomes = formalizeGenome(g, part); 4742 | startAlign = time(NULL); 4743 | parallelMap(distanceLow, distanceHigh, numChromosomes, tagFastMap, tagIterativeMap); 4744 | endAlign = time(NULL); 4745 | cout << "(0) Alignment finished" << endl; 4746 | setCheckpoint(wcp, "0"); 4747 | } 4748 | else 4749 | { 4750 | rcp.open("tmp/_checkpoint.txt"); 4751 | getCheckpoint(rcp, cp); 4752 | rcmd.open("tmp/_command.txt"); 4753 | getParameters(rcmd, r1, r2, c, g, e, r, distanceLow, distanceHigh, k, insertVariation, coverage, part, tagRead1, tagRead2, tagContig, tagGenome, tagDistanceLow, tagDistanceHigh, tagExtendedContig, tagRemainingContig, tagKMer, tagInsertVariation, 
tagCoverage, tagPart, tagFastMap, tagRatioCheck, tagUniqueExtension, tagIterativeMap, tagMisassemblyRemoval, tagResume, ext, rmn); 4754 | cout << "RESUMED SUCCESSFULLY :-)" << endl; 4755 | wcp.open("tmp/_checkpoint.txt", ios::app); 4756 | 4757 | formalizeInput(c, "tmp/_contigs.fa"); 4758 | numChromosomes = formalizeGenome(g, part); 4759 | startAlign = endAlign = time(NULL); 4760 | } 4761 | 4762 | if(tagRatioCheck == 1) 4763 | checkRatio(numChromosomes); 4764 | 4765 | for(chromosomeID = cp; chromosomeID < numChromosomes; chromosomeID ++) 4766 | { 4767 | cout << endl << "CHROMOSOME " << chromosomeID << ": " << endl; 4768 | loadGenome(genome, chromosomeID); 4769 | cout << "(1) Chromosome loaded" << endl; 4770 | loadContigAlignment(genome, chromosomeID); 4771 | cout << "(2) Contig alignment loaded" << endl; 4772 | loadReadAlignment(genome, k, insertVariation, chromosomeID, mrl); 4773 | cout << "(3) Read alignment loaded" << endl; 4774 | extendContigs(genome, coverage, k, chromosomeID); 4775 | cout << "(4) Contigs extended" << endl; 4776 | scaffoldContigs(genome, chromosomeID); 4777 | cout << "(5) Contigs scaffolded" << endl; 4778 | system("ps euf >> mem.txt"); 4779 | contigs.clear(); 4780 | genome.clear(); 4781 | sourceIDBak = -1; 4782 | setCheckpoint(wcp, itoa(chromosomeID + 1)); 4783 | } 4784 | 4785 | refinement(e, r, tagFastMap, tagUniqueExtension, numChromosomes); 4786 | 4787 | if(tagMisassemblyRemoval == 1) 4788 | { 4789 | removeMisassembly(ext, distanceLow, distanceHigh, "extended", coverage, tagFastMap); 4790 | removeMisassembly(rmn, distanceLow, distanceHigh, "remaining", coverage, tagFastMap); 4791 | cout << endl << "(6) Misassemblies removed" << endl; 4792 | } 4793 | 4794 | end = time(NULL); 4795 | cout << endl << "FINISHED SUCCESSFULLY for " << end - start << " seconds (" << endAlign - startAlign << " seconds for alignment) :-)" << endl; 4796 | } 4797 | 4798 | -------------------------------------------------------------------------------- 
/Eval-AlignGraph/.nfs00000000000025d800016c95: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baoe/AlignGraph/1fada4c173f7e2111eeaf6da79e28d7fc32e6df2/Eval-AlignGraph/.nfs00000000000025d800016c95 -------------------------------------------------------------------------------- /Eval-AlignGraph/Eval-AlignGraph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baoe/AlignGraph/1fada4c173f7e2111eeaf6da79e28d7fc32e6df2/Eval-AlignGraph/Eval-AlignGraph -------------------------------------------------------------------------------- /Eval-AlignGraph/Eval-AlignGraph.cpp: -------------------------------------------------------------------------------- 1 | //********************************************************************************** 2 | //* Title: Eval-AlignGraph: Evaluation tool distributed with AlignGraph 3 | //* Platform: 64-Bit Linux 4 | //* Author: Ergude Bao 5 | //* Affliation: Department of Computer Science & Engineering 6 | //* University of California, Riverside 7 | //* Date: 03/24/2011 8 | //* Copy Right: Artistic License 2.0 9 | //********************************************************************************** 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | using namespace std; 22 | 23 | #define IDENTITY 0.1 24 | #define CUTOFF 1000 25 | #define SIZE 1000000 26 | #define FASTMAP 0 27 | 28 | typedef struct posStruct 29 | { 30 | int targetID; 31 | unsigned int sourceStart; 32 | unsigned int sourceEnd; 33 | unsigned int targetStart; 34 | unsigned int targetEnd; 35 | unsigned int sourceGap; 36 | unsigned int targetGap; 37 | int fr; 38 | int alignedBases; 39 | } pos; 40 | 41 | void parseBLAT(string buf, int & targetID, unsigned int & targetStart, unsigned int & targetEnd, unsigned int & targetGap, int & sourceID, unsigned int & 
sourceStart, unsigned int & sourceEnd, unsigned int & sourceGap, unsigned int & sourceSize, int & fragID, int & fr, int & alignedBases) 42 | { 43 | char targetIDBuf[100] = {'\0'}, targetStartBuf[100] = {'\0'}, targetEndBuf[100] = {'\0'}, targetGapBuf[100] = {'\0'}, sourceIDBuf[100] = {'\0'}, sourceStartBuf[100] = {'\0'}, sourceEndBuf[100] = {'\0'}, sourceGapBuf[100] = {'\0'}, sourceSizeBuf[100] = {'\0'}, blockBuf[100] = {'\0'}, sourceIDBuf1[100] = {'\0'}, sourceIDBuf2[100] = {'\0'}; 44 | int item = 0, i, j = 0, k, ab = 0; 45 | vector seg; 46 | 47 | for(i = 0; i < buf.size(); i ++) 48 | { 49 | if(buf[i] == ' ') 50 | { 51 | item ++; 52 | j = 0; 53 | continue; 54 | } 55 | if(item == 13) 56 | targetIDBuf[j ++] = buf[i]; 57 | if(item == 15) 58 | targetStartBuf[j ++] = buf[i]; 59 | if(item == 16) 60 | targetEndBuf[j ++] = buf[i]; 61 | if(item == 7) 62 | targetGapBuf[j ++] = buf[i]; 63 | if(item == 9) 64 | sourceIDBuf[j ++] = buf[i]; 65 | if(item == 11) 66 | sourceStartBuf[j ++] = buf[i]; 67 | if(item == 12) 68 | sourceEndBuf[j ++] = buf[i]; 69 | if(item == 5) 70 | sourceGapBuf[j ++] = buf[i]; 71 | if(item == 10) 72 | sourceSizeBuf[j ++] = buf[i]; 73 | if(item == 8) 74 | fr = buf[i] == '+' ? 0 : 1; 75 | if(item == 18) 76 | { 77 | if(buf[i] == ',') 78 | { 79 | seg.push_back(atoi(blockBuf)); 80 | 81 | for(k = 0; k < j; k ++) 82 | blockBuf[k] = '\0'; 83 | j = 0; 84 | } 85 | else 86 | blockBuf[j ++] = buf[i]; 87 | } 88 | } 89 | targetID = atoi(targetIDBuf); //IMPORTANT! 
HAS TO BE ADJUSTED FOR MORE THAN ONE CHRS 90 | // if(targetID > 10) targetID = 0; 91 | targetStart = atoi(targetStartBuf); 92 | targetEnd = atoi(targetEndBuf); 93 | targetGap = atoi(targetGapBuf); 94 | sourceGap = atoi(sourceGapBuf); 95 | sourceSize = atoi(sourceSizeBuf); 96 | 97 | for(i = 0; i < 100; i ++) 98 | if(sourceIDBuf[i] == '.') 99 | break; 100 | if(i == 100) 101 | { 102 | sourceID = atoi(sourceIDBuf); 103 | fragID = 0; 104 | } 105 | else 106 | { 107 | for(j = 0; j < i; j ++) 108 | sourceIDBuf1[j] = sourceIDBuf[j]; 109 | for(j = i + 1; j < 100; j ++) 110 | sourceIDBuf2[j - i - 1] = sourceIDBuf[j]; 111 | sourceID = atoi(sourceIDBuf1); 112 | fragID = atoi(sourceIDBuf2); 113 | } 114 | sourceStart = fragID * SIZE + atoi(sourceStartBuf); 115 | sourceEnd = fragID * SIZE + atoi(sourceEndBuf); 116 | 117 | for(i = 0; i < seg.size(); i ++) 118 | ab = ab + seg[i]; 119 | alignedBases = ab; 120 | } 121 | 122 | int conflict(unsigned int x1, unsigned int y1, unsigned int x2, unsigned int y2) 123 | { 124 | if(x1 <= x2 && x2 <= y1 && y1 <= y2 && (int)y1 - (int)x2 >= 100 || x2 <= x1 && x1 <= y2 && y2 <= y1 && (int)y2 - (int)x1 >= 100 || x1 <= x2 && x2 <= y2 && y2 <= y1 && (int)y2 - (int)x2 >= 100 || x2 <= x1 && x1 <= y1 && y1 <= y2 && (int)y1 - (int)x1 >= 100 || 125 | x1 <= x2 && y2 <= y1 || x2 <= x1 && y1 <= y2) 126 | return 1; 127 | else 128 | return 0; 129 | } 130 | 131 | int close(unsigned int y1, unsigned int x2, unsigned int threshold) 132 | { 133 | if(abs((int)x2 - (int)y1) < threshold) 134 | return 1; 135 | else 136 | return 0; 137 | } 138 | 139 | vector > loadGenome() 140 | { 141 | ifstream in; 142 | string buf; 143 | vector gb; 144 | int i; 145 | vector > genome; 146 | 147 | in.open("etmp/_genome.fa"); 148 | if(in.is_open()) 149 | { 150 | while(in.good()) 151 | { 152 | getline(in, buf); 153 | if(buf[0] == 0) break; 154 | 155 | if(buf[0] == '>') 156 | genome.push_back(gb); 157 | else 158 | for(i = 0; i < buf.size(); i ++) 159 | genome[genome.size() - 
1].push_back(0); 160 | } 161 | } 162 | else 163 | { 164 | cout << "CANNOT OPEN FILE!" << endl; 165 | exit(-1); 166 | } 167 | 168 | return genome; 169 | } 170 | 171 | vector > loadContigs() 172 | { 173 | ifstream in; 174 | string buf; 175 | int i, j, initIDBak = -1; 176 | char initIDBuf[100]; 177 | vector > initContigs; 178 | vector ic; 179 | 180 | in.open("etmp/_contigs.fa"); 181 | if(in.is_open()) 182 | { 183 | while(in.good()) 184 | { 185 | getline(in, buf); 186 | if(buf[0] == 0) break; 187 | 188 | if(buf[0] == '>') 189 | { 190 | for(i = 1, j = 0; i < buf.size() && buf[i] != '.'; i ++, j ++) initIDBuf[j] = buf[i]; 191 | for(; j < 100; j ++) initIDBuf[j] = '\0'; 192 | if(atoi(initIDBuf) > initIDBak) 193 | { 194 | initContigs.push_back(ic); 195 | initIDBak = atoi(initIDBuf); 196 | } 197 | continue; 198 | } 199 | else 200 | for(i = 0; i < buf.size(); i ++) 201 | initContigs[initContigs.size() - 1].push_back(buf[i]); 202 | } 203 | } 204 | else 205 | { 206 | cout << "CANNOT OPEN FILE!" << endl; 207 | exit(-1); 208 | } 209 | 210 | return initContigs; 211 | } 212 | 213 | vector > loadContigsAlignment(int size) 214 | { 215 | ifstream in; 216 | vector > positions; 217 | vector position; 218 | pos p, p0; 219 | string buf; 220 | unsigned int targetStart, targetEnd, targetGap, sourceStart, sourceEnd, sourceGap, sourceSize; 221 | int targetID, sourceID, keep, i, j, k, fragID, fr, alignedBases; 222 | 223 | for(i = 0; i < size; i ++) 224 | positions.push_back(position); 225 | p0.targetID = p0.sourceStart = p0.sourceEnd = p0.targetStart = p0.targetEnd = p0.fr = p0.sourceGap = p0.targetGap = p0.alignedBases = -1; 226 | 227 | in.open("etmp/_contigs_genome.psl"); 228 | if(in.is_open()) 229 | { 230 | while(in.good()) 231 | { 232 | getline(in, buf); 233 | if(buf[0] == 0) break; 234 | 235 | parseBLAT(buf, targetID, targetStart, targetEnd, targetGap, sourceID, sourceStart, sourceEnd, sourceGap, sourceSize, fragID, fr, alignedBases); 236 | if(sourceEnd - sourceStart >= 100 && 
(double)(sourceEnd - sourceStart - sourceGap) / (sourceEnd - sourceStart) >= IDENTITY && (double)(targetEnd - targetStart - targetGap) / (double)(targetEnd - targetStart) >= IDENTITY) 237 | { 238 | keep = 1; 239 | for(i = 0; i < positions[sourceID].size(); i ++) 240 | if(positions[sourceID][i].targetID != -1 && targetID == positions[sourceID][i].targetID && conflict(sourceStart, sourceEnd, positions[sourceID][i].sourceStart, positions[sourceID][i].sourceEnd)) 241 | { 242 | if(sourceEnd - sourceStart < positions[sourceID][i].sourceEnd - positions[sourceID][i].sourceStart) 243 | keep = 0; 244 | else 245 | positions[sourceID][i] = p0; 246 | } 247 | if(keep) 248 | { 249 | p.sourceStart = sourceStart; 250 | p.sourceEnd = sourceEnd; 251 | p.targetStart = targetStart; 252 | p.targetEnd = targetEnd; 253 | p.targetID = targetID; 254 | p.sourceGap = sourceGap; 255 | p.targetGap = targetGap; 256 | p.fr = fr; 257 | p.alignedBases = alignedBases; 258 | positions[sourceID].push_back(p); 259 | } 260 | } 261 | } 262 | } 263 | else 264 | { 265 | cout << "CANNOT OPEN FILE!" 
<< endl; 266 | exit(-1); 267 | } 268 | 269 | for(i = 0; i < positions.size(); i ++) 270 | { 271 | for(j = 0; j < positions[i].size(); j ++) 272 | { 273 | for(k = 0; k < positions[i].size(); k ++) 274 | { 275 | if(k != j && positions[i][j].targetID != -1 && positions[i][k].targetID != -1 && positions[i][j].targetID == positions[i][k].targetID && close(positions[i][j].sourceEnd, positions[i][k].sourceStart, abs((int)positions[i][j].sourceEnd - (int)positions[i][j].sourceStart) / 10) && close(positions[i][j].targetEnd, positions[i][k].targetStart, abs((int)positions[i][j].targetEnd - (int)positions[i][j].targetStart) / 10) && positions[i][j].fr == positions[i][k].fr) 276 | { 277 | 278 | positions[i][j].sourceEnd = positions[i][k].sourceEnd; 279 | positions[i][j].targetEnd = positions[i][k].targetEnd; 280 | positions[i][j].sourceGap = positions[i][j].sourceGap + positions[i][k].sourceGap; 281 | positions[i][j].targetGap = positions[i][j].targetGap + positions[i][k].targetGap; 282 | positions[i][j].alignedBases = positions[i][j].alignedBases + positions[i][k].alignedBases; 283 | positions[i][k] = p0; 284 | k = 0; 285 | } 286 | } 287 | } 288 | } 289 | 290 | for(i = 0; i < positions.size(); i ++) 291 | for(j = 0; j < positions[i].size(); j ++) 292 | for(k = j + 1; k < positions[i].size(); k ++) 293 | { 294 | if(positions[i][j].targetID != -1 && positions[i][k].targetID != -1 && conflict(positions[i][j].sourceStart, positions[i][j].sourceEnd, positions[i][k].sourceStart, positions[i][k].sourceEnd) == 1) 295 | { 296 | if(positions[i][j].sourceEnd - positions[i][j].sourceStart > positions[i][k].sourceEnd - positions[i][k].sourceStart) 297 | positions[i][k] = p0; 298 | else 299 | { 300 | positions[i][j] = p0; 301 | break; 302 | } 303 | } 304 | } 305 | //remove duplicated alignments to different chrs 306 | 307 | return positions; 308 | } 309 | 310 | void analyze(char argv[], vector > & genome, vector > & initContigs, vector > & positions) 311 | { 312 | ofstream out; 313 | int 
max, misassembly, i, j, k, alignedBases, totalBases, errors, sum, coveredLength, contigBases, totalLength; 314 | vector trueContigLengths; 315 | vector identity; 316 | double totalIdentity; 317 | 318 | max = misassembly = 0; 319 | for(i = 0; i < positions.size(); i ++) 320 | { 321 | for(j = 0; j < positions[i].size(); j ++) 322 | if(positions[i][j].targetID != -1 && (double)(positions[i][j].sourceEnd - positions[i][j].sourceStart) / initContigs[i].size() >= 0.8) 323 | { 324 | trueContigLengths.push_back(positions[i][j].sourceEnd - positions[i][j].sourceStart); 325 | 326 | for(k = positions[i][j].targetStart; k < positions[i][j].targetEnd; k ++) 327 | genome[positions[i][j].targetID][k] = 1; 328 | 329 | if(max < positions[i][j].sourceEnd - positions[i][j].sourceStart) 330 | max = positions[i][j].sourceEnd - positions[i][j].sourceStart; 331 | 332 | alignedBases = positions[i][j].alignedBases; 333 | totalBases = positions[i][j].targetEnd - positions[i][j].targetStart + positions[i][j].targetGap; 334 | identity.push_back((double) alignedBases * trueContigLengths[trueContigLengths.size() - 1] / totalBases); 335 | 336 | goto end; 337 | } 338 | 339 | for(j = 0, errors = 0; j < positions[i].size(); j ++) 340 | { 341 | if(positions[i][j].targetID != -1) 342 | { 343 | trueContigLengths.push_back(positions[i][j].sourceEnd - positions[i][j].sourceStart); 344 | 345 | for(k = positions[i][j].targetStart; k < positions[i][j].targetEnd; k ++) 346 | genome[positions[i][j].targetID][k] = 1; 347 | 348 | if(max < positions[i][j].sourceEnd - positions[i][j].sourceStart) 349 | max = positions[i][j].sourceEnd - positions[i][j].sourceStart; 350 | 351 | alignedBases = positions[i][j].alignedBases; 352 | totalBases = positions[i][j].targetEnd - positions[i][j].targetStart + positions[i][j].targetGap; 353 | identity.push_back((double) alignedBases * trueContigLengths[trueContigLengths.size() - 1] / totalBases); 354 | 355 | errors ++; 356 | } 357 | } 358 | if(errors == 0) 359 | ;//misassembly 
++;//do nothing 360 | else if(errors == 1) 361 | misassembly ++; 362 | else if(errors >= 2) 363 | misassembly = misassembly + errors - 1; 364 | end:; 365 | } 366 | 367 | out.open(argv); 368 | if(out.is_open() == 0) {cout << "CANNOT OPEN FILE!"; exit(-1);} 369 | out << setw(21) << left << "#contigs" << initContigs.size() << endl; 370 | out << setw(21) << left << "#true contigs" << trueContigLengths.size() << endl; 371 | 372 | for(i = 0, totalLength = 0; i < trueContigLengths.size(); i ++) 373 | totalLength = totalLength + trueContigLengths[i]; 374 | sort(trueContigLengths.begin(), trueContigLengths.end()); 375 | for(i = trueContigLengths.size() - 1, sum = 0; i >= 0; i --) 376 | { 377 | sum = sum + trueContigLengths[i]; 378 | if(sum > totalLength / 2) break; 379 | } 380 | out << setw(21) << left << "N50" << trueContigLengths[i] << endl; 381 | 382 | for(i = 0, coveredLength = 0; i < genome.size(); i ++) 383 | for(j = 0; j < genome[i].size(); j ++) 384 | if(genome[i][j]) coveredLength ++; 385 | out << setw(21) << left << "covered length" << coveredLength << endl; 386 | 387 | for(i = 0, contigBases = 0; i < initContigs.size(); i ++) 388 | contigBases = contigBases + initContigs[i].size(); 389 | 390 | out << setw(21) << left << "average length" << totalLength / trueContigLengths.size() << endl; 391 | 392 | out << setw(21) << left << "maximum length" << max << endl; 393 | 394 | out << setw(21) << left << "MPMB" << (double) misassembly / ((double) (contigBases) / 1000000) << endl; 395 | 396 | for(i = 0, totalIdentity = 0; i < identity.size(); i ++) 397 | totalIdentity = totalIdentity + identity[i]; 398 | out << setw(21) << left << "average identity" << totalIdentity / totalLength << endl; 399 | } 400 | 401 | void formalizeGenome(char argv[]) 402 | { 403 | ifstream in; 404 | ofstream out; 405 | string buf; 406 | vector > genome; 407 | vector g; 408 | int i, j; 409 | 410 | in.open(argv); 411 | if(in.is_open()) 412 | { 413 | while(in.good()) 414 | { 415 | getline(in, buf); 
416 | if(buf[0] == 0) break; 417 | 418 | if(buf[0] == '>') 419 | genome.push_back(g); 420 | else 421 | for(i = 0; i < buf.size(); i ++) 422 | genome[genome.size() - 1].push_back(buf[i]); 423 | } 424 | } 425 | else 426 | { 427 | cout << "CANNOT OPEN FILE!" << endl; 428 | exit(-1); 429 | } 430 | 431 | out.open("etmp/_genome.fa"); 432 | if(out.is_open()) 433 | { 434 | for(i = 0; i < genome.size(); i ++) 435 | { 436 | out << ">" << i << endl; 437 | for(j = 0; j < genome[i].size(); j ++) 438 | { 439 | out << genome[i][j]; 440 | if(j == genome[i].size() - 1 || (j + 1) % 60 == 0) 441 | out << endl; 442 | } 443 | } 444 | } 445 | else 446 | { 447 | cout << "CANNOT OPEN FILE!" << endl; 448 | exit(-1); 449 | } 450 | } 451 | 452 | void formalizeContigs(char argv[]) 453 | { 454 | ifstream in; 455 | ofstream out; 456 | string buf; 457 | vector > contigs; 458 | vector c; 459 | vector > contigsBuf; 460 | int i, j, k, seqID = 0; 461 | 462 | in.open(argv); 463 | if(in.is_open()) 464 | { 465 | while(in.good()) 466 | { 467 | getline(in, buf); 468 | if(buf[0] == 0) break; 469 | 470 | if(buf[0] == '>') 471 | contigs.push_back(c); 472 | else 473 | for(i = 0; i < buf.size(); i ++) 474 | contigs[contigs.size() - 1].push_back(buf[i]); 475 | } 476 | } 477 | else 478 | { 479 | cout << "CANNOT OPEN FILE!" 
<< endl; 480 | exit(-1); 481 | } 482 | 483 | out.open("etmp/_contigs.fa"); 484 | if(out.is_open()) 485 | { 486 | for(i = 0; i < contigs.size(); i ++) 487 | { 488 | if(contigs[i].size() >= CUTOFF) 489 | { 490 | if(contigs[i].size() < SIZE) 491 | { 492 | out << ">" << seqID ++ << endl; 493 | for(j = 0; j < contigs[i].size(); j ++) 494 | { 495 | out << contigs[i][j]; 496 | if((j + 1) % 60 == 0 || j == contigs[i].size() - 1) 497 | out << endl; 498 | } 499 | } 500 | else 501 | { 502 | contigsBuf.clear(); 503 | contigsBuf.push_back(c); 504 | for(j = 0; j < contigs[i].size(); j ++) 505 | { 506 | contigsBuf[contigsBuf.size() - 1].push_back(contigs[i][j]); 507 | if((j + 1) % SIZE == 0 && j < contigs[i].size() - 1) contigsBuf.push_back(c); 508 | } 509 | 510 | for(j = 0; j < contigsBuf.size(); j ++) 511 | { 512 | out << ">" << seqID << "." << j << endl; 513 | for(k = 0; k < contigsBuf[j].size(); k ++) 514 | { 515 | out << contigsBuf[j][k]; 516 | if((k + 1) % 60 == 0 || k == contigsBuf[j].size() - 1) 517 | out << endl; 518 | } 519 | } 520 | 521 | seqID ++; 522 | } 523 | } 524 | } 525 | } 526 | else 527 | { 528 | cout << "CANNOT OPEN FILE!" 
<< endl; 529 | exit(-1); 530 | } 531 | } 532 | 533 | void makeAlignment() 534 | { 535 | if(FASTMAP == 1) 536 | system("blat etmp/_genome.fa etmp/_contigs.fa -noHead etmp/_contigs_genome.psl -fastMap > blat_doc.txt"); 537 | else 538 | system("blat etmp/_genome.fa etmp/_contigs.fa -noHead etmp/_contigs_genome.psl > blat_doc.txt"); 539 | } 540 | 541 | void print() 542 | { 543 | cout << "Eval-AlignGraph arg1, arg2, arg3" << endl; 544 | cout << "arg1 = file of target genome in FASTA format" << endl; 545 | cout << "arg2 = file of contigs in FASTA format" << endl; 546 | cout << "arg3 = file name for outputted statistics" << endl; 547 | } 548 | 549 | int main(int argc, char * argv[]) 550 | { 551 | vector > positions; 552 | vector > genome; 553 | vector > initContigs; 554 | 555 | cout << "Eval-AlignGraph: Evaluation tool distributed with AlignGraph" << endl; 556 | cout << "By Ergude Bao, CS Department, UC-Riverside. All Rights Reserved" << endl << endl; 557 | 558 | if(argc != 4) {print(); return 0;} 559 | 560 | system("test -d \"etmp\"; t=$?; if [ $t -eq 1 ]; then mkdir etmp; fi"); 561 | formalizeGenome(argv[1]); 562 | formalizeContigs(argv[2]); 563 | makeAlignment(); 564 | cout << "(1) Alignment finished" << endl; 565 | 566 | genome = loadGenome(); 567 | initContigs = loadContigs(); 568 | positions = loadContigsAlignment(initContigs.size()); 569 | analyze(argv[3], genome, initContigs, positions); 570 | cout << "(2) Statistics generated and done :-)" << endl; 571 | } 572 | 573 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### LATEST NEWS 2 | [AlignGraph2](https://github.com/Godotcoffee/AlignGraph2) for improving PacBio long read assemblies is available! Pleae feel free to have a try and give us suggestions and/or comments by sending Shien Huang an email: 19112201@bjtu.edu.cn. 
3 | 4 | ### Overview 5 | AlignGraph is a software tool that extends and joins contigs or scaffolds by reassembling them with help provided by a reference genome of a closely related organism. 6 | 7 | ### Copyright 8 | AlignGraph is released under the [Artistic License 2.0](http://opensource.org/licenses/Artistic-2.0). 9 | 10 | ### How to cite AlignGraph? 11 | If you use AlignGraph, please cite the following paper: 12 | Bao E, Jiang T, Girke T (2014) AlignGraph: algorithm for secondary de novo genome assembly guided by closely related references. Bioinformatics: [epub](http://www.hubmed.org/display.cgi?uids=24932000). 13 | 14 | ### Short manual 15 | 1. System requirements 16 | 17 | AlignGraph is suitable for 32-bit or 64-bit machines with Linux operating systems. At least 4GB of system memory is recommended for assembling larger data sets. 18 | 19 | 2. Installation 20 | 21 | Aligners [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) and [BLAT](http://genome.ucsc.edu/FAQ/FAQblat.html)/[PBLAT](http://icebert.github.io/pblat/)/[NUCMER](http://mummer.sourceforge.net/) (BLAT's version should be v34 or below) are required to run AlignGraph. 22 | * To use Bowtie2 and BLAT/PBLAT/NUCMER, add them to your $PATH: `export PATH=PATH2BOWTIE2:$PATH` and `export PATH=PATH2BLAT/PBLAT/NUCMER:$PATH`. 23 | * The downloaded AlignGraph.cpp file can be compiled with the command `g++ -o AlignGraph AlignGraph.cpp -lpthread`. 24 | 25 | 3. Inputs 26 | * Paired-end DNA reads in FASTA format. 27 | * De novo contigs or scaffolds assembled by any de novo DNA-Seq assembler (Velvet, ABySS, ALLPATHS-LG, SOAPdenovo, etc.). 28 | * Reference genome from a closely related species. 29 | 30 | 4. 
Using AlignGraph 31 | 32 | ``` 33 | AlignGraph --read1 reads_1.fa --read2 reads_2.fa --contig contigs.fa --genome genome.fa --distanceLow distanceLow --distanceHigh distanceHigh --extendedContig extendedContigs.fa --remainingContig remainingContigs.fa [--kMer k --insertVariation insertVariation --coverage coverage --part p --fastMap --ratioCheck --iterativeMap --misassemblyRemoval --resume] 34 | ``` 35 | 36 | Inputs: 37 | --read1 is the first pair of PE DNA reads in fasta format. 38 | --read2 is the second pair of PE DNA reads in fasta format. 39 | --contig is the initial contigs/scaffolds in fasta format. 40 | --genome is the reference genome in fasta format. 41 | --distanceLow is the lower bound of alignment distance between the first and second pairs of PE DNA reads (recommended: max{insert length - 1000, single read length}). 42 | --distanceHigh is the upper bound of alignment distance between the first and second pairs of PE DNA reads (recommended: insert length + 1000). 43 | Outputs: 44 | --extendedContig is the extended contig/scaffold file in fasta format. 45 | --remainingContig is the not extended initial contig/scaffold file in fasta format. 46 | Options: 47 | --kMer is the k-mer size (default: 5). 48 | --insertVariation is the standard variation of insert length (default: 100). 49 | --coverage is the minimum coverage to keep a path in de Bruijn graph (default: 20). 50 | --part is the number of parts a chromosome is divided into when it is loaded to reduce memory requirement (default: 1). 51 | --fastMap calls NUCMER to make fast but less sensitive and accurate contig alignment instead of BLAT (default: none). 52 | --ratioCheck checks read alignment ratio to the reference beforehand and warns if the ratio is too low; may take a little more time (default: none). 53 | --iterativeMap aligns reads to one chromosome and then another rather than directly to the genome, which increases sensitivity but loses precision (default: none). 
54 | --misassemblyRemoval detects and then breaks at or removes misassembled regions (default: none). 55 | --resume resumes the previous unfinished run from its saved checkpoints (default: none). 56 | 57 | 5. Outputs 58 | * Extended contigs or scaffolds in FASTA format. The format of the specification for each extended contig or scaffold (the string following the '>' of FASTA file) is: `AlignGraphX @ chromosomeID : contig/scaffoldID ; contig/scaffoldID ; contig/scaffoldID ... : partY`, where chromosomeID is the specification of the reference chromosome used to generate the extended contig or scaffold, X is a number starting from 0 to identify the extended contig or scaffold for each reference chromosome, and contig/scaffoldIDs are the specifications of the extendable contigs or scaffolds. If misassemblyRemoval is specified, partY shows the Y-th subcontig or subscaffold of the misassembled contig or scaffold split at misassemblies. 59 | * Remaining contigs or scaffolds not extended in FASTA format. 60 | 61 | 6. Example commandline 62 | 63 | Given PE read files reads_1.fa and reads_2.fa with single read length 100 bp and insert length 500 bp, --distanceLow could be max{500 - 1000, 100} = 100 and --distanceHigh could be 500 + 1000 = 1500, so the simplest commandline with pre-assembled contigs file contigs.fa and reference genome genome.fa should be: 64 | 65 | ``` 66 | AlignGraph --read1 reads_1.fa --read2 reads_2.fa --contig contigs.fa --genome genome.fa --distanceLow 100 --distanceHigh 1500 --extendedContig extendedContigs.fa --remainingContig remainingContigs.fa 67 | ``` 68 | 69 | ### Eval-AlignGraph 70 | Eval-AlignGraph is the evaluation tool distributed with AlignGraph to generate statistics of the contigs or scaffolds. 
By default, the contigs or scaffolds are aligned to the target genome by BLAT without the -fastMap option. To avoid very long running times on some data, at the cost of a small loss in sensitivity, -fastMap can be enabled by changing the value of the FASTMAP macro from 0 to 1 in the source code. 71 | 72 | ### FAQs 73 | 1. How can I input multiple libraries with different insert lengths? 74 | 75 | Suppose you have one library x1.fa/y1.fa with insert length I1, and another library x2.fa/y2.fa with insert length I2, then you can simply combine x1.fa and x2.fa into x.fa, combine y1.fa and y2.fa into y.fa, and then input x.fa and y.fa. When you specify --distanceLow, let insert length be min{I1, I2}; for --distanceHigh, let insert length be max{I1, I2}. 76 | 77 | 2. Can I use mate pair libraries? 78 | 79 | You can but it is not recommended, since good alignment cannot be guaranteed with the very short reads and the very large insert length. 80 | 81 | 3. Which aligner do I need to put to $PATH: BLAT, PBLAT or NUCMER? 82 | 83 | BLAT and PBLAT are more sensitive and accurate than NUCMER, and PBLAT is the parallelized and much faster version of BLAT, so PBLAT should be the first choice. However, the current PBLAT is not stable, so it is highly recommended to put both PBLAT and BLAT to your $PATH, so that AlignGraph can call BLAT instead when PBLAT fails somewhere. If PBLAT and BLAT are too slow to finish, you would have to switch to NUCMER and put it to your $PATH (remember to also specify the --fastMap option on the command line). 84 | 85 | 4. Why do I get the error "BLAT CALL FAILED!" even if I have put BLAT to my $PATH? 86 | 87 | The current version of BLAT (v35) is not compatible with AlignGraph, so you would have to use an earlier version (preferably v34) to avoid this error. 88 | 89 | 5. Why are few or no extensions made by AlignGraph? 
90 | 91 | How many extensions AlignGraph can make depends mainly on factors such as how close the reference genome and the target genome are, and how well the pre-assembly worked. Therefore, it is possible that few or no extensions are made, either because the reference genome is not so similar to the target genome, or because the upstream assemblies are already good enough for the current version of AlignGraph. We are currently working on improving AlignGraph's performance, so that more extensions can be made with a relatively different reference genome, but this may take some time. 92 | 93 | 6. How many threads are used for Bowtie2 and PBLAT? 94 | 95 | 8 threads are used. Currently users cannot make changes to this, since this is a moderate choice for either single CPU machines (overhead for parallelization would not be too large) or multiple CPU machines. 96 | 97 | ### Erratum 98 | There is a small error in page i322. See [erratum](http://biocluster.ucr.edu/~ebao/erratum.pdf) for more details. 99 | 100 | --------------------------------------------------------------------------------