├── .gitignore ├── LICENSE ├── Makefile ├── NGmerge.c ├── NGmerge.h ├── README.md ├── UserGuide.pdf ├── VERSION ├── figures ├── figure1.png └── figure2.png └── qual_profile.txt /.gitignore: -------------------------------------------------------------------------------- 1 | NGmerge 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (C) 2017 John M. Gaspar (jsh58@wildcats.unh.edu) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PREFIX=/usr/local 2 | DESTDIR= 3 | CC?=gcc 4 | VERSION?=0.3 5 | DISTNAME=NGmerge-${VERSION} 6 | CFLAGS?=-fopenmp 7 | 8 | NGmerge: NGmerge.c NGmerge.h 9 | $(CC) -g -Wall -std=gnu99 -O2 ${CFLAGS} -o NGmerge NGmerge.c -lz ${LDFLAGS} 10 | 11 | install: NGmerge 12 | @mkdir -p $(DESTDIR)$(PREFIX)/bin 13 | cp NGmerge $(DESTDIR)$(PREFIX)/bin 14 | 15 | clean: 16 | -@rm NGmerge 2>/dev/null || true 17 | -@rm -rf ${DISTNAME}* 18 | 19 | dist: 20 | rm -rf ${DISTNAME}* 21 | mkdir ${DISTNAME} 22 | cp -r `ls | grep -v ${DISTNAME}` ${DISTNAME} 23 | tar czvf ${DISTNAME}.tar.gz ${DISTNAME} 24 | -------------------------------------------------------------------------------- /NGmerge.c: -------------------------------------------------------------------------------- 1 | /* 2 | John M. Gaspar (jsh58@wildcats.unh.edu) 3 | April 2015 (updated 2016, 2017) 4 | 5 | Analyzing paired-end reads for overlaps. Two modes: 6 | - 'stitch': producing a single, merged read for reads 7 | with sufficient overlaps 8 | - 'adapter-removal': removing adapters (3' overhangs 9 | of stitched alignment) from individual reads 10 | 11 | Version 0.3 12 | */ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "NGmerge.h" 22 | 23 | /* void printVersion() 24 | * Print version and copyright. 25 | */ 26 | void printVersion(void) { 27 | fprintf(stderr, "NGmerge, version %s\n", VERSION); 28 | fprintf(stderr, "Copyright (C) 2017 John M. Gaspar (jsh58@wildcats.unh.edu)\n"); 29 | exit(-1); 30 | } 31 | 32 | /* void usage() 33 | * Prints usage information. 
34 | */ 35 | void usage(int exitval) { 36 | fprintf(stderr, "Usage: NGmerge {-%c -%c ", FIRST, SECOND); 37 | fprintf(stderr, " -%c } [optional arguments]\n", OUTFILE); 38 | fprintf(stderr, "Required arguments:\n"); 39 | fprintf(stderr, " -%c Input FASTQ file with reads from forward direction\n", FIRST); 40 | fprintf(stderr, " -%c Input FASTQ file with reads from reverse direction\n", SECOND); 41 | fprintf(stderr, " -%c Output FASTQ file(s):\n", OUTFILE); 42 | fprintf(stderr, " - in 'stitch' mode (def.), the file of merged reads\n"); 43 | fprintf(stderr, " - in 'adapter-removal' mode (-%c), the output files\n", ADAPTOPT); 44 | fprintf(stderr, " will be %s and %s\n", ONEEXT, TWOEXT); 45 | fprintf(stderr, "Alignment parameters:\n"); 46 | fprintf(stderr, " -%c Minimum overlap of the paired-end reads (def. %d)\n", OVERLAP, DEFOVER); 47 | fprintf(stderr, " -%c Mismatches to allow in the overlapped region\n", MISMATCH); 48 | fprintf(stderr, " (a fraction of the overlap length; def. %.2f)\n", DEFMISM); 49 | fprintf(stderr, " -%c Use 'adapter-removal' mode (also sets -%c option)\n", ADAPTOPT, DOVEOPT); 50 | fprintf(stderr, " -%c Option to check for dovetailing (with 3' overhangs)\n", DOVEOPT); 51 | fprintf(stderr, " -%c Minimum overlap of dovetailed alignments (def. 
%d)\n", DOVEOVER, DEFDOVE); 52 | fprintf(stderr, " -%c Option to produce shortest stitched read\n", MAXOPT); 53 | fprintf(stderr, "I/O options:\n"); 54 | fprintf(stderr, " -%c Log file for stitching results of each read pair\n", LOGFILE); 55 | fprintf(stderr, " -%c FASTQ files for reads that failed stitching\n", UNFILE); 56 | fprintf(stderr, " (output as %s and %s)\n", ONEEXT, TWOEXT); 57 | fprintf(stderr, " -%c Log file for dovetailed reads (adapter sequences)\n", DOVEFILE); 58 | fprintf(stderr, " -%c Log file for formatted alignments of merged reads\n", ALNFILE); 59 | fprintf(stderr, " -%c/-%c Option to gzip (-%c) or not (-%c) FASTQ output(s)\n", GZOPT, UNGZOPT, GZOPT, UNGZOPT); 60 | fprintf(stderr, " -%c Option to produce interleaved FASTQ output(s)\n", INTEROPT); 61 | fprintf(stderr, " -%c Use given error profile for merged qual scores\n", QUALFILE); 62 | fprintf(stderr, " -%c Use 'fastq-join' method for merged qual scores\n", FJOINOPT); 63 | fprintf(stderr, " -%c FASTQ quality offset (def. %d)\n", QUALITY, OFFSET); 64 | fprintf(stderr, " -%c Maximum input quality score (0-based; def. %d)\n", SETQUAL, MAXQUAL); 65 | fprintf(stderr, " -%c Number of threads to use (def. %d)\n", THREADS, DEFTHR); 66 | fprintf(stderr, " -%c Option to print status updates/counts to stderr\n", VERBOSE); 67 | exit(exitval); 68 | } 69 | 70 | /* int error() 71 | * Prints an error message. 72 | */ 73 | int error(char* msg, enum errCode err) { 74 | fprintf(stderr, "Error! %s%s\n", msg, errMsg[err]); 75 | return -1; 76 | } 77 | 78 | /* void* memalloc() 79 | * Allocates a heap block. 80 | */ 81 | void* memalloc(int size) { 82 | void* ans = malloc(size); 83 | if (ans == NULL) 84 | exit(error("", ERRMEM)); 85 | return ans; 86 | } 87 | 88 | /* float getFloat(char*) 89 | * Converts the given char* to a float. 
90 | */ 91 | float getFloat(char* in) { 92 | char* endptr; 93 | float ans = strtof(in, &endptr); 94 | if (*endptr != '\0') 95 | exit(error(in, ERRFLOAT)); 96 | return ans; 97 | } 98 | 99 | /* int getInt(char*) 100 | * Converts the given char* to an int. 101 | */ 102 | int getInt(char* in) { 103 | char* endptr; 104 | int ans = (int) strtol(in, &endptr, 10); 105 | if (*endptr != '\0') 106 | exit(error(in, ERRINT)); 107 | return ans; 108 | } 109 | 110 | /* char rc(char) 111 | * Returns the complement of the given base. 112 | */ 113 | char rc(char in) { 114 | char out; 115 | if (in == 'A') out = 'T'; 116 | else if (in == 'T') out = 'A'; 117 | else if (in == 'C') out = 'G'; 118 | else if (in == 'G') out = 'C'; 119 | else if (in == 'N') out = 'N'; 120 | else { 121 | char msg[4] = "' '"; 122 | msg[1] = in; 123 | exit(error(msg, ERRUNK)); 124 | } 125 | return out; 126 | } 127 | 128 | /* char* getLine() 129 | * Reads the next line from a file. 130 | */ 131 | char* getLine(char* line, int size, File in, bool gz) { 132 | if (gz) 133 | return gzgets(in.gzf, line, size); 134 | else 135 | return fgets(line, size, in.f); 136 | } 137 | 138 | /* void checkHeaders() 139 | * Ensure headers match (up to first space character); 140 | * create consensus header. 141 | */ 142 | void checkHeaders(char* head1, char* head2, char* header) { 143 | bool ok = false; // match boolean 144 | int j; 145 | for (j = 0; head1[j] != '\n' && head1[j] != '\0'; j++) { 146 | if (head1[j] != head2[j]) { 147 | if (ok) 148 | break; 149 | for ( ; head1[j] != '\n' && head1[j] != '\0' 150 | && head1[j] != ' '; j++) ; 151 | head1[j] = '\0'; // trim head1 for err msg 152 | exit(error(head1, ERRHEAD)); 153 | } else if (head1[j] == ' ') 154 | ok = true; // headers match 155 | header[j] = head1[j]; 156 | } 157 | if (header[j - 1] == ' ') 158 | header[j - 1] = '\0'; // removing trailing space 159 | else 160 | header[j] = '\0'; 161 | } 162 | 163 | /* void checkQual() 164 | * Check given quality string for offset errors. 
165 | */ 166 | void checkQual(char* qual, int len, int offset, 167 | int maxQual) { 168 | for (int i = 0; i < len; i++) 169 | // error if qual < 0 or qual > maxQual 170 | if (qual[i] < offset || qual[i] > offset + maxQual) { 171 | char* msg = (char*) memalloc(MAX_SIZE); 172 | sprintf(msg, "(range [0, %d], offset %d) '%c'", 173 | maxQual, offset, qual[i]); 174 | exit(error(msg, ERROFFSET)); 175 | } 176 | } 177 | 178 | /* void processSeq() 179 | * Process the given sequence; save length; 180 | * for 2nd read, save reversed seq/qual. 181 | */ 182 | void processSeq(char** read, int* len, bool i, 183 | int j, int offset, int maxQual) { 184 | 185 | // remove new-line character and save length 186 | int k; 187 | for (k = 0; read[j][k] != '\n' && read[j][k] != '\0'; k++) ; 188 | read[j][k] = '\0'; 189 | if (j == SEQ) 190 | *len = k; // save read length 191 | else if (k != *len) 192 | exit(error("", ERRQUAL)); // seq/qual length mismatch 193 | 194 | // for 2nd read (i == true), save revComp(seq) or rev(qual) 195 | if (i) { 196 | int dest = j + EXTRA; // save to 'extra' field of read2 197 | int m = 0; 198 | if (j == SEQ) { 199 | dest++; // increment b/c of fastq 'plus' line 200 | for (k--; k > -1; k--) 201 | read[dest][m++] = rc(read[j][k]); 202 | } else 203 | for (k--; k > -1; k--) 204 | read[dest][m++] = read[j][k]; 205 | read[dest][m] = '\0'; 206 | } else if (j == SEQ) 207 | // check 1st read's sequence for non-ACGTN chars 208 | for (int m = 0; m < k; m++) 209 | rc(read[j][m]); 210 | 211 | // check quality scores 212 | if (j == QUAL) 213 | checkQual(read[j], k, offset, maxQual); 214 | } 215 | 216 | /* bool loadReads() 217 | * Load a pair of reads. Check formatting, determine 218 | * consensus header. Return false on EOF. 
219 | */ 220 | bool loadReads(File in1, File in2, char** read1, char** read2, 221 | char* header, int* len1, int* len2, int offset, 222 | int maxQual, bool gz1, bool gz2) { 223 | 224 | // load both reads from input files (LOCK) 225 | bool flag = false; // boolean for EOF 226 | #pragma omp critical 227 | for (int i = 0; i < 2; i++) { 228 | File in = in1; 229 | char** read = read1; 230 | bool gz = gz1; 231 | if (i) { 232 | in = in2; 233 | read = read2; 234 | gz = gz2; 235 | } 236 | 237 | // load read (4 lines) 238 | for (int j = 0; j < FASTQ; j++) 239 | if (getLine(read[j], MAX_SIZE, in, gz) == NULL) { 240 | if (j == 0) { 241 | if (i == 0) { 242 | flag = true; // EOF 243 | break; 244 | } else { 245 | int k = 0; 246 | for ( ; read1[HEAD][k] != '\n' && read1[HEAD][k] != '\0' 247 | && read1[HEAD][k] != ' '; k++) ; 248 | read1[HEAD][k] = '\0'; // trim header for err msg 249 | exit(error(read1[HEAD], ERRHEAD)); 250 | } 251 | } else 252 | exit(error("", ERRSEQ)); 253 | } 254 | if (flag) 255 | break; 256 | 257 | } // (UNLOCK) 258 | 259 | if (flag) 260 | return false; // EOF 261 | 262 | // check fastq formatting 263 | if (read1[HEAD][0] != BEGIN || read1[PLUS][0] != PLUSCHAR 264 | || read2[HEAD][0] != BEGIN || read2[PLUS][0] != PLUSCHAR) 265 | exit(error("", ERRFASTQ)); 266 | 267 | // process sequence/quality lines 268 | processSeq(read1, len1, false, SEQ, offset, maxQual); 269 | processSeq(read1, len1, false, QUAL, offset, maxQual); 270 | processSeq(read2, len2, true, SEQ, offset, maxQual); 271 | processSeq(read2, len2, true, QUAL, offset, maxQual); 272 | 273 | // check headers 274 | checkHeaders(read1[HEAD], read2[HEAD], header); 275 | 276 | return true; 277 | } 278 | 279 | /* float compare() 280 | * Compare two sequences. Return the fraction mismatch. 
281 | */ 282 | float compare(char* seq1, char* seq2, int length, 283 | float mismatch, int overlap) { 284 | int mis = 0; // number of mismatches 285 | int len = length; // length of overlap, not counting Ns 286 | float allow = len * mismatch; 287 | for (int i = 0; i < length; i++) { 288 | // do not count Ns 289 | if (seq1[i] == 'N' || seq2[i] == 'N') { 290 | if (--len < overlap || mis > len * mismatch) 291 | return NOTMATCH; 292 | allow = len * mismatch; 293 | } else if (seq1[i] != seq2[i] && ++mis > allow) 294 | return NOTMATCH; 295 | } 296 | return (float) mis / len; 297 | } 298 | 299 | /* int findPos() 300 | * Find optimal overlapping position. 301 | * Currently, quality scores are not considered 302 | * (e.g. decreased penalty for a low-quality mismatch). 303 | */ 304 | int findPos (char* seq1, char* seq2, char* qual1, 305 | char* qual2, int len1, int len2, int overlap, 306 | bool dovetail, int doveOverlap, float mismatch, 307 | bool maxLen, float* best) { 308 | 309 | // check for regular (non-dovetailed) alignments 310 | int pos = len1 - overlap + 1; // position of match 311 | int i = len1 - overlap; 312 | for ( ; i > -1 && len1 - i <= len2; i--) { 313 | // align sequences 314 | float res = compare(seq1 + i, seq2, len1 - i, 315 | mismatch, overlap); 316 | 317 | // compare result 318 | if (res < *best || (res == *best && !maxLen)) { 319 | *best = res; 320 | pos = i; 321 | } 322 | if (res == 0.0f && maxLen) 323 | return pos; // shortcut for exact match 324 | } 325 | 326 | // check for dovetailing 327 | if (dovetail) { 328 | 329 | // if no regular alignment, reset i 330 | if (i == len1 - overlap) 331 | i = (len1 > len2 ? 
len1 - len2 - 1 : -1); 332 | 333 | // continue decrementing i 334 | for ( ; ; i--) { 335 | float res = NOTMATCH; 336 | if (i >= 0) { 337 | // read1 is longer, with 3' overhang 338 | if (len2 < doveOverlap) 339 | break; 340 | res = compare(seq1 + i, seq2, len2, 341 | mismatch, doveOverlap); 342 | 343 | } else if (len1 < len2 + i) { 344 | // read2 has 3' overhang, read1 determines overlap 345 | if (len1 < doveOverlap) 346 | break; 347 | res = compare(seq1, seq2 - i, len1, 348 | mismatch, doveOverlap); 349 | 350 | } else { 351 | // read2 has 3' overhang and determines overlap 352 | if (len2 + i < doveOverlap) 353 | break; 354 | res = compare(seq1, seq2 - i, len2 + i, 355 | mismatch, doveOverlap); 356 | } 357 | 358 | // compare result 359 | if (res < *best || (res == *best && !maxLen)) { 360 | *best = res; 361 | pos = i; 362 | } 363 | if (res == 0.0f && maxLen) 364 | return pos; // shortcut for exact match 365 | } 366 | } 367 | 368 | return pos; 369 | } 370 | 371 | /* void printDove() 372 | * Log 3' overhangs of dovetailed reads. 373 | */ 374 | void printDove(File dove, char* header, char** read1, 375 | char** read2, int len1, int len2, int pos, 376 | omp_lock_t* lock) { 377 | if (len1 > len2 + pos || pos < 0) { 378 | omp_set_lock(lock); 379 | fprintf(dove.f, "%s\t%s\t%s\n", header + 1, 380 | len1 > len2 + pos ? read1[SEQ] + len2 + pos : "-", 381 | pos < 0 ? read2[SEQ] + len2 + pos : "-"); 382 | omp_unset_lock(lock); 383 | } 384 | } 385 | 386 | /* void printGZNoAdapt() 387 | * Print the reads minus adapters (gzip output). 
388 | */ 389 | void printGZNoAdapt(gzFile out1, gzFile out2, 390 | char** read1, char** read2, int end1, int end2) { 391 | 392 | // print fwd read 393 | gzprintf(out1, "%s", read1[HEAD]); 394 | for (int i = 0; i < end1; i++) 395 | gzputc(out1, read1[SEQ][i]); 396 | gzprintf(out1, "\n%s", read1[PLUS]); 397 | for (int i = 0; i < end1; i++) 398 | gzputc(out1, read1[QUAL][i]); 399 | gzputc(out1, '\n'); 400 | 401 | // print rev read 402 | gzprintf(out2, "%s", read2[HEAD]); 403 | for (int i = 0; i < end2; i++) 404 | gzputc(out2, read2[SEQ][i]); 405 | gzprintf(out2, "\n%s", read2[PLUS]); 406 | for (int i = 0; i < end2; i++) 407 | gzputc(out2, read2[QUAL][i]); 408 | gzputc(out2, '\n'); 409 | } 410 | 411 | /* void printNoAdapt() 412 | * Print the reads minus adapters. 413 | */ 414 | void printNoAdapt(FILE* out1, FILE* out2, char** read1, 415 | char** read2, int end1, int end2) { 416 | 417 | // print fwd read 418 | fprintf(out1, "%s", read1[HEAD]); 419 | for (int i = 0; i < end1; i++) 420 | fputc(read1[SEQ][i], out1); 421 | fprintf(out1, "\n%s", read1[PLUS]); 422 | for (int i = 0; i < end1; i++) 423 | fputc(read1[QUAL][i], out1); 424 | fputc('\n', out1); 425 | 426 | // print rev read 427 | fprintf(out2, "%s", read2[HEAD]); 428 | for (int i = 0; i < end2; i++) 429 | fputc(read2[SEQ][i], out2); 430 | fprintf(out2, "\n%s", read2[PLUS]); 431 | for (int i = 0; i < end2; i++) 432 | fputc(read2[QUAL][i], out2); 433 | fputc('\n', out2); 434 | } 435 | 436 | /* bool printResAdapt() 437 | * Control printing of reads minus adapters. 438 | * Return 1 if adapter found, else 0. 
439 | */ 440 | bool printResAdapt(File out1, File out2, File dove, 441 | bool doveOpt, char* header, char** read1, char** read2, 442 | int len1, int len2, int pos, float best, bool gz, 443 | omp_lock_t* lock) { 444 | 445 | bool adapter = false; 446 | int end1 = len1; 447 | int end2 = len2; 448 | 449 | // if found, identify locations of adapters 450 | if (len1 > len2 + pos || pos < 0) { 451 | adapter = true; 452 | if (len1 > len2 + pos) 453 | end1 = len2 + pos; 454 | if (pos < 0) 455 | end2 += pos; 456 | if (doveOpt) 457 | printDove(dove, header, read1, read2, 458 | len1, len2, pos, lock + DOVE); 459 | } 460 | 461 | // print output 462 | omp_set_lock(lock + OUT); 463 | if (gz) 464 | printGZNoAdapt(out1.gzf, out2.gzf, read1, read2, 465 | end1, end2); 466 | else 467 | printNoAdapt(out1.f, out2.f, read1, read2, 468 | end1, end2); 469 | omp_unset_lock(lock + OUT); 470 | 471 | return adapter; 472 | } 473 | 474 | /* void printAln2() 475 | * Printing details of stitch mismatches. 476 | */ 477 | void printAln2(File aln, char* header, char** read1, 478 | char** read2, int len1, int len2, int pos) { 479 | int i = pos; 480 | int j = 0; 481 | if (pos < 0) { 482 | j = -pos; 483 | i = 0; 484 | } 485 | while (i < len1 && j < len2) { 486 | if (read1[SEQ][i] == 'N' || read2[SEQ + EXTRA + 1][j] == 'N' 487 | || read1[SEQ][i] != read2[SEQ + EXTRA + 1][j]) 488 | fprintf(aln.f, "%s\t%d\t%c\t%c\t%c\t%c\n", 489 | header + 1, i, read1[SEQ][i], read1[QUAL][i], 490 | read2[SEQ + EXTRA + 1][j], read2[QUAL + EXTRA][j]); 491 | i++; 492 | j++; 493 | } 494 | } 495 | 496 | /* void printAln() 497 | * Print nicely formatted alignment of stitched reads. 
498 | */ 499 | void printAln(File aln, char* header, char** read1, 500 | char** read2, int len1, int len2, int pos) { 501 | fprintf(aln.f, "%s\n", header + 1); 502 | 503 | // print sequence alignment 504 | fprintf(aln.f, "seq_R1: "); 505 | for (int i = 0; i > pos; i--) 506 | fputc(' ', aln.f); 507 | fprintf(aln.f, "%s\n", read1[SEQ]); 508 | 509 | // print '|' for matches, ':' for Ns 510 | fprintf(aln.f, " "); 511 | int i; 512 | for (i = 0; i < abs(pos); i++) 513 | fputc(' ', aln.f); 514 | int j = 0; 515 | if (pos < 0) { 516 | j = -pos; 517 | i = 0; 518 | } 519 | while (i < len1 && j < len2) { 520 | fputc((read1[SEQ][i] == 'N' || read2[SEQ + EXTRA + 1][j] == 'N') ? 521 | ':' : (read1[SEQ][i] == read2[SEQ + EXTRA + 1][j] ? 522 | '|' : ' '), aln.f); 523 | i++; 524 | j++; 525 | } 526 | fputc('\n', aln.f); 527 | 528 | fprintf(aln.f, "seq_R2: "); 529 | for (int i = 0; i < pos; i++) 530 | fputc(' ', aln.f); 531 | fprintf(aln.f, "%s\n\n", read2[SEQ + EXTRA + 1]); 532 | 533 | // print quality scores 534 | fprintf(aln.f, "qual_R1: "); 535 | for (int i = 0; i > pos; i--) 536 | fputc(' ', aln.f); 537 | fprintf(aln.f, "%s\n", read1[QUAL]); 538 | fprintf(aln.f, "qual_R2: "); 539 | for (int i = 0; i < pos; i++) 540 | fputc(' ', aln.f); 541 | fprintf(aln.f, "%s\n\n", read2[QUAL + EXTRA]); 542 | } 543 | 544 | /* void createSeq() 545 | * Create stitched sequence (into seq1, qual1). 546 | * Use empirical error profiles for quality scores, 547 | * or 'fastq-join' method. 
/* void createSeq()
 * Create stitched sequence (into seq1, qual1), of
 * length len2 + pos. seq2/qual2 are aligned to start
 * at index 'pos' of seq1.
 * Use empirical error profiles for quality scores
 * (match/mism arrays), or 'fastq-join' method (fjoin).
 */
void createSeq(char* seq1, char* seq2, char* qual1,
    char* qual2, int len1, int len2, int pos,
    int offset, char** match, char** mism, bool fjoin) {
  int len = len2 + pos;  // length of stitched sequence

  // positions before 'pos' come from the 1st read only
  //   and are left untouched
  for (int i = (pos > 0 ? pos : 0); i < len; i++) {
    int k = i - pos;  // matching index in 2nd read

    // 2nd read only: copy seq and qual
    if (i >= len1) {
      seq1[i] = seq2[k];
      qual1[i] = qual2[k];
      continue;
    }

    // 2nd read 'N': keep 1st read's base and qual
    if (seq2[k] == 'N')
      continue;

    // 1st read 'N': copy seq and qual
    if (seq1[i] == 'N') {
      seq1[i] = seq2[k];
      qual1[i] = qual2[k];
      continue;
    }

    char q1 = qual1[i];
    char q2 = qual2[k];

    if (seq1[i] == seq2[k]) {
      // match:
      // - quality score calculated as max (fastq-join
      //   method) or copied from match array
      if (! fjoin)
        qual1[i] = match[ (int) q1 - offset ]
          [ (int) q2 - offset ] + offset;
      else if (q1 < q2)
        qual1[i] = q2;
      continue;
    }

    // mismatch:
    // - base matches higher quality score or equal
    //   quality score that is closer to 5' end
    // - quality score calculated as diff (fastq-join
    //   method) or copied from mism array
    if (q1 < q2 || (q1 == q2 && i >= len / 2.0))
      seq1[i] = seq2[k];
    if (fjoin)
      qual1[i] = abs(q2 - q1) + offset;
    else
      qual1[i] = mism[ (int) q1 - offset ]
        [ (int) q2 - offset ] + offset;
  }

  seq1[len] = '\0';
  qual1[len] = '\0';
}
600 | */ 601 | void printRes(File out, File log, bool logOpt, File dove, 602 | bool doveOpt, File aln, int alnOpt, char* header, 603 | char** read1, char** read2, int len1, int len2, 604 | int pos, float best, int offset, bool gz, bool fjoin, 605 | char** match, char** mism, omp_lock_t* lock) { 606 | // log result 607 | if (logOpt) { 608 | omp_set_lock(lock + LOG); 609 | fprintf(log.f, "%s\t%d\t%d\t", header + 1, 610 | pos < 0 ? (len2+pos < len1 ? len2+pos : len1) : 611 | (len1-pos < len2 ? len1-pos : len2), len2 + pos); 612 | best ? fprintf(log.f, "%.3f", best) : fprintf(log.f, "0"); 613 | fprintf(log.f, "\n"); 614 | omp_unset_lock(lock + LOG); 615 | } 616 | if (doveOpt) 617 | printDove(dove, header, read1, read2, len1, len2, 618 | pos, lock + DOVE); 619 | 620 | // print formatted alignments 621 | if (alnOpt == 1) { 622 | omp_set_lock(lock + ALN); 623 | printAln(aln, header, read1, read2, len1, len2, pos); 624 | 625 | // create stitched sequence 626 | createSeq(read1[SEQ], read2[SEQ + EXTRA + 1], 627 | read1[QUAL], read2[QUAL + EXTRA], len1, len2, 628 | pos, offset, match, mism, fjoin); 629 | 630 | // print merged seq to alignment output 631 | fprintf(aln.f, "merged\nseq: "); 632 | for (int i = 0; i > pos; i--) 633 | fputc(' ', aln.f); 634 | fprintf(aln.f, "%s\n", read1[SEQ]); 635 | fprintf(aln.f, "qual: "); 636 | for (int i = 0; i > pos; i--) 637 | fputc(' ', aln.f); 638 | fprintf(aln.f, "%s\n\n\n", read1[QUAL]); 639 | 640 | omp_unset_lock(lock + ALN); 641 | 642 | } else { 643 | 644 | // print stitch differences 645 | if (alnOpt == 2) { 646 | omp_set_lock(lock + ALN); 647 | printAln2(aln, header, read1, read2, len1, len2, pos); 648 | omp_unset_lock(lock + ALN); 649 | } 650 | 651 | // create stitched sequence 652 | createSeq(read1[SEQ], read2[SEQ + EXTRA + 1], 653 | read1[QUAL], read2[QUAL + EXTRA], len1, len2, 654 | pos, offset, match, mism, fjoin); 655 | } 656 | 657 | // print stitched sequence 658 | omp_set_lock(lock + OUT); 659 | if (gz) 660 | 
gzprintf(out.gzf, "%s\n%s\n+\n%s\n", header, 661 | read1[SEQ], read1[QUAL]); 662 | else 663 | fprintf(out.f, "%s\n%s\n+\n%s\n", header, 664 | read1[SEQ], read1[QUAL]); 665 | omp_unset_lock(lock + OUT); 666 | 667 | } 668 | 669 | /* void printFail() 670 | * Print stitch failure reads. 671 | */ 672 | void printFail(File un1, File un2, bool unOpt, 673 | File log, bool logOpt, char* header, char** read1, 674 | char** read2, bool gz, omp_lock_t* outLock, 675 | omp_lock_t* logLock) { 676 | if (logOpt) { 677 | omp_set_lock(logLock); 678 | fprintf(log.f, "%s\t%s\n", header + 1, NA); 679 | omp_unset_lock(logLock); 680 | } 681 | if (unOpt) { 682 | omp_set_lock(outLock); 683 | if (gz) { 684 | gzprintf(un1.gzf, "%s%s\n%s%s\n", read1[HEAD], 685 | read1[SEQ], read1[PLUS], read1[QUAL]); 686 | gzprintf(un2.gzf, "%s%s\n%s%s\n", read2[HEAD], 687 | read2[SEQ], read2[PLUS], read2[QUAL]); 688 | } else { 689 | fprintf(un1.f, "%s%s\n%s%s\n", read1[HEAD], 690 | read1[SEQ], read1[PLUS], read1[QUAL]); 691 | fprintf(un2.f, "%s%s\n%s%s\n", read2[HEAD], 692 | read2[SEQ], read2[PLUS], read2[QUAL]); 693 | } 694 | omp_unset_lock(outLock); 695 | } 696 | } 697 | 698 | /* int readFile() 699 | * Analyzes the reads in a set of input files. 700 | * Controls writing to the output file(s). 701 | * Multithreaded. 
702 | */ 703 | int readFile(File in1, File in2, File out, File out2, 704 | File un1, File un2, bool unOpt, File log, 705 | bool logOpt, int overlap, bool dovetail, int doveOverlap, 706 | File dove, bool doveOpt, File aln, int alnOpt, 707 | bool adaptOpt, float mismatch, bool maxLen, 708 | int* stitch, int offset, int maxQual, 709 | bool gz1, bool gz2, bool gzOut, bool fjoin, 710 | char** match, char** mism, int threads) { 711 | 712 | // initialize omp locks -- out, un, log, dove, aln 713 | omp_lock_t lock[OMP_LOCKS]; 714 | for (int i = 0; i < OMP_LOCKS; i++) 715 | omp_init_lock(&lock[i]); 716 | 717 | // process files in parallel 718 | int count = 0, stitchRed = 0; 719 | #pragma omp parallel num_threads(threads) reduction(+: count, stitchRed) 720 | { 721 | 722 | // allocate memory for both reads 723 | char** read1 = (char**) memalloc(FASTQ * sizeof(char*)); 724 | char** read2 = (char**) memalloc((FASTQ + EXTRA) * sizeof(char*)); 725 | for (int i = 0; i < FASTQ + EXTRA; i++) { 726 | if (i < FASTQ) 727 | read1[i] = (char*) memalloc(MAX_SIZE); 728 | // for 2nd read, save extra fields for revComp(seq) and rev(qual) 729 | read2[i] = (char*) memalloc(MAX_SIZE); 730 | } 731 | char* header = (char*) memalloc(MAX_SIZE); // consensus header 732 | 733 | // process reads 734 | int len1 = 0, len2 = 0; // lengths of reads 735 | while (loadReads(in1, in2, read1, read2, header, 736 | &len1, &len2, offset, maxQual, gz1, gz2)) { 737 | 738 | // find optimal overlap 739 | float best = 1.0f; 740 | int pos = findPos(read1[SEQ], read2[SEQ + EXTRA + 1], 741 | read1[QUAL], read2[QUAL + EXTRA], len1, len2, overlap, 742 | dovetail, doveOverlap, mismatch, maxLen, &best); 743 | 744 | // print result 745 | if (pos == len1 - overlap + 1) { 746 | // stitch failure 747 | if (adaptOpt) 748 | printFail(out, out2, 1, log, 0, header, read1, 749 | read2, gzOut, lock + OUT, lock + LOG); 750 | else 751 | printFail(un1, un2, unOpt, log, logOpt, header, 752 | read1, read2, gzOut, lock + UN, lock + LOG); 753 
| } else { 754 | // stitch success 755 | if (adaptOpt) { 756 | stitchRed += printResAdapt(out, out2, dove, doveOpt, 757 | header, read1, read2, len1, len2, pos, best, 758 | gzOut, lock); 759 | } else { 760 | printRes(out, log, logOpt, dove, doveOpt, aln, alnOpt, 761 | header, read1, read2, len1, len2, pos, best, offset, 762 | gzOut, fjoin, match, mism, lock); 763 | stitchRed++; 764 | } 765 | } 766 | 767 | count++; 768 | } 769 | 770 | // free memory 771 | free(header); 772 | for (int i = 0; i < FASTQ + EXTRA; i++) { 773 | if (i < FASTQ) 774 | free(read1[i]); 775 | free(read2[i]); 776 | } 777 | free(read1); 778 | free(read2); 779 | 780 | } // END parallel 781 | 782 | // destroy omp locks 783 | for (int i = 0; i < 5; i++) 784 | omp_destroy_lock(&lock[i]); 785 | 786 | *stitch = stitchRed; 787 | return count; 788 | } 789 | 790 | /* void openWrite() 791 | * Open a file for writing (stdout if file is '-'). 792 | */ 793 | void openWrite(char* outFile, File* out, bool gz) { 794 | if (outFile[0] == '-' && strlen(outFile) > 1) 795 | exit(error(outFile, ERRNAME)); 796 | if (gz) { 797 | if (!strcmp(outFile + strlen(outFile) - strlen(GZEXT), GZEXT) 798 | || !strcmp(outFile, "/dev/null")) 799 | out->gzf = gzopen(outFile, "w"); 800 | else if (!strcmp(outFile, "-")) 801 | out->gzf = gzdopen(fileno(stdout), "wb"); 802 | else { 803 | // add ".gz" to outFile 804 | char* outFile2 = memalloc(strlen(outFile) 805 | + strlen(GZEXT) + 1); 806 | strcpy(outFile2, outFile); 807 | strcat(outFile2, GZEXT); 808 | out->gzf = gzopen(outFile2, "w"); 809 | free(outFile2); 810 | } 811 | if (out->gzf == NULL) 812 | exit(error(outFile, ERROPENW)); 813 | } else { 814 | out->f = (strcmp(outFile, "-") ? 815 | fopen(outFile, "w") : stdout); 816 | if (out->f == NULL) 817 | exit(error(outFile, ERROPENW)); 818 | } 819 | } 820 | 821 | /* void openFiles() 822 | * Opens output files for the program, 823 | * adjusting file names/extensions as needed. 
824 | */ 825 | void openFiles(char* outFile, File* out, File* out2, 826 | char* unFile, File* un1, File* un2, 827 | char* logFile, File* log, 828 | char* doveFile, File* dove, bool dovetail, 829 | char* alnFile, File* aln, 830 | bool adaptOpt, bool gz, bool interOpt) { 831 | 832 | if (adaptOpt) { 833 | if (interOpt) 834 | openWrite(outFile, out, gz); 835 | else if (! strcmp(outFile, "-")) 836 | exit(error("stdout + \"_1.fastq\"", ERROPENW)); 837 | else if (! strcmp(outFile, "/dev/null")) { 838 | openWrite(outFile, out, gz); 839 | openWrite(outFile, out2, gz); 840 | } else { 841 | // add "_1.fastq" and "_2.fastq" extensions 842 | int add = strlen(ONEEXT) > strlen(TWOEXT) ? 843 | strlen(ONEEXT) + 1 : strlen(TWOEXT) + 1; 844 | char* outFile2 = memalloc(strlen(outFile) + add); 845 | strcpy(outFile2, outFile); 846 | strcat(outFile2, ONEEXT); 847 | openWrite(outFile2, out, gz); 848 | strcpy(outFile2, outFile); 849 | strcat(outFile2, TWOEXT); 850 | openWrite(outFile2, out2, gz); 851 | free(outFile2); 852 | } 853 | 854 | } else { 855 | openWrite(outFile, out, gz); 856 | 857 | // open optional files 858 | if (unFile != NULL) { 859 | if (interOpt) 860 | openWrite(unFile, un1, gz); 861 | else if (! strcmp(unFile, "-")) 862 | exit(error("stdout + \"_1.fastq\"", ERROPENW)); 863 | else { 864 | // add "_1.fastq" and "_2.fastq" extensions 865 | int add = strlen(ONEEXT) > strlen(TWOEXT) ? 
866 | strlen(ONEEXT) + 1 : strlen(TWOEXT) + 1; 867 | char* unFile2 = memalloc(strlen(unFile) + add); 868 | strcpy(unFile2, unFile); 869 | strcat(unFile2, ONEEXT); 870 | openWrite(unFile2, un1, gz); 871 | strcpy(unFile2, unFile); 872 | strcat(unFile2, TWOEXT); 873 | openWrite(unFile2, un2, gz); 874 | free(unFile2); 875 | } 876 | } 877 | if (logFile != NULL) { 878 | openWrite(logFile, log, false); 879 | fprintf(log->f, "Read\tOverlapLen\tStitchedLen\tMismatch\n"); 880 | } 881 | if (alnFile != NULL) 882 | openWrite(alnFile, aln, false); 883 | } 884 | 885 | if (dovetail && doveFile != NULL) { 886 | openWrite(doveFile, dove, false); 887 | fprintf(dove->f, "Read\tAdapter_R1\tAdapter_R2\n"); 888 | } 889 | } 890 | 891 | /* bool openRead() 892 | * Open a file for reading (stdin if file is '-'). 893 | * Return true if gzip compressed. 894 | */ 895 | bool openRead(char* inFile, File* in) { 896 | 897 | // open file or stdin 898 | bool stdinBool = (strcmp(inFile, "-") ? false : true); 899 | FILE* dummy = (stdinBool ? stdin : fopen(inFile, "r")); 900 | if (dummy == NULL) 901 | exit(error(inFile, ERROPEN)); 902 | 903 | // check for gzip compression: magic number 0x1F, 0x8B 904 | bool gzip = true; 905 | int save = 0; // first char to pushback (for stdin) 906 | int i, j; 907 | for (i = 0; i < 2; i++) { 908 | j = fgetc(dummy); 909 | if (j == EOF) 910 | exit(error(inFile, ERROPEN)); 911 | if ( (i && (unsigned char) j != 0x8B) 912 | || (! i && (unsigned char) j != 0x1F) ) { 913 | gzip = false; 914 | break; 915 | } 916 | if (! 
i) 917 | save = j; 918 | } 919 | 920 | // for stdin, push back chars 921 | if (stdinBool) { 922 | if (gzip) 923 | exit(error("", ERRGZIP)); 924 | if (ungetc(j, dummy) == EOF) 925 | exit(error("", ERRUNGET)); 926 | if (i && ungetc(save, dummy) == EOF) 927 | exit(error("", ERRUNGET)); 928 | } 929 | 930 | // open file 931 | if (gzip) { 932 | if (fclose(dummy)) 933 | exit(error("", ERRCLOSE)); 934 | in->gzf = gzopen(inFile, "r"); 935 | if (in->gzf == NULL) 936 | exit(error(inFile, ERROPEN)); 937 | } else { 938 | if (! stdinBool) 939 | rewind(dummy); 940 | in->f = dummy; 941 | } 942 | 943 | return gzip; 944 | } 945 | 946 | /* void loadQual() 947 | * Load quality score profiles from file. 948 | */ 949 | void loadQual(char* qualFile, int maxQual, 950 | char*** match, char*** mism) { 951 | File qual; 952 | bool gz = openRead(qualFile, &qual); 953 | char* line = memalloc(MAX_SIZE); 954 | 955 | char** arr = NULL; // array to save to 956 | int i = 0, matIdx = 0, misIdx = 0; // array indices 957 | while (getLine(line, MAX_SIZE, qual, gz) != NULL) { 958 | if (line[0] == '#' || line[0] == '\n') { 959 | // determine target array 960 | i = 0; 961 | if (! strcmp(line + 1, "match\n")) 962 | arr = *match; 963 | else if (! 
strcmp(line + 1, "mismatch\n")) 964 | arr = *mism; 965 | } else if (arr == NULL) { 966 | continue; 967 | } else { 968 | // remove trailing '\n' 969 | int j; 970 | for (j = 0; line[j] != '\n' && line[j] != '\0'; j++) ; 971 | line[j] = '\0'; 972 | 973 | // save values to array 974 | char* tok = strtok(line, CSV); 975 | for (j = 0; j < maxQual + 1; j++) { 976 | if (tok == NULL) { 977 | char* msg = (char*) memalloc(MAX_SIZE); 978 | sprintf(msg, "(range [0, %d]) %s", 979 | maxQual, qualFile); 980 | exit(error(msg, ERRRANGE)); 981 | } 982 | arr[i][j] = getInt(tok); 983 | tok = strtok(NULL, CSV); 984 | } 985 | i++; 986 | if ( (arr == *match && ++matIdx > maxQual) 987 | || (arr == *mism && ++misIdx > maxQual) ) 988 | arr = NULL; 989 | } 990 | } 991 | 992 | // make sure all values were loaded 993 | if (matIdx < maxQual + 1 || misIdx < maxQual + 1) { 994 | char* msg = (char*) memalloc(MAX_SIZE); 995 | sprintf(msg, "(range [0, %d]) %s", maxQual, qualFile); 996 | exit(error(msg, ERRRANGE)); 997 | } 998 | 999 | if ( (gz && gzclose(qual.gzf) != Z_OK) || 1000 | (! gz && fclose(qual.f) ) ) 1001 | exit(error("", ERRCLOSE)); 1002 | free(line); 1003 | } 1004 | 1005 | /* void saveQual() 1006 | * Save quality score profiles. 
1007 | */ 1008 | void saveQual(char* qualFile, int maxQual, 1009 | char*** match, char*** mism) { 1010 | 1011 | // allocate memory 1012 | *match = (char**) memalloc((maxQual + 1) * sizeof(char*)); 1013 | *mism = (char**) memalloc((maxQual + 1) * sizeof(char*)); 1014 | for (int i = 0; i < maxQual + 1; i++) { 1015 | (*match)[ i ] = (char*) memalloc(maxQual + 1); 1016 | (*mism)[ i ] = (char*) memalloc(maxQual + 1); 1017 | } 1018 | 1019 | if (qualFile == NULL) { 1020 | // copy quality profile from const arrays 1021 | if (maxQual > MAXQUAL) 1022 | exit(error("", ERRDEFQ)); 1023 | for (int i = 0; i < maxQual + 1; i++) 1024 | for (int j = 0; j < maxQual + 1; j++) { 1025 | (*match)[ i ][ j ] = match_profile[ i ][ j ]; 1026 | (*mism)[ i ][ j ] = mismatch_profile[ i ][ j ]; 1027 | } 1028 | } else 1029 | // load from file 1030 | loadQual(qualFile, maxQual, match, mism); 1031 | 1032 | } 1033 | 1034 | /* void runProgram() 1035 | * Controls the opening/closing of files, 1036 | * and analysis by readFile(). 1037 | */ 1038 | void runProgram(char* outFile, char* inFile1, 1039 | char* inFile2, bool inter, char* unFile, 1040 | char* logFile, int overlap, bool dovetail, 1041 | char* doveFile, int doveOverlap, char* alnFile, 1042 | int alnOpt, bool adaptOpt, int gzOut, bool fjoin, 1043 | bool interOpt, float mismatch, bool maxLen, 1044 | int offset, int maxQual, char* qualFile, 1045 | bool verbose, int threads) { 1046 | 1047 | // get first set of input file names 1048 | char* end1, *end2; 1049 | char* file1 = strtok_r(inFile1, COM, &end1); 1050 | char* file2 = file1; 1051 | if (! 
inter) 1052 | file2 = strtok_r(inFile2, COM, &end2); 1053 | 1054 | // loop through input files 1055 | File out, out2, un1, un2, log, dove, aln; // output files 1056 | char** match = NULL, **mism = NULL; // quality score profiles 1057 | int i = 0; // count of files processed 1058 | int tCount = 0, tStitch = 0; // counting variables 1059 | while (file1 && file2) { 1060 | 1061 | // open input files 1062 | File in1, in2; 1063 | bool gz1 = openRead(file1, &in1); 1064 | bool gz2 = gz1; 1065 | if (! inter) 1066 | gz2 = openRead(file2, &in2); 1067 | 1068 | // on first iteration, load quals and open outputs 1069 | if (! i) { 1070 | // load quality score profile 1071 | if (! fjoin && ! adaptOpt) 1072 | saveQual(qualFile, maxQual, &match, &mism); 1073 | 1074 | // open output files 1075 | if (gzOut == -1) 1076 | gzOut = 0; 1077 | else if (gz1 || gz2) 1078 | gzOut = 1; 1079 | openFiles(outFile, &out, &out2, 1080 | unFile, &un1, &un2, logFile, &log, 1081 | doveFile, &dove, dovetail, alnFile, &aln, 1082 | adaptOpt, gzOut, interOpt); 1083 | } 1084 | 1085 | // process files 1086 | if (verbose) 1087 | fprintf(stderr, "Processing files: %s,%s\n", file1, 1088 | inter ? "(interleaved)" : file2); 1089 | int stitch = 0; // counting variable 1090 | int count = readFile(in1, inter ? in1 : in2, 1091 | out, interOpt ? out : out2, 1092 | un1, interOpt ? 
un1 : un2, unFile != NULL, 1093 | log, logFile != NULL, 1094 | overlap, dovetail, doveOverlap, dove, 1095 | dovetail && doveFile != NULL, aln, alnOpt, 1096 | adaptOpt, mismatch, maxLen, &stitch, 1097 | offset, maxQual, gz1, gz2, gzOut, fjoin, 1098 | match, mism, threads); 1099 | tCount += count; 1100 | tStitch += stitch; 1101 | 1102 | // log counts 1103 | if (verbose) { 1104 | fprintf(stderr, " Fragments (pairs of reads) analyzed: %d\n", count); 1105 | if (adaptOpt) 1106 | fprintf(stderr, " Adapters removed: %d\n", stitch); 1107 | else 1108 | fprintf(stderr, " Successfully stitched: %d\n", stitch); 1109 | } 1110 | 1111 | // close input files 1112 | if ( (gz1 && gzclose(in1.gzf) != Z_OK) || (! gz1 && fclose(in1.f)) 1113 | || (! inter && ( (gz2 && gzclose(in2.gzf) != Z_OK) 1114 | || (! gz2 && fclose(in2.f)) ) ) ) 1115 | exit(error("", ERRCLOSE)); 1116 | 1117 | file1 = strtok_r(NULL, COM, &end1); 1118 | file2 = file1; 1119 | if (! inter) 1120 | file2 = strtok_r(NULL, COM, &end2); 1121 | i++; 1122 | } 1123 | 1124 | if (verbose && i > 1) { 1125 | fprintf(stderr, "Total counts\n"); 1126 | fprintf(stderr, " Fragments (pairs of reads) analyzed: %d\n", tCount); 1127 | if (adaptOpt) 1128 | fprintf(stderr, " Adapters removed: %d\n", tStitch); 1129 | else 1130 | fprintf(stderr, " Successfully stitched: %d\n", tStitch); 1131 | } 1132 | 1133 | // free memory for qual score profiles 1134 | if (! fjoin && ! adaptOpt) { 1135 | for (int i = 0; i < maxQual + 1; i++) { 1136 | free(match[i]); 1137 | free(mism[i]); 1138 | } 1139 | free(match); 1140 | free(mism); 1141 | } 1142 | 1143 | // close files 1144 | if ( ( gzOut && ( gzclose(out.gzf) != Z_OK || 1145 | (adaptOpt && ! interOpt && gzclose(out2.gzf) != Z_OK) || 1146 | (unFile != NULL && (gzclose(un1.gzf) != Z_OK || 1147 | (! interOpt && gzclose(un2.gzf) != Z_OK)) ) ) ) || 1148 | ( ! gzOut && ( fclose(out.f) || 1149 | (adaptOpt && ! interOpt && fclose(out2.f)) || 1150 | (unFile != NULL && (fclose(un1.f) || 1151 | (! 
interOpt && fclose(un2.f)) ) ) ) ) || 1152 | (logFile != NULL && fclose(log.f)) || 1153 | (dovetail && doveFile != NULL && fclose(dove.f)) || 1154 | (alnFile != NULL && fclose(aln.f)) ) 1155 | exit(error("", ERRCLOSE)); 1156 | } 1157 | 1158 | /* void getArgs() 1159 | * Parse the command-line. Check for errors. 1160 | */ 1161 | void getArgs(int argc, char** argv) { 1162 | 1163 | // default parameters/filenames 1164 | char* outFile = NULL, *inFile1 = NULL, *inFile2 = NULL, 1165 | *unFile = NULL, *logFile = NULL, *doveFile = NULL, 1166 | *alnFile = NULL, *qualFile = NULL; 1167 | int overlap = DEFOVER, doveOverlap = DEFDOVE, gzOut = 0, 1168 | offset = OFFSET, maxQual = MAXQUAL, threads = DEFTHR; 1169 | float mismatch = DEFMISM; 1170 | bool dovetail = false, adaptOpt = false, maxLen = true, 1171 | diffOpt = false, interOpt = false, fjoin = false, 1172 | verbose = false; 1173 | 1174 | // parse argv 1175 | int c; 1176 | while ( (c = getopt_long(argc, argv, OPTIONS, long_options, NULL)) != -1 ) 1177 | switch (c) { 1178 | case HELP: usage(0); break; 1179 | case VERSOPT: printVersion(); break; 1180 | case MAXOPT: maxLen = false; break; 1181 | case DOVEOPT: dovetail = true; break; 1182 | case ADAPTOPT: adaptOpt = true; break; 1183 | case GZOPT: gzOut = 1; break; 1184 | case UNGZOPT: gzOut = -1; break; 1185 | case DIFFOPT: diffOpt = true; break; 1186 | case INTEROPT: interOpt = true; break; 1187 | case FJOINOPT: fjoin = true; break; 1188 | case VERBOSE: verbose = true; break; 1189 | case OUTFILE: outFile = optarg; break; 1190 | case FIRST: inFile1 = optarg; break; 1191 | case SECOND: inFile2 = optarg; break; 1192 | case UNFILE: unFile = optarg; break; 1193 | case LOGFILE: logFile = optarg; break; 1194 | case DOVEFILE: doveFile = optarg; break; 1195 | case ALNFILE: alnFile = optarg; break; 1196 | case OVERLAP: overlap = getInt(optarg); break; 1197 | case DOVEOVER: doveOverlap = getInt(optarg); break; 1198 | case MISMATCH: mismatch = getFloat(optarg); break; 1199 | case QUALITY: 
offset = getInt(optarg); break; 1200 | case SETQUAL: maxQual = getInt(optarg); break; 1201 | case QUALFILE: qualFile = optarg; break; 1202 | case THREADS: threads = getInt(optarg); break; 1203 | default: exit(-1); 1204 | } 1205 | if (optind < argc) 1206 | exit(error(argv[optind], ERRPARAM)); 1207 | 1208 | // check for argument errors 1209 | if (outFile == NULL || inFile1 == NULL) { 1210 | error("", ERRFILE); 1211 | usage(-1); 1212 | } 1213 | bool inter = false; // interleaved input 1214 | if (inFile2 == NULL) { 1215 | if (verbose) 1216 | fprintf(stderr, "Warning: only one input file specified -- assuming interleaved\n"); 1217 | inter = true; 1218 | } 1219 | if (qualFile != NULL) 1220 | fjoin = false; // given qualFile takes precedence over fastq-join method 1221 | if (overlap <= 0 || doveOverlap <= 0) 1222 | exit(error("", ERROVER)); 1223 | if (mismatch < 0.0f || mismatch >= 1.0f) 1224 | exit(error("", ERRMISM)); 1225 | if (threads < 1) 1226 | exit(error("", ERRTHREAD)); 1227 | 1228 | // adjust parameters for adapter-removal mode 1229 | if (adaptOpt) { 1230 | dovetail = true; 1231 | unFile = logFile = alnFile = qualFile = NULL; 1232 | } 1233 | int alnOpt = (alnFile != NULL ? (diffOpt ? 2 : 1) : 0); 1234 | 1235 | // send arguments to runProgram() 1236 | runProgram(outFile, inFile1, inFile2, inter, unFile, 1237 | logFile, overlap, dovetail, doveFile, doveOverlap, 1238 | alnFile, alnOpt, adaptOpt, gzOut, fjoin, interOpt, 1239 | mismatch, maxLen, offset, maxQual, qualFile, verbose, 1240 | threads); 1241 | } 1242 | 1243 | /* int main() 1244 | * Main. 1245 | */ 1246 | int main(int argc, char* argv[]) { 1247 | getArgs(argc, argv); 1248 | return 0; 1249 | } 1250 | -------------------------------------------------------------------------------- /NGmerge.h: -------------------------------------------------------------------------------- 1 | /* 2 | John M. Gaspar (jsh58@wildcats.unh.edu) 3 | April 2015 (updated 2016, 2017) 4 | 5 | Header file for NGmerge.c. 
6 | */ 7 | #define VERSION "0.3" 8 | 9 | // constants 10 | #define MAX_SIZE 1024 // maximum length of input lines (incl. seq/qual) 11 | #define NOTMATCH 1.5f // stitch failure 12 | #define COM ", " // separator for input file names 13 | #define CSV ",\t" // separator for quality score profile 14 | #define NA "NA" // n/a (for output log file) 15 | 16 | // default parameter values 17 | #define DEFOVER 20 // min. overlap 18 | #define DEFDOVE 50 // min. overlap for dovetailed alignments 19 | #define DEFMISM 0.1f // mismatch fraction 20 | #define OFFSET 33 // fastq quality offset (Sanger = 33) 21 | #define MAXQUAL 40 // maximum quality score (0-based) 22 | #define DEFTHR 1 // number of threads 23 | 24 | // fastq parts 25 | enum fastq { HEAD, SEQ, PLUS, QUAL, FASTQ }; // lines of a fastq read 26 | #define BEGIN '@' // beginning of header line 27 | #define PLUSCHAR '+' // beginning of 3rd line 28 | #define EXTRA 2 // save 2 extra strings for 2nd read: 29 | // revComp(seq) and rev(qual) 30 | 31 | // command-line options 32 | #define OPTIONS "h1:2:o:f:l:m:p:de:c:saj:bzyigq:u:w:n:vV" 33 | #define HELP 'h' 34 | #define FIRST '1' 35 | #define SECOND '2' 36 | #define OUTFILE 'o' 37 | #define UNFILE 'f' 38 | #define LOGFILE 'l' 39 | #define OVERLAP 'm' 40 | #define MISMATCH 'p' 41 | #define DOVEOPT 'd' 42 | #define DOVEOVER 'e' 43 | #define DOVEFILE 'c' 44 | #define MAXOPT 's' 45 | #define ADAPTOPT 'a' 46 | #define ALNFILE 'j' 47 | #define DIFFOPT 'b' 48 | #define GZOPT 'z' 49 | #define UNGZOPT 'y' 50 | #define INTEROPT 'i' 51 | #define FJOINOPT 'g' 52 | #define QUALITY 'q' 53 | #define SETQUAL 'u' 54 | #define QUALFILE 'w' 55 | #define THREADS 'n' 56 | #define VERBOSE 'v' 57 | #define VERSOPT 'V' 58 | 59 | static struct option long_options[] = { 60 | {"help", no_argument, NULL, HELP}, 61 | {"verbose", no_argument, NULL, VERBOSE}, 62 | {"version", no_argument, NULL, VERSOPT}, 63 | {0, 0, 0, 0} 64 | }; 65 | 66 | // extensions for output files 67 | #define ONEEXT "_1.fastq" 68 | 
#define TWOEXT "_2.fastq" 69 | #define GZEXT ".gz" // for gzip compression 70 | 71 | // OMP locks 72 | enum omp_locks { OUT, UN, LOG, DOVE, ALN, OMP_LOCKS }; 73 | 74 | // error messages 75 | enum errCode { ERRFILE, ERROPEN, ERRCLOSE, ERROPENW, ERRUNK, 76 | ERRMEM, ERRSEQ, ERRQUAL, ERRHEAD, ERRINT, ERRFLOAT, ERRPARAM, 77 | ERROVER, ERRMISM, ERRFASTQ, ERROFFSET, ERRUNGET, ERRGZIP, 78 | ERRTHREAD, ERRNAME, ERRRANGE, ERRDEFQ, DEFERR 79 | }; 80 | const char* errMsg[] = { "Need input/output files", 81 | ": cannot open file for reading", 82 | "Cannot close file", 83 | ": cannot open file for writing", 84 | ": unknown nucleotide", 85 | "Cannot allocate memory", 86 | "Cannot load sequence", 87 | "Sequence/quality scores do not match", 88 | ": not matched in input files", 89 | ": cannot convert to int", 90 | ": cannot convert to float", 91 | ": unknown command-line argument", 92 | "Overlap must be greater than 0", 93 | "Mismatch must be in [0,1)", 94 | "Input file does not follow fastq format", 95 | ": quality score outside of set range", 96 | "Failure in ungetc() call", 97 | "Cannot pipe in gzip compressed file (use zcat instead)", 98 | "Number of threads must be >= 1", 99 | ": output filename cannot start with '-'", 100 | ": file missing values for quality score range", 101 | "Cannot increase max. 
quality score with default error profile", 102 | "Unknown error" 103 | }; 104 | 105 | // generic File type 106 | typedef union file { 107 | FILE* f; 108 | gzFile gzf; 109 | } File; 110 | 111 | // error profiles -- matches and mismatches 112 | const char match_profile[ MAXQUAL + 1 ][ MAXQUAL + 1 ] = { 113 | {25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40}, 114 | {25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40}, 115 | {25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40}, 116 | {25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40}, 117 | {25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40}, 118 | {25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 29, 30, 30, 31, 31, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 38, 38, 38, 39, 39, 40}, 119 | {25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 27, 27, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 37, 37, 38, 38, 38, 39, 39}, 120 | {25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 29, 30, 30, 31, 31, 32, 33, 33, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39}, 121 | {25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 37, 37, 38, 38, 39, 39}, 122 | {25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 
37, 37, 38, 38, 39, 39}, 123 | {26, 26, 26, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 31, 31, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39}, 124 | {26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, 29, 30, 30, 31, 31, 32, 32, 33, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39}, 125 | {27, 27, 27, 26, 26, 26, 26, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 28, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 33, 34, 34, 35, 35, 36, 36, 36, 37, 37, 38, 39, 39}, 126 | {27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 36, 36, 37, 37, 38, 39, 39}, 127 | {28, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 27, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 36, 36, 36, 37, 37, 38, 39, 39}, 128 | {29, 29, 29, 28, 28, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39}, 129 | {29, 29, 29, 29, 28, 28, 28, 28, 27, 27, 27, 28, 28, 28, 28, 29, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39}, 130 | {30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 38, 38, 39, 39}, 131 | {30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 37, 37, 38, 38, 39, 39}, 132 | {31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 37, 37, 37, 38, 38, 39, 39}, 133 | {32, 32, 32, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 38, 38, 39, 39}, 134 | {32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 32, 32, 
33, 33, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 37, 37, 38, 38, 38, 39, 40}, 135 | {33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 40}, 136 | {34, 34, 34, 33, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 38, 38, 39, 39, 40}, 137 | {34, 34, 34, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 39, 39, 40}, 138 | {35, 35, 35, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 39, 39, 39, 40}, 139 | {35, 35, 35, 35, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40}, 140 | {36, 36, 36, 35, 35, 35, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40}, 141 | {36, 36, 36, 36, 35, 35, 35, 35, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40}, 142 | {37, 37, 37, 36, 36, 36, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40}, 143 | {37, 37, 37, 37, 36, 36, 36, 36, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40}, 144 | {38, 38, 38, 37, 37, 37, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 40, 40}, 145 | {38, 38, 38, 38, 37, 37, 37, 37, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 40, 40}, 146 | {39, 39, 39, 
38, 38, 37, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 40, 40}, 147 | {39, 39, 39, 39, 38, 38, 38, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40}, 148 | {39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40}, 149 | {40, 40, 40, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40}, 150 | {40, 40, 40, 40, 39, 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40}, 151 | {40, 40, 40, 40, 40, 40, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40, 40}, 152 | {40, 40, 40, 40, 40, 40, 40, 40, 40, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40}, 153 | {40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40} 154 | }; 155 | 156 | const char mismatch_profile[ MAXQUAL + 1 ][ MAXQUAL + 1 ] = { 157 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 7, 10, 12, 13, 15, 16, 18, 19, 20, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39}, 158 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 7, 10, 12, 13, 15, 16, 18, 19, 20, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39}, 159 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 7, 10, 12, 13, 15, 16, 18, 19, 20, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39}, 160 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 6, 9, 11, 13, 14, 16, 17, 18, 20, 21, 
22, 24, 26, 29, 31, 33, 35, 37, 38}, 161 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 5, 8, 11, 12, 14, 15, 17, 18, 19, 20, 22, 23, 25, 28, 30, 32, 34, 36, 38}, 162 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 4, 8, 10, 12, 13, 15, 16, 17, 19, 20, 21, 23, 25, 27, 29, 31, 33, 35, 37}, 163 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 7, 9, 11, 13, 14, 16, 17, 18, 19, 21, 22, 24, 26, 28, 31, 33, 35, 37}, 164 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 6, 9, 11, 12, 14, 15, 16, 18, 19, 20, 21, 23, 25, 27, 30, 32, 34, 36}, 165 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 5, 8, 10, 12, 13, 14, 16, 17, 18, 19, 21, 22, 24, 26, 29, 31, 34, 36}, 166 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 4, 7, 9, 11, 12, 14, 15, 16, 18, 19, 20, 22, 23, 25, 28, 31, 33, 35}, 167 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 6, 8, 10, 12, 13, 15, 16, 17, 18, 20, 21, 23, 25, 27, 30, 32, 34}, 168 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 5, 8, 10, 11, 13, 14, 15, 17, 18, 19, 20, 22, 24, 26, 29, 31, 34}, 169 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 7, 9, 11, 12, 13, 15, 16, 17, 18, 20, 21, 23, 25, 28, 30, 33}, 170 | {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 6, 8, 10, 11, 13, 14, 15, 17, 18, 19, 21, 22, 24, 27, 30, 32}, 171 | {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 5, 7, 9, 11, 12, 13, 15, 16, 17, 18, 20, 21, 23, 26, 28, 31}, 172 | {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 6, 8, 10, 11, 13, 14, 15, 17, 18, 19, 21, 22, 24, 27, 30}, 173 | {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 5, 7, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 23, 26, 29}, 174 | {2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 2, 2, 2, 2, 5, 7, 9, 11, 12, 13, 15, 16, 18, 19, 20, 22, 24, 27}, 175 | {2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 4, 7, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, 26}, 176 | {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 7, 9, 11, 12, 14, 16, 17, 19, 20, 22, 25}, 177 | {7, 7, 7, 6, 5, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 7, 9, 11, 13, 15, 16, 18, 19, 21, 23}, 178 | {9, 9, 9, 8, 8, 7, 6, 6, 4, 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 4, 7, 10, 12, 14, 15, 17, 19, 20, 22}, 179 | {11, 11, 11, 10, 10, 9, 9, 8, 7, 6, 6, 4, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 5, 8, 10, 13, 14, 16, 18, 19, 21}, 180 | {12, 12, 12, 12, 12, 11, 10, 10, 9, 9, 8, 7, 6, 5, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 7, 9, 11, 13, 15, 17, 19, 20}, 181 | {14, 14, 14, 14, 13, 13, 12, 12, 11, 10, 10, 9, 8, 7, 6, 5, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 4, 8, 10, 12, 14, 16, 18, 20}, 182 | {15, 15, 15, 15, 14, 14, 14, 13, 13, 12, 12, 11, 10, 9, 9, 8, 6, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 6, 9, 11, 13, 15, 17, 19}, 183 | {17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 12, 11, 11, 10, 8, 6, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 4, 8, 10, 13, 14, 16, 18}, 184 | {18, 18, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 11, 10, 8, 6, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 7, 10, 12, 14, 15, 17}, 185 | {19, 19, 19, 19, 18, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 12, 10, 8, 5, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 7, 9, 11, 13, 14, 16}, 186 | {20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 13, 12, 10, 8, 5, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 6, 8, 10, 12, 14, 15}, 187 | {22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 13, 12, 10, 8, 6, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 3, 6, 8, 10, 11, 13, 14}, 188 | {23, 
23, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 15, 14, 12, 11, 9, 7, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 6, 8, 9, 11, 12, 13}, 189 | {25, 25, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 19, 18, 17, 16, 15, 14, 13, 11, 10, 8, 7, 6, 4, 2, 2, 2, 2, 2, 2, 3, 5, 6, 7, 9, 10, 11, 13}, 190 | {28, 28, 28, 27, 26, 25, 25, 24, 23, 23, 22, 21, 21, 20, 20, 19, 19, 18, 17, 16, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 5, 4, 4, 5, 6, 6, 7, 9, 10, 11, 12}, 191 | {30, 30, 30, 29, 28, 28, 27, 26, 25, 25, 24, 23, 22, 22, 21, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 8, 9, 10, 11, 12}, 192 | {32, 32, 32, 31, 30, 30, 29, 29, 28, 27, 26, 25, 24, 24, 23, 22, 21, 21, 20, 19, 18, 17, 16, 15, 15, 14, 13, 12, 12, 11, 10, 10, 9, 9, 9, 9, 9, 9, 10, 11, 11}, 193 | {33, 33, 33, 33, 32, 32, 31, 31, 30, 29, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 10, 10, 11, 11}, 194 | {35, 35, 35, 35, 34, 34, 33, 33, 32, 32, 31, 30, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11}, 195 | {37, 37, 37, 36, 36, 36, 35, 35, 34, 34, 33, 33, 32, 31, 31, 30, 29, 28, 26, 25, 24, 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11}, 196 | {39, 39, 39, 38, 38, 37, 37, 37, 36, 36, 35, 35, 34, 34, 33, 32, 32, 31, 30, 29, 27, 26, 25, 24, 23, 22, 21, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 11}, 197 | {40, 40, 40, 40, 40, 39, 39, 39, 38, 38, 37, 37, 36, 36, 35, 35, 34, 33, 33, 32, 31, 30, 29, 28, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 17, 16, 15, 14, 13, 13, 12} 198 | }; 199 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NGmerge: merging paired-end reads and removing sequencing adapters 2 | 3 | Gaspar JM. BMC Bioinformatics. 2018 Dec 20;19(1):536. 
[[PubMed](https://www.ncbi.nlm.nih.gov/pubmed/30572828)] [[BMC](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2579-2)] [[PDF](https://rdcu.be/bd2AW)] 4 | 5 | ## Table of Contents 6 | * [Introduction](#intro) 7 | * [Quick start](#quick) 8 | * [Software compilation](#compile) 9 | * [Usage message](#usage) 10 | * [Alignment method](#align) 11 | * [Stitch mode](#stitch) 12 | * [I/O files and options](#stitch-io) 13 | * [Alignment parameters](#stitch-aln) 14 | * [Quality score profile options](#stitch-qual) 15 | * [Adapter-removal mode](#adapter) 16 | * [I/O files and options](#adapter-io) 17 | * [Alignment parameters](#adapter-aln) 18 | * [Miscellaneous](#misc) 19 | * [Contact](#contact) 20 |

21 | 22 | ## Introduction 23 | 24 | NGmerge operates on paired-end high-throughput sequence reads in two distinct modes (Fig. 1). 25 | 26 | In the default [stitch mode](#stitch), NGmerge combines paired-end reads that overlap into a single read that spans the full length of the original DNA fragment (Fig. 1A). The ends of the merged read are defined by the 5' ends of the original reads. Reads that fail the stitching process (due to a lack of sufficient overlap, or excessive sequencing errors) are placed into secondary output files, if the user requires them. 27 | 28 | The alternative [adapter-removal mode](#adapter) returns the original reads as pairs, removing the 3' overhangs of those reads whose valid stitched alignment has this characteristic (Fig. 1B). Reads whose alignments do not have such overhangs (or do not align at all) will also be printed to the output files, unmodified. 29 | 30 |
31 | ![Analysis modes of NGmerge](figures/figure1.png) 32 |
Figure 1. Analysis modes of NGmerge. The diagrams show the paired-end reads (R1, R2) derived from sequencing DNA fragments (white boxes) with sequencing adapters (gray boxes) on either end.
33 |
34 |

35 | 36 | ### Quick start 37 | 38 | Given: 39 | * `sample_R1.fastq.gz`, `sample_R2.fastq.gz` (paired-end sequence files for a sample) 40 | * `NGmerge` (downloaded and compiled as described [below](#compile)) 41 |

42 | 43 | To produce stitched reads (Fig. 1A): `sample_merged.fastq.gz` 44 | ``` 45 | $ ./NGmerge -1 sample_R1.fastq.gz -2 sample_R2.fastq.gz -o sample_merged.fastq.gz 46 | ``` 47 |
48 | 49 | To produce reads with adapters removed (Fig. 1B): `sample_noadapters_1.fastq.gz` and `sample_noadapters_2.fastq.gz` 50 | ``` 51 | $ ./NGmerge -a -1 sample_R1.fastq.gz -2 sample_R2.fastq.gz -o sample_noadapters 52 | ``` 53 |
54 | 55 | 56 | ### Software compilation 57 | 58 | The software can be downloaded from [GitHub](https://github.com/harvardinformatics/NGmerge). (and you're already here! congratulations!) 59 | 60 | A Makefile is provided for compilation with [GCC](https://gcc.gnu.org/releases.html), and both [zlib](http://zlib.net) and [OpenMP](https://www.openmp.org/) are also required. The program has been tested after compilation with GCC 6.3.0, zlib 1.2.8, and OpenMP 4.0. 61 | 62 | To compile, run `make` in the folder in which the software was downloaded. The executable `NGmerge` should be produced. 63 |

64 | 65 | 66 | ### Usage message 67 | 68 | ``` 69 | Usage: ./NGmerge {-1 <file> -2 <file> -o <file>} [optional arguments] 70 | Required arguments: 71 | -1 <file> Input FASTQ file with reads from forward direction 72 | -2 <file> Input FASTQ file with reads from reverse direction 73 | -o <file> Output FASTQ file(s): 74 | - in 'stitch' mode (def.), the file of merged reads 75 | - in 'adapter-removal' mode (-a), the output files 76 | will be <file>_1.fastq and <file>_2.fastq 77 | Alignment parameters: 78 | -m <int> Minimum overlap of the paired-end reads (def. 20) 79 | -p <float> Mismatches to allow in the overlapped region 80 | (a fraction of the overlap length; def. 0.10) 81 | -a Use 'adapter-removal' mode (also sets -d option) 82 | -d Option to check for dovetailing (with 3' overhangs) 83 | -e <int> Minimum overlap of dovetailed alignments (def. 50) 84 | -s Option to produce shortest stitched read 85 | I/O options: 86 | -l <file> Log file for stitching results of each read pair 87 | -f <file> FASTQ files for reads that failed stitching 88 | (output as <file>_1.fastq and <file>_2.fastq) 89 | -c <file> Log file for dovetailed reads (adapter sequences) 90 | -j <file> Log file for formatted alignments of merged reads 91 | -z/-y Option to gzip (-z) or not (-y) FASTQ output(s) 92 | -i Option to produce interleaved FASTQ output(s) 93 | -w <file> Use given error profile for merged qual scores 94 | -g Use 'fastq-join' method for merged qual scores 95 | -q <int> FASTQ quality offset (def. 33) 96 | -u <int> Maximum input quality score (0-based; def. 40) 97 | -n <int> Number of threads to use (def. 1) 98 | -v Option to print status updates/counts to stderr 99 | ``` 100 |
101 | 102 | ## Alignment method 103 | 104 | In either analysis mode (Fig. 1), NGmerge evaluates all possible gapless alignments of a pair of reads in attempting to find an optimal one. The determinations of which alignments are considered, and then which alignment (if any) is both valid and optimal, are made according to several parameters: `-m`, `-p`, `-d`, `-e`, and `-s`. 105 | 106 | NGmerge begins by aligning a pair of reads (R1, R2) such that the minimum overlap parameter (`-m`, default 20bp) is met. It then checks each possible alignment of the reads until they overlap with no 3' overhangs (Fig. 2A). If the `-d` option is selected (or in adapter-removal mode [`-a`, which automatically sets `-d`]), NGmerge additionally evaluates dovetailed alignments (with 3' overhangs), down to the minimum length set by the `-e` parameter (Fig. 2B). 107 | 108 |
109 | Alignment method of NGmerge 110 |
Figure 2. Alignments considered by NGmerge. A: Default alignments range from those with the minimal overlap length (set by -m), to complete overlaps with no overhangs. B: When the -d option is selected, NGmerge also evaluates dovetailed alignments.
111 |
112 |

113 | 114 | For each alignment, NGmerge computes the fraction mismatch (the number of mismatches between the R1 and R2 reads, divided by the overlap length). Alignments with calculated values no more than the threshold set by the `-p` parameter (default 0.10) are considered valid. If multiple valid alignments are found, the one with the lowest fraction mismatch is selected as the optimal alignment. In rare cases where multiple alignments have identical fraction mismatches, the longest is preferred by default (unless `-s` is set). In all of these calculations, ambiguous bases (Ns) are considered neither matches nor mismatches. 115 | 116 | Further descriptions of these parameters are provided [below](#stitch-aln). 117 |

118 | 119 | ## Stitch mode 120 | 121 | ### I/O files and options 122 | 123 | #### Input files 124 | 125 | ``` 126 | -1 Input FASTQ file with reads from forward direction 127 | -2 Input FASTQ file with reads from reverse direction 128 | ``` 129 | 130 | NGmerge analyzes unaligned paired-end reads in [FASTQ format](https://en.wikipedia.org/wiki/FASTQ_format). The input files can be gzip-compressed. Multiple sets of input files can be specified, comma-separated (or space-separated, in quotes). 131 | 132 | The input files must list the reads in the same order. The program requires that the paired reads' headers match, at least up to the first space character. 133 | 134 | An input file of interleaved reads can be analyzed by not specifying a `-2` file. Also, it is possible to read from `stdin` using `-`, e.g. `-1 -`. 135 | 136 | Since the merged reads are defined by the 5' ends of the paired reads' alignments (Fig. 1A), one should be wary of quality trimming the reads at those ends. For example, when using a program such as [qualTrim](https://github.com/jsh58/AmpliconTools), one should specify `-3` to ensure that quality trimming occurs only at the 3' ends, prior to using NGmerge. 137 |

138 | 139 | #### Output files and options 140 | 141 | ``` 142 | -o Output FASTQ file: 143 | - in 'stitch' mode (def.), the file of merged reads 144 | ``` 145 | The primary output file in stitch mode is the file of merged reads, in FASTQ format. It is possible to write to `stdout` with `-o -` (see also `-y`, below). 146 |

147 | 148 | ``` 149 | -f <file> FASTQ files for reads that failed stitching 150 | (output as <file>_1.fastq and <file>_2.fastq) 151 | ``` 152 | When specified, all the reads that failed the merging procedure will be written to these output files, exactly as they appeared in the original inputs. 153 |

154 | 155 | ``` 156 | -z/-y Option to gzip (-z) or not (-y) FASTQ output(s) 157 | ``` 158 | By default, all FASTQ output files will be gzip-compressed if and only if the input files are (with multiple sets of input files, the outputs will be compressed if either of the first set of inputs is). Specifying `-z` will guarantee that the outputs are gzip-compressed, whereas `-y` will guarantee that they are not, regardless of the inputs' formats. Note that all gzip-compressed outputs will automatically have '.gz' appended to their filenames, if necessary. 159 |

160 | 161 | ``` 162 | -i Option to produce interleaved FASTQ output(s) 163 | ``` 164 | In stitch mode, this applies only to the optional output from `-f` (above). Instead of two outputs, a single interleaved output will be produced (and no '.fastq' suffix will be appended to the filename). 165 |

166 | 167 | ``` 168 | -l Log file for stitching results of each read pair 169 | ``` 170 | This log file lists the following for each read pair in the input file(s): 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 |
Read: read header, not including @
OverlapLen: total length of the read overlap, including Ns; NA if reads were not merged (and remaining columns are left blank)
StitchedLen: total length of the merged read
Mismatch: fraction of mismatched bases (count of mismatches divided by the overlap length [not including Ns]); must be less than or equal to the `-p` value (see below)
189 |
190 | 191 | ``` 192 | -c Log file for dovetailed reads (adapter sequences) 193 | ``` 194 | This log file lists the following for each read pair whose optimal valid alignment has 3' overhangs: 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 |
Read: read header, not including @
Adapter_R1: 3' overhang of R1 read; - if no overhang
Adapter_R2: 3' overhang of R2 read; - if no overhang
209 | 210 | The columns are labeled 'Adapter' because, if the reads were not trimmed on their 5' ends, these extra sequences should be adapters. If the sequences that appear in the 'Adapter' columns are not consistent, they may be false positives, and one should consider decreasing `-p` or increasing `-e`. 211 |

212 | 213 | ``` 214 | -j Log file for formatted alignments of merged reads 215 | ``` 216 | For each pair of reads that was successfully merged, this log file lists alignments of the reads' sequences and quality scores, along with the resulting merged sequence and quality scores. For example: 217 | ``` 218 | sample_read1.1 219 | seq_R1: CTCACACTCAATCTTTTATCACGAAGTCATGATTGAATCGCGAGTGGTCG 220 | |||| ||||||||||||||| || |||||||||||| 221 | seq_R2: TTTACCACGAAGTCATGATTAAAGCGCGAGTGGTCGGCAGATTGCGATAA 222 | 223 | qual_R1: 1101?B10>F111122BE1B22G/<F/11> 224 | qual_R2: F/F/19B99BFFE;//;//;-----@E;/EA;AA900000:....00:/; 225 | 226 | merged 227 | seq: CTCACACTCAATCTTTTATCACGAAGTCATGATTGAATCGCGAGTGGTCGGCAGATTGCGATAA 228 | qual: 1101?B10>F1111G>HG"GEBFHHHHB>GG>>G?H="DHFFCHGHDDBD0000:....00:/; 229 | ``` 230 |
231 | 232 | ### Alignment parameters 233 | 234 | ``` 235 | -m Minimum overlap of the paired-end reads (def. 20) 236 | ``` 237 | This is the minimum overlap length (in bp) for valid alignments of a pair of reads (see Fig. 2A). Note that ambiguous bases (Ns) do not count toward this minimum length. 238 |

239 | 240 | ``` 241 | -p Mismatches to allow in the overlapped region 242 | (a fraction of the overlap length; def. 0.10) 243 | ``` 244 | This parameter determines how stringent the evaluation of an alignment is. The value must be in the interval [0, 1), with lower values equating to increased stringency. Specifying `-p 0` means that only perfect alignments (with no mismatches) are valid; the default value of 0.10 means that a valid alignment can have at most 10% mismatches (calculated as the number of mismatches divided by the overlap length [not counting Ns]). 245 |

246 | 247 | ``` 248 | -d Option to check for dovetailing (with 3' overhangs) 249 | ``` 250 | When this option is selected, alignments in which a read's 3' end extends past its pair's 5' end will be evaluated, down to a minimum length (see Fig. 2B). By default, such alignments are not even considered. Since the merged read is defined by the original reads' 5' ends, the 3' overhangs are automatically removed. These overhangs, which are typically adapters, can be printed to a separate log file (see `-c`, above). 251 |

252 | 253 | ``` 254 | -e Minimum overlap of dovetailed alignments (def. 50) 255 | ``` 256 | This is the minimum overlap length (in bp) for alignments with 3' overhangs (see Fig. 2B). This value should be set to the length of the absolute shortest DNA fragment that may have been sequenced. Using a value that is too low may result in false positives, especially if the reads contain repetitive sequences. 257 |

258 | 259 | ``` 260 | -s Option to produce shortest stitched read 261 | ``` 262 | Given multiple valid alignments with identical fraction mismatch scores, NGmerge will select the longest stitched read by default. With `-s`, the shortest stitched read will be preferred instead. 263 |

264 | 265 | 266 | ### Quality score profile options 267 | 268 | By default, NGmerge uses hard-coded profiles when determining the quality scores of overlapping bases. There are separate profiles for cases where the R1 base and the R2 base match, and for when they do not match. Those who do not wish to use these profiles have two alternative options: 269 |

270 | 271 | ``` 272 | -w Use given error profile for merged qual scores 273 | ``` 274 | With this option, NGmerge will use the quality score profiles in the provided file. The file must list two matrices of comma- or tab-separated values that follow header lines `#match` and `#mismatch`. One should follow the template of the given [`qual_profile.txt`](https://github.com/harvardinformatics/NGmerge/blob/master/qual_profile.txt) file, which mimics the hard-coded profiles of NGmerge with the quality score range of [0, 40]. 275 |

276 | 277 | ``` 278 | -g Use 'fastq-join' method for merged qual scores 279 | ``` 280 | With this option, NGmerge will use a method similar to that of the program [fastq-join](https://github.com/ExpressionAnalysis/ea-utils/blob/wiki/FastqJoin.md). In cases where the R1 base and R2 base match, the higher quality score is used for the merged base. When they do not match, the merged base's quality score is calculated as the difference in the two quality scores. 281 |

282 | 283 | 284 | ## Adapter-removal mode 285 | 286 | ``` 287 | -a Use 'adapter-removal' mode (also sets -d option) 288 | ``` 289 | This option **must** be specified for NGmerge to run in adapter-removal mode. As indicated, it automatically sets the `-d` option to check for dovetailed alignments. 290 |

291 | 292 | ### I/O files and options 293 | 294 | #### Input files 295 | 296 | The formatting of the input files is described [above](#stitch-input). 297 |

298 | 299 | #### Output files and options 300 | 301 | ``` 302 | -o <file> Output FASTQ files: 303 | - in 'adapter-removal' mode (-a), the output files 304 | will be <file>_1.fastq and <file>_2.fastq 305 | ``` 306 | In adapter-removal mode, all reads are printed to the output files. The only modification is the clipping of 3' overhangs from reads whose alignments have such overhangs. 307 |

308 | 309 | ``` 310 | -i Option to produce interleaved FASTQ output(s) 311 | ``` 312 | With this option, instead of two outputs, a single interleaved output will be produced (and no '.fastq' suffix will be appended to the filename). 313 |

314 | 315 | ``` 316 | -z/-y Option to gzip (-z) or not (-y) FASTQ output(s) 317 | ``` 318 | These options are described [above](#stitch-output). 319 |

320 | 321 | ``` 322 | -c Log file for dovetailed reads (adapter sequences) 323 | ``` 324 | This log file is described [above](#stitch-output). 325 |

326 | 327 | In adapter-removal mode, the following files **cannot** be produced: 328 | ``` 329 | -f <file> FASTQ files for reads that failed stitching 330 | (output as <file>_1.fastq and <file>_2.fastq) 331 | -l <file> Log file for stitching results of each read pair 332 | -j <file> Log file for formatted alignments of merged reads 333 | ``` 334 |
335 | 336 | ### Alignment parameters 337 | 338 | These parameters are described [above](#stitch-aln). 339 | 340 | As noted previously, the `-d` option is automatically set in adapter-removal mode. 341 |

342 | 343 | ## Miscellaneous 344 | 345 | ``` 346 | -n Number of threads to use (def. 1) 347 | ``` 348 | To reduce computational time, one can run NGmerge across multiple cores via this option. Note that gzip compression and decompression are not parallelized, so the reduction in run time is not linear in the number of threads. 349 |

350 | 351 | ``` 352 | -q FASTQ quality offset (def. 33) 353 | -u Maximum input quality score (0-based; def. 40) 354 | ``` 355 | These two parameters set the range of quality scores for the input FASTQ files. The default values match the [Sanger format](https://en.wikipedia.org/wiki/FASTQ_format#Encoding), with quality scores in the range [0, 40] spanning ASCII values [33, 73]. 356 |

357 | 358 | ``` 359 | -b Option to print mismatches only to -j log file 360 | ``` 361 | Instead of printing full alignments, the log file specified by `-j` will list the details of the mismatches: the read header, position, and the base and quality score for both the R1 and R2 reads. This is useful for calculating separate error rates for matches and mismatches. 362 |

363 | 364 | 365 | Other options: 366 | ``` 367 | -v/--verbose Option to print status updates/counts to stderr 368 | -h/--help Print the usage message and exit 369 | -V/--version Print the version and exit 370 | ``` 371 |
372 | 373 | Other notes: 374 | 375 | * NGmerge cannot gzip-compress multiple output files that are `stdout`. For example, the following will produce an error: 376 | * `-o - -a` without `-i` 377 | * `-f -` without `-a` and without `-i` 378 |

379 | 380 | ## Contact 381 | 382 | NGmerge 383 | 384 | Copyright © 2017 John M. Gaspar (jsh58@wildcats.unh.edu) 385 | 386 | -------------------------------------------------------------------------------- /UserGuide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harvardinformatics/NGmerge/224fc6a0066024e05965d101d998704815cb4c41/UserGuide.pdf -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | NGmerge 2 | 3 | Copyright (C) 2017 John M. Gaspar (jsh58@wildcats.unh.edu) 4 | 5 | Version 0.3 - May 8, 2018 6 | * Update Makefile with bioconda friendly changes 7 | * NGmerge -h returns 0 8 | 9 | Version 0.2 - Nov 14, 2017 10 | * Fix data race in alignments output 11 | * Combine merged sequence functions for default and fastq-join modes 12 | 13 | Version 0.1 - Oct 11, 2017 14 | * First public release on GitHub 15 | (based on stitch in AmpliconTools [github.com/jsh58/AmpliconTools] 16 | and stitch v0.8 [github.com/jsh58/stitch]) 17 | -------------------------------------------------------------------------------- /figures/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harvardinformatics/NGmerge/224fc6a0066024e05965d101d998704815cb4c41/figures/figure1.png -------------------------------------------------------------------------------- /figures/figure2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harvardinformatics/NGmerge/224fc6a0066024e05965d101d998704815cb4c41/figures/figure2.png -------------------------------------------------------------------------------- /qual_profile.txt: -------------------------------------------------------------------------------- 1 | #match 2 | 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 27, 27, 28, 
28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40 3 | 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40 4 | 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40 5 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40, 40 6 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39, 39, 40 7 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 29, 30, 30, 31, 31, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 38, 38, 38, 39, 39, 40 8 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 27, 27, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 37, 37, 38, 38, 38, 39, 39 9 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 29, 30, 30, 31, 31, 32, 33, 33, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 39, 39 10 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 37, 37, 38, 38, 39, 39 11 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39 12 | 26, 26, 26, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 31, 31, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39 13 | 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, 29, 30, 30, 31, 31, 32, 32, 33, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39 14 | 27, 27, 27, 26, 26, 26, 26, 25, 25, 25, 26, 
26, 26, 26, 27, 27, 27, 28, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 33, 34, 34, 35, 35, 36, 36, 36, 37, 37, 38, 39, 39 15 | 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 36, 36, 37, 37, 38, 39, 39 16 | 28, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 27, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 36, 36, 36, 37, 37, 38, 39, 39 17 | 29, 29, 29, 28, 28, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39 18 | 29, 29, 29, 29, 28, 28, 28, 28, 27, 27, 27, 28, 28, 28, 28, 29, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39 19 | 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 38, 38, 39, 39 20 | 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 37, 37, 38, 38, 39, 39 21 | 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 37, 37, 37, 38, 38, 39, 39 22 | 32, 32, 32, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 38, 38, 39, 39 23 | 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 37, 37, 38, 38, 38, 39, 40 24 | 33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 40 25 | 34, 34, 34, 33, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 38, 38, 39, 39, 40 26 | 34, 34, 34, 34, 33, 33, 33, 
32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 39, 39, 40 27 | 35, 35, 35, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 39, 39, 39, 40 28 | 35, 35, 35, 35, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40 29 | 36, 36, 36, 35, 35, 35, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40 30 | 36, 36, 36, 36, 35, 35, 35, 35, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40 31 | 37, 37, 37, 36, 36, 36, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40 32 | 37, 37, 37, 37, 36, 36, 36, 36, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40 33 | 38, 38, 38, 37, 37, 37, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 40, 40 34 | 38, 38, 38, 38, 37, 37, 37, 37, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 40, 40 35 | 39, 39, 39, 38, 38, 37, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 40, 40 36 | 39, 39, 39, 39, 38, 38, 38, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40 37 | 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40 38 | 40, 40, 40, 
39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40 39 | 40, 40, 40, 40, 39, 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40 40 | 40, 40, 40, 40, 40, 40, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40, 40 41 | 40, 40, 40, 40, 40, 40, 40, 40, 40, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40 42 | 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40 43 | 44 | #mismatch 45 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 7, 10, 12, 13, 15, 16, 18, 19, 20, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39 46 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 7, 10, 12, 13, 15, 16, 18, 19, 20, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39 47 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 7, 10, 12, 13, 15, 16, 18, 19, 20, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39 48 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 6, 9, 11, 13, 14, 16, 17, 18, 20, 21, 22, 24, 26, 29, 31, 33, 35, 37, 38 49 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 5, 8, 11, 12, 14, 15, 17, 18, 19, 20, 22, 23, 25, 28, 30, 32, 34, 36, 38 50 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 4, 8, 10, 12, 13, 15, 16, 17, 19, 20, 21, 23, 25, 27, 29, 31, 33, 35, 37 51 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 7, 9, 11, 13, 14, 16, 17, 18, 19, 21, 22, 24, 26, 28, 31, 33, 35, 37 52 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 6, 9, 11, 12, 14, 15, 16, 18, 19, 20, 21, 23, 25, 27, 30, 32, 34, 36 53 
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 5, 8, 10, 12, 13, 14, 16, 17, 18, 19, 21, 22, 24, 26, 29, 31, 34, 36 54 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 4, 7, 9, 11, 12, 14, 15, 16, 18, 19, 20, 22, 23, 25, 28, 31, 33, 35 55 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 6, 8, 10, 12, 13, 15, 16, 17, 18, 20, 21, 23, 25, 27, 30, 32, 34 56 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 5, 8, 10, 11, 13, 14, 15, 17, 18, 19, 20, 22, 24, 26, 29, 31, 34 57 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 7, 9, 11, 12, 13, 15, 16, 17, 18, 20, 21, 23, 25, 28, 30, 33 58 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 6, 8, 10, 11, 13, 14, 15, 17, 18, 19, 21, 22, 24, 27, 30, 32 59 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 5, 7, 9, 11, 12, 13, 15, 16, 17, 18, 20, 21, 23, 26, 28, 31 60 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 6, 8, 10, 11, 13, 14, 15, 17, 18, 19, 21, 22, 24, 27, 30 61 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 5, 7, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 23, 26, 29 62 | 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 5, 7, 9, 11, 12, 13, 15, 16, 18, 19, 20, 22, 24, 27 63 | 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 4, 7, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, 26 64 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 7, 9, 11, 12, 14, 16, 17, 19, 20, 22, 25 65 | 7, 7, 7, 6, 5, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 7, 9, 11, 13, 15, 16, 18, 19, 21, 23 66 | 9, 9, 9, 8, 8, 7, 6, 6, 4, 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 4, 7, 10, 12, 14, 15, 17, 19, 20, 22 67 | 11, 11, 11, 10, 10, 9, 9, 8, 7, 6, 6, 4, 2, 2, 
2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 5, 8, 10, 13, 14, 16, 18, 19, 21 68 | 12, 12, 12, 12, 12, 11, 10, 10, 9, 9, 8, 7, 6, 5, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 7, 9, 11, 13, 15, 17, 19, 20 69 | 14, 14, 14, 14, 13, 13, 12, 12, 11, 10, 10, 9, 8, 7, 6, 5, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 4, 8, 10, 12, 14, 16, 18, 20 70 | 15, 15, 15, 15, 14, 14, 14, 13, 13, 12, 12, 11, 10, 9, 9, 8, 6, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 6, 9, 11, 13, 15, 17, 19 71 | 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 12, 11, 11, 10, 8, 6, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 4, 8, 10, 13, 14, 16, 18 72 | 18, 18, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 11, 10, 8, 6, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 7, 10, 12, 14, 15, 17 73 | 19, 19, 19, 19, 18, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 12, 10, 8, 5, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 7, 9, 11, 13, 14, 16 74 | 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 13, 12, 10, 8, 5, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 6, 8, 10, 12, 14, 15 75 | 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 13, 12, 10, 8, 6, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 3, 6, 8, 10, 11, 13, 14 76 | 23, 23, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 15, 14, 12, 11, 9, 7, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 6, 8, 9, 11, 12, 13 77 | 25, 25, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 19, 18, 17, 16, 15, 14, 13, 11, 10, 8, 7, 6, 4, 2, 2, 2, 2, 2, 2, 3, 5, 6, 7, 9, 10, 11, 13 78 | 28, 28, 28, 27, 26, 25, 25, 24, 23, 23, 22, 21, 21, 20, 20, 19, 19, 18, 17, 16, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 5, 4, 4, 5, 6, 6, 7, 9, 10, 11, 12 79 | 30, 30, 30, 29, 28, 28, 27, 26, 25, 25, 24, 23, 22, 22, 21, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 8, 9, 10, 11, 12 80 | 32, 32, 32, 31, 30, 30, 29, 29, 28, 27, 26, 25, 24, 24, 23, 22, 21, 21, 20, 19, 18, 17, 16, 15, 15, 14, 13, 12, 
12, 11, 10, 10, 9, 9, 9, 9, 9, 9, 10, 11, 11 81 | 33, 33, 33, 33, 32, 32, 31, 31, 30, 29, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 10, 10, 11, 11 82 | 35, 35, 35, 35, 34, 34, 33, 33, 32, 32, 31, 30, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11 83 | 37, 37, 37, 36, 36, 36, 35, 35, 34, 34, 33, 33, 32, 31, 31, 30, 29, 28, 26, 25, 24, 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11 84 | 39, 39, 39, 38, 38, 37, 37, 37, 36, 36, 35, 35, 34, 34, 33, 32, 32, 31, 30, 29, 27, 26, 25, 24, 23, 22, 21, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 11 85 | 40, 40, 40, 40, 40, 39, 39, 39, 38, 38, 37, 37, 36, 36, 35, 35, 34, 33, 33, 32, 31, 30, 29, 28, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 17, 16, 15, 14, 13, 13, 12 86 | --------------------------------------------------------------------------------