├── test.fastq.gz ├── Makefile ├── test.fasta ├── test_old_casava.fastq ├── test_casava_18.fastq ├── test.fastq ├── README.md ├── fastool.c └── kseq.h /test.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fstrozzi/Fastool/HEAD/test.fastq.gz -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | CFLAGS = -O2 -std=c99 -Werror 3 | 4 | all:kseq.h fastool.c 5 | $(CC) $(CFLAGS) fastool.c -o fastool 6 | 7 | clean: 8 | rm -f *.o fastool 9 | -------------------------------------------------------------------------------- /test.fasta: -------------------------------------------------------------------------------- 1 | >seq1:11:455 2 | AACCTGATTGGGATCACTATAGCTTTGAAGGCGTGCGCCAATTTCGTCGAAAGCTTGTTGCCAGCTTAATGGCTTGTAACAGTCGCTGACGGCATCATATTCAAAGGCTGAGTGAGTCGCCCGCAGCCTCAGCTGA 3 | >seq2:34:77 4 | GTTGCCGGATATTCCTGAATGGTGACCTGCAGCGTTAACTGCTTATCATCACGCATCACTACTACAGGGATCACCGAACCAGGGCGAATTTCCGCCACCTGATCCATCGTCTCCAGGCTGAGGA 5 | -------------------------------------------------------------------------------- /test_old_casava.fastq: -------------------------------------------------------------------------------- 1 | @HWI-ST896:156:D0JFYACXX:5:1101:1652:2132/1 2 | CCTGCAGTNGTCAAAATGATGACCCTCCCACGCNTAGCTGNCCTTCACATGCGGCCGTGCAGCCTCTACATTATCTACTCCAGTTTCCCACAGGTTTGAGA 3 | + 4 | #1=BDDEFHHHHHIIGGIIIIIIDHHIDFHIIIIIIIIIIIIIIIIIIIIIIDIIIEHHAGGIFHEE:?@D?##,,5;??CCDDBDDD>BB@A>CDBBBDB 5 | @HWI-ST896:156:D0JFYACXX:5:1101:1563:2186/2 6 | TCTTTGCAGTCTTACAACGCAACTACTCCCGTGGACGATACCATTGGGAAGGATGCGTTACGGTCTACCTTGCTTATCCTCTAAGAGCGTGCCCGGAAGCA 7 | + 8 | CCCFFFFFHHHHHJIJIJJJIJJJJJJIIIJJJJIJJIIJJGIIHHFFFFDDDDDDDDEDDDDDDCDCEECDDDDDDDDDDDCEDDDDDDCCDDDD?CCC4 9 | -------------------------------------------------------------------------------- /test_casava_18.fastq: 
-------------------------------------------------------------------------------- 1 | @HWI-ST896:156:D0JFYACXX:5:1101:1652:2132 1:N:0:GATCAG 2 | CCTGCAGTNGTCAAAATGATGACCCTCCCACGCNTAGCTGNCCTTCACATGCGGCCGTGCAGCCTCTACATTATCTACTCCAGTTTCCCACAGGTTTGAGA 3 | + 4 | #1=BDDEFHHHHHIIGGIIIIIIDHHIDFHIIIIIIIIIIIIIIIIIIIIIIDIIIEHHAGGIFHEE:?@D?##,,5;??CCDDBDDD>BB@A>CDBBBDB 5 | @HWI-ST896:156:D0JFYACXX:5:1101:1563:2186 2:N:0:GATCAG 6 | TCTTTGCAGTCTTACAACGCAACTACTCCCGTGGACGATACCATTGGGAAGGATGCGTTACGGTCTACCTTGCTTATCCTCTAAGAGCGTGCCCGGAAGCA 7 | + 8 | CCCFFFFFHHHHHJIJIJJJIJJJJJJIIIJJJJIJJIIJJGIIHHFFFFDDDDDDDDEDDDDDDCDCEECDDDDDDDDDDDCEDDDDDDCCDDDD?CCC4 9 | -------------------------------------------------------------------------------- /test.fastq: -------------------------------------------------------------------------------- 1 | @seq1:11:455 2 | AACCTGATTGGGATCACTATAGCTTTGAAGGCGTGCGCCAATTTCGTCGAAAGCTTGTTGCCAGCTTAATGGCTTGTAACAGTCGCTGACGGCATCATATTCAAAGGCTGAGTGAGTCGCCCGCAGCCTCAGCTGA 3 | + 4 | >>>>>>>>>>>;=;;<<98>>>==<89962/2==>>>>>>>>><>>>>>>>9=>>>>>>>>>>>>>>>>>>>>>>>;>=>>>>>>>>>>>>>>>>99;+,((55*6.8;;<<>8888.,+,&(.,35*6;88446: 5 | @seq2:34:77 6 | GTTGCCGGATATTCCTGAATGGTGACCTGCAGCGTTAACTGCTTATCATCACGCATCACTACTACAGGGATCACCGAACCAGGGCGAATTTCCGCCACCTGATCCATCGTCTCCAGGCTGAGGA 7 | + 8 | >>>>>>>>>>>>>>>>>>>6615=7959>>=>>>>>>>>>>>>>>>>>>>>>>>;:<=>>>>:><;;=8<4=>>>;>>>>568.44943:09<9<<==<;195::39<:<<9=;,,(,9::289 9 | @seq3:56:189 10 | gttgccggatattcctgaatggtgacctgcagcgttaactgcttatcatcacgcatcactactacagggatcaccgaaccagggcgaatttccgccacctgatccatcgtctccaggctgagga 11 | + 12 | >>>>>>>>>>>>>>>>>>>6615=7959>>=>>>>>>>>>>>>>>>>>>>>>>>;:<=>>>>:><;;=8<4=>>>;>>>>568.44943:09<9<<==<;195::39<:<<9=;,,(,9::289 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Fastool 2 | ======= 3 | 4 | A simple and quick tool to read huge FastQ and FastA files (both normal and gzipped) and manipulate them. 
5 | 6 | It makes use of the KSeq library (http://lh3lh3.users.sourceforge.net/kseq.shtml) for fast access to FastQ/A files. 7 | 8 | Installation 9 | ------------ 10 | 11 | Clone this repository and run make. 12 | 13 | Usage 14 | ----- 15 | 16 | Usage: fastool (--rev) (--append [string_to_append_to_header]) (--to-fasta) (--illumina-trinity) sequences_1.fastq/a sequences_2.fastq/a ... 17 | 18 | --to-fasta (optional): convert FastQ files to FastA format. 19 | 20 | --rev (optional): reverse complement all the sequences in the dataset (both FastQ and FastA). 21 | 22 | --append (optional): add a string at the end of each sequence header (both FastQ and FastA). 23 | 24 | --illumina-trinity (optional): directly converts Casava 1.8+ FastQ ID format to Trinity Fasta input format (appending /1 and /2 for PE reads) 25 | 26 | Examples 27 | -------- 28 | 29 | FastQ conversion for Trinity pipeline 30 | 31 | fastool --illumina-trinity sequences.fastq > sequences.fasta 32 | 33 | FastQ to FastA conversion 34 | 35 | fastool --to-fasta sequences.fastq > sequences.fasta 36 | 37 | Return the reverse complement 38 | 39 | fastool --rev sequences.fastq > reverse_complement.fastq 40 | 41 | Append '/1' to the end of the sequence ID 42 | 43 | fastool --append /1 sequences.fasta > forward_sequences.fasta 44 | 45 | Append '/2' to the end of the sequence ID and return the reverse complement 46 | 47 | fastool --append /2 --rev sequences.fasta > reverse_sequences.fasta 48 | 49 | Can process more than one file 50 | 51 | fastool --to-fasta --append /1 sequences1.fastq sequences2.fastq sequences3.fastq > all_sequences.fasta 52 | 53 | Can be used with pipes, for example to read from compressed files 54 | 55 | zcat sequences.fastq.gz | fastool --to-fasta > sequences.fasta 56 | 57 | License 58 | ------- 59 | 60 | Permission is hereby granted, free of charge, to any person obtaining 61 | a copy of this software and associated documentation files (the 62 | "Software"), to deal in the Software without
restriction, including 63 | without limitation the rights to use, copy, modify, merge, publish, 64 | distribute, sublicense, and/or sell copies of the Software, and to 65 | permit persons to whom the Software is furnished to do so, subject to 66 | the following conditions: 67 | 68 | The above copyright notice and this permission notice shall be 69 | included in all copies or substantial portions of the Software. 70 | 71 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 72 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 73 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 74 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 75 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 76 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 77 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 78 | SOFTWARE. 79 | 80 | Copyright 81 | --------- 82 | 83 | Copyright (c) 2012 Francesco Strozzi 84 | 85 | -------------------------------------------------------------------------------- /fastool.c: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2012 Francesco Strozzi 3 | 4 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 5 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 6 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 7 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 8 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 9 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 10 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
11 | 12 | */ 13 | 14 | #define _POSIX_C_SOURCE 1 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "kseq.h" 20 | 21 | KSEQ_INIT(int, read) 22 | 23 | int print_seq(int ilmn_trinity, char *append, int to_fasta, char *s[]) { 24 | if (ilmn_trinity && (s[0][strlen(s[0])-2] != '/') && s[3] != NULL) { 25 | printf(">%s/%c\n", s[0], s[3][0]); 26 | } 27 | else if (append == NULL) { 28 | printf(">%s\n", s[0]); 29 | } 30 | else { 31 | printf(">%s%s\n", s[0], append); 32 | } 33 | printf("%s\n", s[1]); 34 | if (!to_fasta && s[2] != NULL && !ilmn_trinity) { 35 | printf("+\n%s\n",s[2]); 36 | } 37 | return 0; 38 | } 39 | 40 | 41 | int process_input(FILE* stream, int rev_comp, char *string, int to_fa, int ilmn_trin) { 42 | 43 | kseq_t *seq; 44 | seq = kseq_init(fileno(stream)); 45 | int count = 0; 46 | int res = 0; 47 | if (rev_comp) { 48 | while ((res = kseq_read(seq)) >= 0) { 49 | 50 | char *sequence_to_print[4] = {[2] = NULL}; 51 | 52 | char quality[seq->qual.l]; 53 | if (seq->qual.s) { 54 | for (int i = 0; i < seq->qual.l; ++i) 55 | { 56 | quality[i] = *(seq->qual.s + seq->qual.l -1 -i); 57 | } 58 | quality[seq->qual.l] = '\0'; 59 | sequence_to_print[2] = quality; 60 | } 61 | 62 | char rev_seq[seq->seq.l]; 63 | for(int i = 0; i < seq->seq.l; ++i) { 64 | if (*(seq->seq.s + seq->seq.l-1 - i) == 'A' || *(seq->seq.s + seq->seq.l-1 - i) == 'a') rev_seq[i] = 'T'; 65 | else if (*(seq->seq.s + seq->seq.l-1 - i) == 'C' || *(seq->seq.s + seq->seq.l-1 - i) == 'c') rev_seq[i] = 'G'; 66 | else if (*(seq->seq.s + seq->seq.l-1 - i) == 'T' || *(seq->seq.s + seq->seq.l-1 - i) == 't') rev_seq[i] = 'A'; 67 | else if (*(seq->seq.s + seq->seq.l-1 - i) == 'G' || *(seq->seq.s + seq->seq.l-1 - i) == 'g') rev_seq[i] = 'C'; 68 | else if (*(seq->seq.s + seq->seq.l-1 - i) == 'N' || *(seq->seq.s + seq->seq.l-1 - i) == 'n') rev_seq[i] = 'N'; 69 | else if (*(seq->seq.s + seq->seq.l-1 - i) == 'U' || *(seq->seq.s + seq->seq.l-1 - i) == 'u') rev_seq[i] = 'A'; 70 | } 71 | rev_seq[seq->seq.l] = 
'\0'; 72 | sequence_to_print[0] = seq->name.s; 73 | sequence_to_print[1] = rev_seq; 74 | 75 | if (seq->comment.s) { 76 | sequence_to_print[3] = seq->comment.s; 77 | } 78 | else { 79 | sequence_to_print[3] = NULL; 80 | } 81 | 82 | print_seq(ilmn_trin, string, to_fa, sequence_to_print ); 83 | count++; 84 | } 85 | 86 | 87 | } 88 | else { 89 | while ((res = kseq_read(seq)) >= 0) { 90 | char *sequence_to_print[4]; 91 | sequence_to_print[0] = seq->name.s; 92 | sequence_to_print[1] = seq->seq.s; 93 | for (int i=0;i < strlen(sequence_to_print[1]); i++) { 94 | sequence_to_print[1][i] = toupper(sequence_to_print[1][i]); 95 | } 96 | if (seq->qual.s) sequence_to_print[2] = seq->qual.s; 97 | if (seq->comment.s) { 98 | sequence_to_print[3] = seq->comment.s; 99 | } 100 | else { 101 | sequence_to_print[3] = NULL; 102 | } 103 | print_seq(ilmn_trin, string, to_fa, sequence_to_print); 104 | count++; 105 | } 106 | } 107 | 108 | kseq_destroy(seq); 109 | fclose(stream); 110 | fprintf(stderr,"Sequences parsed: %d\n",count); 111 | if (res == -1) { 112 | exit(0); 113 | } 114 | else { 115 | fprintf (stderr,"fastool: parsing error, truncated sequence and/or quality detected!\n"); 116 | exit(1); 117 | } 118 | 119 | } 120 | 121 | void print_help(char *command_line) { 122 | printf("Usage: %s (--rev) (--append [string_to_append_to_header]) (--to-fasta) (--illumina-trinity) sequences_1.fastq/a sequences_2.fastq/a ... 
\n",command_line); 123 | } 124 | 125 | int main(int argc, char *argv[]) 126 | { 127 | 128 | int reverse_complement = 0; 129 | char *string_to_append = NULL; 130 | int to_fasta = 0; 131 | int read_from_file = 0; 132 | int illumina_trinity = 0; 133 | 134 | if(argc == 1) { 135 | print_help(argv[0]); 136 | exit(1); 137 | } 138 | 139 | for(int i = 1; i < argc; ++i) 140 | { 141 | if(strcmp(argv[i],"--rev") == 0) reverse_complement = 1; 142 | else if (strcmp(argv[1],"-h") == 0) { 143 | print_help(argv[0]); 144 | exit(1); 145 | } 146 | else if(strcmp(argv[i],"--to-fasta") == 0) to_fasta = 1; 147 | else if((strcmp(argv[i],"--append") == 0)) { 148 | if (i+1 == argc) { 149 | printf("String to append is missing!\n"); 150 | exit(1); 151 | } 152 | else { 153 | string_to_append = argv[i+1]; 154 | i++; 155 | } 156 | } 157 | else if((strcmp(argv[i],"--illumina-trinity") == 0)) illumina_trinity = 1; 158 | else { 159 | if (illumina_trinity && string_to_append) { 160 | printf("You are using both --append and --illumina-trinity options. You can only provide one or the other.\n"); 161 | exit(1); 162 | } 163 | read_from_file = 1; 164 | FILE* fp; 165 | if (!(fp = fopen(argv[i],"r"))) { 166 | printf("No %s file found!\n", argv[i]); 167 | exit(1); 168 | } 169 | process_input(fp,reverse_complement, string_to_append, to_fasta, illumina_trinity); 170 | } 171 | } 172 | if (!read_from_file) 173 | { 174 | FILE* fp_stdin; 175 | fp_stdin = fdopen(fileno(stdin), "rb"); 176 | process_input(fp_stdin, reverse_complement, string_to_append, to_fasta, illumina_trinity); 177 | } 178 | 179 | } 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /kseq.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008 Genome Research Ltd (GRL). 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Contact: Heng Li */ 27 | 28 | /* Last Modified: 12APR2009 */ 29 | 30 | #ifndef AC_KSEQ_H 31 | #define AC_KSEQ_H 32 | 33 | #include 34 | #include 35 | #include 36 | 37 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r 38 | #define KS_SEP_TAB 1 // isspace() && !' 
' 39 | #define KS_SEP_MAX 1 40 | 41 | #define __KS_TYPE(type_t) \ 42 | typedef struct __kstream_t { \ 43 | char *buf; \ 44 | int begin, end, is_eof; \ 45 | type_t f; \ 46 | } kstream_t; 47 | 48 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) 49 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) 50 | 51 | #define __KS_BASIC(type_t, __bufsize) \ 52 | static inline kstream_t *ks_init(type_t f) \ 53 | { \ 54 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ 55 | ks->f = f; \ 56 | ks->buf = (char*)malloc(__bufsize); \ 57 | return ks; \ 58 | } \ 59 | static inline void ks_destroy(kstream_t *ks) \ 60 | { \ 61 | if (ks) { \ 62 | free(ks->buf); \ 63 | free(ks); \ 64 | } \ 65 | } 66 | 67 | #define __KS_GETC(__read, __bufsize) \ 68 | static inline int ks_getc(kstream_t *ks) \ 69 | { \ 70 | if (ks->is_eof && ks->begin >= ks->end) return -1; \ 71 | if (ks->begin >= ks->end) { \ 72 | ks->begin = 0; \ 73 | ks->end = __read(ks->f, ks->buf, __bufsize); \ 74 | if (ks->end < __bufsize) ks->is_eof = 1; \ 75 | if (ks->end == 0) return -1; \ 76 | } \ 77 | return (int)ks->buf[ks->begin++]; \ 78 | } 79 | 80 | #ifndef KSTRING_T 81 | #define KSTRING_T kstring_t 82 | typedef struct __kstring_t { 83 | size_t l, m; 84 | char *s; 85 | } kstring_t; 86 | #endif 87 | 88 | #ifndef kroundup32 89 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 90 | #endif 91 | 92 | #define __KS_GETUNTIL(__read, __bufsize) \ 93 | static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ 94 | { \ 95 | if (dret) *dret = 0; \ 96 | str->l = 0; \ 97 | if (ks->begin >= ks->end && ks->is_eof) return -1; \ 98 | for (;;) { \ 99 | int i; \ 100 | if (ks->begin >= ks->end) { \ 101 | if (!ks->is_eof) { \ 102 | ks->begin = 0; \ 103 | ks->end = __read(ks->f, ks->buf, __bufsize); \ 104 | if (ks->end < __bufsize) ks->is_eof = 1; \ 105 | if (ks->end == 0) break; \ 106 | } else break; \ 107 | } \ 108 | if 
(delimiter > KS_SEP_MAX) { \ 109 | for (i = ks->begin; i < ks->end; ++i) \ 110 | if (ks->buf[i] == delimiter) break; \ 111 | } else if (delimiter == KS_SEP_SPACE) { \ 112 | for (i = ks->begin; i < ks->end; ++i) \ 113 | if (isspace(ks->buf[i])) break; \ 114 | } else if (delimiter == KS_SEP_TAB) { \ 115 | for (i = ks->begin; i < ks->end; ++i) \ 116 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ 117 | } else i = 0; /* never come to here! */ \ 118 | if (str->m - str->l < i - ks->begin + 1) { \ 119 | str->m = str->l + (i - ks->begin) + 1; \ 120 | kroundup32(str->m); \ 121 | str->s = (char*)realloc(str->s, str->m); \ 122 | } \ 123 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ 124 | str->l = str->l + (i - ks->begin); \ 125 | ks->begin = i + 1; \ 126 | if (i < ks->end) { \ 127 | if (dret) *dret = ks->buf[i]; \ 128 | break; \ 129 | } \ 130 | } \ 131 | if (str->l == 0) { \ 132 | str->m = 1; \ 133 | str->s = (char*)calloc(1, 1); \ 134 | } \ 135 | str->s[str->l] = '\0'; \ 136 | return str->l; \ 137 | } 138 | 139 | #define KSTREAM_INIT(type_t, __read, __bufsize) \ 140 | __KS_TYPE(type_t) \ 141 | __KS_BASIC(type_t, __bufsize) \ 142 | __KS_GETC(__read, __bufsize) \ 143 | __KS_GETUNTIL(__read, __bufsize) 144 | 145 | #define __KSEQ_BASIC(type_t) \ 146 | static inline kseq_t *kseq_init(type_t fd) \ 147 | { \ 148 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ 149 | s->f = ks_init(fd); \ 150 | return s; \ 151 | } \ 152 | static inline void kseq_rewind(kseq_t *ks) \ 153 | { \ 154 | ks->last_char = 0; \ 155 | ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ 156 | } \ 157 | static inline void kseq_destroy(kseq_t *ks) \ 158 | { \ 159 | if (!ks) return; \ 160 | free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ 161 | ks_destroy(ks->f); \ 162 | free(ks); \ 163 | } 164 | 165 | /* Return value: 166 | >=0 length of the sequence (normal) 167 | -1 end-of-file 168 | -2 truncated quality string 169 | */ 170 | #define __KSEQ_READ \ 171 | 
static int kseq_read(kseq_t *seq) \ 172 | { \ 173 | int c; \ 174 | kstream_t *ks = seq->f; \ 175 | if (seq->last_char == 0) { /* then jump to the next header line */ \ 176 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ 177 | if (c == -1) return -1; /* end of file */ \ 178 | seq->last_char = c; \ 179 | } /* the first header char has been read */ \ 180 | seq->comment.l = seq->seq.l = seq->qual.l = 0; \ 181 | if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ 182 | if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ 183 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ 184 | if (isgraph(c)) { /* printable non-space character */ \ 185 | if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ 186 | seq->seq.m = seq->seq.l + 2; \ 187 | kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ 188 | seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ 189 | } \ 190 | seq->seq.s[seq->seq.l++] = (char)c; \ 191 | } \ 192 | } \ 193 | if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ 194 | seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ 195 | if (c != '+') return seq->seq.l; /* FASTA */ \ 196 | if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ 197 | seq->qual.m = seq->seq.m; \ 198 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ 199 | } \ 200 | while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ 201 | if (c == -1) return -2; /* we should not stop here */ \ 202 | while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ 203 | if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ 204 | seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ 205 | seq->last_char = 0; /* we have not come to the next header line */ \ 206 | if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ 207 | return seq->seq.l; \ 208 | } 209 | 210 | #define __KSEQ_TYPE(type_t) \ 211 | 
typedef struct { \ 212 | kstring_t name, comment, seq, qual; \ 213 | int last_char; \ 214 | kstream_t *f; \ 215 | } kseq_t; 216 | 217 | #define KSEQ_INIT(type_t, __read) \ 218 | KSTREAM_INIT(type_t, __read, 4096) \ 219 | __KSEQ_TYPE(type_t) \ 220 | __KSEQ_BASIC(type_t) \ 221 | __KSEQ_READ 222 | 223 | #endif 224 | --------------------------------------------------------------------------------