├── .gitignore ├── data └── hs37N.bed.gz ├── Makefile ├── README.md ├── lorenz.c ├── main.c ├── kvec.h ├── faidx.h ├── kthread.c ├── razf.h ├── kdq.h ├── bedidx.c ├── ldup.c ├── hts.h ├── kstring.h ├── cv.c ├── ksort.h ├── count.c ├── sam.h ├── bgzf.h ├── sv.c ├── group.c ├── kseq.h ├── plp-diff.js ├── faidx.c ├── cnv.c ├── khash.h ├── plp-joint.js └── trim.c /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.a 3 | .*.swp 4 | Makefile.bak 5 | lt-trim 6 | lt-group 7 | -------------------------------------------------------------------------------- /data/hs37N.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lh3/lianti/HEAD/data/hs37N.bed.gz -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC=gcc 2 | CFLAGS=-g -Wall -O2 -Wno-unused-function 3 | PROG=lianti 4 | OBJS=kthread.o bgzf.o razf.o hts.o bedidx.o faidx.o sam.o \ 5 | trim.o ldup.o group.o count.o cnv.o sv.o pileup.o lorenz.o cv.o \ 6 | main.o 7 | 8 | .c.o: 9 | $(CC) -c $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $< -o $@ 10 | 11 | all:$(PROG) 12 | 13 | lianti:$(OBJS) 14 | $(CC) $(CFLAGS) $(OBJS) -o $@ -lz -lm -lpthread 15 | 16 | bgzf.o:bgzf.c bgzf.h khash.h 17 | $(CC) -c $(CFLAGS) $(DFLAGS) -DBGZF_MT $(INCLUDES) bgzf.c -o $@ 18 | 19 | ldup.o:ldup.c sam.h bgzf.h hts.h kdq.h khash.h 20 | $(CC) -c $(CFLAGS) $(DFLAGS) -DBGZF_MT $(INCLUDES) ldup.c -o $@ 21 | 22 | clean: 23 | rm -fr gmon.out *.o ext/*.o a.out *~ *.a *.dSYM session* $(PROG) 24 | 25 | depend: 26 | (LC_ALL=C; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c) 27 | 28 | # DO NOT DELETE 29 | 30 | bedidx.o: ksort.h kseq.h khash.h 31 | bgzf.o: bgzf.h 32 | break.o: sam.h bgzf.h hts.h 33 | cnv.o: kvec.h kseq.h ksort.h 34 | count.o: kvec.h kseq.h kdq.h 35 | faidx.o: faidx.h khash.h razf.h 36 | group.o: 
sam.h bgzf.h hts.h kdq.h kvec.h ksort.h 37 | hts.o: bgzf.h hts.h kseq.h khash.h ksort.h 38 | ldup.o: sam.h bgzf.h hts.h kdq.h khash.h 39 | lorenz.o: sam.h bgzf.h hts.h 40 | cv.o: sam.h bgzf.h hts.h 41 | pileup.o: sam.h bgzf.h hts.h faidx.h ksort.h 42 | razf.o: razf.h 43 | sam.o: sam.h bgzf.h hts.h khash.h kseq.h kstring.h 44 | trim.o: kvec.h khash.h kseq.h 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Getting Started 2 | 3 | ```sh 4 | git clone https://github.com/lh3/lianti 5 | cd lianti && make 6 | # preprocessing, mapping and marking duplicates 7 | seqtk mergepe read1.fq.gz read2.fq.gz | ./lianti trim - | bwa mem -Cpt8 ref.fa - \ 8 | | samtools view -uS - | sambamba sort /dev/stdin -o /dev/stdout | ./lianti ldup - > aln.bam 9 | # calling SNVs 10 | ./lianti pileup -ycf ref.fa -P20 -L1 bulk.bam lianti.bam > raw.vcf 11 | k8 plp-diff.js raw.vcf > filtered.txt 12 | ``` 13 | 14 | ## Introduction 15 | 16 | [LIANTI][lianti-paper] is a single-cell whole-genome amplification method. 17 | This repo implements tools to preprocess raw LIANTI sequence data and to 18 | call sequence variations from the alignment. Probably you would like to use the 19 | `trim` command to trim adapters, identify barcodes and merge overlapping read 20 | ends. It is non-trivial to reimplement this tedious functionality on your own. 21 | `ldup` marks PCR duplicates in a barcode-aware manner. It has been superseded 22 | by the `ldup` command in the [adna][adna] repo which is more general. You may 23 | consider calling SNVs with this toolkit, too, but it is not that hard to roll 24 | your own anyway. Calling SVs and CNVs is hard with any caller. This repo does 25 | consider some LIANTI-specific features, but generally you should not expect it 26 | to be the state of the art. Good luck. 
27 | 28 | [adna]: https://github.com/DReichLab/adna 29 | [lianti-paper]: http://science.sciencemag.org/content/356/6334/189 30 | -------------------------------------------------------------------------------- /lorenz.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "sam.h" 4 | 5 | void *bed_read(const char *fn); 6 | int bed_overlap(const void *_h, const char *chr, int beg, int end); 7 | void bed_destroy(void *_h); 8 | uint64_t bed_totlen(void *_bed); 9 | 10 | #define MAX_DEPTH 10000 11 | 12 | int main_lorenz(int argc, char *argv[]) 13 | { 14 | bam_plp_t plp; 15 | BGZF *fp; 16 | bam_hdr_t *h; 17 | const bam_pileup1_t *p; 18 | int i, c, n_plp, tid, pos, step = 1000; 19 | uint64_t *cnt, bed_len = 0, sum_partial = 0, cov = 0, tot = 0, tot_partial = 0; 20 | void *bed = 0; 21 | 22 | while ((c = getopt(argc, argv, "b:s:")) >= 0) { 23 | if (c == 'b') { 24 | bed = bed_read(optarg); 25 | bed_len = bed_totlen(bed); 26 | fprintf(stderr, "[M::%s] total length in BED: %ld\n", __func__, (long)bed_len); 27 | } else if (c == 's') step = atoi(optarg); 28 | } 29 | if (optind == argc) { 30 | fprintf(stderr, "Usage: lianti lorenz [-b bed] [-s step=%d] \n", step); 31 | return 1; 32 | } 33 | 34 | cnt = (uint64_t*)calloc(MAX_DEPTH + 1, sizeof(uint64_t)); 35 | fp = bgzf_open(argv[optind], "r"); 36 | h = bam_hdr_read(fp); 37 | 38 | plp = bam_plp_init((bam_plp_auto_f)bam_read1, fp); 39 | while ((p = bam_plp_auto(plp, &tid, &pos, &n_plp)) != 0) { 40 | if (bed_overlap(bed, h->target_name[tid], pos, pos + 1)) 41 | ++cnt[n_plp < MAX_DEPTH? 
n_plp : MAX_DEPTH]; 42 | } 43 | for (i = 1; i <= MAX_DEPTH; ++i) cov += cnt[i]; 44 | cnt[0] = bed_len - cov; 45 | for (i = 0; i <= MAX_DEPTH; ++i) tot += cnt[i] * i; 46 | bam_plp_destroy(plp); 47 | 48 | printf("%.4f\t%.4f\n", 0., 0.); 49 | for (i = 0, sum_partial = tot_partial = 0; i <= MAX_DEPTH; ++i) { 50 | if (cnt[i] <= step) { 51 | sum_partial += cnt[i], tot_partial += i * cnt[i]; 52 | printf("%.4f\t%.4f\n", (double)sum_partial / bed_len, (double)tot_partial / tot); 53 | } else { 54 | uint64_t rest = cnt[i]; 55 | while (rest) { 56 | int x = rest < step? rest : step; 57 | sum_partial += x, tot_partial += i * x; 58 | printf("%.4f\t%.4f\n", (double)sum_partial / bed_len, (double)tot_partial / tot); 59 | rest -= x; 60 | } 61 | } 62 | } 63 | 64 | bam_hdr_destroy(h); 65 | bgzf_close(fp); 66 | free(cnt); 67 | if (bed) bed_destroy(bed); 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define LT_VERSION "r142" 7 | 8 | int main_trim(int argc, char *argv[]); 9 | int main_ldup(int argc, char *argv[]); 10 | int main_group(int argc, char *argv[]); 11 | int main_count(int argc, char *argv[]); 12 | int main_cnv(int argc, char *argv[]); 13 | int main_pileup(int argc, char *argv[]); 14 | int main_lorenz(int argc, char *argv[]); 15 | int main_sv(int argc, char *argv[]); 16 | int main_cv(int argc, char *argv[]); 17 | 18 | void liftrlimit() 19 | { 20 | #ifdef __linux__ 21 | struct rlimit r; 22 | getrlimit(RLIMIT_AS, &r); 23 | r.rlim_cur = r.rlim_max; 24 | setrlimit(RLIMIT_AS, &r); 25 | #endif 26 | } 27 | 28 | double cputime() 29 | { 30 | struct rusage r; 31 | getrusage(RUSAGE_SELF, &r); 32 | return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec); 33 | } 34 | 35 | double realtime() 36 | { 37 | struct timeval tp; 38 | struct timezone 
tzp; 39 | gettimeofday(&tp, &tzp); 40 | return tp.tv_sec + tp.tv_usec * 1e-6; 41 | } 42 | 43 | int main(int argc, char *argv[]) 44 | { 45 | int ret = 0, i; 46 | double t_start; 47 | liftrlimit(); 48 | if (argc == 1) { 49 | fprintf(stderr, "Usage: lianti \n"); 50 | fprintf(stderr, "Commands:\n"); 51 | fprintf(stderr, " trim trim binding motifs and adapter sequences\n"); 52 | fprintf(stderr, " ldup mark Illumina PCR duplicates\n"); 53 | fprintf(stderr, " group group reads into alleles\n"); 54 | fprintf(stderr, " count compute allele depth\n"); 55 | fprintf(stderr, " cnv call copy number variations from read depth\n"); 56 | fprintf(stderr, " sv call structural variations from split reads\n"); 57 | fprintf(stderr, " pileup lianti-aware pileup\n"); 58 | fprintf(stderr, " lorenz compute the Lorenz evaluation curve\n"); 59 | fprintf(stderr, " cv compute conefficient of variation of binned read depth\n"); 60 | fprintf(stderr, " version print version number\n\n"); 61 | fprintf(stderr, "Typical workflow:\n"); 62 | fprintf(stderr, " seqtk mergepe read1.fq.gz read2.fq.gz | lianti trim - | bwa mem -Cpt8 ref.fa - \\\n"); 63 | fprintf(stderr, " | samtools view -uS - | sambamba sort /dev/stdin | lianti ldup - > aln.bam\n"); 64 | fprintf(stderr, " lianti group aln.bam | bgzip > alleles.bed.gz\n"); 65 | fprintf(stderr, " lianti count alleles.bed.gz > depth.bed.gz\n"); 66 | return 1; 67 | } 68 | t_start = realtime(); 69 | if (strcmp(argv[1], "trim") == 0) ret = main_trim(argc-1, argv+1); 70 | else if (strcmp(argv[1], "ldup") == 0) ret = main_ldup(argc-1, argv+1); 71 | else if (strcmp(argv[1], "group") == 0) ret = main_group(argc-1, argv+1); 72 | else if (strcmp(argv[1], "count") == 0) ret = main_count(argc-1, argv+1); 73 | else if (strcmp(argv[1], "cnv") == 0) ret = main_cnv(argc-1, argv+1); 74 | else if (strcmp(argv[1], "pileup") == 0) ret = main_pileup(argc-1, argv+1); 75 | else if (strcmp(argv[1], "lorenz") == 0) ret = main_lorenz(argc-1, argv+1); 76 | else if (strcmp(argv[1], 
"sv") == 0) ret = main_sv(argc-1, argv+1); 77 | else if (strcmp(argv[1], "cv") == 0) ret = main_cv(argc-1, argv+1); 78 | else if (strcmp(argv[1], "version") == 0) { 79 | puts(LT_VERSION); 80 | return 0; 81 | } else { 82 | fprintf(stderr, "[E::%s] unknown command\n", __func__); 83 | return 1; 84 | } 85 | if (ret == 0) { 86 | fprintf(stderr, "[M::%s] Version: %s\n", __func__, LT_VERSION); 87 | fprintf(stderr, "[M::%s] CMD:", __func__); 88 | for (i = 0; i < argc; ++i) 89 | fprintf(stderr, " %s", argv[i]); 90 | fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_start, cputime()); 91 | } 92 | return ret; 93 | } 94 | -------------------------------------------------------------------------------- /kvec.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, by Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. 
/* The MIT License

   Copyright (c) 2008, by Attractive Chaos

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/*
  An example:

#include "kvec.h"
int main() {
	kvec_t(int) array;
	kv_init(array);
	kv_push(int, array, 10); // append
	kv_a(int, array, 20) = 5; // dynamic
	kv_A(array, 20) = 4; // static
	kv_destroy(array);
	return 0;
}
*/

/*
  2008-09-22 (0.1.0):

	* The initial version.

*/

#ifndef AC_KVEC_H
#define AC_KVEC_H

#include <stdlib.h> /* realloc(), free() */
#include <string.h> /* memcpy() in kv_copy() */

/*
 * Generic growable array implemented with macros.
 *
 *   n = number of elements in use, m = allocated capacity, a = storage.
 *
 * Macro arguments are evaluated more than once, so pass expressions without
 * side effects. Results of realloc() are not checked: on allocation failure
 * `a` becomes NULL and the next element access is invalid.
 */

/* Round x up, in place, to the next power of two (covers 32-bit values). */
#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))

#define kvec_t(type) struct { size_t n, m; type *a; }
#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
#define kv_destroy(v) free((v).a)
#define kv_A(v, i) ((v).a[(i)])      /* unchecked element access */
#define kv_pop(v) ((v).a[--(v).n])   /* caller guarantees n > 0 */
#define kv_size(v) ((v).n)
#define kv_max(v) ((v).m)

/* Grow the capacity to at least s, rounded up to a power of two; never shrinks. */
#define kv_resize(type, v, s) do { \
		if ((v).m < (s)) { \
			(v).m = (s); \
			kv_roundup32((v).m); \
			(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
		} \
	} while (0)

/* Make v1 hold a copy of the elements of v0 (v1 is grown when needed). */
#define kv_copy(type, v1, v0) do { \
		if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \
		(v1).n = (v0).n; \
		memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
	} while (0)

/* Append x, doubling the capacity when full (initial capacity 2). */
#define kv_push(type, v, x) do { \
		if ((v).n == (v).m) { \
			(v).m = (v).m? (v).m<<1 : 2; \
			(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
		} \
		(v).a[(v).n++] = (x); \
	} while (0)

/* Append an uninitialised slot and store its address in *p. */
#define kv_pushp(type, v, p) do { \
		if ((v).n == (v).m) { \
			(v).m = (v).m? (v).m<<1 : 2; \
			(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
		} \
		*(p) = &(v).a[(v).n++]; \
	} while (0)

/*
 * Element i as an lvalue, growing the vector so that i is a valid index.
 * When i is at or past the current size, the size becomes i + 1 (slots in
 * between are left uninitialised). The result is produced by dereferencing
 * the element's address so the macro works both as an assignment target and
 * inside a larger expression.
 */
#define kv_a(type, v, i) (*(((v).m <= (size_t)(i)? \
						  ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
						   (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
						  : (v).n <= (size_t)(i)? (v).n = (i) + 1 \
						  : 0), &(v).a[(i)]))

/* Reverse the elements from index `start` to the end, in place. */
#define kv_reverse(type, v, start) do { \
		if ((v).m > 0 && (v).n > (start)) { \
			size_t __i, __end = (v).n - (start); \
			type *__a = (v).a + (start); \
			for (__i = 0; __i < __end>>1; ++__i) { \
				type __t = __a[__end - 1 - __i]; \
				__a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \
			} \
		} \
	} while (0)

#endif
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Contact: Heng Li */ 27 | 28 | #ifndef FAIDX_H 29 | #define FAIDX_H 30 | 31 | /*! 32 | @header 33 | 34 | Index FASTA files and extract subsequences. 35 | 36 | @copyright The Wellcome Trust Sanger Institute. 37 | */ 38 | 39 | struct __faidx_t; 40 | typedef struct __faidx_t faidx_t; 41 | 42 | #ifdef __cplusplus 43 | extern "C" { 44 | #endif 45 | 46 | /*! 47 | @abstract Build index for a FASTA or razip compressed FASTA file. 48 | @param fn FASTA file name 49 | @return 0 on success; or -1 on failure 50 | @discussion File "fn.fai" will be generated. 51 | */ 52 | int fai_build(const char *fn); 53 | 54 | /*! 55 | @abstract Destroy a faidx_t struct. 56 | @param fai Pointer to the struct to be destroyed 57 | */ 58 | void fai_destroy(faidx_t *fai); 59 | 60 | /*! 61 | @abstract Load index from "fn.fai". 62 | @param fn File name of the FASTA file 63 | */ 64 | faidx_t *fai_load(const char *fn); 65 | 66 | /*! 67 | @abstract Fetch the sequence in a region. 68 | @param fai Pointer to the faidx_t struct 69 | @param reg Region in the format "chr2:20,000-30,000" 70 | @param len Length of the region 71 | @return Pointer to the sequence; null on failure 72 | 73 | @discussion The returned sequence is allocated by malloc family 74 | and should be destroyed by end users by calling free() on it. 75 | */ 76 | char *fai_fetch(const faidx_t *fai, const char *reg, int *len); 77 | 78 | /*! 79 | @abstract Fetch the number of sequences. 80 | @param fai Pointer to the faidx_t struct 81 | @return The number of sequences 82 | */ 83 | int faidx_fetch_nseq(const faidx_t *fai); 84 | 85 | /*! 86 | @abstract Fetch the sequence in a region. 
#include <limits.h>
#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

/************
 * kt_for() *
 ************/

struct kt_for_t;

/* One worker slot: `i` is the next task index this slot will hand out. */
typedef struct {
	struct kt_for_t *t;
	long i;
} ktf_worker_t;

/* Shared state of one kt_for() call. */
typedef struct kt_for_t {
	int n_threads;
	long n;
	ktf_worker_t *w;
	void (*func)(void*,long,int);
	void *data;
} kt_for_t;

/*
 * Take one task from the slot that has progressed the least.
 * Returns the task index, or -1 when that slot has no task left.
 */
static inline long steal_work(kt_for_t *t)
{
	int i, min_i = -1;
	long k, min = LONG_MAX;
	for (i = 0; i < t->n_threads; ++i)
		if (min > t->w[i].i) min = t->w[i].i, min_i = i;
	if (min_i < 0) return -1; /* no slot below LONG_MAX: nothing to index */
	k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads);
	return k >= t->n? -1 : k;
}

/*
 * Thread body of kt_for(): run the tasks slot, slot + n_threads, ... of the
 * own slot, then keep stealing from other slots until none has work left.
 * Returns normally so that it can also be called inline by kt_for().
 */
static void *ktf_worker(void *data)
{
	ktf_worker_t *w = (ktf_worker_t*)data;
	long i;
	for (;;) {
		i = __sync_fetch_and_add(&w->i, w->t->n_threads);
		if (i >= w->t->n) break;
		w->t->func(w->t->data, i, w - w->t->w);
	}
	while ((i = steal_work(w->t)) >= 0)
		w->t->func(w->t->data, i, w - w->t->w);
	return 0;
}

/**
 * Call func(data, i, tid) for every i in [0, n).
 *
 * With n_threads > 1 the calls are spread over worker threads and tid is the
 * index of the worker slot making the call; otherwise they run in order in
 * the calling thread with tid 0. If thread creation partly fails, the threads
 * that did start pick up the remaining tasks by stealing; if no thread or no
 * memory is available the calling thread does all the work.
 */
void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n)
{
	long j;

	if (n_threads > 1) {
		int i, n_started = 0;
		kt_for_t t;
		pthread_t *tid;

		t.func = func, t.data = data, t.n_threads = n_threads, t.n = n;
		/* heap rather than alloca(): the thread count is caller-controlled */
		t.w = (ktf_worker_t*)malloc(n_threads * sizeof(ktf_worker_t));
		tid = (pthread_t*)malloc(n_threads * sizeof(pthread_t));
		if (t.w != 0 && tid != 0) {
			for (i = 0; i < n_threads; ++i)
				t.w[i].t = &t, t.w[i].i = i;
			for (i = 0; i < n_threads; ++i) {
				if (pthread_create(&tid[i], 0, ktf_worker, &t.w[i]) != 0) break;
				++n_started;
			}
			if (n_started == 0) ktf_worker(&t.w[0]); /* drains every slot via stealing */
			for (i = 0; i < n_started; ++i) pthread_join(tid[i], 0);
			free(tid); free(t.w);
			return;
		}
		free(tid); free(t.w); /* out of memory: fall back to the serial loop */
	}
	for (j = 0; j < n; ++j) func(data, j, 0);
}

/*****************
 * kt_pipeline() *
 *****************/

struct ktp_t;

/* One pipeline worker: carries one item (`data`) through the steps. */
typedef struct {
	struct ktp_t *pl;
	int64_t index; /* sequence number of the item being carried */
	int step;      /* next step to run; n_steps once the worker is finished */
	void *data;
} ktp_worker_t;

/* Shared state of one kt_pipeline() call. */
typedef struct ktp_t {
	void *shared;
	void *(*func)(void*, int, void*);
	int64_t index; /* next sequence number to hand out */
	int n_workers, n_steps;
	ktp_worker_t *workers;
	pthread_mutex_t mutex;
	pthread_cond_t cv;
} ktp_t;

/*
 * Thread body of kt_pipeline(). Returns normally so that it can also be
 * called inline by kt_pipeline().
 */
static void *ktp_worker(void *data)
{
	ktp_worker_t *w = (ktp_worker_t*)data;
	ktp_t *p = w->pl;
	while (w->step < p->n_steps) {
		// test whether we can kick off the job with this worker
		pthread_mutex_lock(&p->mutex);
		for (;;) {
			int i;
			// test whether another worker is doing the same step
			for (i = 0; i < p->n_workers; ++i) {
				if (w == &p->workers[i]) continue; // ignore itself
				if (p->workers[i].step <= w->step && p->workers[i].index < w->index)
					break;
			}
			if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps
			pthread_cond_wait(&p->cv, &p->mutex);
		}
		pthread_mutex_unlock(&p->mutex);

		// working on w->step
		w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL

		// update step and let other workers know
		pthread_mutex_lock(&p->mutex);
		// after the last step, or while data is non-NULL, move on (wrapping to step 0); a NULL from an earlier step retires the worker
		w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps;
		if (w->step == 0) w->index = p->index++;
		pthread_cond_broadcast(&p->cv);
		pthread_mutex_unlock(&p->mutex);
	}
	return 0;
}

/**
 * Run an n_steps-stage pipeline with up to n_threads workers.
 *
 * Each worker repeatedly calls func(shared_data, step, in) for step 0,
 * 1, ..., n_steps - 1, feeding the return value of one step to the next
 * (step 0 receives NULL). A worker stops when a step other than the last
 * returns NULL. A worker only starts a step once no worker carrying an
 * earlier item is still at that step or before it.
 *
 * n_threads < 1 is treated as 1. If some threads cannot be created the
 * pipeline runs with those that could; if none can (or memory is exhausted)
 * it runs with a single worker in the calling thread.
 */
void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps)
{
	ktp_t aux;
	ktp_worker_t *pool;
	pthread_t *tid;
	int i, n_started = 0;

	if (n_threads < 1) n_threads = 1;
	aux.n_workers = n_threads;
	aux.n_steps = n_steps;
	aux.func = func;
	aux.shared = shared_data;
	aux.index = 0;
	pthread_mutex_init(&aux.mutex, 0);
	pthread_cond_init(&aux.cv, 0);

	/* heap rather than alloca(): the thread count is caller-controlled */
	pool = (ktp_worker_t*)malloc(n_threads * sizeof(ktp_worker_t));
	tid = (pthread_t*)malloc(n_threads * sizeof(pthread_t));
	if (pool != 0 && tid != 0) {
		aux.workers = pool;
		for (i = 0; i < n_threads; ++i) {
			ktp_worker_t *w = &aux.workers[i];
			w->step = 0; w->pl = &aux; w->data = 0;
			w->index = aux.index++;
		}
		/* Workers take the mutex before reading n_workers, so holding it here
		 * lets us publish the number of threads that really started; waiting
		 * on a worker that never runs would block the others forever. */
		pthread_mutex_lock(&aux.mutex);
		for (i = 0; i < n_threads; ++i) {
			if (pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]) != 0) break;
			++n_started;
		}
		aux.n_workers = n_started;
		pthread_mutex_unlock(&aux.mutex);
	}

	if (n_started == 0) { /* no thread available: single worker in this thread */
		ktp_worker_t solo;
		solo.step = 0; solo.pl = &aux; solo.data = 0;
		solo.index = aux.index++;
		aux.workers = &solo;
		aux.n_workers = 1;
		ktp_worker(&solo);
	}
	for (i = 0; i < n_started; ++i) pthread_join(tid[i], 0);

	free(tid);
	free(pool);
	pthread_mutex_destroy(&aux.mutex);
	pthread_cond_destroy(&aux.cv);
}
1.0 4 | * Release Date: 2008-10-27 5 | * 6 | * Copyright 2008, Jue Ruan , Heng Li 7 | * 8 | * All rights reserved. 9 | * 10 | * Redistribution and use in source and binary forms, with or without 11 | * modification, are permitted provided that the following conditions 12 | * are met: 13 | * 1. Redistributions of source code must retain the above copyright 14 | * notice, this list of conditions and the following disclaimer. 15 | * 2. Redistributions in binary form must reproduce the above copyright 16 | * notice, this list of conditions and the following disclaimer in the 17 | * documentation and/or other materials provided with the distribution. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 | * SUCH DAMAGE. 
30 | */ 31 | 32 | 33 | #ifndef __RAZF_RJ_H 34 | #define __RAZF_RJ_H 35 | 36 | #include 37 | #include 38 | #include "zlib.h" 39 | 40 | #ifdef _USE_KNETFILE 41 | #include "knetfile.h" 42 | #endif 43 | 44 | #if ZLIB_VERNUM < 0x1221 45 | #define _RZ_READONLY 46 | struct _gz_header_s; 47 | typedef struct _gz_header_s _gz_header; 48 | #define gz_header _gz_header 49 | #endif 50 | 51 | #define WINDOW_BITS 15 52 | 53 | #ifndef RZ_BLOCK_SIZE 54 | #define RZ_BLOCK_SIZE (1<mode from HEAD to TYPE after call inflateReset */ 104 | int buf_off, buf_len; 105 | int z_err, z_eof; 106 | int seekable; 107 | /* Indice where the source is seekable */ 108 | int load_index; 109 | /* set has_index to 0 in mode 'w', then index will be discarded */ 110 | } RAZF; 111 | 112 | #ifdef __cplusplus 113 | extern "C" { 114 | #endif 115 | 116 | RAZF* razf_dopen(int data_fd, const char *mode); 117 | RAZF *razf_open(const char *fn, const char *mode); 118 | int razf_write(RAZF* rz, const void *data, int size); 119 | int razf_read(RAZF* rz, void *data, int size); 120 | int64_t razf_seek(RAZF* rz, int64_t pos, int where); 121 | void razf_close(RAZF* rz); 122 | 123 | #define razf_tell(rz) ((rz)->out) 124 | 125 | RAZF* razf_open2(const char *filename, const char *mode); 126 | RAZF* razf_dopen2(int fd, const char *mode); 127 | uint64_t razf_tell2(RAZF *rz); 128 | int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where); 129 | 130 | #ifdef __cplusplus 131 | } 132 | #endif 133 | 134 | #endif 135 | -------------------------------------------------------------------------------- /kdq.h: -------------------------------------------------------------------------------- 1 | #ifndef __AC_KDQ_H 2 | #define __AC_KDQ_H 3 | 4 | #include 5 | #include 6 | 7 | #define __KDQ_TYPE(type) \ 8 | typedef struct { \ 9 | size_t front:58, bits:6, count, mask; \ 10 | type *a; \ 11 | } kdq_##type##_t; 12 | 13 | #define kdq_t(type) kdq_##type##_t 14 | #define kdq_size(q) ((q)->count) 15 | #define kdq_first(q) ((q)->a[(q)->front]) 16 | 
#define kdq_last(q) ((q)->a[((q)->front + (q)->count - 1) & (q)->mask]) 17 | #define kdq_at(q, i) ((q)->a[((q)->front + (i)) & (q)->mask]) 18 | 19 | #define __KDQ_IMPL(type, SCOPE) \ 20 | SCOPE kdq_##type##_t *kdq_init_##type() \ 21 | { \ 22 | kdq_##type##_t *q; \ 23 | q = (kdq_##type##_t*)calloc(1, sizeof(kdq_##type##_t)); \ 24 | q->bits = 2, q->mask = (1ULL<bits) - 1; \ 25 | q->a = (type*)malloc((1<bits) * sizeof(type)); \ 26 | return q; \ 27 | } \ 28 | SCOPE void kdq_destroy_##type(kdq_##type##_t *q) \ 29 | { \ 30 | if (q == 0) return; \ 31 | free(q->a); free(q); \ 32 | } \ 33 | SCOPE int kdq_resize_##type(kdq_##type##_t *q, int new_bits) \ 34 | { \ 35 | size_t new_size = 1ULL<bits; \ 36 | if (new_size < q->count) { /* not big enough */ \ 37 | int i; \ 38 | for (i = 0; i < 64; ++i) \ 39 | if (1ULL< q->count) break; \ 40 | new_bits = i, new_size = 1ULL<bits) return q->bits; /* unchanged */ \ 43 | if (new_bits > q->bits) q->a = (type*)realloc(q->a, (1ULL<front + q->count <= old_size) { /* unwrapped */ \ 45 | if (q->front + q->count > new_size) /* only happens for shrinking */ \ 46 | memmove(q->a, q->a + new_size, (q->front + q->count - new_size) * sizeof(type)); \ 47 | } else { /* wrapped */ \ 48 | memmove(q->a + (new_size - (old_size - q->front)), q->a + q->front, (old_size - q->front) * sizeof(type)); \ 49 | q->front = new_size - (old_size - q->front); \ 50 | } \ 51 | q->bits = new_bits, q->mask = (1ULL<bits) - 1; \ 52 | if (new_bits < q->bits) q->a = (type*)realloc(q->a, (1ULL<bits; \ 54 | } \ 55 | SCOPE type *kdq_pushp_##type(kdq_##type##_t *q) \ 56 | { \ 57 | if (q->count == 1ULL<bits) kdq_resize_##type(q, q->bits + 1); \ 58 | return &q->a[((q->count++) + q->front) & (q)->mask]; \ 59 | } \ 60 | SCOPE void kdq_push_##type(kdq_##type##_t *q, type v) \ 61 | { \ 62 | if (q->count == 1ULL<bits) kdq_resize_##type(q, q->bits + 1); \ 63 | q->a[((q->count++) + q->front) & (q)->mask] = v; \ 64 | } \ 65 | SCOPE type *kdq_unshiftp_##type(kdq_##type##_t *q) \ 66 | { \ 67 
| if (q->count == 1ULL<bits) kdq_resize_##type(q, q->bits + 1); \ 68 | ++q->count; \ 69 | q->front = q->front? q->front - 1 : (1ULL<bits) - 1; \ 70 | return &q->a[q->front]; \ 71 | } \ 72 | SCOPE void kdq_unshift_##type(kdq_##type##_t *q, type v) \ 73 | { \ 74 | type *p; \ 75 | p = kdq_unshiftp_##type(q); \ 76 | *p = v; \ 77 | } \ 78 | SCOPE type *kdq_pop_##type(kdq_##type##_t *q) \ 79 | { \ 80 | return q->count? &q->a[((--q->count) + q->front) & q->mask] : 0; \ 81 | } \ 82 | SCOPE type *kdq_shift_##type(kdq_##type##_t *q) \ 83 | { \ 84 | type *d = 0; \ 85 | if (q->count == 0) return 0; \ 86 | d = &q->a[q->front++]; \ 87 | q->front &= q->mask; \ 88 | --q->count; \ 89 | return d; \ 90 | } 91 | 92 | #define KDQ_INIT2(type, SCOPE) \ 93 | __KDQ_TYPE(type) \ 94 | __KDQ_IMPL(type, SCOPE) 95 | 96 | #ifndef klib_unused 97 | #if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) 98 | #define klib_unused __attribute__ ((__unused__)) 99 | #else 100 | #define klib_unused 101 | #endif 102 | #endif /* klib_unused */ 103 | 104 | #define KDQ_INIT(type) KDQ_INIT2(type, static inline klib_unused) 105 | 106 | #define KDQ_DECLARE(type) \ 107 | __KDQ_TYPE(type) \ 108 | kdq_##type##_t *kdq_init_##type(); \ 109 | void kdq_destroy_##type(kdq_##type##_t *q); \ 110 | int kdq_resize_##type(kdq_##type##_t *q, int new_bits); \ 111 | type *kdq_pushp_##type(kdq_##type##_t *q); \ 112 | void kdq_push_##type(kdq_##type##_t *q, type v); \ 113 | type *kdq_unshiftp_##type(kdq_##type##_t *q); \ 114 | void kdq_unshift_##type(kdq_##type##_t *q, type v); \ 115 | type *kdq_pop_##type(kdq_##type##_t *q); \ 116 | type *kdq_shift_##type(kdq_##type##_t *q); 117 | 118 | #define kdq_init(type) kdq_init_##type() 119 | #define kdq_destroy(type, q) kdq_destroy_##type(q) 120 | #define kdq_resize(type, q, new_bits) kdq_resize_##type(q, new_bits) 121 | #define kdq_pushp(type, q) kdq_pushp_##type(q) 122 | #define kdq_push(type, q, v) kdq_push_##type(q, v) 123 | #define kdq_pop(type, q) 
kdq_pop_##type(q) 124 | #define kdq_unshiftp(type, q) kdq_unshiftp_##type(q) 125 | #define kdq_unshift(type, q, v) kdq_unshift_##type(q, v) 126 | #define kdq_shift(type, q) kdq_shift_##type(q) 127 | 128 | #endif 129 | -------------------------------------------------------------------------------- /bedidx.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "ksort.h" 8 | KSORT_INIT_GENERIC(uint64_t) 9 | 10 | #include "kseq.h" 11 | KSTREAM_INIT(gzFile, gzread, 8192) 12 | 13 | typedef struct { 14 | int n, m; 15 | uint64_t *a; 16 | int *idx; 17 | } bed_reglist_t; 18 | 19 | #include "khash.h" 20 | KHASH_MAP_INIT_STR(reg, bed_reglist_t) 21 | 22 | #define LIDX_SHIFT 13 23 | 24 | typedef kh_reg_t reghash_t; 25 | 26 | /* Total length covered by the intervals stored in region hash _h: for every packed entry (begin in the high 32 bits, end in the low 32 bits) add end - begin. Intervals are summed exactly as stored, so overlapping intervals contribute more than once. Returns 0 when _h is NULL, which is what bed_read() hands back when the file cannot be opened; bed_overlap() treats a NULL hash the same way. */ uint64_t bed_totlen(void *_h) 27 | { 28 | reghash_t *h = (reghash_t*)_h; 29 | khint_t k; 30 | uint64_t len = 0; 31 | if (!h) return 0; /* no table: nothing to sum, and kh_end(h) below would dereference NULL */ for (k = 0; k < kh_end(h); ++k) { 32 | if (kh_exist(h, k)) { 33 | bed_reglist_t *p = &kh_val(h, k); 34 | int i; 35 | for (i = 0; i < p->n; ++i) 36 | len += (uint32_t)p->a[i] - (p->a[i]>>32); 37 | } 38 | } 39 | return len; 40 | } 41 | 42 | int *bed_index_core(int n, uint64_t *a, int *n_idx) 43 | { 44 | int i, j, m, *idx; 45 | m = *n_idx = 0; idx = 0; 46 | for (i = 0; i < n; ++i) { 47 | int beg, end; 48 | beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; 49 | if (m < end + 1) { 50 | int oldm = m; 51 | m = end + 1; 52 | kroundup32(m); 53 | idx = (int*)realloc(idx, m * sizeof(int)); 54 | for (j = oldm; j < m; ++j) idx[j] = -1; 55 | } 56 | if (beg == end) { 57 | if (idx[beg] < 0) idx[beg] = i; 58 | } else { 59 | for (j = beg; j <= end; ++j) 60 | if (idx[j] < 0) idx[j] = i; 61 | } 62 | *n_idx = end + 1; 63 | } 64 | return idx; 65 | } 66 | 67 | void bed_index(void *_h) 68 | { 69 | reghash_t *h = (reghash_t*)_h; 70 | khint_t k; 71 | for (k = 0; k < kh_end(h); ++k) { 72 | if (kh_exist(h, k)) { 73 | bed_reglist_t *p = 
&kh_val(h, k); 74 | if (p->idx) free(p->idx); 75 | ks_introsort(uint64_t, p->n, p->a); 76 | p->idx = bed_index_core(p->n, p->a, &p->m); 77 | } 78 | } 79 | } 80 | 81 | /* Return 1 if the half-open query [beg,end) overlaps any interval of p, 0 otherwise. p->a holds the sorted intervals (begin in the high 32 bits, end in the low 32 bits) and p->idx is the linear index produced by bed_index(): entry w is the first interval touching the window w = pos>>LIDX_SHIFT, or -1. bed_index() stores the index length (n_idx) in p->m, so the window lookup is bounded by p->m; bounding it by the interval count p->n reads outside idx whenever many intervals fall into few windows. */ int bed_overlap_core(const bed_reglist_t *p, int beg, int end) 82 | { 83 | int i, min_off; 84 | if (p->n == 0) return 0; 85 | min_off = (beg>>LIDX_SHIFT >= p->m)? p->idx[p->m-1] : p->idx[beg>>LIDX_SHIFT]; 86 | if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here 87 | int n = beg>>LIDX_SHIFT; 88 | if (n > p->m) n = p->m; 89 | for (i = n - 1; i >= 0; --i) 90 | if (p->idx[i] >= 0) break; 91 | min_off = i >= 0? p->idx[i] : 0; 92 | } 93 | for (i = min_off; i < p->n; ++i) { 94 | if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed 95 | if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end) 96 | return 1; // find the overlap; return 97 | } 98 | return 0; 99 | } 100 | 101 | int bed_overlap(const void *_h, const char *chr, int beg, int end) 102 | { 103 | const reghash_t *h = (const reghash_t*)_h; 104 | khint_t k; 105 | if (!h) return 0; 106 | k = kh_get(reg, h, chr); 107 | if (k == kh_end(h)) return 0; 108 | return bed_overlap_core(&kh_val(h, k), beg, end); 109 | } 110 | 111 | void *bed_read(const char *fn) 112 | { 113 | reghash_t *h = kh_init(reg); 114 | gzFile fp; 115 | kstream_t *ks; 116 | int dret; 117 | kstring_t *str; 118 | // read the list 119 | fp = strcmp(fn, "-")? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); 120 | if (fp == 0) return 0; 121 | str = (kstring_t*)calloc(1, sizeof(kstring_t)); 122 | ks = ks_init(fp); 123 | while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name 124 | int beg = -1, end = -1; 125 | bed_reglist_t *p; 126 | khint_t k = kh_get(reg, h, str->s); 127 | if (k == kh_end(h)) { // absent from the hash table 128 | int ret; 129 | char *s = strdup(str->s); 130 | k = kh_put(reg, h, s, &ret); 131 | memset(&kh_val(h, k), 0, sizeof(bed_reglist_t)); 132 | } 133 | p = &kh_val(h, k); 134 | if (dret != '\n') { // if the lines has other characters 135 | if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { 136 | beg = atoi(str->s); // begin 137 | if (dret != '\n') { 138 | if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { 139 | end = atoi(str->s); // end 140 | if (end < beg) end = -1; 141 | } 142 | } 143 | } 144 | } 145 | if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line 146 | if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column 147 | if (beg >= 0 && end > beg) { 148 | if (p->n == p->m) { 149 | p->m = p->m? 
p->m<<1 : 4; 150 | p->a = (uint64_t*)realloc(p->a, p->m * 8); 151 | } 152 | p->a[p->n++] = (uint64_t)beg<<32 | end; 153 | } 154 | } 155 | ks_destroy(ks); 156 | gzclose(fp); 157 | free(str->s); free(str); 158 | bed_index(h); 159 | return h; 160 | } 161 | 162 | void bed_destroy(void *_h) 163 | { 164 | reghash_t *h = (reghash_t*)_h; 165 | khint_t k; 166 | for (k = 0; k < kh_end(h); ++k) { 167 | if (kh_exist(h, k)) { 168 | free(kh_val(h, k).a); 169 | free(kh_val(h, k).idx); 170 | free((char*)kh_key(h, k)); 171 | } 172 | } 173 | kh_destroy(reg, h); 174 | } 175 | -------------------------------------------------------------------------------- /ldup.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "sam.h" 5 | 6 | typedef struct { 7 | bam1_t *b; 8 | uint8_t is_pe, is_left, full_ovlp; 9 | } elem_t; 10 | 11 | #include "kdq.h" 12 | KDQ_INIT(elem_t) 13 | 14 | #include "khash.h" 15 | KHASH_SET_INIT_INT64(64) 16 | KHASH_SET_INIT_STR(s) 17 | 18 | #define AUX_REALLOC_SIZE 1024 19 | 20 | static uint64_t lt_n_frags_noBC, lt_n_dups_noBC, lt_n_frags_BC, lt_n_dups_BC; 21 | 22 | static inline uint64_t X31_hash_string(const char *s) 23 | { 24 | uint64_t h = *s; 25 | if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; 26 | return h; 27 | } 28 | 29 | static khash_t(64) *process(kdq_t(elem_t) *q, BGZF *fp, khash_t(s) *marked, khash_t(64) *aux) 30 | { 31 | int i, absent; 32 | if (kh_n_buckets(aux) < kdq_size(q) * 4 && kh_n_buckets(aux) <= AUX_REALLOC_SIZE) { 33 | kh_clear(64, aux); 34 | } else { 35 | kh_destroy(64, aux); 36 | aux = kh_init(64); 37 | } 38 | for (i = 0; i < kdq_size(q); ++i) { 39 | elem_t *e = &kdq_at(q, i); 40 | bam1_t *b = e->b; 41 | char *qname = bam_get_qname(b); 42 | if (b->core.flag&0x800) continue; 43 | if ((b->core.flag&1) && (b->core.flag&(BAM_FREAD1|BAM_FREAD2))) { // PE 44 | e->is_pe = 1; 45 | e->full_ovlp = (b->core.tid == b->core.mtid && b->core.pos == b->core.mpos); 46 | 
e->is_left = 0; 47 | if (b->core.tid < b->core.mtid) e->is_left = 1; 48 | else if (b->core.tid == b->core.mtid) { 49 | if (b->core.pos < b->core.mpos) e->is_left = 1; 50 | else if (b->core.pos == b->core.mpos && (b->core.flag&BAM_FREAD1)) e->is_left = 1; 51 | } 52 | } else { // SE 53 | e->is_left = 1; 54 | e->is_pe = e->full_ovlp = 0; 55 | } 56 | if (e->is_left) { 57 | const uint8_t *BC = 0; 58 | uint64_t key; 59 | khint_t k; 60 | BC = bam_aux_get(b, "BC"); 61 | if (BC) ++lt_n_frags_BC; 62 | else ++lt_n_frags_noBC; 63 | key = BC? X31_hash_string(bam_aux2Z(BC)) : 0; 64 | k = kh_put(64, aux, key, &absent); 65 | if (!absent) { 66 | if (BC) ++lt_n_dups_BC; 67 | else ++lt_n_dups_noBC; 68 | b->core.flag |= BAM_FDUP; 69 | if (e->is_pe) kh_put(s, marked, strdup(qname), &absent); 70 | } 71 | } 72 | } 73 | for (i = 0; i < kdq_size(q); ++i) { 74 | elem_t *e = &kdq_at(q, i); 75 | bam1_t *b = e->b; 76 | char *qname = bam_get_qname(b); 77 | if (e->is_pe && !e->is_left) { 78 | khint_t k; 79 | k = kh_get(s, marked, qname); 80 | if (k != kh_end(marked)) { 81 | b->core.flag |= BAM_FDUP; 82 | free((char*)kh_key(marked, k)); 83 | kh_del(s, marked, k); 84 | } 85 | } 86 | } 87 | while (kdq_size(q)) { 88 | elem_t *e; 89 | e = kdq_shift(elem_t, q); 90 | bam_write1(fp, e->b); 91 | bam_destroy1(e->b); 92 | } 93 | return aux; 94 | } 95 | 96 | int main_ldup(int argc, char *argv[]) 97 | { 98 | int c, clevel = -1, ret; 99 | int last_tid = -1, last_pos = -1; 100 | BGZF *fpr, *fpw; 101 | bam_hdr_t *h; 102 | bam1_t *b; 103 | khash_t(s) *marked; 104 | khash_t(64) *aux; 105 | kdq_t(elem_t) *q; 106 | khint_t k; 107 | 108 | while ((c = getopt(argc, argv, "l:")) >= 0) { 109 | if (c == 'l') clevel = atoi(optarg); 110 | } 111 | if (optind == argc) { 112 | fprintf(stderr, "Usage: lianti ldup [options] \n"); 113 | fprintf(stderr, "Options:\n"); 114 | fprintf(stderr, " -l INT zlib compression level [zlib default]\n"); 115 | return 1; 116 | } 117 | 118 | fpr = strcmp(argv[optind], "-")? 
bgzf_open(argv[optind], "r") : bgzf_dopen(fileno(stdin), "r"); 119 | h = bam_hdr_read(fpr); 120 | if (clevel >= 0 && clevel <= 9) { 121 | char mode[3]; 122 | sprintf(mode, "w%d", clevel); 123 | fpw = bgzf_dopen(fileno(stdout), mode); 124 | } else fpw = bgzf_dopen(fileno(stdout), "w"); 125 | bgzf_mt(fpw, 3, 256); 126 | bam_hdr_write(fpw, h); 127 | aux = kh_init(64); 128 | marked = kh_init(s); 129 | q = kdq_init(elem_t); 130 | 131 | b = bam_init1(); 132 | while ((ret = bam_read1(fpr, b)) >= 0) { 133 | elem_t *e; 134 | b->core.flag &= ~BAM_FDUP; 135 | if (b->core.tid != last_tid || b->core.pos != last_pos) { 136 | if (last_tid >= 0 && last_pos >= 0) 137 | aux = process(q, fpw, marked, aux); 138 | last_tid = b->core.tid, last_pos = b->core.pos; 139 | } 140 | if (b->core.tid < 0) break; 141 | e = kdq_pushp(elem_t, q); 142 | e->b = bam_init1(); 143 | bam_copy1(e->b, b); 144 | } 145 | aux = process(q, fpw, marked, aux); 146 | if (ret >= 0) { 147 | do { 148 | bam_write1(fpw, b); 149 | } while (bam_read1(fpr, b) >= 0); 150 | } 151 | bam_destroy1(b); 152 | 153 | kdq_destroy(elem_t, q); 154 | fprintf(stderr, "[M::%s] %ld+%ld fragments; %ld+%ld duplicates; %d unpaired reads\n", __func__, 155 | (long)lt_n_frags_BC, (long)lt_n_frags_noBC, (long)lt_n_dups_BC, (long)lt_n_dups_noBC, kh_size(marked)); 156 | for (k = 0; k < kh_end(marked); ++k) 157 | if (kh_exist(marked, k)) free((char*)kh_key(marked, k)); 158 | kh_destroy(s, marked); 159 | kh_destroy(64, aux); 160 | bgzf_close(fpw); 161 | bam_hdr_destroy(h); 162 | bgzf_close(fpr); 163 | return 0; 164 | } 165 | -------------------------------------------------------------------------------- /hts.h: -------------------------------------------------------------------------------- 1 | #ifndef HTS_H 2 | #define HTS_H 3 | 4 | #define HTS_VERSION "lite-r308" 5 | 6 | #include 7 | #include "bgzf.h" 8 | 9 | #ifndef KSTRING_T 10 | #define KSTRING_T kstring_t 11 | typedef struct __kstring_t { 12 | size_t l, m; 13 | char *s; 14 | } kstring_t; 15 
| #endif 16 | 17 | #ifndef kroundup32 18 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 19 | #endif 20 | 21 | #define hts_expand(type_t, n, m, ptr) if ((n) > (m)) { \ 22 | (m) = (n); kroundup32(m); \ 23 | (ptr) = (type_t*)realloc((ptr), (m) * sizeof(type_t)); \ 24 | } 25 | 26 | /************ 27 | * File I/O * 28 | ************/ 29 | 30 | typedef struct { 31 | uint32_t is_bin:1, is_write:1, is_be:1, dummy:29; 32 | int64_t lineno; 33 | kstring_t line; 34 | char *fn, *fn_aux; 35 | void *fp; // file pointer; actual type depending on is_bin and is_write 36 | } htsFile; 37 | 38 | /********************** 39 | * Exported functions * 40 | **********************/ 41 | 42 | extern int hts_verbose; 43 | extern unsigned char seq_nt16_table[256]; 44 | extern char seq_nt16_str[]; 45 | 46 | #ifdef __cplusplus 47 | extern "C" { 48 | #endif 49 | 50 | htsFile *hts_open(const char *fn, const char *mode, const char *fn_aux); 51 | void hts_close(htsFile *fp); 52 | int hts_getline(htsFile *fp, int delimiter, kstring_t *str); 53 | char **hts_readlines(const char *fn, int *_n); 54 | 55 | #ifdef __cplusplus 56 | } 57 | #endif 58 | 59 | /************ 60 | * Indexing * 61 | ************/ 62 | 63 | #define HTS_IDX_NOCOOR (-2) 64 | #define HTS_IDX_START (-3) 65 | #define HTS_IDX_REST (-4) 66 | 67 | #define HTS_FMT_CSI 0 68 | #define HTS_FMT_BAI 1 69 | #define HTS_FMT_TBI 2 70 | 71 | struct __hts_idx_t; 72 | typedef struct __hts_idx_t hts_idx_t; 73 | 74 | typedef struct { 75 | uint64_t u, v; 76 | } hts_pair64_t; 77 | 78 | typedef struct { 79 | int32_t m, n; 80 | uint64_t loff; 81 | hts_pair64_t *list; 82 | } hts_bin_t; 83 | 84 | typedef struct { 85 | uint32_t read_rest:1, finished:1, dummy:29; 86 | int tid, beg, end, n_off, i; 87 | uint64_t curr_off; 88 | hts_pair64_t *off; 89 | } hts_itr_t; 90 | 91 | #ifdef __cplusplus 92 | extern "C" { 93 | #endif 94 | 95 | #define hts_bin_first(l) (((1<<(((l)<<1) + (l))) - 1) / 7) 96 | #define 
hts_bin_parent(l) (((l) - 1) >> 3) 97 | 98 | hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls); 99 | void hts_idx_destroy(hts_idx_t *idx); 100 | int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped); 101 | void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset); 102 | 103 | void hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt); 104 | hts_idx_t *hts_idx_load(const char *fn, int fmt); // download the index if remote 105 | hts_idx_t *hts_idx_load_direct(const char *fn, int fmt); // directly load the remote index 106 | 107 | uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta); 108 | void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy); 109 | 110 | const char *hts_parse_reg(const char *s, int *beg, int *end); 111 | hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end); 112 | void hts_itr_destroy(hts_itr_t *iter); 113 | 114 | typedef int (*hts_readrec_f)(BGZF*, void*, void*, int*, int*, int*); 115 | typedef int (*hts_name2id_f)(void*, const char*); 116 | 117 | hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr); 118 | int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, hts_readrec_f readrec, void *hdr); 119 | 120 | #ifdef __cplusplus 121 | } 122 | #endif 123 | 124 | static inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls) 125 | { 126 | int l, s = min_shift, t = ((1<<((n_lvls<<1) + n_lvls)) - 1) / 7; 127 | for (--end, l = n_lvls; l > 0; --l, s += 3, t -= 1<<((l<<1)+l)) 128 | if (beg>>s == end>>s) return t + (beg>>s); 129 | return 0; 130 | } 131 | 132 | static inline int hts_bin_bot(int bin, int n_lvls) 133 | { 134 | int l, b; 135 | for (l = 0, b = bin; b; ++l, b = hts_bin_parent(b)); // compute the level of bin 136 | return (bin - hts_bin_first(l)) << (n_lvls - l) * 3; 137 | } 138 | 139 | /************** 140 | * Endianness * 141 | **************/ 142 | 143 | static 
inline int ed_is_big() 144 | { 145 | long one= 1; 146 | return !(*((char *)(&one))); 147 | } 148 | static inline uint16_t ed_swap_2(uint16_t v) 149 | { 150 | return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); 151 | } 152 | static inline void *ed_swap_2p(void *x) 153 | { 154 | *(uint16_t*)x = ed_swap_2(*(uint16_t*)x); 155 | return x; 156 | } 157 | static inline uint32_t ed_swap_4(uint32_t v) 158 | { 159 | v = ((v & 0x0000FFFFU) << 16) | (v >> 16); 160 | return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); 161 | } 162 | static inline void *ed_swap_4p(void *x) 163 | { 164 | *(uint32_t*)x = ed_swap_4(*(uint32_t*)x); 165 | return x; 166 | } 167 | static inline uint64_t ed_swap_8(uint64_t v) 168 | { 169 | v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); 170 | v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); 171 | return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); 172 | } 173 | static inline void *ed_swap_8p(void *x) 174 | { 175 | *(uint64_t*)x = ed_swap_8(*(uint64_t*)x); 176 | return x; 177 | } 178 | 179 | #endif 180 | -------------------------------------------------------------------------------- /kstring.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) by Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 
15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | #ifndef KSTRING_H 27 | #define KSTRING_H 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #ifndef kroundup32 35 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 36 | #endif 37 | 38 | #ifndef KSTRING_T 39 | #define KSTRING_T kstring_t 40 | typedef struct __kstring_t { 41 | unsigned l, m; 42 | char *s; 43 | } kstring_t; 44 | #endif 45 | 46 | typedef struct { 47 | uint64_t tab[4]; 48 | int sep, finished; 49 | const char *p; // end of the current token 50 | } ks_tokaux_t; 51 | 52 | static inline void ks_resize(kstring_t *s, size_t size) 53 | { 54 | if (s->m < size) { 55 | s->m = size; 56 | kroundup32(s->m); 57 | s->s = (char*)realloc(s->s, s->m); 58 | } 59 | } 60 | 61 | static inline int kputsn(const char *p, int l, kstring_t *s) 62 | { 63 | if (s->l + l + 1 >= s->m) { 64 | s->m = s->l + l + 2; 65 | kroundup32(s->m); 66 | s->s = (char*)realloc(s->s, s->m); 67 | } 68 | memcpy(s->s + s->l, p, l); 69 | s->l += l; 70 | s->s[s->l] = 0; 71 | return l; 72 | } 73 | 74 | static inline int kputs(const char *p, kstring_t *s) 75 | { 76 | return kputsn(p, strlen(p), s); 77 | } 78 | 79 | static inline int kputc(int c, kstring_t *s) 80 | { 81 | if (s->l + 1 >= s->m) { 82 | s->m = s->l + 2; 83 | kroundup32(s->m); 84 | s->s = (char*)realloc(s->s, s->m); 85 | } 86 | s->s[s->l++] = c; 87 | s->s[s->l] = 0; 88 | return c; 89 | } 90 | 91 | static inline void kputc_(int c, kstring_t *s) 92 | { 93 | if 
(s->l + 1 > s->m) { 94 | s->m = s->l + 1; 95 | kroundup32(s->m); 96 | s->s = (char*)realloc(s->s, s->m); 97 | } 98 | s->s[s->l++] = c; 99 | } 100 | 101 | static inline void kputsn_(const void *p, int l, kstring_t *s) 102 | { 103 | if (s->l + l > s->m) { 104 | s->m = s->l + l; 105 | kroundup32(s->m); 106 | s->s = (char*)realloc(s->s, s->m); 107 | } 108 | memcpy(s->s + s->l, p, l); 109 | s->l += l; 110 | } 111 | 112 | static inline int kputw(int c, kstring_t *s) 113 | { 114 | char buf[16]; 115 | int l, x; 116 | if (c == 0) return kputc('0', s); 117 | for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; 118 | if (c < 0) buf[l++] = '-'; 119 | if (s->l + l + 1 >= s->m) { 120 | s->m = s->l + l + 2; 121 | kroundup32(s->m); 122 | s->s = (char*)realloc(s->s, s->m); 123 | } 124 | for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; 125 | s->s[s->l] = 0; 126 | return 0; 127 | } 128 | 129 | static inline int kputuw(unsigned c, kstring_t *s) 130 | { 131 | char buf[16]; 132 | int l, i; 133 | unsigned x; 134 | if (c == 0) return kputc('0', s); 135 | for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; 136 | if (s->l + l + 1 >= s->m) { 137 | s->m = s->l + l + 2; 138 | kroundup32(s->m); 139 | s->s = (char*)realloc(s->s, s->m); 140 | } 141 | for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; 142 | s->s[s->l] = 0; 143 | return 0; 144 | } 145 | 146 | static inline int ksprintf(kstring_t *s, const char *fmt, ...) 147 | { 148 | va_list ap; 149 | int l; 150 | va_start(ap, fmt); 151 | l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'. 
152 | va_end(ap); 153 | if ((size_t)l + 1 > s->m - s->l) { 154 | s->m = s->l + l + 2; 155 | kroundup32(s->m); 156 | s->s = (char*)realloc(s->s, s->m); 157 | va_start(ap, fmt); 158 | l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); 159 | } 160 | va_end(ap); 161 | s->l += l; 162 | return l; 163 | } 164 | 165 | /* Non-destructive tokenizer. Returns the start of the next token of str, or of the string remembered in aux when str is NULL, and leaves aux->p on the delimiter (or terminator) that ends it; returns NULL once the string is exhausted. A non-NULL sep (re)initialises the delimiter set: one character is stored in aux->sep, several characters become a 256-bit table in aux->tab with aux->sep set to -1. Bytes are handled as unsigned char so that values >= 0x80 index the table correctly and a single such delimiter is not mistaken for table mode. */ static inline char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux) 166 | { 167 | const unsigned char *p, *start, *usep = (const unsigned char*)sep; 168 | if (sep) { // set up the table 169 | if (str == 0 && aux->finished) return 0; // no need to set up if we have finished 170 | aux->finished = 0; 171 | if (usep[0] && usep[1]) { 172 | aux->sep = -1; 173 | aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; 174 | for (p = usep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f); 175 | } else aux->sep = usep[0]; 176 | } 177 | if (aux->finished) return 0; 178 | else if (str) start = (const unsigned char*)str, aux->finished = 0; else start = (const unsigned char*)aux->p + 1; /* resume just past the previous delimiter */ 179 | if (aux->sep < 0) { 180 | for (p = start; *p; ++p) 181 | if (aux->tab[*p>>6]>>(*p&0x3f)&1) break; 182 | } else { 183 | for (p = start; *p; ++p) 184 | if (*p == aux->sep) break; 185 | } 186 | aux->p = (const char*)p; // end of token 187 | if (*p == 0) aux->finished = 1; // no more tokens 188 | return (char*)start; 189 | } 190 | 191 | #endif 192 | -------------------------------------------------------------------------------- /cv.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "sam.h" 7 | #include "kseq.h" 8 | KSTREAM_INIT(gzFile, gzread, 8192) 9 | 10 | #define NUM_BIN_SIZE 73 11 | 12 | void *bed_read(const char *fn); 13 | int bed_overlap(const void *_h, const char *chr, int beg, int end); 14 | void bed_destroy(void *_h); 15 | uint64_t bed_totlen(void *_bed); 16 | 17 | uint64_t bin_size_table[NUM_BIN_SIZE] = { 18 | 1, 2, 3, 4, 5, 6, 7, 8, 9, 19 | 10, 20, 30, 40, 50, 60, 70, 80, 90, 20 | 100, 200, 300, 400, 500, 600, 700, 800, 900, 21 | 1000, 2000, 3000, 4000, 5000, 
6000, 7000, 8000, 9000, 22 | 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 23 | 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 24 | 1000000, 2000000, 3000000, 4000000, 5000000, 6000000, 7000000, 8000000, 9000000, 25 | 10000000, 20000000, 30000000, 40000000, 50000000, 60000000, 70000000, 80000000, 90000000, 26 | 100000000 27 | }; 28 | 29 | // update results after reading a new base pair, whose depth is n_plp 30 | static void lt_cv_auto(uint64_t n_plp, uint64_t* _n_bp, double* current_value, double* n, double* sum, double* sqsum) { 31 | int i; 32 | ++*_n_bp; 33 | for (i = 0; i < NUM_BIN_SIZE; ++i) { 34 | current_value[i] += n_plp; 35 | if (*_n_bp % bin_size_table[i] == 0) { // after a whole bin is read, store the 0th, 1st, and 2nd moments 36 | ++n[i]; 37 | sum[i] += current_value[i]; 38 | sqsum[i] += current_value[i] * current_value[i]; 39 | current_value[i] = 0; 40 | } 41 | } 42 | } 43 | 44 | int main_cv(int argc, char *argv[]) 45 | { 46 | int bed_input = 0; 47 | bam_plp_t plp; 48 | BGZF *fp; 49 | bam_hdr_t *h; 50 | const bam_pileup1_t *p; 51 | int i, c, n_plp, tid, pos, last_pos = -1, last_tid = -1; 52 | uint64_t bed_len = 0, n_bp = 0; 53 | double *current_value, *n, *sum, *sqsum; 54 | void *bed = 0; 55 | gzFile c_fp; 56 | kstream_t *ks; 57 | int dret, st = -1, en = -1; 58 | kstring_t *str_chr, *str_num; 59 | 60 | while ((c = getopt(argc, argv, "b:c")) >= 0) { 61 | if (c == 'b') { 62 | bed = bed_read(optarg); 63 | bed_len = bed_totlen(bed); 64 | fprintf(stderr, "[M::%s] total length in BED: %ld\n", __func__, (long)bed_len); 65 | } else if (c == 'c') { 66 | bed_input = 1; 67 | } 68 | } 69 | if (optind == argc) { 70 | fprintf(stderr, "Usage: lianti cv [-b bed] \n"); 71 | fprintf(stderr, " or: lianti cv -c [-b bed] \n"); 72 | return 1; 73 | } 74 | 75 | current_value = (double*)calloc(NUM_BIN_SIZE, sizeof(double)); 76 | n = (double*)calloc(NUM_BIN_SIZE, sizeof(double)); 77 | sum = (double*)calloc(NUM_BIN_SIZE, sizeof(double)); 
78 | sqsum = (double*)calloc(NUM_BIN_SIZE, sizeof(double)); 79 | 80 | if (bed_input) { // if input is a BED file ("-c") (for example, from "lianti count") 81 | // below are modified from bedidx.c and count.c 82 | c_fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); 83 | str_chr = (kstring_t*)calloc(1, sizeof(kstring_t)); 84 | str_num = (kstring_t*)calloc(1, sizeof(kstring_t)); 85 | ks = ks_init(c_fp); 86 | while (ks_getuntil(ks, 0, str_chr, &dret) >= 0) { // read chr name 87 | for (i = 0; i < 3; ++i) { 88 | ks_getuntil(ks, 0, str_num, &dret); 89 | if (i == 0) st = atoi(str_num->s); // read region start 90 | else if (i == 1) en = atoi(str_num->s); // read region end 91 | else if (i == 2) n_plp = atoi(str_num->s); // read allele count 92 | } 93 | if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line 94 | for (pos = st; pos < en; ++pos) { // check intersection with the mask bp-by-bp 95 | if (bed_overlap(bed, str_chr->s, pos, pos + 1) == 0) 96 | continue; 97 | lt_cv_auto(n_plp, &n_bp, current_value, n, sum, sqsum); 98 | } 99 | } 100 | ks_destroy(ks); 101 | gzclose(c_fp); 102 | free(str_chr->s); 103 | free(str_num->s); 104 | free(str_chr); 105 | free(str_num); 106 | // above are modified from bedidx.c and count.c 107 | } else { // if input is a BAM file 108 | fp = bgzf_open(argv[optind], "r"); 109 | h = bam_hdr_read(fp); 110 | 111 | plp = bam_plp_init((bam_plp_auto_f)bam_read1, fp); 112 | while ((p = bam_plp_auto(plp, &tid, &pos, &n_plp)) != 0) { 113 | // below are modified from develop branch of bam2depth.c (Jul 1, 2016) 114 | while (tid > last_tid) { 115 | if (last_tid >= 0) { 116 | // Deal with remainder or entirety of last tid. 117 | while (++last_pos < h->target_len[last_tid]) { 118 | // Horribly inefficient, but the bed API is an obfuscated black box. 
119 | if (bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) 120 | continue; 121 | lt_cv_auto(0, &n_bp, current_value, n, sum, sqsum); 122 | } 123 | } 124 | last_tid++; 125 | last_pos = -1; 126 | } 127 | // Deal with missing portion of current tid 128 | while (++last_pos < pos) { 129 | if (bed_overlap(bed, h->target_name[tid], last_pos, last_pos + 1) == 0) 130 | continue; 131 | lt_cv_auto(0, &n_bp, current_value, n, sum, sqsum); 132 | } 133 | last_tid = tid; 134 | last_pos = pos; 135 | if (bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) 136 | continue; 137 | lt_cv_auto(n_plp, &n_bp, current_value, n, sum, sqsum); 138 | // above are modified from develop branch of bam2depth.c (Jul 1, 2016) 139 | } 140 | bam_plp_destroy(plp); 141 | bam_hdr_destroy(h); 142 | bgzf_close(fp); 143 | } 144 | 145 | for (i = 0; i < NUM_BIN_SIZE; ++i) { 146 | printf("%ld\t%f\n", (long)bin_size_table[i], sqrt((sqsum[i] - sum[i] * sum[i] / n[i]) / n[i]) / sum[i] * n[i]); // calculate CV from the moments 147 | } 148 | 149 | free(current_value); 150 | free(n); 151 | free(sum); 152 | free(sqsum); 153 | if (bed) bed_destroy(bed); 154 | return 0; 155 | } 156 | -------------------------------------------------------------------------------- /ksort.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2011 Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or 
substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | // This is a simplified version of ksort.h 27 | 28 | #ifndef AC_KSORT_H 29 | #define AC_KSORT_H 30 | 31 | #include 32 | #include 33 | 34 | typedef struct { 35 | void *left, *right; 36 | int depth; 37 | } ks_isort_stack_t; 38 | 39 | #define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } 40 | 41 | #define KSORT_INIT(name, type_t, __sort_lt) \ 42 | static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ 43 | { \ 44 | type_t *i, *j, swap_tmp; \ 45 | for (i = s + 1; i < t; ++i) \ 46 | for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ 47 | swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ 48 | } \ 49 | } \ 50 | void ks_combsort_##name(size_t n, type_t a[]) \ 51 | { \ 52 | const double shrink_factor = 1.2473309501039786540366528676643; \ 53 | int do_swap; \ 54 | size_t gap = n; \ 55 | type_t tmp, *i, *j; \ 56 | do { \ 57 | if (gap > 2) { \ 58 | gap = (size_t)(gap / shrink_factor); \ 59 | if (gap == 9 || gap == 10) gap = 11; \ 60 | } \ 61 | do_swap = 0; \ 62 | for (i = a; i < a + n - gap; ++i) { \ 63 | j = i + gap; \ 64 | if (__sort_lt(*j, *i)) { \ 65 | tmp = *i; *i = *j; *j = tmp; \ 66 | do_swap = 1; \ 67 | } \ 68 | } \ 69 | } while (do_swap || gap > 2); \ 70 | if (gap != 1) __ks_insertsort_##name(a, a + n); \ 71 | } \ 72 | void ks_introsort_##name(size_t n, type_t a[]) \ 73 | { \ 74 | int d; \ 75 | ks_isort_stack_t *top, *stack; \ 76 | type_t rp, swap_tmp; \ 77 | type_t *s, *t, *i, *j, 
*k; \ 78 | \ 79 | if (n < 1) return; \ 80 | else if (n == 2) { \ 81 | if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ 82 | return; \ 83 | } \ 84 | for (d = 2; 1ul<>1) + 1; \ 95 | if (__sort_lt(*k, *i)) { \ 96 | if (__sort_lt(*k, *j)) k = j; \ 97 | } else k = __sort_lt(*j, *i)? i : j; \ 98 | rp = *k; \ 99 | if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ 100 | for (;;) { \ 101 | do ++i; while (__sort_lt(*i, rp)); \ 102 | do --j; while (i <= j && __sort_lt(rp, *j)); \ 103 | if (j <= i) break; \ 104 | swap_tmp = *i; *i = *j; *j = swap_tmp; \ 105 | } \ 106 | swap_tmp = *i; *i = *t; *t = swap_tmp; \ 107 | if (i-s > t-i) { \ 108 | if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ 109 | s = t-i > 16? i+1 : t; \ 110 | } else { \ 111 | if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ 112 | t = i-s > 16? i-1 : s; \ 113 | } \ 114 | } else { \ 115 | if (top == stack) { \ 116 | free(stack); \ 117 | __ks_insertsort_##name(a, a+n); \ 118 | return; \ 119 | } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ 120 | } \ 121 | } \ 122 | } \ 123 | type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ 124 | { \ 125 | type_t *low, *high, *k, *ll, *hh, *mid; \ 126 | low = arr; high = arr + n - 1; k = arr + kk; \ 127 | for (;;) { \ 128 | if (high <= low) return *k; \ 129 | if (high == low + 1) { \ 130 | if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ 131 | return *k; \ 132 | } \ 133 | mid = low + (high - low) / 2; \ 134 | if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ 135 | if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ 136 | if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ 137 | KSORT_SWAP(type_t, *mid, *(low+1)); \ 138 | ll = low + 1; hh = high; \ 139 | for (;;) { \ 140 | do ++ll; while (__sort_lt(*ll, *low)); \ 141 | do --hh; while (__sort_lt(*low, *hh)); \ 142 | if (hh < ll) break; \ 143 | 
/* Options of the "count" command. */
typedef struct {
	int no_merge;            // non-zero: lt_buf_push() does not try test_merge()
	int min_frag, min_frag2; // min_frag: lt_buf_push() drops fragments with fewer reads; min_frag2: read threshold of the 2nd depth column
	int min_mq;              // fragments below this mapping quality only count towards the last depth column
} lt_copt_t;

/* Reset every option to its default: merging enabled, min_frag=5,
 * min_frag2=10 and min_mq=40. */
void lt_copt_init(lt_copt_t *opt)
{
	static const lt_copt_t defaults = {
		.no_merge  = 0,
		.min_frag  = 5,
		.min_frag2 = 10,
		.min_mq    = 40
	};
	*opt = defaults;
}
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); 55 | if (fp == 0) return 0; 56 | r = (lt_reader_t*)calloc(1, sizeof(lt_reader_t)); 57 | r->fp = fp; 58 | r->ks = ks_init(fp); 59 | return r; 60 | } 61 | 62 | void lt_cnt_close(lt_reader_t *r) 63 | { 64 | int i; 65 | for (i = 0; i < r->n_ctg; ++i) free(r->ctg[i]); 66 | free(r->ctg); 67 | free(r->s.s); 68 | ks_destroy(r->ks); 69 | gzclose(r->fp); 70 | free(r); 71 | } 72 | 73 | int lt_cnt_read(lt_reader_t *r, lt_frag_t *f) 74 | { 75 | int i, ret, c, dret; 76 | char *p, *q; 77 | if ((ret = ks_getuntil(r->ks, KS_SEP_LINE, &r->s, &dret)) < 0) return ret; 78 | for (p = q = r->s.s, i = 0;; ++p) { 79 | if (*p == 0 || *p == '\t') { 80 | c = *p, *p = 0; 81 | if (i == 0) { // contig name 82 | if (r->n_ctg == 0 || strcmp(r->ctg[r->n_ctg-1], q) != 0) { 83 | if (r->n_ctg == r->m_ctg) { 84 | r->m_ctg = r->m_ctg? r->m_ctg<<1 : 4; 85 | r->ctg = (char**)realloc(r->ctg, r->m_ctg * sizeof(char*)); 86 | } 87 | r->ctg[r->n_ctg++] = strdup(q); 88 | } 89 | f->ctg = r->n_ctg - 1; 90 | } else if (i == 1) { 91 | f->st = atoi(q); 92 | } else if (i == 2) { 93 | f->en = atoi(q); 94 | } else if (i == 3) { 95 | char *t; 96 | f->n_seg = strtol(q, &t, 10); 97 | f->n_frag = strtol(t + 1, &t, 10); 98 | assert(t[1] == '<' || t[1] == '|'); 99 | f->lo = t[1] == '<'? 1 : 0; 100 | f->ro = t[2] == '>'? 
1 : 0; 101 | } else if (i == 6) { 102 | f->mq = atoi(q); 103 | } 104 | if (c == 0) break; 105 | ++i, q = p + 1; 106 | } 107 | } 108 | return 0; 109 | } 110 | 111 | typedef struct { 112 | kdq_t(lt_frag_t) *q; 113 | int last_ctg, last_pos; 114 | } lt_cntbuf_t; 115 | 116 | lt_cntbuf_t *lt_buf_init(void) 117 | { 118 | lt_cntbuf_t *b; 119 | b = (lt_cntbuf_t*)calloc(1, sizeof(lt_cntbuf_t)); 120 | b->q = kdq_init(lt_frag_t); 121 | return b; 122 | } 123 | 124 | void lt_buf_destroy(lt_cntbuf_t *b) 125 | { 126 | kdq_destroy(lt_frag_t, b->q); 127 | free(b); 128 | } 129 | 130 | static void clear_up_to(const lt_copt_t *opt, lt_cntbuf_t *b, int end, char *const* ctg) 131 | { 132 | int s = b->last_pos; 133 | kdq_t(lt_frag_t) *q = b->q; 134 | while (kdq_size(q) && s < end) { 135 | int i, s2 = end, d = 0, d2 = 0, d3 = 0; 136 | for (i = 0; i < kdq_size(q); ++i) { 137 | lt_frag_t *f = &kdq_at(q, i); 138 | if (s >= f->st && s < f->en) { // overlapping the counting position 139 | if (f->mq >= opt->min_mq) ++d; 140 | if (f->mq >= opt->min_mq && f->n_frag >= opt->min_frag2) ++d2; 141 | ++d3; 142 | if (f->en <= end) 143 | s2 = s2 < f->en? s2 : f->en; 144 | } else if (s < f->st) { // start after the counting position 145 | s2 = s2 < f->st? s2 : f->st; 146 | } 147 | } 148 | if (s2 != INT_MAX) { 149 | printf("%s\t%d\t%d\t%d\t%d\t%d\n", ctg[b->last_ctg], s, s2, d, d2, d3); 150 | cnt_hist[d < MAX_HIST? 
d : MAX_HIST] += s2 - s; 151 | } 152 | s = s2; 153 | while (kdq_size(q) && kdq_first(q).en <= s) 154 | kdq_shift(lt_frag_t, q); 155 | } 156 | if (end != INT_MAX) { 157 | cnt_hist[0] += end - s; 158 | if (s < end) printf("%s\t%d\t%d\t0\t0\t0\n", ctg[b->last_ctg], s, end); 159 | } 160 | b->last_pos = end; 161 | } 162 | 163 | static int test_merge(lt_cntbuf_t *b, lt_frag_t *f) 164 | { 165 | if (f->ctg == b->last_ctg && f->lo) { 166 | int i, max = 0, max_i = -1; 167 | kdq_t(lt_frag_t) *q = b->q; 168 | for (i = 0; i < kdq_size(q); ++i) { 169 | lt_frag_t *g = &kdq_at(q, i); 170 | if (g->ro && f->st < g->en && f->en >= g->en) { 171 | if (g->en - f->st > max) max = g->en - f->st, max_i = i; 172 | } 173 | } 174 | if (max > 0) { 175 | lt_frag_t *g = &kdq_at(q, max_i); 176 | assert(f->en >= g->en && f->st < g->en && f->st >= g->st); 177 | g->ro = f->ro, g->en = f->en; 178 | ++g->n_seg, g->n_frag += f->n_frag; 179 | return 1; 180 | } 181 | } 182 | return 0; 183 | } 184 | 185 | void lt_buf_push(const lt_copt_t *opt, lt_cntbuf_t *b, lt_frag_t *f, char *const* ctg) 186 | { 187 | if (f) { 188 | lt_frag_t *p; 189 | if (!opt->no_merge && test_merge(b, f)) return; 190 | if (f->n_frag < opt->min_frag) return; 191 | if (f->ctg != b->last_ctg) { 192 | clear_up_to(opt, b, INT_MAX, ctg); 193 | b->last_ctg = f->ctg; 194 | b->last_pos = 0; 195 | } 196 | clear_up_to(opt, b, f->st, ctg); 197 | b->last_pos = f->st; 198 | p = kdq_pushp(lt_frag_t, b->q); 199 | memcpy(p, f, sizeof(lt_frag_t)); 200 | } else clear_up_to(opt, b, INT_MAX, ctg); 201 | } 202 | 203 | #include 204 | 205 | int main_count(int argc, char *argv[]) 206 | { 207 | int c, i; 208 | lt_copt_t opt; 209 | lt_reader_t *r; 210 | lt_frag_t f; 211 | lt_cntbuf_t *b; 212 | 213 | lt_copt_init(&opt); 214 | while ((c = getopt(argc, argv, "Mn:q:")) >= 0) { 215 | if (c == 'M') opt.no_merge = 1; 216 | else if (c == 'q') opt.min_mq = atoi(optarg); 217 | else if (c == 'n') { 218 | char *q; 219 | opt.min_frag = strtol(optarg, &q, 10); 220 | 
opt.min_frag2 = *q == ','? atoi(q+1) : opt.min_frag*2; 221 | } 222 | } 223 | if (optind == argc) { 224 | fprintf(stderr, "Usage: lianti group | lianti count [options] -\n"); 225 | fprintf(stderr, "Options:\n"); 226 | fprintf(stderr, " -n INT1[,INT2] ignore fragments consisting of =%d&&nReads>=%d} depth{mapQ>=%d&&nReads>=%d} depthAll\n", opt.min_mq, opt.min_frag, opt.min_mq, opt.min_frag2); 231 | return 1; 232 | } 233 | 234 | r = lt_cnt_open(argv[optind]); 235 | b = lt_buf_init(); 236 | while (lt_cnt_read(r, &f) >= 0) 237 | lt_buf_push(&opt, b, &f, r->ctg); 238 | lt_buf_push(&opt, b, 0, r->ctg); 239 | lt_buf_destroy(b); 240 | lt_cnt_close(r); 241 | for (i = 0; i <= MAX_HIST; ++i) 242 | fprintf(stderr, "H\t%d\t%ld\n", i, (long)cnt_hist[i]); 243 | return 0; 244 | } 245 | -------------------------------------------------------------------------------- /sam.h: -------------------------------------------------------------------------------- 1 | #ifndef BAM_H 2 | #define BAM_H 3 | 4 | #include 5 | #include "bgzf.h" 6 | #include "hts.h" 7 | 8 | /********************** 9 | *** SAM/BAM header *** 10 | **********************/ 11 | 12 | typedef struct { 13 | int32_t n_targets, ignore_sam_err; 14 | uint32_t l_text; 15 | uint32_t *target_len; 16 | uint8_t *cigar_tab; 17 | char **target_name; 18 | char *text; 19 | void *sdict; 20 | } bam_hdr_t; 21 | 22 | /**************************** 23 | *** CIGAR related macros *** 24 | ****************************/ 25 | 26 | #define BAM_CMATCH 0 27 | #define BAM_CINS 1 28 | #define BAM_CDEL 2 29 | #define BAM_CREF_SKIP 3 30 | #define BAM_CSOFT_CLIP 4 31 | #define BAM_CHARD_CLIP 5 32 | #define BAM_CPAD 6 33 | #define BAM_CEQUAL 7 34 | #define BAM_CDIFF 8 35 | #define BAM_CBACK 9 36 | 37 | #define BAM_CIGAR_STR "MIDNSHP=XB" 38 | #define BAM_CIGAR_SHIFT 4 39 | #define BAM_CIGAR_MASK 0xf 40 | #define BAM_CIGAR_TYPE 0x3C1A7 41 | 42 | #define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK) 43 | #define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT) 44 | #define 
bam_cigar_opchr(c) (BAM_CIGAR_STR[bam_cigar_op(c)]) 45 | #define bam_cigar_gen(l, o) ((l)<>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference 47 | 48 | #define BAM_FPAIRED 1 49 | #define BAM_FPROPER_PAIR 2 50 | #define BAM_FUNMAP 4 51 | #define BAM_FMUNMAP 8 52 | #define BAM_FREVERSE 16 53 | #define BAM_FMREVERSE 32 54 | #define BAM_FREAD1 64 55 | #define BAM_FREAD2 128 56 | #define BAM_FSECONDARY 256 57 | #define BAM_FQCFAIL 512 58 | #define BAM_FDUP 1024 59 | #define BAM_FSUPP 2048 60 | 61 | /************************* 62 | *** Alignment records *** 63 | *************************/ 64 | 65 | typedef struct { 66 | int32_t tid; 67 | int32_t pos; 68 | uint32_t bin:16, qual:8, l_qname:8; 69 | uint32_t flag:16, n_cigar:16; 70 | int32_t l_qseq; 71 | int32_t mtid; 72 | int32_t mpos; 73 | int32_t isize; 74 | } bam1_core_t; 75 | 76 | typedef struct { 77 | bam1_core_t core; 78 | int l_data, m_data; 79 | uint8_t *data; 80 | } bam1_t; 81 | 82 | #define bam_is_rev(b) (((b)->core.flag&BAM_FREVERSE) != 0) 83 | #define bam_is_mrev(b) (((b)->core.flag&BAM_FMREVERSE) != 0) 84 | #define bam_get_qname(b) ((char*)(b)->data) 85 | #define bam_get_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname)) 86 | #define bam_get_seq(b) ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname) 87 | #define bam_get_qual(b) ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1)) 88 | #define bam_get_aux(b) ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1) + (b)->core.l_qseq) 89 | #define bam_get_l_aux(b) ((b)->l_data - ((b)->core.n_cigar<<2) - (b)->core.l_qname - (b)->core.l_qseq - (((b)->core.l_qseq + 1)>>1)) 90 | #define bam_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf) 91 | 92 | /************************** 93 | *** Exported functions *** 94 | **************************/ 95 | 96 | #ifdef __cplusplus 97 | extern "C" { 98 | #endif 99 | 100 | /*************** 101 | *** BAM I/O *** 102 | ***************/ 103 | 104 | 
bam_hdr_t *bam_hdr_read(BGZF *fp); 105 | int bam_hdr_write(BGZF *fp, const bam_hdr_t *h); 106 | void bam_hdr_destroy(bam_hdr_t *h); 107 | int bam_name2id(bam_hdr_t *h, const char *ref); 108 | 109 | bam1_t *bam_init1(void); 110 | void bam_destroy1(bam1_t *b); 111 | int bam_read1(BGZF *fp, bam1_t *b); 112 | int bam_write1(BGZF *fp, const bam1_t *b); 113 | bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc); 114 | int bam_readrec(BGZF *fp, void *null, bam1_t *b, int *tid, int *beg, int *end); 115 | 116 | int bam_cigar2qlen(int n_cigar, const uint32_t *cigar); 117 | int bam_cigar2rlen(int n_cigar, const uint32_t *cigar); 118 | 119 | /******************** 120 | *** BAM indexing *** 121 | ********************/ 122 | 123 | #define bam_itr_destroy(iter) hts_itr_destroy(iter) 124 | #define bam_itr_queryi(idx, tid, beg, end) hts_itr_query(idx, tid, beg, end) 125 | #define bam_itr_querys(idx, hdr, s) hts_itr_querys((idx), (s), (hts_name2id_f)(bam_name2id), (hdr)) 126 | #define bam_itr_next(fp, itr, r) hts_itr_next((fp), (itr), (r), (hts_readrec_f)(bam_readrec), 0) 127 | #define bam_index_load(fn) hts_idx_load((fn), HTS_FMT_BAI) 128 | 129 | int bam_index_build(const char *fn, int min_shift); 130 | 131 | /*************** 132 | *** SAM I/O *** 133 | ***************/ 134 | 135 | #define sam_open(fn, mode, fnaux) hts_open(fn, mode, fnaux) 136 | #define sam_close(fp) hts_close(fp) 137 | 138 | typedef htsFile samFile; 139 | bam_hdr_t *sam_hdr_parse(int l_text, const char *text); 140 | bam_hdr_t *sam_hdr_read(samFile *fp); 141 | int sam_hdr_write(samFile *fp, const bam_hdr_t *h); 142 | 143 | int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b); 144 | int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str); 145 | int sam_read1(samFile *fp, bam_hdr_t *h, bam1_t *b); 146 | int sam_write1(samFile *fp, const bam_hdr_t *h, const bam1_t *b); 147 | 148 | /************************************* 149 | *** Manipulating auxiliary fields *** 150 | 
*************************************/ 151 | 152 | uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]); 153 | int32_t bam_aux2i(const uint8_t *s); 154 | double bam_aux2f(const uint8_t *s); 155 | char bam_aux2A(const uint8_t *s); 156 | char *bam_aux2Z(const uint8_t *s); 157 | 158 | void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data); 159 | int bam_aux_del(bam1_t *b, uint8_t *s); 160 | 161 | #ifdef __cplusplus 162 | } 163 | #endif 164 | 165 | /************************** 166 | *** Pileup and Mpileup *** 167 | **************************/ 168 | 169 | #if !defined(BAM_NO_PILEUP) 170 | 171 | #define BAM_PLP_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) 172 | 173 | typedef struct { 174 | bam1_t *b; 175 | int32_t qpos; 176 | int indel, level; 177 | uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28; 178 | } bam_pileup1_t; 179 | 180 | typedef int (*bam_plp_auto_f)(void *data, bam1_t *b); 181 | 182 | struct __bam_plp_t; 183 | typedef struct __bam_plp_t *bam_plp_t; 184 | 185 | struct __bam_mplp_t; 186 | typedef struct __bam_mplp_t *bam_mplp_t; 187 | 188 | #ifdef __cplusplus 189 | extern "C" { 190 | #endif 191 | 192 | bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data); 193 | void bam_plp_destroy(bam_plp_t iter); 194 | int bam_plp_push(bam_plp_t iter, const bam1_t *b); 195 | const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); 196 | const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); 197 | void bam_plp_set_mask(bam_plp_t iter, int mask); 198 | void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); 199 | void bam_plp_reset(bam_plp_t iter); 200 | 201 | bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data); 202 | void bam_mplp_set_mask(bam_mplp_t iter, int mask); 203 | void bam_mplp_destroy(bam_mplp_t iter); 204 | void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); 205 | int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int 
*n_plp, const bam_pileup1_t **plp); 206 | 207 | #ifdef __cplusplus 208 | } 209 | #endif 210 | 211 | #endif // ~!defined(BAM_NO_PILEUP) 212 | 213 | #endif 214 | -------------------------------------------------------------------------------- /bgzf.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 4 | 2011, 2012 Attractive Chaos 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 23 | */ 24 | 25 | /* The BGZF library was originally written by Bob Handsaker from the Broad 26 | * Institute. It was later improved by the SAMtools developers. 
*/ 27 | 28 | #ifndef __BGZF_H 29 | #define __BGZF_H 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | #define BGZF_BLOCK_SIZE 0xff00 // make sure compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE 37 | #define BGZF_MAX_BLOCK_SIZE 0x10000 38 | 39 | #define BGZF_ERR_ZLIB 1 40 | #define BGZF_ERR_HEADER 2 41 | #define BGZF_ERR_IO 4 42 | #define BGZF_ERR_MISUSE 8 43 | 44 | typedef struct { 45 | int errcode:16, is_write:2, is_be:2, compress_level:12; 46 | int cache_size; 47 | int block_length, block_offset; 48 | int64_t block_address; 49 | void *uncompressed_block, *compressed_block; 50 | void *cache; // a pointer to a hash table 51 | void *fp; // actual file handler; FILE* on writing; FILE* or knetFile* on reading 52 | #ifdef BGZF_MT 53 | void *mt; // only used for multi-threading 54 | #endif 55 | } BGZF; 56 | 57 | #ifndef KSTRING_T 58 | #define KSTRING_T kstring_t 59 | typedef struct __kstring_t { 60 | size_t l, m; 61 | char *s; 62 | } kstring_t; 63 | #endif 64 | 65 | #ifdef __cplusplus 66 | extern "C" { 67 | #endif 68 | 69 | /****************** 70 | * Basic routines * 71 | ******************/ 72 | 73 | /** 74 | * Open an existing file descriptor for reading or writing. 75 | * 76 | * @param fd file descriptor 77 | * @param mode mode matching /[rwu0-9]+/: 'r' for reading, 'w' for writing and a digit specifies 78 | * the zlib compression level; if both 'r' and 'w' are present, 'w' is ignored. 79 | * @return BGZF file handler; 0 on error 80 | */ 81 | BGZF* bgzf_dopen(int fd, const char *mode); 82 | 83 | #define bgzf_fdopen(fd, mode) bgzf_dopen((fd), (mode)) // for backward compatibility 84 | 85 | /** 86 | * Open the specified file for reading or writing. 87 | */ 88 | BGZF* bgzf_open(const char* path, const char *mode); 89 | 90 | /** 91 | * Close the BGZF and free all associated resources. 
92 | * 93 | * @param fp BGZF file handler 94 | * @return 0 on success and -1 on error 95 | */ 96 | int bgzf_close(BGZF *fp); 97 | 98 | /** 99 | * Read up to _length_ bytes from the file storing into _data_. 100 | * 101 | * @param fp BGZF file handler 102 | * @param data data array to read into 103 | * @param length size of data to read 104 | * @return number of bytes actually read; 0 on end-of-file and -1 on error 105 | */ 106 | ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length); 107 | 108 | /** 109 | * Write _length_ bytes from _data_ to the file. 110 | * 111 | * @param fp BGZF file handler 112 | * @param data data array to write 113 | * @param length size of data to write 114 | * @return number of bytes actually written; -1 on error 115 | */ 116 | ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length); 117 | 118 | /** 119 | * Write the data in the buffer to the file. 120 | */ 121 | int bgzf_flush(BGZF *fp); 122 | 123 | /** 124 | * Return a virtual file pointer to the current location in the file. 125 | * No interpretation of the value should be made, other than a subsequent 126 | * call to bgzf_seek can be used to position the file at the same point. 127 | * Return value is non-negative on success. 128 | */ 129 | #define bgzf_tell(fp) ((((BGZF*)fp)->block_address << 16) | (((BGZF*)fp)->block_offset & 0xFFFF)) 130 | 131 | /** 132 | * Set the file to read from the location specified by _pos_.
133 | * 134 | * @param fp BGZF file handler 135 | * @param pos virtual file offset returned by bgzf_tell() 136 | * @param whence must be SEEK_SET 137 | * @return 0 on success and -1 on error 138 | */ 139 | int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence); 140 | 141 | /** 142 | * Check if the BGZF end-of-file (EOF) marker is present 143 | * 144 | * @param fp BGZF file handler opened for reading 145 | * @return 1 if EOF is present; 0 if not or on I/O error 146 | */ 147 | int bgzf_check_EOF(BGZF *fp); 148 | 149 | /** 150 | * Check if a file is in the BGZF format 151 | * 152 | * @param fn file name 153 | * @return 1 if _fn_ is BGZF; 0 if not or on I/O error 154 | */ 155 | int bgzf_is_bgzf(const char *fn); 156 | 157 | /********************* 158 | * Advanced routines * 159 | *********************/ 160 | 161 | /** 162 | * Set the cache size. Only effective when compiled with -DBGZF_CACHE. 163 | * 164 | * @param fp BGZF file handler 165 | * @param size size of cache in bytes; 0 to disable caching (default) 166 | */ 167 | void bgzf_set_cache_size(BGZF *fp, int size); 168 | 169 | /** 170 | * Flush the file if the remaining buffer size is smaller than _size_ 171 | */ 172 | int bgzf_flush_try(BGZF *fp, ssize_t size); 173 | 174 | /** 175 | * Read one byte from a BGZF file. It is faster than bgzf_read() 176 | * @param fp BGZF file handler 177 | * @return byte read; -1 on end-of-file or error 178 | */ 179 | int bgzf_getc(BGZF *fp); 180 | 181 | /** 182 | * Read one line from a BGZF file. It is faster than bgzf_getc() 183 | * 184 | * @param fp BGZF file handler 185 | * @param delim delimiter 186 | * @param str string to write to; must be initialized 187 | * @return length of the string; 0 on end-of-file; negative on error 188 | */ 189 | int bgzf_getline(BGZF *fp, int delim, kstring_t *str); 190 | 191 | /** 192 | * Read the next BGZF block.
193 | */ 194 | int bgzf_read_block(BGZF *fp); 195 | 196 | #ifdef BGZF_MT 197 | /** 198 | * Enable multi-threading (only effective on writing) 199 | * 200 | * @param fp BGZF file handler; must be opened for writing 201 | * @param n_threads #threads used for writing 202 | * @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended 203 | */ 204 | int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks); 205 | #endif 206 | 207 | #ifdef __cplusplus 208 | } 209 | #endif 210 | 211 | #endif 212 | -------------------------------------------------------------------------------- /sv.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "sam.h" 6 | #include "kvec.h" 7 | #include "ksort.h" 8 | 9 | typedef struct { 10 | int tid, pos, mapq, nm; 11 | int qs, qe, rs, re, rev; 12 | } side_t; 13 | 14 | typedef struct { 15 | int64_t mid; 16 | uint64_t pos[2]; 17 | int qgap; 18 | int dir:16, rdrev:16; 19 | } break_t; 20 | 21 | #define sv1_lt(a, b) ((a).mid < (b).mid) 22 | KSORT_INIT(sv1, break_t, sv1_lt) 23 | 24 | #define sv2_lt(a, b) ((a).pos[0] < (b).pos[0]) 25 | KSORT_INIT(sv2, break_t, sv2_lt) 26 | 27 | int main_sv(int argc, char *argv[]) 28 | { 29 | BGZF *fp; 30 | bam_hdr_t *h; 31 | bam1_t *b; 32 | int c, i, min_mapq = 50, max_nm = 3, print_bp = 0, max_gap = 50, min_cnt = 3; 33 | uint64_t *off; 34 | kvec_t(break_t) a = {0,0,0}; 35 | 36 | while ((c = getopt(argc, argv, "pq:m:g:n:")) >= 0) { 37 | if (c == 'q') min_mapq = atoi(optarg); 38 | else if (c == 'm') max_nm = atoi(optarg); 39 | else if (c == 'p') print_bp = 1; 40 | else if (c == 'g') max_gap = atoi(optarg); 41 | else if (c == 'n') min_cnt = atoi(optarg); 42 | } 43 | if (optind == argc) { 44 | fprintf(stderr, "Usage: lianti sv [options] \n"); 45 | fprintf(stderr, "Options:\n"); 46 | fprintf(stderr, " -q INT min mapping quality [%d]\n", min_mapq); 47 | fprintf(stderr, " -m INT max NM [%d]\n", max_nm); 48 | 
fprintf(stderr, " -g INT max gap [%d]\n", max_gap); 49 | fprintf(stderr, " -n INT min count [%d]\n", min_cnt); 50 | fprintf(stderr, " -p output break points, not the clustered SV calls\n"); 51 | return 1; 52 | } 53 | 54 | fp = bgzf_open(argv[optind], "r"); 55 | h = bam_hdr_read(fp); 56 | 57 | off = (uint64_t*)calloc(h->n_targets + 1, 8); 58 | for (i = 0; i < h->n_targets; ++i) off[i+1] = off[i] + h->target_len[i]; 59 | 60 | b = bam_init1(); 61 | while (bam_read1(fp, b) >= 0) { 62 | const bam1_core_t *c = &b->core; 63 | const uint8_t *SA = 0; 64 | char *sa, *p; 65 | int i, n_semicolon = 0; 66 | side_t s[2], t; 67 | int64_t mid; 68 | if ((c->flag & (BAM_FUNMAP|BAM_FSUPP|BAM_FQCFAIL|BAM_FSECONDARY|BAM_FDUP)) || c->tid < 0) continue; 69 | SA = bam_aux_get(b, "SA"); 70 | if (SA == 0) continue; 71 | sa = bam_aux2Z(SA); 72 | for (p = sa; *p; ++p) 73 | if (*p == ';') ++n_semicolon; 74 | if (n_semicolon != 1) continue; 75 | 76 | for (i = 0; i < 2; ++i) { 77 | int k, clip[2], ql, rl; 78 | clip[0] = clip[1] = ql = rl = 0; 79 | if (i == 0) { 80 | const uint8_t *NM; 81 | const uint32_t *cigar; 82 | cigar = bam_get_cigar(b); 83 | s[i].tid = c->tid; 84 | s[i].rs = c->pos; 85 | s[i].rev = (c->flag&BAM_FREVERSE)? 1 : 0; 86 | for (k = 0; k < c->n_cigar; ++k) { 87 | int op = bam_cigar_op(cigar[k]); 88 | int len = bam_cigar_oplen(cigar[k]); 89 | if (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP) clip[k?1:0] = len; 90 | else if (op == BAM_CMATCH) ql += len, rl += len; 91 | else if (op == BAM_CINS) ql += len; 92 | else if (op == BAM_CDEL || op == BAM_CREF_SKIP) rl += len; 93 | } 94 | s[i].mapq = c->qual; 95 | s[i].nm = ((NM = bam_aux_get(b, "NM")) != 0)? bam_aux2i(NM) : -1; 96 | } else { 97 | int n_op = 0; 98 | for (p = sa; *p != ','; ++p); 99 | *p = 0; 100 | s[i].tid = bam_name2id(h, sa); 101 | assert(s[i].tid >= 0 && s[i].tid < h->n_targets); 102 | s[i].rs = strtol(p+1, &p, 10) - 1; 103 | s[i].rev = p[1] == '+'? 
0 : 1; 104 | for (p += 3; *p && *p != ','; ++p, ++n_op) { 105 | int len; 106 | len = strtol(p, &p, 10); 107 | if (*p == 'H' || *p == 'S') clip[n_op?1:0] = len; 108 | else if (*p == 'M') ql += len, rl += len; 109 | else if (*p == 'I') ql += len; 110 | else if (*p == 'D' || *p == 'N') rl += len; 111 | } 112 | s[i].mapq = strtol(p+1, &p, 10); 113 | s[i].nm = strtol(p+1, &p, 10); 114 | } 115 | if (s[i].mapq < min_mapq || s[i].nm > max_nm) break; 116 | if (clip[0] == 0 && clip[1] == 0) break; 117 | if (s[i].rev) { 118 | s[i].qs = clip[1]; 119 | s[i].re = s[i].rs; 120 | s[i].rs = s[i].re + rl; 121 | } else { 122 | s[i].qs = clip[0]; 123 | s[i].re = s[i].rs + rl; 124 | } 125 | s[i].qe = s[i].qs + ql; 126 | } 127 | if (i != 2) continue; 128 | if (s[0].qs > s[1].qs) t = s[0], s[0] = s[1], s[1] = t; 129 | mid = ((off[s[0].tid] + s[0].re) + (off[s[1].tid] + s[1].rs)) >> 1; 130 | mid = (!s[0].rev && s[0].rs + off[s[0].tid] < s[1].rs + off[s[1].tid]) || (s[0].rev && s[0].rs + off[s[0].tid] > s[1].rs + off[s[1].tid])? 
mid : mid + off[h->n_targets]; 131 | if (s[0].rev != s[1].rev) mid = -mid; 132 | if (!print_bp) { 133 | break_t *p; 134 | uint64_t tmp; 135 | kv_pushp(break_t, a, &p); 136 | p->mid = mid; 137 | p->pos[0] = (uint64_t)s[0].tid<<32 | s[0].re; 138 | p->pos[1] = (uint64_t)s[1].tid<<32 | s[1].rs; 139 | p->qgap = s[1].qs - s[0].qe; 140 | p->dir = (!!s[0].rev)<<1 | (!!s[1].rev); 141 | p->rdrev = 0; 142 | if (p->pos[0] > p->pos[1]) { 143 | tmp = p->pos[0], p->pos[0] = p->pos[1], p->pos[1] = tmp; 144 | p->dir = ((p->dir&1)^1)<<1 | (p->dir>>1^1); 145 | p->rdrev = 1; 146 | } 147 | } else printf("%s\t%d\t%c\t%d\t%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%lld\n", h->target_name[s[0].tid], s[0].re, "+-"[s[0].rev], s[0].mapq, s[0].qe - s[0].qs, s[0].nm, 148 | h->target_name[s[1].tid], s[1].rs, "+-"[s[1].rev], s[1].mapq, s[1].qe - s[1].qs, s[1].nm, s[1].qs - s[0].qe, (long long)mid); 149 | } 150 | bam_destroy1(b); 151 | 152 | if (!print_bp) { 153 | int start; 154 | ks_introsort(sv1, a.n, a.a); 155 | for (start = 0, i = 1; i <= a.n; ++i) { 156 | if (i == a.n || a.a[i].mid - a.a[i-1].mid > max_gap) { 157 | if (i - start >= min_cnt) { 158 | int j, subst; 159 | ks_introsort(sv2, i - start, &a.a[start]); 160 | for (subst = start, j = start + 1; j <= i; ++j) { 161 | if (j == i || a.a[j].pos[0]>>32 != a.a[j-1].pos[0]>>32 || a.a[j].pos[1]>>32 != a.a[j-1].pos[1]>>32 || a.a[j].pos - a.a[j-1].pos > max_gap) { 162 | int k, type; 163 | int64_t pos[2], qgap = 0; 164 | break_t *p = &a.a[subst]; 165 | for (k = subst, pos[0] = pos[1] = 0; k < j; ++k) { 166 | pos[0] += (uint32_t)a.a[k].pos[0]; 167 | pos[1] += (uint32_t)a.a[k].pos[1]; 168 | qgap += a.a[k].qgap; 169 | } 170 | pos[0] = (int)((double)pos[0] / (j - subst) + .499); 171 | pos[1] = (int)((double)pos[1] / (j - subst) + .499); 172 | qgap = (int)((double)qgap / (j - subst) + .499); 173 | type = p->mid >= 0 && p->mid < off[h->n_targets] && p->pos[0]>>32 == p->pos[1]>>32? 'G' : p->pos[0]>>32 == p->pos[1]>>32? 
'S' : 'T'; 174 | printf("SV\t%s\t%d\t%c\t%s\t%d\t%c\t%c", h->target_name[p->pos[0]>>32], (uint32_t)pos[0], "+-"[p->dir>>1&1], 175 | h->target_name[p->pos[1]>>32], (uint32_t)pos[1], "+-"[p->dir&1], type); 176 | if (type == 'G') printf("\t%d", (int)(qgap - (pos[1] - pos[0]))); 177 | else printf("\t."); 178 | printf("\t%d\n", j - subst); 179 | for (k = subst; k < j; ++k) { 180 | break_t *q = &a.a[k]; 181 | printf("RD\t%s\t%d\t%c\t%s\t%d\t%c\t%d\t%c\n", h->target_name[q->pos[0]>>32], (uint32_t)q->pos[0], "+-"[q->dir>>1&1], 182 | h->target_name[q->pos[1]>>32], (uint32_t)q->pos[1], "+-"[q->dir&1], q->qgap, "+-"[q->rdrev]); 183 | } 184 | printf("//\n"); 185 | subst = j; 186 | } 187 | } 188 | } 189 | start = i; 190 | } 191 | } 192 | free(a.a); 193 | } 194 | 195 | free(off); 196 | bam_hdr_destroy(h); 197 | bgzf_close(fp); 198 | return 0; 199 | } 200 | -------------------------------------------------------------------------------- /group.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "sam.h" 6 | #include "kdq.h" 7 | #include "kvec.h" 8 | #include "ksort.h" 9 | KSORT_INIT_GENERIC(int) 10 | 11 | typedef struct { 12 | int l_ovlp; 13 | int max_seg; 14 | int min_frag; 15 | int fuzz_merge, fuzz_st, fuzz_ovlp; 16 | int no_merge; 17 | } lt_opt_t; 18 | 19 | typedef struct { 20 | int tid, st, en, far_st; 21 | int n_frag, n_seg; 22 | uint32_t is_rev:1, l_open:1, r_open:1; 23 | uint64_t sum_mq2; 24 | 25 | int n, m; 26 | int *a; 27 | } lt_group_t; 28 | 29 | #define group_lt(a, b) ((a).st < (b).st || ((a).st == (b).st && (a).is_rev < (b).is_rev)) 30 | KSORT_INIT(grp, lt_group_t, group_lt) 31 | 32 | KDQ_INIT(lt_group_t) 33 | 34 | typedef struct { 35 | kdq_t(lt_group_t) *q; 36 | int r_tid, r_max_en; 37 | 38 | int n, m; 39 | lt_group_t *a; 40 | } lt_groups_t; 41 | 42 | static void lt_opt_init(lt_opt_t *opt) 43 | { 44 | memset(opt, 0, sizeof(lt_opt_t)); 45 | opt->l_ovlp = 9; 46 | 
opt->max_seg = 10000; 47 | opt->fuzz_merge = 10; 48 | opt->fuzz_st = 2; 49 | opt->fuzz_ovlp = 2; 50 | opt->min_frag = 3; 51 | } 52 | 53 | lt_groups_t *lt_grp_init(void) 54 | { 55 | lt_groups_t *g; 56 | g = (lt_groups_t*)calloc(1, sizeof(lt_groups_t)); 57 | g->q = kdq_init(lt_group_t); 58 | g->r_tid = -1; 59 | return g; 60 | } 61 | 62 | void lt_grp_destroy(lt_groups_t *g) 63 | { 64 | kdq_destroy(lt_group_t, g->q); 65 | free(g->a); 66 | free(g); 67 | } 68 | 69 | void lt_grp_push_region(const lt_opt_t *opt, const bam_hdr_t *h, lt_groups_t *g, const lt_group_t *r) 70 | { 71 | if (g->n && (r == 0 || r->tid != g->r_tid || r->far_st >= g->r_max_en)) { 72 | int i, j; 73 | ks_introsort(grp, g->n, g->a); 74 | /* pass 1: fuzzy start 75 | |------------> or <-----------------| 76 | |--------------> <--------------| */ 77 | for (i = 0; i < g->n; ++i) { 78 | lt_group_t *q = &g->a[i]; 79 | if (q->n_frag == 0 || (!q->l_open && !q->r_open)) continue; 80 | for (j = i + 1; j < g->n && g->a[j].st < q->en; ++j) { 81 | lt_group_t *p = &g->a[j]; 82 | if (p->n_frag == 0 || q->is_rev != p->is_rev) continue; 83 | if ((!q->is_rev && p->st - q->st <= opt->fuzz_st) || (q->is_rev && p->en - q->en <= opt->fuzz_st && q->en - p->en <= opt->fuzz_st)) { 84 | q->n_frag += p->n_frag; 85 | q->en = q->en > p->en? q->en : p->en; 86 | q->sum_mq2 += p->sum_mq2; 87 | p->n_frag = 0; 88 | } 89 | } 90 | } 91 | /* pass 2: forward-reverse merge 92 | |--------------> 93 | <------------------| */ 94 | for (i = 0; i < g->n; ++i) { 95 | lt_group_t *q = &g->a[i]; 96 | if (q->n_frag == 0 || (!q->l_open && !q->r_open)) continue; 97 | for (j = i + 1; j < g->n && g->a[j].st < q->en; ++j) { 98 | lt_group_t *p = &g->a[j]; 99 | if (p->n_frag == 0) continue; 100 | if (q->r_open && p->l_open && q->en <= p->en && p->st - q->st <= opt->fuzz_merge) { // merge 101 | q->n_frag += p->n_frag; 102 | q->r_open = 0; 103 | q->en = q->en > p->en? 
q->en : p->en; 104 | q->sum_mq2 += p->sum_mq2; 105 | p->n_frag = 0; 106 | } 107 | } 108 | } 109 | if (opt->no_merge) goto print_reg; 110 | // pass n: merge 9bp overlaps 111 | for (i = 0; i < g->n; ++i) { 112 | lt_group_t *q = &g->a[i]; 113 | if (q->n_frag == 0) continue; 114 | for (j = i + 1; j < g->n; ++j) { 115 | lt_group_t *p = &g->a[j]; 116 | if (q->en <= p->st) break; 117 | if (p->n_frag == 0) continue; 118 | if (q->en - p->st >= opt->l_ovlp - opt->fuzz_ovlp && q->en - p->st <= opt->l_ovlp + opt->fuzz_ovlp) { // TODO: better strategy: first count possible merges; don't merge if multiple 119 | q->n_frag += p->n_frag; 120 | q->en = p->en; 121 | q->r_open = p->r_open; 122 | q->sum_mq2 += p->sum_mq2; 123 | ++q->n_seg; 124 | p->n_frag = 0; 125 | } 126 | } 127 | } 128 | print_reg: 129 | // print out 130 | for (i = 0; i < g->n; ++i) { 131 | lt_group_t *p = &g->a[i]; 132 | if (p->n_frag) 133 | printf("%s\t%d\t%d\t%d:%d:%c%c\t%c\t%d\t%d\n", h->target_name[p->tid], p->st, p->en, p->n_seg, p->n_frag, 134 | "|<"[p->l_open], "|>"[p->r_open], "+-"[p->is_rev], p->n_frag, (int)(sqrt((float)p->sum_mq2 / p->n_frag) + .499)); 135 | } 136 | g->n = 0, g->r_tid = -1, g->r_max_en = 0; 137 | } 138 | if (r) { 139 | lt_group_t *t; 140 | kv_pushp(lt_group_t, *g, &t); 141 | memcpy(t, r, sizeof(lt_group_t)); 142 | t->n_seg = 1; 143 | g->r_tid = t->tid; 144 | g->r_max_en = g->r_max_en > t->en? g->r_max_en : t->en; 145 | } 146 | } 147 | 148 | int lt_grp_segflt(int min_frag, lt_group_t *p) 149 | { 150 | int flt; 151 | assert(p->n == p->n_frag); 152 | p->far_st = p->st; 153 | if (p->n >= min_frag) { 154 | int i, l1, l2 = 0, n2 = 0, s2; 155 | ks_introsort(int, p->n, p->a); 156 | l1 = p->a[p->n - min_frag]; 157 | for (i = 1, s2 = 0, l2 = p->a[0]; i <= p->n; ++i) { 158 | if (i == p->n || p->a[i] != p->a[i-1]) { 159 | if (i - s2 > n2) 160 | n2 = i - s2, l2 = p->a[i-1]; 161 | s2 = i; 162 | } 163 | } 164 | l1 = l1 > l2? 
l1 : l2; 165 | if (!p->is_rev) p->en = p->st + l1; 166 | else p->st = p->en - l1; 167 | flt = 0; 168 | } else flt = 1; 169 | free(p->a); 170 | p->a = 0, p->m = p->n = 0; 171 | return flt; 172 | } 173 | 174 | void lt_grp_push_read(const lt_opt_t *opt, lt_groups_t *g, const bam_hdr_t *h, const bam1_t *b) 175 | { 176 | int i, st, en, is_rev; 177 | const bam1_core_t *c = &b->core; 178 | 179 | if (b == 0) { 180 | while (kdq_size(g->q)) { 181 | lt_group_t *p = &kdq_first(g->q); 182 | if (!lt_grp_segflt(opt->min_frag, p)) 183 | lt_grp_push_region(opt, h, g, p); 184 | kdq_shift(lt_group_t, g->q); 185 | } 186 | lt_grp_push_region(opt, h, g, 0); 187 | return; 188 | } 189 | // compute st, en and rev 190 | if (c->flag & (BAM_FUNMAP|BAM_FDUP|BAM_FSUPP)) return; 191 | if (bam_aux_get(b, "SA")) return; // TODO: we can relax this, in future 192 | if (bam_aux_get(b, "BC") == 0) return; // skip sonicated reads 193 | if (c->flag & 2) { 194 | if (c->tid != c->mtid || c->isize > opt->max_seg || c->isize < 0) 195 | return; 196 | st = c->pos; 197 | en = st + c->isize; 198 | is_rev = (c->flag & BAM_FREAD1)? 0 : 1; 199 | } else { 200 | st = c->pos; 201 | en = st + bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)); 202 | is_rev = (c->flag & 16)? 1 : 0; 203 | } 204 | 205 | while (kdq_size(g->q)) { 206 | lt_group_t *p = &kdq_first(g->q); 207 | if (p->tid != c->tid || p->en <= st) { 208 | if (!lt_grp_segflt(opt->min_frag, p)) 209 | lt_grp_push_region(opt, h, g, p); 210 | kdq_shift(lt_group_t, g->q); 211 | } else break; 212 | } 213 | for (i = 0; i < kdq_size(g->q); ++i) { 214 | lt_group_t *p = &kdq_at(g->q, i); 215 | int added = 0; 216 | if (p->is_rev != is_rev) continue; 217 | if (!p->is_rev) { 218 | if (st == p->st) 219 | added = 1, ++p->n_frag, p->en = en > p->en? en : p->en; 220 | } else { 221 | if (en == p->en) 222 | added = 1, ++p->n_frag; 223 | } 224 | if (added) { 225 | int q = c->qual < 60? 
c->qual : 60; 226 | kv_push(int, *p, en - st); 227 | p->sum_mq2 += q * q; 228 | break; 229 | } 230 | } 231 | if (i == kdq_size(g->q)) { 232 | int q = c->qual < 60? c->qual : 60; 233 | lt_group_t *p; 234 | p = kdq_pushp(lt_group_t, g->q); 235 | p->tid = c->tid, p->st = st, p->en = en, p->is_rev = is_rev, p->n_frag = 1; 236 | p->l_open = is_rev? 1 : 0; 237 | p->r_open = is_rev? 0 : 1; 238 | p->sum_mq2 = q * q; 239 | p->n = 0, p->m = 4; 240 | p->a = (int*)malloc(p->m * sizeof(int)); 241 | p->a[p->n++] = en - st; 242 | } 243 | } 244 | 245 | #include 246 | 247 | int main_group(int argc, char *argv[]) 248 | { 249 | int c; 250 | lt_opt_t opt; 251 | BGZF *fp; 252 | bam_hdr_t *h; 253 | bam1_t *b; 254 | lt_groups_t *g; 255 | 256 | lt_opt_init(&opt); 257 | while ((c = getopt(argc, argv, "l:n:Ms:m:o:")) >= 0) { 258 | if (c == 'l') opt.l_ovlp = atoi(optarg); 259 | else if (c == 'n') opt.min_frag = atoi(optarg); 260 | else if (c == 'M') opt.no_merge = 1; 261 | else if (c == 's') opt.fuzz_st = atoi(optarg); 262 | else if (c == 'o') opt.fuzz_ovlp = atoi(optarg); 263 | else if (c == 'm') opt.fuzz_merge = atoi(optarg); 264 | } 265 | if (optind == argc) { 266 | fprintf(stderr, "Usage: lianti group [options] \n"); 267 | fprintf(stderr, "Options:\n"); 268 | fprintf(stderr, " -l INT expected overlap length between two adjacent alleles [%d]\n", opt.l_ovlp); 269 | fprintf(stderr, " -n INT skip alleles consisting of = 0) 284 | lt_grp_push_read(&opt, g, h, b); 285 | lt_grp_push_read(&opt, g, h, 0); 286 | bam_destroy1(b); 287 | lt_grp_destroy(g); 288 | 289 | bam_hdr_destroy(h); 290 | bgzf_close(fp); 291 | return 0; 292 | } 293 | -------------------------------------------------------------------------------- /kseq.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2009, 2011 Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and 
associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Last Modified: 05MAR2012 */ 27 | 28 | #ifndef AC_KSEQ_H 29 | #define AC_KSEQ_H 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r 36 | #define KS_SEP_TAB 1 // isspace() && !' 
/*
 * __KS_INLINED(__read): defines the two small stream helpers.
 *
 * ks_getc() returns the next byte of the stream as a non-negative int, or -1
 * once no more data is available. A negative return from __read (read error)
 * is treated like end of stream; previously the negative count was stored in
 * ks->end, the `== 0` test missed it, and a stale byte from buf[0] was
 * returned to the caller.
 *
 * ks_getuntil() is ks_getuntil2() with append disabled.
 */
#define __KS_INLINED(__read) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->is_eof && ks->begin >= ks->end) return -1; \
		if (ks->begin >= ks->end) { \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, ks->bufsize); \
			if (ks->end < ks->bufsize) ks->is_eof = 1; \
			if (ks->end <= 0) { /* nothing read: EOF (0) or read error (<0) */ \
				ks->end = 0; /* keep begin/end consistent for ks_eof() */ \
				return -1; \
			} \
		} \
		return (int)ks->buf[ks->begin++]; \
	} \
	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
str->l : 0; \ 98 | if (ks->begin >= ks->end && ks->is_eof) return -1; \ 99 | for (;;) { \ 100 | int i; \ 101 | if (ks->begin >= ks->end) { \ 102 | if (!ks->is_eof) { \ 103 | ks->begin = 0; \ 104 | ks->end = __read(ks->f, ks->buf, ks->bufsize); \ 105 | if (ks->end < ks->bufsize) ks->is_eof = 1; \ 106 | if (ks->end == 0) break; \ 107 | } else break; \ 108 | } \ 109 | if (delimiter == KS_SEP_LINE) { \ 110 | for (i = ks->begin; i < ks->end; ++i) \ 111 | if (ks->buf[i] == '\n') break; \ 112 | } else if (delimiter > KS_SEP_MAX) { \ 113 | for (i = ks->begin; i < ks->end; ++i) \ 114 | if (ks->buf[i] == delimiter) break; \ 115 | } else if (delimiter == KS_SEP_SPACE) { \ 116 | for (i = ks->begin; i < ks->end; ++i) \ 117 | if (isspace(ks->buf[i])) break; \ 118 | } else if (delimiter == KS_SEP_TAB) { \ 119 | for (i = ks->begin; i < ks->end; ++i) \ 120 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ 121 | } else i = 0; /* never come to here! */ \ 122 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ 123 | str->m = str->l + (i - ks->begin) + 1; \ 124 | kroundup32(str->m); \ 125 | str->s = (char*)realloc(str->s, str->m); \ 126 | } \ 127 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ 128 | str->l = str->l + (i - ks->begin); \ 129 | ks->begin = i + 1; \ 130 | if (i < ks->end) { \ 131 | if (dret) *dret = ks->buf[i]; \ 132 | break; \ 133 | } \ 134 | } \ 135 | if (str->s == 0) { \ 136 | str->m = 1; \ 137 | str->s = (char*)calloc(1, 1); \ 138 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ 139 | str->s[str->l] = '\0'; \ 140 | return str->l; \ 141 | } 142 | 143 | #define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ 144 | __KS_TYPE(type_t) \ 145 | __KS_BASIC(SCOPE, type_t, __bufsize) \ 146 | __KS_GETUNTIL(SCOPE, __read) \ 147 | __KS_INLINED(__read) 148 | 149 | #define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) 150 | 151 | #define KSTREAM_DECLARE(type_t, 
__read) \ 152 | __KS_TYPE(type_t) \ 153 | extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ 154 | extern kstream_t *ks_init(type_t f); \ 155 | extern void ks_destroy(kstream_t *ks); \ 156 | __KS_INLINED(__read) 157 | 158 | /****************** 159 | * FASTA/Q parser * 160 | ******************/ 161 | 162 | #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) 163 | 164 | #define __KSEQ_BASIC(SCOPE, type_t) \ 165 | SCOPE kseq_t *kseq_init(type_t fd) \ 166 | { \ 167 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ 168 | s->f = ks_init(fd); \ 169 | return s; \ 170 | } \ 171 | SCOPE void kseq_destroy(kseq_t *ks) \ 172 | { \ 173 | if (!ks) return; \ 174 | free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ 175 | ks_destroy(ks->f); \ 176 | free(ks); \ 177 | } 178 | 179 | /* Return value: 180 | >=0 length of the sequence (normal) 181 | -1 end-of-file 182 | -2 truncated quality string 183 | */ 184 | #define __KSEQ_READ(SCOPE) \ 185 | SCOPE int kseq_read(kseq_t *seq) \ 186 | { \ 187 | int c; \ 188 | kstream_t *ks = seq->f; \ 189 | if (seq->last_char == 0) { /* then jump to the next header line */ \ 190 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ 191 | if (c == -1) return -1; /* end of file */ \ 192 | seq->last_char = c; \ 193 | } /* else: the first header char has been read in the previous call */ \ 194 | seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ 195 | if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ 196 | if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ 197 | if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ 198 | seq->seq.m = 256; \ 199 | seq->seq.s = (char*)malloc(seq->seq.m); \ 200 | } \ 201 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ 202 | if (c == '\n') continue; /* skip 
empty lines */ \ 203 | seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ 204 | ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ 205 | } \ 206 | if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ 207 | if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ 208 | seq->seq.m = seq->seq.l + 2; \ 209 | kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ 210 | seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ 211 | } \ 212 | seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ 213 | if (c != '+') return seq->seq.l; /* FASTA */ \ 214 | if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ 215 | seq->qual.m = seq->seq.m; \ 216 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ 217 | } \ 218 | while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ 219 | if (c == -1) return -2; /* error: no quality string */ \ 220 | while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ 221 | seq->last_char = 0; /* we have not come to the next header line */ \ 222 | if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ 223 | return seq->seq.l; \ 224 | } 225 | 226 | #define __KSEQ_TYPE(type_t) \ 227 | typedef struct { \ 228 | kstring_t name, comment, seq, qual; \ 229 | int last_char; \ 230 | kstream_t *f; \ 231 | } kseq_t; 232 | 233 | #define KSEQ_INIT2(SCOPE, type_t, __read) \ 234 | KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \ 235 | __KSEQ_TYPE(type_t) \ 236 | __KSEQ_BASIC(SCOPE, type_t) \ 237 | __KSEQ_READ(SCOPE) 238 | 239 | #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) 240 | 241 | #define KSEQ_DECLARE(type_t) \ 242 | __KS_TYPE(type_t) \ 243 | __KSEQ_TYPE(type_t) \ 244 | extern kseq_t *kseq_init(type_t fd); \ 245 | void kseq_destroy(kseq_t *ks); \ 246 | 
/**
 * Minimal getopt(3)-style option scanner.
 *
 * Scanner state is kept on the function object: getopt.ind is the index of
 * the next argument to examine, getopt.arg the argument of the option just
 * returned (or null), and getopt.place the offset inside the current
 * argument (-1 when a new argument must be started).
 *
 * @param args  array of command-line arguments
 * @param ostr  option string; a letter followed by ':' takes an argument
 * @return the option letter; '?' for an unknown option or a missing argument
 *         (':' instead when ostr starts with ':'); null when option parsing
 *         is finished (first non-option argument, a lone "-", or "--")
 */
var getopt = function(args, ostr) {
	var oli; // option letter list index
	if (typeof(getopt.place) == 'undefined')
		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
	if (getopt.place == -1) { // update scanning pointer
		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
			getopt.place = -1;
			return null;
		}
		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
			++getopt.ind;
			getopt.place = -1;
			return null;
		}
	}
	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
		if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
		// Unknown option: once the current argument is used up, move on to the
		// next one. The old test (place < 0) could never be true here, so the
		// following call read past the end of the argument, matched the empty
		// string against ostr and swallowed the next argument.
		if (getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
		return '?';
	}
	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
		getopt.arg = null;
		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
	} else { // need an argument
		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
			getopt.arg = args[getopt.ind].substr(getopt.place); // "-n5" form
		else if (args.length <= ++getopt.ind) { // no arg
			getopt.place = -1;
			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
			return '?';
		} else getopt.arg = args[getopt.ind]; // white space
		getopt.place = -1;
		++getopt.ind;
	}
	return optopt;
}
// Main pass: read the joint VCF (first sample column = bulk, then one or two
// single cells), collect bulk-het statistics and call single-cell-specific
// variants, dropping calls that lie within min_snv_dist of each other.
var file = arguments[getopt.ind] == "-"? new File() : new File(arguments[getopt.ind]);
var buf = new Bytes();

var n_bulk_het = 0, n_ado_ref = 0, n_ado_alt = 0, n_ado_both = 0, n_het_fn = 0, n_het_fn2 = 0, n_snv = 0, n_snv_nonCT = 0, n_ins = 0, n_del = 0;
var last = []; // pending calls: [chr, pos, filtered?, output fields]

// Print one surviving call and count it by ITS OWN type (field 4 of the
// output record). Previously the in-loop flush used the type of the record
// currently being read, and the final flush counted every INS/DEL as an SNV.
function report_call(rec) {
	print('NV', rec.join("\t"));
	var rec_type = rec[4];
	if (rec_type == 'INS') ++n_ins;
	else if (rec_type == 'DEL') ++n_del;
	else {
		++n_snv;
		if (rec_type != "CT") ++n_snv_nonCT;
	}
}

while (file.readline(buf) >= 0) {
	var m, is_indel, t = buf.toString().split("\t");
	if (t[0].charAt(0) == '#') continue; // skip VCF header
	if (t.length < 11) {
		warn("WARNING: incomplete line: '" + buf.toString() + "'");
		continue;
	}
	is_indel = (t[3].length == 1 && t[4].length == 1)? false : true;
	if (!cnt_gap && is_indel) continue;
	if (force_sgl) t.length = 11;
	t[1] = parseInt(t[1]);
	t[3] = t[3].toUpperCase();
	// per-sample fields: index 0 is the genotype, 1..4 are the depths
	// (ref fwd, alt fwd, ref rev, alt rev as used below), 5 an optional conflict count
	var u = t[9].split(/[:,]/);
	var v = t[10].split(/[:,]/);
	var w = t.length >= 12? t[11].split(/[:,]/) : null;
	if (u.length < 5 || v.length < 5) continue; // something is wrong
	for (var i = 1; i < u.length; ++i) { // convert to integers
		u[i] = u[i] == '.'? 0 : parseInt(u[i]);
		v[i] = v[i] == '.'? 0 : parseInt(v[i]);
		if (w) w[i] = w[i] == '.'? 0 : parseInt(w[i]);
	}
	var ref = t[3], alt = t[4];
	if (t[3] > t[4] && !is_indel) { // determine mutation type
		if (t[3] == 'C') t[3] = 'G';
		else if (t[3] == 'G') t[3] = 'C';
		else if (t[3] == 'T') t[3] = 'A';
		if (t[4] == 'A') t[4] = 'T';
		else if (t[4] == 'C') t[4] = 'G';
		else if (t[4] == 'G') t[4] = 'C';
	}
	var bulk_dp = u[1] + u[2] + u[3] + u[4];
	if (bulk_dp < min_bulk_dp) continue; // bulk does not have enough coverage
	if ((m = /\bAMQ=([\.\d,]+)/.exec(t[7])) != null) { // actually for bialliac SNVs, the block can be simpler; but let's be more general
		var mq = 256, s = m[1].split(",");
		for (var i = 0; i < s.length; ++i) {
			if (s[i] == '.') continue;
			var x = parseInt(s[i]);
			mq = mq < x? mq : x;
		}
		if (mq < min_mapq) continue;
	}
	var is_snv_called;
	if (t.length <= 11) { // only two samples: bulk and a single cell
		if (!hap) is_snv_called = v[2] + v[4] >= min_snv_dp && (v[2] + v[4]) / (v[1] + v[2] + v[3] + v[4]) >= min_snv_ab? true : false;
		else is_snv_called = v[2] + v[4] >= min_snv_dp && v[1] + v[3] == 0? true : false;
		if (v.length > 5 && v[5] > min_snv_frag_conflict) is_snv_called = false;
		if (v[2] < min_snv_dp_ds || v[4] < min_snv_dp_ds) is_snv_called = false;
	} else { // three samples: bulk and a pair of single cells; FIXME: min_snv_dp_ds is not used in this block
		if (pair_mode && !hap) {
			is_snv_called = true;
			if (v[2] + v[4] + w[2] + w[4] < min_snv_dp) is_snv_called = false;
			else if ((v[2] + v[4] + w[2] + w[4]) / (v[1] + v[2] + v[3] + v[4] + w[1] + w[2] + w[3] + w[4]) < min_snv_ab) is_snv_called = false;
			else if (v[2] + v[4] < 3 || w[2] + w[4] < 3) is_snv_called = false;
		} else {
			var called1, called2;
			if (!hap) {
				called1 = v[2] + v[4] >= min_snv_dp && (v[2] + v[4]) / (v[1] + v[2] + v[3] + v[4]) >= min_snv_ab? true : false;
				called2 = w[2] + w[4] >= min_snv_dp && (w[2] + w[4]) / (w[1] + w[2] + w[3] + w[4]) >= min_snv_ab? true : false;
			} else {
				called1 = v[2] + v[4] >= min_snv_dp && v[1] + v[3] == 0? true : false;
				called2 = w[2] + w[4] >= min_snv_dp && w[1] + w[3] == 0? true : false;
			}
			is_snv_called = called1 && called2? true : false;
		}
		if (v.length > 5 && v[5] > 0) is_snv_called = false;
		if (w.length > 5 && w[5] > 0) is_snv_called = false;
	}
	if (!hap) {
		if (u[1] > 0 && u[2] > 0 && u[3] > 0 && u[4] > 0 && u[1] + u[3] >= min_bulk_var_dp && u[2] + u[4] >= min_bulk_var_dp
			&& (u[1] + u[3]) / bulk_dp >= min_bulk_het_ab && (u[2] + u[4]) / bulk_dp >= min_bulk_het_ab) // a bulk het
		{
			++n_bulk_het;
			// count ADO
			if (v[1] + v[2] + v[3] + v[4] == 0) ++n_ado_ref, ++n_ado_alt, ++n_ado_both;
			else if (v[1] + v[3] == 0) ++n_ado_ref;
			else if (v[2] + v[4] == 0) ++n_ado_alt;
			// count FN
			if (!is_snv_called) ++n_het_fn;
			if (is_snv_called && output_TP) print("TP", t[0], t[1], ref, alt, t[3]+t[4], u[1]+u[3], u[2]+u[4]);
		}
	} else {
		if (u[2] > 0 && u[4] > 0 && u[2] + u[4] >= min_bulk_var_dp && u[1] + u[3] <= max_hap_err) {
			++n_bulk_het; // ok, this is really a hom
			if (v[2] + v[4] == 0) ++n_ado_alt;
			if (!is_snv_called) ++n_het_fn;
		}
	}
	if (u[2] + u[4] <= max_bulk_cnt && is_snv_called) { // a potential SNV
		var s, type, flt_this = false;
		type = t[3].length > t[4].length? 'DEL' : t[3].length < t[4].length? 'INS' : t[3]+t[4];
		if (t.length <= 11) s = [t[0], t[1], ref, alt, type, v[1] + v[3], v[2] + v[4], v.length > 5? v[5] : 0, t[7]];
		else s = [t[0], t[1], ref, alt, type, v[1] + v[3] + w[1] + w[3], v[2] + v[4] + w[2] + w[4], v.length > 5? v[5] + w[5] : 0, t[7]];
		for (var i = 0; i < last.length; ++i) {
			if (last[i][0] != t[0] || t[1] - last[i][1] > min_snv_dist) { // out of range of the new call: finalize it
				if (!last[i][2]) report_call(last[i][3]);
				last.shift();
				--i;
			} else last[i][2] = flt_this = true; // filtered
		}
		last.push([t[0], t[1], flt_this, s]);
	}
}
// flush the calls still pending at end of input
for (var i = 0; i < last.length; ++i)
	if (!last[i][2]) report_call(last[i][3]);

print("NE", n_bulk_het);
print("VN", n_het_fn, (n_het_fn / n_bulk_het).toFixed(4));
if (!hap) print("RO", n_ado_ref, (n_ado_ref / n_bulk_het).toFixed(4));
print("AO", n_ado_alt, (n_ado_alt / n_bulk_het).toFixed(4));
if (!hap) print("BO", n_ado_both, (n_ado_both / n_bulk_het).toFixed(4));
print("NN", n_snv, n_snv_nonCT, n_del, n_ins);

buf.destroy();
file.close();
#define razf_open(fn, mode) fopen(fn, mode) 29 | #define razf_close(fp) fclose(fp) 30 | #define razf_seek(fp, offset, whence) fseeko(fp, offset, whence) 31 | #define razf_tell(fp) ftello(fp) 32 | #endif 33 | #ifdef _USE_KNETFILE 34 | #include "knetfile.h" 35 | #endif 36 | 37 | struct __faidx_t { 38 | RAZF *rz; 39 | int n, m; 40 | char **name; 41 | khash_t(s) *hash; 42 | }; 43 | 44 | #ifndef kroundup32 45 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 46 | #endif 47 | 48 | static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset) 49 | { 50 | khint_t k; 51 | int ret; 52 | faidx1_t t; 53 | if (idx->n == idx->m) { 54 | idx->m = idx->m? idx->m<<1 : 16; 55 | idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m); 56 | } 57 | idx->name[idx->n] = strdup(name); 58 | k = kh_put(s, idx->hash, idx->name[idx->n], &ret); 59 | t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset; 60 | kh_value(idx->hash, k) = t; 61 | ++idx->n; 62 | } 63 | 64 | faidx_t *fai_build_core(RAZF *rz) 65 | { 66 | char c, *name; 67 | int l_name, m_name, ret; 68 | int line_len, line_blen, state; 69 | int l1, l2; 70 | faidx_t *idx; 71 | uint64_t offset; 72 | int64_t len; 73 | 74 | idx = (faidx_t*)calloc(1, sizeof(faidx_t)); 75 | idx->hash = kh_init(s); 76 | name = 0; l_name = m_name = 0; 77 | len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0; 78 | while (razf_read(rz, &c, 1)) { 79 | if (c == '\n') { // an empty line 80 | if (state == 1) { 81 | offset = razf_tell(rz); 82 | continue; 83 | } else if ((state == 0 && len < 0) || state == 2) continue; 84 | } 85 | if (c == '>') { // fasta header 86 | if (len >= 0) 87 | fai_insert_index(idx, name, len, line_len, line_blen, offset); 88 | l_name = 0; 89 | while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) { 90 | if (m_name < l_name + 2) { 91 | m_name = l_name + 2; 92 | kroundup32(m_name); 
93 | name = (char*)realloc(name, m_name); 94 | } 95 | name[l_name++] = c; 96 | } 97 | name[l_name] = '\0'; 98 | if (ret == 0) { 99 | fprintf(stderr, "[fai_build_core] the last entry has no sequence\n"); 100 | free(name); fai_destroy(idx); 101 | return 0; 102 | } 103 | if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n'); 104 | state = 1; len = 0; 105 | offset = razf_tell(rz); 106 | } else { 107 | if (state == 3) { 108 | fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name); 109 | free(name); fai_destroy(idx); 110 | return 0; 111 | } 112 | if (state == 2) state = 3; 113 | l1 = l2 = 0; 114 | do { 115 | ++l1; 116 | if (isgraph(c)) ++l2; 117 | } while ((ret = razf_read(rz, &c, 1)) && c != '\n'); 118 | if (state == 3 && l2) { 119 | fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name); 120 | free(name); fai_destroy(idx); 121 | return 0; 122 | } 123 | ++l1; len += l2; 124 | if (state == 1) line_len = l1, line_blen = l2, state = 0; 125 | else if (state == 0) { 126 | if (l1 != line_len || l2 != line_blen) state = 2; 127 | } 128 | } 129 | } 130 | fai_insert_index(idx, name, len, line_len, line_blen, offset); 131 | free(name); 132 | return idx; 133 | } 134 | 135 | void fai_save(const faidx_t *fai, FILE *fp) 136 | { 137 | khint_t k; 138 | int i; 139 | for (i = 0; i < fai->n; ++i) { 140 | faidx1_t x; 141 | k = kh_get(s, fai->hash, fai->name[i]); 142 | x = kh_value(fai->hash, k); 143 | #ifdef _WIN32 144 | fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len); 145 | #else 146 | fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len); 147 | #endif 148 | } 149 | } 150 | 151 | faidx_t *fai_read(FILE *fp) 152 | { 153 | faidx_t *fai; 154 | char *buf, *p; 155 | int len, line_len, line_blen; 156 | #ifdef _WIN32 157 | long offset; 158 | #else 159 | long long offset; 160 | #endif 161 | fai = 
(faidx_t*)calloc(1, sizeof(faidx_t)); 162 | fai->hash = kh_init(s); 163 | buf = (char*)calloc(0x10000, 1); 164 | while (!feof(fp) && fgets(buf, 0x10000, fp)) { 165 | for (p = buf; *p && isgraph(*p); ++p); 166 | *p = 0; ++p; 167 | #ifdef _WIN32 168 | sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len); 169 | #else 170 | sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len); 171 | #endif 172 | fai_insert_index(fai, buf, len, line_len, line_blen, offset); 173 | } 174 | free(buf); 175 | return fai; 176 | } 177 | 178 | void fai_destroy(faidx_t *fai) 179 | { 180 | int i; 181 | for (i = 0; i < fai->n; ++i) free(fai->name[i]); 182 | free(fai->name); 183 | kh_destroy(s, fai->hash); 184 | if (fai->rz) razf_close(fai->rz); 185 | free(fai); 186 | } 187 | 188 | int fai_build(const char *fn) 189 | { 190 | char *str; 191 | RAZF *rz; 192 | FILE *fp; 193 | faidx_t *fai; 194 | str = (char*)calloc(strlen(fn) + 5, 1); 195 | sprintf(str, "%s.fai", fn); 196 | rz = razf_open(fn, "r"); 197 | if (rz == 0) { 198 | fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn); 199 | free(str); 200 | return -1; 201 | } 202 | fai = fai_build_core(rz); 203 | razf_close(rz); 204 | fp = fopen(str, "wb"); 205 | if (fp == 0) { 206 | fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str); 207 | fai_destroy(fai); free(str); 208 | return -1; 209 | } 210 | fai_save(fai, fp); 211 | fclose(fp); 212 | free(str); 213 | fai_destroy(fai); 214 | return 0; 215 | } 216 | 217 | #ifdef _USE_KNETFILE 218 | FILE *download_and_open(const char *fn) 219 | { 220 | const int buf_size = 1 * 1024 * 1024; 221 | uint8_t *buf; 222 | FILE *fp; 223 | knetFile *fp_remote; 224 | const char *url = fn; 225 | const char *p; 226 | int l = strlen(fn); 227 | for (p = fn + l - 1; p >= fn; --p) 228 | if (*p == '/') break; 229 | fn = p + 1; 230 | 231 | // First try to open a local copy 232 | fp = fopen(fn, "r"); 233 | if (fp) 234 | return fp; 235 | 236 | // If failed, download from remote and open 237 | 
fp_remote = knet_open(url, "rb"); 238 | if (fp_remote == 0) { 239 | fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url); 240 | return NULL; 241 | } 242 | if ((fp = fopen(fn, "wb")) == 0) { 243 | fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn); 244 | knet_close(fp_remote); 245 | return NULL; 246 | } 247 | buf = (uint8_t*)calloc(buf_size, 1); 248 | while ((l = knet_read(fp_remote, buf, buf_size)) != 0) 249 | fwrite(buf, 1, l, fp); 250 | free(buf); 251 | fclose(fp); 252 | knet_close(fp_remote); 253 | 254 | return fopen(fn, "r"); 255 | } 256 | #endif 257 | 258 | faidx_t *fai_load(const char *fn) 259 | { 260 | char *str; 261 | FILE *fp; 262 | faidx_t *fai; 263 | str = (char*)calloc(strlen(fn) + 5, 1); 264 | sprintf(str, "%s.fai", fn); 265 | 266 | #ifdef _USE_KNETFILE 267 | if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn) 268 | { 269 | fp = download_and_open(str); 270 | if ( !fp ) 271 | { 272 | fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str); 273 | free(str); 274 | return 0; 275 | } 276 | } 277 | else 278 | #endif 279 | fp = fopen(str, "rb"); 280 | if (fp == 0) { 281 | fprintf(stderr, "[fai_load] build FASTA index.\n"); 282 | fai_build(fn); 283 | fp = fopen(str, "rb"); 284 | if (fp == 0) { 285 | fprintf(stderr, "[fai_load] fail to open FASTA index.\n"); 286 | free(str); 287 | return 0; 288 | } 289 | } 290 | 291 | fai = fai_read(fp); 292 | fclose(fp); 293 | 294 | fai->rz = razf_open(fn, "rb"); 295 | free(str); 296 | if (fai->rz == 0) { 297 | fprintf(stderr, "[fai_load] fail to open FASTA file.\n"); 298 | return 0; 299 | } 300 | return fai; 301 | } 302 | 303 | char *fai_fetch(const faidx_t *fai, const char *str, int *len) 304 | { 305 | char *s, c; 306 | int i, l, k, name_end; 307 | khiter_t iter; 308 | faidx1_t val; 309 | khash_t(s) *h; 310 | int beg, end; 311 | 312 | beg = end = -1; 313 | h = fai->hash; 314 | name_end = l = strlen(str); 315 | s = 
(char*)malloc(l+1); 316 | // remove space 317 | for (i = k = 0; i < l; ++i) 318 | if (!isspace(str[i])) s[k++] = str[i]; 319 | s[k] = 0; l = k; 320 | // determine the sequence name 321 | for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end 322 | if (i >= 0) name_end = i; 323 | if (name_end < l) { // check if this is really the end 324 | int n_hyphen = 0; 325 | for (i = name_end + 1; i < l; ++i) { 326 | if (s[i] == '-') ++n_hyphen; 327 | else if (!isdigit(s[i]) && s[i] != ',') break; 328 | } 329 | if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name 330 | s[name_end] = 0; 331 | iter = kh_get(s, h, s); 332 | if (iter == kh_end(h)) { // cannot find the sequence name 333 | iter = kh_get(s, h, str); // try str as the name 334 | if (iter == kh_end(h)) { 335 | *len = 0; 336 | free(s); return 0; 337 | } else s[name_end] = ':', name_end = l; 338 | } 339 | } else iter = kh_get(s, h, str); 340 | if(iter == kh_end(h)) { 341 | fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str); 342 | free(s); 343 | return 0; 344 | }; 345 | val = kh_value(h, iter); 346 | // parse the interval 347 | if (name_end < l) { 348 | for (i = k = name_end + 1; i < l; ++i) 349 | if (s[i] != ',') s[k++] = s[i]; 350 | s[k] = 0; 351 | beg = atoi(s + name_end + 1); 352 | for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; 353 | end = i < k? 
atoi(s + i + 1) : val.len; 354 | if (beg > 0) --beg; 355 | } else beg = 0, end = val.len; 356 | if (beg >= val.len) beg = val.len; 357 | if (end >= val.len) end = val.len; 358 | if (beg > end) beg = end; 359 | free(s); 360 | 361 | // now retrieve the sequence 362 | l = 0; 363 | s = (char*)malloc(end - beg + 2); 364 | razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); 365 | while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err) 366 | if (isgraph(c)) s[l++] = c; 367 | s[l] = '\0'; 368 | *len = l; 369 | return s; 370 | } 371 | 372 | int main_faidx(int argc, char *argv[]) 373 | { 374 | if (argc == 1) { 375 | fprintf(stderr, "Usage: faidx [ [...]]\n"); 376 | return 1; 377 | } else { 378 | if (argc == 2) fai_build(argv[1]); 379 | else { 380 | int i, j, k, l; 381 | char *s; 382 | faidx_t *fai; 383 | fai = fai_load(argv[1]); 384 | if (fai == 0) return 1; 385 | for (i = 2; i != argc; ++i) { 386 | printf(">%s\n", argv[i]); 387 | s = fai_fetch(fai, argv[i], &l); 388 | for (j = 0; j < l; j += 60) { 389 | for (k = 0; k < 60 && k < l - j; ++k) 390 | putchar(s[j + k]); 391 | putchar('\n'); 392 | } 393 | free(s); 394 | } 395 | fai_destroy(fai); 396 | } 397 | } 398 | return 0; 399 | } 400 | 401 | int faidx_fetch_nseq(const faidx_t *fai) 402 | { 403 | return fai->n; 404 | } 405 | 406 | const char *faidx_iseq(const faidx_t *fai, int i) 407 | { 408 | return fai->name[i]; 409 | } 410 | 411 | int faidx_seq_len(const faidx_t *fai, const char *seq) 412 | { 413 | khint_t k = kh_get(s, fai->hash, seq); 414 | if ( k == kh_end(fai->hash) ) return -1; 415 | return kh_val(fai->hash, k).len; 416 | } 417 | 418 | char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len) 419 | { 420 | int l; 421 | char c; 422 | khiter_t iter; 423 | faidx1_t val; 424 | char *seq=NULL; 425 | 426 | // Adjust position 427 | iter = kh_get(s, fai->hash, c_name); 428 | if(iter == kh_end(fai->hash)) return 0; 429 | val = 
kh_value(fai->hash, iter); 430 | if(p_end_i < p_beg_i) p_beg_i = p_end_i; 431 | if(p_beg_i < 0) p_beg_i = 0; 432 | else if(val.len <= p_beg_i) p_beg_i = val.len - 1; 433 | if(p_end_i < 0) p_end_i = 0; 434 | else if(val.len <= p_end_i) p_end_i = val.len - 1; 435 | 436 | // Now retrieve the sequence 437 | l = 0; 438 | seq = (char*)malloc(p_end_i - p_beg_i + 2); 439 | razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET); 440 | while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1) 441 | if (isgraph(c)) seq[l++] = c; 442 | seq[l] = '\0'; 443 | *len = l; 444 | return seq; 445 | } 446 | -------------------------------------------------------------------------------- /cnv.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "kvec.h" 10 | #include "kseq.h" 11 | KSTREAM_INIT(gzFile, gzread, 0x10000) 12 | 13 | #include "ksort.h" 14 | KSORT_INIT_GENERIC(float) 15 | 16 | /************************************* 17 | * Find all maximal scoring segments * 18 | *************************************/ 19 | 20 | typedef struct { 21 | int st, en; 22 | float L, R; 23 | int pre; 24 | } msseg_aux_t; 25 | 26 | typedef struct { 27 | int st, en; 28 | float sc; 29 | } msseg_t; 30 | 31 | typedef kvec_t(msseg_t) msseg_v; 32 | typedef kvec_t(msseg_aux_t) msseg_aux_v; 33 | 34 | static void add_segs(msseg_v *ret, msseg_aux_v *seg, int min_sc) 35 | { 36 | int i; 37 | for (i = 0; i < seg->n; ++i) { 38 | msseg_aux_t *p = &seg->a[i]; 39 | if (p->R - p->L >= min_sc) { 40 | msseg_t *q; 41 | kv_pushp(msseg_t, *ret, &q); 42 | q->st = p->st, q->en = p->en, q->sc = p->R - p->L; 43 | } 44 | } 45 | seg->n = 0; 46 | } 47 | 48 | msseg_t *mss_find_all(int n, const float *S, float min_sc, int *n_seg) 49 | { 50 | int i, j; 51 | float L; 52 | msseg_v ret = {0,0,0}; 53 | msseg_aux_v seg = {0,0,0}; 54 | 
msseg_aux_t t; 55 | 56 | for (i = L = 0; i < n;) { 57 | if (S[i] > 0) { 58 | int k; 59 | float R = L + S[i]; 60 | for (k = i + 1; k < n && S[k] > 0.; ++k) 61 | R += S[k]; 62 | t.st = i, t.en = k, t.L = L, t.R = R; 63 | while (1) { 64 | msseg_aux_t *p; 65 | for (j = seg.n - 1; j >= 0;) { 66 | p = &seg.a[j]; 67 | if (p->L < t.L) break; 68 | j = p->pre >= 0? p->pre : j - 1; 69 | } 70 | if (j >= 0 && seg.a[j].R < t.R) { 71 | p = &seg.a[j]; 72 | t.st = p->st, t.L = p->L, t.pre = p->pre; 73 | seg.n = j; 74 | } else { 75 | if (j < 0) add_segs(&ret, &seg, min_sc); 76 | t.pre = j; 77 | kv_push(msseg_aux_t, seg, t); 78 | break; 79 | } 80 | } 81 | L = R, i = k; 82 | } else L += S[i++]; 83 | } 84 | add_segs(&ret, &seg, min_sc); 85 | free(seg.a); 86 | ret.a = (msseg_t*)realloc(ret.a, ret.n * sizeof(msseg_t)); 87 | *n_seg = ret.n; 88 | return ret.a; 89 | } 90 | 91 | float mss_find_one(int n, const float *S) 92 | { 93 | int i; 94 | float L, L_max; 95 | for (i = 0, L = L_max = 0.; i < n; ++i) { 96 | L += S[i]; 97 | if (L < 0.) L = 0.; 98 | else if (L > L_max) L_max = L; 99 | } 100 | return L_max; 101 | } 102 | 103 | void mss_shuffle(int n, float *a) 104 | { 105 | int i, j; 106 | for (i = n; i > 1; --i) { 107 | float tmp; 108 | j = (int)(drand48() * i); 109 | tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; 110 | } 111 | } 112 | 113 | /************************* 114 | * Brent 1D root finding * 115 | *************************/ 116 | 117 | #define BR_ITMAX 100 118 | #define BR_EPS 3.0e-8 119 | #define BR_SIGN(a,b) ((b) >= 0.0 ? 
fabs(a) : -fabs(a)) 120 | 121 | double brent_root(double (*func)(double, void*), double x1, double x2, double tol, int *err, void *data) 122 | { 123 | int iter; 124 | double a = x1, b = x2, c = x2, d, e, min1, min2; 125 | double fa = (*func)(a, data), fb = (*func)(b, data), fc, p, q, r, s, tol1, xm; 126 | 127 | *err = 0; 128 | if ((fa > 0.0 && fb > 0.0) || (fa < 0.0 && fb < 0.0)) { 129 | *err = 1; 130 | return 0.; 131 | } 132 | fc = fb, e = d = b - a; 133 | for (iter = 1; iter <= BR_ITMAX; ++iter) { 134 | if ((fb > 0.0 && fc > 0.0) || (fb < 0.0 && fc < 0.0)) 135 | c = a, fc = fa, e = d = b - a; 136 | if (fabs(fc) < fabs(fb)) { 137 | a = b, b = c, c = a; 138 | fa = fb, fb = fc, fc = fa; 139 | } 140 | tol1 = 2.0 * BR_EPS * fabs(b) + 0.5 * tol; 141 | xm = 0.5 * (c - b); 142 | if (fabs(xm) <= tol1 || fb == 0.0) return b; 143 | if (fabs(e) >= tol1 && fabs(fa) > fabs(fb)) { 144 | s = fb / fa; 145 | if (a == c) { 146 | p = 2.0 * xm * s; 147 | q = 1.0 - s; 148 | } else { 149 | q = fa / fc; 150 | r = fb / fc; 151 | p = s * (2.0 * xm * q * (q - r) - (b - a) * (r - 1.0)); 152 | q = (q - 1.0) * (r - 1.0) * (s - 1.0); 153 | } 154 | if (p > 0.0) q = -q; 155 | p = fabs(p); 156 | min1 = 3.0 * xm * q - fabs(tol1 * q); 157 | min2 = fabs(e * q); 158 | if (2.0 * p < (min1 < min2 ? 
/***********************
 * Gumbel distribution *
 ***********************/

/* Context for gumbel_beta(): the n sampled values in a[]; mu receives the
 * location value computed for the most recently evaluated x. */
typedef struct {
	int n;
	float *a;
	double mu;
} brent_aux_t;

/* Objective handed to brent_root(): returns beta(x) - x for the samples in
 * data and stores the matching location value in mu as a side effect. */
static double gumbel_beta(double x, void *data)
{
	brent_aux_t *d = (brent_aux_t*)data;
	int i;
	double s0, s1, s2, beta;
	for (i = 0, s0 = s1 = s2 = 0.; i < d->n; ++i) {
		double t = exp(-d->a[i] / x);
		s0 += d->a[i];
		s1 += t;
		s2 += d->a[i] * t;
	}
	s0 /= d->n;
	beta = s0 - s2 / s1;
	d->mu = -x * log(s1 / d->n);
	return beta - x;
}

/*
 * Estimate Gumbel parameters from permutations: S[0..n) is shuffled in place
 * n_perm times and the best segment score of each permutation is one sample.
 * On return x[0] is the location and x[1] the scale.
 */
void lt_gumbel_est(int n, float *S, int n_perm, float x[2])
{
	brent_aux_t aux;
	int k, err;
	float *ev, x0 = 1000., x1, x2;
	double t;

	// one sample per permutation: size the buffer by n_perm, not by n
	ev = (float*)malloc(n_perm * sizeof(float));
	for (k = 0; k < n_perm; ++k) {
		mss_shuffle(n, S);
		ev[k] = mss_find_one(n, S);
	}
	aux.n = n_perm, aux.a = ev;
	while (1) { // grow x0 until the objective can be evaluated
		t = gumbel_beta(x0, &aux);
		if (isnan(t)) x0 *= 2.;
		else break;
	}
	if (t > 0.) { // root is above x0: double the upper end until the sign flips
		x1 = x0, x2 = x0 * 2.;
		while (1) {
			t = gumbel_beta(x2, &aux);
			if (t > 0.) x2 *= 2.;
			else break;
		}
	} else { // root is below x0: shrink the lower end, backing off on NaN
		x2 = x0, x1 = x0 / 2.;
		while (1) {
			t = gumbel_beta(x1, &aux);
			if (isnan(t)) x1 *= 1.414;
			else if (t < 0.) x1 /= 2;
			else break;
		}
	}
	x[1] = brent_root(gumbel_beta, x1, x2, 1e-3, &err, &aux);
	assert(err == 0);
	x[0] = aux.mu; // location from the last evaluation made by brent_root()
	free(ev);
}
x1 /= 2; 229 | else break; 230 | } 231 | } 232 | x[1] = brent_root(gumbel_beta, x1, x2, 1e-3, &err, &aux); 233 | assert(err == 0); 234 | x[0] = aux.mu; 235 | free(ev); 236 | } 237 | 238 | double lt_gumbel_cdf(const float x[2], float z) 239 | { 240 | return exp(-exp(-(z-x[0])/x[1])); 241 | } 242 | 243 | double lt_gumbel_ccdf(const float x[2], float z) 244 | { 245 | double y; 246 | y = exp(-(z-x[0])/x[1]); 247 | return y > .001? 1. - lt_gumbel_cdf(x, z) : y * (1. - y * (.5 - y/6.)); 248 | } 249 | 250 | double lt_gumbel_quantile(const float x[2], float p) 251 | { 252 | return x[0] - x[1] * log(-log(p)); 253 | } 254 | 255 | /****************** 256 | * CNV parameters * 257 | ******************/ 258 | 259 | typedef struct { 260 | int ploidy, n_perm, show_evidence, split_len; 261 | float pen_miss, pen_coef; 262 | float rep_thres; 263 | } lt_cnvopt_t; 264 | 265 | void lt_cnvopt_init(lt_cnvopt_t *opt) 266 | { 267 | memset(opt, 0, sizeof(lt_cnvopt_t)); 268 | opt->ploidy = 2; 269 | opt->n_perm = 200; 270 | opt->pen_coef = 4.; 271 | opt->pen_miss = .1; 272 | opt->rep_thres = 1e-4; 273 | opt->split_len = 1000; 274 | } 275 | 276 | /**************** 277 | * Depth reader * 278 | ****************/ 279 | 280 | typedef struct { 281 | uint32_t e; 282 | uint8_t d[3], flt; 283 | } lt_dp1_t; 284 | 285 | typedef kvec_t(lt_dp1_t) lt_depth1_v; 286 | 287 | typedef struct { 288 | char *name; 289 | lt_depth1_v d; 290 | } lt_rawdp_t; 291 | 292 | void lt_dp_destroy(int n, lt_rawdp_t *d) 293 | { 294 | int i; 295 | for (i = 0; i < n; ++i) { 296 | free(d[i].name); 297 | free(d[i].d.a); 298 | } 299 | free(d); 300 | } 301 | 302 | void *bed_read(const char *fn); 303 | int bed_overlap(const void *_h, const char *chr, int beg, int end); 304 | void bed_destroy(void *_h); 305 | 306 | lt_rawdp_t *lt_dp_read(const char *fn, int *n, const char *fn_gap, int split_len) 307 | { 308 | int dret; 309 | gzFile fp; 310 | kstream_t *ks; 311 | kstring_t str = {0,0,0}; 312 | kvec_t(lt_rawdp_t) a = {0,0,0}; 313 | 
lt_rawdp_t *r; 314 | void *gap = 0; 315 | 316 | fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); 317 | ks = ks_init(fp); 318 | if (fn_gap) gap = bed_read(fn_gap); 319 | 320 | while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { 321 | int i, c, st = -1; 322 | char *q, *p; 323 | lt_dp1_t t; 324 | t.e = -1; 325 | for (i = 0, q = p = str.s;; ++p) { 326 | if (*p == 0 || *p == '\t') { 327 | c = *p, *p = 0; 328 | if (i == 0) { 329 | if (a.n == 0 || strcmp(q, a.a[a.n-1].name) != 0) { 330 | kv_pushp(lt_rawdp_t, a, &r); 331 | r->name = strdup(q); 332 | kv_init(r->d); 333 | } 334 | } else if (i == 1) { 335 | st = atoi(q); 336 | } else if (i == 2) { 337 | t.e = atoi(q); 338 | } else if (i >= 3 && i <= 5) { 339 | int x; 340 | x = atoi(q); 341 | t.d[i-3] = x < 255? x : 255; 342 | } 343 | if (c == 0) break; 344 | q = p + 1, ++i; 345 | } 346 | } 347 | if (i >= 5) { 348 | t.flt = gap && bed_overlap(gap, a.a[a.n-1].name, st, t.e)? 1 : 0; 349 | if (t.flt == 0) { 350 | int len = t.e - st; 351 | lt_dp1_t t2 = t; 352 | while (len > split_len + (split_len>>1)) { 353 | t2.e = st + split_len; 354 | kv_push(lt_dp1_t, a.a[a.n-1].d, t2); 355 | len -= split_len; 356 | st += split_len; 357 | } 358 | t2.e = st + len; 359 | kv_push(lt_dp1_t, a.a[a.n-1].d, t2); 360 | } else kv_push(lt_dp1_t, a.a[a.n-1].d, t); 361 | } 362 | } 363 | 364 | free(str.s); 365 | if (gap) bed_destroy(gap); 366 | ks_destroy(ks); 367 | gzclose(fp); 368 | *n = a.n; 369 | return a.a; 370 | } 371 | 372 | /***************************** 373 | * Estimate super-parameters * 374 | *****************************/ 375 | 376 | #define LT_GAIN 0 377 | #define LT_LOSS1 1 378 | #define LT_LOSS2 2 379 | 380 | typedef struct { 381 | float penalty[3], gumbel[3][2]; 382 | } lt_cnvpar_t; 383 | 384 | static inline int classify_signal(int type, const lt_dp1_t *p, int ploidy) 385 | { 386 | int s = 0; 387 | if (p->flt) return 0; 388 | if (type == LT_GAIN) { 389 | if (p->d[1] > ploidy) s = 1; 390 | else if (p->d[0] <= 
ploidy) s = -1; 391 | } else if (type == LT_LOSS1) { 392 | if (p->d[2] < ploidy) s = 1; 393 | else if (p->d[2] >= ploidy) s = -1; 394 | else if (p->d[0] >= ploidy) s = -2; 395 | } else if (type == LT_LOSS2) { 396 | if (p->d[2] == 0) s = 1; 397 | else if (p->d[0] > 0) s = -4; 398 | else if (p->d[2] > 0) s = -8; 399 | } 400 | return s; 401 | } 402 | 403 | static void gen_S(const lt_cnvopt_t *opt, const lt_cnvpar_t *par, int type, int n, const lt_dp1_t *d, float *S) 404 | { 405 | int i, l; 406 | float pen_nosig = par->penalty[type]; 407 | float pen_miss = opt->pen_miss * par->penalty[type]; 408 | for (i = l = 0; i < n; ++i) { 409 | const lt_dp1_t *p = &d[i]; 410 | int len = p->e - (i? (p-1)->e : 0); 411 | int s = classify_signal(type, p, opt->ploidy); 412 | if (s == 1) S[l++] = len; 413 | else if (s < 0) S[l++] = s * pen_nosig * len; 414 | else S[l++] = -pen_miss * len; 415 | } 416 | } 417 | 418 | void lt_cnv_par(const lt_cnvopt_t *opt, int n_chr, const lt_rawdp_t *d, lt_cnvpar_t *par) 419 | { 420 | int i, k, l, tot, type; 421 | float *S; 422 | 423 | for (k = tot = 0; k < n_chr; ++k) tot += d[k].d.n; 424 | S = (float*)malloc(tot * sizeof(float)); 425 | 426 | for (type = 0; type < 3; ++type) { 427 | int64_t l_sig, l_nosig; 428 | l_sig = l_nosig = 0; 429 | for (k = 0; k < n_chr; ++k) { 430 | const lt_rawdp_t *dk = &d[k]; 431 | for (i = 0; i < dk->d.n; ++i) { 432 | lt_dp1_t *p = &dk->d.a[i]; 433 | int len = p->e - (i? 
(p-1)->e : 0); 434 | int s = classify_signal(type, p, opt->ploidy); 435 | if (s == 1) l_sig += len; 436 | else if (s < 0) l_nosig += len; 437 | } 438 | } 439 | par->penalty[type] = opt->pen_coef * l_sig / l_nosig; 440 | printf("%cS\t%ld\t%ld\t%.3f\n", "GLA"[type], (long)l_sig, (long)l_nosig, par->penalty[type]); 441 | for (k = l = 0; k < n_chr; ++k) { 442 | gen_S(opt, par, type, d[k].d.n, d[k].d.a, &S[l]); 443 | l += d[k].d.n; 444 | } 445 | lt_gumbel_est(tot, S, opt->n_perm, par->gumbel[type]); 446 | printf("%cP\t%.3f\t%.3f\t%.3f\n", "GLA"[type], par->gumbel[type][0], par->gumbel[type][1], lt_gumbel_quantile(par->gumbel[type], 1. - opt->rep_thres)); 447 | } 448 | free(S); 449 | } 450 | 451 | void lt_cnv_call(const lt_cnvopt_t *opt, const lt_cnvpar_t *par, int n_chr, const lt_rawdp_t *dp) 452 | { 453 | int max_len, k, type; 454 | float *S; 455 | for (k = max_len = 0; k < n_chr; ++k) 456 | max_len = max_len > dp[k].d.n? max_len : dp[k].d.n; 457 | S = (float*)malloc(max_len * sizeof(float)); 458 | for (type = 0; type < 3; ++type) { 459 | for (k = 0; k < n_chr; ++k) { 460 | msseg_t *seg; 461 | int i, n_seg, n = dp[k].d.n; 462 | lt_dp1_t *d = dp[k].d.a; 463 | gen_S(opt, par, type, n, d, S); 464 | seg = mss_find_all(n, S, lt_gumbel_quantile(par->gumbel[type], 1. - opt->rep_thres), &n_seg); 465 | for (i = 0; i < n_seg; ++i) { 466 | msseg_t *si = &seg[i]; 467 | int j, en = d[si->en-1].e, st = si->st? 
d[si->st-1].e : 0; 468 | printf("%c%c\t%s\t%d\t%d\t%.2f\t%.3g\n", "GLA"[type], "GLA"[type], dp[k].name, st, en, si->sc, lt_gumbel_ccdf(par->gumbel[type], si->sc)); 469 | if (opt->show_evidence) 470 | for (j = si->st; j < si->en; ++j) 471 | printf("%cE\t%d\t%d\t%d\t%d\n", "GLA"[type], d[j].e, d[j].d[0], d[j].d[1], d[j].d[2]); 472 | } 473 | free(seg); 474 | } 475 | } 476 | free(S); 477 | } 478 | 479 | int main_cnv(int argc, char *argv[]) 480 | { 481 | int c, n_chr; 482 | lt_cnvopt_t opt; 483 | lt_cnvpar_t par; 484 | lt_rawdp_t *dp; 485 | 486 | lt_cnvopt_init(&opt); 487 | while ((c = getopt(argc, argv, "c:p:P:s:e")) >= 0) { 488 | if (c == 'P') opt.rep_thres = atof(optarg); 489 | else if (c == 'p') opt.ploidy = atoi(optarg); 490 | else if (c == 'e') opt.show_evidence = 1; 491 | else if (c == 'c') opt.pen_coef = atof(optarg); 492 | else if (c == 's') opt.split_len = atoi(optarg); 493 | } 494 | if (argc - optind < 2) { 495 | fprintf(stderr, "Usage: lianti cnv [options] \n"); 496 | fprintf(stderr, "Options:\n"); 497 | fprintf(stderr, " -p INT expected ploidy [%d]\n", opt.ploidy); 498 | fprintf(stderr, " -P FLOAT P-value threshold [%g]\n", opt.rep_thres); 499 | fprintf(stderr, " -c FLOAT penalty coefficient [%g]\n", opt.pen_coef); 500 | fprintf(stderr, " -s INT split an interval if longer than INT [%d]\n", opt.split_len); 501 | return 1; 502 | } 503 | 504 | dp = lt_dp_read(argv[optind], &n_chr, argv[optind+1], opt.split_len); 505 | lt_cnv_par(&opt, n_chr, dp, &par); 506 | lt_cnv_call(&opt, &par, n_chr, dp); 507 | lt_dp_destroy(n_chr, dp); 508 | 509 | return 0; 510 | } 511 | -------------------------------------------------------------------------------- /khash.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2009, 2011 by Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | 
"Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* 27 | An example: 28 | 29 | #include "khash.h" 30 | KHASH_MAP_INIT_INT(32, char) 31 | int main() { 32 | int ret, is_missing; 33 | khiter_t k; 34 | khash_t(32) *h = kh_init(32); 35 | k = kh_put(32, h, 5, &ret); 36 | kh_value(h, k) = 10; 37 | k = kh_get(32, h, 10); 38 | is_missing = (k == kh_end(h)); 39 | k = kh_get(32, h, 5); 40 | kh_del(32, h, k); 41 | for (k = kh_begin(h); k != kh_end(h); ++k) 42 | if (kh_exist(h, k)) kh_value(h, k) = 1; 43 | kh_destroy(32, h); 44 | return 0; 45 | } 46 | */ 47 | 48 | /* 49 | 2011-12-29 (0.2.7): 50 | 51 | * Minor code clean up; no actual effect. 52 | 53 | 2011-09-16 (0.2.6): 54 | 55 | * The capacity is a power of 2. This seems to dramatically improve the 56 | speed for simple keys. Thank Zilong Tan for the suggestion. Reference: 57 | 58 | - http://code.google.com/p/ulib/ 59 | - http://nothings.org/computer/judy/ 60 | 61 | * Allow to optionally use linear probing which usually has better 62 | performance for random input. 
Double hashing is still the default as it 63 | is more robust to certain non-random input. 64 | 65 | * Added Wang's integer hash function (not used by default). This hash 66 | function is more robust to certain non-random input. 67 | 68 | 2011-02-14 (0.2.5): 69 | 70 | * Allow to declare global functions. 71 | 72 | 2009-09-26 (0.2.4): 73 | 74 | * Improve portability 75 | 76 | 2008-09-19 (0.2.3): 77 | 78 | * Corrected the example 79 | * Improved interfaces 80 | 81 | 2008-09-11 (0.2.2): 82 | 83 | * Improved speed a little in kh_put() 84 | 85 | 2008-09-10 (0.2.1): 86 | 87 | * Added kh_clear() 88 | * Fixed a compiling error 89 | 90 | 2008-09-02 (0.2.0): 91 | 92 | * Changed to token concatenation which increases flexibility. 93 | 94 | 2008-08-31 (0.1.2): 95 | 96 | * Fixed a bug in kh_get(), which has not been tested previously. 97 | 98 | 2008-08-31 (0.1.1): 99 | 100 | * Added destructor 101 | */ 102 | 103 | 104 | #ifndef __AC_KHASH_H 105 | #define __AC_KHASH_H 106 | 107 | /*! 108 | @header 109 | 110 | Generic hash table library. 
111 | */ 112 | 113 | #define AC_VERSION_KHASH_H "0.2.6" 114 | 115 | #include 116 | #include 117 | #include 118 | 119 | /* compipler specific configuration */ 120 | 121 | #if UINT_MAX == 0xffffffffu 122 | typedef unsigned int khint32_t; 123 | #elif ULONG_MAX == 0xffffffffu 124 | typedef unsigned long khint32_t; 125 | #endif 126 | 127 | #if ULONG_MAX == ULLONG_MAX 128 | typedef unsigned long khint64_t; 129 | #else 130 | typedef unsigned long long khint64_t; 131 | #endif 132 | 133 | #ifdef _MSC_VER 134 | #define inline __inline 135 | #endif 136 | 137 | typedef khint32_t khint_t; 138 | typedef khint_t khiter_t; 139 | 140 | #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) 141 | #define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) 142 | #define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) 143 | #define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) 144 | #define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) 145 | #define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) 146 | #define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) 147 | 148 | #ifdef KHASH_LINEAR 149 | #define __ac_inc(k, m) 1 150 | #else 151 | #define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) 152 | #endif 153 | 154 | #define __ac_fsize(m) ((m) < 16? 
1 : (m)>>4) 155 | 156 | #ifndef kroundup32 157 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 158 | #endif 159 | 160 | static const double __ac_HASH_UPPER = 0.77; 161 | 162 | #define __KHASH_TYPE(name, khkey_t, khval_t) \ 163 | typedef struct { \ 164 | khint_t n_buckets, size, n_occupied, upper_bound; \ 165 | khint32_t *flags; \ 166 | khkey_t *keys; \ 167 | khval_t *vals; \ 168 | } kh_##name##_t; 169 | 170 | #define KHASH_DECLARE(name, khkey_t, khval_t) \ 171 | __KHASH_TYPE(name, khkey_t, khval_t) \ 172 | extern kh_##name##_t *kh_init_##name(); \ 173 | extern void kh_destroy_##name(kh_##name##_t *h); \ 174 | extern void kh_clear_##name(kh_##name##_t *h); \ 175 | extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ 176 | extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ 177 | extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ 178 | extern void kh_del_##name(kh_##name##_t *h, khint_t x); 179 | 180 | #define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ 181 | __KHASH_TYPE(name, khkey_t, khval_t) \ 182 | SCOPE kh_##name##_t *kh_init_##name() { \ 183 | return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ 184 | } \ 185 | SCOPE void kh_destroy_##name(kh_##name##_t *h) \ 186 | { \ 187 | if (h) { \ 188 | free(h->keys); free(h->flags); \ 189 | free(h->vals); \ 190 | free(h); \ 191 | } \ 192 | } \ 193 | SCOPE void kh_clear_##name(kh_##name##_t *h) \ 194 | { \ 195 | if (h && h->flags) { \ 196 | memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ 197 | h->size = h->n_occupied = 0; \ 198 | } \ 199 | } \ 200 | SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ 201 | { \ 202 | if (h->n_buckets) { \ 203 | khint_t inc, k, i, last, mask; \ 204 | mask = h->n_buckets - 1; \ 205 | k = __hash_func(key); i = k & mask; \ 206 | inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ 
207 | while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ 208 | i = (i + inc) & mask; \ 209 | if (i == last) return h->n_buckets; \ 210 | } \ 211 | return __ac_iseither(h->flags, i)? h->n_buckets : i; \ 212 | } else return 0; \ 213 | } \ 214 | SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ 215 | { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ 216 | khint32_t *new_flags = 0; \ 217 | khint_t j = 1; \ 218 | { \ 219 | kroundup32(new_n_buckets); \ 220 | if (new_n_buckets < 4) new_n_buckets = 4; \ 221 | if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ 222 | else { /* hash table size to be changed (shrink or expand); rehash */ \ 223 | new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ 224 | memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ 225 | if (h->n_buckets < new_n_buckets) { /* expand */ \ 226 | h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ 227 | if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ 228 | } /* otherwise shrink */ \ 229 | } \ 230 | } \ 231 | if (j) { /* rehashing is needed */ \ 232 | for (j = 0; j != h->n_buckets; ++j) { \ 233 | if (__ac_iseither(h->flags, j) == 0) { \ 234 | khkey_t key = h->keys[j]; \ 235 | khval_t val; \ 236 | khint_t new_mask; \ 237 | new_mask = new_n_buckets - 1; \ 238 | if (kh_is_map) val = h->vals[j]; \ 239 | __ac_set_isdel_true(h->flags, j); \ 240 | while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ 241 | khint_t inc, k, i; \ 242 | k = __hash_func(key); \ 243 | i = k & new_mask; \ 244 | inc = __ac_inc(k, new_mask); \ 245 | while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ 246 | __ac_set_isempty_false(new_flags, i); \ 247 | if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* 
kick out the existing element */ \ 248 | { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ 249 | if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ 250 | __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ 251 | } else { /* write the element and jump out of the loop */ \ 252 | h->keys[i] = key; \ 253 | if (kh_is_map) h->vals[i] = val; \ 254 | break; \ 255 | } \ 256 | } \ 257 | } \ 258 | } \ 259 | if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ 260 | h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ 261 | if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ 262 | } \ 263 | free(h->flags); /* free the working space */ \ 264 | h->flags = new_flags; \ 265 | h->n_buckets = new_n_buckets; \ 266 | h->n_occupied = h->size; \ 267 | h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ 268 | } \ 269 | } \ 270 | SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ 271 | { \ 272 | khint_t x; \ 273 | if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ 274 | if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \ 275 | else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ 276 | } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ 277 | { \ 278 | khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ 279 | x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ 280 | if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ 281 | else { \ 282 | inc = __ac_inc(k, mask); last = i; \ 283 | while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ 284 | if (__ac_isdel(h->flags, i)) site = i; \ 285 | i = (i + inc) & mask; \ 286 | if (i == last) { x = site; break; } \ 287 | } \ 288 | if (x == h->n_buckets) { \ 289 | if 
(__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ 290 | else x = i; \ 291 | } \ 292 | } \ 293 | } \ 294 | if (__ac_isempty(h->flags, x)) { /* not present at all */ \ 295 | h->keys[x] = key; \ 296 | __ac_set_isboth_false(h->flags, x); \ 297 | ++h->size; ++h->n_occupied; \ 298 | *ret = 1; \ 299 | } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ 300 | h->keys[x] = key; \ 301 | __ac_set_isboth_false(h->flags, x); \ 302 | ++h->size; \ 303 | *ret = 2; \ 304 | } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ 305 | return x; \ 306 | } \ 307 | SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ 308 | { \ 309 | if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ 310 | __ac_set_isdel_true(h->flags, x); \ 311 | --h->size; \ 312 | } \ 313 | } 314 | 315 | #define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ 316 | KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) 317 | 318 | /* --- BEGIN OF HASH FUNCTIONS --- */ 319 | 320 | /*! @function 321 | @abstract Integer hash function 322 | @param key The integer [khint32_t] 323 | @return The hash value [khint_t] 324 | */ 325 | #define kh_int_hash_func(key) (khint32_t)(key) 326 | /*! @function 327 | @abstract Integer comparison function 328 | */ 329 | #define kh_int_hash_equal(a, b) ((a) == (b)) 330 | /*! @function 331 | @abstract 64-bit integer hash function 332 | @param key The integer [khint64_t] 333 | @return The hash value [khint_t] 334 | */ 335 | #define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) 336 | /*! @function 337 | @abstract 64-bit integer comparison function 338 | */ 339 | #define kh_int64_hash_equal(a, b) ((a) == (b)) 340 | /*! 
@function 341 | @abstract const char* hash function 342 | @param s Pointer to a null terminated string 343 | @return The hash value 344 | */ 345 | static inline khint_t __ac_X31_hash_string(const char *s) 346 | { 347 | khint_t h = *s; 348 | if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; 349 | return h; 350 | } 351 | /*! @function 352 | @abstract Another interface to const char* hash function 353 | @param key Pointer to a null terminated string [const char*] 354 | @return The hash value [khint_t] 355 | */ 356 | #define kh_str_hash_func(key) __ac_X31_hash_string(key) 357 | /*! @function 358 | @abstract Const char* comparison function 359 | */ 360 | #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) 361 | 362 | static inline khint_t __ac_Wang_hash(khint_t key) 363 | { 364 | key += ~(key << 15); 365 | key ^= (key >> 10); 366 | key += (key << 3); 367 | key ^= (key >> 6); 368 | key += ~(key << 11); 369 | key ^= (key >> 16); 370 | return key; 371 | } 372 | #define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) 373 | 374 | /* --- END OF HASH FUNCTIONS --- */ 375 | 376 | /* Other convenient macros... */ 377 | 378 | /*! 379 | @abstract Type of the hash table. 380 | @param name Name of the hash table [symbol] 381 | */ 382 | #define khash_t(name) kh_##name##_t 383 | 384 | /*! @function 385 | @abstract Initiate a hash table. 386 | @param name Name of the hash table [symbol] 387 | @return Pointer to the hash table [khash_t(name)*] 388 | */ 389 | #define kh_init(name) kh_init_##name() 390 | 391 | /*! @function 392 | @abstract Destroy a hash table. 393 | @param name Name of the hash table [symbol] 394 | @param h Pointer to the hash table [khash_t(name)*] 395 | */ 396 | #define kh_destroy(name, h) kh_destroy_##name(h) 397 | 398 | /*! @function 399 | @abstract Reset a hash table without deallocating memory. 
400 | @param name Name of the hash table [symbol] 401 | @param h Pointer to the hash table [khash_t(name)*] 402 | */ 403 | #define kh_clear(name, h) kh_clear_##name(h) 404 | 405 | /*! @function 406 | @abstract Resize a hash table. 407 | @param name Name of the hash table [symbol] 408 | @param h Pointer to the hash table [khash_t(name)*] 409 | @param s New size [khint_t] 410 | */ 411 | #define kh_resize(name, h, s) kh_resize_##name(h, s) 412 | 413 | /*! @function 414 | @abstract Insert a key to the hash table. 415 | @param name Name of the hash table [symbol] 416 | @param h Pointer to the hash table [khash_t(name)*] 417 | @param k Key [type of keys] 418 | @param r Extra return code: 0 if the key is present in the hash table; 419 | 1 if the bucket is empty (never used); 2 if the element in 420 | the bucket has been deleted [int*] 421 | @return Iterator to the inserted element [khint_t] 422 | */ 423 | #define kh_put(name, h, k, r) kh_put_##name(h, k, r) 424 | 425 | /*! @function 426 | @abstract Retrieve a key from the hash table. 427 | @param name Name of the hash table [symbol] 428 | @param h Pointer to the hash table [khash_t(name)*] 429 | @param k Key [type of keys] 430 | @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] 431 | */ 432 | #define kh_get(name, h, k) kh_get_##name(h, k) 433 | 434 | /*! @function 435 | @abstract Remove a key from the hash table. 436 | @param name Name of the hash table [symbol] 437 | @param h Pointer to the hash table [khash_t(name)*] 438 | @param k Iterator to the element to be deleted [khint_t] 439 | */ 440 | #define kh_del(name, h, k) kh_del_##name(h, k) 441 | 442 | /*! @function 443 | @abstract Test whether a bucket contains data. 444 | @param h Pointer to the hash table [khash_t(name)*] 445 | @param x Iterator to the bucket [khint_t] 446 | @return 1 if containing data; 0 otherwise [int] 447 | */ 448 | #define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) 449 | 450 | /*! 
@function 451 | @abstract Get key given an iterator 452 | @param h Pointer to the hash table [khash_t(name)*] 453 | @param x Iterator to the bucket [khint_t] 454 | @return Key [type of keys] 455 | */ 456 | #define kh_key(h, x) ((h)->keys[x]) 457 | 458 | /*! @function 459 | @abstract Get value given an iterator 460 | @param h Pointer to the hash table [khash_t(name)*] 461 | @param x Iterator to the bucket [khint_t] 462 | @return Value [type of values] 463 | @discussion For hash sets, calling this results in segfault. 464 | */ 465 | #define kh_val(h, x) ((h)->vals[x]) 466 | 467 | /*! @function 468 | @abstract Alias of kh_val() 469 | */ 470 | #define kh_value(h, x) ((h)->vals[x]) 471 | 472 | /*! @function 473 | @abstract Get the start iterator 474 | @param h Pointer to the hash table [khash_t(name)*] 475 | @return The start iterator [khint_t] 476 | */ 477 | #define kh_begin(h) (khint_t)(0) 478 | 479 | /*! @function 480 | @abstract Get the end iterator 481 | @param h Pointer to the hash table [khash_t(name)*] 482 | @return The end iterator [khint_t] 483 | */ 484 | #define kh_end(h) ((h)->n_buckets) 485 | 486 | /*! @function 487 | @abstract Get the number of elements in the hash table 488 | @param h Pointer to the hash table [khash_t(name)*] 489 | @return Number of elements in the hash table [khint_t] 490 | */ 491 | #define kh_size(h) ((h)->size) 492 | 493 | /*! @function 494 | @abstract Get the number of buckets in the hash table 495 | @param h Pointer to the hash table [khash_t(name)*] 496 | @return Number of buckets in the hash table [khint_t] 497 | */ 498 | #define kh_n_buckets(h) ((h)->n_buckets) 499 | 500 | /* More conenient interfaces */ 501 | 502 | /*! @function 503 | @abstract Instantiate a hash set containing integer keys 504 | @param name Name of the hash table [symbol] 505 | */ 506 | #define KHASH_SET_INIT_INT(name) \ 507 | KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) 508 | 509 | /*! 
@function 510 | @abstract Instantiate a hash map containing integer keys 511 | @param name Name of the hash table [symbol] 512 | @param khval_t Type of values [type] 513 | */ 514 | #define KHASH_MAP_INIT_INT(name, khval_t) \ 515 | KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) 516 | 517 | /*! @function 518 | @abstract Instantiate a hash map containing 64-bit integer keys 519 | @param name Name of the hash table [symbol] 520 | */ 521 | #define KHASH_SET_INIT_INT64(name) \ 522 | KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) 523 | 524 | /*! @function 525 | @abstract Instantiate a hash map containing 64-bit integer keys 526 | @param name Name of the hash table [symbol] 527 | @param khval_t Type of values [type] 528 | */ 529 | #define KHASH_MAP_INIT_INT64(name, khval_t) \ 530 | KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) 531 | 532 | typedef const char *kh_cstr_t; 533 | /*! @function 534 | @abstract Instantiate a hash map containing const char* keys 535 | @param name Name of the hash table [symbol] 536 | */ 537 | #define KHASH_SET_INIT_STR(name) \ 538 | KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) 539 | 540 | /*! 
@function
  @abstract     Instantiate a hash map containing const char* keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_STR(name, khval_t) \
	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)

#endif /* __AC_KHASH_H */

--------------------------------------------------------------------------------
/plp-joint.js:
--------------------------------------------------------------------------------
#!/usr/bin/env k8

var version = "r158";

/************
 * getopt() *
 ************/

// Minimal getopt(3)-style option scanner.
//   args  array of command-line arguments
//   ostr  option specification; a letter followed by ':' takes an argument
// State is kept on the function object itself: getopt.ind (index of the next
// argument to scan), getopt.arg (argument of the last option, or null) and
// getopt.place (position inside the current argument, -1 when a new argument
// has to be fetched). Returns the option letter, '?' for an unknown option or
// a missing argument (':' instead if ostr starts with ':'), or null when
// option scanning is finished.
var getopt = function(args, ostr) {
	var oli; // option letter list index
	if (typeof(getopt.place) == 'undefined')
		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
	if (getopt.place == -1) { // update scanning pointer
		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
			getopt.place = -1;
			return null;
		}
		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
			++getopt.ind;
			getopt.place = -1;
			return null;
		}
	}
	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
		if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
		if (getopt.place < 0) ++getopt.ind;
		return '?';
	}
	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
		getopt.arg = null;
		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
	} else { // need an argument
		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
			getopt.arg = args[getopt.ind].substr(getopt.place);
		else if (args.length <= ++getopt.ind) { // no arg
			getopt.place = -1;
			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
			return '?';
		} else getopt.arg = args[getopt.ind]; // white space
		getopt.place = -1;
		++getopt.ind;
	}
	return optopt;
}

/*************************************
 * Parameters & command-line parsing *
 *************************************/

// Defaults; each can be overridden by the option named in the usage text below.
// min_mapq and min_het_ab_bulk have no command-line option.
var c, min_mapq = 50, flt_win = 100, n_bulk = 1, is_hap_cell = false, show_flt = false, auto_only = false;
var min_dp_alt_cell = 5, min_dp_alt_strand_cell = 2, min_ab_cell = 0.2, max_lt_cell = 1, min_end_len = 10, min_joint_cell = 2, min_joint_strand_cell = 1;
var min_dp_bulk = 20, min_het_dp_bulk = 8, max_alt_dp_bulk = 0, min_het_ab_bulk = 0.3, is_hap_bulk = false;
var min_dp_dmg_strand = 4;
var fn_var = null, fn_hap = null, fn_excl = null, fn_rep = null;

while ((c = getopt(arguments, "h:A:b:v:D:e:Hl:a:s:w:m:Fr:uL:B:S:Pj:J:")) != null) {
	if (c == 'b') n_bulk = parseInt(getopt.arg);
	else if (c == 'H') is_hap_cell = true;
	else if (c == 'h') fn_hap = getopt.arg;
	else if (c == 'e') fn_excl = getopt.arg;
	else if (c == 'v') fn_var = getopt.arg;
	else if (c == 'r') fn_rep = getopt.arg;
	else if (c == 'F') show_flt = true;
	else if (c == 'u') auto_only = true;
	else if (c == 'a') min_dp_alt_cell = parseInt(getopt.arg);
	else if (c == 's') min_dp_alt_strand_cell = parseInt(getopt.arg);
	else if (c == 'w') flt_win = parseInt(getopt.arg);
	else if (c == 'S') min_dp_dmg_strand = parseInt(getopt.arg);
	else if (c == 'B') min_ab_cell = parseFloat(getopt.arg);
	else if (c == 'l') max_lt_cell = parseInt(getopt.arg);
	else if (c == 'L') min_end_len = parseInt(getopt.arg);
	else if (c == 'D') min_dp_bulk = parseInt(getopt.arg);
	else if (c == 'A') min_het_dp_bulk = parseInt(getopt.arg);
	else if (c == 'm') max_alt_dp_bulk = parseInt(getopt.arg);
	else if (c == 'P') is_hap_bulk = is_hap_cell = true; // a haploid bulk implies haploid cells
	else if (c == 'j') min_joint_cell = parseInt(getopt.arg);
	else if (c == 'J') min_joint_strand_cell = parseInt(getopt.arg);
}

// The per-strand threshold applies to both strands, so twice its value must fit in the total.
// Note that this check runs before the usage test below.
if (min_dp_alt_strand_cell * 2 > min_dp_alt_cell)
	throw("2 * {-s} should not be larger than {-a}");

// No positional argument left: print help and quit.
// NOTE(review): the usage line ends right after "[options] "; the placeholder for the
// input VCF (read from arguments[getopt.ind] further down) appears to be missing -- confirm.
if (arguments.length - getopt.ind == 0) {
	print("Usage: plp-joint.js [options] ");
	print("Options:");
	print(" General:");
	print(" -b INT number of bulk samples [1]");
	print(" -h FILE samples in FILE are haploid []");
	print(" -H mark all single-cell samples as haploid");
	print(" -e FILE exclude samples contained in FILE []");
	print(" -v FILE exclude positions in VCF FILE []");
	print(" -r FILE cell replicates []");
	print(" -F print SNVs filtered by -w and -v");
	print(" -u process autosomes only");
	print(" Cell:");
	print(" -a INT min ALT read depth to call an SNV [" + min_dp_alt_cell + "]");
	print(" -s INT min ALT read depth per strand [" + min_dp_alt_strand_cell + "]");
	print(" -l INT max LIANTI conflicting reads [" + max_lt_cell + "]");
	print(" -L INT min distance towards the end of a read [" + min_end_len + "]");
	print(" -w INT size of window to filter clustered SNVs [" + flt_win + "]");
	print(" -S INT min strand depth at candidate DNA damages [" + min_dp_dmg_strand + "]");
	print(" -B FLOAT min ALT allele balance [" + min_ab_cell + "]");
	print(" -j INT min allele depth to call joint SNVs [" + min_joint_cell + "]");
	print(" -J INT min allele depth on both strands to call joint SNVs [" + min_joint_strand_cell + "]");
	print(" Bulk:");
	print(" -D INT min bulk read depth [" + min_dp_bulk + "]");
	print(" -A INT min bulk ALT read depth to call a het [" + min_het_dp_bulk + "]");
	print(" -m INT max bulk ALT read depth to call an SNV [" + max_alt_dp_bulk + "]");
	print(" -P the bulk is haploid");
	exit(1);
}

// Output header: command line, version, and a legend of the two-letter record tags.
// The legend does not list the JV, BV and DP records that are also printed below.
print('CL', 'plp-joint.js ' + arguments.join(" "));
print('VN', version);
print('CC', 'SM sample name (each sample)');
print('CC', 'NV somatic SNVs');
print('CC', 'NN number of called somatic SNVs (each)');
print('CC', 'NR false negative rate (each)');
print('CC', 'NC number of somatic SNVs after FNR correction (each)');
print('CC', 'NA alignment somatic SNVs');
print('CC', 'DV DNA damages or amplification errors');
print('CC', 'DN number of called damages/errors (each)');
print('CC', 'DR false negative rate of damages/errors (each)');
print('CC', 'DC number of damages/errors after FNR correction (each)');
print('CC');

/***********************
 * Auxiliary functions *
 ***********************/

// Read the first tab-separated column of each line of `fn` ('-' for the default
// File(), presumably stdin) and return it as the key set of an object.
// A null or empty file name yields an empty object.
function read_list(fn)
{
	if (fn == null || fn == "") return {};
	var buf = new Bytes();
	var file = fn == '-'? new File() : new File(fn);
	var h = {};
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		h[t[0]] = 1;
	}
	file.close();
	buf.destroy();
	return h;
}

// Emit the records for one site `x` leaving the filtering window.
//   x            site object pushed onto `last` by the main loop
//                (flt, n_joint_alt, ctg, pos, bulk, cell, ref, alt)
//   cell_meta    per-cell accumulators; .snv and .calls are updated here
//   is_hap_bulk  whether the bulk is treated as haploid
// Prints an 'NV' line when at least one cell has a passing ALT call and a 'JV'
// line when at least one cell reaches min_joint_cell ALT reads at an unfiltered
// site with n_joint_alt >= 2. For those joint sites one character per cell is
// appended to cell_meta[i].calls ('.' = filtered, no depth, or ALT reads below
// min_joint_cell). Reads the global min_joint_cell.
function aggregate_calls(x, cell_meta, is_hap_bulk)
{
	var bulk_ad = [0, 0], bulk_alt = [], cell_hit_jv = [], cell_hit_nv = [];
	for (var i = 0; i < x.bulk.length; ++i)
		bulk_ad[0] += x.bulk[i].ad[0], bulk_ad[1] += x.bulk[i].ad[1], bulk_alt.push(x.bulk[i].ad[1]);
	// when any bulk has ALT reads, report the per-bulk ALT depths instead of their sum
	if (bulk_ad[1] != 0) bulk_ad[1] = bulk_alt.join(":");
	for (var i = 0; i < x.cell.length; ++i) {
		var c = x.cell[i];
		if (!x.flt && x.n_joint_alt >= 2) {
			var b;
			if (c.flt || c.dp == 0) b = '.';
			else if (c.ad[0] > 0 && c.ad[1] >= min_joint_cell) b = '1'; // both alleles seen
			else if (c.ad[1] >= min_joint_cell) b = is_hap_bulk? '6' : cell_meta[i].ploidy == 1? '4' : '2'; // ALT only
			else if (c.ad[1] > 0) b = '.';
			else b = is_hap_bulk? '5' : cell_meta[i].ploidy == 1? '3' : '0'; // no ALT reads
			cell_meta[i].calls.push(b);
			if (c.ad[1] >= min_joint_cell)
				cell_hit_jv.push([cell_meta[i].name, c.adf[1], c.adr[1]].join(":"));
		}
		if (!c.flt && c.alt && !x.flt) {
			++cell_meta[i].snv;
			cell_hit_nv.push([cell_meta[i].name, c.adf[1], c.adr[1]].join(":"));
		}
	}
	if (cell_hit_nv.length > 0)
		print('NV', x.ctg, x.pos, x.ref, x.alt, bulk_ad.join("\t"), cell_hit_nv.length, cell_hit_nv.join("\t"));
	if (cell_hit_jv.length > 0)
		print('JV', x.ctg, x.pos, x.ref, x.alt, bulk_ad.join("\t"), cell_hit_jv.length, cell_hit_jv.join("\t"));
}

/********
 * Main *
 ********/

var file, buf = new Bytes(), re_auto = new RegExp('^(chr)?([0-9]+)$');

// Positions to exclude (-v), keyed by "contig:position".
var var_map = new Map();
if (fn_var != null) {
	warn('Reading sites to filter...');
	file = new File(fn_var);
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		if (t[0][0] == '#') continue;
		var_map.put(t[0] + ':' + t[1]);
	}
	file.close();
}

// Replicates (-r): every name after the first on a line maps to the first name.
var rep_str = {};
if (fn_rep != null) {
	file = new File(fn_rep);
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split(/\s+/);
		for (var i = 1; i < t.length; ++i)
			rep_str[t[i]] = t[0];
	}
	file.close();
}

var sample_excl = read_list(fn_excl);
var sample_hap = read_list(fn_hap);
var col2cell = [];   // VCF column index -> index into cell_meta
var cell_meta = [];  // one entry per retained single-cell sample

warn('Calling...');
file = arguments[getopt.ind] == '-'? new File() : new File(arguments[getopt.ind]);
// `last` and `last_bulk` hold recent sites still inside the clustering window (flt_win).
var rep_id = [], last = [], last_bulk = [], n_het_bulk = 0, n_hom_bulk = 0;
while (file.readline(buf) >= 0) {
	var m, t = buf.toString().split("\t");
	if (t[0] == '#CHROM') { // parse the sample line
		var sample_name = [];
		// columns 9 .. 9+n_bulk-1 are bulk samples; the rest are cells
		for (var i = 9 + n_bulk; i < t.length; ++i) {
			var s1 = t[i], s2 = s1.replace(/\.bam$/, "");
			if (sample_excl[s1] || sample_excl[s2]) continue;
			if (rep_str[s1] || rep_str[s2]) continue; // replicates are folded into their primary sample
			var pl = is_hap_cell || sample_hap[s1] || sample_hap[s2]? 1 : 2;
			cell_meta.push({ name:s2, ploidy:pl, col:i, ado:[0,0], fn:0, snv:0, dmg:0, dmg_fp:0, dmg_fn:[0, 0], calls:[] });
			sample_name.push(s2); // for printing only
		}
		for (var i = 0; i < cell_meta.length; ++i)
			col2cell[cell_meta[i].col] = i;
		// construct rep_id
		for (var i = 0; i < t.length; ++i)
			rep_id[i] = i;
		if (fn_rep != null) {
			var sample2id = {};
			for (var i = 9; i < t.length; ++i) {
				var s1 = t[i], s2 = s1.replace(/\.bam$/, "");
				sample2id[s1] = sample2id[s2] = i;
			}
			// point each replicate column at the column of its primary sample
			for (var i = 9; i < t.length; ++i) {
				var s1 = t[i], s2 = s1.replace(/\.bam$/, "");
				if (rep_str[s1] || rep_str[s2]) {
					var s3 = rep_str[s1] != null? rep_str[s1] : rep_str[s2];
					if (sample2id[s3] != null) rep_id[i] = sample2id[s3];
				}
			}
		}
		print('SM', sample_name.join("\t"));
		continue;
	} else if (t[0][0] == '#') continue; // skip header

	if (auto_only && !re_auto.test(t[0])) continue;
	t[1] = parseInt(t[1]);

	// flt_bulks: site unusable for bulk-based statistics; flt_snv: site unusable for SNV calling
	var flt_bulks = false, flt_snv = false;

	// skip bad sites: mapQ
	if ((m = /AMQ=([\d,]+)/.exec(t[7])) != null) {
		var s = m[1].split(","), flt = false;
		for (var j = 0; j < s.length; ++j)
			if (parseInt(s[j]) < min_mapq)
				flt = true;
		if (flt) flt_bulks = flt_snv = true;
	}

	// parse the FORMAT field
	var fmt = t[8].split(":"), fmt_hash = {};
	for (var i = 0; i < fmt.length; ++i)
		fmt_hash[fmt[i]] = i;
	var fmt_ltdrop = fmt_hash["LTDROP"];
	var fmt_alen = fmt_hash["ALEN"];
	var fmt_adf = fmt_hash["ADF"];
	var fmt_adr = fmt_hash["ADR"];
	if (fmt_adf == null || fmt_adr == null)
		throw Error('missing ADF or ADR in FORMAT');

	// parse VCF (this part works with multiple ALT alleles)
	var cell = [], bulk = [];
	for (var i = 9; i < t.length; ++i) {
		var cell_id = col2cell[rep_id[i]];
		if (i >= 9 + n_bulk && cell_id == null) continue; // exclude this sample
		var s = t[i].split(":");
		var lt = fmt_ltdrop != null && s[fmt_ltdrop] != '.'? parseInt(s[fmt_ltdrop]) : 0;
		var adf = s[fmt_adf].split(",");
		var adr = s[fmt_adr].split(",");
		var ad = [], dp = 0;
		if (adf.length != adr.length) throw Error("Inconsistent VCF");
		var dp_ref = 0, dp_alt = 0;
		// ad[j] = forward + reverse depth of allele j; dp = total depth
		for (var j = 0; j < adf.length; ++j) {
			adf[j] = parseInt(adf[j]);
			adr[j] = parseInt(adr[j]);
			if (j == 0) dp_ref += adf[j] + adr[j];
			else dp_alt += adf[j] + adr[j];
			ad[j] = adf[j] + adr[j];
			dp += ad[j];
		}
		if (i < 9 + n_bulk) {
			bulk.push({ dp:dp, ad:ad, adf:adf, adr:adr });
		} else {
			var flt = false, flt_dmg = false;
			if (cell_meta[cell_id].ploidy == 1 && dp_alt > 0 && dp_ref > 0) flt = true; // two alleles in a haploid cell; flt_dmg is not affected by this
			if (lt > max_lt_cell) flt = true;
			if (fmt_alen != null && s[fmt_alen] != '.') {
				var u = s[fmt_alen].split(",");
				// index 0 (the REF entry) is skipped; any ALT value below min_end_len filters the cell
				for (var j = 1; j < u.length; ++j)
					if (u[j] != '.' && parseFloat(u[j]) < min_end_len)
						flt = flt_dmg = true;
			}
			if (cell[cell_id] == null) {
				cell[cell_id] = { flt:flt, dp:dp, ad:ad, adf:adf, adr:adr, lt:lt, flt_dmg:flt_dmg };
			} else {
				// a replicate of an already-seen cell: combine filters, keep the larger lt and
				// the per-allele minimum of each strand depth, then recompute ad and dp
				var c = cell[cell_id];
				if (flt) c.flt = flt;
				if (flt_dmg) c.flt_dmg = true;
				if (c.lt < lt) c.lt = lt;
				c.dp = 0;
				for (var j = 0; j < ad.length; ++j) {
					if (c.adf[j] > adf[j]) c.adf[j] = adf[j];
					if (c.adr[j] > adr[j]) c.adr[j] = adr[j];
					c.ad[j] = c.adf[j] + c.adr[j];
					c.dp += c.ad[j];
				}
			}
		}
	}

	// only consider biallelic sites for calling
	var alt = t[4].split(",");
	if (alt.length != 1 || alt[0].length != 1 || t[3].length != 1)
		flt_bulks = flt_snv = true;

	// test het in the bulk(s)
	var all_het = true, all_hom = true, all_good_alt = true;
	for (var i = 0; i < bulk.length; ++i) {
		var b = bulk[i];
		b.het = b.hom = false;
		// het: both alleles on both strands, each with enough depth and allele balance
		if (b.adf[0] > 0 && b.adf[1] > 0 && b.adr[0] > 0 && b.adr[1] > 0 && b.ad[0] >= min_het_dp_bulk && b.ad[1] >= min_het_dp_bulk) {
			if (b.ad[0] >= b.dp * min_het_ab_bulk && b.ad[1] >= b.dp * min_het_ab_bulk)
				b.het = true;
		}
		// hom: enough ALT depth on both strands and at most max_alt_dp_bulk REF reads
		if (!b.het && b.ad[1] >= min_het_dp_bulk && b.adf[1] > 0 && b.adr[1] > 0 && b.ad[0] <= max_alt_dp_bulk)
			b.hom = true;
		if (b.ad[1] < min_het_dp_bulk) all_good_alt = false;
		if (!b.het) all_het = false;
		if (!b.hom) all_hom = false;
		if (b.dp < min_dp_bulk)
			flt_bulks = true;
	}

	// output differences in bulk
	if (n_bulk > 1 && !is_hap_bulk) {
		var bulk_diff = false, n_bulk_ref = 0, n_bulk_alt = 0;
		for (var i = 0; i < bulk.length; ++i) {
			var b = bulk[i];
			if (b.ad[1] == 0) ++n_bulk_ref;
			else if (b.ad[1] >= min_dp_alt_cell && b.adf[1] >= min_dp_alt_strand_cell && b.adr[1] >= min_dp_alt_strand_cell)
				++n_bulk_alt;
		}
		// at least one bulk without ALT reads and one with a confident ALT
		if (n_bulk_ref > 0 && n_bulk_alt > 0) {
			var ad = [];
			for (var i = 0; i < bulk.length; ++i)
				ad.push(bulk[i].adf[1] + ':' + bulk[i].adr[1]);

			// flush candidates that have fallen out of the window
			while (last_bulk.length && (last_bulk[0].ctg != t[0] || last_bulk[0].pos + flt_win < t[1])) {
				var x = last_bulk.shift();
				if (!x.flt) print('BV', x.data);
			}
			var flt_this = flt_bulks;
			if (var_map && var_map.get(t[0] + ':' + t[1]) != null)
				flt_this = true;
			// anything still in the window is clustered with this site: filter both
			for (var j = 0; j < last_bulk.length; ++j) {
				flt_this = true;
				last_bulk[j].flt = true;
			}
			last_bulk.push({ flt:flt_this, ctg:t[0], pos:t[1], data:[t[0], t[1], t[3], t[4], ad.join("\t")].join("\t") });
		}
	}

	// count ADO
	if (is_hap_bulk && all_hom && !flt_bulks) {
		++n_hom_bulk;
		for (var j = 0; j < cell.length; ++j)
			if (cell[j].flt || cell[j].ad[1] < min_joint_cell)
				++cell_meta[j].ado[1];
	}
	if (!is_hap_bulk && all_het && !flt_bulks) {
		++n_het_bulk;
		for (var j = 0; j < cell.length; ++j) {
			if (cell[j].flt || cell[j].ad[0] < min_joint_cell) ++cell_meta[j].ado[0]; // ref allele dropped
			if (cell[j].flt || cell[j].ad[1] < min_joint_cell) ++cell_meta[j].ado[1]; // alt allele dropped
		}
	}

	// test if ALT is callable and count FN
	var n_joint_alt = 0;
	for (var i = 0; i < cell.length; ++i) {
		var c = cell[i];
		// If a cell is haploid and it has ref alleles, c.flt will be true. The conditions below work with haploid cells.
		c.alt = (!c.flt && c.ad[1] >= min_dp_alt_cell && c.adf[1] >= min_dp_alt_strand_cell && c.adr[1] >= min_dp_alt_strand_cell && c.ad[1] >= c.dp * min_ab_cell);
		c.joint_alt = (!c.flt && c.ad[1] >= min_joint_cell && c.adf[1] >= min_joint_strand_cell && c.adr[1] >= min_joint_strand_cell);
		if (c.joint_alt) ++n_joint_alt;
		// a known bulk variant that the cell fails to call counts as a false negative
		if (!flt_bulks && !c.alt && ((is_hap_bulk && all_hom) || (!is_hap_bulk && all_het)))
			++cell_meta[i].fn;
		// whether to call a damage
		// (both alleles deep enough, and every strand-level product below is zero:
		// ALT on one strand only, and neither strand carries both REF and ALT)
		c.dmg = (!c.flt_dmg && c.ad[1] >= min_dp_dmg_strand && c.ad[0] >= min_dp_dmg_strand && c.adf[1] * c.adr[1] == 0 && c.adf[0] * c.adf[1] == 0 && c.adr[0] * c.adr[1] == 0);
		if (all_het && !flt_bulks) {
			if (!(!c.flt_dmg && c.adf[0] >= min_dp_dmg_strand && c.adr[0] >= min_dp_dmg_strand)) ++cell_meta[i].dmg_fn[0];
			if (!(!c.flt_dmg && c.adf[1] >= min_dp_dmg_strand && c.adr[1] >= min_dp_dmg_strand)) ++cell_meta[i].dmg_fn[1];
			if (cell_meta[i].ploidy > 1 && c.dmg) ++cell_meta[i].dmg_fp; // no dmg_fp of this kind for a haploid cell
		}
	}

	// skip the highly unlikely scenario: all bulks have good ALT alleles. The site is not used for window filtering.
	if (all_good_alt) continue;

	// requiring at least one bulk to have good RefHom
	var n_bulk_ref = 0;
	for (var i = 0; i < bulk.length; ++i)
		if (bulk[i].ad[1] <= max_alt_dp_bulk)
			++n_bulk_ref;
	if (n_bulk_ref == 0) flt_snv = true; // flag the unfavorable scenario: no bulks with good RefHom; this site may be used for window filtering later

	// print sites with conflicting strand information
	if (!flt_snv && !flt_bulks) {
		var tmp = [];
		for (var i = 0; i < cell.length; ++i) {
			var c = cell[i];
			if (!c.dmg) continue;
			tmp.push(cell_meta[i].name + ':' + c.adf.join(",") + ':' + c.adr.join(","));
			++cell_meta[i].dmg;
		}
		// positions listed in the -v file are not reported (the per-cell dmg counter above is still incremented)
		if (tmp.length > 0 && var_map && var_map.get(t[0] + ':' + t[1]) != null) tmp.length = 0;
		if (tmp.length > 0) {
			var bulk_str = "";
			for (var i = 0; i < bulk.length; ++i) {
				if (i) bulk_str += ";";
				bulk_str += bulk[i].adf.join(",") + ":" + bulk[i].adr.join(",");
			}
			print('DV', t[0], t[1], t[3], t[4], bulk_str, tmp.length, tmp.join("\t"));
		}
	}

	// test SNV (usually requiring double strands)
	var cell_alt_f = 0, cell_alt_r = 0;
	for (var i = 0; i < cell.length; ++i) {
		if (cell[i].flt) continue;
		cell_alt_f += cell[i].adf[1];
		cell_alt_r += cell[i].adr[1];
	}
	if (cell_alt_f < min_dp_alt_strand_cell || cell_alt_r < min_dp_alt_strand_cell || cell_alt_f + cell_alt_r < min_dp_alt_cell) // too few ALT reads in cell(s)
		flt_snv = true;

	// filter by window & print
	while (last.length && (last[0].ctg != t[0] || last[0].pos + flt_win < t[1])) {
		var x = last.shift();
		if (show_flt || !x.flt) aggregate_calls(x, cell_meta, is_hap_bulk);
	}

	var flt_this = flt_snv;
	if (flt_bulks) flt_this = true;
	if (var_map && var_map.get(t[0] + ':' + t[1]) != null)
		flt_this = true;
	// a cell with ALT reads at two sites inside the window filters both sites
	for (var j = 0; j < last.length; ++j) {
		for (var i = 0; i < cell.length; ++i)
			if (cell[i].ad[1] > 0 && last[j].cell[i].ad[1] > 0)
				flt_this = last[j].flt = true;
	}

	last.push({ flt:flt_this, n_joint_alt:n_joint_alt, ctg:t[0], pos:t[1], bulk:bulk, cell:cell, ref:t[3], alt:t[4] });
}
// flush whatever is still buffered in the two windows
while (last_bulk.length) {
	var x = last_bulk.shift();
	if (!x.flt) print('BV', x.data);
}
while (last.length) {
	var x = last.shift();
	if (show_flt || !x.flt) aggregate_calls(x, cell_meta, is_hap_bulk);
}

/***************************
 * Output final statistics *
 ***************************/

// Rates are relative to the number of usable bulk-hom sites (haploid bulk) or bulk-het
// sites (otherwise); for a haploid cell against a diploid bulk the rate is rescaled as 2*r - 1.
// If no such bulk site was counted, the divisions below yield NaN or Infinity.
var snv = [], fnr = [], corr_snv = [], dmg = [], corr_dmg = [], ado = [], fnr_dmg = [], fpr_dmg = [];
for (var i = 0; i < cell_meta.length; ++i) {
	var c = cell_meta[i];
	ado[i] = is_hap_bulk? c.ado[1] / n_hom_bulk : c.ploidy == 1? 2. * c.ado[1] / n_het_bulk - 1. : c.ado[1] / n_het_bulk;
	snv[i] = c.snv;
	fnr[i] = is_hap_bulk? c.fn / n_hom_bulk : c.ploidy == 1? 2. * c.fn / n_het_bulk - 1. : c.fn / n_het_bulk;
	corr_snv[i] = snv[i] / (1.0 - fnr[i]);
	dmg[i] = c.dmg;
	if (!is_hap_bulk) {
		fpr_dmg[i] = c.dmg_fp / n_het_bulk;
		fnr_dmg[i] = c.ploidy == 1? 2. * c.dmg_fn[1] / n_het_bulk - 1. : c.dmg_fn[1] / n_het_bulk;
		// NOTE(review): the correction divides by (1 - fnr[i]), the SNV false-negative rate,
		// although fnr_dmg[i] is computed just above -- confirm which one is intended.
		corr_dmg[i] = ((dmg[i] - corr_snv[i] * fpr_dmg[i]) / (1.0 - fnr[i])).toFixed(2);
		fnr_dmg[i] = fnr_dmg[i].toFixed(4);
		fpr_dmg[i] = fpr_dmg[i].toFixed(4);
	}
	fnr[i] = fnr[i].toFixed(4);
	corr_snv[i] = corr_snv[i].toFixed(2);
}
print('NN', snv.join("\t"));
print('NR', fnr.join("\t"));
print('NC', corr_snv.join("\t"));
print('DN', dmg.join("\t"));
if (!is_hap_bulk) {
	print('DP', fpr_dmg.join("\t"));
	print('DR', fnr_dmg.join("\t"));
	print('DC', corr_dmg.join("\t"));
}

// output "multi-alignment"
for (var i = 0; i < cell_meta.length; ++i)
	print('NA', cell_meta[i].name, ado[i].toFixed(4), cell_meta[i].calls.join(""));

/********
 * Free *
 ********/

if (var_map != null) var_map.destroy();
buf.destroy();
file.close();

--------------------------------------------------------------------------------
/trim.c:
--------------------------------------------------------------------------------
#include
#include
#include
#include
#include
#include
#include "kvec.h"

/********************
 * Global variables *
 ********************/

const char *lt_bind = "GGGAGATGTGTATAAGAGACAG"; // including the leading GGG
const char *lt_promoter = "GAACAGAATTTAATACGACTCACTATA"; // T7 promoter sequence
const char *lt_adapter1 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"; // Illumina 3'-end adapter
const char *lt_adapter2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT";
const char *lt_oligo_for= "TTCAGGAAAACCTGA";
const char *lt_oligo_rev= "TCAGGTTTTCCTGAA";
const char *lt_bind_rev = "CTGTCTCTTATACACATCT"; // excluding the reverse of GGG

enum lt_type_e {
	LT_UNKNOWN = 0,
	LT_AMBI_BASE = 1,
	LT_SHORT_SEQ = 2,
	LT_NO_BINDING = 3,
	LT_TOO_MANY_BINDING = 4,
	LT_POST_PROMOTER = 5,
/*
 * Outcome assigned to both reads of a pair by lt_process(). The numeric
 * values are part of the output: they are printed verbatim in the tabular
 * output and in the YT:i: tag of the FASTQ header.
 */
enum lt_type_e {
	LT_UNKNOWN = 0,          // not classified yet (initial value set by bseq_read())
	LT_AMBI_BASE = 1,        // an 'N' remains in the merged fragment or in the unmerged pair
	LT_SHORT_SEQ = 2,        // a read is shorter than min_seq_len after Illumina adapter trimming
	LT_NO_BINDING = 3,       // the binding motif was found in neither read
	LT_TOO_MANY_BINDING = 4, // a read reached MAX_BINDING_HITS binding-motif hits
	LT_POST_PROMOTER = 5,    // a promoter k-mer was found downstream of the binding motif
	LT_CHIMERA = 6,          // both reads carry the binding motif, neither is preceded by the promoter
	LT_SHORT_MERGE = 11,     // merged fragment is shorter than min_seq_len
	LT_SHORT_PE = 12,        // unmerged pair with a read shorter than min_seq_len
	LT_MERGED = 21,          // the two reads were merged into a single fragment
	LT_NO_MERGE = 22,        // no acceptable overlap between the two reads
	LT_AMBI_MERGE = 23       // more than one candidate overlap
};

/* Run-time parameters of the trim command. */
typedef struct {
	int n_threads;                  // worker threads handed to kt_for()
	int chunk_size;                 // bseq_read() ends a batch once this many bases were read
	int min_seq_len;                // minimum read/fragment length to output
	int max_qual;                   // cap on the Phred score of a merged base
	int max_ovlp_pen, min_ovlp_len; // penalty budget / minimum length of a read-read overlap
	int max_adap_pen, min_adap_len; // same, for adapter and oligo trimming
	int max_bc_pen, min_bc_len;     // same, for the promoter test upstream of the binding motif
	int bc_len;                     // barcode length
	int tab_out;                    // non-zero: tabular output instead of FASTQ
} lt_opt_t;

/* Reset *opt to the built-in defaults; every field not listed below is zero. */
static void lt_opt_init(lt_opt_t *opt)
{
	static const lt_opt_t defaults = {
		.n_threads    = 2,
		.chunk_size   = 10000000,
		.min_seq_len  = 40,
		.max_qual     = 50,
		.max_ovlp_pen = 2,
		.min_ovlp_len = 8,
		.max_adap_pen = 1,
		.min_adap_len = 3,
		.max_bc_pen   = 2,
		.min_bc_len   = 6,
		.bc_len       = 8
	};
	*opt = defaults;
}
}; 96 | 97 | static lt_seqcloud_t *lt_sc_init(void) 98 | { 99 | lt_seqcloud_t *sc; 100 | sc = (lt_seqcloud_t*)calloc(1, sizeof(lt_seqcloud_t)); 101 | sc->mm = kh_init(s64); 102 | return sc; 103 | } 104 | 105 | void lt_sc_destroy(lt_seqcloud_t *sc) 106 | { 107 | kh_destroy(s64, sc->mm); 108 | free(sc); 109 | } 110 | 111 | void lt_sc_add_core(lt_seqcloud_t *sc, uint64_t s) 112 | { 113 | int i, absent; 114 | sc->s = s = s & ((1ULL<l*2) - 1); 115 | for (i = 0; i < sc->l; ++i) { 116 | int i2 = i * 2, a, c = s>>i2&3; 117 | for (a = 1; a < 4; ++a) { 118 | uint64_t x = (s & ~(3ULL << i2)) | (uint64_t)((a+c)&3) << i2; 119 | kh_put(s64, sc->mm, x, &absent); 120 | } 121 | } 122 | } 123 | 124 | lt_seqcloud_t *lt_sc_gen(const char *s) 125 | { 126 | lt_seqcloud_t *sc; 127 | uint64_t x = 0; 128 | int i; 129 | sc = lt_sc_init(); 130 | sc->l = strlen(s); 131 | for (i = 0; s[i] && i < sc->l; ++i) { 132 | int c = seq_nt4_table[(uint8_t)s[i]]; 133 | if (c > 3) { 134 | lt_sc_destroy(sc); 135 | return 0; 136 | } 137 | x = x << 2 | c; 138 | } 139 | lt_sc_add_core(sc, x); 140 | return sc; 141 | } 142 | 143 | typedef struct { 144 | uint32_t pos:30, type:2; 145 | } lt_sc_hit_t; 146 | 147 | int lt_sc_test(const lt_seqcloud_t *sc, const char *seq, int max_hits, lt_sc_hit_t *hits) 148 | { 149 | int i, l, n = 0; 150 | uint64_t x = 0, mask = (1ULL << sc->l*2) - 1; 151 | for (i = l = 0; seq[i]; ++i) { 152 | int c = seq_nt4_table[(uint8_t)seq[i]]; 153 | if (c < 4) { 154 | x = (x << 2 | c) & mask; 155 | if (++l >= sc->l) { 156 | if (x == sc->s || kh_get(s64, sc->mm, x) != kh_end(sc->mm)) { 157 | hits[n].pos = i - (sc->l - 1); 158 | hits[n++].type = x == sc->s? 
/**********************
 * Reverse complement *
 **********************/

/*
 * Complement of each 7-bit character. Letters map to their IUPAC complement
 * with case preserved ('U'/'u' map to 'A'/'a'); every other character maps
 * to itself.
 */
char comp_tab[] = {
	  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
	 16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
	 32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
	 48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
	 64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O',
	'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z',  91,  92,  93,  94,  95,
	 96, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o',
	'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127
};

/*
 * Write the first l bytes of f to r in reverse order and NUL-terminate r.
 * r must have room for l + 1 bytes and must not overlap f.
 */
void lt_seq_rev(int l, const char *f, char *r)
{
	int i;
	for (i = 0; i < l; ++i)
		r[l - i - 1] = f[i];
	r[l] = 0;
}

/*
 * Write the reverse complement of the first l bytes of f to r and
 * NUL-terminate r. Bytes >= 128 are outside comp_tab and become 'N'.
 * r must have room for l + 1 bytes and must not overlap f.
 */
void lt_seq_revcomp(int l, const char *f, char *r)
{
	int i;
	for (i = 0; i < l; ++i)
		r[l - i - 1] = (uint8_t)f[i] >= 128? 'N' : comp_tab[(uint8_t)f[i]];
	r[l] = 0;
}
/**********************
 * Ungapped extension *
 **********************/

#define LT_QUAL_THRES 53 // =33+20
#define LT_HIGH_PEN   3
#define LT_LOW_PEN    1

/*
 * Penalty of one mismatch: LT_HIGH_PEN when every available quality at that
 * position is at least LT_QUAL_THRES, LT_LOW_PEN otherwise. A null quality
 * string (an adapter, or a record without qualities) counts as high quality.
 */
static inline int lt_ue_mm_pen(const char *q1, int i1, const char *q2, int i2)
{
	return (q1 == 0 || q1[i1] >= LT_QUAL_THRES) && (q2 == 0 || q2[i2] >= LT_QUAL_THRES)? LT_HIGH_PEN : LT_LOW_PEN;
}

/*
 * Extend an ungapped alignment of s1 against s2 from their first bases.
 *
 * Mismatch penalties are accumulated. The extension stops at a mismatch that
 * brings the total above max_pen while at most min_len bases in, or above
 * max_pen * i / min_len further in. q1 and q2 hold the base qualities
 * (Phred+33) of s1 and s2; either may be null.
 *
 * Returns the number of aligned columns before the stop, i.e. the index of
 * the rejected mismatch, or min(l1, l2) if the extension was never stopped.
 */
int lt_ue_for1(int l1, const char *s1, const char *q1, int l2, const char *s2, const char *q2, int min_len, int max_pen)
{
	int i, pen = 0;
	for (i = 0; i < l1 && i < l2; ++i) {
		if (s1[i] != s2[i]) {
			pen += lt_ue_mm_pen(q1, i, q2, i);
			if (i <= min_len && pen > max_pen) break;
			if (i > min_len && pen * min_len > i * max_pen) break; // in effect: pen > max_pen * ((double)i / min_len)
		}
	}
	return i;
}

/* Same as lt_ue_for1(), but extends leftwards from the last bases of s1 and s2. */
int lt_ue_rev1(int l1, const char *s1, const char *q1, int l2, const char *s2, const char *q2, int min_len, int max_pen)
{
	int i, pen = 0;
	for (i = 0; i < l1 && i < l2; ++i) {
		if (s1[l1-1-i] != s2[l2-1-i]) {
			pen += lt_ue_mm_pen(q1, l1-1-i, q2, l2-1-i);
			if (i <= min_len && pen > max_pen) break;
			if (i > min_len && pen * min_len > i * max_pen) break;
		}
	}
	return i;
}

/*
 * Find overlaps between a suffix of s1 and a prefix of s2.
 *
 * Every suffix of s1 of length i = min_len..l1 is extended against the start
 * of s2. A hit is recorded when the extension covers at least min_len bases
 * and reaches the end of the suffix or the end of s2. Each hit is stored as
 * (start of the suffix in s1) << 32 | (extension length), shortest suffix
 * first. At most max_pos hits are written to pos[].
 *
 * Returns the number of hits written.
 */
int lt_ue_for(int l1, const char *s1, const char *q1, int l2, const char *s2, const char *q2, int max_pen, int min_len, int max_pos, uint64_t *pos)
{
	int i, n = 0;
	if (max_pos <= 0) return 0; // pos[] has no room at all
	for (i = min_len; i <= l1; ++i) {
		int l;
		l = lt_ue_for1(i, s1 + l1 - i, q1? q1 + l1 - i : 0, l2, s2, q2, min_len, max_pen);
		if (l >= min_len && (l == i || l == l2)) {
			pos[n++] = (uint64_t)(l1 - i) << 32 | l;
			if (n == max_pos) return n;
		}
	}
	return n;
}

/*
 * Find overlaps between a prefix of s1 and a suffix of s2.
 *
 * Every prefix of s1 of length i = min_len..l1 is extended leftwards against
 * the end of s2. The acceptance rule and the limit max_pos are those of
 * lt_ue_for(). Each hit is stored as (l1 - i) << 32 | (extension length).
 *
 * Returns the number of hits written.
 */
int lt_ue_rev(int l1, const char *s1, const char *q1, int l2, const char *s2, const char *q2, int max_pen, int min_len, int max_pos, uint64_t *pos)
{
	int i, n = 0;
	if (max_pos <= 0) return 0;
	for (i = min_len; i <= l1; ++i) {
		int l;
		l = lt_ue_rev1(i, s1, q1, l2, s2, q2, min_len, max_pen);
		if (l >= min_len && (l == i || l == l2)) {
			pos[n++] = (uint64_t)(l1 - i) << 32 | l;
			if (n == max_pos) return n;
		}
	}
	return n;
}

/*
 * Find placements of the whole of s1 strictly inside s2, at offsets
 * 1 .. l2-l1-1 of s2 (the two flush placements are not tried here).
 * Each hit is stored as (offset in s2) << 32 | l1. At most max_pos hits
 * are written to pos[].
 *
 * Returns the number of hits written.
 */
int lt_ue_contained(int l1, const char *s1, const char *q1, int l2, const char *s2, const char *q2, int max_pen, int max_pos, uint64_t *pos)
{
	int i, n = 0;
	if (max_pos <= 0) return 0;
	for (i = 1; i < l2 - l1; ++i) {
		int l;
		// keep a null q2 null: offsetting it would defeat the null test in the extension
		l = lt_ue_for1(l1, s1, q1, l2 - i, s2 + i, q2? q2 + i : 0, l1, max_pen);
		if (l == l1) {
			pos[n++] = (uint64_t)i << 32 | l;
			if (n == max_pos) return n;
		}
	}
	return n;
}
max_pen); 266 | if (l == l1) { 267 | pos[n++] = (uint64_t)i << 32 | l; 268 | if (n == max_pos) return n; 269 | } 270 | } 271 | return n; 272 | } 273 | 274 | /********************** 275 | * Batch FASTQ reader * 276 | **********************/ 277 | 278 | #include 279 | #include "kseq.h" 280 | KSEQ_INIT(gzFile, gzread) 281 | 282 | typedef struct { 283 | uint32_t l_seq:31, dbl_bind:1; 284 | enum lt_type_e type; 285 | char *name, *seq, *qual, *bc; 286 | } bseq1_t; 287 | 288 | bseq1_t *bseq_read(kseq_t *ks, int chunk_size, int *n_) 289 | { 290 | int size = 0, m, n; 291 | bseq1_t *seqs; 292 | m = n = 0; seqs = 0; 293 | while (kseq_read(ks) >= 0) { 294 | bseq1_t *s; 295 | if (n >= m) { 296 | m = m? m<<1 : 256; 297 | seqs = realloc(seqs, m * sizeof(bseq1_t)); 298 | } 299 | s = &seqs[n]; 300 | s->name = strdup(ks->name.s); 301 | s->seq = strdup(ks->seq.s); 302 | s->qual = ks->qual.l? strdup(ks->qual.s) : 0; 303 | s->bc = 0; 304 | s->l_seq = ks->seq.l; 305 | s->dbl_bind = 0; 306 | s->type = LT_UNKNOWN; 307 | size += seqs[n++].l_seq; 308 | if (size >= chunk_size && (n&1) == 0) break; 309 | } 310 | *n_ = n; 311 | return seqs; 312 | } 313 | 314 | /********************************* 315 | * Core trimming/merging routine * 316 | *********************************/ 317 | 318 | typedef struct { 319 | lt_opt_t opt; 320 | lt_seqcloud_t *sc_bind, *sc_prom; 321 | kseq_t *ks; 322 | } lt_global_t; 323 | 324 | void lt_global_init(lt_global_t *g) 325 | { 326 | memset(g, 0, sizeof(lt_global_t)); 327 | lt_opt_init(&g->opt); 328 | } 329 | 330 | #define MAX_BINDING_HITS 3 331 | 332 | static inline void trim_bseq_5(bseq1_t *s, int l) 333 | { 334 | memmove(s->seq, s->seq + l, s->l_seq - l); 335 | memmove(s->qual, s->qual + l, s->l_seq - l); 336 | s->l_seq -= l; 337 | s->seq[s->l_seq] = s->qual[s->l_seq] = 0; 338 | } 339 | 340 | static inline int merge_base(int max_qual, char fc, char fq, char rc, char rq) 341 | { 342 | int y; 343 | if (fc == rc) { 344 | int q = fq > rq? 
/*
 * Combine two observations of the same base: fc/fq from the forward read and
 * rc/rq from the reverse read (qualities are Phred+33 characters).
 *
 * - Identical bases: the Phred score is the larger score plus half of the
 *   smaller one, capped at max_qual.
 * - Different bases: the base with the higher quality wins and the Phred
 *   score is the difference of the two; on a tie the reverse base is kept
 *   with Phred 0. The comparison is case-sensitive, so 'a' and 'A' count as
 *   different here.
 *
 * Returns the upper-cased base in the low 8 bits and the quality character
 * in the bits above: base = (uint8_t)y, qual = y >> 8.
 */
static inline int merge_base(int max_qual, char fc, char fq, char rc, char rq)
{
	int y;
	if (fc == rc) {
		int q = fq > rq? (fq - 33) + (rq - 33) / 2 : (rq - 33) + (fq - 33) / 2;
		// toupper() requires an unsigned char value; a negative char would also
		// sign-extend into the quality bits
		y = toupper((unsigned char)fc) | (33 + (q < max_qual? q : max_qual)) << 8;
	} else {
		if (fq > rq) y = toupper((unsigned char)fc) | (33 + (fq - rq)) << 8;
		else y = toupper((unsigned char)rc) | (33 + (rq - fq)) << 8;
	}
	return y;
}
/*
 * Trim the adapter/oligo adap from one end of read s, in place.
 *
 * is_5 selects the end: non-zero matches the end of adap against the start
 * of the read (lt_ue_rev), zero matches the start of adap against the end of
 * the read (lt_ue_for). Up to four hits are collected; each is encoded as
 * (offset << 32 | match length).
 *
 * Unless allow_contained is set, the hits are used only if the first or the
 * last one satisfies offset + length == read length, i.e. its match length
 * equals the length of the read segment that was tested.
 *
 * The hit reported last decides how many bases are affected (len). If len
 * exceeds min_len the bases are removed; otherwise they are kept and their
 * qualities are lowered to Phred 1.
 *
 * NOTE(review): s->qual is passed on and written without a null check, so
 * this presumably expects FASTQ records only - confirm.
 */
static inline void trim_adap(bseq1_t *s, const char *adap, int is_5, int min_len, int max_pen, int allow_contained)
{
	int n_hits, l_adap;
	uint64_t hits[4];
	l_adap = strlen(adap);
	if (is_5) n_hits = lt_ue_rev(s->l_seq, s->seq, s->qual, l_adap, adap, 0, max_pen, min_len, 4, hits);
	else n_hits = lt_ue_for(s->l_seq, s->seq, s->qual, l_adap, adap, 0, max_pen, min_len, 4, hits);
	if (n_hits > 0 && (allow_contained || (hits[0]>>32) + (uint32_t)hits[0] == s->l_seq || (hits[n_hits-1]>>32) + (uint32_t)hits[n_hits-1] == s->l_seq)) {
		int len = s->l_seq - (hits[n_hits-1]>>32); // trim the longest hit
		if (is_5) {
			if (len > min_len) trim_bseq_5(s, len); // trim
			else memset(s->qual, 33+1, len); // reduce baseQ
		} else {
			if (len > min_len) s->l_seq -= len, s->seq[s->l_seq] = s->qual[s->l_seq] = 0; // trim
			else memset(s->qual + (s->l_seq - len), 33+1, len); // reduce baseQ
		}
	}
}

/*
 * Classify one read pair and, where possible, trim and merge it in place.
 *
 * s[0] and s[1] are the two reads of the pair; g supplies the options and
 * the k-mer clouds of the binding motif and the promoter. Steps, in order:
 *   1. strip leading/trailing 'N' and the Illumina adapters;
 *   2. locate the binding motif in both reads and reject short, unbound,
 *      over-bound and chimeric pairs;
 *   3. for a pair with the motif in exactly one read (called f below; the
 *      other one is r), extract the barcode, reject pairs with a promoter
 *      k-mer downstream of the motif, then look for a unique overlap between
 *      f and the reverse complement of r and merge the two into s[0];
 *   4. for pairs that are kept, trim the remaining oligo/binding sequences.
 * Both reads always receive the same type. After a merge s[1].l_seq is 0.
 *
 * The scratch buffers come from alloca() and are sized by the read lengths.
 * NOTE(review): qualities are dereferenced unconditionally, so both reads
 * are presumably expected to carry a quality string - confirm.
 */
void lt_process(const lt_global_t *g, bseq1_t s[2])
{
	int i, k, n_hits[2], mlen;
	lt_sc_hit_t hits[2][MAX_BINDING_HITS];
	char *rseq, *rqual, *xseq, *xqual, *bc;

	// rseq/rqual: reverse complement of read r; xseq/xqual: merged fragment; bc: barcode
	mlen = s[0].l_seq > s[1].l_seq? s[0].l_seq : s[1].l_seq;
	rseq = (char*)alloca(mlen + 1);
	rqual = (char*)alloca(mlen + 1);
	xseq = (char*)alloca(s[0].l_seq + s[1].l_seq + 1);
	xqual = (char*)alloca(s[0].l_seq + s[1].l_seq + 1);
	bc = (char*)alloca(mlen + 1);

	// trim heading and trailing N
	for (k = 0; k < 2; ++k) {
		bseq1_t *sk = &s[k];
		for (i = sk->l_seq - 1; i >= 0; --i) // trim trailing "N"
			if (sk->seq[i] != 'N') break;
		sk->l_seq = i + 1;
		sk->seq[sk->l_seq] = sk->qual[sk->l_seq] = 0;
		for (i = 0; i < sk->l_seq; ++i) // trim heading "N"
			if (sk->seq[i] != 'N') break;
		if (i) trim_bseq_5(sk, i);
	}
	// trim Illumina PE adapters
	trim_adap(&s[0], lt_adapter1, 0, g->opt.min_adap_len, g->opt.max_adap_pen, 1);
	trim_adap(&s[1], lt_adapter2, 0, g->opt.min_adap_len, g->opt.max_adap_pen, 1);
	// find binding motifs
	for (k = 0; k < 2; ++k)
		n_hits[k] = lt_sc_test(g->sc_bind, s[k].seq, MAX_BINDING_HITS, hits[k]);
	if (s[0].l_seq < g->opt.min_seq_len || s[1].l_seq < g->opt.min_seq_len) {
		s[0].type = s[1].type = LT_SHORT_SEQ;
	} else if (n_hits[0] + n_hits[1] == 0) {
		s[0].type = s[1].type = LT_NO_BINDING;
	} else if (n_hits[0] == MAX_BINDING_HITS || n_hits[1] == MAX_BINDING_HITS) {
		s[0].type = s[1].type = LT_TOO_MANY_BINDING;
	} else if (n_hits[0] > 0 && n_hits[1] > 0) { // both ends contain the binding motif
		int bpos[2], l_prom[2];
		s[0].dbl_bind = s[1].dbl_bind = 1;
		for (i = 0; i < 2; ++i) {
			// length of the promoter match that ends right before the last binding motif
			bpos[i] = hits[i][n_hits[i] - 1].pos;
			l_prom[i] = lt_ue_rev1(bpos[i], s[i].seq, s[i].qual, g->sc_prom->l, lt_promoter, 0, g->opt.min_bc_len, g->opt.max_bc_pen);
			// the match must reach the read start or span the whole promoter
			if (l_prom[i] != bpos[i] && l_prom[i] != g->sc_prom->l)
				l_prom[i] = 0;
			if (l_prom[i] < g->opt.min_bc_len) l_prom[i] = 0;
		}
		if (l_prom[0] == 0 && l_prom[1] == 0) {
			s[0].type = s[1].type = LT_CHIMERA;
		} else {
			// w: the read with the longer promoter match; cut it through the motif and treat it as unbound from here on
			int w = l_prom[0] < l_prom[1]? 1 : 0;
			trim_bseq_5(&s[w], bpos[w] + g->sc_bind->l);
			n_hits[w] = 0;
		}
	}
	// find end overlaps
	if ((n_hits[0] == 0 || n_hits[1] == 0) && s[0].type == LT_UNKNOWN) {
		int f, r, fpos, bpos, l_bc_prom;
		lt_sc_hit_t hits_prom;
		f = n_hits[0]? 0 : 1; // f: the read with the binding motif; r: the other read
		r = f^1;
		bpos = hits[f][n_hits[f] - 1].pos; // start of the last binding motif in read f
		fpos = bpos + g->sc_bind->l;       // first base after that motif
		// barcode
		l_bc_prom = lt_ue_rev1(bpos, s[f].seq, s[f].qual, g->sc_prom->l, lt_promoter, 0, g->opt.min_bc_len, g->opt.max_bc_pen); // test if "barcode" is actually the T7 promoter
		if (l_bc_prom >= g->opt.min_bc_len) {
			bc[0] = '*', bc[1] = 0;
		} else if (bpos < g->opt.bc_len) { // fewer than bc_len bases precede the motif: take them all
			strncpy(bc, s[f].seq, bpos);
			bc[bpos] = 0;
		} else { // the bc_len bases right before the motif
			strncpy(bc, s[f].seq + (bpos - g->opt.bc_len), g->opt.bc_len);
			bc[g->opt.bc_len] = 0;
		}
		s[0].bc = strdup(bc);
		s[1].bc = strdup(bc);
		// test merge and promoter sequences
		if (lt_sc_test(g->sc_prom, &s[f].seq[fpos], 1, &hits_prom) > 0) {
			s[0].type = s[1].type = LT_POST_PROMOTER;
		} else {
			int n_fh, n_rh, n_ch;
			uint64_t fh[2], rh[2], ch[2];
			// reverse the other read
			lt_seq_revcomp(s[r].l_seq, s[r].seq, rseq);
			lt_seq_rev(s[r].l_seq, s[r].qual, rqual);
			// find overlaps
			n_fh = lt_ue_for(s[f].l_seq - bpos, &s[f].seq[bpos], &s[f].qual[bpos], s[r].l_seq, rseq, rqual, g->opt.max_ovlp_pen, g->opt.min_ovlp_len, 2, fh);
			n_rh = lt_ue_rev(s[f].l_seq - bpos, &s[f].seq[bpos], &s[f].qual[bpos], s[r].l_seq, rseq, rqual, g->opt.max_ovlp_pen, g->opt.min_ovlp_len, 2, rh);
			n_ch = lt_ue_contained(s[f].l_seq - bpos, &s[f].seq[bpos], &s[f].qual[bpos], s[r].l_seq, rseq, rqual, g->opt.max_ovlp_pen, 2, ch);
			if (n_fh + n_rh + n_ch > 1) {
				s[0].type = s[1].type = LT_AMBI_MERGE;
			} else if (n_fh + n_rh + n_ch == 0) {
				s[0].type = s[1].type = LT_NO_MERGE;
			} else {
				int x = 0; // number of bases written to xseq/xqual
				s[0].type = s[1].type = LT_MERGED;
				if (n_fh == 1) {
					/* GGG19bp------------------->
					              <---------------   */
					int l = (uint32_t)fh[0], st = fh[0]>>32;
					// part of read f between the motif and the overlap
					for (i = fpos; i < bpos + st; ++i)
						xseq[x] = s[f].seq[i], xqual[x++] = s[f].qual[i];
					// the overlap, skipping any part that lies inside the motif
					for (i = fpos < bpos + st? 0 : fpos - (bpos + st); i < l; ++i) {
						int j = bpos + st + i, y;
						y = merge_base(g->opt.max_qual, s[f].seq[j], s[f].qual[j], rseq[i], rqual[i]);
						xseq[x] = (uint8_t)y, xqual[x++] = y>>8;
					}
					if (l < s[r].l_seq) { // read r extends beyond the overlap
						for (i = l; i < s[r].l_seq; ++i)
							xseq[x] = rseq[i], xqual[x++] = rqual[i];
					} else { // read r is used up: append what is left of read f
						for (i = bpos + l; i < s[f].l_seq; ++i)
							xseq[x] = s[f].seq[i], xqual[x++] = s[f].qual[i];
					}
				} else if (n_rh == 1) {
					/* GGG19bp--------------------->
					   <--------------------------   */
					int l = (uint32_t)rh[0], st = rh[0]>>32, l2;
					for (i = fpos; i < s[f].l_seq - st - l; ++i)
						xseq[x] = s[f].seq[i], xqual[x++] = s[f].qual[i];
					// l2: overlap length once the part inside the motif is excluded
					l2 = fpos < s[f].l_seq - st - l? l : s[f].l_seq - st - fpos;
					for (i = s[r].l_seq - l2; i < s[r].l_seq; ++i) {
						int j = i - s[r].l_seq + (s[f].l_seq - st), y;
						y = merge_base(g->opt.max_qual, s[f].seq[j], s[f].qual[j], rseq[i], rqual[i]);
						xseq[x] = (uint8_t)y, xqual[x++] = y>>8;
					}
				} else if (n_ch == 1) {
					/*    GGG19bp---------------->
					   <-----------------------------   */
					int j, st = ch[0]>>32;
					for (j = fpos; j < s[f].l_seq; ++j) {
						int i = j + st - bpos, y;
						y = merge_base(g->opt.max_qual, s[f].seq[j], s[f].qual[j], rseq[i], rqual[i]);
						xseq[x] = (uint8_t)y, xqual[x++] = y>>8;
					}
				}
				xseq[x] = xqual[x] = 0;
				for (i = 0; i < x; ++i)
					if (xseq[i] == 'N' || xseq[i] == 'n') break;
				if (i != x) s[0].type = s[1].type = LT_AMBI_BASE;
				else if (x < g->opt.min_seq_len) s[0].type = s[1].type = LT_SHORT_MERGE;
				// the merged fragment replaces read 0; read 1 is emptied
				free(s[0].seq); free(s[0].qual);
				s[0].seq = strdup(xseq);
				s[0].qual = strdup(xqual);
				s[0].l_seq = x;
				s[1].l_seq = 0;
			}
		}
		if (s[0].type == LT_AMBI_MERGE || s[0].type == LT_NO_MERGE) {
			int n_ambi = 0;
			trim_bseq_5(&s[f], fpos); // drop everything up to and including the motif
			for (k = 0; k < 2; ++k) {
				for (i = 0; i < s[k].l_seq; ++i)
					if (s[k].seq[i] == 'N') ++n_ambi;
			}
			if (n_ambi) s[0].type = s[1].type = LT_AMBI_BASE;
			else if (s[f].l_seq < g->opt.min_seq_len || s[r].l_seq < g->opt.min_seq_len)
				s[0].type = s[1].type = LT_SHORT_PE;
			if (f == 1) { // swap the reads so that the one with the motif comes first
				char *tmp;
				tmp = s[f].seq, s[f].seq = s[r].seq, s[r].seq = tmp;
				tmp = s[f].qual, s[f].qual = s[r].qual, s[r].qual = tmp;
				i = s[f].l_seq, s[f].l_seq = s[r].l_seq, s[r].l_seq = i;
				f = 0, r = 1;
			}
		}
	}
	// trim residual oligo and binding sequences from the pairs that are kept
	if (s[0].type == LT_MERGED || s[0].type == LT_NO_MERGE || s[0].type == LT_AMBI_MERGE || s[0].type == LT_NO_BINDING) {
		if (s->type == LT_NO_BINDING) {
			for (i = 0; i < 2; ++i) {
				trim_adap(&s[i], lt_oligo_for, 0, g->opt.min_adap_len, g->opt.max_adap_pen, 0);
				trim_adap(&s[i], lt_bind_rev, 0, g->opt.min_adap_len, g->opt.max_adap_pen, 1);
				trim_adap(&s[i], lt_oligo_rev, 1, g->opt.min_adap_len, g->opt.max_adap_pen, 0);
				trim_adap(&s[i], lt_bind, 1, g->opt.min_adap_len, g->opt.max_adap_pen, 1);
			}
		} else {
			trim_adap(&s[0], lt_oligo_for, 0, g->opt.min_adap_len, g->opt.max_adap_pen, 0);
			trim_adap(&s[0], lt_bind_rev, 0, g->opt.min_adap_len, g->opt.max_adap_pen, 1);
			if (s->type != LT_MERGED) {
				trim_adap(&s[1], lt_bind_rev, 0, g->opt.min_adap_len, g->opt.max_adap_pen, 1);
				trim_adap(&s[1], lt_oligo_rev, 1, g->opt.min_adap_len, g->opt.max_adap_pen, 0);
				trim_adap(&s[1], lt_bind, 1, g->opt.min_adap_len, g->opt.max_adap_pen, 1);
			}
		}
	}
}
g->opt.max_adap_pen, 0); 542 | trim_adap(&s[i], lt_bind_rev, 0, g->opt.min_adap_len, g->opt.max_adap_pen, 1); 543 | trim_adap(&s[i], lt_oligo_rev, 1, g->opt.min_adap_len, g->opt.max_adap_pen, 0); 544 | trim_adap(&s[i], lt_bind, 1, g->opt.min_adap_len, g->opt.max_adap_pen, 1); 545 | } 546 | } else { 547 | trim_adap(&s[0], lt_oligo_for, 0, g->opt.min_adap_len, g->opt.max_adap_pen, 0); 548 | trim_adap(&s[0], lt_bind_rev, 0, g->opt.min_adap_len, g->opt.max_adap_pen, 1); 549 | if (s->type != LT_MERGED) { 550 | trim_adap(&s[1], lt_bind_rev, 0, g->opt.min_adap_len, g->opt.max_adap_pen, 1); 551 | trim_adap(&s[1], lt_oligo_rev, 1, g->opt.min_adap_len, g->opt.max_adap_pen, 0); 552 | trim_adap(&s[1], lt_bind, 1, g->opt.min_adap_len, g->opt.max_adap_pen, 1); 553 | } 554 | } 555 | } 556 | } 557 | 558 | /********************** 559 | * Callback functions * 560 | **********************/ 561 | 562 | void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n); 563 | void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps); 564 | 565 | typedef struct { 566 | int n_seqs; 567 | bseq1_t *seqs; 568 | lt_global_t *g; 569 | } data_for_t; 570 | 571 | static void worker_for(void *_data, long i, int tid) 572 | { 573 | data_for_t *data = (data_for_t*)_data; 574 | lt_process(data->g, &data->seqs[i<<1]); 575 | } 576 | 577 | static void *worker_pipeline(void *shared, int step, void *_data) 578 | { 579 | int i; 580 | lt_global_t *g = (lt_global_t*)shared; 581 | if (step == 0) { 582 | data_for_t *ret; 583 | ret = calloc(1, sizeof(data_for_t)); 584 | ret->seqs = bseq_read(g->ks, g->opt.chunk_size, &ret->n_seqs); 585 | assert((ret->n_seqs&1) == 0); 586 | ret->g = g; 587 | if (ret->seqs) return ret; 588 | else free(ret); 589 | } else if (step == 1) { 590 | data_for_t *data = (data_for_t*)_data; 591 | kt_for(g->opt.n_threads, worker_for, data, data->n_seqs>>1); 592 | return data; 593 | } else if (step == 2) { 594 | data_for_t *data = 
(data_for_t*)_data; 595 | if (g->opt.tab_out) { // tabular output 596 | for (i = 0; i < data->n_seqs; i += 2) { 597 | bseq1_t *s = &data->seqs[i]; 598 | printf("%s\t%d\n", s->name, s->type); 599 | } 600 | } else { // FASTQ output (FASTA not supported yet) 601 | for (i = 0; i < data->n_seqs; ++i) { 602 | bseq1_t *s = &data->seqs[i]; 603 | if (s->l_seq > 0 && (s->type == LT_NO_MERGE || s->type == LT_AMBI_MERGE || s->type == LT_MERGED || s->type == LT_NO_BINDING)) { 604 | putchar(s->qual? '@' : '>'); fputs(s->name, stdout); 605 | if (s->type != LT_MERGED) { 606 | putchar('/'); putchar("12"[i&1]); 607 | } 608 | printf(" YT:i:%d", s->type); 609 | if (s->type == LT_NO_MERGE || s->type == LT_AMBI_MERGE || s->type == LT_MERGED) { 610 | if (s->bc) { fputs("\tBC:Z:", stdout); fputs(s->bc[0] == 0? "*" : s->bc, stdout); } 611 | printf("\tYD:i:%d", s->dbl_bind); 612 | } 613 | putchar('\n'); 614 | puts(s->seq); 615 | if (s->qual) { puts("+"); puts(s->qual); } 616 | } 617 | } 618 | } 619 | for (i = 0; i < data->n_seqs; ++i) { // deallocate 620 | bseq1_t *s = &data->seqs[i]; 621 | free(s->bc); free(s->seq); free(s->qual); free(s->name); 622 | } 623 | free(data->seqs); free(data); 624 | } 625 | return 0; 626 | } 627 | 628 | #include 629 | 630 | int main_trim(int argc, char *argv[]) 631 | { 632 | int c; 633 | lt_global_t g; 634 | gzFile fp; 635 | 636 | lt_global_init(&g); 637 | while ((c = getopt(argc, argv, "Tt:b:l:")) >= 0) { 638 | if (c == 't') g.opt.n_threads = atoi(optarg); 639 | else if (c == 'T') g.opt.tab_out = 1; 640 | else if (c == 'b') g.opt.bc_len = atoi(optarg); 641 | else if (c == 'l') g.opt.min_seq_len = atoi(optarg); 642 | } 643 | if (argc - optind < 1) { 644 | fprintf(stderr, "Usage: seqtk mergepe | lianti trim [options] -\n"); 645 | fprintf(stderr, "Options:\n"); 646 | fprintf(stderr, " -t INT number of threads [%d]\n", g.opt.n_threads); 647 | fprintf(stderr, " -b INT barcode length [%d]\n", g.opt.bc_len); 648 | fprintf(stderr, " -l INT min read/fragment length to 
output [%d]\n", g.opt.min_seq_len); 649 | fprintf(stderr, " -T tabular output for debugging\n"); 650 | return 1; 651 | } 652 | 653 | fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); 654 | g.ks = kseq_init(fp); 655 | g.sc_bind = lt_sc_gen(lt_bind); 656 | g.sc_prom = lt_sc_gen(lt_promoter); 657 | 658 | kt_pipeline(2, worker_pipeline, &g, 3); 659 | 660 | lt_sc_destroy(g.sc_prom); 661 | lt_sc_destroy(g.sc_bind); 662 | kseq_destroy(g.ks); 663 | gzclose(fp); 664 | return 0; 665 | } 666 | --------------------------------------------------------------------------------