├── tabix-0.2.6 ├── perl │ ├── typemap │ ├── MANIFEST │ ├── Makefile.PL │ ├── t │ │ ├── 01local.t │ │ └── 02remote.t │ ├── TabixIterator.pm │ ├── Tabix.xs │ └── Tabix.pm ├── bgzf.o ├── bgzip ├── bgzip.o ├── index.o ├── main.o ├── tabix ├── bedidx.o ├── kstring.o ├── knetfile.o ├── libtabix.a ├── example.gtf.gz ├── example.gtf.gz.tbi ├── bam_endian.h ├── Makefile ├── kstring.h ├── knetfile.h ├── python │ ├── setup.py │ ├── test.py │ └── tabixmodule.c ├── tabix.py ├── NEWS ├── tabix.1 ├── bedidx.c ├── kstring.c ├── tabix.h ├── tabix.tex ├── bgzf.h ├── bgzip.c ├── kseq.h ├── ksort.h ├── main.c ├── TabixReader.java ├── khash.h ├── bgzf.c ├── knetfile.c └── ChangeLog ├── test.txt ├── README.md └── fathmm-MKL.py /tabix-0.2.6/perl/typemap: -------------------------------------------------------------------------------- 1 | TYPEMAP 2 | tabix_t* T_PTROBJ 3 | ti_iter_t T_PTROBJ -------------------------------------------------------------------------------- /tabix-0.2.6/bgzf.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/bgzf.o -------------------------------------------------------------------------------- /tabix-0.2.6/bgzip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/bgzip -------------------------------------------------------------------------------- /tabix-0.2.6/bgzip.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/bgzip.o -------------------------------------------------------------------------------- /tabix-0.2.6/index.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/index.o 
-------------------------------------------------------------------------------- /tabix-0.2.6/main.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/main.o -------------------------------------------------------------------------------- /tabix-0.2.6/tabix: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/tabix -------------------------------------------------------------------------------- /tabix-0.2.6/bedidx.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/bedidx.o -------------------------------------------------------------------------------- /tabix-0.2.6/kstring.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/kstring.o -------------------------------------------------------------------------------- /tabix-0.2.6/knetfile.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/knetfile.o -------------------------------------------------------------------------------- /tabix-0.2.6/libtabix.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/libtabix.a -------------------------------------------------------------------------------- /tabix-0.2.6/example.gtf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/example.gtf.gz -------------------------------------------------------------------------------- /tabix-0.2.6/example.gtf.gz.tbi: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/example.gtf.gz.tbi -------------------------------------------------------------------------------- /tabix-0.2.6/perl/MANIFEST: -------------------------------------------------------------------------------- 1 | MANIFEST 2 | typemap 3 | Tabix.xs 4 | Tabix.pm 5 | TabixIterator.pm 6 | Makefile.PL 7 | t/01local.t 8 | t/02remote.t -------------------------------------------------------------------------------- /test.txt: -------------------------------------------------------------------------------- 1 | # Note: records beginning with a '#' are not processed (comments) 2 | # 3 | # The software expects data to be in the following format (comma-separated): chromosome, position, reference and mutant base 4 | # 5 | 1,916549,A,G 6 | 1,935222,C,A 7 | 1,11854785,C,T 8 | 1,11854786,C,T 9 | -------------------------------------------------------------------------------- /tabix-0.2.6/perl/Makefile.PL: -------------------------------------------------------------------------------- 1 | use ExtUtils::MakeMaker; 2 | WriteMakefile( 3 | NAME => 'Tabix', 4 | VERSION_FROM => 'Tabix.pm', 5 | LIBS => ['-lz -L.. 
-ltabix'], 6 | DEFINE => '-D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE', 7 | INC => '-I..', 8 | ); 9 | -------------------------------------------------------------------------------- /tabix-0.2.6/perl/t/01local.t: -------------------------------------------------------------------------------- 1 | #-*-Perl-*- 2 | use Test::More tests => 9; 3 | BEGIN { use_ok('Tabix') }; 4 | 5 | { # C-like low-level interface 6 | my $t = tabix_open("../example.gtf.gz"); 7 | ok($t); 8 | my $iter = tabix_query($t, "chr1", 0, 2000); 9 | ok($iter); 10 | $_ = 0; 11 | ++$_ while (tabix_read($t, $iter)); 12 | is($_, 6); 13 | tabix_iter_free($iter); 14 | @_ = tabix_getnames($t); 15 | is(scalar(@_), 2); 16 | } 17 | 18 | { # OOP high-level interface 19 | my $t = Tabix->new(-data=>"../example.gtf.gz"); 20 | ok($t); 21 | my $iter = $t->query("chr1", 3000, 5000); 22 | ok($iter); 23 | $_ = 0; 24 | ++$_ while ($t->read($iter)); 25 | is($_, 27); 26 | @_ = $t->getnames; 27 | is($_[1], "chr2"); 28 | } 29 | -------------------------------------------------------------------------------- /tabix-0.2.6/perl/TabixIterator.pm: -------------------------------------------------------------------------------- 1 | package TabixIterator; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp qw/croak/; 6 | 7 | require Exporter; 8 | 9 | our @ISA = qw/Exporter/; 10 | our @EXPORT = qw/tabix_iter_free/; 11 | 12 | our $VERSION = '0.2.0'; 13 | 14 | require XSLoader; 15 | XSLoader::load('Tabix', $VERSION); 16 | 17 | sub new { 18 | my $invocant = shift; 19 | my $class = ref($invocant) || $invocant; 20 | my $self = {}; 21 | bless($self, $class); 22 | return $self; 23 | } 24 | 25 | sub set { 26 | my ($self, $iter) = @_; 27 | $self->{_} = $iter; 28 | } 29 | 30 | sub get { 31 | my $self = shift; 32 | return $self->{_}; 33 | } 34 | 35 | sub DESTROY { 36 | my $self = shift; 37 | tabix_iter_free($self->{_}) if ($self->{_}); 38 | } 39 | 40 | 1; 41 | __END__ 42 | 
-------------------------------------------------------------------------------- /tabix-0.2.6/perl/t/02remote.t: -------------------------------------------------------------------------------- 1 | #-*-Perl-*- 2 | use Test::More tests => 9; 3 | BEGIN { use_ok('Tabix') }; 4 | 5 | { # FTP access 6 | my $t = Tabix->new(-data=>"ftp://ftp.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/CEU.SRP000031.2010_03.genotypes.vcf.gz"); 7 | ok($t); 8 | my $iter = $t->query("1", 1000000, 1100000); 9 | ok($iter); 10 | $_ = 0; 11 | ++$_ while ($t->read($iter)); 12 | is($_, 306); 13 | @_ = $t->getnames; 14 | is(scalar(@_), 22); 15 | } 16 | 17 | { # FTP access plus FTP index 18 | my $t = Tabix->new(-data=>"ftp://ftp.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/CEU.SRP000031.2010_03.genotypes.vcf.gz", 19 | -index=>"ftp://ftp.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/CEU.SRP000031.2010_03.genotypes.vcf.gz.tbi"); 20 | ok($t); 21 | my $iter = $t->query("19", 10000000, 10100000); 22 | ok($iter); 23 | $_ = 0; 24 | ++$_ while ($t->read($iter)); 25 | is($_, 268); 26 | @_ = $t->getnames; 27 | is(scalar(@_), 22); 28 | } 29 | -------------------------------------------------------------------------------- /tabix-0.2.6/bam_endian.h: -------------------------------------------------------------------------------- 1 | #ifndef BAM_ENDIAN_H 2 | #define BAM_ENDIAN_H 3 | 4 | #include 5 | 6 | static inline int bam_is_big_endian() 7 | { 8 | long one= 1; 9 | return !(*((char *)(&one))); 10 | } 11 | static inline uint16_t bam_swap_endian_2(uint16_t v) 12 | { 13 | return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); 14 | } 15 | static inline void *bam_swap_endian_2p(void *x) 16 | { 17 | *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); 18 | return x; 19 | } 20 | static inline uint32_t bam_swap_endian_4(uint32_t v) 21 | { 22 | v = ((v & 0x0000FFFFU) << 16) | (v >> 16); 23 | return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); 24 
| } 25 | static inline void *bam_swap_endian_4p(void *x) 26 | { 27 | *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x); 28 | return x; 29 | } 30 | static inline uint64_t bam_swap_endian_8(uint64_t v) 31 | { 32 | v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); 33 | v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); 34 | return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); 35 | } 36 | static inline void *bam_swap_endian_8p(void *x) 37 | { 38 | *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x); 39 | return x; 40 | } 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /tabix-0.2.6/perl/Tabix.xs: -------------------------------------------------------------------------------- 1 | #include "EXTERN.h" 2 | #include "perl.h" 3 | #include "XSUB.h" 4 | 5 | #include 6 | #include "tabix.h" 7 | 8 | MODULE = Tabix PACKAGE = Tabix 9 | 10 | tabix_t* 11 | tabix_open(fn, fnidx=0) 12 | char *fn 13 | char *fnidx 14 | CODE: 15 | RETVAL = ti_open(fn, fnidx); 16 | OUTPUT: 17 | RETVAL 18 | 19 | void 20 | tabix_close(t) 21 | tabix_t *t 22 | CODE: 23 | ti_close(t); 24 | 25 | ti_iter_t 26 | tabix_query(t, seq=0, beg=0, end=0x7fffffff) 27 | tabix_t *t 28 | const char *seq 29 | int beg 30 | int end 31 | PREINIT: 32 | CODE: 33 | RETVAL = ti_query(t, seq, beg, end); 34 | OUTPUT: 35 | RETVAL 36 | 37 | SV* 38 | tabix_read(t, iter) 39 | tabix_t *t 40 | ti_iter_t iter 41 | PREINIT: 42 | const char *s; 43 | int len; 44 | CODE: 45 | s = ti_read(t, iter, &len); 46 | if (s == 0) 47 | return XSRETURN_EMPTY; 48 | RETVAL = newSVpv(s, len); 49 | OUTPUT: 50 | RETVAL 51 | 52 | void 53 | tabix_getnames(t) 54 | tabix_t *t 55 | PREINIT: 56 | const char **names; 57 | int i, n; 58 | PPCODE: 59 | ti_lazy_index_load(t); 60 | names = ti_seqname(t->idx, &n); 61 | for (i = 0; i < n; ++i) 62 | XPUSHs(sv_2mortal(newSVpv(names[i], 0))); 63 | free(names); 64 | 65 | MODULE = Tabix PACKAGE = TabixIterator 66 | 67 | 
void 68 | tabix_iter_free(iter) 69 | ti_iter_t iter 70 | CODE: 71 | ti_iter_destroy(iter); 72 | -------------------------------------------------------------------------------- /tabix-0.2.6/perl/Tabix.pm: -------------------------------------------------------------------------------- 1 | package Tabix; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp qw/croak/; 6 | 7 | use TabixIterator; 8 | 9 | require Exporter; 10 | 11 | our @ISA = qw/Exporter/; 12 | our @EXPORT = qw/tabix_open tabix_close tabix_read tabix_query tabix_getnames tabix_iter_free/; 13 | 14 | our $VERSION = '0.2.0'; 15 | 16 | require XSLoader; 17 | XSLoader::load('Tabix', $VERSION); 18 | 19 | sub new { 20 | my $invocant = shift; 21 | my %args = @_; 22 | $args{-data} || croak("-data argument required"); 23 | my $class = ref($invocant) || $invocant; 24 | my $self = {}; 25 | bless($self, $class); 26 | $self->open($args{-data}, $args{-index}); 27 | return $self; 28 | } 29 | 30 | sub open { 31 | my ($self, $fn, $fnidx) = @_; 32 | $self->close; 33 | $self->{_fn} = $fn; 34 | $self->{_fnidx} = $fnidx; 35 | $self->{_} = $fnidx? 
tabix_open($fn, $fnidx) : tabix_open($fn); 36 | } 37 | 38 | sub close { 39 | my $self = shift; 40 | if ($self->{_}) { 41 | tabix_close($self->{_}); 42 | delete($self->{_}); delete($self->{_fn}); delete($self->{_fnidx}); 43 | } 44 | } 45 | 46 | sub DESTROY { 47 | my $self = shift; 48 | $self->close; 49 | } 50 | 51 | sub query { 52 | my $self = shift; 53 | my $iter; 54 | if (@_) { 55 | $iter = tabix_query($self->{_}, @_); 56 | } else { 57 | $iter = tabix_query($self->{_}); 58 | } 59 | my $i = TabixIterator->new; 60 | $i->set($iter); 61 | return $i; 62 | } 63 | 64 | sub read { 65 | my $self = shift; 66 | my $iter = shift; 67 | return tabix_read($self->{_}, $iter->get); 68 | } 69 | 70 | sub getnames { 71 | my $self = shift; 72 | return tabix_getnames($self->{_}); 73 | } 74 | 75 | 1; 76 | __END__ 77 | -------------------------------------------------------------------------------- /tabix-0.2.6/Makefile: -------------------------------------------------------------------------------- 1 | CC= gcc 2 | CFLAGS= -g -Wall -O2 -fPIC #-m64 #-arch ppc 3 | DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -DBGZF_CACHE 4 | LOBJS= bgzf.o kstring.o knetfile.o index.o bedidx.o 5 | AOBJS= main.o 6 | PROG= tabix bgzip 7 | INCLUDES= 8 | SUBDIRS= . 
9 | LIBPATH= 10 | LIBCURSES= 11 | 12 | .SUFFIXES:.c .o 13 | 14 | .c.o: 15 | $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ 16 | 17 | all-recur lib-recur clean-recur cleanlocal-recur install-recur: 18 | @target=`echo $@ | sed s/-recur//`; \ 19 | wdir=`pwd`; \ 20 | list='$(SUBDIRS)'; for subdir in $$list; do \ 21 | cd $$subdir; \ 22 | $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ 23 | INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \ 24 | cd $$wdir; \ 25 | done; 26 | 27 | all:$(PROG) 28 | 29 | lib:libtabix.a 30 | 31 | libtabix.so.1:$(LOBJS) 32 | $(CC) -shared -Wl,-soname,libtabix.so -o $@ $(LOBJS) -lc -lz 33 | 34 | libtabix.1.dylib:$(LOBJS) 35 | libtool -dynamic $(LOBJS) -o $@ -lc -lz 36 | 37 | libtabix.a:$(LOBJS) 38 | $(AR) -csru $@ $(LOBJS) 39 | 40 | tabix:lib $(AOBJS) 41 | $(CC) $(CFLAGS) -o $@ $(AOBJS) -L. -ltabix -lm $(LIBPATH) -lz 42 | 43 | bgzip:bgzip.o bgzf.o knetfile.o 44 | $(CC) $(CFLAGS) -o $@ bgzip.o bgzf.o knetfile.o -lz 45 | 46 | TabixReader.class:TabixReader.java 47 | javac -cp .:sam.jar TabixReader.java 48 | 49 | kstring.o:kstring.h 50 | knetfile.o:knetfile.h 51 | bgzf.o:bgzf.h knetfile.h 52 | index.o:bgzf.h tabix.h khash.h ksort.h kstring.h 53 | main.o:tabix.h kstring.h bgzf.h 54 | bgzip.o:bgzf.h 55 | bedidx.o:kseq.h khash.h 56 | 57 | tabix.pdf:tabix.tex 58 | pdflatex tabix.tex 59 | 60 | cleanlocal: 61 | rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a tabix.aux tabix.log tabix.pdf *.class libtabix.*.dylib libtabix.so* 62 | 63 | clean:cleanlocal-recur 64 | -------------------------------------------------------------------------------- /tabix-0.2.6/kstring.h: -------------------------------------------------------------------------------- 1 | #ifndef KSTRING_H 2 | #define KSTRING_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #ifndef kroundup32 9 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 10 | #endif 11 | 12 | #ifndef KSTRING_T 13 | #define KSTRING_T 
kstring_t 14 | typedef struct __kstring_t { 15 | size_t l, m; 16 | char *s; 17 | } kstring_t; 18 | #endif 19 | 20 | int ksprintf(kstring_t *s, const char *fmt, ...); 21 | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); 22 | 23 | // calculate the auxiliary array, allocated by calloc() 24 | int *ksBM_prep(const uint8_t *pat, int m); 25 | 26 | /* Search pat in str and returned the list of matches. The size of the 27 | * list is returned as n_matches. _prep is the array returned by 28 | * ksBM_prep(). If it is a NULL pointer, ksBM_prep() will be called. */ 29 | int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches); 30 | 31 | static inline int kputsn(const char *p, int l, kstring_t *s) 32 | { 33 | if (s->l + l + 1 >= s->m) { 34 | s->m = s->l + l + 2; 35 | kroundup32(s->m); 36 | s->s = (char*)realloc(s->s, s->m); 37 | } 38 | strncpy(s->s + s->l, p, l); 39 | s->l += l; 40 | s->s[s->l] = 0; 41 | return l; 42 | } 43 | 44 | static inline int kputs(const char *p, kstring_t *s) 45 | { 46 | return kputsn(p, strlen(p), s); 47 | } 48 | 49 | static inline int kputc(int c, kstring_t *s) 50 | { 51 | if (s->l + 1 >= s->m) { 52 | s->m = s->l + 2; 53 | kroundup32(s->m); 54 | s->s = (char*)realloc(s->s, s->m); 55 | } 56 | s->s[s->l++] = c; 57 | s->s[s->l] = 0; 58 | return c; 59 | } 60 | 61 | static inline int *ksplit(kstring_t *s, int delimiter, int *n) 62 | { 63 | int max = 0, *offsets = 0; 64 | *n = ksplit_core(s->s, delimiter, &max, &offsets); 65 | return offsets; 66 | } 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /tabix-0.2.6/knetfile.h: -------------------------------------------------------------------------------- 1 | #ifndef KNETFILE_H 2 | #define KNETFILE_H 3 | 4 | #include 5 | #include 6 | 7 | #ifndef _WIN32 8 | #define netread(fd, ptr, len) read(fd, ptr, len) 9 | #define netwrite(fd, ptr, len) write(fd, ptr, len) 10 | #define netclose(fd) close(fd) 11 | 
#else 12 | #include 13 | #define netread(fd, ptr, len) recv(fd, ptr, len, 0) 14 | #define netwrite(fd, ptr, len) send(fd, ptr, len, 0) 15 | #define netclose(fd) closesocket(fd) 16 | #endif 17 | 18 | // FIXME: currently I/O is unbuffered 19 | 20 | #define KNF_TYPE_LOCAL 1 21 | #define KNF_TYPE_FTP 2 22 | #define KNF_TYPE_HTTP 3 23 | 24 | typedef struct knetFile_s { 25 | int type, fd; 26 | int64_t offset; 27 | char *host, *port; 28 | 29 | // the following are for FTP only 30 | int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; 31 | char *response, *retr, *size_cmd; 32 | int64_t seek_offset; // for lazy seek 33 | int64_t file_size; 34 | 35 | // the following are for HTTP only 36 | char *path, *http_host; 37 | } knetFile; 38 | 39 | #define knet_tell(fp) ((fp)->offset) 40 | #define knet_fileno(fp) ((fp)->fd) 41 | 42 | #ifdef __cplusplus 43 | extern "C" { 44 | #endif 45 | 46 | #ifdef _WIN32 47 | int knet_win32_init(); 48 | void knet_win32_destroy(); 49 | #endif 50 | 51 | knetFile *knet_open(const char *fn, const char *mode); 52 | 53 | /* 54 | This only works with local files. 55 | */ 56 | knetFile *knet_dopen(int fd, const char *mode); 57 | 58 | /* 59 | If ->is_ready==0, this routine updates ->fd; otherwise, it simply 60 | reads from ->fd. 61 | */ 62 | off_t knet_read(knetFile *fp, void *buf, off_t len); 63 | 64 | /* 65 | This routine only sets ->offset and ->is_ready=0. It does not 66 | communicate with the FTP server. 67 | */ 68 | off_t knet_seek(knetFile *fp, int64_t off, int whence); 69 | int knet_close(knetFile *fp); 70 | 71 | #ifdef __cplusplus 72 | } 73 | #endif 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /tabix-0.2.6/python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # The MIT License 4 | # 5 | # Copyright (c) 2011 Seoul National University. 
6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining 8 | # a copy of this software and associated documentation files (the 9 | # "Software"), to deal in the Software without restriction, including 10 | # without limitation the rights to use, copy, modify, merge, publish, 11 | # distribute, sublicense, and/or sell copies of the Software, and to 12 | # permit persons to whom the Software is furnished to do so, subject to 13 | # the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be 16 | # included in all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 22 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 23 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 24 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | # SOFTWARE. 26 | # 27 | # Contact: Hyeshik Chang 28 | 29 | from distutils.core import setup, Extension 30 | 31 | # Change this to True when you need the knetfile support. 
32 | USE_KNETFILE = False 33 | 34 | TABIX_SOURCE_FILES = [ 35 | '../bgzf.c', '../bgzip.c', '../index.c', '../knetfile.c', '../kstring.c' 36 | ] 37 | 38 | define_options = [('_FILE_OFFSET_BITS', 64)] 39 | if USE_KNETFILE: 40 | define_options.append(('_USE_KNETFILE', 1)) 41 | 42 | ext_modules = [Extension("tabix", ["tabixmodule.c"] + TABIX_SOURCE_FILES, 43 | include_dirs=['..'], 44 | libraries=['z'], 45 | define_macros=define_options)] 46 | 47 | setup (name = 'tabix', 48 | version = '1.0', 49 | description = 'Python interface to tabix, a generic indexer ' 50 | 'for TAB-delimited genome position files', 51 | author = 'Hyeshik Chang', 52 | author_email = 'hyeshik@snu.ac.kr', 53 | license = 'MIT', 54 | ext_modules = ext_modules 55 | ) 56 | -------------------------------------------------------------------------------- /tabix-0.2.6/tabix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Author: Heng Li and Aaron Quinlan 4 | # License: MIT/X11 5 | 6 | import sys 7 | from ctypes import * 8 | from ctypes.util import find_library 9 | import glob, platform 10 | 11 | def load_shared_library(lib, _path='.', ver='*'): 12 | """Search for and load the tabix library. The 13 | expectation is that the library is located in 14 | the current directory (ie. "./") 15 | """ 16 | # find from the system path 17 | path = find_library(lib) 18 | if (path == None): # if fail, search in the custom directory 19 | s = platform.system() 20 | if (s == 'Darwin'): suf = ver+'.dylib' 21 | elif (s == 'Linux'): suf = '.so'+ver 22 | candidates = glob.glob(_path+'/lib'+lib+suf); 23 | if (len(candidates) == 1): path = candidates[0] 24 | else: return None 25 | cdll.LoadLibrary(path) 26 | return CDLL(path) 27 | 28 | def tabix_init(): 29 | """Initialize and return a tabix reader object 30 | for subsequent tabix_get() calls. 
31 | """ 32 | tabix = load_shared_library('tabix') 33 | if (tabix == None): return None 34 | tabix.ti_read.restype = c_char_p 35 | # on Mac OS X 10.6, the following declarations are required. 36 | tabix.ti_open.restype = c_void_p 37 | tabix.ti_querys.argtypes = [c_void_p, c_char_p] 38 | tabix.ti_querys.restype = c_void_p 39 | tabix.ti_query.argtypes = [c_void_p, c_char_p, c_int, c_int] 40 | tabix.ti_query.restype = c_void_p 41 | tabix.ti_read.argtypes = [c_void_p, c_void_p, c_void_p] 42 | tabix.ti_iter_destroy.argtypes = [c_void_p] 43 | tabix.ti_close.argtypes = [c_void_p] 44 | # FIXME: explicit declarations for APIs not used in this script 45 | return tabix 46 | 47 | # OOP interface 48 | class Tabix: 49 | def __init__(self, fn, fnidx=0): 50 | self.tabix = tabix_init(); 51 | if (self.tabix == None): 52 | sys.stderr.write("[Tabix] Please make sure the shared library is compiled and available.\n") 53 | return 54 | self.fp = self.tabix.ti_open(fn, fnidx); 55 | 56 | def __del__(self): 57 | if (self.tabix): self.tabix.ti_close(self.fp) 58 | 59 | def fetch(self, chr, start=-1, end=-1): 60 | """Generator function that will yield each interval 61 | within the requested range from the requested file. 
62 | """ 63 | if (self.tabix == None): return 64 | if (start < 0): iter = self.tabix.ti_querys(self.fp, chr) # chr looks like: "chr2:1,000-2,000" or "chr2" 65 | else: iter = self.tabix.ti_query(self.fp, chr, start, end) # chr must be a sequence name 66 | if (iter == None): 67 | sys.stderr.write("[Tabix] Malformatted query or wrong sequence name.\n") 68 | return 69 | while (1): # iterate 70 | s = self.tabix.ti_read(self.fp, iter, 0) 71 | if (s == None): break 72 | yield s 73 | self.tabix.ti_iter_destroy(iter) 74 | 75 | # command-line interface 76 | def main(): 77 | if (len(sys.argv) < 3): 78 | sys.stderr.write("Usage: tabix.py \n") 79 | sys.exit(1) 80 | 81 | # report the features in the requested interval 82 | tabix = Tabix(sys.argv[1]) 83 | for line in tabix.fetch(sys.argv[2]): 84 | print line 85 | 86 | if __name__ == '__main__': 87 | main() 88 | -------------------------------------------------------------------------------- /tabix-0.2.6/NEWS: -------------------------------------------------------------------------------- 1 | Release 0.2.4 (10 April, 2011) 2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 3 | 4 | Notable changes: 5 | 6 | * Give an error if the index file is older than the data file. 7 | 8 | * Avoid a segfault given flawed input. 9 | 10 | * Added Python APIs contributed by Hyeshik Chang. The new APIs do not bind to 11 | the dynamic library and are reported to be faster. Pysam also comes with a 12 | tabix binding. 13 | 14 | * Added option "-r" for efficient header replacement. 15 | 16 | * Added BED support. 17 | 18 | * Synchronized the BGZF library between tabix and samtools. 19 | 20 | (0.2.4: 10 April 2011, r949) 21 | 22 | 23 | 24 | Beta Release 0.2.3 (8 December, 2010) 25 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 26 | 27 | Notable changes: 28 | 29 | * Fixed a minor bug where the first record in a headerless file may be 30 | missed. 31 | 32 | * Added an option to print header lines. 
33 | 34 | * Fixed a rare bug which may occasionally happen when retrieving data 35 | from a region without any records. 36 | 37 | * Enhanced error reporting. 38 | 39 | * Fixed a bug in bgzip which may delete the original file even if not 40 | intended. 41 | 42 | (0.2.3: 8 December 2010, r876) 43 | 44 | 45 | 46 | Beta Release 0.2.2 (28 June, 2010) 47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 48 | 49 | Notable changes: 50 | 51 | * Dropped the VCF3 support. Added VCF4 support. 52 | 53 | * Avoided the function name collision with samtools. 54 | 55 | (0.2.2: 28 June 2010, r603) 56 | 57 | 58 | 59 | Beta Release 0.2.1 (3 June, 2010) 60 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 61 | 62 | Notable changes: 63 | 64 | * Allow shared library to be compiled. Added python binding to the 65 | shared library. 66 | 67 | (0.2.1: 3 June 2010, r582) 68 | 69 | 70 | 71 | Beta Release 0.2.0 (11 May, 2010) 72 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 73 | 74 | Notable changes: 75 | 76 | * Fixed an issue for random access given an interval end larger than 77 | 2^29. 78 | 79 | * Updated the Java binding. 80 | 81 | * Added a Perl module using XS. 82 | 83 | * Improved the C APIs. 84 | 85 | (0.2.0: 11 May 2010, r574) 86 | 87 | 88 | 89 | Beta Release 0.1.6 (9 May, 2010) 90 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 91 | 92 | Notable changes: 93 | 94 | * Improved backward compatibility. Release 0.1.5 does not work with the 95 | buggy index file generated by 0.1.2. 96 | 97 | * Fixed a bug in building linear index. The bug does not affect the 98 | results, only affects efficiency in rare cases. 99 | 100 | * Reduced the number of seek calls given an index generated by old 101 | version of tabix. 102 | 103 | * Added new APIs for retrieving data via an iterator. The old callback 104 | APIs are not changed, although internally it uses iterator to 105 | retrieve data. 106 | 107 | I am trying to freeze tabix. I just hope I am not committing new bugs. 
108 | 109 | (0.1.6: 9 May 2010, r563) 110 | 111 | 112 | 113 | Beta Release 0.1.5 (5 May, 2010) 114 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 115 | 116 | Notable changes: 117 | 118 | * Clarified that tabix is released under MIT/X11. 119 | 120 | * Improved the robustness of indexing and retrieval. 121 | 122 | * Reduced the number of seek calls when the specified region starts 123 | from a 16kb block with no data. The index format is the same, but the 124 | content is changed a little. 125 | 126 | (0.1.5: 5 May 2010, r560) 127 | -------------------------------------------------------------------------------- /tabix-0.2.6/python/test.py: -------------------------------------------------------------------------------- 1 | # 2 | # The MIT License 3 | # 4 | # Copyright (c) 2011 Seoul National University. 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 21 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 22 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 23 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
25 | # 26 | # Contact: Hyeshik Chang 27 | 28 | import unittest 29 | import random 30 | import gzip 31 | import tabix 32 | 33 | EXAMPLEFILE = '../example.gtf.gz' 34 | 35 | def load_example_regions(path): 36 | alldata = [] 37 | for line in gzip.GzipFile(EXAMPLEFILE): 38 | fields = line.decode('ascii')[:-1].split('\t') 39 | seqid = fields[0] 40 | begin = int(fields[3]) 41 | end = int(fields[4]) 42 | alldata.append((seqid, begin, end, fields[:7])) 43 | 44 | return alldata 45 | 46 | def does_overlap(A, B, C, D): 47 | return (A <= D <= B) or (C <= B <= D) 48 | 49 | def sample_test_dataset(regions, ntests): 50 | seqids = [seqid for seqid, _, _, _ in regions] 51 | lowerbound = max(0, min(begin for _, begin, _, _ in regions) - 1000) 52 | upperbound = max(end for _, _, end, _ in regions) + 1000 53 | 54 | tests = [] 55 | for i in range(ntests): 56 | seqid = random.choice(seqids) 57 | low = random.randrange(lowerbound, upperbound) 58 | high = random.randrange(low, upperbound) 59 | 60 | # for 1-based both-end inclusive intervals 61 | matches = [info for seq, begin, end, info in regions 62 | if seqid == seq and does_overlap(begin, end, low, high)] 63 | 64 | tests.append((seqid, low, high, matches)) 65 | 66 | return tests 67 | 68 | def tbresult2excerpt(tbmatches): 69 | return [fields[:7] for fields in tbmatches] 70 | 71 | class TabixTest(unittest.TestCase): 72 | regions = load_example_regions(EXAMPLEFILE) 73 | testset = sample_test_dataset(regions, 500) 74 | 75 | def setUp(self): 76 | self.tb = tabix.Tabix(EXAMPLEFILE) 77 | 78 | def testQuery(self): 79 | for seqid, low, high, matches in self.testset: 80 | tbresult = tbresult2excerpt(self.tb.query(seqid, low, high)) 81 | self.assertEqual(tbresult, matches) 82 | 83 | def testQueryS(self): 84 | for seqid, low, high, matches in self.testset: 85 | tbresult = tbresult2excerpt(self.tb.querys('%s:%d-%d' % 86 | (seqid, low, high))) 87 | self.assertEqual(tbresult, matches) 88 | 89 | 90 | if __name__ == '__main__': 91 | unittest.main() 92 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fathmm-MKL 2 | 3 | Predicting the functional consequences of both coding and non-coding single nucleotide variants (see http://fathmm.biocompute.org.uk). 4 | 5 | For more information, please refer to the following publication: 6 | 7 | Shihab HA, Rogers MF, Gough J, Mort M, Cooper DN, Day INM, Gaunt TR, Campbell C (2014). An Integrative Approach to Predicting the Functional Consequences of Non-coding and Coding Sequence Variation. *Bioinformatics* (In Press) 8 | 9 | ## General Requirements 10 | 11 | You will need the following packages installed on your system: 12 | 13 | * ```tabix``` (included as part of this repository) 14 | * ```Python``` (tested with Python 2.7) 15 | 16 | ## Running the Software 17 | 18 | * Clone this repository 19 | 20 | ``` 21 | git clone https://github.com/HAShihab/fathmm-MKL 22 | cd fathmm-MKL/ 23 | ``` 24 | 25 | * Download our pre-computed database: 26 | 27 | ``` 28 | wget http://fathmm.biocompute.org.uk/database/fathmm-MKL_Current.tab.gz 29 | ``` 30 | 31 | **Note:** this database contains one-based coordinates (positions). For true bed format (i.e. 
zero-based coordinates), please download the following database: http://fathmm.biocompute.org.uk/database/fathmm-MKL_Current_zerobased.tab.gz 32 | 33 | | Datafile | md5sum | 34 | | -------- | ------- | 35 | | http://fathmm.biocompute.org.uk/database/fathmm-MKL_Current.tab.gz | b8f4dd120586a34c82d5cc87cfe2a4ca | 36 | | http://fathmm.biocompute.org.uk/database/fathmm-MKL_Current_zerobased.tab.gz | c3213196a2471ade3742bd8f8a96d4cc | 37 | 38 | * Add `tabix` to your PATH and create the database index file (*please be patient, this may take a while!*): 39 | 40 | ``` 41 | export PATH=./tabix-0.2.6/:$PATH 42 | tabix -f -p bed fathmm-MKL_Current.tab.gz 43 | ``` 44 | 45 | * Run our script using the following command: 46 | 47 | ``` 48 | python fathmm-MKL.py INPUT OUTPUT DATABASE 49 | ``` 50 | 51 | In the above command, ```INPUT``` is the list of mutations to process (see ```test.txt``` for an example), ```OUTPUT``` is where the predictions are written and ```DATABASE``` is the pre-computed database downloaded above. 52 | 53 | **Note:** the database index file must be created before running our script. If this has not been created, your output will contain "No Prediction Found" for all variants! 54 | 55 | ## Prediction Interpretation 56 | 57 | Predictions are given as *p*-values in the range [0, 1]: values above 0.5 are predicted to be deleterious, while those below 0.5 are predicted to be neutral or benign. *P*-values close to the extremes (0 or 1) are the highest-confidence predictions that yield the highest accuracy. 58 | 59 | We use distinct predictors for positions in coding regions (positions within coding-sequence exons) and for positions in non-coding regions (positions in intergenic regions, introns or non-coding genes). The coding predictor is based on 10 groups of features, labeled A-J; the non-coding predictor uses a subset of 4 of these feature groups, A-D (see our related publication for details on the groups and their sources). 
60 | 61 | **Note:** predictions based on a subset of features may not be as accurate as those based on complete feature sets. In particular, predictions that are missing the conservation score features (groups A and E) will tend to be less accurate than other predictions. To aid in interpreting these predictions, we provide a list of the feature groups that contributed to each prediction. 62 | 63 | ## Genome Build 64 | 65 | FATHMM-MKL predictions are based on the GRCh37/hg19 genome build. 66 | 67 | ## Contributing: 68 | 69 | We welcome any comments and/or suggestions that you may have regarding our software - please send an email to fathmm@biocompute.org.uk 70 | 71 | -------------------------------------------------------------------------------- /fathmm-MKL.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import argparse 6 | import subprocess 7 | 8 | # 9 | if __name__ == '__main__': 10 | ''' 11 | fathmm-MKL.py: Predict the Functional Consequences of Single Nucleotide Variants (SNVs) 12 | ''' 13 | 14 | # fetch argument(s) 15 | parser = argparse.ArgumentParser( 16 | description = 'Predict the Functional Consequences of Single Nucleotide Variants (SNVs)', 17 | add_help = False 18 | ) 19 | parser.add_argument( 20 | "-h", 21 | "--help", 22 | action = "help", 23 | help = argparse.SUPPRESS 24 | ) 25 | 26 | group = \ 27 | parser.add_argument_group("Required") 28 | group.add_argument( 29 | 'fi', 30 | metavar = '', 31 | type = argparse.FileType("r"), 32 | help = 'the mutation data to process' 33 | ) 34 | group.add_argument( 35 | 'fo', 36 | metavar = '', 37 | type = argparse.FileType("w"), 38 | help = 'where predictions are written' 39 | ) 40 | 41 | group.add_argument( 42 | 'db', 43 | metavar = '', 44 | type = argparse.FileType("r"), 45 | help = 'precomputed database of fathmm-MKL predictions' 46 | ) 47 | 48 | Args = parser.parse_args() 49 | 50 | # 51 | 52 | 
Args.fo.write("\t".join([ 53 | "# Chromosome", 54 | "Position", 55 | "Ref. Base", 56 | "Mutant Base", 57 | "Non-Coding Score", 58 | "Non-Coding Groups", 59 | "Coding Score", 60 | "Coding Groups", 61 | "Warning" 62 | ]) + "\n") 63 | 64 | for query in Args.fi: 65 | if not query.strip() or query.startswith("#"): 66 | continue 67 | query = query.strip().upper().split(",") 68 | Pred = [ '', '', '', '', "No Prediction Found" ] 69 | 70 | 71 | # approve query ... 72 | try: 73 | assert query.__len__() == 4 # required data present in query 74 | 75 | int(query[1]) # is position numeric 76 | assert query[2] in \ 77 | [ "A", "C", "G", "T" ] # expected base 78 | assert query[3] in \ 79 | [ "A", "C", "G", "T" ] # expected base 80 | except: 81 | Args.fo.write("\t".join([ '', '', '', '', '', '', '', '', "Error: Unexpected Format '" + ",".join(query) + "'" ] ) + "\n"); continue 82 | 83 | 84 | # fetch prediction ... 85 | proc = subprocess.Popen([ "tabix " + Args.db.name + " " + query[0] + ":" + str(int(query[1]) + 1) + "-" + str(int(query[1]) + 1) ], stdout=subprocess.PIPE, shell=True) 86 | data, err = proc.communicate() 87 | if err: 88 | Pred[-1] = "Error: 'tabix' command"; continue 89 | if data: 90 | for record in data.decode().split("\n"): 91 | if not record: 92 | continue 93 | record = record.strip().split("\t") 94 | 95 | if not record[0] == query[0]: 96 | Pred[-1] = "Error: Unexpected Chromosome"; break 97 | if not record[1] == query[1]: 98 | Pred[-1] = "Error: Unexpected Position"; break 99 | if not record[3] == query[2]: 100 | Pred[-1] = "Warning: Inconsistent Base (Expecting '" + record[3] + "')"; break 101 | if record[4] == query[3]: 102 | Pred = record[5:] + [ '' ] 103 | break 104 | 105 | Args.fo.write("\t".join( query + Pred ) + "\n") 106 | -------------------------------------------------------------------------------- /tabix-0.2.6/tabix.1: -------------------------------------------------------------------------------- 1 | .TH tabix 1 "11 May 2010" "tabix-0.2.0" 
"Bioinformatics tools" 2 | .SH NAME 3 | .PP 4 | bgzip - Block compression/decompression utility 5 | .PP 6 | tabix - Generic indexer for TAB-delimited genome position files 7 | .SH SYNOPSIS 8 | .PP 9 | .B bgzip 10 | .RB [ \-cdhB ] 11 | .RB [ \-b 12 | .IR virtualOffset ] 13 | .RB [ \-s 14 | .IR size ] 15 | .RI [ file ] 16 | .PP 17 | .B tabix 18 | .RB [ \-0lf ] 19 | .RB [ \-p 20 | .R gff|bed|sam|vcf] 21 | .RB [ \-s 22 | .IR seqCol ] 23 | .RB [ \-b 24 | .IR begCol ] 25 | .RB [ \-e 26 | .IR endCol ] 27 | .RB [ \-S 28 | .IR lineSkip ] 29 | .RB [ \-c 30 | .IR metaChar ] 31 | .I in.tab.bgz 32 | .RI [ "region1 " [ "region2 " [ ... "]]]" 33 | 34 | .SH DESCRIPTION 35 | .PP 36 | Tabix indexes a TAB-delimited genome position file 37 | .I in.tab.bgz 38 | and creates an index file 39 | .I in.tab.bgz.tbi 40 | when 41 | .I region 42 | is absent from the command-line. The input data file must be position 43 | sorted and compressed by 44 | .B bgzip 45 | which has a 46 | .BR gzip (1) 47 | like interface. After indexing, tabix is able to quickly retrieve data 48 | lines overlapping 49 | .I regions 50 | specified in the format "chr:beginPos-endPos". Fast data retrieval also 51 | works over network if URI is given as a file name and in this case the 52 | index file will be downloaded if it is not present locally. 53 | 54 | .SH OPTIONS OF TABIX 55 | .TP 10 56 | .BI "-p " STR 57 | Input format for indexing. Valid values are: gff, bed, sam, vcf and 58 | psltab. This option should not be applied together with any of 59 | .BR \-s ", " \-b ", " \-e ", " \-c " and " \-0 ; 60 | it is not used for data retrieval because this setting is stored in 61 | the index file. [gff] 62 | .TP 63 | .BI "-s " INT 64 | Column of sequence name. Option 65 | .BR \-s ", " \-b ", " \-e ", " \-S ", " \-c " and " \-0 66 | are all stored in the index file and thus not used in data retrieval. [1] 67 | .TP 68 | .BI "-b " INT 69 | Column of start chromosomal position. 
[4] 70 | .TP 71 | .BI "-e " INT 72 | Column of end chromosomal position. The end column can be the same as the 73 | start column. [5] 74 | .TP 75 | .BI "-S " INT 76 | Skip first INT lines in the data file. [0] 77 | .TP 78 | .BI "-c " CHAR 79 | Skip lines starting with character CHAR. [#] 80 | .TP 81 | .B -0 82 | Specify that the position in the data file is 0-based (e.g. UCSC files) 83 | rather than 1-based. 84 | .TP 85 | .B -h 86 | Print the header/meta lines. 87 | .TP 88 | .B -B 89 | The second argument is a BED file. When this option is in use, the input 90 | file need not be sorted or indexed. The entire input will be read sequentially. Nonetheless, 91 | with this option, the format of the input must be specified correctly on the command line. 92 | .TP 93 | .B -f 94 | Force overwriting of the index file if it is present. 95 | .TP 96 | .B -l 97 | List the sequence names stored in the index file. 98 | .RE 99 | 100 | .SH EXAMPLE 101 | (grep ^"#" in.gff; grep -v ^"#" in.gff | sort -k1,1 -k4,4n) | bgzip > sorted.gff.gz; 102 | 103 | tabix -p gff sorted.gff.gz; 104 | 105 | tabix sorted.gff.gz chr1:10,000,000-20,000,000; 106 | 107 | .SH NOTES 108 | It is straightforward to achieve overlap queries using the standard 109 | B-tree index (with or without binning) implemented in all SQL databases, 110 | or the R-tree index in PostgreSQL and Oracle. But there are still many 111 | reasons to use tabix. Firstly, tabix directly works with a lot of widely 112 | used TAB-delimited formats such as GFF/GTF and BED. We do not need to 113 | design database schema or specialized binary formats. Data do not need 114 | to be duplicated in different formats, either. Secondly, tabix works on 115 | compressed data files while most SQL databases do not. The GenCode 116 | annotation GTF can be compressed down to 4%. Thirdly, tabix is 117 | fast. The same indexing algorithm is known to work efficiently for an 118 | alignment with a few billion short reads. 
SQL databases probably cannot 119 | easily handle data at this scale. Last but not the least, tabix supports 120 | remote data retrieval. One can put the data file and the index at an FTP 121 | or HTTP server, and other users or even web services will be able to get 122 | a slice without downloading the entire file. 123 | 124 | .SH AUTHOR 125 | .PP 126 | Tabix was written by Heng Li. The BGZF library was originally 127 | implemented by Bob Handsaker and modified by Heng Li for remote file 128 | access and in-memory caching. 129 | 130 | .SH SEE ALSO 131 | .PP 132 | .BR samtools (1) 133 | -------------------------------------------------------------------------------- /tabix-0.2.6/bedidx.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "ksort.h" 8 | KSORT_INIT_GENERIC(uint64_t) 9 | 10 | #include "kseq.h" 11 | KSTREAM_INIT(gzFile, gzread, 8192) 12 | 13 | typedef struct { 14 | int n, m; 15 | uint64_t *a; 16 | int *idx; 17 | } bed_reglist_t; 18 | 19 | #include "khash.h" 20 | KHASH_MAP_INIT_STR(reg, bed_reglist_t) 21 | 22 | #define LIDX_SHIFT 13 23 | 24 | typedef kh_reg_t reghash_t; 25 | 26 | int *bed_index_core(int n, uint64_t *a, int *n_idx) 27 | { 28 | int i, j, m, *idx; 29 | m = *n_idx = 0; idx = 0; 30 | for (i = 0; i < n; ++i) { 31 | int beg, end; 32 | beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; 33 | if (m < end + 1) { 34 | int oldm = m; 35 | m = end + 1; 36 | kroundup32(m); 37 | idx = realloc(idx, m * sizeof(int)); 38 | for (j = oldm; j < m; ++j) idx[j] = -1; 39 | } 40 | if (beg == end) { 41 | if (idx[beg] < 0) idx[beg] = i; 42 | } else { 43 | for (j = beg; j <= end; ++j) 44 | if (idx[j] < 0) idx[j] = i; 45 | } 46 | *n_idx = end + 1; 47 | } 48 | return idx; 49 | } 50 | 51 | void bed_index(void *_h) 52 | { 53 | reghash_t *h = (reghash_t*)_h; 54 | khint_t k; 55 | for (k = 0; k < kh_end(h); ++k) { 56 | if (kh_exist(h, k)) { 57 | 
bed_reglist_t *p = &kh_val(h, k); 58 | if (p->idx) free(p->idx); 59 | ks_introsort(uint64_t, p->n, p->a); 60 | p->idx = bed_index_core(p->n, p->a, &p->m); 61 | } 62 | } 63 | } 64 | 65 | int bed_overlap_core(const bed_reglist_t *p, int beg, int end) 66 | { 67 | int i, min_off; 68 | if (p->n == 0) return 0; 69 | min_off = (beg>>LIDX_SHIFT >= p->m)? p->idx[p->m-1] : p->idx[beg>>LIDX_SHIFT]; // bound by p->m, the index length stored by bed_index(), not by p->n (the region count) 70 | if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here 71 | int n = beg>>LIDX_SHIFT; 72 | if (n > p->m) n = p->m; 73 | for (i = n - 1; i >= 0; --i) 74 | if (p->idx[i] >= 0) break; 75 | min_off = i >= 0? p->idx[i] : 0; 76 | } 77 | for (i = min_off; i < p->n; ++i) { 78 | if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed 79 | if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end) 80 | return 1; // find the overlap; return 81 | } 82 | return 0; 83 | } 84 | 85 | int bed_overlap(const void *_h, const char *chr, int beg, int end) 86 | { 87 | const reghash_t *h = (const reghash_t*)_h; 88 | khint_t k; 89 | if (!h) return 0; 90 | k = kh_get(reg, h, chr); 91 | if (k == kh_end(h)) return 0; 92 | return bed_overlap_core(&kh_val(h, k), beg, end); 93 | } 94 | 95 | void *bed_read(const char *fn) 96 | { 97 | reghash_t *h = kh_init(reg); 98 | gzFile fp; 99 | kstream_t *ks; 100 | int dret; 101 | kstring_t *str; 102 | // read the list 103 | fp = strcmp(fn, "-")? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); 104 | if (fp == 0) { kh_destroy(reg, h); return 0; } // release the hash table allocated above when the input cannot be opened 105 | str = calloc(1, sizeof(kstring_t)); 106 | ks = ks_init(fp); 107 | while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name 108 | int beg = -1, end = -1; 109 | bed_reglist_t *p; 110 | khint_t k = kh_get(reg, h, str->s); 111 | if (k == kh_end(h)) { // absent from the hash table 112 | int ret; 113 | char *s = strdup(str->s); 114 | k = kh_put(reg, h, s, &ret); 115 | memset(&kh_val(h, k), 0, sizeof(bed_reglist_t)); 116 | } 117 | p = &kh_val(h, k); 118 | if (dret != '\n') { // if the lines has other characters 119 | if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { 120 | beg = atoi(str->s); // begin 121 | if (dret != '\n') { 122 | if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) 123 | end = atoi(str->s); // end 124 | } 125 | } 126 | } 127 | if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line 128 | if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column 129 | if (beg >= 0 && end > beg) { 130 | if (p->n == p->m) { 131 | p->m = p->m? 
p->m<<1 : 4; 132 | p->a = realloc(p->a, p->m * 8); 133 | } 134 | p->a[p->n++] = (uint64_t)beg<<32 | end; 135 | } 136 | } 137 | ks_destroy(ks); 138 | gzclose(fp); 139 | free(str->s); free(str); 140 | bed_index(h); 141 | return h; 142 | } 143 | 144 | void bed_destroy(void *_h) 145 | { 146 | reghash_t *h = (reghash_t*)_h; 147 | khint_t k; 148 | for (k = 0; k < kh_end(h); ++k) { 149 | if (kh_exist(h, k)) { 150 | free(kh_val(h, k).a); 151 | free(kh_val(h, k).idx); 152 | free((char*)kh_key(h, k)); 153 | } 154 | } 155 | kh_destroy(reg, h); 156 | } 157 | -------------------------------------------------------------------------------- /tabix-0.2.6/kstring.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "kstring.h" 7 | 8 | int ksprintf(kstring_t *s, const char *fmt, ...) 9 | { 10 | va_list ap; 11 | int l; 12 | va_start(ap, fmt); 13 | l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'. 14 | va_end(ap); 15 | if (l + 1 > s->m - s->l) { 16 | s->m = s->l + l + 2; 17 | kroundup32(s->m); 18 | s->s = (char*)realloc(s->s, s->m); 19 | va_start(ap, fmt); 20 | l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); 21 | } 22 | va_end(ap); 23 | s->l += l; 24 | return l; 25 | } 26 | 27 | // s MUST BE a null terminated string; l = strlen(s) 28 | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) 29 | { 30 | int i, n, max, last_char, last_start, *offsets, l; 31 | n = 0; max = *_max; offsets = *_offsets; 32 | l = strlen(s); 33 | 34 | #define __ksplit_aux do { \ 35 | if (_offsets) { \ 36 | s[i] = 0; \ 37 | if (n == max) { \ 38 | max = max? 
max<<1 : 2; \ 39 | offsets = (int*)realloc(offsets, sizeof(int) * max); \ 40 | } \ 41 | offsets[n++] = last_start; \ 42 | } else ++n; \ 43 | } while (0) 44 | 45 | for (i = 0, last_char = last_start = 0; i <= l; ++i) { 46 | if (delimiter == 0) { 47 | if (isspace(s[i]) || s[i] == 0) { 48 | if (isgraph(last_char)) __ksplit_aux; // the end of a field 49 | } else { 50 | if (isspace(last_char) || last_char == 0) last_start = i; 51 | } 52 | } else { 53 | if (s[i] == delimiter || s[i] == 0) { 54 | if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field 55 | } else { 56 | if (last_char == delimiter || last_char == 0) last_start = i; 57 | } 58 | } 59 | last_char = s[i]; 60 | } 61 | *_max = max; *_offsets = offsets; 62 | return n; 63 | } 64 | 65 | /********************** 66 | * Boyer-Moore search * 67 | **********************/ 68 | 69 | // reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html 70 | int *ksBM_prep(const uint8_t *pat, int m) 71 | { 72 | int i, *suff, *prep, *bmGs, *bmBc; 73 | prep = calloc(m + 256, sizeof(int)); // m bmGs entries followed by 256 bmBc entries, all ints 74 | bmGs = prep; bmBc = prep + m; 75 | { // preBmBc() 76 | for (i = 0; i < 256; ++i) bmBc[i] = m; 77 | for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; 78 | } 79 | suff = calloc(m, sizeof(int)); 80 | { // suffixes() 81 | int f = 0, g; 82 | suff[m - 1] = m; 83 | g = m - 1; 84 | for (i = m - 2; i >= 0; --i) { 85 | if (i > g && suff[i + m - 1 - f] < i - g) 86 | suff[i] = suff[i + m - 1 - f]; 87 | else { 88 | if (i < g) g = i; 89 | f = i; 90 | while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; 91 | suff[i] = f - g; 92 | } 93 | } 94 | } 95 | { // preBmGs() 96 | int j = 0; 97 | for (i = 0; i < m; ++i) bmGs[i] = m; 98 | for (i = m - 1; i >= 0; --i) 99 | if (suff[i] == i + 1) 100 | for (; j < m - 1 - i; ++j) 101 | if (bmGs[j] == m) 102 | bmGs[j] = m - 1 - i; 103 | for (i = 0; i <= m - 2; ++i) 104 | bmGs[m - 1 - suff[i]] = m - 1 - i; 105 | } 106 | free(suff); 107 | return prep; 108 | } 109 | 110 | int *ksBM_search(const uint8_t 
*str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches) 111 | { 112 | int i, j, *prep, *bmGs, *bmBc; 113 | int *matches = 0, mm = 0, nm = 0; 114 | prep = _prep? _prep : ksBM_prep(pat, m); 115 | bmGs = prep; bmBc = prep + m; 116 | j = 0; 117 | while (j <= n - m) { 118 | for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); 119 | if (i < 0) { 120 | if (nm == mm) { 121 | mm = mm? mm<<1 : 1; 122 | matches = realloc(matches, mm * sizeof(int)); 123 | } 124 | matches[nm++] = j; 125 | j += bmGs[0]; 126 | } else { 127 | int max = bmBc[str[i+j]] - m + 1 + i; 128 | if (max < bmGs[i]) max = bmGs[i]; 129 | j += max; 130 | } 131 | } 132 | *n_matches = nm; 133 | if (_prep == 0) free(prep); 134 | return matches; 135 | } 136 | 137 | #ifdef KSTRING_MAIN 138 | #include 139 | int main() 140 | { 141 | kstring_t *s; 142 | int *fields, n, i; 143 | s = (kstring_t*)calloc(1, sizeof(kstring_t)); 144 | // test ksprintf() 145 | ksprintf(s, " abcdefg: %d ", 100); 146 | printf("'%s'\n", s->s); 147 | // test ksplit() 148 | fields = ksplit(s, 0, &n); 149 | for (i = 0; i < n; ++i) 150 | printf("field[%d] = '%s'\n", i, s->s + fields[i]); 151 | free(s); 152 | 153 | { 154 | static char *str = "abcdefgcdg"; 155 | static char *pat = "cd"; 156 | int n, *matches; 157 | matches = ksBM_search(str, strlen(str), pat, strlen(pat), 0, &n); 158 | printf("%d: \n", n); 159 | for (i = 0; i < n; ++i) 160 | printf("- %d\n", matches[i]); 161 | free(matches); 162 | } 163 | return 0; 164 | } 165 | #endif 166 | -------------------------------------------------------------------------------- /tabix-0.2.6/tabix.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2009 Genome Research Ltd (GRL), 2010 Broad Institute 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | 
without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Contact: Heng Li */ 27 | 28 | #ifndef __TABIDX_H 29 | #define __TABIDX_H 30 | 31 | #include 32 | #include "kstring.h" 33 | #include "bgzf.h" 34 | 35 | #define TI_PRESET_GENERIC 0 36 | #define TI_PRESET_SAM 1 37 | #define TI_PRESET_VCF 2 38 | 39 | #define TI_FLAG_UCSC 0x10000 40 | 41 | typedef int (*ti_fetch_f)(int l, const char *s, void *data); 42 | 43 | struct __ti_index_t; 44 | typedef struct __ti_index_t ti_index_t; 45 | 46 | struct __ti_iter_t; 47 | typedef struct __ti_iter_t *ti_iter_t; 48 | 49 | typedef struct { 50 | BGZF *fp; 51 | ti_index_t *idx; 52 | char *fn, *fnidx; 53 | } tabix_t; 54 | 55 | typedef struct { 56 | int32_t preset; 57 | int32_t sc, bc, ec; // seq col., beg col. and end col. 
58 | int32_t meta_char, line_skip; 59 | } ti_conf_t; 60 | 61 | typedef struct { 62 | int beg, end; 63 | char *ss, *se; 64 | } ti_interval_t; 65 | 66 | extern ti_conf_t ti_conf_gff, ti_conf_bed, ti_conf_psltbl, ti_conf_vcf, ti_conf_sam; // preset 67 | 68 | #ifdef __cplusplus 69 | extern "C" { 70 | #endif 71 | 72 | /******************* 73 | * High-level APIs * 74 | *******************/ 75 | 76 | tabix_t *ti_open(const char *fn, const char *fnidx); 77 | int ti_lazy_index_load(tabix_t *t); 78 | void ti_close(tabix_t *t); 79 | ti_iter_t ti_query(tabix_t *t, const char *name, int beg, int end); 80 | ti_iter_t ti_queryi(tabix_t *t, int tid, int beg, int end); 81 | ti_iter_t ti_querys(tabix_t *t, const char *reg); 82 | const char *ti_read(tabix_t *t, ti_iter_t iter, int *len); 83 | 84 | /* Destroy the iterator */ 85 | void ti_iter_destroy(ti_iter_t iter); 86 | 87 | /* Get the list of sequence names. Each "char*" pointer points to a 88 | * internal member of the index, so DO NOT modify the returned 89 | * pointer; otherwise the index will be corrupted. The returned 90 | * pointer should be freed by a single free() call by the routine 91 | * calling this function. The number of sequences is returned at *n. */ 92 | const char **ti_seqname(const ti_index_t *idx, int *n); 93 | 94 | /****************** 95 | * Low-level APIs * 96 | ******************/ 97 | 98 | /* Build the index for file . File .tbi will be generated 99 | * and overwrite the file of the same name. Return -1 on failure. */ 100 | int ti_index_build(const char *fn, const ti_conf_t *conf); 101 | 102 | /* Load the index from file .tbi. If is a URL and the index 103 | * file is not in the working directory, .tbi will be 104 | * downloaded. Return NULL on failure. 
*/ 105 | ti_index_t *ti_index_load(const char *fn); 106 | 107 | ti_index_t *ti_index_load_local(const char *fnidx); 108 | 109 | /* Destroy the index */ 110 | void ti_index_destroy(ti_index_t *idx); 111 | 112 | /* Parse a region like: chr2, chr2:100, chr2:100-200. Return -1 on failure. */ 113 | int ti_parse_region(const ti_index_t *idx, const char *str, int *tid, int *begin, int *end); 114 | 115 | int ti_get_tid(const ti_index_t *idx, const char *name); 116 | 117 | /* Get the iterator pointing to the first record at the current file 118 | * position. If the file is just openned, the iterator points to the 119 | * first record in the file. */ 120 | ti_iter_t ti_iter_first(void); 121 | 122 | /* Get the iterator pointing to the first record in region tid:beg-end */ 123 | ti_iter_t ti_iter_query(const ti_index_t *idx, int tid, int beg, int end); 124 | 125 | /* Get the data line pointed by the iterator and iterate to the next record. */ 126 | const char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len); 127 | 128 | const ti_conf_t *ti_get_conf(ti_index_t *idx); 129 | int ti_get_intv(const ti_conf_t *conf, int len, char *line, ti_interval_t *intv); 130 | 131 | /******************* 132 | * Deprecated APIs * 133 | *******************/ 134 | 135 | /* The callback version for random access */ 136 | int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func); 137 | 138 | /* Read one line. 
*/ 139 | int ti_readline(BGZF *fp, kstring_t *str); 140 | 141 | #ifdef __cplusplus 142 | } 143 | #endif 144 | 145 | #endif 146 | -------------------------------------------------------------------------------- /tabix-0.2.6/tabix.tex: -------------------------------------------------------------------------------- 1 | \documentclass[10pt]{article} 2 | \usepackage{color} 3 | \definecolor{gray}{rgb}{0.7,0.7,0.7} 4 | 5 | \setlength{\topmargin}{0.0cm} 6 | \setlength{\textheight}{21.5cm} 7 | \setlength{\oddsidemargin}{0cm} 8 | \setlength{\textwidth}{16.5cm} 9 | \setlength{\columnsep}{0.6cm} 10 | 11 | \begin{document} 12 | 13 | \title{The Tabix index file format} 14 | \author{Heng Li} 15 | \date{} 16 | 17 | \maketitle 18 | 19 | \begin{center} 20 | \begin{tabular}{|l|l|l|l|l|l|l|} 21 | \hline 22 | \multicolumn{4}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Descrption} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\ 23 | \hline\hline 24 | \multicolumn{4}{|l|}{\tt magic} & Magic string & {\tt char[4]} & TBI$\backslash$1 \\ 25 | \hline 26 | \multicolumn{4}{|l|}{\tt n\_ref} & \# sequences & {\tt int32\_t} & \\ 27 | \hline 28 | \multicolumn{4}{|l|}{\tt format} & Format (0: generic; 1: SAM; 2: VCF) & {\tt int32\_t} & \\ 29 | \hline 30 | \multicolumn{4}{|l|}{\tt col\_seq} & Column for the sequence name & {\tt int32\_t} & \\ 31 | \hline 32 | \multicolumn{4}{|l|}{\tt col\_beg} & Column for the start of a region & {\tt int32\_t} & \\ 33 | \hline 34 | \multicolumn{4}{|l|}{\tt col\_end} & Column for the end of a region & {\tt int32\_t} & \\ 35 | \hline 36 | \multicolumn{4}{|l|}{\tt meta} & Leading character for comment lines & {\tt int32\_t} & \\ 37 | \hline 38 | \multicolumn{4}{|l|}{\tt skip} & \# lines to skip at the beginning & {\tt int32\_t} & \\ 39 | \hline 40 | \multicolumn{4}{|l|}{\tt l\_nm} & Length of concatenated sequence names & {\tt int32\_t} & \\ 41 | \hline 42 | \multicolumn{4}{|l|}{\tt names} & Concatenated names, each zero terminated & {\tt 
char[l\_nm]} & \\ 43 | \hline 44 | \multicolumn{7}{|c|}{\textcolor{gray}{\it List of indices (n=n\_ref)}}\\ 45 | \cline{2-7} 46 | \hspace{0.1cm} & \multicolumn{3}{l|}{\tt n\_bin} & \# distinct bins (for the binning index) & {\tt int32\_t} & \\ 47 | \cline{2-7} 48 | & \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct bins (n=n\_bin)}} \\ 49 | \cline{3-7} 50 | & \hspace{0.1cm} & \multicolumn{2}{l|}{\tt bin} & Distinct bin number & {\tt uint32\_t} & \\ 51 | \cline{3-7} 52 | & & \multicolumn{2}{l|}{\tt n\_chunk} & \# chunks & {\tt int32\_t} & \\ 53 | \cline{3-7} 54 | & & \multicolumn{5}{c|}{\textcolor{gray}{\it List of chunks (n=n\_chunk)}} \\ 55 | \cline{4-7} 56 | & & \hspace{0.1cm} & {\tt cnk\_beg} & Virtual file offset of the start of the chunk & {\tt uint64\_t} & \\ 57 | \cline{4-7} 58 | & & & {\tt cnk\_end} & Virtual file offset of the end of the chunk & {\tt uint64\_t} & \\ 59 | \cline{2-7} 60 | & \multicolumn{3}{l|}{\tt n\_intv} & \# 16kb intervals (for the linear index) & {\tt int32\_t} & \\ 61 | \cline{2-7} 62 | & \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct intervals (n=n\_intv)}} \\ 63 | \cline{3-7} 64 | & & \multicolumn{2}{l|}{\tt ioff} & File offset of the first record in the interval & {\tt uint64\_t} & \\ 65 | \hline 66 | \end{tabular} 67 | \end{center} 68 | 69 | {\bf Notes:} 70 | 71 | \begin{itemize} 72 | \item The index file is BGZF compressed. 73 | \item All integers are little-endian. 74 | \item When {\tt (format\&0x10000)} is true, the coordinate follows the 75 | {\tt BED} rule (i.e. half-closed-half-open and zero based); otherwise, 76 | the coordinate follows the {\tt GFF} rule (closed and one based). 77 | \item For the SAM format, the end of a region equals {\tt POS} plus the 78 | reference length in the alignment, inferred from {\tt CIGAR}. For the 79 | VCF format, the end of a region equals {\tt POS} plus the size of the 80 | deletion. 
81 | \item Field {\tt col\_beg} may equal {\tt col\_end}, and in this case, 82 | the end of a region is {\tt end}={\tt beg+1}. 83 | \item Example. For {\tt GFF}, {\tt format}=0, {\tt col\_seq}=1, {\tt 84 | col\_beg}=4, {\tt col\_end}=5, {\tt meta}=`{\tt \#}' and {\tt 85 | skip}=0. For {\tt BED}, {\tt format}=0x10000, {\tt col\_seq}=1, {\tt 86 | col\_beg}=2, {\tt col\_end}=3, {\tt meta}=`{\tt \#}' and {\tt 87 | skip}=0. 88 | \item Given a zero-based, half-closed and half-open region {\tt 89 | [beg,end)}, the {\tt bin} number is calculated with the following C 90 | function: 91 | \begin{verbatim} 92 | int reg2bin(int beg, int end) { 93 | --end; 94 | if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14); 95 | if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17); 96 | if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20); 97 | if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23); 98 | if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26); 99 | return 0; 100 | } 101 | \end{verbatim} 102 | \item The list of bins that may overlap a region {\tt [beg,end)} can be 103 | obtained with the following C function. 
104 | \begin{verbatim} 105 | #define MAX_BIN (((1<<18)-1)/7) 106 | int reg2bins(int rbeg, int rend, uint16_t list[MAX_BIN]) 107 | { 108 | int i = 0, k; 109 | --rend; 110 | list[i++] = 0; 111 | for (k = 1 + (rbeg>>26); k <= 1 + (rend>>26); ++k) list[i++] = k; 112 | for (k = 9 + (rbeg>>23); k <= 9 + (rend>>23); ++k) list[i++] = k; 113 | for (k = 73 + (rbeg>>20); k <= 73 + (rend>>20); ++k) list[i++] = k; 114 | for (k = 585 + (rbeg>>17); k <= 585 + (rend>>17); ++k) list[i++] = k; 115 | for (k = 4681 + (rbeg>>14); k <= 4681 + (rend>>14); ++k) list[i++] = k; 116 | return i; // #elements in list[] 117 | } 118 | \end{verbatim} 119 | \end{itemize} 120 | 121 | \end{document} -------------------------------------------------------------------------------- /tabix-0.2.6/bgzf.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 4 | 2011 Attractive Chaos 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 23 | */ 24 | 25 | /* The BGZF library was originally written by Bob Handsaker from the Broad 26 | * Institute. It was later improved by the SAMtools developers. */ 27 | 28 | #ifndef __BGZF_H 29 | #define __BGZF_H 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #define BGZF_BLOCK_SIZE 0x10000 // 64k 36 | 37 | #define BGZF_ERR_ZLIB 1 38 | #define BGZF_ERR_HEADER 2 39 | #define BGZF_ERR_IO 4 40 | #define BGZF_ERR_MISUSE 8 41 | 42 | typedef struct { 43 | int open_mode:8, compress_level:8, errcode:16; 44 | int cache_size; 45 | int block_length, block_offset; 46 | int64_t block_address; 47 | void *uncompressed_block, *compressed_block; 48 | void *cache; // a pointer to a hash table 49 | void *fp; // actual file handler; FILE* on writing; FILE* or knetFile* on reading 50 | } BGZF; 51 | 52 | #ifndef KSTRING_T 53 | #define KSTRING_T kstring_t 54 | typedef struct __kstring_t { 55 | size_t l, m; 56 | char *s; 57 | } kstring_t; 58 | #endif 59 | 60 | #ifdef __cplusplus 61 | extern "C" { 62 | #endif 63 | 64 | /****************** 65 | * Basic routines * 66 | ******************/ 67 | 68 | /** 69 | * Open an existing file descriptor for reading or writing. 70 | * 71 | * @param fd file descriptor 72 | * @param mode mode matching /[rwu0-9]+/: 'r' for reading, 'w' for writing and a digit specifies 73 | * the zlib compression level; if both 'r' and 'w' are present, 'w' is ignored. 74 | * @return BGZF file handler; 0 on error 75 | */ 76 | BGZF* bgzf_dopen(int fd, const char *mode); 77 | 78 | /** 79 | * Open the specified file for reading or writing. 80 | */ 81 | BGZF* bgzf_open(const char* path, const char *mode); 82 | 83 | /** 84 | * Close the BGZF and free all associated resources. 
85 | * 86 | * @param fp BGZF file handler 87 | * @return 0 on success and -1 on error 88 | */ 89 | int bgzf_close(BGZF *fp); 90 | 91 | /** 92 | * Read up to _length_ bytes from the file storing into _data_. 93 | * 94 | * @param fp BGZF file handler 95 | * @param data data array to read into 96 | * @param length size of data to read 97 | * @return number of bytes actually read; 0 on end-of-file and -1 on error 98 | */ 99 | ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length); 100 | 101 | /** 102 | * Write _length_ bytes from _data_ to the file. 103 | * 104 | * @param fp BGZF file handler 105 | * @param data data array to write 106 | * @param length size of data to write 107 | * @return number of bytes actually written; -1 on error 108 | */ 109 | ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length); 110 | 111 | /** 112 | * Write the data in the buffer to the file. 113 | */ 114 | int bgzf_flush(BGZF *fp); 115 | 116 | /** 117 | * Return a virtual file pointer to the current location in the file. 118 | * No interpetation of the value should be made, other than a subsequent 119 | * call to bgzf_seek can be used to position the file at the same point. 120 | * Return value is non-negative on success. 121 | */ 122 | #define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) 123 | 124 | /** 125 | * Set the file to read from the location specified by _pos_. 
126 | * 127 | * @param fp BGZF file handler 128 | * @param pos virtual file offset returned by bgzf_tell() 129 | * @param whence must be SEEK_SET 130 | * @return 0 on success and -1 on error 131 | */ 132 | int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence); 133 | 134 | /** 135 | * Check if the BGZF end-of-file (EOF) marker is present 136 | * 137 | * @param fp BGZF file handler opened for reading 138 | * @return 1 if EOF is present; 0 if not or on I/O error 139 | */ 140 | int bgzf_check_EOF(BGZF *fp); 141 | 142 | /** 143 | * Check if a file is in the BGZF format 144 | * 145 | * @param fn file name 146 | * @return 1 if _fn_ is BGZF; 0 if not or on I/O error 147 | */ 148 | int bgzf_is_bgzf(const char *fn); 149 | 150 | /********************* 151 | * Advanced routines * 152 | *********************/ 153 | 154 | /** 155 | * Set the cache size. Only effective when compiled with -DBGZF_CACHE. 156 | * 157 | * @param fp BGZF file handler 158 | * @param size size of cache in bytes; 0 to disable caching (default) 159 | */ 160 | void bgzf_set_cache_size(BGZF *fp, int size); 161 | 162 | /** 163 | * Flush the file if the remaining buffer size is smaller than _size_ 164 | */ 165 | int bgzf_flush_try(BGZF *fp, ssize_t size); 166 | 167 | /** 168 | * Read one byte from a BGZF file. It is faster than bgzf_read() 169 | * @param fp BGZF file handler 170 | * @return byte read; -1 on end-of-file or error 171 | */ 172 | int bgzf_getc(BGZF *fp); 173 | 174 | /** 175 | * Read one line from a BGZF file. It is faster than bgzf_getc() 176 | * 177 | * @param fp BGZF file handler 178 | * @param delim delimitor 179 | * @param str string to write to; must be initialized 180 | * @return length of the string; 0 on end-of-file; negative on error 181 | */ 182 | int bgzf_getline(BGZF *fp, int delim, kstring_t *str); 183 | 184 | /** 185 | * Read the next BGZF block. 
186 | */ 187 | int bgzf_read_block(BGZF *fp); 188 | 189 | #ifdef __cplusplus 190 | } 191 | #endif 192 | 193 | #endif 194 | -------------------------------------------------------------------------------- /tabix-0.2.6/bgzip.c: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | */ 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include "bgzf.h" 33 | 34 | static const int WINDOW_SIZE = 64 * 1024; 35 | 36 | static int bgzip_main_usage() 37 | { 38 | fprintf(stderr, "\n"); 39 | fprintf(stderr, "Usage: bgzip [options] [file] ...\n\n"); 40 | fprintf(stderr, "Options: -c write on standard output, keep original files unchanged\n"); 41 | fprintf(stderr, " -d decompress\n"); 42 | fprintf(stderr, " -f overwrite files without asking\n"); 43 | fprintf(stderr, " -b INT decompress at virtual file pointer INT\n"); 44 | fprintf(stderr, " -s INT decompress INT bytes in the uncompressed file\n"); 45 | fprintf(stderr, " -h give this help\n"); 46 | fprintf(stderr, "\n"); 47 | return 1; 48 | } 49 | 50 | static int write_open(const char *fn, int is_forced) 51 | { 52 | int fd = -1; 53 | char c; 54 | if (!is_forced) { 55 | if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) { 56 | fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? 
", fn); 57 | scanf("%c", &c); 58 | if (c != 'Y' && c != 'y') { 59 | fprintf(stderr, "[bgzip] not overwritten\n"); 60 | exit(1); 61 | } 62 | } 63 | } 64 | if (fd < 0) { 65 | if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) { 66 | fprintf(stderr, "[bgzip] %s: Fail to write\n", fn); 67 | exit(1); 68 | } 69 | } 70 | return fd; 71 | } 72 | 73 | static void fail(BGZF* fp) 74 | { 75 | fprintf(stderr, "Error: %d\n", fp->errcode); 76 | exit(1); 77 | } 78 | 79 | int main(int argc, char **argv) 80 | { 81 | int c, compress, pstdout, is_forced; 82 | BGZF *fp; 83 | void *buffer; 84 | long start, end, size; 85 | 86 | compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; 87 | while((c = getopt(argc, argv, "cdhfb:s:")) >= 0){ 88 | switch(c){ 89 | case 'h': return bgzip_main_usage(); 90 | case 'd': compress = 0; break; 91 | case 'c': pstdout = 1; break; 92 | case 'b': start = atol(optarg); break; 93 | case 's': size = atol(optarg); break; 94 | case 'f': is_forced = 1; break; 95 | } 96 | } 97 | if (size >= 0) end = start + size; 98 | if (end >= 0 && end < start) { 99 | fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end); 100 | return 1; 101 | } 102 | if (compress == 1) { 103 | struct stat sbuf; 104 | int f_src = fileno(stdin); 105 | int f_dst = fileno(stdout); 106 | 107 | if ( argc>optind ) 108 | { 109 | if ( stat(argv[optind],&sbuf)<0 ) 110 | { 111 | fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); 112 | return 1; 113 | } 114 | 115 | if ((f_src = open(argv[optind], O_RDONLY)) < 0) { 116 | fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); 117 | return 1; 118 | } 119 | 120 | if (pstdout) 121 | f_dst = fileno(stdout); 122 | else 123 | { 124 | char *name = malloc(strlen(argv[optind]) + 5); 125 | strcpy(name, argv[optind]); 126 | strcat(name, ".gz"); 127 | f_dst = write_open(name, is_forced); 128 | if (f_dst < 0) return 1; 129 | free(name); 130 | } 131 | } 132 | else if (!pstdout && isatty(fileno((FILE 
*)stdout)) ) 133 | return bgzip_main_usage(); 134 | 135 | fp = bgzf_dopen(f_dst, "w"); 136 | buffer = malloc(WINDOW_SIZE); 137 | while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) 138 | if (bgzf_write(fp, buffer, c) < 0) fail(fp); 139 | // f_dst will be closed here 140 | if (bgzf_close(fp) < 0) fail(fp); 141 | if (argc > optind && !pstdout) unlink(argv[optind]); 142 | free(buffer); 143 | close(f_src); 144 | return 0; 145 | } else { 146 | struct stat sbuf; 147 | int f_dst; 148 | 149 | if ( argc>optind ) 150 | { 151 | if ( stat(argv[optind],&sbuf)<0 ) 152 | { 153 | fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); 154 | return 1; 155 | } 156 | char *name; 157 | int len = strlen(argv[optind]); 158 | if ( strcmp(argv[optind]+len-3,".gz") ) 159 | { 160 | fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); 161 | return 1; 162 | } 163 | fp = bgzf_open(argv[optind], "r"); 164 | if (fp == NULL) { 165 | fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]); 166 | return 1; 167 | } 168 | 169 | if (pstdout) { 170 | f_dst = fileno(stdout); 171 | } 172 | else { 173 | name = strdup(argv[optind]); 174 | name[strlen(name) - 3] = '\0'; 175 | f_dst = write_open(name, is_forced); 176 | free(name); 177 | } 178 | } 179 | else if (!pstdout && isatty(fileno((FILE *)stdin)) ) 180 | return bgzip_main_usage(); 181 | else 182 | { 183 | f_dst = fileno(stdout); 184 | fp = bgzf_dopen(fileno(stdin), "r"); 185 | if (fp == NULL) { 186 | fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); 187 | return 1; 188 | } 189 | } 190 | buffer = malloc(WINDOW_SIZE); 191 | if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp); 192 | while (1) { 193 | if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); 194 | else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? 
WINDOW_SIZE:(end - start)); 195 | if (c == 0) break; 196 | if (c < 0) fail(fp); 197 | start += c; 198 | write(f_dst, buffer, c); 199 | if (end >= 0 && start >= end) break; 200 | } 201 | free(buffer); 202 | if (bgzf_close(fp) < 0) fail(fp); 203 | if (!pstdout) unlink(argv[optind]); 204 | return 0; 205 | } 206 | } 207 | -------------------------------------------------------------------------------- /tabix-0.2.6/kseq.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008 Genome Research Ltd (GRL). 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | 26 | /* Contact: Heng Li */ 27 | 28 | /* 29 | 2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*" 30 | */ 31 | 32 | /* Last Modified: 12APR2009 */ 33 | 34 | #ifndef AC_KSEQ_H 35 | #define AC_KSEQ_H 36 | 37 | #include 38 | #include 39 | #include 40 | 41 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r 42 | #define KS_SEP_TAB 1 // isspace() && !' ' 43 | #define KS_SEP_MAX 1 44 | 45 | #define __KS_TYPE(type_t) \ 46 | typedef struct __kstream_t { \ 47 | unsigned char *buf; \ 48 | int begin, end, is_eof; \ 49 | type_t f; \ 50 | } kstream_t; 51 | 52 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) 53 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) 54 | 55 | #define __KS_BASIC(type_t, __bufsize) \ 56 | static inline kstream_t *ks_init(type_t f) \ 57 | { \ 58 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ 59 | ks->f = f; \ 60 | ks->buf = malloc(__bufsize); \ 61 | return ks; \ 62 | } \ 63 | static inline void ks_destroy(kstream_t *ks) \ 64 | { \ 65 | if (ks) { \ 66 | free(ks->buf); \ 67 | free(ks); \ 68 | } \ 69 | } 70 | 71 | #define __KS_GETC(__read, __bufsize) \ 72 | static inline int ks_getc(kstream_t *ks) \ 73 | { \ 74 | if (ks->is_eof && ks->begin >= ks->end) return -1; \ 75 | if (ks->begin >= ks->end) { \ 76 | ks->begin = 0; \ 77 | ks->end = __read(ks->f, ks->buf, __bufsize); \ 78 | if (ks->end < __bufsize) ks->is_eof = 1; \ 79 | if (ks->end == 0) return -1; \ 80 | } \ 81 | return (int)ks->buf[ks->begin++]; \ 82 | } 83 | 84 | #ifndef KSTRING_T 85 | #define KSTRING_T kstring_t 86 | typedef struct __kstring_t { 87 | size_t l, m; 88 | char *s; 89 | } kstring_t; 90 | #endif 91 | 92 | #ifndef kroundup32 93 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 94 | #endif 95 | 96 | #define __KS_GETUNTIL(__read, __bufsize) \ 97 | static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ 98 | { \ 99 | if (dret) 
*dret = 0; \ 100 | str->l = 0; \ 101 | if (ks->begin >= ks->end && ks->is_eof) return -1; \ 102 | for (;;) { \ 103 | int i; \ 104 | if (ks->begin >= ks->end) { \ 105 | if (!ks->is_eof) { \ 106 | ks->begin = 0; \ 107 | ks->end = __read(ks->f, ks->buf, __bufsize); \ 108 | if (ks->end < __bufsize) ks->is_eof = 1; \ 109 | if (ks->end == 0) break; \ 110 | } else break; \ 111 | } \ 112 | if (delimiter > KS_SEP_MAX) { \ 113 | for (i = ks->begin; i < ks->end; ++i) \ 114 | if (ks->buf[i] == delimiter) break; \ 115 | } else if (delimiter == KS_SEP_SPACE) { \ 116 | for (i = ks->begin; i < ks->end; ++i) \ 117 | if (isspace(ks->buf[i])) break; \ 118 | } else if (delimiter == KS_SEP_TAB) { \ 119 | for (i = ks->begin; i < ks->end; ++i) \ 120 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ 121 | } else i = 0; /* never come to here! */ \ 122 | if (str->m - str->l < i - ks->begin + 1) { \ 123 | str->m = str->l + (i - ks->begin) + 1; \ 124 | kroundup32(str->m); \ 125 | str->s = (char*)realloc(str->s, str->m); \ 126 | } \ 127 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ 128 | str->l = str->l + (i - ks->begin); \ 129 | ks->begin = i + 1; \ 130 | if (i < ks->end) { \ 131 | if (dret) *dret = ks->buf[i]; \ 132 | break; \ 133 | } \ 134 | } \ 135 | if (str->l == 0) { \ 136 | str->m = 1; \ 137 | str->s = (char*)calloc(1, 1); \ 138 | } \ 139 | str->s[str->l] = '\0'; \ 140 | return str->l; \ 141 | } 142 | 143 | #define KSTREAM_INIT(type_t, __read, __bufsize) \ 144 | __KS_TYPE(type_t) \ 145 | __KS_BASIC(type_t, __bufsize) \ 146 | __KS_GETC(__read, __bufsize) \ 147 | __KS_GETUNTIL(__read, __bufsize) 148 | 149 | #define __KSEQ_BASIC(type_t) \ 150 | static inline kseq_t *kseq_init(type_t fd) \ 151 | { \ 152 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ 153 | s->f = ks_init(fd); \ 154 | return s; \ 155 | } \ 156 | static inline void kseq_rewind(kseq_t *ks) \ 157 | { \ 158 | ks->last_char = 0; \ 159 | ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ 160 | } \ 161 | 
static inline void kseq_destroy(kseq_t *ks) \ 162 | { \ 163 | if (!ks) return; \ 164 | free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ 165 | ks_destroy(ks->f); \ 166 | free(ks); \ 167 | } 168 | 169 | /* Return value: 170 | >=0 length of the sequence (normal) 171 | -1 end-of-file 172 | -2 truncated quality string 173 | */ 174 | #define __KSEQ_READ \ 175 | static int kseq_read(kseq_t *seq) \ 176 | { \ 177 | int c; \ 178 | kstream_t *ks = seq->f; \ 179 | if (seq->last_char == 0) { /* then jump to the next header line */ \ 180 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ 181 | if (c == -1) return -1; /* end of file */ \ 182 | seq->last_char = c; \ 183 | } /* the first header char has been read */ \ 184 | seq->comment.l = seq->seq.l = seq->qual.l = 0; \ 185 | if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ 186 | if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ 187 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ 188 | if (isgraph(c)) { /* printable non-space character */ \ 189 | if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ 190 | seq->seq.m = seq->seq.l + 2; \ 191 | kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ 192 | seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ 193 | } \ 194 | seq->seq.s[seq->seq.l++] = (char)c; \ 195 | } \ 196 | } \ 197 | if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ 198 | seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ 199 | if (c != '+') return seq->seq.l; /* FASTA */ \ 200 | if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ 201 | seq->qual.m = seq->seq.m; \ 202 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ 203 | } \ 204 | while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ 205 | if (c == -1) return -2; /* we should not stop here */ \ 206 | while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ 207 | if (c >= 
33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ 208 | seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ 209 | seq->last_char = 0; /* we have not come to the next header line */ \ 210 | if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ 211 | return seq->seq.l; \ 212 | } 213 | 214 | #define __KSEQ_TYPE(type_t) \ 215 | typedef struct { \ 216 | kstring_t name, comment, seq, qual; \ 217 | int last_char; \ 218 | kstream_t *f; \ 219 | } kseq_t; 220 | 221 | #define KSEQ_INIT(type_t, __read) \ 222 | KSTREAM_INIT(type_t, __read, 4096) \ 223 | __KSEQ_TYPE(type_t) \ 224 | __KSEQ_BASIC(type_t) \ 225 | __KSEQ_READ 226 | 227 | #endif 228 | -------------------------------------------------------------------------------- /tabix-0.2.6/ksort.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008 Genome Research Ltd (GRL). 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Contact: Heng Li */ 27 | 28 | /* 29 | 2008-11-16 (0.1.4): 30 | 31 | * Fixed a bug in introsort() that happens in rare cases. 32 | 33 | 2008-11-05 (0.1.3): 34 | 35 | * Fixed a bug in introsort() for complex comparisons. 36 | 37 | * Fixed a bug in mergesort(). The previous version is not stable. 38 | 39 | 2008-09-15 (0.1.2): 40 | 41 | * Accelerated introsort. On my Mac (not on another Linux machine), 42 | my implementation is as fast as std::sort on random input. 43 | 44 | * Added combsort and in introsort, switch to combsort if the 45 | recursion is too deep. 46 | 47 | 2008-09-13 (0.1.1): 48 | 49 | * Added k-small algorithm 50 | 51 | 2008-09-05 (0.1.0): 52 | 53 | * Initial version 54 | 55 | */ 56 | 57 | #ifndef AC_KSORT_H 58 | #define AC_KSORT_H 59 | 60 | #include 61 | #include 62 | 63 | typedef struct { 64 | void *left, *right; 65 | int depth; 66 | } ks_isort_stack_t; 67 | 68 | #define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } 69 | 70 | #define KSORT_INIT(name, type_t, __sort_lt) \ 71 | void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ 72 | { \ 73 | type_t *a2[2], *a, *b; \ 74 | int curr, shift; \ 75 | \ 76 | a2[0] = array; \ 77 | a2[1] = temp? 
temp : (type_t*)malloc(sizeof(type_t) * n); \ 78 | for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ 134 | ks_heapadjust_##name(i, lsize, l); \ 135 | } \ 136 | void ks_heapsort_##name(size_t lsize, type_t l[]) \ 137 | { \ 138 | size_t i; \ 139 | for (i = lsize - 1; i > 0; --i) { \ 140 | type_t tmp; \ 141 | tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ 142 | } \ 143 | } \ 144 | inline void __ks_insertsort_##name(type_t *s, type_t *t) \ 145 | { \ 146 | type_t *i, *j, swap_tmp; \ 147 | for (i = s + 1; i < t; ++i) \ 148 | for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ 149 | swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ 150 | } \ 151 | } \ 152 | void ks_combsort_##name(size_t n, type_t a[]) \ 153 | { \ 154 | const double shrink_factor = 1.2473309501039786540366528676643; \ 155 | int do_swap; \ 156 | size_t gap = n; \ 157 | type_t tmp, *i, *j; \ 158 | do { \ 159 | if (gap > 2) { \ 160 | gap = (size_t)(gap / shrink_factor); \ 161 | if (gap == 9 || gap == 10) gap = 11; \ 162 | } \ 163 | do_swap = 0; \ 164 | for (i = a; i < a + n - gap; ++i) { \ 165 | j = i + gap; \ 166 | if (__sort_lt(*j, *i)) { \ 167 | tmp = *i; *i = *j; *j = tmp; \ 168 | do_swap = 1; \ 169 | } \ 170 | } \ 171 | } while (do_swap || gap > 2); \ 172 | if (gap != 1) __ks_insertsort_##name(a, a + n); \ 173 | } \ 174 | void ks_introsort_##name(size_t n, type_t a[]) \ 175 | { \ 176 | int d; \ 177 | ks_isort_stack_t *top, *stack; \ 178 | type_t rp, swap_tmp; \ 179 | type_t *s, *t, *i, *j, *k; \ 180 | \ 181 | if (n < 1) return; \ 182 | else if (n == 2) { \ 183 | if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ 184 | return; \ 185 | } \ 186 | for (d = 2; 1ul<>1) + 1; \ 197 | if (__sort_lt(*k, *i)) { \ 198 | if (__sort_lt(*k, *j)) k = j; \ 199 | } else k = __sort_lt(*j, *i)? 
i : j; \ 200 | rp = *k; \ 201 | if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ 202 | for (;;) { \ 203 | do ++i; while (__sort_lt(*i, rp)); \ 204 | do --j; while (i <= j && __sort_lt(rp, *j)); \ 205 | if (j <= i) break; \ 206 | swap_tmp = *i; *i = *j; *j = swap_tmp; \ 207 | } \ 208 | swap_tmp = *i; *i = *t; *t = swap_tmp; \ 209 | if (i-s > t-i) { \ 210 | if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ 211 | s = t-i > 16? i+1 : t; \ 212 | } else { \ 213 | if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ 214 | t = i-s > 16? i-1 : s; \ 215 | } \ 216 | } else { \ 217 | if (top == stack) { \ 218 | free(stack); \ 219 | __ks_insertsort_##name(a, a+n); \ 220 | return; \ 221 | } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ 222 | } \ 223 | } \ 224 | } \ 225 | /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ 226 | /* 0 <= kk < n */ \ 227 | type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ 228 | { \ 229 | type_t *low, *high, *k, *ll, *hh, *mid; \ 230 | low = arr; high = arr + n - 1; k = arr + kk; \ 231 | for (;;) { \ 232 | if (high <= low) return *k; \ 233 | if (high == low + 1) { \ 234 | if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ 235 | return *k; \ 236 | } \ 237 | mid = low + (high - low) / 2; \ 238 | if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ 239 | if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ 240 | if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ 241 | KSORT_SWAP(type_t, *mid, *(low+1)); \ 242 | ll = low + 1; hh = high; \ 243 | for (;;) { \ 244 | do ++ll; while (__sort_lt(*ll, *low)); \ 245 | do --hh; while (__sort_lt(*low, *hh)); \ 246 | if (hh < ll) break; \ 247 | KSORT_SWAP(type_t, *ll, *hh); \ 248 | } \ 249 | KSORT_SWAP(type_t, *low, *hh); \ 250 | if (hh <= k) low = ll; \ 251 | if (hh >= k) high = hh - 1; \ 252 | } \ 253 | } 254 | 255 | #define ks_mergesort(name, n, a, 
t) ks_mergesort_##name(n, a, t) 256 | #define ks_introsort(name, n, a) ks_introsort_##name(n, a) 257 | #define ks_combsort(name, n, a) ks_combsort_##name(n, a) 258 | #define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) 259 | #define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) 260 | #define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) 261 | #define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) 262 | 263 | #define ks_lt_generic(a, b) ((a) < (b)) 264 | #define ks_lt_str(a, b) (strcmp((a), (b)) < 0) 265 | 266 | typedef const char *ksstr_t; 267 | 268 | #define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) 269 | #define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) 270 | 271 | #endif 272 | -------------------------------------------------------------------------------- /tabix-0.2.6/main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "bgzf.h" 8 | #include "tabix.h" 9 | #include "knetfile.h" 10 | 11 | #define PACKAGE_VERSION "0.2.5 (r1005)" 12 | 13 | #define error(...) 
{ fprintf(stderr,__VA_ARGS__); return -1; } 14 | 15 | int reheader_file(const char *header, const char *file, int meta) 16 | { 17 | BGZF *fp = bgzf_open(file,"r"); 18 | if (bgzf_read_block(fp) != 0 || !fp->block_length) 19 | return -1; 20 | 21 | char *buffer = fp->uncompressed_block; 22 | int skip_until = 0; 23 | 24 | if ( buffer[0]==meta ) 25 | { 26 | skip_until = 1; 27 | 28 | // Skip the header 29 | while (1) 30 | { 31 | if ( buffer[skip_until]=='\n' ) 32 | { 33 | skip_until++; 34 | if ( skip_until>=fp->block_length ) 35 | { 36 | if (bgzf_read_block(fp) != 0 || !fp->block_length) 37 | error("no body?\n"); 38 | skip_until = 0; 39 | } 40 | // The header has finished 41 | if ( buffer[skip_until]!=meta ) break; 42 | } 43 | skip_until++; 44 | if ( skip_until>=fp->block_length ) 45 | { 46 | if (bgzf_read_block(fp) != 0 || !fp->block_length) 47 | error("no body?\n"); 48 | skip_until = 0; 49 | } 50 | } 51 | } 52 | 53 | FILE *fh = fopen(header,"r"); 54 | if ( !fh ) 55 | error("%s: %s", header,strerror(errno)); 56 | int page_size = getpagesize(); 57 | char *buf = valloc(page_size); 58 | BGZF *bgzf_out = bgzf_dopen(fileno(stdout), "w"); 59 | ssize_t nread; 60 | while ( (nread=fread(buf,1,page_size-1,fh))>0 ) 61 | { 62 | if ( nreaderrcode); 65 | } 66 | fclose(fh); 67 | 68 | if ( fp->block_length - skip_until > 0 ) 69 | { 70 | if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) 71 | error("Error: %d\n",fp->errcode); 72 | } 73 | if (bgzf_flush(bgzf_out) < 0) 74 | error("Error: %d\n",bgzf_out->errcode); 75 | 76 | while (1) 77 | { 78 | #ifdef _USE_KNETFILE 79 | nread = knet_read(fp->fp, buf, page_size); 80 | #else 81 | nread = fread(buf, 1, page_size, fp->fp); 82 | #endif 83 | if ( nread<=0 ) 84 | break; 85 | 86 | int count = fwrite(buf, 1, nread, bgzf_out->fp); 87 | if (count != nread) 88 | error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread); 89 | } 90 | 91 | if (bgzf_close(bgzf_out) < 0) 92 | error("Error: 
%d\n",bgzf_out->errcode); 93 | 94 | return 0; 95 | } 96 | 97 | 98 | int main(int argc, char *argv[]) 99 | { 100 | int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0, print_only_header = 0, bed_reg = 0; 101 | ti_conf_t conf = ti_conf_gff, *conf_ptr = NULL; 102 | const char *reheader = NULL; 103 | while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhHfBr:")) >= 0) { 104 | switch (c) { 105 | case 'B': bed_reg = 1; break; 106 | case '0': conf.preset |= TI_FLAG_UCSC; break; 107 | case 'S': skip = atoi(optarg); break; 108 | case 'c': meta = optarg[0]; break; 109 | case 'p': 110 | if (strcmp(optarg, "gff") == 0) conf_ptr = &ti_conf_gff; 111 | else if (strcmp(optarg, "bed") == 0) conf_ptr = &ti_conf_bed; 112 | else if (strcmp(optarg, "sam") == 0) conf_ptr = &ti_conf_sam; 113 | else if (strcmp(optarg, "vcf") == 0 || strcmp(optarg, "vcf4") == 0) conf_ptr = &ti_conf_vcf; 114 | else if (strcmp(optarg, "psltbl") == 0) conf_ptr = &ti_conf_psltbl; 115 | else { 116 | fprintf(stderr, "[main] unrecognized preset '%s'\n", optarg); 117 | return 1; 118 | } 119 | break; 120 | case 's': conf.sc = atoi(optarg); break; 121 | case 'b': conf.bc = atoi(optarg); break; 122 | case 'e': conf.ec = atoi(optarg); break; 123 | case 'l': list_chrms = 1; break; 124 | case 'h': print_header = 1; break; 125 | case 'H': print_only_header = 1; break; 126 | case 'f': force = 1; break; 127 | case 'r': reheader = optarg; break; 128 | } 129 | } 130 | if (optind == argc) { 131 | fprintf(stderr, "\n"); 132 | fprintf(stderr, "Program: tabix (TAB-delimited file InderXer)\n"); 133 | fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION); 134 | fprintf(stderr, "Usage: tabix [region1 [region2 [...]]]\n\n"); 135 | fprintf(stderr, "Options: -p STR preset: gff, bed, sam, vcf, psltbl [gff]\n"); 136 | fprintf(stderr, " -s INT sequence name column [1]\n"); 137 | fprintf(stderr, " -b INT start column [4]\n"); 138 | fprintf(stderr, " -e INT end column; can be identical to '-b' [5]\n"); 139 | fprintf(stderr, " 
-S INT skip first INT lines [0]\n"); 140 | fprintf(stderr, " -c CHAR symbol for comment/meta lines [#]\n"); 141 | fprintf(stderr, " -r FILE replace the header with the content of FILE [null]\n"); 142 | fprintf(stderr, " -B region1 is a BED file (entire file will be read)\n"); 143 | fprintf(stderr, " -0 zero-based coordinate\n"); 144 | fprintf(stderr, " -h print also the header lines\n"); 145 | fprintf(stderr, " -H print only the header lines\n"); 146 | fprintf(stderr, " -l list chromosome names\n"); 147 | fprintf(stderr, " -f force to overwrite the index\n"); 148 | fprintf(stderr, "\n"); 149 | return 1; 150 | } 151 | if ( !conf_ptr ) 152 | { 153 | int l = strlen(argv[optind]); 154 | int strcasecmp(const char *s1, const char *s2); 155 | if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &ti_conf_gff; 156 | else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &ti_conf_bed; 157 | else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &ti_conf_sam; 158 | else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &ti_conf_vcf; 159 | else if (l>=10 && strcasecmp(argv[optind]+l-10, ".psltbl.gz") == 0) conf_ptr = &ti_conf_psltbl; 160 | } 161 | if ( conf_ptr ) 162 | conf = *conf_ptr; 163 | 164 | if (skip >= 0) conf.line_skip = skip; 165 | if (meta >= 0) conf.meta_char = meta; 166 | if (list_chrms) { 167 | ti_index_t *idx; 168 | int i, n; 169 | const char **names; 170 | idx = ti_index_load(argv[optind]); 171 | if (idx == 0) { 172 | fprintf(stderr, "[main] fail to load the index file.\n"); 173 | return 1; 174 | } 175 | names = ti_seqname(idx, &n); 176 | for (i = 0; i < n; ++i) printf("%s\n", names[i]); 177 | free(names); 178 | ti_index_destroy(idx); 179 | return 0; 180 | } 181 | if (reheader) 182 | return reheader_file(reheader,argv[optind],conf.meta_char); 183 | 184 | struct stat stat_tbi,stat_vcf; 185 | char *fnidx = calloc(strlen(argv[optind]) + 5, 1); 186 | strcat(strcpy(fnidx, argv[optind]), 
".tbi"); 187 | 188 | if (optind + 1 == argc && !print_only_header) { 189 | if (force == 0) { 190 | if (stat(fnidx, &stat_tbi) == 0) 191 | { 192 | // Before complaining, check if the VCF file isn't newer. This is a common source of errors, 193 | // people tend not to notice that tabix failed 194 | stat(argv[optind], &stat_vcf); 195 | if ( stat_vcf.st_mtime <= stat_tbi.st_mtime ) 196 | { 197 | fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n"); 198 | free(fnidx); 199 | return 1; 200 | } 201 | } 202 | } 203 | if ( bgzf_is_bgzf(argv[optind])!=1 ) 204 | { 205 | fprintf(stderr,"[tabix] was bgzip used to compress this file? %s\n", argv[optind]); 206 | free(fnidx); 207 | return 1; 208 | } 209 | if ( !conf_ptr ) 210 | { 211 | // Building the index but the file type was neither recognised nor given. If no custom change 212 | // has been made, warn the user that GFF is used 213 | if ( conf.preset==ti_conf_gff.preset 214 | && conf.sc==ti_conf_gff.sc 215 | && conf.bc==ti_conf_gff.bc 216 | && conf.ec==ti_conf_gff.ec 217 | && conf.meta_char==ti_conf_gff.meta_char 218 | && conf.line_skip==ti_conf_gff.line_skip ) 219 | fprintf(stderr,"[tabix] The file type not recognised and -p not given, using the preset [gff].\n"); 220 | } 221 | return ti_index_build(argv[optind], &conf); 222 | } 223 | { // retrieve 224 | tabix_t *t; 225 | // On some systems, stat on non-existent files returns undefined value for sm_mtime, the user had to use -f 226 | int is_remote = (strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx) ? 1 : 0; 227 | if ( !is_remote ) 228 | { 229 | // Common source of errors: new VCF is used with an old index 230 | stat(fnidx, &stat_tbi); 231 | stat(argv[optind], &stat_vcf); 232 | if ( force==0 && stat_vcf.st_mtime > stat_tbi.st_mtime ) 233 | { 234 | fprintf(stderr, "[tabix] the index file either does not exist or is older than the vcf file. 
Please reindex.\n"); 235 | free(fnidx); 236 | return 1; 237 | } 238 | } 239 | free(fnidx); 240 | 241 | if ((t = ti_open(argv[optind], 0)) == 0) { 242 | fprintf(stderr, "[main] fail to open the data file.\n"); 243 | return 1; 244 | } 245 | if ( print_only_header ) 246 | { 247 | ti_iter_t iter; 248 | const char *s; 249 | int len; 250 | if (ti_lazy_index_load(t) < 0 && bed_reg == 0) { 251 | fprintf(stderr,"[tabix] failed to load the index file.\n"); 252 | return 1; 253 | } 254 | const ti_conf_t *idxconf = ti_get_conf(t->idx); 255 | iter = ti_query(t, 0, 0, 0); 256 | while ((s = ti_read(t, iter, &len)) != 0) { 257 | if ((int)(*s) != idxconf->meta_char) break; 258 | fputs(s, stdout); fputc('\n', stdout); 259 | } 260 | ti_iter_destroy(iter); 261 | return 0; 262 | } 263 | 264 | if (strcmp(argv[optind+1], ".") == 0) { // retrieve all 265 | ti_iter_t iter; 266 | const char *s; 267 | int len; 268 | iter = ti_query(t, 0, 0, 0); 269 | while ((s = ti_read(t, iter, &len)) != 0) { 270 | fputs(s, stdout); fputc('\n', stdout); 271 | } 272 | ti_iter_destroy(iter); 273 | } else { // retrieve from specified regions 274 | int i, len; 275 | ti_iter_t iter; 276 | const char *s; 277 | const ti_conf_t *idxconf; 278 | 279 | if (ti_lazy_index_load(t) < 0 && bed_reg == 0) { 280 | fprintf(stderr,"[tabix] failed to load the index file.\n"); 281 | return 1; 282 | } 283 | idxconf = ti_get_conf(t->idx); 284 | 285 | if ( print_header ) 286 | { 287 | // If requested, print the header lines here 288 | iter = ti_query(t, 0, 0, 0); 289 | while ((s = ti_read(t, iter, &len)) != 0) { 290 | if ((int)(*s) != idxconf->meta_char) break; 291 | fputs(s, stdout); fputc('\n', stdout); 292 | } 293 | ti_iter_destroy(iter); 294 | } 295 | if (bed_reg) { 296 | extern int bed_overlap(const void *_h, const char *chr, int beg, int end); 297 | extern void *bed_read(const char *fn); 298 | extern void bed_destroy(void *_h); 299 | 300 | const ti_conf_t *conf_ = idxconf? 
idxconf : &conf; // use the index file if available 301 | void *bed = bed_read(argv[optind+1]); // load the BED file 302 | ti_interval_t intv; 303 | 304 | if (bed == 0) { 305 | fprintf(stderr, "[main] fail to read the BED file.\n"); 306 | return 1; 307 | } 308 | iter = ti_query(t, 0, 0, 0); 309 | while ((s = ti_read(t, iter, &len)) != 0) { 310 | int c; 311 | ti_get_intv(conf_, len, (char*)s, &intv); 312 | c = *intv.se; *intv.se = '\0'; 313 | if (bed_overlap(bed, intv.ss, intv.beg, intv.end)) { 314 | *intv.se = c; 315 | puts(s); 316 | } 317 | *intv.se = c; 318 | } 319 | ti_iter_destroy(iter); 320 | bed_destroy(bed); 321 | } else { 322 | for (i = optind + 1; i < argc; ++i) { 323 | int tid, beg, end; 324 | if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) { 325 | iter = ti_queryi(t, tid, beg, end); 326 | while ((s = ti_read(t, iter, &len)) != 0) { 327 | fputs(s, stdout); fputc('\n', stdout); 328 | } 329 | ti_iter_destroy(iter); 330 | } 331 | // else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n"); 332 | } 333 | } 334 | } 335 | ti_close(t); 336 | } 337 | return 0; 338 | } 339 | -------------------------------------------------------------------------------- /tabix-0.2.6/python/tabixmodule.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * The MIT License 3 | * 4 | * Copyright (c) 2011 Seoul National University. 
5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining 7 | * a copy of this software and associated documentation files (the 8 | * "Software"), to deal in the Software without restriction, including 9 | * without limitation the rights to use, copy, modify, merge, publish, 10 | * distribute, sublicense, and/or sell copies of the Software, and to 11 | * permit persons to whom the Software is furnished to do so, subject to 12 | * the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be 15 | * included in all copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 21 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 22 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 23 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 
25 | */ 26 | 27 | /* 28 | * Contact: Hyeshik Chang 29 | */ 30 | 31 | #define PY_SSIZE_T_CLEAN 32 | #include "Python.h" 33 | #include "tabix.h" 34 | 35 | static PyObject *TabixError; 36 | 37 | typedef struct { 38 | PyObject_HEAD 39 | tabix_t *tb; 40 | char *fn; 41 | } TabixObject; 42 | 43 | typedef struct { 44 | PyObject_HEAD 45 | TabixObject *tbobj; 46 | ti_iter_t iter; 47 | } TabixIteratorObject; 48 | 49 | static PyTypeObject Tabix_Type, TabixIterator_Type; 50 | 51 | /* --- TabixIterator --------------------------------------------------- */ 52 | 53 | static PyObject * 54 | tabixiter_create(TabixObject *parentidx, ti_iter_t iter) 55 | { 56 | TabixIteratorObject *self; 57 | 58 | self = PyObject_New(TabixIteratorObject, &TabixIterator_Type); 59 | if (self == NULL) 60 | return NULL; 61 | 62 | Py_INCREF(parentidx); 63 | self->tbobj = parentidx; 64 | self->iter = iter; 65 | 66 | return (PyObject *)self; 67 | } 68 | 69 | static void 70 | tabixiter_dealloc(TabixIteratorObject *self) 71 | { 72 | ti_iter_destroy(self->iter); 73 | Py_DECREF(self->tbobj); 74 | PyObject_Del(self); 75 | } 76 | 77 | static PyObject * 78 | tabixiter_iter(PyObject *self) 79 | { 80 | Py_INCREF(self); 81 | return self; 82 | } 83 | 84 | #if PY_MAJOR_VERSION < 3 85 | # define PYOBJECT_FROM_STRING_AND_SIZE PyString_FromStringAndSize 86 | #else 87 | # define PYOBJECT_FROM_STRING_AND_SIZE PyUnicode_FromStringAndSize 88 | #endif 89 | 90 | static PyObject * 91 | tabixiter_iternext(TabixIteratorObject *self) 92 | { 93 | const char *chunk; 94 | int len, i; 95 | 96 | chunk = ti_read(self->tbobj->tb, self->iter, &len); 97 | if (chunk != NULL) { 98 | PyObject *ret, *column; 99 | Py_ssize_t colidx; 100 | const char *ptr, *begin; 101 | 102 | ret = PyList_New(0); 103 | if (ret == NULL) 104 | return NULL; 105 | 106 | colidx = 0; 107 | ptr = begin = chunk; 108 | for (i = len; i > 0; i--, ptr++) 109 | if (*ptr == '\t') { 110 | column = PYOBJECT_FROM_STRING_AND_SIZE(begin, 111 | (Py_ssize_t)(ptr - begin)); 112 | if 
(column == NULL || PyList_Append(ret, column) == -1) { 113 | Py_DECREF(ret); 114 | return NULL; 115 | } 116 | 117 | Py_DECREF(column); 118 | begin = ptr + 1; 119 | colidx++; 120 | } 121 | 122 | column = PYOBJECT_FROM_STRING_AND_SIZE(begin, (Py_ssize_t)(ptr - begin)); 123 | if (column == NULL || PyList_Append(ret, column) == -1) { 124 | Py_DECREF(ret); 125 | return NULL; 126 | } 127 | Py_DECREF(column); 128 | 129 | return ret; 130 | } 131 | else 132 | return NULL; 133 | } 134 | 135 | static PyMethodDef tabixiter_methods[] = { 136 | {NULL, NULL} /* sentinel */ 137 | }; 138 | 139 | static PyTypeObject TabixIterator_Type = { 140 | PyVarObject_HEAD_INIT(NULL, 0) 141 | "tabix.TabixIterator", /*tp_name*/ 142 | sizeof(TabixIteratorObject), /*tp_basicsize*/ 143 | 0, /*tp_itemsize*/ 144 | /* methods */ 145 | (destructor)tabixiter_dealloc, /*tp_dealloc*/ 146 | 0, /*tp_print*/ 147 | 0, /*tp_getattr*/ 148 | 0, /*tp_setattr*/ 149 | 0, /*tp_compare*/ 150 | 0, /*tp_repr*/ 151 | 0, /*tp_as_number*/ 152 | 0, /*tp_as_sequence*/ 153 | 0, /*tp_as_mapping*/ 154 | 0, /*tp_hash*/ 155 | 0, /*tp_call*/ 156 | 0, /*tp_str*/ 157 | 0, /*tp_getattro*/ 158 | 0, /*tp_setattro*/ 159 | 0, /*tp_as_buffer*/ 160 | Py_TPFLAGS_DEFAULT, /*tp_flags*/ 161 | 0, /*tp_doc*/ 162 | 0, /*tp_traverse*/ 163 | 0, /*tp_clear*/ 164 | 0, /*tp_richcompare*/ 165 | 0, /*tp_weaklistoffset*/ 166 | tabixiter_iter, /*tp_iter*/ 167 | (iternextfunc)tabixiter_iternext, /*tp_iternext*/ 168 | tabixiter_methods, /*tp_methods*/ 169 | 0, /*tp_members*/ 170 | 0, /*tp_getset*/ 171 | 0, /*tp_base*/ 172 | 0, /*tp_dict*/ 173 | 0, /*tp_descr_get*/ 174 | 0, /*tp_descr_set*/ 175 | 0, /*tp_dictoffset*/ 176 | 0, /*tp_init*/ 177 | 0, /*tp_alloc*/ 178 | 0, /*tp_new*/ 179 | 0, /*tp_free*/ 180 | 0, /*tp_is_gc*/ 181 | }; 182 | 183 | 184 | /* --- Tabix ----------------------------------------------------------- */ 185 | 186 | static PyObject * 187 | tabix_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 188 | { 189 | TabixObject *self; 190 | 
const char *fn, *fnidx=NULL; 191 | static char *kwnames[]={"fn", "fnidx", NULL}; 192 | tabix_t *tb; 193 | 194 | if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|z:Tabix", 195 | kwnames, &fn, &fnidx)) 196 | return NULL; 197 | 198 | tb = ti_open(fn, fnidx); 199 | if (tb == NULL) { 200 | PyErr_SetString(TabixError, "Can't open the index file."); 201 | return NULL; 202 | } 203 | 204 | self = (TabixObject *)type->tp_alloc(type, 0); 205 | if (self == NULL) 206 | return NULL; 207 | 208 | self->tb = tb; 209 | self->fn = strdup(fn); 210 | 211 | return (PyObject *)self; 212 | } 213 | 214 | static void 215 | tabix_dealloc(TabixObject *self) 216 | { 217 | free(self->fn); 218 | ti_close(self->tb); 219 | PyObject_Del(self); 220 | } 221 | 222 | static PyObject * 223 | tabix_query(TabixObject *self, PyObject *args) 224 | { 225 | char *name; 226 | int begin, end; 227 | ti_iter_t result; 228 | 229 | if (!PyArg_ParseTuple(args, "sii:query", &name, &begin, &end)) 230 | return NULL; 231 | 232 | result = ti_query(self->tb, name, begin, end); 233 | if (result == NULL) { 234 | PyErr_SetString(TabixError, "query failed"); 235 | return NULL; 236 | } 237 | 238 | return tabixiter_create(self, result); 239 | } 240 | 241 | static PyObject * 242 | tabix_queryi(TabixObject *self, PyObject *args) 243 | { 244 | int tid, begin, end; 245 | ti_iter_t result; 246 | 247 | if (!PyArg_ParseTuple(args, "iii:queryi", &tid, &begin, &end)) 248 | return NULL; 249 | 250 | result = ti_queryi(self->tb, tid, begin, end); 251 | if (result == NULL) { 252 | PyErr_SetString(TabixError, "query failed"); 253 | return NULL; 254 | } 255 | 256 | return tabixiter_create(self, result); 257 | } 258 | 259 | static PyObject * 260 | tabix_querys(TabixObject *self, PyObject *args) 261 | { 262 | const char *reg; 263 | ti_iter_t result; 264 | 265 | if (!PyArg_ParseTuple(args, "s:querys", ®)) 266 | return NULL; 267 | 268 | result = ti_querys(self->tb, reg); 269 | if (result == NULL) { 270 | PyErr_SetString(TabixError, "query 
failed"); 271 | return NULL; 272 | } 273 | 274 | return tabixiter_create(self, result); 275 | } 276 | 277 | static PyObject * 278 | tabix_repr(TabixObject *self) 279 | { 280 | #if PY_MAJOR_VERSION < 3 281 | return PyString_FromFormat("", self->fn); 282 | #else 283 | return PyUnicode_FromFormat("", self->fn); 284 | #endif 285 | } 286 | 287 | static PyMethodDef tabix_methods[] = { 288 | {"query", (PyCFunction)tabix_query, METH_VARARGS, 289 | PyDoc_STR("T.query(name, begin, end) -> iterator")}, 290 | {"queryi", (PyCFunction)tabix_queryi, METH_VARARGS, 291 | PyDoc_STR("T.queryi(tid, begin, id) -> iterator")}, 292 | {"querys", (PyCFunction)tabix_querys, METH_VARARGS, 293 | PyDoc_STR("T.querys(region) -> iterator")}, 294 | {NULL, NULL} /* sentinel */ 295 | }; 296 | 297 | static PyTypeObject Tabix_Type = { 298 | /* The ob_type field must be initialized in the module init function 299 | * to be portable to Windows without using C++. */ 300 | PyVarObject_HEAD_INIT(NULL, 0) 301 | "tabix.Tabix", /*tp_name*/ 302 | sizeof(TabixObject), /*tp_basicsize*/ 303 | 0, /*tp_itemsize*/ 304 | /* methods */ 305 | (destructor)tabix_dealloc, /*tp_dealloc*/ 306 | 0, /*tp_print*/ 307 | 0, /*tp_getattr*/ 308 | 0, /*tp_setattr*/ 309 | 0, /*tp_compare*/ 310 | (reprfunc)tabix_repr, /*tp_repr*/ 311 | 0, /*tp_as_number*/ 312 | 0, /*tp_as_sequence*/ 313 | 0, /*tp_as_mapping*/ 314 | 0, /*tp_hash*/ 315 | 0, /*tp_call*/ 316 | 0, /*tp_str*/ 317 | 0, /*tp_getattro*/ 318 | 0, /*tp_setattro*/ 319 | 0, /*tp_as_buffer*/ 320 | Py_TPFLAGS_DEFAULT, /*tp_flags*/ 321 | 0, /*tp_doc*/ 322 | 0, /*tp_traverse*/ 323 | 0, /*tp_clear*/ 324 | 0, /*tp_richcompare*/ 325 | 0, /*tp_weaklistoffset*/ 326 | 0, /*tp_iter*/ 327 | 0, /*tp_iternext*/ 328 | tabix_methods, /*tp_methods*/ 329 | 0, /*tp_members*/ 330 | 0, /*tp_getset*/ 331 | 0, /*tp_base*/ 332 | 0, /*tp_dict*/ 333 | 0, /*tp_descr_get*/ 334 | 0, /*tp_descr_set*/ 335 | 0, /*tp_dictoffset*/ 336 | 0, /*tp_init*/ 337 | 0, /*tp_alloc*/ 338 | (newfunc)tabix_new, /*tp_new*/ 
339 | 0, /*tp_free*/ 340 | 0, /*tp_is_gc*/ 341 | }; 342 | /* --------------------------------------------------------------------- */ 343 | 344 | static PyMethodDef tabix_functions[] = { 345 | {NULL, NULL} /* sentinel */ 346 | }; 347 | 348 | PyDoc_STRVAR(module_doc, 349 | "Python interface to tabix, Heng Li's generic indexer for TAB-delimited " 350 | "genome position filesThis is a template module just for instruction."); 351 | 352 | #if PY_MAJOR_VERSION >= 3 353 | static struct PyModuleDef tabixmodule = { 354 | PyModuleDef_HEAD_INIT, 355 | "tabix", 356 | module_doc, 357 | -1, 358 | tabix_functions, 359 | NULL, 360 | NULL, 361 | NULL, 362 | NULL 363 | }; 364 | #endif 365 | 366 | #if PY_MAJOR_VERSION < 3 367 | PyMODINIT_FUNC inittabix(void) 368 | #else 369 | PyMODINIT_FUNC PyInit_tabix(void) 370 | #endif 371 | { 372 | PyObject *m; 373 | 374 | if (PyType_Ready(&Tabix_Type) < 0) 375 | goto fail; 376 | if (PyType_Ready(&TabixIterator_Type) < 0) 377 | goto fail; 378 | 379 | #if PY_MAJOR_VERSION < 3 380 | m = Py_InitModule3("tabix", tabix_functions, module_doc); 381 | #else 382 | m = PyModule_Create(&tabixmodule); 383 | #endif 384 | if (m == NULL) 385 | goto fail; 386 | 387 | if (TabixError == NULL) { 388 | TabixError = PyErr_NewException("tabix.error", NULL, NULL); 389 | if (TabixError == NULL) 390 | goto fail; 391 | } 392 | Py_INCREF(TabixError); 393 | PyModule_AddObject(m, "error", TabixError); 394 | 395 | PyModule_AddObject(m, "Tabix", (PyObject *)&Tabix_Type); 396 | PyModule_AddObject(m, "TabixIterator", (PyObject *)&TabixIterator_Type); 397 | 398 | #if PY_MAJOR_VERSION >= 3 399 | return m; 400 | #endif 401 | 402 | fail: 403 | #if PY_MAJOR_VERSION < 3 404 | return; 405 | #else 406 | return NULL; 407 | #endif 408 | } 409 | -------------------------------------------------------------------------------- /tabix-0.2.6/TabixReader.java: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2010 Broad 
Institute. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | 26 | /* Contact: Heng Li */ 27 | 28 | import net.sf.samtools.util.BlockCompressedInputStream; 29 | 30 | import java.io.*; 31 | import java.nio.*; 32 | import java.util.HashMap; 33 | import java.util.ArrayList; 34 | import java.util.Arrays; 35 | import java.lang.StringBuffer; 36 | 37 | public class TabixReader 38 | { 39 | private String mFn; 40 | private BlockCompressedInputStream mFp; 41 | 42 | private int mPreset; 43 | private int mSc; 44 | private int mBc; 45 | private int mEc; 46 | private int mMeta; 47 | private int mSkip; 48 | private String[] mSeq; 49 | 50 | private HashMap mChr2tid; 51 | 52 | private static int MAX_BIN = 37450; 53 | private static int TAD_MIN_CHUNK_GAP = 32768; 54 | private static int TAD_LIDX_SHIFT = 14; 55 | 56 | private class TPair64 implements Comparable { 57 | long u, v; 58 | public TPair64(final long _u, final long _v) { 59 | u = _u; v = _v; 60 | } 61 | public TPair64(final TPair64 p) { 62 | u = p.u; v = p.v; 63 | } 64 | public int compareTo(final TPair64 p) { 65 | return u == p.u? 0 : ((u < p.u) ^ (u < 0) ^ (p.u < 0))? 
-1 : 1; // unsigned 64-bit comparison 66 | } 67 | }; 68 | 69 | private class TIndex { 70 | HashMap b; // binning index 71 | long[] l; // linear index 72 | }; 73 | private TIndex[] mIndex; 74 | 75 | private class TIntv { 76 | int tid, beg, end; 77 | }; 78 | 79 | private static boolean less64(final long u, final long v) { // unsigned 64-bit comparison 80 | return (u < v) ^ (u < 0) ^ (v < 0); 81 | } 82 | 83 | /** 84 | * The constructor 85 | * 86 | * @param fn File name of the data file 87 | */ 88 | public TabixReader(final String fn) throws IOException { 89 | mFn = fn; 90 | mFp = new BlockCompressedInputStream(new File(fn)); 91 | readIndex(); 92 | } 93 | 94 | private static int reg2bins(final int beg, final int _end, final int[] list) { 95 | int i = 0, k, end = _end; 96 | if (beg >= end) return 0; 97 | if (end >= 1<<29) end = 1<<29; 98 | --end; 99 | list[i++] = 0; 100 | for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; 101 | for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k; 102 | for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k; 103 | for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k; 104 | for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k; 105 | return i; 106 | } 107 | 108 | public static int readInt(final InputStream is) throws IOException { 109 | byte[] buf = new byte[4]; 110 | is.read(buf); 111 | return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getInt(); 112 | } 113 | 114 | public static long readLong(final InputStream is) throws IOException { 115 | byte[] buf = new byte[8]; 116 | is.read(buf); 117 | return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getLong(); 118 | } 119 | 120 | public static String readLine(final InputStream is) throws IOException { 121 | StringBuffer buf = new StringBuffer(); 122 | int c; 123 | while ((c = is.read()) >= 0 && c != '\n') 124 | buf.append((char)c); 125 | if (c < 0) return null; 126 | return buf.toString(); 127 | } 128 | 129 | /** 130 
| * Read the Tabix index from a file 131 | * 132 | * @param fp File pointer 133 | */ 134 | public void readIndex(final File fp) throws IOException { 135 | if (fp == null) return; 136 | BlockCompressedInputStream is = new BlockCompressedInputStream(fp); 137 | byte[] buf = new byte[4]; 138 | 139 | is.read(buf, 0, 4); // read "TBI\1" 140 | mSeq = new String[readInt(is)]; // # sequences 141 | mChr2tid = new HashMap(); 142 | mPreset = readInt(is); 143 | mSc = readInt(is); 144 | mBc = readInt(is); 145 | mEc = readInt(is); 146 | mMeta = readInt(is); 147 | mSkip = readInt(is); 148 | // read sequence dictionary 149 | int i, j, k, l = readInt(is); 150 | buf = new byte[l]; 151 | is.read(buf); 152 | for (i = j = k = 0; i < buf.length; ++i) { 153 | if (buf[i] == 0) { 154 | byte[] b = new byte[i - j]; 155 | System.arraycopy(buf, j, b, 0, b.length); 156 | String s = new String(b); 157 | mChr2tid.put(s, k); 158 | mSeq[k++] = s; 159 | j = i + 1; 160 | } 161 | } 162 | // read the index 163 | mIndex = new TIndex[mSeq.length]; 164 | for (i = 0; i < mSeq.length; ++i) { 165 | // the binning index 166 | int n_bin = readInt(is); 167 | mIndex[i] = new TIndex(); 168 | mIndex[i].b = new HashMap(); 169 | for (j = 0; j < n_bin; ++j) { 170 | int bin = readInt(is); 171 | TPair64[] chunks = new TPair64[readInt(is)]; 172 | for (k = 0; k < chunks.length; ++k) { 173 | long u = readLong(is); 174 | long v = readLong(is); 175 | chunks[k] = new TPair64(u, v); // in C, this is inefficient 176 | } 177 | mIndex[i].b.put(bin, chunks); 178 | } 179 | // the linear index 180 | mIndex[i].l = new long[readInt(is)]; 181 | for (k = 0; k < mIndex[i].l.length; ++k) 182 | mIndex[i].l[k] = readLong(is); 183 | } 184 | // close 185 | is.close(); 186 | } 187 | 188 | /** 189 | * Read the Tabix index from the default file. 190 | */ 191 | public void readIndex() throws IOException { 192 | readIndex(new File(mFn + ".tbi")); 193 | } 194 | 195 | /** 196 | * Read one line from the data file. 
197 | */ 198 | public String readLine() throws IOException { 199 | return readLine(mFp); 200 | } 201 | 202 | private int chr2tid(final String chr) { 203 | if (mChr2tid.containsKey(chr)) return mChr2tid.get(chr); 204 | else return -1; 205 | } 206 | 207 | /** 208 | * Parse a region in the format of "chr1", "chr1:100" or "chr1:100-1000" 209 | * 210 | * @param reg Region string 211 | * @return An array where the three elements are sequence_id, 212 | * region_begin and region_end. On failure, sequence_id==-1. 213 | */ 214 | public int[] parseReg(final String reg) { // FIXME: NOT working when the sequence name contains : or -. 215 | String chr; 216 | int colon, hyphen; 217 | int[] ret = new int[3]; 218 | colon = reg.indexOf(':'); hyphen = reg.indexOf('-'); 219 | chr = colon >= 0? reg.substring(0, colon) : reg; 220 | ret[1] = colon >= 0? Integer.parseInt(reg.substring(colon+1, hyphen >= 0? hyphen : reg.length())) - 1 : 0; 221 | ret[2] = hyphen >= 0? Integer.parseInt(reg.substring(hyphen+1)) : 0x7fffffff; 222 | ret[0] = chr2tid(chr); 223 | return ret; 224 | } 225 | 226 | private TIntv getIntv(final String s) { 227 | TIntv intv = new TIntv(); 228 | int col = 0, end = 0, beg = 0; 229 | while ((end = s.indexOf('\t', beg)) >= 0 || end == -1) { 230 | ++col; 231 | if (col == mSc) { 232 | intv.tid = chr2tid(s.substring(beg, end)); 233 | } else if (col == mBc) { 234 | intv.beg = intv.end = Integer.parseInt(s.substring(beg, end==-1?s.length():end)); 235 | if ((mPreset&0x10000) != 0) ++intv.end; 236 | else --intv.beg; 237 | if (intv.beg < 0) intv.beg = 0; 238 | if (intv.end < 1) intv.end = 1; 239 | } else { // FIXME: SAM supports are not tested yet 240 | if ((mPreset&0xffff) == 0) { // generic 241 | if (col == mEc) 242 | intv.end = Integer.parseInt(s.substring(beg, end)); 243 | } else if ((mPreset&0xffff) == 1) { // SAM 244 | if (col == 6) { // CIGAR 245 | int l = 0, i, j; 246 | String cigar = s.substring(beg, end); 247 | for (i = j = 0; i < cigar.length(); ++i) { 248 | if 
(cigar.charAt(i) > '9') { 249 | int op = cigar.charAt(i); 250 | if (op == 'M' || op == 'D' || op == 'N') 251 | l += Integer.parseInt(cigar.substring(j, i)); 252 | } 253 | } 254 | intv.end = intv.beg + l; 255 | } 256 | } else if ((mPreset&0xffff) == 2) { // VCF 257 | String alt; 258 | alt = end >= 0? s.substring(beg, end) : s.substring(beg); 259 | if (col == 4) { // REF 260 | if (alt.length() > 0) intv.end = intv.beg + alt.length(); 261 | } else if (col == 8) { // INFO 262 | int e_off = -1, i = alt.indexOf("END="); 263 | if (i == 0) e_off = 4; 264 | else if (i > 0) { 265 | i = alt.indexOf(";END="); 266 | if (i >= 0) e_off = i + 5; 267 | } 268 | if (e_off > 0) { 269 | i = alt.indexOf(";", e_off); 270 | intv.end = Integer.parseInt(i > e_off? alt.substring(e_off, i) : alt.substring(e_off)); 271 | } 272 | } 273 | } 274 | } 275 | if (end == -1) break; 276 | beg = end + 1; 277 | } 278 | return intv; 279 | } 280 | 281 | public class Iterator { 282 | private int i, n_seeks; 283 | private int tid, beg, end; 284 | private TPair64[] off; 285 | private long curr_off; 286 | private boolean iseof; 287 | 288 | public Iterator(final int _tid, final int _beg, final int _end, final TPair64[] _off) { 289 | i = -1; n_seeks = 0; curr_off = 0; iseof = false; 290 | off = _off; tid = _tid; beg = _beg; end = _end; 291 | } 292 | 293 | public String next() throws IOException { 294 | if (iseof) return null; 295 | for (;;) { 296 | if (curr_off == 0 || !less64(curr_off, off[i].v)) { // then jump to the next chunk 297 | if (i == off.length - 1) break; // no more chunks 298 | if (i >= 0) assert(curr_off == off[i].v); // otherwise bug 299 | if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek 300 | mFp.seek(off[i+1].u); 301 | curr_off = mFp.getFilePointer(); 302 | ++n_seeks; 303 | } 304 | ++i; 305 | } 306 | String s; 307 | if ((s = readLine(mFp)) != null) { 308 | TIntv intv; 309 | char[] str = s.toCharArray(); 310 | curr_off = mFp.getFilePointer(); 311 | if (str.length == 0 || 
str[0] == mMeta) continue; 312 | intv = getIntv(s); 313 | if (intv.tid != tid || intv.beg >= end) break; // no need to proceed 314 | else if (intv.end > beg && intv.beg < end) return s; // overlap; return 315 | } else break; // end of file 316 | } 317 | iseof = true; 318 | return null; 319 | } 320 | }; 321 | 322 | public Iterator query(final int tid, final int beg, final int end) { 323 | TPair64[] off, chunks; 324 | long min_off; 325 | TIndex idx = mIndex[tid]; 326 | int[] bins = new int[MAX_BIN]; 327 | int i, l, n_off, n_bins = reg2bins(beg, end, bins); 328 | if (idx.l.length > 0) 329 | min_off = (beg>>TAD_LIDX_SHIFT >= idx.l.length)? idx.l[idx.l.length-1] : idx.l[beg>>TAD_LIDX_SHIFT]; 330 | else min_off = 0; 331 | for (i = n_off = 0; i < n_bins; ++i) { 332 | if ((chunks = idx.b.get(bins[i])) != null) 333 | n_off += chunks.length; 334 | } 335 | if (n_off == 0) return null; 336 | off = new TPair64[n_off]; 337 | for (i = n_off = 0; i < n_bins; ++i) 338 | if ((chunks = idx.b.get(bins[i])) != null) 339 | for (int j = 0; j < chunks.length; ++j) 340 | if (less64(min_off, chunks[j].v)) 341 | off[n_off++] = new TPair64(chunks[j]); 342 | if (n_off == 0) return null; 343 | Arrays.sort(off, 0, n_off); 344 | // resolve completely contained adjacent blocks 345 | for (i = 1, l = 0; i < n_off; ++i) { 346 | if (less64(off[l].v, off[i].v)) { 347 | ++l; 348 | off[l].u = off[i].u; off[l].v = off[i].v; 349 | } 350 | } 351 | n_off = l + 1; 352 | // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing 353 | for (i = 1; i < n_off; ++i) 354 | if (!less64(off[i-1].v, off[i].u)) off[i-1].v = off[i].u; 355 | // merge adjacent blocks 356 | for (i = 1, l = 0; i < n_off; ++i) { 357 | if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; 358 | else { 359 | ++l; 360 | off[l].u = off[i].u; 361 | off[l].v = off[i].v; 362 | } 363 | } 364 | n_off = l + 1; 365 | // return 366 | TPair64[] ret = new TPair64[n_off]; 367 | for (i = 0; i < n_off; ++i) ret[i] = new 
TPair64(off[i].u, off[i].v); // in C, this is inefficient 368 | return new TabixReader.Iterator(tid, beg, end, ret); 369 | } 370 | 371 | public Iterator query(final String reg) { 372 | int[] x = parseReg(reg); 373 | return query(x[0], x[1], x[2]); 374 | } 375 | 376 | public static void main(String[] args) { 377 | if (args.length < 1) { 378 | System.out.println("Usage: java -cp .:sam.jar TabixReader [region]"); 379 | System.exit(1); 380 | } 381 | try { 382 | TabixReader tr = new TabixReader(args[0]); 383 | String s; 384 | if (args.length == 1) { // no region is specified; print the whole file 385 | while ((s = tr.readLine()) != null) 386 | System.out.println(s); 387 | } else { // a region is specified; random access 388 | TabixReader.Iterator iter = tr.query(args[1]); // get the iterator 389 | while (iter != null && (s = iter.next()) != null) 390 | System.out.println(s); 391 | } 392 | } catch (IOException e) { 393 | } 394 | } 395 | } 396 | -------------------------------------------------------------------------------- /tabix-0.2.6/khash.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008 Genome Research Ltd (GRL). 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 
15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Contact: Heng Li */ 27 | 28 | /* 29 | An example: 30 | 31 | #include "khash.h" 32 | KHASH_MAP_INIT_INT(32, char) 33 | int main() { 34 | int ret, is_missing; 35 | khiter_t k; 36 | khash_t(32) *h = kh_init(32); 37 | k = kh_put(32, h, 5, &ret); 38 | if (!ret) kh_del(32, h, k); 39 | kh_value(h, k) = 10; 40 | k = kh_get(32, h, 10); 41 | is_missing = (k == kh_end(h)); 42 | k = kh_get(32, h, 5); 43 | kh_del(32, h, k); 44 | for (k = kh_begin(h); k != kh_end(h); ++k) 45 | if (kh_exist(h, k)) kh_value(h, k) = 1; 46 | kh_destroy(32, h); 47 | return 0; 48 | } 49 | */ 50 | 51 | /* 52 | 2008-09-19 (0.2.3): 53 | 54 | * Corrected the example 55 | * Improved interfaces 56 | 57 | 2008-09-11 (0.2.2): 58 | 59 | * Improved speed a little in kh_put() 60 | 61 | 2008-09-10 (0.2.1): 62 | 63 | * Added kh_clear() 64 | * Fixed a compiling error 65 | 66 | 2008-09-02 (0.2.0): 67 | 68 | * Changed to token concatenation which increases flexibility. 69 | 70 | 2008-08-31 (0.1.2): 71 | 72 | * Fixed a bug in kh_get(), which has not been tested previously. 73 | 74 | 2008-08-31 (0.1.1): 75 | 76 | * Added destructor 77 | */ 78 | 79 | 80 | #ifndef __AC_KHASH_H 81 | #define __AC_KHASH_H 82 | 83 | /*! 84 | @header 85 | 86 | Generic hash table library. 
  @copyright Heng Li
 */

#define AC_VERSION_KHASH_H "0.2.2"

/* NOTE(review): the operands of the three #include directives below are
 * missing in this copy of the file; the code that follows uses calloc/
 * malloc/realloc/free, memset/strcmp and uint32_t/uint64_t, so confirm the
 * header names against an upstream khash.h. */
#include
#include
#include

/* Bucket index / iterator type used throughout the table. */
typedef uint32_t khint_t;
typedef khint_t khiter_t;

/* Candidate table sizes. kh_resize_<name>() always picks n_buckets from
 * this list. */
#define __ac_HASH_PRIME_SIZE 32
static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
{
  0ul,          3ul,          11ul,         23ul,         53ul,
  97ul,         193ul,        389ul,        769ul,        1543ul,
  3079ul,       6151ul,       12289ul,      24593ul,      49157ul,
  98317ul,      196613ul,     393241ul,     786433ul,     1572869ul,
  3145739ul,    6291469ul,    12582917ul,   25165843ul,   50331653ul,
  100663319ul,  201326611ul,  402653189ul,  805306457ul,  1610612741ul,
  3221225473ul, 4294967291ul
};

/* Per-bucket state lives in a packed array: two bits per bucket, sixteen
 * buckets per uint32_t (word index i>>4, bit position (i&0xf)<<1).
 * Within each pair, value 2 is the "empty" bit and value 1 is the "deleted"
 * bit; a bucket holds live data only when both bits are clear. */
#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))

/* Fraction of n_buckets that may be occupied before kh_put_<name>()
 * triggers a resize (see upper_bound). */
static const double __ac_HASH_UPPER = 0.77;

/* KHASH_INIT instantiates a complete open-addressing hash table for one
 * key/value type pair. It defines the struct kh_<name>_t and these static
 * inline functions:
 *
 *   kh_init_<name>()         calloc a zeroed table (no buckets yet).
 *   kh_destroy_<name>(h)     free keys, flags, vals and h; NULL is accepted.
 *   kh_clear_<name>(h)       refill flags with 0xaa (every bucket "empty",
 *                            not "deleted") and zero size/n_occupied; the
 *                            arrays are kept.
 *   kh_get_<name>(h, key)    probe from k % n_buckets with step
 *                            1 + k % (n_buckets - 1); returns the bucket
 *                            index, or n_buckets when the key is absent
 *                            (0 when the table has no buckets, which is the
 *                            same value as n_buckets in that case).
 *   kh_resize_<name>(h, n)   replace n by the smallest entry of
 *                            __ac_prime_list greater than n; do nothing if
 *                            the current size would already reach the new
 *                            upper bound; otherwise rehash every live entry
 *                            in place against a fresh flag array, growing
 *                            keys/vals before and shrinking them after the
 *                            rehash.
 *   kh_put_<name>(h, k, &r)  resize first when n_occupied >= upper_bound,
 *                            then find the slot for k; *r is 1 if an empty
 *                            bucket was taken, 2 if a deleted bucket was
 *                            reused, 0 if the key was already present.
 *                            Only the key is stored; the caller assigns
 *                            the value through kh_val().
 *   kh_del_<name>(h, x)      mark bucket x deleted and decrement size;
 *                            nothing is freed.
 *
 * vals is only allocated and moved when kh_is_map is non-zero.
 *
 * NOTE(review): the results of calloc/malloc/realloc are used without a
 * NULL check, and kh_resize_<name>() reads __ac_prime_list[t+1] with
 * t == __ac_HASH_PRIME_SIZE - 1 when n is not below the last prime in the
 * list. Both look like latent issues; confirm before relying on very large
 * tables or on behaviour under memory exhaustion. */
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
	typedef struct { \
		khint_t n_buckets, size, n_occupied, upper_bound; \
		uint32_t *flags; \
		khkey_t *keys; \
		khval_t *vals; \
	} kh_##name##_t; \
	static inline kh_##name##_t *kh_init_##name() { \
		return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \
	} \
	static inline void kh_destroy_##name(kh_##name##_t *h) \
	{ \
		if (h) { \
			free(h->keys); free(h->flags); \
			free(h->vals); \
			free(h); \
		} \
	} \
	static inline void kh_clear_##name(kh_##name##_t *h) \
	{ \
		if (h && h->flags) { \
			memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \
			h->size = h->n_occupied = 0; \
		} \
	} \
	static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
	{ \
		if (h->n_buckets) { \
			khint_t inc, k, i, last; \
			k = __hash_func(key); i = k % h->n_buckets; \
			inc = 1 + k % (h->n_buckets - 1); last = i; \
			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
				if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
				else i += inc; \
				if (i == last) return h->n_buckets; \
			} \
			return __ac_iseither(h->flags, i)? h->n_buckets : i; \
		} else return 0; \
	} \
	static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
	{ \
		uint32_t *new_flags = 0; \
		khint_t j = 1; \
		{ \
			khint_t t = __ac_HASH_PRIME_SIZE - 1; \
			while (__ac_prime_list[t] > new_n_buckets) --t; \
			new_n_buckets = __ac_prime_list[t+1]; \
			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
			else { \
				new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
				memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
				if (h->n_buckets < new_n_buckets) { \
					h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
					if (kh_is_map) \
						h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
				} \
			} \
		} \
		if (j) { \
			for (j = 0; j != h->n_buckets; ++j) { \
				if (__ac_iseither(h->flags, j) == 0) { \
					khkey_t key = h->keys[j]; \
					khval_t val; \
					if (kh_is_map) val = h->vals[j]; \
					__ac_set_isdel_true(h->flags, j); \
					while (1) { \
						khint_t inc, k, i; \
						k = __hash_func(key); \
						i = k % new_n_buckets; \
						inc = 1 + k % (new_n_buckets - 1); \
						while (!__ac_isempty(new_flags, i)) { \
							if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
							else i += inc; \
						} \
						__ac_set_isempty_false(new_flags, i); \
						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
							__ac_set_isdel_true(h->flags, i); \
						} else { \
							h->keys[i] = key; \
							if (kh_is_map) h->vals[i] = val; \
							break; \
						} \
					} \
				} \
			} \
			if (h->n_buckets > new_n_buckets) { \
				h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
				if (kh_is_map) \
					h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
			} \
			free(h->flags); \
			h->flags = new_flags; \
			h->n_buckets = new_n_buckets; \
			h->n_occupied = h->size; \
			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
		} \
	} \
	static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
	{ \
		khint_t x; \
		if (h->n_occupied >= h->upper_bound) { \
			if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
			else kh_resize_##name(h, h->n_buckets + 1); \
		} \
		{ \
			khint_t inc, k, i, site, last; \
			x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
			if (__ac_isempty(h->flags, i)) x = i; \
			else { \
				inc = 1 + k % (h->n_buckets - 1); last = i; \
				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
					if (__ac_isdel(h->flags, i)) site = i; \
					if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
					else i += inc; \
					if (i == last) { x = site; break; } \
				} \
				if (x == h->n_buckets) { \
					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
					else x = i; \
				} \
			} \
		} \
		if (__ac_isempty(h->flags, x)) { \
			h->keys[x] = key; \
			__ac_set_isboth_false(h->flags, x); \
			++h->size; ++h->n_occupied; \
			*ret = 1; \
		} else if (__ac_isdel(h->flags, x)) { \
			h->keys[x] = key; \
			__ac_set_isboth_false(h->flags, x); \
			++h->size; \
			*ret = 2; \
		} else *ret = 0; \
		return x; \
	} \
	static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \
	{ \
		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
			__ac_set_isdel_true(h->flags, x); \
			--h->size; \
		} \
	}

/* --- BEGIN OF HASH FUNCTIONS --- */

/*! @function
  @abstract     Integer hash function
  @param  key   The integer [uint32_t]
  @return       The hash value [khint_t]
 */
#define kh_int_hash_func(key) (uint32_t)(key)
/*! @function
  @abstract     Integer comparison function
 */
#define kh_int_hash_equal(a, b) ((a) == (b))
/*! @function
  @abstract     64-bit integer hash function
  @param  key   The integer [uint64_t]
  @return       The hash value [khint_t]
 */
#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11)
/*! @function
  @abstract     64-bit integer comparison function
 */
#define kh_int64_hash_equal(a, b) ((a) == (b))
/*! @function
  @abstract     const char* hash function
  @param  s     Pointer to a null terminated string
  @return       The hash value
  @discussion   Starts from the first character and folds in each following
                character as h = h*31 + c (written as (h << 5) - h + c).
                An empty string hashes to 0.
 */
static inline khint_t __ac_X31_hash_string(const char *s)
{
	khint_t h = *s;
	if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
	return h;
}
/*! @function
  @abstract     Another interface to const char* hash function
  @param  key   Pointer to a null terminated string [const char*]
  @return       The hash value [khint_t]
 */
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
/*! @function
  @abstract     Const char* comparison function
 */
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)

/* --- END OF HASH FUNCTIONS --- */

/* Other necessary macros... */

/*!
  @abstract Type of the hash table.
  @param  name  Name of the hash table [symbol]
 */
#define khash_t(name) kh_##name##_t

/*! @function
  @abstract     Initiate a hash table.
  @param  name  Name of the hash table [symbol]
  @return       Pointer to the hash table [khash_t(name)*]
 */
#define kh_init(name) kh_init_##name()

/*! @function
  @abstract     Destroy a hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
 */
#define kh_destroy(name, h) kh_destroy_##name(h)

/*! @function
  @abstract     Reset a hash table without deallocating memory.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
 */
#define kh_clear(name, h) kh_clear_##name(h)

/*! @function
  @abstract     Resize a hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  s     New size [khint_t]
 */
#define kh_resize(name, h, s) kh_resize_##name(h, s)

/*! @function
  @abstract     Insert a key to the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Key [type of keys]
  @param  r     Extra return code: 0 if the key is present in the hash table;
                1 if the bucket is empty (never used); 2 if the element in
                the bucket has been deleted [int*]
  @return       Iterator to the inserted element [khint_t]
 */
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)

/*! @function
  @abstract     Retrieve a key from the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Key [type of keys]
  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
 */
#define kh_get(name, h, k) kh_get_##name(h, k)

/*! @function
  @abstract     Remove a key from the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Iterator to the element to be deleted [khint_t]
 */
#define kh_del(name, h, k) kh_del_##name(h, k)


/*! @function
  @abstract     Test whether a bucket contains data.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       1 if containing data; 0 otherwise [int]
 */
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))

/*! @function
  @abstract     Get key given an iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Key [type of keys]
 */
#define kh_key(h, x) ((h)->keys[x])

/*! @function
  @abstract     Get value given an iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Value [type of values]
  @discussion   For hash sets, calling this results in segfault.
 */
#define kh_val(h, x) ((h)->vals[x])

/*! @function
  @abstract     Alias of kh_val()
 */
#define kh_value(h, x) ((h)->vals[x])

/*! @function
  @abstract     Get the start iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The start iterator [khint_t]
 */
#define kh_begin(h) (khint_t)(0)

/*! @function
  @abstract     Get the end iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The end iterator [khint_t]
 */
#define kh_end(h) ((h)->n_buckets)

/*! @function
  @abstract     Get the number of elements in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of elements in the hash table [khint_t]
 */
#define kh_size(h) ((h)->size)

/*! @function
  @abstract     Get the number of buckets in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of buckets in the hash table [khint_t]
 */
#define kh_n_buckets(h) ((h)->n_buckets)

/* More convenient interfaces */

/*! @function
  @abstract     Instantiate a hash set containing integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT(name) \
	KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT(name, khval_t) \
	KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash set containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT64(name) \
	KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT64(name, khval_t) \
	KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)

typedef const char *kh_cstr_t;
/*! @function
  @abstract     Instantiate a hash set containing const char* keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_STR(name) \
	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing const char* keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_STR(name, khval_t) \
	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)

#endif /* __AC_KHASH_H */

--------------------------------------------------------------------------------
/tabix-0.2.6/bgzf.c:
--------------------------------------------------------------------------------
/* The MIT License

   Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
                 2011 Attractive Chaos

   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
   in the Software without restriction, including without limitation the rights
   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   copies of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be included in
   all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
   THE SOFTWARE.
*/

/* NOTE(review): the operands of the six #include directives below are
 * missing in this copy of the file; restore them from an upstream bgzf.c. */
#include
#include
#include
#include
#include
#include
#include "bgzf.h"

/* Thin I/O layer for the underlying file. With _USE_KNETFILE every _bgzf_*
 * macro maps onto the matching knet_* call; otherwise it maps onto the
 * stdio equivalent, using ftello/fseeko for positions (aliased to
 * ftell/fseek on Windows). */
#ifdef _USE_KNETFILE
#include "knetfile.h"
typedef knetFile *_bgzf_file_t;
#define _bgzf_open(fn, mode) knet_open(fn, mode)
#define _bgzf_dopen(fp, mode) knet_dopen(fp, mode)
#define _bgzf_close(fp) knet_close(fp)
#define _bgzf_fileno(fp) ((fp)->fd)
#define _bgzf_tell(fp) knet_tell(fp)
#define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence)
#define _bgzf_read(fp, buf, len) knet_read(fp, buf, len)
#define _bgzf_write(fp, buf, len) knet_write(fp, buf, len)
#else // ~defined(_USE_KNETFILE)
#if defined(_WIN32) || defined(_MSC_VER)
#define ftello(fp) ftell(fp)
#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
#else // ~defined(_WIN32)
extern off_t ftello(FILE *stream);
extern int fseeko(FILE *stream, off_t offset, int whence);
#endif // ~defined(_WIN32)
typedef FILE *_bgzf_file_t;
#define _bgzf_open(fn, mode) fopen(fn, mode)
#define _bgzf_dopen(fp, mode) fdopen(fp, mode)
#define _bgzf_close(fp) fclose(fp)
#define _bgzf_fileno(fp) fileno(fp)
#define _bgzf_tell(fp) ftello(fp)
#define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence)
#define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp)
#define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp)
#endif // ~define(_USE_KNETFILE)

| #define BLOCK_HEADER_LENGTH 18 64 | #define BLOCK_FOOTER_LENGTH 8 65 | 66 | /* BGZF/GZIP header (speciallized from RFC 1952; little endian): 67 | +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ 68 | | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN| 69 | +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ 70 | */ 71 | static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0"; 72 | 73 | #ifdef BGZF_CACHE 74 | typedef struct { 75 | int size; 76 | uint8_t *block; 77 | int64_t end_offset; 78 | } cache_t; 79 | #include "khash.h" 80 | KHASH_MAP_INIT_INT64(cache, cache_t) 81 | #endif 82 | 83 | static inline void packInt16(uint8_t *buffer, uint16_t value) 84 | { 85 | buffer[0] = value; 86 | buffer[1] = value >> 8; 87 | } 88 | 89 | static inline int unpackInt16(const uint8_t *buffer) 90 | { 91 | return buffer[0] | buffer[1] << 8; 92 | } 93 | 94 | static inline void packInt32(uint8_t *buffer, uint32_t value) 95 | { 96 | buffer[0] = value; 97 | buffer[1] = value >> 8; 98 | buffer[2] = value >> 16; 99 | buffer[3] = value >> 24; 100 | } 101 | 102 | static BGZF *bgzf_read_init() 103 | { 104 | BGZF *fp; 105 | fp = calloc(1, sizeof(BGZF)); 106 | fp->open_mode = 'r'; 107 | fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE); 108 | fp->compressed_block = malloc(BGZF_BLOCK_SIZE); 109 | #ifdef BGZF_CACHE 110 | fp->cache = kh_init(cache); 111 | #endif 112 | return fp; 113 | } 114 | 115 | static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level 116 | { 117 | BGZF *fp; 118 | fp = calloc(1, sizeof(BGZF)); 119 | fp->open_mode = 'w'; 120 | fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE); 121 | fp->compressed_block = malloc(BGZF_BLOCK_SIZE); 122 | fp->compress_level = compress_level < 0? 
Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 123 | if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; 124 | return fp; 125 | } 126 | // get the compress level from the mode string 127 | static int mode2level(const char *__restrict mode) 128 | { 129 | int i, compress_level = -1; 130 | for (i = 0; mode[i]; ++i) 131 | if (mode[i] >= '0' && mode[i] <= '9') break; 132 | if (mode[i]) compress_level = (int)mode[i] - '0'; 133 | if (strchr(mode, 'u')) compress_level = 0; 134 | return compress_level; 135 | } 136 | 137 | BGZF *bgzf_open(const char *path, const char *mode) 138 | { 139 | BGZF *fp = 0; 140 | if (strchr(mode, 'r') || strchr(mode, 'R')) { 141 | _bgzf_file_t fpr; 142 | if ((fpr = _bgzf_open(path, "r")) == 0) return 0; 143 | fp = bgzf_read_init(); 144 | fp->fp = fpr; 145 | } else if (strchr(mode, 'w') || strchr(mode, 'W')) { 146 | FILE *fpw; 147 | if ((fpw = fopen(path, "w")) == 0) return 0; 148 | fp = bgzf_write_init(mode2level(mode)); 149 | fp->fp = fpw; 150 | } 151 | return fp; 152 | } 153 | 154 | BGZF *bgzf_dopen(int fd, const char *mode) 155 | { 156 | BGZF *fp = 0; 157 | if (strchr(mode, 'r') || strchr(mode, 'R')) { 158 | _bgzf_file_t fpr; 159 | if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0; 160 | fp = bgzf_read_init(); 161 | fp->fp = fpr; 162 | } else if (strchr(mode, 'w') || strchr(mode, 'W')) { 163 | FILE *fpw; 164 | if ((fpw = fdopen(fd, "w")) == 0) return 0; 165 | fp = bgzf_write_init(mode2level(mode)); 166 | fp->fp = fpw; 167 | } 168 | return fp; 169 | } 170 | 171 | // Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length. 
172 | static int deflate_block(BGZF *fp, int block_length) 173 | { 174 | uint8_t *buffer = fp->compressed_block; 175 | int buffer_size = BGZF_BLOCK_SIZE; 176 | int input_length = block_length; 177 | int compressed_length = 0; 178 | int remaining; 179 | uint32_t crc; 180 | 181 | assert(block_length <= BGZF_BLOCK_SIZE); // guaranteed by the caller 182 | memcpy(buffer, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block 183 | while (1) { // loop to retry for blocks that do not compress enough 184 | int status; 185 | z_stream zs; 186 | zs.zalloc = NULL; 187 | zs.zfree = NULL; 188 | zs.next_in = fp->uncompressed_block; 189 | zs.avail_in = input_length; 190 | zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; 191 | zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; 192 | status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY); // -15 to disable zlib header/footer 193 | if (status != Z_OK) { 194 | fp->errcode |= BGZF_ERR_ZLIB; 195 | return -1; 196 | } 197 | status = deflate(&zs, Z_FINISH); 198 | if (status != Z_STREAM_END) { // not compressed enough 199 | deflateEnd(&zs); // reset the stream 200 | if (status == Z_OK) { // reduce the size and recompress 201 | input_length -= 1024; 202 | assert(input_length > 0); // logically, this should not happen 203 | continue; 204 | } 205 | fp->errcode |= BGZF_ERR_ZLIB; 206 | return -1; 207 | } 208 | if (deflateEnd(&zs) != Z_OK) { 209 | fp->errcode |= BGZF_ERR_ZLIB; 210 | return -1; 211 | } 212 | compressed_length = zs.total_out; 213 | compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; 214 | assert(compressed_length <= BGZF_BLOCK_SIZE); 215 | break; 216 | } 217 | 218 | assert(compressed_length > 0); 219 | packInt16((uint8_t*)&buffer[16], compressed_length - 1); // write the compressed_length; -1 to fit 2 bytes 220 | crc = crc32(0L, NULL, 0L); 221 | crc = crc32(crc, fp->uncompressed_block, input_length); 222 | 
packInt32((uint8_t*)&buffer[compressed_length-8], crc); 223 | packInt32((uint8_t*)&buffer[compressed_length-4], input_length); 224 | 225 | remaining = block_length - input_length; 226 | if (remaining > 0) { 227 | assert(remaining <= input_length); 228 | memcpy(fp->uncompressed_block, fp->uncompressed_block + input_length, remaining); 229 | } 230 | fp->block_offset = remaining; 231 | return compressed_length; 232 | } 233 | 234 | // Inflate the block in fp->compressed_block into fp->uncompressed_block 235 | static int inflate_block(BGZF* fp, int block_length) 236 | { 237 | z_stream zs; 238 | zs.zalloc = NULL; 239 | zs.zfree = NULL; 240 | zs.next_in = fp->compressed_block + 18; 241 | zs.avail_in = block_length - 16; 242 | zs.next_out = fp->uncompressed_block; 243 | zs.avail_out = BGZF_BLOCK_SIZE; 244 | 245 | if (inflateInit2(&zs, -15) != Z_OK) { 246 | fp->errcode |= BGZF_ERR_ZLIB; 247 | return -1; 248 | } 249 | if (inflate(&zs, Z_FINISH) != Z_STREAM_END) { 250 | inflateEnd(&zs); 251 | fp->errcode |= BGZF_ERR_ZLIB; 252 | return -1; 253 | } 254 | if (inflateEnd(&zs) != Z_OK) { 255 | fp->errcode |= BGZF_ERR_ZLIB; 256 | return -1; 257 | } 258 | return zs.total_out; 259 | } 260 | 261 | static int check_header(const uint8_t *header) 262 | { 263 | return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0 264 | && unpackInt16((uint8_t*)&header[10]) == 6 265 | && header[12] == 'B' && header[13] == 'C' 266 | && unpackInt16((uint8_t*)&header[14]) == 2); 267 | } 268 | 269 | #ifdef BGZF_CACHE 270 | static void free_cache(BGZF *fp) 271 | { 272 | khint_t k; 273 | khash_t(cache) *h = (khash_t(cache)*)fp->cache; 274 | if (fp->open_mode != 'r') return; 275 | for (k = kh_begin(h); k < kh_end(h); ++k) 276 | if (kh_exist(h, k)) free(kh_val(h, k).block); 277 | kh_destroy(cache, h); 278 | } 279 | 280 | static int load_block_from_cache(BGZF *fp, int64_t block_address) 281 | { 282 | khint_t k; 283 | cache_t *p; 284 | khash_t(cache) *h = (khash_t(cache)*)fp->cache; 
285 | k = kh_get(cache, h, block_address); 286 | if (k == kh_end(h)) return 0; 287 | p = &kh_val(h, k); 288 | if (fp->block_length != 0) fp->block_offset = 0; 289 | fp->block_address = block_address; 290 | fp->block_length = p->size; 291 | memcpy(fp->uncompressed_block, p->block, BGZF_BLOCK_SIZE); 292 | _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET); 293 | return p->size; 294 | } 295 | 296 | static void cache_block(BGZF *fp, int size) 297 | { 298 | int ret; 299 | khint_t k; 300 | cache_t *p; 301 | khash_t(cache) *h = (khash_t(cache)*)fp->cache; 302 | if (BGZF_BLOCK_SIZE >= fp->cache_size) return; 303 | if ((kh_size(h) + 1) * BGZF_BLOCK_SIZE > fp->cache_size) { 304 | /* A better way would be to remove the oldest block in the 305 | * cache, but here we remove a random one for simplicity. This 306 | * should not have a big impact on performance. */ 307 | for (k = kh_begin(h); k < kh_end(h); ++k) 308 | if (kh_exist(h, k)) break; 309 | if (k < kh_end(h)) { 310 | free(kh_val(h, k).block); 311 | kh_del(cache, h, k); 312 | } 313 | } 314 | k = kh_put(cache, h, fp->block_address, &ret); 315 | if (ret == 0) return; // if this happens, a bug! 
316 | p = &kh_val(h, k); 317 | p->size = fp->block_length; 318 | p->end_offset = fp->block_address + size; 319 | p->block = malloc(BGZF_BLOCK_SIZE); 320 | memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_BLOCK_SIZE); 321 | } 322 | #else 323 | static void free_cache(BGZF *fp) {} 324 | static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;} 325 | static void cache_block(BGZF *fp, int size) {} 326 | #endif 327 | 328 | int bgzf_read_block(BGZF *fp) 329 | { 330 | uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block; 331 | int count, size = 0, block_length, remaining; 332 | int64_t block_address; 333 | block_address = _bgzf_tell((_bgzf_file_t)fp->fp); 334 | if (load_block_from_cache(fp, block_address)) return 0; 335 | count = _bgzf_read(fp->fp, header, sizeof(header)); 336 | if (count == 0) { // no data read 337 | fp->block_length = 0; 338 | return 0; 339 | } 340 | if (count != sizeof(header) || !check_header(header)) { 341 | fp->errcode |= BGZF_ERR_HEADER; 342 | return -1; 343 | } 344 | size = count; 345 | block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1" 346 | compressed_block = (uint8_t*)fp->compressed_block; 347 | memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); 348 | remaining = block_length - BLOCK_HEADER_LENGTH; 349 | count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining); 350 | if (count != remaining) { 351 | fp->errcode |= BGZF_ERR_IO; 352 | return -1; 353 | } 354 | size += count; 355 | if ((count = inflate_block(fp, block_length)) < 0) return -1; 356 | if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek. 
357 | fp->block_address = block_address; 358 | fp->block_length = count; 359 | cache_block(fp, size); 360 | return 0; 361 | } 362 | 363 | ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length) 364 | { 365 | ssize_t bytes_read = 0; 366 | uint8_t *output = data; 367 | if (length <= 0) return 0; 368 | assert(fp->open_mode == 'r'); 369 | while (bytes_read < length) { 370 | int copy_length, available = fp->block_length - fp->block_offset; 371 | uint8_t *buffer; 372 | if (available <= 0) { 373 | if (bgzf_read_block(fp) != 0) return -1; 374 | available = fp->block_length - fp->block_offset; 375 | if (available <= 0) break; 376 | } 377 | copy_length = length - bytes_read < available? length - bytes_read : available; 378 | buffer = fp->uncompressed_block; 379 | memcpy(output, buffer + fp->block_offset, copy_length); 380 | fp->block_offset += copy_length; 381 | output += copy_length; 382 | bytes_read += copy_length; 383 | } 384 | if (fp->block_offset == fp->block_length) { 385 | fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); 386 | fp->block_offset = fp->block_length = 0; 387 | } 388 | return bytes_read; 389 | } 390 | 391 | int bgzf_flush(BGZF *fp) 392 | { 393 | assert(fp->open_mode == 'w'); 394 | while (fp->block_offset > 0) { 395 | int block_length; 396 | block_length = deflate_block(fp, fp->block_offset); 397 | if (block_length < 0) return -1; 398 | if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) { 399 | fp->errcode |= BGZF_ERR_IO; // possibly truncated file 400 | return -1; 401 | } 402 | fp->block_address += block_length; 403 | } 404 | return 0; 405 | } 406 | 407 | int bgzf_flush_try(BGZF *fp, ssize_t size) 408 | { 409 | if (fp->block_offset + size > BGZF_BLOCK_SIZE) 410 | return bgzf_flush(fp); 411 | return -1; 412 | } 413 | 414 | ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length) 415 | { 416 | const uint8_t *input = data; 417 | int block_length = BGZF_BLOCK_SIZE, bytes_written; 418 | assert(fp->open_mode == 'w'); 419 | 
input = data; 420 | bytes_written = 0; 421 | while (bytes_written < length) { 422 | uint8_t* buffer = fp->uncompressed_block; 423 | int copy_length = block_length - fp->block_offset < length - bytes_written? block_length - fp->block_offset : length - bytes_written; 424 | memcpy(buffer + fp->block_offset, input, copy_length); 425 | fp->block_offset += copy_length; 426 | input += copy_length; 427 | bytes_written += copy_length; 428 | if (fp->block_offset == block_length && bgzf_flush(fp)) break; 429 | } 430 | return bytes_written; 431 | } 432 | 433 | int bgzf_close(BGZF* fp) 434 | { 435 | int ret, count, block_length; 436 | if (fp == 0) return -1; 437 | if (fp->open_mode == 'w') { 438 | if (bgzf_flush(fp) != 0) return -1; 439 | block_length = deflate_block(fp, 0); // write an empty block 440 | count = fwrite(fp->compressed_block, 1, block_length, fp->fp); 441 | if (fflush(fp->fp) != 0) { 442 | fp->errcode |= BGZF_ERR_IO; 443 | return -1; 444 | } 445 | } 446 | ret = fp->open_mode == 'w'? fclose(fp->fp) : _bgzf_close(fp->fp); 447 | if (ret != 0) return -1; 448 | free(fp->uncompressed_block); 449 | free(fp->compressed_block); 450 | free_cache(fp); 451 | free(fp); 452 | return 0; 453 | } 454 | 455 | void bgzf_set_cache_size(BGZF *fp, int cache_size) 456 | { 457 | if (fp) fp->cache_size = cache_size; 458 | } 459 | 460 | int bgzf_check_EOF(BGZF *fp) 461 | { 462 | static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; 463 | uint8_t buf[28]; 464 | off_t offset; 465 | offset = _bgzf_tell((_bgzf_file_t)fp->fp); 466 | if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0; 467 | _bgzf_read(fp->fp, buf, 28); 468 | _bgzf_seek(fp->fp, offset, SEEK_SET); 469 | return (memcmp(magic, buf, 28) == 0)? 
1 : 0; 470 | } 471 | 472 | int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) 473 | { 474 | int block_offset; 475 | int64_t block_address; 476 | 477 | if (fp->open_mode != 'r' || where != SEEK_SET) { 478 | fp->errcode |= BGZF_ERR_MISUSE; 479 | return -1; 480 | } 481 | block_offset = pos & 0xFFFF; 482 | block_address = pos >> 16; 483 | if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) { 484 | fp->errcode |= BGZF_ERR_IO; 485 | return -1; 486 | } 487 | fp->block_length = 0; // indicates current block has not been loaded 488 | fp->block_address = block_address; 489 | fp->block_offset = block_offset; 490 | return 0; 491 | } 492 | 493 | int bgzf_is_bgzf(const char *fn) 494 | { 495 | uint8_t buf[16]; 496 | int n; 497 | _bgzf_file_t fp; 498 | if ((fp = _bgzf_open(fn, "r")) == 0) return 0; 499 | n = _bgzf_read(fp, buf, 16); 500 | _bgzf_close(fp); 501 | if (n != 16) return 0; 502 | return memcmp(g_magic, buf, 16) == 0? 1 : 0; 503 | } 504 | 505 | int bgzf_getc(BGZF *fp) 506 | { 507 | int c; 508 | if (fp->block_offset >= fp->block_length) { 509 | if (bgzf_read_block(fp) != 0) return -2; /* error */ 510 | if (fp->block_length == 0) return -1; /* end-of-file */ 511 | } 512 | c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; 513 | if (fp->block_offset == fp->block_length) { 514 | fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); 515 | fp->block_offset = 0; 516 | fp->block_length = 0; 517 | } 518 | return c; 519 | } 520 | 521 | #ifndef kroundup32 522 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 523 | #endif 524 | 525 | int bgzf_getline(BGZF *fp, int delim, kstring_t *str) 526 | { 527 | int l, state = 0; 528 | unsigned char *buf = (unsigned char*)fp->uncompressed_block; 529 | str->l = 0; 530 | do { 531 | if (fp->block_offset >= fp->block_length) { 532 | if (bgzf_read_block(fp) != 0) { state = -2; break; } 533 | if (fp->block_length == 0) { state = -1; break; } 534 | } 535 | for (l = 
fp->block_offset; l < fp->block_length && buf[l] != delim; ++l); 536 | if (l < fp->block_length) state = 1; 537 | l -= fp->block_offset; 538 | if (str->l + l + 1 >= str->m) { 539 | str->m = str->l + l + 2; 540 | kroundup32(str->m); 541 | str->s = (char*)realloc(str->s, str->m); 542 | } 543 | memcpy(str->s + str->l, buf + fp->block_offset, l); 544 | str->l += l; 545 | fp->block_offset += l + 1; 546 | if (fp->block_offset >= fp->block_length) { 547 | fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); 548 | fp->block_offset = 0; 549 | fp->block_length = 0; 550 | } 551 | } while (state == 0); 552 | if (str->l == 0 && state < 0) return state; 553 | str->s[str->l] = 0; 554 | return str->l; 555 | } 556 | -------------------------------------------------------------------------------- /tabix-0.2.6/knetfile.c: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008 Genome Research Ltd (GRL). 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Contact: Heng Li */ 27 | 28 | /* Probably I will not do socket programming in the next few years and 29 | therefore I decide to heavily annotate this file, for Linux and 30 | Windows as well. -lh3 */ 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | #ifdef _WIN32 42 | #include 43 | #else 44 | #include 45 | #include 46 | #include 47 | #endif 48 | 49 | #include "knetfile.h" 50 | 51 | /* In winsock.h, the type of a socket is SOCKET, which is: "typedef 52 | * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed 53 | * integer -1. In knetfile.c, I use "int" for socket type 54 | * throughout. This should be improved to avoid confusion. 55 | * 56 | * In Linux/Mac, recv() and read() do almost the same thing. You can see 57 | * in the header file that netread() is simply an alias of read(). In 58 | * Windows, however, they are different and using recv() is mandatory. 59 | */ 60 | 61 | /* This function tests if the file handler is ready for reading (or 62 | * writing if is_read==0). 
*/
static int socket_wait(int fd, int is_read)
{
    fd_set fds, *fdr = 0, *fdw = 0;
    struct timeval tv;
    int ret;
#ifndef _WIN32
    /* FD_SET() is undefined for descriptors outside [0, FD_SETSIZE) */
    if (fd < 0 || fd >= FD_SETSIZE) return -1;
#endif
    tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
    FD_ZERO(&fds);
    FD_SET(fd, &fds);
    if (is_read) fdr = &fds;
    else fdw = &fds;
    ret = select(fd+1, fdr, fdw, 0, &tv);
#ifndef _WIN32
    if (ret == -1) perror("select");
#else
    if (ret == 0)
        fprintf(stderr, "select time-out\n");
    else if (ret == SOCKET_ERROR)
        fprintf(stderr, "select: %d\n", WSAGetLastError());
#endif
    return ret; /* >0: ready, 0: timed out, <0: error */
}

#ifndef _WIN32
/* This function does not work with Windows due to the lack of
 * getaddrinfo() in winsock. It is adapted from an example in "Beej's
 * Guide to Network Programming" (http://beej.us/guide/bgnet/).
 *
 * Opens a TCP connection to host:port using the first address returned
 * by getaddrinfo().  Returns the connected descriptor, or -1 on failure
 * after printing a diagnostic to stderr.  Nothing is leaked on failure:
 * the address list is freed and a half-set-up socket is closed. */
static int socket_connect(const char *host, const char *port)
{
    int on = 1, fd = -1, ret;
    struct linger lng = { 0, 0 };
    struct addrinfo hints, *res = 0;
    const char *failed = 0; /* name of the call that failed, if any */

    memset(&hints, 0, sizeof(struct addrinfo));
    hints.ai_family = AF_UNSPEC;
    hints.ai_socktype = SOCK_STREAM;
    /* In Unix/Mac, getaddrinfo() is the most convenient way to get
     * server information. It reports errors through its return value,
     * not errno, and leaves nothing to free when it fails. */
    if ((ret = getaddrinfo(host, port, &hints, &res)) != 0) {
        fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(ret));
        return -1;
    }
    /* The two setsockopt() calls are used by ftplib
     * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they are
     * necessary. */
    if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) failed = "socket";
    else if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) failed = "setsockopt";
    else if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) failed = "setsockopt";
    else if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) failed = "connect";
    if (failed) {
        perror(failed); /* report before netclose() can disturb errno */
        if (fd != -1) netclose(fd);
        fd = -1;
    }
    freeaddrinfo(res);
    return fd;
}
#else
/* MinGW's printf has problem with "%lld" */
/* Write the decimal form of x into buf and return buf.  Only non-negative
 * values are handled; buf must have room for the digits plus the NUL. */
char *int64tostr(char *buf, int64_t x)
{
    int cnt;
    int i = 0;
    do { /* emit digits least-significant first */
        buf[i++] = '0' + x % 10;
        x /= 10;
    } while (x);
    buf[i] = 0;
    for (cnt = i, i = 0; i < cnt/2; ++i) { /* then reverse in place */
        int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c;
    }
    return buf;
}

/* Parse the run of decimal digits at the start of buf.  Parsing stops at
 * the first non-digit, so trailing text is not folded into the value. */
int64_t strtoint64(const char *buf)
{
    int64_t x;
    for (x = 0; *buf >= '0' && *buf <= '9'; ++buf)
        x = x * 10 + ((int64_t) *buf - 48);
    return x;
}
/* In windows, the first thing is to establish the TCP connection. */
int knet_win32_init()
{
    WSADATA wsaData;
    return WSAStartup(MAKEWORD(2, 2), &wsaData);
}
void knet_win32_destroy()
{
    WSACleanup();
}
/* A slightly modified version of the following function also works on
 * Mac (and presumably Linux). However, this function is not stable on
 * my Mac. It sometimes works fine but sometimes does not. Therefore for
 * non-Windows OS, I do not use this one. 
*/ 150 | static SOCKET socket_connect(const char *host, const char *port) 151 | { 152 | #define __err_connect(func) \ 153 | do { \ 154 | fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \ 155 | return -1; \ 156 | } while (0) 157 | 158 | int on = 1; 159 | SOCKET fd; 160 | struct linger lng = { 0, 0 }; 161 | struct sockaddr_in server; 162 | struct hostent *hp = 0; 163 | // open socket 164 | if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket"); 165 | if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt"); 166 | if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt"); 167 | // get host info 168 | if (isalpha(host[0])) hp = gethostbyname(host); 169 | else { 170 | struct in_addr addr; 171 | addr.s_addr = inet_addr(host); 172 | hp = gethostbyaddr((char*)&addr, 4, AF_INET); 173 | } 174 | if (hp == 0) __err_connect("gethost"); 175 | // connect 176 | server.sin_addr.s_addr = *((unsigned long*)hp->h_addr); 177 | server.sin_family= AF_INET; 178 | server.sin_port = htons(atoi(port)); 179 | if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect"); 180 | // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!) 181 | return fd; 182 | } 183 | #endif 184 | 185 | static off_t my_netread(int fd, void *buf, off_t len) 186 | { 187 | off_t rest = len, curr, l = 0; 188 | /* recv() and read() may not read the required length of data with 189 | * one call. They have to be called repeatedly. */ 190 | while (rest) { 191 | if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading 192 | curr = netread(fd, buf + l, rest); 193 | /* According to the glibc manual, section 13.2, a zero returned 194 | * value indicates end-of-file (EOF), which should mean that 195 | * read() will not return zero if EOF has not been met but data 196 | * are not immediately available. 
*/ 197 | if (curr == 0) break; 198 | l += curr; rest -= curr; 199 | } 200 | return l; 201 | } 202 | 203 | /************************* 204 | * FTP specific routines * 205 | *************************/ 206 | 207 | static int kftp_get_response(knetFile *ftp) 208 | { 209 | #ifndef _WIN32 210 | unsigned char c; 211 | #else 212 | char c; 213 | #endif 214 | int n = 0; 215 | char *p; 216 | if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0; 217 | while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O 218 | //fputc(c, stderr); 219 | if (n >= ftp->max_response) { 220 | ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256; 221 | ftp->response = realloc(ftp->response, ftp->max_response); 222 | } 223 | ftp->response[n++] = c; 224 | if (c == '\n') { 225 | if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2]) 226 | && ftp->response[3] != '-') break; 227 | n = 0; 228 | continue; 229 | } 230 | } 231 | if (n < 2) return -1; 232 | ftp->response[n-2] = 0; 233 | return strtol(ftp->response, &p, 0); 234 | } 235 | 236 | static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) 237 | { 238 | if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing 239 | netwrite(ftp->ctrl_fd, cmd, strlen(cmd)); 240 | return is_get? 
kftp_get_response(ftp) : 0; 241 | } 242 | 243 | static int kftp_pasv_prep(knetFile *ftp) 244 | { 245 | char *p; 246 | int v[6]; 247 | kftp_send_cmd(ftp, "PASV\r\n", 1); 248 | for (p = ftp->response; *p && *p != '('; ++p); 249 | if (*p != '(') return -1; 250 | ++p; 251 | sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); 252 | memcpy(ftp->pasv_ip, v, 4 * sizeof(int)); 253 | ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; 254 | return 0; 255 | } 256 | 257 | 258 | static int kftp_pasv_connect(knetFile *ftp) 259 | { 260 | char host[80], port[10]; 261 | if (ftp->pasv_port == 0) { 262 | fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n"); 263 | return -1; 264 | } 265 | sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]); 266 | sprintf(port, "%d", ftp->pasv_port); 267 | ftp->fd = socket_connect(host, port); 268 | if (ftp->fd == -1) return -1; 269 | return 0; 270 | } 271 | 272 | int kftp_connect(knetFile *ftp) 273 | { 274 | ftp->ctrl_fd = socket_connect(ftp->host, ftp->port); 275 | if (ftp->ctrl_fd == -1) return -1; 276 | kftp_get_response(ftp); 277 | kftp_send_cmd(ftp, "USER anonymous\r\n", 1); 278 | kftp_send_cmd(ftp, "PASS kftp@\r\n", 1); 279 | kftp_send_cmd(ftp, "TYPE I\r\n", 1); 280 | return 0; 281 | } 282 | 283 | int kftp_reconnect(knetFile *ftp) 284 | { 285 | if (ftp->ctrl_fd != -1) { 286 | netclose(ftp->ctrl_fd); 287 | ftp->ctrl_fd = -1; 288 | } 289 | netclose(ftp->fd); 290 | ftp->fd = -1; 291 | return kftp_connect(ftp); 292 | } 293 | 294 | // initialize ->type, ->host, ->retr and ->size 295 | knetFile *kftp_parse_url(const char *fn, const char *mode) 296 | { 297 | knetFile *fp; 298 | char *p; 299 | int l; 300 | if (strstr(fn, "ftp://") != fn) return 0; 301 | for (p = (char*)fn + 6; *p && *p != '/'; ++p); 302 | if (*p != '/') return 0; 303 | l = p - fn - 6; 304 | fp = calloc(1, sizeof(knetFile)); 305 | fp->type = KNF_TYPE_FTP; 306 | fp->fd = -1; 307 | /* the Linux/Mac 
version of socket_connect() also recognizes a port 308 | * like "ftp", but the Windows version does not. */ 309 | fp->port = strdup("21"); 310 | fp->host = calloc(l + 1, 1); 311 | if (strchr(mode, 'c')) fp->no_reconnect = 1; 312 | strncpy(fp->host, fn + 6, l); 313 | fp->retr = calloc(strlen(p) + 8, 1); 314 | sprintf(fp->retr, "RETR %s\r\n", p); 315 | fp->size_cmd = calloc(strlen(p) + 8, 1); 316 | sprintf(fp->size_cmd, "SIZE %s\r\n", p); 317 | fp->seek_offset = 0; 318 | return fp; 319 | } 320 | // place ->fd at offset off 321 | int kftp_connect_file(knetFile *fp) 322 | { 323 | int ret; 324 | long long file_size; 325 | if (fp->fd != -1) { 326 | netclose(fp->fd); 327 | if (fp->no_reconnect) kftp_get_response(fp); 328 | } 329 | kftp_pasv_prep(fp); 330 | kftp_send_cmd(fp, fp->size_cmd, 1); 331 | #ifndef _WIN32 332 | if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) 333 | { 334 | fprintf(stderr,"[kftp_connect_file] %s\n", fp->response); 335 | return -1; 336 | } 337 | #else 338 | const char *p = fp->response; 339 | while (*p != ' ') ++p; 340 | while (*p < '0' || *p > '9') ++p; 341 | file_size = strtoint64(p); 342 | #endif 343 | fp->file_size = file_size; 344 | if (fp->offset>=0) { 345 | char tmp[32]; 346 | #ifndef _WIN32 347 | sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); 348 | #else 349 | strcpy(tmp, "REST "); 350 | int64tostr(tmp + 5, fp->offset); 351 | strcat(tmp, "\r\n"); 352 | #endif 353 | kftp_send_cmd(fp, tmp, 1); 354 | } 355 | kftp_send_cmd(fp, fp->retr, 0); 356 | kftp_pasv_connect(fp); 357 | ret = kftp_get_response(fp); 358 | if (ret != 150) { 359 | fprintf(stderr, "[kftp_connect_file] %s\n", fp->response); 360 | netclose(fp->fd); 361 | fp->fd = -1; 362 | return -1; 363 | } 364 | fp->is_ready = 1; 365 | return 0; 366 | } 367 | 368 | 369 | /************************** 370 | * HTTP specific routines * 371 | **************************/ 372 | 373 | knetFile *khttp_parse_url(const char *fn, const char *mode) 374 | { 375 | knetFile *fp; 376 | char *p, 
*proxy, *q; 377 | int l; 378 | if (strstr(fn, "http://") != fn) return 0; 379 | // set ->http_host 380 | for (p = (char*)fn + 7; *p && *p != '/'; ++p); 381 | l = p - fn - 7; 382 | fp = calloc(1, sizeof(knetFile)); 383 | fp->http_host = calloc(l + 1, 1); 384 | strncpy(fp->http_host, fn + 7, l); 385 | fp->http_host[l] = 0; 386 | for (q = fp->http_host; *q && *q != ':'; ++q); 387 | if (*q == ':') *q++ = 0; 388 | // get http_proxy 389 | proxy = getenv("http_proxy"); 390 | // set ->host, ->port and ->path 391 | if (proxy == 0) { 392 | fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name. 393 | fp->port = strdup(*q? q : "80"); 394 | fp->path = strdup(*p? p : "/"); 395 | } else { 396 | fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); 397 | for (q = fp->host; *q && *q != ':'; ++q); 398 | if (*q == ':') *q++ = 0; 399 | fp->port = strdup(*q? q : "80"); 400 | fp->path = strdup(fn); 401 | } 402 | fp->type = KNF_TYPE_HTTP; 403 | fp->ctrl_fd = fp->fd = -1; 404 | fp->seek_offset = 0; 405 | return fp; 406 | } 407 | 408 | int khttp_connect_file(knetFile *fp) 409 | { 410 | int ret, l = 0; 411 | char *buf, *p; 412 | if (fp->fd != -1) netclose(fp->fd); 413 | fp->fd = socket_connect(fp->host, fp->port); 414 | buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. 
415 | l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host); 416 | l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset); 417 | l += sprintf(buf + l, "\r\n"); 418 | netwrite(fp->fd, buf, l); 419 | l = 0; 420 | while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency 421 | if (buf[l] == '\n' && l >= 3) 422 | if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; 423 | ++l; 424 | } 425 | buf[l] = 0; 426 | if (l < 14) { // prematured header 427 | netclose(fp->fd); 428 | fp->fd = -1; 429 | return -1; 430 | } 431 | ret = strtol(buf + 8, &p, 0); // HTTP return code 432 | if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file 433 | off_t rest = fp->offset; 434 | while (rest) { 435 | off_t l = rest < 0x10000? rest : 0x10000; 436 | rest -= my_netread(fp->fd, buf, l); 437 | } 438 | } else if (ret != 206 && ret != 200) { 439 | free(buf); 440 | fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret); 441 | netclose(fp->fd); 442 | fp->fd = -1; 443 | return -1; 444 | } 445 | free(buf); 446 | fp->is_ready = 1; 447 | return 0; 448 | } 449 | 450 | /******************** 451 | * Generic routines * 452 | ********************/ 453 | 454 | knetFile *knet_open(const char *fn, const char *mode) 455 | { 456 | knetFile *fp = 0; 457 | if (mode[0] != 'r') { 458 | fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n"); 459 | return 0; 460 | } 461 | if (strstr(fn, "ftp://") == fn) { 462 | fp = kftp_parse_url(fn, mode); 463 | if (fp == 0) return 0; 464 | if (kftp_connect(fp) == -1) { 465 | knet_close(fp); 466 | return 0; 467 | } 468 | kftp_connect_file(fp); 469 | } else if (strstr(fn, "http://") == fn) { 470 | fp = khttp_parse_url(fn, mode); 471 | if (fp == 0) return 0; 472 | khttp_connect_file(fp); 473 | } else { // local file 474 | #ifdef _WIN32 475 | /* In windows, O_BINARY is necessary. 
In Linux/Mac, O_BINARY may 476 | * be undefined on some systems, although it is defined on my 477 | * Mac and the Linux I have tested on. */ 478 | int fd = open(fn, O_RDONLY | O_BINARY); 479 | #else 480 | int fd = open(fn, O_RDONLY); 481 | #endif 482 | if (fd == -1) { 483 | perror("open"); 484 | return 0; 485 | } 486 | fp = (knetFile*)calloc(1, sizeof(knetFile)); 487 | fp->type = KNF_TYPE_LOCAL; 488 | fp->fd = fd; 489 | fp->ctrl_fd = -1; 490 | } 491 | if (fp && fp->fd == -1) { 492 | knet_close(fp); 493 | return 0; 494 | } 495 | return fp; 496 | } 497 | 498 | knetFile *knet_dopen(int fd, const char *mode) 499 | { 500 | knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); 501 | fp->type = KNF_TYPE_LOCAL; 502 | fp->fd = fd; 503 | return fp; 504 | } 505 | 506 | off_t knet_read(knetFile *fp, void *buf, off_t len) 507 | { 508 | off_t l = 0; 509 | if (fp->fd == -1) return 0; 510 | if (fp->type == KNF_TYPE_FTP) { 511 | if (fp->is_ready == 0) { 512 | if (!fp->no_reconnect) kftp_reconnect(fp); 513 | kftp_connect_file(fp); 514 | } 515 | } else if (fp->type == KNF_TYPE_HTTP) { 516 | if (fp->is_ready == 0) 517 | khttp_connect_file(fp); 518 | } 519 | if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX 520 | off_t rest = len, curr; 521 | while (rest) { 522 | curr = read(fp->fd, buf + l, rest); 523 | if (curr == 0) break; 524 | l += curr; rest -= curr; 525 | } 526 | } else l = my_netread(fp->fd, buf, len); 527 | fp->offset += l; 528 | return l; 529 | } 530 | 531 | off_t knet_seek(knetFile *fp, int64_t off, int whence) 532 | { 533 | if (whence == SEEK_SET && off == fp->offset) return 0; 534 | if (fp->type == KNF_TYPE_LOCAL) { 535 | /* Be aware that lseek() returns the offset after seeking, 536 | * while fseek() returns zero on success. 
*/ 537 | off_t offset = lseek(fp->fd, off, whence); 538 | if (offset == -1) { 539 | // Be silent, it is OK for knet_seek to fail when the file is streamed 540 | // fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); 541 | return -1; 542 | } 543 | fp->offset = offset; 544 | return 0; 545 | } 546 | else if (fp->type == KNF_TYPE_FTP) 547 | { 548 | if (whence==SEEK_CUR) 549 | fp->offset += off; 550 | else if (whence==SEEK_SET) 551 | fp->offset = off; 552 | else if ( whence==SEEK_END) 553 | fp->offset = fp->file_size+off; 554 | fp->is_ready = 0; 555 | return 0; 556 | } 557 | else if (fp->type == KNF_TYPE_HTTP) 558 | { 559 | if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future? 560 | fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n"); 561 | errno = ESPIPE; 562 | return -1; 563 | } 564 | if (whence==SEEK_CUR) 565 | fp->offset += off; 566 | else if (whence==SEEK_SET) 567 | fp->offset = off; 568 | fp->is_ready = 0; 569 | return fp->offset; 570 | } 571 | errno = EINVAL; 572 | fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); 573 | return -1; 574 | } 575 | 576 | int knet_close(knetFile *fp) 577 | { 578 | if (fp == 0) return 0; 579 | if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific 580 | if (fp->fd != -1) { 581 | /* On Linux/Mac, netclose() is an alias of close(), but on 582 | * Windows, it is an alias of closesocket(). 
*/ 583 | if (fp->type == KNF_TYPE_LOCAL) close(fp->fd); 584 | else netclose(fp->fd); 585 | } 586 | free(fp->host); free(fp->port); 587 | free(fp->response); free(fp->retr); free(fp->size_cmd); // FTP specific 588 | free(fp->path); free(fp->http_host); // HTTP specific 589 | free(fp); 590 | return 0; 591 | } 592 | 593 | #ifdef KNETFILE_MAIN 594 | int main(void) 595 | { 596 | char *buf; 597 | knetFile *fp; 598 | int type = 4, l; 599 | #ifdef _WIN32 600 | knet_win32_init(); 601 | #endif 602 | buf = calloc(0x100000, 1); 603 | if (type == 0) { 604 | fp = knet_open("knetfile.c", "r"); 605 | knet_seek(fp, 1000, SEEK_SET); 606 | } else if (type == 1) { // NCBI FTP, large file 607 | fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); 608 | knet_seek(fp, 2500000000ll, SEEK_SET); 609 | l = knet_read(fp, buf, 255); 610 | } else if (type == 2) { 611 | fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); 612 | knet_seek(fp, 1000, SEEK_SET); 613 | } else if (type == 3) { 614 | fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r"); 615 | knet_seek(fp, 1000, SEEK_SET); 616 | } else if (type == 4) { 617 | fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r"); 618 | knet_read(fp, buf, 10000); 619 | knet_seek(fp, 20000, SEEK_SET); 620 | knet_seek(fp, 10000, SEEK_SET); 621 | l = knet_read(fp, buf+10000, 10000000) + 10000; 622 | } 623 | if (type != 4 && type != 1) { 624 | knet_read(fp, buf, 255); 625 | buf[255] = 0; 626 | printf("%s\n", buf); 627 | } else write(fileno(stdout), buf, l); 628 | knet_close(fp); 629 | free(buf); 630 | return 0; 631 | } 632 | #endif 633 | -------------------------------------------------------------------------------- /tabix-0.2.6/ChangeLog: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------ 2 | r942 | lh3lh3 | 2011-03-31 16:39:50 
-0400 (Thu, 31 Mar 2011) | 2 lines 3 | Changed paths: 4 | M /trunk/tabix/main.c 5 | 6 | update version number 7 | 8 | ------------------------------------------------------------------------ 9 | r940 | lh3lh3 | 2011-03-31 16:38:03 -0400 (Thu, 31 Mar 2011) | 2 lines 10 | Changed paths: 11 | M /trunk/tabix/bedidx.c 12 | M /trunk/tabix/main.c 13 | 14 | fixed two bugs due to recent changes 15 | 16 | ------------------------------------------------------------------------ 17 | r939 | lh3lh3 | 2011-03-31 16:12:21 -0400 (Thu, 31 Mar 2011) | 2 lines 18 | Changed paths: 19 | M /trunk/tabix/bgzf.c 20 | M /trunk/tabix/bgzf.h 21 | M /trunk/tabix/main.c 22 | 23 | update to the latest bgzf.* 24 | 25 | ------------------------------------------------------------------------ 26 | r938 | lh3lh3 | 2011-03-31 16:02:21 -0400 (Thu, 31 Mar 2011) | 2 lines 27 | Changed paths: 28 | M /trunk/tabix/index.c 29 | M /trunk/tabix/main.c 30 | M /trunk/tabix/tabix.h 31 | 32 | BED support 33 | 34 | ------------------------------------------------------------------------ 35 | r937 | lh3lh3 | 2011-03-31 15:03:49 -0400 (Thu, 31 Mar 2011) | 2 lines 36 | Changed paths: 37 | M /trunk/tabix/Makefile 38 | A /trunk/tabix/bedidx.c 39 | M /trunk/tabix/example.gtf.gz.tbi 40 | M /trunk/tabix/index.c 41 | A /trunk/tabix/kseq.h 42 | M /trunk/tabix/tabix.h 43 | 44 | restructure get_intv() for BED support 45 | 46 | ------------------------------------------------------------------------ 47 | r919 | petulda | 2011-02-24 10:14:14 -0500 (Thu, 24 Feb 2011) | 1 line 48 | Changed paths: 49 | M /trunk/tabix/bgzf.c 50 | M /trunk/tabix/bgzf.h 51 | M /trunk/tabix/index.c 52 | M /trunk/tabix/main.c 53 | 54 | New -r (reheader) option for efficient header replacement. 
55 | ------------------------------------------------------------------------ 56 | r915 | lh3lh3 | 2011-02-22 09:50:57 -0500 (Tue, 22 Feb 2011) | 2 lines 57 | Changed paths: 58 | A /trunk/tabix/python 59 | A /trunk/tabix/python/setup.py (from /trunk/tabix/setup.py:914) 60 | A /trunk/tabix/python/tabixmodule.c (from /trunk/tabix/tabixmodule.c:914) 61 | A /trunk/tabix/python/test.py (from /trunk/tabix/test.py:914) 62 | D /trunk/tabix/setup.py 63 | D /trunk/tabix/tabixmodule.c 64 | D /trunk/tabix/test.py 65 | 66 | move to a new python/ directory 67 | 68 | ------------------------------------------------------------------------ 69 | r914 | lh3lh3 | 2011-02-22 09:49:35 -0500 (Tue, 22 Feb 2011) | 2 lines 70 | Changed paths: 71 | A /trunk/tabix/setup.py 72 | A /trunk/tabix/tabixmodule.c 73 | A /trunk/tabix/test.py 74 | 75 | CPython C-API by Hyeshik Chang 76 | 77 | ------------------------------------------------------------------------ 78 | r904 | petulda | 2011-01-28 08:06:27 -0500 (Fri, 28 Jan 2011) | 1 line 79 | Changed paths: 80 | M /trunk/tabix/index.c 81 | 82 | Check the number of fields on each line and exit nicely without segfault 83 | ------------------------------------------------------------------------ 84 | r901 | petulda | 2011-01-21 06:45:37 -0500 (Fri, 21 Jan 2011) | 1 line 85 | Changed paths: 86 | M /trunk/tabix/main.c 87 | 88 | Fix: Complain only when VCF is newer, not newer or same mtime 89 | ------------------------------------------------------------------------ 90 | r900 | petulda | 2011-01-21 04:23:04 -0500 (Fri, 21 Jan 2011) | 1 line 91 | Changed paths: 92 | M /trunk/tabix/main.c 93 | 94 | Prevent the common user mistake and check the timestamps of the vcf and index file 95 | ------------------------------------------------------------------------ 96 | r876 | lh3lh3 | 2010-12-08 12:38:45 -0500 (Wed, 08 Dec 2010) | 2 lines 97 | Changed paths: 98 | M /trunk/tabix/ChangeLog 99 | M /trunk/tabix/NEWS 100 | M /trunk/tabix/main.c 101 | 102 | Release 
tabix-0.2.3 103 | 104 | ------------------------------------------------------------------------ 105 | r875 | lh3lh3 | 2010-12-08 12:28:35 -0500 (Wed, 08 Dec 2010) | 2 lines 106 | Changed paths: 107 | M /trunk/tabix/ChangeLog 108 | M /trunk/tabix/index.c 109 | 110 | Fixed a minor bug in generating index 111 | 112 | ------------------------------------------------------------------------ 113 | r855 | petulda | 2010-11-25 11:50:13 -0500 (Thu, 25 Nov 2010) | 1 line 114 | Changed paths: 115 | M /trunk/tabix/main.c 116 | 117 | Disable "unknown target name or minus interval" warning. 118 | ------------------------------------------------------------------------ 119 | r775 | petulda | 2010-10-26 15:02:30 -0400 (Tue, 26 Oct 2010) | 1 line 120 | Changed paths: 121 | M /trunk/tabix/main.c 122 | 123 | Added -h option to print header lines 124 | ------------------------------------------------------------------------ 125 | r742 | jmarshall | 2010-09-27 06:47:23 -0400 (Mon, 27 Sep 2010) | 2 lines 126 | Changed paths: 127 | M /trunk/tabix 128 | 129 | Add svn:ignore properties for intermediate and generated files. 
130 | 131 | ------------------------------------------------------------------------ 132 | r725 | lh3lh3 | 2010-09-15 13:01:53 -0400 (Wed, 15 Sep 2010) | 2 lines 133 | Changed paths: 134 | M /trunk/tabix/bgzip.c 135 | 136 | patches by Peter Chines 137 | 138 | ------------------------------------------------------------------------ 139 | r714 | lh3lh3 | 2010-09-07 10:13:25 -0400 (Tue, 07 Sep 2010) | 2 lines 140 | Changed paths: 141 | M /trunk/tabix/TabixReader.java 142 | M /trunk/tabix/index.c 143 | M /trunk/tabix/main.c 144 | 145 | fixed a bug in C/Java when n_off == 0 146 | 147 | ------------------------------------------------------------------------ 148 | r712 | lh3lh3 | 2010-09-03 09:21:23 -0400 (Fri, 03 Sep 2010) | 2 lines 149 | Changed paths: 150 | M /trunk/tabix/TabixReader.java 151 | 152 | fixed a bug in parsing region strings 153 | 154 | ------------------------------------------------------------------------ 155 | r700 | petulda | 2010-08-25 10:42:37 -0400 (Wed, 25 Aug 2010) | 1 line 156 | Changed paths: 157 | M /trunk/tabix/main.c 158 | 159 | Fix: Exit with an error rather than segfault when index is not present and region is queried 160 | ------------------------------------------------------------------------ 161 | r696 | petulda | 2010-08-24 10:24:12 -0400 (Tue, 24 Aug 2010) | 1 line 162 | Changed paths: 163 | M /trunk/tabix/bgzf.c 164 | M /trunk/tabix/bgzf.h 165 | M /trunk/tabix/index.c 166 | M /trunk/tabix/main.c 167 | 168 | Complain about not-bgzipped files and check for noncontinuous chromosome blocks 169 | ------------------------------------------------------------------------ 170 | r603 | lh3lh3 | 2010-06-28 10:49:39 -0400 (Mon, 28 Jun 2010) | 2 lines 171 | Changed paths: 172 | M /trunk/tabix/NEWS 173 | M /trunk/tabix/TabixReader.java 174 | M /trunk/tabix/index.c 175 | M /trunk/tabix/main.c 176 | 177 | Release tabix-0.2.2 178 | 179 | ------------------------------------------------------------------------ 180 | r597 | lh3lh3 | 2010-06-13 
21:08:29 -0400 (Sun, 13 Jun 2010) | 3 lines 181 | Changed paths: 182 | M /trunk/tabix/index.c 183 | 184 | Change the namespace of sorting, to avoid function name collision with samtools. 185 | 186 | 187 | ------------------------------------------------------------------------ 188 | r582 | lh3lh3 | 2010-06-03 10:40:25 -0400 (Thu, 03 Jun 2010) | 2 lines 189 | Changed paths: 190 | M /trunk/tabix/NEWS 191 | M /trunk/tabix/main.c 192 | M /trunk/tabix/tabix.py 193 | 194 | Release tabix-0.2.1 195 | 196 | ------------------------------------------------------------------------ 197 | r581 | lh3lh3 | 2010-05-24 14:24:24 -0400 (Mon, 24 May 2010) | 2 lines 198 | Changed paths: 199 | M /trunk/tabix/tabix.py 200 | 201 | OOP interface with the help from Aaron Quinlan 202 | 203 | ------------------------------------------------------------------------ 204 | r580 | lh3lh3 | 2010-05-23 23:36:05 -0400 (Sun, 23 May 2010) | 2 lines 205 | Changed paths: 206 | M /trunk/tabix/tabix.py 207 | 208 | minor change 209 | 210 | ------------------------------------------------------------------------ 211 | r579 | lh3lh3 | 2010-05-23 23:25:24 -0400 (Sun, 23 May 2010) | 2 lines 212 | Changed paths: 213 | M /trunk/tabix/tabix.py 214 | 215 | For Snow Leopard compatibility 216 | 217 | ------------------------------------------------------------------------ 218 | r575 | lh3lh3 | 2010-05-12 19:31:27 -0400 (Wed, 12 May 2010) | 4 lines 219 | Changed paths: 220 | M /trunk/tabix/Makefile 221 | M /trunk/tabix/index.c 222 | M /trunk/tabix/tabix.h 223 | A /trunk/tabix/tabix.py 224 | 225 | * optionally generate shared library for Mac and Linux 226 | * added a python script that directly calls the shared library 227 | * added a new API for easy python access 228 | 229 | ------------------------------------------------------------------------ 230 | r574 | lh3lh3 | 2010-05-11 12:14:27 -0400 (Tue, 11 May 2010) | 2 lines 231 | Changed paths: 232 | M /trunk/tabix/ChangeLog 233 | M /trunk/tabix/NEWS 234 | M 
/trunk/tabix/perl/Tabix.pm 235 | M /trunk/tabix/perl/TabixIterator.pm 236 | M /trunk/tabix/tabix.1 237 | 238 | Release tabix-0.2.0 239 | 240 | ------------------------------------------------------------------------ 241 | r573 | lh3lh3 | 2010-05-11 12:08:30 -0400 (Tue, 11 May 2010) | 2 lines 242 | Changed paths: 243 | M /trunk/tabix/Makefile 244 | 245 | Added -fPIC 246 | 247 | ------------------------------------------------------------------------ 248 | r572 | lh3lh3 | 2010-05-11 11:59:07 -0400 (Tue, 11 May 2010) | 2 lines 249 | Changed paths: 250 | M /trunk/tabix/perl/MANIFEST 251 | 252 | update 253 | 254 | ------------------------------------------------------------------------ 255 | r571 | lh3lh3 | 2010-05-11 11:56:54 -0400 (Tue, 11 May 2010) | 4 lines 256 | Changed paths: 257 | A /trunk/tabix/example.gtf.gz 258 | A /trunk/tabix/example.gtf.gz.tbi 259 | M /trunk/tabix/index.c 260 | M /trunk/tabix/main.c 261 | M /trunk/tabix/perl/MANIFEST 262 | M /trunk/tabix/perl/Tabix.pm 263 | M /trunk/tabix/perl/Tabix.xs 264 | A /trunk/tabix/perl/TabixIterator.pm 265 | A /trunk/tabix/perl/t 266 | A /trunk/tabix/perl/t/01local.t 267 | A /trunk/tabix/perl/t/02remote.t 268 | M /trunk/tabix/tabix.1 269 | M /trunk/tabix/tabix.h 270 | 271 | * improved C/Perl APIs 272 | * added test for Perl 273 | * added an tiny example 274 | 275 | ------------------------------------------------------------------------ 276 | r570 | lh3lh3 | 2010-05-11 01:04:21 -0400 (Tue, 11 May 2010) | 2 lines 277 | Changed paths: 278 | M /trunk/tabix/TabixReader.java 279 | 280 | fixed the same issue in java 281 | 282 | ------------------------------------------------------------------------ 283 | r569 | lh3lh3 | 2010-05-11 01:03:24 -0400 (Tue, 11 May 2010) | 3 lines 284 | Changed paths: 285 | M /trunk/tabix/index.c 286 | M /trunk/tabix/perl/Tabix.pm 287 | M /trunk/tabix/perl/Tabix.xs 288 | 289 | * fixed a potential issue in index.c 290 | * improve perl APIs 291 | 292 | 
------------------------------------------------------------------------ 293 | r568 | lh3lh3 | 2010-05-10 23:46:21 -0400 (Mon, 10 May 2010) | 2 lines 294 | Changed paths: 295 | M /trunk/tabix/perl/Tabix.xs 296 | 297 | return an array from get_names() 298 | 299 | ------------------------------------------------------------------------ 300 | r567 | lh3lh3 | 2010-05-10 23:38:46 -0400 (Mon, 10 May 2010) | 4 lines 301 | Changed paths: 302 | M /trunk/tabix/TabixReader.java 303 | M /trunk/tabix/index.c 304 | A /trunk/tabix/perl 305 | A /trunk/tabix/perl/MANIFEST 306 | A /trunk/tabix/perl/Makefile.PL 307 | A /trunk/tabix/perl/Tabix.pm 308 | A /trunk/tabix/perl/Tabix.xs 309 | A /trunk/tabix/perl/typemap 310 | M /trunk/tabix/tabix.h 311 | 312 | * added the initial perl binding. The interface needs to be improved. 313 | * added a new API for perl binding 314 | * fixed a potential bug in java. 315 | 316 | ------------------------------------------------------------------------ 317 | r565 | lh3lh3 | 2010-05-09 23:24:35 -0400 (Sun, 09 May 2010) | 2 lines 318 | Changed paths: 319 | M /trunk/tabix/main.c 320 | 321 | Release tabix-0.1.6 322 | 323 | ------------------------------------------------------------------------ 324 | r564 | lh3lh3 | 2010-05-09 23:01:49 -0400 (Sun, 09 May 2010) | 2 lines 325 | Changed paths: 326 | M /trunk/tabix/index.c 327 | 328 | fixed a typo 329 | 330 | ------------------------------------------------------------------------ 331 | r563 | lh3lh3 | 2010-05-09 22:58:26 -0400 (Sun, 09 May 2010) | 2 lines 332 | Changed paths: 333 | A /trunk/tabix/ChangeLog 334 | M /trunk/tabix/NEWS 335 | M /trunk/tabix/index.c 336 | M /trunk/tabix/main.c 337 | M /trunk/tabix/tabix.h 338 | 339 | If nothing bad happens, this will become 0.1.6 340 | 341 | ------------------------------------------------------------------------ 342 | r562 | lh3lh3 | 2010-05-09 19:43:56 -0400 (Sun, 09 May 2010) | 2 lines 343 | Changed paths: 344 | M /trunk/tabix/index.c 345 | 346 | Fixed a bug 347 
| 348 | ------------------------------------------------------------------------ 349 | r560 | lh3lh3 | 2010-05-05 10:59:09 -0400 (Wed, 05 May 2010) | 3 lines 350 | Changed paths: 351 | A /trunk/tabix/NEWS 352 | M /trunk/tabix/TabixReader.java 353 | M /trunk/tabix/index.c 354 | M /trunk/tabix/main.c 355 | M /trunk/tabix/tabix.1 356 | M /trunk/tabix/tabix.h 357 | 358 | * Release tabix-0.1.5 (r560) 359 | * Improve seeking efficiency. Index file needs to be rebuilt. 360 | 361 | ------------------------------------------------------------------------ 362 | r559 | lh3lh3 | 2010-05-04 23:11:42 -0400 (Tue, 04 May 2010) | 2 lines 363 | Changed paths: 364 | M /trunk/tabix/main.c 365 | 366 | Release tabix-0.1.4 (r559) 367 | 368 | ------------------------------------------------------------------------ 369 | r558 | lh3lh3 | 2010-05-01 12:48:01 -0400 (Sat, 01 May 2010) | 2 lines 370 | Changed paths: 371 | M /trunk/tabix/TabixReader.java 372 | 373 | implement SAM/VCF support; NOT tested yet 374 | 375 | ------------------------------------------------------------------------ 376 | r557 | lh3lh3 | 2010-05-01 00:42:34 -0400 (Sat, 01 May 2010) | 2 lines 377 | Changed paths: 378 | A /trunk/tabix/TabixReader.java 379 | 380 | The Java implementation of tabix. 
381 | 382 | ------------------------------------------------------------------------ 383 | r556 | lh3lh3 | 2010-04-30 22:34:07 -0400 (Fri, 30 Apr 2010) | 4 lines 384 | Changed paths: 385 | M /trunk/tabix/index.c 386 | M /trunk/tabix/knetfile.c 387 | M /trunk/tabix/main.c 388 | 389 | * tabix-0.1.3-3 (r556) 390 | * fixed a small memory leak in knetfile 391 | * fixed a minor bug for remote downloading 392 | 393 | ------------------------------------------------------------------------ 394 | r555 | lh3lh3 | 2010-04-30 22:15:12 -0400 (Fri, 30 Apr 2010) | 4 lines 395 | Changed paths: 396 | M /trunk/tabix/Makefile 397 | M /trunk/tabix/index.c 398 | M /trunk/tabix/main.c 399 | 400 | * tabix-0.1.3-2 (r555) 401 | * do not overwrite index file by default 402 | * a little code cleanup 403 | 404 | ------------------------------------------------------------------------ 405 | r554 | lh3lh3 | 2010-04-30 21:44:31 -0400 (Fri, 30 Apr 2010) | 2 lines 406 | Changed paths: 407 | M /trunk/tabix/index.c 408 | 409 | fixed a potential bug for UCSC-like coordinate 410 | 411 | ------------------------------------------------------------------------ 412 | r553 | lh3lh3 | 2010-04-28 17:43:41 -0400 (Wed, 28 Apr 2010) | 2 lines 413 | Changed paths: 414 | M /trunk/tabix/tabix.tex 415 | 416 | minor clarification to the format spec 417 | 418 | ------------------------------------------------------------------------ 419 | r552 | lh3lh3 | 2010-04-28 16:33:07 -0400 (Wed, 28 Apr 2010) | 3 lines 420 | Changed paths: 421 | M /trunk/tabix/Makefile 422 | M /trunk/tabix/bgzip.c 423 | A /trunk/tabix/tabix.tex 424 | 425 | * added the format specification 426 | * fixed a typo in bgzip 427 | 428 | ------------------------------------------------------------------------ 429 | r550 | petulda | 2010-04-22 11:03:24 -0400 (Thu, 22 Apr 2010) | 1 line 430 | Changed paths: 431 | M /trunk/tabix/bgzip.c 432 | 433 | The behaviour changed slightly to mimic gzip. Detect if std descriptors are connected to the terminal. 
434 | ------------------------------------------------------------------------ 435 | r549 | petulda | 2010-04-22 09:46:10 -0400 (Thu, 22 Apr 2010) | 1 line 436 | Changed paths: 437 | M /trunk/tabix/bgzip.c 438 | 439 | Fix in src/dst file detection and slight change of behaviour 440 | ------------------------------------------------------------------------ 441 | r548 | petulda | 2010-04-19 04:39:46 -0400 (Mon, 19 Apr 2010) | 1 line 442 | Changed paths: 443 | M /trunk/tabix/index.c 444 | 445 | Close file descriptor in ti_list_chromosomes 446 | ------------------------------------------------------------------------ 447 | r547 | petulda | 2010-04-16 09:27:11 -0400 (Fri, 16 Apr 2010) | 1 line 448 | Changed paths: 449 | M /trunk/tabix/index.c 450 | M /trunk/tabix/main.c 451 | M /trunk/tabix/tabix.h 452 | 453 | Added the -l option for listing chromosomes 454 | ------------------------------------------------------------------------ 455 | r544 | lh3lh3 | 2010-03-29 10:58:48 -0400 (Mon, 29 Mar 2010) | 2 lines 456 | Changed paths: 457 | M /trunk/tabix/main.c 458 | 459 | removed a line of debugging code 460 | 461 | ------------------------------------------------------------------------ 462 | r543 | lh3lh3 | 2010-03-19 12:29:16 -0400 (Fri, 19 Mar 2010) | 3 lines 463 | Changed paths: 464 | M /trunk/tabix/index.c 465 | M /trunk/tabix/main.c 466 | M /trunk/tabix/tabix.1 467 | 468 | * tabix-0.1.3 (r543) 469 | * fixed another off-by-one bug 470 | 471 | ------------------------------------------------------------------------ 472 | r542 | lh3lh3 | 2010-03-16 22:35:52 -0400 (Tue, 16 Mar 2010) | 2 lines 473 | Changed paths: 474 | M /trunk/tabix/index.c 475 | M /trunk/tabix/main.c 476 | M /trunk/tabix/tabix.1 477 | 478 | Release tabix-0.1.1 479 | 480 | ------------------------------------------------------------------------ 481 | r506 | lh3lh3 | 2009-11-02 23:20:12 -0500 (Mon, 02 Nov 2009) | 2 lines 482 | Changed paths: 483 | M /trunk/tabix/main.c 484 | 485 | Release tabix-0.1.0 486 | 
487 | ------------------------------------------------------------------------ 488 | r505 | lh3lh3 | 2009-11-02 23:15:49 -0500 (Mon, 02 Nov 2009) | 2 lines 489 | Changed paths: 490 | A /trunk/tabix/tabix.1 491 | 492 | documentation 493 | 494 | ------------------------------------------------------------------------ 495 | r504 | lh3lh3 | 2009-11-02 11:08:18 -0500 (Mon, 02 Nov 2009) | 5 lines 496 | Changed paths: 497 | M /trunk/tabix/Makefile 498 | M /trunk/tabix/bgzip.c 499 | M /trunk/tabix/index.c 500 | M /trunk/tabix/main.c 501 | M /trunk/tabix/tabix.h 502 | 503 | * tabix-0.0.0-5 (r504) 504 | * fixed a critical bug in fetching data (a typo in fact) 505 | * support SAM (tested on ex1.sam) and VCF (not tested) 506 | * improve the command-line interface 507 | 508 | ------------------------------------------------------------------------ 509 | r503 | lh3lh3 | 2009-11-02 10:04:43 -0500 (Mon, 02 Nov 2009) | 3 lines 510 | Changed paths: 511 | M /trunk/tabix/Makefile 512 | M /trunk/tabix/index.c 513 | M /trunk/tabix/main.c 514 | 515 | * tabix-0.0.0-4 (r503) 516 | * index files are bgzf compressed 517 | 518 | ------------------------------------------------------------------------ 519 | r502 | lh3lh3 | 2009-11-02 09:47:25 -0500 (Mon, 02 Nov 2009) | 4 lines 520 | Changed paths: 521 | M /trunk/tabix/index.c 522 | M /trunk/tabix/main.c 523 | M /trunk/tabix/tabix.h 524 | 525 | * tabix-0.0.0-3 (r502) 526 | * support meta lines (not tested) 527 | * I am going to make the index file in the BGZF format 528 | 529 | ------------------------------------------------------------------------ 530 | r501 | lh3lh3 | 2009-11-01 22:03:07 -0500 (Sun, 01 Nov 2009) | 3 lines 531 | Changed paths: 532 | M /trunk/tabix/Makefile 533 | M /trunk/tabix/bgzf.h 534 | M /trunk/tabix/index.c 535 | M /trunk/tabix/main.c 536 | 537 | * tabix-0.0.0-2 (r501) 538 | * accelerate ti_readline() 539 | 540 | ------------------------------------------------------------------------ 541 | r500 | lh3lh3 | 2009-11-01 
20:49:52 -0500 (Sun, 01 Nov 2009) | 3 lines 542 | Changed paths: 543 | M /trunk/tabix/Makefile 544 | M /trunk/tabix/bgzip.c 545 | M /trunk/tabix/index.c 546 | M /trunk/tabix/main.c 547 | 548 | * tabix-0.0.0-1 (r500) 549 | * apparently working 550 | 551 | ------------------------------------------------------------------------ 552 | r499 | lh3lh3 | 2009-11-01 14:04:52 -0500 (Sun, 01 Nov 2009) | 2 lines 553 | Changed paths: 554 | D /trunk/tabix/parser.c 555 | 556 | obsolete file 557 | 558 | ------------------------------------------------------------------------ 559 | r498 | lh3lh3 | 2009-11-01 14:04:08 -0500 (Sun, 01 Nov 2009) | 2 lines 560 | Changed paths: 561 | M /trunk/tabix/bgzip.c 562 | 563 | bgzip is more like gzip in its command-line interface 564 | 565 | ------------------------------------------------------------------------ 566 | r497 | lh3lh3 | 2009-11-01 13:43:35 -0500 (Sun, 01 Nov 2009) | 2 lines 567 | Changed paths: 568 | A /trunk/tabix/Makefile 569 | A /trunk/tabix/bam_endian.h 570 | A /trunk/tabix/bgzf.c 571 | A /trunk/tabix/bgzf.h 572 | A /trunk/tabix/bgzip.c 573 | A /trunk/tabix/index.c 574 | A /trunk/tabix/khash.h 575 | A /trunk/tabix/knetfile.c 576 | A /trunk/tabix/knetfile.h 577 | A /trunk/tabix/ksort.h 578 | A /trunk/tabix/kstring.c 579 | A /trunk/tabix/kstring.h 580 | A /trunk/tabix/main.c 581 | A /trunk/tabix/parser.c 582 | A /trunk/tabix/tabix.h 583 | 584 | initial source code. It is BUGGY! 585 | 586 | ------------------------------------------------------------------------ 587 | r496 | lh3lh3 | 2009-11-01 13:42:39 -0500 (Sun, 01 Nov 2009) | 2 lines 588 | Changed paths: 589 | A /trunk/tabix 590 | 591 | A generic indexer for TAB-delimited genome position files 592 | 593 | ------------------------------------------------------------------------ 594 | --------------------------------------------------------------------------------