├── tabix-0.2.6
    ├── perl
    │   ├── typemap
    │   ├── MANIFEST
    │   ├── Makefile.PL
    │   ├── t
    │   │   ├── 01local.t
    │   │   └── 02remote.t
    │   ├── TabixIterator.pm
    │   ├── Tabix.xs
    │   └── Tabix.pm
    ├── bgzf.o
    ├── bgzip
    ├── bgzip.o
    ├── index.o
    ├── main.o
    ├── tabix
    ├── bedidx.o
    ├── kstring.o
    ├── knetfile.o
    ├── libtabix.a
    ├── example.gtf.gz
    ├── example.gtf.gz.tbi
    ├── bam_endian.h
    ├── Makefile
    ├── kstring.h
    ├── knetfile.h
    ├── python
    │   ├── setup.py
    │   ├── test.py
    │   └── tabixmodule.c
    ├── tabix.py
    ├── NEWS
    ├── tabix.1
    ├── bedidx.c
    ├── kstring.c
    ├── tabix.h
    ├── tabix.tex
    ├── bgzf.h
    ├── bgzip.c
    ├── kseq.h
    ├── ksort.h
    ├── main.c
    ├── TabixReader.java
    ├── khash.h
    ├── bgzf.c
    ├── knetfile.c
    └── ChangeLog
├── test.txt
├── README.md
└── fathmm-MKL.py


/tabix-0.2.6/perl/typemap:
--------------------------------------------------------------------------------
1 | TYPEMAP
2 | tabix_t*   T_PTROBJ
3 | ti_iter_t  T_PTROBJ


--------------------------------------------------------------------------------
/tabix-0.2.6/bgzf.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/bgzf.o


--------------------------------------------------------------------------------
/tabix-0.2.6/bgzip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/bgzip


--------------------------------------------------------------------------------
/tabix-0.2.6/bgzip.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/bgzip.o


--------------------------------------------------------------------------------
/tabix-0.2.6/index.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/index.o


--------------------------------------------------------------------------------
/tabix-0.2.6/main.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/main.o


--------------------------------------------------------------------------------
/tabix-0.2.6/tabix:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/tabix


--------------------------------------------------------------------------------
/tabix-0.2.6/bedidx.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/bedidx.o


--------------------------------------------------------------------------------
/tabix-0.2.6/kstring.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/kstring.o


--------------------------------------------------------------------------------
/tabix-0.2.6/knetfile.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/knetfile.o


--------------------------------------------------------------------------------
/tabix-0.2.6/libtabix.a:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/libtabix.a


--------------------------------------------------------------------------------
/tabix-0.2.6/example.gtf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/example.gtf.gz


--------------------------------------------------------------------------------
/tabix-0.2.6/example.gtf.gz.tbi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HAShihab/fathmm-MKL/HEAD/tabix-0.2.6/example.gtf.gz.tbi


--------------------------------------------------------------------------------
/tabix-0.2.6/perl/MANIFEST:
--------------------------------------------------------------------------------
1 | MANIFEST
2 | typemap
3 | Tabix.xs
4 | Tabix.pm
5 | TabixIterator.pm
6 | Makefile.PL
7 | t/01local.t
8 | t/02remote.t


--------------------------------------------------------------------------------
/test.txt:
--------------------------------------------------------------------------------
1 | # Note: records beginning with a '#' are not processed (comments)
2 | #
3 | # The software expects data to be in the following format (comma-separated): chromosome, position, reference and mutant base
4 | #
5 | 1,916549,A,G
6 | 1,935222,C,A
7 | 1,11854785,C,T
8 | 1,11854786,C,T
9 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/perl/Makefile.PL:
--------------------------------------------------------------------------------
use strict;
use warnings;
use ExtUtils::MakeMaker;

# Build configuration for the Tabix XS extension. The tabix headers and
# libtabix are taken from the parent directory (-I.. / -L..).
WriteMakefile(
    NAME         => 'Tabix',
    VERSION_FROM => 'Tabix.pm',
    # The search path comes first, and -lz follows -ltabix: libtabix is linked
    # with -lz itself (see the parent Makefile), and a library must appear
    # after the objects that need its symbols for the linker to keep it.
    LIBS         => ['-L.. -ltabix -lz'],
    DEFINE       => '-D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE',
    INC          => '-I..',
);
9 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/perl/t/01local.t:
--------------------------------------------------------------------------------
 1 | #-*-Perl-*-
 2 | use Test::More tests => 9;
 3 | BEGIN { use_ok('Tabix') };
 4 | 
 5 | { # C-like low-level interface
 6 | 	my $t = tabix_open("../example.gtf.gz");
 7 | 	ok($t);
 8 | 	my $iter = tabix_query($t, "chr1", 0, 2000);
 9 | 	ok($iter);
10 | 	$_ = 0;
11 | 	++$_ while (tabix_read($t, $iter));
12 | 	is($_, 6);
13 | 	tabix_iter_free($iter);
14 | 	@_ = tabix_getnames($t);
15 | 	is(scalar(@_), 2);
16 | }
17 | 
18 | { # OOP high-level interface
19 | 	my $t = Tabix->new(-data=>"../example.gtf.gz");
20 | 	ok($t);
21 | 	my $iter = $t->query("chr1", 3000, 5000);
22 | 	ok($iter);
23 | 	$_ = 0;
24 | 	++$_ while ($t->read($iter));
25 | 	is($_, 27);
26 | 	@_ = $t->getnames;
27 | 	is($_[1], "chr2");
28 | }
29 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/perl/TabixIterator.pm:
--------------------------------------------------------------------------------
 1 | package TabixIterator;
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use Carp qw/croak/;
 6 | 
 7 | require Exporter;
 8 | 
 9 | our @ISA = qw/Exporter/;
10 | our @EXPORT = qw/tabix_iter_free/;
11 | 
12 | our $VERSION = '0.2.0';
13 | 
14 | require XSLoader;
15 | XSLoader::load('Tabix', $VERSION);
16 | 
# Constructor. Accepts either a class name or an existing object (whose
# class is reused) and returns a new, empty blessed hash.
sub new {
  my ($invocant) = @_;
  my $class = ref($invocant) || $invocant;
  return bless({}, $class);
}
24 | 
# Store the given iterator handle in the object's "_" slot.
sub set {
  my $self = shift;
  my $iter = shift;
  return $self->{_} = $iter;
}
29 | 
# Return whatever is currently stored in the object's "_" slot.
sub get {
  my ($self) = @_;
  return $self->{_};
}
34 | 
# Destructor: hand the stored handle to tabix_iter_free(), but only when
# the "_" slot holds a true value.
sub DESTROY {
  my ($self) = @_;
  if ($self->{_}) {
    tabix_iter_free($self->{_});
  }
}
39 | 
40 | 1;
41 | __END__
42 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/perl/t/02remote.t:
--------------------------------------------------------------------------------
 1 | #-*-Perl-*-
 2 | use Test::More tests => 9;
 3 | BEGIN { use_ok('Tabix') };
 4 | 
 5 | { # FTP access
 6 | 	my $t = Tabix->new(-data=>"ftp://ftp.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/CEU.SRP000031.2010_03.genotypes.vcf.gz");
 7 | 	ok($t);
 8 | 	my $iter = $t->query("1", 1000000, 1100000);
 9 | 	ok($iter);
10 | 	$_ = 0;
11 | 	++$_ while ($t->read($iter));
12 | 	is($_, 306);
13 | 	@_ = $t->getnames;
14 | 	is(scalar(@_), 22);
15 | }
16 | 
17 | { # FTP access plus FTP index
18 | 	my $t = Tabix->new(-data=>"ftp://ftp.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/CEU.SRP000031.2010_03.genotypes.vcf.gz",
19 | 					   -index=>"ftp://ftp.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/CEU.SRP000031.2010_03.genotypes.vcf.gz.tbi");
20 | 	ok($t);
21 | 	my $iter = $t->query("19", 10000000, 10100000);
22 | 	ok($iter);
23 | 	$_ = 0;
24 | 	++$_ while ($t->read($iter));
25 | 	is($_, 268);
26 | 	@_ = $t->getnames;
27 | 	is(scalar(@_), 22);
28 | }
29 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/bam_endian.h:
--------------------------------------------------------------------------------
 1 | #ifndef BAM_ENDIAN_H
 2 | #define BAM_ENDIAN_H
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | static inline int bam_is_big_endian()
 7 | {
 8 | 	long one= 1;
 9 | 	return !(*((char *)(&one)));
10 | }
/* Swap the two bytes of a 16-bit value. */
static inline uint16_t bam_swap_endian_2(uint16_t v)
{
	const unsigned low_byte = v & 0xFFU;
	const unsigned high_byte = (v >> 8) & 0xFFU;
	return (uint16_t)((low_byte << 8) | high_byte);
}
/* In-place variant: byte-swap the 16-bit value x points to; returns x. */
static inline void *bam_swap_endian_2p(void *x)
{
	uint16_t *value = (uint16_t *)x;
	*value = bam_swap_endian_2(*value);
	return x;
}
/* Reverse the byte order of a 32-bit value. */
static inline uint32_t bam_swap_endian_4(uint32_t v)
{
	return (v >> 24)
		| ((v >> 8) & 0x0000FF00U)
		| ((v << 8) & 0x00FF0000U)
		| (v << 24);
}
/* In-place variant: byte-swap the 32-bit value x points to; returns x. */
static inline void *bam_swap_endian_4p(void *x)
{
	uint32_t *value = (uint32_t *)x;
	*value = bam_swap_endian_4(*value);
	return x;
}
/* Reverse the byte order of a 64-bit value. */
static inline uint64_t bam_swap_endian_8(uint64_t v)
{
	uint64_t swapped = 0;
	int i;
	/* Peel bytes off the low end of v and push them onto the low end of
	 * the result, so the first byte peeled ends up most significant. */
	for (i = 0; i < 8; ++i) {
		swapped = (swapped << 8) | (v & 0xFFU);
		v >>= 8;
	}
	return swapped;
}
/* In-place variant: byte-swap the 64-bit value x points to; returns x. */
static inline void *bam_swap_endian_8p(void *x)
{
	uint64_t *value = (uint64_t *)x;
	*value = bam_swap_endian_8(*value);
	return x;
}
41 | 
42 | #endif
43 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/perl/Tabix.xs:
--------------------------------------------------------------------------------
 1 | #include "EXTERN.h"
 2 | #include "perl.h"
 3 | #include "XSUB.h"
 4 | 
 5 | #include <stdlib.h>
 6 | #include "tabix.h"
 7 | 
 8 | MODULE = Tabix PACKAGE = Tabix
 9 | 
tabix_t*
tabix_open(fn, fnidx=0)
	char *fn
	char *fnidx
  CODE:
	/* Thin wrapper: passes the data file name and the optional index
	 * file name (0 when the caller omits it) to ti_open() and returns
	 * whatever handle it gives back. */
	RETVAL = ti_open(fn, fnidx);
  OUTPUT:
	RETVAL
18 | 
void
tabix_close(t)
	tabix_t *t
  CODE:
	/* Thin wrapper: hands the handle to ti_close(); nothing is returned. */
	ti_close(t);
24 | 
ti_iter_t
tabix_query(t, seq=0, beg=0, end=0x7fffffff)
	tabix_t *t
	const char *seq
	int beg
	int end
  PREINIT:
  CODE:
	/* Thin wrapper around ti_query(). Omitted arguments take the defaults
	 * from the signature above: seq=0, beg=0, end=0x7fffffff. */
	RETVAL = ti_query(t, seq, beg, end);
  OUTPUT:
	RETVAL
36 | 
SV*
tabix_read(t, iter)
	tabix_t *t
	ti_iter_t iter
  PREINIT:
	const char *s;
	int len;
  CODE:
	/* Fetch the next record from the iterator. ti_read() returns 0 when
	 * there is nothing (more) to return; in that case give Perl an empty
	 * list. Otherwise copy the record into a new string SV. */
	s = ti_read(t, iter, &len);
	/* XSRETURN_EMPTY is a complete statement that returns by itself. It
	 * must not be prefixed with `return`, which does not compile when the
	 * macro expands to a do { } while (0) block. */
	if (s == 0)
		XSRETURN_EMPTY;
	RETVAL = newSVpv(s, len);
  OUTPUT:
	RETVAL
51 | 
void
tabix_getnames(t)
	tabix_t *t
  PREINIT:
	const char **names;
	int i, n;
  PPCODE:
	/* Returns the sequence names as a Perl list. ti_lazy_index_load() is
	 * called first -- presumably so that t->idx is populated before it is
	 * read; confirm against the tabix library source. */
	ti_lazy_index_load(t);
	names = ti_seqname(t->idx, &n);
	/* Push one mortal string SV per name onto the Perl stack. */
	for (i = 0; i < n; ++i)
		XPUSHs(sv_2mortal(newSVpv(names[i], 0)));
	/* Only the array of pointers is freed here, not the strings. */
	free(names);
64 | 
65 | MODULE = Tabix PACKAGE = TabixIterator
66 | 
void
tabix_iter_free(iter)
	ti_iter_t iter
  CODE:
	/* Thin wrapper: hands the iterator to ti_iter_destroy(). */
	ti_iter_destroy(iter);
72 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/perl/Tabix.pm:
--------------------------------------------------------------------------------
 1 | package Tabix;
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use Carp qw/croak/;
 6 | 
 7 | use TabixIterator;
 8 | 
 9 | require Exporter;
10 | 
11 | our @ISA = qw/Exporter/;
12 | our @EXPORT = qw/tabix_open tabix_close tabix_read tabix_query tabix_getnames tabix_iter_free/;
13 | 
14 | our $VERSION = '0.2.0';
15 | 
16 | require XSLoader;
17 | XSLoader::load('Tabix', $VERSION);
18 | 
# Constructor. Takes named arguments: -data is required (croaks without
# it) and -index is optional. Both are passed on to the open() method.
sub new {
  my ($invocant, %args) = @_;
  croak("-data argument required") unless $args{-data};
  my $self = bless({}, ref($invocant) || $invocant);
  $self->open($args{-data}, $args{-index});
  return $self;
}
29 | 
# (Re)open a data file. Any handle already held is closed first, then the
# file names are recorded and the handle from tabix_open() is stored in
# the "_" slot. The index name is only passed on when it is a true value.
sub open {
  my ($self, $fn, $fnidx) = @_;
  $self->close;
  $self->{_fn} = $fn;
  $self->{_fnidx} = $fnidx;
  my $handle = $fnidx ? tabix_open($fn, $fnidx)
                      : tabix_open($fn);
  return $self->{_} = $handle;
}
37 | 
# Close the underlying handle, if one is held, and forget the handle and
# both recorded file names. Does nothing when no handle is stored.
sub close {
  my ($self) = @_;
  my $handle = $self->{_};
  if ($handle) {
    tabix_close($handle);
    delete $self->{_};
    delete $self->{_fn};
    delete $self->{_fnidx};
  }
}
45 | 
# Destructor: delegate to the object's close() method.
sub DESTROY {
  my ($self) = @_;
  return $self->close;
}
50 | 
# Run a query on the open handle and return the result wrapped in a
# TabixIterator. Any arguments after the invocant are forwarded unchanged
# to tabix_query().
sub query {
  my ($self, @region) = @_;
  my $raw_iter = tabix_query($self->{_}, @region);
  my $wrapper = TabixIterator->new;
  $wrapper->set($raw_iter);
  return $wrapper;
}
63 | 
# Read the next record for the given iterator wrapper: its ->get value is
# passed to tabix_read() together with this object's handle.
sub read {
  my ($self, $iter) = @_;
  return tabix_read($self->{_}, $iter->get);
}
69 | 
# Return whatever tabix_getnames() yields for this object's handle.
sub getnames {
  my ($self) = @_;
  return tabix_getnames($self->{_});
}
74 | 
75 | 1;
76 | __END__
77 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/Makefile:
--------------------------------------------------------------------------------
 1 | CC=			gcc
 2 | CFLAGS=		-g -Wall -O2 -fPIC #-m64 #-arch ppc
 3 | DFLAGS=		-D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -DBGZF_CACHE
 4 | LOBJS=		bgzf.o kstring.o knetfile.o index.o bedidx.o
 5 | AOBJS=		main.o
 6 | PROG=		tabix bgzip
 7 | INCLUDES=
 8 | SUBDIRS=	.
 9 | LIBPATH=
10 | LIBCURSES=	
11 | 
12 | .SUFFIXES:.c .o
13 | 
14 | .c.o:
15 | 		$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
16 | 
17 | all-recur lib-recur clean-recur cleanlocal-recur install-recur:
18 | 		@target=`echo $@ | sed s/-recur//`; \
19 | 		wdir=`pwd`; \
20 | 		list='$(SUBDIRS)'; for subdir in $$list; do \
21 | 			cd $$subdir; \
22 | 			$(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
23 | 				INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \
24 | 			cd $$wdir; \
25 | 		done;
26 | 
27 | all:$(PROG)
28 | 
29 | lib:libtabix.a
30 | 
31 | libtabix.so.1:$(LOBJS)
32 | 		$(CC) -shared -Wl,-soname,libtabix.so -o $@ $(LOBJS) -lc -lz
33 | 
34 | libtabix.1.dylib:$(LOBJS)
35 | 		libtool -dynamic $(LOBJS) -o $@ -lc -lz
36 | 
37 | libtabix.a:$(LOBJS)
38 | 		$(AR) -csru $@ $(LOBJS)
39 | 
40 | tabix:lib $(AOBJS)
41 | 		$(CC) $(CFLAGS) -o $@ $(AOBJS) -L. -ltabix -lm $(LIBPATH) -lz
42 | 
43 | bgzip:bgzip.o bgzf.o knetfile.o
44 | 		$(CC) $(CFLAGS) -o $@ bgzip.o bgzf.o knetfile.o -lz
45 | 
46 | TabixReader.class:TabixReader.java
47 | 		javac -cp .:sam.jar TabixReader.java
48 | 
49 | kstring.o:kstring.h
50 | knetfile.o:knetfile.h
51 | bgzf.o:bgzf.h knetfile.h
52 | index.o:bgzf.h tabix.h khash.h ksort.h kstring.h
53 | main.o:tabix.h kstring.h bgzf.h
54 | bgzip.o:bgzf.h
55 | bedidx.o:kseq.h khash.h
56 | 
57 | tabix.pdf:tabix.tex
58 | 		pdflatex tabix.tex
59 | 
60 | cleanlocal:
61 | 		rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a tabix.aux tabix.log tabix.pdf *.class libtabix.*.dylib libtabix.so*
62 | 
63 | clean:cleanlocal-recur
64 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/kstring.h:
--------------------------------------------------------------------------------
 1 | #ifndef KSTRING_H
 2 | #define KSTRING_H
 3 | 
 4 | #include <stdlib.h>
 5 | #include <string.h>
 6 | #include <stdint.h>
 7 | 
 8 | #ifndef kroundup32
 9 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
10 | #endif
11 | 
12 | #ifndef KSTRING_T
13 | #define KSTRING_T kstring_t
14 | typedef struct __kstring_t {
15 | 	size_t l, m;
16 | 	char *s;
17 | } kstring_t;
18 | #endif
19 | 
20 | int ksprintf(kstring_t *s, const char *fmt, ...);
21 | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
22 | 
23 | // calculate the auxiliary array, allocated by calloc()
24 | int *ksBM_prep(const uint8_t *pat, int m);
25 | 
26 | /* Search pat in str and return the list of matches. The size of the
27 |  * list is returned as n_matches. _prep is the array returned by
28 |  * ksBM_prep(). If it is a NULL pointer, ksBM_prep() will be called. */
29 | int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches);
30 | 
31 | static inline int kputsn(const char *p, int l, kstring_t *s)
32 | {
33 | 	if (s->l + l + 1 >= s->m) {
34 | 		s->m = s->l + l + 2;
35 | 		kroundup32(s->m);
36 | 		s->s = (char*)realloc(s->s, s->m);
37 | 	}
38 | 	strncpy(s->s + s->l, p, l);
39 | 	s->l += l;
40 | 	s->s[s->l] = 0;
41 | 	return l;
42 | }
43 | 
44 | static inline int kputs(const char *p, kstring_t *s)
45 | {
46 | 	return kputsn(p, strlen(p), s);
47 | }
48 | 
49 | static inline int kputc(int c, kstring_t *s)
50 | {
51 | 	if (s->l + 1 >= s->m) {
52 | 		s->m = s->l + 2;
53 | 		kroundup32(s->m);
54 | 		s->s = (char*)realloc(s->s, s->m);
55 | 	}
56 | 	s->s[s->l++] = c;
57 | 	s->s[s->l] = 0;
58 | 	return c;
59 | }
60 | 
61 | static inline int *ksplit(kstring_t *s, int delimiter, int *n)
62 | {
63 | 	int max = 0, *offsets = 0;
64 | 	*n = ksplit_core(s->s, delimiter, &max, &offsets);
65 | 	return offsets;
66 | }
67 | 
68 | #endif
69 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/knetfile.h:
--------------------------------------------------------------------------------
 1 | #ifndef KNETFILE_H
 2 | #define KNETFILE_H
 3 | 
 4 | #include <stdint.h>
 5 | #include <fcntl.h>
 6 | 
 7 | #ifndef _WIN32
 8 | #define netread(fd, ptr, len) read(fd, ptr, len)
 9 | #define netwrite(fd, ptr, len) write(fd, ptr, len)
10 | #define netclose(fd) close(fd)
11 | #else
12 | #include <winsock2.h>
13 | #define netread(fd, ptr, len) recv(fd, ptr, len, 0)
14 | #define netwrite(fd, ptr, len) send(fd, ptr, len, 0)
15 | #define netclose(fd) closesocket(fd)
16 | #endif
17 | 
18 | // FIXME: currently I/O is unbuffered
19 | 
20 | #define KNF_TYPE_LOCAL 1
21 | #define KNF_TYPE_FTP   2
22 | #define KNF_TYPE_HTTP  3
23 | 
/* State of one knetFile stream, as returned by knet_open()/knet_dopen().
 * NOTE(review): `type` presumably holds one of the KNF_TYPE_* constants
 * defined above -- confirm against knetfile.c. */
typedef struct knetFile_s {
	// fd is the value knet_fileno() yields
	int type, fd;
	// offset is the value knet_tell() yields
	int64_t offset;
	char *host, *port;

	// the following are for FTP only
	// (is_ready is the flag the knet_read()/knet_seek() notes below refer to)
	int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready;
	char *response, *retr, *size_cmd;
	int64_t seek_offset; // for lazy seek
    int64_t file_size;

	// the following are for HTTP only
	char *path, *http_host;
} knetFile;
38 | 
39 | #define knet_tell(fp) ((fp)->offset)
40 | #define knet_fileno(fp) ((fp)->fd)
41 | 
42 | #ifdef __cplusplus
43 | extern "C" {
44 | #endif
45 | 
46 | #ifdef _WIN32
47 | 	int knet_win32_init();
48 | 	void knet_win32_destroy();
49 | #endif
50 | 
51 | 	knetFile *knet_open(const char *fn, const char *mode);
52 | 
53 | 	/* 
54 | 	   This only works with local files.
55 | 	 */
56 | 	knetFile *knet_dopen(int fd, const char *mode);
57 | 
58 | 	/*
59 | 	  If ->is_ready==0, this routine updates ->fd; otherwise, it simply
60 | 	  reads from ->fd.
61 | 	 */
62 | 	off_t knet_read(knetFile *fp, void *buf, off_t len);
63 | 
64 | 	/*
65 | 	  This routine only sets ->offset and ->is_ready=0. It does not
66 | 	  communicate with the FTP server.
67 | 	 */
68 | 	off_t knet_seek(knetFile *fp, int64_t off, int whence);
69 | 	int knet_close(knetFile *fp);
70 | 
71 | #ifdef __cplusplus
72 | }
73 | #endif
74 | 
75 | #endif
76 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/python/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # The MIT License
 4 | #
 5 | # Copyright (c) 2011 Seoul National University.
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining
 8 | # a copy of this software and associated documentation files (the
 9 | # "Software"), to deal in the Software without restriction, including
10 | # without limitation the rights to use, copy, modify, merge, publish,
11 | # distribute, sublicense, and/or sell copies of the Software, and to
12 | # permit persons to whom the Software is furnished to do so, subject to
13 | # the following conditions:
14 | #
15 | # The above copyright notice and this permission notice shall be
16 | # included in all copies or substantial portions of the Software.
17 | #
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
22 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
23 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
24 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 | # SOFTWARE.
26 | #
27 | # Contact: Hyeshik Chang <hyeshik@snu.ac.kr>
28 | 
29 | from distutils.core import setup, Extension
30 | 
# Set to True to build with knetfile support (adds the _USE_KNETFILE define).
USE_KNETFILE = False

# C sources from the parent directory compiled into the extension together
# with tabixmodule.c.
TABIX_SOURCE_FILES = [
    '../bgzf.c',
    '../bgzip.c',
    '../index.c',
    '../knetfile.c',
    '../kstring.c',
]

# Preprocessor defines; _USE_KNETFILE is added only when requested above.
define_options = [('_FILE_OFFSET_BITS', 64)] + (
    [('_USE_KNETFILE', 1)] if USE_KNETFILE else [])

ext_modules = [
    Extension(
        "tabix",
        ["tabixmodule.c"] + TABIX_SOURCE_FILES,
        include_dirs=['..'],
        libraries=['z'],
        define_macros=define_options,
    ),
]

setup(
    name='tabix',
    version='1.0',
    description='Python interface to tabix, a generic indexer '
                'for TAB-delimited genome position files',
    author='Hyeshik Chang',
    author_email='hyeshik@snu.ac.kr',
    license='MIT',
    ext_modules=ext_modules,
)
56 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/tabix.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Author: Heng Li and Aaron Quinlan
 4 | # License: MIT/X11
 5 | 
 6 | import sys
 7 | from ctypes import *
 8 | from ctypes.util import find_library
 9 | import glob, platform
10 | 
def load_shared_library(lib, _path='.', ver='*'):
    """Search for and load a shared library.

    The system search path is tried first (ctypes.util.find_library). If
    that finds nothing, the directory `_path` is searched for
    lib<lib><ver>.dylib on Darwin or lib<lib>.so<ver> on Linux, where
    `ver` may be a glob pattern.

    Returns the loaded CDLL object, or None when the library cannot be
    located: no candidate, more than one candidate, or a platform other
    than Darwin/Linux with nothing on the system path.
    """
    path = find_library(lib)
    if path is None:
        # Not on the system path: look in the custom directory instead.
        system = platform.system()
        if system == 'Darwin':
            suffix = ver + '.dylib'
        elif system == 'Linux':
            suffix = '.so' + ver
        else:
            # No naming scheme is known for this platform. Previously the
            # suffix was left unbound here and the glob below raised.
            return None
        candidates = glob.glob(_path + '/lib' + lib + suffix)
        if len(candidates) != 1:
            return None
        path = candidates[0]
    # CDLL() loads the library itself; no separate LoadLibrary() is needed.
    return CDLL(path)
27 | 
def tabix_init():
    """Load the tabix shared library and declare the ctypes signatures of
    the functions this script calls.

    Returns the configured library object, or None when the library could
    not be loaded.
    """
    lib = load_shared_library('tabix')
    if (lib == None): return None

    unset = object()  # marks an attribute that is left at its ctypes default
    # (symbol, argtypes, restype); on Mac OS X 10.6 these explicit
    # declarations are required.
    prototypes = (
        ('ti_read', [c_void_p, c_void_p, c_void_p], c_char_p),
        ('ti_open', unset, c_void_p),
        ('ti_querys', [c_void_p, c_char_p], c_void_p),
        ('ti_query', [c_void_p, c_char_p, c_int, c_int], c_void_p),
        ('ti_iter_destroy', [c_void_p], unset),
        ('ti_close', [c_void_p], unset),
    )
    for symbol, argtypes, restype in prototypes:
        func = getattr(lib, symbol)
        if argtypes is not unset:
            func.argtypes = argtypes
        if restype is not unset:
            func.restype = restype
    # FIXME: explicit declarations for APIs not used in this script
    return lib
46 | 
47 | # OOP interface
class Tabix:
    """Object-oriented wrapper around one opened tabix file.

    Attributes:
      tabix -- the loaded shared library, or None if it is unavailable
      fp    -- the handle returned by ti_open(), or None if nothing is open
    """

    def __init__(self, fn, fnidx=0):
        """Load the library and open `fn`, optionally with the index `fnidx`.

        Failures are reported on stderr rather than raised. The object is
        then inert and fetch() yields nothing.
        """
        # Set both attributes first so __del__ and fetch() stay safe even
        # when initialisation stops early or raises.
        self.tabix = None
        self.fp = None
        self.tabix = tabix_init()
        if self.tabix is None:
            sys.stderr.write("[Tabix] Please make sure the shared library is compiled and available.\n")
            return
        self.fp = self.tabix.ti_open(fn, fnidx)
        if self.fp is None:
            sys.stderr.write("[Tabix] Failed to open the data file or its index.\n")

    def __del__(self):
        # getattr() guards against a partially constructed instance.
        tabix = getattr(self, 'tabix', None)
        fp = getattr(self, 'fp', None)
        if tabix is not None and fp is not None:
            tabix.ti_close(fp)

    def fetch(self, chr, start=-1, end=-1):
        """Generator function that will yield each interval
        within the requested range from the requested file.

        With start < 0, `chr` is a region string such as
        "chr2:1,000-2,000" or "chr2"; otherwise `chr` must be a sequence
        name and start/end give the range.
        """
        if self.tabix is None or self.fp is None:
            return
        if start < 0:
            iter = self.tabix.ti_querys(self.fp, chr)
        else:
            iter = self.tabix.ti_query(self.fp, chr, start, end)
        if iter is None:
            sys.stderr.write("[Tabix] Malformatted query or wrong sequence name.\n")
            return
        try:
            while True:
                s = self.tabix.ti_read(self.fp, iter, 0)
                if s is None:
                    break
                yield s
        finally:
            # Also runs when the consumer stops early (generator closed or
            # garbage-collected), so the native iterator is never leaked.
            self.tabix.ti_iter_destroy(iter)
74 | 
75 | # command-line interface
def main():
    """Command-line entry point: print every record of <in.gz> that falls
    in region <reg>, one per line.
    """
    if (len(sys.argv) < 3):
        sys.stderr.write("Usage: tabix.py <in.gz> <reg>\n")
        sys.exit(1)

    # report the features in the requested interval
    tabix = Tabix(sys.argv[1])
    for line in tabix.fetch(sys.argv[2]):
        # The parenthesised form prints the same text under Python 2 and,
        # unlike the bare print statement, is not a SyntaxError on Python 3.
        # NOTE(review): the ctypes calls still receive str arguments, which
        # presumably need encoding to bytes on Python 3 -- confirm before
        # relying on Python 3.
        print(line)

if __name__ == '__main__':
    main()
88 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/NEWS:
--------------------------------------------------------------------------------
  1 | Release 0.2.4 (10 April, 2011)
  2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  3 | 
  4 | Notable changes:
  5 | 
  6 |  * Give an error if the index file is older than the data file.
  7 | 
  8 |  * Avoid a segfault given flawed input.
  9 | 
 10 |  * Added Python APIs contributed by Hyeshik Chang. The new APIs do not bind to
 11 |    the dynamic library and are reported to be faster. Pysam also comes with a
 12 |    tabix binding.
 13 | 
 14 |  * Added option "-r" for efficient header replacement.
 15 | 
 16 |  * Added BED support.
 17 | 
 18 |  * Synchronized the BGZF library between tabix and samtools.
 19 | 
 20 | (0.2.4: 10 April 2011, r949)
 21 | 
 22 | 
 23 | 
 24 | Beta Release 0.2.3 (8 December, 2010)
 25 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 26 | 
 27 | Notable changes:
 28 | 
 29 |  * Fixed a minor bug where the first record in a headerless file may be
 30 |    missed.
 31 | 
 32 |  * Added an option to print header lines.
 33 | 
 34 |  * Fixed a rare bug which may occasionally happen when retrieving data
 35 |    from a region without any records.
 36 | 
 37 |  * Enhanced error reporting.
 38 | 
 39 |  * Fixed a bug in bgzip which may delete the original file even if not
 40 |    intended.
 41 | 
 42 | (0.2.3: 8 December 2010, r876)
 43 | 
 44 | 
 45 | 
 46 | Beta Release 0.2.2 (28 June, 2010)
 47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 48 | 
 49 | Notable changes:
 50 | 
 51 |  * Dropped the VCF3 support. Added VCF4 support.
 52 | 
 53 |  * Avoided the function name collision with samtools.
 54 | 
 55 | (0.2.2: 28 June 2010, r603)
 56 | 
 57 | 
 58 | 
 59 | Beta Release 0.2.1 (3 June, 2010)
 60 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 61 | 
 62 | Notable changes:
 63 | 
 64 |  * Allow shared library to be compiled. Added python binding to the
 65 |    shared library.
 66 | 
 67 | (0.2.1: 3 June 2010, r582)
 68 | 
 69 | 
 70 | 
 71 | Beta Release 0.2.0 (11 May, 2010)
 72 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 73 | 
 74 | Notable changes:
 75 | 
 76 |  * Fixed an issue for random access given an interval end larger than
 77 |    2^29.
 78 | 
 79 |  * Updated the Java binding.
 80 | 
 81 |  * Added a Perl module using XS.
 82 | 
 83 |  * Improved the C APIs.
 84 | 
 85 | (0.2.0: 11 May 2010, r574)
 86 | 
 87 | 
 88 | 
 89 | Beta Release 0.1.6 (9 May, 2010)
 90 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 91 | 
 92 | Notable changes:
 93 | 
 94 |  * Improved backward compatibility. Release 0.1.5 does not work with the
 95 |    buggy index file generated by 0.1.2.
 96 | 
 97 |  * Fixed a bug in building linear index. The bug does not affect the
 98 |    results, only affects efficiency in rare cases.
 99 | 
100 |  * Reduced the number of seek calls given an index generated by old
101 |    version of tabix.
102 | 
103 |  * Added new APIs for retrieving data via an iterator. The old callback
104 |    APIs are not changed, although internally it uses iterator to
105 |    retrieve data.
106 | 
107 | I am trying to freeze tabix. I just hope I am not committing new bugs.
108 | 
109 | (0.1.6: 9 May 2010, r563)
110 | 
111 | 
112 | 
113 | Beta Release 0.1.5 (5 May, 2010)
114 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
115 | 
116 | Notable changes:
117 | 
118 |  * Clarified that tabix is released under MIT/X11.
119 | 
120 |  * Improved the robustness of indexing and retrieval.
121 | 
122 |  * Reduced the number of seek calls when the specified region starts
123 |    from a 16kb block with no data. The index format is the same, but the
124 |    content is changed a little.
125 | 
126 | (0.1.5: 5 May 2010, r560)
127 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/python/test.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # The MIT License
 3 | #
 4 | # Copyright (c) 2011 Seoul National University.
 5 | #
 6 | # Permission is hereby granted, free of charge, to any person obtaining
 7 | # a copy of this software and associated documentation files (the
 8 | # "Software"), to deal in the Software without restriction, including
 9 | # without limitation the rights to use, copy, modify, merge, publish,
10 | # distribute, sublicense, and/or sell copies of the Software, and to
11 | # permit persons to whom the Software is furnished to do so, subject to
12 | # the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be
15 | # included in all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | # SOFTWARE.
25 | #
26 | # Contact: Hyeshik Chang <hyeshik@snu.ac.kr>
27 | 
28 | import unittest
29 | import random
30 | import gzip
31 | import tabix
32 | 
33 | EXAMPLEFILE = '../example.gtf.gz'
34 | 
def load_example_regions(path):
    """Read the gzip-compressed, TAB-delimited file at `path`.

    Returns a list with one (seqid, begin, end, first_seven_fields) tuple
    per line, where seqid is column 1 and begin/end are columns 4 and 5
    converted to int.
    """
    alldata = []
    # Open the file named by the argument (it used to be ignored in favour
    # of the module-level constant) and close it when done.
    with gzip.GzipFile(path) as stream:
        for line in stream:
            # Strip only the line terminator, so a final line without one
            # does not lose a real character.
            fields = line.decode('ascii').rstrip('\n').split('\t')
            seqid = fields[0]
            begin = int(fields[3])
            end = int(fields[4])
            alldata.append((seqid, begin, end, fields[:7]))

    return alldata
45 | 
def does_overlap(A, B, C, D):
    """Return True when D lies within [A, B] or B lies within [C, D]."""
    if A <= D <= B:
        return True
    return C <= B <= D
48 | 
49 | def sample_test_dataset(regions, ntests):
50 |     seqids = [seqid for seqid, _, _, _ in regions]
51 |     lowerbound = max(0, min(begin for _, begin, _, _ in regions) - 1000)
52 |     upperbound = max(end for _, _, end, _ in regions) + 1000
53 | 
54 |     tests = []
55 |     for i in range(ntests):
56 |         seqid = random.choice(seqids)
57 |         low = random.randrange(lowerbound, upperbound)
58 |         high = random.randrange(low, upperbound)
59 | 
60 |         # for 1-based both-end inclusive intervals
61 |         matches = [info for seq, begin, end, info in regions
62 |                    if seqid == seq and does_overlap(begin, end, low, high)]
63 | 
64 |         tests.append((seqid, low, high, matches))
65 | 
66 |     return tests
67 | 
68 | def tbresult2excerpt(tbmatches):
69 |     return [fields[:7] for fields in tbmatches]
70 | 
71 | class TabixTest(unittest.TestCase):  # checks tabix query results against a brute-force overlap scan
72 |     regions = load_example_regions(EXAMPLEFILE)  # parsed once, when the class body executes
73 |     testset = sample_test_dataset(regions, 500)  # 500 random queries shared by both tests
74 | 
75 |     def setUp(self):
76 |         self.tb = tabix.Tabix(EXAMPLEFILE)  # opened anew before each test method
77 | 
78 |     def testQuery(self):  # exercises query(seqid, low, high)
79 |         for seqid, low, high, matches in self.testset:
80 |             tbresult = tbresult2excerpt(self.tb.query(seqid, low, high))
81 |             self.assertEqual(tbresult, matches)
82 | 
83 |     def testQueryS(self):  # exercises querys() with a 'seqid:low-high' region string
84 |         for seqid, low, high, matches in self.testset:
85 |             tbresult = tbresult2excerpt(self.tb.querys('%s:%d-%d' %
86 |                                                        (seqid, low, high)))
87 |             self.assertEqual(tbresult, matches)
88 | 
89 | 
90 | if __name__ == '__main__':  # run the suite only when this file is executed as a script
91 |     unittest.main()
92 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # fathmm-MKL
 2 | 
 3 | Predicting the functional consequences of both coding and non-coding single nucleotide variants (see http://fathmm.biocompute.org.uk).
 4 | 
 5 | For more information, please refer to the following publication:
 6 | 
 7 | Shihab HA, Rogers MF, Gough J, Mort M, Cooper DN, Day INM, Gaunt TR, Campbell C (2014). An Integrative Approach to Predicting the Functional Consequences of Non-coding and Coding Sequence Variation. *Bioinformatics* (In Press)
 8 | 
 9 | ## General Requirements
10 | 
11 | You will need the following packages installed on your system:
12 | 
13 | * ```tabix``` (included as part of this repository)
14 | * ```Python``` (tested with Python 2.7)
15 | 
16 | ## Running the Software
17 | 
18 | * Clone this repository
19 | 
20 | ```
21 | git clone https://github.com/HAShihab/fathmm-MKL
22 | cd fathmm-MKL/
23 | ```
24 | 
25 | * Download our pre-computed database:
26 | 
27 | ```
28 | wget http://fathmm.biocompute.org.uk/database/fathmm-MKL_Current.tab.gz
29 | ```
30 | 
31 | **Note:** this database contains one-based coordinates (positions).  For true bed format (i.e. zero-based coordinates), please download the following database: http://fathmm.biocompute.org.uk/database/fathmm-MKL_Current_zerobased.tab.gz
32 | 
33 | | Datafile | md5sum |
34 | | -------- | ------- |
35 | | http://fathmm.biocompute.org.uk/database/fathmm-MKL_Current.tab.gz | b8f4dd120586a34c82d5cc87cfe2a4ca |
36 | | http://fathmm.biocompute.org.uk/database/fathmm-MKL_Current_zerobased.tab.gz |  c3213196a2471ade3742bd8f8a96d4cc |
37 | 
38 | * Add `tabix` to your PATH and create the database index file (*please be patient, this may take a while!*):
39 | 
40 | ```
41 | export PATH=./tabix-0.2.6/:$PATH
42 | tabix -f -p bed fathmm-MKL_Current.tab.gz
43 | ```
44 | 
45 | * Run our script using the following command:
46 | 
47 | ```
48 | python fathmm-MKL.py <fin> <fo> <db>
49 | ```
50 | 
51 | In the above command, ```<fin>``` is the list of mutations to process (see ```test.txt``` for an example), ```<fo>``` is where the predictions are written and ```<db>``` is the pre-computed database downloaded in *Step 1*.
52 | 
53 | **Note:** the database index file must be created before running our script.  If this has not been created, your output will contain "No Prediction Found" for all variants!
54 | 
55 | ## Prediction Interpretation
56 | 
57 | Predictions are given as *p*-values in the range [0, 1]: values above 0.5 are predicted to be deleterious, while those below 0.5 are predicted to be neutral or benign. *P*-values close to the extremes (0 or 1) are the highest-confidence predictions that yield the highest accuracy.
58 | 
59 | We use distinct predictors for positions in coding regions (positions within coding-sequence exons) and in non-coding regions (positions in intergenic regions, introns or non-coding genes). The coding predictor is based on 10 groups of features, labeled A-J; the non-coding predictor uses a subset of 4 of these feature groups, A-D (see our related publication for details on the groups and their sources).
60 | 
61 | **Note:** predictions based on a subset of features may not be as accurate as those based on complete feature sets. In particular, predictions that are missing the conservation score features (groups A and E) will tend to be less accurate than other predictions. To aid in interpreting these predictions, we provide a list of the feature groups that contributed to each prediction. 
62 | 
63 | ## Genome Build
64 | 
65 | FATHMM-MKL predictions are based on the GRCh37/hg19 genome build.
66 | 
67 | ## Contributing:
68 | 
69 | We welcome any comments and/or suggestions that you may have regarding our software - please send an email to fathmm@biocompute.org.uk
70 | 
71 | 


--------------------------------------------------------------------------------
/fathmm-MKL.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import os
  4 | import sys
  5 | import argparse
  6 | import subprocess
  7 | 
  8 | #
  9 | if __name__ == '__main__':
 10 |     '''
 11 |     fathmm-MKL.py: Predict the Functional Consequences of Single Nucleotide Variants (SNVs)
 12 |     '''
 13 |     
 14 |     # fetch argument(s)
 15 |     parser = argparse.ArgumentParser(
 16 |                                       description = 'Predict the Functional Consequences of Single Nucleotide Variants (SNVs)',
 17 |                                       add_help = False
 18 |                                     )
 19 |     parser.add_argument(
 20 |                          "-h",
 21 |                          "--help",
 22 |                          action = "help",
 23 |                          help = argparse.SUPPRESS
 24 |                        )
 25 |     
 26 |     group = \
 27 |         parser.add_argument_group("Required")
 28 |     group.add_argument(
 29 |                         'fi',
 30 |                         metavar = '<F1>',
 31 |                         type = argparse.FileType("r"),
 32 |                         help = 'the mutation data to process'
 33 |                       )
 34 |     group.add_argument(
 35 |                         'fo',
 36 |                         metavar = '<F2>',
 37 |                         type = argparse.FileType("w"),
 38 |                         help = 'where predictions are written'
 39 |                       )
 40 |     
 41 |     group.add_argument(
 42 |                         'db',
 43 |                         metavar = '<db>',
 44 |                         type = argparse.FileType("r"),
 45 |                         help = 'precomputed database of fathmm-MKL predictions'
 46 |                       )
 47 |     
 48 |     Args = parser.parse_args()
 49 |     
 50 |     #
 51 |     
 52 |     Args.fo.write("\t".join([ 
 53 |                              "# Chromosome",
 54 |                              "Position",
 55 |                              "Ref. Base",
 56 |                              "Mutant Base",
 57 |                              "Non-Coding Score",
 58 |                              "Non-Coding Groups",
 59 |                              "Coding Score",
 60 |                              "Coding Groups",
 61 |                              "Warning"
 62 |                            ]) + "\n")
 63 | 
 64 |     for query in Args.fi:
 65 |         if not query.strip() or query.startswith("#"):
 66 |             continue
 67 |         query = query.strip().upper().split(",")
 68 |         Pred  = [ '', '', '', '', "No Prediction Found" ]
 69 |         
 70 |         
 71 |         # approve query ...
 72 |         try:
 73 |             assert query.__len__() == 4     # required data present in query
 74 |             
 75 |             int(query[1])                   # is position numeric
 76 |             assert query[2] in \
 77 |                 [ "A", "C", "G", "T" ]      # expected base
 78 |             assert query[3] in \
 79 |                 [ "A", "C", "G", "T" ]      # expected base
 80 |         except:
 81 |             Args.fo.write("\t".join([ '', '', '', '', '', '', '', '', "Error: Unexpected Format '" + ",".join(query) + "'" ] ) + "\n"); continue
 82 |         
 83 |         
 84 |         # fetch prediction ...
 85 |         proc      = subprocess.Popen([ "tabix " + Args.db.name + " " + query[0] + ":" + str(int(query[1]) + 1) + "-" + str(int(query[1]) + 1) ], stdout=subprocess.PIPE, shell=True)
 86 |         data, err = proc.communicate()
 87 |         if err:
 88 |             Pred[-1] = "Error: 'tabix' command"; continue
 89 |         if data:
 90 |             for record in data.decode().split("\n"):
 91 |                 if not record:
 92 |                     continue
 93 |                 record = record.strip().split("\t")
 94 |                 
 95 |                 if not record[0] == query[0]:
 96 |                     Pred[-1] = "Error: Unexpected Chromosome"; break
 97 |                 if not record[1] == query[1]:
 98 |                     Pred[-1] = "Error: Unexpected Position";   break
 99 |                 if not record[3] == query[2]:
100 |                     Pred[-1] = "Warning: Inconsistent Base (Expecting '" + record[3] + "')";   break
101 |                 if record[4] == query[3]:
102 |                     Pred = record[5:] + [ '' ]
103 |                     break
104 |         
105 |         Args.fo.write("\t".join( query + Pred ) + "\n")
106 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/tabix.1:
--------------------------------------------------------------------------------
  1 | .TH tabix 1 "11 May 2010" "tabix-0.2.0" "Bioinformatics tools"
  2 | .SH NAME
  3 | .PP
  4 | bgzip - Block compression/decompression utility
  5 | .PP
  6 | tabix - Generic indexer for TAB-delimited genome position files
  7 | .SH SYNOPSIS
  8 | .PP
  9 | .B bgzip
 10 | .RB [ \-cdhB ]
 11 | .RB [ \-b
 12 | .IR virtualOffset ]
 13 | .RB [ \-s
 14 | .IR size ]
 15 | .RI [ file ]
 16 | .PP
 17 | .B tabix
 18 | .RB [ \-0lf ]
 19 | .RB [ \-p
 20 | .IR gff|bed|sam|vcf ]
 21 | .RB [ \-s
 22 | .IR seqCol ]
 23 | .RB [ \-b
 24 | .IR begCol ]
 25 | .RB [ \-e
 26 | .IR endCol ]
 27 | .RB [ \-S
 28 | .IR lineSkip ]
 29 | .RB [ \-c
 30 | .IR metaChar ]
 31 | .I in.tab.bgz
 32 | .RI [ "region1 " [ "region2 " [ ... "]]]"
 33 | 
 34 | .SH DESCRIPTION
 35 | .PP
 36 | Tabix indexes a TAB-delimited genome position file
 37 | .I in.tab.bgz
 38 | and creates an index file
 39 | .I in.tab.bgz.tbi
 40 | when
 41 | .I region
 42 | is absent from the command-line. The input data file must be position
 43 | sorted and compressed by
 44 | .B bgzip
 45 | which has a
 46 | .BR gzip (1)
 47 | like interface. After indexing, tabix is able to quickly retrieve data
 48 | lines overlapping
 49 | .I regions
 50 | specified in the format "chr:beginPos-endPos". Fast data retrieval also
 51 | works over network if URI is given as a file name and in this case the
 52 | index file will be downloaded if it is not present locally.
 53 | 
 54 | .SH OPTIONS OF TABIX
 55 | .TP 10
 56 | .BI "-p " STR
 57 | Input format for indexing. Valid values are: gff, bed, sam, vcf and
 58 | psltab. This option should not be applied together with any of
 59 | .BR \-s ", " \-b ", " \-e ", " \-c " and " \-0 ;
 60 | it is not used for data retrieval because this setting is stored in
 61 | the index file. [gff]
 62 | .TP
 63 | .BI "-s " INT
 64 | Column of sequence name. Option
 65 | .BR \-s ", " \-b ", " \-e ", " \-S ", " \-c " and " \-0
 66 | are all stored in the index file and thus not used in data retrieval. [1]
 67 | .TP
 68 | .BI "-b " INT
 69 | Column of start chromosomal position. [4]
 70 | .TP
 71 | .BI "-e " INT
 72 | Column of end chromosomal position. The end column can be the same as the
 73 | start column. [5]
 74 | .TP
 75 | .BI "-S " INT
 76 | Skip first INT lines in the data file. [0]
 77 | .TP
 78 | .BI "-c " CHAR
 79 | Skip lines started with character CHAR. [#]
 80 | .TP
 81 | .B -0
 82 | Specify that the position in the data file is 0-based (e.g. UCSC files)
 83 | rather than 1-based.
 84 | .TP
 85 | .B -h
 86 | Print the header/meta lines.
 87 | .TP
 88 | .B -B
 89 | The second argument is a BED file. When this option is in use, the input
 90 | file may not be sorted or indexed. The entire input will be read sequentially. Nonetheless,
 91 | with this option, the format of the input must be specified correctly on the command line.
 92 | .TP
 93 | .B -f
 94 | Force to overwrite the index file if it is present.
 95 | .TP
 96 | .B -l
 97 | List the sequence names stored in the index file.
 98 | .RE
 99 | 
100 | .SH EXAMPLE
101 | (grep ^"#" in.gff; grep -v ^"#" in.gff | sort -k1,1 -k4,4n) | bgzip > sorted.gff.gz;
102 | 
103 | tabix -p gff sorted.gff.gz;
104 | 
105 | tabix sorted.gff.gz chr1:10,000,000-20,000,000;
106 | 
107 | .SH NOTES
108 | It is straightforward to achieve overlap queries using the standard
109 | B-tree index (with or without binning) implemented in all SQL databases,
110 | or the R-tree index in PostgreSQL and Oracle. But there are still many
111 | reasons to use tabix. Firstly, tabix directly works with a lot of widely
112 | used TAB-delimited formats such as GFF/GTF and BED. We do not need to
113 | design database schema or specialized binary formats. Data do not need
114 | to be duplicated in different formats, either. Secondly, tabix works on
115 | compressed data files while most SQL databases do not. The GenCode
116 | annotation GTF can be compressed down to 4%.  Thirdly, tabix is
117 | fast. The same indexing algorithm is known to work efficiently for an
118 | alignment with a few billion short reads. SQL databases probably cannot
119 | easily handle data at this scale. Last but not least, tabix supports
120 | remote data retrieval. One can put the data file and the index at an FTP
121 | or HTTP server, and other users or even web services will be able to get
122 | a slice without downloading the entire file.
123 | 
124 | .SH AUTHOR
125 | .PP
126 | Tabix was written by Heng Li. The BGZF library was originally
127 | implemented by Bob Handsaker and modified by Heng Li for remote file
128 | access and in-memory caching.
129 | 
130 | .SH SEE ALSO
131 | .PP
132 | .BR samtools (1)
133 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/bedidx.c:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h>
  2 | #include <stdint.h>
  3 | #include <string.h>
  4 | #include <stdio.h>
  5 | #include <zlib.h>
  6 | 
  7 | #include "ksort.h"
  8 | KSORT_INIT_GENERIC(uint64_t)
  9 | 
 10 | #include "kseq.h"
 11 | KSTREAM_INIT(gzFile, gzread, 8192)
 12 | 
 13 | typedef struct {
 14 | 	int n, m; // n: regions stored in a; m: capacity of a while bed_read() fills it, then the idx length once bed_index() has run
 15 | 	uint64_t *a; // packed regions, beg<<32 | end
 16 | 	int *idx; // linear index built by bed_index_core()
 17 | } bed_reglist_t;
 18 | 
 19 | #include "khash.h"
 20 | KHASH_MAP_INIT_STR(reg, bed_reglist_t)
 21 | 
 22 | #define LIDX_SHIFT 13
 23 | 
 24 | typedef kh_reg_t reghash_t;
 25 | 
 26 | int *bed_index_core(int n, uint64_t *a, int *n_idx) // build the linear index: idx[w] = first region in a[] touching window w, or -1
 27 | {
 28 | 	int i, j, m, *idx;
 29 | 	m = *n_idx = 0; idx = 0;
 30 | 	for (i = 0; i < n; ++i) {
 31 | 		int beg, end;
 32 | 		beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; // window numbers of the packed beg (high word) and end (low word)
 33 | 		if (m < end + 1) {
 34 | 			int oldm = m;
 35 | 			m = end + 1;
 36 | 			kroundup32(m);
 37 | 			idx = realloc(idx, m * sizeof(int));
 38 | 			for (j = oldm; j < m; ++j) idx[j] = -1; // newly added windows start out empty
 39 | 		}
 40 | 		if (beg == end) {
 41 | 			if (idx[beg] < 0) idx[beg] = i;
 42 | 		} else {
 43 | 			for (j = beg; j <= end; ++j)
 44 | 				if (idx[j] < 0) idx[j] = i; // keep the earliest region for every window it spans
 45 | 		}
 46 | 		*n_idx = end + 1; // reported length follows the region processed last
 47 | 	}
 48 | 	return idx;
 49 | }
 50 | 
 51 | void bed_index(void *_h) // sort each chromosome's regions and rebuild its linear index
 52 | {
 53 | 	reghash_t *hash = (reghash_t*)_h;
 54 | 	khint_t it;
 55 | 	for (it = 0; it < kh_end(hash); ++it) {
 56 | 		bed_reglist_t *list;
 57 | 		if (!kh_exist(hash, it)) continue; // unused bucket
 58 | 		list = &kh_val(hash, it);
 59 | 		free(list->idx); // free(NULL) is a no-op, so no guard is needed
 60 | 		ks_introsort(uint64_t, list->n, list->a);
 61 | 		list->idx = bed_index_core(list->n, list->a, &list->m);
 62 | 	}
 63 | }
 64 | 
 65 | int bed_overlap_core(const bed_reglist_t *p, int beg, int end) // 1 if [beg,end) overlaps any stored region, else 0
 66 | {
 67 | 	int i, min_off;
 68 | 	if (p->n == 0) return 0;
 69 | 	min_off = (beg>>LIDX_SHIFT >= p->m)? p->idx[p->m-1] : p->idx[beg>>LIDX_SHIFT]; // idx holds p->m entries (set by bed_index()), not p->n
 70 | 	if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here
 71 | 		int n = beg>>LIDX_SHIFT;
 72 | 		if (n > p->m) n = p->m; // clamp to the index length before scanning backwards
 73 | 		for (i = n - 1; i >= 0; --i)
 74 | 			if (p->idx[i] >= 0) break;
 75 | 		min_off = i >= 0? p->idx[i] : 0; // nearest earlier non-empty window, else start from the first region
 76 | 	}
 77 | 	for (i = min_off; i < p->n; ++i) {
 78 | 		if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed
 79 | 		if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end)
 80 | 			return 1; // find the overlap; return
 81 | 	}
 82 | 	return 0;
 83 | }
 84 | 
 85 | int bed_overlap(const void *_h, const char *chr, int beg, int end) // look up chr, then test [beg,end) against its regions
 86 | {
 87 | 	const reghash_t *hash = (const reghash_t*)_h;
 88 | 	khint_t it;
 89 | 	if (hash == 0) return 0; // no table supplied
 90 | 	it = kh_get(reg, hash, chr);
 91 | 	return it == kh_end(hash)? 0 // chromosome absent from the table
 92 | 		: bed_overlap_core(&kh_val(hash, it), beg, end);
 93 | }
 94 | 
 95 | void *bed_read(const char *fn) // load a BED-like file ("-" = stdin) into an indexed region hash; returns 0 if it cannot be opened
 96 | {
 97 | 	reghash_t *h = kh_init(reg);
 98 | 	gzFile fp;
 99 | 	kstream_t *ks;
100 | 	int dret;
101 | 	kstring_t *str;
102 | 	// read the list
103 | 	fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
104 | 	if (fp == 0) { kh_destroy(reg, h); return 0; } // do not leak the empty table when the open fails
105 | 	str = calloc(1, sizeof(kstring_t));
106 | 	ks = ks_init(fp);
107 | 	while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name
108 | 		int beg = -1, end = -1;
109 | 		bed_reglist_t *p;
110 | 		khint_t k = kh_get(reg, h, str->s);
111 | 		if (k == kh_end(h)) { // absent from the hash table
112 | 			int ret;
113 | 			char *s = strdup(str->s); // the table keeps this copy; bed_destroy() frees it
114 | 			k = kh_put(reg, h, s, &ret);
115 | 			memset(&kh_val(h, k), 0, sizeof(bed_reglist_t));
116 | 		}
117 | 		p = &kh_val(h, k);
118 | 		if (dret != '\n') { // if the lines has other characters
119 | 			if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
120 | 				beg = atoi(str->s); // begin
121 | 				if (dret != '\n') {
122 | 					if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0]))
123 | 						end = atoi(str->s); // end
124 | 				}
125 | 			}
126 | 		}
127 | 		if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
128 | 		if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
129 | 		if (beg >= 0 && end > beg) { // lines without a usable interval are dropped
130 | 			if (p->n == p->m) { // grow geometrically, starting at 4 slots
131 | 				p->m = p->m? p->m<<1 : 4;
132 | 				p->a = realloc(p->a, p->m * 8);
133 | 			}
134 | 			p->a[p->n++] = (uint64_t)beg<<32 | end;
135 | 		}
136 | 	}
137 | 	ks_destroy(ks);
138 | 	gzclose(fp);
139 | 	free(str->s); free(str);
140 | 	bed_index(h); // sort and index; from here on p->m is the index length
141 | 	return h;
142 | }
143 | 
144 | void bed_destroy(void *_h) // release every region list, index and key, then the table itself
145 | {
146 | 	reghash_t *hash = (reghash_t*)_h;
147 | 	khint_t it;
148 | 	for (it = 0; it < kh_end(hash); ++it) {
149 | 		bed_reglist_t *list;
150 | 		if (!kh_exist(hash, it)) continue; // unused bucket
151 | 		list = &kh_val(hash, it);
152 | 		free(list->a); free(list->idx);
153 | 		free((char*)kh_key(hash, it)); // keys were strdup()'d in bed_read()
154 | 	}
155 | 	kh_destroy(reg, hash);
156 | }
157 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/kstring.c:
--------------------------------------------------------------------------------
  1 | #include <stdarg.h>
  2 | #include <stdio.h>
  3 | #include <ctype.h>
  4 | #include <string.h>
  5 | #include <stdint.h>
  6 | #include "kstring.h"
  7 | 
  8 | int ksprintf(kstring_t *s, const char *fmt, ...) // append printf-style output to s; returns the vsnprintf() length that was appended
  9 | {
 10 | 	va_list ap;
 11 | 	int l;
 12 | 	va_start(ap, fmt);
 13 | 	l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'.
 14 | 	va_end(ap);
 15 | 	if (l + 1 > s->m - s->l) { // output did not fit: grow the buffer and format again
 16 | 		s->m = s->l + l + 2;
 17 | 		kroundup32(s->m);
 18 | 		s->s = (char*)realloc(s->s, s->m);
 19 | 		va_start(ap, fmt);
 20 | 		l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap);
 21 | 		va_end(ap); // pairs with the second va_start only; it used to run unconditionally, i.e. twice when no regrow happened
 22 | 	}
 23 | 	s->l += l;
 24 | 	return l;
 25 | }
 26 | 
 27 | // s MUST BE a null terminated string; l = strlen(s)
 28 | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) // split s in place (delimiter 0 = whitespace); returns the field count
 29 | {
 30 | 	int i, n, max, last_char, last_start, *offsets, l;
 31 | 	n = 0; max = _max? *_max : 0; offsets = _offsets? *_offsets : 0; // NULL _offsets = count-only mode, s is left untouched
 32 | 	l = strlen(s);
 33 | 	
 34 | #define __ksplit_aux do {												\
 35 | 		if (_offsets) {													\
 36 | 			s[i] = 0;													\
 37 | 			if (n == max) {												\
 38 | 				max = max? max<<1 : 2;									\
 39 | 				offsets = (int*)realloc(offsets, sizeof(int) * max);	\
 40 | 			}															\
 41 | 			offsets[n++] = last_start;									\
 42 | 		} else ++n;														\
 43 | 	} while (0)
 44 | 
 45 | 	for (i = 0, last_char = last_start = 0; i <= l; ++i) {
 46 | 		if (delimiter == 0) {
 47 | 			if (isspace((unsigned char)s[i]) || s[i] == 0) { // ctype functions need unsigned char values
 48 | 				if (isgraph((unsigned char)last_char)) __ksplit_aux; // the end of a field
 49 | 			} else {
 50 | 				if (isspace((unsigned char)last_char) || last_char == 0) last_start = i;
 51 | 			}
 52 | 		} else {
 53 | 			if (s[i] == delimiter || s[i] == 0) {
 54 | 				if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
 55 | 			} else {
 56 | 				if (last_char == delimiter || last_char == 0) last_start = i;
 57 | 			}
 58 | 		}
 59 | 		last_char = s[i]; // reads 0 right after a field was terminated above
 60 | 	}
 61 | 	if (_max) *_max = max; if (_offsets) *_offsets = offsets; // hand the (possibly grown) array back only when asked for
 62 | 	return n;
 63 | }
 64 | 
 65 | /**********************
 66 |  * Boyer-Moore search *
 67 |  **********************/
 68 | 
 69 | // reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
 70 | int *ksBM_prep(const uint8_t *pat, int m) // build the Boyer-Moore tables for pat: m good-suffix ints followed by 256 bad-character ints
 71 | {
 72 | 	int i, *suff, *prep, *bmGs, *bmBc;
 73 | 	prep = (int*)calloc(m + 256, sizeof(int)); // m + 256 ints, not bytes: the old byte-sized buffer was overrun by the table writes below
 74 | 	bmGs = prep; bmBc = prep + m;
 75 | 	{ // preBmBc()
 76 | 		for (i = 0; i < 256; ++i) bmBc[i] = m;
 77 | 		for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
 78 | 	}
 79 | 	suff = calloc(m, sizeof(int));
 80 | 	{ // suffixes()
 81 | 		int f = 0, g;
 82 | 		suff[m - 1] = m;
 83 | 		g = m - 1;
 84 | 		for (i = m - 2; i >= 0; --i) {
 85 | 			if (i > g && suff[i + m - 1 - f] < i - g)
 86 | 				suff[i] = suff[i + m - 1 - f];
 87 | 			else {
 88 | 				if (i < g) g = i;
 89 | 				f = i;
 90 | 				while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
 91 | 				suff[i] = f - g;
 92 | 			}
 93 | 		}
 94 | 	}
 95 | 	{ // preBmGs()
 96 | 		int j = 0;
 97 | 		for (i = 0; i < m; ++i) bmGs[i] = m;
 98 | 		for (i = m - 1; i >= 0; --i)
 99 | 			if (suff[i] == i + 1)
100 | 				for (; j < m - 1 - i; ++j)
101 | 					if (bmGs[j] == m)
102 | 						bmGs[j] = m - 1 - i;
103 | 		for (i = 0; i <= m - 2; ++i)
104 | 			bmGs[m - 1 - suff[i]] = m - 1 - i;
105 | 	}
106 | 	free(suff); // scratch table only; the result is returned as one heap block
107 | 	return prep;
108 | }
109 | 
110 | int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches) // returns a heap array of match offsets of pat in str (count in *n_matches), or 0 if there is none
111 | {
112 | 	int i, j, *prep, *bmGs, *bmBc;
113 | 	int *matches = 0, mm = 0, nm = 0;
114 | 	prep = _prep? _prep : ksBM_prep(pat, m); // reuse the caller's tables when supplied, else build them here
115 | 	bmGs = prep; bmBc = prep + m;
116 | 	j = 0;
117 | 	while (j <= n - m) {
118 | 		for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); // compare right to left
119 | 		if (i < 0) { // every pattern character matched at offset j
120 | 			if (nm == mm) {
121 | 				mm = mm? mm<<1 : 1;
122 | 				matches = realloc(matches, mm * sizeof(int));
123 | 			}
124 | 			matches[nm++] = j;
125 | 			j += bmGs[0];
126 | 		} else {
127 | 			int max = bmBc[str[i+j]] - m + 1 + i;
128 | 			if (max < bmGs[i]) max = bmGs[i]; // advance by the larger of the two table shifts
129 | 			j += max;
130 | 		}
131 | 	}
132 | 	*n_matches = nm;
133 | 	if (_prep == 0) free(prep); // free only the tables built inside this call
134 | 	return matches;
135 | }
136 | 
137 | #ifdef KSTRING_MAIN
138 | #include <stdio.h>
139 | int main() // self-test driver, compiled only when KSTRING_MAIN is defined
140 | {
141 | 	kstring_t *s;
142 | 	int *fields, n, i;
143 | 	s = (kstring_t*)calloc(1, sizeof(kstring_t));
144 | 	// test ksprintf()
145 | 	ksprintf(s, " abcdefg:    %d ", 100);
146 | 	printf("'%s'\n", s->s);
147 | 	// test ksplit()
148 | 	fields = ksplit(s, 0, &n);
149 | 	for (i = 0; i < n; ++i)
150 | 		printf("field[%d] = '%s'\n", i, s->s + fields[i]);
151 | 	free(fields); free(s->s); free(s); // release the offsets array (presumably heap-allocated via ksplit_core()) and the buffer, not just the struct
152 | 
153 | 	{
154 | 		static char *str = "abcdefgcdg";
155 | 		static char *pat = "cd";
156 | 		int n, *matches;
157 | 		matches = ksBM_search((const uint8_t*)str, strlen(str), (const uint8_t*)pat, strlen(pat), 0, &n); // casts match the uint8_t* parameters
158 | 		printf("%d: \n", n);
159 | 		for (i = 0; i < n; ++i)
160 | 			printf("- %d\n", matches[i]);
161 | 		free(matches);
162 | 	}
163 | 	return 0;
164 | }
165 | #endif
166 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/tabix.h:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2009 Genome Research Ltd (GRL), 2010 Broad Institute
  4 | 
  5 |    Permission is hereby granted, free of charge, to any person obtaining
  6 |    a copy of this software and associated documentation files (the
  7 |    "Software"), to deal in the Software without restriction, including
  8 |    without limitation the rights to use, copy, modify, merge, publish,
  9 |    distribute, sublicense, and/or sell copies of the Software, and to
 10 |    permit persons to whom the Software is furnished to do so, subject to
 11 |    the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be
 14 |    included in all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |    SOFTWARE.
 24 | */
 25 | 
 26 | /* Contact: Heng Li <lh3@live.co.uk> */
 27 | 
 28 | #ifndef __TABIDX_H
 29 | #define __TABIDX_H
 30 | 
 31 | #include <stdint.h>
 32 | #include "kstring.h"
 33 | #include "bgzf.h"
 34 | 
 35 | #define TI_PRESET_GENERIC 0
 36 | #define TI_PRESET_SAM     1
 37 | #define TI_PRESET_VCF     2
 38 | 
 39 | #define TI_FLAG_UCSC      0x10000
 40 | 
 41 | typedef int (*ti_fetch_f)(int l, const char *s, void *data);
 42 | 
 43 | struct __ti_index_t;
 44 | typedef struct __ti_index_t ti_index_t;
 45 | 
 46 | struct __ti_iter_t;
 47 | typedef struct __ti_iter_t *ti_iter_t;
 48 | 
 49 | typedef struct { // handle returned by ti_open() and taken by the other high-level APIs
 50 | 	BGZF *fp; // BGZF stream (type from bgzf.h) -- presumably the open data file; confirm in the implementation
 51 | 	ti_index_t *idx; // index; NOTE(review): ti_lazy_index_load() suggests it may be loaded on demand -- confirm
 52 | 	char *fn, *fnidx; // presumably the data and index file names passed to ti_open() -- confirm
 53 | } tabix_t;
 54 | 
 55 | typedef struct { // indexing configuration; presets are declared below as ti_conf_*
 56 | 	int32_t preset; // presumably one of TI_PRESET_*, possibly OR-ed with TI_FLAG_UCSC -- confirm
 57 | 	int32_t sc, bc, ec; // seq col., beg col. and end col.
 58 | 	int32_t meta_char, line_skip; // presumably the comment-line leading character and the number of leading lines to skip -- confirm
 59 | } ti_conf_t;
 60 | 
 61 | typedef struct { // output argument of ti_get_intv()
 62 | 	int beg, end; // interval coordinates; the base/inclusiveness convention is not visible here -- confirm
 63 | 	char *ss, *se; // NOTE(review): look like start/end pointers of the sequence name within the line -- confirm
 64 | } ti_interval_t;
 65 | 
 66 | extern ti_conf_t ti_conf_gff, ti_conf_bed, ti_conf_psltbl, ti_conf_vcf, ti_conf_sam; // preset
 67 | 
 68 | #ifdef __cplusplus
 69 | extern "C" {
 70 | #endif
 71 | 
 72 | 	/*******************
 73 | 	 * High-level APIs *
 74 | 	 *******************/
 75 | 
 76 | 	tabix_t *ti_open(const char *fn, const char *fnidx);
 77 | 	int ti_lazy_index_load(tabix_t *t);
 78 | 	void ti_close(tabix_t *t);
 79 | 	ti_iter_t ti_query(tabix_t *t, const char *name, int beg, int end);
 80 | 	ti_iter_t ti_queryi(tabix_t *t, int tid, int beg, int end);
 81 | 	ti_iter_t ti_querys(tabix_t *t, const char *reg);
 82 | 	const char *ti_read(tabix_t *t, ti_iter_t iter, int *len);
 83 | 
 84 | 	/* Destroy the iterator */
 85 | 	void ti_iter_destroy(ti_iter_t iter);
 86 | 
 87 | 	/* Get the list of sequence names. Each "char*" pointer points to an
 88 | 	 * internal member of the index, so DO NOT modify the returned
 89 | 	 * pointer; otherwise the index will be corrupted. The returned
 90 | 	 * pointer should be freed by a single free() call by the routine
 91 | 	 * calling this function. The number of sequences is returned at *n. */
 92 | 	const char **ti_seqname(const ti_index_t *idx, int *n);
 93 | 
 94 | 	/******************
 95 | 	 * Low-level APIs *
 96 | 	 ******************/
 97 | 
 98 | 	/* Build the index for file <fn>. File <fn>.tbi will be generated
 99 | 	 * and overwrite the file of the same name. Return -1 on failure. */
100 | 	int ti_index_build(const char *fn, const ti_conf_t *conf);
101 | 
102 | 	/* Load the index from file <fn>.tbi. If <fn> is a URL and the index
103 | 	 * file is not in the working directory, <fn>.tbi will be
104 | 	 * downloaded. Return NULL on failure. */
105 | 	ti_index_t *ti_index_load(const char *fn);
106 | 
107 | 	ti_index_t *ti_index_load_local(const char *fnidx);
108 | 
109 | 	/* Destroy the index */
110 | 	void ti_index_destroy(ti_index_t *idx);
111 | 
112 | 	/* Parse a region like: chr2, chr2:100, chr2:100-200. Return -1 on failure. */
113 | 	int ti_parse_region(const ti_index_t *idx, const char *str, int *tid, int *begin, int *end);
114 | 
115 | 	int ti_get_tid(const ti_index_t *idx, const char *name);
116 | 
117 | 	/* Get the iterator pointing to the first record at the current file
118 | 	 * position. If the file is just opened, the iterator points to the
119 | 	 * first record in the file. */
120 | 	ti_iter_t ti_iter_first(void);
121 | 
122 | 	/* Get the iterator pointing to the first record in region tid:beg-end */
123 | 	ti_iter_t ti_iter_query(const ti_index_t *idx, int tid, int beg, int end);
124 | 
125 | 	/* Get the data line pointed by the iterator and iterate to the next record. */
126 | 	const char *ti_iter_read(BGZF *fp, ti_iter_t iter, int *len);
127 | 
128 | 	const ti_conf_t *ti_get_conf(ti_index_t *idx);
129 | 	int ti_get_intv(const ti_conf_t *conf, int len, char *line, ti_interval_t *intv);
130 | 
131 | 	/*******************
132 | 	 * Deprecated APIs *
133 | 	 *******************/
134 | 
135 | 	/* The callback version for random access */
136 | 	int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func);
137 | 
138 | 	/* Read one line. */
139 | 	int ti_readline(BGZF *fp, kstring_t *str);
140 | 
141 | #ifdef __cplusplus
142 | }
143 | #endif
144 | 
145 | #endif
146 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/tabix.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[10pt]{article}
  2 | \usepackage{color}
  3 | \definecolor{gray}{rgb}{0.7,0.7,0.7}
  4 | 
  5 | \setlength{\topmargin}{0.0cm}
  6 | \setlength{\textheight}{21.5cm}
  7 | \setlength{\oddsidemargin}{0cm} 
  8 | \setlength{\textwidth}{16.5cm}
  9 | \setlength{\columnsep}{0.6cm}
 10 | 
 11 | \begin{document}
 12 | 
 13 | \title{The Tabix index file format}
 14 | \author{Heng Li}
 15 | \date{}
 16 | 
 17 | \maketitle
 18 | 
 19 | \begin{center}
 20 | \begin{tabular}{|l|l|l|l|l|l|l|}
 21 | \hline
 22 | \multicolumn{4}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Description} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\
 23 | \hline\hline
 24 | \multicolumn{4}{|l|}{\tt magic} & Magic string & {\tt char[4]} & TBI$\backslash$1 \\
 25 | \hline
 26 | \multicolumn{4}{|l|}{\tt n\_ref} & \# sequences & {\tt int32\_t} & \\
 27 | \hline
 28 | \multicolumn{4}{|l|}{\tt format} & Format (0: generic; 1: SAM; 2: VCF) & {\tt int32\_t} & \\
 29 | \hline
 30 | \multicolumn{4}{|l|}{\tt col\_seq} & Column for the sequence name & {\tt int32\_t} & \\
 31 | \hline
 32 | \multicolumn{4}{|l|}{\tt col\_beg} & Column for the start of a region & {\tt int32\_t} & \\
 33 | \hline
 34 | \multicolumn{4}{|l|}{\tt col\_end} & Column for the end of a region & {\tt int32\_t} & \\
 35 | \hline
 36 | \multicolumn{4}{|l|}{\tt meta} & Leading character for comment lines & {\tt int32\_t} & \\
 37 | \hline
 38 | \multicolumn{4}{|l|}{\tt skip} & \# lines to skip at the beginning & {\tt int32\_t} & \\
 39 | \hline
 40 | \multicolumn{4}{|l|}{\tt l\_nm} & Length of concatenated sequence names & {\tt int32\_t} & \\
 41 | \hline
 42 | \multicolumn{4}{|l|}{\tt names} & Concatenated names, each zero terminated & {\tt char[l\_nm]} & \\
 43 | \hline
 44 | \multicolumn{7}{|c|}{\textcolor{gray}{\it List of indices (n=n\_ref)}}\\
 45 | \cline{2-7}
 46 | \hspace{0.1cm} & \multicolumn{3}{l|}{\tt n\_bin} & \# distinct bins (for the binning index) & {\tt int32\_t} & \\
 47 | \cline{2-7}
 48 |  & \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct bins (n=n\_bin)}} \\
 49 | \cline{3-7}
 50 |  & \hspace{0.1cm} & \multicolumn{2}{l|}{\tt bin} & Distinct bin number & {\tt uint32\_t} & \\
 51 | \cline{3-7}
 52 |  & & \multicolumn{2}{l|}{\tt n\_chunk} & \# chunks & {\tt int32\_t} & \\
 53 | \cline{3-7}
 54 |  & & \multicolumn{5}{c|}{\textcolor{gray}{\it List of chunks (n=n\_chunk)}} \\
 55 | \cline{4-7}
 56 |  & & \hspace{0.1cm} & {\tt cnk\_beg} & Virtual file offset of the start of the chunk & {\tt uint64\_t} & \\
 57 | \cline{4-7}
 58 |  & & & {\tt cnk\_end} & Virtual file offset of the end of the chunk & {\tt uint64\_t} & \\
 59 | \cline{2-7}
 60 |  & \multicolumn{3}{l|}{\tt n\_intv} & \# 16kb intervals (for the linear index) & {\tt int32\_t} & \\
 61 | \cline{2-7}
 62 |  & \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct intervals (n=n\_intv)}} \\
 63 | \cline{3-7}
 64 |  & & \multicolumn{2}{l|}{\tt ioff} & File offset of the first record in the interval & {\tt uint64\_t} & \\
 65 | \hline
 66 | \end{tabular}
 67 | \end{center}
 68 | 
 69 | {\bf Notes:}
 70 | 
 71 | \begin{itemize}
 72 | \item The index file is BGZF compressed.
 73 | \item All integers are little-endian.
 74 | \item When {\tt (format\&0x10000)} is true, the coordinate follows the
 75 |   {\tt BED} rule (i.e. half-closed-half-open and zero based); otherwise,
 76 |   the coordinate follows the {\tt GFF} rule (closed and one based).
 77 | \item For the SAM format, the end of a region equals {\tt POS} plus the
 78 |   reference length in the alignment, inferred from {\tt CIGAR}. For the
 79 |   VCF format, the end of a region equals {\tt POS} plus the size of the
 80 |   deletion.
 81 | \item Field {\tt col\_beg} may equal {\tt col\_end}, and in this case,
 82 |   the end of a region is {\tt end}={\tt beg+1}.
 83 | \item Example. For {\tt GFF}, {\tt format}=0, {\tt col\_seq}=1, {\tt
 84 |     col\_beg}=4, {\tt col\_end}=5, {\tt meta}=`{\tt \#}' and {\tt
 85 |     skip}=0. For {\tt BED}, {\tt format}=0x10000, {\tt col\_seq}=1, {\tt
 86 |     col\_beg}=2, {\tt col\_end}=3, {\tt meta}=`{\tt \#}' and {\tt
 87 |     skip}=0.
 88 | \item Given a zero-based, half-closed and half-open region {\tt
 89 |     [beg,end)}, the {\tt bin} number is calculated with the following C
 90 |   function:
 91 | \begin{verbatim}
 92 | int reg2bin(int beg, int end) {
 93 |   --end;
 94 |   if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14);
 95 |   if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17);
 96 |   if (beg>>20 == end>>20) return  ((1<<9)-1)/7 + (beg>>20);
 97 |   if (beg>>23 == end>>23) return  ((1<<6)-1)/7 + (beg>>23);
 98 |   if (beg>>26 == end>>26) return  ((1<<3)-1)/7 + (beg>>26);
 99 |   return 0;
100 | }
101 | \end{verbatim}
102 | \item The list of bins that may overlap a region {\tt [beg,end)} can be
103 |   obtained with the following C function.
104 | \begin{verbatim}
105 | #define MAX_BIN (((1<<18)-1)/7)
106 | int reg2bins(int rbeg, int rend, uint16_t list[MAX_BIN])
107 | {
108 |   int i = 0, k;
109 |   --rend;
110 |   list[i++] = 0;
111 |   for (k =    1 + (rbeg>>26); k <=    1 + (rend>>26); ++k) list[i++] = k;
112 |   for (k =    9 + (rbeg>>23); k <=    9 + (rend>>23); ++k) list[i++] = k;
113 |   for (k =   73 + (rbeg>>20); k <=   73 + (rend>>20); ++k) list[i++] = k;
114 |   for (k =  585 + (rbeg>>17); k <=  585 + (rend>>17); ++k) list[i++] = k;
115 |   for (k = 4681 + (rbeg>>14); k <= 4681 + (rend>>14); ++k) list[i++] = k;
116 |   return i; // #elements in list[]
117 | }
118 | \end{verbatim}
119 | \end{itemize}
120 | 
121 | \end{document}


--------------------------------------------------------------------------------
/tabix-0.2.6/bgzf.h:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
  4 |                  2011 Attractive Chaos <attractor@live.co.uk>
  5 | 
  6 |    Permission is hereby granted, free of charge, to any person obtaining a copy
  7 |    of this software and associated documentation files (the "Software"), to deal
  8 |    in the Software without restriction, including without limitation the rights
  9 |    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 |    copies of the Software, and to permit persons to whom the Software is
 11 |    furnished to do so, subject to the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be included in
 14 |    all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 |    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 |    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 |    THE SOFTWARE.
 23 | */
 24 | 
 25 | /* The BGZF library was originally written by Bob Handsaker from the Broad
 26 |  * Institute. It was later improved by the SAMtools developers. */
 27 | 
 28 | #ifndef __BGZF_H
 29 | #define __BGZF_H
 30 | 
 31 | #include <stdint.h>
 32 | #include <stdio.h>
 33 | #include <zlib.h>
 34 | 
 35 | #define BGZF_BLOCK_SIZE 0x10000 // 64k
 36 | 
 37 | #define BGZF_ERR_ZLIB   1
 38 | #define BGZF_ERR_HEADER 2
 39 | #define BGZF_ERR_IO     4
 40 | #define BGZF_ERR_MISUSE 8
 41 | 
/*
 * State of one open BGZF stream.
 *
 * block_address and block_offset together form the virtual file offset
 * reported by bgzf_tell(): block_address occupies the bits above bit 16 and
 * the low 16 bits of block_offset fill the rest.
 */
typedef struct {
    /* Three bit-fields packed into a single int. NOTE(review): errcode
     * presumably accumulates the BGZF_ERR_* flags defined above -- confirm
     * against bgzf.c. */
    int open_mode:8, compress_level:8, errcode:16;
	int cache_size;
    int block_length, block_offset;
    int64_t block_address;
    void *uncompressed_block, *compressed_block;
	void *cache; // a pointer to a hash table
	void *fp; // actual file handler; FILE* on writing; FILE* or knetFile* on reading
} BGZF;
 51 | 
/*
 * Growable character buffer. The KSTRING_T guard lets several headers carry
 * this same definition without clashing when they are included together.
 */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	/* NOTE(review): l is presumably the used length and m the allocated
	 * capacity of s -- confirm against kstring.h. */
	size_t l, m;
	char *s;
} kstring_t;
#endif
 59 | 
 60 | #ifdef __cplusplus
 61 | extern "C" {
 62 | #endif
 63 | 
 64 | 	/******************
 65 | 	 * Basic routines *
 66 | 	 ******************/
 67 | 
 68 | 	/**
 69 | 	 * Open an existing file descriptor for reading or writing.
 70 | 	 *
 71 | 	 * @param fd    file descriptor
 72 | 	 * @param mode  mode matching /[rwu0-9]+/: 'r' for reading, 'w' for writing and a digit specifies
 73 | 	 *              the zlib compression level; if both 'r' and 'w' are present, 'w' is ignored.
 74 |      * @return      BGZF file handler; 0 on error
 75 | 	 */
 76 | 	BGZF* bgzf_dopen(int fd, const char *mode);
 77 | 
 78 | 	/**
 79 | 	 * Open the specified file for reading or writing.
 80 | 	 */
 81 | 	BGZF* bgzf_open(const char* path, const char *mode);
 82 | 
 83 | 	/**
 84 | 	 * Close the BGZF and free all associated resources.
 85 | 	 *
 86 | 	 * @param fp    BGZF file handler
 87 | 	 * @return      0 on success and -1 on error
 88 | 	 */
 89 | 	int bgzf_close(BGZF *fp);
 90 | 
 91 | 	/**
 92 | 	 * Read up to _length_ bytes from the file storing into _data_.
 93 | 	 *
 94 | 	 * @param fp     BGZF file handler
 95 | 	 * @param data   data array to read into
 96 | 	 * @param length size of data to read
 97 | 	 * @return       number of bytes actually read; 0 on end-of-file and -1 on error
 98 | 	 */
 99 | 	ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length);
100 | 
101 | 	/**
102 | 	 * Write _length_ bytes from _data_ to the file.
103 | 	 *
104 | 	 * @param fp     BGZF file handler
105 | 	 * @param data   data array to write
106 | 	 * @param length size of data to write
107 | 	 * @return       number of bytes actually written; -1 on error
108 | 	 */
109 | 	ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length);
110 | 
111 | 	/**
112 | 	 * Write the data in the buffer to the file.
113 | 	 */
114 | 	int bgzf_flush(BGZF *fp);
115 | 
116 | 	/**
117 | 	 * Return a virtual file pointer to the current location in the file.
118 | 	 * No interpetation of the value should be made, other than a subsequent
119 | 	 * call to bgzf_seek can be used to position the file at the same point.
120 | 	 * Return value is non-negative on success.
121 | 	 */
122 | 	#define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF))
123 | 
124 | 	/**
125 | 	 * Set the file to read from the location specified by _pos_.
126 | 	 *
127 | 	 * @param fp     BGZF file handler
128 | 	 * @param pos    virtual file offset returned by bgzf_tell()
129 | 	 * @param whence must be SEEK_SET
130 | 	 * @return       0 on success and -1 on error
131 | 	 */
132 | 	int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence);
133 | 
134 | 	/**
135 | 	 * Check if the BGZF end-of-file (EOF) marker is present
136 | 	 *
137 | 	 * @param fp    BGZF file handler opened for reading
138 | 	 * @return      1 if EOF is present; 0 if not or on I/O error
139 | 	 */
140 | 	int bgzf_check_EOF(BGZF *fp);
141 | 
142 | 	/**
143 | 	 * Check if a file is in the BGZF format
144 | 	 *
145 | 	 * @param fn    file name
146 | 	 * @return      1 if _fn_ is BGZF; 0 if not or on I/O error
147 | 	 */
148 | 	 int bgzf_is_bgzf(const char *fn);
149 | 
150 | 	/*********************
151 | 	 * Advanced routines *
152 | 	 *********************/
153 | 
154 | 	/**
155 | 	 * Set the cache size. Only effective when compiled with -DBGZF_CACHE.
156 | 	 *
157 | 	 * @param fp    BGZF file handler
158 | 	 * @param size  size of cache in bytes; 0 to disable caching (default)
159 | 	 */
160 | 	void bgzf_set_cache_size(BGZF *fp, int size);
161 | 
162 | 	/**
163 | 	 * Flush the file if the remaining buffer size is smaller than _size_ 
164 | 	 */
165 | 	int bgzf_flush_try(BGZF *fp, ssize_t size);
166 | 
167 | 	/**
168 | 	 * Read one byte from a BGZF file. It is faster than bgzf_read()
169 | 	 * @param fp     BGZF file handler
170 | 	 * @return       byte read; -1 on end-of-file or error
171 | 	 */
172 | 	int bgzf_getc(BGZF *fp);
173 | 
174 | 	/**
175 | 	 * Read one line from a BGZF file. It is faster than bgzf_getc()
176 | 	 *
177 | 	 * @param fp     BGZF file handler
178 | 	 * @param delim  delimiter
179 | 	 * @param str    string to write to; must be initialized
180 | 	 * @return       length of the string; 0 on end-of-file; negative on error
181 | 	 */
182 | 	int bgzf_getline(BGZF *fp, int delim, kstring_t *str);
183 | 
184 | 	/**
185 | 	 * Read the next BGZF block.
186 | 	 */
187 | 	int bgzf_read_block(BGZF *fp);
188 | 
189 | #ifdef __cplusplus
190 | }
191 | #endif
192 | 
193 | #endif
194 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/bgzip.c:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
  4 | 
  5 |    Permission is hereby granted, free of charge, to any person obtaining a copy
  6 |    of this software and associated documentation files (the "Software"), to deal
  7 |    in the Software without restriction, including without limitation the rights
  8 |    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 |    copies of the Software, and to permit persons to whom the Software is
 10 |    furnished to do so, subject to the following conditions:
 11 | 
 12 |    The above copyright notice and this permission notice shall be included in
 13 |    all copies or substantial portions of the Software.
 14 | 
 15 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 |    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 |    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18 |    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 |    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 |    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 21 |    THE SOFTWARE.
 22 | */
 23 | 
 24 | #include <stdlib.h>
 25 | #include <string.h>
 26 | #include <stdio.h>
 27 | #include <fcntl.h>
 28 | #include <unistd.h>
 29 | #include <errno.h>
 30 | #include <sys/select.h>
 31 | #include <sys/stat.h>
 32 | #include "bgzf.h"
 33 | 
/* Size in bytes (64 KiB) of the I/O buffer that main() allocates and passes
 * to each read() / bgzf_read() call. */
static const int WINDOW_SIZE = 64 * 1024;
 35 | 
 36 | static int bgzip_main_usage()
 37 | {
 38 | 	fprintf(stderr, "\n");
 39 | 	fprintf(stderr, "Usage:   bgzip [options] [file] ...\n\n");
 40 | 	fprintf(stderr, "Options: -c      write on standard output, keep original files unchanged\n");
 41 | 	fprintf(stderr, "         -d      decompress\n");
 42 | 	fprintf(stderr, "         -f      overwrite files without asking\n");
 43 | 	fprintf(stderr, "         -b INT  decompress at virtual file pointer INT\n");
 44 | 	fprintf(stderr, "         -s INT  decompress INT bytes in the uncompressed file\n");
 45 | 	fprintf(stderr, "         -h      give this help\n");
 46 | 	fprintf(stderr, "\n");
 47 | 	return 1;
 48 | }
 49 | 
 50 | static int write_open(const char *fn, int is_forced)
 51 | {
 52 | 	int fd = -1;
 53 | 	char c;
 54 | 	if (!is_forced) {
 55 | 		if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) {
 56 | 			fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn);
 57 | 			scanf("%c", &c);
 58 | 			if (c != 'Y' && c != 'y') {
 59 | 				fprintf(stderr, "[bgzip] not overwritten\n");
 60 | 				exit(1);
 61 | 			}
 62 | 		}
 63 | 	}
 64 | 	if (fd < 0) {
 65 | 		if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) {
 66 | 			fprintf(stderr, "[bgzip] %s: Fail to write\n", fn);
 67 | 			exit(1);
 68 | 		}
 69 | 	}
 70 | 	return fd;
 71 | }
 72 | 
 73 | static void fail(BGZF* fp)
 74 | {
 75 |     fprintf(stderr, "Error: %d\n", fp->errcode);
 76 |     exit(1);
 77 | }
 78 | 
 79 | int main(int argc, char **argv)
 80 | {
 81 | 	int c, compress, pstdout, is_forced;
 82 | 	BGZF *fp;
 83 | 	void *buffer;
 84 | 	long start, end, size;
 85 | 
 86 | 	compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
 87 | 	while((c  = getopt(argc, argv, "cdhfb:s:")) >= 0){
 88 | 		switch(c){
 89 | 		case 'h': return bgzip_main_usage();
 90 | 		case 'd': compress = 0; break;
 91 | 		case 'c': pstdout = 1; break;
 92 | 		case 'b': start = atol(optarg); break;
 93 | 		case 's': size = atol(optarg); break;
 94 | 		case 'f': is_forced = 1; break;
 95 | 		}
 96 | 	}
 97 | 	if (size >= 0) end = start + size;
 98 | 	if (end >= 0 && end < start) {
 99 | 		fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end);
100 | 		return 1;
101 | 	}
102 | 	if (compress == 1) {
103 | 		struct stat sbuf;
104 | 		int f_src = fileno(stdin);
105 | 		int f_dst = fileno(stdout);
106 | 
107 | 		if ( argc>optind )
108 | 		{
109 | 			if ( stat(argv[optind],&sbuf)<0 ) 
110 | 			{ 
111 | 				fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
112 | 				return 1; 
113 | 			}
114 | 
115 | 			if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
116 | 				fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
117 | 				return 1;
118 | 			}
119 | 
120 | 			if (pstdout)
121 | 				f_dst = fileno(stdout);
122 | 			else
123 | 			{
124 | 				char *name = malloc(strlen(argv[optind]) + 5);
125 | 				strcpy(name, argv[optind]);
126 | 				strcat(name, ".gz");
127 | 				f_dst = write_open(name, is_forced);
128 | 				if (f_dst < 0) return 1;
129 | 				free(name);
130 | 			}
131 | 		}
132 | 		else if (!pstdout && isatty(fileno((FILE *)stdout)) )
133 | 			return bgzip_main_usage();
134 | 
135 | 		fp = bgzf_dopen(f_dst, "w");
136 | 		buffer = malloc(WINDOW_SIZE);
137 | 		while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
138 | 			if (bgzf_write(fp, buffer, c) < 0) fail(fp);
139 | 		// f_dst will be closed here
140 | 		if (bgzf_close(fp) < 0) fail(fp);
141 | 		if (argc > optind && !pstdout) unlink(argv[optind]);
142 | 		free(buffer);
143 | 		close(f_src);
144 | 		return 0;
145 | 	} else {
146 | 		struct stat sbuf;
147 | 		int f_dst;
148 | 
149 | 		if ( argc>optind )
150 | 		{
151 | 			if ( stat(argv[optind],&sbuf)<0 )
152 | 			{
153 | 				fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
154 | 				return 1;
155 | 			}
156 | 			char *name;
157 | 			int len = strlen(argv[optind]);
158 | 			if ( strcmp(argv[optind]+len-3,".gz") )
159 | 			{
160 | 				fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
161 | 				return 1;
162 | 			}
163 | 			fp = bgzf_open(argv[optind], "r");
164 | 			if (fp == NULL) {
165 | 				fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
166 | 				return 1;
167 | 			}
168 | 
169 | 			if (pstdout) {
170 | 				f_dst = fileno(stdout);
171 | 			}
172 | 			else {
173 | 				name = strdup(argv[optind]);
174 | 				name[strlen(name) - 3] = '\0';
175 | 				f_dst = write_open(name, is_forced);
176 | 				free(name);
177 | 			}
178 | 		}
179 | 		else if (!pstdout && isatty(fileno((FILE *)stdin)) )
180 | 			return bgzip_main_usage();
181 | 		else
182 | 		{
183 | 			f_dst = fileno(stdout);
184 | 			fp = bgzf_dopen(fileno(stdin), "r");
185 | 			if (fp == NULL) {
186 | 				fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
187 | 				return 1;
188 | 			}
189 | 		}
190 | 		buffer = malloc(WINDOW_SIZE);
191 | 		if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp);
192 | 		while (1) {
193 | 			if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE);
194 | 			else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
195 | 			if (c == 0) break;
196 | 			if (c < 0) fail(fp);
197 | 			start += c;
198 | 			write(f_dst, buffer, c);
199 | 			if (end >= 0 && start >= end) break;
200 | 		}
201 | 		free(buffer);
202 | 		if (bgzf_close(fp) < 0) fail(fp);
203 | 		if (!pstdout) unlink(argv[optind]);
204 | 		return 0;
205 | 	}
206 | }
207 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/kseq.h:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2008 Genome Research Ltd (GRL).
  4 | 
  5 |    Permission is hereby granted, free of charge, to any person obtaining
  6 |    a copy of this software and associated documentation files (the
  7 |    "Software"), to deal in the Software without restriction, including
  8 |    without limitation the rights to use, copy, modify, merge, publish,
  9 |    distribute, sublicense, and/or sell copies of the Software, and to
 10 |    permit persons to whom the Software is furnished to do so, subject to
 11 |    the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be
 14 |    included in all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |    SOFTWARE.
 24 | */
 25 | 
 26 | /* Contact: Heng Li <lh3@sanger.ac.uk> */
 27 | 
 28 | /*
 29 |   2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*"
 30 |  */
 31 | 
 32 | /* Last Modified: 12APR2009 */
 33 | 
 34 | #ifndef AC_KSEQ_H
 35 | #define AC_KSEQ_H
 36 | 
 37 | #include <ctype.h>
 38 | #include <string.h>
 39 | #include <stdlib.h>
 40 | 
 41 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
 42 | #define KS_SEP_TAB   1 // isspace() && !' '
 43 | #define KS_SEP_MAX   1
 44 | 
/* Declare kstream_t, a buffered reader over an underlying handle `f` of type
 * type_t. `buf` holds the most recently read chunk, `begin`..`end` delimits
 * the bytes of it not yet consumed, and `is_eof` is raised by the readers
 * once a read returns fewer bytes than the buffer size. */
#define __KS_TYPE(type_t)						\
	typedef struct __kstream_t {				\
		unsigned char *buf;						\
		int begin, end, is_eof;					\
		type_t f;								\
	} kstream_t;
 51 | 
/* ks_eof: true when the stream is flagged as ended and its buffer has been
 * fully consumed. ks_rewind: clear the EOF flag and empty the buffer window;
 * the underlying handle itself is not touched by this macro. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
 54 | 
 55 | #define __KS_BASIC(type_t, __bufsize)								\
 56 | 	static inline kstream_t *ks_init(type_t f)						\
 57 | 	{																\
 58 | 		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));	\
 59 | 		ks->f = f;													\
 60 | 		ks->buf = malloc(__bufsize);								\
 61 | 		return ks;													\
 62 | 	}																\
 63 | 	static inline void ks_destroy(kstream_t *ks)					\
 64 | 	{																\
 65 | 		if (ks) {													\
 66 | 			free(ks->buf);											\
 67 | 			free(ks);												\
 68 | 		}															\
 69 | 	}
 70 | 
 71 | #define __KS_GETC(__read, __bufsize)						\
 72 | 	static inline int ks_getc(kstream_t *ks)				\
 73 | 	{														\
 74 | 		if (ks->is_eof && ks->begin >= ks->end) return -1;	\
 75 | 		if (ks->begin >= ks->end) {							\
 76 | 			ks->begin = 0;									\
 77 | 			ks->end = __read(ks->f, ks->buf, __bufsize);	\
 78 | 			if (ks->end < __bufsize) ks->is_eof = 1;		\
 79 | 			if (ks->end == 0) return -1;					\
 80 | 		}													\
 81 | 		return (int)ks->buf[ks->begin++];					\
 82 | 	}
 83 | 
/* Growable string used by the readers below: `l` is the current length, `m`
 * the allocated size of `s`. The KSTRING_T guard prevents a clash when
 * another header has already provided the same definition. */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif
 91 | 
/* Round x up, in place, to a power of two by smearing its highest set bit
 * into all lower positions (shifts cover 32 bits) and adding one; a value
 * that is already a power of two is left unchanged. x must be a modifiable
 * lvalue and is evaluated several times. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
 95 | 
 96 | #define __KS_GETUNTIL(__read, __bufsize)								\
 97 | 	static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
 98 | 	{																	\
 99 | 		if (dret) *dret = 0;											\
100 | 		str->l = 0;														\
101 | 		if (ks->begin >= ks->end && ks->is_eof) return -1;				\
102 | 		for (;;) {														\
103 | 			int i;														\
104 | 			if (ks->begin >= ks->end) {									\
105 | 				if (!ks->is_eof) {										\
106 | 					ks->begin = 0;										\
107 | 					ks->end = __read(ks->f, ks->buf, __bufsize);		\
108 | 					if (ks->end < __bufsize) ks->is_eof = 1;			\
109 | 					if (ks->end == 0) break;							\
110 | 				} else break;											\
111 | 			}															\
112 | 			if (delimiter > KS_SEP_MAX) {								\
113 | 				for (i = ks->begin; i < ks->end; ++i)					\
114 | 					if (ks->buf[i] == delimiter) break;					\
115 | 			} else if (delimiter == KS_SEP_SPACE) {						\
116 | 				for (i = ks->begin; i < ks->end; ++i)					\
117 | 					if (isspace(ks->buf[i])) break;						\
118 | 			} else if (delimiter == KS_SEP_TAB) {						\
119 | 				for (i = ks->begin; i < ks->end; ++i)					\
120 | 					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
121 | 			} else i = 0; /* never come to here! */						\
122 | 			if (str->m - str->l < i - ks->begin + 1) {					\
123 | 				str->m = str->l + (i - ks->begin) + 1;					\
124 | 				kroundup32(str->m);										\
125 | 				str->s = (char*)realloc(str->s, str->m);				\
126 | 			}															\
127 | 			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
128 | 			str->l = str->l + (i - ks->begin);							\
129 | 			ks->begin = i + 1;											\
130 | 			if (i < ks->end) {											\
131 | 				if (dret) *dret = ks->buf[i];							\
132 | 				break;													\
133 | 			}															\
134 | 		}																\
135 | 		if (str->l == 0) {												\
136 | 			str->m = 1;													\
137 | 			str->s = (char*)calloc(1, 1);								\
138 | 		}																\
139 | 		str->s[str->l] = '\0';											\
140 | 		return str->l;													\
141 | 	}
142 | 
/* Instantiate the complete buffered-stream API (kstream_t, ks_init,
 * ks_destroy, ks_getc, ks_getuntil) for handle type type_t, using __read to
 * fill a buffer of __bufsize bytes. */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t)							\
	__KS_BASIC(type_t, __bufsize)				\
	__KS_GETC(__read, __bufsize)				\
	__KS_GETUNTIL(__read, __bufsize)
148 | 
/* Generate the lifecycle helpers of a kseq_t record reader:
 *   kseq_init(fd)    zero-allocate a reader and attach a stream over fd;
 *   kseq_rewind(ks)  forget the pending header character and reset the
 *                    stream's EOF flag and buffer window;
 *   kseq_destroy(ks) free the four string buffers, the stream and the reader
 *                    itself; a NULL ks is ignored. */
#define __KSEQ_BASIC(type_t)											\
	static inline kseq_t *kseq_init(type_t fd)							\
	{																	\
		kseq_t *reader = (kseq_t*)calloc(1, sizeof(kseq_t));			\
		reader->f = ks_init(fd);										\
		return reader;													\
	}																	\
	static inline void kseq_rewind(kseq_t *ks)							\
	{																	\
		kstream_t *stream = ks->f;										\
		ks->last_char = 0;												\
		stream->end = 0;												\
		stream->begin = 0;												\
		stream->is_eof = 0;												\
	}																	\
	static inline void kseq_destroy(kseq_t *ks)							\
	{																	\
		if (ks) {														\
			free(ks->name.s);											\
			free(ks->comment.s);										\
			free(ks->seq.s);											\
			free(ks->qual.s);											\
			ks_destroy(ks->f);											\
			free(ks);													\
		}																\
	}
168 | 
/* Generate kseq_read(): read the next FASTA/FASTQ record into `seq`.
 *
 * A record starts at a '>' or '@' header character; the name runs up to the
 * first whitespace and the rest of the header line becomes the comment.
 * Sequence characters are collected until the next '>', '@' or '+'. After a
 * '+' line, as many quality characters as there are sequence characters are
 * read.
 *
 * Return value:
 *   >=0  length of the sequence (normal)
 *   -1   end-of-file
 *   -2   truncated quality string
 *
 * The sequence buffer is sized before it is NUL-terminated, so a record
 * with an empty sequence no longer writes through a NULL pointer. */
#define __KSEQ_READ														\
	static int kseq_read(kseq_t *seq)									\
	{																	\
		int c;															\
		kstream_t *ks = seq->f;											\
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@');	\
			if (c == -1) return -1; /* end of file */					\
			seq->last_char = c;											\
		} /* the first header char has been read */						\
		seq->comment.l = seq->seq.l = seq->qual.l = 0;					\
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1;			\
		if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0);			\
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (isgraph(c)) { /* printable non-space character */		\
				if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
					seq->seq.m = seq->seq.l + 2;						\
					kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
					seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
				}														\
				seq->seq.s[seq->seq.l++] = (char)c;						\
			}															\
		}																\
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */	\
		if (seq->seq.l + 1 >= seq->seq.m) { /* empty sequence: seq.s may still be NULL */ \
			seq->seq.m = seq->seq.l + 2;								\
			kroundup32(seq->seq.m);										\
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m);		\
		}																\
		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */		\
		if (c != '+') return seq->seq.l; /* FASTA */					\
		if (seq->qual.m < seq->seq.m) {	/* allocate enough memory */	\
			seq->qual.m = seq->seq.m;									\
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m);		\
		}																\
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* we should not stop here */			\
		while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l)		\
			if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c;	\
		seq->qual.s[seq->qual.l] = 0; /* null terminated string */		\
		seq->last_char = 0;	/* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
		return seq->seq.l;												\
	}
213 | 
/* Declare kseq_t, one sequence record plus its reader state: the four
 * strings filled by kseq_read(), the header character ('>' or '@') already
 * consumed while finishing the previous record (0 if none), and the
 * underlying buffered stream. */
#define __KSEQ_TYPE(type_t)						\
	typedef struct {							\
		kstring_t name, comment, seq, qual;		\
		int last_char;							\
		kstream_t *f;							\
	} kseq_t;
220 | 
/* Instantiate the whole sequence reader (stream layer with a 4096-byte
 * buffer, kseq_t, kseq_init/kseq_rewind/kseq_destroy and kseq_read) for
 * handle type type_t and read function __read. */
#define KSEQ_INIT(type_t, __read)				\
	KSTREAM_INIT(type_t, __read, 4096)			\
	__KSEQ_TYPE(type_t)							\
	__KSEQ_BASIC(type_t)						\
	__KSEQ_READ
226 | 
227 | #endif
228 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/ksort.h:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2008 Genome Research Ltd (GRL).
  4 | 
  5 |    Permission is hereby granted, free of charge, to any person obtaining
  6 |    a copy of this software and associated documentation files (the
  7 |    "Software"), to deal in the Software without restriction, including
  8 |    without limitation the rights to use, copy, modify, merge, publish,
  9 |    distribute, sublicense, and/or sell copies of the Software, and to
 10 |    permit persons to whom the Software is furnished to do so, subject to
 11 |    the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be
 14 |    included in all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |    SOFTWARE.
 24 | */
 25 | 
 26 | /* Contact: Heng Li <lh3@sanger.ac.uk> */
 27 | 
 28 | /*
 29 |   2008-11-16 (0.1.4):
 30 | 
 31 |     * Fixed a bug in introsort() that happens in rare cases.
 32 | 
 33 |   2008-11-05 (0.1.3):
 34 | 
 35 |     * Fixed a bug in introsort() for complex comparisons.
 36 | 
 37 | 	* Fixed a bug in mergesort(). The previous version is not stable.
 38 | 
 39 |   2008-09-15 (0.1.2):
 40 | 
 41 | 	* Accelerated introsort. On my Mac (not on another Linux machine),
 42 | 	  my implementation is as fast as std::sort on random input.
 43 | 
 44 | 	* Added combsort and in introsort, switch to combsort if the
 45 | 	  recursion is too deep.
 46 | 
 47 |   2008-09-13 (0.1.1):
 48 | 
 49 | 	* Added k-small algorithm
 50 | 
 51 |   2008-09-05 (0.1.0):
 52 | 
 53 | 	* Initial version
 54 | 
 55 | */
 56 | 
 57 | #ifndef AC_KSORT_H
 58 | #define AC_KSORT_H
 59 | 
 60 | #include <stdlib.h>
 61 | #include <string.h>
 62 | 
/* One entry of an explicit work stack: a pair of untyped element pointers
 * and a depth counter. NOTE(review): presumably used by the introsort
 * mentioned in the change log above to track pending partitions -- confirm
 * against the sort implementation. */
typedef struct {
	void *left, *right;
	int depth;
} ks_isort_stack_t;
 67 | 
 68 | #define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
 69 | 
 70 | #define KSORT_INIT(name, type_t, __sort_lt) /* defines the ks_*_##name functions for type_t; __sort_lt(a, b) is the less-than test used for every comparison */ \
 71 | 	void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) /* bottom-up merge sort of array[0..n); temp is an n-element scratch buffer, or 0 to have one malloc'ed */ \
 72 | 	{ \
 73 | 		type_t *a2[2], *a, *b; /* a2: the two buffers the passes alternate between */ \
 74 | 		int curr, shift; /* curr: index in a2 of the buffer holding the current data */ \
 75 | 		\
 76 | 		a2[0] = array; \
 77 | 		a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); /* NOTE(review): malloc result is not checked */ \
 78 | 		for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) { /* pass number shift merges sorted runs of length 1<<shift */ \
 79 | 			a = a2[curr]; b = a2[1-curr]; /* read from a, write to b */ \
 80 | 			if (shift == 0) { /* first pass: put each adjacent pair in order */ \
 81 | 				type_t *p = b, *i, *eb = a + n; \
 82 | 				for (i = a; i < eb; i += 2) { \
 83 | 					if (i == eb - 1) *p++ = *i; /* a trailing odd element has no partner */ \
 84 | 					else { \
 85 | 						if (__sort_lt(*(i+1), *i)) { \
 86 | 							*p++ = *(i+1); *p++ = *i; \
 87 | 						} else { /* elements that do not compare less keep their order */ \
 88 | 							*p++ = *i; *p++ = *(i+1); \
 89 | 						} \
 90 | 					} \
 91 | 				} \
 92 | 			} else { \
 93 | 				size_t i, step = 1ul<<shift; \
 94 | 				for (i = 0; i < n; i += step<<1) { /* merge [i, i+step) with [i+step, i+2*step), clipped to n */ \
 95 | 					type_t *p, *j, *k, *ea, *eb; \
 96 | 					if (n < i + step) { /* no second run: eb = a makes k < eb false, so the tail is only copied */ \
 97 | 						ea = a + n; eb = a; \
 98 | 					} else { \
 99 | 						ea = a + i + step; \
100 | 						eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
101 | 					} \
102 | 					j = a + i; k = a + i + step; p = b + i; \
103 | 					while (j < ea && k < eb) { \
104 | 						if (__sort_lt(*k, *j)) *p++ = *k++; /* the right run wins only when strictly less */ \
105 | 						else *p++ = *j++; \
106 | 					} \
107 | 					while (j < ea) *p++ = *j++; /* copy whatever is left of either run */ \
108 | 					while (k < eb) *p++ = *k++; \
109 | 				} \
110 | 			} \
111 | 			curr = 1 - curr; /* the destination becomes the source of the next pass */ \
112 | 		} \
113 | 		if (curr == 1) { /* the result ended up in the scratch buffer: copy it back into array */ \
114 | 			type_t *p = a2[0], *i = a2[1], *eb = array + n; \
115 | 			for (; p < eb; ++i) *p++ = *i; \
116 | 		} \
117 | 		if (temp == 0) free(a2[1]); /* release the buffer only if it was allocated here */ \
118 | 	} \
119 | 	void ks_heapadjust_##name(size_t i, size_t n, type_t l[]) /* sift the element at l[i] down within l[0..n) */ \
120 | 	{ \
121 | 		size_t child; \
122 | 		type_t held = l[i]; /* value being sifted; written back once at the end */ \
123 | 		for (child = (i << 1) + 1; child < n; child = (child << 1) + 1) { \
124 | 			if (child + 1 != n && __sort_lt(l[child], l[child+1])) child += 1; /* move to the right child when the left one compares less */ \
125 | 			if (__sort_lt(l[child], held)) break; /* the chosen child compares less than held: held stays at i */ \
126 | 			l[i] = l[child]; i = child; \
127 | 		} \
128 | 		l[i] = held; \
129 | 	} \
130 | 	void ks_heapmake_##name(size_t lsize, type_t l[]) /* run ks_heapadjust on every index from (lsize>>1)-1 down to 0 */ \
131 | 	{ \
132 | 		size_t parent = lsize >> 1; /* one past the first index to adjust */ \
133 | 		while (parent-- > 0) /* the body sees (lsize>>1)-1, ..., 1, 0; nothing runs when lsize < 2 */ \
134 | 			ks_heapadjust_##name(parent, lsize, l); \
135 | 	} \
136 | 	void ks_heapsort_##name(size_t lsize, type_t l[]) /* in-place sort of l[0..lsize); never calls ks_heapmake_##name, so presumably the caller heapifies first */ \
137 | 	{ \
138 | 		size_t i; \
139 | 		for (i = lsize? lsize - 1 : 0; i > 0; --i) { /* guard: with lsize == 0 the old lsize - 1 wrapped to SIZE_MAX and indexed far out of bounds */ \
140 | 			type_t tmp; \
141 | 			tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); /* move the root into slot i, then repair l[0..i) */ \
142 | 		} \
143 | 	} \
144 | 	static inline void __ks_insertsort_##name(type_t *s, type_t *t) /* insertion sort of [s, t); static because a plain C99 'inline' emits no external definition and can fail to link */ \
145 | 	{ \
146 | 		type_t *i, *j, swap_tmp; \
147 | 		for (i = s + 1; i < t; ++i) /* extend the sorted prefix [s, i) by one element */ \
148 | 			for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { /* move the new element left while it compares less */ \
149 | 				swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \
150 | 			} \
151 | 	} \
152 | 	void ks_combsort_##name(size_t n, type_t a[]) /* comb sort of a[0..n), followed by an insertion sort unless the gap ended at 1 */ \
153 | 	{ \
154 | 		const double shrink_factor = 1.2473309501039786540366528676643; /* the gap is divided by this each round */ \
155 | 		int do_swap; /* set when the current round exchanged anything */ \
156 | 		size_t gap = n; \
157 | 		type_t tmp, *i, *j; \
158 | 		do { \
159 | 			if (gap > 2) { /* the gap stops shrinking once it is 2 or less */ \
160 | 				gap = (size_t)(gap / shrink_factor); \
161 | 				if (gap == 9 || gap == 10) gap = 11; /* gaps of 9 and 10 are replaced by 11 */ \
162 | 			} \
163 | 			do_swap = 0; \
164 | 			for (i = a; i < a + n - gap; ++i) { /* compare every pair that is gap apart */ \
165 | 				j = i + gap; \
166 | 				if (__sort_lt(*j, *i)) { \
167 | 					tmp = *i; *i = *j; *j = tmp; \
168 | 					do_swap = 1; \
169 | 				} \
170 | 			} \
171 | 		} while (do_swap || gap > 2); /* repeat until a round with gap <= 2 makes no exchange */ \
172 | 		if (gap != 1) __ks_insertsort_##name(a, a + n); /* clean-up pass over the whole array */ \
173 | 	} \
174 | 	void ks_introsort_##name(size_t n, type_t a[]) /* sort a[0..n): quicksort on an explicit stack, comb sort once the depth counter hits 0, one insertion sort over everything at the end */ \
175 | 	{ \
176 | 		int d; /* depth counter, decremented per partitioning step */ \
177 | 		ks_isort_stack_t *top, *stack; \
178 | 		type_t rp, swap_tmp; /* rp: copy of the pivot value */ \
179 | 		type_t *s, *t, *i, *j, *k; /* [s, t] is the range being partitioned, both ends inclusive */ \
180 | 		\
181 | 		if (n < 1) return; \
182 | 		else if (n == 2) { \
183 | 			if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
184 | 			return; \
185 | 		} \
186 | 		for (d = 2; 1ul<<d < n; ++d); /* smallest d >= 2 with 2^d >= n */ \
187 | 		stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); /* NOTE(review): malloc result is not checked */ \
188 | 		top = stack; s = a; t = a + (n-1); d <<= 1; /* the counter starts at twice that d */ \
189 | 		while (1) { \
190 | 			if (s < t) { \
191 | 				if (--d == 0) { /* counter used up: finish this range with comb sort */ \
192 | 					ks_combsort_##name(t - s + 1, s); \
193 | 					t = s; /* empties the range so the next iteration pops the stack */ \
194 | 					continue; \
195 | 				} \
196 | 				i = s; j = t; k = i + ((j-i)>>1) + 1; /* k: element just past the midpoint */ \
197 | 				if (__sort_lt(*k, *i)) { /* choose the pivot among *i, *k and *j */ \
198 | 					if (__sort_lt(*k, *j)) k = j; \
199 | 				} else k = __sort_lt(*j, *i)? i : j; \
200 | 				rp = *k; \
201 | 				if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } /* park the pivot at the right end */ \
202 | 				for (;;) { /* NOTE(review): ++i precedes the first test, so *s itself is never compared with the pivot */ \
203 | 					do ++i; while (__sort_lt(*i, rp)); \
204 | 					do --j; while (i <= j && __sort_lt(rp, *j)); \
205 | 					if (j <= i) break; \
206 | 					swap_tmp = *i; *i = *j; *j = swap_tmp; \
207 | 				} \
208 | 				swap_tmp = *i; *i = *t; *t = swap_tmp; /* bring the pivot from the right end to slot i */ \
209 | 				if (i-s > t-i) { /* left part is larger: defer it and continue with the right part */ \
210 | 					if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
211 | 					s = t-i > 16? i+1 : t; /* parts of 16 or fewer elements are not partitioned further */ \
212 | 				} else { \
213 | 					if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
214 | 					t = i-s > 16? i-1 : s; \
215 | 				} \
216 | 			} else { \
217 | 				if (top == stack) { /* nothing left to partition */ \
218 | 					free(stack); \
219 | 					__ks_insertsort_##name(a, a+n); \
220 | 					return; \
221 | 				} else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
222 | 			} \
223 | 		} \
224 | 	} \
225 | 	/* This function is adapted from: http://ndevilla.free.fr/median/ */ \
226 | 	/* 0 <= kk < n */													\
227 | 	type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) /* returns arr[kk] after partially reordering arr in place around that slot */ \
228 | 	{ \
229 | 		type_t *low, *high, *k, *ll, *hh, *mid; \
230 | 		low = arr; high = arr + n - 1; k = arr + kk; /* [low, high] is the window still being narrowed around slot k */ \
231 | 		for (;;) { \
232 | 			if (high <= low) return *k; /* window of at most one element */ \
233 | 			if (high == low + 1) { /* window of two: order them and finish */ \
234 | 				if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
235 | 				return *k; \
236 | 			} \
237 | 			mid = low + (high - low) / 2; \
238 | 			if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); /* these three conditional swaps leave *low not less than *mid and not greater than *high */ \
239 | 			if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
240 | 			if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low);	\
241 | 			KSORT_SWAP(type_t, *mid, *(low+1)); /* *low now serves as the pivot for the scan below */ \
242 | 			ll = low + 1; hh = high; \
243 | 			for (;;) { /* exchange elements found on the wrong side of the pivot until the scans cross */ \
244 | 				do ++ll; while (__sort_lt(*ll, *low)); \
245 | 				do --hh; while (__sort_lt(*low, *hh)); \
246 | 				if (hh < ll) break; \
247 | 				KSORT_SWAP(type_t, *ll, *hh); \
248 | 			} \
249 | 			KSORT_SWAP(type_t, *low, *hh); /* the pivot moves to slot hh */ \
250 | 			if (hh <= k) low = ll; /* keep only the side of the pivot that contains slot k */ \
251 | 			if (hh >= k) high = hh - 1; \
252 | 		} \
253 | 	}
254 | 
255 | #define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) /* each ks_xxx(name, ...) macro calls the function KSORT_INIT generated for that name */
256 | #define ks_introsort(name, n, a) ks_introsort_##name(n, a)
257 | #define ks_combsort(name, n, a) ks_combsort_##name(n, a)
258 | #define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
259 | #define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
260 | #define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
261 | #define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
262 | /* comparison macros passed as the __sort_lt argument of KSORT_INIT below */
263 | #define ks_lt_generic(a, b) ((a) < (b))
264 | #define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
265 | /* element type of the string instantiation */
266 | typedef const char *ksstr_t;
267 | /* shorthand instantiations: KSORT_INIT_GENERIC(T) uses T as both the name suffix and the element type with operator <; KSORT_INIT_STR uses the suffix "str" and strcmp */
268 | #define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
269 | #define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
270 | 
271 | #endif
272 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/main.c:
--------------------------------------------------------------------------------
  1 | #include <string.h>
  2 | #include <unistd.h>
  3 | #include <stdlib.h>
  4 | #include <stdio.h>
  5 | #include <sys/stat.h>
  6 | #include <errno.h>
  7 | #include "bgzf.h"
  8 | #include "tabix.h"
  9 | #include "knetfile.h"
 10 | 
 11 | #define PACKAGE_VERSION "0.2.5 (r1005)" /* shown on the "Version:" line of the usage text */
 12 | /* Print a printf-style message to stderr and return -1 from the enclosing function; expands to a brace block containing a return statement. */
 13 | #define error(...) { fprintf(stderr,__VA_ARGS__); return -1; }
 14 | 
 15 | /* Write the BGZF file `file` to stdout with its leading block of meta lines
 16 |  * replaced by the contents of the text file `header`.
 17 |  *   header  path of the plain-text replacement header
 18 |  *   file    path of the BGZF-compressed input
 19 |  *   meta    first character of every header line
 20 |  * Returns 0 on success and -1 on failure. */
 21 | int reheader_file(const char *header, const char *file, int meta)
 22 | {
 23 |     BGZF *fp, *bgzf_out;
 24 |     FILE *fh = NULL;
 25 |     char *buffer, *buf = NULL;
 26 |     int skip_until = 0, page_size, ret = -1;
 27 |     ssize_t nread;
 28 | 
 29 |     fp = bgzf_open(file,"r");
 30 |     if ( !fp ) { // the handle used to be dereferenced without this check
 31 |         fprintf(stderr, "[tabix] failed to open %s\n", file);
 32 |         return -1;
 33 |     }
 34 |     if (bgzf_read_block(fp) != 0 || !fp->block_length) goto done;
 35 |     buffer = fp->uncompressed_block;
 36 |     if ( buffer[0]==meta ) {
 37 |         skip_until = 1;
 38 |         while (1) { // Skip the header
 39 |             if ( buffer[skip_until]=='\n' ) {
 40 |                 skip_until++;
 41 |                 if ( skip_until>=fp->block_length ) { // the line ended exactly at the block end
 42 |                     if (bgzf_read_block(fp) != 0 || !fp->block_length) { fprintf(stderr,"no body?\n"); goto done; }
 43 |                     skip_until = 0;
 44 |                 }
 45 |                 // The header has finished
 46 |                 if ( buffer[skip_until]!=meta ) break;
 47 |             }
 48 |             skip_until++;
 49 |             if ( skip_until>=fp->block_length ) {
 50 |                 if (bgzf_read_block(fp) != 0 || !fp->block_length) { fprintf(stderr,"no body?\n"); goto done; }
 51 |                 skip_until = 0;
 52 |             }
 53 |         }
 54 |     }
 55 | 
 56 |     fh = fopen(header,"r");
 57 |     if ( !fh ) { fprintf(stderr,"%s: %s", header,strerror(errno)); goto done; }
 58 |     page_size = getpagesize();
 59 |     buf = valloc(page_size);
 60 |     bgzf_out = bgzf_dopen(fileno(stdout), "w");
 61 |     if ( !buf || !bgzf_out ) { fprintf(stderr,"[tabix] failed to set up the output\n"); goto done; }
 62 |     while ( (nread=fread(buf,1,page_size-1,fh))>0 ) {
 63 |         // A short read is the last chunk: make sure the new header ends with a newline
 64 |         if ( nread<page_size-1 && buf[nread-1]!='\n' )
 65 |             buf[nread++] = '\n';
 66 |         if (bgzf_write(bgzf_out, buf, nread) < 0) { fprintf(stderr,"Error: %d\n",bgzf_out->errcode); goto done; }
 67 |     }
 68 | 
 69 |     // Body data that shares a block with the end of the old header
 70 |     if ( fp->block_length - skip_until > 0 ) {
 71 |         if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) {
 72 |             fprintf(stderr,"Error: %d\n",bgzf_out->errcode); // the writer failed, so report its code (was fp->errcode)
 73 |             goto done;
 74 |         }
 75 |     }
 76 |     if (bgzf_flush(bgzf_out) < 0) { fprintf(stderr,"Error: %d\n",bgzf_out->errcode); goto done; }
 77 | 
 78 |     while (1) { // The rest of the input is copied to the output handle's stream without going through bgzf_write
 79 | #ifdef _USE_KNETFILE
 80 |         nread = knet_read(fp->fp, buf, page_size);
 81 | #else
 82 |         nread = fread(buf, 1, page_size, fp->fp);
 83 | #endif
 84 |         if ( nread<=0 ) break;
 85 |         int count = fwrite(buf, 1, nread, bgzf_out->fp);
 86 |         if (count != nread) { fprintf(stderr,"Write failed, wrote %d instead of %d bytes.\n", count,(int)nread); goto done; }
 87 |     }
 88 |     if (bgzf_close(bgzf_out) < 0) { fprintf(stderr,"Error: %d\n",bgzf_out->errcode); goto done; }
 89 |     ret = 0;
 90 | done: // bgzf_out is left alone on failure so that no further output is attempted
 91 |     if (fh) fclose(fh);
 92 |     free(buf);
 93 |     bgzf_close(fp);
 94 |     return ret;
 95 | }
 96 | 
 97 | 
 98 | /* Command-line entry point. Depending on the options it lists sequence names (-l),
 99 |  * replaces the header (-r), builds an index when only a file is given, or prints
100 |  * header lines and/or the records of the requested regions.
101 |  * Returns 0 on success; 1 (or the status of the helper that was run) on failure. */
102 | int main(int argc, char *argv[])
103 | {
104 | 	int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0, print_only_header = 0, bed_reg = 0;
105 | 	ti_conf_t conf = ti_conf_gff, *conf_ptr = NULL;
106 | 	const char *reheader = NULL;
107 | 	while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhHfBr:")) >= 0) {
108 | 		switch (c) {
109 | 		case 'B': bed_reg = 1; break;
110 | 		case '0': conf.preset |= TI_FLAG_UCSC; break;
111 | 		case 'S': skip = atoi(optarg); break;
112 | 		case 'c': meta = optarg[0]; break;
113 | 		case 'p':
114 | 			if (strcmp(optarg, "gff") == 0) conf_ptr = &ti_conf_gff;
115 | 			else if (strcmp(optarg, "bed") == 0) conf_ptr = &ti_conf_bed;
116 | 			else if (strcmp(optarg, "sam") == 0) conf_ptr = &ti_conf_sam;
117 | 			else if (strcmp(optarg, "vcf") == 0 || strcmp(optarg, "vcf4") == 0) conf_ptr = &ti_conf_vcf;
118 | 			else if (strcmp(optarg, "psltbl") == 0) conf_ptr = &ti_conf_psltbl;
119 | 			else {
120 | 				fprintf(stderr, "[main] unrecognized preset '%s'\n", optarg);
121 | 				return 1;
122 | 			}
123 | 			break;
124 | 		case 's': conf.sc = atoi(optarg); break;
125 | 		case 'b': conf.bc = atoi(optarg); break;
126 | 		case 'e': conf.ec = atoi(optarg); break;
127 | 		case 'l': list_chrms = 1; break;
128 | 		case 'h': print_header = 1; break;
129 | 		case 'H': print_only_header = 1; break;
130 | 		case 'f': force = 1; break;
131 | 		case 'r': reheader = optarg; break;
132 | 		}
133 | 	}
134 | 	if (optind == argc) {
135 | 		fprintf(stderr, "\n");
136 | 		fprintf(stderr, "Program: tabix (TAB-delimited file IndeXer)\n");
137 | 		fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION);
138 | 		fprintf(stderr, "Usage:   tabix <in.tab.bgz> [region1 [region2 [...]]]\n\n");
139 | 		fprintf(stderr, "Options: -p STR     preset: gff, bed, sam, vcf, psltbl [gff]\n");
140 | 		fprintf(stderr, "         -s INT     sequence name column [1]\n");
141 | 		fprintf(stderr, "         -b INT     start column [4]\n");
142 | 		fprintf(stderr, "         -e INT     end column; can be identical to '-b' [5]\n");
143 | 		fprintf(stderr, "         -S INT     skip first INT lines [0]\n");
144 | 		fprintf(stderr, "         -c CHAR    symbol for comment/meta lines [#]\n");
145 | 		fprintf(stderr, "         -r FILE    replace the header with the content of FILE [null]\n");
146 | 		fprintf(stderr, "         -B         region1 is a BED file (entire file will be read)\n");
147 | 		fprintf(stderr, "         -0         zero-based coordinate\n");
148 | 		fprintf(stderr, "         -h         print also the header lines\n");
149 | 		fprintf(stderr, "         -H         print only the header lines\n");
150 | 		fprintf(stderr, "         -l         list chromosome names\n");
151 | 		fprintf(stderr, "         -f         force to overwrite the index\n");
152 | 		fprintf(stderr, "\n");
153 | 		return 1;
154 | 	}
155 | 	if ( !conf_ptr ) { // no -p given: guess the preset from the file name extension
156 | 		int l = strlen(argv[optind]);
157 | 		int strcasecmp(const char *s1, const char *s2);
158 | 		if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &ti_conf_gff;
159 | 		else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &ti_conf_bed;
160 | 		else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &ti_conf_sam;
161 | 		else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &ti_conf_vcf;
162 | 		else if (l>=10 && strcasecmp(argv[optind]+l-10, ".psltbl.gz") == 0) conf_ptr = &ti_conf_psltbl;
163 | 	}
164 | 	// NOTE(review): a preset (given or guessed) replaces conf wholesale, discarding any earlier -s/-b/-e/-0
165 | 	if ( conf_ptr )
166 | 		conf = *conf_ptr;
167 | 
168 | 	if (skip >= 0) conf.line_skip = skip;
169 | 	if (meta >= 0) conf.meta_char = meta;
170 | 	if (list_chrms) {
171 | 		ti_index_t *idx;
172 | 		int i, n;
173 | 		const char **names;
174 | 		idx = ti_index_load(argv[optind]);
175 | 		if (idx == 0) {
176 | 			fprintf(stderr, "[main] fail to load the index file.\n");
177 | 			return 1;
178 | 		}
179 | 		names = ti_seqname(idx, &n);
180 | 		for (i = 0; i < n; ++i) printf("%s\n", names[i]);
181 | 		free(names);
182 | 		ti_index_destroy(idx);
183 | 		return 0;
184 | 	}
185 | 	if (reheader)
186 | 		return reheader_file(reheader,argv[optind],conf.meta_char);
187 | 
188 | 	struct stat stat_tbi,stat_vcf;
189 | 	char *fnidx = calloc(strlen(argv[optind]) + 5, 1);
190 | 	if (fnidx == 0) { fprintf(stderr, "[tabix] out of memory.\n"); return 1; }
191 | 	strcat(strcpy(fnidx, argv[optind]), ".tbi");
192 | 
193 | 	if (optind + 1 == argc && !print_only_header) { // only a file name was given: build the index
194 | 		int ret;
195 | 		if (force == 0 && stat(fnidx, &stat_tbi) == 0) {
196 | 			// Before complaining, check if the VCF file isn't newer. This is a common source of errors,
197 | 			//  people tend not to notice that tabix failed
198 | 			// The times are compared only if the data file could be stat'ed; otherwise stat_vcf is undefined.
199 | 			if ( stat(argv[optind], &stat_vcf) == 0 && stat_vcf.st_mtime <= stat_tbi.st_mtime ) {
200 | 				fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n");
201 | 				free(fnidx);
202 | 				return 1;
203 | 			}
204 | 		}
205 | 		if ( bgzf_is_bgzf(argv[optind])!=1 ) {
206 | 			fprintf(stderr,"[tabix] was bgzip used to compress this file? %s\n", argv[optind]);
207 | 			free(fnidx);
208 | 			return 1;
209 | 		}
210 | 		if ( !conf_ptr ) {
211 | 			// Building the index but the file type was neither recognised nor given. If no custom change
212 | 			//  has been made, warn the user that GFF is used 
213 | 			if ( conf.preset==ti_conf_gff.preset 
214 | 				&& conf.sc==ti_conf_gff.sc 
215 | 				&& conf.bc==ti_conf_gff.bc 
216 | 				&& conf.ec==ti_conf_gff.ec 
217 | 				&& conf.meta_char==ti_conf_gff.meta_char 
218 | 				&& conf.line_skip==ti_conf_gff.line_skip )
219 | 				fprintf(stderr,"[tabix] The file type not recognised and -p not given, using the preset [gff].\n");
220 | 		}
221 | 		ret = ti_index_build(argv[optind], &conf);
222 | 		free(fnidx); // used to leak on this path
223 | 		return ret;
224 | 	}
225 | 	{ // retrieve
226 | 		tabix_t *t;
227 | 		// On some systems, stat on non-existent files returns undefined value for st_mtime, the user had to use -f.
228 | 		//  Both stat() results are therefore checked before the times are compared.
229 | 		int is_remote = (strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx) ? 1 : 0;
230 | 		if ( !is_remote && force==0 ) {
231 | 			// Common source of errors: new VCF is used with an old index
232 | 			// A missing index is not reported here; the index load below fails for it where one is needed.
233 | 			if ( stat(fnidx, &stat_tbi) == 0 && stat(argv[optind], &stat_vcf) == 0
234 | 				&& stat_vcf.st_mtime > stat_tbi.st_mtime ) {
235 | 				fprintf(stderr, "[tabix] the index file either does not exist or is older than the vcf file. Please reindex.\n");
236 | 				free(fnidx);
237 | 				return 1;
238 | 			}
239 | 		}
240 | 		free(fnidx);
241 | 
242 | 		if ((t = ti_open(argv[optind], 0)) == 0) {
243 | 			fprintf(stderr, "[main] fail to open the data file.\n");
244 | 			return 1;
245 | 		}
246 | 		if ( print_only_header ) {
247 | 			ti_iter_t iter;
248 | 			const char *s;
249 | 			int len;
250 | 			if (ti_lazy_index_load(t) < 0 && bed_reg == 0) {
251 | 				fprintf(stderr,"[tabix] failed to load the index file.\n");
252 | 				return 1;
253 | 			}
254 | 			const ti_conf_t *idxconf = ti_get_conf(t->idx);
255 | 			if (idxconf == 0) idxconf = &conf; // with -B a failed index load is tolerated; fall back like the BED branch does
256 | 			iter = ti_query(t, 0, 0, 0);
257 | 			while ((s = ti_read(t, iter, &len)) != 0) {
258 | 				if ((int)(*s) != idxconf->meta_char) break;
259 | 				fputs(s, stdout); fputc('\n', stdout);
260 | 			}
261 | 			ti_iter_destroy(iter);
262 | 			ti_close(t);
263 | 			return 0;
264 | 		}
265 | 
266 | 		if (strcmp(argv[optind+1], ".") == 0) { // retrieve all
267 | 			ti_iter_t iter;
268 | 			const char *s;
269 | 			int len;
270 | 			iter = ti_query(t, 0, 0, 0);
271 | 			while ((s = ti_read(t, iter, &len)) != 0) {
272 | 				fputs(s, stdout); fputc('\n', stdout);
273 | 			}
274 | 			ti_iter_destroy(iter);
275 | 		} else { // retrieve from specified regions
276 | 			int i, len;
277 | 			ti_iter_t iter;
278 | 			const char *s;
279 | 			const ti_conf_t *idxconf;
280 | 			if (ti_lazy_index_load(t) < 0 && bed_reg == 0) {
281 | 				fprintf(stderr,"[tabix] failed to load the index file.\n");
282 | 				return 1;
283 | 			}
284 | 			idxconf = ti_get_conf(t->idx);
285 | 			if (idxconf == 0) idxconf = &conf; // no index configuration (possible with -B): use the command-line one
286 | 			if ( print_header ) {
287 | 				// If requested, print the header lines here
288 | 				iter = ti_query(t, 0, 0, 0);
289 | 				while ((s = ti_read(t, iter, &len)) != 0) {
290 | 					if ((int)(*s) != idxconf->meta_char) break;
291 | 					fputs(s, stdout); fputc('\n', stdout);
292 | 				}
293 | 				ti_iter_destroy(iter);
294 | 			}
295 | 			if (bed_reg) {
296 | 				extern int bed_overlap(const void *_h, const char *chr, int beg, int end);
297 | 				extern void *bed_read(const char *fn);
298 | 				extern void bed_destroy(void *_h);
299 | 
300 | 				const ti_conf_t *conf_ = idxconf? idxconf : &conf; // use the index file if available
301 | 				void *bed = bed_read(argv[optind+1]); // load the BED file
302 | 				ti_interval_t intv;
303 | 
304 | 				if (bed == 0) {
305 | 					fprintf(stderr, "[main] fail to read the BED file.\n");
306 | 					return 1;
307 | 				}
308 | 				iter = ti_query(t, 0, 0, 0);
309 | 				while ((s = ti_read(t, iter, &len)) != 0) {
310 | 					int c;
311 | 					ti_get_intv(conf_, len, (char*)s, &intv);
312 | 					c = *intv.se; *intv.se = '\0'; // terminate the sequence name in place for the lookup
313 | 					if (bed_overlap(bed, intv.ss, intv.beg, intv.end)) {
314 | 						*intv.se = c;
315 | 						puts(s);
316 | 					}
317 | 					*intv.se = c;
318 | 				}
319 | 				ti_iter_destroy(iter);
320 | 				bed_destroy(bed);
321 | 			} else {
322 | 				for (i = optind + 1; i < argc; ++i) {
323 | 					int tid, beg, end;
324 | 					if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) {
325 | 						iter = ti_queryi(t, tid, beg, end);
326 | 						while ((s = ti_read(t, iter, &len)) != 0) {
327 | 							fputs(s, stdout); fputc('\n', stdout);
328 | 						}
329 | 						ti_iter_destroy(iter);
330 | 					} 
331 | 					// else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n");
332 | 				}
333 | 			}
334 | 		}
335 | 		ti_close(t);
336 | 	}
337 | 	return 0;
338 | }
339 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/python/tabixmodule.c:
--------------------------------------------------------------------------------
  1 | /*-
  2 |  * The MIT License
  3 |  *
  4 |  * Copyright (c) 2011 Seoul National University.
  5 |  *
  6 |  * Permission is hereby granted, free of charge, to any person obtaining
  7 |  * a copy of this software and associated documentation files (the
  8 |  * "Software"), to deal in the Software without restriction, including
  9 |  * without limitation the rights to use, copy, modify, merge, publish,
 10 |  * distribute, sublicense, and/or sell copies of the Software, and to
 11 |  * permit persons to whom the Software is furnished to do so, subject to
 12 |  * the following conditions:
 13 |  *
 14 |  * The above copyright notice and this permission notice shall be
 15 |  * included in all copies or substantial portions of the Software.
 16 |  *
 17 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 18 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 19 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 20 |  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 21 |  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 22 |  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 23 |  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | /*
 28 |  * Contact: Hyeshik Chang <hyeshik@snu.ac.kr>
 29 |  */
 30 | 
 31 | #define PY_SSIZE_T_CLEAN
 32 | #include "Python.h"
 33 | #include "tabix.h"
 34 | 
 35 | static PyObject *TabixError; /* exception object passed to PyErr_SetString() by the open/query functions below */
 36 | 
 37 | typedef struct { /* Python object wrapping an open tabix handle */
 38 |     PyObject_HEAD
 39 |     tabix_t *tb; /* handle returned by ti_open(); released with ti_close() in tabix_dealloc */
 40 |     char *fn; /* strdup'ed copy of the file name, shown by tabix_repr */
 41 | } TabixObject;
 42 | 
 43 | typedef struct { /* Python iterator over the records of one query */
 44 |     PyObject_HEAD
 45 |     TabixObject *tbobj; /* reference to the parent object, taken in tabixiter_create */
 46 |     ti_iter_t iter; /* underlying tabix iterator; destroyed in tabixiter_dealloc */
 47 | } TabixIteratorObject;
 48 | 
 49 | static PyTypeObject Tabix_Type, TabixIterator_Type; /* forward declarations; both are defined further down */
 50 | 
 51 | /* --- TabixIterator --------------------------------------------------- */
 52 | 
 53 | /* Wrap `iter` in a new TabixIterator object that holds a reference to `parentidx`.
 54 |  * Takes over `iter`: it is destroyed here when the object cannot be allocated. */
 55 | static PyObject *
 56 | tabixiter_create(TabixObject *parentidx, ti_iter_t iter)
 57 | {
 58 |     TabixIteratorObject *self = PyObject_New(TabixIteratorObject, &TabixIterator_Type);
 59 |     if (self == NULL) {
 60 |         ti_iter_destroy(iter); /* the query functions hand the iterator over and never free it themselves */
 61 |         return NULL;
 62 |     }
 63 |     Py_INCREF(parentidx); /* released again in tabixiter_dealloc */
 64 |     self->tbobj = parentidx;
 65 |     self->iter = iter;
 66 |     return (PyObject *)self;
 67 | }
 68 | 
 69 | static void
 70 | tabixiter_dealloc(TabixIteratorObject *self) /* installed as tp_dealloc of TabixIterator_Type */
 71 | {
 72 |     ti_iter_destroy(self->iter); /* free the underlying tabix iterator */
 73 |     Py_DECREF(self->tbobj); /* drop the reference taken in tabixiter_create */
 74 |     PyObject_Del(self); /* release the object itself */
 75 | }
 76 | 
 77 | static PyObject *tabixiter_iter(PyObject *self) /* installed as tp_iter of TabixIterator_Type */
 78 | {
 79 |     PyObject *same = self; /* the iterator is its own iterator */
 80 |     Py_INCREF(same); /* the caller receives a new reference */
 81 |     return same;
 82 | }
 83 | 
 84 | #if PY_MAJOR_VERSION < 3
 85 | # define PYOBJECT_FROM_STRING_AND_SIZE PyString_FromStringAndSize
 86 | #else
 87 | # define PYOBJECT_FROM_STRING_AND_SIZE PyUnicode_FromStringAndSize
 88 | #endif
 89 | 
 90 | /* Append the `size` bytes starting at `begin` to `list` as one string object.
 91 |  * Returns 0 on success and -1 on failure; the temporary string object is
 92 |  * released in both cases. */
 93 | static int
 94 | tabixiter_append_column(PyObject *list, const char *begin, Py_ssize_t size)
 95 | {
 96 |     PyObject *column = PYOBJECT_FROM_STRING_AND_SIZE(begin, size);
 97 |     int status;
 98 | 
 99 |     if (column == NULL)
100 |         return -1;
101 |     status = PyList_Append(list, column);
102 |     Py_DECREF(column); /* also on failure: this reference used to leak when the append failed */
103 |     return status;
104 | }
105 | 
106 | /* tp_iternext slot: read the next record with ti_read() and return it as a
107 |  * list of its tab-separated columns. Returns NULL without setting an exception
108 |  * when ti_read() returns NULL, and NULL after a failed allocation or append. */
109 | static PyObject *
110 | tabixiter_iternext(TabixIteratorObject *self)
111 | {
112 |     const char *chunk, *ptr, *begin;
113 |     PyObject *ret;
114 |     int len, i;
115 |     chunk = ti_read(self->tbobj->tb, self->iter, &len);
116 |     if (chunk == NULL)
117 |         return NULL;
118 |     ret = PyList_New(0);
119 |     if (ret == NULL)
120 |         return NULL;
121 |     ptr = begin = chunk;
122 |     for (i = len; i > 0; i--, ptr++)
123 |         if (*ptr == '\t') {
124 |             if (tabixiter_append_column(ret, begin, (Py_ssize_t)(ptr - begin)) == -1) goto fail;
125 |             begin = ptr + 1;
126 |         }
127 |     /* whatever follows the last tab (or the whole record) is the final column */
128 |     if (tabixiter_append_column(ret, begin, (Py_ssize_t)(ptr - begin)) == -1) goto fail;
129 |     return ret;
130 | fail:
131 |     Py_DECREF(ret);
132 |     return NULL;
133 | }
134 | 
135 | static PyMethodDef tabixiter_methods[] = { /* method table of TabixIterator_Type: only the terminating entry */
136 |     {NULL, NULL} /* sentinel */
137 | };
138 | 
139 | static PyTypeObject TabixIterator_Type = { /* type object for the iterators built by tabixiter_create; only dealloc, iter, iternext and methods are filled in */
140 |     PyVarObject_HEAD_INIT(NULL, 0)
141 |     "tabix.TabixIterator",      /*tp_name*/
142 |     sizeof(TabixIteratorObject), /*tp_basicsize*/
143 |     0,                          /*tp_itemsize*/
144 |     /* methods */
145 |     (destructor)tabixiter_dealloc,  /*tp_dealloc*/
146 |     0,                          /*tp_print*/
147 |     0,                          /*tp_getattr*/
148 |     0,                          /*tp_setattr*/
149 |     0,                          /*tp_compare*/
150 |     0,                          /*tp_repr*/
151 |     0,                          /*tp_as_number*/
152 |     0,                          /*tp_as_sequence*/
153 |     0,                          /*tp_as_mapping*/
154 |     0,                          /*tp_hash*/
155 |     0,                          /*tp_call*/
156 |     0,                          /*tp_str*/
157 |     0,                          /*tp_getattro*/
158 |     0,                          /*tp_setattro*/
159 |     0,                          /*tp_as_buffer*/
160 |     Py_TPFLAGS_DEFAULT,         /*tp_flags*/
161 |     0,                          /*tp_doc*/
162 |     0,                          /*tp_traverse*/
163 |     0,                          /*tp_clear*/
164 |     0,                          /*tp_richcompare*/
165 |     0,                          /*tp_weaklistoffset*/
166 |     tabixiter_iter,             /*tp_iter*/
167 |     (iternextfunc)tabixiter_iternext, /*tp_iternext*/
168 |     tabixiter_methods,          /*tp_methods*/
169 |     0,                          /*tp_members*/
170 |     0,                          /*tp_getset*/
171 |     0,                          /*tp_base*/
172 |     0,                          /*tp_dict*/
173 |     0,                          /*tp_descr_get*/
174 |     0,                          /*tp_descr_set*/
175 |     0,                          /*tp_dictoffset*/
176 |     0,                          /*tp_init*/
177 |     0,                          /*tp_alloc*/
178 |     0,                          /*tp_new*/
179 |     0,                          /*tp_free*/
180 |     0,                          /*tp_is_gc*/
181 | };
182 | 
183 | 
184 | /* --- Tabix ----------------------------------------------------------- */
185 | 
186 | /* Constructor taking fn and an optional fnidx: opens `fn` through ti_open() and keeps
187 |  * a private copy of the name for repr(). Returns the new object, or NULL with an exception set. */
188 | static PyObject *tabix_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
189 | {
190 |     TabixObject *self;
191 |     const char *fn, *fnidx=NULL;
192 |     static char *kwnames[]={"fn", "fnidx", NULL};
193 |     char *fncopy;
194 |     tabix_t *tb;
195 | 
196 |     if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|z:Tabix",
197 |                                      kwnames, &fn, &fnidx))
198 |         return NULL;
199 |     if ((tb = ti_open(fn, fnidx)) == NULL) {
200 |         PyErr_SetString(TabixError, "Can't open the index file.");
201 |         return NULL;
202 |     }
203 |     if ((fncopy = strdup(fn)) == NULL) PyErr_NoMemory(); /* the copy used to be stored unchecked */
204 |     self = fncopy ? (TabixObject *)type->tp_alloc(type, 0) : NULL;
205 |     if (self == NULL) { /* an exception is already set; release what was acquired (tb used to leak here) */
206 |         free(fncopy); ti_close(tb);
207 |         return NULL;
208 |     }
209 |     self->tb = tb;
210 |     self->fn = fncopy;
211 |     return (PyObject *)self;
212 | }
213 | 
214 | static void
215 | tabix_dealloc(TabixObject *self) /* installed as tp_dealloc of Tabix_Type */
216 | {
217 |     free(self->fn); /* the strdup'ed file name from tabix_new */
218 |     ti_close(self->tb); /* the handle opened with ti_open() in tabix_new */
219 |     PyObject_Del(self); /* release the object itself */
220 | }
221 | 
222 | /* T.query(name, begin, end): pass the three arguments to ti_query() and wrap the
223 |  * resulting iterator. Sets TabixError "query failed" when ti_query() returns NULL. */
224 | static PyObject *
225 | tabix_query(TabixObject *self, PyObject *args)
226 | {
227 |     const char *name; /* the "s" format stores a pointer to read-only data, as in tabix_querys */
228 |     int begin, end;
229 |     ti_iter_t result;
230 | 
231 |     if (!PyArg_ParseTuple(args, "sii:query", &name, &begin, &end))
232 |         return NULL;
233 |     result = ti_query(self->tb, name, begin, end);
234 |     if (result == NULL) {
235 |         PyErr_SetString(TabixError, "query failed");
236 |         return NULL;
237 |     }
238 |     return tabixiter_create(self, result);
239 | }
240 | 
241 | /* T.queryi(tid, begin, end): pass three integers to ti_queryi() and wrap the
242 |  * resulting iterator. Sets TabixError "query failed" when ti_queryi() returns NULL. */
243 | static PyObject *tabix_queryi(TabixObject *self, PyObject *args)
244 | {
245 |     int seq_id, lo, hi;
246 |     ti_iter_t it;
247 | 
248 |     if (!PyArg_ParseTuple(args, "iii:queryi", &seq_id, &lo, &hi))
249 |         return NULL;
250 | 
251 |     it = ti_queryi(self->tb, seq_id, lo, hi);
252 |     if (it != NULL)
253 |         return tabixiter_create(self, it);
254 | 
255 |     PyErr_SetString(TabixError, "query failed");
256 |     return NULL;
257 | }
258 | 
259 | /* T.querys(region): pass the region string to ti_querys() and wrap the resulting
260 |  * iterator. Sets TabixError "query failed" when ti_querys() returns NULL. */
261 | static PyObject *tabix_querys(TabixObject *self, PyObject *args)
262 | {
263 |     const char *region;
264 |     ti_iter_t it;
265 | 
266 |     if (!PyArg_ParseTuple(args, "s:querys", &region))
267 |         return NULL;
268 | 
269 |     it = ti_querys(self->tb, region);
270 |     if (it != NULL)
271 |         return tabixiter_create(self, it);
272 | 
273 |     PyErr_SetString(TabixError, "query failed");
274 |     return NULL;
275 | }
276 | 
277 | /* tp_repr slot: format the stored file name, using the Unicode formatter on Python 3 and the byte-string one before that. */
278 | static PyObject *tabix_repr(TabixObject *self)
279 | {
280 | #if PY_MAJOR_VERSION >= 3
281 |     return PyUnicode_FromFormat("<Tabix fn=\"%s\">", self->fn);
282 | #else
283 |     return PyString_FromFormat("<Tabix fn=\"%s\">", self->fn);
284 | #endif
285 | }
286 | 
287 | static PyMethodDef tabix_methods[] = {
288 |     {"query",           (PyCFunction)tabix_query, METH_VARARGS,
289 |         PyDoc_STR("T.query(name, begin, end) -> iterator")},
290 |     {"queryi",          (PyCFunction)tabix_queryi, METH_VARARGS,
291 |         PyDoc_STR("T.queryi(tid, begin, id) -> iterator")},
292 |     {"querys",          (PyCFunction)tabix_querys, METH_VARARGS,
293 |         PyDoc_STR("T.querys(region) -> iterator")},
294 |     {NULL,              NULL}           /* sentinel */
295 | };
296 | 
/*
 * Type object behind tabix.Tabix.
 *
 * Only a handful of slots are filled in: tp_dealloc (tabix_dealloc),
 * tp_repr (tabix_repr), tp_flags, tp_methods (tabix_methods) and tp_new
 * (tabix_new).  Every other slot is left 0 so that PyType_Ready(), called
 * from the module init function, can fill in inherited defaults.
 */
static PyTypeObject Tabix_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyVarObject_HEAD_INIT(NULL, 0)
    "tabix.Tabix",              /*tp_name*/
    sizeof(TabixObject),        /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    (destructor)tabix_dealloc,  /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_compare*/
    (reprfunc)tabix_repr,       /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    0,                          /*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    tabix_methods,              /*tp_methods*/
    0,                          /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    (newfunc)tabix_new,         /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};
342 | /* --------------------------------------------------------------------- */
343 | 
/*
 * Module-level function table.  It holds only the terminating sentinel:
 * the module exposes no free functions, just the objects added in the
 * module init function.
 */
static PyMethodDef tabix_functions[] = {
    {NULL, NULL} /* sentinel */
};
347 | 
348 | PyDoc_STRVAR(module_doc,
349 | "Python interface to tabix, Heng Li's generic indexer for TAB-delimited "
350 | "genome position filesThis is a template module just for instruction.");
351 | 
/*
 * Python 3 module definition, consumed by PyModule_Create() in
 * PyInit_tabix().  Python 2 builds use Py_InitModule3() instead and do not
 * compile this structure.
 */
#if PY_MAJOR_VERSION >= 3
static struct PyModuleDef tabixmodule = { 
    PyModuleDef_HEAD_INIT,
    "tabix",            /* module name */
    module_doc,         /* module docstring */
    -1,                 /* per-module state size; -1 = state kept in C globals */
    tabix_functions,    /* module-level functions (sentinel only) */
    NULL,               /* remaining optional slots are unused */
    NULL,
    NULL,
    NULL
};
#endif
365 | 
366 | #if PY_MAJOR_VERSION < 3
367 | PyMODINIT_FUNC inittabix(void)
368 | #else
369 | PyMODINIT_FUNC PyInit_tabix(void)
370 | #endif
371 | {
372 |     PyObject *m;
373 | 
374 |     if (PyType_Ready(&Tabix_Type) < 0)
375 |         goto fail;
376 |     if (PyType_Ready(&TabixIterator_Type) < 0)
377 |         goto fail;
378 | 
379 | #if PY_MAJOR_VERSION < 3
380 |     m = Py_InitModule3("tabix", tabix_functions, module_doc);
381 | #else
382 |     m = PyModule_Create(&tabixmodule);
383 | #endif
384 |     if (m == NULL)
385 |         goto fail;
386 | 
387 |     if (TabixError == NULL) {
388 |         TabixError = PyErr_NewException("tabix.error", NULL, NULL);
389 |         if (TabixError == NULL)
390 |             goto fail;
391 |     }
392 |     Py_INCREF(TabixError);
393 |     PyModule_AddObject(m, "error", TabixError);
394 | 
395 |     PyModule_AddObject(m, "Tabix", (PyObject *)&Tabix_Type);
396 |     PyModule_AddObject(m, "TabixIterator", (PyObject *)&TabixIterator_Type);
397 | 
398 | #if PY_MAJOR_VERSION >= 3
399 |     return m;
400 | #endif
401 | 
402 |  fail:
403 | #if PY_MAJOR_VERSION < 3
404 |     return;
405 | #else
406 |     return NULL;
407 | #endif
408 | }
409 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/TabixReader.java:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2010 Broad Institute.
  4 | 
  5 |    Permission is hereby granted, free of charge, to any person obtaining
  6 |    a copy of this software and associated documentation files (the
  7 |    "Software"), to deal in the Software without restriction, including
  8 |    without limitation the rights to use, copy, modify, merge, publish,
  9 |    distribute, sublicense, and/or sell copies of the Software, and to
 10 |    permit persons to whom the Software is furnished to do so, subject to
 11 |    the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be
 14 |    included in all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |    SOFTWARE.
 24 | */
 25 | 
 26 | /* Contact: Heng Li <hengli@broadinstitute.org> */
 27 | 
 28 | import net.sf.samtools.util.BlockCompressedInputStream;
 29 | 
 30 | import java.io.*;
 31 | import java.nio.*;
 32 | import java.util.HashMap;
 33 | import java.util.ArrayList;
 34 | import java.util.Arrays;
 35 | import java.lang.StringBuffer;
 36 | 
 37 | public class TabixReader
 38 | {
 39 | 	private String mFn;
 40 | 	private BlockCompressedInputStream mFp;
 41 | 
 42 | 	private int mPreset;
 43 | 	private int mSc;
 44 | 	private int mBc;
 45 | 	private int mEc;
 46 | 	private int mMeta;
 47 | 	private int mSkip;
 48 | 	private String[] mSeq;
 49 | 
 50 | 	private HashMap<String, Integer> mChr2tid;
 51 | 
 52 | 	private static int MAX_BIN = 37450;
 53 | 	private static int TAD_MIN_CHUNK_GAP = 32768;
 54 | 	private static int TAD_LIDX_SHIFT = 14;
 55 | 
 56 | 	private class TPair64 implements Comparable<TPair64> {
 57 | 		long u, v;
 58 | 		public TPair64(final long _u, final long _v) {
 59 | 			u = _u; v = _v;
 60 | 		}
 61 | 		public TPair64(final TPair64 p) {
 62 | 			u = p.u; v = p.v;
 63 | 		}
 64 | 		public int compareTo(final TPair64 p) {
 65 | 			return u == p.u? 0 : ((u < p.u) ^ (u < 0) ^ (p.u < 0))? -1 : 1; // unsigned 64-bit comparison
 66 | 		}
 67 | 	};
 68 | 
 69 | 	private class TIndex {
 70 | 		HashMap<Integer, TPair64[]> b; // binning index
 71 | 		long[] l; // linear index
 72 | 	};
 73 | 	private TIndex[] mIndex;
 74 | 
 75 | 	private class TIntv {
 76 | 		int tid, beg, end;
 77 | 	};
 78 | 
 79 | 	private static boolean less64(final long u, final long v) { // unsigned 64-bit comparison
 80 | 		return (u < v) ^ (u < 0) ^ (v < 0);
 81 | 	}
 82 | 
 83 | 	/**
 84 | 	 * The constructor
 85 | 	 *
 86 | 	 * @param fn File name of the data file
 87 | 	 */
 88 | 	public TabixReader(final String fn) throws IOException {
 89 | 		mFn = fn;
 90 | 		mFp = new BlockCompressedInputStream(new File(fn));
 91 | 		readIndex();
 92 | 	}
 93 | 
 94 | 	private static int reg2bins(final int beg, final int _end, final int[] list) {
 95 | 		int i = 0, k, end = _end;
 96 | 		if (beg >= end) return 0;
 97 | 		if (end >= 1<<29) end = 1<<29;
 98 | 		--end;
 99 | 		list[i++] = 0;
100 | 		for (k =    1 + (beg>>26); k <=    1 + (end>>26); ++k) list[i++] = k;
101 | 		for (k =    9 + (beg>>23); k <=    9 + (end>>23); ++k) list[i++] = k;
102 | 		for (k =   73 + (beg>>20); k <=   73 + (end>>20); ++k) list[i++] = k;
103 | 		for (k =  585 + (beg>>17); k <=  585 + (end>>17); ++k) list[i++] = k;
104 | 		for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
105 | 		return i;
106 | 	}
107 | 
108 | 	public static int readInt(final InputStream is) throws IOException {
109 | 		byte[] buf = new byte[4];
110 | 		is.read(buf);
111 | 		return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getInt();
112 | 	}
113 | 
114 | 	public static long readLong(final InputStream is) throws IOException {
115 | 		byte[] buf = new byte[8];
116 | 		is.read(buf);
117 | 		return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getLong();
118 | 	}
119 | 
120 | 	public static String readLine(final InputStream is) throws IOException {
121 | 		StringBuffer buf = new StringBuffer();
122 | 		int c;
123 | 		while ((c = is.read()) >= 0 && c != '\n')
124 | 			buf.append((char)c);
125 | 		if (c < 0) return null;
126 | 		return buf.toString();
127 | 	}
128 | 
129 | 	/**
130 | 	 * Read the Tabix index from a file
131 | 	 *
132 | 	 * @param fp File pointer
133 | 	 */
134 | 	public void readIndex(final File fp) throws IOException {
135 | 		if (fp == null) return;
136 | 		BlockCompressedInputStream is = new BlockCompressedInputStream(fp);
137 | 		byte[] buf = new byte[4];
138 | 
139 | 		is.read(buf, 0, 4); // read "TBI\1"
140 | 		mSeq = new String[readInt(is)]; // # sequences
141 | 		mChr2tid = new HashMap<String, Integer>();
142 | 		mPreset = readInt(is);
143 | 		mSc = readInt(is);
144 | 		mBc = readInt(is);
145 | 		mEc = readInt(is);
146 | 		mMeta = readInt(is);
147 | 		mSkip = readInt(is);
148 | 		// read sequence dictionary
149 | 		int i, j, k, l = readInt(is);
150 | 		buf = new byte[l];
151 | 		is.read(buf);
152 | 		for (i = j = k = 0; i < buf.length; ++i) {
153 | 			if (buf[i] == 0) {
154 | 				byte[] b = new byte[i - j];
155 | 				System.arraycopy(buf, j, b, 0, b.length);
156 | 				String s = new String(b);
157 | 				mChr2tid.put(s, k);
158 | 				mSeq[k++] = s;
159 | 				j = i + 1;
160 | 			}
161 | 		}
162 | 		// read the index
163 | 		mIndex = new TIndex[mSeq.length];
164 | 		for (i = 0; i < mSeq.length; ++i) {
165 | 			// the binning index
166 | 			int n_bin = readInt(is);
167 | 			mIndex[i] = new TIndex();
168 | 			mIndex[i].b = new HashMap<Integer, TPair64[]>();
169 | 			for (j = 0; j < n_bin; ++j) {
170 | 				int bin = readInt(is);
171 | 				TPair64[] chunks = new TPair64[readInt(is)];
172 | 				for (k = 0; k < chunks.length; ++k) {
173 | 					long u = readLong(is);
174 | 					long v = readLong(is);
175 | 					chunks[k] = new TPair64(u, v); // in C, this is inefficient
176 | 				}
177 | 				mIndex[i].b.put(bin, chunks);
178 | 			}
179 | 			// the linear index
180 | 			mIndex[i].l = new long[readInt(is)];
181 | 			for (k = 0; k < mIndex[i].l.length; ++k)
182 | 				mIndex[i].l[k] = readLong(is);
183 | 		}
184 | 		// close
185 | 		is.close();
186 | 	}
187 | 
188 | 	/**
189 | 	 * Read the Tabix index from the default file.
190 | 	 */
191 | 	public void readIndex() throws IOException {
192 | 		readIndex(new File(mFn + ".tbi"));
193 | 	}
194 | 
195 | 	/**
196 | 	 * Read one line from the data file.
197 | 	 */
198 | 	public String readLine() throws IOException {
199 | 		return readLine(mFp);
200 | 	}
201 | 
202 | 	private int chr2tid(final String chr) {
203 | 		if (mChr2tid.containsKey(chr)) return mChr2tid.get(chr);
204 | 		else return -1;
205 | 	}
206 | 
207 | 	/**
208 | 	 * Parse a region in the format of "chr1", "chr1:100" or "chr1:100-1000"
209 | 	 *
210 | 	 * @param reg Region string
211 | 	 * @return An array where the three elements are sequence_id,
212 | 	 *         region_begin and region_end. On failure, sequence_id==-1.
213 | 	 */
214 | 	public int[] parseReg(final String reg) { // FIXME: NOT working when the sequence name contains : or -.
215 | 		String chr;
216 | 		int colon, hyphen;
217 | 		int[] ret = new int[3];
218 | 		colon = reg.indexOf(':'); hyphen = reg.indexOf('-');
219 | 		chr = colon >= 0? reg.substring(0, colon) : reg;
220 | 		ret[1] = colon >= 0? Integer.parseInt(reg.substring(colon+1, hyphen >= 0? hyphen : reg.length())) - 1 : 0;
221 | 		ret[2] = hyphen >= 0? Integer.parseInt(reg.substring(hyphen+1)) : 0x7fffffff;
222 | 		ret[0] = chr2tid(chr);
223 | 		return ret;
224 | 	}
225 | 
226 | 	private TIntv getIntv(final String s) {
227 | 		TIntv intv = new TIntv();
228 | 		int col = 0, end = 0, beg = 0;
229 | 		while ((end = s.indexOf('\t', beg)) >= 0 || end == -1) {
230 | 			++col;
231 | 			if (col == mSc) {
232 | 				intv.tid = chr2tid(s.substring(beg, end));
233 | 			} else if (col == mBc) {
234 | 				intv.beg = intv.end = Integer.parseInt(s.substring(beg, end==-1?s.length():end));
235 | 				if ((mPreset&0x10000) != 0) ++intv.end;
236 | 				else --intv.beg;
237 | 				if (intv.beg < 0) intv.beg = 0;
238 | 				if (intv.end < 1) intv.end = 1;
239 | 			} else { // FIXME: SAM supports are not tested yet
240 | 				if ((mPreset&0xffff) == 0) { // generic
241 | 					if (col == mEc)
242 | 						intv.end = Integer.parseInt(s.substring(beg, end));
243 | 				} else if ((mPreset&0xffff) == 1) { // SAM
244 | 					if (col == 6) { // CIGAR
245 | 						int l = 0, i, j;
246 | 						String cigar = s.substring(beg, end);
247 | 						for (i = j = 0; i < cigar.length(); ++i) {
248 | 							if (cigar.charAt(i) > '9') {
249 | 								int op = cigar.charAt(i);
250 | 								if (op == 'M' || op == 'D' || op == 'N')
251 | 									l += Integer.parseInt(cigar.substring(j, i));
252 | 							}
253 | 						}
254 | 						intv.end = intv.beg + l;
255 | 					}
256 | 				} else if ((mPreset&0xffff) == 2) { // VCF
257 | 					String alt;
258 | 					alt = end >= 0? s.substring(beg, end) : s.substring(beg);
259 | 					if (col == 4) { // REF
260 | 						if (alt.length() > 0) intv.end = intv.beg + alt.length();
261 | 					} else if (col == 8) { // INFO
262 | 						int e_off = -1, i = alt.indexOf("END=");
263 | 						if (i == 0) e_off = 4;
264 | 						else if (i > 0) {
265 | 							i = alt.indexOf(";END=");
266 | 							if (i >= 0) e_off = i + 5;
267 | 						}
268 | 						if (e_off > 0) {
269 | 							i = alt.indexOf(";", e_off);
270 | 							intv.end = Integer.parseInt(i > e_off? alt.substring(e_off, i) : alt.substring(e_off));
271 | 						}
272 | 					}
273 | 				}
274 | 			}
275 | 			if (end == -1) break;
276 | 			beg = end + 1;
277 | 		}
278 | 		return intv;
279 | 	}
280 | 
281 | 	public class Iterator {
282 | 		private int i, n_seeks;
283 | 		private int tid, beg, end;
284 | 		private TPair64[] off;
285 | 		private long curr_off;
286 | 		private boolean iseof;
287 | 
288 | 		public Iterator(final int _tid, final int _beg, final int _end, final TPair64[] _off) {
289 | 			i = -1; n_seeks = 0; curr_off = 0; iseof = false;
290 | 			off = _off; tid = _tid; beg = _beg; end = _end;
291 | 		}
292 | 
293 | 		public String next() throws IOException {
294 | 			if (iseof) return null;
295 | 			for (;;) {
296 | 				if (curr_off == 0 || !less64(curr_off, off[i].v)) { // then jump to the next chunk
297 | 					if (i == off.length - 1) break; // no more chunks
298 | 					if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
299 | 					if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
300 | 						mFp.seek(off[i+1].u);
301 | 						curr_off = mFp.getFilePointer();
302 | 						++n_seeks;
303 | 					}
304 | 					++i;
305 | 				}
306 | 				String s;
307 | 				if ((s = readLine(mFp)) != null) {
308 | 					TIntv intv;
309 | 					char[] str = s.toCharArray();
310 | 					curr_off = mFp.getFilePointer();
311 | 					if (str.length == 0 || str[0] == mMeta) continue;
312 | 					intv = getIntv(s);
313 | 					if (intv.tid != tid || intv.beg >= end) break; // no need to proceed
314 | 					else if (intv.end > beg && intv.beg < end) return s; // overlap; return
315 | 				} else break; // end of file
316 | 			}
317 | 			iseof = true;
318 | 			return null;
319 | 		}
320 | 	};
321 | 
322 | 	public Iterator query(final int tid, final int beg, final int end) {
323 | 		TPair64[] off, chunks;
324 | 		long min_off;
325 | 		TIndex idx = mIndex[tid];
326 | 		int[] bins = new int[MAX_BIN];
327 | 		int i, l, n_off, n_bins = reg2bins(beg, end, bins);
328 | 		if (idx.l.length > 0)
329 | 			min_off = (beg>>TAD_LIDX_SHIFT >= idx.l.length)? idx.l[idx.l.length-1] : idx.l[beg>>TAD_LIDX_SHIFT];
330 | 		else min_off = 0;
331 | 		for (i = n_off = 0; i < n_bins; ++i) {
332 | 			if ((chunks = idx.b.get(bins[i])) != null)
333 | 				n_off += chunks.length;
334 | 		}
335 | 		if (n_off == 0) return null;
336 | 		off = new TPair64[n_off];
337 | 		for (i = n_off = 0; i < n_bins; ++i)
338 | 			if ((chunks = idx.b.get(bins[i])) != null)
339 | 				for (int j = 0; j < chunks.length; ++j)
340 | 					if (less64(min_off, chunks[j].v))
341 | 						off[n_off++] = new TPair64(chunks[j]);
342 | 		if (n_off == 0) return null;
343 | 		Arrays.sort(off, 0, n_off);
344 | 		// resolve completely contained adjacent blocks
345 | 		for (i = 1, l = 0; i < n_off; ++i) {
346 | 			if (less64(off[l].v, off[i].v)) {
347 | 				++l;
348 | 				off[l].u = off[i].u; off[l].v = off[i].v;
349 | 			}
350 | 		}
351 | 		n_off = l + 1;
352 | 		// resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
353 | 		for (i = 1; i < n_off; ++i)
354 | 			if (!less64(off[i-1].v, off[i].u)) off[i-1].v = off[i].u;
355 | 		// merge adjacent blocks
356 | 		for (i = 1, l = 0; i < n_off; ++i) {
357 | 			if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
358 | 			else {
359 | 				++l;
360 | 				off[l].u = off[i].u;
361 | 				off[l].v = off[i].v;
362 | 			}
363 | 		}
364 | 		n_off = l + 1;
365 | 		// return
366 | 		TPair64[] ret = new TPair64[n_off];
367 | 		for (i = 0; i < n_off; ++i) ret[i] = new TPair64(off[i].u, off[i].v); // in C, this is inefficient
368 | 		return new TabixReader.Iterator(tid, beg, end, ret);
369 | 	}
370 | 	
371 | 	public Iterator query(final String reg) {
372 | 		int[] x = parseReg(reg);
373 | 		return query(x[0], x[1], x[2]);
374 | 	}
375 | 
376 | 	public static void main(String[] args) {
377 | 		if (args.length < 1) {
378 | 			System.out.println("Usage: java -cp .:sam.jar TabixReader <in.gz> [region]");
379 | 			System.exit(1);
380 | 		}
381 | 		try {
382 | 			TabixReader tr = new TabixReader(args[0]);
383 | 			String s;
384 | 			if (args.length == 1) { // no region is specified; print the whole file
385 | 				while ((s = tr.readLine()) != null)
386 | 					System.out.println(s);
387 | 			} else { // a region is specified; random access
388 | 				TabixReader.Iterator iter = tr.query(args[1]); // get the iterator
389 | 				while (iter != null && (s = iter.next()) != null)
390 | 					System.out.println(s);
391 | 			}
392 | 		} catch (IOException e) {
393 | 		}
394 | 	}
395 | }
396 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/khash.h:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2008 Genome Research Ltd (GRL).
  4 | 
  5 |    Permission is hereby granted, free of charge, to any person obtaining
  6 |    a copy of this software and associated documentation files (the
  7 |    "Software"), to deal in the Software without restriction, including
  8 |    without limitation the rights to use, copy, modify, merge, publish,
  9 |    distribute, sublicense, and/or sell copies of the Software, and to
 10 |    permit persons to whom the Software is furnished to do so, subject to
 11 |    the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be
 14 |    included in all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |    SOFTWARE.
 24 | */
 25 | 
 26 | /* Contact: Heng Li <lh3@sanger.ac.uk> */
 27 | 
 28 | /*
 29 |   An example:
 30 | 
 31 | #include "khash.h"
 32 | KHASH_MAP_INIT_INT(32, char)
 33 | int main() {
 34 | 	int ret, is_missing;
 35 | 	khiter_t k;
 36 | 	khash_t(32) *h = kh_init(32);
 37 | 	k = kh_put(32, h, 5, &ret);
 38 | 	if (!ret) kh_del(32, h, k);
 39 | 	kh_value(h, k) = 10;
 40 | 	k = kh_get(32, h, 10);
 41 | 	is_missing = (k == kh_end(h));
 42 | 	k = kh_get(32, h, 5);
 43 | 	kh_del(32, h, k);
 44 | 	for (k = kh_begin(h); k != kh_end(h); ++k)
 45 | 		if (kh_exist(h, k)) kh_value(h, k) = 1;
 46 | 	kh_destroy(32, h);
 47 | 	return 0;
 48 | }
 49 | */
 50 | 
 51 | /*
 52 |   2008-09-19 (0.2.3):
 53 | 
 54 | 	* Corrected the example
 55 | 	* Improved interfaces
 56 | 
 57 |   2008-09-11 (0.2.2):
 58 | 
 59 | 	* Improved speed a little in kh_put()
 60 | 
 61 |   2008-09-10 (0.2.1):
 62 | 
 63 | 	* Added kh_clear()
 64 | 	* Fixed a compiling error
 65 | 
 66 |   2008-09-02 (0.2.0):
 67 | 
 68 | 	* Changed to token concatenation which increases flexibility.
 69 | 
 70 |   2008-08-31 (0.1.2):
 71 | 
 72 | 	* Fixed a bug in kh_get(), which has not been tested previously.
 73 | 
 74 |   2008-08-31 (0.1.1):
 75 | 
 76 | 	* Added destructor
 77 | */
 78 | 
 79 | 
 80 | #ifndef __AC_KHASH_H
 81 | #define __AC_KHASH_H
 82 | 
 83 | /*!
 84 |   @header
 85 | 
 86 |   Generic hash table library.
 87 | 
 88 |   @copyright Heng Li
 89 |  */
 90 | 
 91 | #define AC_VERSION_KHASH_H "0.2.2"
 92 | 
 93 | #include <stdint.h>
 94 | #include <stdlib.h>
 95 | #include <string.h>
 96 | 
/* Unsigned 32-bit type used for bucket counts, bucket indices and hash values. */
typedef uint32_t khint_t;
/* Iterator type: an iterator is simply a bucket index. */
typedef khint_t khiter_t;

/*
 * Table of permitted bucket counts.  kh_resize_<name>() always rounds the
 * requested size to an entry of this list, so the number of buckets is
 * never an arbitrary value.  Each entry is roughly double the previous one;
 * entry 0 (zero buckets) corresponds to a freshly initialised table.
 */
#define __ac_HASH_PRIME_SIZE 32
static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
{
  0ul,          3ul,          11ul,         23ul,         53ul,
  97ul,         193ul,        389ul,        769ul,        1543ul,
  3079ul,       6151ul,       12289ul,      24593ul,      49157ul,
  98317ul,      196613ul,     393241ul,     786433ul,     1572869ul,
  3145739ul,    6291469ul,    12582917ul,   25165843ul,   50331653ul,
  100663319ul,  201326611ul,  402653189ul,  805306457ul,  1610612741ul,
  3221225473ul, 4294967291ul
};
111 | 
/*
 * Per-bucket state flags.
 *
 * Every bucket owns two bits in the flag array, sixteen buckets per 32-bit
 * word: bucket i lives in word i>>4 at bit offset (i&0xf)<<1.  The low bit
 * of the pair means "deleted", the high bit means "empty (never used)";
 * both bits clear means the bucket holds a live key.
 *
 * Both macro arguments are fully parenthesised so that expressions such as
 * `base + 1` or `p->flags` can be passed safely; without the parentheses
 * `i>>4` would bind tighter than an addition inside the argument.
 */
#define __ac_isempty(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&2)
#define __ac_isdel(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&1)
#define __ac_iseither(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&3)
#define __ac_set_isdel_false(flag, i) ((flag)[(i)>>4]&=~(1ul<<(((i)&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) ((flag)[(i)>>4]&=~(2ul<<(((i)&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) ((flag)[(i)>>4]&=~(3ul<<(((i)&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) ((flag)[(i)>>4]|=1ul<<(((i)&0xfU)<<1))

/* Load factor: upper_bound is computed as n_buckets * __ac_HASH_UPPER + 0.5. */
static const double __ac_HASH_UPPER = 0.77;
121 | 
/*
 * KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
 *
 * Instantiates a hash table type kh_<name>_t together with its static
 * inline functions.  khkey_t/khval_t are the key and value types, kh_is_map
 * is non-zero when values are stored, and __hash_func/__hash_equal are the
 * hashing and key-equality macros or functions.
 *
 *   kh_init_<name>()      allocates a zeroed table (no buckets) with calloc.
 *   kh_destroy_<name>(h)  frees keys, flags, vals and the table; h may be NULL.
 *   kh_clear_<name>(h)    marks every bucket empty (flag bytes 0xaa) and
 *                         resets size and n_occupied; storage is kept.
 *   kh_get_<name>(h, key) probes from k % n_buckets with step
 *                         1 + k % (n_buckets - 1); returns the bucket index,
 *                         or n_buckets when the key is absent.  A table
 *                         without buckets returns 0, which also equals its
 *                         n_buckets.
 *   kh_resize_<name>(h,n) picks the smallest __ac_prime_list entry greater
 *                         than n and rehashes in place; it does nothing when
 *                         the current size would reach the load limit of the
 *                         new bucket count.
 *   kh_put_<name>(h, key, ret)
 *                         resizes first when n_occupied reached upper_bound,
 *                         then returns the bucket for key.  *ret is 1 when a
 *                         never-used bucket was taken, 2 when a deleted
 *                         bucket was reused, 0 when the key was already
 *                         present.
 *   kh_del_<name>(h, x)   marks a live bucket as deleted and decrements
 *                         size; n_occupied is left unchanged.
 *
 * NOTE(review): the results of malloc/realloc in kh_resize_<name>() are not
 * checked, and a request of 4294967291 or more makes it read
 * __ac_prime_list[t+1] one past the end of the table.
 */
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
	typedef struct {													\
		khint_t n_buckets, size, n_occupied, upper_bound;				\
		uint32_t *flags;												\
		khkey_t *keys;													\
		khval_t *vals;													\
	} kh_##name##_t;													\
	static inline kh_##name##_t *kh_init_##name() {						\
		return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t));		\
	}																	\
	static inline void kh_destroy_##name(kh_##name##_t *h)				\
	{																	\
		if (h) {														\
			free(h->keys); free(h->flags);								\
			free(h->vals);												\
			free(h);													\
		}																\
	}																	\
	static inline void kh_clear_##name(kh_##name##_t *h)				\
	{																	\
		if (h && h->flags) {											\
			memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \
			h->size = h->n_occupied = 0;								\
		}																\
	}																	\
	static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
	{																	\
		if (h->n_buckets) {												\
			khint_t inc, k, i, last;									\
			k = __hash_func(key); i = k % h->n_buckets;					\
			inc = 1 + k % (h->n_buckets - 1); last = i;					\
			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
				if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
				else i += inc;											\
				if (i == last) return h->n_buckets;						\
			}															\
			return __ac_iseither(h->flags, i)? h->n_buckets : i;		\
		} else return 0;												\
	}																	\
	static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
	{																	\
		uint32_t *new_flags = 0;										\
		khint_t j = 1;													\
		{																\
			khint_t t = __ac_HASH_PRIME_SIZE - 1;						\
			while (__ac_prime_list[t] > new_n_buckets) --t;				\
			new_n_buckets = __ac_prime_list[t+1];						\
			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	\
			else {														\
				new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t));	\
				memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
				if (h->n_buckets < new_n_buckets) {						\
					h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
					if (kh_is_map)										\
						h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
				}														\
			}															\
		}																\
		if (j) {														\
			for (j = 0; j != h->n_buckets; ++j) {						\
				if (__ac_iseither(h->flags, j) == 0) {					\
					khkey_t key = h->keys[j];							\
					khval_t val;										\
					if (kh_is_map) val = h->vals[j];					\
					__ac_set_isdel_true(h->flags, j);					\
					while (1) {											\
						khint_t inc, k, i;								\
						k = __hash_func(key);							\
						i = k % new_n_buckets;							\
						inc = 1 + k % (new_n_buckets - 1);				\
						while (!__ac_isempty(new_flags, i)) {			\
							if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
							else i += inc;								\
						}												\
						__ac_set_isempty_false(new_flags, i);			\
						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
							__ac_set_isdel_true(h->flags, i);			\
						} else {										\
							h->keys[i] = key;							\
							if (kh_is_map) h->vals[i] = val;			\
							break;										\
						}												\
					}													\
				}														\
			}															\
			if (h->n_buckets > new_n_buckets) {							\
				h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
				if (kh_is_map)											\
					h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
			}															\
			free(h->flags);												\
			h->flags = new_flags;										\
			h->n_buckets = new_n_buckets;								\
			h->n_occupied = h->size;									\
			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
		}																\
	}																	\
	static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
	{																	\
		khint_t x;														\
		if (h->n_occupied >= h->upper_bound) {							\
			if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
			else kh_resize_##name(h, h->n_buckets + 1);					\
		}																\
		{																\
			khint_t inc, k, i, site, last;								\
			x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
			if (__ac_isempty(h->flags, i)) x = i;						\
			else {														\
				inc = 1 + k % (h->n_buckets - 1); last = i;				\
				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
					if (__ac_isdel(h->flags, i)) site = i;				\
					if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
					else i += inc;										\
					if (i == last) { x = site; break; }					\
				}														\
				if (x == h->n_buckets) {								\
					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
					else x = i;											\
				}														\
			}															\
		}																\
		if (__ac_isempty(h->flags, x)) {								\
			h->keys[x] = key;											\
			__ac_set_isboth_false(h->flags, x);							\
			++h->size; ++h->n_occupied;									\
			*ret = 1;													\
		} else if (__ac_isdel(h->flags, x)) {							\
			h->keys[x] = key;											\
			__ac_set_isboth_false(h->flags, x);							\
			++h->size;													\
			*ret = 2;													\
		} else *ret = 0;												\
		return x;														\
	}																	\
	static inline void kh_del_##name(kh_##name##_t *h, khint_t x)		\
	{																	\
		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {			\
			__ac_set_isdel_true(h->flags, x);							\
			--h->size;													\
		}																\
	}
266 | 
/* --- BEGIN OF HASH FUNCTIONS --- */

/*! @function
  @abstract     Integer hash function: the key itself, cast to 32 bits
  @param  key   The integer [uint32_t]
  @return       The hash value [khint_t]
 */
#define kh_int_hash_func(key) (uint32_t)(key)
/*! @function
  @abstract     Integer comparison function
  @param  a     First key
  @param  b     Second key
  @return       Non-zero if a == b
 */
#define kh_int_hash_equal(a, b) ((a) == (b))
/*! @function
  @abstract     64-bit integer hash function: XOR of key>>33, key and
                key<<11, truncated to 32 bits
  @param  key   The integer [uint64_t]
  @return       The hash value [khint_t]
 */
#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11)
/*! @function
  @abstract     64-bit integer comparison function
  @param  a     First key
  @param  b     Second key
  @return       Non-zero if a == b
 */
#define kh_int64_hash_equal(a, b) ((a) == (b))
289 | /*! @function
290 |   @abstract     const char* hash function
291 |   @param  s     Pointer to a null terminated string
292 |   @return       The hash value
293 |  */
294 | static inline khint_t __ac_X31_hash_string(const char *s)
295 | {
296 | 	khint_t h = *s;
297 | 	if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
298 | 	return h;
299 | }
/*! @function
  @abstract     Another interface to const char* hash function; forwards
                to __ac_X31_hash_string()
  @param  key   Pointer to a null terminated string [const char*]
  @return       The hash value [khint_t]
 */
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
/*! @function
  @abstract     Const char* comparison function
  @param  a     First null terminated string
  @param  b     Second null terminated string
  @return       Non-zero if strcmp() reports the two strings as equal
 */
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
310 | 
311 | /* --- END OF HASH FUNCTIONS --- */
312 | 
313 | /* Other necessary macros... */
314 | 
315 | /*!
316 |   @abstract Type of the hash table.
317 |   @param  name  Name of the hash table [symbol]
318 |  */
319 | #define khash_t(name) kh_##name##_t
320 | 
321 | /*! @function
322 |   @abstract     Initiate a hash table.
323 |   @param  name  Name of the hash table [symbol]
324 |   @return       Pointer to the hash table [khash_t(name)*]
325 |  */
326 | #define kh_init(name) kh_init_##name()
327 | 
328 | /*! @function
329 |   @abstract     Destroy a hash table.
330 |   @param  name  Name of the hash table [symbol]
331 |   @param  h     Pointer to the hash table [khash_t(name)*]
332 |  */
333 | #define kh_destroy(name, h) kh_destroy_##name(h)
334 | 
335 | /*! @function
336 |   @abstract     Reset a hash table without deallocating memory.
337 |   @param  name  Name of the hash table [symbol]
338 |   @param  h     Pointer to the hash table [khash_t(name)*]
339 |  */
340 | #define kh_clear(name, h) kh_clear_##name(h)
341 | 
342 | /*! @function
343 |   @abstract     Resize a hash table.
344 |   @param  name  Name of the hash table [symbol]
345 |   @param  h     Pointer to the hash table [khash_t(name)*]
346 |   @param  s     New size [khint_t]
347 |  */
348 | #define kh_resize(name, h, s) kh_resize_##name(h, s)
349 | 
350 | /*! @function
351 |   @abstract     Insert a key to the hash table.
352 |   @param  name  Name of the hash table [symbol]
353 |   @param  h     Pointer to the hash table [khash_t(name)*]
354 |   @param  k     Key [type of keys]
355 |   @param  r     Extra return code: 0 if the key is present in the hash table;
356 |                 1 if the bucket is empty (never used); 2 if the element in
357 | 				the bucket has been deleted [int*]
358 |   @return       Iterator to the inserted element [khint_t]
359 |  */
360 | #define kh_put(name, h, k, r) kh_put_##name(h, k, r)
361 | 
362 | /*! @function
363 |   @abstract     Retrieve a key from the hash table.
364 |   @param  name  Name of the hash table [symbol]
365 |   @param  h     Pointer to the hash table [khash_t(name)*]
366 |   @param  k     Key [type of keys]
367 |   @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
368 |  */
369 | #define kh_get(name, h, k) kh_get_##name(h, k)
370 | 
371 | /*! @function
372 |   @abstract     Remove a key from the hash table.
373 |   @param  name  Name of the hash table [symbol]
374 |   @param  h     Pointer to the hash table [khash_t(name)*]
375 |   @param  k     Iterator to the element to be deleted [khint_t]
376 |  */
377 | #define kh_del(name, h, k) kh_del_##name(h, k)
378 | 
379 | 
380 | /*! @function
381 |   @abstract     Test whether a bucket contains data.
382 |   @param  h     Pointer to the hash table [khash_t(name)*]
383 |   @param  x     Iterator to the bucket [khint_t]
384 |   @return       1 if containing data; 0 otherwise [int]
385 |  */
386 | #define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
387 | 
388 | /*! @function
389 |   @abstract     Get key given an iterator
390 |   @param  h     Pointer to the hash table [khash_t(name)*]
391 |   @param  x     Iterator to the bucket [khint_t]
392 |   @return       Key [type of keys]
393 |  */
394 | #define kh_key(h, x) ((h)->keys[x])
395 | 
396 | /*! @function
397 |   @abstract     Get value given an iterator
398 |   @param  h     Pointer to the hash table [khash_t(name)*]
399 |   @param  x     Iterator to the bucket [khint_t]
400 |   @return       Value [type of values]
401 |   @discussion   For hash sets, calling this results in segfault.
402 |  */
403 | #define kh_val(h, x) ((h)->vals[x])
404 | 
405 | /*! @function
406 |   @abstract     Alias of kh_val()
407 |  */
408 | #define kh_value(h, x) ((h)->vals[x])
409 | 
410 | /*! @function
411 |   @abstract     Get the start iterator
412 |   @param  h     Pointer to the hash table [khash_t(name)*]
413 |   @return       The start iterator [khint_t]
414 |  */
415 | #define kh_begin(h) (khint_t)(0)
416 | 
417 | /*! @function
418 |   @abstract     Get the end iterator
419 |   @param  h     Pointer to the hash table [khash_t(name)*]
420 |   @return       The end iterator [khint_t]
421 |  */
422 | #define kh_end(h) ((h)->n_buckets)
423 | 
424 | /*! @function
425 |   @abstract     Get the number of elements in the hash table
426 |   @param  h     Pointer to the hash table [khash_t(name)*]
427 |   @return       Number of elements in the hash table [khint_t]
428 |  */
429 | #define kh_size(h) ((h)->size)
430 | 
431 | /*! @function
432 |   @abstract     Get the number of buckets in the hash table
433 |   @param  h     Pointer to the hash table [khash_t(name)*]
434 |   @return       Number of buckets in the hash table [khint_t]
435 |  */
436 | #define kh_n_buckets(h) ((h)->n_buckets)
437 | 
438 | /* More convenient interfaces */
439 | 
440 | /*! @function
441 |   @abstract     Instantiate a hash set containing integer keys
442 |   @param  name  Name of the hash table [symbol]
443 |  */
444 | #define KHASH_SET_INIT_INT(name)										\
445 | 	KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
446 | 
447 | /*! @function
448 |   @abstract     Instantiate a hash map containing integer keys
449 |   @param  name  Name of the hash table [symbol]
450 |   @param  khval_t  Type of values [type]
451 |  */
452 | #define KHASH_MAP_INIT_INT(name, khval_t)								\
453 | 	KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
454 | 
455 | /*! @function
456 |   @abstract     Instantiate a hash set containing 64-bit integer keys
457 |   @param  name  Name of the hash table [symbol]
458 |  */
459 | #define KHASH_SET_INIT_INT64(name)										\
460 | 	KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
461 | 
462 | /*! @function
463 |   @abstract     Instantiate a hash map containing 64-bit integer keys
464 |   @param  name  Name of the hash table [symbol]
465 |   @param  khval_t  Type of values [type]
466 |  */
467 | #define KHASH_MAP_INIT_INT64(name, khval_t)								\
468 | 	KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
469 | 
470 | typedef const char *kh_cstr_t;
471 | /*! @function
472 |   @abstract     Instantiate a hash set containing const char* keys
473 |   @param  name  Name of the hash table [symbol]
474 |  */
475 | #define KHASH_SET_INIT_STR(name)										\
476 | 	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
477 | 
478 | /*! @function
479 |   @abstract     Instantiate a hash map containing const char* keys
480 |   @param  name  Name of the hash table [symbol]
481 |   @param  khval_t  Type of values [type]
482 |  */
483 | #define KHASH_MAP_INIT_STR(name, khval_t)								\
484 | 	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
485 | 
486 | #endif /* __AC_KHASH_H */
487 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/bgzf.c:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
  4 |                  2011 Attractive Chaos <attractor@live.co.uk>
  5 | 
  6 |    Permission is hereby granted, free of charge, to any person obtaining a copy
  7 |    of this software and associated documentation files (the "Software"), to deal
  8 |    in the Software without restriction, including without limitation the rights
  9 |    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 |    copies of the Software, and to permit persons to whom the Software is
 11 |    furnished to do so, subject to the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be included in
 14 |    all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 |    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 |    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 |    THE SOFTWARE.
 23 | */
 24 | 
 25 | #include <stdio.h>
 26 | #include <stdlib.h>
 27 | #include <string.h>
 28 | #include <unistd.h>
 29 | #include <assert.h>
 30 | #include <sys/types.h>
 31 | #include "bgzf.h"
 32 | 
 33 | #ifdef _USE_KNETFILE
 34 | #include "knetfile.h"
 35 | typedef knetFile *_bgzf_file_t;
 36 | #define _bgzf_open(fn, mode) knet_open(fn, mode)
 37 | #define _bgzf_dopen(fp, mode) knet_dopen(fp, mode)
 38 | #define _bgzf_close(fp) knet_close(fp)
 39 | #define _bgzf_fileno(fp) ((fp)->fd)
 40 | #define _bgzf_tell(fp) knet_tell(fp)
 41 | #define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence)
 42 | #define _bgzf_read(fp, buf, len) knet_read(fp, buf, len)
 43 | #define _bgzf_write(fp, buf, len) knet_write(fp, buf, len)
 44 | #else // ~defined(_USE_KNETFILE)
 45 | #if defined(_WIN32) || defined(_MSC_VER)
 46 | #define ftello(fp) ftell(fp)
 47 | #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
 48 | #else // ~defined(_WIN32)
 49 | extern off_t ftello(FILE *stream);
 50 | extern int fseeko(FILE *stream, off_t offset, int whence);
 51 | #endif // ~defined(_WIN32)
 52 | typedef FILE *_bgzf_file_t;
 53 | #define _bgzf_open(fn, mode) fopen(fn, mode)
 54 | #define _bgzf_dopen(fp, mode) fdopen(fp, mode)
 55 | #define _bgzf_close(fp) fclose(fp)
 56 | #define _bgzf_fileno(fp) fileno(fp)
 57 | #define _bgzf_tell(fp) ftello(fp)
 58 | #define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence)
 59 | #define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp)
 60 | #define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp)
 61 | #endif // ~define(_USE_KNETFILE)
 62 | 
 63 | #define BLOCK_HEADER_LENGTH 18
 64 | #define BLOCK_FOOTER_LENGTH 8
 65 | 
 66 | /* BGZF/GZIP header (specialized from RFC 1952; little endian):
 67 |  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
 68 |  | 31|139|  8|  4|              0|  0|255|      6| 66| 67|      2|BLK_LEN|
 69 |  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
 70 | */
 71 | static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
 72 | 
 73 | #ifdef BGZF_CACHE
 74 | typedef struct {
 75 | 	int size;
 76 | 	uint8_t *block;
 77 | 	int64_t end_offset;
 78 | } cache_t;
 79 | #include "khash.h"
 80 | KHASH_MAP_INIT_INT64(cache, cache_t)
 81 | #endif
 82 | 
 83 | static inline void packInt16(uint8_t *buffer, uint16_t value)
 84 | {
 85 | 	buffer[0] = value;
 86 | 	buffer[1] = value >> 8;
 87 | }
 88 | 
 89 | static inline int unpackInt16(const uint8_t *buffer)
 90 | {
 91 | 	return buffer[0] | buffer[1] << 8;
 92 | }
 93 | 
 94 | static inline void packInt32(uint8_t *buffer, uint32_t value)
 95 | {
 96 | 	buffer[0] = value;
 97 | 	buffer[1] = value >> 8;
 98 | 	buffer[2] = value >> 16;
 99 | 	buffer[3] = value >> 24;
100 | }
101 | 
102 | static BGZF *bgzf_read_init()
103 | {
104 | 	BGZF *fp;
105 | 	fp = calloc(1, sizeof(BGZF));
106 | 	fp->open_mode = 'r';
107 | 	fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE);
108 | 	fp->compressed_block = malloc(BGZF_BLOCK_SIZE);
109 | #ifdef BGZF_CACHE
110 | 	fp->cache = kh_init(cache);
111 | #endif
112 | 	return fp;
113 | }
114 | 
115 | static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level
116 | {
117 | 	BGZF *fp;
118 | 	fp = calloc(1, sizeof(BGZF));
119 | 	fp->open_mode = 'w';
120 | 	fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE);
121 | 	fp->compressed_block = malloc(BGZF_BLOCK_SIZE);
122 | 	fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
123 | 	if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
124 | 	return fp;
125 | }
// Extract the compression level from an fopen-style mode string:
//   - 0 if the string contains 'u' (uncompressed), regardless of any digit;
//   - otherwise the value of the first decimal digit found;
//   - otherwise -1, meaning "use the default level".
static int mode2level(const char *__restrict mode)
{
	const char *p;
	if (strchr(mode, 'u')) return 0;
	for (p = mode; *p; ++p) {
		if (*p >= '0' && *p <= '9')
			return (int)*p - '0';
	}
	return -1;
}
136 | 
137 | BGZF *bgzf_open(const char *path, const char *mode)
138 | {
139 | 	BGZF *fp = 0;
140 | 	if (strchr(mode, 'r') || strchr(mode, 'R')) {
141 | 		_bgzf_file_t fpr;
142 | 		if ((fpr = _bgzf_open(path, "r")) == 0) return 0;
143 | 		fp = bgzf_read_init();
144 | 		fp->fp = fpr;
145 | 	} else if (strchr(mode, 'w') || strchr(mode, 'W')) {
146 | 		FILE *fpw;
147 | 		if ((fpw = fopen(path, "w")) == 0) return 0;
148 | 		fp = bgzf_write_init(mode2level(mode));
149 | 		fp->fp = fpw;
150 | 	}
151 | 	return fp;
152 | }
153 | 
154 | BGZF *bgzf_dopen(int fd, const char *mode)
155 | {
156 | 	BGZF *fp = 0;
157 | 	if (strchr(mode, 'r') || strchr(mode, 'R')) {
158 | 		_bgzf_file_t fpr;
159 | 		if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0;
160 | 		fp = bgzf_read_init();
161 | 		fp->fp = fpr;
162 | 	} else if (strchr(mode, 'w') || strchr(mode, 'W')) {
163 | 		FILE *fpw;
164 | 		if ((fpw = fdopen(fd, "w")) == 0) return 0;
165 | 		fp = bgzf_write_init(mode2level(mode));
166 | 		fp->fp = fpw;
167 | 	}
168 | 	return fp;
169 | }
170 | 
171 | // Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length.
172 | static int deflate_block(BGZF *fp, int block_length)
173 | {
174 | 	uint8_t *buffer = fp->compressed_block;
175 | 	int buffer_size = BGZF_BLOCK_SIZE;
176 | 	int input_length = block_length;
177 | 	int compressed_length = 0;
178 | 	int remaining;
179 | 	uint32_t crc;
180 | 
181 | 	assert(block_length <= BGZF_BLOCK_SIZE); // guaranteed by the caller
182 | 	memcpy(buffer, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
183 | 	while (1) { // loop to retry for blocks that do not compress enough
184 | 		int status;
185 | 		z_stream zs;
186 | 		zs.zalloc = NULL;
187 | 		zs.zfree = NULL;
188 | 		zs.next_in = fp->uncompressed_block;
189 | 		zs.avail_in = input_length;
190 | 		zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
191 | 		zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
192 | 		status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY); // -15 to disable zlib header/footer
193 | 		if (status != Z_OK) {
194 | 			fp->errcode |= BGZF_ERR_ZLIB;
195 | 			return -1;
196 | 		}
197 | 		status = deflate(&zs, Z_FINISH);
198 | 		if (status != Z_STREAM_END) { // not compressed enough
199 | 			deflateEnd(&zs); // reset the stream
200 | 			if (status == Z_OK) { // reduce the size and recompress
201 | 				input_length -= 1024;
202 | 				assert(input_length > 0); // logically, this should not happen
203 | 				continue;
204 | 			}
205 | 			fp->errcode |= BGZF_ERR_ZLIB;
206 | 			return -1;
207 | 		}
208 | 		if (deflateEnd(&zs) != Z_OK) {
209 | 			fp->errcode |= BGZF_ERR_ZLIB;
210 | 			return -1;
211 | 		}
212 | 		compressed_length = zs.total_out;
213 | 		compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
214 | 		assert(compressed_length <= BGZF_BLOCK_SIZE);
215 | 		break;
216 | 	}
217 | 
218 | 	assert(compressed_length > 0);
219 | 	packInt16((uint8_t*)&buffer[16], compressed_length - 1); // write the compressed_length; -1 to fit 2 bytes
220 | 	crc = crc32(0L, NULL, 0L);
221 | 	crc = crc32(crc, fp->uncompressed_block, input_length);
222 | 	packInt32((uint8_t*)&buffer[compressed_length-8], crc);
223 | 	packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
224 | 
225 | 	remaining = block_length - input_length;
226 | 	if (remaining > 0) {
227 | 		assert(remaining <= input_length);
228 | 		memcpy(fp->uncompressed_block, fp->uncompressed_block + input_length, remaining);
229 | 	}
230 | 	fp->block_offset = remaining;
231 | 	return compressed_length;
232 | }
233 | 
234 | // Inflate the block in fp->compressed_block into fp->uncompressed_block
235 | static int inflate_block(BGZF* fp, int block_length)
236 | {
237 | 	z_stream zs;
238 | 	zs.zalloc = NULL;
239 | 	zs.zfree = NULL;
240 | 	zs.next_in = fp->compressed_block + 18;
241 | 	zs.avail_in = block_length - 16;
242 | 	zs.next_out = fp->uncompressed_block;
243 | 	zs.avail_out = BGZF_BLOCK_SIZE;
244 | 
245 | 	if (inflateInit2(&zs, -15) != Z_OK) {
246 | 		fp->errcode |= BGZF_ERR_ZLIB;
247 | 		return -1;
248 | 	}
249 | 	if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
250 | 		inflateEnd(&zs);
251 | 		fp->errcode |= BGZF_ERR_ZLIB;
252 | 		return -1;
253 | 	}
254 | 	if (inflateEnd(&zs) != Z_OK) {
255 | 		fp->errcode |= BGZF_ERR_ZLIB;
256 | 		return -1;
257 | 	}
258 | 	return zs.total_out;
259 | }
260 | 
// Return 1 if the bytes at `header` look like a BGZF block header, else 0.
// Checks, in order: gzip magic (31, 139), deflate method (8), the FEXTRA
// flag bit (4), an extra-field length of 6, the 'B','C' subfield id and a
// subfield length of 2.
static int check_header(const uint8_t *header)
{
	if (header[0] != 31 || header[1] != 139 || header[2] != 8) return 0;
	if ((header[3] & 4) == 0) return 0;
	if (unpackInt16((uint8_t*)&header[10]) != 6) return 0;
	if (header[12] != 'B' || header[13] != 'C') return 0;
	return unpackInt16((uint8_t*)&header[14]) == 2;
}
268 | 
269 | #ifdef BGZF_CACHE
270 | static void free_cache(BGZF *fp)
271 | {
272 | 	khint_t k;
273 | 	khash_t(cache) *h = (khash_t(cache)*)fp->cache;
274 | 	if (fp->open_mode != 'r') return;
275 | 	for (k = kh_begin(h); k < kh_end(h); ++k)
276 | 		if (kh_exist(h, k)) free(kh_val(h, k).block);
277 | 	kh_destroy(cache, h);
278 | }
279 | 
280 | static int load_block_from_cache(BGZF *fp, int64_t block_address)
281 | {
282 | 	khint_t k;
283 | 	cache_t *p;
284 | 	khash_t(cache) *h = (khash_t(cache)*)fp->cache;
285 | 	k = kh_get(cache, h, block_address);
286 | 	if (k == kh_end(h)) return 0;
287 | 	p = &kh_val(h, k);
288 | 	if (fp->block_length != 0) fp->block_offset = 0;
289 | 	fp->block_address = block_address;
290 | 	fp->block_length = p->size;
291 | 	memcpy(fp->uncompressed_block, p->block, BGZF_BLOCK_SIZE);
292 | 	_bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET);
293 | 	return p->size;
294 | }
295 | 
296 | static void cache_block(BGZF *fp, int size)
297 | {
298 | 	int ret;
299 | 	khint_t k;
300 | 	cache_t *p;
301 | 	khash_t(cache) *h = (khash_t(cache)*)fp->cache;
302 | 	if (BGZF_BLOCK_SIZE >= fp->cache_size) return;
303 | 	if ((kh_size(h) + 1) * BGZF_BLOCK_SIZE > fp->cache_size) {
304 | 		/* A better way would be to remove the oldest block in the
305 | 		 * cache, but here we remove a random one for simplicity. This
306 | 		 * should not have a big impact on performance. */
307 | 		for (k = kh_begin(h); k < kh_end(h); ++k)
308 | 			if (kh_exist(h, k)) break;
309 | 		if (k < kh_end(h)) {
310 | 			free(kh_val(h, k).block);
311 | 			kh_del(cache, h, k);
312 | 		}
313 | 	}
314 | 	k = kh_put(cache, h, fp->block_address, &ret);
315 | 	if (ret == 0) return; // if this happens, a bug!
316 | 	p = &kh_val(h, k);
317 | 	p->size = fp->block_length;
318 | 	p->end_offset = fp->block_address + size;
319 | 	p->block = malloc(BGZF_BLOCK_SIZE);
320 | 	memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_BLOCK_SIZE);
321 | }
322 | #else
// BGZF_CACHE is not defined: the cache hooks are empty stubs so that their
// callers need no conditional compilation.
static void free_cache(BGZF *fp) {}
// Always reports a cache miss.
static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;}
static void cache_block(BGZF *fp, int size) {}
326 | #endif
327 | 
328 | int bgzf_read_block(BGZF *fp)
329 | {
330 | 	uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
331 | 	int count, size = 0, block_length, remaining;
332 | 	int64_t block_address;
333 | 	block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
334 | 	if (load_block_from_cache(fp, block_address)) return 0;
335 | 	count = _bgzf_read(fp->fp, header, sizeof(header));
336 | 	if (count == 0) { // no data read
337 | 		fp->block_length = 0;
338 | 		return 0;
339 | 	}
340 | 	if (count != sizeof(header) || !check_header(header)) {
341 | 		fp->errcode |= BGZF_ERR_HEADER;
342 | 		return -1;
343 | 	}
344 | 	size = count;
345 | 	block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
346 | 	compressed_block = (uint8_t*)fp->compressed_block;
347 | 	memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
348 | 	remaining = block_length - BLOCK_HEADER_LENGTH;
349 | 	count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
350 | 	if (count != remaining) {
351 | 		fp->errcode |= BGZF_ERR_IO;
352 | 		return -1;
353 | 	}
354 | 	size += count;
355 | 	if ((count = inflate_block(fp, block_length)) < 0) return -1;
356 | 	if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
357 | 	fp->block_address = block_address;
358 | 	fp->block_length = count;
359 | 	cache_block(fp, size);
360 | 	return 0;
361 | }
362 | 
363 | ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length)
364 | {
365 | 	ssize_t bytes_read = 0;
366 | 	uint8_t *output = data;
367 | 	if (length <= 0) return 0;
368 | 	assert(fp->open_mode == 'r');
369 | 	while (bytes_read < length) {
370 | 		int copy_length, available = fp->block_length - fp->block_offset;
371 | 		uint8_t *buffer;
372 | 		if (available <= 0) {
373 | 			if (bgzf_read_block(fp) != 0) return -1;
374 | 			available = fp->block_length - fp->block_offset;
375 | 			if (available <= 0) break;
376 | 		}
377 | 		copy_length = length - bytes_read < available? length - bytes_read : available;
378 | 		buffer = fp->uncompressed_block;
379 | 		memcpy(output, buffer + fp->block_offset, copy_length);
380 | 		fp->block_offset += copy_length;
381 | 		output += copy_length;
382 | 		bytes_read += copy_length;
383 | 	}
384 | 	if (fp->block_offset == fp->block_length) {
385 | 		fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
386 | 		fp->block_offset = fp->block_length = 0;
387 | 	}
388 | 	return bytes_read;
389 | }
390 | 
391 | int bgzf_flush(BGZF *fp)
392 | {
393 | 	assert(fp->open_mode == 'w');
394 | 	while (fp->block_offset > 0) {
395 | 		int block_length;
396 | 		block_length = deflate_block(fp, fp->block_offset);
397 | 		if (block_length < 0) return -1;
398 | 		if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) {
399 | 			fp->errcode |= BGZF_ERR_IO; // possibly truncated file
400 | 			return -1;
401 | 		}
402 | 		fp->block_address += block_length;
403 | 	}
404 | 	return 0;
405 | }
406 | 
407 | int bgzf_flush_try(BGZF *fp, ssize_t size)
408 | {
409 | 	if (fp->block_offset + size > BGZF_BLOCK_SIZE)
410 | 		return bgzf_flush(fp);
411 | 	return -1;
412 | }
413 | 
414 | ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length)
415 | {
416 | 	const uint8_t *input = data;
417 | 	int block_length = BGZF_BLOCK_SIZE, bytes_written;
418 | 	assert(fp->open_mode == 'w');
419 | 	input = data;
420 | 	bytes_written = 0;
421 | 	while (bytes_written < length) {
422 | 		uint8_t* buffer = fp->uncompressed_block;
423 | 		int copy_length = block_length - fp->block_offset < length - bytes_written? block_length - fp->block_offset : length - bytes_written;
424 | 		memcpy(buffer + fp->block_offset, input, copy_length);
425 | 		fp->block_offset += copy_length;
426 | 		input += copy_length;
427 | 		bytes_written += copy_length;
428 | 		if (fp->block_offset == block_length && bgzf_flush(fp)) break;
429 | 	}
430 | 	return bytes_written;
431 | }
432 | 
433 | int bgzf_close(BGZF* fp)
434 | {
435 | 	int ret, count, block_length;
436 | 	if (fp == 0) return -1;
437 | 	if (fp->open_mode == 'w') {
438 | 		if (bgzf_flush(fp) != 0) return -1;
439 | 		block_length = deflate_block(fp, 0); // write an empty block
440 | 		count = fwrite(fp->compressed_block, 1, block_length, fp->fp);
441 | 		if (fflush(fp->fp) != 0) {
442 | 			fp->errcode |= BGZF_ERR_IO;
443 | 			return -1;
444 | 		}
445 | 	}
446 | 	ret = fp->open_mode == 'w'? fclose(fp->fp) : _bgzf_close(fp->fp);
447 | 	if (ret != 0) return -1;
448 | 	free(fp->uncompressed_block);
449 | 	free(fp->compressed_block);
450 | 	free_cache(fp);
451 | 	free(fp);
452 | 	return 0;
453 | }
454 | 
455 | void bgzf_set_cache_size(BGZF *fp, int cache_size)
456 | {
457 | 	if (fp) fp->cache_size = cache_size;
458 | }
459 | 
460 | int bgzf_check_EOF(BGZF *fp)
461 | {
462 | 	static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
463 | 	uint8_t buf[28];
464 | 	off_t offset;
465 | 	offset = _bgzf_tell((_bgzf_file_t)fp->fp);
466 | 	if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0;
467 | 	_bgzf_read(fp->fp, buf, 28);
468 | 	_bgzf_seek(fp->fp, offset, SEEK_SET);
469 | 	return (memcmp(magic, buf, 28) == 0)? 1 : 0;
470 | }
471 | 
472 | int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
473 | {
474 | 	int block_offset;
475 | 	int64_t block_address;
476 | 
477 | 	if (fp->open_mode != 'r' || where != SEEK_SET) {
478 | 		fp->errcode |= BGZF_ERR_MISUSE;
479 | 		return -1;
480 | 	}
481 | 	block_offset = pos & 0xFFFF;
482 | 	block_address = pos >> 16;
483 | 	if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) {
484 | 		fp->errcode |= BGZF_ERR_IO;
485 | 		return -1;
486 | 	}
487 | 	fp->block_length = 0;  // indicates current block has not been loaded
488 | 	fp->block_address = block_address;
489 | 	fp->block_offset = block_offset;
490 | 	return 0;
491 | }
492 | 
493 | int bgzf_is_bgzf(const char *fn)
494 | {
495 | 	uint8_t buf[16];
496 | 	int n;
497 | 	_bgzf_file_t fp;
498 | 	if ((fp = _bgzf_open(fn, "r")) == 0) return 0;
499 | 	n = _bgzf_read(fp, buf, 16);
500 | 	_bgzf_close(fp);
501 | 	if (n != 16) return 0;
502 | 	return memcmp(g_magic, buf, 16) == 0? 1 : 0;
503 | }
504 | 
505 | int bgzf_getc(BGZF *fp)
506 | {
507 | 	int c;
508 | 	if (fp->block_offset >= fp->block_length) {
509 | 		if (bgzf_read_block(fp) != 0) return -2; /* error */
510 | 		if (fp->block_length == 0) return -1; /* end-of-file */
511 | 	}
512 | 	c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
513 |     if (fp->block_offset == fp->block_length) {
514 |         fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
515 |         fp->block_offset = 0;
516 |         fp->block_length = 0;
517 |     }
518 | 	return c;
519 | }
520 | 
521 | #ifndef kroundup32
522 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
523 | #endif
524 | 
525 | int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
526 | {
527 | 	int l, state = 0;
528 | 	unsigned char *buf = (unsigned char*)fp->uncompressed_block;
529 | 	str->l = 0;
530 | 	do {
531 | 		if (fp->block_offset >= fp->block_length) {
532 | 			if (bgzf_read_block(fp) != 0) { state = -2; break; }
533 | 			if (fp->block_length == 0) { state = -1; break; }
534 | 		}
535 | 		for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
536 | 		if (l < fp->block_length) state = 1;
537 | 		l -= fp->block_offset;
538 | 		if (str->l + l + 1 >= str->m) {
539 | 			str->m = str->l + l + 2;
540 | 			kroundup32(str->m);
541 | 			str->s = (char*)realloc(str->s, str->m);
542 | 		}
543 | 		memcpy(str->s + str->l, buf + fp->block_offset, l);
544 | 		str->l += l;
545 | 		fp->block_offset += l + 1;
546 | 		if (fp->block_offset >= fp->block_length) {
547 | 			fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
548 | 			fp->block_offset = 0;
549 | 			fp->block_length = 0;
550 | 		} 
551 | 	} while (state == 0);
552 | 	if (str->l == 0 && state < 0) return state;
553 | 	str->s[str->l] = 0;
554 | 	return str->l;
555 | }
556 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/knetfile.c:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2008 Genome Research Ltd (GRL).
  4 | 
  5 |    Permission is hereby granted, free of charge, to any person obtaining
  6 |    a copy of this software and associated documentation files (the
  7 |    "Software"), to deal in the Software without restriction, including
  8 |    without limitation the rights to use, copy, modify, merge, publish,
  9 |    distribute, sublicense, and/or sell copies of the Software, and to
 10 |    permit persons to whom the Software is furnished to do so, subject to
 11 |    the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be
 14 |    included in all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |    SOFTWARE.
 24 | */
 25 | 
 26 | /* Contact: Heng Li <lh3@sanger.ac.uk> */
 27 | 
 28 | /* Probably I will not do socket programming in the next few years and
 29 |    therefore I decide to heavily annotate this file, for Linux and
 30 |    Windows as well.  -lh3 */
 31 | 
 32 | #include <time.h>
 33 | #include <stdio.h>
 34 | #include <ctype.h>
 35 | #include <stdlib.h>
 36 | #include <string.h>
 37 | #include <errno.h>
 38 | #include <unistd.h>
 39 | #include <sys/types.h>
 40 | 
 41 | #ifdef _WIN32
 42 | #include <winsock.h>
 43 | #else
 44 | #include <netdb.h>
 45 | #include <arpa/inet.h>
 46 | #include <sys/socket.h>
 47 | #endif
 48 | 
 49 | #include "knetfile.h"
 50 | 
 51 | /* In winsock.h, the type of a socket is SOCKET, which is: "typedef
 52 |  * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
 53 |  * integer -1. In knetfile.c, I use "int" for socket type
 54 |  * throughout. This should be improved to avoid confusion.
 55 |  *
 56 |  * In Linux/Mac, recv() and read() do almost the same thing. You can see
 57 |  * in the header file that netread() is simply an alias of read(). In
 58 |  * Windows, however, they are different and using recv() is mandatory.
 59 |  */
 60 | 
 61 | /* This function tests if the file handler is ready for reading (or
 62 |  * writing if is_read==0). */
 63 | static int socket_wait(int fd, int is_read)
 64 | {
 65 | 	fd_set fds, *fdr = 0, *fdw = 0;
 66 | 	struct timeval tv;
 67 | 	int ret;
 68 | 	tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
 69 | 	FD_ZERO(&fds);
 70 | 	FD_SET(fd, &fds);
 71 | 	if (is_read) fdr = &fds;
 72 | 	else fdw = &fds;
 73 | 	ret = select(fd+1, fdr, fdw, 0, &tv);
 74 | #ifndef _WIN32
 75 | 	if (ret == -1) perror("select");
 76 | #else
 77 | 	if (ret == 0)
 78 | 		fprintf(stderr, "select time-out\n");
 79 | 	else if (ret == SOCKET_ERROR)
 80 | 		fprintf(stderr, "select: %d\n", WSAGetLastError());
 81 | #endif
 82 | 	return ret;
 83 | }
 84 | 
 85 | #ifndef _WIN32
 86 | /* This function does not work with Windows due to the lack of
 87 |  * getaddrinfo() in winsock. It is adapted from an example in "Beej's
 88 |  * Guide to Network Programming" (http://beej.us/guide/bgnet/). */
 89 | static int socket_connect(const char *host, const char *port)
 90 | {
 91 | #define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)
 92 | 
 93 | 	int on = 1, fd;
 94 | 	struct linger lng = { 0, 0 };
 95 | 	struct addrinfo hints, *res;
 96 | 	memset(&hints, 0, sizeof(struct addrinfo));
 97 | 	hints.ai_family = AF_UNSPEC;
 98 | 	hints.ai_socktype = SOCK_STREAM;
 99 | 	/* In Unix/Mac, getaddrinfo() is the most convenient way to get
100 | 	 * server information. */
101 | 	if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo");
102 | 	if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
103 | 	/* The following two setsockopt() are used by ftplib
104 | 	 * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they
105 | 	 * necessary. */
106 | 	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
107 | 	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
108 | 	if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
109 | 	freeaddrinfo(res);
110 | 	return fd;
111 | }
112 | #else
/* MinGW's printf has problem with "%lld" */
/* Write the decimal representation of x into buf and return buf.
 * Negative values get a leading '-'. The caller must provide room for
 * up to 21 bytes (sign, 19 digits and the terminating NUL). */
char *int64tostr(char *buf, int64_t x)
{
	uint64_t mag;
	int lo, hi, i = 0;
	/* take the magnitude in unsigned arithmetic so that INT64_MIN does
	 * not overflow when negated */
	mag = x < 0? (uint64_t)0 - (uint64_t)x : (uint64_t)x;
	/* digits are produced least-significant first ... */
	do {
		buf[i++] = (char)('0' + mag % 10);
		mag /= 10;
	} while (mag);
	if (x < 0) buf[i++] = '-';
	buf[i] = 0;
	/* ... and then reversed in place */
	for (lo = 0, hi = i - 1; lo < hi; ++lo, --hi) {
		char c = buf[lo]; buf[lo] = buf[hi]; buf[hi] = c;
	}
	return buf;
}
128 | 
/* Parse a decimal integer from the start of buf. Leading blanks and one
 * optional sign are accepted; parsing stops at the first character that
 * is not a digit, so trailing text such as "\r\n" is ignored instead of
 * being folded into the number. Returns 0 when no digit is found.
 * Overflow is not detected. */
int64_t strtoint64(const char *buf)
{
	int64_t x = 0;
	int negative = 0;
	while (*buf == ' ' || *buf == '\t') ++buf;
	if (*buf == '+' || *buf == '-') negative = (*buf++ == '-');
	for (; *buf >= '0' && *buf <= '9'; ++buf)
		x = x * 10 + (*buf - '0');
	return negative? -x : x;
}
136 | /* In windows, the first thing is to establish the TCP connection. */
137 | int knet_win32_init()
138 | {
139 | 	WSADATA wsaData;
140 | 	return WSAStartup(MAKEWORD(2, 2), &wsaData);
141 | }
/* Shut the Windows socket layer down again by calling WSACleanup(). */
void knet_win32_destroy()
{
	(void)WSACleanup(); /* the status is not reported to the caller */
}
146 | /* A slightly modfied version of the following function also works on
147 |  * Mac (and presummably Linux). However, this function is not stable on
148 |  * my Mac. It sometimes works fine but sometimes does not. Therefore for
149 |  * non-Windows OS, I do not use this one. */
150 | static SOCKET socket_connect(const char *host, const char *port)
151 | {
152 | #define __err_connect(func)										\
153 | 	do {														\
154 | 		fprintf(stderr, "%s: %d\n", func, WSAGetLastError());	\
155 | 		return -1;												\
156 | 	} while (0)
157 | 
158 | 	int on = 1;
159 | 	SOCKET fd;
160 | 	struct linger lng = { 0, 0 };
161 | 	struct sockaddr_in server;
162 | 	struct hostent *hp = 0;
163 | 	// open socket
164 | 	if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
165 | 	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
166 | 	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
167 | 	// get host info
168 | 	if (isalpha(host[0])) hp = gethostbyname(host);
169 | 	else {
170 | 		struct in_addr addr;
171 | 		addr.s_addr = inet_addr(host);
172 | 		hp = gethostbyaddr((char*)&addr, 4, AF_INET);
173 | 	}
174 | 	if (hp == 0) __err_connect("gethost");
175 | 	// connect
176 | 	server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
177 | 	server.sin_family= AF_INET;
178 | 	server.sin_port = htons(atoi(port));
179 | 	if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
180 | 	// freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
181 | 	return fd;
182 | }
183 | #endif
184 | 
/* Read up to `len` bytes from socket `fd` into `buf`, calling netread()
 * repeatedly because a single call may deliver less than requested.
 * Stops early when the socket does not become readable (socket_wait()
 * time-out or error), at end of stream, or on a read error. Returns the
 * number of bytes actually stored in `buf`. */
static off_t my_netread(int fd, void *buf, off_t len)
{
	off_t rest = len, curr, l = 0;
	char *dst = (char*)buf; /* byte pointer: arithmetic on void* is not standard C */
	/* recv() and read() may not read the required length of data with
	 * one call. They have to be called repeatedly. */
	while (rest > 0) {
		if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
		curr = netread(fd, dst + l, rest);
		/* According to the glibc manual, section 13.2, a zero returned
		 * value indicates end-of-file (EOF), which should mean that
		 * read() will not return zero if EOF has not been met but data
		 * are not immediately available. A negative value is an error;
		 * it must end the loop too, otherwise it would be added to the
		 * byte count and subtracted from the remainder. */
		if (curr <= 0) break;
		l += curr; rest -= curr;
	}
	return l;
}
202 | 
203 | /*************************
204 |  * FTP specific routines *
205 |  *************************/
206 | 
207 | static int kftp_get_response(knetFile *ftp)
208 | {
209 | #ifndef _WIN32
210 | 	unsigned char c;
211 | #else
212 | 	char c;
213 | #endif
214 | 	int n = 0;
215 | 	char *p;
216 | 	if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
217 | 	while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
218 | 		//fputc(c, stderr);
219 | 		if (n >= ftp->max_response) {
220 | 			ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256;
221 | 			ftp->response = realloc(ftp->response, ftp->max_response);
222 | 		}
223 | 		ftp->response[n++] = c;
224 | 		if (c == '\n') {
225 | 			if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2])
226 | 				&& ftp->response[3] != '-') break;
227 | 			n = 0;
228 | 			continue;
229 | 		}
230 | 	}
231 | 	if (n < 2) return -1;
232 | 	ftp->response[n-2] = 0;
233 | 	return strtol(ftp->response, &p, 0);
234 | }
235 | 
236 | static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
237 | {
238 | 	if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
239 | 	netwrite(ftp->ctrl_fd, cmd, strlen(cmd));
240 | 	return is_get? kftp_get_response(ftp) : 0;
241 | }
242 | 
243 | static int kftp_pasv_prep(knetFile *ftp)
244 | {
245 | 	char *p;
246 | 	int v[6];
247 | 	kftp_send_cmd(ftp, "PASV\r\n", 1);
248 | 	for (p = ftp->response; *p && *p != '('; ++p);
249 | 	if (*p != '(') return -1;
250 | 	++p;
251 | 	sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
252 | 	memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
253 | 	ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
254 | 	return 0;
255 | }
256 | 
257 | 
258 | static int kftp_pasv_connect(knetFile *ftp)
259 | {
260 | 	char host[80], port[10];
261 | 	if (ftp->pasv_port == 0) {
262 | 		fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
263 | 		return -1;
264 | 	}
265 | 	sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
266 | 	sprintf(port, "%d", ftp->pasv_port);
267 | 	ftp->fd = socket_connect(host, port);
268 | 	if (ftp->fd == -1) return -1;
269 | 	return 0;
270 | }
271 | 
272 | int kftp_connect(knetFile *ftp)
273 | {
274 | 	ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
275 | 	if (ftp->ctrl_fd == -1) return -1;
276 | 	kftp_get_response(ftp);
277 | 	kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
278 | 	kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
279 | 	kftp_send_cmd(ftp, "TYPE I\r\n", 1);
280 | 	return 0;
281 | }
282 | 
283 | int kftp_reconnect(knetFile *ftp)
284 | {
285 | 	if (ftp->ctrl_fd != -1) {
286 | 		netclose(ftp->ctrl_fd);
287 | 		ftp->ctrl_fd = -1;
288 | 	}
289 | 	netclose(ftp->fd);
290 | 	ftp->fd = -1;
291 | 	return kftp_connect(ftp);
292 | }
293 | 
294 | // initialize ->type, ->host, ->retr and ->size
295 | knetFile *kftp_parse_url(const char *fn, const char *mode)
296 | {
297 | 	knetFile *fp;
298 | 	char *p;
299 | 	int l;
300 | 	if (strstr(fn, "ftp://") != fn) return 0;
301 | 	for (p = (char*)fn + 6; *p && *p != '/'; ++p);
302 | 	if (*p != '/') return 0;
303 | 	l = p - fn - 6;
304 | 	fp = calloc(1, sizeof(knetFile));
305 | 	fp->type = KNF_TYPE_FTP;
306 | 	fp->fd = -1;
307 | 	/* the Linux/Mac version of socket_connect() also recognizes a port
308 | 	 * like "ftp", but the Windows version does not. */
309 | 	fp->port = strdup("21");
310 | 	fp->host = calloc(l + 1, 1);
311 | 	if (strchr(mode, 'c')) fp->no_reconnect = 1;
312 | 	strncpy(fp->host, fn + 6, l);
313 | 	fp->retr = calloc(strlen(p) + 8, 1);
314 | 	sprintf(fp->retr, "RETR %s\r\n", p);
315 |     fp->size_cmd = calloc(strlen(p) + 8, 1);
316 |     sprintf(fp->size_cmd, "SIZE %s\r\n", p);
317 | 	fp->seek_offset = 0;
318 | 	return fp;
319 | }
320 | // place ->fd at offset off
321 | int kftp_connect_file(knetFile *fp)
322 | {
323 | 	int ret;
324 | 	long long file_size;
325 | 	if (fp->fd != -1) {
326 | 		netclose(fp->fd);
327 | 		if (fp->no_reconnect) kftp_get_response(fp);
328 | 	}
329 | 	kftp_pasv_prep(fp);
330 |     kftp_send_cmd(fp, fp->size_cmd, 1);
331 | #ifndef _WIN32
332 |     if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 )
333 |     {
334 |         fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
335 |         return -1;
336 |     }
337 | #else
338 | 	const char *p = fp->response;
339 | 	while (*p != ' ') ++p;
340 | 	while (*p < '0' || *p > '9') ++p;
341 | 	file_size = strtoint64(p);
342 | #endif
343 | 	fp->file_size = file_size;
344 | 	if (fp->offset>=0) {
345 | 		char tmp[32];
346 | #ifndef _WIN32
347 | 		sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
348 | #else
349 | 		strcpy(tmp, "REST ");
350 | 		int64tostr(tmp + 5, fp->offset);
351 | 		strcat(tmp, "\r\n");
352 | #endif
353 | 		kftp_send_cmd(fp, tmp, 1);
354 | 	}
355 | 	kftp_send_cmd(fp, fp->retr, 0);
356 | 	kftp_pasv_connect(fp);
357 | 	ret = kftp_get_response(fp);
358 | 	if (ret != 150) {
359 | 		fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
360 | 		netclose(fp->fd);
361 | 		fp->fd = -1;
362 | 		return -1;
363 | 	}
364 | 	fp->is_ready = 1;
365 | 	return 0;
366 | }
367 | 
368 | 
369 | /**************************
370 |  * HTTP specific routines *
371 |  **************************/
372 | 
373 | knetFile *khttp_parse_url(const char *fn, const char *mode)
374 | {
375 | 	knetFile *fp;
376 | 	char *p, *proxy, *q;
377 | 	int l;
378 | 	if (strstr(fn, "http://") != fn) return 0;
379 | 	// set ->http_host
380 | 	for (p = (char*)fn + 7; *p && *p != '/'; ++p);
381 | 	l = p - fn - 7;
382 | 	fp = calloc(1, sizeof(knetFile));
383 | 	fp->http_host = calloc(l + 1, 1);
384 | 	strncpy(fp->http_host, fn + 7, l);
385 | 	fp->http_host[l] = 0;
386 | 	for (q = fp->http_host; *q && *q != ':'; ++q);
387 | 	if (*q == ':') *q++ = 0;
388 | 	// get http_proxy
389 | 	proxy = getenv("http_proxy");
390 | 	// set ->host, ->port and ->path
391 | 	if (proxy == 0) {
392 | 		fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
393 | 		fp->port = strdup(*q? q : "80");
394 | 		fp->path = strdup(*p? p : "/");
395 | 	} else {
396 | 		fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
397 | 		for (q = fp->host; *q && *q != ':'; ++q);
398 | 		if (*q == ':') *q++ = 0; 
399 | 		fp->port = strdup(*q? q : "80");
400 | 		fp->path = strdup(fn);
401 | 	}
402 | 	fp->type = KNF_TYPE_HTTP;
403 | 	fp->ctrl_fd = fp->fd = -1;
404 | 	fp->seek_offset = 0;
405 | 	return fp;
406 | }
407 | 
408 | int khttp_connect_file(knetFile *fp)
409 | {
410 | 	int ret, l = 0;
411 | 	char *buf, *p;
412 | 	if (fp->fd != -1) netclose(fp->fd);
413 | 	fp->fd = socket_connect(fp->host, fp->port);
414 | 	buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
415 | 	l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
416 |     l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
417 | 	l += sprintf(buf + l, "\r\n");
418 | 	netwrite(fp->fd, buf, l);
419 | 	l = 0;
420 | 	while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
421 | 		if (buf[l] == '\n' && l >= 3)
422 | 			if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
423 | 		++l;
424 | 	}
425 | 	buf[l] = 0;
426 | 	if (l < 14) { // prematured header
427 | 		netclose(fp->fd);
428 | 		fp->fd = -1;
429 | 		return -1;
430 | 	}
431 | 	ret = strtol(buf + 8, &p, 0); // HTTP return code
432 | 	if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
433 | 		off_t rest = fp->offset;
434 | 		while (rest) {
435 | 			off_t l = rest < 0x10000? rest : 0x10000;
436 | 			rest -= my_netread(fp->fd, buf, l);
437 | 		}
438 | 	} else if (ret != 206 && ret != 200) {
439 | 		free(buf);
440 | 		fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
441 | 		netclose(fp->fd);
442 | 		fp->fd = -1;
443 | 		return -1;
444 | 	}
445 | 	free(buf);
446 | 	fp->is_ready = 1;
447 | 	return 0;
448 | }
449 | 
450 | /********************
451 |  * Generic routines *
452 |  ********************/
453 | 
454 | knetFile *knet_open(const char *fn, const char *mode)
455 | {
456 | 	knetFile *fp = 0;
457 | 	if (mode[0] != 'r') {
458 | 		fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
459 | 		return 0;
460 | 	}
461 | 	if (strstr(fn, "ftp://") == fn) {
462 | 		fp = kftp_parse_url(fn, mode);
463 | 		if (fp == 0) return 0;
464 | 		if (kftp_connect(fp) == -1) {
465 | 			knet_close(fp);
466 | 			return 0;
467 | 		}
468 | 		kftp_connect_file(fp);
469 | 	} else if (strstr(fn, "http://") == fn) {
470 | 		fp = khttp_parse_url(fn, mode);
471 | 		if (fp == 0) return 0;
472 | 		khttp_connect_file(fp);
473 | 	} else { // local file
474 | #ifdef _WIN32
475 | 		/* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
476 | 		 * be undefined on some systems, although it is defined on my
477 | 		 * Mac and the Linux I have tested on. */
478 | 		int fd = open(fn, O_RDONLY | O_BINARY);
479 | #else		
480 | 		int fd = open(fn, O_RDONLY);
481 | #endif
482 | 		if (fd == -1) {
483 | 			perror("open");
484 | 			return 0;
485 | 		}
486 | 		fp = (knetFile*)calloc(1, sizeof(knetFile));
487 | 		fp->type = KNF_TYPE_LOCAL;
488 | 		fp->fd = fd;
489 | 		fp->ctrl_fd = -1;
490 | 	}
491 | 	if (fp && fp->fd == -1) {
492 | 		knet_close(fp);
493 | 		return 0;
494 | 	}
495 | 	return fp;
496 | }
497 | 
498 | knetFile *knet_dopen(int fd, const char *mode)
499 | {
500 | 	knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
501 | 	fp->type = KNF_TYPE_LOCAL;
502 | 	fp->fd = fd;
503 | 	return fp;
504 | }
505 | 
506 | off_t knet_read(knetFile *fp, void *buf, off_t len)
507 | {
508 | 	off_t l = 0;
509 | 	if (fp->fd == -1) return 0;
510 | 	if (fp->type == KNF_TYPE_FTP) {
511 | 		if (fp->is_ready == 0) {
512 | 			if (!fp->no_reconnect) kftp_reconnect(fp);
513 | 			kftp_connect_file(fp);
514 | 		}
515 | 	} else if (fp->type == KNF_TYPE_HTTP) {
516 | 		if (fp->is_ready == 0)
517 | 			khttp_connect_file(fp);
518 | 	}
519 | 	if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
520 | 		off_t rest = len, curr;
521 | 		while (rest) {
522 | 			curr = read(fp->fd, buf + l, rest);
523 | 			if (curr == 0) break;
524 | 			l += curr; rest -= curr;
525 | 		}
526 | 	} else l = my_netread(fp->fd, buf, len);
527 | 	fp->offset += l;
528 | 	return l;
529 | }
530 | 
531 | off_t knet_seek(knetFile *fp, int64_t off, int whence)
532 | {
533 | 	if (whence == SEEK_SET && off == fp->offset) return 0;
534 | 	if (fp->type == KNF_TYPE_LOCAL) {
535 | 		/* Be aware that lseek() returns the offset after seeking,
536 | 		 * while fseek() returns zero on success. */
537 | 		off_t offset = lseek(fp->fd, off, whence);
538 | 		if (offset == -1) {
539 |             // Be silent, it is OK for knet_seek to fail when the file is streamed
540 |             // fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
541 | 			return -1;
542 | 		}
543 | 		fp->offset = offset;
544 | 		return 0;
545 | 	}
546 |     else if (fp->type == KNF_TYPE_FTP) 
547 |     {
548 |         if (whence==SEEK_CUR)
549 |             fp->offset += off;
550 |         else if (whence==SEEK_SET)
551 |             fp->offset = off;
552 |         else if ( whence==SEEK_END)
553 |             fp->offset = fp->file_size+off;
554 | 		fp->is_ready = 0;
555 | 		return 0;
556 | 	} 
557 |     else if (fp->type == KNF_TYPE_HTTP) 
558 |     {
559 | 		if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
560 | 			fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
561 | 			errno = ESPIPE;
562 | 			return -1;
563 | 		}
564 |         if (whence==SEEK_CUR)
565 |             fp->offset += off;
566 |         else if (whence==SEEK_SET)
567 |             fp->offset = off;
568 | 		fp->is_ready = 0;
569 | 		return fp->offset;
570 | 	}
571 | 	errno = EINVAL;
572 |     fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
573 | 	return -1;
574 | }
575 | 
576 | int knet_close(knetFile *fp)
577 | {
578 | 	if (fp == 0) return 0;
579 | 	if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
580 | 	if (fp->fd != -1) {
581 | 		/* On Linux/Mac, netclose() is an alias of close(), but on
582 | 		 * Windows, it is an alias of closesocket(). */
583 | 		if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
584 | 		else netclose(fp->fd);
585 | 	}
586 | 	free(fp->host); free(fp->port);
587 | 	free(fp->response); free(fp->retr); free(fp->size_cmd); // FTP specific
588 | 	free(fp->path); free(fp->http_host); // HTTP specific
589 | 	free(fp);
590 | 	return 0;
591 | }
592 | 
#ifdef KNETFILE_MAIN
/* Manual test driver, compiled only when KNETFILE_MAIN is defined.
 * `type` selects one of five targets (a local file, two FTP files, two
 * HTTP files). The target is opened, a seek/read sequence is run on it
 * and the bytes read are written to stdout. Exits with 1 when the target
 * cannot be opened or the buffer cannot be allocated. */
int main(void)
{
	static const char *const target[5] = {
		"knetfile.c",
		"ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", // NCBI FTP, large file
		"ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml",
		"http://www.sanger.ac.uk/Users/lh3/index.shtml",
		"http://www.sanger.ac.uk/Users/lh3/ex1.bam"
	};
	const size_t buf_size = 0x100000;
	char *buf;
	knetFile *fp;
	int type = 4, l = 0, status = 0;
#ifdef _WIN32
	knet_win32_init();
#endif
	buf = calloc(buf_size, 1);
	fp = knet_open(target[type], "r");
	if (buf == 0 || fp == 0) {
		/* nothing below may run with a null handle or a null buffer */
		fprintf(stderr, "[main] fail to set up the test for \"%s\".\n", target[type]);
		status = 1;
	} else if (type == 1) {
		knet_seek(fp, 2500000000ll, SEEK_SET);
		l = knet_read(fp, buf, 255);
		write(fileno(stdout), buf, l);
	} else if (type == 4) {
		knet_read(fp, buf, 10000);
		knet_seek(fp, 20000, SEEK_SET);
		knet_seek(fp, 10000, SEEK_SET);
		/* the second read is limited to the space left in the buffer */
		l = knet_read(fp, buf+10000, buf_size - 10000) + 10000;
		write(fileno(stdout), buf, l);
	} else {
		knet_seek(fp, 1000, SEEK_SET);
		knet_read(fp, buf, 255);
		buf[255] = 0;
		printf("%s\n", buf);
	}
	knet_close(fp); // accepts a null handle
	free(buf);
#ifdef _WIN32
	knet_win32_destroy();
#endif
	return status;
}
#endif
633 | 


--------------------------------------------------------------------------------
/tabix-0.2.6/ChangeLog:
--------------------------------------------------------------------------------
  1 | ------------------------------------------------------------------------
  2 | r942 | lh3lh3 | 2011-03-31 16:39:50 -0400 (Thu, 31 Mar 2011) | 2 lines
  3 | Changed paths:
  4 |    M /trunk/tabix/main.c
  5 | 
  6 | update version number
  7 | 
  8 | ------------------------------------------------------------------------
  9 | r940 | lh3lh3 | 2011-03-31 16:38:03 -0400 (Thu, 31 Mar 2011) | 2 lines
 10 | Changed paths:
 11 |    M /trunk/tabix/bedidx.c
 12 |    M /trunk/tabix/main.c
 13 | 
 14 | fixed two bugs due to recent changes
 15 | 
 16 | ------------------------------------------------------------------------
 17 | r939 | lh3lh3 | 2011-03-31 16:12:21 -0400 (Thu, 31 Mar 2011) | 2 lines
 18 | Changed paths:
 19 |    M /trunk/tabix/bgzf.c
 20 |    M /trunk/tabix/bgzf.h
 21 |    M /trunk/tabix/main.c
 22 | 
 23 | update to the latest bgzf.*
 24 | 
 25 | ------------------------------------------------------------------------
 26 | r938 | lh3lh3 | 2011-03-31 16:02:21 -0400 (Thu, 31 Mar 2011) | 2 lines
 27 | Changed paths:
 28 |    M /trunk/tabix/index.c
 29 |    M /trunk/tabix/main.c
 30 |    M /trunk/tabix/tabix.h
 31 | 
 32 | BED support
 33 | 
 34 | ------------------------------------------------------------------------
 35 | r937 | lh3lh3 | 2011-03-31 15:03:49 -0400 (Thu, 31 Mar 2011) | 2 lines
 36 | Changed paths:
 37 |    M /trunk/tabix/Makefile
 38 |    A /trunk/tabix/bedidx.c
 39 |    M /trunk/tabix/example.gtf.gz.tbi
 40 |    M /trunk/tabix/index.c
 41 |    A /trunk/tabix/kseq.h
 42 |    M /trunk/tabix/tabix.h
 43 | 
 44 | restructure get_intv() for BED support
 45 | 
 46 | ------------------------------------------------------------------------
 47 | r919 | petulda | 2011-02-24 10:14:14 -0500 (Thu, 24 Feb 2011) | 1 line
 48 | Changed paths:
 49 |    M /trunk/tabix/bgzf.c
 50 |    M /trunk/tabix/bgzf.h
 51 |    M /trunk/tabix/index.c
 52 |    M /trunk/tabix/main.c
 53 | 
 54 | New -r (reheader) option for efficient header replacement.
 55 | ------------------------------------------------------------------------
 56 | r915 | lh3lh3 | 2011-02-22 09:50:57 -0500 (Tue, 22 Feb 2011) | 2 lines
 57 | Changed paths:
 58 |    A /trunk/tabix/python
 59 |    A /trunk/tabix/python/setup.py (from /trunk/tabix/setup.py:914)
 60 |    A /trunk/tabix/python/tabixmodule.c (from /trunk/tabix/tabixmodule.c:914)
 61 |    A /trunk/tabix/python/test.py (from /trunk/tabix/test.py:914)
 62 |    D /trunk/tabix/setup.py
 63 |    D /trunk/tabix/tabixmodule.c
 64 |    D /trunk/tabix/test.py
 65 | 
 66 | move to a new python/ directory
 67 | 
 68 | ------------------------------------------------------------------------
 69 | r914 | lh3lh3 | 2011-02-22 09:49:35 -0500 (Tue, 22 Feb 2011) | 2 lines
 70 | Changed paths:
 71 |    A /trunk/tabix/setup.py
 72 |    A /trunk/tabix/tabixmodule.c
 73 |    A /trunk/tabix/test.py
 74 | 
 75 | CPython C-API by Hyeshik Chang
 76 | 
 77 | ------------------------------------------------------------------------
 78 | r904 | petulda | 2011-01-28 08:06:27 -0500 (Fri, 28 Jan 2011) | 1 line
 79 | Changed paths:
 80 |    M /trunk/tabix/index.c
 81 | 
 82 | Check the number of fields on each line and exit nicely without segfault
 83 | ------------------------------------------------------------------------
 84 | r901 | petulda | 2011-01-21 06:45:37 -0500 (Fri, 21 Jan 2011) | 1 line
 85 | Changed paths:
 86 |    M /trunk/tabix/main.c
 87 | 
 88 | Fix: Complain only when VCF is newer, not newer or same mtime
 89 | ------------------------------------------------------------------------
 90 | r900 | petulda | 2011-01-21 04:23:04 -0500 (Fri, 21 Jan 2011) | 1 line
 91 | Changed paths:
 92 |    M /trunk/tabix/main.c
 93 | 
 94 | Prevent the common user mistake and check the timestamps of the vcf and index file
 95 | ------------------------------------------------------------------------
 96 | r876 | lh3lh3 | 2010-12-08 12:38:45 -0500 (Wed, 08 Dec 2010) | 2 lines
 97 | Changed paths:
 98 |    M /trunk/tabix/ChangeLog
 99 |    M /trunk/tabix/NEWS
100 |    M /trunk/tabix/main.c
101 | 
102 | Release tabix-0.2.3
103 | 
104 | ------------------------------------------------------------------------
105 | r875 | lh3lh3 | 2010-12-08 12:28:35 -0500 (Wed, 08 Dec 2010) | 2 lines
106 | Changed paths:
107 |    M /trunk/tabix/ChangeLog
108 |    M /trunk/tabix/index.c
109 | 
110 | Fixed a minor bug in generating index
111 | 
112 | ------------------------------------------------------------------------
113 | r855 | petulda | 2010-11-25 11:50:13 -0500 (Thu, 25 Nov 2010) | 1 line
114 | Changed paths:
115 |    M /trunk/tabix/main.c
116 | 
117 | Disable "unknown target name or minus interval" warning.
118 | ------------------------------------------------------------------------
119 | r775 | petulda | 2010-10-26 15:02:30 -0400 (Tue, 26 Oct 2010) | 1 line
120 | Changed paths:
121 |    M /trunk/tabix/main.c
122 | 
123 | Added -h option to print header lines
124 | ------------------------------------------------------------------------
125 | r742 | jmarshall | 2010-09-27 06:47:23 -0400 (Mon, 27 Sep 2010) | 2 lines
126 | Changed paths:
127 |    M /trunk/tabix
128 | 
129 | Add svn:ignore properties for intermediate and generated files.
130 | 
131 | ------------------------------------------------------------------------
132 | r725 | lh3lh3 | 2010-09-15 13:01:53 -0400 (Wed, 15 Sep 2010) | 2 lines
133 | Changed paths:
134 |    M /trunk/tabix/bgzip.c
135 | 
136 | patches by Peter Chines
137 | 
138 | ------------------------------------------------------------------------
139 | r714 | lh3lh3 | 2010-09-07 10:13:25 -0400 (Tue, 07 Sep 2010) | 2 lines
140 | Changed paths:
141 |    M /trunk/tabix/TabixReader.java
142 |    M /trunk/tabix/index.c
143 |    M /trunk/tabix/main.c
144 | 
145 | fixed a bug in C/Java when n_off == 0
146 | 
147 | ------------------------------------------------------------------------
148 | r712 | lh3lh3 | 2010-09-03 09:21:23 -0400 (Fri, 03 Sep 2010) | 2 lines
149 | Changed paths:
150 |    M /trunk/tabix/TabixReader.java
151 | 
152 | fixed a bug in parsing region strings
153 | 
154 | ------------------------------------------------------------------------
155 | r700 | petulda | 2010-08-25 10:42:37 -0400 (Wed, 25 Aug 2010) | 1 line
156 | Changed paths:
157 |    M /trunk/tabix/main.c
158 | 
159 | Fix: Exit with an error rather than segfault when index is not present and region is queried
160 | ------------------------------------------------------------------------
161 | r696 | petulda | 2010-08-24 10:24:12 -0400 (Tue, 24 Aug 2010) | 1 line
162 | Changed paths:
163 |    M /trunk/tabix/bgzf.c
164 |    M /trunk/tabix/bgzf.h
165 |    M /trunk/tabix/index.c
166 |    M /trunk/tabix/main.c
167 | 
168 | Complain about not-bgzipped files and check for noncontinuous chromosome blocks
169 | ------------------------------------------------------------------------
170 | r603 | lh3lh3 | 2010-06-28 10:49:39 -0400 (Mon, 28 Jun 2010) | 2 lines
171 | Changed paths:
172 |    M /trunk/tabix/NEWS
173 |    M /trunk/tabix/TabixReader.java
174 |    M /trunk/tabix/index.c
175 |    M /trunk/tabix/main.c
176 | 
177 | Release tabix-0.2.2
178 | 
179 | ------------------------------------------------------------------------
180 | r597 | lh3lh3 | 2010-06-13 21:08:29 -0400 (Sun, 13 Jun 2010) | 3 lines
181 | Changed paths:
182 |    M /trunk/tabix/index.c
183 | 
184 | Change the namespace of sorting, to avoid function name collision with samtools.
185 | 
186 | 
187 | ------------------------------------------------------------------------
188 | r582 | lh3lh3 | 2010-06-03 10:40:25 -0400 (Thu, 03 Jun 2010) | 2 lines
189 | Changed paths:
190 |    M /trunk/tabix/NEWS
191 |    M /trunk/tabix/main.c
192 |    M /trunk/tabix/tabix.py
193 | 
194 | Release tabix-0.2.1
195 | 
196 | ------------------------------------------------------------------------
197 | r581 | lh3lh3 | 2010-05-24 14:24:24 -0400 (Mon, 24 May 2010) | 2 lines
198 | Changed paths:
199 |    M /trunk/tabix/tabix.py
200 | 
201 | OOP interface with the help from Aaron Quinlan
202 | 
203 | ------------------------------------------------------------------------
204 | r580 | lh3lh3 | 2010-05-23 23:36:05 -0400 (Sun, 23 May 2010) | 2 lines
205 | Changed paths:
206 |    M /trunk/tabix/tabix.py
207 | 
208 | minor change
209 | 
210 | ------------------------------------------------------------------------
211 | r579 | lh3lh3 | 2010-05-23 23:25:24 -0400 (Sun, 23 May 2010) | 2 lines
212 | Changed paths:
213 |    M /trunk/tabix/tabix.py
214 | 
215 | For Snow Leopard compatibility
216 | 
217 | ------------------------------------------------------------------------
218 | r575 | lh3lh3 | 2010-05-12 19:31:27 -0400 (Wed, 12 May 2010) | 4 lines
219 | Changed paths:
220 |    M /trunk/tabix/Makefile
221 |    M /trunk/tabix/index.c
222 |    M /trunk/tabix/tabix.h
223 |    A /trunk/tabix/tabix.py
224 | 
225 |  * optionally generate shared library for Mac and Linux
226 |  * added a python script that directly calls the shared library
227 |  * added a new API for easy python access
228 | 
229 | ------------------------------------------------------------------------
230 | r574 | lh3lh3 | 2010-05-11 12:14:27 -0400 (Tue, 11 May 2010) | 2 lines
231 | Changed paths:
232 |    M /trunk/tabix/ChangeLog
233 |    M /trunk/tabix/NEWS
234 |    M /trunk/tabix/perl/Tabix.pm
235 |    M /trunk/tabix/perl/TabixIterator.pm
236 |    M /trunk/tabix/tabix.1
237 | 
238 | Release tabix-0.2.0
239 | 
240 | ------------------------------------------------------------------------
241 | r573 | lh3lh3 | 2010-05-11 12:08:30 -0400 (Tue, 11 May 2010) | 2 lines
242 | Changed paths:
243 |    M /trunk/tabix/Makefile
244 | 
245 | Added -fPIC
246 | 
247 | ------------------------------------------------------------------------
248 | r572 | lh3lh3 | 2010-05-11 11:59:07 -0400 (Tue, 11 May 2010) | 2 lines
249 | Changed paths:
250 |    M /trunk/tabix/perl/MANIFEST
251 | 
252 | update
253 | 
254 | ------------------------------------------------------------------------
255 | r571 | lh3lh3 | 2010-05-11 11:56:54 -0400 (Tue, 11 May 2010) | 4 lines
256 | Changed paths:
257 |    A /trunk/tabix/example.gtf.gz
258 |    A /trunk/tabix/example.gtf.gz.tbi
259 |    M /trunk/tabix/index.c
260 |    M /trunk/tabix/main.c
261 |    M /trunk/tabix/perl/MANIFEST
262 |    M /trunk/tabix/perl/Tabix.pm
263 |    M /trunk/tabix/perl/Tabix.xs
264 |    A /trunk/tabix/perl/TabixIterator.pm
265 |    A /trunk/tabix/perl/t
266 |    A /trunk/tabix/perl/t/01local.t
267 |    A /trunk/tabix/perl/t/02remote.t
268 |    M /trunk/tabix/tabix.1
269 |    M /trunk/tabix/tabix.h
270 | 
271 |  * improved C/Perl APIs
272 |  * added test for Perl
273 |  * added an tiny example
274 | 
275 | ------------------------------------------------------------------------
276 | r570 | lh3lh3 | 2010-05-11 01:04:21 -0400 (Tue, 11 May 2010) | 2 lines
277 | Changed paths:
278 |    M /trunk/tabix/TabixReader.java
279 | 
280 | fixed the same issue in java
281 | 
282 | ------------------------------------------------------------------------
283 | r569 | lh3lh3 | 2010-05-11 01:03:24 -0400 (Tue, 11 May 2010) | 3 lines
284 | Changed paths:
285 |    M /trunk/tabix/index.c
286 |    M /trunk/tabix/perl/Tabix.pm
287 |    M /trunk/tabix/perl/Tabix.xs
288 | 
289 |  * fixed a potential issue in index.c
290 |  * improve perl APIs
291 | 
292 | ------------------------------------------------------------------------
293 | r568 | lh3lh3 | 2010-05-10 23:46:21 -0400 (Mon, 10 May 2010) | 2 lines
294 | Changed paths:
295 |    M /trunk/tabix/perl/Tabix.xs
296 | 
297 | return an array from get_names()
298 | 
299 | ------------------------------------------------------------------------
300 | r567 | lh3lh3 | 2010-05-10 23:38:46 -0400 (Mon, 10 May 2010) | 4 lines
301 | Changed paths:
302 |    M /trunk/tabix/TabixReader.java
303 |    M /trunk/tabix/index.c
304 |    A /trunk/tabix/perl
305 |    A /trunk/tabix/perl/MANIFEST
306 |    A /trunk/tabix/perl/Makefile.PL
307 |    A /trunk/tabix/perl/Tabix.pm
308 |    A /trunk/tabix/perl/Tabix.xs
309 |    A /trunk/tabix/perl/typemap
310 |    M /trunk/tabix/tabix.h
311 | 
312 |  * added the initial perl binding. The interface needs to be improved.
313 |  * added a new API for perl binding
314 |  * fixed a potential bug in java.
315 | 
316 | ------------------------------------------------------------------------
317 | r565 | lh3lh3 | 2010-05-09 23:24:35 -0400 (Sun, 09 May 2010) | 2 lines
318 | Changed paths:
319 |    M /trunk/tabix/main.c
320 | 
321 | Release tabix-0.1.6
322 | 
323 | ------------------------------------------------------------------------
324 | r564 | lh3lh3 | 2010-05-09 23:01:49 -0400 (Sun, 09 May 2010) | 2 lines
325 | Changed paths:
326 |    M /trunk/tabix/index.c
327 | 
328 | fixed a typo
329 | 
330 | ------------------------------------------------------------------------
331 | r563 | lh3lh3 | 2010-05-09 22:58:26 -0400 (Sun, 09 May 2010) | 2 lines
332 | Changed paths:
333 |    A /trunk/tabix/ChangeLog
334 |    M /trunk/tabix/NEWS
335 |    M /trunk/tabix/index.c
336 |    M /trunk/tabix/main.c
337 |    M /trunk/tabix/tabix.h
338 | 
339 | If nothing bad happens, this will become 0.1.6
340 | 
341 | ------------------------------------------------------------------------
342 | r562 | lh3lh3 | 2010-05-09 19:43:56 -0400 (Sun, 09 May 2010) | 2 lines
343 | Changed paths:
344 |    M /trunk/tabix/index.c
345 | 
346 | Fixed a bug
347 | 
348 | ------------------------------------------------------------------------
349 | r560 | lh3lh3 | 2010-05-05 10:59:09 -0400 (Wed, 05 May 2010) | 3 lines
350 | Changed paths:
351 |    A /trunk/tabix/NEWS
352 |    M /trunk/tabix/TabixReader.java
353 |    M /trunk/tabix/index.c
354 |    M /trunk/tabix/main.c
355 |    M /trunk/tabix/tabix.1
356 |    M /trunk/tabix/tabix.h
357 | 
358 |  * Release tabix-0.1.5 (r560)
359 |  * Improve seeking efficiency. Index file needs to be rebuilt.
360 | 
361 | ------------------------------------------------------------------------
362 | r559 | lh3lh3 | 2010-05-04 23:11:42 -0400 (Tue, 04 May 2010) | 2 lines
363 | Changed paths:
364 |    M /trunk/tabix/main.c
365 | 
366 | Release tabix-0.1.4 (r559)
367 | 
368 | ------------------------------------------------------------------------
369 | r558 | lh3lh3 | 2010-05-01 12:48:01 -0400 (Sat, 01 May 2010) | 2 lines
370 | Changed paths:
371 |    M /trunk/tabix/TabixReader.java
372 | 
373 | implement SAM/VCF support; NOT tested yet
374 | 
375 | ------------------------------------------------------------------------
376 | r557 | lh3lh3 | 2010-05-01 00:42:34 -0400 (Sat, 01 May 2010) | 2 lines
377 | Changed paths:
378 |    A /trunk/tabix/TabixReader.java
379 | 
380 | The Java implementation of tabix.
381 | 
382 | ------------------------------------------------------------------------
383 | r556 | lh3lh3 | 2010-04-30 22:34:07 -0400 (Fri, 30 Apr 2010) | 4 lines
384 | Changed paths:
385 |    M /trunk/tabix/index.c
386 |    M /trunk/tabix/knetfile.c
387 |    M /trunk/tabix/main.c
388 | 
389 |  * tabix-0.1.3-3 (r556)
390 |  * fixed a small memory leak in knetfile
391 |  * fixed a minor bug for remote downloading
392 | 
393 | ------------------------------------------------------------------------
394 | r555 | lh3lh3 | 2010-04-30 22:15:12 -0400 (Fri, 30 Apr 2010) | 4 lines
395 | Changed paths:
396 |    M /trunk/tabix/Makefile
397 |    M /trunk/tabix/index.c
398 |    M /trunk/tabix/main.c
399 | 
400 |  * tabix-0.1.3-2 (r555)
401 |  * do not overwrite index file by default
402 |  * a little code cleanup
403 | 
404 | ------------------------------------------------------------------------
405 | r554 | lh3lh3 | 2010-04-30 21:44:31 -0400 (Fri, 30 Apr 2010) | 2 lines
406 | Changed paths:
407 |    M /trunk/tabix/index.c
408 | 
409 | fixed a potential bug for UCSC-like coordinates
410 | 
411 | ------------------------------------------------------------------------
412 | r553 | lh3lh3 | 2010-04-28 17:43:41 -0400 (Wed, 28 Apr 2010) | 2 lines
413 | Changed paths:
414 |    M /trunk/tabix/tabix.tex
415 | 
416 | minor clarification to the format spec
417 | 
418 | ------------------------------------------------------------------------
419 | r552 | lh3lh3 | 2010-04-28 16:33:07 -0400 (Wed, 28 Apr 2010) | 3 lines
420 | Changed paths:
421 |    M /trunk/tabix/Makefile
422 |    M /trunk/tabix/bgzip.c
423 |    A /trunk/tabix/tabix.tex
424 | 
425 |  * added the format specification
426 |  * fixed a typo in bgzip
427 | 
428 | ------------------------------------------------------------------------
429 | r550 | petulda | 2010-04-22 11:03:24 -0400 (Thu, 22 Apr 2010) | 1 line
430 | Changed paths:
431 |    M /trunk/tabix/bgzip.c
432 | 
433 | The behaviour changed slightly to mimic gzip: detect whether the standard file descriptors are connected to a terminal.
434 | ------------------------------------------------------------------------
435 | r549 | petulda | 2010-04-22 09:46:10 -0400 (Thu, 22 Apr 2010) | 1 line
436 | Changed paths:
437 |    M /trunk/tabix/bgzip.c
438 | 
439 | Fix in src/dst file detection and slight change of behaviour
440 | ------------------------------------------------------------------------
441 | r548 | petulda | 2010-04-19 04:39:46 -0400 (Mon, 19 Apr 2010) | 1 line
442 | Changed paths:
443 |    M /trunk/tabix/index.c
444 | 
445 | Close file descriptor in ti_list_chromosomes
446 | ------------------------------------------------------------------------
447 | r547 | petulda | 2010-04-16 09:27:11 -0400 (Fri, 16 Apr 2010) | 1 line
448 | Changed paths:
449 |    M /trunk/tabix/index.c
450 |    M /trunk/tabix/main.c
451 |    M /trunk/tabix/tabix.h
452 | 
453 | Added the -l option for listing chromosomes
454 | ------------------------------------------------------------------------
455 | r544 | lh3lh3 | 2010-03-29 10:58:48 -0400 (Mon, 29 Mar 2010) | 2 lines
456 | Changed paths:
457 |    M /trunk/tabix/main.c
458 | 
459 | removed a line of debugging code
460 | 
461 | ------------------------------------------------------------------------
462 | r543 | lh3lh3 | 2010-03-19 12:29:16 -0400 (Fri, 19 Mar 2010) | 3 lines
463 | Changed paths:
464 |    M /trunk/tabix/index.c
465 |    M /trunk/tabix/main.c
466 |    M /trunk/tabix/tabix.1
467 | 
468 |  * tabix-0.1.3 (r543)
469 |  * fixed another off-by-one bug
470 | 
471 | ------------------------------------------------------------------------
472 | r542 | lh3lh3 | 2010-03-16 22:35:52 -0400 (Tue, 16 Mar 2010) | 2 lines
473 | Changed paths:
474 |    M /trunk/tabix/index.c
475 |    M /trunk/tabix/main.c
476 |    M /trunk/tabix/tabix.1
477 | 
478 | Release tabix-0.1.1
479 | 
480 | ------------------------------------------------------------------------
481 | r506 | lh3lh3 | 2009-11-02 23:20:12 -0500 (Mon, 02 Nov 2009) | 2 lines
482 | Changed paths:
483 |    M /trunk/tabix/main.c
484 | 
485 | Release tabix-0.1.0
486 | 
487 | ------------------------------------------------------------------------
488 | r505 | lh3lh3 | 2009-11-02 23:15:49 -0500 (Mon, 02 Nov 2009) | 2 lines
489 | Changed paths:
490 |    A /trunk/tabix/tabix.1
491 | 
492 | documentation
493 | 
494 | ------------------------------------------------------------------------
495 | r504 | lh3lh3 | 2009-11-02 11:08:18 -0500 (Mon, 02 Nov 2009) | 5 lines
496 | Changed paths:
497 |    M /trunk/tabix/Makefile
498 |    M /trunk/tabix/bgzip.c
499 |    M /trunk/tabix/index.c
500 |    M /trunk/tabix/main.c
501 |    M /trunk/tabix/tabix.h
502 | 
503 |  * tabix-0.0.0-5 (r504)
504 |  * fixed a critical bug in fetching data (a typo in fact)
505 |  * support SAM (tested on ex1.sam) and VCF (not tested)
506 |  * improve the command-line interface
507 | 
508 | ------------------------------------------------------------------------
509 | r503 | lh3lh3 | 2009-11-02 10:04:43 -0500 (Mon, 02 Nov 2009) | 3 lines
510 | Changed paths:
511 |    M /trunk/tabix/Makefile
512 |    M /trunk/tabix/index.c
513 |    M /trunk/tabix/main.c
514 | 
515 |  * tabix-0.0.0-4 (r503)
516 |  * index files are bgzf compressed
517 | 
518 | ------------------------------------------------------------------------
519 | r502 | lh3lh3 | 2009-11-02 09:47:25 -0500 (Mon, 02 Nov 2009) | 4 lines
520 | Changed paths:
521 |    M /trunk/tabix/index.c
522 |    M /trunk/tabix/main.c
523 |    M /trunk/tabix/tabix.h
524 | 
525 |  * tabix-0.0.0-3 (r502)
526 |  * support meta lines (not tested)
527 |  * I am going to make the index file in the BGZF format
528 | 
529 | ------------------------------------------------------------------------
530 | r501 | lh3lh3 | 2009-11-01 22:03:07 -0500 (Sun, 01 Nov 2009) | 3 lines
531 | Changed paths:
532 |    M /trunk/tabix/Makefile
533 |    M /trunk/tabix/bgzf.h
534 |    M /trunk/tabix/index.c
535 |    M /trunk/tabix/main.c
536 | 
537 |  * tabix-0.0.0-2 (r501)
538 |  * accelerate ti_readline()
539 | 
540 | ------------------------------------------------------------------------
541 | r500 | lh3lh3 | 2009-11-01 20:49:52 -0500 (Sun, 01 Nov 2009) | 3 lines
542 | Changed paths:
543 |    M /trunk/tabix/Makefile
544 |    M /trunk/tabix/bgzip.c
545 |    M /trunk/tabix/index.c
546 |    M /trunk/tabix/main.c
547 | 
548 |  * tabix-0.0.0-1 (r500)
549 |  * apparently working
550 | 
551 | ------------------------------------------------------------------------
552 | r499 | lh3lh3 | 2009-11-01 14:04:52 -0500 (Sun, 01 Nov 2009) | 2 lines
553 | Changed paths:
554 |    D /trunk/tabix/parser.c
555 | 
556 | obsolete file
557 | 
558 | ------------------------------------------------------------------------
559 | r498 | lh3lh3 | 2009-11-01 14:04:08 -0500 (Sun, 01 Nov 2009) | 2 lines
560 | Changed paths:
561 |    M /trunk/tabix/bgzip.c
562 | 
563 | bgzip is more like gzip in its command-line interface
564 | 
565 | ------------------------------------------------------------------------
566 | r497 | lh3lh3 | 2009-11-01 13:43:35 -0500 (Sun, 01 Nov 2009) | 2 lines
567 | Changed paths:
568 |    A /trunk/tabix/Makefile
569 |    A /trunk/tabix/bam_endian.h
570 |    A /trunk/tabix/bgzf.c
571 |    A /trunk/tabix/bgzf.h
572 |    A /trunk/tabix/bgzip.c
573 |    A /trunk/tabix/index.c
574 |    A /trunk/tabix/khash.h
575 |    A /trunk/tabix/knetfile.c
576 |    A /trunk/tabix/knetfile.h
577 |    A /trunk/tabix/ksort.h
578 |    A /trunk/tabix/kstring.c
579 |    A /trunk/tabix/kstring.h
580 |    A /trunk/tabix/main.c
581 |    A /trunk/tabix/parser.c
582 |    A /trunk/tabix/tabix.h
583 | 
584 | initial source code. It is BUGGY!
585 | 
586 | ------------------------------------------------------------------------
587 | r496 | lh3lh3 | 2009-11-01 13:42:39 -0500 (Sun, 01 Nov 2009) | 2 lines
588 | Changed paths:
589 |    A /trunk/tabix
590 | 
591 | A generic indexer for TAB-delimited genome position files
592 | 
593 | ------------------------------------------------------------------------
594 | 


--------------------------------------------------------------------------------