├── DATA ├── HG00110.gatk.vcf.filteredhets.gz └── readme.md ├── Makefile ├── PCRdups.c ├── README.md ├── estimate_PCRduprate.py ├── extra_functions.py └── parsebam ├── Makefile ├── bamread.c ├── bamread.h ├── extracthairs.c ├── hapfragments.c ├── hapfragments.h ├── hashtable.c ├── hashtable.h ├── parsebamread.c ├── readfasta.c ├── readfasta.h ├── readvariant.c ├── readvariant.h └── samtools-0.1.18 ├── AUTHORS ├── COPYING ├── ChangeLog ├── INSTALL ├── Makefile ├── Makefile.mingw ├── NEWS ├── bam.c ├── bam.h ├── bam2bcf.c ├── bam2bcf.h ├── bam2bcf_indel.c ├── bam2depth.c ├── bam_aux.c ├── bam_cat.c ├── bam_color.c ├── bam_endian.h ├── bam_import.c ├── bam_index.c ├── bam_lpileup.c ├── bam_mate.c ├── bam_md.c ├── bam_pileup.c ├── bam_plcmd.c ├── bam_reheader.c ├── bam_rmdup.c ├── bam_rmdupse.c ├── bam_sort.c ├── bam_stat.c ├── bam_tview.c ├── bamtk.c ├── bcftools ├── Makefile ├── README ├── bcf.c ├── bcf.h ├── bcf.tex ├── bcf2qcall.c ├── bcfutils.c ├── call1.c ├── em.c ├── fet.c ├── index.c ├── kfunc.c ├── kmin.c ├── kmin.h ├── main.c ├── mut.c ├── prob1.c ├── prob1.h ├── vcf.c └── vcfutils.pl ├── bedidx.c ├── bgzf.c ├── bgzf.h ├── bgzip.c ├── cut_target.c ├── errmod.c ├── errmod.h ├── examples ├── 00README.txt ├── Makefile ├── bam2bed.c ├── calDepth.c ├── ex1.fa ├── ex1.sam.gz ├── toy.fa └── toy.sam ├── faidx.c ├── faidx.h ├── kaln.c ├── kaln.h ├── khash.h ├── klist.h ├── knetfile.c ├── knetfile.h ├── kprobaln.c ├── kprobaln.h ├── kseq.h ├── ksort.h ├── kstring.c ├── kstring.h ├── misc ├── HmmGlocal.java ├── Makefile ├── blast2sam.pl ├── bowtie2sam.pl ├── export2sam.pl ├── interpolate_sam.pl ├── maq2sam.c ├── md5.c ├── md5.h ├── md5fa.c ├── novo2sam.pl ├── psl2sam.pl ├── sam2vcf.pl ├── samtools.pl ├── seqtk.c ├── soap2sam.pl ├── varfilter.py ├── wgsim.c ├── wgsim_eval.pl └── zoom2sam.pl ├── phase.c ├── razf.c ├── razf.h ├── razip.c ├── sam.c ├── sam.h ├── sam_header.c ├── sam_header.h ├── sam_view.c ├── sample.c ├── sample.h ├── samtools.1 └── win32 ├── xcurses.h ├── 
zconf.h └── zlib.h /DATA/HG00110.gatk.vcf.filteredhets.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vibansal/PCRduplicates/f4462916f8b73273f3dfc1ebc8eb145829003121/DATA/HG00110.gatk.vcf.filteredhets.gz -------------------------------------------------------------------------------- /DATA/readme.md: -------------------------------------------------------------------------------- 1 | 2 | Download exome bam file for an individual HG00110 from the 1000 Genomes Project: 3 | 4 | ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/HG00110/exome_alignment/HG00110.mapped.ILLUMINA.bwa.GBR.exome.20121211.bam 5 | ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/HG00110/exome_alignment/HG00110.mapped.ILLUMINA.bwa.GBR.exome.20121211.bam.bai 6 | 7 | The compressed VCF file for this individual is available in this folder: HG00110.gatk.vcf.filteredhets.gz 8 | 9 | unzip it using gunzip to use as input 10 | 11 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | #CC=gcc -Wall 3 | CC=gcc -D_GNU_SOURCE 4 | CFLAGS=-c -Wall 5 | SAMTOOLS=parsebam/samtools-0.1.18/ 6 | HAPCUT=parsebam/ 7 | 8 | all: 9 | $(MAKE) -C parsebam/samtools-0.1.18 all 10 | $(MAKE) -C parsebam hairs 11 | $(MAKE) PCR 12 | 13 | PCR: PCRdups.c 14 | $(CC) -I$(SAMTOOLS) -I$(HAPCUT) -g -O2 parsebam/bamread.o parsebam/hapfragments.o parsebam/hashtable.o parsebam/readfasta.o parsebam/readvariant.o -o extract_duplicates PCRdups.c -L$(HAPCUT) -L$(SAMTOOLS) -lbam -lm -lz 15 | 16 | clean: 17 | $(MAKE) -C parsebam/samtools-0.1.18 clean 18 | $(MAKE) -C parsebam clean 19 | rm -f extract_duplicates 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Computational method to estimate the PCR 
duplication rate in high-throughput DNA sequencing experiments 2 | 3 | PCR amplification is an important step in the preparation of DNA sequencing libraries prior to high-throughput sequencing. Existing computational methods for analysis of read duplicates assume that all read duplicates arise due to PCR amplification. However, a high rate of read duplicates is observed in deep sequencing experiments or experiments such as RNA-seq. We present a computational method that exploits the heterozygosity in diploid genomes to estimate the PCR duplication rate accounting for read duplicates that are not due to PCR amplification. 4 | 5 | A paper describing this method has been published in BMC Bioinformatics, March 2017: http://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-017-1471-9 6 | 7 | #### INPUT files for running program 8 | 1. coordinate sorted BAM file with aligned reads 9 | 2. VCF file with heterozygous variants called from BAM file using a variant calling tool such as GATK UnifiedGenotyper or samtools 10 | 11 | #### compile the CODE 12 | 13 | run 'make all' 14 | 15 | #### Two step process to extract reads overlapping variant sites and then analyze read clusters to estimate PCR duplication rate 16 | 17 | 1. ./extract\_duplicates --bam sample.bam --VCF variants.VCF > sample.hetreads 18 | 2. python estimate\_PCRduprate.py -i sample.hetreads -f exome > sample.PCRdups 19 | 20 | 21 | #### obtain FINAL estimate of the PCR duplication rate 22 | 23 | grep FINAL\_PCR\_RATE sample.PCRdups > sample.PCRduprate.estimate 24 | 25 | #### Sample Data 26 | 27 | see DATA folder 28 | 29 | 30 | #### FAQ 31 | 32 | 1. The program uses samtools (v 0.1.18) to parse BAM files. The source code for samtools is included in the github repository (directory parsebam/samtools-0.1.18). 33 | 2. The program has been tested on exome-seq, targeted DNA seq and RNA-seq datasets. For RNA-seq, an independent set of heterozygous variants is needed. 
34 | -------------------------------------------------------------------------------- /parsebam/Makefile: -------------------------------------------------------------------------------- 1 | 2 | #CC=gcc -Wall 3 | CC=gcc -D_GNU_SOURCE 4 | CFLAGS=-c -Wall 5 | SAMTOOLS=samtools-0.1.18 6 | 7 | hairs: bamread.o hashtable.o readvariant.o readfasta.o hapfragments.o extracthairs.c parsebamread.c 8 | $(CC) -I$(SAMTOOLS) -g -O2 bamread.o hapfragments.o hashtable.o readfasta.o readvariant.o -o extractHAIRS extracthairs.c -L$(SAMTOOLS) -lbam -lm -lz 9 | 10 | hapfragments.o: hapfragments.c hapfragments.h readvariant.h 11 | $(CC) -c hapfragments.c 12 | 13 | readvariant.o: readvariant.c readvariant.h hashtable.h hashtable.c 14 | $(CC) -c readvariant.c 15 | 16 | bamread.o: bamread.h bamread.c readfasta.h readfasta.c 17 | $(CC) -I$(SAMTOOLS) -c bamread.c 18 | 19 | hashtable.o: hashtable.h hashtable.c 20 | $(CC) -c hashtable.c 21 | 22 | readfasta.o: readfasta.c readfasta.h 23 | $(CC) -c readfasta.c 24 | 25 | clean: 26 | rm -f bamread.o readfasta.o readvariant.o hapfragments.o hashtable.o extractHAIRS 27 | -------------------------------------------------------------------------------- /parsebam/bamread.c: -------------------------------------------------------------------------------- 1 | #include "bamread.h" 2 | 3 | char INT_CIGAROP[] = {'M','I','D','N','S','H','P','E','X'}; 4 | 5 | 6 | int QVoffset = 33; 7 | 8 | int fetch_func(const bam1_t *b, void *data,struct alignedread* read) 9 | { 10 | samfile_t *fp = (samfile_t*)data; uint32_t *cigar = bam1_cigar(b); const bam1_core_t *c = &b->core; 11 | int i,op,ol; 12 | read->cigs =0; read->alignedbases = 0; read->clipped =0; read->span =0; read->gapped =0; read->cflag =0; 13 | read->readlength= b->core.l_qseq; 14 | read->sequence = (char*)malloc(b->core.l_qseq+1); read->quality = (char*)malloc(b->core.l_qseq+1); 15 | uint8_t* sequence = bam1_seq(b); uint8_t* quality = bam1_qual(b); 16 | for (i=0;icore.l_qseq;i++) read->sequence[i] = 
bam_nt16_rev_table[bam1_seqi(sequence,i)]; read->sequence[i] = '\0'; 17 | if (quality[0] == 255) // quality string is missing, 01/29/2014, quality is set to minimum quality value specified using --minq 18 | { 19 | for (i=0;icore.l_qseq;i++) read->quality[i] = (char)(MINQ+33); read->quality[i] = '\0'; 20 | } 21 | else 22 | { 23 | for (i=0;icore.l_qseq;i++) read->quality[i] = (char)(quality[i]+33); read->quality[i] = '\0'; 24 | } 25 | //fprintf(stderr,"quality |%d| \n",quality[1]); 26 | 27 | read->flag = c->flag; read->mquality= c->qual; read->position = c->pos+1; read->mateposition = c->mpos+1; read->IS = c->isize; 28 | read->strand = '+'; if ((read->flag & 16) == 16) read->strand = '-'; // fixed sept 29 2011 29 | 30 | read->cigarlist = (int*)malloc(sizeof(int)*c->n_cigar); read->cigs =c->n_cigar; 31 | for (i = 0; i < c->n_cigar; ++i) 32 | { 33 | read->cigarlist[i] = cigar[i]; 34 | op = cigar[i]&0xf; ol = cigar[i]>>4; 35 | if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) 36 | { 37 | read->alignedbases += ol; read->span += ol; 38 | } 39 | else if (op == BAM_CDEL) 40 | { 41 | read->gapped +=1; read->span += ol; 42 | } 43 | else if (op == BAM_CINS) 44 | { 45 | read->alignedbases += ol; read->gapped += 1; 46 | } 47 | else if (op == BAM_CREF_SKIP) read->span += ol; 48 | else if (op == BAM_CSOFT_CLIP) read->clipped += ol; 49 | else if (op == BAM_CHARD_CLIP) {} 50 | else read->cflag = 1; 51 | } 52 | // fprintf(stderr," read IS %d \n",c->isize); 53 | 54 | //if (read->mquality >= 60) read->mquality = 60; // cap it at 60 april 18 2012 55 | read->readid=(char*)malloc(c->l_qname+1); char* qs = b->data; 56 | for (i=0;il_qname;i++) read->readid[i] = qs[i]; read->readid[i]= '\0'; 57 | 58 | if (c->tid >= 0) read->chrom = fp->header->target_name[c->tid]; else read->chrom = NULL; 59 | if (c->mtid >= 0) read->matechrom = fp->header->target_name[c->mtid]; else read->matechrom = NULL; 60 | read->tid = c->tid; read->mtid = c->mtid; 61 | //fprintf(stdout,"%s %s %d 
%d\n",read->chrom,read->matechrom,read->IS,c->mtid); 62 | // for MAQ bam files, mtid is not set resulting in lack of paired-end reads, may 1 2012 63 | return 0; 64 | } 65 | 66 | void free_readmemory(struct alignedread* read) 67 | { 68 | free(read->readid); free(read->sequence); free(read->quality); 69 | if (read->cigs > 0) free(read->cigarlist); 70 | } 71 | 72 | 73 | -------------------------------------------------------------------------------- /parsebam/bamread.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef INC_samread_H 3 | #define INC_samread_H 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "readfasta.h" 10 | #include "sam.h" 11 | 12 | extern int QVoffset; 13 | extern int MINQ; 14 | extern int MISSING_QV; 15 | 16 | extern char INT_CIGAROP[]; 17 | 18 | 19 | struct alignedread 20 | { 21 | int readlength; 22 | char* readid; 23 | char* chrom; char* matechrom; 24 | int matech; char matestrand; 25 | int flag; int position; int mquality; int mateposition; int IS; 26 | char* sequence; char* quality; 27 | char strand; 28 | int* cigarlist; int cigs; 29 | int mismatches; int indels; // no of mismatches and no of insertions/deletions 30 | int alignedbases; int clipped, gapped; 31 | int cflag; int span; 32 | int tid; int mtid; // matetid 33 | 34 | int findex; // index in array of fragments 35 | int mateindex; // index in array of reads of mate 36 | int blockid; int cluster; 37 | 38 | }; 39 | 40 | int fetch_func(const bam1_t *b, void *data,struct alignedread* read); 41 | 42 | void free_readmemory(struct alignedread* read); 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /parsebam/hapfragments.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _HAPFRAGMENT_H 3 | #define _HAPFRAGMENT_H 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "readvariant.h" 10 | 11 | extern int 
SINGLEREADS; 12 | typedef struct 13 | { 14 | char allele; char qv; int varid; // allele is 0/1 varid is index to varlist[varid] gives all information about the variant 15 | } allele; 16 | 17 | typedef struct 18 | { 19 | char* id; int variants; allele* alist; 20 | int blocks; int paired; int matepos; 21 | 22 | } FRAGMENT; 23 | 24 | int compare_fragments(const void *a,const void *b); 25 | 26 | int compare_alleles(const void *a,const void *b); 27 | 28 | int print_fragment(FRAGMENT* fragment,VARIANT* varlist,FILE* outfile); 29 | 30 | // make sure they are in the correct order, i+1 could be < i 31 | int print_matepair(FRAGMENT* f1, FRAGMENT* f2,VARIANT* varlist,FILE* outfile); 32 | 33 | void clean_fragmentlist(FRAGMENT* flist,int* fragments,VARIANT* varlist,int currchrom,int currpos,int prevchrom); 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /parsebam/hashtable.c: -------------------------------------------------------------------------------- 1 | #include "hashtable.h" 2 | 3 | // this is a basic hashtable that was designed for mapping chromosomes to integers (chrx - 0) 4 | // used in extract_hairs for haplotype assembly as well as indel realignment 5 | 6 | // simple hash function that takes a string and returns an integer hash value 7 | int hashstring(char* str,int htsize) 8 | { 9 | unsigned long hash= 5381; int c; 10 | while (c = *str++) { hash = ((hash<<5)+hash) + c; if (hash >= htsize) hash = hash%htsize; } 11 | return hash; 12 | } 13 | 14 | void init_hashtable(HASHTABLE* ht) 15 | { 16 | int i=0; 17 | ht->bucketlengths = (int*)malloc(sizeof(int)*ht->htsize); 18 | for (i=0;ihtsize;i++) ht->bucketlengths[i] = 0; 19 | ht->blist = (keyvalue**)malloc(sizeof(keyvalue*)*ht->htsize); 20 | for (i=0;ihtsize;i++) ht->blist[i] = NULL; //(keyvalue*)malloc(sizeof(keyvalue)*20); 21 | } 22 | 23 | int insert_keyvalue(HASHTABLE* ht, char* key,int slen,int value) 24 | { 25 | int hash = hashstring(key,ht->htsize); 26 | 
keyvalue* tempkey = (keyvalue*)malloc(sizeof(keyvalue)); 27 | tempkey->value = value; tempkey->key = (char*)malloc(slen+1); 28 | int i=0; for (i=0;ikey[i] = key[i]; tempkey->key[i] = '\0'; 29 | tempkey->next = ht->blist[hash]; ht->blist[hash] = tempkey; 30 | ht->bucketlengths[hash]++; 31 | return 1; 32 | } 33 | 34 | int getindex(HASHTABLE* ht,char* chrom) 35 | { 36 | int hash = hashstring(chrom,ht->htsize); keypointer = ht->blist[hash]; 37 | while (keypointer != NULL) 38 | { 39 | if (strcmp(keypointer->key,chrom) ==0) return keypointer->value; 40 | keypointer = keypointer->next; 41 | } 42 | return -1; 43 | } 44 | 45 | 46 | -------------------------------------------------------------------------------- /parsebam/hashtable.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef INC_hashtable_H 3 | #define INC_hashtable_H 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | typedef struct keyvalue 10 | { 11 | char* key; int value; struct keyvalue* next; 12 | } keyvalue; 13 | 14 | keyvalue* keypointer; 15 | 16 | typedef struct HASHTABLE 17 | { 18 | int htsize; // prime number that is also the size of HASHTABLE 19 | int* bucketlengths; // length of each bucket initially 0 20 | keyvalue** blist; // each bucket is a list of (key,value) pairs, HASHTABLE is an array of buckets 21 | } HASHTABLE; 22 | 23 | int hashstring(char* str,int htsize); 24 | 25 | void init_hashtable(HASHTABLE* ht); 26 | 27 | int insert_keyvalue(HASHTABLE* ht, char* key,int slen,int value); 28 | 29 | int getindex(HASHTABLE* ht,char* chrom); 30 | 31 | #endif 32 | 33 | -------------------------------------------------------------------------------- /parsebam/readfasta.h: -------------------------------------------------------------------------------- 1 | #ifndef INC_readfasta_H 2 | #define INC_readfasta_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | typedef struct 10 | { 11 | int chrom; int start; int end; char* annotation; 12 
| // chrom indexes into REFLIST names 13 | } INTERVAL; 14 | // for storing list of intervals read from bedfile 15 | 16 | // structure for chromosome/sequences/contigs 17 | typedef struct 18 | { 19 | int ns; // no of sequences 20 | char** names; int* lengths; unsigned char** sequences; // changed to unsigned char july 5 2012 to avoid warnings in kmertable.c 21 | uint64_t* offsets; // from fasta index file for each chromosome 22 | 23 | int current; int* lookup; // current variable added for indexing into REFLIST// july 20 2011 24 | char fastafile[1024]; // name of fasta file 25 | 26 | INTERVAL* intervallist; int intervals; 27 | int* first_interval_chrom; // index to first interval for each chromosome in interval list 28 | int cinterval; // current interval that is closest to the current base being examined for variant calling 29 | FILE* fp; // file pointer to fastafile kept open to read one chromosome at a time 30 | 31 | } REFLIST; 32 | 33 | 34 | int read_fastaheader(char* fastafile,REFLIST* reflist); 35 | int read_fasta(char* seqfile, REFLIST* reflist); 36 | int read_chromosome(REFLIST* reflist,int chrom,FILE* fp); 37 | int read_next_chromosome(REFLIST* reflist,int chrom,FILE* fp); 38 | int read_bedfile(char* bedfile,REFLIST* reflist); 39 | REFLIST* init_reflist(char* fastafile,REFLIST* reflist); // initialize reflist 40 | 41 | int read_chromosome_mask(REFLIST* reflist,int chrom,FILE* fp); 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /parsebam/readvariant.h: -------------------------------------------------------------------------------- 1 | #ifndef _READVARIANT_H 2 | #define _READVARIANT_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "hashtable.h" 9 | #include "readfasta.h" 10 | //#define _GNU_SOURCE 11 | extern FILE* fragment_file; // FILE to which the fragments will be output, if NULL, output to stdout 12 | 13 | extern int TRI_ALLELIC; 14 | 15 | extern int BSIZE; 16 | 
extern int PRINT_FRAGMENTS; 17 | //int VARIANTS = 0; 18 | 19 | typedef struct 20 | { 21 | char* id; char* chrom; int position; 22 | short altalleles; char* RA; char* AA; // alternate alleles 23 | double* GLL; // genotype likelihoods added 11/25/13 24 | char* genotype; // encoded as integers 0 1 2 3 4 5 6 7 25 | short type; 26 | // changed this to char* on April 3 2012 27 | char* allele1; char* allele2; // temporary for SNPs 28 | char heterozygous; // only heterozygous variants will be used for printing out HAIRS 29 | int depth; int A1,A2; int H1,H2; 30 | // total reads covering this variant (haploid/diploid, A1-> reads supporting reference allele (single-read) 31 | // float L11,L12,L22; // genotype likelihoods for three possible genotypes 32 | } VARIANT; 33 | 34 | // information about the variants on each chromosome 35 | typedef struct 36 | { 37 | int variants; int first; int last; int blocks; 38 | int* intervalmap; 39 | } CHROMVARS; 40 | 41 | /* 42 | typedef struct 43 | { 44 | char allele; char qv; int varid; // allele is 0/1 varid is index to varlist[varid] gives all information about the variant 45 | } allele; 46 | 47 | typedef struct 48 | { 49 | char* id; int variants; allele* alist; 50 | int blocks; int paired; int matepos; 51 | 52 | } FRAGMENT; 53 | 54 | int compare_fragments(const void *a,const void *b); 55 | */ 56 | 57 | int count_variants(char* vcffile,char* sampleid,int* samplecol); 58 | 59 | int count_variants_oldformat(char* snpfile); 60 | int read_variantfile_oldformat(char* snpfile,VARIANT* varlist,HASHTABLE* ht,int snps); 61 | 62 | int parse_variant(VARIANT* variant, char* buffer,int samplecol); 63 | 64 | int read_variantfile(char* vcffile,VARIANT* varlist,HASHTABLE* ht,int* hetvariants,int samplecol); 65 | 66 | void build_intervalmap(CHROMVARS* chromvars,int chromosomes,VARIANT* varlist,int variants); 67 | 68 | int calculate_rightshift(VARIANT* varlist,int ss,REFLIST* reflist); 69 | 70 | //int get_chrom_name(struct alignedread* read,HASHTABLE* 
ht,REFLIST* reflist); 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/AUTHORS: -------------------------------------------------------------------------------- 1 | Heng Li from the Sanger Institute wrote most of the initial source codes 2 | of SAMtools and various converters. 3 | 4 | Bob Handsaker from the Broad Institute is a major contributor to the 5 | SAM/BAM specification. He designed and implemented the BGZF format, the 6 | underlying indexable compression format for the BAM format. BGZF does 7 | not support arithmetic between file offsets. 8 | 9 | Jue Ruan for the Beijing Genome Institute designed and implemented the 10 | RAZF format, an alternative indexable compression format. RAZF supports 11 | arithmetic between file offsets, at the cost of increased index file 12 | size and the full compatibility with gzip. RAZF is optional and only 13 | used in `faidx' for indexing RAZF compressed fasta files. 14 | 15 | Colin Hercus updated novo2sam.pl to support gapped alignment by 16 | novoalign. 17 | 18 | Petr Danecek contributed the header parsing library sam_header.c and 19 | sam2vcf.pl script and added knet support to the RAZF library. 20 | 21 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/COPYING: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2008-2009 Genome Research Ltd. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/INSTALL: -------------------------------------------------------------------------------- 1 | System Requirements 2 | =================== 3 | 4 | SAMtools depends on the zlib library . Version 1.2.3+ is 5 | preferred and with 1.2.3+ you can compile razip and use it to compress a FASTA 6 | file. SAMtools' faidx is able to index a razip-compressed FASTA file to save 7 | diskspace. Older zlib also works with SAMtools, but razip cannot be compiled. 8 | 9 | The text-based viewer (tview) requires the GNU ncurses library 10 | , which comes with Mac OS X and most of 11 | the modern Linux/Unix distributions. 
If you do not have this library installed, 12 | you can still compile the rest of SAMtools by manually changing: 13 | `-D_CURSES_LIB=1' to `-D_CURSES_LIB=0' at the line starting with `DFLAGS=', and 14 | comment out the line starting with `LIBCURSES='. 15 | 16 | 17 | Compilation 18 | =========== 19 | 20 | Type `make' to compile samtools. If you have zlib >= 1.2.2.1, you can compile 21 | razip with `make razip'. 22 | 23 | 24 | Installation 25 | ============ 26 | 27 | Copy `samtools', `bcftools/bcftools' and other executables/scripts in `misc' to 28 | a location you want (e.g. a directory in your $PATH). You may also copy 29 | `samtools.1' and `bcftools/bcftools.1' to a directory in your $MANPATH such 30 | that the `man' command may find the manual. 31 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/Makefile: -------------------------------------------------------------------------------- 1 | CC= gcc 2 | CFLAGS= -g -Wall -O2 #-m64 #-arch ppc 3 | DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=1 4 | KNETFILE_O= knetfile.o 5 | LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ 6 | bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o bedidx.o \ 7 | $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bam_cat.o 8 | AOBJS= bam_tview.o bam_plcmd.o sam_view.o \ 9 | bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ 10 | bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ 11 | cut_target.o phase.o bam2depth.o 12 | PROG= samtools 13 | INCLUDES= -I. 14 | SUBDIRS= . 
bcftools misc 15 | LIBPATH= 16 | LIBCURSES= -lcurses # -lXCurses 17 | 18 | .SUFFIXES:.c .o 19 | 20 | .c.o: 21 | $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ 22 | 23 | all-recur lib-recur clean-recur cleanlocal-recur install-recur: 24 | @target=`echo $@ | sed s/-recur//`; \ 25 | wdir=`pwd`; \ 26 | list='$(SUBDIRS)'; for subdir in $$list; do \ 27 | cd $$subdir; \ 28 | $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ 29 | INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \ 30 | cd $$wdir; \ 31 | done; 32 | 33 | all:$(PROG) 34 | 35 | .PHONY:all lib clean cleanlocal 36 | .PHONY:all-recur lib-recur clean-recur cleanlocal-recur install-recur 37 | 38 | lib:libbam.a 39 | 40 | libbam.a:$(LOBJS) 41 | $(AR) -csru $@ $(LOBJS) 42 | 43 | samtools:lib-recur $(AOBJS) 44 | $(CC) $(CFLAGS) -o $@ $(AOBJS) -Lbcftools $(LIBPATH) libbam.a -lbcf $(LIBCURSES) -lm -lz 45 | 46 | razip:razip.o razf.o $(KNETFILE_O) 47 | $(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz 48 | 49 | bgzip:bgzip.o bgzf.o $(KNETFILE_O) 50 | $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(KNETFILE_O) -lz 51 | 52 | razip.o:razf.h 53 | bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h 54 | sam.o:sam.h bam.h 55 | bam_import.o:bam.h kseq.h khash.h razf.h 56 | bam_pileup.o:bam.h razf.h ksort.h 57 | bam_plcmd.o:bam.h faidx.h bcftools/bcf.h bam2bcf.h 58 | bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h 59 | bam_lpileup.o:bam.h ksort.h 60 | bam_tview.o:bam.h faidx.h 61 | bam_sort.o:bam.h ksort.h razf.h 62 | bam_md.o:bam.h faidx.h 63 | sam_header.o:sam_header.h khash.h 64 | bcf.o:bcftools/bcf.h 65 | bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h 66 | bam2bcf_indel.o:bam2bcf.h 67 | errmod.o:errmod.h 68 | phase.o:bam.h khash.h ksort.h 69 | bamtk.o:bam.h 70 | 71 | faidx.o:faidx.h razf.h khash.h 72 | faidx_main.o:faidx.h razf.h 73 | 74 | 75 | libbam.1.dylib-local:$(LOBJS) 76 | libtool -dynamic $(LOBJS) -o libbam.1.dylib -lc -lz 77 | 78 | libbam.so.1-local:$(LOBJS) 79 | $(CC) -shared 
-Wl,-soname,libbam.so -o libbam.so.1 $(LOBJS) -lc -lz 80 | 81 | dylib: 82 | @$(MAKE) cleanlocal; \ 83 | case `uname` in \ 84 | Linux) $(MAKE) CFLAGS="$(CFLAGS) -fPIC" libbam.so.1-local;; \ 85 | Darwin) $(MAKE) CFLAGS="$(CFLAGS) -fPIC" libbam.1.dylib-local;; \ 86 | *) echo 'Unknown OS';; \ 87 | esac 88 | 89 | 90 | cleanlocal: 91 | rm -fr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib 92 | 93 | clean:cleanlocal-recur 94 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/Makefile.mingw: -------------------------------------------------------------------------------- 1 | CC= gcc.exe 2 | AR= ar.exe 3 | CFLAGS= -g -Wall -O2 4 | DFLAGS= -D_USE_KNETFILE -D_CURSES_LIB=2 5 | KNETFILE_O= knetfile.o 6 | LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ 7 | bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o \ 8 | $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bedidx.o 9 | AOBJS= bam_tview.o bam_plcmd.o sam_view.o \ 10 | bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ 11 | bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ 12 | cut_target.o phase.o bam_cat.o bam2depth.o 13 | BCFOBJS= bcftools/bcf.o bcftools/fet.o bcftools/bcf2qcall.o bcftools/bcfutils.o \ 14 | bcftools/call1.o bcftools/index.o bcftools/kfunc.o bcftools/em.o \ 15 | bcftools/kmin.o bcftools/prob1.o bcftools/vcf.o bcftools/mut.o 16 | PROG= samtools.exe bcftools.exe 17 | INCLUDES= -I. -Iwin32 18 | SUBDIRS= . 19 | LIBPATH= 20 | 21 | .SUFFIXES:.c .o 22 | 23 | .c.o: 24 | $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ 25 | 26 | all:$(PROG) 27 | 28 | .PHONY:all lib clean cleanlocal 29 | .PHONY:all-recur lib-recur clean-recur cleanlocal-recur install-recur 30 | 31 | lib:libbam.a 32 | 33 | libbam.a:$(LOBJS) 34 | $(AR) -cru $@ $(LOBJS) 35 | 36 | samtools.exe:$(AOBJS) libbam.a $(BCFOBJS) 37 | $(CC) $(CFLAGS) -o $@ $(AOBJS) $(BCFOBJS) $(LIBPATH) -lm -L. 
-lbam -Lwin32 -lz -lcurses -lws2_32 38 | 39 | bcftools.exe:$(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o 40 | $(CC) $(CFLAGS) -o $@ $(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o -lm -Lwin32 -lz -lws2_32 41 | 42 | razip.o:razf.h 43 | bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h 44 | sam.o:sam.h bam.h 45 | bam_import.o:bam.h kseq.h khash.h razf.h 46 | bam_pileup.o:bam.h razf.h ksort.h 47 | bam_plcmd.o:bam.h faidx.h bcftools/bcf.h bam2bcf.h 48 | bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h 49 | bam_lpileup.o:bam.h ksort.h 50 | bam_tview.o:bam.h faidx.h 51 | bam_sort.o:bam.h ksort.h razf.h 52 | bam_md.o:bam.h faidx.h 53 | sam_header.o:sam_header.h khash.h 54 | bcf.o:bcftools/bcf.h 55 | bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h 56 | bam2bcf_indel.o:bam2bcf.h 57 | errmod.o:errmod.h 58 | 59 | faidx.o:faidx.h razf.h khash.h 60 | faidx_main.o:faidx.h razf.h 61 | 62 | clean: 63 | rm -fr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib 64 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bam2bcf.h: -------------------------------------------------------------------------------- 1 | #ifndef BAM2BCF_H 2 | #define BAM2BCF_H 3 | 4 | #include 5 | #include "errmod.h" 6 | #include "bcftools/bcf.h" 7 | 8 | #define B2B_INDEL_NULL 10000 9 | 10 | typedef struct __bcf_callaux_t { 11 | int capQ, min_baseQ; 12 | int openQ, extQ, tandemQ; // for indels 13 | int min_support; // for collecting indel candidates 14 | double min_frac; // for collecting indel candidates 15 | // for internal uses 16 | int max_bases; 17 | int indel_types[4]; 18 | int maxins, indelreg; 19 | char *inscns; 20 | uint16_t *bases; 21 | errmod_t *e; 22 | void *rghash; 23 | } bcf_callaux_t; 24 | 25 | typedef struct { 26 | int depth, ori_depth, qsum[4]; 27 | int anno[16]; 28 | float p[25]; 29 | int mvd[3]; // mean variant distance, number of variant reads, average read length 30 | 
} bcf_callret1_t; 31 | 32 | typedef struct { 33 | int a[5]; // alleles: ref, alt, alt2, alt3 34 | int n, n_alleles, shift, ori_ref, unseen; 35 | int anno[16], depth, ori_depth; 36 | uint8_t *PL; 37 | float vdb; // variant distance bias 38 | } bcf_call_t; 39 | 40 | #ifdef __cplusplus 41 | extern "C" { 42 | #endif 43 | 44 | bcf_callaux_t *bcf_call_init(double theta, int min_baseQ); 45 | void bcf_call_destroy(bcf_callaux_t *bca); 46 | int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r); 47 | int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call); 48 | int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP, 49 | const bcf_callaux_t *bca, const char *ref); 50 | int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, 51 | const void *rghash); 52 | 53 | #ifdef __cplusplus 54 | } 55 | #endif 56 | 57 | #endif 58 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bam2depth.c: -------------------------------------------------------------------------------- 1 | /* This program demonstrates how to generate pileup from multiple BAMs 2 | * simutaneously, to achieve random access and to use the BED interface. 3 | * To compile this program separately, you may: 4 | * 5 | * gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -L. 
-lbam -lz 6 | */ 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "bam.h" 12 | 13 | typedef struct { // auxiliary data structure 14 | bamFile fp; // the file handler 15 | bam_iter_t iter; // NULL if a region not specified 16 | int min_mapQ; // mapQ filter 17 | } aux_t; 18 | 19 | void *bed_read(const char *fn); // read a BED or position list file 20 | void bed_destroy(void *_h); // destroy the BED data structure 21 | int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps 22 | 23 | // This function reads a BAM alignment from one BAM file. 24 | static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup 25 | { 26 | aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure 27 | int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); 28 | if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; 29 | return ret; 30 | } 31 | 32 | #ifdef _MAIN_BAM2DEPTH 33 | int main(int argc, char *argv[]) 34 | #else 35 | int main_depth(int argc, char *argv[]) 36 | #endif 37 | { 38 | int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0; 39 | const bam_pileup1_t **plp; 40 | char *reg = 0; // specified region 41 | void *bed = 0; // BED data structure 42 | bam_header_t *h = 0; // BAM header of the 1st input 43 | aux_t **data; 44 | bam_mplp_t mplp; 45 | 46 | // parse the command line 47 | while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) { 48 | switch (n) { 49 | case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header 50 | case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now 51 | case 'q': baseQ = atoi(optarg); break; // base quality threshold 52 | case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold 53 | } 54 | } 55 | if (optind == argc) { 56 | fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] [...]\n"); 57 | return 1; 58 
| } 59 | 60 | // initialize the auxiliary data structures 61 | n = argc - optind; // the number of BAMs on the command line 62 | data = calloc(n, sizeof(void*)); // data[i] for the i-th input 63 | beg = 0; end = 1<<30; tid = -1; // set the default region 64 | for (i = 0; i < n; ++i) { 65 | bam_header_t *htmp; 66 | data[i] = calloc(1, sizeof(aux_t)); 67 | data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM 68 | data[i]->min_mapQ = mapQ; // set the mapQ filter 69 | htmp = bam_header_read(data[i]->fp); // read the BAM header 70 | if (i == 0) { 71 | h = htmp; // keep the header of the 1st BAM 72 | if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region 73 | } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header 74 | if (tid >= 0) { // if a region is specified and parsed successfully 75 | bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index 76 | data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator 77 | bam_index_destroy(idx); // the index is not needed any more; phase out of the memory 78 | } 79 | } 80 | 81 | // the core multi-pileup loop 82 | mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization 83 | n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM 84 | plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp) 85 | while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position 86 | if (pos < beg || pos >= end) continue; // out of range; skip 87 | if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip 88 | fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster 89 | for (i = 0; i < n; ++i) { // base level filters have to go here 90 | int j, m = 0; 91 | for (j = 0; j < n_plp[i]; ++j) { 92 | const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know 93 
| if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos 94 | else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality 95 | } 96 | printf("\t%d", n_plp[i] - m); // this the depth to output 97 | } 98 | putchar('\n'); 99 | } 100 | free(n_plp); free(plp); 101 | bam_mplp_destroy(mplp); 102 | 103 | bam_header_destroy(h); 104 | for (i = 0; i < n; ++i) { 105 | bam_close(data[i]->fp); 106 | if (data[i]->iter) bam_iter_destroy(data[i]->iter); 107 | free(data[i]); 108 | } 109 | free(data); free(reg); 110 | if (bed) bed_destroy(bed); 111 | return 0; 112 | } 113 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bam_aux.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "bam.h" 3 | #include "khash.h" 4 | typedef char *str_p; 5 | KHASH_MAP_INIT_STR(s, int) 6 | KHASH_MAP_INIT_STR(r2l, str_p) 7 | 8 | void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data) 9 | { 10 | int ori_len = b->data_len; 11 | b->data_len += 3 + len; 12 | b->l_aux += 3 + len; 13 | if (b->m_data < b->data_len) { 14 | b->m_data = b->data_len; 15 | kroundup32(b->m_data); 16 | b->data = (uint8_t*)realloc(b->data, b->m_data); 17 | } 18 | b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1]; 19 | b->data[ori_len + 2] = type; 20 | memcpy(b->data + ori_len + 3, data, len); 21 | } 22 | 23 | uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]) 24 | { 25 | return bam_aux_get(b, tag); 26 | } 27 | 28 | #define __skip_tag(s) do { \ 29 | int type = toupper(*(s)); \ 30 | ++(s); \ 31 | if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \ 32 | else if (type == 'B') (s) += 5 + bam_aux_type2size(*(s)) * (*(int32_t*)((s)+1)); \ 33 | else (s) += bam_aux_type2size(type); \ 34 | } while(0) 35 | 36 | uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) 37 | { 38 | uint8_t *s; 39 | int y = tag[0]<<8 | tag[1]; 40 | s = bam1_aux(b); 41 | 
while (s < b->data + b->data_len) { 42 | int x = (int)s[0]<<8 | s[1]; 43 | s += 2; 44 | if (x == y) return s; 45 | __skip_tag(s); 46 | } 47 | return 0; 48 | } 49 | // s MUST BE returned by bam_aux_get() 50 | int bam_aux_del(bam1_t *b, uint8_t *s) 51 | { 52 | uint8_t *p, *aux; 53 | aux = bam1_aux(b); 54 | p = s - 2; 55 | __skip_tag(s); 56 | memmove(p, s, b->l_aux - (s - aux)); 57 | b->data_len -= s - p; 58 | b->l_aux -= s - p; 59 | return 0; 60 | } 61 | 62 | int bam_aux_drop_other(bam1_t *b, uint8_t *s) 63 | { 64 | if (s) { 65 | uint8_t *p, *aux; 66 | aux = bam1_aux(b); 67 | p = s - 2; 68 | __skip_tag(s); 69 | memmove(aux, p, s - p); 70 | b->data_len -= b->l_aux - (s - p); 71 | b->l_aux = s - p; 72 | } else { 73 | b->data_len -= b->l_aux; 74 | b->l_aux = 0; 75 | } 76 | return 0; 77 | } 78 | 79 | void bam_init_header_hash(bam_header_t *header) 80 | { 81 | if (header->hash == 0) { 82 | int ret, i; 83 | khiter_t iter; 84 | khash_t(s) *h; 85 | header->hash = h = kh_init(s); 86 | for (i = 0; i < header->n_targets; ++i) { 87 | iter = kh_put(s, h, header->target_name[i], &ret); 88 | kh_value(h, iter) = i; 89 | } 90 | } 91 | } 92 | 93 | void bam_destroy_header_hash(bam_header_t *header) 94 | { 95 | if (header->hash) 96 | kh_destroy(s, (khash_t(s)*)header->hash); 97 | } 98 | 99 | int32_t bam_get_tid(const bam_header_t *header, const char *seq_name) 100 | { 101 | khint_t k; 102 | khash_t(s) *h = (khash_t(s)*)header->hash; 103 | k = kh_get(s, h, seq_name); 104 | return k == kh_end(h)? 
-1 : kh_value(h, k); 105 | } 106 | 107 | int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end) 108 | { 109 | char *s; 110 | int i, l, k, name_end; 111 | khiter_t iter; 112 | khash_t(s) *h; 113 | 114 | bam_init_header_hash(header); 115 | h = (khash_t(s)*)header->hash; 116 | 117 | *ref_id = *beg = *end = -1; 118 | name_end = l = strlen(str); 119 | s = (char*)malloc(l+1); 120 | // remove space 121 | for (i = k = 0; i < l; ++i) 122 | if (!isspace(str[i])) s[k++] = str[i]; 123 | s[k] = 0; l = k; 124 | // determine the sequence name 125 | for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end 126 | if (i >= 0) name_end = i; 127 | if (name_end < l) { // check if this is really the end 128 | int n_hyphen = 0; 129 | for (i = name_end + 1; i < l; ++i) { 130 | if (s[i] == '-') ++n_hyphen; 131 | else if (!isdigit(s[i]) && s[i] != ',') break; 132 | } 133 | if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name 134 | s[name_end] = 0; 135 | iter = kh_get(s, h, s); 136 | if (iter == kh_end(h)) { // cannot find the sequence name 137 | iter = kh_get(s, h, str); // try str as the name 138 | if (iter == kh_end(h)) { 139 | if (bam_verbose >= 2) fprintf(stderr, "[%s] fail to determine the sequence name.\n", __func__); 140 | free(s); return -1; 141 | } else s[name_end] = ':', name_end = l; 142 | } 143 | } else iter = kh_get(s, h, str); 144 | *ref_id = kh_val(h, iter); 145 | // parse the interval 146 | if (name_end < l) { 147 | for (i = k = name_end + 1; i < l; ++i) 148 | if (s[i] != ',') s[k++] = s[i]; 149 | s[k] = 0; 150 | *beg = atoi(s + name_end + 1); 151 | for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; 152 | *end = i < k? atoi(s + i + 1) : 1<<29; 153 | if (*beg > 0) --*beg; 154 | } else *beg = 0, *end = 1<<29; 155 | free(s); 156 | return *beg <= *end? 
/* Decoders for an auxiliary field. `s` points at the field's one-byte type
 * code (the pointer bam_aux_get() returns); the value bytes follow it.
 * Every decoder returns a zero value when `s` is NULL or the type code does
 * not match. Values are copied out with memcpy because they sit at arbitrary
 * byte offsets inside the record. */

/* Integer types 'c', 'C', 's', 'S', 'i' and 'I', widened to int32_t. */
int32_t bam_aux2i(const uint8_t *s)
{
	int type;
	if (s == 0) return 0;
	type = *s++;
	if (type == 'c') { int8_t v; memcpy(&v, s, sizeof v); return (int32_t)v; }
	else if (type == 'C') return (int32_t)*s;
	else if (type == 's') { int16_t v; memcpy(&v, s, sizeof v); return (int32_t)v; }
	else if (type == 'S') { uint16_t v; memcpy(&v, s, sizeof v); return (int32_t)v; }
	else if (type == 'i' || type == 'I') { int32_t v; memcpy(&v, s, sizeof v); return v; }
	else return 0;
}

/* Type 'f'. */
float bam_aux2f(const uint8_t *s)
{
	float v;
	if (s == 0) return 0.0;
	if (*s != 'f') return 0.0;
	memcpy(&v, s + 1, sizeof v);
	return v;
}

/* Type 'd'. */
double bam_aux2d(const uint8_t *s)
{
	double v;
	if (s == 0) return 0.0;
	if (*s != 'd') return 0.0;
	memcpy(&v, s + 1, sizeof v);
	return v;
}

/* Type 'A' (single character). */
char bam_aux2A(const uint8_t *s)
{
	if (s == 0) return 0;
	if (*s != 'A') return 0;
	return (char)s[1];
}

/* Types 'Z' and 'H'; the result points into the record and is not a copy. */
char *bam_aux2Z(const uint8_t *s)
{
	if (s == 0) return 0;
	if (*s == 'Z' || *s == 'H') return (char*)(s + 1);
	else return 0;
}

#ifdef _WIN32
/* Replacement for drand48(); dividing by RAND_MAX + 1 keeps the result in
 * [0, 1) so that 1.0 is never returned. */
double drand48(void)
{
	return (double)rand() / ((double)RAND_MAX + 1.0);
}
#endif
For this 8 | to work each file must be sorted, and the sorted files must be given 9 | as command line arguments in order such that the final read in file i 10 | is less than or equal to the first read in file i+1. 11 | 12 | This code is derived from the bam_reheader function in samtools 0.1.8 13 | and modified to perform concatenation by Chris Saunders on behalf of 14 | Illumina. 15 | 16 | 17 | ########## License: 18 | 19 | The MIT License 20 | 21 | Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd. 22 | Modified SAMtools work copyright (c) 2010 Illumina, Inc. 23 | 24 | Permission is hereby granted, free of charge, to any person obtaining a copy 25 | of this software and associated documentation files (the "Software"), to deal 26 | in the Software without restriction, including without limitation the rights 27 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 28 | copies of the Software, and to permit persons to whom the Software is 29 | furnished to do so, subject to the following conditions: 30 | 31 | The above copyright notice and this permission notice shall be included in 32 | all copies or substantial portions of the Software. 33 | 34 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 35 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 36 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 37 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 38 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 39 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 40 | THE SOFTWARE. 
41 | 42 | */ 43 | 44 | 45 | /* 46 | makefile: 47 | """ 48 | CC=gcc 49 | CFLAGS+=-g -Wall -O2 -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -I$(SAMTOOLS_DIR) 50 | LDFLAGS+=-L$(SAMTOOLS_DIR) 51 | LDLIBS+=-lbam -lz 52 | 53 | all:bam_cat 54 | """ 55 | */ 56 | 57 | 58 | #include 59 | #include 60 | #include 61 | 62 | #include "bgzf.h" 63 | #include "bam.h" 64 | 65 | #define BUF_SIZE 0x10000 66 | 67 | #define GZIPID1 31 68 | #define GZIPID2 139 69 | 70 | #define BGZF_EMPTY_BLOCK_SIZE 28 71 | 72 | 73 | int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam) 74 | { 75 | BGZF *fp; 76 | FILE* fp_file; 77 | uint8_t *buf; 78 | uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE]; 79 | const int es=BGZF_EMPTY_BLOCK_SIZE; 80 | int i; 81 | 82 | fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w"); 83 | if (fp == 0) { 84 | fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam); 85 | return 1; 86 | } 87 | if (h) bam_header_write(fp, h); 88 | 89 | buf = (uint8_t*) malloc(BUF_SIZE); 90 | for(i = 0; i < nfn; ++i){ 91 | BGZF *in; 92 | bam_header_t *old; 93 | int len,j; 94 | 95 | in = strcmp(fn[i], "-")? 
bam_open(fn[i], "r") : bam_dopen(fileno(stdin), "r"); 96 | if (in == 0) { 97 | fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); 98 | return -1; 99 | } 100 | if (in->open_mode != 'r') return -1; 101 | 102 | old = bam_header_read(in); 103 | if (h == 0 && i == 0) bam_header_write(fp, old); 104 | 105 | if (in->block_offset < in->block_length) { 106 | bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); 107 | bgzf_flush(fp); 108 | } 109 | 110 | j=0; 111 | #ifdef _USE_KNETFILE 112 | fp_file=fp->x.fpw; 113 | while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) { 114 | #else 115 | fp_file=fp->file; 116 | while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) { 117 | #endif 118 | if(len= 0) { 163 | switch (c) { 164 | case 'h': { 165 | tamFile fph = sam_open(optarg); 166 | if (fph == 0) { 167 | fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]); 168 | return 1; 169 | } 170 | h = sam_header_read(fph); 171 | sam_close(fph); 172 | break; 173 | } 174 | case 'o': outfn = strdup(optarg); break; 175 | } 176 | } 177 | if (argc - optind < 2) { 178 | fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] [...]\n"); 179 | return 1; 180 | } 181 | ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-"); 182 | free(outfn); 183 | return ret; 184 | } 185 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bam_color.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "bam.h" 3 | 4 | /*! 5 | @abstract Get the color encoding the previous and current base 6 | @param b pointer to an alignment 7 | @param i The i-th position, 0-based 8 | @return color 9 | 10 | @discussion Returns 0 no color information is found. 
11 | */ 12 | char bam_aux_getCSi(bam1_t *b, int i) 13 | { 14 | uint8_t *c = bam_aux_get(b, "CS"); 15 | char *cs = NULL; 16 | 17 | // return the base if the tag was not found 18 | if(0 == c) return 0; 19 | 20 | cs = bam_aux2Z(c); 21 | // adjust for strandedness and leading adaptor 22 | if(bam1_strand(b)) i = strlen(cs) - 1 - i; 23 | else i++; 24 | return cs[i]; 25 | } 26 | 27 | /*! 28 | @abstract Get the color quality of the color encoding the previous and current base 29 | @param b pointer to an alignment 30 | @param i The i-th position, 0-based 31 | @return color quality 32 | 33 | @discussion Returns 0 no color information is found. 34 | */ 35 | char bam_aux_getCQi(bam1_t *b, int i) 36 | { 37 | uint8_t *c = bam_aux_get(b, "CQ"); 38 | char *cq = NULL; 39 | 40 | // return the base if the tag was not found 41 | if(0 == c) return 0; 42 | 43 | cq = bam_aux2Z(c); 44 | // adjust for strandedness 45 | if(bam1_strand(b)) i = strlen(cq) - 1 - i; 46 | return cq[i]; 47 | } 48 | 49 | char bam_aux_nt2int(char a) 50 | { 51 | switch(toupper(a)) { 52 | case 'A': 53 | return 0; 54 | break; 55 | case 'C': 56 | return 1; 57 | break; 58 | case 'G': 59 | return 2; 60 | break; 61 | case 'T': 62 | return 3; 63 | break; 64 | default: 65 | return 4; 66 | break; 67 | } 68 | } 69 | 70 | char bam_aux_ntnt2cs(char a, char b) 71 | { 72 | a = bam_aux_nt2int(a); 73 | b = bam_aux_nt2int(b); 74 | if(4 == a || 4 == b) return '4'; 75 | return "0123"[(int)(a ^ b)]; 76 | } 77 | 78 | /*! 79 | @abstract Get the color error profile at the give position 80 | @param b pointer to an alignment 81 | @return the original color if the color was an error, '-' (dash) otherwise 82 | 83 | @discussion Returns 0 no color information is found. 
84 | */ 85 | char bam_aux_getCEi(bam1_t *b, int i) 86 | { 87 | int cs_i; 88 | uint8_t *c = bam_aux_get(b, "CS"); 89 | char *cs = NULL; 90 | char prev_b, cur_b; 91 | char cur_color, cor_color; 92 | 93 | // return the base if the tag was not found 94 | if(0 == c) return 0; 95 | 96 | cs = bam_aux2Z(c); 97 | 98 | // adjust for strandedness and leading adaptor 99 | if(bam1_strand(b)) { //reverse strand 100 | cs_i = strlen(cs) - 1 - i; 101 | // get current color 102 | cur_color = cs[cs_i]; 103 | // get previous base. Note: must rc adaptor 104 | prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)]; 105 | // get current base 106 | cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; 107 | } 108 | else { 109 | cs_i=i+1; 110 | // get current color 111 | cur_color = cs[cs_i]; 112 | // get previous base 113 | prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)]; 114 | // get current base 115 | cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; 116 | } 117 | 118 | // corrected color 119 | cor_color = bam_aux_ntnt2cs(prev_b, cur_b); 120 | 121 | if(cur_color == cor_color) { 122 | return '-'; 123 | } 124 | else { 125 | return cur_color; 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bam_endian.h: -------------------------------------------------------------------------------- 1 | #ifndef BAM_ENDIAN_H 2 | #define BAM_ENDIAN_H 3 | 4 | #include 5 | 6 | static inline int bam_is_big_endian() 7 | { 8 | long one= 1; 9 | return !(*((char *)(&one))); 10 | } 11 | static inline uint16_t bam_swap_endian_2(uint16_t v) 12 | { 13 | return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); 14 | } 15 | static inline void *bam_swap_endian_2p(void *x) 16 | { 17 | *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); 18 | return x; 19 | } 20 | static inline uint32_t bam_swap_endian_4(uint32_t v) 21 | { 22 | v = ((v & 
#ifndef BAM_ENDIAN_H
#define BAM_ENDIAN_H

#include <stdint.h>
#include <string.h>

/* Byte-order helpers. The value forms reverse the bytes of their argument;
 * the pointer forms ("...p") reverse the bytes stored at `x` in place and
 * return `x`. The pointer forms go through memcpy so that `x` may point at
 * any byte offset of a buffer. */

/* Returns non-zero when the first byte of an integer in memory is its most
 * significant byte. */
static inline int bam_is_big_endian(void)
{
	long one= 1;
	return !(*((char *)(&one)));
}
static inline uint16_t bam_swap_endian_2(uint16_t v)
{
	return (uint16_t)(((v & 0x00FFU) << 8) | ((v & 0xFF00U) >> 8));
}
static inline void *bam_swap_endian_2p(void *x)
{
	uint16_t v;
	memcpy(&v, x, sizeof v);
	v = bam_swap_endian_2(v);
	memcpy(x, &v, sizeof v);
	return x;
}
static inline uint32_t bam_swap_endian_4(uint32_t v)
{
	v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
	return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
}
static inline void *bam_swap_endian_4p(void *x)
{
	uint32_t v;
	memcpy(&v, x, sizeof v);
	v = bam_swap_endian_4(v);
	memcpy(x, &v, sizeof v);
	return x;
}
static inline uint64_t bam_swap_endian_8(uint64_t v)
{
	v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
	v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
	return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
}
static inline void *bam_swap_endian_8p(void *x)
{
	uint64_t v;
	memcpy(&v, x, sizeof v);
	v = bam_swap_endian_8(v);
	memcpy(x, &v, sizeof v);
	return x;
}

#endif
p->cnt = TV_GAP; 42 | if (mp->n == mp->max) { 43 | mp->max = mp->max? mp->max<<1 : 256; 44 | mp->buf = (freenode_t**)realloc(mp->buf, sizeof(freenode_t*) * mp->max); 45 | } 46 | mp->buf[mp->n++] = p; 47 | } 48 | 49 | /* core part */ 50 | struct __bam_lplbuf_t { 51 | int max, n_cur, n_pre; 52 | int max_level, *cur_level, *pre_level; 53 | mempool_t *mp; 54 | freenode_t **aux, *head, *tail; 55 | int n_nodes, m_aux; 56 | bam_pileup_f func; 57 | void *user_data; 58 | bam_plbuf_t *plbuf; 59 | }; 60 | 61 | void bam_lplbuf_reset(bam_lplbuf_t *buf) 62 | { 63 | freenode_t *p, *q; 64 | bam_plbuf_reset(buf->plbuf); 65 | for (p = buf->head; p->next;) { 66 | q = p->next; 67 | mp_free(buf->mp, p); 68 | p = q; 69 | } 70 | buf->head = buf->tail; 71 | buf->max_level = 0; 72 | buf->n_cur = buf->n_pre = 0; 73 | buf->n_nodes = 0; 74 | } 75 | 76 | static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) 77 | { 78 | bam_lplbuf_t *tv = (bam_lplbuf_t*)data; 79 | freenode_t *p; 80 | int i, l, max_level; 81 | // allocate memory if necessary 82 | if (tv->max < n) { // enlarge 83 | tv->max = n; 84 | kroundup32(tv->max); 85 | tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max); 86 | tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max); 87 | } 88 | tv->n_cur = n; 89 | // update cnt 90 | for (p = tv->head; p->next; p = p->next) 91 | if (p->cnt > 0) --p->cnt; 92 | // calculate cur_level[] 93 | max_level = 0; 94 | for (i = l = 0; i < n; ++i) { 95 | const bam_pileup1_t *p = pl + i; 96 | if (p->is_head) { 97 | if (tv->head->next && tv->head->cnt == 0) { // then take a free slot 98 | freenode_t *p = tv->head->next; 99 | tv->cur_level[i] = tv->head->level; 100 | mp_free(tv->mp, tv->head); 101 | tv->head = p; 102 | --tv->n_nodes; 103 | } else tv->cur_level[i] = ++tv->max_level; 104 | } else { 105 | tv->cur_level[i] = tv->pre_level[l++]; 106 | if (p->is_tail) { // then return a free slot 107 | tv->tail->level = tv->cur_level[i]; 108 | 
tv->tail->next = mp_alloc(tv->mp); 109 | tv->tail = tv->tail->next; 110 | ++tv->n_nodes; 111 | } 112 | } 113 | if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i]; 114 | ((bam_pileup1_t*)p)->level = tv->cur_level[i]; 115 | } 116 | assert(l == tv->n_pre); 117 | tv->func(tid, pos, n, pl, tv->user_data); 118 | // sort the linked list 119 | if (tv->n_nodes) { 120 | freenode_t *q; 121 | if (tv->n_nodes + 1 > tv->m_aux) { // enlarge 122 | tv->m_aux = tv->n_nodes + 1; 123 | kroundup32(tv->m_aux); 124 | tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux); 125 | } 126 | for (p = tv->head, i = l = 0; p->next;) { 127 | if (p->level > max_level) { // then discard this entry 128 | q = p->next; 129 | mp_free(tv->mp, p); 130 | p = q; 131 | } else { 132 | tv->aux[i++] = p; 133 | p = p->next; 134 | } 135 | } 136 | tv->aux[i] = tv->tail; // add a proper tail for the loop below 137 | tv->n_nodes = i; 138 | if (tv->n_nodes) { 139 | ks_introsort(node, tv->n_nodes, tv->aux); 140 | for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1]; 141 | tv->head = tv->aux[0]; 142 | } else tv->head = tv->tail; 143 | } 144 | // clean up 145 | tv->max_level = max_level; 146 | memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4); 147 | // squeeze out terminated levels 148 | for (i = l = 0; i < n; ++i) { 149 | const bam_pileup1_t *p = pl + i; 150 | if (!p->is_tail) 151 | tv->pre_level[l++] = tv->pre_level[i]; 152 | } 153 | tv->n_pre = l; 154 | /* 155 | fprintf(stderr, "%d\t", pos+1); 156 | for (i = 0; i < n; ++i) { 157 | const bam_pileup1_t *p = pl + i; 158 | if (p->is_head) fprintf(stderr, "^"); 159 | if (p->is_tail) fprintf(stderr, "$"); 160 | fprintf(stderr, "%d,", p->level); 161 | } 162 | fprintf(stderr, "\n"); 163 | */ 164 | return 0; 165 | } 166 | 167 | bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data) 168 | { 169 | bam_lplbuf_t *tv; 170 | tv = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t)); 171 | tv->mp = mp_init(); 172 | tv->head = tv->tail = 
mp_alloc(tv->mp); 173 | tv->func = func; 174 | tv->user_data = data; 175 | tv->plbuf = bam_plbuf_init(tview_func, tv); 176 | return (bam_lplbuf_t*)tv; 177 | } 178 | 179 | void bam_lplbuf_destroy(bam_lplbuf_t *tv) 180 | { 181 | freenode_t *p, *q; 182 | free(tv->cur_level); free(tv->pre_level); 183 | bam_plbuf_destroy(tv->plbuf); 184 | free(tv->aux); 185 | for (p = tv->head; p->next;) { 186 | q = p->next; 187 | mp_free(tv->mp, p); p = q; 188 | } 189 | mp_free(tv->mp, p); 190 | assert(tv->mp->cnt == 0); 191 | mp_destroy(tv->mp); 192 | free(tv); 193 | } 194 | 195 | int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv) 196 | { 197 | return bam_plbuf_push(b, tv->plbuf); 198 | } 199 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bam_mate.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "bam.h" 4 | 5 | // currently, this function ONLY works if each read has one hit 6 | void bam_mating_core(bamFile in, bamFile out) 7 | { 8 | bam_header_t *header; 9 | bam1_t *b[2]; 10 | int curr, has_prev; 11 | 12 | header = bam_header_read(in); 13 | bam_header_write(out, header); 14 | 15 | b[0] = bam_init1(); 16 | b[1] = bam_init1(); 17 | curr = 0; has_prev = 0; 18 | while (bam_read1(in, b[curr]) >= 0) { 19 | bam1_t *cur = b[curr], *pre = b[1-curr]; 20 | if (has_prev) { 21 | if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name 22 | cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; 23 | pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; 24 | if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) 25 | && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) 26 | { 27 | uint32_t cur5, pre5; 28 | cur5 = (cur->core.flag&BAM_FREVERSE)? bam_calend(&cur->core, bam1_cigar(cur)) : cur->core.pos; 29 | pre5 = (pre->core.flag&BAM_FREVERSE)? 
bam_calend(&pre->core, bam1_cigar(pre)) : pre->core.pos; 30 | cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; 31 | } else cur->core.isize = pre->core.isize = 0; 32 | if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE; 33 | else cur->core.flag &= ~BAM_FMREVERSE; 34 | if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE; 35 | else pre->core.flag &= ~BAM_FMREVERSE; 36 | if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } 37 | if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } 38 | bam_write1(out, pre); 39 | bam_write1(out, cur); 40 | has_prev = 0; 41 | } else { // unpaired or singleton 42 | pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; 43 | if (pre->core.flag & BAM_FPAIRED) { 44 | pre->core.flag |= BAM_FMUNMAP; 45 | pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR; 46 | } 47 | bam_write1(out, pre); 48 | } 49 | } else has_prev = 1; 50 | curr = 1 - curr; 51 | } 52 | if (has_prev) bam_write1(out, b[1-curr]); 53 | bam_header_destroy(header); 54 | bam_destroy1(b[0]); 55 | bam_destroy1(b[1]); 56 | } 57 | 58 | int bam_mating(int argc, char *argv[]) 59 | { 60 | bamFile in, out; 61 | if (argc < 3) { 62 | fprintf(stderr, "samtools fixmate \n"); 63 | return 1; 64 | } 65 | in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r"); 66 | out = (strcmp(argv[2], "-") == 0)? 
bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w"); 67 | bam_mating_core(in, out); 68 | bam_close(in); bam_close(out); 69 | return 0; 70 | } 71 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bam_reheader.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "bgzf.h" 4 | #include "bam.h" 5 | 6 | #define BUF_SIZE 0x10000 7 | 8 | int bam_reheader(BGZF *in, const bam_header_t *h, int fd) 9 | { 10 | BGZF *fp; 11 | bam_header_t *old; 12 | int len; 13 | uint8_t *buf; 14 | if (in->open_mode != 'r') return -1; 15 | buf = malloc(BUF_SIZE); 16 | old = bam_header_read(in); 17 | fp = bgzf_fdopen(fd, "w"); 18 | bam_header_write(fp, h); 19 | if (in->block_offset < in->block_length) { 20 | bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); 21 | bgzf_flush(fp); 22 | } 23 | #ifdef _USE_KNETFILE 24 | while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) 25 | fwrite(buf, 1, len, fp->x.fpw); 26 | #else 27 | while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) 28 | fwrite(buf, 1, len, fp->file); 29 | #endif 30 | free(buf); 31 | fp->block_offset = in->block_offset = 0; 32 | bgzf_close(fp); 33 | return 0; 34 | } 35 | 36 | int main_reheader(int argc, char *argv[]) 37 | { 38 | bam_header_t *h; 39 | BGZF *in; 40 | if (argc != 3) { 41 | fprintf(stderr, "Usage: samtools reheader \n"); 42 | return 1; 43 | } 44 | { // read the header 45 | tamFile fph = sam_open(argv[1]); 46 | if (fph == 0) { 47 | fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]); 48 | return 1; 49 | } 50 | h = sam_header_read(fph); 51 | sam_close(fph); 52 | } 53 | in = strcmp(argv[2], "-")? 
bam_open(argv[2], "r") : bam_dopen(fileno(stdin), "r"); 54 | if (in == 0) { 55 | fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[2]); 56 | return 1; 57 | } 58 | bam_reheader(in, h, fileno(stdout)); 59 | bgzf_close(in); 60 | return 0; 61 | } 62 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bam_rmdup.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "sam.h" 7 | 8 | typedef bam1_t *bam1_p; 9 | 10 | #include "khash.h" 11 | KHASH_SET_INIT_STR(name) 12 | KHASH_MAP_INIT_INT64(pos, bam1_p) 13 | 14 | #define BUFFER_SIZE 0x40000 15 | 16 | typedef struct { 17 | uint64_t n_checked, n_removed; 18 | khash_t(pos) *best_hash; 19 | } lib_aux_t; 20 | KHASH_MAP_INIT_STR(lib, lib_aux_t) 21 | 22 | typedef struct { 23 | int n, max; 24 | bam1_t **a; 25 | } tmp_stack_t; 26 | 27 | static inline void stack_insert(tmp_stack_t *stack, bam1_t *b) 28 | { 29 | if (stack->n == stack->max) { 30 | stack->max = stack->max? 
stack->max<<1 : 0x10000; 31 | stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max); 32 | } 33 | stack->a[stack->n++] = b; 34 | } 35 | 36 | static inline void dump_best(tmp_stack_t *stack, samfile_t *out) 37 | { 38 | int i; 39 | for (i = 0; i != stack->n; ++i) { 40 | samwrite(out, stack->a[i]); 41 | bam_destroy1(stack->a[i]); 42 | } 43 | stack->n = 0; 44 | } 45 | 46 | static void clear_del_set(khash_t(name) *del_set) 47 | { 48 | khint_t k; 49 | for (k = kh_begin(del_set); k < kh_end(del_set); ++k) 50 | if (kh_exist(del_set, k)) 51 | free((char*)kh_key(del_set, k)); 52 | kh_clear(name, del_set); 53 | } 54 | 55 | static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) 56 | { 57 | khint_t k = kh_get(lib, aux, lib); 58 | if (k == kh_end(aux)) { 59 | int ret; 60 | char *p = strdup(lib); 61 | lib_aux_t *q; 62 | k = kh_put(lib, aux, p, &ret); 63 | q = &kh_val(aux, k); 64 | q->n_checked = q->n_removed = 0; 65 | q->best_hash = kh_init(pos); 66 | return q; 67 | } else return &kh_val(aux, k); 68 | } 69 | 70 | static void clear_best(khash_t(lib) *aux, int max) 71 | { 72 | khint_t k; 73 | for (k = kh_begin(aux); k != kh_end(aux); ++k) { 74 | if (kh_exist(aux, k)) { 75 | lib_aux_t *q = &kh_val(aux, k); 76 | if (kh_size(q->best_hash) >= max) 77 | kh_clear(pos, q->best_hash); 78 | } 79 | } 80 | } 81 | 82 | static inline int sum_qual(const bam1_t *b) 83 | { 84 | int i, q; 85 | uint8_t *qual = bam1_qual(b); 86 | for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; 87 | return q; 88 | } 89 | 90 | void bam_rmdup_core(samfile_t *in, samfile_t *out) 91 | { 92 | bam1_t *b; 93 | int last_tid = -1, last_pos = -1; 94 | tmp_stack_t stack; 95 | khint_t k; 96 | khash_t(lib) *aux; 97 | khash_t(name) *del_set; 98 | 99 | aux = kh_init(lib); 100 | del_set = kh_init(name); 101 | b = bam_init1(); 102 | memset(&stack, 0, sizeof(tmp_stack_t)); 103 | 104 | kh_resize(name, del_set, 4 * BUFFER_SIZE); 105 | while (samread(in, b) >= 0) { 106 | bam1_core_t *c = &b->core; 107 | if 
(c->tid != last_tid || last_pos != c->pos) { 108 | dump_best(&stack, out); // write the result 109 | clear_best(aux, BUFFER_SIZE); 110 | if (c->tid != last_tid) { 111 | clear_best(aux, 0); 112 | if (kh_size(del_set)) { // check 113 | fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set)); 114 | clear_del_set(del_set); 115 | } 116 | if ((int)c->tid == -1) { // append unmapped reads 117 | samwrite(out, b); 118 | while (samread(in, b) >= 0) samwrite(out, b); 119 | break; 120 | } 121 | last_tid = c->tid; 122 | fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]); 123 | } 124 | } 125 | if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { 126 | samwrite(out, b); 127 | } else if (c->isize > 0) { // paired, head 128 | uint64_t key = (uint64_t)c->pos<<32 | c->isize; 129 | const char *lib; 130 | lib_aux_t *q; 131 | int ret; 132 | lib = bam_get_library(in->header, b); 133 | q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); 134 | ++q->n_checked; 135 | k = kh_put(pos, q->best_hash, key, &ret); 136 | if (ret == 0) { // found in best_hash 137 | bam1_t *p = kh_val(q->best_hash, k); 138 | ++q->n_removed; 139 | if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle 140 | kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed 141 | bam_copy1(p, b); // replaced as b 142 | } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed 143 | if (ret == 0) 144 | fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. 
Continue anyway.\n", bam1_qname(b)); 145 | } else { // not found in best_hash 146 | kh_val(q->best_hash, k) = bam_dup1(b); 147 | stack_insert(&stack, kh_val(q->best_hash, k)); 148 | } 149 | } else { // paired, tail 150 | k = kh_get(name, del_set, bam1_qname(b)); 151 | if (k != kh_end(del_set)) { 152 | free((char*)kh_key(del_set, k)); 153 | kh_del(name, del_set, k); 154 | } else samwrite(out, b); 155 | } 156 | last_pos = c->pos; 157 | } 158 | 159 | for (k = kh_begin(aux); k != kh_end(aux); ++k) { 160 | if (kh_exist(aux, k)) { 161 | lib_aux_t *q = &kh_val(aux, k); 162 | dump_best(&stack, out); 163 | fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, 164 | (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); 165 | kh_destroy(pos, q->best_hash); 166 | free((char*)kh_key(aux, k)); 167 | } 168 | } 169 | kh_destroy(lib, aux); 170 | 171 | clear_del_set(del_set); 172 | kh_destroy(name, del_set); 173 | free(stack.a); 174 | bam_destroy1(b); 175 | } 176 | 177 | void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se); 178 | 179 | int bam_rmdup(int argc, char *argv[]) 180 | { 181 | int c, is_se = 0, force_se = 0; 182 | samfile_t *in, *out; 183 | while ((c = getopt(argc, argv, "sS")) >= 0) { 184 | switch (c) { 185 | case 's': is_se = 1; break; 186 | case 'S': force_se = is_se = 1; break; 187 | } 188 | } 189 | if (optind + 2 > argc) { 190 | fprintf(stderr, "\n"); 191 | fprintf(stderr, "Usage: samtools rmdup [-sS] \n\n"); 192 | fprintf(stderr, "Option: -s rmdup for SE reads\n"); 193 | fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n\n"); 194 | return 1; 195 | } 196 | in = samopen(argv[optind], "rb", 0); 197 | out = samopen(argv[optind+1], "wb", in->header); 198 | if (in == 0 || out == 0) { 199 | fprintf(stderr, "[bam_rmdup] fail to read/write input files\n"); 200 | return 1; 201 | } 202 | if (is_se) bam_rmdupse_core(in, out, force_se); 203 | else bam_rmdup_core(in, out); 204 | 
samclose(in); samclose(out); 205 | return 0; 206 | } 207 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bam_rmdupse.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "sam.h" 3 | #include "khash.h" 4 | #include "klist.h" 5 | 6 | #define QUEUE_CLEAR_SIZE 0x100000 7 | #define MAX_POS 0x7fffffff 8 | 9 | typedef struct { 10 | int endpos; 11 | uint32_t score:31, discarded:1; 12 | bam1_t *b; 13 | } elem_t, *elem_p; 14 | #define __free_elem(p) bam_destroy1((p)->data.b) 15 | KLIST_INIT(q, elem_t, __free_elem) 16 | typedef klist_t(q) queue_t; 17 | 18 | KHASH_MAP_INIT_INT(best, elem_p) 19 | typedef khash_t(best) besthash_t; 20 | 21 | typedef struct { 22 | uint64_t n_checked, n_removed; 23 | besthash_t *left, *rght; 24 | } lib_aux_t; 25 | KHASH_MAP_INIT_STR(lib, lib_aux_t) 26 | 27 | static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) 28 | { 29 | khint_t k = kh_get(lib, aux, lib); 30 | if (k == kh_end(aux)) { 31 | int ret; 32 | char *p = strdup(lib); 33 | lib_aux_t *q; 34 | k = kh_put(lib, aux, p, &ret); 35 | q = &kh_val(aux, k); 36 | q->left = kh_init(best); 37 | q->rght = kh_init(best); 38 | q->n_checked = q->n_removed = 0; 39 | return q; 40 | } else return &kh_val(aux, k); 41 | } 42 | 43 | static inline int sum_qual(const bam1_t *b) 44 | { 45 | int i, q; 46 | uint8_t *qual = bam1_qual(b); 47 | for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; 48 | return q; 49 | } 50 | 51 | static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score) 52 | { 53 | elem_t *p = kl_pushp(q, queue); 54 | p->discarded = 0; 55 | p->endpos = endpos; p->score = score; 56 | if (p->b == 0) p->b = bam_init1(); 57 | bam_copy1(p->b, b); 58 | return p; 59 | } 60 | 61 | static void clear_besthash(besthash_t *h, int32_t pos) 62 | { 63 | khint_t k; 64 | for (k = kh_begin(h); k != kh_end(h); ++k) 65 | if (kh_exist(h, k) && kh_val(h, k)->endpos <= 
/*
 * Duplicate removal for reads treated as single-end.
 *
 * Records are read from `in` until samread() returns a negative value.
 * Unmapped reads, and paired reads unless `force_se` is non-zero, are pushed
 * onto the output queue untouched. Every other read is looked up in a
 * per-library hash (library name from bam_get_library(), or "\t" when there
 * is none): forward-strand reads are keyed on c->pos in `left`, reverse-strand
 * reads on the end position from bam_calend() in `rght`. When two reads share
 * a key, the one with the larger sum_qual() score is kept.
 *
 * Output is written by dump_alignment(); per-library counts are reported on
 * stderr before the tables are released.
 */
void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
{
	bam1_t *b;
	queue_t *queue;
	khint_t k;
	int last_tid = -2; // matches no tid, so the first record takes the "new reference" branch
	khash_t(lib) *aux;

	aux = kh_init(lib);
	b = bam_init1();
	queue = kl_init(q);
	while (samread(in, b) >= 0) {
		bam1_core_t *c = &b->core;
		int endpos = bam_calend(c, bam1_cigar(b));
		int score = sum_qual(b);

		if (last_tid != c->tid) {
			// reference changed: MAX_POS makes dump_alignment() drain the whole queue
			if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux);
			last_tid = c->tid;
		} else dump_alignment(out, queue, c->pos, aux);
		if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
			// not subject to SE duplicate removal: queue as-is
			push_queue(queue, b, endpos, score);
		} else {
			const char *lib;
			lib_aux_t *q;
			besthash_t *h;
			uint32_t key;
			int ret;
			lib = bam_get_library(in->header, b);
			q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
			++q->n_checked;
			// reverse-strand reads are compared by end position, forward-strand by start
			h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
			key = (c->flag&BAM_FREVERSE)? endpos : c->pos;
			k = kh_put(best, h, key, &ret);
			if (ret == 0) { // in the hash table
				elem_t *p = kh_val(h, k);
				++q->n_removed;
				if (p->score < score) {
					if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue
						p->discarded = 1;
						kh_val(h, k) = push_queue(queue, b, endpos, score);
					} else { // replace
						p->score = score; p->endpos = endpos;
						bam_copy1(p->b, b);
					}
				} // otherwise, discard the alignment
			} else kh_val(h, k) = push_queue(queue, b, endpos, score);
		}
	}
	// end of input: flush everything still queued
	dump_alignment(out, queue, MAX_POS, aux);

	// report "removed / checked" per library, then free the per-library tables and keys
	for (k = kh_begin(aux); k != kh_end(aux); ++k) {
		if (kh_exist(aux, k)) {
			lib_aux_t *q = &kh_val(aux, k);
			fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
					(long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
			kh_destroy(best, q->left); kh_destroy(best, q->rght);
			free((char*)kh_key(aux, k));
		}
	}
	kh_destroy(lib, aux);
	bam_destroy1(b);
	kl_destroy(q, queue);
}
1 : 0; \ 14 | ++(s)->n_reads[w]; \ 15 | if ((c)->flag & BAM_FPAIRED) { \ 16 | ++(s)->n_pair_all[w]; \ 17 | if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good[w]; \ 18 | if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w]; \ 19 | if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w]; \ 20 | if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w]; \ 21 | if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \ 22 | ++(s)->n_pair_map[w]; \ 23 | if ((c)->mtid != (c)->tid) { \ 24 | ++(s)->n_diffchr[w]; \ 25 | if ((c)->qual >= 5) ++(s)->n_diffhigh[w]; \ 26 | } \ 27 | } \ 28 | } \ 29 | if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w]; \ 30 | if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \ 31 | } while (0) 32 | 33 | bam_flagstat_t *bam_flagstat_core(bamFile fp) 34 | { 35 | bam_flagstat_t *s; 36 | bam1_t *b; 37 | bam1_core_t *c; 38 | int ret; 39 | s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t)); 40 | b = bam_init1(); 41 | c = &b->core; 42 | while ((ret = bam_read1(fp, b)) >= 0) 43 | flagstat_loop(s, c); 44 | bam_destroy1(b); 45 | if (ret != -1) 46 | fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n"); 47 | return s; 48 | } 49 | int bam_flagstat(int argc, char *argv[]) 50 | { 51 | bamFile fp; 52 | bam_header_t *header; 53 | bam_flagstat_t *s; 54 | if (argc == optind) { 55 | fprintf(stderr, "Usage: samtools flagstat \n"); 56 | return 1; 57 | } 58 | fp = strcmp(argv[optind], "-")? 
bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); 59 | assert(fp); 60 | header = bam_header_read(fp); 61 | s = bam_flagstat_core(fp); 62 | printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); 63 | printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); 64 | printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0); 65 | printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); 66 | printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); 67 | printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); 68 | printf("%lld + %lld properly paired (%.2f%%:%.2f%%)\n", s->n_pair_good[0], s->n_pair_good[1], (float)s->n_pair_good[0] / s->n_pair_all[0] * 100.0, (float)s->n_pair_good[1] / s->n_pair_all[1] * 100.0); 69 | printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); 70 | printf("%lld + %lld singletons (%.2f%%:%.2f%%)\n", s->n_sgltn[0], s->n_sgltn[1], (float)s->n_sgltn[0] / s->n_pair_all[0] * 100.0, (float)s->n_sgltn[1] / s->n_pair_all[1] * 100.0); 71 | printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); 72 | printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); 73 | free(s); 74 | bam_header_destroy(header); 75 | bam_close(fp); 76 | return 0; 77 | } 78 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bamtk.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "bam.h" 6 | 7 | #ifdef _USE_KNETFILE 8 | #include "knetfile.h" 9 | #endif 10 | 11 | int bam_taf2baf(int argc, char *argv[]); 12 | int bam_mpileup(int argc, char *argv[]); 13 | int bam_merge(int argc, char *argv[]); 14 | int 
bam_index(int argc, char *argv[]); 15 | int bam_sort(int argc, char *argv[]); 16 | int bam_tview_main(int argc, char *argv[]); 17 | int bam_mating(int argc, char *argv[]); 18 | int bam_rmdup(int argc, char *argv[]); 19 | int bam_flagstat(int argc, char *argv[]); 20 | int bam_fillmd(int argc, char *argv[]); 21 | int bam_idxstats(int argc, char *argv[]); 22 | int main_samview(int argc, char *argv[]); 23 | int main_import(int argc, char *argv[]); 24 | int main_reheader(int argc, char *argv[]); 25 | int main_cut_target(int argc, char *argv[]); 26 | int main_phase(int argc, char *argv[]); 27 | int main_cat(int argc, char *argv[]); 28 | int main_depth(int argc, char *argv[]); 29 | int main_bam2fq(int argc, char *argv[]); 30 | 31 | int faidx_main(int argc, char *argv[]); 32 | 33 | static int usage() 34 | { 35 | fprintf(stderr, "\n"); 36 | fprintf(stderr, "Program: samtools (Tools for alignments in the SAM format)\n"); 37 | fprintf(stderr, "Version: %s\n\n", BAM_VERSION); 38 | fprintf(stderr, "Usage: samtools [options]\n\n"); 39 | fprintf(stderr, "Command: view SAM<->BAM conversion\n"); 40 | fprintf(stderr, " sort sort alignment file\n"); 41 | fprintf(stderr, " mpileup multi-way pileup\n"); 42 | fprintf(stderr, " depth compute the depth\n"); 43 | fprintf(stderr, " faidx index/extract FASTA\n"); 44 | #if _CURSES_LIB != 0 45 | fprintf(stderr, " tview text alignment viewer\n"); 46 | #endif 47 | fprintf(stderr, " index index alignment\n"); 48 | fprintf(stderr, " idxstats BAM index stats (r595 or later)\n"); 49 | fprintf(stderr, " fixmate fix mate information\n"); 50 | fprintf(stderr, " flagstat simple stats\n"); 51 | fprintf(stderr, " calmd recalculate MD/NM tags and '=' bases\n"); 52 | fprintf(stderr, " merge merge sorted alignments\n"); 53 | fprintf(stderr, " rmdup remove PCR duplicates\n"); 54 | fprintf(stderr, " reheader replace BAM header\n"); 55 | fprintf(stderr, " cat concatenate BAMs\n"); 56 | fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n"); 
57 | fprintf(stderr, " phase phase heterozygotes\n"); 58 | fprintf(stderr, "\n"); 59 | #ifdef _WIN32 60 | fprintf(stderr, "\ 61 | Note: The Windows version of SAMtools is mainly designed for read-only\n\ 62 | operations, such as viewing the alignments and generating the pileup.\n\ 63 | Binary files generated by the Windows version may be buggy.\n\n"); 64 | #endif 65 | return 1; 66 | } 67 | 68 | int main(int argc, char *argv[]) 69 | { 70 | #ifdef _WIN32 71 | setmode(fileno(stdout), O_BINARY); 72 | setmode(fileno(stdin), O_BINARY); 73 | #ifdef _USE_KNETFILE 74 | knet_win32_init(); 75 | #endif 76 | #endif 77 | if (argc < 2) return usage(); 78 | if (strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1); 79 | else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1); 80 | else if (strcmp(argv[1], "mpileup") == 0) return bam_mpileup(argc-1, argv+1); 81 | else if (strcmp(argv[1], "merge") == 0) return bam_merge(argc-1, argv+1); 82 | else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1); 83 | else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1); 84 | else if (strcmp(argv[1], "idxstats") == 0) return bam_idxstats(argc-1, argv+1); 85 | else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1); 86 | else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1); 87 | else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1); 88 | else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1); 89 | else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1); 90 | else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); 91 | else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1); 92 | else if (strcmp(argv[1], "cat") == 0) return main_cat(argc-1, argv+1); 93 | else if (strcmp(argv[1], "targetcut") == 0) return main_cut_target(argc-1, argv+1); 94 | else if (strcmp(argv[1], "phase") == 0) return 
main_phase(argc-1, argv+1); 95 | else if (strcmp(argv[1], "depth") == 0) return main_depth(argc-1, argv+1); 96 | else if (strcmp(argv[1], "bam2fq") == 0) return main_bam2fq(argc-1, argv+1); 97 | else if (strcmp(argv[1], "pileup") == 0) { 98 | fprintf(stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n"); 99 | return 1; 100 | } 101 | #if _CURSES_LIB != 0 102 | else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1); 103 | #endif 104 | else { 105 | fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); 106 | return 1; 107 | } 108 | return 0; 109 | } 110 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bcftools/Makefile: -------------------------------------------------------------------------------- 1 | CC= gcc 2 | CFLAGS= -g -Wall -O2 #-m64 #-arch ppc 3 | DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE 4 | LOBJS= bcf.o vcf.o bcfutils.o prob1.o em.o kfunc.o kmin.o index.o fet.o mut.o bcf2qcall.o 5 | OMISC= .. 6 | AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o $(OMISC)/bedidx.o 7 | PROG= bcftools 8 | INCLUDES= 9 | SUBDIRS= . 10 | 11 | .SUFFIXES:.c .o 12 | 13 | .c.o: 14 | $(CC) -c $(CFLAGS) $(DFLAGS) -I.. $(INCLUDES) $< -o $@ 15 | 16 | all-recur lib-recur clean-recur cleanlocal-recur install-recur: 17 | @target=`echo $@ | sed s/-recur//`; \ 18 | wdir=`pwd`; \ 19 | list='$(SUBDIRS)'; for subdir in $$list; do \ 20 | cd $$subdir; \ 21 | $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ 22 | INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \ 23 | cd $$wdir; \ 24 | done; 25 | 26 | all:$(PROG) 27 | 28 | lib:libbcf.a 29 | 30 | libbcf.a:$(LOBJS) 31 | $(AR) -csru $@ $(LOBJS) 32 | 33 | bcftools:lib $(AOBJS) 34 | $(CC) $(CFLAGS) -o $@ $(AOBJS) -L. 
$(LIBPATH) -lbcf -lm -lz 35 | 36 | bcf.o:bcf.h 37 | vcf.o:bcf.h 38 | index.o:bcf.h 39 | bcfutils.o:bcf.h 40 | prob1.o:prob1.h bcf.h 41 | call1.o:prob1.h bcf.h 42 | bcf2qcall.o:bcf.h 43 | main.o:bcf.h 44 | 45 | bcf.pdf:bcf.tex 46 | pdflatex bcf 47 | 48 | cleanlocal: 49 | rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a bcf.aux bcf.log bcf.pdf *.class libbcf.*.dylib libbcf.so* 50 | 51 | clean:cleanlocal-recur 52 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bcftools/README: -------------------------------------------------------------------------------- 1 | The view command of bcftools calls variants, tests Hardy-Weinberg 2 | equilibrium (HWE), tests allele balances and estimates allele frequency. 3 | 4 | This command calls a site as a potential variant if P(ref|D,F) is below 5 | 0.9 (controlled by the -p option), where D is data and F is the prior 6 | allele frequency spectrum (AFS). 7 | 8 | The view command performs two types of allele balance tests, both based 9 | on Fisher's exact test for 2x2 contingency tables with the row variable 10 | being reference allele or not. In the first table, the column variable 11 | is strand. Two-tail P-value is taken. We test if variant bases tend to 12 | come from one strand. In the second table, the column variable is 13 | whether a base appears in the first or the last 11bp of the read. 14 | One-tail P-value is taken. We test if variant bases tend to occur 15 | towards the end of reads, which is usually an indication of 16 | misalignment. 17 | 18 | Site allele frequency is estimated in two ways. In the first way, the 19 | frequency is estimated as \argmax_f P(D|f) under the assumption of 20 | HWE. Prior AFS is not used. In the second way, the frequency is 21 | estimated as the posterior expectation of allele counts \sum_k 22 | kP(k|D,F), divided by the total number of haplotypes. HWE is not 23 | assumed, but the estimate depends on the prior AFS. 
The two estimates 24 | largely agree when the signal is strong, but may differ greatly on weak 25 | sites, as in this case the prior plays an important role. 26 | 27 | To test HWE, we calculate the posterior distribution of genotypes 28 | (ref-hom, het and alt-hom). Chi-square test is performed. It is worth 29 | noting that the model used here is prior dependent and assumes HWE, 30 | which is different from both models for allele frequency estimate. The 31 | new model actually yields a third estimate of site allele frequency. 32 | 33 | The estimated allele frequency spectrum is printed to stderr per 64k 34 | sites. The estimate is in fact only the first round of an EM 35 | procedure. The second model (not the model for HWE testing) is used to 36 | estimate the AFS. -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bcftools/bcf.tex: -------------------------------------------------------------------------------- 1 | \documentclass[10pt,pdftex]{article} 2 | \usepackage{color} 3 | \definecolor{gray}{rgb}{0.7,0.7,0.7} 4 | 5 | \setlength{\topmargin}{0.0cm} 6 | \setlength{\textheight}{21.5cm} 7 | \setlength{\oddsidemargin}{0cm} 8 | \setlength{\textwidth}{16.5cm} 9 | \setlength{\columnsep}{0.6cm} 10 | 11 | \begin{document} 12 | 13 | \begin{center} 14 | \begin{tabular}{|l|l|l|l|l|} 15 | \hline 16 | \multicolumn{2}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Description} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\hline\hline 17 | \multicolumn{2}{|l|}{\sf magic} & Magic string & {\tt char[4]} & {\tt BCF\char92 4} \\\hline 18 | \multicolumn{2}{|l|}{\sf l\_seqnm} & Length of concatenated sequence names & {\tt int32\_t} & \\\hline 19 | \multicolumn{2}{|l|}{\sf seqnm} & Concatenated names, {\tt NULL} padded & {\tt char[{\sf l\_seqnm}]} & \\\hline 20 | \multicolumn{2}{|l|}{\sf l\_smpl} & Length of concatenated sample names & {\tt int32\_t} & \\\hline 21 | \multicolumn{2}{|l|}{\sf smpl} & Concatenated 
sample names & {\tt char[{\sf l\_smpl}]} & \\\hline 22 | \multicolumn{2}{|l|}{\sf l\_meta} & Length of the meta text (double-hash lines)& {\tt int32\_t} & \\\hline 23 | \multicolumn{2}{|l|}{\sf meta} & Meta text, {\tt NULL} terminated & {\tt char[{\sf l\_meta}]} & \\\hline 24 | \multicolumn{5}{|c|}{\it \color{gray}{List of records until the end of the file}}\\\cline{2-5} 25 | & {\sf seq\_id} & Reference sequence ID & {\tt int32\_t} & \\\cline{2-5} 26 | & {\sf pos} & Position & {\tt int32\_t} & \\\cline{2-5} 27 | & {\sf qual} & Variant quality & {\tt float} & \\\cline{2-5} 28 | & {\sf l\_str} & Length of {\sf str} & {\tt int32\_t} & \\\cline{2-5} 29 | & {\sf str} & {\tt ID+REF+ALT+FILTER+INFO+FORMAT}, {\tt NULL} padded & {\tt char[{\sf l\_str}]} &\\\cline{2-5} 30 | & \multicolumn{4}{c|}{Blocks of data; \#blocks and formats defined by {\tt FORMAT} (table below)}\\ 31 | \hline 32 | \end{tabular} 33 | \end{center} 34 | 35 | \begin{center} 36 | \begin{tabular}{clp{9cm}} 37 | \hline 38 | \multicolumn{1}{l}{\bf Field} & \multicolumn{1}{l}{\bf Type} & \multicolumn{1}{l}{\bf Description} \\\hline 39 | {\tt DP} & {\tt uint16\_t[n]} & Read depth \\ 40 | {\tt GL} & {\tt float[n*G]} & Log10 likelihood of data; $G=\frac{A(A+1)}{2}$, $A=\#\{alleles\}$\\ 41 | {\tt GT} & {\tt uint8\_t[n]} & {\tt missing\char60\char60 7 | phased\char60\char60 6 | allele1\char60\char60 3 | allele2} \\ 42 | {\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$. If the highest bit is set, 43 | the allele is not present (e.g. 
due to different ploidy between samples).} \\ 44 | {\tt GQ} & {\tt uint8\_t[n]} & {Genotype quality}\\ 45 | {\tt HQ} & {\tt uint8\_t[n*2]} & {Haplotype quality}\\ 46 | {\tt \_HQ} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic HQ}\\ 47 | {\tt IBD} & {\tt uint32\_t[n*2]} & {IBD}\\ 48 | {\tt \_IBD} & {\tt uint8\_t+uint32\_t[n*P]} & {Generic IBD}\\ 49 | {\tt PL} & {\tt uint8\_t[n*G]} & {Phred-scaled likelihood of data}\\ 50 | {\tt PS} & {\tt uint32\_t[n]} & {Phase set}\\ 51 | %{\tt SP} & {\tt uint8\_t[n]} & {Strand bias P-value (bcftools only)}\\ 52 | \emph{Integer} & {\tt int32\_t[n*X]} & {Fixed-size custom Integer; $X$ defined in the header}\\ 53 | \emph{Numeric} & {\tt double[n*X]} & {Fixed-size custom Numeric}\\ 54 | \emph{String} & {\tt uint32\_t+char*} & {\tt NULL} padded concat. strings (int equals the length) \\ 55 | \hline 56 | \end{tabular} 57 | \end{center} 58 | 59 | \begin{itemize} 60 | \item A BCF file is in the {\tt BGZF} format. 61 | \item All multi-byte numbers are little-endian. 62 | \item In a string, a missing value `.' is an empty C string ``{\tt 63 | \char92 0}'' (not ``{\tt .\char92 0}'') 64 | \item For {\tt GL} and {\tt PL}, likelihoods of genotypes appear in the 65 | order of alleles in {\tt REF} and then {\tt ALT}. For example, if {\tt 66 | REF=C}, {\tt ALT=T,A}, likelihoods appear in the order of {\tt 67 | CC,CT,TT,CA,TA,AA} (NB: the ordering is different from the one in the original 68 | BCF proposal). 69 | \item Predefined {\tt FORMAT} fields can be missing from VCF headers, but custom {\tt FORMAT} fields 70 | are required to be explicitly defined in the headers. 71 | \item A {\tt FORMAT} field with its name starting with `{\tt \_}' is specific to BCF only. 72 | It gives an alternative binary representation of the corresponding VCF field, in case 73 | the default representation is unable to keep the genotype information, 74 | for example, when the ploidy is not 2 or there are more than 8 alleles. 
75 | \end{itemize} 76 | 77 | \end{document} 78 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bcftools/bcf2qcall.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "bcf.h" 6 | 7 | static int8_t nt4_table[256] = { 8 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 9 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 10 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, 11 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 12 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 13 | 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4, 14 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 15 | 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4, 16 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 17 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 18 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 19 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 20 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 21 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 22 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 23 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 24 | }; 25 | 26 | static int read_I16(bcf1_t *b, int anno[16]) 27 | { 28 | char *p; 29 | int i; 30 | if ((p = strstr(b->info, "I16=")) == 0) return -1; 31 | p += 4; 32 | for (i = 0; i < 16; ++i) { 33 | anno[i] = strtol(p, &p, 10); 34 | if (anno[i] == 0 && (errno == EINVAL || errno == ERANGE)) return -2; 35 | ++p; 36 | } 37 | return 0; 38 | } 39 | 40 | int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b) 41 | { 42 | int a[4], k, g[10], l, map[4], k1, j, i, i0, anno[16], dp, mq, d_rest; 43 | char *s; 44 | if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base 45 | for (i = 0; i < b->n_gi; ++i) 46 | if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; 47 | if (i == b->n_gi) return -1; // no PL 48 | if (read_I16(b, anno) != 0) return -1; // no I16; FIXME: 
can be improved 49 | d_rest = dp = anno[0] + anno[1] + anno[2] + anno[3]; 50 | if (dp == 0) return -1; // depth is zero 51 | mq = (int)(sqrt((double)(anno[9] + anno[11]) / dp) + .499); 52 | i0 = i; 53 | a[0] = nt4_table[(int)b->ref[0]]; 54 | if (a[0] > 3) return -1; // ref is not A/C/G/T 55 | a[1] = a[2] = a[3] = -2; // -1 has a special meaning 56 | if (b->alt[0] == 0) return -1; // no alternate allele 57 | map[0] = map[1] = map[2] = map[3] = -2; 58 | map[a[0]] = 0; 59 | for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) { 60 | if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base 61 | a[k+1] = nt4_table[(int)*s]; 62 | if (a[k+1] >= 0) map[a[k+1]] = k+1; 63 | else k1 = k+1; 64 | if (s[1] == 0) break; 65 | } 66 | for (k = 0; k < 4; ++k) 67 | if (map[k] < 0) map[k] = k1; 68 | for (i = 0; i < h->n_smpl; ++i) { 69 | int d; 70 | uint8_t *p = b->gi[i0].data + i * b->gi[i0].len; 71 | for (j = 0; j < b->gi[i0].len; ++j) 72 | if (p[j]) break; 73 | d = (int)((double)d_rest / (h->n_smpl - i) + .499); 74 | if (d == 0) d = 1; 75 | if (j == b->gi[i0].len) d = 0; 76 | d_rest -= d; 77 | for (k = j = 0; k < 4; ++k) { 78 | for (l = k; l < 4; ++l) { 79 | int t, x = map[k], y = map[l]; 80 | if (x > y) t = x, x = y, y = t; // swap 81 | g[j++] = p[y * (y+1) / 2 + x]; 82 | } 83 | } 84 | printf("%s\t%d\t%c", h->ns[b->tid], b->pos+1, *b->ref); 85 | printf("\t%d\t%d\t0", d, mq); 86 | for (j = 0; j < 10; ++j) 87 | printf("\t%d", g[j]); 88 | printf("\t%s\n", h->sns[i]); 89 | } 90 | return 0; 91 | } 92 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bcftools/fet.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /* This program is implemented with ideas from this web page: 5 | * 6 | * http://www.langsrud.com/fisher.htm 7 | */ 8 | 9 | // log\binom{n}{k} 10 | static double lbinom(int n, int k) 11 | { 12 | if (k == 0 || n == k) return 0; 13 | return 
lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1); 14 | } 15 | 16 | // n11 n12 | n1_ 17 | // n21 n22 | n2_ 18 | //-----------+---- 19 | // n_1 n_2 | n 20 | 21 | // hypergeometric distribution 22 | static double hypergeo(int n11, int n1_, int n_1, int n) 23 | { 24 | return exp(lbinom(n1_, n11) + lbinom(n-n1_, n_1-n11) - lbinom(n, n_1)); 25 | } 26 | 27 | typedef struct { 28 | int n11, n1_, n_1, n; 29 | double p; 30 | } hgacc_t; 31 | 32 | // incremental version of hypergenometric distribution 33 | static double hypergeo_acc(int n11, int n1_, int n_1, int n, hgacc_t *aux) 34 | { 35 | if (n1_ || n_1 || n) { 36 | aux->n11 = n11; aux->n1_ = n1_; aux->n_1 = n_1; aux->n = n; 37 | } else { // then only n11 changed; the rest fixed 38 | if (n11%11 && n11 + aux->n - aux->n1_ - aux->n_1) { 39 | if (n11 == aux->n11 + 1) { // incremental 40 | aux->p *= (double)(aux->n1_ - aux->n11) / n11 41 | * (aux->n_1 - aux->n11) / (n11 + aux->n - aux->n1_ - aux->n_1); 42 | aux->n11 = n11; 43 | return aux->p; 44 | } 45 | if (n11 == aux->n11 - 1) { // incremental 46 | aux->p *= (double)aux->n11 / (aux->n1_ - n11) 47 | * (aux->n11 + aux->n - aux->n1_ - aux->n_1) / (aux->n_1 - n11); 48 | aux->n11 = n11; 49 | return aux->p; 50 | } 51 | } 52 | aux->n11 = n11; 53 | } 54 | aux->p = hypergeo(aux->n11, aux->n1_, aux->n_1, aux->n); 55 | return aux->p; 56 | } 57 | 58 | double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two) 59 | { 60 | int i, j, max, min; 61 | double p, q, left, right; 62 | hgacc_t aux; 63 | int n1_, n_1, n; 64 | 65 | n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 and n 66 | max = (n_1 < n1_) ? 
n_1 : n1_; // max n11, for right tail 67 | min = n1_ + n_1 - n; 68 | if (min < 0) min = 0; // min n11, for left tail 69 | *two = *_left = *_right = 1.; 70 | if (min == max) return 1.; // no need to do test 71 | q = hypergeo_acc(n11, n1_, n_1, n, &aux); // the probability of the current table 72 | // left tail 73 | p = hypergeo_acc(min, 0, 0, 0, &aux); 74 | for (left = 0., i = min + 1; p < 0.99999999 * q; ++i) // loop until underflow 75 | left += p, p = hypergeo_acc(i, 0, 0, 0, &aux); 76 | --i; 77 | if (p < 1.00000001 * q) left += p; 78 | else --i; 79 | // right tail 80 | p = hypergeo_acc(max, 0, 0, 0, &aux); 81 | for (right = 0., j = max - 1; p < 0.99999999 * q; --j) // loop until underflow 82 | right += p, p = hypergeo_acc(j, 0, 0, 0, &aux); 83 | ++j; 84 | if (p < 1.00000001 * q) right += p; 85 | else ++j; 86 | // two-tail 87 | *two = left + right; 88 | if (*two > 1.) *two = 1.; 89 | // adjust left and right 90 | if (abs(i - n11) < abs(j - n11)) right = 1. - left + q; 91 | else left = 1.0 - right + q; 92 | *_left = left; *_right = right; 93 | return q; 94 | } 95 | 96 | #ifdef FET_MAIN 97 | #include 98 | 99 | int main(int argc, char *argv[]) 100 | { 101 | char id[1024]; 102 | int n11, n12, n21, n22; 103 | double left, right, twotail, prob; 104 | 105 | while (scanf("%s%d%d%d%d", id, &n11, &n12, &n21, &n22) == 5) { 106 | prob = kt_fisher_exact(n11, n12, n21, n22, &left, &right, &twotail); 107 | printf("%s\t%d\t%d\t%d\t%d\t%.6g\t%.6g\t%.6g\t%.6g\n", id, n11, n12, n21, n22, 108 | prob, left, right, twotail); 109 | } 110 | return 0; 111 | } 112 | #endif 113 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bcftools/kfunc.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | /* Log gamma function 5 | * \log{\Gamma(z)} 6 | * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245 7 | */ 8 | double kf_lgamma(double z) 9 | { 10 | double x = 0; 11 | x += 
0.1659470187408462e-06 / (z+7); 12 | x += 0.9934937113930748e-05 / (z+6); 13 | x -= 0.1385710331296526 / (z+5); 14 | x += 12.50734324009056 / (z+4); 15 | x -= 176.6150291498386 / (z+3); 16 | x += 771.3234287757674 / (z+2); 17 | x -= 1259.139216722289 / (z+1); 18 | x += 676.5203681218835 / z; 19 | x += 0.9999999999995183; 20 | return log(x) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5); 21 | } 22 | 23 | /* complementary error function 24 | * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt 25 | * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66 26 | */ 27 | double kf_erfc(double x) 28 | { 29 | const double p0 = 220.2068679123761; 30 | const double p1 = 221.2135961699311; 31 | const double p2 = 112.0792914978709; 32 | const double p3 = 33.912866078383; 33 | const double p4 = 6.37396220353165; 34 | const double p5 = .7003830644436881; 35 | const double p6 = .03526249659989109; 36 | const double q0 = 440.4137358247522; 37 | const double q1 = 793.8265125199484; 38 | const double q2 = 637.3336333788311; 39 | const double q3 = 296.5642487796737; 40 | const double q4 = 86.78073220294608; 41 | const double q5 = 16.06417757920695; 42 | const double q6 = 1.755667163182642; 43 | const double q7 = .08838834764831844; 44 | double expntl, z, p; 45 | z = fabs(x) * M_SQRT2; 46 | if (z > 37.) return x > 0.? 0. : 2.; 47 | expntl = exp(z * z * - .5); 48 | if (z < 10. / M_SQRT2) // for small z 49 | p = expntl * ((((((p6 * z + p5) * z + p4) * z + p3) * z + p2) * z + p1) * z + p0) 50 | / (((((((q7 * z + q6) * z + q5) * z + q4) * z + q3) * z + q2) * z + q1) * z + q0); 51 | else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65))))); 52 | return x > 0.? 2. * p : 2. * (1. - p); 53 | } 54 | 55 | /* The following computes regularized incomplete gamma functions. 56 | * Formulas are taken from Wiki, with additional input from Numerical 57 | * Recipes in C (for modified Lentz's algorithm) and AS245 58 | * (http://lib.stat.cmu.edu/apstat/245). 
59 | * 60 | * A good online calculator is available at: 61 | * 62 | * http://www.danielsoper.com/statcalc/calc23.aspx 63 | * 64 | * It calculates upper incomplete gamma function, which equals 65 | * kf_gammaq(s,z)*tgamma(s). 66 | */ 67 | 68 | #define KF_GAMMA_EPS 1e-14 69 | #define KF_TINY 1e-290 70 | 71 | // regularized lower incomplete gamma function, by series expansion 72 | static double _kf_gammap(double s, double z) 73 | { 74 | double sum, x; 75 | int k; 76 | for (k = 1, sum = x = 1.; k < 100; ++k) { 77 | sum += (x *= z / (s + k)); 78 | if (x / sum < KF_GAMMA_EPS) break; 79 | } 80 | return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(sum)); 81 | } 82 | // regularized upper incomplete gamma function, by continued fraction 83 | static double _kf_gammaq(double s, double z) 84 | { 85 | int j; 86 | double C, D, f; 87 | f = 1. + z - s; C = f; D = 0.; 88 | // Modified Lentz's algorithm for computing continued fraction 89 | // See Numerical Recipes in C, 2nd edition, section 5.2 90 | for (j = 1; j < 100; ++j) { 91 | double a = j * (s - j), b = (j<<1) + 1 + z - s, d; 92 | D = b + a * D; 93 | if (D < KF_TINY) D = KF_TINY; 94 | C = b + a / C; 95 | if (C < KF_TINY) C = KF_TINY; 96 | D = 1. / D; 97 | d = C * D; 98 | f *= d; 99 | if (fabs(d - 1.) < KF_GAMMA_EPS) break; 100 | } 101 | return exp(s * log(z) - z - kf_lgamma(s) - log(f)); 102 | } 103 | 104 | double kf_gammap(double s, double z) 105 | { 106 | return z <= 1. || z < s? _kf_gammap(s, z) : 1. - _kf_gammaq(s, z); 107 | } 108 | 109 | double kf_gammaq(double s, double z) 110 | { 111 | return z <= 1. || z < s? 1. - _kf_gammap(s, z) : _kf_gammaq(s, z); 112 | } 113 | 114 | /* Regularized incomplete beta function. The method is taken from 115 | * Numerical Recipe in C, 2nd edition, section 6.4. 
The following web 116 | * page calculates the incomplete beta function, which equals 117 | * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b): 118 | * 119 | * http://www.danielsoper.com/statcalc/calc36.aspx 120 | */ 121 | static double kf_betai_aux(double a, double b, double x) 122 | { 123 | double C, D, f; 124 | int j; 125 | if (x == 0.) return 0.; 126 | if (x == 1.) return 1.; 127 | f = 1.; C = f; D = 0.; 128 | // Modified Lentz's algorithm for computing continued fraction 129 | for (j = 1; j < 200; ++j) { 130 | double aa, d; 131 | int m = j>>1; 132 | aa = (j&1)? -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1)) 133 | : m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m)); 134 | D = 1. + aa * D; 135 | if (D < KF_TINY) D = KF_TINY; 136 | C = 1. + aa / C; 137 | if (C < KF_TINY) C = KF_TINY; 138 | D = 1. / D; 139 | d = C * D; 140 | f *= d; 141 | if (fabs(d - 1.) < KF_GAMMA_EPS) break; 142 | } 143 | return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / f; 144 | } 145 | double kf_betai(double a, double b, double x) 146 | { 147 | return x < (a + 1.) / (a + b + 2.)? kf_betai_aux(a, b, x) : 1. - kf_betai_aux(b, a, 1. 
/* Demo driver, compiled only when KF_MAIN is defined: prints kf_erfc() next
 * to the C library erfc(), an upper incomplete gamma value derived from
 * kf_gammaq(), and an incomplete beta value derived from kf_betai(). */
int main(int argc, char *argv[])
{
	double x, y, a, b;

	x = 5.5;
	y = 3;
	printf("erfc(%lg): %lg, %lg\n", x, erfc(x), kf_erfc(x));
	printf("upper-gamma(%lg,%lg): %lg\n", x, y, kf_gammaq(y, x)*tgamma(y));

	a = 2;
	b = 2;
	x = 0.5;
	printf("incomplete-beta(%lg,%lg,%lg): %lg\n", a, b, x, kf_betai(a, b, x) / exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b)));
	return 0;
}
23 | */ 24 | 25 | #ifndef KMIN_H 26 | #define KMIN_H 27 | 28 | #define KMIN_RADIUS 0.5 29 | #define KMIN_EPS 1e-7 30 | #define KMIN_MAXCALL 50000 31 | 32 | typedef double (*kmin_f)(int, double*, void*); 33 | typedef double (*kmin1_f)(double, void*); 34 | 35 | #ifdef __cplusplus 36 | extern "C" { 37 | #endif 38 | 39 | double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls); 40 | double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin); 41 | 42 | #ifdef __cplusplus 43 | } 44 | #endif 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bcftools/main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "bcf.h" 6 | 7 | #include "kseq.h" 8 | KSTREAM_INIT(gzFile, gzread, 0x10000) 9 | 10 | int bcfview(int argc, char *argv[]); 11 | int bcf_main_index(int argc, char *argv[]); 12 | 13 | #define BUF_SIZE 0x10000 14 | 15 | int bcf_cat(int n, char * const *fn) 16 | { 17 | int i; 18 | bcf_t *out; 19 | uint8_t *buf; 20 | buf = malloc(BUF_SIZE); 21 | out = bcf_open("-", "w"); 22 | for (i = 0; i < n; ++i) { 23 | bcf_t *in; 24 | bcf_hdr_t *h; 25 | off_t end; 26 | struct stat s; 27 | in = bcf_open(fn[i], "r"); 28 | h = bcf_hdr_read(in); 29 | if (i == 0) bcf_hdr_write(out, h); 30 | bcf_hdr_destroy(h); 31 | #ifdef _USE_KNETFILE 32 | fstat(knet_fileno(in->fp->x.fpr), &s); 33 | end = s.st_size - 28; 34 | while (knet_tell(in->fp->x.fpr) < end) { 35 | int size = knet_tell(in->fp->x.fpr) + BUF_SIZE < end? 
BUF_SIZE : end - knet_tell(in->fp->x.fpr); 36 | knet_read(in->fp->x.fpr, buf, size); 37 | fwrite(buf, 1, size, out->fp->x.fpw); 38 | } 39 | #else 40 | abort(); // FIXME: not implemented 41 | #endif 42 | bcf_close(in); 43 | } 44 | bcf_close(out); 45 | free(buf); 46 | return 0; 47 | } 48 | 49 | extern double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); 50 | 51 | int bcf_main_ldpair(int argc, char *argv[]) 52 | { 53 | bcf_t *fp; 54 | bcf_hdr_t *h; 55 | bcf1_t *b0, *b1; 56 | bcf_idx_t *idx; 57 | kstring_t str; 58 | void *str2id; 59 | gzFile fplist; 60 | kstream_t *ks; 61 | int dret, lineno = 0; 62 | if (argc < 3) { 63 | fprintf(stderr, "Usage: bcftools ldpair \n"); 64 | return 1; 65 | } 66 | fplist = gzopen(argv[2], "rb"); 67 | ks = ks_init(fplist); 68 | memset(&str, 0, sizeof(kstring_t)); 69 | fp = bcf_open(argv[1], "rb"); 70 | h = bcf_hdr_read(fp); 71 | str2id = bcf_build_refhash(h); 72 | idx = bcf_idx_load(argv[1]); 73 | if (idx == 0) { 74 | fprintf(stderr, "[%s] No bcf index is found. Abort!\n", __func__); 75 | return 1; 76 | } 77 | b0 = calloc(1, sizeof(bcf1_t)); 78 | b1 = calloc(1, sizeof(bcf1_t)); 79 | while (ks_getuntil(ks, '\n', &str, &dret) >= 0) { 80 | char *p, *q; 81 | int k; 82 | int tid0 = -1, tid1 = -1, pos0 = -1, pos1 = -1; 83 | ++lineno; 84 | for (p = q = str.s, k = 0; *p; ++p) { 85 | if (*p == ' ' || *p == '\t') { 86 | *p = '\0'; 87 | if (k == 0) tid0 = bcf_str2id(str2id, q); 88 | else if (k == 1) pos0 = atoi(q) - 1; 89 | else if (k == 2) tid1 = strcmp(q, "=")? 
bcf_str2id(str2id, q) : tid0; 90 | else if (k == 3) pos1 = atoi(q) - 1; 91 | q = p + 1; 92 | ++k; 93 | } 94 | } 95 | if (k == 3) pos1 = atoi(q) - 1; 96 | if (tid0 >= 0 && tid1 >= 0 && pos0 >= 0 && pos1 >= 0) { 97 | uint64_t off; 98 | double r, f[4]; 99 | off = bcf_idx_query(idx, tid0, pos0); 100 | bgzf_seek(fp->fp, off, SEEK_SET); 101 | while (bcf_read(fp, h, b0) >= 0 && b0->pos != pos0); 102 | off = bcf_idx_query(idx, tid1, pos1); 103 | bgzf_seek(fp->fp, off, SEEK_SET); 104 | while (bcf_read(fp, h, b1) >= 0 && b1->pos != pos1); 105 | r = bcf_pair_freq(b0, b1, f); 106 | r *= r; 107 | printf("%s\t%d\t%s\t%d\t%.4g\t%.4g\t%.4g\t%.4g\t%.4g\n", h->ns[tid0], pos0+1, h->ns[tid1], pos1+1, 108 | r, f[0], f[1], f[2], f[3]); 109 | } //else fprintf(stderr, "[%s] Parse error at line %d.\n", __func__, lineno); 110 | } 111 | bcf_destroy(b0); bcf_destroy(b1); 112 | bcf_idx_destroy(idx); 113 | bcf_str2id_destroy(str2id); 114 | bcf_hdr_destroy(h); 115 | bcf_close(fp); 116 | free(str.s); 117 | ks_destroy(ks); 118 | gzclose(fplist); 119 | return 0; 120 | } 121 | 122 | int bcf_main_ld(int argc, char *argv[]) 123 | { 124 | bcf_t *fp; 125 | bcf_hdr_t *h; 126 | bcf1_t **b, *b0; 127 | int i, j, m, n; 128 | double f[4]; 129 | if (argc == 1) { 130 | fprintf(stderr, "Usage: bcftools ld \n"); 131 | return 1; 132 | } 133 | fp = bcf_open(argv[1], "rb"); 134 | h = bcf_hdr_read(fp); 135 | // read the entire BCF 136 | m = n = 0; b = 0; 137 | b0 = calloc(1, sizeof(bcf1_t)); 138 | while (bcf_read(fp, h, b0) >= 0) { 139 | if (m == n) { 140 | m = m? 
m<<1 : 16; 141 | b = realloc(b, sizeof(void*) * m); 142 | } 143 | b[n] = calloc(1, sizeof(bcf1_t)); 144 | bcf_cpy(b[n++], b0); 145 | } 146 | bcf_destroy(b0); 147 | // compute pair-wise r^2 148 | printf("%d\n", n); // the number of loci 149 | for (i = 0; i < n; ++i) { 150 | printf("%s:%d", h->ns[b[i]->tid], b[i]->pos + 1); 151 | for (j = 0; j < i; ++j) { 152 | double r = bcf_pair_freq(b[i], b[j], f); 153 | printf("\t%.3f", r*r); 154 | } 155 | printf("\t1.000\n"); 156 | } 157 | // free 158 | for (i = 0; i < n; ++i) bcf_destroy(b[i]); 159 | free(b); 160 | bcf_hdr_destroy(h); 161 | bcf_close(fp); 162 | return 0; 163 | } 164 | 165 | int main(int argc, char *argv[]) 166 | { 167 | if (argc == 1) { 168 | fprintf(stderr, "\n"); 169 | fprintf(stderr, "Program: bcftools (Tools for data in the VCF/BCF formats)\n"); 170 | fprintf(stderr, "Version: %s\n\n", BCF_VERSION); 171 | fprintf(stderr, "Usage: bcftools \n\n"); 172 | fprintf(stderr, "Command: view print, extract, convert and call SNPs from BCF\n"); 173 | fprintf(stderr, " index index BCF\n"); 174 | fprintf(stderr, " cat concatenate BCFs\n"); 175 | fprintf(stderr, " ld compute all-pair r^2\n"); 176 | fprintf(stderr, " ldpair compute r^2 between requested pairs\n"); 177 | fprintf(stderr, "\n"); 178 | return 1; 179 | } 180 | if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1); 181 | else if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1); 182 | else if (strcmp(argv[1], "ld") == 0) return bcf_main_ld(argc-1, argv+1); 183 | else if (strcmp(argv[1], "ldpair") == 0) return bcf_main_ldpair(argc-1, argv+1); 184 | else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2); // cat is different ... 
185 | else { 186 | fprintf(stderr, "[main] Unrecognized command.\n"); 187 | return 1; 188 | } 189 | return 0; 190 | } 191 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bcftools/mut.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "bcf.h" 4 | 5 | #define MAX_GENO 359 6 | 7 | int8_t seq_bitcnt[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; 8 | char *seq_nt16rev = "XACMGRSVTWYHKDBN"; 9 | 10 | uint32_t *bcf_trio_prep(int is_x, int is_son) 11 | { 12 | int i, j, k, n, map[10]; 13 | uint32_t *ret; 14 | ret = calloc(MAX_GENO, 4); 15 | for (i = 0, k = 0; i < 4; ++i) 16 | for (j = i; j < 4; ++j) 17 | map[k++] = 1<n_smpl != 3) return -1; // not a trio 44 | for (i = 0; i < b->n_gi; ++i) 45 | if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; 46 | if (i == b->n_gi) return -1; // no PL 47 | gl10 = alloca(10 * b->n_smpl); 48 | if (bcf_gl10(b, gl10) < 0) { 49 | if (bcf_gl10_indel(b, gl10) < 0) return -1; 50 | } 51 | PL = b->gi + i; 52 | for (i = 0, k = 0; i < 4; ++i) 53 | for (j = i; j < 4; ++j) 54 | map[k++] = seq_nt16rev[1<data)[j * PL->len] != 0) break; 57 | if (j < 3) { // we need to go through the complex procedure 58 | uint8_t *g[3]; 59 | int minc = 1<<30, minc_j = -1, minf = 0, gtf = 0, gtc = 0; 60 | g[0] = gl10; 61 | g[1] = gl10 + 10; 62 | g[2] = gl10 + 20; 63 | for (j = 1; j <= (int)prep[0]; ++j) { // compute LK with constraint 64 | int sum = g[0][prep[j]&0xff] + g[1][prep[j]>>8&0xff] + g[2][prep[j]>>16&0xff]; 65 | if (sum < minc) minc = sum, minc_j = j; 66 | } 67 | gtc |= map[prep[minc_j]&0xff]; gtc |= map[prep[minc_j]>>8&0xff]<<8; gtc |= map[prep[minc_j]>>16]<<16; 68 | for (j = 0; j < 3; ++j) { // compute LK without constraint 69 | int min = 1<<30, min_k = -1; 70 | for (k = 0; k < 10; ++k) 71 | if (g[j][k] < min) min = g[j][k], min_k = k; 72 | gtf |= map[min_k]<<(j*8); 73 | minf += min; 74 | } 75 | *llr = minc - minf; *gt = 
(int64_t)gtc<<32 | gtf; 76 | } else *llr = 0, *gt = -1; 77 | return 0; 78 | } 79 | 80 | int bcf_pair_call(const bcf1_t *b) 81 | { 82 | int i, j, k; 83 | const bcf_ginfo_t *PL; 84 | if (b->n_smpl != 2) return -1; // not a pair 85 | for (i = 0; i < b->n_gi; ++i) 86 | if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; 87 | if (i == b->n_gi) return -1; // no PL 88 | PL = b->gi + i; 89 | for (j = 0; j < 2; ++j) // check if ref hom is the most probable in all members 90 | if (((uint8_t*)PL->data)[j * PL->len] != 0) break; 91 | if (j < 2) { // we need to go through the complex procedure 92 | uint8_t *g[2]; 93 | int minc = 1<<30, minf = 0; 94 | g[0] = PL->data; 95 | g[1] = (uint8_t*)PL->data + PL->len; 96 | for (j = 0; j < PL->len; ++j) // compute LK with constraint 97 | minc = minc < g[0][j] + g[1][j]? minc : g[0][j] + g[1][j]; 98 | for (j = 0; j < 2; ++j) { // compute LK without constraint 99 | int min = 1<<30; 100 | for (k = 0; k < PL->len; ++k) 101 | min = min < g[j][k]? min : g[j][k]; 102 | minf += min; 103 | } 104 | return minc - minf; 105 | } else return 0; 106 | } 107 | 108 | int bcf_min_diff(const bcf1_t *b) 109 | { 110 | int i, min = 1<<30; 111 | const bcf_ginfo_t *PL; 112 | for (i = 0; i < b->n_gi; ++i) 113 | if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; 114 | if (i == b->n_gi) return -1; // no PL 115 | PL = b->gi + i; 116 | for (i = 0; i < b->n_smpl; ++i) { 117 | int m1, m2, j; 118 | const uint8_t *p = (uint8_t*)PL->data; 119 | m1 = m2 = 1<<30; 120 | for (j = 0; j < PL->len; ++j) { 121 | if ((int)p[j] < m1) m2 = m1, m1 = p[j]; 122 | else if ((int)p[j] < m2) m2 = p[j]; 123 | } 124 | min = min < m2 - m1? 
min : m2 - m1; 125 | } 126 | return min; 127 | } 128 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bcftools/prob1.h: -------------------------------------------------------------------------------- 1 | #ifndef BCF_PROB1_H 2 | #define BCF_PROB1_H 3 | 4 | #include "bcf.h" 5 | 6 | struct __bcf_p1aux_t; 7 | typedef struct __bcf_p1aux_t bcf_p1aux_t; 8 | 9 | typedef struct { 10 | int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal() 11 | int ac; // ML alternative allele count 12 | double f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var; 13 | double cil, cih; 14 | double cmp[3], p_chi2, lrt; // used by contrast2() 15 | } bcf_p1rst_t; 16 | 17 | #define MC_PTYPE_FULL 1 18 | #define MC_PTYPE_COND2 2 19 | #define MC_PTYPE_FLAT 3 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy); 26 | void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta); 27 | void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta); 28 | void bcf_p1_destroy(bcf_p1aux_t *ma); 29 | int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst); 30 | int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k); 31 | void bcf_p1_dump_afs(bcf_p1aux_t *ma); 32 | int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn); 33 | int bcf_p1_set_n1(bcf_p1aux_t *b, int n1); 34 | void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called 35 | 36 | int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]); 37 | 38 | #ifdef __cplusplus 39 | } 40 | #endif 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bedidx.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #ifdef _WIN32 8 | #define drand48() ((double)rand() / 
RAND_MAX) 9 | #endif 10 | 11 | #include "ksort.h" 12 | KSORT_INIT_GENERIC(uint64_t) 13 | 14 | #include "kseq.h" 15 | KSTREAM_INIT(gzFile, gzread, 8192) 16 | 17 | typedef struct { 18 | int n, m; 19 | uint64_t *a; 20 | int *idx; 21 | } bed_reglist_t; 22 | 23 | #include "khash.h" 24 | KHASH_MAP_INIT_STR(reg, bed_reglist_t) 25 | 26 | #define LIDX_SHIFT 13 27 | 28 | typedef kh_reg_t reghash_t; 29 | 30 | int *bed_index_core(int n, uint64_t *a, int *n_idx) 31 | { 32 | int i, j, m, *idx; 33 | m = *n_idx = 0; idx = 0; 34 | for (i = 0; i < n; ++i) { 35 | int beg, end; 36 | beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; 37 | if (m < end + 1) { 38 | int oldm = m; 39 | m = end + 1; 40 | kroundup32(m); 41 | idx = realloc(idx, m * sizeof(int)); 42 | for (j = oldm; j < m; ++j) idx[j] = -1; 43 | } 44 | if (beg == end) { 45 | if (idx[beg] < 0) idx[beg] = i; 46 | } else { 47 | for (j = beg; j <= end; ++j) 48 | if (idx[j] < 0) idx[j] = i; 49 | } 50 | *n_idx = end + 1; 51 | } 52 | return idx; 53 | } 54 | 55 | void bed_index(void *_h) 56 | { 57 | reghash_t *h = (reghash_t*)_h; 58 | khint_t k; 59 | for (k = 0; k < kh_end(h); ++k) { 60 | if (kh_exist(h, k)) { 61 | bed_reglist_t *p = &kh_val(h, k); 62 | if (p->idx) free(p->idx); 63 | ks_introsort(uint64_t, p->n, p->a); 64 | p->idx = bed_index_core(p->n, p->a, &p->m); 65 | } 66 | } 67 | } 68 | 69 | int bed_overlap_core(const bed_reglist_t *p, int beg, int end) 70 | { 71 | int i, min_off; 72 | if (p->n == 0) return 0; 73 | min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; 74 | if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here 75 | int n = beg>>LIDX_SHIFT; 76 | if (n > p->n) n = p->n; 77 | for (i = n - 1; i >= 0; --i) 78 | if (p->idx[i] >= 0) break; 79 | min_off = i >= 0? 
p->idx[i] : 0; 80 | } 81 | for (i = min_off; i < p->n; ++i) { 82 | if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed 83 | if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end) 84 | return 1; // find the overlap; return 85 | } 86 | return 0; 87 | } 88 | 89 | int bed_overlap(const void *_h, const char *chr, int beg, int end) 90 | { 91 | const reghash_t *h = (const reghash_t*)_h; 92 | khint_t k; 93 | if (!h) return 0; 94 | k = kh_get(reg, h, chr); 95 | if (k == kh_end(h)) return 0; 96 | return bed_overlap_core(&kh_val(h, k), beg, end); 97 | } 98 | 99 | void *bed_read(const char *fn) 100 | { 101 | reghash_t *h = kh_init(reg); 102 | gzFile fp; 103 | kstream_t *ks; 104 | int dret; 105 | kstring_t *str; 106 | // read the list 107 | fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); 108 | if (fp == 0) return 0; 109 | str = calloc(1, sizeof(kstring_t)); 110 | ks = ks_init(fp); 111 | while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name 112 | int beg = -1, end = -1; 113 | bed_reglist_t *p; 114 | khint_t k = kh_get(reg, h, str->s); 115 | if (k == kh_end(h)) { // absent from the hash table 116 | int ret; 117 | char *s = strdup(str->s); 118 | k = kh_put(reg, h, s, &ret); 119 | memset(&kh_val(h, k), 0, sizeof(bed_reglist_t)); 120 | } 121 | p = &kh_val(h, k); 122 | if (dret != '\n') { // if the lines has other characters 123 | if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { 124 | beg = atoi(str->s); // begin 125 | if (dret != '\n') { 126 | if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { 127 | end = atoi(str->s); // end 128 | if (end < beg) end = -1; 129 | } 130 | } 131 | } 132 | } 133 | if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line 134 | if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column 135 | if (beg >= 0 && end > beg) { 136 | if (p->n == p->m) { 137 | p->m = p->m? 
p->m<<1 : 4; 138 | p->a = realloc(p->a, p->m * 8); 139 | } 140 | p->a[p->n++] = (uint64_t)beg<<32 | end; 141 | } 142 | } 143 | ks_destroy(ks); 144 | gzclose(fp); 145 | free(str->s); free(str); 146 | bed_index(h); 147 | return h; 148 | } 149 | 150 | void bed_destroy(void *_h) 151 | { 152 | reghash_t *h = (reghash_t*)_h; 153 | khint_t k; 154 | for (k = 0; k < kh_end(h); ++k) { 155 | if (kh_exist(h, k)) { 156 | free(kh_val(h, k).a); 157 | free(kh_val(h, k).idx); 158 | free((char*)kh_key(h, k)); 159 | } 160 | } 161 | kh_destroy(reg, h); 162 | } 163 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/bgzf.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | */ 23 | 24 | #ifndef __BGZF_H 25 | #define __BGZF_H 26 | 27 | #include 28 | #include 29 | #include 30 | #ifdef _USE_KNETFILE 31 | #include "knetfile.h" 32 | #endif 33 | 34 | //typedef int8_t bool; 35 | 36 | typedef struct { 37 | int file_descriptor; 38 | char open_mode; // 'r' or 'w' 39 | int16_t owned_file, compress_level; 40 | #ifdef _USE_KNETFILE 41 | union { 42 | knetFile *fpr; 43 | FILE *fpw; 44 | } x; 45 | #else 46 | FILE* file; 47 | #endif 48 | int uncompressed_block_size; 49 | int compressed_block_size; 50 | void* uncompressed_block; 51 | void* compressed_block; 52 | int64_t block_address; 53 | int block_length; 54 | int block_offset; 55 | int cache_size; 56 | const char* error; 57 | void *cache; // a pointer to a hash table 58 | } BGZF; 59 | 60 | #ifdef __cplusplus 61 | extern "C" { 62 | #endif 63 | 64 | /* 65 | * Open an existing file descriptor for reading or writing. 66 | * Mode must be either "r" or "w". 67 | * A subsequent bgzf_close will not close the file descriptor. 68 | * Returns null on error. 69 | */ 70 | BGZF* bgzf_fdopen(int fd, const char* __restrict mode); 71 | 72 | /* 73 | * Open the specified file for reading or writing. 74 | * Mode must be either "r" or "w". 75 | * Returns null on error. 76 | */ 77 | BGZF* bgzf_open(const char* path, const char* __restrict mode); 78 | 79 | /* 80 | * Close the BGZ file and free all associated resources. 81 | * Does not close the underlying file descriptor if created with bgzf_fdopen. 82 | * Returns zero on success, -1 on error. 83 | */ 84 | int bgzf_close(BGZF* fp); 85 | 86 | /* 87 | * Read up to length bytes from the file storing into data. 88 | * Returns the number of bytes actually read. 89 | * Returns zero on end of file. 90 | * Returns -1 on error. 91 | */ 92 | int bgzf_read(BGZF* fp, void* data, int length); 93 | 94 | /* 95 | * Write length bytes from data to the file. 96 | * Returns the number of bytes written. 97 | * Returns -1 on error. 
98 | */ 99 | int bgzf_write(BGZF* fp, const void* data, int length); 100 | 101 | /* 102 | * Return a virtual file pointer to the current location in the file. 103 | * No interpretation of the value should be made, other than a subsequent 104 | * call to bgzf_seek can be used to position the file at the same point. 105 | * Return value is non-negative on success. 106 | * Returns -1 on error. 107 | */ 108 | #define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) 109 | 110 | /* 111 | * Set the file to read from the location specified by pos, which must 112 | * be a value previously returned by bgzf_tell for this file (but not 113 | * necessarily one returned by this file handle). 114 | * The where argument must be SEEK_SET. 115 | * Seeking on a file opened for write is not supported. 116 | * Returns zero on success, -1 on error. 117 | */ 118 | int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); 119 | 120 | /* 121 | * Set the cache size. Zero to disable. By default, caching is 122 | * disabled. The recommended cache size for frequent random access is 123 | * about 8M bytes. 
124 | */ 125 | void bgzf_set_cache_size(BGZF *fp, int cache_size); 126 | 127 | int bgzf_check_EOF(BGZF *fp); 128 | int bgzf_read_block(BGZF* fp); 129 | int bgzf_flush(BGZF* fp); 130 | int bgzf_flush_try(BGZF *fp, int size); 131 | int bgzf_check_bgzf(const char *fn); 132 | 133 | #ifdef __cplusplus 134 | } 135 | #endif 136 | 137 | static inline int bgzf_getc(BGZF *fp) 138 | { 139 | int c; 140 | if (fp->block_offset >= fp->block_length) { 141 | if (bgzf_read_block(fp) != 0) return -2; /* error */ 142 | if (fp->block_length == 0) return -1; /* end-of-file */ 143 | } 144 | c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; 145 | if (fp->block_offset == fp->block_length) { 146 | #ifdef _USE_KNETFILE 147 | fp->block_address = knet_tell(fp->x.fpr); 148 | #else 149 | fp->block_address = ftello(fp->file); 150 | #endif 151 | fp->block_offset = 0; 152 | fp->block_length = 0; 153 | } 154 | return c; 155 | } 156 | 157 | #endif 158 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/cut_target.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "bam.h" 5 | #include "errmod.h" 6 | #include "faidx.h" 7 | 8 | #define ERR_DEP 0.83f 9 | 10 | typedef struct { 11 | int e[2][3], p[2][2]; 12 | } score_param_t; 13 | 14 | /* Note that although the two matrics have 10 parameters in total, only 4 15 | * (probably 3) are free. Changing the scoring matrices in a sort of symmetric 16 | * way will not change the result. 
*/ 17 | static score_param_t g_param = { {{0,0,0},{-4,1,6}}, {{0,-14000}, {0,0}} }; 18 | 19 | typedef struct { 20 | int min_baseQ, tid, max_bases; 21 | uint16_t *bases; 22 | bamFile fp; 23 | bam_header_t *h; 24 | char *ref; 25 | faidx_t *fai; 26 | errmod_t *em; 27 | } ct_t; 28 | 29 | static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp) 30 | { 31 | int i, j, ret, tmp, k, sum[4], qual; 32 | float q[16]; 33 | if (n > g->max_bases) { // enlarge g->bases 34 | g->max_bases = n; 35 | kroundup32(g->max_bases); 36 | g->bases = realloc(g->bases, g->max_bases * 2); 37 | } 38 | for (i = k = 0; i < n; ++i) { 39 | const bam_pileup1_t *p = plp + i; 40 | uint8_t *seq; 41 | int q, baseQ, b; 42 | if (p->is_refskip || p->is_del) continue; 43 | baseQ = bam1_qual(p->b)[p->qpos]; 44 | if (baseQ < g->min_baseQ) continue; 45 | seq = bam1_seq(p->b); 46 | b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)]; 47 | if (b > 3) continue; 48 | q = baseQ < p->b->core.qual? baseQ : p->b->core.qual; 49 | if (q < 4) q = 4; 50 | if (q > 63) q = 63; 51 | g->bases[k++] = q<<5 | bam1_strand(p->b)<<4 | b; 52 | } 53 | if (k == 0) return 0; 54 | errmod_cal(g->em, k, 4, g->bases, q); 55 | for (i = 0; i < 4; ++i) sum[i] = (int)(q[i<<2|i] + .499) << 2 | i; 56 | for (i = 1; i < 4; ++i) // insertion sort 57 | for (j = i; j > 0 && sum[j] < sum[j-1]; --j) 58 | tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp; 59 | qual = (sum[1]>>2) - (sum[0]>>2); 60 | k = k < 256? k : 255; 61 | ret = (qual < 63? qual : 63) << 2 | (sum[0]&3); 62 | return ret<<8|k; 63 | } 64 | 65 | static void process_cns(bam_header_t *h, int tid, int l, uint16_t *cns) 66 | { 67 | int i, f[2][2], *prev, *curr, *swap_tmp, s; 68 | uint8_t *b; // backtrack array 69 | b = calloc(l, 1); 70 | f[0][0] = f[0][1] = 0; 71 | prev = f[0]; curr = f[1]; 72 | // fill the backtrack matrix 73 | for (i = 0; i < l; ++i) { 74 | int c = (cns[i] == 0)? 0 : (cns[i]>>8 == 0)? 
1 : 2; 75 | int tmp0, tmp1; 76 | // compute f[0] 77 | tmp0 = prev[0] + g_param.e[0][c] + g_param.p[0][0]; // (s[i+1],s[i])=(0,0) 78 | tmp1 = prev[1] + g_param.e[0][c] + g_param.p[1][0]; // (0,1) 79 | if (tmp0 > tmp1) curr[0] = tmp0, b[i] = 0; 80 | else curr[0] = tmp1, b[i] = 1; 81 | // compute f[1] 82 | tmp0 = prev[0] + g_param.e[1][c] + g_param.p[0][1]; // (s[i+1],s[i])=(1,0) 83 | tmp1 = prev[1] + g_param.e[1][c] + g_param.p[1][1]; // (1,1) 84 | if (tmp0 > tmp1) curr[1] = tmp0, b[i] |= 0<<1; 85 | else curr[1] = tmp1, b[i] |= 1<<1; 86 | // swap 87 | swap_tmp = prev; prev = curr; curr = swap_tmp; 88 | } 89 | // backtrack 90 | s = prev[0] > prev[1]? 0 : 1; 91 | for (i = l - 1; i > 0; --i) { 92 | b[i] |= s<<2; 93 | s = b[i]>>s&1; 94 | } 95 | // print 96 | for (i = 0, s = -1; i <= l; ++i) { 97 | if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) { 98 | if (s >= 0) { 99 | int j; 100 | printf("%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s); 101 | for (j = s; j < i; ++j) { 102 | int c = cns[j]>>8; 103 | if (c == 0) putchar('N'); 104 | else putchar("ACGT"[c&3]); 105 | } 106 | putchar('\t'); 107 | for (j = s; j < i; ++j) 108 | putchar(33 + (cns[j]>>8>>2)); 109 | putchar('\n'); 110 | } 111 | //if (s >= 0) printf("%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s); 112 | s = -1; 113 | } else if ((b[i]>>2&3) && s < 0) s = i; 114 | } 115 | free(b); 116 | } 117 | 118 | static int read_aln(void *data, bam1_t *b) 119 | { 120 | extern int bam_prob_realn_core(bam1_t *b, const char *ref, int flag); 121 | ct_t *g = (ct_t*)data; 122 | int ret, len; 123 | ret = bam_read1(g->fp, b); 124 | if (ret >= 0 && g->fai && b->core.tid >= 0 && (b->core.flag&4) == 0) { 125 | if (b->core.tid != g->tid) { // then load the sequence 126 | free(g->ref); 127 | g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len); 128 | g->tid = b->core.tid; 129 | } 130 | bam_prob_realn_core(b, g->ref, 1<<1|1); 131 | } 132 | return ret; 133 | } 134 | 135 | int 
main_cut_target(int argc, char *argv[]) 136 | { 137 | int c, tid, pos, n, lasttid = -1, lastpos = -1, l, max_l; 138 | const bam_pileup1_t *p; 139 | bam_plp_t plp; 140 | uint16_t *cns; 141 | ct_t g; 142 | 143 | memset(&g, 0, sizeof(ct_t)); 144 | g.min_baseQ = 13; g.tid = -1; 145 | while ((c = getopt(argc, argv, "f:Q:i:o:0:1:2:")) >= 0) { 146 | switch (c) { 147 | case 'Q': g.min_baseQ = atoi(optarg); break; // quality cutoff 148 | case 'i': g_param.p[0][1] = -atoi(optarg); break; // 0->1 transition (in) PENALTY 149 | case '0': g_param.e[1][0] = atoi(optarg); break; // emission SCORE 150 | case '1': g_param.e[1][1] = atoi(optarg); break; 151 | case '2': g_param.e[1][2] = atoi(optarg); break; 152 | case 'f': g.fai = fai_load(optarg); 153 | if (g.fai == 0) fprintf(stderr, "[%s] fail to load the fasta index.\n", __func__); 154 | break; 155 | } 156 | } 157 | if (argc == optind) { 158 | fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] [-f ref] \n"); 159 | return 1; 160 | } 161 | l = max_l = 0; cns = 0; 162 | g.fp = strcmp(argv[optind], "-")? 
bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); 163 | g.h = bam_header_read(g.fp); 164 | g.em = errmod_init(1 - ERR_DEP); 165 | plp = bam_plp_init(read_aln, &g); 166 | while ((p = bam_plp_auto(plp, &tid, &pos, &n)) != 0) { 167 | if (tid < 0) break; 168 | if (tid != lasttid) { // change of chromosome 169 | if (cns) process_cns(g.h, lasttid, l, cns); 170 | if (max_l < g.h->target_len[tid]) { 171 | max_l = g.h->target_len[tid]; 172 | kroundup32(max_l); 173 | cns = realloc(cns, max_l * 2); 174 | } 175 | l = g.h->target_len[tid]; 176 | memset(cns, 0, max_l * 2); 177 | lasttid = tid; 178 | } 179 | cns[pos] = gencns(&g, n, p); 180 | lastpos = pos; 181 | } 182 | process_cns(g.h, lasttid, l, cns); 183 | free(cns); 184 | bam_header_destroy(g.h); 185 | bam_plp_destroy(plp); 186 | bam_close(g.fp); 187 | if (g.fai) { 188 | fai_destroy(g.fai); free(g.ref); 189 | } 190 | errmod_destroy(g.em); 191 | free(g.bases); 192 | return 0; 193 | } 194 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/errmod.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "errmod.h" 3 | #include "ksort.h" 4 | KSORT_INIT_GENERIC(uint16_t) 5 | 6 | typedef struct __errmod_coef_t { 7 | double *fk, *beta, *lhet; 8 | } errmod_coef_t; 9 | 10 | typedef struct { 11 | double fsum[16], bsum[16]; 12 | uint32_t c[16]; 13 | } call_aux_t; 14 | 15 | static errmod_coef_t *cal_coef(double depcorr, double eta) 16 | { 17 | int k, n, q; 18 | long double sum, sum1; 19 | double *lC; 20 | errmod_coef_t *ec; 21 | 22 | ec = calloc(1, sizeof(errmod_coef_t)); 23 | // initialize ->fk 24 | ec->fk = (double*)calloc(256, sizeof(double)); 25 | ec->fk[0] = 1.0; 26 | for (n = 1; n != 256; ++n) 27 | ec->fk[n] = pow(1. 
- depcorr, n) * (1.0 - eta) + eta; 28 | // initialize ->coef 29 | ec->beta = (double*)calloc(256 * 256 * 64, sizeof(double)); 30 | lC = (double*)calloc(256 * 256, sizeof(double)); 31 | for (n = 1; n != 256; ++n) { 32 | double lgn = lgamma(n+1); 33 | for (k = 1; k <= n; ++k) 34 | lC[n<<8|k] = lgn - lgamma(k+1) - lgamma(n-k+1); 35 | } 36 | for (q = 1; q != 64; ++q) { 37 | double e = pow(10.0, -q/10.0); 38 | double le = log(e); 39 | double le1 = log(1.0 - e); 40 | for (n = 1; n <= 255; ++n) { 41 | double *beta = ec->beta + (q<<16|n<<8); 42 | sum1 = sum = 0.0; 43 | for (k = n; k >= 0; --k, sum1 = sum) { 44 | sum = sum1 + expl(lC[n<<8|k] + k*le + (n-k)*le1); 45 | beta[k] = -10. / M_LN10 * logl(sum1 / sum); 46 | } 47 | } 48 | } 49 | // initialize ->lhet 50 | ec->lhet = (double*)calloc(256 * 256, sizeof(double)); 51 | for (n = 0; n < 256; ++n) 52 | for (k = 0; k < 256; ++k) 53 | ec->lhet[n<<8|k] = lC[n<<8|k] - M_LN2 * n; 54 | free(lC); 55 | return ec; 56 | } 57 | 58 | errmod_t *errmod_init(float depcorr) 59 | { 60 | errmod_t *em; 61 | em = (errmod_t*)calloc(1, sizeof(errmod_t)); 62 | em->depcorr = depcorr; 63 | em->coef = cal_coef(depcorr, 0.03); 64 | return em; 65 | } 66 | 67 | void errmod_destroy(errmod_t *em) 68 | { 69 | if (em == 0) return; 70 | free(em->coef->lhet); free(em->coef->fk); free(em->coef->beta); 71 | free(em->coef); free(em); 72 | } 73 | // bases[i] packs qual:6, strand:1, base:4; q receives m*m phred-scaled likelihoods; returns -1 when m is outside [0,16], otherwise 0 74 | int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q) 75 | { 76 | call_aux_t aux; 77 | int i, j, k, w[32]; 78 | 79 | if (m < 0 || m > 16) return -1; // call_aux_t arrays have 16 slots and are indexed by 0..m-1 below 80 | memset(q, 0, m * m * sizeof(float)); 81 | if (n == 0) return 0; 82 | // calculate aux.bsum and aux.fsum 83 | if (n > 255) { // then sample 255 bases 84 | ks_shuffle(uint16_t, n, bases); 85 | n = 255; 86 | } 87 | ks_introsort(uint16_t, n, bases); 88 | memset(w, 0, 32 * sizeof(int)); 89 | memset(&aux, 0, sizeof(call_aux_t)); 90 | for (j = n - 1; j >= 0; --j) { // calculate bsum and fsum 91 | uint16_t b = bases[j]; 92 | int q = 
b>>5 < 4? 4 : b>>5; 93 | if (q > 63) q = 63; 94 | k = b&0x1f; 95 | aux.fsum[k&0xf] += em->coef->fk[w[k]]; 96 | aux.bsum[k&0xf] += em->coef->fk[w[k]] * em->coef->beta[q<<16|n<<8|aux.c[k&0xf]]; 97 | ++aux.c[k&0xf]; 98 | ++w[k]; 99 | } 100 | // generate likelihood 101 | for (j = 0; j != m; ++j) { 102 | float tmp1, tmp3; 103 | int tmp2, bar_e; 104 | // homozygous 105 | for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != m; ++k) { 106 | if (k == j) continue; 107 | tmp1 += aux.bsum[k]; tmp2 += aux.c[k]; tmp3 += aux.fsum[k]; 108 | } 109 | if (tmp2) { 110 | bar_e = (int)(tmp1 / tmp3 + 0.499); 111 | if (bar_e > 63) bar_e = 63; 112 | q[j*m+j] = tmp1; 113 | } 114 | // heterozygous 115 | for (k = j + 1; k < m; ++k) { 116 | int cjk = aux.c[j] + aux.c[k]; 117 | for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i < m; ++i) { 118 | if (i == j || i == k) continue; 119 | tmp1 += aux.bsum[i]; tmp2 += aux.c[i]; tmp3 += aux.fsum[i]; 120 | } 121 | if (tmp2) { 122 | bar_e = (int)(tmp1 / tmp3 + 0.499); 123 | if (bar_e > 63) bar_e = 63; 124 | q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1; 125 | } else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k 126 | } 127 | for (k = 0; k != m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0; 128 | } 129 | return 0; 130 | } 131 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/errmod.h: -------------------------------------------------------------------------------- 1 | #ifndef ERRMOD_H 2 | #define ERRMOD_H 3 | 4 | #include 5 | 6 | struct __errmod_coef_t; 7 | 8 | typedef struct { 9 | double depcorr; 10 | struct __errmod_coef_t *coef; 11 | } errmod_t; 12 | 13 | errmod_t *errmod_init(float depcorr); 14 | void errmod_destroy(errmod_t *em); 15 | 16 | /* 17 | n: number of bases 18 | m: maximum base 19 | bases[i]: qual:6, strand:1, base:4 20 | q[i*m+j]: phred-scaled likelihood of (i,j) 21 | */ 22 | int errmod_cal(const errmod_t *em, int n, int m, 
uint16_t *bases, float *q); 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/examples/00README.txt: -------------------------------------------------------------------------------- 1 | File ex1.fa contains two sequences cut from the human genome 2 | build36. They were extracted with the command: 3 | 4 | samtools faidx human_b36.fa 2:2043966-2045540 20:67967-69550 5 | 6 | Sequence names were changed manually for simplicity. File ex1.sam.gz 7 | contains MAQ alignments extracted with: 8 | 9 | (samtools view NA18507_maq.bam 2:2044001-2045500; 10 | samtools view NA18507_maq.bam 20:68001-69500) 11 | 12 | and processed with `samtools fixmate' to make it self-consistent as a 13 | standalone alignment. 14 | 15 | To try samtools, you may run the following commands: 16 | 17 | samtools faidx ex1.fa # index the reference FASTA 18 | samtools import ex1.fa.fai ex1.sam.gz ex1.bam # SAM->BAM 19 | samtools index ex1.bam # index BAM 20 | samtools tview ex1.bam ex1.fa # view alignment 21 | samtools pileup -cf ex1.fa ex1.bam # pileup and consensus 22 | samtools pileup -cf ex1.fa -t ex1.fa.fai ex1.sam.gz 23 | 24 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/examples/Makefile: -------------------------------------------------------------------------------- 1 | all:../libbam.a ../samtools ../bcftools/bcftools \ 2 | ex1.glf ex1.pileup.gz ex1.bam.bai ex1f-rmduppe.bam ex1f-rmdupse.bam ex1.glfview.gz ex1.bcf calDepth 3 | @echo; echo \# You can now launch the viewer with: \'samtools tview ex1.bam ex1.fa\'; echo; 4 | 5 | ex1.fa.fai:ex1.fa 6 | ../samtools faidx ex1.fa 7 | ex1.bam:ex1.sam.gz ex1.fa.fai 8 | ../samtools import ex1.fa.fai ex1.sam.gz ex1.bam 9 | ex1.bam.bai:ex1.bam 10 | ../samtools index ex1.bam 11 | ex1.pileup.gz:ex1.bam ex1.fa 12 | ../samtools pileup -cf ex1.fa ex1.bam | gzip > ex1.pileup.gz 13 | ex1.glf:ex1.bam ex1.fa 14 | ../samtools pileup 
-gf ex1.fa ex1.bam > ex1.glf 15 | ex1.glfview.gz:ex1.glf 16 | ../samtools glfview ex1.glf | gzip > ex1.glfview.gz 17 | ex1a.bam:ex1.bam 18 | ../samtools view -h ex1.bam | awk 'BEGIN{FS=OFS="\t"}{if(/^@/)print;else{$$1=$$1"a";print}}' | ../samtools view -bS - > $@ 19 | ex1b.bam:ex1.bam 20 | ../samtools view -h ex1.bam | awk 'BEGIN{FS=OFS="\t"}{if(/^@/)print;else{$$1=$$1"b";print}}' | ../samtools view -bS - > $@ 21 | ex1f.rg: 22 | (echo "@RG ID:ex1 LB:ex1 SM:ex1"; echo "@RG ID:ex1a LB:ex1 SM:ex1"; echo "@RG ID:ex1b LB:ex1b SM:ex1b") > $@ 23 | ex1f.bam:ex1.bam ex1a.bam ex1b.bam ex1f.rg 24 | ../samtools merge -rh ex1f.rg $@ ex1.bam ex1a.bam ex1b.bam 25 | ex1f-rmduppe.bam:ex1f.bam 26 | ../samtools rmdup ex1f.bam $@ 27 | ex1f-rmdupse.bam:ex1f.bam 28 | ../samtools rmdup -S ex1f.bam $@ 29 | 30 | ex1.bcf:ex1.bam ex1.fa.fai 31 | ../samtools mpileup -gf ex1.fa ex1.bam > $@ 32 | 33 | ../bcftools/bcftools: 34 | (cd ../bcftools; make bcftools) 35 | 36 | ../samtools: 37 | (cd ..; make samtools) 38 | 39 | ../libbam.a: 40 | (cd ..; make libbam.a) 41 | 42 | calDepth:../libbam.a calDepth.c 43 | gcc -g -Wall -O2 -I.. calDepth.c -o $@ -L.. 
-lbam -lm -lz 44 | 45 | clean: 46 | rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM ex1*.rg ex1.bcf 47 | 48 | # ../samtools pileup ex1.bam|perl -ape '$_=$F[4];s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Z//,tr/a-z//);$_=join("\t",@F[0,1],@_)."\n"' 49 | 50 | # ../samtools pileup -cf ex1.fa ex1.bam|perl -ape '$_=$F[8];s/\^.//g;s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Za-z//,tr/,.//);$_=join("\t",@F[0,1],@_)."\n"' 51 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/examples/bam2bed.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "sam.h" 3 | static int fetch_func(const bam1_t *b, void *data) 4 | { 5 | samfile_t *fp = (samfile_t*)data; 6 | uint32_t *cigar = bam1_cigar(b); 7 | const bam1_core_t *c = &b->core; 8 | int i, l; 9 | if (b->core.tid < 0) return 0; 10 | for (i = l = 0; i < c->n_cigar; ++i) { 11 | int op = cigar[i]&0xf; 12 | if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) 13 | l += cigar[i]>>4; 14 | } 15 | printf("%s\t%d\t%d\t%s\t%d\t%c\n", fp->header->target_name[c->tid], 16 | c->pos, c->pos + l, bam1_qname(b), c->qual, (c->flag&BAM_FREVERSE)? 
'-' : '+'); 17 | return 0; 18 | } 19 | int main(int argc, char *argv[]) 20 | { 21 | samfile_t *fp; 22 | if (argc == 1) { 23 | fprintf(stderr, "Usage: bam2bed [region]\n"); 24 | return 1; 25 | } 26 | if ((fp = samopen(argv[1], "rb", 0)) == 0) { 27 | fprintf(stderr, "bam2bed: Fail to open BAM file %s\n", argv[1]); 28 | return 1; 29 | } 30 | if (argc == 2) { /* if a region is not specified */ 31 | bam1_t *b = bam_init1(); 32 | while (samread(fp, b) >= 0) fetch_func(b, fp); 33 | bam_destroy1(b); 34 | } else { 35 | int ref, beg, end; 36 | bam_index_t *idx; 37 | if ((idx = bam_index_load(argv[1])) == 0) { 38 | fprintf(stderr, "bam2bed: BAM indexing file is not available.\n"); 39 | return 1; 40 | } 41 | bam_parse_region(fp->header, argv[2], &ref, &beg, &end); 42 | if (ref < 0) { 43 | fprintf(stderr, "bam2bed: Invalid region %s\n", argv[2]); 44 | return 1; 45 | } 46 | bam_fetch(fp->x.bam, idx, ref, beg, end, fp, fetch_func); 47 | bam_index_destroy(idx); 48 | } 49 | samclose(fp); 50 | return 0; 51 | } 52 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/examples/calDepth.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "sam.h" 3 | 4 | typedef struct { 5 | int beg, end; 6 | samfile_t *in; 7 | } tmpstruct_t; 8 | 9 | // callback for bam_fetch() 10 | static int fetch_func(const bam1_t *b, void *data) 11 | { 12 | bam_plbuf_t *buf = (bam_plbuf_t*)data; 13 | bam_plbuf_push(b, buf); 14 | return 0; 15 | } 16 | // callback for bam_plbuf_init() 17 | static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) 18 | { 19 | tmpstruct_t *tmp = (tmpstruct_t*)data; 20 | if ((int)pos >= tmp->beg && (int)pos < tmp->end) 21 | printf("%s\t%d\t%d\n", tmp->in->header->target_name[tid], pos + 1, n); 22 | return 0; 23 | } 24 | 25 | int main(int argc, char *argv[]) 26 | { 27 | tmpstruct_t tmp; 28 | if (argc == 1) { 29 | fprintf(stderr, "Usage: 
calDepth [region]\n"); 30 | return 1; 31 | } 32 | tmp.beg = 0; tmp.end = 0x7fffffff; 33 | tmp.in = samopen(argv[1], "rb", 0); 34 | if (tmp.in == 0) { 35 | fprintf(stderr, "Fail to open BAM file %s\n", argv[1]); 36 | return 1; 37 | } 38 | if (argc == 2) { // if a region is not specified 39 | sampileup(tmp.in, -1, pileup_func, &tmp); 40 | } else { 41 | int ref; 42 | bam_index_t *idx; 43 | bam_plbuf_t *buf; 44 | idx = bam_index_load(argv[1]); // load BAM index 45 | if (idx == 0) { 46 | fprintf(stderr, "BAM indexing file is not available.\n"); 47 | return 1; 48 | } 49 | bam_parse_region(tmp.in->header, argv[2], &ref, &tmp.beg, &tmp.end); // parse the region 50 | if (ref < 0) { 51 | fprintf(stderr, "Invalid region %s\n", argv[2]); 52 | return 1; 53 | } 54 | buf = bam_plbuf_init(pileup_func, &tmp); // initialize pileup 55 | bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, fetch_func); 56 | bam_plbuf_push(0, buf); // finalize pileup 57 | bam_index_destroy(idx); 58 | bam_plbuf_destroy(buf); 59 | } 60 | samclose(tmp.in); 61 | return 0; 62 | } 63 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/examples/ex1.fa: -------------------------------------------------------------------------------- 1 | >seq1 2 | CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCCATGGCCCAGCATTAGGGAGCT 3 | GTGGACCCTGCAGCCTGGCTGTGGGGGCCGCAGTGGCTGAGGGGTGCAGAGCCGAGTCAC 4 | GGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAG 5 | TCCCATTTGGAGCCCCTCTAAGCCGTTCTATTTGTAATGAAAACTATATTTATGCTATTC 6 | AGTTCTAAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAA 7 | CAACCTTGAGAACCCCAGGGAATTTGTCAATGTCAGGGAAGGAGCATTTTGTCAGTTACC 8 | AAATGTGTTTATTACCAGAGGGATGGAGGGAAGAGGGACGCTGAAGAACTTTGATGCCCT 9 | CTTCTTCCAAAGATGAAACGCGTAACTGCGCTCTCATTCACTCCAGCTCCCTGTCACCCA 10 | ATGGACCTGTGATATCTGGATTCTGGGAAATTCTTCATCCTGGACCCTGAGAGATTCTGC 11 | AGCCCAGCTCCAGATTGCTTGTGGTCTGACAGGCTGCAACTGTGAGCCATCACAATGAAC 12 | AACAGGAAGAAAAGGTCTTTCAAAAGGTGATGTGTGTTCTCATCAACCTCATACACACAC 13 
| ATGGTTTAGGGGTATAATACCTCTACATGGCTGATTATGAAAACAATGTTCCCCAGATAC 14 | CATCCCTGTCTTACTTCCAGCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCT 15 | TTTGGCATTTGCCTTCAGACCCTACACGAATGCGTCTCTACCACAGGGGGCTGCGCGGTT 16 | TCCCATCATGAAGCACTGAACTTCCACGTCTCATCTAGGGGAACAGGGAGGTGCACTAAT 17 | GCGCTCCACGCCCAAGCCCTTCTCACAGTTTCTGCCCCCAGCATGGTTGTACTGGGCAAT 18 | ACATGAGATTATTAGGAAATGCTTTACTGTCATAACTATGAAGAGACTATTGCCAGATGA 19 | ACCACACATTAATACTATGTTTCTTATCTGCACATTACTACCCTGCAATTAATATAATTG 20 | TGTCCATGTACACACGCTGTCCTATGTACTTATCATGACTCTATCCCAAATTCCCAATTA 21 | CGTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGTATCAATTTGGTGTTCTGTGTAAAG 22 | TCTCAGGGAGCCGTCCGTGTCCTCCCATCTGGCCTCGTCCACACTGGTTCTCTTGAAAGC 23 | TTGGGCTGTAATGATGCCCCTTGGCCATCACCCAGTCCCTGCCCCATCTCTTGTAATCTC 24 | TCTCCTTTTTGCTGCATCCCTGTCTTCCTCTGTCTTGATTTACTTGTTGTTGGTTTTCTG 25 | TTTCTTTGTTTGATTTGGTGGAAGACATAATCCCACGCTTCCTATGGAAAGGTTGTTGGG 26 | AGATTTTTAATGATTCCTCAATGTTAAAATGTCTATTTTTGTCTTGACACCCAACTAATA 27 | TTTGTCTGAGCAAAACAGTCTAGATGAGAGAGAACTTCCCTGGAGGTCTGATGGCGTTTC 28 | TCCCTCGTCTTCTTA 29 | >seq2 30 | TTCAAATGAACTTCTGTAATTGAAAAATTCATTTAAGAAATTACAAAATATAGTTGAAAG 31 | CTCTAACAATAGACTAAACCAAGCAGAAGAAAGAGGTTCAGAACTTGAAGACAAGTCTCT 32 | TATGAATTAACCCAGTCAGACAAAAATAAAGAAAAAAATTTTAAAAATGAACAGAGCTTT 33 | CAAGAAGTATGAGATTATGTAAAGTAACTGAACCTATGAGTCACAGGTATTCCTGAGGAA 34 | AAAGAAAAAGTGAGAAGTTTGGAAAAACTATTTGAGGAAGTAATTGGGGAAAACCTCTTT 35 | AGTCTTGCTAGAGATTTAGACATCTAAATGAAAGAGGCTCAAAGAATGCCAGGAAGATAC 36 | ATTGCAAGACAGACTTCATCAAGATATGTAGTCATCAGACTATCTAAAGTCAACATGAAG 37 | GAAAAAAATTCTAAAATCAGCAAGAGAAAAGCATACAGTCATCTATAAAGGAAATCCCAT 38 | CAGAATAACAATGGGCTTCTCAGCAGAAACCTTACAAGCCAGAAGAGATTGGATCTAATT 39 | TTTGGACTTCTTAAAGAAAAAAAAACCTGTCAAACACGAATGTTATGCCCTGCTAAACTA 40 | AGCATCATAAATGAAGGGGAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATA 41 | ATTCATCATCACTAAACCAGTCCTATAAGAAATGCTCAAAAGAATTGTAAAAGTCAAAAT 42 | TAAAGTTCAATACTCACCATCATAAATACACACAAAAGTACAAAACTCACAGGTTTTATA 43 | AAACAATTGAGACTACAGAGCAACTAGGTAAAAAATTAACATTACAACAGGAACAAAACC 44 | 
TCATATATCAATATTAACTTTGAATAAAAAGGGATTAAATTCCCCCACTTAAGAGATATA 45 | GATTGGCAGAACAGATTTAAAAACATGAACTAACTATATGCTGTTTACAAGAAACTCATT 46 | AATAAAGACATGAGTTCAGGTAAAGGGGTGGAAAAAGATGTTCTACGCAAACAGAAACCA 47 | AATGAGAGAAGGAGTAGCTATACTTATATCAGATAAAGCACACTTTAAATCAACAACAGT 48 | AAAATAAAACAAAGGAGGTCATCATACAATGATAAAAAGATCAATTCAGCAAGAAGATAT 49 | AACCATCCTACTAAATACATATGCACCTAACACAAGACTACCCAGATTCATAAAACAAAT 50 | ACTACTAGACCTAAGAGGGATGAGAAATTACCTAATTGGTACAATGTACAATATTCTGAT 51 | GATGGTTACACTAAAAGCCCATACTTTACTGCTACTCAATATATCCATGTAACAAATCTG 52 | CGCTTGTACTTCTAAATCTATAAAAAAATTAAAATTTAACAAAAGTAAATAAAACACATA 53 | GCTAAAACTAAAAAAGCAAAAACAAAAACTATGCTAAGTATTGGTAAAGATGTGGGGAAA 54 | AAAGTAAACTCTCAAATATTGCTAGTGGGAGTATAAATTGTTTTCCACTTTGGAAAACAA 55 | TTTGGTAATTTCGTTTTTTTTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTTGCATGC 56 | CAGAAAAAAATATTTACAGTAACT 57 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/examples/ex1.sam.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vibansal/PCRduplicates/f4462916f8b73273f3dfc1ebc8eb145829003121/parsebam/samtools-0.1.18/examples/ex1.sam.gz -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/examples/toy.fa: -------------------------------------------------------------------------------- 1 | >ref 2 | AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT 3 | >ref2 4 | aggttttataaaacaattaagtctacagagcaactacgcg 5 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/examples/toy.sam: -------------------------------------------------------------------------------- 1 | @SQ SN:ref LN:45 2 | @SQ SN:ref2 LN:40 3 | r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 4 | r002 0 ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * 5 | r003 0 ref 9 30 5H6M * 0 0 AGCTAA * 6 | r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * 7 | r003 16 
ref 29 30 6H5M * 0 0 TAGGC * 8 | r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT * 9 | x1 0 ref2 1 30 20M * 0 0 aggttttataaaacaaataa ???????????????????? 10 | x2 0 ref2 2 30 21M * 0 0 ggttttataaaacaaataatt ????????????????????? 11 | x3 0 ref2 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ?????????????????????????? 12 | x4 0 ref2 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ????????????????????????? 13 | x5 0 ref2 12 30 24M * 0 0 aaTaattaagtctacagagcaact ???????????????????????? 14 | x6 0 ref2 14 30 23M * 0 0 Taattaagtctacagagcaacta ??????????????????????? 15 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/faidx.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008 Genome Research Ltd (GRL). 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | 26 | /* Contact: Heng Li */ 27 | 28 | #ifndef FAIDX_H 29 | #define FAIDX_H 30 | 31 | /*! 32 | @header 33 | 34 | Index FASTA files and extract subsequences. 35 | 36 | @copyright The Wellcome Trust Sanger Institute. 37 | */ 38 | 39 | struct __faidx_t; 40 | typedef struct __faidx_t faidx_t; 41 | 42 | #ifdef __cplusplus 43 | extern "C" { 44 | #endif 45 | 46 | /*! 47 | @abstract Build index for a FASTA or razip compressed FASTA file. 48 | @param fn FASTA file name 49 | @return 0 on success; or -1 on failure 50 | @discussion File "fn.fai" will be generated. 51 | */ 52 | int fai_build(const char *fn); 53 | 54 | /*! 55 | @abstract Destroy a faidx_t struct. 56 | @param fai Pointer to the struct to be destroyed 57 | */ 58 | void fai_destroy(faidx_t *fai); 59 | 60 | /*! 61 | @abstract Load index from "fn.fai". 62 | @param fn File name of the FASTA file 63 | */ 64 | faidx_t *fai_load(const char *fn); 65 | 66 | /*! 67 | @abstract Fetch the sequence in a region. 68 | @param fai Pointer to the faidx_t struct 69 | @param reg Region in the format "chr2:20,000-30,000" 70 | @param len Length of the region 71 | @return Pointer to the sequence; null on failure 72 | 73 | @discussion The returned sequence is allocated by malloc family 74 | and should be destroyed by end users by calling free() on it. 75 | */ 76 | char *fai_fetch(const faidx_t *fai, const char *reg, int *len); 77 | 78 | /*! 79 | @abstract Fetch the number of sequences. 80 | @param fai Pointer to the faidx_t struct 81 | @return The number of sequences 82 | */ 83 | int faidx_fetch_nseq(const faidx_t *fai); 84 | 85 | /*! 86 | @abstract Fetch the sequence in a region. 
87 | @param fai Pointer to the faidx_t struct 88 | @param c_name Region name 89 | @param p_beg_i Beginning position number (zero-based) 90 | @param p_end_i End position number (zero-based) 91 | @param len Length of the region 92 | @return Pointer to the sequence; null on failure 93 | 94 | @discussion The returned sequence is allocated by malloc family 95 | and should be destroyed by end users by calling free() on it. 96 | */ 97 | char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len); 98 | 99 | #ifdef __cplusplus 100 | } 101 | #endif 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/kaln.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2003-2006, 2008, 2009 by Heng Li 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | 26 | #ifndef LH3_KALN_H_ 27 | #define LH3_KALN_H_ 28 | 29 | #include 30 | 31 | #define MINOR_INF -1073741823 32 | 33 | typedef struct { 34 | int gap_open; 35 | int gap_ext; 36 | int gap_end_open; 37 | int gap_end_ext; 38 | 39 | int *matrix; 40 | int row; 41 | int band_width; 42 | } ka_param_t; 43 | 44 | typedef struct { 45 | int iio, iie, ido, ide; 46 | int eio, eie, edo, ede; 47 | int *matrix; 48 | int row; 49 | int band_width; 50 | } ka_param2_t; 51 | 52 | #ifdef __cplusplus 53 | extern "C" { 54 | #endif 55 | 56 | uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, 57 | int *_score, int *n_cigar); 58 | int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap); 59 | #ifdef __cplusplus 60 | } 61 | #endif 62 | 63 | extern ka_param_t ka_param_blast; /* = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; */ 64 | extern ka_param_t ka_param_qual; // only use this for global alignment!!! 65 | extern ka_param2_t ka_param2_qual; // only use this for global alignment!!! 
66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/klist.h: -------------------------------------------------------------------------------- 1 | #ifndef _LH3_KLIST_H 2 | #define _LH3_KLIST_H 3 | 4 | #include 5 | 6 | #define KMEMPOOL_INIT(name, kmptype_t, kmpfree_f) \ 7 | typedef struct { \ 8 | size_t cnt, n, max; \ 9 | kmptype_t **buf; \ 10 | } kmp_##name##_t; \ 11 | static inline kmp_##name##_t *kmp_init_##name() { \ 12 | return calloc(1, sizeof(kmp_##name##_t)); \ 13 | } \ 14 | static inline void kmp_destroy_##name(kmp_##name##_t *mp) { \ 15 | size_t k; \ 16 | for (k = 0; k < mp->n; ++k) { \ 17 | kmpfree_f(mp->buf[k]); free(mp->buf[k]); \ 18 | } \ 19 | free(mp->buf); free(mp); \ 20 | } \ 21 | static inline kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \ 22 | ++mp->cnt; \ 23 | if (mp->n == 0) return calloc(1, sizeof(kmptype_t)); \ 24 | return mp->buf[--mp->n]; \ 25 | } \ 26 | static inline void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \ 27 | --mp->cnt; \ 28 | if (mp->n == mp->max) { \ 29 | mp->max = mp->max? 
mp->max<<1 : 16; \ 30 | mp->buf = realloc(mp->buf, sizeof(void*) * mp->max); \ 31 | } \ 32 | mp->buf[mp->n++] = p; \ 33 | } 34 | 35 | #define kmempool_t(name) kmp_##name##_t 36 | #define kmp_init(name) kmp_init_##name() 37 | #define kmp_destroy(name, mp) kmp_destroy_##name(mp) 38 | #define kmp_alloc(name, mp) kmp_alloc_##name(mp) 39 | #define kmp_free(name, mp, p) kmp_free_##name(mp, p) 40 | 41 | #define KLIST_INIT(name, kltype_t, kmpfree_t) \ 42 | struct __kl1_##name { \ 43 | kltype_t data; \ 44 | struct __kl1_##name *next; \ 45 | }; \ 46 | typedef struct __kl1_##name kl1_##name; \ 47 | KMEMPOOL_INIT(name, kl1_##name, kmpfree_t) \ 48 | typedef struct { \ 49 | kl1_##name *head, *tail; \ 50 | kmp_##name##_t *mp; \ 51 | size_t size; \ 52 | } kl_##name##_t; \ 53 | static inline kl_##name##_t *kl_init_##name() { \ 54 | kl_##name##_t *kl = calloc(1, sizeof(kl_##name##_t)); \ 55 | kl->mp = kmp_init(name); \ 56 | kl->head = kl->tail = kmp_alloc(name, kl->mp); \ 57 | kl->head->next = 0; \ 58 | return kl; \ 59 | } \ 60 | static inline void kl_destroy_##name(kl_##name##_t *kl) { \ 61 | kl1_##name *p; \ 62 | for (p = kl->head; p != kl->tail; p = p->next) \ 63 | kmp_free(name, kl->mp, p); \ 64 | kmp_free(name, kl->mp, p); \ 65 | kmp_destroy(name, kl->mp); \ 66 | free(kl); \ 67 | } \ 68 | static inline kltype_t *kl_pushp_##name(kl_##name##_t *kl) { \ 69 | kl1_##name *q, *p = kmp_alloc(name, kl->mp); \ 70 | q = kl->tail; p->next = 0; kl->tail->next = p; kl->tail = p; \ 71 | ++kl->size; \ 72 | return &q->data; \ 73 | } \ 74 | static inline int kl_shift_##name(kl_##name##_t *kl, kltype_t *d) { \ 75 | kl1_##name *p; \ 76 | if (kl->head->next == 0) return -1; \ 77 | --kl->size; \ 78 | p = kl->head; kl->head = kl->head->next; \ 79 | if (d) *d = p->data; \ 80 | kmp_free(name, kl->mp, p); \ 81 | return 0; \ 82 | } 83 | 84 | #define kliter_t(name) kl1_##name 85 | #define klist_t(name) kl_##name##_t 86 | #define kl_val(iter) ((iter)->data) 87 | #define kl_next(iter) ((iter)->next) 88 | 
#define kl_begin(kl) ((kl)->head) 89 | #define kl_end(kl) ((kl)->tail) 90 | 91 | #define kl_init(name) kl_init_##name() 92 | #define kl_destroy(name, kl) kl_destroy_##name(kl) 93 | #define kl_pushp(name, kl) kl_pushp_##name(kl) 94 | #define kl_shift(name, kl, d) kl_shift_##name(kl, d) 95 | 96 | #endif 97 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/knetfile.h: -------------------------------------------------------------------------------- 1 | #ifndef KNETFILE_H 2 | #define KNETFILE_H 3 | 4 | #include 5 | #include 6 | 7 | #ifndef _WIN32 8 | #define netread(fd, ptr, len) read(fd, ptr, len) 9 | #define netwrite(fd, ptr, len) write(fd, ptr, len) 10 | #define netclose(fd) close(fd) 11 | #else 12 | #include 13 | #define netread(fd, ptr, len) recv(fd, ptr, len, 0) 14 | #define netwrite(fd, ptr, len) send(fd, ptr, len, 0) 15 | #define netclose(fd) closesocket(fd) 16 | #endif 17 | 18 | // FIXME: currently I/O is unbuffered 19 | 20 | #define KNF_TYPE_LOCAL 1 21 | #define KNF_TYPE_FTP 2 22 | #define KNF_TYPE_HTTP 3 23 | 24 | typedef struct knetFile_s { 25 | int type, fd; 26 | int64_t offset; 27 | char *host, *port; 28 | 29 | // the following are for FTP only 30 | int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; 31 | char *response, *retr, *size_cmd; 32 | int64_t seek_offset; // for lazy seek 33 | int64_t file_size; 34 | 35 | // the following are for HTTP only 36 | char *path, *http_host; 37 | } knetFile; 38 | 39 | #define knet_tell(fp) ((fp)->offset) 40 | #define knet_fileno(fp) ((fp)->fd) 41 | 42 | #ifdef __cplusplus 43 | extern "C" { 44 | #endif 45 | 46 | #ifdef _WIN32 47 | int knet_win32_init(); 48 | void knet_win32_destroy(); 49 | #endif 50 | 51 | knetFile *knet_open(const char *fn, const char *mode); 52 | 53 | /* 54 | This only works with local files. 
55 | */ 56 | knetFile *knet_dopen(int fd, const char *mode); 57 | 58 | /* 59 | If ->is_ready==0, this routine updates ->fd; otherwise, it simply 60 | reads from ->fd. 61 | */ 62 | off_t knet_read(knetFile *fp, void *buf, off_t len); 63 | 64 | /* 65 | This routine only sets ->offset and ->is_ready=0. It does not 66 | communicate with the FTP server. 67 | */ 68 | off_t knet_seek(knetFile *fp, int64_t off, int whence); 69 | int knet_close(knetFile *fp); 70 | 71 | #ifdef __cplusplus 72 | } 73 | #endif 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/kprobaln.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2003-2006, 2008, 2009 by Heng Li 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | 26 | #ifndef LH3_KPROBALN_H_ 27 | #define LH3_KPROBALN_H_ 28 | 29 | #include 30 | 31 | typedef struct { 32 | float d, e; 33 | int bw; 34 | } kpa_par_t; 35 | 36 | #ifdef __cplusplus 37 | extern "C" { 38 | #endif 39 | 40 | int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual, 41 | const kpa_par_t *c, int *state, uint8_t *q); 42 | 43 | #ifdef __cplusplus 44 | } 45 | #endif 46 | 47 | extern kpa_par_t kpa_par_def, kpa_par_alt; 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/kstring.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "kstring.h" 7 | 8 | int ksprintf(kstring_t *s, const char *fmt, ...) 9 | { 10 | va_list ap; 11 | int l; 12 | va_start(ap, fmt); 13 | l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'. 
14 | va_end(ap); 15 | if (l + 1 > s->m - s->l) { 16 | s->m = s->l + l + 2; 17 | kroundup32(s->m); 18 | s->s = (char*)realloc(s->s, s->m); 19 | va_start(ap, fmt); 20 | l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); 21 | } 22 | va_end(ap); 23 | s->l += l; 24 | return l; 25 | } 26 | 27 | char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux) 28 | { 29 | const char *p, *start; 30 | if (sep) { // set up the table 31 | if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished 32 | aux->finished = 0; 33 | if (sep[1]) { 34 | aux->sep = -1; 35 | aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; 36 | for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f); 37 | } else aux->sep = sep[0]; 38 | } 39 | if (aux->finished) return 0; 40 | else if (str) aux->p = str - 1, aux->finished = 0; 41 | if (aux->sep < 0) { 42 | for (p = start = aux->p + 1; *p; ++p) 43 | if (aux->tab[*p>>6]>>(*p&0x3f)&1) break; 44 | } else { 45 | for (p = start = aux->p + 1; *p; ++p) 46 | if (*p == aux->sep) break; 47 | } 48 | aux->p = p; // end of token 49 | if (*p == 0) aux->finished = 1; // no more tokens 50 | return (char*)start; 51 | } 52 | 53 | // s MUST BE a null terminated string; l = strlen(s) 54 | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) 55 | { 56 | int i, n, max, last_char, last_start, *offsets, l; 57 | n = 0; max = *_max; offsets = *_offsets; 58 | l = strlen(s); 59 | 60 | #define __ksplit_aux do { \ 61 | if (_offsets) { \ 62 | s[i] = 0; \ 63 | if (n == max) { \ 64 | max = max? 
/**********************
 * Boyer-Moore search *
 **********************/

typedef unsigned char ubyte_t;

// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
/*
 * Build the Boyer-Moore shift tables for the pattern pat[0..m-1].
 * m must be >= 1.
 *
 * Returns a single heap block of m + 256 ints: the first m entries are
 * the good-suffix shifts (bmGs), the following 256 entries are the
 * bad-character shifts (bmBc). The caller owns the block and releases it
 * with free(). Returns NULL if memory cannot be allocated.
 */
static int *ksBM_prep(const ubyte_t *pat, int m)
{
	int i, *suff, *prep, *bmGs, *bmBc;
	prep = calloc(m + 256, sizeof(int));
	if (prep == 0) return 0;
	bmGs = prep; bmBc = prep + m;
	{ // preBmBc()
		for (i = 0; i < 256; ++i) bmBc[i] = m;
		for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
	}
	suff = calloc(m, sizeof(int));
	if (suff == 0) {
		free(prep);
		return 0;
	}
	{ // suffixes()
		int f = 0, g;
		suff[m - 1] = m;
		g = m - 1;
		for (i = m - 2; i >= 0; --i) {
			if (i > g && suff[i + m - 1 - f] < i - g)
				suff[i] = suff[i + m - 1 - f];
			else {
				if (i < g) g = i;
				f = i;
				while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
				suff[i] = f - g;
			}
		}
	}
	{ // preBmGs()
		int j = 0;
		for (i = 0; i < m; ++i) bmGs[i] = m;
		for (i = m - 1; i >= 0; --i)
			if (suff[i] == i + 1)
				for (; j < m - 1 - i; ++j)
					if (bmGs[j] == m)
						bmGs[j] = m - 1 - i;
		for (i = 0; i <= m - 2; ++i)
			bmGs[m - 1 - suff[i]] = m - 1 - i;
	}
	free(suff);
	return prep;
}

/*
 * Find the first occurrence of the m-byte pattern _pat in the n-byte
 * buffer _str using Boyer-Moore search.
 *
 * _prep controls the lifetime of the shift tables:
 *   - _prep == NULL: the tables are built and freed inside this call;
 *   - *_prep == NULL: the tables are built and handed to the caller via
 *     *_prep (the caller must free() them);
 *   - otherwise *_prep is reused as previously built tables for the
 *     same pattern.
 *
 * Returns a pointer to the match, or NULL when there is no match, when
 * m is negative, or when the tables cannot be allocated. An empty
 * pattern (m == 0) matches at the start of the buffer.
 */
void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep)
{
	int i, j, *prep = 0, *bmGs, *bmBc;
	const ubyte_t *str, *pat;
	void *hit = 0;

	if (m < 0) return 0;
	if (m == 0) return (void*)_str; // the table builder needs at least one pattern byte
	str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat;
	prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep;
	if (prep == 0) return 0;
	if (_prep && *_prep == 0) *_prep = prep;
	bmGs = prep; bmBc = prep + m;
	j = 0;
	while (j <= n - m) {
		for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
		if (i < 0) {
			hit = (void*)(str + j);
			break;
		} else {
			int max = bmBc[str[i+j]] - m + 1 + i;
			if (max < bmGs[i]) max = bmGs[i];
			j += max;
		}
	}
	// release privately built tables on every exit path, match or not
	if (_prep == 0) free(prep);
	return hit;
}
#ifndef KSTRING_H
#define KSTRING_H

#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#ifndef kroundup32
/* Round x up, in place, to the smallest power of two >= x. The shifts stop
 * at 16, so this is only exact for values below 2^32. x is evaluated
 * several times and must be a modifiable lvalue. */
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

#ifndef KSTRING_T
#define KSTRING_T kstring_t
/* Growable, NUL-terminated string: l = current length, m = allocated
 * size of s in bytes. A zero-initialised kstring_t is a valid empty string. */
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif

/* State carried between successive kstrtok() calls. */
typedef struct {
	uint64_t tab[4];
	int sep, finished;
	const char *p; // end of the current token
} ks_tokaux_t;

#ifdef __cplusplus
extern "C" {
#endif

	int ksprintf(kstring_t *s, const char *fmt, ...);
	int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
	char *kstrstr(const char *str, const char *pat, int **_prep);
	char *kstrnstr(const char *str, const char *pat, int n, int **_prep);
	void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep);

	/* kstrtok() is similar to strtok_r() except that str is not
	 * modified and both str and sep can be NULL. For efficiency, it is
	 * actually recommended to set both to NULL in the subsequent calls
	 * if sep is not changed. */
	char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux);

#ifdef __cplusplus
}
#endif

/*
 * Internal helper: make sure the buffer is strictly larger than
 * s->l + extra + 1 bytes, growing it with the same policy the append
 * functions have always used. Returns 0 on success and -1 if the
 * reallocation fails; on failure the string is left untouched.
 */
static inline int kstring_reserve_(kstring_t *s, size_t extra)
{
	if (s->l + extra + 1 >= s->m) {
		size_t m = s->l + extra + 2;
		char *tmp;
		kroundup32(m);
		tmp = (char*)realloc(s->s, m);
		if (tmp == 0) return -1; // old buffer is still owned by s
		s->s = tmp;
		s->m = m;
	}
	return 0;
}

/* Append the first l bytes of p. Returns l, or -1 if l is negative or
 * memory cannot be allocated. */
static inline int kputsn(const char *p, int l, kstring_t *s)
{
	if (l < 0 || kstring_reserve_(s, (size_t)l) < 0) return -1;
	memcpy(s->s + s->l, p, l);
	s->l += l;
	s->s[s->l] = 0;
	return l;
}

/* Append the NUL-terminated string p. Returns its length, or -1 on failure. */
static inline int kputs(const char *p, kstring_t *s)
{
	return kputsn(p, strlen(p), s);
}

/* Append the single character c. Returns c, or -1 if memory cannot be allocated. */
static inline int kputc(int c, kstring_t *s)
{
	if (kstring_reserve_(s, 0) < 0) return -1;
	s->s[s->l++] = c;
	s->s[s->l] = 0;
	return c;
}

/* Append the decimal representation of the signed integer c.
 * Returns 0 for non-zero c (for c == 0 the result of kputc('0') is
 * returned, as before), or -1 if memory cannot be allocated. */
static inline int kputw(int c, kstring_t *s)
{
	char buf[16];
	int l, i;
	unsigned x;
	if (c == 0) return kputc('0', s);
	// take the magnitude in unsigned arithmetic: -c overflows for INT_MIN
	x = c < 0? 0u - (unsigned)c : (unsigned)c;
	for (l = 0; x > 0; x /= 10) buf[l++] = x%10 + '0';
	if (c < 0) buf[l++] = '-';
	if (kstring_reserve_(s, (size_t)l) < 0) return -1;
	// digits were produced least-significant first; emit them reversed
	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
	s->s[s->l] = 0;
	return 0;
}

/* Append the decimal representation of the unsigned integer c.
 * Same return convention as kputw(). */
static inline int kputuw(unsigned c, kstring_t *s)
{
	char buf[16];
	int l, i;
	unsigned x;
	if (c == 0) return kputc('0', s);
	for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0';
	if (kstring_reserve_(s, (size_t)l) < 0) return -1;
	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
	s->s[s->l] = 0;
	return 0;
}

/* Split s in place with ksplit_core() and store the number of fields in
 * *n. Returns the array of field offsets allocated by ksplit_core(),
 * which the caller must free(). An unallocated string yields zero
 * fields and a NULL result. */
static inline int *ksplit(kstring_t *s, int delimiter, int *n)
{
	int max = 0, *offsets = 0;
	if (s->s == 0) { // ksplit_core() requires a NUL-terminated buffer
		*n = 0;
		return 0;
	}
	*n = ksplit_core(s->s, delimiter, &max, &offsets);
	return offsets;
}

#endif
-------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/misc/Makefile: -------------------------------------------------------------------------------- 1 | CC= gcc 2 | CXX= g++ 3 | CFLAGS= -g -Wall -O2 #-m64 #-arch ppc 4 | CXXFLAGS= $(CFLAGS) 5 | DFLAGS= -D_FILE_OFFSET_BITS=64 6 | OBJS= 7 | PROG= md5sum-lite md5fa maq2sam-short maq2sam-long wgsim seqtk 8 | INCLUDES= -I.. 9 | SUBDIRS= . 10 | 11 | .SUFFIXES:.c .o 12 | 13 | .c.o: 14 | $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ 15 | 16 | all:$(PROG) 17 | 18 | lib-recur all-recur clean-recur cleanlocal-recur install-recur: 19 | @target=`echo $@ | sed s/-recur//`; \ 20 | wdir=`pwd`; \ 21 | list='$(SUBDIRS)'; for subdir in $$list; do \ 22 | cd $$subdir; \ 23 | $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ 24 | INCLUDES="$(INCLUDES)" $$target || exit 1; \ 25 | cd $$wdir; \ 26 | done; 27 | 28 | lib: 29 | 30 | seqtk:seqtk.o 31 | $(CC) $(CFLAGS) -o $@ seqtk.o -lm -lz 32 | 33 | wgsim:wgsim.o 34 | $(CC) $(CFLAGS) -o $@ wgsim.o -lm -lz 35 | 36 | md5fa:md5.o md5fa.o md5.h ../kseq.h 37 | $(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz 38 | 39 | md5sum-lite:md5sum-lite.o 40 | $(CC) $(CFLAGS) -o $@ md5sum-lite.o 41 | 42 | md5sum-lite.o:md5.c md5.h 43 | $(CC) -c $(CFLAGS) -DMD5SUM_MAIN -o $@ md5.c 44 | 45 | maq2sam-short:maq2sam.c 46 | $(CC) $(CFLAGS) -o $@ maq2sam.c -lz 47 | 48 | maq2sam-long:maq2sam.c 49 | $(CC) $(CFLAGS) -DMAQ_LONGREADS -o $@ maq2sam.c -lz 50 | 51 | md5fa.o:md5.h md5fa.c 52 | $(CC) $(CFLAGS) -c -I.. -o $@ md5fa.c 53 | 54 | seqtk.o:seqtk.c ../khash.h ../kseq.h 55 | $(CC) $(CFLAGS) -c -I.. -o $@ seqtk.c 56 | 57 | wgsim.o:wgsim.c ../kseq.h 58 | $(CC) $(CFLAGS) -c -I.. 
-o $@ wgsim.c 59 | 60 | cleanlocal: 61 | rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a 62 | 63 | clean:cleanlocal-recur 64 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/misc/blast2sam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | use strict; 4 | use warnings; 5 | use Getopt::Std; 6 | 7 | &blast2sam; 8 | 9 | sub blast2sam { 10 | my %opts = (); 11 | getopts('s', \%opts); 12 | die("Usage: blast2sam.pl \n") if (-t STDIN && @ARGV == 0); 13 | my ($qlen, $slen, $q, $s, $qbeg, $qend, @sam, @cigar, @cmaux, $show_seq); 14 | $show_seq = defined($opts{s}); 15 | @sam = (); @sam[0,4,6..8,10] = ('', 255, '*', 0, 0, '*'); 16 | while (<>) { 17 | if (@cigar && (/^Query=/ || /Score =.*bits.*Expect/)) { # print 18 | &blast_print_sam(\@sam, \@cigar, \@cmaux, $qlen - $qend); 19 | @cigar = (); 20 | } 21 | if (/^Query= (\S+)/) { 22 | $sam[0] = $1; 23 | } elsif (/\((\S+)\s+letters\)/) { 24 | $qlen = $1; $qlen =~ s/,//g; 25 | } elsif (/^>(\S+)/) { 26 | $sam[2] = $1; 27 | } elsif (/Length = (\d+)/) { 28 | $slen = $1; 29 | } elsif (/Score =\s+(\S+) bits.+Expect(\(\d+\))? = (\S+)/) { # the start of an alignment block 30 | my ($as, $ev) = (int($1 + .499), $3); 31 | $ev = "1$ev" if ($ev =~ /^e/); 32 | @sam[1,3,9,11,12] = (0, 0, '', "AS:i:$as", "EV:Z:$ev"); 33 | @cigar = (); $qbeg = 0; 34 | @cmaux = (0, 0, 0, ''); 35 | } elsif (/Strand = (\S+) \/ (\S+)/) { 36 | $sam[1] |= 0x10 if ($2 eq 'Minus'); 37 | } elsif (/Query\:\s(\d+)\s*(\S+)\s(\d+)/) { 38 | $q = $2; 39 | unless ($qbeg) { 40 | $qbeg = $1; 41 | push(@cigar, ($1-1) . 
"H") if ($1 > 1); 42 | } 43 | $qend = $3; 44 | if ($show_seq) { 45 | my $x = $q; 46 | $x =~ s/-//g; $sam[9] .= $x; 47 | } 48 | } elsif (/Sbjct\:\s(\d+)\s*(\S+)\s(\d+)/) { 49 | $s = $2; 50 | if ($sam[1] & 0x10) { 51 | $sam[3] = $3; 52 | } else { 53 | $sam[3] = $1 unless ($sam[3]); 54 | } 55 | &aln2cm(\@cigar, \$q, \$s, \@cmaux); 56 | } 57 | } 58 | &blast_print_sam(\@sam, \@cigar, \@cmaux, $qlen - $qend); 59 | } 60 | 61 | sub blast_print_sam { 62 | my ($sam, $cigar, $cmaux, $qrest) = @_; 63 | push(@$cigar, $cmaux->[1] . substr("MDI", $cmaux->[0], 1)); 64 | push(@$cigar, $qrest . 'H') if ($qrest); 65 | if ($sam->[1] & 0x10) { 66 | @$cigar = reverse(@$cigar); 67 | $sam->[9] = reverse($sam->[9]); 68 | $sam->[9] =~ tr/atgcrymkswATGCRYMKSW/tacgyrkmswTACGYRKMSW/; 69 | } 70 | $sam->[9] = '*' if (!$sam->[9]); 71 | $sam->[5] = join('', @$cigar); 72 | print join("\t", @$sam), "\n"; 73 | } 74 | 75 | sub aln2cm { 76 | my ($cigar, $q, $s, $cmaux) = @_; 77 | my $l = length($$q); 78 | for (my $i = 0; $i < $l; ++$i) { 79 | my $op; 80 | # set $op 81 | if (substr($$q, $i, 1) eq '-') { $op = 2; } 82 | elsif (substr($$s, $i, 1) eq '-') { $op = 1; } 83 | else { $op = 0; } 84 | # for CIGAR 85 | if ($cmaux->[0] == $op) { 86 | ++$cmaux->[1]; 87 | } else { 88 | push(@$cigar, $cmaux->[1] . 
substr("MDI", $cmaux->[0], 1)); 89 | $cmaux->[0] = $op; $cmaux->[1] = 1; 90 | } 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/misc/bowtie2sam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # Contact: lh3 4 | # Version: 0.1.1 5 | 6 | use strict; 7 | use warnings; 8 | use Getopt::Std; 9 | 10 | &bowtie2sam; 11 | exit; 12 | 13 | sub bowtie2sam { 14 | my %opts = (); 15 | die("Usage: bowtie2sam.pl \n") if (@ARGV == 0 && -t STDIN); 16 | # core loop 17 | my (@s, $last, @staging, $k, $best_s, $subbest_s, $best_k); 18 | $last = ''; 19 | while (<>) { 20 | my ($name, $nm) = &bowtie2sam_aux($_, \@s); # read_name, number of mismatches 21 | if ($name eq $last) { 22 | # I do not know whether the multiple hits are ordered on the 23 | # number of mismatches. I assume they are not and so I have to 24 | # keep all these multiple hits in memory. 25 | @{$staging[$k]} = @s; 26 | if ($best_s > $nm) { 27 | $subbest_s = $best_s; 28 | $best_s = $nm; 29 | $best_k = $k; 30 | } elsif ($subbest_s > $nm) { 31 | $subbest_s = $nm; 32 | } 33 | ++$k; 34 | } else { 35 | if ($last) { 36 | if ($best_s == $subbest_s) { 37 | $staging[$best_k][4] = 0; 38 | } elsif ($subbest_s - $best_s == 1) { 39 | $staging[$best_k][4] = 15 if ($staging[$best_k][4] > 15); 40 | } 41 | print join("\t", @{$staging[$best_k]}), "\n"; 42 | } 43 | $k = 1; $best_s = $nm; $subbest_s = 1000; $best_k = 0; 44 | @{$staging[0]} = @s; 45 | $last = $name; 46 | } 47 | } 48 | print join("\t", @{$staging[$best_k]}), "\n" if ($best_k >= 0); 49 | } 50 | 51 | sub bowtie2sam_aux { 52 | my ($line, $s) = @_; 53 | chomp($line); 54 | my @t = split("\t", $line); 55 | my $ret; 56 | @$s = (); 57 | # read name 58 | $s->[0] = $ret = $t[0]; 59 | $s->[0] =~ s/\/[12]$//g; 60 | # initial flag (will be updated later) 61 | $s->[1] = 0; 62 | # read & quality 63 | $s->[9] = $t[4]; $s->[10] = $t[5]; 64 | # cigar 65 | 
$s->[5] = length($s->[9]) . "M"; 66 | # coor 67 | $s->[2] = $t[2]; $s->[3] = $t[3] + 1; 68 | $s->[1] |= 0x10 if ($t[1] eq '-'); 69 | # mapQ 70 | $s->[4] = $t[6] == 0? 25 : 0; 71 | # mate coordinate 72 | $s->[6] = '*'; $s->[7] = $s->[8] = 0; 73 | # aux 74 | my $nm = @t - 7; 75 | push(@$s, "NM:i:" . (@t-7)); 76 | push(@$s, "X$nm:i:" . ($t[6]+1)); 77 | my $md = ''; 78 | if ($t[7]) { 79 | $_ = $t[7]; 80 | my $a = 0; 81 | while (/(\d+):[ACGTN]>([ACGTN])/gi) { 82 | my ($y, $z) = ($1, $2); 83 | $md .= (int($y)-$a) . $z; 84 | $a += $y - $a + 1; 85 | } 86 | $md .= length($s->[9]) - $a; 87 | } else { 88 | $md = length($s->[9]); 89 | } 90 | push(@$s, "MD:Z:$md"); 91 | return ($ret, $nm); 92 | } 93 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/misc/interpolate_sam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | ###Builds interpolated pileup from SAM file 5 | ##@description counts bases between paired ends and piles up single end reads. 6 | ##@output, uses a #header for the RNAME and then the number of reads per base 7 | ##@author sm8@sanger.ac.uk, Stephen B. Montgomery 8 | 9 | ##@caveats 10 | ##Requires RNAME to have format as per example 11 | ## chromosome:NCBI36:18:1:76117153:1 12 | ## supercontig::NT_113883:1:137703:1 13 | ## clone::AC138827.3:1:149397:1 14 | ##Expects simple CIGAR characters, M, I and D 15 | ##Expects SAM file to be sorted. 
16 | ##Expects 0x0010 to mark second read in PE file (as has been the observed case from MAQ output) (important for line 77) 17 | 18 | ##Verify and read in SAM file 19 | my $sam_file = $ARGV[0]; 20 | if(!defined($sam_file)) { die("No sam file defined on arg 1"); } 21 | unless(-f $sam_file) { die("Sam file does not exist: $sam_file"); } 22 | open(SAM, $sam_file) || die("Cannot open sam file"); 23 | 24 | ##Globals 25 | my $current_location = ""; ##Current RNAME being processed 26 | my $current_size = 0; ##Size of sequence region being processed 27 | my $current_position = 1; ##Current base being processed 28 | my $open = 0; ##Number of open reads (PE reads that have not been closed) 29 | my %close = (); ##Hash of closing positions, when the current_position gets to this position it subtracts the 30 | ##contained value from those open and deletes the indexed position from the hash 31 | 32 | while (my $line = ) { 33 | my @tokens = split /\t/, $line; 34 | 35 | if ($current_location ne $tokens[2]) { ##Start a new sequence region 36 | for (my $i = $current_position; $i <= $current_size; $i++) { ##Close the previous sequence region 37 | if (defined($close{$i})) { 38 | $open = $open - $close{$i}; 39 | delete $close{$i}; 40 | } 41 | print $open . "\n"; 42 | } 43 | if ($current_location ne "") { 44 | print "\n"; 45 | } 46 | 47 | ##Initiate a new sequence region 48 | my @location_tokens = split /:/, $tokens[2]; 49 | $current_position = 1; 50 | $current_location = $tokens[2]; 51 | $current_size = $location_tokens[4]; 52 | $open = 0; 53 | %close = (); 54 | print "#" . $tokens[2] . "\n"; 55 | 56 | ##Print pileup to just before the first read (will be 0) 57 | for (my $current_position = 1; $current_position < $tokens[3]; $current_position++) { 58 | print $open . 
"\n"; 59 | } 60 | $current_position = $tokens[3]; 61 | 62 | } else { ##Sequence region already open 63 | if ($tokens[3] > $current_position) { ##If the new read's position is greater than the current position 64 | ##cycle through to catch up to the current position 65 | for (my $i = $current_position; $i < $tokens[3]; $i++) { 66 | if (defined($close{$i})) { 67 | $open = $open - $close{$i}; 68 | delete $close{$i}; 69 | } 70 | print $open . "\n"; 71 | } 72 | $current_position = $tokens[3]; 73 | } 74 | } 75 | $open++; ##Increment the number of open reads 76 | 77 | if (($tokens[1] & 0x0080 || $tokens[1] & 0x0040) && $tokens[1] & 0x0010 && $tokens[1] & 0x0002) { ##if second read of mate pair, add close condition 78 | $open--; 79 | my $parsed_cig = &parseCigar($tokens[5]); 80 | my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1; 81 | if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; } 82 | $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1; 83 | } elsif (!($tokens[1] & 0x0001) || !($tokens[1] & 0x0002)) { ##if unpaired, add close condition 84 | my $parsed_cig = &parseCigar($tokens[5]); 85 | my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1; 86 | if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; } 87 | $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1; 88 | } else { 89 | #do nothing 90 | } 91 | } 92 | for (my $i = $current_position; $i <= $current_size; $i++) { ##Finish up the last sequence region 93 | if (defined($close{$i})) { 94 | $open = $open - $close{$i}; 95 | delete $close{$i}; 96 | } 97 | print $open . 
"\n"; 98 | } 99 | print "\n"; 100 | close(SAM); 101 | exit(0); 102 | 103 | ##reads and tokenizes simple cigarline 104 | sub parseCigar() { 105 | my $cigar_line = shift; 106 | $cigar_line =~ s/([0-9]*[A-Z]{1})/$1\t/g; 107 | my @cigar_tokens = split /\t/, $cigar_line; 108 | my %parsed = ('M' => 0, 109 | 'I' => 0, 110 | 'D' => 0); 111 | my @events = (); 112 | for(my $i = 0; $i < scalar(@cigar_tokens); $i++) { 113 | if ($cigar_tokens[$i] =~ /([0-9]+)([A-Z]{1})/g) { 114 | if (!defined($parsed{$2})) { $parsed{$2} = 0; } 115 | my $nt = $2; 116 | if ($nt ne "M" && $nt ne "D" && $nt ne "I") { $nt = "M"; } 117 | $parsed{$nt} += $1; 118 | my %event_el = ("t" => $nt, 119 | "n" => $1); 120 | push @events, \%event_el; 121 | } 122 | } 123 | $parsed{'events'} = \@events; 124 | return \%parsed; 125 | } 126 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/misc/maq2sam.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define PACKAGE_VERSION "r439" 9 | 10 | //#define MAQ_LONGREADS 11 | 12 | #ifdef MAQ_LONGREADS 13 | # define MAX_READLEN 128 14 | #else 15 | # define MAX_READLEN 64 16 | #endif 17 | 18 | #define MAX_NAMELEN 36 19 | #define MAQMAP_FORMAT_OLD 0 20 | #define MAQMAP_FORMAT_NEW -1 21 | 22 | #define PAIRFLAG_FF 0x01 23 | #define PAIRFLAG_FR 0x02 24 | #define PAIRFLAG_RF 0x04 25 | #define PAIRFLAG_RR 0x08 26 | #define PAIRFLAG_PAIRED 0x10 27 | #define PAIRFLAG_DIFFCHR 0x20 28 | #define PAIRFLAG_NOMATCH 0x40 29 | #define PAIRFLAG_SW 0x80 30 | 31 | typedef struct 32 | { 33 | uint8_t seq[MAX_READLEN]; /* the last base is the single-end mapping quality. 
*/ 34 | uint8_t size, map_qual, info1, info2, c[2], flag, alt_qual; 35 | uint32_t seqid, pos; 36 | int dist; 37 | char name[MAX_NAMELEN]; 38 | } maqmap1_t; 39 | 40 | typedef struct 41 | { 42 | int format, n_ref; 43 | char **ref_name; 44 | uint64_t n_mapped_reads; 45 | maqmap1_t *mapped_reads; 46 | } maqmap_t; 47 | 48 | maqmap_t *maq_new_maqmap() 49 | { 50 | maqmap_t *mm = (maqmap_t*)calloc(1, sizeof(maqmap_t)); 51 | mm->format = MAQMAP_FORMAT_NEW; 52 | return mm; 53 | } 54 | void maq_delete_maqmap(maqmap_t *mm) 55 | { 56 | int i; 57 | if (mm == 0) return; 58 | for (i = 0; i < mm->n_ref; ++i) 59 | free(mm->ref_name[i]); 60 | free(mm->ref_name); 61 | free(mm->mapped_reads); 62 | free(mm); 63 | } 64 | maqmap_t *maqmap_read_header(gzFile fp) 65 | { 66 | maqmap_t *mm; 67 | int k, len; 68 | mm = maq_new_maqmap(); 69 | gzread(fp, &mm->format, sizeof(int)); 70 | if (mm->format != MAQMAP_FORMAT_NEW) { 71 | if (mm->format > 0) { 72 | fprintf(stderr, "** Obsolete map format is detected. Please use 'mapass2maq' command to convert the format.\n"); 73 | exit(3); 74 | } 75 | assert(mm->format == MAQMAP_FORMAT_NEW); 76 | } 77 | gzread(fp, &mm->n_ref, sizeof(int)); 78 | mm->ref_name = (char**)calloc(mm->n_ref, sizeof(char*)); 79 | for (k = 0; k != mm->n_ref; ++k) { 80 | gzread(fp, &len, sizeof(int)); 81 | mm->ref_name[k] = (char*)malloc(len * sizeof(char)); 82 | gzread(fp, mm->ref_name[k], len); 83 | } 84 | /* read number of mapped reads */ 85 | gzread(fp, &mm->n_mapped_reads, sizeof(uint64_t)); 86 | return mm; 87 | } 88 | 89 | void maq2tam_core(gzFile fp, const char *rg) 90 | { 91 | maqmap_t *mm; 92 | maqmap1_t mm1, *m1; 93 | int ret; 94 | m1 = &mm1; 95 | mm = maqmap_read_header(fp); 96 | while ((ret = gzread(fp, m1, sizeof(maqmap1_t))) == sizeof(maqmap1_t)) { 97 | int j, flag = 0, se_mapq = m1->seq[MAX_READLEN-1]; 98 | if (m1->flag) flag |= 1; 99 | if ((m1->flag&PAIRFLAG_PAIRED) || ((m1->flag&PAIRFLAG_SW) && m1->flag != 192)) flag |= 2; 100 | if (m1->flag == 192) flag |= 4; 101 | 
if (m1->flag == 64) flag |= 8; 102 | if (m1->pos&1) flag |= 0x10; 103 | if ((flag&1) && m1->dist != 0) { 104 | int c; 105 | if (m1->dist > 0) { 106 | if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_RF)) c = 0; 107 | else if (m1->flag&(PAIRFLAG_FR|PAIRFLAG_RR)) c = 1; 108 | else c = m1->pos&1; 109 | } else { 110 | if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_FR)) c = 0; 111 | else if (m1->flag&(PAIRFLAG_RF|PAIRFLAG_RR)) c = 1; 112 | else c = m1->pos&1; 113 | } 114 | if (c) flag |= 0x20; 115 | } 116 | if (m1->flag) { 117 | int l = strlen(m1->name); 118 | if (m1->name[l-2] == '/') { 119 | flag |= (m1->name[l-1] == '1')? 0x40 : 0x80; 120 | m1->name[l-2] = '\0'; 121 | } 122 | } 123 | printf("%s\t%d\t", m1->name, flag); 124 | printf("%s\t%d\t", mm->ref_name[m1->seqid], (m1->pos>>1)+1); 125 | if (m1->flag == 130) { 126 | int c = (int8_t)m1->seq[MAX_READLEN-1]; 127 | printf("%d\t", m1->alt_qual); 128 | if (c == 0) printf("%dM\t", m1->size); 129 | else { 130 | if (c > 0) printf("%dM%dI%dM\t", m1->map_qual, c, m1->size - m1->map_qual - c); 131 | else printf("%dM%dD%dM\t", m1->map_qual, -c, m1->size - m1->map_qual); 132 | } 133 | se_mapq = 0; // zero SE mapQ for reads aligned by SW 134 | } else { 135 | if (flag&4) printf("0\t*\t"); 136 | else printf("%d\t%dM\t", m1->map_qual, m1->size); 137 | } 138 | printf("*\t0\t%d\t", m1->dist); 139 | for (j = 0; j != m1->size; ++j) { 140 | if (m1->seq[j] == 0) putchar('N'); 141 | else putchar("ACGT"[m1->seq[j]>>6&3]); 142 | } 143 | putchar('\t'); 144 | for (j = 0; j != m1->size; ++j) 145 | putchar((m1->seq[j]&0x3f) + 33); 146 | putchar('\t'); 147 | if (rg) printf("RG:Z:%s\t", rg); 148 | if (flag&4) { // unmapped 149 | printf("MF:i:%d\n", m1->flag); 150 | } else { 151 | printf("MF:i:%d\t", m1->flag); 152 | if (m1->flag) printf("AM:i:%d\tSM:i:%d\t", m1->alt_qual, se_mapq); 153 | printf("NM:i:%d\tUQ:i:%d\tH0:i:%d\tH1:i:%d\n", m1->info1&0xf, m1->info2, m1->c[0], m1->c[1]); 154 | } 155 | } 156 | if (ret > 0) 157 | fprintf(stderr, "Truncated! 
Continue anyway.\n"); 158 | maq_delete_maqmap(mm); 159 | } 160 | 161 | int main(int argc, char *argv[]) 162 | { 163 | gzFile fp; 164 | if (argc == 1) { 165 | fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); 166 | fprintf(stderr, "Usage: maq2sam []\n"); 167 | return 1; 168 | } 169 | fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); 170 | maq2tam_core(fp, argc > 2? argv[2] : 0); 171 | gzclose(fp); 172 | return 0; 173 | } 174 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/misc/md5.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is adapted from a program in this page: 3 | 4 | http://www.fourmilab.ch/md5/ 5 | 6 | The original source code does not work on 64-bit machines due to the 7 | wrong typedef "uint32". I also added prototypes. 8 | 9 | -lh3 10 | */ 11 | 12 | #ifndef MD5_H 13 | #define MD5_H 14 | 15 | /* The following tests optimise behaviour on little-endian 16 | machines, where there is no need to reverse the byte order 17 | of 32 bit words in the MD5 computation. By default, 18 | HIGHFIRST is defined, which indicates we're running on a 19 | big-endian (most significant byte first) machine, on which 20 | the byteReverse function in md5.c must be invoked. However, 21 | byteReverse is coded in such a way that it is an identity 22 | function when run on a little-endian machine, so calling it 23 | on such a platform causes no harm apart from wasting time. 24 | If the platform is known to be little-endian, we speed 25 | things up by undefining HIGHFIRST, which defines 26 | byteReverse as a null macro. Doing things in this manner 27 | insures we work on new platforms regardless of their byte 28 | order. 
*/ 29 | 30 | #define HIGHFIRST 31 | 32 | #if __LITTLE_ENDIAN__ != 0 33 | #undef HIGHFIRST 34 | #endif 35 | 36 | #include 37 | 38 | struct MD5Context { 39 | uint32_t buf[4]; 40 | uint32_t bits[2]; 41 | unsigned char in[64]; 42 | }; 43 | 44 | void MD5Init(struct MD5Context *ctx); 45 | void MD5Update(struct MD5Context *ctx, unsigned char *buf, unsigned len); 46 | void MD5Final(unsigned char digest[16], struct MD5Context *ctx); 47 | 48 | /* 49 | * This is needed to make RSAREF happy on some MS-DOS compilers. 50 | */ 51 | typedef struct MD5Context MD5_CTX; 52 | 53 | /* Define CHECK_HARDWARE_PROPERTIES to have main,c verify 54 | byte order and uint32_t settings. */ 55 | #define CHECK_HARDWARE_PROPERTIES 56 | 57 | #endif /* !MD5_H */ 58 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/misc/md5fa.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "md5.h" 4 | #include "kseq.h" 5 | 6 | #define HEX_STR "0123456789abcdef" 7 | 8 | KSEQ_INIT(gzFile, gzread) 9 | 10 | static void md5_one(const char *fn) 11 | { 12 | MD5_CTX md5_one, md5_all; 13 | int l, i, k; 14 | gzFile fp; 15 | kseq_t *seq; 16 | unsigned char unordered[16], digest[16]; 17 | 18 | for (l = 0; l < 16; ++l) unordered[l] = 0; 19 | fp = strcmp(fn, "-")? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); 20 | if (fp == 0) { 21 | fprintf(stderr, "md5fa: %s: No such file or directory\n", fn); 22 | exit(1); 23 | } 24 | 25 | MD5Init(&md5_all); 26 | seq = kseq_init(fp); 27 | while ((l = kseq_read(seq)) >= 0) { 28 | for (i = k = 0; i < seq->seq.l; ++i) { 29 | if (islower(seq->seq.s[i])) seq->seq.s[k++] = toupper(seq->seq.s[i]); 30 | else if (isupper(seq->seq.s[i])) seq->seq.s[k++] = seq->seq.s[i]; 31 | } 32 | MD5Init(&md5_one); 33 | MD5Update(&md5_one, (unsigned char*)seq->seq.s, k); 34 | MD5Final(digest, &md5_one); 35 | for (l = 0; l < 16; ++l) { 36 | printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]); 37 | unordered[l] ^= digest[l]; 38 | } 39 | printf(" %s %s\n", fn, seq->name.s); 40 | MD5Update(&md5_all, (unsigned char*)seq->seq.s, k); 41 | } 42 | MD5Final(digest, &md5_all); 43 | kseq_destroy(seq); 44 | for (l = 0; l < 16; ++l) 45 | printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]); 46 | printf(" %s >ordered\n", fn); 47 | for (l = 0; l < 16; ++l) 48 | printf("%c%c", HEX_STR[unordered[l]>>4&0xf], HEX_STR[unordered[l]&0xf]); 49 | printf(" %s >unordered\n", fn); 50 | } 51 | 52 | int main(int argc, char *argv[]) 53 | { 54 | int i; 55 | if (argc == 1) md5_one("-"); 56 | else for (i = 1; i < argc; ++i) md5_one(argv[i]); 57 | return 0; 58 | } 59 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/misc/psl2sam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # Author: lh3 4 | 5 | # This script calculates a score using the BLAST scoring 6 | # system. However, I am not sure how to count gap opens and gap 7 | # extensions. It seems to me that column 5-8 are not what I am 8 | # after. This script counts gaps from the last three columns. It does 9 | # not generate reference skip (N) in the CIGAR as it is not easy to 10 | # directly tell which gaps correspond to introns. 
11 | 12 | use strict; 13 | use warnings; 14 | use Getopt::Std; 15 | 16 | my %opts = (a=>1, b=>3, q=>5, r=>2); 17 | getopts('a:b:q:r:', \%opts); 18 | die("Usage: psl2sam.pl [-a $opts{a}] [-b $opts{b}] [-q $opts{q}] [-r $opts{r}] \n") if (@ARGV == 0 && -t STDIN); 19 | 20 | my @stack; 21 | my $last = ''; 22 | my ($a, $b, $q, $r) = ($opts{a}, $opts{b}, $opts{q}, $opts{r}); 23 | while (<>) { 24 | next unless (/^\d/); 25 | my @t = split; 26 | my @s; 27 | my $cigar = ''; 28 | if ($t[8] eq '-') { 29 | my $tmp = $t[11]; 30 | $t[11] = $t[10] - $t[12]; 31 | $t[12] = $t[10] - $tmp; 32 | } 33 | @s[0..4] = ($t[9], (($t[8] eq '+')? 0 : 16), $t[13], $t[15]+1, 0); 34 | @s[6..10] = ('*', 0, 0, '*', '*'); 35 | $cigar .= $t[11].'H' if ($t[11]); # 5'-end clipping 36 | my @x = split(',', $t[18]); 37 | my @y = split(',', $t[19]); 38 | my @z = split(',', $t[20]); 39 | my ($y0, $z0) = ($y[0], $z[0]); 40 | my ($gap_open, $gap_ext) = (0, 0, 0); 41 | for (1 .. $t[17]-1) { 42 | my $ly = $y[$_] - $y[$_-1] - $x[$_-1]; 43 | my $lz = $z[$_] - $z[$_-1] - $x[$_-1]; 44 | if ($ly < $lz) { # del: the reference gap is longer 45 | ++$gap_open; 46 | $gap_ext += $lz - $ly; 47 | $cigar .= ($y[$_] - $y0) . 'M'; 48 | $cigar .= ($lz - $ly) . 'D'; 49 | ($y0, $z0) = ($y[$_], $z[$_]); 50 | } elsif ($lz < $ly) { # ins: the query gap is longer 51 | ++$gap_open; 52 | $gap_ext += $ly - $lz; 53 | $cigar .= ($z[$_] - $z0) . 'M'; 54 | $cigar .= ($ly - $lz) . 'I'; 55 | ($y0, $z0) = ($y[$_], $z[$_]); 56 | } 57 | } 58 | $cigar .= ($t[12] - $y0) . 
'M'; 59 | $cigar .= ($t[10] - $t[12]).'H' if ($t[10] != $t[12]); # 3'-end clipping 60 | $s[5] = $cigar; 61 | my $score = $a * $t[0] - $b * $t[1] - $q * $gap_open - $r * $gap_ext; 62 | $score = 0 if ($score < 0); 63 | $s[11] = "AS:i:$score"; 64 | print join("\t", @s), "\n"; 65 | } 66 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/misc/soap2sam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # Contact: lh3 4 | # Version: 0.1.1 5 | 6 | use strict; 7 | use warnings; 8 | use Getopt::Std; 9 | 10 | &soap2sam; 11 | exit; 12 | 13 | sub mating { 14 | my ($s1, $s2) = @_; 15 | my $isize = 0; 16 | if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize 17 | my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3]; 18 | my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3]; 19 | $isize = $x2 - $x1; 20 | } 21 | # update mate coordinate 22 | if ($s2->[2] ne '*') { 23 | @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize); 24 | $s1->[1] |= 0x20 if ($s2->[1] & 0x10); 25 | } else { 26 | $s1->[1] |= 0x8; 27 | } 28 | if ($s1->[2] ne '*') { 29 | @$s2[6..8] = (($s1->[2] eq $s2->[2])? 
"=" : $s1->[2], $s1->[3], -$isize); 30 | $s2->[1] |= 0x20 if ($s1->[1] & 0x10); 31 | } else { 32 | $s2->[1] |= 0x8; 33 | } 34 | } 35 | 36 | sub soap2sam { 37 | my %opts = (); 38 | getopts("p", \%opts); 39 | die("Usage: soap2sam.pl [-p] \n") if (@ARGV == 0 && -t STDIN); 40 | my $is_paired = defined($opts{p}); 41 | # core loop 42 | my @s1 = (); 43 | my @s2 = (); 44 | my ($s_last, $s_curr) = (\@s1, \@s2); 45 | while (<>) { 46 | s/[\177-\377]|[\000-\010]|[\012-\040]//g; 47 | next if (&soap2sam_aux($_, $s_curr, $is_paired) < 0); 48 | if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) { 49 | &mating($s_last, $s_curr); 50 | print join("\t", @$s_last), "\n"; 51 | print join("\t", @$s_curr), "\n"; 52 | @$s_last = (); @$s_curr = (); 53 | } else { 54 | print join("\t", @$s_last), "\n" if (@$s_last != 0); 55 | my $s = $s_last; $s_last = $s_curr; $s_curr = $s; 56 | } 57 | } 58 | print join("\t", @$s_last), "\n" if (@$s_last != 0); 59 | } 60 | 61 | sub soap2sam_aux { 62 | my ($line, $s, $is_paired) = @_; 63 | chomp($line); 64 | my @t = split(/\s+/, $line); 65 | return -1 if (@t < 9 || $line =~ /^\s/ || !$t[0]); 66 | @$s = (); 67 | # fix SOAP-2.1.x bugs 68 | @t = @t[0..2,4..$#t] unless ($t[3] =~ /^\d+$/); 69 | # read name 70 | $s->[0] = $t[0]; 71 | $s->[0] =~ s/\/[12]$//g; 72 | # initial flag (will be updated later) 73 | $s->[1] = 0; 74 | $s->[1] |= 1 | 1<<($t[4] eq 'a'? 6 : 7); 75 | $s->[1] |= 2 if ($is_paired); 76 | # read & quality 77 | $s->[9] = $t[1]; 78 | $s->[10] = (length($t[2]) > length($t[1]))? substr($t[2], 0, length($t[1])) : $t[2]; 79 | # cigar 80 | $s->[5] = length($s->[9]) . "M"; 81 | # coor 82 | $s->[2] = $t[7]; $s->[3] = $t[8]; 83 | $s->[1] |= 0x10 if ($t[6] eq '-'); 84 | # mapQ 85 | $s->[4] = $t[3] == 1? 30 : 0; 86 | # mate coordinate 87 | $s->[6] = '*'; $s->[7] = $s->[8] = 0; 88 | # aux 89 | push(@$s, "NM:i:$t[9]"); 90 | my $md = ''; 91 | if ($t[9]) { 92 | my @x; 93 | for (10 .. 
# Evaluate alignments of wgsim-simulated reads.
#
# Reads SAM lines from the files in @ARGV (or STDIN). The read name is
# expected to look like "<chr>_<pos1>_<pos2>_..."; an alignment counts as
# correct when it is on <chr> and its relevant end lies within -g (default 5)
# bases of the simulated coordinate, tested both with and without the
# clipped bases added back.
#
# Options:
#   -p      print wrong alignments with mapQ >= 10 to STDERR
#   -c      choose the coordinate to compare from the 0x40/0x80 and 0x10 flags
#           (branches labelled F3/R3 below)
#   -a      print cumulative counts per exact mapQ instead of per mapQ/10 bin
#   -g INT  tolerated distance
sub wgsim_eval {
  my %opts = (g=>5);
  getopts('pcag:', \%opts);
  die("Usage: wgsim_eval.pl [-pca] [-g $opts{g}] <in.sam>\n") if (@ARGV == 0 && -t STDIN);
  my (@c0, @c1, %fnfp);
  my ($max_q, $flag) = (0, 0);
  my $gap = $opts{g};
  $flag |= 1 if (defined $opts{p});
  $flag |= 2 if (defined $opts{c});
  while (<>) {
    next if (/^\@/);
    my @t = split("\t");
    next if (@t < 11);
    my $line = $_;
    my ($q, $is_correct, $chr, $left, $rght) = (int($t[4]/10), 1, $t[2], $t[3], $t[3]);
    $max_q = $q if ($q > $max_q);
    # right coordinate
    $_ = $t[5]; s/(\d+)[MDN]/$rght+=$1,'x'/eg;
    --$rght;
    # correct for soft clipping
    my ($left0, $rght0) = ($left, $rght);
    $left -= $1 if (/^(\d+)[SH]/);
    $rght += $1 if (/(\d+)[SH]$/);
    $left0 -= $1 if (/(\d+)[SH]$/);
    $rght0 += $1 if (/^(\d+)[SH]/);
    # skip unmapped reads
    next if (($t[1]&0x4) || $chr eq '*');
    # parse read name and check
    if ($t[0] =~ /^(\S+)_(\d+)_(\d+)_/) {
      if ($1 ne $chr) { # different chr
        $is_correct = 0;
      } else {
        if ($flag & 2) {
          if (($t[1]&0x40) && !($t[1]&0x10)) { # F3, forward
            $is_correct = 0 if (abs($2 - $left) > $gap && abs($2 - $left0) > $gap);
          } elsif (($t[1]&0x40) && ($t[1]&0x10)) { # F3, reverse
            $is_correct = 0 if (abs($3 - $rght) > $gap && abs($3 - $rght0) > $gap);
          } elsif (($t[1]&0x80) && !($t[1]&0x10)) { # R3, forward
            $is_correct = 0 if (abs($3 - $left) > $gap && abs($3 - $left0) > $gap);
          } else { # R3, reverse
            # both tests must use the same simulated coordinate ($2), as in
            # every other branch
            $is_correct = 0 if (abs($2 - $rght) > $gap && abs($2 - $rght0) > $gap);
          }
        } else {
          if ($t[1] & 0x10) { # reverse
            $is_correct = 0 if (abs($3 - $rght) > $gap && abs($3 - $rght0) > $gap); # in case of indels that are close to the end of a reads
          } else {
            $is_correct = 0 if (abs($2 - $left) > $gap && abs($2 - $left0) > $gap);
          }
        }
      }
    } else {
      warn("[wgsim_eval] read '$t[0]' was not generated by wgsim?\n");
      next;
    }
    ++$c0[$q];
    ++$c1[$q] unless ($is_correct);
    @{$fnfp{$t[4]}} = (0, 0) unless (defined $fnfp{$t[4]});
    ++$fnfp{$t[4]}[0];
    ++$fnfp{$t[4]}[1] unless ($is_correct);
    print STDERR $line if (($flag&1) && !$is_correct && $q > 0);
  }
  # print
  my ($cc0, $cc1) = (0, 0);
  if (!defined($opts{a})) {
    # one row per mapQ/10 bin, highest first, with cumulative totals
    for (my $i = $max_q; $i >= 0; --$i) {
      $c0[$i] = 0 unless (defined $c0[$i]);
      $c1[$i] = 0 unless (defined $c1[$i]);
      $cc0 += $c0[$i]; $cc1 += $c1[$i];
      printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0) if ($cc0);
    }
  } else {
    # one row per exact mapQ (0 excluded), highest first, cumulative
    for (reverse(sort {$a<=>$b} (keys %fnfp))) {
      next if ($_ == 0);
      $cc0 += $fnfp{$_}[0];
      $cc1 += $fnfp{$_}[1];
      print join("\t", $_, $cc0, $cc1), "\n";
    }
  }
}
= @_; 15 | my $isize = 0; 16 | if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize 17 | my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3]; 18 | my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3]; 19 | $isize = $x2 - $x1; 20 | } 21 | # update mate coordinate 22 | if ($s2->[2] ne '*') { 23 | @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize); 24 | $s1->[1] |= 0x20 if ($s2->[1] & 0x10); 25 | } else { 26 | $s1->[1] |= 0x8; 27 | } 28 | if ($s1->[2] ne '*') { 29 | @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize); 30 | $s2->[1] |= 0x20 if ($s1->[1] & 0x10); 31 | } else { 32 | $s2->[1] |= 0x8; 33 | } 34 | } 35 | 36 | sub zoom2sam { 37 | my %opts = (); 38 | getopts("p", \%opts); 39 | die("Usage: zoom2sam.pl [-p] 40 | Warnings: This script only supports the default Illumina outputs.\n") if (@ARGV < 2); 41 | my $is_paired = defined($opts{p}); 42 | my $len = shift(@ARGV); 43 | # core loop 44 | my @s1 = (); 45 | my @s2 = (); 46 | my ($s_last, $s_curr) = (\@s1, \@s2); 47 | while (<>) { 48 | &zoom2sam_aux($_, $s_curr, $is_paired, $len); 49 | if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) { 50 | &mating($s_last, $s_curr); 51 | print join("\t", @$s_last), "\n"; 52 | print join("\t", @$s_curr), "\n"; 53 | @$s_last = (); @$s_curr = (); 54 | } else { 55 | print join("\t", @$s_last), "\n" if (@$s_last != 0); 56 | my $s = $s_last; $s_last = $s_curr; $s_curr = $s; 57 | } 58 | } 59 | print join("\t", @$s_last), "\n" if (@$s_last != 0); 60 | } 61 | 62 | sub zoom2sam_aux { 63 | my ($line, $s, $is_paired, $len) = @_; 64 | chomp($line); 65 | my @t = split("\t", $line); 66 | @$s = (); 67 | # read name 68 | $s->[0] = $t[0]; 69 | # initial flag (will be updated later) 70 | $s->[1] = 0; 71 | $s->[1] |= 1 | 1<<6 if ($s->[0] =~ /_F$/); 72 | $s->[1] |= 1 | 1<<7 if ($s->[0] =~ /_R$/); 73 | $s->[1] |= 2 if ($is_paired); 74 | # read & quality 75 | $s->[9] = "*"; $s->[10] = "*"; 76 | # cigar 77 | $s->[5] 
= $len . "M"; 78 | # coor 79 | my @s = split(/\s+/, $t[1]); 80 | $s->[2] = $s[0]; 81 | $t[1] =~ /:(\d+)$/; 82 | $s->[3] = $1 + 1; 83 | if ($s->[0] =~ /_[FR]$/) { 84 | my $u = ($s->[0] =~ /_F$/)? 1 : 0; 85 | my $w = ($t[2] eq '+')? 1 : 0; 86 | $s->[1] |= 0x10 if ($u ^ $w); 87 | $s->[0] =~ s/_[FR]$//; 88 | } else { 89 | $s->[1] |= 0x10 if ($t[2] eq '-'); 90 | } 91 | # mapQ 92 | $s->[4] = 30; 93 | # mate coordinate 94 | $s->[6] = '*'; $s->[7] = $s->[8] = 0; 95 | # aux 96 | push(@$s, "NM:i:$t[3]"); 97 | } 98 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/razf.h: -------------------------------------------------------------------------------- 1 | /*- 2 | * RAZF : Random Access compressed(Z) File 3 | * Version: 1.0 4 | * Release Date: 2008-10-27 5 | * 6 | * Copyright 2008, Jue Ruan , Heng Li 7 | * 8 | * All rights reserved. 9 | * 10 | * Redistribution and use in source and binary forms, with or without 11 | * modification, are permitted provided that the following conditions 12 | * are met: 13 | * 1. Redistributions of source code must retain the above copyright 14 | * notice, this list of conditions and the following disclaimer. 15 | * 2. Redistributions in binary form must reproduce the above copyright 16 | * notice, this list of conditions and the following disclaimer in the 17 | * documentation and/or other materials provided with the distribution. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 | * SUCH DAMAGE. 30 | */ 31 | 32 | 33 | #ifndef __RAZF_RJ_H 34 | #define __RAZF_RJ_H 35 | 36 | #include 37 | #include 38 | #include "zlib.h" 39 | 40 | #ifdef _USE_KNETFILE 41 | #include "knetfile.h" 42 | #endif 43 | 44 | #if ZLIB_VERNUM < 0x1221 45 | #define _RZ_READONLY 46 | struct _gz_header_s; 47 | typedef struct _gz_header_s _gz_header; 48 | #define gz_header _gz_header 49 | #endif 50 | 51 | #define WINDOW_BITS 15 52 | 53 | #ifndef RZ_BLOCK_SIZE 54 | #define RZ_BLOCK_SIZE (1<mode from HEAD to TYPE after call inflateReset */ 104 | int buf_off, buf_len; 105 | int z_err, z_eof; 106 | int seekable; 107 | /* Indice where the source is seekable */ 108 | int load_index; 109 | /* set has_index to 0 in mode 'w', then index will be discarded */ 110 | } RAZF; 111 | 112 | #ifdef __cplusplus 113 | extern "C" { 114 | #endif 115 | 116 | RAZF* razf_dopen(int data_fd, const char *mode); 117 | RAZF *razf_open(const char *fn, const char *mode); 118 | int razf_write(RAZF* rz, const void *data, int size); 119 | int razf_read(RAZF* rz, void *data, int size); 120 | int64_t razf_seek(RAZF* rz, int64_t pos, int where); 121 | void razf_close(RAZF* rz); 122 | 123 | #define razf_tell(rz) ((rz)->out) 124 | 125 | RAZF* razf_open2(const char *filename, const char *mode); 126 | RAZF* razf_dopen2(int fd, const char *mode); 127 | uint64_t razf_tell2(RAZF *rz); 128 | int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where); 129 | 130 | #ifdef __cplusplus 131 | } 132 | 
#endif 133 | 134 | #endif 135 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/razip.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "razf.h" 8 | 9 | #define WINDOW_SIZE 4096 10 | 11 | static int razf_main_usage() 12 | { 13 | printf("\n"); 14 | printf("Usage: razip [options] [file] ...\n\n"); 15 | printf("Options: -c write on standard output, keep original files unchanged\n"); 16 | printf(" -d decompress\n"); 17 | printf(" -l list compressed file contents\n"); 18 | printf(" -b INT decompress at INT position in the uncompressed file\n"); 19 | printf(" -s INT decompress INT bytes in the uncompressed file\n"); 20 | printf(" -h give this help\n"); 21 | printf("\n"); 22 | return 0; 23 | } 24 | 25 | static int write_open(const char *fn, int is_forced) 26 | { 27 | int fd = -1; 28 | char c; 29 | if (!is_forced) { 30 | if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) { 31 | printf("razip: %s already exists; do you wish to overwrite (y or n)? 
", fn); 32 | scanf("%c", &c); 33 | if (c != 'Y' && c != 'y') { 34 | printf("razip: not overwritten\n"); 35 | exit(1); 36 | } 37 | } 38 | } 39 | if (fd < 0) { 40 | if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) { 41 | fprintf(stderr, "razip: %s: Fail to write\n", fn); 42 | exit(1); 43 | } 44 | } 45 | return fd; 46 | } 47 | 48 | int main(int argc, char **argv) 49 | { 50 | int c, compress, pstdout, is_forced; 51 | RAZF *rz; 52 | void *buffer; 53 | long start, end, size; 54 | 55 | compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; 56 | while((c = getopt(argc, argv, "cdlhfb:s:")) >= 0){ 57 | switch(c){ 58 | case 'h': return razf_main_usage(); 59 | case 'd': compress = 0; break; 60 | case 'c': pstdout = 1; break; 61 | case 'l': compress = 2; break; 62 | case 'b': start = atol(optarg); break; 63 | case 's': size = atol(optarg); break; 64 | case 'f': is_forced = 1; break; 65 | } 66 | } 67 | if (size >= 0) end = start + size; 68 | if(end >= 0 && end < start){ 69 | fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end); 70 | return 1; 71 | } 72 | if(compress == 1){ 73 | int f_src, f_dst = -1; 74 | if(argc > optind){ 75 | if((f_src = open(argv[optind], O_RDONLY)) < 0){ 76 | fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]); 77 | return 1; 78 | } 79 | if(pstdout){ 80 | f_dst = fileno(stdout); 81 | } else { 82 | char *name = malloc(sizeof(strlen(argv[optind]) + 5)); 83 | strcpy(name, argv[optind]); 84 | strcat(name, ".rz"); 85 | f_dst = write_open(name, is_forced); 86 | if (f_dst < 0) return 1; 87 | free(name); 88 | } 89 | } else if(pstdout){ 90 | f_src = fileno(stdin); 91 | f_dst = fileno(stdout); 92 | } else return razf_main_usage(); 93 | rz = razf_dopen(f_dst, "w"); 94 | buffer = malloc(WINDOW_SIZE); 95 | while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) razf_write(rz, buffer, c); 96 | razf_close(rz); // f_dst will be closed here 97 | if (argc > optind && !pstdout) unlink(argv[optind]); 98 | free(buffer); 99 | 
close(f_src); 100 | return 0; 101 | } else { 102 | if(argc <= optind) return razf_main_usage(); 103 | if(compress == 2){ 104 | rz = razf_open(argv[optind], "r"); 105 | if(rz->file_type == FILE_TYPE_RZ) { 106 | printf("%20s%20s%7s %s\n", "compressed", "uncompressed", "ratio", "name"); 107 | printf("%20lld%20lld%6.1f%% %s\n", (long long)rz->end, (long long)rz->src_end, rz->end * 100.0f / rz->src_end, 108 | argv[optind]); 109 | } else fprintf(stdout, "%s is not a regular rz file\n", argv[optind]); 110 | } else { 111 | int f_dst; 112 | if (argc > optind && !pstdout) { 113 | char *name; 114 | if (strstr(argv[optind], ".rz") - argv[optind] != strlen(argv[optind]) - 3) { 115 | printf("razip: %s: unknown suffix -- ignored\n", argv[optind]); 116 | return 1; 117 | } 118 | name = strdup(argv[optind]); 119 | name[strlen(name) - 3] = '\0'; 120 | f_dst = write_open(name, is_forced); 121 | free(name); 122 | } else f_dst = fileno(stdout); 123 | rz = razf_open(argv[optind], "r"); 124 | buffer = malloc(WINDOW_SIZE); 125 | razf_seek(rz, start, SEEK_SET); 126 | while(1){ 127 | if(end < 0) c = razf_read(rz, buffer, WINDOW_SIZE); 128 | else c = razf_read(rz, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); 129 | if(c <= 0) break; 130 | start += c; 131 | write(f_dst, buffer, c); 132 | if(end >= 0 && start >= end) break; 133 | } 134 | free(buffer); 135 | if (!pstdout) unlink(argv[optind]); 136 | } 137 | razf_close(rz); 138 | return 0; 139 | } 140 | } 141 | 142 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/sam.h: -------------------------------------------------------------------------------- 1 | #ifndef BAM_SAM_H 2 | #define BAM_SAM_H 3 | 4 | #include "bam.h" 5 | 6 | /*! 7 | @header 8 | 9 | This file provides higher level of I/O routines and unifies the APIs 10 | for SAM and BAM formats. These APIs are more convenient and 11 | recommended. 12 | 13 | @copyright Genome Research Ltd. 14 | */ 15 | 16 | /*! 
@typedef 17 | @abstract SAM/BAM file handler 18 | @field type type of the handler; bit 1 for BAM, 2 for reading and bit 3-4 for flag format 19 | @field bam BAM file handler; valid if (type&1) == 1 20 | @field tamr SAM file handler for reading; valid if type == 2 21 | @field tamw SAM file handler for writing; valid if type == 0 22 | @field header header struct 23 | */ 24 | typedef struct { 25 | int type; 26 | union { 27 | tamFile tamr; 28 | bamFile bam; 29 | FILE *tamw; 30 | } x; 31 | bam_header_t *header; 32 | } samfile_t; 33 | 34 | #ifdef __cplusplus 35 | extern "C" { 36 | #endif 37 | 38 | /*! 39 | @abstract Open a SAM/BAM file 40 | 41 | @param fn SAM/BAM file name; "-" is recognized as stdin (for 42 | reading) or stdout (for writing). 43 | 44 | @param mode open mode /[rw](b?)(u?)(h?)([xX]?)/: 'r' for reading, 45 | 'w' for writing, 'b' for BAM I/O, 'u' for uncompressed BAM output, 46 | 'h' for outputting header in SAM, 'x' for HEX flag and 'X' for 47 | string flag. If 'b' present, it must immediately follow 'r' or 48 | 'w'. Valid modes are "r", "w", "wh", "wx", "whx", "wX", "whX", 49 | "rb", "wb" and "wbu" exclusively. 50 | 51 | @param aux auxiliary data; if mode[0]=='w', aux points to 52 | bam_header_t; if strcmp(mode, "rb")!=0 and @SQ header lines in SAM 53 | are absent, aux points to the file name of the list of the reference; 54 | aux is not used otherwise. If @SQ header lines are present in SAM, 55 | aux is not used, either. 56 | 57 | @return SAM/BAM file handler 58 | */ 59 | samfile_t *samopen(const char *fn, const char *mode, const void *aux); 60 | 61 | /*! 62 | @abstract Close a SAM/BAM handler 63 | @param fp file handler to be closed 64 | */ 65 | void samclose(samfile_t *fp); 66 | 67 | /*! 68 | @abstract Read one alignment 69 | @param fp file handler 70 | @param b alignment 71 | @return bytes read 72 | */ 73 | int samread(samfile_t *fp, bam1_t *b); 74 | 75 | /*! 
76 | @abstract Write one alignment 77 | @param fp file handler 78 | @param b alignment 79 | @return bytes written 80 | */ 81 | int samwrite(samfile_t *fp, const bam1_t *b); 82 | 83 | /*! 84 | @abstract Get the pileup for a whole alignment file 85 | @param fp file handler 86 | @param mask mask transferred to bam_plbuf_set_mask() 87 | @param func user defined function called in the pileup process 88 | @param data user provided data for func() 89 | */ 90 | int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data); 91 | 92 | char *samfaipath(const char *fn_ref); 93 | 94 | #ifdef __cplusplus 95 | } 96 | #endif 97 | 98 | #endif 99 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/sam_header.h: -------------------------------------------------------------------------------- 1 | #ifndef __SAM_HEADER_H__ 2 | #define __SAM_HEADER_H__ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | void *sam_header_parse2(const char *headerText); 9 | void *sam_header_merge(int n, const void **dicts); 10 | void sam_header_free(void *header); 11 | char *sam_header_write(const void *headerDict); // returns a newly allocated string 12 | 13 | char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n); 14 | 15 | void *sam_header2tbl(const void *dict, char type[2], char key_tag[2], char value_tag[2]); 16 | const char *sam_tbl_get(void *h, const char *key); 17 | int sam_tbl_size(void *h); 18 | void sam_tbl_destroy(void *h); 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/sample.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "sample.h" 4 | #include "khash.h" 5 | KHASH_MAP_INIT_STR(sm, int) 6 | 7 | bam_sample_t *bam_smpl_init(void) 8 | { 9 | bam_sample_t *s; 10 | s = calloc(1, 
sizeof(bam_sample_t)); 11 | s->rg2smid = kh_init(sm); 12 | s->sm2id = kh_init(sm); 13 | return s; 14 | } 15 | 16 | void bam_smpl_destroy(bam_sample_t *sm) 17 | { 18 | int i; 19 | khint_t k; 20 | khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid; 21 | if (sm == 0) return; 22 | for (i = 0; i < sm->n; ++i) free(sm->smpl[i]); 23 | free(sm->smpl); 24 | for (k = kh_begin(rg2smid); k != kh_end(rg2smid); ++k) 25 | if (kh_exist(rg2smid, k)) free((char*)kh_key(rg2smid, k)); 26 | kh_destroy(sm, sm->rg2smid); 27 | kh_destroy(sm, sm->sm2id); 28 | free(sm); 29 | } 30 | 31 | static void add_pair(bam_sample_t *sm, khash_t(sm) *sm2id, const char *key, const char *val) 32 | { 33 | khint_t k_rg, k_sm; 34 | int ret; 35 | khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid; 36 | k_rg = kh_get(sm, rg2smid, key); 37 | if (k_rg != kh_end(rg2smid)) return; // duplicated @RG-ID 38 | k_rg = kh_put(sm, rg2smid, strdup(key), &ret); 39 | k_sm = kh_get(sm, sm2id, val); 40 | if (k_sm == kh_end(sm2id)) { // absent 41 | if (sm->n == sm->m) { 42 | sm->m = sm->m? 
sm->m<<1 : 1; 43 | sm->smpl = realloc(sm->smpl, sizeof(void*) * sm->m); 44 | } 45 | sm->smpl[sm->n] = strdup(val); 46 | k_sm = kh_put(sm, sm2id, sm->smpl[sm->n], &ret); 47 | kh_val(sm2id, k_sm) = sm->n++; 48 | } 49 | kh_val(rg2smid, k_rg) = kh_val(sm2id, k_sm); 50 | } 51 | 52 | int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt) 53 | { 54 | const char *p = txt, *q, *r; 55 | kstring_t buf, first_sm; 56 | int n = 0; 57 | khash_t(sm) *sm2id = (khash_t(sm)*)sm->sm2id; 58 | if (txt == 0) { 59 | add_pair(sm, sm2id, fn, fn); 60 | return 0; 61 | } 62 | memset(&buf, 0, sizeof(kstring_t)); 63 | memset(&first_sm, 0, sizeof(kstring_t)); 64 | while ((q = strstr(p, "@RG")) != 0) { 65 | p = q + 3; 66 | r = q = 0; 67 | if ((q = strstr(p, "\tID:")) != 0) q += 4; 68 | if ((r = strstr(p, "\tSM:")) != 0) r += 4; 69 | if (r && q) { 70 | char *u, *v; 71 | int oq, or; 72 | for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u); 73 | for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v); 74 | oq = *u; or = *v; *u = *v = '\0'; 75 | buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf); 76 | add_pair(sm, sm2id, buf.s, r); 77 | if ( !first_sm.s ) 78 | kputs(r,&first_sm); 79 | *u = oq; *v = or; 80 | } else break; 81 | p = q > r? q : r; 82 | ++n; 83 | } 84 | if (n == 0) add_pair(sm, sm2id, fn, fn); 85 | // If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but 86 | // use the tag instead. 
87 | else if ( n==1 && first_sm.s ) 88 | add_pair(sm,sm2id,fn,first_sm.s); 89 | if ( first_sm.s ) 90 | free(first_sm.s); 91 | 92 | // add_pair(sm, sm2id, fn, fn); 93 | free(buf.s); 94 | return 0; 95 | } 96 | 97 | int bam_smpl_rg2smid(const bam_sample_t *sm, const char *fn, const char *rg, kstring_t *str) 98 | { 99 | khint_t k; 100 | khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid; 101 | if (rg) { 102 | str->l = 0; 103 | kputs(fn, str); kputc('/', str); kputs(rg, str); 104 | k = kh_get(sm, rg2smid, str->s); 105 | } else k = kh_get(sm, rg2smid, fn); 106 | return k == kh_end(rg2smid)? -1 : kh_val(rg2smid, k); 107 | } 108 | -------------------------------------------------------------------------------- /parsebam/samtools-0.1.18/sample.h: -------------------------------------------------------------------------------- 1 | #ifndef BAM_SAMPLE_H 2 | #define BAM_SAMPLE_H 3 | 4 | #include "kstring.h" 5 | 6 | typedef struct { 7 | int n, m; 8 | char **smpl; 9 | void *rg2smid, *sm2id; 10 | } bam_sample_t; 11 | 12 | bam_sample_t *bam_smpl_init(void); 13 | int bam_smpl_add(bam_sample_t *sm, const char *abs, const char *txt); 14 | int bam_smpl_rg2smid(const bam_sample_t *sm, const char *fn, const char *rg, kstring_t *str); 15 | void bam_smpl_destroy(bam_sample_t *sm); 16 | 17 | #endif 18 | --------------------------------------------------------------------------------