├── GPL_3.0.txt ├── README.txt ├── RELEASE.txt ├── align.cpp ├── align.h ├── bsp2sam.py ├── dbseq.cpp ├── dbseq.h ├── debian ├── changelog ├── compat ├── control ├── copyright ├── postinst ├── prerm └── rules ├── main.cpp ├── makefile ├── methratio.py ├── pairs.cpp ├── pairs.h ├── param.cpp ├── param.h ├── reads.cpp ├── reads.h ├── sam2bam.sh ├── samtools ├── AUTHORS ├── COPYING ├── ChangeLog ├── INSTALL ├── Makefile ├── Makefile.mingw ├── NEWS ├── bam.c ├── bam.h ├── bam2bed.c ├── bam_aux.c ├── bam_color.c ├── bam_endian.h ├── bam_import.c ├── bam_index.c ├── bam_lpileup.c ├── bam_maqcns.c ├── bam_maqcns.h ├── bam_mate.c ├── bam_md.c ├── bam_pileup.c ├── bam_plcmd.c ├── bam_rmdup.c ├── bam_rmdupse.c ├── bam_sort.c ├── bam_stat.c ├── bam_tview.c ├── bamtk.c ├── bgzf.c ├── bgzf.h ├── bgzip.c ├── examples │ ├── 00README.txt │ ├── Makefile │ ├── calDepth.c │ ├── ex1.fa │ └── ex1.sam.gz ├── faidx.c ├── faidx.h ├── glf.c ├── glf.h ├── kaln.c ├── kaln.h ├── khash.h ├── klist.h ├── knetfile.c ├── knetfile.h ├── kseq.h ├── ksort.h ├── kstring.c ├── kstring.h ├── misc │ ├── Makefile │ ├── blast2sam.pl │ ├── bowtie2sam.pl │ ├── export2sam.pl │ ├── interpolate_sam.pl │ ├── maq2sam.c │ ├── md5.c │ ├── md5.h │ ├── md5fa.c │ ├── novo2sam.pl │ ├── psl2sam.pl │ ├── sam2vcf.pl │ ├── samtools.pl │ ├── soap2sam.pl │ ├── wgsim.c │ ├── wgsim_eval.pl │ └── zoom2sam.pl ├── razf.c ├── razf.h ├── razip.c ├── sam.c ├── sam.h ├── sam_header.c ├── sam_header.h ├── sam_view.c ├── samtools.1 ├── samtools.txt ├── showbam.c └── win32 │ ├── libcurses.a │ ├── libz.a │ ├── xcurses.h │ ├── zconf.h │ └── zlib.h ├── utilities.cpp └── utilities.h /RELEASE.txt: -------------------------------------------------------------------------------- 1 | 04-09-2012 2 | release 2.6 3 | 1. optimized seed patterns for mapping speed. 4 | 2. improved pair-end mapping sensitivity slightly. 5 | 3. do not report unmapped reads by default, add option -u to report unmapped reads. 6 | 4. 
use thread safe random number generator in selecting multiple mappings. 7 | 5. add 95% confidence interval for estimated methylation ratios in the methratio.py output. 8 | 6. add options in methratio.py script 9 | -t, --trim-fillin trim fill-in nucleotides in end-repairing. 10 | -r, --remove-duplicate remove duplicated hits to reduce PCR bias. 11 | -g, --combine-CpG combine CpG methylation ratios from both strands. 12 | -m, --min-depth report loci with sequencing depth >= FOLD 13 | 14 | 03-14-2012 15 | release 2.5 16 | 1. restore option -r [0,1] to report multiple mapped reads: 17 | -r 0: only report unique mapped reads 18 | -r 1: report one random hit if multiple hits have the least number of mismatches 19 | 2. set the default number of threads (-p option) to the number of CPU cores detected (up to 8). 20 | 3. execute sam2bam.sh and samtools in system default path for BAM format output. 21 | 4. added python script bsp2sam.py to convert BSP format output to SAM format. 22 | 23 | 01-27-2012 24 | release 2.43 25 | 1. fixed a bug in methratio.py 26 | 27 | 01-19-2012 28 | release 2.42 29 | 1. corrected the methylation ratio extraction for overlapping paired hits. (for SAM output only) 30 | nucleotides in overlapped part will be counted once instead of twice. 31 | 32 | 01-11-2012 33 | release 2.4 34 | 1. optimized seed patterns, up to 40% faster than v2.3. 35 | 2. change -z option from -z to -z , since the zero quality char '!' is a special char in linux shell 36 | -z @ is now -z 64 (solexa quality) 37 | -z "!" is now -z 33 (sanger quality), which is the default setting. 38 | 3. added -L option to map the first N nucleotides of a read. 39 | 4. re-wrote the methratio.py script, much faster and use less memory (~26GB) for 40 | whole genome methylation ratio extraction.
41 | changed the syntax of options: 42 | ref= is changed to -d or --ref= 43 | chrom= is changed to -c or --chr= 44 | added options: 45 | -u, --unique process only unique mappings 46 | -p, --pair process only paired mappings 47 | -s, --sam-path set the path to samtools 48 | 49 | 12-12-2011 50 | release 2.3 51 | 1. added -M option to allow other special alignments instead of C=>T bisulfite 52 | alignment, for example, -M GA could be used to detect A=>I editing in RNA-seq. 53 | 2. added -n option to set mapping strand information. -n 0 only map reads to 2 54 | forward strands, and -n 1 map reads to all 4 possible strands 55 | 3. the output reference sequence was expanded by two nucleotides (in lower 56 | cases) to include sequence context information. (ie, CG, CHG or CHH) 57 | 4. added two options for the methratio.py script. 58 | chrom= select the chromosomes to process, so that large 59 | dataset could be broken into chromosomes to use less RAM. 60 | ref= specify the reference fasta file when the BSMAP output 61 | does not include reference sequence. 62 | 63 | 08-18-2011 64 | release 2.2 65 | 1. rewrote the genome index to reduce the memory usage. (from 13G to 8.5G for human genome shotgun mapping). 66 | 2. samtools library was reverted back to 0.1.7a for compatibility on PowerPC platform. 67 | 68 | 05-20-2011 69 | release 2.1 70 | 1. trim short adapter sequences for properly paired mappings. 71 | 2. optimized seed ordering and RRBS mode indexing. 72 | 73 | 05-13-2011 74 | release 2.01 75 | 1. report identical reads name for pair-end reads. 76 | 77 | release 2.0 78 | 05-04-2011 79 | 1. added -D option for RRBS mapping mode, i.e. "-D C-CGG" to specify the 80 | digestion site information in RRBS. 81 | 2. added -S to set random seed in multiple hits selection 82 | 3. fixed a bug in reporting quality chars when the input uses non-sanger quality 83 | 4. pair-end module is optimized to be 3 times faster than 1.x version 84 | 85 | release: 1.25 86 | 04-20-2011 87 | 1.
corrected the 0x40/0x80 flag in SAM format output. 88 | 89 | release: 1.2 90 | 04-12-2011 91 | 1. added option '-I' to specify index interval, allowing memory/sensitivity trade off. 92 | 2. corrected the sign of insert_size of the right most read in pair-end mapping. 93 | 3. added command line syntax '-x=', in addition to the old '-x ' syntax. 94 | 95 | release: 1.15 96 | 04-08-2011 97 | 1. optimized indexing and alignment with cache prefetching. 98 | 2. updated methratio.py script for methylation ratio calling. 99 | 100 | release: 1.12 101 | 03-29-2011 102 | 1. add SAM/BAM format input. 103 | 2. optimized the seed table and pair-end module. 104 | 3. resolved the compilation issue with GCC4.3+. 105 | 106 | release: 1.08 107 | 01-05-2011 108 | 1. fixed a bug in reporting strand info in SAM format output. 109 | 2. updated the methylation ratio calling script. 110 | 111 | release: 1.07 112 | 09-08-2010 113 | 1. fixed a bug in matching pair-end strands. 114 | 115 | release: 1.06 116 | 08-27-2010 117 | 1. pair-end module was rewritten 118 | 2. added SAM format output, and added insertion size for paired mapping in BSP format output 119 | 3. output mode option -j was removed 120 | 4. seed segment number option -k was removed, seed segment number will be calculated as (read_len-3)/seed_size 121 | 122 | release: 1.02 123 | 12-21-2009 124 | 1. fix a bug for 96 bp reads mapping 125 | 2. index mode option -i is removed 126 | 127 | release: 1.0 128 | 10-25-2009 129 | 1. extended supported max read length to 96bp and max seed size to 16 bp 130 | 2. modified output format, added position information of methyC and non-methyC 131 | 3. added option to choose seed segment numbers to allow 2+ mismatches at full sensitivity 132 | 4. optimized the hash table structure and greatly improved the speed. (e.g. 10X faster for 60bp+ reads) 133 | 5. rewrote user manual (README.txt) 134 | 6.
gap alignment and index mode are deprecated and will be removed in the next release 135 | 136 | release: 0.99 137 | 08-13-2009 138 | 1. fixed a bug of overflow at the end of BSC reference sequence. 139 | 140 | release: 0.98 141 | 08-10-2009 142 | 1. fixed a bug in methratio.py that may underestimate the methylation ratio. 143 | 2. small optimizations to improve the mapping speed, especially for reads longer than 48bp. 144 | 145 | release: 0.97 146 | 07-24-2009 147 | 1. added downstream tool (methratio.py) to calculate the methylation ratios from mapping results. 148 | 2. fixed a bug of not recording the best match in a rare condition. 149 | 150 | release: 0.96 151 | 06-24-2009 152 | 1. added downstream tool (bsmap2wig.py) to convert mapping results to WIG file for visualization. 153 | 2. fixed a bug of counting duplicated mappings. 154 | 155 | release: 0.94 156 | 05-05-2009 157 | 1. integrated rc-genome tool wcref into the main program bsmap. 158 | 2. adjusted output format for the strand information: 159 | '++': aligned to BSW, the forward strand of Watson strand of reference. 160 | '+-': aligned to BSWC, the reverse complementary strand of Watson strand of reference. 161 | '-+': aligned to BSC, the forward strand of Crick strand of reference. 162 | '--': aligned to BSCC, the reverse complementary strand of Crick strand of reference. 163 | 3. all alignment positions are now referring to the 5'-end coordinates of the mapping region on the Watson strand of reference sequence, all coordinates are 1-based. Formerly, mappings on BSC/BSCC were 5'-end coordinates on Crick strand of reference. 164 | 165 | release: 0.93 166 | 04-30-2009 167 | 1. change the 4-segment seeding to 3-segment seeding to reduce memory usage. 168 | 2. combine bsmap.a and bsmap.b into bsmap, adding new parameter -i to distinguish different index mode. 169 | 3. added CpG mode that only searches methylated C in CpG context. (index mode -i 2) 170 | 171 | release: 0.91 172 | 03-30-2009 173 | 1.
first formal release with documents. 174 | 2. included genome conversion tool wcref. 175 | 176 | release: 0.9 177 | 11-05-2008 178 | 1. testing version. 179 | -------------------------------------------------------------------------------- /align.h: -------------------------------------------------------------------------------- 1 | #ifndef _ALIGN_H_ 2 | #define _ALIGN_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "param.h" 12 | #include "reads.h" 13 | #include "dbseq.h" 14 | 15 | using namespace std; 16 | 17 | const int FIXSIZE=SEGLEN*FIXELEMENT; 18 | typedef Hit HitArray[MAXHITS+1]; 19 | 20 | extern Param param; 21 | extern char rev_char[]; 22 | extern char chain_flag[]; 23 | 24 | class SingleAlign 25 | { 26 | public: 27 | SingleAlign(); 28 | ~SingleAlign(); 29 | void ImportFileFormat(int format); 30 | void ImportBatchReads(bit32_t n, vector &a); 31 | int CountNs(); 32 | void set_RRBS_start(); //by yxi 33 | int TrimLowQual(); 34 | void ConvertBinaySeq(); 35 | inline void GenerateSeeds(int n, int start); 36 | inline void GenerateCSeeds(int n, int cstart); 37 | int CountSeeds(RefSeq &ref, int n, int start); 38 | int CountCSeeds(RefSeq &ref, int n, int cstart); 39 | 40 | //inline void GenerateSeeds_2(int n); 41 | //inline void GenerateSeeds_3(int n); 42 | inline unsigned int CountMismatch(bit64_t *q, bit64_t *r, bit64_t *s); 43 | //void SnpAlign_0(RefSeq &ref); 44 | void SortHits(int n); 45 | //void SortcHits(int n); 46 | //void SnpAlign_1(RefSeq &ref); 47 | inline bool equal_loc(Hit a); 48 | //void SnpAlign_2(RefSeq &ref); 49 | void ClearHits(); 50 | int RunAlign(RefSeq &ref); 51 | int FilterReads(); 52 | //int CountStringMismatch(int offset, string &s1, string s2); 53 | void Do_Batch(RefSeq &ref); 54 | void StringAlign(RefSeq &ref, string &os); 55 | void Reverse_Seq(); 56 | void Reverse_Qual(); 57 | void s_OutHit(int chain, int n, bit8_t nspsn, Hit *hit, int insert_size, RefSeq &ref, string &os); 58 | 
//void s_OutGapHit(int chain, size_t n, bit8_t g, Hit *hit, RefSeq &ref, string &os) {}; 59 | 60 | //by yxi 61 | void SnpAlign(RefSeq &ref, int mode); 62 | int TrimAdapter(); 63 | void SortHits4PE(int n); 64 | void Fix_Unpaired_Short_Fragment(RefSeq &ref); 65 | void ReorderSeed(RefSeq &ref); 66 | bit32_t GetTotalSeedLoc(RefSeq &ref, int start); 67 | bit32_t GetTotalCSeedLoc(RefSeq &ref, int start); 68 | void AdjustSeedStartArray(RefSeq &ref); 69 | void AdjustCSeedStartArray(RefSeq &ref); 70 | 71 | public: 72 | 73 | //Hot data section 74 | bit24_t bseq[SEGLEN][10]; 75 | bit24_t reg[SEGLEN][10]; 76 | bit24_t cbseq[SEGLEN][10]; 77 | bit24_t creg[SEGLEN][10]; 78 | 79 | bit32_t seeds[MAXSNPS+1][16]; 80 | bit32_t cseeds[MAXSNPS+1][16]; 81 | 82 | bit32_t seed_array[144]; 83 | bit32_t cseed_array[144]; 84 | 85 | int _cur_n_hit[MAXSNPS+1]; 86 | int _cur_n_chit[MAXSNPS+1]; 87 | 88 | bit32_t snp_thres; 89 | bit32_t rand_rSeed; // thread safe RNG seed 90 | int seed_start_offset, cseed_start_offset; 91 | int seed_start_array[MAXSNPS+1], cseed_start_array[MAXSNPS+1]; 92 | bit32_t cseed_offset; 93 | set *hitset; //, *chitset; 94 | vector > seedindex, cseedindex; 95 | SeedProfile *_pro; 96 | bit32_t _seed; 97 | Hit _hit; 98 | Hit *_refloc; 99 | ref_loc_t *_refloc2, *_refchr2; 100 | bit32_t tmp_snp; 101 | bit32_t _hitz; 102 | bit32_t _ref_chr_count; 103 | 104 | //Hit hits[MAXSNPS+1][MAXHITS+1]; 105 | //Hit chits[MAXSNPS+1][MAXHITS+1]; 106 | HitArray *hits, *chits; 107 | 108 | //cold data section 109 | int _format; 110 | bool flag_chain, cflag_chain; 111 | vector::iterator _pread; 112 | string _ori_read_seq; 113 | string _ori_read_qual; 114 | string _revseq; 115 | string _revqual; 116 | bit32_t num_reads; 117 | vector mreads; 118 | bit32_t n_aligned; 119 | string _str_align; //align results, prepare for output 120 | int raw_readlen; 121 | int read_max_snp_num; 122 | vector::iterator read_motif_iter; 123 | int seedseg_num; 124 | 125 | //local variables 126 | string::iterator _sp; 127 
| string::reverse_iterator _sq; 128 | string _str; 129 | 130 | char _ch[1024]; 131 | pair seg_info; 132 | char _mapseq[256]; 133 | string::iterator _readnt, _adapternt; 134 | }; 135 | 136 | /*n=0: ab; 1: cd; 2: bc; 3: ac; 4: bd; 5: ad*/ 137 | //n<3: ab, cd, bc 138 | inline void SingleAlign::GenerateSeeds(int n, int start) 139 | { 140 | int i; 141 | //cout<<"cseed_offset="<a+start]; 146 | } 147 | else{ 148 | for(i=0,_pro=param.profile[n];ia+start-i]; 149 | } 150 | } 151 | 152 | inline void SingleAlign::GenerateCSeeds(int n, int cstart) 153 | { 154 | int i; 155 | //cout<<"cseed_offset="<a+cseed_offset+cstart]; 160 | } 161 | else{ 162 | for(i=0,_pro=param.profile[n];ia+cstart-i]; 163 | } 164 | } 165 | 166 | 167 | inline unsigned int SingleAlign::CountMismatch(register bit64_t *q, register bit64_t *r, register bit64_t *s) 168 | { 169 | 170 | #ifdef READ_48 171 | return param.XM64((*q&param.XC64(*s)^*s)&*r)+param.XM64((*(q+1)&param.XC64(*(s+1))^*(s+1))&*(r+1)); 172 | 173 | #endif 174 | #ifdef READ_80 175 | 176 | if((tmp_snp=param.XM64((*q&param.XC64(*s)^*s)&*r))>snp_thres) 177 | return tmp_snp; 178 | 179 | return tmp_snp+param.XM64((*(q+1)&param.XC64(*(s+1))^*(s+1))&*(r+1)) 180 | +param.XM64((*(q+2)&param.XC64(*(s+2))^*(s+2))&*(r+2)); 181 | 182 | /* 183 | return tmp_snp+param.XM64X2((*((bit64_t*)q)&param.XC64(*((bit64_t*)s))^*((bit64_t*)s))&*((bit64_t*)r), 184 | (*((bit64_t*)q+2)&param.XC64(*((bit64_t*)s+2))^*((bit64_t*)s+2))&*((bit64_t*)r+2)); 185 | */ 186 | #endif 187 | #ifdef READ_144 188 | 189 | if((tmp_snp=param.XM64((*q&param.XC64(*s)^*s)&*r))>snp_thres) 190 | return tmp_snp; 191 | 192 | if((tmp_snp+=param.XM64((*(q+1)&param.XC64(*(s+1))^*(s+1))&*(r+1)))>snp_thres) 193 | return tmp_snp; 194 | 195 | return tmp_snp+param.XM64((*(q+2)&param.XC64(*(s+2))^*(s+2))&*(r+2)) 196 | +param.XM64((*(q+3)&param.XC64(*(s+3))^*(s+3))&*(r+3)) 197 | +param.XM64((*(q+4)&param.XC64(*(s+4))^*(s+4))&*(r+4)); 198 | 199 | #endif 200 | } 201 | 202 | inline void SingleAlign::Reverse_Seq() 203 | { 204 | _revseq=_pread->seq; 205 |
reverse(_revseq.begin(), _revseq.end()); 206 | for(string::iterator p=_revseq.begin(); p!=_revseq.end(); ++p) 207 | *p=rev_char[*p]; 208 | } 209 | 210 | inline void SingleAlign::Reverse_Qual() 211 | { 212 | _revqual=_pread->qual; 213 | reverse(_revqual.begin(), _revqual.end()); 214 | } 215 | #endif //_ALIGN_H_ 216 | -------------------------------------------------------------------------------- /bsp2sam.py: -------------------------------------------------------------------------------- 1 | import sys, optparse, time 2 | 3 | usage = "usage: %prog [options] BSMAP_MAPPING_FILE" 4 | parser = optparse.OptionParser(usage=usage) 5 | 6 | parser.add_option("-o", "--out", dest="outfile", metavar="FILE", help="output file name. (required)", default="") 7 | parser.add_option("-d", "--ref", dest="reffile", metavar="FILE", help="reference genome fasta file. (required)", default="") 8 | parser.add_option("-q", "--quiet", action="store_true", dest="quiet", help="don't print progress on stderr.", default=False) 9 | 10 | options, infile = parser.parse_args() 11 | infile = infile[0] 12 | 13 | def disp(txt, nt=0): 14 | if not options.quiet: print >> sys.stderr, ''.join(['\t' for i in xrange(nt)]+['@ ',time.asctime(),': ',txt]) 15 | 16 | assert any(options.reffile), "Missing reference file, must set -d/--ref." 17 | assert any(options.outfile), "Missing output file, must set -o/--out." 18 | 19 | fout = open(options.outfile, 'w') 20 | disp('reading reference %s ...' 
% options.reffile) 21 | fout.write('@HD VN:1.0\n') 22 | cr, crlen = '', 0 23 | for line in open(options.reffile): 24 | if line[0] == '>': 25 | if any(cr): fout.write('@SQ\tSN:%s\tLN:%d\n' % (cr,crlen)) 26 | cr, crlen = line[1:].split()[0], 0 27 | else: crlen += len(line) - 1 28 | 29 | fout.write('@SQ\tSN:%s\tLN:%d\n@PG\tID:BSMAP_2.43\n' % (cr,crlen)) 30 | 31 | n = 0 32 | for line in open(infile): 33 | col = line[:-1].split('\t') 34 | name, read, qual, flag = col[:4] 35 | n += 1 36 | if n % 10000000 == 0: disp('read %d lines' % n, nt=1) 37 | if flag == 'NM': fout.write('%s\tu\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n' % (name,read,qual)) 38 | elif flag == 'QC': fout.write('%s\tuf\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n' % (name,read,qual)) 39 | else: 40 | cr, pos, strand, ref, mm, samflag = col[4], col[5], col[6], col[8], col[9], '' 41 | if strand == '+-' or strand == '-+': samflag += 'r' 42 | if flag == 'MA' or flag == 'OF': samflag += 's' 43 | fout.write('%s\t%s\t%s\t%s\t255\t%dM\t*\t0\t0\t%s\t%s\tNM:i:%s\tZS:Z:%s\n' % (name,samflag,cr,pos,len(read),read,qual,mm,strand)) 44 | 45 | fout.close() -------------------------------------------------------------------------------- /dbseq.h: -------------------------------------------------------------------------------- 1 | #ifndef _DBSEQ_H_ 2 | #define _DBSEQ_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "param.h" 10 | #include "utilities.h" 11 | 12 | #define PREFETCH_CAL_INDEX 16 13 | #define PREFETCH_CRT_INDEX 8 14 | #define PREFETCH_LOOP 8U 15 | #define REF_MARGIN 400 16 | 17 | using namespace std; 18 | 19 | struct OneBfa 20 | { 21 | bit32_t n; //count 22 | bit24_t *s; 23 | }; 24 | struct RefTitle 25 | { 26 | string name; 27 | bit32_t size; 28 | //added by yxi 29 | bit32_t rc_offset; 30 | }; 31 | struct Block 32 | { 33 | bit32_t id; 34 | bit32_t begin; 35 | bit32_t end; 36 | }; 37 | struct KmerLoc 38 | { 39 | bit32_t n1; //ab, ac, ad seed 40 | Hit *loc1; 41 | }; 42 | 43 | struct shortHit{ 44 | 
ref_id_t chr; 45 | ref_loc_t loc; 46 | }; 47 | 48 | struct shorthitcompclass{ 49 | bool operator()(shortHit a, shortHit b) { 50 | if(a.locb.loc) return 0; 52 | else if(a.chr CCGG_seglen(ref_id_t chr, ref_loc_t pos, int readlen); 84 | ref_loc_t hit2int(Hit h); 85 | Hit int2hit(ref_loc_t p, int c); 86 | 87 | public: 88 | int total_num; 89 | bit64_t sum_length; 90 | vector bfa; 91 | bit32_t total_kmers; 92 | KmerLoc *index; 93 | vector title; 94 | protected: 95 | ref_id_t _count; 96 | string _name; 97 | string _seq; 98 | ref_loc_t _length; 99 | shortHit tmploc; 100 | public: 101 | vector _blocks; //unmasked ref region 102 | //map ccgg_seglen; 103 | 104 | //by yxi 105 | //int max_seedseg_num; 106 | vector > CCGG_index[50]; 107 | vector > CCGG_sites; 108 | vector *CCGG_sites_chr; 109 | int n_CCGG; 110 | NewIndex *index2; 111 | bit32_t *index2_count; 112 | bit32_t *refcat, *crefcat; 113 | vector ref_anchor, cref_anchor; 114 | }; 115 | 116 | #endif //_DBSEQ_H_ 117 | -------------------------------------------------------------------------------- /debian/changelog: -------------------------------------------------------------------------------- 1 | bsmap2.6 (2.6-1) unstable; urgency=low 2 | 3 | * Release 2.6. 
4 | 5 | -- Anthony Brummett Wed, 11 Jul 2012 15:20:00 -0600 6 | -------------------------------------------------------------------------------- /debian/compat: -------------------------------------------------------------------------------- 1 | 7 2 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: bsmap2.6 2 | Section: science 3 | Priority: optional 4 | Build-Depends: debhelper (>= 7), zlib1g-dev 5 | Maintainer: The Genome Institute 6 | Homepage: http://code.google.com/p/bsmap/ 7 | 8 | Package: bsmap2.6 9 | Architecture: amd64 10 | Depends: zlib1g 11 | Description: BSMAP is a short reads mapping software for bisulfite sequencing reads. Bisulfite treatment converts unmethylated Cytosines into Uracils (sequenced as Thymine) and leaves methylated Cytosines unchanged, hence provides a way to study DNA cytosine methylation at single nucleotide resolution. BSMAP aligns the Ts in the reads to both Cs and Ts in the reference.
12 | 13 | -------------------------------------------------------------------------------- /debian/postinst: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # see: dh_installdeb(1) 4 | 5 | # summary of how this script can be called: 6 | # * `configure' 7 | # * `abort-upgrade' 8 | # * `abort-remove' `in-favour' 9 | # 10 | # * `abort-remove' 11 | # * `abort-deconfigure' `in-favour' 12 | # `removing' 13 | # 14 | # for details, see http://www.debian.org/doc/debian-policy/ or 15 | # the debian-policy package 16 | 17 | 18 | EXECUTABLES="bsmap" 19 | MANPAGES="" 20 | PRIORITY=1 21 | EXE_VERSION_SUFFIX=2.6 22 | 23 | set -e 24 | 25 | case "$1" in 26 | configure) 27 | for e in $EXECUTABLES 28 | do 29 | BARE_EXE=/usr/bin/$e 30 | VERSIONED_EXE="/usr/bin/${e}${EXE_VERSION_SUFFIX}" 31 | update-alternatives --install $BARE_EXE $e $VERSIONED_EXE $PRIORITY 32 | done 33 | 34 | for m in $MANPAGES 35 | do 36 | BARE_MAN=/usr/share/man/man1/$m.1.gz 37 | VERSIONED_MAN=/usr/share/man/man1/${m}${EXE_VERSION_SUFFIX}.1.gz 38 | update-alternatives --install $BARE_MAN $m.1.gz $VERSIONED_MAN $PRIORITY 39 | done 40 | ;; 41 | 42 | abort-upgrade|abort-remove|abort-deconfigure) 43 | ;; 44 | 45 | *) 46 | echo "postinst called with unknown argument \`$1'" >&2 47 | exit 1 48 | ;; 49 | esac 50 | 51 | # dh_installdeb will replace this with shell code automatically 52 | # generated by other debhelper scripts. 
53 | 54 | #DEBHELPER# 55 | 56 | exit 0 57 | -------------------------------------------------------------------------------- /debian/prerm: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # see: dh_installdeb(1) 4 | 5 | # summary of how this script can be called: 6 | # * `remove' 7 | # * `upgrade' 8 | # * `failed-upgrade' 9 | # * `remove' `in-favour' 10 | # * `deconfigure' `in-favour' 11 | # `removing' 12 | # 13 | # for details, see http://www.debian.org/doc/debian-policy/ or 14 | # the debian-policy package 15 | 16 | 17 | EXECUTABLES="bsmap" 18 | MANPAGES="" 19 | EXE_VERSION_SUFFIX=2.6 20 | 21 | set -e 22 | 23 | case "$1" in 24 | remove|upgrade|deconfigure) 25 | for e in $EXECUTABLES 26 | do 27 | VERSIONED_EXE="/usr/bin/${e}${EXE_VERSION_SUFFIX}" 28 | update-alternatives --remove $e $VERSIONED_EXE 29 | done 30 | 31 | for m in $MANPAGES 32 | do 33 | VERSIONED_MAN="/usr/share/man/man1/${m}${EXE_VERSION_SUFFIX}.1.gz" 34 | update-alternatives --remove $m.1.gz $VERSIONED_MAN 35 | done 36 | ;; 37 | 38 | failed-upgrade) 39 | ;; 40 | 41 | *) 42 | echo "prerm called with unknown argument \`$1'" >&2 43 | exit 1 44 | ;; 45 | esac 46 | 47 | # dh_installdeb will replace this with shell code automatically 48 | # generated by other debhelper scripts. 
49 | 50 | #DEBHELPER# 51 | 52 | exit 0 53 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | PKG=$(shell dh_listpackages) 4 | VERSION=2.6 5 | 6 | %: 7 | dh $@ 8 | 9 | override_dh_auto_install: 10 | dh_auto_install 11 | for FILE in ./debian/$(PKG)/usr/bin/*; do \ 12 | mv $$FILE ./debian/$(PKG)/usr/bin/`basename $$FILE`$(VERSION); \ 13 | done 14 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | CC= g++ 2 | 3 | BIN = $(DESTDIR)/usr/bin 4 | FLAGS= -DMAXHITS=1000 -DTHREAD -funroll-loops -Lsamtools -Isamtools -O3 5 | 6 | OLIGOLEN= -DREAD_144 7 | # options: -DREAD_48, -DREAD_80, -DREAD_144 8 | 9 | LIBS= 10 | THREAD= -lpthread 11 | 12 | SOURCE = align dbseq main pairs param reads utilities 13 | OBJS1= $(patsubst %,%.o,$(SOURCE)) 14 | 15 | all: bsmap 16 | %.o:%.cpp 17 | $(CC) $(FLAGS) $(LIBS) $(REF_MODE) $(OLIGOLEN) -c $< -o $@ 18 | bsmap: $(OBJS1) 19 | (cd samtools; make) 20 | $(CC) $(FLAGS) $(LIBS) $(REF_MODE) $(OLIGOLEN) $^ -o $@ $(THREAD) -lbam -lz 21 | rm -f *.o 22 | 23 | clean: 24 | rm -f *.o *~ bsmap 25 | (cd samtools; make clean) 26 | install: 27 | install -d $(BIN) 28 | install ./bsmap $(BIN) 29 | install ./sam2bam.sh $(BIN) 30 | install ./methratio.py $(BIN) 31 | -------------------------------------------------------------------------------- /methratio.py: -------------------------------------------------------------------------------- 1 | import sys, time, os, array, optparse 2 | usage = "usage: %prog [options] BSMAP_MAPPING_FILES" 3 | parser = optparse.OptionParser(usage=usage) 4 | 5 | parser.add_option("-o", "--out", dest="outfile", metavar="FILE", help="output file name. 
(required)", default="") 6 | parser.add_option("-d", "--ref", dest="reffile", metavar="FILE", help="reference genome fasta file. (required)", default="") 7 | parser.add_option("-c", "--chr", dest="chroms", metavar="CHR", help="process only specified chromosomes, separated by ','. [default: all]\nexample: --chroms=chr1,chr2", default=[]) 8 | parser.add_option("-s", "--sam-path", dest="sam_path", metavar="PATH", help="path to samtools. [default: none]", default='') 9 | parser.add_option("-u", "--unique", action="store_true", dest="unique", help="process only unique mappings/pairs.", default=False) 10 | parser.add_option("-p", "--pair", action="store_true", dest="pair", help="process only properly paired mappings.", default=False) 11 | parser.add_option("-z", "--zero-meth", action="store_true", dest="meth0", help="report loci with zero methylation ratios.", default=False) 12 | parser.add_option("-q", "--quiet", action="store_true", dest="quiet", help="don't print progress on stderr.", default=False) 13 | parser.add_option("-r", "--remove-duplicate", action="store_true", dest="rm_dup", help="remove duplicated reads.", default=False) 14 | parser.add_option("-t", "--trim-fillin", dest="trim_fillin", type="int", metavar='N', help="trim N end-repairing fill-in nucleotides. [default: 2]", default=2) 15 | parser.add_option("-g", "--combine-CpG", action="store_true", dest="combine_CpG", help="combine CpG methylaion ratios on both strands.", default=False) 16 | parser.add_option("-m", "--min-depth", dest="min_depth", type="int", metavar='FOLD', help="report loci with sequencing depth>=FOLD. 
[default: 1]", default=1) 17 | 18 | options, infiles = parser.parse_args() 19 | 20 | if len(options.reffile) == 0: parser.error("Missing reference file, use -d or --ref option.") 21 | if len(options.outfile) == 0: parser.error("Missing output file name, use -o or --out option.") 22 | if len(infiles) == 0: parser.error("Require at least one BSMAP_MAPPING_FILE.") 23 | if any(options.chroms): options.chroms = options.chroms.split(',') 24 | 25 | if any(options.sam_path): 26 | if options.sam_path[-1] != '/': options.sam_path += '/' 27 | 28 | def disp(txt, nt=0): 29 | if not options.quiet: print >> sys.stderr, ''.join(['\t' for i in xrange(nt)]+['@ ',time.asctime(),': ',txt]) 30 | 31 | def get_alignment(line): 32 | col = line.split('\t') 33 | if sam_format: 34 | flag = col[1] 35 | if 'u' in flag: return [] 36 | if options.unique and 's' in flag: return [] 37 | if options.pair and 'P' not in flag: return [] 38 | cr, pos, seq, strand, insert = col[2], int(col[3])-1, col[9], '', int(col[8]) 39 | if cr not in options.chroms: return [] 40 | for aux in col[11:]: 41 | if aux[:5] == 'ZS:Z:': 42 | strand = aux[5:7] 43 | break 44 | if strand == '': raise ValueError 45 | else: 46 | flag = col[3][:2] 47 | if flag == 'NM' or flag == 'QC': return [] 48 | if options.unique and flag != 'UM': return [] 49 | if options.pair and col[7] == '0': return [] 50 | seq, strand, cr, pos, insert = col[1], col[6], col[4], int(col[5])-1, int(col[7]) 51 | if cr not in options.chroms: return [] 52 | if options.rm_dup: # remove duplicate hits 53 | if strand == '+-' or strand == '-+': frag_end, direction = pos+len(seq), 2 54 | else: frag_end, direction = pos, 1 55 | if coverage[cr][frag_end] & direction: return [] 56 | coverage[cr][frag_end] |= direction 57 | if options.trim_fillin > 0: # trim fill in nucleotides 58 | if strand == '+-': seq = seq[:-options.trim_fillin] 59 | elif strand == '--': seq, pos = seq[options.trim_fillin:], pos+options.trim_fillin 60 | elif insert != 0 and len(seq) > abs(insert) 
- options.trim_fillin: 61 | trim_nt = len(seq) - (abs(insert) - options.trim_fillin) 62 | if strand == '++': seq = seq[:-trim_nt] 63 | elif strand == '-+': seq, pos =seq[trim_nt:], pos+trim_nt 64 | if sam_format and insert > 0: seq = seq[:int(col[7])-1-pos] # remove overlapped regions in paired hits, SAM format only 65 | return (seq, strand[0], cr, pos) 66 | 67 | ref, cr, seq = {}, '', '' 68 | disp('reading reference %s ...' % options.reffile) 69 | for line in open(options.reffile): 70 | if line[0] == '>': 71 | if any(cr): 72 | if len(options.chroms) == 0 or cr in options.chroms: ref[cr] = seq.upper() 73 | cr, seq = line[1:-1].split()[0], '' 74 | else: seq += line.strip() 75 | 76 | if len(options.chroms) == 0 or cr in options.chroms: ref[cr] = seq.upper() 77 | del seq 78 | 79 | meth, depth, coverage = {}, {}, {} 80 | for cr in ref: 81 | meth[cr] = array.array('I', [0]) * len(ref[cr]) 82 | depth[cr] = array.array('I', [0]) * len(ref[cr]) 83 | if options.rm_dup: coverage[cr] = array.array('B', [0]) * len(ref[cr]) 84 | 85 | options.chroms = set(ref.keys()) 86 | 87 | BS_conversion = {'+': ('C','T'), '-': ('G','A')} 88 | nmap = 0 89 | for infile in infiles: 90 | nline = 0 91 | disp('reading %s ...' 
% infile) 92 | if infile[-4:].upper() == '.SAM': sam_format, fin = True, os.popen('%ssamtools view -XS %s' % (options.sam_path, infile)) 93 | elif infile[-4:].upper() == '.BAM': sam_format, fin = True, os.popen('%ssamtools view -X %s' % (options.sam_path, infile)) 94 | else: sam_format, fin = False, open(infile) 95 | for line in fin: 96 | nline += 1 97 | if nline % 10000000 == 0: disp('read %d lines' % nline, nt=1) 98 | map_info = get_alignment(line) 99 | if len(map_info) == 0: continue 100 | seq, strand, cr, pos = map_info 101 | depthcr = depth[cr] 102 | if pos + len(seq) > len(depthcr): continue 103 | nmap += 1 104 | methcr = meth[cr] 105 | refseq = ref[cr][pos:pos+len(seq)] 106 | match, convert = BS_conversion[strand] 107 | index = refseq.find(match) 108 | while index >= 0: 109 | if seq[index] == convert: depthcr[pos+index] += 1 110 | elif seq[index] == match: 111 | methcr[pos+index] += 1 112 | depthcr[pos+index] += 1 113 | index = refseq.find(match, index+1) 114 | 115 | fin.close() 116 | 117 | if options.combine_CpG: 118 | disp('combining CpG methylation from both strands ...') 119 | for cr in depth: 120 | methcr, depthcr, refcr = depth[cr], meth[cr], ref[cr] 121 | pos = refcr.find('CG') 122 | while pos >= 0: 123 | depthcr[pos] += depthcr[pos+1] 124 | methcr[pos] += methcr[pos+1] 125 | depthcr[pos+1] = 0 126 | methcr[pos+1] = 0 127 | pos = refcr.find('CG', pos+2) 128 | 129 | disp('writing %s ...' 
% options.outfile) 130 | ss = {'C': '+', 'G': '-'} 131 | fout = open(options.outfile, 'w') 132 | z95, z95sq = 1.96, 1.96 * 1.96 133 | fout.write('chr\tpos\tstrand\tcontext\tratio\ttotal_C\tmethy_C\tCI_lower\tCI_upper\n') 134 | nc, nd, dep0 = 0, 0, options.min_depth 135 | for cr in sorted(depth.keys()): 136 | depthcr, methcr, refcr = depth[cr], meth[cr], ref[cr] 137 | for i, d in enumerate(depthcr): 138 | if d < dep0: continue 139 | nc += 1 140 | nd += d 141 | m = methcr[i] 142 | if m == 0 and not options.meth0: continue 143 | ratio = float(m) / d 144 | seq = refcr[i-2:i+3] 145 | strand = ss[refcr[i]] 146 | pmid = ratio + z95sq / (2 * d) 147 | sd = z95 * ((ratio*(1-ratio)/d + z95sq/(4*d*d)) ** 0.5) 148 | norminator = 1 + z95sq / d 149 | CIl, CIu = (pmid - sd) / norminator, (pmid + sd) / norminator 150 | fout.write('%s\t%d\t%c\t%s\t%.3f\t%d\t%d\t%.3f\t%.3f\n' % (cr, i+1, strand, seq, ratio, d, m, CIl, CIu)) 151 | 152 | fout.close() 153 | disp('done.') 154 | print 'total %d valid mappings, %d covered cytosines, average coverage: %.2f fold.' 
% (nmap, nc, float(nd)/nc) 155 | -------------------------------------------------------------------------------- /pairs.h: -------------------------------------------------------------------------------- 1 | #ifndef _PAIRS_H_ 2 | #define _PAIRS_H_ 3 | 4 | #include "dbseq.h" 5 | #include "reads.h" 6 | #include "align.h" 7 | 8 | using namespace std; 9 | 10 | 11 | extern char chain_flag[]; 12 | 13 | struct PairHit 14 | { 15 | bit16_t chain; 16 | bit8_t na, nb; //# of snps 17 | int insert; 18 | Hit a; 19 | Hit b; 20 | }; 21 | 22 | typedef PairHit PairArray[MAXHITS+1]; 23 | 24 | class PairAlign 25 | { 26 | public: 27 | PairAlign(); 28 | ~PairAlign(); 29 | void ImportFileFormat(int format1, int format2); 30 | void ImportBatchReads(bit32_t n, vector &a1, vector &a2); 31 | int GetExactPairs(); 32 | int GetExact2SnpPairs(RefSeq &ref); 33 | int GetSnp2SnpPairs(RefSeq &ref); 34 | int GetExact2GapPairs(RefSeq &ref); 35 | int RunAlign(RefSeq &ref); 36 | void Do_Batch(RefSeq &ref); 37 | void StringAlign(RefSeq &ref, string &os); 38 | void StringAlign_ClosestUnpair(RefSeq &ref, string &os); 39 | 40 | //added by yxi 41 | int GetPairs(int na, int nb); 42 | int StringAlignPair(RefSeq &ref, string &os); 43 | void StringAlignUnpair(int fa, int fb, RefSeq &ref, string &os); 44 | void s_OutHitPair(PairHit pp, int n, RefSeq &ref, string &os); 45 | void s_OutHitUnpair(int readinpair, int chain_a, int chain_b, int ma, int na, Hit ha, int mb, Hit hb, RefSeq &ref, string &os); 46 | int TrimAdapter(); 47 | void FixPairReadName(); 48 | 49 | public: 50 | SingleAlign _sa; 51 | SingleAlign _sb; 52 | bit32_t num_reads; 53 | bit32_t n_aligned_pairs, n_aligned_a, n_aligned_b; 54 | string _str_align; 55 | string _str_align_unpair; 56 | protected: 57 | bit32_t _cur_n_hits[2*MAXSNPS+1]; 58 | //PairHit pairhits[2*MAXSNPS+1][MAXHITS+1]; 59 | PairArray *pairhits; 60 | bit32_t rand_rSeed; //thread safe RNG seed 61 | //by yxi 62 | char _mapseq[256]; 63 | char _ch[1024]; 64 | SingleAlign * _stmp; 65 | int 
checked_pair_mismatch[MAXSNPS+1][MAXSNPS+1]; 66 | }; 67 | 68 | #endif //_PAIR_ALIGH_H_ 69 | -------------------------------------------------------------------------------- /param.h: -------------------------------------------------------------------------------- 1 | #ifndef _PARAM_H_ 2 | #define _PARAM_H_ 3 | 4 | #define SEGLEN 16 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | #ifdef READ_48 16 | const int FIXELEMENT=4; 17 | #endif 18 | 19 | #ifdef READ_80 20 | const int FIXELEMENT=6; 21 | #endif 22 | 23 | #ifdef READ_144 24 | const int FIXELEMENT=10; // 144/16+1 25 | #endif 26 | 27 | const int MAXSNPS=15; 28 | 29 | typedef unsigned char bit8_t; 30 | typedef unsigned short bit16_t; 31 | typedef unsigned bit32_t; 32 | typedef unsigned long long bit64_t; 33 | typedef bit32_t bit24_t; 34 | 35 | typedef bit32_t ref_id_t; 36 | typedef bit32_t ref_loc_t; 37 | typedef ref_loc_t* NewIndex; 38 | 39 | struct SeedProfile 40 | { 41 | bit8_t a; //offset of part a on binary seq 42 | bit8_t b1; //begin element when creating seed 43 | bit8_t s1; //shift when creating seed from binary seq 44 | }; 45 | 46 | struct Hit 47 | { 48 | ref_id_t chr; //index of chr 49 | ref_loc_t loc; //location of first bp on reference seq, count from 0 50 | 51 | }; 52 | 53 | 54 | class Param 55 | { 56 | public: 57 | Param(); 58 | void SetSeedSize(int n); 59 | void InitMapping(); 60 | void BuildMismatchTable(); 61 | void SetAdaptors(int n); 62 | void SetDigestionSite(const char *a); 63 | void rc_seq(string &seq); 64 | void SetAlign(char readnt, char refnt); 65 | 66 | public: 67 | int num_procs; //number of parallel processors 68 | 69 | int chains; //0: forward strands only ; 1: forward and reverse strands 70 | //dbseq 71 | int max_dbseq_size; 72 | int append_dbseq_size; 73 | //read 74 | int read_size; 75 | int max_ns; //throw out reads containning >=max_ns 'N's 76 | int trim_lowQ; //trim low-quality at 3'-end, or not? 
77 | //quality 78 | bit8_t zero_qual; 79 | bit8_t qual_threshold; 80 | bit8_t default_qual; 81 | //pair-end mapping 82 | int min_insert; 83 | int max_insert; 84 | int optimize_output_SV; //if a pair cannot align with proper orientation and distance, very likely a strctural variation happen here. we prefer to report hit of read 'a' and 'b' with smallest distance, so that to help detect structural variations 85 | //seed 86 | int half_seed_size; 87 | int seed_size; 88 | bit32_t half_seed_bits; 89 | bit32_t seed_bits; 90 | int min_read_size; 91 | //alignment 92 | int max_snp_num; //maximum number of snps on one read allowed 93 | int max_num_hits; //maximum number of equal best hits, smaller will be faster 94 | bit8_t read_nt, ref_nt; 95 | //report hits 96 | int report_repeat_hits; //how report repeat hits? 0: no, 1: pick one randomly, 2: report all 97 | bool output_id; //1: output read id, 1: out read index 98 | 99 | string adapter[10]; 100 | int n_adapter; 101 | string useful_nt; 102 | string nx_nt; 103 | //added by yxi 104 | //bit16_t *_C; // mask of convert T in reads to C at position of C in ref 105 | bit16_t *_T; // convert T to C 106 | //bit16_t *map4to3; //map 3-letter sequence to 3-based number 107 | int CCGG_min, CCGG_max; 108 | int out_sam; 109 | int total_ref_seq; 110 | int max_seedseg_num; 111 | bit32_t read_start, read_end; 112 | int out_ref; 113 | int out_unmap; 114 | string digest_site; 115 | int digest_pos; 116 | int RRBS_flag; 117 | int index_interval; 118 | int randseed; 119 | SeedProfile profile[MAXSNPS+1][16]; 120 | int pairend; 121 | int max_readlen; 122 | 123 | inline bit32_t XT(bit32_t tt) {return (bit32_t)_T[tt&0xFFFF]+((bit32_t)_T[tt>>16])*6561UL;}; //convert T to C and map to 3-nt space 124 | //inline bit32_t XC(bit32_t tt) {return ((bit32_t)_C[tt&0xFFFF]|((bit32_t)_C[tt>>16])<<16);}; //mask T to C for 2 16-bit segments 125 | inline bit32_t XC(bit32_t tt) {return ((~tt)<<1)|tt|0x55555555U;} // generate T2C mask according to C locations 126 | 
inline bit64_t XC64(bit64_t tt) {return ((~tt)<<1)|tt|0x5555555555555555ULL;} 127 | //inline bit8_t XM(bit32_t tt) {return num_mismatch[tt&0xFFFF]+num_mismatch[tt>>16];}; //count mismatches for 2 16-bit segments 128 | 129 | inline bit32_t XM(bit32_t tt) { 130 | #ifdef SSE4 131 | return __builtin_popcount((tt|(tt>>1))&0x55555555); 132 | #else 133 | tt=(tt|(tt>>1))&0x55555555; 134 | tt=(tt+(tt>>2))&0x33333333; 135 | return (((bit32_t)(tt*0x1111111))>>28)+(tt&0x3); 136 | #endif 137 | } 138 | 139 | inline bit32_t XM64(bit64_t tt) { 140 | #ifdef SSE4 141 | return __builtin_popcountl((tt|(tt>>1))&0x5555555555555555ULL); 142 | #else 143 | tt=(tt|(tt>>1))&0x5555555555555555ULL; 144 | tt=(tt+(tt>>2))&0x3333333333333333ULL; 145 | return (((tt+(tt>>4))&0x0F0F0F0F0F0F0F0FULL)*0x0101010101010101ULL)>>56; 146 | #endif 147 | } 148 | 149 | inline bit32_t XM64X2(bit64_t tt1, bit64_t tt2) { 150 | tt1=((tt1|(tt1>>1))&0x5555555555555555ULL)+((tt2|(tt2>>1))&0x5555555555555555ULL); 151 | tt1=(tt1&0x3333333333333333ULL)+((tt1>>2)&0x3333333333333333ULL); 152 | return (((tt1+(tt1>>4))&0x0F0F0F0F0F0F0F0FULL)*0x0101010101010101ULL)>>56; 153 | } 154 | 155 | 156 | char * StrSeed(bit32_t seed, bit32_t size) { //for debug only 157 | char *s = new char[size+1]; 158 | for(int i=size-1; i>=0; i--) { 159 | s[size-1-i]=useful_nt[(seed>>(i*2))&0x3]; 160 | } 161 | s[size]=0; 162 | return s; 163 | }; 164 | 165 | bit32_t map3to4(bit32_t tt){ 166 | int s=0, i; for(i=0;i<16;i++) {s|=(tt%3)<>s1; 20 | fin.getline(ch, 1000); 21 | 22 | if('>' == s1[0]) { 23 | _file_format=1; 24 | fin>>s2; 25 | fin.getline(ch, 1000); 26 | } 27 | else if('@' == s1[0]) { 28 | fin>>s2; 29 | fin.getline(ch, 1000); 30 | fin>>s3; 31 | fin.getline(ch, 1000); 32 | fin>>s4; 33 | fin.getline(ch, 1000); 34 | _file_format=0; 35 | if(s2.size() != s4.size()) { 36 | cerr<<"fatal error: fq format, sequence length not equal to quality length\n"; 37 | exit(1); 38 | } 39 | } 40 | else if((SAM_fp=samopen(filename.c_str(), "rb", 0))!=0) { 41 | 
SAM_b=bam_init1(); 42 | _file_format=3; //BAM format 43 | } 44 | else if((SAM_fp=samopen(filename.c_str(), "r", 0))!=0) { 45 | SAM_b=bam_init1(); 46 | _file_format=2; //SAM format 47 | } 48 | else { 49 | cerr<<"fatal error: unrecognizable format of reads file.\n"; 50 | exit(1); 51 | } 52 | fin.seekg(0); 53 | 54 | switch(_file_format) { 55 | case 0: //fastq 56 | for(i=0;i<(param.read_start-1)*4;i++) { 57 | if(fin.eof()) break; 58 | fin.getline(ch,1000); 59 | } 60 | break; 61 | case 1: //fasta 62 | for(i=0;i<(param.read_start-1)*2;i++) { 63 | if(fin.eof()) break; 64 | fin.getline(ch,1000); 65 | } 66 | break; 67 | case 2: //bam 68 | if(mode &1){ //single-end 69 | for(i=0;i::iterator p=mreads.begin(); 88 | 89 | size_t i,l_seq; 90 | char *s, *t; 91 | 92 | if (_file_format<2) //.fa and .fq format 93 | for(num=0; num=param.read_end) break; 95 | fin>>c; 96 | if(fin.eof()) break; 97 | p->index=_index; 98 | fin>>p->name; 99 | fin.getline(ch,1000); 100 | fin>>p->seq; 101 | p->readset=readset; 102 | if(!_file_format) {//*.fq 103 | fin>>ch; 104 | fin.getline(ch, 1000); 105 | fin>>p->qual; 106 | } 107 | //else p->qual=string(1,'*'); 108 | else p->qual=string(p->seq.size(), param.zero_qual+param.default_qual); 109 | /* 110 | cout<qual<qual.begin(); it!=p->qual.end();++it) *it-=(param.zero_qual-'!'); 113 | cout<qual<seq.size()>param.max_readlen) { 116 | p->seq.erase(param.max_readlen); p->qual.erase(param.max_readlen); 117 | } 118 | } 119 | else //SAM/BAM format 120 | for(num=0; numindex=_index; 126 | p->name=string((char*)bam1_qname(SAM_b)); 127 | // 128 | l_seq=min(SAM_b->core.l_qseq,param.max_readlen); 129 | p->seq.assign(l_seq,0); p->qual.assign(l_seq,0); 130 | s=(char*) bam1_seq(SAM_b); t=(char*) bam1_qual(SAM_b); 131 | if(readset) { 132 | if(SAM_b->core.flag&0x40) p->readset=1; 133 | else if(SAM_b->core.flag&0x80) p->readset=2; 134 | else p->readset=readset; 135 | } 136 | else p->readset=readset; 137 | for(i=0;iseq[i]=bam_nt16_rev_table[bam1_seqi(s,i)]; 139 | 
#converting SAM to BAM, sort and index BAM
#usage: ./sam2bam.sh file.sam
#output files: the input name with its extension replaced by .bam, plus the .bam.bai index
#rrbsmap_path=${0%/*}

# Without this check an empty "$1" slips past the -f test below.
if [ $# -lt 1 ]; then
    echo "usage: $0 file.sam" >&2
    exit 1
fi

# All expansions are quoted so that paths containing spaces survive.
tmpbam="${1%.*}.tmp.bam"
outbam="${1%.*}.bam"

echo "Converting SAM to BAM ..."
if [ ! -f "$1" ]; then
    echo "$1 does not exist."
    exit 1
fi
samtools/samtools view -bS "$1" > "$tmpbam"
if [ $? -ne 0 ]; then
    echo "SAM2BAM conversion not sucessful."
    echo "$1 remains unchanged."
    rm -f "$tmpbam"
    exit 1
fi
echo "Sorting BAM ..."
# samtools sort takes the output prefix, i.e. the name without ".bam"
samtools/samtools sort "$tmpbam" "${outbam%.*}"
if [ $? -ne 0 ]; then
    echo "BAM file sorting not sucessful."
    echo "$outbam is in unsorted BAM format".
    mv "$tmpbam" "$outbam"
    exit 1
fi
rm "$tmpbam"
echo "Indexing BAM ..."
samtools/samtools index "$outbam"
if [ $? -ne 0 ]; then
    echo "BAM file indexing not sucessful."
    exit 1
fi
exit 0
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /samtools/INSTALL: -------------------------------------------------------------------------------- 1 | System Requirements 2 | =================== 3 | 4 | SAMtools depends on the zlib library . The latest 5 | version 1.2.3 is preferred and with the latest version you can compile 6 | razip and use it to compress a FASTA file. SAMtools' faidx is able to 7 | index a razip-compressed FASTA file to save diskspace. Older zlib also 8 | works with SAMtools, but razip cannot be compiled. 9 | 10 | The text-based viewer (tview) requires the GNU ncurses library 11 | , which comes with Mac OS X and 12 | most of the modern Linux/Unix distributions. If you do not have this 13 | library installed, you can still compile the rest of SAMtools by 14 | manually modifying one line in Makefile. 15 | 16 | 17 | Compilation 18 | =========== 19 | 20 | Type `make' to compile samtools. If you have zlib >= 1.2.2.1, you can 21 | compile razip with `make razip'. 22 | 23 | 24 | Installation 25 | ============ 26 | 27 | Simply copy `samtools' and other executables/scripts in `misc' to a 28 | location you want (e.g. a directory in your $PATH). No further 29 | configurations are required. 
# Build file for the bundled samtools: libbam.a (LOBJS) plus the samtools
# command-line program (AOBJS).
CC=			gcc
CFLAGS=		-g -Wall -O2 #-m64 #-arch ppc
DFLAGS=		-D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -D_CURSES_LIB=0
LOBJS=		bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o	\
			bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o knetfile.o	\
			bam_sort.o sam_header.o
AOBJS=		bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o	\
			bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o	\
			bamtk.o kaln.o
PROG=		samtools
INCLUDES=
SUBDIRS=	. misc
LIBPATH=
#LIBCURSES=	-lcurses # -lXCurses

.SUFFIXES:.c .o

.c.o:
		$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@

# "<target>-recur" runs <target> in every directory listed in SUBDIRS.
all-recur lib-recur clean-recur cleanlocal-recur install-recur:
		@target=`echo $@ | sed s/-recur//`; \
		wdir=`pwd`; \
		list='$(SUBDIRS)'; for subdir in $$list; do \
			cd $$subdir; \
			$(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
				INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \
			cd $$wdir; \
		done;

all:$(PROG)

lib:libbam.a

libbam.a:$(LOBJS)
		$(AR) -cru $@ $(LOBJS)

# Link order matters: libbam.a uses zlib, so -lbam must come BEFORE -lz.
# With a static zlib, or a linker defaulting to --as-needed, a library named
# before the archive that needs it is not used to resolve that archive's symbols.
samtools:lib $(AOBJS)
		$(CC) $(CFLAGS) -o $@ $(AOBJS) -L. -lbam $(LIBPATH) $(LIBCURSES) -lm -lz

razip:razip.o razf.o knetfile.o
		$(CC) $(CFLAGS) -o $@ razf.o razip.o knetfile.o -lz

bgzip:bgzip.o bgzf.o
		$(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o -lz

razip.o:razf.h
bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h
sam.o:sam.h bam.h
bam_import.o:bam.h kseq.h khash.h razf.h
bam_pileup.o:bam.h razf.h ksort.h
bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h
bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h
bam_lpileup.o:bam.h ksort.h
bam_tview.o:bam.h faidx.h bam_maqcns.h
bam_maqcns.o:bam.h ksort.h bam_maqcns.h
bam_sort.o:bam.h ksort.h razf.h
bam_md.o:bam.h faidx.h
glf.o:glf.h
sam_header.o:sam_header.h khash.h

faidx.o:faidx.h razf.h khash.h
faidx_main.o:faidx.h razf.h

cleanlocal:
		rm -fr gmon.out *.o a.out *.dSYM razip $(PROG) *~ *.a

clean:cleanlocal-recur
-lbam -Lwin32 -lz -lcurses -lws2_32 30 | 31 | razip:razip.o razf.o knetfile.o 32 | $(CC) $(CFLAGS) -o $@ razf.o razip.o knetfile.o -lz 33 | 34 | bgzip:bgzip.o bgzf.o 35 | $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o -lz 36 | 37 | razip.o:razf.h 38 | bam.o:bam.h razf.h bam_endian.h kstring.h 39 | sam.o:sam.h bam.h 40 | bam_import.o:bam.h kseq.h khash.h razf.h 41 | bam_pileup.o:bam.h razf.h ksort.h 42 | bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h 43 | bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h 44 | bam_lpileup.o:bam.h ksort.h 45 | bam_tview.o:bam.h faidx.h bam_maqcns.h 46 | bam_maqcns.o:bam.h ksort.h bam_maqcns.h 47 | bam_sort.o:bam.h ksort.h razf.h 48 | bam_md.o:bam.h faidx.h 49 | glf.o:glf.h 50 | 51 | faidx.o:faidx.h razf.h khash.h 52 | faidx_main.o:faidx.h razf.h 53 | 54 | clean: 55 | rm -fr gmon.out *.o *.exe *.dSYM razip $(PROG) *~ *.a 56 | -------------------------------------------------------------------------------- /samtools/bam2bed.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "sam.h" 3 | /* callback for bam_fetch() */ 4 | static int fetch_func(const bam1_t *b, void *data) 5 | { 6 | samfile_t *fp = (samfile_t*)data; 7 | uint32_t *cigar = bam1_cigar(b); 8 | const bam1_core_t *c = &b->core; 9 | int i, l; 10 | printf("%s\n",b->data); 11 | /* if (c->flag&BAM_FUNMAP) return 0; skip unmapped reads */ 12 | for (i = l = 0; i < c->n_cigar; ++i) { 13 | int op = cigar[i]&0xf; 14 | if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) 15 | l += cigar[i]>>4; 16 | } 17 | printf("%s\t%d\t%d\t%s\t%d\t%c\n", fp->header->target_name[c->tid], 18 | c->pos, c->pos + l, bam1_qname(b), c->qual, (c->flag&BAM_FREVERSE)? 
'-' : '+'); 19 | return 0; 20 | } 21 | int main(int argc, char *argv[]) 22 | { 23 | samfile_t *fp; 24 | if (argc == 1) { 25 | fprintf(stderr, "Usage: bam2bed [region]\n"); 26 | return 1; 27 | } 28 | if ((fp = samopen(argv[1], "rb", 0)) == 0) { 29 | fprintf(stderr, "bam2bed: Fail to open BAM file %s\n", argv[1]); 30 | return 1; 31 | } 32 | if (argc == 2) { /* if a region is not specified */ 33 | bam1_t *b = bam_init1(); 34 | while (samread(fp, b) >= 0) fetch_func(b, fp); 35 | bam_destroy1(b); 36 | } else { 37 | int ref, beg, end; 38 | bam_index_t *idx; 39 | if ((idx = bam_index_load(argv[1])) == 0) { /* load BAM index */ 40 | fprintf(stderr, "bam2bed: BAM index file is not available.\n"); 41 | return 1; 42 | } 43 | bam_parse_region(fp->header, argv[2], &ref, &beg, &end); /* parse region */ 44 | if (ref < 0) { 45 | fprintf(stderr, "bam2bed: Invalid region %s\n", argv[2]); 46 | return 1; 47 | } 48 | bam_fetch(fp->x.bam, idx, ref, beg, end, fp, fetch_func); 49 | bam_index_destroy(idx); 50 | } 51 | samclose(fp); 52 | return 0; 53 | } 54 | -------------------------------------------------------------------------------- /samtools/bam_aux.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "bam.h" 3 | #include "khash.h" 4 | typedef char *str_p; 5 | KHASH_MAP_INIT_STR(s, int) 6 | KHASH_MAP_INIT_STR(r2l, str_p) 7 | 8 | void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data) 9 | { 10 | int ori_len = b->data_len; 11 | b->data_len += 3 + len; 12 | b->l_aux += 3 + len; 13 | if (b->m_data < b->data_len) { 14 | b->m_data = b->data_len; 15 | kroundup32(b->m_data); 16 | b->data = (uint8_t*)realloc(b->data, b->m_data); 17 | } 18 | b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1]; 19 | b->data[ori_len + 2] = type; 20 | memcpy(b->data + ori_len + 3, data, len); 21 | } 22 | 23 | uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]) 24 | { 25 | return bam_aux_get(b, tag); 26 | } 27 | 28 | 
#define __skip_tag(s) do { \ 29 | int type = toupper(*(s)); \ 30 | ++(s); \ 31 | if (type == 'C' || type == 'A') ++(s); \ 32 | else if (type == 'S') (s) += 2; \ 33 | else if (type == 'I' || type == 'F') (s) += 4; \ 34 | else if (type == 'D') (s) += 8; \ 35 | else if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \ 36 | } while (0) 37 | 38 | uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) 39 | { 40 | uint8_t *s; 41 | int y = tag[0]<<8 | tag[1]; 42 | s = bam1_aux(b); 43 | while (s < b->data + b->data_len) { 44 | int x = (int)s[0]<<8 | s[1]; 45 | s += 2; 46 | if (x == y) return s; 47 | __skip_tag(s); 48 | } 49 | return 0; 50 | } 51 | // s MUST BE returned by bam_aux_get() 52 | int bam_aux_del(bam1_t *b, uint8_t *s) 53 | { 54 | uint8_t *p, *aux; 55 | aux = bam1_aux(b); 56 | p = s - 2; 57 | __skip_tag(s); 58 | memmove(p, s, b->l_aux - (s - aux)); 59 | b->data_len -= s - p; 60 | b->l_aux -= s - p; 61 | return 0; 62 | } 63 | 64 | void bam_init_header_hash(bam_header_t *header) 65 | { 66 | if (header->hash == 0) { 67 | int ret, i; 68 | khiter_t iter; 69 | khash_t(s) *h; 70 | header->hash = h = kh_init(s); 71 | for (i = 0; i < header->n_targets; ++i) { 72 | iter = kh_put(s, h, header->target_name[i], &ret); 73 | kh_value(h, iter) = i; 74 | } 75 | } 76 | } 77 | 78 | void bam_destroy_header_hash(bam_header_t *header) 79 | { 80 | if (header->hash) 81 | kh_destroy(s, (khash_t(s)*)header->hash); 82 | } 83 | 84 | int32_t bam_get_tid(const bam_header_t *header, const char *seq_name) 85 | { 86 | khint_t k; 87 | khash_t(s) *h = (khash_t(s)*)header->hash; 88 | k = kh_get(s, h, seq_name); 89 | return k == kh_end(h)? 
/*
 * Typed accessors for one auxiliary value.
 *
 * `s` is a pointer as returned by bam_aux_get(): it addresses the one-byte
 * type code, with the payload starting at s + 1.  Each accessor returns a zero
 * value (0, 0.0 or a null pointer) when `s` is NULL or when the type code is
 * not one it handles.
 *
 * The payload sits at an arbitrary byte offset, so multi-byte values are
 * copied out with memcpy instead of being read through a cast pointer.
 */

/* Integer value for type codes c, C, s, S, i and I. */
int32_t bam_aux2i(const uint8_t *s)
{
	int type;
	if (s == 0) return 0;
	type = *s++;
	if (type == 'c') return (int32_t)*(const int8_t*)s;
	else if (type == 'C') return (int32_t)*s;
	else if (type == 's') { int16_t v; memcpy(&v, s, sizeof v); return (int32_t)v; }
	else if (type == 'S') { uint16_t v; memcpy(&v, s, sizeof v); return (int32_t)v; }
	else if (type == 'i' || type == 'I') { int32_t v; memcpy(&v, s, sizeof v); return v; }
	else return 0;
}

/* Single-precision value for type code f. */
float bam_aux2f(const uint8_t *s)
{
	int type;
	/* the NULL test must come before the type byte is read */
	if (s == 0) return 0.0;
	type = *s++;
	if (type == 'f') { float v; memcpy(&v, s, sizeof v); return v; }
	else return 0.0;
}

/* Double-precision value for type code d. */
double bam_aux2d(const uint8_t *s)
{
	int type;
	if (s == 0) return 0.0;
	type = *s++;
	if (type == 'd') { double v; memcpy(&v, s, sizeof v); return v; }
	else return 0.0;
}

/* Character value for type code A. */
char bam_aux2A(const uint8_t *s)
{
	int type;
	if (s == 0) return 0;
	type = *s++;
	if (type == 'A') return *(const char*)s;
	else return 0;
}

/* Pointer to the string payload for type codes Z and H; it points into the
 * record itself and is not a copy. */
char *bam_aux2Z(const uint8_t *s)
{
	int type;
	if (s == 0) return 0;
	type = *s++;
	if (type == 'Z' || type == 'H') return (char*)s;
	else return 0;
}
/*
 * Map a nucleotide letter (either case) to a small code:
 * A -> 0, C -> 1, G -> 2, T -> 3, anything else -> 4.
 */
char bam_aux_nt2int(char a)
{
	/* <ctype.h> functions are undefined for negative values, and plain char
	 * may be signed: convert to unsigned char first. */
	switch (toupper((unsigned char)a)) {
	case 'A': return 0;
	case 'C': return 1;
	case 'G': return 2;
	case 'T': return 3;
	default:  return 4;
	}
}

/*
 * Color character for the transition from base `a` to base `b`:
 * '0'..'3' is the XOR of the two base codes; '4' if either base is not
 * one of A/C/G/T.
 */
char bam_aux_ntnt2cs(char a, char b)
{
	a = bam_aux_nt2int(a);
	b = bam_aux_nt2int(b);
	if (4 == a || 4 == b) return '4';
	return "0123"[(int)(a ^ b)];
}
#ifndef BAM_ENDIAN_H
#define BAM_ENDIAN_H

#include <stdint.h>
#include <string.h>

/* Returns non-zero when the first byte of a long holding 1 is zero,
 * i.e. on a big-endian host. */
static inline int bam_is_big_endian(void)
{
	long one= 1;
	return !(*((char *)(&one)));
}

/* Byte-swap a 16-bit value. */
static inline uint16_t bam_swap_endian_2(uint16_t v)
{
	return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8));
}

/* Byte-swap the 16-bit value stored at x in place; returns x.
 * The value is moved with memcpy, so x may be unaligned. */
static inline void *bam_swap_endian_2p(void *x)
{
	uint16_t v;
	memcpy(&v, x, sizeof v);
	v = bam_swap_endian_2(v);
	memcpy(x, &v, sizeof v);
	return x;
}

/* Byte-swap a 32-bit value: swap the two halves, then the bytes in each half. */
static inline uint32_t bam_swap_endian_4(uint32_t v)
{
	v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
	return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
}

/* Byte-swap the 32-bit value stored at x in place; returns x.
 * x may be unaligned. */
static inline void *bam_swap_endian_4p(void *x)
{
	uint32_t v;
	memcpy(&v, x, sizeof v);
	v = bam_swap_endian_4(v);
	memcpy(x, &v, sizeof v);
	return x;
}

/* Byte-swap a 64-bit value: swap 32-bit halves, then 16-bit quarters, then bytes. */
static inline uint64_t bam_swap_endian_8(uint64_t v)
{
	v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
	v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
	return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
}

/* Byte-swap the 64-bit value stored at x in place; returns x.
 * x may be unaligned. */
static inline void *bam_swap_endian_8p(void *x)
{
	uint64_t v;
	memcpy(&v, x, sizeof v);
	v = bam_swap_endian_8(v);
	memcpy(x, &v, sizeof v);
	return x;
}

#endif
3 | #include 4 | #include "bam.h" 5 | #include "ksort.h" 6 | 7 | #define TV_GAP 2 8 | 9 | typedef struct __freenode_t { 10 | uint32_t level:28, cnt:4; 11 | struct __freenode_t *next; 12 | } freenode_t, *freenode_p; 13 | 14 | #define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level)) 15 | KSORT_INIT(node, freenode_p, freenode_lt) 16 | 17 | /* Memory pool, similar to the one in bam_pileup.c */ 18 | typedef struct { 19 | int cnt, n, max; 20 | freenode_t **buf; 21 | } mempool_t; 22 | 23 | static mempool_t *mp_init() 24 | { 25 | return (mempool_t*)calloc(1, sizeof(mempool_t)); 26 | } 27 | static void mp_destroy(mempool_t *mp) 28 | { 29 | int k; 30 | for (k = 0; k < mp->n; ++k) free(mp->buf[k]); 31 | free(mp->buf); free(mp); 32 | } 33 | static inline freenode_t *mp_alloc(mempool_t *mp) 34 | { 35 | ++mp->cnt; 36 | if (mp->n == 0) return (freenode_t*)calloc(1, sizeof(freenode_t)); 37 | else return mp->buf[--mp->n]; 38 | } 39 | static inline void mp_free(mempool_t *mp, freenode_t *p) 40 | { 41 | --mp->cnt; p->next = 0; p->cnt = TV_GAP; 42 | if (mp->n == mp->max) { 43 | mp->max = mp->max? 
mp->max<<1 : 256; 44 | mp->buf = (freenode_t**)realloc(mp->buf, sizeof(freenode_t*) * mp->max); 45 | } 46 | mp->buf[mp->n++] = p; 47 | } 48 | 49 | /* core part */ 50 | struct __bam_lplbuf_t { 51 | int max, n_cur, n_pre; 52 | int max_level, *cur_level, *pre_level; 53 | mempool_t *mp; 54 | freenode_t **aux, *head, *tail; 55 | int n_nodes, m_aux; 56 | bam_pileup_f func; 57 | void *user_data; 58 | bam_plbuf_t *plbuf; 59 | }; 60 | 61 | void bam_lplbuf_reset(bam_lplbuf_t *buf) 62 | { 63 | freenode_t *p, *q; 64 | bam_plbuf_reset(buf->plbuf); 65 | for (p = buf->head; p->next;) { 66 | q = p->next; 67 | mp_free(buf->mp, p); 68 | p = q; 69 | } 70 | buf->head = buf->tail; 71 | buf->max_level = 0; 72 | buf->n_cur = buf->n_pre = 0; 73 | buf->n_nodes = 0; 74 | } 75 | 76 | static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) 77 | { 78 | bam_lplbuf_t *tv = (bam_lplbuf_t*)data; 79 | freenode_t *p; 80 | int i, l, max_level; 81 | // allocate memory if necessary 82 | if (tv->max < n) { // enlarge 83 | tv->max = n; 84 | kroundup32(tv->max); 85 | tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max); 86 | tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max); 87 | } 88 | tv->n_cur = n; 89 | // update cnt 90 | for (p = tv->head; p->next; p = p->next) 91 | if (p->cnt > 0) --p->cnt; 92 | // calculate cur_level[] 93 | max_level = 0; 94 | for (i = l = 0; i < n; ++i) { 95 | const bam_pileup1_t *p = pl + i; 96 | if (p->is_head) { 97 | if (tv->head->next && tv->head->cnt == 0) { // then take a free slot 98 | freenode_t *p = tv->head->next; 99 | tv->cur_level[i] = tv->head->level; 100 | mp_free(tv->mp, tv->head); 101 | tv->head = p; 102 | --tv->n_nodes; 103 | } else tv->cur_level[i] = ++tv->max_level; 104 | } else { 105 | tv->cur_level[i] = tv->pre_level[l++]; 106 | if (p->is_tail) { // then return a free slot 107 | tv->tail->level = tv->cur_level[i]; 108 | tv->tail->next = mp_alloc(tv->mp); 109 | tv->tail = tv->tail->next; 110 
| ++tv->n_nodes; 111 | } 112 | } 113 | if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i]; 114 | ((bam_pileup1_t*)p)->level = tv->cur_level[i]; 115 | } 116 | assert(l == tv->n_pre); 117 | tv->func(tid, pos, n, pl, tv->user_data); 118 | // sort the linked list 119 | if (tv->n_nodes) { 120 | freenode_t *q; 121 | if (tv->n_nodes + 1 > tv->m_aux) { // enlarge 122 | tv->m_aux = tv->n_nodes + 1; 123 | kroundup32(tv->m_aux); 124 | tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux); 125 | } 126 | for (p = tv->head, i = l = 0; p->next;) { 127 | if (p->level > max_level) { // then discard this entry 128 | q = p->next; 129 | mp_free(tv->mp, p); 130 | p = q; 131 | } else { 132 | tv->aux[i++] = p; 133 | p = p->next; 134 | } 135 | } 136 | tv->aux[i] = tv->tail; // add a proper tail for the loop below 137 | tv->n_nodes = i; 138 | if (tv->n_nodes) { 139 | ks_introsort(node, tv->n_nodes, tv->aux); 140 | for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1]; 141 | tv->head = tv->aux[0]; 142 | } else tv->head = tv->tail; 143 | } 144 | // clean up 145 | tv->max_level = max_level; 146 | memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4); 147 | // squeeze out terminated levels 148 | for (i = l = 0; i < n; ++i) { 149 | const bam_pileup1_t *p = pl + i; 150 | if (!p->is_tail) 151 | tv->pre_level[l++] = tv->pre_level[i]; 152 | } 153 | tv->n_pre = l; 154 | /* 155 | fprintf(stderr, "%d\t", pos+1); 156 | for (i = 0; i < n; ++i) { 157 | const bam_pileup1_t *p = pl + i; 158 | if (p->is_head) fprintf(stderr, "^"); 159 | if (p->is_tail) fprintf(stderr, "$"); 160 | fprintf(stderr, "%d,", p->level); 161 | } 162 | fprintf(stderr, "\n"); 163 | */ 164 | return 0; 165 | } 166 | 167 | bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data) 168 | { 169 | bam_lplbuf_t *tv; 170 | tv = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t)); 171 | tv->mp = mp_init(); 172 | tv->head = tv->tail = mp_alloc(tv->mp); 173 | tv->func = func; 174 | tv->user_data = data; 175 | 
tv->plbuf = bam_plbuf_init(tview_func, tv); 176 | return (bam_lplbuf_t*)tv; 177 | } 178 | 179 | void bam_lplbuf_destroy(bam_lplbuf_t *tv) 180 | { 181 | freenode_t *p, *q; 182 | free(tv->cur_level); free(tv->pre_level); 183 | bam_plbuf_destroy(tv->plbuf); 184 | free(tv->aux); 185 | for (p = tv->head; p->next;) { 186 | q = p->next; 187 | mp_free(tv->mp, p); p = q; 188 | } 189 | mp_free(tv->mp, p); 190 | assert(tv->mp->cnt == 0); 191 | mp_destroy(tv->mp); 192 | free(tv); 193 | } 194 | 195 | int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv) 196 | { 197 | return bam_plbuf_push(b, tv->plbuf); 198 | } 199 | -------------------------------------------------------------------------------- /samtools/bam_maqcns.h: -------------------------------------------------------------------------------- 1 | #ifndef BAM_MAQCNS_H 2 | #define BAM_MAQCNS_H 3 | 4 | #include "glf.h" 5 | 6 | struct __bmc_aux_t; 7 | 8 | typedef struct { 9 | float het_rate, theta; 10 | int n_hap, cap_mapQ, is_soap; 11 | 12 | float eta, q_r; 13 | double *fk, *coef; 14 | double *lhet; 15 | struct __bmc_aux_t *aux; 16 | } bam_maqcns_t; 17 | 18 | typedef struct { 19 | int q_indel; 20 | float r_indel; 21 | // hidden parameters, unchangeable from command line 22 | int mm_penalty, indel_err, ambi_thres; 23 | } bam_maqindel_opt_t; 24 | 25 | typedef struct { 26 | int indel1, indel2; 27 | int cnt1, cnt2, cnt_anti; 28 | int cnt_ref, cnt_ambi; 29 | char *s[2]; 30 | // 31 | int gt, gl[2]; 32 | int q_cns, q_ref; 33 | } bam_maqindel_ret_t; 34 | 35 | #ifdef __cplusplus 36 | extern "C" { 37 | #endif 38 | 39 | bam_maqcns_t *bam_maqcns_init(); 40 | void bam_maqcns_prepare(bam_maqcns_t *bm); 41 | void bam_maqcns_destroy(bam_maqcns_t *bm); 42 | glf1_t *bam_maqcns_glfgen(int n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm); 43 | uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm); 44 | // return: cns<<28 | cns2<<24 | mapQ<<16 | cnsQ<<8 | cnsQ2 45 | uint32_t glf2cns(const glf1_t *g, int 
q_r); 46 | 47 | bam_maqindel_opt_t *bam_maqindel_opt_init(); 48 | bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, 49 | int _n_types, int *_types); 50 | void bam_maqindel_ret_destroy(bam_maqindel_ret_t*); 51 | 52 | #ifdef __cplusplus 53 | } 54 | #endif 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /samtools/bam_mate.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "bam.h" 4 | 5 | // currently, this function ONLY works if each read has one hit 6 | void bam_mating_core(bamFile in, bamFile out) 7 | { 8 | bam_header_t *header; 9 | bam1_t *b[2]; 10 | int curr, has_prev; 11 | 12 | header = bam_header_read(in); 13 | bam_header_write(out, header); 14 | 15 | b[0] = bam_init1(); 16 | b[1] = bam_init1(); 17 | curr = 0; has_prev = 0; 18 | while (bam_read1(in, b[curr]) >= 0) { 19 | bam1_t *cur = b[curr], *pre = b[1-curr]; 20 | if (has_prev) { 21 | if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name 22 | cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; 23 | pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; 24 | if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) 25 | && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) 26 | { 27 | uint32_t cur5, pre5; 28 | cur5 = (cur->core.flag&BAM_FREVERSE)? bam_calend(&cur->core, bam1_cigar(cur)) : cur->core.pos; 29 | pre5 = (pre->core.flag&BAM_FREVERSE)? 
bam_calend(&pre->core, bam1_cigar(pre)) : pre->core.pos; 30 | cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; 31 | } else cur->core.isize = pre->core.isize = 0; 32 | if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE; 33 | else cur->core.flag &= ~BAM_FMREVERSE; 34 | if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE; 35 | else pre->core.flag &= ~BAM_FMREVERSE; 36 | if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } 37 | if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } 38 | bam_write1(out, pre); 39 | bam_write1(out, cur); 40 | has_prev = 0; 41 | } else { // unpaired or singleton 42 | pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; 43 | if (pre->core.flag & BAM_FPAIRED) { 44 | pre->core.flag |= BAM_FMUNMAP; 45 | pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR; 46 | } 47 | bam_write1(out, pre); 48 | } 49 | } else has_prev = 1; 50 | curr = 1 - curr; 51 | } 52 | if (has_prev) bam_write1(out, b[1-curr]); 53 | bam_header_destroy(header); 54 | bam_destroy1(b[0]); 55 | bam_destroy1(b[1]); 56 | } 57 | 58 | int bam_mating(int argc, char *argv[]) 59 | { 60 | bamFile in, out; 61 | if (argc < 3) { 62 | fprintf(stderr, "samtools fixmate \n"); 63 | return 1; 64 | } 65 | in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r"); 66 | out = (strcmp(argv[2], "-") == 0)? 
bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w"); 67 | bam_mating_core(in, out); 68 | bam_close(in); bam_close(out); 69 | return 0; 70 | } 71 | -------------------------------------------------------------------------------- /samtools/bam_md.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "faidx.h" 6 | #include "sam.h" 7 | #include "kstring.h" 8 | 9 | void bam_fillmd1(bam1_t *b, char *ref, int is_equal) 10 | { 11 | uint8_t *seq = bam1_seq(b); 12 | uint32_t *cigar = bam1_cigar(b); 13 | bam1_core_t *c = &b->core; 14 | int i, x, y, u = 0; 15 | kstring_t *str; 16 | uint8_t *old_md, *old_nm; 17 | int32_t old_nm_i = -1, nm = 0; 18 | 19 | str = (kstring_t*)calloc(1, sizeof(kstring_t)); 20 | for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { 21 | int j, l = cigar[i]>>4, op = cigar[i]&0xf; 22 | if (op == BAM_CMATCH) { 23 | for (j = 0; j < l; ++j) { 24 | int z = y + j; 25 | int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; 26 | if (ref[x+j] == 0) break; // out of boundary 27 | if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match 28 | if (is_equal) seq[z/2] &= (z&1)? 
0xf0 : 0x0f; 29 | ++u; 30 | } else { 31 | ksprintf(str, "%d", u); 32 | kputc(ref[x+j], str); 33 | u = 0; ++nm; 34 | } 35 | } 36 | if (j < l) break; 37 | x += l; y += l; 38 | } else if (op == BAM_CDEL) { 39 | ksprintf(str, "%d", u); 40 | kputc('^', str); 41 | for (j = 0; j < l; ++j) { 42 | if (ref[x+j] == 0) break; 43 | kputc(ref[x+j], str); 44 | } 45 | u = 0; 46 | if (j < l) break; 47 | x += l; nm += l; 48 | } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { 49 | y += l; 50 | if (op == BAM_CINS) nm += l; 51 | } else if (op == BAM_CREF_SKIP) { 52 | x += l; 53 | } 54 | } 55 | ksprintf(str, "%d", u); 56 | // update NM 57 | old_nm = bam_aux_get(b, "NM"); 58 | if (c->flag & BAM_FUNMAP) return; 59 | if (old_nm) old_nm_i = bam_aux2i(old_nm); 60 | if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); 61 | else if (nm != old_nm_i) { 62 | fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); 63 | bam_aux_del(b, old_nm); 64 | bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); 65 | } 66 | // update MD 67 | old_md = bam_aux_get(b, "MD"); 68 | if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); 69 | else { 70 | int is_diff = 0; 71 | if (strlen((char*)old_md+1) == str->l) { 72 | for (i = 0; i < str->l; ++i) 73 | if (toupper(old_md[i+1]) != toupper(str->s[i])) 74 | break; 75 | if (i < str->l) is_diff = 1; 76 | } else is_diff = 1; 77 | if (is_diff) { 78 | fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s); 79 | bam_aux_del(b, old_md); 80 | bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); 81 | } 82 | } 83 | free(str->s); free(str); 84 | } 85 | 86 | int bam_fillmd(int argc, char *argv[]) 87 | { 88 | int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed; 89 | samfile_t *fp, *fpout = 0; 90 | faidx_t *fai; 91 | char *ref = 0, mode_w[8], mode_r[8]; 92 | bam1_t *b; 93 | 94 | is_bam_out = is_sam_in = is_uncompressed = 0; 
95 | mode_w[0] = mode_r[0] = 0; 96 | strcpy(mode_r, "r"); strcpy(mode_w, "w"); 97 | while ((c = getopt(argc, argv, "eubS")) >= 0) { 98 | switch (c) { 99 | case 'e': is_equal = 1; break; 100 | case 'b': is_bam_out = 1; break; 101 | case 'u': is_uncompressed = is_bam_out = 1; break; 102 | case 'S': is_sam_in = 1; break; 103 | default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1; 104 | } 105 | } 106 | if (!is_sam_in) strcat(mode_r, "b"); 107 | if (is_bam_out) strcat(mode_w, "b"); 108 | else strcat(mode_w, "h"); 109 | if (is_uncompressed) strcat(mode_w, "u"); 110 | if (optind + 1 >= argc) { 111 | fprintf(stderr, "\n"); 112 | fprintf(stderr, "Usage: samtools fillmd [-eubS] \n\n"); 113 | fprintf(stderr, "Options: -e change identical bases to '='\n"); 114 | fprintf(stderr, " -u uncompressed BAM output (for piping)\n"); 115 | fprintf(stderr, " -b compressed BAM output\n"); 116 | fprintf(stderr, " -S the input is SAM with header\n\n"); 117 | return 1; 118 | } 119 | fp = samopen(argv[optind], mode_r, 0); 120 | if (fp == 0) return 1; 121 | if (is_sam_in && (fp->header == 0 || fp->header->n_targets == 0)) { 122 | fprintf(stderr, "[bam_fillmd] input SAM does not have header. 
Abort!\n"); 123 | return 1; 124 | } 125 | fpout = samopen("-", mode_w, fp->header); 126 | fai = fai_load(argv[optind+1]); 127 | 128 | b = bam_init1(); 129 | while ((ret = samread(fp, b)) >= 0) { 130 | if (b->core.tid >= 0) { 131 | if (tid != b->core.tid) { 132 | free(ref); 133 | ref = fai_fetch(fai, fp->header->target_name[b->core.tid], &len); 134 | tid = b->core.tid; 135 | if (ref == 0) 136 | fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", 137 | fp->header->target_name[tid]); 138 | } 139 | if (ref) bam_fillmd1(b, ref, is_equal); 140 | } 141 | samwrite(fpout, b); 142 | } 143 | bam_destroy1(b); 144 | 145 | free(ref); 146 | fai_destroy(fai); 147 | samclose(fp); samclose(fpout); 148 | return 0; 149 | } 150 | -------------------------------------------------------------------------------- /samtools/bam_rmdup.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "sam.h" 7 | 8 | typedef bam1_t *bam1_p; 9 | 10 | #include "khash.h" 11 | KHASH_SET_INIT_STR(name) 12 | KHASH_MAP_INIT_INT64(pos, bam1_p) 13 | 14 | #define BUFFER_SIZE 0x40000 15 | 16 | typedef struct { 17 | uint64_t n_checked, n_removed; 18 | khash_t(pos) *best_hash; 19 | } lib_aux_t; 20 | KHASH_MAP_INIT_STR(lib, lib_aux_t) 21 | 22 | typedef struct { 23 | int n, max; 24 | bam1_t **a; 25 | } tmp_stack_t; 26 | 27 | static inline void stack_insert(tmp_stack_t *stack, bam1_t *b) 28 | { 29 | if (stack->n == stack->max) { 30 | stack->max = stack->max? 
stack->max<<1 : 0x10000; 31 | stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max); 32 | } 33 | stack->a[stack->n++] = b; 34 | } 35 | 36 | static inline void dump_best(tmp_stack_t *stack, samfile_t *out) 37 | { 38 | int i; 39 | for (i = 0; i != stack->n; ++i) { 40 | samwrite(out, stack->a[i]); 41 | bam_destroy1(stack->a[i]); 42 | } 43 | stack->n = 0; 44 | } 45 | 46 | static void clear_del_set(khash_t(name) *del_set) 47 | { 48 | khint_t k; 49 | for (k = kh_begin(del_set); k < kh_end(del_set); ++k) 50 | if (kh_exist(del_set, k)) 51 | free((char*)kh_key(del_set, k)); 52 | kh_clear(name, del_set); 53 | } 54 | 55 | static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) 56 | { 57 | khint_t k = kh_get(lib, aux, lib); 58 | if (k == kh_end(aux)) { 59 | int ret; 60 | char *p = strdup(lib); 61 | lib_aux_t *q; 62 | k = kh_put(lib, aux, p, &ret); 63 | q = &kh_val(aux, k); 64 | q->n_checked = q->n_removed = 0; 65 | q->best_hash = kh_init(pos); 66 | return q; 67 | } else return &kh_val(aux, k); 68 | } 69 | 70 | static void clear_best(khash_t(lib) *aux, int max) 71 | { 72 | khint_t k; 73 | for (k = kh_begin(aux); k != kh_end(aux); ++k) { 74 | if (kh_exist(aux, k)) { 75 | lib_aux_t *q = &kh_val(aux, k); 76 | if (kh_size(q->best_hash) >= max) 77 | kh_clear(pos, q->best_hash); 78 | } 79 | } 80 | } 81 | 82 | static inline int sum_qual(const bam1_t *b) 83 | { 84 | int i, q; 85 | uint8_t *qual = bam1_qual(b); 86 | for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; 87 | return q; 88 | } 89 | 90 | void bam_rmdup_core(samfile_t *in, samfile_t *out) 91 | { 92 | bam1_t *b; 93 | int last_tid = -1, last_pos = -1; 94 | tmp_stack_t stack; 95 | khint_t k; 96 | khash_t(lib) *aux; 97 | khash_t(name) *del_set; 98 | 99 | aux = kh_init(lib); 100 | del_set = kh_init(name); 101 | b = bam_init1(); 102 | memset(&stack, 0, sizeof(tmp_stack_t)); 103 | 104 | kh_resize(name, del_set, 4 * BUFFER_SIZE); 105 | while (samread(in, b) >= 0) { 106 | bam1_core_t *c = &b->core; 107 | if 
(c->tid != last_tid || last_pos != c->pos) { 108 | dump_best(&stack, out); // write the result 109 | clear_best(aux, BUFFER_SIZE); 110 | if (c->tid != last_tid) { 111 | clear_best(aux, 0); 112 | if (kh_size(del_set)) { // check 113 | fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set)); 114 | clear_del_set(del_set); 115 | } 116 | if ((int)c->tid == -1) { // append unmapped reads 117 | samwrite(out, b); 118 | while (samread(in, b) >= 0) samwrite(out, b); 119 | break; 120 | } 121 | last_tid = c->tid; 122 | fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]); 123 | } 124 | } 125 | if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { 126 | samwrite(out, b); 127 | } else if (c->isize > 0) { // paired, head 128 | uint64_t key = (uint64_t)c->pos<<32 | c->isize; 129 | const char *lib; 130 | lib_aux_t *q; 131 | int ret; 132 | lib = bam_get_library(in->header, b); 133 | q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); 134 | ++q->n_checked; 135 | k = kh_put(pos, q->best_hash, key, &ret); 136 | if (ret == 0) { // found in best_hash 137 | bam1_t *p = kh_val(q->best_hash, k); 138 | ++q->n_removed; 139 | if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle 140 | kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed 141 | bam_copy1(p, b); // replaced as b 142 | } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed 143 | if (ret == 0) 144 | fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. 
Continue anyway.\n", bam1_qname(b)); 145 | } else { // not found in best_hash 146 | kh_val(q->best_hash, k) = bam_dup1(b); 147 | stack_insert(&stack, kh_val(q->best_hash, k)); 148 | } 149 | } else { // paired, tail 150 | k = kh_get(name, del_set, bam1_qname(b)); 151 | if (k != kh_end(del_set)) { 152 | free((char*)kh_key(del_set, k)); 153 | kh_del(name, del_set, k); 154 | } else samwrite(out, b); 155 | } 156 | last_pos = c->pos; 157 | } 158 | 159 | for (k = kh_begin(aux); k != kh_end(aux); ++k) { 160 | if (kh_exist(aux, k)) { 161 | lib_aux_t *q = &kh_val(aux, k); 162 | dump_best(&stack, out); 163 | fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, 164 | (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); 165 | kh_destroy(pos, q->best_hash); 166 | free((char*)kh_key(aux, k)); 167 | } 168 | } 169 | kh_destroy(lib, aux); 170 | 171 | clear_del_set(del_set); 172 | kh_destroy(name, del_set); 173 | free(stack.a); 174 | bam_destroy1(b); 175 | } 176 | 177 | void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se); 178 | 179 | int bam_rmdup(int argc, char *argv[]) 180 | { 181 | int c, is_se = 0, force_se = 0; 182 | samfile_t *in, *out; 183 | while ((c = getopt(argc, argv, "sS")) >= 0) { 184 | switch (c) { 185 | case 's': is_se = 1; break; 186 | case 'S': force_se = is_se = 1; break; 187 | } 188 | } 189 | if (optind + 2 > argc) { 190 | fprintf(stderr, "\n"); 191 | fprintf(stderr, "Usage: samtools rmdup [-sS] \n\n"); 192 | fprintf(stderr, "Option: -s rmdup for SE reads\n"); 193 | fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n\n"); 194 | return 1; 195 | } 196 | in = samopen(argv[optind], "rb", 0); 197 | out = samopen(argv[optind+1], "wb", in->header); 198 | if (in == 0 || out == 0) { 199 | fprintf(stderr, "[bam_rmdup] fail to read/write input files\n"); 200 | return 1; 201 | } 202 | if (is_se) bam_rmdupse_core(in, out, force_se); 203 | else bam_rmdup_core(in, out); 204 | 
samclose(in); samclose(out); 205 | return 0; 206 | } 207 | -------------------------------------------------------------------------------- /samtools/bam_rmdupse.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "sam.h" 3 | #include "khash.h" 4 | #include "klist.h" 5 | 6 | #define QUEUE_CLEAR_SIZE 0x100000 7 | #define MAX_POS 0x7fffffff 8 | 9 | typedef struct { 10 | int endpos; 11 | uint32_t score:31, discarded:1; 12 | bam1_t *b; 13 | } elem_t, *elem_p; 14 | #define __free_elem(p) bam_destroy1((p)->data.b) 15 | KLIST_INIT(q, elem_t, __free_elem) 16 | typedef klist_t(q) queue_t; 17 | 18 | KHASH_MAP_INIT_INT(best, elem_p) 19 | typedef khash_t(best) besthash_t; 20 | 21 | typedef struct { 22 | uint64_t n_checked, n_removed; 23 | besthash_t *left, *rght; 24 | } lib_aux_t; 25 | KHASH_MAP_INIT_STR(lib, lib_aux_t) 26 | 27 | static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) 28 | { 29 | khint_t k = kh_get(lib, aux, lib); 30 | if (k == kh_end(aux)) { 31 | int ret; 32 | char *p = strdup(lib); 33 | lib_aux_t *q; 34 | k = kh_put(lib, aux, p, &ret); 35 | q = &kh_val(aux, k); 36 | q->left = kh_init(best); 37 | q->rght = kh_init(best); 38 | q->n_checked = q->n_removed = 0; 39 | return q; 40 | } else return &kh_val(aux, k); 41 | } 42 | 43 | static inline int sum_qual(const bam1_t *b) 44 | { 45 | int i, q; 46 | uint8_t *qual = bam1_qual(b); 47 | for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; 48 | return q; 49 | } 50 | 51 | static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score) 52 | { 53 | elem_t *p = kl_pushp(q, queue); 54 | p->discarded = 0; 55 | p->endpos = endpos; p->score = score; 56 | if (p->b == 0) p->b = bam_init1(); 57 | bam_copy1(p->b, b); 58 | return p; 59 | } 60 | 61 | static void clear_besthash(besthash_t *h, int32_t pos) 62 | { 63 | khint_t k; 64 | for (k = kh_begin(h); k != kh_end(h); ++k) 65 | if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos) 66 | 
kh_del(best, h, k); 67 | } 68 | 69 | static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h) 70 | { 71 | if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) { 72 | khint_t k; 73 | while (1) { 74 | elem_t *q; 75 | if (queue->head == queue->tail) break; 76 | q = &kl_val(queue->head); 77 | if (q->discarded) { 78 | q->b->data_len = 0; 79 | kl_shift(q, queue, 0); 80 | continue; 81 | } 82 | if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break; 83 | samwrite(out, q->b); 84 | q->b->data_len = 0; 85 | kl_shift(q, queue, 0); 86 | } 87 | for (k = kh_begin(h); k != kh_end(h); ++k) { 88 | if (kh_exist(h, k)) { 89 | clear_besthash(kh_val(h, k).left, pos); 90 | clear_besthash(kh_val(h, k).rght, pos); 91 | } 92 | } 93 | } 94 | } 95 | 96 | void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se) 97 | { 98 | bam1_t *b; 99 | queue_t *queue; 100 | khint_t k; 101 | int last_tid = -2; 102 | khash_t(lib) *aux; 103 | 104 | aux = kh_init(lib); 105 | b = bam_init1(); 106 | queue = kl_init(q); 107 | while (samread(in, b) >= 0) { 108 | bam1_core_t *c = &b->core; 109 | int endpos = bam_calend(c, bam1_cigar(b)); 110 | int score = sum_qual(b); 111 | 112 | if (last_tid != c->tid) { 113 | if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux); 114 | last_tid = c->tid; 115 | } else dump_alignment(out, queue, c->pos, aux); 116 | if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) { 117 | push_queue(queue, b, endpos, score); 118 | } else { 119 | const char *lib; 120 | lib_aux_t *q; 121 | besthash_t *h; 122 | uint32_t key; 123 | int ret; 124 | lib = bam_get_library(in->header, b); 125 | q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); 126 | ++q->n_checked; 127 | h = (c->flag&BAM_FREVERSE)? q->rght : q->left; 128 | key = (c->flag&BAM_FREVERSE)? 
endpos : c->pos; 129 | k = kh_put(best, h, key, &ret); 130 | if (ret == 0) { // in the hash table 131 | elem_t *p = kh_val(h, k); 132 | ++q->n_removed; 133 | if (p->score < score) { 134 | if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue 135 | p->discarded = 1; 136 | kh_val(h, k) = push_queue(queue, b, endpos, score); 137 | } else { // replace 138 | p->score = score; p->endpos = endpos; 139 | bam_copy1(p->b, b); 140 | } 141 | } // otherwise, discard the alignment 142 | } else kh_val(h, k) = push_queue(queue, b, endpos, score); 143 | } 144 | } 145 | dump_alignment(out, queue, MAX_POS, aux); 146 | 147 | for (k = kh_begin(aux); k != kh_end(aux); ++k) { 148 | if (kh_exist(aux, k)) { 149 | lib_aux_t *q = &kh_val(aux, k); 150 | fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, 151 | (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); 152 | kh_destroy(best, q->left); kh_destroy(best, q->rght); 153 | free((char*)kh_key(aux, k)); 154 | } 155 | } 156 | kh_destroy(lib, aux); 157 | bam_destroy1(b); 158 | kl_destroy(q, queue); 159 | } 160 | -------------------------------------------------------------------------------- /samtools/bam_stat.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "bam.h" 4 | 5 | typedef struct { 6 | long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good; 7 | long long n_sgltn, n_read1, n_read2; 8 | long long n_qcfail, n_dup; 9 | long long n_diffchr, n_diffhigh; 10 | } bam_flagstat_t; 11 | 12 | #define flagstat_loop(s, c) do { \ 13 | ++(s)->n_reads; \ 14 | if ((c)->flag & BAM_FPAIRED) { \ 15 | ++(s)->n_pair_all; \ 16 | if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good; \ 17 | if ((c)->flag & BAM_FREAD1) ++(s)->n_read1; \ 18 | if ((c)->flag & BAM_FREAD2) ++(s)->n_read2; \ 19 | if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn; \ 20 | if (!((c)->flag & 
BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \ 21 | ++(s)->n_pair_map; \ 22 | if ((c)->mtid != (c)->tid) { \ 23 | ++(s)->n_diffchr; \ 24 | if ((c)->qual >= 5) ++(s)->n_diffhigh; \ 25 | } \ 26 | } \ 27 | } \ 28 | if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped; \ 29 | if ((c)->flag & BAM_FQCFAIL) ++(s)->n_qcfail; \ 30 | if ((c)->flag & BAM_FDUP) ++(s)->n_dup; \ 31 | } while (0) 32 | 33 | bam_flagstat_t *bam_flagstat_core(bamFile fp) 34 | { 35 | bam_flagstat_t *s; 36 | bam1_t *b; 37 | bam1_core_t *c; 38 | int ret; 39 | s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t)); 40 | b = bam_init1(); 41 | c = &b->core; 42 | while ((ret = bam_read1(fp, b)) >= 0) 43 | flagstat_loop(s, c); 44 | bam_destroy1(b); 45 | if (ret != -1) 46 | fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n"); 47 | return s; 48 | } 49 | int bam_flagstat(int argc, char *argv[]) 50 | { 51 | bamFile fp; 52 | bam_header_t *header; 53 | bam_flagstat_t *s; 54 | if (argc == optind) { 55 | fprintf(stderr, "Usage: samtools flagstat \n"); 56 | return 1; 57 | } 58 | fp = strcmp(argv[optind], "-")? 
bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); 59 | assert(fp); 60 | header = bam_header_read(fp); 61 | s = bam_flagstat_core(fp); 62 | printf("%lld in total\n", s->n_reads); 63 | printf("%lld QC failure\n", s->n_qcfail); 64 | printf("%lld duplicates\n", s->n_dup); 65 | printf("%lld mapped (%.2f%%)\n", s->n_mapped, (float)s->n_mapped / s->n_reads * 100.0); 66 | printf("%lld paired in sequencing\n", s->n_pair_all); 67 | printf("%lld read1\n", s->n_read1); 68 | printf("%lld read2\n", s->n_read2); 69 | printf("%lld properly paired (%.2f%%)\n", s->n_pair_good, (float)s->n_pair_good / s->n_pair_all * 100.0); 70 | printf("%lld with itself and mate mapped\n", s->n_pair_map); 71 | printf("%lld singletons (%.2f%%)\n", s->n_sgltn, (float)s->n_sgltn / s->n_pair_all * 100.0); 72 | printf("%lld with mate mapped to a different chr\n", s->n_diffchr); 73 | printf("%lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh); 74 | free(s); 75 | bam_header_destroy(header); 76 | bam_close(fp); 77 | return 0; 78 | } 79 | -------------------------------------------------------------------------------- /samtools/bamtk.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "bam.h" 6 | 7 | #ifdef _USE_KNETFILE 8 | #include "knetfile.h" 9 | #endif 10 | 11 | #ifndef PACKAGE_VERSION 12 | #define PACKAGE_VERSION "0.1.7 (r510)" 13 | #endif 14 | 15 | int bam_taf2baf(int argc, char *argv[]); 16 | int bam_pileup(int argc, char *argv[]); 17 | int bam_merge(int argc, char *argv[]); 18 | int bam_index(int argc, char *argv[]); 19 | int bam_sort(int argc, char *argv[]); 20 | int bam_tview_main(int argc, char *argv[]); 21 | int bam_mating(int argc, char *argv[]); 22 | int bam_rmdup(int argc, char *argv[]); 23 | int bam_flagstat(int argc, char *argv[]); 24 | int bam_fillmd(int argc, char *argv[]); 25 | 26 | int main_samview(int argc, char *argv[]); 27 | int main_import(int argc, char 
*argv[]); 28 | 29 | int faidx_main(int argc, char *argv[]); 30 | int glf3_view_main(int argc, char *argv[]); 31 | 32 | int bam_tagview(int argc, char *argv[]) 33 | { 34 | bamFile fp; 35 | bam_header_t *header; 36 | bam1_t *b; 37 | char tag[2]; 38 | int ret; 39 | if (argc < 3) { 40 | fprintf(stderr, "Usage: samtools tagview \n"); 41 | return 1; 42 | } 43 | fp = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r"); 44 | assert(fp); 45 | header = bam_header_read(fp); 46 | if (header == 0) { 47 | fprintf(stderr, "[bam_view] fail to read the BAM header. Abort!\n"); 48 | return 1; 49 | } 50 | tag[0] = argv[2][0]; tag[1] = argv[2][1]; 51 | b = (bam1_t*)calloc(1, sizeof(bam1_t)); 52 | while ((ret = bam_read1(fp, b)) >= 0) { 53 | uint8_t *d = bam_aux_get(b, tag); 54 | if (d) { 55 | printf("%s\t%d\t", bam1_qname(b), b->core.flag); 56 | if (d[0] == 'Z' || d[0] == 'H') printf("%s\n", bam_aux2Z(d)); 57 | else if (d[0] == 'f') printf("%f\n", bam_aux2f(d)); 58 | else if (d[0] == 'd') printf("%lf\n", bam_aux2d(d)); 59 | else if (d[0] == 'A') printf("%c\n", bam_aux2A(d)); 60 | else if (d[0] == 'c' || d[0] == 's' || d[0] == 'i') printf("%d\n", bam_aux2i(d)); 61 | else if (d[0] == 'C' || d[0] == 'S' || d[0] == 'I') printf("%u\n", bam_aux2i(d)); 62 | else printf("\n"); 63 | } 64 | } 65 | if (ret < -1) fprintf(stderr, "[bam_view] truncated file? Continue anyway. 
(%d)\n", ret); 66 | free(b->data); free(b); 67 | bam_header_destroy(header); 68 | bam_close(fp); 69 | return 0; 70 | } 71 | 72 | static int usage() 73 | { 74 | fprintf(stderr, "\n"); 75 | fprintf(stderr, "Program: samtools (Tools for alignments in the SAM format)\n"); 76 | fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION); 77 | fprintf(stderr, "Usage: samtools [options]\n\n"); 78 | fprintf(stderr, "Command: view SAM<->BAM conversion\n"); 79 | fprintf(stderr, " sort sort alignment file\n"); 80 | fprintf(stderr, " pileup generate pileup output\n"); 81 | fprintf(stderr, " faidx index/extract FASTA\n"); 82 | #if _CURSES_LIB != 0 83 | fprintf(stderr, " tview text alignment viewer\n"); 84 | #endif 85 | fprintf(stderr, " index index alignment\n"); 86 | fprintf(stderr, " fixmate fix mate information\n"); 87 | fprintf(stderr, " glfview print GLFv3 file\n"); 88 | fprintf(stderr, " flagstat simple stats\n"); 89 | fprintf(stderr, " calmd recalculate MD/NM tags and '=' bases\n"); 90 | fprintf(stderr, " merge merge sorted alignments\n"); 91 | fprintf(stderr, " rmdup remove PCR duplicates\n"); 92 | fprintf(stderr, "\n"); 93 | return 1; 94 | } 95 | 96 | int main(int argc, char *argv[]) 97 | { 98 | #ifdef _WIN32 99 | setmode(fileno(stdout), O_BINARY); 100 | setmode(fileno(stdin), O_BINARY); 101 | #ifdef _USE_KNETFILE 102 | knet_win32_init(); 103 | #endif 104 | #endif 105 | if (argc < 2) return usage(); 106 | if (strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1); 107 | else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1); 108 | else if (strcmp(argv[1], "pileup") == 0) return bam_pileup(argc-1, argv+1); 109 | else if (strcmp(argv[1], "merge") == 0) return bam_merge(argc-1, argv+1); 110 | else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1); 111 | else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1); 112 | else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1); 113 | else if 
(strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1); 114 | else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1); 115 | else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1); 116 | else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1); 117 | else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1); 118 | else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1); 119 | else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); 120 | #if _CURSES_LIB != 0 121 | else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1); 122 | #endif 123 | else { 124 | fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); 125 | return 1; 126 | } 127 | return 0; 128 | } 129 | -------------------------------------------------------------------------------- /samtools/bgzf.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | */ 23 | 24 | #ifndef __BGZF_H 25 | #define __BGZF_H 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | #ifdef _USE_KNETFILE 32 | #include "knetfile.h" 33 | #endif 34 | 35 | //typedef int8_t bool; 36 | 37 | typedef struct { 38 | int file_descriptor; 39 | char open_mode; // 'r' or 'w' 40 | bool owned_file, is_uncompressed; 41 | #ifdef _USE_KNETFILE 42 | union { 43 | knetFile *fpr; 44 | FILE *fpw; 45 | } x; 46 | #else 47 | FILE* file; 48 | #endif 49 | int uncompressed_block_size; 50 | int compressed_block_size; 51 | void* uncompressed_block; 52 | void* compressed_block; 53 | int64_t block_address; 54 | int block_length; 55 | int block_offset; 56 | int cache_size; 57 | const char* error; 58 | void *cache; // a pointer to a hash table 59 | } BGZF; 60 | 61 | #ifdef __cplusplus 62 | extern "C" { 63 | #endif 64 | 65 | /* 66 | * Open an existing file descriptor for reading or writing. 67 | * Mode must be either "r" or "w". 68 | * A subsequent bgzf_close will not close the file descriptor. 69 | * Returns null on error. 70 | */ 71 | BGZF* bgzf_fdopen(int fd, const char* __restrict mode); 72 | 73 | /* 74 | * Open the specified file for reading or writing. 75 | * Mode must be either "r" or "w". 76 | * Returns null on error. 77 | */ 78 | BGZF* bgzf_open(const char* path, const char* __restrict mode); 79 | 80 | /* 81 | * Close the BGZ file and free all associated resources. 82 | * Does not close the underlying file descriptor if created with bgzf_fdopen. 83 | * Returns zero on success, -1 on error. 84 | */ 85 | int bgzf_close(BGZF* fp); 86 | 87 | /* 88 | * Read up to length bytes from the file storing into data. 89 | * Returns the number of bytes actually read. 
90 | * Returns zero on end of file. 91 | * Returns -1 on error. 92 | */ 93 | int bgzf_read(BGZF* fp, void* data, int length); 94 | 95 | /* 96 | * Write length bytes from data to the file. 97 | * Returns the number of bytes written. 98 | * Returns -1 on error. 99 | */ 100 | int bgzf_write(BGZF* fp, const void* data, int length); 101 | 102 | /* 103 | * Return a virtual file pointer to the current location in the file. 104 | * No interpretation of the value should be made, other than a subsequent 105 | * call to bgzf_seek can be used to position the file at the same point. 106 | * Return value is non-negative on success. 107 | * Returns -1 on error. 108 | */ 109 | int64_t bgzf_tell(BGZF* fp); 110 | 111 | /* 112 | * Set the file to read from the location specified by pos, which must 113 | * be a value previously returned by bgzf_tell for this file (but not 114 | * necessarily one returned by this file handle). 115 | * The where argument must be SEEK_SET. 116 | * Seeking on a file opened for write is not supported. 117 | * Returns zero on success, -1 on error. 118 | */ 119 | int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); 120 | 121 | /* 122 | * Set the cache size. Zero to disable. By default, caching is 123 | * disabled. The recommended cache size for frequent random access is 124 | * about 8M bytes.
125 | */ 126 | void bgzf_set_cache_size(BGZF *fp, int cache_size); 127 | 128 | int bgzf_check_EOF(BGZF *fp); 129 | 130 | #ifdef __cplusplus 131 | } 132 | #endif 133 | 134 | #endif 135 | -------------------------------------------------------------------------------- /samtools/bgzip.c: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | */ 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include "bgzf.h" 31 | 32 | static const int WINDOW_SIZE = 64 * 1024; 33 | 34 | static int bgzip_main_usage() 35 | { 36 | printf("\n"); 37 | printf("Usage: bgzip [options] [file] ...\n\n"); 38 | printf("Options: -c write on standard output, keep original files unchanged\n"); 39 | printf(" -d decompress\n"); 40 | // printf(" -l list compressed file contents\n"); 41 | printf(" -b INT decompress at virtual file pointer INT\n"); 42 | printf(" -s INT decompress INT bytes in the uncompressed file\n"); 43 | printf(" -h give this help\n"); 44 | printf("\n"); 45 | return 0; 46 | } 47 | 48 | static int write_open(const char *fn, int is_forced) 49 | { 50 | int fd = -1; 51 | char c; 52 | if (!is_forced) { 53 | if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) { 54 | printf("bgzip: %s already exists; do you wish to overwrite (y or n)? ", fn); 55 | scanf("%c", &c); 56 | if (c != 'Y' && c != 'y') { 57 | printf("bgzip: not overwritten\n"); 58 | exit(1); 59 | } 60 | } 61 | } 62 | if (fd < 0) { 63 | if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) { 64 | fprintf(stderr, "bgzip: %s: Fail to write\n", fn); 65 | exit(1); 66 | } 67 | } 68 | return fd; 69 | } 70 | 71 | static 72 | void 73 | fail(BGZF* fp) 74 | { 75 | printf("Error: %s\n", fp->error); 76 | exit(1); 77 | } 78 | 79 | int main(int argc, char **argv) 80 | { 81 | int c, compress, pstdout, is_forced; 82 | BGZF *rz; 83 | void *buffer; 84 | long start, end, size; 85 | 86 | compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; 87 | while((c = getopt(argc, argv, "cdlhfb:s:")) >= 0){ 88 | switch(c){ 89 | case 'h': return bgzip_main_usage(); 90 | case 'd': compress = 0; break; 91 | case 'c': pstdout = 1; break; 92 | // case 'l': compress = 2; break; 93 | case 'b': start = atol(optarg); break; 94 | case 's': size = atol(optarg); break; 95 | case 'f': is_forced = 1; 
break; 96 | } 97 | } 98 | if (size >= 0) end = start + size; 99 | if(end >= 0 && end < start){ 100 | fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end); 101 | return 1; 102 | } 103 | if(compress == 1){ 104 | int f_src, f_dst = -1; 105 | if(argc > optind){ 106 | if((f_src = open(argv[optind], O_RDONLY)) < 0){ 107 | fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]); 108 | return 1; 109 | } 110 | if(pstdout){ 111 | f_dst = fileno(stdout); 112 | } else { 113 | char *name = malloc(sizeof(strlen(argv[optind]) + 5)); 114 | strcpy(name, argv[optind]); 115 | strcat(name, ".gz"); 116 | f_dst = write_open(name, is_forced); 117 | if (f_dst < 0) return 1; 118 | free(name); 119 | } 120 | } else if(pstdout){ 121 | f_src = fileno(stdin); 122 | f_dst = fileno(stdout); 123 | } else return bgzip_main_usage(); 124 | rz = bgzf_fdopen(f_dst, "w"); 125 | buffer = malloc(WINDOW_SIZE); 126 | while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) { 127 | if (bgzf_write(rz, buffer, c) < 0) { 128 | fail(rz); 129 | } 130 | } 131 | // f_dst will be closed here 132 | if (bgzf_close(rz) < 0) { 133 | fail(rz); 134 | } 135 | if (argc > optind) unlink(argv[optind]); 136 | free(buffer); 137 | close(f_src); 138 | return 0; 139 | } else { 140 | if(argc <= optind) return bgzip_main_usage(); 141 | int f_dst; 142 | if (argc > optind && !pstdout) { 143 | char *name; 144 | if (strstr(argv[optind], ".gz") - argv[optind] != strlen(argv[optind]) - 3) { 145 | printf("bgzip: %s: unknown suffix -- ignored\n", argv[optind]); 146 | return 1; 147 | } 148 | name = strdup(argv[optind]); 149 | name[strlen(name) - 3] = '\0'; 150 | f_dst = write_open(name, is_forced); 151 | free(name); 152 | } else f_dst = fileno(stdout); 153 | rz = bgzf_open(argv[optind], "r"); 154 | if (rz == NULL) { 155 | printf("Could not open file: %s\n", argv[optind]); 156 | return 1; 157 | } 158 | buffer = malloc(WINDOW_SIZE); 159 | if (bgzf_seek(rz, start, SEEK_SET) < 0) { 160 | fail(rz); 161 | } 162 | while(1){ 163 | 
if(end < 0) c = bgzf_read(rz, buffer, WINDOW_SIZE); 164 | else c = bgzf_read(rz, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); 165 | if(c == 0) break; 166 | if (c < 0) fail(rz); 167 | start += c; 168 | write(f_dst, buffer, c); 169 | if(end >= 0 && start >= end) break; 170 | } 171 | free(buffer); 172 | if (bgzf_close(rz) < 0) { 173 | fail(rz); 174 | } 175 | if (!pstdout) unlink(argv[optind]); 176 | return 0; 177 | } 178 | } 179 | 180 | -------------------------------------------------------------------------------- /samtools/examples/00README.txt: -------------------------------------------------------------------------------- 1 | File ex1.fa contains two sequences cut from the human genome 2 | build36. They were exatracted with command: 3 | 4 | samtools faidx human_b36.fa 2:2043966-2045540 20:67967-69550 5 | 6 | Sequence names were changed manually for simplicity. File ex1.sam.gz 7 | contains MAQ alignments exatracted with: 8 | 9 | (samtools view NA18507_maq.bam 2:2044001-2045500; 10 | samtools view NA18507_maq.bam 20:68001-69500) 11 | 12 | and processed with `samtools fixmate' to make it self-consistent as a 13 | standalone alignment. 
14 | 15 | To try samtools, you may run the following commands: 16 | 17 | samtools faidx ex1.fa # index the reference FASTA 18 | samtools import ex1.fa.fai ex1.sam.gz ex1.bam # SAM->BAM 19 | samtools index ex1.bam # index BAM 20 | samtools tview ex1.bam ex1.fa # view alignment 21 | samtools pileup -cf ex1.fa ex1.bam # pileup and consensus 22 | samtools pileup -cf ex1.fa -t ex1.fa.fai ex1.sam.gz 23 | 24 | -------------------------------------------------------------------------------- /samtools/examples/Makefile: -------------------------------------------------------------------------------- 1 | all:../libbam.a ../samtools ex1.glf ex1.pileup.gz ex1.bam.bai ex1f-rmduppe.bam ex1f-rmdupse.bam ex1.glfview.gz calDepth 2 | @echo; echo \# You can now launch the viewer with: \'samtools tview ex1.bam ex1.fa\'; echo; 3 | 4 | ex1.fa.fai:ex1.fa 5 | ../samtools faidx ex1.fa 6 | ex1.bam:ex1.sam.gz ex1.fa.fai 7 | ../samtools import ex1.fa.fai ex1.sam.gz ex1.bam 8 | ex1.bam.bai:ex1.bam 9 | ../samtools index ex1.bam 10 | ex1.pileup.gz:ex1.bam ex1.fa 11 | ../samtools pileup -cf ex1.fa ex1.bam | gzip > ex1.pileup.gz 12 | ex1.glf:ex1.bam ex1.fa 13 | ../samtools pileup -gf ex1.fa ex1.bam > ex1.glf 14 | ex1.glfview.gz:ex1.glf 15 | ../samtools glfview ex1.glf | gzip > ex1.glfview.gz 16 | ex1a.bam:ex1.bam 17 | ../samtools view -h ex1.bam | awk 'BEGIN{FS=OFS="\t"}{if(/^@/)print;else{$$1=$$1"a";print}}' | ../samtools view -bS - > $@ 18 | ex1b.bam:ex1.bam 19 | ../samtools view -h ex1.bam | awk 'BEGIN{FS=OFS="\t"}{if(/^@/)print;else{$$1=$$1"b";print}}' | ../samtools view -bS - > $@ 20 | ex1f.rg: 21 | (echo "@RG ID:ex1 LB:ex1"; echo "@RG ID:ex1a LB:ex1"; echo "@RG ID:ex1b LB:ex1b") > $@ 22 | ex1f.bam:ex1.bam ex1a.bam ex1b.bam ex1f.rg 23 | ../samtools merge -rh ex1f.rg $@ ex1.bam ex1a.bam ex1b.bam 24 | ex1f-rmduppe.bam:ex1f.bam 25 | ../samtools rmdup ex1f.bam $@ 26 | ex1f-rmdupse.bam:ex1f.bam 27 | ../samtools rmdup -S ex1f.bam $@ 28 | 29 | ../samtools: 30 | (cd ..; make samtools) 31 | 32 | 
../libbam.a: 33 | (cd ..; make libbam.a) 34 | 35 | calDepth:../libbam.a calDepth.c 36 | gcc -g -Wall -O2 -I.. calDepth.c -o $@ -lm -lz -L.. -lbam 37 | 38 | clean: 39 | rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM ex1*.rg -------------------------------------------------------------------------------- /samtools/examples/calDepth.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "sam.h" 3 | 4 | typedef struct { 5 | int beg, end; 6 | samfile_t *in; 7 | } tmpstruct_t; 8 | 9 | // callback for bam_fetch() 10 | static int fetch_func(const bam1_t *b, void *data) 11 | { 12 | bam_plbuf_t *buf = (bam_plbuf_t*)data; 13 | bam_plbuf_push(b, buf); 14 | return 0; 15 | } 16 | // callback for bam_plbuf_init() 17 | static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) 18 | { 19 | tmpstruct_t *tmp = (tmpstruct_t*)data; 20 | if ((int)pos >= tmp->beg && (int)pos < tmp->end) 21 | printf("%s\t%d\t%d\n", tmp->in->header->target_name[tid], pos + 1, n); 22 | return 0; 23 | } 24 | 25 | int main(int argc, char *argv[]) 26 | { 27 | tmpstruct_t tmp; 28 | if (argc == 1) { 29 | fprintf(stderr, "Usage: calDepth [region]\n"); 30 | return 1; 31 | } 32 | tmp.beg = 0; tmp.end = 0x7fffffff; 33 | tmp.in = samopen(argv[1], "rb", 0); 34 | if (tmp.in == 0) { 35 | fprintf(stderr, "Fail to open BAM file %s\n", argv[1]); 36 | return 1; 37 | } 38 | if (argc == 2) { // if a region is not specified 39 | sampileup(tmp.in, -1, pileup_func, &tmp); 40 | } else { 41 | int ref; 42 | bam_index_t *idx; 43 | bam_plbuf_t *buf; 44 | idx = bam_index_load(argv[1]); // load BAM index 45 | if (idx == 0) { 46 | fprintf(stderr, "BAM indexing file is not available.\n"); 47 | return 1; 48 | } 49 | bam_parse_region(tmp.in->header, argv[2], &ref, &tmp.beg, &tmp.end); // parse the region 50 | if (ref < 0) { 51 | fprintf(stderr, "Invalid region %s\n", argv[2]); 52 | return 1; 53 | } 54 | buf = bam_plbuf_init(pileup_func, 
&tmp); // initialize pileup 55 | bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, fetch_func); 56 | bam_plbuf_push(0, buf); // finalize pileup 57 | bam_index_destroy(idx); 58 | bam_plbuf_destroy(buf); 59 | } 60 | samclose(tmp.in); 61 | return 0; 62 | } 63 | -------------------------------------------------------------------------------- /samtools/examples/ex1.fa: -------------------------------------------------------------------------------- 1 | >seq1 2 | CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCCATGGCCCAGCATTAGGGAGCT 3 | GTGGACCCTGCAGCCTGGCTGTGGGGGCCGCAGTGGCTGAGGGGTGCAGAGCCGAGTCAC 4 | GGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAG 5 | TCCCATTTGGAGCCCCTCTAAGCCGTTCTATTTGTAATGAAAACTATATTTATGCTATTC 6 | AGTTCTAAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAA 7 | CAACCTTGAGAACCCCAGGGAATTTGTCAATGTCAGGGAAGGAGCATTTTGTCAGTTACC 8 | AAATGTGTTTATTACCAGAGGGATGGAGGGAAGAGGGACGCTGAAGAACTTTGATGCCCT 9 | CTTCTTCCAAAGATGAAACGCGTAACTGCGCTCTCATTCACTCCAGCTCCCTGTCACCCA 10 | ATGGACCTGTGATATCTGGATTCTGGGAAATTCTTCATCCTGGACCCTGAGAGATTCTGC 11 | AGCCCAGCTCCAGATTGCTTGTGGTCTGACAGGCTGCAACTGTGAGCCATCACAATGAAC 12 | AACAGGAAGAAAAGGTCTTTCAAAAGGTGATGTGTGTTCTCATCAACCTCATACACACAC 13 | ATGGTTTAGGGGTATAATACCTCTACATGGCTGATTATGAAAACAATGTTCCCCAGATAC 14 | CATCCCTGTCTTACTTCCAGCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCT 15 | TTTGGCATTTGCCTTCAGACCCTACACGAATGCGTCTCTACCACAGGGGGCTGCGCGGTT 16 | TCCCATCATGAAGCACTGAACTTCCACGTCTCATCTAGGGGAACAGGGAGGTGCACTAAT 17 | GCGCTCCACGCCCAAGCCCTTCTCACAGTTTCTGCCCCCAGCATGGTTGTACTGGGCAAT 18 | ACATGAGATTATTAGGAAATGCTTTACTGTCATAACTATGAAGAGACTATTGCCAGATGA 19 | ACCACACATTAATACTATGTTTCTTATCTGCACATTACTACCCTGCAATTAATATAATTG 20 | TGTCCATGTACACACGCTGTCCTATGTACTTATCATGACTCTATCCCAAATTCCCAATTA 21 | CGTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGTATCAATTTGGTGTTCTGTGTAAAG 22 | TCTCAGGGAGCCGTCCGTGTCCTCCCATCTGGCCTCGTCCACACTGGTTCTCTTGAAAGC 23 | TTGGGCTGTAATGATGCCCCTTGGCCATCACCCAGTCCCTGCCCCATCTCTTGTAATCTC 24 | TCTCCTTTTTGCTGCATCCCTGTCTTCCTCTGTCTTGATTTACTTGTTGTTGGTTTTCTG 25 | 
TTTCTTTGTTTGATTTGGTGGAAGACATAATCCCACGCTTCCTATGGAAAGGTTGTTGGG 26 | AGATTTTTAATGATTCCTCAATGTTAAAATGTCTATTTTTGTCTTGACACCCAACTAATA 27 | TTTGTCTGAGCAAAACAGTCTAGATGAGAGAGAACTTCCCTGGAGGTCTGATGGCGTTTC 28 | TCCCTCGTCTTCTTA 29 | >seq2 30 | TTCAAATGAACTTCTGTAATTGAAAAATTCATTTAAGAAATTACAAAATATAGTTGAAAG 31 | CTCTAACAATAGACTAAACCAAGCAGAAGAAAGAGGTTCAGAACTTGAAGACAAGTCTCT 32 | TATGAATTAACCCAGTCAGACAAAAATAAAGAAAAAAATTTTAAAAATGAACAGAGCTTT 33 | CAAGAAGTATGAGATTATGTAAAGTAACTGAACCTATGAGTCACAGGTATTCCTGAGGAA 34 | AAAGAAAAAGTGAGAAGTTTGGAAAAACTATTTGAGGAAGTAATTGGGGAAAACCTCTTT 35 | AGTCTTGCTAGAGATTTAGACATCTAAATGAAAGAGGCTCAAAGAATGCCAGGAAGATAC 36 | ATTGCAAGACAGACTTCATCAAGATATGTAGTCATCAGACTATCTAAAGTCAACATGAAG 37 | GAAAAAAATTCTAAAATCAGCAAGAGAAAAGCATACAGTCATCTATAAAGGAAATCCCAT 38 | CAGAATAACAATGGGCTTCTCAGCAGAAACCTTACAAGCCAGAAGAGATTGGATCTAATT 39 | TTTGGACTTCTTAAAGAAAAAAAAACCTGTCAAACACGAATGTTATGCCCTGCTAAACTA 40 | AGCATCATAAATGAAGGGGAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATA 41 | ATTCATCATCACTAAACCAGTCCTATAAGAAATGCTCAAAAGAATTGTAAAAGTCAAAAT 42 | TAAAGTTCAATACTCACCATCATAAATACACACAAAAGTACAAAACTCACAGGTTTTATA 43 | AAACAATTGAGACTACAGAGCAACTAGGTAAAAAATTAACATTACAACAGGAACAAAACC 44 | TCATATATCAATATTAACTTTGAATAAAAAGGGATTAAATTCCCCCACTTAAGAGATATA 45 | GATTGGCAGAACAGATTTAAAAACATGAACTAACTATATGCTGTTTACAAGAAACTCATT 46 | AATAAAGACATGAGTTCAGGTAAAGGGGTGGAAAAAGATGTTCTACGCAAACAGAAACCA 47 | AATGAGAGAAGGAGTAGCTATACTTATATCAGATAAAGCACACTTTAAATCAACAACAGT 48 | AAAATAAAACAAAGGAGGTCATCATACAATGATAAAAAGATCAATTCAGCAAGAAGATAT 49 | AACCATCCTACTAAATACATATGCACCTAACACAAGACTACCCAGATTCATAAAACAAAT 50 | ACTACTAGACCTAAGAGGGATGAGAAATTACCTAATTGGTACAATGTACAATATTCTGAT 51 | GATGGTTACACTAAAAGCCCATACTTTACTGCTACTCAATATATCCATGTAACAAATCTG 52 | CGCTTGTACTTCTAAATCTATAAAAAAATTAAAATTTAACAAAAGTAAATAAAACACATA 53 | GCTAAAACTAAAAAAGCAAAAACAAAAACTATGCTAAGTATTGGTAAAGATGTGGGGAAA 54 | AAAGTAAACTCTCAAATATTGCTAGTGGGAGTATAAATTGTTTTCCACTTTGGAAAACAA 55 | TTTGGTAATTTCGTTTTTTTTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTTGCATGC 56 | CAGAAAAAAATATTTACAGTAACT 57 | 
-------------------------------------------------------------------------------- /samtools/examples/ex1.sam.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genome-vendor/bsmap/44ab0345d682a4e54a0c25ad2aebf9392a2a0936/samtools/examples/ex1.sam.gz -------------------------------------------------------------------------------- /samtools/faidx.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008 Genome Research Ltd (GRL). 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Contact: Heng Li */ 27 | 28 | #ifndef FAIDX_H 29 | #define FAIDX_H 30 | 31 | /*! 32 | @header 33 | 34 | Index FASTA files and extract subsequence. 35 | 36 | @copyright The Wellcome Trust Sanger Institute. 
37 | */ 38 | 39 | struct __faidx_t; 40 | typedef struct __faidx_t faidx_t; 41 | 42 | #ifdef __cplusplus 43 | extern "C" { 44 | #endif 45 | 46 | /*! 47 | @abstract Build index for a FASTA or razip compressed FASTA file. 48 | @param fn FASTA file name 49 | @return 0 on success; or -1 on failure 50 | @discussion File "fn.fai" will be generated. 51 | */ 52 | int fai_build(const char *fn); 53 | 54 | /*! 55 | @abstract Destroy a faidx_t struct. 56 | @param fai Pointer to the struct to be destroyed 57 | */ 58 | void fai_destroy(faidx_t *fai); 59 | 60 | /*! 61 | @abstract Load index from "fn.fai". 62 | @param fn File name of the FASTA file 63 | */ 64 | faidx_t *fai_load(const char *fn); 65 | 66 | /*! 67 | @abstract Fetch the sequence in a region. 68 | @param fai Pointer to the faidx_t struct 69 | @param reg Region in the format "chr2:20,000-30,000" 70 | @param len Length of the region 71 | @return Pointer to the sequence; null on failure 72 | 73 | @discussion The returned sequence is allocated by malloc family 74 | and should be destroyed by end users by calling free() on it. 75 | */ 76 | char *fai_fetch(const faidx_t *fai, const char *reg, int *len); 77 | 78 | /*! 79 | @abstract Fetch the number of sequences. 80 | @param fai Pointer to the faidx_t struct 81 | @return The number of sequences 82 | */ 83 | int faidx_fetch_nseq(const faidx_t *fai); 84 | 85 | /*! 86 | @abstract Fetch the sequence in a region. 87 | @param fai Pointer to the faidx_t struct 88 | @param c_name Region name 89 | @param p_beg_i Beginning position number (zero-based) 90 | @param p_end_i End position number (zero-based) 91 | @param len Length of the region 92 | @return Pointer to the sequence; null on failure 93 | 94 | @discussion The returned sequence is allocated by malloc family 95 | and should be destroyed by end users by calling free() on it.
96 | */ 97 | char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len); 98 | 99 | #ifdef __cplusplus 100 | } 101 | #endif 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /samtools/glf.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "glf.h" 4 | 5 | #ifdef _NO_BGZF 6 | // then alias bgzf_*() functions 7 | #endif 8 | 9 | static int glf3_is_BE = 0; 10 | 11 | static inline uint32_t bam_swap_endian_4(uint32_t v) 12 | { 13 | v = ((v & 0x0000FFFFU) << 16) | (v >> 16); 14 | return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); 15 | } 16 | 17 | static inline uint16_t bam_swap_endian_2(uint16_t v) 18 | { 19 | return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); 20 | } 21 | 22 | static inline int bam_is_big_endian() 23 | { 24 | long one= 1; 25 | return !(*((char *)(&one))); 26 | } 27 | 28 | glf3_header_t *glf3_header_init() 29 | { 30 | glf3_is_BE = bam_is_big_endian(); 31 | return (glf3_header_t*)calloc(1, sizeof(glf3_header_t)); 32 | } 33 | 34 | glf3_header_t *glf3_header_read(glfFile fp) 35 | { 36 | glf3_header_t *h; 37 | char magic[4]; 38 | h = glf3_header_init(); 39 | bgzf_read(fp, magic, 4); 40 | if (strncmp(magic, "GLF\3", 4)) { 41 | fprintf(stderr, "[glf3_header_read] invalid magic.\n"); 42 | glf3_header_destroy(h); 43 | return 0; 44 | } 45 | bgzf_read(fp, &h->l_text, 4); 46 | if (glf3_is_BE) h->l_text = bam_swap_endian_4(h->l_text); 47 | if (h->l_text) { 48 | h->text = (uint8_t*)calloc(h->l_text + 1, 1); 49 | bgzf_read(fp, h->text, h->l_text); 50 | } 51 | return h; 52 | } 53 | 54 | void glf3_header_write(glfFile fp, const glf3_header_t *h) 55 | { 56 | int32_t x; 57 | bgzf_write(fp, "GLF\3", 4); 58 | x = glf3_is_BE? 
bam_swap_endian_4(h->l_text) : h->l_text; 59 | bgzf_write(fp, &x, 4); 60 | if (h->l_text) bgzf_write(fp, h->text, h->l_text); 61 | } 62 | 63 | void glf3_header_destroy(glf3_header_t *h) 64 | { 65 | free(h->text); 66 | free(h); 67 | } 68 | 69 | char *glf3_ref_read(glfFile fp, int *len) 70 | { 71 | int32_t n, x; 72 | char *str; 73 | *len = 0; 74 | if (bgzf_read(fp, &n, 4) != 4) return 0; 75 | if (glf3_is_BE) n = bam_swap_endian_4(n); 76 | if (n < 0) { 77 | fprintf(stderr, "[glf3_ref_read] invalid reference name length: %d.\n", n); 78 | return 0; 79 | } 80 | str = (char*)calloc(n + 1, 1); // not necesarily n+1 in fact 81 | x = bgzf_read(fp, str, n); 82 | x += bgzf_read(fp, len, 4); 83 | if (x != n + 4) { 84 | free(str); *len = -1; return 0; // truncated 85 | } 86 | if (glf3_is_BE) *len = bam_swap_endian_4(*len); 87 | return str; 88 | } 89 | 90 | void glf3_ref_write(glfFile fp, const char *str, int len) 91 | { 92 | int32_t m, n = strlen(str) + 1; 93 | m = glf3_is_BE? bam_swap_endian_4(n) : n; 94 | bgzf_write(fp, &m, 4); 95 | bgzf_write(fp, str, n); 96 | if (glf3_is_BE) len = bam_swap_endian_4(len); 97 | bgzf_write(fp, &len, 4); 98 | } 99 | 100 | void glf3_view1(const char *ref_name, const glf3_t *g3, int pos) 101 | { 102 | int j; 103 | if (g3->rtype == GLF3_RTYPE_END) return; 104 | printf("%s\t%d\t%c\t%d\t%d\t%d", ref_name, pos + 1, 105 | g3->rtype == GLF3_RTYPE_INDEL? '*' : "XACMGRSVTWYHKDBN"[g3->ref_base], 106 | g3->depth, g3->rms_mapQ, g3->min_lk); 107 | if (g3->rtype == GLF3_RTYPE_SUB) 108 | for (j = 0; j != 10; ++j) printf("\t%d", g3->lk[j]); 109 | else { 110 | printf("\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t", g3->lk[0], g3->lk[1], g3->lk[2], g3->indel_len[0], g3->indel_len[1], 111 | g3->indel_len[0]? g3->indel_seq[0] : "*", g3->indel_len[1]? 
g3->indel_seq[1] : "*"); 112 | } 113 | printf("\n"); 114 | } 115 | 116 | int glf3_write1(glfFile fp, const glf3_t *g3) 117 | { 118 | int r; 119 | uint8_t c; 120 | uint32_t y[2]; 121 | c = g3->rtype<<4 | g3->ref_base; 122 | r = bgzf_write(fp, &c, 1); 123 | if (g3->rtype == GLF3_RTYPE_END) return r; 124 | y[0] = g3->offset; 125 | y[1] = g3->min_lk<<24 | g3->depth; 126 | if (glf3_is_BE) { 127 | y[0] = bam_swap_endian_4(y[0]); 128 | y[1] = bam_swap_endian_4(y[1]); 129 | } 130 | r += bgzf_write(fp, y, 8); 131 | r += bgzf_write(fp, &g3->rms_mapQ, 1); 132 | if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_write(fp, g3->lk, 10); 133 | else { 134 | int16_t x[2]; 135 | r += bgzf_write(fp, g3->lk, 3); 136 | x[0] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[0]) : g3->indel_len[0]; 137 | x[1] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[1]) : g3->indel_len[1]; 138 | r += bgzf_write(fp, x, 4); 139 | if (g3->indel_len[0]) r += bgzf_write(fp, g3->indel_seq[0], abs(g3->indel_len[0])); 140 | if (g3->indel_len[1]) r += bgzf_write(fp, g3->indel_seq[1], abs(g3->indel_len[1])); 141 | } 142 | return r; 143 | } 144 | 145 | #ifndef kv_roundup32 146 | #define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 147 | #endif 148 | 149 | int glf3_read1(glfFile fp, glf3_t *g3) 150 | { 151 | int r; 152 | uint8_t c; 153 | uint32_t y[2]; 154 | r = bgzf_read(fp, &c, 1); 155 | if (r == 0) return 0; 156 | g3->ref_base = c & 0xf; 157 | g3->rtype = c>>4; 158 | if (g3->rtype == GLF3_RTYPE_END) return r; 159 | r += bgzf_read(fp, y, 8); 160 | if (glf3_is_BE) { 161 | y[0] = bam_swap_endian_4(y[0]); 162 | y[1] = bam_swap_endian_4(y[1]); 163 | } 164 | g3->offset = y[0]; 165 | g3->min_lk = y[1]>>24; 166 | g3->depth = y[1]<<8>>8; 167 | r += bgzf_read(fp, &g3->rms_mapQ, 1); 168 | if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_read(fp, g3->lk, 10); 169 | else { 170 | int16_t x[2], max; 171 | r += bgzf_read(fp, g3->lk, 3); 172 | r += bgzf_read(fp, x, 4); 173 | if (glf3_is_BE) 
{ 174 | x[0] = bam_swap_endian_2(x[0]); 175 | x[1] = bam_swap_endian_2(x[1]); 176 | } 177 | g3->indel_len[0] = x[0]; 178 | g3->indel_len[1] = x[1]; 179 | x[0] = abs(x[0]); x[1] = abs(x[1]); 180 | max = (x[0] > x[1]? x[0] : x[1]) + 1; 181 | if (g3->max_len < max) { 182 | g3->max_len = max; 183 | kv_roundup32(g3->max_len); 184 | g3->indel_seq[0] = (char*)realloc(g3->indel_seq[0], g3->max_len); 185 | g3->indel_seq[1] = (char*)realloc(g3->indel_seq[1], g3->max_len); 186 | } 187 | r += bgzf_read(fp, g3->indel_seq[0], x[0]); 188 | r += bgzf_read(fp, g3->indel_seq[1], x[1]); 189 | g3->indel_seq[0][x[0]] = g3->indel_seq[1][x[1]] = 0; 190 | } 191 | return r; 192 | } 193 | 194 | void glf3_view(glfFile fp) 195 | { 196 | glf3_header_t *h; 197 | char *name; 198 | glf3_t *g3; 199 | int len; 200 | h = glf3_header_read(fp); 201 | g3 = glf3_init1(); 202 | while ((name = glf3_ref_read(fp, &len)) != 0) { 203 | int pos = 0; 204 | while (glf3_read1(fp, g3) && g3->rtype != GLF3_RTYPE_END) { 205 | pos += g3->offset; 206 | glf3_view1(name, g3, pos); 207 | } 208 | free(name); 209 | } 210 | glf3_header_destroy(h); 211 | glf3_destroy1(g3); 212 | } 213 | 214 | int glf3_view_main(int argc, char *argv[]) 215 | { 216 | glfFile fp; 217 | if (argc == 1) { 218 | fprintf(stderr, "Usage: glfview \n"); 219 | return 1; 220 | } 221 | fp = (strcmp(argv[1], "-") == 0)? 
bgzf_fdopen(fileno(stdin), "r") : bgzf_open(argv[1], "r"); 222 | if (fp == 0) { 223 | fprintf(stderr, "Fail to open file '%s'\n", argv[1]); 224 | return 1; 225 | } 226 | glf3_view(fp); 227 | bgzf_close(fp); 228 | return 0; 229 | } 230 | 231 | #ifdef GLFVIEW_MAIN 232 | int main(int argc, char *argv[]) 233 | { 234 | return glf3_view_main(argc, argv); 235 | } 236 | #endif 237 | -------------------------------------------------------------------------------- /samtools/glf.h: -------------------------------------------------------------------------------- 1 | #ifndef GLF_H_ 2 | #define GLF_H_ 3 | 4 | typedef struct { 5 | unsigned char ref_base:4, dummy:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */ 6 | unsigned char max_mapQ; /** maximum mapping quality */ 7 | unsigned char lk[10]; /** log likelihood ratio, capped at 255 */ 8 | unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ 9 | } glf1_t; 10 | 11 | #include 12 | #include "bgzf.h" 13 | typedef BGZF *glfFile; 14 | 15 | #define GLF3_RTYPE_END 0 16 | #define GLF3_RTYPE_SUB 1 17 | #define GLF3_RTYPE_INDEL 2 18 | 19 | typedef struct { 20 | uint8_t ref_base:4, rtype:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */ 21 | uint8_t rms_mapQ; /** RMS mapping quality */ 22 | uint8_t lk[10]; /** log likelihood ratio, capped at 255 */ 23 | uint32_t min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ 24 | int32_t offset; /** the first base in a chromosome has offset zero. 
*/ 25 | // for indel (lkHom1, lkHom2 and lkHet are the first three elements in lk[10]) 26 | int16_t indel_len[2]; 27 | int32_t max_len; // maximum indel len; will be modified by glf3_read1() 28 | char *indel_seq[2]; 29 | } glf3_t; 30 | 31 | typedef struct { 32 | int32_t l_text; 33 | uint8_t *text; 34 | } glf3_header_t; 35 | 36 | #ifdef __cplusplus 37 | extern "C" { 38 | #endif 39 | 40 | #define glf3_init1() ((glf3_t*)calloc(1, sizeof(glf3_t))) 41 | #define glf3_destroy1(g3) do { free((g3)->indel_seq[0]); free((g3)->indel_seq[1]); free(g3); } while (0) 42 | 43 | glf3_header_t *glf3_header_init(); 44 | glf3_header_t *glf3_header_read(glfFile fp); 45 | void glf3_header_write(glfFile fp, const glf3_header_t *h); 46 | void glf3_header_destroy(glf3_header_t *h); 47 | char *glf3_ref_read(glfFile fp, int *len); 48 | void glf3_ref_write(glfFile fp, const char *name, int len); 49 | int glf3_write1(glfFile fp, const glf3_t *g3); 50 | int glf3_read1(glfFile fp, glf3_t *g3); 51 | 52 | #ifdef __cplusplus 53 | } 54 | #endif 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /samtools/kaln.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2003-2006, 2008, 2009 by Heng Li 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 
15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | #ifndef LH3_KALN_H_ 27 | #define LH3_KALN_H_ 28 | 29 | #include 30 | 31 | #define MINOR_INF -1073741823 32 | 33 | typedef struct { 34 | int gap_open; 35 | int gap_ext; 36 | int gap_end; 37 | 38 | int *matrix; 39 | int row; 40 | int band_width; 41 | } ka_param_t; 42 | 43 | #ifdef __cplusplus 44 | extern "C" { 45 | #endif 46 | 47 | uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar); 48 | 49 | #ifdef __cplusplus 50 | } 51 | #endif 52 | 53 | extern ka_param_t ka_param_blast; /* = { 5, 2, 2, aln_sm_blast, 5, 50 }; */ 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /samtools/klist.h: -------------------------------------------------------------------------------- 1 | #ifndef _LH3_KLIST_H 2 | #define _LH3_KLIST_H 3 | 4 | #include 5 | 6 | #define KMEMPOOL_INIT(name, kmptype_t, kmpfree_f) \ 7 | typedef struct { \ 8 | size_t cnt, n, max; \ 9 | kmptype_t **buf; \ 10 | } kmp_##name##_t; \ 11 | static inline kmp_##name##_t *kmp_init_##name() { \ 12 | return calloc(1, sizeof(kmp_##name##_t)); \ 13 | } \ 14 | static inline void kmp_destroy_##name(kmp_##name##_t *mp) { \ 15 | size_t k; \ 16 | for (k = 0; k < mp->n; ++k) { \ 17 | kmpfree_f(mp->buf[k]); free(mp->buf[k]); \ 18 | } \ 19 | free(mp->buf); free(mp); \ 20 | } \ 21 | static inline kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \ 22 | ++mp->cnt; \ 23 | if (mp->n == 0) return 
calloc(1, sizeof(kmptype_t)); \ 24 | return mp->buf[--mp->n]; \ 25 | } \ 26 | static inline void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \ 27 | --mp->cnt; \ 28 | if (mp->n == mp->max) { \ 29 | mp->max = mp->max? mp->max<<1 : 16; \ 30 | mp->buf = realloc(mp->buf, sizeof(void*) * mp->max); \ 31 | } \ 32 | mp->buf[mp->n++] = p; \ 33 | } 34 | 35 | #define kmempool_t(name) kmp_##name##_t 36 | #define kmp_init(name) kmp_init_##name() 37 | #define kmp_destroy(name, mp) kmp_destroy_##name(mp) 38 | #define kmp_alloc(name, mp) kmp_alloc_##name(mp) 39 | #define kmp_free(name, mp, p) kmp_free_##name(mp, p) 40 | 41 | #define KLIST_INIT(name, kltype_t, kmpfree_t) \ 42 | struct __kl1_##name { \ 43 | kltype_t data; \ 44 | struct __kl1_##name *next; \ 45 | }; \ 46 | typedef struct __kl1_##name kl1_##name; \ 47 | KMEMPOOL_INIT(name, kl1_##name, kmpfree_t) \ 48 | typedef struct { \ 49 | kl1_##name *head, *tail; \ 50 | kmp_##name##_t *mp; \ 51 | size_t size; \ 52 | } kl_##name##_t; \ 53 | static inline kl_##name##_t *kl_init_##name() { \ 54 | kl_##name##_t *kl = calloc(1, sizeof(kl_##name##_t)); \ 55 | kl->mp = kmp_init(name); \ 56 | kl->head = kl->tail = kmp_alloc(name, kl->mp); \ 57 | kl->head->next = 0; \ 58 | return kl; \ 59 | } \ 60 | static inline void kl_destroy_##name(kl_##name##_t *kl) { \ 61 | kl1_##name *p; \ 62 | for (p = kl->head; p != kl->tail; p = p->next) \ 63 | kmp_free(name, kl->mp, p); \ 64 | kmp_free(name, kl->mp, p); \ 65 | kmp_destroy(name, kl->mp); \ 66 | free(kl); \ 67 | } \ 68 | static inline kltype_t *kl_pushp_##name(kl_##name##_t *kl) { \ 69 | kl1_##name *q, *p = kmp_alloc(name, kl->mp); \ 70 | q = kl->tail; p->next = 0; kl->tail->next = p; kl->tail = p; \ 71 | ++kl->size; \ 72 | return &q->data; \ 73 | } \ 74 | static inline int kl_shift_##name(kl_##name##_t *kl, kltype_t *d) { \ 75 | kl1_##name *p; \ 76 | if (kl->head->next == 0) return -1; \ 77 | --kl->size; \ 78 | p = kl->head; kl->head = kl->head->next; \ 79 | if (d) *d = p->data; \ 80 | 
kmp_free(name, kl->mp, p); \ 81 | return 0; \ 82 | } 83 | 84 | #define kliter_t(name) kl1_##name 85 | #define klist_t(name) kl_##name##_t 86 | #define kl_val(iter) ((iter)->data) 87 | #define kl_next(iter) ((iter)->next) 88 | #define kl_begin(kl) ((kl)->head) 89 | #define kl_end(kl) ((kl)->tail) 90 | 91 | #define kl_init(name) kl_init_##name() 92 | #define kl_destroy(name, kl) kl_destroy_##name(kl) 93 | #define kl_pushp(name, kl) kl_pushp_##name(kl) 94 | #define kl_shift(name, kl, d) kl_shift_##name(kl, d) 95 | 96 | #endif 97 | -------------------------------------------------------------------------------- /samtools/knetfile.h: -------------------------------------------------------------------------------- 1 | #ifndef KNETFILE_H 2 | #define KNETFILE_H 3 | 4 | #include 5 | #include 6 | 7 | #ifndef _WIN32 8 | #define netread(fd, ptr, len) read(fd, ptr, len) 9 | #define netwrite(fd, ptr, len) write(fd, ptr, len) 10 | #define netclose(fd) close(fd) 11 | #else 12 | #include 13 | #define netread(fd, ptr, len) recv(fd, ptr, len, 0) 14 | #define netwrite(fd, ptr, len) send(fd, ptr, len, 0) 15 | #define netclose(fd) closesocket(fd) 16 | #endif 17 | 18 | // FIXME: currently I/O is unbuffered 19 | 20 | #define KNF_TYPE_LOCAL 1 21 | #define KNF_TYPE_FTP 2 22 | #define KNF_TYPE_HTTP 3 23 | 24 | typedef struct knetFile_s { 25 | int type, fd; 26 | int64_t offset; 27 | char *host, *port; 28 | 29 | // the following are for FTP only 30 | int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; 31 | char *response, *retr, *size_cmd; 32 | int64_t seek_offset; // for lazy seek 33 | int64_t file_size; 34 | 35 | // the following are for HTTP only 36 | char *path, *http_host; 37 | } knetFile; 38 | 39 | #define knet_tell(fp) ((fp)->offset) 40 | #define knet_fileno(fp) ((fp)->fd) 41 | 42 | #ifdef __cplusplus 43 | extern "C" { 44 | #endif 45 | 46 | #ifdef _WIN32 47 | int knet_win32_init(); 48 | void knet_win32_destroy(); 49 | #endif 50 | 51 | knetFile *knet_open(const char 
*fn, const char *mode); 52 | 53 | /* 54 | This only works with local files. 55 | */ 56 | knetFile *knet_dopen(int fd, const char *mode); 57 | 58 | /* 59 | If ->is_ready==0, this routine updates ->fd; otherwise, it simply 60 | reads from ->fd. 61 | */ 62 | off_t knet_read(knetFile *fp, void *buf, off_t len); 63 | 64 | /* 65 | This routine only sets ->offset and ->is_ready=0. It does not 66 | communicate with the FTP server. 67 | */ 68 | off_t knet_seek(knetFile *fp, int64_t off, int whence); 69 | int knet_close(knetFile *fp); 70 | 71 | #ifdef __cplusplus 72 | } 73 | #endif 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /samtools/kstring.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "kstring.h" 7 | // Append printf-style formatted output to s, growing s->s when the first attempt does not fit; returns the vsnprintf() character count. 8 | int ksprintf(kstring_t *s, const char *fmt, ...) 9 | { 10 | va_list ap; 11 | int l; 12 | va_start(ap, fmt); 13 | l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'. 14 | va_end(ap); 15 | if (l + 1 > s->m - s->l) { 16 | s->m = s->l + l + 2; 17 | kroundup32(s->m); 18 | s->s = (char*)realloc(s->s, s->m); 19 | va_start(ap, fmt); 20 | l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); 21 | va_end(ap); // each va_start() is paired with exactly one va_end() 22 | } 23 | s->l += l; 24 | return l; 25 | } 26 | 27 | // s MUST BE a null terminated string; l = strlen(s) 28 | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) 29 | { 30 | int i, n, max, last_char, last_start, *offsets, l; 31 | n = 0; max = *_max; offsets = *_offsets; 32 | l = strlen(s); 33 | 34 | #define __ksplit_aux do { \ 35 | if (_offsets) { \ 36 | s[i] = 0; \ 37 | if (n == max) { \ 38 | max = max? 
max<<1 : 2; \ 39 | offsets = (int*)realloc(offsets, sizeof(int) * max); \ 40 | } \ 41 | offsets[n++] = last_start; \ 42 | } else ++n; \ 43 | } while (0) 44 | 45 | for (i = 0, last_char = last_start = 0; i <= l; ++i) { 46 | if (delimiter == 0) { 47 | if (isspace(s[i]) || s[i] == 0) { 48 | if (isgraph(last_char)) __ksplit_aux; // the end of a field 49 | } else { 50 | if (isspace(last_char) || last_char == 0) last_start = i; 51 | } 52 | } else { 53 | if (s[i] == delimiter || s[i] == 0) { 54 | if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field 55 | } else { 56 | if (last_char == delimiter || last_char == 0) last_start = i; 57 | } 58 | } 59 | last_char = s[i]; 60 | } 61 | *_max = max; *_offsets = offsets; 62 | return n; 63 | } 64 | 65 | /********************** 66 | * Boyer-Moore search * 67 | **********************/ 68 | // Build the Boyer-Moore shift tables for pat[0..m-1]; returns a calloc()-allocated array holding bmGs (m ints) followed by bmBc (256 ints). 69 | // reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html 70 | int *ksBM_prep(const uint8_t *pat, int m) 71 | { 72 | int i, *suff, *prep, *bmGs, *bmBc; 73 | prep = calloc(m + 256, sizeof(int)); // m + 256 ints, not bytes: bmGs[0..m-1] and bmBc[0..255] are both int tables 74 | bmGs = prep; bmBc = prep + m; 75 | { // preBmBc() 76 | for (i = 0; i < 256; ++i) bmBc[i] = m; 77 | for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; 78 | } 79 | suff = calloc(m, sizeof(int)); 80 | { // suffixes() 81 | int f = 0, g; 82 | suff[m - 1] = m; 83 | g = m - 1; 84 | for (i = m - 2; i >= 0; --i) { 85 | if (i > g && suff[i + m - 1 - f] < i - g) 86 | suff[i] = suff[i + m - 1 - f]; 87 | else { 88 | if (i < g) g = i; 89 | f = i; 90 | while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; 91 | suff[i] = f - g; 92 | } 93 | } 94 | } 95 | { // preBmGs() 96 | int j = 0; 97 | for (i = 0; i < m; ++i) bmGs[i] = m; 98 | for (i = m - 1; i >= 0; --i) 99 | if (suff[i] == i + 1) 100 | for (; j < m - 1 - i; ++j) 101 | if (bmGs[j] == m) 102 | bmGs[j] = m - 1 - i; 103 | for (i = 0; i <= m - 2; ++i) 104 | bmGs[m - 1 - suff[i]] = m - 1 - i; 105 | } 106 | free(suff); 107 | return prep; 108 | } 109 | 110 | int *ksBM_search(const uint8_t 
*str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches) 111 | { 112 | int i, j, *prep, *bmGs, *bmBc; 113 | int *matches = 0, mm = 0, nm = 0; 114 | prep = _prep? _prep : ksBM_prep(pat, m); 115 | bmGs = prep; bmBc = prep + m; 116 | j = 0; 117 | while (j <= n - m) { 118 | for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); 119 | if (i < 0) { 120 | if (nm == mm) { 121 | mm = mm? mm<<1 : 1; 122 | matches = realloc(matches, mm * sizeof(int)); 123 | } 124 | matches[nm++] = j; 125 | j += bmGs[0]; 126 | } else { 127 | int max = bmBc[str[i+j]] - m + 1 + i; 128 | if (max < bmGs[i]) max = bmGs[i]; 129 | j += max; 130 | } 131 | } 132 | *n_matches = nm; 133 | if (_prep == 0) free(prep); 134 | return matches; 135 | } 136 | 137 | #ifdef KSTRING_MAIN 138 | #include 139 | int main() 140 | { 141 | kstring_t *s; 142 | int *fields, n, i; 143 | s = (kstring_t*)calloc(1, sizeof(kstring_t)); 144 | // test ksprintf() 145 | ksprintf(s, " abcdefg: %d ", 100); 146 | printf("'%s'\n", s->s); 147 | // test ksplit() 148 | fields = ksplit(s, 0, &n); 149 | for (i = 0; i < n; ++i) 150 | printf("field[%d] = '%s'\n", i, s->s + fields[i]); 151 | free(s); 152 | 153 | { 154 | static char *str = "abcdefgcdg"; 155 | static char *pat = "cd"; 156 | int n, *matches; 157 | matches = ksBM_search(str, strlen(str), pat, strlen(pat), 0, &n); 158 | printf("%d: \n", n); 159 | for (i = 0; i < n; ++i) 160 | printf("- %d\n", matches[i]); 161 | free(matches); 162 | } 163 | return 0; 164 | } 165 | #endif 166 | -------------------------------------------------------------------------------- /samtools/kstring.h: -------------------------------------------------------------------------------- 1 | #ifndef KSTRING_H 2 | #define KSTRING_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #ifndef kroundup32 9 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 10 | #endif 11 | 12 | #ifndef KSTRING_T 13 | #define KSTRING_T kstring_t 14 | typedef struct __kstring_t 
{ 15 | size_t l, m; 16 | char *s; 17 | } kstring_t; 18 | #endif 19 | 20 | int ksprintf(kstring_t *s, const char *fmt, ...); 21 | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); 22 | 23 | // calculate the auxiliary array, allocated by calloc() 24 | int *ksBM_prep(const uint8_t *pat, int m); 25 | 26 | /* Search pat in str and return the list of matches. The size of the 27 | * list is returned as n_matches. _prep is the array returned by 28 | * ksBM_prep(). If it is a NULL pointer, ksBM_prep() will be called. */ 29 | int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches); 30 | // append at most l characters of p to s, growing the buffer when needed; s stays NUL-terminated and l is returned 31 | static inline int kputsn(const char *p, int l, kstring_t *s) 32 | { 33 | if (s->l + l + 1 >= s->m) { 34 | s->m = s->l + l + 2; 35 | kroundup32(s->m); 36 | s->s = (char*)realloc(s->s, s->m); 37 | } 38 | strncpy(s->s + s->l, p, l); 39 | s->l += l; 40 | s->s[s->l] = 0; 41 | return l; 42 | } 43 | // append the NUL-terminated string p to s via kputsn() 44 | static inline int kputs(const char *p, kstring_t *s) 45 | { 46 | return kputsn(p, strlen(p), s); 47 | } 48 | // append the single character c to s, growing the buffer when needed; returns c 49 | static inline int kputc(int c, kstring_t *s) 50 | { 51 | if (s->l + 1 >= s->m) { 52 | s->m = s->l + 2; 53 | kroundup32(s->m); 54 | s->s = (char*)realloc(s->s, s->m); 55 | } 56 | s->s[s->l++] = c; 57 | s->s[s->l] = 0; 58 | return c; 59 | } 60 | // split s->s with ksplit_core(); stores the field count in *n and returns the offsets array it allocated 61 | static inline int *ksplit(kstring_t *s, int delimiter, int *n) 62 | { 63 | int max = 0, *offsets = 0; 64 | *n = ksplit_core(s->s, delimiter, &max, &offsets); 65 | return offsets; 66 | } 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /samtools/misc/Makefile: -------------------------------------------------------------------------------- 1 | CC= gcc 2 | CXX= g++ 3 | CFLAGS= -g -Wall -O2 -m64 #-arch ppc 4 | CXXFLAGS= $(CFLAGS) 5 | DFLAGS= -D_FILE_OFFSET_BITS=64 6 | OBJS= 7 | PROG= md5sum-lite md5fa maq2sam-short maq2sam-long wgsim 8 | INCLUDES= -I.. 9 | SUBDIRS= . 
10 | 11 | .SUFFIXES:.c .o 12 | 13 | .c.o: 14 | $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ 15 | 16 | all:$(PROG) 17 | 18 | lib-recur all-recur clean-recur cleanlocal-recur install-recur: 19 | @target=`echo $@ | sed s/-recur//`; \ 20 | wdir=`pwd`; \ 21 | list='$(SUBDIRS)'; for subdir in $$list; do \ 22 | cd $$subdir; \ 23 | $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ 24 | INCLUDES="$(INCLUDES)" $$target || exit 1; \ 25 | cd $$wdir; \ 26 | done; 27 | 28 | lib: 29 | 30 | wgsim:wgsim.o 31 | $(CC) $(CFLAGS) -o $@ wgsim.o -lm 32 | 33 | md5fa:md5.o md5fa.o md5.h ../kseq.h 34 | $(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz 35 | 36 | md5sum-lite:md5sum-lite.o 37 | $(CC) $(CFLAGS) -o $@ md5sum-lite.o 38 | 39 | md5sum-lite.o:md5.c md5.h 40 | $(CC) -c $(CFLAGS) -DMD5SUM_MAIN -o $@ md5.c 41 | 42 | maq2sam-short:maq2sam.c 43 | $(CC) $(CFLAGS) -o $@ maq2sam.c -lz 44 | 45 | maq2sam-long:maq2sam.c 46 | $(CC) $(CFLAGS) -DMAQ_LONGREADS -o $@ maq2sam.c -lz 47 | 48 | md5fa.o:md5.h md5fa.c 49 | $(CC) $(CFLAGS) -c -I.. 
-o $@ md5fa.c 50 | 51 | cleanlocal: 52 | rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a 53 | 54 | clean:cleanlocal-recur 55 | -------------------------------------------------------------------------------- /samtools/misc/blast2sam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | use strict; 4 | use warnings; 5 | use Getopt::Std; 6 | 7 | &blast2sam; 8 | 9 | sub blast2sam { 10 | my %opts = (); 11 | getopts('s', \%opts); 12 | die("Usage: blast2sam.pl \n") if (-t STDIN && @ARGV == 0); 13 | my ($qlen, $slen, $q, $s, $qbeg, $qend, @sam, @cigar, @cmaux, $show_seq); 14 | $show_seq = defined($opts{s}); 15 | @sam = (); @sam[0,4,6..8,10] = ('', 255, '*', 0, 0, '*'); 16 | while (<>) { 17 | if (@cigar && (/^Query=/ || /Score =.*bits.*Expect/)) { # print 18 | &blast_print_sam(\@sam, \@cigar, \@cmaux, $qlen - $qend); 19 | @cigar = (); 20 | } 21 | if (/^Query= (\S+)/) { 22 | $sam[0] = $1; 23 | } elsif (/\((\S+)\s+letters\)/) { 24 | $qlen = $1; $qlen =~ s/,//g; 25 | } elsif (/^>(\S+)/) { 26 | $sam[2] = $1; 27 | } elsif (/Length = (\d+)/) { 28 | $slen = $1; 29 | } elsif (/Score =\s+(\S+) bits.+Expect(\(\d+\))? = (\S+)/) { # the start of an alignment block 30 | my ($as, $ev) = (int($1 + .499), $3); 31 | $ev = "1$ev" if ($ev =~ /^e/); 32 | @sam[1,3,9,11,12] = (0, 0, '', "AS:i:$as", "EV:Z:$ev"); 33 | @cigar = (); $qbeg = 0; 34 | @cmaux = (0, 0, 0, ''); 35 | } elsif (/Strand = (\S+) \/ (\S+)/) { 36 | $sam[1] |= 0x10 if ($2 eq 'Minus'); 37 | } elsif (/Query\:\s(\d+)\s*(\S+)\s(\d+)/) { 38 | $q = $2; 39 | unless ($qbeg) { 40 | $qbeg = $1; 41 | push(@cigar, ($1-1) . 
"H") if ($1 > 1); 42 | } 43 | $qend = $3; 44 | if ($show_seq) { 45 | my $x = $q; 46 | $x =~ s/-//g; $sam[9] .= $x; 47 | } 48 | } elsif (/Sbjct\:\s(\d+)\s*(\S+)\s(\d+)/) { 49 | $s = $2; 50 | if ($sam[1] & 0x10) { 51 | $sam[3] = $3; 52 | } else { 53 | $sam[3] = $1 unless ($sam[3]); 54 | } 55 | &aln2cm(\@cigar, \$q, \$s, \@cmaux); 56 | } 57 | } 58 | &blast_print_sam(\@sam, \@cigar, \@cmaux, $qlen - $qend); 59 | } 60 | 61 | sub blast_print_sam { 62 | my ($sam, $cigar, $cmaux, $qrest) = @_; 63 | push(@$cigar, $cmaux->[1] . substr("MDI", $cmaux->[0], 1)); 64 | push(@$cigar, $qrest . 'H') if ($qrest); 65 | if ($sam->[1] & 0x10) { 66 | @$cigar = reverse(@$cigar); 67 | $sam->[9] = reverse($sam->[9]); 68 | $sam->[9] =~ tr/atgcrymkswATGCRYMKSW/tacgyrkmswTACGYRKMSW/; 69 | } 70 | $sam->[9] = '*' if (!$sam->[9]); 71 | $sam->[5] = join('', @$cigar); 72 | print join("\t", @$sam), "\n"; 73 | } 74 | 75 | sub aln2cm { 76 | my ($cigar, $q, $s, $cmaux) = @_; 77 | my $l = length($$q); 78 | for (my $i = 0; $i < $l; ++$i) { 79 | my $op; 80 | # set $op 81 | if (substr($$q, $i, 1) eq '-') { $op = 2; } 82 | elsif (substr($$s, $i, 1) eq '-') { $op = 1; } 83 | else { $op = 0; } 84 | # for CIGAR 85 | if ($cmaux->[0] == $op) { 86 | ++$cmaux->[1]; 87 | } else { 88 | push(@$cigar, $cmaux->[1] . 
substr("MDI", $cmaux->[0], 1)); 89 | $cmaux->[0] = $op; $cmaux->[1] = 1; 90 | } 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /samtools/misc/bowtie2sam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # Contact: lh3 4 | # Version: 0.1.1 5 | 6 | use strict; 7 | use warnings; 8 | use Getopt::Std; 9 | 10 | &bowtie2sam; 11 | exit; 12 | 13 | sub bowtie2sam { 14 | my %opts = (); 15 | die("Usage: bowtie2sam.pl \n") if (@ARGV == 0 && -t STDIN); 16 | # core loop 17 | my (@s, $last, @staging, $k, $best_s, $subbest_s, $best_k); 18 | $last = ''; 19 | while (<>) { 20 | my ($name, $nm) = &bowtie2sam_aux($_, \@s); # read_name, number of mismatches 21 | if ($name eq $last) { 22 | # I do not know whether the multiple hits are ordered on the 23 | # number of mismatches. I assume they are not and so I have to 24 | # keep all these multiple hits in memory. 25 | @{$staging[$k]} = @s; 26 | if ($best_s > $nm) { 27 | $subbest_s = $best_s; 28 | $best_s = $nm; 29 | $best_k = $k; 30 | } elsif ($subbest_s > $nm) { 31 | $subbest_s = $nm; 32 | } 33 | ++$k; 34 | } else { 35 | if ($last) { 36 | if ($best_s == $subbest_s) { 37 | $staging[$best_k][4] = 0; 38 | } elsif ($subbest_s - $best_s == 1) { 39 | $staging[$best_k][4] = 15 if ($staging[$best_k][4] > 15); 40 | } 41 | print join("\t", @{$staging[$best_k]}), "\n"; 42 | } 43 | $k = 1; $best_s = $nm; $subbest_s = 1000; $best_k = 0; 44 | @{$staging[0]} = @s; 45 | $last = $name; 46 | } 47 | } 48 | print join("\t", @{$staging[$best_k]}), "\n" if ($best_k >= 0); 49 | } 50 | 51 | sub bowtie2sam_aux { 52 | my ($line, $s) = @_; 53 | chomp($line); 54 | my @t = split("\t", $line); 55 | my $ret; 56 | @$s = (); 57 | # read name 58 | $s->[0] = $ret = $t[0]; 59 | $s->[0] =~ s/\/[12]$//g; 60 | # initial flag (will be updated later) 61 | $s->[1] = 0; 62 | # read & quality 63 | $s->[9] = $t[4]; $s->[10] = $t[5]; 64 | # cigar 65 | $s->[5] = 
length($s->[9]) . "M"; 66 | # coor 67 | $s->[2] = $t[2]; $s->[3] = $t[3] + 1; 68 | $s->[1] |= 0x10 if ($t[1] eq '-'); 69 | # mapQ 70 | $s->[4] = $t[6] == 0? 25 : 0; 71 | # mate coordinate 72 | $s->[6] = '*'; $s->[7] = $s->[8] = 0; 73 | # aux 74 | my $nm = @t - 7; 75 | push(@$s, "NM:i:" . (@t-7)); 76 | push(@$s, "X$nm:i:" . ($t[6]+1)); 77 | my $md = ''; 78 | if ($t[7]) { 79 | $_ = $t[7]; 80 | my $a = 0; 81 | while (/(\d+):[ACGTN]>([ACGTN])/gi) { 82 | my ($y, $z) = ($1, $2); 83 | $md .= (int($y)-$a) . $z; 84 | $a += $y - $a + 1; 85 | } 86 | $md .= length($s->[9]) - $a; 87 | } else { 88 | $md = length($s->[9]); 89 | } 90 | push(@$s, "MD:Z:$md"); 91 | return ($ret, $nm); 92 | } 93 | -------------------------------------------------------------------------------- /samtools/misc/export2sam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # Contact: lh3 4 | # Version: 0.1.2 (03JAN2009) 5 | 6 | use strict; 7 | use warnings; 8 | use Getopt::Std; 9 | 10 | &export2sam; 11 | exit; 12 | 13 | sub export2sam { 14 | my ($fh1, $fh2, $is_paired); 15 | $is_paired = (@ARGV >= 2); 16 | die("export2sam.pl []\n") if (@ARGV == 0); 17 | open($fh1, $ARGV[0]) || die; 18 | if ($is_paired) { 19 | open($fh2, $ARGV[1]) || die; 20 | } 21 | # conversion table 22 | my @conv_table; 23 | for (-64..64) { 24 | $conv_table[$_+64] = chr(int(33 + 10*log(1+10**($_/10.0))/log(10)+.499)); 25 | } 26 | # core loop 27 | while (<$fh1>) { 28 | my (@s1, @s2); 29 | &export2sam_aux($_, \@s1, \@conv_table, $is_paired); 30 | if ($is_paired) { 31 | $_ = <$fh2>; 32 | &export2sam_aux($_, \@s2, \@conv_table, $is_paired); 33 | if (@s1 && @s2) { # then set mate coordinate 34 | my $isize = 0; 35 | if ($s1[2] ne '*' && $s1[2] eq $s2[2]) { # then calculate $isize 36 | my $x1 = ($s1[1] & 0x10)? $s1[3] + length($s1[9]) : $s1[3]; 37 | my $x2 = ($s2[1] & 0x10)? 
$s2[3] + length($s2[9]) : $s2[3]; 38 | $isize = $x2 - $x1; 39 | } 40 | # update mate coordinate 41 | if ($s2[2] ne '*') { 42 | @s1[6..8] = (($s2[2] eq $s1[2])? "=" : $s2[2], $s2[3], $isize); 43 | $s1[1] |= 0x20 if ($s2[1] & 0x10); 44 | } else { 45 | $s1[1] |= 0x8; 46 | } 47 | if ($s1[2] ne '*') { 48 | @s2[6..8] = (($s1[2] eq $s2[2])? "=" : $s1[2], $s1[3], -$isize); 49 | $s2[1] |= 0x20 if ($s1[1] & 0x10); 50 | } else { 51 | $s2[1] |= 0x8; 52 | } 53 | } 54 | } 55 | print join("\t", @s1), "\n" if (@s1); 56 | print join("\t", @s2), "\n" if (@s2 && $is_paired); 57 | } 58 | close($fh1); 59 | close($fh2) if ($is_paired); 60 | } 61 | 62 | sub export2sam_aux { 63 | my ($line, $s, $ct, $is_paired) = @_; 64 | chomp($line); 65 | my @t = split("\t", $line); 66 | @$s = (); 67 | return if ($t[21] ne 'Y'); 68 | # read name 69 | $s->[0] = $t[1]? "$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]"; 70 | # initial flag (will be updated later) 71 | $s->[1] = 0; 72 | $s->[1] |= 1 | 1<<(5 + $t[7]) if ($is_paired); 73 | # read & quality 74 | $s->[9] = $t[8]; $s->[10] = $t[9]; 75 | if ($t[13] eq 'R') { # then reverse the sequence and quality 76 | $s->[9] = reverse($t[8]); 77 | $s->[9] =~ tr/ACGTacgt/TGCAtgca/; 78 | $s->[10] = reverse($t[9]); 79 | } 80 | $s->[10] =~ s/(.)/$ct->[ord($1)]/eg; # change coding 81 | # cigar 82 | $s->[5] = length($s->[9]) . "M"; 83 | # coor 84 | my $has_coor = 0; 85 | $s->[2] = "*"; 86 | if ($t[10] eq 'NM' || $t[10] eq 'QC') { 87 | $s->[1] |= 0x4; # unmapped 88 | } elsif ($t[10] =~ /(\d+):(\d+):(\d+)/) { 89 | $s->[1] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? 90 | push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") 91 | } else { 92 | $s->[2] = $t[10]; 93 | $has_coor = 1; 94 | } 95 | $s->[3] = $has_coor? $t[12] : 0; 96 | $s->[1] |= 0x10 if ($has_coor && $t[13] eq 'R'); 97 | # mapQ (TODO: should I choose the larger between $t[15] and $t[16]?) 
98 | $s->[4] = 0; 99 | $s->[4] = $t[15] if ($t[15] ne ''); 100 | $s->[4] = $t[16] if ($t[16] ne '' && $s->[4] < $t[16]); 101 | # mate coordinate 102 | $s->[6] = '*'; $s->[7] = $s->[8] = 0; 103 | # aux 104 | push(@$s, "BC:Z:$t[6]") if ($t[6]); 105 | push(@$s, "MD:Z:$t[14]") if ($has_coor); 106 | push(@$s, "SM:i:$t[15]") if ($is_paired && $has_coor); 107 | } 108 | -------------------------------------------------------------------------------- /samtools/misc/interpolate_sam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | ###Builds interpolated pileup from SAM file 5 | ##@description counts bases between paired ends and piles up single end reads. 6 | ##@output, uses a #header for the RNAME and then the number of reads per base 7 | ##@author sm8@sanger.ac.uk, Stephen B. Montgomery 8 | 9 | ##@caveats 10 | ##Requires RNAME to have format as per example 11 | ## chromosome:NCBI36:18:1:76117153:1 12 | ## supercontig::NT_113883:1:137703:1 13 | ## clone::AC138827.3:1:149397:1 14 | ##Expects simple CIGAR characters, M, I and D 15 | ##Expects SAM file to be sorted. 
16 | ##Expects 0x0010 to mark second read in PE file (as has been the observed case from MAQ output) (important for line 77) 17 | 18 | ##Verify and read in SAM file 19 | my $sam_file = $ARGV[0]; 20 | if(!defined($sam_file)) { die("No sam file defined on arg 1"); } 21 | unless(-f $sam_file) { die("Sam file does not exist: $sam_file"); } 22 | open(SAM, $sam_file) || die("Cannot open sam file"); 23 | 24 | ##Globals 25 | my $current_location = ""; ##Current RNAME being processed 26 | my $current_size = 0; ##Size of sequence region being processed 27 | my $current_position = 1; ##Current base being processed 28 | my $open = 0; ##Number of open reads (PE reads that have not been closed) 29 | my %close = (); ##Hash of closing positions, when the current_position gets to this position it subtracts the 30 | ##contained value from those open and deletes the indexed position from the hash 31 | 32 | while (my $line = <SAM>) { ##Read the SAM file opened above, one line at a time 33 | my @tokens = split /\t/, $line; 34 | 35 | if ($current_location ne $tokens[2]) { ##Start a new sequence region 36 | for (my $i = $current_position; $i <= $current_size; $i++) { ##Close the previous sequence region 37 | if (defined($close{$i})) { 38 | $open = $open - $close{$i}; 39 | delete $close{$i}; 40 | } 41 | print $open . "\n"; 42 | } 43 | if ($current_location ne "") { 44 | print "\n"; 45 | } 46 | 47 | ##Initiate a new sequence region 48 | my @location_tokens = split /:/, $tokens[2]; 49 | $current_position = 1; 50 | $current_location = $tokens[2]; 51 | $current_size = $location_tokens[4]; 52 | $open = 0; 53 | %close = (); 54 | print "#" . $tokens[2] . "\n"; 55 | 56 | ##Print pileup to just before the first read (will be 0) 57 | for (my $current_position = 1; $current_position < $tokens[3]; $current_position++) { 58 | print $open . 
"\n"; 59 | } 60 | $current_position = $tokens[3]; 61 | 62 | } else { ##Sequence region already open 63 | if ($tokens[3] > $current_position) { ##If the new read's position is greater than the current position 64 | ##cycle through to catch up to the current position 65 | for (my $i = $current_position; $i < $tokens[3]; $i++) { 66 | if (defined($close{$i})) { 67 | $open = $open - $close{$i}; 68 | delete $close{$i}; 69 | } 70 | print $open . "\n"; 71 | } 72 | $current_position = $tokens[3]; 73 | } 74 | } 75 | $open++; ##Increment the number of open reads 76 | 77 | if (($tokens[1] & 0x0080 || $tokens[1] & 0x0040) && $tokens[1] & 0x0010 && $tokens[1] & 0x0002) { ##if second read of mate pair, add close condition 78 | $open--; 79 | my $parsed_cig = &parseCigar($tokens[5]); 80 | my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1; 81 | if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; } 82 | $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1; 83 | } elsif (!($tokens[1] & 0x0001) || !($tokens[1] & 0x0002)) { ##if unpaired, add close condition 84 | my $parsed_cig = &parseCigar($tokens[5]); 85 | my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1; 86 | if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; } 87 | $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1; 88 | } else { 89 | #do nothing 90 | } 91 | } 92 | for (my $i = $current_position; $i <= $current_size; $i++) { ##Finish up the last sequence region 93 | if (defined($close{$i})) { 94 | $open = $open - $close{$i}; 95 | delete $close{$i}; 96 | } 97 | print $open . 
"\n"; 98 | } 99 | print "\n"; 100 | close(SAM); 101 | exit(0); 102 | 103 | ##reads and tokenizes simple cigarline 104 | sub parseCigar() { 105 | my $cigar_line = shift; 106 | $cigar_line =~ s/([0-9]*[A-Z]{1})/$1\t/g; 107 | my @cigar_tokens = split /\t/, $cigar_line; 108 | my %parsed = ('M' => 0, 109 | 'I' => 0, 110 | 'D' => 0); 111 | my @events = (); 112 | for(my $i = 0; $i < scalar(@cigar_tokens); $i++) { 113 | if ($cigar_tokens[$i] =~ /([0-9]+)([A-Z]{1})/g) { 114 | if (!defined($parsed{$2})) { $parsed{$2} = 0; } 115 | my $nt = $2; 116 | if ($nt ne "M" && $nt ne "D" && $nt ne "I") { $nt = "M"; } 117 | $parsed{$nt} += $1; 118 | my %event_el = ("t" => $nt, 119 | "n" => $1); 120 | push @events, \%event_el; 121 | } 122 | } 123 | $parsed{'events'} = \@events; 124 | return \%parsed; 125 | } 126 | -------------------------------------------------------------------------------- /samtools/misc/maq2sam.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define PACKAGE_VERSION "r439" 9 | 10 | //#define MAQ_LONGREADS 11 | 12 | #ifdef MAQ_LONGREADS 13 | # define MAX_READLEN 128 14 | #else 15 | # define MAX_READLEN 64 16 | #endif 17 | 18 | #define MAX_NAMELEN 36 19 | #define MAQMAP_FORMAT_OLD 0 20 | #define MAQMAP_FORMAT_NEW -1 21 | 22 | #define PAIRFLAG_FF 0x01 23 | #define PAIRFLAG_FR 0x02 24 | #define PAIRFLAG_RF 0x04 25 | #define PAIRFLAG_RR 0x08 26 | #define PAIRFLAG_PAIRED 0x10 27 | #define PAIRFLAG_DIFFCHR 0x20 28 | #define PAIRFLAG_NOMATCH 0x40 29 | #define PAIRFLAG_SW 0x80 30 | 31 | typedef struct 32 | { 33 | uint8_t seq[MAX_READLEN]; /* the last base is the single-end mapping quality. 
*/ 34 | uint8_t size, map_qual, info1, info2, c[2], flag, alt_qual; 35 | uint32_t seqid, pos; 36 | int dist; 37 | char name[MAX_NAMELEN]; 38 | } maqmap1_t; 39 | 40 | typedef struct 41 | { 42 | int format, n_ref; 43 | char **ref_name; 44 | uint64_t n_mapped_reads; 45 | maqmap1_t *mapped_reads; 46 | } maqmap_t; 47 | 48 | maqmap_t *maq_new_maqmap() 49 | { 50 | maqmap_t *mm = (maqmap_t*)calloc(1, sizeof(maqmap_t)); 51 | mm->format = MAQMAP_FORMAT_NEW; 52 | return mm; 53 | } 54 | void maq_delete_maqmap(maqmap_t *mm) 55 | { 56 | int i; 57 | if (mm == 0) return; 58 | for (i = 0; i < mm->n_ref; ++i) 59 | free(mm->ref_name[i]); 60 | free(mm->ref_name); 61 | free(mm->mapped_reads); 62 | free(mm); 63 | } 64 | maqmap_t *maqmap_read_header(gzFile fp) 65 | { 66 | maqmap_t *mm; 67 | int k, len; 68 | mm = maq_new_maqmap(); 69 | gzread(fp, &mm->format, sizeof(int)); 70 | if (mm->format != MAQMAP_FORMAT_NEW) { 71 | if (mm->format > 0) { 72 | fprintf(stderr, "** Obsolete map format is detected. Please use 'mapass2maq' command to convert the format.\n"); 73 | exit(3); 74 | } 75 | assert(mm->format == MAQMAP_FORMAT_NEW); 76 | } 77 | gzread(fp, &mm->n_ref, sizeof(int)); 78 | mm->ref_name = (char**)calloc(mm->n_ref, sizeof(char*)); 79 | for (k = 0; k != mm->n_ref; ++k) { 80 | gzread(fp, &len, sizeof(int)); 81 | mm->ref_name[k] = (char*)malloc(len * sizeof(char)); 82 | gzread(fp, mm->ref_name[k], len); 83 | } 84 | /* read number of mapped reads */ 85 | gzread(fp, &mm->n_mapped_reads, sizeof(uint64_t)); 86 | return mm; 87 | } 88 | 89 | void maq2tam_core(gzFile fp, const char *rg) 90 | { 91 | maqmap_t *mm; 92 | maqmap1_t mm1, *m1; 93 | int ret; 94 | m1 = &mm1; 95 | mm = maqmap_read_header(fp); 96 | while ((ret = gzread(fp, m1, sizeof(maqmap1_t))) == sizeof(maqmap1_t)) { 97 | int j, flag = 0, se_mapq = m1->seq[MAX_READLEN-1]; 98 | if (m1->flag) flag |= 1; 99 | if ((m1->flag&PAIRFLAG_PAIRED) || ((m1->flag&PAIRFLAG_SW) && m1->flag != 192)) flag |= 2; 100 | if (m1->flag == 192) flag |= 4; 101 | 
if (m1->flag == 64) flag |= 8; 102 | if (m1->pos&1) flag |= 0x10; 103 | if ((flag&1) && m1->dist != 0) { 104 | int c; 105 | if (m1->dist > 0) { 106 | if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_RF)) c = 0; 107 | else if (m1->flag&(PAIRFLAG_FR|PAIRFLAG_RR)) c = 1; 108 | else c = m1->pos&1; 109 | } else { 110 | if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_FR)) c = 0; 111 | else if (m1->flag&(PAIRFLAG_RF|PAIRFLAG_RR)) c = 1; 112 | else c = m1->pos&1; 113 | } 114 | if (c) flag |= 0x20; 115 | } 116 | if (m1->flag) { 117 | int l = strlen(m1->name); 118 | if (m1->name[l-2] == '/') { 119 | flag |= (m1->name[l-1] == '1')? 0x40 : 0x80; 120 | m1->name[l-2] = '\0'; 121 | } 122 | } 123 | printf("%s\t%d\t", m1->name, flag); 124 | printf("%s\t%d\t", mm->ref_name[m1->seqid], (m1->pos>>1)+1); 125 | if (m1->flag == 130) { 126 | int c = (int8_t)m1->seq[MAX_READLEN-1]; 127 | printf("%d\t", m1->alt_qual); 128 | if (c == 0) printf("%dM\t", m1->size); 129 | else { 130 | if (c > 0) printf("%dM%dI%dM\t", m1->map_qual, c, m1->size - m1->map_qual - c); 131 | else printf("%dM%dD%dM\t", m1->map_qual, -c, m1->size - m1->map_qual); 132 | } 133 | se_mapq = 0; // zero SE mapQ for reads aligned by SW 134 | } else { 135 | if (flag&4) printf("0\t*\t"); 136 | else printf("%d\t%dM\t", m1->map_qual, m1->size); 137 | } 138 | printf("*\t0\t%d\t", m1->dist); 139 | for (j = 0; j != m1->size; ++j) { 140 | if (m1->seq[j] == 0) putchar('N'); 141 | else putchar("ACGT"[m1->seq[j]>>6&3]); 142 | } 143 | putchar('\t'); 144 | for (j = 0; j != m1->size; ++j) 145 | putchar((m1->seq[j]&0x3f) + 33); 146 | putchar('\t'); 147 | if (rg) printf("RG:Z:%s\t", rg); 148 | if (flag&4) { // unmapped 149 | printf("MF:i:%d\n", m1->flag); 150 | } else { 151 | printf("MF:i:%d\t", m1->flag); 152 | if (m1->flag) printf("AM:i:%d\tSM:i:%d\t", m1->alt_qual, se_mapq); 153 | printf("NM:i:%d\tUQ:i:%d\tH0:i:%d\tH1:i:%d\n", m1->info1&0xf, m1->info2, m1->c[0], m1->c[1]); 154 | } 155 | } 156 | if (ret > 0) 157 | fprintf(stderr, "Truncated! 
Continue anyway.\n"); 158 | maq_delete_maqmap(mm); 159 | } 160 | 161 | int main(int argc, char *argv[]) 162 | { 163 | gzFile fp; 164 | if (argc == 1) { 165 | fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); 166 | fprintf(stderr, "Usage: maq2sam []\n"); 167 | return 1; 168 | } 169 | fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); 170 | maq2tam_core(fp, argc > 2? argv[2] : 0); 171 | gzclose(fp); 172 | return 0; 173 | } 174 | -------------------------------------------------------------------------------- /samtools/misc/md5.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is adapted from a program in this page: 3 | 4 | http://www.fourmilab.ch/md5/ 5 | 6 | The original source code does not work on 64-bit machines due to the 7 | wrong typedef "uint32". I also added prototypes. 8 | 9 | -lh3 10 | */ 11 | 12 | #ifndef MD5_H 13 | #define MD5_H 14 | 15 | /* The following tests optimise behaviour on little-endian 16 | machines, where there is no need to reverse the byte order 17 | of 32 bit words in the MD5 computation. By default, 18 | HIGHFIRST is defined, which indicates we're running on a 19 | big-endian (most significant byte first) machine, on which 20 | the byteReverse function in md5.c must be invoked. However, 21 | byteReverse is coded in such a way that it is an identity 22 | function when run on a little-endian machine, so calling it 23 | on such a platform causes no harm apart from wasting time. 24 | If the platform is known to be little-endian, we speed 25 | things up by undefining HIGHFIRST, which defines 26 | byteReverse as a null macro. Doing things in this manner 27 | insures we work on new platforms regardless of their byte 28 | order. 
*/ 29 | 30 | #define HIGHFIRST 31 | 32 | #if __LITTLE_ENDIAN__ != 0 33 | #undef HIGHFIRST 34 | #endif 35 | 36 | #include 37 | 38 | struct MD5Context { 39 | uint32_t buf[4]; 40 | uint32_t bits[2]; 41 | unsigned char in[64]; 42 | }; 43 | 44 | void MD5Init(struct MD5Context *ctx); 45 | void MD5Update(struct MD5Context *ctx, unsigned char *buf, unsigned len); 46 | void MD5Final(unsigned char digest[16], struct MD5Context *ctx); 47 | 48 | /* 49 | * This is needed to make RSAREF happy on some MS-DOS compilers. 50 | */ 51 | typedef struct MD5Context MD5_CTX; 52 | 53 | /* Define CHECK_HARDWARE_PROPERTIES to have main,c verify 54 | byte order and uint32_t settings. */ 55 | #define CHECK_HARDWARE_PROPERTIES 56 | 57 | #endif /* !MD5_H */ 58 | -------------------------------------------------------------------------------- /samtools/misc/md5fa.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "md5.h" 4 | #include "kseq.h" 5 | 6 | #define HEX_STR "0123456789abcdef" 7 | 8 | KSEQ_INIT(gzFile, gzread) 9 | 10 | static void md5_one(const char *fn) 11 | { 12 | MD5_CTX md5_one, md5_all; 13 | int l, i, k; 14 | gzFile fp; 15 | kseq_t *seq; 16 | unsigned char unordered[16], digest[16]; 17 | 18 | for (l = 0; l < 16; ++l) unordered[l] = 0; 19 | fp = strcmp(fn, "-")? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); 20 | if (fp == 0) { 21 | fprintf(stderr, "md5fa: %s: No such file or directory\n", fn); 22 | exit(1); 23 | } 24 | 25 | MD5Init(&md5_all); 26 | seq = kseq_init(fp); 27 | while ((l = kseq_read(seq)) >= 0) { 28 | for (i = k = 0; i < seq->seq.l; ++i) { 29 | if (islower(seq->seq.s[i])) seq->seq.s[k++] = toupper(seq->seq.s[i]); 30 | else if (isupper(seq->seq.s[i])) seq->seq.s[k++] = seq->seq.s[i]; 31 | } 32 | MD5Init(&md5_one); 33 | MD5Update(&md5_one, (unsigned char*)seq->seq.s, k); 34 | MD5Final(digest, &md5_one); 35 | for (l = 0; l < 16; ++l) { 36 | printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]); 37 | unordered[l] ^= digest[l]; 38 | } 39 | printf(" %s %s\n", fn, seq->name.s); 40 | MD5Update(&md5_all, (unsigned char*)seq->seq.s, k); 41 | } 42 | MD5Final(digest, &md5_all); 43 | kseq_destroy(seq); 44 | for (l = 0; l < 16; ++l) 45 | printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]); 46 | printf(" %s >ordered\n", fn); 47 | for (l = 0; l < 16; ++l) 48 | printf("%c%c", HEX_STR[unordered[l]>>4&0xf], HEX_STR[unordered[l]&0xf]); 49 | printf(" %s >unordered\n", fn); 50 | } 51 | 52 | int main(int argc, char *argv[]) 53 | { 54 | int i; 55 | if (argc == 1) md5_one("-"); 56 | else for (i = 1; i < argc; ++i) md5_one(argv[i]); 57 | return 0; 58 | } 59 | -------------------------------------------------------------------------------- /samtools/misc/novo2sam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # Contact: lh3 4 | # Version: 0.1.3 5 | 6 | #Modified by Zayed Albertyn(zayed.albertyn@gmail.com) & Colin Hercus(colin@novocraft.com) 7 | 8 | #use strict; 9 | #use warnings; 10 | use Data::Dumper; 11 | use Getopt::Std; 12 | 13 | &novo2sam; 14 | exit; 15 | 16 | sub mating { 17 | my ($s1, $s2) = @_; 18 | my $isize = 0; 19 | if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize 20 | my $x1 = ($s1->[1] & 0x10)? 
$s1->[3] + length($s1->[9]) : $s1->[3]; 21 | my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3]; 22 | $isize = $x2 - $x1; 23 | } 24 | # update mate coordinate 25 | if ($s2->[2] ne '*') { 26 | @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize); 27 | $s1->[1] |= 0x20 if ($s2->[1] & 0x10); 28 | } else { 29 | $s1->[1] |= 0x8; 30 | } 31 | if ($s1->[2] ne '*') { 32 | @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize); 33 | $s2->[1] |= 0x20 if ($s1->[1] & 0x10); 34 | } else { 35 | $s2->[1] |= 0x8; 36 | } 37 | } 38 | 39 | sub novo2sam { 40 | my %opts = (); 41 | getopts("p", \%opts); 42 | die("Usage: novo2sam.pl [-p] \n") if (@ARGV == 0); 43 | my $is_paired = defined($opts{p}); 44 | # core loop 45 | my @s1 = (); 46 | my @s2 = (); 47 | my ($s_last, $s_curr) = (\@s1, \@s2); 48 | while (<>) { 49 | next if (/^#/); 50 | next if (/(QC|NM)\s*$/ || /(R\s+\d+)\s*$/); 51 | &novo2sam_aux($_, $s_curr, $is_paired); 52 | if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) { 53 | &mating($s_last, $s_curr); 54 | print join("\t", @$s_last), "\n"; 55 | print join("\t", @$s_curr), "\n"; 56 | @$s_last = (); @$s_curr = (); 57 | } else { 58 | print join("\t", @$s_last), "\n" if (@$s_last != 0); 59 | my $s = $s_last; $s_last = $s_curr; $s_curr = $s; 60 | } 61 | } 62 | print join("\t", @$s_last), "\n" if (@$s_last != 0); 63 | } 64 | 65 | sub novo2sam_aux { 66 | my ($line, $s, $is_paired) = @_; 67 | 68 | chomp($line); 69 | my @t = split(/\s+/, $line); 70 | my @variations = @t[13 .. $#t]; 71 | @$s = (); 72 | return if ($t[4] ne 'U'); 73 | my $len = length($t[2]); 74 | # read name 75 | $s->[0] = substr($t[0], 1); 76 | $s->[0] =~ s/\/[12]$//g; 77 | # initial flag (will be updated later) 78 | $s->[1] = 0; 79 | $s->[1] |= 1 | 1<<($t[1] eq 'L'? 
6 : 7); 80 | $s->[1] |= 2 if ($t[10] eq '.'); 81 | # read & quality 82 | if ($t[9] eq 'R') { 83 | $s->[9] = reverse($t[2]); 84 | $s->[10] = reverse($t[3]); 85 | $s->[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/; 86 | } else { 87 | $s->[9] = $t[2]; $s->[10] = $t[3]; 88 | } 89 | # cigar 90 | my $cigarstring =""; 91 | if (scalar @variations ==0 ) { 92 | $s->[5] = $len . "M"; # IMPORTANT: this cigar is not correct for gapped alignment 93 | } else { 94 | #convert to correct CIGAR 95 | my $tmpstr = join" ",@variations ; 96 | if ( $tmpstr=~ /\+|\-/ ) { 97 | $cigarstring = cigar_method($line,\@variations,$len); 98 | $s->[5]=$cigarstring; 99 | } else { 100 | $s->[5]=$len. "M"; 101 | } 102 | } 103 | 104 | # coor 105 | $s->[2] = substr($t[7], 1); $s->[3] = $t[8]; 106 | $s->[1] |= 0x10 if ($t[9] eq 'R'); 107 | # mapQ 108 | $s->[4] = $t[5] > $t[6]? $t[5] : $t[6]; 109 | # mate coordinate 110 | $s->[6] = '*'; $s->[7] = $s->[8] = 0; 111 | # aux 112 | push(@$s, "NM:i:".(@t-13)); 113 | my $md = ''; 114 | $md = mdtag($md,$line,\@variations,$len); 115 | push(@$s, "MD:Z:$md"); 116 | 117 | } 118 | 119 | sub mdtag { 120 | my $oldmd = shift; 121 | my $line = shift; 122 | my $ref =shift; 123 | my $rdlen = shift; 124 | my @variations = @$ref; 125 | my $string=""; 126 | my $mdtag=""; 127 | my $t=1; 128 | my $q=1; 129 | my $deleteflag=0; 130 | my $len =0; 131 | foreach $string (@variations) { 132 | my ($indeltype,$insert) = indeltype($string); 133 | if ($indeltype eq "+") { 134 | $len = length ($insert); 135 | $q+=$len; 136 | next; 137 | } 138 | my $pos = $1 if $string =~ /^(\d+)/; 139 | $len = $pos - $t; 140 | if ($len !=0 || ($deleteflag eq 1 && $indeltype eq ">")) { 141 | $mdtag.=$len; 142 | } 143 | $t+=$len; 144 | $q+=$len; 145 | if ($indeltype eq ">") { 146 | $mdtag.=$insert; 147 | $deleteflag=0; 148 | $t+=1; 149 | $q+=1; 150 | } 151 | if ($indeltype eq "-") { 152 | my $deletedbase = $2 if $string =~ /(\d+)\-([A-Za-z]+)/; 153 | if ($deleteflag == 0 ) { 154 | $mdtag.="^"; 155 | 
} 156 | $mdtag.=$deletedbase; 157 | $deleteflag=1; 158 | $t+=1; 159 | } 160 | } 161 | $len = $rdlen - $q + 1; 162 | if ($len > 0) { 163 | $mdtag.="$len"; 164 | } 165 | # print "In:$line\n"; 166 | # print "MD: OLD => NEW\nMD: $oldmd => $mdtag\n\n"; 167 | 168 | return $mdtag; 169 | } 170 | 171 | sub indeltype { 172 | my $string = shift; 173 | my $insert=""; 174 | my $indeltype; 175 | if ($string =~ /([A-Za-z]+)\>/) { 176 | $indeltype=">"; 177 | $insert=$1; 178 | } elsif ($string =~ /\-/) { 179 | $indeltype="-"; 180 | } elsif ($string =~ /\+([A-Za-z]+)/) { 181 | $indeltype="+"; 182 | $insert=$1; 183 | } 184 | return ($indeltype,$insert); 185 | 186 | } 187 | 188 | 189 | sub cigar_method { 190 | my $line = shift; 191 | my $ref =shift; 192 | my $rdlen = shift; 193 | my @variations = @$ref; 194 | my $string=""; 195 | my $type=""; 196 | my $t =1; 197 | my $q=1; 198 | my $indeltype=""; 199 | my $cigar= ""; 200 | my $insert = ""; 201 | my $len=0; 202 | my @cig=(); 203 | foreach $string (@variations) { 204 | next if $string =~ />/; 205 | my $pos = $1 if $string =~ /^(\d+)/; 206 | 207 | if ($string =~ /\+([A-Za-z]+)/) { 208 | $indeltype="+"; 209 | $insert = $1; 210 | }elsif ($string =~ /\-([A-Za-z]+)/) { 211 | $indeltype="-"; 212 | $insert = $1; 213 | } 214 | #print "$pos $indeltype $insert $t $q\n"; 215 | $len = $pos - $t; 216 | if ( $len > 0) { 217 | $cigar.=$len."M"; 218 | push(@cig,$len."M"); 219 | } 220 | $t+=$len; 221 | $q+=$len; 222 | 223 | if ($indeltype eq "-") { 224 | $cigar.="D"; 225 | push(@cig,"D"); 226 | $t++; 227 | } 228 | if ($indeltype eq "+") { 229 | $len = length ($insert); 230 | if ($len == 1) { 231 | $cigar.="I"; 232 | push(@cig,"I"); 233 | } 234 | if ($len > 1) { 235 | $cigar.=$len."I"; 236 | push(@cig,$len."I") 237 | } 238 | $q+=$len; 239 | } 240 | $insert=""; 241 | } 242 | $len= $rdlen - $q + 1; 243 | if ($len > 0) { 244 | $cigar.=$len."M"; 245 | push(@cig,$len."M"); 246 | } 247 | 248 | $cigar = newcigar($cigar,'D'); 249 | $cigar = newcigar($cigar,'I'); 
250 | 251 | #print "$line\n"; 252 | #print "c CIGAR:\t$cigar\n\n"; 253 | return $cigar; 254 | 255 | } 256 | 257 | 258 | 259 | sub newcigar { 260 | my $cigar = shift; 261 | my $char = shift; 262 | my $new = ""; 263 | my $copy = $cigar; 264 | #print "$cigar\n"; 265 | $copy =~ s/^($char+)/$1;/g; 266 | #print "$copy\n"; 267 | $copy =~ s/([^0-9$char])($char+)/$1;$2;/g; 268 | #print "$copy\n"; 269 | my @parts = split(/;/,$copy); 270 | my $el=""; 271 | foreach $el (@parts) { 272 | #print "$el\n"; 273 | if ($el =~ /^$char+$/) { 274 | $new.=length($el).$char; 275 | }else { 276 | $new.=$el; 277 | } 278 | 279 | } 280 | return $new; 281 | } 282 | -------------------------------------------------------------------------------- /samtools/misc/psl2sam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # Author: lh3 4 | 5 | # This script calculates a score using the BLAST scoring 6 | # system. However, I am not sure how to count gap opens and gap 7 | # extensions. It seems to me that column 5-8 are not what I am 8 | # after. This script counts gaps from the last three columns. It does 9 | # not generate reference skip (N) in the CIGAR as it is not easy to 10 | # directly tell which gaps correspond to introns. 11 | 12 | use strict; 13 | use warnings; 14 | use Getopt::Std; 15 | 16 | my %opts = (a=>1, b=>3, q=>5, r=>2); 17 | getopts('a:b:q:r:', \%opts); 18 | die("Usage: psl2sam.pl [-a $opts{a}] [-b $opts{b}] [-q $opts{q}] [-r $opts{r}] \n") if (@ARGV == 0 && -t STDIN); 19 | 20 | my @stack; 21 | my $last = ''; 22 | my ($a, $b, $q, $r) = ($opts{a}, $opts{b}, $opts{q}, $opts{r}); 23 | while (<>) { 24 | next unless (/^\d/); 25 | my @t = split; 26 | my @s; 27 | my $cigar = ''; 28 | if ($t[8] eq '-') { 29 | my $tmp = $t[11]; 30 | $t[11] = $t[10] - $t[12]; 31 | $t[12] = $t[10] - $tmp; 32 | } 33 | @s[0..4] = ($t[9], (($t[8] eq '+')? 
0 : 16), $t[13], $t[15]+1, 0); 34 | @s[6..10] = ('*', 0, 0, '*', '*'); 35 | $cigar .= $t[11].'H' if ($t[11]); # 5'-end clipping 36 | my @x = split(',', $t[18]); 37 | my @y = split(',', $t[19]); 38 | my @z = split(',', $t[20]); 39 | my ($y0, $z0) = ($y[0], $z[0]); 40 | my ($gap_open, $gap_ext) = (0, 0, 0); 41 | for (1 .. $t[17]-1) { 42 | my $ly = $y[$_] - $y[$_-1] - $x[$_-1]; 43 | my $lz = $z[$_] - $z[$_-1] - $x[$_-1]; 44 | if ($ly < $lz) { # del: the reference gap is longer 45 | ++$gap_open; 46 | $gap_ext += $lz - $ly; 47 | $cigar .= ($y[$_] - $y0) . 'M'; 48 | $cigar .= ($lz - $ly) . 'D'; 49 | ($y0, $z0) = ($y[$_], $z[$_]); 50 | } elsif ($lz < $ly) { # ins: the query gap is longer 51 | ++$gap_open; 52 | $gap_ext += $ly - $lz; 53 | $cigar .= ($z[$_] - $z0) . 'M'; 54 | $cigar .= ($ly - $lz) . 'I'; 55 | ($y0, $z0) = ($y[$_], $z[$_]); 56 | } 57 | } 58 | $cigar .= ($t[12] - $y0) . 'M'; 59 | $cigar .= ($t[10] - $t[12]).'H' if ($t[10] != $t[12]); # 3'-end clipping 60 | $s[5] = $cigar; 61 | my $score = $a * $t[0] - $b * $t[1] - $q * $gap_open - $r * $gap_ext; 62 | $score = 0 if ($score < 0); 63 | $s[11] = "AS:i:$score"; 64 | print join("\t", @s), "\n"; 65 | } 66 | -------------------------------------------------------------------------------- /samtools/misc/sam2vcf.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # VCF specs: http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcfv3.2 4 | 5 | # Contact: pd3@sanger 6 | # Version: 2009-10-08 7 | 8 | use strict; 9 | use warnings; 10 | use Carp; 11 | 12 | my $opts = parse_params(); 13 | do_pileup_to_vcf($opts); 14 | 15 | exit; 16 | 17 | #--------------- 18 | 19 | sub error 20 | { 21 | my (@msg) = @_; 22 | if ( scalar @msg ) { croak(@msg); } 23 | die 24 | "Usage: sam2vcf.pl [OPTIONS] < in.pileup > out.vcf\n", 25 | "Options:\n", 26 | " -r, -refseq The reference sequence, required when indels are present.\n", 27 | " -h, -?, --help This help 
message.\n", 28 | "\n"; 29 | } 30 | 31 | 32 | sub parse_params 33 | { 34 | my %opts = (); 35 | 36 | $opts{fh_in} = *STDIN; 37 | $opts{fh_out} = *STDOUT; 38 | 39 | while (my $arg=shift(@ARGV)) 40 | { 41 | if ( $arg eq '-r' || $arg eq '--refseq' ) { $opts{refseq}=shift(@ARGV); next; } 42 | if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } 43 | 44 | error("Unknown parameter \"$arg\". Run -h for help.\n"); 45 | } 46 | return \%opts; 47 | } 48 | 49 | sub iupac_to_gtype 50 | { 51 | my ($ref,$base) = @_; 52 | my %iupac = ( 53 | 'K' => ['G','T'], 54 | 'M' => ['A','C'], 55 | 'S' => ['C','G'], 56 | 'R' => ['A','G'], 57 | 'W' => ['A','T'], 58 | 'Y' => ['C','T'], 59 | ); 60 | if ( !exists($iupac{$base}) ) 61 | { 62 | if ( $ref eq $base ) { return ('.','0|0'); } 63 | return ($base,'1|1'); 64 | } 65 | my $gt = $iupac{$base}; 66 | if ( $$gt[0] eq $ref ) { return ($$gt[1],'0|1'); } 67 | elsif ( $$gt[1] eq $ref ) { return ($$gt[0],'0|1'); } 68 | return ("$$gt[0],$$gt[1]",'1|2'); 69 | } 70 | 71 | 72 | sub parse_indel 73 | { 74 | my ($cons) = @_; 75 | if ( $cons=~/^-/ ) 76 | { 77 | my $len = length($'); 78 | return "D$len"; 79 | } 80 | elsif ( $cons=~/^\+/ ) { return "I$'"; } 81 | elsif ( $cons eq '*' ) { return undef; } 82 | error("FIXME: could not parse [$cons]\n"); 83 | } 84 | 85 | 86 | # An example of the pileup format: 87 | # 1 3000011 C C 32 0 98 1 ^~, A 88 | # 1 3002155 * +T/+T 53 119 52 5 +T * 4 1 0 89 | # 1 3003094 * -TT/-TT 31 164 60 11 -TT * 5 6 0 90 | # 1 3073986 * */-AAAAAAAAAAAAAA 3 3 45 9 * -AAAAAAAAAAAAAA 7 2 0 91 | # 92 | sub do_pileup_to_vcf 93 | { 94 | my ($opts) = @_; 95 | 96 | my $fh_in = $$opts{fh_in}; 97 | my $fh_out = $$opts{fh_out}; 98 | my ($prev_chr,$prev_pos,$prev_ref); 99 | my $refseq; 100 | 101 | while (my $line=<$fh_in>) 102 | { 103 | chomp($line); 104 | my ($chr,$pos,$ref,$cons,$cons_qual,$snp_qual,$rms_qual,$depth,@items) = split(/\t/,$line); 105 | 106 | my ($alt,$gt); 107 | if ( $ref eq '*' ) 108 | { 109 | # An indel is involved. 
110 | if ($chr ne $prev_chr || $pos ne $prev_pos) 111 | { 112 | if ( !$$opts{refseq} ) { error("Cannot do indels without the reference.\n"); } 113 | if ( !$refseq ) { $refseq = Fasta->new(file=>$$opts{refseq}); } 114 | $ref = $refseq->get_base($chr,$pos); 115 | } 116 | else { $ref = $prev_ref; } 117 | 118 | # One of the alleles can be a reference and it can come in arbitrary order 119 | my ($al1,$al2) = split(m{/},$cons); 120 | my $alt1 = parse_indel($al1); 121 | my $alt2 = parse_indel($al2); 122 | if ( !$alt1 && !$alt2 ) { error("FIXME: could not parse indel:\n", $line); } 123 | if ( $alt1 && $alt2 && $alt1 eq $alt2 ) { $alt2=''; } 124 | if ( !$alt1 ) 125 | { 126 | $alt=$alt2; 127 | $gt='0|1'; 128 | } 129 | elsif ( !$alt2 ) 130 | { 131 | $alt=$alt1; 132 | $gt='0|1'; 133 | } 134 | else 135 | { 136 | $alt="$alt1,$alt2"; 137 | $gt='1|2'; 138 | } 139 | } 140 | else 141 | { 142 | # SNP 143 | ($alt,$gt) = iupac_to_gtype($ref,$cons); 144 | } 145 | 146 | print $fh_out "$chr\t$pos\t.\t$ref\t$alt\t$snp_qual\t0\t\tGT:GQ:DP\t$gt:$cons_qual:$depth\n"; 147 | 148 | $prev_ref = $ref; 149 | $prev_pos = $pos; 150 | $prev_chr = $chr; 151 | } 152 | } 153 | 154 | 155 | #------------- Fasta -------------------- 156 | # 157 | # Uses samtools to get a requested base from a fasta file. For efficiency, preloads 158 | # a chunk to memory. The size of the cached sequence can be controlled by the 'size' 159 | # parameter. 160 | # 161 | package Fasta; 162 | 163 | use strict; 164 | use warnings; 165 | use Carp; 166 | 167 | sub Fasta::new 168 | { 169 | my ($class,@args) = @_; 170 | my $self = @args ? 
{@args} : {}; 171 | if ( !$$self{file} ) { $self->throw(qq[Missing the parameter "file"\n]); } 172 | $$self{chr} = undef; 173 | $$self{from} = undef; 174 | $$self{to} = undef; 175 | if ( !$$self{size} ) { $$self{size}=10_000_000; } 176 | bless $self, ref($class) || $class; 177 | return $self; 178 | } 179 | 180 | sub read_chunk 181 | { 182 | my ($self,$chr,$pos) = @_; 183 | my $to = $pos + $$self{size}; 184 | my $cmd = "samtools faidx $$self{file} $chr:$pos-$to"; 185 | my @out = `$cmd`; 186 | if ( $? ) { $self->throw("$cmd: $!"); } 187 | my $line = shift(@out); 188 | if ( !($line=~/^>$chr:(\d+)-(\d+)/) ) { $self->throw("Could not parse: $line"); } 189 | $$self{chr} = $chr; 190 | $$self{from} = $1; 191 | $$self{to} = $2; 192 | my $chunk = ''; 193 | while ($line=shift(@out)) 194 | { 195 | chomp($line); 196 | $chunk .= $line; 197 | } 198 | $$self{chunk} = $chunk; 199 | return; 200 | } 201 | 202 | sub get_base 203 | { 204 | my ($self,$chr,$pos) = @_; 205 | if ( !$$self{chr} || $chr ne $$self{chr} || $pos<$$self{from} || $pos>$$self{to} ) 206 | { 207 | $self->read_chunk($chr,$pos); 208 | } 209 | my $idx = $pos - $$self{from}; 210 | return substr($$self{chunk},$idx,1); 211 | } 212 | 213 | sub throw 214 | { 215 | my ($self,@msg) = @_; 216 | croak(@msg); 217 | } 218 | -------------------------------------------------------------------------------- /samtools/misc/soap2sam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # Contact: lh3 4 | # Version: 0.1.1 5 | 6 | use strict; 7 | use warnings; 8 | use Getopt::Std; 9 | 10 | &soap2sam; 11 | exit; 12 | 13 | sub mating { 14 | my ($s1, $s2) = @_; 15 | my $isize = 0; 16 | if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize 17 | my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3]; 18 | my $x2 = ($s2->[1] & 0x10)? 
$s2->[3] + length($s2->[9]) : $s2->[3]; 19 | $isize = $x2 - $x1; 20 | } 21 | # update mate coordinate 22 | if ($s2->[2] ne '*') { 23 | @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize); 24 | $s1->[1] |= 0x20 if ($s2->[1] & 0x10); 25 | } else { 26 | $s1->[1] |= 0x8; 27 | } 28 | if ($s1->[2] ne '*') { 29 | @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize); 30 | $s2->[1] |= 0x20 if ($s1->[1] & 0x10); 31 | } else { 32 | $s2->[1] |= 0x8; 33 | } 34 | } 35 | 36 | sub soap2sam { 37 | my %opts = (); 38 | getopts("p", \%opts); 39 | die("Usage: soap2sam.pl [-p] \n") if (@ARGV == 0 && -t STDIN); 40 | my $is_paired = defined($opts{p}); 41 | # core loop 42 | my @s1 = (); 43 | my @s2 = (); 44 | my ($s_last, $s_curr) = (\@s1, \@s2); 45 | while (<>) { 46 | s/[\177-\377]|[\000-\010]|[\012-\040]//g; 47 | next if (&soap2sam_aux($_, $s_curr, $is_paired) < 0); 48 | if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) { 49 | &mating($s_last, $s_curr); 50 | print join("\t", @$s_last), "\n"; 51 | print join("\t", @$s_curr), "\n"; 52 | @$s_last = (); @$s_curr = (); 53 | } else { 54 | print join("\t", @$s_last), "\n" if (@$s_last != 0); 55 | my $s = $s_last; $s_last = $s_curr; $s_curr = $s; 56 | } 57 | } 58 | print join("\t", @$s_last), "\n" if (@$s_last != 0); 59 | } 60 | 61 | sub soap2sam_aux { 62 | my ($line, $s, $is_paired) = @_; 63 | chomp($line); 64 | my @t = split(/\s+/, $line); 65 | return -1 if (@t < 9 || $line =~ /^\s/ || !$t[0]); 66 | @$s = (); 67 | # fix SOAP-2.1.x bugs 68 | @t = @t[0..2,4..$#t] unless ($t[3] =~ /^\d+$/); 69 | # read name 70 | $s->[0] = $t[0]; 71 | $s->[0] =~ s/\/[12]$//g; 72 | # initial flag (will be updated later) 73 | $s->[1] = 0; 74 | $s->[1] |= 1 | 1<<($t[4] eq 'a'? 6 : 7); 75 | $s->[1] |= 2 if ($is_paired); 76 | # read & quality 77 | $s->[9] = $t[1]; 78 | $s->[10] = (length($t[2]) > length($t[1]))? substr($t[2], 0, length($t[1])) : $t[2]; 79 | # cigar 80 | $s->[5] = length($s->[9]) . 
"M"; 81 | # coor 82 | $s->[2] = $t[7]; $s->[3] = $t[8]; 83 | $s->[1] |= 0x10 if ($t[6] eq '-'); 84 | # mapQ 85 | $s->[4] = $t[3] == 1? 30 : 0; 86 | # mate coordinate 87 | $s->[6] = '*'; $s->[7] = $s->[8] = 0; 88 | # aux 89 | push(@$s, "NM:i:$t[9]"); 90 | my $md = ''; 91 | if ($t[9]) { 92 | my @x; 93 | for (10 .. $#t) { 94 | push(@x, sprintf("%.3d,$1", $2)) if ($t[$_] =~ /^([ACGT])->(\d+)/i); 95 | } 96 | @x = sort(@x); 97 | my $a = 0; 98 | for (@x) { 99 | my ($y, $z) = split(","); 100 | $md .= (int($y)-$a) . $z; 101 | $a += $y - $a + 1; 102 | } 103 | $md .= length($t[1]) - $a; 104 | } else { 105 | $md = length($t[1]); 106 | } 107 | push(@$s, "MD:Z:$md"); 108 | return 0; 109 | } 110 | -------------------------------------------------------------------------------- /samtools/misc/wgsim_eval.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # Contact: lh3 4 | # Version: 0.1.5 5 | 6 | use strict; 7 | use warnings; 8 | use Getopt::Std; 9 | 10 | &wgsim_eval; 11 | exit; 12 | 13 | sub wgsim_eval { 14 | my %opts = (g=>5); 15 | getopts('pcg:', \%opts); 16 | die("Usage: wgsim_eval.pl [-pc] [-g $opts{g}] \n") if (@ARGV == 0 && -t STDIN); 17 | my (@c0, @c1); 18 | my ($max_q, $flag) = (0, 0); 19 | my $gap = $opts{g}; 20 | $flag |= 1 if (defined $opts{p}); 21 | $flag |= 2 if (defined $opts{c}); 22 | while (<>) { 23 | next if (/^\@/); 24 | my @t = split("\t"); 25 | next if (@t < 11); 26 | my $line = $_; 27 | my ($q, $is_correct, $chr, $left, $rght) = (int($t[4]/10), 1, $t[2], $t[3], $t[3]); 28 | $max_q = $q if ($q > $max_q); 29 | # right coordinate 30 | $_ = $t[5]; s/(\d+)[MDN]/$rght+=$1,'x'/eg; 31 | --$rght; 32 | # correct for soft clipping 33 | my ($left0, $rght0) = ($left, $rght); 34 | $left -= $1 if (/^(\d+)[SH]/); 35 | $rght += $1 if (/(\d+)[SH]$/); 36 | $left0 -= $1 if (/(\d+)[SH]$/); 37 | $rght0 += $1 if (/^(\d+)[SH]/); 38 | # skip unmapped reads 39 | next if (($t[1]&0x4) || $chr eq '*'); 40 | # parse read 
name and check 41 | if ($t[0] =~ /^(\S+)_(\d+)_(\d+)_/) { 42 | if ($1 ne $chr) { # different chr 43 | $is_correct = 0; 44 | } else { 45 | if ($flag & 2) { 46 | if (($t[1]&0x40) && !($t[1]&0x10)) { # F3, forward 47 | $is_correct = 0 if (abs($2 - $left) > $gap && abs($2 - $left0) > $gap); 48 | } elsif (($t[1]&0x40) && ($t[1]&0x10)) { # F3, reverse 49 | $is_correct = 0 if (abs($3 - $rght) > $gap && abs($3 - $rght0) > $gap); 50 | } elsif (($t[1]&0x80) && !($t[1]&0x10)) { # R3, forward 51 | $is_correct = 0 if (abs($3 - $left) > $gap && abs($3 - $left0) > $gap); 52 | } else { # R3, reverse 53 | $is_correct = 0 if (abs($2 - $rght) > $gap && abs($3 - $rght0) > $gap); 54 | } 55 | } else { 56 | if ($t[1] & 0x10) { # reverse 57 | $is_correct = 0 if (abs($3 - $rght) > $gap && abs($3 - $rght0) > $gap); # in case of indels that are close to the end of a reads 58 | } else { 59 | $is_correct = 0 if (abs($2 - $left) > $gap && abs($2 - $left0) > $gap); 60 | } 61 | } 62 | } 63 | } else { 64 | warn("[wgsim_eval] read '$t[0]' was not generated by wgsim?\n"); 65 | next; 66 | } 67 | ++$c0[$q]; 68 | ++$c1[$q] unless ($is_correct); 69 | print STDERR $line if (($flag&1) && !$is_correct && $q > 0); 70 | } 71 | # print 72 | my ($cc0, $cc1) = (0, 0); 73 | for (my $i = $max_q; $i >= 0; --$i) { 74 | $c0[$i] = 0 unless (defined $c0[$i]); 75 | $c1[$i] = 0 unless (defined $c1[$i]); 76 | $cc0 += $c0[$i]; $cc1 += $c1[$i]; 77 | printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /samtools/misc/zoom2sam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # Contact: lh3 4 | # Version: 0.1.0 5 | 6 | use strict; 7 | use warnings; 8 | use Getopt::Std; 9 | 10 | &zoom2sam; 11 | exit; 12 | 13 | sub mating { 14 | my ($s1, $s2) = @_; 15 | my $isize = 0; 16 | if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate 
#!/usr/bin/perl -w

# Contact: lh3
# Version: 0.1.0

use strict;
use warnings;
use Getopt::Std;

&zoom2sam;
exit;

# Length used to locate the far end of a reverse-strand record: the
# sequence length when a sequence is present, otherwise the "<n>M" CIGAR
# (zoom2sam_aux always writes SEQ as '*', whose length would be 1).
sub aln_len {
  my ($s) = @_;
  return length($s->[9]) if ($s->[9] ne '*');
  return ($s->[5] =~ /^(\d+)M$/)? $1 : 0;
}

# Fill in the mate fields (columns 7-9) and mate flags of two records
# known to be a pair. Both arguments are array refs of SAM columns and
# are modified in place.
sub mating {
  my ($s1, $s2) = @_;
  my $isize = 0;
  if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize
    my $x1 = ($s1->[1] & 0x10)? $s1->[3] + &aln_len($s1) : $s1->[3];
    my $x2 = ($s2->[1] & 0x10)? $s2->[3] + &aln_len($s2) : $s2->[3];
    $isize = $x2 - $x1;
  }
  # update mate coordinate
  if ($s2->[2] ne '*') {
    @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize);
    $s1->[1] |= 0x20 if ($s2->[1] & 0x10);
  } else {
    $s1->[1] |= 0x8;
  }
  if ($s1->[2] ne '*') {
    @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize);
    $s2->[1] |= 0x20 if ($s1->[1] & 0x10);
  } else {
    $s2->[1] |= 0x8;
  }
}

# Main loop: convert each input line to a SAM record. Two consecutive
# records with the same read name are treated as a pair and printed
# together after mating(); anything else is printed on its own.
# Lines zoom2sam_aux cannot parse are skipped.
sub zoom2sam {
  my %opts = ();
  getopts("p", \%opts);
  die("Usage: zoom2sam.pl [-p] <readLen> <reads.zoom>
Warnings: This script only supports the default Illumina outputs.\n") if (@ARGV < 2);
  my $is_paired = defined($opts{p});
  my $len = shift(@ARGV);
  # core loop
  my @s1 = ();
  my @s2 = ();
  my ($s_last, $s_curr) = (\@s1, \@s2);
  while (<>) {
    next if (&zoom2sam_aux($_, $s_curr, $is_paired, $len) < 0);
    if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) {
      &mating($s_last, $s_curr);
      print join("\t", @$s_last), "\n";
      print join("\t", @$s_curr), "\n";
      @$s_last = (); @$s_curr = ();
    } else {
      print join("\t", @$s_last), "\n" if (@$s_last != 0);
      my $s = $s_last; $s_last = $s_curr; $s_curr = $s;
    }
  }
  print join("\t", @$s_last), "\n" if (@$s_last != 0);
}

# Parse one tab-separated input line into the SAM columns of @$s.
#   $line       raw input line
#   $s          array ref that receives the record (overwritten)
#   $is_paired  true to set the "properly paired" flag (0x2)
#   $len        read length, used for the CIGAR
# Returns 0 on success, or -1 (leaving @$s untouched) when the line has
# fewer than four fields or its second field does not end in ":<digits>".
sub zoom2sam_aux {
  my ($line, $s, $is_paired, $len) = @_;
  chomp($line);
  my @t = split("\t", $line);
  return -1 if (@t < 4);
  # coor: validated before anything is written, so a malformed line can
  # no longer pick up a stale $1
  return -1 unless ($t[1] =~ /:(\d+)$/);
  my $pos = $1 + 1;
  my ($ref) = split(/\s+/, $t[1]);
  @$s = ();
  # read name
  $s->[0] = $t[0];
  # initial flag (will be updated later)
  $s->[1] = 0;
  $s->[1] |= 1 | 1<<6 if ($s->[0] =~ /_F$/);
  $s->[1] |= 1 | 1<<7 if ($s->[0] =~ /_R$/);
  $s->[1] |= 2 if ($is_paired);
  # read & quality
  $s->[9] = "*"; $s->[10] = "*";
  # cigar
  $s->[5] = $len . "M";
  $s->[2] = $ref;
  $s->[3] = $pos;
  if ($s->[0] =~ /_[FR]$/) {
    # reverse when exactly one of "is the _F read" / "strand is +" holds
    my $u = ($s->[0] =~ /_F$/)? 1 : 0;
    my $w = ($t[2] eq '+')? 1 : 0;
    $s->[1] |= 0x10 if ($u ^ $w);
    $s->[0] =~ s/_[FR]$//;
  } else {
    $s->[1] |= 0x10 if ($t[2] eq '-');
  }
  # mapQ
  $s->[4] = 30;
  # mate coordinate
  $s->[6] = '*'; $s->[7] = $s->[8] = 0;
  # aux
  push(@$s, "NM:i:$t[3]");
  return 0;
}
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 | * SUCH DAMAGE. 30 | */ 31 | 32 | 33 | #ifndef __RAZF_RJ_H 34 | #define __RAZF_RJ_H 35 | 36 | #include 37 | #include 38 | #include "zlib.h" 39 | 40 | #ifdef _USE_KNETFILE 41 | #include "knetfile.h" 42 | #endif 43 | 44 | #if ZLIB_VERNUM < 0x1221 45 | #define _RZ_READONLY 46 | struct _gz_header_s; 47 | typedef struct _gz_header_s _gz_header; 48 | #define gz_header _gz_header 49 | #endif 50 | 51 | #define WINDOW_BITS 15 52 | 53 | #ifndef RZ_BLOCK_SIZE 54 | #define RZ_BLOCK_SIZE (1<mode from HEAD to TYPE after call inflateReset */ 104 | int buf_off, buf_len; 105 | int z_err, z_eof; 106 | int seekable; 107 | /* Indice where the source is seekable */ 108 | int load_index; 109 | /* set has_index to 0 in mode 'w', then index will be discarded */ 110 | } RAZF; 111 | 112 | #ifdef __cplusplus 113 | extern "C" { 114 | #endif 115 | 116 | RAZF* razf_dopen(int data_fd, const char *mode); 117 | RAZF *razf_open(const char *fn, const char *mode); 118 | int razf_write(RAZF* rz, const void *data, int size); 119 | int razf_read(RAZF* rz, void *data, int size); 120 | int64_t razf_seek(RAZF* rz, int64_t pos, int where); 121 | void razf_close(RAZF* rz); 122 | 123 | #define razf_tell(rz) ((rz)->out) 124 | 125 | RAZF* razf_open2(const char *filename, const char *mode); 126 | RAZF* razf_dopen2(int fd, const char *mode); 127 | uint64_t razf_tell2(RAZF *rz); 128 | int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where); 129 | 130 | #ifdef __cplusplus 131 | } 132 | 
#endif 133 | 134 | #endif 135 | -------------------------------------------------------------------------------- /samtools/razip.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "razf.h" 8 | 9 | #define WINDOW_SIZE 4096 10 | 11 | static int razf_main_usage() 12 | { 13 | printf("\n"); 14 | printf("Usage: razip [options] [file] ...\n\n"); 15 | printf("Options: -c write on standard output, keep original files unchanged\n"); 16 | printf(" -d decompress\n"); 17 | printf(" -l list compressed file contents\n"); 18 | printf(" -b INT decompress at INT position in the uncompressed file\n"); 19 | printf(" -s INT decompress INT bytes in the uncompressed file\n"); 20 | printf(" -h give this help\n"); 21 | printf("\n"); 22 | return 0; 23 | } 24 | 25 | static int write_open(const char *fn, int is_forced) 26 | { 27 | int fd = -1; 28 | char c; 29 | if (!is_forced) { 30 | if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) { 31 | printf("razip: %s already exists; do you wish to overwrite (y or n)? 
", fn); 32 | scanf("%c", &c); 33 | if (c != 'Y' && c != 'y') { 34 | printf("razip: not overwritten\n"); 35 | exit(1); 36 | } 37 | } 38 | } 39 | if (fd < 0) { 40 | if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) { 41 | fprintf(stderr, "razip: %s: Fail to write\n", fn); 42 | exit(1); 43 | } 44 | } 45 | return fd; 46 | } 47 | 48 | int main(int argc, char **argv) 49 | { 50 | int c, compress, pstdout, is_forced; 51 | RAZF *rz; 52 | void *buffer; 53 | long start, end, size; 54 | 55 | compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; 56 | while((c = getopt(argc, argv, "cdlhfb:s:")) >= 0){ 57 | switch(c){ 58 | case 'h': return razf_main_usage(); 59 | case 'd': compress = 0; break; 60 | case 'c': pstdout = 1; break; 61 | case 'l': compress = 2; break; 62 | case 'b': start = atol(optarg); break; 63 | case 's': size = atol(optarg); break; 64 | case 'f': is_forced = 1; break; 65 | } 66 | } 67 | if (size >= 0) end = start + size; 68 | if(end >= 0 && end < start){ 69 | fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end); 70 | return 1; 71 | } 72 | if(compress == 1){ 73 | int f_src, f_dst = -1; 74 | if(argc > optind){ 75 | if((f_src = open(argv[optind], O_RDONLY)) < 0){ 76 | fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]); 77 | return 1; 78 | } 79 | if(pstdout){ 80 | f_dst = fileno(stdout); 81 | } else { 82 | char *name = malloc(sizeof(strlen(argv[optind]) + 5)); 83 | strcpy(name, argv[optind]); 84 | strcat(name, ".rz"); 85 | f_dst = write_open(name, is_forced); 86 | if (f_dst < 0) return 1; 87 | free(name); 88 | } 89 | } else if(pstdout){ 90 | f_src = fileno(stdin); 91 | f_dst = fileno(stdout); 92 | } else return razf_main_usage(); 93 | rz = razf_dopen(f_dst, "w"); 94 | buffer = malloc(WINDOW_SIZE); 95 | while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) razf_write(rz, buffer, c); 96 | razf_close(rz); // f_dst will be closed here 97 | if (argc > optind) unlink(argv[optind]); 98 | free(buffer); 99 | close(f_src); 
100 | return 0; 101 | } else { 102 | if(argc <= optind) return razf_main_usage(); 103 | if(compress == 2){ 104 | rz = razf_open(argv[optind], "r"); 105 | if(rz->file_type == FILE_TYPE_RZ) { 106 | printf("%20s%20s%7s %s\n", "compressed", "uncompressed", "ratio", "name"); 107 | printf("%20lld%20lld%6.1f%% %s\n", (long long)rz->end, (long long)rz->src_end, rz->end * 100.0f / rz->src_end, 108 | argv[optind]); 109 | } else fprintf(stdout, "%s is not a regular rz file\n", argv[optind]); 110 | } else { 111 | int f_dst; 112 | if (argc > optind && !pstdout) { 113 | char *name; 114 | if (strstr(argv[optind], ".rz") - argv[optind] != strlen(argv[optind]) - 3) { 115 | printf("razip: %s: unknown suffix -- ignored\n", argv[optind]); 116 | return 1; 117 | } 118 | name = strdup(argv[optind]); 119 | name[strlen(name) - 3] = '\0'; 120 | f_dst = write_open(name, is_forced); 121 | free(name); 122 | } else f_dst = fileno(stdout); 123 | rz = razf_open(argv[optind], "r"); 124 | buffer = malloc(WINDOW_SIZE); 125 | razf_seek(rz, start, SEEK_SET); 126 | while(1){ 127 | if(end < 0) c = razf_read(rz, buffer, WINDOW_SIZE); 128 | else c = razf_read(rz, buffer, (end - start > WINDOW_SIZE)? 
WINDOW_SIZE:(end - start)); 129 | if(c <= 0) break; 130 | start += c; 131 | write(f_dst, buffer, c); 132 | if(end >= 0 && start >= end) break; 133 | } 134 | free(buffer); 135 | if (!pstdout) unlink(argv[optind]); 136 | } 137 | razf_close(rz); 138 | return 0; 139 | } 140 | } 141 | 142 | -------------------------------------------------------------------------------- /samtools/sam.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "faidx.h" 4 | #include "sam.h" 5 | 6 | #define TYPE_BAM 1 7 | #define TYPE_READ 2 8 | /* Deep-copy a BAM header: the text, target_len and target_name arrays are duplicated, while hash, dict and rg2lib are reset to 0 instead of being copied. Returns 0 when h0 is 0. */ 9 | bam_header_t *bam_header_dup(const bam_header_t *h0) 10 | { 11 | bam_header_t *h; 12 | int i; 13 | if (h0 == 0) return 0; /* nothing to copy */ h = bam_header_init(); 14 | *h = *h0; 15 | h->hash = h->dict = h->rg2lib = 0; 16 | h->text = (char*)calloc(h->l_text + 1, 1); 17 | memcpy(h->text, h0->text, h->l_text); 18 | h->target_len = (uint32_t*)calloc(h->n_targets, 4); 19 | h->target_name = (char**)calloc(h->n_targets, sizeof(void*)); 20 | for (i = 0; i < h->n_targets; ++i) { 21 | h->target_len[i] = h0->target_len[i]; 22 | h->target_name[i] = strdup(h0->target_name[i]); 23 | } 24 | return h; 25 | } 26 | /* Append len bytes of text to header->text and keep it NUL-terminated; the buffer is reallocated only when the rounded-up capacity grows. A null text is a no-op. */ static void append_header_text(bam_header_t *header, char* text, int len) 27 | { 28 | int x = header->l_text + 1; 29 | int y = header->l_text + len + 1; // 1 byte null 30 | if (text == 0) return; 31 | kroundup32(x); 32 | kroundup32(y); 33 | if (x < y) header->text = (char*)realloc(header->text, y); 34 | strncpy(header->text + header->l_text, text, len); // we cannot use strcpy() here. 35 | header->l_text += len; 36 | header->text[header->l_text] = 0; 37 | } 38 | /* Open fn as SAM or BAM according to mode ('r' or 'w', with 'b' as mode[1] selecting BAM); "-" selects stdin/stdout. In write mode aux must point to the bam_header_t to write; when reading SAM without @SQ lines aux may name a reference list. Returns 0 on failure. */ 39 | samfile_t *samopen(const char *fn, const char *mode, const void *aux) 40 | { 41 | samfile_t *fp; 42 | fp = (samfile_t*)calloc(1, sizeof(samfile_t)); 43 | if (mode[0] == 'r') { // read 44 | fp->type |= TYPE_READ; 45 | if (mode[1] == 'b') { // binary 46 | fp->type |= TYPE_BAM; 47 | fp->x.bam = strcmp(fn, "-")? 
bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); 48 | if (fp->x.bam == 0) goto open_err_ret; 49 | fp->header = bam_header_read(fp->x.bam); 50 | } else { // text 51 | fp->x.tamr = sam_open(fn); 52 | if (fp->x.tamr == 0) goto open_err_ret; 53 | fp->header = sam_header_read(fp->x.tamr); 54 | if (fp->header->n_targets == 0) { // no @SQ fields 55 | if (aux) { // check if aux is present 56 | bam_header_t *textheader = fp->header; 57 | fp->header = sam_header_read2((const char*)aux); if (fp->header == 0) { /* the reference list could not be loaded: release what was opened and fail instead of dereferencing a null header */ bam_header_destroy(textheader); sam_close(fp->x.tamr); goto open_err_ret; } 58 | append_header_text(fp->header, textheader->text, textheader->l_text); 59 | bam_header_destroy(textheader); 60 | } 61 | if (fp->header->n_targets == 0) 62 | fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); 63 | } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); 64 | } 65 | } else if (mode[0] == 'w') { // write 66 | fp->header = bam_header_dup((const bam_header_t*)aux); if (fp->header == 0) goto open_err_ret; /* a header is mandatory for writing */ 67 | if (mode[1] == 'b') { // binary 68 | char bmode[3]; 69 | bmode[0] = 'w'; bmode[1] = strstr(mode, "u")? 'u' : 0; bmode[2] = 0; 70 | fp->type |= TYPE_BAM; 71 | fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode); 72 | if (fp->x.bam == 0) goto open_err_ret; 73 | bam_header_write(fp->x.bam, fp->header); 74 | } else { // text 75 | // open file 76 | fp->x.tamw = strcmp(fn, "-")? 
fopen(fn, "w") : stdout; 77 | if (fp->x.tamw == 0) goto open_err_ret; /* test the union member that was just assigned */ 78 | if (strstr(mode, "X")) fp->type |= BAM_OFSTR<<2; 79 | else if (strstr(mode, "x")) fp->type |= BAM_OFHEX<<2; 80 | else fp->type |= BAM_OFDEC<<2; 81 | // write header 82 | if (strstr(mode, "h")) { 83 | int i; 84 | bam_header_t *alt; 85 | // parse the header text 86 | alt = bam_header_init(); 87 | alt->l_text = fp->header->l_text; alt->text = fp->header->text; 88 | sam_header_parse(alt); 89 | alt->l_text = 0; alt->text = 0; 90 | // check if there are @SQ lines in the header 91 | fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); 92 | if (alt->n_targets) { // then write the header text without dumping ->target_{name,len} 93 | if (alt->n_targets != fp->header->n_targets) 94 | fprintf(stderr, "[samopen] inconsistent number of target sequences.\n"); 95 | } else { // then dump ->target_{name,len} 96 | for (i = 0; i < fp->header->n_targets; ++i) 97 | fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%u\n", fp->header->target_name[i], (unsigned)fp->header->target_len[i]); /* target_len is uint32_t, so print it unsigned */ 98 | } 99 | bam_header_destroy(alt); 100 | } 101 | } 102 | } 103 | return fp; 104 | 105 | open_err_ret: 106 | if (fp->header) bam_header_destroy(fp->header); /* write mode duplicates the header before the file is opened */ free(fp); 107 | return 0; 108 | } 109 | /* Destroy the header, close the underlying BAM/SAM/FILE handle and free fp; a null fp is a no-op. */ 110 | void samclose(samfile_t *fp) 111 | { 112 | if (fp == 0) return; 113 | if (fp->header) bam_header_destroy(fp->header); 114 | if (fp->type & TYPE_BAM) bam_close(fp->x.bam); 115 | else if (fp->type & TYPE_READ) sam_close(fp->x.tamr); 116 | else fclose(fp->x.tamw); 117 | free(fp); 118 | } 119 | /* Read one alignment into b; returns -1 when fp is null or was not opened for reading, otherwise the result of bam_read1()/sam_read1(). */ 120 | int samread(samfile_t *fp, bam1_t *b) 121 | { 122 | if (fp == 0 || !(fp->type & TYPE_READ)) return -1; // not open for reading 123 | if (fp->type & TYPE_BAM) return bam_read1(fp->x.bam, b); 124 | else return sam_read1(fp->x.tamr, fp->header, b); 125 | } 126 | /* Write one alignment; returns -1 when fp is null or was opened for reading. */ 127 | int samwrite(samfile_t *fp, const bam1_t *b) 128 | { 129 | if (fp == 0 || (fp->type & TYPE_READ)) return -1; // not open for writing 130 | if (fp->type & TYPE_BAM) return bam_write1(fp->x.bam, b); 131 | else { 132 | char *s = 
bam_format1_core(fp->header, b, fp->type>>2&3); 133 | int l = strlen(s); 134 | fputs(s, fp->x.tamw); fputc('\n', fp->x.tamw); 135 | free(s); 136 | return l + 1; /* length of the text line plus the '\n' written after it */ 137 | } 138 | } 139 | /* Read every alignment from fp with samread() and push it into a pileup buffer created for func/func_data with the given mask; a final push of 0 follows the last record. Always returns 0. */ 140 | int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data) 141 | { 142 | bam_plbuf_t *buf; 143 | int ret; 144 | bam1_t *b; 145 | b = bam_init1(); 146 | buf = bam_plbuf_init(func, func_data); 147 | bam_plbuf_set_mask(buf, mask); 148 | while ((ret = samread(fp, b)) >= 0) 149 | bam_plbuf_push(b, buf); 150 | bam_plbuf_push(0, buf); 151 | bam_plbuf_destroy(buf); 152 | bam_destroy1(b); 153 | return 0; 154 | } 155 | /* Return a newly allocated string holding fn_ref followed by ".fai". When that index file is unreadable and fn_ref is readable, the index is built with fai_build(). Returns 0 when fn_ref is 0 or fai_build() fails. NOTE(review): when fn_ref itself is unreadable only a message is printed and the path is still returned; confirm callers expect that. */ 156 | char *samfaipath(const char *fn_ref) 157 | { 158 | char *fn_list = 0; 159 | if (fn_ref == 0) return 0; 160 | fn_list = calloc(strlen(fn_ref) + 5, 1); 161 | strcat(strcpy(fn_list, fn_ref), ".fai"); 162 | if (access(fn_list, R_OK) == -1) { // fn_list is unreadable 163 | if (access(fn_ref, R_OK) == -1) { 164 | fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref); 165 | } else { 166 | fprintf(stderr, "[samfaipath] build FASTA index...\n"); 167 | if (fai_build(fn_ref) == -1) { 168 | fprintf(stderr, "[samfaipath] fail to build FASTA index.\n"); 169 | free(fn_list); fn_list = 0; 170 | } 171 | } 172 | } 173 | return fn_list; 174 | } 175 | -------------------------------------------------------------------------------- /samtools/sam.h: -------------------------------------------------------------------------------- 1 | #ifndef BAM_SAM_H 2 | #define BAM_SAM_H 3 | 4 | #include "bam.h" 5 | 6 | /*! 7 | @header 8 | 9 | This file provides higher-level I/O routines and unifies the APIs 10 | for SAM and BAM formats. These APIs are more convenient and 11 | recommended. 12 | 13 | @copyright Genome Research Ltd. 14 | */ 15 | 16 | /*! 
@typedef 17 | @abstract SAM/BAM file handler 18 | @field type type of the handler; bit 1 for BAM, 2 for reading and bit 3-4 for flag format 19 | @field bam BAM file handler; valid if (type&1) == 1 20 | @field tamr SAM file handler for reading; valid if type == 2 21 | @field tamw SAM file handler for writing; valid if type == 0 22 | @field header header struct 23 | */ 24 | typedef struct { 25 | int type; 26 | union { 27 | tamFile tamr; 28 | bamFile bam; 29 | FILE *tamw; 30 | } x; 31 | bam_header_t *header; 32 | } samfile_t; 33 | 34 | #ifdef __cplusplus 35 | extern "C" { 36 | #endif 37 | 38 | /*! 39 | @abstract Open a SAM/BAM file 40 | 41 | @param fn SAM/BAM file name; "-" is recognized as stdin (for 42 | reading) or stdout (for writing). 43 | 44 | @param mode open mode /[rw](b?)(u?)(h?)([xX]?)/: 'r' for reading, 45 | 'w' for writing, 'b' for BAM I/O, 'u' for uncompressed BAM output, 46 | 'h' for outputting the header in SAM, 'x' for HEX flag and 'X' for 47 | string flag. If 'b' is present, it must immediately follow 'r' or 48 | 'w'. Valid modes are "r", "w", "wh", "wx", "whx", "wX", "whX", 49 | "rb", "wb" and "wbu" exclusively. 50 | 51 | @param aux auxiliary data; if mode[0]=='w', aux points to 52 | bam_header_t; if strcmp(mode, "rb")!=0 and @SQ header lines in SAM 53 | are absent, aux points to the file name of the list of the reference; 54 | aux is not used otherwise. If @SQ header lines are present in SAM, 55 | aux is not used, either. 56 | 57 | @return SAM/BAM file handler, or 0 (NULL) if the file cannot be opened 58 | */ 59 | samfile_t *samopen(const char *fn, const char *mode, const void *aux); 60 | 61 | /*! 62 | @abstract Close a SAM/BAM handler 63 | @param fp file handler to be closed; 0 (NULL) is ignored 64 | */ 65 | void samclose(samfile_t *fp); 66 | 67 | /*! 68 | @abstract Read one alignment 69 | @param fp file handler 70 | @param b alignment 71 | @return bytes read; -1 if fp is 0 or was not opened for reading 72 | */ 73 | int samread(samfile_t *fp, bam1_t *b); 74 | 75 | /*! 
76 | @abstract Write one alignment 77 | @param fp file handler 78 | @param b alignment 79 | @return bytes written 80 | */ 81 | int samwrite(samfile_t *fp, const bam1_t *b); 82 | 83 | /*! 84 | @abstract Get the pileup for a whole alignment file 85 | @param fp file handler 86 | @param mask mask transferred to bam_plbuf_set_mask() 87 | @param func user defined function called in the pileup process 88 | @param data user provided data for func() 89 | @return always 0 */ 90 | int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data); 91 | /*! @abstract Get the path of the FASTA index of a reference file, building the index if it is not readable @param fn_ref reference FASTA file name @return newly allocated string holding fn_ref followed by ".fai", or 0 if fn_ref is 0 or the index cannot be built */ 92 | char *samfaipath(const char *fn_ref); 93 | 94 | #ifdef __cplusplus 95 | } 96 | #endif 97 | 98 | #endif 99 | -------------------------------------------------------------------------------- /samtools/sam_header.h: -------------------------------------------------------------------------------- 1 | #ifndef __SAM_HEADER_H__ 2 | #define __SAM_HEADER_H__ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | void *sam_header_parse2(const char *headerText); 9 | void *sam_header_merge(int n, const void **dicts); 10 | void sam_header_free(void *header); 11 | char *sam_header_write(const void *headerDict); // returns a newly allocated string 12 | 13 | char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n); 14 | 15 | void *sam_header2tbl(const void *dict, char type[2], char key_tag[2], char value_tag[2]); 16 | const char *sam_tbl_get(void *h, const char *key); 17 | int sam_tbl_size(void *h); 18 | void sam_tbl_destroy(void *h); 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /samtools/showbam.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "sam.h" 3 | /* callback for bam_fetch() */ 4 | static int fetch_func(const bam1_t *b) 5 | { 6 | const bam1_core_t *c = &b->core; 7 | int i; 8 | char* read_name=(char*) bam1_qname(b); 9 | printf("%s\t",read_name); 10 
| char* read_seq=(char*)malloc(c->l_qseq+1); 11 | char* s=(char*) bam1_seq(b); 12 | for(i=0;il_qseq;i++) read_seq[i]=bam_nt16_rev_table[bam1_seqi(s,i)]; 13 | read_seq[i]=0; 14 | printf("%s\t",read_seq); 15 | char* read_qual=(char*)malloc(c->l_qseq+1); 16 | char* t=(char*) bam1_qual(b); 17 | for(i=0;il_qseq;i++) read_qual[i]=t[i]+33; 18 | read_qual[i]=0; 19 | printf("%s\n",read_qual); 20 | free(read_seq); free(read_qual); 21 | return 0; 22 | } 23 | int main(int argc, char *argv[]) 24 | { 25 | samfile_t *fp; 26 | if ((fp = samopen(argv[1], "rb", 0)) == 0) { 27 | fprintf(stderr, "showbam: Fail to open BAM file %s\n", argv[1]); 28 | return 1; 29 | } 30 | bam1_t *b = bam_init1(); 31 | while (samread(fp, b) >= 0) fetch_func(b); 32 | bam_destroy1(b); 33 | samclose(fp); 34 | return 0; 35 | } 36 | -------------------------------------------------------------------------------- /samtools/win32/libcurses.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genome-vendor/bsmap/44ab0345d682a4e54a0c25ad2aebf9392a2a0936/samtools/win32/libcurses.a -------------------------------------------------------------------------------- /samtools/win32/libz.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genome-vendor/bsmap/44ab0345d682a4e54a0c25ad2aebf9392a2a0936/samtools/win32/libz.a -------------------------------------------------------------------------------- /utilities.cpp: -------------------------------------------------------------------------------- 1 | #include "utilities.h" 2 | 3 | using namespace std; 4 | 5 | extern Param param; 6 | 7 | static time_t time_begin; 8 | static time_t time_last; 9 | 10 | time_t Initial_Time() 11 | { 12 | time_begin = time(NULL); 13 | time_last = time_begin; 14 | return time_begin; 15 | }; 16 | 17 | //time used during the past step 18 | time_t Cal_StepTime() 19 | { 20 | time_t tused = time(NULL)-time_last; 21 | 
time_last = time(NULL); 22 | return tused; 23 | }; 24 | 25 | //total time exhaust 26 | time_t Cal_AllTime() 27 | { 28 | return time(NULL)-time_begin; 29 | }; 30 | 31 | //current time on string format 32 | char * Curr_Time() 33 | { 34 | time_t t=time(NULL); 35 | return ctime(&t); 36 | } 37 | 38 | //generate randomness 39 | 40 | bit32_t myrand(int i, bit32_t* rseed) { 41 | if(param.randseed == 0) return rand_r(rseed); 42 | else{ 43 | bit64_t v; 44 | v = ((bit64_t) i+ param.randseed*1000000) * 3935559000370003845LL + 2691343689449507681LL; 45 | v ^= v >> 21; v ^= v << 37; v ^= v >> 4; 46 | v *= 4768777513237032717LL; 47 | v ^= v << 20; v ^= v >> 41; v ^= v << 5; 48 | return (bit32_t) (v&0xffffffffUL); 49 | } 50 | } 51 | 52 | 53 | bool HitComp(Hit a, Hit b) { 54 | return ((a.chr=0; i--) { 65 | cout<>(i*2))&0x3]; 66 | } 67 | cout<<" "; 68 | } 69 | 70 | void disp_bfa64(bit64_t a) { 71 | for(int i=31; i>=0; i--) { 72 | cout<>(i*2))&0x3]; 73 | } 74 | cout<<" "; 75 | } 76 | 77 | 78 | -------------------------------------------------------------------------------- /utilities.h: -------------------------------------------------------------------------------- 1 | #ifndef _UTILITIES_H_ 2 | #define _UTILITIES_H_ 3 | 4 | #include 5 | #include "param.h" 6 | 7 | using namespace std; 8 | 9 | time_t Initial_Time(); 10 | //time used during the past step 11 | time_t Cal_StepTime(); 12 | //total time exhaust 13 | time_t Cal_AllTime(); 14 | //current time on string format 15 | char * Curr_Time(); 16 | 17 | bit32_t myrand(int i, bit32_t * rseed); 18 | bool HitComp(Hit a, Hit b); 19 | bool HitComp2(Hit a, Hit b); 20 | bool HitCompChr(Hit a, Hit b); 21 | void disp_bfa(bit32_t a); 22 | void disp_bfa64(bit64_t a); 23 | 24 | #endif //_UTILITIES_H_ 25 | --------------------------------------------------------------------------------