├── .github └── workflows │ └── create_readme.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── bam-readcount ├── ERR188273_chrX.metrics.txt ├── README.md └── region.bed ├── conf └── mkdocs_env.yml ├── create_readme.sh ├── eg ├── ERR188273_chrX.bam ├── ERR188273_chrX.bam.bai ├── README.md ├── chrX.fa.bz2 ├── clean.sh └── my.bed ├── etc ├── README.md └── rsubread.Rmd ├── genome ├── README.md ├── hg19_info.tsv └── hg38_info.tsv ├── img ├── bam_compare_igv.png └── sam_less.png ├── learning_bam_file.Rmd ├── mkdocs ├── docs │ ├── img │ └── index.md └── mkdocs.yml └── script ├── README.md ├── coverage_test.sh ├── generate_random_seq.pl ├── get_reads.pl ├── parse_bam.pl └── random_paired_end.pl /.github/workflows/create_readme.yml: -------------------------------------------------------------------------------- 1 | # name of workflow that will be displayed on the actions page 2 | name: Create README.md 3 | 4 | # execute workflow only when these files are modified 5 | on: 6 | push: 7 | paths: 8 | - 'eg/**' 9 | - 'Makefile' 10 | - 'create_readme.sh' 11 | - 'learning_bam_file.Rmd' 12 | - '.github/workflows/create_readme.yml' 13 | 14 | # a list of the jobs that run as part of the workflow 15 | jobs: 16 | make_markdown: 17 | runs-on: ubuntu-latest 18 | 19 | # the type of runner to run the given job 20 | container: davetang/r_build:4.1.3 21 | 22 | # a list of the steps that will run as part of the job 23 | steps: 24 | - run: echo "The job was automatically triggered by a ${{ github.event_name }} event." 25 | - run: echo "This job is now running on a ${{ runner.os }} server hosted by GitHub!" 26 | - run: echo "The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." 27 | - name: Check out repository code 28 | uses: actions/checkout@v4 29 | - run: echo "The ${{ github.repository }} repository has been cloned to the runner." 30 | - run: echo "The workflow is now ready to test your code on the runner." 
31 | 32 | - run: make 33 | name: make 34 | 35 | - name: Commit report 36 | run: | 37 | git config --global user.name 'Dave Tang' 38 | git config --global user.email 'davetingpongtang@gmail.com' 39 | git config --global --add safe.directory /__w/learning_bam_file/learning_bam_file 40 | git add "README.md" 41 | git commit -m "Build README.md" 42 | git push origin main 43 | 44 | - name: Build MkDocs site 45 | run: | 46 | cd mkdocs && mkdocs build 47 | 48 | - name: Deploy MkDocs 49 | run: | 50 | git branch gh-pages 51 | git pull 52 | cd mkdocs && mkdocs gh-deploy 53 | 54 | - run: echo "This job's status is ${{ job.status }}." 55 | 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bwa 2 | *.swp 3 | samtools* 4 | htslib* 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Dave Tang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all data tools pandoc clean 2 | 3 | all: readme 4 | data: genome/chrX.fa 5 | tools: github-markdown-toc samtools bwa minimap2 mosdepth pandoc 6 | pandoc: /usr/bin/pandoc 7 | samtools_ver := 1.19.2 8 | minimap2_ver := 2.24 9 | mosdepth_ver := 0.3.7 10 | 11 | genome/chrX.fa: 12 | bunzip2 -c eg/chrX.fa.bz2 > $@ 13 | 14 | github-markdown-toc: 15 | git clone https://github.com/ekalinin/github-markdown-toc.git 16 | 17 | samtools: 18 | wget --quiet https://github.com/samtools/samtools/releases/download/$(samtools_ver)/samtools-$(samtools_ver).tar.bz2 \ 19 | && tar xjf samtools-$(samtools_ver).tar.bz2 \ 20 | && cd samtools-$(samtools_ver) \ 21 | && ./configure \ 22 | && make \ 23 | && mv samtools .. \ 24 | && cd .. \ 25 | && rm -rf samtools-$(samtools_ver).tar.bz2 samtools-$(samtools_ver) 26 | 27 | bwa: 28 | wget --quiet https://github.com/lh3/bwa/releases/download/v0.7.17/bwa-0.7.17.tar.bz2 \ 29 | && tar xjf bwa-0.7.17.tar.bz2 \ 30 | && cd bwa-0.7.17 \ 31 | && make \ 32 | && mv bwa .. \ 33 | && cd .. \ 34 | && rm -rf bwa-* 35 | 36 | minimap2: 37 | wget --quiet https://github.com/lh3/minimap2/archive/refs/tags/v$(minimap2_ver).tar.gz \ 38 | && tar xzf v$(minimap2_ver).tar.gz \ 39 | && cd minimap2-$(minimap2_ver) \ 40 | && make \ 41 | && mv minimap2 .. \ 42 | && cd .. 
\ 43 | && rm -rf minimap2-* v$(minimap2_ver).tar.gz 44 | 45 | mosdepth: 46 | wget --quiet https://github.com/brentp/mosdepth/releases/download/v$(mosdepth_ver)/mosdepth \ 47 | && chmod 755 mosdepth 48 | 49 | /usr/bin/pandoc: 50 | apt update \ 51 | && apt install -y pandoc 52 | 53 | readme: data tools 54 | ./create_readme.sh 55 | 56 | clean: 57 | rm -rf genome/chrX.fa github-markdown-toc samtools bwa minimap2 mostdepth tmp.html && eg/clean.sh 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | Table of Contents 3 | ================= 4 | 5 | * [Learning the BAM format](#learning-the-bam-format) 6 | * [Introduction](#introduction) 7 | * [Installing SAMtools](#installing-samtools) 8 | * [Basic usage](#basic-usage) 9 | * [Viewing](#viewing) 10 | * [Converting a SAM file to a BAM file](#converting-a-sam-file-to-a-bam-file) 11 | * [Converting a BAM file to a CRAM file](#converting-a-bam-file-to-a-cram-file) 12 | * [Sorting a SAM/BAM file](#sorting-a-sambam-file) 13 | * [Creating a BAM index file](#creating-a-bam-index-file) 14 | * [Adding read groups](#adding-read-groups) 15 | * [Interpreting the BAM flags](#interpreting-the-bam-flags) 16 | * [Proper pair](#proper-pair) 17 | * [Filtering unmapped reads](#filtering-unmapped-reads) 18 | * [Extracting entries mapping to a specific loci](#extracting-entries-mapping-to-a-specific-loci) 19 | * [Extracting only the first read from paired end BAM files](#extracting-only-the-first-read-from-paired-end-bam-files) 20 | * [Stats](#stats) 21 | * [samtools calmd/fillmd](#samtools-calmdfillmd) 22 | * [Creating FASTQ files from a BAM file](#creating-fastq-files-from-a-bam-file) 23 | * [Random subsampling of BAM file](#random-subsampling-of-bam-file) 24 | * [Count number of reads](#count-number-of-reads) 25 | * [Obtaining genomic sequence](#obtaining-genomic-sequence) 26 | * [Comparing BAM 
files](#comparing-bam-files) 27 | * [Converting reference names](#converting-reference-names) 28 | * [Coverage](#coverage) 29 | * [Stargazers over time](#stargazers-over-time) 30 | 31 | 32 | 33 | Mon 25 Mar 2024 07:33:09 AM UTC 34 | 35 | Learning the BAM format 36 | ================ 37 | 38 | ## Introduction 39 | 40 | ![Build 41 | README](https://github.com/davetang/learning_bam_file/actions/workflows/create_readme.yml/badge.svg) 42 | 43 | SAMtools provides various (sub)tools for manipulating alignments in the 44 | SAM/BAM format. The SAM (Sequence Alignment/Map) format (BAM is just the 45 | binary form of SAM) is currently the *de facto* standard for storing 46 | large nucleotide sequence alignments. If you are working with 47 | high-throughput sequencing data, at some point you will probably have to 48 | deal with SAM/BAM files, so familiarise yourself with them\! For the 49 | latest information on SAMtools, please refer to the [release 50 | notes](https://github.com/samtools/samtools/releases). 51 | 52 | The examples in this README use the `ERR188273_chrX.bam` BAM file 53 | (stored in the `eg` folder) generated as per 54 | using the HISAT2 + StringTie2 55 | RNA-seq pipeline. This README is generated using the `create_readme.sh` 56 | script; if you want to generate this file yourself, please use [this 57 | Docker image](https://hub.docker.com/repository/docker/davetang/r_build) 58 | and the `Makefile` in this directory. For example: 59 | 60 | ``` bash 61 | # clone this repo 62 | git clone https://github.com/davetang/learning_bam_file.git 63 | cd learning_bam_file 64 | 65 | docker pull davetang/r_build:4.1.2 66 | docker run --rm -it -v $(pwd):/work davetang/r_build:4.1.2 /bin/bash 67 | 68 | # inside the Docker container 69 | make 70 | ``` 71 | 72 | ## Installing SAMtools 73 | 74 | For installing SAMtools, I recommend using `Conda` and the [Bioconda 75 | samtools package](https://anaconda.org/bioconda/samtools). 
I also 76 | recommend using 77 | [Miniconda](https://docs.conda.io/en/latest/miniconda.html) instead of 78 | Anaconda because Anaconda comes with a lot of tools/packages that you 79 | will probably not use. I wrote a [short introduction to 80 | Conda](https://davetang.github.io/reproducible_bioinformatics/conda.html) 81 | if you want to find learn more. 82 | 83 | Once you have installed Miniconda, you can install SAMtools as follows: 84 | 85 | ``` bash 86 | conda install -c bioconda samtools 87 | ``` 88 | 89 | Otherwise you can download the source and compile it yourself; change 90 | `dir` to the location you want `samtools` to be installed. `samtools` 91 | will be installed in `${dir}/bin`, so make sure this is in your `$PATH`. 92 | 93 | ``` bash 94 | #!/usr/bin/env bash 95 | 96 | set -euo pipefail 97 | 98 | ver=1.15 99 | tool=samtools 100 | url=https://github.com/samtools/${tool}/releases/download/${ver}/${tool}-${ver}.tar.bz2 101 | dir=${HOME}/local 102 | 103 | wget ${url} 104 | tar xjf ${tool}-${ver}.tar.bz2 105 | cd ${tool}-${ver} 106 | ./configure --prefix=${dir} 107 | make && make install 108 | cd .. 
109 | 110 | rm -rf ${tool}-${ver} ${tool}-${ver}.tar.bz2 111 | 112 | >&2 echo Done 113 | exit 0 114 | ``` 115 | 116 | ## Basic usage 117 | 118 | If you run `samtools` on the terminal without any parameters or with 119 | `--help`, all the available utilities are listed: 120 | 121 | ``` bash 122 | samtools --help 123 | ``` 124 | 125 | ## 126 | ## Program: samtools (Tools for alignments in the SAM format) 127 | ## Version: 1.19.2 (using htslib 1.19.1) 128 | ## 129 | ## Usage: samtools [options] 130 | ## 131 | ## Commands: 132 | ## -- Indexing 133 | ## dict create a sequence dictionary file 134 | ## faidx index/extract FASTA 135 | ## fqidx index/extract FASTQ 136 | ## index index alignment 137 | ## 138 | ## -- Editing 139 | ## calmd recalculate MD/NM tags and '=' bases 140 | ## fixmate fix mate information 141 | ## reheader replace BAM header 142 | ## targetcut cut fosmid regions (for fosmid pool only) 143 | ## addreplacerg adds or replaces RG tags 144 | ## markdup mark duplicates 145 | ## ampliconclip clip oligos from the end of reads 146 | ## 147 | ## -- File operations 148 | ## collate shuffle and group alignments by name 149 | ## cat concatenate BAMs 150 | ## consensus produce a consensus Pileup/FASTA/FASTQ 151 | ## merge merge sorted alignments 152 | ## mpileup multi-way pileup 153 | ## sort sort alignment file 154 | ## split splits a file by read group 155 | ## quickcheck quickly check if SAM/BAM/CRAM file appears intact 156 | ## fastq converts a BAM to a FASTQ 157 | ## fasta converts a BAM to a FASTA 158 | ## import Converts FASTA or FASTQ files to SAM/BAM/CRAM 159 | ## reference Generates a reference from aligned data 160 | ## reset Reverts aligner changes in reads 161 | ## 162 | ## -- Statistics 163 | ## bedcov read depth per BED region 164 | ## coverage alignment depth and percent coverage 165 | ## depth compute the depth 166 | ## flagstat simple stats 167 | ## idxstats BAM index stats 168 | ## cram-size list CRAM Content-ID and Data-Series sizes 169 | ## 
phase phase heterozygotes 170 | ## stats generate stats (former bamcheck) 171 | ## ampliconstats generate amplicon specific stats 172 | ## 173 | ## -- Viewing 174 | ## flags explain BAM flags 175 | ## head header viewer 176 | ## tview text alignment viewer 177 | ## view SAM<->BAM<->CRAM conversion 178 | ## depad convert padded BAM to unpadded BAM 179 | ## samples list the samples in a set of SAM/BAM/CRAM files 180 | ## 181 | ## -- Misc 182 | ## help [cmd] display this help message or help for [cmd] 183 | ## version detailed version information 184 | 185 | ## Viewing 186 | 187 | Use [bioSyntax](https://github.com/bioSyntax/bioSyntax) to prettify your 188 | output. 189 | 190 | ``` bash 191 | samtools view aln.bam | sam-less 192 | ``` 193 | 194 | ![bioSyntax](img/sam_less.png) 195 | 196 | ## Converting a SAM file to a BAM file 197 | 198 | A BAM file is just a SAM file but stored in binary format; you should 199 | always convert your SAM files into BAM format since they are smaller in 200 | size and are faster to manipulate. 201 | 202 | I don’t have a SAM file in the example folder, so let’s create one and 203 | check out the first ten lines. Note: remember to use `-h` to ensure the 204 | SAM file contains the sequence header information. Generally, I 205 | recommend storing only sorted BAM files as they use even less disk space 206 | and are faster to process. 207 | 208 | ``` bash 209 | samtools view -h eg/ERR188273_chrX.bam > eg/ERR188273_chrX.sam 210 | ``` 211 | 212 | Notice that the SAM file is much larger than the BAM file. 213 | 214 | Size of SAM file. 215 | 216 | ``` bash 217 | ls -lh eg/ERR188273_chrX.sam 218 | ``` 219 | 220 | ## -rw-r--r-- 1 root root 321M Mar 25 07:28 eg/ERR188273_chrX.sam 221 | 222 | Size of BAM file. 223 | 224 | ``` bash 225 | ls -lh eg/ERR188273_chrX.bam 226 | ``` 227 | 228 | ## -rw-r--r-- 1 root root 67M Mar 25 07:27 eg/ERR188273_chrX.bam 229 | 230 | We can use `head` to view a SAM file. 
231 | 232 | ``` bash 233 | head eg/ERR188273_chrX.sam 234 | ``` 235 | 236 | ## @HD VN:1.0 SO:coordinate 237 | ## @SQ SN:chrX LN:156040895 238 | ## @PG ID:hisat2 PN:hisat2 VN:2.2.0 CL:"/Users/dtang/github/rnaseq/hisat2/../src/hisat2-2.2.0/hisat2-align-s --wrapper basic-0 --dta -p 4 -x ../raw/chrX_data/indexes/chrX_tran -1 /tmp/4195.inpipe1 -2 /tmp/4195.inpipe2" 239 | ## @PG ID:samtools PN:samtools PP:hisat2 VN:1.19.2 CL:samtools view -h eg/ERR188273_chrX.bam 240 | ## ERR188273.4711308 73 chrX 21649 0 5S70M = 21649 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 241 | ## ERR188273.4711308 133 chrX 21649 0 * = 21649 0 CTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTGTGTTGGCC CB@FDFFFHHGFHIJJJJIIIIIIIGGGIJGIIJJJJJJFFHIIIIGECHEHHGGHHFF?AACCDDDDDDDDBCD YT:Z:UP 242 | ## ERR188273.4711308 329 chrX 233717 0 5S70M = 233717 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 243 | ## ERR188273.14904746 99 chrX 251271 60 75M = 251317 121 GAAAAATGGGCCCAGGGGACCGGCGCTCAGCATACAGAGGACCCGCGCCGGCACCTGCCTCTGAGTTCCCTTAGT @@HHEGIIGAGIIIBGIIG@FECH aln.bam 257 | ``` 258 | 259 | If the header information is available, we can convert a SAM file into 260 | BAM by using `samtools view -b`. In newer versions of SAMtools, the 261 | input format is auto-detected, so we no longer need the `-S` parameter. 262 | 263 | ``` bash 264 | samtools view -b eg/ERR188273_chrX.sam > eg/my.bam 265 | ``` 266 | 267 | ## Converting a BAM file to a CRAM file 268 | 269 | The CRAM format is even more compact. Use `samtools view` with the `-T` 270 | and `-C` arguments to convert a BAM file into CRAM. 
271 | 272 | ``` bash 273 | samtools view -T genome/chrX.fa -C -o eg/ERR188273_chrX.cram eg/ERR188273_chrX.bam 274 | 275 | ls -lh eg/ERR188273_chrX.[sbcr]*am 276 | ``` 277 | 278 | ## -rw-r--r-- 1 root root 67M Mar 25 07:27 eg/ERR188273_chrX.bam 279 | ## -rw-r--r-- 1 root root 40M Mar 25 07:28 eg/ERR188273_chrX.cram 280 | ## -rw-r--r-- 1 root root 321M Mar 25 07:28 eg/ERR188273_chrX.sam 281 | 282 | You can use `samtools view` to view a CRAM file just as you would for a 283 | BAM file. 284 | 285 | ``` bash 286 | samtools view eg/ERR188273_chrX.cram | head 287 | ``` 288 | 289 | ## ERR188273.4711308 73 chrX 21649 0 5S70M = 21649 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UP NH:i:2 MD:Z:70 NM:i:0 290 | ## ERR188273.4711308 133 chrX 21649 0 * = 21649 0 CTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTGTGTTGGCC CB@FDFFFHHGFHIJJJJIIIIIIIGGGIJGIIJJJJJJFFHIIIIGECHEHHGGHHFF?AACCDDDDDDDDBCD YT:Z:UP 291 | ## ERR188273.4711308 329 chrX 233717 0 5S70M = 233717 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UP NH:i:2 MD:Z:70 NM:i:0 292 | ## ERR188273.14904746 99 chrX 251271 60 75M = 251317 121 GAAAAATGGGCCCAGGGGACCGGCGCTCAGCATACAGAGGACCCGCGCCGGCACCTGCCTCTGAGTTCCCTTAGT @@HHEGIIGAGIIIBGIIG@FECHDC>>+@::8-755-BBBFDDEHHBGGEGHEEIJIIGIJJIGEIIIJJJIIJJIGGHHHGGFFFFF@@C AS:i:0 ZS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UP NH:i:10 MD:Z:75 NM:i:0 296 | ## ERR188273.5927795 385 chrX 265991 1 75M = 114048277 0 TGGGACTACAGGCGCCCGCCACCACGCCCGGCTAATTTTTTGTATTTTTAGTAGAGACGGGGTTTCACCGTGTTA =?BB??BD?FBHHBEAE@CDGG@HH=FA@GEGE;FGACCHBE6?A=ACE9)7@DCE>>5'3=338:;:>2;3?BCFFEEHHHEEGIGGHAGFBBHFBHHEHCG@<@ABG??@@?BB9GBGAFFD<. 
425 | 426 | ``` bash 427 | samtools flags 428 | ``` 429 | 430 | ## About: Convert between textual and numeric flag representation 431 | ## Usage: samtools flags FLAGS... 432 | ## 433 | ## Each FLAGS argument is either an INT (in decimal/hexadecimal/octal) representing 434 | ## a combination of the following numeric flag values, or a comma-separated string 435 | ## NAME,...,NAME representing a combination of the following flag names: 436 | ## 437 | ## 0x1 1 PAIRED paired-end / multiple-segment sequencing technology 438 | ## 0x2 2 PROPER_PAIR each segment properly aligned according to aligner 439 | ## 0x4 4 UNMAP segment unmapped 440 | ## 0x8 8 MUNMAP next segment in the template unmapped 441 | ## 0x10 16 REVERSE SEQ is reverse complemented 442 | ## 0x20 32 MREVERSE SEQ of next segment in template is rev.complemented 443 | ## 0x40 64 READ1 the first segment in the template 444 | ## 0x80 128 READ2 the last segment in the template 445 | ## 0x100 256 SECONDARY secondary alignment 446 | ## 0x200 512 QCFAIL not passing quality controls or other filters 447 | ## 0x400 1024 DUP PCR or optical duplicate 448 | ## 0x800 2048 SUPPLEMENTARY supplementary alignment 449 | 450 | Find out about a `73` flag. 451 | 452 | ``` bash 453 | samtools flags 73 454 | ``` 455 | 456 | ## 0x49 73 PAIRED,MUNMAP,READ1 457 | 458 | ### Proper pair 459 | 460 | Reads that are properly paired are mapped within an expected distance 461 | with each other and with one pair in the reverse complement orientation. 462 | The script `generate_random_seq.pl` can generate reads that originate 463 | from different references and are thus discordant and not properly 464 | paired (as well as properly paired reads). In the example below, 10% of 465 | reads are not properly paired (set with `-d 0.1`). 
466 | 467 | ``` bash 468 | script/generate_random_seq.pl 30 10000 1984 > test_ref.fa 469 | script/random_paired_end.pl -f test_ref.fa -l 100 -n 10000 -m 300 -d 0.1 470 | bwa index test_ref.fa 2> /dev/null 471 | bwa mem test_ref.fa l100_n10000_d300_1984_1.fq.gz l100_n10000_d300_1984_2.fq.gz > aln.sam 2> /dev/null 472 | ``` 473 | 474 | `samtools flagstat` will indicate that some reads (about 10%) mapped to 475 | different chromosomes. 476 | 477 | ``` bash 478 | samtools flagstat aln.sam 479 | ``` 480 | 481 | ## 20000 + 0 in total (QC-passed reads + QC-failed reads) 482 | ## 20000 + 0 primary 483 | ## 0 + 0 secondary 484 | ## 0 + 0 supplementary 485 | ## 0 + 0 duplicates 486 | ## 0 + 0 primary duplicates 487 | ## 20000 + 0 mapped (100.00% : N/A) 488 | ## 20000 + 0 primary mapped (100.00% : N/A) 489 | ## 20000 + 0 paired in sequencing 490 | ## 10000 + 0 read1 491 | ## 10000 + 0 read2 492 | ## 18012 + 0 properly paired (90.06% : N/A) 493 | ## 20000 + 0 with itself and mate mapped 494 | ## 0 + 0 singletons (0.00% : N/A) 495 | ## 1988 + 0 with mate mapped to a different chr 496 | ## 1988 + 0 with mate mapped to a different chr (mapQ>=5) 497 | 498 | Flag of a proper pair. 499 | 500 | ``` bash 501 | samtools flag $(samtools view -f 2 aln.sam | head -1 | cut -f2) 502 | ``` 503 | 504 | ## 0x63 99 PAIRED,PROPER_PAIR,MREVERSE,READ1 505 | 506 | Flag of a pair (that is not a proper pair). 507 | 508 | ``` bash 509 | samtools flag $(samtools view -F 2 aln.sam | head -1 | cut -f2) 510 | ``` 511 | 512 | ## 0x61 97 PAIRED,MREVERSE,READ1 513 | 514 | ## Filtering unmapped reads 515 | 516 | Use `-F 4` to filter out unmapped reads. 517 | 518 | ``` bash 519 | samtools view -F 4 -b eg/ERR188273_chrX.bam > eg/ERR188273_chrX.mapped.bam 520 | ``` 521 | 522 | Use `-f 4` to keep only unmapped reads. 
523 | 524 | ``` bash 525 | samtools view -f 4 -b eg/ERR188273_chrX.bam > eg/ERR188273_chrX.unmapped.bam 526 | ``` 527 | 528 | We can use the `flags` subcommand to confirm that a value of four 529 | represents an unmapped read. 530 | 531 | ``` bash 532 | samtools flags 4 533 | ``` 534 | 535 | ## 0x4 4 UNMAP 536 | 537 | ## Extracting entries mapping to a specific loci 538 | 539 | Use `samtools view` and the `ref:start-end` syntax to extract reads 540 | mapping within a specific genomic loci; this requires a BAM index file. 541 | 542 | ``` bash 543 | samtools view eg/ERR188273_chrX.bam chrX:20000-30000 544 | ``` 545 | 546 | ## ERR188273.4711308 73 chrX 21649 0 5S70M = 21649 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 547 | ## ERR188273.4711308 133 chrX 21649 0 * = 21649 0 CTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTGTGTTGGCC CB@FDFFFHHGFHIJJJJIIIIIIIGGGIJGIIJJJJJJFFHIIIIGECHEHHGGHHFF?AACCDDDDDDDDBCD YT:Z:UP 548 | 549 | Note that this takes into account the mapping of the entire read and not 550 | just the starting position. For example, if you specified 551 | chrX:20000-30000, a 75 bp long read that starts its mapping from 552 | position 19999 will also be returned. In addition, you can save the 553 | output as another BAM file if you want. 554 | 555 | ``` bash 556 | samtools view -b eg/ERR188273_chrX.bam chrX:20000-30000 > eg/ERR188273_chrX_20000_30000.bam 557 | ``` 558 | 559 | If you want reads mapped to a single reference (e.g. chromosome), just 560 | specify the `ref` and leave out the start and end values. 
561 | 562 | ``` bash 563 | samtools view eg/ERR188273_chrX.bam chrX | head 564 | ``` 565 | 566 | ## ERR188273.4711308 73 chrX 21649 0 5S70M = 21649 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 567 | ## ERR188273.4711308 133 chrX 21649 0 * = 21649 0 CTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTGTGTTGGCC CB@FDFFFHHGFHIJJJJIIIIIIIGGGIJGIIJJJJJJFFHIIIIGECHEHHGGHHFF?AACCDDDDDDDDBCD YT:Z:UP 568 | ## ERR188273.4711308 329 chrX 233717 0 5S70M = 233717 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 569 | ## ERR188273.14904746 99 chrX 251271 60 75M = 251317 121 GAAAAATGGGCCCAGGGGACCGGCGCTCAGCATACAGAGGACCCGCGCCGGCACCTGCCTCTGAGTTCCCTTAGT @@HHEGIIGAGIIIBGIIG@FECHDC>>+@::8-755-BBBFDDEHHBGGEGHEEIJIIGIJJIGEIIIJJJIIJJIGGHHHGGFFFFF@@C AS:i:0 ZS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:75 YT:Z:UP NH:i:10 573 | ## ERR188273.5927795 385 chrX 265991 1 75M = 114048277 0 TGGGACTACAGGCGCCCGCCACCACGCCCGGCTAATTTTTTGTATTTTTAGTAGAGACGGGGTTTCACCGTGTTA =?BB??BD?FBHHBEAE@CDGG@HH=FA@GEGE;FGACCHBE6?A=ACE9)7@DCE>>5'3=338:;:>2;3?BCFFEEHHHEEGIGGHAGFBBHFBHHEHCG@<@ABG??@@?BB9GBGAFFD<DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 589 | ## ERR188273.4711308 133 chrX 21649 0 * = 21649 0 CTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTGTGTTGGCC CB@FDFFFHHGFHIJJJJIIIIIIIGGGIJGIIJJJJJJFFHIIIIGECHEHHGGHHFF?AACCDDDDDDDDBCD YT:Z:UP 590 | ## ERR188273.4711308 329 chrX 233717 0 5S70M = 233717 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 
XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 591 | ## ERR188273.14904746 99 chrX 251271 60 75M = 251317 121 GAAAAATGGGCCCAGGGGACCGGCGCTCAGCATACAGAGGACCCGCGCCGGCACCTGCCTCTGAGTTCCCTTAGT @@HHEGIIGAGIIIBGIIG@FECH eg/first.bam 602 | ``` 603 | 604 | Once again, you can use `flags` to verify this (it also accepts 605 | hexadecimal input). 606 | 607 | ``` bash 608 | samtools flags 0x0040 609 | ``` 610 | 611 | ## 0x40 64 READ1 612 | 613 | ## Stats 614 | 615 | For simple statistics use `samtools flagstat`. 616 | 617 | ``` bash 618 | samtools flagstat eg/ERR188273_chrX.bam 619 | ``` 620 | 621 | ## 1176360 + 0 in total (QC-passed reads + QC-failed reads) 622 | ## 1160084 + 0 primary 623 | ## 16276 + 0 secondary 624 | ## 0 + 0 supplementary 625 | ## 0 + 0 duplicates 626 | ## 0 + 0 primary duplicates 627 | ## 1126961 + 0 mapped (95.80% : N/A) 628 | ## 1110685 + 0 primary mapped (95.74% : N/A) 629 | ## 1160084 + 0 paired in sequencing 630 | ## 580042 + 0 read1 631 | ## 580042 + 0 read2 632 | ## 1060858 + 0 properly paired (91.45% : N/A) 633 | ## 1065618 + 0 with itself and mate mapped 634 | ## 45067 + 0 singletons (3.88% : N/A) 635 | ## 0 + 0 with mate mapped to a different chr 636 | ## 0 + 0 with mate mapped to a different chr (mapQ>=5) 637 | 638 | For more stats, use `samtools stats`. 
639 | 640 | ``` bash 641 | samtools stats eg/ERR188273_chrX.bam | grep ^SN 642 | ``` 643 | 644 | ## SN raw total sequences: 1160084 # excluding supplementary and secondary reads 645 | ## SN filtered sequences: 0 646 | ## SN sequences: 1160084 647 | ## SN is sorted: 1 648 | ## SN 1st fragments: 580042 649 | ## SN last fragments: 580042 650 | ## SN reads mapped: 1110685 651 | ## SN reads mapped and paired: 1065618 # paired-end technology bit set + both mates mapped 652 | ## SN reads unmapped: 49399 653 | ## SN reads properly paired: 1060858 # proper-pair bit set 654 | ## SN reads paired: 1160084 # paired-end technology bit set 655 | ## SN reads duplicated: 0 # PCR or optical duplicate bit set 656 | ## SN reads MQ0: 905 # mapped and MQ=0 657 | ## SN reads QC failed: 0 658 | ## SN non-primary alignments: 16276 659 | ## SN supplementary alignments: 0 660 | ## SN total length: 87006300 # ignores clipping 661 | ## SN total first fragment length: 43503150 # ignores clipping 662 | ## SN total last fragment length: 43503150 # ignores clipping 663 | ## SN bases mapped: 83301375 # ignores clipping 664 | ## SN bases mapped (cigar): 83064942 # more accurate 665 | ## SN bases trimmed: 0 666 | ## SN bases duplicated: 0 667 | ## SN mismatches: 423271 # from NM fields 668 | ## SN error rate: 5.095663e-03 # mismatches / bases mapped (cigar) 669 | ## SN average length: 75 670 | ## SN average first fragment length: 75 671 | ## SN average last fragment length: 75 672 | ## SN maximum length: 75 673 | ## SN maximum first fragment length: 75 674 | ## SN maximum last fragment length: 75 675 | ## SN average quality: 36.0 676 | ## SN insert size average: 182.7 677 | ## SN insert size standard deviation: 176.0 678 | ## SN inward oriented pairs: 530763 679 | ## SN outward oriented pairs: 1042 680 | ## SN pairs with other orientation: 1004 681 | ## SN pairs on different chromosomes: 0 682 | ## SN percentage of properly paired reads (%): 91.4 683 | 684 | ## samtools calmd/fillmd 685 | 686 | The 
`calmd` or `fillmd` tool is useful for visualising mismatches and 687 | insertions in an alignment of a read to a reference genome. The `-e` 688 | argument changes identical bases between the read and reference into 689 | `=`. 690 | 691 | ``` bash 692 | samtools view -b eg/ERR188273_chrX.bam | samtools fillmd -e - genome/chrX.fa > eg/ERR188273_chrX_fillmd.bam 693 | 694 | head eg/ERR188273_chrX_fillmd.bam 695 | ``` 696 | 697 | ## @HD VN:1.0 SO:coordinate 698 | ## @SQ SN:chrX LN:156040895 699 | ## @PG ID:hisat2 PN:hisat2 VN:2.2.0 CL:"/Users/dtang/github/rnaseq/hisat2/../src/hisat2-2.2.0/hisat2-align-s --wrapper basic-0 --dta -p 4 -x ../raw/chrX_data/indexes/chrX_tran -1 /tmp/4195.inpipe1 -2 /tmp/4195.inpipe2" 700 | ## @PG ID:samtools PN:samtools PP:hisat2 VN:1.19.2 CL:samtools view -b eg/ERR188273_chrX.bam 701 | ## @PG ID:samtools.1 PN:samtools PP:samtools VN:1.19.2 CL:samtools fillmd -e - genome/chrX.fa 702 | ## ERR188273.4711308 73 chrX 21649 0 5S70M = 21649 0 CGGGT====================================================================== @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 703 | ## ERR188273.4711308 133 chrX 21649 0 * = 21649 0 CTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTGTGTTGGCC CB@FDFFFHHGFHIJJJJIIIIIIIGGGIJGIIJJJJJJFFHIIIIGECHEHHGGHHFF?AACCDDDDDDDDBCD YT:Z:UP 704 | ## ERR188273.4711308 329 chrX 233717 0 5S70M = 233717 0 CGGGT====================================================================== @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 705 | ## ERR188273.14904746 99 chrX 251271 60 75M = 251317 121 =========================================================================== @@HHEGIIGAGIIIBGIIG@FECHDGHCHHHGHHFFFFFDEACC@ 724 | ## @ERR188273.14904746 725 | ## 
GAAAAATGGGCCCAGGGGACCGGCGCTCAGCATACAGAGGACCCGCGCCGGCACCTGCCTCTGAGTTCCCTTAGT 726 | ## + 727 | ## @@HHEGIIGAGIIIBGIIG@FECH eg/ERR188273_chrX_rand.bam 739 | ``` 740 | 741 | ## Count number of reads 742 | 743 | Use `samtools idxstats` to print stats on a BAM file; this requires an 744 | index file which is created by running `samtools index`. The output of 745 | idxstats is a file with four tab-delimited columns: 746 | 747 | 1. Reference name 748 | 2. Sequence length of reference 749 | 3. Number of mapped reads 750 | 4. Number of unmapped reads 751 | 752 | 753 | 754 | ``` bash 755 | samtools idxstats eg/ERR188273_chrX.bam 756 | ``` 757 | 758 | ## chrX 156040895 1126961 45067 759 | ## * 0 0 4332 760 | 761 | We can use this with `awk` to calculate: 762 | 763 | The number of mapped reads by summing the third column. 764 | 765 | ``` bash 766 | samtools idxstats eg/ERR188273_chrX.bam | awk '{s+=$3} END {print s}' 767 | ``` 768 | 769 | ## 1126961 770 | 771 | The number of reads, which is the sum of mapped and unmapped reads. 772 | 773 | ``` bash 774 | samtools idxstats eg/ERR188273_chrX.bam | awk '{s+=$3+$4} END {print s}' 775 | ``` 776 | 777 | ## 1176360 778 | 779 | ## Obtaining genomic sequence 780 | 781 | Use `faidx` to fetch genomic sequence; coordinates are 1-based. 782 | 783 | We need to first index the reference FASTA file that was used to map the 784 | reads. 785 | 786 | ``` bash 787 | samtools faidx genome/chrX.fa 788 | ``` 789 | 790 | Now we can obtain the sequence. 791 | 792 | ``` bash 793 | samtools faidx genome/chrX.fa chrX:300000-300100 794 | ``` 795 | 796 | ## >chrX:300000-300100 797 | ## ctgagatcgtgccactgcactccagcctgggcgacagagcgagactccatctcaaaaaaa 798 | ## aaaaaaaaaaaaaagaTggggtctctctatgttggccaggt 799 | 800 | ## Comparing BAM files 801 | 802 | The output from `mpileup` can be used to compare BAM files. The commands 803 | below generates alignments using `bwa` and `minimap2`. 
804 | 805 | ``` bash 806 | len=100 807 | n=10000 808 | m=300 809 | script/generate_random_seq.pl 30 1000000 1984 > test_ref.fa 810 | script/random_paired_end.pl -f test_ref.fa -l ${len} -n ${n} -m ${m} 811 | bwa index test_ref.fa 2> /dev/null 812 | 813 | bwa mem test_ref.fa l${len}_n${n}_d${m}_1984_1.fq.gz l${len}_n${n}_d${m}_1984_2.fq.gz 2> /dev/null | samtools sort - -o aln_bwa.bam 814 | minimap2 -ax sr test_ref.fa l${len}_n${n}_d${m}_1984_1.fq.gz l${len}_n${n}_d${m}_1984_2.fq.gz 2> /dev/null | samtools sort - -o aln_mm.bam 815 | ``` 816 | 817 | The BAM files can be used with `mpileup` to compare the depths. 818 | 819 | ``` bash 820 | samtools mpileup -s -f test_ref.fa aln_bwa.bam aln_mm.bam | head -20 821 | ``` 822 | 823 | ## [mpileup] 2 samples in 2 input files 824 | ## 1 8238 G 1 ^]. > ] 1 ^]. > ] 825 | ## 1 8239 G 1 . > ] 1 . > ] 826 | ## 1 8240 A 1 . J ] 1 . J ] 827 | ## 1 8241 C 1 . J ] 1 . J ] 828 | ## 1 8242 A 1 . J ] 1 . J ] 829 | ## 1 8243 C 1 . J ] 1 . J ] 830 | ## 1 8244 T 1 . J ] 1 . J ] 831 | ## 1 8245 G 1 . J ] 1 . J ] 832 | ## 1 8246 C 1 . J ] 1 . J ] 833 | ## 1 8247 G 1 . J ] 1 . J ] 834 | ## 1 8248 A 1 . J ] 1 . J ] 835 | ## 1 8249 C 1 . J ] 1 . J ] 836 | ## 1 8250 A 1 . J ] 1 . J ] 837 | ## 1 8251 G 1 . J ] 1 . J ] 838 | ## 1 8252 T 1 . J ] 1 . J ] 839 | ## 1 8253 G 1 . J ] 1 . J ] 840 | ## 1 8254 A 1 . J ] 1 . J ] 841 | ## 1 8255 G 1 . J ] 1 . J ] 842 | ## 1 8256 G 1 . J ] 1 . J ] 843 | ## 1 8257 G 1 . J ] 1 . J ] 844 | 845 | Another approach is to use 846 | [deepTools](https://deeptools.readthedocs.io/en/develop/) and the 847 | [bamCompare](https://deeptools.readthedocs.io/en/develop/content/tools/bamCompare.html) 848 | command. The bigWig output file shows the ratio of reads between `b1` 849 | and `b2` in 50 bp (default) windows. 850 | 851 | ## Converting reference names 852 | 853 | One of the most annoying bioinformatics problems is the use of different 854 | chromosome names, e.g. 
chr1 vs 1, in different references even when the 855 | sequences are identical. The GRCh38 reference downloaded from Ensembl 856 | has chromosome names without the `chr`: 857 | 858 | >1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF 859 | 860 | Whereas the reference names from UCSC have the `chr`: 861 | 862 | >chr1 AC:CM000663.2 gi:568336023 LN:248956422 rl:Chromosome M5:6aef897c3d6ff0c78aff06ac189178dd AS:GRCh38 863 | 864 | Luckily you can change the reference names using `samtools reheader` but 865 | just make sure your reference sequences are actually identical. 866 | 867 | ``` bash 868 | samtools view eg/ERR188273_chrX.bam | head -2 869 | ``` 870 | 871 | ## ERR188273.4711308 73 chrX 21649 0 5S70M = 21649 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 872 | ## ERR188273.4711308 133 chrX 21649 0 * = 21649 0 CTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTGTGTTGGCC CB@FDFFFHHGFHIJJJJIIIIIIIGGGIJGIIJJJJJJFFHIIIIGECHEHHGGHHFF?AACCDDDDDDDDBCD YT:Z:UP 873 | 874 | View header 875 | 876 | ``` bash 877 | samtools view -H eg/ERR188273_chrX.bam 878 | ``` 879 | 880 | ## @HD VN:1.0 SO:coordinate 881 | ## @SQ SN:chrX LN:156040895 882 | ## @PG ID:hisat2 PN:hisat2 VN:2.2.0 CL:"/Users/dtang/github/rnaseq/hisat2/../src/hisat2-2.2.0/hisat2-align-s --wrapper basic-0 --dta -p 4 -x ../raw/chrX_data/indexes/chrX_tran -1 /tmp/4195.inpipe1 -2 /tmp/4195.inpipe2" 883 | ## @PG ID:samtools PN:samtools PP:hisat2 VN:1.19.2 CL:samtools view -H eg/ERR188273_chrX.bam 884 | 885 | Substitute header with new name. 886 | 887 | ``` bash 888 | samtools view -H eg/ERR188273_chrX.bam | sed 's/SN:chrX/SN:X/' > eg/my_header 889 | ``` 890 | 891 | Save bam file with new ref and check it out.
892 | 893 | ``` bash 894 | samtools reheader eg/my_header eg/ERR188273_chrX.bam > eg/ERR188273_X.bam 895 | samtools view eg/ERR188273_X.bam | head -2 896 | ``` 897 | 898 | ## ERR188273.4711308 73 X 21649 0 5S70M = 21649 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 899 | ## ERR188273.4711308 133 X 21649 0 * = 21649 0 CTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTGTGTTGGCC CB@FDFFFHHGFHIJJJJIIIIIIIGGGIJGIIJJJJJJFFHIIIIGECHEHHGGHHFF?AACCDDDDDDDDBCD YT:Z:UP 900 | 901 | ## Coverage 902 | 903 | Coverage can mean the: 904 | 905 | 1. average depth of each covered base 906 | 2. percentage of bases covered 907 | 908 | `samtools depth` and `samtools mpileup` can be used to indicate the 909 | depth of each covered base (and can be used to calculate the average depth). 910 | `samtools coverage` will provide both the average depth and percentage 911 | of bases covered per chromosome/reference sequence. 912 | 913 | `samtools depth` will return three columns: reference, position, and 914 | coverage. 915 | 916 | ``` bash 917 | samtools depth -@ 4 eg/ERR188273_chrX.bam > ERR188273_depth.tsv 918 | head ERR188273_depth.tsv 919 | ``` 920 | 921 | ## chrX 21649 1 922 | ## chrX 21650 1 923 | ## chrX 21651 1 924 | ## chrX 21652 1 925 | ## chrX 21653 1 926 | ## chrX 21654 1 927 | ## chrX 21655 1 928 | ## chrX 21656 1 929 | ## chrX 21657 1 930 | ## chrX 21658 1 931 | 932 | The average depth can be calculated by summing the third column and 933 | dividing by the total number of bases (be sure to use `-a` with 934 | `samtools depth` as that will output all positions including zero 935 | depth).
936 | 937 | ``` bash 938 | samtools depth -@ 4 -a eg/ERR188273_chrX.bam | perl -ane '$t += $F[2]; END {$cov = $t / $.; printf "Bases covered:\t%.3f\nCoverage:\t%.3f\n", $., $cov}' 939 | ``` 940 | 941 | ## Bases covered: 156040895.000 942 | ## Coverage: 0.532 943 | 944 | The `samtools mpileup` command also provides depth information (but not 945 | for reads that have a mapping quality of 0, by default) with some 946 | additional information: 947 | 948 | 1. Sequence name 949 | 2. 1-based coordinate 950 | 3. Reference base (when used with `-f`) 951 | 4. Number of reads covering this position 952 | 5. Read bases 953 | 6. Base qualities 954 | 7. Alignment mapping qualities (when used with `-s`) 955 | 956 | 957 | 958 | ``` bash 959 | samtools mpileup -f genome/chrX.fa -s eg/ERR188273_chrX.bam > ERR188273_mpileup.tsv 960 | head ERR188273_mpileup.tsv 961 | ``` 962 | 963 | ## [mpileup] 1 samples in 1 input files 964 | ## chrX 251271 g 1 ^]. @ ] 965 | ## chrX 251272 a 1 . @ ] 966 | ## chrX 251273 a 1 . < ] 967 | ## chrX 251274 a 1 . D ] 968 | ## chrX 251275 a 1 . D ] 969 | ## chrX 251276 a 1 . D ] 970 | ## chrX 251277 t 1 . D ] 971 | ## chrX 251278 g 1 . D ] 972 | ## chrX 251279 g 1 . F ] 973 | ## chrX 251280 g 1 . B ] 974 | 975 | Note that the start of the `samtools mpileup` output differs from the 976 | start of the `samtools depth` output. This is because `mpileup` performs 977 | some filtering by default. In the case of this example, read pairs that 978 | are not both mapped will be ignored. To count these “orphan” reads, use 979 | the `--count-orphans` argument. 980 | 981 | ``` bash 982 | samtools mpileup -f genome/chrX.fa --count-orphans -s eg/ERR188273_chrX.bam > ERR188273_mpileup_orphans.tsv 983 | head ERR188273_mpileup_orphans.tsv 984 | ``` 985 | 986 | ## [mpileup] 1 samples in 1 input files 987 | ## chrX 21649 g 0 * * * 988 | ## chrX 21650 a 1 . D ! 989 | ## chrX 21651 t 1 . F ! 990 | ## chrX 21652 c 1 . F ! 991 | ## chrX 21653 a 1 . H ! 992 | ## chrX 21654 c 1 .
G ! 993 | ## chrX 21655 g 1 . H ! 994 | ## chrX 21656 a 1 . B ! 995 | ## chrX 21657 g 1 . H ! 996 | ## chrX 21658 g 1 . I ! 997 | 998 | In addition `mpileup` performs “per-Base Alignment Quality” (BAQ) by 999 | default and will adjust base quality scores. The default behaviour is to 1000 | skip bases with baseQ/BAQ smaller than 13. If you are finding 1001 | discrepancies between `mpileup`’s coverage calculation with another 1002 | coverage tool, you can either set `--min-BQ` to `0` or use `--no-BAQ` to 1003 | disable BAQ. 1004 | 1005 | I have an [old blog 1006 | post](https://davetang.org/muse/2015/08/26/samtools-mpileup/) on using 1007 | `mpileup`. 1008 | 1009 | `samtools coverage` will provide the following coverage statistics: 1010 | 1011 | 1. `rname` - Reference name / chromosome 1012 | 2. `startpos` - Start position 1013 | 3. `endpos` - End position (or sequence length) 1014 | 4. `numreads` - Number reads aligned to the region (after filtering) 1015 | 5. `covbases` - Number of covered bases with depth \>= 1 1016 | 6. `coverage` - Percentage of covered bases \[0..100\] 1017 | 7. `meandepth` - Mean depth of coverage 1018 | 8. `meanbaseq` - Mean base quality in covered region 1019 | 9. `meanmapq` - Mean mapping quality of selected reads 1020 | 1021 | 1022 | 1023 | ``` bash 1024 | samtools coverage eg/ERR188273_chrX.bam 1025 | ``` 1026 | 1027 | ## #rname startpos endpos numreads covbases coverage meandepth meanbaseq meanmapq 1028 | ## chrX 1 156040895 1110685 3402037 2.18022 0.532299 36.3 59.4 1029 | 1030 | The example BAM file only contains reads for `chrX` hence the statistics 1031 | are only returned for `chrX`. 1032 | 1033 | Returning to our coverage definition at the start of this section: 1034 | 1035 | 1. average depth of each covered base = `meandepth` 1036 | 2.
percentage of bases covered = `coverage` 1037 | 1038 | The [mosdepth](https://github.com/brentp/mosdepth) tool can also 1039 | calculate depth (and much faster than `samtools depth`) per base or 1040 | within a given window. The output is given in a BED file, where the 1041 | fourth column indicates the coverage. 1042 | 1043 | ``` bash 1044 | mosdepth ERR188273 eg/ERR188273_chrX.bam 1045 | gunzip -c ERR188273.per-base.bed.gz | head 1046 | ``` 1047 | 1048 | ## chrX 0 21648 0 1049 | ## chrX 21648 21718 1 1050 | ## chrX 21718 251270 0 1051 | ## chrX 251270 251391 1 1052 | ## chrX 251391 265950 0 1053 | ## chrX 265950 266021 1 1054 | ## chrX 266021 266096 2 1055 | ## chrX 266096 269848 0 1056 | ## chrX 269848 269923 1 1057 | ## chrX 269923 270095 0 1058 | 1059 | `mosdepth` coverage. 1060 | 1061 | ``` bash 1062 | cat ERR188273.mosdepth.summary.txt 1063 | ``` 1064 | 1065 | ## chrom length bases mean min max 1066 | ## chrX 156040895 76303957 0.49 0 40804 1067 | ## total 156040895 76303957 0.49 0 40804 1068 | 1069 | Coverage using a 500 bp window.
1070 | 1071 | ``` bash 1072 | mosdepth -n --fast-mode --by 500 ERR188273_500 eg/ERR188273_chrX.bam 1073 | gunzip -c ERR188273_500.regions.bed.gz | head 1074 | ``` 1075 | 1076 | ## chrX 0 500 0.00 1077 | ## chrX 500 1000 0.00 1078 | ## chrX 1000 1500 0.00 1079 | ## chrX 1500 2000 0.00 1080 | ## chrX 2000 2500 0.00 1081 | ## chrX 2500 3000 0.00 1082 | ## chrX 3000 3500 0.00 1083 | ## chrX 3500 4000 0.00 1084 | ## chrX 4000 4500 0.00 1085 | ## chrX 4500 5000 0.00 1086 | 1087 | ## Stargazers over time 1088 | 1089 | [![Stargazers over 1090 | time](https://starchart.cc/davetang/learning_bam_file.svg)](https://starchart.cc/davetang/learning_bam_file) 1091 | -------------------------------------------------------------------------------- /bam-readcount/ERR188273_chrX.metrics.txt: -------------------------------------------------------------------------------- 1 | chrX 21649 g 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:1:0.00:35.00:0.00:1:0:0.00:0.00:0.00:1:0.91:70.00:0.91 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 2 | chrX 21650 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:35.00:0.00:1:0:0.03:0.00:0.00:1:0.89:70.00:0.89 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 3 | chrX 21651 t 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:1:0.00:37.00:0.00:1:0:0.06:0.00:0.00:1:0.88:70.00:0.88 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 4 | chrX 21652 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 
A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:37.00:0.00:1:0:0.09:0.00:0.00:1:0.87:70.00:0.87 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 5 | chrX 21653 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:39.00:0.00:1:0:0.11:0.00:0.00:1:0.85:70.00:0.85 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 6 | chrX 21654 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:38.00:0.00:1:0:0.14:0.00:0.00:1:0.84:70.00:0.84 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 7 | chrX 21655 g 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:1:0.00:39.00:0.00:1:0:0.17:0.00:0.00:1:0.83:70.00:0.83 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 8 | chrX 21656 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:33.00:0.00:1:0:0.20:0.00:0.00:1:0.81:70.00:0.81 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 9 | chrX 21657 g 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:1:0.00:39.00:0.00:1:0:0.23:0.00:0.00:1:0.80:70.00:0.80 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 
N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 10 | chrX 21658 g 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:1:0.00:40.00:0.00:1:0:0.26:0.00:0.00:1:0.79:70.00:0.79 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 11 | chrX 21659 t 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:1:0.00:37.00:0.00:1:0:0.29:0.00:0.00:1:0.77:70.00:0.77 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 12 | chrX 21660 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:37.00:0.00:1:0:0.31:0.00:0.00:1:0.76:70.00:0.76 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 13 | chrX 21661 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:39.00:0.00:1:0:0.34:0.00:0.00:1:0.75:70.00:0.75 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 14 | chrX 21662 g 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:1:0.00:40.00:0.00:1:0:0.37:0.00:0.00:1:0.73:70.00:0.73 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 15 | chrX 21663 g 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 
G:1:0.00:38.00:0.00:1:0:0.40:0.00:0.00:1:0.72:70.00:0.72 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 16 | chrX 21664 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:38.00:0.00:1:0:0.43:0.00:0.00:1:0.71:70.00:0.71 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 17 | chrX 21665 g 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:1:0.00:40.00:0.00:1:0:0.46:0.00:0.00:1:0.69:70.00:0.69 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 18 | chrX 21666 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:37.00:0.00:1:0:0.49:0.00:0.00:1:0.68:70.00:0.68 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 19 | chrX 21667 t 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:1:0.00:38.00:0.00:1:0:0.51:0.00:0.00:1:0.67:70.00:0.67 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 20 | chrX 21668 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:36.00:0.00:1:0:0.54:0.00:0.00:1:0.65:70.00:0.65 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 21 | chrX 21669 a 1 
=:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:38.00:0.00:1:0:0.57:0.00:0.00:1:0.64:70.00:0.64 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 22 | chrX 21670 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:39.00:0.00:1:0:0.60:0.00:0.00:1:0.63:70.00:0.63 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 23 | chrX 21671 g 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:1:0.00:37.00:0.00:1:0:0.63:0.00:0.00:1:0.61:70.00:0.61 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 24 | chrX 21672 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:39.00:0.00:1:0:0.66:0.00:0.00:1:0.60:70.00:0.60 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 25 | chrX 21673 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:40.00:0.00:1:0:0.69:0.00:0.00:1:0.59:70.00:0.59 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 26 | chrX 21674 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:38.00:0.00:1:0:0.71:0.00:0.00:1:0.57:70.00:0.57 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 
T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 27 | chrX 21675 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:40.00:0.00:1:0:0.74:0.00:0.00:1:0.56:70.00:0.56 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 28 | chrX 21676 t 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:1:0.00:38.00:0.00:1:0:0.77:0.00:0.00:1:0.55:70.00:0.55 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 29 | chrX 21677 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:40.00:0.00:1:0:0.80:0.00:0.00:1:0.53:70.00:0.53 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 30 | chrX 21678 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:37.00:0.00:1:0:0.83:0.00:0.00:1:0.52:70.00:0.52 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 31 | chrX 21679 t 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:1:0.00:40.00:0.00:1:0:0.86:0.00:0.00:1:0.51:70.00:0.51 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 32 | chrX 21680 g 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 
C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:1:0.00:40.00:0.00:1:0:0.89:0.00:0.00:1:0.49:70.00:0.49 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 33 | chrX 21681 g 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:1:0.00:40.00:0.00:1:0:0.91:0.00:0.00:1:0.48:70.00:0.48 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 34 | chrX 21682 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:38.00:0.00:1:0:0.94:0.00:0.00:1:0.47:70.00:0.47 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 35 | chrX 21683 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:40.00:0.00:1:0:0.97:0.00:0.00:1:0.45:70.00:0.45 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 36 | chrX 21684 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:38.00:0.00:1:0:1.00:0.00:0.00:1:0.44:70.00:0.44 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 37 | chrX 21685 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:40.00:0.00:1:0:0.97:0.00:0.00:1:0.43:70.00:0.43 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 38 | chrX 21686 c 
1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:38.00:0.00:1:0:0.94:0.00:0.00:1:0.41:70.00:0.41 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 39 | chrX 21687 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:38.00:0.00:1:0:0.91:0.00:0.00:1:0.40:70.00:0.40 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 40 | chrX 21688 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:35.00:0.00:1:0:0.89:0.00:0.00:1:0.39:70.00:0.39 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 41 | chrX 21689 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:39.00:0.00:1:0:0.86:0.00:0.00:1:0.37:70.00:0.37 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 42 | chrX 21690 g 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:1:0.00:40.00:0.00:1:0:0.83:0.00:0.00:1:0.36:70.00:0.36 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 43 | chrX 21691 t 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 
T:1:0.00:40.00:0.00:1:0:0.80:0.00:0.00:1:0.35:70.00:0.35 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 44 | chrX 21692 g 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:1:0.00:38.00:0.00:1:0:0.77:0.00:0.00:1:0.33:70.00:0.33 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 45 | chrX 21693 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:40.00:0.00:1:0:0.74:0.00:0.00:1:0.32:70.00:0.32 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 46 | chrX 21694 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:40.00:0.00:1:0:0.71:0.00:0.00:1:0.31:70.00:0.31 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 47 | chrX 21695 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:34.00:0.00:1:0:0.69:0.00:0.00:1:0.29:70.00:0.29 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 48 | chrX 21696 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:31.00:0.00:1:0:0.66:0.00:0.00:1:0.28:70.00:0.28 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 49 | chrX 21697 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 
A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:29.00:0.00:1:0:0.63:0.00:0.00:1:0.27:70.00:0.27 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 50 | chrX 21698 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:35.00:0.00:1:0:0.60:0.00:0.00:1:0.25:70.00:0.25 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 51 | chrX 21699 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:38.00:0.00:1:0:0.57:0.00:0.00:1:0.24:70.00:0.24 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 52 | chrX 21700 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:39.00:0.00:1:0:0.54:0.00:0.00:1:0.23:70.00:0.23 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 53 | chrX 21701 t 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:1:0.00:34.00:0.00:1:0:0.51:0.00:0.00:1:0.21:70.00:0.21 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 54 | chrX 21702 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:39.00:0.00:1:0:0.49:0.00:0.00:1:0.20:70.00:0.20 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 
N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 55 | chrX 21703 t 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:1:0.00:39.00:0.00:1:0:0.46:0.00:0.00:1:0.19:70.00:0.19 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 56 | chrX 21704 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:39.00:0.00:1:0:0.43:0.00:0.00:1:0.17:70.00:0.17 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 57 | chrX 21705 t 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:1:0.00:38.00:0.00:1:0:0.40:0.00:0.00:1:0.16:70.00:0.16 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 58 | chrX 21706 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:39.00:0.00:1:0:0.37:0.00:0.00:1:0.15:70.00:0.15 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 59 | chrX 21707 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:39.00:0.00:1:0:0.34:0.00:0.00:1:0.13:70.00:0.13 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 60 | chrX 21708 t 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 
G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:1:0.00:37.00:0.00:1:0:0.31:0.00:0.00:1:0.12:70.00:0.12 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 61 | chrX 21709 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:37.00:0.00:1:0:0.29:0.00:0.00:1:0.11:70.00:0.11 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 62 | chrX 21710 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:37.00:0.00:1:0:0.26:0.00:0.00:1:0.09:70.00:0.09 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 63 | chrX 21711 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:37.00:0.00:1:0:0.23:0.00:0.00:1:0.08:70.00:0.08 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 64 | chrX 21712 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:37.00:0.00:1:0:0.20:0.00:0.00:1:0.07:70.00:0.07 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 65 | chrX 21713 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:35.00:0.00:1:0:0.17:0.00:0.00:1:0.05:70.00:0.05 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 66 | chrX 21714 t 1 
=:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:1:0.00:36.00:0.00:1:0:0.14:0.00:0.00:1:0.04:70.00:0.04 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 67 | chrX 21715 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:32.00:0.00:1:0:0.11:0.00:0.00:1:0.03:70.00:0.03 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 68 | chrX 21716 c 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:0.00:34.00:0.00:1:0:0.09:0.00:0.00:1:0.01:70.00:0.01 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 69 | chrX 21717 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:34.00:0.00:1:0:0.06:0.00:0.00:1:0.00:70.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 70 | chrX 21718 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:31.00:0.00:1:0:0.03:0.00:0.00:1:0.01:70.00:0.01 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 71 | -------------------------------------------------------------------------------- /bam-readcount/README.md: -------------------------------------------------------------------------------- 1 | ## README 2 | 3 | 
[bam-readcount](https://github.com/genome/bam-readcount) is a tool for generating metrics at single nucleotide positions from BAM files. We will use Docker to install `bam-readcount`; the Docker image is available at https://hub.docker.com/r/davetang/base. If you have never used Docker before, I wrote a [short guide](https://davetang.github.io/reproducible_bioinformatics/docker.html). 4 | 5 | ```bash 6 | cd ${HOME}/github/learning_bam_file 7 | 8 | # image available at https://hub.docker.com/r/davetang/base 9 | docker run --rm -it -v $(pwd):/data davetang/base /bin/bash 10 | 11 | # run the following commands inside the Docker container 12 | mkdir ~/github/ && cd ~/github/ 13 | git clone --recursive https://github.com/genome/bam-readcount.git 14 | 15 | # build and compile 16 | mkdir -p ~/tool/ && cd ~/tool/ 17 | cmake ~/github/bam-readcount 18 | make 19 | 20 | # check version 21 | bin/bam-readcount --version 22 | bam-readcount version: 0.8.0-unstable-7-625eea2 (commit 625eea2) 23 | 24 | # check usage 25 | bin/bam-readcount 26 | Usage: bam-readcount [OPTIONS] [region] 27 | Generate metrics for bam_file at single nucleotide positions. 28 | Example: bam-readcount -f ref.fa some.bam 29 | 30 | Available options: 31 | -h [ --help ] produce this message 32 | -v [ --version ] output the version number 33 | -q [ --min-mapping-quality ] arg (=0) minimum mapping quality of reads used 34 | for counting. 35 | -b [ --min-base-quality ] arg (=0) minimum base quality at a position to 36 | use the read for counting. 37 | -d [ --max-count ] arg (=10000000) max depth to avoid excessive memory 38 | usage. 39 | -l [ --site-list ] arg file containing a list of regions to 40 | report readcounts within. 41 | -f [ --reference-fasta ] arg reference sequence in the fasta format. 42 | -D [ --print-individual-mapq ] arg report the mapping qualities as a comma 43 | separated list. 44 | -p [ --per-library ] report results by library. 
45 | -w [ --max-warnings ] arg maximum number of warnings of each type 46 | to emit. -1 gives an unlimited number. 47 | -i [ --insertion-centric ] generate indel centric readcounts. 48 | Reads containing insertions will not be 49 | included in per-base counts 50 | ``` 51 | 52 | Prepare reference file and run `bam-readcount` on example files provided in the repo. 53 | 54 | ```bash 55 | cd /data/eg 56 | cp chrX.fa.bz2 ref.fa.bz2 57 | bunzip2 ref.fa.bz2 58 | 59 | ~/tool/bin/bam-readcount -f ref.fa -w 0 -l ../bam-readcount/region.bed ERR188273_chrX.bam > ../bam-readcount/ERR188273_chrX.metrics.txt 60 | 61 | # clean up 62 | rm ref.fa* 63 | ``` 64 | 65 | [Output format](https://github.com/genome/bam-readcount#normal-output): 66 | 67 | 1. chr 68 | 2. position 69 | 3. reference_base 70 | 4. depth 71 | 5. base:count:avg_mapping_quality:avg_basequality:avg_se_mapping_quality:num_plus_strand:num_minus_strand:avg_pos_as_fraction:avg_num_mismatches_as_fraction:avg_sum_mismatch_qualities:num_q2_containing_reads:avg_distance_to_q2_start_in_q2_reads:avg_clipped_length:avg_distance_to_effective_3p_end 72 | 73 | ```bash 74 | chrX 21649 g 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:1:0.00:35.00:0.00:1:0:0.00:0.00:0.00:1:0.91:70.00:0.91 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 75 | chrX 21650 a 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:1:0.00:35.00:0.00:1:0:0.03:0.00:0.00:1:0.89:70.00:0.89 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 76 | chrX 21651 t 1 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 
C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 G:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 T:1:0.00:37.00:0.00:1:0:0.06:0.00:0.00:1:0.88:70.00:0.88 N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 77 | ``` 78 | 79 | -------------------------------------------------------------------------------- /bam-readcount/region.bed: -------------------------------------------------------------------------------- 1 | chrX 20000 50000 2 | -------------------------------------------------------------------------------- /conf/mkdocs_env.yml: -------------------------------------------------------------------------------- 1 | name: mkdocs 2 | channels: 3 | - conda-forge 4 | - anaconda 5 | - defaults 6 | dependencies: 7 | - mkdocs 8 | -------------------------------------------------------------------------------- /create_readme.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | out_md=tmp.md 6 | Rscript -e "rmarkdown::render('learning_bam_file.Rmd', output_file=\"$out_md\")" 7 | github-markdown-toc/gh-md-toc $out_md > toc 8 | 9 | cp -f $out_md mkdocs/docs/index.md 10 | cat toc <(echo) <(date) <(echo) $out_md > README.md 11 | 12 | rm $out_md toc 13 | 14 | >&2 echo Done! 
15 | 16 | exit 0 17 | 18 | -------------------------------------------------------------------------------- /eg/ERR188273_chrX.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davetang/learning_bam_file/d4bff075c248eea1109c619cea6f28eac68613a2/eg/ERR188273_chrX.bam -------------------------------------------------------------------------------- /eg/ERR188273_chrX.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davetang/learning_bam_file/d4bff075c248eea1109c619cea6f28eac68613a2/eg/ERR188273_chrX.bam.bai -------------------------------------------------------------------------------- /eg/README.md: -------------------------------------------------------------------------------- 1 | ## README 2 | 3 | `ERR188273_chrX.bam` generated as per https://github.com/davetang/rnaseq using the HISAT2 + StringTie2 RNA-seq pipeline. 4 | 5 | -------------------------------------------------------------------------------- /eg/chrX.fa.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davetang/learning_bam_file/d4bff075c248eea1109c619cea6f28eac68613a2/eg/chrX.fa.bz2 -------------------------------------------------------------------------------- /eg/clean.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | dir=$(dirname $0) 6 | 7 | rm -f \ 8 | ${dir}/ERR188273_X.bam \ 9 | ${dir}/ERR188273_chrX.cram \ 10 | ${dir}/ERR188273_chrX.mapped.bam \ 11 | ${dir}/ERR188273_chrX.sam \ 12 | ${dir}/ERR188273_chrX.stats \ 13 | ${dir}/ERR188273_chrX.unmapped.bam \ 14 | ${dir}/ERR188273_chrX_1.fq \ 15 | ${dir}/ERR188273_chrX_2.fq \ 16 | ${dir}/ERR188273_chrX_20000_30000.bam \ 17 | ${dir}/ERR188273_chrX_fillmd.bam \ 18 | ${dir}/ERR188273_chrX_rand.bam \ 19 | ${dir}/first.bam \ 20 | ${dir}/my.bam \ 21 | 
${dir}/my_header \ 22 | ${dir}/sorted.bam 23 | 24 | -------------------------------------------------------------------------------- /eg/my.bed: -------------------------------------------------------------------------------- 1 | chrX 20000 30000 2 | chrX 233000 260000 3 | -------------------------------------------------------------------------------- /etc/README.md: -------------------------------------------------------------------------------- 1 | ## README 2 | 3 | If you are using macOS and want to build the README.md using the Makefile, you can use Docker and this [Ubuntu image](https://hub.docker.com/r/davetang/build). 4 | 5 | ```bash 6 | docker pull davetang/build:1.0 7 | ``` 8 | 9 | However, when I mounted the learning_bam_file directory to the Docker container and ran the Makefile from there, I'd get an [eterm](https://github.com/conda/conda/issues/6603) error. (Specifically, I got an error with ncurses-6.2-he6710b0_1/share/terminfo/E/Eterm-color.) It turns out that for macOS High Sierra (or later), the default file system is APFS and [it is case-insensitive by default](https://docker-docs.netlify.app/docker-for-mac/osxfs/) and Docker inherits this! Therefore, when you use Docker to run the Makefile, don't run it in a mounted volume. For example: 10 | 11 | ```bash 12 | docker run --rm -it -v $(pwd):/work davetang/build:1.0 /bin/bash 13 | 14 | # inside the Docker container 15 | cd /tmp 16 | git clone https://github.com/davetang/learning_bam_file.git 17 | cd learning_bam_file 18 | make 19 | ``` 20 | 21 | -------------------------------------------------------------------------------- /etc/rsubread.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Rsubread" 3 | date: "`r Sys.Date()`" 4 | output: html_document 5 | --- 6 | 7 | ```{r setup, include=FALSE} 8 | knitr::opts_chunk$set(echo = TRUE) 9 | ``` 10 | 11 | ## RSubread 12 | 13 | Install if necessary. 
14 | 15 | ```{r install} 16 | if(!"Rsubread" %in% installed.packages()){ 17 | if (!requireNamespace("BiocManager", quietly = TRUE)) 18 | install.packages("BiocManager") 19 | BiocManager::install("Rsubread", version = "3.8") 20 | } 21 | library(Rsubread) 22 | ``` 23 | 24 | Load example BED file and convert to annotation format: 25 | 26 | 1. GeneID 27 | 2. Chr 28 | 3. Start 29 | 4. End 30 | 5. Strand 31 | 32 | ```{r load_bed} 33 | my_bed <- read.table(file = "my.bed", 34 | header = FALSE, 35 | stringsAsFactors = FALSE) 36 | 37 | my_ann <- data.frame(GeneID = 1:2, 38 | Chr = my_bed$V1, 39 | Start = my_bed$V2, 40 | End = my_bed$V3, 41 | Strand = rep(".", 2), 42 | stringsAsFactors = FALSE) 43 | my_ann 44 | ``` 45 | 46 | Use `featureCounts` to count reads overlapping regions of interest. 47 | 48 | ```{r feature_count, message=FALSE, warning=FALSE} 49 | my_count <- featureCounts("aln.bam", annot.ext = my_ann) 50 | 51 | cbind(my_count$annotation, my_count$counts) 52 | ``` 53 | -------------------------------------------------------------------------------- /genome/README.md: -------------------------------------------------------------------------------- 1 | ## README 2 | 3 | It is often useful to know the start and end coordinates of assembled chromosomes and contigs. We can obtain this information from the MySQL server hosted by the UCSC Genome Browser team. 
4 | 5 | ```bash 6 | mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e "select chrom, size from hg19.chromInfo" > hg19_info.tsv 7 | mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e "select chrom, size from hg38.chromInfo" > hg38_info.tsv 8 | ``` 9 | 10 | If you set your MySQL config file (`~/.my.cnf`) as 11 | 12 | ``` 13 | [clientucsc] 14 | user=genome 15 | password= 16 | host=genome-mysql.cse.ucsc.edu 17 | ``` 18 | 19 | you can run the following instead: 20 | 21 | ```bash 22 | mysql --defaults-group-suffix=ucsc -A -e "select chrom, size from hg19.chromInfo" > hg19_info.tsv 23 | mysql --defaults-group-suffix=ucsc -A -e "select chrom, size from hg38.chromInfo" > hg38_info.tsv 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /genome/hg19_info.tsv: -------------------------------------------------------------------------------- 1 | chrom size 2 | chr1 249250621 3 | chr2 243199373 4 | chr3 198022430 5 | chr4 191154276 6 | chr5 180915260 7 | chr6 171115067 8 | chr7 159138663 9 | chrX 155270560 10 | chr8 146364022 11 | chr9 141213431 12 | chr10 135534747 13 | chr11 135006516 14 | chr12 133851895 15 | chr13 115169878 16 | chr14 107349540 17 | chr15 102531392 18 | chr16 90354753 19 | chr17 81195210 20 | chr18 78077248 21 | chr20 63025520 22 | chrY 59373566 23 | chr19 59128983 24 | chr22 51304566 25 | chr21 48129895 26 | chr1_jh636052_fix 7283150 27 | chrX_jh806600_fix 6530008 28 | chr6_ssto_hap7 4928567 29 | chr6_mcf_hap5 4833398 30 | chr6_cox_hap2 4795371 31 | chr6_mann_hap4 4683263 32 | chr6_apd_hap1 4622290 33 | chr6_qbl_hap6 4611984 34 | chr6_dbb_hap3 4610396 35 | chrX_jh806587_fix 4110759 36 | chr7_jh159134_fix 3821770 37 | chrX_jh159150_fix 3110903 38 | chrX_jh806590_fix 2418393 39 | chr10_jh591181_fix 2281126 40 | chr17_ctg5_hap1 1680828 41 | chr1_jh636053_fix 1676126 42 | chr5_gl339449_alt 1612928 43 | chr14_kb021645_fix 1523386 44 | chrX_jh720453_fix 1461188 45 | chrX_jh806601_fix 1389764 46 | 
chr7_gl582971_fix 1284284 47 | chrX_jh806599_fix 1214327 48 | chr19_gl949749_alt 1091840 49 | chr19_gl949750_alt 1066389 50 | chr19_gl949748_alt 1064303 51 | chr19_kb021647_fix 1058686 52 | chrX_jh806597_fix 1045622 53 | chr10_ke332501_fix 1020827 54 | chr19_gl949751_alt 1002682 55 | chr19_gl949746_alt 987716 56 | chr19_gl949752_alt 987100 57 | chrX_jh806598_fix 899320 58 | chrX_jh720451_fix 898979 59 | chrX_jh806591_fix 882083 60 | chr11_jh806581_fix 872115 61 | chrX_jh806588_fix 862483 62 | chrX_jh806592_fix 835911 63 | chr19_gl949753_alt 796478 64 | chr1_jh636054_fix 758378 65 | chrX_jh720454_fix 752267 66 | chr19_gl949747_alt 729519 67 | chr7_jh636058_fix 716227 68 | chrX_jh806602_fix 713266 69 | chr17_gl383561_fix 644425 70 | chr8_gl949743_fix 608579 71 | chr2_kb663603_fix 599580 72 | chr4_ctg9_hap1 590426 73 | chr19_gl582977_fix 580393 74 | chr19_ke332505_fix 579598 75 | chr1_gl000192_random 547496 76 | chr11_jh159140_fix 546435 77 | chr5_ke332497_fix 543325 78 | chr17_gl383560_fix 534288 79 | chrX_jh720452_fix 522319 80 | chr4_ke332496_fix 503215 81 | chr6_kb663604_fix 478993 82 | chrX_kb021648_fix 469972 83 | chr11_jh591184_fix 462282 84 | chr17_gl383558_fix 457041 85 | chr17_jh720447_fix 454385 86 | chrX_jh806595_fix 444074 87 | chr10_jh636060_fix 437946 88 | chr8_gl383535_fix 429806 89 | chrX_jh806596_fix 413927 90 | chr17_gl582976_fix 412535 91 | chr11_jh720443_fix 408430 92 | chr12_gl877876_alt 408271 93 | chr3_jh159131_fix 393769 94 | chr10_gl383543_fix 392792 95 | chrX_jh806594_fix 390496 96 | chr2_gl877871_fix 389939 97 | chrX_jh806593_fix 389631 98 | chr15_gl383555_alt 388773 99 | chr17_jh159144_fix 388340 100 | chr19_gl383573_alt 385657 101 | chr17_jh591186_fix 376223 102 | chr4_gl383528_alt 376187 103 | chr12_gl949745_alt 372609 104 | chr1_gl383520_alt 366579 105 | chr7_gl582968_fix 356330 106 | chr7_gl582970_fix 354970 107 | chr17_jh806582_fix 342635 108 | chr17_ke332502_fix 341712 109 | chr17_gl383559_fix 338640 110 | chr12_kb663607_fix 334922 
111 | chr9_gl339450_fix 330164 112 | chr7_gl582972_fix 327774 113 | chr11_jh159142_fix 326647 114 | chr11_gl582973_fix 321004 115 | chr10_gl383546_alt 309802 116 | chr21_ke332506_fix 307252 117 | chr10_kb663606_fix 305900 118 | chr4_gl877872_fix 297485 119 | chr15_gl383554_alt 296527 120 | chr9_jh636059_fix 295379 121 | chr18_gl383567_alt 289831 122 | chrX_gl877877_fix 284527 123 | chr20_kb663608_fix 283551 124 | chr17_jh159146_alt 278131 125 | chr11_gl949744_fix 276448 126 | chr7_ke332499_fix 274521 127 | chr6_jh806576_fix 273386 128 | chr12_jh720444_fix 273128 129 | chrX_jh806589_fix 270630 130 | chr17_gl383563_alt 270261 131 | chr5_jh159133_fix 266316 132 | chr3_ke332495_fix 263861 133 | chr6_jh636056_fix 262912 134 | chr7_gl582969_fix 251823 135 | chr4_gl582967_fix 248177 136 | chr19_jh159149_fix 245473 137 | chr11_jh159141_fix 240775 138 | chr8_ke332500_fix 228602 139 | chr5_gl949742_alt 226852 140 | chr17_gl383565_alt 223995 141 | chr22_jh720449_fix 212298 142 | chr17_kb021646_fix 211416 143 | chr9_jh806579_fix 211307 144 | chrUn_gl000225 211173 145 | chr8_gl383536_fix 203777 146 | chr21_gl383579_alt 201198 147 | chr11_jh159136_alt 200998 148 | chr6_jh636057_fix 200195 149 | chr18_gl383571_alt 198278 150 | chr10_jh591182_fix 196262 151 | chr17_jh159145_fix 194862 152 | chr16_gl383556_alt 192462 153 | chr4_gl000194_random 191469 154 | chr11_jh159137_alt 191409 155 | chr11_jh159143_fix 191402 156 | chr4_gl000193_random 189789 157 | chr19_gl383576_alt 188024 158 | chr6_kb021644_alt 187824 159 | chr9_gl000200_random 187035 160 | chrUn_gl000222 186861 161 | chrUn_gl000212 186858 162 | chr17_jh636061_fix 186059 163 | chr12_gl383551_alt 184319 164 | chrX_jh806603_fix 182949 165 | chr7_gl000195_random 182896 166 | chr1_gl383518_alt 182439 167 | chr3_gl383526_alt 180671 168 | chrUn_gl000223 180455 169 | chr20_gl582979_fix 179899 170 | chrUn_gl000224 179693 171 | chr10_gl383545_alt 179254 172 | chrUn_gl000219 179198 173 | chr10_jh591183_fix 177920 174 | 
chr17_gl000205_random 174588 175 | chr5_gl383531_alt 173459 176 | chr3_jh636055_alt 173151 177 | chrUn_gl000215 172545 178 | chrUn_gl000216 172294 179 | chrUn_gl000217 172149 180 | chr3_gl383523_fix 171362 181 | chr9_gl383541_alt 171286 182 | chr19_gl383575_alt 170222 183 | chr15_jh720445_fix 170033 184 | chr9_gl000199_random 169874 185 | chr9_jh806578_fix 169437 186 | chr12_gl383550_alt 169178 187 | chr10_gl877873_fix 168465 188 | chr18_gl383569_alt 167950 189 | chr11_jh591185_fix 167437 190 | chr12_gl877875_alt 167313 191 | chr22_jh806583_fix 167183 192 | chrUn_gl000211 166566 193 | chr12_gl383548_fix 165247 194 | chr18_gl383570_alt 164789 195 | chr4_gl383527_alt 164536 196 | chrUn_gl000213 164239 197 | chr12_gl582974_fix 163298 198 | chr9_gl383539_alt 162988 199 | chr22_gl383582_alt 162811 200 | chrUn_gl000220 161802 201 | chrUn_gl000218 161147 202 | chr18_gl383572_alt 159547 203 | chr19_gl000209_random 159169 204 | chr9_kb663605_fix 155926 205 | chr19_gl383574_alt 155864 206 | chrUn_gl000221 155397 207 | chr11_gl383547_alt 154407 208 | chr12_gl383553_alt 152874 209 | chr1_gl949741_fix 151551 210 | chr6_ke332498_fix 149443 211 | chr2_gl383521_alt 143390 212 | chr12_gl383552_alt 138655 213 | chrUn_gl000214 137718 214 | chr17_gl383564_alt 133151 215 | chrUn_gl000228 129120 216 | chr20_gl383577_alt 128385 217 | chr10_gl383544_fix 128378 218 | chrUn_gl000227 128374 219 | chr6_gl383533_alt 124736 220 | chr2_gl383522_alt 123821 221 | chr4_gl383529_alt 121345 222 | chr12_gl383549_alt 120804 223 | chr11_jh159139_fix 120441 224 | chr7_gl383534_alt 119183 225 | chr21_gl383581_alt 116690 226 | chr1_gl383519_alt 110268 227 | chr11_jh159138_fix 108875 228 | chr1_gl000191_random 106433 229 | chr18_gl383568_alt 104552 230 | chr8_jh159135_fix 102251 231 | chr5_gl383530_alt 101241 232 | chr3_jh159132_fix 100694 233 | chr16_jh720446_fix 97345 234 | chr22_gl383583_alt 96924 235 | chr2_gl582966_alt 96131 236 | chr10_jh806580_fix 93149 237 | chr19_gl000208_random 92689 238 | 
chr17_gl383566_alt 90219 239 | chr9_gl000198_random 90085 240 | chr16_gl383557_alt 89672 241 | chr17_jh159148_alt 88070 242 | chr5_gl383532_alt 82728 243 | chr17_gl000204_random 81310 244 | chr3_gl383524_fix 78793 245 | chr21_gl383580_alt 74652 246 | chr22_kb663609_alt 74013 247 | chr22_jh806585_fix 73505 248 | chr9_gl383540_alt 71551 249 | chr22_jh806584_fix 70876 250 | chr20_jh720448_fix 70483 251 | chr17_jh159147_alt 70345 252 | chr2_gl877870_fix 66021 253 | chr3_gl383525_fix 65063 254 | chrX_jh720455_fix 65034 255 | chr21_gl383578_alt 63917 256 | chr9_gl383537_fix 62435 257 | chr9_gl383542_alt 60032 258 | chr1_gl383517_fix 49352 259 | chr1_gl383516_fix 49316 260 | chr9_gl383538_fix 49281 261 | chr1_jh806575_fix 47409 262 | chrUn_gl000233 45941 263 | chrUn_gl000237 45867 264 | chr17_gl383562_fix 45551 265 | chrUn_gl000230 43691 266 | chr22_jh806586_fix 43543 267 | chrUn_gl000242 43523 268 | chrUn_gl000243 43341 269 | chrUn_gl000241 42152 270 | chrUn_gl000236 41934 271 | chrUn_gl000240 41933 272 | chr17_gl000206_random 41001 273 | chrUn_gl000232 40652 274 | chrUn_gl000234 40531 275 | chr11_gl000202_random 40103 276 | chrUn_gl000238 39939 277 | chrUn_gl000244 39929 278 | chrUn_gl000248 39786 279 | chr8_gl000196_random 38914 280 | chrUn_gl000249 38502 281 | chrUn_gl000246 38154 282 | chr17_gl000203_random 37498 283 | chr8_gl000197_random 37175 284 | chrUn_gl000245 36651 285 | chrUn_gl000247 36422 286 | chr9_gl000201_random 36148 287 | chr13_gl582975_fix 34662 288 | chrUn_gl000235 34474 289 | chrUn_gl000239 33824 290 | chr21_gl000210_random 27682 291 | chrUn_gl000231 27386 292 | chr1_jh806573_fix 24680 293 | chr1_jh806574_fix 22982 294 | chr9_jh806577_fix 22394 295 | chrUn_gl000229 19913 296 | chrM 16571 297 | chrMT 16569 298 | chrUn_gl000226 15008 299 | chr18_gl000207_random 4262 300 | -------------------------------------------------------------------------------- /genome/hg38_info.tsv: 
-------------------------------------------------------------------------------- 1 | chrom size 2 | chr1 248956422 3 | chr2 242193529 4 | chr3 198295559 5 | chr4 190214555 6 | chr5 181538259 7 | chr6 170805979 8 | chr7 159345973 9 | chrX 156040895 10 | chr8 145138636 11 | chr9 138394717 12 | chr11 135086622 13 | chr10 133797422 14 | chr12 133275309 15 | chr13 114364328 16 | chr14 107043718 17 | chr15 101991189 18 | chr16 90338345 19 | chr17 83257441 20 | chr18 80373285 21 | chr20 64444167 22 | chr19 58617616 23 | chrY 57227415 24 | chr22 50818468 25 | chr21 46709983 26 | chr8_KZ208915v1_fix 6367528 27 | chr15_KI270905v1_alt 5161414 28 | chr15_KN538374v1_fix 4998962 29 | chr6_GL000256v2_alt 4929269 30 | chr6_GL000254v2_alt 4827813 31 | chr6_GL000251v2_alt 4795265 32 | chr6_GL000253v2_alt 4677643 33 | chr6_GL000250v2_alt 4672374 34 | chr6_GL000255v2_alt 4606388 35 | chr6_GL000252v2_alt 4604811 36 | chr17_KI270857v1_alt 2877074 37 | chr16_KI270853v1_alt 2659700 38 | chr15_KQ031389v1_alt 2365364 39 | chr16_KV880768v1_fix 1927115 40 | chr16_KI270728v1_random 1872759 41 | chr17_GL000258v2_alt 1821992 42 | chr5_GL339449v2_alt 1612928 43 | chr14_KI270847v1_alt 1511111 44 | chr17_KI270908v1_alt 1423190 45 | chr14_KI270846v1_alt 1351393 46 | chr5_KI270897v1_alt 1144418 47 | chr7_KI270803v1_alt 1111570 48 | chr19_GL949749v2_alt 1091841 49 | chr19_KI270938v1_alt 1066800 50 | chr19_GL949750v2_alt 1066390 51 | chr19_GL949748v2_alt 1064304 52 | chr12_KZ208916v1_fix 1046838 53 | chr19_GL949751v2_alt 1002683 54 | chr19_GL949746v1_alt 987716 55 | chr19_GL949752v1_alt 987100 56 | chr8_KI270821v1_alt 985506 57 | chr1_KI270763v1_alt 911658 58 | chr6_KI270801v1_alt 870480 59 | chr19_GL949753v2_alt 796479 60 | chr19_GL949747v2_alt 729520 61 | chr14_KZ208920v1_fix 690932 62 | chr7_KZ208913v1_alt 680662 63 | chr5_KV575244v1_fix 673059 64 | chr8_KI270822v1_alt 624492 65 | chr7_KZ208912v1_fix 589656 66 | chr4_GL000257v2_alt 586476 67 | chr12_KI270904v1_alt 572349 68 | chr4_KI270925v1_alt 
555799 69 | chr1_KV880763v1_alt 551020 70 | chr12_KN538369v1_fix 541038 71 | chr2_KQ983256v1_alt 535088 72 | chr2_KQ031384v1_fix 481245 73 | chr16_KZ559113v1_fix 480415 74 | chr15_KI270852v1_alt 478999 75 | chr7_KV880765v1_fix 468267 76 | chr1_KQ031383v1_fix 467143 77 | chr1_KN538360v1_fix 460100 78 | chr3_KN196475v1_fix 451168 79 | chr15_KI270727v1_random 448248 80 | chr9_KI270823v1_alt 439082 81 | chr15_KI270850v1_alt 430880 82 | chr1_KI270759v1_alt 425601 83 | chr4_KV766193v1_alt 420675 84 | chr10_KN538367v1_fix 420164 85 | chr3_KN538364v1_fix 415308 86 | chr3_KV766192v1_fix 411654 87 | chr12_GL877876v1_alt 408271 88 | chr18_KQ090028v1_fix 407387 89 | chr19_KQ458386v1_fix 405389 90 | chrUn_KI270442v1 392061 91 | chr17_KI270862v1_alt 391357 92 | chr15_GL383555v2_alt 388773 93 | chr19_GL383573v1_alt 385657 94 | chr4_KI270896v1_alt 378547 95 | chr4_GL383528v1_alt 376187 96 | chr17_GL383563v3_alt 375691 97 | chr8_KI270810v1_alt 374415 98 | chr3_KQ031385v1_fix 373699 99 | chr19_KN196484v1_fix 370917 100 | chr1_GL383520v2_alt 366580 101 | chr2_KN538363v1_fix 365499 102 | chr5_KV575243v1_alt 362221 103 | chr13_KN538372v1_fix 356766 104 | chr1_KI270762v1_alt 354444 105 | chr1_KQ458383v1_alt 349938 106 | chr9_KN196479v1_fix 330164 107 | chr1_KZ208906v1_fix 330031 108 | chr15_KI270848v1_alt 327382 109 | chr17_KI270909v1_alt 325800 110 | chr14_KI270844v1_alt 322166 111 | chr6_KQ031387v1_fix 320750 112 | chr8_KI270900v1_alt 318687 113 | chr12_KQ759760v1_fix 315610 114 | chr10_GL383546v1_alt 309802 115 | chr13_KI270838v1_alt 306913 116 | chr3_KN196476v1_fix 305979 117 | chr8_KI270816v1_alt 305841 118 | chr1_KN538361v1_fix 305542 119 | chr11_KZ559108v1_fix 305244 120 | chr22_KI270879v1_alt 304135 121 | chr3_KZ559103v1_alt 302885 122 | chr11_KZ559110v1_alt 301637 123 | chr8_KI270813v1_alt 300230 124 | chr11_KI270831v1_alt 296895 125 | chr15_GL383554v1_alt 296527 126 | chr19_KV575249v1_alt 293522 127 | chr8_KI270811v1_alt 292436 128 | chr18_GL383567v1_alt 289831 129 | 
chrX_KI270880v1_alt 284869 130 | chr8_KI270812v1_alt 282736 131 | chr19_KI270921v1_alt 282224 132 | chr17_KV766196v1_fix 281919 133 | chr17_KI270729v1_random 280839 134 | chr11_KZ559109v1_fix 279644 135 | chr1_KQ983255v1_alt 278659 136 | chr17_JH159146v1_alt 278131 137 | chr10_KN196480v1_fix 277797 138 | chr17_KV766198v1_alt 276292 139 | chrX_KI270913v1_alt 274009 140 | chr6_KI270798v1_alt 271782 141 | chr7_KI270808v1_alt 271455 142 | chr6_KN196478v1_fix 268330 143 | chr16_KQ090027v1_alt 267463 144 | chr8_KV880767v1_fix 265876 145 | chr10_KQ090021v1_fix 264545 146 | chr22_KI270876v1_alt 263666 147 | chr15_KI270851v1_alt 263054 148 | chr22_KI270875v1_alt 259914 149 | chr1_KI270766v1_alt 256271 150 | chr19_KI270882v1_alt 248807 151 | chr3_KI270778v1_alt 248252 152 | chr17_KV766197v1_alt 246895 153 | chr6_KQ090016v1_fix 245716 154 | chr15_KI270849v1_alt 244917 155 | chr4_KI270786v1_alt 244096 156 | chr6_KZ208911v1_fix 242796 157 | chr19_KV575250v1_alt 241058 158 | chr12_KI270835v1_alt 238139 159 | chr4_KQ090015v1_alt 236512 160 | chr17_KI270858v1_alt 235827 161 | chr19_KI270867v1_alt 233762 162 | chr16_KI270855v1_alt 232857 163 | chr18_KZ559115v1_fix 230843 164 | chr4_KQ983257v1_fix 230434 165 | chr8_KI270926v1_alt 229282 166 | chr5_GL949742v1_alt 226852 167 | chr3_KI270780v1_alt 224108 168 | chr17_GL383565v1_alt 223995 169 | chr2_KI270774v1_alt 223625 170 | chr19_KV575256v1_alt 223118 171 | chr4_KI270790v1_alt 220246 172 | chr11_KI270927v1_alt 218612 173 | chr19_KI270932v1_alt 215732 174 | chr11_KI270903v1_alt 214625 175 | chr2_KI270894v1_alt 214158 176 | chr1_KQ458384v1_alt 212205 177 | chr12_KN196482v1_fix 211377 178 | chr14_GL000225v1_random 211173 179 | chrUn_KI270743v1 210658 180 | chr11_KI270832v1_alt 210133 181 | chr7_KI270805v1_alt 209988 182 | chrY_KZ208924v1_fix 209722 183 | chr4_GL000008v2_random 209709 184 | chr7_KI270809v1_alt 209586 185 | chr19_KI270887v1_alt 209512 186 | chr2_KN538362v1_fix 208149 187 | chr13_KN538371v1_fix 206320 188 | 
chr4_KI270789v1_alt 205944 189 | chr4_KQ983258v1_alt 205407 190 | chr3_KI270779v1_alt 205312 191 | chr19_KI270914v1_alt 205194 192 | chr18_KQ458385v1_alt 205101 193 | chr19_KI270886v1_alt 204239 194 | chr11_KI270829v1_alt 204059 195 | chr11_KN538368v1_alt 203552 196 | chr14_GL000009v2_random 201709 197 | chr21_GL383579v2_alt 201197 198 | chr11_JH159136v1_alt 200998 199 | chr19_KI270930v1_alt 200773 200 | chrUn_KI270747v1 198735 201 | chr18_GL383571v1_alt 198278 202 | chr19_KI270920v1_alt 198005 203 | chr3_KZ559102v1_alt 197752 204 | chr6_KI270797v1_alt 197536 205 | chr3_KI270935v1_alt 197351 206 | chr11_KQ759759v1_fix 196940 207 | chr17_KI270861v1_alt 196688 208 | chr15_KI270906v1_alt 196384 209 | chr5_KI270791v1_alt 195710 210 | chr3_KZ559105v1_alt 195063 211 | chr14_KI270722v1_random 194050 212 | chr16_GL383556v1_alt 192462 213 | chr13_KI270840v1_alt 191684 214 | chr14_GL000194v1_random 191469 215 | chr11_JH159137v1_alt 191409 216 | chr19_KI270917v1_alt 190932 217 | chr7_KI270899v1_alt 190869 218 | chr19_KI270923v1_alt 189352 219 | chr10_KI270825v1_alt 188315 220 | chr19_GL383576v1_alt 188024 221 | chrX_KV766199v1_alt 188004 222 | chr19_KI270922v1_alt 187935 223 | chrUn_KI270742v1 186739 224 | chr1_KN196472v1_fix 186494 225 | chr22_KI270878v1_alt 186262 226 | chr19_KI270929v1_alt 186203 227 | chr11_KI270826v1_alt 186169 228 | chr6_KB021644v2_alt 185823 229 | chr17_GL000205v2_random 185591 230 | chr10_KQ090020v1_alt 185507 231 | chr1_KI270765v1_alt 185285 232 | chr19_KI270916v1_alt 184516 233 | chr19_KI270890v1_alt 184499 234 | chr3_KI270784v1_alt 184404 235 | chr12_GL383551v1_alt 184319 236 | chr20_KI270870v1_alt 183433 237 | chrUn_GL000195v1 182896 238 | chr1_GL383518v1_alt 182439 239 | chr11_KQ090022v1_fix 181958 240 | chr22_KI270736v1_random 181920 241 | chr2_KZ208907v1_alt 181658 242 | chr10_KI270824v1_alt 181496 243 | chr11_KZ559111v1_alt 181167 244 | chr14_KI270845v1_alt 180703 245 | chr3_GL383526v1_alt 180671 246 | chr13_KI270839v1_alt 180306 247 | 
chr7_KQ031388v1_fix 179932 248 | chr22_KI270733v1_random 179772 249 | chrUn_GL000224v1 179693 250 | chr10_GL383545v1_alt 179254 251 | chrUn_GL000219v1 179198 252 | chr5_KI270792v1_alt 179043 253 | chr17_KI270860v1_alt 178921 254 | chr19_KV575252v1_alt 178197 255 | chr19_GL000209v2_alt 177381 256 | chr11_KI270830v1_alt 177092 257 | chr9_KI270719v1_random 176845 258 | chrUn_GL000216v2 176608 259 | chr22_KI270928v1_alt 176103 260 | chr1_KI270712v1_random 176043 261 | chr3_KZ208909v1_alt 175849 262 | chr6_KI270800v1_alt 175808 263 | chr1_KI270706v1_random 175055 264 | chr12_KZ208918v1_alt 174808 265 | chr22_KQ458388v1_alt 174749 266 | chr2_KI270776v1_alt 174166 267 | chr18_KI270912v1_alt 174061 268 | chr3_KI270777v1_alt 173649 269 | chr5_GL383531v1_alt 173459 270 | chr3_JH636055v2_alt 173151 271 | chr14_KI270725v1_random 172810 272 | chr5_KI270796v1_alt 172708 273 | chr7_KZ559106v1_alt 172555 274 | chr14_KZ208919v1_alt 171798 275 | chr9_GL383541v1_alt 171286 276 | chr19_KV575259v1_alt 171263 277 | chr19_KI270885v1_alt 171027 278 | chr19_KI270919v1_alt 170701 279 | chr19_KI270889v1_alt 170698 280 | chr19_KI270891v1_alt 170680 281 | chr19_KI270915v1_alt 170665 282 | chr19_KI270933v1_alt 170537 283 | chr19_KI270883v1_alt 170399 284 | chr19_GL383575v2_alt 170222 285 | chr19_KV575247v1_alt 170206 286 | chr19_KI270931v1_alt 170148 287 | chr12_GL383550v2_alt 169178 288 | chr16_KQ031390v1_alt 169136 289 | chr13_KI270841v1_alt 169134 290 | chrUn_KI270744v1 168472 291 | chr13_KQ090024v1_alt 168146 292 | chr19_KV575248v1_alt 168131 293 | chr18_KI270863v1_alt 167999 294 | chr18_GL383569v1_alt 167950 295 | chr12_GL877875v1_alt 167313 296 | chr21_KI270874v1_alt 166743 297 | chr19_KV575253v1_alt 166713 298 | chr3_KI270924v1_alt 166540 299 | chr1_KN196473v1_fix 166200 300 | chr1_KZ208904v1_alt 166136 301 | chr1_KI270761v1_alt 165834 302 | chr3_KQ031386v1_fix 165718 303 | chr3_KI270937v1_alt 165607 304 | chr8_KZ208914v1_fix 165120 305 | chr22_KI270734v1_random 165050 306 | 
chr18_GL383570v1_alt 164789 307 | chr5_KI270794v1_alt 164558 308 | chr4_GL383527v1_alt 164536 309 | chrUn_GL000213v1 164239 310 | chr3_KI270936v1_alt 164170 311 | chr3_KZ559101v1_alt 164041 312 | chr19_KV575246v1_alt 163926 313 | chr9_KQ090018v1_alt 163882 314 | chr4_KQ090014v1_alt 163749 315 | chr3_KI270934v1_alt 163458 316 | chr18_KZ559116v1_alt 163186 317 | chr9_GL383539v1_alt 162988 318 | chr3_KI270895v1_alt 162896 319 | chr22_GL383582v2_alt 162811 320 | chr3_KI270782v1_alt 162429 321 | chr1_KI270892v1_alt 162212 322 | chrUn_GL000220v1 161802 323 | chr2_KI270767v1_alt 161578 324 | chr2_KI270715v1_random 161471 325 | chr2_KI270893v1_alt 161218 326 | chrUn_GL000218v1 161147 327 | chr19_KV575255v1_alt 161095 328 | chr18_GL383572v1_alt 159547 329 | chr19_KV575251v1_alt 159285 330 | chr8_KI270817v1_alt 158983 331 | chr4_KI270788v1_alt 158965 332 | chrUn_KI270749v1 158759 333 | chr7_KI270806v1_alt 158166 334 | chr7_KI270804v1_alt 157952 335 | chr18_KI270911v1_alt 157710 336 | chrUn_KI270741v1 157432 337 | chr17_KI270910v1_alt 157099 338 | chr19_KI270884v1_alt 157053 339 | chr8_KV880766v1_fix 156998 340 | chr19_KV575258v1_alt 156965 341 | chr22_KN196485v1_alt 156562 342 | chr22_KQ458387v1_alt 155930 343 | chr19_GL383574v1_alt 155864 344 | chr19_KI270888v1_alt 155532 345 | chr3_GL000221v1_random 155397 346 | chr17_KV575245v1_fix 154723 347 | chr11_GL383547v1_alt 154407 348 | chr12_KZ559112v1_alt 154139 349 | chr2_KI270716v1_random 153799 350 | chr22_KN196486v1_alt 153027 351 | chr12_GL383553v2_alt 152874 352 | chr6_KI270799v1_alt 152148 353 | chr22_KI270731v1_random 150754 354 | chrUn_KI270751v1 150742 355 | chrUn_KI270750v1 148850 356 | chr13_KN538373v1_fix 148762 357 | chr19_KV575260v1_alt 145691 358 | chr8_KI270818v1_alt 145606 359 | chr22_KQ759761v1_alt 145162 360 | chrX_KI270881v1_alt 144206 361 | chr21_KI270873v1_alt 143900 362 | chr2_GL383521v1_alt 143390 363 | chr7_KV880764v1_fix 142129 364 | chr8_KI270814v1_alt 141812 365 | chr1_KQ458382v1_alt 141019 366 | 
chr11_KV766195v1_fix 140877 367 | chr2_KZ208908v1_alt 140361 368 | chr1_KZ208905v1_alt 140355 369 | chr6_KV766194v1_fix 139427 370 | chr5_KN196477v1_alt 139087 371 | chr12_GL383552v1_alt 138655 372 | chrUn_KI270519v1 138126 373 | chr2_KI270775v1_alt 138019 374 | chr17_KI270907v1_alt 137721 375 | chrUn_GL000214v1 137718 376 | chr8_KI270901v1_alt 136959 377 | chr2_KI270770v1_alt 136240 378 | chr5_KZ208910v1_alt 135987 379 | chr16_KI270854v1_alt 134193 380 | chr9_KQ090019v1_alt 134099 381 | chr8_KI270819v1_alt 133535 382 | chr17_GL383564v2_alt 133151 383 | chr2_KI270772v1_alt 133041 384 | chr8_KI270815v1_alt 132244 385 | chr5_KI270795v1_alt 131892 386 | chr5_KI270898v1_alt 130957 387 | chr20_GL383577v2_alt 128386 388 | chr1_KI270708v1_random 127682 389 | chr7_KI270807v1_alt 126434 390 | chr5_KI270793v1_alt 126136 391 | chr6_GL383533v1_alt 124736 392 | chr2_GL383522v1_alt 123821 393 | chr13_KQ090025v1_alt 123480 394 | chr19_KI270918v1_alt 123111 395 | chr1_KN196474v1_fix 122022 396 | chr12_GL383549v1_alt 120804 397 | chr2_KI270769v1_alt 120616 398 | chr4_KI270785v1_alt 119912 399 | chr12_KI270834v1_alt 119498 400 | chr7_GL383534v2_alt 119183 401 | chr20_KI270869v1_alt 118774 402 | chr17_KZ559114v1_alt 116753 403 | chr21_GL383581v2_alt 116689 404 | chr3_KI270781v1_alt 113034 405 | chr17_KI270730v1_random 112551 406 | chrUn_KI270438v1 112505 407 | chr4_KI270787v1_alt 111943 408 | chr18_KI270864v1_alt 111737 409 | chr2_KI270771v1_alt 110395 410 | chr1_GL383519v1_alt 110268 411 | chr2_KI270768v1_alt 110099 412 | chr1_KI270760v1_alt 109528 413 | chr12_KQ090023v1_alt 109323 414 | chr3_KI270783v1_alt 109187 415 | chr11_KN196481v1_fix 108875 416 | chr17_KI270859v1_alt 108763 417 | chr11_KI270902v1_alt 106711 418 | chr3_KZ559104v1_fix 105527 419 | chr18_GL383568v1_alt 104552 420 | chr22_KI270737v1_random 103838 421 | chr13_KI270843v1_alt 103832 422 | chr8_KZ559107v1_alt 103072 423 | chr22_KI270877v1_alt 101331 424 | chr5_GL383530v1_alt 101241 425 | chrY_KN196487v1_fix 101150 
426 | chr22_KQ759762v1_fix 101037 427 | chr19_KV575257v1_alt 100553 428 | chr11_KI270721v1_random 100316 429 | chr19_KV575254v1_alt 99845 430 | chr22_KI270738v1_random 99375 431 | chr22_GL383583v2_alt 96924 432 | chr2_GL582966v2_alt 96131 433 | chrUn_KI270748v1 93321 434 | chr18_KZ208922v1_fix 93070 435 | chrUn_KI270435v1 92983 436 | chr5_GL000208v1_random 92689 437 | chrUn_KI270538v1 91309 438 | chr4_KQ090013v1_alt 90922 439 | chr17_GL383566v1_alt 90219 440 | chr16_GL383557v1_alt 89672 441 | chr17_JH159148v1_alt 88070 442 | chr12_KN538370v1_fix 86533 443 | chr10_KN538366v1_fix 85284 444 | chr5_GL383532v1_alt 82728 445 | chr21_KI270872v1_alt 82692 446 | chr6_KQ090017v1_alt 82315 447 | chrUn_KI270756v1 79590 448 | chr16_KZ208921v1_alt 78609 449 | chr6_KI270758v1_alt 76752 450 | chr12_KI270833v1_alt 76061 451 | chr6_KI270802v1_alt 75005 452 | chr21_GL383580v2_alt 74653 453 | chr22_KB663609v1_alt 74013 454 | chr22_KI270739v1_random 73985 455 | chr9_GL383540v1_alt 71551 456 | chrUn_KI270757v1 71251 457 | chr2_KI270773v1_alt 70887 458 | chr17_JH159147v1_alt 70345 459 | chr11_KI270827v1_alt 67707 460 | chr1_KI270709v1_random 66860 461 | chrUn_KI270746v1 66486 462 | chr12_KZ208917v1_fix 64689 463 | chr16_KI270856v1_alt 63982 464 | chr21_GL383578v2_alt 63917 465 | chrUn_KI270753v1 62944 466 | chr19_KI270868v1_alt 61734 467 | chr9_GL383542v1_alt 60032 468 | chr16_KQ090026v1_alt 59016 469 | chr20_KI270871v1_alt 58661 470 | chr12_KI270836v1_alt 56134 471 | chr19_KI270865v1_alt 52969 472 | chr1_KI270764v1_alt 50258 473 | chrY_KZ208923v1_fix 48370 474 | chr1_KZ559100v1_fix 44955 475 | chrUn_KI270589v1 44474 476 | chr14_KI270726v1_random 43739 477 | chr19_KI270866v1_alt 43156 478 | chr22_KI270735v1_random 42811 479 | chr1_KI270711v1_random 42210 480 | chrUn_KI270745v1 41891 481 | chr1_KI270714v1_random 41717 482 | chr22_KI270732v1_random 41543 483 | chr1_KI270713v1_random 40745 484 | chrUn_KI270754v1 40191 485 | chr1_KI270710v1_random 40176 486 | chr12_KI270837v1_alt 40090 487 | 
chr9_KI270717v1_random 40062 488 | chr14_KI270724v1_random 39555 489 | chr9_KI270720v1_random 39050 490 | chr14_KI270723v1_random 38115 491 | chr9_KI270718v1_random 38054 492 | chrUn_KI270317v1 37690 493 | chr13_KI270842v1_alt 37287 494 | chrY_KI270740v1_random 37240 495 | chrUn_KI270755v1 36723 496 | chr8_KI270820v1_alt 36640 497 | chr13_KN196483v1_fix 35455 498 | chr1_KI270707v1_random 32032 499 | chrUn_KI270579v1 31033 500 | chrUn_KI270752v1 27745 501 | chrUn_KI270512v1 22689 502 | chrUn_KI270322v1 21476 503 | chrM 16569 504 | chrUn_GL000226v1 15008 505 | chr10_KN538365v1_fix 14347 506 | chrUn_KI270311v1 12399 507 | chrUn_KI270366v1 8320 508 | chrUn_KI270511v1 8127 509 | chrUn_KI270448v1 7992 510 | chrUn_KI270521v1 7642 511 | chrUn_KI270581v1 7046 512 | chrUn_KI270582v1 6504 513 | chrUn_KI270515v1 6361 514 | chrUn_KI270588v1 6158 515 | chrUn_KI270591v1 5796 516 | chrUn_KI270522v1 5674 517 | chrUn_KI270507v1 5353 518 | chrUn_KI270590v1 4685 519 | chrUn_KI270584v1 4513 520 | chrUn_KI270320v1 4416 521 | chrUn_KI270382v1 4215 522 | chrUn_KI270468v1 4055 523 | chrUn_KI270467v1 3920 524 | chrUn_KI270362v1 3530 525 | chrUn_KI270517v1 3253 526 | chrUn_KI270593v1 3041 527 | chrUn_KI270528v1 2983 528 | chrUn_KI270587v1 2969 529 | chrUn_KI270364v1 2855 530 | chrUn_KI270371v1 2805 531 | chrUn_KI270333v1 2699 532 | chrUn_KI270374v1 2656 533 | chrUn_KI270411v1 2646 534 | chrUn_KI270414v1 2489 535 | chrUn_KI270510v1 2415 536 | chrUn_KI270390v1 2387 537 | chrUn_KI270375v1 2378 538 | chrUn_KI270420v1 2321 539 | chrUn_KI270509v1 2318 540 | chrUn_KI270315v1 2276 541 | chrUn_KI270302v1 2274 542 | chrUn_KI270518v1 2186 543 | chrUn_KI270530v1 2168 544 | chrUn_KI270304v1 2165 545 | chrUn_KI270418v1 2145 546 | chrUn_KI270424v1 2140 547 | chrUn_KI270417v1 2043 548 | chrUn_KI270508v1 1951 549 | chrUn_KI270303v1 1942 550 | chrUn_KI270381v1 1930 551 | chrUn_KI270529v1 1899 552 | chrUn_KI270425v1 1884 553 | chrUn_KI270396v1 1880 554 | chrUn_KI270363v1 1803 555 | chrUn_KI270386v1 1788 556 | 
chrUn_KI270465v1 1774 557 | chrUn_KI270383v1 1750 558 | chrUn_KI270384v1 1658 559 | chrUn_KI270330v1 1652 560 | chrUn_KI270372v1 1650 561 | chrUn_KI270548v1 1599 562 | chrUn_KI270580v1 1553 563 | chrUn_KI270387v1 1537 564 | chrUn_KI270391v1 1484 565 | chrUn_KI270305v1 1472 566 | chrUn_KI270373v1 1451 567 | chrUn_KI270422v1 1445 568 | chrUn_KI270316v1 1444 569 | chrUn_KI270338v1 1428 570 | chrUn_KI270340v1 1428 571 | chrUn_KI270583v1 1400 572 | chrUn_KI270334v1 1368 573 | chrUn_KI270429v1 1361 574 | chrUn_KI270393v1 1308 575 | chrUn_KI270516v1 1300 576 | chrUn_KI270389v1 1298 577 | chrUn_KI270466v1 1233 578 | chrUn_KI270388v1 1216 579 | chrUn_KI270544v1 1202 580 | chrUn_KI270310v1 1201 581 | chrUn_KI270412v1 1179 582 | chrUn_KI270395v1 1143 583 | chrUn_KI270376v1 1136 584 | chrUn_KI270337v1 1121 585 | chrUn_KI270335v1 1048 586 | chrUn_KI270378v1 1048 587 | chrUn_KI270379v1 1045 588 | chrUn_KI270329v1 1040 589 | chrUn_KI270419v1 1029 590 | chrUn_KI270336v1 1026 591 | chrUn_KI270312v1 998 592 | chrUn_KI270539v1 993 593 | chrUn_KI270385v1 990 594 | chrUn_KI270423v1 981 595 | chrUn_KI270392v1 971 596 | chrUn_KI270394v1 970 597 | -------------------------------------------------------------------------------- /img/bam_compare_igv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davetang/learning_bam_file/d4bff075c248eea1109c619cea6f28eac68613a2/img/bam_compare_igv.png -------------------------------------------------------------------------------- /img/sam_less.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davetang/learning_bam_file/d4bff075c248eea1109c619cea6f28eac68613a2/img/sam_less.png -------------------------------------------------------------------------------- /learning_bam_file.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Learning the BAM format" 3 | output: 
github_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | Sys.setenv(PATH=paste0(Sys.getenv("PATH"), ":", getwd())) 8 | knitr::opts_chunk$set(echo = TRUE) 9 | ``` 10 | 11 | ## Introduction 12 | 13 | ![Build README](https://github.com/davetang/learning_bam_file/actions/workflows/create_readme.yml/badge.svg) 14 | 15 | SAMtools provides various (sub)tools for manipulating alignments in the SAM/BAM format. The SAM (Sequence Alignment/Map) format (BAM is just the binary form of SAM) is currently the _de facto_ standard for storing large nucleotide sequence alignments. If you are working with high-throughput sequencing data, at some point you will probably have to deal with SAM/BAM files, so familiarise yourself with them! For the latest information on SAMtools, please refer to the [release notes](https://github.com/samtools/samtools/releases). 16 | 17 | The examples in this README use the `ERR188273_chrX.bam` BAM file (stored in the `eg` folder) generated as per https://github.com/davetang/rnaseq using the HISAT2 + StringTie2 RNA-seq pipeline. This README is generated using the `create_readme.sh` script; if you want to generate this file yourself, please use [this Docker image](https://hub.docker.com/repository/docker/davetang/r_build) and the `Makefile` in this directory. For example: 18 | 19 | ```bash 20 | # clone this repo 21 | git clone https://github.com/davetang/learning_bam_file.git 22 | cd learning_bam_file 23 | 24 | docker pull davetang/r_build:4.1.2 25 | docker run --rm -it -v $(pwd):/work davetang/r_build:4.1.2 /bin/bash 26 | 27 | # inside the Docker container 28 | make 29 | ``` 30 | 31 | ## Installing SAMtools 32 | 33 | For installing SAMtools, I recommend using `Conda` and the [Bioconda samtools package](https://anaconda.org/bioconda/samtools). I also recommend using [Miniconda](https://docs.conda.io/en/latest/miniconda.html) instead of Anaconda because Anaconda comes with a lot of tools/packages that you will probably not use. 
I wrote a [short introduction to Conda](https://davetang.github.io/reproducible_bioinformatics/conda.html) if you want to learn more. 34 | 35 | Once you have installed Miniconda, you can install SAMtools as follows: 36 | 37 | ```bash 38 | conda install -c bioconda samtools 39 | ``` 40 | 41 | Otherwise you can download the source and compile it yourself; change `dir` to the location where you want `samtools` to be installed. `samtools` will be installed in `${dir}/bin`, so make sure this is in your `$PATH`. 42 | 43 | ```bash 44 | #!/usr/bin/env bash 45 | 46 | set -euo pipefail 47 | 48 | ver=1.15 49 | tool=samtools 50 | url=https://github.com/samtools/${tool}/releases/download/${ver}/${tool}-${ver}.tar.bz2 51 | dir=${HOME}/local 52 | 53 | wget ${url} 54 | tar xjf ${tool}-${ver}.tar.bz2 55 | cd ${tool}-${ver} 56 | ./configure --prefix=${dir} 57 | make && make install 58 | cd .. 59 | 60 | rm -rf ${tool}-${ver} ${tool}-${ver}.tar.bz2 61 | 62 | >&2 echo Done 63 | exit 0 64 | ``` 65 | 66 | ## Basic usage 67 | 68 | If you run `samtools` on the terminal without any parameters or with `--help`, all the available utilities are listed: 69 | 70 | ```{bash engine.opts='-l'} 71 | samtools --help 72 | ``` 73 | 74 | ## Viewing 75 | 76 | Use [bioSyntax](https://github.com/bioSyntax/bioSyntax) to prettify your output. 77 | 78 | ```bash 79 | samtools view aln.bam | sam-less 80 | ``` 81 | 82 | ![bioSyntax](img/sam_less.png) 83 | 84 | ## Converting a SAM file to a BAM file 85 | 86 | A BAM file is just a SAM file but stored in binary format; you should always convert your SAM files into BAM format since they are smaller in size and are faster to manipulate. 87 | 88 | I don't have a SAM file in the example folder, so let's create one and check out the first ten lines. Note: remember to use `-h` to ensure the SAM file contains the sequence header information. Generally, I recommend storing only sorted BAM files as they use even less disk space and are faster to process.
89 | 90 | ```{bash engine.opts='-l'} 91 | samtools view -h eg/ERR188273_chrX.bam > eg/ERR188273_chrX.sam 92 | ``` 93 | 94 | Notice that the SAM file is much larger than the BAM file. 95 | 96 | Size of SAM file. 97 | 98 | ```{bash engine.opts='-l'} 99 | ls -lh eg/ERR188273_chrX.sam 100 | ``` 101 | 102 | Size of BAM file. 103 | 104 | ```{bash engine.opts='-l'} 105 | ls -lh eg/ERR188273_chrX.bam 106 | ``` 107 | 108 | We can use `head` to view a SAM file. 109 | 110 | ```{bash engine.opts='-l'} 111 | head eg/ERR188273_chrX.sam 112 | ``` 113 | 114 | The lines starting with an "@" symbol contain the header information. The @SQ tag is the reference sequence dictionary; SN refers to the reference sequence name and LN refers to the reference sequence length. If you don't see lines starting with the "@" symbol, the header information is probably missing. You can generate this information again by running the command below, where `ref.fa` is the reference FASTA file used to map the reads. 115 | 116 | ```bash 117 | samtools view -bT sequence/ref.fa aln.sam > aln.bam 118 | ``` 119 | 120 | If the header information is available, we can convert a SAM file into BAM by using `samtools view -b`. In newer versions of SAMtools, the input format is auto-detected, so we no longer need the `-S` parameter. 121 | 122 | ```{bash engine.opts='-l'} 123 | samtools view -b eg/ERR188273_chrX.sam > eg/my.bam 124 | ``` 125 | 126 | ## Converting a BAM file to a CRAM file 127 | 128 | The CRAM format is even more compact. Use `samtools view` with the `-T` and `-C` arguments to convert a BAM file into CRAM. 129 | 130 | ```{bash engine.opts='-l'} 131 | samtools view -T genome/chrX.fa -C -o eg/ERR188273_chrX.cram eg/ERR188273_chrX.bam 132 | 133 | ls -lh eg/ERR188273_chrX.[sbcr]*am 134 | ``` 135 | 136 | You can use `samtools view` to view a CRAM file just as you would for a BAM file.
137 | 138 | ```{bash engine.opts='-l'} 139 | samtools view eg/ERR188273_chrX.cram | head 140 | ``` 141 | 142 | I have an [old blog post](https://davetang.org/muse/2014/09/26/bam-to-cram/) on the CRAM format. 143 | 144 | ## Sorting a SAM/BAM file 145 | 146 | Many downstream tools require sorted BAM files and since they are slightly more compact than unsorted BAM files, you should always sort your BAM files. In SAMtools version 1.3 or newer, you can directly generate a sorted BAM file from a SAM file. 147 | 148 | ```{bash engine.opts='-l'} 149 | samtools sort eg/ERR188273_chrX.sam -o eg/sorted.bam 150 | ls -l eg/ERR188273_chrX.bam 151 | ls -l eg/sorted.bam 152 | ``` 153 | 154 | You should use additional threads (if they are available) to speed up sorting; to use four threads, use `-@ 4`. 155 | 156 | Time taken using one thread (default). 157 | 158 | ```{bash engine.opts='-l'} 159 | time samtools sort eg/ERR188273_chrX.sam -o eg/sorted.bam 160 | ``` 161 | 162 | Time taken using four threads. 163 | 164 | ```{bash engine.opts='-l'} 165 | time samtools sort -@ 4 eg/ERR188273_chrX.sam -o eg/sorted.bam 166 | ``` 167 | 168 | Many of the SAMtools subtools can use additional threads, so make use of them if you have the resources! 169 | 170 | ## Creating a BAM index file 171 | 172 | Various tools require BAM index files, such as IGV, which is a tool that can be used for visualising BAM files. 173 | 174 | ```{bash engine.opts='-l'} 175 | samtools index eg/ERR188273_chrX.bam 176 | ``` 177 | 178 | ## Adding read groups 179 | 180 | Some tools like GATK and Picard require [read groups](https://gatk.broadinstitute.org/hc/en-us/articles/360035890671-Read-groups) (RG). You can add or replace read groups using `samtools addreplacerg`.
181 | 182 | ```{bash engine.opts='-l'} 183 | samtools addreplacerg -r "@RG\tID:ERR188273\tSM:ERR188273\tPL:illumina" -o eg/ERR188273_chrX_rg.bam eg/ERR188273_chrX.bam 184 | samtools head eg/ERR188273_chrX_rg.bam 185 | ``` 186 | 187 | If you want to replace existing read groups, just use the same command. 188 | 189 | ```{bash engine.opts='-l'} 190 | samtools addreplacerg -r "@RG\tID:ERR188273_2\tSM:ERR188273_2\tPL:illumina_2" -o eg/ERR188273_chrX_rg2.bam eg/ERR188273_chrX_rg.bam 191 | samtools head eg/ERR188273_chrX_rg2.bam 192 | ``` 193 | 194 | Popular alignment tools such as BWA MEM and STAR can add read groups; use the `-R` and `--outSAMattrRGline` parameters for the respective tool. 195 | 196 | ``` 197 | bwa mem \ 198 | -M \ 199 | -t ${thread} \ 200 | -R "@RG\tID:${sample_name}\tSM:${sample}\tPL:${platform}" \ 201 | ${fasta} \ 202 | ${fastq1} \ 203 | ${fastq2} | 204 | samtools sort -@ ${thread} -O BAM |\ 205 | tee ${sample_name}.bam |\ 206 | samtools index - ${sample_name}.bam.bai 207 | 208 | STAR \ 209 | --runMode alignReads \ 210 | --genomeDir ${star_index} \ 211 | --readFilesIn ${fastq1} ${fastq2} \ 212 | --readFilesCommand "gunzip -c" \ 213 | --outFileNamePrefix ${prefix}. \ 214 | --outSAMtype BAM Unsorted \ 215 | --twopassMode Basic \ 216 | --outSAMattrRGline ID:${id} PL:Illumina PU:${pu} LB:${lb} PI:0 SM:${sm} \ 217 | --outSAMattributes NH HI AS nM NM ch \ 218 | --runThreadN ${num_threads} 219 | ``` 220 | 221 | ## Interpreting the BAM flags 222 | 223 | The second column in a SAM/BAM file is the flag column; use the `flags` subcommand to understand specific flags. They may seem confusing at first but the encoding allows details about a read to be stored by just using a few digits. The trick is to convert the numerical digit into binary, and then use the table to interpret the binary numbers, where 1 = true and 0 = false. I wrote a blog post on BAM flags at . 
224 | 225 | ```{bash engine.opts='-l'} 226 | samtools flags 227 | ``` 228 | 229 | Find out about a `73` flag. 230 | 231 | ```{bash engine.opts='-l'} 232 | samtools flags 73 233 | ``` 234 | 235 | ### Proper pair 236 | 237 | Reads that are properly paired are mapped within an expected distance of each other, with one read of the pair in the reverse complement orientation. The script `random_paired_end.pl` can generate reads that originate from different references and are thus discordant and not properly paired (as well as properly paired reads). In the example below, 10% of reads are not properly paired (set with `-d 0.1`). 238 | 239 | ```{bash engine.opts='-l'} 240 | script/generate_random_seq.pl 30 10000 1984 > test_ref.fa 241 | script/random_paired_end.pl -f test_ref.fa -l 100 -n 10000 -m 300 -d 0.1 242 | bwa index test_ref.fa 2> /dev/null 243 | bwa mem test_ref.fa l100_n10000_d300_1984_1.fq.gz l100_n10000_d300_1984_2.fq.gz > aln.sam 2> /dev/null 244 | ``` 245 | 246 | `samtools flagstat` will indicate that some reads (about 10%) mapped to different chromosomes. 247 | 248 | ```{bash engine.opts='-l'} 249 | samtools flagstat aln.sam 250 | ``` 251 | 252 | Flag of a proper pair. 253 | 254 | ```{bash engine.opts='-l'} 255 | samtools flag $(samtools view -f 2 aln.sam | head -1 | cut -f2) 256 | ``` 257 | 258 | Flag of a pair (that is not a proper pair). 259 | 260 | ```{bash engine.opts='-l'} 261 | samtools flag $(samtools view -F 2 aln.sam | head -1 | cut -f2) 262 | ``` 263 | 264 | 265 | ## Filtering unmapped reads 266 | 267 | Use `-F 4` to filter out unmapped reads. 268 | 269 | ```{bash engine.opts='-l'} 270 | samtools view -F 4 -b eg/ERR188273_chrX.bam > eg/ERR188273_chrX.mapped.bam 271 | ``` 272 | 273 | Use `-f 4` to keep only unmapped reads.
274 | 275 | ```{bash engine.opts='-l'} 276 | samtools view -f 4 -b eg/ERR188273_chrX.bam > eg/ERR188273_chrX.unmapped.bam 277 | ``` 278 | 279 | We can use the `flags` subcommand to confirm that a value of four represents an unmapped read. 280 | 281 | ```{bash engine.opts='-l'} 282 | samtools flags 4 283 | ``` 284 | 285 | ## Extracting entries mapping to a specific loci 286 | 287 | Use `samtools view` and the `ref:start-end` syntax to extract reads mapping within a specific genomic locus; this requires a BAM index file. 288 | 289 | ```{bash engine.opts='-l'} 290 | samtools view eg/ERR188273_chrX.bam chrX:20000-30000 291 | ``` 292 | 293 | Note that this takes into account the mapping of the entire read and not just the starting position. For example, if you specified chrX:20000-30000, a 75 bp long read that starts its mapping from position 19999 will also be returned. In addition, you can save the output as another BAM file if you want. 294 | 295 | ```{bash engine.opts='-l'} 296 | samtools view -b eg/ERR188273_chrX.bam chrX:20000-30000 > eg/ERR188273_chrX_20000_30000.bam 297 | ``` 298 | 299 | If you want reads mapped to a single reference (e.g. chromosome), just specify the `ref` and leave out the start and end values. 300 | 301 | ```{bash engine.opts='-l'} 302 | samtools view eg/ERR188273_chrX.bam chrX | head 303 | ``` 304 | 305 | You can also use a BED file, with several entries, to extract reads of interest. 306 | 307 | ```{bash engine.opts='-l'} 308 | cat eg/my.bed 309 | 310 | samtools view -L eg/my.bed eg/ERR188273_chrX.bam 311 | ``` 312 | 313 | ## Extracting only the first read from paired end BAM files 314 | 315 | Sometimes you only want the first read of a pair. 0x0040 is hexadecimal for 64 (i.e. 16 * 4), which is 1000000 in binary, corresponding to the first read in a pair.
316 | 317 | ```{bash engine.opts='-l'} 318 | samtools view -b -f 0x0040 eg/ERR188273_chrX.bam > eg/first.bam 319 | ``` 320 | 321 | Once again, you can use `flags` to verify this (it also accepts hexadecimal input). 322 | 323 | ```{bash engine.opts='-l'} 324 | samtools flags 0x0040 325 | ``` 326 | 327 | ## Stats 328 | 329 | For simple statistics use `samtools flagstat`. 330 | 331 | ```{bash engine.opts='-l'} 332 | samtools flagstat eg/ERR188273_chrX.bam 333 | ``` 334 | 335 | For more stats, use `samtools stats`. 336 | 337 | ```{bash engine.opts='-l'} 338 | samtools stats eg/ERR188273_chrX.bam | grep ^SN 339 | ``` 340 | 341 | ## samtools calmd/fillmd 342 | 343 | The `calmd` or `fillmd` tool is useful for visualising mismatches and insertions in an alignment of a read to a reference genome. The `-e` argument changes identical bases between the read and reference into `=`. 344 | 345 | ```{bash engine.opts='-l'} 346 | samtools view -b eg/ERR188273_chrX.bam | samtools fillmd -e - genome/chrX.fa > eg/ERR188273_chrX_fillmd.bam 347 | 348 | head eg/ERR188273_chrX_fillmd.bam 349 | ``` 350 | 351 | ## Creating FASTQ files from a BAM file 352 | 353 | Use the `fastq` tool to create FASTQ files from a BAM file. For paired-end reads, use `-1` and `-2` to create separate FASTQ files. 354 | 355 | ```{bash engine.opts='-l'} 356 | samtools fastq -1 eg/ERR188273_chrX_1.fq -2 eg/ERR188273_chrX_2.fq eg/ERR188273_chrX.bam 357 | head eg/ERR188273_chrX_1.fq 358 | ``` 359 | 360 | ## Random subsampling of BAM file 361 | 362 | The SAMtools view `-s` parameter allows you to randomly sub-sample a BAM file. Using `-s 0.5` will create a new BAM file with a random half of all mapped reads; unmapped reads are not sampled.
363 | 364 | ```{bash engine.opts='-l'} 365 | samtools view -s 0.5 -b eg/ERR188273_chrX.bam > eg/ERR188273_chrX_rand.bam 366 | ``` 367 | 368 | ## Count number of reads 369 | 370 | Use `samtools idxstats` to print stats on a BAM file; this requires an index file which is created by running `samtools index`. The output of idxstats is a table with four tab-delimited columns: 371 | 372 | 1. Reference name 373 | 2. Sequence length of reference 374 | 3. Number of mapped reads 375 | 4. Number of unmapped reads 376 | 377 | ```{bash engine.opts='-l'} 378 | samtools idxstats eg/ERR188273_chrX.bam 379 | ``` 380 | 381 | We can use this with `awk` to calculate: 382 | 383 | The number of mapped reads by summing the third column. 384 | 385 | ```{bash engine.opts='-l'} 386 | samtools idxstats eg/ERR188273_chrX.bam | awk '{s+=$3} END {print s}' 387 | ``` 388 | 389 | The number of reads, which is the sum of mapped and unmapped reads. 390 | 391 | ```{bash engine.opts='-l'} 392 | samtools idxstats eg/ERR188273_chrX.bam | awk '{s+=$3+$4} END {print s}' 393 | ``` 394 | 395 | ## Obtaining genomic sequence 396 | 397 | Use `faidx` to fetch genomic sequence; coordinates are 1-based. 398 | 399 | We need to first index the reference FASTA file that was used to map the reads. 400 | 401 | ```{bash engine.opts='-l'} 402 | samtools faidx genome/chrX.fa 403 | ``` 404 | 405 | Now we can obtain the sequence. 406 | 407 | ```{bash engine.opts='-l'} 408 | samtools faidx genome/chrX.fa chrX:300000-300100 409 | ``` 410 | 411 | ## Comparing BAM files 412 | 413 | The output from `mpileup` can be used to compare BAM files. The commands below generate alignments using `bwa` and `minimap2`.
414 | 415 | ```{bash engine.opts='-l'} 416 | len=100 417 | n=10000 418 | m=300 419 | script/generate_random_seq.pl 30 1000000 1984 > test_ref.fa 420 | script/random_paired_end.pl -f test_ref.fa -l ${len} -n ${n} -m ${m} 421 | bwa index test_ref.fa 2> /dev/null 422 | 423 | bwa mem test_ref.fa l${len}_n${n}_d${m}_1984_1.fq.gz l${len}_n${n}_d${m}_1984_2.fq.gz 2> /dev/null | samtools sort - -o aln_bwa.bam 424 | minimap2 -ax sr test_ref.fa l${len}_n${n}_d${m}_1984_1.fq.gz l${len}_n${n}_d${m}_1984_2.fq.gz 2> /dev/null | samtools sort - -o aln_mm.bam 425 | ``` 426 | 427 | The BAM files can be used with `mpileup` to compare the depths. 428 | 429 | ```{bash engine.opts='-l'} 430 | samtools mpileup -s -f test_ref.fa aln_bwa.bam aln_mm.bam | head -20 431 | ``` 432 | 433 | Another approach is to use [deepTools](https://deeptools.readthedocs.io/en/develop/) and the [bamCompare](https://deeptools.readthedocs.io/en/develop/content/tools/bamCompare.html) command. The bigWig output file shows the ratio of reads between the two input BAM files (`b1` and `b2`) in 50 bp (default) windows. 434 | 435 | ## Converting reference names 436 | 437 | One of the most annoying bioinformatics problems is the use of different chromosome names, e.g. chr1 vs 1, in different references even when the sequences are identical. The GRCh38 reference downloaded from Ensembl has chromosome names without the `chr`: 438 | 439 | >1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF 440 | 441 | Whereas the reference names from UCSC have the `chr`: 442 | 443 | >chr1 AC:CM000663.2 gi:568336023 LN:248956422 rl:Chromosome M5:6aef897c3d6ff0c78aff06ac189178dd AS:GRCh38 444 | 445 | Luckily you can change the reference names using `samtools reheader` but just make sure your reference sequences are actually identical.
446 | 447 | ```{bash engine.opts='-l'} 448 | samtools view eg/ERR188273_chrX.bam | head -2 449 | ``` 450 | 451 | View the header. 452 | 453 | ```{bash engine.opts='-l'} 454 | samtools view -H eg/ERR188273_chrX.bam 455 | ``` 456 | 457 | Substitute header with new name. 458 | 459 | ```{bash engine.opts='-l'} 460 | samtools view -H eg/ERR188273_chrX.bam | sed 's/SN:chrX/SN:X/' > eg/my_header 461 | ``` 462 | 463 | Save bam file with new ref and check it out. 464 | 465 | ```{bash engine.opts='-l'} 466 | samtools reheader eg/my_header eg/ERR188273_chrX.bam > eg/ERR188273_X.bam 467 | samtools view eg/ERR188273_X.bam | head -2 468 | ``` 469 | 470 | ## Coverage 471 | 472 | Coverage can mean the: 473 | 474 | 1. average depth of each covered base 475 | 2. percentage of bases covered 476 | 477 | `samtools depth` and `samtools mpileup` can be used to indicate the depth of 478 | each covered base (and used to calculate the average depth). `samtools coverage` 479 | will provide both the average depth and percentage of bases covered per 480 | chromosome/reference sequence. 481 | 482 | `samtools depth` will return three columns: reference, position, and coverage. 483 | 484 | ```{bash engine.opts='-l'} 485 | samtools depth -@ 4 eg/ERR188273_chrX.bam > ERR188273_depth.tsv 486 | head ERR188273_depth.tsv 487 | ``` 488 | 489 | The average depth can be calculated by summing the third column and dividing by the total number of bases (be sure to use `-a` with `samtools depth` as that will output all positions including zero depth). 490 | 491 | ```{bash engine.opts='-l'} 492 | samtools depth -@ 4 -a eg/ERR188273_chrX.bam | perl -ane '$t += $F[2]; END {$cov = $t / $.; printf "Bases covered:\t%.3f\nCoverage:\t%.3f\n", $., $cov}' 493 | ``` 494 | 495 | The `samtools mpileup` command also provides depth information (but not for reads that have a mapping quality of 0, by default) with some additional information: 496 | 497 | 1. Sequence name 498 | 2. 1-based coordinate 499 | 3.
Reference base (when used with `-f`) 500 | 4. Number of reads covering this position 501 | 5. Read bases 502 | 6. Base qualities 503 | 7. Alignment mapping qualities (when used with `-s`) 504 | 505 | ```{bash engine.opts='-l'} 506 | samtools mpileup -f genome/chrX.fa -s eg/ERR188273_chrX.bam > ERR188273_mpileup.tsv 507 | head ERR188273_mpileup.tsv 508 | ``` 509 | 510 | Note that the start of the `samtools mpileup` output differs from the start of the `samtools depth` output. This is because `mpileup` performs some filtering by default. In the case of this example, read pairs that are not both mapped will be ignored. To count these "orphan" reads, use the `--count-orphans` argument. 511 | 512 | ```{bash engine.opts='-l'} 513 | samtools mpileup -f genome/chrX.fa --count-orphans -s eg/ERR188273_chrX.bam > ERR188273_mpileup_orphans.tsv 514 | head ERR188273_mpileup_orphans.tsv 515 | ``` 516 | 517 | In addition `mpileup` performs "per-Base Alignment Quality" (BAQ) by default and will adjust base quality scores. The default behaviour is to skip bases with baseQ/BAQ smaller than 13. If you are finding discrepancies between `mpileup`'s coverage calculation with another coverage tool, you can either set `--min-BQ` to `0` or use `--no-BAQ` to disable BAQ. 518 | 519 | I have an [old blog post](https://davetang.org/muse/2015/08/26/samtools-mpileup/) on using `mpileup`. 520 | 521 | `samtools coverage` will provide the following coverage statistics: 522 | 523 | 1. `rname` - Reference name / chromosome 524 | 2. `startpos` - Start position 525 | 3. `endpos` - End position (or sequence length) 526 | 4. `numreads` - Number reads aligned to the region (after filtering) 527 | 5. `covbases` - Number of covered bases with depth >= 1 528 | 6. `coverage` - Proportion of covered bases [0..1] 529 | 7. `meandepth` - Mean depth of coverage 530 | 8. `meanbaseq` - Mean base quality in covered region 531 | 9. 
`meanmapq` - Mean mapping quality of selected reads 532 | 533 | ```{bash engine.opts='-l'} 534 | samtools coverage eg/ERR188273_chrX.bam 535 | ``` 536 | 537 | The example BAM file only contains reads for `chrX` hence the statistics are only returned for `chrX`. 538 | 539 | Returning to our coverage definition at the start of this section: 540 | 541 | 1. average depth of each covered base = `meandepth` 542 | 2. percentage of bases covered = `coverage` 543 | 544 | The [mosdepth](https://github.com/brentp/mosdepth) tool can also calculate depth (and much faster than `samtools depth`) per base or within a given window. The output is given in a BED file, where the fourth column indicates the coverage. 545 | 546 | ```{bash engine.opts='-l'} 547 | mosdepth ERR188273 eg/ERR188273_chrX.bam 548 | gunzip -c ERR188273.per-base.bed.gz | head 549 | ``` 550 | 551 | `mosdepth` coverage. 552 | 553 | ```{bash engine.opts='-l'} 554 | cat ERR188273.mosdepth.summary.txt 555 | ``` 556 | 557 | Coverage using a 500 bp window. 558 | 559 | ```{bash engine.opts='-l'} 560 | mosdepth -n --fast-mode --by 500 ERR188273_500 eg/ERR188273_chrX.bam 561 | gunzip -c ERR188273_500.regions.bed.gz | head 562 | ``` 563 | 564 | ## Stargazers over time 565 | 566 | [![Stargazers over time](https://starchart.cc/davetang/learning_bam_file.svg)](https://starchart.cc/davetang/learning_bam_file) 567 | -------------------------------------------------------------------------------- /mkdocs/docs/img: -------------------------------------------------------------------------------- 1 | ../../img -------------------------------------------------------------------------------- /mkdocs/docs/index.md: -------------------------------------------------------------------------------- 1 | # Learning the BAM format 2 | 3 | ## Introduction 4 | 5 | SAMtools provides various (sub)tools for manipulating alignments in the 6 | SAM/BAM format. 
The SAM (Sequence Alignment/Map) format (BAM is just the 7 | binary form of SAM) is currently the *de facto* standard for storing 8 | large nucleotide sequence alignments. If you are working with 9 | high-throughput sequencing data, at some point you will probably have to 10 | deal with SAM/BAM files, so familiarise yourself with them\! For the 11 | latest information on SAMtools, please refer to the [release 12 | notes](https://github.com/samtools/samtools/releases). 13 | 14 | The examples in this README use the `ERR188273_chrX.bam` BAM file 15 | (stored in the `eg` folder) generated as per 16 | using the HISAT2 + StringTie2 17 | RNA-seq pipeline. This README is generated using the `create_readme.sh` 18 | script; if you want to generate this file yourself, please use [this 19 | Docker image](https://hub.docker.com/repository/docker/davetang/r_build) 20 | and the `Makefile` in this directory. For example: 21 | 22 | ``` bash 23 | # clone this repo 24 | git clone https://github.com/davetang/learning_bam_file.git 25 | cd learning_bam_file 26 | 27 | docker pull davetang/r_build:4.1.0 28 | 29 | docker run --rm -it -v $(pwd):/work davetang/r_build:4.1.0 /bin/bash 30 | 31 | # inside the Docker container 32 | make 33 | ``` 34 | 35 | ## Installing SAMtools 36 | 37 | For installing SAMtools, I recommend using `Conda` and the [Bioconda 38 | samtools package](https://anaconda.org/bioconda/samtools). I also 39 | recommend using 40 | [Miniconda](https://docs.conda.io/en/latest/miniconda.html) instead of 41 | Anaconda because Anaconda comes with a lot of tools/packages that you 42 | will probably not use. I wrote a [short introduction to 43 | Conda](https://davetang.github.io/reproducible_bioinformatics/conda.html) 44 | if you want to learn more. 
45 | 46 | Once you have installed Miniconda, you can install SAMtools as follows: 47 | 48 | ``` bash 49 | conda install -c bioconda samtools 50 | ``` 51 | 52 | ## Basic usage 53 | 54 | If you run `samtools` on the terminal without any parameters or with 55 | `--help`, all the available utilities are listed: 56 | 57 | ``` bash 58 | samtools --help 59 | ``` 60 | 61 | ## 62 | ## Program: samtools (Tools for alignments in the SAM format) 63 | ## Version: 1.13 (using htslib 1.13) 64 | ## 65 | ## Usage: samtools [options] 66 | ## 67 | ## Commands: 68 | ## -- Indexing 69 | ## dict create a sequence dictionary file 70 | ## faidx index/extract FASTA 71 | ## fqidx index/extract FASTQ 72 | ## index index alignment 73 | ## 74 | ## -- Editing 75 | ## calmd recalculate MD/NM tags and '=' bases 76 | ## fixmate fix mate information 77 | ## reheader replace BAM header 78 | ## targetcut cut fosmid regions (for fosmid pool only) 79 | ## addreplacerg adds or replaces RG tags 80 | ## markdup mark duplicates 81 | ## ampliconclip clip oligos from the end of reads 82 | ## 83 | ## -- File operations 84 | ## collate shuffle and group alignments by name 85 | ## cat concatenate BAMs 86 | ## merge merge sorted alignments 87 | ## mpileup multi-way pileup 88 | ## sort sort alignment file 89 | ## split splits a file by read group 90 | ## quickcheck quickly check if SAM/BAM/CRAM file appears intact 91 | ## fastq converts a BAM to a FASTQ 92 | ## fasta converts a BAM to a FASTA 93 | ## import Converts FASTA or FASTQ files to SAM/BAM/CRAM 94 | ## 95 | ## -- Statistics 96 | ## bedcov read depth per BED region 97 | ## coverage alignment depth and percent coverage 98 | ## depth compute the depth 99 | ## flagstat simple stats 100 | ## idxstats BAM index stats 101 | ## phase phase heterozygotes 102 | ## stats generate stats (former bamcheck) 103 | ## ampliconstats generate amplicon specific stats 104 | ## 105 | ## -- Viewing 106 | ## flags explain BAM flags 107 | ## tview text alignment viewer 108 | ## 
view SAM<->BAM<->CRAM conversion 109 | ## depad convert padded BAM to unpadded BAM 110 | ## 111 | ## -- Misc 112 | ## help [cmd] display this help message or help for [cmd] 113 | ## version detailed version information 114 | 115 | ## Viewing 116 | 117 | Use [bioSyntax](https://github.com/bioSyntax/bioSyntax) to prettify your 118 | output. 119 | 120 | ``` bash 121 | samtools view aln.bam | sam-less 122 | ``` 123 | 124 | ![bioSyntax](img/sam_less.png) 125 | 126 | ## Converting a SAM file to a BAM file 127 | 128 | A BAM file is just a SAM file but stored in binary format; you should 129 | always convert your SAM files into BAM format since they are smaller in 130 | size and are faster to manipulate. 131 | 132 | I don’t have a SAM file in the example folder, so let’s create one and 133 | check out the first ten lines. Note: remember to use `-h` to ensure the 134 | SAM file contains the sequence header information. Generally, I 135 | recommend storing only sorted BAM files as they use even less disk space 136 | and are faster to process. 137 | 138 | ``` bash 139 | samtools view -h eg/ERR188273_chrX.bam > eg/ERR188273_chrX.sam 140 | ``` 141 | 142 | Notice that the SAM file is much larger than the BAM file. 143 | 144 | Size of SAM file. 145 | 146 | ``` bash 147 | ls -lh eg/ERR188273_chrX.sam 148 | ``` 149 | 150 | ## -rw-r--r-- 1 root root 321M Aug 7 06:00 eg/ERR188273_chrX.sam 151 | 152 | Size of BAM file. 153 | 154 | ``` bash 155 | ls -lh eg/ERR188273_chrX.bam 156 | ``` 157 | 158 | ## -rw-r--r-- 1 root root 67M Jun 21 2020 eg/ERR188273_chrX.bam 159 | 160 | We can use `head` to view a SAM file. 
161 | 162 | ``` bash 163 | head eg/ERR188273_chrX.sam 164 | ``` 165 | 166 | ## @HD VN:1.0 SO:coordinate 167 | ## @SQ SN:chrX LN:156040895 168 | ## @PG ID:hisat2 PN:hisat2 VN:2.2.0 CL:"/Users/dtang/github/rnaseq/hisat2/../src/hisat2-2.2.0/hisat2-align-s --wrapper basic-0 --dta -p 4 -x ../raw/chrX_data/indexes/chrX_tran -1 /tmp/4195.inpipe1 -2 /tmp/4195.inpipe2" 169 | ## @PG ID:samtools PN:samtools PP:hisat2 VN:1.13 CL:samtools view -h eg/ERR188273_chrX.bam 170 | ## ERR188273.4711308 73 chrX 21649 0 5S70M = 21649 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 171 | ## ERR188273.4711308 133 chrX 21649 0 * = 21649 0 CTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTGTGTTGGCC CB@FDFFFHHGFHIJJJJIIIIIIIGGGIJGIIJJJJJJFFHIIIIGECHEHHGGHHFF?AACCDDDDDDDDBCD YT:Z:UP 172 | ## ERR188273.4711308 329 chrX 233717 0 5S70M = 233717 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 173 | ## ERR188273.14904746 99 chrX 251271 60 75M = 251317 121 GAAAAATGGGCCCAGGGGACCGGCGCTCAGCATACAGAGGACCCGCGCCGGCACCTGCCTCTGAGTTCCCTTAGT @@HHEGIIGAGIIIBGIIG@FECH aln.bam 187 | ``` 188 | 189 | If the header information is available, we can convert a SAM file into 190 | BAM by using `samtools view -b`. In newer versions of SAMtools, the 191 | input format is auto-detected, so we no longer need the `-S` parameter. 192 | 193 | ``` bash 194 | samtools view -b eg/ERR188273_chrX.sam > eg/my.bam 195 | ``` 196 | 197 | ## Converting a BAM file to a CRAM file 198 | 199 | The CRAM format is even more compact. Use `samtools view` with the `-T` 200 | and `-C` arguments to convert a BAM file into CRAM. 
201 | 202 | ``` bash 203 | samtools view -T genome/chrX.fa -C -o eg/ERR188273_chrX.cram eg/ERR188273_chrX.bam 204 | 205 | ls -lh eg/ERR188273_chrX.[sbcr]*am 206 | ``` 207 | 208 | ## -rw-r--r-- 1 root root 67M Jun 21 2020 eg/ERR188273_chrX.bam 209 | ## -rw-r--r-- 1 root root 40M Aug 7 06:01 eg/ERR188273_chrX.cram 210 | ## -rw-r--r-- 1 root root 321M Aug 7 06:00 eg/ERR188273_chrX.sam 211 | 212 | You can use `samtools view` to view a CRAM file just as you would for a 213 | BAM file. 214 | 215 | ``` bash 216 | samtools view eg/ERR188273_chrX.cram | head 217 | ``` 218 | 219 | ## ERR188273.4711308 73 chrX 21649 0 5S70M = 21649 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UP NH:i:2 MD:Z:70 NM:i:0 220 | ## ERR188273.4711308 133 chrX 21649 0 * = 21649 0 CTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTGTGTTGGCC CB@FDFFFHHGFHIJJJJIIIIIIIGGGIJGIIJJJJJJFFHIIIIGECHEHHGGHHFF?AACCDDDDDDDDBCD YT:Z:UP 221 | ## ERR188273.4711308 329 chrX 233717 0 5S70M = 233717 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UP NH:i:2 MD:Z:70 NM:i:0 222 | ## ERR188273.14904746 99 chrX 251271 60 75M = 251317 121 GAAAAATGGGCCCAGGGGACCGGCGCTCAGCATACAGAGGACCCGCGCCGGCACCTGCCTCTGAGTTCCCTTAGT @@HHEGIIGAGIIIBGIIG@FECHDC>>+@::8-755-BBBFDDEHHBGGEGHEEIJIIGIJJIGEIIIJJJIIJJIGGHHHGGFFFFF@@C AS:i:0 ZS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UP NH:i:10 MD:Z:75 NM:i:0 226 | ## ERR188273.5927795 385 chrX 265991 1 75M = 114048277 0 TGGGACTACAGGCGCCCGCCACCACGCCCGGCTAATTTTTTGTATTTTTAGTAGAGACGGGGTTTCACCGTGTTA =?BB??BD?FBHHBEAE@CDGG@HH=FA@GEGE;FGACCHBE6?A=ACE9)7@DCE>>5'3=338:;:>2;3?BCFFEEHHHEEGIGGHAGFBBHFBHHEHCG@<@ABG??@@?BB9GBGAFFD<. 
297 | 298 | ``` bash 299 | samtools flags 300 | ``` 301 | 302 | ## About: Convert between textual and numeric flag representation 303 | ## Usage: samtools flags FLAGS... 304 | ## 305 | ## Each FLAGS argument is either an INT (in decimal/hexadecimal/octal) representing 306 | ## a combination of the following numeric flag values, or a comma-separated string 307 | ## NAME,...,NAME representing a combination of the following flag names: 308 | ## 309 | ## 0x1 1 PAIRED paired-end / multiple-segment sequencing technology 310 | ## 0x2 2 PROPER_PAIR each segment properly aligned according to aligner 311 | ## 0x4 4 UNMAP segment unmapped 312 | ## 0x8 8 MUNMAP next segment in the template unmapped 313 | ## 0x10 16 REVERSE SEQ is reverse complemented 314 | ## 0x20 32 MREVERSE SEQ of next segment in template is rev.complemented 315 | ## 0x40 64 READ1 the first segment in the template 316 | ## 0x80 128 READ2 the last segment in the template 317 | ## 0x100 256 SECONDARY secondary alignment 318 | ## 0x200 512 QCFAIL not passing quality controls or other filters 319 | ## 0x400 1024 DUP PCR or optical duplicate 320 | ## 0x800 2048 SUPPLEMENTARY supplementary alignment 321 | 322 | ## Filtering unmapped reads 323 | 324 | Use `-F 4` to filter out unmapped reads. 325 | 326 | ``` bash 327 | samtools view -F 4 -b eg/ERR188273_chrX.bam > eg/ERR188273_chrX.mapped.bam 328 | ``` 329 | 330 | Use `-f 4` to keep only unmapped reads. 331 | 332 | ``` bash 333 | samtools view -f 4 -b eg/ERR188273_chrX.bam > eg/ERR188273_chrX.unmapped.bam 334 | ``` 335 | 336 | We can use the `flags` subcommand to confirm that a value of four 337 | represents an unmapped read. 338 | 339 | ``` bash 340 | samtools flags 4 341 | ``` 342 | 343 | ## 0x4 4 UNMAP 344 | 345 | ## Extracting entries mapping to a specific loci 346 | 347 | Use `samtools view` and the `ref:start-end` syntax to extract reads 348 | mapping within a specific genomic loci; this requires a BAM index file. 
349 | 350 | ``` bash 351 | samtools view eg/ERR188273_chrX.bam chrX:20000-30000 352 | ``` 353 | 354 | ## ERR188273.4711308 73 chrX 21649 0 5S70M = 21649 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 355 | ## ERR188273.4711308 133 chrX 21649 0 * = 21649 0 CTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTGTGTTGGCC CB@FDFFFHHGFHIJJJJIIIIIIIGGGIJGIIJJJJJJFFHIIIIGECHEHHGGHHFF?AACCDDDDDDDDBCD YT:Z:UP 356 | 357 | Note that this takes into account the mapping of the entire read and not 358 | just the starting position. For example, if you specified 359 | chrX:20000-30000, a 75 bp long read that starts its mapping from 360 | position 19999 will also be returned. In addition, you can save the 361 | output as another BAM file if you want. 362 | 363 | ``` bash 364 | samtools view -b eg/ERR188273_chrX.bam chrX:20000-30000 > eg/ERR188273_chrX_20000_30000.bam 365 | ``` 366 | 367 | If you want reads mapped to a single reference (e.g. chromosome), just 368 | specify the `ref` and leave out the start and end values. 
369 | 370 | ``` bash 371 | samtools view eg/ERR188273_chrX.bam chrX | head 372 | ``` 373 | 374 | ## ERR188273.4711308 73 chrX 21649 0 5S70M = 21649 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 375 | ## ERR188273.4711308 133 chrX 21649 0 * = 21649 0 CTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTGTGTTGGCC CB@FDFFFHHGFHIJJJJIIIIIIIGGGIJGIIJJJJJJFFHIIIIGECHEHHGGHHFF?AACCDDDDDDDDBCD YT:Z:UP 376 | ## ERR188273.4711308 329 chrX 233717 0 5S70M = 233717 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 377 | ## ERR188273.14904746 99 chrX 251271 60 75M = 251317 121 GAAAAATGGGCCCAGGGGACCGGCGCTCAGCATACAGAGGACCCGCGCCGGCACCTGCCTCTGAGTTCCCTTAGT @@HHEGIIGAGIIIBGIIG@FECHDC>>+@::8-755-BBBFDDEHHBGGEGHEEIJIIGIJJIGEIIIJJJIIJJIGGHHHGGFFFFF@@C AS:i:0 ZS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:75 YT:Z:UP NH:i:10 381 | ## ERR188273.5927795 385 chrX 265991 1 75M = 114048277 0 TGGGACTACAGGCGCCCGCCACCACGCCCGGCTAATTTTTTGTATTTTTAGTAGAGACGGGGTTTCACCGTGTTA =?BB??BD?FBHHBEAE@CDGG@HH=FA@GEGE;FGACCHBE6?A=ACE9)7@DCE>>5'3=338:;:>2;3?BCFFEEHHHEEGIGGHAGFBBHFBHHEHCG@<@ABG??@@?BB9GBGAFFD<DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 397 | ## ERR188273.4711308 133 chrX 21649 0 * = 21649 0 CTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTGTGTTGGCC CB@FDFFFHHGFHIJJJJIIIIIIIGGGIJGIIJJJJJJFFHIIIIGECHEHHGGHHFF?AACCDDDDDDDDBCD YT:Z:UP 398 | ## ERR188273.4711308 329 chrX 233717 0 5S70M = 233717 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 
XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 399 | ## ERR188273.14904746 99 chrX 251271 60 75M = 251317 121 GAAAAATGGGCCCAGGGGACCGGCGCTCAGCATACAGAGGACCCGCGCCGGCACCTGCCTCTGAGTTCCCTTAGT @@HHEGIIGAGIIIBGIIG@FECH eg/first.bam 410 | ``` 411 | 412 | Once again, you can use `flags` to verify this (it also accepts 413 | hexadecimal input). 414 | 415 | ``` bash 416 | samtools flags 0x0040 417 | ``` 418 | 419 | ## 0x40 64 READ1 420 | 421 | ## Stats 422 | 423 | For simple statistics use `samtools flagstat`. 424 | 425 | ``` bash 426 | samtools flagstat eg/ERR188273_chrX.bam 427 | ``` 428 | 429 | ## 1176360 + 0 in total (QC-passed reads + QC-failed reads) 430 | ## 1160084 + 0 primary 431 | ## 16276 + 0 secondary 432 | ## 0 + 0 supplementary 433 | ## 0 + 0 duplicates 434 | ## 0 + 0 primary duplicates 435 | ## 1126961 + 0 mapped (95.80% : N/A) 436 | ## 1110685 + 0 primary mapped (95.74% : N/A) 437 | ## 1160084 + 0 paired in sequencing 438 | ## 580042 + 0 read1 439 | ## 580042 + 0 read2 440 | ## 1060858 + 0 properly paired (91.45% : N/A) 441 | ## 1065618 + 0 with itself and mate mapped 442 | ## 45067 + 0 singletons (3.88% : N/A) 443 | ## 0 + 0 with mate mapped to a different chr 444 | ## 0 + 0 with mate mapped to a different chr (mapQ>=5) 445 | 446 | For more stats, use `samtools stats`. 
447 | 448 | ``` bash 449 | samtools stats eg/ERR188273_chrX.bam | grep ^SN 450 | ``` 451 | 452 | ## SN raw total sequences: 1160084 # excluding supplementary and secondary reads 453 | ## SN filtered sequences: 0 454 | ## SN sequences: 1160084 455 | ## SN is sorted: 1 456 | ## SN 1st fragments: 580042 457 | ## SN last fragments: 580042 458 | ## SN reads mapped: 1110685 459 | ## SN reads mapped and paired: 1065618 # paired-end technology bit set + both mates mapped 460 | ## SN reads unmapped: 49399 461 | ## SN reads properly paired: 1060858 # proper-pair bit set 462 | ## SN reads paired: 1160084 # paired-end technology bit set 463 | ## SN reads duplicated: 0 # PCR or optical duplicate bit set 464 | ## SN reads MQ0: 905 # mapped and MQ=0 465 | ## SN reads QC failed: 0 466 | ## SN non-primary alignments: 16276 467 | ## SN supplementary alignments: 0 468 | ## SN total length: 87006300 # ignores clipping 469 | ## SN total first fragment length: 43503150 # ignores clipping 470 | ## SN total last fragment length: 43503150 # ignores clipping 471 | ## SN bases mapped: 83301375 # ignores clipping 472 | ## SN bases mapped (cigar): 83064942 # more accurate 473 | ## SN bases trimmed: 0 474 | ## SN bases duplicated: 0 475 | ## SN mismatches: 423271 # from NM fields 476 | ## SN error rate: 5.095663e-03 # mismatches / bases mapped (cigar) 477 | ## SN average length: 75 478 | ## SN average first fragment length: 75 479 | ## SN average last fragment length: 75 480 | ## SN maximum length: 75 481 | ## SN maximum first fragment length: 75 482 | ## SN maximum last fragment length: 75 483 | ## SN average quality: 36.0 484 | ## SN insert size average: 182.7 485 | ## SN insert size standard deviation: 176.0 486 | ## SN inward oriented pairs: 530763 487 | ## SN outward oriented pairs: 1042 488 | ## SN pairs with other orientation: 1004 489 | ## SN pairs on different chromosomes: 0 490 | ## SN percentage of properly paired reads (%): 91.4 491 | 492 | ## samtools calmd/fillmd 493 | 494 | The 
`calmd` or `fillmd` tool is useful for visualising mismatches and 495 | insertions in an alignment of a read to a reference genome. The `-e` 496 | argument changes identical bases between the read and reference into 497 | `=`. 498 | 499 | ``` bash 500 | samtools view -b eg/ERR188273_chrX.bam | samtools fillmd -e - genome/chrX.fa > eg/ERR188273_chrX_fillmd.bam 501 | 502 | head eg/ERR188273_chrX_fillmd.bam 503 | ``` 504 | 505 | ## @HD VN:1.0 SO:coordinate 506 | ## @SQ SN:chrX LN:156040895 507 | ## @PG ID:hisat2 PN:hisat2 VN:2.2.0 CL:"/Users/dtang/github/rnaseq/hisat2/../src/hisat2-2.2.0/hisat2-align-s --wrapper basic-0 --dta -p 4 -x ../raw/chrX_data/indexes/chrX_tran -1 /tmp/4195.inpipe1 -2 /tmp/4195.inpipe2" 508 | ## @PG ID:samtools PN:samtools PP:hisat2 VN:1.13 CL:samtools view -b eg/ERR188273_chrX.bam 509 | ## @PG ID:samtools.1 PN:samtools PP:samtools VN:1.13 CL:samtools fillmd -e - genome/chrX.fa 510 | ## ERR188273.4711308 73 chrX 21649 0 5S70M = 21649 0 CGGGT====================================================================== @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 511 | ## ERR188273.4711308 133 chrX 21649 0 * = 21649 0 CTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTGTGTTGGCC CB@FDFFFHHGFHIJJJJIIIIIIIGGGIJGIIJJJJJJFFHIIIIGECHEHHGGHHFF?AACCDDDDDDDDBCD YT:Z:UP 512 | ## ERR188273.4711308 329 chrX 233717 0 5S70M = 233717 0 CGGGT====================================================================== @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 513 | ## ERR188273.14904746 99 chrX 251271 60 75M = 251317 121 =========================================================================== @@HHEGIIGAGIIIBGIIG@FECHDGHCHHHGHHFFFFFDEACC@ 532 | ## @ERR188273.14904746 533 | ## GAAAAATGGGCCCAGGGGACCGGCGCTCAGCATACAGAGGACCCGCGCCGGCACCTGCCTCTGAGTTCCCTTAGT 
534 | ## + 535 | ## @@HHEGIIGAGIIIBGIIG@FECH eg/ERR188273_chrX_rand.bam 547 | ``` 548 | 549 | ## Count number of reads 550 | 551 | Use `samtools idxstats` to print stats on a BAM file; this requires an 552 | index file which is created by running `samtools index`. The output of 553 | idxstats is a file with four tab-delimited columns: 554 | 555 | 1. Reference name 556 | 2. Sequence length of reference 557 | 3. Number of mapped reads 558 | 4. Number of unmapped reads 559 | 560 | 561 | 562 | ``` bash 563 | samtools idxstats eg/ERR188273_chrX.bam 564 | ``` 565 | 566 | ## chrX 156040895 1126961 45067 567 | ## * 0 0 4332 568 | 569 | We can use this with `awk` to calculate: 570 | 571 | The number of mapped reads by summing the third column. 572 | 573 | ``` bash 574 | samtools idxstats eg/ERR188273_chrX.bam | awk '{s+=$3} END {print s}' 575 | ``` 576 | 577 | ## 1126961 578 | 579 | The number of reads, which is the sum of mapped and unmapped reads. 580 | 581 | ``` bash 582 | samtools idxstats eg/ERR188273_chrX.bam | awk '{s+=$3+$4} END {print s}' 583 | ``` 584 | 585 | ## 1176360 586 | 587 | ## Obtaining genomic sequence 588 | 589 | Use `faidx` to fetch genomic sequence; coordinates are 1-based. 590 | 591 | We need to first index the reference FASTA file that was used to map the 592 | reads. 593 | 594 | ``` bash 595 | samtools faidx genome/chrX.fa 596 | ``` 597 | 598 | Now we can obtain the sequence. 599 | 600 | ``` bash 601 | samtools faidx genome/chrX.fa chrX:300000-300100 602 | ``` 603 | 604 | ## >chrX:300000-300100 605 | ## ctgagatcgtgccactgcactccagcctgggcgacagagcgagactccatctcaaaaaaa 606 | ## aaaaaaaaaaaaaagaTggggtctctctatgttggccaggt 607 | 608 | ## Comparing BAM files 609 | 610 | Install [deepTools](https://deeptools.readthedocs.io/en/develop/) and 611 | use 612 | [bamCompare](https://deeptools.readthedocs.io/en/develop/content/tools/bamCompare.html). 613 | The bigWig output file shows the ratio of reads between `b1` and `b2` in 614 | 50 bp (default) windows. 
615 | 616 | ## Converting reference names 617 | 618 | One of the most annoying bioinformatics problems is the use of different 619 | chromosome names, e.g. chr1 vs 1, in different references even when the 620 | sequences are identical. The GRCh38 reference downloaded from Ensembl 621 | has chromosome names without the `chr`: 622 | 623 | >1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF 624 | 625 | Whereas the reference names from UCSC has the `chr`: 626 | 627 | >chr1 AC:CM000663.2 gi:568336023 LN:248956422 rl:Chromosome M5:6aef897c3d6ff0c78aff06ac189178dd AS:GRCh38 628 | 629 | Luckily you can change the reference names using `samtools reheader` but 630 | just make sure your reference sequences are actually identical. 631 | 632 | ``` bash 633 | samtools view eg/ERR188273_chrX.bam | head -2 634 | ``` 635 | 636 | ## ERR188273.4711308 73 chrX 21649 0 5S70M = 21649 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 637 | ## ERR188273.4711308 133 chrX 21649 0 * = 21649 0 CTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTGTGTTGGCC CB@FDFFFHHGFHIJJJJIIIIIIIGGGIJGIIJJJJJJFFHIIIIGECHEHHGGHHFF?AACCDDDDDDDDBCD YT:Z:UP 638 | 639 | View header 640 | 641 | ``` bash 642 | samtools view -H eg/ERR188273_chrX.bam 643 | ``` 644 | 645 | ## @HD VN:1.0 SO:coordinate 646 | ## @SQ SN:chrX LN:156040895 647 | ## @PG ID:hisat2 PN:hisat2 VN:2.2.0 CL:"/Users/dtang/github/rnaseq/hisat2/../src/hisat2-2.2.0/hisat2-align-s --wrapper basic-0 --dta -p 4 -x ../raw/chrX_data/indexes/chrX_tran -1 /tmp/4195.inpipe1 -2 /tmp/4195.inpipe2" 648 | ## @PG ID:samtools PN:samtools PP:hisat2 VN:1.13 CL:samtools view -H eg/ERR188273_chrX.bam 649 | 650 | Substitute header with new name. 
651 | 652 | ``` bash 653 | samtools view -H eg/ERR188273_chrX.bam | sed 's/SN:chrX/SN:X/' > eg/my_header 654 | ``` 655 | 656 | Save bam file with new ref and check it out. 657 | 658 | ``` bash 659 | samtools reheader eg/my_header eg/ERR188273_chrX.bam > eg/ERR188273_X.bam 660 | samtools view eg/ERR188273_X.bam | head -2 661 | ``` 662 | 663 | ## ERR188273.4711308 73 X 21649 0 5S70M = 21649 0 CGGGTGATCACGAGGTCAGGAGATCAAGACCATCCTGGCCAACACAGTGAAACCCCATCTCTACTAAAAATACAA @@@F=DDFFHGHBHIFFHIGGIFGEGHFHIGIGIFIIIGIGIGGDHIIGIIC@>DGHCHHHGHHFFFFFDEACC@ AS:i:-5 ZS:i:-5 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:70 YT:Z:UP NH:i:2 664 | ## ERR188273.4711308 133 X 21649 0 * = 21649 0 CTACAGGTGCCCGCCACCATGCCCAGCTAATTTTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTGTGTTGGCC CB@FDFFFHHGFHIJJJJIIIIIIIGGGIJGIIJJJJJJFFHIIIIGECHEHHGGHHFF?AACCDDDDDDDDBCD YT:Z:UP 665 | 666 | ## Coverage 667 | 668 | There are several ways to calculate coverage, i.e. count the number of 669 | bases mapped to positions on the reference. `samtools depth` will return 670 | three columns: reference, position, and coverage. 671 | 672 | ``` bash 673 | samtools depth eg/ERR188273_chrX.bam | head 674 | ``` 675 | 676 | ## chrX 21649 1 677 | ## chrX 21650 1 678 | ## chrX 21651 1 679 | ## chrX 21652 1 680 | ## chrX 21653 1 681 | ## chrX 21654 1 682 | ## chrX 21655 1 683 | ## chrX 21656 1 684 | ## chrX 21657 1 685 | ## chrX 21658 1 686 | 687 | The `samtools mpileup` command can also provide depth information (but 688 | not for reads that have a mapping quality of 0, by default) with some 689 | additional information: 690 | 691 | 1. Sequence name 692 | 2. 1-based coordinate 693 | 3. Reference base (when used with `-f`) 694 | 4. Number of reads covering this position 695 | 5. Read bases 696 | 6. Base qualities 697 | 7. 
Alignment mapping qualities (when used with `-s`) 698 | 699 | 700 | 701 | ``` bash 702 | samtools mpileup -f genome/chrX.fa -s eg/ERR188273_chrX.bam | head 703 | ``` 704 | 705 | ## [mpileup] 1 samples in 1 input files 706 | ## chrX 251271 g 1 ^]. @ ] 707 | ## chrX 251272 a 1 . @ ] 708 | ## chrX 251273 a 1 . < ] 709 | ## chrX 251274 a 1 . D ] 710 | ## chrX 251275 a 1 . D ] 711 | ## chrX 251276 a 1 . D ] 712 | ## chrX 251277 t 1 . D ] 713 | ## chrX 251278 g 1 . D ] 714 | ## chrX 251279 g 1 . F ] 715 | ## chrX 251280 g 1 . B ] 716 | 717 | I have an [old blog 718 | post](https://davetang.org/muse/2015/08/26/samtools-mpileup/) on using 719 | `mpileup`. 720 | 721 | `samtools coverage` will provide the following coverage statistics: 722 | 723 | 1. rname - Reference name / chromosome 724 | 2. startpos - Start position 725 | 3. endpos - End position (or sequence length) 726 | 4. numreads - Number reads aligned to the region (after filtering) 727 | 5. covbases - Number of covered bases with depth \>= 1 728 | 6. coverage - Proportion of covered bases \[0..1\] 729 | 7. meandepth - Mean depth of coverage 730 | 8. meanbaseq - Mean base quality in covered region 731 | 9. 
meanmapq - Mean mapping quality of selected reads 732 | 733 | 734 | 735 | ``` bash 736 | samtools coverage eg/ERR188273_chrX.bam 737 | ``` 738 | 739 | ## #rname startpos endpos numreads covbases coverage meandepth meanbaseq meanmapq 740 | ## chrX 1 156040895 1110685 3402037 2.18022 0.532299 36.3 59.4 741 | 742 | ## Stargazers over time 743 | 744 | [![Stargazers over 745 | time](https://starchart.cc/davetang/learning_bam_file.svg)](https://starchart.cc/davetang/learning_bam_file) 746 | -------------------------------------------------------------------------------- /mkdocs/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Learning the BAM format 2 | theme: readthedocs 3 | -------------------------------------------------------------------------------- /script/README.md: -------------------------------------------------------------------------------- 1 | ## README 2 | 3 | How to extract a subset of reads (IDs stored in a file) from a BAM file? 4 | 5 | Download BAM file with 918,571 reads. 6 | 7 | ```bash 8 | wget -c -N http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeUwRepliSeq/wgEncodeUwRepliSeqK562G1AlnRep1.bam 9 | samtools index wgEncodeUwRepliSeqK562G1AlnRep1.bam 10 | 11 | samtools idxstats wgEncodeUwRepliSeqK562G1AlnRep1.bam | perl -lane '$i += $F[2]; END { print $i }' 12 | 918571 13 | ``` 14 | 15 | Get some random IDs (around 0.1% of total). 16 | 17 | ```bash 18 | samtools view -@8 -s 1984.001 wgEncodeUwRepliSeqK562G1AlnRep1.bam | cut -f1 > my_id.txt 19 | 20 | cat my_id.txt | wc -l 21 | 881 22 | ``` 23 | 24 | Extract sequences from `my_id.txt`. 25 | 26 | ```bash 27 | time samtools view -@8 wgEncodeUwRepliSeqK562G1AlnRep1.bam | grep -w -f my_id.txt > my_seq.txt 28 | 29 | real 35m48.658s 30 | user 35m46.491s 31 | sys 0m1.273s 32 | 33 | cat my_seq.txt | wc -l 34 | 881 35 | ``` 36 | 37 | Split and grep. 
38 | 39 | ```bash 40 | time for ref in $(samtools idxstats wgEncodeUwRepliSeqK562G1AlnRep1.bam | grep -v '^*' | cut -f1); do 41 | samtools view wgEncodeUwRepliSeqK562G1AlnRep1.bam $ref | grep -w -f my_id.txt > $ref.txt 42 | done 43 | 44 | real 40m12.463s 45 | user 39m48.597s 46 | sys 0m9.205s 47 | 48 | cat chr*.txt | wc -l 49 | 881 50 | ``` 51 | 52 | Split and grep using `parallel` and 7 threads. 53 | 54 | ```bash 55 | time samtools idxstats wgEncodeUwRepliSeqK562G1AlnRep1.bam | 56 | grep -v '^*' | 57 | cut -f1 | 58 | parallel -j 7 --verbose 'samtools view wgEncodeUwRepliSeqK562G1AlnRep1.bam {} | grep -w -f my_id.txt > {}.txt' 59 | 60 | real 10m26.894s 61 | user 67m26.912s 62 | sys 0m7.958s 63 | 64 | cat chr*.txt | wc -l 65 | 881 66 | ``` 67 | 68 | Back to BAM. 69 | 70 | ```bash 71 | samtools view -H wgEncodeUwRepliSeqK562G1AlnRep1.bam > header 72 | 73 | cat header chr*.txt | samtools view -Sb | samtools sort - > my_id.bam 74 | ``` 75 | 76 | As a Perl script using `Bio::DB::Sam` and `Parallel::ForkManager`. 77 | 78 | ```bash 79 | samtools idxstats wgEncodeUwRepliSeqK562G1AlnRep1.bam > wgEncodeUwRepliSeqK562G1AlnRep1.idxstats 80 | 81 | wget -c -N https://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz 82 | gunzip hg19.fa.gz 83 | samtools faidx hg19.fa 84 | 85 | script/get_reads.pl -p 4 -b wgEncodeUwRepliSeqK562G1AlnRep1.bam -r 27 -l my_id.txt -i wgEncodeUwRepliSeqK562G1AlnRep1.idxstats -f hg19.fa > reads.txt 86 | ``` 87 | 88 | Use my Docker image if you have problems installing the Perl packages.
#!/usr/bin/env bash
#
# Compare the coverage reported by `samtools coverage`, `samtools depth` and
# `samtools mpileup` against the coverage expected from simulated data.
#
# Steps (each one is skipped when its output already exists in test/):
#   1. generate a random reference with generate_random_seq.pl and index it
#      with bwa
#   2. simulate paired-end reads from that reference with random_paired_end.pl
#   3. map the reads with bwa mem, then sort and index with samtools
#   4. print the expected coverage followed by the three measured values
#
# Progress messages go to STDERR; the coverage tables go to STDOUT.

set -euo pipefail

bin=$(dirname "$0")
root=${bin}/..
outdir=${root}/test

# -p makes this a no-op when the directory already exists
mkdir -p "${outdir}"

# Exit with an error unless the given tool is an executable found in PATH.
check_depend (){
   local tool=$1
   if [[ ! -x $(command -v "${tool}") ]]; then
     >&2 echo Could not find ${tool}
     exit 1
   fi
}

# bc and perl are called below; gzip is called by random_paired_end.pl
for t in samtools bwa column bc perl gzip; do
   check_depend "${t}"
done

ref_num=2
ref_size=1000000
seed=1984
ref=ref_${ref_num}_${ref_size}_${seed}.fa
base=$(basename "${ref}" .fa)

if [[ ! -e ${outdir}/${ref} ]]; then
   >&2 echo Generating random reference
   "${bin}/generate_random_seq.pl" ${ref_num} ${ref_size} ${seed} > "${outdir}/${ref}"
   >&2 echo Indexing ${ref}
   bwa index "${outdir}/${ref}"
fi

len=150
num=111111
md=500
# these names must match the ones random_paired_end.pl derives from its options
read1=l${len}_n${num}_d${md}_${seed}_1.fq.gz
read2=l${len}_n${num}_d${md}_${seed}_2.fq.gz

if [[ ! -e ${outdir}/${read1} || ! -e ${outdir}/${read2} ]]; then
   >&2 echo Generating random reads
   # simulate from the reference generated above; the previous hard-coded
   # ${outdir}/ref.fa is never created by this script
   "${bin}/random_paired_end.pl" \
      -f "${outdir}/${ref}" \
      -l ${len} \
      -n ${num} \
      -m ${md} \
      -s ${seed}
   # random_paired_end.pl writes into the current working directory
   mv -f "${read1}" "${read2}" "${outdir}"
fi

# (re)map when either the BAM file or its index is missing
if [[ ! -e ${outdir}/${base}.bwa.bam || ! -e ${outdir}/${base}.bwa.bam.bai ]]; then
   >&2 echo Mapping reads
   bwa mem "${outdir}/${ref}" "${outdir}/${read1}" "${outdir}/${read2}" |\
      samtools sort -O BAM |\
      tee "${outdir}/${base}.bwa.bam" |\
      samtools index - "${outdir}/${base}.bwa.bam.bai"
fi

# expected coverage = sequenced bases / reference bases; every one of the
# ${num} pairs contributes two reads of ${len} bases
cov=$(bc -l <<< "2*${len}*${num}/(${ref_size}*${ref_num})")
>&2 echo -e "Coverage should be ${cov}\n"
>&2 echo Coverage calculation using samtools coverage
samtools coverage "${outdir}/${base}.bwa.bam" | column -t
>&2 echo -e

# $. is the number of lines read, i.e. the number of reported positions;
# guard against empty input so the division cannot fail
>&2 echo Coverage calculation using samtools depth
samtools depth "${outdir}/${base}.bwa.bam" | perl -ane '$t += $F[2]; END {$cov = $. ? $t / $. : 0; printf "Bases covered:\t%.2f\nCoverage:\t%.2f\n", $., $cov}'
>&2 echo -e

>&2 echo Coverage calculation using samtools mpileup
samtools mpileup "${outdir}/${base}.bwa.bam" | perl -ane '$t += $F[3]; END {$cov = $. ? $t / $. : 0; printf "Bases covered:\t%.2f\nCoverage:\t%.2f\n", $., $cov}'
>&2 echo -e

>&2 echo Done
exit 0
#!/usr/bin/env perl
#
# Print the alignments of a BAM file whose read IDs appear in a list.
#
# Every reference sequence named in the idxstats file is split into chunks.
# The chunks are searched in parallel with Parallel::ForkManager; each child
# writes its matching alignments to a temporary file and the parent
# concatenates those files to STDOUT once all children have finished.

use strict;
use warnings;
use Bio::DB::Sam;
use Getopt::Std;
use Parallel::ForkManager;

my %opts = ();
# -h is a plain flag: declaring it as "h:" made it swallow the next argument
# and fail when given on its own
getopts('b:t:l:i:s:p:hf:r:', \%opts) or usage();

if ($opts{'h'}){
   usage();
}
foreach my $required (qw(b l f i r)){
   usage() unless exists $opts{$required};
}

# arguments are explained in the usage at the end of the script
my $bam_file   = $opts{'b'};
my $list       = $opts{'l'};
my $fasta_file = $opts{'f'};
my $idx_file   = $opts{'i'};
my $read_len   = $opts{'r'};

my $num_split = 40;
my $processes = 8;
my $tmp_dir   = "/tmp/";

if ($opts{'t'}){
   # check the directory that was supplied, not the default
   if (!-d $opts{'t'}){
      die("$opts{'t'} does not exist\n");
   }
   $tmp_dir = $opts{'t'};
   warn("Using $tmp_dir as temporary directory\n");
}

if ($opts{'p'}){
   $processes = $opts{'p'};
}

if ($opts{'s'}){
   $num_split = $opts{'s'};
}

# the chunk arithmetic below needs whole numbers
die("-r must be a non-negative integer (got '$read_len')\n") unless $read_len =~ /\A[0-9]+\z/;
die("-p must be a positive integer (got '$processes')\n") unless $processes =~ /\A[1-9][0-9]*\z/;
die("-s must be a positive integer (got '$num_split')\n") unless $num_split =~ /\A[1-9][0-9]*\z/;

# store list of read names
my %list = store_read($list);
my $n = scalar(keys %list);
warn("Stored $n reads\n");

# get chromosome sizes for splitting
my %ref_size = get_ref_size($idx_file);

warn("Using $processes processors\n");
my $manager = Parallel::ForkManager->new($processes);

# store list of files to merge back together
my @chunk_files = ();

# sorted so that the order of the merged output is the same on every run
foreach my $ref (sort keys %ref_size){

   # parallelisation by splitting BAM file into chunks
   foreach my $range (chunk_ranges($ref_size{$ref}, $num_split, $read_len)){

      my ($chunk_start, $chunk_end) = @{$range};
      my $region  = "$ref:$chunk_start-$chunk_end";
      # random prefix drawn in the parent so every chunk gets its own file name
      my $str     = rand_str(10);
      my $outfile = "$tmp_dir/${str}_$region.tsv";
      push(@chunk_files, $outfile);

      # start() returns the child's PID in the parent and 0 in the child
      $manager->start and next;
      process_chunk($ref, $chunk_start, $chunk_end, $outfile);
      $manager->finish;

   }

}

$manager->wait_all_children;

warn("Merging files\n");

foreach my $infile (@chunk_files){
   open(my $in, '<', $infile) or die "Could not open $infile: $!\n";
   while (my $line = <$in>){
      chomp $line;
      print "$line\n";
   }
   close($in);
   # unlink($infile);
}

warn("Done\n");

exit(0);

# Split a reference of $size bases into at most $num_split regions.
#
# Returns a list of [start, end] pairs. Each region ends $read_len bases
# before the next one starts; presumably this keeps a read of $read_len bases
# from being reported by two neighbouring regions (confirm against
# get_features_by_location's overlap rule). The final region always runs to
# the end of the reference: previously it also stopped $read_len bases (plus
# any rounding remainder) short, so reads starting there were never returned.
sub chunk_ranges {
   my ($size, $num_split, $read_len) = @_;
   return () unless defined $size && $size =~ /\A[0-9]+\z/ && $size > 0;

   # round up so that $num_split chunks always cover the whole reference
   my $chunk_size = int($size / $num_split);
   ++$chunk_size if $chunk_size * $num_split < $size;

   my @ranges = ();
   foreach my $i (0 .. $num_split - 1){
      my $start   = $i * $chunk_size;
      my $end     = ($i + 1) * $chunk_size - $read_len;
      my $is_last = ($i == $num_split - 1) || (($i + 1) * $chunk_size >= $size);
      $end = $size  if $is_last || $end > $size;
      # happens when $read_len is at least as large as a chunk
      $end = $start if $end < $start;
      push(@ranges, [ $start, $end ]);
      last if $is_last;
   }
   return(@ranges);
}

# Write every alignment overlapping $ref:$chunk_start-$chunk_end whose read
# name is a key of %list to $outfile, one SAM-formatted line per alignment.
sub process_chunk {

   my ($ref, $chunk_start, $chunk_end, $outfile) = @_;
   my $region = "$ref:$chunk_start-$chunk_end";

   warn("Processing chunk: $region\n");

   my $sam = Bio::DB::Sam->new(
      -bam   => $bam_file,
      -fasta => $fasta_file
   );

   my @alignments = $sam->get_features_by_location(
      -seq_id => $ref,
      -start  => $chunk_start,
      -end    => $chunk_end
   );

   open(my $out, '>', $outfile) or die "Could not open $outfile for writing: $!\n";
   foreach my $aln (@alignments){
      my $read_id = $aln->display_name;
      if (exists $list{$read_id}){
         print {$out} $aln->tam_line, "\n";
      }
   }
   # buffered write errors only surface when the handle is closed
   close($out) or die "Could not close $outfile: $!\n";

}

# Read one read ID per line from $infile, skipping empty lines and lines that
# start with '#'. Returns a hash with the IDs as keys.
sub store_read {
   my ($infile) = @_;
   my %id = ();
   open(my $in, '<', $infile) or die "Could not open $infile: $!\n";
   while (my $line = <$in>){
      chomp $line;
      next if $line =~ /^$/;
      next if $line =~ /^#/;
      $id{$line} = 1;
   }
   close($in);
   return(%id);
}

# Return a random string of $len characters drawn from A-Z, a-z and 0-9.
sub rand_str {
   my ($len) = @_;
   my @chars = ("A".."Z", "a".."z", 0..9);
   my $str = '';
   for (1 .. $len){
      $str .= $chars[rand @chars];
   }
   return($str);
}

# Parse a file of tab-separated "name, size, ..." records (the columns the
# script takes from samtools idxstats output) and return a hash mapping name
# to size. The '*' record is skipped.
sub get_ref_size {
   my ($infile) = @_;
   my %size_of = ();
   open(my $in, '<', $infile) or die "Could not open $infile: $!\n";
   while (my $line = <$in>){
      chomp $line;
      next if $line =~ /^\*/;
      my ($chr, $size) = split(/\t/, $line);
      $size_of{$chr} = $size;
   }
   close($in);
   return(%size_of);
}

# Print the usage message to STDERR and exit.
# NOTE(review): the text is taken from the usage output shown in
# script/README.md; confirm the wording and the exit status against the
# original script.
sub usage {
   print STDERR <<"EOF";
Usage: $0 -b FILE -l FILE -t DIR -p INT -s INT -r INT

-b infile.bam           BAM file
-r 100                  Length of a read
-l list.txt             List of read IDs
-f genome.fasta         FASTA file used for read alignment
-i file.idxstats        Output from samtools idxstats saved in a file
-t /scratch/tmp         Directory for temporary files (default /tmp/)
-p 8                    Number of processors to use (default 8)
-s 40                   Number of chunks to split BAM file (default 40)
-h                      this helpful usage message

EOF
   exit(1);
}
#!/usr/bin/env perl

# Simple script that takes an input fasta sequence
# and generates paired end reads
#
# Writes two gzipped FASTQ files into the current directory, named
# l<len>_n<num>_d<inner mate>_<seed>_1.fq.gz and ..._2.fq.gz. Read 1 is taken
# from the forward strand; read 2 is the reverse complement of the sequence
# that starts <inner mate> bases after the end of read 1. With -d, that
# fraction of pairs has read 2 drawn from a different sequence instead.

use strict;
use warnings;
use Getopt::Std;

my %opts = ();
# -h is a plain flag: declaring it as "h:" made it require an argument
getopts('hf:l:n:m:s:d:', \%opts) or usage();

if ($opts{'h'} ||
    !exists $opts{'f'} ||
    !exists $opts{'l'} ||
    !exists $opts{'n'} ||
    !exists $opts{'m'}
){
   usage();
}

my $fasta      = $opts{'f'};
my $len        = $opts{'l'};
my $num        = $opts{'n'};
my $inner_mate = $opts{'m'};
my $seed       = exists $opts{'s'} ? $opts{'s'} : 1984;
my $discord    = exists $opts{'d'} ? $opts{'d'} : 0;

# these values drive the coordinate arithmetic and become part of the output
# file names that are handed to the shell below, so insist on plain numbers
die "Read length (-l) must be a positive integer\n"            unless $len        =~ /\A[1-9][0-9]*\z/;
die "Number of reads (-n) must be a non-negative integer\n"    unless $num        =~ /\A[0-9]+\z/;
die "Inner mate distance (-m) must be a non-negative integer\n" unless $inner_mate =~ /\A[0-9]+\z/;
die "Seed (-s) must be a non-negative integer\n"               unless $seed       =~ /\A[0-9]+\z/;
die "Fraction discordant (-d) must be between 0 and 1\n"
   unless $discord =~ /\A[0-9]*\.?[0-9]+\z/ && $discord <= 1;

# set seed for reproducibility
srand($seed);

my $fh;
if ($fasta =~ /\.gz$/){
   # list form: the file name is passed to gunzip without going through a shell
   open($fh, '-|', 'gunzip', '-c', $fasta) or die "Could not open $fasta $!\n";
} else {
   open($fh, '<', $fasta) or die "Could not open $fasta $!\n";
}

# sequence ID (the whole header line after '>') -> sequence
my %seq = ();
my $id = '';
while (my $line = <$fh>){
   chomp $line;
   if ($line =~ /^>(.*)/){
      $id = $1;
      next;
   } else {
      $seq{$id} .= $line;
   }
}
close($fh) or die "Could not read $fasta: $!\n";

# hash order changes from run to run; sorting the IDs makes the same seed
# pick the same sequences every time
my @seq_id = sort keys %seq;
die "No sequences found in $fasta\n" unless @seq_id;

my $name = 'l' . $len . '_' . 'n' . $num . '_' . 'd' . $inner_mate . '_' . $seed;
my $first_out  = $name . '_1.fq.gz';
my $second_out = $name . '_2.fq.gz';

open(my $read1, '|-', "gzip >$first_out") or die "Could not write output to $first_out: $!\n";
open(my $read2, '|-', "gzip >$second_out") or die "Could not write output to $second_out: $!\n";

# on Illumina 1.8+ ! is the worst quality
# and J is the best
my $fake_qual = 'J' x $len;

foreach my $i (1 .. $num){

   my $first_index = int(rand(scalar @seq_id));
   my $seq_id  = $seq_id[$first_index];
   my $seq     = $seq{$seq_id};
   my $seq_len = length($seq);

   if ($len > $seq_len){
      die "Your read length ($len) is longer than the sequence $seq_id\n";
   }

   # last start position that leaves room for read 1, the gap and read 2
   my $limit = $seq_len - $len - $len - $inner_mate;
   if ($limit < 0){
      die "Sequence $seq_id ($seq_len bp) is too short for two reads of $len bp with an inner mate distance of $inner_mate\n";
   }

   my $first_start = int(rand($limit + 1));
   my $first_read  = substr($seq, $first_start, $len);
   my $first_pos   = $first_start + 1;
   print {$read1} "\@$i:${seq_id}_$first_pos\n$first_read\n+\n$fake_qual\n";

   my $second_read = '';
   # rand(1) is drawn for every pair so the random stream does not depend on
   # how many sequences there are
   if ($discord > rand(1) && scalar @seq_id > 1){
      # discordant pair: take the mate from one of the other sequences
      my $second_index = int(rand(scalar(@seq_id) - 1));
      ++$second_index if $second_index >= $first_index;
      my $seq_id2  = $seq_id[$second_index];
      my $seq2     = $seq{$seq_id2};
      my $seq2_len = length($seq2);
      if ($len > $seq2_len){
         die "Your read length ($len) is longer than the sequence $seq_id2\n";
      }
      my $second_start = int(rand($seq2_len - $len + 1));
      $second_read = substr($seq2, $second_start, $len);
   } else {
      # the inner mate distance is the gap between the end of read 1 and the
      # start of read 2, so the read length has to be skipped as well; this
      # matches the space reserved by $limit above
      my $second_start = $first_start + $len + $inner_mate;
      $second_read = substr($seq, $second_start, $len);
   }

   # reverse complement, keeping the case of soft-masked bases
   $second_read = reverse($second_read);
   $second_read =~ tr/ACGTacgt/TGCAtgca/;
   # reads IDs need to match!
   print {$read2} "\@$i:${seq_id}_$first_pos\n$second_read\n+\n$fake_qual\n";
}

# closing a pipe waits for gzip and reports its failure
close($read1) or die "Could not finish writing $first_out: $!\n";
close($read2) or die "Could not finish writing $second_out: $!\n";

exit(0);

# Print the usage message to STDERR and exit.
sub usage {
print STDERR <<"EOF";
Usage: $0 -f <infile.fa> -l <100> -n <100000> -m <500> [-s 1984] [-d 0]

Where:   -f         FASTA file
         -l         read lengths
         -n         number of reads
         -m         inner mate distance
         -s         seed (default: 1984)
         -d         fraction discordant (default: 0)
         -h         this helpful usage message

EOF
exit();
}

__END__