├── .github └── workflows │ └── python-package.yml ├── .gitignore ├── .pyup.yml ├── .travis.yml ├── Dockerfile ├── MANIFEST.in ├── README.md ├── fixture ├── deep.bam ├── deep.bam.bai ├── gc.txt ├── longcontignames.bam ├── longcontignames.bam.bai ├── ref.fa ├── ref.fa.fai ├── ref.upper.fa ├── ref.upper.fa.fai ├── regression.npz ├── rna.bam ├── rna.bam.bai ├── test.bam └── test.bam.bai ├── performance.py ├── pyproject.toml ├── pysamstats ├── __init__.py ├── binned.py ├── config.py ├── io.py ├── opt.c ├── opt.pyx ├── pileup.py ├── test │ ├── __init__.py │ ├── test_binned.py │ ├── test_io.py │ ├── test_pileup.py │ ├── test_regression.py │ └── util.py └── util.py ├── release.txt ├── requirements.txt ├── requirements_dev.txt ├── sandbox.ipynb ├── scripts └── pysamstats └── setup.py /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: [3.7, 3.8, 3.9] 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest 31 | pip install -r requirements_dev.txt 32 | python setup.py build_ext --inplace 33 | pip install -v . 34 | # - name: Lint with flake8 35 | # run: | 36 | # # stop the build if there are Python syntax errors or undefined names 37 | # flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 38 | # # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 39 | # flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 40 | - name: Test 41 | run: | 42 | nosetests -v 43 | pysamstats --help 44 | pysamstats --type coverage fixture/test.bam > coverage.txt 45 | pysamstats --type coverage --output=coverage.txt fixture/test.bam 46 | pysamstats --type coverage --output=coverage.h5 --format=hdf5 fixture/test.bam 47 | pysamstats --type coverage --fields=pos,reads_all fixture/test.bam > coverage_fields.txt 48 | pysamstats --type coverage_binned --fasta=fixture/ref.fa fixture/test.bam > coverage_binned.txt 49 | pysamstats --type coverage_binned --fasta=fixture/ref.fa --output=coverage_binned.txt fixture/test.bam 50 | pysamstats --type coverage_binned --fasta=fixture/ref.fa --output=coverage_binned.h5 --format=hdf5 fixture/test.bam 51 | pysamstats --type=coverage --min-mapq=27 --min-baseq=17 --no-dup --no-del fixture/test.bam > coverage_filtered.txt 52 | pysamstats --type=coverage_binned --fasta=fixture/ref.fa --min-mapq=27 --no-dup fixture/test.bam > coverage_binned_filtered.txt 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | 16 | # Installer logs 17 | pip-log.txt 18 | 19 | # Unit test / coverage reports 20 | .coverage 21 | .tox 22 | 23 | #Translations 24 | *.mo 25 | 26 | #Mr Developer 27 | .mr.developer.cfg 28 | 29 | MANIFEST 30 | spike* 31 | *.so 32 | *.html 33 | *~ 34 | *.prof 35 | 36 | .idea 37 | .project 38 | .pydevproject 39 | -------------------------------------------------------------------------------- /.pyup.yml: -------------------------------------------------------------------------------- 1 | # autogenerated pyup.io config file 2 | # see https://pyup.io/docs/configuration/ for all available options 3 | 4 | schedule: every month 5 | 6 | requirements: 7 | - requirements.txt: 8 | pin: False 9 | update: False 10 | - requirements_dev.txt: 11 | pin: True 12 | update: all 13 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | branches: 4 | only: 5 | - master 6 | 7 | sudo: false 8 | 9 | dist: xenial 10 | 11 | addons: 12 | apt: 13 | packages: 14 | - libhdf5-serial-dev 15 | 16 | matrix: 17 | include: 18 | - python: 3.6 19 | - python: 3.7 20 | - python: 3.8 21 | - python: 3.9 22 | sudo: true 23 | 24 | install: 25 | - export HDF5_DIR=/usr/lib/x86_64-linux-gnu/hdf5/serial 26 | - pip install -U pip setuptools wheel 27 | - pip install -r requirements_dev.txt 28 | - python setup.py build_ext --inplace 29 | - pip install -v . 
30 | 
31 | script:
32 |   - nosetests -v
33 |   - pysamstats --help
34 |   - pysamstats --type coverage fixture/test.bam > coverage.txt
35 |   - pysamstats --type coverage --output=coverage.txt fixture/test.bam
36 |   - pysamstats --type coverage --output=coverage.h5 --format=hdf5 fixture/test.bam
37 |   - pysamstats --type coverage --fields=pos,reads_all fixture/test.bam > coverage_fields.txt
38 |   - pysamstats --type coverage_binned --fasta=fixture/ref.fa fixture/test.bam > coverage_binned.txt
39 |   - pysamstats --type coverage_binned --fasta=fixture/ref.fa --output=coverage_binned.txt fixture/test.bam
40 |   - pysamstats --type coverage_binned --fasta=fixture/ref.fa --output=coverage_binned.h5 --format=hdf5 fixture/test.bam
41 |   - pysamstats --type=coverage --min-mapq=27 --min-baseq=17 --no-dup --no-del fixture/test.bam > coverage_filtered.txt
42 |   - pysamstats --type=coverage_binned --fasta=fixture/ref.fa --min-mapq=27 --no-dup fixture/test.bam > coverage_binned_filtered.txt
43 | 
-------------------------------------------------------------------------------- /Dockerfile: --------------------------------------------------------------------------------
1 | FROM continuumio/miniconda:4.5.4
2 | LABEL description="pysamstats - A fast utility for extracting statistics from a SAM or BAM file."
3 | 
4 | RUN conda install -c bioconda pysamstats
5 | 
-------------------------------------------------------------------------------- /MANIFEST.in: --------------------------------------------------------------------------------
1 | include *.md
2 | include pyproject.toml
3 | recursive-include pysamstats *.pyx *.pxd *.c
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | pysamstats
2 | ==========
3 | 
4 | A Python utility for calculating statistics against genome positions
5 | based on sequence alignments from a SAM or BAM file.
6 | 
7 | * Source: https://github.com/alimanfoo/pysamstats
8 | * Download: http://pypi.python.org/pypi/pysamstats
9 | * Release notes: https://github.com/alimanfoo/pysamstats/releases
10 | 
11 | Installation
12 | ------------
13 | 
14 | The easiest way to install pysamstats is via conda, e.g.:
15 | 
16 | ```
17 | $ conda install -c bioconda pysamstats
18 | ```
19 | 
20 | Alternatively, pysamstats can be installed via pip:
21 | 
22 | ```
23 | $ pip install pysamstats
24 | ```
25 | 
26 | Or clone the git repo and install from source:
27 | 
28 | ```
29 | $ git clone git://github.com/alimanfoo/pysamstats.git
30 | $ cd pysamstats
31 | $ python setup.py install
32 | $ nosetests -v # optional, run test suite
33 | ```
34 | 
35 | If you have problems installing pysam, please email the
36 | [pysam user group](https://groups.google.com/forum/#!forum/pysam-user-group).
37 | 
38 | N.B., some functions also require [numpy](http://www.numpy.org) and
39 | [pytables](http://www.pytables.org) to be installed.
40 | 
41 | Usage
42 | -----
43 | 
44 | From the command line:
45 | 
46 | ```
47 | $ pysamstats --help
48 | Usage: pysamstats [options] FILE
49 | 
50 | Calculate statistics against genome positions based on sequence alignments
51 | from a SAM or BAM file and print them to stdout.
52 | 53 | Options: 54 | -h, --help show this help message and exit 55 | -t TYPE, --type=TYPE Type of statistics to print, one of: alignment_binned, 56 | baseq, baseq_ext, baseq_ext_strand, baseq_strand, 57 | coverage, coverage_binned, coverage_ext, 58 | coverage_ext_binned, coverage_ext_strand, coverage_gc, 59 | coverage_strand, mapq, mapq_binned, mapq_strand, tlen, 60 | tlen_binned, tlen_strand, variation, variation_strand. 61 | -c CHROMOSOME, --chromosome=CHROMOSOME 62 | Chromosome name. 63 | -s START, --start=START 64 | Start position (1-based). 65 | -e END, --end=END End position (1-based). 66 | -z, --zero-based Use zero-based coordinates (default is false, i.e., 67 | use one-based coords). 68 | -u, --truncate Truncate pileup-based stats so no records are emitted 69 | outside the specified range. 70 | -S STEPPER, --stepper=STEPPER 71 | Stepper to provide to underlying pysam call. Options 72 | are:"all" (default): all reads are returned, except 73 | where flags BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, 74 | BAM_FDUP set; "nofilter" applies no filter to returned 75 | reads; "samtools": filter & read processing as in 76 | _csamtools_ pileup. This requires a fasta file. For 77 | complete details see the pysam documentation. 78 | -d, --pad Pad pileup-based stats so a record is emitted for 79 | every position (default is only covered positions). 80 | -D MAX_DEPTH, --max-depth=MAX_DEPTH 81 | Maximum read depth permitted in pileup-based 82 | statistics. The default limit is 8000. 83 | -f FASTA, --fasta=FASTA 84 | Reference sequence file, only required for some 85 | statistics. 86 | -o, --omit-header Omit header row from output. 87 | -p N, --progress=N Report progress every N rows. 88 | --window-size=N Size of window for binned statistics (default is 300). 89 | --window-offset=N Window offset to use for deciding which genome 90 | position to report binned statistics against. The 91 | default is 150, i.e., the middle of 300bp window. 92 | --format=FORMAT Output format, one of {tsv, csv, hdf5} (defaults to 93 | tsv). N.B., hdf5 requires PyTables to be installed. 94 | --output=OUTPUT Path to output file. If not provided, write to stdout. 95 | --fields=FIELDS Comma-separated list of fields to output (defaults to 96 | all fields). 97 | --hdf5-group=HDF5_GROUP 98 | Name of HDF5 group to write to (defaults to the root 99 | group). 100 | --hdf5-dataset=HDF5_DATASET 101 | Name of HDF5 dataset to create (defaults to "data"). 102 | --hdf5-complib=HDF5_COMPLIB 103 | HDF5 compression library (defaults to zlib). 104 | --hdf5-complevel=HDF5_COMPLEVEL 105 | HDF5 compression level (defaults to 5). 106 | --hdf5-chunksize=HDF5_CHUNKSIZE 107 | Size of chunks in number of bytes (defaults to 2**20). 108 | --min-mapq=MIN_MAPQ Only reads with mapping quality equal to or greater 109 | than this value will be counted (0 by default). 110 | --min-baseq=MIN_BASEQ 111 | Only reads with base quality equal to or greater than 112 | this value will be counted (0 by default). Only 113 | applies to pileup-based statistics. 114 | --no-dup Don't count reads flagged as duplicate. 115 | --no-del Don't count reads aligned with a deletion at the given 116 | position. Only applies to pileup-based statistics. 117 | 118 | Pileup-based statistics types (each row has statistics over reads in a pileup column): 119 | 120 | * coverage - Number of reads aligned to each genome position 121 | (total and properly paired). 122 | * coverage_strand - As coverage but with forward/reverse strand counts. 
123 | * coverage_ext - Various additional coverage metrics, including
124 |                  coverage for reads not properly paired (mate
125 |                  unmapped, mate on other chromosome, ...).
126 | * coverage_ext_strand - As coverage_ext but with forward/reverse strand counts.
127 | * coverage_gc - As coverage but also includes a column for %GC.
128 | * variation - Numbers of matches, mismatches, deletions,
129 |               insertions, etc.
130 | * variation_strand - As variation but with forward/reverse strand counts.
131 | * tlen - Insert size statistics.
132 | * tlen_strand - As tlen but with statistics by forward/reverse strand.
133 | * mapq - Mapping quality statistics.
134 | * mapq_strand - As mapq but with statistics by forward/reverse strand.
135 | * baseq - Base quality statistics.
136 | * baseq_strand - As baseq but with statistics by forward/reverse strand.
137 | * baseq_ext - Extended base quality statistics, including qualities
138 |               of bases matching and mismatching reference.
139 | * baseq_ext_strand - As baseq_ext but with statistics by forward/reverse strand.
140 | 
141 | Binned statistics types (each row has statistics over reads aligned starting within a genome window):
142 | 
143 | * coverage_binned - As coverage but binned.
144 | * coverage_ext_binned - As coverage_ext but binned.
145 | * mapq_binned - Similar to mapq but binned.
146 | * alignment_binned - Aggregated counts from cigar strings.
147 | * tlen_binned - As tlen but binned.
148 | 
149 | Examples:
150 | 
151 |     pysamstats --type coverage example.bam > example.coverage.txt
152 |     pysamstats --type coverage --chromosome Pf3D7_v3_01 --start 100000 --end 200000 example.bam > example.coverage.txt
153 | 
154 | Version: 1.1.2 (pysam 0.15.1)
155 | ```
156 | 
157 | From Python:
158 | 
159 | ```python
160 | import pysam
161 | import pysamstats
162 | 
163 | mybam = pysam.AlignmentFile('/path/to/your/bamfile.bam')
164 | 
165 | # iterate over statistics, one record at a time
166 | for rec in pysamstats.stat_coverage(mybam, chrom='Pf3D7_01_v3', start=10000, end=20000):
167 |     print(rec['chrom'], rec['pos'], rec['reads_all'], rec['reads_pp'])
168 | 
169 | 
170 | ```
171 | 
172 | For convenience, functions are provided for loading data directly into numpy arrays, e.g.:
173 | 
174 | ```python
175 | import pysam
176 | import pysamstats
177 | import matplotlib.pyplot as plt
178 | 
179 | mybam = pysam.AlignmentFile('/path/to/your/bamfile.bam')
180 | a = pysamstats.load_coverage(mybam, chrom='Pf3D7_01_v3', start=10000, end=20000)
181 | plt.plot(a.pos, a.reads_all)
182 | plt.show()
183 | ```
184 | 
185 | For pileup-based statistics functions, note the following:
186 | 
187 | * By default a row is emitted for all genome positions covered by reads overlapping the selected region. This means rows will be emitted for positions outside the selected region, but statistics may not be accurate as not all reads overlapping that position will have been counted. To truncate output to exactly the selected region, provide a ``truncate=True`` keyword argument.
188 | * By default a row is only emitted for genome positions covered by at least one read. To emit a row for every genome position, provide a ``pad=True`` keyword argument.
189 | * By default the number of reads in a pileup column is limited to 8000. To increase this limit, provide a ``max_depth=100000`` keyword argument (or whatever number is suitable for your situation).
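
For example, combining these options in one call (a sketch; the BAM path and region are placeholders):

```python
import pysam
import pysamstats

mybam = pysam.AlignmentFile('/path/to/your/bamfile.bam')

# emit a row for every position in the selected region, and only for that
# region, allowing up to 100000 reads per pileup column
for rec in pysamstats.stat_coverage(mybam, chrom='Pf3D7_01_v3',
                                    start=10000, end=20000,
                                    truncate=True, pad=True, max_depth=100000):
    print(rec['chrom'], rec['pos'], rec['reads_all'])
```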
190 | 191 | Field definitions 192 | ----------------- 193 | 194 | The suffix **_fwd** means the field is restricted to reads mapped to 195 | the forward strand, and **_rev** means the field is restricted to 196 | reads mapped to the reverse strand. E.g., **reads_fwd** means the 197 | number of reads mapped to the forward strand. 198 | 199 | The suffix **_pp** means the field is restricted to reads flagged as 200 | properly paired. 201 | 202 | * **chrom** - Chromosome name. 203 | 204 | * **pos** - Position within chromosome. One-based by default when 205 | using the command line, zero-based by default when using the 206 | python API. 207 | 208 | * **reads_all** - Number of reads aligned at the position. N.b., this 209 | is really the total, i.e., includes reads where the mate is 210 | unmapped or otherwise not properly paired. 211 | 212 | * **reads_pp** - Number of reads flagged as properly paired by the 213 | aligner. 214 | 215 | * **reads_mate_unmapped** - Number of reads where the mate is 216 | unmapped. 217 | 218 | * **reads_mate_other_chr** - Number of reads where the mate is mapped 219 | to another chromosome. 220 | 221 | * **reads_mate_same_strand** - Number of reads where the mate is 222 | mapped to the same strand. 223 | 224 | * **reads_faceaway** - Number of reads where the read and its mate are 225 | mapped facing away from each other. 226 | 227 | * **reads_softclipped** - Number of reads where there is some 228 | softclipping at some point in the read's alignment (not 229 | necessarily at this position). 230 | 231 | * **reads_duplicate** - Number of reads that are flagged as duplicate. 232 | 233 | * **gc** - Percentage GC content in the reference at this position 234 | (depends on window length and offset specified). 235 | 236 | * **matches** - Number of reads where the aligned base matches the 237 | reference. 238 | 239 | * **mismatches** - Number of reads where the aligned base does not 240 | match the reference (but is not a deletion). 241 | 242 | * **deletions** - Number of reads where there is a deletion in the 243 | alignment at this position. 244 | 245 | * **insertions** - Number of reads where there is an insertion in the 246 | alignment at this position. 247 | 248 | * **A/C/T/G/N** - Number of reads where the aligned base is an A/C/T/G/N. 249 | 250 | * **mean_tlen** - Mean value of outer distance between reads and their 251 | mates for paired reads aligned at this position. N.B., leftmost 252 | reads in a pair have a positive tlen, rightmost reads have a 253 | negative tlen, so if there is no strand bias, this value should be 254 | 0. 255 | 256 | * **rms_tlen** - Root-mean-square value of outer distance between 257 | reads and their mates for paired reads aligned at this position. 258 | 259 | * **std_tlen** - Standard deviation of outer distance between reads 260 | and their mates for paired reads aligned at this position. 261 | 262 | * **reads_mapq0** - Number of reads where mapping quality is zero. 263 | 264 | * **rms_mapq** - Root-mean-square mapping quality for reads aligned at 265 | this position. 266 | 267 | * **max_mapq** - Maximum value of mapping quality for reads aligned at 268 | this position. 269 | 270 | * **rms_baseq** - Root-mean-square value of base qualities for bases 271 | aligned at this position. 272 | 273 | * **rms_baseq_matches** - Root-mean-square value of base qualities for 274 | bases aligned at this position where the base matches the 275 | reference. 
276 | 277 | * **rms_baseq_mismatches** - Root-mean-square value of base qualities 278 | for bases aligned at this position where the base does not match 279 | the reference. 280 | 281 | Release notes 282 | ------------- 283 | 284 | **1.1.2** 285 | 286 | * Fix missing numpy as install requirement. 287 | 288 | **1.1.1** 289 | 290 | * Fix missing pyproject.toml in source distribution. 291 | 292 | **1.1.0** 293 | 294 | * Appropriate size dtype for chromosome names is now determined 295 | dynamically, no need to manually configure for longer 296 | chromosome/contig names. By [Nick 297 | Harding](https://github.com/hardingnj), 298 | [#72](https://github.com/alimanfoo/pysamstats/issues/72), 299 | [#74](https://github.com/alimanfoo/pysamstats/issues/74). 300 | 301 | * Expose 'stepper' option via Python and command line API, to allow 302 | setting of different pileup behaviours. By [Nick 303 | Harding](https://github.com/hardingnj), 304 | [#78](https://github.com/alimanfoo/pysamstats/issues/78), 305 | [#86](https://github.com/alimanfoo/pysamstats/pull/86). 306 | 307 | * Expose options `min_mapq`, `min_baseq`, `no_del`, `no_dup` via 308 | load_*() functions. By [nrkssa](https://github.com/nrkssa), 309 | [#93](https://github.com/alimanfoo/pysamstats/pull/93). 310 | 311 | * Add pyproject.toml for package build requirements, which means that 312 | there is no need to manually install pysam before installing 313 | pysamstats via pip. By [Michiel 314 | Vermeir](https://github.com/michielvermeir), 315 | [#97](https://github.com/alimanfoo/pysamstats/pull/97). 316 | 317 | * Added a regression test to ensure consistent outputs in future 318 | package versions. By [Nick Harding](https://github.com/hardingnj), 319 | [#79](https://github.com/alimanfoo/pysamstats/issues/79). 320 | 321 | * Pysam dependency upgraded to 0.15. 322 | 323 | **1.0.1** 324 | 325 | * Changed output of deletions field in variation stats to exclude RNA reads aligned with a splice 326 | ("N" in cigar) ([#65](https://github.com/alimanfoo/pysamstats/issues/65)) 327 | 328 | **1.0.0** 329 | 330 | * Upgrades for compatibility with pysam 0.11. 331 | * Added options for filtering reads based on mapping quality, base quality, deletion status and duplicate flag. 
332 | -------------------------------------------------------------------------------- /fixture/deep.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/deep.bam -------------------------------------------------------------------------------- /fixture/deep.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/deep.bam.bai -------------------------------------------------------------------------------- /fixture/longcontignames.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/longcontignames.bam -------------------------------------------------------------------------------- /fixture/longcontignames.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/longcontignames.bam.bai -------------------------------------------------------------------------------- /fixture/ref.fa.fai: -------------------------------------------------------------------------------- 1 | Pf3D7_01_v3 640851 13 60 61 2 | Pf3D7_02_v3 947102 651558 60 61 3 | Pf3D7_03_v3 811800 1614459 60 61 4 | -------------------------------------------------------------------------------- /fixture/ref.upper.fa.fai: -------------------------------------------------------------------------------- 1 | Pf3D7_01_v3 640851 13 60 61 2 | -------------------------------------------------------------------------------- /fixture/regression.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/regression.npz -------------------------------------------------------------------------------- /fixture/rna.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/rna.bam -------------------------------------------------------------------------------- /fixture/rna.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/rna.bam.bai -------------------------------------------------------------------------------- /fixture/test.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/test.bam -------------------------------------------------------------------------------- /fixture/test.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/test.bam.bai -------------------------------------------------------------------------------- /performance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function, division, absolute_import 4 | 5 | import sys 6 | 
import pstats
7 | import cProfile as profile
8 | import timeit
9 | 
10 | 
11 | from pysam import Samfile, Fastafile
12 | 
13 | 
14 | sys.path.append('.')
15 | import pysamstats
16 | 
17 | 
18 | def do_profiling(fun, end=1000):
19 |     samfile = Samfile('fixture/test.bam')
20 |     count = 0
21 |     f = getattr(pysamstats, fun)
22 |     for _ in f(samfile, chrom='Pf3D7_01_v3', start=0, end=end):
23 |         count += 1
24 | 
25 | 
26 | def do_profiling_withrefseq(fun, end=1000):
27 |     samfile = Samfile('fixture/test.bam')
28 |     fafile = Fastafile('fixture/ref.fa')
29 |     count = 0
30 |     f = getattr(pysamstats, fun)
31 |     for _ in f(samfile, fafile, chrom='Pf3D7_01_v3', start=0, end=end):
32 |         count += 1
33 | 
34 | 
35 | stats_types_requiring_fasta = ('variation',
36 |                                'variation_strand',
37 |                                'baseq_ext',
38 |                                'baseq_ext_strand',
39 |                                'coverage_gc',
40 |                                'coverage_normed_gc',
41 |                                'coverage_binned',
42 |                                'coverage_ext_binned')
43 | 
44 | fun = sys.argv[1]
45 | if len(sys.argv) > 2:
46 |     end = int(sys.argv[2])
47 | else:
48 |     end = 1000
49 | if len(sys.argv) > 3:
50 |     number = int(sys.argv[3])
51 | else:
52 |     number = 1
53 | if len(sys.argv) > 4:
54 |     repeat = int(sys.argv[4])
55 | else:
56 |     repeat = 3
57 | 
58 | if fun in stats_types_requiring_fasta:
59 |     cmd = 'do_profiling_withrefseq("stat_%s", %s)' % (fun, end)
60 | else:
61 |     cmd = 'do_profiling("stat_%s", %s)' % (fun, end)
62 | 
63 | prof_fn = '%s.prof' % fun
64 | profile.runctx(cmd, globals(), locals(), prof_fn)
65 | s = pstats.Stats(prof_fn)
66 | s.strip_dirs().sort_stats('time').print_stats()
67 | print(timeit.repeat(cmd,
68 |                     number=number,
69 |                     repeat=repeat,
70 |                     setup='from __main__ import do_profiling, '
71 |                           'do_profiling_withrefseq'))
72 | 
73 | 
74 | 
-------------------------------------------------------------------------------- /pyproject.toml: --------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel", "pysam (<0.16)"]
3 | 
4 | [tool.pysamstats]
5 | name='pysamstats'
6 | author='Alistair Miles'
7 | author_email='alimanfoo@googlemail.com'
8 | url='https://github.com/alimanfoo/pysamstats'
9 | license='MIT License'
10 | description="""A Python utility for calculating statistics against genome
11 | position based on sequence alignments from a SAM,
12 | BAM or CRAM file."""
13 | scripts=['scripts/pysamstats']
14 | classifiers=[
15 |     'Intended Audience :: Developers',
16 |     'License :: OSI Approved :: MIT License',
17 |     'Programming Language :: Python :: 2.7',
18 |     'Programming Language :: Python :: 3.5',
19 |     'Programming Language :: Python :: 3.6',
20 |     'Programming Language :: Python :: 3.7',
21 |     'Topic :: Software Development :: Libraries :: Python Modules'
22 | ]
23 | 
-------------------------------------------------------------------------------- /pysamstats/__init__.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, print_function, division
3 | 
4 | 
5 | __version__ = '1.1.2'
6 | 
7 | 
8 | from .pileup import *
9 | from .binned import *
-------------------------------------------------------------------------------- /pysamstats/binned.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, print_function, division
3 | import functools
4 | 
5 | 
6 | import pysamstats.opt as opt
7 | import pysamstats.util as util
8 | import pysamstats.config as config
9 | 
10 | 
11 | _doc_params = """
12 |     Parameters
13 |     ----------
14 |     type : string
15 |         Statistics type. One of "coverage", "coverage_ext", "mapq", "alignment", "tlen".
16 |     alignmentfile : pysam.AlignmentFile or string
17 |         SAM or BAM file or file path.
18 |     fafile : pysam.FastaFile or string
19 |         FASTA file or file path, not required for all statistics types.
20 |     chrom : string
21 |         Chromosome/contig.
22 |     start : int
23 |         Start position.
24 |     end : int
25 |         End position.
26 |     one_based : bool
27 |         Coordinate system, False if zero-based (default), True if one-based.
28 |     window_size : int
29 |         Window size to use.
30 |     window_offset : int
31 |         Distance from window start to record position.
32 |     min_mapq : int, optional
33 |         Only reads with mapping quality equal to or greater than this value will be counted (0
34 |         by default).
35 |     no_dup : bool, optional
36 |         If True, don't count reads flagged as duplicate."""
37 | 
38 | 
39 | 
40 | # noinspection PyShadowingBuiltins
41 | def stat_binned(type,
42 |                 alignmentfile,
43 |                 fafile=None,
44 |                 chrom=None,
45 |                 start=None,
46 |                 end=None,
47 |                 one_based=False,
48 |                 window_size=300,
49 |                 window_offset=None,
50 |                 min_mapq=0,
51 |                 no_dup=False):
52 |     """Generate statistics per genome window, based on all reads whose alignment starts within
53 |     the window.
54 |     {params}
55 | 
56 |     Returns
57 |     -------
58 |     recs : iterator
59 |         An iterator yielding dict objects, where each dict holds data for a single window.
60 | 
61 |     """
62 | 
63 |     try:
64 |         stat = stats_classes_binned[type]()
65 |     except KeyError:
66 |         raise ValueError('unsupported statistics type: %r' % type)
67 | 
68 |     return opt.iter_binned(stat, alignmentfile=alignmentfile, fafile=fafile, chrom=chrom,
69 |                            start=start, end=end, one_based=one_based, window_size=window_size,
70 |                            window_offset=window_offset, min_mapq=min_mapq, no_dup=no_dup)
71 | 
72 | 
73 | stat_binned.__doc__ = stat_binned.__doc__.format(params=_doc_params)
74 | 
75 | 
76 | # noinspection PyShadowingBuiltins
77 | def load_binned(type,
78 |                 alignmentfile,
79 |                 fafile=None,
80 |                 chrom=None,
81 |                 start=None,
82 |                 end=None,
83 |                 one_based=False,
84 |                 window_size=300,
85 |                 window_offset=None,
86 |                 min_mapq=0,
87 |                 no_dup=False,
88 |                 dtype=None,
89 |                 fields=None):
90 |     """Load statistics per genome window, based on all reads whose alignment starts within
91 |     the window.
92 |     {params}
93 |     dtype : dtype
94 |         Override default dtype.
95 |     fields : string or list of strings
96 |         Select a subset of fields to load.
97 | 
98 |     Returns
99 |     -------
100 |     ra : numpy structured array
101 |         A structured array.
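
    Examples
    --------
    A minimal sketch (the BAM and FASTA file paths here are placeholders):

        import pysamstats
        a = pysamstats.load_binned('coverage', 'example.bam', fafile='ref.fa',
                                   chrom='chr1', window_size=300)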
102 | 
103 |     """
104 | 
105 |     statfun = functools.partial(stat_binned, type)
106 |     try:
107 |         default_dtype = getattr(config, 'dtype_' + type + '_binned')
108 |     except AttributeError:
109 |         raise ValueError('unsupported statistics type: %r' % type)
110 | 
111 |     return util.load_stats(statfun, user_dtype=dtype, default_dtype=default_dtype,
112 |                            user_fields=fields, alignmentfile=alignmentfile, fafile=fafile,
113 |                            chrom=chrom, start=start, end=end, one_based=one_based,
114 |                            window_size=window_size, window_offset=window_offset,
115 |                            min_mapq=min_mapq, no_dup=no_dup)
116 | 
117 | 
118 | load_binned.__doc__ = load_binned.__doc__.format(params=_doc_params)
119 | 
120 | 
121 | stats_classes_binned = {
122 |     'coverage': opt.CoverageBinned,
123 |     'coverage_ext': opt.CoverageExtBinned,
124 |     'mapq': opt.MapqBinned,
125 |     'alignment': opt.AlignmentBinned,
126 |     'tlen': opt.TlenBinned,
127 | }
128 | 
129 | 
130 | # backwards compatibility
131 | #########################
132 | 
133 | 
134 | _stat_doc_lines = stat_binned.__doc__.split('\n')
135 | _load_doc_lines = load_binned.__doc__.split('\n')
136 | # strip "type" parameter
137 | _stat_doc = '\n'.join(_stat_doc_lines[:5] + _stat_doc_lines[7:])
138 | _load_doc = '\n'.join(_load_doc_lines[:5] + _load_doc_lines[7:])
139 | 
140 | 
141 | def _specialize(type):
142 |     statfun = functools.partial(stat_binned, type)
143 |     statfun.__doc__ = _stat_doc
144 |     statfun.__name__ = 'stat_' + type
145 |     loadfun = functools.partial(load_binned, type)
146 |     loadfun.__doc__ = _load_doc
147 |     loadfun.__name__ = 'load_' + type
148 |     return statfun, loadfun
149 | 
150 | 
151 | # named functions
152 | stat_coverage_binned, load_coverage_binned = _specialize('coverage')
153 | stat_coverage_ext_binned, load_coverage_ext_binned = _specialize('coverage_ext')
154 | stat_mapq_binned, load_mapq_binned = _specialize('mapq')
155 | stat_alignment_binned, load_alignment_binned = _specialize('alignment')
156 | stat_tlen_binned, load_tlen_binned = _specialize('tlen')
-------------------------------------------------------------------------------- /pysamstats/config.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, print_function, division
3 | 
4 | 
5 | stats_types_noref = ('coverage',
6 |                      'coverage_strand',
7 |                      'coverage_ext',
8 |                      'coverage_ext_strand',
9 |                      'tlen',
10 |                      'tlen_strand',
11 |                      'mapq',
12 |                      'mapq_strand',
13 |                      'baseq',
14 |                      'baseq_strand',
15 |                      'mapq_binned',
16 |                      'alignment_binned',
17 |                      'tlen_binned')
18 | 
19 | stats_types_withref = ('variation',
20 |                        'variation_strand',
21 |                        'baseq_ext',
22 |                        'baseq_ext_strand',
23 |                        'coverage_gc',
24 |                        'coverage_binned',
25 |                        'coverage_ext_binned')
26 | 
27 | stats_types = sorted(stats_types_noref + stats_types_withref)
28 | 
29 | stepper_types = ('nofilter',
30 |                  'samtools',
31 |                  'all')
32 | 
33 | dtype_coverage = [
34 |     ('chrom', 'a12'),
35 |     ('pos', 'i4'),
36 |     ('reads_all', 'i4'),
37 |     ('reads_pp', 'i4')
38 | ]
39 | 
40 | dtype_coverage_strand = [
41 |     ('chrom', 'a12'),
42 |     ('pos', 'i4'),
43 |     ('reads_all', 'i4'),
44 |     ('reads_fwd', 'i4'),
45 |     ('reads_rev', 'i4'),
46 |     ('reads_pp', 'i4'),
47 |     ('reads_pp_fwd', 'i4'),
48 |     ('reads_pp_rev', 'i4'),
49 | ]
50 | 
51 | dtype_coverage_ext = [
52 |     ('chrom', 'a12'),
53 |     ('pos', 'i4'),
54 |     ('reads_all', 'i4'),
55 |     ('reads_pp', 'i4'),
56 |     ('reads_mate_unmapped', 'i4'),
57 |     ('reads_mate_other_chr', 'i4'),
58 |     ('reads_mate_same_strand', 'i4'),
59 |     ('reads_faceaway', 'i4'),
60 |     ('reads_softclipped',
'i4'), 61 | ('reads_duplicate', 'i4') 62 | ] 63 | 64 | dtype_coverage_ext_strand = [ 65 | ('chrom', 'a12'), 66 | ('pos', 'i4'), 67 | ('reads_all', 'i4'), 68 | ('reads_fwd', 'i4'), 69 | ('reads_rev', 'i4'), 70 | ('reads_pp', 'i4'), 71 | ('reads_pp_fwd', 'i4'), 72 | ('reads_pp_rev', 'i4'), 73 | ('reads_mate_unmapped', 'i4'), 74 | ('reads_mate_unmapped_fwd', 'i4'), 75 | ('reads_mate_unmapped_rev', 'i4'), 76 | ('reads_mate_other_chr', 'i4'), 77 | ('reads_mate_other_chr_fwd', 'i4'), 78 | ('reads_mate_other_chr_rev', 'i4'), 79 | ('reads_mate_same_strand', 'i4'), 80 | ('reads_mate_same_strand_fwd', 'i4'), 81 | ('reads_mate_same_strand_rev', 'i4'), 82 | ('reads_faceaway', 'i4'), 83 | ('reads_faceaway_fwd', 'i4'), 84 | ('reads_faceaway_rev', 'i4'), 85 | ('reads_softclipped', 'i4'), 86 | ('reads_softclipped_fwd', 'i4'), 87 | ('reads_softclipped_rev', 'i4'), 88 | ('reads_duplicate', 'i4'), 89 | ('reads_duplicate_fwd', 'i4'), 90 | ('reads_duplicate_rev', 'i4'), 91 | ] 92 | 93 | dtype_variation = [ 94 | ('chrom', 'a12'), 95 | ('pos', 'i4'), 96 | ('ref', 'a1'), 97 | ('reads_all', 'i4'), 98 | ('reads_pp', 'i4'), 99 | ('matches', 'i4'), 100 | ('matches_pp', 'i4'), 101 | ('mismatches', 'i4'), 102 | ('mismatches_pp', 'i4'), 103 | ('deletions', 'i4'), 104 | ('deletions_pp', 'i4'), 105 | ('insertions', 'i4'), 106 | ('insertions_pp', 'i4'), 107 | ('A', 'i4'), 108 | ('A_pp', 'i4'), 109 | ('C', 'i4'), 110 | ('C_pp', 'i4'), 111 | ('T', 'i4'), 112 | ('T_pp', 'i4'), 113 | ('G', 'i4'), 114 | ('G_pp', 'i4'), 115 | ('N', 'i4'), 116 | ('N_pp', 'i4') 117 | ] 118 | 119 | dtype_variation_strand = [ 120 | ('chrom', 'a12'), 121 | ('pos', 'i4'), 122 | ('ref', 'a1'), 123 | ('reads_all', 'i4'), 124 | ('reads_fwd', 'i4'), 125 | ('reads_rev', 'i4'), 126 | ('reads_pp', 'i4'), 127 | ('reads_pp_fwd', 'i4'), 128 | ('reads_pp_rev', 'i4'), 129 | ('matches', 'i4'), 130 | ('matches_fwd', 'i4'), 131 | ('matches_rev', 'i4'), 132 | ('matches_pp', 'i4'), 133 | ('matches_pp_fwd', 'i4'), 134 | ('matches_pp_rev', 'i4'), 135 | ('mismatches', 'i4'), 136 | ('mismatches_fwd', 'i4'), 137 | ('mismatches_rev', 'i4'), 138 | ('mismatches_pp', 'i4'), 139 | ('mismatches_pp_fwd', 'i4'), 140 | ('mismatches_pp_rev', 'i4'), 141 | ('deletions', 'i4'), 142 | ('deletions_fwd', 'i4'), 143 | ('deletions_rev', 'i4'), 144 | ('deletions_pp', 'i4'), 145 | ('deletions_pp_fwd', 'i4'), 146 | ('deletions_pp_rev', 'i4'), 147 | ('insertions', 'i4'), 148 | ('insertions_fwd', 'i4'), 149 | ('insertions_rev', 'i4'), 150 | ('insertions_pp', 'i4'), 151 | ('insertions_pp_fwd', 'i4'), 152 | ('insertions_pp_rev', 'i4'), 153 | ('A', 'i4'), ('A_fwd', 'i4'), ('A_rev', 'i4'), 154 | ('A_pp', 'i4'), ('A_pp_fwd', 'i4'), ('A_pp_rev', 'i4'), 155 | ('C', 'i4'), ('C_fwd', 'i4'), ('C_rev', 'i4'), 156 | ('C_pp', 'i4'), ('C_pp_fwd', 'i4'), ('C_pp_rev', 'i4'), 157 | ('T', 'i4'), ('T_fwd', 'i4'), ('T_rev', 'i4'), 158 | ('T_pp', 'i4'), ('T_pp_fwd', 'i4'), ('T_pp_rev', 'i4'), 159 | ('G', 'i4'), ('G_fwd', 'i4'), ('G_rev', 'i4'), 160 | ('G_pp', 'i4'), ('G_pp_fwd', 'i4'), ('G_pp_rev', 'i4'), 161 | ('N', 'i4'), ('N_fwd', 'i4'), ('N_rev', 'i4'), 162 | ('N_pp', 'i4'), ('N_pp_fwd', 'i4'), ('N_pp_rev', 'i4') 163 | ] 164 | 165 | dtype_tlen = [ 166 | ('chrom', 'a12'), 167 | ('pos', 'i4'), 168 | ('reads_all', 'i4'), 169 | ('reads_paired', 'i4'), 170 | ('reads_pp', 'i4'), 171 | ('mean_tlen', 'i4'), 172 | ('mean_tlen_pp', 'i4'), 173 | ('rms_tlen', 'i4'), 174 | ('rms_tlen_pp', 'i4'), 175 | ('std_tlen', 'i4'), 176 | ('std_tlen_pp', 'i4') 177 | ] 178 | 179 | dtype_tlen_strand = [ 180 | ('chrom', 'a12'), 181 | 
('pos', 'i4'), 182 | ('reads_all', 'i4'), 183 | ('reads_fwd', 'i4'), 184 | ('reads_rev', 'i4'), 185 | ('reads_paired', 'i4'), 186 | ('reads_paired_fwd', 'i4'), 187 | ('reads_paired_rev', 'i4'), 188 | ('reads_pp', 'i4'), 189 | ('reads_pp_fwd', 'i4'), 190 | ('reads_pp_rev', 'i4'), 191 | ('mean_tlen', 'i4'), 192 | ('mean_tlen_fwd', 'i4'), 193 | ('mean_tlen_rev', 'i4'), 194 | ('mean_tlen_pp', 'i4'), 195 | ('mean_tlen_pp_fwd', 'i4'), 196 | ('mean_tlen_pp_rev', 'i4'), 197 | ('rms_tlen', 'i4'), 198 | ('rms_tlen_fwd', 'i4'), 199 | ('rms_tlen_rev', 'i4'), 200 | ('rms_tlen_pp', 'i4'), 201 | ('rms_tlen_pp_fwd', 'i4'), 202 | ('rms_tlen_pp_rev', 'i4'), 203 | ('std_tlen', 'i4'), 204 | ('std_tlen_fwd', 'i4'), 205 | ('std_tlen_rev', 'i4'), 206 | ('std_tlen_pp', 'i4'), 207 | ('std_tlen_pp_fwd', 'i4'), 208 | ('std_tlen_pp_rev', 'i4') 209 | ] 210 | 211 | dtype_mapq = [ 212 | ('chrom', 'a12'), 213 | ('pos', 'i4'), 214 | ('reads_all', 'i4'), 215 | ('reads_pp', 'i4'), 216 | ('reads_mapq0', 'i4'), 217 | ('reads_mapq0_pp', 'i4'), 218 | ('rms_mapq', 'i4'), 219 | ('rms_mapq_pp', 'i4'), 220 | ('max_mapq', 'i4'), 221 | ('max_mapq_pp', 'i4') 222 | ] 223 | 224 | dtype_mapq_strand = [ 225 | ('chrom', 'a12'), 226 | ('pos', 'i4'), 227 | ('reads_all', 'i4'), 228 | ('reads_fwd', 'i4'), 229 | ('reads_rev', 'i4'), 230 | ('reads_pp', 'i4'), 231 | ('reads_pp_fwd', 'i4'), 232 | ('reads_pp_rev', 'i4'), 233 | ('reads_mapq0', 'i4'), 234 | ('reads_mapq0_fwd', 'i4'), 235 | ('reads_mapq0_rev', 'i4'), 236 | ('reads_mapq0_pp', 'i4'), 237 | ('reads_mapq0_pp_fwd', 'i4'), 238 | ('reads_mapq0_pp_rev', 'i4'), 239 | ('rms_mapq', 'i4'), 240 | ('rms_mapq_fwd', 'i4'), 241 | ('rms_mapq_rev', 'i4'), 242 | ('rms_mapq_pp', 'i4'), 243 | ('rms_mapq_pp_fwd', 'i4'), 244 | ('rms_mapq_pp_rev', 'i4'), 245 | ('max_mapq', 'i4'), 246 | ('max_mapq_fwd', 'i4'), 247 | ('max_mapq_rev', 'i4'), 248 | ('max_mapq_pp', 'i4'), 249 | ('max_mapq_pp_fwd', 'i4'), 250 | ('max_mapq_pp_rev', 'i4'), 251 | ] 252 | 253 | dtype_baseq = [ 254 | ('chrom', 'a12'), 255 | ('pos', 'i4'), 256 | ('reads_all', 'i4'), 257 | ('reads_pp', 'i4'), 258 | ('rms_baseq', 'i4'), 259 | ('rms_baseq_pp', 'i4'), 260 | ] 261 | 262 | dtype_baseq_strand = [ 263 | ('chrom', 'a12'), 264 | ('pos', 'i4'), 265 | ('reads_all', 'i4'), 266 | ('reads_fwd', 'i4'), 267 | ('reads_rev', 'i4'), 268 | ('reads_pp', 'i4'), 269 | ('reads_pp_fwd', 'i4'), 270 | ('reads_pp_rev', 'i4'), 271 | ('rms_baseq', 'i4'), 272 | ('rms_baseq_fwd', 'i4'), 273 | ('rms_baseq_rev', 'i4'), 274 | ('rms_baseq_pp', 'i4'), 275 | ('rms_baseq_pp_fwd', 'i4'), 276 | ('rms_baseq_pp_rev', 'i4'), 277 | ] 278 | 279 | dtype_baseq_ext = [ 280 | ('chrom', 'a12'), 281 | ('pos', 'i4'), 282 | ('ref', 'a1'), 283 | ('reads_all', 'i4'), 284 | ('reads_pp', 'i4'), 285 | ('matches', 'i4'), 286 | ('matches_pp', 'i4'), 287 | ('mismatches', 'i4'), 288 | ('mismatches_pp', 'i4'), 289 | ('rms_baseq', 'i4'), 290 | ('rms_baseq_pp', 'i4'), 291 | ('rms_baseq_matches', 'i4'), 292 | ('rms_baseq_matches_pp', 'i4'), 293 | ('rms_baseq_mismatches', 'i4'), 294 | ('rms_baseq_mismatches_pp', 'i4'), 295 | ] 296 | 297 | dtype_baseq_ext_strand = [ 298 | ('chrom', 'a12'), 299 | ('pos', 'i4'), 300 | ('ref', 'a1'), 301 | ('reads_all', 'i4'), 302 | ('reads_fwd', 'i4'), 303 | ('reads_rev', 'i4'), 304 | ('reads_pp', 'i4'), 305 | ('reads_pp_fwd', 'i4'), 306 | ('reads_pp_rev', 'i4'), 307 | ('matches', 'i4'), 308 | ('matches_fwd', 'i4'), 309 | ('matches_rev', 'i4'), 310 | ('matches_pp', 'i4'), 311 | ('matches_pp_fwd', 'i4'), 312 | ('matches_pp_rev', 'i4'), 313 | ('mismatches', 'i4'), 314 | 
('mismatches_fwd', 'i4'), 315 | ('mismatches_rev', 'i4'), 316 | ('mismatches_pp', 'i4'), 317 | ('mismatches_pp_fwd', 'i4'), 318 | ('mismatches_pp_rev', 'i4'), 319 | ('rms_baseq', 'i4'), 320 | ('rms_baseq_fwd', 'i4'), 321 | ('rms_baseq_rev', 'i4'), 322 | ('rms_baseq_pp', 'i4'), 323 | ('rms_baseq_pp_fwd', 'i4'), 324 | ('rms_baseq_pp_rev', 'i4'), 325 | ('rms_baseq_matches', 'i4'), 326 | ('rms_baseq_matches_fwd', 'i4'), 327 | ('rms_baseq_matches_rev', 'i4'), 328 | ('rms_baseq_matches_pp', 'i4'), 329 | ('rms_baseq_matches_pp_fwd', 'i4'), 330 | ('rms_baseq_matches_pp_rev', 'i4'), 331 | ('rms_baseq_mismatches', 'i4'), 332 | ('rms_baseq_mismatches_fwd', 'i4'), 333 | ('rms_baseq_mismatches_rev', 'i4'), 334 | ('rms_baseq_mismatches_pp', 'i4'), 335 | ('rms_baseq_mismatches_pp_fwd', 'i4'), 336 | ('rms_baseq_mismatches_pp_rev', 'i4') 337 | ] 338 | 339 | dtype_coverage_gc = [ 340 | ('chrom', 'a12'), 341 | ('pos', 'i4'), 342 | ('gc', 'u1'), 343 | ('reads_all', 'i4'), 344 | ('reads_pp', 'i4') 345 | ] 346 | 347 | dtype_coverage_binned = [ 348 | ('chrom', 'a12'), 349 | ('pos', 'i4'), 350 | ('gc', 'u1'), 351 | ('reads_all', 'i4'), 352 | ('reads_pp', 'i4') 353 | ] 354 | 355 | dtype_coverage_ext_binned = [ 356 | ('chrom', 'a12'), 357 | ('pos', 'i4'), 358 | ('gc', 'u1'), 359 | ('reads_all', 'i4'), 360 | ('reads_pp', 'i4'), 361 | ('reads_mate_unmapped', 'i4'), 362 | ('reads_mate_other_chr', 'i4'), 363 | ('reads_mate_same_strand', 'i4'), 364 | ('reads_faceaway', 'i4'), 365 | ('reads_softclipped', 'i4'), 366 | ('reads_duplicate', 'i4') 367 | ] 368 | 369 | dtype_mapq_binned = [ 370 | ('chrom', 'a12'), 371 | ('pos', 'i4'), 372 | ('reads_all', 'i4'), 373 | ('reads_mapq0', 'i4'), 374 | ('rms_mapq', 'i4'), 375 | ] 376 | 377 | dtype_alignment_binned = [ 378 | ('chrom', 'a12'), 379 | ('pos', 'i4'), 380 | ('reads_all', 'i4'), 381 | ('bases_all', 'i4'), 382 | ('M', 'i4'), 383 | ('I', 'i4'), 384 | ('D', 'i4'), 385 | ('N', 'i4'), 386 | ('S', 'i4'), 387 | ('H', 'i4'), 388 | ('P', 'i4'), 389 | ('=', 'i4'), 390 | ('X', 'i4') 391 | ] 392 | 393 | dtype_tlen_binned = [ 394 | ('chrom', 'a12'), 395 | ('pos', 'i4'), 396 | ('reads_all', 'i4'), 397 | ('reads_pp', 'i4'), 398 | ('mean_tlen', 'i4'), 399 | ('mean_tlen_pp', 'i4'), 400 | ('rms_tlen', 'i4'), 401 | ('rms_tlen_pp', 'i4'), 402 | ] 403 | -------------------------------------------------------------------------------- /pysamstats/io.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | import itertools 4 | import time 5 | import csv 6 | import sys 7 | 8 | 9 | import pysamstats 10 | from pysamstats.util import flatten, determine_max_seqid 11 | import pysamstats.config as config 12 | 13 | 14 | def write_csv(stats_type, outfile, alignmentfile, fields=None, dialect='excel-tab', 15 | write_header=True, progress=None, **kwargs): 16 | """Write statistics output to a CSV file. 17 | 18 | Parameters 19 | ---------- 20 | 21 | stats_type : string 22 | Statistics type, one of 'coverage', 'coverage_ext', etc. 23 | outfile : file-like 24 | Output file to write to. 25 | alignmentfile : pysam.AlignmentFile or string 26 | Input BAM or SAM file or file path. 27 | fields : list of strings 28 | List of field names to output (all by default). 29 | dialect : string 30 | CSV dialect. 31 | write_header : bool 32 | If True write a header row. 33 | progress : int 34 | Log progress to stderr every N rows. 35 | **kwargs 36 | Passed through to the statistics function. 
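
    Examples
    --------
    A minimal sketch (the BAM file name and region here are placeholders;
    keyword arguments are passed through to the statistics function):

        import sys
        from pysamstats.io import write_csv
        write_csv('coverage', sys.stdout, 'example.bam',
                  chrom='chr1', start=0, end=2000)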
37 | 38 | """ 39 | 40 | # lookup stats function 41 | stats_function = getattr(pysamstats, 'stat_' + stats_type) 42 | 43 | # determine field names 44 | if fields is None: 45 | dtype = getattr(config, 'dtype_' + stats_type) 46 | fields = [t[0] for t in dtype] 47 | 48 | # setup record generator 49 | recs = stats_function(alignmentfile, **kwargs) 50 | 51 | # flatten records to rows 52 | rows = flatten(recs, *fields) 53 | 54 | # initialise writer 55 | writer = csv.writer(outfile, dialect=dialect) 56 | 57 | # write header row 58 | if write_header: 59 | writer.writerow(fields) 60 | 61 | if progress is None: 62 | # N.B., don't use writer.writerows(recs)! 63 | for row in rows: 64 | writer.writerow(row) 65 | 66 | else: 67 | counter = 0 68 | modulus = progress 69 | before = time.time() 70 | before_all = before 71 | for row in rows: 72 | counter += 1 73 | writer.writerow(row) 74 | if counter % modulus == 0: 75 | after = time.time() 76 | elapsed = after - before_all 77 | batch_elapsed = after - before 78 | msg = '[pysamstats] %s rows in %.2fs (%d rows/s); batch in ' \ 79 | '%.2fs (%d rows/s)' \ 80 | % (counter, elapsed, counter / elapsed, batch_elapsed, 81 | progress / batch_elapsed) 82 | print(msg, file=sys.stderr) 83 | before = after 84 | after_all = time.time() 85 | elapsed_all = after_all - before_all 86 | msg = '[pysamstats] %s rows in %.2fs (%d rows/s)' \ 87 | % (counter, elapsed_all, counter / elapsed_all) 88 | print(msg, file=sys.stderr) 89 | 90 | 91 | def write_hdf5(stats_type, outfile, alignmentfile, fields=None, progress=None, hdf5_group='/', 92 | hdf5_dataset='data', hdf5_complevel=1, hdf5_complib='zlib', hdf5_shuffle=True, 93 | hdf5_fletcher32=False, hdf5_chunksize=2**20, dtype=None, **kwargs): 94 | """Write statistics output to an HDF5 file. Requires PyTables. 95 | 96 | Parameters 97 | ---------- 98 | stats_type : string 99 | Statistics type, one of 'coverage', 'coverage_ext', etc. 100 | outfile : string 101 | Output file path. 102 | alignmentfile : pysam.AlignmentFile or string 103 | Input BAM or SAM file or file path. 104 | fields : list of strings 105 | List of field names to output (all by default). 106 | progress : int 107 | Log progress to stderr approximately every N rows. 108 | hdf5_group : string 109 | Group to write new dataset to. 110 | hdf5_dataset : string 111 | Name of dataset to create. 112 | hdf5_complib : string 113 | Name of compression library (defaults to 'zlib'). 114 | hdf5_complevel : int 115 | Compression level. 116 | hdf5_chunksize : int 117 | Size of chunks in number of bytes. 118 | hdf5_shuffle : bool 119 | If True, use byte shuffle filter. 120 | hdf5_fletcher32 : bool 121 | If True, use fletcher 32 filter. 122 | dtype : dict 123 | Override dtype. 124 | **kwargs 125 | Passed through to the statistics function. 126 | 127 | Notes 128 | ----- 129 | The length of the chunks in number of items is calculated by dividing the 130 | chunk size in number of bytes by the size of each row in number of bytes as 131 | determined from the dtype. 
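
    For example, with the default chunk size of 2**20 bytes and a row size of
    64 bytes (illustrative; the actual row size depends on the dtype), each
    chunk will be 2**20 // 64 = 16384 rows long.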
132 | 133 | """ 134 | 135 | import tables 136 | import numpy as np 137 | h5file = None 138 | 139 | # lookup stats function 140 | stats_function = getattr(pysamstats, 'stat_' + stats_type) 141 | 142 | # lookup default dtype 143 | default_dtype = getattr(config, 'dtype_' + stats_type) 144 | 145 | # determine field names 146 | if fields is None: 147 | fields = [t[0] for t in default_dtype] 148 | 149 | # determine dtype 150 | default_dtype = dict(default_dtype) 151 | max_seqid_len = determine_max_seqid(alignmentfile) 152 | default_dtype["chrom"] = "a{0}".format(max_seqid_len) 153 | 154 | # update if user passed 155 | if dtype is not None: 156 | default_dtype.update(dict(dtype)) 157 | dtype = default_dtype 158 | 159 | # fields 160 | if len(fields) == 1: 161 | dtype = dtype[fields[0]] 162 | else: 163 | dtype = [(f, dtype[f]) for f in fields] 164 | dtype = np.dtype(dtype) 165 | 166 | # setup record generator 167 | recs = stats_function(alignmentfile, **kwargs) 168 | 169 | # flatten records to rows 170 | rows = flatten(recs, *fields) 171 | 172 | try: 173 | 174 | # open output file 175 | h5file = tables.open_file(outfile, mode='a') 176 | 177 | # determine chunk shape 178 | hdf5_chunklen = int(hdf5_chunksize/dtype.itemsize) 179 | hdf5_chunkshape = (hdf5_chunklen,) 180 | 181 | # replace any existing node at that location 182 | try: 183 | h5file.remove_node(hdf5_group, hdf5_dataset) 184 | except tables.NoSuchNodeError: 185 | pass 186 | 187 | # create dataset 188 | h5table = h5file.create_table( 189 | hdf5_group, hdf5_dataset, dtype, 190 | title=stats_type, 191 | filters=tables.Filters(complevel=hdf5_complevel, 192 | complib=hdf5_complib, 193 | shuffle=hdf5_shuffle, 194 | fletcher32=hdf5_fletcher32), 195 | createparents=True, 196 | chunkshape=hdf5_chunkshape) 197 | 198 | # record initial time 199 | counter = 0 200 | counter_before = 0 201 | before = time.time() 202 | before_all = before 203 | 204 | # load data in batches of size `hdf5_chunklen` 205 | chunk = list(itertools.islice(rows, hdf5_chunklen)) 206 | 207 | # load chunk at a time 208 | while chunk: 209 | 210 | # write chunk 211 | h5table.append(chunk) 212 | h5table.flush() 213 | 214 | # keep track of number of records loaded 215 | n = len(chunk) # may be shorter than chunklen if final batch 216 | counter += n 217 | 218 | # log progress 219 | if progress and (counter % progress) < hdf5_chunklen: 220 | after = time.time() 221 | elapsed = after - before_all 222 | batch_elapsed = after - before 223 | batch_size = counter - counter_before 224 | msg = '[pysamstats] %s rows in %.2fs (%d rows/s); last %s ' \ 225 | 'rows in %.2fs (%d rows/s)' \ 226 | % (counter, elapsed, counter / elapsed, 227 | batch_size, batch_elapsed, batch_size / batch_elapsed) 228 | print(msg, file=sys.stderr) 229 | before = after 230 | counter_before = counter 231 | 232 | # load next batch 233 | chunk = list(itertools.islice(rows, hdf5_chunklen)) 234 | 235 | if progress: 236 | after_all = time.time() 237 | elapsed_all = after_all - before_all 238 | msg = '[pysamstats] %s rows in %.2fs (%d rows/s)' \ 239 | % (counter, elapsed_all, counter / elapsed_all) 240 | print(msg, file=sys.stderr) 241 | 242 | finally: 243 | if h5file is not None: 244 | h5file.close() 245 | -------------------------------------------------------------------------------- /pysamstats/pileup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | import functools 4 | 5 | 6 | import 
pysamstats.opt as opt 7 | import pysamstats.util as util 8 | import pysamstats.config as config 9 | 10 | 11 | _doc_params = """ 12 | Parameters 13 | ---------- 14 | type : string 15 | Statistics type. One of "coverage", "coverage_strand", "coverage_ext", 16 | "coverage_ext_strand", "variation", "variation_strand", "tlen", "tlen_strand", "mapq", 17 | "mapq_strand", "baseq", "baseq_strand", "baseq_ext", "baseq_ext_strand", "coverage_gc". 18 | alignmentfile : pysam.AlignmentFile or string 19 | SAM or BAM file or file path. 20 | fafile : pysam.FastaFile or string, optional 21 | FASTA file or file path, only required for some statistics types. 22 | chrom : string, optional 23 | Chromosome/contig. 24 | start : int, optional 25 | Start position. 26 | end : int, optional 27 | End position. 28 | one_based : bool, optional 29 | Coordinate system, False if zero-based (default), True if one-based. 30 | truncate : bool, optional 31 | If True, truncate output to selected region. 32 | pad : bool, optional 33 | If True, emit records for every position, even if no reads are aligned. 34 | max_depth : int, optional 35 | Maximum depth to allow in pileup column. 36 | window_size : int, optional 37 | Window size to use for percent GC calculation (only applies to coverage_gc). 38 | window_offset : int, optional 39 | Distance from window start to record position (only applies to coverage_gc). 40 | min_mapq : int, optional 41 | Only reads with mapping quality equal to or greater than this value will be counted (0 42 | by default). 43 | min_baseq : int, optional 44 | Only reads with base quality equal to or greater than this value will be counted (0 by 45 | default). 46 | no_del : bool, optional 47 | If True, don't count reads aligned with a deletion at the current position. 48 | no_dup : bool, optional 49 | If True, don't count reads flagged as duplicate.""" 50 | 51 | 52 | # noinspection PyShadowingBuiltins 53 | def stat_pileup(type, 54 | alignmentfile, 55 | fafile=None, 56 | chrom=None, 57 | start=None, 58 | end=None, 59 | one_based=False, 60 | truncate=False, 61 | stepper="all", 62 | pad=False, 63 | max_depth=8000, 64 | window_size=300, 65 | window_offset=None, 66 | min_mapq=0, 67 | min_baseq=0, 68 | no_del=False, 69 | no_dup=False): 70 | """Generate statistics per genome position, based on read pileups. 71 | {params} 72 | 73 | Returns 74 | ------- 75 | recs : iterator 76 | An iterator yielding dict objects, where each dict holds data for a single genome position. 
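
    Examples
    --------
    A minimal sketch (the BAM path and region here are placeholders):

        import pysamstats
        for rec in pysamstats.stat_pileup('coverage', 'example.bam',
                                          chrom='chr1', start=0, end=2000,
                                          truncate=True):
            print(rec['pos'], rec['reads_all'], rec['reads_pp'])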
77 | 78 | """ 79 | 80 | if type in config.stats_types_withref and fafile is None: 81 | raise ValueError('reference sequence is required; please provide fafile argument') 82 | 83 | try: 84 | if type == 'coverage_gc': 85 | stat = stats_classes_pileup[type](window_size=window_size, window_offset=window_offset) 86 | else: 87 | stat = stats_classes_pileup[type]() 88 | except KeyError: 89 | raise ValueError('unsupported statistics type: %r' % type) 90 | 91 | return opt.iter_pileup(stat, alignmentfile=alignmentfile, fafile=fafile, chrom=chrom, 92 | start=start, end=end, one_based=one_based, truncate=truncate, stepper=stepper, pad=pad, 93 | max_depth=max_depth, min_mapq=min_mapq, min_baseq=min_baseq, 94 | no_del=no_del, no_dup=no_dup) 95 | 96 | 97 | stat_pileup.__doc__ = stat_pileup.__doc__.format(params=_doc_params) 98 | 99 | 100 | # noinspection PyShadowingBuiltins 101 | def load_pileup(type, 102 | alignmentfile, 103 | fafile=None, 104 | chrom=None, 105 | start=None, 106 | end=None, 107 | one_based=False, 108 | truncate=False, 109 | stepper="all", 110 | pad=False, 111 | max_depth=8000, 112 | window_size=300, 113 | window_offset=None, 114 | min_mapq=0, 115 | min_baseq=0, 116 | no_del=False, 117 | no_dup=False, 118 | dtype=None, 119 | fields=None): 120 | """Load statistics per genome position, based on read pileups. 121 | {params} 122 | dtype : dtype 123 | Override default dtype. 124 | fields : string or list of strings 125 | Select a subset of fields to load. 126 | 127 | Returns 128 | ------- 129 | ra : numpy structured array 130 | A structured array. 131 | 132 | """ 133 | 134 | statfun = functools.partial(stat_pileup, type) 135 | try: 136 | default_dtype = getattr(config, 'dtype_' + type) 137 | except AttributeError: 138 | raise ValueError('unsupported statistics type: %r' % type) 139 | 140 | return util.load_stats(statfun, user_dtype=dtype, default_dtype=default_dtype, 141 | user_fields=fields, alignmentfile=alignmentfile, fafile=fafile, 142 | chrom=chrom, start=start, end=end, one_based=one_based, 143 | truncate=truncate, stepper=stepper, pad=pad, max_depth=max_depth, window_size=window_size, 144 | window_offset=window_offset, min_mapq=min_mapq, min_baseq=min_baseq, no_del=no_del, no_dup=no_dup) 145 | 146 | 147 | load_pileup.__doc__ = load_pileup.__doc__.format(params=_doc_params) 148 | 149 | 150 | stats_classes_pileup = { 151 | 'coverage': opt.Coverage, 152 | 'coverage_strand': opt.CoverageStrand, 153 | 'coverage_ext': opt.CoverageExt, 154 | 'coverage_ext_strand': opt.CoverageExtStrand, 155 | 'variation': opt.Variation, 156 | 'variation_strand': opt.VariationStrand, 157 | 'tlen': opt.Tlen, 158 | 'tlen_strand': opt.TlenStrand, 159 | 'mapq': opt.Mapq, 160 | 'mapq_strand': opt.MapqStrand, 161 | 'baseq': opt.Baseq, 162 | 'baseq_strand': opt.BaseqStrand, 163 | 'baseq_ext': opt.BaseqExt, 164 | 'baseq_ext_strand': opt.BaseqExtStrand, 165 | 'coverage_gc': opt.CoverageGC, 166 | } 167 | 168 | 169 | # backwards compatibility 170 | ######################### 171 | 172 | 173 | _stat_doc_lines = stat_pileup.__doc__.split('\n') 174 | _load_doc_lines = load_pileup.__doc__.split('\n') 175 | # strip "type" parameter 176 | _stat_doc = '\n'.join(_stat_doc_lines[:4] + _stat_doc_lines[8:]) 177 | _load_doc = '\n'.join(_load_doc_lines[:4] + _load_doc_lines[8:]) 178 | 179 | 180 | def _specialize(type): 181 | stat = functools.partial(stat_pileup, type) 182 | stat.__doc__ = _stat_doc 183 | stat.__name__ = 'stat_' + type 184 | load = functools.partial(load_pileup, type) 185 | load.__doc__ = _load_doc 186 | load.__name__ 
= 'load_' + type 187 | return stat, load 188 | 189 | 190 | # named functions 191 | stat_coverage, load_coverage = _specialize('coverage') 192 | stat_coverage_strand, load_coverage_strand = _specialize('coverage_strand') 193 | stat_coverage_ext, load_coverage_ext = _specialize('coverage_ext') 194 | stat_coverage_ext_strand, load_coverage_ext_strand = _specialize('coverage_ext_strand') 195 | stat_variation, load_variation = _specialize('variation') 196 | stat_variation_strand, load_variation_strand = _specialize('variation_strand') 197 | stat_tlen, load_tlen = _specialize('tlen') 198 | stat_tlen_strand, load_tlen_strand = _specialize('tlen_strand') 199 | stat_mapq, load_mapq = _specialize('mapq') 200 | stat_mapq_strand, load_mapq_strand = _specialize('mapq_strand') 201 | stat_baseq, load_baseq = _specialize('baseq') 202 | stat_baseq_strand, load_baseq_strand = _specialize('baseq_strand') 203 | stat_baseq_ext, load_baseq_ext = _specialize('baseq_ext') 204 | stat_baseq_ext_strand, load_baseq_ext_strand = _specialize('baseq_ext_strand') 205 | stat_coverage_gc, load_coverage_gc = _specialize('coverage_gc') 206 | -------------------------------------------------------------------------------- /pysamstats/test/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | -------------------------------------------------------------------------------- /pysamstats/test/test_binned.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | from itertools import chain 4 | from collections import Counter 5 | import logging 6 | 7 | 8 | from pysam import Samfile, Fastafile 9 | from nose.tools import eq_ 10 | from numpy import around as round 11 | 12 | 13 | import pysamstats 14 | from .util import normalise_coords, mean, rms, rootmean, compare_iterators 15 | 16 | 17 | logger = logging.getLogger(__name__) 18 | debug = logger.debug 19 | 20 | 21 | def compare_stats(impl, refimpl): 22 | # no read filters 23 | kwargs = {'chrom': 'Pf3D7_01_v3', 24 | 'start': 0, 25 | 'end': 2000, 26 | 'one_based': False} 27 | expected = refimpl(Samfile('fixture/test.bam'), **kwargs) 28 | actual = impl(Samfile('fixture/test.bam'), **kwargs) 29 | compare_iterators(expected, actual) 30 | # read filters 31 | kwargs['min_mapq'] = 1 32 | kwargs['no_dup'] = True 33 | expected = refimpl(Samfile('fixture/test.bam'), **kwargs) 34 | actual = impl(Samfile('fixture/test.bam'), **kwargs) 35 | compare_iterators(expected, actual) 36 | 37 | 38 | def compare_stats_withref(impl, refimpl, bam_fn='fixture/test.bam', 39 | fasta_fn='fixture/ref.fa'): 40 | # no read filters 41 | kwargs = {'chrom': 'Pf3D7_01_v3', 42 | 'start': 0, 43 | 'end': 2000, 44 | 'one_based': False} 45 | expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) 46 | actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) 47 | compare_iterators(expected, actual) 48 | # read filters 49 | kwargs['min_mapq'] = 1 50 | kwargs['no_dup'] = True 51 | expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) 52 | actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) 53 | compare_iterators(expected, actual) 54 | 55 | 56 | def filter_alignments(alignments, min_mapq, no_dup): 57 | if min_mapq > 0: 58 | alignments = (a for a in alignments if a.mapq >= min_mapq) 59 | if no_dup: 60 | alignments = (a for a in 
alignments if not a.is_duplicate) 61 | return alignments 62 | 63 | 64 | def stat_coverage_binned_refimpl(samfile, fastafile, chrom=None, start=None, 65 | end=None, one_based=False, window_size=300, 66 | window_offset=150, min_mapq=0, no_dup=False): 67 | if chrom is None: 68 | # noinspection PyTypeChecker 69 | it = chain(*[ 70 | iter_coverage_binned(samfile, fastafile, chrom, None, None, one_based, window_size, 71 | window_offset, min_mapq, no_dup) 72 | for chrom in samfile.references 73 | ]) 74 | else: 75 | it = iter_coverage_binned(samfile, fastafile, chrom, start, end, one_based, window_size, 76 | window_offset, min_mapq, no_dup) 77 | return it 78 | 79 | 80 | def gc_content(fastafile, chrom, start, end): 81 | seq = fastafile.fetch(chrom, start, end).lower() 82 | nc = Counter(seq) 83 | gc = int(round((nc['g'] + nc['c']) * 100. / (end-start))) 84 | return gc 85 | 86 | 87 | def iter_coverage_binned(samfile, fastafile, chrom, start, end, one_based, 88 | window_size, window_offset, min_mapq, no_dup): 89 | assert chrom is not None 90 | start, end = normalise_coords(one_based, start, end) 91 | chrlen = samfile.lengths[samfile.references.index(chrom)] 92 | if start is None: 93 | start = 0 94 | if end is None: 95 | end = chrlen 96 | if end > chrlen: 97 | end = chrlen 98 | # setup first bin 99 | bin_start = start 100 | bin_end = bin_start + window_size 101 | reads_all = reads_pp = 0 102 | 103 | # iterate over reads 104 | alignments = samfile.fetch(chrom, start, end) 105 | alignments = filter_alignments(alignments, min_mapq, no_dup) 106 | for aln in alignments: 107 | while aln.pos > bin_end: # end of bin 108 | gc = gc_content(fastafile, chrom, bin_start, bin_end) 109 | pos = bin_start + window_offset 110 | if one_based: 111 | pos += 1 112 | rec = {'chrom': chrom, 'pos': pos, 113 | 'gc': gc, 'reads_all': reads_all, 114 | 'reads_pp': reads_pp} 115 | yield rec 116 | reads_all = reads_pp = 0 117 | bin_start = bin_end 118 | bin_end = bin_start + window_size 119 | if not aln.is_unmapped: 120 | reads_all += 1 121 | if aln.is_proper_pair: 122 | reads_pp += 1 123 | 124 | # deal with last non-empty bin 125 | gc = gc_content(fastafile, chrom, bin_start, bin_end) 126 | pos = bin_start + window_offset 127 | if one_based: 128 | pos += 1 129 | rec = {'chrom': chrom, 'pos': pos, 130 | 'gc': gc, 'reads_all': reads_all, 'reads_pp': reads_pp} 131 | yield rec 132 | 133 | # deal with empty bins up to explicit end 134 | if end is not None: 135 | while bin_end < end: 136 | reads_all = reads_pp = 0 137 | bin_start = bin_end 138 | bin_end = bin_start + window_size 139 | gc = gc_content(fastafile, chrom, bin_start, bin_end) 140 | pos = bin_start + window_offset 141 | if one_based: 142 | pos += 1 143 | rec = {'chrom': chrom, 'pos': pos, 144 | 'gc': gc, 'reads_all': reads_all, 145 | 'reads_pp': reads_pp} 146 | yield rec 147 | 148 | 149 | def test_stat_coverage_binned(): 150 | compare_stats_withref(pysamstats.stat_coverage_binned, 151 | stat_coverage_binned_refimpl) 152 | 153 | 154 | def test_stat_coverage_binned_uppercase_fasta(): 155 | compare_stats_withref(pysamstats.stat_coverage_binned, 156 | stat_coverage_binned_refimpl, 157 | fasta_fn='fixture/ref.upper.fa') 158 | 159 | 160 | def stat_coverage_ext_binned_refimpl(samfile, fastafile, chrom=None, start=None, end=None, 161 | one_based=False, window_size=300, window_offset=150, 162 | min_mapq=0, no_dup=False): 163 | if chrom is None: 164 | # noinspection PyTypeChecker 165 | it = chain(*[ 166 | iter_coverage_ext_binned(samfile, fastafile, chrom, None, None, one_based, 167 | 
window_size, window_offset, min_mapq, no_dup) 168 | for chrom in samfile.references 169 | ]) 170 | else: 171 | it = iter_coverage_ext_binned(samfile, fastafile, chrom, start, end, one_based, 172 | window_size, window_offset, min_mapq, no_dup) 173 | return it 174 | 175 | 176 | def iter_coverage_ext_binned(samfile, fastafile, chrom, start, end, one_based, window_size, 177 | window_offset, min_mapq, no_dup): 178 | assert chrom is not None 179 | start, end = normalise_coords(one_based, start, end) 180 | chrlen = samfile.lengths[samfile.references.index(chrom)] 181 | if start is None: 182 | start = 0 183 | if end is None: 184 | end = chrlen 185 | if end > chrlen: 186 | end = chrlen 187 | # setup first bin 188 | bin_start = start 189 | bin_end = bin_start + window_size 190 | reads_all = reads_pp = reads_mate_unmapped = reads_mate_other_chr = \ 191 | reads_mate_same_strand = reads_faceaway = reads_softclipped = \ 192 | reads_duplicate = 0 193 | 194 | # iterate over reads 195 | alignments = samfile.fetch(chrom, start, end) 196 | alignments = filter_alignments(alignments, min_mapq, no_dup) 197 | for aln in alignments: 198 | while aln.pos > bin_end: # end of bin 199 | gc = gc_content(fastafile, chrom, bin_start, bin_end) 200 | pos = bin_start + window_offset 201 | if one_based: 202 | pos += 1 203 | rec = {'chrom': chrom, 'pos': pos, 204 | 'gc': gc, 205 | 'reads_all': reads_all, 206 | 'reads_pp': reads_pp, 207 | 'reads_mate_unmapped': reads_mate_unmapped, 208 | 'reads_mate_other_chr': reads_mate_other_chr, 209 | 'reads_mate_same_strand': reads_mate_same_strand, 210 | 'reads_faceaway': reads_faceaway, 211 | 'reads_softclipped': reads_softclipped, 212 | 'reads_duplicate': reads_duplicate} 213 | yield rec 214 | reads_all = reads_pp = reads_mate_unmapped = reads_mate_other_chr\ 215 | = reads_mate_same_strand = reads_faceaway = reads_softclipped\ 216 | = reads_duplicate = 0 217 | bin_start = bin_end 218 | bin_end = bin_start + window_size 219 | # debug(reads, reads.cigar, repr(reads.cigarstring)) 220 | if not aln.is_unmapped: 221 | reads_all += 1 222 | if aln.is_proper_pair: 223 | reads_pp += 1 224 | if aln.is_duplicate: 225 | reads_duplicate += 1 226 | if aln.cigar is not None and any((op[0] == 4) for op in aln.cigar): 227 | reads_softclipped += 1 228 | # should be mutually exclusive 229 | if aln.mate_is_unmapped: 230 | reads_mate_unmapped += 1 231 | elif aln.tid != aln.rnext: 232 | reads_mate_other_chr += 1 233 | elif aln.is_reverse == aln.mate_is_reverse: 234 | reads_mate_same_strand += 1 235 | elif ( 236 | # mapped to reverse strand but leftmost 237 | (aln.is_reverse and aln.tlen > 0) 238 | # mapped to fwd strand but rightmost 239 | or (not aln.is_reverse and aln.tlen < 0) 240 | ): 241 | reads_faceaway += 1 242 | 243 | # deal with last non-empty bin 244 | gc = gc_content(fastafile, chrom, bin_start, bin_end) 245 | pos = bin_start + window_offset 246 | if one_based: 247 | pos += 1 248 | rec = {'chrom': chrom, 'pos': pos, 249 | 'gc': gc, 250 | 'reads_all': reads_all, 251 | 'reads_pp': reads_pp, 252 | 'reads_mate_unmapped': reads_mate_unmapped, 253 | 'reads_mate_other_chr': reads_mate_other_chr, 254 | 'reads_mate_same_strand': reads_mate_same_strand, 255 | 'reads_faceaway': reads_faceaway, 256 | 'reads_softclipped': reads_softclipped, 257 | 'reads_duplicate': reads_duplicate} 258 | yield rec 259 | 260 | # deal with empty bins up to explicit end 261 | if end is not None: 262 | while bin_end < end: 263 | reads_all = reads_pp = reads_mate_unmapped = reads_mate_other_chr\ 264 | = reads_mate_same_strand = 
reads_faceaway = reads_softclipped\ 265 | = reads_duplicate = 0 266 | bin_start = bin_end 267 | bin_end = bin_start + window_size 268 | gc = gc_content(fastafile, chrom, bin_start, bin_end) 269 | pos = bin_start + window_offset 270 | if one_based: 271 | pos += 1 272 | rec = {'chrom': chrom, 'pos': pos, 273 | 'gc': gc, 274 | 'reads_all': reads_all, 275 | 'reads_pp': reads_pp, 276 | 'reads_mate_unmapped': reads_mate_unmapped, 277 | 'reads_mate_other_chr': reads_mate_other_chr, 278 | 'reads_mate_same_strand': reads_mate_same_strand, 279 | 'reads_faceaway': reads_faceaway, 280 | 'reads_softclipped': reads_softclipped, 281 | 'reads_duplicate': reads_duplicate} 282 | yield rec 283 | 284 | 285 | def test_stat_coverage_ext_binned(): 286 | compare_stats_withref(pysamstats.stat_coverage_ext_binned, 287 | stat_coverage_ext_binned_refimpl) 288 | 289 | 290 | def test_stat_coverage_ext_binned_uppercase_fasta(): 291 | compare_stats_withref(pysamstats.stat_coverage_ext_binned, 292 | stat_coverage_ext_binned_refimpl, 293 | fasta_fn='fixture/ref.upper.fa') 294 | 295 | 296 | def stat_mapq_binned_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, 297 | window_size=300, window_offset=150, min_mapq=0, no_dup=False): 298 | if chrom is None: 299 | # noinspection PyTypeChecker 300 | it = chain(*[iter_mapq_binned(samfile, chrom, None, None, one_based, window_size, 301 | window_offset, min_mapq, no_dup) 302 | for chrom in samfile.references]) 303 | else: 304 | it = iter_mapq_binned(samfile, chrom, start, end, one_based, window_size, window_offset, 305 | min_mapq, no_dup) 306 | return it 307 | 308 | 309 | def iter_mapq_binned(samfile, chrom, start, end, one_based, window_size, window_offset, min_mapq, 310 | no_dup): 311 | assert chrom is not None 312 | start, end = normalise_coords(one_based, start, end) 313 | chrlen = samfile.lengths[samfile.references.index(chrom)] 314 | if start is None: 315 | start = 0 316 | if end is None: 317 | end = chrlen 318 | if end > chrlen: 319 | end = chrlen 320 | # setup first bin 321 | bin_start = start 322 | bin_end = bin_start + window_size 323 | reads_all = reads_mapq0 = mapq_square_sum = 0 324 | 325 | # iterate over reads 326 | alignments = samfile.fetch(chrom, start, end) 327 | alignments = filter_alignments(alignments, min_mapq, no_dup) 328 | for aln in alignments: 329 | while aln.pos > bin_end: # end of bin 330 | pos = bin_start + window_offset 331 | if one_based: 332 | pos += 1 333 | rec = {'chrom': chrom, 'pos': pos, 334 | 'reads_all': reads_all, 335 | 'reads_mapq0': reads_mapq0, 336 | 'rms_mapq': rootmean(mapq_square_sum, reads_all)} 337 | yield rec 338 | reads_all = reads_mapq0 = mapq_square_sum = 0 339 | bin_start = bin_end 340 | bin_end = bin_start + window_size 341 | if not aln.is_unmapped: 342 | reads_all += 1 343 | mapq_square_sum += aln.mapq**2 344 | if aln.mapq == 0: 345 | reads_mapq0 += 1 346 | 347 | # deal with last non-empty bin 348 | pos = bin_start + window_offset 349 | if one_based: 350 | pos += 1 351 | rec = {'chrom': chrom, 'pos': pos, 352 | 'reads_all': reads_all, 353 | 'reads_mapq0': reads_mapq0, 354 | 'rms_mapq': rootmean(mapq_square_sum, reads_all)} 355 | yield rec 356 | 357 | # deal with empty bins up to explicit end 358 | if end is not None: 359 | while bin_end < end: 360 | reads_all = reads_mapq0 = mapq_square_sum = 0 361 | bin_start = bin_end 362 | bin_end = bin_start + window_size 363 | pos = bin_start + window_offset 364 | if one_based: 365 | pos += 1 366 | rec = {'chrom': chrom, 'pos': pos, 367 | 'reads_all': reads_all, 368 | 
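# rms_mapq below is a root-mean-square mapping quality: rootmean(mapq_square_sum, reads_all)
# is assumed (it lives in .util, which is not shown in this section) to compute
# sqrt(mapq_square_sum / reads_all), returning 0 for empty bins where reads_all == 0.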
'reads_mapq0': reads_mapq0, 369 | 'rms_mapq': rootmean(mapq_square_sum, reads_all)} 370 | yield rec 371 | 372 | 373 | def test_stat_mapq_binned(): 374 | compare_stats(pysamstats.stat_mapq_binned, stat_mapq_binned_refimpl) 375 | 376 | 377 | def stat_alignment_binned_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, 378 | window_size=300, window_offset=150, min_mapq=0, no_dup=False): 379 | if chrom is None: 380 | # noinspection PyTypeChecker 381 | it = chain(*[ 382 | iter_alignment_binned(samfile, chrom, None, None, one_based, window_size, 383 | window_offset, min_mapq, no_dup) 384 | for chrom in samfile.references] 385 | ) 386 | else: 387 | it = iter_alignment_binned(samfile, chrom, start, end, one_based, window_size, 388 | window_offset, min_mapq, no_dup) 389 | return it 390 | 391 | 392 | CIGAR = 'MIDNSHP=X' 393 | 394 | 395 | def iter_alignment_binned(samfile, chrom, start, end, one_based, window_size, window_offset, 396 | min_mapq, no_dup): 397 | assert chrom is not None 398 | start, end = normalise_coords(one_based, start, end) 399 | chrlen = samfile.lengths[samfile.references.index(chrom)] 400 | if start is None: 401 | start = 0 402 | if end is None: 403 | end = chrlen 404 | if end > chrlen: 405 | end = chrlen 406 | # setup first bin 407 | bin_start = start 408 | bin_end = bin_start + window_size 409 | c = Counter() 410 | reads_all = 0 411 | 412 | # iterate over reads 413 | alignments = samfile.fetch(chrom, start, end) 414 | alignments = filter_alignments(alignments, min_mapq, no_dup) 415 | for aln in alignments: 416 | while aln.pos > bin_end: # end of bin 417 | pos = bin_start + window_offset 418 | if one_based: 419 | pos += 1 420 | rec = {'chrom': chrom, 'pos': pos, 'reads_all': reads_all} 421 | for i in range(len(CIGAR)): 422 | rec[CIGAR[i]] = c[i] 423 | # rec['NM'] = c['NM'] 424 | rec['bases_all'] = c[0] + c[1] + c[4] + c[7] + c[8] 425 | yield rec 426 | c = Counter() 427 | reads_all = 0 428 | bin_start = bin_end 429 | bin_end = bin_start + window_size 430 | # debug(aln.cigar) 431 | if not aln.is_unmapped: 432 | reads_all += 1 433 | if aln.cigar is not None: 434 | for op, l in aln.cigar: 435 | c[op] += l 436 | # add edit distance 437 | # tags = dict(aln.tags) 438 | # if 'NM' in tags: 439 | # c['NM'] += tags['NM'] 440 | 441 | # deal with last non-empty bin 442 | pos = bin_start + window_offset 443 | if one_based: 444 | pos += 1 445 | rec = {'chrom': chrom, 'pos': pos, 'reads_all': reads_all} 446 | for i in range(len(CIGAR)): 447 | rec[CIGAR[i]] = c[i] 448 | # rec['NM'] = c['NM'] 449 | rec['bases_all'] = c[0] + c[1] + c[4] + c[7] + c[8] 450 | yield rec 451 | 452 | # deal with empty bins up to explicit end 453 | if end is not None: 454 | while bin_end < end: 455 | c = Counter() 456 | reads_all = 0 457 | bin_start = bin_end 458 | bin_end = bin_start + window_size 459 | pos = bin_start + window_offset 460 | if one_based: 461 | pos += 1 462 | rec = {'chrom': chrom, 'pos': pos, 'reads_all': reads_all} 463 | for i in range(len(CIGAR)): 464 | rec[CIGAR[i]] = c[i] 465 | # rec['NM'] = c['NM'] 466 | rec['bases_all'] = c[0] + c[1] + c[4] + c[7] + c[8] 467 | yield rec 468 | 469 | 470 | def test_stat_alignment_binned(): 471 | compare_stats(pysamstats.stat_alignment_binned, stat_alignment_binned_refimpl) 472 | 473 | 474 | def stat_tlen_binned_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, 475 | window_size=300, window_offset=150, min_mapq=0, no_dup=False): 476 | if chrom is None: 477 | # noinspection PyTypeChecker 478 | it = chain(*[iter_tlen_binned(samfile, 
chrom, None, None, one_based, window_size, 479 | window_offset, min_mapq, no_dup) 480 | for chrom in samfile.references]) 481 | else: 482 | it = iter_tlen_binned(samfile, chrom, start, end, one_based, window_size, window_offset, 483 | min_mapq, no_dup) 484 | return it 485 | 486 | 487 | def iter_tlen_binned(samfile, chrom, start, end, one_based, window_size, window_offset, min_mapq, 488 | no_dup): 489 | assert chrom is not None 490 | start, end = normalise_coords(one_based, start, end) 491 | chrlen = samfile.lengths[samfile.references.index(chrom)] 492 | if start is None: 493 | start = 0 494 | if end is None: 495 | end = chrlen 496 | if end > chrlen: 497 | end = chrlen 498 | # setup first bin 499 | bin_start = start 500 | bin_end = bin_start + window_size 501 | reads_all = reads_pp = 0 502 | tlens = [] 503 | tlens_pp = [] 504 | 505 | # iterate over reads 506 | alignments = samfile.fetch(chrom, start, end) 507 | alignments = filter_alignments(alignments, min_mapq, no_dup) 508 | for aln in alignments: 509 | while aln.pos > bin_end: # end of bin 510 | pos = bin_start + window_offset 511 | if one_based: 512 | pos += 1 513 | rec = {'chrom': chrom, 'pos': pos, 514 | 'reads_all': reads_all, 515 | 'reads_pp': reads_pp, 516 | 'mean_tlen': mean(tlens), 517 | 'rms_tlen': rms(tlens), 518 | 'mean_tlen_pp': mean(tlens_pp), 519 | 'rms_tlen_pp': rms(tlens_pp), 520 | } 521 | yield rec 522 | reads_all = reads_pp = 0 523 | tlens = [] 524 | tlens_pp = [] 525 | bin_start = bin_end 526 | bin_end = bin_start + window_size 527 | if not aln.is_unmapped: 528 | reads_all += 1 529 | tlens.append(aln.tlen) 530 | if aln.is_proper_pair: 531 | reads_pp += 1 532 | tlens_pp.append(aln.tlen) 533 | 534 | # deal with last non-empty bin 535 | pos = bin_start + window_offset 536 | if one_based: 537 | pos += 1 538 | rec = {'chrom': chrom, 'pos': pos, 539 | 'reads_all': reads_all, 540 | 'reads_pp': reads_pp, 541 | 'mean_tlen': mean(tlens), 542 | 'rms_tlen': rms(tlens), 543 | 'mean_tlen_pp': mean(tlens_pp), 544 | 'rms_tlen_pp': rms(tlens_pp), 545 | } 546 | yield rec 547 | 548 | # deal with empty bins up to explicit end 549 | if end is not None: 550 | while bin_end < end: 551 | reads_all = reads_pp = 0 552 | tlens = [] 553 | tlens_pp = [] 554 | bin_start = bin_end 555 | bin_end = bin_start + window_size 556 | pos = bin_start + window_offset 557 | if one_based: 558 | pos += 1 559 | rec = {'chrom': chrom, 'pos': pos, 560 | 'reads_all': reads_all, 561 | 'reads_pp': reads_pp, 562 | 'mean_tlen': mean(tlens), 563 | 'rms_tlen': rms(tlens), 564 | 'mean_tlen_pp': mean(tlens_pp), 565 | 'rms_tlen_pp': rms(tlens_pp), 566 | } 567 | yield rec 568 | 569 | 570 | def test_stat_tlen_binned(): 571 | compare_stats(pysamstats.stat_tlen_binned, stat_tlen_binned_refimpl) 572 | 573 | 574 | binned_functions = [ 575 | (pysamstats.load_coverage_binned, 1), 576 | (pysamstats.load_coverage_ext_binned, 1), 577 | (pysamstats.load_mapq_binned, 0), 578 | (pysamstats.load_alignment_binned, 0), 579 | (pysamstats.load_tlen_binned, 0), 580 | ] 581 | 582 | 583 | def test_binned_pad_region(): 584 | kwargs = {'chrom': 'Pf3D7_01_v3', 585 | 'start': 1000, 586 | 'end': 20000, 587 | 'one_based': False, 588 | 'window_size': 200, 589 | 'window_offset': 100} 590 | for f, needs_ref in binned_functions: 591 | debug(f.__name__) 592 | if needs_ref: 593 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), 594 | **kwargs) 595 | else: 596 | a = f(Samfile('fixture/test.bam'), **kwargs) 597 | assert set(a['chrom']) == {b'Pf3D7_01_v3'} 598 | eq_(1100, a['pos'][0]) 599 | 
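# The asserted positions follow from the binning scheme used by the reference
# implementations above: each bin is labelled at pos = bin_start + window_offset.
# With start=1000, end=20000, window_size=200 and window_offset=100 (the kwargs
# of this test), the first bin [1000, 1200) is reported at 1000 + 100 = 1100 and
# the last bin [19800, 20000) at 19800 + 100 = 19900.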
eq_(19900, a['pos'][-1]) 600 | 601 | 602 | def test_binned_pad_wg(): 603 | expected = stat_coverage_binned_refimpl( 604 | Samfile('fixture/test.bam'), 605 | Fastafile('fixture/ref.fa')) 606 | 607 | actual = pysamstats.stat_coverage_binned(Samfile('fixture/test.bam'), 608 | Fastafile('fixture/ref.fa')) 609 | compare_iterators(expected, actual) 610 | kwargs = {'window_size': 200, 611 | 'window_offset': 100} 612 | for f, needs_ref in binned_functions: 613 | debug(f.__name__) 614 | if needs_ref: 615 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), 616 | **kwargs) 617 | else: 618 | a = f(Samfile('fixture/test.bam'), **kwargs) 619 | assert sorted(set(a['chrom'])) == [b'Pf3D7_01_v3', b'Pf3D7_02_v3', 620 | b'Pf3D7_03_v3'] 621 | eq_(100, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][0]) 622 | eq_(50100, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][-1]) 623 | eq_(100, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][0]) 624 | eq_(60100, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][-1]) 625 | eq_(100, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][0]) 626 | eq_(70100, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][-1]) 627 | -------------------------------------------------------------------------------- /pysamstats/test/test_io.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | import logging 4 | 5 | from pysam import Samfile 6 | 7 | from pysamstats.io import write_hdf5 8 | import tables 9 | 10 | logger = logging.getLogger(__name__) 11 | debug = logger.debug 12 | 13 | 14 | def check_write_hdf5_chrom_dtype(arg): 15 | 16 | # testing auto dtype determination. 17 | dtype, alignment, result, label = arg 18 | import tempfile 19 | 20 | # use auto 21 | with tempfile.NamedTemporaryFile(suffix=".h5") as tmp: 22 | 23 | write_hdf5("coverage", tmp.name, alignment, chrom=label, dtype=dtype) 24 | 25 | with tables.open_file(tmp.name, mode="r") as h5file: 26 | return result == h5file.root.data.dtype["chrom"].itemsize 27 | 28 | 29 | def test_write_hdf5_chrom_dtype(): 30 | 31 | contig_label = "AS2_scf7180000696055" 32 | bampath = "fixture/longcontignames.bam" 33 | 34 | dtypes = [None, {"chrom": "a20"}, {"chrom": "a20"}] 35 | alignments = [Samfile(bampath), Samfile(bampath), bampath] 36 | results = [len(contig_label), 20, 20] 37 | labels = [contig_label, contig_label, contig_label] 38 | 39 | for arg in zip(dtypes, alignments, results, labels): 40 | assert check_write_hdf5_chrom_dtype(arg) 41 | -------------------------------------------------------------------------------- /pysamstats/test/test_pileup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | import logging 4 | import sys 5 | 6 | 7 | from nose.tools import eq_, assert_raises 8 | from pysam import Samfile, Fastafile 9 | import numpy as np 10 | from numpy import around as round 11 | 12 | 13 | from .util import normalise_coords, fwd, rev, pp, mean, std, rms, vmax, compare_iterators 14 | import pysamstats 15 | 16 | 17 | logger = logging.getLogger(__name__) 18 | debug = logger.debug 19 | 20 | 21 | # PY2/3 compatibility 22 | PY2 = sys.version_info[0] == 2 23 | 24 | 25 | def compare_stats(impl, refimpl): 26 | # no read filters 27 | kwargs = {'chrom': 'Pf3D7_01_v3', 28 | 'start': 0, 29 | 'end': 2000, 30 | 'one_based': False} 31 | expected = refimpl(Samfile('fixture/test.bam'), **kwargs) 32 | actual = 
impl(Samfile('fixture/test.bam'), **kwargs) 33 | compare_iterators(expected, actual) 34 | # read filters 35 | kwargs['min_mapq'] = 1 36 | kwargs['min_baseq'] = 17 37 | kwargs['no_del'] = True 38 | kwargs['no_dup'] = True 39 | expected = refimpl(Samfile('fixture/test.bam'), **kwargs) 40 | actual = impl(Samfile('fixture/test.bam'), **kwargs) 41 | compare_iterators(expected, actual) 42 | 43 | 44 | def compare_stats_withref(impl, refimpl, bam_fn='fixture/test.bam', 45 | fasta_fn='fixture/ref.fa'): 46 | # no read filters 47 | kwargs = {'chrom': 'Pf3D7_01_v3', 48 | 'start': 0, 49 | 'end': 2000, 50 | 'one_based': False} 51 | expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) 52 | actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) 53 | compare_iterators(expected, actual) 54 | # read filters 55 | kwargs['min_mapq'] = 1 56 | kwargs['min_baseq'] = 17 57 | kwargs['no_del'] = True 58 | kwargs['no_dup'] = True 59 | expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) 60 | actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) 61 | compare_iterators(expected, actual) 62 | 63 | 64 | def filter_reads(reads, min_mapq, min_baseq, no_del, no_dup): 65 | if min_mapq > 0: 66 | reads = [r for r in reads if r.alignment.mapq >= min_mapq] 67 | if min_baseq > 0: 68 | reads = [r for r, q in zip(reads, baseq(reads)) 69 | if q is not None and q >= min_baseq] 70 | if no_del: 71 | reads = nodel(reads) 72 | if no_dup: 73 | reads = nodup(reads) 74 | return reads 75 | 76 | 77 | def stat_coverage_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, min_mapq=0, 78 | min_baseq=0, no_del=False, no_dup=False): 79 | 80 | start, end = normalise_coords(one_based, start, end) 81 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 82 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 83 | chrom = samfile.getrname(col.tid) 84 | pos = col.pos + 1 if one_based else col.pos 85 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 86 | yield {'chrom': chrom, 'pos': pos, 'reads_all': len(reads), 87 | 'reads_pp': len(pp(reads))} 88 | 89 | 90 | def test_stat_coverage(): 91 | compare_stats(pysamstats.stat_coverage, stat_coverage_refimpl) 92 | 93 | 94 | def stat_coverage_strand_refimpl(samfile, chrom=None, start=None, end=None, 95 | one_based=False, min_mapq=0, min_baseq=0, no_del=False, 96 | no_dup=False): 97 | start, end = normalise_coords(one_based, start, end) 98 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 99 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 100 | chrom = samfile.getrname(col.tid) 101 | pos = col.pos + 1 if one_based else col.pos 102 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 103 | yield {'chrom': chrom, 'pos': pos, 104 | 'reads_all': len(reads), 105 | 'reads_fwd': len(fwd(reads)), 106 | 'reads_rev': len(rev(reads)), 107 | 'reads_pp': len(pp(reads)), 108 | 'reads_pp_fwd': len(fwd(pp(reads))), 109 | 'reads_pp_rev': len(rev(pp(reads)))} 110 | 111 | 112 | def test_stat_coverage_strand(): 113 | compare_stats(pysamstats.stat_coverage_strand, stat_coverage_strand_refimpl) 114 | 115 | 116 | def stat_coverage_ext_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, 117 | min_mapq=0, min_baseq=0, no_del=False, no_dup=False): 118 | start, end = normalise_coords(one_based, start, end) 119 | for col in samfile.pileup(reference=chrom, start=start, end=end, 
stepper="nofilter", flag_filter=0, 120 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 121 | chrom = samfile.getrname(col.tid) 122 | pos = col.pos + 1 if one_based else col.pos 123 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 124 | reads_mate_unmapped = [read for read in reads 125 | if read.alignment.mate_is_unmapped] 126 | reads_mate_mapped = [read for read in reads 127 | if not read.alignment.mate_is_unmapped] 128 | reads_mate_other_chr = [read for read in reads_mate_mapped 129 | if col.tid != read.alignment.rnext] 130 | reads_mate_same_strand = [ 131 | read for read in reads_mate_mapped 132 | if col.tid == read.alignment.rnext 133 | and (read.alignment.is_reverse == read.alignment.mate_is_reverse) 134 | ] 135 | reads_faceaway = [ 136 | read for read in reads_mate_mapped 137 | if read.alignment.is_reverse != read.alignment.mate_is_reverse 138 | and (( 139 | read.alignment.is_reverse and read.alignment.tlen > 0) # 140 | # mapped to reverse strand but leftmost 141 | or (not read.alignment.is_reverse and read.alignment.tlen < 0)) 142 | # mapped to fwd strand but rightmost 143 | ] 144 | reads_softclipped = [ 145 | read for read in reads 146 | if any((op[0] == 4) for op in read.alignment.cigar) 147 | ] 148 | reads_duplicate = [read for read in reads 149 | if read.alignment.is_duplicate] 150 | yield {'chrom': chrom, 'pos': pos, 151 | 'reads_all': len(reads), 152 | 'reads_pp': len(pp(reads)), 153 | 'reads_mate_unmapped': len(reads_mate_unmapped), 154 | 'reads_mate_other_chr': len(reads_mate_other_chr), 155 | 'reads_mate_same_strand': len(reads_mate_same_strand), 156 | 'reads_faceaway': len(reads_faceaway), 157 | 'reads_softclipped': len(reads_softclipped), 158 | 'reads_duplicate': len(reads_duplicate)} 159 | 160 | 161 | def test_stat_coverage_ext(): 162 | compare_stats(pysamstats.stat_coverage_ext, stat_coverage_ext_refimpl) 163 | 164 | 165 | def stat_coverage_ext_strand_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, 166 | min_mapq=0, min_baseq=0, no_del=False, no_dup=False): 167 | start, end = normalise_coords(one_based, start, end) 168 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 169 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 170 | chrom = samfile.getrname(col.tid) 171 | pos = col.pos + 1 if one_based else col.pos 172 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 173 | reads_pp = pp(reads) 174 | reads_mate_unmapped = [read for read in reads 175 | if read.alignment.mate_is_unmapped] 176 | reads_mate_mapped = [read for read in reads 177 | if not read.alignment.mate_is_unmapped] 178 | reads_mate_other_chr = [read for read in reads_mate_mapped 179 | if col.tid != read.alignment.rnext] 180 | reads_mate_same_strand = [ 181 | read for read in reads_mate_mapped 182 | if col.tid == read.alignment.rnext 183 | and (read.alignment.is_reverse == read.alignment.mate_is_reverse) 184 | ] 185 | reads_faceaway = [ 186 | read for read in reads_mate_mapped 187 | if read.alignment.is_reverse != read.alignment.mate_is_reverse 188 | and (( 189 | read.alignment.is_reverse and read.alignment.tlen > 0) # 190 | # mapped to reverse strand but leftmost 191 | or (not read.alignment.is_reverse and read.alignment.tlen < 0)) 192 | # mapped to fwd strand but rightmost 193 | ] 194 | reads_softclipped = [ 195 | read for read in reads 196 | if any((op[0] == 4) for op in read.alignment.cigar) 197 | ] 198 | reads_duplicate = [read for read in reads 199 | 
if read.alignment.is_duplicate] 200 | yield {'chrom': chrom, 'pos': pos, 201 | 'reads_all': len(reads), 202 | 'reads_fwd': len(fwd(reads)), 203 | 'reads_rev': len(rev(reads)), 204 | 'reads_pp': len(reads_pp), 205 | 'reads_pp_fwd': len(fwd(reads_pp)), 206 | 'reads_pp_rev': len(rev(reads_pp)), 207 | 'reads_mate_unmapped': len(reads_mate_unmapped), 208 | 'reads_mate_unmapped_fwd': len(fwd(reads_mate_unmapped)), 209 | 'reads_mate_unmapped_rev': len(rev(reads_mate_unmapped)), 210 | 'reads_mate_other_chr': len(reads_mate_other_chr), 211 | 'reads_mate_other_chr_fwd': len(fwd(reads_mate_other_chr)), 212 | 'reads_mate_other_chr_rev': len(rev(reads_mate_other_chr)), 213 | 'reads_mate_same_strand': len(reads_mate_same_strand), 214 | 'reads_mate_same_strand_fwd': len(fwd(reads_mate_same_strand)), 215 | 'reads_mate_same_strand_rev': len(rev(reads_mate_same_strand)), 216 | 'reads_faceaway': len(reads_faceaway), 217 | 'reads_faceaway_fwd': len(fwd(reads_faceaway)), 218 | 'reads_faceaway_rev': len(rev(reads_faceaway)), 219 | 'reads_softclipped': len(reads_softclipped), 220 | 'reads_softclipped_fwd': len(fwd(reads_softclipped)), 221 | 'reads_softclipped_rev': len(rev(reads_softclipped)), 222 | 'reads_duplicate': len(reads_duplicate), 223 | 'reads_duplicate_fwd': len(fwd(reads_duplicate)), 224 | 'reads_duplicate_rev': len(rev(reads_duplicate)), 225 | } 226 | 227 | 228 | def test_stat_coverage_ext_strand(): 229 | compare_stats(pysamstats.stat_coverage_ext_strand, stat_coverage_ext_strand_refimpl) 230 | 231 | 232 | def stat_variation_refimpl(samfile, fafile, chrom=None, start=None, end=None, one_based=False, 233 | min_mapq=0, min_baseq=0, no_del=False, no_dup=False): 234 | start, end = normalise_coords(one_based, start, end) 235 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 236 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 237 | chrom = samfile.getrname(col.tid) 238 | pos = col.pos + 1 if one_based else col.pos 239 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 240 | reads_nodel = [read for read in reads if not read.is_del] 241 | reads_pp = pp(reads) 242 | reads_pp_nodel = [read for read in reads_pp if not read.is_del] 243 | ref = fafile.fetch(chrom, col.pos, col.pos+1).upper() 244 | debug('%r %r %r', chrom, pos, ref) 245 | # if reads: 246 | # debug(repr(reads[0].alignment.seq[reads[0].query_position])) 247 | matches = [read for read in reads_nodel 248 | if read.alignment.seq[read.query_position] == ref] 249 | matches_pp = [read for read in reads_pp_nodel 250 | if read.alignment.seq[read.query_position] == ref] 251 | mismatches = [read for read in reads_nodel 252 | if read.alignment.seq[read.query_position] != ref] 253 | mismatches_pp = [read for read in reads_pp_nodel 254 | if read.alignment.seq[read.query_position] != ref] 255 | deletions = [read for read in reads 256 | if read.is_del and not read.is_refskip] 257 | deletions_pp = [read for read in reads_pp 258 | if read.is_del and not read.is_refskip] 259 | insertions = [read for read in reads 260 | if read.indel > 0] 261 | insertions_pp = [read for read in reads_pp 262 | if read.indel > 0] 263 | debug([read.alignment.seq[read.query_position] 264 | for read in reads_nodel]) 265 | a = [read for read in reads_nodel 266 | if read.alignment.seq[read.query_position] == 'A'] 267 | a_pp = [read for read in reads_pp_nodel 268 | if read.alignment.seq[read.query_position] == 'A'] 269 | c = [read for read in reads_nodel 270 | if 
read.alignment.seq[read.query_position] == 'C'] 271 | c_pp = [read for read in reads_pp_nodel 272 | if read.alignment.seq[read.query_position] == 'C'] 273 | t = [read for read in reads_nodel 274 | if read.alignment.seq[read.query_position] == 'T'] 275 | t_pp = [read for read in reads_pp_nodel 276 | if read.alignment.seq[read.query_position] == 'T'] 277 | g = [read for read in reads_nodel 278 | if read.alignment.seq[read.query_position] == 'G'] 279 | g_pp = [read for read in reads_pp_nodel 280 | if read.alignment.seq[read.query_position] == 'G'] 281 | n = [read for read in reads_nodel 282 | if read.alignment.seq[read.query_position] == 'N'] 283 | n_pp = [read for read in reads_pp_nodel 284 | if read.alignment.seq[read.query_position] == 'N'] 285 | yield {'chrom': chrom, 'pos': pos, 'ref': ref, 286 | 'reads_all': len(reads), 287 | 'reads_pp': len(reads_pp), 288 | 'matches': len(matches), 289 | 'matches_pp': len(matches_pp), 290 | 'mismatches': len(mismatches), 291 | 'mismatches_pp': len(mismatches_pp), 292 | 'deletions': len(deletions), 293 | 'deletions_pp': len(deletions_pp), 294 | 'insertions': len(insertions), 295 | 'insertions_pp': len(insertions_pp), 296 | 'A': len(a), 'A_pp': len(a_pp), 297 | 'C': len(c), 'C_pp': len(c_pp), 298 | 'T': len(t), 'T_pp': len(t_pp), 299 | 'G': len(g), 'G_pp': len(g_pp), 300 | 'N': len(n), 'N_pp': len(n_pp)} 301 | 302 | 303 | def test_stat_variation(): 304 | compare_stats_withref(pysamstats.stat_variation, stat_variation_refimpl) 305 | 306 | 307 | def test_stat_variation_rna(): 308 | compare_stats_withref(pysamstats.stat_variation, stat_variation_refimpl, 309 | bam_fn='fixture/rna.bam') 310 | 311 | 312 | def stat_variation_strand_refimpl(samfile, fafile, chrom=None, start=None, end=None, 313 | one_based=False, min_mapq=0, min_baseq=0, no_del=False, 314 | no_dup=False): 315 | start, end = normalise_coords(one_based, start, end) 316 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 317 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 318 | chrom = samfile.getrname(col.tid) 319 | pos = col.pos + 1 if one_based else col.pos 320 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 321 | reads_nodel = [read for read in reads if not read.is_del] 322 | reads_pp = [read for read in reads if read.alignment.is_proper_pair] 323 | reads_pp_nodel = [read for read in reads 324 | if read.alignment.is_proper_pair and not read.is_del] 325 | ref = fafile.fetch(chrom, col.pos, col.pos+1).upper() 326 | matches = [read for read in reads_nodel 327 | if read.alignment.seq[read.query_position] == ref] 328 | matches_pp = [read for read in reads_pp_nodel 329 | if read.alignment.seq[read.query_position] == ref] 330 | mismatches = [read for read in reads_nodel 331 | if read.alignment.seq[read.query_position] != ref] 332 | mismatches_pp = [read for read in reads_pp_nodel 333 | if read.alignment.seq[read.query_position] != ref] 334 | deletions = [read for read in reads 335 | if read.is_del and not read.is_refskip] 336 | deletions_pp = [read for read in reads_pp 337 | if read.is_del and not read.is_refskip] 338 | insertions = [read for read in reads 339 | if read.indel > 0] 340 | insertions_pp = [read for read in reads_pp 341 | if read.indel > 0] 342 | a = [read for read in reads_nodel 343 | if read.alignment.seq[read.query_position] == 'A'] 344 | a_pp = [read for read in reads_pp_nodel 345 | if read.alignment.seq[read.query_position] == 'A'] 346 | c = [read for read in reads_nodel 347 | if 
read.alignment.seq[read.query_position] == 'C'] 348 | c_pp = [read for read in reads_pp_nodel 349 | if read.alignment.seq[read.query_position] == 'C'] 350 | t = [read for read in reads_nodel 351 | if read.alignment.seq[read.query_position] == 'T'] 352 | t_pp = [read for read in reads_pp_nodel 353 | if read.alignment.seq[read.query_position] == 'T'] 354 | g = [read for read in reads_nodel 355 | if read.alignment.seq[read.query_position] == 'G'] 356 | g_pp = [read for read in reads_pp_nodel 357 | if read.alignment.seq[read.query_position] == 'G'] 358 | n = [read for read in reads_nodel 359 | if read.alignment.seq[read.query_position] == 'N'] 360 | n_pp = [read for read in reads_pp_nodel 361 | if read.alignment.seq[read.query_position] == 'N'] 362 | yield { 363 | 'chrom': chrom, 'pos': pos, 'ref': ref, 364 | 'reads_all': len(reads), 365 | 'reads_fwd': len(fwd(reads)), 366 | 'reads_rev': len(rev(reads)), 367 | 'reads_pp': len(reads_pp), 368 | 'reads_pp_fwd': len(fwd(reads_pp)), 369 | 'reads_pp_rev': len(rev(reads_pp)), 370 | 'matches': len(matches), 371 | 'matches_fwd': len(fwd(matches)), 372 | 'matches_rev': len(rev(matches)), 373 | 'matches_pp': len(matches_pp), 374 | 'matches_pp_fwd': len(fwd(matches_pp)), 375 | 'matches_pp_rev': len(rev(matches_pp)), 376 | 'mismatches': len(mismatches), 377 | 'mismatches_fwd': len(fwd(mismatches)), 378 | 'mismatches_rev': len(rev(mismatches)), 379 | 'mismatches_pp': len(mismatches_pp), 380 | 'mismatches_pp_fwd': len(fwd(mismatches_pp)), 381 | 'mismatches_pp_rev': len(rev(mismatches_pp)), 382 | 'deletions': len(deletions), 383 | 'deletions_fwd': len(fwd(deletions)), 384 | 'deletions_rev': len(rev(deletions)), 385 | 'deletions_pp': len(deletions_pp), 386 | 'deletions_pp_fwd': len(fwd(deletions_pp)), 387 | 'deletions_pp_rev': len(rev(deletions_pp)), 388 | 'insertions': len(insertions), 389 | 'insertions_fwd': len(fwd(insertions)), 390 | 'insertions_rev': len(rev(insertions)), 391 | 'insertions_pp': len(insertions_pp), 392 | 'insertions_pp_fwd': len(fwd(insertions_pp)), 393 | 'insertions_pp_rev': len(rev(insertions_pp)), 394 | 'A': len(a), 'A_fwd': len(fwd(a)), 'A_rev': len(rev(a)), 395 | 'A_pp': len(a_pp), 'A_pp_fwd': len(fwd(a_pp)), 'A_pp_rev': len(rev(a_pp)), 396 | 'C': len(c), 'C_fwd': len(fwd(c)), 'C_rev': len(rev(c)), 397 | 'C_pp': len(c_pp), 'C_pp_fwd': len(fwd(c_pp)), 'C_pp_rev': len(rev(c_pp)), 398 | 'T': len(t), 'T_fwd': len(fwd(t)), 'T_rev': len(rev(t)), 399 | 'T_pp': len(t_pp), 'T_pp_fwd': len(fwd(t_pp)), 'T_pp_rev': len(rev(t_pp)), 400 | 'G': len(g), 'G_fwd': len(fwd(g)), 'G_rev': len(rev(g)), 401 | 'G_pp': len(g_pp), 'G_pp_fwd': len(fwd(g_pp)), 'G_pp_rev': len(rev(g_pp)), 402 | 'N': len(n), 'N_fwd': len(fwd(n)), 'N_rev': len(rev(n)), 403 | 'N_pp': len(n_pp), 'N_pp_fwd': len(fwd(n_pp)), 'N_pp_rev': len(rev(n_pp)) 404 | } 405 | 406 | 407 | def test_stat_variation_strand(): 408 | compare_stats_withref(pysamstats.stat_variation_strand, 409 | stat_variation_strand_refimpl) 410 | 411 | 412 | def test_stat_variation_strand_rna(): 413 | compare_stats_withref(pysamstats.stat_variation_strand, stat_variation_strand_refimpl, 414 | bam_fn='fixture/rna.bam') 415 | 416 | 417 | def stat_tlen_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, min_mapq=0, 418 | min_baseq=0, no_del=False, no_dup=False): 419 | start, end = normalise_coords(one_based, start, end) 420 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 421 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 422 | chrom 
= samfile.getrname(col.tid) 423 | pos = col.pos + 1 if one_based else col.pos 424 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 425 | # N.B., tlen only means something if mate is mapped to same chromosome 426 | reads_paired = [read for read in reads 427 | if not read.alignment.mate_is_unmapped 428 | and read.alignment.rnext == col.tid] 429 | tlen = [read.alignment.tlen for read in reads_paired] 430 | mean_tlen, rms_tlen, std_tlen = mean(tlen), rms(tlen), std(tlen) 431 | reads_pp = pp(reads) 432 | tlen_pp = [read.alignment.tlen for read in reads_pp] 433 | mean_tlen_pp, rms_tlen_pp, std_tlen_pp = mean(tlen_pp), rms(tlen_pp), std(tlen_pp) 434 | yield {'chrom': chrom, 'pos': pos, 435 | 'reads_all': len(reads), 436 | 'reads_paired': len(reads_paired), 437 | 'reads_pp': len(reads_pp), 438 | 'mean_tlen': mean_tlen, 439 | 'mean_tlen_pp': mean_tlen_pp, 440 | 'rms_tlen': rms_tlen, 441 | 'rms_tlen_pp': rms_tlen_pp, 442 | 'std_tlen': std_tlen, 443 | 'std_tlen_pp': std_tlen_pp} 444 | 445 | 446 | def test_stat_tlen(): 447 | compare_stats(pysamstats.stat_tlen, stat_tlen_refimpl) 448 | 449 | 450 | def stat_tlen_strand_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, 451 | min_mapq=0, min_baseq=0, no_del=False, no_dup=False): 452 | start, end = normalise_coords(one_based, start, end) 453 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 454 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 455 | chrom = samfile.getrname(col.tid) 456 | pos = col.pos + 1 if one_based else col.pos 457 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 458 | 459 | # all "paired" reads 460 | reads_paired = [read for read in reads 461 | if not read.alignment.mate_is_unmapped 462 | and read.alignment.rnext == col.tid] 463 | tlen = [read.alignment.tlen for read in reads_paired] 464 | mean_tlen, rms_tlen, std_tlen = mean(tlen), rms(tlen), std(tlen) 465 | reads_paired_fwd = fwd(reads_paired) 466 | tlen_fwd = [read.alignment.tlen for read in reads_paired_fwd] 467 | mean_tlen_fwd, rms_tlen_fwd, std_tlen_fwd = \ 468 | mean(tlen_fwd), rms(tlen_fwd), std(tlen_fwd) 469 | reads_paired_rev = rev(reads_paired) 470 | tlen_rev = [read.alignment.tlen for read in reads_paired_rev] 471 | mean_tlen_rev, rms_tlen_rev, std_tlen_rev = \ 472 | mean(tlen_rev), rms(tlen_rev), std(tlen_rev) 473 | 474 | # properly paired reads 475 | reads_pp = pp(reads) 476 | tlen_pp = [read.alignment.tlen for read in reads_pp] 477 | mean_tlen_pp, rms_tlen_pp, std_tlen_pp = \ 478 | mean(tlen_pp), rms(tlen_pp), std(tlen_pp) 479 | reads_pp_fwd = fwd(reads_pp) 480 | tlen_pp_fwd = [read.alignment.tlen for read in reads_pp_fwd] 481 | mean_tlen_pp_fwd, rms_tlen_pp_fwd, std_tlen_pp_fwd = \ 482 | mean(tlen_pp_fwd), rms(tlen_pp_fwd), std(tlen_pp_fwd) 483 | reads_pp_rev = rev(reads_pp) 484 | tlen_pp_rev = [read.alignment.tlen for read in reads_pp_rev] 485 | mean_tlen_pp_rev, rms_tlen_pp_rev, std_tlen_pp_rev = \ 486 | mean(tlen_pp_rev), rms(tlen_pp_rev), std(tlen_pp_rev) 487 | 488 | # yield record 489 | yield {'chrom': chrom, 'pos': pos, 490 | 'reads_all': len(reads), 491 | 'reads_fwd': len(fwd(reads)), 492 | 'reads_rev': len(rev(reads)), 493 | 'reads_paired': len(reads_paired), 494 | 'reads_paired_fwd': len(fwd(reads_paired)), 495 | 'reads_paired_rev': len(rev(reads_paired)), 496 | 'reads_pp': len(reads_pp), 497 | 'reads_pp_fwd': len(fwd(reads_pp)), 498 | 'reads_pp_rev': len(rev(reads_pp)), 499 | 'mean_tlen': mean_tlen, 500 | 'mean_tlen_fwd': 
mean_tlen_fwd, 501 | 'mean_tlen_rev': mean_tlen_rev, 502 | 'mean_tlen_pp': mean_tlen_pp, 503 | 'mean_tlen_pp_fwd': mean_tlen_pp_fwd, 504 | 'mean_tlen_pp_rev': mean_tlen_pp_rev, 505 | 'rms_tlen': rms_tlen, 506 | 'rms_tlen_fwd': rms_tlen_fwd, 507 | 'rms_tlen_rev': rms_tlen_rev, 508 | 'rms_tlen_pp': rms_tlen_pp, 509 | 'rms_tlen_pp_fwd': rms_tlen_pp_fwd, 510 | 'rms_tlen_pp_rev': rms_tlen_pp_rev, 511 | 'std_tlen': std_tlen, 512 | 'std_tlen_fwd': std_tlen_fwd, 513 | 'std_tlen_rev': std_tlen_rev, 514 | 'std_tlen_pp': std_tlen_pp, 515 | 'std_tlen_pp_fwd': std_tlen_pp_fwd, 516 | 'std_tlen_pp_rev': std_tlen_pp_rev} 517 | 518 | 519 | def test_stat_tlen_strand(): 520 | compare_stats(pysamstats.stat_tlen_strand, stat_tlen_strand_refimpl) 521 | 522 | 523 | def mapq0(reads): 524 | return [read for read in reads if read.alignment.mapq == 0] 525 | 526 | 527 | def mapq(reads): 528 | return [read.alignment.mapq for read in reads] 529 | 530 | 531 | def stat_mapq_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, min_mapq=0, 532 | min_baseq=0, no_del=False, no_dup=False): 533 | start, end = normalise_coords(one_based, start, end) 534 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 535 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 536 | chrom = samfile.getrname(col.tid) 537 | pos = col.pos + 1 if one_based else col.pos 538 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 539 | reads_pp = pp(reads) 540 | reads_mapq0 = mapq0(reads) 541 | reads_mapq0_pp = mapq0(reads_pp) 542 | mapq_all = mapq(reads) 543 | rms_mapq, max_mapq = rms(mapq_all), vmax(mapq_all) 544 | mapq_pp = mapq(reads_pp) 545 | rms_mapq_pp, max_mapq_pp = rms(mapq_pp), vmax(mapq_pp) 546 | yield {'chrom': chrom, 'pos': pos, 547 | 'reads_all': len(reads), 548 | 'reads_pp': len(reads_pp), 549 | 'reads_mapq0': len(reads_mapq0), 550 | 'reads_mapq0_pp': len(reads_mapq0_pp), 551 | 'rms_mapq': rms_mapq, 552 | 'rms_mapq_pp': rms_mapq_pp, 553 | 'max_mapq': max_mapq, 554 | 'max_mapq_pp': max_mapq_pp, 555 | } 556 | 557 | 558 | def test_stat_mapq(): 559 | compare_stats(pysamstats.stat_mapq, stat_mapq_refimpl) 560 | 561 | 562 | def stat_mapq_strand_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, 563 | min_mapq=0, min_baseq=0, no_del=False, no_dup=False): 564 | start, end = normalise_coords(one_based, start, end) 565 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 566 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 567 | chrom = samfile.getrname(col.tid) 568 | pos = col.pos + 1 if one_based else col.pos 569 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 570 | reads_fwd = fwd(reads) 571 | reads_rev = rev(reads) 572 | reads_pp = pp(reads) 573 | reads_pp_fwd = fwd(reads_pp) 574 | reads_pp_rev = rev(reads_pp) 575 | reads_mapq0 = mapq0(reads) 576 | reads_mapq0_fwd = mapq0(reads_fwd) 577 | reads_mapq0_rev = mapq0(reads_rev) 578 | reads_mapq0_pp = mapq0(reads_pp) 579 | reads_mapq0_pp_fwd = mapq0(reads_pp_fwd) 580 | reads_mapq0_pp_rev = mapq0(reads_pp_rev) 581 | mapq_all = mapq(reads) 582 | rms_mapq, max_mapq = rms(mapq_all), vmax(mapq_all) 583 | mapq_fwd = mapq(reads_fwd) 584 | rms_mapq_fwd, max_mapq_fwd = rms(mapq_fwd), vmax(mapq_fwd) 585 | mapq_rev = mapq(reads_rev) 586 | rms_mapq_rev, max_mapq_rev = rms(mapq_rev), vmax(mapq_rev) 587 | mapq_pp = mapq(reads_pp) 588 | rms_mapq_pp, max_mapq_pp = rms(mapq_pp), vmax(mapq_pp) 589 | mapq_pp_fwd 
= mapq(reads_pp_fwd) 590 | rms_mapq_pp_fwd, max_mapq_pp_fwd = rms(mapq_pp_fwd), vmax(mapq_pp_fwd) 591 | mapq_pp_rev = mapq(reads_pp_rev) 592 | rms_mapq_pp_rev, max_mapq_pp_rev = rms(mapq_pp_rev), vmax(mapq_pp_rev) 593 | yield {'chrom': chrom, 'pos': pos, 594 | 'reads_all': len(reads), 595 | 'reads_fwd': len(reads_fwd), 596 | 'reads_rev': len(reads_rev), 597 | 'reads_pp': len(reads_pp), 598 | 'reads_pp_fwd': len(reads_pp_fwd), 599 | 'reads_pp_rev': len(reads_pp_rev), 600 | 'reads_mapq0': len(reads_mapq0), 601 | 'reads_mapq0_fwd': len(reads_mapq0_fwd), 602 | 'reads_mapq0_rev': len(reads_mapq0_rev), 603 | 'reads_mapq0_pp': len(reads_mapq0_pp), 604 | 'reads_mapq0_pp_fwd': len(reads_mapq0_pp_fwd), 605 | 'reads_mapq0_pp_rev': len(reads_mapq0_pp_rev), 606 | 'rms_mapq': rms_mapq, 607 | 'rms_mapq_fwd': rms_mapq_fwd, 608 | 'rms_mapq_rev': rms_mapq_rev, 609 | 'rms_mapq_pp': rms_mapq_pp, 610 | 'rms_mapq_pp_fwd': rms_mapq_pp_fwd, 611 | 'rms_mapq_pp_rev': rms_mapq_pp_rev, 612 | 'max_mapq': max_mapq, 613 | 'max_mapq_fwd': max_mapq_fwd, 614 | 'max_mapq_rev': max_mapq_rev, 615 | 'max_mapq_pp': max_mapq_pp, 616 | 'max_mapq_pp_fwd': max_mapq_pp_fwd, 617 | 'max_mapq_pp_rev': max_mapq_pp_rev, 618 | } 619 | 620 | 621 | def test_stat_mapq_strand(): 622 | compare_stats(pysamstats.stat_mapq_strand, stat_mapq_strand_refimpl) 623 | 624 | 625 | def baseq(reads): 626 | l = [ord(read.alignment.qual[read.query_position]) - 33 627 | if read.query_position is not None 628 | else None 629 | for read in reads] 630 | return l 631 | 632 | 633 | def nodel(reads): 634 | return [read for read in reads if not read.is_del] 635 | 636 | 637 | def nodup(reads): 638 | return [read for read in reads if not read.alignment.is_duplicate] 639 | 640 | 641 | def stat_baseq_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, min_mapq=0, 642 | min_baseq=0, no_del=False, no_dup=False): 643 | start, end = normalise_coords(one_based, start, end) 644 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 645 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 646 | chrom = samfile.getrname(col.tid) 647 | pos = col.pos + 1 if one_based else col.pos 648 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 649 | # N.B., make sure aligned base is not a deletion 650 | reads_nodel = nodel(reads) 651 | reads_pp = pp(reads) 652 | reads_pp_nodel = nodel(reads_pp) 653 | rms_baseq = rms(baseq(reads_nodel)) 654 | rms_baseq_pp = rms(baseq(reads_pp_nodel)) 655 | yield {'chrom': chrom, 'pos': pos, 656 | 'reads_all': len(reads), 657 | 'reads_pp': len(reads_pp), 658 | 'rms_baseq': rms_baseq, 659 | 'rms_baseq_pp': rms_baseq_pp} 660 | 661 | 662 | def test_stat_baseq(): 663 | compare_stats(pysamstats.stat_baseq, stat_baseq_refimpl) 664 | 665 | 666 | def stat_baseq_strand_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, 667 | min_mapq=0, min_baseq=0, no_del=False, no_dup=False): 668 | start, end = normalise_coords(one_based, start, end) 669 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 670 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 671 | chrom = samfile.getrname(col.tid) 672 | pos = col.pos + 1 if one_based else col.pos 673 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 674 | reads_fwd = fwd(reads) 675 | reads_rev = rev(reads) 676 | reads_pp = pp(reads) 677 | reads_pp_fwd = fwd(reads_pp) 678 | reads_pp_rev = rev(reads_pp) 679 | reads_nodel = 
nodel(reads) 680 | reads_fwd_nodel = nodel(reads_fwd) 681 | reads_rev_nodel = nodel(reads_rev) 682 | reads_pp_nodel = nodel(reads_pp) 683 | reads_pp_fwd_nodel = nodel(reads_pp_fwd) 684 | reads_pp_rev_nodel = nodel(reads_pp_rev) 685 | rms_baseq = rms(baseq(reads_nodel)) 686 | rms_baseq_fwd = rms(baseq(reads_fwd_nodel)) 687 | rms_baseq_rev = rms(baseq(reads_rev_nodel)) 688 | rms_baseq_pp = rms(baseq(reads_pp_nodel)) 689 | rms_baseq_pp_fwd = rms(baseq(reads_pp_fwd_nodel)) 690 | rms_baseq_pp_rev = rms(baseq(reads_pp_rev_nodel)) 691 | yield { 692 | 'chrom': chrom, 'pos': pos, 693 | 'reads_all': len(reads), 694 | 'reads_fwd': len(reads_fwd), 695 | 'reads_rev': len(reads_rev), 696 | 'reads_pp': len(reads_pp), 697 | 'reads_pp_fwd': len(reads_pp_fwd), 698 | 'reads_pp_rev': len(reads_pp_rev), 699 | 'rms_baseq': rms_baseq, 700 | 'rms_baseq_fwd': rms_baseq_fwd, 701 | 'rms_baseq_rev': rms_baseq_rev, 702 | 'rms_baseq_pp': rms_baseq_pp, 703 | 'rms_baseq_pp_fwd': rms_baseq_pp_fwd, 704 | 'rms_baseq_pp_rev': rms_baseq_pp_rev, 705 | } 706 | 707 | 708 | def test_stat_baseq_strand(): 709 | compare_stats(pysamstats.stat_baseq_strand, stat_baseq_strand_refimpl) 710 | 711 | 712 | def stat_baseq_ext_refimpl(samfile, fafile, chrom=None, start=None, end=None, one_based=False, 713 | min_mapq=0, min_baseq=0, no_del=False, no_dup=False): 714 | start, end = normalise_coords(one_based, start, end) 715 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 716 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 717 | chrom = samfile.getrname(col.tid) 718 | pos = col.pos + 1 if one_based else col.pos 719 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 720 | reads_nodel = [read for read in reads if not read.is_del] 721 | reads_pp = pp(reads) 722 | reads_pp_nodel = [read for read in reads_pp if not read.is_del] 723 | ref = fafile.fetch(chrom, col.pos, col.pos+1).upper() 724 | matches = [read for read in reads_nodel 725 | if read.alignment.seq[read.query_position] == ref] 726 | matches_pp = [read for read in reads_pp_nodel 727 | if read.alignment.seq[read.query_position] == ref] 728 | mismatches = [read for read in reads_nodel 729 | if read.alignment.seq[read.query_position] != ref] 730 | mismatches_pp = [read for read in reads_pp_nodel 731 | if read.alignment.seq[read.query_position] != ref] 732 | 733 | rms_baseq = rms(baseq(reads_nodel)) 734 | rms_baseq_pp = rms(baseq(reads_pp_nodel)) 735 | rms_baseq_matches = rms(baseq(matches)) 736 | rms_baseq_matches_pp = rms(baseq(matches_pp)) 737 | rms_baseq_mismatches = rms(baseq(mismatches)) 738 | rms_baseq_mismatches_pp = rms(baseq(mismatches_pp)) 739 | yield {'chrom': chrom, 'pos': pos, 'ref': ref, 740 | 'reads_all': len(reads), 741 | 'reads_pp': len(reads_pp), 742 | 'matches': len(matches), 743 | 'matches_pp': len(matches_pp), 744 | 'mismatches': len(mismatches), 745 | 'mismatches_pp': len(mismatches_pp), 746 | 'rms_baseq': rms_baseq, 747 | 'rms_baseq_pp': rms_baseq_pp, 748 | 'rms_baseq_matches': rms_baseq_matches, 749 | 'rms_baseq_matches_pp': rms_baseq_matches_pp, 750 | 'rms_baseq_mismatches': rms_baseq_mismatches, 751 | 'rms_baseq_mismatches_pp': rms_baseq_mismatches_pp, 752 | } 753 | 754 | 755 | def test_stat_baseq_ext(): 756 | compare_stats_withref(pysamstats.stat_baseq_ext, stat_baseq_ext_refimpl) 757 | 758 | 759 | def stat_baseq_ext_strand_refimpl(samfile, fafile, chrom=None, start=None, end=None, 760 | one_based=False, min_mapq=0, min_baseq=0, no_del=False, 761 | no_dup=False): 762 | 
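# Reference implementation for pysamstats.stat_baseq_ext_strand: for each pileup
# column it reports match/mismatch counts against the reference base and RMS base
# qualities, each broken down by strand (fwd/rev) and by proper-pair (pp) status.
# A minimal usage sketch of the optimised function under test, using the same
# fixture files as the rest of this suite (the printed fields are illustrative):
#
#     from pysam import Samfile, Fastafile
#     import pysamstats
#     for rec in pysamstats.stat_baseq_ext_strand(
#             Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
#             chrom='Pf3D7_01_v3', start=0, end=2000):
#         print(rec['pos'], rec['rms_baseq_matches_fwd'],
#               rec['rms_baseq_mismatches_rev'])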
start, end = normalise_coords(one_based, start, end) 763 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 764 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 765 | chrom = samfile.getrname(col.tid) 766 | pos = col.pos + 1 if one_based else col.pos 767 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 768 | reads_pp = pp(reads) 769 | reads_nodel = [read for read in reads if not read.is_del] 770 | reads_nodel_fwd = fwd(reads_nodel) 771 | reads_nodel_rev = rev(reads_nodel) 772 | reads_nodel_pp = pp(reads_nodel) 773 | reads_nodel_pp_fwd = fwd(reads_nodel_pp) 774 | reads_nodel_pp_rev = rev(reads_nodel_pp) 775 | reads_pp_nodel = [read for read in reads_pp if not read.is_del] 776 | ref = fafile.fetch(chrom, col.pos, col.pos+1).upper() 777 | matches = [read for read in reads_nodel 778 | if read.alignment.seq[read.query_position] == ref] 779 | matches_fwd = fwd(matches) 780 | matches_rev = rev(matches) 781 | matches_pp = pp(matches) 782 | matches_pp_fwd = fwd(matches_pp) 783 | matches_pp_rev = rev(matches_pp) 784 | mismatches = [read for read in reads_nodel 785 | if read.alignment.seq[read.query_position] != ref] 786 | mismatches_fwd = fwd(mismatches) 787 | mismatches_rev = rev(mismatches) 788 | mismatches_pp = pp(mismatches) 789 | mismatches_pp_fwd = fwd(mismatches_pp) 790 | mismatches_pp_rev = rev(mismatches_pp) 791 | 792 | rms_baseq = rms(baseq(reads_nodel)) 793 | rms_baseq_fwd = rms(baseq(reads_nodel_fwd)) 794 | rms_baseq_rev = rms(baseq(reads_nodel_rev)) 795 | rms_baseq_pp = rms(baseq(reads_pp_nodel)) 796 | rms_baseq_pp_fwd = rms(baseq(reads_nodel_pp_fwd)) 797 | rms_baseq_pp_rev = rms(baseq(reads_nodel_pp_rev)) 798 | rms_baseq_matches = rms(baseq(matches)) 799 | rms_baseq_matches_fwd = rms(baseq(matches_fwd)) 800 | rms_baseq_matches_rev = rms(baseq(matches_rev)) 801 | rms_baseq_matches_pp = rms(baseq(matches_pp)) 802 | rms_baseq_matches_pp_fwd = rms(baseq(matches_pp_fwd)) 803 | rms_baseq_matches_pp_rev = rms(baseq(matches_pp_rev)) 804 | rms_baseq_mismatches = rms(baseq(mismatches)) 805 | rms_baseq_mismatches_fwd = rms(baseq(mismatches_fwd)) 806 | rms_baseq_mismatches_rev = rms(baseq(mismatches_rev)) 807 | rms_baseq_mismatches_pp = rms(baseq(mismatches_pp)) 808 | rms_baseq_mismatches_pp_fwd = rms(baseq(mismatches_pp_fwd)) 809 | rms_baseq_mismatches_pp_rev = rms(baseq(mismatches_pp_rev)) 810 | yield {'chrom': chrom, 'pos': pos, 'ref': ref, 811 | 'reads_all': len(reads), 812 | 'reads_fwd': len(fwd(reads)), 813 | 'reads_rev': len(rev(reads)), 814 | 'reads_pp': len(reads_pp), 815 | 'reads_pp_fwd': len(fwd(reads_pp)), 816 | 'reads_pp_rev': len(rev(reads_pp)), 817 | 'matches': len(matches), 818 | 'matches_fwd': len(matches_fwd), 819 | 'matches_rev': len(matches_rev), 820 | 'matches_pp': len(matches_pp), 821 | 'matches_pp_fwd': len(matches_pp_fwd), 822 | 'matches_pp_rev': len(matches_pp_rev), 823 | 'mismatches': len(mismatches), 824 | 'mismatches_fwd': len(mismatches_fwd), 825 | 'mismatches_rev': len(mismatches_rev), 826 | 'mismatches_pp': len(mismatches_pp), 827 | 'mismatches_pp_fwd': len(mismatches_pp_fwd), 828 | 'mismatches_pp_rev': len(mismatches_pp_rev), 829 | 'rms_baseq': rms_baseq, 830 | 'rms_baseq_fwd': rms_baseq_fwd, 831 | 'rms_baseq_rev': rms_baseq_rev, 832 | 'rms_baseq_pp': rms_baseq_pp, 833 | 'rms_baseq_pp_fwd': rms_baseq_pp_fwd, 834 | 'rms_baseq_pp_rev': rms_baseq_pp_rev, 835 | 'rms_baseq_matches': rms_baseq_matches, 836 | 'rms_baseq_matches_fwd': rms_baseq_matches_fwd, 837 | 
'rms_baseq_matches_rev': rms_baseq_matches_rev, 838 | 'rms_baseq_matches_pp': rms_baseq_matches_pp, 839 | 'rms_baseq_matches_pp_fwd': rms_baseq_matches_pp_fwd, 840 | 'rms_baseq_matches_pp_rev': rms_baseq_matches_pp_rev, 841 | 'rms_baseq_mismatches': rms_baseq_mismatches, 842 | 'rms_baseq_mismatches_fwd': rms_baseq_mismatches_fwd, 843 | 'rms_baseq_mismatches_rev': rms_baseq_mismatches_rev, 844 | 'rms_baseq_mismatches_pp': rms_baseq_mismatches_pp, 845 | 'rms_baseq_mismatches_pp_fwd': rms_baseq_mismatches_pp_fwd, 846 | 'rms_baseq_mismatches_pp_rev': rms_baseq_mismatches_pp_rev, 847 | } 848 | 849 | 850 | def test_stat_baseq_ext_strand(): 851 | compare_stats_withref(pysamstats.stat_baseq_ext_strand, 852 | stat_baseq_ext_strand_refimpl) 853 | 854 | 855 | from collections import Counter 856 | 857 | 858 | def stat_coverage_gc_refimpl(samfile, fafile, chrom=None, start=None, end=None, one_based=False, 859 | window_size=300, window_offset=150, min_mapq=0, min_baseq=0, 860 | no_del=False, no_dup=False): 861 | start, end = normalise_coords(one_based, start, end) 862 | 863 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 864 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 865 | chrom = samfile.getrname(col.tid) 866 | pos = col.pos + 1 if one_based else col.pos 867 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 868 | 869 | if col.pos <= window_offset: 870 | continue # until we get a bit further into the chromosome 871 | 872 | ref_window_start = col.pos - window_offset 873 | ref_window_end = ref_window_start + window_size 874 | ref_window = fafile.fetch(chrom, ref_window_start, 875 | ref_window_end).lower() 876 | 877 | if len(ref_window) == 0: 878 | break # because we've hit the end of the chromosome 879 | 880 | debug(ref_window) 881 | base_counter = Counter(ref_window) 882 | debug(base_counter) 883 | gc_count = base_counter['g'] + base_counter['c'] 884 | debug(gc_count) 885 | gc_percent = int(round(gc_count * 100. 
/ window_size)) 886 | yield {'chrom': chrom, 'pos': pos, 887 | 'reads_all': len(reads), 888 | 'reads_pp': len(pp(reads)), 889 | 'gc': gc_percent} 890 | 891 | 892 | def test_stat_coverage_gc(): 893 | compare_stats_withref(pysamstats.stat_coverage_gc, stat_coverage_gc_refimpl) 894 | 895 | 896 | def test_stat_coverage_gc_uppercase_fasta(): 897 | compare_stats_withref(pysamstats.stat_coverage_gc, stat_coverage_gc_refimpl, 898 | fasta_fn='fixture/ref.upper.fa') 899 | 900 | 901 | pileup_functions = [ 902 | (pysamstats.load_coverage, 0), 903 | (pysamstats.load_coverage_strand, 0), 904 | (pysamstats.load_coverage_ext, 0), 905 | (pysamstats.load_coverage_ext_strand, 0), 906 | (pysamstats.load_variation, 1), 907 | (pysamstats.load_variation_strand, 1), 908 | (pysamstats.load_tlen, 0), 909 | (pysamstats.load_tlen_strand, 0), 910 | (pysamstats.load_mapq, 0), 911 | (pysamstats.load_mapq_strand, 0), 912 | (pysamstats.load_baseq, 0), 913 | (pysamstats.load_baseq_strand, 0), 914 | (pysamstats.load_baseq_ext, 1), 915 | (pysamstats.load_baseq_ext_strand, 1), 916 | (pysamstats.load_coverage_gc, 1), 917 | ] 918 | 919 | def test_pileup_kwargs(): 920 | # check that keyword arguments are being passed through 921 | kwargs = { 922 | 'chrom': 'Pf3D7_01_v3', 923 | 'start': 2000, 924 | 'end': 2100, 925 | 'min_mapq': 1, 926 | 'min_baseq': 1, 927 | 'no_del': True, 928 | 'no_dup': True 929 | } 930 | for f, needs_ref in pileup_functions: 931 | if needs_ref: 932 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), **kwargs) 933 | else: 934 | a = f(Samfile('fixture/test.bam'), **kwargs) 935 | assert isinstance(a, np.ndarray) 936 | assert a.dtype.names is not None 937 | 938 | 939 | def test_pileup_truncate(): 940 | kwargs_notrunc = {'chrom': 'Pf3D7_01_v3', 941 | 'start': 2000, 942 | 'end': 2100, 943 | 'one_based': False, 944 | 'truncate': False} 945 | kwargs_trunc = {'chrom': 'Pf3D7_01_v3', 946 | 'start': 2000, 947 | 'end': 2100, 948 | 'one_based': False, 949 | 'truncate': True} 950 | for f, needs_ref in pileup_functions: 951 | debug(f.__name__) 952 | # test no truncate 953 | if needs_ref: 954 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), 955 | **kwargs_notrunc) 956 | else: 957 | a = f(Samfile('fixture/test.bam'), **kwargs_notrunc) 958 | debug(a[:5]) 959 | eq_(1952, a['pos'][0]) 960 | eq_(2154, a['pos'][-1]) 961 | # test truncate 962 | if needs_ref: 963 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), 964 | **kwargs_trunc) 965 | else: 966 | a = f(Samfile('fixture/test.bam'), **kwargs_trunc) 967 | eq_(2000, a['pos'][0]) 968 | eq_(2099, a['pos'][-1]) 969 | 970 | 971 | def test_pileup_pad(): 972 | kwargs_nopad = {'chrom': 'Pf3D7_01_v3', 973 | 'start': 0, 974 | 'end': 20000, 975 | 'one_based': False, 976 | 'pad': False} 977 | kwargs_pad = {'chrom': 'Pf3D7_01_v3', 978 | 'start': 0, 979 | 'end': 20000, 980 | 'one_based': False, 981 | 'pad': True} 982 | for f, needs_ref in pileup_functions: 983 | debug(f.__name__) 984 | # test no pad 985 | if needs_ref: 986 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), 987 | **kwargs_nopad) 988 | else: 989 | a = f(Samfile('fixture/test.bam'), **kwargs_nopad) 990 | eq_(924, a['pos'][0]) 991 | eq_(9935, a['pos'][-1]) 992 | # test pad 993 | if needs_ref: 994 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), 995 | **kwargs_pad) 996 | else: 997 | a = f(Samfile('fixture/test.bam'), **kwargs_pad) 998 | eq_(0, a['pos'][0]) 999 | eq_(19999, a['pos'][-1]) 1000 | assert np.all(np.diff(a['pos']) == 1) 1001 | 1002 | 1003 | 
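# Illustrative sketch (not part of the original test suite): with pad=True the
# load functions emit a record for every position in the requested range, so a
# complete per-base coverage track can be loaded directly, e.g.:
#
#     a = pysamstats.load_coverage(Samfile('fixture/test.bam'),
#                                  chrom='Pf3D7_01_v3', start=0, end=20000,
#                                  one_based=False, pad=True)
#     assert a['pos'][0] == 0 and a['pos'][-1] == 19999
#     assert np.all(np.diff(a['pos']) == 1)  # positions are contiguous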
def test_pileup_pad_wg(): 1004 | # whole genome 1005 | expected = stat_coverage_refimpl(Samfile('fixture/test.bam')) 1006 | actual = pysamstats.stat_coverage(Samfile('fixture/test.bam')) 1007 | compare_iterators(expected, actual) 1008 | kwargs_nopad = {'pad': False} 1009 | kwargs_pad = {'pad': True} 1010 | for f, needs_ref in pileup_functions: 1011 | debug(f.__name__) 1012 | # test no pad 1013 | if needs_ref: 1014 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), 1015 | **kwargs_nopad) 1016 | else: 1017 | a = f(Samfile('fixture/test.bam'), **kwargs_nopad) 1018 | eq_(sorted(set(a['chrom'])), [b'Pf3D7_01_v3', b'Pf3D7_02_v3']) 1019 | eq_(924, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][0]) 1020 | eq_(9935, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][-1]) 1021 | eq_(926, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][0]) 1022 | eq_(10074, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][-1]) 1023 | # test pad 1024 | if needs_ref: 1025 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), 1026 | **kwargs_pad) 1027 | else: 1028 | a = f(Samfile('fixture/test.bam'), **kwargs_pad) 1029 | eq_(sorted(set(a['chrom'])), 1030 | [b'Pf3D7_01_v3', b'Pf3D7_02_v3', b'Pf3D7_03_v3']) 1031 | eq_(0, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][0]) 1032 | eq_(50000, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][-1]) 1033 | eq_(0, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][0]) 1034 | eq_(60000, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][-1]) 1035 | eq_(0, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][0]) 1036 | eq_(70000, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][-1]) 1037 | 1038 | 1039 | def test_pileup_limit(): 1040 | 1041 | for f, needs_ref in pileup_functions: 1042 | debug(f.__name__) 1043 | 1044 | # test with effectively no limit 1045 | kwargs = dict(fields=['reads_all'], max_depth=1000000) 1046 | if needs_ref: 1047 | a = f(Samfile('fixture/deep.bam'), Fastafile('fixture/ref.fa'), 1048 | **kwargs) 1049 | else: 1050 | a = f(Samfile('fixture/deep.bam'), **kwargs) 1051 | eq_(26169, a[70]) 1052 | 1053 | # test with specific limit 1054 | kwargs = dict(fields=['reads_all'], max_depth=12000) 1055 | if needs_ref: 1056 | a = f(Samfile('fixture/deep.bam'), Fastafile('fixture/ref.fa'), 1057 | **kwargs) 1058 | else: 1059 | a = f(Samfile('fixture/deep.bam'), **kwargs) 1060 | eq_(12046, a[70]) # no idea why limit is not exact 1061 | 1062 | # test with default limit 1063 | kwargs = dict(fields=['reads_all']) 1064 | if needs_ref: 1065 | a = f(Samfile('fixture/deep.bam'), Fastafile('fixture/ref.fa'), 1066 | **kwargs) 1067 | else: 1068 | a = f(Samfile('fixture/deep.bam'), **kwargs) 1069 | eq_(8052, a[70]) # no idea why limit is not exact 1070 | 1071 | 1072 | def test_load_cov_long_contig_name(): 1073 | # test that long chrom labels auto handled. 
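    # (i.e., the 'chrom' field dtype is sized to fit the longest reference
    # name, unless the caller supplies an explicit dtype)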
1074 | 1075 | label = 'AS2_scf7180000696055' 1076 | bampath = 'fixture/longcontignames.bam' 1077 | 1078 | x = pysamstats.load_coverage(bampath, chrom=label) 1079 | assert len(label) == x.dtype["chrom"].itemsize 1080 | 1081 | x = pysamstats.load_coverage(Samfile(bampath), chrom=label, dtype={"chrom": "a10"}) 1082 | assert 10 == x.dtype["chrom"].itemsize 1083 | 1084 | 1085 | def test_load_cov_using_steppers(): 1086 | 1087 | # test that expected steppers give different/consistent results 1088 | # this is the only bam file that differs between all/nofilter 1089 | bampath = "fixture/longcontignames.bam" 1090 | seq = 'AS2_scf7180000695891' 1091 | pos = 14311 1092 | steppers = ["all", "nofilter", "samtools"] 1093 | reads_all = [7, 8, 4] 1094 | reads_pp = [4, 5, 4] 1095 | 1096 | for exp_all, exp_pp, step in zip(reads_all, reads_pp, steppers): 1097 | a = pysamstats.load_coverage(Samfile(bampath), chrom=seq, stepper=step, pad=True) 1098 | eq_(exp_all, a[pos]["reads_all"]) 1099 | eq_(exp_pp, a[pos]["reads_pp"]) 1100 | 1101 | with assert_raises(ValueError): 1102 | pysamstats.load_coverage(Samfile(bampath), chrom=seq, stepper="notastepper") 1103 | -------------------------------------------------------------------------------- /pysamstats/test/test_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | import logging 4 | import sys 5 | 6 | from pysam import Samfile 7 | import numpy as np 8 | from os.path import isfile 9 | 10 | 11 | import pysamstats 12 | from pysamstats.config import stats_types, stats_types_withref 13 | 14 | 15 | # no test_prefix so not run during unit tests 16 | def generate_fixtures(): 17 | 18 | bampath = "fixture/test.bam" 19 | fastapath = "fixture/ref.fa" 20 | archive = "fixture/regression.npz" 21 | assert not isfile(archive) 22 | 23 | # simple stats 24 | dat = {} 25 | for q in stats_types: 26 | if q in stats_types_withref: 27 | dat[q] = getattr(pysamstats, "load_" + q)(Samfile(bampath), fafile=fastapath) 28 | else: 29 | dat[q] = getattr(pysamstats, "load_" + q)(Samfile(bampath)) 30 | 31 | np.savez_compressed(archive, **dat) 32 | 33 | 34 | def test_against_fixtures(): 35 | 36 | # load fixtures from numpy array 37 | bampath = "fixture/test.bam" 38 | fastapath = "fixture/ref.fa" 39 | archive = "fixture/regression.npz" 40 | 41 | testset = np.load(archive) 42 | 43 | for q in stats_types: 44 | if q in stats_types_withref: 45 | x = getattr(pysamstats, "load_" + q)(Samfile(bampath), fafile=fastapath) 46 | else: 47 | x = getattr(pysamstats, "load_" + q)(Samfile(bampath)) 48 | 49 | # loop through all fields 50 | for key in testset[q].dtype.names: 51 | expect = testset[q][key] 52 | actual = x[key] 53 | try: 54 | np.testing.assert_array_equal(expect, actual, err_msg=key) 55 | except AssertionError: 56 | print(expect[expect != actual]) 57 | print(actual[expect != actual]) 58 | raise 59 | -------------------------------------------------------------------------------- /pysamstats/test/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | import logging 4 | import sys 5 | from math import sqrt 6 | 7 | 8 | import numpy as np 9 | from numpy import around as round 10 | from nose.tools import eq_, assert_almost_equal 11 | 12 | 13 | from pysam import Samfile, Fastafile 14 | 15 | 16 | logger = logging.getLogger(__name__) 17 | 
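# convenience alias; the reference implementations and comparison helpers
# below call debug() to log diagnostic detail when a comparison fails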
debug = logger.debug 18 | 19 | 20 | # PY2/3 compatibility 21 | PY2 = sys.version_info[0] == 2 22 | if PY2: 23 | # noinspection PyUnresolvedReferences 24 | from itertools import izip_longest 25 | else: 26 | from itertools import zip_longest as izip_longest 27 | 28 | 29 | def compare_iterators(expected, actual): 30 | for e, a in izip_longest(expected, actual, fillvalue=None): 31 | assert e is not None, ('expected value is None', e, a) 32 | assert a is not None, ('actual value is None', e, a) 33 | for k, v in e.items(): 34 | try: 35 | if isinstance(v, float): 36 | assert_almost_equal(v, a[k]) 37 | else: 38 | eq_(v, a[k]) 39 | except: 40 | debug('mismatch %r, expected %r, found %r' % (k, v, a[k])) 41 | debug('expected: %r' % sorted(e.items())) 42 | debug('actual: %r' % sorted(a.items())) 43 | raise 44 | for k in a: # check no unexpected fields 45 | try: 46 | assert k in e 47 | except: 48 | debug('missing %r' % k) 49 | debug('expected: %r' % sorted(e.items())) 50 | debug('actual: %r' % sorted(a.items())) 51 | raise 52 | 53 | 54 | def normalise_coords(one_based, start, end): 55 | """Normalise start and end coordinates. 56 | 57 | Parameters 58 | ---------- 59 | one_based : bool 60 | start : int 61 | end : int 62 | 63 | Returns 64 | ------- 65 | start : int 66 | end : int 67 | 68 | """ 69 | if one_based: 70 | start = start - 1 if start is not None else None 71 | end = end - 1 if end is not None else None 72 | return start, end 73 | 74 | 75 | def fwd(reads): 76 | return [read for read in reads if not read.alignment.is_reverse] 77 | 78 | 79 | def rev(reads): 80 | return [read for read in reads if read.alignment.is_reverse] 81 | 82 | 83 | def pp(reads): 84 | return [read for read in reads if read.alignment.is_proper_pair] 85 | 86 | 87 | def rms(a): 88 | if a: 89 | return int(round(sqrt(np.mean(np.power(a, 2))))) 90 | else: 91 | return 0 92 | 93 | 94 | def mean(a): 95 | if a: 96 | return int(round(np.mean(a))) 97 | else: 98 | return 0 99 | 100 | 101 | def std(a): 102 | if len(a) >= 2: 103 | std = np.std(a, ddof=1) 104 | if np.isnan(std): 105 | return 0 106 | return int(round(std)) 107 | else: 108 | return 0 109 | 110 | 111 | def vmax(a): 112 | if a: 113 | return max(a) 114 | else: 115 | return 0 116 | 117 | 118 | def rootmean(sqsum, count): 119 | if count > 0: 120 | return int(round(sqrt(sqsum / count))) 121 | else: 122 | return 0 123 | -------------------------------------------------------------------------------- /pysamstats/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | from operator import itemgetter 4 | from pysam import AlignmentFile 5 | 6 | 7 | def flatten(recs, *fields): 8 | """Convert a record (dict) iterator to a row (tuple) iterator. 
9 | 10 | Parameters 11 | ---------- 12 | 13 | recs : iterator of dicts 14 | records generator 15 | fields : list of strings 16 | names of fields to select 17 | 18 | Returns 19 | ------- 20 | 21 | rows : iterator of tuples 22 | rows generator 23 | 24 | """ 25 | 26 | getter = itemgetter(*fields) 27 | it = (getter(rec) for rec in recs) 28 | return it 29 | 30 | 31 | def load_stats(statfun, default_dtype, user_dtype, user_fields, **kwargs): 32 | 33 | import numpy as np 34 | 35 | # determine fields to load 36 | default_fields = [t[0] for t in default_dtype] 37 | if user_fields is None: 38 | fields = default_fields 39 | else: 40 | fields = user_fields 41 | if any([f not in default_fields for f in fields]): 42 | raise ValueError('invalid fields: %r' % fields) 43 | 44 | # determine dtype 45 | dtype = dict(default_dtype) 46 | 47 | # check if contig label dtype is appropriate length 48 | max_seqid_len = determine_max_seqid(kwargs["alignmentfile"]) 49 | dtype["chrom"] = "a{0}".format(max_seqid_len) 50 | 51 | if user_dtype is not None: 52 | dtype.update(dict(user_dtype)) 53 | 54 | # handle single field requested 55 | if len(fields) == 1: 56 | dtype = dtype[fields[0]] 57 | else: 58 | dtype = [(f, dtype[f]) for f in fields] 59 | 60 | # setup record generator 61 | recs = statfun(**kwargs) 62 | 63 | # flatten records 64 | it = flatten(recs, *fields) 65 | 66 | # load into a Numpy array 67 | a = np.fromiter(it, dtype=dtype) 68 | 69 | # view as recarray for convenience 70 | if len(fields) > 1: 71 | a = a.view(np.recarray) 72 | 73 | return a 74 | 75 | 76 | def determine_max_seqid(alignmentfile): 77 | 78 | if isinstance(alignmentfile, str): 79 | alignmentfile = AlignmentFile(alignmentfile) 80 | 81 | return max([len(x) for x in alignmentfile.references]) 82 | -------------------------------------------------------------------------------- /release.txt: -------------------------------------------------------------------------------- 1 | version=`grep __version__ pysamstats/__init__.py | sed -e "s/__version__[ ]=[ ]'\(.*\)'/\1/"` 2 | echo $version 3 | python setup.py build_ext --inplace 4 | nosetests -v 5 | git commit -a -m v$version 6 | git push 7 | git tag -a v$version -m v$version 8 | git push --tags 9 | python setup.py register sdist upload 10 | # update readme with command line help 11 | # increment version and add .dev0 12 | git commit -a -m 'increment version'; git push 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pysam 2 | numpy 3 | nose 4 | tables 5 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | Cython==0.29.23 2 | numpy==1.21.0 3 | pysam==0.16.0.1 4 | nose==1.3.7 5 | tables==3.6.1 6 | -------------------------------------------------------------------------------- /sandbox.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "sandbox" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "code", 12 | "collapsed": false, 13 | "input": [ 14 | "import pysam\n", 15 | "import pysamstats\n", 16 | "from itertools import islice" 17 | ], 18 | "language": "python", 19 | "metadata": {}, 20 | "outputs": [], 21 | "prompt_number": 1 22 | }, 23 | { 24 | "cell_type": "code", 25 | "collapsed": false, 26 | 
"input": [ 27 | "kwargs = {'chrom': 'Pf3D7_01_v3',\n", 28 | "# 'start': 0,\n", 29 | "# 'end': 10000,\n", 30 | " 'one_based': False}\n", 31 | "it = pysamstats.stat_coverage_binned(pysam.Samfile('fixture/test.bam'), pysam.Fastafile('fixture/ref.fa'), **kwargs)\n", 32 | "for rec in islice(it, 10):\n", 33 | " print rec" 34 | ], 35 | "language": "python", 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "output_type": "stream", 40 | "stream": "stdout", 41 | "text": [ 42 | "{'gc': 47, 'reads_pp': 428, 'chrom': 'Pf3D7_01_v3', 'pos': 150, 'reads_all': 538}\n", 43 | "{'gc': 30, 'reads_pp': 620, 'chrom': 'Pf3D7_01_v3', 'pos': 450, 'reads_all': 665}\n", 44 | "{'gc': 28, 'reads_pp': 667, 'chrom': 'Pf3D7_01_v3', 'pos': 750, 'reads_all': 703}\n", 45 | "{'gc': 27, 'reads_pp': 672, 'chrom': 'Pf3D7_01_v3', 'pos': 1050, 'reads_all': 726}\n", 46 | "{'gc': 30, 'reads_pp': 711, 'chrom': 'Pf3D7_01_v3', 'pos': 1350, 'reads_all': 728}\n", 47 | "{'gc': 32, 'reads_pp': 725, 'chrom': 'Pf3D7_01_v3', 'pos': 1650, 'reads_all': 735}\n", 48 | "{'gc': 29, 'reads_pp': 846, 'chrom': 'Pf3D7_01_v3', 'pos': 1950, 'reads_all': 856}\n", 49 | "{'gc': 28, 'reads_pp': 774, 'chrom': 'Pf3D7_01_v3', 'pos': 2250, 'reads_all': 782}\n", 50 | "{'gc': 27, 'reads_pp': 764, 'chrom': 'Pf3D7_01_v3', 'pos': 2550, 'reads_all': 769}\n", 51 | "{'gc': 31, 'reads_pp': 793, 'chrom': 'Pf3D7_01_v3', 'pos': 2850, 'reads_all': 798}\n" 52 | ] 53 | } 54 | ], 55 | "prompt_number": 2 56 | }, 57 | { 58 | "cell_type": "code", 59 | "collapsed": false, 60 | "input": [ 61 | "%timeit pysamstats.count_reads(pysam.Samfile('fixture/test.bam'), chrom='Pf3D7_01_v3')" 62 | ], 63 | "language": "python", 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "output_type": "stream", 68 | "stream": "stdout", 69 | "text": [ 70 | "10 loops, best of 3: 18.8 ms per loop\n" 71 | ] 72 | } 73 | ], 74 | "prompt_number": 3 75 | }, 76 | { 77 | "cell_type": "code", 78 | "collapsed": false, 79 | "input": [ 80 | "import matplotlib.pyplot as plt\n", 81 | "a = pysamstats.load_coverage(pysam.Samfile('fixture/test.bam'))\n", 82 | "plt.plot(a.pos, a.reads_all)\n", 83 | "plt.show()" 84 | ], 85 | "language": "python", 86 | "metadata": {}, 87 | "outputs": [], 88 | "prompt_number": 15 89 | }, 90 | { 91 | "cell_type": "code", 92 | "collapsed": false, 93 | "input": [ 94 | "a" 95 | ], 96 | "language": "python", 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "output_type": "pyout", 101 | "prompt_number": 14, 102 | "text": [ 103 | "rec.array([('Pf3D7_01_v3', 0, 1, 1), ('Pf3D7_01_v3', 1, 1, 1),\n", 104 | " ('Pf3D7_01_v3', 2, 6, 5), ..., ('Pf3D7_01_v3', 10072, 6, 6),\n", 105 | " ('Pf3D7_01_v3', 10073, 5, 5), ('Pf3D7_01_v3', 10074, 2, 2)], \n", 106 | " dtype=[('chrom', 'S12'), ('pos', ' example.coverage.txt 56 | pysamstats --type coverage --chromosome Pf3D7_v3_01 --start 100000 --end 200000 example.bam > example.coverage.txt 57 | 58 | Version: {version} (pysam {pysamversion}) 59 | 60 | """.format(version=pysamstats.__version__, pysamversion=pysam.__version__) 61 | 62 | OptionParser.format_epilog = lambda self, formatter: self.epilog 63 | parser = OptionParser(usage=usage, description=description, epilog=epilog) 64 | 65 | parser.add_option( 66 | '-t', '--type', dest='type', default='coverage', 67 | help='Type of statistics to print, one of: %s.' 
68 | 
69 | parser.add_option(
70 |     '-c', '--chromosome', dest='chromosome', default=None,
71 |     help='Chromosome name.')
72 | 
73 | parser.add_option(
74 |     '-s', '--start', dest='start', type='int', default=None,
75 |     help='Start position (1-based).')
76 | 
77 | parser.add_option(
78 |     '-e', '--end', dest='end', type='int', default=None,
79 |     help='End position (1-based).')
80 | 
81 | parser.add_option(
82 |     '-z', '--zero-based', dest='zero_based', action='store_true', default=False,
83 |     help='Use zero-based coordinates (default is false, i.e., use one-based coords).')
84 | 
85 | parser.add_option(
86 |     '-u', '--truncate', dest='truncate', action='store_true', default=False,
87 |     help='Truncate pileup-based stats so no records are emitted outside the specified range.')
88 | 
89 | parser.add_option(
90 |     '-S', '--stepper', dest='stepper', action='store', default='all',
91 |     help='Stepper to provide to underlying pysam call. Options are: '
92 |          '"all" (default): all reads are returned, except where flags BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, '
93 |          'or BAM_FDUP are set; "nofilter" applies no filter to returned reads; '
94 |          '"samtools": filter & read processing as in _csamtools_ pileup. This requires a fasta file. '
95 |          'For complete details see the pysam documentation.')
96 | 
97 | parser.add_option(
98 |     '-d', '--pad', dest='pad', action='store_true', default=False,
99 |     help='Pad pileup-based stats so a record is emitted for every position (default is only '
100 |          'covered positions).')
101 | 
102 | parser.add_option(
103 |     '-D', '--max-depth', dest='max_depth', type=int, default=8000,
104 |     help='Maximum read depth permitted in pileup-based statistics. The default limit is 8000.')
105 | 
106 | parser.add_option(
107 |     '-f', '--fasta', dest='fasta', default=None,
108 |     help='Reference sequence file, only required for some statistics.')
109 | 
110 | parser.add_option(
111 |     '-o', '--omit-header', dest='omit_header', default=False, action='store_true',
112 |     help='Omit header row from output.')
113 | 
114 | parser.add_option(
115 |     '-p', '--progress', dest='progress', type='int', metavar='N', default=None,
116 |     help='Report progress every N rows.')
117 | 
118 | parser.add_option(
119 |     '--window-size', dest='window_size', type='int', metavar='N', default=300,
120 |     help='Size of window for binned statistics (default is 300).')
121 | 
122 | parser.add_option(
123 |     '--window-offset', dest='window_offset', type=int, default=None, metavar='N',
124 |     help='Window offset to use for deciding which genome position to report binned statistics '
125 |          'against. The default is 150, i.e., the middle of a 300bp window.')
126 | 
127 | parser.add_option(
128 |     '--format', dest='format', default='tsv',
129 |     help='Output format, one of {tsv, csv, hdf5} (defaults to tsv). N.B., hdf5 requires '
130 |          'PyTables to be installed.')
131 | 
132 | parser.add_option(
133 |     '--output', dest='output',
134 |     help='Path to output file. If not provided, write to stdout.')
135 | 
136 | parser.add_option(
137 |     '--fields', dest='fields', default=None,
138 |     help='Comma-separated list of fields to output (defaults to all fields).')
139 | 
140 | parser.add_option(
141 |     '--hdf5-group', dest='hdf5_group', default='/',
142 |     help='Name of HDF5 group to write to (defaults to the root group).')
143 | 
144 | parser.add_option(
145 |     '--hdf5-dataset', dest='hdf5_dataset', default='data',
146 |     help='Name of HDF5 dataset to create (defaults to "data").')
147 | 
148 | parser.add_option(
149 |     '--hdf5-complib', dest='hdf5_complib', default='zlib',
150 |     help='HDF5 compression library (defaults to zlib).')
151 | 
152 | parser.add_option(
153 |     '--hdf5-complevel', dest='hdf5_complevel', type=int, default=1,
154 |     help='HDF5 compression level (defaults to 1).')
155 | 
156 | parser.add_option(
157 |     '--hdf5-chunksize', dest='hdf5_chunksize', type=int, default=2**20,
158 |     help='Size of chunks in number of bytes (defaults to 2**20).')
159 | 
160 | parser.add_option(
161 |     '--min-mapq', dest='min_mapq', type=int, default=0,
162 |     help='Only reads with mapping quality equal to or greater than this value will be counted '
163 |          '(0 by default).')
164 | 
165 | parser.add_option(
166 |     '--min-baseq', dest='min_baseq', type=int, default=0,
167 |     help='Only reads with base quality equal to or greater than this value will be counted '
168 |          '(0 by default). Only applies to pileup-based statistics.')
169 | 
170 | parser.add_option(
171 |     '--no-dup', dest='no_dup', default=False, action='store_true',
172 |     help="Don't count reads flagged as duplicate.")
173 | 
174 | parser.add_option(
175 |     '--no-del', dest='no_del', default=False, action='store_true',
176 |     help="Don't count reads aligned with a deletion at the given position. Only applies to "
177 |          "pileup-based statistics.")
178 | 
179 | options, args = parser.parse_args()
180 | 
181 | if len(args) != 1:
182 |     parser.error('missing SAM or BAM file operand\n\nTry "pysamstats --help" for more '
183 |                  'information.')
184 | 
185 | samfile = args[0]
186 | one_based = not options.zero_based
187 | write_header = not options.omit_header
188 | if options.fields:
189 |     fields = options.fields.split(',')
190 | else:
191 |     fields = None
192 | 
193 | try:
194 | 
195 |     if options.type not in stats_types:
196 |         parser.error('unsupported statistics type: "%s"\nTry one of %s or '
197 |                      '"pysamstats --help" for more information.'
198 |                      % (options.type, stats_types))
199 | 
200 |     elif options.stepper not in stepper_types:
201 |         parser.error('unsupported stepper type: "%s"\nMust be one of %s or '
202 |                      '"pysamstats --help" for more information.'
203 | % (options.stepper, stepper_types)) 204 | 205 | elif options.type in stats_types_withref \ 206 | and options.fasta is None: 207 | parser.error('missing --fasta option\n\nTry "pysamstats --help"' 208 | ' for more information.') 209 | 210 | else: 211 | 212 | # setup common parameters 213 | kwargs = dict( 214 | chrom=options.chromosome, 215 | start=options.start, 216 | end=options.end, 217 | one_based=one_based, 218 | window_size=options.window_size, 219 | window_offset=options.window_offset, 220 | min_mapq=options.min_mapq, 221 | no_dup=options.no_dup 222 | ) 223 | # some options only make sense if not performing binned analysis 224 | if not options.type.endswith('_binned'): 225 | kwargs['truncate'] = options.truncate 226 | kwargs['pad'] = options.pad 227 | kwargs['max_depth'] = options.max_depth 228 | kwargs['min_baseq'] = options.min_baseq 229 | kwargs['no_del'] = options.no_del 230 | kwargs['stepper'] = options.stepper 231 | 232 | if options.format.lower() in ['tsv', 'csv']: 233 | 234 | # setup 235 | dialect = {'tsv': 'excel-tab', 'csv': 'excel'}[options.format] 236 | if options.output is None: 237 | output = sys.stdout 238 | needs_closing = False 239 | else: 240 | output = open(options.output, 'w') 241 | needs_closing = True 242 | 243 | try: 244 | write_csv( 245 | options.type, 246 | output, 247 | samfile, 248 | fafile=options.fasta, 249 | write_header=write_header, 250 | dialect=dialect, 251 | progress=options.progress, 252 | fields=fields, 253 | **kwargs 254 | ) 255 | finally: 256 | if needs_closing: 257 | output.close() 258 | 259 | elif options.format.lower() == 'hdf5': 260 | 261 | assert options.output is not None, '--output must be specified' 262 | 263 | write_hdf5( 264 | options.type, 265 | options.output, 266 | samfile, 267 | fafile=options.fasta, 268 | progress=options.progress, 269 | hdf5_group=options.hdf5_group, 270 | hdf5_dataset=options.hdf5_dataset, 271 | hdf5_complib=options.hdf5_complib, 272 | hdf5_complevel=options.hdf5_complevel, 273 | hdf5_chunksize=options.hdf5_chunksize, 274 | fields=fields, 275 | **kwargs 276 | ) 277 | 278 | except IOError as e: 279 | if e.errno == errno.EPIPE: 280 | pass # ignore broken pipe 281 | else: 282 | raise 283 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension, find_packages 2 | 3 | 4 | # require pysam is pre-installed 5 | try: 6 | import pysam 7 | except ImportError: 8 | raise Exception('pysam not found; please install pysam first') 9 | from distutils.version import LooseVersion 10 | required_pysam_version = '0.15' 11 | if LooseVersion(pysam.__version__) < LooseVersion(required_pysam_version): 12 | raise Exception('pysam version >= %s is required; found %s' % 13 | (required_pysam_version, pysam.__version__)) 14 | 15 | 16 | def get_version(): 17 | """Extract version number from source file.""" 18 | from ast import literal_eval 19 | with open('pysamstats/__init__.py') as f: 20 | for line in f: 21 | if line.startswith('__version__'): 22 | return literal_eval(line.partition('=')[2].lstrip()) 23 | raise ValueError("__version__ not found") 24 | 25 | 26 | try: 27 | from Cython.Build import cythonize 28 | print('[pysamstats] build with Cython') 29 | extensions = cythonize([ 30 | Extension('pysamstats.opt', 31 | sources=['pysamstats/opt.pyx'], 32 | include_dirs=pysam.get_include(), 33 | define_macros=pysam.get_defines())] 34 | ) 35 | 36 | except ImportError: 37 | 
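    # Cython is not available; fall back to compiling the pre-generated C
    # source (pysamstats/opt.c) shipped with the source distribution.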
print('[pysamstats] build from C')
38 |     extensions = [Extension('pysamstats.opt',
39 |                             sources=['pysamstats/opt.c'],
40 |                             include_dirs=pysam.get_include(),
41 |                             define_macros=pysam.get_defines())]
42 | 
43 | 
44 | setup(
45 |     name='pysamstats',
46 |     version=get_version(),
47 |     author='Alistair Miles',
48 |     author_email='alimanfoo@googlemail.com',
49 |     url='https://github.com/alimanfoo/pysamstats',
50 |     license='MIT License',
51 |     description='A Python utility for calculating statistics against genome '
52 |                 'position based on sequence alignments from a SAM, '
53 |                 'BAM or CRAM file.',
54 |     scripts=['scripts/pysamstats'],
55 |     package_dir={'': '.'},
56 |     install_requires=[
57 |         "pysam (<0.16)",
58 |         "numpy",
59 |     ],
60 |     packages=find_packages(),
61 |     classifiers=[
62 |         'Intended Audience :: Developers',
63 |         'License :: OSI Approved :: MIT License',
64 |         'Programming Language :: Python :: 2.7',
65 |         'Programming Language :: Python :: 3.5',
66 |         'Programming Language :: Python :: 3.6',
67 |         'Programming Language :: Python :: 3.7',
68 |         'Topic :: Software Development :: Libraries :: Python Modules'
69 |     ],
70 |     ext_modules=extensions,
71 | )
72 | 
--------------------------------------------------------------------------------