├── .github └── workflows │ └── python-package.yml ├── .gitignore ├── .pyup.yml ├── .travis.yml ├── Dockerfile ├── MANIFEST.in ├── README.md ├── fixture ├── deep.bam ├── deep.bam.bai ├── gc.txt ├── longcontignames.bam ├── longcontignames.bam.bai ├── ref.fa ├── ref.fa.fai ├── ref.upper.fa ├── ref.upper.fa.fai ├── regression.npz ├── rna.bam ├── rna.bam.bai ├── test.bam └── test.bam.bai ├── performance.py ├── pyproject.toml ├── pysamstats ├── __init__.py ├── binned.py ├── config.py ├── io.py ├── opt.c ├── opt.pyx ├── pileup.py ├── test │ ├── __init__.py │ ├── test_binned.py │ ├── test_io.py │ ├── test_pileup.py │ ├── test_regression.py │ └── util.py └── util.py ├── release.txt ├── requirements.txt ├── requirements_dev.txt ├── sandbox.ipynb ├── scripts └── pysamstats └── setup.py /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: [3.7, 3.8, 3.9] 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest 31 | pip install -r requirements_dev.txt 32 | python setup.py build_ext --inplace 33 | pip install -v . 34 | # - name: Lint with flake8 35 | # run: | 36 | # # stop the build if there are Python syntax errors or undefined names 37 | # flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 38 | # # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 39 | # flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 40 | - name: Test 41 | run: | 42 | nosetests -v 43 | pysamstats --help 44 | pysamstats --type coverage fixture/test.bam > coverage.txt 45 | pysamstats --type coverage --output=coverage.txt fixture/test.bam 46 | pysamstats --type coverage --output=coverage.h5 --format=hdf5 fixture/test.bam 47 | pysamstats --type coverage --fields=pos,reads_all fixture/test.bam > coverage_fields.txt 48 | pysamstats --type coverage_binned --fasta=fixture/ref.fa fixture/test.bam > coverage_binned.txt 49 | pysamstats --type coverage_binned --fasta=fixture/ref.fa --output=coverage_binned.txt fixture/test.bam 50 | pysamstats --type coverage_binned --fasta=fixture/ref.fa --output=coverage_binned.h5 --format=hdf5 fixture/test.bam 51 | pysamstats --type=coverage --min-mapq=27 --min-baseq=17 --no-dup --no-del fixture/test.bam > coverage_filtered.txt 52 | pysamstats --type=coverage_binned --fasta=fixture/ref.fa --min-mapq=27 --no-dup fixture/test.bam > coverage_binned_filtered.txt 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | 16 | # Installer logs 17 | pip-log.txt 18 | 19 | # Unit test / coverage reports 20 | .coverage 21 | .tox 22 | 23 | #Translations 24 | *.mo 25 | 26 | #Mr Developer 27 | .mr.developer.cfg 28 | 29 | MANIFEST 30 | spike* 31 | *.so 32 | *.html 33 | *~ 34 | *.prof 35 | 36 | .idea 37 | .project 38 | .pydevproject 39 | -------------------------------------------------------------------------------- /.pyup.yml: -------------------------------------------------------------------------------- 1 | # autogenerated pyup.io config file 2 | # see https://pyup.io/docs/configuration/ for all available options 3 | 4 | schedule: every month 5 | 6 | requirements: 7 | - requirements.txt: 8 | pin: False 9 | update: False 10 | - requirements_dev.txt: 11 | pin: True 12 | update: all 13 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | branches: 4 | only: 5 | - master 6 | 7 | sudo: false 8 | 9 | dist: xenial 10 | 11 | addons: 12 | apt: 13 | packages: 14 | - libhdf5-serial-dev 15 | 16 | matrix: 17 | include: 18 | - python: 3.6 19 | - python: 3.7 20 | - python: 3.8 21 | - python: 3.9 22 | sudo: true 23 | 24 | install: 25 | - export HDF5_DIR=/usr/lib/x86_64-linux-gnu/hdf5/serial 26 | - pip install -U pip setuptools wheel 27 | - pip install -r requirements_dev.txt 28 | - python setup.py build_ext --inplace 29 | - pip install -v . 
30 | 
31 | script:
32 |   - nosetests -v
33 |   - pysamstats --help
34 |   - pysamstats --type coverage fixture/test.bam > coverage.txt
35 |   - pysamstats --type coverage --output=coverage.txt fixture/test.bam
36 |   - pysamstats --type coverage --output=coverage.h5 --format=hdf5 fixture/test.bam
37 |   - pysamstats --type coverage --fields=pos,reads_all fixture/test.bam > coverage_fields.txt
38 |   - pysamstats --type coverage_binned --fasta=fixture/ref.fa fixture/test.bam > coverage_binned.txt
39 |   - pysamstats --type coverage_binned --fasta=fixture/ref.fa --output=coverage_binned.txt fixture/test.bam
40 |   - pysamstats --type coverage_binned --fasta=fixture/ref.fa --output=coverage_binned.h5 --format=hdf5 fixture/test.bam
41 |   - pysamstats --type=coverage --min-mapq=27 --min-baseq=17 --no-dup --no-del fixture/test.bam > coverage_filtered.txt
42 |   - pysamstats --type=coverage_binned --fasta=fixture/ref.fa --min-mapq=27 --no-dup fixture/test.bam > coverage_binned_filtered.txt
43 | 
-------------------------------------------------------------------------------- /Dockerfile: --------------------------------------------------------------------------------
1 | FROM continuumio/miniconda:4.5.4
2 | LABEL description="pysamstats - A fast utility for extracting statistics from a SAM or BAM file."
3 | 
4 | RUN conda install -c bioconda pysamstats
5 | 
-------------------------------------------------------------------------------- /MANIFEST.in: --------------------------------------------------------------------------------
1 | include *.md
2 | include pyproject.toml
3 | recursive-include pysamstats *.pyx *.pxd *.c
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | pysamstats
2 | ==========
3 | 
4 | A Python utility for calculating statistics against genome positions
5 | based on sequence alignments from a SAM or BAM file.
6 | 
7 | * Source: https://github.com/alimanfoo/pysamstats
8 | * Download: http://pypi.python.org/pypi/pysamstats
9 | * Release notes: https://github.com/alimanfoo/pysamstats/releases
10 | 
11 | Installation
12 | ------------
13 | 
14 | The easiest way to install pysamstats is via conda, e.g.:
15 | 
16 | ```
17 | $ conda install -c bioconda pysamstats
18 | ```
19 | 
20 | Alternatively, pysamstats can be installed via pip:
21 | 
22 | ```
23 | $ pip install pysamstats
24 | ```
25 | 
26 | Or clone the git repo and install from source:
27 | 
28 | ```
29 | $ git clone git://github.com/alimanfoo/pysamstats.git
30 | $ cd pysamstats
31 | $ python setup.py install
32 | $ nosetests -v # optional, run test suite
33 | ```
34 | 
35 | If you have problems installing pysam, please email the
36 | [pysam user group](https://groups.google.com/forum/#!forum/pysam-user-group).
37 | 
38 | N.B., some functions also require [numpy](http://www.numpy.org) and
39 | [pytables](http://www.pytables.org) to be installed.
40 | 
41 | Usage
42 | -----
43 | 
44 | From the command line:
45 | 
46 | ```
47 | $ pysamstats --help
48 | Usage: pysamstats [options] FILE
49 | 
50 | Calculate statistics against genome positions based on sequence alignments
51 | from a SAM or BAM file and print them to stdout.
52 | 53 | Options: 54 | -h, --help show this help message and exit 55 | -t TYPE, --type=TYPE Type of statistics to print, one of: alignment_binned, 56 | baseq, baseq_ext, baseq_ext_strand, baseq_strand, 57 | coverage, coverage_binned, coverage_ext, 58 | coverage_ext_binned, coverage_ext_strand, coverage_gc, 59 | coverage_strand, mapq, mapq_binned, mapq_strand, tlen, 60 | tlen_binned, tlen_strand, variation, variation_strand. 61 | -c CHROMOSOME, --chromosome=CHROMOSOME 62 | Chromosome name. 63 | -s START, --start=START 64 | Start position (1-based). 65 | -e END, --end=END End position (1-based). 66 | -z, --zero-based Use zero-based coordinates (default is false, i.e., 67 | use one-based coords). 68 | -u, --truncate Truncate pileup-based stats so no records are emitted 69 | outside the specified range. 70 | -S STEPPER, --stepper=STEPPER 71 | Stepper to provide to underlying pysam call. Options 72 | are:"all" (default): all reads are returned, except 73 | where flags BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, 74 | BAM_FDUP set; "nofilter" applies no filter to returned 75 | reads; "samtools": filter & read processing as in 76 | _csamtools_ pileup. This requires a fasta file. For 77 | complete details see the pysam documentation. 78 | -d, --pad Pad pileup-based stats so a record is emitted for 79 | every position (default is only covered positions). 80 | -D MAX_DEPTH, --max-depth=MAX_DEPTH 81 | Maximum read depth permitted in pileup-based 82 | statistics. The default limit is 8000. 83 | -f FASTA, --fasta=FASTA 84 | Reference sequence file, only required for some 85 | statistics. 86 | -o, --omit-header Omit header row from output. 87 | -p N, --progress=N Report progress every N rows. 88 | --window-size=N Size of window for binned statistics (default is 300). 89 | --window-offset=N Window offset to use for deciding which genome 90 | position to report binned statistics against. The 91 | default is 150, i.e., the middle of 300bp window. 92 | --format=FORMAT Output format, one of {tsv, csv, hdf5} (defaults to 93 | tsv). N.B., hdf5 requires PyTables to be installed. 94 | --output=OUTPUT Path to output file. If not provided, write to stdout. 95 | --fields=FIELDS Comma-separated list of fields to output (defaults to 96 | all fields). 97 | --hdf5-group=HDF5_GROUP 98 | Name of HDF5 group to write to (defaults to the root 99 | group). 100 | --hdf5-dataset=HDF5_DATASET 101 | Name of HDF5 dataset to create (defaults to "data"). 102 | --hdf5-complib=HDF5_COMPLIB 103 | HDF5 compression library (defaults to zlib). 104 | --hdf5-complevel=HDF5_COMPLEVEL 105 | HDF5 compression level (defaults to 5). 106 | --hdf5-chunksize=HDF5_CHUNKSIZE 107 | Size of chunks in number of bytes (defaults to 2**20). 108 | --min-mapq=MIN_MAPQ Only reads with mapping quality equal to or greater 109 | than this value will be counted (0 by default). 110 | --min-baseq=MIN_BASEQ 111 | Only reads with base quality equal to or greater than 112 | this value will be counted (0 by default). Only 113 | applies to pileup-based statistics. 114 | --no-dup Don't count reads flagged as duplicate. 115 | --no-del Don't count reads aligned with a deletion at the given 116 | position. Only applies to pileup-based statistics. 117 | 118 | Pileup-based statistics types (each row has statistics over reads in a pileup column): 119 | 120 | * coverage - Number of reads aligned to each genome position 121 | (total and properly paired). 122 | * coverage_strand - As coverage but with forward/reverse strand counts. 
123 | * coverage_ext - Various additional coverage metrics, including
124 |                  coverage for reads not properly paired (mate
125 |                  unmapped, mate on other chromosome, ...).
126 | * coverage_ext_strand - As coverage_ext but with forward/reverse strand counts.
127 | * coverage_gc - As coverage but also includes a column for %GC.
128 | * variation - Numbers of matches, mismatches, deletions,
129 |               insertions, etc.
130 | * variation_strand - As variation but with forward/reverse strand counts.
131 | * tlen - Insert size statistics.
132 | * tlen_strand - As tlen but with statistics by forward/reverse strand.
133 | * mapq - Mapping quality statistics.
134 | * mapq_strand - As mapq but with statistics by forward/reverse strand.
135 | * baseq - Base quality statistics.
136 | * baseq_strand - As baseq but with statistics by forward/reverse strand.
137 | * baseq_ext - Extended base quality statistics, including qualities
138 |               of bases matching and mismatching reference.
139 | * baseq_ext_strand - As baseq_ext but with statistics by forward/reverse strand.
140 | 
141 | Binned statistics types (each row has statistics over reads aligned starting within a genome window):
142 | 
143 | * coverage_binned - As coverage but binned.
144 | * coverage_ext_binned - As coverage_ext but binned.
145 | * mapq_binned - Similar to mapq but binned.
146 | * alignment_binned - Aggregated counts from cigar strings.
147 | * tlen_binned - As tlen but binned.
148 | 
149 | Examples:
150 | 
151 |     pysamstats --type coverage example.bam > example.coverage.txt
152 |     pysamstats --type coverage --chromosome Pf3D7_v3_01 --start 100000 --end 200000 example.bam > example.coverage.txt
153 | 
154 | Version: 1.1.2 (pysam 0.15.1)
155 | ```
156 | 
157 | From Python:
158 | 
159 | ```python
160 | import pysam
161 | import pysamstats
162 | 
163 | mybam = pysam.AlignmentFile('/path/to/your/bamfile.bam')
164 | 
165 | # iterate over statistics, one record at a time
166 | for rec in pysamstats.stat_coverage(mybam, chrom='Pf3D7_01_v3', start=10000, end=20000):
167 |     print(rec['chrom'], rec['pos'], rec['reads_all'], rec['reads_pp'])
168 | 
169 | 
170 | ```
171 | 
172 | For convenience, functions are provided for loading data directly into numpy arrays, e.g.:
173 | 
174 | ```python
175 | import pysam
176 | import pysamstats
177 | import matplotlib.pyplot as plt
178 | 
179 | mybam = pysam.AlignmentFile('/path/to/your/bamfile.bam')
180 | a = pysamstats.load_coverage(mybam, chrom='Pf3D7_01_v3', start=10000, end=20000)
181 | plt.plot(a.pos, a.reads_all)
182 | plt.show()
183 | ```
184 | 
185 | For pileup-based statistics functions, note the following:
186 | 
187 | * By default a row is emitted for all genome positions covered by reads overlapping the selected region. This means rows will be emitted for positions outside the selected region, but statistics may not be accurate as not all reads overlapping that position will have been counted. To truncate output to exactly the selected region, provide a ``truncate=True`` keyword argument.
188 | * By default a row is only emitted for genome positions covered by at least one read. To emit a row for every genome position, provide a ``pad=True`` keyword argument.
189 | * By default the number of reads in a pileup column is limited to 8000. To increase this limit, provide a ``max_depth=100000`` keyword argument (or whatever number is suitable for your situation).
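
For example, combining these options in one call (a sketch; the BAM path and region are placeholders):

```python
import pysam
import pysamstats

mybam = pysam.AlignmentFile('/path/to/your/bamfile.bam')

# emit a row for every position in the selected region, and only for that
# region, allowing up to 100000 reads per pileup column
for rec in pysamstats.stat_coverage(mybam, chrom='Pf3D7_01_v3',
                                    start=10000, end=20000,
                                    truncate=True, pad=True, max_depth=100000):
    print(rec['chrom'], rec['pos'], rec['reads_all'])
```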
190 | 191 | Field definitions 192 | ----------------- 193 | 194 | The suffix **_fwd** means the field is restricted to reads mapped to 195 | the forward strand, and **_rev** means the field is restricted to 196 | reads mapped to the reverse strand. E.g., **reads_fwd** means the 197 | number of reads mapped to the forward strand. 198 | 199 | The suffix **_pp** means the field is restricted to reads flagged as 200 | properly paired. 201 | 202 | * **chrom** - Chromosome name. 203 | 204 | * **pos** - Position within chromosome. One-based by default when 205 | using the command line, zero-based by default when using the 206 | python API. 207 | 208 | * **reads_all** - Number of reads aligned at the position. N.b., this 209 | is really the total, i.e., includes reads where the mate is 210 | unmapped or otherwise not properly paired. 211 | 212 | * **reads_pp** - Number of reads flagged as properly paired by the 213 | aligner. 214 | 215 | * **reads_mate_unmapped** - Number of reads where the mate is 216 | unmapped. 217 | 218 | * **reads_mate_other_chr** - Number of reads where the mate is mapped 219 | to another chromosome. 220 | 221 | * **reads_mate_same_strand** - Number of reads where the mate is 222 | mapped to the same strand. 223 | 224 | * **reads_faceaway** - Number of reads where the read and its mate are 225 | mapped facing away from each other. 226 | 227 | * **reads_softclipped** - Number of reads where there is some 228 | softclipping at some point in the read's alignment (not 229 | necessarily at this position). 230 | 231 | * **reads_duplicate** - Number of reads that are flagged as duplicate. 232 | 233 | * **gc** - Percentage GC content in the reference at this position 234 | (depends on window length and offset specified). 235 | 236 | * **matches** - Number of reads where the aligned base matches the 237 | reference. 238 | 239 | * **mismatches** - Number of reads where the aligned base does not 240 | match the reference (but is not a deletion). 241 | 242 | * **deletions** - Number of reads where there is a deletion in the 243 | alignment at this position. 244 | 245 | * **insertions** - Number of reads where there is an insertion in the 246 | alignment at this position. 247 | 248 | * **A/C/T/G/N** - Number of reads where the aligned base is an A/C/T/G/N. 249 | 250 | * **mean_tlen** - Mean value of outer distance between reads and their 251 | mates for paired reads aligned at this position. N.B., leftmost 252 | reads in a pair have a positive tlen, rightmost reads have a 253 | negative tlen, so if there is no strand bias, this value should be 254 | 0. 255 | 256 | * **rms_tlen** - Root-mean-square value of outer distance between 257 | reads and their mates for paired reads aligned at this position. 258 | 259 | * **std_tlen** - Standard deviation of outer distance between reads 260 | and their mates for paired reads aligned at this position. 261 | 262 | * **reads_mapq0** - Number of reads where mapping quality is zero. 263 | 264 | * **rms_mapq** - Root-mean-square mapping quality for reads aligned at 265 | this position. 266 | 267 | * **max_mapq** - Maximum value of mapping quality for reads aligned at 268 | this position. 269 | 270 | * **rms_baseq** - Root-mean-square value of base qualities for bases 271 | aligned at this position. 272 | 273 | * **rms_baseq_matches** - Root-mean-square value of base qualities for 274 | bases aligned at this position where the base matches the 275 | reference. 
276 | 277 | * **rms_baseq_mismatches** - Root-mean-square value of base qualities 278 | for bases aligned at this position where the base does not match 279 | the reference. 280 | 281 | Release notes 282 | ------------- 283 | 284 | **1.1.2** 285 | 286 | * Fix missing numpy as install requirement. 287 | 288 | **1.1.1** 289 | 290 | * Fix missing pyproject.toml in source distribution. 291 | 292 | **1.1.0** 293 | 294 | * Appropriate size dtype for chromosome names is now determined 295 | dynamically, no need to manually configure for longer 296 | chromosome/contig names. By [Nick 297 | Harding](https://github.com/hardingnj), 298 | [#72](https://github.com/alimanfoo/pysamstats/issues/72), 299 | [#74](https://github.com/alimanfoo/pysamstats/issues/74). 300 | 301 | * Expose 'stepper' option via Python and command line API, to allow 302 | setting of different pileup behaviours. By [Nick 303 | Harding](https://github.com/hardingnj), 304 | [#78](https://github.com/alimanfoo/pysamstats/issues/78), 305 | [#86](https://github.com/alimanfoo/pysamstats/pull/86). 306 | 307 | * Expose options `min_mapq`, `min_baseq`, `no_del`, `no_dup` via 308 | load_*() functions. By [nrkssa](https://github.com/nrkssa), 309 | [#93](https://github.com/alimanfoo/pysamstats/pull/93). 310 | 311 | * Add pyproject.toml for package build requirements, which means that 312 | there is no need to manually install pysam before installing 313 | pysamstats via pip. By [Michiel 314 | Vermeir](https://github.com/michielvermeir), 315 | [#97](https://github.com/alimanfoo/pysamstats/pull/97). 316 | 317 | * Added a regression test to ensure consistent outputs in future 318 | package versions. By [Nick Harding](https://github.com/hardingnj), 319 | [#79](https://github.com/alimanfoo/pysamstats/issues/79). 320 | 321 | * Pysam dependency upgraded to 0.15. 322 | 323 | **1.0.1** 324 | 325 | * Changed output of deletions field in variation stats to exclude RNA reads aligned with a splice 326 | ("N" in cigar) ([#65](https://github.com/alimanfoo/pysamstats/issues/65)) 327 | 328 | **1.0.0** 329 | 330 | * Upgrades for compatibility with pysam 0.11. 331 | * Added options for filtering reads based on mapping quality, base quality, deletion status and duplicate flag. 
332 | -------------------------------------------------------------------------------- /fixture/deep.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/deep.bam -------------------------------------------------------------------------------- /fixture/deep.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/deep.bam.bai -------------------------------------------------------------------------------- /fixture/longcontignames.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/longcontignames.bam -------------------------------------------------------------------------------- /fixture/longcontignames.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/longcontignames.bam.bai -------------------------------------------------------------------------------- /fixture/ref.fa.fai: -------------------------------------------------------------------------------- 1 | Pf3D7_01_v3 640851 13 60 61 2 | Pf3D7_02_v3 947102 651558 60 61 3 | Pf3D7_03_v3 811800 1614459 60 61 4 | -------------------------------------------------------------------------------- /fixture/ref.upper.fa.fai: -------------------------------------------------------------------------------- 1 | Pf3D7_01_v3 640851 13 60 61 2 | -------------------------------------------------------------------------------- /fixture/regression.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/regression.npz -------------------------------------------------------------------------------- /fixture/rna.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/rna.bam -------------------------------------------------------------------------------- /fixture/rna.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/rna.bam.bai -------------------------------------------------------------------------------- /fixture/test.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/test.bam -------------------------------------------------------------------------------- /fixture/test.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alimanfoo/pysamstats/2e0980933494d9ce71639eed8c739ce9c9aa4617/fixture/test.bam.bai -------------------------------------------------------------------------------- /performance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function, division, absolute_import 4 | 5 | import sys 6 | 
import pstats
7 | import cProfile as profile
8 | import timeit
9 | 
10 | 
11 | from pysam import Samfile, Fastafile
12 | 
13 | 
14 | sys.path.append('.')
15 | import pysamstats
16 | 
17 | 
18 | def do_profiling(fun, end=1000):
19 |     samfile = Samfile('fixture/test.bam')
20 |     count = 0
21 |     f = getattr(pysamstats, fun)
22 |     for _ in f(samfile, chrom='Pf3D7_01_v3', start=0, end=end):
23 |         count += 1
24 | 
25 | 
26 | def do_profiling_withrefseq(fun, end=1000):
27 |     samfile = Samfile('fixture/test.bam')
28 |     fafile = Fastafile('fixture/ref.fa')
29 |     count = 0
30 |     f = getattr(pysamstats, fun)
31 |     for _ in f(samfile, fafile, chrom='Pf3D7_01_v3', start=0, end=end):
32 |         count += 1
33 | 
34 | 
35 | stats_types_requiring_fasta = ('variation',
36 |                                'variation_strand',
37 |                                'baseq_ext',
38 |                                'baseq_ext_strand',
39 |                                'coverage_gc',
40 |                                'coverage_normed_gc',
41 |                                'coverage_binned',
42 |                                'coverage_ext_binned')
43 | 
44 | fun = sys.argv[1]
45 | if len(sys.argv) > 2:
46 |     end = int(sys.argv[2])
47 | else:
48 |     end = 1000
49 | if len(sys.argv) > 3:
50 |     number = int(sys.argv[3])
51 | else:
52 |     number = 1
53 | if len(sys.argv) > 4:
54 |     repeat = int(sys.argv[4])
55 | else:
56 |     repeat = 3
57 | 
58 | if fun in stats_types_requiring_fasta:
59 |     cmd = 'do_profiling_withrefseq("stat_%s", %s)' % (fun, end)
60 | else:
61 |     cmd = 'do_profiling("stat_%s", %s)' % (fun, end)
62 | 
63 | prof_fn = '%s.prof' % fun
64 | profile.runctx(cmd, globals(), locals(), prof_fn)
65 | s = pstats.Stats(prof_fn)
66 | s.strip_dirs().sort_stats('time').print_stats()
67 | print(timeit.repeat(cmd,
68 |                     number=number,
69 |                     repeat=repeat,
70 |                     setup='from __main__ import do_profiling, '
71 |                           'do_profiling_withrefseq'))
72 | 
73 | 
74 | 
-------------------------------------------------------------------------------- /pyproject.toml: --------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel", "pysam (<0.16)"]
3 | 
4 | [tool.pysamstats]
5 | name='pysamstats'
6 | author='Alistair Miles'
7 | author_email='alimanfoo@googlemail.com'
8 | url='https://github.com/alimanfoo/pysamstats'
9 | license='MIT License'
10 | description="""A Python utility for calculating statistics against genome
11 | position based on sequence alignments from a SAM,
12 | BAM or CRAM file."""
13 | scripts=['scripts/pysamstats']
14 | classifiers=[
15 |     'Intended Audience :: Developers',
16 |     'License :: OSI Approved :: MIT License',
17 |     'Programming Language :: Python :: 2.7',
18 |     'Programming Language :: Python :: 3.5',
19 |     'Programming Language :: Python :: 3.6',
20 |     'Programming Language :: Python :: 3.7',
21 |     'Topic :: Software Development :: Libraries :: Python Modules'
22 | ]
23 | 
-------------------------------------------------------------------------------- /pysamstats/__init__.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, print_function, division
3 | 
4 | 
5 | __version__ = '1.1.2'
6 | 
7 | 
8 | from .pileup import *
9 | from .binned import *
-------------------------------------------------------------------------------- /pysamstats/binned.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, print_function, division
3 | import functools
4 | 
5 | 
6 | import pysamstats.opt as opt
7 | import pysamstats.util as util
8 | import pysamstats.config as config
9 | 
10 | 
11 | _doc_params = """
12 |     Parameters
13 |     ----------
14 |     type : string
15 |         Statistics type. One of "coverage", "coverage_ext", "mapq", "alignment", "tlen".
16 |     alignmentfile : pysam.AlignmentFile or string
17 |         SAM or BAM file or file path.
18 |     fafile : pysam.FastaFile or string
19 |         FASTA file or file path, not required for all statistics types.
20 |     chrom : string
21 |         Chromosome/contig.
22 |     start : int
23 |         Start position.
24 |     end : int
25 |         End position.
26 |     one_based : bool
27 |         Coordinate system, False if zero-based (default), True if one-based.
28 |     window_size : int
29 |         Window size to use.
30 |     window_offset : int
31 |         Distance from window start to record position.
32 |     min_mapq : int, optional
33 |         Only reads with mapping quality equal to or greater than this value will be counted (0
34 |         by default).
35 |     no_dup : bool, optional
36 |         If True, don't count reads flagged as duplicate."""
37 | 
38 | 
39 | 
40 | # noinspection PyShadowingBuiltins
41 | def stat_binned(type,
42 |                 alignmentfile,
43 |                 fafile=None,
44 |                 chrom=None,
45 |                 start=None,
46 |                 end=None,
47 |                 one_based=False,
48 |                 window_size=300,
49 |                 window_offset=None,
50 |                 min_mapq=0,
51 |                 no_dup=False):
52 |     """Generate statistics per genome window, based on all reads whose alignment starts within
53 |     the window.
54 |     {params}
55 | 
56 |     Returns
57 |     -------
58 |     recs : iterator
59 |         An iterator yielding dict objects, where each dict holds data for a single window.
60 | 
61 |     """
62 | 
63 |     try:
64 |         stat = stats_classes_binned[type]()
65 |     except KeyError:
66 |         raise ValueError('unsupported statistics type: %r' % type)
67 | 
68 |     return opt.iter_binned(stat, alignmentfile=alignmentfile, fafile=fafile, chrom=chrom,
69 |                            start=start, end=end, one_based=one_based, window_size=window_size,
70 |                            window_offset=window_offset, min_mapq=min_mapq, no_dup=no_dup)
71 | 
72 | 
73 | stat_binned.__doc__ = stat_binned.__doc__.format(params=_doc_params)
74 | 
75 | 
76 | # noinspection PyShadowingBuiltins
77 | def load_binned(type,
78 |                 alignmentfile,
79 |                 fafile=None,
80 |                 chrom=None,
81 |                 start=None,
82 |                 end=None,
83 |                 one_based=False,
84 |                 window_size=300,
85 |                 window_offset=None,
86 |                 min_mapq=0,
87 |                 no_dup=False,
88 |                 dtype=None,
89 |                 fields=None):
90 |     """Load statistics per genome window, based on all reads whose alignment starts within
91 |     the window.
92 |     {params}
93 |     dtype : dtype
94 |         Override default dtype.
95 |     fields : string or list of strings
96 |         Select a subset of fields to load.
97 | 
98 |     Returns
99 |     -------
100 |     ra : numpy structured array
101 |         A structured array.
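
    Examples
    --------
    A minimal sketch (the BAM and FASTA file paths here are placeholders):

        import pysamstats
        a = pysamstats.load_binned('coverage', 'example.bam', fafile='ref.fa',
                                   chrom='chr1', window_size=300)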
102 | 
103 |     """
104 | 
105 |     statfun = functools.partial(stat_binned, type)
106 |     try:
107 |         default_dtype = getattr(config, 'dtype_' + type + '_binned')
108 |     except AttributeError:
109 |         raise ValueError('unsupported statistics type: %r' % type)
110 | 
111 |     return util.load_stats(statfun, user_dtype=dtype, default_dtype=default_dtype,
112 |                            user_fields=fields, alignmentfile=alignmentfile, fafile=fafile,
113 |                            chrom=chrom, start=start, end=end, one_based=one_based,
114 |                            window_size=window_size, window_offset=window_offset,
115 |                            min_mapq=min_mapq, no_dup=no_dup)
116 | 
117 | 
118 | load_binned.__doc__ = load_binned.__doc__.format(params=_doc_params)
119 | 
120 | 
121 | stats_classes_binned = {
122 |     'coverage': opt.CoverageBinned,
123 |     'coverage_ext': opt.CoverageExtBinned,
124 |     'mapq': opt.MapqBinned,
125 |     'alignment': opt.AlignmentBinned,
126 |     'tlen': opt.TlenBinned,
127 | }
128 | 
129 | 
130 | # backwards compatibility
131 | #########################
132 | 
133 | 
134 | _stat_doc_lines = stat_binned.__doc__.split('\n')
135 | _load_doc_lines = load_binned.__doc__.split('\n')
136 | # strip "type" parameter
137 | _stat_doc = '\n'.join(_stat_doc_lines[:5] + _stat_doc_lines[7:])
138 | _load_doc = '\n'.join(_load_doc_lines[:5] + _load_doc_lines[7:])
139 | 
140 | 
141 | def _specialize(type):
142 |     statfun = functools.partial(stat_binned, type)
143 |     statfun.__doc__ = _stat_doc
144 |     statfun.__name__ = 'stat_' + type
145 |     loadfun = functools.partial(load_binned, type)
146 |     loadfun.__doc__ = _load_doc
147 |     loadfun.__name__ = 'load_' + type
148 |     return statfun, loadfun
149 | 
150 | 
151 | # named functions
152 | stat_coverage_binned, load_coverage_binned = _specialize('coverage')
153 | stat_coverage_ext_binned, load_coverage_ext_binned = _specialize('coverage_ext')
154 | stat_mapq_binned, load_mapq_binned = _specialize('mapq')
155 | stat_alignment_binned, load_alignment_binned = _specialize('alignment')
156 | stat_tlen_binned, load_tlen_binned = _specialize('tlen')
-------------------------------------------------------------------------------- /pysamstats/config.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, print_function, division
3 | 
4 | 
5 | stats_types_noref = ('coverage',
6 |                      'coverage_strand',
7 |                      'coverage_ext',
8 |                      'coverage_ext_strand',
9 |                      'tlen',
10 |                      'tlen_strand',
11 |                      'mapq',
12 |                      'mapq_strand',
13 |                      'baseq',
14 |                      'baseq_strand',
15 |                      'mapq_binned',
16 |                      'alignment_binned',
17 |                      'tlen_binned')
18 | 
19 | stats_types_withref = ('variation',
20 |                        'variation_strand',
21 |                        'baseq_ext',
22 |                        'baseq_ext_strand',
23 |                        'coverage_gc',
24 |                        'coverage_binned',
25 |                        'coverage_ext_binned')
26 | 
27 | stats_types = sorted(stats_types_noref + stats_types_withref)
28 | 
29 | stepper_types = ('nofilter',
30 |                  'samtools',
31 |                  'all')
32 | 
33 | dtype_coverage = [
34 |     ('chrom', 'a12'),
35 |     ('pos', 'i4'),
36 |     ('reads_all', 'i4'),
37 |     ('reads_pp', 'i4')
38 | ]
39 | 
40 | dtype_coverage_strand = [
41 |     ('chrom', 'a12'),
42 |     ('pos', 'i4'),
43 |     ('reads_all', 'i4'),
44 |     ('reads_fwd', 'i4'),
45 |     ('reads_rev', 'i4'),
46 |     ('reads_pp', 'i4'),
47 |     ('reads_pp_fwd', 'i4'),
48 |     ('reads_pp_rev', 'i4'),
49 | ]
50 | 
51 | dtype_coverage_ext = [
52 |     ('chrom', 'a12'),
53 |     ('pos', 'i4'),
54 |     ('reads_all', 'i4'),
55 |     ('reads_pp', 'i4'),
56 |     ('reads_mate_unmapped', 'i4'),
57 |     ('reads_mate_other_chr', 'i4'),
58 |     ('reads_mate_same_strand', 'i4'),
59 |     ('reads_faceaway', 'i4'),
60 |     ('reads_softclipped',
'i4'), 61 | ('reads_duplicate', 'i4') 62 | ] 63 | 64 | dtype_coverage_ext_strand = [ 65 | ('chrom', 'a12'), 66 | ('pos', 'i4'), 67 | ('reads_all', 'i4'), 68 | ('reads_fwd', 'i4'), 69 | ('reads_rev', 'i4'), 70 | ('reads_pp', 'i4'), 71 | ('reads_pp_fwd', 'i4'), 72 | ('reads_pp_rev', 'i4'), 73 | ('reads_mate_unmapped', 'i4'), 74 | ('reads_mate_unmapped_fwd', 'i4'), 75 | ('reads_mate_unmapped_rev', 'i4'), 76 | ('reads_mate_other_chr', 'i4'), 77 | ('reads_mate_other_chr_fwd', 'i4'), 78 | ('reads_mate_other_chr_rev', 'i4'), 79 | ('reads_mate_same_strand', 'i4'), 80 | ('reads_mate_same_strand_fwd', 'i4'), 81 | ('reads_mate_same_strand_rev', 'i4'), 82 | ('reads_faceaway', 'i4'), 83 | ('reads_faceaway_fwd', 'i4'), 84 | ('reads_faceaway_rev', 'i4'), 85 | ('reads_softclipped', 'i4'), 86 | ('reads_softclipped_fwd', 'i4'), 87 | ('reads_softclipped_rev', 'i4'), 88 | ('reads_duplicate', 'i4'), 89 | ('reads_duplicate_fwd', 'i4'), 90 | ('reads_duplicate_rev', 'i4'), 91 | ] 92 | 93 | dtype_variation = [ 94 | ('chrom', 'a12'), 95 | ('pos', 'i4'), 96 | ('ref', 'a1'), 97 | ('reads_all', 'i4'), 98 | ('reads_pp', 'i4'), 99 | ('matches', 'i4'), 100 | ('matches_pp', 'i4'), 101 | ('mismatches', 'i4'), 102 | ('mismatches_pp', 'i4'), 103 | ('deletions', 'i4'), 104 | ('deletions_pp', 'i4'), 105 | ('insertions', 'i4'), 106 | ('insertions_pp', 'i4'), 107 | ('A', 'i4'), 108 | ('A_pp', 'i4'), 109 | ('C', 'i4'), 110 | ('C_pp', 'i4'), 111 | ('T', 'i4'), 112 | ('T_pp', 'i4'), 113 | ('G', 'i4'), 114 | ('G_pp', 'i4'), 115 | ('N', 'i4'), 116 | ('N_pp', 'i4') 117 | ] 118 | 119 | dtype_variation_strand = [ 120 | ('chrom', 'a12'), 121 | ('pos', 'i4'), 122 | ('ref', 'a1'), 123 | ('reads_all', 'i4'), 124 | ('reads_fwd', 'i4'), 125 | ('reads_rev', 'i4'), 126 | ('reads_pp', 'i4'), 127 | ('reads_pp_fwd', 'i4'), 128 | ('reads_pp_rev', 'i4'), 129 | ('matches', 'i4'), 130 | ('matches_fwd', 'i4'), 131 | ('matches_rev', 'i4'), 132 | ('matches_pp', 'i4'), 133 | ('matches_pp_fwd', 'i4'), 134 | ('matches_pp_rev', 'i4'), 135 | ('mismatches', 'i4'), 136 | ('mismatches_fwd', 'i4'), 137 | ('mismatches_rev', 'i4'), 138 | ('mismatches_pp', 'i4'), 139 | ('mismatches_pp_fwd', 'i4'), 140 | ('mismatches_pp_rev', 'i4'), 141 | ('deletions', 'i4'), 142 | ('deletions_fwd', 'i4'), 143 | ('deletions_rev', 'i4'), 144 | ('deletions_pp', 'i4'), 145 | ('deletions_pp_fwd', 'i4'), 146 | ('deletions_pp_rev', 'i4'), 147 | ('insertions', 'i4'), 148 | ('insertions_fwd', 'i4'), 149 | ('insertions_rev', 'i4'), 150 | ('insertions_pp', 'i4'), 151 | ('insertions_pp_fwd', 'i4'), 152 | ('insertions_pp_rev', 'i4'), 153 | ('A', 'i4'), ('A_fwd', 'i4'), ('A_rev', 'i4'), 154 | ('A_pp', 'i4'), ('A_pp_fwd', 'i4'), ('A_pp_rev', 'i4'), 155 | ('C', 'i4'), ('C_fwd', 'i4'), ('C_rev', 'i4'), 156 | ('C_pp', 'i4'), ('C_pp_fwd', 'i4'), ('C_pp_rev', 'i4'), 157 | ('T', 'i4'), ('T_fwd', 'i4'), ('T_rev', 'i4'), 158 | ('T_pp', 'i4'), ('T_pp_fwd', 'i4'), ('T_pp_rev', 'i4'), 159 | ('G', 'i4'), ('G_fwd', 'i4'), ('G_rev', 'i4'), 160 | ('G_pp', 'i4'), ('G_pp_fwd', 'i4'), ('G_pp_rev', 'i4'), 161 | ('N', 'i4'), ('N_fwd', 'i4'), ('N_rev', 'i4'), 162 | ('N_pp', 'i4'), ('N_pp_fwd', 'i4'), ('N_pp_rev', 'i4') 163 | ] 164 | 165 | dtype_tlen = [ 166 | ('chrom', 'a12'), 167 | ('pos', 'i4'), 168 | ('reads_all', 'i4'), 169 | ('reads_paired', 'i4'), 170 | ('reads_pp', 'i4'), 171 | ('mean_tlen', 'i4'), 172 | ('mean_tlen_pp', 'i4'), 173 | ('rms_tlen', 'i4'), 174 | ('rms_tlen_pp', 'i4'), 175 | ('std_tlen', 'i4'), 176 | ('std_tlen_pp', 'i4') 177 | ] 178 | 179 | dtype_tlen_strand = [ 180 | ('chrom', 'a12'), 181 | 
('pos', 'i4'), 182 | ('reads_all', 'i4'), 183 | ('reads_fwd', 'i4'), 184 | ('reads_rev', 'i4'), 185 | ('reads_paired', 'i4'), 186 | ('reads_paired_fwd', 'i4'), 187 | ('reads_paired_rev', 'i4'), 188 | ('reads_pp', 'i4'), 189 | ('reads_pp_fwd', 'i4'), 190 | ('reads_pp_rev', 'i4'), 191 | ('mean_tlen', 'i4'), 192 | ('mean_tlen_fwd', 'i4'), 193 | ('mean_tlen_rev', 'i4'), 194 | ('mean_tlen_pp', 'i4'), 195 | ('mean_tlen_pp_fwd', 'i4'), 196 | ('mean_tlen_pp_rev', 'i4'), 197 | ('rms_tlen', 'i4'), 198 | ('rms_tlen_fwd', 'i4'), 199 | ('rms_tlen_rev', 'i4'), 200 | ('rms_tlen_pp', 'i4'), 201 | ('rms_tlen_pp_fwd', 'i4'), 202 | ('rms_tlen_pp_rev', 'i4'), 203 | ('std_tlen', 'i4'), 204 | ('std_tlen_fwd', 'i4'), 205 | ('std_tlen_rev', 'i4'), 206 | ('std_tlen_pp', 'i4'), 207 | ('std_tlen_pp_fwd', 'i4'), 208 | ('std_tlen_pp_rev', 'i4') 209 | ] 210 | 211 | dtype_mapq = [ 212 | ('chrom', 'a12'), 213 | ('pos', 'i4'), 214 | ('reads_all', 'i4'), 215 | ('reads_pp', 'i4'), 216 | ('reads_mapq0', 'i4'), 217 | ('reads_mapq0_pp', 'i4'), 218 | ('rms_mapq', 'i4'), 219 | ('rms_mapq_pp', 'i4'), 220 | ('max_mapq', 'i4'), 221 | ('max_mapq_pp', 'i4') 222 | ] 223 | 224 | dtype_mapq_strand = [ 225 | ('chrom', 'a12'), 226 | ('pos', 'i4'), 227 | ('reads_all', 'i4'), 228 | ('reads_fwd', 'i4'), 229 | ('reads_rev', 'i4'), 230 | ('reads_pp', 'i4'), 231 | ('reads_pp_fwd', 'i4'), 232 | ('reads_pp_rev', 'i4'), 233 | ('reads_mapq0', 'i4'), 234 | ('reads_mapq0_fwd', 'i4'), 235 | ('reads_mapq0_rev', 'i4'), 236 | ('reads_mapq0_pp', 'i4'), 237 | ('reads_mapq0_pp_fwd', 'i4'), 238 | ('reads_mapq0_pp_rev', 'i4'), 239 | ('rms_mapq', 'i4'), 240 | ('rms_mapq_fwd', 'i4'), 241 | ('rms_mapq_rev', 'i4'), 242 | ('rms_mapq_pp', 'i4'), 243 | ('rms_mapq_pp_fwd', 'i4'), 244 | ('rms_mapq_pp_rev', 'i4'), 245 | ('max_mapq', 'i4'), 246 | ('max_mapq_fwd', 'i4'), 247 | ('max_mapq_rev', 'i4'), 248 | ('max_mapq_pp', 'i4'), 249 | ('max_mapq_pp_fwd', 'i4'), 250 | ('max_mapq_pp_rev', 'i4'), 251 | ] 252 | 253 | dtype_baseq = [ 254 | ('chrom', 'a12'), 255 | ('pos', 'i4'), 256 | ('reads_all', 'i4'), 257 | ('reads_pp', 'i4'), 258 | ('rms_baseq', 'i4'), 259 | ('rms_baseq_pp', 'i4'), 260 | ] 261 | 262 | dtype_baseq_strand = [ 263 | ('chrom', 'a12'), 264 | ('pos', 'i4'), 265 | ('reads_all', 'i4'), 266 | ('reads_fwd', 'i4'), 267 | ('reads_rev', 'i4'), 268 | ('reads_pp', 'i4'), 269 | ('reads_pp_fwd', 'i4'), 270 | ('reads_pp_rev', 'i4'), 271 | ('rms_baseq', 'i4'), 272 | ('rms_baseq_fwd', 'i4'), 273 | ('rms_baseq_rev', 'i4'), 274 | ('rms_baseq_pp', 'i4'), 275 | ('rms_baseq_pp_fwd', 'i4'), 276 | ('rms_baseq_pp_rev', 'i4'), 277 | ] 278 | 279 | dtype_baseq_ext = [ 280 | ('chrom', 'a12'), 281 | ('pos', 'i4'), 282 | ('ref', 'a1'), 283 | ('reads_all', 'i4'), 284 | ('reads_pp', 'i4'), 285 | ('matches', 'i4'), 286 | ('matches_pp', 'i4'), 287 | ('mismatches', 'i4'), 288 | ('mismatches_pp', 'i4'), 289 | ('rms_baseq', 'i4'), 290 | ('rms_baseq_pp', 'i4'), 291 | ('rms_baseq_matches', 'i4'), 292 | ('rms_baseq_matches_pp', 'i4'), 293 | ('rms_baseq_mismatches', 'i4'), 294 | ('rms_baseq_mismatches_pp', 'i4'), 295 | ] 296 | 297 | dtype_baseq_ext_strand = [ 298 | ('chrom', 'a12'), 299 | ('pos', 'i4'), 300 | ('ref', 'a1'), 301 | ('reads_all', 'i4'), 302 | ('reads_fwd', 'i4'), 303 | ('reads_rev', 'i4'), 304 | ('reads_pp', 'i4'), 305 | ('reads_pp_fwd', 'i4'), 306 | ('reads_pp_rev', 'i4'), 307 | ('matches', 'i4'), 308 | ('matches_fwd', 'i4'), 309 | ('matches_rev', 'i4'), 310 | ('matches_pp', 'i4'), 311 | ('matches_pp_fwd', 'i4'), 312 | ('matches_pp_rev', 'i4'), 313 | ('mismatches', 'i4'), 314 | 
('mismatches_fwd', 'i4'), 315 | ('mismatches_rev', 'i4'), 316 | ('mismatches_pp', 'i4'), 317 | ('mismatches_pp_fwd', 'i4'), 318 | ('mismatches_pp_rev', 'i4'), 319 | ('rms_baseq', 'i4'), 320 | ('rms_baseq_fwd', 'i4'), 321 | ('rms_baseq_rev', 'i4'), 322 | ('rms_baseq_pp', 'i4'), 323 | ('rms_baseq_pp_fwd', 'i4'), 324 | ('rms_baseq_pp_rev', 'i4'), 325 | ('rms_baseq_matches', 'i4'), 326 | ('rms_baseq_matches_fwd', 'i4'), 327 | ('rms_baseq_matches_rev', 'i4'), 328 | ('rms_baseq_matches_pp', 'i4'), 329 | ('rms_baseq_matches_pp_fwd', 'i4'), 330 | ('rms_baseq_matches_pp_rev', 'i4'), 331 | ('rms_baseq_mismatches', 'i4'), 332 | ('rms_baseq_mismatches_fwd', 'i4'), 333 | ('rms_baseq_mismatches_rev', 'i4'), 334 | ('rms_baseq_mismatches_pp', 'i4'), 335 | ('rms_baseq_mismatches_pp_fwd', 'i4'), 336 | ('rms_baseq_mismatches_pp_rev', 'i4') 337 | ] 338 | 339 | dtype_coverage_gc = [ 340 | ('chrom', 'a12'), 341 | ('pos', 'i4'), 342 | ('gc', 'u1'), 343 | ('reads_all', 'i4'), 344 | ('reads_pp', 'i4') 345 | ] 346 | 347 | dtype_coverage_binned = [ 348 | ('chrom', 'a12'), 349 | ('pos', 'i4'), 350 | ('gc', 'u1'), 351 | ('reads_all', 'i4'), 352 | ('reads_pp', 'i4') 353 | ] 354 | 355 | dtype_coverage_ext_binned = [ 356 | ('chrom', 'a12'), 357 | ('pos', 'i4'), 358 | ('gc', 'u1'), 359 | ('reads_all', 'i4'), 360 | ('reads_pp', 'i4'), 361 | ('reads_mate_unmapped', 'i4'), 362 | ('reads_mate_other_chr', 'i4'), 363 | ('reads_mate_same_strand', 'i4'), 364 | ('reads_faceaway', 'i4'), 365 | ('reads_softclipped', 'i4'), 366 | ('reads_duplicate', 'i4') 367 | ] 368 | 369 | dtype_mapq_binned = [ 370 | ('chrom', 'a12'), 371 | ('pos', 'i4'), 372 | ('reads_all', 'i4'), 373 | ('reads_mapq0', 'i4'), 374 | ('rms_mapq', 'i4'), 375 | ] 376 | 377 | dtype_alignment_binned = [ 378 | ('chrom', 'a12'), 379 | ('pos', 'i4'), 380 | ('reads_all', 'i4'), 381 | ('bases_all', 'i4'), 382 | ('M', 'i4'), 383 | ('I', 'i4'), 384 | ('D', 'i4'), 385 | ('N', 'i4'), 386 | ('S', 'i4'), 387 | ('H', 'i4'), 388 | ('P', 'i4'), 389 | ('=', 'i4'), 390 | ('X', 'i4') 391 | ] 392 | 393 | dtype_tlen_binned = [ 394 | ('chrom', 'a12'), 395 | ('pos', 'i4'), 396 | ('reads_all', 'i4'), 397 | ('reads_pp', 'i4'), 398 | ('mean_tlen', 'i4'), 399 | ('mean_tlen_pp', 'i4'), 400 | ('rms_tlen', 'i4'), 401 | ('rms_tlen_pp', 'i4'), 402 | ] 403 | -------------------------------------------------------------------------------- /pysamstats/io.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | import itertools 4 | import time 5 | import csv 6 | import sys 7 | 8 | 9 | import pysamstats 10 | from pysamstats.util import flatten, determine_max_seqid 11 | import pysamstats.config as config 12 | 13 | 14 | def write_csv(stats_type, outfile, alignmentfile, fields=None, dialect='excel-tab', 15 | write_header=True, progress=None, **kwargs): 16 | """Write statistics output to a CSV file. 17 | 18 | Parameters 19 | ---------- 20 | 21 | stats_type : string 22 | Statistics type, one of 'coverage', 'coverage_ext', etc. 23 | outfile : file-like 24 | Output file to write to. 25 | alignmentfile : pysam.AlignmentFile or string 26 | Input BAM or SAM file or file path. 27 | fields : list of strings 28 | List of field names to output (all by default). 29 | dialect : string 30 | CSV dialect. 31 | write_header : bool 32 | If True write a header row. 33 | progress : int 34 | Log progress to stderr every N rows. 35 | **kwargs 36 | Passed through to the statistics function. 
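
    Examples
    --------
    A minimal sketch (the BAM file name and region here are placeholders;
    keyword arguments are passed through to the statistics function):

        import sys
        from pysamstats.io import write_csv
        write_csv('coverage', sys.stdout, 'example.bam',
                  chrom='chr1', start=0, end=2000)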
37 | 38 | """ 39 | 40 | # lookup stats function 41 | stats_function = getattr(pysamstats, 'stat_' + stats_type) 42 | 43 | # determine field names 44 | if fields is None: 45 | dtype = getattr(config, 'dtype_' + stats_type) 46 | fields = [t[0] for t in dtype] 47 | 48 | # setup record generator 49 | recs = stats_function(alignmentfile, **kwargs) 50 | 51 | # flatten records to rows 52 | rows = flatten(recs, *fields) 53 | 54 | # initialise writer 55 | writer = csv.writer(outfile, dialect=dialect) 56 | 57 | # write header row 58 | if write_header: 59 | writer.writerow(fields) 60 | 61 | if progress is None: 62 | # N.B., don't use writer.writerows(recs)! 63 | for row in rows: 64 | writer.writerow(row) 65 | 66 | else: 67 | counter = 0 68 | modulus = progress 69 | before = time.time() 70 | before_all = before 71 | for row in rows: 72 | counter += 1 73 | writer.writerow(row) 74 | if counter % modulus == 0: 75 | after = time.time() 76 | elapsed = after - before_all 77 | batch_elapsed = after - before 78 | msg = '[pysamstats] %s rows in %.2fs (%d rows/s); batch in ' \ 79 | '%.2fs (%d rows/s)' \ 80 | % (counter, elapsed, counter / elapsed, batch_elapsed, 81 | progress / batch_elapsed) 82 | print(msg, file=sys.stderr) 83 | before = after 84 | after_all = time.time() 85 | elapsed_all = after_all - before_all 86 | msg = '[pysamstats] %s rows in %.2fs (%d rows/s)' \ 87 | % (counter, elapsed_all, counter / elapsed_all) 88 | print(msg, file=sys.stderr) 89 | 90 | 91 | def write_hdf5(stats_type, outfile, alignmentfile, fields=None, progress=None, hdf5_group='/', 92 | hdf5_dataset='data', hdf5_complevel=1, hdf5_complib='zlib', hdf5_shuffle=True, 93 | hdf5_fletcher32=False, hdf5_chunksize=2**20, dtype=None, **kwargs): 94 | """Write statistics output to an HDF5 file. Requires PyTables. 95 | 96 | Parameters 97 | ---------- 98 | stats_type : string 99 | Statistics type, one of 'coverage', 'coverage_ext', etc. 100 | outfile : string 101 | Output file path. 102 | alignmentfile : pysam.AlignmentFile or string 103 | Input BAM or SAM file or file path. 104 | fields : list of strings 105 | List of field names to output (all by default). 106 | progress : int 107 | Log progress to stderr approximately every N rows. 108 | hdf5_group : string 109 | Group to write new dataset to. 110 | hdf5_dataset : string 111 | Name of dataset to create. 112 | hdf5_complib : string 113 | Name of compression library (defaults to 'zlib'). 114 | hdf5_complevel : int 115 | Compression level. 116 | hdf5_chunksize : int 117 | Size of chunks in number of bytes. 118 | hdf5_shuffle : bool 119 | If True, use byte shuffle filter. 120 | hdf5_fletcher32 : bool 121 | If True, use fletcher 32 filter. 122 | dtype : dict 123 | Override dtype. 124 | **kwargs 125 | Passed through to the statistics function. 126 | 127 | Notes 128 | ----- 129 | The length of the chunks in number of items is calculated by dividing the 130 | chunk size in number of bytes by the size of each row in number of bytes as 131 | determined from the dtype. 
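
    For example, with the default chunk size of 2**20 bytes and a row size of
    64 bytes (illustrative; the actual row size depends on the dtype), each
    chunk will be 2**20 // 64 = 16384 rows long.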
132 | 133 | """ 134 | 135 | import tables 136 | import numpy as np 137 | h5file = None 138 | 139 | # lookup stats function 140 | stats_function = getattr(pysamstats, 'stat_' + stats_type) 141 | 142 | # lookup default dtype 143 | default_dtype = getattr(config, 'dtype_' + stats_type) 144 | 145 | # determine field names 146 | if fields is None: 147 | fields = [t[0] for t in default_dtype] 148 | 149 | # determine dtype 150 | default_dtype = dict(default_dtype) 151 | max_seqid_len = determine_max_seqid(alignmentfile) 152 | default_dtype["chrom"] = "a{0}".format(max_seqid_len) 153 | 154 | # update if user passed 155 | if dtype is not None: 156 | default_dtype.update(dict(dtype)) 157 | dtype = default_dtype 158 | 159 | # fields 160 | if len(fields) == 1: 161 | dtype = dtype[fields[0]] 162 | else: 163 | dtype = [(f, dtype[f]) for f in fields] 164 | dtype = np.dtype(dtype) 165 | 166 | # setup record generator 167 | recs = stats_function(alignmentfile, **kwargs) 168 | 169 | # flatten records to rows 170 | rows = flatten(recs, *fields) 171 | 172 | try: 173 | 174 | # open output file 175 | h5file = tables.open_file(outfile, mode='a') 176 | 177 | # determine chunk shape 178 | hdf5_chunklen = int(hdf5_chunksize/dtype.itemsize) 179 | hdf5_chunkshape = (hdf5_chunklen,) 180 | 181 | # replace any existing node at that location 182 | try: 183 | h5file.remove_node(hdf5_group, hdf5_dataset) 184 | except tables.NoSuchNodeError: 185 | pass 186 | 187 | # create dataset 188 | h5table = h5file.create_table( 189 | hdf5_group, hdf5_dataset, dtype, 190 | title=stats_type, 191 | filters=tables.Filters(complevel=hdf5_complevel, 192 | complib=hdf5_complib, 193 | shuffle=hdf5_shuffle, 194 | fletcher32=hdf5_fletcher32), 195 | createparents=True, 196 | chunkshape=hdf5_chunkshape) 197 | 198 | # record initial time 199 | counter = 0 200 | counter_before = 0 201 | before = time.time() 202 | before_all = before 203 | 204 | # load data in batches of size `hdf5_chunklen` 205 | chunk = list(itertools.islice(rows, hdf5_chunklen)) 206 | 207 | # load chunk at a time 208 | while chunk: 209 | 210 | # write chunk 211 | h5table.append(chunk) 212 | h5table.flush() 213 | 214 | # keep track of number of records loaded 215 | n = len(chunk) # may be shorter than chunklen if final batch 216 | counter += n 217 | 218 | # log progress 219 | if progress and (counter % progress) < hdf5_chunklen: 220 | after = time.time() 221 | elapsed = after - before_all 222 | batch_elapsed = after - before 223 | batch_size = counter - counter_before 224 | msg = '[pysamstats] %s rows in %.2fs (%d rows/s); last %s ' \ 225 | 'rows in %.2fs (%d rows/s)' \ 226 | % (counter, elapsed, counter / elapsed, 227 | batch_size, batch_elapsed, batch_size / batch_elapsed) 228 | print(msg, file=sys.stderr) 229 | before = after 230 | counter_before = counter 231 | 232 | # load next batch 233 | chunk = list(itertools.islice(rows, hdf5_chunklen)) 234 | 235 | if progress: 236 | after_all = time.time() 237 | elapsed_all = after_all - before_all 238 | msg = '[pysamstats] %s rows in %.2fs (%d rows/s)' \ 239 | % (counter, elapsed_all, counter / elapsed_all) 240 | print(msg, file=sys.stderr) 241 | 242 | finally: 243 | if h5file is not None: 244 | h5file.close() 245 | -------------------------------------------------------------------------------- /pysamstats/pileup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | import functools 4 | 5 | 6 | import 
pysamstats.opt as opt 7 | import pysamstats.util as util 8 | import pysamstats.config as config 9 | 10 | 11 | _doc_params = """ 12 | Parameters 13 | ---------- 14 | type : string 15 | Statistics type. One of "coverage", "coverage_strand", "coverage_ext", 16 | "coverage_ext_strand", "variation", "variation_strand", "tlen", "tlen_strand", "mapq", 17 | "mapq_strand", "baseq", "baseq_strand", "baseq_ext", "baseq_ext_strand", "coverage_gc". 18 | alignmentfile : pysam.AlignmentFile or string 19 | SAM or BAM file or file path. 20 | fafile : pysam.FastaFile or string, optional 21 | FASTA file or file path, only required for some statistics types. 22 | chrom : string, optional 23 | Chromosome/contig. 24 | start : int, optional 25 | Start position. 26 | end : int, optional 27 | End position. 28 | one_based : bool, optional 29 | Coordinate system, False if zero-based (default), True if one-based. 30 | truncate : bool, optional 31 | If True, truncate output to selected region. 32 | pad : bool, optional 33 | If True, emit records for every position, even if no reads are aligned. 34 | max_depth : int, optional 35 | Maximum depth to allow in pileup column. 36 | window_size : int, optional 37 | Window size to use for percent GC calculation (only applies to coverage_gc). 38 | window_offset : int, optional 39 | Distance from window start to record position (only applies to coverage_gc). 40 | min_mapq : int, optional 41 | Only reads with mapping quality equal to or greater than this value will be counted (0 42 | by default). 43 | min_baseq : int, optional 44 | Only reads with base quality equal to or greater than this value will be counted (0 by 45 | default). 46 | no_del : bool, optional 47 | If True, don't count reads aligned with a deletion at the current position. 48 | no_dup : bool, optional 49 | If True, don't count reads flagged as duplicate.""" 50 | 51 | 52 | # noinspection PyShadowingBuiltins 53 | def stat_pileup(type, 54 | alignmentfile, 55 | fafile=None, 56 | chrom=None, 57 | start=None, 58 | end=None, 59 | one_based=False, 60 | truncate=False, 61 | stepper="all", 62 | pad=False, 63 | max_depth=8000, 64 | window_size=300, 65 | window_offset=None, 66 | min_mapq=0, 67 | min_baseq=0, 68 | no_del=False, 69 | no_dup=False): 70 | """Generate statistics per genome position, based on read pileups. 71 | {params} 72 | 73 | Returns 74 | ------- 75 | recs : iterator 76 | An iterator yielding dict objects, where each dict holds data for a single genome position. 
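
    Examples
    --------
    A minimal sketch (the BAM path and region here are placeholders):

        import pysamstats
        for rec in pysamstats.stat_pileup('coverage', 'example.bam',
                                          chrom='chr1', start=0, end=2000,
                                          truncate=True):
            print(rec['pos'], rec['reads_all'], rec['reads_pp'])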
77 | 78 | """ 79 | 80 | if type in config.stats_types_withref and fafile is None: 81 | raise ValueError('reference sequence is required; please provide fafile argument') 82 | 83 | try: 84 | if type == 'coverage_gc': 85 | stat = stats_classes_pileup[type](window_size=window_size, window_offset=window_offset) 86 | else: 87 | stat = stats_classes_pileup[type]() 88 | except KeyError: 89 | raise ValueError('unsupported statistics type: %r' % type) 90 | 91 | return opt.iter_pileup(stat, alignmentfile=alignmentfile, fafile=fafile, chrom=chrom, 92 | start=start, end=end, one_based=one_based, truncate=truncate, stepper=stepper, pad=pad, 93 | max_depth=max_depth, min_mapq=min_mapq, min_baseq=min_baseq, 94 | no_del=no_del, no_dup=no_dup) 95 | 96 | 97 | stat_pileup.__doc__ = stat_pileup.__doc__.format(params=_doc_params) 98 | 99 | 100 | # noinspection PyShadowingBuiltins 101 | def load_pileup(type, 102 | alignmentfile, 103 | fafile=None, 104 | chrom=None, 105 | start=None, 106 | end=None, 107 | one_based=False, 108 | truncate=False, 109 | stepper="all", 110 | pad=False, 111 | max_depth=8000, 112 | window_size=300, 113 | window_offset=None, 114 | min_mapq=0, 115 | min_baseq=0, 116 | no_del=False, 117 | no_dup=False, 118 | dtype=None, 119 | fields=None): 120 | """Load statistics per genome position, based on read pileups. 121 | {params} 122 | dtype : dtype 123 | Override default dtype. 124 | fields : string or list of strings 125 | Select a subset of fields to load. 126 | 127 | Returns 128 | ------- 129 | ra : numpy structured array 130 | A structured array. 131 | 132 | """ 133 | 134 | statfun = functools.partial(stat_pileup, type) 135 | try: 136 | default_dtype = getattr(config, 'dtype_' + type) 137 | except AttributeError: 138 | raise ValueError('unsupported statistics type: %r' % type) 139 | 140 | return util.load_stats(statfun, user_dtype=dtype, default_dtype=default_dtype, 141 | user_fields=fields, alignmentfile=alignmentfile, fafile=fafile, 142 | chrom=chrom, start=start, end=end, one_based=one_based, 143 | truncate=truncate, stepper=stepper, pad=pad, max_depth=max_depth, window_size=window_size, 144 | window_offset=window_offset, min_mapq=min_mapq, min_baseq=min_baseq, no_del=no_del, no_dup=no_dup) 145 | 146 | 147 | load_pileup.__doc__ = load_pileup.__doc__.format(params=_doc_params) 148 | 149 | 150 | stats_classes_pileup = { 151 | 'coverage': opt.Coverage, 152 | 'coverage_strand': opt.CoverageStrand, 153 | 'coverage_ext': opt.CoverageExt, 154 | 'coverage_ext_strand': opt.CoverageExtStrand, 155 | 'variation': opt.Variation, 156 | 'variation_strand': opt.VariationStrand, 157 | 'tlen': opt.Tlen, 158 | 'tlen_strand': opt.TlenStrand, 159 | 'mapq': opt.Mapq, 160 | 'mapq_strand': opt.MapqStrand, 161 | 'baseq': opt.Baseq, 162 | 'baseq_strand': opt.BaseqStrand, 163 | 'baseq_ext': opt.BaseqExt, 164 | 'baseq_ext_strand': opt.BaseqExtStrand, 165 | 'coverage_gc': opt.CoverageGC, 166 | } 167 | 168 | 169 | # backwards compatibility 170 | ######################### 171 | 172 | 173 | _stat_doc_lines = stat_pileup.__doc__.split('\n') 174 | _load_doc_lines = load_pileup.__doc__.split('\n') 175 | # strip "type" parameter 176 | _stat_doc = '\n'.join(_stat_doc_lines[:4] + _stat_doc_lines[8:]) 177 | _load_doc = '\n'.join(_load_doc_lines[:4] + _load_doc_lines[8:]) 178 | 179 | 180 | def _specialize(type): 181 | stat = functools.partial(stat_pileup, type) 182 | stat.__doc__ = _stat_doc 183 | stat.__name__ = 'stat_' + type 184 | load = functools.partial(load_pileup, type) 185 | load.__doc__ = _load_doc 186 | load.__name__ 
= 'load_' + type 187 | return stat, load 188 | 189 | 190 | # named functions 191 | stat_coverage, load_coverage = _specialize('coverage') 192 | stat_coverage_strand, load_coverage_strand = _specialize('coverage_strand') 193 | stat_coverage_ext, load_coverage_ext = _specialize('coverage_ext') 194 | stat_coverage_ext_strand, load_coverage_ext_strand = _specialize('coverage_ext_strand') 195 | stat_variation, load_variation = _specialize('variation') 196 | stat_variation_strand, load_variation_strand = _specialize('variation_strand') 197 | stat_tlen, load_tlen = _specialize('tlen') 198 | stat_tlen_strand, load_tlen_strand = _specialize('tlen_strand') 199 | stat_mapq, load_mapq = _specialize('mapq') 200 | stat_mapq_strand, load_mapq_strand = _specialize('mapq_strand') 201 | stat_baseq, load_baseq = _specialize('baseq') 202 | stat_baseq_strand, load_baseq_strand = _specialize('baseq_strand') 203 | stat_baseq_ext, load_baseq_ext = _specialize('baseq_ext') 204 | stat_baseq_ext_strand, load_baseq_ext_strand = _specialize('baseq_ext_strand') 205 | stat_coverage_gc, load_coverage_gc = _specialize('coverage_gc') 206 | -------------------------------------------------------------------------------- /pysamstats/test/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | -------------------------------------------------------------------------------- /pysamstats/test/test_binned.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | from itertools import chain 4 | from collections import Counter 5 | import logging 6 | 7 | 8 | from pysam import Samfile, Fastafile 9 | from nose.tools import eq_ 10 | from numpy import around as round 11 | 12 | 13 | import pysamstats 14 | from .util import normalise_coords, mean, rms, rootmean, compare_iterators 15 | 16 | 17 | logger = logging.getLogger(__name__) 18 | debug = logger.debug 19 | 20 | 21 | def compare_stats(impl, refimpl): 22 | # no read filters 23 | kwargs = {'chrom': 'Pf3D7_01_v3', 24 | 'start': 0, 25 | 'end': 2000, 26 | 'one_based': False} 27 | expected = refimpl(Samfile('fixture/test.bam'), **kwargs) 28 | actual = impl(Samfile('fixture/test.bam'), **kwargs) 29 | compare_iterators(expected, actual) 30 | # read filters 31 | kwargs['min_mapq'] = 1 32 | kwargs['no_dup'] = True 33 | expected = refimpl(Samfile('fixture/test.bam'), **kwargs) 34 | actual = impl(Samfile('fixture/test.bam'), **kwargs) 35 | compare_iterators(expected, actual) 36 | 37 | 38 | def compare_stats_withref(impl, refimpl, bam_fn='fixture/test.bam', 39 | fasta_fn='fixture/ref.fa'): 40 | # no read filters 41 | kwargs = {'chrom': 'Pf3D7_01_v3', 42 | 'start': 0, 43 | 'end': 2000, 44 | 'one_based': False} 45 | expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) 46 | actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) 47 | compare_iterators(expected, actual) 48 | # read filters 49 | kwargs['min_mapq'] = 1 50 | kwargs['no_dup'] = True 51 | expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) 52 | actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) 53 | compare_iterators(expected, actual) 54 | 55 | 56 | def filter_alignments(alignments, min_mapq, no_dup): 57 | if min_mapq > 0: 58 | alignments = (a for a in alignments if a.mapq >= min_mapq) 59 | if no_dup: 60 | alignments = (a for a in 
alignments if not a.is_duplicate) 61 | return alignments 62 | 63 | 64 | def stat_coverage_binned_refimpl(samfile, fastafile, chrom=None, start=None, 65 | end=None, one_based=False, window_size=300, 66 | window_offset=150, min_mapq=0, no_dup=False): 67 | if chrom is None: 68 | # noinspection PyTypeChecker 69 | it = chain(*[ 70 | iter_coverage_binned(samfile, fastafile, chrom, None, None, one_based, window_size, 71 | window_offset, min_mapq, no_dup) 72 | for chrom in samfile.references 73 | ]) 74 | else: 75 | it = iter_coverage_binned(samfile, fastafile, chrom, start, end, one_based, window_size, 76 | window_offset, min_mapq, no_dup) 77 | return it 78 | 79 | 80 | def gc_content(fastafile, chrom, start, end): 81 | seq = fastafile.fetch(chrom, start, end).lower() 82 | nc = Counter(seq) 83 | gc = int(round((nc['g'] + nc['c']) * 100. / (end-start))) 84 | return gc 85 | 86 | 87 | def iter_coverage_binned(samfile, fastafile, chrom, start, end, one_based, 88 | window_size, window_offset, min_mapq, no_dup): 89 | assert chrom is not None 90 | start, end = normalise_coords(one_based, start, end) 91 | chrlen = samfile.lengths[samfile.references.index(chrom)] 92 | if start is None: 93 | start = 0 94 | if end is None: 95 | end = chrlen 96 | if end > chrlen: 97 | end = chrlen 98 | # setup first bin 99 | bin_start = start 100 | bin_end = bin_start + window_size 101 | reads_all = reads_pp = 0 102 | 103 | # iterate over reads 104 | alignments = samfile.fetch(chrom, start, end) 105 | alignments = filter_alignments(alignments, min_mapq, no_dup) 106 | for aln in alignments: 107 | while aln.pos > bin_end: # end of bin 108 | gc = gc_content(fastafile, chrom, bin_start, bin_end) 109 | pos = bin_start + window_offset 110 | if one_based: 111 | pos += 1 112 | rec = {'chrom': chrom, 'pos': pos, 113 | 'gc': gc, 'reads_all': reads_all, 114 | 'reads_pp': reads_pp} 115 | yield rec 116 | reads_all = reads_pp = 0 117 | bin_start = bin_end 118 | bin_end = bin_start + window_size 119 | if not aln.is_unmapped: 120 | reads_all += 1 121 | if aln.is_proper_pair: 122 | reads_pp += 1 123 | 124 | # deal with last non-empty bin 125 | gc = gc_content(fastafile, chrom, bin_start, bin_end) 126 | pos = bin_start + window_offset 127 | if one_based: 128 | pos += 1 129 | rec = {'chrom': chrom, 'pos': pos, 130 | 'gc': gc, 'reads_all': reads_all, 'reads_pp': reads_pp} 131 | yield rec 132 | 133 | # deal with empty bins up to explicit end 134 | if end is not None: 135 | while bin_end < end: 136 | reads_all = reads_pp = 0 137 | bin_start = bin_end 138 | bin_end = bin_start + window_size 139 | gc = gc_content(fastafile, chrom, bin_start, bin_end) 140 | pos = bin_start + window_offset 141 | if one_based: 142 | pos += 1 143 | rec = {'chrom': chrom, 'pos': pos, 144 | 'gc': gc, 'reads_all': reads_all, 145 | 'reads_pp': reads_pp} 146 | yield rec 147 | 148 | 149 | def test_stat_coverage_binned(): 150 | compare_stats_withref(pysamstats.stat_coverage_binned, 151 | stat_coverage_binned_refimpl) 152 | 153 | 154 | def test_stat_coverage_binned_uppercase_fasta(): 155 | compare_stats_withref(pysamstats.stat_coverage_binned, 156 | stat_coverage_binned_refimpl, 157 | fasta_fn='fixture/ref.upper.fa') 158 | 159 | 160 | def stat_coverage_ext_binned_refimpl(samfile, fastafile, chrom=None, start=None, end=None, 161 | one_based=False, window_size=300, window_offset=150, 162 | min_mapq=0, no_dup=False): 163 | if chrom is None: 164 | # noinspection PyTypeChecker 165 | it = chain(*[ 166 | iter_coverage_ext_binned(samfile, fastafile, chrom, None, None, one_based, 167 | 
window_size, window_offset, min_mapq, no_dup) 168 | for chrom in samfile.references 169 | ]) 170 | else: 171 | it = iter_coverage_ext_binned(samfile, fastafile, chrom, start, end, one_based, 172 | window_size, window_offset, min_mapq, no_dup) 173 | return it 174 | 175 | 176 | def iter_coverage_ext_binned(samfile, fastafile, chrom, start, end, one_based, window_size, 177 | window_offset, min_mapq, no_dup): 178 | assert chrom is not None 179 | start, end = normalise_coords(one_based, start, end) 180 | chrlen = samfile.lengths[samfile.references.index(chrom)] 181 | if start is None: 182 | start = 0 183 | if end is None: 184 | end = chrlen 185 | if end > chrlen: 186 | end = chrlen 187 | # setup first bin 188 | bin_start = start 189 | bin_end = bin_start + window_size 190 | reads_all = reads_pp = reads_mate_unmapped = reads_mate_other_chr = \ 191 | reads_mate_same_strand = reads_faceaway = reads_softclipped = \ 192 | reads_duplicate = 0 193 | 194 | # iterate over reads 195 | alignments = samfile.fetch(chrom, start, end) 196 | alignments = filter_alignments(alignments, min_mapq, no_dup) 197 | for aln in alignments: 198 | while aln.pos > bin_end: # end of bin 199 | gc = gc_content(fastafile, chrom, bin_start, bin_end) 200 | pos = bin_start + window_offset 201 | if one_based: 202 | pos += 1 203 | rec = {'chrom': chrom, 'pos': pos, 204 | 'gc': gc, 205 | 'reads_all': reads_all, 206 | 'reads_pp': reads_pp, 207 | 'reads_mate_unmapped': reads_mate_unmapped, 208 | 'reads_mate_other_chr': reads_mate_other_chr, 209 | 'reads_mate_same_strand': reads_mate_same_strand, 210 | 'reads_faceaway': reads_faceaway, 211 | 'reads_softclipped': reads_softclipped, 212 | 'reads_duplicate': reads_duplicate} 213 | yield rec 214 | reads_all = reads_pp = reads_mate_unmapped = reads_mate_other_chr\ 215 | = reads_mate_same_strand = reads_faceaway = reads_softclipped\ 216 | = reads_duplicate = 0 217 | bin_start = bin_end 218 | bin_end = bin_start + window_size 219 | # debug(reads, reads.cigar, repr(reads.cigarstring)) 220 | if not aln.is_unmapped: 221 | reads_all += 1 222 | if aln.is_proper_pair: 223 | reads_pp += 1 224 | if aln.is_duplicate: 225 | reads_duplicate += 1 226 | if aln.cigar is not None and any((op[0] == 4) for op in aln.cigar): 227 | reads_softclipped += 1 228 | # should be mutually exclusive 229 | if aln.mate_is_unmapped: 230 | reads_mate_unmapped += 1 231 | elif aln.tid != aln.rnext: 232 | reads_mate_other_chr += 1 233 | elif aln.is_reverse == aln.mate_is_reverse: 234 | reads_mate_same_strand += 1 235 | elif ( 236 | # mapped to reverse strand but leftmost 237 | (aln.is_reverse and aln.tlen > 0) 238 | # mapped to fwd strand but rightmost 239 | or (not aln.is_reverse and aln.tlen < 0) 240 | ): 241 | reads_faceaway += 1 242 | 243 | # deal with last non-empty bin 244 | gc = gc_content(fastafile, chrom, bin_start, bin_end) 245 | pos = bin_start + window_offset 246 | if one_based: 247 | pos += 1 248 | rec = {'chrom': chrom, 'pos': pos, 249 | 'gc': gc, 250 | 'reads_all': reads_all, 251 | 'reads_pp': reads_pp, 252 | 'reads_mate_unmapped': reads_mate_unmapped, 253 | 'reads_mate_other_chr': reads_mate_other_chr, 254 | 'reads_mate_same_strand': reads_mate_same_strand, 255 | 'reads_faceaway': reads_faceaway, 256 | 'reads_softclipped': reads_softclipped, 257 | 'reads_duplicate': reads_duplicate} 258 | yield rec 259 | 260 | # deal with empty bins up to explicit end 261 | if end is not None: 262 | while bin_end < end: 263 | reads_all = reads_pp = reads_mate_unmapped = reads_mate_other_chr\ 264 | = reads_mate_same_strand = 
reads_faceaway = reads_softclipped\ 265 | = reads_duplicate = 0 266 | bin_start = bin_end 267 | bin_end = bin_start + window_size 268 | gc = gc_content(fastafile, chrom, bin_start, bin_end) 269 | pos = bin_start + window_offset 270 | if one_based: 271 | pos += 1 272 | rec = {'chrom': chrom, 'pos': pos, 273 | 'gc': gc, 274 | 'reads_all': reads_all, 275 | 'reads_pp': reads_pp, 276 | 'reads_mate_unmapped': reads_mate_unmapped, 277 | 'reads_mate_other_chr': reads_mate_other_chr, 278 | 'reads_mate_same_strand': reads_mate_same_strand, 279 | 'reads_faceaway': reads_faceaway, 280 | 'reads_softclipped': reads_softclipped, 281 | 'reads_duplicate': reads_duplicate} 282 | yield rec 283 | 284 | 285 | def test_stat_coverage_ext_binned(): 286 | compare_stats_withref(pysamstats.stat_coverage_ext_binned, 287 | stat_coverage_ext_binned_refimpl) 288 | 289 | 290 | def test_stat_coverage_ext_binned_uppercase_fasta(): 291 | compare_stats_withref(pysamstats.stat_coverage_ext_binned, 292 | stat_coverage_ext_binned_refimpl, 293 | fasta_fn='fixture/ref.upper.fa') 294 | 295 | 296 | def stat_mapq_binned_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, 297 | window_size=300, window_offset=150, min_mapq=0, no_dup=False): 298 | if chrom is None: 299 | # noinspection PyTypeChecker 300 | it = chain(*[iter_mapq_binned(samfile, chrom, None, None, one_based, window_size, 301 | window_offset, min_mapq, no_dup) 302 | for chrom in samfile.references]) 303 | else: 304 | it = iter_mapq_binned(samfile, chrom, start, end, one_based, window_size, window_offset, 305 | min_mapq, no_dup) 306 | return it 307 | 308 | 309 | def iter_mapq_binned(samfile, chrom, start, end, one_based, window_size, window_offset, min_mapq, 310 | no_dup): 311 | assert chrom is not None 312 | start, end = normalise_coords(one_based, start, end) 313 | chrlen = samfile.lengths[samfile.references.index(chrom)] 314 | if start is None: 315 | start = 0 316 | if end is None: 317 | end = chrlen 318 | if end > chrlen: 319 | end = chrlen 320 | # setup first bin 321 | bin_start = start 322 | bin_end = bin_start + window_size 323 | reads_all = reads_mapq0 = mapq_square_sum = 0 324 | 325 | # iterate over reads 326 | alignments = samfile.fetch(chrom, start, end) 327 | alignments = filter_alignments(alignments, min_mapq, no_dup) 328 | for aln in alignments: 329 | while aln.pos > bin_end: # end of bin 330 | pos = bin_start + window_offset 331 | if one_based: 332 | pos += 1 333 | rec = {'chrom': chrom, 'pos': pos, 334 | 'reads_all': reads_all, 335 | 'reads_mapq0': reads_mapq0, 336 | 'rms_mapq': rootmean(mapq_square_sum, reads_all)} 337 | yield rec 338 | reads_all = reads_mapq0 = mapq_square_sum = 0 339 | bin_start = bin_end 340 | bin_end = bin_start + window_size 341 | if not aln.is_unmapped: 342 | reads_all += 1 343 | mapq_square_sum += aln.mapq**2 344 | if aln.mapq == 0: 345 | reads_mapq0 += 1 346 | 347 | # deal with last non-empty bin 348 | pos = bin_start + window_offset 349 | if one_based: 350 | pos += 1 351 | rec = {'chrom': chrom, 'pos': pos, 352 | 'reads_all': reads_all, 353 | 'reads_mapq0': reads_mapq0, 354 | 'rms_mapq': rootmean(mapq_square_sum, reads_all)} 355 | yield rec 356 | 357 | # deal with empty bins up to explicit end 358 | if end is not None: 359 | while bin_end < end: 360 | reads_all = reads_mapq0 = mapq_square_sum = 0 361 | bin_start = bin_end 362 | bin_end = bin_start + window_size 363 | pos = bin_start + window_offset 364 | if one_based: 365 | pos += 1 366 | rec = {'chrom': chrom, 'pos': pos, 367 | 'reads_all': reads_all, 368 | 
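# rms_mapq below is a root-mean-square mapping quality: rootmean(mapq_square_sum, reads_all)
# is assumed (it lives in .util, which is not shown in this section) to compute
# sqrt(mapq_square_sum / reads_all), returning 0 for empty bins where reads_all == 0.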
'reads_mapq0': reads_mapq0, 369 | 'rms_mapq': rootmean(mapq_square_sum, reads_all)} 370 | yield rec 371 | 372 | 373 | def test_stat_mapq_binned(): 374 | compare_stats(pysamstats.stat_mapq_binned, stat_mapq_binned_refimpl) 375 | 376 | 377 | def stat_alignment_binned_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, 378 | window_size=300, window_offset=150, min_mapq=0, no_dup=False): 379 | if chrom is None: 380 | # noinspection PyTypeChecker 381 | it = chain(*[ 382 | iter_alignment_binned(samfile, chrom, None, None, one_based, window_size, 383 | window_offset, min_mapq, no_dup) 384 | for chrom in samfile.references] 385 | ) 386 | else: 387 | it = iter_alignment_binned(samfile, chrom, start, end, one_based, window_size, 388 | window_offset, min_mapq, no_dup) 389 | return it 390 | 391 | 392 | CIGAR = 'MIDNSHP=X' 393 | 394 | 395 | def iter_alignment_binned(samfile, chrom, start, end, one_based, window_size, window_offset, 396 | min_mapq, no_dup): 397 | assert chrom is not None 398 | start, end = normalise_coords(one_based, start, end) 399 | chrlen = samfile.lengths[samfile.references.index(chrom)] 400 | if start is None: 401 | start = 0 402 | if end is None: 403 | end = chrlen 404 | if end > chrlen: 405 | end = chrlen 406 | # setup first bin 407 | bin_start = start 408 | bin_end = bin_start + window_size 409 | c = Counter() 410 | reads_all = 0 411 | 412 | # iterate over reads 413 | alignments = samfile.fetch(chrom, start, end) 414 | alignments = filter_alignments(alignments, min_mapq, no_dup) 415 | for aln in alignments: 416 | while aln.pos > bin_end: # end of bin 417 | pos = bin_start + window_offset 418 | if one_based: 419 | pos += 1 420 | rec = {'chrom': chrom, 'pos': pos, 'reads_all': reads_all} 421 | for i in range(len(CIGAR)): 422 | rec[CIGAR[i]] = c[i] 423 | # rec['NM'] = c['NM'] 424 | rec['bases_all'] = c[0] + c[1] + c[4] + c[7] + c[8] 425 | yield rec 426 | c = Counter() 427 | reads_all = 0 428 | bin_start = bin_end 429 | bin_end = bin_start + window_size 430 | # debug(aln.cigar) 431 | if not aln.is_unmapped: 432 | reads_all += 1 433 | if aln.cigar is not None: 434 | for op, l in aln.cigar: 435 | c[op] += l 436 | # add edit distance 437 | # tags = dict(aln.tags) 438 | # if 'NM' in tags: 439 | # c['NM'] += tags['NM'] 440 | 441 | # deal with last non-empty bin 442 | pos = bin_start + window_offset 443 | if one_based: 444 | pos += 1 445 | rec = {'chrom': chrom, 'pos': pos, 'reads_all': reads_all} 446 | for i in range(len(CIGAR)): 447 | rec[CIGAR[i]] = c[i] 448 | # rec['NM'] = c['NM'] 449 | rec['bases_all'] = c[0] + c[1] + c[4] + c[7] + c[8] 450 | yield rec 451 | 452 | # deal with empty bins up to explicit end 453 | if end is not None: 454 | while bin_end < end: 455 | c = Counter() 456 | reads_all = 0 457 | bin_start = bin_end 458 | bin_end = bin_start + window_size 459 | pos = bin_start + window_offset 460 | if one_based: 461 | pos += 1 462 | rec = {'chrom': chrom, 'pos': pos, 'reads_all': reads_all} 463 | for i in range(len(CIGAR)): 464 | rec[CIGAR[i]] = c[i] 465 | # rec['NM'] = c['NM'] 466 | rec['bases_all'] = c[0] + c[1] + c[4] + c[7] + c[8] 467 | yield rec 468 | 469 | 470 | def test_stat_alignment_binned(): 471 | compare_stats(pysamstats.stat_alignment_binned, stat_alignment_binned_refimpl) 472 | 473 | 474 | def stat_tlen_binned_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, 475 | window_size=300, window_offset=150, min_mapq=0, no_dup=False): 476 | if chrom is None: 477 | # noinspection PyTypeChecker 478 | it = chain(*[iter_tlen_binned(samfile, 
chrom, None, None, one_based, window_size, 479 | window_offset, min_mapq, no_dup) 480 | for chrom in samfile.references]) 481 | else: 482 | it = iter_tlen_binned(samfile, chrom, start, end, one_based, window_size, window_offset, 483 | min_mapq, no_dup) 484 | return it 485 | 486 | 487 | def iter_tlen_binned(samfile, chrom, start, end, one_based, window_size, window_offset, min_mapq, 488 | no_dup): 489 | assert chrom is not None 490 | start, end = normalise_coords(one_based, start, end) 491 | chrlen = samfile.lengths[samfile.references.index(chrom)] 492 | if start is None: 493 | start = 0 494 | if end is None: 495 | end = chrlen 496 | if end > chrlen: 497 | end = chrlen 498 | # setup first bin 499 | bin_start = start 500 | bin_end = bin_start + window_size 501 | reads_all = reads_pp = 0 502 | tlens = [] 503 | tlens_pp = [] 504 | 505 | # iterate over reads 506 | alignments = samfile.fetch(chrom, start, end) 507 | alignments = filter_alignments(alignments, min_mapq, no_dup) 508 | for aln in alignments: 509 | while aln.pos > bin_end: # end of bin 510 | pos = bin_start + window_offset 511 | if one_based: 512 | pos += 1 513 | rec = {'chrom': chrom, 'pos': pos, 514 | 'reads_all': reads_all, 515 | 'reads_pp': reads_pp, 516 | 'mean_tlen': mean(tlens), 517 | 'rms_tlen': rms(tlens), 518 | 'mean_tlen_pp': mean(tlens_pp), 519 | 'rms_tlen_pp': rms(tlens_pp), 520 | } 521 | yield rec 522 | reads_all = reads_pp = 0 523 | tlens = [] 524 | tlens_pp = [] 525 | bin_start = bin_end 526 | bin_end = bin_start + window_size 527 | if not aln.is_unmapped: 528 | reads_all += 1 529 | tlens.append(aln.tlen) 530 | if aln.is_proper_pair: 531 | reads_pp += 1 532 | tlens_pp.append(aln.tlen) 533 | 534 | # deal with last non-empty bin 535 | pos = bin_start + window_offset 536 | if one_based: 537 | pos += 1 538 | rec = {'chrom': chrom, 'pos': pos, 539 | 'reads_all': reads_all, 540 | 'reads_pp': reads_pp, 541 | 'mean_tlen': mean(tlens), 542 | 'rms_tlen': rms(tlens), 543 | 'mean_tlen_pp': mean(tlens_pp), 544 | 'rms_tlen_pp': rms(tlens_pp), 545 | } 546 | yield rec 547 | 548 | # deal with empty bins up to explicit end 549 | if end is not None: 550 | while bin_end < end: 551 | reads_all = reads_pp = 0 552 | tlens = [] 553 | tlens_pp = [] 554 | bin_start = bin_end 555 | bin_end = bin_start + window_size 556 | pos = bin_start + window_offset 557 | if one_based: 558 | pos += 1 559 | rec = {'chrom': chrom, 'pos': pos, 560 | 'reads_all': reads_all, 561 | 'reads_pp': reads_pp, 562 | 'mean_tlen': mean(tlens), 563 | 'rms_tlen': rms(tlens), 564 | 'mean_tlen_pp': mean(tlens_pp), 565 | 'rms_tlen_pp': rms(tlens_pp), 566 | } 567 | yield rec 568 | 569 | 570 | def test_stat_tlen_binned(): 571 | compare_stats(pysamstats.stat_tlen_binned, stat_tlen_binned_refimpl) 572 | 573 | 574 | binned_functions = [ 575 | (pysamstats.load_coverage_binned, 1), 576 | (pysamstats.load_coverage_ext_binned, 1), 577 | (pysamstats.load_mapq_binned, 0), 578 | (pysamstats.load_alignment_binned, 0), 579 | (pysamstats.load_tlen_binned, 0), 580 | ] 581 | 582 | 583 | def test_binned_pad_region(): 584 | kwargs = {'chrom': 'Pf3D7_01_v3', 585 | 'start': 1000, 586 | 'end': 20000, 587 | 'one_based': False, 588 | 'window_size': 200, 589 | 'window_offset': 100} 590 | for f, needs_ref in binned_functions: 591 | debug(f.__name__) 592 | if needs_ref: 593 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), 594 | **kwargs) 595 | else: 596 | a = f(Samfile('fixture/test.bam'), **kwargs) 597 | assert set(a['chrom']) == {b'Pf3D7_01_v3'} 598 | eq_(1100, a['pos'][0]) 599 | 
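# The asserted positions follow from the binning scheme used by the reference
# implementations above: each bin is labelled at pos = bin_start + window_offset.
# With start=1000, end=20000, window_size=200 and window_offset=100 (the kwargs
# of this test), the first bin [1000, 1200) is reported at 1000 + 100 = 1100 and
# the last bin [19800, 20000) at 19800 + 100 = 19900.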
eq_(19900, a['pos'][-1]) 600 | 601 | 602 | def test_binned_pad_wg(): 603 | expected = stat_coverage_binned_refimpl( 604 | Samfile('fixture/test.bam'), 605 | Fastafile('fixture/ref.fa')) 606 | 607 | actual = pysamstats.stat_coverage_binned(Samfile('fixture/test.bam'), 608 | Fastafile('fixture/ref.fa')) 609 | compare_iterators(expected, actual) 610 | kwargs = {'window_size': 200, 611 | 'window_offset': 100} 612 | for f, needs_ref in binned_functions: 613 | debug(f.__name__) 614 | if needs_ref: 615 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), 616 | **kwargs) 617 | else: 618 | a = f(Samfile('fixture/test.bam'), **kwargs) 619 | assert sorted(set(a['chrom'])) == [b'Pf3D7_01_v3', b'Pf3D7_02_v3', 620 | b'Pf3D7_03_v3'] 621 | eq_(100, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][0]) 622 | eq_(50100, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][-1]) 623 | eq_(100, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][0]) 624 | eq_(60100, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][-1]) 625 | eq_(100, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][0]) 626 | eq_(70100, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][-1]) 627 | -------------------------------------------------------------------------------- /pysamstats/test/test_io.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | import logging 4 | 5 | from pysam import Samfile 6 | 7 | from pysamstats.io import write_hdf5 8 | import tables 9 | 10 | logger = logging.getLogger(__name__) 11 | debug = logger.debug 12 | 13 | 14 | def check_write_hdf5_chrom_dtype(arg): 15 | 16 | # testing auto dtype determination. 17 | dtype, alignment, result, label = arg 18 | import tempfile 19 | 20 | # use auto 21 | with tempfile.NamedTemporaryFile(suffix=".h5") as tmp: 22 | 23 | write_hdf5("coverage", tmp.name, alignment, chrom=label, dtype=dtype) 24 | 25 | with tables.open_file(tmp.name, mode="r") as h5file: 26 | return result == h5file.root.data.dtype["chrom"].itemsize 27 | 28 | 29 | def test_write_hdf5_chrom_dtype(): 30 | 31 | contig_label = "AS2_scf7180000696055" 32 | bampath = "fixture/longcontignames.bam" 33 | 34 | dtypes = [None, {"chrom": "a20"}, {"chrom": "a20"}] 35 | alignments = [Samfile(bampath), Samfile(bampath), bampath] 36 | results = [len(contig_label), 20, 20] 37 | labels = [contig_label, contig_label, contig_label] 38 | 39 | for arg in zip(dtypes, alignments, results, labels): 40 | assert check_write_hdf5_chrom_dtype(arg) 41 | -------------------------------------------------------------------------------- /pysamstats/test/test_pileup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | import logging 4 | import sys 5 | 6 | 7 | from nose.tools import eq_, assert_raises 8 | from pysam import Samfile, Fastafile 9 | import numpy as np 10 | from numpy import around as round 11 | 12 | 13 | from .util import normalise_coords, fwd, rev, pp, mean, std, rms, vmax, compare_iterators 14 | import pysamstats 15 | 16 | 17 | logger = logging.getLogger(__name__) 18 | debug = logger.debug 19 | 20 | 21 | # PY2/3 compatibility 22 | PY2 = sys.version_info[0] == 2 23 | 24 | 25 | def compare_stats(impl, refimpl): 26 | # no read filters 27 | kwargs = {'chrom': 'Pf3D7_01_v3', 28 | 'start': 0, 29 | 'end': 2000, 30 | 'one_based': False} 31 | expected = refimpl(Samfile('fixture/test.bam'), **kwargs) 32 | actual = 
impl(Samfile('fixture/test.bam'), **kwargs) 33 | compare_iterators(expected, actual) 34 | # read filters 35 | kwargs['min_mapq'] = 1 36 | kwargs['min_baseq'] = 17 37 | kwargs['no_del'] = True 38 | kwargs['no_dup'] = True 39 | expected = refimpl(Samfile('fixture/test.bam'), **kwargs) 40 | actual = impl(Samfile('fixture/test.bam'), **kwargs) 41 | compare_iterators(expected, actual) 42 | 43 | 44 | def compare_stats_withref(impl, refimpl, bam_fn='fixture/test.bam', 45 | fasta_fn='fixture/ref.fa'): 46 | # no read filters 47 | kwargs = {'chrom': 'Pf3D7_01_v3', 48 | 'start': 0, 49 | 'end': 2000, 50 | 'one_based': False} 51 | expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) 52 | actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) 53 | compare_iterators(expected, actual) 54 | # read filters 55 | kwargs['min_mapq'] = 1 56 | kwargs['min_baseq'] = 17 57 | kwargs['no_del'] = True 58 | kwargs['no_dup'] = True 59 | expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) 60 | actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) 61 | compare_iterators(expected, actual) 62 | 63 | 64 | def filter_reads(reads, min_mapq, min_baseq, no_del, no_dup): 65 | if min_mapq > 0: 66 | reads = [r for r in reads if r.alignment.mapq >= min_mapq] 67 | if min_baseq > 0: 68 | reads = [r for r, q in zip(reads, baseq(reads)) 69 | if q is not None and q >= min_baseq] 70 | if no_del: 71 | reads = nodel(reads) 72 | if no_dup: 73 | reads = nodup(reads) 74 | return reads 75 | 76 | 77 | def stat_coverage_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, min_mapq=0, 78 | min_baseq=0, no_del=False, no_dup=False): 79 | 80 | start, end = normalise_coords(one_based, start, end) 81 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 82 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 83 | chrom = samfile.getrname(col.tid) 84 | pos = col.pos + 1 if one_based else col.pos 85 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 86 | yield {'chrom': chrom, 'pos': pos, 'reads_all': len(reads), 87 | 'reads_pp': len(pp(reads))} 88 | 89 | 90 | def test_stat_coverage(): 91 | compare_stats(pysamstats.stat_coverage, stat_coverage_refimpl) 92 | 93 | 94 | def stat_coverage_strand_refimpl(samfile, chrom=None, start=None, end=None, 95 | one_based=False, min_mapq=0, min_baseq=0, no_del=False, 96 | no_dup=False): 97 | start, end = normalise_coords(one_based, start, end) 98 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 99 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 100 | chrom = samfile.getrname(col.tid) 101 | pos = col.pos + 1 if one_based else col.pos 102 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 103 | yield {'chrom': chrom, 'pos': pos, 104 | 'reads_all': len(reads), 105 | 'reads_fwd': len(fwd(reads)), 106 | 'reads_rev': len(rev(reads)), 107 | 'reads_pp': len(pp(reads)), 108 | 'reads_pp_fwd': len(fwd(pp(reads))), 109 | 'reads_pp_rev': len(rev(pp(reads)))} 110 | 111 | 112 | def test_stat_coverage_strand(): 113 | compare_stats(pysamstats.stat_coverage_strand, stat_coverage_strand_refimpl) 114 | 115 | 116 | def stat_coverage_ext_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, 117 | min_mapq=0, min_baseq=0, no_del=False, no_dup=False): 118 | start, end = normalise_coords(one_based, start, end) 119 | for col in samfile.pileup(reference=chrom, start=start, end=end, 
stepper="nofilter", flag_filter=0, 120 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 121 | chrom = samfile.getrname(col.tid) 122 | pos = col.pos + 1 if one_based else col.pos 123 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 124 | reads_mate_unmapped = [read for read in reads 125 | if read.alignment.mate_is_unmapped] 126 | reads_mate_mapped = [read for read in reads 127 | if not read.alignment.mate_is_unmapped] 128 | reads_mate_other_chr = [read for read in reads_mate_mapped 129 | if col.tid != read.alignment.rnext] 130 | reads_mate_same_strand = [ 131 | read for read in reads_mate_mapped 132 | if col.tid == read.alignment.rnext 133 | and (read.alignment.is_reverse == read.alignment.mate_is_reverse) 134 | ] 135 | reads_faceaway = [ 136 | read for read in reads_mate_mapped 137 | if read.alignment.is_reverse != read.alignment.mate_is_reverse 138 | and (( 139 | read.alignment.is_reverse and read.alignment.tlen > 0) # 140 | # mapped to reverse strand but leftmost 141 | or (not read.alignment.is_reverse and read.alignment.tlen < 0)) 142 | # mapped to fwd strand but rightmost 143 | ] 144 | reads_softclipped = [ 145 | read for read in reads 146 | if any((op[0] == 4) for op in read.alignment.cigar) 147 | ] 148 | reads_duplicate = [read for read in reads 149 | if read.alignment.is_duplicate] 150 | yield {'chrom': chrom, 'pos': pos, 151 | 'reads_all': len(reads), 152 | 'reads_pp': len(pp(reads)), 153 | 'reads_mate_unmapped': len(reads_mate_unmapped), 154 | 'reads_mate_other_chr': len(reads_mate_other_chr), 155 | 'reads_mate_same_strand': len(reads_mate_same_strand), 156 | 'reads_faceaway': len(reads_faceaway), 157 | 'reads_softclipped': len(reads_softclipped), 158 | 'reads_duplicate': len(reads_duplicate)} 159 | 160 | 161 | def test_stat_coverage_ext(): 162 | compare_stats(pysamstats.stat_coverage_ext, stat_coverage_ext_refimpl) 163 | 164 | 165 | def stat_coverage_ext_strand_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, 166 | min_mapq=0, min_baseq=0, no_del=False, no_dup=False): 167 | start, end = normalise_coords(one_based, start, end) 168 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 169 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 170 | chrom = samfile.getrname(col.tid) 171 | pos = col.pos + 1 if one_based else col.pos 172 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 173 | reads_pp = pp(reads) 174 | reads_mate_unmapped = [read for read in reads 175 | if read.alignment.mate_is_unmapped] 176 | reads_mate_mapped = [read for read in reads 177 | if not read.alignment.mate_is_unmapped] 178 | reads_mate_other_chr = [read for read in reads_mate_mapped 179 | if col.tid != read.alignment.rnext] 180 | reads_mate_same_strand = [ 181 | read for read in reads_mate_mapped 182 | if col.tid == read.alignment.rnext 183 | and (read.alignment.is_reverse == read.alignment.mate_is_reverse) 184 | ] 185 | reads_faceaway = [ 186 | read for read in reads_mate_mapped 187 | if read.alignment.is_reverse != read.alignment.mate_is_reverse 188 | and (( 189 | read.alignment.is_reverse and read.alignment.tlen > 0) # 190 | # mapped to reverse strand but leftmost 191 | or (not read.alignment.is_reverse and read.alignment.tlen < 0)) 192 | # mapped to fwd strand but rightmost 193 | ] 194 | reads_softclipped = [ 195 | read for read in reads 196 | if any((op[0] == 4) for op in read.alignment.cigar) 197 | ] 198 | reads_duplicate = [read for read in reads 199 | 
if read.alignment.is_duplicate] 200 | yield {'chrom': chrom, 'pos': pos, 201 | 'reads_all': len(reads), 202 | 'reads_fwd': len(fwd(reads)), 203 | 'reads_rev': len(rev(reads)), 204 | 'reads_pp': len(reads_pp), 205 | 'reads_pp_fwd': len(fwd(reads_pp)), 206 | 'reads_pp_rev': len(rev(reads_pp)), 207 | 'reads_mate_unmapped': len(reads_mate_unmapped), 208 | 'reads_mate_unmapped_fwd': len(fwd(reads_mate_unmapped)), 209 | 'reads_mate_unmapped_rev': len(rev(reads_mate_unmapped)), 210 | 'reads_mate_other_chr': len(reads_mate_other_chr), 211 | 'reads_mate_other_chr_fwd': len(fwd(reads_mate_other_chr)), 212 | 'reads_mate_other_chr_rev': len(rev(reads_mate_other_chr)), 213 | 'reads_mate_same_strand': len(reads_mate_same_strand), 214 | 'reads_mate_same_strand_fwd': len(fwd(reads_mate_same_strand)), 215 | 'reads_mate_same_strand_rev': len(rev(reads_mate_same_strand)), 216 | 'reads_faceaway': len(reads_faceaway), 217 | 'reads_faceaway_fwd': len(fwd(reads_faceaway)), 218 | 'reads_faceaway_rev': len(rev(reads_faceaway)), 219 | 'reads_softclipped': len(reads_softclipped), 220 | 'reads_softclipped_fwd': len(fwd(reads_softclipped)), 221 | 'reads_softclipped_rev': len(rev(reads_softclipped)), 222 | 'reads_duplicate': len(reads_duplicate), 223 | 'reads_duplicate_fwd': len(fwd(reads_duplicate)), 224 | 'reads_duplicate_rev': len(rev(reads_duplicate)), 225 | } 226 | 227 | 228 | def test_stat_coverage_ext_strand(): 229 | compare_stats(pysamstats.stat_coverage_ext_strand, stat_coverage_ext_strand_refimpl) 230 | 231 | 232 | def stat_variation_refimpl(samfile, fafile, chrom=None, start=None, end=None, one_based=False, 233 | min_mapq=0, min_baseq=0, no_del=False, no_dup=False): 234 | start, end = normalise_coords(one_based, start, end) 235 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 236 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 237 | chrom = samfile.getrname(col.tid) 238 | pos = col.pos + 1 if one_based else col.pos 239 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 240 | reads_nodel = [read for read in reads if not read.is_del] 241 | reads_pp = pp(reads) 242 | reads_pp_nodel = [read for read in reads_pp if not read.is_del] 243 | ref = fafile.fetch(chrom, col.pos, col.pos+1).upper() 244 | debug('%r %r %r', chrom, pos, ref) 245 | # if reads: 246 | # debug(repr(reads[0].alignment.seq[reads[0].query_position])) 247 | matches = [read for read in reads_nodel 248 | if read.alignment.seq[read.query_position] == ref] 249 | matches_pp = [read for read in reads_pp_nodel 250 | if read.alignment.seq[read.query_position] == ref] 251 | mismatches = [read for read in reads_nodel 252 | if read.alignment.seq[read.query_position] != ref] 253 | mismatches_pp = [read for read in reads_pp_nodel 254 | if read.alignment.seq[read.query_position] != ref] 255 | deletions = [read for read in reads 256 | if read.is_del and not read.is_refskip] 257 | deletions_pp = [read for read in reads_pp 258 | if read.is_del and not read.is_refskip] 259 | insertions = [read for read in reads 260 | if read.indel > 0] 261 | insertions_pp = [read for read in reads_pp 262 | if read.indel > 0] 263 | debug([read.alignment.seq[read.query_position] 264 | for read in reads_nodel]) 265 | a = [read for read in reads_nodel 266 | if read.alignment.seq[read.query_position] == 'A'] 267 | a_pp = [read for read in reads_pp_nodel 268 | if read.alignment.seq[read.query_position] == 'A'] 269 | c = [read for read in reads_nodel 270 | if 
read.alignment.seq[read.query_position] == 'C'] 271 | c_pp = [read for read in reads_pp_nodel 272 | if read.alignment.seq[read.query_position] == 'C'] 273 | t = [read for read in reads_nodel 274 | if read.alignment.seq[read.query_position] == 'T'] 275 | t_pp = [read for read in reads_pp_nodel 276 | if read.alignment.seq[read.query_position] == 'T'] 277 | g = [read for read in reads_nodel 278 | if read.alignment.seq[read.query_position] == 'G'] 279 | g_pp = [read for read in reads_pp_nodel 280 | if read.alignment.seq[read.query_position] == 'G'] 281 | n = [read for read in reads_nodel 282 | if read.alignment.seq[read.query_position] == 'N'] 283 | n_pp = [read for read in reads_pp_nodel 284 | if read.alignment.seq[read.query_position] == 'N'] 285 | yield {'chrom': chrom, 'pos': pos, 'ref': ref, 286 | 'reads_all': len(reads), 287 | 'reads_pp': len(reads_pp), 288 | 'matches': len(matches), 289 | 'matches_pp': len(matches_pp), 290 | 'mismatches': len(mismatches), 291 | 'mismatches_pp': len(mismatches_pp), 292 | 'deletions': len(deletions), 293 | 'deletions_pp': len(deletions_pp), 294 | 'insertions': len(insertions), 295 | 'insertions_pp': len(insertions_pp), 296 | 'A': len(a), 'A_pp': len(a_pp), 297 | 'C': len(c), 'C_pp': len(c_pp), 298 | 'T': len(t), 'T_pp': len(t_pp), 299 | 'G': len(g), 'G_pp': len(g_pp), 300 | 'N': len(n), 'N_pp': len(n_pp)} 301 | 302 | 303 | def test_stat_variation(): 304 | compare_stats_withref(pysamstats.stat_variation, stat_variation_refimpl) 305 | 306 | 307 | def test_stat_variation_rna(): 308 | compare_stats_withref(pysamstats.stat_variation, stat_variation_refimpl, 309 | bam_fn='fixture/rna.bam') 310 | 311 | 312 | def stat_variation_strand_refimpl(samfile, fafile, chrom=None, start=None, end=None, 313 | one_based=False, min_mapq=0, min_baseq=0, no_del=False, 314 | no_dup=False): 315 | start, end = normalise_coords(one_based, start, end) 316 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 317 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 318 | chrom = samfile.getrname(col.tid) 319 | pos = col.pos + 1 if one_based else col.pos 320 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 321 | reads_nodel = [read for read in reads if not read.is_del] 322 | reads_pp = [read for read in reads if read.alignment.is_proper_pair] 323 | reads_pp_nodel = [read for read in reads 324 | if read.alignment.is_proper_pair and not read.is_del] 325 | ref = fafile.fetch(chrom, col.pos, col.pos+1).upper() 326 | matches = [read for read in reads_nodel 327 | if read.alignment.seq[read.query_position] == ref] 328 | matches_pp = [read for read in reads_pp_nodel 329 | if read.alignment.seq[read.query_position] == ref] 330 | mismatches = [read for read in reads_nodel 331 | if read.alignment.seq[read.query_position] != ref] 332 | mismatches_pp = [read for read in reads_pp_nodel 333 | if read.alignment.seq[read.query_position] != ref] 334 | deletions = [read for read in reads 335 | if read.is_del and not read.is_refskip] 336 | deletions_pp = [read for read in reads_pp 337 | if read.is_del and not read.is_refskip] 338 | insertions = [read for read in reads 339 | if read.indel > 0] 340 | insertions_pp = [read for read in reads_pp 341 | if read.indel > 0] 342 | a = [read for read in reads_nodel 343 | if read.alignment.seq[read.query_position] == 'A'] 344 | a_pp = [read for read in reads_pp_nodel 345 | if read.alignment.seq[read.query_position] == 'A'] 346 | c = [read for read in reads_nodel 347 | if 
read.alignment.seq[read.query_position] == 'C'] 348 | c_pp = [read for read in reads_pp_nodel 349 | if read.alignment.seq[read.query_position] == 'C'] 350 | t = [read for read in reads_nodel 351 | if read.alignment.seq[read.query_position] == 'T'] 352 | t_pp = [read for read in reads_pp_nodel 353 | if read.alignment.seq[read.query_position] == 'T'] 354 | g = [read for read in reads_nodel 355 | if read.alignment.seq[read.query_position] == 'G'] 356 | g_pp = [read for read in reads_pp_nodel 357 | if read.alignment.seq[read.query_position] == 'G'] 358 | n = [read for read in reads_nodel 359 | if read.alignment.seq[read.query_position] == 'N'] 360 | n_pp = [read for read in reads_pp_nodel 361 | if read.alignment.seq[read.query_position] == 'N'] 362 | yield { 363 | 'chrom': chrom, 'pos': pos, 'ref': ref, 364 | 'reads_all': len(reads), 365 | 'reads_fwd': len(fwd(reads)), 366 | 'reads_rev': len(rev(reads)), 367 | 'reads_pp': len(reads_pp), 368 | 'reads_pp_fwd': len(fwd(reads_pp)), 369 | 'reads_pp_rev': len(rev(reads_pp)), 370 | 'matches': len(matches), 371 | 'matches_fwd': len(fwd(matches)), 372 | 'matches_rev': len(rev(matches)), 373 | 'matches_pp': len(matches_pp), 374 | 'matches_pp_fwd': len(fwd(matches_pp)), 375 | 'matches_pp_rev': len(rev(matches_pp)), 376 | 'mismatches': len(mismatches), 377 | 'mismatches_fwd': len(fwd(mismatches)), 378 | 'mismatches_rev': len(rev(mismatches)), 379 | 'mismatches_pp': len(mismatches_pp), 380 | 'mismatches_pp_fwd': len(fwd(mismatches_pp)), 381 | 'mismatches_pp_rev': len(rev(mismatches_pp)), 382 | 'deletions': len(deletions), 383 | 'deletions_fwd': len(fwd(deletions)), 384 | 'deletions_rev': len(rev(deletions)), 385 | 'deletions_pp': len(deletions_pp), 386 | 'deletions_pp_fwd': len(fwd(deletions_pp)), 387 | 'deletions_pp_rev': len(rev(deletions_pp)), 388 | 'insertions': len(insertions), 389 | 'insertions_fwd': len(fwd(insertions)), 390 | 'insertions_rev': len(rev(insertions)), 391 | 'insertions_pp': len(insertions_pp), 392 | 'insertions_pp_fwd': len(fwd(insertions_pp)), 393 | 'insertions_pp_rev': len(rev(insertions_pp)), 394 | 'A': len(a), 'A_fwd': len(fwd(a)), 'A_rev': len(rev(a)), 395 | 'A_pp': len(a_pp), 'A_pp_fwd': len(fwd(a_pp)), 'A_pp_rev': len(rev(a_pp)), 396 | 'C': len(c), 'C_fwd': len(fwd(c)), 'C_rev': len(rev(c)), 397 | 'C_pp': len(c_pp), 'C_pp_fwd': len(fwd(c_pp)), 'C_pp_rev': len(rev(c_pp)), 398 | 'T': len(t), 'T_fwd': len(fwd(t)), 'T_rev': len(rev(t)), 399 | 'T_pp': len(t_pp), 'T_pp_fwd': len(fwd(t_pp)), 'T_pp_rev': len(rev(t_pp)), 400 | 'G': len(g), 'G_fwd': len(fwd(g)), 'G_rev': len(rev(g)), 401 | 'G_pp': len(g_pp), 'G_pp_fwd': len(fwd(g_pp)), 'G_pp_rev': len(rev(g_pp)), 402 | 'N': len(n), 'N_fwd': len(fwd(n)), 'N_rev': len(rev(n)), 403 | 'N_pp': len(n_pp), 'N_pp_fwd': len(fwd(n_pp)), 'N_pp_rev': len(rev(n_pp)) 404 | } 405 | 406 | 407 | def test_stat_variation_strand(): 408 | compare_stats_withref(pysamstats.stat_variation_strand, 409 | stat_variation_strand_refimpl) 410 | 411 | 412 | def test_stat_variation_strand_rna(): 413 | compare_stats_withref(pysamstats.stat_variation_strand, stat_variation_strand_refimpl, 414 | bam_fn='fixture/rna.bam') 415 | 416 | 417 | def stat_tlen_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, min_mapq=0, 418 | min_baseq=0, no_del=False, no_dup=False): 419 | start, end = normalise_coords(one_based, start, end) 420 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 421 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 422 | chrom 
= samfile.getrname(col.tid) 423 | pos = col.pos + 1 if one_based else col.pos 424 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 425 | # N.B., tlen only means something if mate is mapped to same chromosome 426 | reads_paired = [read for read in reads 427 | if not read.alignment.mate_is_unmapped 428 | and read.alignment.rnext == col.tid] 429 | tlen = [read.alignment.tlen for read in reads_paired] 430 | mean_tlen, rms_tlen, std_tlen = mean(tlen), rms(tlen), std(tlen) 431 | reads_pp = pp(reads) 432 | tlen_pp = [read.alignment.tlen for read in reads_pp] 433 | mean_tlen_pp, rms_tlen_pp, std_tlen_pp = mean(tlen_pp), rms(tlen_pp), std(tlen_pp) 434 | yield {'chrom': chrom, 'pos': pos, 435 | 'reads_all': len(reads), 436 | 'reads_paired': len(reads_paired), 437 | 'reads_pp': len(reads_pp), 438 | 'mean_tlen': mean_tlen, 439 | 'mean_tlen_pp': mean_tlen_pp, 440 | 'rms_tlen': rms_tlen, 441 | 'rms_tlen_pp': rms_tlen_pp, 442 | 'std_tlen': std_tlen, 443 | 'std_tlen_pp': std_tlen_pp} 444 | 445 | 446 | def test_stat_tlen(): 447 | compare_stats(pysamstats.stat_tlen, stat_tlen_refimpl) 448 | 449 | 450 | def stat_tlen_strand_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, 451 | min_mapq=0, min_baseq=0, no_del=False, no_dup=False): 452 | start, end = normalise_coords(one_based, start, end) 453 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 454 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 455 | chrom = samfile.getrname(col.tid) 456 | pos = col.pos + 1 if one_based else col.pos 457 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 458 | 459 | # all "paired" reads 460 | reads_paired = [read for read in reads 461 | if not read.alignment.mate_is_unmapped 462 | and read.alignment.rnext == col.tid] 463 | tlen = [read.alignment.tlen for read in reads_paired] 464 | mean_tlen, rms_tlen, std_tlen = mean(tlen), rms(tlen), std(tlen) 465 | reads_paired_fwd = fwd(reads_paired) 466 | tlen_fwd = [read.alignment.tlen for read in reads_paired_fwd] 467 | mean_tlen_fwd, rms_tlen_fwd, std_tlen_fwd = \ 468 | mean(tlen_fwd), rms(tlen_fwd), std(tlen_fwd) 469 | reads_paired_rev = rev(reads_paired) 470 | tlen_rev = [read.alignment.tlen for read in reads_paired_rev] 471 | mean_tlen_rev, rms_tlen_rev, std_tlen_rev = \ 472 | mean(tlen_rev), rms(tlen_rev), std(tlen_rev) 473 | 474 | # properly paired reads 475 | reads_pp = pp(reads) 476 | tlen_pp = [read.alignment.tlen for read in reads_pp] 477 | mean_tlen_pp, rms_tlen_pp, std_tlen_pp = \ 478 | mean(tlen_pp), rms(tlen_pp), std(tlen_pp) 479 | reads_pp_fwd = fwd(reads_pp) 480 | tlen_pp_fwd = [read.alignment.tlen for read in reads_pp_fwd] 481 | mean_tlen_pp_fwd, rms_tlen_pp_fwd, std_tlen_pp_fwd = \ 482 | mean(tlen_pp_fwd), rms(tlen_pp_fwd), std(tlen_pp_fwd) 483 | reads_pp_rev = rev(reads_pp) 484 | tlen_pp_rev = [read.alignment.tlen for read in reads_pp_rev] 485 | mean_tlen_pp_rev, rms_tlen_pp_rev, std_tlen_pp_rev = \ 486 | mean(tlen_pp_rev), rms(tlen_pp_rev), std(tlen_pp_rev) 487 | 488 | # yield record 489 | yield {'chrom': chrom, 'pos': pos, 490 | 'reads_all': len(reads), 491 | 'reads_fwd': len(fwd(reads)), 492 | 'reads_rev': len(rev(reads)), 493 | 'reads_paired': len(reads_paired), 494 | 'reads_paired_fwd': len(fwd(reads_paired)), 495 | 'reads_paired_rev': len(rev(reads_paired)), 496 | 'reads_pp': len(reads_pp), 497 | 'reads_pp_fwd': len(fwd(reads_pp)), 498 | 'reads_pp_rev': len(rev(reads_pp)), 499 | 'mean_tlen': mean_tlen, 500 | 'mean_tlen_fwd': 
mean_tlen_fwd, 501 | 'mean_tlen_rev': mean_tlen_rev, 502 | 'mean_tlen_pp': mean_tlen_pp, 503 | 'mean_tlen_pp_fwd': mean_tlen_pp_fwd, 504 | 'mean_tlen_pp_rev': mean_tlen_pp_rev, 505 | 'rms_tlen': rms_tlen, 506 | 'rms_tlen_fwd': rms_tlen_fwd, 507 | 'rms_tlen_rev': rms_tlen_rev, 508 | 'rms_tlen_pp': rms_tlen_pp, 509 | 'rms_tlen_pp_fwd': rms_tlen_pp_fwd, 510 | 'rms_tlen_pp_rev': rms_tlen_pp_rev, 511 | 'std_tlen': std_tlen, 512 | 'std_tlen_fwd': std_tlen_fwd, 513 | 'std_tlen_rev': std_tlen_rev, 514 | 'std_tlen_pp': std_tlen_pp, 515 | 'std_tlen_pp_fwd': std_tlen_pp_fwd, 516 | 'std_tlen_pp_rev': std_tlen_pp_rev} 517 | 518 | 519 | def test_stat_tlen_strand(): 520 | compare_stats(pysamstats.stat_tlen_strand, stat_tlen_strand_refimpl) 521 | 522 | 523 | def mapq0(reads): 524 | return [read for read in reads if read.alignment.mapq == 0] 525 | 526 | 527 | def mapq(reads): 528 | return [read.alignment.mapq for read in reads] 529 | 530 | 531 | def stat_mapq_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, min_mapq=0, 532 | min_baseq=0, no_del=False, no_dup=False): 533 | start, end = normalise_coords(one_based, start, end) 534 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 535 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 536 | chrom = samfile.getrname(col.tid) 537 | pos = col.pos + 1 if one_based else col.pos 538 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 539 | reads_pp = pp(reads) 540 | reads_mapq0 = mapq0(reads) 541 | reads_mapq0_pp = mapq0(reads_pp) 542 | mapq_all = mapq(reads) 543 | rms_mapq, max_mapq = rms(mapq_all), vmax(mapq_all) 544 | mapq_pp = mapq(reads_pp) 545 | rms_mapq_pp, max_mapq_pp = rms(mapq_pp), vmax(mapq_pp) 546 | yield {'chrom': chrom, 'pos': pos, 547 | 'reads_all': len(reads), 548 | 'reads_pp': len(reads_pp), 549 | 'reads_mapq0': len(reads_mapq0), 550 | 'reads_mapq0_pp': len(reads_mapq0_pp), 551 | 'rms_mapq': rms_mapq, 552 | 'rms_mapq_pp': rms_mapq_pp, 553 | 'max_mapq': max_mapq, 554 | 'max_mapq_pp': max_mapq_pp, 555 | } 556 | 557 | 558 | def test_stat_mapq(): 559 | compare_stats(pysamstats.stat_mapq, stat_mapq_refimpl) 560 | 561 | 562 | def stat_mapq_strand_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, 563 | min_mapq=0, min_baseq=0, no_del=False, no_dup=False): 564 | start, end = normalise_coords(one_based, start, end) 565 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 566 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 567 | chrom = samfile.getrname(col.tid) 568 | pos = col.pos + 1 if one_based else col.pos 569 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 570 | reads_fwd = fwd(reads) 571 | reads_rev = rev(reads) 572 | reads_pp = pp(reads) 573 | reads_pp_fwd = fwd(reads_pp) 574 | reads_pp_rev = rev(reads_pp) 575 | reads_mapq0 = mapq0(reads) 576 | reads_mapq0_fwd = mapq0(reads_fwd) 577 | reads_mapq0_rev = mapq0(reads_rev) 578 | reads_mapq0_pp = mapq0(reads_pp) 579 | reads_mapq0_pp_fwd = mapq0(reads_pp_fwd) 580 | reads_mapq0_pp_rev = mapq0(reads_pp_rev) 581 | mapq_all = mapq(reads) 582 | rms_mapq, max_mapq = rms(mapq_all), vmax(mapq_all) 583 | mapq_fwd = mapq(reads_fwd) 584 | rms_mapq_fwd, max_mapq_fwd = rms(mapq_fwd), vmax(mapq_fwd) 585 | mapq_rev = mapq(reads_rev) 586 | rms_mapq_rev, max_mapq_rev = rms(mapq_rev), vmax(mapq_rev) 587 | mapq_pp = mapq(reads_pp) 588 | rms_mapq_pp, max_mapq_pp = rms(mapq_pp), vmax(mapq_pp) 589 | mapq_pp_fwd 
= mapq(reads_pp_fwd) 590 | rms_mapq_pp_fwd, max_mapq_pp_fwd = rms(mapq_pp_fwd), vmax(mapq_pp_fwd) 591 | mapq_pp_rev = mapq(reads_pp_rev) 592 | rms_mapq_pp_rev, max_mapq_pp_rev = rms(mapq_pp_rev), vmax(mapq_pp_rev) 593 | yield {'chrom': chrom, 'pos': pos, 594 | 'reads_all': len(reads), 595 | 'reads_fwd': len(reads_fwd), 596 | 'reads_rev': len(reads_rev), 597 | 'reads_pp': len(reads_pp), 598 | 'reads_pp_fwd': len(reads_pp_fwd), 599 | 'reads_pp_rev': len(reads_pp_rev), 600 | 'reads_mapq0': len(reads_mapq0), 601 | 'reads_mapq0_fwd': len(reads_mapq0_fwd), 602 | 'reads_mapq0_rev': len(reads_mapq0_rev), 603 | 'reads_mapq0_pp': len(reads_mapq0_pp), 604 | 'reads_mapq0_pp_fwd': len(reads_mapq0_pp_fwd), 605 | 'reads_mapq0_pp_rev': len(reads_mapq0_pp_rev), 606 | 'rms_mapq': rms_mapq, 607 | 'rms_mapq_fwd': rms_mapq_fwd, 608 | 'rms_mapq_rev': rms_mapq_rev, 609 | 'rms_mapq_pp': rms_mapq_pp, 610 | 'rms_mapq_pp_fwd': rms_mapq_pp_fwd, 611 | 'rms_mapq_pp_rev': rms_mapq_pp_rev, 612 | 'max_mapq': max_mapq, 613 | 'max_mapq_fwd': max_mapq_fwd, 614 | 'max_mapq_rev': max_mapq_rev, 615 | 'max_mapq_pp': max_mapq_pp, 616 | 'max_mapq_pp_fwd': max_mapq_pp_fwd, 617 | 'max_mapq_pp_rev': max_mapq_pp_rev, 618 | } 619 | 620 | 621 | def test_stat_mapq_strand(): 622 | compare_stats(pysamstats.stat_mapq_strand, stat_mapq_strand_refimpl) 623 | 624 | 625 | def baseq(reads): 626 | l = [ord(read.alignment.qual[read.query_position]) - 33 627 | if read.query_position is not None 628 | else None 629 | for read in reads] 630 | return l 631 | 632 | 633 | def nodel(reads): 634 | return [read for read in reads if not read.is_del] 635 | 636 | 637 | def nodup(reads): 638 | return [read for read in reads if not read.alignment.is_duplicate] 639 | 640 | 641 | def stat_baseq_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, min_mapq=0, 642 | min_baseq=0, no_del=False, no_dup=False): 643 | start, end = normalise_coords(one_based, start, end) 644 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 645 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 646 | chrom = samfile.getrname(col.tid) 647 | pos = col.pos + 1 if one_based else col.pos 648 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 649 | # N.B., make sure aligned base is not a deletion 650 | reads_nodel = nodel(reads) 651 | reads_pp = pp(reads) 652 | reads_pp_nodel = nodel(reads_pp) 653 | rms_baseq = rms(baseq(reads_nodel)) 654 | rms_baseq_pp = rms(baseq(reads_pp_nodel)) 655 | yield {'chrom': chrom, 'pos': pos, 656 | 'reads_all': len(reads), 657 | 'reads_pp': len(reads_pp), 658 | 'rms_baseq': rms_baseq, 659 | 'rms_baseq_pp': rms_baseq_pp} 660 | 661 | 662 | def test_stat_baseq(): 663 | compare_stats(pysamstats.stat_baseq, stat_baseq_refimpl) 664 | 665 | 666 | def stat_baseq_strand_refimpl(samfile, chrom=None, start=None, end=None, one_based=False, 667 | min_mapq=0, min_baseq=0, no_del=False, no_dup=False): 668 | start, end = normalise_coords(one_based, start, end) 669 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 670 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 671 | chrom = samfile.getrname(col.tid) 672 | pos = col.pos + 1 if one_based else col.pos 673 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 674 | reads_fwd = fwd(reads) 675 | reads_rev = rev(reads) 676 | reads_pp = pp(reads) 677 | reads_pp_fwd = fwd(reads_pp) 678 | reads_pp_rev = rev(reads_pp) 679 | reads_nodel = 
nodel(reads) 680 | reads_fwd_nodel = nodel(reads_fwd) 681 | reads_rev_nodel = nodel(reads_rev) 682 | reads_pp_nodel = nodel(reads_pp) 683 | reads_pp_fwd_nodel = nodel(reads_pp_fwd) 684 | reads_pp_rev_nodel = nodel(reads_pp_rev) 685 | rms_baseq = rms(baseq(reads_nodel)) 686 | rms_baseq_fwd = rms(baseq(reads_fwd_nodel)) 687 | rms_baseq_rev = rms(baseq(reads_rev_nodel)) 688 | rms_baseq_pp = rms(baseq(reads_pp_nodel)) 689 | rms_baseq_pp_fwd = rms(baseq(reads_pp_fwd_nodel)) 690 | rms_baseq_pp_rev = rms(baseq(reads_pp_rev_nodel)) 691 | yield { 692 | 'chrom': chrom, 'pos': pos, 693 | 'reads_all': len(reads), 694 | 'reads_fwd': len(reads_fwd), 695 | 'reads_rev': len(reads_rev), 696 | 'reads_pp': len(reads_pp), 697 | 'reads_pp_fwd': len(reads_pp_fwd), 698 | 'reads_pp_rev': len(reads_pp_rev), 699 | 'rms_baseq': rms_baseq, 700 | 'rms_baseq_fwd': rms_baseq_fwd, 701 | 'rms_baseq_rev': rms_baseq_rev, 702 | 'rms_baseq_pp': rms_baseq_pp, 703 | 'rms_baseq_pp_fwd': rms_baseq_pp_fwd, 704 | 'rms_baseq_pp_rev': rms_baseq_pp_rev, 705 | } 706 | 707 | 708 | def test_stat_baseq_strand(): 709 | compare_stats(pysamstats.stat_baseq_strand, stat_baseq_strand_refimpl) 710 | 711 | 712 | def stat_baseq_ext_refimpl(samfile, fafile, chrom=None, start=None, end=None, one_based=False, 713 | min_mapq=0, min_baseq=0, no_del=False, no_dup=False): 714 | start, end = normalise_coords(one_based, start, end) 715 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 716 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 717 | chrom = samfile.getrname(col.tid) 718 | pos = col.pos + 1 if one_based else col.pos 719 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 720 | reads_nodel = [read for read in reads if not read.is_del] 721 | reads_pp = pp(reads) 722 | reads_pp_nodel = [read for read in reads_pp if not read.is_del] 723 | ref = fafile.fetch(chrom, col.pos, col.pos+1).upper() 724 | matches = [read for read in reads_nodel 725 | if read.alignment.seq[read.query_position] == ref] 726 | matches_pp = [read for read in reads_pp_nodel 727 | if read.alignment.seq[read.query_position] == ref] 728 | mismatches = [read for read in reads_nodel 729 | if read.alignment.seq[read.query_position] != ref] 730 | mismatches_pp = [read for read in reads_pp_nodel 731 | if read.alignment.seq[read.query_position] != ref] 732 | 733 | rms_baseq = rms(baseq(reads_nodel)) 734 | rms_baseq_pp = rms(baseq(reads_pp_nodel)) 735 | rms_baseq_matches = rms(baseq(matches)) 736 | rms_baseq_matches_pp = rms(baseq(matches_pp)) 737 | rms_baseq_mismatches = rms(baseq(mismatches)) 738 | rms_baseq_mismatches_pp = rms(baseq(mismatches_pp)) 739 | yield {'chrom': chrom, 'pos': pos, 'ref': ref, 740 | 'reads_all': len(reads), 741 | 'reads_pp': len(reads_pp), 742 | 'matches': len(matches), 743 | 'matches_pp': len(matches_pp), 744 | 'mismatches': len(mismatches), 745 | 'mismatches_pp': len(mismatches_pp), 746 | 'rms_baseq': rms_baseq, 747 | 'rms_baseq_pp': rms_baseq_pp, 748 | 'rms_baseq_matches': rms_baseq_matches, 749 | 'rms_baseq_matches_pp': rms_baseq_matches_pp, 750 | 'rms_baseq_mismatches': rms_baseq_mismatches, 751 | 'rms_baseq_mismatches_pp': rms_baseq_mismatches_pp, 752 | } 753 | 754 | 755 | def test_stat_baseq_ext(): 756 | compare_stats_withref(pysamstats.stat_baseq_ext, stat_baseq_ext_refimpl) 757 | 758 | 759 | def stat_baseq_ext_strand_refimpl(samfile, fafile, chrom=None, start=None, end=None, 760 | one_based=False, min_mapq=0, min_baseq=0, no_del=False, 761 | no_dup=False): 762 | 
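# Reference implementation for pysamstats.stat_baseq_ext_strand: for each pileup
# column it reports match/mismatch counts against the reference base and RMS base
# qualities, each broken down by strand (fwd/rev) and by proper-pair (pp) status.
# A minimal usage sketch of the optimised function under test, using the same
# fixture files as the rest of this suite (the printed fields are illustrative):
#
#     from pysam import Samfile, Fastafile
#     import pysamstats
#     for rec in pysamstats.stat_baseq_ext_strand(
#             Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
#             chrom='Pf3D7_01_v3', start=0, end=2000):
#         print(rec['pos'], rec['rms_baseq_matches_fwd'],
#               rec['rms_baseq_mismatches_rev'])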
start, end = normalise_coords(one_based, start, end) 763 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 764 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 765 | chrom = samfile.getrname(col.tid) 766 | pos = col.pos + 1 if one_based else col.pos 767 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 768 | reads_pp = pp(reads) 769 | reads_nodel = [read for read in reads if not read.is_del] 770 | reads_nodel_fwd = fwd(reads_nodel) 771 | reads_nodel_rev = rev(reads_nodel) 772 | reads_nodel_pp = pp(reads_nodel) 773 | reads_nodel_pp_fwd = fwd(reads_nodel_pp) 774 | reads_nodel_pp_rev = rev(reads_nodel_pp) 775 | reads_pp_nodel = [read for read in reads_pp if not read.is_del] 776 | ref = fafile.fetch(chrom, col.pos, col.pos+1).upper() 777 | matches = [read for read in reads_nodel 778 | if read.alignment.seq[read.query_position] == ref] 779 | matches_fwd = fwd(matches) 780 | matches_rev = rev(matches) 781 | matches_pp = pp(matches) 782 | matches_pp_fwd = fwd(matches_pp) 783 | matches_pp_rev = rev(matches_pp) 784 | mismatches = [read for read in reads_nodel 785 | if read.alignment.seq[read.query_position] != ref] 786 | mismatches_fwd = fwd(mismatches) 787 | mismatches_rev = rev(mismatches) 788 | mismatches_pp = pp(mismatches) 789 | mismatches_pp_fwd = fwd(mismatches_pp) 790 | mismatches_pp_rev = rev(mismatches_pp) 791 | 792 | rms_baseq = rms(baseq(reads_nodel)) 793 | rms_baseq_fwd = rms(baseq(reads_nodel_fwd)) 794 | rms_baseq_rev = rms(baseq(reads_nodel_rev)) 795 | rms_baseq_pp = rms(baseq(reads_pp_nodel)) 796 | rms_baseq_pp_fwd = rms(baseq(reads_nodel_pp_fwd)) 797 | rms_baseq_pp_rev = rms(baseq(reads_nodel_pp_rev)) 798 | rms_baseq_matches = rms(baseq(matches)) 799 | rms_baseq_matches_fwd = rms(baseq(matches_fwd)) 800 | rms_baseq_matches_rev = rms(baseq(matches_rev)) 801 | rms_baseq_matches_pp = rms(baseq(matches_pp)) 802 | rms_baseq_matches_pp_fwd = rms(baseq(matches_pp_fwd)) 803 | rms_baseq_matches_pp_rev = rms(baseq(matches_pp_rev)) 804 | rms_baseq_mismatches = rms(baseq(mismatches)) 805 | rms_baseq_mismatches_fwd = rms(baseq(mismatches_fwd)) 806 | rms_baseq_mismatches_rev = rms(baseq(mismatches_rev)) 807 | rms_baseq_mismatches_pp = rms(baseq(mismatches_pp)) 808 | rms_baseq_mismatches_pp_fwd = rms(baseq(mismatches_pp_fwd)) 809 | rms_baseq_mismatches_pp_rev = rms(baseq(mismatches_pp_rev)) 810 | yield {'chrom': chrom, 'pos': pos, 'ref': ref, 811 | 'reads_all': len(reads), 812 | 'reads_fwd': len(fwd(reads)), 813 | 'reads_rev': len(rev(reads)), 814 | 'reads_pp': len(reads_pp), 815 | 'reads_pp_fwd': len(fwd(reads_pp)), 816 | 'reads_pp_rev': len(rev(reads_pp)), 817 | 'matches': len(matches), 818 | 'matches_fwd': len(matches_fwd), 819 | 'matches_rev': len(matches_rev), 820 | 'matches_pp': len(matches_pp), 821 | 'matches_pp_fwd': len(matches_pp_fwd), 822 | 'matches_pp_rev': len(matches_pp_rev), 823 | 'mismatches': len(mismatches), 824 | 'mismatches_fwd': len(mismatches_fwd), 825 | 'mismatches_rev': len(mismatches_rev), 826 | 'mismatches_pp': len(mismatches_pp), 827 | 'mismatches_pp_fwd': len(mismatches_pp_fwd), 828 | 'mismatches_pp_rev': len(mismatches_pp_rev), 829 | 'rms_baseq': rms_baseq, 830 | 'rms_baseq_fwd': rms_baseq_fwd, 831 | 'rms_baseq_rev': rms_baseq_rev, 832 | 'rms_baseq_pp': rms_baseq_pp, 833 | 'rms_baseq_pp_fwd': rms_baseq_pp_fwd, 834 | 'rms_baseq_pp_rev': rms_baseq_pp_rev, 835 | 'rms_baseq_matches': rms_baseq_matches, 836 | 'rms_baseq_matches_fwd': rms_baseq_matches_fwd, 837 | 
'rms_baseq_matches_rev': rms_baseq_matches_rev, 838 | 'rms_baseq_matches_pp': rms_baseq_matches_pp, 839 | 'rms_baseq_matches_pp_fwd': rms_baseq_matches_pp_fwd, 840 | 'rms_baseq_matches_pp_rev': rms_baseq_matches_pp_rev, 841 | 'rms_baseq_mismatches': rms_baseq_mismatches, 842 | 'rms_baseq_mismatches_fwd': rms_baseq_mismatches_fwd, 843 | 'rms_baseq_mismatches_rev': rms_baseq_mismatches_rev, 844 | 'rms_baseq_mismatches_pp': rms_baseq_mismatches_pp, 845 | 'rms_baseq_mismatches_pp_fwd': rms_baseq_mismatches_pp_fwd, 846 | 'rms_baseq_mismatches_pp_rev': rms_baseq_mismatches_pp_rev, 847 | } 848 | 849 | 850 | def test_stat_baseq_ext_strand(): 851 | compare_stats_withref(pysamstats.stat_baseq_ext_strand, 852 | stat_baseq_ext_strand_refimpl) 853 | 854 | 855 | from collections import Counter 856 | 857 | 858 | def stat_coverage_gc_refimpl(samfile, fafile, chrom=None, start=None, end=None, one_based=False, 859 | window_size=300, window_offset=150, min_mapq=0, min_baseq=0, 860 | no_del=False, no_dup=False): 861 | start, end = normalise_coords(one_based, start, end) 862 | 863 | for col in samfile.pileup(reference=chrom, start=start, end=end, stepper="nofilter", flag_filter=0, 864 | min_base_quality=0, min_mapping_quality=0, ignore_overlaps=True): 865 | chrom = samfile.getrname(col.tid) 866 | pos = col.pos + 1 if one_based else col.pos 867 | reads = filter_reads(col.pileups, min_mapq, min_baseq, no_del, no_dup) 868 | 869 | if col.pos <= window_offset: 870 | continue # until we get a bit further into the chromosome 871 | 872 | ref_window_start = col.pos - window_offset 873 | ref_window_end = ref_window_start + window_size 874 | ref_window = fafile.fetch(chrom, ref_window_start, 875 | ref_window_end).lower() 876 | 877 | if len(ref_window) == 0: 878 | break # because we've hit the end of the chromosome 879 | 880 | debug(ref_window) 881 | base_counter = Counter(ref_window) 882 | debug(base_counter) 883 | gc_count = base_counter['g'] + base_counter['c'] 884 | debug(gc_count) 885 | gc_percent = int(round(gc_count * 100. 
/ window_size)) 886 | yield {'chrom': chrom, 'pos': pos, 887 | 'reads_all': len(reads), 888 | 'reads_pp': len(pp(reads)), 889 | 'gc': gc_percent} 890 | 891 | 892 | def test_stat_coverage_gc(): 893 | compare_stats_withref(pysamstats.stat_coverage_gc, stat_coverage_gc_refimpl) 894 | 895 | 896 | def test_stat_coverage_gc_uppercase_fasta(): 897 | compare_stats_withref(pysamstats.stat_coverage_gc, stat_coverage_gc_refimpl, 898 | fasta_fn='fixture/ref.upper.fa') 899 | 900 | 901 | pileup_functions = [ 902 | (pysamstats.load_coverage, 0), 903 | (pysamstats.load_coverage_strand, 0), 904 | (pysamstats.load_coverage_ext, 0), 905 | (pysamstats.load_coverage_ext_strand, 0), 906 | (pysamstats.load_variation, 1), 907 | (pysamstats.load_variation_strand, 1), 908 | (pysamstats.load_tlen, 0), 909 | (pysamstats.load_tlen_strand, 0), 910 | (pysamstats.load_mapq, 0), 911 | (pysamstats.load_mapq_strand, 0), 912 | (pysamstats.load_baseq, 0), 913 | (pysamstats.load_baseq_strand, 0), 914 | (pysamstats.load_baseq_ext, 1), 915 | (pysamstats.load_baseq_ext_strand, 1), 916 | (pysamstats.load_coverage_gc, 1), 917 | ] 918 | 919 | def test_pileup_kwargs(): 920 | # check that keyword arguments are being passed through 921 | kwargs = { 922 | 'chrom': 'Pf3D7_01_v3', 923 | 'start': 2000, 924 | 'end': 2100, 925 | 'min_mapq': 1, 926 | 'min_baseq': 1, 927 | 'no_del': True, 928 | 'no_dup': True 929 | } 930 | for f, needs_ref in pileup_functions: 931 | if needs_ref: 932 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), **kwargs) 933 | else: 934 | a = f(Samfile('fixture/test.bam'), **kwargs) 935 | assert isinstance(a, np.ndarray) 936 | assert a.dtype.names is not None 937 | 938 | 939 | def test_pileup_truncate(): 940 | kwargs_notrunc = {'chrom': 'Pf3D7_01_v3', 941 | 'start': 2000, 942 | 'end': 2100, 943 | 'one_based': False, 944 | 'truncate': False} 945 | kwargs_trunc = {'chrom': 'Pf3D7_01_v3', 946 | 'start': 2000, 947 | 'end': 2100, 948 | 'one_based': False, 949 | 'truncate': True} 950 | for f, needs_ref in pileup_functions: 951 | debug(f.__name__) 952 | # test no truncate 953 | if needs_ref: 954 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), 955 | **kwargs_notrunc) 956 | else: 957 | a = f(Samfile('fixture/test.bam'), **kwargs_notrunc) 958 | debug(a[:5]) 959 | eq_(1952, a['pos'][0]) 960 | eq_(2154, a['pos'][-1]) 961 | # test truncate 962 | if needs_ref: 963 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), 964 | **kwargs_trunc) 965 | else: 966 | a = f(Samfile('fixture/test.bam'), **kwargs_trunc) 967 | eq_(2000, a['pos'][0]) 968 | eq_(2099, a['pos'][-1]) 969 | 970 | 971 | def test_pileup_pad(): 972 | kwargs_nopad = {'chrom': 'Pf3D7_01_v3', 973 | 'start': 0, 974 | 'end': 20000, 975 | 'one_based': False, 976 | 'pad': False} 977 | kwargs_pad = {'chrom': 'Pf3D7_01_v3', 978 | 'start': 0, 979 | 'end': 20000, 980 | 'one_based': False, 981 | 'pad': True} 982 | for f, needs_ref in pileup_functions: 983 | debug(f.__name__) 984 | # test no pad 985 | if needs_ref: 986 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), 987 | **kwargs_nopad) 988 | else: 989 | a = f(Samfile('fixture/test.bam'), **kwargs_nopad) 990 | eq_(924, a['pos'][0]) 991 | eq_(9935, a['pos'][-1]) 992 | # test pad 993 | if needs_ref: 994 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), 995 | **kwargs_pad) 996 | else: 997 | a = f(Samfile('fixture/test.bam'), **kwargs_pad) 998 | eq_(0, a['pos'][0]) 999 | eq_(19999, a['pos'][-1]) 1000 | assert np.all(np.diff(a['pos']) == 1) 1001 | 1002 | 1003 | 
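# Illustrative sketch (not part of the original test suite): with pad=True the
# load functions emit a record for every position in the requested range, so a
# complete per-base coverage track can be loaded directly, e.g.:
#
#     a = pysamstats.load_coverage(Samfile('fixture/test.bam'),
#                                  chrom='Pf3D7_01_v3', start=0, end=20000,
#                                  one_based=False, pad=True)
#     assert a['pos'][0] == 0 and a['pos'][-1] == 19999
#     assert np.all(np.diff(a['pos']) == 1)  # positions are contiguous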
def test_pileup_pad_wg(): 1004 | # whole genome 1005 | expected = stat_coverage_refimpl(Samfile('fixture/test.bam')) 1006 | actual = pysamstats.stat_coverage(Samfile('fixture/test.bam')) 1007 | compare_iterators(expected, actual) 1008 | kwargs_nopad = {'pad': False} 1009 | kwargs_pad = {'pad': True} 1010 | for f, needs_ref in pileup_functions: 1011 | debug(f.__name__) 1012 | # test no pad 1013 | if needs_ref: 1014 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), 1015 | **kwargs_nopad) 1016 | else: 1017 | a = f(Samfile('fixture/test.bam'), **kwargs_nopad) 1018 | eq_(sorted(set(a['chrom'])), [b'Pf3D7_01_v3', b'Pf3D7_02_v3']) 1019 | eq_(924, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][0]) 1020 | eq_(9935, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][-1]) 1021 | eq_(926, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][0]) 1022 | eq_(10074, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][-1]) 1023 | # test pad 1024 | if needs_ref: 1025 | a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), 1026 | **kwargs_pad) 1027 | else: 1028 | a = f(Samfile('fixture/test.bam'), **kwargs_pad) 1029 | eq_(sorted(set(a['chrom'])), 1030 | [b'Pf3D7_01_v3', b'Pf3D7_02_v3', b'Pf3D7_03_v3']) 1031 | eq_(0, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][0]) 1032 | eq_(50000, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][-1]) 1033 | eq_(0, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][0]) 1034 | eq_(60000, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][-1]) 1035 | eq_(0, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][0]) 1036 | eq_(70000, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][-1]) 1037 | 1038 | 1039 | def test_pileup_limit(): 1040 | 1041 | for f, needs_ref in pileup_functions: 1042 | debug(f.__name__) 1043 | 1044 | # test with effectively no limit 1045 | kwargs = dict(fields=['reads_all'], max_depth=1000000) 1046 | if needs_ref: 1047 | a = f(Samfile('fixture/deep.bam'), Fastafile('fixture/ref.fa'), 1048 | **kwargs) 1049 | else: 1050 | a = f(Samfile('fixture/deep.bam'), **kwargs) 1051 | eq_(26169, a[70]) 1052 | 1053 | # test with specific limit 1054 | kwargs = dict(fields=['reads_all'], max_depth=12000) 1055 | if needs_ref: 1056 | a = f(Samfile('fixture/deep.bam'), Fastafile('fixture/ref.fa'), 1057 | **kwargs) 1058 | else: 1059 | a = f(Samfile('fixture/deep.bam'), **kwargs) 1060 | eq_(12046, a[70]) # no idea why limit is not exact 1061 | 1062 | # test with default limit 1063 | kwargs = dict(fields=['reads_all']) 1064 | if needs_ref: 1065 | a = f(Samfile('fixture/deep.bam'), Fastafile('fixture/ref.fa'), 1066 | **kwargs) 1067 | else: 1068 | a = f(Samfile('fixture/deep.bam'), **kwargs) 1069 | eq_(8052, a[70]) # no idea why limit is not exact 1070 | 1071 | 1072 | def test_load_cov_long_contig_name(): 1073 | # test that long chrom labels auto handled. 
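    # (i.e., the 'chrom' field dtype is sized to fit the longest reference
    # name, unless the caller supplies an explicit dtype)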
1074 | 1075 | label = 'AS2_scf7180000696055' 1076 | bampath = 'fixture/longcontignames.bam' 1077 | 1078 | x = pysamstats.load_coverage(bampath, chrom=label) 1079 | assert len(label) == x.dtype["chrom"].itemsize 1080 | 1081 | x = pysamstats.load_coverage(Samfile(bampath), chrom=label, dtype={"chrom": "a10"}) 1082 | assert 10 == x.dtype["chrom"].itemsize 1083 | 1084 | 1085 | def test_load_cov_using_steppers(): 1086 | 1087 | # test that expected steppers give different/consistent results 1088 | # this is the only bam file that differs between all/nofilter 1089 | bampath = "fixture/longcontignames.bam" 1090 | seq = 'AS2_scf7180000695891' 1091 | pos = 14311 1092 | steppers = ["all", "nofilter", "samtools"] 1093 | reads_all = [7, 8, 4] 1094 | reads_pp = [4, 5, 4] 1095 | 1096 | for exp_all, exp_pp, step in zip(reads_all, reads_pp, steppers): 1097 | a = pysamstats.load_coverage(Samfile(bampath), chrom=seq, stepper=step, pad=True) 1098 | eq_(exp_all, a[pos]["reads_all"]) 1099 | eq_(exp_pp, a[pos]["reads_pp"]) 1100 | 1101 | with assert_raises(ValueError): 1102 | pysamstats.load_coverage(Samfile(bampath), chrom=seq, stepper="notastepper") 1103 | -------------------------------------------------------------------------------- /pysamstats/test/test_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | import logging 4 | import sys 5 | 6 | from pysam import Samfile 7 | import numpy as np 8 | from os.path import isfile 9 | 10 | 11 | import pysamstats 12 | from pysamstats.config import stats_types, stats_types_withref 13 | 14 | 15 | # no test_prefix so not run during unit tests 16 | def generate_fixtures(): 17 | 18 | bampath = "fixture/test.bam" 19 | fastapath = "fixture/ref.fa" 20 | archive = "fixture/regression.npz" 21 | assert not isfile(archive) 22 | 23 | # simple stats 24 | dat = {} 25 | for q in stats_types: 26 | if q in stats_types_withref: 27 | dat[q] = getattr(pysamstats, "load_" + q)(Samfile(bampath), fafile=fastapath) 28 | else: 29 | dat[q] = getattr(pysamstats, "load_" + q)(Samfile(bampath)) 30 | 31 | np.savez_compressed(archive, **dat) 32 | 33 | 34 | def test_against_fixtures(): 35 | 36 | # load fixtures from numpy array 37 | bampath = "fixture/test.bam" 38 | fastapath = "fixture/ref.fa" 39 | archive = "fixture/regression.npz" 40 | 41 | testset = np.load(archive) 42 | 43 | for q in stats_types: 44 | if q in stats_types_withref: 45 | x = getattr(pysamstats, "load_" + q)(Samfile(bampath), fafile=fastapath) 46 | else: 47 | x = getattr(pysamstats, "load_" + q)(Samfile(bampath)) 48 | 49 | # loop through all fields 50 | for key in testset[q].dtype.names: 51 | expect = testset[q][key] 52 | actual = x[key] 53 | try: 54 | np.testing.assert_array_equal(expect, actual, err_msg=key) 55 | except AssertionError: 56 | print(expect[expect != actual]) 57 | print(actual[expect != actual]) 58 | raise 59 | -------------------------------------------------------------------------------- /pysamstats/test/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | import logging 4 | import sys 5 | from math import sqrt 6 | 7 | 8 | import numpy as np 9 | from numpy import around as round 10 | from nose.tools import eq_, assert_almost_equal 11 | 12 | 13 | from pysam import Samfile, Fastafile 14 | 15 | 16 | logger = logging.getLogger(__name__) 17 | 
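# convenience alias; the reference implementations and comparison helpers
# below call debug() to log diagnostic detail when a comparison fails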
debug = logger.debug 18 | 19 | 20 | # PY2/3 compatibility 21 | PY2 = sys.version_info[0] == 2 22 | if PY2: 23 | # noinspection PyUnresolvedReferences 24 | from itertools import izip_longest 25 | else: 26 | from itertools import zip_longest as izip_longest 27 | 28 | 29 | def compare_iterators(expected, actual): 30 | for e, a in izip_longest(expected, actual, fillvalue=None): 31 | assert e is not None, ('expected value is None', e, a) 32 | assert a is not None, ('actual value is None', e, a) 33 | for k, v in e.items(): 34 | try: 35 | if isinstance(v, float): 36 | assert_almost_equal(v, a[k]) 37 | else: 38 | eq_(v, a[k]) 39 | except: 40 | debug('mismatch %r, expected %r, found %r' % (k, v, a[k])) 41 | debug('expected: %r' % sorted(e.items())) 42 | debug('actual: %r' % sorted(a.items())) 43 | raise 44 | for k in a: # check no unexpected fields 45 | try: 46 | assert k in e 47 | except: 48 | debug('missing %r' % k) 49 | debug('expected: %r' % sorted(e.items())) 50 | debug('actual: %r' % sorted(a.items())) 51 | raise 52 | 53 | 54 | def normalise_coords(one_based, start, end): 55 | """Normalise start and end coordinates. 56 | 57 | Parameters 58 | ---------- 59 | one_based : bool 60 | start : int 61 | end : int 62 | 63 | Returns 64 | ------- 65 | start : int 66 | end : int 67 | 68 | """ 69 | if one_based: 70 | start = start - 1 if start is not None else None 71 | end = end - 1 if end is not None else None 72 | return start, end 73 | 74 | 75 | def fwd(reads): 76 | return [read for read in reads if not read.alignment.is_reverse] 77 | 78 | 79 | def rev(reads): 80 | return [read for read in reads if read.alignment.is_reverse] 81 | 82 | 83 | def pp(reads): 84 | return [read for read in reads if read.alignment.is_proper_pair] 85 | 86 | 87 | def rms(a): 88 | if a: 89 | return int(round(sqrt(np.mean(np.power(a, 2))))) 90 | else: 91 | return 0 92 | 93 | 94 | def mean(a): 95 | if a: 96 | return int(round(np.mean(a))) 97 | else: 98 | return 0 99 | 100 | 101 | def std(a): 102 | if len(a) >= 2: 103 | std = np.std(a, ddof=1) 104 | if np.isnan(std): 105 | return 0 106 | return int(round(std)) 107 | else: 108 | return 0 109 | 110 | 111 | def vmax(a): 112 | if a: 113 | return max(a) 114 | else: 115 | return 0 116 | 117 | 118 | def rootmean(sqsum, count): 119 | if count > 0: 120 | return int(round(sqrt(sqsum / count))) 121 | else: 122 | return 0 123 | -------------------------------------------------------------------------------- /pysamstats/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, print_function, division 3 | from operator import itemgetter 4 | from pysam import AlignmentFile 5 | 6 | 7 | def flatten(recs, *fields): 8 | """Convert a record (dict) iterator to a row (tuple) iterator. 
9 | 10 | Parameters 11 | ---------- 12 | 13 | recs : iterator of dicts 14 | records generator 15 | fields : list of strings 16 | names of fields to select 17 | 18 | Returns 19 | ------- 20 | 21 | rows : iterator of tuples 22 | rows generator 23 | 24 | """ 25 | 26 | getter = itemgetter(*fields) 27 | it = (getter(rec) for rec in recs) 28 | return it 29 | 30 | 31 | def load_stats(statfun, default_dtype, user_dtype, user_fields, **kwargs): 32 | 33 | import numpy as np 34 | 35 | # determine fields to load 36 | default_fields = [t[0] for t in default_dtype] 37 | if user_fields is None: 38 | fields = default_fields 39 | else: 40 | fields = user_fields 41 | if any([f not in default_fields for f in fields]): 42 | raise ValueError('invalid fields: %r' % fields) 43 | 44 | # determine dtype 45 | dtype = dict(default_dtype) 46 | 47 | # check if contig label dtype is appropriate length 48 | max_seqid_len = determine_max_seqid(kwargs["alignmentfile"]) 49 | dtype["chrom"] = "a{0}".format(max_seqid_len) 50 | 51 | if user_dtype is not None: 52 | dtype.update(dict(user_dtype)) 53 | 54 | # handle single field requested 55 | if len(fields) == 1: 56 | dtype = dtype[fields[0]] 57 | else: 58 | dtype = [(f, dtype[f]) for f in fields] 59 | 60 | # setup record generator 61 | recs = statfun(**kwargs) 62 | 63 | # flatten records 64 | it = flatten(recs, *fields) 65 | 66 | # load into a Numpy array 67 | a = np.fromiter(it, dtype=dtype) 68 | 69 | # view as recarray for convenience 70 | if len(fields) > 1: 71 | a = a.view(np.recarray) 72 | 73 | return a 74 | 75 | 76 | def determine_max_seqid(alignmentfile): 77 | 78 | if isinstance(alignmentfile, str): 79 | alignmentfile = AlignmentFile(alignmentfile) 80 | 81 | return max([len(x) for x in alignmentfile.references]) 82 | -------------------------------------------------------------------------------- /release.txt: -------------------------------------------------------------------------------- 1 | version=`grep __version__ pysamstats/__init__.py | sed -e "s/__version__[ ]=[ ]'\(.*\)'/\1/"` 2 | echo $version 3 | python setup.py build_ext --inplace 4 | nosetests -v 5 | git commit -a -m v$version 6 | git push 7 | git tag -a v$version -m v$version 8 | git push --tags 9 | python setup.py register sdist upload 10 | # update readme with command line help 11 | # increment version and add .dev0 12 | git commit -a -m 'increment version'; git push 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pysam 2 | numpy 3 | nose 4 | tables 5 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | Cython==0.29.23 2 | numpy==1.21.0 3 | pysam==0.16.0.1 4 | nose==1.3.7 5 | tables==3.6.1 6 | -------------------------------------------------------------------------------- /sandbox.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "sandbox" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "code", 12 | "collapsed": false, 13 | "input": [ 14 | "import pysam\n", 15 | "import pysamstats\n", 16 | "from itertools import islice" 17 | ], 18 | "language": "python", 19 | "metadata": {}, 20 | "outputs": [], 21 | "prompt_number": 1 22 | }, 23 | { 24 | "cell_type": "code", 25 | "collapsed": false, 26 | 
"input": [ 27 | "kwargs = {'chrom': 'Pf3D7_01_v3',\n", 28 | "# 'start': 0,\n", 29 | "# 'end': 10000,\n", 30 | " 'one_based': False}\n", 31 | "it = pysamstats.stat_coverage_binned(pysam.Samfile('fixture/test.bam'), pysam.Fastafile('fixture/ref.fa'), **kwargs)\n", 32 | "for rec in islice(it, 10):\n", 33 | " print rec" 34 | ], 35 | "language": "python", 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "output_type": "stream", 40 | "stream": "stdout", 41 | "text": [ 42 | "{'gc': 47, 'reads_pp': 428, 'chrom': 'Pf3D7_01_v3', 'pos': 150, 'reads_all': 538}\n", 43 | "{'gc': 30, 'reads_pp': 620, 'chrom': 'Pf3D7_01_v3', 'pos': 450, 'reads_all': 665}\n", 44 | "{'gc': 28, 'reads_pp': 667, 'chrom': 'Pf3D7_01_v3', 'pos': 750, 'reads_all': 703}\n", 45 | "{'gc': 27, 'reads_pp': 672, 'chrom': 'Pf3D7_01_v3', 'pos': 1050, 'reads_all': 726}\n", 46 | "{'gc': 30, 'reads_pp': 711, 'chrom': 'Pf3D7_01_v3', 'pos': 1350, 'reads_all': 728}\n", 47 | "{'gc': 32, 'reads_pp': 725, 'chrom': 'Pf3D7_01_v3', 'pos': 1650, 'reads_all': 735}\n", 48 | "{'gc': 29, 'reads_pp': 846, 'chrom': 'Pf3D7_01_v3', 'pos': 1950, 'reads_all': 856}\n", 49 | "{'gc': 28, 'reads_pp': 774, 'chrom': 'Pf3D7_01_v3', 'pos': 2250, 'reads_all': 782}\n", 50 | "{'gc': 27, 'reads_pp': 764, 'chrom': 'Pf3D7_01_v3', 'pos': 2550, 'reads_all': 769}\n", 51 | "{'gc': 31, 'reads_pp': 793, 'chrom': 'Pf3D7_01_v3', 'pos': 2850, 'reads_all': 798}\n" 52 | ] 53 | } 54 | ], 55 | "prompt_number": 2 56 | }, 57 | { 58 | "cell_type": "code", 59 | "collapsed": false, 60 | "input": [ 61 | "%timeit pysamstats.count_reads(pysam.Samfile('fixture/test.bam'), chrom='Pf3D7_01_v3')" 62 | ], 63 | "language": "python", 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "output_type": "stream", 68 | "stream": "stdout", 69 | "text": [ 70 | "10 loops, best of 3: 18.8 ms per loop\n" 71 | ] 72 | } 73 | ], 74 | "prompt_number": 3 75 | }, 76 | { 77 | "cell_type": "code", 78 | "collapsed": false, 79 | "input": [ 80 | "import matplotlib.pyplot as plt\n", 81 | "a = pysamstats.load_coverage(pysam.Samfile('fixture/test.bam'))\n", 82 | "plt.plot(a.pos, a.reads_all)\n", 83 | "plt.show()" 84 | ], 85 | "language": "python", 86 | "metadata": {}, 87 | "outputs": [], 88 | "prompt_number": 15 89 | }, 90 | { 91 | "cell_type": "code", 92 | "collapsed": false, 93 | "input": [ 94 | "a" 95 | ], 96 | "language": "python", 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "output_type": "pyout", 101 | "prompt_number": 14, 102 | "text": [ 103 | "rec.array([('Pf3D7_01_v3', 0, 1, 1), ('Pf3D7_01_v3', 1, 1, 1),\n", 104 | " ('Pf3D7_01_v3', 2, 6, 5), ..., ('Pf3D7_01_v3', 10072, 6, 6),\n", 105 | " ('Pf3D7_01_v3', 10073, 5, 5), ('Pf3D7_01_v3', 10074, 2, 2)], \n", 106 | " dtype=[('chrom', 'S12'), ('pos', ' example.coverage.txt 56 | pysamstats --type coverage --chromosome Pf3D7_v3_01 --start 100000 --end 200000 example.bam > example.coverage.txt 57 | 58 | Version: {version} (pysam {pysamversion}) 59 | 60 | """.format(version=pysamstats.__version__, pysamversion=pysam.__version__) 61 | 62 | OptionParser.format_epilog = lambda self, formatter: self.epilog 63 | parser = OptionParser(usage=usage, description=description, epilog=epilog) 64 | 65 | parser.add_option( 66 | '-t', '--type', dest='type', default='coverage', 67 | help='Type of statistics to print, one of: %s.' 
68 | 
69 | parser.add_option(
70 |     '-c', '--chromosome', dest='chromosome', default=None,
71 |     help='Chromosome name.')
72 | 
73 | parser.add_option(
74 |     '-s', '--start', dest='start', type='int', default=None,
75 |     help='Start position (1-based).')
76 | 
77 | parser.add_option(
78 |     '-e', '--end', dest='end', type='int', default=None,
79 |     help='End position (1-based).')
80 | 
81 | parser.add_option(
82 |     '-z', '--zero-based', dest='zero_based', action='store_true', default=False,
83 |     help='Use zero-based coordinates (default is false, i.e., use one-based coords).')
84 | 
85 | parser.add_option(
86 |     '-u', '--truncate', dest='truncate', action='store_true', default=False,
87 |     help='Truncate pileup-based stats so no records are emitted outside the specified range.')
88 | 
89 | parser.add_option(
90 |     '-S', '--stepper', dest='stepper', action='store', default='all',
91 |     help='Stepper to provide to underlying pysam call. Options are: '
92 |          '"all" (default): all reads are returned, except where flags BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, '
93 |          'or BAM_FDUP are set; "nofilter" applies no filter to returned reads; '
94 |          '"samtools": filter & read processing as in _csamtools_ pileup. This requires a fasta file. '
95 |          'For complete details see the pysam documentation.')
96 | 
97 | parser.add_option(
98 |     '-d', '--pad', dest='pad', action='store_true', default=False,
99 |     help='Pad pileup-based stats so a record is emitted for every position (default is only '
100 |          'covered positions).')
101 | 
102 | parser.add_option(
103 |     '-D', '--max-depth', dest='max_depth', type=int, default=8000,
104 |     help='Maximum read depth permitted in pileup-based statistics. The default limit is 8000.')
105 | 
106 | parser.add_option(
107 |     '-f', '--fasta', dest='fasta', default=None,
108 |     help='Reference sequence file, only required for some statistics.')
109 | 
110 | parser.add_option(
111 |     '-o', '--omit-header', dest='omit_header', default=False, action='store_true',
112 |     help='Omit header row from output.')
113 | 
114 | parser.add_option(
115 |     '-p', '--progress', dest='progress', type='int', metavar='N', default=None,
116 |     help='Report progress every N rows.')
117 | 
118 | parser.add_option(
119 |     '--window-size', dest='window_size', type='int', metavar='N', default=300,
120 |     help='Size of window for binned statistics (default is 300).')
121 | 
122 | parser.add_option(
123 |     '--window-offset', dest='window_offset', type=int, default=None, metavar='N',
124 |     help='Window offset to use for deciding which genome position to report binned statistics '
125 |          'against. The default is 150, i.e., the middle of a 300bp window.')
126 | 
127 | parser.add_option(
128 |     '--format', dest='format', default='tsv',
129 |     help='Output format, one of {tsv, csv, hdf5} (defaults to tsv). N.B., hdf5 requires '
130 |          'PyTables to be installed.')
131 | 
132 | parser.add_option(
133 |     '--output', dest='output',
134 |     help='Path to output file. If not provided, write to stdout.')
135 | 
136 | parser.add_option(
137 |     '--fields', dest='fields', default=None,
138 |     help='Comma-separated list of fields to output (defaults to all fields).')
139 | 
140 | parser.add_option(
141 |     '--hdf5-group', dest='hdf5_group', default='/',
142 |     help='Name of HDF5 group to write to (defaults to the root group).')
143 | 
144 | parser.add_option(
145 |     '--hdf5-dataset', dest='hdf5_dataset', default='data',
146 |     help='Name of HDF5 dataset to create (defaults to "data").')
147 | 
148 | parser.add_option(
149 |     '--hdf5-complib', dest='hdf5_complib', default='zlib',
150 |     help='HDF5 compression library (defaults to zlib).')
151 | 
152 | parser.add_option(
153 |     '--hdf5-complevel', dest='hdf5_complevel', type=int, default=1,
154 |     help='HDF5 compression level (defaults to 1).')
155 | 
156 | parser.add_option(
157 |     '--hdf5-chunksize', dest='hdf5_chunksize', type=int, default=2**20,
158 |     help='Size of chunks in number of bytes (defaults to 2**20).')
159 | 
160 | parser.add_option(
161 |     '--min-mapq', dest='min_mapq', type=int, default=0,
162 |     help='Only reads with mapping quality equal to or greater than this value will be counted '
163 |          '(0 by default).')
164 | 
165 | parser.add_option(
166 |     '--min-baseq', dest='min_baseq', type=int, default=0,
167 |     help='Only reads with base quality equal to or greater than this value will be counted '
168 |          '(0 by default). Only applies to pileup-based statistics.')
169 | 
170 | parser.add_option(
171 |     '--no-dup', dest='no_dup', default=False, action='store_true',
172 |     help="Don't count reads flagged as duplicate.")
173 | 
174 | parser.add_option(
175 |     '--no-del', dest='no_del', default=False, action='store_true',
176 |     help="Don't count reads aligned with a deletion at the given position. Only applies to "
177 |          "pileup-based statistics.")
178 | 
179 | options, args = parser.parse_args()
180 | 
181 | if len(args) != 1:
182 |     parser.error('missing SAM or BAM file operand\n\nTry "pysamstats --help" for more '
183 |                  'information.')
184 | 
185 | samfile = args[0]
186 | one_based = not options.zero_based
187 | write_header = not options.omit_header
188 | if options.fields:
189 |     fields = options.fields.split(',')
190 | else:
191 |     fields = None
192 | 
193 | try:
194 | 
195 |     if options.type not in stats_types:
196 |         parser.error('unsupported statistics type: "%s"\nTry one of %s or '
197 |                      '"pysamstats --help" for more information.'
198 |                      % (options.type, stats_types))
199 | 
200 |     elif options.stepper not in stepper_types:
201 |         parser.error('unsupported stepper type: "%s"\nMust be one of %s or '
202 |                      '"pysamstats --help" for more information.'
203 | % (options.stepper, stepper_types)) 204 | 205 | elif options.type in stats_types_withref \ 206 | and options.fasta is None: 207 | parser.error('missing --fasta option\n\nTry "pysamstats --help"' 208 | ' for more information.') 209 | 210 | else: 211 | 212 | # setup common parameters 213 | kwargs = dict( 214 | chrom=options.chromosome, 215 | start=options.start, 216 | end=options.end, 217 | one_based=one_based, 218 | window_size=options.window_size, 219 | window_offset=options.window_offset, 220 | min_mapq=options.min_mapq, 221 | no_dup=options.no_dup 222 | ) 223 | # some options only make sense if not performing binned analysis 224 | if not options.type.endswith('_binned'): 225 | kwargs['truncate'] = options.truncate 226 | kwargs['pad'] = options.pad 227 | kwargs['max_depth'] = options.max_depth 228 | kwargs['min_baseq'] = options.min_baseq 229 | kwargs['no_del'] = options.no_del 230 | kwargs['stepper'] = options.stepper 231 | 232 | if options.format.lower() in ['tsv', 'csv']: 233 | 234 | # setup 235 | dialect = {'tsv': 'excel-tab', 'csv': 'excel'}[options.format] 236 | if options.output is None: 237 | output = sys.stdout 238 | needs_closing = False 239 | else: 240 | output = open(options.output, 'w') 241 | needs_closing = True 242 | 243 | try: 244 | write_csv( 245 | options.type, 246 | output, 247 | samfile, 248 | fafile=options.fasta, 249 | write_header=write_header, 250 | dialect=dialect, 251 | progress=options.progress, 252 | fields=fields, 253 | **kwargs 254 | ) 255 | finally: 256 | if needs_closing: 257 | output.close() 258 | 259 | elif options.format.lower() == 'hdf5': 260 | 261 | assert options.output is not None, '--output must be specified' 262 | 263 | write_hdf5( 264 | options.type, 265 | options.output, 266 | samfile, 267 | fafile=options.fasta, 268 | progress=options.progress, 269 | hdf5_group=options.hdf5_group, 270 | hdf5_dataset=options.hdf5_dataset, 271 | hdf5_complib=options.hdf5_complib, 272 | hdf5_complevel=options.hdf5_complevel, 273 | hdf5_chunksize=options.hdf5_chunksize, 274 | fields=fields, 275 | **kwargs 276 | ) 277 | 278 | except IOError as e: 279 | if e.errno == errno.EPIPE: 280 | pass # ignore broken pipe 281 | else: 282 | raise 283 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension, find_packages 2 | 3 | 4 | # require pysam is pre-installed 5 | try: 6 | import pysam 7 | except ImportError: 8 | raise Exception('pysam not found; please install pysam first') 9 | from distutils.version import LooseVersion 10 | required_pysam_version = '0.15' 11 | if LooseVersion(pysam.__version__) < LooseVersion(required_pysam_version): 12 | raise Exception('pysam version >= %s is required; found %s' % 13 | (required_pysam_version, pysam.__version__)) 14 | 15 | 16 | def get_version(): 17 | """Extract version number from source file.""" 18 | from ast import literal_eval 19 | with open('pysamstats/__init__.py') as f: 20 | for line in f: 21 | if line.startswith('__version__'): 22 | return literal_eval(line.partition('=')[2].lstrip()) 23 | raise ValueError("__version__ not found") 24 | 25 | 26 | try: 27 | from Cython.Build import cythonize 28 | print('[pysamstats] build with Cython') 29 | extensions = cythonize([ 30 | Extension('pysamstats.opt', 31 | sources=['pysamstats/opt.pyx'], 32 | include_dirs=pysam.get_include(), 33 | define_macros=pysam.get_defines())] 34 | ) 35 | 36 | except ImportError: 37 | 
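    # Cython is not available; fall back to compiling the pre-generated C
    # source (pysamstats/opt.c) shipped with the source distribution.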
print('[pysamstats] build from C')
38 |     extensions = [Extension('pysamstats.opt',
39 |                             sources=['pysamstats/opt.c'],
40 |                             include_dirs=pysam.get_include(),
41 |                             define_macros=pysam.get_defines())]
42 | 
43 | 
44 | setup(
45 |     name='pysamstats',
46 |     version=get_version(),
47 |     author='Alistair Miles',
48 |     author_email='alimanfoo@googlemail.com',
49 |     url='https://github.com/alimanfoo/pysamstats',
50 |     license='MIT License',
51 |     description='A Python utility for calculating statistics against genome '
52 |                 'position based on sequence alignments from a SAM, '
53 |                 'BAM or CRAM file.',
54 |     scripts=['scripts/pysamstats'],
55 |     package_dir={'': '.'},
56 |     install_requires=[
57 |         "pysam (<0.16)",
58 |         "numpy",
59 |     ],
60 |     packages=find_packages(),
61 |     classifiers=[
62 |         'Intended Audience :: Developers',
63 |         'License :: OSI Approved :: MIT License',
64 |         'Programming Language :: Python :: 2.7',
65 |         'Programming Language :: Python :: 3.5',
66 |         'Programming Language :: Python :: 3.6',
67 |         'Programming Language :: Python :: 3.7',
68 |         'Topic :: Software Development :: Libraries :: Python Modules'
69 |     ],
70 |     ext_modules=extensions,
71 | )
72 | 
--------------------------------------------------------------------------------