├── .github └── workflows │ └── tests.yml ├── .gitignore ├── LICENSE ├── README.rst ├── deploy-gh-pages.sh ├── docs ├── Makefile ├── conf.py ├── index.rst └── varlens.rst ├── lint.sh ├── setup.py ├── test.sh ├── tests ├── __init__.py ├── data │ ├── CELSR1 │ │ ├── bams │ │ │ ├── bam_0.bam │ │ │ ├── bam_0.bam.bai │ │ │ ├── bam_1.bam │ │ │ ├── bam_1.bam.bai │ │ │ ├── bam_10.bam │ │ │ ├── bam_10.bam.bai │ │ │ ├── bam_11.bam │ │ │ ├── bam_11.bam.bai │ │ │ ├── bam_12.bam │ │ │ ├── bam_12.bam.bai │ │ │ ├── bam_13.bam │ │ │ ├── bam_13.bam.bai │ │ │ ├── bam_14.bam │ │ │ ├── bam_14.bam.bai │ │ │ ├── bam_15.bam │ │ │ ├── bam_15.bam.bai │ │ │ ├── bam_16.bam │ │ │ ├── bam_16.bam.bai │ │ │ ├── bam_17.bam │ │ │ ├── bam_17.bam.bai │ │ │ ├── bam_18.bam │ │ │ ├── bam_18.bam.bai │ │ │ ├── bam_19.bam │ │ │ ├── bam_19.bam.bai │ │ │ ├── bam_2.bam │ │ │ ├── bam_2.bam.bai │ │ │ ├── bam_20.bam │ │ │ ├── bam_20.bam.bai │ │ │ ├── bam_21.bam │ │ │ ├── bam_21.bam.bai │ │ │ ├── bam_22.bam │ │ │ ├── bam_22.bam.bai │ │ │ ├── bam_23.bam │ │ │ ├── bam_23.bam.bai │ │ │ ├── bam_3.bam │ │ │ ├── bam_3.bam.bai │ │ │ ├── bam_4.bam │ │ │ ├── bam_4.bam.bai │ │ │ ├── bam_5.bam │ │ │ ├── bam_5.bam.bai │ │ │ ├── bam_6.bam │ │ │ ├── bam_6.bam.bai │ │ │ ├── bam_7.bam │ │ │ ├── bam_7.bam.bai │ │ │ ├── bam_8.bam │ │ │ ├── bam_8.bam.bai │ │ │ ├── bam_9.bam │ │ │ └── bam_9.bam.bai │ │ └── vcfs │ │ │ ├── vcf_1.vcf │ │ │ ├── vcf_10.vcf │ │ │ ├── vcf_11.vcf │ │ │ ├── vcf_12.vcf │ │ │ ├── vcf_13.vcf │ │ │ ├── vcf_14.vcf │ │ │ ├── vcf_2.vcf │ │ │ ├── vcf_3.vcf │ │ │ ├── vcf_4.vcf │ │ │ ├── vcf_5.vcf │ │ │ ├── vcf_6.vcf │ │ │ ├── vcf_7.vcf │ │ │ ├── vcf_8.vcf │ │ │ └── vcf_9.vcf │ ├── chr22.no_line_wrap.fa │ ├── gatk_mini_bundle_extract.bam │ ├── gatk_mini_bundle_extract.bam.bai │ ├── rna_chr17_41244936.bam │ └── rna_chr17_41244936.bam.bai ├── test_allele_support.py ├── test_read_evidence.py ├── test_reads.py └── test_variants.py └── varlens ├── __init__.py ├── commands ├── __init__.py ├── allele_support.py ├── reads.py ├── util.py └── variants.py ├── loci_util.py ├── locus.py ├── mhc_binding.py ├── read_evidence ├── __init__.py ├── pileup.py ├── pileup_collection.py ├── pileup_element.py └── util.py ├── read_source.py ├── reads_util.py ├── sequence_context.py ├── support.py ├── util.py ├── variant_includes.py ├── variants_util.py └── version.py /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | name: Tests 4 | on: [push, pull_request] 5 | 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: true 11 | matrix: 12 | python-version: ["3.9", "3.10", "3.11"] 13 | 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@v3 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v3 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | cache: "pip" 22 | - name: Create virtual environment and install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | python -m pip install uv 26 | uv venv 27 | source .venv/bin/activate 28 | uv pip install pytest pytest-cov coveralls pylint ruff 29 | uv pip install -r requirements.txt 30 | uv pip install . 
31 | - name: Install Ensembl data 32 | run: | 33 | echo "Before installing Ensembl releases" && df -h 34 | pyensembl install --release 75 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh37.75/ 35 | pyensembl install --release 77 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.77/ 36 | pyensembl install --release 93 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.93/ 37 | pyensembl install --release 93 --species mouse --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCm38.93/ 38 | - name: Run linting script and unit tests 39 | run: | 40 | source .venv/bin/activate 41 | ./lint.sh 42 | ./test.sh 43 | - name: Publish coverage to Coveralls 44 | uses: coverallsapp/github-action@v2.2.3 45 | with: 46 | github-token: ${{ secrets.GITHUB_TOKEN }} 47 | flag-name: coverage 48 | fail-on-error: false 49 | parallel: true 50 | - name: Upload docs to GitHub Pages 51 | run: | 52 | ./deploy-gh-pages.sh 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 203 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://github.com/openvax/varlens/actions/workflows/tests.yml/badge.svg 2 | :target: https://github.com/openvax/varlens/actions/workflows/tests.yml 3 | :alt: Tests 4 | 5 | varlens 6 | ====================== 7 | 8 | A collection of Python tools for working with genomic variants and 9 | next-generation sequencing reads. Not particularly fast for large datasets. The 10 | emphasis is on extracting what you need from BAMs and VCFs into a CSV file for 11 | further analysis. 12 | 13 | Built on `varcode `_ and `pysam `_. 14 | 15 | varlens-variants 16 | Combine, annotate, and filter variants from VCF or CSV files. Available 17 | annotations include genes, variant effects, surrounding sequence context, 18 | counts of supporting reads from specified BAM files, and MHC I binding 19 | affinity prediction of mutant peptides. 20 | 21 | varlens-reads 22 | Display, filter, and copy reads from a SAM/BAM file. Partial replacement for ``samtools view``. 23 | 24 | varlens-allele-support 25 | Count reads supporting each allele at specified sites in BAM files. 26 | 27 | 28 | Installation 29 | ------------- 30 | 31 | To install from `PyPI `_: 32 | 33 | :: 34 | 35 | pip install varlens 36 | 37 | Or from a git checkout: 38 | 39 | :: 40 | 41 | pip install . 42 | 43 | To run the tests: 44 | 45 | :: 46 | 47 | nosetests . 48 | 49 | To build the documentation (just this README plus the commandline tool help): 50 | 51 | :: 52 | 53 | pip install -e . 54 | pip install Sphinx 55 | cd docs 56 | make clean setup rst html 57 | 58 | The docs will be written to the ``_build/html`` directory. 59 | 60 | 61 | varlens-variants 62 | ---------------------- 63 | 64 | Given variants from one or more VCF or CSV files, apply filters, add additional 65 | columns, and output to CSV. 66 | 67 | Currently we can only output to CSV, not VCF. 68 | 69 | A number of useful annotations can be added for each variant by specifying 70 | options of the form '--include-XXX', e.g. '--include-gene'. See detailed help 71 | (run with -h). 72 | 73 | Examples 74 | ````````````` 75 | 76 | Print basic info for the variants found in two VCF files. 
Note that variants 77 | found in both files are listed in one row, and the 'sources' column lists 78 | the files each variant was found in: 79 | 80 | :: 81 | 82 | $ varlens-variants test/data/CELSR1/vcfs/vcf_1.vcf test/data/CELSR1/vcfs/vcf_2.vcf 83 | 84 | genome,contig,interbase_start,interbase_end,ref,alt,sources 85 | GRCh37,22,21829554,21829555,T,G,1.vcf 86 | GRCh37,22,46931059,46931060,A,C,1.vcf 87 | GRCh37,22,46931061,46931062,G,A,1.vcf 2.vcf 88 | GRCh37,22,50636217,50636218,A,C,1.vcf 89 | GRCh37,22,50875932,50875933,A,C,1.vcf 90 | GRCh37,22,45309892,45309893,T,G,2.vcf 91 | 92 | Same as the above but include additional columns giving varcode variant effect 93 | annotations and the genes the variants overlap, and write to a file: 94 | 95 | :: 96 | 97 | $ varlens-variants test/data/CELSR1/vcfs/vcf_1.vcf test/data/CELSR1/vcfs/vcf_2.vcf \ 98 | --include-effect \ 99 | --include-gene \ 100 | --out /tmp/result.csv 101 | 102 | Wrote: /tmp/result.csv 103 | 104 | $ cat /tmp/result.csv 105 | 106 | genome,contig,interbase_start,interbase_end,ref,alt,sources,effect,gene 107 | GRCh37,22,21829554,21829555,T,G,1.vcf,non-coding-transcript,PI4KAP2 108 | GRCh37,22,46931059,46931060,A,C,1.vcf,p.S670A,CELSR1 109 | GRCh37,22,46931061,46931062,G,A,1.vcf 2.vcf,p.S669F,CELSR1 110 | GRCh37,22,50636217,50636218,A,C,1.vcf,intronic,TRABD 111 | GRCh37,22,50875932,50875933,A,C,1.vcf,splice-acceptor,PPP6R2 112 | GRCh37,22,45309892,45309893,T,G,2.vcf,p.T214P,PHF21B 113 | 114 | Print counts for number of reads supporting reference/variant/other alleles 115 | from the specified BAM, counting only reads with mapping quality >= 10: 116 | 117 | :: 118 | 119 | $ varlens-variants test/data/CELSR1/vcfs/vcf_1.vcf \ 120 | --include-read-evidence \ 121 | --reads test/data/CELSR1/bams/bam_1.bam \ 122 | --min-mapping-quality 10 123 | 124 | genome,contig,interbase_start,interbase_end,ref,alt,sources,num_alt,num_ref,total_depth 125 | GRCh37,22,21829554,21829555,T,G,vcf_1.vcf,0,0,0 126 | GRCh37,22,46931059,46931060,A,C,vcf_1.vcf,0,216,320 127 | GRCh37,22,46931061,46931062,G,A,vcf_1.vcf,0,321,321 128 | GRCh37,22,50636217,50636218,A,C,vcf_1.vcf,0,0,0 129 | GRCh37,22,50875932,50875933,A,C,vcf_1.vcf,0,0,0 130 | 131 | 132 | varlens-reads 133 | ---------------------- 134 | 135 | Filter reads from one or more BAMs and output a CSV or a new BAM. 136 | 137 | Loci and VCF files may be specified, in which case reads are filtered to 138 | overlap the specified loci or variants. 139 | 140 | Examples 141 | ````````````` 142 | 143 | Print basic fields for the reads in a BAM: 144 | 145 | :: 146 | 147 | $ varlens-reads test/data/CELSR1/bams/bam_0.bam 148 | 149 | query_name,reference_start,reference_end,cigarstring 150 | HISEQ:142:C5822ANXX:3:2116:16538:101199,46929962,46930062,100M 151 | HISEQ:142:C5822ANXX:3:1106:18985:32932,46929964,46930064,100M 152 | HISEQ:142:C5822ANXX:3:2201:21091:67220,46929966,46930066,100M 153 | HISEQ:142:C5822ANXX:4:1304:5363:12786,46929966,46930066,100M 154 | HISEQ:142:C5822ANXX:4:1104:9008:85114,46929969,46930069,100M 155 | HISEQ:142:C5822ANXX:3:2304:9921:94828,46929970,46930070,100M 156 | HISEQ:142:C5822ANXX:3:2211:6266:74633,46929973,46930073,100M 157 | HISEQ:142:C5822ANXX:3:1305:8982:42729,46929974,46930074,100M 158 | HISEQ:142:C5822ANXX:4:2316:5630:7371,46929978,46930078,100M 159 | ... 
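The column names in this output are the standard pysam alignment attributes
(``query_name``, ``reference_start``, ``reference_end``, ``cigarstring``). If
you want the same information programmatically rather than as CSV, it can be
read directly with pysam. A minimal sketch using only the pysam API (not a
varlens function), pointed at one of the BAMs in this repository's test data::

    import pysam

    # Open the indexed BAM and iterate over its alignments.
    with pysam.AlignmentFile("tests/data/CELSR1/bams/bam_0.bam") as bam:
        for read in bam.fetch():
            # These attributes correspond to the CSV columns shown above.
            print(read.query_name, read.reference_start,
                  read.reference_end, read.cigarstring)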
160 | 161 | Same as above but filter only to reads aligned on the (-) strand, write to a 162 | file instead of stdout, and also include the mapping quality and sequenced 163 | bases in the output: 164 | 165 | :: 166 | 167 | $ varlens-reads test/data/CELSR1/bams/bam_0.bam \ 168 | --is-reverse \ 169 | --field mapping_quality query_alignment_sequence \ 170 | --out /tmp/result.csv 171 | 172 | Wrote: /tmp/result.csv 173 | 174 | $ head /tmp/result.csv 175 | 176 | query_name,reference_start,reference_end,cigarstring,mapping_quality,query_alignment_sequence 177 | HISEQ:142:C5822ANXX:3:2116:16538:101199,46929962,46930062,100M,60,CATGATCTGGGCATTAGGGCCTTCATCAGGGTCGTTAGCACGAATCTTTGCCACCACCGACCCCACTGGGTTGTTCTCCTCAACAAACAGCTCCAGTTCG 178 | HISEQ:142:C5822ANXX:3:1106:18985:32932,46929964,46930064,100M,60,TGATCTGGGCATTAGGGCCTTCATCAGGGTCGTTAGCACGAATCTTTGCCACCACCGACCCCACTGGGTTGTTCTCCTCAACAAACAGCTCCAGTTCGTC 179 | HISEQ:142:C5822ANXX:4:1104:9008:85114,46929969,46930069,100M,60,TGGGCATTAGGGCCTTCATCAGGGTCGTTAGCACGAATCTTTGCCACCACCGACCCCACTGGGTTGTTCTCCTCAACAAACAGCTCCAGTTCGTCCTTCT 180 | HISEQ:142:C5822ANXX:4:1202:18451:91174,46929979,46930079,100M,60,GGCCTTCATCAGGGTCGTTAGCACGAATCTTTGCCACCACCGACCCCACTGGGTTGTTCTCCTCAACAAACAGCTCCAGTTCGTCCTTCTCAAACATGGG 181 | HISEQ:142:C5822ANXX:3:1211:18522:54773,46929987,46930087,100M,60,TCAGGGTCGTTAGCACGAATCTTTGCCACCACCGACCCCACTGGGTTGTTCTCCTCAACAAACAGCTCCAGTTCGTCCTTCTCAAACATGGGGGCATTGT 182 | HISEQ:142:C5822ANXX:3:2114:19455:45093,46929987,46930087,100M,60,TCAGGGTCGTTAGCACGAATCTTTGCCACCGCCGACCCCACTGGGTTGTTCTCCTCAACAAACAGCTCCAGTTCGTCCTTCTCAAACATGGGGGCATTGT 183 | HISEQ:142:C5822ANXX:4:2115:9153:21593,46929994,46930094,100M,60,CGTTAGCACGAATCTTTGCCACCACCGACCCCACTGGGTTGTTCTCCTCAACAAACAGCTCCAGTTCGTCCTTCTCAAACATGGGGGCATTGTCATTAAT 184 | HISEQ:142:C5822ANXX:4:1212:15644:87227,46929995,46930095,100M,60,GTTAGCACGTATGTTTGCCACCACCGACCCCACTGAGTTGTTCTCCTCAACAAACAGCTCCAGTTCGTGCTTCTCAAACATGGGGGCAGTGTCATTAATG 185 | HISEQ:142:C5822ANXX:3:1103:4717:26369,46929997,46930097,100M,60,TAGCACGAATCTTTGCCACCACCGACCCCACTGGGTTGTTCTCCTCAACAAACAGCTCCAGTTCGTCCTTCTCAAACATGGGGGCATTGTCATTAATGTC 186 | 187 | 188 | Write a bam file consisting of reads with mapping quality >=30 and 189 | overlapping a certain locus: 190 | 191 | :: 192 | 193 | $ varlens-reads test/data/CELSR1/bams/bam_0.bam \ 194 | --min-mapping-quality 30 \ 195 | --locus 22:46932040-46932050 \ 196 | --out /tmp/result.bam 197 | 198 | Write a bam file consisting of reads overlapping variants from a VCF: 199 | 200 | :: 201 | 202 | $ varlens-reads test/data/CELSR1/bams/bam_0.bam \ 203 | --variants test/data/CELSR1/vcfs/vcf_1.vcf \ 204 | --out /tmp/result.bam 205 | 206 | Print just the header for a BAM in csv format: 207 | 208 | :: 209 | 210 | $ varlens-reads test/data/CELSR1/bams/bam_0.bam --header 211 | 212 | varlens-allele-support 213 | ---------------------- 214 | 215 | Given one or more BAMs and some genomic sites to consider, write a csv file 216 | giving counts of reads supporting each allele at each site for each BAM. 217 | 218 | The genomic sites to consider may be specified by locus (--locus option), or via 219 | one or more VCF files. 220 | 221 | The positions outputted by this command are in *interbase coordinates*, i.e. 222 | starting at 0, inclusive on first index, exclusive on second (as opposed to 223 | the one-based inclusive coordinates used in VCF files). 
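For example, the one-based inclusive locus ``22:46931061`` used below
corresponds to the interbase interval ``(46931060, 46931061)``. The conversion
is simple arithmetic; here is a minimal illustrative helper (hypothetical, not
part of the varlens API)::

    def inclusive_to_interbase(start, end):
        # 1-based inclusive [start, end] -> 0-based half-open [start - 1, end)
        return start - 1, end

    # 22:46931061 (inclusive) becomes interbase (46931060, 46931061), matching
    # the interbase_start/interbase_end columns in the output below.
    assert inclusive_to_interbase(46931061, 46931061) == (46931060, 46931061)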
224 | 225 | Examples 226 | ````````````` 227 | 228 | :: 229 | 230 | varlens-allele-support \ 231 | --reads test/data/CELSR1/bams/bam_1.bam \ 232 | --locus 22:46931061 22:46931063 233 | 234 | source,contig,interbase_start,interbase_end,allele,count 235 | bam_1.bam,22,46931060,46931061,,1 236 | bam_1.bam,22,46931060,46931061,G,329 237 | bam_1.bam,22,46931062,46931063,A,327 238 | bam_1.bam,22,46931062,46931063,AC,1 239 | bam_1.bam,22,46931062,46931063,AG,2 240 | 241 | Note on coordinate systems 242 | ----------------------------------- 243 | 244 | ``varlens`` uses 0-based half-open coordinates internally. Many tools 245 | (including samtools and VCF files) use inclusive 1-based coordinates. We try to 246 | keep the confusion to a minimum by using the term "interbase" whenever we're 247 | using 0-based half open coordinates and "inclusive" when we're using 1-based 248 | inclusive coordinates. 249 | 250 | One particularly sticky place this comes up is when specifying loci on the 251 | commandline using e.g. ``--locus chr22:43243-43244``. To maintain consistency 252 | with the most common other tools, when you specify a locus like 253 | ``chr22:10-20``, we interpret that as a 1-based inclusive coordinate. To 254 | specify 0-based half-open coordinates, use this syntax: ``chr22/11-20`` (i.e. a 255 | slash instead of a colon). 256 | 257 | See this `blog post `_ 258 | for more details on coordinate systems. 259 | 260 | .. Documentation 261 | ------------- 262 | The docs are just this readme and the commandline tool help. 263 | They are available here: http://openvax.github.io/varlens/docs/html 264 | 265 | 266 | -------------------------------------------------------------------------------- /deploy-gh-pages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Adapted from https://github.com/w3ctag/promises-guide/blob/master/deploy-gh-pages.sh 4 | 5 | set -e 6 | 7 | pip install Sphinx 8 | 9 | cd docs 10 | make clean 11 | make setup 12 | make rst 13 | make html 14 | 15 | cd _build 16 | 17 | mkdir docs 18 | mv html docs 19 | 20 | touch .nojekyll 21 | 22 | git init 23 | git config user.name "Travis-CI" 24 | git config user.email "travis@w3ctag.org" 25 | git add . 26 | git commit -m "Deploy to GitHub Pages" 27 | git push --force --quiet "https://${GH_TOKEN}@${GH_REF}" master:gh-pages > /dev/null 2>&1 28 | 29 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext rst view setup 23 | 24 | # Added by Tim: 25 | rst: 26 | sphinx-apidoc -T -f -o . ../varlens ../varlens/commands 27 | 28 | view: 29 | open _build/html/index.html 30 | 31 | setup: 32 | pip install Sphinx sphinxcontrib-autoprogram sphinx-rtd-theme sphinxcontrib-autorun2 sphinxcontrib-programoutput numpydoc 33 | 34 | help: 35 | @echo "Please use \`make ' where is one of" 36 | @echo " html to make standalone HTML files" 37 | @echo " dirhtml to make HTML files named index.html in directories" 38 | @echo " singlehtml to make a single large HTML file" 39 | @echo " pickle to make pickle files" 40 | @echo " json to make JSON files" 41 | @echo " htmlhelp to make HTML files and a HTML help project" 42 | @echo " qthelp to make HTML files and a qthelp project" 43 | @echo " applehelp to make an Apple Help Book" 44 | @echo " devhelp to make HTML files and a Devhelp project" 45 | @echo " epub to make an epub" 46 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 47 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 48 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 49 | @echo " text to make text files" 50 | @echo " man to make manual pages" 51 | @echo " texinfo to make Texinfo files" 52 | @echo " info to make Texinfo files and run them through makeinfo" 53 | @echo " gettext to make PO message catalogs" 54 | @echo " changes to make an overview of all changed/added/deprecated items" 55 | @echo " xml to make Docutils-native XML files" 56 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 57 | @echo " linkcheck to check all external links for integrity" 58 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 59 | @echo " coverage to run coverage check of the documentation (if enabled)" 60 | 61 | clean: 62 | rm -rf $(BUILDDIR)/* 63 | 64 | html: 65 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 66 | @echo 67 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 68 | 69 | dirhtml: 70 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 71 | @echo 72 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 73 | 74 | singlehtml: 75 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 76 | @echo 77 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 78 | 79 | pickle: 80 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 81 | @echo 82 | @echo "Build finished; now you can process the pickle files." 83 | 84 | json: 85 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 86 | @echo 87 | @echo "Build finished; now you can process the JSON files." 88 | 89 | htmlhelp: 90 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 91 | @echo 92 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 93 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
94 | 95 | qthelp: 96 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 97 | @echo 98 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 99 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 100 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/varlens.qhcp" 101 | @echo "To view the help file:" 102 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/varlens.qhc" 103 | 104 | applehelp: 105 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 106 | @echo 107 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 108 | @echo "N.B. You won't be able to view it unless you put it in" \ 109 | "~/Library/Documentation/Help or install it in your application" \ 110 | "bundle." 111 | 112 | devhelp: 113 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 114 | @echo 115 | @echo "Build finished." 116 | @echo "To view the help file:" 117 | @echo "# mkdir -p $$HOME/.local/share/devhelp/varlens" 118 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/varlens" 119 | @echo "# devhelp" 120 | 121 | epub: 122 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 123 | @echo 124 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 125 | 126 | latex: 127 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 128 | @echo 129 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 130 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 131 | "(use \`make latexpdf' here to do that automatically)." 132 | 133 | latexpdf: 134 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 135 | @echo "Running LaTeX files through pdflatex..." 136 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 137 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 138 | 139 | latexpdfja: 140 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 141 | @echo "Running LaTeX files through platex and dvipdfmx..." 142 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 143 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 144 | 145 | text: 146 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 147 | @echo 148 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 149 | 150 | man: 151 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 152 | @echo 153 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 154 | 155 | texinfo: 156 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 157 | @echo 158 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 159 | @echo "Run \`make' in that directory to run these through makeinfo" \ 160 | "(use \`make info' here to do that automatically)." 161 | 162 | info: 163 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 164 | @echo "Running Texinfo files through makeinfo..." 165 | make -C $(BUILDDIR)/texinfo info 166 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 167 | 168 | gettext: 169 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 170 | @echo 171 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 172 | 173 | changes: 174 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 175 | @echo 176 | @echo "The overview file is in $(BUILDDIR)/changes." 177 | 178 | linkcheck: 179 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 180 | @echo 181 | @echo "Link check complete; look for any errors in the above output " \ 182 | "or in $(BUILDDIR)/linkcheck/output.txt." 
183 | 184 | doctest: 185 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 186 | @echo "Testing of doctests in the sources finished, look at the " \ 187 | "results in $(BUILDDIR)/doctest/output.txt." 188 | 189 | coverage: 190 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 191 | @echo "Testing of coverage in the sources finished, look at the " \ 192 | "results in $(BUILDDIR)/coverage/python.txt." 193 | 194 | xml: 195 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 196 | @echo 197 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 198 | 199 | pseudoxml: 200 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 201 | @echo 202 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 203 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # varlens documentation build configuration file, created by 4 | # sphinx-quickstart on Wed Jun 10 19:36:39 2015. 5 | # Copied by Tim from sefara project. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | import sys 17 | import os 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | sys.path.insert(0, os.path.abspath('.')) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | #needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = [ 33 | 'sphinx.ext.autodoc', 34 | 'sphinx.ext.autosummary', 35 | 'sphinx.ext.viewcode', 36 | 'numpydoc', 37 | 'sphinxcontrib.programoutput', 38 | 'sphinxcontrib.autorun2', 39 | 'sphinxcontrib.autoprogram', 40 | ] 41 | 42 | # Added by Tim 43 | # http://stackoverflow.com/questions/12206334/sphinx-autosummary-toctree-contains-reference-to-nonexisting-document-warnings 44 | numpydoc_show_class_members = False 45 | 46 | # Add any paths that contain templates here, relative to this directory. 47 | templates_path = ['_templates'] 48 | 49 | # The suffix(es) of source filenames. 50 | # You can specify multiple suffix as a list of string: 51 | # source_suffix = ['.rst', '.md'] 52 | source_suffix = '.rst' 53 | 54 | # The encoding of source files. 55 | #source_encoding = 'utf-8-sig' 56 | 57 | # The master toctree document. 58 | master_doc = 'index' 59 | 60 | # General information about the project. 61 | project = u'varlens' 62 | copyright = u'2016, Tim O\'Donnell' 63 | author = u'Tim O\'Donnell' 64 | 65 | # The version info for the project you're documenting, acts as replacement for 66 | # |version| and |release|, also used in various other places throughout the 67 | # built documents. 68 | # 69 | # The short X.Y version. 70 | version = '0.0.1' 71 | # The full version, including alpha/beta/rc tags. 
72 | release = '0.0.1' 73 | 74 | # The language for content autogenerated by Sphinx. Refer to documentation 75 | # for a list of supported languages. 76 | # 77 | # This is also used if you do content translation via gettext catalogs. 78 | # Usually you set "language" from the command line for these cases. 79 | language = None 80 | 81 | # There are two options for replacing |today|: either, you set today to some 82 | # non-false value, then it is used: 83 | #today = '' 84 | # Else, today_fmt is used as the format for a strftime call. 85 | #today_fmt = '%B %d, %Y' 86 | 87 | # List of patterns, relative to source directory, that match files and 88 | # directories to ignore when looking for source files. 89 | exclude_patterns = ['_build'] 90 | 91 | # The reST default role (used for this markup: `text`) to use for all 92 | # documents. 93 | default_role = 'any' 94 | 95 | # If true, '()' will be appended to :func: etc. cross-reference text. 96 | #add_function_parentheses = True 97 | 98 | # If true, the current module name will be prepended to all description 99 | # unit titles (such as .. function::). 100 | #add_module_names = True 101 | 102 | # If true, sectionauthor and moduleauthor directives will be shown in the 103 | # output. They are ignored by default. 104 | #show_authors = False 105 | 106 | # The name of the Pygments (syntax highlighting) style to use. 107 | pygments_style = 'sphinx' 108 | 109 | # A list of ignored prefixes for module index sorting. 110 | #modindex_common_prefix = [] 111 | 112 | # If true, keep warnings as "system message" paragraphs in the built documents. 113 | #keep_warnings = False 114 | 115 | # If true, `todo` and `todoList` produce output, else they produce nothing. 116 | todo_include_todos = False 117 | 118 | 119 | # -- Options for HTML output ---------------------------------------------- 120 | 121 | # The theme to use for HTML and HTML Help pages. See the documentation for 122 | # a list of builtin themes. 123 | html_theme = 'sphinx_rtd_theme' 124 | 125 | # Theme options are theme-specific and customize the look and feel of a theme 126 | # further. For a list of options available for each theme, see the 127 | # documentation. 128 | #html_theme_options = {} 129 | 130 | # Add any paths that contain custom themes here, relative to this directory. 131 | #html_theme_path = [] 132 | 133 | # The name for this set of Sphinx documents. If None, it defaults to 134 | # " v documentation". 135 | #html_title = None 136 | 137 | # A shorter title for the navigation bar. Default is the same as html_title. 138 | #html_short_title = None 139 | 140 | # The name of an image file (relative to this directory) to place at the top 141 | # of the sidebar. 142 | #html_logo = None 143 | 144 | # The name of an image file (within the static path) to use as favicon of the 145 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 146 | # pixels large. 147 | #html_favicon = None 148 | 149 | # Add any paths that contain custom static files (such as style sheets) here, 150 | # relative to this directory. They are copied after the builtin static files, 151 | # so a file named "default.css" will overwrite the builtin "default.css". 152 | html_static_path = ['_static'] 153 | 154 | # Add any extra paths that contain custom files (such as robots.txt or 155 | # .htaccess) here, relative to this directory. These files are copied 156 | # directly to the root of the documentation. 
157 | #html_extra_path = [] 158 | 159 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 160 | # using the given strftime format. 161 | #html_last_updated_fmt = '%b %d, %Y' 162 | 163 | # If true, SmartyPants will be used to convert quotes and dashes to 164 | # typographically correct entities. 165 | #html_use_smartypants = True 166 | 167 | # Custom sidebar templates, maps document names to template names. 168 | #html_sidebars = {} 169 | 170 | # Additional templates that should be rendered to pages, maps page names to 171 | # template names. 172 | #html_additional_pages = {} 173 | 174 | # If false, no module index is generated. 175 | #html_domain_indices = True 176 | 177 | # If false, no index is generated. 178 | #html_use_index = True 179 | 180 | # If true, the index is split into individual pages for each letter. 181 | #html_split_index = False 182 | 183 | # If true, links to the reST sources are added to the pages. 184 | #html_show_sourcelink = True 185 | 186 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 187 | #html_show_sphinx = True 188 | 189 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 190 | #html_show_copyright = True 191 | 192 | # If true, an OpenSearch description file will be output, and all pages will 193 | # contain a tag referring to it. The value of this option must be the 194 | # base URL from which the finished HTML is served. 195 | #html_use_opensearch = '' 196 | 197 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 198 | #html_file_suffix = None 199 | 200 | # Language to be used for generating the HTML full-text search index. 201 | # Sphinx supports the following languages: 202 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 203 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 204 | #html_search_language = 'en' 205 | 206 | # A dictionary with options for the search language support, empty by default. 207 | # Now only 'ja' uses this config value 208 | #html_search_options = {'type': 'default'} 209 | 210 | # The name of a javascript file (relative to the configuration directory) that 211 | # implements a search results scorer. If empty, the default will be used. 212 | #html_search_scorer = 'scorer.js' 213 | 214 | # Output file base name for HTML help builder. 215 | htmlhelp_basename = 'varlensdoc' 216 | 217 | # -- Options for LaTeX output --------------------------------------------- 218 | 219 | latex_elements = { 220 | # The paper size ('letterpaper' or 'a4paper'). 221 | #'papersize': 'letterpaper', 222 | 223 | # The font size ('10pt', '11pt' or '12pt'). 224 | #'pointsize': '10pt', 225 | 226 | # Additional stuff for the LaTeX preamble. 227 | #'preamble': '', 228 | 229 | # Latex figure (float) alignment 230 | #'figure_align': 'htbp', 231 | } 232 | 233 | # Grouping the document tree into LaTeX files. List of tuples 234 | # (source start file, target name, title, 235 | # author, documentclass [howto, manual, or own class]). 236 | latex_documents = [ 237 | (master_doc, 'varlens.tex', u'varlens Documentation', 238 | u'Tim O\'Donnell', 'manual'), 239 | ] 240 | 241 | # The name of an image file (relative to this directory) to place at the top of 242 | # the title page. 243 | #latex_logo = None 244 | 245 | # For "manual" documents, if this is true, then toplevel headings are parts, 246 | # not chapters. 247 | #latex_use_parts = False 248 | 249 | # If true, show page references after internal links. 
250 | #latex_show_pagerefs = False 251 | 252 | # If true, show URL addresses after external links. 253 | #latex_show_urls = False 254 | 255 | # Documents to append as an appendix to all manuals. 256 | #latex_appendices = [] 257 | 258 | # If false, no module index is generated. 259 | #latex_domain_indices = True 260 | 261 | 262 | # -- Options for manual page output --------------------------------------- 263 | 264 | # One entry per manual page. List of tuples 265 | # (source start file, name, description, authors, manual section). 266 | man_pages = [ 267 | (master_doc, 'varlens', u'varlens Documentation', 268 | [author], 1) 269 | ] 270 | 271 | # If true, show URL addresses after external links. 272 | #man_show_urls = False 273 | 274 | 275 | # -- Options for Texinfo output ------------------------------------------- 276 | 277 | # Grouping the document tree into Texinfo files. List of tuples 278 | # (source start file, target name, title, author, 279 | # dir menu entry, description, category) 280 | texinfo_documents = [ 281 | (master_doc, 'varlens', u'varlens Documentation', 282 | author, 'varlens', 283 | 'Python commandline tools for manipulating genomic variants and NGS reads', 284 | 'Miscellaneous'), 285 | ] 286 | 287 | # Documents to append as an appendix to all manuals. 288 | #texinfo_appendices = [] 289 | 290 | # If false, no module index is generated. 291 | #texinfo_domain_indices = True 292 | 293 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 294 | #texinfo_show_urls = 'footnote' 295 | 296 | # If true, do not generate a @detailmenu in the "Top" node's menu. 297 | #texinfo_no_detailmenu = False 298 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Varlens Documentation 2 | ================================== 3 | 4 | .. include:: ../README.rst 5 | 6 | Commandline tool help 7 | ================================== 8 | 9 | varlens-variants 10 | ---------------------------------- 11 | 12 | .. command-output:: varlens-variants -h 13 | 14 | varlens-reads 15 | ---------------------------------- 16 | 17 | .. command-output:: varlens-reads -h 18 | 19 | varlens-allele-support 20 | ---------------------------------- 21 | 22 | .. command-output:: varlens-allele-support -h 23 | 24 | 25 | -------------------------------------------------------------------------------- /docs/varlens.rst: -------------------------------------------------------------------------------- 1 | varlens package 2 | =============== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | varlens.read_evidence 10 | 11 | Submodules 12 | ---------- 13 | 14 | varlens.loci_util module 15 | ------------------------ 16 | 17 | .. automodule:: varlens.loci_util 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | varlens.locus module 23 | -------------------- 24 | 25 | .. automodule:: varlens.locus 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | varlens.mhc_binding module 31 | -------------------------- 32 | 33 | .. automodule:: varlens.mhc_binding 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | varlens.read_source module 39 | -------------------------- 40 | 41 | .. automodule:: varlens.read_source 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | varlens.reads_util module 47 | ------------------------- 48 | 49 | .. 
automodule:: varlens.reads_util 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | varlens.sequence_context module 55 | ------------------------------- 56 | 57 | .. automodule:: varlens.sequence_context 58 | :members: 59 | :undoc-members: 60 | :show-inheritance: 61 | 62 | varlens.support module 63 | ---------------------- 64 | 65 | .. automodule:: varlens.support 66 | :members: 67 | :undoc-members: 68 | :show-inheritance: 69 | 70 | varlens.util module 71 | ------------------- 72 | 73 | .. automodule:: varlens.util 74 | :members: 75 | :undoc-members: 76 | :show-inheritance: 77 | 78 | varlens.variant_includes module 79 | ------------------------------- 80 | 81 | .. automodule:: varlens.variant_includes 82 | :members: 83 | :undoc-members: 84 | :show-inheritance: 85 | 86 | varlens.variants_util module 87 | ---------------------------- 88 | 89 | .. automodule:: varlens.variants_util 90 | :members: 91 | :undoc-members: 92 | :show-inheritance: 93 | 94 | 95 | Module contents 96 | --------------- 97 | 98 | .. automodule:: varlens 99 | :members: 100 | :undoc-members: 101 | :show-inheritance: 102 | -------------------------------------------------------------------------------- /lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | 4 | 5 | # disabling several categories of errors due to false positives in pylint, 6 | # see these issues: 7 | # - https://bitbucket.org/logilab/pylint/issues/701/false-positives-with-not-an-iterable-and 8 | # - https://bitbucket.org/logilab/pylint/issues/58 9 | 10 | find varlens/ -name '*.py' \ 11 | | xargs pylint \ 12 | --errors-only \ 13 | --disable=unsubscriptable-object,not-an-iterable,no-member,invalid-unary-operand-type \ 14 | && \ 15 | echo 'Passes pylint check' \ 16 | && \ 17 | ruff check varlens/ \ 18 | && \ 19 | echo "Passes ruff check" 20 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License.
12 | 13 | from __future__ import print_function 14 | import os 15 | import re 16 | 17 | from setuptools import setup 18 | 19 | current_directory = os.path.dirname(__file__) 20 | 21 | with open('varlens/version.py', 'r') as f: 22 | version = re.search( 23 | r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', 24 | f.read(), 25 | re.MULTILINE).group(1) 26 | 27 | if __name__ == '__main__': 28 | setup( 29 | name='varlens', 30 | packages=["varlens", "varlens.commands", "varlens.read_evidence"], 31 | version=version, 32 | description=( 33 | "commandline manipulation of genomic variants and NGS reads"), 34 | long_description=open('README.rst').read(), 35 | url="https://github.com/openvax/varlens", 36 | author="Tim O'Donnell", 37 | author_email="timodonnell@gmail.com", 38 | license="http://www.apache.org/licenses/LICENSE-2.0.html", 39 | entry_points={ 40 | 'console_scripts': [ 41 | 'varlens-allele-support = varlens.commands.allele_support:run', 42 | 'varlens-variants = varlens.commands.variants:run', 43 | 'varlens-reads = varlens.commands.reads:run', 44 | ], 45 | }, 46 | classifiers=[ 47 | 'Development Status :: 3 - Alpha', 48 | 'Environment :: Console', 49 | 'Operating System :: OS Independent', 50 | 'Intended Audience :: Science/Research', 51 | 'License :: OSI Approved :: Apache Software License', 52 | 'Programming Language :: Python', 53 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 54 | ], 55 | install_requires=[ 56 | 'cython>=0.21', 57 | 'numpy', 58 | 'intervaltree', 59 | 'pysam>=0.13', 60 | 'typechecks', 61 | 'varcode', 62 | 'pyfaidx', 63 | 'mhctools', 64 | 'topiary', 65 | ], 66 | ) 67 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | pytest --cov=varlens/ --cov-report=term-missing tests 2 | 3 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Utility functions for tests. 3 | ''' 4 | 5 | import sys 6 | import os 7 | import tempfile 8 | from contextlib import contextmanager 9 | 10 | import pandas 11 | 12 | if sys.version_info[0] < 3: 13 | from StringIO import StringIO 14 | else: 15 | from io import StringIO 16 | 17 | def data_path(name): 18 | ''' 19 | Return the absolute path to a file in the test/data directory. 20 | The name specified should be relative to test/data. 21 | ''' 22 | return os.path.join(os.path.dirname(__file__), "data", name) 23 | 24 | class Capturing(list): 25 | def __enter__(self): 26 | self._stdout = sys.stdout 27 | sys.stdout = self._stringio = StringIO() 28 | return self 29 | 30 | def __exit__(self, *args): 31 | self.extend(self._stringio.getvalue().splitlines()) 32 | sys.stdout = self._stdout 33 | 34 | def run_and_parse_csv(function, *args): 35 | with Capturing() as output: 36 | function(*args) 37 | try: 38 | result = pandas.read_csv(StringIO("\n".join(output))) 39 | except: 40 | print("Couldn't parse csv. Function: %s. 
Args: %s.\nOutput:\n%s" 41 | % (str(function), str(args), "\n".join(output))) 42 | raise 43 | return result 44 | 45 | @contextmanager 46 | def temp_file(suffix=".csv"): 47 | fd = tempfile.NamedTemporaryFile( 48 | suffix=suffix, 49 | prefix="test_varlens_", 50 | delete=False) 51 | filename = fd.name 52 | fd.close() 53 | yield filename 54 | os.unlink(filename) 55 | 56 | def cols_concat(df, columns, delimiter="-"): 57 | assert df is not None 58 | zipped = zip(*[df[c] for c in columns]) 59 | return set([delimiter.join(str(item) for item in row) for row in zipped]) -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_0.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_0.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_0.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_0.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_1.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_1.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_1.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_1.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_10.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_10.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_10.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_10.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_11.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_11.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_11.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_11.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_12.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_12.bam 
-------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_12.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_12.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_13.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_13.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_13.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_13.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_14.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_14.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_14.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_14.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_15.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_15.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_15.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_15.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_16.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_16.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_16.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_16.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_17.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_17.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_17.bam.bai: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_17.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_18.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_18.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_18.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_18.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_19.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_19.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_19.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_19.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_2.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_2.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_2.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_2.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_20.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_20.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_20.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_20.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_21.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_21.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_21.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_21.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_22.bam: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_22.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_22.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_22.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_23.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_23.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_23.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_23.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_3.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_3.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_3.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_3.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_4.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_4.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_4.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_4.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_5.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_5.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_5.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_5.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_6.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_6.bam -------------------------------------------------------------------------------- 
/tests/data/CELSR1/bams/bam_6.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_6.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_7.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_7.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_7.bam.bai: -------------------------------------------------------------------------------- 1 | BAI] -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_8.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_8.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_8.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_8.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_9.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_9.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_9.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_9.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_1.vcf: -------------------------------------------------------------------------------- 1 | ##reference=file:///hpc/users/ahujaa01/ksinai-demeter/hg19-reference-genome/hg19.fasta 2 | ##contig= 3 | chr22 21829555 rs377578228 T G . PASS DB;SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:192,7:35:199:0.035:2 0:41,1:.:41:0.024:0 4 | chr22 46931060 . A C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:65,9:17:74:0.122:2 0:15,1:.:16:0.063:0 5 | chr22 46931062 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:89,44:33:133:0.331:2 0:42,0:.:42:0.00:0 6 | chr22 50636218 . A C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:86,12:16:98:0.122:2 0:12,0:.:12:0.00:0 7 | chr22 50875933 . A C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:56,11:15:67:0.164:2 0:10,0:.:10:0.00:0 8 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_10.vcf: -------------------------------------------------------------------------------- 1 | ##reference=file:///hpc/users/ahujaa01/ksinai-demeter/hg19-reference-genome/hg19.fasta 2 | chr22 22309964 . C T . QSS_ref NT=ref;QSS=5;QSS_NT=2;SGT=CC->CT;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 1:0:0:0:0,0:1,1:0,0:0,0 258:0:0:0:0,0:178,178:0,0:80,80 3 | chr22 46931062 . G A . 
PASS NT=ref;QSS=39;QSS_NT=39;SGT=GG->AG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 45:0:0:0:0,0:0,0:45,47:0,0 140:1:0:0:25,26:0,0:113,113:1,1 4 | chr22 46931061 . G T . PASS NT=ref;QSS=39;QSS_NT=39;SGT=GG->AG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 45:0:0:0:0,0:0,0:45,47:0,0 140:1:0:0:25,26:0,0:113,113:1,1 5 | chr22 50528497 . A G . QSS_ref NT=ref;QSS=1;QSS_NT=0;SGT=AG->AG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 1:0:0:0:0,0:0,0:1,1:0,0 29:0:0:0:25,25:0,0:4,4:0,0 6 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_11.vcf: -------------------------------------------------------------------------------- 1 | ##reference=file:///hpc/users/ahujaa01/ksinai-demeter/hg19-reference-genome/hg19.fasta 2 | chr22 30507883 . A T . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0:41,1:.:42:0.024:0 0/1:61,9:17:70:0.129:2 3 | chr22 40060737 . G C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0:13,1:.:14:0.071:0 0/1:52,10:17:63:0.161:2 4 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_12.vcf: -------------------------------------------------------------------------------- 1 | ##reference=file:///hpc/users/ahujaa01/ksinai-demeter/hg19-reference-genome/hg19.fasta 2 | chr22 20387160 . G A . PASS NT=ref;QSS=15;QSS_NT=15;SGT=GG->AG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 81:0:0:0:10,10:0,0:71,71:0,0 84:1:0:0:21,22:0,0:62,62:0,0 3 | chr22 22309964 . C T . QSS_ref NT=ref;QSS=5;QSS_NT=2;SGT=CC->CT;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 1:0:0:0:0,0:1,1:0,0:0,0 293:0:0:0:0,0:190,191:0,0:103,103 4 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_13.vcf: -------------------------------------------------------------------------------- 1 | ##reference=file:///hpc/users/ahujaa01/ksinai-demeter/hg19-reference-genome/hg19.fasta 2 | chr22 20387160 . G . . QSS_ref NT=ref;QSS=2;QSS_NT=2;SGT=GG->GG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 81:0:0:0:10,10:0,0:71,71:0,0 70:0:0:0:15,15:0,0:55,55:0,0 3 | chr22 22309964 . C T . QSS_ref NT=ref;QSS=5;QSS_NT=2;SGT=CC->CT;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 1:0:0:0:0,0:1,1:0,0:0,0 232:0:0:0:0,0:159,159:0,0:73,73 4 | chr22 22576057 . C T . QSS_ref NT=ref;QSS=4;QSS_NT=1;SGT=CC->CT;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 0:0:0:0:0,0:0,0:0,0:0,0 206:0:0:0:0,0:62,62:0,0:144,145 5 | chr22 23481061 . C A . QSS_ref NT=ref;QSS=3;QSS_NT=3;SGT=CC->AC;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 18:1:0:0:0,0:17,18:0,0:0,0 126:9:0:0:8,12:109,120:0,0:0,0 6 | chr22 30507883 . A T . QSS_ref NT=ref;QSS=2;QSS_NT=2;SGT=AT->AT;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 45:0:0:0:43,43:0,0:0,0:2,2 82:4:0:0:68,70:0,1:0,0:10,11 7 | chr22 46931060 . A G . QSS_ref NT=ref;QSS=1;QSS_NT=1;SGT=AG->AG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 31:0:0:0:31,32:0,0:0,0:0,0 57:0:0:0:53,53:0,0:4,4:0,0 8 | chr22 50962223 . G A . PASS NT=ref;QSS=24;QSS_NT=24;SGT=GG->AG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 47:0:0:0:0,0:0,0:47,48:0,0 347:0:0:0:11,11:0,0:335,336:1,1 9 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_14.vcf: -------------------------------------------------------------------------------- 1 | ##reference=file:///hpc/users/ahujaa01/ksinai-demeter/hg19-reference-genome/hg19.fasta 2 | chr22 20387160 . G A . 
QSS_ref NT=ref;QSS=3;QSS_NT=3;SGT=GG->AG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 81:0:0:0:10,10:0,0:71,71:0,0 73:0:0:0:9,9:0,0:64,64:0,0 3 | chr22 20796345 . C . . QSS_ref NT=ref;QSS=3;QSS_NT=3;SGT=CC->CC;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 21:0:0:0:0,0:21,21:0,0:0,0 80:0:0:0:0,0:76,76:0,0:4,4 4 | chr22 21174794 . C A . QSS_ref NT=ref;QSS=3;QSS_NT=3;SGT=CC->AC;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 67:0:0:0:0,0:67,67:0,0:0,0 160:4:0:0:11,15:145,145:0,0:0,0 5 | chr22 21174795 . C A . QSS_ref NT=ref;QSS=9;QSS_NT=9;SGT=CC->AC;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 68:0:0:0:0,0:68,68:0,0:0,0 160:4:0:0:15,18:141,142:0,0:0,0 6 | chr22 22309964 . C T . QSS_ref NT=ref;QSS=5;QSS_NT=2;SGT=CC->CT;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 1:0:0:0:0,0:1,1:0,0:0,0 215:1:0:0:0,0:144,144:0,0:70,71 7 | chr22 23481104 . C . . QSS_ref NT=ref;QSS=1;QSS_NT=1;SGT=CC->CC;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 10:2:0:0:0,0:8,10:0,0:0,0 169:14:0:0:11,17:144,156:0,0:0,0 8 | chr22 30507883 . A T . QSS_ref NT=ref;QSS=1;QSS_NT=1;SGT=AT->AT;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 45:0:0:0:43,43:0,0:0,0:2,2 109:3:0:0:97,98:0,0:0,0:9,12 9 | chr22 42970799 . A G . QSS_ref NT=ref;QSS=1;QSS_NT=1;SGT=AG->AG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 31:0:0:0:31,32:0,0:0,0:0,0 81:0:0:0:76,76:0,0:5,5:0,0 10 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_2.vcf: -------------------------------------------------------------------------------- 1 | ##reference=file:///hpc/users/ahujaa01/ksinai-demeter/hg19-reference-genome/hg19.fasta 2 | chr22 45309893 . T G . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:52,10:13:62:0.161:2 0:27,1:.:28:0.036:0 3 | chr22 46931062 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:94,53:32:147:0.361:2 0:42,0:.:42:0.00:0 4 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_3.vcf: -------------------------------------------------------------------------------- 1 | ##contig= 2 | chr22 20390444 . A G . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:76,4:32:80:0.050:2 0:24,0:.:24:0.00:0 3 | chr22 25016296 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:95,4:32:99:0.040:2 0:43,0:.:43:0.00:0 4 | chr22 25046004 . T G . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:138,7:30:146:0.048:2 0:62,1:.:63:0.016:0 5 | chr22 46931061 . G C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:211,8:29:220:0.037:2 0:97,1:.:99:0.010:0 6 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_4.vcf: -------------------------------------------------------------------------------- 1 | chr22 23481083 . C A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:345,25:17:372:0.068:2 0:14,0:.:14:0.00:0 2 | chr22 24106576 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:952,42:33:995:0.042:2 0:27,0:.:27:0.00:0 3 | chr22 40257775 . T G . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:132,13:16:145:0.090:2 0:88,1:.:89:0.011:0 4 | chr22 46931062 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:254,243:32:497:0.489:2 0:42,0:.:42:0.00:0 5 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_5.vcf: -------------------------------------------------------------------------------- 1 | chr22 24106576 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:910,45:34:956:0.047:2 0:27,0:.:27:0.00:0 2 | chr22 29939378 . A G . 
PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:211,23:13:234:0.098:2 0:38,1:.:39:0.026:0 3 | chr22 38051393 . A C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:19,4:25:23:0.174:2 0:9,0:.:9:0.00:0 4 | chr22 40060737 . G C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:154,17:14:171:0.099:2 0:13,1:.:14:0.071:0 5 | chr22 43617352 . T G . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:209,22:12:231:0.095:2 0:19,0:.:19:0.00:0 6 | chr22 46931062 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:260,249:33:509:0.489:2 0:42,0:.:42:0.00:0 7 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_6.vcf: -------------------------------------------------------------------------------- 1 | chr22 40060737 . G C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:65,11:14:77:0.145:2 0:13,1:.:14:0.071:0 2 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_7.vcf: -------------------------------------------------------------------------------- 1 | chr22 46931062 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:108,25:32:134:0.188:2 0:42,0:.:42:0.00:0 2 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_8.vcf: -------------------------------------------------------------------------------- 1 | ##contig= 2 | chr22 21053057 . C T . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:129,9:29:138:0.065:2 0:49,1:.:50:0.020:0 3 | chr22 22974759 . G C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:178,8:32:186:0.043:2 0:80,1:.:75:0.012:0 4 | chr22 23241800 . T G . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:132,5:32:137:0.036:2 0:61,0:.:61:0.00:0 5 | chr22 23241804 . A G . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:131,5:33:137:0.037:2 0:62,0:.:62:0.00:0 6 | chr22 24655840 . C T . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:98,6:30:104:0.058:2 0:69,1:.:37:0.014:0 7 | chr22 25044103 . G T . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:132,6:29:138:0.043:2 0:73,1:.:67:0.014:0 8 | chr22 25044108 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:135,6:30:142:0.043:2 0:73,1:.:66:0.014:0 9 | chr22 25574241 . G C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:104,7:31:111:0.063:2 0:34,0:.:34:0.00:0 10 | chr22 40060742 . A C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:33,6:18:39:0.154:2 0:16,0:.:16:0.00:0 11 | chr22 46931062 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:108,7:27:115:0.061:2 0:42,0:.:42:0.00:0 12 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_9.vcf: -------------------------------------------------------------------------------- 1 | chr22 21174795 . C A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:140,13:17:153:0.085:2 0:61,0:.:61:0.00:0 2 | chr22 29939378 . A G . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:74,10:15:84:0.119:2 0:38,1:.:39:0.026:0 3 | chr22 38037134 . A C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:73,9:19:83:0.110:2 0:15,1:.:16:0.063:0 4 | chr22 45309893 . T G . 
PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:80,14:12:94:0.149:2 0:27,1:.:28:0.036:0 5 | -------------------------------------------------------------------------------- /tests/data/gatk_mini_bundle_extract.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/gatk_mini_bundle_extract.bam -------------------------------------------------------------------------------- /tests/data/gatk_mini_bundle_extract.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/gatk_mini_bundle_extract.bam.bai -------------------------------------------------------------------------------- /tests/data/rna_chr17_41244936.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/rna_chr17_41244936.bam -------------------------------------------------------------------------------- /tests/data/rna_chr17_41244936.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/rna_chr17_41244936.bam.bai -------------------------------------------------------------------------------- /tests/test_allele_support.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import absolute_import 16 | 17 | import logging 18 | 19 | from nose.tools import eq_ 20 | 21 | from varlens.commands import allele_support 22 | 23 | from . 
import data_path, run_and_parse_csv, cols_concat 24 | 25 | def run(args): 26 | logging.info("Running with args: " + ' '.join(args)) 27 | return run_and_parse_csv(allele_support.run, args) 28 | 29 | expected_cols = [ 30 | "contig", "interbase_start", "interbase_end", "allele", "count", 31 | ] 32 | 33 | def test_basic(): 34 | result = run([ 35 | "--reads", data_path("CELSR1/bams/bam_5.bam"), 36 | "--locus", "chr22:46929963", "chr22:46929964", 37 | ]) 38 | eq_(cols_concat(result, expected_cols), 39 | {"22-46929962-46929963-C-60", "22-46929963-46929964-A-81"}) 40 | 41 | result = run([ 42 | "--reads", data_path("CELSR1/bams/bam_5.bam"), 43 | "--locus", "chr22:46929963", "chr22:46929964", 44 | "--is-reverse" 45 | ]) 46 | eq_(cols_concat(result, expected_cols), 47 | {"22-46929962-46929963-C-37", "22-46929963-46929964-A-47"}) 48 | 49 | result = run([ 50 | "--reads", data_path("gatk_mini_bundle_extract.bam"), 51 | "--locus", "chr20:10008951", 52 | "--is-reverse", 53 | ]) 54 | eq_(cols_concat(result, expected_cols), 55 | {"20-10008950-10008951-C-1"}) 56 | 57 | def test_simple(): 58 | result = run([ 59 | "--reads", data_path("CELSR1/bams/bam_0.bam"), 60 | "--genome", "b37", 61 | "--variants", data_path("CELSR1/vcfs/vcf_1.vcf"), 62 | ]) 63 | eq_(cols_concat( 64 | result, 65 | ["contig", "interbase_start", "interbase_end", "allele", "count"]), 66 | { 67 | '22-50636217-50636218-N-0', 68 | '22-50875932-50875933-N-0', 69 | '22-21829554-21829555-N-0', 70 | "22-46931059-46931060-A-50", 71 | "22-46931061-46931062-G-51", 72 | }) 73 | 74 | pick_one_variant = [ 75 | ["--ref", "G"], 76 | ["--alt", "A"], 77 | ["--variant-locus", "22/46931061"], 78 | ["--variant-locus", "22/46931061-46931062"], 79 | ] 80 | for variant_filter in pick_one_variant: 81 | result = run([ 82 | "--reads", data_path("CELSR1/bams/bam_0.bam"), 83 | "--genome", "b37", 84 | "--variants", data_path("CELSR1/vcfs/vcf_1.vcf"), 85 | ] + variant_filter) 86 | yield ( 87 | eq_, 88 | cols_concat( 89 | result, ["contig", "interbase_start", "interbase_end"]), 90 | {"22-46931061-46931062"}) 91 | 92 | -------------------------------------------------------------------------------- /tests/test_read_evidence.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from __future__ import absolute_import 15 | 16 | import collections 17 | from nose.tools import eq_, assert_raises 18 | 19 | from pysam import Samfile 20 | 21 | from varcode import Variant as VarcodeVariant 22 | 23 | from . 
import data_path 24 | 25 | from varlens.locus import Locus 26 | from varlens.read_evidence import PileupCollection 27 | 28 | Variant = collections.namedtuple("Variant", "locus ref alt") 29 | 30 | def filtered_read_names(filtered_evidence): 31 | assert filtered_evidence.parent is not None 32 | full = set(filtered_evidence.parent.read_attribute('query_name')) 33 | filtered = set(filtered_evidence.read_attribute('query_name')) 34 | assert filtered - full == set() 35 | return full - filtered 36 | 37 | def test_read_evidence_rna1_single_base_loci(): 38 | loci = [ 39 | Locus.from_inclusive_coordinates("17", 41244936, 41244936), # 0 40 | Locus.from_inclusive_coordinates("17", 41244937, 41244937), # 1 41 | Locus.from_inclusive_coordinates("17", 41244935, 41244935), # 2 42 | Locus.from_inclusive_coordinates("17", 41244933, 41244933), # 3 43 | Locus.from_inclusive_coordinates("17", 41244853, 41244853), # 4 44 | Locus.from_inclusive_coordinates("17", 41244857, 41244857), # 5 45 | Locus.from_inclusive_coordinates("17", 41244864, 41244864), # 6 46 | Locus.from_inclusive_coordinates("17", 41244879, 41244879), # 7 47 | Locus.from_inclusive_coordinates("17", 41244901, 41244901), # 8 48 | Locus.from_inclusive_coordinates("17", 41244910, 41244910), # 9 49 | Locus.from_inclusive_coordinates("17", 41244917, 41244917), # 10 50 | Locus.from_inclusive_coordinates("17", 41244972, 41244972), # 11 51 | Locus.from_inclusive_coordinates("17", 41244973, 41244973), # 12 52 | Locus.from_inclusive_coordinates("17", 41245026, 41245026), # 13 53 | Locus.from_inclusive_coordinates("17", 41245027, 41245027), # 14 54 | Locus.from_inclusive_coordinates("17", 41245019, 41245019), # 15 55 | Locus.from_inclusive_coordinates("17", 41245018, 41245018), # 16 56 | ] 57 | evidence = PileupCollection.from_bam( 58 | data_path("rna_chr17_41244936.bam"), loci) 59 | 60 | eq_(evidence.allele_summary(loci[0]), [("A", 11), ("G", 7)]) 61 | eq_(evidence.allele_summary(loci[1]), [("G", 17), ("A", 1)]) 62 | eq_(evidence.allele_summary(loci[2]), [("C", 18)]) 63 | eq_(evidence.allele_summary(loci[3]), [("A", 17), ("G", 1)]) 64 | eq_(evidence.allele_summary(loci[4]), [("C", 1)]) 65 | eq_(evidence.allele_summary(loci[5]), [("T", 2)]) 66 | eq_(evidence.allele_summary(loci[6]), [("T", 4)]) 67 | eq_(evidence.allele_summary(loci[7]), [("C", 8)]) 68 | eq_(evidence.allele_summary(loci[8]), [("C", 8)]) 69 | eq_(evidence.allele_summary(loci[9]), [("C", 9)]) 70 | eq_(evidence.allele_summary(loci[10]), [("A", 10)]) 71 | eq_(evidence.allele_summary(loci[11]), [("T", 11)]) 72 | eq_(evidence.allele_summary(loci[12]), [("T", 11)]) 73 | eq_(evidence.allele_summary(loci[13]), [("C", 1)]) 74 | eq_(evidence.allele_summary(loci[14]), [("G", 1)]) 75 | eq_(evidence.allele_summary(loci[15]), [("T", 8)]) 76 | eq_(evidence.allele_summary(loci[16]), [("T", 8)]) 77 | 78 | def test_read_evidence_rna1_multi_base_loci(): 79 | loci = [ 80 | Locus.from_inclusive_coordinates("17", 41244853, 41244854), # 0 81 | Locus.from_inclusive_coordinates("17", 41244853, 41244857), # 1 82 | Locus.from_inclusive_coordinates("17", 41244854, 41244857), # 2 83 | Locus.from_inclusive_coordinates("17", 41244852, 41244857), # 3 84 | Locus.from_inclusive_coordinates("17", 41244933, 41244936), # 4 85 | Locus.from_inclusive_coordinates("17", 41244933, 41244937), # 5 86 | Locus.from_inclusive_coordinates("17", 41244971, 41244973), # 6 87 | Locus.from_inclusive_coordinates("17", 41265063, 41265067), # 7 88 | ] 89 | evidence = PileupCollection.from_bam( 90 | data_path("rna_chr17_41244936.bam"), loci) 91 | 
eq_(evidence.allele_summary(loci[0]), [("CT", 1)]) 92 | eq_(evidence.allele_summary(loci[1]), [("CTTTT", 1)]) 93 | eq_(evidence.allele_summary(loci[2]), [("TTTT", 1)]) 94 | eq_(evidence.allele_summary(loci[3]), []) 95 | eq_(evidence.allele_summary(loci[4]), 96 | [("AACA", 11), ("AACG", 6), ("GACG", 1)]) 97 | eq_(evidence.allele_summary(loci[5]), 98 | [("AACAG", 10), ("AACGG", 6), ("AACAA", 1), ("GACGG", 1)]) 99 | eq_(evidence.allele_summary(loci[6]), [("ATT", 11)]) 100 | eq_(evidence.allele_summary(loci[7]), [("ACCCG", 1)]) 101 | 102 | def test_read_evidence_gatk_mini_bundle_extract(): 103 | loci = [ 104 | Locus.from_inclusive_coordinates("20", 9999996, 9999996), # 0 105 | Locus.from_inclusive_coordinates("20", 10260442), # 1 106 | Locus.from_inclusive_coordinates("20", 10006823), # 2 107 | Locus.from_inclusive_coordinates("20", 10006819, 10006823), # 3 108 | Locus.from_inclusive_coordinates("20", 10006819, 10006825), # 4 109 | Locus.from_inclusive_coordinates("20", 10006822, 10006827), # 5 110 | Locus.from_inclusive_coordinates("20", 10007175), # 6 111 | Locus.from_inclusive_coordinates("20", 10007174, 10007176), # 7 112 | Locus.from_inclusive_coordinates("20", 1, 3), # 8 113 | Locus.from_inclusive_coordinates("20", 10008796), # 9 114 | Locus.from_inclusive_coordinates("20", 10008921), # 10 115 | ] 116 | handle = Samfile(data_path("gatk_mini_bundle_extract.bam")) 117 | evidence = PileupCollection.from_bam(handle, loci) 118 | 119 | eq_(evidence.allele_summary(loci[0]), [("ACT", 9)]) 120 | eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[0]), 121 | [("ACT", 8)]) 122 | eq_(evidence.allele_summary(loci[1]), [("T", 7)]) 123 | eq_(evidence.filter().allele_summary(loci[2]), [("", 6), ("C", 2)]) 124 | eq_(evidence.filter( 125 | drop_duplicates=True, min_base_quality=50).allele_summary(loci[2]), 126 | []) 127 | eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[2]), 128 | [("", 5), ("C", 1)]) 129 | eq_(evidence.filter( 130 | drop_duplicates=True, min_mapping_quality=60).allele_summary( 131 | loci[2]), 132 | [("", 5), ("C", 1)]) 133 | eq_(evidence.filter(drop_duplicates=True, 134 | min_mapping_quality=61).allele_summary(loci[2]), [("", 2)]) 135 | eq_(evidence.filter(drop_duplicates=True, 136 | min_mapping_quality=61).allele_summary(loci[3]), [("A", 2)]) 137 | eq_(evidence.filter(drop_duplicates=True, 138 | min_mapping_quality=61).allele_summary(loci[4]), [("AAA", 2)]) 139 | eq_(evidence.filter(drop_duplicates=True, 140 | min_mapping_quality=61).allele_summary(loci[5]), [("AAAC", 2)]) 141 | eq_(evidence.filter().allele_summary(loci[6]), [("T", 5), ("C", 3)]) 142 | eq_(evidence.filter(min_base_quality=30).allele_summary(loci[6]), 143 | [("T", 4), ("C", 3)]) 144 | eq_(evidence.filter().allele_summary(loci[7]), 145 | [("CTT", 5), ("CCT", 3)]) 146 | eq_(evidence.filter(min_base_quality=30).allele_summary(loci[7]), 147 | [("CTT", 3), ("CCT", 2)]) 148 | eq_(evidence.filter(min_base_quality=32).allele_summary(loci[2]), 149 | [("", 6), ("C", 1)]) 150 | eq_(filtered_read_names(evidence.at(loci[2]).filter(min_base_quality=32)), 151 | {'20GAVAAXX100126:4:3:18352:43857'}) 152 | eq_(evidence.allele_summary(loci[8]), []) 153 | eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[8]), []) 154 | assert_raises(KeyError, 155 | evidence.allele_summary, 156 | Locus.from_inclusive_coordinates("20", 10009174, 10009176)) 157 | eq_(filtered_read_names( 158 | evidence.at(loci[9]).filter(drop_improper_mate_pairs=True)), 159 | {'20FUKAAXX100202:8:68:1530:49310'}) 160 | 
eq_(len(evidence.at(loci[8]).read_attribute('mapping_quality')), 0) 161 | eq_(list(evidence.at(loci[9]).read_attribute('mapping_quality')), 162 | list(evidence.at(loci[9]).read_attributes().mapping_quality)) 163 | eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[10]), 164 | [('C', 2), ('CA', 1), ('CAA', 1)]) 165 | eq_(evidence.filter(drop_duplicates=True).allele_summary( 166 | Locus.from_interbase_coordinates( 167 | loci[10].contig, loci[10].start, loci[10].start)), 168 | [('', 2), ('A', 1), ('AA', 1)]) 169 | 170 | 171 | def test_read_evidence_variant_matching_gatk_mini_bundle_extract(): 172 | handle = Samfile(data_path("gatk_mini_bundle_extract.bam")) 173 | 174 | loci = [ 175 | Locus.from_inclusive_coordinates("20", 10008951), # 0 176 | Locus.from_inclusive_coordinates("20", 10009053), # 1 177 | Locus.from_inclusive_coordinates("20", 10009053, 10009054), # 2 178 | Locus.from_inclusive_coordinates("20", 10006822), # 3 179 | Locus.from_inclusive_coordinates("20", 10006822, 10006823), # 4 180 | 181 | ] 182 | evidence = PileupCollection.from_bam(handle, loci) 183 | 184 | eq_(evidence.match_summary(Variant(loci[0], "A", "C")), 185 | [('A', 1), ('C', 4)]) 186 | eq_(evidence.filter(drop_duplicates=True).match_summary( 187 | Variant(loci[0], "A", "C")), 188 | [('A', 0), ('C', 3)]) 189 | eq_(evidence.match_summary(Variant(loci[1], "A", "C")), 190 | [('A', 3), ('C', 0)]) 191 | eq_(evidence.match_summary(Variant(loci[1], "A", "CC")), 192 | [('A', 3), ('CC', 0)]) 193 | eq_(evidence.match_summary(Variant(loci[1], "A", "")), 194 | [('A', 3), ('', 0)]) 195 | eq_(evidence.match_summary(Variant(loci[1], "A", "")), 196 | [('A', 3), ('', 0)]) 197 | eq_(evidence.match_summary(Variant(loci[2], "AT", "")), 198 | [('AT', 3), ('', 0)]) 199 | eq_(evidence.match_summary(Variant(loci[3], "A", "")), 200 | [('A', 2), ('', 6)]) 201 | eq_(evidence.match_summary(Variant(loci[4], "AC", "")), 202 | [('AC', 2), ('', 6)]) 203 | eq_(evidence.match_summary( 204 | Variant(loci[4], "AC", ""), 205 | lambda e: e.read_attributes().mapping_quality.mean()), 206 | [('AC', 60.0), ('', 65.0)]) 207 | 208 | def test_read_evidence_variant_matching_gatk_bundle_native_varcode_variant(): 209 | # Try native varcode Variant. 210 | handle = Samfile(data_path("gatk_mini_bundle_extract.bam")) 211 | locus = Locus.from_inclusive_coordinates("20", 10008951) 212 | variant = VarcodeVariant( 213 | locus.contig, 214 | locus.position + 1, # inclusive not interbase 215 | "A", 216 | "C") 217 | evidence = PileupCollection.from_bam(handle, [variant]) 218 | eq_(evidence.match_summary(variant), 219 | [('A', 1), ('C', 4)]) 220 | 221 | 222 | def test_read_evidence_variant_matching_gatk_mini_bundle_extract_warning(): 223 | filename = data_path("gatk_mini_bundle_extract.bam") 224 | 225 | # Should log a warning but pass. 226 | loci = [ 227 | Locus.from_inclusive_coordinates("20", 10009053, 10009054), # 0 228 | ] 229 | evidence = PileupCollection.from_bam(filename, loci) 230 | eq_(evidence.match_summary(Variant(loci[0], "A", "")), 231 | [('A', 0), ('', 0), ('AT', 3)]) 232 | 233 | 234 | -------------------------------------------------------------------------------- /tests/test_reads.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import absolute_import 16 | 17 | import functools 18 | 19 | from nose.tools import eq_ 20 | 21 | from varlens.commands import reads 22 | 23 | from . import data_path, run_and_parse_csv, cols_concat, temp_file 24 | 25 | run = functools.partial(run_and_parse_csv, reads.run) 26 | 27 | expected_cols = ( 28 | "source,query_name,reference_start,reference_end,cigarstring").split(',') 29 | 30 | def test_basic(): 31 | result = run([ 32 | data_path("CELSR1/bams/bam_0.bam"), 33 | ]) 34 | eq_(result.shape, (953, len(expected_cols))) 35 | 36 | result = run([ 37 | data_path("CELSR1/bams/bam_0.bam"), 38 | "--is-duplicate", 39 | ]) 40 | eq_(result.shape, (173, len(expected_cols))) 41 | 42 | result = run([ 43 | data_path("CELSR1/bams/bam_0.bam"), 44 | "--is-read1", 45 | ]) 46 | eq_(result.shape, (481, len(expected_cols))) 47 | 48 | result = run([ 49 | data_path("CELSR1/bams/bam_0.bam"), 50 | "--is-read2", 51 | ]) 52 | eq_(result.shape, (472, len(expected_cols))) 53 | 54 | def test_loci_filtering(): 55 | result = run([ 56 | data_path("CELSR1/bams/bam_5.bam"), 57 | ]) 58 | eq_(result.shape, (37053, len(expected_cols))) 59 | 60 | result = run([ 61 | data_path("CELSR1/bams/bam_5.bam"), 62 | "--locus", "chr22:46930257-46930259" 63 | ]) 64 | eq_(result.shape, (1795, len(expected_cols))) 65 | 66 | result = run([ 67 | data_path("CELSR1/bams/bam_5.bam"), 68 | "--locus", "chr22/46930256-46930259" 69 | ]) 70 | eq_(result.shape, (1795, len(expected_cols))) 71 | 72 | result = run([ 73 | data_path("CELSR1/bams/bam_5.bam"), 74 | "--locus", "chr22:46930257-46930257" 75 | ]) 76 | eq_(result.shape, (1753, len(expected_cols))) 77 | 78 | result = run([ 79 | data_path("CELSR1/bams/bam_5.bam"), 80 | "--locus", "chr22:46930257" 81 | ]) 82 | eq_(result.shape, (1753, len(expected_cols))) 83 | 84 | result = run([ 85 | data_path("CELSR1/bams/bam_5.bam"), 86 | "--locus", "chr22/46930256" 87 | ]) 88 | eq_(result.shape, (1753, len(expected_cols))) 89 | 90 | def test_read_filtering(): 91 | result = run([ 92 | data_path("CELSR1/bams/bam_5.bam"), 93 | "--reference-start", '46932059', 94 | ]) 95 | eq_(result.shape, (26, len(expected_cols))) 96 | 97 | result = run([ 98 | data_path("CELSR1/bams/bam_5.bam"), 99 | "--reference-start", '46932059', 100 | "--query-name-contains", '57841', 101 | ]) 102 | eq_(result.shape, (1, len(expected_cols))) 103 | 104 | def test_round_trip(): 105 | with temp_file(".bam") as out: 106 | reads.run([ 107 | data_path("CELSR1/bams/bam_5.bam"), 108 | "--locus", "chr22/46930276", 109 | "--locus", "chr22/46930256", 110 | "--out", out, 111 | ]) 112 | result1 = run([ 113 | out, 114 | ]) 115 | result2 = run([ 116 | data_path("CELSR1/bams/bam_5.bam"), 117 | "--locus", "chr22/46930276", 118 | "--locus", "chr22/46930256", 119 | ]) 120 | eq_(sorted(cols_concat(result1, expected_cols[1:])), 121 | sorted(cols_concat(result2, expected_cols[1:]))) 122 | 123 | def test_round_trip_sam(): 124 | with temp_file(".sam") as out: 125 | print(out) 126 | reads.run([ 127 | data_path("CELSR1/bams/bam_5.bam"), 128 | "--locus", "chr22/46930276", 129 | "--locus", 
"chr22/46930256", 130 | "--out", out, 131 | ]) 132 | result1 = run([ 133 | out, 134 | ]) 135 | result2 = run([ 136 | data_path("CELSR1/bams/bam_5.bam"), 137 | "--locus", "chr22/46930276", 138 | "--locus", "chr22/46930256", 139 | ]) 140 | eq_(sorted(cols_concat(result1, expected_cols[1:])), 141 | sorted(cols_concat(result2, expected_cols[1:]))) 142 | -------------------------------------------------------------------------------- /tests/test_variants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import absolute_import 16 | 17 | import subprocess 18 | import warnings 19 | import logging 20 | 21 | import pandas 22 | import numpy 23 | from nose.tools import eq_ 24 | 25 | from varlens.commands import variants 26 | 27 | from . import data_path, run_and_parse_csv, cols_concat, temp_file 28 | 29 | def run(args): 30 | logging.info("Running with args: " + ' '.join(args)) 31 | return run_and_parse_csv(variants.run, args) 32 | 33 | reference_fasta = data_path("chr22.no_line_wrap.fa") 34 | 35 | expected_cols = [ 36 | "genome", "contig", "interbase_start", "interbase_end", "ref", "alt", 37 | ] 38 | 39 | def test_basic(): 40 | result = run([ 41 | data_path("CELSR1/vcfs/vcf_1.vcf"), 42 | "--genome", "b37", 43 | ]) 44 | eq_(sorted(cols_concat(result, expected_cols)), sorted({ 45 | "GRCh37-22-46931059-46931060-A-C", 46 | "GRCh37-22-21829554-21829555-T-G", 47 | "GRCh37-22-46931061-46931062-G-A", 48 | "GRCh37-22-50636217-50636218-A-C", 49 | "GRCh37-22-50875932-50875933-A-C", 50 | })) 51 | 52 | def test_genes_and_effects(): 53 | result = run([ 54 | data_path("CELSR1/vcfs/vcf_1.vcf"), 55 | "--genome", "b37", 56 | "--include-effect", 57 | "--include-gene", 58 | "--rename-column", "gene", "genez", 59 | ]) 60 | eq_(sorted(cols_concat(result, expected_cols + ["effect", "genez"])), 61 | sorted({ 62 | 'GRCh37-22-21829554-21829555-T-G-non-coding-transcript-PI4KAP2', 63 | 'GRCh37-22-46931059-46931060-A-C-p.S670A-CELSR1', 64 | 'GRCh37-22-46931061-46931062-G-A-p.S669F-CELSR1', 65 | 'GRCh37-22-50636217-50636218-A-C-intronic-TRABD', 66 | 'GRCh37-22-50875932-50875933-A-C-splice-acceptor-PPP6R2', 67 | })) 68 | 69 | def test_context(): 70 | result = run([ 71 | data_path("CELSR1/vcfs/vcf_1.vcf"), 72 | "--genome", "b37", 73 | "--include-context", 74 | "--context-num-bases", "5", 75 | "--reference", reference_fasta, 76 | ]) 77 | eq_(sorted(cols_concat(result, 78 | expected_cols + [ 79 | "context_5_prime", "context_3_prime", "context_mutation"])), 80 | sorted({ 81 | "GRCh37-22-46931059-46931060-A-C-GCTCC-CCACC-T>G", 82 | "GRCh37-22-21829554-21829555-T-G-CATGA-AGTGA-T>G", 83 | "GRCh37-22-46931061-46931062-G-A-GAGCT-CTCCA-C>T", 84 | "GRCh37-22-50636217-50636218-A-C-AGGGA-GGGCA-T>G", 85 | "GRCh37-22-50875932-50875933-A-C-AGGCC-GGGAG-T>G", 86 | })) 87 | 88 | def test_mhc_binding_affinity(): 89 | # If netMHC is not installed, we 
skip this test 90 | try: 91 | # If this succeeds (no exception), we do nothing. 92 | subprocess.call( 93 | "netMHC", stdout=subprocess.PIPE, stderr=subprocess.PIPE) 94 | except OSError: 95 | warnings.warn("netMHC not installed, skipping mhc binding test") 96 | return 97 | 98 | with temp_file(".csv") as out_csv: 99 | run([ 100 | data_path("CELSR1/vcfs/vcf_1.vcf"), 101 | "--genome", "b37", 102 | "--include-mhc-binding", 103 | "--hla", "A:02:01 A:02:02", 104 | "--out", out_csv, 105 | ]) 106 | ['GRCh37-22-21829554-21829555-T-G-nan-nan', 107 | 'GRCh37-22-46931059-46931060-A-C-377.3-A:02:02', 108 | 'GRCh37-22-46931061-46931062-G-A-77.2-A:02:02', 109 | 'GRCh37-22-50636217-50636218-A-C-nan-nan', 110 | 'GRCh37-22-50875932-50875933-A-C-nan-nan'] 111 | 112 | results = pandas.read_csv(out_csv).set_index(expected_cols) 113 | assert numpy.isnan(results.loc[ 114 | ("GRCh37", 22, 21829554, 21829555, "T", "G")].binding_affinity) 115 | assert numpy.isnan(results.loc[ 116 | ("GRCh37", 22, 21829554, 21829555, "T", "G")].binding_allele) 117 | eq_(results.loc[ 118 | ("GRCh37", 22, 46931059, 46931060, "A", "C")].binding_allele, 119 | "A:02:02") 120 | eq_(results.loc[ 121 | ("GRCh37", 22, 46931061, 46931062, "G", "A")].binding_allele, 122 | "A:02:02") 123 | 124 | def test_read_evidence(): 125 | result = run([ 126 | data_path("CELSR1/vcfs/vcf_1.vcf"), 127 | "--include-read-evidence", 128 | "--reads", data_path("CELSR1/bams/bam_0.bam"), 129 | "--genome", "b37", 130 | ]) 131 | allele_groups = ["num_ref", "num_alt", "total_depth"] 132 | for allele_group in allele_groups: 133 | result[allele_group] = result[allele_group].astype(int) 134 | eq_(cols_concat( 135 | result, 136 | ["contig", "interbase_start"] + allele_groups), 137 | { 138 | '22-50636217-0-0-0', 139 | '22-50875932-0-0-0', 140 | '22-21829554-0-0-0', 141 | "22-46931059-50-0-50", 142 | "22-46931061-51-0-51", 143 | }) 144 | 145 | # Same thing but with chunk rows = 1 146 | with temp_file(".csv") as out_csv: 147 | run([ 148 | data_path("CELSR1/vcfs/vcf_1.vcf"), 149 | "--include-read-evidence", 150 | "--reads", data_path("CELSR1/bams/bam_0.bam"), 151 | "--genome", "b37", 152 | "--chunk-rows", "1", 153 | "--out", out_csv, 154 | ]) 155 | result = pandas.read_csv(out_csv) 156 | 157 | allele_groups = ["num_ref", "num_alt", "total_depth"] 158 | for allele_group in allele_groups: 159 | result[allele_group] = result[allele_group].astype(int) 160 | eq_(cols_concat( 161 | result, 162 | ["contig", "interbase_start"] + allele_groups), 163 | { 164 | '22-50636217-0-0-0', 165 | '22-50875932-0-0-0', 166 | '22-21829554-0-0-0', 167 | "22-46931059-50-0-50", 168 | "22-46931061-51-0-51", 169 | }) 170 | 171 | result = run([ 172 | "--include-read-evidence", 173 | "--reads", data_path("gatk_mini_bundle_extract.bam"), 174 | "--read-source-name", "foo", 175 | "--single-variant", "chr20:10008951", "C", "A", 176 | "--genome", "b37", 177 | ]) 178 | for allele_group in allele_groups: 179 | result[allele_group] = result[allele_group].astype(int) 180 | eq_(cols_concat(result, expected_cols + allele_groups), 181 | {"GRCh37-20-10008950-10008951-C-A-4-1-5"}) 182 | 183 | result = run([ 184 | "--include-read-evidence", 185 | "--reads", data_path("gatk_mini_bundle_extract.bam"), 186 | "--read-source-name", "foo", 187 | "--single-variant", "chr20:10008951", "C", "A", 188 | "--genome", "b37", 189 | "--is-reverse", 190 | ]) 191 | for allele_group in allele_groups: 192 | result[allele_group] = result[allele_group].astype(int) 193 | eq_(cols_concat(result, expected_cols + allele_groups), 194 | 
{"GRCh37-20-10008950-10008951-C-A-1-0-1"}) 195 | 196 | 197 | def test_filtering(): 198 | result = run([ 199 | data_path("CELSR1/vcfs/vcf_1.vcf"), 200 | "--genome", "b37", 201 | "--ref", "A", 202 | ]) 203 | eq_(sorted(cols_concat(result, expected_cols)), sorted({ 204 | "GRCh37-22-46931059-46931060-A-C", 205 | "GRCh37-22-50636217-50636218-A-C", 206 | "GRCh37-22-50875932-50875933-A-C", 207 | })) 208 | 209 | result = run([ 210 | data_path("CELSR1/vcfs/vcf_1.vcf"), 211 | "--genome", "b37", 212 | "--ref", "A", 213 | "--variant-locus", "22:50636218", 214 | ]) 215 | eq_(sorted(cols_concat(result, expected_cols)), sorted({ 216 | "GRCh37-22-50636217-50636218-A-C", 217 | })) 218 | 219 | result = run([ 220 | data_path("CELSR1/vcfs/vcf_1.vcf"), 221 | data_path("CELSR1/vcfs/vcf_2.vcf"), 222 | "--alt", "C", "G", 223 | "--genome", "b37" 224 | ]) 225 | eq_(sorted(cols_concat(result, expected_cols)), sorted({ 226 | "GRCh37-22-21829554-21829555-T-G", 227 | "GRCh37-22-45309892-45309893-T-G", 228 | "GRCh37-22-46931059-46931060-A-C", 229 | "GRCh37-22-50636217-50636218-A-C", 230 | "GRCh37-22-50875932-50875933-A-C", 231 | })) 232 | 233 | ''' 234 | def test_fields(): 235 | result = run([ 236 | "--field", 237 | "foo:ref.lower()", 238 | "gene_names[0]", 239 | "--variants", data_path("CELSR1/vcfs/vcf_1.vcf"), 240 | "--variant-filter", "ref=='A'", 241 | "--variant-genome", "b37" 242 | ]) 243 | eq_(sorted(cols_concat(result, expected_cols + ["foo", "gene_names[0]"])), 244 | sorted({ 245 | "GRCh37-22-46931059-46931060-A-C-a-CELSR1", 246 | "GRCh37-22-50636217-50636218-A-C-a-TRABD", 247 | "GRCh37-22-50875932-50875933-A-C-a-PPP6R2", 248 | })) 249 | ''' 250 | def test_round_trip(): 251 | with temp_file(".csv") as out_csv: 252 | variants.run([ 253 | data_path("CELSR1/vcfs/vcf_1.vcf"), 254 | "--out", out_csv, 255 | "--genome", "b37", 256 | "--ref", "A", 257 | "--include-gene", 258 | ]) 259 | result1 = pandas.read_csv(out_csv) 260 | eq_(sorted(cols_concat( 261 | result1, expected_cols + ["gene"])), 262 | sorted({ 263 | "GRCh37-22-46931059-46931060-A-C-CELSR1", 264 | "GRCh37-22-50636217-50636218-A-C-TRABD", 265 | "GRCh37-22-50875932-50875933-A-C-PPP6R2", 266 | })) 267 | 268 | result2 = run([ 269 | out_csv, 270 | "--include-gene", 271 | ]) 272 | eq_(sorted(cols_concat( 273 | result2, 274 | expected_cols + ["gene"])), 275 | sorted({ 276 | "GRCh37-22-46931059-46931060-A-C-CELSR1", 277 | "GRCh37-22-50636217-50636218-A-C-TRABD", 278 | "GRCh37-22-50875932-50875933-A-C-PPP6R2", 279 | })) 280 | 281 | def test_distinct_variants(): 282 | result = run([ 283 | data_path("CELSR1/vcfs/vcf_1.vcf"), 284 | data_path("CELSR1/vcfs/vcf_1.vcf"), 285 | "--genome", "b37", 286 | "--ref", "A", "T", 287 | "--variant-source-name", "first", "second", 288 | ]) 289 | eq_(sorted(cols_concat(result, expected_cols + ["sources"])), 290 | sorted({ 291 | "GRCh37-22-21829554-21829555-T-G-first second", 292 | "GRCh37-22-46931059-46931060-A-C-first second", 293 | "GRCh37-22-50636217-50636218-A-C-first second", 294 | "GRCh37-22-50875932-50875933-A-C-first second", 295 | })) 296 | 297 | -------------------------------------------------------------------------------- /varlens/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import absolute_import 16 | 17 | from . import read_evidence, util, loci_util 18 | 19 | __version__ = "0.1.4" 20 | 21 | __all__ = [ 22 | "loci_util", 23 | "read_evidence", 24 | "util", 25 | ] 26 | -------------------------------------------------------------------------------- /varlens/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import sys 14 | import logging 15 | import warnings 16 | 17 | # Biopython (used by varcode) throws a warning when sequences are compared. 18 | warnings.filterwarnings("ignore", message="Biopython Seq objects") 19 | 20 | def configure_logging(args=None): 21 | if args is not None and args.verbose: 22 | level = logging.DEBUG 23 | else: 24 | level = logging.INFO 25 | 26 | logging.basicConfig( 27 | format="%(asctime)s.%(msecs)d %(levelname)s %(module)s - %(funcName)s:" 28 | " %(message)s", 29 | datefmt="%Y-%m-%d %H:%M:%S", 30 | stream=sys.stderr, 31 | level=level) 32 | 33 | -------------------------------------------------------------------------------- /varlens/commands/allele_support.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | ''' 14 | Given one or more BAMs and some genomic sites to consider, write a csv file 15 | giving counts of reads supporting each allele at each site for each BAM. 16 | 17 | The genomic sites to consider may be specified by locus (--locus option), or via 18 | one or more VCF files. 19 | 20 | The positions outputted by this command are in *interbase coordinates*, i.e. 21 | starting at 0, inclusive on first index, exclusive on second (as opposed to 22 | the one-based inclusive coordinates used in VCF files). 
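(Illustration, not part of the original docstring: the VCF record at one-based position chr22:46931062 — ref G, alt A in tests/data/CELSR1/vcfs/vcf_1.vcf — is reported by this command with interbase_start 46931061 and interbase_end 46931062, matching the "22-46931061-46931062" rows seen in the test suite.)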
23 | 24 | Example: 25 | 26 | %(prog)s \\ 27 | --reads test/data/CELSR1/bams/bam_1.bam \\ 28 | --locus 22:46931061 22:46931063 29 | 30 | ''' 31 | 32 | import argparse 33 | import csv 34 | import sys 35 | import logging 36 | 37 | from .. import loci_util 38 | from .. import reads_util 39 | from .. import variants_util 40 | 41 | from . import configure_logging 42 | from .. import support 43 | from ..read_evidence.pileup_collection import to_locus 44 | 45 | parser = argparse.ArgumentParser(usage=__doc__) 46 | group = parser.add_argument_group("output arguments") 47 | group.add_argument("--out") 48 | group.add_argument("-v", "--verbose", action="store_true", default=False) 49 | loci_util.add_args(parser.add_argument_group("loci specification")) 50 | variants_util.add_args(parser) 51 | reads_util.add_args(parser) 52 | 53 | def run(raw_args=sys.argv[1:]): 54 | args = parser.parse_args(raw_args) 55 | configure_logging(args) 56 | 57 | loci = loci_util.load_from_args(args) # may be None 58 | variants_df = variants_util.load_from_args_as_dataframe(args) 59 | if variants_df is not None: 60 | variant_loci = loci_util.Loci( 61 | to_locus(variant) 62 | for variant in variants_df["variant"]) 63 | loci = variant_loci if loci is None else loci.union(variant_loci) 64 | 65 | if not loci: 66 | if variants_df is not None: 67 | parser.error("No loci: variants specified but none remained " 68 | "after filtering") 69 | else: 70 | parser.error("No genomic loci or variants specified.") 71 | 72 | logging.info("Loaded %d genomic loci." % len(loci)) 73 | 74 | read_sources = reads_util.load_from_args(args) 75 | 76 | if read_sources is None: 77 | parser.error("No read sources (--reads argument) specified.") 78 | 79 | out_fd = open(args.out, "w") if args.out else sys.stdout 80 | writer = csv.writer(out_fd) 81 | 82 | rows_generator = support.allele_support_rows(loci, read_sources) 83 | for (i, row) in enumerate(rows_generator): 84 | if i == 0: 85 | writer.writerow(row.index.tolist()) 86 | writer.writerow([str(x) for x in row]) 87 | 88 | if out_fd is not sys.stdout: 89 | out_fd.close() 90 | print("Wrote: %s" % args.out) 91 | -------------------------------------------------------------------------------- /varlens/commands/reads.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | ''' 14 | Filter reads from one or more BAMs and output a CSV or a new BAM. 15 | 16 | Loci and VCF files may be specified, in which case reads are filtered to 17 | overlap the specified loci or variants. 
18 | 19 | Examples: 20 | 21 | Print basic fields for the reads in a BAM: 22 | 23 | %(prog)s test/data/CELSR1/bams/bam_0.bam 24 | 25 | Same as above but filter only to reads aligned on the (-) strand, write to a 26 | file instead of stdout, and also include the mapping quality and sequenced 27 | bases in the output: 28 | 29 | %(prog)s test/data/CELSR1/bams/bam_0.bam \\ 30 | --is-reverse \\ 31 | --field mapping_quality query_alignment_sequence \\ 32 | --out /tmp/result.csv 33 | 34 | Write a bam file consisting of reads with mapping quality >=30 and 35 | overlapping a certain locus: 36 | 37 | %(prog)s test/data/CELSR1/bams/bam_0.bam \\ 38 | --min-mapping-quality 30 \\ 39 | --locus 22:46932040-46932050 \\ 40 | --out /tmp/result.bam 41 | 42 | Write a bam file consisting of reads overlapping variants from a VCF: 43 | 44 | %(prog)s test/data/CELSR1/bams/bam_0.bam \\ 45 | --variants test/data/CELSR1/vcfs/vcf_1.vcf \\ 46 | --out /tmp/result.bam 47 | 48 | Print just the header for a BAM in csv format: 49 | 50 | %(prog)s test/data/CELSR1/bams/bam_0.bam --header 51 | 52 | ''' 53 | 54 | import argparse 55 | import sys 56 | import csv 57 | 58 | import pysam 59 | 60 | from . import configure_logging 61 | from .. import loci_util 62 | from .. import reads_util 63 | from .. import variants_util 64 | from ..read_evidence.pileup_collection import PileupCollection, to_locus 65 | 66 | STANDARD_FIELDS = [ 67 | "source", 68 | "query_name", 69 | "reference_start", 70 | "reference_end", 71 | "cigarstring", 72 | ] 73 | 74 | parser = argparse.ArgumentParser(usage=__doc__) 75 | group = parser.add_argument_group("output") 76 | group.add_argument("--out", 77 | help="Output file. Format is guessed from file extension: must be csv or " 78 | "bam. If not specified, csv is written to stdout.") 79 | group.add_argument("--field", nargs="+", default=[], 80 | help="Additional read fields to output as columns in the csv. See pysam " 81 | "documentation (http://pysam.readthedocs.org/en/latest/api.html) for the " 82 | "meaning of these fields. Valid fields include: %s" % ( 83 | " ".join(PileupCollection._READ_ATTRIBUTE_NAMES))) 84 | 85 | group.add_argument("--no-standard-fields", action="store_true", default=False, 86 | help="Do not include the standard fields (%s) in csv output." 87 | % ', '.join(STANDARD_FIELDS)) 88 | group.add_argument("--no-sort", action="store_true", default=False, 89 | help="When outputting a bam, do not call samtools sort.") 90 | group.add_argument( 91 | "--header", 92 | action="store_true", 93 | default=False, 94 | help="Output BAM/SAM header only.") 95 | group.add_argument( 96 | "--header-set", 97 | nargs=4, 98 | action="append", 99 | help="When outputting a bam, set a particular header field to the given " 100 | "value. Example: --header-set RG . 
SM my_sample") 101 | 102 | group.add_argument("-v", "--verbose", action="store_true", default=False) 103 | 104 | reads_util.add_args(parser, positional=True) 105 | loci_util.add_args(parser.add_argument_group("loci specification")) 106 | variants_util.add_args(parser) 107 | 108 | def run(raw_args=sys.argv[1:]): 109 | args = parser.parse_args(raw_args) 110 | configure_logging(args) 111 | 112 | read_sources = reads_util.load_from_args(args) 113 | if not read_sources: 114 | parser.error("No read sources specified.") 115 | 116 | loci = loci_util.load_from_args(args) # may be None 117 | variants_df = variants_util.load_from_args_as_dataframe(args) 118 | if variants_df is not None: 119 | variant_loci = loci_util.Loci( 120 | to_locus(variant) 121 | for variant in variants_df["variant"]) 122 | loci = variant_loci if loci is None else loci.union(variant_loci) 123 | 124 | if args.header: 125 | if loci is not None: 126 | parser.error("If specifying --header don't specify loci.") 127 | if args.field: 128 | parser.error("If specifying --header don't specify fields.") 129 | 130 | out_pysam_handle = None 131 | out_csv_writer = out_csv_fd = None 132 | if args.out and (args.out.endswith(".bam") or args.out.endswith(".sam")): 133 | if args.field: 134 | parser.error("Don't specify fields when outputting to bam or sam.") 135 | 136 | header = update_header(args, read_sources[0].handle.header) 137 | out_pysam_handle = pysam.AlignmentFile( 138 | args.out, 139 | "wb", 140 | header=header) 141 | 142 | elif not args.out or args.out.endswith(".csv"): 143 | out_csv_fd = open(args.out, "w") if args.out else sys.stdout 144 | out_csv_writer = csv.writer(out_csv_fd) 145 | 146 | if args.header: 147 | if args.field: 148 | parser.error("Don't specify fields when outputting header.") 149 | out_csv_writer.writerow([ 150 | "read_source", "group", "index", "key", "value", 151 | ]) 152 | else: 153 | columns = ( 154 | ([] if args.no_standard_fields else STANDARD_FIELDS) + 155 | args.field) 156 | out_csv_writer.writerow(columns) 157 | else: 158 | parser.error( 159 | "Don't know how to write to file with output extension: %s. " 160 | "Supported extensions: csv, bam, sam." % args.out) 161 | 162 | num_reads = 0 163 | for read_source in read_sources: 164 | if args.header: 165 | header = update_header(args, read_source.handle.header) 166 | for (group, i, key, value) in reads_util.flatten_header(header): 167 | out_csv_writer.writerow( 168 | [read_source.name, group, str(i), key, value]) 169 | continue # we don't look at reads at all. 
170 | for read in read_source.reads(loci): 171 | num_reads += 1 172 | if out_pysam_handle is not None: 173 | out_pysam_handle.write(read) 174 | if out_csv_writer is not None: 175 | out_csv_writer.writerow([ 176 | str(read_field(read_source, read, field)) 177 | for field in columns 178 | ]) 179 | 180 | if out_pysam_handle is not None: 181 | out_pysam_handle.close() 182 | if not args.no_sort: 183 | print("Sorting read file %s" % args.out) 184 | pysam.sort( 185 | "-o", args.out, 186 | "-T", "varlens_reads", args.out, 187 | catch_stdout=False) 188 | print("Wrote %d reads: %s" % (num_reads, args.out)) 189 | 190 | if out_csv_fd is not None and out_csv_fd is not sys.stdout: 191 | out_csv_fd.close() 192 | print("Wrote: %s" % args.out) 193 | 194 | 195 | def read_field(read_source, read, field_name): 196 | if field_name == 'source': 197 | return read_source.name 198 | 199 | if field_name.startswith("tag:"): 200 | tag_name = field_name[len("tag:"):] 201 | return read.get_tags().get(tag_name) 202 | 203 | try: 204 | return getattr(read, field_name) 205 | except AttributeError: 206 | raise ValueError("Invalid read field '%s'. Valid fields include: %s" 207 | % (field_name, ' '.join(dir(read)))) 208 | 209 | def update_header(args, header): 210 | if args.header_set: 211 | header = dict(header) 212 | for (group, index_string, key, value) in args.header_set: 213 | if not isinstance(header[group], list): 214 | header[group] = [header[group]] 215 | if index_string == ".": 216 | indices = range(len(header[group])) 217 | else: 218 | indices = [int(x) for x in index_string.split(",")] 219 | for index in indices: 220 | header[group][index][key] = value 221 | return header 222 | -------------------------------------------------------------------------------- /varlens/commands/util.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import argparse 14 | 15 | def load_variant_collections_parser(): 16 | parser = argparse.ArgumentParser(add_help=False) 17 | parser.add_argument("--variants", nargs="+", required=True) 18 | parser.add_argument("--ensembl-version") 19 | parser.add_argument("--variant-filter") 20 | return parser 21 | 22 | def load_read_sets_parser(): 23 | parser = argparse.ArgumentParser(add_help=False) 24 | parser.add_argument("--reads", nargs="+", default=[]) 25 | return parser 26 | 27 | -------------------------------------------------------------------------------- /varlens/commands/variants.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | ''' 14 | Given variants from one or more VCF or CSV files, apply filters, add additional 15 | columns, and output to CSV. 16 | 17 | Currently we can only output to CSV, not VCF. 18 | 19 | A number of useful annotations can be added for each variant by specifying 20 | options of the form '--include-XXX', e.g. '--include-gene'. See detailed help 21 | below. 22 | 23 | Examples: 24 | 25 | Print basic info for the variants found in two VCF files. Note that variants 26 | found in both files are listed in one row, and the 'sources' column lists 27 | the files each variant was found in: 28 | 29 | %(prog)s test/data/CELSR1/vcfs/vcf_1.vcf test/data/CELSR1/vcfs/vcf_2.vcf 30 | 31 | Same as the above but include additional columns giving varcode variant effect 32 | annotations and the genes the variants overlap, and write to a file: 33 | 34 | %(prog)s test/data/CELSR1/vcfs/vcf_1.vcf test/data/CELSR1/vcfs/vcf_2.vcf \\ 35 | --include-effect \\ 36 | --include-gene \\ 37 | --out /tmp/result.csv 38 | 39 | Print counts for number of reads supporting reference/variant/other alleles 40 | from the specified BAMs, counting only reads with mapping quality >= 10: 41 | 42 | %(prog)s test/data/CELSR1/vcfs/vcf_1.vcf \\ 43 | --include-read-evidence \\ 44 | --reads test/data/CELSR1/bams/*.bam \\ 45 | --min-mapping-quality 10 46 | 47 | ''' 48 | from __future__ import absolute_import 49 | 50 | import argparse 51 | import sys 52 | import logging 53 | 54 | from . import configure_logging 55 | from .. import variant_includes 56 | from .. import variants_util 57 | 58 | parser = argparse.ArgumentParser(usage=__doc__) 59 | variants_util.add_args(parser, positional=True) 60 | 61 | group = parser.add_argument_group("variant output") 62 | 63 | group.add_argument("--no-standard-columns", 64 | action="store_true", default=False, 65 | help="Don't write standard columns (genome, contig, start, end, ref, alt)") 66 | 67 | group.add_argument("--chunk-rows", metavar="N", type=int, 68 | help="Write out current results after processing N rows.") 69 | 70 | group.add_argument("--limit", metavar="N", type=int, 71 | help="Process only the first N variants (useful for testing)") 72 | 73 | group.add_argument("--columns", 74 | help="Column separated list of columns to output") 75 | 76 | group.add_argument("--rename-column", nargs=2, action="append", default=[], 77 | metavar="COL", 78 | help="Rename output column first argument to second. Can be specified " 79 | "multiple times by repeating the --rename-column option.") 80 | 81 | group.add_argument("--out", 82 | help="Output file. 
If not specified the CSV is written to stdout.") 83 | 84 | group.add_argument('--include-metadata', action="store_true", default=False, 85 | help="Output variant metadata when loading from VCF (info column, etc).") 86 | 87 | for includeable in variant_includes.INCLUDEABLES: 88 | includeable.add_args(parser) 89 | 90 | group.add_argument("-v", "--verbose", action="store_true", default=False) 91 | 92 | def run(raw_args=sys.argv[1:]): 93 | args = parser.parse_args(raw_args) 94 | configure_logging(args) 95 | 96 | df = variants_util.load_from_args_as_dataframe(args) 97 | if df is None: 98 | parser.error("No variants specified.") 99 | 100 | logging.info("Loaded %d variants." % df.shape[0]) 101 | 102 | # We run the inverse of the column renames on the input df. 103 | column_renames = {} 104 | if args.rename_column: 105 | column_renames = dict(args.rename_column) 106 | column_renames_inverse = dict((v, k) for (k, v) in args.rename_column) 107 | if len(column_renames) != len(column_renames_inverse): 108 | raise ValueError("Column renames are not 1:1") 109 | 110 | df.columns = [ 111 | column_renames_inverse.get(col, col) for col in df.columns 112 | ] 113 | 114 | def save(df): 115 | if column_renames: 116 | df = df.copy() 117 | df.columns = [column_renames.get(col, col) for col in df.columns] 118 | 119 | if args.columns: 120 | columns = [x.strip() for x in args.columns.split(",")] 121 | else: 122 | columns = [x for x in df.columns.tolist() if x != "variant"] 123 | if not args.include_metadata: 124 | columns = [ 125 | x for x in columns 126 | if not x.startswith("metadata") 127 | ] 128 | if args.no_standard_columns: 129 | columns = [ 130 | x for x in columns 131 | if x not in variants_util.STANDARD_DATAFRAME_COLUMNS 132 | ] 133 | 134 | df_save = df[columns].copy() 135 | df_save.interbase_start = df_save.interbase_start.astype(int) 136 | df_save.interbase_end = df_save.interbase_end.astype(int) 137 | 138 | if args.out is None: 139 | # Write to stdout. 140 | df_save.to_csv(sys.stdout, index=False) 141 | elif args.out.endswith(".csv"): 142 | df_save.to_csv(args.out, index=False) 143 | print("Wrote: %s" % args.out) 144 | else: 145 | parser.error("Unsupported output file extension: %s" % args.out) 146 | 147 | for includeable in variant_includes.INCLUDEABLES: 148 | if includeable.requested(args): 149 | logging.info("Running includeable: %s" % includeable.name) 150 | instance = includeable.from_args(args) 151 | for num_rows in instance.compute(df, chunk_rows=args.chunk_rows): 152 | if args.chunk_rows is not None: 153 | save(df) 154 | 155 | if args.chunk_rows is None: 156 | save(df) 157 | 158 | -------------------------------------------------------------------------------- /varlens/loci_util.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
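# Editorial sketch (not part of the original module) of how the Loci container
# defined below is typically built and queried; the loci strings are made up:
#
#     from varlens.locus import Locus
#     from varlens.loci_util import Loci
#     loci = Loci([Locus.parse("chr1:100-200"), Locus.parse("chr1:150-300")])
#     len(loci)                                   # 2 stored intervals
#     loci.intersects(Locus.parse("chr1:250"))    # True: 250 falls inside 150-300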
12 | 13 | import collections 14 | import intervaltree 15 | 16 | from .locus import Locus 17 | 18 | def add_args(parser): 19 | # TODO: 20 | # - Load intervals_list files 21 | parser.add_argument('--locus', nargs="+", default=[], 22 | help="Genomic locus, like chr1:2342332 or chr1:2342-23423. " 23 | "Any number of loci may be specified.") 24 | # parser.add_argument("--neighbor-offsets", 25 | # nargs="+", type=int, default=[], 26 | # help="") 27 | 28 | def load_from_args(args): 29 | """ 30 | Return a Loci object giving the loci specified on the command line. 31 | 32 | If no loci-related arguments are specified, return None. This makes it 33 | possible to distinguish an empty set of loci, for example due to filters 34 | removing all loci, from the case where the user didn't specify any 35 | arguments. 36 | """ 37 | if not args.locus: 38 | return None 39 | 40 | loci_iterator = (Locus.parse(locus) for locus in args.locus) 41 | 42 | # if args.neighbor_offsets: 43 | # loci_iterator = expand_with_neighbors( 44 | # loci_iterator, args.neighbor_offsets) 45 | 46 | return Loci(loci_iterator) 47 | 48 | # def expand_with_neighbors(loci_iterator, neighbor_offsets): 49 | # offsets = sorted(set(neighbor_offsets + [0])) 50 | # for locus in loci_iterator: 51 | # for offset in offsets: 52 | # if offset == 0: 53 | # yield locus 54 | # else: 55 | # yield Locus( 56 | # locus.contig, locus.start + offset, locus.end + offset) 57 | 58 | class Loci(object): 59 | def __init__(self, locus_iterator=[], contig_map=None): 60 | self.contigs = collections.defaultdict(intervaltree.IntervalTree) 61 | if contig_map: 62 | self.contigs.update(contig_map) 63 | for locus in locus_iterator: 64 | self.contigs[locus.contig].addi(locus.start, locus.end) 65 | 66 | def __iter__(self): 67 | for contig in sorted(self.contigs): 68 | for interval in self.contigs[contig]: 69 | yield Locus(contig, interval.begin, interval.end) 70 | 71 | def __len__(self): 72 | return sum(len(tree) for tree in self.contigs.values()) 73 | 74 | def intersects(self, locus): 75 | return self.contigs[locus.contig].overlaps(locus.start, locus.end) 76 | 77 | def union(self, other): 78 | contig_map = {} 79 | for contig in set(self.contigs).union(other.contigs): 80 | contig_map[contig] = self.contigs[contig].union( 81 | other.contigs[contig]) 82 | return Loci(contig_map=contig_map) 83 | -------------------------------------------------------------------------------- /varlens/locus.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import re 14 | from collections import namedtuple 15 | 16 | import pyensembl 17 | import typechecks 18 | 19 | class Locus(namedtuple("Locus", "contig start end")): 20 | ''' 21 | A genomic interval in 0-indexed interbase coordinates. 
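For example (illustrative): the single reference base at 1-based position 100 is stored as start=99, end=100, and the two-base interval covering 1-based positions 100-101 as start=99, end=101. The factory methods below perform this conversion.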
22 | 23 | See this blog post for a discussion on coordinate systems: 24 | http://alternateallele.blogspot.com/2012/03/genome-coordinate-conventions.html 25 | ''' 26 | 27 | @property 28 | def inclusive_start(self): 29 | return self.start + 1 30 | 31 | @property 32 | def inclusive_end(self): 33 | return self.end 34 | 35 | @property 36 | def positions(self): 37 | ''' 38 | A Python range object giving the bases included in this locus. 39 | ''' 40 | return range(self.start, self.end) 41 | 42 | @property 43 | def position(self): 44 | ''' 45 | If this locus spans a single base, this property gives that position. 46 | Otherwise, raises a ValueError. 47 | ''' 48 | if self.end != self.start + 1: 49 | raise ValueError("Not a single base: %s" % str(self)) 50 | return self.start 51 | 52 | # Factory functions. 53 | @staticmethod 54 | def from_inclusive_coordinates(contig, start, end=None): 55 | ''' 56 | Given coordinates in 1-based coordinates that are inclusive on start 57 | and end, return a Locus instance. Locus instances are always 0-based 58 | "interbase" coordinates. 59 | ''' 60 | typechecks.require_string(contig) 61 | typechecks.require_integer(start) 62 | if end is None: 63 | end = start 64 | typechecks.require_integer(end) 65 | contig = pyensembl.locus.normalize_chromosome(contig) 66 | return Locus(contig, start - 1, end) 67 | 68 | @staticmethod 69 | def from_interbase_coordinates(contig, start, end=None): 70 | ''' 71 | Given coordinates in 0-based interbase coordinates, return a Locus 72 | instance. 73 | ''' 74 | typechecks.require_string(contig) 75 | typechecks.require_integer(start) 76 | if end is None: 77 | end = start + 1 78 | typechecks.require_integer(end) 79 | contig = pyensembl.locus.normalize_chromosome(contig) 80 | return Locus(contig, start, end) 81 | 82 | @staticmethod 83 | def parse(string): 84 | match = re.match(r'(\w+)([:/])(\d+)(-(\d+))?', string) 85 | if match is None: 86 | raise ValueError("Couldn't parse locus: %s. " 87 | "Expected format is: chr5:3332 or chr5:3332-5555 for " 88 | "inclusive 1-based coordinates and chr5/3331 or " 89 | "chr5/3331-5554 for half-open 0-based coordinates.") 90 | 91 | (contig, symbol, start, _, maybe_end) = match.groups() 92 | start = int(start) 93 | end = int(maybe_end) if maybe_end is not None else None 94 | 95 | if symbol == ":": 96 | # inclusive coordinatess 97 | return Locus.from_inclusive_coordinates(contig, start, end) 98 | else: 99 | # interbase coordinates 100 | assert symbol == "/" 101 | return Locus.from_interbase_coordinates(contig, start, end) -------------------------------------------------------------------------------- /varlens/mhc_binding.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
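# Editorial usage sketch (assumes varcode, mhctools, and topiary are installed
# and NetMHCpan is on the PATH; the VCF path and HLA allele are illustrative,
# not fixtures this module depends on):
#
#     import varcode
#     from varlens import mhc_binding
#     variants = varcode.load_vcf("tests/data/CELSR1/vcfs/vcf_1.vcf")
#     df = mhc_binding.binding_affinities(variants, ["HLA-A*02:01"])
#     # df columns: variant, binding_affinity (nM), binding_allele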
12 | 13 | import collections 14 | 15 | import pandas 16 | import varcode 17 | 18 | CACHED_BINDING_AFFINITIES = {} # (variant, allele -> nm affinity) 19 | BINDING_PREDICTORS = {} 20 | def binding_affinities(variants, alleles, epitope_lengths=[8, 9, 10, 11]): 21 | # We import these here so we don't depend on these libraries unless this 22 | # function is called. 23 | import mhctools 24 | import topiary 25 | 26 | for allele in alleles: 27 | if allele not in BINDING_PREDICTORS: 28 | BINDING_PREDICTORS[allele] = mhctools.NetMHCpan( 29 | [allele], default_peptide_lengths=epitope_lengths) 30 | predictor = BINDING_PREDICTORS[allele] 31 | predictions = topiary.predict_epitopes_from_variants( 32 | varcode.VariantCollection([ 33 | v for v in variants 34 | if (v, allele) not in CACHED_BINDING_AFFINITIES 35 | ]), 36 | predictor, 37 | ic50_cutoff=float('inf'), 38 | percentile_cutoff=100) 39 | if len(predictions) > 0: 40 | predictions_df = pandas.DataFrame( 41 | predictions, columns=predictions[0]._fields) 42 | values = predictions_df.groupby("variant")["value"].min() 43 | for (variant, value) in zip(values.index, values): 44 | CACHED_BINDING_AFFINITIES[(variant, allele)] = value 45 | 46 | result_df = collections.defaultdict(list) 47 | for variant in variants: 48 | (binding_affinity, binding_allele) = min( 49 | (CACHED_BINDING_AFFINITIES.get((variant, allele), float('nan')), 50 | allele) 51 | for allele in alleles) 52 | if pandas.isnull(binding_affinity): 53 | binding_allele = None 54 | result_df["variant"].append(variant) 55 | result_df["binding_affinity"].append(binding_affinity) 56 | result_df["binding_allele"].append(binding_allele) 57 | 58 | return pandas.DataFrame(result_df) 59 | 60 | -------------------------------------------------------------------------------- /varlens/read_evidence/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | ''' 13 | This subpackage provides functionality for collecting and filtering aligned 14 | sequencing reads from a BAM file, determining the alleles they suggest at 15 | a locus, and assesing the evidence for particular variants. 16 | 17 | In this subpackage, the records stored in the BAM file are referred to as 18 | "alignments," whereas the term "read" may be more familiar. We use the term 19 | "alignment" for consistency with the SAM specification, and since an 20 | individual read from the sequencer may generate any number of alignments in 21 | the case of chimeric alignments and secondary alignments. 
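A typical flow (sketch using names defined in this subpackage): build a PileupCollection with PileupCollection.from_bam(...) for a set of loci, then inspect or filter the resulting Pileup and PileupElement objects, e.g. with Pileup.filter or PileupCollection.group_by_allele.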
22 | ''' 23 | 24 | from .util import alignment_key, read_key 25 | from .pileup import Pileup 26 | from .pileup_element import PileupElement 27 | from .pileup_collection import PileupCollection 28 | 29 | __all__ = [ 30 | "PileupCollection", 31 | "Pileup", 32 | "PileupElement", 33 | "alignment_key", 34 | "read_key", 35 | ] 36 | -------------------------------------------------------------------------------- /varlens/read_evidence/pileup.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from __future__ import absolute_import 14 | 15 | from collections import OrderedDict 16 | 17 | class Pileup(object): 18 | ''' 19 | A Pileup is a collection of PileupElement instances at a particular locus. 20 | 21 | Attributes 22 | ---------- 23 | locus : Varcode.Locus 24 | The reference locus. Must be length 1, i.e. a single base. 25 | 26 | elements : OrderedDict of PileupElement instances 27 | This is logically an ordered set, which we implement as an OrderedDict 28 | with all values mapping to None. 29 | ''' 30 | def __init__(self, locus, elements): 31 | ''' 32 | Construct a new Pileup. 33 | 34 | Parameters 35 | ---------- 36 | locus : Varcode.Locus 37 | The reference locus. Must be length 1, i.e. a single base. 38 | 39 | elements : iterable of PileupElement 40 | The pileup elements. The locus field of these instances must 41 | match the locus parameter. 42 | ''' 43 | self.locus = locus 44 | self.elements = OrderedDict((e, None) for e in elements) 45 | assert all(e.locus == self.locus for e in self.elements) 46 | 47 | def __iter__(self): 48 | return iter(self.elements) 49 | 50 | def __len__(self): 51 | return len(self.elements) 52 | 53 | def append(self, element): 54 | ''' 55 | Append a PileupElement to this Pileup. If an identical PileupElement is 56 | already part of this Pileup, do nothing. 57 | ''' 58 | assert element.locus == self.locus, ( 59 | "Element locus (%s) != Pileup locus (%s)" 60 | % (element.locus, self.locus)) 61 | self.elements[element] = None 62 | 63 | def update(self, other): 64 | ''' 65 | Add all pileup elements from other into self. 66 | ''' 67 | assert self.locus == other.locus 68 | self.elements.update(other.elements) 69 | 70 | def filter(self, filters): 71 | ''' 72 | Apply filters to the pileup elements, and return a new Pileup with the 73 | filtered elements removed. 74 | 75 | Parameters 76 | ---------- 77 | filters : list of PileupElement -> bool callables 78 | A PileupElement is retained if all filters return True when 79 | called on it. 
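For example (illustrative): pileup.filter([lambda e: e.min_base_quality >= 30]) returns a new Pileup containing only the elements whose minimum base quality is at least 30.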
80 | ''' 81 | new_elements = [ 82 | e for e in self.elements 83 | if all(function(e) for function in filters)] 84 | return Pileup(self.locus, new_elements) 85 | -------------------------------------------------------------------------------- /varlens/read_evidence/pileup_element.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from __future__ import absolute_import 14 | 15 | from . import alignment_key 16 | 17 | class PileupElement(object): 18 | ''' 19 | A PileupElement represents the segment of an alignment that aligns to a 20 | particular base in the reference. 21 | 22 | Attributes 23 | ---------- 24 | locus : Varcode.Locus 25 | The reference locus. Must be length 1, i.e. a single base. 26 | 27 | offset_start : int 28 | 0-based start offset into the alignment sequence, inclusive 29 | 30 | offset_end : int 31 | 0-based end offset into the alignment sequence, exclusive 32 | 33 | alignment : pysam.AlignedSegment 34 | pysam alignment instance 35 | 36 | alignment_key : tuple 37 | value computed from the alignment instance that uniquely specifies its 38 | properties. Used for comparisons since pysam.AlignedSegment instances 39 | do not support a useful notion of equality (they compare using object 40 | identity). See `read_evidence.alignment_key` for the implementation of 41 | this key. 42 | ''' 43 | def __init__(self, locus, offset_start, offset_end, alignment): 44 | ''' 45 | Construct a PileupElement object. 46 | ''' 47 | assert offset_end >= offset_start, \ 48 | "offset_start=%d > offset_end=%d" % (offset_start, offset_end) 49 | self.locus = locus 50 | self.offset_start = offset_start 51 | self.offset_end = offset_end 52 | self.alignment = alignment 53 | self.alignment_key = alignment_key(self.alignment) 54 | 55 | def fields(self): 56 | ''' 57 | Fields that should be considered for our notion of object equality. 58 | ''' 59 | return ( 60 | self.locus, self.offset_start, self.offset_end, self.alignment_key) 61 | 62 | def __eq__(self, other): 63 | return hasattr(other, "fields") and self.fields() == other.fields() 64 | 65 | def __hash__(self): 66 | return hash(self.fields()) 67 | 68 | @property 69 | def bases(self): 70 | ''' 71 | The sequenced bases in the alignment that align to this locus in the 72 | genome, as a string. 73 | 74 | Empty string in the case of a deletion. String of length > 1 if there 75 | is an insertion here. 76 | ''' 77 | sequence = self.alignment.query_sequence 78 | assert self.offset_end <= len(sequence), \ 79 | "End offset=%d > sequence length=%d. CIGAR=%s. SEQUENCE=%s" % ( 80 | self.offset_end, 81 | len(sequence), 82 | self.alignment.cigarstring, 83 | sequence) 84 | return sequence[self.offset_start:self.offset_end] 85 | 86 | @property 87 | def base_qualities(self): 88 | ''' 89 | The phred-scaled base quality scores corresponding to `self.bases`, as 90 | a list. 
91 | ''' 92 | return self.alignment.query_qualities[ 93 | self.offset_start:self.offset_end] 94 | 95 | @property 96 | def min_base_quality(self): 97 | ''' 98 | The minimum of the base qualities. In the case of a deletion, in which 99 | case there are no bases in this PileupElement, the minimum is taken 100 | over the sequenced bases immediately before and after the deletion. 101 | ''' 102 | try: 103 | return min(self.base_qualities) 104 | except ValueError: 105 | # We are mid-deletion. We return the minimum of the adjacent bases. 106 | assert self.offset_start == self.offset_end 107 | adjacent_qualities = [ 108 | self.alignment.query_qualities[offset] 109 | for offset in [self.offset_start - 1, self.offset_start] 110 | if 0 <= offset < len(self.alignment.query_qualities) 111 | ] 112 | return min(adjacent_qualities) 113 | 114 | @staticmethod 115 | def from_pysam_alignment(locus, pileup_read): 116 | ''' 117 | Factory function to create a new PileupElement from a pysam 118 | `PileupRead`. 119 | 120 | Parameters 121 | ---------- 122 | locus : varcode.Locus 123 | Reference locus for which to construct a PileupElement. Must 124 | include exactly one base. 125 | 126 | pileup_read : pysam.calignmentfile.PileupRead 127 | pysam PileupRead instance. Its alignment must overlap the locus. 128 | 129 | Returns 130 | ---------- 131 | PileupElement 132 | 133 | ''' 134 | assert not pileup_read.is_refskip, ( 135 | "Can't create a PileupElement in a refskip (typically an intronic " 136 | "gap in an RNA alignment)") 137 | 138 | # Pysam has an `aligned_pairs` method that gives a list of 139 | # (offset, locus) pairs indicating the correspondence between bases in 140 | # the alignment and reference loci. Here we use that to compute 141 | # offset_start and offset_end. 142 | # 143 | # This is slightly tricky in the case of insertions and deletions. 144 | # Here are examples of the desired logic. 145 | # 146 | # Target locus = 1000 147 | # 148 | # (1) Simple case: matching bases. 149 | # 150 | # OFFSET LOCUS 151 | # 0 999 152 | # 1 1000 153 | # 2 1001 154 | # 155 | # DESIRED RESULT: offset_start=1, offset_end=2. 156 | # 157 | # 158 | # (2) A 1 base insertion at offset 2. 159 | # 160 | # OFFSET LOCUS 161 | # 0 999 162 | # 1 1000 163 | # 2 None 164 | # 3 1001 165 | # 166 | # DESIRED RESULT: offset_start = 1, offset_end=3. 167 | # 168 | # 169 | # (3) A 2 base deletion at loci 1000 and 1001. 170 | # 171 | # OFFSET LOCUS 172 | # 0 999 173 | # None 1000 174 | # None 1001 175 | # 1 1002 176 | # 177 | # DESIRED RESULT: offset_start = 1, offset_end=1. 178 | # 179 | offset_start = None 180 | offset_end = len(pileup_read.alignment.query_sequence) 181 | # TODO: doing this with get_blocks() may be faster. 
182 | for (offset, position) in pileup_read.alignment.aligned_pairs: 183 | if offset is not None and position is not None: 184 | if position == locus.position: 185 | offset_start = offset 186 | elif position > locus.position: 187 | offset_end = offset 188 | break 189 | if offset_start is None: 190 | offset_start = offset_end 191 | 192 | assert pileup_read.is_del == (offset_end - offset_start == 0), \ 193 | "Deletion=%s but | [%d,%d) |=%d for locus %d in: \n%s" % ( 194 | pileup_read.is_del, 195 | offset_start, 196 | offset_end, 197 | offset_end - offset_start, 198 | locus.position, 199 | pileup_read.alignment.aligned_pairs) 200 | 201 | assert offset_end >= offset_start 202 | result = PileupElement( 203 | locus, offset_start, offset_end, pileup_read.alignment) 204 | return result 205 | 206 | -------------------------------------------------------------------------------- /varlens/read_evidence/util.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from __future__ import absolute_import 14 | 15 | def alignment_key(pysam_alignment_record): 16 | ''' 17 | Return the identifying attributes of a `pysam.AlignedSegment` instance. 18 | This is necessary since these objects do not support a useful notion of 19 | equality (they compare on identify by default). 20 | ''' 21 | return ( 22 | read_key(pysam_alignment_record), 23 | pysam_alignment_record.query_alignment_start, 24 | pysam_alignment_record.query_alignment_end, 25 | ) 26 | 27 | def read_key(pysam_alignment_record): 28 | ''' 29 | Given a `pysam.AlignedSegment` instance, return the attributes identifying 30 | the *read* it comes from (not the alignment). There may be more than one 31 | alignment for a read, e.g. chimeric and secondary alignments. 32 | ''' 33 | return ( 34 | pysam_alignment_record.query_name, 35 | pysam_alignment_record.is_duplicate, 36 | pysam_alignment_record.is_read1, 37 | pysam_alignment_record.is_read2, 38 | ) 39 | -------------------------------------------------------------------------------- /varlens/read_source.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import logging 14 | 15 | import pyensembl 16 | import pysam 17 | 18 | from . 
import read_evidence 19 | 20 | class ReadSource(object): 21 | def __init__(self, name, filename, read_filters=[]): 22 | self.name = name 23 | self.filename = filename 24 | self.handle = pysam.Samfile(filename) 25 | self.read_filters = read_filters 26 | 27 | self.chromosome_name_map = {} 28 | for name in self.handle.references: 29 | normalized = pyensembl.locus.normalize_chromosome(name) 30 | self.chromosome_name_map[normalized] = name 31 | self.chromosome_name_map[name] = name 32 | 33 | def index_if_needed(self): 34 | if self.filename.endswith(".bam") and not self.handle.has_index(): 35 | # pysam strangely requires and index even to iterate through a bam. 36 | logging.info( 37 | "Attempting to create BAM index for file: %s" % self.filename) 38 | samtools_output = pysam.index(self.filename) 39 | logging.info( 40 | "Done indexing" + ( 41 | (": " + samtools_output) if samtools_output else '')) 42 | 43 | # Reopen 44 | self.handle.close() 45 | self.handle = pysam.Samfile(self.filename) 46 | 47 | def reads(self, loci=None): 48 | if loci is None: 49 | def reads_iterator(): 50 | return self.handle.fetch(until_eof=True) 51 | elif self.filename.endswith(".sam"): 52 | # Inefficient. 53 | chromosome_intervals = {} 54 | for (contig, intervals) in loci.contigs.items(): 55 | try: 56 | chromosome = self.chromosome_name_map[contig] 57 | except KeyError: 58 | logging.warn( 59 | "No such contig in bam: %s" % contig) 60 | continue 61 | chromosome_intervals[chromosome] = intervals 62 | 63 | def reads_iterator(): 64 | seen = set() 65 | for read in self.handle.fetch(until_eof=True): 66 | intervals = chromosome_intervals.get(read.reference_name) 67 | if not intervals or not intervals.overlaps_range( 68 | read.reference_start, 69 | read.reference_end): 70 | continue 71 | key = alignment_key(read) 72 | if key not in seen: 73 | yield read 74 | seen.add(key) 75 | else: 76 | self.index_if_needed() 77 | 78 | def reads_iterator(): 79 | seen = set() 80 | for locus in loci: 81 | try: 82 | chromosome = self.chromosome_name_map[locus.contig] 83 | except KeyError: 84 | logging.warn( 85 | "No such contig in bam: %s" % locus.contig) 86 | continue 87 | for read in self.handle.fetch( 88 | chromosome, 89 | locus.start, 90 | locus.end): 91 | key = alignment_key(read) 92 | if key not in seen: 93 | yield read 94 | seen.add(key) 95 | 96 | return ( 97 | read for read in reads_iterator() 98 | if self.read_passes_filters(read)) 99 | 100 | def read_passes_filters(self, read): 101 | return all(read_filter(read) for read_filter in self.read_filters) 102 | 103 | def pileups(self, loci): 104 | self.index_if_needed() 105 | collection = read_evidence.PileupCollection.from_bam(self.handle, loci) 106 | if self.read_filters: 107 | for (locus, pileup) in collection.pileups.items(): 108 | collection.pileups[locus] = pileup.filter( 109 | [lambda element: 110 | self.read_passes_filters(element.alignment)]) 111 | return collection 112 | 113 | def alignment_key(pysam_alignment_record): 114 | ''' 115 | Return the identifying attributes of a `pysam.AlignedSegment` instance. 116 | This is necessary since these objects do not support a useful notion of 117 | equality (they compare on identify by default). 
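Because the returned key is a hashable tuple, the reads_iterator helpers above can keep a `seen` set of these keys to avoid yielding the same alignment twice when the requested loci overlap.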
118 | ''' 119 | return ( 120 | read_key(pysam_alignment_record), 121 | pysam_alignment_record.query_alignment_start, 122 | pysam_alignment_record.query_alignment_end, 123 | ) 124 | 125 | 126 | def read_key(pysam_alignment_record): 127 | ''' 128 | Given a `pysam.AlignedSegment` instance, return the attributes identifying 129 | the *read* it comes from (not the alignment). There may be more than one 130 | alignment for a read, e.g. chimeric and secondary alignments. 131 | ''' 132 | return ( 133 | pysam_alignment_record.query_name, 134 | pysam_alignment_record.is_duplicate, 135 | pysam_alignment_record.is_read1, 136 | pysam_alignment_record.is_read2, 137 | ) 138 | 139 | 140 | -------------------------------------------------------------------------------- /varlens/reads_util.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import collections 14 | import functools 15 | 16 | from .read_source import ReadSource 17 | from . import util 18 | 19 | BOOLEAN_PROPERTIES = """ 20 | is_paired is_proper_pair is_qcfail is_read1 is_read2 is_reverse is_secondary 21 | is_unmapped mate_is_reverse mate_is_unmapped is_duplicate 22 | """.split() 23 | 24 | STRING_PROPERTIES = """ 25 | cigarstring query_alignment_sequence query_name 26 | """.split() 27 | 28 | INT_PROPERTIES = """ 29 | inferred_length mapping_quality query_alignment_length query_alignment_start 30 | query_length reference_length reference_start template_length 31 | """.split() 32 | 33 | # name -> (type, help, filter function) 34 | READ_FILTERS = collections.OrderedDict() 35 | 36 | for prop in BOOLEAN_PROPERTIES: 37 | READ_FILTERS[prop] = ( 38 | bool, 39 | "Only reads where %s is True" % prop, 40 | functools.partial( 41 | (lambda field_name, parsed_value, read: 42 | bool(getattr(read, field_name))), 43 | prop) 44 | ) 45 | 46 | READ_FILTERS["not_" + prop] = ( 47 | bool, 48 | "Only reads where %s is False" % prop, 49 | functools.partial( 50 | (lambda field_name, parsed_value, read: 51 | not getattr(read, field_name)), 52 | prop) 53 | ) 54 | 55 | def field_contains(field_name, parsed_value, read): 56 | field_value = getattr(read, field_name) 57 | return field_value is not None and parsed_value in field_value 58 | 59 | for prop in STRING_PROPERTIES: 60 | READ_FILTERS["%s" % prop] = ( 61 | str, 62 | "Only reads with the specified %s" % prop, 63 | functools.partial( 64 | (lambda field_name, parsed_value, read: 65 | getattr(read, field_name) == parsed_value), 66 | prop) 67 | ) 68 | 69 | READ_FILTERS["%s_contains" % prop] = ( 70 | str, 71 | "Only reads where %s contains the given string" % prop, 72 | functools.partial(field_contains, prop)) 73 | 74 | for prop in INT_PROPERTIES: 75 | READ_FILTERS["%s" % prop] = ( 76 | int, 77 | "Only reads with the specified %s" % prop, 78 | functools.partial( 79 | (lambda field_name, parsed_value, read: 80 | getattr(read, field_name) == parsed_value), 81 | prop) 82 | ) 83 | 84 | READ_FILTERS["min_%s" % 
prop] = ( 85 | int, 86 | "Only reads where %s >=N" % prop, 87 | functools.partial( 88 | (lambda field_name, parsed_value, read: 89 | getattr(read, field_name) >= parsed_value), 90 | prop) 91 | ) 92 | 93 | READ_FILTERS["max_%s" % prop] = ( 94 | int, 95 | "Only reads where %s <=N" % prop, 96 | functools.partial( 97 | (lambda field_name, parsed_value, read: 98 | getattr(read, field_name) <= parsed_value), 99 | prop) 100 | ) 101 | 102 | def add_args(parser, positional=False): 103 | """ 104 | Extends a commandline argument parser with arguments for specifying 105 | read sources. 106 | """ 107 | group = parser.add_argument_group("read loading") 108 | group.add_argument("reads" if positional else "--reads", 109 | nargs="+", default=[], 110 | help="Paths to bam files. Any number of paths may be specified.") 111 | 112 | group.add_argument( 113 | "--read-source-name", 114 | nargs="+", 115 | help="Names for each read source. The number of names specified " 116 | "must match the number of bam files. If not specified, filenames are " 117 | "used for names.") 118 | 119 | # Add filters 120 | group = parser.add_argument_group( 121 | "read filtering", 122 | "A number of read filters are available. See the pysam " 123 | "documentation (http://pysam.readthedocs.org/en/latest/api.html) " 124 | "for details on what these fields mean. When multiple filter " 125 | "options are specified, reads must match *all* filters.") 126 | 127 | for (name, (kind, message, function)) in READ_FILTERS.items(): 128 | extra = {} 129 | if kind is bool: 130 | extra["action"] = "store_true" 131 | extra["default"] = None 132 | elif kind is int: 133 | extra["type"] = int 134 | extra["metavar"] = "N" 135 | elif kind is str: 136 | extra["metavar"] = "STRING" 137 | group.add_argument("--" + name.replace("_", "-"), 138 | help=message, 139 | **extra) 140 | 141 | def load_from_args(args): 142 | """ 143 | Given parsed commandline arguments, returns a list of ReadSource objects 144 | """ 145 | if not args.reads: 146 | return None 147 | 148 | if args.read_source_name: 149 | read_source_names = util.expand( 150 | args.read_source_name, 151 | 'read_source_name', 152 | 'read source', 153 | len(args.reads)) 154 | else: 155 | read_source_names = util.drop_prefix(args.reads) 156 | 157 | filters = [] 158 | for (name, info) in READ_FILTERS.items(): 159 | value = getattr(args, name) 160 | if value is not None: 161 | filters.append(functools.partial(info[-1], value)) 162 | 163 | return [ 164 | load_bam(filename, name, filters) 165 | for (filename, name) 166 | in zip(args.reads, read_source_names) 167 | ] 168 | 169 | def load_bam(filename, name=None, filters=[]): 170 | if not name: 171 | name = filename 172 | return ReadSource(name, filename, filters) 173 | 174 | def flatten_header(header): 175 | for (group, rows) in header.items(): 176 | for (index, row) in enumerate(rows): 177 | if not isinstance(row, dict): 178 | key_values = [(row, "")] 179 | else: 180 | key_values = row.items() 181 | for (key, value) in key_values: 182 | yield (str(group), index, str(key), str(value)) 183 | -------------------------------------------------------------------------------- /varlens/sequence_context.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import pyfaidx 14 | 15 | def variant_context( 16 | reference_fasta, 17 | contig, 18 | inclusive_start, 19 | inclusive_end, 20 | alt, 21 | context_length): 22 | """ 23 | Retrieve the surronding reference region from a variant. 24 | 25 | SNVs are canonicalized so the reference base is a pyrmidine (C/T). For 26 | indels the reverse complement will still be taken if the first base of 27 | the reference is not a pyrmidine, but since the reference will also be 28 | reversed, that doesn't guarantee it will start with a pyrmidine. 29 | 30 | Parameters 31 | ---------- 32 | reference_fasta : FastaReference 33 | reference sequence from pyfaidx package 34 | 35 | contig : str 36 | Chromosome of the variant 37 | 38 | inclusive_start : int 39 | start of the variant in 1-based inclusive coordinates 40 | 41 | inclusive_end : int 42 | end of the variant in 1-based inclusive coordinates 43 | 44 | alt : string 45 | alt sequence 46 | 47 | context_length : int 48 | number of bases on either side of the variant to return 49 | 50 | Returns 51 | --------- 52 | A tuple of (5', mutation, 3') where 53 | 5' - bases immediately 5 prime to the mutation 54 | 55 | 3' - bases immediately 3 prime to the mutation 56 | 57 | mutation - the ref sequence followed by a > character followed by the 58 | the alt sequence 59 | """ 60 | 61 | # Move from 1-base coorindates to 0-base coordinates 62 | start = int(inclusive_start) - 1 63 | end = int(inclusive_end) 64 | 65 | full_sequence = reference_fasta[contig] 66 | 67 | left = str(full_sequence[start - context_length:start].seq).upper() 68 | middle = str(full_sequence[start: end].seq).upper() 69 | right = str(full_sequence[end: end + context_length].seq).upper() 70 | 71 | # Complement and reverse the context if necessary so the ref base is a 72 | # pyrmidine (C/T) 73 | if middle[0] in ('A', 'G'): 74 | context_5prime = pyfaidx.complement(right)[::-1] 75 | context_3prime = pyfaidx.complement(left)[::-1] 76 | context_mutation = "%s>%s" % ( 77 | pyfaidx.complement(middle)[::-1], pyfaidx.complement(alt)[::-1]) 78 | else: 79 | context_5prime = left 80 | context_3prime = right 81 | context_mutation = "%s>%s" % (middle, alt) 82 | 83 | return (context_5prime, context_mutation, context_3prime) 84 | 85 | -------------------------------------------------------------------------------- /varlens/support.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
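# Editorial sketch (not part of the original module) showing how the helpers
# below are typically driven; the BAM path is one of the repository's test
# fixtures and the locus is arbitrary:
#
#     from varlens.locus import Locus
#     from varlens.loci_util import Loci
#     from varlens.reads_util import load_bam
#     from varlens.support import allele_support_df
#     loci = Loci([Locus.from_inclusive_coordinates("22", 46931062)])
#     df = allele_support_df(loci, [load_bam("tests/data/CELSR1/bams/bam_1.bam")])
#     # df columns: source, contig, interbase_start, interbase_end, allele, count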
12 | 13 | import collections 14 | import logging 15 | 16 | import pandas 17 | 18 | EXPECTED_COLUMNS = [ 19 | "source", 20 | "contig", 21 | "interbase_start", 22 | "interbase_end", 23 | "allele", 24 | "count", 25 | ] 26 | 27 | def allele_support_df(loci, sources): 28 | """ 29 | Returns a DataFrame of allele counts for all given loci in the read sources 30 | """ 31 | return pandas.DataFrame( 32 | allele_support_rows(loci, sources), 33 | columns=EXPECTED_COLUMNS) 34 | 35 | def allele_support_rows(loci, sources): 36 | for source in sources: 37 | logging.info("Reading from: %s (%s)" % (source.name, source.filename)) 38 | for locus in loci: 39 | grouped = dict(source.pileups([locus]).group_by_allele(locus)) 40 | if grouped: 41 | items = grouped.items() 42 | else: 43 | items = [("N" * (locus.end - locus.start), None)] 44 | for (allele, group) in items: 45 | d = collections.OrderedDict([ 46 | ("source", source.name), 47 | ("contig", locus.contig), 48 | ("interbase_start", str(locus.start)), 49 | ("interbase_end", str(locus.end)), 50 | ("allele", allele), 51 | ("count", group.num_reads() if group is not None else 0), 52 | ]) 53 | yield pandas.Series(d) 54 | 55 | def variant_support(variants, allele_support_df, ignore_missing=False): 56 | ''' 57 | Collect the read evidence support for the given variants. 58 | 59 | Parameters 60 | ---------- 61 | 62 | variants : iterable of varcode.Variant 63 | 64 | allele_support_df : dataframe 65 | Allele support dataframe, as output by the varlens-allele-support tool. 66 | It should have columns: source, contig, interbase_start, interbase_end, 67 | allele. The remaining columns are interpreted as read counts of various 68 | subsets of reads (e.g. all reads, non-duplicate reads, etc.) 69 | 70 | ignore_missing : boolean 71 | If True, then varaints with no allele counts will be interpreted as 72 | having 0 depth. If False, then an exception will be raised if any 73 | variants have no allele counts. 74 | 75 | Returns 76 | ---------- 77 | 78 | A pandas.Panel4D frame with these axes: 79 | 80 | labels (axis=0) : the type of read being counted, i.e. the read count 81 | fields in allele_support_df. 82 | 83 | items (axis=1) : the type of measurement (num_alt, num_ref, num_other, 84 | total_depth, alt_fraction, any_alt_fraction) 85 | 86 | major axis (axis=2) : the variants 87 | 88 | minor axis (axis=3) : the sources 89 | ''' 90 | missing = [ 91 | c for c in EXPECTED_COLUMNS if c not in allele_support_df.columns 92 | ] 93 | if missing: 94 | raise ValueError("Missing columns: %s" % " ".join(missing)) 95 | 96 | # Ensure our start and end fields are ints. 97 | allele_support_df[["interbase_start", "interbase_end"]] = ( 98 | allele_support_df[["interbase_start", "interbase_end"]].astype(int)) 99 | 100 | sources = sorted(allele_support_df["source"].unique()) 101 | 102 | allele_support_dict = collections.defaultdict(dict) 103 | for (i, row) in allele_support_df.iterrows(): 104 | key = ( 105 | row['source'], 106 | row.contig, 107 | row.interbase_start, 108 | row.interbase_end) 109 | allele_support_dict[key][row.allele] = row["count"] 110 | 111 | # We want an exception on bad lookups, so convert to a regular dict. 
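# Editorial note (assuming varcode's 1-based inclusive Variant coordinates):
# the lookup keys built below use variant.start - 1 and variant.end so that
# they line up with the interbase_start / interbase_end columns produced by
# allele_support_rows above.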
112 | allele_support_dict = dict(allele_support_dict) 113 | 114 | dataframe_dicts = collections.defaultdict( 115 | lambda: collections.defaultdict(list)) 116 | 117 | for variant in variants: 118 | for source in sources: 119 | key = (source, variant.contig, variant.start - 1, variant.end) 120 | try: 121 | alleles = allele_support_dict[key] 122 | except KeyError: 123 | message = ( 124 | "No allele counts in source %s for variant %s" % ( 125 | source, str(variant))) 126 | if ignore_missing: 127 | logging.warning(message) 128 | alleles = {} 129 | else: 130 | raise ValueError(message) 131 | 132 | alt = alleles.get(variant.alt, 0) 133 | ref = alleles.get(variant.ref, 0) 134 | total = sum(alleles.values()) 135 | 136 | other = total - alt - ref 137 | 138 | dataframe_dicts["num_alt"][source].append(alt) 139 | dataframe_dicts["num_ref"][source].append(ref) 140 | dataframe_dicts["num_other"][source].append(other) 141 | dataframe_dicts["total_depth"][source].append(total) 142 | dataframe_dicts["alt_fraction"][source].append( 143 | float(alt) / max(1, total)) 144 | dataframe_dicts["any_alt_fraction"][source].append( 145 | float(alt + other) / max(1, total)) 146 | 147 | dataframes = dict( 148 | (label, pandas.DataFrame(value, index=variants)) 149 | for (label, value) in dataframe_dicts.items()) 150 | 151 | return pandas.Panel(dataframes) 152 | -------------------------------------------------------------------------------- /varlens/util.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import os 14 | import argparse 15 | 16 | def expand(value, arg_name, input_name, length): 17 | if value is None or len(value) == 0: 18 | return [None] * length 19 | 20 | if len(value) == length: 21 | return value 22 | 23 | if len(value) == 1: 24 | return value * length 25 | 26 | if length == 1: 27 | raise ValueError( 28 | "With only 1 {input_name} specified, each {arg_name} argument " 29 | "should be length 1. If you are trying to specify multiple filters" 30 | " to apply consecutively, you should specify the entire argument " 31 | "multiple times." 32 | .format( 33 | arg_name=arg_name, 34 | input_name=input_name, 35 | length=length, 36 | actual=len(value))) 37 | 38 | else: 39 | raise ValueError( 40 | "Expected argument {arg_name} to be length 1 (i.e. apply to all " 41 | "{input_name} inputs) or length {length} (i.e. an individual value" 42 | " for each of the {length} {input_name} inputs), not {actual}." 
43 | .format( 44 | arg_name=arg_name, 45 | input_name=input_name, 46 | length=length, 47 | actual=len(value))) 48 | 49 | 50 | def drop_prefix(strings): 51 | """ 52 | Removes common prefix from a collection of strings 53 | """ 54 | strings_without_extensions = [ 55 | s.split(".", 2)[0] for s in strings 56 | ] 57 | 58 | if len(strings_without_extensions) == 1: 59 | return [os.path.basename(strings_without_extensions[0])] 60 | prefix_len = len(os.path.commonprefix(strings_without_extensions)) 61 | result = [string[prefix_len:] for string in strings_without_extensions] 62 | if len(set(result)) != len(strings): 63 | # If these operations resulted in a collision, just return the original 64 | # strings. 65 | return strings 66 | return result 67 | 68 | class PrefixedArgumentParser(object): 69 | def __init__(self, wrapped, prefix): 70 | self.wrapped = wrapped 71 | self.prefix = prefix 72 | 73 | def add_argument(self, name, *args, **kwargs): 74 | assert name.startswith("--") 75 | new_name = "--" + self.prefix + "-" + name[2:] 76 | self.wrapped.add_argument(new_name, *args, **kwargs) 77 | 78 | 79 | def remove_prefix_from_parsed_args(args, prefix): 80 | result = argparse.Namespace() 81 | for (arg, value) in args._get_kwargs(): 82 | if arg.startswith(prefix + "_"): 83 | setattr(result, arg[len(prefix + "_"):], value) 84 | return result 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /varlens/variant_includes.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import logging 14 | import time 15 | import collections 16 | 17 | import pandas 18 | import numpy 19 | import typechecks 20 | 21 | import pyfaidx 22 | 23 | from . import sequence_context 24 | from . import mhc_binding 25 | from . import reads_util 26 | from . import support 27 | from . import read_evidence 28 | 29 | class Includeable(object): 30 | columns = None 31 | 32 | @classmethod 33 | def from_args(cls, args): 34 | return cls() 35 | 36 | def process_chunk(self, df): 37 | raise NotImplementedError() 38 | 39 | def compute(self, df, chunk_rows=None): 40 | assert self.columns 41 | for column in self.columns: 42 | if column not in df.columns: 43 | df[column] = numpy.nan 44 | rows_to_annotate = pandas.isnull(df[self.columns[0]]) 45 | for column in self.columns[1:]: 46 | rows_to_annotate = rows_to_annotate | pandas.isnull(df[column]) 47 | 48 | while rows_to_annotate.sum() > 0: 49 | if chunk_rows: 50 | this_chunk_rows = rows_to_annotate & ( 51 | rows_to_annotate.cumsum() <= chunk_rows) 52 | else: 53 | this_chunk_rows = rows_to_annotate 54 | 55 | num_remaining = rows_to_annotate.sum() 56 | logging.info("%s: %d / %d (%0.1f%%) remaining. Processing %d rows." 
57 | % (
58 | self.name,
59 | num_remaining,
60 | len(rows_to_annotate),
61 | num_remaining * 100.0 / len(rows_to_annotate),
62 | this_chunk_rows.sum()))
63 |
64 | rows_to_annotate = rows_to_annotate & (~ this_chunk_rows)
65 |
66 | if this_chunk_rows.sum() > 0:
67 | start = time.time()
68 | df.ix[this_chunk_rows, self.columns] = self.process_chunk(
69 | df.ix[this_chunk_rows].copy())[self.columns]
70 | logging.info("Processed in %0.2f sec" % (time.time() - start))
71 | yield this_chunk_rows.sum()
72 |
73 | class Effect(Includeable):
74 | name = "variant effect annotations"
75 | columns = ["effect"]
76 |
77 | @staticmethod
78 | def add_args(parser):
79 | parser = parser.add_argument_group(Effect.name)
80 | parser.add_argument("--include-effect",
81 | action="store_true", default=False,
82 | help="Include varcode effect annotations")
83 |
84 | @staticmethod
85 | def requested(args):
86 | return args.include_effect
87 |
88 | def process_chunk(self, df):
89 | df["effect"] = [
90 | v.effects().top_priority_effect().short_description
91 | for v in df["variant"]
92 | ]
93 | return df
94 |
95 | class Gene(Includeable):
96 | name = "gene annotations"
97 | columns = ["gene"]
98 |
99 | @staticmethod
100 | def add_args(parser):
101 | parser = parser.add_argument_group(Gene.name)
102 | parser.add_argument("--include-gene",
103 | action="store_true", default=False,
104 | help="Include gene names")
105 |
106 | @staticmethod
107 | def requested(args):
108 | return args.include_gene
109 |
110 | def process_chunk(self, df):
111 | df["gene"] = [
112 | ' '.join(v.gene_names) if v.gene_names else 'None'
113 | for v in df.variant
114 | ]
115 | return df
116 |
117 | class Context(Includeable):
118 | name = "variant sequence context"
119 | columns = ["context_5_prime", "context_3_prime", "context_mutation"]
120 |
121 | @staticmethod
122 | def add_args(parser):
123 | parser = parser.add_argument_group(Context.name)
124 | parser.add_argument("--include-context",
125 | action="store_true", default=False,
126 | help="Include variant sequence context")
127 | parser.add_argument("--reference",
128 | help="Path to reference fasta (required for sequence context)")
129 | parser.add_argument("--context-num-bases", type=int, default=15,
130 | metavar="N",
131 | help="Num bases of context to include on each side of the variant")
132 |
133 | @classmethod
134 | def from_args(cls, args):
135 | if not args.reference:
136 | raise ValueError(
137 | "The --reference argument is required when including context")
138 | return cls(
139 | reference=pyfaidx.Fasta(args.reference),
140 | context_num_bases=args.context_num_bases)
141 |
142 | def __init__(self, reference, context_num_bases):
143 | self.reference = reference
144 | self.context_num_bases = context_num_bases
145 |
146 | @staticmethod
147 | def requested(args):
148 | return args.include_context
149 |
150 | def process_chunk(self, df):
151 | context_5_prime = []
152 | context_3_prime = []
153 | context_mutation = []
154 | for variant in df.variant:
155 | tpl = sequence_context.variant_context(
156 | self.reference,
157 | variant.contig,
158 | variant.start,
159 | variant.end,
160 | variant.alt,
161 | self.context_num_bases)
162 | context_5_prime.append(tpl[0])
163 | context_mutation.append(tpl[1])
164 | context_3_prime.append(tpl[2])
165 |
166 | df["context_5_prime"] = context_5_prime
167 | df["context_3_prime"] = context_3_prime
168 | df["context_mutation"] = context_mutation
169 | return df
170 |
171 | class MHCBindingAffinity(Includeable):
172 | name = "MHC binding affinity"
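# Adds binding_affinity (the tightest predicted affinity across the donor's
# HLA alleles) and binding_allele (the allele achieving it), computed via
# mhc_binding.binding_affinities; variants with noncoding effects are skipped
# when an "effect" column is present.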
173 | columns = ["binding_affinity", "binding_allele"] 174 | 175 | noncoding_effects = set([ 176 | "intergenic", 177 | "intronic", 178 | "non-coding-transcript", 179 | "3' UTR", 180 | "5' UTR", 181 | "silent", 182 | ]) 183 | 184 | @staticmethod 185 | def add_args(parser): 186 | parser = parser.add_argument_group(MHCBindingAffinity.name) 187 | parser.add_argument("--include-mhc-binding", 188 | action="store_true", default=False, 189 | help="Include MHC binding (tightest affinity and allele)") 190 | parser.add_argument("--hla", 191 | help="Space separated list of MHC alleles, e.g. 'A:02:01 A:02:02'") 192 | parser.add_argument('--hla-file', 193 | help="Load HLA types from the specified CSV file. It must have " 194 | "columns: 'donor' and 'hla'") 195 | 196 | @classmethod 197 | def from_args(cls, args): 198 | if bool(args.hla) + bool(args.hla_file) != 1: 199 | raise ValueError("Must specify exactly one of --hla or --hla-file") 200 | return cls( 201 | hla=args.hla, 202 | hla_dataframe=( 203 | pandas.read_csv(args.hla_file) if args.hla_file else None)) 204 | 205 | @staticmethod 206 | def string_to_hla_alleles(s): 207 | return s.replace("'", "").split() 208 | 209 | def __init__(self, hla=None, hla_dataframe=None, donor_to_hla=None): 210 | """ 211 | Specify exactly one of hla, hla_dataframe, or donor_to_hla. 212 | 213 | Parameters 214 | ----------- 215 | hla : list of string 216 | HLA alleles to use for all donors 217 | 218 | hla_dataframe : pandas.DataFrame with columns 'donor' and 'hla' 219 | DataFrame giving HLA alleles for each donor. The 'hla' column 220 | should be a space separated list of alleles for that donor. 221 | 222 | donor_to_hla : dict of string -> string list 223 | Map from donor to HLA alleles for that donor. 224 | """ 225 | if bool(hla) + (hla_dataframe is not None) + bool(donor_to_hla) != 1: 226 | raise TypeError( 227 | "Must specify exactly one of hla, hla_dataframe, donor_to_hla") 228 | 229 | self.hla = ( 230 | self.string_to_hla_alleles(hla) if typechecks.is_string(hla) 231 | else hla) 232 | self.donor_to_hla = donor_to_hla 233 | if hla_dataframe is not None: 234 | self.donor_to_hla = {} 235 | for (i, row) in hla_dataframe.iterrows(): 236 | if row.donor in self.donor_to_hla: 237 | raise ValueError("Multiple rows for donor: %s" % row.donor) 238 | if pandas.isnull(row.hla): 239 | self.donor_to_hla[row.donor] = None 240 | else: 241 | self.donor_to_hla[row.donor] = self.string_to_hla_alleles( 242 | row.hla) 243 | assert self.hla is not None or self.donor_to_hla is not None 244 | 245 | @staticmethod 246 | def requested(args): 247 | return args.include_mhc_binding 248 | 249 | def process_chunk(self, df): 250 | drop_donor = False 251 | if 'donor' not in df: 252 | df["donor"] = "DONOR1" 253 | drop_donor = True 254 | for donor in df.donor.unique(): 255 | rows = (df.donor == donor) 256 | if 'effect' in df: 257 | rows = rows & (~df.effect.isin(self.noncoding_effects)) 258 | sub_df = df.loc[rows] 259 | alleles = self.hla if self.hla else self.donor_to_hla.get(donor) 260 | if alleles and sub_df.shape[0] > 0: 261 | result = mhc_binding.binding_affinities( 262 | sub_df.variant, alleles) 263 | df.loc[rows, "binding_affinity"] = ( 264 | result["binding_affinity"].values) 265 | df.loc[rows, "binding_allele"] = ( 266 | result["binding_allele"].values) 267 | if drop_donor: 268 | del df["donor"] 269 | return df 270 | 271 | class ReadEvidence(Includeable): 272 | name = "read evidence" 273 | default_column_format = "{source}_count_{allele_group}" 274 | 275 | @classmethod 276 | def add_args(cls, 
parser): 277 | group = parser.add_argument_group(cls.name) 278 | group.add_argument("--include-read-evidence", 279 | action="store_true", default=False, 280 | help="Include counts of supporting / contradicting reads") 281 | group.add_argument("--read-sources-file", 282 | help="Load paths to BAMs from the given csv file.") 283 | group.add_argument("--read-sources-id-column", 284 | default="source_id", 285 | help="Column to use to join read sources with the variants " 286 | "dataframe.") 287 | group.add_argument("--read-sources-column", action="append", 288 | default=[], 289 | help="Column containing path to reads (e.g. path to a BAM). Can " 290 | "be specified any number of times. If not specified, all " 291 | "columns are used.") 292 | group.add_argument("--always-prefix-column", action="store_true", 293 | default=False, 294 | help="Always prefix the column names with the source name and " 295 | "count group, even when there is only one of each.") 296 | group.add_argument("--survive-errors", action="store_true", 297 | default=False, 298 | help="If an error is encountered log it and try to continue.") 299 | 300 | reads_util.add_args(parser) 301 | 302 | @classmethod 303 | def from_args(cls, args): 304 | read_sources = reads_util.load_from_args(args) 305 | read_sources_df = None 306 | if args.read_sources_file is not None: 307 | read_sources_df = pandas.read_csv( 308 | args.read_sources_file, 309 | index_col=args.read_sources_id_column) 310 | if args.read_sources_column: 311 | read_sources_df = read_sources_df[args.read_sources_column] 312 | 313 | source_names = cls.read_source_names(read_sources, read_sources_df) 314 | if (args.always_prefix_column or len(source_names) > 1): 315 | column_format = cls.default_column_format 316 | else: 317 | column_format = "{allele_group}" 318 | return cls( 319 | read_sources=read_sources, 320 | read_sources_df=read_sources_df, 321 | column_format=column_format, 322 | survive_errors=args.survive_errors) 323 | 324 | def __init__(self, 325 | read_sources=None, 326 | read_sources_df=None, 327 | column_format=default_column_format, 328 | survive_errors=False): 329 | """ 330 | 331 | """ 332 | if sum(x is not None for x in [read_sources, read_sources_df]) != 1: 333 | raise TypeError( 334 | "Specify exactly one of read_sources, read_sources_df") 335 | 336 | self.read_sources = read_sources 337 | self.read_sources_df = read_sources_df 338 | self.column_format = column_format 339 | self.survive_errors = survive_errors 340 | self.set_columns() 341 | 342 | @staticmethod 343 | def read_source_names(read_sources=None, read_sources_df=None): 344 | if read_sources is not None: 345 | return [x.name for x in read_sources] 346 | return read_sources_df.columns.tolist() 347 | 348 | def set_columns(self): 349 | source_names = self.read_source_names( 350 | read_sources=self.read_sources, 351 | read_sources_df=self.read_sources_df) 352 | assert source_names 353 | self.columns_dict = collections.OrderedDict() 354 | for source_name in source_names: 355 | for allele_group in ["num_alt", "num_ref", "total_depth"]: 356 | column_name = self.column_name( 357 | source_name, allele_group) 358 | self.columns_dict[column_name] = ( 359 | source_name, allele_group) 360 | self.columns = list(self.columns_dict) 361 | assert self.columns 362 | 363 | def column_name(self, source, allele_group): 364 | """ 365 | Parameters 366 | ---------- 367 | source : string 368 | name of the ReadSource 369 | 370 | allele_group : string 371 | one of: num_ref, num_alt, total_depth 372 | 373 | Returns 374 | 
---------- 375 | column name : string 376 | """ 377 | return self.column_format.format( 378 | source=source, 379 | allele_group=allele_group) 380 | 381 | @staticmethod 382 | def requested(args): 383 | return args.include_read_evidence 384 | 385 | def process_chunk(self, df): 386 | if self.read_sources_df is None: 387 | def rows_and_read_sources(): 388 | all_rows = numpy.ones(df.shape[0], dtype=bool) 389 | yield (all_rows, self.read_sources) 390 | else: 391 | def rows_and_read_sources(): 392 | join_col = self.read_sources_df.index.name 393 | for join_value in df[join_col].unique(): 394 | rows = df[join_col] == join_value 395 | read_paths = self.read_sources_df.ix[join_value] 396 | read_sources = [] 397 | for (name, filename) in read_paths.iteritems(): 398 | if pandas.isnull(filename): 399 | continue 400 | relevant_columns = [ 401 | col for (col, (source_name, allele_group)) 402 | in self.columns_dict.items() 403 | if source_name == name 404 | ] 405 | if (~pandas.isnull(df[relevant_columns].values)).all(): 406 | logging.info( 407 | "Skipping source %s (%s) for %s: data exists" % 408 | (name, filename, join_value)) 409 | continue 410 | try: 411 | read_sources.append(reads_util.load_bam( 412 | filename, 413 | name=name)) 414 | except Exception as e: 415 | logging.error("Error loading bam: %s in %s" % 416 | (str(e), filename)) 417 | if not self.survive_errors: 418 | raise 419 | continue 420 | 421 | if rows.sum() > 0 and read_sources: 422 | logging.info( 423 | "Processing %s=%s (%d rows, %d read sources)" % ( 424 | join_col, 425 | join_value, 426 | rows.sum(), 427 | len(read_sources))) 428 | yield (rows, read_sources) 429 | else: 430 | logging.info( 431 | "Skipping %s=%s (%d rows, %d read sources)" % ( 432 | join_col, 433 | join_value, 434 | rows.sum(), 435 | len(read_sources))) 436 | 437 | for (rows, sources) in rows_and_read_sources(): 438 | variants = df.variant[rows] 439 | counter = collections.Counter(variants) 440 | duplicate_variants = dict( 441 | (v, c) for (v, c) in counter.items() if c > 1) 442 | if duplicate_variants: 443 | raise ValueError("Duplicate variant(s) for this source: %s" % 444 | duplicate_variants) 445 | variant_loci = sorted(set( 446 | read_evidence.pileup_collection.to_locus(variant) 447 | for variant in variants)) 448 | 449 | allele_support_df = support.allele_support_df( 450 | variant_loci, sources) 451 | assert set(s.name for s in sources) == set( 452 | allele_support_df.source.unique()) 453 | variant_support_df = support.variant_support( 454 | variants, allele_support_df) 455 | assert set(s.name for s in sources) == set( 456 | variant_support_df.minor_axis) 457 | 458 | for allele_group in ["num_alt", "num_ref", "total_depth"]: 459 | sub_panel = variant_support_df[allele_group, variants] 460 | for source_column in sub_panel.columns: 461 | dest_column = self.column_name( 462 | source_column, allele_group) 463 | assert dest_column in self.columns, ( 464 | "Bad column: %s not in %s" % ( 465 | dest_column, " ".join(self.columns))) 466 | values = sub_panel[source_column].values 467 | assert len(values) == rows.sum(), "%d != %d" % ( 468 | len(values), rows.sum()) 469 | df.loc[rows, dest_column] = values 470 | return df 471 | 472 | INCLUDEABLES = Includeable.__subclasses__() 473 | -------------------------------------------------------------------------------- /varlens/variants_util.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except 
in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import collections 14 | 15 | import pandas 16 | import varcode 17 | import varcode.reference 18 | import logging 19 | 20 | from . import util, loci_util 21 | from .locus import Locus 22 | from .read_evidence import pileup_collection 23 | 24 | STANDARD_DATAFRAME_COLUMNS = [ 25 | "genome", 26 | "contig", 27 | "interbase_start", 28 | "interbase_end", 29 | "ref", 30 | "alt", 31 | ] 32 | 33 | def add_args(parser, positional=False): 34 | group = parser.add_argument_group("variant loading") 35 | group.add_argument("variants" if positional else "--variants", 36 | nargs=("*" if positional else "+"), default=[], 37 | help="Path to VCF file. Any number of VCF files may be specified. " 38 | "CSV files in the format outputted by the varlens-variants tool are " 39 | "also supported.") 40 | group.add_argument("--genome", 41 | help="Genome for the variants (e.g. b37). Required when the genome " 42 | "cannot be guessed from the metadata in the VCF.") 43 | group.add_argument("--include-failing-variants", 44 | action="store_true", 45 | default=False, 46 | help="Include variants with a non-PASS filter field.") 47 | group.add_argument("--variant-source-name", nargs="+", 48 | help="Names for variant sources. Must specify one name per variant " 49 | "source. If not specified, the filenames are used.") 50 | group.add_argument("--max-variants-per-source", type=int, 51 | metavar="N", 52 | help="Load at most N variants from each source.") 53 | group.add_argument("--single-variant", nargs=3, action="append", 54 | default=[], metavar="X", 55 | help="Literal variant specified as three arguments: LOCUS REF ALT. " 56 | "Can be specified any number of times by repeating the " 57 | "--single-variant option.") 58 | 59 | # Filters 60 | group = parser.add_argument_group("variant filtering", 61 | "If multiple filters are specified, the variants must pass *all* " 62 | "filters. For filtering by loci, any variants that overlap " 63 | "the specified loci are included.") 64 | group.add_argument("--ref", nargs="+", 65 | help="Include only variants where ref is one of the given values.") 66 | group.add_argument("--alt", nargs="+", 67 | help="Include only variants where alt is one of the given values.") 68 | loci_util.add_args(util.PrefixedArgumentParser(group, "variant")) 69 | 70 | def load_from_args_as_dataframe(args): 71 | ''' 72 | Given parsed variant-loading arguments, return a pandas DataFrame. 73 | 74 | If no variant loading arguments are specified, return None. 
75 | ''' 76 | if not args.variants and not args.single_variant: 77 | return None 78 | 79 | if args.variant_source_name: 80 | variant_source_names = util.expand( 81 | args.variant_source_name, 82 | 'variant_source_name', 83 | 'variant source', 84 | len(args.variants)) 85 | else: 86 | variant_source_names = util.drop_prefix(args.variants) 87 | 88 | variant_to_sources = collections.defaultdict(list) 89 | 90 | dfs = [] 91 | for i in range(len(args.variants)): 92 | name = variant_source_names[i] 93 | prefix = ( 94 | 'metadata:' if len(args.variants) == 1 else "metadata:%s:" % name) 95 | df = load_as_dataframe( 96 | args.variants[i], 97 | name=name, 98 | genome=args.genome, 99 | max_variants=args.max_variants_per_source, 100 | only_passing=not args.include_failing_variants, 101 | metadata_column_prefix=prefix) 102 | 103 | if df.shape[0] == 0: 104 | logging.warn("No variants loaded from: %s" % args.variants[i]) 105 | else: 106 | for variant in df.variant: 107 | variant_to_sources[variant].append(name) 108 | dfs.append(df) 109 | 110 | if args.single_variant: 111 | variants = [] 112 | extra_args = {} 113 | if args.genome: 114 | extra_args = { 115 | 'ensembl': varcode.reference.infer_genome(args.genome) 116 | } 117 | for (locus_str, ref, alt) in args.single_variant: 118 | locus = Locus.parse(locus_str) 119 | variant = varcode.Variant( 120 | locus.contig, 121 | locus.inclusive_start, 122 | ref, 123 | alt, 124 | **extra_args) 125 | variants.append(variant) 126 | variant_to_sources[variant].append("commandline") 127 | dfs.append(variants_to_dataframe(variants)) 128 | 129 | df = dfs.pop(0) 130 | for other_df in dfs: 131 | df = pandas.merge( 132 | df, 133 | other_df, 134 | how='outer', 135 | on=["variant"] + STANDARD_DATAFRAME_COLUMNS) 136 | 137 | genomes = df["genome"].unique() 138 | if len(genomes) > 1: 139 | raise ValueError( 140 | "Mixing references is not supported. 
" 141 | "Reference genomes: %s" % (", ".join(genomes))) 142 | 143 | df["sources"] = [" ".join(variant_to_sources[v]) for v in df.variant] 144 | 145 | # Apply filters: 146 | if args.ref: 147 | df = df.ix[df.ref.isin(args.ref)] 148 | if args.alt: 149 | df = df.ix[df.alt.isin(args.alt)] 150 | loci = loci_util.load_from_args( 151 | util.remove_prefix_from_parsed_args(args, "variant")) 152 | if loci is not None: 153 | df = df.ix[[ 154 | loci.intersects(pileup_collection.to_locus(v)) 155 | for v in df.variant 156 | ]] 157 | return df 158 | 159 | def load_as_dataframe( 160 | filename, 161 | loader=None, 162 | name=None, 163 | genome=None, 164 | max_variants=None, 165 | only_passing=True, 166 | metadata_column_prefix=''): 167 | 168 | if name is None: 169 | name = filename 170 | 171 | if loader is None: 172 | if (filename.endswith(".vcf") or filename.endswith(".vcf.gz")): 173 | # Load from VCF 174 | def loader(filename): 175 | collection = varcode.load_vcf_fast( 176 | filename, 177 | genome=genome, 178 | max_variants=max_variants, 179 | only_passing=only_passing, 180 | allow_extended_nucleotides=True) 181 | return variants_to_dataframe( 182 | collection, 183 | collection.metadata, 184 | metadata_column_prefix=metadata_column_prefix) 185 | 186 | elif (filename.endswith(".csv") or filename.endswith(".csv.gz")): 187 | # Load from csv 188 | def loader(filename): 189 | # Ignores only_passing 190 | df = pandas.read_csv(filename, nrows=max_variants) 191 | for column in ['ref', 'alt']: 192 | df[column] = df[column].fillna('') 193 | df["variant"] = [ 194 | dataframe_row_to_variant(row) for (i, row) in df.iterrows() 195 | ] 196 | return df 197 | else: 198 | raise ValueError( 199 | "Unsupported input file extension for variants: %s" % filename) 200 | 201 | df = loader(filename) 202 | 203 | if 'genome' not in df: 204 | df["genome"] = genome 205 | 206 | df["variant"] = [ 207 | dataframe_row_to_variant(row) for (i, row) in df.iterrows() 208 | ] 209 | return df 210 | 211 | def variants_to_dataframe( 212 | variants, metadata=None, metadata_column_prefix=""): 213 | def record(variant): 214 | d = { 215 | 'variant': variant, 216 | 'genome': str(variant.reference_name), 217 | 'contig': variant.contig, 218 | 'interbase_start': variant.start - 1, 219 | 'interbase_end': variant.end, 220 | 'ref': variant.ref, 221 | 'alt': variant.alt, 222 | } 223 | if metadata: 224 | for (name, value) in metadata.get(variant, {}).items(): 225 | if name == 'info': 226 | for (info_col, value) in value.items(): 227 | column = '%sinfo:%s' % ( 228 | metadata_column_prefix, info_col) 229 | d[column] = value 230 | else: 231 | d["%s%s" % (metadata_column_prefix, name.lower())] = value 232 | return d 233 | 234 | df = pandas.DataFrame.from_records([record(v) for v in variants]) 235 | column_indices = dict( 236 | (column, i) for (i, column) in enumerate(STANDARD_DATAFRAME_COLUMNS)) 237 | columns = sorted(df.columns, key=lambda col: column_indices.get(col, 100)) 238 | return df[columns] 239 | 240 | def dataframe_row_to_variant(row): 241 | return varcode.Variant( 242 | ensembl=row.genome, 243 | contig=row.contig, 244 | start=row.interbase_start + 1, 245 | ref=row.ref, 246 | alt=row.alt, 247 | allow_extended_nucleotides=True) 248 | 249 | def dataframe_to_variants(df): 250 | for column in STANDARD_DATAFRAME_COLUMNS: 251 | if column not in df: 252 | raise ValueError("Missing column: %s" % column) 253 | 254 | extra_columns = [ 255 | c for c in df.columns if c not in STANDARD_DATAFRAME_COLUMNS 256 | ] 257 | metadata = collections.OrderedDict() 258 | for 
(i, row) in df.iterrows(): 259 | variant = dataframe_row_to_variant(row) 260 | # We ignore the interbase_end field. 261 | metadata[variant] = dict((c, row[c]) for c in extra_columns) 262 | 263 | return varcode.VariantCollection(metadata.keys(), metadata=metadata) 264 | 265 | def load_csv(filename, genome=None): 266 | # Genome is ignored for now. 267 | df = pandas.read_csv(filename) 268 | return dataframe_to_variants(df) 269 | -------------------------------------------------------------------------------- /varlens/version.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | __version__ = "0.2.0" 14 | 15 | version_string = f"v{__version__}" 16 | 17 | def print_version(): 18 | print(version_string) 19 | 20 | def print_name_and_version(): 21 | print(f"Varlens {version_string}") 22 | 23 | if __name__ == "__main__": 24 | print_version() 25 | 26 | 27 | --------------------------------------------------------------------------------