├── .github └── workflows │ └── tests.yml ├── .gitignore ├── LICENSE ├── README.rst ├── deploy-gh-pages.sh ├── docs ├── Makefile ├── conf.py ├── index.rst └── varlens.rst ├── lint.sh ├── setup.py ├── test.sh ├── tests ├── __init__.py ├── data │ ├── CELSR1 │ │ ├── bams │ │ │ ├── bam_0.bam │ │ │ ├── bam_0.bam.bai │ │ │ ├── bam_1.bam │ │ │ ├── bam_1.bam.bai │ │ │ ├── bam_10.bam │ │ │ ├── bam_10.bam.bai │ │ │ ├── bam_11.bam │ │ │ ├── bam_11.bam.bai │ │ │ ├── bam_12.bam │ │ │ ├── bam_12.bam.bai │ │ │ ├── bam_13.bam │ │ │ ├── bam_13.bam.bai │ │ │ ├── bam_14.bam │ │ │ ├── bam_14.bam.bai │ │ │ ├── bam_15.bam │ │ │ ├── bam_15.bam.bai │ │ │ ├── bam_16.bam │ │ │ ├── bam_16.bam.bai │ │ │ ├── bam_17.bam │ │ │ ├── bam_17.bam.bai │ │ │ ├── bam_18.bam │ │ │ ├── bam_18.bam.bai │ │ │ ├── bam_19.bam │ │ │ ├── bam_19.bam.bai │ │ │ ├── bam_2.bam │ │ │ ├── bam_2.bam.bai │ │ │ ├── bam_20.bam │ │ │ ├── bam_20.bam.bai │ │ │ ├── bam_21.bam │ │ │ ├── bam_21.bam.bai │ │ │ ├── bam_22.bam │ │ │ ├── bam_22.bam.bai │ │ │ ├── bam_23.bam │ │ │ ├── bam_23.bam.bai │ │ │ ├── bam_3.bam │ │ │ ├── bam_3.bam.bai │ │ │ ├── bam_4.bam │ │ │ ├── bam_4.bam.bai │ │ │ ├── bam_5.bam │ │ │ ├── bam_5.bam.bai │ │ │ ├── bam_6.bam │ │ │ ├── bam_6.bam.bai │ │ │ ├── bam_7.bam │ │ │ ├── bam_7.bam.bai │ │ │ ├── bam_8.bam │ │ │ ├── bam_8.bam.bai │ │ │ ├── bam_9.bam │ │ │ └── bam_9.bam.bai │ │ └── vcfs │ │ │ ├── vcf_1.vcf │ │ │ ├── vcf_10.vcf │ │ │ ├── vcf_11.vcf │ │ │ ├── vcf_12.vcf │ │ │ ├── vcf_13.vcf │ │ │ ├── vcf_14.vcf │ │ │ ├── vcf_2.vcf │ │ │ ├── vcf_3.vcf │ │ │ ├── vcf_4.vcf │ │ │ ├── vcf_5.vcf │ │ │ ├── vcf_6.vcf │ │ │ ├── vcf_7.vcf │ │ │ ├── vcf_8.vcf │ │ │ └── vcf_9.vcf │ ├── chr22.no_line_wrap.fa │ ├── gatk_mini_bundle_extract.bam │ ├── gatk_mini_bundle_extract.bam.bai │ ├── rna_chr17_41244936.bam │ └── rna_chr17_41244936.bam.bai ├── test_allele_support.py ├── test_read_evidence.py ├── test_reads.py └── test_variants.py └── varlens ├── __init__.py ├── commands ├── __init__.py ├── allele_support.py ├── reads.py ├── util.py └── variants.py ├── loci_util.py ├── locus.py ├── mhc_binding.py ├── read_evidence ├── __init__.py ├── pileup.py ├── pileup_collection.py ├── pileup_element.py └── util.py ├── read_source.py ├── reads_util.py ├── sequence_context.py ├── support.py ├── util.py ├── variant_includes.py ├── variants_util.py └── version.py /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | name: Tests 4 | on: [push, pull_request] 5 | 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: true 11 | matrix: 12 | python-version: ["3.9", "3.10", "3.11"] 13 | 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@v3 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v3 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | cache: "pip" 22 | - name: Create virtual environment and install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | python -m pip install uv 26 | uv venv 27 | source .venv/bin/activate 28 | uv pip install pytest pytest-cov coveralls pylint ruff 29 | uv pip install -r requirements.txt 30 | uv pip install . 
31 | - name: Install Ensembl data 32 | run: | 33 | echo "Before installing Ensembl releases" && df -h 34 | pyensembl install --release 75 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh37.75/ 35 | pyensembl install --release 77 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.77/ 36 | pyensembl install --release 93 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.93/ 37 | pyensembl install --release 93 --species mouse --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCm38.93/ 38 | - name: Run linting script and unit tests 39 | run: | 40 | source .venv/bin/activate 41 | ./lint.sh 42 | ./test.sh 43 | - name: Publish coverage to Coveralls 44 | uses: coverallsapp/github-action@v2.2.3 45 | with: 46 | github-token: ${{ secrets.GITHUB_TOKEN }} 47 | flag-name: coverage 48 | fail-on-error: false 49 | parallel: true 50 | - name: Upload docs to GitHub Pages 51 | run: | 52 | ./deploy-gh-pages.sh 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 203 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://github.com/openvax/varlens/actions/workflows/tests.yml/badge.svg 2 | :target: https://github.com/openvax/varlens/actions/workflows/tests.yml 3 | :alt: Tests 4 | 5 | varlens 6 | ====================== 7 | 8 | A collection of Python tools for working with genomic variants and 9 | next-generation sequencing reads. Not particularly fast for large datasets. The 10 | emphasis is on extracting what you need from BAMs and VCFs into a CSV file for 11 | further analysis. 12 | 13 | Built on `varcode `_ and `pysam `_. 14 | 15 | varlens-variants 16 | Combine, annotate, and filter variants from VCF or CSV files. Available 17 | annotations include genes, variant effects, surrounding sequence context, 18 | counts of supporting reads from specified BAM files, and MHC I binding 19 | affinity prediction of mutant peptides. 20 | 21 | varlens-reads 22 | Display, filter, and copy reads from a SAM/BAM file. Partial replacement for ``samtools view``. 23 | 24 | varlens-allele-support 25 | Count reads supporting each allele at specified sites in BAM files. 26 | 27 | 28 | Installation 29 | ------------- 30 | 31 | To install from `PyPI `_: 32 | 33 | :: 34 | 35 | pip install varlens 36 | 37 | Or from a git checkout: 38 | 39 | :: 40 | 41 | pip install . 42 | 43 | To run the tests: 44 | 45 | :: 46 | 47 | nosetests . 48 | 49 | To build the documentation (just this README plus the commandline tool help): 50 | 51 | :: 52 | 53 | pip install -e . 54 | pip install Sphinx 55 | cd docs 56 | make clean setup rst html 57 | 58 | The docs will be written to the ``_build/html`` directory. 59 | 60 | 61 | varlens-variants 62 | ---------------------- 63 | 64 | Given variants from one or more VCF or CSV files, apply filters, add additional 65 | columns, and output to CSV. 66 | 67 | Currently we can only output to CSV, not VCF. 68 | 69 | A number of useful annotations can be added for each variant by specifying 70 | options of the form '--include-XXX', e.g. '--include-gene'. See detailed help 71 | (run with -h). 72 | 73 | Examples 74 | ````````````` 75 | 76 | Print basic info for the variants found in two VCF files. 
Note that variants 77 | found in both files are listed in one row, and the 'sources' column lists 78 | the files each variant was found in: 79 | 80 | :: 81 | 82 | $ varlens-variants test/data/CELSR1/vcfs/vcf_1.vcf test/data/CELSR1/vcfs/vcf_2.vcf 83 | 84 | genome,contig,interbase_start,interbase_end,ref,alt,sources 85 | GRCh37,22,21829554,21829555,T,G,1.vcf 86 | GRCh37,22,46931059,46931060,A,C,1.vcf 87 | GRCh37,22,46931061,46931062,G,A,1.vcf 2.vcf 88 | GRCh37,22,50636217,50636218,A,C,1.vcf 89 | GRCh37,22,50875932,50875933,A,C,1.vcf 90 | GRCh37,22,45309892,45309893,T,G,2.vcf 91 | 92 | Same as the above but include additional columns giving varcode variant effect 93 | annotations and the genes the variants overlap, and write to a file: 94 | 95 | :: 96 | 97 | $ varlens-variants test/data/CELSR1/vcfs/vcf_1.vcf test/data/CELSR1/vcfs/vcf_2.vcf \ 98 | --include-effect \ 99 | --include-gene \ 100 | --out /tmp/result.csv 101 | 102 | Wrote: /tmp/result.csv 103 | 104 | $ cat /tmp/result.csv 105 | 106 | genome,contig,interbase_start,interbase_end,ref,alt,sources,effect,gene 107 | GRCh37,22,21829554,21829555,T,G,1.vcf,non-coding-transcript,PI4KAP2 108 | GRCh37,22,46931059,46931060,A,C,1.vcf,p.S670A,CELSR1 109 | GRCh37,22,46931061,46931062,G,A,1.vcf 2.vcf,p.S669F,CELSR1 110 | GRCh37,22,50636217,50636218,A,C,1.vcf,intronic,TRABD 111 | GRCh37,22,50875932,50875933,A,C,1.vcf,splice-acceptor,PPP6R2 112 | GRCh37,22,45309892,45309893,T,G,2.vcf,p.T214P,PHF21B 113 | 114 | Print counts for number of reads supporting reference/variant/other alleles 115 | from the specified BAM, counting only reads with mapping quality >= 10: 116 | 117 | :: 118 | 119 | $ varlens-variants test/data/CELSR1/vcfs/vcf_1.vcf \ 120 | --include-read-evidence \ 121 | --reads test/data/CELSR1/bams/bam_1.bam \ 122 | --min-mapping-quality 10 123 | 124 | genome,contig,interbase_start,interbase_end,ref,alt,sources,num_alt,num_ref,total_depth 125 | GRCh37,22,21829554,21829555,T,G,vcf_1.vcf,0,0,0 126 | GRCh37,22,46931059,46931060,A,C,vcf_1.vcf,0,216,320 127 | GRCh37,22,46931061,46931062,G,A,vcf_1.vcf,0,321,321 128 | GRCh37,22,50636217,50636218,A,C,vcf_1.vcf,0,0,0 129 | GRCh37,22,50875932,50875933,A,C,vcf_1.vcf,0,0,0 130 | 131 | 132 | varlens-reads 133 | ---------------------- 134 | 135 | Filter reads from one or more BAMs and output a CSV or a new BAM. 136 | 137 | Loci and VCF files may be specified, in which case reads are filtered to 138 | overlap the specified loci or variants. 139 | 140 | Examples 141 | ````````````` 142 | 143 | Print basic fields for the reads in a BAM: 144 | 145 | :: 146 | 147 | $ varlens-reads test/data/CELSR1/bams/bam_0.bam 148 | 149 | query_name,reference_start,reference_end,cigarstring 150 | HISEQ:142:C5822ANXX:3:2116:16538:101199,46929962,46930062,100M 151 | HISEQ:142:C5822ANXX:3:1106:18985:32932,46929964,46930064,100M 152 | HISEQ:142:C5822ANXX:3:2201:21091:67220,46929966,46930066,100M 153 | HISEQ:142:C5822ANXX:4:1304:5363:12786,46929966,46930066,100M 154 | HISEQ:142:C5822ANXX:4:1104:9008:85114,46929969,46930069,100M 155 | HISEQ:142:C5822ANXX:3:2304:9921:94828,46929970,46930070,100M 156 | HISEQ:142:C5822ANXX:3:2211:6266:74633,46929973,46930073,100M 157 | HISEQ:142:C5822ANXX:3:1305:8982:42729,46929974,46930074,100M 158 | HISEQ:142:C5822ANXX:4:2316:5630:7371,46929978,46930078,100M 159 | ... 
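The column names in this output are the standard pysam alignment attributes
(``query_name``, ``reference_start``, ``reference_end``, ``cigarstring``). If
you want the same information programmatically rather than as CSV, it can be
read directly with pysam. A minimal sketch using only the pysam API (not a
varlens function), pointed at one of the BAMs in this repository's test data::

    import pysam

    # Open the indexed BAM and iterate over its alignments.
    with pysam.AlignmentFile("tests/data/CELSR1/bams/bam_0.bam") as bam:
        for read in bam.fetch():
            # These attributes correspond to the CSV columns shown above.
            print(read.query_name, read.reference_start,
                  read.reference_end, read.cigarstring)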
160 | 161 | Same as above but filter only to reads aligned on the (-) strand, write to a 162 | file instead of stdout, and also include the mapping quality and sequenced 163 | bases in the output: 164 | 165 | :: 166 | 167 | $ varlens-reads test/data/CELSR1/bams/bam_0.bam \ 168 | --is-reverse \ 169 | --field mapping_quality query_alignment_sequence \ 170 | --out /tmp/result.csv 171 | 172 | Wrote: /tmp/result.csv 173 | 174 | $ head /tmp/result.csv 175 | 176 | query_name,reference_start,reference_end,cigarstring,mapping_quality,query_alignment_sequence 177 | HISEQ:142:C5822ANXX:3:2116:16538:101199,46929962,46930062,100M,60,CATGATCTGGGCATTAGGGCCTTCATCAGGGTCGTTAGCACGAATCTTTGCCACCACCGACCCCACTGGGTTGTTCTCCTCAACAAACAGCTCCAGTTCG 178 | HISEQ:142:C5822ANXX:3:1106:18985:32932,46929964,46930064,100M,60,TGATCTGGGCATTAGGGCCTTCATCAGGGTCGTTAGCACGAATCTTTGCCACCACCGACCCCACTGGGTTGTTCTCCTCAACAAACAGCTCCAGTTCGTC 179 | HISEQ:142:C5822ANXX:4:1104:9008:85114,46929969,46930069,100M,60,TGGGCATTAGGGCCTTCATCAGGGTCGTTAGCACGAATCTTTGCCACCACCGACCCCACTGGGTTGTTCTCCTCAACAAACAGCTCCAGTTCGTCCTTCT 180 | HISEQ:142:C5822ANXX:4:1202:18451:91174,46929979,46930079,100M,60,GGCCTTCATCAGGGTCGTTAGCACGAATCTTTGCCACCACCGACCCCACTGGGTTGTTCTCCTCAACAAACAGCTCCAGTTCGTCCTTCTCAAACATGGG 181 | HISEQ:142:C5822ANXX:3:1211:18522:54773,46929987,46930087,100M,60,TCAGGGTCGTTAGCACGAATCTTTGCCACCACCGACCCCACTGGGTTGTTCTCCTCAACAAACAGCTCCAGTTCGTCCTTCTCAAACATGGGGGCATTGT 182 | HISEQ:142:C5822ANXX:3:2114:19455:45093,46929987,46930087,100M,60,TCAGGGTCGTTAGCACGAATCTTTGCCACCGCCGACCCCACTGGGTTGTTCTCCTCAACAAACAGCTCCAGTTCGTCCTTCTCAAACATGGGGGCATTGT 183 | HISEQ:142:C5822ANXX:4:2115:9153:21593,46929994,46930094,100M,60,CGTTAGCACGAATCTTTGCCACCACCGACCCCACTGGGTTGTTCTCCTCAACAAACAGCTCCAGTTCGTCCTTCTCAAACATGGGGGCATTGTCATTAAT 184 | HISEQ:142:C5822ANXX:4:1212:15644:87227,46929995,46930095,100M,60,GTTAGCACGTATGTTTGCCACCACCGACCCCACTGAGTTGTTCTCCTCAACAAACAGCTCCAGTTCGTGCTTCTCAAACATGGGGGCAGTGTCATTAATG 185 | HISEQ:142:C5822ANXX:3:1103:4717:26369,46929997,46930097,100M,60,TAGCACGAATCTTTGCCACCACCGACCCCACTGGGTTGTTCTCCTCAACAAACAGCTCCAGTTCGTCCTTCTCAAACATGGGGGCATTGTCATTAATGTC 186 | 187 | 188 | Write a bam file consisting of reads with mapping quality >=30 and 189 | overlapping a certain locus: 190 | 191 | :: 192 | 193 | $ varlens-reads test/data/CELSR1/bams/bam_0.bam \ 194 | --min-mapping-quality 30 \ 195 | --locus 22:46932040-46932050 \ 196 | --out /tmp/result.bam 197 | 198 | Write a bam file consisting of reads overlapping variants from a VCF: 199 | 200 | :: 201 | 202 | $ varlens-reads test/data/CELSR1/bams/bam_0.bam \ 203 | --variants test/data/CELSR1/vcfs/vcf_1.vcf \ 204 | --out /tmp/result.bam 205 | 206 | Print just the header for a BAM in csv format: 207 | 208 | :: 209 | 210 | $ varlens-reads test/data/CELSR1/bams/bam_0.bam --header 211 | 212 | varlens-allele-support 213 | ---------------------- 214 | 215 | Given one or more BAMs and some genomic sites to consider, write a csv file 216 | giving counts of reads supporting each allele at each site for each BAM. 217 | 218 | The genomic sites to consider may be specified by locus (--locus option), or via 219 | one or more VCF files. 220 | 221 | The positions outputted by this command are in *interbase coordinates*, i.e. 222 | starting at 0, inclusive on first index, exclusive on second (as opposed to 223 | the one-based inclusive coordinates used in VCF files). 
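For example, the one-based inclusive locus ``22:46931061`` used below
corresponds to the interbase interval ``(46931060, 46931061)``. The conversion
is simple arithmetic; here is a minimal illustrative helper (hypothetical, not
part of the varlens API)::

    def inclusive_to_interbase(start, end):
        # 1-based inclusive [start, end] -> 0-based half-open [start - 1, end)
        return start - 1, end

    # 22:46931061 (inclusive) becomes interbase (46931060, 46931061), matching
    # the interbase_start/interbase_end columns in the output below.
    assert inclusive_to_interbase(46931061, 46931061) == (46931060, 46931061)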
224 | 225 | Examples 226 | ````````````` 227 | 228 | :: 229 | 230 | varlens-allele-support \ 231 | --reads test/data/CELSR1/bams/bam_1.bam \ 232 | --locus 22:46931061 22:46931063 233 | 234 | source,contig,interbase_start,interbase_end,allele,count 235 | bam_1.bam,22,46931060,46931061,,1 236 | bam_1.bam,22,46931060,46931061,G,329 237 | bam_1.bam,22,46931062,46931063,A,327 238 | bam_1.bam,22,46931062,46931063,AC,1 239 | bam_1.bam,22,46931062,46931063,AG,2 240 | 241 | Note on coordinate systems 242 | ----------------------------------- 243 | 244 | ``varlens`` uses 0-based half-open coordinates internally. Many tools 245 | (including samtools and VCF files) use inclusive 1-based coordinates. We try to 246 | keep the confusion to a minimum by using the term "interbase" whenever we're 247 | using 0-based half open coordinates and "inclusive" when we're using 1-based 248 | inclusive coordinates. 249 | 250 | One particularly sticky place this comes up is when specifying loci on the 251 | commandline using e.g. ``--locus chr22:43243-43244``. To maintain consistency 252 | with the most common other tools, when you specify a locus like 253 | ``chr22:10-20``, we interpret that as a 1-based inclusive coordinate. To 254 | specify 0-based half-open coordinates, use this syntax: ``chr22/11-20`` (i.e. a 255 | slash instead of a colon). 256 | 257 | See this `blog post `_ 258 | for more details on coordinate systems. 259 | 260 | .. Documentation 261 | ------------- 262 | The docs are just this readme and the commandline tool help. 263 | They are available here: http://openvax.github.io/varlens/docs/html 264 | 265 | 266 | -------------------------------------------------------------------------------- /deploy-gh-pages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Adapted from https://github.com/w3ctag/promises-guide/blob/master/deploy-gh-pages.sh 4 | 5 | set -e 6 | 7 | pip install Sphinx 8 | 9 | cd docs 10 | make clean 11 | make setup 12 | make rst 13 | make html 14 | 15 | cd _build 16 | 17 | mkdir docs 18 | mv html docs 19 | 20 | touch .nojekyll 21 | 22 | git init 23 | git config user.name "Travis-CI" 24 | git config user.email "travis@w3ctag.org" 25 | git add . 26 | git commit -m "Deploy to GitHub Pages" 27 | git push --force --quiet "https://${GH_TOKEN}@${GH_REF}" master:gh-pages > /dev/null 2>&1 28 | 29 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext rst view setup 23 | 24 | # Added by Tim: 25 | rst: 26 | sphinx-apidoc -T -f -o . ../varlens ../varlens/commands 27 | 28 | view: 29 | open _build/html/index.html 30 | 31 | setup: 32 | pip install Sphinx sphinxcontrib-autoprogram sphinx-rtd-theme sphinxcontrib-autorun2 sphinxcontrib-programoutput numpydoc 33 | 34 | help: 35 | @echo "Please use \`make ' where is one of" 36 | @echo " html to make standalone HTML files" 37 | @echo " dirhtml to make HTML files named index.html in directories" 38 | @echo " singlehtml to make a single large HTML file" 39 | @echo " pickle to make pickle files" 40 | @echo " json to make JSON files" 41 | @echo " htmlhelp to make HTML files and a HTML help project" 42 | @echo " qthelp to make HTML files and a qthelp project" 43 | @echo " applehelp to make an Apple Help Book" 44 | @echo " devhelp to make HTML files and a Devhelp project" 45 | @echo " epub to make an epub" 46 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 47 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 48 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 49 | @echo " text to make text files" 50 | @echo " man to make manual pages" 51 | @echo " texinfo to make Texinfo files" 52 | @echo " info to make Texinfo files and run them through makeinfo" 53 | @echo " gettext to make PO message catalogs" 54 | @echo " changes to make an overview of all changed/added/deprecated items" 55 | @echo " xml to make Docutils-native XML files" 56 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 57 | @echo " linkcheck to check all external links for integrity" 58 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 59 | @echo " coverage to run coverage check of the documentation (if enabled)" 60 | 61 | clean: 62 | rm -rf $(BUILDDIR)/* 63 | 64 | html: 65 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 66 | @echo 67 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 68 | 69 | dirhtml: 70 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 71 | @echo 72 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 73 | 74 | singlehtml: 75 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 76 | @echo 77 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 78 | 79 | pickle: 80 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 81 | @echo 82 | @echo "Build finished; now you can process the pickle files." 83 | 84 | json: 85 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 86 | @echo 87 | @echo "Build finished; now you can process the JSON files." 88 | 89 | htmlhelp: 90 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 91 | @echo 92 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 93 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
94 | 95 | qthelp: 96 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 97 | @echo 98 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 99 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 100 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/varlens.qhcp" 101 | @echo "To view the help file:" 102 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/varlens.qhc" 103 | 104 | applehelp: 105 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 106 | @echo 107 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 108 | @echo "N.B. You won't be able to view it unless you put it in" \ 109 | "~/Library/Documentation/Help or install it in your application" \ 110 | "bundle." 111 | 112 | devhelp: 113 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 114 | @echo 115 | @echo "Build finished." 116 | @echo "To view the help file:" 117 | @echo "# mkdir -p $$HOME/.local/share/devhelp/varlens" 118 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/varlens" 119 | @echo "# devhelp" 120 | 121 | epub: 122 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 123 | @echo 124 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 125 | 126 | latex: 127 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 128 | @echo 129 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 130 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 131 | "(use \`make latexpdf' here to do that automatically)." 132 | 133 | latexpdf: 134 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 135 | @echo "Running LaTeX files through pdflatex..." 136 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 137 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 138 | 139 | latexpdfja: 140 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 141 | @echo "Running LaTeX files through platex and dvipdfmx..." 142 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 143 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 144 | 145 | text: 146 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 147 | @echo 148 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 149 | 150 | man: 151 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 152 | @echo 153 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 154 | 155 | texinfo: 156 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 157 | @echo 158 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 159 | @echo "Run \`make' in that directory to run these through makeinfo" \ 160 | "(use \`make info' here to do that automatically)." 161 | 162 | info: 163 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 164 | @echo "Running Texinfo files through makeinfo..." 165 | make -C $(BUILDDIR)/texinfo info 166 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 167 | 168 | gettext: 169 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 170 | @echo 171 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 172 | 173 | changes: 174 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 175 | @echo 176 | @echo "The overview file is in $(BUILDDIR)/changes." 177 | 178 | linkcheck: 179 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 180 | @echo 181 | @echo "Link check complete; look for any errors in the above output " \ 182 | "or in $(BUILDDIR)/linkcheck/output.txt." 
183 | 184 | doctest: 185 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 186 | @echo "Testing of doctests in the sources finished, look at the " \ 187 | "results in $(BUILDDIR)/doctest/output.txt." 188 | 189 | coverage: 190 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 191 | @echo "Testing of coverage in the sources finished, look at the " \ 192 | "results in $(BUILDDIR)/coverage/python.txt." 193 | 194 | xml: 195 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 196 | @echo 197 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 198 | 199 | pseudoxml: 200 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 201 | @echo 202 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 203 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # varlens documentation build configuration file, created by 4 | # sphinx-quickstart on Wed Jun 10 19:36:39 2015. 5 | # Copied by Tim from sefara project. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | import sys 17 | import os 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | sys.path.insert(0, os.path.abspath('.')) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | #needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = [ 33 | 'sphinx.ext.autodoc', 34 | 'sphinx.ext.autosummary', 35 | 'sphinx.ext.viewcode', 36 | 'numpydoc', 37 | 'sphinxcontrib.programoutput', 38 | 'sphinxcontrib.autorun2', 39 | 'sphinxcontrib.autoprogram', 40 | ] 41 | 42 | # Added by Tim 43 | # http://stackoverflow.com/questions/12206334/sphinx-autosummary-toctree-contains-reference-to-nonexisting-document-warnings 44 | numpydoc_show_class_members = False 45 | 46 | # Add any paths that contain templates here, relative to this directory. 47 | templates_path = ['_templates'] 48 | 49 | # The suffix(es) of source filenames. 50 | # You can specify multiple suffix as a list of string: 51 | # source_suffix = ['.rst', '.md'] 52 | source_suffix = '.rst' 53 | 54 | # The encoding of source files. 55 | #source_encoding = 'utf-8-sig' 56 | 57 | # The master toctree document. 58 | master_doc = 'index' 59 | 60 | # General information about the project. 61 | project = u'varlens' 62 | copyright = u'2016, Tim O\'Donnell' 63 | author = u'Tim O\'Donnell' 64 | 65 | # The version info for the project you're documenting, acts as replacement for 66 | # |version| and |release|, also used in various other places throughout the 67 | # built documents. 68 | # 69 | # The short X.Y version. 70 | version = '0.0.1' 71 | # The full version, including alpha/beta/rc tags. 
72 | release = '0.0.1' 73 | 74 | # The language for content autogenerated by Sphinx. Refer to documentation 75 | # for a list of supported languages. 76 | # 77 | # This is also used if you do content translation via gettext catalogs. 78 | # Usually you set "language" from the command line for these cases. 79 | language = None 80 | 81 | # There are two options for replacing |today|: either, you set today to some 82 | # non-false value, then it is used: 83 | #today = '' 84 | # Else, today_fmt is used as the format for a strftime call. 85 | #today_fmt = '%B %d, %Y' 86 | 87 | # List of patterns, relative to source directory, that match files and 88 | # directories to ignore when looking for source files. 89 | exclude_patterns = ['_build'] 90 | 91 | # The reST default role (used for this markup: `text`) to use for all 92 | # documents. 93 | default_role = 'any' 94 | 95 | # If true, '()' will be appended to :func: etc. cross-reference text. 96 | #add_function_parentheses = True 97 | 98 | # If true, the current module name will be prepended to all description 99 | # unit titles (such as .. function::). 100 | #add_module_names = True 101 | 102 | # If true, sectionauthor and moduleauthor directives will be shown in the 103 | # output. They are ignored by default. 104 | #show_authors = False 105 | 106 | # The name of the Pygments (syntax highlighting) style to use. 107 | pygments_style = 'sphinx' 108 | 109 | # A list of ignored prefixes for module index sorting. 110 | #modindex_common_prefix = [] 111 | 112 | # If true, keep warnings as "system message" paragraphs in the built documents. 113 | #keep_warnings = False 114 | 115 | # If true, `todo` and `todoList` produce output, else they produce nothing. 116 | todo_include_todos = False 117 | 118 | 119 | # -- Options for HTML output ---------------------------------------------- 120 | 121 | # The theme to use for HTML and HTML Help pages. See the documentation for 122 | # a list of builtin themes. 123 | html_theme = 'sphinx_rtd_theme' 124 | 125 | # Theme options are theme-specific and customize the look and feel of a theme 126 | # further. For a list of options available for each theme, see the 127 | # documentation. 128 | #html_theme_options = {} 129 | 130 | # Add any paths that contain custom themes here, relative to this directory. 131 | #html_theme_path = [] 132 | 133 | # The name for this set of Sphinx documents. If None, it defaults to 134 | # " v documentation". 135 | #html_title = None 136 | 137 | # A shorter title for the navigation bar. Default is the same as html_title. 138 | #html_short_title = None 139 | 140 | # The name of an image file (relative to this directory) to place at the top 141 | # of the sidebar. 142 | #html_logo = None 143 | 144 | # The name of an image file (within the static path) to use as favicon of the 145 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 146 | # pixels large. 147 | #html_favicon = None 148 | 149 | # Add any paths that contain custom static files (such as style sheets) here, 150 | # relative to this directory. They are copied after the builtin static files, 151 | # so a file named "default.css" will overwrite the builtin "default.css". 152 | html_static_path = ['_static'] 153 | 154 | # Add any extra paths that contain custom files (such as robots.txt or 155 | # .htaccess) here, relative to this directory. These files are copied 156 | # directly to the root of the documentation. 
157 | #html_extra_path = [] 158 | 159 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 160 | # using the given strftime format. 161 | #html_last_updated_fmt = '%b %d, %Y' 162 | 163 | # If true, SmartyPants will be used to convert quotes and dashes to 164 | # typographically correct entities. 165 | #html_use_smartypants = True 166 | 167 | # Custom sidebar templates, maps document names to template names. 168 | #html_sidebars = {} 169 | 170 | # Additional templates that should be rendered to pages, maps page names to 171 | # template names. 172 | #html_additional_pages = {} 173 | 174 | # If false, no module index is generated. 175 | #html_domain_indices = True 176 | 177 | # If false, no index is generated. 178 | #html_use_index = True 179 | 180 | # If true, the index is split into individual pages for each letter. 181 | #html_split_index = False 182 | 183 | # If true, links to the reST sources are added to the pages. 184 | #html_show_sourcelink = True 185 | 186 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 187 | #html_show_sphinx = True 188 | 189 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 190 | #html_show_copyright = True 191 | 192 | # If true, an OpenSearch description file will be output, and all pages will 193 | # contain a tag referring to it. The value of this option must be the 194 | # base URL from which the finished HTML is served. 195 | #html_use_opensearch = '' 196 | 197 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 198 | #html_file_suffix = None 199 | 200 | # Language to be used for generating the HTML full-text search index. 201 | # Sphinx supports the following languages: 202 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 203 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 204 | #html_search_language = 'en' 205 | 206 | # A dictionary with options for the search language support, empty by default. 207 | # Now only 'ja' uses this config value 208 | #html_search_options = {'type': 'default'} 209 | 210 | # The name of a javascript file (relative to the configuration directory) that 211 | # implements a search results scorer. If empty, the default will be used. 212 | #html_search_scorer = 'scorer.js' 213 | 214 | # Output file base name for HTML help builder. 215 | htmlhelp_basename = 'varlensdoc' 216 | 217 | # -- Options for LaTeX output --------------------------------------------- 218 | 219 | latex_elements = { 220 | # The paper size ('letterpaper' or 'a4paper'). 221 | #'papersize': 'letterpaper', 222 | 223 | # The font size ('10pt', '11pt' or '12pt'). 224 | #'pointsize': '10pt', 225 | 226 | # Additional stuff for the LaTeX preamble. 227 | #'preamble': '', 228 | 229 | # Latex figure (float) alignment 230 | #'figure_align': 'htbp', 231 | } 232 | 233 | # Grouping the document tree into LaTeX files. List of tuples 234 | # (source start file, target name, title, 235 | # author, documentclass [howto, manual, or own class]). 236 | latex_documents = [ 237 | (master_doc, 'varlens.tex', u'varlens Documentation', 238 | u'Tim O\'Donnell', 'manual'), 239 | ] 240 | 241 | # The name of an image file (relative to this directory) to place at the top of 242 | # the title page. 243 | #latex_logo = None 244 | 245 | # For "manual" documents, if this is true, then toplevel headings are parts, 246 | # not chapters. 247 | #latex_use_parts = False 248 | 249 | # If true, show page references after internal links. 
250 | #latex_show_pagerefs = False 251 | 252 | # If true, show URL addresses after external links. 253 | #latex_show_urls = False 254 | 255 | # Documents to append as an appendix to all manuals. 256 | #latex_appendices = [] 257 | 258 | # If false, no module index is generated. 259 | #latex_domain_indices = True 260 | 261 | 262 | # -- Options for manual page output --------------------------------------- 263 | 264 | # One entry per manual page. List of tuples 265 | # (source start file, name, description, authors, manual section). 266 | man_pages = [ 267 | (master_doc, 'varlens', u'varlens Documentation', 268 | [author], 1) 269 | ] 270 | 271 | # If true, show URL addresses after external links. 272 | #man_show_urls = False 273 | 274 | 275 | # -- Options for Texinfo output ------------------------------------------- 276 | 277 | # Grouping the document tree into Texinfo files. List of tuples 278 | # (source start file, target name, title, author, 279 | # dir menu entry, description, category) 280 | texinfo_documents = [ 281 | (master_doc, 'varlens', u'varlens Documentation', 282 | author, 'varlens', 283 | 'Python commandline tools for manipulating genomic variants and NGS reads', 284 | 'Miscellaneous'), 285 | ] 286 | 287 | # Documents to append as an appendix to all manuals. 288 | #texinfo_appendices = [] 289 | 290 | # If false, no module index is generated. 291 | #texinfo_domain_indices = True 292 | 293 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 294 | #texinfo_show_urls = 'footnote' 295 | 296 | # If true, do not generate a @detailmenu in the "Top" node's menu. 297 | #texinfo_no_detailmenu = False 298 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Varlens Documentation 2 | ================================== 3 | 4 | .. include:: ../README.rst 5 | 6 | Commandline tool help 7 | ================================== 8 | 9 | varlens-variants 10 | ---------------------------------- 11 | 12 | .. command-output:: varlens-variants -h 13 | 14 | varlens-reads 15 | ---------------------------------- 16 | 17 | .. command-output:: varlens-reads -h 18 | 19 | varlens-allele-support 20 | ---------------------------------- 21 | 22 | .. command-output:: varlens-allele-support -h 23 | 24 | 25 | -------------------------------------------------------------------------------- /docs/varlens.rst: -------------------------------------------------------------------------------- 1 | varlens package 2 | =============== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | varlens.read_evidence 10 | 11 | Submodules 12 | ---------- 13 | 14 | varlens.loci_util module 15 | ------------------------ 16 | 17 | .. automodule:: varlens.loci_util 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | varlens.locus module 23 | -------------------- 24 | 25 | .. automodule:: varlens.locus 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | varlens.mhc_binding module 31 | -------------------------- 32 | 33 | .. automodule:: varlens.mhc_binding 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | varlens.read_source module 39 | -------------------------- 40 | 41 | .. automodule:: varlens.read_source 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | varlens.reads_util module 47 | ------------------------- 48 | 49 | .. 
automodule:: varlens.reads_util 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | varlens.sequence_context module 55 | ------------------------------- 56 | 57 | .. automodule:: varlens.sequence_context 58 | :members: 59 | :undoc-members: 60 | :show-inheritance: 61 | 62 | varlens.support module 63 | ---------------------- 64 | 65 | .. automodule:: varlens.support 66 | :members: 67 | :undoc-members: 68 | :show-inheritance: 69 | 70 | varlens.util module 71 | ------------------- 72 | 73 | .. automodule:: varlens.util 74 | :members: 75 | :undoc-members: 76 | :show-inheritance: 77 | 78 | varlens.variant_includes module 79 | ------------------------------- 80 | 81 | .. automodule:: varlens.variant_includes 82 | :members: 83 | :undoc-members: 84 | :show-inheritance: 85 | 86 | varlens.variants_util module 87 | ---------------------------- 88 | 89 | .. automodule:: varlens.variants_util 90 | :members: 91 | :undoc-members: 92 | :show-inheritance: 93 | 94 | 95 | Module contents 96 | --------------- 97 | 98 | .. automodule:: varlens 99 | :members: 100 | :undoc-members: 101 | :show-inheritance: 102 | -------------------------------------------------------------------------------- /lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | 4 | 5 | # disabling several categories of errors due to false positives in pylint, 6 | # see these issues: 7 | # - https://bitbucket.org/logilab/pylint/issues/701/false-positives-with-not-an-iterable-and 8 | # - https://bitbucket.org/logilab/pylint/issues/58 9 | 10 | find varlens/ -name '*.py' \ 11 | | xargs pylint \ 12 | --errors-only \ 13 | --disable=unsubscriptable-object,not-an-iterable,no-member,invalid-unary-operand-type \ 14 | && \ 15 | echo 'Passes pylint check' \ 16 | && \ 17 | ruff check varlens/ \ 18 | && \ 19 | echo "Passes ruff check" 20 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License.
12 | 13 | from __future__ import print_function 14 | import os 15 | import re 16 | 17 | from setuptools import setup 18 | 19 | current_directory = os.path.dirname(__file__) 20 | 21 | with open('varlens/version.py', 'r') as f: 22 | version = re.search( 23 | r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', 24 | f.read(), 25 | re.MULTILINE).group(1) 26 | 27 | if __name__ == '__main__': 28 | setup( 29 | name='varlens', 30 | packages=["varlens", "varlens.commands", "varlens.read_evidence"], 31 | version=version, 32 | description=( 33 | "commandline manipulation of genomic variants and NGS reads"), 34 | long_description=open('README.rst').read(), 35 | url="https://github.com/openvax/varlens", 36 | author="Tim O'Donnell", 37 | author_email="timodonnell@gmail.com", 38 | license="http://www.apache.org/licenses/LICENSE-2.0.html", 39 | entry_points={ 40 | 'console_scripts': [ 41 | 'varlens-allele-support = varlens.commands.allele_support:run', 42 | 'varlens-variants = varlens.commands.variants:run', 43 | 'varlens-reads = varlens.commands.reads:run', 44 | ], 45 | }, 46 | classifiers=[ 47 | 'Development Status :: 3 - Alpha', 48 | 'Environment :: Console', 49 | 'Operating System :: OS Independent', 50 | 'Intended Audience :: Science/Research', 51 | 'License :: OSI Approved :: Apache Software License', 52 | 'Programming Language :: Python', 53 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 54 | ], 55 | install_requires=[ 56 | 'cython>=0.21', 57 | 'numpy', 58 | 'intervaltree', 59 | 'pysam>=0.13', 60 | 'typechecks', 61 | 'varcode', 62 | 'pyfaidx', 63 | 'mhctools', 64 | 'topiary', 65 | ], 66 | ) 67 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | pytest --cov=varlens/ --cov-report=term-missing tests 2 | 3 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Utility functions for tests. 3 | ''' 4 | 5 | import sys 6 | import os 7 | import tempfile 8 | from contextlib import contextmanager 9 | 10 | import pandas 11 | 12 | if sys.version_info[0] < 3: 13 | from StringIO import StringIO 14 | else: 15 | from io import StringIO 16 | 17 | def data_path(name): 18 | ''' 19 | Return the absolute path to a file in the test/data directory. 20 | The name specified should be relative to test/data. 21 | ''' 22 | return os.path.join(os.path.dirname(__file__), "data", name) 23 | 24 | class Capturing(list): 25 | def __enter__(self): 26 | self._stdout = sys.stdout 27 | sys.stdout = self._stringio = StringIO() 28 | return self 29 | 30 | def __exit__(self, *args): 31 | self.extend(self._stringio.getvalue().splitlines()) 32 | sys.stdout = self._stdout 33 | 34 | def run_and_parse_csv(function, *args): 35 | with Capturing() as output: 36 | function(*args) 37 | try: 38 | result = pandas.read_csv(StringIO("\n".join(output))) 39 | except: 40 | print("Couldn't parse csv. Function: %s. 
Args: %s.\nOutput:\n%s" 41 | % (str(function), str(args), "\n".join(output))) 42 | raise 43 | return result 44 | 45 | @contextmanager 46 | def temp_file(suffix=".csv"): 47 | fd = tempfile.NamedTemporaryFile( 48 | suffix=suffix, 49 | prefix="test_varlens_", 50 | delete=False) 51 | filename = fd.name 52 | fd.close() 53 | yield filename 54 | os.unlink(filename) 55 | 56 | def cols_concat(df, columns, delimiter="-"): 57 | assert df is not None 58 | zipped = zip(*[df[c] for c in columns]) 59 | return set([delimiter.join(str(item) for item in row) for row in zipped]) -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_0.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_0.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_0.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_0.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_1.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_1.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_1.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_1.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_10.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_10.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_10.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_10.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_11.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_11.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_11.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_11.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_12.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_12.bam 
-------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_12.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_12.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_13.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_13.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_13.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_13.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_14.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_14.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_14.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_14.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_15.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_15.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_15.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_15.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_16.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_16.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_16.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_16.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_17.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_17.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_17.bam.bai: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_17.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_18.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_18.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_18.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_18.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_19.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_19.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_19.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_19.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_2.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_2.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_2.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_2.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_20.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_20.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_20.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_20.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_21.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_21.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_21.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_21.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_22.bam: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_22.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_22.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_22.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_23.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_23.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_23.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_23.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_3.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_3.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_3.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_3.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_4.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_4.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_4.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_4.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_5.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_5.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_5.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_5.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_6.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_6.bam -------------------------------------------------------------------------------- 
/tests/data/CELSR1/bams/bam_6.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_6.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_7.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_7.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_7.bam.bai: -------------------------------------------------------------------------------- 1 | BAI] -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_8.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_8.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_8.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_8.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_9.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_9.bam -------------------------------------------------------------------------------- /tests/data/CELSR1/bams/bam_9.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/CELSR1/bams/bam_9.bam.bai -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_1.vcf: -------------------------------------------------------------------------------- 1 | ##reference=file:///hpc/users/ahujaa01/ksinai-demeter/hg19-reference-genome/hg19.fasta 2 | ##contig= 3 | chr22 21829555 rs377578228 T G . PASS DB;SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:192,7:35:199:0.035:2 0:41,1:.:41:0.024:0 4 | chr22 46931060 . A C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:65,9:17:74:0.122:2 0:15,1:.:16:0.063:0 5 | chr22 46931062 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:89,44:33:133:0.331:2 0:42,0:.:42:0.00:0 6 | chr22 50636218 . A C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:86,12:16:98:0.122:2 0:12,0:.:12:0.00:0 7 | chr22 50875933 . A C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:56,11:15:67:0.164:2 0:10,0:.:10:0.00:0 8 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_10.vcf: -------------------------------------------------------------------------------- 1 | ##reference=file:///hpc/users/ahujaa01/ksinai-demeter/hg19-reference-genome/hg19.fasta 2 | chr22 22309964 . C T . QSS_ref NT=ref;QSS=5;QSS_NT=2;SGT=CC->CT;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 1:0:0:0:0,0:1,1:0,0:0,0 258:0:0:0:0,0:178,178:0,0:80,80 3 | chr22 46931062 . G A . 
PASS NT=ref;QSS=39;QSS_NT=39;SGT=GG->AG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 45:0:0:0:0,0:0,0:45,47:0,0 140:1:0:0:25,26:0,0:113,113:1,1 4 | chr22 46931061 . G T . PASS NT=ref;QSS=39;QSS_NT=39;SGT=GG->AG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 45:0:0:0:0,0:0,0:45,47:0,0 140:1:0:0:25,26:0,0:113,113:1,1 5 | chr22 50528497 . A G . QSS_ref NT=ref;QSS=1;QSS_NT=0;SGT=AG->AG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 1:0:0:0:0,0:0,0:1,1:0,0 29:0:0:0:25,25:0,0:4,4:0,0 6 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_11.vcf: -------------------------------------------------------------------------------- 1 | ##reference=file:///hpc/users/ahujaa01/ksinai-demeter/hg19-reference-genome/hg19.fasta 2 | chr22 30507883 . A T . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0:41,1:.:42:0.024:0 0/1:61,9:17:70:0.129:2 3 | chr22 40060737 . G C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0:13,1:.:14:0.071:0 0/1:52,10:17:63:0.161:2 4 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_12.vcf: -------------------------------------------------------------------------------- 1 | ##reference=file:///hpc/users/ahujaa01/ksinai-demeter/hg19-reference-genome/hg19.fasta 2 | chr22 20387160 . G A . PASS NT=ref;QSS=15;QSS_NT=15;SGT=GG->AG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 81:0:0:0:10,10:0,0:71,71:0,0 84:1:0:0:21,22:0,0:62,62:0,0 3 | chr22 22309964 . C T . QSS_ref NT=ref;QSS=5;QSS_NT=2;SGT=CC->CT;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 1:0:0:0:0,0:1,1:0,0:0,0 293:0:0:0:0,0:190,191:0,0:103,103 4 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_13.vcf: -------------------------------------------------------------------------------- 1 | ##reference=file:///hpc/users/ahujaa01/ksinai-demeter/hg19-reference-genome/hg19.fasta 2 | chr22 20387160 . G . . QSS_ref NT=ref;QSS=2;QSS_NT=2;SGT=GG->GG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 81:0:0:0:10,10:0,0:71,71:0,0 70:0:0:0:15,15:0,0:55,55:0,0 3 | chr22 22309964 . C T . QSS_ref NT=ref;QSS=5;QSS_NT=2;SGT=CC->CT;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 1:0:0:0:0,0:1,1:0,0:0,0 232:0:0:0:0,0:159,159:0,0:73,73 4 | chr22 22576057 . C T . QSS_ref NT=ref;QSS=4;QSS_NT=1;SGT=CC->CT;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 0:0:0:0:0,0:0,0:0,0:0,0 206:0:0:0:0,0:62,62:0,0:144,145 5 | chr22 23481061 . C A . QSS_ref NT=ref;QSS=3;QSS_NT=3;SGT=CC->AC;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 18:1:0:0:0,0:17,18:0,0:0,0 126:9:0:0:8,12:109,120:0,0:0,0 6 | chr22 30507883 . A T . QSS_ref NT=ref;QSS=2;QSS_NT=2;SGT=AT->AT;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 45:0:0:0:43,43:0,0:0,0:2,2 82:4:0:0:68,70:0,1:0,0:10,11 7 | chr22 46931060 . A G . QSS_ref NT=ref;QSS=1;QSS_NT=1;SGT=AG->AG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 31:0:0:0:31,32:0,0:0,0:0,0 57:0:0:0:53,53:0,0:4,4:0,0 8 | chr22 50962223 . G A . PASS NT=ref;QSS=24;QSS_NT=24;SGT=GG->AG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 47:0:0:0:0,0:0,0:47,48:0,0 347:0:0:0:11,11:0,0:335,336:1,1 9 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_14.vcf: -------------------------------------------------------------------------------- 1 | ##reference=file:///hpc/users/ahujaa01/ksinai-demeter/hg19-reference-genome/hg19.fasta 2 | chr22 20387160 . G A . 
QSS_ref NT=ref;QSS=3;QSS_NT=3;SGT=GG->AG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 81:0:0:0:10,10:0,0:71,71:0,0 73:0:0:0:9,9:0,0:64,64:0,0 3 | chr22 20796345 . C . . QSS_ref NT=ref;QSS=3;QSS_NT=3;SGT=CC->CC;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 21:0:0:0:0,0:21,21:0,0:0,0 80:0:0:0:0,0:76,76:0,0:4,4 4 | chr22 21174794 . C A . QSS_ref NT=ref;QSS=3;QSS_NT=3;SGT=CC->AC;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 67:0:0:0:0,0:67,67:0,0:0,0 160:4:0:0:11,15:145,145:0,0:0,0 5 | chr22 21174795 . C A . QSS_ref NT=ref;QSS=9;QSS_NT=9;SGT=CC->AC;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 68:0:0:0:0,0:68,68:0,0:0,0 160:4:0:0:15,18:141,142:0,0:0,0 6 | chr22 22309964 . C T . QSS_ref NT=ref;QSS=5;QSS_NT=2;SGT=CC->CT;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 1:0:0:0:0,0:1,1:0,0:0,0 215:1:0:0:0,0:144,144:0,0:70,71 7 | chr22 23481104 . C . . QSS_ref NT=ref;QSS=1;QSS_NT=1;SGT=CC->CC;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 10:2:0:0:0,0:8,10:0,0:0,0 169:14:0:0:11,17:144,156:0,0:0,0 8 | chr22 30507883 . A T . QSS_ref NT=ref;QSS=1;QSS_NT=1;SGT=AT->AT;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 45:0:0:0:43,43:0,0:0,0:2,2 109:3:0:0:97,98:0,0:0,0:9,12 9 | chr22 42970799 . A G . QSS_ref NT=ref;QSS=1;QSS_NT=1;SGT=AG->AG;SOMATIC;TQSS=1;TQSS_NT=1 DP:FDP:SDP:SUBDP:AU:CU:GU:TU 31:0:0:0:31,32:0,0:0,0:0,0 81:0:0:0:76,76:0,0:5,5:0,0 10 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_2.vcf: -------------------------------------------------------------------------------- 1 | ##reference=file:///hpc/users/ahujaa01/ksinai-demeter/hg19-reference-genome/hg19.fasta 2 | chr22 45309893 . T G . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:52,10:13:62:0.161:2 0:27,1:.:28:0.036:0 3 | chr22 46931062 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:94,53:32:147:0.361:2 0:42,0:.:42:0.00:0 4 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_3.vcf: -------------------------------------------------------------------------------- 1 | ##contig= 2 | chr22 20390444 . A G . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:76,4:32:80:0.050:2 0:24,0:.:24:0.00:0 3 | chr22 25016296 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:95,4:32:99:0.040:2 0:43,0:.:43:0.00:0 4 | chr22 25046004 . T G . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:138,7:30:146:0.048:2 0:62,1:.:63:0.016:0 5 | chr22 46931061 . G C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:211,8:29:220:0.037:2 0:97,1:.:99:0.010:0 6 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_4.vcf: -------------------------------------------------------------------------------- 1 | chr22 23481083 . C A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:345,25:17:372:0.068:2 0:14,0:.:14:0.00:0 2 | chr22 24106576 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:952,42:33:995:0.042:2 0:27,0:.:27:0.00:0 3 | chr22 40257775 . T G . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:132,13:16:145:0.090:2 0:88,1:.:89:0.011:0 4 | chr22 46931062 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:254,243:32:497:0.489:2 0:42,0:.:42:0.00:0 5 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_5.vcf: -------------------------------------------------------------------------------- 1 | chr22 24106576 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:910,45:34:956:0.047:2 0:27,0:.:27:0.00:0 2 | chr22 29939378 . A G . 
PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:211,23:13:234:0.098:2 0:38,1:.:39:0.026:0 3 | chr22 38051393 . A C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:19,4:25:23:0.174:2 0:9,0:.:9:0.00:0 4 | chr22 40060737 . G C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:154,17:14:171:0.099:2 0:13,1:.:14:0.071:0 5 | chr22 43617352 . T G . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:209,22:12:231:0.095:2 0:19,0:.:19:0.00:0 6 | chr22 46931062 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:260,249:33:509:0.489:2 0:42,0:.:42:0.00:0 7 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_6.vcf: -------------------------------------------------------------------------------- 1 | chr22 40060737 . G C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:65,11:14:77:0.145:2 0:13,1:.:14:0.071:0 2 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_7.vcf: -------------------------------------------------------------------------------- 1 | chr22 46931062 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:108,25:32:134:0.188:2 0:42,0:.:42:0.00:0 2 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_8.vcf: -------------------------------------------------------------------------------- 1 | ##contig= 2 | chr22 21053057 . C T . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:129,9:29:138:0.065:2 0:49,1:.:50:0.020:0 3 | chr22 22974759 . G C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:178,8:32:186:0.043:2 0:80,1:.:75:0.012:0 4 | chr22 23241800 . T G . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:132,5:32:137:0.036:2 0:61,0:.:61:0.00:0 5 | chr22 23241804 . A G . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:131,5:33:137:0.037:2 0:62,0:.:62:0.00:0 6 | chr22 24655840 . C T . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:98,6:30:104:0.058:2 0:69,1:.:37:0.014:0 7 | chr22 25044103 . G T . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:132,6:29:138:0.043:2 0:73,1:.:67:0.014:0 8 | chr22 25044108 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:135,6:30:142:0.043:2 0:73,1:.:66:0.014:0 9 | chr22 25574241 . G C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:104,7:31:111:0.063:2 0:34,0:.:34:0.00:0 10 | chr22 40060742 . A C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:33,6:18:39:0.154:2 0:16,0:.:16:0.00:0 11 | chr22 46931062 . G A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:108,7:27:115:0.061:2 0:42,0:.:42:0.00:0 12 | -------------------------------------------------------------------------------- /tests/data/CELSR1/vcfs/vcf_9.vcf: -------------------------------------------------------------------------------- 1 | chr22 21174795 . C A . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:140,13:17:153:0.085:2 0:61,0:.:61:0.00:0 2 | chr22 29939378 . A G . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:74,10:15:84:0.119:2 0:38,1:.:39:0.026:0 3 | chr22 38037134 . A C . PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:73,9:19:83:0.110:2 0:15,1:.:16:0.063:0 4 | chr22 45309893 . T G . 
PASS SOMATIC;VT=SNP GT:AD:BQ:DP:FA:SS 0/1:80,14:12:94:0.149:2 0:27,1:.:28:0.036:0 5 | -------------------------------------------------------------------------------- /tests/data/gatk_mini_bundle_extract.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/gatk_mini_bundle_extract.bam -------------------------------------------------------------------------------- /tests/data/gatk_mini_bundle_extract.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/gatk_mini_bundle_extract.bam.bai -------------------------------------------------------------------------------- /tests/data/rna_chr17_41244936.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/rna_chr17_41244936.bam -------------------------------------------------------------------------------- /tests/data/rna_chr17_41244936.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varlens/e57f2a383c659fc1ed43ce008330bdb9ef77d80c/tests/data/rna_chr17_41244936.bam.bai -------------------------------------------------------------------------------- /tests/test_allele_support.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import absolute_import 16 | 17 | import logging 18 | 19 | from nose.tools import eq_ 20 | 21 | from varlens.commands import allele_support 22 | 23 | from . 
import data_path, run_and_parse_csv, cols_concat 24 | 25 | def run(args): 26 | logging.info("Running with args: " + ' '.join(args)) 27 | return run_and_parse_csv(allele_support.run, args) 28 | 29 | expected_cols = [ 30 | "contig", "interbase_start", "interbase_end", "allele", "count", 31 | ] 32 | 33 | def test_basic(): 34 | result = run([ 35 | "--reads", data_path("CELSR1/bams/bam_5.bam"), 36 | "--locus", "chr22:46929963", "chr22:46929964", 37 | ]) 38 | eq_(cols_concat(result, expected_cols), 39 | {"22-46929962-46929963-C-60", "22-46929963-46929964-A-81"}) 40 | 41 | result = run([ 42 | "--reads", data_path("CELSR1/bams/bam_5.bam"), 43 | "--locus", "chr22:46929963", "chr22:46929964", 44 | "--is-reverse" 45 | ]) 46 | eq_(cols_concat(result, expected_cols), 47 | {"22-46929962-46929963-C-37", "22-46929963-46929964-A-47"}) 48 | 49 | result = run([ 50 | "--reads", data_path("gatk_mini_bundle_extract.bam"), 51 | "--locus", "chr20:10008951", 52 | "--is-reverse", 53 | ]) 54 | eq_(cols_concat(result, expected_cols), 55 | {"20-10008950-10008951-C-1"}) 56 | 57 | def test_simple(): 58 | result = run([ 59 | "--reads", data_path("CELSR1/bams/bam_0.bam"), 60 | "--genome", "b37", 61 | "--variants", data_path("CELSR1/vcfs/vcf_1.vcf"), 62 | ]) 63 | eq_(cols_concat( 64 | result, 65 | ["contig", "interbase_start", "interbase_end", "allele", "count"]), 66 | { 67 | '22-50636217-50636218-N-0', 68 | '22-50875932-50875933-N-0', 69 | '22-21829554-21829555-N-0', 70 | "22-46931059-46931060-A-50", 71 | "22-46931061-46931062-G-51", 72 | }) 73 | 74 | pick_one_variant = [ 75 | ["--ref", "G"], 76 | ["--alt", "A"], 77 | ["--variant-locus", "22/46931061"], 78 | ["--variant-locus", "22/46931061-46931062"], 79 | ] 80 | for variant_filter in pick_one_variant: 81 | result = run([ 82 | "--reads", data_path("CELSR1/bams/bam_0.bam"), 83 | "--genome", "b37", 84 | "--variants", data_path("CELSR1/vcfs/vcf_1.vcf"), 85 | ] + variant_filter) 86 | yield ( 87 | eq_, 88 | cols_concat( 89 | result, ["contig", "interbase_start", "interbase_end"]), 90 | {"22-46931061-46931062"}) 91 | 92 | -------------------------------------------------------------------------------- /tests/test_read_evidence.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from __future__ import absolute_import 15 | 16 | import collections 17 | from nose.tools import eq_, assert_raises 18 | 19 | from pysam import Samfile 20 | 21 | from varcode import Variant as VarcodeVariant 22 | 23 | from . 
import data_path 24 | 25 | from varlens.locus import Locus 26 | from varlens.read_evidence import PileupCollection 27 | 28 | Variant = collections.namedtuple("Variant", "locus ref alt") 29 | 30 | def filtered_read_names(filtered_evidence): 31 | assert filtered_evidence.parent is not None 32 | full = set(filtered_evidence.parent.read_attribute('query_name')) 33 | filtered = set(filtered_evidence.read_attribute('query_name')) 34 | assert filtered - full == set() 35 | return full - filtered 36 | 37 | def test_read_evidence_rna1_single_base_loci(): 38 | loci = [ 39 | Locus.from_inclusive_coordinates("17", 41244936, 41244936), # 0 40 | Locus.from_inclusive_coordinates("17", 41244937, 41244937), # 1 41 | Locus.from_inclusive_coordinates("17", 41244935, 41244935), # 2 42 | Locus.from_inclusive_coordinates("17", 41244933, 41244933), # 3 43 | Locus.from_inclusive_coordinates("17", 41244853, 41244853), # 4 44 | Locus.from_inclusive_coordinates("17", 41244857, 41244857), # 5 45 | Locus.from_inclusive_coordinates("17", 41244864, 41244864), # 6 46 | Locus.from_inclusive_coordinates("17", 41244879, 41244879), # 7 47 | Locus.from_inclusive_coordinates("17", 41244901, 41244901), # 8 48 | Locus.from_inclusive_coordinates("17", 41244910, 41244910), # 9 49 | Locus.from_inclusive_coordinates("17", 41244917, 41244917), # 10 50 | Locus.from_inclusive_coordinates("17", 41244972, 41244972), # 11 51 | Locus.from_inclusive_coordinates("17", 41244973, 41244973), # 12 52 | Locus.from_inclusive_coordinates("17", 41245026, 41245026), # 13 53 | Locus.from_inclusive_coordinates("17", 41245027, 41245027), # 14 54 | Locus.from_inclusive_coordinates("17", 41245019, 41245019), # 15 55 | Locus.from_inclusive_coordinates("17", 41245018, 41245018), # 16 56 | ] 57 | evidence = PileupCollection.from_bam( 58 | data_path("rna_chr17_41244936.bam"), loci) 59 | 60 | eq_(evidence.allele_summary(loci[0]), [("A", 11), ("G", 7)]) 61 | eq_(evidence.allele_summary(loci[1]), [("G", 17), ("A", 1)]) 62 | eq_(evidence.allele_summary(loci[2]), [("C", 18)]) 63 | eq_(evidence.allele_summary(loci[3]), [("A", 17), ("G", 1)]) 64 | eq_(evidence.allele_summary(loci[4]), [("C", 1)]) 65 | eq_(evidence.allele_summary(loci[5]), [("T", 2)]) 66 | eq_(evidence.allele_summary(loci[6]), [("T", 4)]) 67 | eq_(evidence.allele_summary(loci[7]), [("C", 8)]) 68 | eq_(evidence.allele_summary(loci[8]), [("C", 8)]) 69 | eq_(evidence.allele_summary(loci[9]), [("C", 9)]) 70 | eq_(evidence.allele_summary(loci[10]), [("A", 10)]) 71 | eq_(evidence.allele_summary(loci[11]), [("T", 11)]) 72 | eq_(evidence.allele_summary(loci[12]), [("T", 11)]) 73 | eq_(evidence.allele_summary(loci[13]), [("C", 1)]) 74 | eq_(evidence.allele_summary(loci[14]), [("G", 1)]) 75 | eq_(evidence.allele_summary(loci[15]), [("T", 8)]) 76 | eq_(evidence.allele_summary(loci[16]), [("T", 8)]) 77 | 78 | def test_read_evidence_rna1_multi_base_loci(): 79 | loci = [ 80 | Locus.from_inclusive_coordinates("17", 41244853, 41244854), # 0 81 | Locus.from_inclusive_coordinates("17", 41244853, 41244857), # 1 82 | Locus.from_inclusive_coordinates("17", 41244854, 41244857), # 2 83 | Locus.from_inclusive_coordinates("17", 41244852, 41244857), # 3 84 | Locus.from_inclusive_coordinates("17", 41244933, 41244936), # 4 85 | Locus.from_inclusive_coordinates("17", 41244933, 41244937), # 5 86 | Locus.from_inclusive_coordinates("17", 41244971, 41244973), # 6 87 | Locus.from_inclusive_coordinates("17", 41265063, 41265067), # 7 88 | ] 89 | evidence = PileupCollection.from_bam( 90 | data_path("rna_chr17_41244936.bam"), loci) 91 | 
eq_(evidence.allele_summary(loci[0]), [("CT", 1)]) 92 | eq_(evidence.allele_summary(loci[1]), [("CTTTT", 1)]) 93 | eq_(evidence.allele_summary(loci[2]), [("TTTT", 1)]) 94 | eq_(evidence.allele_summary(loci[3]), []) 95 | eq_(evidence.allele_summary(loci[4]), 96 | [("AACA", 11), ("AACG", 6), ("GACG", 1)]) 97 | eq_(evidence.allele_summary(loci[5]), 98 | [("AACAG", 10), ("AACGG", 6), ("AACAA", 1), ("GACGG", 1)]) 99 | eq_(evidence.allele_summary(loci[6]), [("ATT", 11)]) 100 | eq_(evidence.allele_summary(loci[7]), [("ACCCG", 1)]) 101 | 102 | def test_read_evidence_gatk_mini_bundle_extract(): 103 | loci = [ 104 | Locus.from_inclusive_coordinates("20", 9999996, 9999996), # 0 105 | Locus.from_inclusive_coordinates("20", 10260442), # 1 106 | Locus.from_inclusive_coordinates("20", 10006823), # 2 107 | Locus.from_inclusive_coordinates("20", 10006819, 10006823), # 3 108 | Locus.from_inclusive_coordinates("20", 10006819, 10006825), # 4 109 | Locus.from_inclusive_coordinates("20", 10006822, 10006827), # 5 110 | Locus.from_inclusive_coordinates("20", 10007175), # 6 111 | Locus.from_inclusive_coordinates("20", 10007174, 10007176), # 7 112 | Locus.from_inclusive_coordinates("20", 1, 3), # 8 113 | Locus.from_inclusive_coordinates("20", 10008796), # 9 114 | Locus.from_inclusive_coordinates("20", 10008921), # 10 115 | ] 116 | handle = Samfile(data_path("gatk_mini_bundle_extract.bam")) 117 | evidence = PileupCollection.from_bam(handle, loci) 118 | 119 | eq_(evidence.allele_summary(loci[0]), [("ACT", 9)]) 120 | eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[0]), 121 | [("ACT", 8)]) 122 | eq_(evidence.allele_summary(loci[1]), [("T", 7)]) 123 | eq_(evidence.filter().allele_summary(loci[2]), [("", 6), ("C", 2)]) 124 | eq_(evidence.filter( 125 | drop_duplicates=True, min_base_quality=50).allele_summary(loci[2]), 126 | []) 127 | eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[2]), 128 | [("", 5), ("C", 1)]) 129 | eq_(evidence.filter( 130 | drop_duplicates=True, min_mapping_quality=60).allele_summary( 131 | loci[2]), 132 | [("", 5), ("C", 1)]) 133 | eq_(evidence.filter(drop_duplicates=True, 134 | min_mapping_quality=61).allele_summary(loci[2]), [("", 2)]) 135 | eq_(evidence.filter(drop_duplicates=True, 136 | min_mapping_quality=61).allele_summary(loci[3]), [("A", 2)]) 137 | eq_(evidence.filter(drop_duplicates=True, 138 | min_mapping_quality=61).allele_summary(loci[4]), [("AAA", 2)]) 139 | eq_(evidence.filter(drop_duplicates=True, 140 | min_mapping_quality=61).allele_summary(loci[5]), [("AAAC", 2)]) 141 | eq_(evidence.filter().allele_summary(loci[6]), [("T", 5), ("C", 3)]) 142 | eq_(evidence.filter(min_base_quality=30).allele_summary(loci[6]), 143 | [("T", 4), ("C", 3)]) 144 | eq_(evidence.filter().allele_summary(loci[7]), 145 | [("CTT", 5), ("CCT", 3)]) 146 | eq_(evidence.filter(min_base_quality=30).allele_summary(loci[7]), 147 | [("CTT", 3), ("CCT", 2)]) 148 | eq_(evidence.filter(min_base_quality=32).allele_summary(loci[2]), 149 | [("", 6), ("C", 1)]) 150 | eq_(filtered_read_names(evidence.at(loci[2]).filter(min_base_quality=32)), 151 | {'20GAVAAXX100126:4:3:18352:43857'}) 152 | eq_(evidence.allele_summary(loci[8]), []) 153 | eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[8]), []) 154 | assert_raises(KeyError, 155 | evidence.allele_summary, 156 | Locus.from_inclusive_coordinates("20", 10009174, 10009176)) 157 | eq_(filtered_read_names( 158 | evidence.at(loci[9]).filter(drop_improper_mate_pairs=True)), 159 | {'20FUKAAXX100202:8:68:1530:49310'}) 160 | 
eq_(len(evidence.at(loci[8]).read_attribute('mapping_quality')), 0) 161 | eq_(list(evidence.at(loci[9]).read_attribute('mapping_quality')), 162 | list(evidence.at(loci[9]).read_attributes().mapping_quality)) 163 | eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[10]), 164 | [('C', 2), ('CA', 1), ('CAA', 1)]) 165 | eq_(evidence.filter(drop_duplicates=True).allele_summary( 166 | Locus.from_interbase_coordinates( 167 | loci[10].contig, loci[10].start, loci[10].start)), 168 | [('', 2), ('A', 1), ('AA', 1)]) 169 | 170 | 171 | def test_read_evidence_variant_matching_gatk_mini_bundle_extract(): 172 | handle = Samfile(data_path("gatk_mini_bundle_extract.bam")) 173 | 174 | loci = [ 175 | Locus.from_inclusive_coordinates("20", 10008951), # 0 176 | Locus.from_inclusive_coordinates("20", 10009053), # 1 177 | Locus.from_inclusive_coordinates("20", 10009053, 10009054), # 2 178 | Locus.from_inclusive_coordinates("20", 10006822), # 3 179 | Locus.from_inclusive_coordinates("20", 10006822, 10006823), # 4 180 | 181 | ] 182 | evidence = PileupCollection.from_bam(handle, loci) 183 | 184 | eq_(evidence.match_summary(Variant(loci[0], "A", "C")), 185 | [('A', 1), ('C', 4)]) 186 | eq_(evidence.filter(drop_duplicates=True).match_summary( 187 | Variant(loci[0], "A", "C")), 188 | [('A', 0), ('C', 3)]) 189 | eq_(evidence.match_summary(Variant(loci[1], "A", "C")), 190 | [('A', 3), ('C', 0)]) 191 | eq_(evidence.match_summary(Variant(loci[1], "A", "CC")), 192 | [('A', 3), ('CC', 0)]) 193 | eq_(evidence.match_summary(Variant(loci[1], "A", "")), 194 | [('A', 3), ('', 0)]) 195 | eq_(evidence.match_summary(Variant(loci[1], "A", "")), 196 | [('A', 3), ('', 0)]) 197 | eq_(evidence.match_summary(Variant(loci[2], "AT", "")), 198 | [('AT', 3), ('', 0)]) 199 | eq_(evidence.match_summary(Variant(loci[3], "A", "")), 200 | [('A', 2), ('', 6)]) 201 | eq_(evidence.match_summary(Variant(loci[4], "AC", "")), 202 | [('AC', 2), ('', 6)]) 203 | eq_(evidence.match_summary( 204 | Variant(loci[4], "AC", ""), 205 | lambda e: e.read_attributes().mapping_quality.mean()), 206 | [('AC', 60.0), ('', 65.0)]) 207 | 208 | def test_read_evidence_variant_matching_gatk_bundle_native_varcode_variant(): 209 | # Try native varcode Variant. 210 | handle = Samfile(data_path("gatk_mini_bundle_extract.bam")) 211 | locus = Locus.from_inclusive_coordinates("20", 10008951) 212 | variant = VarcodeVariant( 213 | locus.contig, 214 | locus.position + 1, # inclusive not interbase 215 | "A", 216 | "C") 217 | evidence = PileupCollection.from_bam(handle, [variant]) 218 | eq_(evidence.match_summary(variant), 219 | [('A', 1), ('C', 4)]) 220 | 221 | 222 | def test_read_evidence_variant_matching_gatk_mini_bundle_extract_warning(): 223 | filename = data_path("gatk_mini_bundle_extract.bam") 224 | 225 | # Should log a warning but pass. 226 | loci = [ 227 | Locus.from_inclusive_coordinates("20", 10009053, 10009054), # 0 228 | ] 229 | evidence = PileupCollection.from_bam(filename, loci) 230 | eq_(evidence.match_summary(Variant(loci[0], "A", "")), 231 | [('A', 0), ('', 0), ('AT', 3)]) 232 | 233 | 234 | -------------------------------------------------------------------------------- /tests/test_reads.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import absolute_import 16 | 17 | import functools 18 | 19 | from nose.tools import eq_ 20 | 21 | from varlens.commands import reads 22 | 23 | from . import data_path, run_and_parse_csv, cols_concat, temp_file 24 | 25 | run = functools.partial(run_and_parse_csv, reads.run) 26 | 27 | expected_cols = ( 28 | "source,query_name,reference_start,reference_end,cigarstring").split(',') 29 | 30 | def test_basic(): 31 | result = run([ 32 | data_path("CELSR1/bams/bam_0.bam"), 33 | ]) 34 | eq_(result.shape, (953, len(expected_cols))) 35 | 36 | result = run([ 37 | data_path("CELSR1/bams/bam_0.bam"), 38 | "--is-duplicate", 39 | ]) 40 | eq_(result.shape, (173, len(expected_cols))) 41 | 42 | result = run([ 43 | data_path("CELSR1/bams/bam_0.bam"), 44 | "--is-read1", 45 | ]) 46 | eq_(result.shape, (481, len(expected_cols))) 47 | 48 | result = run([ 49 | data_path("CELSR1/bams/bam_0.bam"), 50 | "--is-read2", 51 | ]) 52 | eq_(result.shape, (472, len(expected_cols))) 53 | 54 | def test_loci_filtering(): 55 | result = run([ 56 | data_path("CELSR1/bams/bam_5.bam"), 57 | ]) 58 | eq_(result.shape, (37053, len(expected_cols))) 59 | 60 | result = run([ 61 | data_path("CELSR1/bams/bam_5.bam"), 62 | "--locus", "chr22:46930257-46930259" 63 | ]) 64 | eq_(result.shape, (1795, len(expected_cols))) 65 | 66 | result = run([ 67 | data_path("CELSR1/bams/bam_5.bam"), 68 | "--locus", "chr22/46930256-46930259" 69 | ]) 70 | eq_(result.shape, (1795, len(expected_cols))) 71 | 72 | result = run([ 73 | data_path("CELSR1/bams/bam_5.bam"), 74 | "--locus", "chr22:46930257-46930257" 75 | ]) 76 | eq_(result.shape, (1753, len(expected_cols))) 77 | 78 | result = run([ 79 | data_path("CELSR1/bams/bam_5.bam"), 80 | "--locus", "chr22:46930257" 81 | ]) 82 | eq_(result.shape, (1753, len(expected_cols))) 83 | 84 | result = run([ 85 | data_path("CELSR1/bams/bam_5.bam"), 86 | "--locus", "chr22/46930256" 87 | ]) 88 | eq_(result.shape, (1753, len(expected_cols))) 89 | 90 | def test_read_filtering(): 91 | result = run([ 92 | data_path("CELSR1/bams/bam_5.bam"), 93 | "--reference-start", '46932059', 94 | ]) 95 | eq_(result.shape, (26, len(expected_cols))) 96 | 97 | result = run([ 98 | data_path("CELSR1/bams/bam_5.bam"), 99 | "--reference-start", '46932059', 100 | "--query-name-contains", '57841', 101 | ]) 102 | eq_(result.shape, (1, len(expected_cols))) 103 | 104 | def test_round_trip(): 105 | with temp_file(".bam") as out: 106 | reads.run([ 107 | data_path("CELSR1/bams/bam_5.bam"), 108 | "--locus", "chr22/46930276", 109 | "--locus", "chr22/46930256", 110 | "--out", out, 111 | ]) 112 | result1 = run([ 113 | out, 114 | ]) 115 | result2 = run([ 116 | data_path("CELSR1/bams/bam_5.bam"), 117 | "--locus", "chr22/46930276", 118 | "--locus", "chr22/46930256", 119 | ]) 120 | eq_(sorted(cols_concat(result1, expected_cols[1:])), 121 | sorted(cols_concat(result2, expected_cols[1:]))) 122 | 123 | def test_round_trip_sam(): 124 | with temp_file(".sam") as out: 125 | print(out) 126 | reads.run([ 127 | data_path("CELSR1/bams/bam_5.bam"), 128 | "--locus", "chr22/46930276", 129 | "--locus", 
"chr22/46930256", 130 | "--out", out, 131 | ]) 132 | result1 = run([ 133 | out, 134 | ]) 135 | result2 = run([ 136 | data_path("CELSR1/bams/bam_5.bam"), 137 | "--locus", "chr22/46930276", 138 | "--locus", "chr22/46930256", 139 | ]) 140 | eq_(sorted(cols_concat(result1, expected_cols[1:])), 141 | sorted(cols_concat(result2, expected_cols[1:]))) 142 | -------------------------------------------------------------------------------- /tests/test_variants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import absolute_import 16 | 17 | import subprocess 18 | import warnings 19 | import logging 20 | 21 | import pandas 22 | import numpy 23 | from nose.tools import eq_ 24 | 25 | from varlens.commands import variants 26 | 27 | from . import data_path, run_and_parse_csv, cols_concat, temp_file 28 | 29 | def run(args): 30 | logging.info("Running with args: " + ' '.join(args)) 31 | return run_and_parse_csv(variants.run, args) 32 | 33 | reference_fasta = data_path("chr22.no_line_wrap.fa") 34 | 35 | expected_cols = [ 36 | "genome", "contig", "interbase_start", "interbase_end", "ref", "alt", 37 | ] 38 | 39 | def test_basic(): 40 | result = run([ 41 | data_path("CELSR1/vcfs/vcf_1.vcf"), 42 | "--genome", "b37", 43 | ]) 44 | eq_(sorted(cols_concat(result, expected_cols)), sorted({ 45 | "GRCh37-22-46931059-46931060-A-C", 46 | "GRCh37-22-21829554-21829555-T-G", 47 | "GRCh37-22-46931061-46931062-G-A", 48 | "GRCh37-22-50636217-50636218-A-C", 49 | "GRCh37-22-50875932-50875933-A-C", 50 | })) 51 | 52 | def test_genes_and_effects(): 53 | result = run([ 54 | data_path("CELSR1/vcfs/vcf_1.vcf"), 55 | "--genome", "b37", 56 | "--include-effect", 57 | "--include-gene", 58 | "--rename-column", "gene", "genez", 59 | ]) 60 | eq_(sorted(cols_concat(result, expected_cols + ["effect", "genez"])), 61 | sorted({ 62 | 'GRCh37-22-21829554-21829555-T-G-non-coding-transcript-PI4KAP2', 63 | 'GRCh37-22-46931059-46931060-A-C-p.S670A-CELSR1', 64 | 'GRCh37-22-46931061-46931062-G-A-p.S669F-CELSR1', 65 | 'GRCh37-22-50636217-50636218-A-C-intronic-TRABD', 66 | 'GRCh37-22-50875932-50875933-A-C-splice-acceptor-PPP6R2', 67 | })) 68 | 69 | def test_context(): 70 | result = run([ 71 | data_path("CELSR1/vcfs/vcf_1.vcf"), 72 | "--genome", "b37", 73 | "--include-context", 74 | "--context-num-bases", "5", 75 | "--reference", reference_fasta, 76 | ]) 77 | eq_(sorted(cols_concat(result, 78 | expected_cols + [ 79 | "context_5_prime", "context_3_prime", "context_mutation"])), 80 | sorted({ 81 | "GRCh37-22-46931059-46931060-A-C-GCTCC-CCACC-T>G", 82 | "GRCh37-22-21829554-21829555-T-G-CATGA-AGTGA-T>G", 83 | "GRCh37-22-46931061-46931062-G-A-GAGCT-CTCCA-C>T", 84 | "GRCh37-22-50636217-50636218-A-C-AGGGA-GGGCA-T>G", 85 | "GRCh37-22-50875932-50875933-A-C-AGGCC-GGGAG-T>G", 86 | })) 87 | 88 | def test_mhc_binding_affinity(): 89 | # If netMHC is not installed, we 
skip this test 90 | try: 91 | # If this succeeds (no exception), we do nothing. 92 | subprocess.call( 93 | "netMHC", stdout=subprocess.PIPE, stderr=subprocess.PIPE) 94 | except OSError: 95 | warnings.warn("netMHC not installed, skipping mhc binding test") 96 | return 97 | 98 | with temp_file(".csv") as out_csv: 99 | run([ 100 | data_path("CELSR1/vcfs/vcf_1.vcf"), 101 | "--genome", "b37", 102 | "--include-mhc-binding", 103 | "--hla", "A:02:01 A:02:02", 104 | "--out", out_csv, 105 | ]) 106 | ['GRCh37-22-21829554-21829555-T-G-nan-nan', 107 | 'GRCh37-22-46931059-46931060-A-C-377.3-A:02:02', 108 | 'GRCh37-22-46931061-46931062-G-A-77.2-A:02:02', 109 | 'GRCh37-22-50636217-50636218-A-C-nan-nan', 110 | 'GRCh37-22-50875932-50875933-A-C-nan-nan'] 111 | 112 | results = pandas.read_csv(out_csv).set_index(expected_cols) 113 | assert numpy.isnan(results.loc[ 114 | ("GRCh37", 22, 21829554, 21829555, "T", "G")].binding_affinity) 115 | assert numpy.isnan(results.loc[ 116 | ("GRCh37", 22, 21829554, 21829555, "T", "G")].binding_allele) 117 | eq_(results.loc[ 118 | ("GRCh37", 22, 46931059, 46931060, "A", "C")].binding_allele, 119 | "A:02:02") 120 | eq_(results.loc[ 121 | ("GRCh37", 22, 46931061, 46931062, "G", "A")].binding_allele, 122 | "A:02:02") 123 | 124 | def test_read_evidence(): 125 | result = run([ 126 | data_path("CELSR1/vcfs/vcf_1.vcf"), 127 | "--include-read-evidence", 128 | "--reads", data_path("CELSR1/bams/bam_0.bam"), 129 | "--genome", "b37", 130 | ]) 131 | allele_groups = ["num_ref", "num_alt", "total_depth"] 132 | for allele_group in allele_groups: 133 | result[allele_group] = result[allele_group].astype(int) 134 | eq_(cols_concat( 135 | result, 136 | ["contig", "interbase_start"] + allele_groups), 137 | { 138 | '22-50636217-0-0-0', 139 | '22-50875932-0-0-0', 140 | '22-21829554-0-0-0', 141 | "22-46931059-50-0-50", 142 | "22-46931061-51-0-51", 143 | }) 144 | 145 | # Same thing but with chunk rows = 1 146 | with temp_file(".csv") as out_csv: 147 | run([ 148 | data_path("CELSR1/vcfs/vcf_1.vcf"), 149 | "--include-read-evidence", 150 | "--reads", data_path("CELSR1/bams/bam_0.bam"), 151 | "--genome", "b37", 152 | "--chunk-rows", "1", 153 | "--out", out_csv, 154 | ]) 155 | result = pandas.read_csv(out_csv) 156 | 157 | allele_groups = ["num_ref", "num_alt", "total_depth"] 158 | for allele_group in allele_groups: 159 | result[allele_group] = result[allele_group].astype(int) 160 | eq_(cols_concat( 161 | result, 162 | ["contig", "interbase_start"] + allele_groups), 163 | { 164 | '22-50636217-0-0-0', 165 | '22-50875932-0-0-0', 166 | '22-21829554-0-0-0', 167 | "22-46931059-50-0-50", 168 | "22-46931061-51-0-51", 169 | }) 170 | 171 | result = run([ 172 | "--include-read-evidence", 173 | "--reads", data_path("gatk_mini_bundle_extract.bam"), 174 | "--read-source-name", "foo", 175 | "--single-variant", "chr20:10008951", "C", "A", 176 | "--genome", "b37", 177 | ]) 178 | for allele_group in allele_groups: 179 | result[allele_group] = result[allele_group].astype(int) 180 | eq_(cols_concat(result, expected_cols + allele_groups), 181 | {"GRCh37-20-10008950-10008951-C-A-4-1-5"}) 182 | 183 | result = run([ 184 | "--include-read-evidence", 185 | "--reads", data_path("gatk_mini_bundle_extract.bam"), 186 | "--read-source-name", "foo", 187 | "--single-variant", "chr20:10008951", "C", "A", 188 | "--genome", "b37", 189 | "--is-reverse", 190 | ]) 191 | for allele_group in allele_groups: 192 | result[allele_group] = result[allele_group].astype(int) 193 | eq_(cols_concat(result, expected_cols + allele_groups), 194 | 
{"GRCh37-20-10008950-10008951-C-A-1-0-1"}) 195 | 196 | 197 | def test_filtering(): 198 | result = run([ 199 | data_path("CELSR1/vcfs/vcf_1.vcf"), 200 | "--genome", "b37", 201 | "--ref", "A", 202 | ]) 203 | eq_(sorted(cols_concat(result, expected_cols)), sorted({ 204 | "GRCh37-22-46931059-46931060-A-C", 205 | "GRCh37-22-50636217-50636218-A-C", 206 | "GRCh37-22-50875932-50875933-A-C", 207 | })) 208 | 209 | result = run([ 210 | data_path("CELSR1/vcfs/vcf_1.vcf"), 211 | "--genome", "b37", 212 | "--ref", "A", 213 | "--variant-locus", "22:50636218", 214 | ]) 215 | eq_(sorted(cols_concat(result, expected_cols)), sorted({ 216 | "GRCh37-22-50636217-50636218-A-C", 217 | })) 218 | 219 | result = run([ 220 | data_path("CELSR1/vcfs/vcf_1.vcf"), 221 | data_path("CELSR1/vcfs/vcf_2.vcf"), 222 | "--alt", "C", "G", 223 | "--genome", "b37" 224 | ]) 225 | eq_(sorted(cols_concat(result, expected_cols)), sorted({ 226 | "GRCh37-22-21829554-21829555-T-G", 227 | "GRCh37-22-45309892-45309893-T-G", 228 | "GRCh37-22-46931059-46931060-A-C", 229 | "GRCh37-22-50636217-50636218-A-C", 230 | "GRCh37-22-50875932-50875933-A-C", 231 | })) 232 | 233 | ''' 234 | def test_fields(): 235 | result = run([ 236 | "--field", 237 | "foo:ref.lower()", 238 | "gene_names[0]", 239 | "--variants", data_path("CELSR1/vcfs/vcf_1.vcf"), 240 | "--variant-filter", "ref=='A'", 241 | "--variant-genome", "b37" 242 | ]) 243 | eq_(sorted(cols_concat(result, expected_cols + ["foo", "gene_names[0]"])), 244 | sorted({ 245 | "GRCh37-22-46931059-46931060-A-C-a-CELSR1", 246 | "GRCh37-22-50636217-50636218-A-C-a-TRABD", 247 | "GRCh37-22-50875932-50875933-A-C-a-PPP6R2", 248 | })) 249 | ''' 250 | def test_round_trip(): 251 | with temp_file(".csv") as out_csv: 252 | variants.run([ 253 | data_path("CELSR1/vcfs/vcf_1.vcf"), 254 | "--out", out_csv, 255 | "--genome", "b37", 256 | "--ref", "A", 257 | "--include-gene", 258 | ]) 259 | result1 = pandas.read_csv(out_csv) 260 | eq_(sorted(cols_concat( 261 | result1, expected_cols + ["gene"])), 262 | sorted({ 263 | "GRCh37-22-46931059-46931060-A-C-CELSR1", 264 | "GRCh37-22-50636217-50636218-A-C-TRABD", 265 | "GRCh37-22-50875932-50875933-A-C-PPP6R2", 266 | })) 267 | 268 | result2 = run([ 269 | out_csv, 270 | "--include-gene", 271 | ]) 272 | eq_(sorted(cols_concat( 273 | result2, 274 | expected_cols + ["gene"])), 275 | sorted({ 276 | "GRCh37-22-46931059-46931060-A-C-CELSR1", 277 | "GRCh37-22-50636217-50636218-A-C-TRABD", 278 | "GRCh37-22-50875932-50875933-A-C-PPP6R2", 279 | })) 280 | 281 | def test_distinct_variants(): 282 | result = run([ 283 | data_path("CELSR1/vcfs/vcf_1.vcf"), 284 | data_path("CELSR1/vcfs/vcf_1.vcf"), 285 | "--genome", "b37", 286 | "--ref", "A", "T", 287 | "--variant-source-name", "first", "second", 288 | ]) 289 | eq_(sorted(cols_concat(result, expected_cols + ["sources"])), 290 | sorted({ 291 | "GRCh37-22-21829554-21829555-T-G-first second", 292 | "GRCh37-22-46931059-46931060-A-C-first second", 293 | "GRCh37-22-50636217-50636218-A-C-first second", 294 | "GRCh37-22-50875932-50875933-A-C-first second", 295 | })) 296 | 297 | -------------------------------------------------------------------------------- /varlens/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015. Mount Sinai School of Medicine 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import absolute_import 16 | 17 | from . import read_evidence, util, loci_util 18 | 19 | __version__ = "0.1.4" 20 | 21 | __all__ = [ 22 | "loci_util", 23 | "read_evidence", 24 | "util", 25 | ] 26 | -------------------------------------------------------------------------------- /varlens/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import sys 14 | import logging 15 | import warnings 16 | 17 | # Biopython (used by varcode) throws a warning when sequences are compared. 18 | warnings.filterwarnings("ignore", message="Biopython Seq objects") 19 | 20 | def configure_logging(args=None): 21 | if args is not None and args.verbose: 22 | level = logging.DEBUG 23 | else: 24 | level = logging.INFO 25 | 26 | logging.basicConfig( 27 | format="%(asctime)s.%(msecs)d %(levelname)s %(module)s - %(funcName)s:" 28 | " %(message)s", 29 | datefmt="%Y-%m-%d %H:%M:%S", 30 | stream=sys.stderr, 31 | level=level) 32 | 33 | -------------------------------------------------------------------------------- /varlens/commands/allele_support.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | ''' 14 | Given one or more BAMs and some genomic sites to consider, write a csv file 15 | giving counts of reads supporting each allele at each site for each BAM. 16 | 17 | The genomic sites to consider may be specified by locus (--locus option), or via 18 | one or more VCF files. 19 | 20 | The positions outputted by this command are in *interbase coordinates*, i.e. 21 | starting at 0, inclusive on first index, exclusive on second (as opposed to 22 | the one-based inclusive coordinates used in VCF files). 
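(Illustration, not part of the original docstring: the VCF record at one-based position chr22:46931062 — ref G, alt A in tests/data/CELSR1/vcfs/vcf_1.vcf — is reported by this command with interbase_start 46931061 and interbase_end 46931062, matching the "22-46931061-46931062" rows seen in the test suite.)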
23 | 24 | Example: 25 | 26 | %(prog)s \\ 27 | --reads test/data/CELSR1/bams/bam_1.bam \\ 28 | --locus 22:46931061 22:46931063 29 | 30 | ''' 31 | 32 | import argparse 33 | import csv 34 | import sys 35 | import logging 36 | 37 | from .. import loci_util 38 | from .. import reads_util 39 | from .. import variants_util 40 | 41 | from . import configure_logging 42 | from .. import support 43 | from ..read_evidence.pileup_collection import to_locus 44 | 45 | parser = argparse.ArgumentParser(usage=__doc__) 46 | group = parser.add_argument_group("output arguments") 47 | group.add_argument("--out") 48 | group.add_argument("-v", "--verbose", action="store_true", default=False) 49 | loci_util.add_args(parser.add_argument_group("loci specification")) 50 | variants_util.add_args(parser) 51 | reads_util.add_args(parser) 52 | 53 | def run(raw_args=sys.argv[1:]): 54 | args = parser.parse_args(raw_args) 55 | configure_logging(args) 56 | 57 | loci = loci_util.load_from_args(args) # may be None 58 | variants_df = variants_util.load_from_args_as_dataframe(args) 59 | if variants_df is not None: 60 | variant_loci = loci_util.Loci( 61 | to_locus(variant) 62 | for variant in variants_df["variant"]) 63 | loci = variant_loci if loci is None else loci.union(variant_loci) 64 | 65 | if not loci: 66 | if variants_df is not None: 67 | parser.error("No loci: variants specified but none remained " 68 | "after filtering") 69 | else: 70 | parser.error("No genomic loci or variants specified.") 71 | 72 | logging.info("Loaded %d genomic loci." % len(loci)) 73 | 74 | read_sources = reads_util.load_from_args(args) 75 | 76 | if read_sources is None: 77 | parser.error("No read sources (--reads argument) specified.") 78 | 79 | out_fd = open(args.out, "w") if args.out else sys.stdout 80 | writer = csv.writer(out_fd) 81 | 82 | rows_generator = support.allele_support_rows(loci, read_sources) 83 | for (i, row) in enumerate(rows_generator): 84 | if i == 0: 85 | writer.writerow(row.index.tolist()) 86 | writer.writerow([str(x) for x in row]) 87 | 88 | if out_fd is not sys.stdout: 89 | out_fd.close() 90 | print("Wrote: %s" % args.out) 91 | -------------------------------------------------------------------------------- /varlens/commands/reads.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | ''' 14 | Filter reads from one or more BAMs and output a CSV or a new BAM. 15 | 16 | Loci and VCF files may be specified, in which case reads are filtered to 17 | overlap the specified loci or variants. 
18 | 19 | Examples: 20 | 21 | Print basic fields for the reads in a BAM: 22 | 23 | %(prog)s test/data/CELSR1/bams/bam_0.bam 24 | 25 | Same as above but filter only to reads aligned on the (-) strand, write to a 26 | file instead of stdout, and also include the mapping quality and sequenced 27 | bases in the output: 28 | 29 | %(prog)s test/data/CELSR1/bams/bam_0.bam \\ 30 | --is-reverse \\ 31 | --field mapping_quality query_alignment_sequence \\ 32 | --out /tmp/result.csv 33 | 34 | Write a bam file consisting of reads with mapping quality >=30 and 35 | overlapping a certain locus: 36 | 37 | %(prog)s test/data/CELSR1/bams/bam_0.bam \\ 38 | --min-mapping-quality 30 \\ 39 | --locus 22:46932040-46932050 \\ 40 | --out /tmp/result.bam 41 | 42 | Write a bam file consisting of reads overlapping variants from a VCF: 43 | 44 | %(prog)s test/data/CELSR1/bams/bam_0.bam \\ 45 | --variants test/data/CELSR1/vcfs/vcf_1.vcf \\ 46 | --out /tmp/result.bam 47 | 48 | Print just the header for a BAM in csv format: 49 | 50 | %(prog)s test/data/CELSR1/bams/bam_0.bam --header 51 | 52 | ''' 53 | 54 | import argparse 55 | import sys 56 | import csv 57 | 58 | import pysam 59 | 60 | from . import configure_logging 61 | from .. import loci_util 62 | from .. import reads_util 63 | from .. import variants_util 64 | from ..read_evidence.pileup_collection import PileupCollection, to_locus 65 | 66 | STANDARD_FIELDS = [ 67 | "source", 68 | "query_name", 69 | "reference_start", 70 | "reference_end", 71 | "cigarstring", 72 | ] 73 | 74 | parser = argparse.ArgumentParser(usage=__doc__) 75 | group = parser.add_argument_group("output") 76 | group.add_argument("--out", 77 | help="Output file. Format is guessed from file extension: must be csv or " 78 | "bam. If not specified, csv is written to stdout.") 79 | group.add_argument("--field", nargs="+", default=[], 80 | help="Additional read fields to output as columns in the csv. See pysam " 81 | "documentation (http://pysam.readthedocs.org/en/latest/api.html) for the " 82 | "meaning of these fields. Valid fields include: %s" % ( 83 | " ".join(PileupCollection._READ_ATTRIBUTE_NAMES))) 84 | 85 | group.add_argument("--no-standard-fields", action="store_true", default=False, 86 | help="Do not include the standard fields (%s) in csv output." 87 | % ', '.join(STANDARD_FIELDS)) 88 | group.add_argument("--no-sort", action="store_true", default=False, 89 | help="When outputting a bam, do not call samtools sort.") 90 | group.add_argument( 91 | "--header", 92 | action="store_true", 93 | default=False, 94 | help="Output BAM/SAM header only.") 95 | group.add_argument( 96 | "--header-set", 97 | nargs=4, 98 | action="append", 99 | help="When outputting a bam, set a particular header field to the given " 100 | "value. Example: --header-set RG . 
SM my_sample") 101 | 102 | group.add_argument("-v", "--verbose", action="store_true", default=False) 103 | 104 | reads_util.add_args(parser, positional=True) 105 | loci_util.add_args(parser.add_argument_group("loci specification")) 106 | variants_util.add_args(parser) 107 | 108 | def run(raw_args=sys.argv[1:]): 109 | args = parser.parse_args(raw_args) 110 | configure_logging(args) 111 | 112 | read_sources = reads_util.load_from_args(args) 113 | if not read_sources: 114 | parser.error("No read sources specified.") 115 | 116 | loci = loci_util.load_from_args(args) # may be None 117 | variants_df = variants_util.load_from_args_as_dataframe(args) 118 | if variants_df is not None: 119 | variant_loci = loci_util.Loci( 120 | to_locus(variant) 121 | for variant in variants_df["variant"]) 122 | loci = variant_loci if loci is None else loci.union(variant_loci) 123 | 124 | if args.header: 125 | if loci is not None: 126 | parser.error("If specifying --header don't specify loci.") 127 | if args.field: 128 | parser.error("If specifying --header don't specify fields.") 129 | 130 | out_pysam_handle = None 131 | out_csv_writer = out_csv_fd = None 132 | if args.out and (args.out.endswith(".bam") or args.out.endswith(".sam")): 133 | if args.field: 134 | parser.error("Don't specify fields when outputting to bam or sam.") 135 | 136 | header = update_header(args, read_sources[0].handle.header) 137 | out_pysam_handle = pysam.AlignmentFile( 138 | args.out, 139 | "wb", 140 | header=header) 141 | 142 | elif not args.out or args.out.endswith(".csv"): 143 | out_csv_fd = open(args.out, "w") if args.out else sys.stdout 144 | out_csv_writer = csv.writer(out_csv_fd) 145 | 146 | if args.header: 147 | if args.field: 148 | parser.error("Don't specify fields when outputting header.") 149 | out_csv_writer.writerow([ 150 | "read_source", "group", "index", "key", "value", 151 | ]) 152 | else: 153 | columns = ( 154 | ([] if args.no_standard_fields else STANDARD_FIELDS) + 155 | args.field) 156 | out_csv_writer.writerow(columns) 157 | else: 158 | parser.error( 159 | "Don't know how to write to file with output extension: %s. " 160 | "Supported extensions: csv, bam, sam." % args.out) 161 | 162 | num_reads = 0 163 | for read_source in read_sources: 164 | if args.header: 165 | header = update_header(args, read_source.handle.header) 166 | for (group, i, key, value) in reads_util.flatten_header(header): 167 | out_csv_writer.writerow( 168 | [read_source.name, group, str(i), key, value]) 169 | continue # we don't look at reads at all. 
170 | for read in read_source.reads(loci): 171 | num_reads += 1 172 | if out_pysam_handle is not None: 173 | out_pysam_handle.write(read) 174 | if out_csv_writer is not None: 175 | out_csv_writer.writerow([ 176 | str(read_field(read_source, read, field)) 177 | for field in columns 178 | ]) 179 | 180 | if out_pysam_handle is not None: 181 | out_pysam_handle.close() 182 | if not args.no_sort: 183 | print("Sorting read file %s" % args.out) 184 | pysam.sort( 185 | "-o", args.out, 186 | "-T", "varlens_reads", args.out, 187 | catch_stdout=False) 188 | print("Wrote %d reads: %s" % (num_reads, args.out)) 189 | 190 | if out_csv_fd is not None and out_csv_fd is not sys.stdout: 191 | out_csv_fd.close() 192 | print("Wrote: %s" % args.out) 193 | 194 | 195 | def read_field(read_source, read, field_name): 196 | if field_name == 'source': 197 | return read_source.name 198 | 199 | if field_name.startswith("tag:"): 200 | tag_name = field_name[len("tag:"):] 201 | return read.get_tags().get(tag_name) 202 | 203 | try: 204 | return getattr(read, field_name) 205 | except AttributeError: 206 | raise ValueError("Invalid read field '%s'. Valid fields include: %s" 207 | % (field_name, ' '.join(dir(read)))) 208 | 209 | def update_header(args, header): 210 | if args.header_set: 211 | header = dict(header) 212 | for (group, index_string, key, value) in args.header_set: 213 | if not isinstance(header[group], list): 214 | header[group] = [header[group]] 215 | if index_string == ".": 216 | indices = range(len(header[group])) 217 | else: 218 | indices = [int(x) for x in index_string.split(",")] 219 | for index in indices: 220 | header[group][index][key] = value 221 | return header 222 | -------------------------------------------------------------------------------- /varlens/commands/util.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import argparse 14 | 15 | def load_variant_collections_parser(): 16 | parser = argparse.ArgumentParser(add_help=False) 17 | parser.add_argument("--variants", nargs="+", required=True) 18 | parser.add_argument("--ensembl-version") 19 | parser.add_argument("--variant-filter") 20 | return parser 21 | 22 | def load_read_sets_parser(): 23 | parser = argparse.ArgumentParser(add_help=False) 24 | parser.add_argument("--reads", nargs="+", default=[]) 25 | return parser 26 | 27 | -------------------------------------------------------------------------------- /varlens/commands/variants.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | ''' 14 | Given variants from one or more VCF or CSV files, apply filters, add additional 15 | columns, and output to CSV. 16 | 17 | Currently we can only output to CSV, not VCF. 18 | 19 | A number of useful annotations can be added for each variant by specifying 20 | options of the form '--include-XXX', e.g. '--include-gene'. See detailed help 21 | below. 22 | 23 | Examples: 24 | 25 | Print basic info for the variants found in two VCF files. Note that variants 26 | found in both files are listed in one row, and the 'sources' column lists 27 | the files each variant was found in: 28 | 29 | %(prog)s test/data/CELSR1/vcfs/vcf_1.vcf test/data/CELSR1/vcfs/vcf_2.vcf 30 | 31 | Same as the above but include additional columns giving varcode variant effect 32 | annotations and the genes the variants overlap, and write to a file: 33 | 34 | %(prog)s test/data/CELSR1/vcfs/vcf_1.vcf test/data/CELSR1/vcfs/vcf_2.vcf \\ 35 | --include-effect \\ 36 | --include-gene \\ 37 | --out /tmp/result.csv 38 | 39 | Print counts for number of reads supporting reference/variant/other alleles 40 | from the specified BAMs, counting only reads with mapping quality >= 10: 41 | 42 | %(prog)s test/data/CELSR1/vcfs/vcf_1.vcf \\ 43 | --include-read-evidence \\ 44 | --reads test/data/CELSR1/bams/*.bam \\ 45 | --min-mapping-quality 10 46 | 47 | ''' 48 | from __future__ import absolute_import 49 | 50 | import argparse 51 | import sys 52 | import logging 53 | 54 | from . import configure_logging 55 | from .. import variant_includes 56 | from .. import variants_util 57 | 58 | parser = argparse.ArgumentParser(usage=__doc__) 59 | variants_util.add_args(parser, positional=True) 60 | 61 | group = parser.add_argument_group("variant output") 62 | 63 | group.add_argument("--no-standard-columns", 64 | action="store_true", default=False, 65 | help="Don't write standard columns (genome, contig, start, end, ref, alt)") 66 | 67 | group.add_argument("--chunk-rows", metavar="N", type=int, 68 | help="Write out current results after processing N rows.") 69 | 70 | group.add_argument("--limit", metavar="N", type=int, 71 | help="Process only the first N variants (useful for testing)") 72 | 73 | group.add_argument("--columns", 74 | help="Column separated list of columns to output") 75 | 76 | group.add_argument("--rename-column", nargs=2, action="append", default=[], 77 | metavar="COL", 78 | help="Rename output column first argument to second. Can be specified " 79 | "multiple times by repeating the --rename-column option.") 80 | 81 | group.add_argument("--out", 82 | help="Output file. 
If not specified the CSV is written to stdout.") 83 | 84 | group.add_argument('--include-metadata', action="store_true", default=False, 85 | help="Output variant metadata when loading from VCF (info column, etc).") 86 | 87 | for includeable in variant_includes.INCLUDEABLES: 88 | includeable.add_args(parser) 89 | 90 | group.add_argument("-v", "--verbose", action="store_true", default=False) 91 | 92 | def run(raw_args=sys.argv[1:]): 93 | args = parser.parse_args(raw_args) 94 | configure_logging(args) 95 | 96 | df = variants_util.load_from_args_as_dataframe(args) 97 | if df is None: 98 | parser.error("No variants specified.") 99 | 100 | logging.info("Loaded %d variants." % df.shape[0]) 101 | 102 | # We run the inverse of the column renames on the input df. 103 | column_renames = {} 104 | if args.rename_column: 105 | column_renames = dict(args.rename_column) 106 | column_renames_inverse = dict((v, k) for (k, v) in args.rename_column) 107 | if len(column_renames) != len(column_renames_inverse): 108 | raise ValueError("Column renames are not 1:1") 109 | 110 | df.columns = [ 111 | column_renames_inverse.get(col, col) for col in df.columns 112 | ] 113 | 114 | def save(df): 115 | if column_renames: 116 | df = df.copy() 117 | df.columns = [column_renames.get(col, col) for col in df.columns] 118 | 119 | if args.columns: 120 | columns = [x.strip() for x in args.columns.split(",")] 121 | else: 122 | columns = [x for x in df.columns.tolist() if x != "variant"] 123 | if not args.include_metadata: 124 | columns = [ 125 | x for x in columns 126 | if not x.startswith("metadata") 127 | ] 128 | if args.no_standard_columns: 129 | columns = [ 130 | x for x in columns 131 | if x not in variants_util.STANDARD_DATAFRAME_COLUMNS 132 | ] 133 | 134 | df_save = df[columns].copy() 135 | df_save.interbase_start = df_save.interbase_start.astype(int) 136 | df_save.interbase_end = df_save.interbase_end.astype(int) 137 | 138 | if args.out is None: 139 | # Write to stdout. 140 | df_save.to_csv(sys.stdout, index=False) 141 | elif args.out.endswith(".csv"): 142 | df_save.to_csv(args.out, index=False) 143 | print("Wrote: %s" % args.out) 144 | else: 145 | parser.error("Unsupported output file extension: %s" % args.out) 146 | 147 | for includeable in variant_includes.INCLUDEABLES: 148 | if includeable.requested(args): 149 | logging.info("Running includeable: %s" % includeable.name) 150 | instance = includeable.from_args(args) 151 | for num_rows in instance.compute(df, chunk_rows=args.chunk_rows): 152 | if args.chunk_rows is not None: 153 | save(df) 154 | 155 | if args.chunk_rows is None: 156 | save(df) 157 | 158 | -------------------------------------------------------------------------------- /varlens/loci_util.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
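# Editorial sketch (not part of the original module) of how the Loci container
# defined below is typically built and queried; the loci strings are made up:
#
#     from varlens.locus import Locus
#     from varlens.loci_util import Loci
#     loci = Loci([Locus.parse("chr1:100-200"), Locus.parse("chr1:150-300")])
#     len(loci)                                   # 2 stored intervals
#     loci.intersects(Locus.parse("chr1:250"))    # True: 250 falls inside 150-300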
12 | 13 | import collections 14 | import intervaltree 15 | 16 | from .locus import Locus 17 | 18 | def add_args(parser): 19 | # TODO: 20 | # - Load intervals_list files 21 | parser.add_argument('--locus', nargs="+", default=[], 22 | help="Genomic locus, like chr1:2342332 or chr1:2342-23423. " 23 | "Any number of loci may be specified.") 24 | # parser.add_argument("--neighbor-offsets", 25 | # nargs="+", type=int, default=[], 26 | # help="") 27 | 28 | def load_from_args(args): 29 | """ 30 | Return a Loci object giving the loci specified on the command line. 31 | 32 | If no loci-related arguments are specified, return None. This makes it 33 | possible to distinguish an empty set of loci, for example due to filters 34 | removing all loci, from the case where the user didn't specify any 35 | arguments. 36 | """ 37 | if not args.locus: 38 | return None 39 | 40 | loci_iterator = (Locus.parse(locus) for locus in args.locus) 41 | 42 | # if args.neighbor_offsets: 43 | # loci_iterator = expand_with_neighbors( 44 | # loci_iterator, args.neighbor_offsets) 45 | 46 | return Loci(loci_iterator) 47 | 48 | # def expand_with_neighbors(loci_iterator, neighbor_offsets): 49 | # offsets = sorted(set(neighbor_offsets + [0])) 50 | # for locus in loci_iterator: 51 | # for offset in offsets: 52 | # if offset == 0: 53 | # yield locus 54 | # else: 55 | # yield Locus( 56 | # locus.contig, locus.start + offset, locus.end + offset) 57 | 58 | class Loci(object): 59 | def __init__(self, locus_iterator=[], contig_map=None): 60 | self.contigs = collections.defaultdict(intervaltree.IntervalTree) 61 | if contig_map: 62 | self.contigs.update(contig_map) 63 | for locus in locus_iterator: 64 | self.contigs[locus.contig].addi(locus.start, locus.end) 65 | 66 | def __iter__(self): 67 | for contig in sorted(self.contigs): 68 | for interval in self.contigs[contig]: 69 | yield Locus(contig, interval.begin, interval.end) 70 | 71 | def __len__(self): 72 | return sum(len(tree) for tree in self.contigs.values()) 73 | 74 | def intersects(self, locus): 75 | return self.contigs[locus.contig].overlaps(locus.start, locus.end) 76 | 77 | def union(self, other): 78 | contig_map = {} 79 | for contig in set(self.contigs).union(other.contigs): 80 | contig_map[contig] = self.contigs[contig].union( 81 | other.contigs[contig]) 82 | return Loci(contig_map=contig_map) 83 | -------------------------------------------------------------------------------- /varlens/locus.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import re 14 | from collections import namedtuple 15 | 16 | import pyensembl 17 | import typechecks 18 | 19 | class Locus(namedtuple("Locus", "contig start end")): 20 | ''' 21 | A genomic interval in 0-indexed interbase coordinates. 
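For example (illustrative): the single reference base at 1-based position 100 is stored as start=99, end=100, and the two-base interval covering 1-based positions 100-101 as start=99, end=101. The factory methods below perform this conversion.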
22 | 23 | See this blog post for a discussion on coordinate systems: 24 | http://alternateallele.blogspot.com/2012/03/genome-coordinate-conventions.html 25 | ''' 26 | 27 | @property 28 | def inclusive_start(self): 29 | return self.start + 1 30 | 31 | @property 32 | def inclusive_end(self): 33 | return self.end 34 | 35 | @property 36 | def positions(self): 37 | ''' 38 | A Python range object giving the bases included in this locus. 39 | ''' 40 | return range(self.start, self.end) 41 | 42 | @property 43 | def position(self): 44 | ''' 45 | If this locus spans a single base, this property gives that position. 46 | Otherwise, raises a ValueError. 47 | ''' 48 | if self.end != self.start + 1: 49 | raise ValueError("Not a single base: %s" % str(self)) 50 | return self.start 51 | 52 | # Factory functions. 53 | @staticmethod 54 | def from_inclusive_coordinates(contig, start, end=None): 55 | ''' 56 | Given coordinates in 1-based coordinates that are inclusive on start 57 | and end, return a Locus instance. Locus instances are always 0-based 58 | "interbase" coordinates. 59 | ''' 60 | typechecks.require_string(contig) 61 | typechecks.require_integer(start) 62 | if end is None: 63 | end = start 64 | typechecks.require_integer(end) 65 | contig = pyensembl.locus.normalize_chromosome(contig) 66 | return Locus(contig, start - 1, end) 67 | 68 | @staticmethod 69 | def from_interbase_coordinates(contig, start, end=None): 70 | ''' 71 | Given coordinates in 0-based interbase coordinates, return a Locus 72 | instance. 73 | ''' 74 | typechecks.require_string(contig) 75 | typechecks.require_integer(start) 76 | if end is None: 77 | end = start + 1 78 | typechecks.require_integer(end) 79 | contig = pyensembl.locus.normalize_chromosome(contig) 80 | return Locus(contig, start, end) 81 | 82 | @staticmethod 83 | def parse(string): 84 | match = re.match(r'(\w+)([:/])(\d+)(-(\d+))?', string) 85 | if match is None: 86 | raise ValueError("Couldn't parse locus: %s. " 87 | "Expected format is: chr5:3332 or chr5:3332-5555 for " 88 | "inclusive 1-based coordinates and chr5/3331 or " 89 | "chr5/3331-5554 for half-open 0-based coordinates.") 90 | 91 | (contig, symbol, start, _, maybe_end) = match.groups() 92 | start = int(start) 93 | end = int(maybe_end) if maybe_end is not None else None 94 | 95 | if symbol == ":": 96 | # inclusive coordinatess 97 | return Locus.from_inclusive_coordinates(contig, start, end) 98 | else: 99 | # interbase coordinates 100 | assert symbol == "/" 101 | return Locus.from_interbase_coordinates(contig, start, end) -------------------------------------------------------------------------------- /varlens/mhc_binding.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
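# Editorial usage sketch (assumes varcode, mhctools, and topiary are installed
# and NetMHCpan is on the PATH; the VCF path and HLA allele are illustrative,
# not fixtures this module depends on):
#
#     import varcode
#     from varlens import mhc_binding
#     variants = varcode.load_vcf("tests/data/CELSR1/vcfs/vcf_1.vcf")
#     df = mhc_binding.binding_affinities(variants, ["HLA-A*02:01"])
#     # df columns: variant, binding_affinity (nM), binding_allele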
12 | 13 | import collections 14 | 15 | import pandas 16 | import varcode 17 | 18 | CACHED_BINDING_AFFINITIES = {} # (variant, allele -> nm affinity) 19 | BINDING_PREDICTORS = {} 20 | def binding_affinities(variants, alleles, epitope_lengths=[8, 9, 10, 11]): 21 | # We import these here so we don't depend on these libraries unless this 22 | # function is called. 23 | import mhctools 24 | import topiary 25 | 26 | for allele in alleles: 27 | if allele not in BINDING_PREDICTORS: 28 | BINDING_PREDICTORS[allele] = mhctools.NetMHCpan( 29 | [allele], default_peptide_lengths=epitope_lengths) 30 | predictor = BINDING_PREDICTORS[allele] 31 | predictions = topiary.predict_epitopes_from_variants( 32 | varcode.VariantCollection([ 33 | v for v in variants 34 | if (v, allele) not in CACHED_BINDING_AFFINITIES 35 | ]), 36 | predictor, 37 | ic50_cutoff=float('inf'), 38 | percentile_cutoff=100) 39 | if len(predictions) > 0: 40 | predictions_df = pandas.DataFrame( 41 | predictions, columns=predictions[0]._fields) 42 | values = predictions_df.groupby("variant")["value"].min() 43 | for (variant, value) in zip(values.index, values): 44 | CACHED_BINDING_AFFINITIES[(variant, allele)] = value 45 | 46 | result_df = collections.defaultdict(list) 47 | for variant in variants: 48 | (binding_affinity, binding_allele) = min( 49 | (CACHED_BINDING_AFFINITIES.get((variant, allele), float('nan')), 50 | allele) 51 | for allele in alleles) 52 | if pandas.isnull(binding_affinity): 53 | binding_allele = None 54 | result_df["variant"].append(variant) 55 | result_df["binding_affinity"].append(binding_affinity) 56 | result_df["binding_allele"].append(binding_allele) 57 | 58 | return pandas.DataFrame(result_df) 59 | 60 | -------------------------------------------------------------------------------- /varlens/read_evidence/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | ''' 13 | This subpackage provides functionality for collecting and filtering aligned 14 | sequencing reads from a BAM file, determining the alleles they suggest at 15 | a locus, and assesing the evidence for particular variants. 16 | 17 | In this subpackage, the records stored in the BAM file are referred to as 18 | "alignments," whereas the term "read" may be more familiar. We use the term 19 | "alignment" for consistency with the SAM specification, and since an 20 | individual read from the sequencer may generate any number of alignments in 21 | the case of chimeric alignments and secondary alignments. 
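A typical flow (sketch using names defined in this subpackage): build a PileupCollection with PileupCollection.from_bam(...) for a set of loci, then inspect or filter the resulting Pileup and PileupElement objects, e.g. with Pileup.filter or PileupCollection.group_by_allele.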
22 | ''' 23 | 24 | from .util import alignment_key, read_key 25 | from .pileup import Pileup 26 | from .pileup_element import PileupElement 27 | from .pileup_collection import PileupCollection 28 | 29 | __all__ = [ 30 | "PileupCollection", 31 | "Pileup", 32 | "PileupElement", 33 | "alignment_key", 34 | "read_key", 35 | ] 36 | -------------------------------------------------------------------------------- /varlens/read_evidence/pileup.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from __future__ import absolute_import 14 | 15 | from collections import OrderedDict 16 | 17 | class Pileup(object): 18 | ''' 19 | A Pileup is a collection of PileupElement instances at a particular locus. 20 | 21 | Attributes 22 | ---------- 23 | locus : Varcode.Locus 24 | The reference locus. Must be length 1, i.e. a single base. 25 | 26 | elements : OrderedDict of PileupElement instances 27 | This is logically an ordered set, which we implement as an OrderedDict 28 | with all values mapping to None. 29 | ''' 30 | def __init__(self, locus, elements): 31 | ''' 32 | Construct a new Pileup. 33 | 34 | Parameters 35 | ---------- 36 | locus : Varcode.Locus 37 | The reference locus. Must be length 1, i.e. a single base. 38 | 39 | elements : iterable of PileupElement 40 | The pileup elements. The locus field of these instances must 41 | match the locus parameter. 42 | ''' 43 | self.locus = locus 44 | self.elements = OrderedDict((e, None) for e in elements) 45 | assert all(e.locus == self.locus for e in self.elements) 46 | 47 | def __iter__(self): 48 | return iter(self.elements) 49 | 50 | def __len__(self): 51 | return len(self.elements) 52 | 53 | def append(self, element): 54 | ''' 55 | Append a PileupElement to this Pileup. If an identical PileupElement is 56 | already part of this Pileup, do nothing. 57 | ''' 58 | assert element.locus == self.locus, ( 59 | "Element locus (%s) != Pileup locus (%s)" 60 | % (element.locus, self.locus)) 61 | self.elements[element] = None 62 | 63 | def update(self, other): 64 | ''' 65 | Add all pileup elements from other into self. 66 | ''' 67 | assert self.locus == other.locus 68 | self.elements.update(other.elements) 69 | 70 | def filter(self, filters): 71 | ''' 72 | Apply filters to the pileup elements, and return a new Pileup with the 73 | filtered elements removed. 74 | 75 | Parameters 76 | ---------- 77 | filters : list of PileupElement -> bool callables 78 | A PileupElement is retained if all filters return True when 79 | called on it. 
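For example (illustrative): pileup.filter([lambda e: e.min_base_quality >= 30]) returns a new Pileup containing only the elements whose minimum base quality is at least 30.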
80 | ''' 81 | new_elements = [ 82 | e for e in self.elements 83 | if all(function(e) for function in filters)] 84 | return Pileup(self.locus, new_elements) 85 | -------------------------------------------------------------------------------- /varlens/read_evidence/pileup_element.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from __future__ import absolute_import 14 | 15 | from . import alignment_key 16 | 17 | class PileupElement(object): 18 | ''' 19 | A PileupElement represents the segment of an alignment that aligns to a 20 | particular base in the reference. 21 | 22 | Attributes 23 | ---------- 24 | locus : Varcode.Locus 25 | The reference locus. Must be length 1, i.e. a single base. 26 | 27 | offset_start : int 28 | 0-based start offset into the alignment sequence, inclusive 29 | 30 | offset_end : int 31 | 0-based end offset into the alignment sequence, exclusive 32 | 33 | alignment : pysam.AlignedSegment 34 | pysam alignment instance 35 | 36 | alignment_key : tuple 37 | value computed from the alignment instance that uniquely specifies its 38 | properties. Used for comparisons since pysam.AlignedSegment instances 39 | do not support a useful notion of equality (they compare using object 40 | identity). See `read_evidence.alignment_key` for the implementation of 41 | this key. 42 | ''' 43 | def __init__(self, locus, offset_start, offset_end, alignment): 44 | ''' 45 | Construct a PileupElement object. 46 | ''' 47 | assert offset_end >= offset_start, \ 48 | "offset_start=%d > offset_end=%d" % (offset_start, offset_end) 49 | self.locus = locus 50 | self.offset_start = offset_start 51 | self.offset_end = offset_end 52 | self.alignment = alignment 53 | self.alignment_key = alignment_key(self.alignment) 54 | 55 | def fields(self): 56 | ''' 57 | Fields that should be considered for our notion of object equality. 58 | ''' 59 | return ( 60 | self.locus, self.offset_start, self.offset_end, self.alignment_key) 61 | 62 | def __eq__(self, other): 63 | return hasattr(other, "fields") and self.fields() == other.fields() 64 | 65 | def __hash__(self): 66 | return hash(self.fields()) 67 | 68 | @property 69 | def bases(self): 70 | ''' 71 | The sequenced bases in the alignment that align to this locus in the 72 | genome, as a string. 73 | 74 | Empty string in the case of a deletion. String of length > 1 if there 75 | is an insertion here. 76 | ''' 77 | sequence = self.alignment.query_sequence 78 | assert self.offset_end <= len(sequence), \ 79 | "End offset=%d > sequence length=%d. CIGAR=%s. SEQUENCE=%s" % ( 80 | self.offset_end, 81 | len(sequence), 82 | self.alignment.cigarstring, 83 | sequence) 84 | return sequence[self.offset_start:self.offset_end] 85 | 86 | @property 87 | def base_qualities(self): 88 | ''' 89 | The phred-scaled base quality scores corresponding to `self.bases`, as 90 | a list. 
91 | ''' 92 | return self.alignment.query_qualities[ 93 | self.offset_start:self.offset_end] 94 | 95 | @property 96 | def min_base_quality(self): 97 | ''' 98 | The minimum of the base qualities. In the case of a deletion, in which 99 | case there are no bases in this PileupElement, the minimum is taken 100 | over the sequenced bases immediately before and after the deletion. 101 | ''' 102 | try: 103 | return min(self.base_qualities) 104 | except ValueError: 105 | # We are mid-deletion. We return the minimum of the adjacent bases. 106 | assert self.offset_start == self.offset_end 107 | adjacent_qualities = [ 108 | self.alignment.query_qualities[offset] 109 | for offset in [self.offset_start - 1, self.offset_start] 110 | if 0 <= offset < len(self.alignment.query_qualities) 111 | ] 112 | return min(adjacent_qualities) 113 | 114 | @staticmethod 115 | def from_pysam_alignment(locus, pileup_read): 116 | ''' 117 | Factory function to create a new PileupElement from a pysam 118 | `PileupRead`. 119 | 120 | Parameters 121 | ---------- 122 | locus : varcode.Locus 123 | Reference locus for which to construct a PileupElement. Must 124 | include exactly one base. 125 | 126 | pileup_read : pysam.calignmentfile.PileupRead 127 | pysam PileupRead instance. Its alignment must overlap the locus. 128 | 129 | Returns 130 | ---------- 131 | PileupElement 132 | 133 | ''' 134 | assert not pileup_read.is_refskip, ( 135 | "Can't create a PileupElement in a refskip (typically an intronic " 136 | "gap in an RNA alignment)") 137 | 138 | # Pysam has an `aligned_pairs` method that gives a list of 139 | # (offset, locus) pairs indicating the correspondence between bases in 140 | # the alignment and reference loci. Here we use that to compute 141 | # offset_start and offset_end. 142 | # 143 | # This is slightly tricky in the case of insertions and deletions. 144 | # Here are examples of the desired logic. 145 | # 146 | # Target locus = 1000 147 | # 148 | # (1) Simple case: matching bases. 149 | # 150 | # OFFSET LOCUS 151 | # 0 999 152 | # 1 1000 153 | # 2 1001 154 | # 155 | # DESIRED RESULT: offset_start=1, offset_end=2. 156 | # 157 | # 158 | # (2) A 1 base insertion at offset 2. 159 | # 160 | # OFFSET LOCUS 161 | # 0 999 162 | # 1 1000 163 | # 2 None 164 | # 3 1001 165 | # 166 | # DESIRED RESULT: offset_start = 1, offset_end=3. 167 | # 168 | # 169 | # (3) A 2 base deletion at loci 1000 and 1001. 170 | # 171 | # OFFSET LOCUS 172 | # 0 999 173 | # None 1000 174 | # None 1001 175 | # 1 1002 176 | # 177 | # DESIRED RESULT: offset_start = 1, offset_end=1. 178 | # 179 | offset_start = None 180 | offset_end = len(pileup_read.alignment.query_sequence) 181 | # TODO: doing this with get_blocks() may be faster. 
182 | for (offset, position) in pileup_read.alignment.aligned_pairs: 183 | if offset is not None and position is not None: 184 | if position == locus.position: 185 | offset_start = offset 186 | elif position > locus.position: 187 | offset_end = offset 188 | break 189 | if offset_start is None: 190 | offset_start = offset_end 191 | 192 | assert pileup_read.is_del == (offset_end - offset_start == 0), \ 193 | "Deletion=%s but | [%d,%d) |=%d for locus %d in: \n%s" % ( 194 | pileup_read.is_del, 195 | offset_start, 196 | offset_end, 197 | offset_end - offset_start, 198 | locus.position, 199 | pileup_read.alignment.aligned_pairs) 200 | 201 | assert offset_end >= offset_start 202 | result = PileupElement( 203 | locus, offset_start, offset_end, pileup_read.alignment) 204 | return result 205 | 206 | -------------------------------------------------------------------------------- /varlens/read_evidence/util.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from __future__ import absolute_import 14 | 15 | def alignment_key(pysam_alignment_record): 16 | ''' 17 | Return the identifying attributes of a `pysam.AlignedSegment` instance. 18 | This is necessary since these objects do not support a useful notion of 19 | equality (they compare on identify by default). 20 | ''' 21 | return ( 22 | read_key(pysam_alignment_record), 23 | pysam_alignment_record.query_alignment_start, 24 | pysam_alignment_record.query_alignment_end, 25 | ) 26 | 27 | def read_key(pysam_alignment_record): 28 | ''' 29 | Given a `pysam.AlignedSegment` instance, return the attributes identifying 30 | the *read* it comes from (not the alignment). There may be more than one 31 | alignment for a read, e.g. chimeric and secondary alignments. 32 | ''' 33 | return ( 34 | pysam_alignment_record.query_name, 35 | pysam_alignment_record.is_duplicate, 36 | pysam_alignment_record.is_read1, 37 | pysam_alignment_record.is_read2, 38 | ) 39 | -------------------------------------------------------------------------------- /varlens/read_source.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import logging 14 | 15 | import pyensembl 16 | import pysam 17 | 18 | from . 
import read_evidence 19 | 20 | class ReadSource(object): 21 | def __init__(self, name, filename, read_filters=[]): 22 | self.name = name 23 | self.filename = filename 24 | self.handle = pysam.Samfile(filename) 25 | self.read_filters = read_filters 26 | 27 | self.chromosome_name_map = {} 28 | for name in self.handle.references: 29 | normalized = pyensembl.locus.normalize_chromosome(name) 30 | self.chromosome_name_map[normalized] = name 31 | self.chromosome_name_map[name] = name 32 | 33 | def index_if_needed(self): 34 | if self.filename.endswith(".bam") and not self.handle.has_index(): 35 | # pysam strangely requires and index even to iterate through a bam. 36 | logging.info( 37 | "Attempting to create BAM index for file: %s" % self.filename) 38 | samtools_output = pysam.index(self.filename) 39 | logging.info( 40 | "Done indexing" + ( 41 | (": " + samtools_output) if samtools_output else '')) 42 | 43 | # Reopen 44 | self.handle.close() 45 | self.handle = pysam.Samfile(self.filename) 46 | 47 | def reads(self, loci=None): 48 | if loci is None: 49 | def reads_iterator(): 50 | return self.handle.fetch(until_eof=True) 51 | elif self.filename.endswith(".sam"): 52 | # Inefficient. 53 | chromosome_intervals = {} 54 | for (contig, intervals) in loci.contigs.items(): 55 | try: 56 | chromosome = self.chromosome_name_map[contig] 57 | except KeyError: 58 | logging.warn( 59 | "No such contig in bam: %s" % contig) 60 | continue 61 | chromosome_intervals[chromosome] = intervals 62 | 63 | def reads_iterator(): 64 | seen = set() 65 | for read in self.handle.fetch(until_eof=True): 66 | intervals = chromosome_intervals.get(read.reference_name) 67 | if not intervals or not intervals.overlaps_range( 68 | read.reference_start, 69 | read.reference_end): 70 | continue 71 | key = alignment_key(read) 72 | if key not in seen: 73 | yield read 74 | seen.add(key) 75 | else: 76 | self.index_if_needed() 77 | 78 | def reads_iterator(): 79 | seen = set() 80 | for locus in loci: 81 | try: 82 | chromosome = self.chromosome_name_map[locus.contig] 83 | except KeyError: 84 | logging.warn( 85 | "No such contig in bam: %s" % locus.contig) 86 | continue 87 | for read in self.handle.fetch( 88 | chromosome, 89 | locus.start, 90 | locus.end): 91 | key = alignment_key(read) 92 | if key not in seen: 93 | yield read 94 | seen.add(key) 95 | 96 | return ( 97 | read for read in reads_iterator() 98 | if self.read_passes_filters(read)) 99 | 100 | def read_passes_filters(self, read): 101 | return all(read_filter(read) for read_filter in self.read_filters) 102 | 103 | def pileups(self, loci): 104 | self.index_if_needed() 105 | collection = read_evidence.PileupCollection.from_bam(self.handle, loci) 106 | if self.read_filters: 107 | for (locus, pileup) in collection.pileups.items(): 108 | collection.pileups[locus] = pileup.filter( 109 | [lambda element: 110 | self.read_passes_filters(element.alignment)]) 111 | return collection 112 | 113 | def alignment_key(pysam_alignment_record): 114 | ''' 115 | Return the identifying attributes of a `pysam.AlignedSegment` instance. 116 | This is necessary since these objects do not support a useful notion of 117 | equality (they compare on identify by default). 
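Because the returned key is a hashable tuple, the reads_iterator helpers above can keep a `seen` set of these keys to avoid yielding the same alignment twice when the requested loci overlap.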
118 | ''' 119 | return ( 120 | read_key(pysam_alignment_record), 121 | pysam_alignment_record.query_alignment_start, 122 | pysam_alignment_record.query_alignment_end, 123 | ) 124 | 125 | 126 | def read_key(pysam_alignment_record): 127 | ''' 128 | Given a `pysam.AlignedSegment` instance, return the attributes identifying 129 | the *read* it comes from (not the alignment). There may be more than one 130 | alignment for a read, e.g. chimeric and secondary alignments. 131 | ''' 132 | return ( 133 | pysam_alignment_record.query_name, 134 | pysam_alignment_record.is_duplicate, 135 | pysam_alignment_record.is_read1, 136 | pysam_alignment_record.is_read2, 137 | ) 138 | 139 | 140 | -------------------------------------------------------------------------------- /varlens/reads_util.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import collections 14 | import functools 15 | 16 | from .read_source import ReadSource 17 | from . import util 18 | 19 | BOOLEAN_PROPERTIES = """ 20 | is_paired is_proper_pair is_qcfail is_read1 is_read2 is_reverse is_secondary 21 | is_unmapped mate_is_reverse mate_is_unmapped is_duplicate 22 | """.split() 23 | 24 | STRING_PROPERTIES = """ 25 | cigarstring query_alignment_sequence query_name 26 | """.split() 27 | 28 | INT_PROPERTIES = """ 29 | inferred_length mapping_quality query_alignment_length query_alignment_start 30 | query_length reference_length reference_start template_length 31 | """.split() 32 | 33 | # name -> (type, help, filter function) 34 | READ_FILTERS = collections.OrderedDict() 35 | 36 | for prop in BOOLEAN_PROPERTIES: 37 | READ_FILTERS[prop] = ( 38 | bool, 39 | "Only reads where %s is True" % prop, 40 | functools.partial( 41 | (lambda field_name, parsed_value, read: 42 | bool(getattr(read, field_name))), 43 | prop) 44 | ) 45 | 46 | READ_FILTERS["not_" + prop] = ( 47 | bool, 48 | "Only reads where %s is False" % prop, 49 | functools.partial( 50 | (lambda field_name, parsed_value, read: 51 | not getattr(read, field_name)), 52 | prop) 53 | ) 54 | 55 | def field_contains(field_name, parsed_value, read): 56 | field_value = getattr(read, field_name) 57 | return field_value is not None and parsed_value in field_value 58 | 59 | for prop in STRING_PROPERTIES: 60 | READ_FILTERS["%s" % prop] = ( 61 | str, 62 | "Only reads with the specified %s" % prop, 63 | functools.partial( 64 | (lambda field_name, parsed_value, read: 65 | getattr(read, field_name) == parsed_value), 66 | prop) 67 | ) 68 | 69 | READ_FILTERS["%s_contains" % prop] = ( 70 | str, 71 | "Only reads where %s contains the given string" % prop, 72 | functools.partial(field_contains, prop)) 73 | 74 | for prop in INT_PROPERTIES: 75 | READ_FILTERS["%s" % prop] = ( 76 | int, 77 | "Only reads with the specified %s" % prop, 78 | functools.partial( 79 | (lambda field_name, parsed_value, read: 80 | getattr(read, field_name) == parsed_value), 81 | prop) 82 | ) 83 | 84 | READ_FILTERS["min_%s" % 
prop] = ( 85 | int, 86 | "Only reads where %s >=N" % prop, 87 | functools.partial( 88 | (lambda field_name, parsed_value, read: 89 | getattr(read, field_name) >= parsed_value), 90 | prop) 91 | ) 92 | 93 | READ_FILTERS["max_%s" % prop] = ( 94 | int, 95 | "Only reads where %s <=N" % prop, 96 | functools.partial( 97 | (lambda field_name, parsed_value, read: 98 | getattr(read, field_name) <= parsed_value), 99 | prop) 100 | ) 101 | 102 | def add_args(parser, positional=False): 103 | """ 104 | Extends a commandline argument parser with arguments for specifying 105 | read sources. 106 | """ 107 | group = parser.add_argument_group("read loading") 108 | group.add_argument("reads" if positional else "--reads", 109 | nargs="+", default=[], 110 | help="Paths to bam files. Any number of paths may be specified.") 111 | 112 | group.add_argument( 113 | "--read-source-name", 114 | nargs="+", 115 | help="Names for each read source. The number of names specified " 116 | "must match the number of bam files. If not specified, filenames are " 117 | "used for names.") 118 | 119 | # Add filters 120 | group = parser.add_argument_group( 121 | "read filtering", 122 | "A number of read filters are available. See the pysam " 123 | "documentation (http://pysam.readthedocs.org/en/latest/api.html) " 124 | "for details on what these fields mean. When multiple filter " 125 | "options are specified, reads must match *all* filters.") 126 | 127 | for (name, (kind, message, function)) in READ_FILTERS.items(): 128 | extra = {} 129 | if kind is bool: 130 | extra["action"] = "store_true" 131 | extra["default"] = None 132 | elif kind is int: 133 | extra["type"] = int 134 | extra["metavar"] = "N" 135 | elif kind is str: 136 | extra["metavar"] = "STRING" 137 | group.add_argument("--" + name.replace("_", "-"), 138 | help=message, 139 | **extra) 140 | 141 | def load_from_args(args): 142 | """ 143 | Given parsed commandline arguments, returns a list of ReadSource objects 144 | """ 145 | if not args.reads: 146 | return None 147 | 148 | if args.read_source_name: 149 | read_source_names = util.expand( 150 | args.read_source_name, 151 | 'read_source_name', 152 | 'read source', 153 | len(args.reads)) 154 | else: 155 | read_source_names = util.drop_prefix(args.reads) 156 | 157 | filters = [] 158 | for (name, info) in READ_FILTERS.items(): 159 | value = getattr(args, name) 160 | if value is not None: 161 | filters.append(functools.partial(info[-1], value)) 162 | 163 | return [ 164 | load_bam(filename, name, filters) 165 | for (filename, name) 166 | in zip(args.reads, read_source_names) 167 | ] 168 | 169 | def load_bam(filename, name=None, filters=[]): 170 | if not name: 171 | name = filename 172 | return ReadSource(name, filename, filters) 173 | 174 | def flatten_header(header): 175 | for (group, rows) in header.items(): 176 | for (index, row) in enumerate(rows): 177 | if not isinstance(row, dict): 178 | key_values = [(row, "")] 179 | else: 180 | key_values = row.items() 181 | for (key, value) in key_values: 182 | yield (str(group), index, str(key), str(value)) 183 | -------------------------------------------------------------------------------- /varlens/sequence_context.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import pyfaidx 14 | 15 | def variant_context( 16 | reference_fasta, 17 | contig, 18 | inclusive_start, 19 | inclusive_end, 20 | alt, 21 | context_length): 22 | """ 23 | Retrieve the surronding reference region from a variant. 24 | 25 | SNVs are canonicalized so the reference base is a pyrmidine (C/T). For 26 | indels the reverse complement will still be taken if the first base of 27 | the reference is not a pyrmidine, but since the reference will also be 28 | reversed, that doesn't guarantee it will start with a pyrmidine. 29 | 30 | Parameters 31 | ---------- 32 | reference_fasta : FastaReference 33 | reference sequence from pyfaidx package 34 | 35 | contig : str 36 | Chromosome of the variant 37 | 38 | inclusive_start : int 39 | start of the variant in 1-based inclusive coordinates 40 | 41 | inclusive_end : int 42 | end of the variant in 1-based inclusive coordinates 43 | 44 | alt : string 45 | alt sequence 46 | 47 | context_length : int 48 | number of bases on either side of the variant to return 49 | 50 | Returns 51 | --------- 52 | A tuple of (5', mutation, 3') where 53 | 5' - bases immediately 5 prime to the mutation 54 | 55 | 3' - bases immediately 3 prime to the mutation 56 | 57 | mutation - the ref sequence followed by a > character followed by the 58 | the alt sequence 59 | """ 60 | 61 | # Move from 1-base coorindates to 0-base coordinates 62 | start = int(inclusive_start) - 1 63 | end = int(inclusive_end) 64 | 65 | full_sequence = reference_fasta[contig] 66 | 67 | left = str(full_sequence[start - context_length:start].seq).upper() 68 | middle = str(full_sequence[start: end].seq).upper() 69 | right = str(full_sequence[end: end + context_length].seq).upper() 70 | 71 | # Complement and reverse the context if necessary so the ref base is a 72 | # pyrmidine (C/T) 73 | if middle[0] in ('A', 'G'): 74 | context_5prime = pyfaidx.complement(right)[::-1] 75 | context_3prime = pyfaidx.complement(left)[::-1] 76 | context_mutation = "%s>%s" % ( 77 | pyfaidx.complement(middle)[::-1], pyfaidx.complement(alt)[::-1]) 78 | else: 79 | context_5prime = left 80 | context_3prime = right 81 | context_mutation = "%s>%s" % (middle, alt) 82 | 83 | return (context_5prime, context_mutation, context_3prime) 84 | 85 | -------------------------------------------------------------------------------- /varlens/support.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
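# Editorial sketch (not part of the original module) showing how the helpers
# below are typically driven; the BAM path is one of the repository's test
# fixtures and the locus is arbitrary:
#
#     from varlens.locus import Locus
#     from varlens.loci_util import Loci
#     from varlens.reads_util import load_bam
#     from varlens.support import allele_support_df
#     loci = Loci([Locus.from_inclusive_coordinates("22", 46931062)])
#     df = allele_support_df(loci, [load_bam("tests/data/CELSR1/bams/bam_1.bam")])
#     # df columns: source, contig, interbase_start, interbase_end, allele, count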
12 | 13 | import collections 14 | import logging 15 | 16 | import pandas 17 | 18 | EXPECTED_COLUMNS = [ 19 | "source", 20 | "contig", 21 | "interbase_start", 22 | "interbase_end", 23 | "allele", 24 | "count", 25 | ] 26 | 27 | def allele_support_df(loci, sources): 28 | """ 29 | Returns a DataFrame of allele counts for all given loci in the read sources 30 | """ 31 | return pandas.DataFrame( 32 | allele_support_rows(loci, sources), 33 | columns=EXPECTED_COLUMNS) 34 | 35 | def allele_support_rows(loci, sources): 36 | for source in sources: 37 | logging.info("Reading from: %s (%s)" % (source.name, source.filename)) 38 | for locus in loci: 39 | grouped = dict(source.pileups([locus]).group_by_allele(locus)) 40 | if grouped: 41 | items = grouped.items() 42 | else: 43 | items = [("N" * (locus.end - locus.start), None)] 44 | for (allele, group) in items: 45 | d = collections.OrderedDict([ 46 | ("source", source.name), 47 | ("contig", locus.contig), 48 | ("interbase_start", str(locus.start)), 49 | ("interbase_end", str(locus.end)), 50 | ("allele", allele), 51 | ("count", group.num_reads() if group is not None else 0), 52 | ]) 53 | yield pandas.Series(d) 54 | 55 | def variant_support(variants, allele_support_df, ignore_missing=False): 56 | ''' 57 | Collect the read evidence support for the given variants. 58 | 59 | Parameters 60 | ---------- 61 | 62 | variants : iterable of varcode.Variant 63 | 64 | allele_support_df : dataframe 65 | Allele support dataframe, as output by the varlens-allele-support tool. 66 | It should have columns: source, contig, interbase_start, interbase_end, 67 | allele. The remaining columns are interpreted as read counts of various 68 | subsets of reads (e.g. all reads, non-duplicate reads, etc.) 69 | 70 | ignore_missing : boolean 71 | If True, then varaints with no allele counts will be interpreted as 72 | having 0 depth. If False, then an exception will be raised if any 73 | variants have no allele counts. 74 | 75 | Returns 76 | ---------- 77 | 78 | A pandas.Panel4D frame with these axes: 79 | 80 | labels (axis=0) : the type of read being counted, i.e. the read count 81 | fields in allele_support_df. 82 | 83 | items (axis=1) : the type of measurement (num_alt, num_ref, num_other, 84 | total_depth, alt_fraction, any_alt_fraction) 85 | 86 | major axis (axis=2) : the variants 87 | 88 | minor axis (axis=3) : the sources 89 | ''' 90 | missing = [ 91 | c for c in EXPECTED_COLUMNS if c not in allele_support_df.columns 92 | ] 93 | if missing: 94 | raise ValueError("Missing columns: %s" % " ".join(missing)) 95 | 96 | # Ensure our start and end fields are ints. 97 | allele_support_df[["interbase_start", "interbase_end"]] = ( 98 | allele_support_df[["interbase_start", "interbase_end"]].astype(int)) 99 | 100 | sources = sorted(allele_support_df["source"].unique()) 101 | 102 | allele_support_dict = collections.defaultdict(dict) 103 | for (i, row) in allele_support_df.iterrows(): 104 | key = ( 105 | row['source'], 106 | row.contig, 107 | row.interbase_start, 108 | row.interbase_end) 109 | allele_support_dict[key][row.allele] = row["count"] 110 | 111 | # We want an exception on bad lookups, so convert to a regular dict. 
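# Editorial note (assuming varcode's 1-based inclusive Variant coordinates):
# the lookup keys built below use variant.start - 1 and variant.end so that
# they line up with the interbase_start / interbase_end columns produced by
# allele_support_rows above.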
112 | allele_support_dict = dict(allele_support_dict) 113 | 114 | dataframe_dicts = collections.defaultdict( 115 | lambda: collections.defaultdict(list)) 116 | 117 | for variant in variants: 118 | for source in sources: 119 | key = (source, variant.contig, variant.start - 1, variant.end) 120 | try: 121 | alleles = allele_support_dict[key] 122 | except KeyError: 123 | message = ( 124 | "No allele counts in source %s for variant %s" % ( 125 | source, str(variant))) 126 | if ignore_missing: 127 | logging.warning(message) 128 | alleles = {} 129 | else: 130 | raise ValueError(message) 131 | 132 | alt = alleles.get(variant.alt, 0) 133 | ref = alleles.get(variant.ref, 0) 134 | total = sum(alleles.values()) 135 | 136 | other = total - alt - ref 137 | 138 | dataframe_dicts["num_alt"][source].append(alt) 139 | dataframe_dicts["num_ref"][source].append(ref) 140 | dataframe_dicts["num_other"][source].append(other) 141 | dataframe_dicts["total_depth"][source].append(total) 142 | dataframe_dicts["alt_fraction"][source].append( 143 | float(alt) / max(1, total)) 144 | dataframe_dicts["any_alt_fraction"][source].append( 145 | float(alt + other) / max(1, total)) 146 | 147 | dataframes = dict( 148 | (label, pandas.DataFrame(value, index=variants)) 149 | for (label, value) in dataframe_dicts.items()) 150 | 151 | return pandas.Panel(dataframes) 152 | -------------------------------------------------------------------------------- /varlens/util.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import os 14 | import argparse 15 | 16 | def expand(value, arg_name, input_name, length): 17 | if value is None or len(value) == 0: 18 | return [None] * length 19 | 20 | if len(value) == length: 21 | return value 22 | 23 | if len(value) == 1: 24 | return value * length 25 | 26 | if length == 1: 27 | raise ValueError( 28 | "With only 1 {input_name} specified, each {arg_name} argument " 29 | "should be length 1. If you are trying to specify multiple filters" 30 | " to apply consecutively, you should specify the entire argument " 31 | "multiple times." 32 | .format( 33 | arg_name=arg_name, 34 | input_name=input_name, 35 | length=length, 36 | actual=len(value))) 37 | 38 | else: 39 | raise ValueError( 40 | "Expected argument {arg_name} to be length 1 (i.e. apply to all " 41 | "{input_name} inputs) or length {length} (i.e. an individual value" 42 | " for each of the {length} {input_name} inputs), not {actual}." 
43 | .format( 44 | arg_name=arg_name, 45 | input_name=input_name, 46 | length=length, 47 | actual=len(value))) 48 | 49 | 50 | def drop_prefix(strings): 51 | """ 52 | Removes common prefix from a collection of strings 53 | """ 54 | strings_without_extensions = [ 55 | s.split(".", 2)[0] for s in strings 56 | ] 57 | 58 | if len(strings_without_extensions) == 1: 59 | return [os.path.basename(strings_without_extensions[0])] 60 | prefix_len = len(os.path.commonprefix(strings_without_extensions)) 61 | result = [string[prefix_len:] for string in strings_without_extensions] 62 | if len(set(result)) != len(strings): 63 | # If these operations resulted in a collision, just return the original 64 | # strings. 65 | return strings 66 | return result 67 | 68 | class PrefixedArgumentParser(object): 69 | def __init__(self, wrapped, prefix): 70 | self.wrapped = wrapped 71 | self.prefix = prefix 72 | 73 | def add_argument(self, name, *args, **kwargs): 74 | assert name.startswith("--") 75 | new_name = "--" + self.prefix + "-" + name[2:] 76 | self.wrapped.add_argument(new_name, *args, **kwargs) 77 | 78 | 79 | def remove_prefix_from_parsed_args(args, prefix): 80 | result = argparse.Namespace() 81 | for (arg, value) in args._get_kwargs(): 82 | if arg.startswith(prefix + "_"): 83 | setattr(result, arg[len(prefix + "_"):], value) 84 | return result 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /varlens/variant_includes.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import logging 14 | import time 15 | import collections 16 | 17 | import pandas 18 | import numpy 19 | import typechecks 20 | 21 | import pyfaidx 22 | 23 | from . import sequence_context 24 | from . import mhc_binding 25 | from . import reads_util 26 | from . import support 27 | from . import read_evidence 28 | 29 | class Includeable(object): 30 | columns = None 31 | 32 | @classmethod 33 | def from_args(cls, args): 34 | return cls() 35 | 36 | def process_chunk(self, df): 37 | raise NotImplementedError() 38 | 39 | def compute(self, df, chunk_rows=None): 40 | assert self.columns 41 | for column in self.columns: 42 | if column not in df.columns: 43 | df[column] = numpy.nan 44 | rows_to_annotate = pandas.isnull(df[self.columns[0]]) 45 | for column in self.columns[1:]: 46 | rows_to_annotate = rows_to_annotate | pandas.isnull(df[column]) 47 | 48 | while rows_to_annotate.sum() > 0: 49 | if chunk_rows: 50 | this_chunk_rows = rows_to_annotate & ( 51 | rows_to_annotate.cumsum() <= chunk_rows) 52 | else: 53 | this_chunk_rows = rows_to_annotate 54 | 55 | num_remaining = rows_to_annotate.sum() 56 | logging.info("%s: %d / %d (%0.1f%%) remaining. Processing %d rows." 
57 | % (
58 | self.name,
59 | num_remaining,
60 | len(rows_to_annotate),
61 | num_remaining * 100.0 / len(rows_to_annotate),
62 | this_chunk_rows.sum()))
63 |
64 | rows_to_annotate = rows_to_annotate & (~ this_chunk_rows)
65 |
66 | if this_chunk_rows.sum() > 0:
67 | start = time.time()
68 | df.ix[this_chunk_rows, self.columns] = self.process_chunk(
69 | df.ix[this_chunk_rows].copy())[self.columns]
70 | logging.info("Processed in %0.2f sec" % (time.time() - start))
71 | yield this_chunk_rows.sum()
72 |
73 | class Effect(Includeable):
74 | name = "variant effect annotations"
75 | columns = ["effect"]
76 |
77 | @staticmethod
78 | def add_args(parser):
79 | parser = parser.add_argument_group(Effect.name)
80 | parser.add_argument("--include-effect",
81 | action="store_true", default=False,
82 | help="Include varcode effect annotations")
83 |
84 | @staticmethod
85 | def requested(args):
86 | return args.include_effect
87 |
88 | def process_chunk(self, df):
89 | df["effect"] = [
90 | v.effects().top_priority_effect().short_description
91 | for v in df["variant"]
92 | ]
93 | return df
94 |
95 | class Gene(Includeable):
96 | name = "gene annotations"
97 | columns = ["gene"]
98 |
99 | @staticmethod
100 | def add_args(parser):
101 | parser = parser.add_argument_group(Gene.name)
102 | parser.add_argument("--include-gene",
103 | action="store_true", default=False,
104 | help="Include gene names")
105 |
106 | @staticmethod
107 | def requested(args):
108 | return args.include_gene
109 |
110 | def process_chunk(self, df):
111 | df["gene"] = [
112 | ' '.join(v.gene_names) if v.gene_names else 'None'
113 | for v in df.variant
114 | ]
115 | return df
116 |
117 | class Context(Includeable):
118 | name = "variant sequence context"
119 | columns = ["context_5_prime", "context_3_prime", "context_mutation"]
120 |
121 | @staticmethod
122 | def add_args(parser):
123 | parser = parser.add_argument_group(Context.name)
124 | parser.add_argument("--include-context",
125 | action="store_true", default=False,
126 | help="Include variant sequence context")
127 | parser.add_argument("--reference",
128 | help="Path to reference fasta (required for sequence context)")
129 | parser.add_argument("--context-num-bases", type=int, default=15,
130 | metavar="N",
131 | help="Num bases of context to include on each side of the variant")
132 |
133 | @classmethod
134 | def from_args(cls, args):
135 | if not args.reference:
136 | raise ValueError(
137 | "The --reference argument is required when including context")
138 | return cls(
139 | reference=pyfaidx.Fasta(args.reference),
140 | context_num_bases=args.context_num_bases)
141 |
142 | def __init__(self, reference, context_num_bases):
143 | self.reference = reference
144 | self.context_num_bases = context_num_bases
145 |
146 | @staticmethod
147 | def requested(args):
148 | return args.include_context
149 |
150 | def process_chunk(self, df):
151 | context_5_prime = []
152 | context_3_prime = []
153 | context_mutation = []
154 | for variant in df.variant:
155 | tpl = sequence_context.variant_context(
156 | self.reference,
157 | variant.contig,
158 | variant.start,
159 | variant.end,
160 | variant.alt,
161 | self.context_num_bases)
162 | context_5_prime.append(tpl[0])
163 | context_mutation.append(tpl[1])
164 | context_3_prime.append(tpl[2])
165 |
166 | df["context_5_prime"] = context_5_prime
167 | df["context_3_prime"] = context_3_prime
168 | df["context_mutation"] = context_mutation
169 | return df
170 |
171 | class MHCBindingAffinity(Includeable):
172 | name = "MHC binding affinity"
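# Adds binding_affinity (the tightest predicted affinity across the donor's
# HLA alleles) and binding_allele (the allele achieving it), computed via
# mhc_binding.binding_affinities; variants with noncoding effects are skipped
# when an "effect" column is present.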
173 | columns = ["binding_affinity", "binding_allele"] 174 | 175 | noncoding_effects = set([ 176 | "intergenic", 177 | "intronic", 178 | "non-coding-transcript", 179 | "3' UTR", 180 | "5' UTR", 181 | "silent", 182 | ]) 183 | 184 | @staticmethod 185 | def add_args(parser): 186 | parser = parser.add_argument_group(MHCBindingAffinity.name) 187 | parser.add_argument("--include-mhc-binding", 188 | action="store_true", default=False, 189 | help="Include MHC binding (tightest affinity and allele)") 190 | parser.add_argument("--hla", 191 | help="Space separated list of MHC alleles, e.g. 'A:02:01 A:02:02'") 192 | parser.add_argument('--hla-file', 193 | help="Load HLA types from the specified CSV file. It must have " 194 | "columns: 'donor' and 'hla'") 195 | 196 | @classmethod 197 | def from_args(cls, args): 198 | if bool(args.hla) + bool(args.hla_file) != 1: 199 | raise ValueError("Must specify exactly one of --hla or --hla-file") 200 | return cls( 201 | hla=args.hla, 202 | hla_dataframe=( 203 | pandas.read_csv(args.hla_file) if args.hla_file else None)) 204 | 205 | @staticmethod 206 | def string_to_hla_alleles(s): 207 | return s.replace("'", "").split() 208 | 209 | def __init__(self, hla=None, hla_dataframe=None, donor_to_hla=None): 210 | """ 211 | Specify exactly one of hla, hla_dataframe, or donor_to_hla. 212 | 213 | Parameters 214 | ----------- 215 | hla : list of string 216 | HLA alleles to use for all donors 217 | 218 | hla_dataframe : pandas.DataFrame with columns 'donor' and 'hla' 219 | DataFrame giving HLA alleles for each donor. The 'hla' column 220 | should be a space separated list of alleles for that donor. 221 | 222 | donor_to_hla : dict of string -> string list 223 | Map from donor to HLA alleles for that donor. 224 | """ 225 | if bool(hla) + (hla_dataframe is not None) + bool(donor_to_hla) != 1: 226 | raise TypeError( 227 | "Must specify exactly one of hla, hla_dataframe, donor_to_hla") 228 | 229 | self.hla = ( 230 | self.string_to_hla_alleles(hla) if typechecks.is_string(hla) 231 | else hla) 232 | self.donor_to_hla = donor_to_hla 233 | if hla_dataframe is not None: 234 | self.donor_to_hla = {} 235 | for (i, row) in hla_dataframe.iterrows(): 236 | if row.donor in self.donor_to_hla: 237 | raise ValueError("Multiple rows for donor: %s" % row.donor) 238 | if pandas.isnull(row.hla): 239 | self.donor_to_hla[row.donor] = None 240 | else: 241 | self.donor_to_hla[row.donor] = self.string_to_hla_alleles( 242 | row.hla) 243 | assert self.hla is not None or self.donor_to_hla is not None 244 | 245 | @staticmethod 246 | def requested(args): 247 | return args.include_mhc_binding 248 | 249 | def process_chunk(self, df): 250 | drop_donor = False 251 | if 'donor' not in df: 252 | df["donor"] = "DONOR1" 253 | drop_donor = True 254 | for donor in df.donor.unique(): 255 | rows = (df.donor == donor) 256 | if 'effect' in df: 257 | rows = rows & (~df.effect.isin(self.noncoding_effects)) 258 | sub_df = df.loc[rows] 259 | alleles = self.hla if self.hla else self.donor_to_hla.get(donor) 260 | if alleles and sub_df.shape[0] > 0: 261 | result = mhc_binding.binding_affinities( 262 | sub_df.variant, alleles) 263 | df.loc[rows, "binding_affinity"] = ( 264 | result["binding_affinity"].values) 265 | df.loc[rows, "binding_allele"] = ( 266 | result["binding_allele"].values) 267 | if drop_donor: 268 | del df["donor"] 269 | return df 270 | 271 | class ReadEvidence(Includeable): 272 | name = "read evidence" 273 | default_column_format = "{source}_count_{allele_group}" 274 | 275 | @classmethod 276 | def add_args(cls, 
parser): 277 | group = parser.add_argument_group(cls.name) 278 | group.add_argument("--include-read-evidence", 279 | action="store_true", default=False, 280 | help="Include counts of supporting / contradicting reads") 281 | group.add_argument("--read-sources-file", 282 | help="Load paths to BAMs from the given csv file.") 283 | group.add_argument("--read-sources-id-column", 284 | default="source_id", 285 | help="Column to use to join read sources with the variants " 286 | "dataframe.") 287 | group.add_argument("--read-sources-column", action="append", 288 | default=[], 289 | help="Column containing path to reads (e.g. path to a BAM). Can " 290 | "be specified any number of times. If not specified, all " 291 | "columns are used.") 292 | group.add_argument("--always-prefix-column", action="store_true", 293 | default=False, 294 | help="Always prefix the column names with the source name and " 295 | "count group, even when there is only one of each.") 296 | group.add_argument("--survive-errors", action="store_true", 297 | default=False, 298 | help="If an error is encountered log it and try to continue.") 299 | 300 | reads_util.add_args(parser) 301 | 302 | @classmethod 303 | def from_args(cls, args): 304 | read_sources = reads_util.load_from_args(args) 305 | read_sources_df = None 306 | if args.read_sources_file is not None: 307 | read_sources_df = pandas.read_csv( 308 | args.read_sources_file, 309 | index_col=args.read_sources_id_column) 310 | if args.read_sources_column: 311 | read_sources_df = read_sources_df[args.read_sources_column] 312 | 313 | source_names = cls.read_source_names(read_sources, read_sources_df) 314 | if (args.always_prefix_column or len(source_names) > 1): 315 | column_format = cls.default_column_format 316 | else: 317 | column_format = "{allele_group}" 318 | return cls( 319 | read_sources=read_sources, 320 | read_sources_df=read_sources_df, 321 | column_format=column_format, 322 | survive_errors=args.survive_errors) 323 | 324 | def __init__(self, 325 | read_sources=None, 326 | read_sources_df=None, 327 | column_format=default_column_format, 328 | survive_errors=False): 329 | """ 330 | 331 | """ 332 | if sum(x is not None for x in [read_sources, read_sources_df]) != 1: 333 | raise TypeError( 334 | "Specify exactly one of read_sources, read_sources_df") 335 | 336 | self.read_sources = read_sources 337 | self.read_sources_df = read_sources_df 338 | self.column_format = column_format 339 | self.survive_errors = survive_errors 340 | self.set_columns() 341 | 342 | @staticmethod 343 | def read_source_names(read_sources=None, read_sources_df=None): 344 | if read_sources is not None: 345 | return [x.name for x in read_sources] 346 | return read_sources_df.columns.tolist() 347 | 348 | def set_columns(self): 349 | source_names = self.read_source_names( 350 | read_sources=self.read_sources, 351 | read_sources_df=self.read_sources_df) 352 | assert source_names 353 | self.columns_dict = collections.OrderedDict() 354 | for source_name in source_names: 355 | for allele_group in ["num_alt", "num_ref", "total_depth"]: 356 | column_name = self.column_name( 357 | source_name, allele_group) 358 | self.columns_dict[column_name] = ( 359 | source_name, allele_group) 360 | self.columns = list(self.columns_dict) 361 | assert self.columns 362 | 363 | def column_name(self, source, allele_group): 364 | """ 365 | Parameters 366 | ---------- 367 | source : string 368 | name of the ReadSource 369 | 370 | allele_group : string 371 | one of: num_ref, num_alt, total_depth 372 | 373 | Returns 374 | 
---------- 375 | column name : string 376 | """ 377 | return self.column_format.format( 378 | source=source, 379 | allele_group=allele_group) 380 | 381 | @staticmethod 382 | def requested(args): 383 | return args.include_read_evidence 384 | 385 | def process_chunk(self, df): 386 | if self.read_sources_df is None: 387 | def rows_and_read_sources(): 388 | all_rows = numpy.ones(df.shape[0], dtype=bool) 389 | yield (all_rows, self.read_sources) 390 | else: 391 | def rows_and_read_sources(): 392 | join_col = self.read_sources_df.index.name 393 | for join_value in df[join_col].unique(): 394 | rows = df[join_col] == join_value 395 | read_paths = self.read_sources_df.ix[join_value] 396 | read_sources = [] 397 | for (name, filename) in read_paths.iteritems(): 398 | if pandas.isnull(filename): 399 | continue 400 | relevant_columns = [ 401 | col for (col, (source_name, allele_group)) 402 | in self.columns_dict.items() 403 | if source_name == name 404 | ] 405 | if (~pandas.isnull(df[relevant_columns].values)).all(): 406 | logging.info( 407 | "Skipping source %s (%s) for %s: data exists" % 408 | (name, filename, join_value)) 409 | continue 410 | try: 411 | read_sources.append(reads_util.load_bam( 412 | filename, 413 | name=name)) 414 | except Exception as e: 415 | logging.error("Error loading bam: %s in %s" % 416 | (str(e), filename)) 417 | if not self.survive_errors: 418 | raise 419 | continue 420 | 421 | if rows.sum() > 0 and read_sources: 422 | logging.info( 423 | "Processing %s=%s (%d rows, %d read sources)" % ( 424 | join_col, 425 | join_value, 426 | rows.sum(), 427 | len(read_sources))) 428 | yield (rows, read_sources) 429 | else: 430 | logging.info( 431 | "Skipping %s=%s (%d rows, %d read sources)" % ( 432 | join_col, 433 | join_value, 434 | rows.sum(), 435 | len(read_sources))) 436 | 437 | for (rows, sources) in rows_and_read_sources(): 438 | variants = df.variant[rows] 439 | counter = collections.Counter(variants) 440 | duplicate_variants = dict( 441 | (v, c) for (v, c) in counter.items() if c > 1) 442 | if duplicate_variants: 443 | raise ValueError("Duplicate variant(s) for this source: %s" % 444 | duplicate_variants) 445 | variant_loci = sorted(set( 446 | read_evidence.pileup_collection.to_locus(variant) 447 | for variant in variants)) 448 | 449 | allele_support_df = support.allele_support_df( 450 | variant_loci, sources) 451 | assert set(s.name for s in sources) == set( 452 | allele_support_df.source.unique()) 453 | variant_support_df = support.variant_support( 454 | variants, allele_support_df) 455 | assert set(s.name for s in sources) == set( 456 | variant_support_df.minor_axis) 457 | 458 | for allele_group in ["num_alt", "num_ref", "total_depth"]: 459 | sub_panel = variant_support_df[allele_group, variants] 460 | for source_column in sub_panel.columns: 461 | dest_column = self.column_name( 462 | source_column, allele_group) 463 | assert dest_column in self.columns, ( 464 | "Bad column: %s not in %s" % ( 465 | dest_column, " ".join(self.columns))) 466 | values = sub_panel[source_column].values 467 | assert len(values) == rows.sum(), "%d != %d" % ( 468 | len(values), rows.sum()) 469 | df.loc[rows, dest_column] = values 470 | return df 471 | 472 | INCLUDEABLES = Includeable.__subclasses__() 473 | -------------------------------------------------------------------------------- /varlens/variants_util.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except 
in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import collections 14 | 15 | import pandas 16 | import varcode 17 | import varcode.reference 18 | import logging 19 | 20 | from . import util, loci_util 21 | from .locus import Locus 22 | from .read_evidence import pileup_collection 23 | 24 | STANDARD_DATAFRAME_COLUMNS = [ 25 | "genome", 26 | "contig", 27 | "interbase_start", 28 | "interbase_end", 29 | "ref", 30 | "alt", 31 | ] 32 | 33 | def add_args(parser, positional=False): 34 | group = parser.add_argument_group("variant loading") 35 | group.add_argument("variants" if positional else "--variants", 36 | nargs=("*" if positional else "+"), default=[], 37 | help="Path to VCF file. Any number of VCF files may be specified. " 38 | "CSV files in the format outputted by the varlens-variants tool are " 39 | "also supported.") 40 | group.add_argument("--genome", 41 | help="Genome for the variants (e.g. b37). Required when the genome " 42 | "cannot be guessed from the metadata in the VCF.") 43 | group.add_argument("--include-failing-variants", 44 | action="store_true", 45 | default=False, 46 | help="Include variants with a non-PASS filter field.") 47 | group.add_argument("--variant-source-name", nargs="+", 48 | help="Names for variant sources. Must specify one name per variant " 49 | "source. If not specified, the filenames are used.") 50 | group.add_argument("--max-variants-per-source", type=int, 51 | metavar="N", 52 | help="Load at most N variants from each source.") 53 | group.add_argument("--single-variant", nargs=3, action="append", 54 | default=[], metavar="X", 55 | help="Literal variant specified as three arguments: LOCUS REF ALT. " 56 | "Can be specified any number of times by repeating the " 57 | "--single-variant option.") 58 | 59 | # Filters 60 | group = parser.add_argument_group("variant filtering", 61 | "If multiple filters are specified, the variants must pass *all* " 62 | "filters. For filtering by loci, any variants that overlap " 63 | "the specified loci are included.") 64 | group.add_argument("--ref", nargs="+", 65 | help="Include only variants where ref is one of the given values.") 66 | group.add_argument("--alt", nargs="+", 67 | help="Include only variants where alt is one of the given values.") 68 | loci_util.add_args(util.PrefixedArgumentParser(group, "variant")) 69 | 70 | def load_from_args_as_dataframe(args): 71 | ''' 72 | Given parsed variant-loading arguments, return a pandas DataFrame. 73 | 74 | If no variant loading arguments are specified, return None. 
75 | ''' 76 | if not args.variants and not args.single_variant: 77 | return None 78 | 79 | if args.variant_source_name: 80 | variant_source_names = util.expand( 81 | args.variant_source_name, 82 | 'variant_source_name', 83 | 'variant source', 84 | len(args.variants)) 85 | else: 86 | variant_source_names = util.drop_prefix(args.variants) 87 | 88 | variant_to_sources = collections.defaultdict(list) 89 | 90 | dfs = [] 91 | for i in range(len(args.variants)): 92 | name = variant_source_names[i] 93 | prefix = ( 94 | 'metadata:' if len(args.variants) == 1 else "metadata:%s:" % name) 95 | df = load_as_dataframe( 96 | args.variants[i], 97 | name=name, 98 | genome=args.genome, 99 | max_variants=args.max_variants_per_source, 100 | only_passing=not args.include_failing_variants, 101 | metadata_column_prefix=prefix) 102 | 103 | if df.shape[0] == 0: 104 | logging.warn("No variants loaded from: %s" % args.variants[i]) 105 | else: 106 | for variant in df.variant: 107 | variant_to_sources[variant].append(name) 108 | dfs.append(df) 109 | 110 | if args.single_variant: 111 | variants = [] 112 | extra_args = {} 113 | if args.genome: 114 | extra_args = { 115 | 'ensembl': varcode.reference.infer_genome(args.genome) 116 | } 117 | for (locus_str, ref, alt) in args.single_variant: 118 | locus = Locus.parse(locus_str) 119 | variant = varcode.Variant( 120 | locus.contig, 121 | locus.inclusive_start, 122 | ref, 123 | alt, 124 | **extra_args) 125 | variants.append(variant) 126 | variant_to_sources[variant].append("commandline") 127 | dfs.append(variants_to_dataframe(variants)) 128 | 129 | df = dfs.pop(0) 130 | for other_df in dfs: 131 | df = pandas.merge( 132 | df, 133 | other_df, 134 | how='outer', 135 | on=["variant"] + STANDARD_DATAFRAME_COLUMNS) 136 | 137 | genomes = df["genome"].unique() 138 | if len(genomes) > 1: 139 | raise ValueError( 140 | "Mixing references is not supported. 
" 141 | "Reference genomes: %s" % (", ".join(genomes))) 142 | 143 | df["sources"] = [" ".join(variant_to_sources[v]) for v in df.variant] 144 | 145 | # Apply filters: 146 | if args.ref: 147 | df = df.ix[df.ref.isin(args.ref)] 148 | if args.alt: 149 | df = df.ix[df.alt.isin(args.alt)] 150 | loci = loci_util.load_from_args( 151 | util.remove_prefix_from_parsed_args(args, "variant")) 152 | if loci is not None: 153 | df = df.ix[[ 154 | loci.intersects(pileup_collection.to_locus(v)) 155 | for v in df.variant 156 | ]] 157 | return df 158 | 159 | def load_as_dataframe( 160 | filename, 161 | loader=None, 162 | name=None, 163 | genome=None, 164 | max_variants=None, 165 | only_passing=True, 166 | metadata_column_prefix=''): 167 | 168 | if name is None: 169 | name = filename 170 | 171 | if loader is None: 172 | if (filename.endswith(".vcf") or filename.endswith(".vcf.gz")): 173 | # Load from VCF 174 | def loader(filename): 175 | collection = varcode.load_vcf_fast( 176 | filename, 177 | genome=genome, 178 | max_variants=max_variants, 179 | only_passing=only_passing, 180 | allow_extended_nucleotides=True) 181 | return variants_to_dataframe( 182 | collection, 183 | collection.metadata, 184 | metadata_column_prefix=metadata_column_prefix) 185 | 186 | elif (filename.endswith(".csv") or filename.endswith(".csv.gz")): 187 | # Load from csv 188 | def loader(filename): 189 | # Ignores only_passing 190 | df = pandas.read_csv(filename, nrows=max_variants) 191 | for column in ['ref', 'alt']: 192 | df[column] = df[column].fillna('') 193 | df["variant"] = [ 194 | dataframe_row_to_variant(row) for (i, row) in df.iterrows() 195 | ] 196 | return df 197 | else: 198 | raise ValueError( 199 | "Unsupported input file extension for variants: %s" % filename) 200 | 201 | df = loader(filename) 202 | 203 | if 'genome' not in df: 204 | df["genome"] = genome 205 | 206 | df["variant"] = [ 207 | dataframe_row_to_variant(row) for (i, row) in df.iterrows() 208 | ] 209 | return df 210 | 211 | def variants_to_dataframe( 212 | variants, metadata=None, metadata_column_prefix=""): 213 | def record(variant): 214 | d = { 215 | 'variant': variant, 216 | 'genome': str(variant.reference_name), 217 | 'contig': variant.contig, 218 | 'interbase_start': variant.start - 1, 219 | 'interbase_end': variant.end, 220 | 'ref': variant.ref, 221 | 'alt': variant.alt, 222 | } 223 | if metadata: 224 | for (name, value) in metadata.get(variant, {}).items(): 225 | if name == 'info': 226 | for (info_col, value) in value.items(): 227 | column = '%sinfo:%s' % ( 228 | metadata_column_prefix, info_col) 229 | d[column] = value 230 | else: 231 | d["%s%s" % (metadata_column_prefix, name.lower())] = value 232 | return d 233 | 234 | df = pandas.DataFrame.from_records([record(v) for v in variants]) 235 | column_indices = dict( 236 | (column, i) for (i, column) in enumerate(STANDARD_DATAFRAME_COLUMNS)) 237 | columns = sorted(df.columns, key=lambda col: column_indices.get(col, 100)) 238 | return df[columns] 239 | 240 | def dataframe_row_to_variant(row): 241 | return varcode.Variant( 242 | ensembl=row.genome, 243 | contig=row.contig, 244 | start=row.interbase_start + 1, 245 | ref=row.ref, 246 | alt=row.alt, 247 | allow_extended_nucleotides=True) 248 | 249 | def dataframe_to_variants(df): 250 | for column in STANDARD_DATAFRAME_COLUMNS: 251 | if column not in df: 252 | raise ValueError("Missing column: %s" % column) 253 | 254 | extra_columns = [ 255 | c for c in df.columns if c not in STANDARD_DATAFRAME_COLUMNS 256 | ] 257 | metadata = collections.OrderedDict() 258 | for 
(i, row) in df.iterrows(): 259 | variant = dataframe_row_to_variant(row) 260 | # We ignore the interbase_end field. 261 | metadata[variant] = dict((c, row[c]) for c in extra_columns) 262 | 263 | return varcode.VariantCollection(metadata.keys(), metadata=metadata) 264 | 265 | def load_csv(filename, genome=None): 266 | # Genome is ignored for now. 267 | df = pandas.read_csv(filename) 268 | return dataframe_to_variants(df) 269 | -------------------------------------------------------------------------------- /varlens/version.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | __version__ = "0.2.0" 14 | 15 | version_string = f"v{__version__}" 16 | 17 | def print_version(): 18 | print(version_string) 19 | 20 | def print_name_and_version(): 21 | print(f"Varlens {version_string}") 22 | 23 | if __name__ == "__main__": 24 | print_version() 25 | 26 | 27 | --------------------------------------------------------------------------------