├── src
│   ├── mavis
│   │   ├── py.typed
│   │   ├── bam
│   │   │   └── __init__.py
│   │   ├── annotate
│   │   │   ├── __init__.py
│   │   │   └── constants.py
│   │   ├── illustrate
│   │   │   └── __init__.py
│   │   ├── pairing
│   │   │   ├── __init__.py
│   │   │   └── constants.py
│   │   ├── summary
│   │   │   ├── __init__.py
│   │   │   └── constants.py
│   │   ├── validate
│   │   │   ├── __init__.py
│   │   │   └── constants.py
│   │   ├── cluster
│   │   │   └── __init__.py
│   │   ├── __init__.py
│   │   ├── types.py
│   │   ├── error.py
│   │   ├── convert
│   │   │   ├── starfusion.py
│   │   │   ├── straglr.py
│   │   │   ├── cnvnator.py
│   │   │   ├── breakdancer.py
│   │   │   ├── arriba.py
│   │   │   ├── chimerascan.py
│   │   │   ├── constants.py
│   │   │   └── transabyss.py
│   │   ├── overlay.py
│   │   └── config.py
│   └── tools
│       ├── __init__.py
│       ├── get_hg38_reference_files.sh
│       ├── get_hg19_reference_files.sh
│       └── find_repeats.py
├── tests
│   ├── __init__.py
│   ├── snakemake
│   │   └── __init__.py
│   ├── test_mavis
│   │   ├── __init__.py
│   │   ├── bam
│   │   │   └── __init__.py
│   │   ├── annotate
│   │   │   ├── __init__.py
│   │   │   ├── test_annotate_fileio2.py
│   │   │   └── test_annotate_fileio.py
│   │   ├── cluster
│   │   │   ├── __init__.py
│   │   │   └── test_cluster.py
│   │   ├── convert
│   │   │   ├── __init__.py
│   │   │   └── test_tools_vcf.py
│   │   ├── pairing
│   │   │   └── __init__.py
│   │   ├── summary
│   │   │   └── __init__.py
│   │   ├── validate
│   │   │   ├── __init__.py
│   │   │   └── test_validate.py
│   │   ├── illustrate
│   │   │   ├── __init__.py
│   │   │   └── test_illustrate.py
│   │   ├── test_constants.py
│   │   ├── test_blat.py
│   │   └── test_help.py
│   ├── test_tools
│   │   ├── __init__.py
│   │   ├── test_convert_dgv.py
│   │   ├── data
│   │   │   ├── ensembl69_hg19_annotations.kras.tab
│   │   │   └── K02718.1.gff3
│   │   ├── test_convert_annotations_format.py
│   │   └── test_ref_alt_count.py
│   ├── data
│   │   ├── mock_masking.tab
│   │   ├── mock_reference_genome.fa.amb
│   │   ├── mock_dgv_annotation_malformed.tab
│   │   ├── pindel_events.vcf.gz
│   │   ├── mock_reference_genome.2bit
│   │   ├── mock_reference_genome.fa.sa
│   │   ├── mock_reference_genome.fa.bwt
│   │   ├── mock_reference_genome.fa.pac
│   │   ├── mock_reads_for_events.sorted.bam
│   │   ├── mini_mock_reads_for_events.sorted.bam
│   │   ├── mock_reads_for_events.sorted.bam.bai
│   │   ├── mock_trans_reads_for_events.sorted.bam
│   │   ├── mini_mock_reads_for_events.sorted.bam.bai
│   │   ├── mock_trans_reads_for_events.sorted.bam.bai
│   │   ├── mock_dgv_annotation.tab
│   │   ├── reference_from_env.cfg
│   │   ├── mini_mock_sv_events.svmerge.tsv
│   │   ├── pairing_reference_annotations_file.tab
│   │   ├── clustering_input.tab
│   │   ├── bad_input_file.cfg
│   │   ├── straglr.bed
│   │   ├── cnvnator.tab
│   │   ├── mock_trans_sv_events.tsv
│   │   ├── mock_pairing_input.tab
│   │   ├── mock_reference_genome.fa.ann
│   │   ├── mock_reference_annotations.json
│   │   ├── breakdancer_output.txt
│   │   ├── bwa_pipeline_config.cfg
│   │   ├── missing_reference.cfg
│   │   ├── clean_pipeline_config.cfg
│   │   ├── no_opt_pipeline.cfg
│   │   ├── Library-clusterset-N.validated.tsv
│   │   ├── pipeline_config.cfg
│   │   ├── mock_reference_annotations.full.json
│   │   ├── transabyss_indels_output.tab
│   │   ├── mock_dgv_annotation_mavis.tab
│   │   ├── mock_sv_events.tsv
│   │   ├── build.cfg
│   │   ├── mock_reference_annotations2.json
│   │   └── transabyss_events.tab
│   ├── setup_subprocess_cov.py
│   ├── util.py
│   ├── mini-tutorial.annotate_only.config.json
│   ├── mini-tutorial.config.json
│   └── full-tutorial.config.json
├── requirements.txt
├── docs
│   ├── index.md
│   ├── extra.css
│   ├── background
│   │   ├── .pages
│   │   └── citations.md
│   ├── tutorials
│   │   ├── .pages
│   │   ├── mini.md
│   │   └── annotation.md
│   ├── inputs
│   │   ├── .pages
│   │   ├── non_python_dependencies.md
│   │   └── standard.md
│   ├── images
│   │   ├── icon.png
│   │   ├── ENSG00000139687_RB1_overlay.png
│   │   ├── snakemake.cluster.full-tutorial.png
│   │   ├── snakemake.cluster.mini-tutorial.png
│   │   ├── snakemake.validate.mini-tutorial.png
│   │   ├── colo829_tumour_annotation_resource_req.png
│   │   ├── colo829_tumour_validation_resource_req.png
│   │   ├── get_app-24px.svg
│   │   └── Fusion-ext.gpl
│   ├── outputs
│   │   ├── index.md
│   │   └── illustrations.md
│   ├── package
│   │   └── mavis
│   │       ├── summary
│   │       │   └── index.md
│   │       ├── pairing
│   │       │   └── index.md
│   │       ├── cluster
│   │       │   └── index.md
│   │       ├── annotate
│   │       │   └── index.md
│   │       └── validate
│   │           └── index.md
│   ├── configuration
│   │   ├── pipeline.md
│   │   ├── performance.md
│   │   └── general.md
│   ├── migrating.md
│   ├── development.md
│   ├── hooks.py
│   └── install.md
├── pyproject.toml
├── codecov.yml
├── MANIFEST.in
├── .coveragerc
├── .readthedocs.yml
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── feature_request.md
│   │   └── bug_report.md
│   ├── workflows
│   │   ├── publish.yml
│   │   ├── quick-tests.yml
│   │   └── build.yml
│   └── CONTRIBUTING.md
├── .gitignore
├── mkdocs.yml
├── setup.py
├── env
│   ├── example.sh
│   └── generate_ensembl79_annotations.sh
├── Dockerfile
├── setup.cfg
└── README.md
/src/mavis/py.typed:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | .
2 |
--------------------------------------------------------------------------------
/src/tools/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/mavis/bam/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/snakemake/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_mavis/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_tools/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/mavis/annotate/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/mavis/illustrate/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/mavis/pairing/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/mavis/summary/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/mavis/validate/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_mavis/bam/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | {!./../README.md!}
2 |
--------------------------------------------------------------------------------
/tests/test_mavis/annotate/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_mavis/cluster/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_mavis/convert/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_mavis/pairing/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_mavis/summary/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_mavis/validate/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/test_mavis/illustrate/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/data/mock_masking.tab:
--------------------------------------------------------------------------------
1 | chr start end name
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | build-backend = "setuptools.build_meta"
2 |
--------------------------------------------------------------------------------
/tests/data/mock_reference_genome.fa.amb:
--------------------------------------------------------------------------------
1 | 1054073 24 0
2 |
--------------------------------------------------------------------------------
/docs/extra.css:
--------------------------------------------------------------------------------
1 | td + td > a {
2 | display: flex;
3 | }
4 |
--------------------------------------------------------------------------------
/docs/background/.pages:
--------------------------------------------------------------------------------
1 | nav:
2 | - theory.md
3 | - citations.md
4 |
--------------------------------------------------------------------------------
/docs/tutorials/.pages:
--------------------------------------------------------------------------------
1 | nav:
2 | - mini.md
3 | - full.md
4 | - ...
5 |
--------------------------------------------------------------------------------
/src/mavis/validate/constants.py:
--------------------------------------------------------------------------------
1 | PASS_FILENAME = 'validation-passed.tab'
2 |
--------------------------------------------------------------------------------
/docs/inputs/.pages:
--------------------------------------------------------------------------------
1 | nav:
2 | - reference.md
3 | - standard.md
4 | - ...
5 |
--------------------------------------------------------------------------------
/docs/images/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/icon.png
--------------------------------------------------------------------------------
/tests/data/mock_dgv_annotation_malformed.tab:
--------------------------------------------------------------------------------
1 | chromosome beginning ending unknown
2 |
--------------------------------------------------------------------------------
/tests/data/pindel_events.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/pindel_events.vcf.gz
--------------------------------------------------------------------------------
/tests/data/mock_reference_genome.2bit:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reference_genome.2bit
--------------------------------------------------------------------------------
/tests/data/mock_reference_genome.fa.sa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reference_genome.fa.sa
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | coverage:
2 | status:
3 | project:
4 | default:
5 | target: 80%
6 | threshold: 1%
7 |
--------------------------------------------------------------------------------
/src/mavis/cluster/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ['merge_breakpoint_pairs']
2 |
3 |
4 | from .cluster import merge_breakpoint_pairs
5 |
--------------------------------------------------------------------------------
/tests/data/mock_reference_genome.fa.bwt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reference_genome.fa.bwt
--------------------------------------------------------------------------------
/tests/data/mock_reference_genome.fa.pac:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reference_genome.fa.pac
--------------------------------------------------------------------------------
/docs/images/ENSG00000139687_RB1_overlay.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/ENSG00000139687_RB1_overlay.png
--------------------------------------------------------------------------------
/tests/data/mock_reads_for_events.sorted.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reads_for_events.sorted.bam
--------------------------------------------------------------------------------
/docs/images/snakemake.cluster.full-tutorial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/snakemake.cluster.full-tutorial.png
--------------------------------------------------------------------------------
/docs/images/snakemake.cluster.mini-tutorial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/snakemake.cluster.mini-tutorial.png
--------------------------------------------------------------------------------
/docs/images/snakemake.validate.mini-tutorial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/snakemake.validate.mini-tutorial.png
--------------------------------------------------------------------------------
/tests/data/mini_mock_reads_for_events.sorted.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mini_mock_reads_for_events.sorted.bam
--------------------------------------------------------------------------------
/tests/data/mock_reads_for_events.sorted.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reads_for_events.sorted.bam.bai
--------------------------------------------------------------------------------
/tests/data/mock_trans_reads_for_events.sorted.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_trans_reads_for_events.sorted.bam
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include src *.py *.json
2 | include src/mavis/py.typed
3 | include README.md
4 | include LICENSE
5 | prune docs
6 | prune tests
7 |
--------------------------------------------------------------------------------
/tests/data/mini_mock_reads_for_events.sorted.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mini_mock_reads_for_events.sorted.bam.bai
--------------------------------------------------------------------------------
/tests/data/mock_trans_reads_for_events.sorted.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_trans_reads_for_events.sorted.bam.bai
--------------------------------------------------------------------------------
/docs/images/colo829_tumour_annotation_resource_req.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/colo829_tumour_annotation_resource_req.png
--------------------------------------------------------------------------------
/docs/images/colo829_tumour_validation_resource_req.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/colo829_tumour_validation_resource_req.png
--------------------------------------------------------------------------------
/docs/outputs/index.md:
--------------------------------------------------------------------------------
1 | # Tab Delimited Files
2 |
3 | Column names of the output files are documented in the [column names](../../outputs/columns)
4 | section.
5 |
--------------------------------------------------------------------------------
/src/mavis/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | holds submodules related to structural variants
3 | """
4 | import pkg_resources
5 |
6 | __version__ = pkg_resources.require('mavis')[0].version
7 |
--------------------------------------------------------------------------------
/docs/images/get_app-24px.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/get_app-24px.svg
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | parallel = True
3 | concurrency = multiprocessing
4 |
5 | [html]
6 | directory = coverage
7 | title = mavis coverage report
8 |
9 | [report]
10 | exclude_lines =
11 | pragma: no cover
12 | if TYPE_CHECKING:
13 |
--------------------------------------------------------------------------------
/docs/images/Fusion-ext.gpl:
--------------------------------------------------------------------------------
1 | GIMP Palette
2 | Name: Fusions-ext
3 | #
4 | 0 0 0
5 | 255 255 255
6 | 199 217 143
7 | 82 103 43
8 | 133 152 97
9 | 42 67 36
10 | 184 211 186
11 | 76 150 119
12 | 123 221 193
13 | 50 85 86
14 | 125 195 216
15 | 101 126 145
16 | 81 141 197
17 | 38 40 61
18 | 186 178 226
19 | 58 52 105
20 | 124 111 170
21 |
--------------------------------------------------------------------------------
/tests/data/mock_dgv_annotation.tab:
--------------------------------------------------------------------------------
1 | chr start end name
2 | 1 1 2300000 nsv482937
3 | 1 10001 22118 dgv1n82
4 | 1 10001 22120 rgv2n98
5 | 1 10001 22221 rgv2n99
6 | 1 10001 127330 nsv7879
7 | 1 10191 10281 nsv958854
8 | 1 10377 177417 nsv428112
9 | 1 10377 1018704 esv2758911
10 | 1 10499 177368 esv27265
11 | 1 11099 47000 nsv1147468
12 | 1 11100 29200 dgv1n106
13 |
--------------------------------------------------------------------------------
/tests/test_mavis/annotate/test_annotate_fileio2.py:
--------------------------------------------------------------------------------
1 | from mavis.annotate.file_io import load_annotations
2 |
3 | from ...util import get_data
4 |
5 | JSON = get_data('annotations_subsample.json')
6 |
7 |
8 | class TestAnnotationLoading:
9 | def test_load_json(self):
10 | result = load_annotations(JSON)
11 | assert len(result.keys()) == 12
12 |
--------------------------------------------------------------------------------
/tests/setup_subprocess_cov.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | for p in sys.path:
5 | if p.endswith('site-packages'):
6 | pth_file = os.path.join(p, 'subprocess-coverage.pth')
7 | print('writing path file:', pth_file)
8 | with open(pth_file, 'w') as fh:
9 | fh.write('import coverage\n\ncoverage.process_startup()\n')
10 | break
11 |
--------------------------------------------------------------------------------
/src/mavis/types.py:
--------------------------------------------------------------------------------
1 | """
2 | Helper classes for type hints
3 | """
4 |
5 | from typing import TYPE_CHECKING, Dict, List, Tuple
6 |
7 | from Bio.SeqRecord import SeqRecord
8 |
9 | if TYPE_CHECKING:
10 | from .annotate.genomic import Gene
11 |
12 | ReferenceGenome = Dict[str, SeqRecord]
13 | ReferenceAnnotations = Dict[str, List['Gene']]
14 | CigarTuples = List[Tuple[int, int]]
15 |
--------------------------------------------------------------------------------
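For orientation, a minimal sketch of the `ReferenceGenome` shape declared above: a plain dictionary from chromosome name to Biopython `SeqRecord`. The chromosome name and sequence below are invented for illustration.

```python
# Minimal sketch (illustrative values only): ReferenceGenome maps a chromosome
# name to its Biopython SeqRecord, as declared in src/mavis/types.py above.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from mavis.types import ReferenceGenome

reference_genome: ReferenceGenome = {
    'chr1': SeqRecord(Seq('ACGTACGT'), id='chr1'),
}
print(len(reference_genome['chr1'].seq))  # 8
```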
/tests/data/reference_from_env.cfg:
--------------------------------------------------------------------------------
1 | [reference]
2 |
3 | [mock-A36971]
4 | read_length = 150
5 | median_fragment_size = 400
6 | stdev_fragment_size = 97
7 | bam_file = tests/data/mock_reads_for_events.sorted.bam
8 | protocol = genome
9 | inputs = tests/data/mock_sv_events.tsv
10 | strand_specific = False
11 | disease_status=diseased
12 |
13 | [cluster]
14 | uninformative_filter = True
15 | limit_to_chr = None
16 |
--------------------------------------------------------------------------------
/src/mavis/error.py:
--------------------------------------------------------------------------------
1 | class NotSpecifiedError(Exception):
2 | """
3 | raised when information is required for a function but has not been given
4 |
5 | for example if strand was required but had been set to STRAND.NS then this
6 | error would be raised
7 | """
8 |
9 | pass
10 |
11 |
12 | class DrawingFitError(Exception):
13 | pass
14 |
15 |
16 | class InvalidRearrangement(Exception):
17 | pass
18 |
--------------------------------------------------------------------------------
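The docstring above describes when `NotSpecifiedError` should be raised. The guard below is a hedged sketch of that pattern; `require_strand` is a hypothetical helper, not part of the package.

```python
# Hypothetical guard illustrating the NotSpecifiedError docstring above:
# reject a strand that was left as STRAND.NS when a specific strand is required.
from mavis.constants import STRAND
from mavis.error import NotSpecifiedError


def require_strand(strand: str) -> str:
    if strand == STRAND.NS:
        raise NotSpecifiedError('strand is required but was not specified (STRAND.NS)')
    return strand
```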
/tests/data/mini_mock_sv_events.svmerge.tsv:
--------------------------------------------------------------------------------
1 | #start_chromosome start_position end_chromosome end_position start_orientation end_orientation start_strand end_strand protocol tool_version libraries tool_evidence comments filters flanking_reads mapping_quality split_reads
2 | reference3 1114-1114 reference3 2187-2187 R R + - genome convert_ta.py_v0.0.1 A36971
3 | reference10 519-519 reference19 965-965 R L + + genome convert_ta.py_v0.0.1 A36971
4 |
--------------------------------------------------------------------------------
/docs/package/mavis/summary/index.md:
--------------------------------------------------------------------------------
1 | # Sub-package Documentation
2 |
3 | This is the package responsible for summarizing the calls between libraries. In many cases
4 | this will be where somatic vs germline is determined or genomic only vs expressed.
5 |
6 | ## Output Files
7 |
8 | | expected name/suffix | file type/format | content |
9 | | ----------------------- | ---------------- | ------- |
10 | | ``mavis_summary_*.tab`` | text/tabbed | ? |
11 |
--------------------------------------------------------------------------------
/tests/data/pairing_reference_annotations_file.tab:
--------------------------------------------------------------------------------
1 | ## input file used to map hugo gene names: compiled_gene_drug_pathway.v1_2_5.tsv
2 | ## input file for picking best transcript: ens69_best_transcript.txt
3 | ## Ensembl Api version 69
4 | ## generated at: Thu Aug 4 16:38:01 2016
5 | ensembl_gene_id hugo_names chr strand gene_start gene_end best_ensembl_transcript_id ensembl_transcript_id refseq_equivalents transcript_genomic_start transcript_genomic_end cdna_coding_start cdna_coding_end genomic_exon_ranges AA_domain_ranges
6 |
--------------------------------------------------------------------------------
/src/mavis/summary/constants.py:
--------------------------------------------------------------------------------
1 | from ..constants import MavisNamespace
2 |
3 | HOMOPOLYMER_MIN_LENGTH = 3
4 |
5 |
6 | class PAIRING_STATE(MavisNamespace):
7 | EXP = 'expressed'
8 | NO_EXP = 'not expressed'
9 | SOMATIC = 'somatic'
10 | GERMLINE = 'germline'
11 | CO_EXP = 'co-expressed'
12 | GERMLINE_EXP = 'germline expression'
13 | SOMATIC_EXP = 'somatic expression'
14 | MATCH = 'matched'
15 | NO_MATCH = 'not matched'
16 | GENOMIC = 'genomic support'
17 | NO_GENOMIC = 'no genomic support'
18 |
--------------------------------------------------------------------------------
/src/mavis/pairing/constants.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from mavis_config import DEFAULTS
4 |
5 | from ..constants import CALL_METHOD
6 |
7 | PAIRING_DISTANCES: Dict[str, int] = {
8 | CALL_METHOD.FLANK: DEFAULTS['pairing.flanking_call_distance'],
9 | CALL_METHOD.SPAN: DEFAULTS['pairing.spanning_call_distance'],
10 | CALL_METHOD.SPLIT: DEFAULTS['pairing.split_call_distance'],
11 | CALL_METHOD.CONTIG: DEFAULTS['pairing.contig_call_distance'],
12 | CALL_METHOD.INPUT: DEFAULTS['pairing.input_call_distance'],
13 | }
14 |
--------------------------------------------------------------------------------
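As a usage sketch, the distance tolerance applied when comparing two calls can be looked up by call method from the mapping above; deciding which call method governs a given comparison happens elsewhere in the pairing code.

```python
# Look up the configured pairing distance for calls made from flanking read evidence.
# This only illustrates the PAIRING_DISTANCES mapping defined above.
from mavis.constants import CALL_METHOD
from mavis.pairing.constants import PAIRING_DISTANCES

tolerance = PAIRING_DISTANCES[CALL_METHOD.FLANK]
print('flanking call distance:', tolerance)
```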
/tests/data/clustering_input.tab:
--------------------------------------------------------------------------------
1 | tracking_id event_type break1_chromosome break1_position_start break1_position_end break1_orientation break1_strand break1_seq break2_chromosome break2_position_start break2_position_end break2_orientation break2_strand break2_seq opposing_strands stranded tools protocol
2 | manta-MantaDEL:175574:0:0:0:0:0 deletion 15 67333523 67333619 L ? None 15 67333581 67333581 R ? None False False manta genome
3 | strelka-TyeSomZhWTRakEu6ZJ7up6 deletion 15 67333623 67333623 L ? None 15 67333625 67333625 R ? None False False strelka genome
4 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Build documentation with MkDocs
9 | mkdocs:
10 | configuration: mkdocs.yml
11 | fail_on_warning: false
12 |
13 | # Optionally build your docs in additional formats such as PDF and ePub
14 | formats: all
15 |
16 | # Optionally set the version of Python and requirements required to build your docs
17 | python:
18 | version: 3.7
19 | install:
20 | - method: pip
21 | path: .
22 | extra_requirements:
23 | - doc
24 |
--------------------------------------------------------------------------------
/tests/data/bad_input_file.cfg:
--------------------------------------------------------------------------------
1 | [reference]
2 | template_metadata = tests/data/cytoBand.txt
3 | annotations = tests/data/mock_annotations.json
4 | masking = tests/data/mock_masking.tab
5 | reference_genome = tests/data/mock_reference_genome.fa
6 | aligner_reference = tests/data/mock_reference_genome.2bit
7 | dgv_annotation = tests/data/mock_dgv_annotation.txt
8 |
9 | [cluster]
10 | uninformative_filter = True
11 | limit_to_chr = None
12 |
13 | [mock-A36971]
14 | read_length = 150
15 | median_fragment_size = 400
16 | stdev_fragment_size = 97
17 | bam_file = tests/data/mock_reads_for_events.sorted.bam
18 | protocol = genome
19 | inputs = mock_converted.tab
20 | strand_specific = False
21 | disease_status=diseased
22 |
23 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # python generated files
2 | /.eggs
3 | /coverage
4 | /venv*
5 | /.coverage
6 | *.pyc
7 | *__pycache__
8 | build-docs
9 | *.egg-info*
10 | build
11 | *coverage*
12 | dist
13 | junit
14 | .pytest*
15 | .tox
16 | *eggs/
17 | .mypy_cache
18 | .snakemake
19 | .venv*
20 |
21 | # aligners
22 | blat
23 | bwa
24 | *.fai
25 |
26 | # user editing generated files
27 | *.~lock*
28 | .vscode
29 | *.nfs*
30 | junit
31 |
32 | # generated documentation
33 | /docs/package/mavis/*.md
34 | /docs/package/mavis/*/*.md
35 | # don't ignore subpackage summary files
36 | !/docs/package/mavis/*/index.md
37 | docs/configuration/settings.md
38 |
39 | .snakemake
40 | output_dir*
41 | bin
42 | dag*
43 | tutorial_data
44 | reference_inputs
45 | tmp
46 |
--------------------------------------------------------------------------------
/tests/data/straglr.bed:
--------------------------------------------------------------------------------
1 | #chrom start end repeat_unit allele1:size allele1:copy_number allele1:support allele2:size allele2:copy_number allele2:support
2 | chr11 776686 778078 CT 100.0 150.0 10 100.0 100.0 1
3 | chr10 3079216 3079421 AGAGGTCACCACCCCTTCCCAACAATCCAGTAACAATCC 100.0 150.0 10 100.0 100.0 1
4 | chr9 2080637 2081030 CTCCTTCCCTCCGCCCCCACCTCGGTCCCTGT 100.0 150.0 10 100.0 100.0 1
5 | chrX 244719 245293 CCCCGGGAACCGCCT 100.0 150.0 10 - - -
6 | chr7 284096 284233 GGT 100.0 150.0 10 - - -
7 | chr8 288173 290242 CCCTGCTCCGT 100.0 150.0 10 100.0 100.0 1
8 | chr3 2382228 2382908 CCGTGGGGGAGGCTGAGGCTATGGGGACT 100.0 100.0 10 - - -
9 | chr2 2427285 2427528 CCTCC 100.0 150.0 10 - - -
10 | chr2 2427953 2428216 GGAGG 100.0 150.0 10 100.0 100.0 1
11 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: MAVIS
2 |
3 | theme:
4 | name: material
5 | repo_url: https://github.com/bcgsc/mavis
6 | repo_name: github
7 | site_dir: build-docs
8 | markdown_extensions:
9 | - codehilite
10 | - admonition
11 | - pymdownx.inlinehilite
12 | - markdown_include.include:
13 | base_path: docs
14 | extra_css: [extra.css]
15 | nav:
16 | - index.md
17 | - install.md
18 | - migrating.md
19 | - ... | background/**.md
20 | - ... | inputs/**.md
21 | - ... | outputs/**.md
22 | - ... | configuration/**.md
23 | - ... | tutorials/**.md
24 | - development.md
25 | - ...
26 | - glossary.md
27 |
28 | plugins:
29 | - search
30 | - awesome-pages
31 | - mkdocs-simple-hooks:
32 | hooks:
33 | on_pre_build: "docs.hooks:build_package_docs"
34 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 |
4 | from setuptools import setup
5 |
6 |
7 | def check_nonpython_dependencies():
8 | """
9 | check that the non-python dependencies have been installed.
10 |
11 | Raises:
12 | OSError: A dependency is not installed
13 | """
14 | import shutil
15 |
16 | aligner = (
17 | os.environ['MAVIS_ALIGNER']
18 | if 'MAVIS_ALIGNER' in os.environ and os.environ['MAVIS_ALIGNER']
19 | else 'blat'
20 | )
21 | aligner = re.split(r'\s+', aligner)[0]
22 | pth = shutil.which(aligner)
23 | if not pth:
24 | print('WARNING: Aligner is required. Missing executable: {}'.format(aligner))
25 | else:
26 | print('Found: aligner at', pth)
27 |
28 |
29 | setup()
30 | check_nonpython_dependencies()
31 |
--------------------------------------------------------------------------------
/tests/data/cnvnator.tab:
--------------------------------------------------------------------------------
1 | deletion 1:1-10000 10000 0 1.59373e-11 0 1.99216e-11 0 -1
2 | deletion 1:38001-39000 1000 0.467116 544.034 0.0442397 1 1 1
3 | deletion 1:51201-74200 23000 0.648113 6.92924e-12 2.52664e+09 7.58917e-12 2.55487e+09 1
4 | deletion 1:74601-94200 19600 0.254531 8.13125e-12 2.52848e-32 9.05526e-12 5.01031e-78 1
5 | deletion 1:106001-106800 800 0.270927 4415.44 4.32994e-06 1 1 1
6 | duplication 1:107401-111200 3800 1.67572 0.00897513 2.75843e+07 20.687 5.49288e-07 1
7 | duplication 1:137201-139600 2400 1.54927 0.00127366 8.4566e-14 182635 16668.9 1
8 | deletion 1:149801-150800 1000 0.504485 79.6041 0.00136224 1 1 1
9 | deletion 1:151201-155800 4600 0.582473 0.00108651 9.95448e+06 35.7819 1.1684e+08 1
10 | deletion 1:176201-228000 51800 0.0193339 3.07669e-12 1.12835e-37 3.20025e-12 0 1
11 |
--------------------------------------------------------------------------------
/tests/test_mavis/annotate/test_annotate_fileio.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import pytest
4 |
5 | from mavis.annotate.file_io import load_annotations
6 |
7 |
8 | @pytest.mark.parametrize(
9 | 'annotations,error_message_include',
10 | [
11 | [{'genes': []}, "schema['properties']['genes']"],
12 | [
13 | {'genes': [{'start': '1'}]},
14 | "schema['properties']['genes']['items']['properties']['start']",
15 | ],
16 | ],
17 | )
18 | def test_min_genes_error(annotations, error_message_include, tmp_path):
19 | filename = tmp_path / "annotations.json"
20 | filename.write_text(json.dumps(annotations))
21 | with pytest.raises(AssertionError) as exc:
22 | load_annotations(str(filename))
23 | assert error_message_include in str(exc.value)
24 |
--------------------------------------------------------------------------------
/src/mavis/annotate/constants.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from ..constants import MavisNamespace
4 |
5 | PASS_FILENAME = 'annotations.tab'
6 |
7 |
8 | class SPLICE_SITE_TYPE(MavisNamespace):
9 | DONOR: int = 3
10 | ACCEPTOR: int = 5
11 |
12 |
13 | SPLICE_SITE_RADIUS = 2
14 | """int: number of bases away from an exon boundary considered to be part of the splice site such that if it were altered
15 | the splice site would be considered to be abrogated.
16 | """
17 |
18 | # splice site sequences based on: http://www.nature.com/nrg/journal/v17/n7/fig_tab/nrg.2016.46_F5.html?foxtrotcallback=true
19 |
20 | DONOR_SEQ = [
21 | re.compile('(AG)(GT[AG]AG)'),
22 | re.compile('([CA]AG)(GTA)'),
23 | ]
24 |
25 | ACCEPTOR_SEQ = [
26 | re.compile('([TC]{8}[ATCG]CAG)([GA][ATCG])'),
27 | re.compile('([TC]{9}TAG)([GA][ATCG])'),
28 | re.compile('([TC]{8}[ATCG]AAG)([GA][ATCG])'),
29 | ]
30 |
--------------------------------------------------------------------------------
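A quick check of the donor-site patterns defined above; the input sequence is a made-up canonical donor context used only for illustration.

```python
# Illustrative only: the first DONOR_SEQ pattern splits a canonical donor site
# into its exonic ('AG') and intronic ('GT[AG]AG') parts.
from mavis.annotate.constants import DONOR_SEQ

match = DONOR_SEQ[0].match('AGGTAAG')
assert match is not None
print(match.group(1), match.group(2))  # AG GTAAG
```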
/tests/data/mock_trans_sv_events.tsv:
--------------------------------------------------------------------------------
1 | ## False reference9 2000 2000 reference9 2001 2001 L R + + insertion genome convert_ta.py_v0.0.1 mock-A36971 9:66466004
2 | stranded break1_chromosome break1_position_start break1_position_end break2_chromosome break2_position_start break2_position_end break1_orientation break2_orientation break1_strand break2_strand event_type protocol tools library comment
3 | False gene3 27175 27175 gene3 27176 27176 R L + + duplication transcriptome convert_ta.py_v0.0.1 mock-A47933 1:207249992
4 | True gene1 34090 34090 gene5 608 608 R R - + inverted translocation transcriptome convert_ta.py_v0.0.1 mock-A47933 15:40854971|7:26241389
5 | False gene2 22979 22979 gene2 23783 23783 R L + + duplication transcriptome convert_ta.py_v0.0.1 mock-A47933 15:41623873|15:41625248#this one is pretty low qual
6 | False gene6 70057 77430 gene6 89472 94742 L R + + deletion transcriptome convert_ta.py_v0.0.1 mock-A47933 approx 10:89700299|10:89712341
7 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. run command '...'
16 | 2. See error ...
17 |
18 | **Expected behavior**
19 | A clear and concise description of what you expected to happen.
20 |
21 | **Input Data**
22 | If applicable, add the input data used when the bug was observed
23 |
24 | **Configuration**
25 | If applicable, include the mavis configuration file that was used to run the pipeline
26 |
27 | **Versions (please complete the following information):**
28 | - OS: [e.g. centos-07]
29 | - Python Version [e.g. 3.6.1]
30 | - MAVIS Version [e.g. 22]
31 | - Blat/BWA Version
32 |
33 | **Additional context**
34 | Add any other context about the problem here.
35 |
--------------------------------------------------------------------------------
/env/example.sh:
--------------------------------------------------------------------------------
1 | export MAVIS_TEMPLATE_METADATA='/projects/trans_scratch/software/mavis/reference_files/hg19_cytoBand.txt'
2 | export MAVIS_REFERENCE_GENOME='/projects/seqref/genomes/Homo_sapiens/GRCh37/1000genomes/bwa_ind/genome/GRCh37-lite.fa'
3 | export MAVIS_ANNOTATIONS='/projects/trans_scratch/software/mavis/reference_files/ensembl69_hg19_annotations.json'
4 | export MAVIS_MASKING='/projects/tumour_char/analysis_scripts/SVIA/delly/reference_data/GRCh37/human_nspan.hg19.excl.with_header.tsv'
5 | export MAVIS_ALIGNER_REFERENCE='/home/pubseq/genomes/Homo_sapiens/GRCh37/blat/hg19.2bit'
6 | export MAVIS_DGV_ANNOTATION='/projects/trans_scratch/software/mavis/reference_files/dgv_hg19_annotations.tab'
7 | export MAVIS_MAX_FILES=100
8 | export MAVIS_MIN_CLUSTERS_PER_FILE=30
9 | export PYTHONUNBUFFERED='True'
10 |
11 | #Add paths for samtools, blat and git
12 | export PATH=/projects/trans_scratch/transabyss/trans-ABySS/v1.4.10/bin/:/gsc/software/linux-x86_64-centos6/git-2.12.0/bin/:$PATH
13 |
--------------------------------------------------------------------------------
/tests/data/mock_pairing_input.tab:
--------------------------------------------------------------------------------
1 | library cluster_id validation_id annotation_id event_type transcript1 transcript2 fusion_cdna_coding_start fusion_cdna_coding_end fusion_sequence_fasta_id fusion_sequence_fasta_file break1_chromosome break1_position_start break1_position_end break1_orientation break1_strand break2_chromosome break2_position_start break2_position_end break2_orientation break2_strand opposing_strands stranded protocol break1_call_method break2_call_method untemplated_seq fusion_splicing_pattern
2 | genome1 1 1 1 deletion ENST00000367080 ENST00000367080 None None None None gene3 10008 10008 L + gene3 18900 18900 R + False True genome split reads split reads None None
3 | genome2 1 1 1 deletion ENST00000367080 ENST00000367080 None None None None gene3 10000 10000 L + gene3 18900 18900 R + False True genome split reads split reads None None
4 | transcriptome1 1 1 1 deletion ENST00000367080 ENST00000367080 None None None None gene3 5347 5347 L + gene3 19969 19969 R + False True transcriptome split reads split reads None None
5 |
--------------------------------------------------------------------------------
/tests/test_mavis/cluster/test_cluster.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pytest
4 |
5 | from mavis.cluster.cluster import merge_integer_intervals
6 | from mavis.interval import Interval
7 |
8 |
9 | class TestMergeIntegerIntervals:
10 | def test_varying_lengths(self):
11 | m = merge_integer_intervals((1, 2), (1, 9), (2, 10), weight_adjustment=0)
12 | assert m == Interval(1, 4)
13 |
14 | def test_same_length(self):
15 | m = merge_integer_intervals((1, 1), (10, 10))
16 | assert m == Interval(6)
17 |
18 | def test_empty_list_error(self):
19 | with pytest.raises(AttributeError):
20 | merge_integer_intervals()
21 |
22 | def test_identical_even_length(self):
23 | m = merge_integer_intervals((1, 2), (1, 2), (1, 2))
24 | assert m == Interval(1, 2)
25 |
26 | def test_identical_odd_length(self):
27 | m = merge_integer_intervals((1, 3), (1, 3), (1, 3))
28 | assert m == Interval(1, 3)
29 |
30 |
31 | if __name__ == '__main__':
32 | unittest.main()
33 |
--------------------------------------------------------------------------------
/src/tools/get_hg38_reference_files.sh:
--------------------------------------------------------------------------------
1 | set -euo pipefail
2 |
3 | echo "downloading the reference genome (no alt) file"
4 | wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
5 | gunzip GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
6 |
7 | echo "downloading the gene annotations file"
8 | wget http://www.bcgsc.ca/downloads/mavis/v3/ensembl79_hg38_annotations.v3.json.gz
9 | gunzip ensembl79_hg38_annotations.v3.json.gz
10 |
11 | echo "downloading the masking file"
12 | wget http://www.bcgsc.ca/downloads/mavis/GRCh38_masking.tab
13 |
14 | echo "downloading the dgv annotation file"
15 | wget http://www.bcgsc.ca/downloads/mavis/dgv_hg38_variants.tab
16 |
17 | echo "downloading the aligner reference file"
18 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit
19 |
20 | echo "downloading the template metadata file"
21 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz
22 | gunzip cytoBand.txt.gz
23 |
--------------------------------------------------------------------------------
/tests/data/mock_reference_genome.fa.ann:
--------------------------------------------------------------------------------
1 | 1054073 24 11
2 | 0 fake (null)
3 | 0 7450 0
4 | 0 reference2 (null)
5 | 7450 13648 0
6 | 0 reference4 (null)
7 | 21098 4000 0
8 | 0 reference3 (null)
9 | 25098 3711 0
10 | 0 reference7 (null)
11 | 28809 21000 0
12 | 0 reference10 (null)
13 | 49809 45109 0
14 | 0 reference19 (null)
15 | 94918 11786 0
16 | 0 reference20 (null)
17 | 106704 8000 0
18 | 0 referenceX (null)
19 | 114704 15760 0
20 | 0 reference11 (null)
21 | 130464 12000 0
22 | 0 reference12 (null)
23 | 142464 12000 0
24 | 0 reference1 (null)
25 | 154464 4000 0
26 | 0 reference9 (null)
27 | 158464 4000 0
28 | 0 reference16 (null)
29 | 162464 4000 0
30 | 0 reference17 (null)
31 | 166464 4000 0
32 | 0 gene1 (null)
33 | 170464 36375 0
34 | 0 gene2 (null)
35 | 206839 71783 0
36 | 0 gene3 (null)
37 | 278622 31569 0
38 | 0 gene4 (null)
39 | 310191 579898 0
40 | 0 gene5 (null)
41 | 890089 12195 0
42 | 0 gene6 (null)
43 | 902284 108818 0
44 | 0 fakereference9 (null)
45 | 1011102 14148 0
46 | 0 test_bam_long_ref
47 | 1025250 28322 0
48 | 0 11_86018001-86018500
49 | 1053572 501 0
50 |
--------------------------------------------------------------------------------
/tests/test_tools/test_convert_dgv.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from unittest.mock import patch
4 |
5 | import pytest
6 |
7 | from tools.convert_dgv import main as convert_dgv_main
8 |
9 |
10 | @pytest.mark.parametrize(
11 | "filename,expected_file",
12 | [
13 | ["dgv_test.tab", "dgv_test_expected.tab"],
14 | ],
15 | )
16 | def test_dgv_examples(tmp_path, filename, expected_file):
17 | data_dir = os.path.join(os.path.dirname(__file__), "data")
18 |
19 | output_path = str(tmp_path / "tmp_data.tab")
20 | args = [
21 | "python",
22 | "--input",
23 | os.path.join(data_dir, filename),
24 | "--output",
25 | output_path,
26 | ]
27 |
28 | with patch.object(convert_dgv_main, "main", create=True):
29 | with patch.object(sys, "argv", args):
30 | convert_dgv_main()
31 |
32 | with open(os.path.join(data_dir, expected_file), 'r') as fh:
33 | expected = fh.read().strip()
34 |
35 | with open(output_path, 'r') as fh:
36 | observed = fh.read().strip()
37 |
38 | assert expected == observed
39 |
--------------------------------------------------------------------------------
/env/generate_ensembl79_annotations.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # add the ensembl api modules to the path
4 | PATH=$(pwd):$PATH
5 | PERL5LIB=${PERL5LIB}:/home/creisle/applications/ensembl_79/bioperl-live
6 | PERL5LIB=${PERL5LIB}:/home/creisle/applications/ensembl_79/ensembl/modules
7 | PERL5LIB=${PERL5LIB}:/home/creisle/applications/ensembl_79/ensembl-compara/modules
8 | PERL5LIB=${PERL5LIB}:/home/creisle/applications/ensembl_79/ensembl-variation/modules
9 | PERL5LIB=${PERL5LIB}:/home/creisle/applications/ensembl_79/ensembl-funcgen/modules
10 | export PERL5LIB
11 |
12 | # default perl
13 | PATH=/projects/trans_scratch/software/perl/perl-5.20.3/bin:$PATH
14 |
15 | # required data files
16 | export HUGO_ENSEMBL_MAPPING=/projects/tumour_char/analysis_scripts/databases/processed_files/drug_target_tables/current_gene_drug_pathway.hg38.tsv
17 | export BEST_TRANSCRIPTS=/home/creisle/svn/ensembl_flatfiles/ens69_best_transcript.txt
18 |
19 | # connection information for the ensembl local server
20 | export ENSEMBL_HOST='ensembl02'
21 | export ENSEMBL_PASS='ensembl'
22 | export ENSEMBL_USER='ensembl'
23 | export ENSEMBL_PORT=3306
24 |
--------------------------------------------------------------------------------
/src/mavis/convert/starfusion.py:
--------------------------------------------------------------------------------
1 | from ..constants import ORIENT
2 |
3 |
4 | def convert_row(row):
5 | """
6 | transforms the starfusion output into the common format for expansion. Maps the input column
7 | names to column names that MAVIS can read
8 | """
9 | std_row = {}
10 | try:
11 | std_row['break1_chromosome'], b1_start, std_row['break1_strand'] = row[
12 | 'LeftBreakpoint'
13 | ].split(':')
14 | std_row['break2_chromosome'], b2_start, std_row['break2_strand'] = row[
15 | 'RightBreakpoint'
16 | ].split(':')
17 | except (ValueError, TypeError):
18 | raise AssertionError(
19 | 'Could not parse the breakpoint from the starfusion row: {}, {}'.format(
20 | row['LeftBreakpoint'], row['RightBreakpoint']
21 | )
22 | )
23 | std_row['break1_position_start'] = std_row['break1_position_end'] = b1_start
24 | std_row['break2_position_start'] = std_row['break2_position_end'] = b2_start
25 |
26 | std_row['break1_orientation'] = std_row['break2_orientation'] = ORIENT.NS
27 |
28 | return std_row
29 |
--------------------------------------------------------------------------------
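A minimal usage sketch of the mapping performed by `convert_row` above; the breakpoint strings are invented coordinates in STAR-Fusion's `chrom:position:strand` format.

```python
# Illustrative input row (made-up coordinates) in STAR-Fusion's breakpoint format.
from mavis.convert.starfusion import convert_row

row = {
    'LeftBreakpoint': 'chr7:55249011:+',
    'RightBreakpoint': 'chr5:1295250:-',
}
std_row = convert_row(row)
assert std_row['break1_chromosome'] == 'chr7'
assert std_row['break1_position_start'] == '55249011'  # positions are kept as strings here
assert std_row['break2_strand'] == '-'
```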
/tests/test_mavis/illustrate/test_illustrate.py:
--------------------------------------------------------------------------------
1 | from mavis.illustrate.util import generate_interval_mapping
2 | from mavis.interval import Interval
3 |
4 |
5 | class TestGenerateIntervalMapping:
6 | def test_single_bp_window(self):
7 | regions = [
8 | Interval(4222347, 4222347),
9 | Interval(4221673, 4221903),
10 | Interval(2792992, 4852494),
11 | ]
12 | target = 911.9921875
13 | ratio = 5
14 | min_width = 60
15 | buffer_ = None
16 | start = 2791992
17 | end = 4853494
18 | min_inter = 10
19 | mapping = generate_interval_mapping(
20 | regions, target, ratio, min_width, buffer_, start, end, min_inter
21 | )
22 | assert len(mapping.keys()) == 7
23 |
24 | def test_no_input_intervals(self):
25 | target = 911.9921875
26 | ratio = 5
27 | min_width = 60
28 | buffer_ = None
29 | start = 2791992
30 | end = 4853494
31 | min_inter = 10
32 | mapping = generate_interval_mapping(
33 | [], target, ratio, min_width, buffer_, start, end, min_inter
34 | )
35 | assert len(mapping.keys()) == 1
36 |
--------------------------------------------------------------------------------
/src/tools/get_hg19_reference_files.sh:
--------------------------------------------------------------------------------
1 | set -euo pipefail
2 |
3 | echo "downloading the reference genome file"
4 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz
5 | tar -xvzf chromFa.tar.gz
6 |
7 | # concatenate the chromosome fa files into a single file
8 | for fname in chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y}.fa
9 | do
10 | cat $fname >> hg19.fa
11 | done
12 |
13 | # Clean up the non-concatenated and alt chromosome files
14 | rm -f chr*.fa
15 | rm -f chromFa.tar.gz
16 |
17 | echo "downloading the gene annotations file"
18 | wget http://www.bcgsc.ca/downloads/mavis/v3/ensembl69_hg19_annotations.v3.json.gz
19 | gunzip ensembl69_hg19_annotations.v3.json.gz
20 |
21 | echo "downloading the masking file"
22 | wget http://www.bcgsc.ca/downloads/mavis/hg19_masking.tab
23 |
24 | echo "downloading the dgv annotation file"
25 | wget http://www.bcgsc.ca/downloads/mavis/dgv_hg19_variants.tab
26 |
27 | echo "downloading the aligner reference file"
28 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.2bit
29 |
30 | echo "downloading the template metadata file"
31 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz
32 | gunzip cytoBand.txt.gz
33 |
--------------------------------------------------------------------------------
/src/mavis/convert/straglr.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from ..constants import COLUMNS, SVTYPE
4 |
5 |
6 | def convert_row(row: Dict) -> Dict:
7 | """
8 | Converts the fields from the original STRAGLR BED output into MAVIS definitions of an SV
9 | Since STRAGLR defines regions where short tandem repeats exist we make the definitions here fairly
10 | non-specific
11 |
12 | See their github page for more details: https://github.com/bcgsc/straglr
13 |
14 | BED Columns
15 | - chrom: chromosome name
16 | - start: start coordinate of locus
17 | - end: end coordinate of locus
18 | - repeat_unit: repeat motif
19 |     - alleleN:size: where N={1,2,3...} depending on --max_num_clusters e.g. N={1,2} if --max_num_clusters==2 (default)
20 |     - alleleN:copy_number
21 |     - alleleN:support
22 | """
23 | return {
24 | COLUMNS.break1_chromosome: row['chrom'],
25 | COLUMNS.break2_chromosome: row['chrom'],
26 | COLUMNS.break1_position_start: row['start'],
27 | COLUMNS.break1_position_end: row['end'],
28 | COLUMNS.break2_position_start: row['start'],
29 | COLUMNS.break2_position_end: row['end'],
30 | COLUMNS.untemplated_seq: None,
31 | COLUMNS.event_type: SVTYPE.INS,
32 | }
33 |
--------------------------------------------------------------------------------
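A usage sketch for the converter above, feeding it one row shaped like `tests/data/straglr.bed`; only the locus columns are consumed, so the extra allele columns can be omitted.

```python
# Illustrative row matching the straglr BED columns documented above; convert_row
# only uses chrom/start/end and labels the event as an insertion.
from mavis.constants import SVTYPE
from mavis.convert.straglr import convert_row

row = {'chrom': 'chr11', 'start': 776686, 'end': 778078, 'repeat_unit': 'CT'}
std_row = convert_row(row)
assert std_row['break1_chromosome'] == std_row['break2_chromosome'] == 'chr11'
assert std_row['event_type'] == SVTYPE.INS  # 'insertion'
```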
/docs/configuration/pipeline.md:
--------------------------------------------------------------------------------
1 | # Running the Pipeline
2 |
3 | ## Running MAVIS using a Job Scheduler
4 |
5 | MAVIS v3 uses [snakemake](https://snakemake.readthedocs.io/en/stable/) to handle job scheduling
6 | and setup
7 |
8 | The MAVIS pipeline is highly configurable. Some pipeline steps
9 | (cluster, validate) are optional and can be skipped
10 | automatically; the standard pipeline includes
11 | all steps.
12 |
13 | The most common use case is running the pipeline through snakemake
14 |
15 | ```bash
16 | snakemake -j --configfile -s Snakefile
17 | ```
18 |
19 | If you are submitting to a cluster, use the [snakemake profiles](https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles)
20 |
21 | ```bash
22 | snakemake -j --configfile --profile -s Snakefile
23 | ```
24 |
25 | This will submit a series of jobs with dependencies.
26 |
27 | To use the mavis docker container through singularity, instead of installing mavis via pip, add the
28 | [`--use-singularity`](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#running-jobs-in-containers)
29 | flag.
30 |
31 | ```bash
32 | snakemake -j --configfile --profile --use-singularity -s Snakefile
33 | ```
34 |
--------------------------------------------------------------------------------
/docs/package/mavis/pairing/index.md:
--------------------------------------------------------------------------------
1 | # Sub-package Documentation
2 |
3 | This is the package responsible for pairing/grouping calls between libraries. In many cases
4 | this will be where somatic vs germline is determined or genomic only vs expressed.
5 |
6 | ## Output Files
7 |
8 | | expected name/suffix | file type/format | content |
9 | | ---------------------- | ---------------- | --------------------------------------------------------- |
10 | | ``mavis_paired_*.tab`` | text/tabbed | call information and pairing information using product id |
11 |
12 |
13 | ## Algorithm Overview
14 |
15 | - pairwise comparison of breakpoint pairs between libraries
16 |
17 | - fail if orientations do not match
18 | - fail if template/chromosomes do not match
19 | - if the protocols are mixed
20 |
21 | - pass if the fusion products match at the sequence level
22 | - pass if the breakpoint predicted from the genome matches the transcriptome breakpoint
23 |
24 | - if the protocols are the same
25 |
26 | - pass if the breakpoints are co-located
27 |
28 | - filter matches based on annotations
29 |
30 | - if both breakpoints have the same gene annotation, they must also both have the same transcript annotation
31 |
--------------------------------------------------------------------------------
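The overview above reduces to a small set of checks per pairwise comparison. The sketch below restates that decision flow in simplified Python; the function and field names are illustrative only and do not mirror the actual `mavis.pairing` implementation, with the two fuzzy checks injected as predicates.

```python
# Simplified sketch of the pairing decision described above (illustrative names,
# not the mavis.pairing API).
from typing import Callable, Dict


def calls_are_paired(
    call1: Dict[str, str],
    call2: Dict[str, str],
    fusion_products_match: Callable[[Dict, Dict], bool],
    breakpoints_colocated: Callable[[Dict, Dict], bool],
) -> bool:
    # fail if the template/chromosomes do not match
    if (call1['break1_chromosome'], call1['break2_chromosome']) != (
        call2['break1_chromosome'], call2['break2_chromosome']
    ):
        return False
    # fail if the orientations do not match
    if (call1['break1_orientation'], call1['break2_orientation']) != (
        call2['break1_orientation'], call2['break2_orientation']
    ):
        return False
    if call1['protocol'] != call2['protocol']:
        # mixed protocols: pass when the fusion products agree at the sequence level
        # (or the genome-predicted breakpoint matches the transcriptome breakpoint)
        return fusion_products_match(call1, call2)
    # same protocol: pass when the breakpoints are co-located
    return breakpoints_colocated(call1, call2)
```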
/src/mavis/convert/cnvnator.py:
--------------------------------------------------------------------------------
1 | """
2 | from cnvnator: https://github.com/abyzovlab/CNVnator
3 |
4 | CNV_type coordinates CNV_size normalized_RD e-val1 e-val2 e-val3 e-val4 q0
5 |
6 | normalized_RD -- normalized to 1.
7 | e-val1 -- is calculated using t-test statistics.
8 | e-val2 -- is from the probability of RD values within the region to be in
9 | the tails of a gaussian distribution describing frequencies of RD values in bins.
10 | e-val3 -- same as e-val1 but for the middle of CNV
11 | e-val4 -- same as e-val2 but for the middle of CNV
12 | q0 -- fraction of reads mapped with q0 quality
13 | """
14 | import re
15 |
16 |
17 | def convert_row(row):
18 | """
19 |
20 | Args:
21 | row (Dict[str]): dict representing the row output from cnvnator
22 |
23 | Returns:
24 |         dict: transformed row using mavis standard column names
25 | """
26 | result = {k: v for k, v in row.items() if k != 'coordinates'}
27 | chrom, start, end = re.split(r'[-:]', row['coordinates'])
28 | result['break1_chromosome'] = result['break2_chromosome'] = chrom
29 | result['break1_position_start'] = result['break1_position_end'] = start
30 | result['break2_position_start'] = result['break2_position_end'] = end
31 | return result
32 |
--------------------------------------------------------------------------------
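A usage sketch for `convert_row` above with one row shaped like `tests/data/cnvnator.tab` (values invented, and the `event_type` key name is only illustrative); the `coordinates` string is split into a chromosome and two positions while every other field is passed through.

```python
# Illustrative CNVnator row: only the 'coordinates' field is restructured by
# convert_row above; all other keys are copied through unchanged.
from mavis.convert.cnvnator import convert_row

row = {'event_type': 'deletion', 'coordinates': '1:51201-74200'}
result = convert_row(row)
assert result['break1_chromosome'] == result['break2_chromosome'] == '1'
assert (result['break1_position_start'], result['break2_position_start']) == ('51201', '74200')
assert result['event_type'] == 'deletion'  # untouched passthrough
```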
/tests/util.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import shutil
4 |
5 | import pytest
6 |
7 | DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
8 |
9 |
10 | long_running_test = pytest.mark.skipif(
11 | os.environ.get('RUN_FULL') != '1',
12 | reason='Only running FAST tests subset',
13 | )
14 |
15 | bwa_only = pytest.mark.skipif(not shutil.which('bwa'), reason='missing the command')
16 | blat_only = pytest.mark.skipif(not shutil.which('blat'), reason='missing the command')
17 | todo = pytest.mark.skip(reason='TODO')
18 |
19 |
20 | def package_relative_file(*paths):
21 | return os.path.abspath(os.path.join(os.path.dirname(__file__), '..', *paths))
22 |
23 |
24 | def get_data(*paths):
25 | return os.path.join(DATA_DIR, *paths)
26 |
27 |
28 | def glob_exists(*pos, strict=False, n=1):
29 | globexpr = os.path.join(*pos)
30 | file_list = glob.glob(globexpr)
31 | if strict and len(file_list) == n:
32 | return file_list[0] if len(file_list) == 1 else file_list
33 | elif not strict and len(file_list) > 0:
34 | return file_list
35 | else:
36 | print(globexpr)
37 | print(file_list)
38 | return False
39 |
40 |
41 | def glob_not_exists(*pos):
42 | globexpr = os.path.join(*pos)
43 | file_list = glob.glob(globexpr)
44 | return not file_list
45 |
--------------------------------------------------------------------------------
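A sketch of how these helpers are typically consumed from a test module; the test body is invented (see `tests/test_mavis/annotate/test_annotate_fileio2.py` for a real caller of `get_data`) and assumes the suite is run from the repository root so that `tests` is importable as a package.

```python
# Illustrative test using the helpers above: skip unless blat is on the PATH,
# and resolve a fixture path under tests/data via get_data.
import os

from tests.util import blat_only, get_data


@blat_only
def test_mock_bam_fixture_exists():
    assert os.path.exists(get_data('mock_reads_for_events.sorted.bam'))
```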
/tests/data/mock_reference_annotations.json:
--------------------------------------------------------------------------------
1 | {
2 | "genes": [
3 | {
4 | "chr": "fake",
5 | "start": 1,
6 | "end": 1000,
7 | "strand": "+",
8 | "name": "ENSG0001",
9 | "aliases": [],
10 | "transcripts": [
11 | {
12 | "is_best_transcript": true,
13 | "name": "ENST001",
14 | "start": 101,
15 | "end": 900,
16 | "exons": [
17 | {"start": 101, "end": 200},
18 | {"start": 401, "end": 500},
19 | {"start": 601, "end": 700},
20 | {"start": 801, "end": 900}
21 | ],
22 | "domains": [
23 | {
24 | "name": "PF001",
25 | "desc": "",
26 | "regions": [
27 | {"start": 1, "end": 10},
28 | {"start": 50, "end": 63}
29 | ]
30 | }
31 | ],
32 | "cdna_coding_start": 51,
33 | "cdna_coding_end": 350
34 | }
35 | ]
36 | }
37 | ]
38 | }
39 |
--------------------------------------------------------------------------------
/docs/inputs/non_python_dependencies.md:
--------------------------------------------------------------------------------
1 | # Non-python Dependencies
2 |
3 | MAVIS integrates with
4 | [SV callers](./sv_callers.md),
5 | [job schedulers](#job-schedulers), and
6 | [aligners](#aligners). While some of
7 | these dependencies are optional, all currently supported options are
8 | detailed below. The versions column in the tables below lists all the
9 | versions that were tested for each tool. Each version listed is known
10 | to be compatible with MAVIS.
11 |
12 | ## Job Schedulers
13 |
14 | MAVIS v3 uses [snakemake](https://snakemake.readthedocs.io/en/stable/) to handle job scheduling
15 |
16 | ## Aligners
17 |
18 | Two aligners are supported: [bwa](../../glossary/#bwa) and
19 | [blat](../../glossary/#blat) (default). These are both included in the docker image by default.
20 |
21 | | Name | Version(s) | Environment Setting |
22 | | ---------------------------------------------- | ----------------------- | ------------------------- |
23 | | [blat](../../glossary/#blat) | `36x2` `36` | `MAVIS_ALIGNER=blat` |
24 | | [bwa mem](../../glossary/#bwa-mem)             | `0.7.15-r1140` `0.7.12` | `MAVIS_ALIGNER='bwa mem'` |
25 |
26 | !!! note
27 | When setting the aligner you will also need to set the
28 | [aligner_reference](../../configuration/settings/#aligner_reference) to match
29 |
--------------------------------------------------------------------------------
/docs/outputs/illustrations.md:
--------------------------------------------------------------------------------
1 | # Illustrations
2 |
3 | ## Fusion Diagrams
4 |
5 | These are diagrams produced during the annotate step. These represent
6 | the putative fusion events of a single breakpoint pair.
7 |
8 | 
9 |
10 | Fusion from transcriptome data. Intronic breakpoints here indicate
11 | retained intron sequence and a novel exon is
12 | predicted.
13 |
14 | If the [draw_fusions_only](../../configuration/settings/#draw_fusions_only) flag is set to
15 | False then all events will produce a diagram, even anti-sense fusions.
16 |
17 | 
18 |
19 | Disruptive Anti-sense
20 | Fusion
21 |
22 | ## Transcript Overlays
23 |
24 | MAVIS supports generating diagrams of all transcripts for a given gene.
25 | These can be overlaid with markers and bam file pileup data. This is
26 | particularly useful for visualizing splice site mutations.
27 |
28 | 
29 |
30 | RB1 splice site mutation results in skipping of exon 9
31 |
32 | The above diagram was generated using the overlay command
33 |
34 | ```bash
35 | mavis overlay RB1 \
36 | -o /path/to/output/dir \
37 | --read_depth_plot rna /path/to/bam/file \
38 | --marker M1 48939029 \
39 | --annotations /path/to/mavis/annotations/reference/file
40 | ```
41 |
--------------------------------------------------------------------------------
/src/mavis/convert/breakdancer.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import pandas as pd
4 |
5 |
6 | def convert_file(input_file):
7 | bam_to_lib = {}
8 |
9 | # read comments
10 | with open(input_file, 'r') as fh:
11 | # comments in breakdancer are marked with a single # so they need to be discarded before reading
12 | lines = fh.readlines()
13 | line_index = 0
14 | while line_index < len(lines) and lines[line_index].startswith('#'):
15 | metadata_match = re.match(r'^#(\S+)\t.*\tlibrary:(\S+)\t.*', lines[line_index])
16 | if metadata_match:
17 | bam_to_lib[metadata_match.group(1)] = metadata_match.group(2)
18 | line_index += 1
19 | header = [c.strip() for c in re.sub(r'^#', '', lines[line_index - 1]).split('\t')]
20 | # read the main file
21 | df = pd.read_csv(
22 | input_file,
23 | names=header,
24 | sep='\t',
25 | comment='#',
26 | dtype={
27 | 'num_Reads_lib': str,
28 | 'Pos1': int,
29 | 'Pos2': int,
30 | 'Chr1': str,
31 | 'Chr2': str,
32 | 'Type': str,
33 | },
34 | )
35 | if 'num_Reads_lib' not in df:
36 | raise KeyError('missing required column: num_Reads_lib')
37 |
38 | for bam, lib in bam_to_lib.items():
39 | df['num_Reads_lib'] = df['num_Reads_lib'].str.replace(bam, lib)
40 | return df.to_dict('records')
41 |
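# Illustrative note (editorial, not part of the original module): for input like
# tests/data/breakdancer_output.txt, convert_file() uses the '#' metadata lines
# to map each bam path to its library name, takes the last '#' line as the
# column header, and returns one dict per event with the bam paths inside the
# num_Reads_lib column replaced by their library names.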
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7-slim-buster
2 |
3 | WORKDIR /app
4 |
5 | RUN apt-get update && \
6 | apt-get upgrade -y && \
7 | apt-get install -y git wget make gcc libz-dev
8 |
9 | # pysam dependencies
10 | RUN apt-get install -y libncurses5-dev zlib1g-dev libbz2-dev libncursesw5-dev liblzma-dev
11 |
12 | # install BWA
13 | RUN git clone https://github.com/lh3/bwa.git && \
14 | cd bwa && \
15 | git checkout v0.7.17 && \
16 | make && \
17 | cd .. && \
18 | mv bwa/bwa /usr/local/bin
19 |
20 | # install minimap2
21 | RUN git clone https://github.com/lh3/minimap2.git && \
22 | cd minimap2 && \
23 | git checkout v2.24 && \
24 | make && \
25 | cd .. && \
26 |     mv minimap2/minimap2 /usr/local/bin
27 |
28 | # install blat dependencies
29 | RUN apt-get install -y libcurl4
30 |
31 | # install blat
32 | RUN wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat && \
33 | chmod a+x blat && \
34 | mv blat /usr/local/bin
35 |
36 | # install wtdbg2
37 | RUN git clone https://github.com/ruanjue/wtdbg2.git && \
38 | cd wtdbg2 && \
39 | make && \
40 | cd .. && \
41 | mv wtdbg2/wtdbg2 /usr/local/bin
42 |
43 | COPY setup.py setup.py
44 | COPY setup.cfg setup.cfg
45 | COPY MANIFEST.in MANIFEST.in
46 | COPY pyproject.toml pyproject.toml
47 | COPY src src
48 | COPY LICENSE LICENSE
49 | COPY README.md README.md
50 |
51 | # install python package
52 | RUN pip install -U setuptools pip wheel
53 | RUN pip install .
54 | RUN which mavis
55 | ENTRYPOINT [ "mavis" ]
56 |
--------------------------------------------------------------------------------
/docs/migrating.md:
--------------------------------------------------------------------------------
1 | # Migrating
2 |
3 | ## Migrating from v2 to v3
4 |
5 | There are major changes from v2 to v3 of MAVIS.
6 |
7 | ### Tab File Headers
8 |
9 | Tab file headers no longer start with `#`. Any lines starting with a pound will be treated
10 | as comments. This will apply to mavis-style inputs as well as any tab delimited
11 | reference files
12 |
13 | ### Configuration
14 |
15 | MAVIS no longer uses command line arguments, config files, and environment variables for
16 | configuration. Instead all configurable settings are controlled via a single input JSON
17 | config file
18 |
19 | ### Scheduling
20 |
21 | MAVIS is now integrated with snakemake instead of handling its own scheduling
22 |
23 | ## Reference Annotation Files
24 |
25 | MAVIS no longer supports the previously deprecated tab-delimited format of the annotations file. If you are still using these files in your project we have provided a script to automatically convert them to the newer format in the tools directory.
26 |
27 | ```bash
28 | python src/tools/convert_annotations_format.py \
29 | /path/to/tab/file.tab \
30 | --input_type v2-tab \
31 | /path/to/new/json/file.json
32 | ```
33 |
34 | In v3 the JSON files are slightly different to support multiple translations per transcript. Your old v2 JSON files can be automatically converted to the new format with the same script
35 |
36 | ```bash
37 | python src/tools/convert_annotations_format.py \
38 | /path/to/json/file.json \
39 | --input_type v2-json \
40 | /path/to/new/json/file.json
41 | ```
42 |
--------------------------------------------------------------------------------
/docs/package/mavis/cluster/index.md:
--------------------------------------------------------------------------------
1 | # Sub-package Documentation
2 |
3 | The cluster sub-package is responsible for merging variants coming from different inputs (i.e. different tools).
4 |
5 | ## Types of Output Files
6 |
7 | | expected name/suffix | file type/format | content |
8 | | ------------------------------ | -------------------------- | -------------------------------------------------------------------- |
9 | | ``cluster_assignment.tab`` | text/tabbed | |
10 | | ``uninformative_clusters.txt`` | text | list of cluster ids that were dropped by annotation proximity filter |
11 | | ``clusters.bed`` | [bed](../../glossary/#bed) | cluster positions |
12 | | ``cluster-*.tab`` | text/tabbed | computed clusters |
13 |
14 | ## Algorithm Overview
15 |
16 | - Collapse any duplicate breakpoint pairs
17 | - Split breakpoint pairs by type
18 | - Cluster breakpoint pairs by distance (within a type)
19 |
20 | - Create a graph representation of the distances between pairs
21 | - Find cliques up to a given input size (cluster_clique_size)
22 | - Hierarchically cluster the cliques (allows redundant participation)
23 | - For each input node/pair pick the best cluster(s)
24 |
25 | - Output the clusters and the mapping to the input pairs (see the sketch below)
26 |
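A minimal sketch, assuming breakpoint pairs of a single event type represented as `(chromosome, position1, position2)` tuples, of the distance step referenced above. It uses connected components of a `networkx` graph as a stand-in for the clique-finding and hierarchical clustering performed by MAVIS, so it is illustrative only.

```python
import itertools

import networkx as nx


def cluster_by_distance(pairs, max_distance=10):
    """Group breakpoint pairs whose positions are within max_distance of each other."""
    graph = nx.Graph()
    graph.add_nodes_from(range(len(pairs)))
    for i, j in itertools.combinations(range(len(pairs)), 2):
        (chrom1, start1, end1), (chrom2, start2, end2) = pairs[i], pairs[j]
        if chrom1 == chrom2 and abs(start1 - start2) + abs(end1 - end2) <= max_distance:
            graph.add_edge(i, j)
    # connected components stand in for the clique/hierarchical steps above
    return [sorted(component) for component in nx.connected_components(graph)]


cluster_by_distance([('1', 100, 500), ('1', 102, 503), ('2', 100, 500)])
# -> [[0, 1], [2]]
```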
--------------------------------------------------------------------------------
/src/mavis/convert/arriba.py:
--------------------------------------------------------------------------------
1 | from ..constants import COLUMNS, ORIENT
2 |
3 |
4 | def get_orient(string):
5 | if string == "downstream":
6 | return ORIENT.LEFT
7 | elif string == "upstream":
8 | return ORIENT.RIGHT
9 | return ORIENT.NS
10 |
11 |
12 | def convert_row(row):
13 | """
14 |     transforms the arriba output into the common format for expansion. Maps the input column
15 | names to column names which MAVIS can read
16 | """
17 | std_row = {}
18 |
19 | try:
20 | std_row[COLUMNS.break1_chromosome], b1_start = row["breakpoint1"].split(":")
21 | std_row[COLUMNS.break2_chromosome], b2_start = row["breakpoint2"].split(":")
22 |
23 | std_row[COLUMNS.break1_strand] = row["strand1(gene/fusion)"].split("/")[1]
24 | std_row[COLUMNS.break2_strand] = row["strand2(gene/fusion)"].split("/")[1]
25 | std_row[COLUMNS.event_type] = row["type"].split("/")[0]
26 | std_row[COLUMNS.break1_orientation] = get_orient(row["direction1"])
27 | std_row[COLUMNS.break2_orientation] = get_orient(row["direction2"])
28 |
29 | std_row[COLUMNS.break1_position_start] = std_row[COLUMNS.break1_position_end] = b1_start
30 | std_row[COLUMNS.break2_position_start] = std_row[COLUMNS.break2_position_end] = b2_start
31 | except (ValueError, TypeError):
32 | raise AssertionError(
33 | "Could not parse the breakpoint from the Arriba row: {}, {}".format(
34 | row["breakpoint1"], row["breakpoint2"]
35 | )
36 | )
37 | return std_row
38 |
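# Example of the mapping performed by convert_row (illustrative comment, not
# part of the original module): a row with breakpoint1 = '12:25403870',
# direction1 = 'downstream' and strand1(gene/fusion) = '-/-' yields
# break1_chromosome = '12', break1_position_start = break1_position_end = '25403870',
# break1_orientation = ORIENT.LEFT and break1_strand = '-'. The second breakpoint
# is handled the same way, and event_type is the text before the '/' in the type column.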
--------------------------------------------------------------------------------
/docs/tutorials/mini.md:
--------------------------------------------------------------------------------
1 | # MAVIS (Mini) Tutorial
2 |
3 | This tutorial is based on the data included in the tests folder of
4 | MAVIS. The data files are very small and this tutorial is really only
5 | intended for testing a MAVIS install. The data here is simulated and
6 | results are not representative of the typical events you would see
7 | reported from MAVIS. For a more complete tutorial with actual fusion
8 | gene examples, please see the [full tutorial](../../tutorials/full/).
9 |
10 | The first step is to clone or download a zip of the MAVIS repository
11 | (<https://github.com/bcgsc/mavis>). You will need the tests directory.
12 | The tag you check out should correspond to the MAVIS version you have
13 | installed
14 |
15 | ```bash
16 | git clone https://github.com/bcgsc/mavis.git
17 | git checkout <tag>  # use the tag matching your installed MAVIS version
18 | mv mavis/tests .
19 | mv mavis/Snakefile .
20 | rm -r mavis
21 | ```
22 |
23 | Now you should have a folder called `tests` in your current directory. Since this is a trivial
24 | example, it can easily be run locally. However in order to run the snakemake file you will need
25 | to have the config validation module `mavis_config` installed which has minimal dependencies.
26 |
27 | ```bash
28 | pip install mavis_config
29 | ```
30 |
31 | Now you are ready to run MAVIS. This can be done in a single command using snakemake.
32 |
33 | ```bash
34 | snakemake -j 1 --configfile=tests/mini-tutorial.config.json -s Snakefile
35 | ```
36 |
37 | This will run the mini tutorial and write output files into a folder called `output_dir` in the
38 | current directory
39 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | name: publish
5 |
6 | on:
7 | release:
8 | types: [created]
9 |
10 | jobs:
11 | pypi:
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v2
15 | - name: Set up Python
16 | uses: actions/setup-python@v2
17 | with:
18 | python-version: '3.x'
19 | - name: Install dependencies
20 | run: |
21 | python -m pip install --upgrade pip
22 | pip install setuptools wheel twine
23 | - name: Build and publish
24 | env:
25 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
26 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
27 | run: |
28 | python setup.py sdist bdist_wheel install
29 | twine check dist/*
30 | twine upload dist/*
31 | docker:
32 | runs-on: ubuntu-latest
33 | steps:
34 | - uses: actions/checkout@v2
35 | - run: docker login -u $DOCKER_USER -p $DOCKER_PASSWORD
36 | env:
37 | DOCKER_USER: ${{ secrets.DOCKER_USER }}
38 | DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
39 | - run: |
40 | docker build --file Dockerfile --tag bcgsc/mavis:latest --tag bcgsc/mavis:${{ github.event.release.tag_name }} .
41 | - run: docker push bcgsc/mavis:latest
42 | - run: docker push bcgsc/mavis:${{ github.event.release.tag_name }}
43 |
--------------------------------------------------------------------------------
/tests/mini-tutorial.annotate_only.config.json:
--------------------------------------------------------------------------------
1 | {
2 | "annotate.draw_fusions_only": false,
3 | "convert": {
4 | "mock_converted": {
5 | "inputs": [
6 | "tests/data/mock_sv_events.tsv"
7 | ],
8 | "file_type": "mavis",
9 | "assume_no_untemplated": true
10 | }
11 | },
12 | "skip_stage.validate": true,
13 | "cluster.uninformative_filter": true,
14 | "cluster.limit_to_chr": null,
15 | "cluster.min_clusters_per_file": 5,
16 | "libraries": {
17 | "mock-A47933": {
18 | "assign": [
19 | "tests/data/mock_trans_sv_events.tsv"
20 | ],
21 | "bam_file": "tests/data/mock_trans_reads_for_events.sorted.bam",
22 | "disease_status": "diseased",
23 | "protocol": "transcriptome",
24 | "strand_specific": true
25 | },
26 | "mock-A36971": {
27 | "assign": [
28 | "mock_converted"
29 | ],
30 | "bam_file": "tests/data/mock_reads_for_events.sorted.bam",
31 | "disease_status": "diseased",
32 | "protocol": "genome",
33 | "strand_specific": false
34 | }
35 | },
36 | "output_dir": "output_dir",
37 | "reference.annotations": [
38 | "tests/data/mock_annotations.json"
39 | ],
40 | "reference.dgv_annotation": [
41 | "tests/data/mock_dgv_annotation.tab"
42 | ],
43 | "reference.masking": [
44 | "tests/data/mock_masking.tab"
45 | ],
46 | "reference.reference_genome": [
47 | "tests/data/mock_reference_genome.fa"
48 | ]
49 | }
50 |
--------------------------------------------------------------------------------
/src/mavis/convert/chimerascan.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from ..constants import COLUMNS, ORIENT
4 | from .constants import SUPPORTED_TOOL, TRACKING_COLUMN
5 |
6 |
7 | def convert_row(row: Dict) -> Dict:
8 | """
9 |     transforms the chimerascan output into the common format for expansion. Maps the input column
10 | names to column names which MAVIS can read
11 | """
12 | std_row = {}
13 | for retained_column in ['genes5p', 'genes3p']:
14 | if retained_column in row:
15 | std_row['{}_{}'.format(SUPPORTED_TOOL.CHIMERASCAN, retained_column)] = row[
16 | retained_column
17 | ]
18 | if TRACKING_COLUMN not in row:
19 | std_row[TRACKING_COLUMN] = '{}-{}'.format(
20 | SUPPORTED_TOOL.CHIMERASCAN, row['chimera_cluster_id']
21 | )
22 |
23 | std_row.update(
24 | {COLUMNS.break1_chromosome: row['chrom5p'], COLUMNS.break2_chromosome: row['chrom3p']}
25 | )
26 | if row['strand5p'] == '+':
27 | std_row[COLUMNS.break1_position_start] = row['end5p']
28 | std_row[COLUMNS.break1_orientation] = ORIENT.LEFT
29 | else:
30 | std_row[COLUMNS.break1_position_start] = row['start5p']
31 | std_row[COLUMNS.break1_orientation] = ORIENT.RIGHT
32 | if row['strand3p'] == '+':
33 | std_row[COLUMNS.break2_position_start] = row['start3p']
34 | std_row[COLUMNS.break2_orientation] = ORIENT.RIGHT
35 | else:
36 | std_row[COLUMNS.break2_position_start] = row['end3p']
37 | std_row[COLUMNS.break2_orientation] = ORIENT.LEFT
38 | std_row[COLUMNS.opposing_strands] = row['strand5p'] != row['strand3p']
39 | return std_row
40 |
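# Orientation summary (illustrative comment, not part of the original module):
# for the 5' partner a '+' strand uses end5p with ORIENT.LEFT and a '-' strand
# uses start5p with ORIENT.RIGHT; the 3' partner mirrors this (start3p/RIGHT
# for '+', end3p/LEFT for '-'). opposing_strands is True whenever the two
# strands differ.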
--------------------------------------------------------------------------------
/docs/configuration/performance.md:
--------------------------------------------------------------------------------
1 | # Resource Requirements
2 |
3 | MAVIS has been tested on both unix and linux systems. For the standard
4 | pipeline, the validation stage is the most computationally expensive.
5 | The memory and cpu requirements will vary with two main factors: the
6 | number of structural variants you are validating per job, and the size
7 | of the bam file you are validating against.
8 |
9 | There are a number of settings that can be adjusted to reduce memory and
10 | cpu requirements depending on what the user is trying to analyze. See
11 | [configuration and settings](../../configuration/general/) for more details.
12 |
13 | ## Validation Resources
14 |
15 | 
16 |
17 | Resource Requirements (MAVIS 1.8.0) for each validation job of the
18 | COLO829 tumour genome. The BAM file for the tumour genome is 127GB.
19 | Validation jobs were tested splitting into: 100, 500, 1000, and 2500
20 | structural variant validations per job. The effect of number of events
21 | validated on both memory and time is plotted
22 | above.
23 |
24 | ## Annotation Resources
25 |
26 | Similar trends were observed for the annotation step (see below) with
27 | regards to time elapsed. However the memory requirements remained more
28 | constant, which is expected since, unlike validation, annotation does
29 | not read in more data for more events.
30 |
31 | 
32 |
33 | Resource Requirements (MAVIS 1.8.0) for each annotation job of the
34 | COLO829 tumour genome. The events which passed validation (see above)
35 | represent the number of events input to the annotation
36 | step.
37 |
--------------------------------------------------------------------------------
/tests/test_tools/data/ensembl69_hg19_annotations.kras.tab:
--------------------------------------------------------------------------------
1 | ## input file used to map hugo gene names: compiled_gene_drug_pathway.v1_2_5.tsv
2 | ## input file for picking best transcript: ens69_best_transcript.txt
3 | ## Ensembl Api version 69
4 | ## generated at: Thu Aug 4 16:38:01 2016
5 | #ensembl_gene_id hugo_names chr strand gene_start gene_end best_ensembl_transcript_id ensembl_transcript_id refseq_equivalents transcript_genomic_start transcript_genomic_end cdna_coding_start cdna_coding_end genomic_exon_ranges AA_domain_ranges
6 | ENSG00000133703 KRAS 12 -1 25357723 25403870 ENST00000311936 ENST00000311936 NP_004976.2;NM_004985.3 25357723 25403865 193 759 25403685-25403865;25398208-25398329;25380168-25380346;25378548-25378707;25357723-25362845 PR00449:4-25,27-43,44-66,107-120,141-163;PF00025:3-162;SM00173:1-166;PF00009:45-163;PF08477:5-119;PS50318:165-184;SSF52540:3-184;TIGR00231:1-159;SM00175:4-166;PF00071:5-164;SM00174:6-166
7 | ENSG00000133703 KRAS 12 -1 25357723 25403870 ENST00000311936 ENST00000557334 25362102 25403870 198 425 25403685-25403870;25398208-25398329;25362102-25362845 PR00449:4-25,27-43;PS50318:52-71;SM00173:1-53;PF00071:5-44;SSF52540:3-37
8 | ENSG00000133703 KRAS 12 -1 25357723 25403870 ENST00000311936 ENST00000256078 NP_203524.1;NM_033360.2 25362365 25403737 65 634 25403685-25403737;25398208-25398329;25380168-25380346;25378548-25378707;25368371-25368494;25362365-25362845 SM00175:4-166;PF00071:5-164;SSF52540:3-185;SM00176:9-189;TIGR00231:1-159;SM00174:6-166;PR00449:4-25,27-43,44-66,107-120,141-163;PF00025:3-161;PF08477:5-119;PF00009:45-162;SM00173:1-166
9 | ENSG00000133703 KRAS 12 -1 25357723 25403870 ENST00000311936 ENST00000556131 25386753 25403863 178 309 25403698-25403863;25398208-25398329;25386753-25388160 PR00449:4-25,27-43;PF00071:5-37;SSF52540:3-38
10 |
--------------------------------------------------------------------------------
/docs/development.md:
--------------------------------------------------------------------------------
1 | # Guidelines for Contributors
2 |
3 | {!./../.github/CONTRIBUTING.md!}
4 |
5 | ## Major Assumptions
6 |
7 | Some assumptions have been made when developing this project. The major
8 | ones have been listed here to facilitate debugging/development if any of
9 | these are violated in the future.
10 |
11 | - The input bam reads store the sequence with respect to the positive/forward strand, not its reverse complement.
12 | - The distribution of the fragment sizes in the bam file approximately follows a normal distribution.
13 |
14 | ## Current Limitations
15 |
16 | - Assembling contigs will always fail for repeat sequences as we do not resolve this. Unlike traditional assemblies we cannot assume even input coverage as we are taking a select portion of the reads to assemble.
17 | - Currently no attempt is made to group/pair single events into complex events.
18 | - Transcriptome validation uses a collapsed model of all overlapping transcripts and is not isoform specific. Allowing for isoform specific validation would be computationally expensive but may be considered as an optional setting for future releases.
19 |
20 | ## Computing Code coverage
21 |
22 | Since MAVIS uses multiple processes, computing the code coverage is more
23 | complex. Running coverage normally will under-report it. To ensure that
24 | the coverage module captures the information from the subprocesses we
25 | need to do the following
26 |
27 | In our development python virtual environment put a coverage.pth file
28 | (ex. `venv/lib/python3.6/site-packages/coverage.pth`) containing the
29 | following
30 |
31 | ```python
32 | import coverage; coverage.process_startup()
33 | ```
34 |
35 | Additionally you will need to set the environment variable
36 |
37 | ```bash
38 | export COVERAGE_PROCESS_START=/path/to/mavis/repo/mavis/.coveragerc
39 | ```
40 |
--------------------------------------------------------------------------------
/.github/workflows/quick-tests.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: quick-tests
5 |
6 | on: [push]
7 |
8 | jobs:
9 | build:
10 | runs-on: ubuntu-20.04
11 | strategy:
12 | matrix:
13 | python-version: ["3.7", "3.8", "3.9", "3.10"]
14 | name: python-${{ matrix.python-version }} quick
15 | steps:
16 | - uses: actions/checkout@v2
17 | - name: Set up Python ${{ matrix.python-version }}
18 | uses: actions/setup-python@v2
19 | with:
20 | python-version: ${{ matrix.python-version }}
21 | - name: Install dependencies
22 | run: |
23 | python -m pip install --upgrade pip setuptools wheel
24 | pip install .[test]
25 | - name: Lint with flake8
26 | run: |
27 | pip install flake8
28 | # stop the build if there are Python syntax errors or undefined names
29 | flake8 src tests --count --show-source --statistics
30 | - name: Lint with black
31 | run: |
32 | pip install black
33 | # stop the build if black needs to be run
34 | black src tests -S -l 100 --check
35 | - name: Lint with isort
36 | run: |
37 | pip install isort
38 | isort src tests --check
39 | - name: install bwa
40 | run: |
41 | git clone https://github.com/lh3/bwa.git
42 | cd bwa
43 | git checkout v0.7.17
44 | make
45 | cd ..
46 | - name: install blat
47 | run: |
48 | wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat
49 | chmod a+x blat
50 | - name: run short tests with pytest
51 | run: |
52 | export PATH=$PATH:$(pwd):$(pwd)/bwa
53 | pytest tests -v \
54 | --junitxml=junit/test-results-${{ matrix.python-version }}.xml \
55 | --durations=10
56 | env:
57 | RUN_FULL: 0
58 |
--------------------------------------------------------------------------------
/src/mavis/convert/constants.py:
--------------------------------------------------------------------------------
1 | from ..constants import SVTYPE, MavisNamespace
2 |
3 |
4 | class SUPPORTED_TOOL(MavisNamespace):
5 | """
6 | Supported Tools used to call SVs and then used as input into MAVIS
7 |
8 | Attributes:
9 | CHIMERASCAN: chimerascan [Iyer-2011]_
10 | DEFUSE: defuse [McPherson-2011]_
11 | DELLY: delly [Rausch-2012]_
12 | MANTA: manta [Chen-2016]_
13 | PINDEL: pindel [Ye-2009]_
14 | TA: transabyss [Robertson-2010]_
15 | """
16 |
17 | MANTA = 'manta'
18 | DELLY = 'delly'
19 | TA = 'transabyss'
20 | PINDEL = 'pindel'
21 | CHIMERASCAN = 'chimerascan'
22 | MAVIS = 'mavis'
23 | DEFUSE = 'defuse'
24 | BREAKDANCER = 'breakdancer'
25 | VCF = 'vcf'
26 | BREAKSEQ = 'breakseq'
27 | CNVNATOR = 'cnvnator'
28 | STRELKA = 'strelka'
29 | STARFUSION = 'starfusion'
30 | STRAGLR = 'straglr'
31 | ARRIBA = 'arriba'
32 |
33 |
34 | TOOL_SVTYPE_MAPPING = {v: [v] for v in SVTYPE.values()} # type: ignore
35 | TOOL_SVTYPE_MAPPING.update(
36 | {
37 | 'DEL': [SVTYPE.DEL],
38 | 'INS': [SVTYPE.INS],
39 | 'ITX': [SVTYPE.DUP],
40 | 'CTX': [SVTYPE.TRANS, SVTYPE.ITRANS],
41 | 'INV': [SVTYPE.INV],
42 | 'BND': [SVTYPE.TRANS, SVTYPE.ITRANS, SVTYPE.DUP, SVTYPE.INS, SVTYPE.DEL, SVTYPE.INV],
43 | 'TRA': [SVTYPE.TRANS, SVTYPE.ITRANS],
44 | 'CNV': [SVTYPE.DUP],
45 | 'RPL': [SVTYPE.INS],
46 | 'DUP:TANDEM': [SVTYPE.DUP],
47 | 'DUP': [SVTYPE.DUP],
48 | 'interchromosomal': [SVTYPE.TRANS, SVTYPE.ITRANS],
49 | 'eversion': [SVTYPE.DUP],
50 | 'translocation': [SVTYPE.TRANS, SVTYPE.ITRANS],
51 | 'ins': [SVTYPE.INS],
52 | 'del': [SVTYPE.DEL],
53 | 'dup': [SVTYPE.DUP],
54 | 'ITD': [SVTYPE.DUP],
55 | 'IDP': [SVTYPE.INS],
56 | 'DEL/INV': [SVTYPE.DEL, SVTYPE.INV],
57 | 'DUP/INS': [SVTYPE.DUP, SVTYPE.INS],
58 | 'INVDUP': [SVTYPE.INV, SVTYPE.DUP, SVTYPE.INS],
59 | 'INV/INVDUP': [SVTYPE.INV, SVTYPE.DUP, SVTYPE.INS],
60 | }
61 | )
62 |
63 | TRACKING_COLUMN = 'tracking_id'
64 |
--------------------------------------------------------------------------------
/tests/test_mavis/test_constants.py:
--------------------------------------------------------------------------------
1 | from mavis.constants import (
2 | COLUMNS,
3 | ORIENT,
4 | STRAND,
5 | reverse_complement,
6 | sort_columns,
7 | translate,
8 | )
9 |
10 |
11 | class TestConstants:
12 | def test_strand_compare(self):
13 | assert STRAND.compare(STRAND.NS, STRAND.POS)
14 | assert STRAND.compare(STRAND.NS, STRAND.NEG)
15 | assert STRAND.compare(STRAND.POS, STRAND.POS)
16 | assert STRAND.compare(STRAND.NEG, STRAND.NEG)
17 | assert not STRAND.compare(STRAND.POS, STRAND.NEG)
18 | assert not STRAND.compare(STRAND.NEG, STRAND.POS)
19 |
20 | def test_orient_compare(self):
21 | assert ORIENT.compare(ORIENT.NS, ORIENT.RIGHT)
22 | assert ORIENT.compare(ORIENT.NS, ORIENT.LEFT)
23 | assert ORIENT.compare(ORIENT.RIGHT, ORIENT.RIGHT)
24 | assert ORIENT.compare(ORIENT.LEFT, ORIENT.LEFT)
25 | assert not ORIENT.compare(ORIENT.RIGHT, ORIENT.LEFT)
26 | assert not ORIENT.compare(ORIENT.LEFT, ORIENT.RIGHT)
27 |
28 | def test_reverse_complement(self):
29 | assert reverse_complement('CGAT') == 'ATCG'
30 | assert reverse_complement('') == ''
31 |
32 | def test_translate(self):
33 | seq = 'ATG' 'AAT' 'TCT' 'GGA' 'TGA'
34 | translated_seq = translate(seq, 0)
35 | assert translated_seq == 'MNSG*' # ATG AAT TCT GGA TGA
36 | translated_seq = translate(seq, 1)
37 | assert translated_seq == '*ILD' # A TGA ATT CTG GAT GA
38 | translated_seq = translate(seq, 2)
39 | assert translated_seq == 'EFWM' # AT GAA TTC TGG ATG A
40 |
41 | def test_sort_columns(self):
42 | temp = ['NEW', 'NEW2', COLUMNS.break1_seq, COLUMNS.break2_seq, COLUMNS.break1_chromosome]
43 | assert sort_columns(temp) == [
44 | COLUMNS.break1_chromosome,
45 | COLUMNS.break1_seq,
46 | COLUMNS.break2_seq,
47 | 'NEW',
48 | 'NEW2',
49 | ]
50 |
51 | def test_column_matches_column_name(self):
52 | assert COLUMNS.library == COLUMNS.library
53 | s = set([COLUMNS.library, COLUMNS.library])
54 | assert len(s) == 1
55 |
--------------------------------------------------------------------------------
/tests/test_mavis/validate/test_validate.py:
--------------------------------------------------------------------------------
1 | from mavis.constants import ORIENT
2 | from mavis.interval import Interval
3 | from mavis.validate.base import Evidence
4 | from mavis.validate.call import _call_interval_by_flanking_coverage
5 |
6 | from ..mock import Mock
7 |
8 |
9 | class CallIntervalByFlankingCoverage:
10 | def test_invalid_input_attr(self):
11 | pass
12 |
13 | def test_left(self):
14 | i = _call_interval_by_flanking_coverage(
15 | Mock(start=101, end=110),
16 | ORIENT.LEFT,
17 | 100,
18 | 20,
19 | distance=Evidence.distance,
20 | traverse=Evidence.traverse,
21 | )
22 | assert i.start == 110
23 | assert i.end == 180
24 |
25 | i = _call_interval_by_flanking_coverage(
26 | Mock(start=20, end=80),
27 | ORIENT.LEFT,
28 | 230,
29 | 40,
30 | distance=Evidence.distance,
31 | traverse=Evidence.traverse,
32 | )
33 | assert i.start == 80
34 | assert i.end == 209
35 |
36 | def test_right(self):
37 | i = _call_interval_by_flanking_coverage(
38 | Mock(start=101, end=110),
39 | ORIENT.RIGHT,
40 | 100,
41 | 20,
42 | distance=Evidence.distance,
43 | traverse=Evidence.traverse,
44 | )
45 | assert i.end == 101
46 | assert i.start == 31
47 |
48 | i = _call_interval_by_flanking_coverage(
49 | Mock(start=150, end=200),
50 | ORIENT.RIGHT,
51 | 230,
52 | 40,
53 | distance=Evidence.distance,
54 | traverse=Evidence.traverse,
55 | )
56 | assert i.start == 11
57 | assert i.end == 150
58 |
59 |
60 | class TestDistanceAndTraverse:
61 | def test_distance(self):
62 | assert Evidence.distance(1, 11) == Interval(10)
63 |
64 | def test_traverse_right(self):
65 | assert Evidence.traverse(1, 10, ORIENT.RIGHT) == Interval(11)
66 |
67 | def test_traverse_left(self):
68 | assert Evidence.traverse(20, 10, ORIENT.LEFT) == Interval(10)
69 |
--------------------------------------------------------------------------------
/tests/mini-tutorial.config.json:
--------------------------------------------------------------------------------
1 | {
2 | "annotate.draw_fusions_only": false,
3 | "convert": {
4 | "mock_converted": {
5 | "inputs": [
6 | "tests/data/mock_sv_events.tsv"
7 | ],
8 | "file_type": "mavis",
9 | "assume_no_untemplated": true
10 | }
11 | },
12 | "cluster.uninformative_filter": true,
13 | "cluster.limit_to_chr": null,
14 | "cluster.min_clusters_per_file": 5,
15 | "libraries": {
16 | "mock-A47933": {
17 | "assign": [
18 | "tests/data/mock_trans_sv_events.tsv"
19 | ],
20 | "bam_file": "tests/data/mock_trans_reads_for_events.sorted.bam",
21 | "disease_status": "diseased",
22 | "median_fragment_size": 188,
23 | "protocol": "transcriptome",
24 | "read_length": 75,
25 | "stdev_fragment_size": 50,
26 | "strand_specific": true
27 | },
28 | "mock-A36971": {
29 | "assign": [
30 | "mock_converted"
31 | ],
32 | "bam_file": "tests/data/mock_reads_for_events.sorted.bam",
33 | "disease_status": "diseased",
34 | "median_fragment_size": 400,
35 | "protocol": "genome",
36 | "read_length": 150,
37 | "stdev_fragment_size": 97,
38 | "strand_specific": false
39 | }
40 | },
41 | "output_dir": "output_dir",
42 | "reference.aligner_reference": [
43 | "tests/data/mock_reference_genome.2bit"
44 | ],
45 | "reference.annotations": [
46 | "tests/data/mock_annotations.json"
47 | ],
48 | "reference.dgv_annotation": [
49 | "tests/data/mock_dgv_annotation.tab"
50 | ],
51 | "reference.masking": [
52 | "tests/data/mock_masking.tab"
53 | ],
54 | "reference.reference_genome": [
55 | "tests/data/mock_reference_genome.fa"
56 | ],
57 | "reference.template_metadata": [
58 | "tests/data/cytoBand.txt"
59 | ],
60 | "summary.filter_min_remapped_reads": 5,
61 | "summary.filter_min_spanning_reads": 5,
62 | "summary.filter_min_linking_split_reads": 1,
63 | "summary.filter_min_flanking_reads": 10
64 | }
65 |
--------------------------------------------------------------------------------
/tests/data/breakdancer_output.txt:
--------------------------------------------------------------------------------
1 | #Software: 1.4.5
2 | #Command: /gsc/software/linux-x86_64-centos6/breakdancer-1.4.5/bin/breakdancer-max -t /projects/trans_scratch/validations/workspace/creisle/MAV228/breakdancer.cfg
3 | #Library Statistics:
4 | #/projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam mean:441 std:116.54 uppercutoff:959.41 lowercutoff:22.39 readlen:149.65 library:A36971 reflen:3046874375 seqcov:69.8209 phycov:102.877 32:31637251
5 | #/projects/analysis/analysis24/A36973/HMTGGCCXX_5/A36973/150nt/hg19a/bwa-mem-0.7.6a/A36973_1_lane_dupsFlagged.bam mean:437.99 std:124.28 uppercutoff:955.49 lowercutoff:0 readlen:147.17 library:A36973 reflen:3046874375 seqcov:33.1399 phycov:49.3136 32:27980009
6 | #Chr1 Pos1 Orientation1 Chr2 Pos2 Orientation2 Type Size Score num_Reads num_Reads_lib A36971_2_lanes_dupsFlagged.bam A36973_1_lane_dupsFlagged.bam
7 | 1 200067631 23+27- 2 23697874 17+6- CTX -439 38 14 /projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|11:/projects/analysis/analysis24/A36973/HMTGGCCXX_5/A36973/150nt/hg19a/bwa-mem-0.7.6a/A36973_1_lane_dupsFlagged.bam|3
8 | 1 10001 83+126- 1 10546 83+126- ITX -352 99 43 /projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|23:/projects/analysis/analysis24/A36973/HMTGGCCXX_5/A36973/150nt/hg19a/bwa-mem-0.7.6a/A36973_1_lane_dupsFlagged.bam|20 NA NA
9 | 1 808410 11+11- 1 808574 11+11- ITX -338 99 9 /projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|6:/projects/analysis/analysis24/A36973/HMTGGCCXX_5/A36973/150nt/hg19a/bwa-mem-0.7.6a/A36973_1_lane_dupsFlagged.bam|3 NA NA
10 | 1 869445 89+21- 1 870225 5+93- DEL 892 99 67 /projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|40:/projects/analysis/analysis24/A36973/HMTGGCCXX_5/A36973/150nt/hg19a/bwa-mem-0.7.6a/A36973_1_lane_dupsFlagged.bam|27 0.06 0.08
11 | 1 54687282 6+9- 1 54687479 6+9- INS -421 99 3 /projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|3 NA NA
12 | 1 6508246 10+17- 1 17028869 57+50- INV 10520288 31 4 /projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|4 1.77 2.21
13 |
--------------------------------------------------------------------------------
/docs/package/mavis/annotate/index.md:
--------------------------------------------------------------------------------
1 | # Sub-package Documentation
2 |
3 | ## Types of Output Files
4 |
5 | | expected name/suffix | file type/format | content |
6 | | ------------------------------ | --------------------------------- | ---------------------------------------- |
7 | | ``annotations.tab`` | text/tabbed | annotated events |
8 | | ``annotations.fusion-cdna.fa`` | [fasta](../../../glossary/#fasta) | putative fusion unspliced cDNA sequences |
9 | | ``drawings/*.svg`` | [SVG](../../../glossary/#svg) | diagrams |
10 | | ``drawings/*.legend.json`` | [JSON](../../../glossary/#json) | diagram legend/metadata |
11 |
12 | ## Algorithm Overview
13 |
14 | see [annotating events](../../../background/theory/#annotating-events)
15 |
16 | - read in breakpoint pairs
17 | - generate strand-specific annotations (one annotation per strand, multiple if multiple genes/transcripts in the region)
18 | - try building fusion transcripts for bp-specific calls
19 | - generate [SVG](../../../glossary/#svg) diagrams
20 |
21 | ## Levels of Annotations
22 |
23 | 
24 |
25 | ## Overview of Class Relationships
26 |
27 | 
28 | The Annotation sub-package has objects for genetic annotations and related calculations. The basic
29 | layout of the package is shown above. IS-A relationships are given by the blue arrows, HAS-A
30 | relationships are shown in black, and reference_object/parent type relationships are shown in red.
31 |
32 | - `mavis.annotate.genomic.Gene`: a gene; start and end are genomic positions with respect to the template/chromosome.
33 | - `mavis.annotate.genomic.PreTranscript`: the unspliced transcript; start and end are genomic positions with respect to the template/chromosome.
34 | - `mavis.annotate.genomic.Transcript`: the spliced transcript; start and end coordinates run from 1 to the length of the spliced product in base pairs.
35 | - `mavis.annotate.protein.Translation`: the translation of the spliced transcript; start and end are cDNA positions with respect to the 5' end of the spliced transcript and describe the boundaries of the coding sequence.
36 |
37 | A conceptual sketch of this nesting is given below.
39 |
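The following is a conceptual sketch only and is not the `mavis.annotate` API; the dataclasses and field names are hypothetical and exist purely to illustrate the HAS-A nesting and the coordinate systems described above.

```python
from dataclasses import dataclass, field
from typing import List


@dataclass
class Translation:
    # cdna coordinates with respect to the 5' end of the spliced transcript
    cdna_coding_start: int
    cdna_coding_end: int


@dataclass
class Transcript:
    # coordinates run 1..length of the spliced product
    translations: List[Translation] = field(default_factory=list)


@dataclass
class PreTranscript:
    # genomic coordinates with respect to the template/chromosome
    start: int
    end: int
    spliced_transcripts: List[Transcript] = field(default_factory=list)


@dataclass
class Gene:
    # genomic coordinates with respect to the template/chromosome
    chromosome: str
    start: int
    end: int
    strand: str
    transcripts: List[PreTranscript] = field(default_factory=list)
```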
--------------------------------------------------------------------------------
/tests/test_tools/test_convert_annotations_format.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 |
4 | import pytest
5 |
6 | from tools.convert_annotations_format import (
7 | convert_gff2_to_mavis,
8 | convert_gff3_to_mavis,
9 | convert_mavis_json_2to3,
10 | convert_tab_to_json,
11 | )
12 |
13 | CONVERTERS = {
14 | 'gff3': convert_gff3_to_mavis,
15 | 'gtf': convert_gff2_to_mavis,
16 | 'v2-json': convert_mavis_json_2to3,
17 | 'v2-tab': convert_tab_to_json,
18 | }
19 |
20 |
21 | def sort_elements(data):
22 | """
23 | Sort lists of exons, domains, genes, etc by position and name to facilitate comparison
24 | """
25 | if not isinstance(data, dict):
26 | if isinstance(data, list):
27 | items = [sort_elements(e) for e in data]
28 |
29 | if all(isinstance(elem, dict) for elem in data):
30 | return sorted(
31 | items, key=lambda elem: (elem.get('start'), elem.get('end'), elem.get('name'))
32 | )
33 | return items
34 | else:
35 | return data
36 |
37 | for key, value in data.items():
38 | data[key] = sort_elements(value)
39 | return data
40 |
41 |
42 | @pytest.mark.parametrize(
43 | 'filename,expected_file,input_type',
44 | [
45 | ['K02718.1.gff3', 'K02718.1.gff3.json', 'gff3'],
46 | ['K02718.1.gtf', 'K02718.1.gtf.json', 'gtf'],
47 | ['Homo_sapiens.GRCh38.kras.gff3', 'Homo_sapiens.GRCh38.kras.gff3.json', 'gff3'],
48 | ['Homo_sapiens.GRCh38.kras.gtf', 'Homo_sapiens.GRCh38.kras.gtf.json', 'gtf'],
49 | ['example_genes.v2.json', 'example_genes.v3.json', 'v2-json'],
50 | [
51 | 'ensembl69_hg19_annotations.kras.tab',
52 | 'ensembl69_hg19_annotations.kras.tab.json',
53 | 'v2-tab',
54 | ],
55 | ['viral.gtf', 'viral.gtf.json', 'gtf'],
56 | ],
57 | )
58 | def test_gff_examples(filename, expected_file, input_type):
59 | data_dir = os.path.join(os.path.dirname(__file__), 'data')
60 | input_file = os.path.join(data_dir, filename)
61 | with open(os.path.join(data_dir, expected_file), 'r') as fh:
62 | expected = json.load(fh)
63 |
64 | # order doesn't matter
65 | data = sort_elements(CONVERTERS[input_type](input_file))
66 | expected = sort_elements(expected)
67 |
68 | assert len(data['genes']) == len(expected['genes'])
69 | assert data == expected
70 |
--------------------------------------------------------------------------------
/docs/configuration/general.md:
--------------------------------------------------------------------------------
1 | # Getting Started
2 |
3 | An exhaustive list of the various configurable settings can be found [here](../settings). Alternatively you can view them through the [online schema explorer](https://json-schema.app/view?url=https://raw.githubusercontent.com/bcgsc/mavis_config/master/src/mavis_config/config.json)
4 |
5 | ## Pipeline Configuration File
6 |
7 | The pipeline can be run in steps or it can be configured using a JSON
8 | configuration file and setup in a single step. Scripts will be generated
9 | to run all steps following clustering.
10 |
11 | The config schema is found in the mavis package under `src/mavis/schemas/config.json`
12 |
13 | Top level settings follow the pattern `<section>.<setting>` (for example `cluster.limit_to_chr`). The convert and library
14 | sections are nested objects.
15 |
16 | ## Adjusting the Resource Requirements
17 |
18 | ### Choosing the Number of Validation/Annotation Jobs
19 |
20 | MAVIS chooses the number of jobs to split validate/annotate stages into
21 | based on two settings: [cluster.max_files](../../configuration/settings/#clustermax_files) and
22 | [cluster.min_clusters_per_file](../../configuration/settings/#clustermin-clusters-per-file).
23 |
24 | For example, in the following situation say you have: 1000 clusters,
25 | `cluster.max_files=10`, and `cluster.min_clusters_per_file=10`. Then MAVIS will set up
26 | 10 validation jobs each with 100 events.
27 |
28 | However, if `cluster.min_clusters_per_file=500`, then MAVIS would only set up 2
29 | jobs each with 500 events. This is because
30 | [cluster.min_clusters_per_file](../../configuration/settings/#clustermin-clusters-per-file) takes precedence
31 | over [cluster.max_files](../../configuration/settings/#clustermax_files).
32 |
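The worked example above reduces to a small calculation. This is a simplified sketch of the assumed splitting rule, not the exact MAVIS logic:

```python
import math


def number_of_jobs(total_clusters, max_files, min_clusters_per_file):
    # min_clusters_per_file takes precedence over max_files
    return min(max_files, math.ceil(total_clusters / min_clusters_per_file))


number_of_jobs(1000, max_files=10, min_clusters_per_file=10)   # -> 10 jobs of 100 events
number_of_jobs(1000, max_files=10, min_clusters_per_file=500)  # -> 2 jobs of 500 events
```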
33 | Splitting into more jobs will lower the resource requirements per job
34 | (see [resource requirements](../performance/)). The memory and time requirements for validation are linear
35 | with respect to the number of events to be validated.
36 |
37 | ### Uninformative Filter
38 |
39 | For example, if the user is only interested in events in genes, then the
40 | [cluster.uninformative_filter](../../configuration/settings/#clusteruninformative_filter) can be used. This
41 | will drop all events that are not within a certain distance
42 | ([cluster.max_proximity](../../configuration/settings/#clustermax_proximity)) to any annotation in
43 | the annotations reference file. These events will be dropped prior to
44 | the validation stage which results in significant speed up.
45 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = mavis
3 | version = 3.1.2
4 | url = https://github.com/bcgsc/mavis.git
5 | download_url = https://github.com/bcgsc/mavis/archive/v2.2.10.tar.gz
6 | description = A Structural Variant Post-Processing Package
7 | author_email = creisle@bcgsc.ca
8 | author = Caralyn Reisle
9 | maintainer_email = mavis@bcgsc.ca
10 | maintainer = mavis
11 | long_description = file: README.md
12 | long_description_content_type = text/markdown
13 | license_file = LICENSE
14 | project_urls = mavis = http://mavis.bcgsc.ca
15 |
16 | [bdist_wheel]
17 | universal = 1
18 |
19 | [pycodestyle]
20 | ignore = E501
21 | W503
22 | E203
23 | statistics = True
24 |
25 | [flake8]
26 | ignore = E501,W503,E203
27 |
28 | [isort]
29 | profile = black
30 |
31 | [options]
32 | packages = find:
33 | package_dir =
34 | = src
35 | python_requires = >=3.7
36 | dependency_links = []
37 | include_package_data = True
38 | install_requires =
39 | biopython>=1.70, <1.78
40 | braceexpand==0.1.2
41 | colour
42 | Distance>=0.1.3
43 | mavis_config>=1.2.2, <2.0.0
44 | networkx>=2.5,<3
45 | numpy>=1.13.1
46 | pandas>=1.1, <2
47 | pysam
48 | Shapely>=1.6.4.post1
49 | shortuuid>=0.5.0
50 | svgwrite
51 | typing_extensions>=4
52 | setup_requires =
53 | pip>=9.0.0
54 | setuptools>=36.0.0
55 |
56 | [options.packages.find]
57 | exclude = tests
58 | where = src
59 |
60 | [options.extras_require]
61 | doc =
62 | mkdocs>=1.1.2
63 | markdown-refdocs
64 | mkdocs-material>=5.4.0
65 | markdown-include
66 | mkdocs-simple-hooks>=0.1.2
67 | mkdocs-awesome-pages-plugin
68 | test =
69 | timeout-decorator>=0.3.3
70 | coverage>=4.2
71 | pycodestyle>=2.3.1
72 | pytest
73 | pytest-cov
74 | dev =
75 | black
76 | flake8
77 | isort
78 | twine
79 | wheel
80 | timeout-decorator>=0.3.3
81 | coverage>=4.2
82 | pycodestyle>=2.3.1
83 | pytest
84 | pytest-cov
85 | pytest-xdist
86 | mkdocs>=1.1.2,<2
87 | markdown-refdocs
88 | mkdocs-material>=5.4.0
89 | markdown-include
90 | mkdocs-simple-hooks>=0.1.2
91 | types-setuptools>=57.4.7, <58
92 | deploy =
93 | twine
94 | wheel
95 | tools =
96 | pyensembl
97 | simplejson
98 | requests
99 |
100 | [options.entry_points]
101 | console_scripts =
102 | mavis = mavis.main:main
103 | calculate_ref_alt_counts = tools.calculate_ref_alt_counts:main
104 |
--------------------------------------------------------------------------------
/tests/data/bwa_pipeline_config.cfg:
--------------------------------------------------------------------------------
1 | [reference]
2 | template_metadata = tests/data/cytoBand.txt
3 | annotations = tests/data/mock_annotations.json
4 | masking = tests/data/mock_masking.tab
5 | reference_genome = tests/data/mock_reference_genome.fa
6 | aligner_reference = tests/data/mock_reference_genome.fa
7 | dgv_annotation = tests/data/mock_dgv_annotation.txt
8 |
9 | [annotate]
10 | draw_fusions_only = False
11 |
12 | [validate]
13 | # evidence related settings
14 | aligner = bwa mem
15 | assembly_max_paths = 4
16 | assembly_min_exact_match_to_remap = 4
17 | assembly_min_edge_trim_weight = 4
18 | assembly_min_remap_coverage = 0
19 | assembly_min_remapped_seq = 3
20 | assembly_strand_concordance = 0.51
21 | blat_min_identity = 0.9
22 | call_error = 10
23 | contig_aln_max_event_size = 50
24 | contig_aln_merge_inner_anchor = 20
25 | contig_aln_merge_outer_anchor = 15
26 | contig_aln_min_anchor_size = 50
27 | contig_aln_min_query_consumption = 0.7
28 | fetch_reads_bins = 5
29 | fetch_reads_limit = 10000
30 | fetch_min_bin_size = 50
31 | filter_secondary_alignments = True
32 | fuzzy_mismatch_number = 1
33 | max_sc_preceeding_anchor = 6
34 | min_anchor_exact = 6
35 | min_anchor_fuzzy = 10
36 | min_anchor_match = 0.9
37 | min_double_aligned_to_estimate_insertion_size = 2
38 | min_flanking_pairs_resolution = 3
39 | min_linking_split_reads = 1
40 | min_mapping_quality = 5
41 | min_non_target_aligned_split_reads = 1
42 | min_sample_size_to_apply_percentage = 10
43 | min_softclipping = 6
44 | min_spanning_reads_resolution = 3
45 | min_splits_reads_resolution = 3
46 | stdev_count_abnormal = 3.0
47 | strand_determining_read = 2
48 | outer_window_min_event_size = 125
49 |
50 | [cluster]
51 | uninformative_filter = True
52 | limit_to_chr = None
53 |
54 | [mock-A36971]
55 | read_length = 150
56 | median_fragment_size = 400
57 | stdev_fragment_size = 97
58 | bam_file = tests/data/mock_reads_for_events.sorted.bam
59 | protocol = genome
60 | inputs = tests/data/mock_sv_events.tsv
61 | strand_specific = False
62 | disease_status=diseased
63 |
64 | [mock-A47933]
65 | read_length = 75
66 | median_fragment_size = 188
67 | stdev_fragment_size = 50
68 | bam_file = tests/data/mock_trans_reads_for_events.sorted.bam
69 | protocol = transcriptome
70 | inputs = tests/data/mock_trans_sv_events.tsv
71 | strand_specific = True
72 | disease_status=diseased
73 |
74 | [summary]
75 | filter_min_remapped_reads = 5
76 | filter_min_spanning_reads = 5
77 | filter_min_flanking_reads = 10
78 | filter_min_split_reads = 5
79 | filter_min_linking_split_reads = 1
80 | filter_cdna_synon = False
81 | filter_protein_synon = False
82 |
--------------------------------------------------------------------------------
/tests/data/missing_reference.cfg:
--------------------------------------------------------------------------------
1 | [reference]
2 | annotations = tests/data/mock_annotations.json
3 | aligner_reference = tests/data/mock_reference_genome.2bit
4 |
5 | [annotate]
6 | draw_fusions_only = False
7 |
8 | [validate]
9 | # evidence related settings
10 | aligner = blat
11 | assembly_max_paths = 4
12 | assembly_min_exact_match_to_remap = 4
13 | assembly_min_edge_trim_weight = 4
14 | assembly_min_remap_coverage = 0
15 | assembly_min_remapped_seq = 3
16 | assembly_strand_concordance = 0.51
17 | blat_min_identity = 0.9
18 | call_error = 10
19 | contig_aln_max_event_size = 50
20 | contig_aln_merge_inner_anchor = 20
21 | contig_aln_merge_outer_anchor = 15
22 | contig_aln_min_anchor_size = 50
23 | contig_aln_min_query_consumption = 0.7
24 | fetch_reads_bins = 5
25 | fetch_reads_limit = 10000
26 | fetch_min_bin_size = 50
27 | filter_secondary_alignments = True
28 | fuzzy_mismatch_number = 1
29 | max_sc_preceeding_anchor = 6
30 | min_anchor_exact = 6
31 | min_anchor_fuzzy = 10
32 | min_anchor_match = 0.9
33 | min_double_aligned_to_estimate_insertion_size = 2
34 | min_flanking_pairs_resolution = 3
35 | min_linking_split_reads = 1
36 | min_mapping_quality = 5
37 | min_non_target_aligned_split_reads = 1
38 | min_sample_size_to_apply_percentage = 10
39 | min_softclipping = 6
40 | min_spanning_reads_resolution = 3
41 | min_splits_reads_resolution = 3
42 | stdev_count_abnormal = 3.0
43 | strand_determining_read = 2
44 | outer_window_min_event_size = 125
45 |
46 | [cluster]
47 | uninformative_filter = True
48 | limit_to_chr = None
49 |
50 | [mock-A36971]
51 | read_length = 150
52 | median_fragment_size = 400
53 | stdev_fragment_size = 97
54 | bam_file = tests/data/mock_reads_for_events.sorted.bam
55 | protocol = genome
56 | inputs = mock_converted
57 | strand_specific = False
58 | disease_status=diseased
59 |
60 | [mock-A47933]
61 | read_length = 75
62 | median_fragment_size = 188
63 | stdev_fragment_size = 50
64 | bam_file = tests/data/mock_trans_reads_for_events.sorted.bam
65 | protocol = transcriptome
66 | inputs = tests/data/mock_trans_sv_events.tsv
67 | strand_specific = True
68 | disease_status=diseased
69 |
70 | [summary]
71 | filter_min_remapped_reads = 5
72 | filter_min_spanning_reads = 5
73 | filter_min_flanking_reads = 10
74 | filter_min_split_reads = 5
75 | filter_min_linking_split_reads = 1
76 | filter_cdna_synon = True
77 | filter_protein_synon = True
78 |
79 | [convert]
80 | assume_no_untemplated = True
81 | # addfile twice to check this notation is ok (will collapse them anyway)
82 | mock_converted = convert_tool_output
83 | tests/data/mock_sv_events.tsv
84 | tests/data/mock_sv_events.tsv
85 | mavis
86 | False
87 |
88 |
--------------------------------------------------------------------------------
/tests/data/clean_pipeline_config.cfg:
--------------------------------------------------------------------------------
1 | [reference]
2 | template_metadata = tests/data/cytoBand.txt
3 | annotations = tests/data/mock_annotations.json
4 | masking = tests/data/mock_masking.tab
5 | reference_genome = tests/data/mock_reference_genome.fa
6 | aligner_reference = tests/data/mock_reference_genome.2bit
7 | dgv_annotation = tests/data/mock_dgv_annotation.txt
8 |
9 | [annotate]
10 | draw_fusions_only = False
11 |
12 | [validate]
13 | # evidence related settings
14 | aligner = blat
15 | assembly_max_paths = 4
16 | assembly_min_exact_match_to_remap = 4
17 | assembly_min_edge_trim_weight = 4
18 | assembly_min_remap_coverage = 0
19 | assembly_min_remapped_seq = 3
20 | assembly_strand_concordance = 0.51
21 | blat_min_identity = 0.9
22 | call_error = 10
23 | contig_aln_max_event_size = 50
24 | contig_aln_merge_inner_anchor = 20
25 | contig_aln_merge_outer_anchor = 15
26 | contig_aln_min_anchor_size = 50
27 | contig_aln_min_query_consumption = 0.7
28 | fetch_reads_bins = 5
29 | fetch_reads_limit = 10000
30 | fetch_min_bin_size = 50
31 | filter_secondary_alignments = True
32 | fuzzy_mismatch_number = 1
33 | max_sc_preceeding_anchor = 6
34 | min_anchor_exact = 6
35 | min_anchor_fuzzy = 10
36 | min_anchor_match = 0.9
37 | min_double_aligned_to_estimate_insertion_size = 2
38 | min_flanking_pairs_resolution = 3
39 | min_linking_split_reads = 1
40 | min_mapping_quality = 5
41 | min_non_target_aligned_split_reads = 1
42 | min_sample_size_to_apply_percentage = 10
43 | min_softclipping = 6
44 | min_spanning_reads_resolution = 3
45 | min_splits_reads_resolution = 3
46 | stdev_count_abnormal = 3.0
47 | strand_determining_read = 2
48 | outer_window_min_event_size = 125
49 | write_evidence_files = False
50 | clean_aligner_files = True
51 |
52 | [cluster]
53 | uninformative_filter = True
54 | limit_to_chr = None
55 |
56 | [mock-A36971]
57 | read_length = 150
58 | median_fragment_size = 400
59 | stdev_fragment_size = 97
60 | bam_file = tests/data/mock_reads_for_events.sorted.bam
61 | protocol = genome
62 | inputs = tests/data/mock_sv_events.tsv
63 | strand_specific = False
64 | disease_status=diseased
65 |
66 | [mock-A47933]
67 | read_length = 75
68 | median_fragment_size = 188
69 | stdev_fragment_size = 50
70 | bam_file = tests/data/mock_trans_reads_for_events.sorted.bam
71 | protocol = transcriptome
72 | inputs = tests/data/mock_trans_sv_events.tsv
73 | strand_specific = True
74 | disease_status=diseased
75 |
76 | [summary]
77 | filter_min_remapped_reads = 5
78 | filter_min_spanning_reads = 5
79 | filter_min_flanking_reads = 10
80 | filter_min_split_reads = 5
81 | filter_min_linking_split_reads = 1
82 | filter_cdna_synon = True
83 | filter_protein_synon = True
84 |
--------------------------------------------------------------------------------
/tests/data/no_opt_pipeline.cfg:
--------------------------------------------------------------------------------
1 | [reference]
2 | annotations = tests/data/mock_annotations.json
3 | reference_genome = tests/data/mock_reference_genome.fa
4 | aligner_reference = tests/data/mock_reference_genome.2bit
5 |
6 | [annotate]
7 | draw_fusions_only = False
8 |
9 | [validate]
10 | # evidence related settings
11 | aligner = blat
12 | assembly_max_paths = 4
13 | assembly_min_exact_match_to_remap = 4
14 | assembly_min_edge_trim_weight = 4
15 | assembly_min_remap_coverage = 0
16 | assembly_min_remapped_seq = 3
17 | assembly_strand_concordance = 0.51
18 | blat_min_identity = 0.9
19 | call_error = 10
20 | contig_aln_max_event_size = 50
21 | contig_aln_merge_inner_anchor = 20
22 | contig_aln_merge_outer_anchor = 15
23 | contig_aln_min_anchor_size = 50
24 | contig_aln_min_query_consumption = 0.7
25 | fetch_reads_bins = 5
26 | fetch_reads_limit = 10000
27 | fetch_min_bin_size = 50
28 | filter_secondary_alignments = True
29 | fuzzy_mismatch_number = 1
30 | max_sc_preceeding_anchor = 6
31 | min_anchor_exact = 6
32 | min_anchor_fuzzy = 10
33 | min_anchor_match = 0.9
34 | min_double_aligned_to_estimate_insertion_size = 2
35 | min_flanking_pairs_resolution = 3
36 | min_linking_split_reads = 1
37 | min_mapping_quality = 5
38 | min_non_target_aligned_split_reads = 1
39 | min_sample_size_to_apply_percentage = 10
40 | min_softclipping = 6
41 | min_spanning_reads_resolution = 3
42 | min_splits_reads_resolution = 3
43 | stdev_count_abnormal = 3.0
44 | strand_determining_read = 2
45 | outer_window_min_event_size = 125
46 |
47 | [cluster]
48 | uninformative_filter = True
49 | limit_to_chr = None
50 |
51 | [mock-A36971]
52 | read_length = 150
53 | median_fragment_size = 400
54 | stdev_fragment_size = 97
55 | bam_file = tests/data/mock_reads_for_events.sorted.bam
56 | protocol = genome
57 | inputs = mock_converted
58 | strand_specific = False
59 | disease_status=diseased
60 |
61 | [mock-A47933]
62 | read_length = 75
63 | median_fragment_size = 188
64 | stdev_fragment_size = 50
65 | bam_file = tests/data/mock_trans_reads_for_events.sorted.bam
66 | protocol = transcriptome
67 | inputs = tests/data/mock_trans_sv_events.tsv
68 | strand_specific = True
69 | disease_status=diseased
70 |
71 | [summary]
72 | filter_min_remapped_reads = 5
73 | filter_min_spanning_reads = 5
74 | filter_min_flanking_reads = 10
75 | filter_min_split_reads = 5
76 | filter_min_linking_split_reads = 1
77 | filter_cdna_synon = True
78 | filter_protein_synon = True
79 |
80 | [convert]
81 | assume_no_untemplated = True
82 | # addfile twice to check this notation is ok (will collapse them anyway)
83 | mock_converted = convert_tool_output
84 | tests/data/mock_sv_events.tsv
85 | tests/data/mock_sv_events.tsv
86 | mavis
87 | False
88 |
89 |
--------------------------------------------------------------------------------
/tests/data/Library-clusterset-N.validated.tsv:
--------------------------------------------------------------------------------
1 | #cluster_id break1_chromosome break1_position_start break1_position_end break1_orientation break1_strand break2_chromosome break2_position_start break2_position_end break2_orientation break2_strand event_type opposing_strands stranded protocol tools contigs_assembled contigs_aligned contig_sequence contig_remap_score contig_alignment_score call_method flanking_reads median_insert_size stdev_insert_size break1_split_reads break2_split_reads linking_split_reads untemplated_sequence
2 | 11241 1 238693407 238693407 L ? 1 238701194 238701194 R ? deletion False False genome DELLY_v0.6.1 1 1 GAGACTGGAAGTGGGTAGTTGCTTCATGCAGCTGGTTGTCCCAATGCCTGTCAGAGTCTGCCTTAGTCCCGGGGTTTTTATGGGCTCAGAAGGGAGAAAGTGTATGCTGAAAGCATTGAAATGCTAATTAGGAAGCATTTTTTTTTTCCTTCAAAGTAACTTTAAATAACTTTTCGGGAAAGTAAACACAATTA 27 0 contig 58 8200.0 7.7781745930520225 26 0 0
3 | 11388 1 79401525 79401525 L ? 1 79401848 79401848 R ? deletion False False genome DELLY_v0.6.1 1 1 AGACAGTAACAAAAGTTGGAGGTAAGACAAGGACCCAGATATTGTCAGCCAAAATCCTCCCCAGGTATTTATAACAGAATGGAAATCTCAAGTAAGAATATGGATATTCTGTATACTGTACATACATCAAATGTTTTTATAGGAAACCACATGTTACATGTACATATGACATAATCAAATGCATGATAAGTATTTATTGCAAATTCAT 61 0 contig 225 731 7.0710678118654755 93 0 0
4 | 11425 1 143164727 143164727 R ? 1 143165037 143165037 R ? inversion True False genome DELLY_v0.6.1 1 0 ? ? ? split reads 14 266.0 5.744562646538029 3 5 2 ?
5 | 10094 11 79346483 79346483 R ? 9 115343095 115343095 L ? translocation False False genome DELLY_v0.6.1 1 1 AAACTGCTCCATATTTATTTCATTATTATTATCATTTTCATCATCCTAACGATTATTCAGTATATACCAAGTGTCTCTGATGAAACATGCAGGAGATGAAAAATCCTTGGGTGGGCTTGTTTCTTTCTTTGTGTTTTTTTTTTTGAGATGGAGTCTCGCTCTGGAGCCCAGGCTGG 19 0 contig 32 0.0 0.0 20 10 9
6 | 10094 11 79346459 79346459 L ? 9 115343096 115343096 R ? translocation False False genome DELLY_v0.6.1 1 1 ATAATATTGTCTCATTCCCATTTTAAACTACCTGTTCCTTAAATTGCATATAAAAATACAGTCCATGCAATATTAATACACTAATGAATAATACACTAACAATTTATTTTCTTAGCCATTTCTTAACCTTTTCCTGTAGTTTCCTGAAGGAAGAGCTGAGTTATAATTTTTGAAAAATAAGAGAGACAAAGTAAAAATTCAG 31 0 contig 65 0 0.0 0 21 0
7 | 11963 11 79346459 79346459 L ? 9 115343096 115343096 R ? translocation False False genome DELLY_v0.6.1 1 1 ATAATATTGTCTCATTCCCATTTTAAACTACCTGTTCCTTAAATTGCATATAAAAATACAGTCCATGCAATATTAATACACTAATGAATAATACACTAACAATTTATTTTCTTAGCCATTTCTTAACCTTTTCCTGTAGTTTCCTGAAGGAAGAGCTGAGTTATAATTTTTGAAAAATAAGAGAGACAAAGTAAAAATTCAG 31 0 contig 65 0 0.0 0 21 0
8 | 11963 11 79346483 79346483 R ? 9 115343095 115343095 L ? translocation False False genome DELLY_v0.6.1 1 1 AAACTGCTCCATATTTATTTCATTATTATTATCATTTTCATCATCCTAACGATTATTCAGTATATACCAAGTGTCTCTGATGAAACATGCAGGAGATGAAAAATCCTTGGGTGGGCTTGTTTCTTTCTTTGTGTTTTTTTTTTTGAGATGGAGTCTCGCTCTGGAGCCCAGGCTGG 19 0 contig 32 0.0 0.0 20 10 9
9 | 11974 11 56271180 56271593 L ? 9 132187570 132187570 R ? translocation False False genome DELLY_v0.6.1 0 0 ? ? ? split and flanking 7 0 0.0 1 3 0 ?
10 |
--------------------------------------------------------------------------------
/tests/data/pipeline_config.cfg:
--------------------------------------------------------------------------------
1 | [reference]
2 | template_metadata = tests/data/cytoBand.txt
3 | annotations = tests/data/mock_annotations.json
4 | masking = tests/data/mock_masking.tab
5 | reference_genome = tests/data/mock_reference_genome.fa
6 | aligner_reference = tests/data/mock_reference_genome.2bit
7 | dgv_annotation = tests/data/mock_dgv_annotation.txt
8 |
9 | [annotate]
10 | draw_fusions_only = False
11 |
12 | [schedule]
13 |
14 | [validate]
15 | # evidence related settings
16 | aligner = blat
17 | assembly_max_paths = 4
18 | assembly_min_exact_match_to_remap = 4
19 | assembly_min_edge_trim_weight = 4
20 | assembly_min_remap_coverage = 0
21 | assembly_min_remapped_seq = 3
22 | assembly_strand_concordance = 0.51
23 | blat_min_identity = 0.9
24 | call_error = 10
25 | contig_aln_max_event_size = 50
26 | contig_aln_merge_inner_anchor = 20
27 | contig_aln_merge_outer_anchor = 15
28 | contig_aln_min_anchor_size = 50
29 | contig_aln_min_query_consumption = 0.7
30 | fetch_reads_bins = 5
31 | fetch_reads_limit = 10000
32 | fetch_min_bin_size = 50
33 | filter_secondary_alignments = True
34 | fuzzy_mismatch_number = 1
35 | max_sc_preceeding_anchor = 6
36 | min_anchor_exact = 6
37 | min_anchor_fuzzy = 10
38 | min_anchor_match = 0.9
39 | min_double_aligned_to_estimate_insertion_size = 2
40 | min_flanking_pairs_resolution = 3
41 | min_linking_split_reads = 1
42 | min_mapping_quality = 5
43 | min_non_target_aligned_split_reads = 1
44 | min_sample_size_to_apply_percentage = 10
45 | min_softclipping = 6
46 | min_spanning_reads_resolution = 3
47 | min_splits_reads_resolution = 3
48 | stdev_count_abnormal = 3.0
49 | strand_determining_read = 2
50 | outer_window_min_event_size = 125
51 |
52 | [cluster]
53 | uninformative_filter = True
54 | # all chromosomes
55 | limit_to_chr = None
56 | min_clusters_per_file = 2
57 |
58 | [mock-A36971]
59 | read_length = 150
60 | median_fragment_size = 400
61 | stdev_fragment_size = 97
62 | bam_file = tests/data/mock_reads_for_events.sorted.bam
63 | protocol = genome
64 | inputs = mock_converted
65 | strand_specific = False
66 | disease_status=diseased
67 |
68 | [mock-A47933]
69 | read_length = 75
70 | median_fragment_size = 188
71 | stdev_fragment_size = 50
72 | bam_file = tests/data/mock_trans_reads_for_events.sorted.bam
73 | protocol = transcriptome
74 | inputs = tests/data/mock_trans_sv_events.tsv
75 | strand_specific = True
76 | disease_status=diseased
77 |
78 | [summary]
79 | filter_min_remapped_reads = 5
80 | filter_min_spanning_reads = 5
81 | filter_min_flanking_reads = 10
82 | filter_min_split_reads = 5
83 | filter_min_linking_split_reads = 1
84 | filter_cdna_synon = True
85 | filter_protein_synon = True
86 |
87 | [convert]
88 | assume_no_untemplated = True
89 | # addfile twice to check this notation is ok (will collapse them anyway)
90 | mock_converted = convert_tool_output
91 | tests/data/mock_sv_events.tsv
92 | tests/data/mock_sv_events.tsv
93 | mavis
94 | False
95 |
96 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
6 |
7 | [codecov](https://codecov.io/gh/bcgsc/mavis)
8 |
9 | ## About
10 |
11 | [MAVIS](http://mavis.bcgsc.ca) is a python command-line tool for the post-processing of structural variant calls.
12 | The general [MAVIS](http://mavis.bcgsc.ca) pipeline consists of six main stages:
13 |
14 | - convert
15 | - [cluster](https://mavis.readthedocs.io/en/latest/package/mavis/cluster)
16 | - [validate](https://mavis.readthedocs.io/en/latest/package/mavis/validate)
17 | - [annotate](https://mavis.readthedocs.io/en/latest/package/mavis/annotate)
18 | - [pairing](https://mavis.readthedocs.io/en/latest/package/mavis/pairing)
19 | - [summary](https://mavis.readthedocs.io/en/latest/package/mavis/summary)
20 |
21 | ## Getting Help
22 |
23 | All steps in the MAVIS pipeline are run through the main mavis entry point. The usage menu can be viewed
24 | by running mavis without any arguments, or by giving the -h/--help option
25 |
26 | ``` bash
27 | mavis -h
28 | ```
29 |
30 | Help sub-menus can be found by giving the pipeline step followed by no arguments or the -h option
31 |
32 | ``` bash
33 | mavis cluster -h
34 | ```
35 |
36 | Common problems and questions are addressed on the [wiki](https://github.com/bcgsc/mavis/wiki/Help-and-Frequently-Asked-Questions).
37 | If you have a question or issue that is not answered there (and is not already a github issue), please submit
38 | an issue to our [github page](https://github.com/bcgsc/mavis/issues) or contact us by email at [mavis@bcgsc.ca](mailto:mavis@bcgsc.ca)
39 |
40 | ## Getting Started
41 |
42 | The simplest way to use MAVIS is via Singularity. The MAVIS docker container used
43 | by Singularity will take care of installing the aligner as well.
44 |
45 | ```bash
46 | pip install -U setuptools pip wheel
47 | pip install mavis_config # also installs snakemake
48 | ```
49 |
50 | Now you will run mavis via Snakemake as follows
51 |
52 | ```bash
53 | snakemake \
54 | -j <MAX JOBS> \
55 | --configfile <YOUR CONFIG> \
56 | --use-singularity \
57 | -s Snakefile
58 | ```
59 |
60 | For other installation options which do not use docker/singularity, see the comprehensive install
61 | instructions in the [user manual](https://mavis.readthedocs.io/en/latest/install)
62 |
63 | ## Citation
64 |
65 | If you use MAVIS as a part of your project, please cite
66 |
67 | [Reisle,C. et al. (2018) MAVIS: Merging, Annotation, Validation, and Illustration of Structural variants. Bioinformatics.](https://doi.org/10.1093/bioinformatics/bty621)
68 |
--------------------------------------------------------------------------------
/docs/background/citations.md:
--------------------------------------------------------------------------------
1 | # Literature
2 |
3 | ## Abyzov-2011
4 |
5 | Abyzov,A. et al. (2011) CNVnator: an approach to discover, genotype,
6 | and characterize typical and atypical CNVs from family and
7 | population genome sequencing. Genome Res., 21, 974--984.
8 |
9 | ## Abyzov-2015
10 |
11 | Abyzov,A. et al. (2015) Analysis of deletion breakpoints from 1,092
12 | humans reveals details of mutation mechanisms. Nat. Commun.,
13 | 6, 7256.
14 |
15 | ## Chen-2009
16 |
17 | Chen,K. et al. (2009) BreakDancer: an algorithm for high-resolution
18 | mapping of genomic structural variation. Nat. Methods, 6, 677--681.
19 |
20 | ## Chen-2016
21 |
22 | Chen,X. et al. (2016) Manta: rapid detection of structural variants
23 | and indels for germline and cancer sequencing applications.
24 | Bioinformatics, 32, 1220--1222.
25 |
26 | ## Chiu-2021
27 |
28 | Chiu,R. et al. (2021) Straglr: discovering and genotyping tandem repeat
29 | expansions using whole genome long-read sequences. Genome Biol., 22, 224.
30 |
31 | ## Haas-2017
32 |
33 | Haas,B. et al. (2017) STAR-Fusion: Fast and Accurate Fusion
34 | Transcript Detection from RNA-Seq. doi:
35 | https://doi.org/10.1101/120295
36 |
37 | ## Iyer-2011
38 |
39 | Iyer,M.K. et al. (2011) ChimeraScan: a tool for identifying chimeric
40 | transcription in sequencing data. Bioinformatics, 27, 2903--2904.
41 |
42 | ## MacDonald-2014
43 |
44 | MacDonald,J.R. et al. (2014) The Database of Genomic Variants: a
45 | curated collection of structural variation in the human genome.
46 | Nucleic Acids Res., 42, D986--92.
47 |
48 | ## McPherson-2011
49 |
50 | McPherson,A. et al. (2011) deFuse: an algorithm for gene fusion
51 | discovery in tumor RNA-Seq data. PLoS Comput. Biol., 7, e1001138.
52 |
53 | ## Rausch-2012
54 |
55 | Rausch,T. et al. (2012) DELLY: structural variant discovery by
56 | integrated paired-end and split-read analysis. Bioinformatics, 28,
57 | i333--i339.
58 |
59 | ## Robertson-2010
60 |
61 | Robertson,G. et al. (2010) De novo assembly and analysis of RNA-seq
62 | data. Nat. Methods, 7, 909--912.
63 |
64 | ## Saunders-2012
65 |
66 | Saunders,C.T. et al. (2012) Strelka: accurate somatic small-variant
67 | calling from sequenced tumor--normal sample pairs. Bioinformatics,
68 | 28, 1811--1817.
69 |
70 | ## Uhrig-2021
71 |
72 | Uhrig,S. et al. (2021) Accurate and efficient detection of gene
73 | fusions from RNA sequencing data. Genome Res., 31, 448--460.
74 |
75 | ## Yates-2016
76 |
77 | Yates,A. et al. (2016) Ensembl 2016. Nucleic Acids Res., 44,
78 | D710--D716.
79 |
80 | ## Ye-2009
81 |
82 | Ye,K. et al. (2009) Pindel: a pattern growth approach to detect
83 | break points of large deletions and medium sized insertions from
84 | paired-end short reads. Bioinformatics, 25, 2865--2871.
85 |
86 | ## den-Dunnen-2016
87 |
88 | den Dunnen,J.T. et al. (2016) HGVS Recommendations for the
89 | Description of Sequence Variants: 2016 Update. Hum. Mutat., 37,
90 | 564--569.
91 |
--------------------------------------------------------------------------------
/tests/test_mavis/test_blat.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from mavis.constants import CIGAR, reverse_complement
4 | from mavis.validate.blat import Blat
5 |
6 | from .mock import Mock, MockFunction, MockLongString
7 |
8 |
9 | class TestConvertPslxToPysam:
10 | def test_simple(self):
11 | row = {
12 | 'match': 142,
13 | 'mismatch': 0,
14 | 'repmatch': 0,
15 | 'ncount': 0,
16 | 'qgap_count': 0,
17 | 'qgap_bases': 0,
18 | 'tgap_count': 0,
19 | 'tgap_bases': 0,
20 | 'strand': '-',
21 | 'qname': 'seq1',
22 | 'qsize': 204,
23 | 'qstart': 0,
24 | 'qend': 142,
25 | 'tname': '17',
26 | 'tsize': 81195210,
27 | 'tstart': 32673408,
28 | 'tend': 32673550,
29 | 'block_count': 1,
30 | 'block_sizes': [142],
31 | 'qstarts': [62],
32 | 'tstarts': [32673408],
33 | '_index': 880,
34 | 'score': 142,
35 | 'percent_ident': 100.0,
36 | 'qseq_full': (
37 | 'ACATGTGCACAACGTGCAGGTTTGTTACATATGTATACATGTGCCATGTTGGTTTGCTGCACCCATTAACTCGTCCTAGTTTATTACTAGTCTTCAGACATC'
38 | 'CAGAAAATAGAGTAAGATACTAGGTAGACATAACACCTAGATACATCCGTAAGGCATTTGTTTCCTATCACATGGCCCATTCTAGCTTAACACCCACCAACT'
39 | ),
40 | }
41 | refseq = {
42 | '17': Mock(
43 | seq=MockLongString(
44 | 'ACTAGGTGTTATGTCTACCTAGTATCTTACTCTATTTTCTGGATGTCTGAAGACTAGTAATAAACTAGGACGAGTTAATGGGTGCAGCAAACCAACATGGCACATG'
45 | 'TATACATATGTAACAAACCTGCACGTTGTGCACATGTACCCTAAAACTTAAAGTATAAAAAAAAATTTCACTGAGCATAAGACTTCAGACACAAAAGAGTGCATGC'
46 | 'CATATAATTCCATTTATGTGAATTTCAAGAACAATCAGTGATGACAGAAGTCAAAGTAGTGGTCACCTCTGGAAGGTGGGACATTGACC',
47 | 32673407,
48 | )
49 | )
50 | }
51 | cache = Mock(reference_id=MockFunction(16))
52 | read = Blat.pslx_row_to_pysam(row, cache, refseq)
53 | assert read.reference_id == 16
54 | assert read.reference_name == '17'
55 | assert reverse_complement(read.query_sequence) == row['qseq_full']
56 | assert read.cigar == [(CIGAR.S, 62), (CIGAR.EQ, 142)]
57 |
58 | def test_overlapping_blat_blocks_error(self):
59 | row = {
60 | 'strand': '+',
61 | 'qname': 'seq23',
62 | 'tname': '7',
63 | 'block_sizes': [54, 53, 36, 80, 29],
64 | 'qstarts': [0, 55, 108, 143, 223],
65 | 'tstarts': [61279112, 61279166, 61397315, 61990208, 62366144],
66 | 'score': 207,
67 | 'percent_ident': 91.3,
68 | 'qseq_full': (
69 | 'CAAAAGGAAATACCTTCACATAAATTCTAGACGGAAGCAATCTGAGAAACTTTTATTGTGATTTGTGCATTCACTTCACAGAGTTAAAACTTTCTTTTGATT'
70 | 'GAGCAGTTTGAAACTCTGTTTTTGTAGAATCTGCAAGTGGACATTTGGAGCGCTTTGAGGCCTATGGTGGAAAAGGAAATATCTTCACAGGAAAACTAGATA'
71 | 'GAAGTATTCTGAGAAACTTCTTTGTGATGTATGCAGTCATATCTCAGA'
72 | ),
73 | }
74 | cache = Mock(reference_id=MockFunction(6))
75 | with pytest.raises(AssertionError):
76 | Blat.pslx_row_to_pysam(row, cache, None)
77 |
--------------------------------------------------------------------------------
/docs/package/mavis/validate/index.md:
--------------------------------------------------------------------------------
1 | # Sub-package Documentation
2 |
3 | The validation sub-package is responsible for pulling supporting reads from the bam file
4 | and re-calling events based on the evidence in a standard notation.
5 |
6 | ## Types of Output Files
7 |
8 | A variety of intermediate output files are provided for the user. These can be used to "drill down"
9 | further into events, and are also useful to developers for debugging when adding new features, etc.
10 |
11 | | expected name/suffix | file type/format | content |
12 | | --------------------------- | --------------------------------------------------- | ---------------------------------- |
13 | | ``*.raw_evidence.bam`` | [bam](../../../glossary/#bam) | raw evidence |
14 | | ``*.contigs.bam`` | [bam](../../../glossary/#bam) | aligned contigs |
15 | | ``*.evidence.bed`` | [bed](../../../glossary/#bed) | evidence collection window regions |
16 | | ``*.validation-passed.bed`` | [bed](../../../glossary/#bed) | validated event positions |
17 | | ``*.validation-failed.tab`` | text/tabbed | failed events |
18 | | ``*.validation-passed.tab`` | text/tabbed | validated events |
19 | | ``*.contigs.fa`` | [fasta](../../../glossary/#fasta) | assembled contigs |
20 | | ``*.contigs.blat_out.pslx`` | [pslx](../../../glossary/#pslx) | results from blatting contigs |
21 | | ``*.igv.batch`` | [IGV batch file](../../../glossary/#IGV-batch-file) | igv batch file |
22 |
23 |
24 | ## Algorithm Overview
25 |
26 | - (For each breakpoint pair)
27 |
28 | - [Calculate the window/region](../../../background/theory/#calculating-the-evidence-window) to read from the bam and collect
29 | evidence
30 | - Store evidence ([flanking read pair](../../../glossary/#flanking-read-pair), [half-mapped read](../../../glossary/#half-mapped-read), [spanning read](../../../glossary/#spanning-read), [split read](../../../glossary/#split-read),
31 | [compatible flanking pairs](../../../glossary/#compatible-flanking-pairs)) which match the expected event type and position
32 | - Assemble a contig from the collected reads. see [theory - assembling contigs](../../../background/theory/#assembling-contigs)
33 |
34 | - Generate a [fasta](../../../glossary/#fasta) file containing all the contig sequences
35 | - Align contigs to the reference genome (currently [blat](../../../glossary/#blat) is used to perform this step)
36 | - Make the final event calls. Each level of calls consumes all supporting reads so they are not re-used in subsequent
37 | levels of calls (a minimal sketch of this consumption logic is given after this list).
38 | - (For each breakpoint pair)
39 |
40 | - call by contig
41 | - call by [spanning read](../../../glossary/#spanning-read)
42 | - call by [split read](../../../glossary/#split-read)
43 | - call by [flanking read pair](../../../glossary/#flanking-read-pair). see [theory - calling breakpoints by flanking evidence](../../../background/theory/#calling-breakpoints-by-flanking-evidence)
44 |
45 | - Output new calls, evidence, contigs, etc.
46 |
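47 | The following is a minimal, hypothetical sketch of this consumption logic (not the actual MAVIS
48 | implementation; the `call_events` helper and its inputs are illustrative only): each supporting
49 | read is assigned to the highest-priority call method that can use it, so it is not re-counted at
50 | lower levels.
51 |
52 | ```python
53 | # Illustrative sketch only (not MAVIS source code): hierarchical evidence consumption.
54 | from typing import Dict, List, Set, Tuple
55 |
56 | # priority order mirrors the list above: contig > spanning > split > flanking
57 | CALL_PRIORITY = ['contig', 'spanning read', 'split read', 'flanking pair']
58 |
59 |
60 | def call_events(evidence_by_method: Dict[str, Set[str]]) -> List[Tuple[str, List[str]]]:
61 |     """Assign each supporting read to the first (highest-priority) method that can use it."""
62 |     consumed: Set[str] = set()
63 |     calls = []
64 |     for method in CALL_PRIORITY:
65 |         # reads already consumed by a higher-priority call are no longer available
66 |         available = evidence_by_method.get(method, set()) - consumed
67 |         if available:
68 |             calls.append((method, sorted(available)))
69 |             consumed |= available
70 |     return calls
71 |
72 |
73 | if __name__ == '__main__':
74 |     evidence = {
75 |         'contig': {'read1', 'read2', 'read3'},
76 |         'split read': {'read2', 'read4'},  # read2 already supports the contig call
77 |         'flanking pair': {'read5', 'read6'},
78 |     }
79 |     for method, reads in call_events(evidence):
80 |         print(method, reads)
81 | ```
82 |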
--------------------------------------------------------------------------------
/tests/full-tutorial.config.json:
--------------------------------------------------------------------------------
1 | {
2 | "annotate.draw_fusions_only": true,
3 | "cluster.min_clusters_per_file": 100,
4 | "cluster.uninformative_filter": true,
5 | "convert": {
6 | "breakdancer": {
7 | "assume_no_untemplated": true,
8 | "file_type": "breakdancer",
9 | "inputs": [
10 | "tutorial_data/breakdancer-1.4.5/*txt"
11 | ]
12 | },
13 | "breakseq": {
14 | "assume_no_untemplated": true,
15 | "file_type": "breakseq",
16 | "inputs": [
17 | "tutorial_data/breakseq-2.2/breakseq.vcf.gz"
18 | ]
19 | },
20 | "chimerascan": {
21 | "assume_no_untemplated": true,
22 | "file_type": "chimerascan",
23 | "inputs": [
24 | "tutorial_data/chimerascan-0.4.5/chimeras.bedpe"
25 | ]
26 | },
27 | "defuse": {
28 | "assume_no_untemplated": true,
29 | "file_type": "defuse",
30 | "inputs": [
31 | "tutorial_data/defuse-0.6.2/results.classify.tsv"
32 | ]
33 | },
34 | "manta": {
35 | "assume_no_untemplated": true,
36 | "file_type": "manta",
37 | "inputs": [
38 | "tutorial_data/manta-1.0.0/diploidSV.vcf.gz",
39 | "tutorial_data/manta-1.0.0/somaticSV.vcf"
40 | ]
41 | }
42 | },
43 | "libraries": {
44 | "L1522785992-normal": {
45 | "assign": [
46 | "breakdancer",
47 | "breakseq",
48 | "manta"
49 | ],
50 | "bam_file": "tutorial_data/L1522785992_normal.sorted.bam",
51 | "disease_status": "normal",
52 | "protocol": "genome"
53 | },
54 | "L1522785992-trans": {
55 | "assign": [
56 | "chimerascan",
57 | "defuse"
58 | ],
59 | "bam_file": "tutorial_data/L1522785992_trans.sorted.bam",
60 | "disease_status": "diseased",
61 | "protocol": "transcriptome",
62 | "strand_specific": true
63 | },
64 | "L1522785992-tumour": {
65 | "assign": [
66 | "breakdancer",
67 | "breakseq",
68 | "manta"
69 | ],
70 | "bam_file": "tutorial_data/L1522785992_tumour.sorted.bam",
71 | "disease_status": "diseased",
72 | "protocol": "genome"
73 | }
74 | },
75 | "output_dir": "output_dir_full",
76 | "reference.aligner_reference": [
77 | "reference_inputs/hg19.2bit"
78 | ],
79 | "reference.annotations": [
80 | "reference_inputs/ensembl69_hg19_annotations.v3.json"
81 | ],
82 | "reference.dgv_annotation": [
83 | "tests/data/mock_dgv_annotation.tab"
84 | ],
85 | "reference.masking": [
86 | "reference_inputs/hg19_masking.tab"
87 | ],
88 | "reference.reference_genome": [
89 | "reference_inputs/hg19.fa"
90 | ],
91 | "reference.template_metadata": [
92 | "reference_inputs/cytoBand.txt"
93 | ],
94 | "summary.filter_min_flanking_reads": 10,
95 | "summary.filter_min_linking_split_reads": 1,
96 | "summary.filter_min_remapped_reads": 5,
97 | "summary.filter_min_spanning_reads": 5
98 | }
99 |
--------------------------------------------------------------------------------
/src/mavis/convert/transabyss.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from ..constants import COLUMNS
4 | from .constants import SUPPORTED_TOOL, TRACKING_COLUMN
5 |
6 |
7 | def convert_row(row):
8 | """
9 | transforms the transabyss output into the common format for expansion.
10 | Maps the input column names to column names which MAVIS can read
11 | """
12 | std_row = {}
13 | if TRACKING_COLUMN not in row:
14 | std_row[TRACKING_COLUMN] = '{}-{}'.format(SUPPORTED_TOOL.TA, row['id'])
15 |
16 | std_row[COLUMNS.event_type] = row.get('rearrangement', row['type'])
17 | for retained_column in ['genes', 'gene']:
18 | if retained_column in row:
19 | std_row['{}_{}'.format(SUPPORTED_TOOL.TA, retained_column)] = row[retained_column]
20 | if std_row[COLUMNS.event_type] in ['LSR', 'translocation']:
21 | del std_row[COLUMNS.event_type]
22 | if 'breakpoint' in row:
23 | std_row[COLUMNS.break1_orientation], std_row[COLUMNS.break2_orientation] = row[
24 | 'orientations'
25 | ].split(',')
26 | match = re.match(
27 | r'^(?P<chr1>[^:]+):(?P<pos1_start>\d+)\|(?P<chr2>[^:]+):(?P<pos2_start>\d+)$',
28 | row['breakpoint'],
29 | )
30 | if not match:
31 | raise OSError(
32 | 'file format error: the breakpoint column did not satisfy the expected pattern', row
33 | )
34 | for group, col in zip(
35 | ['chr1', 'pos1_start', 'chr2', 'pos2_start'],
36 | [
37 | COLUMNS.break1_chromosome,
38 | COLUMNS.break1_position_start,
39 | COLUMNS.break2_chromosome,
40 | COLUMNS.break2_position_start,
41 | ],
42 | ):
43 | std_row[col] = match[group]
44 | else:
45 | std_row.update(
46 | {
47 | COLUMNS.break1_chromosome: row['chr'],
48 | COLUMNS.break1_position_start: int(row['chr_start']),
49 | COLUMNS.break2_position_start: int(row['chr_end']),
50 | }
51 | )
52 | if std_row[COLUMNS.event_type] == 'del':
53 | std_row[COLUMNS.break1_position_start] -= 1
54 | std_row[COLUMNS.break2_position_start] += 1
55 | elif std_row[COLUMNS.event_type] == 'ins':
56 | std_row[COLUMNS.break2_position_start] += 1
57 |
58 | # add the untemplated sequence where appropriate
59 | if std_row[COLUMNS.event_type] == 'del':
60 | assert row['alt'] == 'na'
61 | std_row[COLUMNS.untemplated_seq] = ''
62 | elif std_row[COLUMNS.event_type] in ['dup', 'ITD']:
63 | length = (
64 | std_row[COLUMNS.break2_position_start] - std_row[COLUMNS.break1_position_start] + 1
65 | )
66 | if len(row['alt']) != length:
67 | raise AssertionError(
68 | 'expected alternate sequence to be equal to the length of the event',
69 | len(row['alt']),
70 | length,
71 | row,
72 | std_row,
73 | )
74 | std_row[COLUMNS.untemplated_seq] = ''
75 | elif std_row[COLUMNS.event_type] == 'ins':
76 | std_row[COLUMNS.untemplated_seq] = row['alt'].upper()
77 | else:
78 | raise NotImplementedError('unexpected indel type', std_row[COLUMNS.event_type])
79 | return std_row
80 |
--------------------------------------------------------------------------------
/tests/test_tools/data/K02718.1.gff3:
--------------------------------------------------------------------------------
1 | K02718.1 Genbank CDS 1140 2813 . + 0 ID=cds-AAA46936.1;Parent=gene-E1;Dbxref=NCBI_GP:AAA46936.1;Name=AAA46936.1;Note=E1 interrupted ORF from 859 to 2813%3B putative;gbkey=CDS;gene=E1;product=replication protein;protein_id=AAA46936.1
2 | K02718.1 Genbank CDS 2755 3852 . + 0 ID=cds-AAA46941.1;Parent=gene-E2;Dbxref=NCBI_GP:AAA46941.1;Name=AAA46941.1;Note=E2 ORF from 2725 to 3852%3B putative;gbkey=CDS;gene=E2;product=regulatory protein;protein_id=AAA46941.1
3 | K02718.1 Genbank CDS 3332 3619 . + 0 ID=cds-AAA46937.1;Parent=gene-E4;Dbxref=NCBI_GP:AAA46937.1;Name=AAA46937.1;gbkey=CDS;gene=E4;partial=true;product=AAA46937.1;protein_id=AAA46937.1;start_range=.,3332
4 | K02718.1 Genbank CDS 3863 4099 . + 0 ID=cds-AAA46938.1;Parent=gene-E5;Dbxref=NCBI_GP:AAA46938.1;Name=AAA46938.1;gbkey=CDS;gene=E5;partial=true;product=AAA46938.1;protein_id=AAA46938.1;start_range=.,3863
5 | K02718.1 Genbank CDS 4235 5656 . + 0 ID=cds-AAA46942.1;Parent=gene-L2;Dbxref=NCBI_GP:AAA46942.1;Name=AAA46942.1;Note=L2 ORF from 4133 to 5656%3B putative;gbkey=CDS;gene=L2;product=minor capsid protein;protein_id=AAA46942.1
6 | K02718.1 Genbank CDS 5559 7154 . + 0 ID=cds-AAA46943.1;Parent=gene-L1;Dbxref=NCBI_GP:AAA46943.1;Name=AAA46943.1;Note=L1 ORF from 5526 to 7154%3B putative;gbkey=CDS;gene=L1;product=major capsid protein;protein_id=AAA46943.1
7 | K02718.1 Genbank CDS 562 858 . + 0 ID=cds-AAA46940.1;Parent=gene-E7;Dbxref=NCBI_GP:AAA46940.1;Name=AAA46940.1;Note=E7 ORF from 544 to 858%3B putative;gbkey=CDS;gene=E7;product=transforming protein;protein_id=AAA46940.1
8 | K02718.1 Genbank CDS 83 559 . + 0 ID=cds-AAA46939.1;Parent=gene-E6;Dbxref=NCBI_GP:AAA46939.1;Name=AAA46939.1;Note=E6 ORF from 65 to 559%3B putative;gbkey=CDS;gene=E6;product=transforming protein;protein_id=AAA46939.1
9 | K02718.1 Genbank CDS 865 1140 . + 0 ID=cds-AAA46936.1;Parent=gene-E1;Dbxref=NCBI_GP:AAA46936.1;Name=AAA46936.1;Note=E1 interrupted ORF from 859 to 2813%3B putative;gbkey=CDS;gene=E1;product=replication protein;protein_id=AAA46936.1
10 | K02718.1 Genbank gene 1140 2813 . + . ID=gene-E1;Name=E1;gbkey=Gene;gene=E1;gene_biotype=protein_coding
11 | K02718.1 Genbank gene 2755 3852 . + . ID=gene-E2;Name=E2;gbkey=Gene;gene=E2;gene_biotype=protein_coding
12 | K02718.1 Genbank gene 3332 3619 . + . ID=gene-E4;Name=E4;gbkey=Gene;gene=E4;gene_biotype=protein_coding
13 | K02718.1 Genbank gene 3863 4099 . + . ID=gene-E5;Name=E5;gbkey=Gene;gene=E5;gene_biotype=protein_coding
14 | K02718.1 Genbank gene 4235 5656 . + . ID=gene-L2;Name=L2;gbkey=Gene;gene=L2;gene_biotype=protein_coding
15 | K02718.1 Genbank gene 5559 7154 . + . ID=gene-L1;Name=L1;gbkey=Gene;gene=L1;gene_biotype=protein_coding
16 | K02718.1 Genbank gene 562 858 . + . ID=gene-E7;Name=E7;gbkey=Gene;gene=E7;gene_biotype=protein_coding
17 | K02718.1 Genbank gene 83 559 . + . ID=gene-E6;Name=E6;gbkey=Gene;gene=E6;gene_biotype=protein_coding
18 | K02718.1 Genbank gene 865 1140 . + . ID=gene-E1;Name=E1;gbkey=Gene;gene=E1;gene_biotype=protein_coding
19 | K02718.1 Genbank region 17 23 . + . ID=id-K02718.1:17..23;gbkey=TATA_signal
20 | K02718.1 Genbank region 1 7904 . + . ID=K02718.1:1..7904;Dbxref=taxon:333760;Is_circular=true;gbkey=Src;mol_type=genomic DNA
21 | K02718.1 Genbank region 4213 4218 . + . ID=id-K02718.1:4213..4218;Note=putative;gbkey=polyA_signal
22 | K02718.1 Genbank region 4289 4295 . + . ID=id-L2;gbkey=TATA_signal;gene=L2
23 | K02718.1 Genbank region 65 71 . + . ID=id-K02718.1:65..71;gbkey=TATA_signal
24 | K02718.1 Genbank region 7260 7265 . + . ID=id-K02718.1:7260..7265;gbkey=polyA_signal
25 |
--------------------------------------------------------------------------------
/tests/data/mock_reference_annotations.full.json:
--------------------------------------------------------------------------------
1 | {"genes": [{"aliases": ["C9orf47"], "chr": "fakereference9", "end": 5278, "name": "ENSG00000186354", "start": 1, "strand": "+", "transcripts": [{"aliases": [], "cdna_coding_end": 685, "cdna_coding_start": 134, "domains": [], "end": 5278, "exons": [{"end": 322, "start": 1}, {"end": 833, "start": 608}, {"end": 5278, "start": 990}], "is_best_transcript": true, "name": "ENST00000375851", "start": 1}, {"aliases": [], "cdna_coding_end": 783, "cdna_coding_start": 76, "domains": [], "end": 1202, "exons": [{"end": 322, "start": 59}, {"end": 1202, "start": 608}], "is_best_transcript": false, "name": "ENST00000375850", "start": 59}, {"aliases": [], "cdna_coding_end": 677, "cdna_coding_start": 69, "domains": [], "end": 5278, "exons": [{"end": 379, "start": 66}, {"end": 833, "start": 608}, {"end": 5278, "start": 990}], "is_best_transcript": false, "name": "ENST00000334490", "start": 66}]}, {"aliases": ["S1PR3"], "chr": "fakereference9", "end": 14148, "name": "ENSG00000213694", "start": 585, "strand": "+", "transcripts": [{"aliases": [], "cdna_coding_end": 1533, "cdna_coding_start": 397, "domains": [{"name": "PR00362", "regions": [{"end": 62, "start": 49}, {"end": 200, "start": 185}, {"end": 308, "start": 298}]}, {"name": "PR00642", "regions": [{"end": 75, "start": 63}, {"end": 112, "start": 102}, {"end": 155, "start": 139}, {"end": 345, "start": 329}]}, {"name": "PS50262", "regions": [{"end": 298, "start": 56}]}, {"name": "PF10320", "regions": [{"end": 312, "start": 55}]}, {"name": "SSF81321", "regions": [{"end": 340, "start": 1}]}, {"name": "PR00237", "regions": [{"end": 65, "start": 41}, {"end": 95, "start": 74}, {"end": 140, "start": 118}, {"end": 174, "start": 153}, {"end": 219, "start": 196}, {"end": 265, "start": 241}, {"end": 306, "start": 280}]}, {"name": "PR01523", "regions": [{"end": 25, "start": 13}, {"end": 101, "start": 92}, {"end": 123, "start": 112}, {"end": 204, "start": 194}, {"end": 224, "start": 215}, {"end": 283, "start": 272}, {"end": 311, "start": 301}]}, {"name": "PF00001", "regions": [{"end": 298, "start": 56}]}, {"name": "PR01524", "regions": [{"end": 40, "start": 24}, {"end": 155, "start": 139}, {"end": 233, "start": 223}, {"end": 323, "start": 314}, {"end": 340, "start": 326}]}], "end": 14148, "exons": [{"end": 833, "start": 585}, {"end": 14148, "start": 10192}], "is_best_transcript": false, "name": "ENST00000358157", "start": 585}, {"aliases": [], "cdna_coding_end": 5832, "cdna_coding_start": 4696, "domains": [{"name": "PF10320", "regions": [{"end": 312, "start": 55}]}, {"name": "PR00362", "regions": [{"end": 62, "start": 49}, {"end": 200, "start": 185}, {"end": 308, "start": 298}]}, {"name": "PS50262", "regions": [{"end": 298, "start": 56}]}, {"name": "PR00642", "regions": [{"end": 75, "start": 63}, {"end": 112, "start": 102}, {"end": 155, "start": 139}, {"end": 345, "start": 329}]}, {"name": "PR00237", "regions": [{"end": 65, "start": 41}, {"end": 95, "start": 74}, {"end": 140, "start": 118}, {"end": 174, "start": 153}, {"end": 219, "start": 196}, {"end": 265, "start": 241}, {"end": 306, "start": 280}]}, {"name": "PR01523", "regions": [{"end": 25, "start": 13}, {"end": 101, "start": 92}, {"end": 123, "start": 112}, {"end": 204, "start": 194}, {"end": 224, "start": 215}, {"end": 283, "start": 272}, {"end": 311, "start": 301}]}, {"name": "PR01524", "regions": [{"end": 40, "start": 24}, {"end": 155, "start": 139}, {"end": 233, "start": 223}, {"end": 323, "start": 314}, {"end": 340, "start": 326}]}, {"name": "PF00001", "regions": [{"end": 298, "start": 56}]}, {"name": 
"SSF81321", "regions": [{"end": 340, "start": 1}]}], "end": 14148, "exons": [{"end": 14148, "start": 5644}], "is_best_transcript": true, "name": "ENST00000375846", "start": 5644}]}]}
--------------------------------------------------------------------------------
/docs/tutorials/annotation.md:
--------------------------------------------------------------------------------
1 | # Annotation Only
2 |
3 | Sometimes you have a set of variants and would simply like to run the annotate step of MAVIS to visualize and annotate them.
4 |
5 | First you need to create a basic config that tells MAVIS where the reference files you want to use are located, along with some minimal information about the library/sample you want to process.
6 |
7 | Here is an example config where the user has created a minimal input file in the MAVIS standard input file format. We convert it to expand any unknowns (e.g. the SV type, if left blank).
8 |
9 | ```json
10 | {
11 | "libraries": {
12 | "my_library": {
13 | "assign": ["my_converted_file"],
14 | "disease_status": "normal",
15 | "protocol": "genome"
16 | }
17 | },
18 | "convert": {
19 | "my_converted_file": {
20 | "inputs": ["/path/to/file/structural_variants.txt"],
21 | "file_type": "mavis"
22 | }
23 | },
24 | "cluster.split_only": true,
25 | "skip_stage.validate": true,
26 | "output_dir": "my_output_dir",
27 | "reference.annotations": "/path/to/mavis/reference_files/ensembl79_hg38_annotations.json",
28 | "reference.template_metadata": "/path/to/mavis/reference_files/hg38_cytoBand.txt",
29 | "reference.reference_genome": "/path/to/hg38_no_alt/genome/hg38_no_alt.fa",
30 | "reference.masking": "/path/to/mavis/reference_files/masking_hg38.adjusted.tab",
31 | "reference.dgv_annotation": "/path/to/mavis/reference_files/dgv_hg38_annotations.tab"
32 | }
33 | ```
34 |
35 | Another example is given in the MAVIS tests folder under `tests/mini-tutorial.annotate_only.config.json`, which looks like this
36 |
37 | ```json
38 | {
39 | "annotate.draw_fusions_only": false,
40 | "convert": {
41 | "mock_converted": {
42 | "inputs": [
43 | "tests/data/mock_sv_events.tsv"
44 | ],
45 | "file_type": "mavis",
46 | "assume_no_untemplated": true
47 | }
48 | },
49 | "skip_stage.validate": true,
50 | "cluster.uninformative_filter": true,
51 | "cluster.limit_to_chr": null,
52 | "cluster.min_clusters_per_file": 5,
53 | "libraries": {
54 | "mock-A47933": {
55 | "assign": [
56 | "tests/data/mock_trans_sv_events.tsv"
57 | ],
58 | "bam_file": "tests/data/mock_trans_reads_for_events.sorted.bam",
59 | "disease_status": "diseased",
60 | "protocol": "transcriptome",
61 | "strand_specific": true
62 | },
63 | "mock-A36971": {
64 | "assign": [
65 | "mock_converted"
66 | ],
67 | "bam_file": "tests/data/mock_reads_for_events.sorted.bam",
68 | "disease_status": "diseased",
69 | "protocol": "genome",
70 | "strand_specific": false
71 | }
72 | },
73 | "output_dir": "output_dir",
74 | "reference.annotations": [
75 | "tests/data/mock_annotations.json"
76 | ],
77 | "reference.dgv_annotation": [
78 | "tests/data/mock_dgv_annotation.txt"
79 | ],
80 | "reference.masking": [
81 | "tests/data/mock_masking.tab"
82 | ],
83 | "reference.reference_genome": [
84 | "tests/data/mock_reference_genome.fa"
85 | ],
86 | "reference.template_metadata": [
87 | "tests/data/cytoBand.txt"
88 | ]
89 | }
90 | ```
91 |
92 | Either of these configurations can be run with the following command simply by changing the configfile argument
93 |
94 | ```bash
95 | snakemake -j 1 \
96 | --configfile tests/mini-tutorial.annotate_only.config.json \
97 | -s Snakefile
98 | ```
99 |
--------------------------------------------------------------------------------
/tests/data/transabyss_indels_output.tab:
--------------------------------------------------------------------------------
1 | id type chr chr_start chr_end ctg ctg_len ctg_start ctg_end len ref alt event_reads contig_reads genome_reads gene repeat-length ctg_strand from_end confirm_contig_region within_simple_repeats repeatmasker within_segdup at_least_1_read_opposite dbsnp
2 | 1 ins 1 8877520 8877520 4542232 58938 23102 23103 2 na tt 41 41 47 RERE:uc001apf.3:exon1|synon 0 + 23101 23102-23117 - - - false -
3 | 2 ins 1 16011005 16011005 4541011 129199 97246 97248 3 na ggc 22 22 25 PLEKHM2:uc010obo.2:exon1|synon 0 - 31951 97234-97248 TRF_SimpleTandemRepeat_GCG (CGG)n - false -
4 | 3 ins 1 16926227 16926227 4624842 952 419 419 1 na t 46 46 68 NBPF1:uc001aza.5:exon3|na 0 - 418 414-419 - L1ME3 chr1:21766304 false -
5 | 4 ins 1 17026040 17026040 4529033 986 780 794 15 na gcggcggcggcggca 35 35 23 ESPNP:uc001azn.1:exon8|P431_P432insLPPPP 0 + 192 780-794 - (CGG)n chr1:6487720 false -
6 | 5 ins 1 17026043 17026043 4521063 925 99 143 45 na gcggcggcggcggcggcggcggcggcggcagcagcagcagcagca 6 6 8 ESPNP:uc001azn.1:exon8|L430_P431insLLLLLLPPPPPPPPP 0 - 98 99-143 - (CGG)n chr1:6487720 false -
7 | 1175 del X 142715897 142715924 4547857 78777 52728 52728 28 ttttt...ttttt na 34 34 17 SLITRK4:uc022cfl.1:exon2|SLITRK4:uc022cfl.1:exon2|synon 0 + 25889 52728-52728 TRF_SimpleTandemRepeat_T (T)n - false -
8 | 1176.1 del X 149115835 149115836 indel_k96_4578561 1263 1145 1145 2 ga na 37 30 2 LINC00894:uc004fed.1:exon1|LINC00894:uc004fed.1:exon1|na 0 + 118 1145-1149 - - chrX:148613958 false -
9 | 1176.2 del X 149115835 149115836 indel_5327 1263 119 119 2 ga na 37 7 2 LINC00894:uc004fed.1:exon1|LINC00894:uc004fed.1:exon1|na 0 - 118 115-119 - - chrX:148613958 false -
10 | 1177 del X 153523769 153523790 4654686 26033 2836 2836 22 gcacc...gtgcg na 8 8 1 TEX28:uc010nut.1:exon1|TEX28:uc010nut.1:exon1|synon 0 + 2835 2836-2924 TRF_SimpleTandemRepeat_CACGTGCGGCACCACCCCCTGA - - false -
11 | 1178 del X 154997577 154997583 4522314 63590 44595 44595 7 ttttgtt na 28 28 23 SPRY3:uc004fnq.1:exon1|SPRY3:uc004fnq.1:exon1|synon 0 + 18995 44595-44595 TRF_SimpleTandemRepeat_TTTTG (TTTTG)n chrY:59033286 false -
12 | 1181 dup 12 13029070 13029073 4659122 38006 26858 26861 4 na aaaa 38 38 25 RPL13AP20:uc010sho.2:exon1;3utr|NA:NA:NA|NA 0 - 11139 26838-26861 - (A)n - false -
13 | 1182 dup 12 121839158 121839167 4619408 122056 113544 113553 10 na aaaaaaaaaa 34 34 15 BC029038:uc001uan.3:exon1;3utr|NA:NA:NA|NA 0 - 8503 113524-113553 - L2c - false -
14 | 1183 dup 15 44094768 44094775 4533713 84196 41867 41874 8 na aaaaaaaa 7 7 1 SERF2-C15ORF63:uc001ztb.3:exon6;3utr|NA:NA:NA|NA 0 + 41866 41867-41890 - AluSq4 - false -
15 | 1184 dup 6 27515484 27515553 4632026 88843 27311 27380 70 na ggaaaacaaaaggtccaggaaaaggatatatacatatatcttcgagcaggttccaccgagacttgaactc 131 131 24 NA:NA:NA|TRNA_Gln:uc021yqh.1:3utr;exon1|NA 0 - 27261 27241-27380 - tRNA-Gln-CAG - false -
16 | 1185 dup GL000211.1 108677 108683 4632141 14477 5082 5088 7 na aaaaaaa 33 33 19 FLJ43315:uc003boa.3:exon5;3utr|NA:NA:NA|NA 0 + 5081 5082-5102 - FLAM_A chr9:69378660 false -
17 | 1232 ITD 9 132345740 132345781 3298328 190 96 137 42 na tccatcccttcacctccactaagatcagggcaccccaggagt 9 9 13 BC037833:uc004bya.1:exon4|BC037833:uc004bya.1:exon4|na 0 - 53 54-137 - - - false -
18 | 1233 ITD GL000220.1 114348 114379 4159437 179 96 127 32 na cccccgcggggaatcccccgcgaggggggtct 37 37 13 RNA5-8S5:uc022brd.2:exon1|RNA5-8S5:uc022brd.2:exon1|na 0 - 52 64-127 - LSU-rRNA_Hsa chrUn_gl000220:145518 false -
19 | 1234 ITD GL000220.1 118433 118436 50603 168 81 84 4 na gcgt 24 24 1 RNA5-8S5:uc022brd.2:exon1|RNA5-8S5:uc022brd.2:exon1|na 0 - 80 73-84 - (CG)n - false -
20 | 1235 ITD GL000220.1 118437 118440 107283 168 77 80 4 na gcgt 323 323 1 RNA5-8S5:uc022brd.2:exon1|RNA5-8S5:uc022brd.2:exon1|na 0 - 76 73-80 - (CG)n - false -
21 | 1236 ITD X 84343323 84343327 4588370 15020 7333 7337 5 na ttttt 5 5 4 APOOL:uc004eem.3:exon9|APOOL:uc004eem.3:exon9|synon 0 - 7332 7308-7337 TRF_SimpleTandemRepeat_T (T)n - false -
22 |
--------------------------------------------------------------------------------
/tests/test_mavis/test_help.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from unittest.mock import patch
3 |
4 | from mavis_config.constants import SUBCOMMAND
5 |
6 | from mavis.main import main
7 |
8 |
9 | class TestHelpMenu:
10 | def test_main(self):
11 | with patch.object(sys, 'argv', ['mavis', '-h']):
12 | try:
13 | returncode = main()
14 | except SystemExit as err:
15 | assert err.code == 0
16 | else:
17 | assert returncode == 0
18 |
19 | def test_pipeline(self):
20 | with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.SETUP, '-h']):
21 | try:
22 | returncode = main()
23 | except SystemExit as err:
24 | assert err.code == 0
25 | else:
26 | assert returncode == 0
27 |
28 | def test_cluster(self):
29 | with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.CLUSTER, '-h']):
30 | try:
31 | returncode = main()
32 | except SystemExit as err:
33 | assert err.code == 0
34 | else:
35 | assert returncode == 0
36 |
37 | def test_validate(self):
38 | with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.VALIDATE, '-h']):
39 | try:
40 | returncode = main()
41 | except SystemExit as err:
42 | assert err.code == 0
43 | else:
44 | assert returncode == 0
45 |
46 | def test_annotate(self):
47 | with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.ANNOTATE, '-h']):
48 | try:
49 | returncode = main()
50 | except SystemExit as err:
51 | assert err.code == 0
52 | else:
53 | assert returncode == 0
54 |
55 | def test_pairing(self):
56 | with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.PAIR, '-h']):
57 | try:
58 | returncode = main()
59 | except SystemExit as err:
60 | assert err.code == 0
61 | else:
62 | assert returncode == 0
63 |
64 | def test_summary(self):
65 | with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.SUMMARY, '-h']):
66 | try:
67 | returncode = main()
68 | except SystemExit as err:
69 | assert err.code == 0
70 | else:
71 | assert returncode == 0
72 |
73 | def test_convert(self):
74 | with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.CONVERT, '-h']):
75 | try:
76 | returncode = main()
77 | except SystemExit as err:
78 | assert err.code == 0
79 | else:
80 | assert returncode == 0
81 |
82 | def test_overlay(self):
83 | with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.OVERLAY, '-h']):
84 | try:
85 | returncode = main()
86 | except SystemExit as err:
87 | assert err.code == 0
88 | else:
89 | assert returncode == 0
90 |
91 | def test_bad_option(self):
92 | with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.SETUP, '--blargh']):
93 | try:
94 | returncode = main()
95 | except SystemExit as err:
96 | assert err.code != 0
97 | else:
98 | assert returncode != 0
99 |
100 | def test_ref_alt_count(self):
101 | with patch.object(sys, 'argv', ['calculate_ref_alt_counts', '-h']):
102 | try:
103 | returncode = main()
104 | except SystemExit as err:
105 | assert err.code == 0
106 | else:
107 | assert returncode == 0
108 |
--------------------------------------------------------------------------------
/tests/data/mock_dgv_annotation_mavis.tab:
--------------------------------------------------------------------------------
1 | tracking_id event_type break1_chromosome break1_position_start break1_position_end break1_orientation break1_strand break1_seq break2_chromosome break2_position_start break2_position_end break2_orientation break2_strand break2_seq opposing_strands stranded tools
2 | nsv482937 None 1 1 1 L ? None 1 2300000 2300000 L ? None True False bed
3 | nsv482937 None 1 1 1 L ? None 1 2300000 2300000 R ? None False False bed
4 | nsv482937 None 1 1 1 R ? None 1 2300000 2300000 L ? None False False bed
5 | nsv482937 None 1 1 1 R ? None 1 2300000 2300000 R ? None True False bed
6 | dgv1n82 None 1 10001 10001 L ? None 1 22118 22118 L ? None True False bed
7 | dgv1n82 None 1 10001 10001 L ? None 1 22118 22118 R ? None False False bed
8 | dgv1n82 None 1 10001 10001 R ? None 1 22118 22118 L ? None False False bed
9 | dgv1n82 None 1 10001 10001 R ? None 1 22118 22118 R ? None True False bed
10 | rgv2n98 None 1 10001 10001 L ? None 1 22120 22120 L ? None True False bed
11 | rgv2n98 None 1 10001 10001 L ? None 1 22120 22120 R ? None False False bed
12 | rgv2n98 None 1 10001 10001 R ? None 1 22120 22120 L ? None False False bed
13 | rgv2n98 None 1 10001 10001 R ? None 1 22120 22120 R ? None True False bed
14 | dgv2n99 None 1 10001 10501 R ? None 1 15000 15000 R ? None True False bed
15 | rgv2n99 None 1 10001 10001 L ? None 1 22222 22222 L ? None True False bed
16 | rgv2n99 None 1 10001 10001 L ? None 1 22222 22222 R ? None False False bed
17 | rgv2n99 None 1 10001 10001 R ? None 1 22222 22222 L ? None False False bed
18 | rgv2n99 None 1 10001 10001 R ? None 1 22222 22222 R ? None True False bed
19 | nsv7879 None 1 10001 10001 L ? None 1 127330 127330 L ? None True False bed
20 | nsv7879 None 1 10001 10001 L ? None 1 127330 127330 R ? None False False bed
21 | nsv7879 None 1 10001 10001 R ? None 1 127330 127330 L ? None False False bed
22 | nsv7879 None 1 10001 10001 R ? None 1 127330 127330 R ? None True False bed
23 | nsv958854 None 1 10191 10191 L ? None 1 10281 10281 L ? None True False bed
24 | nsv958854 None 1 10191 10191 L ? None 1 10281 10281 R ? None False False bed
25 | nsv958854 None 1 10191 10191 R ? None 1 10281 10281 L ? None False False bed
26 | nsv958854 None 1 10191 10191 R ? None 1 10281 10281 R ? None True False bed
27 | nsv428112 None 1 10377 10377 L ? None 1 177417 177417 L ? None True False bed
28 | nsv428112 None 1 10377 10377 L ? None 1 177417 177417 R ? None False False bed
29 | nsv428112 None 1 10377 10377 R ? None 1 177417 177417 L ? None False False bed
30 | nsv428112 None 1 10377 10377 R ? None 1 177417 177417 R ? None True False bed
31 | esv2758911 None 1 10377 10377 L ? None 1 1018704 1018704 L ? None True False bed
32 | esv2758911 None 1 10377 10377 L ? None 1 1018704 1018704 R ? None False False bed
33 | esv2758911 None 1 10377 10377 R ? None 1 1018704 1018704 L ? None False False bed
34 | esv2758911 None 1 10377 10377 R ? None 1 1018704 1018704 R ? None True False bed
35 | esv27265 None 1 10499 10499 L ? None 1 177368 177368 L ? None True False bed
36 | esv27265 None 1 10499 10499 L ? None 1 177368 177368 R ? None False False bed
37 | esv27265 None 1 10499 10499 R ? None 1 177368 177368 L ? None False False bed
38 | esv27265 None 1 10499 10499 R ? None 1 177368 177368 R ? None True False bed
39 | nsv1147468 None 1 11099 11099 L ? None 1 47000 47000 L ? None True False bed
40 | nsv1147468 None 1 11099 11099 L ? None 1 47000 47000 R ? None False False bed
41 | nsv1147468 None 1 11099 11099 R ? None 1 47000 47000 L ? None False False bed
42 | nsv1147468 None 1 11099 11099 R ? None 1 47000 47000 R ? None True False bed
43 | dgv1n106 None 1 11100 11100 L ? None 1 29200 29200 L ? None True False bed
44 | dgv1n106 None 1 11100 11100 L ? None 1 29200 29200 R ? None False False bed
45 | dgv1n106 None 1 11100 11100 R ? None 1 29200 29200 L ? None False False bed
46 | dgv1n106 None 1 11100 11100 R ? None 1 29200 29200 R ? None True False bed
47 |
--------------------------------------------------------------------------------
/src/tools/find_repeats.py:
--------------------------------------------------------------------------------
1 | """
2 | Script used in finding potential masking regions within a genome
3 | """
4 | import argparse
5 | import os
6 |
7 | from mavis.annotate.base import BioInterval
8 | from mavis.annotate.file_io import load_reference_genome
9 | from mavis.util import log
10 |
11 |
12 | def parse_arguments():
13 | """
14 | parse command line arguments
15 | """
16 | parser = argparse.ArgumentParser()
17 | parser.add_argument(
18 | '-o', '--output', help='path to the output file', required=True, metavar='FILEPATH'
19 | )
20 | parser.add_argument(
21 | '-n',
22 | '--input',
23 | required=True,
24 | metavar='FILEPATH',
25 | help='Path to the Input reference genome fasta file',
26 | )
27 | parser.add_argument(
28 | '--min_length',
29 | default=20,
30 | type=int,
31 | help='Minimum total length of the repeat region to find',
32 | metavar='INT',
33 | )
34 | parser.add_argument(
35 | '--repeat_seq',
36 | default='N',
37 | type=str,
38 | help='Repeat sequence to look for. Case insensitive',
39 | nargs='+',
40 | )
41 | args = parser.parse_args()
42 | if args.min_length < 2:
43 | parser.error('argument --min_length: cannot specify a shorter repeat than 2 bases')
44 | if not os.path.exists(args.input):
45 | parser.error('argument --input: File does not exist')
46 | return args
47 |
48 |
49 | def main():
50 | args = parse_arguments()
51 | repeat_sequences = sorted(list(set([s.lower() for s in args.repeat_seq])))
52 | log('loading:', args.input)
53 | reference_genome = load_reference_genome(args.input)
54 | comments = [
55 | os.path.basename(__file__),
56 | 'input: {}'.format(args.input),
57 | 'min_length: {}'.format(args.min_length),
58 | 'repeat_seq: {}'.format(', '.join(args.repeat_seq)),
59 | ]
60 | log('writing:', args.output)
61 | with open(args.output, 'w') as fh:
62 | for comment in comments:
63 | fh.write('## {}\n'.format(comment))
64 | fh.write('chr\tstart\tend\tname\n')
65 | visited = set()
66 | for chrom, seq in sorted(reference_genome.items()):
67 | if chrom.startswith('chr'):
68 | chrom = chrom[3:]
69 | seq = str(seq.seq).lower()
70 | if seq in visited:
71 | continue
72 | else:
73 | visited.add(seq)
74 | spans = []
75 | for repseq in repeat_sequences:
76 | log(
77 | 'finding {}_repeat (min_length: {}), for chr{} (length: {})'.format(
78 | repseq, args.min_length, chrom, len(seq)
79 | )
80 | )
81 | index = 0
82 | while index < len(seq):
83 | next_n = seq.find(repseq, index)
84 | if next_n < 0:
85 | break
86 | index = next_n
87 | while (
88 | index + len(repseq) <= len(seq)
89 | and seq[index : index + len(repseq)] == repseq
90 | ):
91 | index += len(repseq)
92 | span = BioInterval(chrom, next_n + 1, index, name='repeat_{}'.format(repseq))
93 | if len(span) >= args.min_length and len(span) >= 2 * len(repseq):
94 | spans.append(span)
95 | log('found', len(spans), 'spans', time_stamp=False)
96 | for span in spans:
97 | fh.write(
98 | '{}\t{}\t{}\t{}\n'.format(
99 | span.reference_object, span.start, span.end, span.name
100 | )
101 | )
102 |
103 |
104 | if __name__ == '__main__':
105 | main()
106 |
--------------------------------------------------------------------------------
/tests/data/mock_sv_events.tsv:
--------------------------------------------------------------------------------
1 | ## False reference9 2000 2000 reference9 2001 2001 L R + + insertion genome convert_ta.py_v0.0.1 mock-A36971 9:66466004
2 | stranded break1_chromosome break1_position_start break1_position_end break2_chromosome break2_position_start break2_position_end break1_orientation break2_orientation break1_strand break2_strand event_type protocol tools library comment
3 | False reference7 5000 5000 reference7 11000 11000 R L - - duplication genome convert_ta.py_v0.0.1 mock-A36971 7:104485067|7:104612302
4 | False reference20 2000 2000 reference20 6000 6000 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 20:13160730|20:13164100
5 | False reference10 520 520 reference19 964 964 R L + + translocation genome convert_ta.py_v0.0.1 mock-A36971 10:7059511|19:17396811
6 | False referenceX 2000 2000 referenceX 6000 6000 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 X:32098533|32201251
7 | False reference3 1115 1115 reference3 2188 2188 R R + - inversion genome convert_ta.py_v0.0.1 mock-A36971 3:24565106|24566179
8 | False referenceX 10000 10000 referenceX 14000 14000 L R - - deletion genome convert_ta.py_v0.0.1 mock-A36971 X:31301203|32038750
9 | False reference2 2000 2000 reference4 2000 2000 L R - - translocation genome convert_ta.py_v0.0.1 mock-A36971 2:42052609|4:66413931
10 | False reference7 15000 15000 reference7 19000 19000 R R + - inversion genome convert_ta.py_v0.0.1 mock-A36971 7:126098488|126167441
11 | False reference19 4827 4847 reference19 5219 5219 L R + + deletion genome DELLY_v0.6.1 mock-A36971 19:31954787-31955407|19:31955423-31956043
12 | False reference11 6000 6000 reference11 6003 6003 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 11:121214|11:121216
13 | False reference11 10000 10000 reference11 10030 10030 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 11:1651586|11:1651615
14 | False reference12 2001 2001 reference12 2120 2120 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 12:14945389|12:14945509
15 | False reference10 3609 3609 reference10 3818 3818 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 10:7793830|10:7794039
16 | False reference10 8609 8609 reference10 8927 8927 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 10:100025136|10:100025454
17 | False reference10 12609 12609 reference10 13123 13123 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 10:18503076|10:18503590
18 | False reference10 17109 17109 reference10 17899 17899 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 10:127745195|10:127745985
19 | False reference10 22109 22109 reference10 24330 24330 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 10:108030321|10:108032542
20 | False reference10 28109 28109 reference10 31827 31827 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 10:132909062|10:132912780
21 | False reference10 36109 36109 reference10 42159 42159 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 10:6411580|10:6417630
22 | False reference12 6001 6001 reference12 6016 6016 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 12:127413217|12:127413233 complex event
23 | False reference1 2000 2000 reference1 2001 2001 L R + + insertion genome convert_ta.py_v0.0.1 mock-A36971 1:8877520
24 | False reference16 2000 2000 reference16 2001 2001 L R + + insertion genome convert_ta.py_v0.0.1 mock-A36971 16:57847634
25 | False reference12 10000 10000 reference12 10001 10021 R L + + duplication genome convert_ta.py_v0.0.1 mock-A36971 12:53207583 reported as an insertion
26 | False reference17 1974 1974 reference17 2020 2020 R L + + duplication genome convert_ta.py_v0.0.1 mock-A36971 17:72889676 reported as an insertion
27 | False gene3 27175 27175 gene3 27176 27176 R L + + duplication genome convert_ta.py_v0.0.1 mock-A36971 1:207249992
28 | False gene5 608 608 gene1 33309 33309 R R + - inverted translocation genome convert_ta.py_v0.0.1 mock-A36971 7:26252971|15:40854190
29 | False gene2 19827 19827 gene2 27045 27045 R L + + duplication genome convert_ta.py_v0.0.1 mock-A36971 15:41621292|15:41628510
30 | False gene6 77430 77430 gene6 89472 89472 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 10:89700299|10:89712341
31 |
--------------------------------------------------------------------------------
/tests/data/build.cfg:
--------------------------------------------------------------------------------
1 | [general]
2 | batch_id = batch-aMfNsjq7NgyaJFfhU9ZHQS
3 | output_dir = /var/tmp/tmpfojhl9g1
4 | scheduler = SLURM
5 | concurrency_limit = None
6 |
7 | [MS_batch-aMfNsjq7NgyaJFfhU9ZHQS]
8 | stage = summary
9 | job_ident = None
10 | name = MS_batch-aMfNsjq7NgyaJFfhU9ZHQS
11 | dependencies = MP_batch-aMfNsjq7NgyaJFfhU9ZHQS
12 | script = /var/tmp/tmpfojhl9g1/summary/submit.sh
13 | status = UNKNOWN
14 | output_dir = /var/tmp/tmpfojhl9g1/summary
15 | stdout = /var/tmp/tmpfojhl9g1/summary/job-{name}-{job_ident}.log
16 | memory_limit = 16000
17 | queue =
18 | time_limit = 57600
19 | import_env = True
20 | mail_user =
21 | mail_type = NONE
22 |
23 | [MP_batch-aMfNsjq7NgyaJFfhU9ZHQS]
24 | stage = pairing
25 | job_ident = None
26 | name = MP_batch-aMfNsjq7NgyaJFfhU9ZHQS
27 | dependencies = MA_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS
28 | MA_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS
29 | script = /var/tmp/tmpfojhl9g1/pairing/submit.sh
30 | status = UNKNOWN
31 | output_dir = /var/tmp/tmpfojhl9g1/pairing
32 | stdout = /var/tmp/tmpfojhl9g1/pairing/job-{name}-{job_ident}.log
33 | memory_limit = 16000
34 | queue =
35 | time_limit = 57600
36 | import_env = True
37 | mail_user =
38 | mail_type = NONE
39 |
40 | [MV_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS]
41 | stage = validate
42 | job_ident = None
43 | name = MV_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS
44 | dependencies =
45 | script = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/validate/submit.sh
46 | status = UNKNOWN
47 | output_dir = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/validate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID
48 | stdout = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/validate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID/job-{name}-{job_ident}_{task_ident}.log
49 | memory_limit = 16000
50 | queue =
51 | time_limit = 57600
52 | import_env = True
53 | mail_user =
54 | mail_type = NONE
55 | task_list = 1
56 |
57 | [MV_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS]
58 | stage = validate
59 | job_ident = None
60 | name = MV_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS
61 | dependencies =
62 | script = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/validate/submit.sh
63 | status = UNKNOWN
64 | output_dir = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/validate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID
65 | stdout = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/validate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID/job-{name}-{job_ident}_{task_ident}.log
66 | memory_limit = 18000
67 | queue =
68 | time_limit = 57600
69 | import_env = True
70 | mail_user =
71 | mail_type = NONE
72 | task_list = 1
73 |
74 | [MA_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS]
75 | stage = annotate
76 | job_ident = None
77 | name = MA_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS
78 | dependencies = MV_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS
79 | script = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/annotate/submit.sh
80 | status = UNKNOWN
81 | output_dir = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/annotate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID
82 | stdout = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/annotate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID/job-{name}-{job_ident}_{task_ident}.log
83 | memory_limit = 12000
84 | queue =
85 | time_limit = 57600
86 | import_env = True
87 | mail_user =
88 | mail_type = NONE
89 | task_list = 1
90 |
91 | [MA_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS]
92 | stage = annotate
93 | job_ident = None
94 | name = MA_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS
95 | dependencies = MV_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS
96 | script = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/annotate/submit.sh
97 | status = UNKNOWN
98 | output_dir = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/annotate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID
99 | stdout = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/annotate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID/job-{name}-{job_ident}_{task_ident}.log
100 | memory_limit = 12000
101 | queue =
102 | time_limit = 57600
103 | import_env = True
104 | mail_user =
105 | mail_type = NONE
106 | task_list = 1
107 |
108 |
--------------------------------------------------------------------------------
/.github/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | ## Getting Started
2 |
3 | If you are new to the project, a good way to get started is by adding to the documentation or adding unit tests where
4 | there is a lack of code coverage.
5 |
6 | ## Install (for Development)
7 |
8 | Clone the repository and switch to the development branch
9 |
10 | ```bash
11 | git clone https://github.com/bcgsc/mavis.git
12 | cd mavis
13 | git checkout develop
14 | ```
15 |
16 | Set up a python virtual environment. If you are developing in python, setting up a virtual environment can be
17 | incredibly helpful as it allows for a clean install to test against. Instructions for setting up the environment
18 | are below
19 |
20 | ```bash
21 | python3 -m venv venv
22 | source venv/bin/activate
23 | ```
24 |
25 | Install the MAVIS python package. Running the setup in develop mode will ensure that your code changes are run when you
26 | run MAVIS from within that virtual environment
27 |
28 | ```bash
29 | pip install -e .[dev]
30 | ```
31 |
32 | Run the tests and compute code coverage
33 |
34 | ```bash
35 | pytest tests
36 | ```
37 |
38 | ## Build the Documentation
39 |
40 | ```bash
41 | pip install .[docs]
42 | markdown_refdocs mavis -o docs/package --link
43 | mkdocs build
44 | ```
45 |
46 | The contents of the user manual can then be viewed by opening build-docs/index.html
47 | in any available web browser (e.g. google-chrome, firefox, etc.)
48 |
49 | ## Deploy to PyPi
50 |
51 | Install deployment dependencies
52 |
53 | ```bash
54 | pip install .[deploy]
55 | ```
56 |
57 | Build the distribution files
58 |
59 | ```bash
60 | python setup.py sdist bdist_wheel
61 | ```
62 |
63 | Use twine to upload
64 |
65 | ```bash
66 | twine upload -r pypi dist/*
67 | ```
68 |
69 | ## Reporting a Bug
70 |
71 | Please make sure to search through the issues before reporting a bug to ensure there isn't
72 | already an open issue.
73 |
74 | ## Conventions
75 |
76 | ### Linting
77 |
78 | Use [black](https://github.com/psf/black) with string normalization off (`-S`) and a line length of 100
79 |
80 | ```bash
81 | black src/mavis -S -l 100
82 | ```
83 |
84 | ### Docstrings
85 |
86 | Docstrings should follow the [sphinx google code style](http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html)
87 |
88 | If you want to be more explicit with nested types, please follow the same format
89 | used by [python type annotations](https://docs.python.org/3/library/typing.html)
90 |
91 | ```text
92 | arg1 (List[str]): a list of strings
93 | ```
94 |
95 | However, using proper type annotations is preferred for new code; in that case include only the
96 | description of the parameter in the docstring and not its type
97 |
98 | ```python
99 | from typing import List
100 | def some_function(some_arg: List[str]) -> None:
101 | """
102 | Args:
103 | some_arg: this arg does stuff
104 | """
105 | ```
106 |
107 | ### Output Columns
108 |
109 | Any column name which may appear in any of the intermediate or final output files must be defined in `mavis.constants.COLUMNS` as well as added to the [columns glossary](../outputs/columns)
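
For example, code that builds output rows would reference the namespace rather than hard-coding strings (a rough illustration only; it assumes `COLUMNS` exposes each column name as an attribute holding the column-name string):

```python
from mavis.constants import COLUMNS

# hypothetical row construction; the values are made up for illustration
row = {
    COLUMNS.break1_chromosome: 'X',
    COLUMNS.break1_position_start: 1234,
}
```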
110 |
111 | ### Tests
112 |
113 | - all new code must have unit tests in the tests subdirectory
114 |
115 | Tests can be run as follows
116 |
117 | ```bash
118 | pytest tests
119 | ```
120 |
121 | ### Branching Model
122 |
123 | If you are working on a large feature, create a base branch for the feature off develop. Generally
124 | these follow the naming pattern
125 |
126 | ```bash
127 | git checkout -b integration/issue-<number>-<short-description>
128 | ```
129 |
130 | If you are working on a smaller feature then simply make a feature branch off develop
131 |
132 | ```bash
133 | git checkout -b feature/issue-<number>-<short-description>
134 | ```
135 |
136 | Once ready, a PR should be made to develop and review should be requested from the other developers.
137 |
138 | Releases are done by creating a release branch off develop
139 |
140 | ```bash
141 | git checkout -b release/vX.X.X
142 | ```
143 |
144 | Update the version number in setup.py in the release branch, and then make a PR to master.
145 | After the PR has been merged to master, a tag/release should be created with the release notes
146 | and a PR to merge master back into develop should be made
147 |
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: build
5 |
6 | on:
7 | push:
8 | branches:
9 | - master
10 | - develop
11 | pull_request:
12 |
13 | jobs:
14 | build:
15 | runs-on: ubuntu-20.04
16 | strategy:
17 | matrix:
18 | python-version: ["3.7", "3.8", "3.9", "3.10"]
19 | name: python-${{ matrix.python-version }}
20 | steps:
21 | - uses: actions/checkout@v2
22 | - name: install machine dependencies
23 | run: |
24 | sudo apt-get update
25 | sudo apt-get install -y libcurl4-openssl-dev
26 | - name: Set up Python ${{ matrix.python-version }}
27 | uses: actions/setup-python@v2
28 | with:
29 | python-version: ${{ matrix.python-version }}
30 | - name: Install dependencies
31 | run: |
32 | python -m pip install --upgrade pip setuptools
33 | pip install -e .[test] # need editable to make sure the coverage reports correctly
34 | - name: install bwa
35 | run: |
36 | git clone https://github.com/lh3/bwa.git
37 | cd bwa
38 | git checkout v0.7.17
39 | make
40 | cd ..
41 | - name: install blat
42 | run: |
43 | wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat
44 | chmod a+x blat
45 | - name: set up .pth file
46 | run: |
47 | python tests/setup_subprocess_cov.py
48 | - name: run full tests with pytest
49 | run: |
50 | export PATH=$PATH:$(pwd):$(pwd)/bwa
51 | export COVERAGE_PROCESS_START=$(pwd)/.coveragerc
52 |
53 | pytest tests -v \
54 | --junitxml=junit/test-results-${{ matrix.python-version }}.xml \
55 | --cov mavis \
56 | --cov tools.convert_annotations_format \
57 | --cov-report term-missing \
58 | --cov-report xml \
59 | --durations=10 \
60 | --cov-branch
61 | env:
62 | RUN_FULL: 1
63 | - name: Upload pytest test results
64 | uses: actions/upload-artifact@master
65 | with:
66 | name: pytest-results-${{ matrix.python-version }}
67 | path: junit/test-results-${{ matrix.python-version }}.xml
68 | # Use always() to always run this step to publish test results when there are test failures
69 | if: always()
70 | - name: Update code coverage report to CodeCov
71 | uses: codecov/codecov-action@v1
72 | with:
73 | token: ${{ secrets.CODECOV_TOKEN }}
74 | file: ./coverage.xml
75 | flags: unittests
76 | env_vars: OS,PYTHON
77 | name: codecov-umbrella
78 | fail_ci_if_error: true
79 | if: matrix.python-version == 3.8
80 | docker:
81 | runs-on: ubuntu-latest
82 | name: docker build
83 | steps:
84 | - uses: actions/checkout@v2
85 | - name: build the docker container
86 | run: |
87 | docker build --file Dockerfile --tag bcgsc/mavis:latest .
88 | - name: test the help menu
89 | run: |
90 | docker run bcgsc/mavis -h
91 | - name: Set up Python 3.7
92 | uses: actions/setup-python@v2
93 | with:
94 | python-version: 3.7
95 | - name: Install workflow dependencies
96 | run: |
97 | python -m pip install --upgrade pip setuptools wheel
98 | pip install mavis_config pandas
99 | - uses: eWaterCycle/setup-singularity@v6
100 | with:
101 | singularity-version: 3.6.4
102 | - name: docker2singularity
103 | run:
104 | docker run --mount type=bind,source=/var/run/docker.sock,target=/var/run/docker.sock --mount type=bind,source="$(pwd)",target=/output --privileged -t --rm singularityware/docker2singularity bcgsc/mavis:latest
105 | - name: Run analysis with snakemake & singularity
106 | run: |
107 | # get the SIMG filename
108 | export SNAKEMAKE_CONTAINER=$(ls *mavis*.simg)
109 | snakemake -j 2 --configfile tests/mini-tutorial.config.json --use-singularity
110 | if: always()
111 |
--------------------------------------------------------------------------------
/tests/test_tools/test_ref_alt_count.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import tempfile
4 |
5 | import pytest
6 |
7 | from mavis.annotate.file_io import load_reference_genome
8 | from mavis.breakpoint import Breakpoint, BreakpointPair
9 | from mavis.constants import ORIENT, SVTYPE
10 | from tools.calculate_ref_alt_counts import RefAltCalculator
11 |
12 | from ..util import get_data, glob_exists
13 |
14 |
15 | def setUpModule():
16 | global REFERENCE_GENOME
17 | REFERENCE_GENOME = load_reference_genome(get_data('mock_reference_genome.fa'))
18 | if (
19 | 'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT'
20 | != REFERENCE_GENOME['fake'].seq[0:50].upper()
21 | ):
22 | raise AssertionError('fake genome file does not have the expected contents')
23 |
24 |
25 | def print_file_tree(dirname):
26 | for root, dirs, files in os.walk(dirname):
27 | level = root.replace(dirname, '').count(os.sep)
28 | indent = ' ' * 4 * (level)
29 | print('{}{}/'.format(indent, os.path.basename(root)))
30 | subindent = ' ' * 4 * (level + 1)
31 | for f in files:
32 | print('{}{}'.format(subindent, f))
33 |
34 |
35 | @pytest.fixture
36 | def calculator():
37 | return RefAltCalculator(
38 | [("TEST", get_data('mock_reads_for_events.sorted.bam'))],
39 | REFERENCE_GENOME,
40 | max_event_size=100,
41 | buffer=20,
42 | )
43 |
44 |
45 | @pytest.fixture
46 | def temp_output():
47 | d = tempfile.mkdtemp()
48 | yield d
49 | shutil.rmtree(d)
50 |
51 |
52 | class TestFullCalculator:
53 | def test_calculate_all_counts(self, calculator, temp_output):
54 | calculator.calculate_all_counts(
55 | [get_data("mavis_summary_all_mock-A36971_mock-A47933.tab")],
56 | os.path.join(temp_output, "ref_alt_output.tab"),
57 | )
58 | assert glob_exists(temp_output, "ref_alt_output.tab")
59 |
60 |
61 | class TestRefAltCalculator:
62 | def test_calculate_count(self, calculator):
63 | ev1 = BreakpointPair(
64 | Breakpoint('reference11', 5999, orient=ORIENT.LEFT),
65 | Breakpoint('reference11', 6003, orient=ORIENT.RIGHT),
66 | opposing_strands=False,
67 | event_type=SVTYPE.DEL,
68 | )
69 | bpp = calculator.calculate_ref_counts(ev1)
70 | print(bpp.data)
71 | assert bpp.data["TEST_ref_count"] == 27
72 | assert bpp.data["TEST_alt_count"] == 14
73 | assert bpp.data['TEST_ignored_count'] == 188
74 |
75 | def test_calculate_count2(self, calculator):
76 | ev1 = BreakpointPair(
77 | Breakpoint('reference11', 9999, orient=ORIENT.LEFT),
78 | Breakpoint('reference11', 10030, orient=ORIENT.RIGHT),
79 | opposing_strands=False,
80 | event_type=SVTYPE.DEL,
81 | )
82 | bpp = calculator.calculate_ref_counts(ev1)
83 | print(bpp.data)
84 | assert bpp.data["TEST_ref_count"] == 0
85 | assert bpp.data["TEST_alt_count"] == 63
86 | assert bpp.data['TEST_ignored_count'] == 195
87 |
88 | def test_calculate_count3(self, calculator):
89 | ev1 = BreakpointPair(
90 | Breakpoint('reference1', 2002, orient=ORIENT.LEFT),
91 | Breakpoint('reference1', 2003, orient=ORIENT.RIGHT),
92 | opposing_strands=False,
93 | event_type=SVTYPE.INS,
94 | untemplated_seq='TT',
95 | )
96 | bpp = calculator.calculate_ref_counts(ev1)
97 | print(bpp.data)
98 | assert bpp.data["TEST_ref_count"] == 0
99 | assert bpp.data["TEST_alt_count"] == 23
100 | assert bpp.data['TEST_ignored_count'] == 145
101 |
102 | def test_calculate_count4(self, calculator):
103 | ev1 = BreakpointPair(
104 | Breakpoint('reference11', 1999, orient=ORIENT.LEFT),
105 | Breakpoint('reference11', 2001, orient=ORIENT.RIGHT),
106 | opposing_strands=False,
107 | event_type=SVTYPE.DEL,
108 | )
109 | bpp = calculator.calculate_ref_counts(ev1)
110 | print(bpp.data)
111 | assert bpp.data["TEST_ref_count"] == 0
112 | assert bpp.data["TEST_alt_count"] == 50
113 | assert bpp.data['TEST_ignored_count'] == 191
114 |
--------------------------------------------------------------------------------
/tests/data/mock_reference_annotations2.json:
--------------------------------------------------------------------------------
1 | {
2 | "genes": [
3 | {
4 | "aliases": [
5 | ],
6 | "chr": "fake",
7 | "end": 200,
8 | "name": "GENE-A",
9 | "start": 100,
10 | "strand": "+",
11 | "transcripts": [
12 | {
13 | "aliases": [
14 | ],
15 | "cdna_coding_end": null,
16 | "cdna_coding_start": null,
17 | "domains": [
18 | ],
19 | "end": 200,
20 | "exons": [
21 | ],
22 | "is_best_transcript": true,
23 | "name": "TRANSCRIPT-A",
24 | "start": 100
25 | }
26 | ]
27 | },
28 | {
29 | "aliases": [
30 | ],
31 | "chr": "fake",
32 | "end": 350,
33 | "name": "GENE-B",
34 | "start": 250,
35 | "strand": "-",
36 | "transcripts": [
37 | {
38 | "aliases": [
39 | ],
40 | "cdna_coding_end": null,
41 | "cdna_coding_start": null,
42 | "domains": [
43 | ],
44 | "end": 350,
45 | "exons": [
46 | ],
47 | "is_best_transcript": true,
48 | "name": "TRANSCRIPT-B",
49 | "start": 250
50 | }
51 | ]
52 | },
53 | {
54 | "aliases": [
55 | ],
56 | "chr": "fake",
57 | "end": 400,
58 | "name": "GENE-C",
59 | "start": 300,
60 | "strand": "+",
61 | "transcripts": [
62 | {
63 | "aliases": [
64 | ],
65 | "cdna_coding_end": null,
66 | "cdna_coding_start": null,
67 | "domains": [
68 | ],
69 | "end": 400,
70 | "exons": [
71 | ],
72 | "is_best_transcript": true,
73 | "name": "TRANSCRIPT-C",
74 | "start": 300
75 | }
76 | ]
77 | },
78 | {
79 | "aliases": [
80 | ],
81 | "chr": "fake",
82 | "end": 550,
83 | "name": "GENE-D",
84 | "start": 450,
85 | "strand": "-",
86 | "transcripts": [
87 | {
88 | "aliases": [
89 | ],
90 | "cdna_coding_end": null,
91 | "cdna_coding_start": null,
92 | "domains": [
93 | ],
94 | "end": 550,
95 | "exons": [
96 | ],
97 | "is_best_transcript": true,
98 | "name": "TRANSCRIPT-D",
99 | "start": 450
100 | }
101 | ]
102 | },
103 | {
104 | "aliases": [
105 | ],
106 | "chr": "fake",
107 | "end": 600,
108 | "name": "GENE-E",
109 | "start": 500,
110 | "strand": "+",
111 | "transcripts": [
112 | {
113 | "aliases": [
114 | ],
115 | "cdna_coding_end": null,
116 | "cdna_coding_start": null,
117 | "domains": [
118 | ],
119 | "end": 600,
120 | "exons": [
121 | ],
122 | "is_best_transcript": true,
123 | "name": "TRANSCRIPT-E",
124 | "start": 500
125 | }
126 | ]
127 | },
128 | {
129 | "aliases": [
130 | ],
131 | "chr": "fake",
132 | "end": 650,
133 | "name": "GENE-F",
134 | "start": 550,
135 | "strand": "+",
136 | "transcripts": [
137 | {
138 | "aliases": [
139 | ],
140 | "cdna_coding_end": null,
141 | "cdna_coding_start": null,
142 | "domains": [
143 | ],
144 | "end": 650,
145 | "exons": [
146 | ],
147 | "is_best_transcript": true,
148 | "name": "TRANSCRIPT-F",
149 | "start": 550
150 | }
151 | ]
152 | }
153 | ]
154 | }
155 |
--------------------------------------------------------------------------------
/docs/hooks.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from textwrap import dedent
4 |
5 | import pkg_resources
6 | from markdown_refdocs.main import extract_to_markdown
7 | from mavis_config import DEFAULTS
8 | from mavis.util import ENV_VAR_PREFIX
9 |
10 |
11 | def json_to_pytype(record):
12 | input_type = record
13 | try:
14 | input_type = record['type']
15 | except TypeError:
16 | pass
17 | types = {
18 | 'string': 'str',
19 | 'integer': 'int',
20 | 'float': 'float',
21 | 'boolean': 'bool',
22 | 'number': 'float',
23 | }
24 |
25 | if input_type == 'array':
26 | try:
27 | sub_type = json_to_pytype(record['items']['type'])
28 | return f'List[{sub_type}]'
29 | except TypeError:
30 | return 'List'
31 |
32 | if isinstance(input_type, list):
33 | # Union
34 | types = ', '.join([json_to_pytype(t) for t in input_type])
35 | return f'Union[{types}]'
36 | return types.get(input_type, input_type)
37 |
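# Worked examples (illustrative only; these calls are not made by the hooks themselves):
#   json_to_pytype({'type': 'array', 'items': {'type': 'string'}})  returns 'List[str]'
#   json_to_pytype({'type': ['string', 'integer']})                 returns 'Union[str, int]'
#   json_to_pytype({'type': 'number'})                              returns 'float'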
38 |
39 | def list_properties(schema, skip_terms=tuple()):
40 | glossary = {}
41 | for term, defn in schema['properties'].items():
42 | if term in skip_terms:
43 | continue
44 | typ = json_to_pytype(defn)
45 | desc = defn.get('description', '')
46 | default_value = defn.get('default')
47 | schema_fields = {k: v for k, v in defn.items() if k not in ['description', 'default']}
48 |
49 | if len(schema_fields) > 1:
50 | schema_defn = json.dumps(
51 | schema_fields,
52 | sort_keys=True,
53 | indent=' ',
54 | )
55 | schema_defn = f'**schema definition**:\n```json\n{schema_defn}\n```\n'
56 | else:
57 | schema_defn = ''
58 |
59 | lines = [
60 | f'### {term}',
61 | f'**type**: `#!python {typ}`',
62 | f'**default**: `#!python {repr(default_value)}`' if default_value is not None else '',
63 | desc,
64 | schema_defn,
65 | ]
66 | glossary[term] = '\n\n'.join(lines)
67 | return [v for k, v in sorted(glossary.items())]
68 |
69 |
70 | def generate_settings_doc(schema_file):
71 | with open(schema_file, 'r') as fh:
72 | schema = json.load(fh)
73 | dirname = os.path.dirname(os.path.abspath(__file__))
74 | filepath = 'configuration/settings.md'
75 | title = 'Configurable Settings'
76 |
77 | fname = os.path.join(dirname, filepath)
78 |
79 | result = [f'\n\n# {title}\n']
80 | result.append(
81 | dedent(
82 | '''\
83 | ## Defining Samples/Libraries
84 |
85 | The `libraries` property of the mavis config is required to run the snakemake
86 | workflow. This is the section that defines what inputs to use, and what types of
87 | samples are available.
88 |
89 | ```json
90 | {
91 | "libraries": {
92 |                     "<library name>": { } // mapping of library name to library settings
93 | }
94 | }
95 | ```
96 |
97 | The library specific settings are listed below
98 | '''
99 | )
100 | )
101 | result.extend(list_properties(schema['properties']['libraries']['additionalProperties']))
102 | result.append(
103 | dedent(
104 | '''\
105 | ## Defining Conversions
106 |
107 | If the input to MAVIS is raw tool output and has not been pre-converted to the
108 | standard tab delimited format expected by MAVIS then you will need to add
109 | a section to the config to tell mavis how to perform the required conversions
110 |
111 | ```json
112 | {
113 | "convert": {
114 |                     "<alias>": { } // mapping of alias to conversion settings
115 | }
116 | }
117 | ```
118 |
119 | The conversion specific settings are listed below
120 | '''
121 | )
122 | )
123 | result.extend(list_properties(schema['properties']['convert']['additionalProperties']))
124 | result.append('\n## General Settings\n')
125 | result.extend(list_properties(schema, ('libraries', 'convert')))
126 |
127 | print('writing:', fname)
128 | with open(fname, 'w') as fh:
129 | fh.write('\n\n'.join(result) + '\n')
130 |
131 |
132 | def build_package_docs(config):
133 | schema_file = pkg_resources.resource_filename('mavis_config', 'config.json')
134 | generate_settings_doc(schema_file)
135 | package_dir = os.path.join(os.path.dirname(__file__), '../src/mavis')
136 | output_dir = os.path.join(os.path.dirname(__file__), 'package')
137 |
138 | extract_to_markdown(
139 | [package_dir],
140 | output_dir,
141 | link=True,
142 | hide_private=True,
143 | hide_undoc=True,
144 | hide_undoc_args=True,
145 | namespace_headers=False,
146 | )
147 |
--------------------------------------------------------------------------------
/docs/inputs/standard.md:
--------------------------------------------------------------------------------
1 | # MAVIS standard input file format
2 |
3 | These requirements pertain to the columns of input files from the
4 | various tools you want to merge. The input files should be tab-delimited
5 | text files. Comments at the top of the file may be included. Comments should
6 | begin with hash marks; they will be ignored when the file is read
7 |
8 | ```text
9 | ## This is a comment
10 | ```
11 |
12 | The header row contains the column names and is the first row following
13 | the comments (or the first row if no comments are included).
14 |
15 | ```text
16 | ## This is a comment
17 | ## this is another comment
18 | # this is also a comment
19 | This Is The Header
20 | ```
21 |
22 | A simple input file might look as follows
23 |
24 | ```text
25 | ## File created at: 2018-01-02
26 | ## Generated by: MAVIS v1.0.0
27 | break1_chromosome break1_position_start break1_position_end break2_chromosome break2_position_start break2_position_end
28 | X 1234 1234 X 77965 77965
29 | ```
30 |
31 | ## Required Columns
32 |
33 | - [break1_chromosome](../../outputs/columns/#break1_chromosome)
34 | - [break1_position_start](../../outputs/columns/#break1_position_start)
35 | - [break1_position_end](../../outputs/columns/#break1_position_end) (can be the same as break1\_position\_start)
36 | - [break2_chromosome](../../outputs/columns/#break2_chromosome)
37 | - [break2_position_start](../../outputs/columns/#break2_position_start)
38 | - [break2_position_end](../../outputs/columns/#break2_position_end) (can be the same as break2\_position\_start)
39 |
40 | ## Optional Columns
41 |
42 | Optional columns that are not given as input will be added with default
43 | values (or values from the corresponding command line options) during the clustering stage of MAVIS,
44 | as some are required for subsequent pipeline steps
45 |
46 | - [break1_strand](../../outputs/columns/#break1_strand) (defaults to not-specified during clustering)
47 | - [break1_orientation](../../outputs/columns/#break1_orientation) (expanded to all possible values during clustering)
48 | - [break2_strand](../../outputs/columns/#break2_strand) (defaults to not-specified during clustering)
49 | - [break2_orientation](../../outputs/columns/#break2_orientation) (expanded to all possible values during clustering)
50 | - [opposing_strands](../../outputs/columns/#opposing_strands) (expanded to all possible values during clustering)
51 | - [stranded](../../outputs/columns/#stranded) (defaults to False during clustering)
52 | - [library](../../outputs/columns/#library) (defaults to command line library parameter during clustering)
53 | - [protocol](../../outputs/columns/#protocol) (defaults to command line protocol parameter during clustering)
54 | - [tools](../../outputs/columns/#tools) (defaults to an empty string during clustering)
55 |
56 | ## Summary by Pipeline Step
57 |
58 | The different pipeline steps of MAVIS have different input column
59 | requirements. These are summarized below (for the pipeline steps which
60 | can act as the pipeline start)
61 |
62 | | column name | cluster | annotate | validate |
63 | | --------------------------------------------------------------------- | ------- | -------- | -------- |
64 | | [break1_chromosome](../../outputs/columns/#break1_chromosome) | ✓ | ✓ | ✓ |
65 | | [break1_position_start](../../outputs/columns/#break1_position_start) | ✓ | ✓ | ✓ |
66 | | [break1_position_end](../../outputs/columns/#break1_position_end) | ✓ | ✓ | ✓ |
67 | | [break2_chromosome](../../outputs/columns/#break2_chromosome) | ✓ | ✓ | ✓ |
68 | | [break2_position_start](../../outputs/columns/#break2_position_start) | ✓ | ✓ | ✓ |
69 | | [break2_position_end](../../outputs/columns/#break2_position_end) | ✓ | ✓ | ✓ |
70 | | [break1_strand](../../outputs/columns/#break1_strand) | | | |
71 | | [break1_orientation](../../outputs/columns/#break1_orientation) | | ✓ | ✓ |
72 | | [break2_strand](../../outputs/columns/#break2_strand) | | | |
73 | | [break2_orientation](../../outputs/columns/#break2_orientation) | | ✓ | ✓ |
74 | | [opposing_strands](../../outputs/columns/#opposing_strands) | | | |
75 | | [stranded](../../outputs/columns/#stranded) | | | |
76 | | [library](../../outputs/columns/#library) | | | |
77 | | [protocol](../../outputs/columns/#protocol) | | | |
78 | | [tools](../../outputs/columns/#tools) | | | |
79 | | [event_type](../../outputs/columns/#event_type) | | | |
80 |
81 | Some native tool outputs are [supported](../../inputs/support/#sv-callers) and
82 | have built-in methods to convert to the above format. Any unsupported
83 | tool can be used as long as the user converts the tool's native output
84 | to match the above format (a rough sketch of such a conversion is shown below).
85 |
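Below is a rough conversion sketch (not part of MAVIS; the caller's column names `chrom1`, `pos1`, `chrom2`, `pos2` and the file names are hypothetical) showing how an unsupported caller's output could be reshaped into the required columns with pandas:

```python
import pandas as pd

# read the hypothetical caller's native output
calls = pd.read_csv('my_caller_output.tsv', sep='\t')

# build a table containing the six required MAVIS columns plus the tools tag
standard = pd.DataFrame(
    {
        'break1_chromosome': calls['chrom1'],
        'break1_position_start': calls['pos1'],
        'break1_position_end': calls['pos1'],
        'break2_chromosome': calls['chrom2'],
        'break2_position_start': calls['pos2'],
        'break2_position_end': calls['pos2'],
        'tools': 'my_caller_v1.0.0',
    }
)
standard.to_csv('my_caller_converted.tab', sep='\t', index=False)
```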
--------------------------------------------------------------------------------
/tests/data/transabyss_events.tab:
--------------------------------------------------------------------------------
1 | id contig contig_size genomic_regions contig_regions strands flanking_pairs breakpoint_pairs spanning_reads spanning_reads_forward spanning_reads_reverse rearrangement breakpoint size genes transcripts senses exons/introns exon_bounds reciprocal descriptor orientations 5'gene 3'gene 5'exon 3'exon frame probe repeat1 repeat2 alignment_params type dbsnp dgv
2 | 227 893920 186 1:207981139-207981233,1:208014818-208014912 92-186,1-95 -,- 20 0,6 9 6 3 deletion 1:207981233|1:208014818 33584 NA,C1orf132 NA,ENST00000415882 NA,+ NA,intron2 NA,NA NA del1q32.2 L,R NA NA NA NA NA atggaaaaaggggaaacaaccttagggcagtcagacttctctatgaattcctCTCTCTGATCTGATGGGAATGCACTAGACTGTGAAACTTCCTCCTCCACC - - TO:0.00,CO:0.02,CC:1.00,I1:100.0,I2:100.0,AF1:0.51,AF2:0.51 LSR - NA
3 | 236 4567117 30066 1:224646603-224662564,1:224786034-224800120 1-15947,15985-30066 -,- 50 0,10 25 12 13 duplication 1:224646603|1:224800120 153516 NA,NA NA,NA NA,NA NA,NA NA,NA NA dup1q42.12 R,L NA NA NA NA NA attttccccttttcttgaaaagttgctgcaaagcgctcccctcctaagttgctagagcagctcacagaactgctatagtaagttttgGAGTACTAAAGGCATAGCTCAGTCTCCTCCTCAAGATTAAGAAATGCCCC - L1MEg TO:0.00,CO:0.00,CC:1.00,I1:99.9,I2:99.9,AF1:0.53,AF2:0.47 LSR NA NA
4 | 35 4556542 53631 10:89659755-89700299,10:89712341-89725438 1-40511,40530-53631 +,+ 64 0,24 43 22 21 deletion 10:89700299|10:89712341 12041 PTEN,PTEN ENST00000371953,ENST00000371953 +,+ intron5,intron6 NA,NA NA del10q23.31 L,R PTEN PTEN 5 7 in cagatctgcaaagatcaacctgtcctaagtcatataatctctttgtgtaagagattatactttgtgtaAGAGGTCCACCAGAGGAGTTCAGCAATTTGCTGCTCTTAGGGCAGGGATC - TRF_SimpleTandemRepeat_CAGAGGTCCAG TO:0.00,CO:0.00,CC:1.00,I1:100.0,I2:100.0,AF1:0.76,AF2:0.24 LSR - NA
5 | 28 3113294 240 10:7059511-7059605,19:17396666-17396811 146-240,1-146 +,+ 27 0,8 15 9 6 translocation 10:7059511|19:17396811 - NA,ANKLE1 NA,ENST00000404261 NA,+ NA,intron8 NA,NA NA t(10;19)(p14;p13.11) R,L NA NA NA NA NA gcatgtattttgctccattggtttatccccactcaagggcaatacacatTCAAAGCATAAAAATTACATGACCTATGATATTTATTTTGCTAAGATTTT - - TO:0.00,CO:0.00,CC:1.00,I1:100.0,I2:99.3,AF1:0.40,AF2:0.61 LSR NA NA
6 | 63 indel_k96_4449027 1742 12:104359630-104359778,12:125801148-125802740 1-149,150-1742 -,+ 33 0,6 9 5 4 inversion 12:104359630|12:125801148 21441517 TDG,NA ENST00000392872,NA -,NA exon1,NA no,NA NA inv12q23.3-q24.31 R,R NA NA NA NA NA gctggactcaagctcctcctccaggcttctaccgtcccccacggacccccCTGAGTAGATGATTTTCAGCTGAGGTCTGAGTAGTGGGAAGGGACTGACT - L2a TO:0.00,CO:0.00,CC:1.00,I1:100.0,I2:99.8,AF1:0.09,AF2:0.91 LSR NA NA
7 | 634 2130795 190 7:150746563-150746657,15:84810725-84810819 96-190,1-95 -,- 67 0,9 21 13 8 translocation 7:150746657|15:84810725 - ASIC3,NA ENST00000357922,NA -,NA intron1,NA NA,NA NA t(7;15)(q36.1;q25.2) L,R NA NA NA NA NA aacaggtacaattagggagaggctatgtcaatgcaggaaaaggtcttatcGGCACTGGGGGGTGGGGAGTCCATGGCTGGTAGGAAGGAAGAGGTCCCCT - segdup_chr15:82483003 TO:0.00,CO:0.00,CC:1.00,I1:100.0,I2:100.0,AF1:0.50,AF2:0.50 LSR NA NA
8 | 296 281201 187 3:24565106-24565200,3:24566179-24566273 93-187,1-95 +,- 118 0,19 44 20 24 inversion 3:24565106|3:24566179 1072 NA,NA NA,NA NA,NA NA,NA NA,NA NA inv3p24.2 R,R NA NA NA NA NA tcgtgtttcattctgcctgagagcagtctacctaaatatatagctctgctcACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACA - - TO:0.00,CO:0.02,CC:1.00,I1:100.0,I2:98.9,AF1:0.51,AF2:0.51 LSR NA NA
9 | 625 1719994 191 7:125746029-125746123,7:126166901-126166995 1-95,97-191 +,+ 59 0,9 24 13 11 deletion 7:125746123|7:126166901 420777 NA,GRM8 NA,ENST00000339582 NA,- NA,intron9 NA,NA NA del7q31.33 L,R NA NA NA NA NA atgaagaagaaaagagaaatttttaaataggtagtagcagaaattataaatGCATATCATTTAAATTAAGAGCATAAATGAGGCCACATAAATGCTTTCTT L1PA15-16 - TO:0.00,CO:0.00,CC:0.99,I1:100.0,I2:100.0,AF1:0.50,AF2:0.50 LSR - NA
10 | 617 4285174 188 7:104485067-104485161,7:104612208-104612302 1-95,94-188 -,- 72 0,12 30 16 14 duplication 7:104485067|7:104612302 127234 LHFPL3,NA ENST00000535008,NA -,NA intron4,NA NA,NA NA dup7q22.2-q22.3 R,L NA NA NA NA NA ttagacatcattgttgtttttattttatctttggtttcctcaggcaatacCCTTGGAATGACACATTATCCTCCCTTCACATGTAGCAATTGTAAATTCC - - TO:0.00,CO:0.01,CC:1.00,I1:100.0,I2:100.0,AF1:0.51,AF2:0.51 LSR NA NA
11 | 445 2769447 187 7:126098488-126098582,7:126167441-126167535 93-187,1-95 +,- 62 0,11 27 16 11 inversion 7:126098488|7:126167441 68952 GRM8,GRM8 ENST00000339582,ENST00000339582 -,+ intron9,intron9 NA,NA NA inv7q31.33 R,R - - - - NA atcgttaatcactgcatataactatcttaggctacctgttggtaaactataTGCAAAGAATATATATACACACATACAATTAATCCATTATCACAATGTAT - - TO:0.00,CO:0.02,CC:1.00,I1:100.0,I2:100.0,AF1:0.51,AF2:0.51 LSR NA NA
12 | 527 3739669 201 9:28031863-28031957,9:28034467-28034561 1-95,107-201 -,+ 23 0,6 16 10 6 inversion 9:28031863|9:28034467 2603 LINGO2,LINGO2 ENST00000379992,ENST00000379992 +,- intron4,intron4 NA,NA NA inv9p21.1 R,R - - - - NA ccagattgaaggtattttaaggaggatttggagcatcatggtgaagcgtgaattccgaaaaGAAAGCTCAGCCTGGCTTTTGTGGCCCAGAAGCCCAGAATTTCAGCAACT - - TO:0.00,CO:0.00,CC:0.95,I1:100.0,I2:100.0,AF1:0.47,AF2:0.47 LSR NA NA
13 | 747 3253092 198 X:31196849-31196943,X:31216211-31216305 1-95,104-198 +,+ 36 0,14 20 11 9 deletion X:31196943|X:31216211 19267 DMD,DMD ENST00000357033,ENST00000357033 -,- intron69,intron67 NA,NA NA delXp21.2 L,R DMD DMD 67 70 in aagtctcgaacatcttctcctgatgtagtctaaaagggagatcatggtgaatgtagtgAATGTAGTGAAGATCGGGGGATAAAAAAGGGATGGTTAATGGGTACAAAA - L1MA4 TO:0.00,CO:0.00,CC:0.96,I1:100.0,I2:100.0,AF1:0.48,AF2:0.48 LSR - NA
14 |
--------------------------------------------------------------------------------
/docs/install.md:
--------------------------------------------------------------------------------
1 | # Install Instructions
2 |
3 | Once the install steps are complete [MAVIS](http://mavis.bcgsc.ca) is ready to be run.
4 | See the MAVIS [tutorial](https://mavis.readthedocs.io/en/latest/tutorials/mini) to learn about running MAVIS.
5 |
6 | For either install option you will want to download the main Snakefile. It is best to use a tag to
7 | specify the version of interest, but you can download the latest version from the master branch
8 |
9 | ```bash
10 | wget https://raw.githubusercontent.com/bcgsc/mavis/master/Snakefile -O Snakefile
11 | ```
12 |
13 | ## Install for Docker/Singularity
14 |
15 | The simplest way to use MAVIS is via Singularity. The MAVIS docker container used
16 | by singularity will take care of installing the aligner as well.
17 |
18 | ```bash
19 | pip install -U setuptools pip wheel
20 | pip install mavis_config # also installs snakemake
21 | ```
22 |
23 | Now you will run mavis via Snakemake as follows
24 |
25 | ```bash
26 | snakemake \
27 |     -j <number of jobs> \
28 |     --configfile <your MAVIS config JSON> \
29 | --use-singularity \
30 | -s Snakefile
31 | ```
32 |
33 | ## Install (Python Only)
34 |
35 | MAVIS can also be run with just python. However, you will need to install the aligner(s) required
36 | by MAVIS separately and ensure they are available on the PATH variable when MAVIS is run
37 |
38 | ### 1. Install Aligner
39 |
40 | In addition to the python package dependencies, [MAVIS](http://mavis.bcgsc.ca) also requires an aligner to be installed.
41 | Currently the only aligners supported are [blat](https://mavis.readthedocs.io/en/latest/glossary/#blat) and [bwa mem](https://mavis.readthedocs.io/en/latest/glossary/#bwa).
42 | For MAVIS to run successfully the aligner must be installed and accessible on the path.
43 | If you have a non-standard install you may find it useful to edit the PATH environment variable. For example
44 |
45 | ``` bash
46 | export PATH=/path/to/directory/containing/blat/binary:$PATH
47 | ```
48 |
49 | [blat](http://mavis.bcgsc.ca/docs/latest/glossary.html#term-blat) is the default aligner. To configure MAVIS to use [bwa mem](http://mavis.bcgsc.ca/docs/latest/glossary.html#term-bwa) it must be specified
50 | in the [config](https://mavis.readthedocs.io/en/latest/configuration/settings/) JSON file.
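
For example, a minimal snippet of the config JSON selecting bwa might look like the following (a sketch assuming the `validate.aligner` setting name):

``` json
{
    "validate.aligner": "bwa mem"
}
```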
51 |
52 | After this has been installed MAVIS itself can be installed through [pip](https://pypi.org/project/mavis/)
53 |
54 | ### 2. Install MAVIS
55 |
56 | #### Install using pip
57 |
58 | The easiest way to install [MAVIS](http://mavis.bcgsc.ca) is through the python package manager, pip. If you do not have python3 installed it can be found [here](https://www.python.org/downloads)
59 |
60 | Ensuring you have a recent version of pip and setuptools will improve the install experience. Older versions of pip and setuptools may have issues with obtaining some of the mavis python dependencies
61 |
62 | ``` bash
63 | pip install --upgrade pip setuptools
64 | ```
65 |
66 | or (for Anaconda users)
67 |
68 | ``` bash
69 | conda update pip setuptools
70 | ```
71 |
72 | If this is not a clean/new python install it may be useful to set up mavis in a [virtual python environment](https://docs.python.org/3/tutorial/venv.html)
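
For example

``` bash
python3 -m venv venv
source venv/bin/activate
```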
73 |
74 | Then install mavis itself
75 |
76 | ``` bash
77 | pip install mavis
78 | ```
79 |
80 | This will install mavis and its python dependencies.
81 |
82 | #### Install using Buildout
83 |
84 | Alternatively you can use the [bootstrap/buildout](http://www.buildout.org/en/latest/) to install mavis into bin/mavis
85 |
86 | ``` bash
87 | git clone https://github.com/bcgsc/mavis.git
88 | cd mavis
89 | pip install zc.buildout
90 | python bootstrap.py
91 | bin/buildout
92 | ```
93 |
94 | This will install mavis and its python dependencies into eggs inside the cloned mavis directory which can be used by simply running bin/mavis
95 |
96 | Finally, you will need to build or download the necessary reference files
97 |
98 | ## Build or Download Reference Files
99 |
100 | After [MAVIS](http://mavis.bcgsc.ca) is installed the [reference files](https://mavis.readthedocs.io/en/latest/inputs/reference) must be generated (or downloaded) before it can be run. A simple bash script to download the hg19 reference files is provided under mavis/tools for convenience.
101 |
102 | ### Download Hg19 Files
103 |
104 | ``` bash
105 | cd /path/to/where/you/want/to/put/the/files
106 | wget https://raw.githubusercontent.com/bcgsc/mavis/master/src/tools/get_hg19_reference_files.sh
107 | bash get_hg19_reference_files.sh
108 | ```
109 |
110 | You should now see the reference files in the current directory
111 |
112 | ```text
113 | .
114 | |-- cytoBand.txt
115 | |-- dgv_hg19_variants.tab
116 | |-- ensembl69_hg19_annotations.json
117 | |-- get_hg19_reference_files.sh
118 | |-- hg19.2bit
119 | |-- hg19.fa
120 | `-- hg19_masking.tab
121 | ```
122 |
123 | ### Download Hg38 Files
124 |
125 | ``` bash
126 | cd /path/to/where/you/want/to/put/the/files
127 | wget https://raw.githubusercontent.com/bcgsc/mavis/master/src/tools/get_hg38_reference_files.sh
128 | bash get_hg38_reference_files.sh
129 | ```
130 |
131 | You should now see the reference files in the current directory
132 |
133 | ```text
134 | .
135 | |-- cytoBand.txt
136 | |-- dgv_hg38_variants.tab
137 | |-- ensembl79_hg38_annotations.json
138 | |-- get_hg38_reference_files.sh
139 | |-- GCA_000001405.15_GRCh38_no_alt_analysis_set.fna
140 | |-- GRCh38_masking.tab
141 | `-- hg38.2bit
142 | ```
143 |
--------------------------------------------------------------------------------
/tests/test_mavis/convert/test_tools_vcf.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from mavis.convert.vcf import VcfInfoType, VcfRecordType, convert_record, pandas_vcf
4 |
5 | from ...util import get_data
6 |
7 |
8 | def test_read_vcf():
9 | header, df = pandas_vcf(get_data('sniffles.vcf'))
10 | assert len(header) == 231
11 | assert df.shape[0] == 106
12 |
13 |
14 | def test_convert_telomeric_region():
15 | variant_imprecise = VcfRecordType(
16 | id='mock-BND-imprecise',
17 | pos=0,
18 | chrom='chr14_KI270722v1_random',
19 | alts=['N[chr17_GL000205v2_random:0['],
20 | ref='N',
21 | info=VcfInfoType(
22 | IMPRECISE=True,
23 | SVMETHOD="Snifflesv1.0.11",
24 | SVTYPE="BND",
25 | SUPTYPE="SR",
26 | SVLEN="0",
27 | STRANDS="+-",
28 | RE="5",
29 | REF_strand="0,0",
30 | AF="1",
31 | ),
32 | )
33 | variant_precise = VcfRecordType(
34 | id='mock-BND-precise',
35 | pos=0,
36 | chrom='chr14_KI270722v1_random',
37 | alts=[']chrUn_GL000216v2:142821]N'],
38 | ref='N',
39 | info=VcfInfoType(
40 | IMPRECISE=False,
41 | SVMETHOD="Snifflesv1.0.11",
42 | SVTYPE="BND",
43 | SUPTYPE="SR",
44 | SVLEN="0",
45 | STRANDS="+-",
46 | RE="5",
47 | REF_strand="0,0",
48 | AF="1",
49 | ),
50 | )
51 | imprecise_records = convert_record(variant_imprecise)
52 | assert len(imprecise_records) == 1
53 | imprecise_records = imprecise_records[0]
54 | assert imprecise_records.get('break1_position_end') == 1
55 |
56 | precise_records = convert_record(variant_precise)
57 | assert len(precise_records) == 1
58 | precise_records = precise_records[0]
59 | assert precise_records.get('break1_position_end') == 1
60 |
61 | assert precise_records.get('break1_chromosome') == 'chr14_KI270722v1_random'
62 | assert imprecise_records.get('break1_chromosome') == 'chr14_KI270722v1_random'
63 |
64 |
65 | TEST_POS = 1853407
66 |
67 |
68 | @pytest.mark.parametrize(
69 | 'pos,break1_ci,break2_ci,break1,break2,ids',
70 | [
71 | [
72 | TEST_POS,
73 | (-30, 30),
74 | (-65, 65),
75 | (TEST_POS - 30, TEST_POS + 30),
76 | (TEST_POS - 30, TEST_POS + 65),
77 | 'vcf-cuteSV.INS.breakpoint_2_start < breakpoint_1_start',
78 | ],
79 | [
80 | TEST_POS,
81 | (-30, 99999),
82 | (-10, 65),
83 | (TEST_POS - 30, TEST_POS + 65),
84 | (TEST_POS - 10, TEST_POS + 65),
85 | 'vcf-cuteSV.INS.breakpoint_1_end > breakpoint_2_end',
86 | ],
87 | ],
88 | ids=[
89 | 'breakpoint_2_start < breakpoint_1_start',
90 | 'breakpoint_1_end > breakpoint_2_end',
91 | ],
92 | )
93 | def test_convert_intrachromosomal_imprecise_breakend(
94 | pos, break1_ci, break2_ci, break1, break2, ids
95 | ):
96 | variant_vcf = VcfRecordType(
97 | id=ids,
98 | pos=pos,
99 | chrom='chr5',
100 | alts=['AGG'],
101 | ref='A',
102 | info=VcfInfoType(
103 | CHR2="chr5",
104 | IMPRECISE=True,
105 | SVMETHOD="cuteSV-1.0.12",
106 | SVTYPE="INS",
107 | CIPOS=break1_ci,
108 | CILEN=break2_ci,
109 | ),
110 | )
111 | result = convert_record(variant_vcf)
112 | assert len(result) == 1
113 | variant = result[0]
114 | assert variant.get('break1_position_start') == break1[0]
115 | assert variant.get('break1_position_end') == break1[1]
116 | assert variant.get('break2_position_start') == break2[0]
117 | assert variant.get('break2_position_end') == break2[1]
118 |
119 |
120 | @pytest.mark.parametrize(
121 | 'pos,break1_ci,break2_ci,break1,break2,ids',
122 | [
123 | [
124 | TEST_POS,
125 | (-30, 99999),
126 | (70, 65),
127 | (TEST_POS - 30, TEST_POS + 65),
128 | (TEST_POS + 65, TEST_POS + 65),
129 | 'vcf-cuteSV.INS.breakpoint_2_start > breakpoint_2_end',
130 | ],
131 | ],
132 | ids=[
133 | 'breakpoint_2_start > breakpoint_2_end',
134 | ],
135 | )
136 | def test_error_on_convert_intrachromosomal_imprecise_breakend(
137 | pos, break1_ci, break2_ci, break1, break2, ids
138 | ):
139 | variant_vcf = VcfRecordType(
140 | id=ids,
141 | pos=pos,
142 | chrom='chr5',
143 | alts=['AGG'],
144 | ref='A',
145 | info=VcfInfoType(
146 | CHR2="chr5",
147 | IMPRECISE=True,
148 | SVMETHOD="cuteSV-1.0.12",
149 | SVTYPE="INS",
150 | CIPOS=break1_ci,
151 | CILEN=break2_ci,
152 | ),
153 | )
154 | with pytest.raises(ValueError):
155 | convert_record(variant_vcf)
156 |
157 |
158 | def test_convert_intrachromosomal_imprecise_breakend_no_ci():
159 | # breakpoint_1_start > breakpoint_1_end
160 | variant_cilen4 = VcfRecordType(
161 | id='Sniffle.INS',
162 | pos=11184,
163 | chrom='chr2',
164 | alts=['AGG'],
165 | ref='N',
166 | info=VcfInfoType(
167 | CHR2="chr2",
168 | IMPRECISE=True,
169 | SVTYPE="INS",
170 | END=11183,
171 | ),
172 | )
173 | with pytest.raises(ValueError):
174 | convert_record(variant_cilen4)
175 |
--------------------------------------------------------------------------------
/src/mavis/overlay.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Dict, List, Tuple
3 |
4 | from . import annotate as _annotate
5 | from . import util as _util
6 | from .annotate.file_io import ReferenceFile
7 | from .error import DrawingFitError
8 | from .illustrate.constants import DiagramSettings
9 | from .illustrate.diagram import draw_multi_transcript_overlay
10 | from .illustrate.scatter import bam_to_scatter
11 |
12 |
13 | def check_overlay_args(args, parser):
14 | """
15 | parse the overlay options and check the formatting
16 | """
17 | # check complex options
18 | for marker in args.markers:
19 | if len(marker) < 3:
20 | marker.append(marker[-1])
21 | try:
22 | marker[1] = int(marker[1])
23 | marker[2] = int(marker[2])
24 | except ValueError:
25 | parser.error('argument --marker: start and end must be integers: {}'.format(marker))
26 |
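    # each read_depth_plots entry is expected as [axis_name, bam_file, density, ymax, stranded];
    # missing trailing values are filled in from the defaults below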
27 | defaults = [None, None, 0.5, None, True]
28 | bam_file, density, ymax, stranded = range(1, 5)
29 |
30 | for plot in args.read_depth_plots:
31 | for i, d in enumerate(defaults):
32 | if i >= len(plot):
33 | plot.append(d)
34 | if not os.path.exists(plot[bam_file]):
35 | parser.error(
36 | 'argument --read_depth_plots: the bam file given does not exist: {}'.format(
37 | plot[bam_file]
38 | )
39 | )
40 | try:
41 | plot[density] = float(plot[density])
42 | if plot[density] < 0 or plot[density] > 1:
43 | raise ValueError()
44 | except ValueError:
45 | parser.error(
46 |                 'argument --read_depth_plots: density must be a float between 0 and 1: {}'.format(
47 | plot[density]
48 | )
49 | )
50 | try:
51 | if str(plot[ymax]).lower() in ['null', 'none']:
52 | plot[ymax] = None
53 | else:
54 | plot[ymax] = int(plot[ymax])
55 | except ValueError:
56 | parser.error(
57 | 'argument --read_depth_plots: ymax must be an integer: {}'.format(plot[ymax])
58 | )
59 | try:
60 | plot[stranded] = _util.cast_boolean(plot[stranded])
61 | except TypeError:
62 | parser.error(
63 |                 'argument --read_depth_plots: stranded must be a boolean: {}'.format(
64 | plot[stranded]
65 | )
66 | )
67 | return args
68 |
69 |
70 | def main(
71 | gene_name: str,
72 | output: str,
73 | config: Dict,
74 | buffer_length: int,
75 | read_depth_plots,
76 | markers: List[Tuple[str, int, int]],
77 | ymax_color='#FF0000',
78 | **kwargs,
79 | ):
80 | """
81 | generates an overlay diagram
82 | """
83 | annotations = ReferenceFile.load_from_config(config, 'annotations')
84 | annotations.load()
85 | drawing_width_iter_increase = config['illustrate.drawing_width_iter_increase']
86 | max_drawing_retries = config['illustrate.max_drawing_retries']
87 | min_mapping_quality = config['validate.min_mapping_quality']
88 | # check options formatting
89 | gene_to_draw = None
90 |
91 | for chrom in annotations.content:
92 | for gene in annotations.content[chrom]:
93 | if gene_name in gene.aliases or gene_name == gene.name:
94 | gene_to_draw = gene
95 | _util.logger.info(
96 | f'Found target gene: {gene.name}(aka. {gene.aliases}) {gene.chr}:{gene.start}-{gene.end}'
97 | )
98 | break
99 | if gene_to_draw is None:
100 | raise KeyError('Could not find gene alias or id in annotations file', gene_name)
101 |
102 | settings = DiagramSettings(**kwargs)
103 |
104 | genomic_min = max(gene_to_draw.start - buffer_length, 1)
105 | genomic_max = gene_to_draw.end + buffer_length
106 |
107 | plots = []
108 | for axis_name, bam_file, density, ymax, stranded in read_depth_plots:
109 | # one plot per bam
110 | plots.append(
111 | bam_to_scatter(
112 | bam_file,
113 | gene_to_draw.chr,
114 | genomic_min,
115 | genomic_max,
116 | strand=gene_to_draw.get_strand() if stranded else None,
117 | ymax=ymax,
118 | density=density,
119 | axis_name=axis_name,
120 | min_mapping_quality=min_mapping_quality,
121 | ymax_color=ymax_color,
122 | )
123 | )
124 |
125 | vmarkers = []
126 |
127 | for i, (marker_name, marker_start, marker_end) in enumerate(markers):
128 | vmarkers.append(
129 | _annotate.base.BioInterval(gene_to_draw.chr, marker_start, marker_end, name=marker_name)
130 | )
131 |
132 | canvas = None
133 | attempts = 1
134 | while True:
135 | try:
136 | canvas = draw_multi_transcript_overlay(
137 | settings,
138 | gene_to_draw,
139 | vmarkers=vmarkers,
140 | plots=plots,
141 | window_buffer=buffer_length,
142 | )
143 | break
144 | except DrawingFitError as err:
145 | if attempts > max_drawing_retries:
146 | raise err
147 | _util.logger.info(f'Drawing fit: extending window {drawing_width_iter_increase}')
148 | settings.width += drawing_width_iter_increase
149 | attempts += 1
150 |
151 | svg_output_file = os.path.join(output, '{}_{}_overlay.svg'.format(gene_to_draw.name, gene_name))
152 | _util.logger.info(f'writing: {svg_output_file}')
153 |
154 | canvas.saveas(svg_output_file)
155 |
--------------------------------------------------------------------------------
/src/mavis/config.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from copy import copy as _copy
3 | from typing import Dict
4 |
5 | from .annotate.file_io import ReferenceFile
6 | from .bam import stats
7 | from .bam.cache import BamCache
8 | from .constants import PROTOCOL, float_fraction
9 | from .util import cast_boolean, filepath
10 |
11 |
12 | def calculate_bam_stats(config: Dict, library_name: str) -> Dict:
13 | """
14 | Calculate the read stats for a library from a given bam file
15 | """
16 | library = config['libraries'][library_name]
17 | annotations = ReferenceFile('annotations', *config['reference.annotations'])
18 |
19 | if library['protocol'] == PROTOCOL.TRANS:
20 | if annotations is None or annotations.is_empty():
21 | raise AttributeError(
22 | 'missing required attribute: annotations. Annotations must be given for transcriptomes'
23 | )
24 | annotations.load()
25 | bam = BamCache(library['bam_file'], stranded=library['strand_specific'])
26 | if library['protocol'] == PROTOCOL.TRANS:
27 | bam_stats = stats.compute_transcriptome_bam_stats(
28 | bam,
29 | annotations=annotations.content,
30 | sample_size=config['bam_stats.sample_size'],
31 | sample_cap=config['bam_stats.sample_cap'],
32 | distribution_fraction=config['bam_stats.distribution_fraction'],
33 | )
34 | return {
35 | 'median_fragment_size': int(bam_stats.median_fragment_size),
36 | 'read_length': int(bam_stats.read_length),
37 | 'stdev_fragment_size': int(bam_stats.stdev_fragment_size),
38 | 'strand_specific': bam_stats.stranded,
39 | 'strand_determining_read': bam_stats.strand_determining_read,
40 | }
41 | bam_stats = stats.compute_genome_bam_stats(
42 | bam,
43 | sample_size=config['bam_stats.sample_size'],
44 | sample_bin_size=config['bam_stats.sample_bin_size'],
45 | sample_cap=config['bam_stats.sample_cap'],
46 | distribution_fraction=config['bam_stats.distribution_fraction'],
47 | )
48 | return {
49 | 'median_fragment_size': int(bam_stats.median_fragment_size),
50 | 'read_length': int(bam_stats.read_length),
51 | 'stdev_fragment_size': int(bam_stats.stdev_fragment_size),
52 | }
53 |
54 |
55 | class CustomHelpFormatter(argparse.ArgumentDefaultsHelpFormatter):
56 | """
57 | subclass the default help formatter to stop default printing for required arguments
58 | """
59 |
60 | def _format_args(self, action, default_metavar):
61 | if action.metavar is None:
62 | action.metavar = get_metavar(action.type)
63 | if isinstance(action, RangeAppendAction):
64 | return '%s' % self._metavar_formatter(action, default_metavar)(1)
65 | return super(CustomHelpFormatter, self)._format_args(action, default_metavar)
66 |
67 | def _get_help_string(self, action):
68 | if action.required:
69 | return action.help
70 | return super(CustomHelpFormatter, self)._get_help_string(action)
71 |
72 | def add_arguments(self, actions):
73 | # sort the arguments alphanumerically so they print in the help that way
74 | actions = sorted(actions, key=lambda x: getattr(x, 'option_strings'))
75 | super(CustomHelpFormatter, self).add_arguments(actions)
76 |
77 |
78 | class RangeAppendAction(argparse.Action):
79 | """
80 | allows an argument to accept a range of arguments
81 | """
82 |
83 | def __init__(self, nmin=1, nmax=None, **kwargs):
84 | kwargs.setdefault('nargs', '+')
85 | kwargs.setdefault('default', [])
86 | argparse.Action.__init__(self, **kwargs)
87 | self.nmin = nmin
88 | self.nmax = nmax
89 | assert nmin is not None
90 |
91 | def __call__(self, parser, namespace, values, option_string=None):
92 | if getattr(namespace, self.dest, None) is None:
93 | setattr(namespace, self.dest, [])
94 | items = _copy(getattr(namespace, self.dest))
95 | items.append(values)
96 | if self.nmax is None:
97 | if len(values) < self.nmin:
98 | raise argparse.ArgumentError(
99 | self, 'must have at least {} arguments. Given: {}'.format(self.nmin, values)
100 | )
101 | elif not self.nmin <= len(values) <= self.nmax:
102 | raise argparse.ArgumentError(
103 | self, 'requires {}-{} arguments. Given: {}'.format(self.nmin, self.nmax, values)
104 | )
105 | setattr(namespace, self.dest, items)
106 |
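# Illustrative usage of RangeAppendAction (hypothetical option name; not wired up in this module):
#   parser = argparse.ArgumentParser(formatter_class=CustomHelpFormatter)
#   parser.add_argument('--marker', action=RangeAppendAction, nmin=2, nmax=3)
#   "--marker exon1 100 200" appends ['exon1', '100', '200'] to namespace.marker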
107 |
108 | def add_bamstats_to_config(config: Dict):
109 | """
110 |     Calculate and add the bam stats (median_fragment_size, read_length, and
111 |     stdev_fragment_size) for each library in the config that does not already
112 |     define them
113 |     """
114 |     for libname, library in config['libraries'].items():
115 |         # calculate the bam_stats if they have not been given
116 | if any(
117 | [
118 | col not in library
119 | for col in ['median_fragment_size', 'read_length', 'stdev_fragment_size']
120 | ]
121 | ):
122 | library.update(calculate_bam_stats(config, libname))
123 |
124 |
125 | def get_metavar(arg_type):
126 | """
127 | For a given argument type, returns the string to be used for the metavar argument in add_argument
128 |
129 | Example:
130 | >>> get_metavar(bool)
131 | '{True,False}'
132 | """
133 | if arg_type in [bool, cast_boolean]:
134 | return '{True,False}'
135 | elif arg_type in [float_fraction, float]:
136 | return 'FLOAT'
137 | elif arg_type == int:
138 | return 'INT'
139 | elif arg_type == filepath:
140 | return 'FILEPATH'
141 | return None
142 |
--------------------------------------------------------------------------------