├── src
    ├── mavis
    │   ├── py.typed
    │   ├── bam
    │   │   └── __init__.py
    │   ├── annotate
    │   │   ├── __init__.py
    │   │   └── constants.py
    │   ├── illustrate
    │   │   └── __init__.py
    │   ├── pairing
    │   │   ├── __init__.py
    │   │   └── constants.py
    │   ├── summary
    │   │   ├── __init__.py
    │   │   └── constants.py
    │   ├── validate
    │   │   ├── __init__.py
    │   │   └── constants.py
    │   ├── cluster
    │   │   └── __init__.py
    │   ├── __init__.py
    │   ├── types.py
    │   ├── error.py
    │   ├── convert
    │   │   ├── starfusion.py
    │   │   ├── straglr.py
    │   │   ├── cnvnator.py
    │   │   ├── breakdancer.py
    │   │   ├── arriba.py
    │   │   ├── chimerascan.py
    │   │   ├── constants.py
    │   │   └── transabyss.py
    │   ├── overlay.py
    │   └── config.py
    └── tools
    │   ├── __init__.py
    │   ├── get_hg38_reference_files.sh
    │   ├── get_hg19_reference_files.sh
    │   └── find_repeats.py
├── tests
    ├── __init__.py
    ├── snakemake
    │   └── __init__.py
    ├── test_mavis
    │   ├── __init__.py
    │   ├── bam
    │   │   └── __init__.py
    │   ├── annotate
    │   │   ├── __init__.py
    │   │   ├── test_annotate_fileio2.py
    │   │   └── test_annotate_fileio.py
    │   ├── cluster
    │   │   ├── __init__.py
    │   │   └── test_cluster.py
    │   ├── convert
    │   │   ├── __init__.py
    │   │   └── test_tools_vcf.py
    │   ├── pairing
    │   │   └── __init__.py
    │   ├── summary
    │   │   └── __init__.py
    │   ├── validate
    │   │   ├── __init__.py
    │   │   └── test_validate.py
    │   ├── illustrate
    │   │   ├── __init__.py
    │   │   └── test_illustrate.py
    │   ├── test_constants.py
    │   ├── test_blat.py
    │   └── test_help.py
    ├── test_tools
    │   ├── __init__.py
    │   ├── test_convert_dgv.py
    │   ├── data
    │   │   ├── ensembl69_hg19_annotations.kras.tab
    │   │   └── K02718.1.gff3
    │   ├── test_convert_annotations_format.py
    │   └── test_ref_alt_count.py
    ├── data
    │   ├── mock_masking.tab
    │   ├── mock_reference_genome.fa.amb
    │   ├── mock_dgv_annotation_malformed.tab
    │   ├── pindel_events.vcf.gz
    │   ├── mock_reference_genome.2bit
    │   ├── mock_reference_genome.fa.sa
    │   ├── mock_reference_genome.fa.bwt
    │   ├── mock_reference_genome.fa.pac
    │   ├── mock_reads_for_events.sorted.bam
    │   ├── mini_mock_reads_for_events.sorted.bam
    │   ├── mock_reads_for_events.sorted.bam.bai
    │   ├── mock_trans_reads_for_events.sorted.bam
    │   ├── mini_mock_reads_for_events.sorted.bam.bai
    │   ├── mock_trans_reads_for_events.sorted.bam.bai
    │   ├── mock_dgv_annotation.tab
    │   ├── reference_from_env.cfg
    │   ├── mini_mock_sv_events.svmerge.tsv
    │   ├── pairing_reference_annotations_file.tab
    │   ├── clustering_input.tab
    │   ├── bad_input_file.cfg
    │   ├── straglr.bed
    │   ├── cnvnator.tab
    │   ├── mock_trans_sv_events.tsv
    │   ├── mock_pairing_input.tab
    │   ├── mock_reference_genome.fa.ann
    │   ├── mock_reference_annotations.json
    │   ├── breakdancer_output.txt
    │   ├── bwa_pipeline_config.cfg
    │   ├── missing_reference.cfg
    │   ├── clean_pipeline_config.cfg
    │   ├── no_opt_pipeline.cfg
    │   ├── Library-clusterset-N.validated.tsv
    │   ├── pipeline_config.cfg
    │   ├── mock_reference_annotations.full.json
    │   ├── transabyss_indels_output.tab
    │   ├── mock_dgv_annotation_mavis.tab
    │   ├── mock_sv_events.tsv
    │   ├── build.cfg
    │   ├── mock_reference_annotations2.json
    │   └── transabyss_events.tab
    ├── setup_subprocess_cov.py
    ├── util.py
    ├── mini-tutorial.annotate_only.config.json
    ├── mini-tutorial.config.json
    └── full-tutorial.config.json
├── requirements.txt
├── docs
    ├── index.md
    ├── extra.css
    ├── background
    │   ├── .pages
    │   └── citations.md
    ├── tutorials
    │   ├── .pages
    │   ├── mini.md
    │   └── annotation.md
    ├── inputs
    │   ├── .pages
    │   ├── non_python_dependencies.md
    │   └── standard.md
    ├── images
    │   ├── icon.png
    │   ├── ENSG00000139687_RB1_overlay.png
    │   ├── snakemake.cluster.full-tutorial.png
    │   ├── snakemake.cluster.mini-tutorial.png
    │   ├── snakemake.validate.mini-tutorial.png
    │   ├── colo829_tumour_annotation_resource_req.png
    │   ├── colo829_tumour_validation_resource_req.png
    │   ├── get_app-24px.svg
    │   └── Fusion-ext.gpl
    ├── outputs
    │   ├── index.md
    │   └── illustrations.md
    ├── package
    │   └── mavis
    │   │   ├── summary
    │   │       └── index.md
    │   │   ├── pairing
    │   │       └── index.md
    │   │   ├── cluster
    │   │       └── index.md
    │   │   ├── annotate
    │   │       └── index.md
    │   │   └── validate
    │   │       └── index.md
    ├── configuration
    │   ├── pipeline.md
    │   ├── performance.md
    │   └── general.md
    ├── migrating.md
    ├── development.md
    ├── hooks.py
    └── install.md
├── pyproject.toml
├── codecov.yml
├── MANIFEST.in
├── .coveragerc
├── .readthedocs.yml
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── feature_request.md
    │   └── bug_report.md
    ├── workflows
    │   ├── publish.yml
    │   ├── quick-tests.yml
    │   └── build.yml
    └── CONTRIBUTING.md
├── .gitignore
├── mkdocs.yml
├── setup.py
├── env
    ├── example.sh
    └── generate_ensembl79_annotations.sh
├── Dockerfile
├── setup.cfg
└── README.md


/src/mavis/py.typed:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | .
2 | 


--------------------------------------------------------------------------------
/src/tools/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/mavis/bam/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/snakemake/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/test_mavis/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/test_tools/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/mavis/annotate/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/mavis/illustrate/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/mavis/pairing/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/mavis/summary/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/mavis/validate/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/test_mavis/bam/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | {!./../README.md!}
2 | 


--------------------------------------------------------------------------------
/tests/test_mavis/annotate/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/test_mavis/cluster/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/test_mavis/convert/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/test_mavis/pairing/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/test_mavis/summary/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/test_mavis/validate/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/test_mavis/illustrate/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/data/mock_masking.tab:
--------------------------------------------------------------------------------
1 | chr	start	end	name


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | build-backend = "setuptools.build_meta"
2 | 


--------------------------------------------------------------------------------
/tests/data/mock_reference_genome.fa.amb:
--------------------------------------------------------------------------------
1 | 1054073 24 0
2 | 


--------------------------------------------------------------------------------
/docs/extra.css:
--------------------------------------------------------------------------------
1 | td + td > a {
2 |     display: flex;
3 | }
4 | 


--------------------------------------------------------------------------------
/docs/background/.pages:
--------------------------------------------------------------------------------
1 | nav:
2 |   - theory.md
3 |   - citations.md 
4 | 


--------------------------------------------------------------------------------
/docs/tutorials/.pages:
--------------------------------------------------------------------------------
1 | nav:
2 |   - mini.md
3 |   - full.md 
4 |   - ...
5 | 


--------------------------------------------------------------------------------
/src/mavis/validate/constants.py:
--------------------------------------------------------------------------------
1 | PASS_FILENAME = 'validation-passed.tab'
2 | 


--------------------------------------------------------------------------------
/docs/inputs/.pages:
--------------------------------------------------------------------------------
1 | nav:
2 |   - reference.md
3 |   - standard.md
4 |   - ...
5 | 


--------------------------------------------------------------------------------
/docs/images/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/icon.png


--------------------------------------------------------------------------------
/tests/data/mock_dgv_annotation_malformed.tab:
--------------------------------------------------------------------------------
1 | chromosome	beginning	ending	unknown
2 | 


--------------------------------------------------------------------------------
/tests/data/pindel_events.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/pindel_events.vcf.gz


--------------------------------------------------------------------------------
/tests/data/mock_reference_genome.2bit:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reference_genome.2bit


--------------------------------------------------------------------------------
/tests/data/mock_reference_genome.fa.sa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reference_genome.fa.sa


--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | coverage:
2 |   status:
3 |     project:
4 |       default:
5 |         target: 80%
6 |         threshold: 1%
7 | 


--------------------------------------------------------------------------------
/src/mavis/cluster/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ['merge_breakpoint_pairs']
2 | 
3 | 
4 | from .cluster import merge_breakpoint_pairs
5 | 


--------------------------------------------------------------------------------
/tests/data/mock_reference_genome.fa.bwt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reference_genome.fa.bwt


--------------------------------------------------------------------------------
/tests/data/mock_reference_genome.fa.pac:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reference_genome.fa.pac


--------------------------------------------------------------------------------
/docs/images/ENSG00000139687_RB1_overlay.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/ENSG00000139687_RB1_overlay.png


--------------------------------------------------------------------------------
/tests/data/mock_reads_for_events.sorted.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reads_for_events.sorted.bam


--------------------------------------------------------------------------------
/docs/images/snakemake.cluster.full-tutorial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/snakemake.cluster.full-tutorial.png


--------------------------------------------------------------------------------
/docs/images/snakemake.cluster.mini-tutorial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/snakemake.cluster.mini-tutorial.png


--------------------------------------------------------------------------------
/docs/images/snakemake.validate.mini-tutorial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/snakemake.validate.mini-tutorial.png


--------------------------------------------------------------------------------
/tests/data/mini_mock_reads_for_events.sorted.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mini_mock_reads_for_events.sorted.bam


--------------------------------------------------------------------------------
/tests/data/mock_reads_for_events.sorted.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reads_for_events.sorted.bam.bai


--------------------------------------------------------------------------------
/tests/data/mock_trans_reads_for_events.sorted.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_trans_reads_for_events.sorted.bam


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include src *.py *.json
2 | include src/mavis/py.typed
3 | include README.md
4 | include LICENSE
5 | prune docs
6 | prune tests
7 | 


--------------------------------------------------------------------------------
/tests/data/mini_mock_reads_for_events.sorted.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mini_mock_reads_for_events.sorted.bam.bai


--------------------------------------------------------------------------------
/tests/data/mock_trans_reads_for_events.sorted.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_trans_reads_for_events.sorted.bam.bai


--------------------------------------------------------------------------------
/docs/images/colo829_tumour_annotation_resource_req.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/colo829_tumour_annotation_resource_req.png


--------------------------------------------------------------------------------
/docs/images/colo829_tumour_validation_resource_req.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/colo829_tumour_validation_resource_req.png


--------------------------------------------------------------------------------
/docs/outputs/index.md:
--------------------------------------------------------------------------------
1 | # Tab Delimited Files
2 | 
3 | Column names of the output files are documented in the [column names](../../outputs/columns)
4 | section.
5 | 


--------------------------------------------------------------------------------
/src/mavis/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | holds submodules related to structural variants
3 | """
4 | import pkg_resources
5 | 
6 | __version__ = pkg_resources.require('mavis')[0].version
7 | 


--------------------------------------------------------------------------------
/docs/images/get_app-24px.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" height="24" viewBox="0 0 24 24" width="24">
2 |     <path d="M0 0h24v24H0z" fill="none" />
3 |     <path d="M19 9h-4V3H9v6H5l7 7 7-7zM5 18v2h14v-2H5z" />
4 | </svg>
5 | 


--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
 1 | [run]
 2 | parallel = True
 3 | concurrency = multiprocessing
 4 | 
 5 | [html]
 6 | directory = coverage
 7 | title = mavis coverage report
 8 | 
 9 | [report]
10 | exclude_lines = 
11 |     pragma: no cover
12 |     if TYPE_CHECKING:
13 | 


--------------------------------------------------------------------------------
/docs/images/Fusion-ext.gpl:
--------------------------------------------------------------------------------
 1 | GIMP Palette
 2 | Name: Fusions-ext
 3 | #
 4 | 0 0 0
 5 | 255 255 255
 6 | 199 217 143
 7 | 82 103 43
 8 | 133 152 97
 9 | 42 67 36
10 | 184 211 186
11 | 76 150 119
12 | 123 221 193
13 | 50 85 86
14 | 125 195 216
15 | 101 126 145
16 | 81 141 197
17 | 38 40 61
18 | 186 178 226
19 | 58 52 105
20 | 124 111 170
21 | 


--------------------------------------------------------------------------------
/tests/data/mock_dgv_annotation.tab:
--------------------------------------------------------------------------------
 1 | chr	start	end	name
 2 | 1	1	2300000	nsv482937
 3 | 1	10001	22118	dgv1n82
 4 | 1	10001	22120	rgv2n98
 5 | 1	10001	22221	rgv2n99
 6 | 1	10001	127330	nsv7879
 7 | 1	10191	10281	nsv958854
 8 | 1	10377	177417	nsv428112
 9 | 1	10377	1018704	esv2758911
10 | 1	10499	177368	esv27265
11 | 1	11099	47000	nsv1147468
12 | 1	11100	29200	dgv1n106
13 | 


--------------------------------------------------------------------------------
/tests/test_mavis/annotate/test_annotate_fileio2.py:
--------------------------------------------------------------------------------
 1 | from mavis.annotate.file_io import load_annotations
 2 | 
 3 | from ...util import get_data
 4 | 
 5 | JSON = get_data('annotations_subsample.json')
 6 | 
 7 | 
 8 | class TestAnnotationLoading:
 9 |     def test_load_json(self):
10 |         result = load_annotations(JSON)
11 |         assert len(result.keys()) == 12
12 | 


--------------------------------------------------------------------------------
/tests/setup_subprocess_cov.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | 
 4 | for p in sys.path:
 5 |     if p.endswith('site-packages'):
 6 |         pth_file = os.path.join(p, 'subprocess-coverage.pth')
 7 |         print('writing path file:', pth_file)
 8 |         with open(pth_file, 'w') as fh:
 9 |             fh.write('import coverage\n\ncoverage.process_startup()\n')
10 |         break
11 | 


--------------------------------------------------------------------------------
/src/mavis/types.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Helper classes for type hints
 3 | """
 4 | 
 5 | from typing import TYPE_CHECKING, Dict, List, Tuple
 6 | 
 7 | from Bio.SeqRecord import SeqRecord
 8 | 
 9 | if TYPE_CHECKING:
10 |     from .annotate.genomic import Gene
11 | 
12 | ReferenceGenome = Dict[str, SeqRecord]
13 | ReferenceAnnotations = Dict[str, List['Gene']]
14 | CigarTuples = List[Tuple[int, int]]
15 | 


--------------------------------------------------------------------------------
/tests/data/reference_from_env.cfg:
--------------------------------------------------------------------------------
 1 | [reference]
 2 | 
 3 | [mock-A36971]
 4 | read_length = 150
 5 | median_fragment_size = 400
 6 | stdev_fragment_size = 97
 7 | bam_file = tests/data/mock_reads_for_events.sorted.bam
 8 | protocol = genome
 9 | inputs = tests/data/mock_sv_events.tsv
10 | strand_specific = False
11 | disease_status=diseased
12 | 
13 | [cluster]
14 | uninformative_filter = True
15 | limit_to_chr = None
16 | 


--------------------------------------------------------------------------------
/src/mavis/error.py:
--------------------------------------------------------------------------------
 1 | class NotSpecifiedError(Exception):
 2 |     """
 3 |     raised when information is required for a function but has not been given
 4 | 
 5 |     for example if strand was required but had been set to STRAND.NS then this
 6 |     error would be raised
 7 |     """
 8 | 
 9 |     pass
10 | 
11 | 
12 | class DrawingFitError(Exception):
13 |     pass
14 | 
15 | 
16 | class InvalidRearrangement(Exception):
17 |     pass
18 | 


--------------------------------------------------------------------------------
/tests/data/mini_mock_sv_events.svmerge.tsv:
--------------------------------------------------------------------------------
1 | #start_chromosome	start_position	end_chromosome	end_position	start_orientation	end_orientation	start_strand	end_strand	protocol	tool_version	libraries	tool_evidence	comments	filters	flanking_reads	mapping_quality	split_reads
2 | reference3	1114-1114	reference3	2187-2187	R	R	+	-	genome	convert_ta.py_v0.0.1	A36971						
3 | reference10	519-519	reference19	965-965	R	L	+	+	genome	convert_ta.py_v0.0.1	A36971						
4 | 


--------------------------------------------------------------------------------
/docs/package/mavis/summary/index.md:
--------------------------------------------------------------------------------
 1 | # Sub-package Documentation
 2 | 
 3 | This sub-package is responsible for summarizing the calls across libraries. In many cases
 4 | this is where a call is determined to be somatic vs germline, or genomic-only vs expressed.
 5 | 
 6 | ## Output Files
 7 | 
 8 | | expected name/suffix    | file type/format | content |
 9 | | ----------------------- | ---------------- | ------- |
10 | | ``mavis_summary_*.tab`` | text/tabbed      | ?       |
11 | 


--------------------------------------------------------------------------------
/tests/data/pairing_reference_annotations_file.tab:
--------------------------------------------------------------------------------
1 | ## input file used to map hugo gene names: compiled_gene_drug_pathway.v1_2_5.tsv
2 | ## input file for picking best transcript: ens69_best_transcript.txt
3 | ## Ensembl Api version 69
4 | ## generated at: Thu Aug  4 16:38:01 2016
5 | ensembl_gene_id	hugo_names	chr	strand	gene_start	gene_end	best_ensembl_transcript_id	ensembl_transcript_id	refseq_equivalents	transcript_genomic_start	transcript_genomic_end	cdna_coding_start	cdna_coding_end	genomic_exon_ranges	AA_domain_ranges
6 | 


--------------------------------------------------------------------------------
/src/mavis/summary/constants.py:
--------------------------------------------------------------------------------
 1 | from ..constants import MavisNamespace
 2 | 
 3 | HOMOPOLYMER_MIN_LENGTH = 3
 4 | 
 5 | 
 6 | class PAIRING_STATE(MavisNamespace):
 7 |     EXP = 'expressed'
 8 |     NO_EXP = 'not expressed'
 9 |     SOMATIC = 'somatic'
10 |     GERMLINE = 'germline'
11 |     CO_EXP = 'co-expressed'
12 |     GERMLINE_EXP = 'germline expression'
13 |     SOMATIC_EXP = 'somatic expression'
14 |     MATCH = 'matched'
15 |     NO_MATCH = 'not matched'
16 |     GENOMIC = 'genomic support'
17 |     NO_GENOMIC = 'no genomic support'
18 | 


--------------------------------------------------------------------------------
/src/mavis/pairing/constants.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict
 2 | 
 3 | from mavis_config import DEFAULTS
 4 | 
 5 | from ..constants import CALL_METHOD
 6 | 
 7 | PAIRING_DISTANCES: Dict[str, int] = {
 8 |     CALL_METHOD.FLANK: DEFAULTS['pairing.flanking_call_distance'],
 9 |     CALL_METHOD.SPAN: DEFAULTS['pairing.spanning_call_distance'],
10 |     CALL_METHOD.SPLIT: DEFAULTS['pairing.split_call_distance'],
11 |     CALL_METHOD.CONTIG: DEFAULTS['pairing.contig_call_distance'],
12 |     CALL_METHOD.INPUT: DEFAULTS['pairing.input_call_distance'],
13 | }
14 | 


--------------------------------------------------------------------------------
/tests/data/clustering_input.tab:
--------------------------------------------------------------------------------
1 | tracking_id	event_type	break1_chromosome	break1_position_start	break1_position_end	break1_orientation	break1_strand	break1_seq	break2_chromosome	break2_position_start	break2_position_end	break2_orientation	break2_strand	break2_seq	opposing_strands	stranded	tools	protocol
2 | manta-MantaDEL:175574:0:0:0:0:0	deletion	15	67333523	67333619	L	?	None	15	67333581	67333581	R	?	None	False	False	manta	genome
3 | strelka-TyeSomZhWTRakEu6ZJ7up6	deletion	15	67333623	67333623	L	?	None	15	67333625	67333625	R	?	None	False	False	strelka	genome
4 | 


--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Build documentation with MkDocs
 9 | mkdocs:
10 |   configuration: mkdocs.yml
11 |   fail_on_warning: false
12 | 
13 | # Optionally build your docs in additional formats such as PDF and ePub
14 | formats: all
15 | 
16 | # Optionally set the version of Python and requirements required to build your docs
17 | python:
18 |   version: 3.7
19 |   install:
20 |     -   method: pip
21 |         path: .
22 |         extra_requirements:
23 |             - doc
24 | 


--------------------------------------------------------------------------------
/tests/data/bad_input_file.cfg:
--------------------------------------------------------------------------------
 1 | [reference]
 2 | template_metadata = tests/data/cytoBand.txt
 3 | annotations = tests/data/mock_annotations.json
 4 | masking = tests/data/mock_masking.tab
 5 | reference_genome = tests/data/mock_reference_genome.fa
 6 | aligner_reference = tests/data/mock_reference_genome.2bit
 7 | dgv_annotation = tests/data/mock_dgv_annotation.txt
 8 | 
 9 | [cluster]
10 | uninformative_filter = True
11 | limit_to_chr = None
12 | 
13 | [mock-A36971]
14 | read_length = 150
15 | median_fragment_size = 400
16 | stdev_fragment_size = 97
17 | bam_file = tests/data/mock_reads_for_events.sorted.bam
18 | protocol = genome
19 | inputs = mock_converted.tab
20 | strand_specific = False
21 | disease_status=diseased
22 | 
23 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: enhancement
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # python generated files
 2 | /.eggs
 3 | /coverage
 4 | /venv*
 5 | /.coverage
 6 | *.pyc
 7 | *__pycache__
 8 | build-docs
 9 | *.egg-info*
10 | build
11 | *coverage*
12 | dist
13 | junit
14 | .pytest*
15 | .tox
16 | *eggs/
17 | .mypy_cache
18 | .snakemake
19 | .venv*
20 | 
21 | # aligners
22 | blat
23 | bwa
24 | *.fai
25 | 
26 | # files generated by editors and other user tooling
27 | *.~lock*
28 | .vscode
29 | *.nfs*
30 | junit
31 | 
32 | # generated documentation
33 | /docs/package/mavis/*.md
34 | /docs/package/mavis/*/*.md
35 | # don't ignore subpackage summary files
36 | !/docs/package/mavis/*/index.md
37 | docs/configuration/settings.md
38 | 
39 | .snakemake
40 | output_dir*
41 | bin
42 | dag*
43 | tutorial_data
44 | reference_inputs
45 | tmp
46 | 


--------------------------------------------------------------------------------
/tests/data/straglr.bed:
--------------------------------------------------------------------------------
 1 | #chrom	start	end	repeat_unit	allele1:size	allele1:copy_number	allele1:support	allele2:size	allele2:copy_number	allele2:support
 2 | chr11	776686	778078	CT	100.0	150.0	10	100.0	100.0	1
 3 | chr10	3079216	3079421	AGAGGTCACCACCCCTTCCCAACAATCCAGTAACAATCC	100.0	150.0	10	100.0	100.0	1
 4 | chr9	2080637	2081030	CTCCTTCCCTCCGCCCCCACCTCGGTCCCTGT	100.0	150.0	10	100.0	100.0	1
 5 | chrX	244719	245293	CCCCGGGAACCGCCT	100.0	150.0	10	-	-	-
 6 | chr7	284096	284233	GGT	100.0	150.0	10	-	-	-
 7 | chr8	288173	290242	CCCTGCTCCGT	100.0	150.0	10	100.0	100.0	1
 8 | chr3	2382228	2382908	CCGTGGGGGAGGCTGAGGCTATGGGGACT	100.0	100.0	10	-	-	-
 9 | chr2	2427285	2427528	CCTCC	100.0	150.0	10	-	-	-
10 | chr2	2427953	2428216	GGAGG	100.0	150.0	10	100.0	100.0	1
11 | 


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: MAVIS
 2 | 
 3 | theme:
 4 |   name: material
 5 | repo_url: https://github.com/bcgsc/mavis
 6 | repo_name: github
 7 | site_dir: build-docs
 8 | markdown_extensions:
 9 |   - codehilite
10 |   - admonition
11 |   - pymdownx.inlinehilite
12 |   - markdown_include.include:
13 |       base_path: docs
14 | extra_css: [extra.css]
15 | nav:
16 |   - index.md
17 |   - install.md
18 |   - migrating.md
19 |   - ... | background/**.md
20 |   - ... | inputs/**.md
21 |   - ... | outputs/**.md
22 |   - ... | configuration/**.md
23 |   - ... | tutorials/**.md
24 |   - development.md
25 |   - ...
26 |   - glossary.md
27 | 
28 | plugins:
29 |   - search
30 |   - awesome-pages
31 |   - mkdocs-simple-hooks:
32 |       hooks:
33 |         on_pre_build: "docs.hooks:build_package_docs"
34 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import re
 3 | 
 4 | from setuptools import setup
 5 | 
 6 | 
 7 | def check_nonpython_dependencies():
 8 |     """
 9 |     check that the non-python dependencies have been installed.
10 | 
11 |     Raises:
12 |         OSError: A dependency is not installed
13 |     """
14 |     import shutil
15 | 
16 |     aligner = (
17 |         os.environ['MAVIS_ALIGNER']
18 |         if 'MAVIS_ALIGNER' in os.environ and os.environ['MAVIS_ALIGNER']
19 |         else 'blat'
20 |     )
21 |     aligner = re.split(r'\s+', aligner)[0]
22 |     pth = shutil.which(aligner)
23 |     if not pth:
24 |         print('WARNING: Aligner is required. Missing executable: {}'.format(aligner))
25 |     else:
26 |         print('Found: aligner at', pth)
27 | 
28 | 
29 | setup()
30 | check_nonpython_dependencies()
31 | 


--------------------------------------------------------------------------------
/tests/data/cnvnator.tab:
--------------------------------------------------------------------------------
 1 | deletion	1:1-10000	10000	0	1.59373e-11	0	1.99216e-11	0	-1
 2 | deletion	1:38001-39000	1000	0.467116	544.034	0.0442397	1	1	1
 3 | deletion	1:51201-74200	23000	0.648113	6.92924e-12	2.52664e+09	7.58917e-12	2.55487e+09	1
 4 | deletion	1:74601-94200	19600	0.254531	8.13125e-12	2.52848e-32	9.05526e-12	5.01031e-78	1
 5 | deletion	1:106001-106800	800	0.270927	4415.44	4.32994e-06	1	1	1
 6 | duplication	1:107401-111200	3800	1.67572	0.00897513	2.75843e+07	20.687	5.49288e-07	1
 7 | duplication	1:137201-139600	2400	1.54927	0.00127366	8.4566e-14	182635	16668.9	1
 8 | deletion	1:149801-150800	1000	0.504485	79.6041	0.00136224	1	1	1
 9 | deletion	1:151201-155800	4600	0.582473	0.00108651	9.95448e+06	35.7819	1.1684e+08	1
10 | deletion	1:176201-228000	51800	0.0193339	3.07669e-12	1.12835e-37	3.20025e-12	0	1
11 | 


--------------------------------------------------------------------------------
/tests/test_mavis/annotate/test_annotate_fileio.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | import pytest
 4 | 
 5 | from mavis.annotate.file_io import load_annotations
 6 | 
 7 | 
 8 | @pytest.mark.parametrize(
 9 |     'annotations,error_message_include',
10 |     [
11 |         [{'genes': []}, "schema['properties']['genes']"],
12 |         [
13 |             {'genes': [{'start': '1'}]},
14 |             "schema['properties']['genes']['items']['properties']['start']",
15 |         ],
16 |     ],
17 | )
18 | def test_min_genes_error(annotations, error_message_include, tmp_path):
19 |     filename = tmp_path / "annotations.json"
20 |     filename.write_text(json.dumps(annotations))
21 |     with pytest.raises(AssertionError) as exc:
22 |         load_annotations(str(filename))
23 |     assert error_message_include in str(exc.value)
24 | 


--------------------------------------------------------------------------------
/src/mavis/annotate/constants.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from ..constants import MavisNamespace
 4 | 
 5 | PASS_FILENAME = 'annotations.tab'
 6 | 
 7 | 
class SPLICE_SITE_TYPE(MavisNamespace):
    # integer codes distinguishing the two kinds of splice site
    # NOTE(review): 3 and 5 presumably name the 3'/5' exon end the site sits on — confirm
    DONOR: int = 3
    ACCEPTOR: int = 5
11 | 
12 | 
13 | SPLICE_SITE_RADIUS = 2
14 | """int: number of bases away from an exon boundary considered to be part of the splice site such that if it were altered
15 |         the splice site would be considered to be abrogated.
16 | """
17 | 
18 | # splice site sequences based on: http://www.nature.com/nrg/journal/v17/n7/fig_tab/nrg.2016.46_F5.html?foxtrotcallback=true
19 | 
20 | DONOR_SEQ = [
21 |     re.compile('(AG)(GT[AG]AG)'),
22 |     re.compile('([CA]AG)(GTA)'),
23 | ]
24 | 
25 | ACCEPTOR_SEQ = [
26 |     re.compile('([TC]{8}[ATCG]CAG)([GA][ATCG])'),
27 |     re.compile('([TC]{9}TAG)([GA][ATCG])'),
28 |     re.compile('([TC]{8}[ATCG]AAG)([GA][ATCG])'),
29 | ]
30 | 


--------------------------------------------------------------------------------
/tests/data/mock_trans_sv_events.tsv:
--------------------------------------------------------------------------------
1 | ## False	reference9	2000	2000	reference9	2001	2001	L	R	+	+	insertion	genome	convert_ta.py_v0.0.1	mock-A36971	9:66466004
2 | stranded	break1_chromosome	break1_position_start	break1_position_end	break2_chromosome	break2_position_start	break2_position_end	break1_orientation	break2_orientation	break1_strand	break2_strand	event_type	protocol	tools	library	comment
3 | False	gene3	27175	27175	gene3	27176	27176	R	L	+	+	duplication	transcriptome	convert_ta.py_v0.0.1	mock-A47933	1:207249992
4 | True	gene1	34090	34090	gene5	608	608	R	R	-	+	inverted translocation	transcriptome	convert_ta.py_v0.0.1	mock-A47933	15:40854971|7:26241389
5 | False	gene2	22979	22979	gene2	23783	23783	R	L	+	+	duplication	transcriptome	convert_ta.py_v0.0.1	mock-A47933	15:41623873|15:41625248#this one is pretty low qual
6 | False	gene6	70057	77430	gene6	89472	94742	L	R	+	+	deletion	transcriptome	convert_ta.py_v0.0.1	mock-A47933	approx 10:89700299|10:89712341
7 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. run command '...'
16 | 2. See error ...
17 | 
18 | **Expected behavior**
19 | A clear and concise description of what you expected to happen.
20 | 
21 | **Input Data**
22 | If applicable, add the input data used when the bug was observed
23 | 
24 | **Configuration**
25 | If applicable, include the mavis configuration file that was used to run the pipeline
26 | 
27 | **Versions (please complete the following information):**
28 |  - OS: [e.g. centos-07]
29 |  - Python Version [e.g. 3.6.1]
30 |  - MAVIS Version [e.g. 22]
31 |  - Blat/BWA Version
32 | 
33 | **Additional context**
34 | Add any other context about the problem here.
35 | 


--------------------------------------------------------------------------------
/env/example.sh:
--------------------------------------------------------------------------------
 1 | export MAVIS_TEMPLATE_METADATA='/projects/trans_scratch/software/mavis/reference_files/hg19_cytoBand.txt'
 2 | export MAVIS_REFERENCE_GENOME='/projects/seqref/genomes/Homo_sapiens/GRCh37/1000genomes/bwa_ind/genome/GRCh37-lite.fa'
 3 | export MAVIS_ANNOTATIONS='/projects/trans_scratch/software/mavis/reference_files/ensembl69_hg19_annotations.json'
 4 | export MAVIS_MASKING='/projects/tumour_char/analysis_scripts/SVIA/delly/reference_data/GRCh37/human_nspan.hg19.excl.with_header.tsv'
 5 | export MAVIS_ALIGNER_REFERENCE='/home/pubseq/genomes/Homo_sapiens/GRCh37/blat/hg19.2bit'
 6 | export MAVIS_DGV_ANNOTATION='/projects/trans_scratch/software/mavis/reference_files/dgv_hg19_annotations.tab'
 7 | export MAVIS_MAX_FILES=100
 8 | export MAVIS_MIN_CLUSTERS_PER_FILE=30
 9 | export PYTHONUNBUFFERED='True'
10 | 
11 | #Add paths for samtools, blat and git
12 | export PATH=/projects/trans_scratch/transabyss/trans-ABySS/v1.4.10/bin/:/gsc/software/linux-x86_64-centos6/git-2.12.0/bin/:$PATH
13 | 


--------------------------------------------------------------------------------
/tests/data/mock_pairing_input.tab:
--------------------------------------------------------------------------------
1 | library	cluster_id	validation_id	annotation_id	event_type	transcript1	transcript2	fusion_cdna_coding_start	fusion_cdna_coding_end	fusion_sequence_fasta_id	fusion_sequence_fasta_file	break1_chromosome	break1_position_start	break1_position_end	break1_orientation	break1_strand	break2_chromosome	break2_position_start	break2_position_end	break2_orientation	break2_strand	opposing_strands	stranded	protocol	break1_call_method	break2_call_method	untemplated_seq	fusion_splicing_pattern
2 | genome1	1	1	1	deletion	ENST00000367080	ENST00000367080	None	None	None	None	gene3	10008	10008	L	+	gene3	18900	18900	R	+	False	True	genome	split reads	split reads	None	None
3 | genome2	1	1	1	deletion	ENST00000367080	ENST00000367080	None	None	None	None	gene3	10000	10000	L	+	gene3	18900	18900	R	+	False	True	genome	split reads	split reads	None	None
4 | transcriptome1	1	1	1	deletion	ENST00000367080	ENST00000367080	None	None	None	None	gene3	5347	5347	L	+	gene3	19969	19969	R	+	False	True	transcriptome	split reads	split reads	None	None
5 | 


--------------------------------------------------------------------------------
/tests/test_mavis/cluster/test_cluster.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import pytest
 4 | 
 5 | from mavis.cluster.cluster import merge_integer_intervals
 6 | from mavis.interval import Interval
 7 | 
 8 | 
 9 | class TestMergeIntegerIntervals:
10 |     def test_varying_lengths(self):
11 |         m = merge_integer_intervals((1, 2), (1, 9), (2, 10), weight_adjustment=0)
12 |         assert m == Interval(1, 4)
13 | 
14 |     def test_same_length(self):
15 |         m = merge_integer_intervals((1, 1), (10, 10))
16 |         assert m == Interval(6)
17 | 
18 |     def test_empty_list_error(self):
19 |         with pytest.raises(AttributeError):
20 |             merge_integer_intervals()
21 | 
22 |     def test_identical_even_length(self):
23 |         m = merge_integer_intervals((1, 2), (1, 2), (1, 2))
24 |         assert m == Interval(1, 2)
25 | 
26 |     def test_identical_odd_length(self):
27 |         m = merge_integer_intervals((1, 3), (1, 3), (1, 3))
28 |         assert m == Interval(1, 3)
29 | 
30 | 
31 | if __name__ == '__main__':
32 |     unittest.main()
33 | 


--------------------------------------------------------------------------------
/src/tools/get_hg38_reference_files.sh:
--------------------------------------------------------------------------------
 1 | set -euo pipefail
 2 | 
 3 | echo "downloading the reference genome (no alt) file"
 4 | wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
 5 | gunzip GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
 6 | 
 7 | echo "downloading the gene annotations file"
 8 | wget http://www.bcgsc.ca/downloads/mavis/v3/ensembl79_hg38_annotations.v3.json.gz
 9 | gunzip ensembl79_hg38_annotations.v3.json.gz
10 | 
11 | echo "downloading the masking file"
12 | wget http://www.bcgsc.ca/downloads/mavis/GRCh38_masking.tab
13 | 
14 | echo "downloading the dgv annotation file"
15 | wget http://www.bcgsc.ca/downloads/mavis/dgv_hg38_variants.tab
16 | 
17 | echo "downloading the aligner reference file"
18 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit
19 | 
20 | echo "downloading the template metadata file"
21 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz
22 | gunzip cytoBand.txt.gz
23 | 


--------------------------------------------------------------------------------
/tests/data/mock_reference_genome.fa.ann:
--------------------------------------------------------------------------------
 1 | 1054073 24 11
 2 | 0 fake (null)
 3 | 0 7450 0
 4 | 0 reference2 (null)
 5 | 7450 13648 0
 6 | 0 reference4 (null)
 7 | 21098 4000 0
 8 | 0 reference3 (null)
 9 | 25098 3711 0
10 | 0 reference7 (null)
11 | 28809 21000 0
12 | 0 reference10 (null)
13 | 49809 45109 0
14 | 0 reference19 (null)
15 | 94918 11786 0
16 | 0 reference20 (null)
17 | 106704 8000 0
18 | 0 referenceX (null)
19 | 114704 15760 0
20 | 0 reference11 (null)
21 | 130464 12000 0
22 | 0 reference12 (null)
23 | 142464 12000 0
24 | 0 reference1 (null)
25 | 154464 4000 0
26 | 0 reference9 (null)
27 | 158464 4000 0
28 | 0 reference16 (null)
29 | 162464 4000 0
30 | 0 reference17 (null)
31 | 166464 4000 0
32 | 0 gene1 (null)
33 | 170464 36375 0
34 | 0 gene2 (null)
35 | 206839 71783 0
36 | 0 gene3 (null)
37 | 278622 31569 0
38 | 0 gene4 (null)
39 | 310191 579898 0
40 | 0 gene5 (null)
41 | 890089 12195 0
42 | 0 gene6 (null)
43 | 902284 108818 0
44 | 0 fakereference9 (null)
45 | 1011102 14148 0
46 | 0  test_bam_long_ref
47 | 1025250 28322 0
48 | 0  11_86018001-86018500
49 | 1053572 501 0
50 | 


--------------------------------------------------------------------------------
/tests/test_tools/test_convert_dgv.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from unittest.mock import patch
 4 | 
 5 | import pytest
 6 | 
 7 | from tools.convert_dgv import main as convert_dgv_main
 8 | 
 9 | 
@pytest.mark.parametrize(
    "filename,expected_file",
    [
        ["dgv_test.tab", "dgv_test_expected.tab"],
    ],
)
def test_dgv_examples(tmp_path, filename, expected_file):
    """
    Run the DGV conversion entry point on an example input file and check that the file it
    writes matches the expected output (ignoring leading/trailing whitespace).

    Args:
        tmp_path: pytest fixture providing a temporary directory for the converted file
        filename: name of the input file inside the test data directory
        expected_file: name of the file in the test data directory holding the expected output
    """
    data_dir = os.path.join(os.path.dirname(__file__), "data")

    output_path = str(tmp_path / "tmp_data.tab")
    args = [
        "python",
        "--input",
        os.path.join(data_dir, filename),
        "--output",
        output_path,
    ]

    # main() is called without arguments, so its options are supplied by replacing sys.argv.
    # Nothing else is patched: setting a "main" attribute on the function object itself (as an
    # earlier version of this test did) has no effect on the call.
    with patch.object(sys, "argv", args):
        convert_dgv_main()

    with open(os.path.join(data_dir, expected_file), 'r') as fh:
        expected = fh.read().strip()

    with open(output_path, 'r') as fh:
        observed = fh.read().strip()

    assert expected == observed
39 | 


--------------------------------------------------------------------------------
/env/generate_ensembl79_annotations.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # add the ensembl api modules to the path
 4 | PATH=$(pwd):$PATH
 5 | PERL5LIB=${PERL5LIB}:/home/creisle/applications/ensembl_79/bioperl-live
 6 | PERL5LIB=${PERL5LIB}:/home/creisle/applications/ensembl_79/ensembl/modules
 7 | PERL5LIB=${PERL5LIB}:/home/creisle/applications/ensembl_79/ensembl-compara/modules
 8 | PERL5LIB=${PERL5LIB}:/home/creisle/applications/ensembl_79/ensembl-variation/modules
 9 | PERL5LIB=${PERL5LIB}:/home/creisle/applications/ensembl_79/ensembl-funcgen/modules
10 | export PERL5LIB
11 | 
12 | # default perl
13 | PATH=/projects/trans_scratch/software/perl/perl-5.20.3/bin:$PATH
14 | 
15 | # required data files
16 | export HUGO_ENSEMBL_MAPPING=/projects/tumour_char/analysis_scripts/databases/processed_files/drug_target_tables/current_gene_drug_pathway.hg38.tsv
17 | export BEST_TRANSCRIPTS=/home/creisle/svn/ensembl_flatfiles/ens69_best_transcript.txt
18 | 
19 | # connection information for the ensembl local server
20 | export ENSEMBL_HOST='ensembl02'
21 | export ENSEMBL_PASS='ensembl'
22 | export ENSEMBL_USER='ensembl'
23 | export ENSEMBL_PORT=3306
24 | 


--------------------------------------------------------------------------------
/src/mavis/convert/starfusion.py:
--------------------------------------------------------------------------------
 1 | from ..constants import ORIENT
 2 | 
 3 | 
 4 | def convert_row(row):
 5 |     """
 6 |     transforms the starfusion output into the common format for expansion. Maps the input column
 7 |     names to column names that MAVIS can read
 8 |     """
 9 |     std_row = {}
10 |     try:
11 |         std_row['break1_chromosome'], b1_start, std_row['break1_strand'] = row[
12 |             'LeftBreakpoint'
13 |         ].split(':')
14 |         std_row['break2_chromosome'], b2_start, std_row['break2_strand'] = row[
15 |             'RightBreakpoint'
16 |         ].split(':')
17 |     except (ValueError, TypeError):
18 |         raise AssertionError(
19 |             'Could not parse the breakpoint from the starfusion row: {}, {}'.format(
20 |                 row['LeftBreakpoint'], row['RightBreakpoint']
21 |             )
22 |         )
23 |     std_row['break1_position_start'] = std_row['break1_position_end'] = b1_start
24 |     std_row['break2_position_start'] = std_row['break2_position_end'] = b2_start
25 | 
26 |     std_row['break1_orientation'] = std_row['break2_orientation'] = ORIENT.NS
27 | 
28 |     return std_row
29 | 


--------------------------------------------------------------------------------
/tests/test_mavis/illustrate/test_illustrate.py:
--------------------------------------------------------------------------------
 1 | from mavis.illustrate.util import generate_interval_mapping
 2 | from mavis.interval import Interval
 3 | 
 4 | 
 5 | class TestGenerateIntervalMapping:
 6 |     def test_single_bp_window(self):
 7 |         regions = [
 8 |             Interval(4222347, 4222347),
 9 |             Interval(4221673, 4221903),
10 |             Interval(2792992, 4852494),
11 |         ]
12 |         target = 911.9921875
13 |         ratio = 5
14 |         min_width = 60
15 |         buffer_ = None
16 |         start = 2791992
17 |         end = 4853494
18 |         min_inter = 10
19 |         mapping = generate_interval_mapping(
20 |             regions, target, ratio, min_width, buffer_, start, end, min_inter
21 |         )
22 |         assert len(mapping.keys()) == 7
23 | 
24 |     def test_no_input_intervals(self):
25 |         target = 911.9921875
26 |         ratio = 5
27 |         min_width = 60
28 |         buffer_ = None
29 |         start = 2791992
30 |         end = 4853494
31 |         min_inter = 10
32 |         mapping = generate_interval_mapping(
33 |             [], target, ratio, min_width, buffer_, start, end, min_inter
34 |         )
35 |         assert len(mapping.keys()) == 1
36 | 


--------------------------------------------------------------------------------
/src/tools/get_hg19_reference_files.sh:
--------------------------------------------------------------------------------
 1 | set -euo pipefail
 2 | 
 3 | echo "downloading the reference genome file"
 4 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz
 5 | tar -xvzf chromFa.tar.gz
 6 | 
 7 | # concatenate the chromosome fa files into a single file
 8 | for fname in chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y}.fa
 9 | do
10 | 	cat $fname >> hg19.fa
11 | done
12 | 
13 | # Clean up the non concatenated and alt chromosome files
14 | rm -f chr*.fa
15 | rm -f chromeFa.tar.gz
16 | 
17 | echo "downloading the gene annotations file"
18 | wget http://www.bcgsc.ca/downloads/mavis/v3/ensembl69_hg19_annotations.v3.json.gz
19 | gunzip ensembl69_hg19_annotations.v3.json.gz
20 | 
21 | echo "downloading the masking file"
22 | wget http://www.bcgsc.ca/downloads/mavis/hg19_masking.tab
23 | 
24 | echo "downloading the dgv annotation file"
25 | wget http://www.bcgsc.ca/downloads/mavis/dgv_hg19_variants.tab
26 | 
27 | echo "downloading the aligner reference file"
28 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.2bit
29 | 
30 | echo "downloading the template metadata file"
31 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz
32 | gunzip cytoBand.txt.gz
33 | 


--------------------------------------------------------------------------------
/src/mavis/convert/straglr.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict
 2 | 
 3 | from ..constants import COLUMNS, SVTYPE
 4 | 
 5 | 
 6 | def convert_row(row: Dict) -> Dict:
 7 |     """
 8 |     Converts the fields from the original STRAGLR BED output into MAVIS definitions of an SV
 9 |     Since STRAGLR defines regions where short tandem repeats exist we make the definitions here fairly
10 |     non-specific
11 | 
12 |     See their github page for more details: https://github.com/bcgsc/straglr
13 | 
14 |     BED Columns
15 |     - chrom: chromosome name
16 |     - start: start coordinate of locus
17 |     - end: end coordinate of locus
18 |     - repeat_unit: repeat motif
19 |     - allele<N>.size: where N={1,2,3...} depending on --max_num_clusters e.g. N={1,2} if --max_num_clusters==2 (default)
20 |     - allele<N>.copy_number
21 |     - allele<N>.support
22 |     """
23 |     return {
24 |         COLUMNS.break1_chromosome: row['chrom'],
25 |         COLUMNS.break2_chromosome: row['chrom'],
26 |         COLUMNS.break1_position_start: row['start'],
27 |         COLUMNS.break1_position_end: row['end'],
28 |         COLUMNS.break2_position_start: row['start'],
29 |         COLUMNS.break2_position_end: row['end'],
30 |         COLUMNS.untemplated_seq: None,
31 |         COLUMNS.event_type: SVTYPE.INS,
32 |     }
33 | 


--------------------------------------------------------------------------------
/docs/configuration/pipeline.md:
--------------------------------------------------------------------------------
 1 | # Running the Pipeline
 2 | 
 3 | ## Running MAVIS using a Job Scheduler
 4 | 
 5 | MAVIS v3 uses [snakemake](https://snakemake.readthedocs.io/en/stable/) to handle job scheduling
 6 | and setup
 7 | 
 8 | The MAVIS pipeline is highly configurable. Some pipeline steps
 9 | (cluster, validate) are optional and can be automatically skipped. The
10 | standard pipeline is
11 | far-left.
12 | 
13 | The most common use case is running the pipeline through snakemake
14 | 
15 | ```bash
16 | snakemake -j <MAX JOBS> --configfile <YOUR CONFIG> -s Snakefile
17 | ```
18 | 
19 | If you are submitting to a cluster, use the [snakemake profiles](https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles)
20 | 
21 | ```bash
22 | snakemake -j <MAX JOBS> --configfile <YOUR CONFIG> --profile <YOUR PROFILE NAME> -s Snakefile
23 | ```
24 | 
25 | This will submit a series of jobs with dependencies.
26 | 
27 | To use the mavis docker container through singularity, instead of installing mavis via pip, add the
28 | [`--use-singularity`](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#running-jobs-in-containers)
29 | flag.
30 | 
31 | ```bash
32 | snakemake -j <MAX JOBS> --configfile <YOUR CONFIG> --profile <YOUR PROFILE NAME> --use-singularity -s Snakefile
33 | ```
34 | 


--------------------------------------------------------------------------------
/docs/package/mavis/pairing/index.md:
--------------------------------------------------------------------------------
 1 | # Sub-package Documentation
 2 | 
 3 | This is the package responsible for pairing/grouping calls between libraries. In many cases
 4 | this will be where somatic vs germline is determined or genomic only vs expressed.
 5 | 
 6 | ## Output Files
 7 | 
 8 | | expected name/suffix   | file type/format | content                                                   |
 9 | | ---------------------- | ---------------- | --------------------------------------------------------- |
10 | | ``mavis_paired_*.tab`` | text/tabbed      | call information and pairing information using product id |
11 | 
12 | 
13 | ## Algorithm Overview
14 | 
15 | - pairwise comparison of breakpoint pairs between libraries
16 | 
17 |     - fail if orientations do not match
18 |     - fail if template/chromosomes do not match
19 |     - if the protocols are mixed
20 | 
21 |         - pass if the fusion products match at the sequence level
22 |         - pass if the breakpoint predicted from the genome matches the transcriptome breakpoint
23 | 
24 |     - if the protocols are the same
25 | 
26 |         - pass if the breakpoints are co-located
27 | 
28 | - filter matches based on annotations
29 | 
30 |     - if both breakpoints have the same gene annotation, they must also both have the same transcript annotation
31 | 


--------------------------------------------------------------------------------
/src/mavis/convert/cnvnator.py:
--------------------------------------------------------------------------------
 1 | """
 2 | from cnvnator: https://github.com/abyzovlab/CNVnator
 3 | 
 4 |     CNV_type coordinates CNV_size normalized_RD e-val1 e-val2 e-val3 e-val4 q0
 5 | 
 6 |     normalized_RD -- normalized to 1.
 7 |     e-val1        -- is calculated using t-test statistics.
 8 |     e-val2        -- is from the probability of RD values within the region to be in
 9 |     the tails of a gaussian distribution describing frequencies of RD values in bins.
10 |     e-val3        -- same as e-val1 but for the middle of CNV
11 |     e-val4        -- same as e-val2 but for the middle of CNV
12 |     q0            -- fraction of reads mapped with q0 quality
13 | """
14 | import re
15 | 
16 | 
def convert_row(row):
    """
    Convert a row of cnvnator output: the 'coordinates' column is replaced by the breakpoint
    chromosome and position columns and all other columns are copied unchanged.

    Args:
        row (Dict[str]): dict representing the row output from cnvnator

    Returns:
        dict: transformed row using mavis standard column names

    Raises:
        ValueError: the coordinates are not in the form <chromosome>:<start>-<end>
    """
    coordinates = row['coordinates']
    # the greedy first group anchors on the LAST ':' so that chromosome names which themselves
    # contain ':' or '-' (e.g. HLA contigs) are kept intact
    match = re.fullmatch(r'(.+):([^:-]+)-([^:-]+)', coordinates)
    if not match:
        raise ValueError(
            'Could not parse the cnvnator coordinates (expected <chromosome>:<start>-<end>): {}'.format(
                coordinates
            )
        )
    chrom, start, end = match.groups()

    result = {k: v for k, v in row.items() if k != 'coordinates'}
    result['break1_chromosome'] = result['break2_chromosome'] = chrom
    # the first breakpoint sits at the start of the region and the second at its end
    result['break1_position_start'] = result['break1_position_end'] = start
    result['break2_position_start'] = result['break2_position_end'] = end
    return result
32 | 


--------------------------------------------------------------------------------
/tests/util.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import os
 3 | import shutil
 4 | 
 5 | import pytest
 6 | 
 7 | DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
 8 | 
 9 | 
10 | long_running_test = pytest.mark.skipif(
11 |     os.environ.get('RUN_FULL') != '1',
12 |     reason='Only running FAST tests subset',
13 | )
14 | 
15 | bwa_only = pytest.mark.skipif(not shutil.which('bwa'), reason='missing the command')
16 | blat_only = pytest.mark.skipif(not shutil.which('blat'), reason='missing the command')
17 | todo = pytest.mark.skip(reason='TODO')
18 | 
19 | 
def package_relative_file(*paths):
    """Return the absolute path of ``paths`` joined onto the parent of this file's directory."""
    this_dir = os.path.dirname(__file__)
    relative = os.path.join(this_dir, '..', *paths)
    return os.path.abspath(relative)
22 | 
23 | 
def get_data(*paths):
    """Return the path formed by joining ``paths`` onto the test data directory (DATA_DIR)."""
    parts = (DATA_DIR,) + paths
    return os.path.join(*parts)
26 | 
27 | 
def glob_exists(*pos, strict=False, n=1):
    """
    Look for files matching the glob expression formed by joining ``pos``.

    Args:
        *pos: path components joined into the glob expression
        strict: when True the number of matches must equal ``n`` exactly
        n: number of matches required in strict mode

    Returns:
        The list of matching paths (or the single path itself when strict mode finds exactly
        one match). False, after printing the expression and the matches, when the
        requirement is not met.
    """
    pattern = os.path.join(*pos)
    matches = glob.glob(pattern)

    if strict:
        if len(matches) == n:
            if len(matches) == 1:
                return matches[0]
            return matches
    elif matches:
        return matches

    # report what was searched for and what was found to help debug the failing check
    print(pattern)
    print(matches)
    return False
39 | 
40 | 
def glob_not_exists(*pos):
    """Return True when no file matches the glob expression formed by joining ``pos``."""
    matches = glob.glob(os.path.join(*pos))
    return len(matches) == 0
45 | 


--------------------------------------------------------------------------------
/tests/data/mock_reference_annotations.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "genes": [
 3 |         {
 4 |             "chr": "fake",
 5 |             "start": 1,
 6 |             "end": 1000,
 7 |             "strand": "+",
 8 |             "name": "ENSG0001",
 9 |             "aliases": [],
10 |             "transcripts": [
11 |                 {
12 |                     "is_best_transcript": true,
13 |                     "name": "ENST001",
14 |                     "start": 101,
15 |                     "end": 900,
16 |                     "exons": [
17 |                         {"start": 101, "end": 200},
18 |                         {"start": 401, "end": 500},
19 |                         {"start": 601, "end": 700},
20 |                         {"start": 801, "end": 900}
21 |                     ],
22 |                     "domains": [
23 |                         {
24 |                             "name": "PF001",
25 |                             "desc": "",
26 |                             "regions": [
27 |                                 {"start": 1, "end": 10},
28 |                                 {"start": 50, "end": 63}
29 |                             ]
30 |                         }
31 |                     ],
32 |                     "cdna_coding_start": 51,
33 |                     "cdna_coding_end": 350
34 |                 }
35 |             ]
36 |         }
37 |     ]
38 | }
39 | 


--------------------------------------------------------------------------------
/docs/inputs/non_python_dependencies.md:
--------------------------------------------------------------------------------
 1 | # Non-python Dependencies
 2 | 
 3 | MAVIS integrates with
 4 | [SV callers](./sv_callers.md),
 5 | [job schedulers](#job-schedulers), and
 6 | [aligners](#aligners). While some of
 7 | these dependencies are optional, all currently supported options are
 8 | detailed below. The versions column in the tables below lists all the
 9 | versions which were tested for each tool. Each version listed is known
10 | to be compatible with MAVIS.
11 | 
12 | ## Job Schedulers
13 | 
14 | MAVIS v3 uses [snakemake](https://snakemake.readthedocs.io/en/stable/) to handle job scheduling
15 | 
16 | ## Aligners
17 | 
18 | Two aligners are supported [bwa](../../glossary/#bwa) and
19 | [blat](../../glossary/#blat) (default). These are both included in the docker image by default.
20 | 
21 | | Name                                           | Version(s)              | Environment Setting       |
22 | | ---------------------------------------------- | ----------------------- | ------------------------- |
23 | | [blat](../../glossary/#blat)                   | `36x2` `36`             | `MAVIS_ALIGNER=blat`      |
24 | | [bwa mem](../../glossary/#bwa)                 | `0.7.15-r1140` `0.7.12` | `MAVIS_ALIGNER='bwa mem'` |
25 | 
26 | !!! note
27 |     When setting the aligner you will also need to set the
28 |     [aligner_reference](../../configuration/settings/#aligner_reference) to match
29 | 


--------------------------------------------------------------------------------
/docs/outputs/illustrations.md:
--------------------------------------------------------------------------------
 1 | # Illustrations
 2 | 
 3 | ## Fusion Diagrams
 4 | 
 5 | These are diagrams produced during the annotate step. These represent
 6 | the putative fusion events of a single breakpoint pair.
 7 | 
 8 | ![fusion diagram](../images/GIMAP4_IL7_fusion.svg)
 9 | 
10 | Fusion from transcriptome data. Intronic breakpoints here indicate
11 | retained intron sequence and a novel exon is
12 | predicted.
13 | 
14 | If the [draw_fusions_only](../../configuration/settings/#draw_fusions_only) flag is set to
15 | False then all events will produce a diagram, even anti-sense fusions
16 | 
17 | ![disruptive fusion diagram](../images/UBE2V2_GIMAP4_disruptive_fusion.svg)
18 | 
19 | Disruptive Anti-sense
20 | Fusion
21 | 
22 | ## Transcript Overlays
23 | 
24 | MAVIS supports generating diagrams of all transcripts for a given gene.
25 | These can be overlaid with markers and bam\_file pileup data. This is
26 | particularly useful for visualizing splice site mutations.
27 | 
28 | ![overlay diagram](../images/ENSG00000139687_RB1_overlay.png)
29 | 
30 | RB1 splice site mutation results in skipping of exon 9
31 | 
32 | The above diagram was generated using the overlay command
33 | 
34 | ```bash
35 | mavis overlay RB1 \
36 |     -o /path/to/output/dir \
37 |     --read_depth_plot rna /path/to/bam/file \
38 |     --marker M1 48939029 \
39 |     --annotations /path/to/mavis/annotations/reference/file
40 | ```
41 | 


--------------------------------------------------------------------------------
/src/mavis/convert/breakdancer.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | import pandas as pd
 4 | 
 5 | 
 6 | def convert_file(input_file):
 7 |     bam_to_lib = {}
 8 | 
 9 |     # read comments
10 |     with open(input_file, 'r') as fh:
11 |         # comments in breakdancer are marked with a single # so they need to be discarded before reading
12 |         lines = fh.readlines()
13 |         line_index = 0
14 |         while line_index < len(lines) and lines[line_index].startswith('#'):
15 |             metadata_match = re.match(r'^#(\S+)\t.*\tlibrary:(\S+)\t.*', lines[line_index])
16 |             if metadata_match:
17 |                 bam_to_lib[metadata_match.group(1)] = metadata_match.group(2)
18 |             line_index += 1
19 |         header = [c.strip() for c in re.sub(r'^#', '', lines[line_index - 1]).split('\t')]
20 |     # read the main file
21 |     df = pd.read_csv(
22 |         input_file,
23 |         names=header,
24 |         sep='\t',
25 |         comment='#',
26 |         dtype={
27 |             'num_Reads_lib': str,
28 |             'Pos1': int,
29 |             'Pos2': int,
30 |             'Chr1': str,
31 |             'Chr2': str,
32 |             'Type': str,
33 |         },
34 |     )
35 |     if 'num_Reads_lib' not in df:
36 |         raise KeyError('missing required column: num_Reads_lib')
37 | 
38 |     for bam, lib in bam_to_lib.items():
39 |         df['num_Reads_lib'] = df['num_Reads_lib'].str.replace(bam, lib)
40 |     return df.to_dict('records')
41 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.7-slim-buster
 2 | 
 3 | WORKDIR /app
 4 | 
 5 | RUN apt-get update && \
 6 |     apt-get upgrade -y && \
 7 |     apt-get install -y git wget make gcc libz-dev
 8 | 
 9 | # pysam dependencies
10 | RUN apt-get install -y libncurses5-dev zlib1g-dev libbz2-dev libncursesw5-dev liblzma-dev
11 | 
12 | # install BWA
13 | RUN git clone https://github.com/lh3/bwa.git && \
14 |     cd bwa && \
15 |     git checkout v0.7.17 && \
16 |     make && \
17 |     cd .. && \
18 |     mv bwa/bwa /usr/local/bin
19 | 
20 | # install minimap2
21 | RUN git clone https://github.com/lh3/minimap2.git && \
22 |     cd minimap2 && \
23 |     git checkout v2.24 && \
24 |     make && \
25 |     cd .. && \
26 |     mv minimap2/minimap2.1 /usr/local/bin
27 | 
28 | # install blat dependencies
29 | RUN apt-get install -y libcurl4
30 | 
31 | # install blat
32 | RUN wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat && \
33 |     chmod a+x blat && \
34 |     mv blat /usr/local/bin
35 | 
36 | # install wtdbg2
37 | RUN git clone https://github.com/ruanjue/wtdbg2.git && \
38 |     cd wtdbg2 && \
39 |     make && \
40 |     cd .. && \
41 |     mv wtdbg2/wtdbg2 /usr/local/bin
42 | 
43 | COPY setup.py setup.py
44 | COPY setup.cfg setup.cfg
45 | COPY MANIFEST.in MANIFEST.in
46 | COPY pyproject.toml pyproject.toml
47 | COPY src src
48 | COPY LICENSE LICENSE
49 | COPY README.md README.md
50 | 
51 | # install python package
52 | RUN pip install -U setuptools pip wheel
53 | RUN pip install .
54 | RUN which mavis
55 | ENTRYPOINT [ "mavis" ]
56 | 


--------------------------------------------------------------------------------
/docs/migrating.md:
--------------------------------------------------------------------------------
 1 | # Migrating
 2 | 
 3 | ## Migrating from v2 to v3
 4 | 
 5 | There are major changes from v2 to v3 of MAVIS.
 6 | 
 7 | ### Tab File Headers
 8 | 
 9 | Tab file headers no longer start with `#`. Any lines starting with a pound will be treated
10 | as comments. This will apply to mavis-style inputs as well as any tab delimited
11 | reference files
12 | 
13 | ### Configuration
14 | 
15 | MAVIS no longer uses command line arguments, config files, and environment variables for
16 | configuration. Instead all configurable settings are controlled via a single input JSON
17 | config file
18 | 
19 | ### Scheduling
20 | 
21 | MAVIS is now integrated with snakemake instead of handling its own scheduling
22 | 
23 | ## Reference Annotation Files
24 | 
25 | MAVIS no longer supports the previously deprecated tab-delimited format of the annotations file. If you are still using these files in your project we have provided a script to automatically convert them to the newer format in the tools directory.
26 | 
27 | ```bash
28 | python src/tools/convert_annotations_format.py \
29 |     /path/to/tab/file.tab \
30 |     --input_type v2-tab \
31 |     /path/to/new/json/file.json
32 | ```
33 | 
34 | In v3 the JSON files are slightly different to support multiple translations per transcript. Your old v2 JSON files can be automatically converted to the new format with the same script
35 | 
36 | ```bash
37 | python src/tools/convert_annotations_format.py \
38 |     /path/to/json/file.json \
39 |     --input_type v2-json \
40 |     /path/to/new/json/file.json
41 | ```
42 | 


--------------------------------------------------------------------------------
/docs/package/mavis/cluster/index.md:
--------------------------------------------------------------------------------
 1 | # Sub-package Documentation
 2 | 
 3 | The cluster sub-package is responsible for merging variants coming from different inputs (i.e. different tools).
 4 | 
 5 | ## Types of Output Files
 6 | 
 7 | | expected name/suffix           | file type/format           | content                                                              |
 8 | | ------------------------------ | -------------------------- | -------------------------------------------------------------------- |
 9 | | ``cluster_assignment.tab``     | text/tabbed                |                                                                      |
10 | | ``uninformative_clusters.txt`` | text                       | list of cluster ids that were dropped by annotation proximity filter |
11 | | ``clusters.bed``               | [bed](../../glossary/#bed) | cluster positions                                                    |
12 | | ``cluster-*.tab``              | text/tabbed                | computed clusters                                                    |
13 | 
14 | ## Algorithm Overview
15 | 
16 | - Collapse any duplicate breakpoint pairs
17 | - Split breakpoint pairs by type
18 | - Cluster breakpoint pairs by distance (within a type)
19 | 
20 |     - Create a graph representation of the distances between pairs
21 |     - Find cliques up to a given input size (cluster_clique_size)
22 |     - Hierarchically cluster the cliques (allows redundant participation)
23 |     - For each input node/pair pick the best cluster(s)
24 | 
25 | - Output the clusters and the mapping to the input pairs
26 | 


--------------------------------------------------------------------------------
/src/mavis/convert/arriba.py:
--------------------------------------------------------------------------------
 1 | from ..constants import COLUMNS, ORIENT
 2 | 
 3 | 
 4 | def get_orient(string):
 5 |     if string == "downstream":
 6 |         return ORIENT.LEFT
 7 |     elif string == "upstream":
 8 |         return ORIENT.RIGHT
 9 |     return ORIENT.NS
10 | 
11 | 
def convert_row(row):
    """
    Map a single row of Arriba output onto the MAVIS column names

    The breakpoints are expected as "<chromosome>:<position>" and the strand/type columns
    as "/"-separated pairs. Each breakpoint is a single position, so the start and end of
    each breakpoint are given the same value.

    Args:
        row: the Arriba row, indexable by the Arriba column names

    Returns:
        a dict keyed by the MAVIS column names

    Raises:
        AssertionError: if parsing the row raises a ValueError or TypeError
    """
    try:
        chrom1, position1 = row["breakpoint1"].split(":")
        chrom2, position2 = row["breakpoint2"].split(":")

        # the fusion strand is the second half of the "gene/fusion" pair
        strand1 = row["strand1(gene/fusion)"].split("/")[1]
        strand2 = row["strand2(gene/fusion)"].split("/")[1]
        event_type = row["type"].split("/")[0]
        orient1 = get_orient(row["direction1"])
        orient2 = get_orient(row["direction2"])
    except (ValueError, TypeError):
        raise AssertionError(
            "Could not parse the breakpoint from the Arriba row: {}, {}".format(
                row["breakpoint1"], row["breakpoint2"]
            )
        )

    return {
        COLUMNS.break1_chromosome: chrom1,
        COLUMNS.break2_chromosome: chrom2,
        COLUMNS.break1_strand: strand1,
        COLUMNS.break2_strand: strand2,
        COLUMNS.event_type: event_type,
        COLUMNS.break1_orientation: orient1,
        COLUMNS.break2_orientation: orient2,
        COLUMNS.break1_position_start: position1,
        COLUMNS.break1_position_end: position1,
        COLUMNS.break2_position_start: position2,
        COLUMNS.break2_position_end: position2,
    }
38 | 


--------------------------------------------------------------------------------
/docs/tutorials/mini.md:
--------------------------------------------------------------------------------
 1 | # MAVIS (Mini) Tutorial
 2 | 
 3 | This tutorial is based on the data included in the tests folder of
 4 | MAVIS. The data files are very small and this tutorial is really only
 5 | intended for testing a MAVIS install. The data here is simulated and
 6 | results are not representative of the typical events you would see
 7 | reported from MAVIS. For a more complete tutorial with actual fusion
 8 | gene examples, please see the [full tutorial](../../tutorials/full/).
 9 | 
10 | The first step is to clone or download a zip of the MAVIS repository
11 | (<https://github.com/bcgsc/mavis>). You will need the tests directory.
12 | The tag you check out should correspond to the MAVIS version you have
13 | installed
14 | 
15 | ```bash
16 | git clone https://github.com/bcgsc/mavis.git
17 | git -C mavis checkout <VERSION_TAG>
18 | mv mavis/tests .
19 | mv mavis/Snakefile .
20 | rm -r mavis
21 | ```
22 | 
23 | Now you should have a folder called `tests` in your current directory. Since this is a trivial
24 | example, it can easily be run locally. However in order to run the snakemake file you will need
25 | to have the config validation module `mavis_config` installed which has minimal dependencies.
26 | 
27 | ```bash
28 | pip install mavis_config
29 | ```
30 | 
31 | Now you are ready to run MAVIS. This can be done in a single command using snakemake.
32 | 
33 | ```bash
34 | snakemake -j 1 --configfile=tests/mini-tutorial.config.json -s Snakefile
35 | ```
36 | 
37 | Which will run the mini tutorial version and output files into a folder called `output_dir` in the
38 | current directory
39 | 


--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 3 | 
 4 | name: publish
 5 | 
 6 | on:
 7 |   release:
 8 |     types: [created]
 9 | 
10 | jobs:
11 |   pypi:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |     - uses: actions/checkout@v2
15 |     - name: Set up Python
16 |       uses: actions/setup-python@v2
17 |       with:
18 |         python-version: '3.x'
19 |     - name: Install dependencies
20 |       run: |
21 |         python -m pip install --upgrade pip
22 |         pip install setuptools wheel twine
23 |     - name: Build and publish
24 |       env:
25 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
26 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
27 |       run: |
28 |         python setup.py sdist bdist_wheel install
29 |         twine check dist/*
30 |         twine upload dist/*
31 |   docker:
32 |     runs-on: ubuntu-latest
33 |     steps:
34 |       - uses: actions/checkout@v2
35 |       - run: docker login -u $DOCKER_USER -p $DOCKER_PASSWORD
36 |         env:
37 |           DOCKER_USER: ${{ secrets.DOCKER_USER }}
38 |           DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
39 |       - run: |
40 |           docker build --file Dockerfile --tag bcgsc/mavis:latest --tag bcgsc/mavis:${{ github.event.release.tag_name }} .
41 |       - run: docker push bcgsc/mavis:latest
42 |       - run: docker push bcgsc/mavis:${{ github.event.release.tag_name }}
43 | 


--------------------------------------------------------------------------------
/tests/mini-tutorial.annotate_only.config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "annotate.draw_fusions_only": false,
 3 |     "convert": {
 4 |         "mock_converted": {
 5 |             "inputs": [
 6 |                 "tests/data/mock_sv_events.tsv"
 7 |             ],
 8 |             "file_type": "mavis",
 9 |             "assume_no_untemplated": true
10 |         }
11 |     },
12 |     "skip_stage.validate": true,
13 |     "cluster.uninformative_filter": true,
14 |     "cluster.limit_to_chr": null,
15 |     "cluster.min_clusters_per_file": 5,
16 |     "libraries": {
17 |         "mock-A47933": {
18 |             "assign": [
19 |                 "tests/data/mock_trans_sv_events.tsv"
20 |             ],
21 |             "bam_file": "tests/data/mock_trans_reads_for_events.sorted.bam",
22 |             "disease_status": "diseased",
23 |             "protocol": "transcriptome",
24 |             "strand_specific": true
25 |         },
26 |         "mock-A36971": {
27 |             "assign": [
28 |                 "mock_converted"
29 |             ],
30 |             "bam_file": "tests/data/mock_reads_for_events.sorted.bam",
31 |             "disease_status": "diseased",
32 |             "protocol": "genome",
33 |             "strand_specific": false
34 |         }
35 |     },
36 |     "output_dir": "output_dir",
37 |     "reference.annotations": [
38 |         "tests/data/mock_annotations.json"
39 |     ],
40 |     "reference.dgv_annotation": [
41 |         "tests/data/mock_dgv_annotation.tab"
42 |     ],
43 |     "reference.masking": [
44 |         "tests/data/mock_masking.tab"
45 |     ],
46 |     "reference.reference_genome": [
47 |         "tests/data/mock_reference_genome.fa"
48 |     ]
49 | }
50 | 


--------------------------------------------------------------------------------
/src/mavis/convert/chimerascan.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict
 2 | 
 3 | from ..constants import COLUMNS, ORIENT
 4 | from .constants import SUPPORTED_TOOL, TRACKING_COLUMN
 5 | 
 6 | 
 7 | def convert_row(row: Dict) -> Dict:
 8 |     """
 9 |     transforms the chimerscan output into the common format for expansion. Maps the input column
10 |     names to column names which MAVIS can read
11 |     """
12 |     std_row = {}
13 |     for retained_column in ['genes5p', 'genes3p']:
14 |         if retained_column in row:
15 |             std_row['{}_{}'.format(SUPPORTED_TOOL.CHIMERASCAN, retained_column)] = row[
16 |                 retained_column
17 |             ]
18 |     if TRACKING_COLUMN not in row:
19 |         std_row[TRACKING_COLUMN] = '{}-{}'.format(
20 |             SUPPORTED_TOOL.CHIMERASCAN, row['chimera_cluster_id']
21 |         )
22 | 
23 |     std_row.update(
24 |         {COLUMNS.break1_chromosome: row['chrom5p'], COLUMNS.break2_chromosome: row['chrom3p']}
25 |     )
26 |     if row['strand5p'] == '+':
27 |         std_row[COLUMNS.break1_position_start] = row['end5p']
28 |         std_row[COLUMNS.break1_orientation] = ORIENT.LEFT
29 |     else:
30 |         std_row[COLUMNS.break1_position_start] = row['start5p']
31 |         std_row[COLUMNS.break1_orientation] = ORIENT.RIGHT
32 |     if row['strand3p'] == '+':
33 |         std_row[COLUMNS.break2_position_start] = row['start3p']
34 |         std_row[COLUMNS.break2_orientation] = ORIENT.RIGHT
35 |     else:
36 |         std_row[COLUMNS.break2_position_start] = row['end3p']
37 |         std_row[COLUMNS.break2_orientation] = ORIENT.LEFT
38 |     std_row[COLUMNS.opposing_strands] = row['strand5p'] != row['strand3p']
39 |     return std_row
40 | 


--------------------------------------------------------------------------------
/docs/configuration/performance.md:
--------------------------------------------------------------------------------
 1 | # Resource Requirements
 2 | 
 3 | MAVIS has been tested on both unix and linux systems. For the standard
 4 | pipeline, the validation stage is the most computationally expensive.
 5 | The memory and cpu requirements will vary with two main factors: the
 6 | number of structural variants you are validating per job, and the size
 7 | of the bam file you are validating against.
 8 | 
 9 | There are a number of settings that can be adjusted to reduce memory and
10 | cpu requirements depending on what the user is trying to analyze. See
11 | [configuration and settings](../../configuration/general/) for more details.
12 | 
13 | ## Validation Resources
14 | 
15 | ![validation resources](../images/colo829_tumour_validation_resource_req.png)
16 | 
17 | Resource Requirements (MAVIS 1.8.0) for each validation job of the
18 | COLO829 tumour genome. The BAM file for the tumour genome is 127GB.
19 | Validation jobs were tested splitting into: 100, 500, 1000, and 2500
20 | structural variant validations per job. The effect of number of events
21 | validated on both memory and time is plotted
22 | above.
23 | 
24 | ## Annotation Resources
25 | 
26 | Similar trends were observed for the annotation step (see below) with
27 | regards to time elapsed. However the memory requirements remained more
28 | constant which is expected since, unlike validation, annotation does
29 | not read more data in for more events.
30 | 
31 | ![annotation resources](../images/colo829_tumour_annotation_resource_req.png)
32 | 
33 | Resource Requirements (MAVIS 1.8.0) for each annotation job of the
34 | COLO829 tumour genome. The events which passed validation (see above)
35 | represent the number of events input to the annotation
36 | step.
37 | 


--------------------------------------------------------------------------------
/tests/test_tools/data/ensembl69_hg19_annotations.kras.tab:
--------------------------------------------------------------------------------
 1 | ## input file used to map hugo gene names: compiled_gene_drug_pathway.v1_2_5.tsv
 2 | ## input file for picking best transcript: ens69_best_transcript.txt
 3 | ## Ensembl Api version 69
 4 | ## generated at: Thu Aug  4 16:38:01 2016
 5 | #ensembl_gene_id	hugo_names	chr	strand	gene_start	gene_end	best_ensembl_transcript_id	ensembl_transcript_id	refseq_equivalents	transcript_genomic_start	transcript_genomic_end	cdna_coding_start	cdna_coding_end	genomic_exon_ranges	AA_domain_ranges
 6 | ENSG00000133703	KRAS	12	-1	25357723	25403870	ENST00000311936	ENST00000311936	NP_004976.2;NM_004985.3	25357723	25403865	193	759	25403685-25403865;25398208-25398329;25380168-25380346;25378548-25378707;25357723-25362845	PR00449:4-25,27-43,44-66,107-120,141-163;PF00025:3-162;SM00173:1-166;PF00009:45-163;PF08477:5-119;PS50318:165-184;SSF52540:3-184;TIGR00231:1-159;SM00175:4-166;PF00071:5-164;SM00174:6-166
 7 | ENSG00000133703	KRAS	12	-1	25357723	25403870	ENST00000311936	ENST00000557334		25362102	25403870	198	425	25403685-25403870;25398208-25398329;25362102-25362845	PR00449:4-25,27-43;PS50318:52-71;SM00173:1-53;PF00071:5-44;SSF52540:3-37
 8 | ENSG00000133703	KRAS	12	-1	25357723	25403870	ENST00000311936	ENST00000256078	NP_203524.1;NM_033360.2	25362365	25403737	65	634	25403685-25403737;25398208-25398329;25380168-25380346;25378548-25378707;25368371-25368494;25362365-25362845	SM00175:4-166;PF00071:5-164;SSF52540:3-185;SM00176:9-189;TIGR00231:1-159;SM00174:6-166;PR00449:4-25,27-43,44-66,107-120,141-163;PF00025:3-161;PF08477:5-119;PF00009:45-162;SM00173:1-166
 9 | ENSG00000133703	KRAS	12	-1	25357723	25403870	ENST00000311936	ENST00000556131		25386753	25403863	178	309	25403698-25403863;25398208-25398329;25386753-25388160	PR00449:4-25,27-43;PF00071:5-37;SSF52540:3-38
10 | 


--------------------------------------------------------------------------------
/docs/development.md:
--------------------------------------------------------------------------------
 1 | # Guidelines for Contributors
 2 | 
 3 | {!./../.github/CONTRIBUTING.md!}
 4 | 
 5 | ## Major Assumptions
 6 | 
 7 | Some assumptions have been made when developing this project. The major
 8 | ones have been listed here to facilitate debugging/development if any of
 9 | these are violated in the future.
10 | 
11 | - The input bam reads have stored the sequence with respect to the positive/forward strand and have not stored the reverse complement.
12 | - The distribution of the fragment sizes in the bam file approximately follows a normal distribution.
13 | 
14 | ## Current Limitations
15 | 
16 | - Assembling contigs will always fail for repeat sequences as we do not resolve this. Unlike traditional assemblies we cannot assume even input coverage as we are taking a select portion of the reads to assemble.
17 | - Currently no attempt is made to group/pair single events into complex events.
18 | - Transcriptome validation uses a collapsed model of all overlapping transcripts and is not isoform specific. Allowing for isoform specific validation would be computationally expensive but may be considered as an optional setting for future releases.
19 | 
20 | ## Computing Code coverage
21 | 
22 | Since MAVIS uses multiple processes, it adds complexity to computing the
23 | code coverage. Running coverage normally will underreport. To ensure that
24 | the coverage module captures the information from the subprocesses we
25 | need to do the following
26 | 
27 | In our development python virtual environment put a coverage.pth file
28 | (ex. `venv/lib/python3.6/site-packages/coverage.pth`) containing the
29 | following
30 | 
31 | ```python
32 | import coverage; coverage.process_startup()
33 | ```
34 | 
35 | Additionally you will need to set the environment variable
36 | 
37 | ```bash
38 | export COVERAGE_PROCESS_START=/path/to/mavis/repo/mavis/.coveragerc
39 | ```
40 | 


--------------------------------------------------------------------------------
/.github/workflows/quick-tests.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 3 | 
 4 | name: quick-tests
 5 | 
 6 | on: [push]
 7 | 
 8 | jobs:
 9 |   build:
10 |     runs-on: ubuntu-20.04
11 |     strategy:
12 |       matrix:
13 |         python-version: ["3.7", "3.8", "3.9", "3.10"]
14 |     name: python-${{ matrix.python-version }} quick
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - name: Set up Python ${{ matrix.python-version }}
18 |       uses: actions/setup-python@v2
19 |       with:
20 |         python-version: ${{ matrix.python-version }}
21 |     - name: Install dependencies
22 |       run: |
23 |         python -m pip install --upgrade pip setuptools wheel
24 |         pip install .[test]
25 |     - name: Lint with flake8
26 |       run: |
27 |         pip install flake8
28 |         # stop the build if there are Python syntax errors or undefined names
29 |         flake8 src tests --count --show-source --statistics
30 |     - name: Lint with black
31 |       run: |
32 |         pip install black
33 |         # stop the build if black needs to be run
34 |         black src tests -S -l 100 --check
35 |     - name: Lint with isort
36 |       run: |
37 |         pip install isort
38 |         isort src tests --check
39 |     - name: install bwa
40 |       run: |
41 |         git clone https://github.com/lh3/bwa.git
42 |         cd bwa
43 |         git checkout v0.7.17
44 |         make
45 |         cd ..
46 |     - name: install blat
47 |       run: |
48 |         wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat
49 |         chmod a+x blat
50 |     - name: run short tests with pytest
51 |       run: |
52 |         export PATH=$PATH:$(pwd):$(pwd)/bwa
53 |         pytest tests -v \
54 |           --junitxml=junit/test-results-${{ matrix.python-version }}.xml \
55 |           --durations=10
56 |       env:
57 |         RUN_FULL: 0
58 | 


--------------------------------------------------------------------------------
/src/mavis/convert/constants.py:
--------------------------------------------------------------------------------
 1 | from ..constants import SVTYPE, MavisNamespace
 2 | 
 3 | 
class SUPPORTED_TOOL(MavisNamespace):
    """
    Supported Tools used to call SVs and then used as input into MAVIS

    Each attribute holds the lower-case string used to name the tool or input format.

    Attributes:
        CHIMERASCAN: chimerascan [Iyer-2011]_
        DEFUSE: defuse [McPherson-2011]_
        DELLY: delly [Rausch-2012]_
        MANTA: manta [Chen-2016]_
        PINDEL: pindel [Ye-2009]_
        TA: transabyss [Robertson-2010]_
        MAVIS: mavis
        BREAKDANCER: breakdancer
        VCF: vcf
        BREAKSEQ: breakseq
        CNVNATOR: cnvnator
        STRELKA: strelka
        STARFUSION: starfusion
        STRAGLR: straglr
        ARRIBA: arriba
    """

    MANTA = 'manta'
    DELLY = 'delly'
    TA = 'transabyss'
    PINDEL = 'pindel'
    CHIMERASCAN = 'chimerascan'
    MAVIS = 'mavis'
    DEFUSE = 'defuse'
    BREAKDANCER = 'breakdancer'
    VCF = 'vcf'
    BREAKSEQ = 'breakseq'
    CNVNATOR = 'cnvnator'
    STRELKA = 'strelka'
    STARFUSION = 'starfusion'
    STRAGLR = 'straglr'
    ARRIBA = 'arriba'
32 | 
33 | 
# Lookup from an SV type label to the list of MAVIS SV types that label may stand for.
# Every MAVIS SV type value maps to itself; the explicit labels below come after and
# would override a self-mapping with the same key.
TOOL_SVTYPE_MAPPING = {
    **{v: [v] for v in SVTYPE.values()},  # type: ignore
    'DEL': [SVTYPE.DEL],
    'INS': [SVTYPE.INS],
    'ITX': [SVTYPE.DUP],
    'CTX': [SVTYPE.TRANS, SVTYPE.ITRANS],
    'INV': [SVTYPE.INV],
    'BND': [SVTYPE.TRANS, SVTYPE.ITRANS, SVTYPE.DUP, SVTYPE.INS, SVTYPE.DEL, SVTYPE.INV],
    'TRA': [SVTYPE.TRANS, SVTYPE.ITRANS],
    'CNV': [SVTYPE.DUP],
    'RPL': [SVTYPE.INS],
    'DUP:TANDEM': [SVTYPE.DUP],
    'DUP': [SVTYPE.DUP],
    'interchromosomal': [SVTYPE.TRANS, SVTYPE.ITRANS],
    'eversion': [SVTYPE.DUP],
    'translocation': [SVTYPE.TRANS, SVTYPE.ITRANS],
    'ins': [SVTYPE.INS],
    'del': [SVTYPE.DEL],
    'dup': [SVTYPE.DUP],
    'ITD': [SVTYPE.DUP],
    'IDP': [SVTYPE.INS],
    'DEL/INV': [SVTYPE.DEL, SVTYPE.INV],
    'DUP/INS': [SVTYPE.DUP, SVTYPE.INS],
    'INVDUP': [SVTYPE.INV, SVTYPE.DUP, SVTYPE.INS],
    'INV/INVDUP': [SVTYPE.INV, SVTYPE.DUP, SVTYPE.INS],
}

# name of the column that carries the identifier used to track a call
TRACKING_COLUMN = 'tracking_id'
64 | 


--------------------------------------------------------------------------------
/tests/test_mavis/test_constants.py:
--------------------------------------------------------------------------------
 1 | from mavis.constants import (
 2 |     COLUMNS,
 3 |     ORIENT,
 4 |     STRAND,
 5 |     reverse_complement,
 6 |     sort_columns,
 7 |     translate,
 8 | )
 9 | 
10 | 
class TestConstants:
    """Checks for the namespaces and helper functions imported from mavis.constants"""

    def test_strand_compare(self):
        # NS compares as a match against either strand, as does a strand against itself
        for strand in (STRAND.POS, STRAND.NEG):
            assert STRAND.compare(STRAND.NS, strand)
            assert STRAND.compare(strand, strand)
        # opposite strands never match, whichever one is given first
        assert not STRAND.compare(STRAND.POS, STRAND.NEG)
        assert not STRAND.compare(STRAND.NEG, STRAND.POS)

    def test_orient_compare(self):
        # NS compares as a match against either orientation, as does an orientation
        # against itself
        for orient in (ORIENT.RIGHT, ORIENT.LEFT):
            assert ORIENT.compare(ORIENT.NS, orient)
            assert ORIENT.compare(orient, orient)
        # opposite orientations never match, whichever one is given first
        assert not ORIENT.compare(ORIENT.RIGHT, ORIENT.LEFT)
        assert not ORIENT.compare(ORIENT.LEFT, ORIENT.RIGHT)

    def test_reverse_complement(self):
        expected = {'CGAT': 'ATCG', '': ''}
        for seq, revcomp in expected.items():
            assert reverse_complement(seq) == revcomp

    def test_translate(self):
        seq = 'ATG' 'AAT' 'TCT' 'GGA' 'TGA'
        expected_by_frame = [
            (0, 'MNSG*'),  # ATG AAT TCT GGA TGA
            (1, '*ILD'),  # A TGA ATT CTG GAT GA
            (2, 'EFWM'),  # AT GAA TTC TGG ATG A
        ]
        for frame, protein in expected_by_frame:
            assert translate(seq, frame) == protein

    def test_sort_columns(self):
        # known columns are expected first, followed by the unknown ones
        unsorted = ['NEW', 'NEW2', COLUMNS.break1_seq, COLUMNS.break2_seq, COLUMNS.break1_chromosome]
        expected = [
            COLUMNS.break1_chromosome,
            COLUMNS.break1_seq,
            COLUMNS.break2_seq,
            'NEW',
            'NEW2',
        ]
        assert sort_columns(unsorted) == expected

    def test_column_matches_column_name(self):
        assert COLUMNS.library == COLUMNS.library
        # equal columns must also hash alike so they collapse inside a set
        unique = {COLUMNS.library, COLUMNS.library}
        assert len(unique) == 1
55 | 


--------------------------------------------------------------------------------
/tests/test_mavis/validate/test_validate.py:
--------------------------------------------------------------------------------
 1 | from mavis.constants import ORIENT
 2 | from mavis.interval import Interval
 3 | from mavis.validate.base import Evidence
 4 | from mavis.validate.call import _call_interval_by_flanking_coverage
 5 | 
 6 | from ..mock import Mock
 7 | 
 8 | 
 9 | class CallIntervalByFlankingCoverage:  # NOTE(review): name lacks the `Test` prefix, so pytest's default class discovery (`Test*`) presumably never runs these tests - confirm, then rename if they are meant to run
10 |     def test_invalid_input_attr(self):  # placeholder: nothing is asserted yet
11 |         pass
12 | 
13 |     def test_left(self):  # ORIENT.LEFT: the expected interval starts at the end of the flanking coverage
14 |         i = _call_interval_by_flanking_coverage(
15 |             Mock(start=101, end=110),
16 |             ORIENT.LEFT,
17 |             100,
18 |             20,
19 |             distance=Evidence.distance,
20 |             traverse=Evidence.traverse,
21 |         )
22 |         assert i.start == 110  # == coverage end
23 |         assert i.end == 180
24 | 
25 |         i = _call_interval_by_flanking_coverage(
26 |             Mock(start=20, end=80),
27 |             ORIENT.LEFT,
28 |             230,
29 |             40,
30 |             distance=Evidence.distance,
31 |             traverse=Evidence.traverse,
32 |         )
33 |         assert i.start == 80  # == coverage end
34 |         assert i.end == 209
35 | 
36 |     def test_right(self):  # ORIENT.RIGHT: the expected interval ends at the start of the flanking coverage
37 |         i = _call_interval_by_flanking_coverage(
38 |             Mock(start=101, end=110),
39 |             ORIENT.RIGHT,
40 |             100,
41 |             20,
42 |             distance=Evidence.distance,
43 |             traverse=Evidence.traverse,
44 |         )
45 |         assert i.end == 101  # == coverage start
46 |         assert i.start == 31
47 | 
48 |         i = _call_interval_by_flanking_coverage(
49 |             Mock(start=150, end=200),
50 |             ORIENT.RIGHT,
51 |             230,
52 |             40,
53 |             distance=Evidence.distance,
54 |             traverse=Evidence.traverse,
55 |         )
56 |         assert i.start == 11
57 |         assert i.end == 150  # == coverage start
58 | 
59 | 
60 | class TestDistanceAndTraverse:  # pins the arithmetic of the Evidence.distance / Evidence.traverse helpers
61 |     def test_distance(self):
62 |         assert Evidence.distance(1, 11) == Interval(10)  # expected value is end - start (10), not an inclusive count (11)
63 | 
64 |     def test_traverse_right(self):
65 |         assert Evidence.traverse(1, 10, ORIENT.RIGHT) == Interval(11)  # moving right adds the distance to the start
66 | 
67 |     def test_traverse_left(self):
68 |         assert Evidence.traverse(20, 10, ORIENT.LEFT) == Interval(10)  # moving left subtracts the distance from the start
69 | 


--------------------------------------------------------------------------------
/tests/mini-tutorial.config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "annotate.draw_fusions_only": false,
 3 |     "convert": {
 4 |         "mock_converted": {
 5 |             "inputs": [
 6 |                 "tests/data/mock_sv_events.tsv"
 7 |             ],
 8 |             "file_type": "mavis",
 9 |             "assume_no_untemplated": true
10 |         }
11 |     },
12 |     "cluster.uninformative_filter": true,
13 |     "cluster.limit_to_chr": null,
14 |     "cluster.min_clusters_per_file": 5,
15 |     "libraries": {
16 |         "mock-A47933": {
17 |             "assign": [
18 |                 "tests/data/mock_trans_sv_events.tsv"
19 |             ],
20 |             "bam_file": "tests/data/mock_trans_reads_for_events.sorted.bam",
21 |             "disease_status": "diseased",
22 |             "median_fragment_size": 188,
23 |             "protocol": "transcriptome",
24 |             "read_length": 75,
25 |             "stdev_fragment_size": 50,
26 |             "strand_specific": true
27 |         },
28 |         "mock-A36971": {
29 |             "assign": [
30 |                 "mock_converted"
31 |             ],
32 |             "bam_file": "tests/data/mock_reads_for_events.sorted.bam",
33 |             "disease_status": "diseased",
34 |             "median_fragment_size": 400,
35 |             "protocol": "genome",
36 |             "read_length": 150,
37 |             "stdev_fragment_size": 97,
38 |             "strand_specific": false
39 |         }
40 |     },
41 |     "output_dir": "output_dir",
42 |     "reference.aligner_reference": [
43 |         "tests/data/mock_reference_genome.2bit"
44 |     ],
45 |     "reference.annotations": [
46 |         "tests/data/mock_annotations.json"
47 |     ],
48 |     "reference.dgv_annotation": [
49 |         "tests/data/mock_dgv_annotation.tab"
50 |     ],
51 |     "reference.masking": [
52 |         "tests/data/mock_masking.tab"
53 |     ],
54 |     "reference.reference_genome": [
55 |         "tests/data/mock_reference_genome.fa"
56 |     ],
57 |     "reference.template_metadata": [
58 |         "tests/data/cytoBand.txt"
59 |     ],
60 |     "summary.filter_min_remapped_reads": 5,
61 |     "summary.filter_min_spanning_reads": 5,
62 |     "summary.filter_min_linking_split_reads": 1,
63 |     "summary.filter_min_flanking_reads": 10
64 | }
65 | 


--------------------------------------------------------------------------------
/tests/data/breakdancer_output.txt:
--------------------------------------------------------------------------------
 1 | #Software: 1.4.5
 2 | #Command: /gsc/software/linux-x86_64-centos6/breakdancer-1.4.5/bin/breakdancer-max -t /projects/trans_scratch/validations/workspace/creisle/MAV228/breakdancer.cfg 
 3 | #Library Statistics:
 4 | #/projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam	mean:441	std:116.54	uppercutoff:959.41	lowercutoff:22.39	readlen:149.65	library:A36971	reflen:3046874375	seqcov:69.8209	phycov:102.877	32:31637251
 5 | #/projects/analysis/analysis24/A36973/HMTGGCCXX_5/A36973/150nt/hg19a/bwa-mem-0.7.6a/A36973_1_lane_dupsFlagged.bam	mean:437.99	std:124.28	uppercutoff:955.49	lowercutoff:0	readlen:147.17	library:A36973	reflen:3046874375	seqcov:33.1399	phycov:49.3136	32:27980009
 6 | #Chr1	Pos1	Orientation1	Chr2	Pos2	Orientation2	Type	Size	Score	num_Reads	num_Reads_lib	A36971_2_lanes_dupsFlagged.bam	A36973_1_lane_dupsFlagged.bam
 7 | 1	200067631	23+27-	2	23697874	17+6-	CTX	-439	38	14	/projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|11:/projects/analysis/analysis24/A36973/HMTGGCCXX_5/A36973/150nt/hg19a/bwa-mem-0.7.6a/A36973_1_lane_dupsFlagged.bam|3
 8 | 1	10001	83+126-	1	10546	83+126-	ITX	-352	99	43	/projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|23:/projects/analysis/analysis24/A36973/HMTGGCCXX_5/A36973/150nt/hg19a/bwa-mem-0.7.6a/A36973_1_lane_dupsFlagged.bam|20	NA	NA
 9 | 1	808410	11+11-	1	808574	11+11-	ITX	-338	99	9	/projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|6:/projects/analysis/analysis24/A36973/HMTGGCCXX_5/A36973/150nt/hg19a/bwa-mem-0.7.6a/A36973_1_lane_dupsFlagged.bam|3	NA	NA
10 | 1	869445	89+21-	1	870225	5+93-	DEL	892	99	67	/projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|40:/projects/analysis/analysis24/A36973/HMTGGCCXX_5/A36973/150nt/hg19a/bwa-mem-0.7.6a/A36973_1_lane_dupsFlagged.bam|27	0.06	0.08
11 | 1	54687282	6+9-	1	54687479	6+9-	INS	-421	99	3	/projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|3	NA	NA
12 | 1	6508246	10+17-	1	17028869	57+50-	INV	10520288	31	4	/projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|4	1.77	2.21
13 | 


--------------------------------------------------------------------------------
/docs/package/mavis/annotate/index.md:
--------------------------------------------------------------------------------
 1 | # Sub-package Documentation
 2 | 
 3 | ## Types of Output Files
 4 | 
 5 | | expected name/suffix           | file type/format                  | content                                  |
 6 | | ------------------------------ | --------------------------------- | ---------------------------------------- |
 7 | | ``annotations.tab``            | text/tabbed                       | annotated events                         |
 8 | | ``annotations.fusion-cdna.fa`` | [fasta](../../../glossary/#fasta) | putative fusion unspliced cDNA sequences |
 9 | | ``drawings/*.svg``             | [SVG](../../../glossary/#svg)     | diagrams                                 |
10 | | ``drawings/*.legend.json``     | [JSON](../../../glossary/#json)   | diagram legend/metadata                  |
11 | 
12 | ## Algorithm Overview
13 | 
14 | see [annotating events](../../../background/theory/#annotating-events)
15 | 
16 | - read in breakpoint pairs
17 | - generate strand-specific annotations (one annotation per strand, multiple if multiple genes/transcripts in the region)
18 | - try building fusion transcripts for bp-specific calls
19 | - generate [SVG](../../../glossary/#svg) diagrams
20 | 
21 | ## Levels of Annotations
22 | 
23 | ![levels of features](../../../images/feature_levels.svg)
24 | 
25 | ## Overview of Class Relationships
26 | 
27 | ![annotations model](../../../images/annotation_model.svg)
28 | The Annotation sub-package has objects for genetic annotations and related calculations. The basic layout of the
29 | package is shown above. IS-A relationships are given by the blue arrows. HAS-A relationships are shown in black.
30 | Finally, reference_object/parent
31 | type relationships are shown in red. mavis.annotate.genomic.Gene is a gene. Start and end are
32 | genomic positions with respect to the template/chr. mavis.annotate.genomic.PreTranscript is the
33 | unspliced transcript. Start and end are genomic positions with respect to the template/chr.
34 | mavis.annotate.genomic.Transcript is the spliced transcript. Start and end coordinates are
35 | 1 to the length of the spliced product in base pairs.
36 | mavis.annotate.protein.Translation is the translation of the spliced transcript. Start and
37 | end are cDNA positions with respect to the 5' end of the spliced transcript. The start and end here describe the start and end
38 | of the coding sequence.
39 | 


--------------------------------------------------------------------------------
/tests/test_tools/test_convert_annotations_format.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | 
 4 | import pytest
 5 | 
 6 | from tools.convert_annotations_format import (
 7 |     convert_gff2_to_mavis,
 8 |     convert_gff3_to_mavis,
 9 |     convert_mavis_json_2to3,
10 |     convert_tab_to_json,
11 | )
12 | 
13 | CONVERTERS = {  # input_type label from the parametrized cases -> converter function called with the input file path
14 |     'gff3': convert_gff3_to_mavis,
15 |     'gtf': convert_gff2_to_mavis,
16 |     'v2-json': convert_mavis_json_2to3,
17 |     'v2-tab': convert_tab_to_json,
18 | }
19 | 
20 | 
21 | def sort_elements(data):
22 |     """
23 |     Recursively sort lists of dicts (exons, domains, genes, etc) by (start, end, name) to facilitate comparison; dicts are updated in place and returned
24 |     """
25 |     if not isinstance(data, dict):
26 |         if isinstance(data, list):
27 |             items = [sort_elements(e) for e in data]  # normalize nested elements before ordering this level
28 | 
29 |             if all(isinstance(elem, dict) for elem in data):  # only lists made up entirely of dicts are reordered
30 |                 return sorted(
31 |                     items, key=lambda elem: (elem.get('start'), elem.get('end'), elem.get('name'))
32 |                 )  # NOTE(review): a missing key yields None in the sort key, which cannot be ordered against int/str - presumably sibling elements always share keys; confirm
33 |             return items  # lists with any non-dict member keep their original order
34 |         else:
35 |             return data  # non-list, non-dict values are returned unchanged
36 | 
37 |     for key, value in data.items():
38 |         data[key] = sort_elements(value)  # values are replaced in place; the set of keys is not changed
39 |     return data
40 | 
41 | 
42 | @pytest.mark.parametrize(
43 |     'filename,expected_file,input_type',  # input_type selects the converter from CONVERTERS
44 |     [
45 |         ['K02718.1.gff3', 'K02718.1.gff3.json', 'gff3'],
46 |         ['K02718.1.gtf', 'K02718.1.gtf.json', 'gtf'],
47 |         ['Homo_sapiens.GRCh38.kras.gff3', 'Homo_sapiens.GRCh38.kras.gff3.json', 'gff3'],
48 |         ['Homo_sapiens.GRCh38.kras.gtf', 'Homo_sapiens.GRCh38.kras.gtf.json', 'gtf'],
49 |         ['example_genes.v2.json', 'example_genes.v3.json', 'v2-json'],
50 |         [
51 |             'ensembl69_hg19_annotations.kras.tab',
52 |             'ensembl69_hg19_annotations.kras.tab.json',
53 |             'v2-tab',
54 |         ],
55 |         ['viral.gtf', 'viral.gtf.json', 'gtf'],
56 |     ],
57 | )
58 | def test_gff_examples(filename, expected_file, input_type):  # despite the name, the cases also cover gtf, v2-json and v2-tab inputs
59 |     data_dir = os.path.join(os.path.dirname(__file__), 'data')  # fixtures are read from the data/ directory next to this module
60 |     input_file = os.path.join(data_dir, filename)
61 |     with open(os.path.join(data_dir, expected_file), 'r') as fh:
62 |         expected = json.load(fh)
63 | 
64 |     # element order doesn't matter, so both sides are normalized with sort_elements before comparing
65 |     data = sort_elements(CONVERTERS[input_type](input_file))
66 |     expected = sort_elements(expected)
67 | 
68 |     assert len(data['genes']) == len(expected['genes'])  # cheap size check first, before the full comparison
69 |     assert data == expected
70 | 


--------------------------------------------------------------------------------
/docs/configuration/general.md:
--------------------------------------------------------------------------------
 1 | # Getting Started
 2 | 
 3 | An exhaustive list of the various configurable settings can be found [here](../settings). Alternatively you can view them through the [online schema explorer](https://json-schema.app/view?url=https://raw.githubusercontent.com/bcgsc/mavis_config/master/src/mavis_config/config.json)
 4 | 
 5 | ## Pipeline Configuration File
 6 | 
 7 | The pipeline can be run in steps or it can be configured using a JSON
 8 | configuration file and set up in a single step. Scripts will be generated
 9 | to run all steps following clustering.
10 | 
11 | The config schema is found in the mavis package under `src/mavis/schemas/config.json`
12 | 
13 | Top level settings follow the pattern `<section>.<setting>`. The convert and library
14 | sections are nested objects.
15 | 
16 | ## Adjusting the Resource Requirements
17 | 
18 | ### Choosing the Number of Validation/Annotation Jobs
19 | 
20 | MAVIS chooses the number of jobs to split validate/annotate stages into
21 | based on two settings: [cluster.max_files](../../configuration/settings/#clustermax_files) and
22 | [cluster.min_clusters_per_file](../../configuration/settings/#clustermin-clusters-per-file).
23 | 
24 | For example, say you have 1000 clusters,
25 | `cluster.max_files=10`, and `cluster.min_clusters_per_file=10`. Then MAVIS will set up
26 | 10 validation jobs each with 100 events.
27 | 
28 | However, if `cluster.min_clusters_per_file=500`, then MAVIS would only set up 2
29 | jobs each with 500 events. This is because
30 | [cluster.min_clusters_per_file](../../configuration/settings/#clustermin-clusters-per-file) takes precedence
31 | over [cluster.max_files](../../configuration/settings/#clustermax_files).
32 | 
33 | Splitting into more jobs will lower the resource requirements per job
34 | (see [resource requirements](../performance/)). The memory and time requirements for validation are linear
35 | with respect to the number of events to be validated.
36 | 
37 | ### Uninformative Filter
38 | 
39 | If the user is only interested in events in genes, then the
40 | [cluster.uninformative_filter](../../configuration/settings/#clusteruninformative_filter) can be used. This
41 | will drop all events that are not within a certain distance
42 | ([cluster.max_proximity](../../configuration/settings/#clustermax_proximity)) to any annotation in
43 | the annotations reference file. These events will be dropped prior to
44 | the validation stage which results in significant speed up.
45 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
  1 | [metadata]
  2 | name = mavis
  3 | version = 3.1.2
  4 | url = https://github.com/bcgsc/mavis.git
  5 | download_url = https://github.com/bcgsc/mavis/archive/v2.2.10.tar.gz
  6 | description = A Structural Variant Post-Processing Package
  7 | author_email = creisle@bcgsc.ca
  8 | author = Caralyn Reisle
  9 | maintainer_email = mavis@bcgsc.ca
 10 | maintainer = mavis
 11 | long_description = file: README.md
 12 | long_description_content_type = text/markdown
 13 | license_file = LICENSE
 14 | project_urls = mavis = http://mavis.bcgsc.ca
 15 | 
 16 | [bdist_wheel]
 17 | universal = 1
 18 | 
 19 | [pycodestyle]
 20 | ignore = E501
 21 |     W503
 22 |     E203
 23 | statistics = True
 24 | 
 25 | [flake8]
 26 | ignore = E501,W503,E203
 27 | 
 28 | [isort]
 29 | profile = black
 30 | 
 31 | [options]
 32 | packages = find:
 33 | package_dir =
 34 |      = src
 35 | python_requires = >=3.7
 36 | dependency_links = []
 37 | include_package_data = True
 38 | install_requires =
 39 |     biopython>=1.70, <1.78
 40 |     braceexpand==0.1.2
 41 |     colour
 42 |     Distance>=0.1.3
 43 |     mavis_config>=1.2.2, <2.0.0
 44 |     networkx>=2.5,<3
 45 |     numpy>=1.13.1
 46 |     pandas>=1.1, <2
 47 |     pysam
 48 |     Shapely>=1.6.4.post1
 49 |     shortuuid>=0.5.0
 50 |     svgwrite
 51 |     typing_extensions>=4
 52 | setup_requires =
 53 |     pip>=9.0.0
 54 |     setuptools>=36.0.0
 55 | 
 56 | [options.packages.find]
 57 | exclude = tests
 58 | where = src
 59 | 
 60 | [options.extras_require]
 61 | doc =
 62 |     mkdocs>=1.1.2
 63 |     markdown-refdocs
 64 |     mkdocs-material>=5.4.0
 65 |     markdown-include
 66 |     mkdocs-simple-hooks>=0.1.2
 67 |     mkdocs-awesome-pages-plugin
 68 | test =
 69 |     timeout-decorator>=0.3.3
 70 |     coverage>=4.2
 71 |     pycodestyle>=2.3.1
 72 |     pytest
 73 |     pytest-cov
 74 | dev =
 75 |     black
 76 |     flake8
 77 |     isort
 78 |     twine
 79 |     wheel
 80 |     timeout-decorator>=0.3.3
 81 |     coverage>=4.2
 82 |     pycodestyle>=2.3.1
 83 |     pytest
 84 |     pytest-cov
 85 |     pytest-xdist
 86 |     mkdocs>=1.1.2,<2
 87 |     markdown-refdocs
 88 |     mkdocs-material>=5.4.0
 89 |     markdown-include
 90 |     mkdocs-simple-hooks>=0.1.2
 91 |     types-setuptools>=57.4.7, <58
 92 | deploy =
 93 |     twine
 94 |     wheel
 95 | tools =
 96 |     pyensembl
 97 |     simplejson
 98 |     requests
 99 | 
100 | [options.entry_points]
101 | console_scripts =
102 |     mavis = mavis.main:main
103 |     calculate_ref_alt_counts = tools.calculate_ref_alt_counts:main
104 | 


--------------------------------------------------------------------------------
/tests/data/bwa_pipeline_config.cfg:
--------------------------------------------------------------------------------
 1 | [reference]
 2 | template_metadata = tests/data/cytoBand.txt
 3 | annotations = tests/data/mock_annotations.json
 4 | masking = tests/data/mock_masking.tab
 5 | reference_genome = tests/data/mock_reference_genome.fa
 6 | aligner_reference = tests/data/mock_reference_genome.fa
 7 | dgv_annotation = tests/data/mock_dgv_annotation.txt
 8 | 
 9 | [annotate]
10 | draw_fusions_only = False
11 | 
12 | [validate]
13 | # evidence related settings
14 | aligner = bwa mem
15 | assembly_max_paths = 4
16 | assembly_min_exact_match_to_remap = 4
17 | assembly_min_edge_trim_weight = 4
18 | assembly_min_remap_coverage = 0
19 | assembly_min_remapped_seq = 3
20 | assembly_strand_concordance = 0.51
21 | blat_min_identity = 0.9
22 | call_error = 10
23 | contig_aln_max_event_size = 50
24 | contig_aln_merge_inner_anchor = 20
25 | contig_aln_merge_outer_anchor = 15
26 | contig_aln_min_anchor_size = 50
27 | contig_aln_min_query_consumption = 0.7
28 | fetch_reads_bins = 5
29 | fetch_reads_limit = 10000
30 | fetch_min_bin_size = 50
31 | filter_secondary_alignments = True
32 | fuzzy_mismatch_number = 1
33 | max_sc_preceeding_anchor = 6
34 | min_anchor_exact = 6
35 | min_anchor_fuzzy = 10
36 | min_anchor_match = 0.9
37 | min_double_aligned_to_estimate_insertion_size = 2
38 | min_flanking_pairs_resolution = 3
39 | min_linking_split_reads = 1
40 | min_mapping_quality = 5
41 | min_non_target_aligned_split_reads = 1
42 | min_sample_size_to_apply_percentage = 10
43 | min_softclipping = 6
44 | min_spanning_reads_resolution = 3
45 | min_splits_reads_resolution = 3
46 | stdev_count_abnormal = 3.0
47 | strand_determining_read = 2
48 | outer_window_min_event_size = 125
49 | 
50 | [cluster]
51 | uninformative_filter = True
52 | limit_to_chr = None
53 | 
54 | [mock-A36971]
55 | read_length = 150
56 | median_fragment_size = 400
57 | stdev_fragment_size = 97
58 | bam_file = tests/data/mock_reads_for_events.sorted.bam
59 | protocol = genome
60 | inputs = tests/data/mock_sv_events.tsv
61 | strand_specific = False
62 | disease_status=diseased
63 | 
64 | [mock-A47933]
65 | read_length = 75
66 | median_fragment_size = 188
67 | stdev_fragment_size = 50
68 | bam_file = tests/data/mock_trans_reads_for_events.sorted.bam
69 | protocol = transcriptome
70 | inputs = tests/data/mock_trans_sv_events.tsv
71 | strand_specific = True
72 | disease_status=diseased
73 | 
74 | [summary]
75 | filter_min_remapped_reads = 5
76 | filter_min_spanning_reads = 5
77 | filter_min_flanking_reads = 10
78 | filter_min_split_reads = 5
79 | filter_min_linking_split_reads = 1
80 | filter_cdna_synon = False
81 | filter_protein_synon = False
82 | 


--------------------------------------------------------------------------------
/tests/data/missing_reference.cfg:
--------------------------------------------------------------------------------
 1 | [reference]
 2 | annotations = tests/data/mock_annotations.json
 3 | aligner_reference = tests/data/mock_reference_genome.2bit
 4 | 
 5 | [annotate]
 6 | draw_fusions_only = False
 7 | 
 8 | [validate]
 9 | # evidence related settings
10 | aligner = blat
11 | assembly_max_paths = 4
12 | assembly_min_exact_match_to_remap = 4
13 | assembly_min_edge_trim_weight = 4
14 | assembly_min_remap_coverage = 0
15 | assembly_min_remapped_seq = 3
16 | assembly_strand_concordance = 0.51
17 | blat_min_identity = 0.9
18 | call_error = 10
19 | contig_aln_max_event_size = 50
20 | contig_aln_merge_inner_anchor = 20
21 | contig_aln_merge_outer_anchor = 15
22 | contig_aln_min_anchor_size = 50
23 | contig_aln_min_query_consumption = 0.7
24 | fetch_reads_bins = 5
25 | fetch_reads_limit = 10000
26 | fetch_min_bin_size = 50
27 | filter_secondary_alignments = True
28 | fuzzy_mismatch_number = 1
29 | max_sc_preceeding_anchor = 6
30 | min_anchor_exact = 6
31 | min_anchor_fuzzy = 10
32 | min_anchor_match = 0.9
33 | min_double_aligned_to_estimate_insertion_size = 2
34 | min_flanking_pairs_resolution = 3
35 | min_linking_split_reads = 1
36 | min_mapping_quality = 5
37 | min_non_target_aligned_split_reads = 1
38 | min_sample_size_to_apply_percentage = 10
39 | min_softclipping = 6
40 | min_spanning_reads_resolution = 3
41 | min_splits_reads_resolution = 3
42 | stdev_count_abnormal = 3.0
43 | strand_determining_read = 2
44 | outer_window_min_event_size = 125
45 | 
46 | [cluster]
47 | uninformative_filter = True
48 | limit_to_chr = None
49 | 
50 | [mock-A36971]
51 | read_length = 150
52 | median_fragment_size = 400
53 | stdev_fragment_size = 97
54 | bam_file = tests/data/mock_reads_for_events.sorted.bam
55 | protocol = genome
56 | inputs = mock_converted
57 | strand_specific = False
58 | disease_status=diseased
59 | 
60 | [mock-A47933]
61 | read_length = 75
62 | median_fragment_size = 188
63 | stdev_fragment_size = 50
64 | bam_file = tests/data/mock_trans_reads_for_events.sorted.bam
65 | protocol = transcriptome
66 | inputs = tests/data/mock_trans_sv_events.tsv
67 | strand_specific = True
68 | disease_status=diseased
69 | 
70 | [summary]
71 | filter_min_remapped_reads = 5
72 | filter_min_spanning_reads = 5
73 | filter_min_flanking_reads = 10
74 | filter_min_split_reads = 5
75 | filter_min_linking_split_reads = 1
76 | filter_cdna_synon = True
77 | filter_protein_synon = True
78 | 
79 | [convert]
80 | assume_no_untemplated = True
81 | # add the file twice to check that this notation is ok (the duplicates will be collapsed anyway)
82 | mock_converted = convert_tool_output
83 |     tests/data/mock_sv_events.tsv
84 |     tests/data/mock_sv_events.tsv
85 |     mavis
86 |     False
87 | 
88 | 


--------------------------------------------------------------------------------
/tests/data/clean_pipeline_config.cfg:
--------------------------------------------------------------------------------
 1 | [reference]
 2 | template_metadata = tests/data/cytoBand.txt
 3 | annotations = tests/data/mock_annotations.json
 4 | masking = tests/data/mock_masking.tab
 5 | reference_genome = tests/data/mock_reference_genome.fa
 6 | aligner_reference = tests/data/mock_reference_genome.2bit
 7 | dgv_annotation = tests/data/mock_dgv_annotation.txt
 8 | 
 9 | [annotate]
10 | draw_fusions_only = False
11 | 
12 | [validate]
13 | # evidence related settings
14 | aligner = blat
15 | assembly_max_paths = 4
16 | assembly_min_exact_match_to_remap = 4
17 | assembly_min_edge_trim_weight = 4
18 | assembly_min_remap_coverage = 0
19 | assembly_min_remapped_seq = 3
20 | assembly_strand_concordance = 0.51
21 | blat_min_identity = 0.9
22 | call_error = 10
23 | contig_aln_max_event_size = 50
24 | contig_aln_merge_inner_anchor = 20
25 | contig_aln_merge_outer_anchor = 15
26 | contig_aln_min_anchor_size = 50
27 | contig_aln_min_query_consumption = 0.7
28 | fetch_reads_bins = 5
29 | fetch_reads_limit = 10000
30 | fetch_min_bin_size = 50
31 | filter_secondary_alignments = True
32 | fuzzy_mismatch_number = 1
33 | max_sc_preceeding_anchor = 6
34 | min_anchor_exact = 6
35 | min_anchor_fuzzy = 10
36 | min_anchor_match = 0.9
37 | min_double_aligned_to_estimate_insertion_size = 2
38 | min_flanking_pairs_resolution = 3
39 | min_linking_split_reads = 1
40 | min_mapping_quality = 5
41 | min_non_target_aligned_split_reads = 1
42 | min_sample_size_to_apply_percentage = 10
43 | min_softclipping = 6
44 | min_spanning_reads_resolution = 3
45 | min_splits_reads_resolution = 3
46 | stdev_count_abnormal = 3.0
47 | strand_determining_read = 2
48 | outer_window_min_event_size = 125
49 | write_evidence_files = False
50 | clean_aligner_files = True
51 | 
52 | [cluster]
53 | uninformative_filter = True
54 | limit_to_chr = None
55 | 
56 | [mock-A36971]
57 | read_length = 150
58 | median_fragment_size = 400
59 | stdev_fragment_size = 97
60 | bam_file = tests/data/mock_reads_for_events.sorted.bam
61 | protocol = genome
62 | inputs = tests/data/mock_sv_events.tsv
63 | strand_specific = False
64 | disease_status=diseased
65 | 
66 | [mock-A47933]
67 | read_length = 75
68 | median_fragment_size = 188
69 | stdev_fragment_size = 50
70 | bam_file = tests/data/mock_trans_reads_for_events.sorted.bam
71 | protocol = transcriptome
72 | inputs = tests/data/mock_trans_sv_events.tsv
73 | strand_specific = True
74 | disease_status=diseased
75 | 
76 | [summary]
77 | filter_min_remapped_reads = 5
78 | filter_min_spanning_reads = 5
79 | filter_min_flanking_reads = 10
80 | filter_min_split_reads = 5
81 | filter_min_linking_split_reads = 1
82 | filter_cdna_synon = True
83 | filter_protein_synon = True
84 | 


--------------------------------------------------------------------------------
/tests/data/no_opt_pipeline.cfg:
--------------------------------------------------------------------------------
 1 | [reference]
 2 | annotations = tests/data/mock_annotations.json
 3 | reference_genome = tests/data/mock_reference_genome.fa
 4 | aligner_reference = tests/data/mock_reference_genome.2bit
 5 | 
 6 | [annotate]
 7 | draw_fusions_only = False
 8 | 
 9 | [validate]
10 | # evidence related settings
11 | aligner = blat
12 | assembly_max_paths = 4
13 | assembly_min_exact_match_to_remap = 4
14 | assembly_min_edge_trim_weight = 4
15 | assembly_min_remap_coverage = 0
16 | assembly_min_remapped_seq = 3
17 | assembly_strand_concordance = 0.51
18 | blat_min_identity = 0.9
19 | call_error = 10
20 | contig_aln_max_event_size = 50
21 | contig_aln_merge_inner_anchor = 20
22 | contig_aln_merge_outer_anchor = 15
23 | contig_aln_min_anchor_size = 50
24 | contig_aln_min_query_consumption = 0.7
25 | fetch_reads_bins = 5
26 | fetch_reads_limit = 10000
27 | fetch_min_bin_size = 50
28 | filter_secondary_alignments = True
29 | fuzzy_mismatch_number = 1
30 | max_sc_preceeding_anchor = 6
31 | min_anchor_exact = 6
32 | min_anchor_fuzzy = 10
33 | min_anchor_match = 0.9
34 | min_double_aligned_to_estimate_insertion_size = 2
35 | min_flanking_pairs_resolution = 3
36 | min_linking_split_reads = 1
37 | min_mapping_quality = 5
38 | min_non_target_aligned_split_reads = 1
39 | min_sample_size_to_apply_percentage = 10
40 | min_softclipping = 6
41 | min_spanning_reads_resolution = 3
42 | min_splits_reads_resolution = 3
43 | stdev_count_abnormal = 3.0
44 | strand_determining_read = 2
45 | outer_window_min_event_size = 125
46 | 
47 | [cluster]
48 | uninformative_filter = True
49 | limit_to_chr = None
50 | 
51 | [mock-A36971]
52 | read_length = 150
53 | median_fragment_size = 400
54 | stdev_fragment_size = 97
55 | bam_file = tests/data/mock_reads_for_events.sorted.bam
56 | protocol = genome
57 | inputs = mock_converted
58 | strand_specific = False
59 | disease_status=diseased
60 | 
61 | [mock-A47933]
62 | read_length = 75
63 | median_fragment_size = 188
64 | stdev_fragment_size = 50
65 | bam_file = tests/data/mock_trans_reads_for_events.sorted.bam
66 | protocol = transcriptome
67 | inputs = tests/data/mock_trans_sv_events.tsv
68 | strand_specific = True
69 | disease_status=diseased
70 | 
71 | [summary]
72 | filter_min_remapped_reads = 5
73 | filter_min_spanning_reads = 5
74 | filter_min_flanking_reads = 10
75 | filter_min_split_reads = 5
76 | filter_min_linking_split_reads = 1
77 | filter_cdna_synon = True
78 | filter_protein_synon = True
79 | 
80 | [convert]
81 | assume_no_untemplated = True
82 | # add the file twice to check that this notation is ok (the duplicates will be collapsed anyway)
83 | mock_converted = convert_tool_output
84 |     tests/data/mock_sv_events.tsv
85 |     tests/data/mock_sv_events.tsv
86 |     mavis
87 |     False
88 | 
89 | 


--------------------------------------------------------------------------------
/tests/data/Library-clusterset-N.validated.tsv:
--------------------------------------------------------------------------------
 1 | #cluster_id	break1_chromosome	break1_position_start	break1_position_end	break1_orientation	break1_strand	break2_chromosome	break2_position_start	break2_position_end	break2_orientation	break2_strand	event_type	opposing_strands	stranded	protocol	tools	contigs_assembled	contigs_aligned	contig_sequence	contig_remap_score	contig_alignment_score	call_method	flanking_reads	median_insert_size	stdev_insert_size	break1_split_reads	break2_split_reads	linking_split_reads	untemplated_sequence
 2 | 11241	1	238693407	238693407	L	?	1	238701194	238701194	R	?	deletion	False	False	genome	DELLY_v0.6.1	1	1	GAGACTGGAAGTGGGTAGTTGCTTCATGCAGCTGGTTGTCCCAATGCCTGTCAGAGTCTGCCTTAGTCCCGGGGTTTTTATGGGCTCAGAAGGGAGAAAGTGTATGCTGAAAGCATTGAAATGCTAATTAGGAAGCATTTTTTTTTTCCTTCAAAGTAACTTTAAATAACTTTTCGGGAAAGTAAACACAATTA	27	0	contig	58	8200.0	7.7781745930520225	26	0	0	
 3 | 11388	1	79401525	79401525	L	?	1	79401848	79401848	R	?	deletion	False	False	genome	DELLY_v0.6.1	1	1	AGACAGTAACAAAAGTTGGAGGTAAGACAAGGACCCAGATATTGTCAGCCAAAATCCTCCCCAGGTATTTATAACAGAATGGAAATCTCAAGTAAGAATATGGATATTCTGTATACTGTACATACATCAAATGTTTTTATAGGAAACCACATGTTACATGTACATATGACATAATCAAATGCATGATAAGTATTTATTGCAAATTCAT	61	0	contig	225	731	7.0710678118654755	93	0	0	
 4 | 11425	1	143164727	143164727	R	?	1	143165037	143165037	R	?	inversion	True	False	genome	DELLY_v0.6.1	1	0	?	?	?	split reads	14	266.0	5.744562646538029	3	5	2	?
 5 | 10094	11	79346483	79346483	R	?	9	115343095	115343095	L	?	translocation	False	False	genome	DELLY_v0.6.1	1	1	AAACTGCTCCATATTTATTTCATTATTATTATCATTTTCATCATCCTAACGATTATTCAGTATATACCAAGTGTCTCTGATGAAACATGCAGGAGATGAAAAATCCTTGGGTGGGCTTGTTTCTTTCTTTGTGTTTTTTTTTTTGAGATGGAGTCTCGCTCTGGAGCCCAGGCTGG	19	0	contig	32	0.0	0.0	20	10	9	
 6 | 10094	11	79346459	79346459	L	?	9	115343096	115343096	R	?	translocation	False	False	genome	DELLY_v0.6.1	1	1	ATAATATTGTCTCATTCCCATTTTAAACTACCTGTTCCTTAAATTGCATATAAAAATACAGTCCATGCAATATTAATACACTAATGAATAATACACTAACAATTTATTTTCTTAGCCATTTCTTAACCTTTTCCTGTAGTTTCCTGAAGGAAGAGCTGAGTTATAATTTTTGAAAAATAAGAGAGACAAAGTAAAAATTCAG	31	0	contig	65	0	0.0	0	21	0	
 7 | 11963	11	79346459	79346459	L	?	9	115343096	115343096	R	?	translocation	False	False	genome	DELLY_v0.6.1	1	1	ATAATATTGTCTCATTCCCATTTTAAACTACCTGTTCCTTAAATTGCATATAAAAATACAGTCCATGCAATATTAATACACTAATGAATAATACACTAACAATTTATTTTCTTAGCCATTTCTTAACCTTTTCCTGTAGTTTCCTGAAGGAAGAGCTGAGTTATAATTTTTGAAAAATAAGAGAGACAAAGTAAAAATTCAG	31	0	contig	65	0	0.0	0	21	0	
 8 | 11963	11	79346483	79346483	R	?	9	115343095	115343095	L	?	translocation	False	False	genome	DELLY_v0.6.1	1	1	AAACTGCTCCATATTTATTTCATTATTATTATCATTTTCATCATCCTAACGATTATTCAGTATATACCAAGTGTCTCTGATGAAACATGCAGGAGATGAAAAATCCTTGGGTGGGCTTGTTTCTTTCTTTGTGTTTTTTTTTTTGAGATGGAGTCTCGCTCTGGAGCCCAGGCTGG	19	0	contig	32	0.0	0.0	20	10	9	
 9 | 11974	11	56271180	56271593	L	?	9	132187570	132187570	R	?	translocation	False	False	genome	DELLY_v0.6.1	0	0	?	?	?	split and flanking	7	0	0.0	1	3	0	?
10 | 


--------------------------------------------------------------------------------
/tests/data/pipeline_config.cfg:
--------------------------------------------------------------------------------
 1 | [reference]
 2 | template_metadata = tests/data/cytoBand.txt
 3 | annotations = tests/data/mock_annotations.json
 4 | masking = tests/data/mock_masking.tab
 5 | reference_genome = tests/data/mock_reference_genome.fa
 6 | aligner_reference = tests/data/mock_reference_genome.2bit
 7 | dgv_annotation = tests/data/mock_dgv_annotation.txt
 8 | 
 9 | [annotate]
10 | draw_fusions_only = False
11 | 
12 | [schedule]
13 | 
14 | [validate]
15 | # evidence related settings
16 | aligner = blat
17 | assembly_max_paths = 4
18 | assembly_min_exact_match_to_remap = 4
19 | assembly_min_edge_trim_weight = 4
20 | assembly_min_remap_coverage = 0
21 | assembly_min_remapped_seq = 3
22 | assembly_strand_concordance = 0.51
23 | blat_min_identity = 0.9
24 | call_error = 10
25 | contig_aln_max_event_size = 50
26 | contig_aln_merge_inner_anchor = 20
27 | contig_aln_merge_outer_anchor = 15
28 | contig_aln_min_anchor_size = 50
29 | contig_aln_min_query_consumption = 0.7
30 | fetch_reads_bins = 5
31 | fetch_reads_limit = 10000
32 | fetch_min_bin_size = 50
33 | filter_secondary_alignments = True
34 | fuzzy_mismatch_number = 1
35 | max_sc_preceeding_anchor = 6
36 | min_anchor_exact = 6
37 | min_anchor_fuzzy = 10
38 | min_anchor_match = 0.9
39 | min_double_aligned_to_estimate_insertion_size = 2
40 | min_flanking_pairs_resolution = 3
41 | min_linking_split_reads = 1
42 | min_mapping_quality = 5
43 | min_non_target_aligned_split_reads = 1
44 | min_sample_size_to_apply_percentage = 10
45 | min_softclipping = 6
46 | min_spanning_reads_resolution = 3
47 | min_splits_reads_resolution = 3
48 | stdev_count_abnormal = 3.0
49 | strand_determining_read = 2
50 | outer_window_min_event_size = 125
51 | 
52 | [cluster]
53 | uninformative_filter = True
54 | # all chromosomes
55 | limit_to_chr = None
56 | min_clusters_per_file = 2
57 | 
58 | [mock-A36971]
59 | read_length = 150
60 | median_fragment_size = 400
61 | stdev_fragment_size = 97
62 | bam_file = tests/data/mock_reads_for_events.sorted.bam
63 | protocol = genome
64 | inputs = mock_converted
65 | strand_specific = False
66 | disease_status=diseased
67 | 
68 | [mock-A47933]
69 | read_length = 75
70 | median_fragment_size = 188
71 | stdev_fragment_size = 50
72 | bam_file = tests/data/mock_trans_reads_for_events.sorted.bam
73 | protocol = transcriptome
74 | inputs = tests/data/mock_trans_sv_events.tsv
75 | strand_specific = True
76 | disease_status=diseased
77 | 
78 | [summary]
79 | filter_min_remapped_reads = 5
80 | filter_min_spanning_reads = 5
81 | filter_min_flanking_reads = 10
82 | filter_min_split_reads = 5
83 | filter_min_linking_split_reads = 1
84 | filter_cdna_synon = True
85 | filter_protein_synon = True
86 | 
87 | [convert]
88 | assume_no_untemplated = True
89 | # add the same file twice to check this notation is ok (duplicates will be collapsed anyway)
90 | mock_converted = convert_tool_output
91 |     tests/data/mock_sv_events.tsv
92 |     tests/data/mock_sv_events.tsv
93 |     mavis
94 |     False
95 | 
96 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <object type='image/svg+xml' data='docs/images/acronym.svg'>
 2 |     <object type='image/svg+xml' data='images/acronym.svg'>
 3 |     	<img src='docs/images/acronym.svg' onerror='this.src="images/acronym.svg"'>
 4 |     </object><br>
 5 | </object>
 6 | 
 7 | ![PyPi](https://img.shields.io/pypi/v/mavis.svg) ![build](https://github.com/bcgsc/mavis/workflows/build/badge.svg?branch=master) [![codecov](https://codecov.io/gh/bcgsc/mavis/branch/master/graph/badge.svg)](https://codecov.io/gh/bcgsc/mavis) ![ReadTheDocs](https://readthedocs.org/projects/pip/badge/)
 8 | 
 9 | ## About
10 | 
11 | [MAVIS](http://mavis.bcgsc.ca) is a Python command-line tool for the post-processing of structural variant calls.
12 | The general [MAVIS](http://mavis.bcgsc.ca) pipeline consists of six main stages:
13 | 
14 | - convert
15 | - [cluster](https://mavis.readthedocs.io/en/latest/package/mavis/cluster)
16 | - [validate](https://mavis.readthedocs.io/en/latest/package/mavis/validate)
17 | - [annotate](https://mavis.readthedocs.io/en/latest/package/mavis/annotate)
18 | - [pairing](https://mavis.readthedocs.io/en/latest/package/mavis/pairing)
19 | - [summary](https://mavis.readthedocs.io/en/latest/package/mavis/summary)
20 | 
21 | ## Getting Help
22 | 
23 | All steps in the MAVIS pipeline are invoked through the main `mavis` entry point. The usage menu can be viewed
24 | by running `mavis` without any arguments, or by giving the -h/--help option
25 | 
26 | ``` bash
27 | mavis -h
28 | ```
29 | 
30 | Help sub-menus can be found by giving the pipeline step followed by no arguments or the -h option
31 | 
32 | ``` bash
33 | mavis cluster -h
34 | ```
35 | 
36 | Common problems and questions are addressed on the [wiki](https://github.com/bcgsc/mavis/wiki/Help-and-Frequently-Asked-Questions).
37 | If you have a question or issue that is not answered there (and is not already covered by a GitHub issue), please submit
38 | a GitHub issue on our [github page](https://github.com/bcgsc/mavis/issues) or contact us by email at [mavis@bcgsc.ca](mailto:mavis@bcgsc.ca)
39 | 
40 | ## Getting Started
41 | 
42 | The simplest way to use MAVIS is via Singularity. The MAVIS docker container used
43 | by singularity will take care of installing the aligner as well.
44 | 
45 | ```bash
46 | pip install -U setuptools pip wheel
47 | pip install mavis_config  # also installs snakemake
48 | ```
49 | 
50 | Now you will run mavis via Snakemake as follows
51 | 
52 | ```bash
53 | snakemake \
54 |     -j <MAX JOBS> \
55 |     --configfile <YOUR CONFIG> \
56 |     --use-singularity \
57 |     -s Snakefile
58 | ```
59 | 
60 | For other installation options which do not use docker/singularity see the comprehensive install
61 | instructions in the [user manual](https://mavis.readthedocs.io/en/latest/install)
62 | 
63 | ## Citation
64 | 
65 | If you use MAVIS as a part of your project please cite
66 | 
67 | [Reisle,C. et al. (2018) MAVIS: Merging, Annotation, Validation, and Illustration of Structural variants. Bioinformatics.](https://doi.org/10.1093/bioinformatics/bty621)
68 | 


--------------------------------------------------------------------------------
/docs/background/citations.md:
--------------------------------------------------------------------------------
 1 | # Literature
 2 | 
 3 | ## Abyzov-2011
 4 | 
 5 | Abyzov,A. et al. (2011) CNVnator: an approach to discover, genotype,
 6 |     and characterize typical and atypical CNVs from family and
 7 |     population genome sequencing. Genome Res., 21, 974--984.
 8 | 
 9 | ## Abyzov-2015
10 | 
11 | Abyzov,A. et al. (2015) Analysis of deletion breakpoints from 1,092
12 |     humans reveals details of mutation mechanisms. Nat. Commun.,
13 |     6, 7256.
14 | 
15 | ## Chen-2009
16 | 
17 | Chen,K. et al. (2009) BreakDancer: an algorithm for high-resolution
18 |     mapping of genomic structural variation. Nat. Methods, 6, 677--681.
19 | 
20 | ## Chen-2016
21 | 
22 | Chen,X. et al. (2016) Manta: rapid detection of structural variants
23 |     and indels for germline and cancer sequencing applications.
24 |     Bioinformatics, 32, 1220--1222.
25 | 
26 | ## Chiu-2021
27 | 
28 | Chiu,R. et al. (2021) Straglr: discovering and genotyping tandem repeat
29 |     expansions using whole genome long-read sequences. Genome Biol., 22, 224.
30 | 
31 | ## Haas-2017
32 | 
33 | Haas,B. et al. (2017) STAR-Fusion: Fast and Accurate Fusion
34 |     Transcript Detection from RNA-Seq. doi:
35 |     <https://doi.org/10.1101/120295>
36 | 
37 | ## Iyer-2011
38 | 
39 | Iyer,M.K. et al. (2011) ChimeraScan: a tool for identifying chimeric
40 |     transcription in sequencing data. Bioinformatics, 27, 2903--2904.
41 | 
42 | ## MacDonald-2014
43 | 
44 | MacDonald,J.R. et al. (2014) The Database of Genomic Variants: a
45 |     curated collection of structural variation in the human genome.
46 |     Nucleic Acids Res., 42, D986--92.
47 | 
48 | ## McPherson-2011
49 | 
50 | McPherson,A. et al. (2011) deFuse: an algorithm for gene fusion
51 |     discovery in tumor RNA-Seq data. PLoS Comput. Biol., 7, e1001138.
52 | 
53 | ## Rausch-2012
54 | 
55 | Rausch,T. et al. (2012) DELLY: structural variant discovery by
56 |     integrated paired-end and split-read analysis. Bioinformatics, 28,
57 |     i333--i339.
58 | 
59 | ## Robertson-2010
60 | 
61 | Robertson,G. et al. (2010) De novo assembly and analysis of RNA-seq
62 |     data. Nat. Methods, 7, 909--912.
63 | 
64 | ## Saunders-2012
65 | 
66 | Saunders,C.T. et al. (2012) Strelka: accurate somatic small-variant
67 |     calling from sequenced tumor--normal sample pairs. Bioinformatics,
68 |     28, 1811--1817.
69 | 
70 | ## Uhrig-2021
71 | 
72 | Uhrig,S. et al. (2021) Accurate and efficient detection of gene
73 |     fusions from RNA sequencing data. Genome Res., 31, 448--460.
74 | 
75 | ## Yates-2016
76 | 
77 | Yates,A. et al. (2016) Ensembl 2016. Nucleic Acids Res., 44,
78 |     D710--D716.
79 | 
80 | ## Ye-2009
81 | 
82 | Ye,K. et al. (2009) Pindel: a pattern growth approach to detect
83 |     break points of large deletions and medium sized insertions from
84 |     paired-end short reads. Bioinformatics, 25, 2865--2871.
85 | 
86 | ## den-Dunnen-2016
87 | 
88 | den Dunnen,J.T. et al. (2016) HGVS Recommendations for the
89 |     Description of Sequence Variants: 2016 Update. Hum. Mutat., 37,
90 |     564--569.
91 | 


--------------------------------------------------------------------------------
/tests/test_mavis/test_blat.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from mavis.constants import CIGAR, reverse_complement
 4 | from mavis.validate.blat import Blat
 5 | 
 6 | from .mock import Mock, MockFunction, MockLongString
 7 | 
 8 | 
 9 | class TestConvertPslxToPysam:
10 |     def test_simple(self):
11 |         row = {
12 |             'match': 142,
13 |             'mismatch': 0,
14 |             'repmatch': 0,
15 |             'ncount': 0,
16 |             'qgap_count': 0,
17 |             'qgap_bases': 0,
18 |             'tgap_count': 0,
19 |             'tgap_bases': 0,
20 |             'strand': '-',
21 |             'qname': 'seq1',
22 |             'qsize': 204,
23 |             'qstart': 0,
24 |             'qend': 142,
25 |             'tname': '17',
26 |             'tsize': 81195210,
27 |             'tstart': 32673408,
28 |             'tend': 32673550,
29 |             'block_count': 1,
30 |             'block_sizes': [142],
31 |             'qstarts': [62],
32 |             'tstarts': [32673408],
33 |             '_index': 880,
34 |             'score': 142,
35 |             'percent_ident': 100.0,
36 |             'qseq_full': (
37 |                 'ACATGTGCACAACGTGCAGGTTTGTTACATATGTATACATGTGCCATGTTGGTTTGCTGCACCCATTAACTCGTCCTAGTTTATTACTAGTCTTCAGACATC'
38 |                 'CAGAAAATAGAGTAAGATACTAGGTAGACATAACACCTAGATACATCCGTAAGGCATTTGTTTCCTATCACATGGCCCATTCTAGCTTAACACCCACCAACT'
39 |             ),
40 |         }
41 |         refseq = {
42 |             '17': Mock(
43 |                 seq=MockLongString(
44 |                     'ACTAGGTGTTATGTCTACCTAGTATCTTACTCTATTTTCTGGATGTCTGAAGACTAGTAATAAACTAGGACGAGTTAATGGGTGCAGCAAACCAACATGGCACATG'
45 |                     'TATACATATGTAACAAACCTGCACGTTGTGCACATGTACCCTAAAACTTAAAGTATAAAAAAAAATTTCACTGAGCATAAGACTTCAGACACAAAAGAGTGCATGC'
46 |                     'CATATAATTCCATTTATGTGAATTTCAAGAACAATCAGTGATGACAGAAGTCAAAGTAGTGGTCACCTCTGGAAGGTGGGACATTGACC',
47 |                     32673407,
48 |                 )
49 |             )
50 |         }
51 |         cache = Mock(reference_id=MockFunction(16))
52 |         read = Blat.pslx_row_to_pysam(row, cache, refseq)
53 |         assert read.reference_id == 16
54 |         assert read.reference_name == '17'
55 |         assert reverse_complement(read.query_sequence) == row['qseq_full']
56 |         assert read.cigar == [(CIGAR.S, 62), (CIGAR.EQ, 142)]
57 | 
58 |     def test_overlapping_blat_blocks_error(self):
59 |         row = {
60 |             'strand': '+',
61 |             'qname': 'seq23',
62 |             'tname': '7',
63 |             'block_sizes': [54, 53, 36, 80, 29],
64 |             'qstarts': [0, 55, 108, 143, 223],
65 |             'tstarts': [61279112, 61279166, 61397315, 61990208, 62366144],
66 |             'score': 207,
67 |             'percent_ident': 91.3,
68 |             'qseq_full': (
69 |                 'CAAAAGGAAATACCTTCACATAAATTCTAGACGGAAGCAATCTGAGAAACTTTTATTGTGATTTGTGCATTCACTTCACAGAGTTAAAACTTTCTTTTGATT'
70 |                 'GAGCAGTTTGAAACTCTGTTTTTGTAGAATCTGCAAGTGGACATTTGGAGCGCTTTGAGGCCTATGGTGGAAAAGGAAATATCTTCACAGGAAAACTAGATA'
71 |                 'GAAGTATTCTGAGAAACTTCTTTGTGATGTATGCAGTCATATCTCAGA'
72 |             ),
73 |         }
74 |         cache = Mock(reference_id=MockFunction(6))
75 |         with pytest.raises(AssertionError):
76 |             Blat.pslx_row_to_pysam(row, cache, None)
77 | 


--------------------------------------------------------------------------------
/docs/package/mavis/validate/index.md:
--------------------------------------------------------------------------------
 1 | # Sub-package Documentation
 2 | 
 3 | The validation sub-package is responsible for pulling supporting reads from the bam file
 4 | and re-calling events based on the evidence in a standard notation.
 5 | 
 6 | ## Types of Output Files
 7 | 
 8 | A variety of intermediate output files are given for the user. These can be used to "drill down"
 9 | further into events and also for developers debugging when adding new features, etc.
10 | 
11 | | expected name/suffix        | file type/format                                    | content                            |
12 | | --------------------------- | --------------------------------------------------- | ---------------------------------- |
13 | | ``*.raw_evidence.bam``      | [bam](../../../glossary/#bam)                       | raw evidence                       |
14 | | ``*.contigs.bam``           | [bam](../../../glossary/#bam)                       | aligned contigs                    |
15 | | ``*.evidence.bed``          | [bed](../../../glossary/#bed)                       | evidence collection window regions |
16 | | ``*.validation-passed.bed`` | [bed](../../../glossary/#bed)                       | validated event positions          |
17 | | ``*.validation-failed.tab`` | text/tabbed                                         | failed events                      |
18 | | ``*.validation-passed.tab`` | text/tabbed                                         | validated events                   |
19 | | ``*.contigs.fa``            | [fasta](../../../glossary/#fasta)                   | assembled contigs                  |
20 | | ``*.contigs.blat_out.pslx`` | [pslx](../../../glossary/#pslx)                     | results from blatting contigs      |
21 | | ``*.igv.batch``             | [IGV batch file](../../../glossary/#IGV-batch-file) | igv batch file                     |
22 | 
23 | 
24 | ## Algorithm Overview
25 | 
26 | - (For each breakpoint pair)
27 | 
28 |     - [Calculate the window/region](../../../background/theory/#calculating-the-evidence-window) to read from the bam and collect
29 |       evidence
30 |     - Store evidence ([flanking read pair](../../../glossary/#flanking-read-pair), [half-mapped read](../../../glossary/#half-mapped-read), [spanning read](../../../glossary/#spanning-read), [split read](../../../glossary/#split-read),
31 |       [compatible flanking pairs](../../../glossary/#compatible-flanking-pairs)) which match the expected event type and position
32 |     - Assemble a contig from the collected reads. see [theory - assembling contigs](../../../background/theory/#assembling-contigs)
33 | 
34 | - Generate a [fasta](../../../glossary/#fasta) file containing all the contig sequences
35 | - Align contigs to the reference genome (currently [blat](../../../glossary/#blat) is used to perform this step)
36 | - Make the final event calls. Each level of calls consumes all supporting reads so they are not re-used in subsequent
37 |   levels of calls.
38 | - (For each breakpoint pair)
39 | 
40 |     - call by contig
41 |     - call by [spanning read](../../../glossary/#spanning-read)
42 |     - call by [split read](../../../glossary/#split-read)
43 |     - call by [flanking read pair](../../../glossary/#flanking-read-pair). see [theory - calling breakpoints by flanking evidence](../../../background/theory/#calling-breakpoints-by-flanking-evidence)
44 | 
45 | - Output new calls, evidence, contigs, etc
46 | 


--------------------------------------------------------------------------------
/tests/full-tutorial.config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "annotate.draw_fusions_only": true,
 3 |     "cluster.min_clusters_per_file": 100,
 4 |     "cluster.uninformative_filter": true,
 5 |     "convert": {
 6 |         "breakdancer": {
 7 |             "assume_no_untemplated": true,
 8 |             "file_type": "breakdancer",
 9 |             "inputs": [
10 |                 "tutorial_data/breakdancer-1.4.5/*txt"
11 |             ]
12 |         },
13 |         "breakseq": {
14 |             "assume_no_untemplated": true,
15 |             "file_type": "breakseq",
16 |             "inputs": [
17 |                 "tutorial_data/breakseq-2.2/breakseq.vcf.gz"
18 |             ]
19 |         },
20 |         "chimerascan": {
21 |             "assume_no_untemplated": true,
22 |             "file_type": "chimerascan",
23 |             "inputs": [
24 |                 "tutorial_data/chimerascan-0.4.5/chimeras.bedpe"
25 |             ]
26 |         },
27 |         "defuse": {
28 |             "assume_no_untemplated": true,
29 |             "file_type": "defuse",
30 |             "inputs": [
31 |                 "tutorial_data/defuse-0.6.2/results.classify.tsv"
32 |             ]
33 |         },
34 |         "manta": {
35 |             "assume_no_untemplated": true,
36 |             "file_type": "manta",
37 |             "inputs": [
38 |                 "tutorial_data/manta-1.0.0/diploidSV.vcf.gz",
39 |                 "tutorial_data/manta-1.0.0/somaticSV.vcf"
40 |             ]
41 |         }
42 |     },
43 |     "libraries": {
44 |         "L1522785992-normal": {
45 |             "assign": [
46 |                 "breakdancer",
47 |                 "breakseq",
48 |                 "manta"
49 |             ],
50 |             "bam_file": "tutorial_data/L1522785992_normal.sorted.bam",
51 |             "disease_status": "normal",
52 |             "protocol": "genome"
53 |         },
54 |         "L1522785992-trans": {
55 |             "assign": [
56 |                 "chimerascan",
57 |                 "defuse"
58 |             ],
59 |             "bam_file": "tutorial_data/L1522785992_trans.sorted.bam",
60 |             "disease_status": "diseased",
61 |             "protocol": "transcriptome",
62 |             "strand_specific": true
63 |         },
64 |         "L1522785992-tumour": {
65 |             "assign": [
66 |                 "breakdancer",
67 |                 "breakseq",
68 |                 "manta"
69 |             ],
70 |             "bam_file": "tutorial_data/L1522785992_tumour.sorted.bam",
71 |             "disease_status": "diseased",
72 |             "protocol": "genome"
73 |         }
74 |     },
75 |     "output_dir": "output_dir_full",
76 |     "reference.aligner_reference": [
77 |         "reference_inputs/hg19.2bit"
78 |     ],
79 |     "reference.annotations": [
80 |         "reference_inputs/ensembl69_hg19_annotations.v3.json"
81 |     ],
82 |     "reference.dgv_annotation": [
83 |         "tests/data/mock_dgv_annotation.tab"
84 |     ],
85 |     "reference.masking": [
86 |         "reference_inputs/hg19_masking.tab"
87 |     ],
88 |     "reference.reference_genome": [
89 |         "reference_inputs/hg19.fa"
90 |     ],
91 |     "reference.template_metadata": [
92 |         "reference_inputs/cytoBand.txt"
93 |     ],
94 |     "summary.filter_min_flanking_reads": 10,
95 |     "summary.filter_min_linking_split_reads": 1,
96 |     "summary.filter_min_remapped_reads": 5,
97 |     "summary.filter_min_spanning_reads": 5
98 | }
99 | 


--------------------------------------------------------------------------------
/src/mavis/convert/transabyss.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from ..constants import COLUMNS
 4 | from .constants import SUPPORTED_TOOL, TRACKING_COLUMN
 5 | 
 6 | 
 7 | def convert_row(row):
 8 |     """
 9 |     transforms the transabyss output into the common format for expansion.
10 |     Maps the input column names to column names which MAVIS can read
11 |     """
12 |     std_row = {}
13 |     if TRACKING_COLUMN not in row:
14 |         std_row[TRACKING_COLUMN] = '{}-{}'.format(SUPPORTED_TOOL.TA, row['id'])
15 | 
16 |     std_row[COLUMNS.event_type] = row.get('rearrangement', row['type'])
17 |     for retained_column in ['genes', 'gene']:
18 |         if retained_column in row:
19 |             std_row['{}_{}'.format(SUPPORTED_TOOL.TA, retained_column)] = row[retained_column]
20 |     if std_row[COLUMNS.event_type] in ['LSR', 'translocation']:
21 |         del std_row[COLUMNS.event_type]
22 |     if 'breakpoint' in row:
23 |         std_row[COLUMNS.break1_orientation], std_row[COLUMNS.break2_orientation] = row[
24 |             'orientations'
25 |         ].split(',')
26 |         match = re.match(
27 |             r'^(?P<chr1>[^:]+):(?P<pos1_start>\d+)\|(?P<chr2>[^:]+):(?P<pos2_start>\d+)$',
28 |             row['breakpoint'],
29 |         )
30 |         if not match:
31 |             raise OSError(
32 |                 'file format error: the breakpoint column did not satisfy the expected pattern', row
33 |             )
34 |         for group, col in zip(
35 |             ['chr1', 'pos1_start', 'chr2', 'pos2_start'],
36 |             [
37 |                 COLUMNS.break1_chromosome,
38 |                 COLUMNS.break1_position_start,
39 |                 COLUMNS.break2_chromosome,
40 |                 COLUMNS.break2_position_start,
41 |             ],
42 |         ):
43 |             std_row[col] = match[group]
44 |     else:
45 |         std_row.update(
46 |             {
47 |                 COLUMNS.break1_chromosome: row['chr'],
48 |                 COLUMNS.break1_position_start: int(row['chr_start']),
49 |                 COLUMNS.break2_position_start: int(row['chr_end']),
50 |             }
51 |         )
52 |         if std_row[COLUMNS.event_type] == 'del':
53 |             std_row[COLUMNS.break1_position_start] -= 1
54 |             std_row[COLUMNS.break2_position_start] += 1
55 |         elif std_row[COLUMNS.event_type] == 'ins':
56 |             std_row[COLUMNS.break2_position_start] += 1
57 | 
58 |         # add the untemplated sequence where appropriate
59 |         if std_row[COLUMNS.event_type] == 'del':
60 |             assert row['alt'] == 'na'
61 |             std_row[COLUMNS.untemplated_seq] = ''
62 |         elif std_row[COLUMNS.event_type] in ['dup', 'ITD']:
63 |             length = (
64 |                 std_row[COLUMNS.break2_position_start] - std_row[COLUMNS.break1_position_start] + 1
65 |             )
66 |             if len(row['alt']) != length:
67 |                 raise AssertionError(
68 |                     'expected alternate sequence to be equal to the length of the event',
69 |                     len(row['alt']),
70 |                     length,
71 |                     row,
72 |                     std_row,
73 |                 )
74 |             std_row[COLUMNS.untemplated_seq] = ''
75 |         elif std_row[COLUMNS.event_type] == 'ins':
76 |             std_row[COLUMNS.untemplated_seq] = row['alt'].upper()
77 |         else:
78 |             raise NotImplementedError('unexpected indel type', std_row[COLUMNS.event_type])
79 |     return std_row
80 | 


--------------------------------------------------------------------------------
/tests/test_tools/data/K02718.1.gff3:
--------------------------------------------------------------------------------
 1 | K02718.1	Genbank	CDS	1140	2813	.	+	0	ID=cds-AAA46936.1;Parent=gene-E1;Dbxref=NCBI_GP:AAA46936.1;Name=AAA46936.1;Note=E1 interrupted ORF from 859 to 2813%3B putative;gbkey=CDS;gene=E1;product=replication protein;protein_id=AAA46936.1
 2 | K02718.1	Genbank	CDS	2755	3852	.	+	0	ID=cds-AAA46941.1;Parent=gene-E2;Dbxref=NCBI_GP:AAA46941.1;Name=AAA46941.1;Note=E2 ORF from 2725 to 3852%3B putative;gbkey=CDS;gene=E2;product=regulatory protein;protein_id=AAA46941.1
 3 | K02718.1	Genbank	CDS	3332	3619	.	+	0	ID=cds-AAA46937.1;Parent=gene-E4;Dbxref=NCBI_GP:AAA46937.1;Name=AAA46937.1;gbkey=CDS;gene=E4;partial=true;product=AAA46937.1;protein_id=AAA46937.1;start_range=.,3332
 4 | K02718.1	Genbank	CDS	3863	4099	.	+	0	ID=cds-AAA46938.1;Parent=gene-E5;Dbxref=NCBI_GP:AAA46938.1;Name=AAA46938.1;gbkey=CDS;gene=E5;partial=true;product=AAA46938.1;protein_id=AAA46938.1;start_range=.,3863
 5 | K02718.1	Genbank	CDS	4235	5656	.	+	0	ID=cds-AAA46942.1;Parent=gene-L2;Dbxref=NCBI_GP:AAA46942.1;Name=AAA46942.1;Note=L2 ORF from 4133 to 5656%3B putative;gbkey=CDS;gene=L2;product=minor capsid protein;protein_id=AAA46942.1
 6 | K02718.1	Genbank	CDS	5559	7154	.	+	0	ID=cds-AAA46943.1;Parent=gene-L1;Dbxref=NCBI_GP:AAA46943.1;Name=AAA46943.1;Note=L1 ORF from 5526 to 7154%3B putative;gbkey=CDS;gene=L1;product=major capsid protein;protein_id=AAA46943.1
 7 | K02718.1	Genbank	CDS	562	858	.	+	0	ID=cds-AAA46940.1;Parent=gene-E7;Dbxref=NCBI_GP:AAA46940.1;Name=AAA46940.1;Note=E7 ORF from 544 to 858%3B putative;gbkey=CDS;gene=E7;product=transforming protein;protein_id=AAA46940.1
 8 | K02718.1	Genbank	CDS	83	559	.	+	0	ID=cds-AAA46939.1;Parent=gene-E6;Dbxref=NCBI_GP:AAA46939.1;Name=AAA46939.1;Note=E6 ORF from 65 to 559%3B putative;gbkey=CDS;gene=E6;product=transforming protein;protein_id=AAA46939.1
 9 | K02718.1	Genbank	CDS	865	1140	.	+	0	ID=cds-AAA46936.1;Parent=gene-E1;Dbxref=NCBI_GP:AAA46936.1;Name=AAA46936.1;Note=E1 interrupted ORF from 859 to 2813%3B putative;gbkey=CDS;gene=E1;product=replication protein;protein_id=AAA46936.1
10 | K02718.1	Genbank	gene	1140	2813	.	+	.	ID=gene-E1;Name=E1;gbkey=Gene;gene=E1;gene_biotype=protein_coding
11 | K02718.1	Genbank	gene	2755	3852	.	+	.	ID=gene-E2;Name=E2;gbkey=Gene;gene=E2;gene_biotype=protein_coding
12 | K02718.1	Genbank	gene	3332	3619	.	+	.	ID=gene-E4;Name=E4;gbkey=Gene;gene=E4;gene_biotype=protein_coding
13 | K02718.1	Genbank	gene	3863	4099	.	+	.	ID=gene-E5;Name=E5;gbkey=Gene;gene=E5;gene_biotype=protein_coding
14 | K02718.1	Genbank	gene	4235	5656	.	+	.	ID=gene-L2;Name=L2;gbkey=Gene;gene=L2;gene_biotype=protein_coding
15 | K02718.1	Genbank	gene	5559	7154	.	+	.	ID=gene-L1;Name=L1;gbkey=Gene;gene=L1;gene_biotype=protein_coding
16 | K02718.1	Genbank	gene	562	858	.	+	.	ID=gene-E7;Name=E7;gbkey=Gene;gene=E7;gene_biotype=protein_coding
17 | K02718.1	Genbank	gene	83	559	.	+	.	ID=gene-E6;Name=E6;gbkey=Gene;gene=E6;gene_biotype=protein_coding
18 | K02718.1	Genbank	gene	865	1140	.	+	.	ID=gene-E1;Name=E1;gbkey=Gene;gene=E1;gene_biotype=protein_coding
19 | K02718.1	Genbank	region	17	23	.	+	.	ID=id-K02718.1:17..23;gbkey=TATA_signal
20 | K02718.1	Genbank	region	1	7904	.	+	.	ID=K02718.1:1..7904;Dbxref=taxon:333760;Is_circular=true;gbkey=Src;mol_type=genomic DNA
21 | K02718.1	Genbank	region	4213	4218	.	+	.	ID=id-K02718.1:4213..4218;Note=putative;gbkey=polyA_signal
22 | K02718.1	Genbank	region	4289	4295	.	+	.	ID=id-L2;gbkey=TATA_signal;gene=L2
23 | K02718.1	Genbank	region	65	71	.	+	.	ID=id-K02718.1:65..71;gbkey=TATA_signal
24 | K02718.1	Genbank	region	7260	7265	.	+	.	ID=id-K02718.1:7260..7265;gbkey=polyA_signal
25 | 


--------------------------------------------------------------------------------
/tests/data/mock_reference_annotations.full.json:
--------------------------------------------------------------------------------
1 | {"genes": [{"aliases": ["C9orf47"], "chr": "fakereference9", "end": 5278, "name": "ENSG00000186354", "start": 1, "strand": "+", "transcripts": [{"aliases": [], "cdna_coding_end": 685, "cdna_coding_start": 134, "domains": [], "end": 5278, "exons": [{"end": 322, "start": 1}, {"end": 833, "start": 608}, {"end": 5278, "start": 990}], "is_best_transcript": true, "name": "ENST00000375851", "start": 1}, {"aliases": [], "cdna_coding_end": 783, "cdna_coding_start": 76, "domains": [], "end": 1202, "exons": [{"end": 322, "start": 59}, {"end": 1202, "start": 608}], "is_best_transcript": false, "name": "ENST00000375850", "start": 59}, {"aliases": [], "cdna_coding_end": 677, "cdna_coding_start": 69, "domains": [], "end": 5278, "exons": [{"end": 379, "start": 66}, {"end": 833, "start": 608}, {"end": 5278, "start": 990}], "is_best_transcript": false, "name": "ENST00000334490", "start": 66}]}, {"aliases": ["S1PR3"], "chr": "fakereference9", "end": 14148, "name": "ENSG00000213694", "start": 585, "strand": "+", "transcripts": [{"aliases": [], "cdna_coding_end": 1533, "cdna_coding_start": 397, "domains": [{"name": "PR00362", "regions": [{"end": 62, "start": 49}, {"end": 200, "start": 185}, {"end": 308, "start": 298}]}, {"name": "PR00642", "regions": [{"end": 75, "start": 63}, {"end": 112, "start": 102}, {"end": 155, "start": 139}, {"end": 345, "start": 329}]}, {"name": "PS50262", "regions": [{"end": 298, "start": 56}]}, {"name": "PF10320", "regions": [{"end": 312, "start": 55}]}, {"name": "SSF81321", "regions": [{"end": 340, "start": 1}]}, {"name": "PR00237", "regions": [{"end": 65, "start": 41}, {"end": 95, "start": 74}, {"end": 140, "start": 118}, {"end": 174, "start": 153}, {"end": 219, "start": 196}, {"end": 265, "start": 241}, {"end": 306, "start": 280}]}, {"name": "PR01523", "regions": [{"end": 25, "start": 13}, {"end": 101, "start": 92}, {"end": 123, "start": 112}, {"end": 204, "start": 194}, {"end": 224, "start": 215}, {"end": 283, "start": 272}, {"end": 311, "start": 
301}]}, {"name": "PF00001", "regions": [{"end": 298, "start": 56}]}, {"name": "PR01524", "regions": [{"end": 40, "start": 24}, {"end": 155, "start": 139}, {"end": 233, "start": 223}, {"end": 323, "start": 314}, {"end": 340, "start": 326}]}], "end": 14148, "exons": [{"end": 833, "start": 585}, {"end": 14148, "start": 10192}], "is_best_transcript": false, "name": "ENST00000358157", "start": 585}, {"aliases": [], "cdna_coding_end": 5832, "cdna_coding_start": 4696, "domains": [{"name": "PF10320", "regions": [{"end": 312, "start": 55}]}, {"name": "PR00362", "regions": [{"end": 62, "start": 49}, {"end": 200, "start": 185}, {"end": 308, "start": 298}]}, {"name": "PS50262", "regions": [{"end": 298, "start": 56}]}, {"name": "PR00642", "regions": [{"end": 75, "start": 63}, {"end": 112, "start": 102}, {"end": 155, "start": 139}, {"end": 345, "start": 329}]}, {"name": "PR00237", "regions": [{"end": 65, "start": 41}, {"end": 95, "start": 74}, {"end": 140, "start": 118}, {"end": 174, "start": 153}, {"end": 219, "start": 196}, {"end": 265, "start": 241}, {"end": 306, "start": 280}]}, {"name": "PR01523", "regions": [{"end": 25, "start": 13}, {"end": 101, "start": 92}, {"end": 123, "start": 112}, {"end": 204, "start": 194}, {"end": 224, "start": 215}, {"end": 283, "start": 272}, {"end": 311, "start": 301}]}, {"name": "PR01524", "regions": [{"end": 40, "start": 24}, {"end": 155, "start": 139}, {"end": 233, "start": 223}, {"end": 323, "start": 314}, {"end": 340, "start": 326}]}, {"name": "PF00001", "regions": [{"end": 298, "start": 56}]}, {"name": "SSF81321", "regions": [{"end": 340, "start": 1}]}], "end": 14148, "exons": [{"end": 14148, "start": 5644}], "is_best_transcript": true, "name": "ENST00000375846", "start": 5644}]}]}


--------------------------------------------------------------------------------
/docs/tutorials/annotation.md:
--------------------------------------------------------------------------------
 1 | # Annotation Only
 2 | 
 3 | Sometimes you have a set of variants and would simply like to run the annotate step of MAVIS to visualize and annotate them.
 4 | 
 5 | First you need to create your basic config to tell MAVIS where the reference files you want to use are and some minimal information about the library/sample you want to process.
 6 | 
 7 | Here is an example config where the user has created a minimal input file in the MAVIS standard input file format. We convert it to expand any unknowns (e.g. the SV type if left blank)
 8 | 
 9 | ```json
10 | {
11 |     "libraries": {
12 |         "my_library": {
13 |             "assign": ["my_converted_file"],
14 |             "disease_status": "normal",
15 |             "protocol": "genome"
16 |         }
17 |     },
18 |     "convert": {
19 |         "my_converted_file": {
20 |             "inputs": ["/path/to/file/structural_variants.txt"],
21 |             "file_type": "mavis"
22 |          }
23 |     },
24 |     "cluster.split_only": true,
25 |     "skip_stage.validate": true,
26 |     "output_dir": "my_output_dir",
27 |     "reference.annotations": "/path/to/mavis/reference_files/ensembl79_hg38_annotations.json",
28 |     "reference.template_metadata": "/path/to/mavis/reference_files/hg38_cytoBand.txt",
29 |     "reference.reference_genome": "/path/to/hg38_no_alt/genome/hg38_no_alt.fa",
30 |     "reference.masking": "/path/to/mavis/reference_files/masking_hg38.adjusted.tab",
31 |     "reference.dgv_annotation": "/path/to/mavis/reference_files/dgv_hg38_annotations.tab"
32 | }
33 | ```
34 | 
35 | Another example is given in the MAVIS tests folder under `tests/mini-tutorial.annotate_only.config.json` which looks like this
36 | 
37 | ```json
38 | {
39 |     "annotate.draw_fusions_only": false,
40 |     "convert": {
41 |         "mock_converted": {
42 |             "inputs": [
43 |                 "tests/data/mock_sv_events.tsv"
44 |             ],
45 |             "file_type": "mavis",
46 |             "assume_no_untemplated": true
47 |         }
48 |     },
49 |     "skip_stage.validate": true,
50 |     "cluster.uninformative_filter": true,
51 |     "cluster.limit_to_chr": null,
52 |     "cluster.min_clusters_per_file": 5,
53 |     "libraries": {
54 |         "mock-A47933": {
55 |             "assign": [
56 |                 "tests/data/mock_trans_sv_events.tsv"
57 |             ],
58 |             "bam_file": "tests/data/mock_trans_reads_for_events.sorted.bam",
59 |             "disease_status": "diseased",
60 |             "protocol": "transcriptome",
61 |             "strand_specific": true
62 |         },
63 |         "mock-A36971": {
64 |             "assign": [
65 |                 "mock_converted"
66 |             ],
67 |             "bam_file": "tests/data/mock_reads_for_events.sorted.bam",
68 |             "disease_status": "diseased",
69 |             "protocol": "genome",
70 |             "strand_specific": false
71 |         }
72 |     },
73 |     "output_dir": "output_dir",
74 |     "reference.annotations": [
75 |         "tests/data/mock_annotations.json"
76 |     ],
77 |     "reference.dgv_annotation": [
78 |         "tests/data/mock_dgv_annotation.txt"
79 |     ],
80 |     "reference.masking": [
81 |         "tests/data/mock_masking.tab"
82 |     ],
83 |     "reference.reference_genome": [
84 |         "tests/data/mock_reference_genome.fa"
85 |     ],
86 |     "reference.template_metadata": [
87 |         "tests/data/cytoBand.txt"
88 |     ]
89 | }
90 | ```
91 | 
92 | Either of these configurations can be run with the following command simply by changing the configfile argument
93 | 
94 | ```bash
95 | snakemake -j 1 \
96 |     --configfile tests/mini-tutorial.annotate_only.config.json \
97 |     -s Snakefile
98 | ```
99 | 


--------------------------------------------------------------------------------
/tests/data/transabyss_indels_output.tab:
--------------------------------------------------------------------------------
 1 | id	type	chr	chr_start	chr_end	ctg	ctg_len	ctg_start	ctg_end	len	ref	alt	event_reads	contig_reads	genome_reads	gene	repeat-length	ctg_strand	from_end	confirm_contig_region	within_simple_repeats	repeatmasker	within_segdup	at_least_1_read_opposite	dbsnp
 2 | 1	ins	1	8877520	8877520	4542232	58938	23102	23103	2	na	tt	41	41	47	RERE:uc001apf.3:exon1|synon	0	+	23101	23102-23117	-	-	-	false	-
 3 | 2	ins	1	16011005	16011005	4541011	129199	97246	97248	3	na	ggc	22	22	25	PLEKHM2:uc010obo.2:exon1|synon	0	-	31951	97234-97248	TRF_SimpleTandemRepeat_GCG	(CGG)n	-	false	-
 4 | 3	ins	1	16926227	16926227	4624842	952	419	419	1	na	t	46	46	68	NBPF1:uc001aza.5:exon3|na	0	-	418	414-419	-	L1ME3	chr1:21766304	false	-
 5 | 4	ins	1	17026040	17026040	4529033	986	780	794	15	na	gcggcggcggcggca	35	35	23	ESPNP:uc001azn.1:exon8|P431_P432insLPPPP	0	+	192	780-794	-	(CGG)n	chr1:6487720	false	-
 6 | 5	ins	1	17026043	17026043	4521063	925	99	143	45	na	gcggcggcggcggcggcggcggcggcggcagcagcagcagcagca	6	6	8	ESPNP:uc001azn.1:exon8|L430_P431insLLLLLLPPPPPPPPP	0	-	98	99-143	-	(CGG)n	chr1:6487720	false	-
 7 | 1175	del	X	142715897	142715924	4547857	78777	52728	52728	28	ttttt...ttttt	na	34	34	17	SLITRK4:uc022cfl.1:exon2|SLITRK4:uc022cfl.1:exon2|synon	0	+	25889	52728-52728	TRF_SimpleTandemRepeat_T	(T)n	-	false	-
 8 | 1176.1	del	X	149115835	149115836	indel_k96_4578561	1263	1145	1145	2	ga	na	37	30	2	LINC00894:uc004fed.1:exon1|LINC00894:uc004fed.1:exon1|na	0	+	118	1145-1149	-	-	chrX:148613958	false	-
 9 | 1176.2	del	X	149115835	149115836	indel_5327	1263	119	119	2	ga	na	37	7	2	LINC00894:uc004fed.1:exon1|LINC00894:uc004fed.1:exon1|na	0	-	118	115-119	-	-	chrX:148613958	false	-
10 | 1177	del	X	153523769	153523790	4654686	26033	2836	2836	22	gcacc...gtgcg	na	8	8	1	TEX28:uc010nut.1:exon1|TEX28:uc010nut.1:exon1|synon	0	+	2835	2836-2924	TRF_SimpleTandemRepeat_CACGTGCGGCACCACCCCCTGA	-	-	false	-
11 | 1178	del	X	154997577	154997583	4522314	63590	44595	44595	7	ttttgtt	na	28	28	23	SPRY3:uc004fnq.1:exon1|SPRY3:uc004fnq.1:exon1|synon	0	+	18995	44595-44595	TRF_SimpleTandemRepeat_TTTTG	(TTTTG)n	chrY:59033286	false	-
12 | 1181	dup	12	13029070	13029073	4659122	38006	26858	26861	4	na	aaaa	38	38	25	RPL13AP20:uc010sho.2:exon1;3utr|NA:NA:NA|NA	0	-	11139	26838-26861	-	(A)n	-	false	-
13 | 1182	dup	12	121839158	121839167	4619408	122056	113544	113553	10	na	aaaaaaaaaa	34	34	15	BC029038:uc001uan.3:exon1;3utr|NA:NA:NA|NA	0	-	8503	113524-113553	-	L2c	-	false	-
14 | 1183	dup	15	44094768	44094775	4533713	84196	41867	41874	8	na	aaaaaaaa	7	7	1	SERF2-C15ORF63:uc001ztb.3:exon6;3utr|NA:NA:NA|NA	0	+	41866	41867-41890	-	AluSq4	-	false	-
15 | 1184	dup	6	27515484	27515553	4632026	88843	27311	27380	70	na	ggaaaacaaaaggtccaggaaaaggatatatacatatatcttcgagcaggttccaccgagacttgaactc	131	131	24	NA:NA:NA|TRNA_Gln:uc021yqh.1:3utr;exon1|NA	0	-	27261	27241-27380	-	tRNA-Gln-CAG	-	false	-
16 | 1185	dup	GL000211.1	108677	108683	4632141	14477	5082	5088	7	na	aaaaaaa	33	33	19	FLJ43315:uc003boa.3:exon5;3utr|NA:NA:NA|NA	0	+	5081	5082-5102	-	FLAM_A	chr9:69378660	false	-
17 | 1232	ITD	9	132345740	132345781	3298328	190	96	137	42	na	tccatcccttcacctccactaagatcagggcaccccaggagt	9	9	13	BC037833:uc004bya.1:exon4|BC037833:uc004bya.1:exon4|na	0	-	53	54-137	-	-	-	false	-
18 | 1233	ITD	GL000220.1	114348	114379	4159437	179	96	127	32	na	cccccgcggggaatcccccgcgaggggggtct	37	37	13	RNA5-8S5:uc022brd.2:exon1|RNA5-8S5:uc022brd.2:exon1|na	0	-	52	64-127	-	LSU-rRNA_Hsa	chrUn_gl000220:145518	false	-
19 | 1234	ITD	GL000220.1	118433	118436	50603	168	81	84	4	na	gcgt	24	24	1	RNA5-8S5:uc022brd.2:exon1|RNA5-8S5:uc022brd.2:exon1|na	0	-	80	73-84	-	(CG)n	-	false	-
20 | 1235	ITD	GL000220.1	118437	118440	107283	168	77	80	4	na	gcgt	323	323	1	RNA5-8S5:uc022brd.2:exon1|RNA5-8S5:uc022brd.2:exon1|na	0	-	76	73-80	-	(CG)n	-	false	-
21 | 1236	ITD	X	84343323	84343327	4588370	15020	7333	7337	5	na	ttttt	5	5	4	APOOL:uc004eem.3:exon9|APOOL:uc004eem.3:exon9|synon	0	-	7332	7308-7337	TRF_SimpleTandemRepeat_T	(T)n	-	false	-
22 | 


--------------------------------------------------------------------------------
/tests/test_mavis/test_help.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | from unittest.mock import patch
  3 | 
  4 | from mavis_config.constants import SUBCOMMAND
  5 | 
  6 | from mavis.main import main
  7 | 
  8 | 
  9 | class TestHelpMenu:
 10 |     def test_main(self):
 11 |         with patch.object(sys, 'argv', ['mavis', '-h']):
 12 |             try:
 13 |                 returncode = main()
 14 |             except SystemExit as err:
 15 |                 assert err.code == 0
 16 |             else:
 17 |                 assert returncode == 0
 18 | 
 19 |     def test_pipeline(self):
 20 |         with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.SETUP, '-h']):
 21 |             try:
 22 |                 returncode = main()
 23 |             except SystemExit as err:
 24 |                 assert err.code == 0
 25 |             else:
 26 |                 assert returncode == 0
 27 | 
 28 |     def test_cluster(self):
 29 |         with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.CLUSTER, '-h']):
 30 |             try:
 31 |                 returncode = main()
 32 |             except SystemExit as err:
 33 |                 assert err.code == 0
 34 |             else:
 35 |                 assert returncode == 0
 36 | 
 37 |     def test_validate(self):
 38 |         with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.VALIDATE, '-h']):
 39 |             try:
 40 |                 returncode = main()
 41 |             except SystemExit as err:
 42 |                 assert err.code == 0
 43 |             else:
 44 |                 assert returncode == 0
 45 | 
 46 |     def test_annotate(self):
 47 |         with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.ANNOTATE, '-h']):
 48 |             try:
 49 |                 returncode = main()
 50 |             except SystemExit as err:
 51 |                 assert err.code == 0
 52 |             else:
 53 |                 assert returncode == 0
 54 | 
 55 |     def test_pairing(self):
 56 |         with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.PAIR, '-h']):
 57 |             try:
 58 |                 returncode = main()
 59 |             except SystemExit as err:
 60 |                 assert err.code == 0
 61 |             else:
 62 |                 assert returncode == 0
 63 | 
 64 |     def test_summary(self):
 65 |         with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.SUMMARY, '-h']):
 66 |             try:
 67 |                 returncode = main()
 68 |             except SystemExit as err:
 69 |                 assert err.code == 0
 70 |             else:
 71 |                 assert returncode == 0
 72 | 
 73 |     def test_convert(self):
 74 |         with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.CONVERT, '-h']):
 75 |             try:
 76 |                 returncode = main()
 77 |             except SystemExit as err:
 78 |                 assert err.code == 0
 79 |             else:
 80 |                 assert returncode == 0
 81 | 
 82 |     def test_overlay(self):
 83 |         with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.OVERLAY, '-h']):
 84 |             try:
 85 |                 returncode = main()
 86 |             except SystemExit as err:
 87 |                 assert err.code == 0
 88 |             else:
 89 |                 assert returncode == 0
 90 | 
 91 |     def test_bad_option(self):
 92 |         with patch.object(sys, 'argv', ['mavis', SUBCOMMAND.SETUP, '--blargh']):
 93 |             try:
 94 |                 returncode = main()
 95 |             except SystemExit as err:
 96 |                 assert err.code != 0
 97 |             else:
 98 |                 assert returncode != 0
 99 | 
100 |     def test_ref_alt_count(self):
101 |         with patch.object(sys, 'argv', ['calculate_ref_alt_counts', '-h']):
102 |             try:
103 |                 returncode = main()
104 |             except SystemExit as err:
105 |                 assert err.code == 0
106 |             else:
107 |                 assert returncode == 0
108 | 


--------------------------------------------------------------------------------
/tests/data/mock_dgv_annotation_mavis.tab:
--------------------------------------------------------------------------------
 1 | tracking_id	event_type	break1_chromosome	break1_position_start	break1_position_end	break1_orientation	break1_strand	break1_seq	break2_chromosome	break2_position_start	break2_position_end	break2_orientation	break2_strand	break2_seq	opposing_strands	stranded	tools	
 2 | nsv482937	None	1	1	1	L	?	None	1	2300000	2300000	L	?	None	True	False	bed	
 3 | nsv482937	None	1	1	1	L	?	None	1	2300000	2300000	R	?	None	False	False	bed	
 4 | nsv482937	None	1	1	1	R	?	None	1	2300000	2300000	L	?	None	False	False	bed	
 5 | nsv482937	None	1	1	1	R	?	None	1	2300000	2300000	R	?	None	True	False	bed	
 6 | dgv1n82	None	1	10001	10001	L	?	None	1	22118	22118	L	?	None	True	False	bed	
 7 | dgv1n82	None	1	10001	10001	L	?	None	1	22118	22118	R	?	None	False	False	bed	
 8 | dgv1n82	None	1	10001	10001	R	?	None	1	22118	22118	L	?	None	False	False	bed	
 9 | dgv1n82	None	1	10001	10001	R	?	None	1	22118	22118	R	?	None	True	False	bed	
10 | rgv2n98	None	1	10001	10001	L	?	None	1	22120	22120	L	?	None	True	False	bed	
11 | rgv2n98	None	1	10001	10001	L	?	None	1	22120	22120	R	?	None	False	False	bed	
12 | rgv2n98	None	1	10001	10001	R	?	None	1	22120	22120	L	?	None	False	False	bed	
13 | rgv2n98	None	1	10001	10001	R	?	None	1	22120	22120	R	?	None	True	False	bed	
14 | dgv2n99	None	1	10001	10501	R	?	None	1	15000	15000	R	?	None	True	False	bed	
15 | rgv2n99	None	1	10001	10001	L	?	None	1	22222	22222	L	?	None	True	False	bed	
16 | rgv2n99	None	1	10001	10001	L	?	None	1	22222	22222	R	?	None	False	False	bed	
17 | rgv2n99	None	1	10001	10001	R	?	None	1	22222	22222	L	?	None	False	False	bed	
18 | rgv2n99	None	1	10001	10001	R	?	None	1	22222	22222	R	?	None	True	False	bed	
19 | nsv7879	None	1	10001	10001	L	?	None	1	127330	127330	L	?	None	True	False	bed	
20 | nsv7879	None	1	10001	10001	L	?	None	1	127330	127330	R	?	None	False	False	bed	
21 | nsv7879	None	1	10001	10001	R	?	None	1	127330	127330	L	?	None	False	False	bed	
22 | nsv7879	None	1	10001	10001	R	?	None	1	127330	127330	R	?	None	True	False	bed	
23 | nsv958854	None	1	10191	10191	L	?	None	1	10281	10281	L	?	None	True	False	bed	
24 | nsv958854	None	1	10191	10191	L	?	None	1	10281	10281	R	?	None	False	False	bed	
25 | nsv958854	None	1	10191	10191	R	?	None	1	10281	10281	L	?	None	False	False	bed	
26 | nsv958854	None	1	10191	10191	R	?	None	1	10281	10281	R	?	None	True	False	bed	
27 | nsv428112	None	1	10377	10377	L	?	None	1	177417	177417	L	?	None	True	False	bed	
28 | nsv428112	None	1	10377	10377	L	?	None	1	177417	177417	R	?	None	False	False	bed	
29 | nsv428112	None	1	10377	10377	R	?	None	1	177417	177417	L	?	None	False	False	bed	
30 | nsv428112	None	1	10377	10377	R	?	None	1	177417	177417	R	?	None	True	False	bed	
31 | esv2758911	None	1	10377	10377	L	?	None	1	1018704	1018704	L	?	None	True	False	bed	
32 | esv2758911	None	1	10377	10377	L	?	None	1	1018704	1018704	R	?	None	False	False	bed	
33 | esv2758911	None	1	10377	10377	R	?	None	1	1018704	1018704	L	?	None	False	False	bed	
34 | esv2758911	None	1	10377	10377	R	?	None	1	1018704	1018704	R	?	None	True	False	bed	
35 | esv27265	None	1	10499	10499	L	?	None	1	177368	177368	L	?	None	True	False	bed	
36 | esv27265	None	1	10499	10499	L	?	None	1	177368	177368	R	?	None	False	False	bed	
37 | esv27265	None	1	10499	10499	R	?	None	1	177368	177368	L	?	None	False	False	bed	
38 | esv27265	None	1	10499	10499	R	?	None	1	177368	177368	R	?	None	True	False	bed	
39 | nsv1147468	None	1	11099	11099	L	?	None	1	47000	47000	L	?	None	True	False	bed	
40 | nsv1147468	None	1	11099	11099	L	?	None	1	47000	47000	R	?	None	False	False	bed	
41 | nsv1147468	None	1	11099	11099	R	?	None	1	47000	47000	L	?	None	False	False	bed	
42 | nsv1147468	None	1	11099	11099	R	?	None	1	47000	47000	R	?	None	True	False	bed	
43 | dgv1n106	None	1	11100	11100	L	?	None	1	29200	29200	L	?	None	True	False	bed	
44 | dgv1n106	None	1	11100	11100	L	?	None	1	29200	29200	R	?	None	False	False	bed	
45 | dgv1n106	None	1	11100	11100	R	?	None	1	29200	29200	L	?	None	False	False	bed	
46 | dgv1n106	None	1	11100	11100	R	?	None	1	29200	29200	R	?	None	True	False	bed	
47 | 


--------------------------------------------------------------------------------
/src/tools/find_repeats.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Script used in finding potential masking regions within a genome
  3 | """
  4 | import argparse
  5 | import os
  6 | 
  7 | from mavis.annotate.base import BioInterval
  8 | from mavis.annotate.file_io import load_reference_genome
  9 | from mavis.util import log
 10 | 
 11 | 
 12 | def parse_arguments():
 13 |     """
 14 |     parse command line arguments
 15 |     """
 16 |     parser = argparse.ArgumentParser()
 17 |     parser.add_argument(
 18 |         '-o', '--output', help='path to the output file', required=True, metavar='FILEPATH'
 19 |     )
 20 |     parser.add_argument(
 21 |         '-n',
 22 |         '--input',
 23 |         required=True,
 24 |         metavar='FILEPATH',
 25 |         help='Path to the Input reference genome fasta file',
 26 |     )
 27 |     parser.add_argument(
 28 |         '--min_length',
 29 |         default=20,
 30 |         type=int,
 31 |         help='Minimum total length of the repeat region to find',
 32 |         metavar='INT',
 33 |     )
 34 |     parser.add_argument(
 35 |         '--repeat_seq',
 36 |         default='N',
 37 |         type=str,
 38 |         help='Repeat sequence to look for. Case insensitive',
 39 |         nargs='+',
 40 |     )
 41 |     args = parser.parse_args()
 42 |     if args.min_length < 2:
 43 |         parser.error('argument --min_length: cannot specify a shorter repeat than 2 bases')
 44 |     if not os.path.exists(args.input):
 45 |         parser.error('argument --input: File does not exist')
 46 |     return args
 47 | 
 48 | 
 49 | def main():
 50 |     args = parse_arguments()
 51 |     repeat_sequences = sorted(list(set([s.lower() for s in args.repeat_seq])))
 52 |     log('loading:', args.input)
 53 |     reference_genome = load_reference_genome(args.input)
 54 |     comments = [
 55 |         os.path.basename(__file__),
 56 |         'input: {}'.format(args.input),
 57 |         'min_length: {}'.format(args.min_length),
 58 |         'repeat_seq: {}'.format(', '.join(args.repeat_seq)),
 59 |     ]
 60 |     log('writing:', args.output)
 61 |     with open(args.output, 'w') as fh:
 62 |         for comment in comments:
 63 |             fh.write('## {}\n'.format(comment))
 64 |         fh.write('chr\tstart\tend\tname\n')
 65 |         visited = set()
 66 |         for chrom, seq in sorted(reference_genome.items()):
 67 |             if chrom.startswith('chr'):
 68 |                 chrom = chrom[3:]
 69 |             seq = str(seq.seq).lower()
 70 |             if seq in visited:
 71 |                 continue
 72 |             else:
 73 |                 visited.add(seq)
 74 |             spans = []
 75 |             for repseq in repeat_sequences:
 76 |                 log(
 77 |                     'finding {}_repeat (min_length: {}), for chr{} (length: {})'.format(
 78 |                         repseq, args.min_length, chrom, len(seq)
 79 |                     )
 80 |                 )
 81 |                 index = 0
 82 |                 while index < len(seq):
 83 |                     next_n = seq.find(repseq, index)
 84 |                     if next_n < 0:
 85 |                         break
 86 |                     index = next_n
 87 |                     while (
 88 |                         index + len(repseq) <= len(seq)
 89 |                         and seq[index : index + len(repseq)] == repseq
 90 |                     ):
 91 |                         index += len(repseq)
 92 |                     span = BioInterval(chrom, next_n + 1, index, name='repeat_{}'.format(repseq))
 93 |                     if len(span) >= args.min_length and len(span) >= 2 * len(repseq):
 94 |                         spans.append(span)
 95 |             log('found', len(spans), 'spans', time_stamp=False)
 96 |             for span in spans:
 97 |                 fh.write(
 98 |                     '{}\t{}\t{}\t{}\n'.format(
 99 |                         span.reference_object, span.start, span.end, span.name
100 |                     )
101 |                 )
102 | 
103 | 
# run the repeat search only when this file is executed as a script
if __name__ == '__main__':
    main()
106 | 


--------------------------------------------------------------------------------
/tests/data/mock_sv_events.tsv:
--------------------------------------------------------------------------------
 1 | ## False	reference9	2000	2000	reference9	2001	2001	L	R	+	+	insertion	genome	convert_ta.py_v0.0.1	mock-A36971	9:66466004
 2 | stranded	break1_chromosome	break1_position_start	break1_position_end	break2_chromosome	break2_position_start	break2_position_end	break1_orientation	break2_orientation	break1_strand	break2_strand	event_type	protocol	tools	library	comment
 3 | False	reference7	5000	5000	reference7	11000	11000	R	L	-	-	duplication	genome	convert_ta.py_v0.0.1	mock-A36971	7:104485067|7:104612302
 4 | False	reference20	2000	2000	reference20	6000	6000	L	R	+	+	deletion	genome	convert_ta.py_v0.0.1	mock-A36971	20:13160730|20:13164100
 5 | False	reference10	520	520	reference19	964	964	R	L	+	+	translocation	genome	convert_ta.py_v0.0.1	mock-A36971	10:7059511|19:17396811
 6 | False	referenceX	2000	2000	referenceX	6000	6000	L	R	+	+	deletion	genome	convert_ta.py_v0.0.1	mock-A36971	X:32098533|32201251
 7 | False	reference3	1115	1115	reference3	2188	2188	R	R	+	-	inversion	genome	convert_ta.py_v0.0.1	mock-A36971	3:24565106|24566179
 8 | False	referenceX	10000	10000	referenceX	14000	14000	L	R	-	-	deletion	genome	convert_ta.py_v0.0.1	mock-A36971	X:31301203|32038750
 9 | False	reference2	2000	2000	reference4	2000	2000	L	R	-	-	translocation	genome	convert_ta.py_v0.0.1	mock-A36971	2:42052609|4:66413931
10 | False	reference7	15000	15000	reference7	19000	19000	R	R	+	-	inversion	genome	convert_ta.py_v0.0.1	mock-A36971	7:126098488|126167441
11 | False	reference19	4827	4847	reference19	5219	5219	L	R	+	+	deletion	genome	DELLY_v0.6.1	mock-A36971	19:31954787-31955407|19:31955423-31956043
12 | False	reference11	6000	6000	reference11	6003	6003	L	R	+	+	deletion	genome	convert_ta.py_v0.0.1	mock-A36971	11:121214|11:121216
13 | False	reference11	10000	10000	reference11	10030	10030	L	R	+	+	deletion	genome	convert_ta.py_v0.0.1	mock-A36971	11:1651586|11:1651615
14 | False	reference12	2001	2001	reference12	2120	2120	L	R	+	+	deletion	genome	convert_ta.py_v0.0.1	mock-A36971	12:14945389|12:14945509
15 | False	reference10	3609	3609	reference10	3818	3818	L	R	+	+	deletion	genome	convert_ta.py_v0.0.1	mock-A36971	10:7793830|10:7794039
16 | False	reference10	8609	8609	reference10	8927	8927	L	R	+	+	deletion	genome	convert_ta.py_v0.0.1	mock-A36971	10:100025136|10:100025454
17 | False	reference10	12609	12609	reference10	13123	13123	L	R	+	+	deletion	genome	convert_ta.py_v0.0.1	mock-A36971	10:18503076|10:18503590
18 | False	reference10	17109	17109	reference10	17899	17899	L	R	+	+	deletion	genome	convert_ta.py_v0.0.1	mock-A36971	10:127745195|10:127745985
19 | False	reference10	22109	22109	reference10	24330	24330	L	R	+	+	deletion	genome	convert_ta.py_v0.0.1	mock-A36971	10:108030321|10:108032542
20 | False	reference10	28109	28109	reference10	31827	31827	L	R	+	+	deletion	genome	convert_ta.py_v0.0.1	mock-A36971	10:132909062|10:132912780
21 | False	reference10	36109	36109	reference10	42159	42159	L	R	+	+	deletion	genome	convert_ta.py_v0.0.1	mock-A36971	10:6411580|10:6417630
22 | False	reference12	6001	6001	reference12	6016	6016	L	R	+	+	deletion	genome	convert_ta.py_v0.0.1	mock-A36971	12:127413217|12:127413233 complex event
23 | False	reference1	2000	2000	reference1	2001	2001	L	R	+	+	insertion	genome	convert_ta.py_v0.0.1	mock-A36971	1:8877520
24 | False	reference16	2000	2000	reference16	2001	2001	L	R	+	+	insertion	genome	convert_ta.py_v0.0.1	mock-A36971	16:57847634
25 | False	reference12	10000	10000	reference12	10001	10021	R	L	+	+	duplication	genome	convert_ta.py_v0.0.1	mock-A36971	12:53207583 reported as an insertion
26 | False	reference17	1974	1974	reference17	2020	2020	R	L	+	+	duplication	genome	convert_ta.py_v0.0.1	mock-A36971	17:72889676 reported as an insertion
27 | False	gene3	27175	27175	gene3	27176	27176	R	L	+	+	duplication	genome	convert_ta.py_v0.0.1	mock-A36971	1:207249992
28 | False	gene5	608	608	gene1	33309	33309	R	R	+	-	inverted translocation	genome	convert_ta.py_v0.0.1	mock-A36971	7:26252971|15:40854190
29 | False	gene2	19827	19827	gene2	27045	27045	R	L	+	+	duplication	genome	convert_ta.py_v0.0.1	mock-A36971	15:41621292|15:41628510
30 | False	gene6	77430	77430	gene6	89472	89472	L	R	+	+	deletion	genome	convert_ta.py_v0.0.1	mock-A36971	10:89700299|10:89712341
31 | 


--------------------------------------------------------------------------------
/tests/data/build.cfg:
--------------------------------------------------------------------------------
  1 | [general]
  2 | batch_id = batch-aMfNsjq7NgyaJFfhU9ZHQS
  3 | output_dir = /var/tmp/tmpfojhl9g1
  4 | scheduler = SLURM
  5 | concurrency_limit = None
  6 | 
  7 | [MS_batch-aMfNsjq7NgyaJFfhU9ZHQS]
  8 | stage = summary
  9 | job_ident = None
 10 | name = MS_batch-aMfNsjq7NgyaJFfhU9ZHQS
 11 | dependencies = MP_batch-aMfNsjq7NgyaJFfhU9ZHQS
 12 | script = /var/tmp/tmpfojhl9g1/summary/submit.sh
 13 | status = UNKNOWN
 14 | output_dir = /var/tmp/tmpfojhl9g1/summary
 15 | stdout = /var/tmp/tmpfojhl9g1/summary/job-{name}-{job_ident}.log
 16 | memory_limit = 16000
 17 | queue =
 18 | time_limit = 57600
 19 | import_env = True
 20 | mail_user =
 21 | mail_type = NONE
 22 | 
 23 | [MP_batch-aMfNsjq7NgyaJFfhU9ZHQS]
 24 | stage = pairing
 25 | job_ident = None
 26 | name = MP_batch-aMfNsjq7NgyaJFfhU9ZHQS
 27 | dependencies = MA_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS
 28 | 	MA_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS
 29 | script = /var/tmp/tmpfojhl9g1/pairing/submit.sh
 30 | status = UNKNOWN
 31 | output_dir = /var/tmp/tmpfojhl9g1/pairing
 32 | stdout = /var/tmp/tmpfojhl9g1/pairing/job-{name}-{job_ident}.log
 33 | memory_limit = 16000
 34 | queue =
 35 | time_limit = 57600
 36 | import_env = True
 37 | mail_user =
 38 | mail_type = NONE
 39 | 
 40 | [MV_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS]
 41 | stage = validate
 42 | job_ident = None
 43 | name = MV_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS
 44 | dependencies =
 45 | script = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/validate/submit.sh
 46 | status = UNKNOWN
 47 | output_dir = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/validate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID
 48 | stdout = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/validate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID/job-{name}-{job_ident}_{task_ident}.log
 49 | memory_limit = 16000
 50 | queue =
 51 | time_limit = 57600
 52 | import_env = True
 53 | mail_user =
 54 | mail_type = NONE
 55 | task_list = 1
 56 | 
 57 | [MV_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS]
 58 | stage = validate
 59 | job_ident = None
 60 | name = MV_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS
 61 | dependencies =
 62 | script = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/validate/submit.sh
 63 | status = UNKNOWN
 64 | output_dir = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/validate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID
 65 | stdout = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/validate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID/job-{name}-{job_ident}_{task_ident}.log
 66 | memory_limit = 18000
 67 | queue =
 68 | time_limit = 57600
 69 | import_env = True
 70 | mail_user =
 71 | mail_type = NONE
 72 | task_list = 1
 73 | 
 74 | [MA_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS]
 75 | stage = annotate
 76 | job_ident = None
 77 | name = MA_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS
 78 | dependencies = MV_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS
 79 | script = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/annotate/submit.sh
 80 | status = UNKNOWN
 81 | output_dir = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/annotate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID
 82 | stdout = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/annotate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID/job-{name}-{job_ident}_{task_ident}.log
 83 | memory_limit = 12000
 84 | queue =
 85 | time_limit = 57600
 86 | import_env = True
 87 | mail_user =
 88 | mail_type = NONE
 89 | task_list = 1
 90 | 
 91 | [MA_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS]
 92 | stage = annotate
 93 | job_ident = None
 94 | name = MA_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS
 95 | dependencies = MV_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS
 96 | script = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/annotate/submit.sh
 97 | status = UNKNOWN
 98 | output_dir = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/annotate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID
 99 | stdout = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/annotate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID/job-{name}-{job_ident}_{task_ident}.log
100 | memory_limit = 12000
101 | queue =
102 | time_limit = 57600
103 | import_env = True
104 | mail_user =
105 | mail_type = NONE
106 | task_list = 1
107 | 
108 | 


--------------------------------------------------------------------------------
/.github/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | ## Getting Started
  2 | 
  3 | If you are new to the project a good way to get started is by adding to the documentation, or adding unit tests where
  4 | there is a lack of code coverage.
  5 | 
  6 | ## Install (for Development)
  7 | 
  8 | Clone the repository and switch to the development branch
  9 | 
 10 | ```bash
 11 | git clone https://github.com/bcgsc/mavis.git
 12 | cd mavis
 13 | git checkout develop
 14 | ```
 15 | 
 16 | Set up a python virtual environment. If you are developing in python setting up with a virtual environment can be
 17 | incredibly helpful as it allows for a clean install to test. Instructions for setting up the environment
 18 | are below
 19 | 
 20 | ```bash
 21 | python3 -m venv venv
 22 | source venv/bin/activate
 23 | ```
 24 | 
 25 | Install the MAVIS python package. Running the setup in develop mode will ensure that your code changes are run when you
 26 | run MAVIS from within that virtual environment
 27 | 
 28 | ```bash
 29 | pip install -e .[dev]
 30 | ```
 31 | 
 32 | Run the tests and compute code coverage
 33 | 
 34 | ```bash
 35 | pytest tests
 36 | ```
 37 | 
 38 | ## Build the Documentation
 39 | 
 40 | ```bash
 41 | pip install .[docs]
 42 | markdown_refdocs mavis -o docs/package --link
 43 | mkdocs build
 44 | ```
 45 | 
 46 | The contents of the user manual can then be viewed by opening build-docs/index.html
 47 | in any available web browser (e.g. google-chrome, firefox, etc.)
 48 | 
 49 | ## Deploy to PyPi
 50 | 
 51 | Install deployment dependencies
 52 | 
 53 | ```bash
 54 | pip install .[deploy]
 55 | ```
 56 | 
 57 | Build the distribution files
 58 | 
 59 | ```bash
 60 | python setup.py install sdist bdist_wheel
 61 | ```
 62 | 
 63 | Use twine to upload
 64 | 
 65 | ```bash
 66 | twine upload -r pypi dist/*
 67 | ```
 68 | 
 69 | ## Reporting a Bug
 70 | 
 71 | Please make sure to search through the issues before reporting a bug to ensure there isn't
 72 | already an open issue.
 73 | 
 74 | ## Conventions
 75 | 
 76 | ### Linting
 77 | 
 78 | Use [black](https://github.com/psf/black) with string normalization skipped (`-S`) and a line length of 100 (`-l 100`)
 79 | 
 80 | ```bash
 81 | black src/mavis -S -l 100
 82 | ```
 83 | 
 84 | ### Docstrings
 85 | 
 86 | docstrings should follow [sphinx google code style](http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html)
 87 | 
 88 | if you want to be more explicit with nested types, please follow the same format
 89 | used by [python type annotations](https://docs.python.org/3/library/typing.html)
 90 | 
 91 | ```text
 92 | arg1 (List[str]): a list of strings
 93 | ```
 94 | 
 95 | However using proper type annotations is preferred for new code and then only including the
 96 | description of the parameter in the docstring and not its type
 97 | 
 98 | ```python
 99 | 
100 | def some_function(some_arg: List[str]) -> None:
101 |     """
102 |     Args:
103 |         some_arg: this arg does stuff
104 |     """
105 | ```
106 | 
107 | ### Output Columns
108 | 
109 | any column name which may appear in any of the intermediate or final output files must be defined in `mavis.constants.COLUMNS` as well as added to the [columns glossary](../outputs/columns)
110 | 
111 | ### Tests
112 | 
113 | - all new code must have unit tests in the tests subdirectory
114 | 
115 | Tests can be run as follows
116 | 
117 | ```bash
118 | pytest tests
119 | ```
120 | 
121 | ### Branching Model
122 | 
123 | If you are working on a large feature, create a base branch for the feature off develop. Generally
124 | these follow the naming pattern
125 | 
126 | ```bash
127 | git checkout -b integration/issue-<number>-<short-name>
128 | ```
129 | 
130 | If you are working on a smaller feature then simply make a feature branch off develop
131 | 
132 | ```bash
133 | git checkout -b feature/issue-<number>-<short-name>
134 | ```
135 | 
136 | Once ready, a PR should be made to develop and review should be requested from the other developers.
137 | 
138 | Releases are done by creating a release branch off develop
139 | 
140 | ```bash
141 | git checkout -b release/vX.X.X
142 | ```
143 | 
Update the version number in setup.py in the release branch, and then make a PR to master.
After the PR has been merged to master, a tag/release should be created with the release notes
and a PR to merge master back into develop should be made.
147 | 


--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
  1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
  2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
  3 | 
  4 | name: build
  5 | 
  6 | on:
  7 |   push:
  8 |     branches:
  9 |       - master
 10 |       - develop
 11 |   pull_request:
 12 | 
 13 | jobs:
  # Runs the full pytest suite once per Python version listed in the matrix and
  # publishes the junit results (every matrix entry) and coverage (3.8 only).
  build:
    runs-on: ubuntu-20.04
    strategy:
      matrix:
        # versions are quoted so that YAML keeps them as strings (an unquoted 3.10 would be read as 3.1)
        python-version: ["3.7", "3.8", "3.9", "3.10"]
    name: python-${{ matrix.python-version }}
    steps:
    - uses: actions/checkout@v2
    - name: install machine dependencies
      run: |
        sudo apt-get update
        sudo apt-get install -y libcurl4-openssl-dev
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip setuptools
        pip install -e .[test]  # need editable to make sure the coverage reports correctly
    # bwa is built from source at the v0.7.17 tag inside ./bwa
    - name: install bwa
      run: |
        git clone https://github.com/lh3/bwa.git
        cd bwa
        git checkout v0.7.17
        make
        cd ..
    # blat is downloaded as a prebuilt linux x86_64 binary into the working directory
    - name: install blat
      run: |
        wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat
        chmod a+x blat
    - name: set up .pth file
      run: |
        python tests/setup_subprocess_cov.py
    # the working directory (blat) and ./bwa (bwa) are appended to PATH before pytest is started;
    # COVERAGE_PROCESS_START points at the repository .coveragerc
    - name: run full tests with pytest
      run: |
        export PATH=$PATH:$(pwd):$(pwd)/bwa
        export COVERAGE_PROCESS_START=$(pwd)/.coveragerc

        pytest tests -v \
          --junitxml=junit/test-results-${{ matrix.python-version }}.xml \
          --cov mavis \
          --cov tools.convert_annotations_format \
          --cov-report term-missing \
          --cov-report xml \
          --durations=10 \
          --cov-branch
      env:
        RUN_FULL: 1
    # NOTE(review): this action is referenced by branch (@master) rather than by a version tag
    # like the other actions in this workflow - confirm that is intended
    - name: Upload pytest test results
      uses: actions/upload-artifact@master
      with:
        name: pytest-results-${{ matrix.python-version }}
        path: junit/test-results-${{ matrix.python-version }}.xml
        # Use always() to always run this step to publish test results when there are test failures
      if: always()
    # coverage is only uploaded from the python 3.8 matrix entry
    - name: Update code coverage report to CodeCov
      uses: codecov/codecov-action@v1
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        file: ./coverage.xml
        flags: unittests
        env_vars: OS,PYTHON
        name: codecov-umbrella
        fail_ci_if_error: true
      if: matrix.python-version == 3.8
  # Builds the docker image, checks that its help menu runs, converts the image for
  # singularity and then runs the mini tutorial config through snakemake with --use-singularity.
  docker:
    runs-on: ubuntu-latest
    name: docker build
    steps:
      - uses: actions/checkout@v2
      - name: build the docker container
        run: |
          docker build --file Dockerfile --tag bcgsc/mavis:latest .
      - name: test the help menu
        run: |
          docker run bcgsc/mavis -h
      - name: Set up Python 3.7
        uses: actions/setup-python@v2
        with:
          python-version: 3.7
      - name: Install workflow dependencies
        run: |
          python -m pip install --upgrade pip setuptools wheel
          pip install mavis_config pandas
      - uses: eWaterCycle/setup-singularity@v6
        with:
          singularity-version: 3.6.4
      # the docker socket and the working directory (as /output) are bind-mounted into the
      # converter container; presumably the resulting .simg file is written to /output,
      # which is where the next step looks for it - TODO confirm
      - name: docker2singularity
        run:
          docker run --mount type=bind,source=/var/run/docker.sock,target=/var/run/docker.sock --mount type=bind,source="$(pwd)",target=/output --privileged -t --rm singularityware/docker2singularity bcgsc/mavis:latest
      # always() means this step is attempted even when an earlier step in the job has failed
      - name: Run analysis with snakemake & singularity
        run: |
          # get the SIMG filename
          export SNAKEMAKE_CONTAINER=$(ls *mavis*.simg)
          snakemake -j 2 --configfile tests/mini-tutorial.config.json --use-singularity
        if: always()
111 | 


--------------------------------------------------------------------------------
/tests/test_tools/test_ref_alt_count.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import shutil
  3 | import tempfile
  4 | 
  5 | import pytest
  6 | 
  7 | from mavis.annotate.file_io import load_reference_genome
  8 | from mavis.breakpoint import Breakpoint, BreakpointPair
  9 | from mavis.constants import ORIENT, SVTYPE
 10 | from tools.calculate_ref_alt_counts import RefAltCalculator
 11 | 
 12 | from ..util import get_data, glob_exists
 13 | 
 14 | 
 15 | def setUpModule():
 16 |     global REFERENCE_GENOME
 17 |     REFERENCE_GENOME = load_reference_genome(get_data('mock_reference_genome.fa'))
 18 |     if (
 19 |         'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT'
 20 |         != REFERENCE_GENOME['fake'].seq[0:50].upper()
 21 |     ):
 22 |         raise AssertionError('fake genome file does not have the expected contents')
 23 | 
 24 | 
 25 | def print_file_tree(dirname):
 26 |     for root, dirs, files in os.walk(dirname):
 27 |         level = root.replace(dirname, '').count(os.sep)
 28 |         indent = ' ' * 4 * (level)
 29 |         print('{}{}/'.format(indent, os.path.basename(root)))
 30 |         subindent = ' ' * 4 * (level + 1)
 31 |         for f in files:
 32 |             print('{}{}'.format(subindent, f))
 33 | 
 34 | 
 35 | @pytest.fixture
 36 | def calculator():
 37 |     return RefAltCalculator(
 38 |         [("TEST", get_data('mock_reads_for_events.sorted.bam'))],
 39 |         REFERENCE_GENOME,
 40 |         max_event_size=100,
 41 |         buffer=20,
 42 |     )
 43 | 
 44 | 
 45 | @pytest.fixture
 46 | def temp_output():
 47 |     d = tempfile.mkdtemp()
 48 |     yield d
 49 |     shutil.rmtree(d)
 50 | 
 51 | 
 52 | class TestFullCalculator:
 53 |     def test_calculate_all_counts(self, calculator, temp_output):
 54 |         calculator.calculate_all_counts(
 55 |             [get_data("mavis_summary_all_mock-A36971_mock-A47933.tab")],
 56 |             os.path.join(temp_output, "ref_alt_output.tab"),
 57 |         )
 58 |         assert glob_exists(temp_output, "ref_alt_output.tab")
 59 | 
 60 | 
 61 | class TestRefAltCalulator:
 62 |     def test_calculate_count(self, calculator):
 63 |         ev1 = BreakpointPair(
 64 |             Breakpoint('reference11', 5999, orient=ORIENT.LEFT),
 65 |             Breakpoint('reference11', 6003, orient=ORIENT.RIGHT),
 66 |             opposing_strands=False,
 67 |             event_type=SVTYPE.DEL,
 68 |         )
 69 |         bpp = calculator.calculate_ref_counts(ev1)
 70 |         print(bpp.data)
 71 |         assert bpp.data["TEST_ref_count"] == 27
 72 |         assert bpp.data["TEST_alt_count"] == 14
 73 |         assert bpp.data['TEST_ignored_count'] == 188
 74 | 
 75 |     def test_calculate_count2(self, calculator):
 76 |         ev1 = BreakpointPair(
 77 |             Breakpoint('reference11', 9999, orient=ORIENT.LEFT),
 78 |             Breakpoint('reference11', 10030, orient=ORIENT.RIGHT),
 79 |             opposing_strands=False,
 80 |             event_type=SVTYPE.DEL,
 81 |         )
 82 |         bpp = calculator.calculate_ref_counts(ev1)
 83 |         print(bpp.data)
 84 |         assert bpp.data["TEST_ref_count"] == 0
 85 |         assert bpp.data["TEST_alt_count"] == 63
 86 |         assert bpp.data['TEST_ignored_count'] == 195
 87 | 
 88 |     def test_calculate_count3(self, calculator):
 89 |         ev1 = BreakpointPair(
 90 |             Breakpoint('reference1', 2002, orient=ORIENT.LEFT),
 91 |             Breakpoint('reference1', 2003, orient=ORIENT.RIGHT),
 92 |             opposing_strands=False,
 93 |             event_type=SVTYPE.INS,
 94 |             untemplated_seq='TT',
 95 |         )
 96 |         bpp = calculator.calculate_ref_counts(ev1)
 97 |         print(bpp.data)
 98 |         assert bpp.data["TEST_ref_count"] == 0
 99 |         assert bpp.data["TEST_alt_count"] == 23
100 |         assert bpp.data['TEST_ignored_count'] == 145
101 | 
102 |     def test_calculate_count4(self, calculator):
103 |         ev1 = BreakpointPair(
104 |             Breakpoint('reference11', 1999, orient=ORIENT.LEFT),
105 |             Breakpoint('reference11', 2001, orient=ORIENT.RIGHT),
106 |             opposing_strands=False,
107 |             event_type=SVTYPE.DEL,
108 |         )
109 |         bpp = calculator.calculate_ref_counts(ev1)
110 |         print(bpp.data)
111 |         assert bpp.data["TEST_ref_count"] == 0
112 |         assert bpp.data["TEST_alt_count"] == 50
113 |         assert bpp.data['TEST_ignored_count'] == 191
114 | 


--------------------------------------------------------------------------------
/tests/data/mock_reference_annotations2.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "genes": [
  3 |         {
  4 |             "aliases": [
  5 |             ],
  6 |             "chr": "fake",
  7 |             "end": 200,
  8 |             "name": "GENE-A",
  9 |             "start": 100,
 10 |             "strand": "+",
 11 |             "transcripts": [
 12 |                 {
 13 |                     "aliases": [
 14 |                     ],
 15 |                     "cdna_coding_end": null,
 16 |                     "cdna_coding_start": null,
 17 |                     "domains": [
 18 |                     ],
 19 |                     "end": 200,
 20 |                     "exons": [
 21 |                     ],
 22 |                     "is_best_transcript": true,
 23 |                     "name": "TRANSCRIPT-A",
 24 |                     "start": 100
 25 |                 }
 26 |             ]
 27 |         },
 28 |         {
 29 |             "aliases": [
 30 |             ],
 31 |             "chr": "fake",
 32 |             "end": 350,
 33 |             "name": "GENE-B",
 34 |             "start": 250,
 35 |             "strand": "-",
 36 |             "transcripts": [
 37 |                 {
 38 |                     "aliases": [
 39 |                     ],
 40 |                     "cdna_coding_end": null,
 41 |                     "cdna_coding_start": null,
 42 |                     "domains": [
 43 |                     ],
 44 |                     "end": 350,
 45 |                     "exons": [
 46 |                     ],
 47 |                     "is_best_transcript": true,
 48 |                     "name": "TRANSCRIPT-B",
 49 |                     "start": 250
 50 |                 }
 51 |             ]
 52 |         },
 53 |         {
 54 |             "aliases": [
 55 |             ],
 56 |             "chr": "fake",
 57 |             "end": 400,
 58 |             "name": "GENE-C",
 59 |             "start": 300,
 60 |             "strand": "+",
 61 |             "transcripts": [
 62 |                 {
 63 |                     "aliases": [
 64 |                     ],
 65 |                     "cdna_coding_end": null,
 66 |                     "cdna_coding_start": null,
 67 |                     "domains": [
 68 |                     ],
 69 |                     "end": 400,
 70 |                     "exons": [
 71 |                     ],
 72 |                     "is_best_transcript": true,
 73 |                     "name": "TRANSCRIPT-C",
 74 |                     "start": 300
 75 |                 }
 76 |             ]
 77 |         },
 78 |         {
 79 |             "aliases": [
 80 |             ],
 81 |             "chr": "fake",
 82 |             "end": 550,
 83 |             "name": "GENE-D",
 84 |             "start": 450,
 85 |             "strand": "-",
 86 |             "transcripts": [
 87 |                 {
 88 |                     "aliases": [
 89 |                     ],
 90 |                     "cdna_coding_end": null,
 91 |                     "cdna_coding_start": null,
 92 |                     "domains": [
 93 |                     ],
 94 |                     "end": 550,
 95 |                     "exons": [
 96 |                     ],
 97 |                     "is_best_transcript": true,
 98 |                     "name": "TRANSCRIPT-D",
 99 |                     "start": 450
100 |                 }
101 |             ]
102 |         },
103 |         {
104 |             "aliases": [
105 |             ],
106 |             "chr": "fake",
107 |             "end": 600,
108 |             "name": "GENE-E",
109 |             "start": 500,
110 |             "strand": "+",
111 |             "transcripts": [
112 |                 {
113 |                     "aliases": [
114 |                     ],
115 |                     "cdna_coding_end": null,
116 |                     "cdna_coding_start": null,
117 |                     "domains": [
118 |                     ],
119 |                     "end": 600,
120 |                     "exons": [
121 |                     ],
122 |                     "is_best_transcript": true,
123 |                     "name": "TRANSCRIPT-E",
124 |                     "start": 500
125 |                 }
126 |             ]
127 |         },
128 |         {
129 |             "aliases": [
130 |             ],
131 |             "chr": "fake",
132 |             "end": 650,
133 |             "name": "GENE-F",
134 |             "start": 550,
135 |             "strand": "+",
136 |             "transcripts": [
137 |                 {
138 |                     "aliases": [
139 |                     ],
140 |                     "cdna_coding_end": null,
141 |                     "cdna_coding_start": null,
142 |                     "domains": [
143 |                     ],
144 |                     "end": 650,
145 |                     "exons": [
146 |                     ],
147 |                     "is_best_transcript": true,
148 |                     "name": "TRANSCRIPT-F",
149 |                     "start": 550
150 |                 }
151 |             ]
152 |         }
153 |     ]
154 | }
155 | 


--------------------------------------------------------------------------------
/docs/hooks.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | from textwrap import dedent
  4 | 
  5 | import pkg_resources
  6 | from markdown_refdocs.main import extract_to_markdown
  7 | from mavis_config import DEFAULTS
  8 | from mavis.util import ENV_VAR_PREFIX
  9 | 
 10 | 
 11 | def json_to_pytype(record):
 12 |     input_type = record
 13 |     try:
 14 |         input_type = record['type']
 15 |     except TypeError:
 16 |         pass
 17 |     types = {
 18 |         'string': 'str',
 19 |         'integer': 'int',
 20 |         'float': 'float',
 21 |         'boolean': 'bool',
 22 |         'number': 'float',
 23 |     }
 24 | 
 25 |     if input_type == 'array':
 26 |         try:
 27 |             sub_type = json_to_pytype(record['items']['type'])
 28 |             return f'List[{sub_type}]'
 29 |         except TypeError:
 30 |             return 'List'
 31 | 
 32 |     if isinstance(input_type, list):
 33 |         # Union
 34 |         types = ', '.join([json_to_pytype(t) for t in input_type])
 35 |         return f'Union[{types}]'
 36 |     return types.get(input_type, input_type)
 37 | 
 38 | 
 39 | def list_properties(schema, skip_terms=tuple()):
 40 |     glossary = {}
 41 |     for term, defn in schema['properties'].items():
 42 |         if term in skip_terms:
 43 |             continue
 44 |         typ = json_to_pytype(defn)
 45 |         desc = defn.get('description', '')
 46 |         default_value = defn.get('default')
 47 |         schema_fields = {k: v for k, v in defn.items() if k not in ['description', 'default']}
 48 | 
 49 |         if len(schema_fields) > 1:
 50 |             schema_defn = json.dumps(
 51 |                 schema_fields,
 52 |                 sort_keys=True,
 53 |                 indent='    ',
 54 |             )
 55 |             schema_defn = f'**schema definition**:\n```json\n{schema_defn}\n```\n'
 56 |         else:
 57 |             schema_defn = ''
 58 | 
 59 |         lines = [
 60 |             f'### {term}',
 61 |             f'**type**: `#!python {typ}`',
 62 |             f'**default**: `#!python {repr(default_value)}`' if default_value is not None else '',
 63 |             desc,
 64 |             schema_defn,
 65 |         ]
 66 |         glossary[term] = '\n\n'.join(lines)
 67 |     return [v for k, v in sorted(glossary.items())]
 68 | 
 69 | 
 70 | def generate_settings_doc(schema_file):
 71 |     with open(schema_file, 'r') as fh:
 72 |         schema = json.load(fh)
 73 |     dirname = os.path.dirname(os.path.abspath(__file__))
 74 |     filepath = 'configuration/settings.md'
 75 |     title = 'Configurable Settings'
 76 | 
 77 |     fname = os.path.join(dirname, filepath)
 78 | 
 79 |     result = [f'\n\n# {title}\n']
 80 |     result.append(
 81 |         dedent(
 82 |             '''\
 83 |             ## Defining Samples/Libraries
 84 | 
 85 |             The `libraries` property of the mavis config is required to run the snakemake
 86 |             workflow. This is the section that defines what inputs to use, and what types of
 87 |             samples are available.
 88 | 
 89 |             ```json
 90 |             {
 91 |                 "libraries": {
 92 |                     "<LIBRARY_NAME>": { }  // mapping of library name to library settings
 93 |                 }
 94 |             }
 95 |             ```
 96 | 
 97 |             The library specific settings are listed below
 98 |             '''
 99 |         )
100 |     )
101 |     result.extend(list_properties(schema['properties']['libraries']['additionalProperties']))
102 |     result.append(
103 |         dedent(
104 |             '''\
105 |             ## Defining Conversions
106 | 
107 |             If the input to MAVIS is raw tool output and has not been pre-converted to the
108 |             standard tab delimited format expected by MAVIS then you will need to add
109 |             a section to the config to tell mavis how to perform the required conversions
110 | 
111 |             ```json
112 |             {
113 |                 "convert": {
114 |                     "<ALIAS>": { }  // mapping of alias to conversion settings
115 |                 }
116 |             }
117 |             ```
118 | 
119 |             The conversion specific settings are listed below
120 |             '''
121 |         )
122 |     )
123 |     result.extend(list_properties(schema['properties']['convert']['additionalProperties']))
124 |     result.append('\n## General Settings\n')
125 |     result.extend(list_properties(schema, ('libraries', 'convert')))
126 | 
127 |     print('writing:', fname)
128 |     with open(fname, 'w') as fh:
129 |         fh.write('\n\n'.join(result) + '\n')
130 | 
131 | 
def build_package_docs(config):
    """
    Generate the settings page and the package reference markdown

    The settings page is built from the config.json schema shipped with mavis_config
    and the package reference is extracted from ../src/mavis into the 'package'
    directory beside this file.

    Args:
        config: not used by this function
    """
    generate_settings_doc(pkg_resources.resource_filename('mavis_config', 'config.json'))

    docs_dir = os.path.dirname(__file__)
    package_dir = os.path.join(docs_dir, '../src/mavis')
    output_dir = os.path.join(docs_dir, 'package')
    extract_options = {
        'link': True,
        'hide_private': True,
        'hide_undoc': True,
        'hide_undoc_args': True,
        'namespace_headers': False,
    }
    extract_to_markdown([package_dir], output_dir, **extract_options)
147 | 


--------------------------------------------------------------------------------
/docs/inputs/standard.md:
--------------------------------------------------------------------------------
 1 | # MAVIS standard input file format
 2 | 
These requirements pertain to the columns of input files from the
various tools you want to merge. The input files should be tab-delimited
text files. Comments may be included at the top of the file. Comments should
begin with hash marks. They will be ignored when the file is read.
 7 | 
 8 | ```text
 9 | ## This is a comment
10 | ```
11 | 
12 | The header row contains the column names and is the first row following
13 | the comments (or the first row if no comments are included).
14 | 
15 | ```text
16 | ## This is a comment
17 | ## this is another comment
18 | # this is also a comment
19 | This    Is  The Header
20 | ```
21 | 
22 | A simple input file might look as follows
23 | 
24 | ```text
25 | ## File created at: 2018-01-02
26 | ## Generated by: MAVIS v1.0.0
27 | break1_chromosome  break1_position_start   break1_position_end break2_chromosome break2_position_start break2_position_end
28 | X   1234    1234    X   77965   77965
29 | ```
30 | 
31 | ## Required Columns
32 | 
33 | - [break1_chromosome](../../outputs/columns/#break1_chromosome)
34 | - [break1_position_start](../../outputs/columns/#break1_position_start)
35 | - [break1_position_end](../../outputs/columns/#break1_position_end) (can be the same as break1\_position\_start)
36 | - [break2_chromosome](../../outputs/columns/#break2_chromosome)
37 | - [break2_position_start](../../outputs/columns/#break2_position_start)
38 | - [break2_position_end](../../outputs/columns/#break2_position_end) (can be the same as break2\_position\_start)
39 | 
40 | ## Optional Columns
41 | 
Optional columns that are not given as input will be added with default values
(or command line parameter options) during the clustering stage of MAVIS,
as some are required for subsequent pipeline steps.
45 | 
46 | - [break1_strand](../../outputs/columns/#break1_strand) (defaults to not-specified during clustering)
47 | - [break1_orientation](../../outputs/columns/#break1_orientation) (expanded to all possible values during clustering)
48 | - [break2_strand](../../outputs/columns/#break2_strand) (defaults to not-specified during clustering)
49 | - [break2_orientation](../../outputs/columns/#break2_orientation) (expanded to all possible values during clustering)
50 | - [opposing_strands](../../outputs/columns/#opposing_strands) (expanded to all possible values during clustering)
51 | - [stranded](../../outputs/columns/#stranded) (defaults to False during clustering)
52 | - [library](../../outputs/columns/#library) (defaults to command line library parameter during clustering)
53 | - [protocol](../../outputs/columns/#protocol) (defaults to command line protocol parameter during clustering)
54 | - [tools](../../outputs/columns/#tools) (defaults to an empty string during clustering)
55 | 
56 | ## Summary by Pipeline Step
57 | 
58 | The different pipeline steps of MAVIS have different input column
59 | requirements. These are summarized below (for the pipeline steps which
60 | can act as the pipeline start)
61 | 
62 | | column name                                                           | cluster | annotate | validate |
63 | | --------------------------------------------------------------------- | ------- | -------- | -------- |
64 | | [break1_chromosome](../../outputs/columns/#break1_chromosome)         | &check; | &check;  | &check;  |
65 | | [break1_position_start](../../outputs/columns/#break1_position_start) | &check; | &check;  | &check;  |
66 | | [break1_position_end](../../outputs/columns/#break1_position_end)     | &check; | &check;  | &check;  |
67 | | [break2_chromosome](../../outputs/columns/#break2_chromosome)         | &check; | &check;  | &check;  |
68 | | [break2_position_start](../../outputs/columns/#break2_position_start) | &check; | &check;  | &check;  |
69 | | [break2_position_end](../../outputs/columns/#break2_position_end)     | &check; | &check;  | &check;  |
70 | | [break1_strand](../../outputs/columns/#break1_strand)                 |         |          |          |
71 | | [break1_orientation](../../outputs/columns/#break1_orientation)       |         | &check;  | &check;  |
72 | | [break2_strand](../../outputs/columns/#break2_strand)                 |         |          |          |
73 | | [break2_orientation](../../outputs/columns/#break2_orientation)       |         | &check;  | &check;  |
74 | | [opposing_strands](../../outputs/columns/#opposing_strands)           |         |          |          |
75 | | [stranded](../../outputs/columns/#stranded)                           |         |          |          |
76 | | [library](../../outputs/columns/#library)                             |         |          |          |
77 | | [protocol](../../outputs/columns/#protocol)                           |         |          |          |
78 | | [tools](../../outputs/columns/#tools)                                 |         |          |          |
79 | | [event_type](../../outputs/columns/#event_type)                       |         |          |          |
80 | 
Some native tool outputs are [supported](../../inputs/support/#sv-callers) and
have built-in methods to convert to the above format. Any unsupported
tools can be used as long as the user converts the tool's native output
to match the above format.
85 | 


--------------------------------------------------------------------------------
/tests/data/transabyss_events.tab:
--------------------------------------------------------------------------------
 1 | id	contig	contig_size	genomic_regions	contig_regions	strands	flanking_pairs	breakpoint_pairs	spanning_reads	spanning_reads_forward	spanning_reads_reverse	rearrangement	breakpoint	size	genes	transcripts	senses	exons/introns	exon_bounds	reciprocal	descriptor	orientations	5'gene	3'gene	5'exon	3'exon	frame	probe	repeat1	repeat2	alignment_params	type	dbsnp	dgv
 2 | 227	893920	186	1:207981139-207981233,1:208014818-208014912	92-186,1-95	-,-	20	0,6	9	6	3	deletion	1:207981233|1:208014818	33584	NA,C1orf132	NA,ENST00000415882	NA,+	NA,intron2	NA,NA	NA	del1q32.2	L,R	NA	NA	NA	NA	NA	atggaaaaaggggaaacaaccttagggcagtcagacttctctatgaattcctCTCTCTGATCTGATGGGAATGCACTAGACTGTGAAACTTCCTCCTCCACC	-	-	TO:0.00,CO:0.02,CC:1.00,I1:100.0,I2:100.0,AF1:0.51,AF2:0.51	LSR	-	NA
 3 | 236	4567117	30066	1:224646603-224662564,1:224786034-224800120	1-15947,15985-30066	-,-	50	0,10	25	12	13	duplication	1:224646603|1:224800120	153516	NA,NA	NA,NA	NA,NA	NA,NA	NA,NA	NA	dup1q42.12	R,L	NA	NA	NA	NA	NA	attttccccttttcttgaaaagttgctgcaaagcgctcccctcctaagttgctagagcagctcacagaactgctatagtaagttttgGAGTACTAAAGGCATAGCTCAGTCTCCTCCTCAAGATTAAGAAATGCCCC	-	L1MEg	TO:0.00,CO:0.00,CC:1.00,I1:99.9,I2:99.9,AF1:0.53,AF2:0.47	LSR	NA	NA
 4 | 35	4556542	53631	10:89659755-89700299,10:89712341-89725438	1-40511,40530-53631	+,+	64	0,24	43	22	21	deletion	10:89700299|10:89712341	12041	PTEN,PTEN	ENST00000371953,ENST00000371953	+,+	intron5,intron6	NA,NA	NA	del10q23.31	L,R	PTEN	PTEN	5	7	in	cagatctgcaaagatcaacctgtcctaagtcatataatctctttgtgtaagagattatactttgtgtaAGAGGTCCACCAGAGGAGTTCAGCAATTTGCTGCTCTTAGGGCAGGGATC	-	TRF_SimpleTandemRepeat_CAGAGGTCCAG	TO:0.00,CO:0.00,CC:1.00,I1:100.0,I2:100.0,AF1:0.76,AF2:0.24	LSR	-	NA
 5 | 28	3113294	240	10:7059511-7059605,19:17396666-17396811	146-240,1-146	+,+	27	0,8	15	9	6	translocation	10:7059511|19:17396811	-	NA,ANKLE1	NA,ENST00000404261	NA,+	NA,intron8	NA,NA	NA	t(10;19)(p14;p13.11)	R,L	NA	NA	NA	NA	NA	gcatgtattttgctccattggtttatccccactcaagggcaatacacatTCAAAGCATAAAAATTACATGACCTATGATATTTATTTTGCTAAGATTTT	-	-	TO:0.00,CO:0.00,CC:1.00,I1:100.0,I2:99.3,AF1:0.40,AF2:0.61	LSR	NA	NA
 6 | 63	indel_k96_4449027	1742	12:104359630-104359778,12:125801148-125802740	1-149,150-1742	-,+	33	0,6	9	5	4	inversion	12:104359630|12:125801148	21441517	TDG,NA	ENST00000392872,NA	-,NA	exon1,NA	no,NA	NA	inv12q23.3-q24.31	R,R	NA	NA	NA	NA	NA	gctggactcaagctcctcctccaggcttctaccgtcccccacggacccccCTGAGTAGATGATTTTCAGCTGAGGTCTGAGTAGTGGGAAGGGACTGACT	-	L2a	TO:0.00,CO:0.00,CC:1.00,I1:100.0,I2:99.8,AF1:0.09,AF2:0.91	LSR	NA	NA
 7 | 634	2130795	190	7:150746563-150746657,15:84810725-84810819	96-190,1-95	-,-	67	0,9	21	13	8	translocation	7:150746657|15:84810725	-	ASIC3,NA	ENST00000357922,NA	-,NA	intron1,NA	NA,NA	NA	t(7;15)(q36.1;q25.2)	L,R	NA	NA	NA	NA	NA	aacaggtacaattagggagaggctatgtcaatgcaggaaaaggtcttatcGGCACTGGGGGGTGGGGAGTCCATGGCTGGTAGGAAGGAAGAGGTCCCCT	-	segdup_chr15:82483003	TO:0.00,CO:0.00,CC:1.00,I1:100.0,I2:100.0,AF1:0.50,AF2:0.50	LSR	NA	NA
 8 | 296	281201	187	3:24565106-24565200,3:24566179-24566273	93-187,1-95	+,-	118	0,19	44	20	24	inversion	3:24565106|3:24566179	1072	NA,NA	NA,NA	NA,NA	NA,NA	NA,NA	NA	inv3p24.2	R,R	NA	NA	NA	NA	NA	tcgtgtttcattctgcctgagagcagtctacctaaatatatagctctgctcACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACA	-	-	TO:0.00,CO:0.02,CC:1.00,I1:100.0,I2:98.9,AF1:0.51,AF2:0.51	LSR	NA	NA
 9 | 625	1719994	191	7:125746029-125746123,7:126166901-126166995	1-95,97-191	+,+	59	0,9	24	13	11	deletion	7:125746123|7:126166901	420777	NA,GRM8	NA,ENST00000339582	NA,-	NA,intron9	NA,NA	NA	del7q31.33	L,R	NA	NA	NA	NA	NA	atgaagaagaaaagagaaatttttaaataggtagtagcagaaattataaatGCATATCATTTAAATTAAGAGCATAAATGAGGCCACATAAATGCTTTCTT	L1PA15-16	-	TO:0.00,CO:0.00,CC:0.99,I1:100.0,I2:100.0,AF1:0.50,AF2:0.50	LSR	-	NA
10 | 617	4285174	188	7:104485067-104485161,7:104612208-104612302	1-95,94-188	-,-	72	0,12	30	16	14	duplication	7:104485067|7:104612302	127234	LHFPL3,NA	ENST00000535008,NA	-,NA	intron4,NA	NA,NA	NA	dup7q22.2-q22.3	R,L	NA	NA	NA	NA	NA	ttagacatcattgttgtttttattttatctttggtttcctcaggcaatacCCTTGGAATGACACATTATCCTCCCTTCACATGTAGCAATTGTAAATTCC	-	-	TO:0.00,CO:0.01,CC:1.00,I1:100.0,I2:100.0,AF1:0.51,AF2:0.51	LSR	NA	NA
11 | 445	2769447	187	7:126098488-126098582,7:126167441-126167535	93-187,1-95	+,-	62	0,11	27	16	11	inversion	7:126098488|7:126167441	68952	GRM8,GRM8	ENST00000339582,ENST00000339582	-,+	intron9,intron9	NA,NA	NA	inv7q31.33	R,R	-	-	-	-	NA	atcgttaatcactgcatataactatcttaggctacctgttggtaaactataTGCAAAGAATATATATACACACATACAATTAATCCATTATCACAATGTAT	-	-	TO:0.00,CO:0.02,CC:1.00,I1:100.0,I2:100.0,AF1:0.51,AF2:0.51	LSR	NA	NA
12 | 527	3739669	201	9:28031863-28031957,9:28034467-28034561	1-95,107-201	-,+	23	0,6	16	10	6	inversion	9:28031863|9:28034467	2603	LINGO2,LINGO2	ENST00000379992,ENST00000379992	+,-	intron4,intron4	NA,NA	NA	inv9p21.1	R,R	-	-	-	-	NA	ccagattgaaggtattttaaggaggatttggagcatcatggtgaagcgtgaattccgaaaaGAAAGCTCAGCCTGGCTTTTGTGGCCCAGAAGCCCAGAATTTCAGCAACT	-	-	TO:0.00,CO:0.00,CC:0.95,I1:100.0,I2:100.0,AF1:0.47,AF2:0.47	LSR	NA	NA
13 | 747	3253092	198	X:31196849-31196943,X:31216211-31216305	1-95,104-198	+,+	36	0,14	20	11	9	deletion	X:31196943|X:31216211	19267	DMD,DMD	ENST00000357033,ENST00000357033	-,-	intron69,intron67	NA,NA	NA	delXp21.2	L,R	DMD	DMD	67	70	in	aagtctcgaacatcttctcctgatgtagtctaaaagggagatcatggtgaatgtagtgAATGTAGTGAAGATCGGGGGATAAAAAAGGGATGGTTAATGGGTACAAAA	-	L1MA4	TO:0.00,CO:0.00,CC:0.96,I1:100.0,I2:100.0,AF1:0.48,AF2:0.48	LSR	-	NA
14 | 


--------------------------------------------------------------------------------
/docs/install.md:
--------------------------------------------------------------------------------
  1 | # Install Instructions
  2 | 
  3 | Once the install steps are complete [MAVIS](http://mavis.bcgsc.ca) is ready to be run.
  4 | See the MAVIS [tutorial](https://mavis.readthedocs.io/en/latest/tutorials/mini) to learn about running MAVIS.
  5 | 
  6 | For either install option you will want to install the main Snakefile. It is best to use a tag to
  7 | specify the version of interest but you can download the latest version from the master branch
  8 | 
  9 | ```bash
 10 | wget https://raw.githubusercontent.com/bcgsc/mavis/master/Snakefile -O Snakefile
 11 | ```
 12 | 
 13 | ## Install for Docker/Singularity
 14 | 
 15 | The simplest way to use MAVIS is via Singularity. The MAVIS docker container used
 16 | by singularity will take care of installing the aligner as well.
 17 | 
 18 | ```bash
 19 | pip install -U setuptools pip wheel
 20 | pip install mavis_config  # also installs snakemake
 21 | ```
 22 | 
 23 | Now you will run mavis via Snakemake as follows
 24 | 
 25 | ```bash
 26 | snakemake \
 27 |     -j <MAX JOBS> \
 28 |     --configfile <YOUR CONFIG> \
 29 |     --use-singularity \
 30 |     -s Snakefile
 31 | ```
 32 | 
 33 | ## Install (Python Only)
 34 | 
 35 | MAVIS can also be run with just python. However you will need to install the aligner(s) required
 36 | by MAVIS separately and ensure they are available on the default PATH variable when MAVIS is run.
 37 | 
 38 | ### 1. Install Aligner
 39 | 
 40 | In addition to the python package dependencies, [MAVIS](http://mavis.bcgsc.ca) also requires an aligner to be installed.
 41 | Currently the only aligners supported are [blat](https://mavis.readthedocs.io/en/latest/glossary/#blat) and [bwa mem](https://mavis.readthedocs.io/en/latest/glossary/#bwa).
 42 | For MAVIS to run successfully the aligner must be installed and accessible on the path.
 43 | If you have a non-standard install you may find it useful to edit the PATH environment variable. For example
 44 | 
 45 | ``` bash
 46 | export PATH=/path/to/directory/containing/blat/binary:$PATH
 47 | ```
 48 | 
 49 | [blat](http://mavis.bcgsc.ca/docs/latest/glossary.html#term-blat) is the default aligner. To configure MAVIS to use [bwa mem](http://mavis.bcgsc.ca/docs/latest/glossary.html#term-bwa) it must be specified
 50 | in the [config](https://mavis.readthedocs.io/en/latest/configuration/settings/) JSON file.
 51 | 
 52 | After this has been installed MAVIS itself can be installed through [pip](https://pypi.org/project/mavis/)
 53 | 
 54 | ### 2. Install MAVIS
 55 | 
 56 | #### Install using pip
 57 | 
 58 | The easiest way to install [MAVIS](http://mavis.bcgsc.ca) is through the python package manager, pip. If you do not have python3 installed it can be found [here](https://www.python.org/downloads)
 59 | 
 60 | Ensuring you have a recent version of pip and setuptools will improve the install experience. Older versions of pip and setuptools may have issues with obtaining some of the mavis python dependencies
 61 | 
 62 | ``` bash
 63 | pip install --upgrade pip setuptools
 64 | ```
 65 | 
 66 | or (for Anaconda users)
 67 | 
 68 | ``` bash
 69 | conda update pip setuptools
 70 | ```
 71 | 
 72 | If this is not a clean/new python install it may be useful to set up mavis in a [virtual python environment](https://docs.python.org/3/tutorial/venv.html)
 73 | 
 74 | Then install mavis itself
 75 | 
 76 | ``` bash
 77 | pip install mavis
 78 | ```
 79 | 
 80 | This will install mavis and its python dependencies.
 81 | 
 82 | #### Install using Buildout
 83 | 
 84 | Alternatively you can use the [bootstrap/buildout](http://www.buildout.org/en/latest/) to install mavis into bin/mavis
 85 | 
 86 | ``` bash
 87 | git clone https://github.com/bcgsc/mavis.git
 88 | cd mavis
 89 | pip install zc.buildout
 90 | python bootstrap.py
 91 | bin/buildout
 92 | ```
 93 | 
 94 | This will install mavis and its python dependencies into eggs inside the cloned mavis directory which can be used by simply running bin/mavis
 95 | 
 96 | Finally you will need to Build/Download the necessary reference files
 97 | 
 98 | ## Build or Download Reference Files
 99 | 
100 | After [MAVIS](http://mavis.bcgsc.ca) is installed the [reference files](https://mavis.readthedocs.io/en/latest/inputs/reference) must be generated (or downloaded) before it can be run. Simple bash scripts to download the hg19 and hg38 reference files are provided under src/tools for convenience.
101 | 
102 | ### Download Hg19 Files
103 | 
104 | ``` bash
105 | cd /path/to/where/you/want/to/put/the/files
106 | wget https://raw.githubusercontent.com/bcgsc/mavis/master/src/tools/get_hg19_reference_files.sh
107 | bash get_hg19_reference_files.sh
108 | ```
109 | 
110 | You should now see the reference files in the current directory
111 | 
112 | ```text
113 | .
114 | |-- cytoBand.txt
115 | |-- dgv_hg19_variants.tab
116 | |-- ensembl69_hg19_annotations.json
117 | |-- get_hg19_reference_files.sh
118 | |-- hg19.2bit
119 | |-- hg19.fa
120 | `-- hg19_masking.tab
121 | ```
122 | 
123 | ### Download Hg38 Files
124 | 
125 | ``` bash
126 | cd /path/to/where/you/want/to/put/the/files
127 | wget https://raw.githubusercontent.com/bcgsc/mavis/master/src/tools/get_hg38_reference_files.sh
128 | bash get_hg38_reference_files.sh
129 | ```
130 | 
131 | You should now see the reference files in the current directory
132 | 
133 | ```text
134 | .
135 | |-- cytoBand.txt
136 | |-- dgv_hg38_variants.tab
137 | |-- ensembl79_hg38_annotations.json
138 | |-- get_hg38_reference_files.sh
139 | |-- GCA_000001405.15_GRCh38_no_alt_analysis_set.fna
140 | |-- GRCh38_masking.tab
141 | `-- hg38.2bit
142 | ```
143 | 


--------------------------------------------------------------------------------
/tests/test_mavis/convert/test_tools_vcf.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | from mavis.convert.vcf import VcfInfoType, VcfRecordType, convert_record, pandas_vcf
  4 | 
  5 | from ...util import get_data
  6 | 
  7 | 
  8 | def test_read_vcf():
  9 |     header, df = pandas_vcf(get_data('sniffles.vcf'))
 10 |     assert len(header) == 231
 11 |     assert df.shape[0] == 106
 12 | 
 13 | 
 14 | def test_convert_telomeric_region():
 15 |     variant_imprecise = VcfRecordType(
 16 |         id='mock-BND-imprecise',
 17 |         pos=0,
 18 |         chrom='chr14_KI270722v1_random',
 19 |         alts=['N[chr17_GL000205v2_random:0['],
 20 |         ref='N',
 21 |         info=VcfInfoType(
 22 |             IMPRECISE=True,
 23 |             SVMETHOD="Snifflesv1.0.11",
 24 |             SVTYPE="BND",
 25 |             SUPTYPE="SR",
 26 |             SVLEN="0",
 27 |             STRANDS="+-",
 28 |             RE="5",
 29 |             REF_strand="0,0",
 30 |             AF="1",
 31 |         ),
 32 |     )
 33 |     variant_precise = VcfRecordType(
 34 |         id='mock-BND-precise',
 35 |         pos=0,
 36 |         chrom='chr14_KI270722v1_random',
 37 |         alts=[']chrUn_GL000216v2:142821]N'],
 38 |         ref='N',
 39 |         info=VcfInfoType(
 40 |             IMPRECISE=False,
 41 |             SVMETHOD="Snifflesv1.0.11",
 42 |             SVTYPE="BND",
 43 |             SUPTYPE="SR",
 44 |             SVLEN="0",
 45 |             STRANDS="+-",
 46 |             RE="5",
 47 |             REF_strand="0,0",
 48 |             AF="1",
 49 |         ),
 50 |     )
 51 |     imprecise_records = convert_record(variant_imprecise)
 52 |     assert len(imprecise_records) == 1
 53 |     imprecise_records = imprecise_records[0]
 54 |     assert imprecise_records.get('break1_position_end') == 1
 55 | 
 56 |     precise_records = convert_record(variant_precise)
 57 |     assert len(precise_records) == 1
 58 |     precise_records = precise_records[0]
 59 |     assert precise_records.get('break1_position_end') == 1
 60 | 
 61 |     assert precise_records.get('break1_chromosome') == 'chr14_KI270722v1_random'
 62 |     assert imprecise_records.get('break1_chromosome') == 'chr14_KI270722v1_random'
 63 | 
 64 | 
# position shared by the parametrized cuteSV INS breakend conversion tests in this module
TEST_POS = 1853407
 66 | 
 67 | 
 68 | @pytest.mark.parametrize(
 69 |     'pos,break1_ci,break2_ci,break1,break2,ids',
 70 |     [
 71 |         [
 72 |             TEST_POS,
 73 |             (-30, 30),
 74 |             (-65, 65),
 75 |             (TEST_POS - 30, TEST_POS + 30),
 76 |             (TEST_POS - 30, TEST_POS + 65),
 77 |             'vcf-cuteSV.INS.breakpoint_2_start < breakpoint_1_start',
 78 |         ],
 79 |         [
 80 |             TEST_POS,
 81 |             (-30, 99999),
 82 |             (-10, 65),
 83 |             (TEST_POS - 30, TEST_POS + 65),
 84 |             (TEST_POS - 10, TEST_POS + 65),
 85 |             'vcf-cuteSV.INS.breakpoint_1_end > breakpoint_2_end',
 86 |         ],
 87 |     ],
 88 |     ids=[
 89 |         'breakpoint_2_start < breakpoint_1_start',
 90 |         'breakpoint_1_end > breakpoint_2_end',
 91 |     ],
 92 | )
 93 | def test_convert_intrachromosomal_imprecise_breakend(
 94 |     pos, break1_ci, break2_ci, break1, break2, ids
 95 | ):
 96 |     variant_vcf = VcfRecordType(
 97 |         id=ids,
 98 |         pos=pos,
 99 |         chrom='chr5',
100 |         alts=['AGG'],
101 |         ref='A',
102 |         info=VcfInfoType(
103 |             CHR2="chr5",
104 |             IMPRECISE=True,
105 |             SVMETHOD="cuteSV-1.0.12",
106 |             SVTYPE="INS",
107 |             CIPOS=break1_ci,
108 |             CILEN=break2_ci,
109 |         ),
110 |     )
111 |     result = convert_record(variant_vcf)
112 |     assert len(result) == 1
113 |     variant = result[0]
114 |     assert variant.get('break1_position_start') == break1[0]
115 |     assert variant.get('break1_position_end') == break1[1]
116 |     assert variant.get('break2_position_start') == break2[0]
117 |     assert variant.get('break2_position_end') == break2[1]
118 | 
119 | 
@pytest.mark.parametrize(
    'pos,break1_ci,break2_ci,break1,break2,ids',
    [
        (
            TEST_POS,
            (-30, 99999),
            (70, 65),
            (TEST_POS - 30, TEST_POS + 65),
            (TEST_POS + 65, TEST_POS + 65),
            'vcf-cuteSV.INS.breakpoint_2_start > breakpoint_2_end',
        ),
    ],
    ids=[
        'breakpoint_2_start > breakpoint_2_end',
    ],
)
def test_error_on_convert_intrachromosomal_imprecise_breakend(
    pos, break1_ci, break2_ci, break1, break2, ids
):
    """
    converting an imprecise cuteSV INS record built from these intervals raises ValueError
    """
    # break1/break2 are part of the parametrize signature but are not checked here
    info = VcfInfoType(
        CHR2='chr5',
        IMPRECISE=True,
        SVMETHOD='cuteSV-1.0.12',
        SVTYPE='INS',
        CIPOS=break1_ci,
        CILEN=break2_ci,
    )
    variant_vcf = VcfRecordType(id=ids, pos=pos, chrom='chr5', alts=['AGG'], ref='A', info=info)
    with pytest.raises(ValueError):
        convert_record(variant_vcf)
156 | 
157 | 
def test_convert_intrachromosomal_imprecise_breakend_no_ci():
    """
    an imprecise INS record without confidence intervals whose END (11183) precedes
    its position (11184) is rejected with ValueError
    """
    # breakpoint_1_start > breakpoint_1_end
    info = VcfInfoType(CHR2='chr2', IMPRECISE=True, SVTYPE='INS', END=11183)
    variant_cilen4 = VcfRecordType(
        id='Sniffle.INS', pos=11184, chrom='chr2', alts=['AGG'], ref='N', info=info
    )
    with pytest.raises(ValueError):
        convert_record(variant_cilen4)
175 | 


--------------------------------------------------------------------------------
/src/mavis/overlay.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from typing import Dict, List, Tuple
  3 | 
  4 | from . import annotate as _annotate
  5 | from . import util as _util
  6 | from .annotate.file_io import ReferenceFile
  7 | from .error import DrawingFitError
  8 | from .illustrate.constants import DiagramSettings
  9 | from .illustrate.diagram import draw_multi_transcript_overlay
 10 | from .illustrate.scatter import bam_to_scatter
 11 | 
 12 | 
 13 | def check_overlay_args(args, parser):
 14 |     """
 15 |     parse the overlay options and check the formatting
 16 |     """
 17 |     # check complex options
 18 |     for marker in args.markers:
 19 |         if len(marker) < 3:
 20 |             marker.append(marker[-1])
 21 |         try:
 22 |             marker[1] = int(marker[1])
 23 |             marker[2] = int(marker[2])
 24 |         except ValueError:
 25 |             parser.error('argument --marker: start and end must be integers: {}'.format(marker))
 26 | 
 27 |     defaults = [None, None, 0.5, None, True]
 28 |     bam_file, density, ymax, stranded = range(1, 5)
 29 | 
 30 |     for plot in args.read_depth_plots:
 31 |         for i, d in enumerate(defaults):
 32 |             if i >= len(plot):
 33 |                 plot.append(d)
 34 |         if not os.path.exists(plot[bam_file]):
 35 |             parser.error(
 36 |                 'argument --read_depth_plots: the bam file given does not exist: {}'.format(
 37 |                     plot[bam_file]
 38 |                 )
 39 |             )
 40 |         try:
 41 |             plot[density] = float(plot[density])
 42 |             if plot[density] < 0 or plot[density] > 1:
 43 |                 raise ValueError()
 44 |         except ValueError:
 45 |             parser.error(
 46 |                 'argument --read_depth_plots: density must be an float between 0 and 1: {}'.format(
 47 |                     plot[density]
 48 |                 )
 49 |             )
 50 |         try:
 51 |             if str(plot[ymax]).lower() in ['null', 'none']:
 52 |                 plot[ymax] = None
 53 |             else:
 54 |                 plot[ymax] = int(plot[ymax])
 55 |         except ValueError:
 56 |             parser.error(
 57 |                 'argument --read_depth_plots: ymax must be an integer: {}'.format(plot[ymax])
 58 |             )
 59 |         try:
 60 |             plot[stranded] = _util.cast_boolean(plot[stranded])
 61 |         except TypeError:
 62 |             parser.error(
 63 |                 'argument --read_depth_plots: stranded must be an boolean: {}'.format(
 64 |                     plot[stranded]
 65 |                 )
 66 |             )
 67 |     return args
 68 | 
 69 | 
 70 | def main(
 71 |     gene_name: str,
 72 |     output: str,
 73 |     config: Dict,
 74 |     buffer_length: int,
 75 |     read_depth_plots,
 76 |     markers: List[Tuple[str, int, int]],
 77 |     ymax_color='#FF0000',
 78 |     **kwargs,
 79 | ):
 80 |     """
 81 |     generates an overlay diagram
 82 |     """
 83 |     annotations = ReferenceFile.load_from_config(config, 'annotations')
 84 |     annotations.load()
 85 |     drawing_width_iter_increase = config['illustrate.drawing_width_iter_increase']
 86 |     max_drawing_retries = config['illustrate.max_drawing_retries']
 87 |     min_mapping_quality = config['validate.min_mapping_quality']
 88 |     # check options formatting
 89 |     gene_to_draw = None
 90 | 
 91 |     for chrom in annotations.content:
 92 |         for gene in annotations.content[chrom]:
 93 |             if gene_name in gene.aliases or gene_name == gene.name:
 94 |                 gene_to_draw = gene
 95 |                 _util.logger.info(
 96 |                     f'Found target gene: {gene.name}(aka. {gene.aliases}) {gene.chr}:{gene.start}-{gene.end}'
 97 |                 )
 98 |                 break
 99 |     if gene_to_draw is None:
100 |         raise KeyError('Could not find gene alias or id in annotations file', gene_name)
101 | 
102 |     settings = DiagramSettings(**kwargs)
103 | 
104 |     genomic_min = max(gene_to_draw.start - buffer_length, 1)
105 |     genomic_max = gene_to_draw.end + buffer_length
106 | 
107 |     plots = []
108 |     for axis_name, bam_file, density, ymax, stranded in read_depth_plots:
109 |         # one plot per bam
110 |         plots.append(
111 |             bam_to_scatter(
112 |                 bam_file,
113 |                 gene_to_draw.chr,
114 |                 genomic_min,
115 |                 genomic_max,
116 |                 strand=gene_to_draw.get_strand() if stranded else None,
117 |                 ymax=ymax,
118 |                 density=density,
119 |                 axis_name=axis_name,
120 |                 min_mapping_quality=min_mapping_quality,
121 |                 ymax_color=ymax_color,
122 |             )
123 |         )
124 | 
125 |     vmarkers = []
126 | 
127 |     for i, (marker_name, marker_start, marker_end) in enumerate(markers):
128 |         vmarkers.append(
129 |             _annotate.base.BioInterval(gene_to_draw.chr, marker_start, marker_end, name=marker_name)
130 |         )
131 | 
132 |     canvas = None
133 |     attempts = 1
134 |     while True:
135 |         try:
136 |             canvas = draw_multi_transcript_overlay(
137 |                 settings,
138 |                 gene_to_draw,
139 |                 vmarkers=vmarkers,
140 |                 plots=plots,
141 |                 window_buffer=buffer_length,
142 |             )
143 |             break
144 |         except DrawingFitError as err:
145 |             if attempts > max_drawing_retries:
146 |                 raise err
147 |             _util.logger.info(f'Drawing fit: extending window {drawing_width_iter_increase}')
148 |             settings.width += drawing_width_iter_increase
149 |             attempts += 1
150 | 
151 |     svg_output_file = os.path.join(output, '{}_{}_overlay.svg'.format(gene_to_draw.name, gene_name))
152 |     _util.logger.info(f'writing: {svg_output_file}')
153 | 
154 |     canvas.saveas(svg_output_file)
155 | 


--------------------------------------------------------------------------------
/src/mavis/config.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | from copy import copy as _copy
  3 | from typing import Dict
  4 | 
  5 | from .annotate.file_io import ReferenceFile
  6 | from .bam import stats
  7 | from .bam.cache import BamCache
  8 | from .constants import PROTOCOL, float_fraction
  9 | from .util import cast_boolean, filepath
 10 | 
 11 | 
 12 | def calculate_bam_stats(config: Dict, library_name: str) -> Dict:
 13 |     """
 14 |     Calculate the read stats for a library from a given bam file
 15 |     """
 16 |     library = config['libraries'][library_name]
 17 |     annotations = ReferenceFile('annotations', *config['reference.annotations'])
 18 | 
 19 |     if library['protocol'] == PROTOCOL.TRANS:
 20 |         if annotations is None or annotations.is_empty():
 21 |             raise AttributeError(
 22 |                 'missing required attribute: annotations. Annotations must be given for transcriptomes'
 23 |             )
 24 |         annotations.load()
 25 |     bam = BamCache(library['bam_file'], stranded=library['strand_specific'])
 26 |     if library['protocol'] == PROTOCOL.TRANS:
 27 |         bam_stats = stats.compute_transcriptome_bam_stats(
 28 |             bam,
 29 |             annotations=annotations.content,
 30 |             sample_size=config['bam_stats.sample_size'],
 31 |             sample_cap=config['bam_stats.sample_cap'],
 32 |             distribution_fraction=config['bam_stats.distribution_fraction'],
 33 |         )
 34 |         return {
 35 |             'median_fragment_size': int(bam_stats.median_fragment_size),
 36 |             'read_length': int(bam_stats.read_length),
 37 |             'stdev_fragment_size': int(bam_stats.stdev_fragment_size),
 38 |             'strand_specific': bam_stats.stranded,
 39 |             'strand_determining_read': bam_stats.strand_determining_read,
 40 |         }
 41 |     bam_stats = stats.compute_genome_bam_stats(
 42 |         bam,
 43 |         sample_size=config['bam_stats.sample_size'],
 44 |         sample_bin_size=config['bam_stats.sample_bin_size'],
 45 |         sample_cap=config['bam_stats.sample_cap'],
 46 |         distribution_fraction=config['bam_stats.distribution_fraction'],
 47 |     )
 48 |     return {
 49 |         'median_fragment_size': int(bam_stats.median_fragment_size),
 50 |         'read_length': int(bam_stats.read_length),
 51 |         'stdev_fragment_size': int(bam_stats.stdev_fragment_size),
 52 |     }
 53 | 
 54 | 
 55 | class CustomHelpFormatter(argparse.ArgumentDefaultsHelpFormatter):
 56 |     """
 57 |     subclass the default help formatter to stop default printing for required arguments
 58 |     """
 59 | 
 60 |     def _format_args(self, action, default_metavar):
 61 |         if action.metavar is None:
 62 |             action.metavar = get_metavar(action.type)
 63 |         if isinstance(action, RangeAppendAction):
 64 |             return '%s' % self._metavar_formatter(action, default_metavar)(1)
 65 |         return super(CustomHelpFormatter, self)._format_args(action, default_metavar)
 66 | 
 67 |     def _get_help_string(self, action):
 68 |         if action.required:
 69 |             return action.help
 70 |         return super(CustomHelpFormatter, self)._get_help_string(action)
 71 | 
 72 |     def add_arguments(self, actions):
 73 |         # sort the arguments alphanumerically so they print in the help that way
 74 |         actions = sorted(actions, key=lambda x: getattr(x, 'option_strings'))
 75 |         super(CustomHelpFormatter, self).add_arguments(actions)
 76 | 
 77 | 
 78 | class RangeAppendAction(argparse.Action):
 79 |     """
 80 |     allows an argument to accept a range of arguments
 81 |     """
 82 | 
 83 |     def __init__(self, nmin=1, nmax=None, **kwargs):
 84 |         kwargs.setdefault('nargs', '+')
 85 |         kwargs.setdefault('default', [])
 86 |         argparse.Action.__init__(self, **kwargs)
 87 |         self.nmin = nmin
 88 |         self.nmax = nmax
 89 |         assert nmin is not None
 90 | 
 91 |     def __call__(self, parser, namespace, values, option_string=None):
 92 |         if getattr(namespace, self.dest, None) is None:
 93 |             setattr(namespace, self.dest, [])
 94 |         items = _copy(getattr(namespace, self.dest))
 95 |         items.append(values)
 96 |         if self.nmax is None:
 97 |             if len(values) < self.nmin:
 98 |                 raise argparse.ArgumentError(
 99 |                     self, 'must have at least {} arguments. Given: {}'.format(self.nmin, values)
100 |                 )
101 |         elif not self.nmin <= len(values) <= self.nmax:
102 |             raise argparse.ArgumentError(
103 |                 self, 'requires {}-{} arguments. Given: {}'.format(self.nmin, self.nmax, values)
104 |             )
105 |         setattr(namespace, self.dest, items)
106 | 
107 | 
def add_bamstats_to_config(config: Dict):
    """
    Fill in the bam statistics of every library which is missing any of them

    Each library in config['libraries'] lacking 'median_fragment_size', 'read_length'
    or 'stdev_fragment_size' is updated in place with the values computed by
    calculate_bam_stats. Libraries which already have all three are left untouched.
    """
    required_stats = ('median_fragment_size', 'read_length', 'stdev_fragment_size')
    for libname, library in config['libraries'].items():
        if all(stat in library for stat in required_stats):
            continue
        # calculate the bam_stats if they have not been given
        library.update(calculate_bam_stats(config, libname))
123 | 
124 | 
def get_metavar(arg_type):
    """
    Look up the metavar string to pass to add_argument for a given argument type

    Returns None when the type has no dedicated metavar.

    Example:
        >>> get_metavar(bool)
        '{True,False}'
    """
    # checked in order; the first group containing the type decides the label
    metavar_by_types = (
        ((bool, cast_boolean), '{True,False}'),
        ((float_fraction, float), 'FLOAT'),
        ((int,), 'INT'),
        ((filepath,), 'FILEPATH'),
    )
    for candidates, label in metavar_by_types:
        if arg_type in candidates:
            return label
    return None
142 | 


--------------------------------------------------------------------------------