├── src ├── mavis │ ├── py.typed │ ├── bam │ │ └── __init__.py │ ├── annotate │ │ ├── __init__.py │ │ └── constants.py │ ├── illustrate │ │ └── __init__.py │ ├── pairing │ │ ├── __init__.py │ │ └── constants.py │ ├── summary │ │ ├── __init__.py │ │ └── constants.py │ ├── validate │ │ ├── __init__.py │ │ └── constants.py │ ├── cluster │ │ └── __init__.py │ ├── __init__.py │ ├── types.py │ ├── error.py │ ├── convert │ │ ├── starfusion.py │ │ ├── straglr.py │ │ ├── cnvnator.py │ │ ├── breakdancer.py │ │ ├── arriba.py │ │ ├── chimerascan.py │ │ ├── constants.py │ │ └── transabyss.py │ ├── overlay.py │ └── config.py └── tools │ ├── __init__.py │ ├── get_hg38_reference_files.sh │ ├── get_hg19_reference_files.sh │ └── find_repeats.py ├── tests ├── __init__.py ├── snakemake │ └── __init__.py ├── test_mavis │ ├── __init__.py │ ├── bam │ │ └── __init__.py │ ├── annotate │ │ ├── __init__.py │ │ ├── test_annotate_fileio2.py │ │ └── test_annotate_fileio.py │ ├── cluster │ │ ├── __init__.py │ │ └── test_cluster.py │ ├── convert │ │ ├── __init__.py │ │ └── test_tools_vcf.py │ ├── pairing │ │ └── __init__.py │ ├── summary │ │ └── __init__.py │ ├── validate │ │ ├── __init__.py │ │ └── test_validate.py │ ├── illustrate │ │ ├── __init__.py │ │ └── test_illustrate.py │ ├── test_constants.py │ ├── test_blat.py │ └── test_help.py ├── test_tools │ ├── __init__.py │ ├── test_convert_dgv.py │ ├── data │ │ ├── ensembl69_hg19_annotations.kras.tab │ │ └── K02718.1.gff3 │ ├── test_convert_annotations_format.py │ └── test_ref_alt_count.py ├── data │ ├── mock_masking.tab │ ├── mock_reference_genome.fa.amb │ ├── mock_dgv_annotation_malformed.tab │ ├── pindel_events.vcf.gz │ ├── mock_reference_genome.2bit │ ├── mock_reference_genome.fa.sa │ ├── mock_reference_genome.fa.bwt │ ├── mock_reference_genome.fa.pac │ ├── mock_reads_for_events.sorted.bam │ ├── mini_mock_reads_for_events.sorted.bam │ ├── mock_reads_for_events.sorted.bam.bai │ ├── mock_trans_reads_for_events.sorted.bam │ ├── 
mini_mock_reads_for_events.sorted.bam.bai │ ├── mock_trans_reads_for_events.sorted.bam.bai │ ├── mock_dgv_annotation.tab │ ├── reference_from_env.cfg │ ├── mini_mock_sv_events.svmerge.tsv │ ├── pairing_reference_annotations_file.tab │ ├── clustering_input.tab │ ├── bad_input_file.cfg │ ├── straglr.bed │ ├── cnvnator.tab │ ├── mock_trans_sv_events.tsv │ ├── mock_pairing_input.tab │ ├── mock_reference_genome.fa.ann │ ├── mock_reference_annotations.json │ ├── breakdancer_output.txt │ ├── bwa_pipeline_config.cfg │ ├── missing_reference.cfg │ ├── clean_pipeline_config.cfg │ ├── no_opt_pipeline.cfg │ ├── Library-clusterset-N.validated.tsv │ ├── pipeline_config.cfg │ ├── mock_reference_annotations.full.json │ ├── transabyss_indels_output.tab │ ├── mock_dgv_annotation_mavis.tab │ ├── mock_sv_events.tsv │ ├── build.cfg │ ├── mock_reference_annotations2.json │ └── transabyss_events.tab ├── setup_subprocess_cov.py ├── util.py ├── mini-tutorial.annotate_only.config.json ├── mini-tutorial.config.json └── full-tutorial.config.json ├── requirements.txt ├── docs ├── index.md ├── extra.css ├── background │ ├── .pages │ └── citations.md ├── tutorials │ ├── .pages │ ├── mini.md │ └── annotation.md ├── inputs │ ├── .pages │ ├── non_python_dependencies.md │ └── standard.md ├── images │ ├── icon.png │ ├── ENSG00000139687_RB1_overlay.png │ ├── snakemake.cluster.full-tutorial.png │ ├── snakemake.cluster.mini-tutorial.png │ ├── snakemake.validate.mini-tutorial.png │ ├── colo829_tumour_annotation_resource_req.png │ ├── colo829_tumour_validation_resource_req.png │ ├── get_app-24px.svg │ └── Fusion-ext.gpl ├── outputs │ ├── index.md │ └── illustrations.md ├── package │ └── mavis │ │ ├── summary │ │ └── index.md │ │ ├── pairing │ │ └── index.md │ │ ├── cluster │ │ └── index.md │ │ ├── annotate │ │ └── index.md │ │ └── validate │ │ └── index.md ├── configuration │ ├── pipeline.md │ ├── performance.md │ └── general.md ├── migrating.md ├── development.md ├── hooks.py └── install.md ├── 
pyproject.toml ├── codecov.yml ├── MANIFEST.in ├── .coveragerc ├── .readthedocs.yml ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── workflows │ ├── publish.yml │ ├── quick-tests.yml │ └── build.yml └── CONTRIBUTING.md ├── .gitignore ├── mkdocs.yml ├── setup.py ├── env ├── example.sh └── generate_ensembl79_annotations.sh ├── Dockerfile ├── setup.cfg └── README.md /src/mavis/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | . 2 | -------------------------------------------------------------------------------- /src/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/mavis/bam/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/snakemake/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_mavis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/mavis/annotate/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/mavis/illustrate/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/mavis/pairing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/mavis/summary/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/mavis/validate/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_mavis/bam/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | {!./../README.md!} 2 | -------------------------------------------------------------------------------- /tests/test_mavis/annotate/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_mavis/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_mavis/convert/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/tests/test_mavis/pairing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_mavis/summary/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_mavis/validate/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_mavis/illustrate/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data/mock_masking.tab: -------------------------------------------------------------------------------- 1 | chr start end name -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | build-backend = "setuptools.build_meta" 2 | -------------------------------------------------------------------------------- /tests/data/mock_reference_genome.fa.amb: -------------------------------------------------------------------------------- 1 | 1054073 24 0 2 | -------------------------------------------------------------------------------- /docs/extra.css: -------------------------------------------------------------------------------- 1 | td + td > a { 2 | display: flex; 3 | } 4 | -------------------------------------------------------------------------------- /docs/background/.pages: -------------------------------------------------------------------------------- 1 | nav: 2 | - theory.md 3 | - citations.md 4 | -------------------------------------------------------------------------------- /docs/tutorials/.pages: 
-------------------------------------------------------------------------------- 1 | nav: 2 | - mini.md 3 | - full.md 4 | - ... 5 | -------------------------------------------------------------------------------- /src/mavis/validate/constants.py: -------------------------------------------------------------------------------- 1 | PASS_FILENAME = 'validation-passed.tab' 2 | -------------------------------------------------------------------------------- /docs/inputs/.pages: -------------------------------------------------------------------------------- 1 | nav: 2 | - reference.md 3 | - standard.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/images/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/icon.png -------------------------------------------------------------------------------- /tests/data/mock_dgv_annotation_malformed.tab: -------------------------------------------------------------------------------- 1 | chromosome beginning ending unknown 2 | -------------------------------------------------------------------------------- /tests/data/pindel_events.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/pindel_events.vcf.gz -------------------------------------------------------------------------------- /tests/data/mock_reference_genome.2bit: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reference_genome.2bit -------------------------------------------------------------------------------- /tests/data/mock_reference_genome.fa.sa: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reference_genome.fa.sa 
-------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | target: 80% 6 | threshold: 1% 7 | -------------------------------------------------------------------------------- /src/mavis/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['merge_breakpoint_pairs'] 2 | 3 | 4 | from .cluster import merge_breakpoint_pairs 5 | -------------------------------------------------------------------------------- /tests/data/mock_reference_genome.fa.bwt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reference_genome.fa.bwt -------------------------------------------------------------------------------- /tests/data/mock_reference_genome.fa.pac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reference_genome.fa.pac -------------------------------------------------------------------------------- /docs/images/ENSG00000139687_RB1_overlay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/ENSG00000139687_RB1_overlay.png -------------------------------------------------------------------------------- /tests/data/mock_reads_for_events.sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reads_for_events.sorted.bam -------------------------------------------------------------------------------- /docs/images/snakemake.cluster.full-tutorial.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/snakemake.cluster.full-tutorial.png -------------------------------------------------------------------------------- /docs/images/snakemake.cluster.mini-tutorial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/snakemake.cluster.mini-tutorial.png -------------------------------------------------------------------------------- /docs/images/snakemake.validate.mini-tutorial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/snakemake.validate.mini-tutorial.png -------------------------------------------------------------------------------- /tests/data/mini_mock_reads_for_events.sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mini_mock_reads_for_events.sorted.bam -------------------------------------------------------------------------------- /tests/data/mock_reads_for_events.sorted.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_reads_for_events.sorted.bam.bai -------------------------------------------------------------------------------- /tests/data/mock_trans_reads_for_events.sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_trans_reads_for_events.sorted.bam -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include src *.py *.json 2 | include src/mavis/py.typed 3 | include README.md 4 | include LICENSE 5 | prune docs 6 | prune tests 7 | 
-------------------------------------------------------------------------------- /tests/data/mini_mock_reads_for_events.sorted.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mini_mock_reads_for_events.sorted.bam.bai -------------------------------------------------------------------------------- /tests/data/mock_trans_reads_for_events.sorted.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/tests/data/mock_trans_reads_for_events.sorted.bam.bai -------------------------------------------------------------------------------- /docs/images/colo829_tumour_annotation_resource_req.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/colo829_tumour_annotation_resource_req.png -------------------------------------------------------------------------------- /docs/images/colo829_tumour_validation_resource_req.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/mavis/HEAD/docs/images/colo829_tumour_validation_resource_req.png -------------------------------------------------------------------------------- /docs/outputs/index.md: -------------------------------------------------------------------------------- 1 | # Tab Delimited Files 2 | 3 | Column names of the output files are documented in the [column names](../../outputs/columns) 4 | section 5 | -------------------------------------------------------------------------------- /src/mavis/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | holds submodules related to structural variants 3 | """ 4 | import pkg_resources 5 | 6 | __version__ = pkg_resources.require('mavis')[0].version 7 | 
-------------------------------------------------------------------------------- /docs/images/get_app-24px.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | parallel = True 3 | concurrency = multiprocessing 4 | 5 | [html] 6 | directory = coverage 7 | title = mavis coverage report 8 | 9 | [report] 10 | exclude_lines = 11 | pragma: no cover 12 | if TYPE_CHECKING: 13 | -------------------------------------------------------------------------------- /docs/images/Fusion-ext.gpl: -------------------------------------------------------------------------------- 1 | GIMP Palette 2 | Name: Fusions-ext 3 | # 4 | 0 0 0 5 | 255 255 255 6 | 199 217 143 7 | 82 103 43 8 | 133 152 97 9 | 42 67 36 10 | 184 211 186 11 | 76 150 119 12 | 123 221 193 13 | 50 85 86 14 | 125 195 216 15 | 101 126 145 16 | 81 141 197 17 | 38 40 61 18 | 186 178 226 19 | 58 52 105 20 | 124 111 170 21 | -------------------------------------------------------------------------------- /tests/data/mock_dgv_annotation.tab: -------------------------------------------------------------------------------- 1 | chr start end name 2 | 1 1 2300000 nsv482937 3 | 1 10001 22118 dgv1n82 4 | 1 10001 22120 rgv2n98 5 | 1 10001 22221 rgv2n99 6 | 1 10001 127330 nsv7879 7 | 1 10191 10281 nsv958854 8 | 1 10377 177417 nsv428112 9 | 1 10377 1018704 esv2758911 10 | 1 10499 177368 esv27265 11 | 1 11099 47000 nsv1147468 12 | 1 11100 29200 dgv1n106 13 | -------------------------------------------------------------------------------- /tests/test_mavis/annotate/test_annotate_fileio2.py: -------------------------------------------------------------------------------- 1 | from mavis.annotate.file_io import load_annotations 2 | 3 | from ...util import get_data 4 | 5 | JSON = 
get_data('annotations_subsample.json') 6 | 7 | 8 | class TestAnnotationLoading: 9 | def test_load_json(self): 10 | result = load_annotations(JSON) 11 | assert len(result.keys()) == 12 12 | -------------------------------------------------------------------------------- /tests/setup_subprocess_cov.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | for p in sys.path: 5 | if p.endswith('site-packages'): 6 | pth_file = os.path.join(p, 'subprocess-coverage.pth') 7 | print('writing path file:', pth_file) 8 | with open(pth_file, 'w') as fh: 9 | fh.write('import coverage\n\ncoverage.process_startup()\n') 10 | break 11 | -------------------------------------------------------------------------------- /src/mavis/types.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper classes for type hints 3 | """ 4 | 5 | from typing import TYPE_CHECKING, Dict, List, Tuple 6 | 7 | from Bio.SeqRecord import SeqRecord 8 | 9 | if TYPE_CHECKING: 10 | from .annotate.genomic import Gene 11 | 12 | ReferenceGenome = Dict[str, SeqRecord] 13 | ReferenceAnnotations = Dict[str, List['Gene']] 14 | CigarTuples = List[Tuple[int, int]] 15 | -------------------------------------------------------------------------------- /tests/data/reference_from_env.cfg: -------------------------------------------------------------------------------- 1 | [reference] 2 | 3 | [mock-A36971] 4 | read_length = 150 5 | median_fragment_size = 400 6 | stdev_fragment_size = 97 7 | bam_file = tests/data/mock_reads_for_events.sorted.bam 8 | protocol = genome 9 | inputs = tests/data/mock_sv_events.tsv 10 | strand_specific = False 11 | disease_status=diseased 12 | 13 | [cluster] 14 | uninformative_filter = True 15 | limit_to_chr = None 16 | -------------------------------------------------------------------------------- /src/mavis/error.py: 
-------------------------------------------------------------------------------- 1 | class NotSpecifiedError(Exception): 2 | """ 3 | raised when information is required for a function but has not been given 4 | 5 | for example if strand was required but had been set to STRAND.NS then this 6 | error would be raised 7 | """ 8 | 9 | pass 10 | 11 | 12 | class DrawingFitError(Exception): 13 | pass 14 | 15 | 16 | class InvalidRearrangement(Exception): 17 | pass 18 | -------------------------------------------------------------------------------- /tests/data/mini_mock_sv_events.svmerge.tsv: -------------------------------------------------------------------------------- 1 | #start_chromosome start_position end_chromosome end_position start_orientation end_orientation start_strand end_strand protocol tool_version libraries tool_evidence comments filters flanking_reads mapping_quality split_reads 2 | reference3 1114-1114 reference3 2187-2187 R R + - genome convert_ta.py_v0.0.1 A36971 3 | reference10 519-519 reference19 965-965 R L + + genome convert_ta.py_v0.0.1 A36971 4 | -------------------------------------------------------------------------------- /docs/package/mavis/summary/index.md: -------------------------------------------------------------------------------- 1 | # Sub-package Documentation 2 | 3 | This is the package responsible for summarizing the calls between libraries. In many cases 4 | this will be where somatic vs germline is determined or genomic only vs expressed. 5 | 6 | ## Output Files 7 | 8 | | expected name/suffix | file type/format | content | 9 | | ----------------------- | ---------------- | ------- | 10 | | ``mavis_summary_*.tab`` | text/tabbed | ? 
| 11 | -------------------------------------------------------------------------------- /tests/data/pairing_reference_annotations_file.tab: -------------------------------------------------------------------------------- 1 | ## input file used to map hugo gene names: compiled_gene_drug_pathway.v1_2_5.tsv 2 | ## input file for picking best transcript: ens69_best_transcript.txt 3 | ## Ensembl Api version 69 4 | ## generated at: Thu Aug 4 16:38:01 2016 5 | ensembl_gene_id hugo_names chr strand gene_start gene_end best_ensembl_transcript_id ensembl_transcript_id refseq_equivalents transcript_genomic_start transcript_genomic_end cdna_coding_start cdna_coding_end genomic_exon_ranges AA_domain_ranges 6 | -------------------------------------------------------------------------------- /src/mavis/summary/constants.py: -------------------------------------------------------------------------------- 1 | from ..constants import MavisNamespace 2 | 3 | HOMOPOLYMER_MIN_LENGTH = 3 4 | 5 | 6 | class PAIRING_STATE(MavisNamespace): 7 | EXP = 'expressed' 8 | NO_EXP = 'not expressed' 9 | SOMATIC = 'somatic' 10 | GERMLINE = 'germline' 11 | CO_EXP = 'co-expressed' 12 | GERMLINE_EXP = 'germline expression' 13 | SOMATIC_EXP = 'somatic expression' 14 | MATCH = 'matched' 15 | NO_MATCH = 'not matched' 16 | GENOMIC = 'genomic support' 17 | NO_GENOMIC = 'no genomic support' 18 | -------------------------------------------------------------------------------- /src/mavis/pairing/constants.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from mavis_config import DEFAULTS 4 | 5 | from ..constants import CALL_METHOD 6 | 7 | PAIRING_DISTANCES: Dict[str, int] = { 8 | CALL_METHOD.FLANK: DEFAULTS['pairing.flanking_call_distance'], 9 | CALL_METHOD.SPAN: DEFAULTS['pairing.spanning_call_distance'], 10 | CALL_METHOD.SPLIT: DEFAULTS['pairing.split_call_distance'], 11 | CALL_METHOD.CONTIG: DEFAULTS['pairing.contig_call_distance'], 12 | 
CALL_METHOD.INPUT: DEFAULTS['pairing.input_call_distance'], 13 | } 14 | -------------------------------------------------------------------------------- /tests/data/clustering_input.tab: -------------------------------------------------------------------------------- 1 | tracking_id event_type break1_chromosome break1_position_start break1_position_end break1_orientation break1_strand break1_seq break2_chromosome break2_position_start break2_position_end break2_orientation break2_strand break2_seq opposing_strands stranded tools protocol 2 | manta-MantaDEL:175574:0:0:0:0:0 deletion 15 67333523 67333619 L ? None 15 67333581 67333581 R ? None False False manta genome 3 | strelka-TyeSomZhWTRakEu6ZJ7up6 deletion 15 67333623 67333623 L ? None 15 67333625 67333625 R ? None False False strelka genome 4 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation with MkDocs 9 | mkdocs: 10 | configuration: mkdocs.yml 11 | fail_on_warning: false 12 | 13 | # Optionally build your docs in additional formats such as PDF and ePub 14 | formats: all 15 | 16 | # Optionally set the version of Python and requirements required to build your docs 17 | python: 18 | version: 3.7 19 | install: 20 | - method: pip 21 | path: . 
22 | extra_requirements: 23 | - doc 24 | -------------------------------------------------------------------------------- /tests/data/bad_input_file.cfg: -------------------------------------------------------------------------------- 1 | [reference] 2 | template_metadata = tests/data/cytoBand.txt 3 | annotations = tests/data/mock_annotations.json 4 | masking = tests/data/mock_masking.tab 5 | reference_genome = tests/data/mock_reference_genome.fa 6 | aligner_reference = tests/data/mock_reference_genome.2bit 7 | dgv_annotation = tests/data/mock_dgv_annotation.txt 8 | 9 | [cluster] 10 | uninformative_filter = True 11 | limit_to_chr = None 12 | 13 | [mock-A36971] 14 | read_length = 150 15 | median_fragment_size = 400 16 | stdev_fragment_size = 97 17 | bam_file = tests/data/mock_reads_for_events.sorted.bam 18 | protocol = genome 19 | inputs = mock_converted.tab 20 | strand_specific = False 21 | disease_status=diseased 22 | 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # python generated files 2 | /.eggs 3 | /coverage 4 | /venv* 5 | /.coverage 6 | *.pyc 7 | *__pycache__ 8 | build-docs 9 | *.egg-info* 10 | build 11 | *coverage* 12 | dist 13 | junit 14 | .pytest* 15 | .tox 16 | *eggs/ 17 | .mypy_cache 18 | .snakemake 19 | .venv* 20 | 21 | # aligners 22 | blat 23 | bwa 24 | *.fai 25 | 26 | # user editing generated files 27 | *.~lock* 28 | .vscode 29 | *.nfs* 30 | junit 31 | 32 | # generated documentation 33 | /docs/package/mavis/*.md 34 | /docs/package/mavis/*/*.md 35 | # don't ignore subpackage summary files 36 | !/docs/package/mavis/*/index.md 37 | docs/configuration/settings.md 38 | 39 | .snakemake 40 | output_dir* 41 | bin 42 | dag* 43 | tutorial_data 44 | reference_inputs 45 | tmp 46 | -------------------------------------------------------------------------------- /tests/data/straglr.bed: -------------------------------------------------------------------------------- 1 | #chrom start end repeat_unit allele1:size allele1:copy_number allele1:support allele2:size allele2:copy_number allele2:support 2 | chr11 776686 778078 CT 100.0 150.0 10 100.0 100.0 1 3 | chr10 3079216 3079421 AGAGGTCACCACCCCTTCCCAACAATCCAGTAACAATCC 100.0 150.0 10 100.0 100.0 1 4 | chr9 2080637 2081030 CTCCTTCCCTCCGCCCCCACCTCGGTCCCTGT 100.0 150.0 10 100.0 100.0 1 5 | chrX 244719 245293 CCCCGGGAACCGCCT 100.0 150.0 10 - - - 6 | chr7 284096 284233 GGT 100.0 150.0 10 - - - 7 | chr8 288173 290242 CCCTGCTCCGT 100.0 150.0 10 100.0 100.0 1 8 | chr3 2382228 2382908 CCGTGGGGGAGGCTGAGGCTATGGGGACT 100.0 100.0 10 - - - 9 | chr2 2427285 2427528 CCTCC 100.0 150.0 10 - - - 10 | chr2 2427953 2428216 GGAGG 100.0 150.0 10 100.0 100.0 1 11 | -------------------------------------------------------------------------------- /mkdocs.yml: 
-------------------------------------------------------------------------------- 1 | site_name: MAVIS 2 | 3 | theme: 4 | name: material 5 | repo_url: https://github.com/bcgsc/mavis 6 | repo_name: github 7 | site_dir: build-docs 8 | markdown_extensions: 9 | - codehilite 10 | - admonition 11 | - pymdownx.inlinehilite 12 | - markdown_include.include: 13 | base_path: docs 14 | extra_css: [extra.css] 15 | nav: 16 | - index.md 17 | - install.md 18 | - migrating.md 19 | - ... | background/**.md 20 | - ... | inputs/**.md 21 | - ... | outputs/**.md 22 | - ... | configuration/**.md 23 | - ... | tutorials/**.md 24 | - development.md 25 | - ... 26 | - glossary.md 27 | 28 | plugins: 29 | - search 30 | - awesome-pages 31 | - mkdocs-simple-hooks: 32 | hooks: 33 | on_pre_build: "docs.hooks:build_package_docs" 34 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | from setuptools import setup 5 | 6 | 7 | def check_nonpython_dependencies(): 8 | """ 9 | check that the non-python dependencies have been installed. 10 | 11 | Raises: 12 | OSError: A dependency is not installed 13 | """ 14 | import shutil 15 | 16 | aligner = ( 17 | os.environ['MAVIS_ALIGNER'] 18 | if 'MAVIS_ALIGNER' in os.environ and os.environ['MAVIS_ALIGNER'] 19 | else 'blat' 20 | ) 21 | aligner = re.split(r'\s+', aligner)[0] 22 | pth = shutil.which(aligner) 23 | if not pth: 24 | print('WARNING: Aligner is required. 
Missing executable: {}'.format(aligner)) 25 | else: 26 | print('Found: aligner at', pth) 27 | 28 | 29 | setup() 30 | check_nonpython_dependencies() 31 | -------------------------------------------------------------------------------- /tests/data/cnvnator.tab: -------------------------------------------------------------------------------- 1 | deletion 1:1-10000 10000 0 1.59373e-11 0 1.99216e-11 0 -1 2 | deletion 1:38001-39000 1000 0.467116 544.034 0.0442397 1 1 1 3 | deletion 1:51201-74200 23000 0.648113 6.92924e-12 2.52664e+09 7.58917e-12 2.55487e+09 1 4 | deletion 1:74601-94200 19600 0.254531 8.13125e-12 2.52848e-32 9.05526e-12 5.01031e-78 1 5 | deletion 1:106001-106800 800 0.270927 4415.44 4.32994e-06 1 1 1 6 | duplication 1:107401-111200 3800 1.67572 0.00897513 2.75843e+07 20.687 5.49288e-07 1 7 | duplication 1:137201-139600 2400 1.54927 0.00127366 8.4566e-14 182635 16668.9 1 8 | deletion 1:149801-150800 1000 0.504485 79.6041 0.00136224 1 1 1 9 | deletion 1:151201-155800 4600 0.582473 0.00108651 9.95448e+06 35.7819 1.1684e+08 1 10 | deletion 1:176201-228000 51800 0.0193339 3.07669e-12 1.12835e-37 3.20025e-12 0 1 11 | -------------------------------------------------------------------------------- /tests/test_mavis/annotate/test_annotate_fileio.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from mavis.annotate.file_io import load_annotations 6 | 7 | 8 | @pytest.mark.parametrize( 9 | 'annotations,error_message_include', 10 | [ 11 | [{'genes': []}, "schema['properties']['genes']"], 12 | [ 13 | {'genes': [{'start': '1'}]}, 14 | "schema['properties']['genes']['items']['properties']['start']", 15 | ], 16 | ], 17 | ) 18 | def test_min_genes_error(annotations, error_message_include, tmp_path): 19 | filename = tmp_path / "annotations.json" 20 | filename.write_text(json.dumps(annotations)) 21 | with pytest.raises(AssertionError) as exc: 22 | load_annotations(str(filename)) 23 | assert 
error_message_include in str(exc.value) 24 | -------------------------------------------------------------------------------- /src/mavis/annotate/constants.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from ..constants import MavisNamespace 4 | 5 | PASS_FILENAME = 'annotations.tab' 6 | 7 | 8 | class SPLICE_SITE_TYPE(MavisNamespace): 9 | DONOR: int = 3 10 | ACCEPTOR: int = 5 11 | 12 | 13 | SPLICE_SITE_RADIUS = 2 14 | """int: number of bases away from an exon boundary considered to be part of the splice site such that if it were altered 15 | the splice site would be considered to be abrogated. 16 | """ 17 | 18 | # splice site sequences based on: http://www.nature.com/nrg/journal/v17/n7/fig_tab/nrg.2016.46_F5.html?foxtrotcallback=true 19 | 20 | DONOR_SEQ = [ 21 | re.compile('(AG)(GT[AG]AG)'), 22 | re.compile('([CA]AG)(GTA)'), 23 | ] 24 | 25 | ACCEPTOR_SEQ = [ 26 | re.compile('([TC]{8}[ATCG]CAG)([GA][ATCG])'), 27 | re.compile('([TC]{9}TAG)([GA][ATCG])'), 28 | re.compile('([TC]{8}[ATCG]AAG)([GA][ATCG])'), 29 | ] 30 | -------------------------------------------------------------------------------- /tests/data/mock_trans_sv_events.tsv: -------------------------------------------------------------------------------- 1 | ## False reference9 2000 2000 reference9 2001 2001 L R + + insertion genome convert_ta.py_v0.0.1 mock-A36971 9:66466004 2 | stranded break1_chromosome break1_position_start break1_position_end break2_chromosome break2_position_start break2_position_end break1_orientation break2_orientation break1_strand break2_strand event_type protocol tools library comment 3 | False gene3 27175 27175 gene3 27176 27176 R L + + duplication transcriptome convert_ta.py_v0.0.1 mock-A47933 1:207249992 4 | True gene1 34090 34090 gene5 608 608 R R - + inverted translocation transcriptome convert_ta.py_v0.0.1 mock-A47933 15:40854971|7:26241389 5 | False gene2 22979 22979 gene2 23783 23783 R L + + duplication 
transcriptome convert_ta.py_v0.0.1 mock-A47933 15:41623873|15:41625248#this one is pretty low qual 6 | False gene6 70057 77430 gene6 89472 94742 L R + + deletion transcriptome convert_ta.py_v0.0.1 mock-A47933 approx 10:89700299|10:89712341 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. run command '...' 16 | 2. See error ... 17 | 18 | **Expected behavior** 19 | A clear and concise description of what you expected to happen. 20 | 21 | **Input Data** 22 | If applicable, add the input data used when the bug was observed 23 | 24 | **Configuration** 25 | If applicable, include the mavis configuration file that was used to run the pipeline 26 | 27 | **Versions (please complete the following information):** 28 | - OS: [e.g. centos-07] 29 | - Python Version [e.g. 3.6.1] 30 | - MAVIS Version [e.g. 22] 31 | - Blat/BWA Version 32 | 33 | **Additional context** 34 | Add any other context about the problem here. 
35 | -------------------------------------------------------------------------------- /env/example.sh: -------------------------------------------------------------------------------- 1 | export MAVIS_TEMPLATE_METADATA='/projects/trans_scratch/software/mavis/reference_files/hg19_cytoBand.txt' 2 | export MAVIS_REFERENCE_GENOME='/projects/seqref/genomes/Homo_sapiens/GRCh37/1000genomes/bwa_ind/genome/GRCh37-lite.fa' 3 | export MAVIS_ANNOTATIONS='/projects/trans_scratch/software/mavis/reference_files/ensembl69_hg19_annotations.json' 4 | export MAVIS_MASKING='/projects/tumour_char/analysis_scripts/SVIA/delly/reference_data/GRCh37/human_nspan.hg19.excl.with_header.tsv' 5 | export MAVIS_ALIGNER_REFERENCE='/home/pubseq/genomes/Homo_sapiens/GRCh37/blat/hg19.2bit' 6 | export MAVIS_DGV_ANNOTATION='/projects/trans_scratch/software/mavis/reference_files/dgv_hg19_annotations.tab' 7 | export MAVIS_MAX_FILES=100 8 | export MAVIS_MIN_CLUSTERS_PER_FILE=30 9 | export PYTHONUNBUFFERED='True' 10 | 11 | #Add paths for samtools, blat and git 12 | export PATH=/projects/trans_scratch/transabyss/trans-ABySS/v1.4.10/bin/:/gsc/software/linux-x86_64-centos6/git-2.12.0/bin/:$PATH 13 | -------------------------------------------------------------------------------- /tests/data/mock_pairing_input.tab: -------------------------------------------------------------------------------- 1 | library cluster_id validation_id annotation_id event_type transcript1 transcript2 fusion_cdna_coding_start fusion_cdna_coding_end fusion_sequence_fasta_id fusion_sequence_fasta_file break1_chromosome break1_position_start break1_position_end break1_orientation break1_strand break2_chromosome break2_position_start break2_position_end break2_orientation break2_strand opposing_strands stranded protocol break1_call_method break2_call_method untemplated_seq fusion_splicing_pattern 2 | genome1 1 1 1 deletion ENST00000367080 ENST00000367080 None None None None gene3 10008 10008 L + gene3 18900 18900 R + False True genome 
split reads split reads None None 3 | genome2 1 1 1 deletion ENST00000367080 ENST00000367080 None None None None gene3 10000 10000 L + gene3 18900 18900 R + False True genome split reads split reads None None 4 | transcriptome1 1 1 1 deletion ENST00000367080 ENST00000367080 None None None None gene3 5347 5347 L + gene3 19969 19969 R + False True transcriptome split reads split reads None None 5 | -------------------------------------------------------------------------------- /tests/test_mavis/cluster/test_cluster.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | 5 | from mavis.cluster.cluster import merge_integer_intervals 6 | from mavis.interval import Interval 7 | 8 | 9 | class TestMergeIntegerIntervals: 10 | def test_varying_lengths(self): 11 | m = merge_integer_intervals((1, 2), (1, 9), (2, 10), weight_adjustment=0) 12 | assert m == Interval(1, 4) 13 | 14 | def test_same_length(self): 15 | m = merge_integer_intervals((1, 1), (10, 10)) 16 | assert m == Interval(6) 17 | 18 | def test_empty_list_error(self): 19 | with pytest.raises(AttributeError): 20 | merge_integer_intervals() 21 | 22 | def test_identical_even_length(self): 23 | m = merge_integer_intervals((1, 2), (1, 2), (1, 2)) 24 | assert m == Interval(1, 2) 25 | 26 | def test_identical_odd_length(self): 27 | m = merge_integer_intervals((1, 3), (1, 3), (1, 3)) 28 | assert m == Interval(1, 3) 29 | 30 | 31 | if __name__ == '__main__': 32 | unittest.main() 33 | -------------------------------------------------------------------------------- /src/tools/get_hg38_reference_files.sh: -------------------------------------------------------------------------------- 1 | set -euo pipefail 2 | 3 | echo "downloading the reference genome (no alt) file" 4 | wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz 5 | gunzip 
GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz 6 | 7 | echo "downloading the gene annotations file" 8 | wget http://www.bcgsc.ca/downloads/mavis/v3/ensembl79_hg38_annotations.v3.json.gz 9 | gunzip ensembl79_hg38_annotations.v3.json.gz 10 | 11 | echo "downloading the masking file" 12 | wget http://www.bcgsc.ca/downloads/mavis/GRCh38_masking.tab 13 | 14 | echo "downloading the dgv annotation file" 15 | wget http://www.bcgsc.ca/downloads/mavis/dgv_hg38_variants.tab 16 | 17 | echo "downloading the aligner reference file" 18 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit 19 | 20 | echo "downloading the template metadata file" 21 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz 22 | gunzip cytoBand.txt.gz 23 | -------------------------------------------------------------------------------- /tests/data/mock_reference_genome.fa.ann: -------------------------------------------------------------------------------- 1 | 1054073 24 11 2 | 0 fake (null) 3 | 0 7450 0 4 | 0 reference2 (null) 5 | 7450 13648 0 6 | 0 reference4 (null) 7 | 21098 4000 0 8 | 0 reference3 (null) 9 | 25098 3711 0 10 | 0 reference7 (null) 11 | 28809 21000 0 12 | 0 reference10 (null) 13 | 49809 45109 0 14 | 0 reference19 (null) 15 | 94918 11786 0 16 | 0 reference20 (null) 17 | 106704 8000 0 18 | 0 referenceX (null) 19 | 114704 15760 0 20 | 0 reference11 (null) 21 | 130464 12000 0 22 | 0 reference12 (null) 23 | 142464 12000 0 24 | 0 reference1 (null) 25 | 154464 4000 0 26 | 0 reference9 (null) 27 | 158464 4000 0 28 | 0 reference16 (null) 29 | 162464 4000 0 30 | 0 reference17 (null) 31 | 166464 4000 0 32 | 0 gene1 (null) 33 | 170464 36375 0 34 | 0 gene2 (null) 35 | 206839 71783 0 36 | 0 gene3 (null) 37 | 278622 31569 0 38 | 0 gene4 (null) 39 | 310191 579898 0 40 | 0 gene5 (null) 41 | 890089 12195 0 42 | 0 gene6 (null) 43 | 902284 108818 0 44 | 0 fakereference9 (null) 45 | 1011102 14148 0 46 | 0 test_bam_long_ref 47 | 1025250 28322 0 48 | 0 
11_86018001-86018500 49 | 1053572 501 0 50 | -------------------------------------------------------------------------------- /tests/test_tools/test_convert_dgv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from unittest.mock import patch 4 | 5 | import pytest 6 | 7 | from tools.convert_dgv import main as convert_dgv_main 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "filename,expected_file", 12 | [ 13 | ["dgv_test.tab", "dgv_test_expected.tab"], 14 | ], 15 | ) 16 | def test_dgv_examples(tmp_path, filename, expected_file): 17 | data_dir = os.path.join(os.path.dirname(__file__), "data") 18 | 19 | output_path = str(tmp_path / "tmp_data.tab") 20 | args = [ 21 | "python", 22 | "--input", 23 | os.path.join(data_dir, filename), 24 | "--output", 25 | output_path, 26 | ] 27 | 28 | with patch.object(convert_dgv_main, "main", create=True): 29 | with patch.object(sys, "argv", args): 30 | convert_dgv_main() 31 | 32 | with open(os.path.join(data_dir, expected_file), 'r') as fh: 33 | expected = fh.read().strip() 34 | 35 | with open(output_path, 'r') as fh: 36 | observed = fh.read().strip() 37 | 38 | assert expected == observed 39 | -------------------------------------------------------------------------------- /env/generate_ensembl79_annotations.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # add the ensembl api modules to the path 4 | PATH=$(pwd):$PATH 5 | PERL5LIB=${PERL5LIB}:/home/creisle/applications/ensembl_79/bioperl-live 6 | PERL5LIB=${PERL5LIB}:/home/creisle/applications/ensembl_79/ensembl/modules 7 | PERL5LIB=${PERL5LIB}:/home/creisle/applications/ensembl_79/ensembl-compara/modules 8 | PERL5LIB=${PERL5LIB}:/home/creisle/applications/ensembl_79/ensembl-variation/modules 9 | PERL5LIB=${PERL5LIB}:/home/creisle/applications/ensembl_79/ensembl-funcgen/modules 10 | export PERL5LIB 11 | 12 | # default perl 13 | 
PATH=/projects/trans_scratch/software/perl/perl-5.20.3/bin:$PATH 14 | 15 | # required data files 16 | export HUGO_ENSEMBL_MAPPING=/projects/tumour_char/analysis_scripts/databases/processed_files/drug_target_tables/current_gene_drug_pathway.hg38.tsv 17 | export BEST_TRANSCRIPTS=/home/creisle/svn/ensembl_flatfiles/ens69_best_transcript.txt 18 | 19 | # connection information for the ensembl local server 20 | export ENSEMBL_HOST='ensembl02' 21 | export ENSEMBL_PASS='ensembl' 22 | export ENSEMBL_USER='ensembl' 23 | export ENSEMBL_PORT=3306 24 | -------------------------------------------------------------------------------- /src/mavis/convert/starfusion.py: -------------------------------------------------------------------------------- 1 | from ..constants import ORIENT 2 | 3 | 4 | def convert_row(row): 5 | """ 6 | transforms the starfusion output into the common format for expansion. Maps the input column 7 | names to column names that MAVIS can read 8 | """ 9 | std_row = {} 10 | try: 11 | std_row['break1_chromosome'], b1_start, std_row['break1_strand'] = row[ 12 | 'LeftBreakpoint' 13 | ].split(':') 14 | std_row['break2_chromosome'], b2_start, std_row['break2_strand'] = row[ 15 | 'RightBreakpoint' 16 | ].split(':') 17 | except (ValueError, TypeError): 18 | raise AssertionError( 19 | 'Could not parse the breakpoint from the starfusion row: {}, {}'.format( 20 | row['LeftBreakpoint'], row['RightBreakpoint'] 21 | ) 22 | ) 23 | std_row['break1_position_start'] = std_row['break1_position_end'] = b1_start 24 | std_row['break2_position_start'] = std_row['break2_position_end'] = b2_start 25 | 26 | std_row['break1_orientation'] = std_row['break2_orientation'] = ORIENT.NS 27 | 28 | return std_row 29 | -------------------------------------------------------------------------------- /tests/test_mavis/illustrate/test_illustrate.py: -------------------------------------------------------------------------------- 1 | from mavis.illustrate.util import generate_interval_mapping 2 | 
from mavis.interval import Interval 3 | 4 | 5 | class TestGenerateIntervalMapping: 6 | def test_single_bp_window(self): 7 | regions = [ 8 | Interval(4222347, 4222347), 9 | Interval(4221673, 4221903), 10 | Interval(2792992, 4852494), 11 | ] 12 | target = 911.9921875 13 | ratio = 5 14 | min_width = 60 15 | buffer_ = None 16 | start = 2791992 17 | end = 4853494 18 | min_inter = 10 19 | mapping = generate_interval_mapping( 20 | regions, target, ratio, min_width, buffer_, start, end, min_inter 21 | ) 22 | assert len(mapping.keys()) == 7 23 | 24 | def test_no_input_intervals(self): 25 | target = 911.9921875 26 | ratio = 5 27 | min_width = 60 28 | buffer_ = None 29 | start = 2791992 30 | end = 4853494 31 | min_inter = 10 32 | mapping = generate_interval_mapping( 33 | [], target, ratio, min_width, buffer_, start, end, min_inter 34 | ) 35 | assert len(mapping.keys()) == 1 36 | -------------------------------------------------------------------------------- /src/tools/get_hg19_reference_files.sh: -------------------------------------------------------------------------------- 1 | set -euo pipefail 2 | 3 | echo "downloading the reference genome file" 4 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz 5 | tar -xvzf chromFa.tar.gz 6 | 7 | # concatenate the chromosome fa files into a single file 8 | for fname in chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y}.fa 9 | do 10 | cat $fname >> hg19.fa 11 | done 12 | 13 | # Clean up the non concatenated and alt chromosome files 14 | rm -f chr*.fa 15 | rm -f chromFa.tar.gz 16 | 17 | echo "downloading the gene annotations file" 18 | wget http://www.bcgsc.ca/downloads/mavis/v3/ensembl69_hg19_annotations.v3.json.gz 19 | gunzip ensembl69_hg19_annotations.v3.json.gz 20 | 21 | echo "downloading the masking file" 22 | wget http://www.bcgsc.ca/downloads/mavis/hg19_masking.tab 23 | 24 | echo "downloading the dgv annotation file" 25 | wget http://www.bcgsc.ca/downloads/mavis/dgv_hg19_variants.tab 26
| 27 | echo "downloading the aligner reference file" 28 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.2bit 29 | 30 | echo "downloading the template metadata file" 31 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz 32 | gunzip cytoBand.txt.gz 33 | -------------------------------------------------------------------------------- /src/mavis/convert/straglr.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from ..constants import COLUMNS, SVTYPE 4 | 5 | 6 | def convert_row(row: Dict) -> Dict: 7 | """ 8 | Converts the fields from the original STRAGLR BED output into MAVIS definitions of an SV 9 | Since STRAGLR defines regions where short tandem repeats exist we make the definitions here fairly 10 | non-specific 11 | 12 | See their github page for more details: https://github.com/bcgsc/straglr 13 | 14 | BED Columns 15 | - chrom: chromosome name 16 | - start: start coordinate of locus 17 | - end: end coordinate of locus 18 | - repeat_unit: repeat motif 19 | - allele.size: where N={1,2,3...} depending on --max_num_clusters e.g. 
N={1,2} if --max_num_clusters==2 (default) 20 | - allele.copy_number 21 | - allele.support 22 | """ 23 | return { 24 | COLUMNS.break1_chromosome: row['chrom'], 25 | COLUMNS.break2_chromosome: row['chrom'], 26 | COLUMNS.break1_position_start: row['start'], 27 | COLUMNS.break1_position_end: row['end'], 28 | COLUMNS.break2_position_start: row['start'], 29 | COLUMNS.break2_position_end: row['end'], 30 | COLUMNS.untemplated_seq: None, 31 | COLUMNS.event_type: SVTYPE.INS, 32 | } 33 | -------------------------------------------------------------------------------- /docs/configuration/pipeline.md: -------------------------------------------------------------------------------- 1 | # Running the Pipeline 2 | 3 | ## Running MAVIS using a Job Scheduler 4 | 5 | MAVIS v3 uses [snakemake](https://snakemake.readthedocs.io/en/stable/) to handle job scheduling 6 | and setup 7 | 8 | The MAVIS pipeline is highly configurable. Some pipeline steps 9 | (cluster, validate) are optional and can be automatically skipped. The 10 | standard pipeline is 11 | far-left. 12 | 13 | The most common use case is running the pipeline through snakemake 14 | 15 | ```bash 16 | snakemake -j <MAX JOBS> --configfile <YOUR CONFIG> -s Snakefile 17 | ``` 18 | 19 | If you are submitting to a cluster, use the [snakemake profiles](https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles) 20 | 21 | ```bash 22 | snakemake -j <MAX JOBS> --configfile <YOUR CONFIG> --profile <YOUR PROFILE NAME> -s Snakefile 23 | ``` 24 | 25 | This will submit a series of jobs with dependencies. 26 | 27 | To use the mavis docker container through singularity, instead of installing mavis via pip, add the 28 | [`--use-singularity`](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#running-jobs-in-containers) 29 | flag.
30 | 31 | ```bash 32 | snakemake -j <MAX JOBS> --configfile <YOUR CONFIG> --profile <YOUR PROFILE NAME> --use-singularity -s Snakefile 33 | ``` 34 | -------------------------------------------------------------------------------- /docs/package/mavis/pairing/index.md: -------------------------------------------------------------------------------- 1 | # Sub-package Documentation 2 | 3 | This is the package responsible for pairing/grouping calls between libraries. In many cases 4 | this will be where somatic vs germline is determined or genomic only vs expressed. 5 | 6 | ## Output Files 7 | 8 | | expected name/suffix | file type/format | content | 9 | | ---------------------- | ---------------- | --------------------------------------------------------- | 10 | | ``mavis_paired_*.tab`` | text/tabbed | call information and pairing information using product id | 11 | 12 | 13 | ## Algorithm Overview 14 | 15 | - pairwise comparison of breakpoint pairs between libraries 16 | 17 | - fail if orientations do not match 18 | - fail if template/chromosomes do not match 19 | - if the protocols are mixed 20 | 21 | - pass if the fusion products match at the sequence level 22 | - pass if the breakpoint predicted from the genome matches the transcriptome breakpoint 23 | 24 | - if the protocols are the same 25 | 26 | - pass if the breakpoints are co-located 27 | 28 | - filter matches based on annotations 29 | 30 | - if both breakpoints have the same gene annotation, they must also both have the same transcript annotation 31 | -------------------------------------------------------------------------------- /src/mavis/convert/cnvnator.py: -------------------------------------------------------------------------------- 1 | """ 2 | from cnvnator: https://github.com/abyzovlab/CNVnator 3 | 4 | CNV_type coordinates CNV_size normalized_RD e-val1 e-val2 e-val3 e-val4 q0 5 | 6 | normalized_RD -- normalized to 1. 7 | e-val1 -- is calculated using t-test statistics.
8 | e-val2 -- is from the probability of RD values within the region to be in 9 | the tails of a gaussian distribution describing frequencies of RD values in bins. 10 | e-val3 -- same as e-val1 but for the middle of CNV 11 | e-val4 -- same as e-val2 but for the middle of CNV 12 | q0 -- fraction of reads mapped with q0 quality 13 | """ 14 | import re 15 | 16 | 17 | def convert_row(row): 18 | """ 19 | 20 | Args: 21 | row (Dict[str]): dict representing the row output from cnvnator 22 | 23 | Returns: 24 | dict: transformed row using mavis standard column names 25 | """ 26 | result = {k: v for k, v in row.items() if k != 'coordinates'} 27 | chrom, start, end = re.split(r'[-:]', row['coordinates']) 28 | result['break1_chromosome'] = result['break2_chromosome'] = chrom 29 | result['break1_position_start'] = result['break1_position_end'] = start 30 | result['break2_position_start'] = result['break2_position_end'] = end 31 | return result 32 | -------------------------------------------------------------------------------- /tests/util.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | import pytest 6 | 7 | DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') 8 | 9 | 10 | long_running_test = pytest.mark.skipif( 11 | os.environ.get('RUN_FULL') != '1', 12 | reason='Only running FAST tests subset', 13 | ) 14 | 15 | bwa_only = pytest.mark.skipif(not shutil.which('bwa'), reason='missing the command') 16 | blat_only = pytest.mark.skipif(not shutil.which('blat'), reason='missing the command') 17 | todo = pytest.mark.skip(reason='TODO') 18 | 19 | 20 | def package_relative_file(*paths): 21 | return os.path.abspath(os.path.join(os.path.dirname(__file__), '..', *paths)) 22 | 23 | 24 | def get_data(*paths): 25 | return os.path.join(DATA_DIR, *paths) 26 | 27 | 28 | def glob_exists(*pos, strict=False, n=1): 29 | globexpr = os.path.join(*pos) 30 | file_list = glob.glob(globexpr) 31 | if strict
and len(file_list) == n: 32 | return file_list[0] if len(file_list) == 1 else file_list 33 | elif not strict and len(file_list) > 0: 34 | return file_list 35 | else: 36 | print(globexpr) 37 | print(file_list) 38 | return False 39 | 40 | 41 | def glob_not_exists(*pos): 42 | globexpr = os.path.join(*pos) 43 | file_list = glob.glob(globexpr) 44 | return not file_list 45 | -------------------------------------------------------------------------------- /tests/data/mock_reference_annotations.json: -------------------------------------------------------------------------------- 1 | { 2 | "genes": [ 3 | { 4 | "chr": "fake", 5 | "start": 1, 6 | "end": 1000, 7 | "strand": "+", 8 | "name": "ENSG0001", 9 | "aliases": [], 10 | "transcripts": [ 11 | { 12 | "is_best_transcript": true, 13 | "name": "ENST001", 14 | "start": 101, 15 | "end": 900, 16 | "exons": [ 17 | {"start": 101, "end": 200}, 18 | {"start": 401, "end": 500}, 19 | {"start": 601, "end": 700}, 20 | {"start": 801, "end": 900} 21 | ], 22 | "domains": [ 23 | { 24 | "name": "PF001", 25 | "desc": "", 26 | "regions": [ 27 | {"start": 1, "end": 10}, 28 | {"start": 50, "end": 63} 29 | ] 30 | } 31 | ], 32 | "cdna_coding_start": 51, 33 | "cdna_coding_end": 350 34 | } 35 | ] 36 | } 37 | ] 38 | } 39 | -------------------------------------------------------------------------------- /docs/inputs/non_python_dependencies.md: -------------------------------------------------------------------------------- 1 | # Non-python Dependencies 2 | 3 | MAVIS integrates with 4 | [SV callers](./sv_callers.md), 5 | [job schedulers](#job-schedulers), and 6 | [aligners](#aligners). While some of 7 | these dependencies are optional, all currently supported options are 8 | detailed below. The versions column in the tables below list all the 9 | versions which were tested for each tool. Each version listed is known 10 | to be compatible with MAVIS. 
11 | 12 | ## Job Schedulers 13 | 14 | MAVIS v3 uses [snakemake](https://snakemake.readthedocs.io/en/stable/) to handle job scheduling 15 | 16 | ## Aligners 17 | 18 | Two aligners are supported [bwa](../../glossary/#bwa) and 19 | [blat](../../glossary/#blat) (default). These are both included in the docker image by default. 20 | 21 | | Name | Version(s) | Environment Setting | 22 | | ---------------------------------------------- | ----------------------- | ------------------------- | 23 | | [blat](../../glossary/#blat) | `36x2` `36` | `MAVIS_ALIGNER=blat` | 24 | | [bwa mem ](../../glossary/#bwa mem ) | `0.7.15-r1140` `0.7.12` | `MAVIS_ALIGNER='bwa mem'` | 25 | 26 | !!! note 27 | When setting the aligner you will also need to set the 28 | [aligner_reference](../../configuration/settings/#aligner_reference) to match 29 | -------------------------------------------------------------------------------- /docs/outputs/illustrations.md: -------------------------------------------------------------------------------- 1 | # Illustrations 2 | 3 | ## Fusion Diagrams 4 | 5 | These are diagrams produced during the annotate step. These represent 6 | the putative fusion events of a single breakpoint pair. 7 | 8 | ![fusion diagram](../images/GIMAP4_IL7_fusion.svg) 9 | 10 | Fusion from transcriptome data. Intronic breakpoints here indicate 11 | retained intron sequence and a novel exon is 12 | predicted. 13 | 14 | If the [draw_fusions_only](../../configuration/settings/#draw_fusions_only) flag is set to 15 | False then all events will produce a diagram, even anti-sense fusions. 16 | 17 | ![disruptive fusion diagram](../images/UBE2V2_GIMAP4_disruptive_fusion.svg) 18 | 19 | Disruptive Anti-sense 20 | Fusion 21 | 22 | ## Transcript Overlays 23 | 24 | MAVIS supports generating diagrams of all transcripts for a given gene. 25 | These can be overlaid with markers and bam\_file pileup data. This is 26 | particularly useful for visualizing splice site mutations.
27 | 28 | ![overlay diagram](../images/ENSG00000139687_RB1_overlay.png) 29 | 30 | RB1 splice site mutation results in skipping of exon 9 31 | 32 | The above diagram was generated using the overlay command 33 | 34 | ```bash 35 | mavis overlay RB1 \ 36 | -o /path/to/output/dir \ 37 | --read_depth_plot rna /path/to/bam/file \ 38 | --marker M1 48939029 \ 39 | --annotations /path/to/mavis/annotations/reference/file 40 | ``` 41 | -------------------------------------------------------------------------------- /src/mavis/convert/breakdancer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pandas as pd 4 | 5 | 6 | def convert_file(input_file): 7 | bam_to_lib = {} 8 | 9 | # read comments 10 | with open(input_file, 'r') as fh: 11 | # comments in breakdancer are marked with a single # so they need to be discarded before reading 12 | lines = fh.readlines() 13 | line_index = 0 14 | while line_index < len(lines) and lines[line_index].startswith('#'): 15 | metadata_match = re.match(r'^#(\S+)\t.*\tlibrary:(\S+)\t.*', lines[line_index]) 16 | if metadata_match: 17 | bam_to_lib[metadata_match.group(1)] = metadata_match.group(2) 18 | line_index += 1 19 | header = [c.strip() for c in re.sub(r'^#', '', lines[line_index - 1]).split('\t')] 20 | # read the main file 21 | df = pd.read_csv( 22 | input_file, 23 | names=header, 24 | sep='\t', 25 | comment='#', 26 | dtype={ 27 | 'num_Reads_lib': str, 28 | 'Pos1': int, 29 | 'Pos2': int, 30 | 'Chr1': str, 31 | 'Chr2': str, 32 | 'Type': str, 33 | }, 34 | ) 35 | if 'num_Reads_lib' not in df: 36 | raise KeyError('missing required column: num_Reads_lib') 37 | 38 | for bam, lib in bam_to_lib.items(): 39 | df['num_Reads_lib'] = df['num_Reads_lib'].str.replace(bam, lib) 40 | return df.to_dict('records') 41 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM 
python:3.7-slim-buster 2 | 3 | WORKDIR /app 4 | 5 | RUN apt-get update && \ 6 | apt-get upgrade -y && \ 7 | apt-get install -y git wget make gcc libz-dev 8 | 9 | # pysam dependencies 10 | RUN apt-get install -y libncurses5-dev zlib1g-dev libbz2-dev libncursesw5-dev liblzma-dev 11 | 12 | # install BWA 13 | RUN git clone https://github.com/lh3/bwa.git && \ 14 | cd bwa && \ 15 | git checkout v0.7.17 && \ 16 | make && \ 17 | cd .. && \ 18 | mv bwa/bwa /usr/local/bin 19 | 20 | # install minimap2 21 | RUN git clone https://github.com/lh3/minimap2.git && \ 22 | cd minimap2 && \ 23 | git checkout v2.24 && \ 24 | make && \ 25 | cd .. && \ 26 | mv minimap2/minimap2 /usr/local/bin 27 | 28 | # install blat dependencies 29 | RUN apt-get install -y libcurl4 30 | 31 | # install blat 32 | RUN wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat && \ 33 | chmod a+x blat && \ 34 | mv blat /usr/local/bin 35 | 36 | # install wtdbg2 37 | RUN git clone https://github.com/ruanjue/wtdbg2.git && \ 38 | cd wtdbg2 && \ 39 | make && \ 40 | cd .. && \ 41 | mv wtdbg2/wtdbg2 /usr/local/bin 42 | 43 | COPY setup.py setup.py 44 | COPY setup.cfg setup.cfg 45 | COPY MANIFEST.in MANIFEST.in 46 | COPY pyproject.toml pyproject.toml 47 | COPY src src 48 | COPY LICENSE LICENSE 49 | COPY README.md README.md 50 | 51 | # install python package 52 | RUN pip install -U setuptools pip wheel 53 | RUN pip install . 54 | RUN which mavis 55 | ENTRYPOINT [ "mavis" ] 56 | -------------------------------------------------------------------------------- /docs/migrating.md: -------------------------------------------------------------------------------- 1 | # Migrating 2 | 3 | ## Migrating from v2 to v3 4 | 5 | There are major changes from v2 to v3 of MAVIS. 6 | 7 | ### Tab File Headers 8 | 9 | Tab file headers no longer start with `#`. Any lines starting with a pound will be treated 10 | as comments.
This will apply to mavis-style inputs as well as any tab delimited 11 | reference files 12 | 13 | ### Configuration 14 | 15 | MAVIS no longer uses command line arguments, config files, and environment variables for 16 | configuration. Instead all configurable settings are controlled via a single input JSON 17 | config file 18 | 19 | ### Scheduling 20 | 21 | MAVIS is now integrated with snakemake instead of handling its own scheduling 22 | 23 | ## Reference Annotation Files 24 | 25 | MAVIS no longer supports the previously deprecated tab-delimited format of the annotations file. If you are still using these files in your project we have provided a script to automatically convert them to the newer format in the tools directory. 26 | 27 | ```bash 28 | python src/tools/convert_annotations_format.py \ 29 | /path/to/tab/file.tab \ 30 | --input_type v2-tab \ 31 | /path/to/new/json/file.json 32 | ``` 33 | 34 | In v3 the JSON files are slightly different to support multiple translations per transcript. Your old v2 JSON files can be automatically converted to the new format with the same script. 35 | 36 | ```bash 37 | python src/tools/convert_annotations_format.py \ 38 | /path/to/json/file.json \ 39 | --input_type v2-json \ 40 | /path/to/new/json/file.json 41 | ``` 42 | -------------------------------------------------------------------------------- /docs/package/mavis/cluster/index.md: -------------------------------------------------------------------------------- 1 | # Sub-package Documentation 2 | 3 | The cluster sub-package is responsible for merging variants coming from different inputs (i.e. different tools).
def get_orient(string):
    """
    Map an Arriba direction label onto the matching ORIENT constant

    Args:
        string: the value of an Arriba ``direction1``/``direction2`` column

    Returns:
        ORIENT.LEFT for "downstream", ORIENT.RIGHT for "upstream" and ORIENT.NS for any
        other value
    """
    if string == "downstream":
        return ORIENT.LEFT
    elif string == "upstream":
        return ORIENT.RIGHT
    return ORIENT.NS


def convert_row(row):
    """
    transforms the arriba output into the common format for expansion. Maps the input column
    names to column names which MAVIS can read

    Args:
        row: mapping of Arriba column names to their values. The columns read are
            ``breakpoint1``, ``breakpoint2``, ``strand1(gene/fusion)``,
            ``strand2(gene/fusion)``, ``type``, ``direction1`` and ``direction2``

    Returns:
        dict keyed by the MAVIS column names. Start and end positions of each breakpoint are
        set to the same value, exactly as it was read from the row (no type conversion)

    Raises:
        AssertionError: if a ValueError or TypeError is hit while parsing the row. The
            original exception is attached as the cause
    """
    std_row = {}

    try:
        # split on the LAST colon only so that contig names which themselves contain a colon
        # are kept intact rather than being rejected
        std_row[COLUMNS.break1_chromosome], b1_start = row["breakpoint1"].rsplit(":", 1)
        std_row[COLUMNS.break2_chromosome], b2_start = row["breakpoint2"].rsplit(":", 1)

        # strand and type columns hold "/"-separated values; the strand uses the second
        # (fusion) entry and the event type uses the first entry
        std_row[COLUMNS.break1_strand] = row["strand1(gene/fusion)"].split("/")[1]
        std_row[COLUMNS.break2_strand] = row["strand2(gene/fusion)"].split("/")[1]
        std_row[COLUMNS.event_type] = row["type"].split("/")[0]
        std_row[COLUMNS.break1_orientation] = get_orient(row["direction1"])
        std_row[COLUMNS.break2_orientation] = get_orient(row["direction2"])

        std_row[COLUMNS.break1_position_start] = std_row[COLUMNS.break1_position_end] = b1_start
        std_row[COLUMNS.break2_position_start] = std_row[COLUMNS.break2_position_end] = b2_start
    except (ValueError, TypeError) as err:
        # chain the original error so the traceback still shows what failed to parse
        raise AssertionError(
            "Could not parse the breakpoint from the Arriba row: {}, {}".format(
                row["breakpoint1"], row["breakpoint2"]
            )
        ) from err
    return std_row
12 | The tag you check out should correspond to the MAVIS version you have 13 | installed 14 | 15 | ```bash 16 | git clone https://github.com/bcgsc/mavis.git 17 | git checkout 18 | mv mavis/tests . 19 | mv mavis/Snakefile . 20 | rm -r mavis 21 | ``` 22 | 23 | Now you should have a folder called `tests` in your current directory. Since this is a trivial 24 | example, it can easily be run locally. However in order to run the snakemake file you will need 25 | to have the config validation module `mavis_config` installed which has minimal dependencies. 26 | 27 | ```bash 28 | pip install mavis_config 29 | ``` 30 | 31 | Now you are ready to run MAVIS. This can be done in a single command using snakemake. 32 | 33 | ```bash 34 | snakemake -j 1 --configfile=tests/mini-tutorial.config.json -s Snakefile 35 | ``` 36 | 37 | Which will run the mini tutorial version and output files into a folder called `output_dir` in the 38 | current directory 39 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: publish 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | pypi: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: '3.x' 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install setuptools wheel twine 23 | - name: Build and publish 24 | env: 25 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 26 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 27 | run: | 28 | python setup.py sdist bdist_wheel install 29 | twine 
check dist/* 30 | twine upload dist/* 31 | docker: 32 | runs-on: ubuntu-latest 33 | steps: 34 | - uses: actions/checkout@v2 35 | - run: docker login -u $DOCKER_USER -p $DOCKER_PASSWORD 36 | env: 37 | DOCKER_USER: ${{ secrets.DOCKER_USER }} 38 | DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} 39 | - run: | 40 | docker build --file Dockerfile --tag bcgsc/mavis:latest --tag bcgsc/mavis:${{ github.event.release.tag_name }} . 41 | - run: docker push bcgsc/mavis:latest 42 | - run: docker push bcgsc/mavis:${{ github.event.release.tag_name }} 43 | -------------------------------------------------------------------------------- /tests/mini-tutorial.annotate_only.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotate.draw_fusions_only": false, 3 | "convert": { 4 | "mock_converted": { 5 | "inputs": [ 6 | "tests/data/mock_sv_events.tsv" 7 | ], 8 | "file_type": "mavis", 9 | "assume_no_untemplated": true 10 | } 11 | }, 12 | "skip_stage.validate": true, 13 | "cluster.uninformative_filter": true, 14 | "cluster.limit_to_chr": null, 15 | "cluster.min_clusters_per_file": 5, 16 | "libraries": { 17 | "mock-A47933": { 18 | "assign": [ 19 | "tests/data/mock_trans_sv_events.tsv" 20 | ], 21 | "bam_file": "tests/data/mock_trans_reads_for_events.sorted.bam", 22 | "disease_status": "diseased", 23 | "protocol": "transcriptome", 24 | "strand_specific": true 25 | }, 26 | "mock-A36971": { 27 | "assign": [ 28 | "mock_converted" 29 | ], 30 | "bam_file": "tests/data/mock_reads_for_events.sorted.bam", 31 | "disease_status": "diseased", 32 | "protocol": "genome", 33 | "strand_specific": false 34 | } 35 | }, 36 | "output_dir": "output_dir", 37 | "reference.annotations": [ 38 | "tests/data/mock_annotations.json" 39 | ], 40 | "reference.dgv_annotation": [ 41 | "tests/data/mock_dgv_annotation.tab" 42 | ], 43 | "reference.masking": [ 44 | "tests/data/mock_masking.tab" 45 | ], 46 | "reference.reference_genome": [ 47 | 
def convert_row(row: Dict) -> Dict:
    """
    Translate one chimerascan record into the common MAVIS row layout used for expansion

    Args:
        row: mapping of chimerascan column names to their values

    Returns:
        a new dict holding the retained chimerascan gene columns (prefixed with the tool
        name), a tracking id when the input row does not already carry one, and the
        chromosome, position start, orientation and opposing-strands values for both
        breakpoints
    """
    tool = SUPPORTED_TOOL.CHIMERASCAN

    # carry over the gene columns (when present) under a tool-prefixed name
    converted = {
        '{}_{}'.format(tool, gene_column): row[gene_column]
        for gene_column in ('genes5p', 'genes3p')
        if gene_column in row
    }
    if TRACKING_COLUMN not in row:
        converted[TRACKING_COLUMN] = '{}-{}'.format(tool, row['chimera_cluster_id'])

    converted[COLUMNS.break1_chromosome] = row['chrom5p']
    converted[COLUMNS.break2_chromosome] = row['chrom3p']

    # 5' partner: on the + strand the break is at the end coordinate, otherwise at the start
    if row['strand5p'] == '+':
        position_column, orientation = 'end5p', ORIENT.LEFT
    else:
        position_column, orientation = 'start5p', ORIENT.RIGHT
    converted[COLUMNS.break1_position_start] = row[position_column]
    converted[COLUMNS.break1_orientation] = orientation

    # 3' partner: mirror image of the 5' partner
    if row['strand3p'] == '+':
        position_column, orientation = 'start3p', ORIENT.RIGHT
    else:
        position_column, orientation = 'end3p', ORIENT.LEFT
    converted[COLUMNS.break2_position_start] = row[position_column]
    converted[COLUMNS.break2_orientation] = orientation

    converted[COLUMNS.opposing_strands] = row['strand5p'] != row['strand3p']
    return converted
unix and linux systems. For the standard 4 | pipeline, the validation stage is the most computationally expensive. 5 | The memory and cpu requirements will vary with two main factors: the 6 | number of structural variants you are validating per job, and the size 7 | of the bam file you are validating against. 8 | 9 | There are a number of settings that can be adjusted to reduce memory and 10 | cpu requirements depending on what the user is trying to analyze. See 11 | [configuration and settings](../../configuration/general/) for more details. 12 | 13 | ## Validation Resources 14 | 15 | ![validation resources](../images/colo829_tumour_validation_resource_req.png) 16 | 17 | Resource Requirements (MAVIS 1.8.0) for each validation job of the 18 | COLO829 tumour genome. The BAM file for the tumour genome is 127GB. 19 | Validation jobs were tested splitting into: 100, 500, 1000, and 2500 20 | structural variant validations per job. The effect of number of events 21 | validated on both memory and time is plotted 22 | above. 23 | 24 | ## Annotation Resources 25 | 26 | Similar trends were observed for the annotation step (see below) with 27 | regards to time elapsed. However the memory requirements remained more 28 | constant which is expected since, unlike validation, annotation does 29 | not read more data in for more events. 30 | 31 | ![annotation resources](../images/colo829_tumour_annotation_resource_req.png) 32 | 33 | Resource Requirements (MAVIS 1.8.0) for each annotation job of the 34 | COLO829 tumour genome. The events which passed validation (see above) 35 | represent the number of events input to the annotation 36 | step. 
37 | -------------------------------------------------------------------------------- /tests/test_tools/data/ensembl69_hg19_annotations.kras.tab: -------------------------------------------------------------------------------- 1 | ## input file used to map hugo gene names: compiled_gene_drug_pathway.v1_2_5.tsv 2 | ## input file for picking best transcript: ens69_best_transcript.txt 3 | ## Ensembl Api version 69 4 | ## generated at: Thu Aug 4 16:38:01 2016 5 | #ensembl_gene_id hugo_names chr strand gene_start gene_end best_ensembl_transcript_id ensembl_transcript_id refseq_equivalents transcript_genomic_start transcript_genomic_end cdna_coding_start cdna_coding_end genomic_exon_ranges AA_domain_ranges 6 | ENSG00000133703 KRAS 12 -1 25357723 25403870 ENST00000311936 ENST00000311936 NP_004976.2;NM_004985.3 25357723 25403865 193 759 25403685-25403865;25398208-25398329;25380168-25380346;25378548-25378707;25357723-25362845 PR00449:4-25,27-43,44-66,107-120,141-163;PF00025:3-162;SM00173:1-166;PF00009:45-163;PF08477:5-119;PS50318:165-184;SSF52540:3-184;TIGR00231:1-159;SM00175:4-166;PF00071:5-164;SM00174:6-166 7 | ENSG00000133703 KRAS 12 -1 25357723 25403870 ENST00000311936 ENST00000557334 25362102 25403870 198 425 25403685-25403870;25398208-25398329;25362102-25362845 PR00449:4-25,27-43;PS50318:52-71;SM00173:1-53;PF00071:5-44;SSF52540:3-37 8 | ENSG00000133703 KRAS 12 -1 25357723 25403870 ENST00000311936 ENST00000256078 NP_203524.1;NM_033360.2 25362365 25403737 65 634 25403685-25403737;25398208-25398329;25380168-25380346;25378548-25378707;25368371-25368494;25362365-25362845 SM00175:4-166;PF00071:5-164;SSF52540:3-185;SM00176:9-189;TIGR00231:1-159;SM00174:6-166;PR00449:4-25,27-43,44-66,107-120,141-163;PF00025:3-161;PF08477:5-119;PF00009:45-162;SM00173:1-166 9 | ENSG00000133703 KRAS 12 -1 25357723 25403870 ENST00000311936 ENST00000556131 25386753 25403863 178 309 25403698-25403863;25398208-25398329;25386753-25388160 PR00449:4-25,27-43;PF00071:5-37;SSF52540:3-38 10 | 
-------------------------------------------------------------------------------- /docs/development.md: -------------------------------------------------------------------------------- 1 | # Guidelines for Contributors 2 | 3 | {!./../.github/CONTRIBUTING.md!} 4 | 5 | ## Major Assumptions 6 | 7 | Some assumptions have been made when developing this project. The major 8 | ones have been listed here to facilitate debugging/development if any of 9 | these are violated in the future. 10 | 11 | - The input bam reads have stored the sequence with respect to the positive/forward strand and have not stored the reverse complement. 12 | - The distribution of the fragment sizes in the bam file approximately follows a normal distribution. 13 | 14 | ## Current Limitations 15 | 16 | - Assembling contigs will always fail for repeat sequences as we do not resolve this. Unlike traditional assemblies we cannot assume even input coverage as we are taking a select portion of the reads to assemble. 17 | - Currently no attempt is made to group/pair single events into complex events. 18 | - Transcriptome validation uses a collapsed model of all overlapping transcripts and is not isoform specific. Allowing for isoform specific validation would be computationally expensive but may be considered as an optional setting for future releases. 19 | 20 | ## Computing Code coverage 21 | 22 | Since MAVIS uses multiple processes, it adds complexity to computing the 23 | code coverage. Running coverage normally will underreport. To ensure that 24 | the coverage module captures the information from the subprocesses we 25 | need to do the following 26 | 27 | In our development python virtual environment put a coverage.pth file 28 | (ex. 
`venv/lib/python3.6/site-packages/coverage.pth`) containing the 29 | following 30 | 31 | ```python 32 | import coverage; coverage.process_startup() 33 | ``` 34 | 35 | Additionally you will need to set the environment variable 36 | 37 | ```bash 38 | export COVERAGE_PROCESS_START=/path/to/mavis/repo/mavis/.coveragerc 39 | ``` 40 | -------------------------------------------------------------------------------- /.github/workflows/quick-tests.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: quick-tests 5 | 6 | on: [push] 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-20.04 11 | strategy: 12 | matrix: 13 | python-version: ["3.7", "3.8", "3.9", "3.10"] 14 | name: python-${{ matrix.python-version }} quick 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip setuptools wheel 24 | pip install .[test] 25 | - name: Lint with flake8 26 | run: | 27 | pip install flake8 28 | # stop the build if there are Python syntax errors or undefined names 29 | flake8 src tests --count --show-source --statistics 30 | - name: Lint with black 31 | run: | 32 | pip install black 33 | # stop the build if black needs to be run 34 | black src tests -S -l 100 --check 35 | - name: Lint with isort 36 | run: | 37 | pip install isort 38 | isort src tests --check 39 | - name: install bwa 40 | run: | 41 | git clone https://github.com/lh3/bwa.git 42 | cd bwa 43 | git checkout v0.7.17 44 | make 45 | cd .. 
class SUPPORTED_TOOL(MavisNamespace):
    """
    Supported Tools used to call SVs and then used as input into MAVIS

    Attributes:
        CHIMERASCAN: chimerascan [Iyer-2011]_
        DEFUSE: defuse [McPherson-2011]_
        DELLY: delly [Rausch-2012]_
        MANTA: manta [Chen-2016]_
        PINDEL: pindel [Ye-2009]_
        TA: transabyss [Robertson-2010]_
        MAVIS: mavis
        BREAKDANCER: breakdancer
        VCF: vcf
        BREAKSEQ: breakseq
        CNVNATOR: cnvnator
        STRELKA: strelka
        STARFUSION: starfusion
        STRAGLR: straglr
        ARRIBA: arriba
    """

    MANTA = 'manta'
    DELLY = 'delly'
    TA = 'transabyss'
    PINDEL = 'pindel'
    CHIMERASCAN = 'chimerascan'
    MAVIS = 'mavis'
    DEFUSE = 'defuse'
    BREAKDANCER = 'breakdancer'
    VCF = 'vcf'
    BREAKSEQ = 'breakseq'
    CNVNATOR = 'cnvnator'
    STRELKA = 'strelka'
    STARFUSION = 'starfusion'
    STRAGLR = 'straglr'
    ARRIBA = 'arriba'


# Lookup from an event-type label to the list of SVTYPE members it may correspond to.
# It is seeded so that every value returned by SVTYPE.values() maps to a one-element list
# containing itself, then extended with the additional labels below (presumably the labels
# emitted by the supported tools -- confirm against the individual converters). A label that
# is ambiguous maps to several SVTYPE members.
TOOL_SVTYPE_MAPPING = {v: [v] for v in SVTYPE.values()}  # type: ignore
TOOL_SVTYPE_MAPPING.update(
    {
        'DEL': [SVTYPE.DEL],
        'INS': [SVTYPE.INS],
        'ITX': [SVTYPE.DUP],
        'CTX': [SVTYPE.TRANS, SVTYPE.ITRANS],
        'INV': [SVTYPE.INV],
        'BND': [SVTYPE.TRANS, SVTYPE.ITRANS, SVTYPE.DUP, SVTYPE.INS, SVTYPE.DEL, SVTYPE.INV],
        'TRA': [SVTYPE.TRANS, SVTYPE.ITRANS],
        'CNV': [SVTYPE.DUP],
        'RPL': [SVTYPE.INS],
        'DUP:TANDEM': [SVTYPE.DUP],
        'DUP': [SVTYPE.DUP],
        'interchromosomal': [SVTYPE.TRANS, SVTYPE.ITRANS],
        'eversion': [SVTYPE.DUP],
        'translocation': [SVTYPE.TRANS, SVTYPE.ITRANS],
        'ins': [SVTYPE.INS],
        'del': [SVTYPE.DEL],
        'dup': [SVTYPE.DUP],
        'ITD': [SVTYPE.DUP],
        'IDP': [SVTYPE.INS],
        'DEL/INV': [SVTYPE.DEL, SVTYPE.INV],
        'DUP/INS': [SVTYPE.DUP, SVTYPE.INS],
        'INVDUP': [SVTYPE.INV, SVTYPE.DUP, SVTYPE.INS],
        'INV/INVDUP': [SVTYPE.INV, SVTYPE.DUP, SVTYPE.INS],
    }
)

# Name of the column in which converters store an identifier for the originating call
TRACKING_COLUMN = 'tracking_id'
class TestCallIntervalByFlankingCoverage:
    """
    Tests for _call_interval_by_flanking_coverage

    The class name carries the ``Test`` prefix so that pytest's default discovery rules
    collect it; without the prefix none of the methods below are ever run.
    """

    def test_invalid_input_attr(self):
        # placeholder: no assertions have been written for this case yet
        pass

    def test_left(self):
        i = _call_interval_by_flanking_coverage(
            Mock(start=101, end=110),
            ORIENT.LEFT,
            100,
            20,
            distance=Evidence.distance,
            traverse=Evidence.traverse,
        )
        assert i.start == 110
        assert i.end == 180

        i = _call_interval_by_flanking_coverage(
            Mock(start=20, end=80),
            ORIENT.LEFT,
            230,
            40,
            distance=Evidence.distance,
            traverse=Evidence.traverse,
        )
        assert i.start == 80
        assert i.end == 209

    def test_right(self):
        i = _call_interval_by_flanking_coverage(
            Mock(start=101, end=110),
            ORIENT.RIGHT,
            100,
            20,
            distance=Evidence.distance,
            traverse=Evidence.traverse,
        )
        assert i.end == 101
        assert i.start == 31

        i = _call_interval_by_flanking_coverage(
            Mock(start=150, end=200),
            ORIENT.RIGHT,
            230,
            40,
            distance=Evidence.distance,
            traverse=Evidence.traverse,
        )
        assert i.start == 11
        assert i.end == 150


class TestDistanceAndTraverse:
    """
    Tests for the Evidence.distance and Evidence.traverse helpers
    """

    def test_distance(self):
        assert Evidence.distance(1, 11) == Interval(10)

    def test_traverse_right(self):
        assert Evidence.traverse(1, 10, ORIENT.RIGHT) == Interval(11)

    def test_traverse_left(self):
        assert Evidence.traverse(20, 10, ORIENT.LEFT) == Interval(10)
"reference.reference_genome": [ 55 | "tests/data/mock_reference_genome.fa" 56 | ], 57 | "reference.template_metadata": [ 58 | "tests/data/cytoBand.txt" 59 | ], 60 | "summary.filter_min_remapped_reads": 5, 61 | "summary.filter_min_spanning_reads": 5, 62 | "summary.filter_min_linking_split_reads": 1, 63 | "summary.filter_min_flanking_reads": 10 64 | } 65 | -------------------------------------------------------------------------------- /tests/data/breakdancer_output.txt: -------------------------------------------------------------------------------- 1 | #Software: 1.4.5 2 | #Command: /gsc/software/linux-x86_64-centos6/breakdancer-1.4.5/bin/breakdancer-max -t /projects/trans_scratch/validations/workspace/creisle/MAV228/breakdancer.cfg 3 | #Library Statistics: 4 | #/projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam mean:441 std:116.54 uppercutoff:959.41 lowercutoff:22.39 readlen:149.65 library:A36971 reflen:3046874375 seqcov:69.8209 phycov:102.877 32:31637251 5 | #/projects/analysis/analysis24/A36973/HMTGGCCXX_5/A36973/150nt/hg19a/bwa-mem-0.7.6a/A36973_1_lane_dupsFlagged.bam mean:437.99 std:124.28 uppercutoff:955.49 lowercutoff:0 readlen:147.17 library:A36973 reflen:3046874375 seqcov:33.1399 phycov:49.3136 32:27980009 6 | #Chr1 Pos1 Orientation1 Chr2 Pos2 Orientation2 Type Size Score num_Reads num_Reads_lib A36971_2_lanes_dupsFlagged.bam A36973_1_lane_dupsFlagged.bam 7 | 1 200067631 23+27- 2 23697874 17+6- CTX -439 38 14 /projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|11:/projects/analysis/analysis24/A36973/HMTGGCCXX_5/A36973/150nt/hg19a/bwa-mem-0.7.6a/A36973_1_lane_dupsFlagged.bam|3 8 | 1 10001 83+126- 1 10546 83+126- ITX -352 99 43 /projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|23:/projects/analysis/analysis24/A36973/HMTGGCCXX_5/A36973/150nt/hg19a/bwa-mem-0.7.6a/A36973_1_lane_dupsFlagged.bam|20 NA NA 9 | 1 
808410 11+11- 1 808574 11+11- ITX -338 99 9 /projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|6:/projects/analysis/analysis24/A36973/HMTGGCCXX_5/A36973/150nt/hg19a/bwa-mem-0.7.6a/A36973_1_lane_dupsFlagged.bam|3 NA NA 10 | 1 869445 89+21- 1 870225 5+93- DEL 892 99 67 /projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|40:/projects/analysis/analysis24/A36973/HMTGGCCXX_5/A36973/150nt/hg19a/bwa-mem-0.7.6a/A36973_1_lane_dupsFlagged.bam|27 0.06 0.08 11 | 1 54687282 6+9- 1 54687479 6+9- INS -421 99 3 /projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|3 NA NA 12 | 1 6508246 10+17- 1 17028869 57+50- INV 10520288 31 4 /projects/analysis/analysis24/A36971/merge_bwa-mem-0.7.6a/150nt/hg19a/A36971_2_lanes_dupsFlagged.bam|4 1.77 2.21 13 | -------------------------------------------------------------------------------- /docs/package/mavis/annotate/index.md: -------------------------------------------------------------------------------- 1 | # Sub-package Documentation 2 | 3 | ## Types of Output Files 4 | 5 | | expected name/suffix | file type/format | content | 6 | | ------------------------------ | --------------------------------- | ---------------------------------------- | 7 | | ``annotations.tab`` | text/tabbed | annotated events | 8 | | ``annotations.fusion-cdna.fa`` | [fasta](../../../glossary/#fasta) | putative fusion unspliced cDNA sequences | 9 | | ``drawings/*.svg`` | [SVG](../../../glossary/#svg) | diagrams | 10 | | ``drawings/*.legend.json`` | [JSON](../../../glossary/#json) | diagram legend/metadata | 11 | 12 | ## Algorithm Overview 13 | 14 | see [annotating events](../../../background/theory/#annotating-events) 15 | 16 | - read in breakpoint pairs 17 | - generate strand-specific annotations (one annotation per strand, multiple if multiple genes/transcripts in the region) 18 | - try building fusion transcripts for 
bp-specific calls 19 | - generate [SVG](../../../glossary/#svg) diagrams 20 | 21 | ## Levels of Annotations 22 | 23 | ![levels of features](../../../images/feature_levels.svg) 24 | 25 | ## Overview of Class Relationships 26 | 27 | ![annotations model](../../../images/annotation_model.svg) 28 | The Annotation sub-package has objects for genetic annotations and related calculations. The basic layout of the 29 | package is shown above. IS-A relationships are given by the blue arrows. HAS-A relationships are shown in black. 30 | And reference_object/parent 31 | type relationships are shown in red. mavis.annotate.genomic.Gene is a gene. Start and end are 32 | genomic positions wrt to the template/chr. mavis.annotate.genomic.PreTranscript is the 33 | unspliced transcript. Start and end are genomic positions wrt to the template/chr. 34 | mavis.annotate.genomic.Transcript: is the spliced transcript. Start and end coordinates are 35 | 1 to the length of the spliced product in base pairs. 36 | mavis.annotate.protein.Translation: is the translation of the spliced transcript. Start and 37 | end are cdna positions wrt the 5' end of the spliced transcript. 
def sort_elements(data):
    """
    Sort lists of exons, domains, genes, etc by position and name to facilitate comparison

    The structure is walked recursively. Dicts are updated in place (each value is replaced
    by its sorted form) and returned; lists are returned as new lists. A list is only
    re-ordered when every one of its elements is a dict, in which case it is sorted by
    ``(start, end, name)``. Elements missing one of those fields (or holding None for it)
    sort after the elements that have a value, rather than raising a TypeError from
    comparing None with an int/str. Anything that is neither a dict nor a list is returned
    unchanged.

    Args:
        data: arbitrarily nested JSON-like structure (dict, list or scalar)

    Returns:
        the same structure with every list-of-dicts in a deterministic order
    """

    def sort_key(elem):
        # pair each field with an "is missing" flag so that None is never compared directly
        # against a real value: the flag alone decides the order in that case
        return tuple(
            (elem.get(field) is None, elem.get(field)) for field in ('start', 'end', 'name')
        )

    if isinstance(data, dict):
        # re-assigning existing keys does not change the dict size, so iterating is safe
        for key, value in data.items():
            data[key] = sort_elements(value)
        return data

    if isinstance(data, list):
        items = [sort_elements(e) for e in data]
        if all(isinstance(elem, dict) for elem in data):
            return sorted(items, key=sort_key)
        return items

    return data
os.path.join(os.path.dirname(__file__), 'data') 60 | input_file = os.path.join(data_dir, filename) 61 | with open(os.path.join(data_dir, expected_file), 'r') as fh: 62 | expected = json.load(fh) 63 | 64 | # order doesn't matter 65 | data = sort_elements(CONVERTERS[input_type](input_file)) 66 | expected = sort_elements(expected) 67 | 68 | assert len(data['genes']) == len(expected['genes']) 69 | assert data == expected 70 | -------------------------------------------------------------------------------- /docs/configuration/general.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | An exhaustive list of the various configurable settings can be found [here](../settings). Alternatively you can view them through the [online schema explorer](https://json-schema.app/view?url=https://raw.githubusercontent.com/bcgsc/mavis_config/master/src/mavis_config/config.json) 4 | 5 | ## Pipeline Configuration File 6 | 7 | The pipeline can be run in steps or it can be configured using a JSON 8 | configuration file and setup in a single step. Scripts will be generated 9 | to run all steps following clustering. 10 | 11 | The config schema is found in the mavis package under `src/mavis/schemas/config.json` 12 | 13 | Top level settings follow the pattern `
<section>.<setting>`. The convert and library 14 | sections are nested objects. 15 | 16 | ## Adjusting the Resource Requirements 17 | 18 | ### Choosing the Number of Validation/Annotation Jobs 19 | 20 | MAVIS chooses the number of jobs to split validate/annotate stages into 21 | based on two settings: [cluster.max_files](../../configuration/settings/#clustermax_files) and 22 | [cluster.min_clusters_per_file](../../configuration/settings/#clustermin-clusters-per-file). 23 | 24 | For example, in the following situation say you have: 1000 clusters, 25 | `cluster.max_files=10`, and `cluster.min_clusters_per_file=10`. Then MAVIS will set up 26 | 10 validation jobs each with 100 events. 27 | 28 | However, if `cluster.min_clusters_per_file=500`, then MAVIS would only set up 2 29 | jobs each with 500 events. This is because 30 | [cluster.min_clusters_per_file](../../configuration/settings/#clustermin-clusters-per-file) takes precedence 31 | over [cluster.max_files](../../configuration/settings/#clustermax_files). 32 | 33 | Splitting into more jobs will lower the resource requirements per job 34 | (see [resource requirements](../performance/)). The memory and time requirements for validation are linear 35 | with respect to the number of events to be validated. 36 | 37 | ### Uninformative Filter 38 | 39 | For example, if the user is only interested in events in genes, then the 40 | [cluster.uninformative_filter](../../configuration/settings/#clusteruninformative_filter) can be used. This 41 | will drop all events that are not within a certain distance 42 | ([cluster.max_proximity](../../configuration/settings/#clustermax_proximity)) to any annotation in 43 | the annotations reference file. These events will be dropped prior to 44 | the validation stage, which results in a significant speed-up. 
45 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = mavis 3 | version = 3.1.2 4 | url = https://github.com/bcgsc/mavis.git 5 | download_url = https://github.com/bcgsc/mavis/archive/v2.2.10.tar.gz 6 | description = A Structural Variant Post-Processing Package 7 | author_email = creisle@bcgsc.ca 8 | author = Caralyn Reisle 9 | maintainer_email = mavis@bcgsc.ca 10 | maintainer = mavis 11 | long_description = file: README.md 12 | long_description_content_type = text/markdown 13 | license_file = LICENSE 14 | project_urls = mavis = http://mavis.bcgsc.ca 15 | 16 | [bdist_wheel] 17 | universal = 1 18 | 19 | [pycodestyle] 20 | ignore = E501 21 | W503 22 | E203 23 | statistics = True 24 | 25 | [flake8] 26 | ignore = E501,W503,E203 27 | 28 | [isort] 29 | profile = black 30 | 31 | [options] 32 | packages = find: 33 | package_dir = 34 | = src 35 | python_requires = >=3.7 36 | dependency_links = [] 37 | include_package_data = True 38 | install_requires = 39 | biopython>=1.70, <1.78 40 | braceexpand==0.1.2 41 | colour 42 | Distance>=0.1.3 43 | mavis_config>=1.2.2, <2.0.0 44 | networkx>=2.5,<3 45 | numpy>=1.13.1 46 | pandas>=1.1, <2 47 | pysam 48 | Shapely>=1.6.4.post1 49 | shortuuid>=0.5.0 50 | svgwrite 51 | typing_extensions>=4 52 | setup_requires = 53 | pip>=9.0.0 54 | setuptools>=36.0.0 55 | 56 | [options.packages.find] 57 | exclude = tests 58 | where = src 59 | 60 | [options.extras_require] 61 | doc = 62 | mkdocs>=1.1.2 63 | markdown-refdocs 64 | mkdocs-material>=5.4.0 65 | markdown-include 66 | mkdocs-simple-hooks>=0.1.2 67 | mkdocs-awesome-pages-plugin 68 | test = 69 | timeout-decorator>=0.3.3 70 | coverage>=4.2 71 | pycodestyle>=2.3.1 72 | pytest 73 | pytest-cov 74 | dev = 75 | black 76 | flake8 77 | isort 78 | twine 79 | wheel 80 | timeout-decorator>=0.3.3 81 | coverage>=4.2 82 | pycodestyle>=2.3.1 83 | pytest 84 | 
pytest-cov 85 | pytest-xdist 86 | mkdocs>=1.1.2,<2 87 | markdown-refdocs 88 | mkdocs-material>=5.4.0 89 | markdown-include 90 | mkdocs-simple-hooks>=0.1.2 91 | types-setuptools>=57.4.7, <58 92 | deploy = 93 | twine 94 | wheel 95 | tools = 96 | pyensembl 97 | simplejson 98 | requests 99 | 100 | [options.entry_points] 101 | console_scripts = 102 | mavis = mavis.main:main 103 | calculate_ref_alt_counts = tools.calculate_ref_alt_counts:main 104 | -------------------------------------------------------------------------------- /tests/data/bwa_pipeline_config.cfg: -------------------------------------------------------------------------------- 1 | [reference] 2 | template_metadata = tests/data/cytoBand.txt 3 | annotations = tests/data/mock_annotations.json 4 | masking = tests/data/mock_masking.tab 5 | reference_genome = tests/data/mock_reference_genome.fa 6 | aligner_reference = tests/data/mock_reference_genome.fa 7 | dgv_annotation = tests/data/mock_dgv_annotation.txt 8 | 9 | [annotate] 10 | draw_fusions_only = False 11 | 12 | [validate] 13 | # evidence related settings 14 | aligner = bwa mem 15 | assembly_max_paths = 4 16 | assembly_min_exact_match_to_remap = 4 17 | assembly_min_edge_trim_weight = 4 18 | assembly_min_remap_coverage = 0 19 | assembly_min_remapped_seq = 3 20 | assembly_strand_concordance = 0.51 21 | blat_min_identity = 0.9 22 | call_error = 10 23 | contig_aln_max_event_size = 50 24 | contig_aln_merge_inner_anchor = 20 25 | contig_aln_merge_outer_anchor = 15 26 | contig_aln_min_anchor_size = 50 27 | contig_aln_min_query_consumption = 0.7 28 | fetch_reads_bins = 5 29 | fetch_reads_limit = 10000 30 | fetch_min_bin_size = 50 31 | filter_secondary_alignments = True 32 | fuzzy_mismatch_number = 1 33 | max_sc_preceeding_anchor = 6 34 | min_anchor_exact = 6 35 | min_anchor_fuzzy = 10 36 | min_anchor_match = 0.9 37 | min_double_aligned_to_estimate_insertion_size = 2 38 | min_flanking_pairs_resolution = 3 39 | min_linking_split_reads = 1 40 | min_mapping_quality = 
5 41 | min_non_target_aligned_split_reads = 1 42 | min_sample_size_to_apply_percentage = 10 43 | min_softclipping = 6 44 | min_spanning_reads_resolution = 3 45 | min_splits_reads_resolution = 3 46 | stdev_count_abnormal = 3.0 47 | strand_determining_read = 2 48 | outer_window_min_event_size = 125 49 | 50 | [cluster] 51 | uninformative_filter = True 52 | limit_to_chr = None 53 | 54 | [mock-A36971] 55 | read_length = 150 56 | median_fragment_size = 400 57 | stdev_fragment_size = 97 58 | bam_file = tests/data/mock_reads_for_events.sorted.bam 59 | protocol = genome 60 | inputs = tests/data/mock_sv_events.tsv 61 | strand_specific = False 62 | disease_status=diseased 63 | 64 | [mock-A47933] 65 | read_length = 75 66 | median_fragment_size = 188 67 | stdev_fragment_size = 50 68 | bam_file = tests/data/mock_trans_reads_for_events.sorted.bam 69 | protocol = transcriptome 70 | inputs = tests/data/mock_trans_sv_events.tsv 71 | strand_specific = True 72 | disease_status=diseased 73 | 74 | [summary] 75 | filter_min_remapped_reads = 5 76 | filter_min_spanning_reads = 5 77 | filter_min_flanking_reads = 10 78 | filter_min_split_reads = 5 79 | filter_min_linking_split_reads = 1 80 | filter_cdna_synon = False 81 | filter_protein_synon = False 82 | -------------------------------------------------------------------------------- /tests/data/missing_reference.cfg: -------------------------------------------------------------------------------- 1 | [reference] 2 | annotations = tests/data/mock_annotations.json 3 | aligner_reference = tests/data/mock_reference_genome.2bit 4 | 5 | [annotate] 6 | draw_fusions_only = False 7 | 8 | [validate] 9 | # evidence related settings 10 | aligner = blat 11 | assembly_max_paths = 4 12 | assembly_min_exact_match_to_remap = 4 13 | assembly_min_edge_trim_weight = 4 14 | assembly_min_remap_coverage = 0 15 | assembly_min_remapped_seq = 3 16 | assembly_strand_concordance = 0.51 17 | blat_min_identity = 0.9 18 | call_error = 10 19 | contig_aln_max_event_size = 
50 20 | contig_aln_merge_inner_anchor = 20 21 | contig_aln_merge_outer_anchor = 15 22 | contig_aln_min_anchor_size = 50 23 | contig_aln_min_query_consumption = 0.7 24 | fetch_reads_bins = 5 25 | fetch_reads_limit = 10000 26 | fetch_min_bin_size = 50 27 | filter_secondary_alignments = True 28 | fuzzy_mismatch_number = 1 29 | max_sc_preceeding_anchor = 6 30 | min_anchor_exact = 6 31 | min_anchor_fuzzy = 10 32 | min_anchor_match = 0.9 33 | min_double_aligned_to_estimate_insertion_size = 2 34 | min_flanking_pairs_resolution = 3 35 | min_linking_split_reads = 1 36 | min_mapping_quality = 5 37 | min_non_target_aligned_split_reads = 1 38 | min_sample_size_to_apply_percentage = 10 39 | min_softclipping = 6 40 | min_spanning_reads_resolution = 3 41 | min_splits_reads_resolution = 3 42 | stdev_count_abnormal = 3.0 43 | strand_determining_read = 2 44 | outer_window_min_event_size = 125 45 | 46 | [cluster] 47 | uninformative_filter = True 48 | limit_to_chr = None 49 | 50 | [mock-A36971] 51 | read_length = 150 52 | median_fragment_size = 400 53 | stdev_fragment_size = 97 54 | bam_file = tests/data/mock_reads_for_events.sorted.bam 55 | protocol = genome 56 | inputs = mock_converted 57 | strand_specific = False 58 | disease_status=diseased 59 | 60 | [mock-A47933] 61 | read_length = 75 62 | median_fragment_size = 188 63 | stdev_fragment_size = 50 64 | bam_file = tests/data/mock_trans_reads_for_events.sorted.bam 65 | protocol = transcriptome 66 | inputs = tests/data/mock_trans_sv_events.tsv 67 | strand_specific = True 68 | disease_status=diseased 69 | 70 | [summary] 71 | filter_min_remapped_reads = 5 72 | filter_min_spanning_reads = 5 73 | filter_min_flanking_reads = 10 74 | filter_min_split_reads = 5 75 | filter_min_linking_split_reads = 1 76 | filter_cdna_synon = True 77 | filter_protein_synon = True 78 | 79 | [convert] 80 | assume_no_untemplated = True 81 | # addfile twice to check this notation is ok (will collapse them anyway) 82 | mock_converted = convert_tool_output 83 | 
tests/data/mock_sv_events.tsv 84 | tests/data/mock_sv_events.tsv 85 | mavis 86 | False 87 | 88 | -------------------------------------------------------------------------------- /tests/data/clean_pipeline_config.cfg: -------------------------------------------------------------------------------- 1 | [reference] 2 | template_metadata = tests/data/cytoBand.txt 3 | annotations = tests/data/mock_annotations.json 4 | masking = tests/data/mock_masking.tab 5 | reference_genome = tests/data/mock_reference_genome.fa 6 | aligner_reference = tests/data/mock_reference_genome.2bit 7 | dgv_annotation = tests/data/mock_dgv_annotation.txt 8 | 9 | [annotate] 10 | draw_fusions_only = False 11 | 12 | [validate] 13 | # evidence related settings 14 | aligner = blat 15 | assembly_max_paths = 4 16 | assembly_min_exact_match_to_remap = 4 17 | assembly_min_edge_trim_weight = 4 18 | assembly_min_remap_coverage = 0 19 | assembly_min_remapped_seq = 3 20 | assembly_strand_concordance = 0.51 21 | blat_min_identity = 0.9 22 | call_error = 10 23 | contig_aln_max_event_size = 50 24 | contig_aln_merge_inner_anchor = 20 25 | contig_aln_merge_outer_anchor = 15 26 | contig_aln_min_anchor_size = 50 27 | contig_aln_min_query_consumption = 0.7 28 | fetch_reads_bins = 5 29 | fetch_reads_limit = 10000 30 | fetch_min_bin_size = 50 31 | filter_secondary_alignments = True 32 | fuzzy_mismatch_number = 1 33 | max_sc_preceeding_anchor = 6 34 | min_anchor_exact = 6 35 | min_anchor_fuzzy = 10 36 | min_anchor_match = 0.9 37 | min_double_aligned_to_estimate_insertion_size = 2 38 | min_flanking_pairs_resolution = 3 39 | min_linking_split_reads = 1 40 | min_mapping_quality = 5 41 | min_non_target_aligned_split_reads = 1 42 | min_sample_size_to_apply_percentage = 10 43 | min_softclipping = 6 44 | min_spanning_reads_resolution = 3 45 | min_splits_reads_resolution = 3 46 | stdev_count_abnormal = 3.0 47 | strand_determining_read = 2 48 | outer_window_min_event_size = 125 49 | write_evidence_files = False 50 | 
clean_aligner_files = True 51 | 52 | [cluster] 53 | uninformative_filter = True 54 | limit_to_chr = None 55 | 56 | [mock-A36971] 57 | read_length = 150 58 | median_fragment_size = 400 59 | stdev_fragment_size = 97 60 | bam_file = tests/data/mock_reads_for_events.sorted.bam 61 | protocol = genome 62 | inputs = tests/data/mock_sv_events.tsv 63 | strand_specific = False 64 | disease_status=diseased 65 | 66 | [mock-A47933] 67 | read_length = 75 68 | median_fragment_size = 188 69 | stdev_fragment_size = 50 70 | bam_file = tests/data/mock_trans_reads_for_events.sorted.bam 71 | protocol = transcriptome 72 | inputs = tests/data/mock_trans_sv_events.tsv 73 | strand_specific = True 74 | disease_status=diseased 75 | 76 | [summary] 77 | filter_min_remapped_reads = 5 78 | filter_min_spanning_reads = 5 79 | filter_min_flanking_reads = 10 80 | filter_min_split_reads = 5 81 | filter_min_linking_split_reads = 1 82 | filter_cdna_synon = True 83 | filter_protein_synon = True 84 | -------------------------------------------------------------------------------- /tests/data/no_opt_pipeline.cfg: -------------------------------------------------------------------------------- 1 | [reference] 2 | annotations = tests/data/mock_annotations.json 3 | reference_genome = tests/data/mock_reference_genome.fa 4 | aligner_reference = tests/data/mock_reference_genome.2bit 5 | 6 | [annotate] 7 | draw_fusions_only = False 8 | 9 | [validate] 10 | # evidence related settings 11 | aligner = blat 12 | assembly_max_paths = 4 13 | assembly_min_exact_match_to_remap = 4 14 | assembly_min_edge_trim_weight = 4 15 | assembly_min_remap_coverage = 0 16 | assembly_min_remapped_seq = 3 17 | assembly_strand_concordance = 0.51 18 | blat_min_identity = 0.9 19 | call_error = 10 20 | contig_aln_max_event_size = 50 21 | contig_aln_merge_inner_anchor = 20 22 | contig_aln_merge_outer_anchor = 15 23 | contig_aln_min_anchor_size = 50 24 | contig_aln_min_query_consumption = 0.7 25 | fetch_reads_bins = 5 26 | fetch_reads_limit = 
10000 27 | fetch_min_bin_size = 50 28 | filter_secondary_alignments = True 29 | fuzzy_mismatch_number = 1 30 | max_sc_preceeding_anchor = 6 31 | min_anchor_exact = 6 32 | min_anchor_fuzzy = 10 33 | min_anchor_match = 0.9 34 | min_double_aligned_to_estimate_insertion_size = 2 35 | min_flanking_pairs_resolution = 3 36 | min_linking_split_reads = 1 37 | min_mapping_quality = 5 38 | min_non_target_aligned_split_reads = 1 39 | min_sample_size_to_apply_percentage = 10 40 | min_softclipping = 6 41 | min_spanning_reads_resolution = 3 42 | min_splits_reads_resolution = 3 43 | stdev_count_abnormal = 3.0 44 | strand_determining_read = 2 45 | outer_window_min_event_size = 125 46 | 47 | [cluster] 48 | uninformative_filter = True 49 | limit_to_chr = None 50 | 51 | [mock-A36971] 52 | read_length = 150 53 | median_fragment_size = 400 54 | stdev_fragment_size = 97 55 | bam_file = tests/data/mock_reads_for_events.sorted.bam 56 | protocol = genome 57 | inputs = mock_converted 58 | strand_specific = False 59 | disease_status=diseased 60 | 61 | [mock-A47933] 62 | read_length = 75 63 | median_fragment_size = 188 64 | stdev_fragment_size = 50 65 | bam_file = tests/data/mock_trans_reads_for_events.sorted.bam 66 | protocol = transcriptome 67 | inputs = tests/data/mock_trans_sv_events.tsv 68 | strand_specific = True 69 | disease_status=diseased 70 | 71 | [summary] 72 | filter_min_remapped_reads = 5 73 | filter_min_spanning_reads = 5 74 | filter_min_flanking_reads = 10 75 | filter_min_split_reads = 5 76 | filter_min_linking_split_reads = 1 77 | filter_cdna_synon = True 78 | filter_protein_synon = True 79 | 80 | [convert] 81 | assume_no_untemplated = True 82 | # addfile twice to check this notation is ok (will collapse them anyway) 83 | mock_converted = convert_tool_output 84 | tests/data/mock_sv_events.tsv 85 | tests/data/mock_sv_events.tsv 86 | mavis 87 | False 88 | 89 | -------------------------------------------------------------------------------- 
/tests/data/Library-clusterset-N.validated.tsv: -------------------------------------------------------------------------------- 1 | #cluster_id break1_chromosome break1_position_start break1_position_end break1_orientation break1_strand break2_chromosome break2_position_start break2_position_end break2_orientation break2_strand event_type opposing_strands stranded protocol tools contigs_assembled contigs_aligned contig_sequence contig_remap_score contig_alignment_score call_method flanking_reads median_insert_size stdev_insert_size break1_split_reads break2_split_reads linking_split_reads untemplated_sequence 2 | 11241 1 238693407 238693407 L ? 1 238701194 238701194 R ? deletion False False genome DELLY_v0.6.1 1 1 GAGACTGGAAGTGGGTAGTTGCTTCATGCAGCTGGTTGTCCCAATGCCTGTCAGAGTCTGCCTTAGTCCCGGGGTTTTTATGGGCTCAGAAGGGAGAAAGTGTATGCTGAAAGCATTGAAATGCTAATTAGGAAGCATTTTTTTTTTCCTTCAAAGTAACTTTAAATAACTTTTCGGGAAAGTAAACACAATTA 27 0 contig 58 8200.0 7.7781745930520225 26 0 0 3 | 11388 1 79401525 79401525 L ? 1 79401848 79401848 R ? deletion False False genome DELLY_v0.6.1 1 1 AGACAGTAACAAAAGTTGGAGGTAAGACAAGGACCCAGATATTGTCAGCCAAAATCCTCCCCAGGTATTTATAACAGAATGGAAATCTCAAGTAAGAATATGGATATTCTGTATACTGTACATACATCAAATGTTTTTATAGGAAACCACATGTTACATGTACATATGACATAATCAAATGCATGATAAGTATTTATTGCAAATTCAT 61 0 contig 225 731 7.0710678118654755 93 0 0 4 | 11425 1 143164727 143164727 R ? 1 143165037 143165037 R ? inversion True False genome DELLY_v0.6.1 1 0 ? ? ? split reads 14 266.0 5.744562646538029 3 5 2 ? 5 | 10094 11 79346483 79346483 R ? 9 115343095 115343095 L ? translocation False False genome DELLY_v0.6.1 1 1 AAACTGCTCCATATTTATTTCATTATTATTATCATTTTCATCATCCTAACGATTATTCAGTATATACCAAGTGTCTCTGATGAAACATGCAGGAGATGAAAAATCCTTGGGTGGGCTTGTTTCTTTCTTTGTGTTTTTTTTTTTGAGATGGAGTCTCGCTCTGGAGCCCAGGCTGG 19 0 contig 32 0.0 0.0 20 10 9 6 | 10094 11 79346459 79346459 L ? 9 115343096 115343096 R ? 
translocation False False genome DELLY_v0.6.1 1 1 ATAATATTGTCTCATTCCCATTTTAAACTACCTGTTCCTTAAATTGCATATAAAAATACAGTCCATGCAATATTAATACACTAATGAATAATACACTAACAATTTATTTTCTTAGCCATTTCTTAACCTTTTCCTGTAGTTTCCTGAAGGAAGAGCTGAGTTATAATTTTTGAAAAATAAGAGAGACAAAGTAAAAATTCAG 31 0 contig 65 0 0.0 0 21 0 7 | 11963 11 79346459 79346459 L ? 9 115343096 115343096 R ? translocation False False genome DELLY_v0.6.1 1 1 ATAATATTGTCTCATTCCCATTTTAAACTACCTGTTCCTTAAATTGCATATAAAAATACAGTCCATGCAATATTAATACACTAATGAATAATACACTAACAATTTATTTTCTTAGCCATTTCTTAACCTTTTCCTGTAGTTTCCTGAAGGAAGAGCTGAGTTATAATTTTTGAAAAATAAGAGAGACAAAGTAAAAATTCAG 31 0 contig 65 0 0.0 0 21 0 8 | 11963 11 79346483 79346483 R ? 9 115343095 115343095 L ? translocation False False genome DELLY_v0.6.1 1 1 AAACTGCTCCATATTTATTTCATTATTATTATCATTTTCATCATCCTAACGATTATTCAGTATATACCAAGTGTCTCTGATGAAACATGCAGGAGATGAAAAATCCTTGGGTGGGCTTGTTTCTTTCTTTGTGTTTTTTTTTTTGAGATGGAGTCTCGCTCTGGAGCCCAGGCTGG 19 0 contig 32 0.0 0.0 20 10 9 9 | 11974 11 56271180 56271593 L ? 9 132187570 132187570 R ? translocation False False genome DELLY_v0.6.1 0 0 ? ? ? split and flanking 7 0 0.0 1 3 0 ? 
10 | -------------------------------------------------------------------------------- /tests/data/pipeline_config.cfg: -------------------------------------------------------------------------------- 1 | [reference] 2 | template_metadata = tests/data/cytoBand.txt 3 | annotations = tests/data/mock_annotations.json 4 | masking = tests/data/mock_masking.tab 5 | reference_genome = tests/data/mock_reference_genome.fa 6 | aligner_reference = tests/data/mock_reference_genome.2bit 7 | dgv_annotation = tests/data/mock_dgv_annotation.txt 8 | 9 | [annotate] 10 | draw_fusions_only = False 11 | 12 | [schedule] 13 | 14 | [validate] 15 | # evidence related settings 16 | aligner = blat 17 | assembly_max_paths = 4 18 | assembly_min_exact_match_to_remap = 4 19 | assembly_min_edge_trim_weight = 4 20 | assembly_min_remap_coverage = 0 21 | assembly_min_remapped_seq = 3 22 | assembly_strand_concordance = 0.51 23 | blat_min_identity = 0.9 24 | call_error = 10 25 | contig_aln_max_event_size = 50 26 | contig_aln_merge_inner_anchor = 20 27 | contig_aln_merge_outer_anchor = 15 28 | contig_aln_min_anchor_size = 50 29 | contig_aln_min_query_consumption = 0.7 30 | fetch_reads_bins = 5 31 | fetch_reads_limit = 10000 32 | fetch_min_bin_size = 50 33 | filter_secondary_alignments = True 34 | fuzzy_mismatch_number = 1 35 | max_sc_preceeding_anchor = 6 36 | min_anchor_exact = 6 37 | min_anchor_fuzzy = 10 38 | min_anchor_match = 0.9 39 | min_double_aligned_to_estimate_insertion_size = 2 40 | min_flanking_pairs_resolution = 3 41 | min_linking_split_reads = 1 42 | min_mapping_quality = 5 43 | min_non_target_aligned_split_reads = 1 44 | min_sample_size_to_apply_percentage = 10 45 | min_softclipping = 6 46 | min_spanning_reads_resolution = 3 47 | min_splits_reads_resolution = 3 48 | stdev_count_abnormal = 3.0 49 | strand_determining_read = 2 50 | outer_window_min_event_size = 125 51 | 52 | [cluster] 53 | uninformative_filter = True 54 | # all chromosomes 55 | limit_to_chr = None 56 | min_clusters_per_file 
= 2 57 | 58 | [mock-A36971] 59 | read_length = 150 60 | median_fragment_size = 400 61 | stdev_fragment_size = 97 62 | bam_file = tests/data/mock_reads_for_events.sorted.bam 63 | protocol = genome 64 | inputs = mock_converted 65 | strand_specific = False 66 | disease_status=diseased 67 | 68 | [mock-A47933] 69 | read_length = 75 70 | median_fragment_size = 188 71 | stdev_fragment_size = 50 72 | bam_file = tests/data/mock_trans_reads_for_events.sorted.bam 73 | protocol = transcriptome 74 | inputs = tests/data/mock_trans_sv_events.tsv 75 | strand_specific = True 76 | disease_status=diseased 77 | 78 | [summary] 79 | filter_min_remapped_reads = 5 80 | filter_min_spanning_reads = 5 81 | filter_min_flanking_reads = 10 82 | filter_min_split_reads = 5 83 | filter_min_linking_split_reads = 1 84 | filter_cdna_synon = True 85 | filter_protein_synon = True 86 | 87 | [convert] 88 | assume_no_untemplated = True 89 | # addfile twice to check this notation is ok (will collapse them anyway) 90 | mock_converted = convert_tool_output 91 | tests/data/mock_sv_events.tsv 92 | tests/data/mock_sv_events.tsv 93 | mavis 94 | False 95 | 96 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 |
6 | 7 | ![PyPi](https://img.shields.io/pypi/v/mavis.svg) ![build](https://github.com/bcgsc/mavis/workflows/build/badge.svg?branch=master) [![codecov](https://codecov.io/gh/bcgsc/mavis/branch/master/graph/badge.svg)](https://codecov.io/gh/bcgsc/mavis) ![ReadTheDocs](https://readthedocs.org/projects/pip/badge/) 8 | 9 | ## About 10 | 11 | [MAVIS](http://mavis.bcgsc.ca) is a Python command-line tool for the post-processing of structural variant calls. 12 | The general [MAVIS](http://mavis.bcgsc.ca) pipeline consists of six main stages 13 | 14 | - convert 15 | - [cluster](https://mavis.readthedocs.io/en/latest/package/mavis/cluster) 16 | - [validate](https://mavis.readthedocs.io/en/latest/package/mavis/validate) 17 | - [annotate](https://mavis.readthedocs.io/en/latest/package/mavis/annotate) 18 | - [pairing](https://mavis.readthedocs.io/en/latest/package/mavis/pairing) 19 | - [summary](https://mavis.readthedocs.io/en/latest/package/mavis/summary) 20 | 21 | ## Getting Help 22 | 23 | All steps in the MAVIS pipeline are called following the main mavis entry point. The usage menu can be viewed 24 | by running without any arguments, or by giving the -h/--help option 25 | 26 | ``` bash 27 | mavis -h 28 | ``` 29 | 30 | Help sub-menus can be found by giving the pipeline step followed by no arguments or the -h option 31 | 32 | ``` bash 33 | mavis cluster -h 34 | ``` 35 | 36 | Common problems and questions are addressed on the [wiki](https://github.com/bcgsc/mavis/wiki/Help-and-Frequently-Asked-Questions). 37 | If you have a question or issue that is not answered there (or already covered by an existing GitHub issue) please submit 38 | a GitHub issue to our [github page](https://github.com/bcgsc/mavis/issues) or contact us by email at [mavis@bcgsc.ca](mailto:mavis@bcgsc.ca) 39 | 40 | ## Getting Started 41 | 42 | The simplest way to use MAVIS is via Singularity. The MAVIS docker container used 43 | by singularity will take care of installing the aligner as well. 
44 | 45 | ```bash 46 | pip install -U setuptools pip wheel 47 | pip install mavis_config # also installs snakemake 48 | ``` 49 | 50 | Now you will run mavis via Snakemake as follows 51 | 52 | ```bash 53 | snakemake \ 54 | -j <MAX JOBS> \ 55 | --configfile <YOUR CONFIG> \ 56 | --use-singularity \ 57 | -s Snakefile 58 | ``` 59 | 60 | For other installation options which do not use docker/singularity see the comprehensive install 61 | instructions in the [user manual](https://mavis.readthedocs.io/en/latest/install) 62 | 63 | ## Citation 64 | 65 | If you use MAVIS as a part of your project please cite 66 | 67 | [Reisle,C. et al. (2018) MAVIS: Merging, Annotation, Validation, and Illustration of Structural variants. Bioinformatics.](https://doi.org/10.1093/bioinformatics/bty621) 68 | -------------------------------------------------------------------------------- /docs/background/citations.md: -------------------------------------------------------------------------------- 1 | # Literature 2 | 3 | ## Abyzov-2011 4 | 5 | Abyzov,A. et al. (2011) CNVnator: an approach to discover, genotype, 6 | and characterize typical and atypical CNVs from family and 7 | population genome sequencing. Genome Res., 21, 974--984. 8 | 9 | ## Abyzov-2015 10 | 11 | Abyzov,A. et al. (2015) Analysis of deletion breakpoints from 1,092 12 | humans reveals details of mutation mechanisms. Nat. Commun., 13 | 6, 7256. 14 | 15 | ## Chen-2009 16 | 17 | Chen,K. et al. (2009) BreakDancer: an algorithm for high-resolution 18 | mapping of genomic structural variation. Nat. Methods, 6, 677--681. 19 | 20 | ## Chen-2016 21 | 22 | Chen,X. et al. (2016) Manta: rapid detection of structural variants 23 | and indels for germline and cancer sequencing applications. 24 | Bioinformatics, 32, 1220--1222. 25 | 26 | ## Chiu-2021 27 | 28 | Chiu,R. et al. (2021) Straglr: discovering and genotyping tandem repeat 29 | expansions using whole genome long-read sequences. Genome Biol., 22, 224. 30 | 31 | ## Haas-2017 32 | 33 | Haas,B et al. 
(2017) STAR-Fusion: Fast and Accurate Fusion 34 | Transcript Detection from RNA-Seq. doi: 35 | <https://doi.org/10.1101/120295> 36 | 37 | ## Iyer-2011 38 | 39 | Iyer,M.K. et al. (2011) ChimeraScan: a tool for identifying chimeric 40 | transcription in sequencing data. Bioinformatics, 27, 2903--2904. 41 | 42 | ## MacDonald-2014 43 | 44 | MacDonald,J.R. et al. (2014) The Database of Genomic Variants: a 45 | curated collection of structural variation in the human genome. 46 | Nucleic Acids Res., 42, D986--92. 47 | 48 | ## McPherson-2011 49 | 50 | McPherson,A. et al. (2011) deFuse: an algorithm for gene fusion 51 | discovery in tumor RNA-Seq data. PLoS Comput. Biol., 7, e1001138. 52 | 53 | ## Rausch-2012 54 | 55 | Rausch,T. et al. (2012) DELLY: structural variant discovery by 56 | integrated paired-end and split-read analysis. Bioinformatics, 28, 57 | i333--i339. 58 | 59 | ## Robertson-2010 60 | 61 | Robertson,G. et al. (2010) De novo assembly and analysis of RNA-seq 62 | data. Nat. Methods, 7, 909--912. 63 | 64 | ## Saunders-2012 65 | 66 | Saunders,C.T. et al. (2012) Strelka: accurate somatic small-variant 67 | calling from sequenced tumor--normal sample pairs. Bioinformatics, 68 | 28, 1811--1817. 69 | 70 | ## Uhrig-2021 71 | 72 | Uhrig,S. et al. (2021) Accurate and efficient detection of gene 73 | fusions from RNA sequencing data. Genome Res., 31, 448--460. 74 | 75 | ## Yates-2016 76 | 77 | Yates,A. et al. (2016) Ensembl 2016. Nucleic Acids Res., 44, 78 | D710--D716. 79 | 80 | ## Ye-2009 81 | 82 | Ye,K. et al. (2009) Pindel: a pattern growth approach to detect 83 | break points of large deletions and medium sized insertions from 84 | paired-end short reads. Bioinformatics, 25, 2865--2871. 85 | 86 | ## den-Dunnen-2016 87 | 88 | den Dunnen,J.T. et al. (2016) HGVS Recommendations for the 89 | Description of Sequence Variants: 2016 Update. Hum. Mutat., 37, 90 | 564--569. 
class TestConvertPslxToPysam:
    """
    Tests for Blat.pslx_row_to_pysam, which turns a parsed pslx row into a read
    """

    def test_simple(self):
        """
        A single-block, negative-strand row converts to a read whose cigar is a
        62-base soft clip followed by 142 exact matches, and whose sequence is the
        reverse complement of the row's full query sequence
        """
        query_sequence = (
            'ACATGTGCACAACGTGCAGGTTTGTTACATATGTATACATGTGCCATGTTGGTTTGCTGCACCCATTAACTCGTCCTAGTTTATTACTAGTCTTCAGACATC'
            'CAGAAAATAGAGTAAGATACTAGGTAGACATAACACCTAGATACATCCGTAAGGCATTTGTTTCCTATCACATGGCCCATTCTAGCTTAACACCCACCAACT'
        )
        # mock reference for chromosome 17; the second argument is the offset passed to MockLongString
        chr17_sequence = MockLongString(
            'ACTAGGTGTTATGTCTACCTAGTATCTTACTCTATTTTCTGGATGTCTGAAGACTAGTAATAAACTAGGACGAGTTAATGGGTGCAGCAAACCAACATGGCACATG'
            'TATACATATGTAACAAACCTGCACGTTGTGCACATGTACCCTAAAACTTAAAGTATAAAAAAAAATTTCACTGAGCATAAGACTTCAGACACAAAAGAGTGCATGC'
            'CATATAATTCCATTTATGTGAATTTCAAGAACAATCAGTGATGACAGAAGTCAAAGTAGTGGTCACCTCTGGAAGGTGGGACATTGACC',
            32673407,
        )
        pslx_row = {
            'match': 142,
            'mismatch': 0,
            'repmatch': 0,
            'ncount': 0,
            'qgap_count': 0,
            'qgap_bases': 0,
            'tgap_count': 0,
            'tgap_bases': 0,
            'strand': '-',
            'qname': 'seq1',
            'qsize': 204,
            'qstart': 0,
            'qend': 142,
            'tname': '17',
            'tsize': 81195210,
            'tstart': 32673408,
            'tend': 32673550,
            'block_count': 1,
            'block_sizes': [142],
            'qstarts': [62],
            'tstarts': [32673408],
            '_index': 880,
            'score': 142,
            'percent_ident': 100.0,
            'qseq_full': query_sequence,
        }
        reference = {'17': Mock(seq=chr17_sequence)}
        bam_cache = Mock(reference_id=MockFunction(16))

        converted = Blat.pslx_row_to_pysam(pslx_row, bam_cache, reference)

        assert converted.reference_id == 16
        assert converted.reference_name == '17'
        assert reverse_complement(converted.query_sequence) == pslx_row['qseq_full']
        assert converted.cigar == [(CIGAR.S, 62), (CIGAR.EQ, 142)]

    def test_overlapping_blat_blocks_error(self):
        """
        Conversion must fail with an AssertionError for this multi-block row, where the
        third block (query start 108, size 36) runs one base past the fourth block's
        query start (143)
        """
        query_sequence = (
            'CAAAAGGAAATACCTTCACATAAATTCTAGACGGAAGCAATCTGAGAAACTTTTATTGTGATTTGTGCATTCACTTCACAGAGTTAAAACTTTCTTTTGATT'
            'GAGCAGTTTGAAACTCTGTTTTTGTAGAATCTGCAAGTGGACATTTGGAGCGCTTTGAGGCCTATGGTGGAAAAGGAAATATCTTCACAGGAAAACTAGATA'
            'GAAGTATTCTGAGAAACTTCTTTGTGATGTATGCAGTCATATCTCAGA'
        )
        pslx_row = {
            'strand': '+',
            'qname': 'seq23',
            'tname': '7',
            'block_sizes': [54, 53, 36, 80, 29],
            'qstarts': [0, 55, 108, 143, 223],
            'tstarts': [61279112, 61279166, 61397315, 61990208, 62366144],
            'score': 207,
            'percent_ident': 91.3,
            'qseq_full': query_sequence,
        }
        bam_cache = Mock(reference_id=MockFunction(6))

        with pytest.raises(AssertionError):
            Blat.pslx_row_to_pysam(pslx_row, bam_cache, None)
10 | 11 | | expected name/suffix | file type/format | content | 12 | | --------------------------- | --------------------------------------------------- | ---------------------------------- | 13 | | ``*.raw_evidence.bam`` | [bam](../../../glossary/#bam) | raw evidence | 14 | | ``*.contigs.bam`` | [bam](../../../glossary/#bam) | aligned contigs | 15 | | ``*.evidence.bed`` | [bed](../../../glossary/#bed) | evidence collection window regions | 16 | | ``*.validation-passed.bed`` | [bed](../../../glossary/#bed) | validated event positions | 17 | | ``*.validation-failed.tab`` | text/tabbed | failed events | 18 | | ``*.validation-passed.tab`` | text/tabbed | validated events | 19 | | ``*.contigs.fa`` | [fasta](../../../glossary/#fasta) | assembled contigs | 20 | | ``*.contigs.blat_out.pslx`` | [pslx](../../../glossary/#pslx) | results from blatting contigs | 21 | | ``*.igv.batch`` | [IGV batch file](../../../glossary/#IGV-batch-file) | igv batch file | 22 | 23 | 24 | ## Algorithm Overview 25 | 26 | - (For each breakpoint pair) 27 | 28 | - [Calculate the window/region](../../../background/theory/#calculating-the-evidence-window) to read from the bam and collect 29 | evidence 30 | - Store evidence ([flanking read pair](../../../glossary/#flanking-read-pair), [half-mapped read](../../../glossary/#half-mapped-read), [spanning read](../../../glossary/#spanning-read), [split read](../../../glossary/#split-read), 31 | [compatible flanking pairs](../../../glossary/#compatible-flanking-pairs)) which match the expected event type and position 32 | - Assemble a contig from the collected reads. see [theory - assembling contigs](../../../background/theory/#assembling-contigs) 33 | 34 | - Generate a [fasta](../../../glossary/#fasta) file containing all the contig sequences 35 | - Align contigs to the reference genome (currently [blat](../../../glossary/#blat) is used to perform this step) 36 | - Make the final event calls. 
Each level of calls consumes all supporting reads so they are not re-used in subsequent 37 | levels of calls. 38 | - (For each breakpoint pair) 39 | 40 | - call by contig 41 | - call by [spanning read](../../../glossary/#spanning-read) 42 | - call by [split read](../../../glossary/#split-read) 43 | - call by [flanking read pair](../../../glossary/#flanking-read-pair). see [theory - calling breakpoints by flanking evidence](../../../background/theory/#calling-breakpoints-by-flanking-evidence) 44 | 45 | - Output new calls, evidence, contigs, etc 46 | -------------------------------------------------------------------------------- /tests/full-tutorial.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotate.draw_fusions_only": true, 3 | "cluster.min_clusters_per_file": 100, 4 | "cluster.uninformative_filter": true, 5 | "convert": { 6 | "breakdancer": { 7 | "assume_no_untemplated": true, 8 | "file_type": "breakdancer", 9 | "inputs": [ 10 | "tutorial_data/breakdancer-1.4.5/*txt" 11 | ] 12 | }, 13 | "breakseq": { 14 | "assume_no_untemplated": true, 15 | "file_type": "breakseq", 16 | "inputs": [ 17 | "tutorial_data/breakseq-2.2/breakseq.vcf.gz" 18 | ] 19 | }, 20 | "chimerascan": { 21 | "assume_no_untemplated": true, 22 | "file_type": "chimerascan", 23 | "inputs": [ 24 | "tutorial_data/chimerascan-0.4.5/chimeras.bedpe" 25 | ] 26 | }, 27 | "defuse": { 28 | "assume_no_untemplated": true, 29 | "file_type": "defuse", 30 | "inputs": [ 31 | "tutorial_data/defuse-0.6.2/results.classify.tsv" 32 | ] 33 | }, 34 | "manta": { 35 | "assume_no_untemplated": true, 36 | "file_type": "manta", 37 | "inputs": [ 38 | "tutorial_data/manta-1.0.0/diploidSV.vcf.gz", 39 | "tutorial_data/manta-1.0.0/somaticSV.vcf" 40 | ] 41 | } 42 | }, 43 | "libraries": { 44 | "L1522785992-normal": { 45 | "assign": [ 46 | "breakdancer", 47 | "breakseq", 48 | "manta" 49 | ], 50 | "bam_file": "tutorial_data/L1522785992_normal.sorted.bam", 51 | "disease_status": 
"normal", 52 | "protocol": "genome" 53 | }, 54 | "L1522785992-trans": { 55 | "assign": [ 56 | "chimerascan", 57 | "defuse" 58 | ], 59 | "bam_file": "tutorial_data/L1522785992_trans.sorted.bam", 60 | "disease_status": "diseased", 61 | "protocol": "transcriptome", 62 | "strand_specific": true 63 | }, 64 | "L1522785992-tumour": { 65 | "assign": [ 66 | "breakdancer", 67 | "breakseq", 68 | "manta" 69 | ], 70 | "bam_file": "tutorial_data/L1522785992_tumour.sorted.bam", 71 | "disease_status": "diseased", 72 | "protocol": "genome" 73 | } 74 | }, 75 | "output_dir": "output_dir_full", 76 | "reference.aligner_reference": [ 77 | "reference_inputs/hg19.2bit" 78 | ], 79 | "reference.annotations": [ 80 | "reference_inputs/ensembl69_hg19_annotations.v3.json" 81 | ], 82 | "reference.dgv_annotation": [ 83 | "tests/data/mock_dgv_annotation.tab" 84 | ], 85 | "reference.masking": [ 86 | "reference_inputs/hg19_masking.tab" 87 | ], 88 | "reference.reference_genome": [ 89 | "reference_inputs/hg19.fa" 90 | ], 91 | "reference.template_metadata": [ 92 | "reference_inputs/cytoBand.txt" 93 | ], 94 | "summary.filter_min_flanking_reads": 10, 95 | "summary.filter_min_linking_split_reads": 1, 96 | "summary.filter_min_remapped_reads": 5, 97 | "summary.filter_min_spanning_reads": 5 98 | } 99 | -------------------------------------------------------------------------------- /src/mavis/convert/transabyss.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from ..constants import COLUMNS 4 | from .constants import SUPPORTED_TOOL, TRACKING_COLUMN 5 | 6 | 7 | def convert_row(row): 8 | """ 9 | transforms the transabyss output into the common format for expansion. 
10 | Maps the input column names to column names which MAVIS can read 11 | """ 12 | std_row = {} 13 | if TRACKING_COLUMN not in row: 14 | std_row[TRACKING_COLUMN] = '{}-{}'.format(SUPPORTED_TOOL.TA, row['id']) 15 | 16 | std_row[COLUMNS.event_type] = row.get('rearrangement', row['type']) 17 | for retained_column in ['genes', 'gene']: 18 | if retained_column in row: 19 | std_row['{}_{}'.format(SUPPORTED_TOOL.TA, retained_column)] = row[retained_column] 20 | if std_row[COLUMNS.event_type] in ['LSR', 'translocation']: 21 | del std_row[COLUMNS.event_type] 22 | if 'breakpoint' in row: 23 | std_row[COLUMNS.break1_orientation], std_row[COLUMNS.break2_orientation] = row[ 24 | 'orientations' 25 | ].split(',') 26 | match = re.match( 27 | r'^(?P[^:]+):(?P\d+)\|(?P[^:]+):(?P\d+)$', 28 | row['breakpoint'], 29 | ) 30 | if not match: 31 | raise OSError( 32 | 'file format error: the breakpoint column did not satisfy the expected pattern', row 33 | ) 34 | for group, col in zip( 35 | ['chr1', 'pos1_start', 'chr2', 'pos2_start'], 36 | [ 37 | COLUMNS.break1_chromosome, 38 | COLUMNS.break1_position_start, 39 | COLUMNS.break2_chromosome, 40 | COLUMNS.break2_position_start, 41 | ], 42 | ): 43 | std_row[col] = match[group] 44 | else: 45 | std_row.update( 46 | { 47 | COLUMNS.break1_chromosome: row['chr'], 48 | COLUMNS.break1_position_start: int(row['chr_start']), 49 | COLUMNS.break2_position_start: int(row['chr_end']), 50 | } 51 | ) 52 | if std_row[COLUMNS.event_type] == 'del': 53 | std_row[COLUMNS.break1_position_start] -= 1 54 | std_row[COLUMNS.break2_position_start] += 1 55 | elif std_row[COLUMNS.event_type] == 'ins': 56 | std_row[COLUMNS.break2_position_start] += 1 57 | 58 | # add the untemplated sequence where appropriate 59 | if std_row[COLUMNS.event_type] == 'del': 60 | assert row['alt'] == 'na' 61 | std_row[COLUMNS.untemplated_seq] = '' 62 | elif std_row[COLUMNS.event_type] in ['dup', 'ITD']: 63 | length = ( 64 | std_row[COLUMNS.break2_position_start] - 
std_row[COLUMNS.break1_position_start] + 1 65 | ) 66 | if len(row['alt']) != length: 67 | raise AssertionError( 68 | 'expected alternate sequence to be equal to the length of the event', 69 | len(row['alt']), 70 | length, 71 | row, 72 | std_row, 73 | ) 74 | std_row[COLUMNS.untemplated_seq] = '' 75 | elif std_row[COLUMNS.event_type] == 'ins': 76 | std_row[COLUMNS.untemplated_seq] = row['alt'].upper() 77 | else: 78 | raise NotImplementedError('unexpected indel type', std_row[COLUMNS.event_type]) 79 | return std_row 80 | -------------------------------------------------------------------------------- /tests/test_tools/data/K02718.1.gff3: -------------------------------------------------------------------------------- 1 | K02718.1 Genbank CDS 1140 2813 . + 0 ID=cds-AAA46936.1;Parent=gene-E1;Dbxref=NCBI_GP:AAA46936.1;Name=AAA46936.1;Note=E1 interrupted ORF from 859 to 2813%3B putative;gbkey=CDS;gene=E1;product=replication protein;protein_id=AAA46936.1 2 | K02718.1 Genbank CDS 2755 3852 . + 0 ID=cds-AAA46941.1;Parent=gene-E2;Dbxref=NCBI_GP:AAA46941.1;Name=AAA46941.1;Note=E2 ORF from 2725 to 3852%3B putative;gbkey=CDS;gene=E2;product=regulatory protein;protein_id=AAA46941.1 3 | K02718.1 Genbank CDS 3332 3619 . + 0 ID=cds-AAA46937.1;Parent=gene-E4;Dbxref=NCBI_GP:AAA46937.1;Name=AAA46937.1;gbkey=CDS;gene=E4;partial=true;product=AAA46937.1;protein_id=AAA46937.1;start_range=.,3332 4 | K02718.1 Genbank CDS 3863 4099 . + 0 ID=cds-AAA46938.1;Parent=gene-E5;Dbxref=NCBI_GP:AAA46938.1;Name=AAA46938.1;gbkey=CDS;gene=E5;partial=true;product=AAA46938.1;protein_id=AAA46938.1;start_range=.,3863 5 | K02718.1 Genbank CDS 4235 5656 . + 0 ID=cds-AAA46942.1;Parent=gene-L2;Dbxref=NCBI_GP:AAA46942.1;Name=AAA46942.1;Note=L2 ORF from 4133 to 5656%3B putative;gbkey=CDS;gene=L2;product=minor capsid protein;protein_id=AAA46942.1 6 | K02718.1 Genbank CDS 5559 7154 . 
+ 0 ID=cds-AAA46943.1;Parent=gene-L1;Dbxref=NCBI_GP:AAA46943.1;Name=AAA46943.1;Note=L1 ORF from 5526 to 7154%3B putative;gbkey=CDS;gene=L1;product=major capsid protein;protein_id=AAA46943.1 7 | K02718.1 Genbank CDS 562 858 . + 0 ID=cds-AAA46940.1;Parent=gene-E7;Dbxref=NCBI_GP:AAA46940.1;Name=AAA46940.1;Note=E7 ORF from 544 to 858%3B putative;gbkey=CDS;gene=E7;product=transforming protein;protein_id=AAA46940.1 8 | K02718.1 Genbank CDS 83 559 . + 0 ID=cds-AAA46939.1;Parent=gene-E6;Dbxref=NCBI_GP:AAA46939.1;Name=AAA46939.1;Note=E6 ORF from 65 to 559%3B putative;gbkey=CDS;gene=E6;product=transforming protein;protein_id=AAA46939.1 9 | K02718.1 Genbank CDS 865 1140 . + 0 ID=cds-AAA46936.1;Parent=gene-E1;Dbxref=NCBI_GP:AAA46936.1;Name=AAA46936.1;Note=E1 interrupted ORF from 859 to 2813%3B putative;gbkey=CDS;gene=E1;product=replication protein;protein_id=AAA46936.1 10 | K02718.1 Genbank gene 1140 2813 . + . ID=gene-E1;Name=E1;gbkey=Gene;gene=E1;gene_biotype=protein_coding 11 | K02718.1 Genbank gene 2755 3852 . + . ID=gene-E2;Name=E2;gbkey=Gene;gene=E2;gene_biotype=protein_coding 12 | K02718.1 Genbank gene 3332 3619 . + . ID=gene-E4;Name=E4;gbkey=Gene;gene=E4;gene_biotype=protein_coding 13 | K02718.1 Genbank gene 3863 4099 . + . ID=gene-E5;Name=E5;gbkey=Gene;gene=E5;gene_biotype=protein_coding 14 | K02718.1 Genbank gene 4235 5656 . + . ID=gene-L2;Name=L2;gbkey=Gene;gene=L2;gene_biotype=protein_coding 15 | K02718.1 Genbank gene 5559 7154 . + . ID=gene-L1;Name=L1;gbkey=Gene;gene=L1;gene_biotype=protein_coding 16 | K02718.1 Genbank gene 562 858 . + . ID=gene-E7;Name=E7;gbkey=Gene;gene=E7;gene_biotype=protein_coding 17 | K02718.1 Genbank gene 83 559 . + . ID=gene-E6;Name=E6;gbkey=Gene;gene=E6;gene_biotype=protein_coding 18 | K02718.1 Genbank gene 865 1140 . + . ID=gene-E1;Name=E1;gbkey=Gene;gene=E1;gene_biotype=protein_coding 19 | K02718.1 Genbank region 17 23 . + . ID=id-K02718.1:17..23;gbkey=TATA_signal 20 | K02718.1 Genbank region 1 7904 . + . 
ID=K02718.1:1..7904;Dbxref=taxon:333760;Is_circular=true;gbkey=Src;mol_type=genomic DNA 21 | K02718.1 Genbank region 4213 4218 . + . ID=id-K02718.1:4213..4218;Note=putative;gbkey=polyA_signal 22 | K02718.1 Genbank region 4289 4295 . + . ID=id-L2;gbkey=TATA_signal;gene=L2 23 | K02718.1 Genbank region 65 71 . + . ID=id-K02718.1:65..71;gbkey=TATA_signal 24 | K02718.1 Genbank region 7260 7265 . + . ID=id-K02718.1:7260..7265;gbkey=polyA_signal 25 | -------------------------------------------------------------------------------- /tests/data/mock_reference_annotations.full.json: -------------------------------------------------------------------------------- 1 | {"genes": [{"aliases": ["C9orf47"], "chr": "fakereference9", "end": 5278, "name": "ENSG00000186354", "start": 1, "strand": "+", "transcripts": [{"aliases": [], "cdna_coding_end": 685, "cdna_coding_start": 134, "domains": [], "end": 5278, "exons": [{"end": 322, "start": 1}, {"end": 833, "start": 608}, {"end": 5278, "start": 990}], "is_best_transcript": true, "name": "ENST00000375851", "start": 1}, {"aliases": [], "cdna_coding_end": 783, "cdna_coding_start": 76, "domains": [], "end": 1202, "exons": [{"end": 322, "start": 59}, {"end": 1202, "start": 608}], "is_best_transcript": false, "name": "ENST00000375850", "start": 59}, {"aliases": [], "cdna_coding_end": 677, "cdna_coding_start": 69, "domains": [], "end": 5278, "exons": [{"end": 379, "start": 66}, {"end": 833, "start": 608}, {"end": 5278, "start": 990}], "is_best_transcript": false, "name": "ENST00000334490", "start": 66}]}, {"aliases": ["S1PR3"], "chr": "fakereference9", "end": 14148, "name": "ENSG00000213694", "start": 585, "strand": "+", "transcripts": [{"aliases": [], "cdna_coding_end": 1533, "cdna_coding_start": 397, "domains": [{"name": "PR00362", "regions": [{"end": 62, "start": 49}, {"end": 200, "start": 185}, {"end": 308, "start": 298}]}, {"name": "PR00642", "regions": [{"end": 75, "start": 63}, {"end": 112, "start": 102}, {"end": 155, "start": 139}, 
{"end": 345, "start": 329}]}, {"name": "PS50262", "regions": [{"end": 298, "start": 56}]}, {"name": "PF10320", "regions": [{"end": 312, "start": 55}]}, {"name": "SSF81321", "regions": [{"end": 340, "start": 1}]}, {"name": "PR00237", "regions": [{"end": 65, "start": 41}, {"end": 95, "start": 74}, {"end": 140, "start": 118}, {"end": 174, "start": 153}, {"end": 219, "start": 196}, {"end": 265, "start": 241}, {"end": 306, "start": 280}]}, {"name": "PR01523", "regions": [{"end": 25, "start": 13}, {"end": 101, "start": 92}, {"end": 123, "start": 112}, {"end": 204, "start": 194}, {"end": 224, "start": 215}, {"end": 283, "start": 272}, {"end": 311, "start": 301}]}, {"name": "PF00001", "regions": [{"end": 298, "start": 56}]}, {"name": "PR01524", "regions": [{"end": 40, "start": 24}, {"end": 155, "start": 139}, {"end": 233, "start": 223}, {"end": 323, "start": 314}, {"end": 340, "start": 326}]}], "end": 14148, "exons": [{"end": 833, "start": 585}, {"end": 14148, "start": 10192}], "is_best_transcript": false, "name": "ENST00000358157", "start": 585}, {"aliases": [], "cdna_coding_end": 5832, "cdna_coding_start": 4696, "domains": [{"name": "PF10320", "regions": [{"end": 312, "start": 55}]}, {"name": "PR00362", "regions": [{"end": 62, "start": 49}, {"end": 200, "start": 185}, {"end": 308, "start": 298}]}, {"name": "PS50262", "regions": [{"end": 298, "start": 56}]}, {"name": "PR00642", "regions": [{"end": 75, "start": 63}, {"end": 112, "start": 102}, {"end": 155, "start": 139}, {"end": 345, "start": 329}]}, {"name": "PR00237", "regions": [{"end": 65, "start": 41}, {"end": 95, "start": 74}, {"end": 140, "start": 118}, {"end": 174, "start": 153}, {"end": 219, "start": 196}, {"end": 265, "start": 241}, {"end": 306, "start": 280}]}, {"name": "PR01523", "regions": [{"end": 25, "start": 13}, {"end": 101, "start": 92}, {"end": 123, "start": 112}, {"end": 204, "start": 194}, {"end": 224, "start": 215}, {"end": 283, "start": 272}, {"end": 311, "start": 301}]}, {"name": "PR01524", 
"regions": [{"end": 40, "start": 24}, {"end": 155, "start": 139}, {"end": 233, "start": 223}, {"end": 323, "start": 314}, {"end": 340, "start": 326}]}, {"name": "PF00001", "regions": [{"end": 298, "start": 56}]}, {"name": "SSF81321", "regions": [{"end": 340, "start": 1}]}], "end": 14148, "exons": [{"end": 14148, "start": 5644}], "is_best_transcript": true, "name": "ENST00000375846", "start": 5644}]}]} -------------------------------------------------------------------------------- /docs/tutorials/annotation.md: -------------------------------------------------------------------------------- 1 | # Annotation Only 2 | 3 | Sometimes you have a set of variants and would simply like to run the annotate step of MAVIS to visualize and annotate them. 4 | 5 | First you need to create your basic config to tell MAVIS where the reference files you want to use are and some minimal information about the library/sample you want to process. 6 | 7 | Here is an example config where the user has created a minimal input file in the MAVIS standard input file format. We convert it to expand any unknowns (ex. 
SV type if left blank) 8 | 9 | ```json 10 | { 11 | "libraries": { 12 | "my_library": { 13 | "assign": ["my_converted_file"], 14 | "disease_status": "normal", 15 | "protocol": "genome" 16 | } 17 | }, 18 | "convert": { 19 | "my_converted_file": { 20 | "inputs": ["/path/to/file/structural_variants.txt"], 21 | "file_type": "mavis" 22 | } 23 | }, 24 | "cluster.split_only": true, 25 | "skip_stage.validate": true, 26 | "output_dir": "my_output_dir", 27 | "reference.annotations": "/path/to/mavis/reference_files/ensembl79_hg38_annotations.json", 28 | "reference.template_metadata": "/path/to/mavis/reference_files/hg38_cytoBand.txt", 29 | "reference.reference_genome": "/path/to/hg38_no_alt/genome/hg38_no_alt.fa", 30 | "reference.masking": "/path/to/mavis/reference_files/masking_hg38.adjusted.tab", 31 | "reference.dgv_annotation": "/path/to/mavis/reference_files/dgv_hg38_annotations.tab" 32 | } 33 | ``` 34 | 35 | Another example is given in the MAVIS tests folder under `tests/mini-tutorial.annotate_only.config.json` which looks like this 36 | 37 | ```json 38 | { 39 | "annotate.draw_fusions_only": false, 40 | "convert": { 41 | "mock_converted": { 42 | "inputs": [ 43 | "tests/data/mock_sv_events.tsv" 44 | ], 45 | "file_type": "mavis", 46 | "assume_no_untemplated": true 47 | } 48 | }, 49 | "skip_stage.validate": true, 50 | "cluster.uninformative_filter": true, 51 | "cluster.limit_to_chr": null, 52 | "cluster.min_clusters_per_file": 5, 53 | "libraries": { 54 | "mock-A47933": { 55 | "assign": [ 56 | "tests/data/mock_trans_sv_events.tsv" 57 | ], 58 | "bam_file": "tests/data/mock_trans_reads_for_events.sorted.bam", 59 | "disease_status": "diseased", 60 | "protocol": "transcriptome", 61 | "strand_specific": true 62 | }, 63 | "mock-A36971": { 64 | "assign": [ 65 | "mock_converted" 66 | ], 67 | "bam_file": "tests/data/mock_reads_for_events.sorted.bam", 68 | "disease_status": "diseased", 69 | "protocol": "genome", 70 | "strand_specific": false 71 | } 72 | }, 73 | "output_dir": 
"output_dir", 74 | "reference.annotations": [ 75 | "tests/data/mock_annotations.json" 76 | ], 77 | "reference.dgv_annotation": [ 78 | "tests/data/mock_dgv_annotation.txt" 79 | ], 80 | "reference.masking": [ 81 | "tests/data/mock_masking.tab" 82 | ], 83 | "reference.reference_genome": [ 84 | "tests/data/mock_reference_genome.fa" 85 | ], 86 | "reference.template_metadata": [ 87 | "tests/data/cytoBand.txt" 88 | ] 89 | } 90 | ``` 91 | 92 | Either of these configurations can be run with the following command simply by changing the configfile argument 93 | 94 | ```bash 95 | snakemake -j 1 \ 96 | --configfile tests/mini-tutorial.annotate_only.config.json \ 97 | -s Snakefile 98 | ``` 99 | -------------------------------------------------------------------------------- /tests/data/transabyss_indels_output.tab: -------------------------------------------------------------------------------- 1 | id type chr chr_start chr_end ctg ctg_len ctg_start ctg_end len ref alt event_reads contig_reads genome_reads gene repeat-length ctg_strand from_end confirm_contig_region within_simple_repeats repeatmasker within_segdup at_least_1_read_opposite dbsnp 2 | 1 ins 1 8877520 8877520 4542232 58938 23102 23103 2 na tt 41 41 47 RERE:uc001apf.3:exon1|synon 0 + 23101 23102-23117 - - - false - 3 | 2 ins 1 16011005 16011005 4541011 129199 97246 97248 3 na ggc 22 22 25 PLEKHM2:uc010obo.2:exon1|synon 0 - 31951 97234-97248 TRF_SimpleTandemRepeat_GCG (CGG)n - false - 4 | 3 ins 1 16926227 16926227 4624842 952 419 419 1 na t 46 46 68 NBPF1:uc001aza.5:exon3|na 0 - 418 414-419 - L1ME3 chr1:21766304 false - 5 | 4 ins 1 17026040 17026040 4529033 986 780 794 15 na gcggcggcggcggca 35 35 23 ESPNP:uc001azn.1:exon8|P431_P432insLPPPP 0 + 192 780-794 - (CGG)n chr1:6487720 false - 6 | 5 ins 1 17026043 17026043 4521063 925 99 143 45 na gcggcggcggcggcggcggcggcggcggcagcagcagcagcagca 6 6 8 ESPNP:uc001azn.1:exon8|L430_P431insLLLLLLPPPPPPPPP 0 - 98 99-143 - (CGG)n chr1:6487720 false - 7 | 1175 del X 142715897 142715924 
4547857 78777 52728 52728 28 ttttt...ttttt na 34 34 17 SLITRK4:uc022cfl.1:exon2|SLITRK4:uc022cfl.1:exon2|synon 0 + 25889 52728-52728 TRF_SimpleTandemRepeat_T (T)n - false - 8 | 1176.1 del X 149115835 149115836 indel_k96_4578561 1263 1145 1145 2 ga na 37 30 2 LINC00894:uc004fed.1:exon1|LINC00894:uc004fed.1:exon1|na 0 + 118 1145-1149 - - chrX:148613958 false - 9 | 1176.2 del X 149115835 149115836 indel_5327 1263 119 119 2 ga na 37 7 2 LINC00894:uc004fed.1:exon1|LINC00894:uc004fed.1:exon1|na 0 - 118 115-119 - - chrX:148613958 false - 10 | 1177 del X 153523769 153523790 4654686 26033 2836 2836 22 gcacc...gtgcg na 8 8 1 TEX28:uc010nut.1:exon1|TEX28:uc010nut.1:exon1|synon 0 + 2835 2836-2924 TRF_SimpleTandemRepeat_CACGTGCGGCACCACCCCCTGA - - false - 11 | 1178 del X 154997577 154997583 4522314 63590 44595 44595 7 ttttgtt na 28 28 23 SPRY3:uc004fnq.1:exon1|SPRY3:uc004fnq.1:exon1|synon 0 + 18995 44595-44595 TRF_SimpleTandemRepeat_TTTTG (TTTTG)n chrY:59033286 false - 12 | 1181 dup 12 13029070 13029073 4659122 38006 26858 26861 4 na aaaa 38 38 25 RPL13AP20:uc010sho.2:exon1;3utr|NA:NA:NA|NA 0 - 11139 26838-26861 - (A)n - false - 13 | 1182 dup 12 121839158 121839167 4619408 122056 113544 113553 10 na aaaaaaaaaa 34 34 15 BC029038:uc001uan.3:exon1;3utr|NA:NA:NA|NA 0 - 8503 113524-113553 - L2c - false - 14 | 1183 dup 15 44094768 44094775 4533713 84196 41867 41874 8 na aaaaaaaa 7 7 1 SERF2-C15ORF63:uc001ztb.3:exon6;3utr|NA:NA:NA|NA 0 + 41866 41867-41890 - AluSq4 - false - 15 | 1184 dup 6 27515484 27515553 4632026 88843 27311 27380 70 na ggaaaacaaaaggtccaggaaaaggatatatacatatatcttcgagcaggttccaccgagacttgaactc 131 131 24 NA:NA:NA|TRNA_Gln:uc021yqh.1:3utr;exon1|NA 0 - 27261 27241-27380 - tRNA-Gln-CAG - false - 16 | 1185 dup GL000211.1 108677 108683 4632141 14477 5082 5088 7 na aaaaaaa 33 33 19 FLJ43315:uc003boa.3:exon5;3utr|NA:NA:NA|NA 0 + 5081 5082-5102 - FLAM_A chr9:69378660 false - 17 | 1232 ITD 9 132345740 132345781 3298328 190 96 137 42 na tccatcccttcacctccactaagatcagggcaccccaggagt 9 
class TestHelpMenu:
    """
    Checks the exit status of the command line entry point when it is asked for
    help (expected to be 0) or given an unknown option (expected to be non-zero)
    """

    @staticmethod
    def _exit_code(argv):
        """
        Run ``main`` with ``sys.argv`` patched to ``argv``

        Returns:
            the value returned by ``main``, or the code of the ``SystemExit`` it raised
        """
        with patch.object(sys, 'argv', argv):
            try:
                return main()
            except SystemExit as err:
                return err.code

    def test_main(self):
        assert self._exit_code(['mavis', '-h']) == 0

    def test_pipeline(self):
        assert self._exit_code(['mavis', SUBCOMMAND.SETUP, '-h']) == 0

    def test_cluster(self):
        assert self._exit_code(['mavis', SUBCOMMAND.CLUSTER, '-h']) == 0

    def test_validate(self):
        assert self._exit_code(['mavis', SUBCOMMAND.VALIDATE, '-h']) == 0

    def test_annotate(self):
        assert self._exit_code(['mavis', SUBCOMMAND.ANNOTATE, '-h']) == 0

    def test_pairing(self):
        assert self._exit_code(['mavis', SUBCOMMAND.PAIR, '-h']) == 0

    def test_summary(self):
        assert self._exit_code(['mavis', SUBCOMMAND.SUMMARY, '-h']) == 0

    def test_convert(self):
        assert self._exit_code(['mavis', SUBCOMMAND.CONVERT, '-h']) == 0

    def test_overlay(self):
        assert self._exit_code(['mavis', SUBCOMMAND.OVERLAY, '-h']) == 0

    def test_bad_option(self):
        # an unrecognized option must not be reported as a success
        assert self._exit_code(['mavis', SUBCOMMAND.SETUP, '--blargh']) != 0

    def test_ref_alt_count(self):
        assert self._exit_code(['calculate_ref_alt_counts', '-h']) == 0
-------------------------------------------------------------------------------- 1 | tracking_id event_type break1_chromosome break1_position_start break1_position_end break1_orientation break1_strand break1_seq break2_chromosome break2_position_start break2_position_end break2_orientation break2_strand break2_seq opposing_strands stranded tools 2 | nsv482937 None 1 1 1 L ? None 1 2300000 2300000 L ? None True False bed 3 | nsv482937 None 1 1 1 L ? None 1 2300000 2300000 R ? None False False bed 4 | nsv482937 None 1 1 1 R ? None 1 2300000 2300000 L ? None False False bed 5 | nsv482937 None 1 1 1 R ? None 1 2300000 2300000 R ? None True False bed 6 | dgv1n82 None 1 10001 10001 L ? None 1 22118 22118 L ? None True False bed 7 | dgv1n82 None 1 10001 10001 L ? None 1 22118 22118 R ? None False False bed 8 | dgv1n82 None 1 10001 10001 R ? None 1 22118 22118 L ? None False False bed 9 | dgv1n82 None 1 10001 10001 R ? None 1 22118 22118 R ? None True False bed 10 | rgv2n98 None 1 10001 10001 L ? None 1 22120 22120 L ? None True False bed 11 | rgv2n98 None 1 10001 10001 L ? None 1 22120 22120 R ? None False False bed 12 | rgv2n98 None 1 10001 10001 R ? None 1 22120 22120 L ? None False False bed 13 | rgv2n98 None 1 10001 10001 R ? None 1 22120 22120 R ? None True False bed 14 | dgv2n99 None 1 10001 10501 R ? None 1 15000 15000 R ? None True False bed 15 | rgv2n99 None 1 10001 10001 L ? None 1 22222 22222 L ? None True False bed 16 | rgv2n99 None 1 10001 10001 L ? None 1 22222 22222 R ? None False False bed 17 | rgv2n99 None 1 10001 10001 R ? None 1 22222 22222 L ? None False False bed 18 | rgv2n99 None 1 10001 10001 R ? None 1 22222 22222 R ? None True False bed 19 | nsv7879 None 1 10001 10001 L ? None 1 127330 127330 L ? None True False bed 20 | nsv7879 None 1 10001 10001 L ? None 1 127330 127330 R ? None False False bed 21 | nsv7879 None 1 10001 10001 R ? None 1 127330 127330 L ? None False False bed 22 | nsv7879 None 1 10001 10001 R ? None 1 127330 127330 R ? 
None True False bed 23 | nsv958854 None 1 10191 10191 L ? None 1 10281 10281 L ? None True False bed 24 | nsv958854 None 1 10191 10191 L ? None 1 10281 10281 R ? None False False bed 25 | nsv958854 None 1 10191 10191 R ? None 1 10281 10281 L ? None False False bed 26 | nsv958854 None 1 10191 10191 R ? None 1 10281 10281 R ? None True False bed 27 | nsv428112 None 1 10377 10377 L ? None 1 177417 177417 L ? None True False bed 28 | nsv428112 None 1 10377 10377 L ? None 1 177417 177417 R ? None False False bed 29 | nsv428112 None 1 10377 10377 R ? None 1 177417 177417 L ? None False False bed 30 | nsv428112 None 1 10377 10377 R ? None 1 177417 177417 R ? None True False bed 31 | esv2758911 None 1 10377 10377 L ? None 1 1018704 1018704 L ? None True False bed 32 | esv2758911 None 1 10377 10377 L ? None 1 1018704 1018704 R ? None False False bed 33 | esv2758911 None 1 10377 10377 R ? None 1 1018704 1018704 L ? None False False bed 34 | esv2758911 None 1 10377 10377 R ? None 1 1018704 1018704 R ? None True False bed 35 | esv27265 None 1 10499 10499 L ? None 1 177368 177368 L ? None True False bed 36 | esv27265 None 1 10499 10499 L ? None 1 177368 177368 R ? None False False bed 37 | esv27265 None 1 10499 10499 R ? None 1 177368 177368 L ? None False False bed 38 | esv27265 None 1 10499 10499 R ? None 1 177368 177368 R ? None True False bed 39 | nsv1147468 None 1 11099 11099 L ? None 1 47000 47000 L ? None True False bed 40 | nsv1147468 None 1 11099 11099 L ? None 1 47000 47000 R ? None False False bed 41 | nsv1147468 None 1 11099 11099 R ? None 1 47000 47000 L ? None False False bed 42 | nsv1147468 None 1 11099 11099 R ? None 1 47000 47000 R ? None True False bed 43 | dgv1n106 None 1 11100 11100 L ? None 1 29200 29200 L ? None True False bed 44 | dgv1n106 None 1 11100 11100 L ? None 1 29200 29200 R ? None False False bed 45 | dgv1n106 None 1 11100 11100 R ? None 1 29200 29200 L ? None False False bed 46 | dgv1n106 None 1 11100 11100 R ? None 1 29200 29200 R ? 
def parse_arguments():
    """
    parse command line arguments

    Returns:
        argparse.Namespace: the parsed arguments (``output``, ``input``, ``min_length``
        and ``repeat_seq``, where ``repeat_seq`` is always a list of strings)

    Exits through ``parser.error`` when ``--min_length`` is below 2, when an empty
    repeat sequence is given, or when the input file does not exist
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-o', '--output', help='path to the output file', required=True, metavar='FILEPATH'
    )
    parser.add_argument(
        '-n',
        '--input',
        required=True,
        metavar='FILEPATH',
        help='Path to the Input reference genome fasta file',
    )
    parser.add_argument(
        '--min_length',
        default=20,
        type=int,
        help='Minimum total length of the repeat region to find',
        metavar='INT',
    )
    parser.add_argument(
        '--repeat_seq',
        # a list, so the default has the same type as values collected by nargs='+'
        default=['N'],
        type=str,
        help='Repeat sequence to look for. Case insensitive',
        nargs='+',
    )
    args = parser.parse_args()
    if args.min_length < 2:
        parser.error('argument --min_length: cannot specify a shorter repeat than 2 bases')
    if not all(args.repeat_seq):
        # an empty repeat sequence would never advance the scan position in main()
        parser.error('argument --repeat_seq: cannot specify an empty repeat sequence')
    if not os.path.exists(args.input):
        parser.error('argument --input: File does not exist')
    return args


def main():
    """
    Scan the input reference genome for runs of each repeat sequence and write them
    to the output file

    The output starts with ``##`` comment lines recording the script name and the
    arguments used, followed by a ``chr/start/end/name`` header and one tab-delimited
    row per reported span. A run is reported when its length is at least
    ``--min_length`` and at least two copies of the repeat sequence. A leading ``chr``
    is removed from the sequence names, and a sequence identical to one already
    scanned is skipped
    """
    args = parse_arguments()
    # lower-cased and de-duplicated so the search is case insensitive
    repeat_sequences = sorted({s.lower() for s in args.repeat_seq})
    log('loading:', args.input)
    reference_genome = load_reference_genome(args.input)
    comments = [
        os.path.basename(__file__),
        'input: {}'.format(args.input),
        'min_length: {}'.format(args.min_length),
        'repeat_seq: {}'.format(', '.join(args.repeat_seq)),
    ]
    log('writing:', args.output)
    with open(args.output, 'w') as fh:
        for comment in comments:
            fh.write('## {}\n'.format(comment))
        fh.write('chr\tstart\tend\tname\n')
        visited = set()
        for chrom, seq in sorted(reference_genome.items()):
            if chrom.startswith('chr'):
                chrom = chrom[3:]
            seq = str(seq.seq).lower()
            if seq in visited:
                continue
            visited.add(seq)
            spans = []
            for repseq in repeat_sequences:
                log(
                    'finding {}_repeat (min_length: {}), for chr{} (length: {})'.format(
                        repseq, args.min_length, chrom, len(seq)
                    )
                )
                index = 0
                while index < len(seq):
                    next_n = seq.find(repseq, index)
                    if next_n < 0:
                        break
                    # extend over every consecutive copy of the repeat sequence
                    index = next_n
                    while seq.startswith(repseq, index):
                        index += len(repseq)
                    # next_n is a 0-based string offset, hence the + 1 for the span start
                    span = BioInterval(chrom, next_n + 1, index, name='repeat_{}'.format(repseq))
                    if len(span) >= args.min_length and len(span) >= 2 * len(repseq):
                        spans.append(span)
            log('found', len(spans), 'spans', time_stamp=False)
            for span in spans:
                fh.write(
                    '{}\t{}\t{}\t{}\n'.format(
                        span.reference_object, span.start, span.end, span.name
                    )
                )


if __name__ == '__main__':
    main()
reference11 10000 10000 reference11 10030 10030 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 11:1651586|11:1651615 14 | False reference12 2001 2001 reference12 2120 2120 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 12:14945389|12:14945509 15 | False reference10 3609 3609 reference10 3818 3818 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 10:7793830|10:7794039 16 | False reference10 8609 8609 reference10 8927 8927 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 10:100025136|10:100025454 17 | False reference10 12609 12609 reference10 13123 13123 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 10:18503076|10:18503590 18 | False reference10 17109 17109 reference10 17899 17899 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 10:127745195|10:127745985 19 | False reference10 22109 22109 reference10 24330 24330 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 10:108030321|10:108032542 20 | False reference10 28109 28109 reference10 31827 31827 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 10:132909062|10:132912780 21 | False reference10 36109 36109 reference10 42159 42159 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 10:6411580|10:6417630 22 | False reference12 6001 6001 reference12 6016 6016 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 12:127413217|12:127413233 complex event 23 | False reference1 2000 2000 reference1 2001 2001 L R + + insertion genome convert_ta.py_v0.0.1 mock-A36971 1:8877520 24 | False reference16 2000 2000 reference16 2001 2001 L R + + insertion genome convert_ta.py_v0.0.1 mock-A36971 16:57847634 25 | False reference12 10000 10000 reference12 10001 10021 R L + + duplication genome convert_ta.py_v0.0.1 mock-A36971 12:53207583 reported as an insertion 26 | False reference17 1974 1974 reference17 2020 2020 R L + + duplication genome convert_ta.py_v0.0.1 mock-A36971 17:72889676 reported as an insertion 27 | False gene3 27175 27175 gene3 27176 27176 R L + + 
duplication genome convert_ta.py_v0.0.1 mock-A36971 1:207249992 28 | False gene5 608 608 gene1 33309 33309 R R + - inverted translocation genome convert_ta.py_v0.0.1 mock-A36971 7:26252971|15:40854190 29 | False gene2 19827 19827 gene2 27045 27045 R L + + duplication genome convert_ta.py_v0.0.1 mock-A36971 15:41621292|15:41628510 30 | False gene6 77430 77430 gene6 89472 89472 L R + + deletion genome convert_ta.py_v0.0.1 mock-A36971 10:89700299|10:89712341 31 | -------------------------------------------------------------------------------- /tests/data/build.cfg: -------------------------------------------------------------------------------- 1 | [general] 2 | batch_id = batch-aMfNsjq7NgyaJFfhU9ZHQS 3 | output_dir = /var/tmp/tmpfojhl9g1 4 | scheduler = SLURM 5 | concurrency_limit = None 6 | 7 | [MS_batch-aMfNsjq7NgyaJFfhU9ZHQS] 8 | stage = summary 9 | job_ident = None 10 | name = MS_batch-aMfNsjq7NgyaJFfhU9ZHQS 11 | dependencies = MP_batch-aMfNsjq7NgyaJFfhU9ZHQS 12 | script = /var/tmp/tmpfojhl9g1/summary/submit.sh 13 | status = UNKNOWN 14 | output_dir = /var/tmp/tmpfojhl9g1/summary 15 | stdout = /var/tmp/tmpfojhl9g1/summary/job-{name}-{job_ident}.log 16 | memory_limit = 16000 17 | queue = 18 | time_limit = 57600 19 | import_env = True 20 | mail_user = 21 | mail_type = NONE 22 | 23 | [MP_batch-aMfNsjq7NgyaJFfhU9ZHQS] 24 | stage = pairing 25 | job_ident = None 26 | name = MP_batch-aMfNsjq7NgyaJFfhU9ZHQS 27 | dependencies = MA_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS 28 | MA_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS 29 | script = /var/tmp/tmpfojhl9g1/pairing/submit.sh 30 | status = UNKNOWN 31 | output_dir = /var/tmp/tmpfojhl9g1/pairing 32 | stdout = /var/tmp/tmpfojhl9g1/pairing/job-{name}-{job_ident}.log 33 | memory_limit = 16000 34 | queue = 35 | time_limit = 57600 36 | import_env = True 37 | mail_user = 38 | mail_type = NONE 39 | 40 | [MV_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS] 41 | stage = validate 42 | job_ident = None 43 | name = 
MV_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS 44 | dependencies = 45 | script = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/validate/submit.sh 46 | status = UNKNOWN 47 | output_dir = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/validate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID 48 | stdout = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/validate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID/job-{name}-{job_ident}_{task_ident}.log 49 | memory_limit = 16000 50 | queue = 51 | time_limit = 57600 52 | import_env = True 53 | mail_user = 54 | mail_type = NONE 55 | task_list = 1 56 | 57 | [MV_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS] 58 | stage = validate 59 | job_ident = None 60 | name = MV_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS 61 | dependencies = 62 | script = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/validate/submit.sh 63 | status = UNKNOWN 64 | output_dir = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/validate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID 65 | stdout = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/validate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID/job-{name}-{job_ident}_{task_ident}.log 66 | memory_limit = 18000 67 | queue = 68 | time_limit = 57600 69 | import_env = True 70 | mail_user = 71 | mail_type = NONE 72 | task_list = 1 73 | 74 | [MA_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS] 75 | stage = annotate 76 | job_ident = None 77 | name = MA_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS 78 | dependencies = MV_mock-A36971_batch-aMfNsjq7NgyaJFfhU9ZHQS 79 | script = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/annotate/submit.sh 80 | status = UNKNOWN 81 | output_dir = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/annotate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID 82 | stdout = /var/tmp/tmpfojhl9g1/mock-A36971_diseased_genome/annotate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID/job-{name}-{job_ident}_{task_ident}.log 83 | memory_limit = 12000 84 | queue = 85 | 
time_limit = 57600 86 | import_env = True 87 | mail_user = 88 | mail_type = NONE 89 | task_list = 1 90 | 91 | [MA_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS] 92 | stage = annotate 93 | job_ident = None 94 | name = MA_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS 95 | dependencies = MV_mock-A47933_batch-aMfNsjq7NgyaJFfhU9ZHQS 96 | script = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/annotate/submit.sh 97 | status = UNKNOWN 98 | output_dir = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/annotate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID 99 | stdout = /var/tmp/tmpfojhl9g1/mock-A47933_diseased_transcriptome/annotate/batch-aMfNsjq7NgyaJFfhU9ZHQS-$$SLURM_ARRAY_TASK_ID/job-{name}-{job_ident}_{task_ident}.log 100 | memory_limit = 12000 101 | queue = 102 | time_limit = 57600 103 | import_env = True 104 | mail_user = 105 | mail_type = NONE 106 | task_list = 1 107 | 108 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Getting Started 2 | 3 | If you are new to the project a good way to get started is by adding to the documentation, or adding unit tests where 4 | there is a lack of code coverage. 5 | 6 | ## Install (for Development) 7 | 8 | Clone the repository and switch to the development branch 9 | 10 | ```bash 11 | git clone https://github.com/bcgsc/mavis.git 12 | cd mavis 13 | git checkout develop 14 | ``` 15 | 16 | Set up a python virtual environment. If you are developing in python setting up with a virtual environment can be 17 | incredibly helpful as it allows for a clean install to test. Instructions for setting up the environment 18 | are below 19 | 20 | ```bash 21 | python3 -m venv venv 22 | source venv/bin/activate 23 | ``` 24 | 25 | Install the MAVIS python package. 
Running the setup in develop mode will ensure that your code changes are run when you 26 | run MAVIS from within that virtual environment 27 | 28 | ```bash 29 | pip install -e .[dev] 30 | ``` 31 | 32 | Run the tests and compute code coverage 33 | 34 | ```bash 35 | pytest tests 36 | ``` 37 | 38 | ## Build the Documentation 39 | 40 | ```bash 41 | pip install .[docs] 42 | markdown_refdocs mavis -o docs/package --link 43 | mkdocs build 44 | ``` 45 | 46 | The contents of the user manual can then be viewed by opening the build-docs/index.html 47 | in any available web browser (e.g. google-chrome, firefox) 48 | 49 | ## Deploy to PyPI 50 | 51 | Install deployment dependencies 52 | 53 | ```bash 54 | pip install .[deploy] 55 | ``` 56 | 57 | Build the distribution files 58 | 59 | ```bash 60 | python setup.py install sdist bdist_wheel 61 | ``` 62 | 63 | Use twine to upload 64 | 65 | ```bash 66 | twine upload -r pypi dist/* 67 | ``` 68 | 69 | ## Reporting a Bug 70 | 71 | Please make sure to search through the issues before reporting a bug to ensure there isn't 72 | already an open issue. 
73 | 74 | ## Conventions 75 | 76 | ### Linting 77 | 78 | Use [black](https://github.com/psf/black) with strings off and line length 100 79 | 80 | ```bash 81 | black src/mavis -S -l 100 82 | ``` 83 | 84 | ### Docstrings 85 | 86 | docstrings should follow [sphinx google code style](http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) 87 | 88 | if you want to be more explicit with nested types, please follow the same format 89 | used by [python type annotations](https://docs.python.org/3/library/typing.html) 90 | 91 | ```text 92 | arg1 (List[str]): a list of strings 93 | ``` 94 | 95 | However using proper type annotations is preferred for new code and then only including the 96 | description of the parameter in the docstring and not its type 97 | 98 | ```python 99 | 100 | def some_function(some_arg: List[str]) -> None: 101 | """ 102 | Args: 103 | some_arg: this arg does stuff 104 | """ 105 | ``` 106 | 107 | ### Output Columns 108 | 109 | any column name which may appear in any of the intermediate or final output files must be defined in `mavis.constants.COLUMNS` as well as added to the [columns glossary](../outputs/columns) 110 | 111 | ### Tests 112 | 113 | - all new code must have unit tests in the tests subdirectory 114 | 115 | Tests can be run as follows 116 | 117 | ```bash 118 | pytest tests 119 | ``` 120 | 121 | ### Branching Model 122 | 123 | If you are working on a large feature, create a base branch for the feature off develop. Generally 124 | these follow the naming pattern 125 | 126 | ```bash 127 | git checkout -b integration/issue-- 128 | ``` 129 | 130 | If you are working on a smaller feature then simply make a feature branch off develop 131 | 132 | ```bash 133 | git checkout -b feature/issue-- 134 | ``` 135 | 136 | Once ready, a PR should be made to develop and review should be requested from the other developers. 
137 | 138 | Releases are done by creating a release branch off develop 139 | 140 | ```bash 141 | git checkout -b release/vX.X.X 142 | ``` 143 | 144 | Updating the version number in setup.py in the release branch, and then making a PR to master. 145 | After the PR has been merged to master a tag/release should be created with the release notes 146 | and a PR to merge master back into develop should be made 147 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: build 5 | 6 | on: 7 | push: 8 | branches: 9 | - master 10 | - develop 11 | pull_request: 12 | 13 | jobs: 14 | build: 15 | runs-on: ubuntu-20.04 16 | strategy: 17 | matrix: 18 | python-version: ["3.7", "3.8", "3.9", "3.10"] 19 | name: python-${{ matrix.python-version }} 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: install machine dependencies 23 | run: | 24 | sudo apt-get update 25 | sudo apt-get install -y libcurl4-openssl-dev 26 | - name: Set up Python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v2 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | - name: Install dependencies 31 | run: | 32 | python -m pip install --upgrade pip setuptools 33 | pip install -e .[test] # need editable to make sure the coverage reports correctly 34 | - name: install bwa 35 | run: | 36 | git clone https://github.com/lh3/bwa.git 37 | cd bwa 38 | git checkout v0.7.17 39 | make 40 | cd .. 
41 | - name: install blat 42 | run: | 43 | wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/blat/blat 44 | chmod a+x blat 45 | - name: set up .pth file 46 | run: | 47 | python tests/setup_subprocess_cov.py 48 | - name: run full tests with pytest 49 | run: | 50 | export PATH=$PATH:$(pwd):$(pwd)/bwa 51 | export COVERAGE_PROCESS_START=$(pwd)/.coveragerc 52 | 53 | pytest tests -v \ 54 | --junitxml=junit/test-results-${{ matrix.python-version }}.xml \ 55 | --cov mavis \ 56 | --cov tools.convert_annotations_format \ 57 | --cov-report term-missing \ 58 | --cov-report xml \ 59 | --durations=10 \ 60 | --cov-branch 61 | env: 62 | RUN_FULL: 1 63 | - name: Upload pytest test results 64 | uses: actions/upload-artifact@master 65 | with: 66 | name: pytest-results-${{ matrix.python-version }} 67 | path: junit/test-results-${{ matrix.python-version }}.xml 68 | # Use always() to always run this step to publish test results when there are test failures 69 | if: always() 70 | - name: Update code coverage report to CodeCov 71 | uses: codecov/codecov-action@v1 72 | with: 73 | token: ${{ secrets.CODECOV_TOKEN }} 74 | file: ./coverage.xml 75 | flags: unittests 76 | env_vars: OS,PYTHON 77 | name: codecov-umbrella 78 | fail_ci_if_error: true 79 | if: matrix.python-version == 3.8 80 | docker: 81 | runs-on: ubuntu-latest 82 | name: docker build 83 | steps: 84 | - uses: actions/checkout@v2 85 | - name: build the docker container 86 | run: | 87 | docker build --file Dockerfile --tag bcgsc/mavis:latest . 
import os
import shutil
import tempfile

import pytest

from mavis.annotate.file_io import load_reference_genome
from mavis.breakpoint import Breakpoint, BreakpointPair
from mavis.constants import ORIENT, SVTYPE
from tools.calculate_ref_alt_counts import RefAltCalculator

from ..util import get_data, glob_exists


def setUpModule():
    """Load the mock reference genome into a module global and sanity check its contents"""
    global REFERENCE_GENOME
    REFERENCE_GENOME = load_reference_genome(get_data('mock_reference_genome.fa'))
    expected_prefix = 'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT'
    if expected_prefix != REFERENCE_GENOME['fake'].seq[0:50].upper():
        raise AssertionError('fake genome file does not have the expected contents')


def print_file_tree(dirname):
    """Print an indented listing of the directories and files found under dirname"""
    for current_dir, _subdirs, filenames in os.walk(dirname):
        # nesting depth relative to dirname decides how far the entry is indented
        depth = current_dir.replace(dirname, '').count(os.sep)
        dir_indent = ' ' * 4 * (depth)
        file_indent = ' ' * 4 * (depth + 1)
        print('{}{}/'.format(dir_indent, os.path.basename(current_dir)))
        for filename in filenames:
            print('{}{}'.format(file_indent, filename))


@pytest.fixture
def calculator():
    """Calculator reading the mock genome BAM under the library name TEST"""
    bam_inputs = [("TEST", get_data('mock_reads_for_events.sorted.bam'))]
    return RefAltCalculator(
        bam_inputs,
        REFERENCE_GENOME,
        max_event_size=100,
        buffer=20,
    )


@pytest.fixture
def temp_output():
    """Yield a fresh temporary directory which is removed again after the test"""
    output_dir = tempfile.mkdtemp()
    yield output_dir
    shutil.rmtree(output_dir)


class TestFullCalculator:
    def test_calculate_all_counts(self, calculator, temp_output):
        # only checks that the output file is produced, not its contents
        summary_files = [get_data("mavis_summary_all_mock-A36971_mock-A47933.tab")]
        output_file = os.path.join(temp_output, "ref_alt_output.tab")
        calculator.calculate_all_counts(summary_files, output_file)
        assert glob_exists(temp_output, "ref_alt_output.tab")


class TestRefAltCalulator:
    def test_calculate_count(self, calculator):
        left = Breakpoint('reference11', 5999, orient=ORIENT.LEFT)
        right = Breakpoint('reference11', 6003, orient=ORIENT.RIGHT)
        event = BreakpointPair(left, right, opposing_strands=False, event_type=SVTYPE.DEL)
        result = calculator.calculate_ref_counts(event)
        print(result.data)
        assert result.data["TEST_ref_count"] == 27
        assert result.data["TEST_alt_count"] == 14
        assert result.data['TEST_ignored_count'] == 188

    def test_calculate_count2(self, calculator):
        left = Breakpoint('reference11', 9999, orient=ORIENT.LEFT)
        right = Breakpoint('reference11', 10030, orient=ORIENT.RIGHT)
        event = BreakpointPair(left, right, opposing_strands=False, event_type=SVTYPE.DEL)
        result = calculator.calculate_ref_counts(event)
        print(result.data)
        assert result.data["TEST_ref_count"] == 0
        assert result.data["TEST_alt_count"] == 63
        assert result.data['TEST_ignored_count'] == 195

    def test_calculate_count3(self, calculator):
        left = Breakpoint('reference1', 2002, orient=ORIENT.LEFT)
        right = Breakpoint('reference1', 2003, orient=ORIENT.RIGHT)
        event = BreakpointPair(
            left,
            right,
            opposing_strands=False,
            event_type=SVTYPE.INS,
            untemplated_seq='TT',
        )
        result = calculator.calculate_ref_counts(event)
        print(result.data)
        assert result.data["TEST_ref_count"] == 0
        assert result.data["TEST_alt_count"] == 23
        assert result.data['TEST_ignored_count'] == 145

    def test_calculate_count4(self, calculator):
        left = Breakpoint('reference11', 1999, orient=ORIENT.LEFT)
        right = Breakpoint('reference11', 2001, orient=ORIENT.RIGHT)
        event = BreakpointPair(left, right, opposing_strands=False, event_type=SVTYPE.DEL)
        result = calculator.calculate_ref_counts(event)
        print(result.data)
        assert result.data["TEST_ref_count"] == 0
        assert result.data["TEST_alt_count"] == 50
        assert result.data['TEST_ignored_count'] == 191
| "start": 300, 60 | "strand": "+", 61 | "transcripts": [ 62 | { 63 | "aliases": [ 64 | ], 65 | "cdna_coding_end": null, 66 | "cdna_coding_start": null, 67 | "domains": [ 68 | ], 69 | "end": 400, 70 | "exons": [ 71 | ], 72 | "is_best_transcript": true, 73 | "name": "TRANSCRIPT-C", 74 | "start": 300 75 | } 76 | ] 77 | }, 78 | { 79 | "aliases": [ 80 | ], 81 | "chr": "fake", 82 | "end": 550, 83 | "name": "GENE-D", 84 | "start": 450, 85 | "strand": "-", 86 | "transcripts": [ 87 | { 88 | "aliases": [ 89 | ], 90 | "cdna_coding_end": null, 91 | "cdna_coding_start": null, 92 | "domains": [ 93 | ], 94 | "end": 550, 95 | "exons": [ 96 | ], 97 | "is_best_transcript": true, 98 | "name": "TRANSCRIPT-D", 99 | "start": 450 100 | } 101 | ] 102 | }, 103 | { 104 | "aliases": [ 105 | ], 106 | "chr": "fake", 107 | "end": 600, 108 | "name": "GENE-E", 109 | "start": 500, 110 | "strand": "+", 111 | "transcripts": [ 112 | { 113 | "aliases": [ 114 | ], 115 | "cdna_coding_end": null, 116 | "cdna_coding_start": null, 117 | "domains": [ 118 | ], 119 | "end": 600, 120 | "exons": [ 121 | ], 122 | "is_best_transcript": true, 123 | "name": "TRANSCRIPT-E", 124 | "start": 500 125 | } 126 | ] 127 | }, 128 | { 129 | "aliases": [ 130 | ], 131 | "chr": "fake", 132 | "end": 650, 133 | "name": "GENE-F", 134 | "start": 550, 135 | "strand": "+", 136 | "transcripts": [ 137 | { 138 | "aliases": [ 139 | ], 140 | "cdna_coding_end": null, 141 | "cdna_coding_start": null, 142 | "domains": [ 143 | ], 144 | "end": 650, 145 | "exons": [ 146 | ], 147 | "is_best_transcript": true, 148 | "name": "TRANSCRIPT-F", 149 | "start": 550 150 | } 151 | ] 152 | } 153 | ] 154 | } 155 | -------------------------------------------------------------------------------- /docs/hooks.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from textwrap import dedent 4 | 5 | import pkg_resources 6 | from markdown_refdocs.main import extract_to_markdown 7 | from mavis_config 
def json_to_pytype(record):
    """
    Convert a JSON schema type declaration into a python type-hint style string

    Args:
        record: a schema definition (a mapping, normally with a `type` key), a bare JSON type
            name such as `'string'`, or a list of JSON type names

    Returns:
        the python-style type name. Arrays become `List[...]` (or plain `List` when the item
        type is not declared), a list of types becomes `Union[...]`, a mapping that declares
        no `type` becomes `Any`, and unrecognized type names are returned unchanged
    """
    json_types = {
        'string': 'str',
        'integer': 'int',
        'float': 'float',
        'boolean': 'bool',
        'number': 'float',
    }
    try:
        input_type = record['type']
    except TypeError:
        # not a mapping: the record is itself the type name (or list of type names)
        input_type = record
    except KeyError:
        # a definition which does not declare a type does not restrict the value
        return 'Any'

    if input_type == 'array':
        try:
            sub_type = json_to_pytype(record['items']['type'])
        except (TypeError, KeyError):
            # bare 'array' type name, or `items` is missing or does not declare a type
            return 'List'
        return f'List[{sub_type}]'

    if isinstance(input_type, list):
        # Union
        members = ', '.join([json_to_pytype(t) for t in input_type])
        return f'Union[{members}]'
    return json_types.get(input_type, input_type)


def list_properties(schema, skip_terms=tuple()):
    """
    Render each property of a schema as a markdown glossary entry

    Args:
        schema: schema mapping with a `properties` key of property name to definition
        skip_terms: property names to leave out

    Returns:
        the markdown entries, sorted by property name
    """
    glossary = {}
    for term, defn in schema['properties'].items():
        if term in skip_terms:
            continue
        typ = json_to_pytype(defn)
        desc = defn.get('description', '')
        default_value = defn.get('default')
        schema_fields = {k: v for k, v in defn.items() if k not in ['description', 'default']}

        # the raw schema is only shown when there is more than a single field left
        # after the description and default have been removed
        if len(schema_fields) > 1:
            schema_defn = json.dumps(
                schema_fields,
                sort_keys=True,
                indent=' ',
            )
            schema_defn = f'**schema definition**:\n```json\n{schema_defn}\n```\n'
        else:
            schema_defn = ''

        lines = [
            f'### {term}',
            f'**type**: `#!python {typ}`',
            f'**default**: `#!python {repr(default_value)}`' if default_value is not None else '',
            desc,
            schema_defn,
        ]
        glossary[term] = '\n\n'.join(lines)
    return [v for k, v in sorted(glossary.items())]


def generate_settings_doc(schema_file):
    """
    Write the `configuration/settings.md` page from the config JSON schema

    The page is written relative to the directory containing this file and lists the library
    settings, the conversion settings, and then the remaining general settings.

    Args:
        schema_file: path to the JSON schema file
    """
    with open(schema_file, 'r', encoding='utf-8') as fh:
        schema = json.load(fh)
    dirname = os.path.dirname(os.path.abspath(__file__))
    filepath = 'configuration/settings.md'
    title = 'Configurable Settings'

    fname = os.path.join(dirname, filepath)

    result = [f'\n\n# {title}\n']
    result.append(
        dedent(
            '''\
            ## Defining Samples/Libraries

            The `libraries` property of the mavis config is required to run the snakemake
            workflow. This is the section that defines what inputs to use, and what types of
            samples are available.

            ```json
            {
                "libraries": {
                    "": { } // mapping of library name to library settings
                }
            }
            ```

            The library specific settings are listed below
            '''
        )
    )
    result.extend(list_properties(schema['properties']['libraries']['additionalProperties']))
    result.append(
        dedent(
            '''\
            ## Defining Conversions

            If the input to MAVIS is raw tool output and has not been pre-converted to the
            standard tab delimited format expected by MAVIS then you will need to add
            a section to the config to tell mavis how to perform the required conversions

            ```json
            {
                "convert": {
                    "": { } // mapping of alias to conversion settings
                }
            }
            ```

            The conversion specific settings are listed below
            '''
        )
    )
    result.extend(list_properties(schema['properties']['convert']['additionalProperties']))
    result.append('\n## General Settings\n')
    # libraries and convert have their own sections above
    result.extend(list_properties(schema, ('libraries', 'convert')))

    print('writing:', fname)
    with open(fname, 'w', encoding='utf-8') as fh:
        fh.write('\n\n'.join(result) + '\n')


def build_package_docs(config):
    """
    Generate the settings page and the markdown reference docs for the mavis package

    Args:
        config: not used by this function
    """
    schema_file = pkg_resources.resource_filename('mavis_config', 'config.json')
    generate_settings_doc(schema_file)
    package_dir = os.path.join(os.path.dirname(__file__), '../src/mavis')
    output_dir = os.path.join(os.path.dirname(__file__), 'package')

    extract_to_markdown(
        [package_dir],
        output_dir,
        link=True,
        hide_private=True,
        hide_undoc=True,
        hide_undoc_args=True,
        namespace_headers=False,
    )
-------------------------------------------------------------------------------- /docs/inputs/standard.md: -------------------------------------------------------------------------------- 1 | # MAVIS standard input file format 2 | 3 | These requirements pertain to the columns of input files from the 4 | various tools you want to merge. The input files should be tab-delimited 5 | text files. Comments at the top of the file may be included. Comments should 6 | begin with hash marks. They will be ignored when the file is read. 7 | 8 | ```text 9 | ## This is a comment 10 | ``` 11 | 12 | The header row contains the column names and is the first row following 13 | the comments (or the first row if no comments are included). 14 | 15 | ```text 16 | ## This is a comment 17 | ## this is another comment 18 | # this is also a comment 19 | This Is The Header 20 | ``` 21 | 22 | A simple input file might look as follows 23 | 24 | ```text 25 | ## File created at: 2018-01-02 26 | ## Generated by: MAVIS v1.0.0 27 | break1_chromosome break1_position_start break1_position_end break2_chromosome break2_position_start break2_position_end 28 | X 1234 1234 X 77965 77965 29 | ``` 30 | 31 | ## Required Columns 32 | 33 | - [break1_chromosome](../../outputs/columns/#break1_chromosome) 34 | - [break1_position_start](../../outputs/columns/#break1_position_start) 35 | - [break1_position_end](../../outputs/columns/#break1_position_end) (can be the same as break1\_position\_start) 36 | - [break2_chromosome](../../outputs/columns/#break2_chromosome) 37 | - [break2_position_start](../../outputs/columns/#break2_position_start) 38 | - [break2_position_end](../../outputs/columns/#break2_position_end) (can be the same as break2\_position\_start) 39 | 40 | ## Optional Columns 41 | 42 | Optional Columns that are not given as input will be added with default 43 | (or command line parameter options) during the clustering stage of MAVIS 44 | as some are required for subsequent pipeline steps 45 | 46 | - 
[break1_strand](../../outputs/columns/#break1_strand) (defaults to not-specified during clustering) 47 | - [break1_orientation](../../outputs/columns/#break1_orientation) (expanded to all possible values during clustering) 48 | - [break2_strand](../../outputs/columns/#break2_strand) (defaults to not-specified during clustering) 49 | - [break2_orientation](../../outputs/columns/#break2_orientation) (expanded to all possible values during clustering) 50 | - [opposing_strands](../../outputs/columns/#opposing_strands) (expanded to all possible values during clustering) 51 | - [stranded](../../outputs/columns/#stranded) (defaults to False during clustering) 52 | - [library](../../outputs/columns/#library) (defaults to command line library parameter during clustering) 53 | - [protocol](../../outputs/columns/#protocol) (defaults to command line protocol parameter during clustering) 54 | - [tools](../../outputs/columns/#tools) (defaults to an empty string during clustering) 55 | 56 | ## Summary by Pipeline Step 57 | 58 | The different pipeline steps of MAVIS have different input column 59 | requirements. 
These are summarized below (for the pipeline steps which 60 | can act as the pipeline start) 61 | 62 | | column name | cluster | annotate | validate | 63 | | --------------------------------------------------------------------- | ------- | -------- | -------- | 64 | | [break1_chromosome](../../outputs/columns/#break1_chromosome) | ✓ | ✓ | ✓ | 65 | | [break1_position_start](../../outputs/columns/#break1_position_start) | ✓ | ✓ | ✓ | 66 | | [break1_position_end](../../outputs/columns/#break1_position_end) | ✓ | ✓ | ✓ | 67 | | [break2_chromosome](../../outputs/columns/#break2_chromosome) | ✓ | ✓ | ✓ | 68 | | [break2_position_start](../../outputs/columns/#break2_position_start) | ✓ | ✓ | ✓ | 69 | | [break2_position_end](../../outputs/columns/#break2_position_end) | ✓ | ✓ | ✓ | 70 | | [break1_strand](../../outputs/columns/#break1_strand) | | | | 71 | | [break1_orientation](../../outputs/columns/#break1_orientation) | | ✓ | ✓ | 72 | | [break2_strand](../../outputs/columns/#break2_strand) | | | | 73 | | [break2_orientation](../../outputs/columns/#break2_orientation) | | ✓ | ✓ | 74 | | [opposing_strands](../../outputs/columns/#opposing_strands) | | | | 75 | | [stranded](../../outputs/columns/#stranded) | | | | 76 | | [library](../../outputs/columns/#library) | | | | 77 | | [protocol](../../outputs/columns/#protocol) | | | | 78 | | [tools](../../outputs/columns/#tools) | | | | 79 | | [event_type](../../outputs/columns/#event_type) | | | | 80 | 81 | Some native tool outputs are [supported](../../inputs/support/#sv-callers) and 82 | have built in methods to convert to the above format. Any unsupported 83 | tools can be used as long as the user converts the tools native output 84 | to match the above format. 
85 | -------------------------------------------------------------------------------- /tests/data/transabyss_events.tab: -------------------------------------------------------------------------------- 1 | id contig contig_size genomic_regions contig_regions strands flanking_pairs breakpoint_pairs spanning_reads spanning_reads_forward spanning_reads_reverse rearrangement breakpoint size genes transcripts senses exons/introns exon_bounds reciprocal descriptor orientations 5'gene 3'gene 5'exon 3'exon frame probe repeat1 repeat2 alignment_params type dbsnp dgv 2 | 227 893920 186 1:207981139-207981233,1:208014818-208014912 92-186,1-95 -,- 20 0,6 9 6 3 deletion 1:207981233|1:208014818 33584 NA,C1orf132 NA,ENST00000415882 NA,+ NA,intron2 NA,NA NA del1q32.2 L,R NA NA NA NA NA atggaaaaaggggaaacaaccttagggcagtcagacttctctatgaattcctCTCTCTGATCTGATGGGAATGCACTAGACTGTGAAACTTCCTCCTCCACC - - TO:0.00,CO:0.02,CC:1.00,I1:100.0,I2:100.0,AF1:0.51,AF2:0.51 LSR - NA 3 | 236 4567117 30066 1:224646603-224662564,1:224786034-224800120 1-15947,15985-30066 -,- 50 0,10 25 12 13 duplication 1:224646603|1:224800120 153516 NA,NA NA,NA NA,NA NA,NA NA,NA NA dup1q42.12 R,L NA NA NA NA NA attttccccttttcttgaaaagttgctgcaaagcgctcccctcctaagttgctagagcagctcacagaactgctatagtaagttttgGAGTACTAAAGGCATAGCTCAGTCTCCTCCTCAAGATTAAGAAATGCCCC - L1MEg TO:0.00,CO:0.00,CC:1.00,I1:99.9,I2:99.9,AF1:0.53,AF2:0.47 LSR NA NA 4 | 35 4556542 53631 10:89659755-89700299,10:89712341-89725438 1-40511,40530-53631 +,+ 64 0,24 43 22 21 deletion 10:89700299|10:89712341 12041 PTEN,PTEN ENST00000371953,ENST00000371953 +,+ intron5,intron6 NA,NA NA del10q23.31 L,R PTEN PTEN 5 7 in cagatctgcaaagatcaacctgtcctaagtcatataatctctttgtgtaagagattatactttgtgtaAGAGGTCCACCAGAGGAGTTCAGCAATTTGCTGCTCTTAGGGCAGGGATC - TRF_SimpleTandemRepeat_CAGAGGTCCAG TO:0.00,CO:0.00,CC:1.00,I1:100.0,I2:100.0,AF1:0.76,AF2:0.24 LSR - NA 5 | 28 3113294 240 10:7059511-7059605,19:17396666-17396811 146-240,1-146 +,+ 27 0,8 15 9 6 translocation 10:7059511|19:17396811 - NA,ANKLE1 
NA,ENST00000404261 NA,+ NA,intron8 NA,NA NA t(10;19)(p14;p13.11) R,L NA NA NA NA NA gcatgtattttgctccattggtttatccccactcaagggcaatacacatTCAAAGCATAAAAATTACATGACCTATGATATTTATTTTGCTAAGATTTT - - TO:0.00,CO:0.00,CC:1.00,I1:100.0,I2:99.3,AF1:0.40,AF2:0.61 LSR NA NA 6 | 63 indel_k96_4449027 1742 12:104359630-104359778,12:125801148-125802740 1-149,150-1742 -,+ 33 0,6 9 5 4 inversion 12:104359630|12:125801148 21441517 TDG,NA ENST00000392872,NA -,NA exon1,NA no,NA NA inv12q23.3-q24.31 R,R NA NA NA NA NA gctggactcaagctcctcctccaggcttctaccgtcccccacggacccccCTGAGTAGATGATTTTCAGCTGAGGTCTGAGTAGTGGGAAGGGACTGACT - L2a TO:0.00,CO:0.00,CC:1.00,I1:100.0,I2:99.8,AF1:0.09,AF2:0.91 LSR NA NA 7 | 634 2130795 190 7:150746563-150746657,15:84810725-84810819 96-190,1-95 -,- 67 0,9 21 13 8 translocation 7:150746657|15:84810725 - ASIC3,NA ENST00000357922,NA -,NA intron1,NA NA,NA NA t(7;15)(q36.1;q25.2) L,R NA NA NA NA NA aacaggtacaattagggagaggctatgtcaatgcaggaaaaggtcttatcGGCACTGGGGGGTGGGGAGTCCATGGCTGGTAGGAAGGAAGAGGTCCCCT - segdup_chr15:82483003 TO:0.00,CO:0.00,CC:1.00,I1:100.0,I2:100.0,AF1:0.50,AF2:0.50 LSR NA NA 8 | 296 281201 187 3:24565106-24565200,3:24566179-24566273 93-187,1-95 +,- 118 0,19 44 20 24 inversion 3:24565106|3:24566179 1072 NA,NA NA,NA NA,NA NA,NA NA,NA NA inv3p24.2 R,R NA NA NA NA NA tcgtgtttcattctgcctgagagcagtctacctaaatatatagctctgctcACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACA - - TO:0.00,CO:0.02,CC:1.00,I1:100.0,I2:98.9,AF1:0.51,AF2:0.51 LSR NA NA 9 | 625 1719994 191 7:125746029-125746123,7:126166901-126166995 1-95,97-191 +,+ 59 0,9 24 13 11 deletion 7:125746123|7:126166901 420777 NA,GRM8 NA,ENST00000339582 NA,- NA,intron9 NA,NA NA del7q31.33 L,R NA NA NA NA NA atgaagaagaaaagagaaatttttaaataggtagtagcagaaattataaatGCATATCATTTAAATTAAGAGCATAAATGAGGCCACATAAATGCTTTCTT L1PA15-16 - TO:0.00,CO:0.00,CC:0.99,I1:100.0,I2:100.0,AF1:0.50,AF2:0.50 LSR - NA 10 | 617 4285174 188 7:104485067-104485161,7:104612208-104612302 1-95,94-188 -,- 72 0,12 30 16 14 duplication 7:104485067|7:104612302 
127234 LHFPL3,NA ENST00000535008,NA -,NA intron4,NA NA,NA NA dup7q22.2-q22.3 R,L NA NA NA NA NA ttagacatcattgttgtttttattttatctttggtttcctcaggcaatacCCTTGGAATGACACATTATCCTCCCTTCACATGTAGCAATTGTAAATTCC - - TO:0.00,CO:0.01,CC:1.00,I1:100.0,I2:100.0,AF1:0.51,AF2:0.51 LSR NA NA 11 | 445 2769447 187 7:126098488-126098582,7:126167441-126167535 93-187,1-95 +,- 62 0,11 27 16 11 inversion 7:126098488|7:126167441 68952 GRM8,GRM8 ENST00000339582,ENST00000339582 -,+ intron9,intron9 NA,NA NA inv7q31.33 R,R - - - - NA atcgttaatcactgcatataactatcttaggctacctgttggtaaactataTGCAAAGAATATATATACACACATACAATTAATCCATTATCACAATGTAT - - TO:0.00,CO:0.02,CC:1.00,I1:100.0,I2:100.0,AF1:0.51,AF2:0.51 LSR NA NA 12 | 527 3739669 201 9:28031863-28031957,9:28034467-28034561 1-95,107-201 -,+ 23 0,6 16 10 6 inversion 9:28031863|9:28034467 2603 LINGO2,LINGO2 ENST00000379992,ENST00000379992 +,- intron4,intron4 NA,NA NA inv9p21.1 R,R - - - - NA ccagattgaaggtattttaaggaggatttggagcatcatggtgaagcgtgaattccgaaaaGAAAGCTCAGCCTGGCTTTTGTGGCCCAGAAGCCCAGAATTTCAGCAACT - - TO:0.00,CO:0.00,CC:0.95,I1:100.0,I2:100.0,AF1:0.47,AF2:0.47 LSR NA NA 13 | 747 3253092 198 X:31196849-31196943,X:31216211-31216305 1-95,104-198 +,+ 36 0,14 20 11 9 deletion X:31196943|X:31216211 19267 DMD,DMD ENST00000357033,ENST00000357033 -,- intron69,intron67 NA,NA NA delXp21.2 L,R DMD DMD 67 70 in aagtctcgaacatcttctcctgatgtagtctaaaagggagatcatggtgaatgtagtgAATGTAGTGAAGATCGGGGGATAAAAAAGGGATGGTTAATGGGTACAAAA - L1MA4 TO:0.00,CO:0.00,CC:0.96,I1:100.0,I2:100.0,AF1:0.48,AF2:0.48 LSR - NA 14 | -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Install Instructions 2 | 3 | Once the install steps are complete [MAVIS](http://mavis.bcgsc.ca) is ready to be run. 4 | See the MAVIS [tutorial](https://mavis.readthedocs.io/en/latest/tutorials/mini) to learn about running MAVIS. 
5 | 6 | For either install option you will want to install the main Snakefile. It is best to use a tag to 7 | specify the version of interest but you can download the latest version from the master branch 8 | 9 | ```bash 10 | wget https://raw.githubusercontent.com/bcgsc/mavis/master/Snakefile -O Snakefile 11 | ``` 12 | 13 | ## Install for Docker/Singularity 14 | 15 | The simplest way to use MAVIS is via Singularity. The MAVIS docker container used 16 | by singularity will take care of installing the aligner as well. 17 | 18 | ```bash 19 | pip install -U setuptools pip wheel 20 | pip install mavis_config # also installs snakemake 21 | ``` 22 | 23 | Now you will run mavis via Snakemake as follows 24 | 25 | ```bash 26 | snakemake \ 27 | -j \ 28 | --configfile \ 29 | --use-singularity \ 30 | -s Snakefile 31 | ``` 32 | 33 | ## Install (Python Only) 34 | 35 | MAVIS can also be run with just python. However you will need to install the aligner(s) required 36 | by MAVIS separately and ensure they are available on the default PATH variable when MAVIS is run 37 | 38 | ### 1. Install Aligner 39 | 40 | In addition to the python package dependencies, [MAVIS](http://mavis.bcgsc.ca) also requires an aligner to be installed. 41 | Currently the only aligners supported are [blat](https://mavis.readthedocs.io/en/latest/glossary/#blat) and [bwa mem](https://mavis.readthedocs.io/en/latest/glossary/#bwa). 42 | For MAVIS to run successfully the aligner must be installed and accessible on the path. 43 | If you have a non-standard install you may find it useful to edit the PATH environment variable. For example 44 | 45 | ``` bash 46 | export PATH=/path/to/directory/containing/blat/binary:$PATH 47 | ``` 48 | 49 | [blat](http://mavis.bcgsc.ca/docs/latest/glossary.html#term-blat) is the default aligner.
To configure MAVIS to use [bwa mem](http://mavis.bcgsc.ca/docs/latest/glossary.html#term-bwa) it must be specified 50 | in the [config](https://mavis.readthedocs.io/en/latest/configuration/settings/) JSON file. 51 | 52 | After this has been installed MAVIS itself can be installed through [pip](https://pypi.org/project/mavis/) 53 | 54 | ### 2. Install MAVIS 55 | 56 | #### Install using pip 57 | 58 | The easiest way to install [MAVIS](http://mavis.bcgsc.ca) is through the python package manager, pip. If you do not have python3 installed it can be found [here](https://www.python.org/downloads) 59 | 60 | Ensuring you have a recent version of pip and setuptools will improve the install experience. Older versions of pip and setuptools may have issues with obtaining some of the mavis python dependencies 61 | 62 | ``` bash 63 | pip install --upgrade pip setuptools 64 | ``` 65 | 66 | or (for Anaconda users) 67 | 68 | ``` bash 69 | conda update pip setuptools 70 | ``` 71 | 72 | If this is not a clean/new python install it may be useful to set up mavis in a [virtual python environment](https://docs.python.org/3/tutorial/venv.html) 73 | 74 | Then install mavis itself 75 | 76 | ``` bash 77 | pip install mavis 78 | ``` 79 | 80 | This will install mavis and its python dependencies. 
81 | 82 | #### Install using Buildout 83 | 84 | Alternatively you can use the [bootstrap/buildout](http://www.buildout.org/en/latest/) to install mavis into bin/mavis 85 | 86 | ``` bash 87 | git clone https://github.com/bcgsc/mavis.git 88 | cd mavis 89 | pip install zc.buildout 90 | python bootstrap.py 91 | bin/buildout 92 | ``` 93 | 94 | This will install mavis and its python dependencies into eggs inside the cloned mavis directory which can be used by simply running bin/mavis 95 | 96 | Finally you will need to Build/Download the necessary reference files 97 | 98 | ## Build or Download Reference Files 99 | 100 | After [MAVIS](http://mavis.bcgsc.ca) is installed the [reference files](https://mavis.readthedocs.io/en/latest/inputs/reference) must be generated (or downloaded) before it can be run. A simple bash script to download the hg19 reference files is provided under mavis/tools for convenience. 101 | 102 | ### Download Hg19 Files 103 | 104 | ``` bash 105 | cd /path/to/where/you/want/to/put/the/files 106 | wget https://raw.githubusercontent.com/bcgsc/mavis/master/src/tools/get_hg19_reference_files.sh 107 | bash get_hg19_reference_files.sh 108 | ``` 109 | 110 | You should now see the reference files in the current directory 111 | 112 | ```text 113 | . 114 | |-- cytoBand.txt 115 | |-- dgv_hg19_variants.tab 116 | |-- ensembl69_hg19_annotations.json 117 | |-- get_hg19_reference_files.sh 118 | |-- hg19.2bit 119 | |-- hg19.fa 120 | `-- hg19_masking.tab 121 | ``` 122 | 123 | ### Download Hg38 Files 124 | 125 | ``` bash 126 | cd /path/to/where/you/want/to/put/the/files 127 | wget https://raw.githubusercontent.com/bcgsc/mavis/master/src/tools/get_hg38_reference_files.sh 128 | bash get_hg38_reference_files.sh 129 | ``` 130 | 131 | You should now see the reference files in the current directory 132 | 133 | ```text 134 | .
import pytest

from mavis.convert.vcf import VcfInfoType, VcfRecordType, convert_record, pandas_vcf

from ...util import get_data


def test_read_vcf():
    """
    Read the ``sniffles.vcf`` fixture with ``pandas_vcf`` and check the
    number of header entries and the number of rows in the data frame
    """
    header, df = pandas_vcf(get_data('sniffles.vcf'))
    assert len(header) == 231
    assert df.shape[0] == 106


def test_convert_telomeric_region():
    """
    Convert two BND records that sit at position 0 (one flagged IMPRECISE,
    one not) and check that each yields a single converted record whose
    ``break1_position_end`` is 1 and whose ``break1_chromosome`` is unchanged
    """
    variant_imprecise = VcfRecordType(
        id='mock-BND-imprecise',
        pos=0,
        chrom='chr14_KI270722v1_random',
        alts=['N[chr17_GL000205v2_random:0['],
        ref='N',
        info=VcfInfoType(
            IMPRECISE=True,
            SVMETHOD="Snifflesv1.0.11",
            SVTYPE="BND",
            SUPTYPE="SR",
            SVLEN="0",
            STRANDS="+-",
            RE="5",
            REF_strand="0,0",
            AF="1",
        ),
    )
    variant_precise = VcfRecordType(
        id='mock-BND-precise',
        pos=0,
        chrom='chr14_KI270722v1_random',
        alts=[']chrUn_GL000216v2:142821]N'],
        ref='N',
        info=VcfInfoType(
            IMPRECISE=False,
            SVMETHOD="Snifflesv1.0.11",
            SVTYPE="BND",
            SUPTYPE="SR",
            SVLEN="0",
            STRANDS="+-",
            RE="5",
            REF_strand="0,0",
            AF="1",
        ),
    )
    imprecise_records = convert_record(variant_imprecise)
    assert len(imprecise_records) == 1
    # from here on the name refers to the single converted record, not the list
    imprecise_records = imprecise_records[0]
    assert imprecise_records.get('break1_position_end') == 1

    precise_records = convert_record(variant_precise)
    assert len(precise_records) == 1
    # from here on the name refers to the single converted record, not the list
    precise_records = precise_records[0]
    assert precise_records.get('break1_position_end') == 1

    assert precise_records.get('break1_chromosome') == 'chr14_KI270722v1_random'
    assert imprecise_records.get('break1_chromosome') == 'chr14_KI270722v1_random'


# shared VCF position used by the parametrized insertion tests below
TEST_POS = 1853407


@pytest.mark.parametrize(
    'pos,break1_ci,break2_ci,break1,break2,ids',
    [
        # CILEN lower bound (-65) is below the CIPOS lower bound (-30); the
        # expected break2 start is TEST_POS - 30, not TEST_POS - 65
        [
            TEST_POS,
            (-30, 30),
            (-65, 65),
            (TEST_POS - 30, TEST_POS + 30),
            (TEST_POS - 30, TEST_POS + 65),
            'vcf-cuteSV.INS.breakpoint_2_start < breakpoint_1_start',
        ],
        # CIPOS upper bound (99999) is above the CILEN upper bound (65); the
        # expected break1 end is TEST_POS + 65, not TEST_POS + 99999
        [
            TEST_POS,
            (-30, 99999),
            (-10, 65),
            (TEST_POS - 30, TEST_POS + 65),
            (TEST_POS - 10, TEST_POS + 65),
            'vcf-cuteSV.INS.breakpoint_1_end > breakpoint_2_end',
        ],
    ],
    ids=[
        'breakpoint_2_start < breakpoint_1_start',
        'breakpoint_1_end > breakpoint_2_end',
    ],
)
def test_convert_intrachromosomal_imprecise_breakend(
    pos, break1_ci, break2_ci, break1, break2, ids
):
    """
    Convert an IMPRECISE same-chromosome INS record and compare the converted
    breakpoint intervals against the expected ``break1``/``break2`` pairs

    ``break1_ci`` is passed as CIPOS and ``break2_ci`` as CILEN; the ``ids``
    parameter is used as the record id
    """
    variant_vcf = VcfRecordType(
        id=ids,
        pos=pos,
        chrom='chr5',
        alts=['AGG'],
        ref='A',
        info=VcfInfoType(
            CHR2="chr5",
            IMPRECISE=True,
            SVMETHOD="cuteSV-1.0.12",
            SVTYPE="INS",
            CIPOS=break1_ci,
            CILEN=break2_ci,
        ),
    )
    result = convert_record(variant_vcf)
    assert len(result) == 1
    variant = result[0]
    assert variant.get('break1_position_start') == break1[0]
    assert variant.get('break1_position_end') == break1[1]
    assert variant.get('break2_position_start') == break2[0]
    assert variant.get('break2_position_end') == break2[1]


@pytest.mark.parametrize(
    'pos,break1_ci,break2_ci,break1,break2,ids',
    [
        # CILEN is given as (70, 65), i.e. its lower bound exceeds its upper bound
        [
            TEST_POS,
            (-30, 99999),
            (70, 65),
            (TEST_POS - 30, TEST_POS + 65),
            (TEST_POS + 65, TEST_POS + 65),
            'vcf-cuteSV.INS.breakpoint_2_start > breakpoint_2_end',
        ],
    ],
    ids=[
        'breakpoint_2_start > breakpoint_2_end',
    ],
)
def test_error_on_convert_intrachromosomal_imprecise_breakend(
    pos, break1_ci, break2_ci, break1, break2, ids
):
    """
    Check that converting an IMPRECISE INS record with the parametrized
    confidence intervals raises ``ValueError``

    The ``break1`` and ``break2`` parameters are part of the parametrization
    but are not read by this test
    """
    variant_vcf = VcfRecordType(
        id=ids,
        pos=pos,
        chrom='chr5',
        alts=['AGG'],
        ref='A',
        info=VcfInfoType(
            CHR2="chr5",
            IMPRECISE=True,
            SVMETHOD="cuteSV-1.0.12",
            SVTYPE="INS",
            CIPOS=break1_ci,
            CILEN=break2_ci,
        ),
    )
    with pytest.raises(ValueError):
        convert_record(variant_vcf)


def test_convert_intrachromosomal_imprecise_breakend_no_ci():
    """
    Check that an IMPRECISE INS record without CIPOS/CILEN whose END (11183)
    is smaller than its position (11184) raises ``ValueError``
    """
    # breakpoint_1_start > breakpoint_1_end
    variant_cilen4 = VcfRecordType(
        id='Sniffle.INS',
        pos=11184,
        chrom='chr2',
        alts=['AGG'],
        ref='N',
        info=VcfInfoType(
            CHR2="chr2",
            IMPRECISE=True,
            SVTYPE="INS",
            END=11183,
        ),
    )
    with pytest.raises(ValueError):
        convert_record(variant_cilen4)
import os
from typing import Dict, List, Tuple

from . import annotate as _annotate
from . import util as _util
from .annotate.file_io import ReferenceFile
from .error import DrawingFitError
from .illustrate.constants import DiagramSettings
from .illustrate.diagram import draw_multi_transcript_overlay
from .illustrate.scatter import bam_to_scatter


def check_overlay_args(args, parser):
    """
    Normalize the ``markers`` and ``read_depth_plots`` options in place and
    report malformed values through ``parser.error``

    Args:
        args: parsed arguments; ``args.markers`` and ``args.read_depth_plots``
            are lists of mutable lists which are padded and type-converted
        parser: object whose ``error`` method is called with a message for
            every value that fails validation

    Returns:
        the same ``args`` object that was passed in
    """
    for marker_spec in args.markers:
        if len(marker_spec) < 3:
            # no end position supplied: repeat the last value as the end
            marker_spec.append(marker_spec[-1])
        try:
            marker_spec[1] = int(marker_spec[1])
            marker_spec[2] = int(marker_spec[2])
        except ValueError:
            parser.error(
                'argument --marker: start and end must be integers: {}'.format(marker_spec)
            )

    # values used to pad a plot specification that has fewer than 5 fields
    padding = [None, None, 0.5, None, True]
    # positions of the fields inside a plot specification (position 0 is the axis name)
    bam_idx, density_idx, ymax_idx, stranded_idx = 1, 2, 3, 4

    for plot_spec in args.read_depth_plots:
        # fill in whichever trailing fields were not given
        plot_spec.extend(padding[len(plot_spec):])

        bam_path = plot_spec[bam_idx]
        if not os.path.exists(bam_path):
            parser.error(
                'argument --read_depth_plots: the bam file given does not exist: {}'.format(
                    bam_path
                )
            )

        try:
            plot_spec[density_idx] = float(plot_spec[density_idx])
            # written as two comparisons on purpose: NaN fails both and is let through
            if plot_spec[density_idx] < 0 or plot_spec[density_idx] > 1:
                raise ValueError()
        except ValueError:
            parser.error(
                'argument --read_depth_plots: density must be an float between 0 and 1: {}'.format(
                    plot_spec[density_idx]
                )
            )

        try:
            unset = str(plot_spec[ymax_idx]).lower() in ('null', 'none')
            plot_spec[ymax_idx] = None if unset else int(plot_spec[ymax_idx])
        except ValueError:
            parser.error(
                'argument --read_depth_plots: ymax must be an integer: {}'.format(
                    plot_spec[ymax_idx]
                )
            )

        try:
            plot_spec[stranded_idx] = _util.cast_boolean(plot_spec[stranded_idx])
        except TypeError:
            parser.error(
                'argument --read_depth_plots: stranded must be an boolean: {}'.format(
                    plot_spec[stranded_idx]
                )
            )
    return args


def main(
    gene_name: str,
    output: str,
    config: Dict,
    buffer_length: int,
    read_depth_plots,
    markers: List[Tuple[str, int, int]],
    ymax_color='#FF0000',
    **kwargs,
):
    """
    Draw the transcript overlay diagram for one gene and save it as an SVG

    Args:
        gene_name: name or alias of the gene to look up in the annotations
        output: directory the SVG file is written to
        config: settings mapping; the annotations reference and the
            ``illustrate.drawing_width_iter_increase``,
            ``illustrate.max_drawing_retries`` and
            ``validate.min_mapping_quality`` keys are read from it
        buffer_length: number of bases added on either side of the gene
        read_depth_plots: iterable of
            ``(axis_name, bam_file, density, ymax, stranded)`` entries
        markers: ``(name, start, end)`` entries drawn as intervals on the gene's chromosome
        ymax_color: passed through to ``bam_to_scatter``
        **kwargs: passed to ``DiagramSettings``

    Raises:
        KeyError: no gene in the annotations matches ``gene_name``
        DrawingFitError: the drawing still does not fit once the configured
            number of retries has been used
    """
    annotations = ReferenceFile.load_from_config(config, 'annotations')
    annotations.load()
    drawing_width_iter_increase = config['illustrate.drawing_width_iter_increase']
    max_drawing_retries = config['illustrate.max_drawing_retries']
    min_mapping_quality = config['validate.min_mapping_quality']

    gene_to_draw = None
    for chrom in annotations.content:
        # first match on this chromosome; later chromosomes are still scanned,
        # so a match on a later chromosome replaces an earlier one
        gene = next(
            (
                candidate
                for candidate in annotations.content[chrom]
                if gene_name in candidate.aliases or gene_name == candidate.name
            ),
            None,
        )
        if gene is None:
            continue
        gene_to_draw = gene
        _util.logger.info(
            f'Found target gene: {gene.name}(aka. {gene.aliases}) {gene.chr}:{gene.start}-{gene.end}'
        )
    if gene_to_draw is None:
        raise KeyError('Could not find gene alias or id in annotations file', gene_name)

    settings = DiagramSettings(**kwargs)

    # window to plot, clamped so that it never starts before position 1
    genomic_min = max(gene_to_draw.start - buffer_length, 1)
    genomic_max = gene_to_draw.end + buffer_length

    # one scatter plot per bam entry
    plots = [
        bam_to_scatter(
            bam_file,
            gene_to_draw.chr,
            genomic_min,
            genomic_max,
            strand=gene_to_draw.get_strand() if stranded else None,
            ymax=ymax,
            density=density,
            axis_name=axis_name,
            min_mapping_quality=min_mapping_quality,
            ymax_color=ymax_color,
        )
        for axis_name, bam_file, density, ymax, stranded in read_depth_plots
    ]

    vmarkers = [
        _annotate.base.BioInterval(gene_to_draw.chr, marker_start, marker_end, name=marker_name)
        for marker_name, marker_start, marker_end in markers
    ]

    canvas = None
    retries_used = 0
    while canvas is None:
        try:
            drawn = draw_multi_transcript_overlay(
                settings,
                gene_to_draw,
                vmarkers=vmarkers,
                plots=plots,
                window_buffer=buffer_length,
            )
        except DrawingFitError:
            if retries_used >= max_drawing_retries:
                raise
            _util.logger.info(f'Drawing fit: extending window {drawing_width_iter_increase}')
            # widen the drawing and try again
            settings.width += drawing_width_iter_increase
            retries_used += 1
        else:
            canvas = drawn
            break

    svg_output_file = os.path.join(output, '{}_{}_overlay.svg'.format(gene_to_draw.name, gene_name))
    _util.logger.info(f'writing: {svg_output_file}')

    canvas.saveas(svg_output_file)
gene_to_draw, 139 | vmarkers=vmarkers, 140 | plots=plots, 141 | window_buffer=buffer_length, 142 | ) 143 | break 144 | except DrawingFitError as err: 145 | if attempts > max_drawing_retries: 146 | raise err 147 | _util.logger.info(f'Drawing fit: extending window {drawing_width_iter_increase}') 148 | settings.width += drawing_width_iter_increase 149 | attempts += 1 150 | 151 | svg_output_file = os.path.join(output, '{}_{}_overlay.svg'.format(gene_to_draw.name, gene_name)) 152 | _util.logger.info(f'writing: {svg_output_file}') 153 | 154 | canvas.saveas(svg_output_file) 155 | -------------------------------------------------------------------------------- /src/mavis/config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from copy import copy as _copy 3 | from typing import Dict 4 | 5 | from .annotate.file_io import ReferenceFile 6 | from .bam import stats 7 | from .bam.cache import BamCache 8 | from .constants import PROTOCOL, float_fraction 9 | from .util import cast_boolean, filepath 10 | 11 | 12 | def calculate_bam_stats(config: Dict, library_name: str) -> Dict: 13 | """ 14 | Calculate the read stats for a library from a given bam file 15 | """ 16 | library = config['libraries'][library_name] 17 | annotations = ReferenceFile('annotations', *config['reference.annotations']) 18 | 19 | if library['protocol'] == PROTOCOL.TRANS: 20 | if annotations is None or annotations.is_empty(): 21 | raise AttributeError( 22 | 'missing required attribute: annotations. 
def calculate_bam_stats(config: Dict, library_name: str) -> Dict:
    """
    Calculate the read stats for a library from a given bam file

    Args:
        config: settings mapping; ``config['libraries'][library_name]``, the
            ``reference.annotations`` entry and the ``bam_stats.*`` keys are read
        library_name: key of the library inside ``config['libraries']``

    Returns:
        a dict with ``median_fragment_size``, ``read_length`` and
        ``stdev_fragment_size`` (all cast to int). For transcriptome libraries
        ``strand_specific`` and ``strand_determining_read`` are included as well

    Raises:
        AttributeError: the library is a transcriptome but no annotations were given
    """
    library = config['libraries'][library_name]
    annotations = ReferenceFile('annotations', *config['reference.annotations'])
    is_transcriptome = library['protocol'] == PROTOCOL.TRANS

    if is_transcriptome:
        if annotations is None or annotations.is_empty():
            raise AttributeError(
                'missing required attribute: annotations. Annotations must be given for transcriptomes'
            )
        # annotations are only loaded when they are going to be used
        annotations.load()

    bam = BamCache(library['bam_file'], stranded=library['strand_specific'])

    if is_transcriptome:
        bam_stats = stats.compute_transcriptome_bam_stats(
            bam,
            annotations=annotations.content,
            sample_size=config['bam_stats.sample_size'],
            sample_cap=config['bam_stats.sample_cap'],
            distribution_fraction=config['bam_stats.distribution_fraction'],
        )
    else:
        bam_stats = stats.compute_genome_bam_stats(
            bam,
            sample_size=config['bam_stats.sample_size'],
            sample_bin_size=config['bam_stats.sample_bin_size'],
            sample_cap=config['bam_stats.sample_cap'],
            distribution_fraction=config['bam_stats.distribution_fraction'],
        )

    # fields reported for every protocol
    result = {
        'median_fragment_size': int(bam_stats.median_fragment_size),
        'read_length': int(bam_stats.read_length),
        'stdev_fragment_size': int(bam_stats.stdev_fragment_size),
    }
    if is_transcriptome:
        result['strand_specific'] = bam_stats.stranded
        result['strand_determining_read'] = bam_stats.strand_determining_read
    return result


class CustomHelpFormatter(argparse.ArgumentDefaultsHelpFormatter):
    """
    subclass the default help formatter to stop default printing for required arguments
    """

    def _format_args(self, action, default_metavar):
        # fill in a metavar derived from the argument type when none was given
        if action.metavar is None:
            action.metavar = get_metavar(action.type)
        if isinstance(action, RangeAppendAction):
            # the metavar formatter returns a 1-tuple here, which ``%`` unpacks
            return '%s' % self._metavar_formatter(action, default_metavar)(1)
        return super()._format_args(action, default_metavar)

    def _get_help_string(self, action):
        # required arguments have no meaningful default to show
        if action.required:
            return action.help
        return super()._get_help_string(action)

    def add_arguments(self, actions):
        # sort the arguments alphanumerically so they print in the help that way
        actions = sorted(actions, key=lambda action: action.option_strings)
        super().add_arguments(actions)


class RangeAppendAction(argparse.Action):
    """
    allows an argument to accept a range of arguments

    Each use of the option appends the list of values it was given to the
    destination, so the destination ends up as a list of lists.

    Args:
        nmin: minimum number of values per use of the option
        nmax: maximum number of values per use of the option (None for no limit)
        **kwargs: passed to ``argparse.Action``; ``nargs`` defaults to ``'+'``
            and ``default`` to an empty list
    """

    def __init__(self, nmin=1, nmax=None, **kwargs):
        kwargs.setdefault('nargs', '+')
        kwargs.setdefault('default', [])
        argparse.Action.__init__(self, **kwargs)
        self.nmin = nmin
        self.nmax = nmax
        assert nmin is not None

    def __call__(self, parser, namespace, values, option_string=None):
        """
        Validate the number of ``values`` and append them to the destination list

        Raises:
            argparse.ArgumentError: the number of values is outside nmin..nmax
        """
        if getattr(namespace, self.dest, None) is None:
            setattr(namespace, self.dest, [])

        # validate before building the new list so nothing is copied for a bad input
        if self.nmax is None:
            if len(values) < self.nmin:
                raise argparse.ArgumentError(
                    self, 'must have at least {} arguments. Given: {}'.format(self.nmin, values)
                )
        elif not self.nmin <= len(values) <= self.nmax:
            raise argparse.ArgumentError(
                self, 'requires {}-{} arguments. Given: {}'.format(self.nmin, self.nmax, values)
            )

        # append to a copy so the list currently stored on the namespace is never mutated
        items = _copy(getattr(namespace, self.dest))
        items.append(values)
        setattr(namespace, self.dest, items)


def add_bamstats_to_config(config: Dict):
    """
    Fill in missing bam stats for every library in the config (in place)

    For each library that lacks any of ``median_fragment_size``,
    ``read_length`` or ``stdev_fragment_size``, the stats are computed with
    ``calculate_bam_stats`` and merged into the library entry

    Args:
        config: settings mapping with a ``libraries`` entry that maps library
            names to their settings
    """
    required_stats = ['median_fragment_size', 'read_length', 'stdev_fragment_size']
    for libname, library in config['libraries'].items():
        # calculate the bam_stats if they have not been given
        if any(col not in library for col in required_stats):
            library.update(calculate_bam_stats(config, libname))


def get_metavar(arg_type):
    """
    For a given argument type, returns the string to be used for the metavar argument in add_argument

    Returns None when the type has no dedicated metavar

    Example:
        >>> get_metavar(bool)
        '{True,False}'
    """
    if arg_type in [bool, cast_boolean]:
        return '{True,False}'
    elif arg_type in [float_fraction, float]:
        return 'FLOAT'
    elif arg_type == int:
        return 'INT'
    elif arg_type == filepath:
        return 'FILEPATH'
    return None